Telegram Bot Scraping Apartments for Sale

main

I'm looking for a new apartment, but I don't want to waste time checking the real estate website. Luckily, we can code! So let's write a TypeScript program that will run on AWS Lambda every hour, scrape new apartments, and send them to a Telegram chat.

Let's set up the AWS infrastructure with Terraform. We start with an S3 bucket that will store the code for the lambda. Then we create a DynamoDB table for keeping state, such as the list of apartment ids we have already seen. The lambda will receive its configuration through environment variables: a Sentry key for error reporting, a Telegram token and chat id for sending apartments, and the name of the DynamoDB table. To run the lambda every hour, we'll use CloudWatch events.

# S3 bucket that stores the zipped Lambda deployment package.
# The bucket name is derived from var.name with a "-storage" suffix.
resource "aws_s3_bucket" "lambda_storage" {
  bucket = "${var.name}-storage"
}

# Zips the module-local "lambda" directory into "lambda.zip" next to this
# module, so the archive can be uploaded to S3.
data "archive_file" "local_zipped_lambda" {
  type        = "zip"
  source_dir  = "${path.module}/lambda"
  output_path = "${path.module}/lambda.zip"
}

# Uploads the zipped Lambda package to the storage bucket under a fixed key.
resource "aws_s3_object" "zipped_lambda" {
  bucket = aws_s3_bucket.lambda_storage.bucket
  key    = "lambda.zip"
  source = data.archive_file.local_zipped_lambda.output_path

  # The key and the source path never change, so without a content hash
  # Terraform cannot tell that the archive was rebuilt and would leave the
  # stale object in the bucket.
  source_hash = data.archive_file.local_zipped_lambda.output_base64sha256
}

# DynamoDB table that keeps the scraper state.
# Uses on-demand billing and a single string partition key named "id".
resource "aws_dynamodb_table" "state" {
  name         = "${var.name}-state"
  billing_mode = "PAY_PER_REQUEST"
  hash_key     = "id"

  attribute {
    name = "id"
    type = "S"
  }
}

# Execution role for the Lambda function.
# The trust policy lets the Lambda service (lambda.amazonaws.com) assume it.
resource "aws_iam_role" "service" {
  name = var.name

  assume_role_policy = jsonencode(
    {
      Version = "2012-10-17",
      Statement = [
        {
          Action = "sts:AssumeRole",
          Principal = {
            Service = "lambda.amazonaws.com"
          },
          Effect = "Allow",
          Sid    = ""
        }
      ]
    }
  )
}

# Attaches the service policy (logging + state table access) to the
# Lambda execution role.
resource "aws_iam_role_policy_attachment" "service" {
  role       = aws_iam_role.service.name
  policy_arn = aws_iam_policy.service.arn
}

# The scraper Lambda function. Its code is loaded from the zipped package in
# S3 and its configuration is passed in through environment variables.
resource "aws_lambda_function" "service" {
  function_name = var.name

  s3_bucket = aws_s3_bucket.lambda_storage.bucket
  # Reference the uploaded object instead of repeating the literal key, so
  # Terraform creates the object before it creates the function.
  s3_key = aws_s3_object.zipped_lambda.key
  # The S3 key is constant; the hash is what makes Terraform redeploy the
  # function when the archive content changes.
  source_code_hash = data.archive_file.local_zipped_lambda.output_base64sha256
  memory_size      = 1024

  handler = "index.handler"
  # NOTE(review): nodejs16.x is an end-of-support Lambda runtime. Newer Node.js
  # runtimes no longer bundle the v2 "aws-sdk" package this code imports, so
  # confirm the package is bundled into the zip before bumping the runtime.
  runtime = "nodejs16.x"
  timeout = 50
  role    = aws_iam_role.service.arn

  environment {
    variables = {
      SENTRY_KEY           = var.sentry_key
      TELEGRAM_BOT_TOKEN   = var.telegram_bot_token
      TELEGRAM_BOT_CHAT_ID = var.telegram_bot_chat_id
      STATE_TABLE_NAME     = aws_dynamodb_table.state.name
    }
  }
}

# Permissions granted to the Lambda execution role:
# - write its logs to CloudWatch Logs;
# - read and update items in the state table.
resource "aws_iam_policy" "service" {
  name = var.name
  path = "/"

  policy = jsonencode(
    {
      Version = "2012-10-17",
      Statement = [
        {
          Action = [
            "logs:CreateLogGroup",
            "logs:CreateLogStream",
            "logs:PutLogEvents"
          ],
          Resource = "arn:aws:logs:*:*:*",
          Effect   = "Allow"
        },
        {
          # Least privilege: the state provider only issues get and update
          # calls against the table, so "dynamodb:*" is not needed.
          Action = [
            "dynamodb:GetItem",
            "dynamodb:UpdateItem"
          ],
          Resource = aws_dynamodb_table.state.arn,
          Effect   = "Allow"
        }
      ]
    }
  )
}

# Scheduled CloudWatch Events rule that fires once every hour.
resource "aws_cloudwatch_event_rule" "lambda" {
  name                = var.name
  schedule_expression = "rate(1 hour)"
}

# Makes the Lambda function the target of the hourly rule.
resource "aws_cloudwatch_event_target" "lambda" {
  rule      = aws_cloudwatch_event_rule.lambda.name
  target_id = var.name
  arn       = aws_lambda_function.service.arn
}

# Allows the events service (events.amazonaws.com) to invoke the function,
# restricted to invocations originating from the hourly rule.
resource "aws_lambda_permission" "lambda_cloudwatch" {
  statement_id  = "AllowExecutionFromCloudWatch"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.service.function_name
  principal     = "events.amazonaws.com"
  source_arn    = aws_cloudwatch_event_rule.lambda.arn
}

At the entry point of our lambda, we set up Sentry for error handling and export the handler function. The handler gets the new real estate listings and sends them to a Telegram chat.

import * as Sentry from '@sentry/serverless'

import { assertEnvVar } from './utils/assertEnvVar'
import { findNewRealEstate } from './findNewRealEstate'

// The Sentry DSN is supplied through the SENTRY_KEY environment variable.
const sentryDsn = assertEnvVar('SENTRY_KEY')

// Configure Sentry once per module load, with session tracking disabled.
Sentry.AWSLambda.init({ dsn: sentryDsn, autoSessionTracking: false })

/** Lambda entry point: `findNewRealEstate` wrapped by Sentry's handler wrapper. */
export const handler = Sentry.AWSLambda.wrapHandler(findNewRealEstate)
import { getNewRealEstate } from './sources/myHomeGe'
import { tellAboutUnits } from './tellAboutUnits'

/**
 * Collects the units that have not been reported yet and forwards them
 * to `tellAboutUnits`.
 */
export const findNewRealEstate = async () => {
  const freshUnits = await getNewRealEstate()
  await tellAboutUnits(freshUnits)
}

We have only one scraper, but we can expand the app by coding the getNewRealEstate function for other marketplaces. First, we create an instance of a state provider. It uses DynamoDB to store the data of a given website and provides two methods, one to get the state and another to update it. The state keeps the timestamp of the last visit and the ids of apartments we've seen already.

import { defaultSourceState, SourceState } from './SourceState'
import { assertEnvVar } from './utils/assertEnvVar'
import { DynamoDB } from 'aws-sdk'
import { getUpdateParams } from './shared/db/getUpdateParams'

// DynamoDB document client shared by every StateProvider instance;
// created once when the module is loaded.
const documentClient = new DynamoDB.DocumentClient()
// Name of the state table, taken from the STATE_TABLE_NAME environment
// variable (presumably assertEnvVar throws when it is unset — confirm).
const tableName = assertEnvVar('STATE_TABLE_NAME')

/**
 * Reads and writes the persisted state of one source.
 * The state lives in the state table in the item whose `id` equals `name`.
 */
export class StateProvider {
  /** @param name Identifier of the source; used as the item's `id` key. */
  constructor(readonly name: string) {}

  /**
   * Loads the stored state of the source.
   * @returns The stored item, or `defaultSourceState` when no item exists.
   */
  async get() {
    const request = {
      TableName: tableName,
      Key: { id: this.name },
    }
    const result = await documentClient.get(request).promise()
    const stored = result.Item

    return (stored ? stored : defaultSourceState) as SourceState
  }

  /**
   * Writes the given fields of the state to the source's item.
   * @param params Subset of the state to persist; converted to update
   * parameters by `getUpdateParams`.
   */
  async update(params: Partial<SourceState>) {
    const updateRequest = documentClient.update({
      TableName: tableName,
      Key: { id: this.name },
      ...getUpdateParams(params),
    })

    await updateRequest.promise()
  }
}
import fetch from 'node-fetch'
import { load } from 'cheerio'
import { Unit } from '../Unit'
import { StateProvider } from '../StateProvider'

// Number of milliseconds in one day: 24 h * 60 min * 60 s * 1000 ms.
const msInDay = 24 * 60 * 60 * 1000

// Name of this source; the scraper state is stored under this name.
const sourceName = 'myhome.ge'

// Search results URL with all filters baked into the query string.
// The page number is appended to it as an extra `Page` parameter.
const realEstateSearchPage = `https://www.myhome.ge/en/s/Apartment-for-sale-Tbilisi?Keyword=Tbilisi&AdTypeID=1&PrTypeID=1&mapC=41.70931%2C44.78487&mapZ=12&mapOp=1&EnableMap=0&regions=687586034.689678147.689701920&districts=2022621279.906139527.1650325628.2185664.5965823289.798496409&cities=1996871&GID=1996871&FCurrencyID=1&FPriceTo=110000&AreaSizeFrom=70&FloorNums=notlast.notfirst&BedRoomNums=2.3&action_map=on&RenovationID=1.5.7`

/**
 * Extracts the listed units from the HTML of one search results page.
 *
 * Cards that are banners/ads, or that lack an id, a date or a link, are
 * skipped.
 *
 * @param body Raw HTML of a search results page.
 * @returns One unit (url, id, creation timestamp in ms) per usable card.
 */
const getUnitsFromPage = (body: string): Unit[] => {
  const $ = load(body)

  // The card shows the date without a year, so the current year is assumed.
  // NOTE(review): around New Year this puts December listings in the future,
  // and the date is parsed in the runtime's time zone — confirm this is fine.
  const year = new Date().getFullYear()

  // Fixed selector: the original ':not(..ado_ban)' (double dot) is not a
  // valid CSS selector.
  const cards = $('.statement-card')
    .filter(':not(.banner)')
    .filter(':not(.ado_ban)')

  return cards
    .toArray()
    .map((card): Unit | undefined => {
      const $card = load(card)
      // The first two ".d-block" elements hold the id text and the date text.
      const [rawId, rawDate] = $card('.d-block')
        .toArray()
        .map((el) => $card(el).text())

      if (!rawId || !rawDate) return

      const [day, monthString, time] = rawDate.split(' ')
      const rawDateWithYear = [day, monthString, year, time].join(' ')

      // The id is the second space-separated token of the id text.
      const id = rawId.split(' ')[1]
      if (!id) return

      const url = $card('a:first').attr('href')
      if (!url) return

      return {
        url,
        id,
        createdAt: new Date(rawDateWithYear).getTime(),
      }
    })
    .filter((unit): unit is Unit => unit !== undefined)
}

/**
 * Walks the paginated search results and collects every unit created after
 * `lastVisitAt`. Stops at the first page that contains no such unit.
 *
 * @param lastVisitAt Timestamp in ms; only units created after it are kept.
 * @returns The collected units, in page order.
 * @throws Error when a page request does not succeed. Failing loudly keeps
 * the caller from recording a visit that actually saw nothing.
 */
const getUnits = async (lastVisitAt: number): Promise<Unit[]> => {
  const units: Unit[] = []

  for (let page = 1; ; page++) {
    const response = await fetch(`${realEstateSearchPage}&Page=${page}`)
    if (!response.ok) {
      throw new Error(
        `Failed to fetch ${sourceName} page ${page}: ${response.status} ${response.statusText}`
      )
    }

    const body = await response.text()
    const newUnits = getUnitsFromPage(body).filter(
      (unit) => unit.createdAt > lastVisitAt
    )
    if (newUnits.length < 1) return units

    units.push(...newUnits)
  }
}

/**
 * Returns the units posted since the previous visit that have not been shown
 * yet, and records them (together with the visit time) in the source state.
 *
 * @returns The not-yet-shown units, without duplicates.
 */
export const getNewRealEstate = async (): Promise<Unit[]> => {
  const stateProvider = new StateProvider(sourceName)
  const state = await stateProvider.get()

  // Stamp the visit before scraping: a unit posted while the pages are being
  // fetched must still count as "new" on the next run.
  const visitStartedAt = Date.now()

  // Without a recorded visit (falsy lastVisitAt), look two days back.
  const since = state.lastVisitAt || visitStartedAt - msInDay * 2

  // The set gives constant-time lookups and also drops a unit that appears
  // on two pages, e.g. when the listing shifts during pagination.
  const seenIds = new Set(state.shown)
  const units = (await getUnits(since)).filter(({ id }) => {
    if (seenIds.has(id)) return false
    seenIds.add(id)
    return true
  })

  await stateProvider.update({
    lastVisitAt: visitStartedAt,
    shown: [...state.shown, ...units.map((unit) => unit.id)],
  })

  return units
}

Once we have the last visit date, we want to get all units posted since that time and filter them to skip already shown ones. After that, we update the state and return these units.

The website has pagination, so we'll use a recursive function that receives units and the page number. First, we fetch the page, then take the body and send it to the scraper function. If there are no new apartments, we exit the recursion.

We scrape the page with the cheerio library that provides the same API as jQuery. The function takes all the cards and converts them to the Unit type with URL, id, and creation date.

After we've collected new apartments, we want to send them to a Telegram chat. Here we get token and chat id from environment variables and send each apartment as a new message.

import { Unit } from './Unit'
import { assertEnvVar } from './utils/assertEnvVar'
import TelegramBot from 'node-telegram-bot-api'

/**
 * Sends the URL of every unit to the configured Telegram chat, one message
 * per unit.
 *
 * Messages are sent one after another rather than all at once, because a
 * burst of concurrent messages to a single chat can be rate limited by
 * Telegram. Every unit is attempted even when an earlier send fails.
 *
 * @param units Units to announce.
 * @returns The messages that were sent, in the order of `units`.
 * @throws The first send error, after all units have been attempted.
 */
export const tellAboutUnits = async (units: Unit[]) => {
  const telegramBotToken = assertEnvVar('TELEGRAM_BOT_TOKEN')
  const telegramChatId = assertEnvVar('TELEGRAM_BOT_CHAT_ID')

  const bot = new TelegramBot(telegramBotToken)

  const sent: Awaited<ReturnType<typeof bot.sendMessage>>[] = []
  const failures: unknown[] = []

  for (const { url } of units) {
    try {
      sent.push(await bot.sendMessage(telegramChatId, url))
    } catch (error: unknown) {
      // Keep going so one failed message does not hide the remaining units.
      failures.push(error)
    }
  }

  if (failures.length > 0) throw failures[0]

  return sent
}