Source code for pyhousehunter.scraper

# author: Alex Truong Hai Yen
# date: 2021-02-25

from bs4 import BeautifulSoup
import requests
import pandas as pd
import regex as re


# Prefix every accepted URL must start with. The dots are escaped so they only
# match a literal "." and cannot be satisfied by an arbitrary character.
_CRAIGSLIST_URL_PATTERN = (
    r"https?://vancouver\.craigslist\.org"
    r"/d/apartments-housing-for-rent/search/apa"
)

# HTML snapshot parsed when ``online`` is False (relative to the working dir).
_LOCAL_LISTINGS_FILE = "pyhousehunter/temp/van_housing_listings.html"

# Upper bound, in seconds, on how long the HTTP request may block.
_REQUEST_TIMEOUT_SECONDS = 30


def _parse_listing(listing):
    """Return (id, url, price, house_type, neighborhood) for one listing tag."""
    anchor = listing.find("a")
    listing_id = anchor.get("data-id") if anchor is not None else None
    listing_url = anchor.get("href") if anchor is not None else None

    # Missing optional elements fall back to "" instead of raising
    # AttributeError on ``None.text``.
    price_tag = listing.find("span", attrs={"class": "result-price"})
    price = price_tag.text if price_tag is not None else ""

    housing_tag = listing.find("span", attrs={"class": "housing"})
    if housing_tag is not None:
        # Remove all spaces/newlines, then drop the final character
        # (presumably a trailing "-" separator; confirm against the markup).
        house_type = (
            housing_tag.text.strip().replace(" ", "").replace("\n", "")[:-1]
        )
    else:
        house_type = ""

    hood_tag = listing.find("span", attrs={"class": "result-hood"})
    if hood_tag is not None:
        neighborhood = hood_tag.text.strip().replace("(", "").replace(")", "")
    else:
        neighborhood = ""

    return listing_id, listing_url, price, house_type, neighborhood


def scraper(url, online=False):
    """Scrape housing data from a Vancouver Craigslist listing page.

    Parameters
    ----------
    url : str
        Craigslist housing URL. It must start with
        ``http(s)://vancouver.craigslist.org/d/apartments-housing-for-rent/search/apa``.
        It is validated even when ``online`` is False.

    online : bool
        If True, download the page at ``url``. If False (default), parse the
        local HTML file ``pyhousehunter/temp/van_housing_listings.html``,
        resolved relative to the current working directory.

    Returns
    -------
    pandas.core.frame.DataFrame
        One row per ``div.result-info`` element, with columns ``listing_id``,
        ``listing_url``, ``price``, ``house_type`` and ``neighborhood``.
        Price, house type and neighborhood are empty strings when the
        corresponding element is absent from a listing.

    Raises
    ------
    TypeError
        If ``url`` is not a string or ``online`` is not a bool.
    ValueError
        If ``url`` does not start with the expected Craigslist housing URL.

    Examples
    --------
    >>> scraper(
    ...     url="https://vancouver.craigslist.org/d/apartments-housing-for-rent/search/apa",
    ...     online=True,
    ... )  # doctest: +SKIP
    """
    # PART 0: input validation
    if not isinstance(url, str):
        raise TypeError(
            "Wrong data type. Please enter a correct Craiglist Housing URL"
        )

    # ``match`` anchors at the start of the string, so the Craigslist URL
    # cannot merely appear somewhere inside an unrelated URL.
    if re.match(_CRAIGSLIST_URL_PATTERN, url) is None:
        raise ValueError(
            "Invalid URL. Please enter a Craiglist Housing URL with this format\n"
            "https://vancouver.craigslist.org/d/apartments-housing-for-rent/search/apa"
        )

    if not isinstance(online, bool):
        raise TypeError("Please enter Boolean value: True or False")

    # PART 1: create the soup object either from the url or the local file
    if online:
        headers = {
            "DNT": "1",
            "Referer": "https://vancouver.craigslist.org/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        }
        # Without a timeout, requests may wait on the server indefinitely.
        page = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS)

        # Report the scrape status; the body is parsed whatever the code is.
        if page.status_code == 200:
            print("OK")
        else:
            print(page.status_code)

        soup = BeautifulSoup(page.text, "html.parser")
    else:
        # Context manager guarantees the file handle is closed after parsing.
        with open(_LOCAL_LISTINGS_FILE) as html_file:
            soup = BeautifulSoup(html_file, "html.parser")

    # PART 2: extract the listing fields into a dataframe
    listings = soup.find_all("div", attrs={"class": "result-info"})
    data = [_parse_listing(listing) for listing in listings]

    return pd.DataFrame(
        data,
        columns=["listing_id", "listing_url", "price", "house_type", "neighborhood"],
    )
Source code for pyhousehunter.scraper

Python House Hunter

Navigation