Source code for pyhousehunter.scraper

# author: Alex Truong Hai Yen
# date: 2021-02-25

from bs4 import BeautifulSoup
import requests
import pandas as pd
import regex as re


# Accepted Craigslist search-page URLs. Dots are escaped so that they only
# match a literal "." and not an arbitrary character.
_CRAIGSLIST_URL_PATTERN = (
    r"(http|https):\/\/vancouver\.craigslist\.org"
    r"\/d\/apartments-housing-for-rent\/search\/apa.*"
)

# HTML snapshot parsed when ``online`` is False (relative to the working dir).
_LOCAL_LISTINGS_PATH = "pyhousehunter/temp/van_housing_listings.html"

# Upper bound (seconds) on the HTTP request so a stalled server cannot hang
# the caller forever.
_REQUEST_TIMEOUT_SECONDS = 30

_LISTING_COLUMNS = [
    "listing_id",
    "listing_url",
    "price",
    "house_type",
    "neighborhood",
]


def _span_text(listing, css_class):
    """Return the text of the first ``<span>`` with *css_class*, or ``""``."""
    span = listing.find("span", attrs={"class": css_class})
    return "" if span is None else span.text


def _parse_listing(listing):
    """Build the (id, url, price, house type, neighborhood) row for one listing."""
    link = listing.find("a")
    listing_id = link.get("data-id")
    listing_url = link.get("href")

    # Not every listing carries a price; fall back to "" like the other
    # optional fields instead of failing on a missing tag.
    price = _span_text(listing, "result-price")

    # Remove all spaces/newlines, then drop the final character
    # (presumably a trailing separator such as "-" -- confirm against the
    # live markup).
    house_type = (
        _span_text(listing, "housing").strip().replace(" ", "").replace("\n", "")[:-1]
    )

    # The neighborhood is rendered in parentheses; keep only the name.
    neighborhood = (
        _span_text(listing, "result-hood").strip().replace("(", "").replace(")", "")
    )

    return (listing_id, listing_url, price, house_type, neighborhood)


def scraper(url, online=False):
    """Scrape housing data from a given Craigslist URL.

    Parameters
    ----------
    url : str
        The Craigslist housing URL to scrape the data from. It must start
        with
        ``https://vancouver.craigslist.org/d/apartments-housing-for-rent/search/apa``
        (``http`` is accepted as well).
    online : bool
        Whether the data is scraped directly online from the url
        (default = False). False means the data is scraped from the local
        HTML file ``pyhousehunter/temp/van_housing_listings.html``.

    Returns
    -------
    pandas.core.frame.DataFrame
        A dataframe with one row per listing and the columns
        ``listing_id``, ``listing_url``, ``price``, ``house_type`` and
        ``neighborhood``. Fields missing from a listing are empty strings.

    Raises
    ------
    TypeError
        If ``url`` is not a string or ``online`` is not a bool.
    ValueError
        If ``url`` is not a Vancouver Craigslist housing search URL.

    Examples
    --------
    >>> scraper(url = 'https://vancouver.craigslist.org/d/apartments-housing-for-rent/search/apa')
    """
    # PART 0: input validation
    if not isinstance(url, str):
        raise TypeError(
            "Wrong data type. Please enter a correct Craiglist Housing URL"
        )
    # ``match`` anchors at the start, so the URL itself must have the
    # expected form rather than merely containing it somewhere.
    if re.match(_CRAIGSLIST_URL_PATTERN, url) is None:
        raise ValueError(
            "Invalid URL. \nPlease enter a Craiglist Housing URL with this format\n"
            "https://vancouver.craigslist.org/d/apartments-housing-for-rent/search/apa"
        )
    if not isinstance(online, bool):
        raise TypeError("Please enter Boolean value: True or False")

    # PART 1: create the soup object either from the url or the local file
    if online:
        headers = {
            "DNT": "1",
            "Referer": "https://vancouver.craigslist.org/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        }
        page = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT_SECONDS)
        # Report the scrape status; the body is parsed regardless of it.
        if page.status_code == 200:
            print("OK")
        else:
            print(page.status_code)
        soup = BeautifulSoup(page.text, "html.parser")
    else:
        # Context manager guarantees the file handle is released.
        with open(_LOCAL_LISTINGS_PATH) as html_file:
            soup = BeautifulSoup(html_file, "html.parser")

    # PART 2: extract information from the scrape results into a dataframe
    listings = soup.find_all("div", attrs={"class": "result-info"})
    data = [_parse_listing(listing) for listing in listings]
    return pd.DataFrame(data, columns=_LISTING_COLUMNS)