# Source code for pyhousehunter.cleaner
# author: Junting He
# date: 2021-03-05
import pandas as pd
import numpy as np
from geotext import GeoText
from operator import attrgetter
def _first_city(slug):
    """Return the first city GeoText finds in ``slug``, or NaN if there is none.

    Non-string input (the missing value left behind when the URL pattern did
    not match) is mapped to NaN directly, so GeoText is only ever handed text.
    """
    if not isinstance(slug, str):
        return np.nan
    # The slug is lower-case; it is title-cased before the lookup
    # (presumably GeoText expects capitalised names -- confirm against its docs).
    cities = GeoText(slug.title()).cities
    return cities[0] if cities else np.nan


def data_cleaner(scraped_df):
    """Clean web-scraped housing data with Pandas and Regex.

    Parameters
    ----------
    scraped_df : pandas.core.frame.DataFrame
        A dataframe containing web-scraped data. It must contain the columns
        ``price`` (strings such as ``"$1,500"``), ``house_type`` (free text in
        which ``<digits>br`` and ``<digits>ft2`` are searched for) and
        ``listing_url``.

    Returns
    -------
    pandas.core.frame.DataFrame
        A cleaned dataframe with the columns ``listing_url``, ``price``
        (numeric), ``num_bedroom`` and ``area_sqft`` (nullable ``Int64``) and
        ``city``. Values that cannot be extracted from a row are left missing.
        The input dataframe is not modified.

    Raises
    ------
    ValueError
        If ``scraped_df`` is not a dataframe, is empty, or lacks one of the
        required columns.

    Notes
    -----
    The city is looked up from the single lower-case word that follows
    ``d/`` and precedes the first ``-`` in the listing url, so a multi-word
    city slug is looked up by its first word only.

    Examples
    --------
    >>> data_cleaner(scraped_df)
    """
    if not isinstance(scraped_df, pd.DataFrame):
        raise ValueError("Invalid input. Please enter a dataframe object")
    if scraped_df.empty:
        raise ValueError("The input dataframe is empty")
    # Checked in a fixed order so the first missing column is the one reported.
    for column in ("price", "house_type", "listing_url"):
        if column not in scraped_df.columns:
            raise ValueError(f"Missing {column} column in the input dataframe")

    # Work on a copy so the caller's dataframe is left untouched.
    data = scraped_df.copy()

    # Convert the price column into a numeric data type.
    data["price"] = pd.to_numeric(
        data["price"]
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
    )

    # Extract the number of bedrooms; only the digits in front of "br" are
    # captured. Int64 (nullable) keeps rows without a match as missing values.
    bedrooms = data["house_type"].str.extract(r"([0-9]+)br", expand=False)
    data["num_bedroom"] = pd.to_numeric(bedrooms).astype("Int64")

    # Extract the area of the housing the same way, from "<digits>ft2".
    area = data["house_type"].str.extract(r"([0-9]+)ft2", expand=False)
    data["area_sqft"] = pd.to_numeric(area).astype("Int64")

    # Extract the city the housing is located in. Rows whose url does not
    # match, or whose word is not recognised as a city, get a missing value.
    city_slug = data["listing_url"].str.extract(r"d/([a-z]+)-", expand=False)
    data["city"] = city_slug.apply(_first_city)

    # Select the columns which are useful for future filtering.
    cleaned_data = data[["listing_url", "price", "num_bedroom", "area_sqft", "city"]]
    return cleaned_data