forked from ethanol-cx/python-webscraper-redfin
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
125 lines (114 loc) · 5.94 KB
/
scraper.py
File metadata and controls
125 lines (114 loc) · 5.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import requests
from house import House
from random import uniform
import time
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
class Scraper():
    """
    Scraper class contains the basic information that is needed for scraping.
    Each Scraper instance holds its own HTTP session as well as the houses that it has obtained from the webpage.
    """
    # max_iter defines the max number of request attempts. If the max number of attempts is reached, the requests might be blocked.
    max_iter = 15
    user_agent_header = {
        'User-agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'}
    # column names (and order) of the ``houses`` DataFrame
    house_columns = ['id', 'address', 'status', 'date',
                     'lastListedPrice', 'numBed', 'numBath', 'size']

    def __init__(self, url='https://www.redfin.com/city/11203/CA/Los-Angeles/'):
        """
        Constructor of the Scraper class
        Keyword Arguments:
            url {str} -- [The url corresponding to the city that user chooses from the beginning] (default: {'https://www.redfin.com/city/11203/CA/Los-Angeles/'})
        """
        self.url = url
        self.session = requests.Session()
        # one row per scraped house, see ``house_columns``
        self.houses = pd.DataFrame(columns=self.house_columns)

    def random_sleep(self):
        """
        Sleeps for a random amount of time (between 0 and 10 seconds) to avoid being blocked by the site
        Returns:
            None
        """
        time.sleep(uniform(0, 10))

    def parse_status_dates(self, statusDate):
        """
        Parses the status (i.e. 'SOLD', 'SOLD BY REDFIN', etc.) and the date from the statusDate string that is displayed on the site
        Arguments:
            statusDate {str} -- the status followed by a date formatted like 'Jun 5, 2019'
        Raises:
            ValueError -- if the last three whitespace-separated tokens do not form a '%b %d, %Y' date
        Returns:
            list -- a list with two elements: [status (str), date (datetime)]
        """
        # split on any run of whitespace so repeated spaces do not leak into the status or the date
        tokens = statusDate.split()
        status = ' '.join(tokens[:-3])
        date = datetime.strptime(' '.join(tokens[-3:]), '%b %d, %Y')
        return [status, date]

    def get_page_soup(self, url):
        """
        Given the url, it returns the BeautifulSoup object containing the page source of that url.
        Arguments:
            url {str} -- the url linking to the page that the program is going to scrape
        Raises:
            Exception -- if no attempt out of max_iter returned HTTP 200
        Returns:
            BeautifulSoup -- the parsed page
        """
        # sleep to avoid detection
        self.random_sleep()
        resp = None
        last_error = None
        # repeat the request up to max_iter times to ride out packet loss or network glitches
        for _ in range(self.max_iter):
            self.random_sleep()
            try:
                # NOTE(review): verify=False disables TLS certificate verification; kept to preserve
                # the existing behaviour, but it should be confirmed that this is really required.
                resp = self.session.get(
                    url, headers=self.user_agent_header, verify=False)
            except requests.exceptions.RequestException as err:
                # a transport-level failure is exactly the glitch the retry loop exists for
                last_error = err
                print('ERROR while requesting {}: {}'.format(url, err))
                continue
            if resp.status_code == 200:
                return BeautifulSoup(resp.text, 'html.parser')
            print('ERROR with status code {}'.format(resp.status_code))
        if resp is not None:
            print('HTTP response body {}'.format(resp.text))
        raise Exception(
            'Request failed {} times. It is probably blocked.'.format(self.max_iter)) from last_error

    def search_houses(self, query):
        """
        Given the query string (i.e. sold-6mo), we search the houses from Redfin and add them to self.houses.
        Arguments:
            query {str} -- a query string that acts as the filter of the search
        Raises:
            Exception -- if a page cannot be fetched or its content cannot be parsed
        Returns:
            None
        """
        # strip trailing slashes so that the default url does not produce '//filter'
        url = self.url.rstrip('/') + '/filter/include={}'.format(query)
        soup = self.get_page_soup(url)
        # first finds the number of pages in the search list; without a pager there is a single page
        pageLabels = soup.find_all('span', attrs={'class': 'pageText'})
        numPages = int(pageLabels[0].text.split()[-1]) if pageLabels else 1
        # loop through every page of the search result
        for i in range(numPages):
            pageUrl = url
            # if this is not the first iteration, go to the next page of the search results
            if i != 0:
                pageUrl = url + '/page-{}'.format(i + 1)
                soup = self.get_page_soup(pageUrl)
                self.random_sleep()
            # get the corresponding information on the page
            try:
                ids = [tag['href'].split('/')[-1]
                       for tag in soup.find_all('a', attrs={'class': 'cover-all'})]
                addresses = [tag.text for tag in soup.find_all(
                    'span', attrs={'data-rf-test-id': 'abp-streetLine'})]
                statusDates = [self.parse_status_dates(tag.text) for tag in soup.find_all(
                    'span', attrs={'class': 'HomeSash font-weight-bold roundedCorners'})]
                prices = [tag.text for tag in soup.find_all(
                    'span', attrs={'class': 'homecardV2Price'})]
                # iterating a tag yields its direct children, i.e. one entry per stat of the house
                stats = [[child.text for child in singleHouseStats]
                         for singleHouseStats in soup.find_all('div', attrs={'class': 'HomeStatsV2'})]
            except Exception as err:
                raise Exception(
                    "Exception occurred when parsing the information from the page {}.The page might have been changed and the scraping script is probably not updated.".format(pageUrl)) from err
            print("Finished page {}/{} of the results".format(i + 1, numPages))
            # store everything we obtained from this page in the dataframe
            self._store_rows(self._collect_rows(
                ids, addresses, statusDates, prices, stats))

    @staticmethod
    def _collect_rows(ids, addresses, statusDates, prices, stats):
        """
        Combines the parallel lists scraped from one page into one dict per house.
        A value that is missing for a house (shorter list, or stats that are not exactly
        bed/bath/size) is stored as None instead of reusing the previous house's value.
        Arguments:
            ids {list} -- house ids; defines how many rows are produced
            addresses {list} -- street addresses
            statusDates {list} -- [status, date] pairs
            prices {list} -- price strings
            stats {list} -- per-house lists expected to hold [bed, bath, size]
        Returns:
            list -- a list of dicts keyed by the dataframe column names
        """
        def pick(values, index):
            return values[index] if index < len(values) else None

        rows = []
        for j, houseId in enumerate(ids):
            statusDate = pick(statusDates, j)
            status, date = statusDate if statusDate is not None else (None, None)
            houseStats = pick(stats, j)
            if houseStats is not None and len(houseStats) == 3:
                bed, bath, size = houseStats
            else:
                bed = bath = size = None
            rows.append({'id': houseId, 'address': pick(addresses, j), 'status': status, 'date': date,
                         'lastListedPrice': pick(prices, j), 'numBed': bed, 'numBath': bath, 'size': size})
        return rows

    def _store_rows(self, rows):
        """
        Appends the given rows to self.houses in a single concatenation.
        Arguments:
            rows {list} -- a list of dicts keyed by the dataframe column names
        Returns:
            None
        """
        if not rows:
            return
        newHouses = pd.DataFrame(rows, columns=self.house_columns)
        if self.houses.empty:
            # nothing to keep from the empty placeholder frame
            self.houses = newHouses
        else:
            # DataFrame.append was removed in pandas 2.0; concat is the supported replacement
            self.houses = pd.concat(
                [self.houses, newHouses], ignore_index=True)