scraper.py
import pandas
import requests
from bs4 import BeautifulSoup

# Base URL for the paginated listings; the page offset is appended to it below.
# Replace it with your own starting URL if scraping a different site.
base_url = "http://www.pythonhow.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/t=0&s="
listall = []

# Fetch the landing page once to find out how many result pages there are.
rtemp = requests.get("http://www.pythonhow.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/")
souptemp = BeautifulSoup(rtemp.content, "html.parser")
page_number = souptemp.find_all("a", {"class": "Page"})[-1].text

# Each page shows 10 listings, so the page URLs step through offsets 0, 10, 20, ...
for page in range(0, int(page_number) * 10, 10):
    req = requests.get(base_url + str(page) + ".html")
    soup = BeautifulSoup(req.content, "html.parser")
    properties = soup.find_all("div", {"class": "propertyRow"})
    for prop in properties:
        entry = {}
        entry["Address"] = prop.find_all("span", {"class": "propAddressCollapse"})[0].text.replace("\n", "")
        entry["Address State"] = prop.find_all("span", {"class": "propAddressCollapse"})[1].text.replace("\n", "")
        entry["Price"] = prop.find("h4", {"class": "propPrice"}).text.replace("\n", "").replace(" ", "")
        # Not every listing has these fields; find() returns None when a tag is
        # missing, so the chained .find("b") raises AttributeError.
        try:
            entry["Beds"] = prop.find("span", {"class": "infoBed"}).find("b").text
        except AttributeError:
            entry["Beds"] = None
        try:
            entry["Sq Ft"] = prop.find("span", {"class": "infoSqFt"}).find("b").text
        except AttributeError:
            entry["Sq Ft"] = None
        try:
            entry["Baths"] = prop.find("span", {"class": "infoValueFullBath"}).find("b").text
        except AttributeError:
            entry["Baths"] = None
        try:
            entry["Half Baths"] = prop.find("span", {"class": "infoValueHalfBath"}).find("b").text
        except AttributeError:
            entry["Half Baths"] = None
        # The lot size is stored as a feature-group / feature-name pair.
        for col_grp in prop.find_all("div", {"class": "columnGroup"}):
            for feat_grp, feat_name in zip(col_grp.find_all("span", {"class": "featureGroup"}),
                                           col_grp.find_all("span", {"class": "featureName"})):
                if "Lot Size" in feat_grp.text:
                    entry["Lot size"] = feat_name.text.replace(",", "")
        listall.append(entry)

dataframe = pandas.DataFrame(listall)
dataframe.to_csv("Scraped_info.csv")
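
# Optional sanity check (a minimal sketch, not part of the original scraper):
# read the CSV written above back into pandas and print a quick preview to
# confirm the columns and row count look reasonable. Assumes Scraped_info.csv
# sits in the current working directory, as produced by the lines above.
scraped = pandas.read_csv("Scraped_info.csv")
print(scraped.shape)   # (number of listings, number of columns)
print(scraped.head())  # first few scraped rows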