-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
161 lines (134 loc) · 4.89 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import requests
from bs4 import BeautifulSoup
def convert_price(price):
    """
    Convert a price string with a magnitude suffix into a plain number.

    Supports the South Asian units used by zameen.com (Crore, Lakh, Arab)
    as well as Million and Thousand. Comma grouping (e.g. "1,200") is
    stripped before parsing, matching convert_size's behaviour.

    :param price: str, e.g. "5Crore", "2.5Lakh", "1,200"
    :return: int, the rounded price in rupees
    """
    # (suffix, multiplier) pairs; checked in order against the end of the string
    units = (
        ('Crore', 10000000),
        ('Lakh', 100000),
        ('Million', 1000000),
        ('Arab', 1000000000),
        ('Thousand', 1000),
    )
    for suffix, multiplier in units:
        if price.endswith(suffix):
            number = price[:-len(suffix)].replace(",", "")
            return round(float(number) * multiplier)
    # no recognised suffix: treat the whole string as a bare number
    return round(float(price.replace(",", "")))
# Convert Marla, Kanal and Sq. Yd. plot sizes into square feet.
def convert_size(size):
    """
    Convert a plot-size string into square feet.

    :param size: str, e.g. "5Marla", "2Kanal", "100Sq. Yd." or a bare number
    :return: int, the rounded area in square feet
    """
    # (suffix, square-feet-per-unit) pairs; 1 Kanal = 20 Marla = 4500 sqft here
    for suffix, sqft_factor in (('Marla', 225), ('Kanal', 4500), ('Sq. Yd.', 9)):
        if size.endswith(suffix):
            number = size[:-len(suffix)].replace(",", "")
            return round(float(number) * sqft_factor)
    # no recognised unit: the string is already a square-feet number
    return round(float(size))
def text(tag, datatype="str"):
    """
    Safely extract and convert the text of a BeautifulSoup tag.

    :param tag: tag object, or None when the selector found nothing
    :param datatype: one of "num", "str", "price", "size"
    :return: the converted value, or a type-appropriate default when
             tag is None (0 / "" / 0.0); unparseable "num" text yields 0
    """
    # missing tag: fall back to a neutral default for the requested type
    if tag is None:
        return {"num": 0, "str": "", "price": 0.0, "size": 0.0}.get(datatype)
    raw = tag.text.strip()
    if datatype == "num":
        try:
            return int(raw)
        except ValueError:
            return 0
    if datatype == "str":
        return raw
    if datatype == "price":
        return convert_price(raw)
    if datatype == "size":
        return convert_size(raw)
def scrap(city, pages_range):
    """
    Scrape zameen.com listing pages for one city and collect house data.

    :param city: str, the "<Name>-<id>" slug used in zameen.com URLs
    :param pages_range: int, maximum number of listing pages to fetch
    :return: list of dicts with keys location, price, bedrooms, baths, size
    """
    house_info = []
    for page_number in range(1, pages_range + 1):
        url = f'https://www.zameen.com/Homes/{city}-{page_number}.html'
        print(url)
        # timeout so a stalled connection cannot hang the scraper forever
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        house_list = soup.select("main > div > div > div > div > ul > li")
        # remember the list length before this page so we can tell
        # whether the page contributed any listings at all
        prev_len = len(house_info)
        for house in house_list:
            baths = house.select_one("span[aria-label='Baths']")
            beds = house.select_one("span[aria-label='Beds']")
            location = house.select_one("div[aria-label='Location']")
            price = house.select_one("span[aria-label='Price']")
            size = house.select_one("div[title]>div > div > span:nth-child(1)")
            if price:
                # some listing layouts put the size in a sibling of the
                # location element instead of the usual spot
                if size is None:
                    size = location.parent.select_one(
                        "div:nth-child(2) > div > span:nth-child(3)")
                house_info.append(
                    {
                        "location": text(location),
                        "price": text(price, datatype="price"),
                        "bedrooms": text(beds, datatype="num"),
                        "baths": text(baths, datatype="num"),
                        "size": text(size, datatype="size")
                    }
                )
        # stop once a page yields no listings: the page does not exist,
        # and later pages will not exist either, so skip useless requests
        if len(house_info) == prev_len:
            break
    return house_info
if __name__ == "__main__":
    house_info = []
    # zameen.com city slugs are "<Name>-<id>"; the ids come from the site's URLs
    cities = [
        {'id': 1, 'name': 'Lahore'},
        {'id': 2, 'name': 'Karachi'},
        {'id': 3, 'name': 'Islamabad'},
        {'id': 15, 'name': 'Multan'},
        {'id': 16, 'name': 'Faisalabad'},
        {'id': 17, 'name': 'Peshawar'},
        {'id': 18, 'name': 'Quetta'},
        {'id': 41, 'name': 'Rawalpindi'},
        {'id': 36, 'name': 'Murree'},
        {'id': 327, 'name': 'Gujranwala'},
        {'id': 1233, 'name': 'Attock'},
        {'id': 3234, 'name': '2_FECHS'},
    ]
    for city in cities:
        # change 100 to the number of listing pages you want to scrape per city
        house_info.append(
            {
                "city": city.get('name'),
                "info": scrap(f"{city.get('name')}-{city.get('id')}", 100)
            }
        )
    # explicit encoding so non-ASCII location names don't crash on platforms
    # whose default encoding is not UTF-8
    with open("zameen.csv", "w", encoding="utf-8") as f:
        # pipe-delimited header; '|' is used because locations contain commas
        f.write("city|location|price|bedrooms|baths|size\n")
        for house in house_info:
            for info in house.get('info'):
                f.write(
                    f"{house.get('city')}|{info.get('location')}|{info.get('price')}|{info.get('bedrooms')}|{info.get('baths')}|{info.get('size')}\n")