main_page.py
import datetime
import re

import pandas as pd
import plotly.express as px
import streamlit as st
import unidecode
from serpapi import GoogleSearch
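
# JobJob: a small Streamlit app that queries SerpApi's google_jobs engine,
# filters the returned offers, and plots a quick overview of the results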
# Page header
st.image("https://ygdata.ch/portfolio-details-pages/media/JobJob.png", width=100)
st.title("JobJob!")
st.divider()
st.write("### Let's find your dream job! Exciting, isn't it?")
st.sidebar.write("Enter your filters")
# Ask for the search query; unidecode strips accents and everything is lowercased
search_term = unidecode.unidecode(
    st.sidebar.text_input("Job you're looking for:", "Data Analyst").lower()
)
search_location = unidecode.unidecode(
    st.sidebar.text_input(
        "Location you're looking for:", "Geneva, Switzerland"
    ).lower()
)
# Two side-by-side inputs in the sidebar
left_col, right_col = st.sidebar.columns(2)
# Left side
domain_country = unidecode.unidecode(
    left_col.text_input("Country code:", "ch").lower()
)
# Right side
results_number = right_col.number_input(
    "Maximum results:", min_value=1, max_value=500, value=70
)
search_radius = st.sidebar.slider(
    "Maximum radius (km):", min_value=1, max_value=500, value=20
)
words_toban = st.sidebar.text_input(
    "Technologies to ban (comma-separated):", "C,Go,JavaScript"
)
api_key = st.sidebar.text_input(
    "API key from SerpApi 🔑:", "A1bcD23eF4ghij56APIKEYA1bcD23eF4ghij56"
)
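# The prefilled key above is only a placeholder; a real SerpApi key is
# required for the search to run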
# Pass mode for developers: read the key from a local file instead
if api_key == "ImJesus":
    with open("../API keys/serpapi.txt", "r") as key_file:
        api_key = key_file.read().strip()
# Keep only today's posts if asked (google_jobs "chips" filter)
today_post = st.sidebar.checkbox("Today's posts only")
chips_filter = "date_posted:today" if today_post else ""
# Build the list of technologies to filter out; the untouched default
# (or an empty field) means no filtering
if words_toban in ("C,Go,JavaScript", ""):
    banned_words = []
else:
    banned_words = words_toban.replace(" ", "").split(",")
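# e.g. "Java, PHP" -> ["Java", "PHP"]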
# Run only when the button is clicked
if st.sidebar.button("Let's GO!"):
    # ---------------------------------------------
    # START SCRAPING PART
    # Create a text element and let the reader know the data is loading
    data_load_state = st.text("Loading data...")
    pages = []  # One dataframe per fetched result page
    # google_jobs returns up to 10 offers per page, so round the page
    # count up to cover the requested number of results
    for num in range(-(-results_number // 10)):
        if num == 0:
            next_page_token = ""
        else:
            # Pagination token from the previous response
            next_page_token = results.get("serpapi_pagination", {}).get(
                "next_page_token", ""
            )
        params = {
            "api_key": api_key,
            "device": "desktop",
            "engine": "google_jobs",
            # "google_domain": "google.com",
            "q": search_term,  # Lowercase forced
            # "hl": "en",  # The language parameter can return no results
            "gl": domain_country,  # Country of the Google domain, lowercase forced
            "lrad": search_radius,
            "location": search_location,
            "chips": chips_filter,
            "next_page_token": next_page_token,
        }
        search = GoogleSearch(params)
        results = search.get_dict()
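        # get_dict() returns the parsed JSON response as a plain dict; when a
        # query fails, SerpApi replaces "jobs_results" with an "error" message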
        # Stop or skip when the response reports an error instead of results
        if "error" in results:
            if results["error"] == "Google hasn't returned any results for this query.":
                # Last page reached: no more offers to fetch
                st.write(results["error"])
                break
            elif (
                results["error"]
                == "Invalid API key. Your API key should be here: https://serpapi.com/manage-api-key"
            ):
                data_load_state.text("Can't run: Check your API key 🔑")
                break
            else:
                # Unexpected error on this page: try the next one
                continue
        # Build a dataframe from this page's offers and flatten the nested
        # "detected_extensions" field into real columns
        jobs = pd.DataFrame(results["jobs_results"])
        jobs = pd.concat(
            [jobs, pd.json_normalize(jobs["detected_extensions"])], axis=1
        )
        # Add the request time (UTC)
        jobs["date_time"] = datetime.datetime.now(datetime.timezone.utc).strftime(
            "%d-%m-%Y %H:%M"
        )
        pages.append(jobs)
    # END SCRAPING PART
    # ---------------------------------------------
    # If nothing was collected, ask the reader to adjust the filters
    if not pages:
        st.write("Change parameters.")
    # Otherwise combine the pages and clean them
    else:
        jobs_all = pd.concat(pages, ignore_index=True)
        jobs_all["search_term"] = search_term
        jobs_all["search_location"] = search_location
        # Notify the reader that the data was successfully loaded
        data_load_state.text("Success: Data Loaded!")
        # ---------------------------------------------
        # START of CLEANING PART
        # Columns that bring nothing to the displayed table
        to_drop = [
            "detected_extensions",
            "extensions",
            "apply_options",
            "job_id",
            "thumbnail",
            "search_term",
            "search_location",
        ]
        # Drop each column only if it exists
        for col in to_drop:
            if col in jobs_all.columns:
                jobs_all.drop(columns=[col], inplace=True)
        # Drop duplicate offers
        jobs_all.drop_duplicates(subset="description", inplace=True)
        # Drop offers mentioning specific words (technologies)
        def find_words(sentence, words):
            """
            Input : a sentence and the list of banned words
            Output : True if the offer is kept, False if a banned word appears
            Do : normalize the text and compare each word of the sentence
            with the banned words
            """
            # Token-level match, so banning "C" does not reject every
            # description that merely contains the letter c
            tokens = re.findall(r"[a-zA-Z+#]+", sentence.lower())
            return not any(word.lower() in tokens for word in words)
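        # For example:
        #   find_words("Senior Go developer (Python)", ["Go"])    -> False
        #   find_words("Data Analyst, SQL & Python", ["Go", "C"]) -> True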
        # Count the rows before applying the banned-words filter
        total_rows = jobs_all.shape[0]
        # Keep only the offers free of banned words
        if banned_words:
            jobs_all = jobs_all[
                jobs_all["description"].apply(lambda a: find_words(a, banned_words))
            ]
        # Count the filtered-out rows
        rows_deleted = total_rows - jobs_all.shape[0]
        # END of CLEANING PART
        # ---------------------------------------------
        # Report the kept and deleted rows
        st.write(
            f"Total row(s): {jobs_all.shape[0]}. "
            f"Deleted row(s) due to banned words: {rows_deleted}."
        )
        # Show the dataframe
        st.dataframe(jobs_all)
        # Plot the number of offers per location, colored by source platform
        fig = px.histogram(jobs_all, x="location", color="via")
        st.plotly_chart(fig)