import streamlit as st
import pandas as pd
from hasdata import extract_serp_via_api, extract_html_via_api, extract_text_from_html
from google_api import analyze_entities, get_entities_dataframe
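
# Pipeline overview (five cached steps, run in order from the form below):
#   1. stepOne   -- fetch organic SERP results for the keyword via HasData
#   2. stepTwo   -- scrape and extract the text of each ranking page
#   3. stepThree -- extract entities from each page with the Google NLP API
#   4. stepFour  -- extract entities from the target page
#   5. stepFive  -- compare both entity sets and flag "missed" entities
# Note: st.cache_data keys the cache on the function's arguments (and its
# source code), not on module-level globals, so changing only the API keys
# below will not invalidate previously cached results.


# --- Step 1: fetch organic SERP results for the keyword ---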
@st.cache_data
def stepOne(keyword):
    with st.spinner(f"Extracting SERP results for keyword \"{keyword}\""):
        organic_results = extract_serp_via_api(keyword, has_data_api_key)

        serp_results = []
        for result in organic_results:
            serp_results.append({
                "Position": result.get("position"),
                "Source": result.get("source"),
                "Link": result.get("link"),
                "Snippet": result.get("snippet")
            })

        # Create a DataFrame from the list of results
        serp_results_df = pd.DataFrame(serp_results)
        st.write("SERP API Results:")
        st.dataframe(serp_results_df, hide_index=True)

    st.success("SERP Results Extracted!")
    return serp_results_df
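

# --- Step 2: scrape the page content for each SERP result ---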
@st.cache_data
def stepTwo(serp_results_df):
    content_results_placeholder = st.empty()
    content_results_df = pd.DataFrame(columns=["Position", "Link", "Content"])
    progress_bar = st.progress(0, text="Extracting page content from each result...")

    for idx, row in serp_results_df.iterrows():
        html = extract_html_via_api(row.get("Link"), has_data_api_key)
        content = extract_text_from_html(html)
        new_row = {
            "Position": row.get("Position"),
            "Link": row.get("Link"),
            "Content": content
        }
        content_results_df = pd.concat([content_results_df, pd.DataFrame([new_row])], ignore_index=True)

        # Update the placeholder with the new DataFrame
        with content_results_placeholder.container():
            st.write("Page Content Results:")
            st.dataframe(content_results_df, hide_index=True)

        progress_bar.progress((idx + 1) / len(serp_results_df))

    st.success("Page Content Extracted!")
    return content_results_df
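

# --- Step 3: extract entities from each scraped page ---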
@st.cache_data
def stepThree(content_results_df):
    progress_bar = st.progress(0, text="Extracting text entities from each page...")

    # Initialize an empty DataFrame to store combined entities
    combined_entity_df = pd.DataFrame(columns=["Entity", "Salience", "Page Link"])

    st.write("Entity Extraction Results:")

    # Process each page and dynamically show expanders
    for idx, row in content_results_df.iterrows():
        # Extract entities for the current page
        api_response = analyze_entities(row.get("Content"), google_nlp_engine_api_key)
        entity_df = get_entities_dataframe(api_response)

        # Add a column for the page link to track which page each entity comes from
        entity_df["Page Link"] = row.get("Link")

        # Append to the combined DataFrame
        combined_entity_df = pd.concat([combined_entity_df, entity_df], ignore_index=True)

        # Display the expander for the current page
        with st.expander(f"Entities for {row.get('Link')}"):
            st.dataframe(entity_df.drop(columns=["Page Link"]))

        # Update the progress bar
        progress_bar.progress((idx + 1) / len(content_results_df))

    # Display a success message
    st.success("Entity extraction completed for all pages!")
    return combined_entity_df
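

# --- Step 4: extract entities from the target page ---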
@st.cache_data
def stepFour(target_page):
    with st.spinner(f"Extracting entities for target page {target_page}:"):
        html = extract_html_via_api(target_page, has_data_api_key)
        content = extract_text_from_html(html)
        api_response = analyze_entities(content, google_nlp_engine_api_key)
        entity_df = get_entities_dataframe(api_response)
        st.write("Target Page Entities:")
        st.dataframe(entity_df)

    st.success("Entity extraction completed for target page!")
    return entity_df
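

# --- Step 5: compare competitor entities against the target page's entities ---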
@st.cache_data
def stepFive(entities_df, target_page_entity_df):
    with st.spinner("Analyzing entities across pages..."):
        # Extract the top 30 entities for each link based on salience
        top_entities = (
            entities_df.sort_values(by="Salience", ascending=False)
            .groupby("Page Link")
            .head(30)
            .reset_index(drop=True)
        )

        # Flatten target page entities into a set for quick look-up
        target_entities_set = set(target_page_entity_df["Entity"])

        # Create a dictionary to count occurrences and track URLs
        entity_data = {}
        for _, row in top_entities.iterrows():
            entity = row["Entity"]
            page_link = row["Page Link"]

            # Increment count and record the URL for the entity
            if entity not in entity_data:
                entity_data[entity] = {"Count": 0, "URLs": set()}
            entity_data[entity]["Count"] += 1
            entity_data[entity]["URLs"].add(page_link)

        # Prepare the final DataFrame
        final_df_data = []
        for entity, data in entity_data.items():
            row_data = {
                "Entity": entity,
                "Count": data["Count"],
                "URLs": ", ".join(data["URLs"]),  # Convert set of URLs to a string
            }
            final_df_data.append(row_data)

        final_df = pd.DataFrame(final_df_data)

        # Flag entities that never appear on the target page
        final_df["Missed"] = final_df["Entity"].apply(
            lambda entity: entity not in target_entities_set
        )
        sorted_df = final_df.sort_values(by="Count", ascending=False)

        st.write("Final Entity Analysis:")

        # Highlight rows whose entity is missing from the target page
        def highlight_missed(s):
            return ['background-color: orange'] * len(s) if s.Missed else [''] * len(s)

        st.dataframe(sorted_df.style.apply(highlight_missed, axis=1))

    st.success("Entity analysis completed!")
    return sorted_df
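

# --- Streamlit UI: collect inputs, validate them, and run the pipeline ---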
st.title('Page Content Gap Analyzer')

with st.form("setup"):
    st.write("Enter Page URL and Keyword for Analysis")
    page_url = st.text_input('Page URL')
    keyword = st.text_input('Keyword')
    st.write("Enter API Credentials")
    has_data_api_key = st.text_input('HasData API Key')
    google_nlp_engine_api_key = st.text_input('Google NLP Engine API Key')
    setup_form_submit = st.form_submit_button('Start Analysis')

if setup_form_submit and (page_url == '' or keyword == ''):
    raise ValueError('No Page URL or Keyword')
if setup_form_submit and (has_data_api_key == '' or google_nlp_engine_api_key == ''):
    raise ValueError('No HasData API Key or Google NLP Engine API Key')

if setup_form_submit:
    serp_results_df = stepOne(keyword)
    content_results_df = stepTwo(serp_results_df)
    entities_df = stepThree(content_results_df)
    target_page_entity_df = stepFour(page_url)
    final_entity_analysis_df = stepFive(entities_df, target_page_entity_df)
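
# To run the app locally (standard Streamlit entry point; assumes the
# hasdata and google_api helper modules ship alongside this file):
#   streamlit run main.py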