Skip to content

Commit

Permalink
updated parser
Browse files: browse the repository at this point in the history
  • Loading branch information
atbuy committed Jan 20, 2022
1 parent 00dca01 commit d42a63c
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 9 deletions.
2 changes: 1 addition & 1 deletion pygsearch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
__author__ = "Vitaman02"
__license__ = "MIT"
__copyright__ = "Copyright 2022-present Vitaman02"
__version__ = "0.4.3"
__version__ = "0.5.1"


VersionInfo = namedtuple("VersionInfo", "major minor patch")
Expand Down
31 changes: 24 additions & 7 deletions pygsearch/gsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,39 +42,56 @@ def __init__(self, query: str = None, num: int = 10, lang: str = "en", headers:

# If there is a query passed to the class, then search it and save the results
if query:
self.results = self.search(self.query, self.num, self.lang, self.headers, self.proxies)
self.results = list(self.search(self.query, self.num, self.lang, self.headers, self.proxies))

def search(self, query: str, results: int = 10, lang: str = "en", headers: dict = None, proxies: Dict[str, str] = None) -> List[SearchResult]:
cleaned_query = query.replace(" ", "+")

# Use default headers, if none are passed
if not headers:
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}

# Google search url
url = f"https://www.google.com/search?q={query}&num={results+1}&hl={lang}"
url = f"https://www.google.com/search?q={cleaned_query}&num={results*2+1}&hl={lang}"
response = requests.get(url, headers=headers, proxies=proxies)
soup = BeautifulSoup(response.text, "lxml")

# Get all results
elements = soup.find_all("div", {"class": "g"})

# Parse each result and return the list of SearchResults
out = []
seen = []
counter = 0
for element in elements:
if counter == results:
break

has_content = element.find_all("div", {"class": "g"})
if has_content:
element = has_content[0]

title = element.find("h3")
if title:
title = title.get_text()

link = element.find("a")
link = element.find("a", href=True)
if link:
link = link.get("href")

if not link.startswith("http"):
continue

description = element.find("div", {"data-content-feature": "1"})
descr2 = element.find("div", {"class": "IsZvec"})
if description:
description = description.get_text()
elif descr2:
description = descr2.get_text()

out.append(SearchResult(title, link, description))
if not link in seen:
yield SearchResult(title, link, description)
counter += 1

return out
seen.append(link)

def __iter__(self):
self.index = 0
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
author="Vitaman02",
url="https://github.com/Vitaman02/pygsearch",
project_urls={},
version="0.4.3",
version="0.5.1",
packages=find_packages(),
license="MIT",
description="Python library to get google search results.",
Expand Down

0 comments on commit d42a63c

Please sign in to comment.