Skip to content

Commit

Permalink
added search settings for google selenium mode
Browse files Browse the repository at this point in the history
  • Loading branch information
Nikolai Tschacher committed Aug 29, 2018
1 parent cb49803 commit 44862b1
Show file tree
Hide file tree
Showing 72 changed files with 11,599 additions and 3,178 deletions.
14 changes: 13 additions & 1 deletion GoogleScraper/scrape_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,18 @@
# The google base search url
google_search_url = 'https://www.google.com/search?'

# Search Settings for Google Scraping in Selenium Mode
# Number of results shown per search result page.
# Valid values (matching Google's preferences slider): 10, 20, 30, 50, 100
google_selenium_num_results = 100
# Whether to enable personalized ("private") results.
# Private results help find more relevant content for you, including content
# and connections that only you can see.
google_selenium_personalization = False
# Region for search results; use a country code such as US, DE, GB, CH, ...
google_selenium_region = 'DE'
# Whether Google's SafeSearch filtering is switched on.
google_selenium_safe_search = False
# the language for google search results
google_selenium_language = 'English'


# The yandex base search url
yandex_search_url = 'http://yandex.ru/yandsearch?'

Expand Down Expand Up @@ -275,7 +287,7 @@
# Set to False to disable.
# If the captcha isn't solved in the specified time interval, the browser instance
# with the current proxy is discarded.
manual_captcha_solving = False
manual_captcha_solving = True

# Xvfb display option
# You should start xvfb on your own before this option has any effect.
Expand Down
103 changes: 91 additions & 12 deletions GoogleScraper/selenium_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,16 @@ def _get_Firefox(self):

return False

def handle_request_denied(self, status_code):

def malicious_request_detected(self):
    """Report whether the current page looks like an abuse-detection page.

    Looks up the marker strings ("needles") configured for the active
    search engine and checks that the current URL contains the URL marker
    and the page source contains the HTML marker.

    Returns:
        The falsy needle value itself when no needles are configured for
        this engine, otherwise a bool: True iff both markers are present.
    """
    markers = self.malicious_request_needles[self.search_engine_name]

    # No markers configured for this engine: return the falsy value as-is
    # (mirrors short-circuit `and` semantics).
    if not markers:
        return markers

    # Check the cheap URL marker first; only touch page_source when it hits,
    # preserving the original short-circuit order of webdriver accesses.
    if markers['inurl'] not in self.webdriver.current_url:
        return False
    return markers['inhtml'] in self.webdriver.page_source

def handle_request_denied(self):
"""Checks whether Google detected a potentially harmful request.
Whenever such potential abuse is detected, Google shows a captcha.
Expand All @@ -297,29 +306,22 @@ def handle_request_denied(self, status_code):
# selenium webdriver objects have no status code :/
super().handle_request_denied('400')

needles = self.malicious_request_needles[self.search_engine_name]

if needles and needles['inurl'] in self.webdriver.current_url \
and needles['inhtml'] in self.webdriver.page_source:

if self.malicious_request_detected():
if self.config.get('manual_captcha_solving', False):
with self.captcha_lock:
import tempfile

tf = tempfile.NamedTemporaryFile('wb')
tf.write(self.webdriver.get_screenshot_as_png())
import webbrowser

webbrowser.open('file://{}'.format(tf.name))
solution = input('enter the captcha please...')
self.webdriver.find_element_by_name('submit').send_keys(solution + Keys.ENTER)
solution = input('Please solve the captcha in the browser! Enter any key when done...')
tf.close()
try:
self.search_input = WebDriverWait(self.webdriver, 5).until(
self.search_input = WebDriverWait(self.webdriver, 7).until(
EC.visibility_of_element_located(self._get_search_input_field()))
except TimeoutException:
raise MaliciousRequestDetected('Requesting with this ip is not possible at the moment.')
tf.close()

else:
# Just wait until the user solves the captcha in the browser window
# 10 hours if needed :D
Expand Down Expand Up @@ -371,6 +373,7 @@ def find_visible_search_input(driver):
logger.error('{}: TimeoutException waiting for search input field: {}'.format(self.name, e))
return False


def _wait_until_search_param_fields_appears(self, max_wait=5):
"""Waits until the search input field contains the query.
Expand Down Expand Up @@ -680,6 +683,82 @@ def run(self):
"""


class GoogleSelScrape(SelScrape):
    """
    Add Google Settings via this subclass.

    After the base class has built the search, this subclass navigates to
    Google's preferences page and applies the ``google_selenium_*`` options
    from the scrape configuration: safe search, personalization, region and
    number of results per page.
    """

    def __init__(self, *args, **kwargs):
        SelScrape.__init__(self, *args, **kwargs)
        # NOTE(review): initialized here but never read within this class —
        # presumably consumed by SelScrape machinery; confirm before removing.
        self.largest_id = 0

    def build_search(self):
        """
        Specify google page settings according to config.

        Doing this automatically often provokes a captcha question.
        This is highly sensitive.

        Raises:
            WebDriverException: re-raised (after logging) when a settings
                element cannot be located or clicked.
        """
        super().build_search()
        # assume we are on the normal google search page right now
        self.webdriver.get('https://www.google.com/preferences?hl=en')

        time.sleep(1)

        # wait until we see the settings (the SafeSearch toggle is named 'safeui')
        element = WebDriverWait(self.webdriver, 7).until(EC.presence_of_element_located((By.NAME, 'safeui')))

        try:
            # Turn SafeSearch on only when requested and not already on;
            # clicking toggles the checkbox, so a blind click could turn it off.
            if self.config.get('google_selenium_safe_search', False):
                if self.webdriver.find_element_by_name('safeui').get_attribute('value') != 'on':
                    self.webdriver.find_element_by_name('safeui').click()

            try:
                # First radio child enables personalized results,
                # second child disables them.
                if self.config.get('google_selenium_personalization', False):
                    self.webdriver.find_element_by_css_selector('#pson-radio > div:first-child').click()
                else:
                    self.webdriver.find_element_by_css_selector('#pson-radio > div:nth-child(2)').click()
            except WebDriverException as e:
                # Best effort: personalization controls may not be present.
                logger.warning('Cannot set personalization settings.')

            time.sleep(1)

            # set the region
            try:
                self.webdriver.find_element_by_id('regionanchormore').click()
            except WebDriverException as e:
                # Best effort: the full region list may already be visible.
                logger.warning('Regions probably already expanded.')

            region = self.config.get('google_selenium_region', 'US')
            self.webdriver.find_element_by_css_selector('div[data-value="{}"]'.format(region)).click()

            # set the number of results
            num_results = self.config.get('google_selenium_num_results', 10)
            self.webdriver.find_element_by_id('result_slider').click()
            # Reset the slider to its leftmost position first.
            # Assumes five LEFT presses always reach the minimum — TODO confirm.
            for i in range(5):
                self.webdriver.find_element_by_id('result_slider').send_keys(Keys.LEFT)
            # Move to the desired position: one RIGHT press per step of 10 results.
            for i in range((num_results//10)-1):
                time.sleep(.25)
                self.webdriver.find_element_by_id('result_slider').send_keys(Keys.RIGHT)

            time.sleep(1)

            # save settings
            self.webdriver.find_element_by_css_selector('#form-buttons div:first-child').click()
            # accept alert confirming the saved preferences
            self.webdriver.switch_to.alert.accept()

            time.sleep(2)

            # Changing settings programmatically may trip Google's abuse
            # detection, so check for a captcha page right away.
            self.handle_request_denied()

        except WebDriverException as e:
            logger.error(e)
            raise e


class DuckduckgoSelScrape(SelScrape):
"""
Duckduckgo is a little special since new results are obtained by ajax.
Expand Down
13 changes: 10 additions & 3 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,14 @@ of the NodeJS version. Thus I will use https://github.com/GoogleChrome/puppeteer
Alternative to using puppeteer: use selenium with chrome headless:
https://duo.com/decipher/driving-headless-chrome-with-python

### 27.8.23
### 27.8.18

+ Write functional test for Google and Bing
+ Think about integrating https://2captcha.com/2captcha-api#rates
+ Write functional test for Google and Bing [DONE]
+ Think about integrating https://2captcha.com/2captcha-api#rates

### 29.8.18

+ Add possibility to change search settings to selenium mode for Google
+ Change country/region
+ Change language
+ Change number of search results
Loading

0 comments on commit 44862b1

Please sign in to comment.