Skip to content

Commit

Permalink
added search settings for google selenium mode
Browse files Browse the repository at this point in the history
  • Loading branch information
Nikolai Tschacher committed Aug 29, 2018
1 parent cb49803 commit 44862b1
Show file tree
Hide file tree
Showing 72 changed files with 11,599 additions and 3,178 deletions.
14 changes: 13 additions & 1 deletion GoogleScraper/scrape_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,18 @@
# The google base search url
google_search_url = 'https://www.google.com/search?'

# Search Settings for Google Scraping in Selenium Mode
# Number of results shown per search result page.
# Valid values (matching Google's preferences slider): 10, 20, 30, 50, 100
google_selenium_num_results = 100
# Whether to enable personalized ("private") results.
# Private results help find more relevant content for you, including content
# and connections that only you can see.
google_selenium_personalization = False
# Region for search results; use a country code such as US, DE, GB, CH, ...
google_selenium_region = 'DE'
# Whether Google's SafeSearch filtering is switched on.
google_selenium_safe_search = False
# the language for google search results
google_selenium_language = 'English'


# The yandex base search url
yandex_search_url = 'http://yandex.ru/yandsearch?'

Expand Down Expand Up @@ -275,7 +287,7 @@
# Set to False to disable.
# If the captcha isn't solved in the specified time interval, the browser instance
# with the current proxy is discarded.
manual_captcha_solving = False
manual_captcha_solving = True

# Xvfb display option
# You should start xvfb on your own before this option has any effect.
Expand Down
103 changes: 91 additions & 12 deletions GoogleScraper/selenium_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,16 @@ def _get_Firefox(self):

return False

def handle_request_denied(self, status_code):

def malicious_request_detected(self):
    """Report whether the current page looks like an abuse-detection page.

    Looks up the marker strings ("needles") configured for the active
    search engine and checks that the current URL contains the URL marker
    and the page source contains the HTML marker.

    Returns:
        The falsy needle value itself when no needles are configured for
        this engine, otherwise a bool: True iff both markers are present.
    """
    markers = self.malicious_request_needles[self.search_engine_name]

    # No markers configured for this engine: return the falsy value as-is
    # (mirrors short-circuit `and` semantics).
    if not markers:
        return markers

    # Check the cheap URL marker first; only touch page_source when it hits,
    # preserving the original short-circuit order of webdriver accesses.
    if markers['inurl'] not in self.webdriver.current_url:
        return False
    return markers['inhtml'] in self.webdriver.page_source

def handle_request_denied(self):
"""Checks whether Google detected a potentially harmful request.
Whenever such potential abuse is detected, Google shows a captcha.
Expand All @@ -297,29 +306,22 @@ def handle_request_denied(self, status_code):
# selenium webdriver objects have no status code :/
super().handle_request_denied('400')

needles = self.malicious_request_needles[self.search_engine_name]

if needles and needles['inurl'] in self.webdriver.current_url \
and needles['inhtml'] in self.webdriver.page_source:

if self.malicious_request_detected():
if self.config.get('manual_captcha_solving', False):
with self.captcha_lock:
import tempfile

tf = tempfile.NamedTemporaryFile('wb')
tf.write(self.webdriver.get_screenshot_as_png())
import webbrowser

webbrowser.open('file://{}'.format(tf.name))
solution = input('enter the captcha please...')
self.webdriver.find_element_by_name('submit').send_keys(solution + Keys.ENTER)
solution = input('Please solve the captcha in the browser! Enter any key when done...')
tf.close()
try:
self.search_input = WebDriverWait(self.webdriver, 5).until(
self.search_input = WebDriverWait(self.webdriver, 7).until(
EC.visibility_of_element_located(self._get_search_input_field()))
except TimeoutException:
raise MaliciousRequestDetected('Requesting with this ip is not possible at the moment.')
tf.close()

else:
# Just wait until the user solves the captcha in the browser window
# 10 hours if needed :D
Expand Down Expand Up @@ -371,6 +373,7 @@ def find_visible_search_input(driver):
logger.error('{}: TimeoutException waiting for search input field: {}'.format(self.name, e))
return False


def _wait_until_search_param_fields_appears(self, max_wait=5):
"""Waits until the search input field contains the query.
Expand Down Expand Up @@ -680,6 +683,82 @@ def run(self):
"""


class GoogleSelScrape(SelScrape):
    """
    Add Google Settings via this subclass.

    After the base class has built the search, this subclass navigates to
    Google's preferences page and applies the ``google_selenium_*`` options
    from the scrape configuration: safe search, personalization, region and
    number of results per page.
    """

    def __init__(self, *args, **kwargs):
        SelScrape.__init__(self, *args, **kwargs)
        # NOTE(review): initialized here but never read within this class —
        # presumably consumed by SelScrape machinery; confirm before removing.
        self.largest_id = 0

    def build_search(self):
        """
        Specify google page settings according to config.

        Doing this automatically often provokes a captcha question.
        This is highly sensitive.

        Raises:
            WebDriverException: re-raised (after logging) when a settings
                element cannot be located or clicked.
        """
        super().build_search()
        # assume we are on the normal google search page right now
        self.webdriver.get('https://www.google.com/preferences?hl=en')

        time.sleep(1)

        # wait until we see the settings (the SafeSearch toggle is named 'safeui')
        element = WebDriverWait(self.webdriver, 7).until(EC.presence_of_element_located((By.NAME, 'safeui')))

        try:
            # Turn SafeSearch on only when requested and not already on;
            # clicking toggles the checkbox, so a blind click could turn it off.
            if self.config.get('google_selenium_safe_search', False):
                if self.webdriver.find_element_by_name('safeui').get_attribute('value') != 'on':
                    self.webdriver.find_element_by_name('safeui').click()

            try:
                # First radio child enables personalized results,
                # second child disables them.
                if self.config.get('google_selenium_personalization', False):
                    self.webdriver.find_element_by_css_selector('#pson-radio > div:first-child').click()
                else:
                    self.webdriver.find_element_by_css_selector('#pson-radio > div:nth-child(2)').click()
            except WebDriverException as e:
                # Best effort: personalization controls may not be present.
                logger.warning('Cannot set personalization settings.')

            time.sleep(1)

            # set the region
            try:
                self.webdriver.find_element_by_id('regionanchormore').click()
            except WebDriverException as e:
                # Best effort: the full region list may already be visible.
                logger.warning('Regions probably already expanded.')

            region = self.config.get('google_selenium_region', 'US')
            self.webdriver.find_element_by_css_selector('div[data-value="{}"]'.format(region)).click()

            # set the number of results
            num_results = self.config.get('google_selenium_num_results', 10)
            self.webdriver.find_element_by_id('result_slider').click()
            # Reset the slider to its leftmost position first.
            # Assumes five LEFT presses always reach the minimum — TODO confirm.
            for i in range(5):
                self.webdriver.find_element_by_id('result_slider').send_keys(Keys.LEFT)
            # Move to the desired position: one RIGHT press per step of 10 results.
            for i in range((num_results//10)-1):
                time.sleep(.25)
                self.webdriver.find_element_by_id('result_slider').send_keys(Keys.RIGHT)

            time.sleep(1)

            # save settings
            self.webdriver.find_element_by_css_selector('#form-buttons div:first-child').click()
            # accept alert confirming the saved preferences
            self.webdriver.switch_to.alert.accept()

            time.sleep(2)

            # Changing settings programmatically may trip Google's abuse
            # detection, so check for a captcha page right away.
            self.handle_request_denied()

        except WebDriverException as e:
            logger.error(e)
            raise e


class DuckduckgoSelScrape(SelScrape):
"""
Duckduckgo is a little special since new results are obtained by ajax.
Expand Down
13 changes: 10 additions & 3 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,14 @@ of the NodeJS version. Thus I will use https://github.com/GoogleChrome/puppeteer
Alternative to using puppeteer: use selenium with chrome headless:
https://duo.com/decipher/driving-headless-chrome-with-python

### 27.8.23
### 27.8.18

+ Write functional test for Google and Bing
+ Think about integrating https://2captcha.com/2captcha-api#rates
+ Write functional test for Google and Bing [DONE]
+ Think about integrating https://2captcha.com/2captcha-api#rates

### 29.8.18

+ Add possibility to change search settings to selenium mode for Google
+ Change country/region
+ Change language
+ Change number of search results
Loading

0 comments on commit 44862b1

Please sign in to comment.