
Commit eedeef8
Added Downloader support: a class that handles downloading the data of the provided URL.

Added a new exception and updated tests.
Crawlers do NOT receive the URL as their main parameter any more; they receive the HTML content instead.
Added a simple example to the examples dir.
Added downloader tests (more to come).
Felipe Martin Garcia committed Mar 16, 2013
1 parent 82a7cb4 commit eedeef8
Showing 6 changed files with 72 additions and 16 deletions.
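
Taken together, the changes introduce a three-step flow: register a downloader, register a crawler that names it in its downloader attribute, then call run(), which downloads the URL and hands the resulting HTML to the crawler action. A minimal sketch of that flow, assuming the API exactly as it appears in the diffs below (ExampleCrawler, its URL pattern, and the example URL are illustrative, not part of this commit):

    from datCrawl import *
    from datCrawl.downloaders import DefaultDownloader

    class ExampleCrawler(Crawler):
        urls = [('parse', 'http\:\/\/example.com\/(.*)', )]  # illustrative pattern
        downloader = 'DefaultDownloader'  # must be registered before the crawler

        def action_parse(self, data):
            # 'data' is the downloaded HTML, no longer the URL itself
            return {'length': len(data)}

    core = datCrawl()
    core.register_downloader(DefaultDownloader)  # downloader first...
    core.register_crawler(ExampleCrawler)        # ...then the crawler that names it
    print core.run("http://example.com/something")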
28 changes: 23 additions & 5 deletions datCrawl/__init__.py
@@ -1,7 +1,7 @@
 from datCrawl.exceptions import CrawlerDontHaveUrlsToWatch, \
     CrawlerIsNotInstanceOfBase, CrawlerForThisURLNotFound, \
     NoCrawlerRegistered, CrawlerAlreadyRegistered, DownloaderAlreadyRegistered, \
-    DownloaderIsNotInstanceOfBase
+    DownloaderIsNotInstanceOfBase, DownloaderIsNotRegistered
 from datCrawl.crawlers import Crawler
 from datCrawl.downloaders import Downloader
 import re
@@ -14,6 +14,7 @@ def __init__(self):
         self.crawlers = {}
         self.downloaders = {}
         self.urls = []
+        self.register_downloader(Downloader)
 
     def register_crawler(self, crawler):
         "Registers a crawler on the core to use in certain urls."
@@ -22,8 +23,12 @@ def register_crawler(self, crawler):
         if isinstance(crawler(), Crawler):
             urls = crawler().urls
             if len(urls) > 0:
-                [self.register_url(url, action, class_name) for action, url in urls]
-                self.crawlers[class_name] = crawler
+                downloader = crawler.downloader
+                if not self.downloader_is_registered(downloader):
+                    raise DownloaderIsNotRegistered("Downloader %s is not registered. Register it before your crawler." % downloader)
+                else:
+                    [self.register_url(url, action, class_name) for action, url in urls]
+                    self.crawlers[class_name] = crawler
             else:
                 raise CrawlerDontHaveUrlsToWatch('Crawler %s dont have URLs to watch for.' % class_name)
         else:
@@ -43,13 +48,24 @@ def autoregister_crawlers():
     def register_downloader(self, downloader):
         downloader_name = downloader().__class__.__name__
         if isinstance(downloader(), Downloader):
-            if downloader_name not in self.downloaders:
+            if not self.downloader_is_registered(downloader_name):
                 self.downloaders[downloader_name] = downloader
             else:
                 raise DownloaderAlreadyRegistered("Downloader %s is already registered" % downloader_name)
         else:
             raise DownloaderIsNotInstanceOfBase('Downloader %s is not correctly created. (must be instance of base Downloader class)' % downloader_name)
 
+    def downloader_is_registered(self, downloader_name):
+        return downloader_name in self.downloaders
+
+    def download(self, url, downloader):
+        if self.downloader_is_registered(downloader):
+            getter = self.downloaders[downloader]()
+            data = getter.get(url)
+            return data
+        else:
+            raise DownloaderIsNotRegistered("Downloader %s is not registered. Register it before your crawler." % downloader)
+
     def run(self, url):
         if self.crawlers:
             for registered_url in self.urls:
@@ -58,7 +74,9 @@ def run(self, url):
                 if regexp.match(url):
                     action = registered_url[1]
                     crawler = registered_url[2]
-                    return self.crawlers[crawler]().do(action, url)
+                    downloader = getattr(self.crawlers[crawler], 'downloader')
+                    data = self.download(url, downloader)
+                    return self.crawlers[crawler]().do(action, data)
             raise CrawlerForThisURLNotFound("No crawler registered a URL pattern for: %s" % url)
         else:
             raise NoCrawlerRegistered("You must register a Crawler in order to do something.")
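
The new download() helper can also be called on its own with the name of a registered downloader; a small sketch under the same assumptions (the URL is illustrative, and the registry key is the downloader's class name, as register_downloader() above shows):

    from datCrawl import datCrawl
    from datCrawl.downloaders import DefaultDownloader

    core = datCrawl()
    core.register_downloader(DefaultDownloader)
    html = core.download("http://example.com/", 'DefaultDownloader')  # returns the raw page body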
5 changes: 3 additions & 2 deletions datCrawl/crawlers.py
@@ -4,11 +4,12 @@
 class Crawler(object):
     "Base crawler class."
     urls = [] # List of tuples with regular expression of URLs that the crawler handle
+    downloader = 'Downloader' # Name of the downloader class to use
 
-    def do(self, action, url):
+    def do(self, action, data):
         try:
             method = getattr(self, 'action_%s' % action)
-            result = method(url)
+            result = method(data)
             return result
         except AttributeError:
             raise CrawlerActionDoesNotExist('%s: action (%s) does not exist' % (self.__class__.__name__, action))
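
do() resolves 'action_<name>' with getattr and now forwards the downloaded data instead of the URL. A sketch of that contract with a hypothetical crawler (not part of the commit):

    from datCrawl.crawlers import Crawler

    class SizeCrawler(Crawler):
        urls = [('size', 'http\:\/\/example.org\/(.*)', )]

        def action_size(self, data):
            return {'bytes': len(data)}

    # do() dispatches 'size' to action_size and passes the data through:
    print SizeCrawler().do('size', '<html>hello</html>')  # {'bytes': 18}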
7 changes: 4 additions & 3 deletions datCrawl/downloaders.py
@@ -1,6 +1,7 @@
 class Downloader(object):
     "Base downloader object"
-    pass
+    def get(self, url):
+        print("I'm a useless downloader :_")
 
 
 class DefaultDownloader(Downloader):
@@ -13,5 +14,5 @@ def get(self, url):
             response = urllib2.urlopen(req)
             data = response.read()
             return data
-        except Exception:
-            raise Exception("Error downloading %s" % url)
+        except Exception as error:
+            raise Exception("Error downloading %s: %s" % (url, error))
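
The base class's get() is now only a placeholder; concrete downloaders override it and return the response body. A sketch of such an override (StaticUserAgentDownloader is an illustrative name; the examples/simple.py change below does the same thing with a Firefox User-Agent):

    import urllib2

    from datCrawl.downloaders import Downloader

    class StaticUserAgentDownloader(Downloader):
        "Illustrative subclass: fetches the URL with a fixed User-Agent."
        def get(self, url):
            req = urllib2.Request(url, headers={'User-Agent': 'datCrawl-example'})
            return urllib2.urlopen(req).read()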
5 changes: 5 additions & 0 deletions datCrawl/exceptions.py
@@ -36,3 +36,8 @@ class CrawlerAlreadyRegistered(Exception):
 class DownloaderAlreadyRegistered(Exception):
     "When you try to register the same downloader."
     pass
+
+
+class DownloaderIsNotRegistered(Exception):
+    "When you try to register a Crawler before its downloader."
+    pass
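
Because register_crawler() now checks the crawler's downloader first, the new exception surfaces as below (a hedged sketch; EagerCrawler and the 'CustomDownloader' name are hypothetical and never registered):

    from datCrawl import *
    from datCrawl.exceptions import DownloaderIsNotRegistered

    class EagerCrawler(Crawler):
        urls = [('noop', 'http\:\/\/example.net\/(.*)', )]
        downloader = 'CustomDownloader'  # hypothetical, never registered

    try:
        datCrawl().register_crawler(EagerCrawler)
    except DownloaderIsNotRegistered as error:
        print error  # Downloader CustomDownloader is not registered. ...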
38 changes: 34 additions & 4 deletions examples/simple.py
@@ -1,14 +1,44 @@
-import datCrawl
+from datCrawl import *
+from datCrawl.downloaders import DefaultDownloader
+try:
+    from lxml.cssselect import CSSSelector
+    from lxml.etree import fromstring as document_fromstring
+except:
+    print "For this example to work you must install lxml and cssselect"
+    print "  via pip or easy_install:"
+    print "  # pip install lxml csselect"
+    exit()
+
+
+class DefaultDownloaderWithCustomUserAgent(DefaultDownloader):
+    def get(self, url):
+        import urllib2
+        try:
+            headers = {'User-Agent': 'Firefox'}
+            req = urllib2.Request(url, "", headers)
+            response = urllib2.urlopen(req)
+            data = response.read()
+            return data
+        except Exception:
+            raise Exception("Error downloading %s" % url)
 
 
 class AwesomeWikipediaTitleCrawler(Crawler):
     urls = [
-        ('title', 'http\:\/\/en.wikipedia.org\/wiki\/(.*)', )
+        ('get_title', 'http\:\/\/en.wikipedia.org\/wiki\/(.*)', )
     ]
+    downloader = 'DefaultDownloaderWithCustomUserAgent'
 
-    def action_title(self, url):
-        return {'title': 'Python'}
+    def action_get_title(self, data):
+        try:
+            document = document_fromstring(data)
+            selector = CSSSelector('h1.firstHeading > span')
+            return {'title': selector(document)[0].text}
+        except Exception as e:
+            print e
+            #return {'title': 'Python'}
 
 crawler = datCrawl()
+crawler.register_downloader(DefaultDownloaderWithCustomUserAgent)
 crawler.register_crawler(AwesomeWikipediaTitleCrawler)
 print crawler.run("http://en.wikipedia.org/wiki/Python_(programming_language)")
5 changes: 3 additions & 2 deletions test/requirements.py
@@ -14,9 +14,10 @@ class AwesomeEmptyCrawler(Crawler):

 class AwesomeWikipediaTitleCrawler(Crawler):
     urls = [
-        ('title', 'http\:\/\/en.wikipedia.org\/wiki\/(.*)', )
+        ('get_title', 'http\:\/\/en.wikipedia.org\/wiki\/(.*)', )
     ]
+    downloader = 'Downloader'
 
-    def action_title(self, url):
+    def action_get_title(self, data):
         # LOOK, IM CRAWLING THE INTERNETS!
         return {'title': 'Python'}
