Skip to content
This repository has been archived by the owner on Aug 9, 2019. It is now read-only.

Commit

Permalink
Modified crawling behaviour in three steps
Browse files Browse the repository at this point in the history
+ Match: Receives a URL and returns a worker object with all the parameters needed to download and crawl
+ Download: Downloads the URL
+ Crawl: Crawls the data

Download and crawl actions are now handled by a separate class called datCrawlWorker. You can also obtain just the worker and perform the download and crawl actions independently, for advanced crawling.
  • Loading branch information
Felipe Martin Garcia committed Mar 23, 2013
1 parent 39e138e commit 7132f7a
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 5 deletions.
44 changes: 40 additions & 4 deletions datCrawl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def download(self, url, downloader, options=None):
else:
raise DownloaderIsNotRegistered("Downloader %s is not registered. Register it before your crawler." % downloader)

def run(self, url):
def match(self, url):
if self.crawlers:
for registered_url in self.urls:
pattern = registered_url[0]
Expand All @@ -80,9 +80,45 @@ def run(self, url):
raise CrawlerUrlDontHaveGroupDefined('The pattern [%s] of crawler [%s] dont have a url group defined.' % (pattern, crawler))
action = registered_url[1]
downloader = getattr(self.crawlers[crawler], 'downloader')
downloader_options = getattr(self.crawlers[crawler], 'downloader_options')
data = self.download(crawl_url, downloader, downloader_options)
return self.crawlers[crawler]().do(action, data, matches=matches)
worker = datCrawlWorker(crawl_url, self.crawlers[crawler], action, self.downloaders[downloader], matches)
return worker
raise CrawlerForThisURLNotFound("No crawler registered a URL pattern for: %s" % url)
else:
raise NoCrawlerRegistered("You must register a Crawler in order to do something.")

def worker(self, url):
    """Return the datCrawlWorker matched for ``url`` without executing it.

    Thin convenience alias over :meth:`match` for callers that want to
    drive the download/crawl steps themselves.
    """
    return self.match(url)

def run(self, url):
    """Match ``url`` to a worker and immediately execute it end-to-end.

    Equivalent to ``self.match(url).run()``: download then crawl, returning
    the crawled result.
    """
    return self.match(url).run()


class datCrawlWorker(object):
    """A single crawl job: download a URL, then crawl the downloaded data.

    Bundles the URL, crawler class, action name, downloader class and URL
    pattern matches so the two steps can be executed independently
    (:meth:`download` / :meth:`crawl`) or together (:meth:`run`).
    """

    def __init__(self, url, crawler, action, downloader, matches):
        self.url = url
        self.crawler = crawler
        self.crawler_action = action
        self.downloader = downloader
        # Downloader options are read off the crawler class attribute.
        self.downloader_options = getattr(self.crawler, 'downloader_options')
        self.matches = matches
        # Populated by download() and crawl() respectively.
        self.data = None
        self.crawled_data = None

    def download(self):
        """Fetch ``self.url`` with the configured downloader into ``self.data``.

        Raises DownloaderIsNotRegistered when no downloader was supplied.
        """
        if not self.downloader:
            raise DownloaderIsNotRegistered("Downloader %s is not registered. Register it before your crawler." % self.downloader)
        self.data = self.downloader().get(self.url, options=self.downloader_options)

    def crawl(self):
        """Run the crawler action over downloaded data (no-op when no data)."""
        if self.data:
            self.crawled_data = self.crawler().do(self.crawler_action, self.data, matches=self.matches)

    def run(self):
        """Download, then crawl, and return the crawled result."""
        self.download()
        self.crawl()
        return self.crawled_data
11 changes: 10 additions & 1 deletion test/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from datCrawl import *
from test.requirements import *

# Module-level fixture URL shared by the tests below.
URL = 'http://en.wikipedia.org/wiki/Python'


class datCrawlBaseTests(unittest.TestCase):

Expand All @@ -18,8 +20,15 @@ def test_register_urls(self):
def test_running_full_crawler(self):
    """End-to-end: a registered crawler runs against a URL and returns data."""
    core = datCrawl()
    core.register_crawler(AwesomeWikipediaTitleCrawler)
    # Fix: the span contained both the old hard-coded-URL call and the new
    # URL-constant call back to back (diff residue); the first assignment was
    # dead code and would have triggered a redundant crawl. Keep only one.
    result = core.run(URL)
    self.assertEqual(result['title'], 'Python')

def test_worker_instance(self):
    """core.worker() yields a datCrawlWorker carrying the requested URL."""
    core = datCrawl()
    core.register_crawler(AwesomeWikipediaTitleCrawler)
    worker = core.worker(URL)
    self.assertIsInstance(worker, datCrawlWorker)
    self.assertEqual(worker.url, URL)

if __name__ == '__main__':
unittest.main()

0 comments on commit 7132f7a

Please sign in to comment.