Skip to content
This repository has been archived by the owner on Aug 9, 2019. It is now read-only.

Commit

Permalink
Modified crawling behaviour in three steps
Browse files Browse the repository at this point in the history
+ Match: Receives a URL and returns a worker object with all the parameters needed to download and crawl
+ Download: Downloads the URL
+ Crawl: Crawls the data

Download and crawl actions are now handled by a separate class called datCrawlWorker. You can also obtain just the worker and perform the download and crawl actions independently, for advanced crawling.
  • Loading branch information
Felipe Martin Garcia committed Mar 23, 2013
1 parent 39e138e commit 7132f7a
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 5 deletions.
44 changes: 40 additions & 4 deletions datCrawl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def download(self, url, downloader, options=None):
else:
raise DownloaderIsNotRegistered("Downloader %s is not registered. Register it before your crawler." % downloader)

def run(self, url):
def match(self, url):
if self.crawlers:
for registered_url in self.urls:
pattern = registered_url[0]
Expand All @@ -80,9 +80,45 @@ def run(self, url):
raise CrawlerUrlDontHaveGroupDefined('The pattern [%s] of crawler [%s] dont have a url group defined.' % (pattern, crawler))
action = registered_url[1]
downloader = getattr(self.crawlers[crawler], 'downloader')
downloader_options = getattr(self.crawlers[crawler], 'downloader_options')
data = self.download(crawl_url, downloader, downloader_options)
return self.crawlers[crawler]().do(action, data, matches=matches)
worker = datCrawlWorker(crawl_url, self.crawlers[crawler], action, self.downloaders[downloader], matches)
return worker
raise CrawlerForThisURLNotFound("No crawler registered a URL pattern for: %s" % url)
else:
raise NoCrawlerRegistered("You must register a Crawler in order to do something.")

def worker(self, url):
    """Return the datCrawlWorker matched for ``url`` without executing it.

    Thin convenience alias over :meth:`match` for callers that want to
    drive the download/crawl steps themselves.
    """
    return self.match(url)

def run(self, url):
    """Match ``url`` to a worker and immediately execute it end-to-end.

    Equivalent to ``self.match(url).run()``: download then crawl, returning
    the crawled result.
    """
    return self.match(url).run()


class datCrawlWorker(object):
    """A single crawl job: download a URL, then crawl the downloaded data.

    Bundles the URL, crawler class, action name, downloader class and URL
    pattern matches so the two steps can be executed independently
    (:meth:`download` / :meth:`crawl`) or together (:meth:`run`).
    """

    def __init__(self, url, crawler, action, downloader, matches):
        self.url = url
        self.crawler = crawler
        self.crawler_action = action
        self.downloader = downloader
        # Downloader options are read off the crawler class attribute.
        self.downloader_options = getattr(self.crawler, 'downloader_options')
        self.matches = matches
        # Populated by download() and crawl() respectively.
        self.data = None
        self.crawled_data = None

    def download(self):
        """Fetch ``self.url`` with the configured downloader into ``self.data``.

        Raises DownloaderIsNotRegistered when no downloader was supplied.
        """
        if not self.downloader:
            raise DownloaderIsNotRegistered("Downloader %s is not registered. Register it before your crawler." % self.downloader)
        self.data = self.downloader().get(self.url, options=self.downloader_options)

    def crawl(self):
        """Run the crawler action over downloaded data (no-op when no data)."""
        if self.data:
            self.crawled_data = self.crawler().do(self.crawler_action, self.data, matches=self.matches)

    def run(self):
        """Download, then crawl, and return the crawled result."""
        self.download()
        self.crawl()
        return self.crawled_data
11 changes: 10 additions & 1 deletion test/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from datCrawl import *
from test.requirements import *

# Module-level fixture URL shared by the tests below.
URL = 'http://en.wikipedia.org/wiki/Python'


class datCrawlBaseTests(unittest.TestCase):

Expand All @@ -18,8 +20,15 @@ def test_register_urls(self):
def test_running_full_crawler(self):
    """End-to-end: a registered crawler runs against a URL and returns data."""
    core = datCrawl()
    core.register_crawler(AwesomeWikipediaTitleCrawler)
    # Fix: the span contained both the old hard-coded-URL call and the new
    # URL-constant call back to back (diff residue); the first assignment was
    # dead code and would have triggered a redundant crawl. Keep only one.
    result = core.run(URL)
    self.assertEqual(result['title'], 'Python')

def test_worker_instance(self):
    """core.worker() yields a datCrawlWorker carrying the requested URL."""
    core = datCrawl()
    core.register_crawler(AwesomeWikipediaTitleCrawler)
    worker = core.worker(URL)
    self.assertIsInstance(worker, datCrawlWorker)
    self.assertEqual(worker.url, URL)

if __name__ == '__main__':
unittest.main()

0 comments on commit 7132f7a

Please sign in to comment.