
Commit eedeef8
Added Downloader support: a class that handles downloading the data of the provided URL.

Added a new exception and updated tests.
Crawlers do NOT receive the URL as their main parameter any more; they receive the HTML content instead.
Added a simple example to the examples dir.
Added downloader tests (more to come).
Felipe Martin Garcia committed Mar 16, 2013
1 parent 82a7cb4 commit eedeef8
Showing 6 changed files with 72 additions and 16 deletions.
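
Taken together, the changes introduce a three-step flow: register a downloader, register a crawler that names it in its downloader attribute, then call run(), which downloads the URL and hands the resulting HTML to the crawler action. A minimal sketch of that flow, assuming the API exactly as it appears in the diffs below (ExampleCrawler, its URL pattern, and the example URL are illustrative, not part of this commit):

    from datCrawl import *
    from datCrawl.downloaders import DefaultDownloader

    class ExampleCrawler(Crawler):
        urls = [('parse', 'http\:\/\/example.com\/(.*)', )]  # illustrative pattern
        downloader = 'DefaultDownloader'  # must be registered before the crawler

        def action_parse(self, data):
            # 'data' is the downloaded HTML, no longer the URL itself
            return {'length': len(data)}

    core = datCrawl()
    core.register_downloader(DefaultDownloader)  # downloader first...
    core.register_crawler(ExampleCrawler)        # ...then the crawler that names it
    print core.run("http://example.com/something")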
28 changes: 23 additions & 5 deletions datCrawl/__init__.py
@@ -1,7 +1,7 @@
 from datCrawl.exceptions import CrawlerDontHaveUrlsToWatch, \
     CrawlerIsNotInstanceOfBase, CrawlerForThisURLNotFound, \
     NoCrawlerRegistered, CrawlerAlreadyRegistered, DownloaderAlreadyRegistered, \
-    DownloaderIsNotInstanceOfBase
+    DownloaderIsNotInstanceOfBase, DownloaderIsNotRegistered
 from datCrawl.crawlers import Crawler
 from datCrawl.downloaders import Downloader
 import re
@@ -14,6 +14,7 @@ def __init__(self):
         self.crawlers = {}
         self.downloaders = {}
         self.urls = []
+        self.register_downloader(Downloader)
 
     def register_crawler(self, crawler):
         "Registers a crawler on the core to use in certain urls."
@@ -22,8 +23,12 @@ def register_crawler(self, crawler):
         if isinstance(crawler(), Crawler):
             urls = crawler().urls
             if len(urls) > 0:
-                [self.register_url(url, action, class_name) for action, url in urls]
-                self.crawlers[class_name] = crawler
+                downloader = crawler.downloader
+                if not self.downloader_is_registered(downloader):
+                    raise DownloaderIsNotRegistered("Downloader %s is not registered. Register it before your crawler." % downloader)
+                else:
+                    [self.register_url(url, action, class_name) for action, url in urls]
+                    self.crawlers[class_name] = crawler
             else:
                 raise CrawlerDontHaveUrlsToWatch('Crawler %s dont have URLs to watch for.' % class_name)
         else:
@@ -43,13 +48,24 @@ def autoregister_crawlers():
     def register_downloader(self, downloader):
         downloader_name = downloader().__class__.__name__
         if isinstance(downloader(), Downloader):
-            if downloader_name not in self.downloaders:
+            if not self.downloader_is_registered(downloader_name):
                 self.downloaders[downloader_name] = downloader
             else:
                 raise DownloaderAlreadyRegistered("Downloader %s is already registered" % downloader_name)
         else:
             raise DownloaderIsNotInstanceOfBase('Downloader %s is not correctly created. (must be instance of base Downloader class)' % downloader_name)
 
+    def downloader_is_registered(self, downloader_name):
+        return downloader_name in self.downloaders
+
+    def download(self, url, downloader):
+        if self.downloader_is_registered(downloader):
+            getter = self.downloaders[downloader]()
+            data = getter.get(url)
+            return data
+        else:
+            raise DownloaderIsNotRegistered("Downloader %s is not registered. Register it before your crawler." % downloader)
+
     def run(self, url):
         if self.crawlers:
             for registered_url in self.urls:
@@ -58,7 +74,9 @@ def run(self, url):
                 if regexp.match(url):
                     action = registered_url[1]
                     crawler = registered_url[2]
-                    return self.crawlers[crawler]().do(action, url)
+                    downloader = getattr(self.crawlers[crawler], 'downloader')
+                    data = self.download(url, downloader)
+                    return self.crawlers[crawler]().do(action, data)
             raise CrawlerForThisURLNotFound("No crawler registered a URL pattern for: %s" % url)
         else:
             raise NoCrawlerRegistered("You must register a Crawler in order to do something.")
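
The new download() helper can also be called on its own with the name of a registered downloader; a small sketch under the same assumptions (the URL is illustrative, and the registry key is the downloader's class name, as register_downloader() above shows):

    from datCrawl import datCrawl
    from datCrawl.downloaders import DefaultDownloader

    core = datCrawl()
    core.register_downloader(DefaultDownloader)
    html = core.download("http://example.com/", 'DefaultDownloader')  # returns the raw page body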
5 changes: 3 additions & 2 deletions datCrawl/crawlers.py
@@ -4,11 +4,12 @@
 class Crawler(object):
     "Base crawler class."
     urls = [] # List of tuples with regular expression of URLs that the crawler handle
+    downloader = 'Downloader' # Name of the downloader class to use
 
-    def do(self, action, url):
+    def do(self, action, data):
         try:
             method = getattr(self, 'action_%s' % action)
-            result = method(url)
+            result = method(data)
             return result
         except AttributeError:
             raise CrawlerActionDoesNotExist('%s: action (%s) does not exist' % (self.__class__.__name__, action))
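
do() resolves 'action_<name>' with getattr and now forwards the downloaded data instead of the URL. A sketch of that contract with a hypothetical crawler (not part of the commit):

    from datCrawl.crawlers import Crawler

    class SizeCrawler(Crawler):
        urls = [('size', 'http\:\/\/example.org\/(.*)', )]

        def action_size(self, data):
            return {'bytes': len(data)}

    # do() dispatches 'size' to action_size and passes the data through:
    print SizeCrawler().do('size', '<html>hello</html>')  # {'bytes': 18}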
7 changes: 4 additions & 3 deletions datCrawl/downloaders.py
@@ -1,6 +1,7 @@
 class Downloader(object):
     "Base downloader object"
-    pass
+    def get(self, url):
+        print("I'm a useless downloader :_")
 
 
 class DefaultDownloader(Downloader):
@@ -13,5 +14,5 @@ def get(self, url):
             response = urllib2.urlopen(req)
             data = response.read()
             return data
-        except Exception:
-            raise Exception("Error downloading %s" % url)
+        except Exception as error:
+            raise Exception("Error downloading %s: %s" % (url, error))
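
The base class's get() is now only a placeholder; concrete downloaders override it and return the response body. A sketch of such an override (StaticUserAgentDownloader is an illustrative name; the examples/simple.py change below does the same thing with a Firefox User-Agent):

    import urllib2

    from datCrawl.downloaders import Downloader

    class StaticUserAgentDownloader(Downloader):
        "Illustrative subclass: fetches the URL with a fixed User-Agent."
        def get(self, url):
            req = urllib2.Request(url, headers={'User-Agent': 'datCrawl-example'})
            return urllib2.urlopen(req).read()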
5 changes: 5 additions & 0 deletions datCrawl/exceptions.py
@@ -36,3 +36,8 @@ class CrawlerAlreadyRegistered(Exception):
 class DownloaderAlreadyRegistered(Exception):
     "When you try to register the same downloader."
     pass
+
+
+class DownloaderIsNotRegistered(Exception):
+    "When you try to register a Crawler before its downloader."
+    pass
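
Because register_crawler() now checks the crawler's downloader first, the new exception surfaces as below (a hedged sketch; EagerCrawler and the 'CustomDownloader' name are hypothetical and never registered):

    from datCrawl import *
    from datCrawl.exceptions import DownloaderIsNotRegistered

    class EagerCrawler(Crawler):
        urls = [('noop', 'http\:\/\/example.net\/(.*)', )]
        downloader = 'CustomDownloader'  # hypothetical, never registered

    try:
        datCrawl().register_crawler(EagerCrawler)
    except DownloaderIsNotRegistered as error:
        print error  # Downloader CustomDownloader is not registered. ...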
38 changes: 34 additions & 4 deletions examples/simple.py
@@ -1,14 +1,44 @@
-import datCrawl
+from datCrawl import *
+from datCrawl.downloaders import DefaultDownloader
+try:
+    from lxml.cssselect import CSSSelector
+    from lxml.etree import fromstring as document_fromstring
+except:
+    print "For this example to work you must install lxml and cssselect"
+    print "  via pip or easy_install:"
+    print "  # pip install lxml csselect"
+    exit()
+
+
+class DefaultDownloaderWithCustomUserAgent(DefaultDownloader):
+    def get(self, url):
+        import urllib2
+        try:
+            headers = {'User-Agent': 'Firefox'}
+            req = urllib2.Request(url, "", headers)
+            response = urllib2.urlopen(req)
+            data = response.read()
+            return data
+        except Exception:
+            raise Exception("Error downloading %s" % url)
 
 
 class AwesomeWikipediaTitleCrawler(Crawler):
     urls = [
-        ('title', 'http\:\/\/en.wikipedia.org\/wiki\/(.*)', )
+        ('get_title', 'http\:\/\/en.wikipedia.org\/wiki\/(.*)', )
     ]
+    downloader = 'DefaultDownloaderWithCustomUserAgent'
 
-    def action_title(self, url):
-        return {'title': 'Python'}
+    def action_get_title(self, data):
+        try:
+            document = document_fromstring(data)
+            selector = CSSSelector('h1.firstHeading > span')
+            return {'title': selector(document)[0].text}
+        except Exception as e:
+            print e
+            #return {'title': 'Python'}
 
 crawler = datCrawl()
+crawler.register_downloader(DefaultDownloaderWithCustomUserAgent)
 crawler.register_crawler(AwesomeWikipediaTitleCrawler)
 print crawler.run("http://en.wikipedia.org/wiki/Python_(programming_language)")
5 changes: 3 additions & 2 deletions test/requirements.py
@@ -14,9 +14,10 @@ class AwesomeEmptyCrawler(Crawler):

 class AwesomeWikipediaTitleCrawler(Crawler):
     urls = [
-        ('title', 'http\:\/\/en.wikipedia.org\/wiki\/(.*)', )
+        ('get_title', 'http\:\/\/en.wikipedia.org\/wiki\/(.*)', )
     ]
+    downloader = 'Downloader'
 
-    def action_title(self, url):
+    def action_get_title(self, data):
         # LOOK, IM CRAWLING THE INTERNETS!
         return {'title': 'Python'}
