Skip to content
This repository has been archived by the owner on Aug 9, 2019. It is now read-only.

Commit

Permalink
+ Added options kwarg to Downloader methods for behaviour customisation
Browse files Browse the repository at this point in the history
  • Loading branch information
Felipe Martín committed Mar 20, 2013
1 parent 0184580 commit 27deb2e
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 12 deletions.
7 changes: 4 additions & 3 deletions datCrawl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,10 @@ def register_downloader(self, downloader):
def downloader_is_registered(self, downloader_name):
    """Tell whether a downloader is registered under ``downloader_name``.

    Returns True when ``downloader_name`` is a key of ``self.downloaders``,
    False otherwise.
    """
    registry = self.downloaders
    return downloader_name in registry

def download(self, url, downloader, options=None):
    """Fetch ``url`` using the downloader registered as ``downloader``.

    A fresh instance of the registered downloader class is created for
    every call and its ``get`` method is invoked with the URL.

    :param url: address handed to the downloader's ``get`` method.
    :param downloader: name under which the downloader class was registered.
    :param options: optional value forwarded to the downloader as the
        ``options`` keyword argument. Defaults to ``None`` so callers that
        still use the two-argument form keep working.
    :returns: whatever the downloader's ``get`` returns.
    :raises DownloaderIsNotRegistered: when no downloader is registered
        under ``downloader``.
    """
    if not self.downloader_is_registered(downloader):
        raise DownloaderIsNotRegistered("Downloader %s is not registered. Register it before your crawler." % downloader)
    getter = self.downloaders[downloader]()
    return getter.get(url, options=options)
Expand All @@ -80,7 +80,8 @@ def run(self, url):
raise CrawlerUrlDontHaveGroupDefined('The pattern [%s] of crawler [%s] dont have a url group defined.' % (pattern, crawler))
action = registered_url[1]
downloader = getattr(self.crawlers[crawler], 'downloader')
data = self.download(crawl_url, downloader)
downloader_options = getattr(self.crawlers[crawler], 'downloader_options')
data = self.download(crawl_url, downloader, downloader_options)
return self.crawlers[crawler]().do(action, data, matches=matches)
raise CrawlerForThisURLNotFound("No crawler registered a URL pattern for: %s" % url)
else:
Expand Down
1 change: 1 addition & 0 deletions datCrawl/crawlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ class Crawler(object):
"Base crawler class."
urls = [] # List of tuples with regular expression of URLs that the crawler handle
downloader = 'Downloader' # Name of the downloader class to use
downloader_options = None

def do(self, action, data, **kwargs):
try:
Expand Down
4 changes: 2 additions & 2 deletions datCrawl/downloaders.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
class Downloader(object):
    """Base downloader object."""

    def get(self, url, **kwargs):
        """Placeholder download hook; prints a notice and returns None.

        :param url: address a concrete downloader would fetch (unused here).
        :param kwargs: extra keyword arguments such as ``options``; accepted
            so subclasses and callers can share one signature, ignored here.
        """
        print("I'm a useless downloader :_")


class DefaultDownloader(Downloader):
"Downloader using urllib2"

def get(self, url):
def get(self, url, **kwargs):
import urllib2
try:
req = urllib2.Request(url)
Expand Down
21 changes: 14 additions & 7 deletions examples/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,34 +11,41 @@


class DefaultDownloaderWithCustomUserAgent(DefaultDownloader):
    """Example downloader that sends caller-supplied HTTP headers."""

    def get(self, url, **kwargs):
        """Download ``url`` with urllib2 and return the response body.

        :param url: address to fetch.
        :param kwargs: may carry ``options``, a dict whose optional
            ``'headers'`` entry is a dict of HTTP headers to send. A missing
            or ``None`` ``options`` means no extra headers.
        :returns: the data read from the response.
        :raises Exception: wrapping any error raised while downloading,
            with the URL and the original error in the message.
        """
        import urllib2

        # ``options`` is None when the crawler defines no downloader_options,
        # so normalise it before looking anything up.
        options = kwargs.get('options') or {}
        headers = options.get('headers', {})
        try:
            # No ``data`` argument: urllib2 issues a POST whenever data is
            # not None, even for an empty string.
            req = urllib2.Request(url, headers=headers)
            response = urllib2.urlopen(req)
            return response.read()
        except Exception as error:
            # Two placeholders for two values; a single one would raise
            # TypeError here and hide the real download error.
            raise Exception("Error downloading %s: %s" % (url, error))


class AwesomeWikipediaTitleCrawler(Crawler):
    """Example crawler that extracts the title of an English Wikipedia page."""

    # (action name, URL pattern) pairs. The pattern is a raw string with the
    # host dots escaped so they match literal dots only; the named groups
    # capture the whole URL and the article name.
    urls = [
        ('get_title', r'(?P<url>http://en\.wikipedia\.org/wiki/(?P<name>.*))', )
    ]
    downloader = 'DefaultDownloaderWithCustomUserAgent'
    # Downloader options with custom user agent.
    downloader_options = {
        'headers': {'User-agent': 'Firefox'}
    }

    def action_get_title(self, data, **kwargs):
        """Parse ``data`` as HTML and return the page heading.

        :param data: HTML document text handed over by the downloader.
        :param kwargs: extra keyword arguments passed by the caller; unused.
        :returns: ``{'title': <text of the first 'h1.firstHeading > span'>}``,
            or ``None`` when parsing or the lookup fails (the error is
            printed, keeping the example best-effort).
        """
        try:
            document = document_fromstring(data)
            selector = CSSSelector('h1.firstHeading > span')
            return {'title': selector(document)[0].text}
        except Exception as e:
            # print(...) with one argument behaves the same on Python 2 and 3.
            print(e)
            return None

# Wire the example together: the downloader must be registered before the
# crawler that refers to it by name.
crawler = datCrawl()
crawler.register_downloader(DefaultDownloaderWithCustomUserAgent)
crawler.register_crawler(AwesomeWikipediaTitleCrawler)

# Expected output: {'title': 'Python (programming language)'}
print(crawler.run("http://en.wikipedia.org/wiki/Python_(programming_language)"))

0 comments on commit 27deb2e

Please sign in to comment.