This repository has been archived by the owner on Aug 9, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added Downloader support: A class that handles the download of the da…
…ta of the URL provided. Added an exception and updated the tests. The crawlers do NOT receive the URL as their main parameter; they receive the HTML content instead. Added a simple example to the example dir. Added downloader tests (more are still needed)
- Loading branch information
Felipe Martin Garcia
committed
Mar 16, 2013
1 parent
82a7cb4
commit eedeef8
Showing
6 changed files
with
72 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,44 @@ | ||
import datCrawl | ||
from datCrawl import * | ||
from datCrawl.downloaders import DefaultDownloader | ||
# lxml and cssselect are third-party packages needed only by this example.
# If they are missing, print install instructions and stop instead of
# failing with a traceback.
try:
    from lxml.cssselect import CSSSelector
    from lxml.etree import fromstring as document_fromstring
except ImportError:
    # Only a failed import is handled here; anything else (including
    # KeyboardInterrupt) propagates normally.
    print("For this example to work you must install lxml and cssselect")
    print(" via pip or easy_install:")
    print(" # pip install lxml cssselect")
    exit()
|
||
|
||
class DefaultDownloaderWithCustomUserAgent(DefaultDownloader):
    """Downloader that requests a URL with a custom ``User-Agent`` header."""

    def get(self, url):
        """Fetch ``url`` and return the raw response body.

        Args:
            url: Address of the resource to download.

        Returns:
            The bytes read from the HTTP response.

        Raises:
            Exception: If anything goes wrong while requesting or reading
                the resource. The message names the URL and the
                underlying error.
        """
        import urllib2
        from contextlib import closing

        try:
            headers = {'User-Agent': 'Firefox'}
            # The body must be None: any non-None data (even an empty
            # string) makes urllib2 send a POST instead of a GET.
            req = urllib2.Request(url, None, headers)
            # closing() releases the connection even if read() fails.
            with closing(urllib2.urlopen(req)) as response:
                return response.read()
        except Exception as e:
            # Keep the original exception type and message prefix, but
            # include the cause so failures can be diagnosed.
            raise Exception("Error downloading %s: %s" % (url, e))
|
||
|
||
class AwesomeWikipediaTitleCrawler(Crawler):
    """Example crawler that extracts the heading of a Wikipedia article."""

    # (action name, URL regex) pairs. Presumably the base Crawler maps a
    # matching URL to the ``action_<name>`` method -- confirm against Crawler.
    # The raw string keeps the pattern text unchanged while avoiding
    # invalid-escape warnings for ``\:`` and ``\/``.
    urls = [
        ('get_title', r'http\:\/\/en.wikipedia.org\/wiki\/(.*)', )
    ]
    # Names the downloader class this example registers with the engine.
    # How the base class resolves the name is not visible here.
    downloader = 'DefaultDownloaderWithCustomUserAgent'

    def action_get_title(self, data):
        """Parse the downloaded markup and return the article title.

        Args:
            data: Page content handed over by the downloader (not the URL).

        Returns:
            ``{'title': <text>}`` on success. On any failure the error is
            printed and ``None`` is returned (best effort).
        """
        try:
            document = document_fromstring(data)
            selector = CSSSelector('h1.firstHeading > span')
            # [0] raises IndexError when the heading is not found; that is
            # reported by the handler below like any other failure.
            return {'title': selector(document)[0].text}
        except Exception as e:
            print(e)
            return None
|
||
# Create the crawl engine, register the custom downloader, then register
# the crawler whose ``downloader`` attribute names it.
crawler = datCrawl()
crawler.register_downloader(DefaultDownloaderWithCustomUserAgent)
crawler.register_crawler(AwesomeWikipediaTitleCrawler)

# Crawl a single article and print whatever the engine returns for it.
print(crawler.run("http://en.wikipedia.org/wiki/Python_(programming_language)"))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters