-
Notifications
You must be signed in to change notification settings - Fork 23
/
Copy pathfoodnetwork.py
88 lines (70 loc) · 2.92 KB
/
foodnetwork.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python
"""
foodnetwork.py
This module defines FoodNetwork, a RecipeParser subclass that provides an
implementation for parsing recipes from the foodnetwork.com site.
"""
import re
import json
from parser import RecipeParser
from settings import ENCODING
class FoodNetwork(RecipeParser):
    """Recipe parser for foodnetwork.com pages.

    Most fields are read from the schema.org JSON-LD object embedded in the
    page (stored on ``self.recipeJSON`` when one is found); the image and the
    links to other recipes are read from the HTML tree (``self.tree``, which
    is expected to be provided by RecipeParser -- TODO confirm).
    """

    # define some patterns to match/filter
    otherURL = re.compile(r'/recipes/', re.I)
    seriesURL = re.compile(r'recipes$', re.I)
    sectionURL = re.compile(r'#', re.I)

    def __init__(self, url, pageEncoding=ENCODING):
        """Load the page through RecipeParser and locate the recipe JSON-LD.

        ``self.recipeJSON`` is only set when a usable JSON-LD mapping exists;
        the getters below treat a missing attribute as "no recipe here".
        """
        RecipeParser.__init__(self, url, pageEncoding)

        # this site now has all the recipe data available as an embedded json object
        fallback = None
        for node in self.tree.xpath('//*[@type="application/ld+json"]'):
            raw = u''.join(node.xpath('descendant-or-self::text()'))
            try:
                data = json.loads(raw)
            except ValueError:
                # an empty or malformed blob must not abort the whole parse
                continue

            recipe = self._findRecipe(data)
            if recipe is not None:
                # prefer the object explicitly typed as a Recipe over any
                # other structured data (breadcrumbs, organisation, ...)
                self.recipeJSON = recipe
                return
            if isinstance(data, dict):
                # historic behaviour: keep the last top-level mapping seen
                fallback = data

        if fallback is not None:
            self.recipeJSON = fallback

    @staticmethod
    def _findRecipe(data):
        """Return the first mapping typed 'Recipe' inside *data*, else None.

        Looks at the mapping itself, at the members of a top-level array and
        at the members of a JSON-LD ``@graph`` container.
        """
        if isinstance(data, dict):
            kind = data.get('@type')
            if kind == 'Recipe' or (isinstance(kind, list) and 'Recipe' in kind):
                return data
            members = data.get('@graph')
        else:
            members = data

        if not isinstance(members, list):
            return None
        for member in members:
            found = FoodNetwork._findRecipe(member)
            if found is not None:
                return found
        return None

    @staticmethod
    def _textList(value):
        """Flatten a JSON-LD text property into a list of non-empty strings.

        Accepts a single string, a list of strings, HowToStep-like mappings
        (their 'text', else 'name') and mappings that carry an
        'itemListElement' list; anything else contributes nothing.
        """
        if value is None:
            return []
        if hasattr(value, 'strip'):
            # string-like: never iterate it, that would yield characters
            text = value.strip()
            return [text] if text else []
        if isinstance(value, dict):
            if 'itemListElement' in value:
                return FoodNetwork._textList(value['itemListElement'])
            return FoodNetwork._textList(value.get('text') or value.get('name'))
        if isinstance(value, (list, tuple)):
            texts = []
            for item in value:
                texts.extend(FoodNetwork._textList(item))
            return texts
        return []

    def getTitle(self):
        """Return the recipe title.

        The JSON-LD 'name' is used when available.  Otherwise the html title
        is used, whose format is:
            <title>Recipe | Contributor Name | Food Network</title>
        we want just 'Recipe'
        """
        try:
            # use the json object data
            return self.recipeJSON['name']
        except (AttributeError, KeyError):
            print('[warning]: likely no recipe at', self.url)
            # fall back to parsing the html title; keep only the part before
            # the first '|' (documented format) or ':' (older format)
            pageTitle = self.tree.xpath('//title')[0].text
            return re.split(r'[|:]', pageTitle)[0].strip()

    def getImage(self):
        """The image format is:
            <meta property="og:image" content="IMG_URL">
        we want just 'IMG_URL'
        """
        return self.tree.xpath('//meta[@property="og:image"]')[0].get('content')

    def getIngredients(self):
        """Return a list of the recipe ingredients.

        When no recipe data or no 'recipeIngredient' entry is available the
        parser is flagged via ``self.valid = False`` and [] is returned.
        """
        try:
            raw = self.recipeJSON['recipeIngredient']
        except (AttributeError, KeyError):
            self.valid = False
            return []
        return self._textList(raw)

    def getDirections(self):
        """Return a list of the preparation instructions.

        When no recipe data or no 'recipeInstructions' entry is available the
        parser is flagged via ``self.valid = False`` and [] is returned.
        """
        try:
            raw = self.recipeJSON['recipeInstructions']
        except (AttributeError, KeyError):
            self.valid = False
            return []
        return self._textList(raw)

    def getTags(self):
        """Return a list of tags for this recipe (its 'recipeCategory')."""
        try:
            raw = self.recipeJSON['recipeCategory']
        except (AttributeError, KeyError):
            # NOTE(review): missing categories mark the recipe invalid, same
            # as missing ingredients -- kept as-is, confirm this is intended
            self.valid = False
            return []
        return self._textList(raw)

    def getOtherRecipeLinks(self):
        """Return a list of other recipes found in the page.

        Only hrefs that contain '/recipes/', do not end in 'recipes' and
        contain no '#' fragment are kept.
        """
        data = []
        for link in self.tree.xpath('//div[contains(@class,"m-MediaBlock__m-MediaWrap")]/a'):
            href = link.get('href')
            if href is None:
                continue
            if self.seriesURL.search(href) or self.sectionURL.search(href):
                continue
            if self.otherURL.search(href):
                data.append(href)
        return data