-
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathdcachebase.py
127 lines (96 loc) · 3.79 KB
/
dcachebase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import os
import typing
import json
import dataclasses
import dataclasses_json
import globs
@dataclasses_json.dataclass_json
@dataclasses.dataclass
class CacheItem:
    """One cache record holding description and language hints for a document.

    The ``dataclass_json`` decorator supplies the ``to_dict`` / ``from_dict``
    conversions used when records are stored in, or loaded from, the JSON
    cache.
    """

    # Description text taken from the document's meta tags.
    description: str

    # Error reported while retrieving the description.
    description_error: str

    # Error reported by the selenium lookup, when that lookup is enabled.
    description_error_selenium: str

    # Language of the description text, as produced by running a language
    # recognition model over it.
    language_description: str

    # Value of the HTTP content-language header (optional).
    http_content_language: str

    # Language attribute found in the HTML response (optional).
    html_document_language: str

    # Automatic translations of the description.
    translations: typing.Dict[str, str]

    # Language derived from a geo-IP lookup of the host.
    geoip_lan: str
class FlagList:
    """Collection of country codes read from the flag list file.

    Codes are kept as the keys of ``flag_list``; every value is ``1`` and
    carries no meaning of its own.
    """

    def __init__(self):
        # country code -> 1 (only the keys matter)
        self.flag_list = {}

    def read(self, file_name=None):
        """Load country codes from the flag list file.

        Every non-blank line contributes one code: the text in front of the
        first "." on the line, or the whole line when it has no ".".
        Surrounding whitespace is ignored, and lines that yield an empty
        code are skipped.

        Args:
            file_name: path of the file to read. When omitted,
                ``globs.Globals.flag_list`` is used.

        Raises:
            ValueError: if the file does not exist.
        """
        if file_name is None:
            file_name = globs.Globals.flag_list

        # Check the file that is actually opened below (the previous version
        # tested the description cache file instead).
        if not os.path.isfile(file_name):
            raise ValueError(f"file not found {file_name}")

        with open(file_name, 'r') as flags_file:
            for line in flags_file:
                # strip() removes the trailing newline, so blank lines become
                # "" and a dot-less last line keeps its final character.
                code = line.strip().partition(".")[0]
                if code:
                    self.flag_list[code] = 1

        print(f"countries: {' '.join(self.flag_list)}")

    def has_flag(self, country_code):
        """Return True if ``country_code`` was found in the flag list."""
        return self.flag_list.get(country_code) is not None
class DescriptionCacheBase:
    """JSON-file backed cache that maps a URL to its ``CacheItem`` data.

    Entries are held in memory as plain dicts (the ``to_dict`` form of a
    ``CacheItem``) and are written back to the cache file only when
    something has been marked as changed.
    """

    def __init__(self):
        # url -> dict form of a CacheItem
        self.map_url_to_descr = {}
        # set whenever the in-memory map differs from what is on disk
        self.map_url_to_descr_changed = False
        # known country codes; loaded right away, so construction raises
        # whatever FlagList.read() raises
        self.flag_list = FlagList()
        self.flag_list.read()

    @staticmethod
    def set_file_name(name):
        """Set the path of the description cache file (stored in globs.Globals)."""
        globs.Globals.description_cache_file = name

    def read_description_cache(self):
        """Load the cache from the cache file; do nothing if the file is absent."""
        if os.path.isfile(globs.Globals.description_cache_file):
            with open(globs.Globals.description_cache_file, 'r') as cache_file:
                self.map_url_to_descr = json.load(cache_file)

    def write_description_cache(self):
        """Write the cache to the cache file if it has unsaved changes.

        Returns:
            True if the file was written, False if there was nothing to save.
        """
        if not self.map_url_to_descr_changed:
            return False

        with open(globs.Globals.description_cache_file, 'w') as cache_file:
            json.dump(self.map_url_to_descr, cache_file, indent=2)
        self.map_url_to_descr_changed = False
        return True

    def set_changed(self):
        """Mark the cache as modified so the next write saves it."""
        self.map_url_to_descr_changed = True

    def cache_get(self, url):
        """Return the ``CacheItem`` stored for ``url``, or None if there is none."""
        descr = self.map_url_to_descr.get(url)
        if descr is None:
            return None
        return CacheItem.from_dict(descr)

    def cache_set(self, url, obj):
        """Store ``obj`` (via its ``to_dict()``) under ``url`` and mark the cache changed."""
        self.map_url_to_descr[url] = obj.to_dict()
        self.map_url_to_descr_changed = True

    def get_country(self, cache_item):
        """Pick a country code with a known flag for ``cache_item``.

        Candidates are tried in this order and the first one present in the
        flag list wins:

        1. the part after "-" (or "_") of the html document language,
        2. the part before that separator, or the whole value when there is
           no separator,
        3. if that left nothing: the description language with its
           "__label__" prefix removed,
        4. the geo-ip language.

        A missing (None) html document language or description language is
        treated as an empty string.

        Returns:
            The matching country code, or "" when no candidate has a flag
            (a diagnostic line is printed in that case).
        """
        # Try the html document language first (it can be misleading, but
        # very few sites set the http content-language header correctly).
        lan = cache_item.html_document_language or ""

        pos = lan.find("-")
        if pos == -1:
            pos = lan.find("_")

        if pos != -1:
            # e.g. "xx-yy": prefer the region part, then fall back to "xx"
            country = lan[pos + 1:]
            if self.flag_list.has_flag(country):
                return country
            country = lan[:pos]
        else:
            country = lan

        if country == "":
            # Fall back to the detected description language, which
            # presumably carries a "__label__" prefix from the language model.
            detected = cache_item.language_description or ""
            if detected.startswith("__label__"):
                country = detected[len("__label__"):]

        if country != "" and self.flag_list.has_flag(country):
            return country

        # Use the geo-ip language as a last resort.
        country = cache_item.geoip_lan
        if self.flag_list.has_flag(country):
            return country

        print(f"no such country: {country} {repr(cache_item)}")
        return ""