Skip to content

Commit

Permalink
Increase read text speed
Browse files Browse the repository at this point in the history
  • Loading branch information
aikikode committed Feb 20, 2017
1 parent 5f06b60 commit 010efb9
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 21 deletions.
20 changes: 11 additions & 9 deletions geotext/geotext.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def _build_geodb(self):
country_db, state_db, city_db, nationality_db,
city_abbreviation_db, country_abbreviation_db
)
self._location_length = self._get_locations_length()
self._max_location_length = self._get_locations_length()[0]

def _get_locations_length(self):
words_counts = set()
Expand All @@ -81,7 +81,9 @@ def _get_candidates(self, text):
# TODO: improve this, since DB has unicode symbols in cities
text = re.sub(r'[^\w]+', ' ', text).strip()

return CandidateDB(text).get_candidates()
return CandidateDB(
text, max_phrase_len=self._max_location_length
).get_candidates()

def read(self, text, min_population=0, skip_nationalities=False):
self.text = text
Expand Down Expand Up @@ -147,7 +149,7 @@ def _get_locations_from_candidates(
city_abbrev_match.place.population >= min_population
):
cities.add(city_abbrev_match.place)
candidate.is_location = True
candidate.mark_as_location()
continue

# 2
Expand All @@ -159,7 +161,7 @@ def _get_locations_from_candidates(
state_match.country.population >= min_population
):
states.add(state_match)
candidate.is_location = True
candidate.mark_as_location()
continue

# 3
Expand All @@ -169,7 +171,7 @@ def _get_locations_from_candidates(
)
if country_match and country_match.population >= min_population:
countries.add(country_match)
candidate.is_location = True
candidate.mark_as_location()
continue

# 4
Expand All @@ -182,7 +184,7 @@ def _get_locations_from_candidates(
nationality_match.place.population >= min_population
):
nationalities.add(nationality_match.place)
candidate.is_location = True
candidate.mark_as_location()
continue

# 5
Expand All @@ -194,14 +196,14 @@ def _get_locations_from_candidates(
country_abbrev_match.place.population >= min_population
):
countries.add(country_abbrev_match.place)
candidate.is_location = True
candidate.mark_as_location()
continue

# 6
city_match = self._geodb.city_db.search(candidate.text.lower())
if city_match and city_match.population >= min_population:
cities.add(city_match)
candidate.is_location = True
candidate.mark_as_location()
continue

# 7
Expand All @@ -213,7 +215,7 @@ def _get_locations_from_candidates(
state_match.country.population >= min_population
):
states.add(state_match)
candidate.is_location = True
candidate.mark_as_location()
continue
return (
tuple(countries), tuple(nationalities), tuple(states),
Expand Down
29 changes: 17 additions & 12 deletions geotext/models/candidate.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,22 +8,27 @@ class Candidate(object):
def __init__(self, text):
    """
    Create a candidate for ``text`` with no relations and no location
    flags set.

    :param text: candidate text, stored unchanged on ``self.text``
    """
    self.text = text
    # Related candidates in each direction; both sets start empty.
    self.parents = set()
    self.children = set()
    # Whether this candidate is a valid location
    self.is_location = False
    # Whether any of candidate parent and their parents and so on is
    # a valid location
    self.has_parent_location = False

def add_parent(self, parent):
    """
    Link ``parent`` to this candidate in both directions.

    :param parent: candidate to register as a parent of this one
    """
    # Upward link: remember the parent on this candidate.
    self.parents.add(parent)
    # Downward link: remember this candidate on the parent.
    parent.children.add(self)

@property
def has_parent_location(self):
    """
    Tell whether some ancestor of this candidate is a valid location.

    Each parent is checked for its own ``is_location`` flag first and,
    failing that, for the same property evaluated on the parent.

    :return: True if any parent qualifies, False otherwise (including
        when there are no parents)
    """
    return any(
        ancestor.is_location or ancestor.has_parent_location
        for ancestor in self.parents
    )
def get_all_children(self):
    """
    Collect every descendant of this candidate: its children, their
    children, and so on.

    The result is a new set. ``self.children`` and the ``children`` sets
    of the visited candidates are only read, never modified, so direct
    children stay distinguishable from deeper descendants and no set is
    resized while it is being iterated.

    :return: set of all candidates reachable through ``children`` links
    """
    descendants = set()
    # Snapshot the direct children so the traversal never touches the
    # live set.
    pending = list(self.children)
    while pending:
        node = pending.pop()
        if node in descendants:
            # Already reached through another path; skipping it also
            # keeps the walk finite if the links ever form a loop.
            continue
        descendants.add(node)
        pending.extend(node.children)
    return descendants

def mark_as_location(self):
    """
    Flag this candidate as a valid location and set
    ``has_parent_location`` on everything ``get_all_children()`` returns.
    """
    self.is_location = True
    descendants = self.get_all_children()
    for descendant in descendants:
        descendant.has_parent_location = True

def __repr__(self):
    """
    Return a debug representation: the class name, a colon, and the
    candidate text in double quotes.
    """
    class_name = type(self).__name__
    template = '{}: "{}"'
    return template.format(class_name, self.text)
Expand Down

0 comments on commit 010efb9

Please sign in to comment.