-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatchers.py
495 lines (405 loc) · 20.1 KB
/
matchers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
# coding: utf-8
from Levenshtein import hamming
from Levenshtein import jaro
import sqlite3
from operations import execute
from operations import connect_toponym
from operations import merge_suggestions
from collections.abc import Iterable
from collections import namedtuple
import logging
# logging.basicConfig(filename='anvil_matcher.log',
# level=logging.INFO,
# format='%(asctime)s %(message)s',
# datefmt='%Y-%m-%d @ %H:%M:%S')
# One row of the `toponym` table. The field order is significant: the matchers
# build their select list from `ToponymTuple._fields` and unpack result rows
# positionally into this tuple.
ToponymTuple = namedtuple(
    'ToponymTuple',
    'toponym_id name asciiname tokens '
    'asciitokens pattern language '
    'position_fk')
# later: homogenize the use of target, target_id, new_toponym etc.
class matcher():
    """Performs auto matching and generates suggestions for added toponyms.

    The matching process is divided into several parts, including multiple
    functions for comparing strings and generating probable suggestions.
    It is initiated by running the 'long_matching' function, however each
    individual matching function can be called separately if needed.

    The order used by 'run_all_matchers' is designed so that each matcher is
    slightly looser than what precedes it, so that the most likely candidates
    are retrieved first and the simple cases are resolved before similarity
    or distance metrics have to be calculated against each toponym in the
    database.

    Whenever a toponym is automatically resolved, or multiple suggestions
    are found, a short message is left in the relevant comment field to
    explain the level and (when relevant) the strength of the pairing.

    perfect_suggestions - searches for perfect matches across the name
        variants in the fields.
    pattern_matcher - searches using the pattern-name variant across
        raw names.
        The pattern-name variant is generated by replacing differences
        between the raw name and ascii name with "_", which matches
        any one character.
    distance_matcher - calculates the distance from the toponym to each
        of the viable options and returns suggestions only if the
        distance/similarity measure is below/above a preset limit.
        Hamming - 1 (one character difference for equally long strings)
        Jaro - 0.9, ~90% similarity. This very high level makes sure
            that the suggestions are not cluttered and when it does
            match something there is a very high chance that it is the
            best available option.
        all_in_one - checks if either toponym's tokens can all be found
            among the other's tokens.
    """

    def __init__(self, source=None, execute_function=execute):
        """Prepare the queries used by the matching process.

        Takes:
            source - if not None, the matching process will be limited to
                toponyms whose source_fk equals this value.
            execute_function - the function that communicates with the
                database; every query issued by this class goes through it.
        """
        # NOTE(review): in SQL `NULL not like ...` is not true, so toponyms
        # whose comment is NULL are never selected here -- confirm that the
        # comment column always holds a string.
        unmatched_query = ('select toponym_id, language from toponym where '
                           'position_fk is null and comment not like '
                           ' "multiple_%" ')
        if source is not None:
            # Double embedded quotes so the value cannot terminate the
            # literal it is interpolated into.
            quoted_source = str(source).replace('"', '""')
            unmatched_query += f' and source_fk == "{quoted_source}" '
        self.query = unmatched_query
        self.execute = execute_function
        self.toponym_fields = ', '.join(ToponymTuple._fields)
        self.suggest_query = ('insert into suggestion '
                              '(added_toponym_fk, stable_toponym_fk, comment) '
                              'values (?, ?, ?)')
        self.mark_query = ('update toponym set comment = :comment || " \n " || '
                           'comment where toponym_id == :toponym_id')

    def format_languages(func):
        """Decorator to make sure that any supplied languages are formatted.

        The decorated function receives its `languages` keyword argument as
        either an empty string (no filter) or a ready-to-append SQL clause of
        the form ' and language in ("xx", "yy")'.
        """
        from functools import wraps

        def build_clause(languages):
            """Turn None / a language / a collection of them into SQL."""
            if languages is None:
                return ''
            if isinstance(languages, str):
                stripped = languages.strip()
                if not stripped:
                    return ''
                if stripped.startswith('and language in'):
                    # Already formatted by an earlier pass.
                    return languages
                # A single language must not be iterated per character.
                names = [languages]
            else:
                names = list(languages)
                # Accept one nested collection, e.g. (('en', 'de'),).
                if (names and not isinstance(names[0], str)
                        and isinstance(names[0], Iterable)):
                    names = list(names[0])
            quoted = ', '.join(
                '"{}"'.format(name.replace('"', '""'))
                for name in names
                if isinstance(name, str) and len(name) > 1)
            if not quoted:
                # Nothing usable was supplied: do not filter on language.
                return ''
            return f' and language in ({quoted})'

        @wraps(func)
        def formatter(self, *args, **kwargs):
            """Formats any supplied languages to conform to the queries"""
            # Only a keyword `languages` is rewritten; when it is absent the
            # wrapped function's own default applies.
            if 'languages' in kwargs:
                kwargs['languages'] = build_clause(kwargs['languages'])
            return func(self, *args, **kwargs)
        return formatter

    @format_languages
    def get_options(self, new_toponym, languages=''):
        """Queries the database for all the viable options.

        Viable options are positioned toponyms from a different source than
        the new toponym, which have not already been rejected for it in the
        suggestion or nemo tables.

        Takes:
            new_toponym - toponym_id for the toponym seeking geolocating
            languages - optional language filter (see format_languages)
        Returns:
            A set of all viable candidates as ToponymTuple (NamedTuple)
        """
        query = (f'select {self.toponym_fields} from toponym where '
                 'position_fk is not null and '
                 'toponym_id not in ( '
                 'select stable_toponym_fk from suggestion where '
                 'added_toponym_fk == :new_toponym and outcome == FALSE ) '
                 ' and toponym_id not in ( '
                 'select stable_toponym_fk from nemo where '
                 'added_toponym_fk == :new_toponym and outcome == FALSE ) '
                 'and source_fk not in ( '
                 'select source_fk from toponym where '
                 'toponym_id == :new_toponym) ')
        if languages:
            query += languages
        rows = self.execute(query, values={'new_toponym': new_toponym})
        return {ToponymTuple(*row) for row in rows}

    @format_languages
    def perfect_matches(self, target_id, target, target_field='name',
                        languages=''):
        """Finds positioned toponyms whose field equals the target exactly.

        Input:
            target_id: toponym_id of the toponym seeking geolocating
            target: the exact name/asciiname/tokens of a toponym field
            target_field: the field to compare to, defaults to "name".
                It is interpolated into the query as a column name, so it
                must come from trusted code, never from user input.
            languages (optional): limit the search to the provided languages
        Output:
            A set of (toponym_id, position_fk, name) rows, one per position,
            that perfectly match the target in the selected languages.
        """
        query = ('select toponym_id, position_fk, name from toponym where '
                 f'{target_field} == :target and position_fk is not NULL '
                 'and toponym_id not in ('
                 'select stable_toponym_fk from suggestion where '
                 'added_toponym_fk == :target_id and outcome == FALSE) '
                 'and source_fk not in ( '
                 'select source_fk from toponym where '
                 'toponym_id == :target_id) ')
        # The language filter belongs to the where-clause, so it has to be
        # added before the grouping.
        if languages:
            query += languages
        query += ' group by position_fk'
        return set(self.execute(query, values={'target': target,
                                               'target_id': target_id}))

    def get_target_data(self, target_id):
        """Fetches the toponym row for a toponym ID.

        Takes:
            target_id - a toponym ID for the target seeking geolocating
        Returns:
            ToponymTuple (namedtuple) of the toponym row
        Raises:
            ValueError - the ID does not exist among the toponyms
            sqlite3.IntegrityError - the ID exists more than once
        """
        query = (f'select {self.toponym_fields}'
                 ' from toponym where toponym_id == :target_id')
        rows = self.execute(query, values={'target_id': target_id})
        if len(rows) == 0:
            msg = f'{target_id=} could not be found among toponyms'
            logging.debug(msg)
            raise ValueError(msg)
        if len(rows) > 1:
            msg = f'{target_id=} was found multiple times among toponyms'
            logging.critical(msg)
            raise sqlite3.IntegrityError(msg)
        return ToponymTuple(*rows[0])

    def suggest_toponyms(self, toponym_id, suggestions, comment, suggest=False):
        """Stores suggestions for a toponym and tries to merge them.

        Takes:
            toponym_id - ID of the added toponym
            suggestions - iterable of 3-tuples
                (stable_toponym_fk, position_fk, outcome)
            comment - str
            suggest - when True the suggestions are only stored and no
                merge is attempted
        Inserts a row with self.suggest_query for each suggestion:
            toponym_id as added_toponym_fk
            the suggested toponym as stable_toponym_fk
            comment followed by the outcome as explanation for how they
                were matched, which will sometimes include a measure of
                how close they are.
        The added toponym's own comment is then prefixed with `comment`, and
        unless `suggest` is set merge_suggestions is called for the toponym.
        Returns:
            The message from merge_suggestions when it returns one,
            otherwise 'Added all suggestions'
        """
        # adding suggestions first:
        rows = [
            (toponym_id, stable_toponym_fk, ' '.join((comment, str(outcome))))
            for stable_toponym_fk, _position_fk, outcome in suggestions]
        self.execute(self.suggest_query, values=rows, many=True)
        # Marking the added toponym with the comment ("multiple_..." etc.)
        self.execute(self.mark_query, values={'comment': comment + ' \n ',
                                              'toponym_id': toponym_id})
        if not suggest:
            merge_message = merge_suggestions(target_id=toponym_id)
            if merge_message is not None:
                return merge_message
        return 'Added all suggestions'

    # a wrapper for getting target row in order and registering outcome
    def matcher_decorator(match_suggester):
        """Decorator for managing input and recording output of matchers.

        The wrapped matcher must return (matches, message). The wrapper
        returns (number_of_matches, message) after acting on the matches:
            no matches - nothing is stored
            one match and not suggest - the toponym is connected directly
            otherwise - the matches are stored as suggestions
        """
        from functools import wraps

        @wraps(match_suggester)
        def matcher_wrapper(self, suggest=False, **kwargs):
            """Homogenise input and output of matcher functions"""
            target_row = kwargs['target_row']
            if not isinstance(target_row, ToponymTuple):
                # A bare toponym_id was supplied; the matcher itself needs
                # the full row, so hand the resolved row on to it.
                target_row = self.get_target_data(target_row)
                kwargs['target_row'] = target_row
            matches, message = match_suggester(self, **kwargs)
            if len(matches) < 1:
                return 0, message
            if len(matches) == 1 and not suggest:
                stable_toponym_fk, position_fk = list(matches)[0][:2]
                comment = (f'single_{message} to {stable_toponym_fk} -> '
                           f'{position_fk}')
                connect_toponym(target_row.toponym_id, position_fk, comment)
                return 1, message
            comment = f'multiple_{message}'
            message += ' - ' + self.suggest_toponyms(target_row.toponym_id,
                                                     matches, comment,
                                                     suggest)
            return len(matches), message
        return matcher_wrapper

    # Suggestions for all the relevant fields
    # -- which may not be all.
    @matcher_decorator
    def perfect_suggestions(self, target_row, fields=None):
        """Finds the perfect matches among the options across a set of fields.

        Takes:
            target_row - ToponymTuple (NamedTuple) of toponym seeking
                geolocating
            fields - iterable of the field names to compare the toponyms on.
                Defaults to the name-like fields; 'toponym_id', 'language'
                and 'position_fk' are left out because equality on them
                does not say anything about the name.
        Returns:
            set of suggestions for the first field that yields any
            result message
        """
        if fields is None:
            fields = [field for field in ToponymTuple._fields
                      if field not in ('toponym_id', 'language',
                                       'position_fk')]
        for field in fields:
            if field == 'toponym_id':
                continue
            # Look the value up by name so a custom `fields` selection is
            # paired with the right values.
            value = getattr(target_row, field)
            if value is None:
                # `column == NULL` never matches, no need to ask.
                continue
            matches = self.perfect_matches(
                target_id=target_row.toponym_id,
                target=value,
                target_field=field, languages=target_row.language)
            if len(matches) > 0:
                # returning the matches and the matched field for logging
                return matches, f'perfect on {field}'
        return set(), f'No perfect matches found for {target_row.name}'

    @matcher_decorator
    def pattern_matcher(self, target_row):
        """Matches toponyms on their stored sql-like pattern.

        Takes:
            target_row - ToponymTuple of the toponym seeking geolocating
        Returns:
            set of (toponym_id, position_fk, name) rows whose name is LIKE
                the target's pattern, one per position
            result message
        """
        if target_row.pattern is None:
            return set(), f'No pattern matches found for {target_row.pattern}'
        # The pattern is bound as a parameter so that quotes in it cannot
        # break (or alter) the query.
        query = ('select toponym_id, position_fk, name from toponym where name'
                 ' like :pattern and position_fk is not NULL '
                 'and toponym_id not in ( select stable_toponym_fk from '
                 'suggestion where outcome == 0 and added_toponym_fk '
                 '== :target_id) '
                 'and source_fk not in ( '
                 'select source_fk from toponym where '
                 'toponym_id == :target_id) '
                 'group by position_fk')
        matches = set(self.execute(query, values={
            'pattern': target_row.pattern,
            'target_id': target_row.toponym_id}))
        if len(matches) > 0:
            return matches, 'pattern_match'
        return set(), f'No pattern matches found for {target_row.pattern}'

    def hamming1(self, target, option):
        """Tests for a hamming distance of exactly 1.

        Only strings of equal length are compared.
        Returns (True/False, 1) for equally long strings, otherwise
        (False, '').
        """
        if len(target) == len(option):
            return hamming(target, option) == 1, 1
        return False, ''

    def jairo_measure(self, target, option, level):
        """Base function for creating a binary outcome of jaro distance calc.

        Returns (score >= level, score).
        """
        score = jaro(target, option)
        return score >= level, score

    def jairo9(self, target, option):
        """Distance measure for jaro 0.9"""
        return self.jairo_measure(target, option, level=0.9)

    def all_in_one(self, target, option):
        """Checks if all tokens of one string are in the other, or vice versa.

        Both strings are split on whitespace. A string without any tokens
        never matches, since an empty token list would otherwise be "found"
        in every option.
        Returns (True/False, description of the relation).
        """
        target_tokens = target.split()
        option_tokens = option.split()
        if not target_tokens or not option_tokens:
            return False, 'Not a match'
        if target_tokens == option_tokens:
            return True, 'identical'
        target_set = set(target_tokens)
        option_set = set(option_tokens)
        if target_set <= option_set:
            return True, 'Target found in option'
        if option_set <= target_set:
            return True, 'Option found in target'
        return False, 'Not a match'

    @matcher_decorator
    def distance_matcher(self, target_row, options=None,
                         functions=(hamming1, jairo9, all_in_one)):
        """Matching function that progresses through the distance measures.

        Takes:
            target_row - ToponymTuple of the toponym seeking geolocating
            options - candidates as ToponymTuple, defaults to get_options
                for the target in the target's language
            functions - the measures to try, in order. They are called as
                func(self, target_value, option_value), i.e. they must be
                plain (unbound) functions returning (usable, score).
                'all_in_one' is applied to the token fields, every other
                measure to the name fields.
        Returns:
            set of (toponym_id, position_fk, description) for the first
                measure/field combination that yields any match
            result message
        """
        if options is None:
            options = self.get_options(target_row.toponym_id,
                                       languages=target_row.language)
        for func in functions:
            if func.__name__ == 'all_in_one':
                fields = ['tokens', 'asciitokens']
            else:
                fields = ['name', 'asciiname']
            for field in fields:
                idx = ToponymTuple._fields.index(field)
                target_value = target_row[idx]
                if target_value is None:
                    # Nothing to compare on for this field.
                    continue
                matches = []
                for option in options:
                    if option[idx] is None:
                        continue
                    usable, score = func(self, target_value, option[idx])
                    if usable:
                        matches.append(
                            (option.toponym_id, option.position_fk,
                             f'{target_value} ={score}= {option[idx]}'))
                if len(matches) > 0:
                    return set(matches), f'{func.__name__}_match'
        return set(), 'No distance matches found'

    def jairo6(self, target, option):
        """Distance measure for jaro 0.6 -- tested not implemented"""
        return self.jairo_measure(target, option, level=0.6)

    @matcher_decorator
    def distance_suggester(self, target_row, options=None):
        """Suggests matches based on the jaro6 distance -- not implemented.

        Takes:
            target_row - ToponymTuple of the toponym seeking geolocating
            options - candidates as ToponymTuple, defaults to get_options
                for the target in the target's language
        Returns:
            set of (toponym_id, position_fk, description) for the first
                field that yields any match, plus one dummy entry
            result message
        """
        if options is None:
            options = self.get_options(target_row.toponym_id,
                                       languages=target_row.language)
        fields = ['name', 'asciiname', 'pattern', 'tokens',
                  'asciitokens']
        for field in fields:
            idx = ToponymTuple._fields.index(field)
            target_value = target_row[idx]
            if target_value is None:
                continue
            matches = []
            for option in options:
                if option[idx] is None:
                    continue
                usable, score = self.jairo6(target_value, option[idx])
                if usable:
                    matches.append(
                        (option.toponym_id, option.position_fk,
                         f'{target_value} ={score}= {option[idx]}'))
            if len(matches) > 0:
                # NOTE(review): presumably the dummy entry guarantees more
                # than one match, so that a lone loose match is stored as a
                # suggestion instead of being connected -- confirm.
                matches.append((0, 0, 'dummy suggestion'))
                return set(matches), 'jairo6_match'
        return set(), 'No jairo6 matches found'

    def run_all_matchers(self, toponym_id, suggest=False):
        """Runs all the matching functions in order for a given toponym.

        Stops at the first matcher that reports any match.
        Returns:
            (number of matches, message) of the last matcher that was run
        """
        toponym_row = self.get_target_data(toponym_id)
        matchers = (self.perfect_suggestions,
                    self.pattern_matcher,
                    self.distance_matcher,
                    # self.distance_suggester
                    )
        for func in matchers:
            res, message = func(target_row=toponym_row, suggest=suggest)
            if res > 0:
                break
        return res, message

    def get_undone(self):
        """Gets all the toponyms that are without a match for the base query"""
        return self.execute(self.query, status='Matcher_count')

    def long_matching(self):
        """Starts the automatic matching process for a given query.

        Generator. Every unmatched toponym is run through run_all_matchers,
        and the whole set is fetched and processed again for as long as a
        round reduces the number of unmatched toponyms.
        Yields, once per processed toponym:
            (round number, unmatched count at round start,
             previous value of that count, index within the round,
             matcher message)
        """
        undone = self.get_undone()
        start = len(undone)
        last = start + 1
        round_count = 0
        while last > start:
            round_count += 1
            prev_last = last
            last = start
            status_message = f'Starting with: {start} unsolved cases'
            print(status_message)
            logging.info(status_message)
            for iterator, row in enumerate(undone):
                # Only the toponym_id (first column) is needed here.
                _res, message = self.run_all_matchers(toponym_id=row[0])
                yield round_count, last, prev_last, iterator, message
            undone = self.get_undone()
            start = len(undone)
            status_message = (f'Finished round {round_count} - '
                              f'{start=}, {last=}')
            logging.info(status_message)
class Nemo(matcher):
    """Matcher variant that stores its best-scoring candidates in `nemo`.

    Instead of running the strict matchers, top_10 ranks every viable option
    by jaro similarity on the name and records the ten best per toponym.
    """

    def __init__(self):
        """Set up the base matcher with nemo-specific queries."""
        super().__init__()
        self.query = ('select toponym_id from toponym where '
                      'position_fk is null limit 30')
        # Same insert as the base class, but aimed at the nemo table.
        self.suggest_query = self.suggest_query.replace('suggestion', 'nemo')

    def top_10(self, *toponyms):
        """Stores the ten most name-similar options for up to 30 toponyms.

        Takes:
            *toponyms - toponym IDs to work on; when none are given the
                IDs returned by self.query are used. At most the first 30
                are processed.
        Toponyms that already have a nemo row that is not rejected are
        skipped, as are IDs that cannot be found (or have no name).
        Returns:
            None, the results are written through suggest_toponyms
        """
        if len(toponyms) == 0:
            toponyms = [row[0] for row in self.execute(self.query)[:30]]
        elif len(toponyms) > 30:
            toponyms = toponyms[:30]
        logging.info('Top_10 nemo starting')
        pending_query = ('select * from nemo where '
                         'added_toponym_fk == :toponym_id '
                         'and outcome is not False')
        name_query = 'select name from toponym where toponym_id == :toponym_id'
        for toponym_id in toponyms:
            logging.debug(f'Top_10, working on {toponym_id}')
            values = {'toponym_id': toponym_id}
            # checking if it has non-rejected suggestions
            if len(self.execute(pending_query, values=values)) > 0:
                continue
            name_rows = self.execute(name_query, values=values)
            if len(name_rows) == 0 or name_rows[0][0] is None:
                # Unknown ID or nameless toponym: nothing to compare on.
                logging.warning(f'Top_10, no name found for {toponym_id}')
                continue
            name = name_rows[0][0]
            scored = [
                (self.jairo_measure(name, candidate.name, 0)[1],
                 candidate.toponym_id, candidate.position_fk)
                for candidate in self.get_options(toponym_id, languages='')
                if candidate.name is not None]
            # Best score first; suggest_toponyms expects
            # (toponym, position, outcome) tuples.
            contenders = [(toponym, position, score)
                          for score, toponym, position in sorted(
                              scored, reverse=True)[:10]]
            self.suggest_toponyms(toponym_id, contenders, 'Nemo_')