# matchers.py

# coding: utf-8

from Levenshtein import hamming
from Levenshtein import jaro
import sqlite3

from operations import execute
from operations import connect_toponym
from operations import merge_suggestions

from collections.abc import Iterable
from collections import namedtuple

import logging
# logging.basicConfig(filename='anvil_matcher.log',
#                     level=logging.INFO,
#                     format='%(asctime)s %(message)s',
#                     datefmt='%Y-%m-%d @ %H:%M:%S')


# Row layout shared by every query that selects full toponym rows; the
# field order below is the column order those queries use.
ToponymTuple = namedtuple(
    'ToponymTuple',
    'toponym_id name asciiname tokens asciitokens pattern language '
    'position_fk')


# later: homogenize the use of target, target_id, new_toponym etc.
class matcher():

    def __init__(self, source=None, execute_function=execute):
        """Performs auto matching and generates suggestions for added toponyms

        Takes:
            source - if not None, the matching process will be limited to the
                one source.name supplied.
            execute_function - the function that communicates with the database

        The matching process is divided into several parts, including multiple
        functions for comparing strings and generating probable suggestions.
        It is initiated by running the 'long_matching' function, however each
        individual matching function could be called separately if needed.
        The order prescribed here is designed so that each matcher is slightly
        looser than what precedes it, to make sure that we retrieve the most
        likely candidates first and resolve the simple cases before having to
        calculate similarity or distance metrics against each toponym in the
        database.

        Whenever a toponym is automatically resolved, or multiple suggestions
        are found, a short message is left in the relevant comment field
        to explain the level and (when relevant) the strength of the pairing.

            perfect_suggestions - searches for perfect matches across the name
                variants in the fields.

            pattern_matcher - searches using the pattern-name variant across
                raw names.
                The pattern-name variant is generated by replacing differences
                between the raw name and ascii name with "_", which matches
                any one character.

            distance_matcher - calculates the distance from the toponym to
                each of the viable options and returns suggestions only if the
                distance/similarity measure is below/above a preset limit.
                Hamming - 1 (one character difference for equally long
                    strings)
                Jaro - 0.9, ~90% similarity. This very high level makes sure
                    that the suggestions are not cluttered and when it does
                    match something it is a very high chance that it is the
                    best available option.
                all_in_one - checks if either toponym's tokens can all be found
                    among the other's tokens.

        Attributes set here:
            query - select statement for the toponyms still lacking a position
            execute - the database function supplied as execute_function
            toponym_fields - comma separated column list of ToponymTuple
            suggest_query - insert statement for one suggestion row
            mark_query - update statement prepending a note to a toponym
                comment
        """

        # NOTE(review): "comment not like ..." is NULL (i.e. not true) for
        # rows whose comment is NULL, so such rows are never selected --
        # confirm the comment column cannot be NULL.
        unmatched_query = 'select toponym_id, language from toponym where '\
                          'position_fk is null and comment not like '\
                          ' "multiple_%" '
        if source is not None:
            # The source name is embedded as a single-quoted SQL string
            # literal with embedded quotes doubled: a double-quoted token may
            # be resolved as a column name by SQLite, and an unescaped quote
            # in the name would break (or rewrite) the statement.
            source_literal = str(source).replace("'", "''")
            unmatched_query += f" and source_fk == '{source_literal}' "

        self.query = unmatched_query
        self.execute = execute_function
        self.toponym_fields = ', '.join(ToponymTuple._fields)

        self.suggest_query = 'insert into suggestion '\
                        '(added_toponym_fk, stable_toponym_fk, comment) '\
                        'values (?, ?, ?)'
        # The new note is placed in front of whatever comment is stored.
        self.mark_query = 'update toponym set comment = :comment || " \n " || '\
                     'comment where toponym_id == :toponym_id'



    def format_languages(func):
        """Decorator to make sure that any supplied languages are formatted"""
        def formatter(self, *args, **kwargs):
            """Formats any supplied languages to conform to the queries"""
            languages = kwargs['languages']
            del kwargs['languages']
            if languages is None:
                languages = ''
            elif len(languages) > 0:
                if not isinstance(
                    languages[0], str) and isinstance(
                     languages[0], Iterable):
                    languages = languages[0]
                if not languages.strip().startswith('and language in'):
                    languages = ', '.join(
                        (f'"{_}"' for _ in languages if _ is not None and
                            len(_) > 1))

                    languages = f' and language in ({languages})'
            kwargs['languages'] = languages
            return func(self, *args, **kwargs)
        return formatter

    @format_languages
    def get_options(self, new_toponym, languages=''):
        """Collects every located toponym that may be paired with a new one

        Takes:
            new_toponym - toponym_id for the toponym seeking geolocating
            languages - language restriction; must be given as a keyword,
                it is formatted by the format_languages decorator

        The statement keeps toponyms that have a position and drops those
        that were already rejected (outcome == FALSE) for this toponym in
        either the suggestion or the nemo table, as well as every toponym
        sharing a source with the new toponym.

        Returns:
            A set of all viable candidates as ToponymTuple (NamedTuple)
        """
        fragments = [
            f'select {self.toponym_fields} from toponym where ',
            'position_fk is not null and ',
            'toponym_id not in ( ',
            'select stable_toponym_fk from suggestion where ',
            'added_toponym_fk == :new_toponym and outcome == FALSE ) ',
            ' and toponym_id not in ( ',
            'select stable_toponym_fk from nemo where ',
            'added_toponym_fk == :new_toponym and outcome == FALSE ) ',
            'and source_fk not in ( ',
            'select source_fk from toponym where ',
            'toponym_id == :new_toponym) ',
        ]
        if len(languages) > 0:
            fragments.append(languages)

        rows = self.execute(''.join(fragments),
                            values={'new_toponym': new_toponym})
        return {ToponymTuple(*row) for row in rows}

    @format_languages
    def perfect_matches(self, target_id, target, target_field='name',
                        languages=''):
        """
        Input:
            target: the exact name/asciiname/tokens of a toponym field
            target_field: the field to compare to,
            defaults to "name"
            languages (optional): limit the search to the provided languages
        Output:
            A set of unique position ids that perfectly match the name in the
            selected languages
        """
        query = f'select toponym_id, position_fk, name from toponym where '\
                f'{target_field} == :target and position_fk is not NULL '\
                'and toponym_id not in ('\
                'select stable_toponym_fk from suggestion where '\
                'added_toponym_fk == :target_id and outcome == FALSE) '\
                'and source_fk not in ( '\
                'select source_fk from toponym where '\
                'toponym_id == :target) '\
                'group by position_fk'

        if len(languages) > 0:
            query += languages[0]

        return set(self.execute(query, values={'target': target,
                                               'target_id': target_id}))

    def get_target_data(self, target_id):
        """
        Takes:
            target_id - a toponym ID for the target seeking geolocating

        Returns:
            ToponymTuple (namedtuple) of the toponym row
        """
        query = f'select {self.toponym_fields}'\
                ' from toponym where toponym_id == :target_id'
        _ = execute(query,
                    {'target_id': target_id})
        if len(_) == 0:
            msg = f'{target_id=} could not be found among toponyms'
            logging.debug(msg)
            raise ValueError(msg)
        elif len(_) > 1:
            msg = f'{target_id=} was found multiple times among toponyms'
            logging.critical(msg)
            sqlite3.IntegrityError(msg)
        else:
            return ToponymTuple(*_[0])

    def suggest_toponyms(self, toponym_id, suggestions, comment, suggest=False):
        """
        Takes:
            toponym_id
            suggestions - a set of suggested position_fk
            comment - str

        Inserts a row in to the suggestion table for each suggestion:
            toponym_id as new_toponym_fk
            toponym_fk from suggestion set as stable_toponym_fk
            comment as explanation for how they were matched, which will
                sometimes include a measure of how close they are.

            It then tests if all the suggested toponyms link to the same
             location, in which case this is also resoled. Unless specifically
             Instructed not to.

        Returns:
            A status message
        """

        # adding suggestions first:
        suggestions = [
            (toponym_id, stable_toponym_fk, ' '.join((comment, str(outcome)))
             ) for stable_toponym_fk, postition_fk, outcome in suggestions]

        execute(self.suggest_query, values=suggestions, many=True)

        # Marking the added toponym with "multiple"
        execute(self.mark_query, values={'comment': comment+' \n ',
                'toponym_id': toponym_id})

        if not suggest:
            merge_message = merge_suggestions(target_id=toponym_id)

            if merge_message is not None:
                return merge_message

        return 'Added all suggestions'

    # a wrapper for getting target row in order and registering outcome
    def matcher_decorator(match_suggester):
        """Decorator for managing input and recording output of matchers

        The wrapped matcher must be called with a ``target_row`` keyword
        holding either a ToponymTuple or a toponym id; an id is looked up
        with get_target_data and the resulting row is what the matcher
        receives. The matcher returns (matches, message), where each match
        is a row starting with (stable_toponym_fk, position_fk, ...).

        The wrapper returns (number of matches, message) and
            - does nothing more when there are no matches,
            - connects the toponym to the position of a single match,
              unless ``suggest`` is True,
            - stores the matches with suggest_toponyms otherwise.
        """
        from functools import wraps

        @wraps(match_suggester)
        def matcher_wrapper(self, suggest=False, **kwargs):
            """Homogenise input and output of matcher functions"""
            target_row = kwargs['target_row']
            if not isinstance(target_row, ToponymTuple):
                target_row = self.get_target_data(target_row)
                # hand the resolved row, not the raw id, to the matcher
                kwargs['target_row'] = target_row

            matches, message = match_suggester(self, **kwargs)

            if len(matches) < 1:
                return 0, message

            if len(matches) == 1 and not suggest:
                match = next(iter(matches))
                stable_toponym_fk = match[0]
                position_fk = match[1]
                comment = f'single_{message} to {stable_toponym_fk} -> '\
                          f'{position_fk}'
                connect_toponym(target_row.toponym_id, position_fk, comment)
                return 1, message

            # several matches, or the caller only wants suggestions
            comment = f'multiple_{message}'
            message += ' - ' + self.suggest_toponyms(target_row.toponym_id,
                                                     matches, comment,
                                                     suggest)
            return len(matches), message
        return matcher_wrapper

    # Suggestions for all the relevant fields
    #   -- which may not be all.
    @matcher_decorator
    def perfect_suggestions(self, target_row, fields=None):
        """Finds the perfect matches among the options across a set of fields

        Takes:
            target_row - ToponymTuple (NamedTuple) of toponym seeking
                geolocating
            fields - iterable of ToponymTuple field names to compare the
                toponyms on. Defaults to the name variants, i.e. every
                field except toponym_id, language and position_fk:
                equality on those says nothing about the name.

        The fields are tried in order and the search stops at the first
        field that gives at least one match.

        Returns:
            set of suggestions
            result message
        """

        if fields is None:
            fields = [field for field in ToponymTuple._fields
                      if field not in ('toponym_id', 'language',
                                       'position_fk')]

        for field in fields:
            if field == 'toponym_id':
                continue
            # Look the value up by name so that a custom field list is
            # paired with the right values.
            value = getattr(target_row, field)
            if value is None:
                # "== NULL" never matches; no need to ask the database
                continue
            matches = self.perfect_matches(
                target_id=target_row.toponym_id,
                target=value,
                target_field=field, languages=target_row.language)
            if len(matches) > 0:
                # returning the matches and the matched field for logging
                return matches, f'perfect on {field}'

        return set(), f'No perfect matches found for {target_row.name}'

    @matcher_decorator
    def pattern_matcher(self, target_row):
        """Matches topnym on their stored sql-like pattern"""
        query = 'select toponym_id, position_fk, name from toponym where name'\
                f' like "{target_row.pattern}" and position_fk is not NULL '\
                'and toponym_id not in ( select stable_toponym_fk from '\
                'suggestion where outcome == 0 and added_toponym_fk '\
                '== :target_id) '\
                'and source_fk not in ( '\
                'select source_fk from toponym where '\
                'toponym_id == :target_id) '\
                'group by position_fk'

        matches = set(execute(query, values={
                'target_id': target_row.toponym_id}))

        if len(matches) > 0:
            return matches, 'pattern_match'
        else:
            return set(), f'No pattern matches found for {target_row.pattern}'

    def hamming1(self, target, option):
        """Calculates the hamming distance, when strings are of equal length"""
        if len(target) == len(option):
            return hamming(target, option) == 1, 1
        return False, ''

    def jairo_measure(self, target, option, level):
        """Compares the jaro score of two strings against a threshold

        Returns (score >= level, score).
        """
        score = jaro(target, option)
        return score >= level, score

    def jairo9(self, target, option):
        """Distance measure for jaro 0.9"""
        return self.jairo_measure(target, option, level=0.9)

    def all_in_one(self, target, option):
        """Checks if all tokens of one string are in the other, or vice versa"""
        target = target.split()
        option = option.split()

        if target == option:
            return True, 'identical'
        elif all(t in option for t in target):
            return True, 'Target found in option'
        elif all(o in target for o in option):
            return True, 'Option found in target'
        else:
            return False, 'Not a match'

    @matcher_decorator
    def distance_matcher(self, target_row, options=None,
                         functions=(hamming1, jairo9, all_in_one)):
        """Runs the distance measures in order and stops at the first hit

        Takes:
            target_row - ToponymTuple of the toponym seeking geolocating
            options - candidates as ToponymTuple; fetched with get_options
                for the language of the target when None
            functions - the measures, called as func(self, target, option)
                and returning (usable, score)

        The measure named 'all_in_one' is applied to the token fields, the
        others to name and asciiname. The first measure/field combination
        with at least one usable option decides the outcome.

        Returns:
            set of (toponym_id, position_fk, description) rows
            result message
        """
        if options is None:
            options = self.get_options(target_row.toponym_id,
                                       languages=target_row.language)

        for measure in functions:
            token_based = measure.__name__ == 'all_in_one'
            columns = (['tokens', 'asciitokens'] if token_based
                       else ['name', 'asciiname'])
            for column in columns:
                position = ToponymTuple._fields.index(column)
                hits = set()
                for candidate in options:
                    accepted, score = measure(self, target_row[position],
                                              candidate[position])
                    if not accepted:
                        continue
                    hits.add((
                        candidate.toponym_id, candidate.position_fk,
                        f'{target_row[position]} ={score}= '
                        f'{candidate[position]}'))
                if hits:
                    return hits, f'{measure.__name__}_match'

        return set(), 'No distance matches found'

    def jairo6(self, target, option):
        """Distance measure for jaro 0.6 -- tested not implemented"""
        return self.jairo_measure(target, option, level=0.6)

    @matcher_decorator
    def distance_suggester(self, target_row, options=None):
        """Suggests matches based on the jaro 0.6 measure

        Not part of run_all_matchers.

        Takes:
            target_row - ToponymTuple of the toponym seeking geolocating
            options - candidates as ToponymTuple; fetched with get_options
                for the language of the target when None

        The fields name, asciiname, pattern, tokens and asciitokens are
        tried in order; the first field with at least one usable option
        decides the outcome.

        Returns:
            set of (toponym_id, position_fk, description) rows, including
                one dummy row
            result message
        """
        if options is None:
            options = self.get_options(target_row.toponym_id,
                                       languages=target_row.language)

        fields = ['name', 'asciiname', 'pattern', 'tokens',
                  'asciitokens']
        for field in fields:
            idx = ToponymTuple._fields.index(field)
            matches = []
            for option in options:
                # jairo6 is a bound method: self must not be passed again
                usable, score = self.jairo6(target_row[idx], option[idx])
                if usable:
                    matches.append(
                        (option.toponym_id, option.position_fk,
                         f'{target_row[idx]} ={score}= {option[idx]}')
                        )
            if len(matches) > 0:
                # NOTE(review): the dummy row means there is always more
                # than one match -- presumably so that such a loose match
                # is never connected automatically; confirm. It is stored
                # as a suggestion row like the others.
                matches.append((0, 0, 'dummy suggestion'))
                return set(matches), 'jairo6_match'

        return set(), 'No jairo6 matches found'

    def run_all_matchers(self, toponym_id, suggest=False):
        """Runs all the matching functions in order for a given toponym"""
        toponym_row = self.get_target_data(toponym_id)
        for func in (self.perfect_suggestions,
                     self.pattern_matcher,
                     self.distance_matcher,
                     # self.distance_suggester
                     ):
            res, message = func(target_row=toponym_row, suggest=suggest)
            if res > 0:
                return res, message
        return res, message

    def get_undone(self):
        """Gets all the toponyms that are without a match for the base query"""
        return self.execute(self.query, status='Matcher_count')

    def long_matching(self):
        """Starts the automatic matching process for a given query"""
        undone = self.get_undone()
        start = len(undone)
        last = start + 1
        round_count = 0
        while last > start:
            round_count += 1
            prev_last = last
            last = start
            status_message = f'Starting with: {start} unsolved cases'
            print(status_message)
            logging.info(status_message)
            for iterator, (toponym_id, language) in enumerate(undone):

                res, message = self.run_all_matchers(toponym_id=toponym_id)
                yield round_count, last, prev_last, iterator, message
                # for iterator, multi_string in self.matcher(func, string):

            status_message = f'Finished round {round_count} - '\
                             f'{start=}, {last=}'

            logging.info(status_message)
            undone = self.get_undone()
            start = len(undone)


class Nemo(matcher):
    """Matcher variant that stores its suggestions in the nemo table"""

    def __init__(self):
        """Sets up the matcher with a nemo specific base and insert query"""
        super().__init__()
        self.query = ('select toponym_id from toponym where '
                      'position_fk is null limit 30')
        self.suggest_query = self.suggest_query.replace('suggestion', 'nemo')

    def top_10(self, *toponyms):
        """Stores the ten best jaro scored candidates for each toponym

        Takes:
            toponyms - toponym ids; at most the first 30 are used. Without
                any ids the ids returned by the base query are used.

        Toponyms that already have a nemo row whose outcome is not False
        are skipped. For the others every option from get_options is
        scored against the toponym name and the ten highest scoring ones
        are handed to suggest_toponyms with the comment 'Nemo_'.
        """
        if len(toponyms) == 0:
            toponyms = [row[0] for row in self.execute(self.query)[:30]]
        elif len(toponyms) > 30:
            toponyms = toponyms[:30]

        logging.info('Top_10 nemo starting')

        open_rows_query = ('select * from nemo where '
                           'added_toponym_fk == :toponym_id '
                           'and outcome is not False')
        name_query = ('select name from toponym '
                      'where toponym_id == :toponym_id')

        for toponym_id in toponyms:

            logging.debug(f'Top_10, working on {toponym_id}')

            # checking if it has non-rejected suggestions
            open_rows = self.execute(open_rows_query,
                                     values={'toponym_id': toponym_id})
            if len(open_rows) > 0:
                continue

            name = self.execute(name_query,
                                values={'toponym_id': toponym_id})[0][0]

            # level 0 makes every candidate usable; only the score is kept
            ranked = sorted(
                ((self.jairo_measure(name, candidate.name, 0)[1],
                  candidate.toponym_id, candidate.position_fk)
                 for candidate in self.get_options(toponym_id,
                                                   languages='')),
                reverse=True)

            contenders = [(toponym, position, score)
                          for score, toponym, position in ranked[:10]]

            self.suggest_toponyms(toponym_id, contenders, 'Nemo_')