-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlevenshtein_distance.py
76 lines (53 loc) · 3.59 KB
/
levenshtein_distance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
## Levenshtein distance
import numpy as np
####################################################################################################################################################################################
# This module contains a method that uses the Levensthein distance between two string, in order
# to suggest a correction to a word provided from the user (test_word), drawn from a list of possible
# words (words)
def Leven_dist_comparison(test_word, words):
## Levensthein algorithm
# Given two words (test_word and word), being n_rows and n_cols, repsectively, the lengths of the two
# strings increased by one, a matrix is created accounting for the distance between each two possible
# pair of substrings of the first two strings (including a null string). For each pair of substrings
# (:j_test)|(:j_word) such distance is given by the minimum value between d((:j_test - 1),(:j_word)) + 1,
# that accounts for the addition of a character to the first substring, d((:j_test1),(:j_word - 1)) + 1,
# that accounts for the removal of a character from the second substring and d((:j_test1 - 1),(:j_word - 1)) + increase,
# that accounts for the change of a character in the substring, where increase can either be 0 if the current characters
# are equal, or 1 otherwise.
# Creating a vector where to store the distances between the test word and the other ones (count runs through the vector)
distances = np.zeros((len(words)))
count = 0
# Setting the number of rows (axis = 0) of the Levenshtein matrix
n_rows = len(test_word) + 1
# Running through the list of words, in order to evaluate their distance from the test word
for word in words:
# Setting the number of columns (axis = 1) of the Levenhtein matrix
n_cols = len(word) + 1
# Initializing the Levensthein matrix
leven_matrix = np.zeros((n_rows, n_cols))
# Setting the first row and columns of the Levensthein matrix (distance of each sub-string from the null string)
leven_matrix[:, 0] = np.arange(n_rows)
leven_matrix[0, :] = np.arange(n_cols)
# Running thorught the rows and the columns in order to evaluate the distance between each sub-string
for i_rows in range(1, n_rows):
# Index to be used for slicing the test_word string
j_test = i_rows - 1
for i_cols in range(1, n_cols):
# Index to be used for slicing the word string
j_word = i_cols - 1
# Increase in the distance between the substrings (: jtest)|(:jword) and (: jtest - 1)|(:jword - 1),
# that is equal to 0 if the current characters are equal, 1 otherwise
increase = 0
if test_word[j_test] != word[j_word]: increase = 1
# Evaluating the distance between the substring (: jtest)|(:jword)
leven_matrix[i_rows, i_cols] = min(leven_matrix[i_rows - 1, i_cols] + 1,
leven_matrix[i_rows, i_cols - 1] + 1,
leven_matrix[i_rows- 1, i_cols - 1] + increase)
# Storing the distance evaluated and updating the counter
distances[count] = leven_matrix[n_rows - 1, n_cols - 1]
count += 1
## Post-processing
# The closest word is evaluated as the word with the minimum distance from the test_word.
# Since many minima may appear, they must be all found
indices = np.nonzero(distances == np.min(distances))[0]
return([words[index] for index in indices])