
Longest common subsequence similarity
Tongjilibo committed Jul 8, 2024
1 parent a3c5772 commit 2ca5d27
Showing 3 changed files with 48 additions and 5 deletions.
23 changes: 22 additions & 1 deletion bert4vector/core/lteral.py
@@ -8,7 +8,7 @@
import numpy as np
from loguru import logger
from .base import PairedSimilarity, VectorSimilarity
- from ..snippets.distance import string_hash, hamming_distance, longest_common_substring_size
+ from ..snippets.distance import string_hash, hamming_distance, longest_common_substring_size, longest_common_subsequence_size
from ..snippets.rank_bm25 import BM25Okapi
from ..snippets.tfidf import TFIDF, load_stopwords, default_stopwords_file
from ..snippets.util import cos_sim, semantic_search
@@ -19,6 +19,7 @@
__all__ = [
'SameCharsSimilarity',
'LongestCommonSubstringSimilarity',
'LongestCommonSubsequenceSimilarity',
'HownetSimilarity',
'SimHashSimilarity',
'TfidfSimilarity',
@@ -98,6 +99,26 @@ def calc_pair_sim(self, emb1:str, emb2:str, **kwargs):
        return similarity_score


class LongestCommonSubsequenceSimilarity(PairedSimilarity):
    """Similarity based on the ratio of the longest common subsequence
    """
    def __init__(self, corpus: List[str] = None,
                 min_same_len: int = 70,
                 min_same_len_score: float = 0.9):
        super().__init__(corpus=corpus, matching_type='LongestCommonSubsequenceSimilarity')
        self.min_same_len = min_same_len
        self.min_same_len_score = min_same_len_score

    def calc_pair_sim(self, emb1:str, emb2:str, **kwargs):
        if not emb1 or not emb2:
            return 0.0
        same_size = longest_common_subsequence_size(emb1, emb2)
        same_score = self.min_same_len_score if same_size > self.min_same_len else 0.0
        # Take the max of (LCS length / each string's length), with a floor of min_same_len_score for long matches
        similarity_score = max(same_size / len(emb1), same_size / len(emb2), same_score)
        return similarity_score


class HownetSimilarity(PairedSimilarity):
"""计算两组texts之间的Hownet相似度
"""
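For orientation, a minimal hedged sketch of the scoring rule added above, using only the pieces shown in this diff. It assumes bert4vector is installed and that the class can be built without a corpus (as the corpus=None default suggests); the example strings are purely illustrative.

# Hedged sketch: call LongestCommonSubsequenceSimilarity.calc_pair_sim directly.
# Assumption: constructing the class with no corpus is valid (the base class is not shown here).
from bert4vector.core import LongestCommonSubsequenceSimilarity

sim = LongestCommonSubsequenceSimilarity()  # defaults: min_same_len=70, min_same_len_score=0.9
a, b = 'abcXde', 'abcYde'
# LCS('abcXde', 'abcYde') is 'abcde' (length 5), so the score is
# max(5/6, 5/6, 0.0) ≈ 0.83; the 0.9 floor only applies once the LCS exceeds 70 characters.
print(sim.calc_pair_sim(a, b))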
22 changes: 21 additions & 1 deletion bert4vector/snippets/distance.py
@@ -112,7 +112,7 @@ def is_str_match(str1, str2, threshold=1.0):
    return (1.0 - edit_distance(str1, str2)) >= threshold


- def longest_common_substring_size(str1, str2):
+ def longest_common_substring_size(str1:str, str2:str):
"""最长公共子串长度"""
sq = SequenceMatcher(None, str1, str2)
match = sq.find_longest_match(0, len(str1), 0, len(str2))
@@ -126,6 +126,26 @@ def longest_common_substring_ratio(str1, str2):
    return try_divide(match.size, min(len(str1), len(str2)))


def longest_common_subsequence_size(text1: str, text2: str) -> int:
    '''Length of the longest common subsequence'''
    m, n = len(text1), len(text2)
    # dp[i][j] is the length of the longest common subsequence of text1[:i] and text2[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m):
        for j in range(n):
            if text1[i] == text2[j]:
                # If text1[i] == text2[j], this pair of characters ends the longest common
                # subsequence, so dp[i + 1][j + 1] is derived from dp[i][j] + 1
                dp[i + 1][j + 1] = dp[i][j] + 1
            else:
                # If text1[i] != text2[j],
                # dp[i + 1][j + 1] can only come from dp[i + 1][j] or dp[i][j + 1]
                dp[i + 1][j + 1] = max(dp[i + 1][j], dp[i][j + 1])

    # dp[m][n] is the length of the longest common subsequence of text1 and text2
    return dp[m][n]


def jaccard_coef(A, B):
    if not isinstance(A, set):
        A = set(A)
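A quick hedged comparison of the two helpers, assuming the module path bert4vector.snippets.distance is importable (it is the path used by the imports in core/lteral.py above). The point of the new function: a common substring must be contiguous, while a common subsequence may skip characters.

# Hedged example comparing the two distance helpers; the strings are illustrative.
from bert4vector.snippets.distance import longest_common_substring_size, longest_common_subsequence_size

s1, s2 = 'abcXde', 'abcYde'
print(longest_common_substring_size(s1, s2))    # 3 -> 'abc' (contiguous run only)
print(longest_common_subsequence_size(s1, s2))  # 5 -> 'abcde' (gaps are allowed)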
8 changes: 5 additions & 3 deletions test/test_lteral_similarity.py
@@ -2,6 +2,7 @@
'''
from bert4vector.core import SameCharsSimilarity, LongestCommonSubstringSimilarity, CilinSimilarity
from bert4vector.core import HownetSimilarity, SimHashSimilarity, TfidfSimilarity, BM25Similarity
from bert4vector.core import LongestCommonSubsequenceSimilarity
import pytest


@@ -11,7 +12,8 @@
SimHashSimilarity,
TfidfSimilarity,
BM25Similarity,
- CilinSimilarity])
+ CilinSimilarity,
+ LongestCommonSubsequenceSimilarity])
def test_literal_similarity(text2vecClass):
    '''Longest common subsequence similarity'''
    text2vec = text2vecClass()
@@ -23,7 +25,7 @@ def test_literal_similarity(text2vecClass):
    similarity = text2vec.similarity(sent1, sent2)
    print(similarity)

-   text2vec.add_corpus(['你好', '我选你'])
+   text2vec.add_corpus(['你们好', '我选你'])
    text2vec.add_corpus(['天气不错', '人很好看'])
    text2vec.save(corpus_path='../cache/corpus.jsonl', emb_path='../cache/emb.jsonl')
    text2vec.load(corpus_path='../cache/corpus.jsonl', emb_path='../cache/emb.jsonl')
@@ -32,4 +34,4 @@


if __name__ == '__main__':
-   test_literal_similarity(CilinSimilarity)
+   test_literal_similarity(LongestCommonSubstringSimilarity)
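
For a quick manual check, the new class can also be pushed through the same helper the __main__ block calls. A hedged sketch, assuming it is run from inside this test module so that test_literal_similarity is in scope:

# Hedged sketch: run the existing test body once with the new similarity class.
from bert4vector.core import LongestCommonSubsequenceSimilarity

test_literal_similarity(LongestCommonSubsequenceSimilarity)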

