implement all editops APIs using RapidFuzz

rapidfuzz · Jul 22, 2022 · d252234 · d252234
1 parent f812040
commit d252234
Show file tree

Hide file tree

Showing 9 changed files with 85 additions and 576 deletions.
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,13 +1,12 @@
 ## Changelog
 
-### v0.19.4
+### v0.20.0
 #### Changed
-- use `matching_blocks`/`apply` implementation from RapidFuzz
+- use `matching_blocks`/`apply`/`remove_subsequence`/`inverse` implementation from RapidFuzz
 
 #### Fixed
 - stop adding data to wheels
-- add editops validation to subtract_edit to fix segmentation fault on some
-  invalid editop sequences
+- fix segmentation fault on some invalid editop sequences in subtract_edit
 - detect duplicated entries in editops validation
 
 ### v0.19.3

diff --git a/README.md b/README.md
@@ -36,7 +36,7 @@ This is a fork of [ztane/python-Levenshtein](https://github.com/ztane/python-Lev
 project is no longer actively maintained.
 
 ## Requirements
-* Python 3.5 or later
+* Python 3.6 or later
 
 ## Installation
 ```bash

diff --git a/docs/conf.py b/docs/conf.py
@@ -22,7 +22,7 @@
 author = 'Max Bachmann'
 
 # The full version, including alpha/beta/rc tags
-release = '0.19.3'
+release = '0.20.0'
 
 # -- General configuration ---------------------------------------------------
 

diff --git a/extern/rapidfuzz-cpp b/extern/rapidfuzz-cpp
diff --git a/setup.py b/setup.py
@@ -5,10 +5,10 @@
 
 setup(
     name="Levenshtein",
-    version="0.19.3",
+    version="0.20.0",
     url="https://github.com/maxbachmann/Levenshtein",
     author="Max Bachmann",
-    install_requires=["rapidfuzz >= 2.0.1, < 3.0.0"],
+    install_requires=["rapidfuzz >= 2.3.0, < 3.0.0"],
     author_email="[email protected]",
     description="Python extension for computing string edit distances and similarities.",
     long_description=readme,

diff --git a/src/Levenshtein/Levenshtein-c/_levenshtein.cpp b/src/Levenshtein/Levenshtein-c/_levenshtein.cpp
@@ -405,241 +405,3 @@ munkers_blackman(size_t n1, size_t n2, double *dists)
 }
 
 /* }}} */
-
-/****************************************************************************
- *
- * Editops and other difflib-like stuff.
- *
- ****************************************************************************/
-/* {{{ */
-
-/**
- * lev_editops_valid:
- * @len1: The length of an eventual @ops source string.
- * @len2: The length of an eventual @ops destination string.
- * @n: The length of @ops.
- * @ops: An array of elementary edit operations.
- *
- * Checks whether @ops is consistent and applicable as a partial edit from a
- * string of length @len1 to a string of length @len2.
- *
- * Returns: true if valid, false otherwise
- **/
-bool lev_editops_valid(size_t len1, size_t len2, size_t n, const LevEditOp *ops)
-{
-  if (!n)
-    return true;
-
-  /* check bounds */
-  const LevEditOp* o = ops;
-  for (size_t i = n; i; i--, o++) {
-    if (o->type >= LEV_EDIT_LAST)
-      return false;
-    if (o->spos > len1 || o->dpos > len2)
-      return false;
-    if (o->spos == len1 && o->type != LEV_EDIT_INSERT)
-      return false;
-    if (o->dpos == len2 && o->type != LEV_EDIT_DELETE)
-      return false;
-  }
-
-  /* check ordering */
-  o = ops + 1;
-  for (size_t i = n - 1; i; i--, o++, ops++) {
-    if (o->spos > ops->spos || o->dpos > ops->dpos)
-      continue;
-
-    return false;
-  }
-
-  return true;
-}
-
-/**
- * lev_opcodes_valid:
- * @len1: The length of an eventual @ops source string.
- * @len2: The length of an eventual @ops destination string.
- * @nb: The length of @bops.
- * @bops: An array of difflib block edit operation codes.
- *
- * Checks whether @bops is consistent and applicable as an edit from a
- * string of length @len1 to a string of length @len2.
- *
- * Returns: true if valid, false otherwise
- **/
-bool lev_opcodes_valid(size_t len1, size_t len2, size_t nb, const LevOpCode *bops)
-{
-  if (!nb)
-    return (len1 == 0 && len2 == 0);
-
-  /* check completeness */
-  if (bops->sbeg || bops->dbeg
-      || bops[nb - 1].send != len1 || bops[nb - 1].dend != len2)
-    return false;
-
-  /* check bounds and block consistency */
-  const LevOpCode* b = bops;
-  for (size_t i = nb; i; i--, b++) {
-    if (b->send > len1 || b->dend > len2)
-      return false;
-    switch (b->type) {
-    case LEV_EDIT_KEEP:
-    case LEV_EDIT_REPLACE:
-      if (b->dend - b->dbeg != b->send - b->sbeg || b->dend == b->dbeg)
-        return false;
-      break;
-    case LEV_EDIT_INSERT:
-      if (b->dend - b->dbeg == 0 || b->send - b->sbeg != 0)
-        return false;
-      break;
-    case LEV_EDIT_DELETE:
-      if (b->send - b->sbeg == 0 || b->dend - b->dbeg != 0)
-        return false;
-      break;
-
-    default:
-      return false;
-    }
-  }
-
-  /* check ordering */
-  b = bops + 1;
-  for (size_t i = nb - 1; i; i--, b++, bops++) {
-    if (b->sbeg != bops->send || b->dbeg != bops->dend)
-      return false;
-  }
-
-  return true;
-}
-
-/**
- * lev_editops_invert:
- * @n: The length of @ops.
- * @ops: An array of elementary edit operations.
- *
- * Inverts the sense of @ops.  It is modified in place.
- *
- * In other words, @ops becomes a valid partial edit for the original source
- * and destination strings with their roles exchanged.
- **/
-void lev_editops_invert(size_t n, LevEditOp *ops)
-{
-  for (size_t i = n; i; i--, ops++) {
-    size_t z = ops->dpos;
-    ops->dpos = ops->spos;
-    ops->spos = z;
-    if (ops->type & 2)
-      ops->type = (LevEditType)(ops->type ^ 1);
-  }
-}
-
-/**
- * lev_opcodes_invert:
- * @nb: The length of @bops.
- * @bops: An array of difflib block edit operation codes.
- *
- * Inverts the sense of @bops.  It is modified in place.
- *
- * In other words, @bops becomes a partial edit for the original source
- * and destination strings with their roles exchanged.
- **/
-void lev_opcodes_invert(size_t nb, LevOpCode *bops)
-{
-  for (size_t i = nb; i; i--, bops++) {
-    size_t z = bops->dbeg;
-    bops->dbeg = bops->sbeg;
-    bops->sbeg = z;
-    z = bops->dend;
-    bops->dend = bops->send;
-    bops->send = z;
-    if (bops->type & 2)
-      bops->type = (LevEditType)(bops->type ^ 1);
-  }
-}
-
-/**
- * lev_editops_subtract:
- * @n: The size of @ops.
- * @ops: An array of elementary edit operations.
- * @ns: The size of @sub.
- * @sub: A subsequence (ordered subset) of @ops
- * @nrem: Where to store the length of the remainder array.
- *
- * Subtracts a subsequence of elementary edit operations from a sequence.
- *
- * The remainder is a sequence that, applied to result of application of @sub,
- * gives the same final result as application of @ops to original string.
- *
- * Returns: A newly allocated array of normalized edit operations, its length
- *          is stored to @nrem.  It is always normalized, i.e, without any
- *          keep operations.  On failure, %NULL is returned.
- **/
-LevEditOp*
-lev_editops_subtract(size_t n,
-                     const LevEditOp* ops,
-                     size_t ns,
-                     const LevEditOp* sub,
-                     size_t *nrem)
-{
-    static const int shifts[] = { 0, 0, 1, -1 };
-    LevEditOp *rem;
-    size_t i, j;
-    int shift;
-
-    /* compute remainder size */
-    *nrem = (size_t)-1;
-
-    size_t nr = std::accumulate(ops, ops + n, 0, [](size_t a, const LevEditOp& op) {
-      return a + (op.type != LEV_EDIT_KEEP);
-    });
-
-    size_t nn = std::accumulate(sub, sub + ns, 0, [](size_t a, const LevEditOp& op) {
-      return a + (op.type != LEV_EDIT_KEEP);
-    });
-
-    if (nn > nr)
-        return NULL;
-    nr -= nn;
-
-    /* subtract */
-    /* we could simply return NULL when nr == 0, but then it would be possible
-     * to subtract *any* sequence of the right length to get an empty sequence
-     * -- clearly incorrectly; so we have to scan the list to check */
-    rem = nr ? (LevEditOp*)safe_malloc(nr, sizeof(LevEditOp)) : NULL;
-    j = nn = 0;
-    shift = 0;
-    for (i = 0; i < ns; i++) {
-        while ((ops[j].spos != sub[i].spos
-                || ops[j].dpos != sub[i].dpos
-                || ops[j].type != sub[i].type)
-               && j < n) {
-            if (ops[j].type != LEV_EDIT_KEEP) {
-                rem[nn] = ops[j];
-                rem[nn].spos = (size_t)((int)rem[nn].spos + shift);
-                nn++;
-            }
-            j++;
-        }
-        if (j == n) {
-            free(rem);
-            return NULL;
-        }
-
-        shift += shifts[sub[i].type];
-        j++;
-    }
-
-    while (j < n) {
-        if (ops[j].type != LEV_EDIT_KEEP) {
-            rem[nn] = ops[j];
-            rem[nn].spos = (size_t)((int)rem[nn].spos + shift);
-            nn++;
-        }
-        j++;
-    }
-    assert(nn == nr);
-
-    *nrem = nr;
-    return rem;
-}
-/* }}} */
diff --git a/src/Levenshtein/Levenshtein-c/_levenshtein.hpp b/src/Levenshtein/Levenshtein-c/_levenshtein.hpp
@@ -1,6 +1,4 @@
-/* @(#) $Id: Levenshtein.h,v 1.22 2005/01/13 20:02:56 yeti Exp $ */
-#ifndef LEVENSHTEIN_H
-#define LEVENSHTEIN_H
+#pragma once
 
 #include "Python.h"
 #include <cstdint>
@@ -118,40 +116,6 @@ enum LevEditType {
   LEV_EDIT_LAST  /* sometimes returned when an error occurs */
 };
 
-/* Edit operation (atomic).
- * This is the `native' atomic edit operation.  It differs from the difflib
- * ones because it represents a change of one character, not a block.  And
- * we usually don't care about LEV_EDIT_KEEP, though the functions can handle
- * them.  The positions are interpreted as at the left edge of a character.
- */
-typedef struct {
-  LevEditType type;  /* editing operation type */
-  size_t spos;  /* source block position */
-  size_t dpos;  /* destination position */
-} LevEditOp;
-
-/* Edit operation (difflib-compatible).
- * This is not `native', but conversion functions exist.  These fields exactly
- * correspond to the codeops() tuples fields (and this method is also the
- * source of the silly OpCode name).  Sequences must span over complete
- * strings, subsequences are simply edit sequences with more (or larger)
- * LEV_EDIT_KEEP blocks.
- */
-typedef struct {
-  LevEditType type;  /* editing operation type */
-  size_t sbeg, send;  /* source block begin, end */
-  size_t dbeg, dend;  /* destination block begin, end */
-} LevOpCode;
-
-static void *
-safe_malloc(size_t nmemb, size_t size) {
-  /* extra-conservative overflow check */
-  if (SIZE_MAX / size <= nmemb) {
-    return NULL;
-  }
-  return malloc(nmemb * size);
-}
-
 /* compute the sets of symbols each string contains, and the set of symbols
  * in any of them (symset).  meanwhile, count how many different symbols
  * there are (used below for symlist). */
@@ -784,13 +748,3 @@ static inline double lev_set_distance(const std::vector<RF_String>& strings1, co
 
   return sum;
 }
-
-bool lev_editops_valid(size_t len1, size_t len2, size_t n, const LevEditOp *ops);
-bool lev_opcodes_valid(size_t len1, size_t len2, size_t nb, const LevOpCode *bops);
-
-void lev_editops_invert(size_t n, LevEditOp *ops);
-void lev_opcodes_invert(size_t nb, LevOpCode *bops);
-
-LevEditOp* lev_editops_subtract(size_t n, const LevEditOp *ops, size_t ns, const LevEditOp *sub, size_t *nrem);
-
-#endif /* not LEVENSHTEIN_H */