Skip to content

Commit

Permalink
implement all editops APIs using RapidFuzz
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann committed Jul 22, 2022
1 parent f812040 commit d252234
Show file tree
Hide file tree
Showing 9 changed files with 85 additions and 576 deletions.
7 changes: 3 additions & 4 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
## Changelog

### v0.19.4
### v0.20.0
#### Changed
- use `matching_blocks`/`apply` implementation from RapidFuzz
- use `matching_blocks`/`apply`/`remove_subsequence`/`inverse` implementation from RapidFuzz

#### Fixed
- stop adding data to wheels
- add editops validation to subtract_edit to fix segmentation fault on some
invalid editop sequences
- fix segmentation fault on some invalid editop sequences in subtract_edit
- detect duplicated entries in editops validation

### v0.19.3
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ This is a fork of [ztane/python-Levenshtein](https://github.com/ztane/python-Lev
project is no longer actively maintained.

## Requirements
* Python 3.5 or later
* Python 3.6 or later

## Installation
```bash
Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
author = 'Max Bachmann'

# The full version, including alpha/beta/rc tags
release = '0.19.3'
release = '0.20.0'

# -- General configuration ---------------------------------------------------

Expand Down
2 changes: 1 addition & 1 deletion extern/rapidfuzz-cpp
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@

setup(
name="Levenshtein",
version="0.19.3",
version="0.20.0",
url="https://github.com/maxbachmann/Levenshtein",
author="Max Bachmann",
install_requires=["rapidfuzz >= 2.0.1, < 3.0.0"],
install_requires=["rapidfuzz >= 2.3.0, < 3.0.0"],
author_email="[email protected]",
description="Python extension for computing string edit distances and similarities.",
long_description=readme,
Expand Down
238 changes: 0 additions & 238 deletions src/Levenshtein/Levenshtein-c/_levenshtein.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,241 +405,3 @@ munkers_blackman(size_t n1, size_t n2, double *dists)
}

/* }}} */

/****************************************************************************
*
* Editops and other difflib-like stuff.
*
****************************************************************************/
/* {{{ */

/**
* lev_editops_valid:
* @len1: The length of an eventual @ops source string.
* @len2: The length of an eventual @ops destination string.
* @n: The length of @ops.
* @ops: An array of elementary edit operations.
*
* Checks whether @ops is consistent and applicable as a partial edit from a
* string of length @len1 to a string of length @len2.
*
* Returns: true if valid, false otherwise
**/
bool lev_editops_valid(size_t len1, size_t len2, size_t n, const LevEditOp *ops)
{
if (!n)
return true;

/* check bounds */
const LevEditOp* o = ops;
for (size_t i = n; i; i--, o++) {
if (o->type >= LEV_EDIT_LAST)
return false;
if (o->spos > len1 || o->dpos > len2)
return false;
if (o->spos == len1 && o->type != LEV_EDIT_INSERT)
return false;
if (o->dpos == len2 && o->type != LEV_EDIT_DELETE)
return false;
}

/* check ordering */
o = ops + 1;
for (size_t i = n - 1; i; i--, o++, ops++) {
if (o->spos > ops->spos || o->dpos > ops->dpos)
continue;

return false;
}

return true;
}

/**
* lev_opcodes_valid:
* @len1: The length of an eventual @ops source string.
* @len2: The length of an eventual @ops destination string.
* @nb: The length of @bops.
* @bops: An array of difflib block edit operation codes.
*
* Checks whether @bops is consistent and applicable as an edit from a
* string of length @len1 to a string of length @len2.
*
* Returns: true if valid, false otherwise
**/
bool lev_opcodes_valid(size_t len1, size_t len2, size_t nb, const LevOpCode *bops)
{
if (!nb)
return (len1 == 0 && len2 == 0);

/* check completenes */
if (bops->sbeg || bops->dbeg
|| bops[nb - 1].send != len1 || bops[nb - 1].dend != len2)
return false;

/* check bounds and block consistency */
const LevOpCode* b = bops;
for (size_t i = nb; i; i--, b++) {
if (b->send > len1 || b->dend > len2)
return false;
switch (b->type) {
case LEV_EDIT_KEEP:
case LEV_EDIT_REPLACE:
if (b->dend - b->dbeg != b->send - b->sbeg || b->dend == b->dbeg)
return false;
break;
case LEV_EDIT_INSERT:
if (b->dend - b->dbeg == 0 || b->send - b->sbeg != 0)
return false;
break;
case LEV_EDIT_DELETE:
if (b->send - b->sbeg == 0 || b->dend - b->dbeg != 0)
return false;
break;

default:
return false;
}
}

/* check ordering */
b = bops + 1;
for (size_t i = nb - 1; i; i--, b++, bops++) {
if (b->sbeg != bops->send || b->dbeg != bops->dend)
return false;
}

return true;
}

/**
* lev_editops_invert:
* @n: The length of @ops.
* @ops: An array of elementary edit operations.
*
* Inverts the sense of @ops. It is modified in place.
*
* In other words, @ops becomes a valid partial edit for the original source
* and destination strings with their roles exchanged.
**/
void lev_editops_invert(size_t n, LevEditOp *ops)
{
for (size_t i = n; i; i--, ops++) {
size_t z = ops->dpos;
ops->dpos = ops->spos;
ops->spos = z;
if (ops->type & 2)
ops->type = (LevEditType)(ops->type ^ 1);
}
}

/**
* lev_opcodes_invert:
* @nb: The length of @ops.
* @bops: An array of difflib block edit operation codes.
*
* Inverts the sense of @ops. It is modified in place.
*
* In other words, @ops becomes a partial edit for the original source
* and destination strings with their roles exchanged.
**/
void lev_opcodes_invert(size_t nb, LevOpCode *bops)
{
for (size_t i = nb; i; i--, bops++) {
size_t z = bops->dbeg;
bops->dbeg = bops->sbeg;
bops->sbeg = z;
z = bops->dend;
bops->dend = bops->send;
bops->send = z;
if (bops->type & 2)
bops->type = (LevEditType)(bops->type ^ 1);
}
}

/**
 * lev_editops_subtract:
 * @n: The size of @ops.
 * @ops: An array of elementary edit operations.
 * @ns: The size of @sub.
 * @sub: A subsequence (ordered subset) of @ops
 * @nrem: Where to store the length of the remainder array.
 *
 * Subtracts a subsequence of elementary edit operations from a sequence.
 *
 * The remainder is a sequence that, applied to result of application of @sub,
 * gives the same final result as application of @ops to original string.
 *
 * Returns: A newly allocated array of normalized edit operations, its length
 *          is stored to @nrem. It is always normalized, i.e, without any
 *          keep operations. The caller releases it with free().
 *          On failure (@sub is not a subsequence of @ops, @sub contains an
 *          unknown operation type, or memory could not be allocated) %NULL
 *          is returned and @nrem is set to (size_t)-1.  An empty remainder
 *          is also returned as %NULL, but with @nrem set to 0.
 **/
LevEditOp*
lev_editops_subtract(size_t n,
                     const LevEditOp* ops,
                     size_t ns,
                     const LevEditOp* sub,
                     size_t *nrem)
{
    /* change of the source position of all later operations caused by
     * applying one operation, indexed by the operation type */
    static const int shifts[] = { 0, 0, 1, -1 };
    const size_t n_shifts = sizeof(shifts) / sizeof(shifts[0]);

    /* report failure unless we make it to the end */
    *nrem = (size_t)-1;

    /* compute remainder size; the accumulator is seeded with a size_t so the
     * sum is not carried in (and truncated to) an int */
    auto count_non_keep = [](size_t a, const LevEditOp& op) {
        return a + (op.type != LEV_EDIT_KEEP);
    };
    size_t nr = std::accumulate(ops, ops + n, (size_t)0, count_non_keep);
    size_t nn = std::accumulate(sub, sub + ns, (size_t)0, count_non_keep);

    if (nn > nr)
        return NULL;
    nr -= nn;

    /* subtract */
    /* we could simply return NULL when nr == 0, but then it would be possible
     * to subtract *any* sequence of the right length to get an empty sequence
     * -- clearly incorrectly; so we have to scan the list to check */
    LevEditOp *rem = NULL;
    if (nr) {
        rem = (LevEditOp*)safe_malloc(nr, sizeof(LevEditOp));
        if (!rem)
            return NULL;
    }

    size_t j = 0;
    int shift = 0;
    nn = 0;

    /* Copies ops[j] to the remainder unless it is a keep operation.  Fails
     * when the remainder is already full: that can only happen when @sub is
     * not a subsequence of @ops, and it also covers rem == NULL (nr == 0). */
    auto emit = [&](const LevEditOp& op) -> bool {
        if (op.type == LEV_EDIT_KEEP)
            return true;
        if (nn == nr)
            return false;
        rem[nn] = op;
        /* unsigned wrap-around makes this a signed addition of shift
         * without squeezing the position through an int */
        rem[nn].spos = op.spos + (size_t)shift;
        nn++;
        return true;
    };

    for (size_t i = 0; i < ns; i++) {
        /* the type is used as an index into shifts[] below */
        if ((size_t)sub[i].type >= n_shifts) {
            free(rem);
            return NULL;
        }

        /* everything in ops before the next match belongs to the remainder;
         * j < n is tested first so ops[n] is never read */
        while (j < n
               && (ops[j].spos != sub[i].spos
                   || ops[j].dpos != sub[i].dpos
                   || ops[j].type != sub[i].type)) {
            if (!emit(ops[j])) {
                free(rem);
                return NULL;
            }
            j++;
        }
        /* sub[i] does not occur in the rest of ops */
        if (j == n) {
            free(rem);
            return NULL;
        }

        shift += shifts[sub[i].type];
        j++;
    }

    /* whatever follows the last match */
    for (; j < n; j++) {
        if (!emit(ops[j])) {
            free(rem);
            return NULL;
        }
    }
    assert(nn == nr);

    *nrem = nr;
    return rem;
}
/* }}} */
48 changes: 1 addition & 47 deletions src/Levenshtein/Levenshtein-c/_levenshtein.hpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
/* @(#) $Id: Levenshtein.h,v 1.22 2005/01/13 20:02:56 yeti Exp $ */
#ifndef LEVENSHTEIN_H
#define LEVENSHTEIN_H
#pragma once

#include "Python.h"
#include <cstdint>
Expand Down Expand Up @@ -118,40 +116,6 @@ enum LevEditType {
LEV_EDIT_LAST /* sometimes returned when an error occurs */
};

/* Edit operation (atomic).
 * This is the `native' atomic edit operation. It differs from the difflib
 * one because it represents a change of one character, not a block. And
 * we usually don't care about LEV_EDIT_KEEP, though the functions can handle
 * them. The positions are interpreted as at the left edge of a character.
 */
typedef struct {
LevEditType type; /* editing operation type */
size_t spos; /* position in the source string */
size_t dpos; /* position in the destination string */
} LevEditOp;

/* Edit operation (difflib-compatible).
 * This is not `native', but conversion functions exist. These fields exactly
 * correspond to the codeops() tuples fields (and this method is also the
 * source of the silly OpCode name). Sequences must span over complete
 * strings, subsequences are simply edit sequences with more (or larger)
 * LEV_EDIT_KEEP blocks.
 * NOTE(review): `codeops()' presumably means difflib's
 * SequenceMatcher.get_opcodes() -- confirm.
 */
typedef struct {
LevEditType type; /* editing operation type */
size_t sbeg, send; /* source block begin, end */
size_t dbeg, dend; /* destination block begin, end */
} LevOpCode;

/**
 * safe_malloc:
 * @nmemb: Number of elements to allocate.
 * @size: Size of one element in bytes.
 *
 * malloc() wrapper for arrays that refuses requests whose total size
 * @nmemb * @size could overflow a size_t.
 *
 * Returns: The result of malloc(@nmemb * @size), or %NULL when the request
 *          was refused or malloc() itself failed.
 **/
static void *
safe_malloc(size_t nmemb, size_t size) {
    /* extra-conservative overflow check; a zero element size can never
     * overflow and must not reach the division */
    if (size != 0 && SIZE_MAX / size <= nmemb) {
        return NULL;
    }
    return malloc(nmemb * size);
}

/* compute the sets of symbols each string contains, and the set of symbols
* in any of them (symset). meanwhile, count how many different symbols
* there are (used below for symlist). */
Expand Down Expand Up @@ -784,13 +748,3 @@ static inline double lev_set_distance(const std::vector<RF_String>& strings1, co

return sum;
}

bool lev_editops_valid(size_t len1, size_t len2, size_t n, const LevEditOp *ops);
bool lev_opcodes_valid(size_t len1, size_t len2, size_t nb, const LevOpCode *bops);

void lev_editops_invert(size_t n, LevEditOp *ops);
void lev_opcodes_invert(size_t nb, LevOpCode *bops);

LevEditOp* lev_editops_subtract(size_t n, const LevEditOp *ops, size_t ns, const LevEditOp *sub, size_t *nrem);

#endif /* not LEVENSHTEIN_H */
Loading

0 comments on commit d252234

Please sign in to comment.