Skip to content

Latest commit

 

History

History
493 lines (451 loc) · 19.7 KB

Document Comparison.md

File metadata and controls

493 lines (451 loc) · 19.7 KB

Document Comparison

This script compares a document with another document and shows all differences in a table list, where they can be accepted or rejected. This supports documents of any length and efficiently finds all the differences.
The results are shown in a table. You can see the deletions are shown with -DEL- and insertions with -INS-. The new document is marked as valid and the original is read-only. The user only has to confirm a result by pressing ENTER and rejecting by pressing SPACE and ENTER.
image
This simple example is only showing mistmatches. You can adjust to show more word context for each documen, but if you click or tab onto the Document Text, then the document image will scroll to show the document.
The script is fast and will find the optimal match - if the documents are very similar the match is even faster.
If you want to see both documents, you will have to merge the images of the original documents together, or launch it with an external program...

Configuration

  • Add the following script to the document class you want to compare or to any higher class.
  • Create a Table Model with 3 columns Document, OK and Original
    image
  • Add a Table Locator to the class called "TL_Diffs". The name is checked in the Sub Document_AfterLocate below, which customizes the table locator.
  • You will need a method to find the reference document that you want to compare with. The script below just uses the first Extraction Training Sample from the class.

Design your Validation Form

  • Make column 2 very narrow and set as a checkbox.
  • Make column 3 read only.

How it works

The algorithm finds all the unique words in each document, and then finds the largest match of unique words between the two documents. This breaks the document into smaller documents - this is done successively until unique word matches are found. The remaining document fragments are then aligned using backtracking of the Levenshtein Distance.

'#Language "WWB-COM"
Option Explicit

' Class script: diff
'==========================
'Sort Function Delegates https://github.com/KofaxRPA/KTScripts/blob/master/QuickSort.vb
Delegate Function ComparerDelegate(a As Variant, b As Variant) As Boolean ' Delegate definition for sorting comparers. These MUST come first in the script so that the compiler can find them
Public Function Comparer_WidthLarge( a As Variant, b As Variant) As Boolean
   'Used to sort by width of elements, largest first
   Return a.Width > b.Width
End Function
Public Function Comparer_Left( a As Variant, b As Variant) As Boolean
   'Used to sort by left pixel of elements, smallest first
   Return a.Left < b.Left
End Function
'=============================

Private Sub Document_AfterLocate(ByVal pXDoc As CASCADELib.CscXDocument, ByVal LocatorName As String)
   Dim refXDoc As New CscXDocument
   If LocatorName="TL_Diffs" Then   'Custom Table Locator
      Set refXDoc=XDoc_GetReferenceDocument(pXDoc) ' write your logic here to find the reference document
      Document_Compare(pXDoc, refXDoc,LocatorName) 'Put document differences into the tablelocator
   End If
End Sub

Private Sub Document_Compare(ByVal pXDoc As CASCADELib.CscXDocument, refXDoc As CscXDocument, TableLocName As String)
   'Compare this document with it's reference document and put results into a table
   Dim Row As CscXDocTableRow, WordsA As CscXDocWords, WordsB As CscXDocWords, W As Long, Table As CscXDocTable
   If pXDoc.ExtractionClass="" Then Exit Sub
   With pXDoc.Locators.ItemByName(TableLocName).Alternatives(0)
      .Confidence=1
      Set Table=.Table
   End With
   Table.Rows.Clear 'ignore what Table Locator may have found!
   Set WordsA=TextLines2Words(pXDoc.TextLines) 'pxdoc.words is not in natural order. pxdoc.Textlines.words is in correct order
   Set WordsB=TextLines2Words(refXDoc.TextLines)
   Words_Align(WordsA,WordsB)
   For W =0 To WordsA.Count-1
      If WordsA(W).Text<>WordsB(W).Text Then 'equal
         Set Row = Table.Rows.Append()
         Row.Cells(0).AddWordData(WordsA(W))
         Row.Cells(2).Text=WordsB(W).Text
      End If
   Next
End Sub

Private Function XDoc_GetReferenceDocument(pXDoc As CscXDocument) As CscXDocument
   'Find the first training sample for the exctraction class
   Dim refXDoc As New CscXDocument, refXDocDir As String, refXDocFileName As String
   refXDocDir=Left(Project.FileName,InStrRev(Project.FileName,"\")) & "ExtractionTraining\" & pXDoc.ExtractionClass
   ChDir refXDocDir
   refXDocFileName=Dir("*.xdc")
   If refXDocFileName="" Then Err.Raise(457,,"Cannot find original document for class " & pXDoc.ExtractionClass)
   refXDoc.Load(refXDocFileName)
   Return refXDoc
End Function

Public Sub Words_Align(ByRef WordsA As CscXDocWords,ByRef WordsB As CscXDocWords)
   Dim Matches As CscXDocFieldAlternatives, score As Double
   Set Matches=Matches_Find(WordsA, WordsB,score)
   If score<0.2 Then Exit Sub ' these texts are not matching in any way - don't even try
   Alternatives_Sort(Matches,AddressOf Comparer_WidthLarge) 'sort matching texts by size
   Matches_DeleteConflicts(Matches) 'delete outlier runs
   Alternatives_Sort(Matches,AddressOf Comparer_Left) ' re-sort matching texts or order
   Levenshtein_Calculate(Matches, WordsA, WordsB) ' aligns all words using the perfect matches
End Sub

'===FAST LEVENSHTEIN============================
'Fast Levenshtein matching of large documents

Public Sub Levenshtein_Calculate(Matches As CscXDocFieldAlternatives, ByRef WordsA As CscXDocWords, ByRef WordsB As CscXDocWords)
   Dim Match As CscXDocFieldAlternative, M As Long, Inserts As Long, Deletes As Long
   Dim AStart As Long, AEnd As Long, BStart As Long, BEnd As Long
   If Matches.Count=0 Then Exit Sub
   For M=0 To Matches.Count-1
      Set Match=Matches(M)
      AEnd= Match.Left + Inserts
      BEnd= Match.Top  + Deletes
      Levenshtein_Forward(WordsA,WordsB,AStart,AEnd,BStart,BEnd,Inserts,Deletes)
      AStart=Match.Left + Match.Width-1  + Inserts
      BStart=Match.Top  + Match.Height-1 + Deletes
   Next
   AEnd= WordsA.Count-1
   BEnd= WordsB.Count-1
   Levenshtein_Forward(WordsA,WordsB,AStart,AEnd,BStart,BEnd,Inserts,Deletes)
End Sub

Public Sub Levenshtein_Forward(ByRef WordsA As CscXDocWords, ByRef WordsB As CscXDocWords, AStart As Long, AEnd As Long, BStart As Long, BEnd As Long, ByRef Inserts As Long, ByRef Deletes As Long)
   Dim Matrix() As Long, A As Long, B As Long, WordA As CscXDocWord, WordB As CscXDocWord, del As Double, Ins As Double, subs As Double
   ReDim Matrix(AEnd-AStart+1,BEnd-BStart+1)
   For A=1 To UBound(Matrix,1)
      Matrix(A,0)=Len(WordsA(AStart+A-1).Text)+Matrix(A-1,0)
   Next
   For B=1 To UBound(Matrix,2)
      Matrix(0,B)=Len(WordsB(BStart+B-1).Text)+Matrix(0,B-1)
   Next
   For A  = AStart To AEnd
      Set WordA=WordsA(A)
      For B = BStart To BEnd
         Set WordB=WordsB(B)
         del = Matrix( A  -AStart,B +1-BStart)+Len(WordA.Text)' + delcost   '*B/wordsB.Count) ' cost of deletion
         Ins = Matrix( A+1-AStart,B   -BStart)+Len(WordB.Text)' cost of insertion
         subs= Matrix( A  -AStart,B   -BStart)+String_LevenshteinDistanceBasic(WordA.Text,WordB.Text)'1-String_FuzzyMatch(WordA.Text,WordB.Text) 'Perform Levenshtein on the words themselves at the character level
         Matrix(A+1-AStart,B+1-BStart)=Min(Ins,Min(del,subs))
      Next
   Next
   Levenshtein_Traceback(WordsA,WordsB,AStart,BStart, Matrix, Inserts, Deletes)
End Sub

Sub Matrix_Log(Matrix() As Long, ByRef WordsA As CscXDocWords, ByRef WordsB As CscXDocWords,AStart  As Long, BStart As Long)
   Dim A As Long, B As Long, AEnd As Long, BEnd As Long
   Exit Sub
   AEnd=AStart+UBound(Matrix,1)-1
   BEnd=BStart+UBound(Matrix,2)-1
    Open "c:\temp\out.txt" For Output As 1
   Print #1, "A\B" & vbTab &"-" & vbTab;
   For B= BStart To BEnd+1
      Print #1, WordsB(B).Text & vbTab;
   Next
   Print #1
   Print #1,"-" & vbTab;
   For A = AStart To AEnd+1
      For B = BStart To BEnd+1
         Print #1, Format(Matrix(A-AStart,B-BStart),"0.00")&vbTab;
      Next
      Print #1
      If A<AEnd+1 Then Print #1, WordsA(A).Text & vbTab;
   Next
   Print #1
   Print #1, Words_Text(WordsA, vbTab)
   Print #1, Words_Text(WordsB, vbTab)
   Close #1
   Shell("Notepad.exe c:\temp\out.txt",vbNormalFocus)
End Sub

Public Sub Levenshtein_Traceback(ByRef WordsA As CscXDocWords, ByRef WordsB As CscXDocWords, AStart As Long, BStart As Long, Matrix() As Long, ByRef Inserts As Long, ByRef Deletes As Long)
   Dim del As Double, Ins As Double, subs As Double, Word As CscXDocWord, WordA As CscXDocWord, WordB As CscXDocWord
   Dim A As Long, B As Long
   A=AStart+UBound(Matrix,1)-1
   B=BStart+UBound(Matrix,2)-1
   While A>AStart And B>BStart
      del = Matrix( A - 1-AStart, B    -BStart) ' cost of deletion
      Ins = Matrix( A    -AStart, B - 1-BStart) ' cost of insertion
      subs =Matrix( A - 1-AStart, B - 1-BStart)  'cost of substition or match
      Select Case Min(Min(subs,Ins),del)
         Case subs
            A=A-1:B=B-1
         Case Ins
            Set Word= New CscXDocWord
            Word.Text="-INS-"
            Words_Insert(WordsA,Word,A)
            B=B-1
            Inserts=Inserts+1
         Case del
            Set Word= New CscXDocWord
            Word.Text="-DEL-"
            Words_Insert(WordsB,Word,B)
            A=A-1
            Deletes=Deletes+1
      End Select
   Wend
   While B=BStart And A>AStart ' the rest are deletes at the beginning
      Set Word= New CscXDocWord
      Word.Text="-DEL-"
      Words_Insert(WordsB,Word,B+1)
      A=A-1
      Deletes=Deletes+1
   Wend
   While A=AStart And B>BStart' the rest are inserts at the beginning
      Set Word= New CscXDocWord
      Word.Text="-INS-"
      Words_Insert(WordsA,Word,A+1)
      B=B-1
      Inserts=Inserts+1
   Wend
   Matrix_Log(Matrix,WordsA,WordsB,AStart,BStart)
End Sub

'====Match Functions to speed up Levenshtein=================================
'finds large runs of text that have not changed at all, so Levenshtein can be skipped

Public Sub Matches_DeleteConflicts(Matches As CscXDocFieldAlternatives)
   Dim M As Long, N As Long, Match As CscXDocFieldAlternative, NextMatch As CscXDocFieldAlternative
   While M<Matches.Count-1
      Set Match=Matches(M)
      N=M+1
      While N<Matches.Count
         Set NextMatch=Matches(N)
         If (NextMatch.Top<Match.Top And NextMatch.Left+NextMatch.Width>Match.Left) Or _
            (NextMatch.Top+NextMatch.Height>Match.Top+Match.Height And NextMatch.Left<Match.Left) _
            Then
            Matches.Remove(N)
         Else
            N=N+1
         End If
      Wend
      M=M+1
   Wend
End Sub

Public Function Matches_Find(ByRef WordsA As CscXDocWords,ByRef WordsB As CscXDocWords, ByRef Score As Double) As CscXDocFieldAlternatives
   Dim uniqueA As Dictionary, UniqueB As Dictionary
   Dim Matches As CscXDocFieldAlternatives, Match As CscXDocFieldAlternative, Word As CscXDocWord, WordText As String, Words() As Variant, W As Long
   Set uniqueA=Words_FindUniqueWords(WordsA)
   Set UniqueB=Words_FindUniqueWords(WordsB)
   Dictionaries_MakeDoublyUnique(uniqueA, UniqueB) 'remove all unique words that are not in BOTH dictionaries
   With New CscXDocField
      Set Matches=.Alternatives
   End With
   Dim d As Dictionary
   Score=0
   Words=uniqueA.Keys
   For Each WordText In Words
      If Not IsEmpty(uniqueA(WordText)) Then ' There is a bug in MSCripts.Dictionary where removing a key randomly doesn't remove it, but rather leaves it empty
         Set Word=uniqueA(WordText)
         Set Match=Matches.Create
         Match.Words.Append(Word)
         Match.Left=CLng(Word.StringTag)
         Match.Top=CLng(UniqueB(Word.Text).StringTag)
         Match.Width=1
         Match.Height=1
         Score=Score+1
         While Match.Left>0 And Match.Top>0 AndAlso WordsA(Match.Left-1).Text=WordsB(Match.Top-1).Text ' add matching words to the left
            Match.Left=Match.Left-1
            Match.Top=Match.Top-1
            Match.Width=Match.Width+1
            Match.Height=Match.Height+1
            Score=Score+1
            Set Word=WordsA(Match.Left)
            If uniqueA.Exists(Word.Text) Then 'these unique words are in the same run, so merge them
               uniqueA.Remove(Word.Text)
               UniqueB.Remove(Word.Text)
            End If
         Wend
         While Match.Left+Match.Width<WordsA.Count AndAlso Match.Top+Match.Width<WordsB.Count AndAlso WordsA(Match.Left+Match.Width).Text=WordsB(Match.Top+Match.Width).Text 'add matching words to the right
            Set Word=WordsA(Match.Left+Match.Width)
            If uniqueA.Exists(Word.Text) Then 'these unique words are in the same run, so merge them
               uniqueA.Remove(Word.Text)

               UniqueB.Remove(Word.Text)
            End If
            Match.Width=Match.Width+1
            Match.Height=Match.Height+1
            Score=Score+1
      Wend
      End If
   Next
   Score=Score/Max(WordsA.Count,WordsB.Count)
   Return Matches
End Function

Sub Dictionaries_MakeDoublyUnique(DictA As Dictionary, DictB As Dictionary)
   Dim Word As String
   For Each Word In DictA.Keys
      If Not DictB.Exists(Word) Then DictA.Remove(Word)
   Next
   For Each Word In DictB.Keys
      If Not DictA.Exists(Word) Then DictB.Remove(Word)
   Next
End Sub

Function Words_Text(Words As CscXDocWords, Delim As String) As String
   Dim result As String, W As Long
   For W=0 To Words.Count-1
      result=result & Words(W).Text & Delim
   Next
   Return result
End Function

Public Sub Words_Insert(Words As CscXDocWords, Word As CscXDocWord, Pos As Long)
   'Workaround to Words.Insert, which doesn't work in KTM 5.5.2.10
   Dim Temp As New CscXDocWords,W As Long
   For W = Pos To Words.Count-1

      Temp.Append(Words(W))
   Next
   While Words.Count>Pos
      Words.Remove(Pos)
   Wend
   Words.Append(Word)
   For W = 0 To Temp.Count-1
      Words.Append(Temp(W))
   Next
   Set Temp=Nothing
End Sub

Public Function Words_FindUniqueWords(Words As CscXDocWords) As Dictionary
   'This makes a dictionary of all words and keeps only unique words

   Dim Dict As New Dictionary, Key As String
   Dim W As Long, Word As CscXDocWord
   For W =0 To Words.Count-1
      Set Word=Words(W)
      Word.StringTag=CStr(W) 'each word remembers it's position in the phrase
      If Dict.Exists(Word.Text) Then

         Set Dict.Item(Word.Text)=Nothing ' this word is not unique
      Else
         Dict.Add(Word.Text, Word)
      End If
   Next
   'remove duplicates
   For Each Key In Dict.Keys
      If Dict.Item(Key) Is Nothing Then Dict.Remove(Key)
   Next
   Return Dict
End Function

Public Function Max(A,B)  'typeless function, works with any object type that supports >. returns the same object type
   Return IIf(A>B,A,B)
End Function

Public Function Min(A,B)
   Return IIf(A<B,A,B)
End Function


Public Function String_LevenshteinDistanceBasic(A As String ,B As String) As Integer
   'http://en.wikipedia.org/wiki/Levenshtein_distance
   'Levenshtein distance between two strings, used for fuzzy matching
   'Returns the number of character differences between the two strings.
   'eg LevenshteinDistance("kitten","kitchen") = 2 = insertion of "c" + substitution of "t" for "h"

   Dim i As Long, j As Long, cost As Long, d() As Long
   Dim Ins As Long, del As Long, subs As Long  ' for counting insertions, deletions and substitutions
   If Len(A) = 0 Then Return Len(B)
   If Len(B) = 0 Then Return Len(A)
   ReDim d(Len(A), Len(B))
   For i = 0 To Len(A)
      d(i, 0) = i
   Next
   For j = 0 To Len(B)
      d(0, j) = j
   Next
   For i = 1 To Len(A)
     For j = 1 To Len(B)
         cost = IIf (Mid(A, i, 1) = Mid(B, j, 1),0,1)   ' cost of substitution
         del = ( d( i - 1, j ) + 1 ) ' cost of deletion
         Ins = ( d( i, j - 1 ) + 1 ) ' cost of insertion
         subs = ( d( i - 1, j - 1 ) + cost ) 'cost of substitution or match
         d(i,j)= Min(Ins, Min(del,subs))
      Next
   Next
   Return d(Len(A), Len(B))
End Function

Public Function String_FuzzyMatch(ByVal A As String, ByVal B As String, Optional removeSpaces As Boolean = False) As Double
   'returns 0.0 for no match, 1.0 for perfect match, in between for fuzzy match.
   If removeSpaces Then
      A=Replace(A," ","")
      B=Replace(B," ","")
   End If
   If InStr(a,",")>1 Then a=Replace(a,",","")
   If InStr(a,".")>1 Then B=Replace(B,",","")
   If Len(a)= 0 Or Len(B)=0 Then Return 0
   Return CDbl(1.0 - String_LevenshteinDistanceBasic(a, B)/ Max(Len(a),Len(B)))
End Function

Private Function TextLines2Words(TextLines As CscXDocTextLines) As CscXDocWords
   Dim Words As New CscXDocWords, TL As Long, W As Long
   For TL=0 To TextLines.Count-1
      For W=0 To TextLines(TL).Words.Count-1
         Words.Append(TextLines(TL).Words(W))
      Next
   Next
   Return Words
End Function

Function Words_Slice(Words As CscXDocWords,ByRef a As Long, ByRef B As Long)
   'return a slice of a word list
   Dim W As Long, Slice As New CscXDocWords
   For W=a To B-1
      Slice.Append(Words(W))
   Next
   Return Slice
End Function

Sub Words_Trim(Words As CscXDocWords,ByRef a As Long, ByRef B As Long)
   'remove a group of words from a word list
   Dim W As Long
   For W= B-1 To a Step -1
      Words.Remove(W)
   Next
End Sub

Sub Words_InsertWords(Words As CscXDocWords, Ins As CscXDocWords, a As Long)
   Dim W As Long
   For W=0 To Ins.Count-1
      Words_Insert(Words,Ins(W),a+W)
   Next
End Sub

'==========================
'Sort Function Delegates https://github.com/KofaxRPA/KTScripts/blob/master/QuickSort.vb

Sub Alt_Copy(a As CscXDocFieldAlternative, B As CscXDocFieldAlternative)
   Dim W As Long
   For W=0 To a.Words.Count-1
      B.Words.Append(a.Words(W))
   Next
   B.Left=a.Left
   B.Width=a.Width
   B.PageIndex=a.PageIndex
   B.Top=a.Top
   B.Height=a.Height
   B.StringTag=a.StringTag
   B.Confidence=a.Confidence
   B.Text=a.Text
End Sub

Public Sub Alternatives_Sort(alternatives As CscXDocFieldAlternatives, Comparer As ComparerDelegate)
   'Sort alternatives using a comparer
   'we have to copy the alternatives to an array, sort the array and then copy back to the alternatives
   Dim alts() As CscXDocFieldAlternative, a As Long
   ReDim alts(alternatives.Count-1)
   For a =0 To alternatives.Count-1
      Set alts(a)=alternatives(a)
   Next
   Array_Sort(alts, Comparer)
   While alternatives.Count>0
      alternatives.Remove(0)
   Wend
   For a=0 To UBound(alts)
      Alt_Copy(alts(a),alternatives.Create)
   Next
End Sub

Private Sub Array_Sort(ByRef a As Variant, Comparer As ComparerDelegate)
   Quicksort_Sort(a,0,UBound(a),Comparer)
End Sub

Sub Quicksort_Sort(ByRef a As Variant, ByVal Left As Integer, ByVal Right As Integer,Comparer As ComparerDelegate)
   Dim pivot As Integer
   If Right > Left  Then
      pivot = Quicksort_GetPivot(Left, Right)
      pivot = Quicksort_Partition(a, Left, Right, pivot, Comparer)
      Quicksort_Sort(a, Left, pivot, Comparer)
      Quicksort_Sort(a, pivot + 1, Right, Comparer)
   End If
End Sub

Function Quicksort_GetPivot(ByVal Left As Integer, ByVal Right As Integer)
   'Return a random integer between Left and Right
   Return (Rnd()*(Right-Left+1)*1000) Mod (Right-Left+1) + Left
End Function

Function Quicksort_Partition(ByRef a As Variant, ByVal l As Integer, ByVal r As Integer, ByRef pivot As Integer, Comparer As ComparerDelegate)
   Dim i,store As Integer
   Dim piv As Variant
   Set piv = a(pivot)
   Object_Swap(a(r), a(pivot))
   store = l
   For i = l To r - 1
      If Comparer.Invoke(a(i),piv) Then
          Object_Swap(a(store), a(i))
          store = store + 1
      End If
   Next
   Object_Swap(a(r), a(store))
   Return store
End Function

Sub Object_Swap(ByRef v1 As Variant, ByRef v2 As Variant)
   Dim tmp As Variant
   Set tmp = v1
   Set v1 = v2
   Set v2 = tmp
End Sub
'======================================================'