From c42552770d994241f0b21eb3e14ee4a1308e6480 Mon Sep 17 00:00:00 2001 From: Vitali Fedulov Date: Wed, 19 Jan 2022 09:08:56 +0100 Subject: [PATCH] switch to methods --- README.md | 12 ++-- about.go | 16 ----- hypercubes.go => cubes.go | 107 +++++++++++++++------------- hypercubes_test.go => cubes_test.go | 42 +++++------ hashes.go | 12 ++-- hashes_test.go | 30 ++++---- 6 files changed, 106 insertions(+), 113 deletions(-) delete mode 100644 about.go rename hypercubes.go => cubes.go (60%) rename hypercubes_test.go => cubes_test.go (70%) diff --git a/README.md b/README.md index 624d4fe..2d2196e 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ # Hashing float vectors in N-dimensions -This is an early beta version. +Package hyper allows fast approximate search of nearest neighbour vectors in n-dimensional space. -### Algorithm +**This is an early beta version**. Description below will be improved (TODO). See tests for examples. -https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html +Package functions discretize a vector and generate a set of hashes, as described in the following document: https://similar.pictures/algorithm-for-hashing-high-dimensional-float-vectors.html -### How to use - -about.go contains a short instruction. +To use the package follow the sequence of functions/methods: +1) CubeSet or CentralCube, depending which one is used for a database record and which one for a query. +2) HashSet and DecimalHash to get corresponding hash set and central hash from results of (2). If DecimalHash is not suitable because of very large number of buckets or dimensions, use FNV1aHash to get both the hash set and the central hash). diff --git a/about.go b/about.go deleted file mode 100644 index a849e3b..0000000 --- a/about.go +++ /dev/null @@ -1,16 +0,0 @@ -package hyper - -// Package hyper allows fast approximate search of nearest -// neighbour vectors in n-dimensional space. -// Package functions discretize a vector and generate a set -// of fuzzy hashes, as described in the following document: -// https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html - -// To use the package follow the sequence: -// 1) CubeSet or CentralCube, depending which one -// is used for a database record and which one for a query, -// 2) HashSet and Decimal to get corresponding hash set -// and central hash from results of (2). If Decimal hash -// is not suitable because of very large number of buckets -// or dimensions, use FNV1a to get both the hash set and -// the central hash). diff --git a/hypercubes.go b/cubes.go similarity index 60% rename from hypercubes.go rename to cubes.go index 9095194..c0f6c13 100644 --- a/hypercubes.go +++ b/cubes.go @@ -1,24 +1,18 @@ package hyper -// rescale is a helper function to offset and rescale all values -// to [0, numBuckets] range. -func rescale(vector []float64, numBuckets int, min, max float64) []float64 { - rescaled := make([]float64, len(vector)) - amp := max - min - for i := range vector { - // Offset to zero and rescale to [0, numBuckets] range. - rescaled[i] = (vector[i] - min) * float64(numBuckets) / amp - } - return rescaled -} - -// clone makes a totally independent copy of a 2D slice. -func clone(src [][]int) (dst [][]int) { - dst = make([][]int, len(src)) - for i := range src { - dst[i] = append([]int{}, src[i]...) - } - return dst +// Hypercube is represented by a slice of its coordinates. +type Cube []int +type Cubes []Cube + +// Parameters of space discretization. +type Params struct { + // Value limits per dimension. For example 0, 255 for pixel values. + Min, Max float64 + // Uncertainty interval expressed as a fraction of bucketWidth + // (for example 0.25 for eps = 1/4 of bucketWidth). + EpsPercent float64 + // Number of buckets per dimension. + NumBuckets int } // CubeSet returns a set of hypercubes, which represent @@ -29,49 +23,43 @@ func clone(src [][]int) (dst [][]int) { // min and max are minimum and maximum possible values of // the vector components. The assumption is that min and max // are the same for all dimensions. -// numBuckets is number of buckets per dimension. -// min and max are value limits per dimension. -// epsPercent is the uncertainty interval expressed as -// a fraction of bucketWidth (for example 0.25 for eps = 1/4 -// of bucketWidth). -func CubeSet(vector []float64, min, max, epsPercent float64, - numBuckets int) (set [][]int) { - - if epsPercent >= 0.5 { - panic(`Error: epsPercent must be less than 0.5.`) +func CubeSet(vector []float64, params Params) (set Cubes) { + + if params.EpsPercent >= 0.5 { + panic(`Error: EpsPercent must be less than 0.5.`) } var ( - bC int // Central bucket number. - bL, bR int // Left and right bucket number. - setL, setR [][]int // Set copies. - branching bool // Branching flag. + bC int // Central bucket number. + bL, bR int // Left and right bucket number. + setL, setR Cubes // Set clones (for Left and Right). + branching bool // Branching flag. ) // Rescaling vector to avoid potential mistakes with // divisions and offsets later on. - rescaled := rescale(vector, numBuckets, min, max) + rescaled := rescale(vector, params) // After the rescale value range of the vector are // [0, numBuckets], and not [min, max]. // min = 0.0 from now on. - max = float64(numBuckets) + max := float64(params.NumBuckets) for _, val := range rescaled { branching = false - bL = int(val - epsPercent) - bR = int(val + epsPercent) + bL = int(val - params.EpsPercent) + bR = int(val + params.EpsPercent) // Get extreme values out of the way. - if val-epsPercent <= 0.0 { // This means that val >= 0. + if val-params.EpsPercent <= 0.0 { // This means that val >= 0. bC = bR goto branchingCheck // No branching. } // Get extreme values out of the way. - if val+epsPercent >= max { // This means that val =< max. + if val+params.EpsPercent >= max { // This means that val =< max. // Above max = numBuckets. bC = bL goto branchingCheck // No branching. @@ -135,33 +123,54 @@ func CubeSet(vector []float64, min, max, epsPercent float64, // CentralCube returns the hypercube containing the vector end. // Arguments are the same as for the CubeSet function. -func CentralCube(vector []float64, min, max, epsPercent float64, - numBuckets int) (central []int) { +func CentralCube(vector []float64, params Params) (central Cube) { - if epsPercent >= 0.5 { - panic(`Error: epsPercent must be less than 0.5.`) + if params.EpsPercent >= 0.5 { + panic(`Error: EpsPercent must be less than 0.5.`) } var bC int // Central bucket numbers. // Rescaling vector to avoid potential mistakes with // divisions and offsets later on. - rescaled := rescale(vector, numBuckets, min, max) + rescaled := rescale(vector, params) // After the rescale value range of the vector are // [0, numBuckets], and not [min, max]. // min = 0.0 from now on. - max = float64(numBuckets) + max := float64(params.NumBuckets) for _, val := range rescaled { bC = int(val) - if val-epsPercent <= 0.0 { // This means that val >= 0. - bC = int(val + epsPercent) + if val-params.EpsPercent <= 0.0 { // This means that val >= 0. + bC = int(val + params.EpsPercent) } - if val+epsPercent >= max { // Meaning val =< max. - bC = int(val - epsPercent) + if val+params.EpsPercent >= max { // Meaning val =< max. + bC = int(val - params.EpsPercent) } central = append(central, bC) } return central } + +// rescale is a helper function to offset and rescale all values +// to [0, numBuckets] range. +func rescale(vector []float64, params Params) []float64 { + rescaled := make([]float64, len(vector)) + amp := params.Max - params.Min + for i := range vector { + // Offset to zero and rescale to [0, numBuckets] range. + rescaled[i] = + (vector[i] - params.Min) * float64(params.NumBuckets) / amp + } + return rescaled +} + +// clone makes an unlinked copy of a 2D slice. +func clone(src Cubes) (dst Cubes) { + dst = make(Cubes, len(src)) + for i := range src { + dst[i] = append(Cube{}, src[i]...) + } + return dst +} diff --git a/hypercubes_test.go b/cubes_test.go similarity index 70% rename from hypercubes_test.go rename to cubes_test.go index 47b14b3..a097242 100644 --- a/hypercubes_test.go +++ b/cubes_test.go @@ -5,7 +5,7 @@ import ( "testing" ) -func centralIsNotInTheSet(set [][]int, central []int) bool { +func centralIsNotInTheSet(set Cubes, central Cube) bool { for _, cube := range set { counter := 0 for i, c := range central { @@ -21,9 +21,9 @@ func centralIsNotInTheSet(set [][]int, central []int) bool { } func TestRescale(t *testing.T) { // Testing panic. - numBuckets, min, max, _ := 10, 0.0, 255.0, 0.25 vector := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 255.0} - rescaled := rescale(vector, numBuckets, min, max) + params := Params{0.0, 255.0, 0.25, 10} + rescaled := rescale(vector, params) got := rescaled want := []float64{ 1, 0.0003921568627450981, 8.24705882352941, @@ -38,20 +38,20 @@ func TestCubeSet1(t *testing.T) { // Testing panic. defer func() { recover() }() // Intentionally forbiden value for epsPercent. values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9} - min, max, epsPercent, numBuckets := 0.0, 255.0, 0.51, 10 - _ = CubeSet(values, min, max, epsPercent, numBuckets) + params := Params{0.0, 255.0, 0.51, 10} + _ = CubeSet(values, params) // Never reaches here if Params panics. t.Errorf("Params did not panic on epsPercent > 0.5") } func TestCubeSet2(t *testing.T) { - numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.25 + params := Params{0.0, 255.0, 0.25, 10} values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9} - gotCubes := CubeSet(values, min, max, epsPercent, numBuckets) - gotCentral := CentralCube(values, min, max, epsPercent, numBuckets) - wantCubes := [][]int{{0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9}, + gotCubes := CubeSet(values, params) + gotCentral := CentralCube(values, params) + wantCubes := Cubes{{0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9}, {1, 0, 8, 3, 0, 0, 9}} - wantCentral := []int{1, 0, 8, 3, 0, 0, 9} + wantCentral := Cube{1, 0, 8, 3, 0, 0, 9} if !reflect.DeepEqual(gotCubes, wantCubes) { t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes) } @@ -65,12 +65,12 @@ func TestCubeSet2(t *testing.T) { // Testing bucket borders. func TestCubeSet3(t *testing.T) { - numBuckets, min, max, epsPercent := 4, 0.0, 4.0, 0.25 + params := Params{0.0, 4.0, 0.25, 4} values := []float64{0.01, 2 * 0.999, 2 * 1.001} - gotCubes := CubeSet(values, min, max, epsPercent, numBuckets) - gotCentral := CentralCube(values, min, max, epsPercent, numBuckets) - wantCubes := [][]int{{0, 1, 1}, {0, 2, 1}, {0, 1, 2}, {0, 2, 2}} - wantCentral := []int{0, 1, 2} + gotCubes := CubeSet(values, params) + gotCentral := CentralCube(values, params) + wantCubes := Cubes{{0, 1, 1}, {0, 2, 1}, {0, 1, 2}, {0, 2, 2}} + wantCentral := Cube{0, 1, 2} if !reflect.DeepEqual(gotCubes, wantCubes) { t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes) } @@ -85,9 +85,9 @@ func TestCubeSet3(t *testing.T) { // Testing extreme buckets. func TestCubeSet4(t *testing.T) { values := []float64{255.0, 0.0, 255.0, 0.0, 255.0, 0.0, 255.0} - numBuckets, min, max, epsPercent := 4, 0.0, 255.0, 0.25 - gotCubes := CubeSet(values, min, max, epsPercent, numBuckets) - wantCubes := [][]int{{3, 0, 3, 0, 3, 0, 3}} + params := Params{0.0, 255.0, 0.25, 4} + gotCubes := CubeSet(values, params) + wantCubes := Cubes{{3, 0, 3, 0, 3, 0, 3}} if !reflect.DeepEqual(gotCubes, wantCubes) { t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes) } @@ -97,9 +97,9 @@ var vector = []float64{ 0, 183, 148, 21, 47, 16, 69, 45, 151, 64, 181} func TestCubeSet5(t *testing.T) { - numBuckets, min, max, epsPercent := 4, 0.0, 255.0, 0.25 - gotCubes := CubeSet(vector, min, max, epsPercent, numBuckets) - wantCubes := [][]int{ + params := Params{0.0, 255.0, 0.25, 4} + gotCubes := CubeSet(vector, params) + wantCubes := Cubes{ {0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2}, {0, 3, 2, 0, 0, 0, 0, 0, 2, 0, 2}, {0, 2, 2, 0, 0, 0, 1, 0, 2, 0, 2}, {0, 3, 2, 0, 0, 0, 1, 0, 2, 0, 2}, {0, 2, 2, 0, 0, 0, 0, 0, 2, 1, 2}, {0, 3, 2, 0, 0, 0, 0, 0, 2, 1, 2}, diff --git a/hashes.go b/hashes.go index 16b6988..783c88b 100644 --- a/hashes.go +++ b/hashes.go @@ -6,22 +6,22 @@ import ( "hash/fnv" ) -// Decimal hashes hypercubes without collisions. IMPORTANT: +// DecimalHash hashes hypercubes without collisions. IMPORTANT: // To work correctly, the number of buckets must be // less than 11 and the number of dimensions less than 20. // Else at certain unexpected moment you might get a hash // value overflow. -func Decimal(cube []int) (h uint64) { +func (cube Cube) DecimalHash() (h uint64) { for _, v := range cube { h = h*10 + uint64(v) } return h } -// FNV1a hashes hypercubes with rare collisions, +// FNV1aHash hashes hypercubes with rare collisions, // and should be used when Decimal cannot be used // because of very large number of buckets or dimensions. -func FNV1a(cube []int) uint64 { +func (cube Cube) FNV1aHash() uint64 { var b bytes.Buffer gob.NewEncoder(&b).Encode(cube) hash := fnv.New64a() @@ -30,11 +30,11 @@ func FNV1a(cube []int) uint64 { } // HashFunc can be any function (also user-defined). -type HashFunc func(hypercube []int) uint64 +type HashFunc func(cube Cube) uint64 // Hash64Set returns a set of hashes for a hypercube set // and a concrete hash function. -func HashSet(cubeSet [][]int, hashFunc HashFunc) ( +func (cubeSet Cubes) HashSet(hashFunc HashFunc) ( hs []uint64) { for i := 0; i < len(cubeSet); i++ { hs = append(hs, hashFunc(cubeSet[i])) diff --git a/hashes_test.go b/hashes_test.go index 655c7cb..a7ba18f 100644 --- a/hashes_test.go +++ b/hashes_test.go @@ -5,37 +5,37 @@ import ( "testing" ) -func TestDecimal(t *testing.T) { - hypercube := []int{3, 2, 0, 1, 1, 4, 1, 0} - hash := Decimal(hypercube) +func TestDecimalHash(t *testing.T) { + cube := Cube{3, 2, 0, 1, 1, 4, 1, 0} + hash := cube.DecimalHash() want := uint64(32011410) if hash != want { t.Errorf(`Got %v, want %v.`, hash, want) } } -func TestFNV1a(t *testing.T) { - buckets := []int{5, 59, 255, 9, 7, 12, 22, 31} - hash := FNV1a(buckets) - want := uint64(13992349377752315208) +func TestFNV1aHash(t *testing.T) { + cube := Cube{5, 59, 255, 9, 7, 12, 22, 31} + hash := cube.FNV1aHash() + want := uint64(1659788114117494335) if hash != want { t.Errorf(`Got %v, want %v.`, hash, want) } } func TestHashSet(t *testing.T) { - tree := [][]int{ + cubes := Cubes{ {0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9}, {0, 0, 8, 3, 0, 0, 9}, {1, 0, 8, 3, 0, 0, 9}} - hs := HashSet(tree, FNV1a) + hashSet := cubes.HashSet((Cube).FNV1aHash) want := []uint64{ - 14647827280143437043, - 17530493565529410009, - 7065940388079601005, - 13953051952027146823} - if !reflect.DeepEqual(hs, want) { - t.Errorf(`Got %v, want %v.`, hs, want) + 6172277127052188606, + 3265650857171344968, + 13730239218993256724, + 6843127655045710906} + if !reflect.DeepEqual(hashSet, want) { + t.Errorf(`Got %v, want %v.`, hashSet, want) } }