Skip to content

Commit

Permalink
switch to methods
Browse files Browse the repository at this point in the history
  • Loading branch information
vitali-fedulov committed Jan 19, 2022
1 parent 0d4c3c1 commit c425527
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 113 deletions.
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# Hashing float vectors in N-dimensions

This is an early beta version.
Package hyper allows fast approximate search of nearest neighbour vectors in n-dimensional space.

### Algorithm
**This is an early beta version**. Description below will be improved (TODO). See tests for examples.

https://vitali-fedulov.github.io/algorithm-for-hashing-high-dimensional-float-vectors.html
Package functions discretize a vector and generate a set of hashes, as described in the following document: https://similar.pictures/algorithm-for-hashing-high-dimensional-float-vectors.html

### How to use

about.go contains a short instruction.
To use the package follow the sequence of functions/methods:
1) CubeSet or CentralCube, depending on which one is used for a database record and which one for a query.
2) HashSet and DecimalHash to get the corresponding hash set and central hash from the results of (1). If DecimalHash is not suitable because of a very large number of buckets or dimensions, use FNV1aHash to get both the hash set and the central hash.
16 changes: 0 additions & 16 deletions about.go

This file was deleted.

107 changes: 58 additions & 49 deletions hypercubes.go → cubes.go
Original file line number Diff line number Diff line change
@@ -1,24 +1,18 @@
package hyper

// rescale is a helper function to offset and rescale all values
// to [0, numBuckets] range.
func rescale(vector []float64, numBuckets int, min, max float64) []float64 {
rescaled := make([]float64, len(vector))
amp := max - min
for i := range vector {
// Offset to zero and rescale to [0, numBuckets] range.
rescaled[i] = (vector[i] - min) * float64(numBuckets) / amp
}
return rescaled
}

// clone makes a totally independent copy of a 2D slice.
func clone(src [][]int) (dst [][]int) {
dst = make([][]int, len(src))
for i := range src {
dst[i] = append([]int{}, src[i]...)
}
return dst
// Types describing hypercubes of the discretized space.
type (
	// Cube is a single hypercube, represented by a slice of its
	// coordinates (one bucket number per dimension).
	Cube []int

	// Cubes is a collection of hypercubes.
	Cubes []Cube
)

// Params holds the parameters of space discretization.
//
// The field order (Min, Max, EpsPercent, NumBuckets) must not change:
// callers construct Params with positional literals.
type Params struct {
	// Min is the lower value limit per dimension
	// (for example 0 for pixel values).
	Min float64
	// Max is the upper value limit per dimension
	// (for example 255 for pixel values).
	Max float64
	// EpsPercent is the uncertainty interval expressed as a fraction
	// of bucketWidth (for example 0.25 for eps = 1/4 of bucketWidth).
	EpsPercent float64
	// NumBuckets is the number of buckets per dimension.
	NumBuckets int
}

// CubeSet returns a set of hypercubes, which represent
Expand All @@ -29,49 +23,43 @@ func clone(src [][]int) (dst [][]int) {
// min and max are minimum and maximum possible values of
// the vector components. The assumption is that min and max
// are the same for all dimensions.
// numBuckets is number of buckets per dimension.
// min and max are value limits per dimension.
// epsPercent is the uncertainty interval expressed as
// a fraction of bucketWidth (for example 0.25 for eps = 1/4
// of bucketWidth).
func CubeSet(vector []float64, min, max, epsPercent float64,
numBuckets int) (set [][]int) {

if epsPercent >= 0.5 {
panic(`Error: epsPercent must be less than 0.5.`)
func CubeSet(vector []float64, params Params) (set Cubes) {

if params.EpsPercent >= 0.5 {
panic(`Error: EpsPercent must be less than 0.5.`)
}

var (
bC int // Central bucket number.
bL, bR int // Left and right bucket number.
setL, setR [][]int // Set copies.
branching bool // Branching flag.
bC int // Central bucket number.
bL, bR int // Left and right bucket number.
setL, setR Cubes // Set clones (for Left and Right).
branching bool // Branching flag.
)

// Rescaling vector to avoid potential mistakes with
// divisions and offsets later on.
rescaled := rescale(vector, numBuckets, min, max)
rescaled := rescale(vector, params)
// After the rescale value range of the vector are
// [0, numBuckets], and not [min, max].

// min = 0.0 from now on.
max = float64(numBuckets)
max := float64(params.NumBuckets)

for _, val := range rescaled {

branching = false

bL = int(val - epsPercent)
bR = int(val + epsPercent)
bL = int(val - params.EpsPercent)
bR = int(val + params.EpsPercent)

// Get extreme values out of the way.
if val-epsPercent <= 0.0 { // This means that val >= 0.
if val-params.EpsPercent <= 0.0 { // This means that val >= 0.
bC = bR
goto branchingCheck // No branching.
}

// Get extreme values out of the way.
if val+epsPercent >= max { // This means that val =< max.
if val+params.EpsPercent >= max { // This means that val =< max.
// Above max = numBuckets.
bC = bL
goto branchingCheck // No branching.
Expand Down Expand Up @@ -135,33 +123,54 @@ func CubeSet(vector []float64, min, max, epsPercent float64,

// CentralCube returns the hypercube containing the vector end:
// one bucket number per component of vector, in the same order.
// Arguments are the same as for the CubeSet function.
// It panics if EpsPercent is 0.5 or more.
func CentralCube(vector []float64, min, max, epsPercent float64,
numBuckets int) (central []int) {
func CentralCube(vector []float64, params Params) (central Cube) {

if epsPercent >= 0.5 {
panic(`Error: epsPercent must be less than 0.5.`)
if params.EpsPercent >= 0.5 {
panic(`Error: EpsPercent must be less than 0.5.`)
}

var bC int // Central bucket number of the current component.

// Rescaling vector to avoid potential mistakes with
// divisions and offsets later on.
rescaled := rescale(vector, numBuckets, min, max)
rescaled := rescale(vector, params)
// After rescaling, the lower value limit maps to 0 and the
// upper value limit maps to NumBuckets (instead of Min and Max).

// The lower limit is 0.0 from now on; max becomes the rescaled
// upper limit.
max = float64(numBuckets)
max := float64(params.NumBuckets)

for _, val := range rescaled {
// Default: the bucket is the integer part of the rescaled value.
bC = int(val)
if val-epsPercent <= 0.0 { // This means that val >= 0.
bC = int(val + epsPercent)
// Near the lower limit the bucket of val + EpsPercent is used.
if val-params.EpsPercent <= 0.0 { // val is within EpsPercent of the lower limit 0.
bC = int(val + params.EpsPercent)
}
// Near the upper limit the bucket of val - EpsPercent is used, so that
// (for EpsPercent > 0) a value equal to max falls into the last bucket
// rather than one past it.
if val+epsPercent >= max { // Meaning val =< max.
bC = int(val - epsPercent)
if val+params.EpsPercent >= max { // val is within EpsPercent of the upper limit max.
bC = int(val - params.EpsPercent)
}
central = append(central, bC)
}
return central
}

// rescale offsets every component of vector by params.Min and scales it
// by params.NumBuckets / (params.Max - params.Min), so that values lying
// in [Min, Max] end up in [0, NumBuckets]. The result is a new slice of
// the same length; vector itself is not modified.
//
// NOTE(review): if params.Max == params.Min the span is zero and the
// division produces NaN or Inf. Callers presumably guarantee
// Max > Min; confirm.
func rescale(vector []float64, params Params) []float64 {
	var (
		span    = params.Max - params.Min
		buckets = float64(params.NumBuckets)
		out     = make([]float64, len(vector))
	)
	for i, v := range vector {
		// Multiply before dividing: the operation order is kept on
		// purpose so the floating-point results stay bit-identical.
		out[i] = (v - params.Min) * buckets / span
	}
	return out
}

// clone returns a deep copy of src: the outer slice and every cube in it
// get their own backing storage, so changing the copy never affects src
// (and vice versa). The result is never nil, and neither are its cubes.
func clone(src Cubes) (dst Cubes) {
	dst = make(Cubes, 0, len(src))
	for _, cube := range src {
		dup := make(Cube, len(cube))
		copy(dup, cube)
		dst = append(dst, dup)
	}
	return dst
}
42 changes: 21 additions & 21 deletions hypercubes_test.go → cubes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"testing"
)

func centralIsNotInTheSet(set [][]int, central []int) bool {
func centralIsNotInTheSet(set Cubes, central Cube) bool {
for _, cube := range set {
counter := 0
for i, c := range central {
Expand All @@ -21,9 +21,9 @@ func centralIsNotInTheSet(set [][]int, central []int) bool {
}

func TestRescale(t *testing.T) { // Testing panic.
numBuckets, min, max, _ := 10, 0.0, 255.0, 0.25
vector := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 255.0}
rescaled := rescale(vector, numBuckets, min, max)
params := Params{0.0, 255.0, 0.25, 10}
rescaled := rescale(vector, params)
got := rescaled
want := []float64{
1, 0.0003921568627450981, 8.24705882352941,
Expand All @@ -38,20 +38,20 @@ func TestCubeSet1(t *testing.T) { // Testing panic.
defer func() { recover() }()
// Intentionally forbidden value for epsPercent.
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
min, max, epsPercent, numBuckets := 0.0, 255.0, 0.51, 10
_ = CubeSet(values, min, max, epsPercent, numBuckets)
params := Params{0.0, 255.0, 0.51, 10}
_ = CubeSet(values, params)
// Never reaches here if CubeSet panics.
t.Errorf("Params did not panic on epsPercent > 0.5")
}

func TestCubeSet2(t *testing.T) {
numBuckets, min, max, epsPercent := 10, 0.0, 255.0, 0.25
params := Params{0.0, 255.0, 0.25, 10}
values := []float64{25.5, 0.01, 210.3, 93.9, 6.6, 9.1, 254.9}
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
gotCentral := CentralCube(values, min, max, epsPercent, numBuckets)
wantCubes := [][]int{{0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9},
gotCubes := CubeSet(values, params)
gotCentral := CentralCube(values, params)
wantCubes := Cubes{{0, 0, 7, 3, 0, 0, 9}, {1, 0, 7, 3, 0, 0, 9},
{0, 0, 8, 3, 0, 0, 9}, {1, 0, 8, 3, 0, 0, 9}}
wantCentral := []int{1, 0, 8, 3, 0, 0, 9}
wantCentral := Cube{1, 0, 8, 3, 0, 0, 9}
if !reflect.DeepEqual(gotCubes, wantCubes) {
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
}
Expand All @@ -65,12 +65,12 @@ func TestCubeSet2(t *testing.T) {

// Testing bucket borders.
func TestCubeSet3(t *testing.T) {
numBuckets, min, max, epsPercent := 4, 0.0, 4.0, 0.25
params := Params{0.0, 4.0, 0.25, 4}
values := []float64{0.01, 2 * 0.999, 2 * 1.001}
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
gotCentral := CentralCube(values, min, max, epsPercent, numBuckets)
wantCubes := [][]int{{0, 1, 1}, {0, 2, 1}, {0, 1, 2}, {0, 2, 2}}
wantCentral := []int{0, 1, 2}
gotCubes := CubeSet(values, params)
gotCentral := CentralCube(values, params)
wantCubes := Cubes{{0, 1, 1}, {0, 2, 1}, {0, 1, 2}, {0, 2, 2}}
wantCentral := Cube{0, 1, 2}
if !reflect.DeepEqual(gotCubes, wantCubes) {
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
}
Expand All @@ -85,9 +85,9 @@ func TestCubeSet3(t *testing.T) {
// Testing extreme buckets.
func TestCubeSet4(t *testing.T) {
values := []float64{255.0, 0.0, 255.0, 0.0, 255.0, 0.0, 255.0}
numBuckets, min, max, epsPercent := 4, 0.0, 255.0, 0.25
gotCubes := CubeSet(values, min, max, epsPercent, numBuckets)
wantCubes := [][]int{{3, 0, 3, 0, 3, 0, 3}}
params := Params{0.0, 255.0, 0.25, 4}
gotCubes := CubeSet(values, params)
wantCubes := Cubes{{3, 0, 3, 0, 3, 0, 3}}
if !reflect.DeepEqual(gotCubes, wantCubes) {
t.Errorf(`Got %v, want %v.`, gotCubes, wantCubes)
}
Expand All @@ -97,9 +97,9 @@ var vector = []float64{
0, 183, 148, 21, 47, 16, 69, 45, 151, 64, 181}

func TestCubeSet5(t *testing.T) {
numBuckets, min, max, epsPercent := 4, 0.0, 255.0, 0.25
gotCubes := CubeSet(vector, min, max, epsPercent, numBuckets)
wantCubes := [][]int{
params := Params{0.0, 255.0, 0.25, 4}
gotCubes := CubeSet(vector, params)
wantCubes := Cubes{
{0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2}, {0, 3, 2, 0, 0, 0, 0, 0, 2, 0, 2},
{0, 2, 2, 0, 0, 0, 1, 0, 2, 0, 2}, {0, 3, 2, 0, 0, 0, 1, 0, 2, 0, 2},
{0, 2, 2, 0, 0, 0, 0, 0, 2, 1, 2}, {0, 3, 2, 0, 0, 0, 0, 0, 2, 1, 2},
Expand Down
12 changes: 6 additions & 6 deletions hashes.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,22 @@ import (
"hash/fnv"
)

// Decimal hashes hypercubes without collisions. IMPORTANT:
// DecimalHash hashes hypercubes without collisions. IMPORTANT:
// To work correctly, the number of buckets must be
// less than 11 and the number of dimensions less than 20.
// Else at certain unexpected moment you might get a hash
// value overflow.
func Decimal(cube []int) (h uint64) {
func (cube Cube) DecimalHash() (h uint64) {
	// Each coordinate becomes one decimal digit of the hash, with the
	// first coordinate as the most significant digit.
	for i := 0; i < len(cube); i++ {
		h *= 10
		h += uint64(cube[i])
	}
	return h
}

// FNV1a hashes hypercubes with rare collisions,
// FNV1aHash hashes hypercubes with rare collisions,
// and should be used when DecimalHash cannot be used
// because of very large number of buckets or dimensions.
func FNV1a(cube []int) uint64 {
func (cube Cube) FNV1aHash() uint64 {
var b bytes.Buffer
gob.NewEncoder(&b).Encode(cube)
hash := fnv.New64a()
Expand All @@ -30,11 +30,11 @@ func FNV1a(cube []int) uint64 {
}

// HashFunc can be any function (also user-defined).
type HashFunc func(hypercube []int) uint64
type HashFunc func(cube Cube) uint64

// HashSet returns a set of hashes for a hypercube set
// and a concrete hash function.
func HashSet(cubeSet [][]int, hashFunc HashFunc) (
func (cubeSet Cubes) HashSet(hashFunc HashFunc) (
hs []uint64) {
for i := 0; i < len(cubeSet); i++ {
hs = append(hs, hashFunc(cubeSet[i]))
Expand Down
30 changes: 15 additions & 15 deletions hashes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,37 +5,37 @@ import (
"testing"
)

func TestDecimal(t *testing.T) {
hypercube := []int{3, 2, 0, 1, 1, 4, 1, 0}
hash := Decimal(hypercube)
// TestDecimalHash checks that the coordinates of a cube are written out
// as consecutive decimal digits of the hash.
func TestDecimalHash(t *testing.T) {
	const want uint64 = 32011410
	input := Cube{3, 2, 0, 1, 1, 4, 1, 0}
	if got := input.DecimalHash(); got != want {
		t.Errorf(`Got %v, want %v.`, got, want)
	}
}

func TestFNV1a(t *testing.T) {
buckets := []int{5, 59, 255, 9, 7, 12, 22, 31}
hash := FNV1a(buckets)
want := uint64(13992349377752315208)
// TestFNV1aHash pins the FNV-1a hash of a known cube to a fixed value.
func TestFNV1aHash(t *testing.T) {
	const want uint64 = 1659788114117494335
	input := Cube{5, 59, 255, 9, 7, 12, 22, 31}
	if got := input.FNV1aHash(); got != want {
		t.Errorf(`Got %v, want %v.`, got, want)
	}
}

func TestHashSet(t *testing.T) {
tree := [][]int{
cubes := Cubes{
{0, 0, 7, 3, 0, 0, 9},
{1, 0, 7, 3, 0, 0, 9},
{0, 0, 8, 3, 0, 0, 9},
{1, 0, 8, 3, 0, 0, 9}}
hs := HashSet(tree, FNV1a)
hashSet := cubes.HashSet((Cube).FNV1aHash)
want := []uint64{
14647827280143437043,
17530493565529410009,
7065940388079601005,
13953051952027146823}
if !reflect.DeepEqual(hs, want) {
t.Errorf(`Got %v, want %v.`, hs, want)
6172277127052188606,
3265650857171344968,
13730239218993256724,
6843127655045710906}
if !reflect.DeepEqual(hashSet, want) {
t.Errorf(`Got %v, want %v.`, hashSet, want)
}
}

0 comments on commit c425527

Please sign in to comment.