-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvectors.go
142 lines (122 loc) · 3.65 KB
/
vectors.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// Copyright 2022, NLP Odyssey Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gomaddness
import (
"math"
"sort"
)
// Vectors is a collection of Vector objects.
//
// All Vector objects MUST be of the same size.
//
// Given a collection of N vectors, each of size M, this can also be seen as a
// matrix with dimensions N×M (N rows, M columns).
type Vectors[F Float] []Vector[F]
// ColumnWiseVariance returns a new Vector containing the column-wise variance
// computed across all vectors.
func (vs Vectors[F]) ColumnWiseVariance() Vector[F] {
vecSize := len(vs[0])
sum := make(Vector[F], vecSize)
sumOfSquares := make(Vector[F], vecSize)
for _, v := range vs {
sum.Add(v)
sumOfSquares.AddSquares(v)
}
n := F(len(vs))
mean := sum.DivScalar(n)
squareMean := mean.Square()
return sumOfSquares.DivScalar(n).Sub(squareMean)
}
// SplitByThreshold splits the collection of vectors by a threshold value,
// compared with the elements at the given index.
//
// Two new collections are returned:
// - lt: vectors whose value at "index" is lower than "threshold"
// - gte: vectors whose value at "index" is greater than or equal to "threshold"
func (vs Vectors[F]) SplitByThreshold(index int, threshold F) (lt, gte Vectors[F]) {
lt = make(Vectors[F], 0)
gte = make(Vectors[F], 0)
for _, v := range vs {
if v[index] < threshold {
lt = append(lt, v)
continue
}
gte = append(gte, v)
}
return
}
// Copy returns a shallow copy of the collection.
func (vs Vectors[F]) Copy() Vectors[F] {
c := make(Vectors[F], len(vs))
copy(c, vs)
return c
}
// SortByColumn sorts vs in place, according to the value at each vector's
// index (column), in ascending order.
// It returns vs.
func (vs Vectors[F]) SortByColumn(index int) Vectors[F] {
sort.SliceStable(vs, func(i, j int) bool {
return vs[i][index] < vs[j][index]
})
return vs
}
// Reverse reverses vs in place.
// It returns vs.
func (vs Vectors[F]) Reverse() Vectors[F] {
for l, r := 0, len(vs)-1; l < r; l, r = l+1, r-1 {
vs[l], vs[r] = vs[r], vs[l]
}
return vs
}
// CumulativeSSE returns a new Vector, computing the error sum of squares
// (SSE) over the set of Vectors.
func (vs Vectors[F]) CumulativeSSE() Vector[F] {
n := len(vs)
d := len(vs[0])
out := make(Vector[F], n) // all values are zero by default
sum := make(Vector[F], d)
sumOfSquares := make(Vector[F], d)
_ = sum[len(vs[0])-1]
for j, x := range vs[0] {
sum[j] = x
sumOfSquares[j] = x * x
}
for i := 1; i < n; i++ {
_ = sum[len(vs[i])-1]
for j, x := range vs[i] {
sum[j] += x
sumOfSquares[j] += x * x
out[i] += sumOfSquares[j] - (sum[j] * sum[j] / F(i+1))
}
}
return out
}
// OptimalSplitThreshold finds the optimal split threshold within the vectors,
// computed over values at the given split-index column.
func (vs Vectors[F]) OptimalSplitThreshold(splitIndex int) (threshold, loss F) {
sorted := vs.Copy().SortByColumn(splitIndex)
if len(sorted) == 1 {
// TODO: check this corner case
sorted = append(sorted, sorted[0])
}
losses := sorted.CumulativeSSE() // head
tail := sorted.Copy().Reverse().CumulativeSSE().Reverse()
losses[:len(losses)-1].Add(tail[1:])
n := losses.ArgMin()
if n < len(sorted)-1 {
threshold = (sorted[n][splitIndex] + sorted[n+1][splitIndex]) / 2
} else {
// TODO: check this corner case
threshold = F(math.Nextafter32(float32(sorted[n][splitIndex]), float32(math.Inf(+1))))
}
return threshold, losses[n]
}
// Mean calculates and returns the mean vector.
func (vs Vectors[F]) Mean() Vector[F] {
sum := vs[0].Copy()
for _, v := range vs[1:] {
sum.Add(v)
}
return sum.DivScalar(F(len(vs)))
}