Skip to content

Commit

Permalink
refactor index
Browse files Browse the repository at this point in the history
  • Loading branch information
micpst committed May 25, 2024
1 parent 1dfe26b commit 779220c
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 172 deletions.
21 changes: 10 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ Restful, in-memory, full-text search engine written in Go.
## ✅ Features

- [x] Full-text indexing of multiple fields in a document
- [x] Boolean queries with AND, OR operators between subqueries
- [x] Exact phrase search
- [x] Document ranking based on BM25
- [x] Vector similarity search for semantic search
Expand Down Expand Up @@ -52,10 +51,10 @@ Create a new document and add it to the index.
```bash
$ curl -X POST localhost:3000/api/v1/documents \
-H 'Content-Type: application/json' \
-d '{
"title": "The Silicon Brain",
"url": "https://micpst.com/posts/silicon-brain",
"abstract": "The human brain is often described as complex..."
-d '{
"title": "The Silicon Brain",
"url": "https://micpst.com/posts/silicon-brain",
"abstract": "The human brain is often described as complex..."
}'
```

Expand Down Expand Up @@ -88,10 +87,10 @@ Update the existing document and re-index it with the new fields.
```bash
$ curl -X PUT localhost:3000/api/v1/documents/<id> \
-H 'Content-Type: application/json' \
-d '{
"title": "The Silicon Brain",
"url": "https://micpst.com/posts/silicon-brain",
"abstract": "The human brain is often described as complex..."
-d '{
"title": "The Silicon Brain",
"url": "https://micpst.com/posts/silicon-brain",
"abstract": "The human brain is often described as complex..."
}'
```

Expand Down Expand Up @@ -185,12 +184,12 @@ $ curl -X POST localhost:3000/api/v1/search \
// Default value: 1.2
// Recommended value: between 1.2 and 2
"k": 1.2,
// Length normalization parameter.
// Default value: 0.75
// Recommended value: > 0.75
"b": 0.75,
// Frequency normalization lower bound.
// Default value: 0.5
// Recommended value: between 0.5 and 1
Expand Down
1 change: 0 additions & 1 deletion api/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,6 @@ func (s *Server) deleteDocument(c *gin.Context) {
func (s *Server) searchDocuments(c *gin.Context) {
params := store.SearchParams{
Properties: []string{},
BoolMode: store.AND,
Limit: 10,
Relevance: store.BM25Params{
K: 1.2,
Expand Down
179 changes: 124 additions & 55 deletions pkg/store/index.go
Original file line number Diff line number Diff line change
@@ -1,103 +1,172 @@
package store

import (
"fmt"
"reflect"

"github.com/micpst/minisearch/pkg/lib"
"github.com/micpst/minisearch/pkg/radix"
"github.com/micpst/minisearch/pkg/tokenizer"
)

type FindParams struct {
Tokens []string
BoolMode Mode
Term string
Property string
Exact bool
Tolerance int
Relevance BM25Params
DocsCount int
}

type IndexParams struct {
Id string
Tokens []string
DocsCount int
type IndexParams[S Schema] struct {
Id string
Document S
DocsCount int
language tokenizer.Language
tokenizerConfig *tokenizer.Config
}

type Index struct {
data *radix.Trie
avgFieldLength float64
fieldLengths map[string]int
tokenOccurrences map[string]int
type Index[S Schema] struct {
indexes map[string]*radix.Trie
searchableProperties []string
avgFieldLength map[string]float64
fieldLengths map[string]map[string]int
tokenOccurrences map[string]map[string]int
}

func NewIndex() *Index {
return &Index{
data: radix.New(),
fieldLengths: make(map[string]int),
tokenOccurrences: make(map[string]int),
func newIndex[S Schema]() *Index[S] {
idx := &Index[S]{
indexes: make(map[string]*radix.Trie),
searchableProperties: make([]string, 0),
avgFieldLength: make(map[string]float64),
fieldLengths: make(map[string]map[string]int),
tokenOccurrences: make(map[string]map[string]int),
}
idx.build()
return idx
}

func (idx *Index) Insert(params *IndexParams) {
tokensCount := lib.Count(params.Tokens)

for token, count := range tokensCount {
tokenFrequency := float64(count) / float64(len(params.Tokens))
idx.data.Insert(&radix.InsertParams{
Id: params.Id,
Word: token,
TermFrequency: tokenFrequency,
})
idx.tokenOccurrences[token]++
func (idx *Index[S]) build() {
var s S
for key, value := range flattenSchema(s) {
switch value.(type) {
case string:
idx.indexes[key] = radix.New()
idx.fieldLengths[key] = make(map[string]int)
idx.tokenOccurrences[key] = make(map[string]int)
idx.searchableProperties = append(idx.searchableProperties, key)
default:
continue
}
}

idx.avgFieldLength = (idx.avgFieldLength*float64(params.DocsCount-1) + float64(len(params.Tokens))) / float64(params.DocsCount)
idx.fieldLengths[params.Id] = len(params.Tokens)
}

func (idx *Index) Delete(params *IndexParams) {
for _, token := range params.Tokens {
idx.data.Delete(&radix.DeleteParams{
Id: params.Id,
Word: token,
})
idx.tokenOccurrences[token]--
if idx.tokenOccurrences[token] == 0 {
delete(idx.tokenOccurrences, token)
func (idx *Index[S]) Insert(params *IndexParams[S]) {
document := flattenSchema(params.Document)

for propName, index := range idx.indexes {
tokens, _ := tokenizer.Tokenize(&tokenizer.TokenizeParams{
Text: document[propName].(string),
Language: params.language,
AllowDuplicates: true,
}, params.tokenizerConfig)

allTokensCount := float64(len(tokens))
tokensCount := lib.Count(tokens)

for token, count := range tokensCount {
tokenFrequency := float64(count) / allTokensCount
index.Insert(&radix.InsertParams{
Id: params.Id,
Word: token,
TermFrequency: tokenFrequency,
})
idx.tokenOccurrences[propName][token]++
}

idx.avgFieldLength[propName] = (idx.avgFieldLength[propName]*float64(params.DocsCount-1) + allTokensCount) / float64(params.DocsCount)
idx.fieldLengths[propName][params.Id] = int(allTokensCount)
}
}

idx.avgFieldLength = (idx.avgFieldLength*float64(params.DocsCount) - float64(len(params.Tokens))) / float64(params.DocsCount-1)
delete(idx.fieldLengths, params.Id)
func (idx *Index[S]) Delete(params *IndexParams[S]) {
document := flattenSchema(params.Document)

for propName, index := range idx.indexes {
tokens, _ := tokenizer.Tokenize(&tokenizer.TokenizeParams{
Text: document[propName].(string),
Language: params.language,
AllowDuplicates: false,
}, params.tokenizerConfig)

for _, token := range tokens {
index.Delete(&radix.DeleteParams{
Id: params.Id,
Word: token,
})
idx.tokenOccurrences[propName][token]--
if idx.tokenOccurrences[propName][token] == 0 {
delete(idx.tokenOccurrences[propName], token)
}
}

idx.avgFieldLength[propName] = (idx.avgFieldLength[propName]*float64(params.DocsCount) - float64(len(tokens))) / float64(params.DocsCount-1)
delete(idx.fieldLengths[propName], params.Id)
}
}

func (idx *Index) Find(params *FindParams) map[string]float64 {
func (idx *Index[S]) Find(params *FindParams) map[string]float64 {
idScores := make(map[string]float64)
idTokensCount := make(map[string]int)

for _, token := range params.Tokens {
infos := idx.data.Find(&radix.FindParams{
Term: token,
if index, ok := idx.indexes[params.Property]; ok {
infos := index.Find(&radix.FindParams{
Term: params.Term,
Tolerance: params.Tolerance,
Exact: params.Exact,
})
for _, info := range infos {
idScores[info.Id] += lib.BM25(
idScores[info.Id] = lib.BM25(
info.TermFrequency,
idx.tokenOccurrences[token],
idx.fieldLengths[info.Id],
idx.avgFieldLength,
idx.tokenOccurrences[params.Property][params.Term],
idx.fieldLengths[params.Property][info.Id],
idx.avgFieldLength[params.Property],
params.DocsCount,
params.Relevance.K,
params.Relevance.B,
params.Relevance.D,
)
idTokensCount[info.Id]++
}
}

for id, tokensCount := range idTokensCount {
if params.BoolMode == AND && tokensCount != len(params.Tokens) {
delete(idScores, id)
return idScores
}

// flattenSchema converts a struct into a flat map of property name -> value,
// using the `index` struct tag to select and name fields. Nested structs are
// flattened recursively with dot-separated names ("parent.child"). Untagged
// fields are skipped. Integer fields are returned as int64, unsigned as
// uint64; any other kind falls through to reflect's String representation.
func flattenSchema(obj any, prefix ...string) map[string]any {
	m := make(map[string]any)
	t := reflect.TypeOf(obj)
	v := reflect.ValueOf(obj)
	fields := reflect.VisibleFields(t)

	for i, field := range fields {
		if propName, ok := field.Tag.Lookup("index"); ok {
			// Prefix nested property names with the parent's tag.
			if len(prefix) == 1 {
				propName = fmt.Sprintf("%s.%s", prefix[0], propName)
			}

			switch field.Type.Kind() {
			case reflect.Struct:
				for key, value := range flattenSchema(v.Field(i).Interface(), propName) {
					m[key] = value
				}
			case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
				m[propName] = v.Field(i).Int()
			case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
				m[propName] = v.Field(i).Uint()
			default:
				m[propName] = v.Field(i).String()
			}
		}
	}

	return m
}
Loading

0 comments on commit 779220c

Please sign in to comment.