Skip to content

Commit

Permalink
refactor index
Browse files Browse the repository at this point in the history
  • Loading branch information
micpst committed May 25, 2024
1 parent 1dfe26b commit 779220c
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 172 deletions.
21 changes: 10 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ Restful, in-memory, full-text search engine written in Go.
## ✅ Features

- [x] Full-text indexing of multiple fields in a document
- [x] Boolean queries with AND, OR operators between subqueries
- [x] Exact phrase search
- [x] Document ranking based on BM25
- [x] Vector similarity search for semantic search
Expand Down Expand Up @@ -52,10 +51,10 @@ Create a new document and add it to the index.
```bash
$ curl -X POST localhost:3000/api/v1/documents \
-H 'Content-Type: application/json' \
-d '{
"title": "The Silicon Brain",
"url": "https://micpst.com/posts/silicon-brain",
"abstract": "The human brain is often described as complex..."
-d '{
"title": "The Silicon Brain",
"url": "https://micpst.com/posts/silicon-brain",
"abstract": "The human brain is often described as complex..."
}'
```

Expand Down Expand Up @@ -88,10 +87,10 @@ Update the existing document and re-index it with the new fields.
```bash
$ curl -X PUT localhost:3000/api/v1/documents/<id> \
-H 'Content-Type: application/json' \
-d '{
"title": "The Silicon Brain",
"url": "https://micpst.com/posts/silicon-brain",
"abstract": "The human brain is often described as complex..."
-d '{
"title": "The Silicon Brain",
"url": "https://micpst.com/posts/silicon-brain",
"abstract": "The human brain is often described as complex..."
}'
```

Expand Down Expand Up @@ -185,12 +184,12 @@ $ curl -X POST localhost:3000/api/v1/search \
// Default value: 1.2
// Recommended value: between 1.2 and 2
"k": 1.2,
// Length normalization parameter.
// Default value: 0.75
// Recommended value: > 0.75
"b": 0.75,
// Frequency normalization lower bound.
// Default value: 0.5
// Recommended value: between 0.5 and 1
Expand Down
1 change: 0 additions & 1 deletion api/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,6 @@ func (s *Server) deleteDocument(c *gin.Context) {
func (s *Server) searchDocuments(c *gin.Context) {
params := store.SearchParams{
Properties: []string{},
BoolMode: store.AND,
Limit: 10,
Relevance: store.BM25Params{
K: 1.2,
Expand Down
179 changes: 124 additions & 55 deletions pkg/store/index.go
Original file line number Diff line number Diff line change
@@ -1,103 +1,172 @@
package store

import (
"fmt"
"reflect"

"github.com/micpst/minisearch/pkg/lib"
"github.com/micpst/minisearch/pkg/radix"
"github.com/micpst/minisearch/pkg/tokenizer"
)

type FindParams struct {
Tokens []string
BoolMode Mode
Term string
Property string
Exact bool
Tolerance int
Relevance BM25Params
DocsCount int
}

type IndexParams struct {
Id string
Tokens []string
DocsCount int
type IndexParams[S Schema] struct {
Id string
Document S
DocsCount int
language tokenizer.Language
tokenizerConfig *tokenizer.Config
}

type Index struct {
data *radix.Trie
avgFieldLength float64
fieldLengths map[string]int
tokenOccurrences map[string]int
type Index[S Schema] struct {
indexes map[string]*radix.Trie
searchableProperties []string
avgFieldLength map[string]float64
fieldLengths map[string]map[string]int
tokenOccurrences map[string]map[string]int
}

func NewIndex() *Index {
return &Index{
data: radix.New(),
fieldLengths: make(map[string]int),
tokenOccurrences: make(map[string]int),
func newIndex[S Schema]() *Index[S] {
idx := &Index[S]{
indexes: make(map[string]*radix.Trie),
searchableProperties: make([]string, 0),
avgFieldLength: make(map[string]float64),
fieldLengths: make(map[string]map[string]int),
tokenOccurrences: make(map[string]map[string]int),
}
idx.build()
return idx
}

func (idx *Index) Insert(params *IndexParams) {
tokensCount := lib.Count(params.Tokens)

for token, count := range tokensCount {
tokenFrequency := float64(count) / float64(len(params.Tokens))
idx.data.Insert(&radix.InsertParams{
Id: params.Id,
Word: token,
TermFrequency: tokenFrequency,
})
idx.tokenOccurrences[token]++
func (idx *Index[S]) build() {
var s S
for key, value := range flattenSchema(s) {
switch value.(type) {
case string:
idx.indexes[key] = radix.New()
idx.fieldLengths[key] = make(map[string]int)
idx.tokenOccurrences[key] = make(map[string]int)
idx.searchableProperties = append(idx.searchableProperties, key)
default:
continue
}
}

idx.avgFieldLength = (idx.avgFieldLength*float64(params.DocsCount-1) + float64(len(params.Tokens))) / float64(params.DocsCount)
idx.fieldLengths[params.Id] = len(params.Tokens)
}

func (idx *Index) Delete(params *IndexParams) {
for _, token := range params.Tokens {
idx.data.Delete(&radix.DeleteParams{
Id: params.Id,
Word: token,
})
idx.tokenOccurrences[token]--
if idx.tokenOccurrences[token] == 0 {
delete(idx.tokenOccurrences, token)
func (idx *Index[S]) Insert(params *IndexParams[S]) {
document := flattenSchema(params.Document)

for propName, index := range idx.indexes {
tokens, _ := tokenizer.Tokenize(&tokenizer.TokenizeParams{
Text: document[propName].(string),
Language: params.language,
AllowDuplicates: true,
}, params.tokenizerConfig)

allTokensCount := float64(len(tokens))
tokensCount := lib.Count(tokens)

for token, count := range tokensCount {
tokenFrequency := float64(count) / allTokensCount
index.Insert(&radix.InsertParams{
Id: params.Id,
Word: token,
TermFrequency: tokenFrequency,
})
idx.tokenOccurrences[propName][token]++
}

idx.avgFieldLength[propName] = (idx.avgFieldLength[propName]*float64(params.DocsCount-1) + allTokensCount) / float64(params.DocsCount)
idx.fieldLengths[propName][params.Id] = int(allTokensCount)
}
}

idx.avgFieldLength = (idx.avgFieldLength*float64(params.DocsCount) - float64(len(params.Tokens))) / float64(params.DocsCount-1)
delete(idx.fieldLengths, params.Id)
func (idx *Index[S]) Delete(params *IndexParams[S]) {
document := flattenSchema(params.Document)

for propName, index := range idx.indexes {
tokens, _ := tokenizer.Tokenize(&tokenizer.TokenizeParams{
Text: document[propName].(string),
Language: params.language,
AllowDuplicates: false,
}, params.tokenizerConfig)

for _, token := range tokens {
index.Delete(&radix.DeleteParams{
Id: params.Id,
Word: token,
})
idx.tokenOccurrences[propName][token]--
if idx.tokenOccurrences[propName][token] == 0 {
delete(idx.tokenOccurrences[propName], token)
}
}

idx.avgFieldLength[propName] = (idx.avgFieldLength[propName]*float64(params.DocsCount) - float64(len(tokens))) / float64(params.DocsCount-1)
delete(idx.fieldLengths[propName], params.Id)
}
}

func (idx *Index) Find(params *FindParams) map[string]float64 {
func (idx *Index[S]) Find(params *FindParams) map[string]float64 {
idScores := make(map[string]float64)
idTokensCount := make(map[string]int)

for _, token := range params.Tokens {
infos := idx.data.Find(&radix.FindParams{
Term: token,
if index, ok := idx.indexes[params.Property]; ok {
infos := index.Find(&radix.FindParams{
Term: params.Term,
Tolerance: params.Tolerance,
Exact: params.Exact,
})
for _, info := range infos {
idScores[info.Id] += lib.BM25(
idScores[info.Id] = lib.BM25(
info.TermFrequency,
idx.tokenOccurrences[token],
idx.fieldLengths[info.Id],
idx.avgFieldLength,
idx.tokenOccurrences[params.Property][params.Term],
idx.fieldLengths[params.Property][info.Id],
idx.avgFieldLength[params.Property],
params.DocsCount,
params.Relevance.K,
params.Relevance.B,
params.Relevance.D,
)
idTokensCount[info.Id]++
}
}

for id, tokensCount := range idTokensCount {
if params.BoolMode == AND && tokensCount != len(params.Tokens) {
delete(idScores, id)
return idScores
}

// flattenSchema converts a struct into a flat map of property name -> value,
// using the `index` struct tag to select and name fields. Nested structs are
// flattened recursively with dot-separated names ("parent.child"). Untagged
// fields are skipped. Integer fields are returned as int64, unsigned as
// uint64; any other kind falls through to reflect's String representation.
func flattenSchema(obj any, prefix ...string) map[string]any {
	m := make(map[string]any)
	t := reflect.TypeOf(obj)
	v := reflect.ValueOf(obj)
	fields := reflect.VisibleFields(t)

	for i, field := range fields {
		if propName, ok := field.Tag.Lookup("index"); ok {
			// Prefix nested property names with the parent's tag.
			if len(prefix) == 1 {
				propName = fmt.Sprintf("%s.%s", prefix[0], propName)
			}

			switch field.Type.Kind() {
			case reflect.Struct:
				for key, value := range flattenSchema(v.Field(i).Interface(), propName) {
					m[key] = value
				}
			case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
				m[propName] = v.Field(i).Int()
			case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
				m[propName] = v.Field(i).Uint()
			default:
				m[propName] = v.Field(i).String()
			}
		}
	}

	return m
}
Loading

0 comments on commit 779220c

Please sign in to comment.