diff --git a/pkg/stringx/trie.go b/pkg/stringx/trie.go
new file mode 100644
--- /dev/null
+++ b/pkg/stringx/trie.go
@@ -0,0 +1,152 @@
+package stringx
+
+// refer: https://github.com/tal-tech/go-zero/blob/master/core/stringx/trie.go
+
+// trieMask is the rune written over every character of a matched keyword.
+const trieMask = '*'
+
+// Trie finds and masks keywords in text using a prefix tree.
+type Trie interface {
+	// Filter returns text with every matched keyword masked and the
+	// distinct keywords that were found.
+	Filter(text string) (string, []string)
+	// FindKeywords returns the distinct keywords that occur in text.
+	FindKeywords(text string) []string
+	// AddWords registers additional keywords.
+	// Attention: NOT thread safe.
+	AddWords(words ...string)
+}
+
+// scope is the half-open rune range [start, stop) of one match.
+type scope struct {
+	start int
+	stop  int
+}
+
+// node is a single vertex of the trie.
+type node struct {
+	// children maps the next rune to its node; nil until a child is added.
+	children map[rune]*node
+	// end reports whether the path from the root to this node is a keyword.
+	end bool
+}
+
+// trieNode is the root node and the Trie implementation of this package.
+type trieNode struct {
+	node
+}
+
+// Compile-time check that *trieNode implements Trie.
+var _ Trie = (*trieNode)(nil)
+
+// NewTrie returns a Trie that already contains words.
+func NewTrie(words ...string) Trie {
+	t := &trieNode{}
+	t.AddWords(words...)
+	return t
+}
+
+// add inserts word below n, creating missing nodes on the way.
+// An empty word is ignored so the root never becomes a keyword end.
+func (n *node) add(word string) {
+	if word == "" {
+		return
+	}
+
+	nd := n
+	for _, char := range word {
+		// Reading from a nil map is safe; only the write needs the map.
+		child, ok := nd.children[char]
+		if !ok {
+			if nd.children == nil {
+				nd.children = make(map[rune]*node)
+			}
+			child = new(node)
+			nd.children[char] = child
+		}
+		nd = child
+	}
+	nd.end = true
+}
+
+// Filter returns text with every matched keyword masked and the distinct
+// keywords that were found, ordered by first appearance.
+func (t *trieNode) Filter(text string) (string, []string) {
+	chars := []rune(text)
+	if len(chars) == 0 {
+		return text, nil
+	}
+
+	scopes := t.findScopes(chars)
+	// Collect the keywords first: masking overwrites chars in place.
+	keywords := t.collectKeywords(chars, scopes)
+
+	for _, match := range scopes {
+		for i := match.start; i < match.stop; i++ {
+			chars[i] = trieMask
+		}
+	}
+
+	return string(chars), keywords
+}
+
+// FindKeywords returns the distinct keywords that occur in text, ordered by
+// first appearance.
+func (t *trieNode) FindKeywords(text string) []string {
+	chars := []rune(text)
+	if len(chars) == 0 {
+		return nil
+	}
+
+	return t.collectKeywords(chars, t.findScopes(chars))
+}
+
+// AddWords registers additional keywords; empty words are ignored.
+// Attention: NOT thread safe.
+func (t *trieNode) AddWords(words ...string) {
+	for _, word := range words {
+		t.add(word)
+	}
+}
+
+// findScopes returns the range of every keyword occurrence in chars,
+// including overlapping matches and keywords that prefix longer ones.
+func (t *trieNode) findScopes(chars []rune) []scope {
+	var scopes []scope
+
+	for start := range chars {
+		nd := &t.node
+		// Follow the trie from the root while the text keeps matching.
+		for stop := start; stop < len(chars); stop++ {
+			child, ok := nd.children[chars[stop]]
+			if !ok {
+				break
+			}
+
+			nd = child
+			if nd.end {
+				scopes = append(scopes, scope{start: start, stop: stop + 1})
+			}
+		}
+	}
+
+	return scopes
+}
+
+// collectKeywords returns the distinct substrings of chars covered by
+// scopes, ordered by first appearance so the result is deterministic.
+func (t *trieNode) collectKeywords(chars []rune, scopes []scope) []string {
+	seen := make(map[string]struct{}, len(scopes))
+	// Non-nil even when scopes is empty, so a no-match result is an empty slice.
+	keywords := make([]string, 0, len(scopes))
+	for _, sc := range scopes {
+		word := string(chars[sc.start:sc.stop])
+		if _, dup := seen[word]; dup {
+			continue
+		}
+		seen[word] = struct{}{}
+		keywords = append(keywords, word)
+	}
+
+	return keywords
+}
diff --git a/pkg/stringx/trie_test.go b/pkg/stringx/trie_test.go
new file mode 100644
--- /dev/null
+++ b/pkg/stringx/trie_test.go
@@ -0,0 +1,79 @@
+package stringx
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// TestTrie checks masking and keyword extraction through Filter.
+func TestTrie(t *testing.T) {
+	tests := []struct {
+		in       string
+		out      string
+		keywords []string
+	}{
+		{
+			in:       "Lorem ipsum dolor sit amet, ipsum nominati ocurreret ei per",
+			out:      "Lorem ***** dolor sit amet, ***** ******** ocurreret ei per",
+			keywords: []string{"ipsum", "nominati"},
+		},
+		{
+			in:       "ea timeam aliquip tacimates nec",
+			out:      "ea timeam aliquip tacimates nec",
+			keywords: []string{},
+		},
+	}
+
+	trie := NewTrie(
+		"ipsum",
+		"nominati",
+	)
+
+	for _, tt := range tests {
+		t.Run(tt.in, func(t *testing.T) {
+			out, keywords := trie.Filter(tt.in)
+			assert.Equal(t, tt.out, out)
+			assert.Equal(t, tt.keywords, keywords)
+		})
+	}
+}
+
+// TestTrieOverlap checks keywords that overlap or prefix one another.
+func TestTrieOverlap(t *testing.T) {
+	trie := NewTrie("ab", "abc")
+	trie.AddWords("cd", "")
+
+	out, keywords := trie.Filter("xabcdx")
+	assert.Equal(t, "x****x", out)
+	assert.Equal(t, []string{"ab", "abc", "cd"}, keywords)
+	assert.Equal(t, []string{"ab", "abc", "cd"}, trie.FindKeywords("abcd abcd"))
+}
+
+// BenchmarkTrie1 measures Filter on a short sentence.
+func BenchmarkTrie1(b *testing.B) {
+	trie := NewTrie(
+		"ipsum",
+		"nominati",
+	)
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_, _ = trie.Filter("Lorem ipsum dolor sit amet, ipsum nominati ocurreret ei per")
+	}
+}
+
+// BenchmarkTrie2 measures Filter on a longer paragraph.
+func BenchmarkTrie2(b *testing.B) {
+	trie := NewTrie(
+		"ipsum",
+		"nominati",
+	)
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_, _ = trie.Filter("Lorem ipsum dolor sit amet, ipsum nominati ocurreret ei per, in quo tation nonumy, no iusto luptatum gloriatur vel. Per at solet quaestio, admodum feugait splendide ei vis. Mea ad mutat possit. Dicant nonumy animal duo id, no fugit platonem sea. In has zril labitur menandri, his dolorem eleifend et, eu ius wisi solet scribentur.")
+	}
+}