Merge pull request #179 from SwissDataScienceCenter/prefix-search
Configure text fields with 'edgeNGram' filter enabling prefix search
eikek authored Aug 13, 2024
2 parents e0a9154 + aa28055 commit 9d5a237
Showing 13 changed files with 209 additions and 41 deletions.
21 changes: 21 additions & 0 deletions modules/json/src/main/scala/io/renku/json/EncoderSupport.scala
@@ -85,7 +85,28 @@ object EncoderSupport {
val adds = AdditionalFields.const[A, V](field*)
Macros.createEncoder[String, V, A](adds)

/** Derives an encoder that writes all members of the target type as map members. It
* assumes an already open map!
*/
inline def deriveProductMemberEncoder[A <: Product](using
Mirror.ProductOf[A]
): Encoder[A] =
Macros.membersEncoder[A]

private object Macros {
final inline def membersEncoder[T](using
m: Mirror.ProductOf[T]
): Encoder[T] =
new Encoder[T] {
def write(w: Writer, value: T): Writer =
val encoders = summonEncoder[m.MirroredElemTypes]
val names = LabelsMacro.findLabels[T].toList
val values = value.asInstanceOf[Product].productIterator.toList
names.zip(values).zip(encoders).foreach { case ((k, v), e) =>
w.writeMapMember(k, v)(using Encoder[String], e.asInstanceOf[Encoder[Any]])
}
w
}

final inline def createEncoder[K: Encoder, V: Encoder, T](
additionalFields: AdditionalFields[T, V]
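A minimal usage sketch of the new `deriveProductMemberEncoder`: it lets a caller splice a product's fields into a map it is already writing. The `Person` type and the surrounding given are made up for illustration; the `Writer` calls are the same ones used elsewhere in this diff.

import io.bullet.borer.{Encoder, Writer}
import io.renku.json.EncoderSupport

final case class Person(name: String, age: Int)

// Open the map ourselves, add an extra member, then let the derived
// encoder write Person's fields into the still-open map.
given Encoder[Person] with
  def write(w: Writer, p: Person): Writer =
    w.writeMapStart()
    w.writeMapMember("kind", "person")
    EncoderSupport.deriveProductMemberEncoder[Person].write(w, p)
    w.writeMapClose()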
@@ -56,12 +56,42 @@ object EntityDocumentSchema:
// virtual score field
val score: FieldName = FieldName("score")

private object Analyzers {
val textIndex = Analyzer(
tokenizer = Tokenizer.uax29UrlEmail,
filters = Seq(
Filter.lowercase,
Filter.stop,
Filter.englishMinimalStem,
Filter.asciiFolding,
Filter.edgeNGram(Filter.EdgeNGramSettings(2, 8, true))
)
)
val textQuery = Analyzer(
tokenizer = Tokenizer.uax29UrlEmail,
filters = Seq(
Filter.lowercase,
Filter.stop,
Filter.englishMinimalStem,
Filter.asciiFolding
)
)
}

object FieldTypes:
val id: FieldType = FieldType.id(TypeName("SearchId")).makeDocValue
val string: FieldType = FieldType.str(TypeName("SearchString")).makeDocValue
val text: FieldType = FieldType.text(TypeName("SearchText"), Analyzer.defaultSearch)
val text: FieldType =
FieldType
.text(TypeName("SearchText"))
.withIndexAnalyzer(Analyzers.textIndex)
.withQueryAnalyzer(Analyzers.textQuery)
val textAll: FieldType =
FieldType.text(TypeName("SearchTextAll"), Analyzer.defaultSearch).makeMultiValued
FieldType
.text(TypeName("SearchTextAll"))
.withIndexAnalyzer(Analyzers.textIndex)
.withQueryAnalyzer(Analyzers.textQuery)
.makeMultiValued
val dateTime: FieldType = FieldType.dateTimePoint(TypeName("SearchDateTime"))

val initialEntityDocumentAdd: Seq[SchemaCommand] = Seq(
@@ -130,3 +160,8 @@ object EntityDocumentSchema:
SchemaCommand.Add(CopyFieldRule(Fields.groupEditors, Fields.membersAll)),
SchemaCommand.Add(CopyFieldRule(Fields.groupViewers, Fields.membersAll))
)

val replaceTextTypes: Seq[SchemaCommand] = Seq(
SchemaCommand.Replace(FieldTypes.text),
SchemaCommand.Replace(FieldTypes.textAll)
)
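The asymmetry between `Analyzers.textIndex` and `Analyzers.textQuery` above is what enables prefix search: `edgeNGram` runs only at index time, expanding each stored token into its prefixes, while query terms stay whole and match those prefixes directly. A hand-worked sketch of both chains (values derived from the `EdgeNGramSettings(2, 8, true)` configuration above, not captured from Solr output):

// Index time: "NeuroDesk" -> uax29UrlEmail -> lowercase -> ... -> edgeNGram(2, 8, preserveOriginal = true)
val indexedTokens = Seq("ne", "neu", "neur", "neuro", "neurod", "neurode", "neurodes", "neurodesk")
// Query time: "neuro" -> same chain, but without edgeNGram
val queryTokens = Seq("neuro")
// "neuro" appears among the indexed grams, so the partial word matches.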
@@ -30,6 +30,7 @@ object Migrations {
SchemaMigration(version = 5L, EntityDocumentSchema.keywordField),
SchemaMigration(version = 6L, EntityDocumentSchema.namespaceField),
SchemaMigration(version = 7L, EntityDocumentSchema.editorAndViewerRoles),
SchemaMigration(version = 8L, EntityDocumentSchema.groupRoles)
SchemaMigration(version = 8L, EntityDocumentSchema.groupRoles),
SchemaMigration(version = 9L, EntityDocumentSchema.replaceTextTypes)
)
}
@@ -179,3 +179,35 @@ class SearchSolrClientSpec extends CatsEffectSuite with SearchSolrSuite:
_ = assertEquals(memberResult.responseBody.docs.head.id, project.id)
_ = assertEquals(adminResult.responseBody.docs.head.id, project.id)
yield ()

test("search partial words"):
for
client <- IO(searchSolrClient())
project <- IO(
projectDocumentGen(
"NeuroDesk",
"This is a Neurodesk project",
Gen.const(None),
Gen.const(None),
Gen.const(Visibility.Public)
).generateOne
)
_ <- client.upsertSuccess(Seq(project))
result1 <- client.queryEntity(
SearchRole.anonymous,
Query(Query.Segment.text("neuro")),
1,
0
)
_ = assertEquals(result1.responseBody.docs.size, 1)
_ = assertEquals(result1.responseBody.docs.head.id, project.id)

result2 <- client.queryEntity(
SearchRole.anonymous,
Query(Query.Segment.nameIs("neuro")),
1,
0
)
_ = assertEquals(result2.responseBody.docs.size, 1)
_ = assertEquals(result2.responseBody.docs.head.id, project.id)
yield ()
@@ -19,31 +19,16 @@
package io.renku.solr.client.schema

// see https://solr.apache.org/guide/solr/latest/indexing-guide/analyzers.html
// https://solr.apache.org/guide/solr/latest/indexing-guide/schema-api.html#add-a-new-field-type

final case class Analyzer(
tokenizer: Tokenizer,
`type`: Analyzer.AnalyzerType = Analyzer.AnalyzerType.None,
filters: Seq[Filter] = Nil
)

object Analyzer:
enum AnalyzerType:
case Index
case Multiterm
case Query
case None

object AnalyzerType:
def fromString(str: String): Either[String, AnalyzerType] =
AnalyzerType.values
.find(_.productPrefix.equalsIgnoreCase(str))
.toRight(s"Invalid analyzer type: $str")

def index(tokenizer: Tokenizer, filters: Filter*): Analyzer =
Analyzer(tokenizer, AnalyzerType.Index, filters)

def query(tokenizer: Tokenizer, filters: Filter*): Analyzer =
Analyzer(tokenizer, AnalyzerType.Query, filters)
def create(tokenizer: Tokenizer, filters: Filter*): Analyzer =
Analyzer(tokenizer, filters)

val classic: Analyzer = Analyzer(Tokenizer.classic, filters = List(Filter.classic))

@@ -21,7 +21,8 @@ package io.renku.solr.client.schema
final case class FieldType(
name: TypeName,
`class`: FieldTypeClass,
analyzer: Option[Analyzer] = None,
indexAnalyzer: Option[Analyzer] = None,
queryAnalyzer: Option[Analyzer] = None,
required: Boolean = false,
indexed: Boolean = true,
stored: Boolean = true,
@@ -33,13 +34,22 @@ final case class FieldType(
lazy val makeDocValue: FieldType = copy(docValues = true)
lazy val makeMultiValued: FieldType = copy(multiValued = true)

def withQueryAnalyzer(a: Analyzer): FieldType =
copy(queryAnalyzer = Some(a))

def withIndexAnalyzer(a: Analyzer): FieldType =
copy(indexAnalyzer = Some(a))

def withAnalyzer(a: Analyzer): FieldType =
withQueryAnalyzer(a).withIndexAnalyzer(a)

object FieldType:

def id(name: TypeName): FieldType =
FieldType(name, FieldTypeClass.Defaults.strField)

def text(name: TypeName, analyzer: Analyzer): FieldType =
FieldType(name, FieldTypeClass.Defaults.textField, analyzer = Some(analyzer))
def text(name: TypeName): FieldType =
FieldType(name, FieldTypeClass.Defaults.textField)

def str(name: TypeName): FieldType =
FieldType(name, FieldTypeClass.Defaults.strField)
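With `AnalyzerType` removed, the index/query distinction now lives on `FieldType` rather than on `Analyzer` itself. A short sketch of the resulting API, using only constructors that appear in this diff (the `TypeName` is made up for illustration):

val plain = Analyzer.create(Tokenizer.uax29UrlEmail, Filter.lowercase, Filter.asciiFolding)

// Different analyzers per side:
val ft = FieldType
  .text(TypeName("myText"))
  .withIndexAnalyzer(plain)
  .withQueryAnalyzer(plain)

// Shorthand when both sides share one analyzer:
val ft2 = FieldType.text(TypeName("myText")).withAnalyzer(plain)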
@@ -18,9 +18,12 @@

package io.renku.solr.client.schema

import scala.compiletime.*
import scala.deriving.Mirror

// see https://solr.apache.org/guide/solr/latest/indexing-guide/filters.html

final case class Filter(name: String)
final case class Filter(name: String, settings: Option[Filter.Settings] = None)

object Filter:
val asciiFolding: Filter = Filter("asciiFolding")
@@ -30,3 +33,42 @@ object Filter:
val classic: Filter = Filter("classic")
val daitchMokotoffSoundex: Filter = Filter("daitchMokotoffSoundex")
val doubleMetaphone: Filter = Filter("doubleMetaphone")
val nGram: Filter = Filter("nGram")
def edgeNGram(cfg: EdgeNGramSettings): Filter =
val settings = Macros.settingsOf(cfg)
Filter("edgeNGram", settings)

/** Settings specific to a filter */
opaque type Settings = Map[String, String]
object Settings {
def createFromMap(m: Map[String, String]): Option[Settings] =
if (m.isEmpty) None else Some(m)
extension (self: Settings)
def asMap: Map[String, String] = self
def get(key: String): Option[String] = self.get(key)
}

final case class EdgeNGramSettings(
minGramSize: Int = 3,
maxGramSize: Int = 6,
preserveOriginal: Boolean = true
)

// Solr encodes settings as strings. Schema requests accept both
// JSON numbers and JSON strings; however, when querying the Solr
// schema, all filter settings come back as strings, i.e.
// `"maxGramSize":"6"` instead of `"maxGramSize":6`. So every
// setting here is encoded as a simple `Map[String, String]`.
// Creating such settings should always go through a specific type,
// like `EdgeNGramSettings`.
private object Macros {
// This macro converts a case class into a Map[String, String] by
// stringifying each member via its `toString` method.
inline def settingsOf[A <: Product](value: A)(using
m: Mirror.ProductOf[A]
): Option[Settings] =
val values = value.asInstanceOf[Product].productIterator.toList
val labels = constValueTuple[m.MirroredElemLabels]
val kv = labels.toList.zip(values).map { case (k, v) => k.toString -> v.toString }
Settings.createFromMap(kv.toMap)
}
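Worked out by hand from the definitions above, the macro turns the default `EdgeNGramSettings` into a plain string map (the same round trip is asserted in `BorerJsonCodecTest` below):

val f = Filter.edgeNGram(Filter.EdgeNGramSettings())
// f.name == "edgeNGram"
// f.settings.map(_.asMap) == Some(Map(
//   "minGramSize"      -> "3",
//   "maxGramSize"      -> "6",
//   "preserveOriginal" -> "true"
// ))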
@@ -25,15 +25,20 @@ enum SchemaCommand:
case DeleteField(name: FieldName)
case DeleteType(name: TypeName)
case DeleteDynamicField(name: FieldName)
case Replace(element: SchemaCommand.ReplaceElem)

def commandName: String = this match
case Add(_: Field) => "add-field"
case Add(_: FieldType) => "add-field-type"
case Add(_: DynamicFieldRule) => "add-dynamic-field"
case Add(_: CopyFieldRule) => "add-copy-field"
case _: DeleteField => "delete-field"
case _: DeleteType => "delete-field-type"
case _: DeleteDynamicField => "delete-dynamic-field"
case Add(_: Field) => "add-field"
case Add(_: FieldType) => "add-field-type"
case Add(_: DynamicFieldRule) => "add-dynamic-field"
case Add(_: CopyFieldRule) => "add-copy-field"
case Replace(_: Field) => "replace-field"
case Replace(_: FieldType) => "replace-field-type"
case Replace(_: DynamicFieldRule) => "replace-dynamic-field"
case _: DeleteField => "delete-field"
case _: DeleteType => "delete-field-type"
case _: DeleteDynamicField => "delete-dynamic-field"

object SchemaCommand:
type Element = FieldType | Field | DynamicFieldRule | CopyFieldRule
type ReplaceElem = FieldType | Field | DynamicFieldRule
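Tying this back to the `replaceTextTypes` migration: wrapping a `FieldType` in `SchemaCommand.Replace` makes `commandName` render Solr's `replace-field-type` verb, so the codec emits a request body shaped roughly as sketched here (attributes abbreviated; the exact class string comes from `FieldTypeClass.Defaults.textField`):

val cmd = SchemaCommand.Replace(EntityDocumentSchema.FieldTypes.text)
// cmd.commandName == "replace-field-type"
// serialized, schematically:
//   {"replace-field-type": {"name": "SearchText", "class": "solr.TextField", ...}}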
@@ -19,6 +19,7 @@
package io.renku.solr.client.schema

import io.bullet.borer.NullOptions.given
import io.bullet.borer.Reader
import io.bullet.borer.derivation.MapBasedCodecs
import io.bullet.borer.{Decoder, Encoder, Writer}
import io.renku.solr.client.schema.SchemaCommand.Element
@@ -28,13 +29,24 @@ trait SchemaJsonCodec {
given Encoder[Tokenizer] = MapBasedCodecs.deriveEncoder
given Decoder[Tokenizer] = MapBasedCodecs.deriveDecoder

given Encoder[Filter] = MapBasedCodecs.deriveEncoder
given Decoder[Filter] = MapBasedCodecs.deriveDecoder
given Encoder[Filter] = { (w: Writer, value: Filter) =>
w.writeMapStart()
w.writeMapMember("name", value.name)
value.settings match {
case None => ()
case Some(s) =>
s.asMap.foreach { case (k, v) =>
w.writeMapMember(k, v)
}
}
w.writeMapClose()
}

given Encoder[Analyzer.AnalyzerType] =
Encoder.forString.contramap(_.productPrefix.toLowerCase)
given Decoder[Analyzer.AnalyzerType] =
Decoder.forString.mapEither(Analyzer.AnalyzerType.fromString)
given Decoder[Filter] = Decoder.forMap[String, String].mapOption { data =>
data.get("name").map { name =>
Filter(name, Filter.Settings.createFromMap(data.removed("name")))
}
}

given Encoder[Analyzer] = MapBasedCodecs.deriveEncoder
given Decoder[Analyzer] = MapBasedCodecs.deriveDecoder
@@ -72,6 +84,8 @@ trait SchemaJsonCodec {
value match
case SchemaCommand.Add(v) =>
e.write(w, v)
case SchemaCommand.Replace(v) =>
e.write(w, v)
case SchemaCommand.DeleteType(tn) =>
w.writeMap(Map("name" -> tn))
case SchemaCommand.DeleteField(fn) =>
@@ -36,7 +36,9 @@ class SearchCaseInsensitiveSpec extends CatsEffectSuite with SolrClientBaseSuite
List(solrServer, solrClient)

private val migrations = Seq(
SchemaCommand.Add(FieldType.text(TypeName("my_text_field"), Analyzer.defaultSearch)),
SchemaCommand.Add(
FieldType.text(TypeName("my_text_field")).withAnalyzer(Analyzer.defaultSearch)
),
SchemaCommand.Add(Field(FieldName("my_name"), TypeName("my_text_field")))
)

@@ -78,7 +78,9 @@ class SolrClientSpec

test("use schema for inserting and querying") {
val cmds = Seq(
SchemaCommand.Add(FieldType.text(TypeName("roomText"), Analyzer.classic)),
SchemaCommand.Add(
FieldType.text(TypeName("roomText")).withAnalyzer(Analyzer.classic)
),
SchemaCommand.Add(FieldType.int(TypeName("roomInt"))),
SchemaCommand.Add(Field(FieldName("roomName"), TypeName("roomText"))),
SchemaCommand.Add(Field(FieldName("roomDescription"), TypeName("roomText"))),
@@ -33,7 +33,10 @@ class SolrMigratorSpec extends CatsEffectSuite with SolrClientBaseSuite:
List(solrServer, solrClient)

private val migrations = Seq(
SchemaMigration(-5, Add(FieldType.text(TypeName("testText"), Analyzer.classic))),
SchemaMigration(
-5,
Add(FieldType.text(TypeName("testText")).withAnalyzer(Analyzer.classic))
),
SchemaMigration(-4, Add(FieldType.int(TypeName("testInt")))),
SchemaMigration(-3, Add(Field(FieldName("testName"), TypeName("testText")))),
SchemaMigration(-2, Add(Field(FieldName("testDescription"), TypeName("testText")))),
@@ -63,4 +63,20 @@ class BorerJsonCodecTest extends FunSuite with SchemaJsonCodec {
assertEquals(result.schema.fieldTypes.size, 73)
assert(result.schema.fields.exists(_.name == FieldName("_kind")))
assert(result.schema.copyFields.exists(_.source == FieldName("description")))

test("encode filter with settings"):
val cfg = Filter.EdgeNGramSettings()
val ft = Filter.edgeNGram(cfg)
val json = Json.encode(ft).toUtf8String
assertEquals(
json,
s"""{"name":"edgeNGram","minGramSize":"${cfg.minGramSize}","maxGramSize":"${cfg.maxGramSize}","preserveOriginal":"${cfg.preserveOriginal}"}"""
)

test("decode filter with settings"):
val jsonStr =
"""{"name":"edgeNGram","minGramSize":"3","maxGramSize":"6","preserveOriginal":"true"}"""
val result = Json.decode(jsonStr.getBytes()).to[Filter].value
val expect = Filter.edgeNGram(Filter.EdgeNGramSettings(3, 6, true))
assertEquals(result, expect)
}
