From aa280558ae4de919646656ddd0fee291e64cfd81 Mon Sep 17 00:00:00 2001
From: Eike Kettner
Date: Wed, 7 Aug 2024 15:24:53 +0200
Subject: [PATCH] Configure text fields with 'edgeNGram' filter

This filter adds prefixes of words to the index, so a normal query
finds documents by word prefixes of 2 up to 8 letters.
---
 .../scala/io/renku/json/EncoderSupport.scala  | 21 +++++++++
 .../solr/schema/EntityDocumentSchema.scala    | 39 +++++++++++++++-
 .../renku/search/solr/schema/Migrations.scala |  3 +-
 .../solr/client/SearchSolrClientSpec.scala    | 32 ++++++++++++++
 .../renku/solr/client/schema/Analyzer.scala   | 21 ++-------
 .../renku/solr/client/schema/FieldType.scala  | 16 +++--
 .../io/renku/solr/client/schema/Filter.scala  | 44 ++++++++++++++++++-
 .../solr/client/schema/SchemaCommand.scala    | 19 +++---
 .../solr/client/schema/SchemaJsonCodec.scala  | 26 ++++++++---
 .../client/SearchCaseInsensitiveSpec.scala    |  4 +-
 .../io/renku/solr/client/SolrClientSpec.scala |  4 +-
 .../client/migration/SolrMigratorSpec.scala   |  5 ++-
 .../client/schema/BorerJsonCodecTest.scala    | 16 +++++++
 13 files changed, 209 insertions(+), 41 deletions(-)

diff --git a/modules/json/src/main/scala/io/renku/json/EncoderSupport.scala b/modules/json/src/main/scala/io/renku/json/EncoderSupport.scala
index 76687ffd..3cd1fa8e 100644
--- a/modules/json/src/main/scala/io/renku/json/EncoderSupport.scala
+++ b/modules/json/src/main/scala/io/renku/json/EncoderSupport.scala
@@ -85,7 +85,28 @@ object EncoderSupport {
     val adds = AdditionalFields.const[A, V](field*)
     Macros.createEncoder[String, V, A](adds)
 
+  /** Derives an encoder that writes all members of the target type as map members. It
+    * assumes an already open map!
+    */
+  inline def deriveProductMemberEncoder[A <: Product](using
+      Mirror.ProductOf[A]
+  ): Encoder[A] =
+    Macros.membersEncoder[A]
+
   private object Macros {
+    final inline def membersEncoder[T](using
+        m: Mirror.ProductOf[T]
+    ): Encoder[T] =
+      new Encoder[T] {
+        def write(w: Writer, value: T): Writer =
+          val encoders = summonEncoder[m.MirroredElemTypes]
+          val names = LabelsMacro.findLabels[T].toList
+          val values = value.asInstanceOf[Product].productIterator.toList
+          names.zip(values).zip(encoders).foreach { case ((k, v), e) =>
+            w.writeMapMember(k, v)(using Encoder[String], e.asInstanceOf[Encoder[Any]])
+          }
+          w
+      }
 
     final inline def createEncoder[K: Encoder, V: Encoder, T](
         additionalFields: AdditionalFields[T, V]
diff --git a/modules/search-solr-client/src/main/scala/io/renku/search/solr/schema/EntityDocumentSchema.scala b/modules/search-solr-client/src/main/scala/io/renku/search/solr/schema/EntityDocumentSchema.scala
index 03e3489c..1e6d7cc7 100644
--- a/modules/search-solr-client/src/main/scala/io/renku/search/solr/schema/EntityDocumentSchema.scala
+++ b/modules/search-solr-client/src/main/scala/io/renku/search/solr/schema/EntityDocumentSchema.scala
@@ -56,12 +56,42 @@ object EntityDocumentSchema:
     // virtual score field
     val score: FieldName = FieldName("score")
 
+  private object Analyzers {
+    val textIndex = Analyzer(
+      tokenizer = Tokenizer.uax29UrlEmail,
+      filters = Seq(
+        Filter.lowercase,
+        Filter.stop,
+        Filter.englishMinimalStem,
+        Filter.asciiFolding,
+        Filter.edgeNGram(Filter.EdgeNGramSettings(2, 8, true))
+      )
+    )
+    val textQuery = Analyzer(
+      tokenizer = Tokenizer.uax29UrlEmail,
+      filters = Seq(
+        Filter.lowercase,
+        Filter.stop,
+        Filter.englishMinimalStem,
+        Filter.asciiFolding
+      )
+    )
+  }
+
   object FieldTypes:
     val id: FieldType = FieldType.id(TypeName("SearchId")).makeDocValue
     val string: FieldType =
      FieldType.str(TypeName("SearchString")).makeDocValue
-    val text: FieldType = FieldType.text(TypeName("SearchText"), Analyzer.defaultSearch)
+    val text: FieldType =
+      FieldType
+        .text(TypeName("SearchText"))
+        .withIndexAnalyzer(Analyzers.textIndex)
+        .withQueryAnalyzer(Analyzers.textQuery)
     val textAll: FieldType =
-      FieldType.text(TypeName("SearchTextAll"), Analyzer.defaultSearch).makeMultiValued
+      FieldType
+        .text(TypeName("SearchTextAll"))
+        .withIndexAnalyzer(Analyzers.textIndex)
+        .withQueryAnalyzer(Analyzers.textQuery)
+        .makeMultiValued
     val dateTime: FieldType = FieldType.dateTimePoint(TypeName("SearchDateTime"))
 
   val initialEntityDocumentAdd: Seq[SchemaCommand] = Seq(
@@ -130,3 +160,8 @@ object EntityDocumentSchema:
     SchemaCommand.Add(CopyFieldRule(Fields.groupEditors, Fields.membersAll)),
     SchemaCommand.Add(CopyFieldRule(Fields.groupViewers, Fields.membersAll))
   )
+
+  val replaceTextTypes: Seq[SchemaCommand] = Seq(
+    SchemaCommand.Replace(FieldTypes.text),
+    SchemaCommand.Replace(FieldTypes.textAll)
+  )
diff --git a/modules/search-solr-client/src/main/scala/io/renku/search/solr/schema/Migrations.scala b/modules/search-solr-client/src/main/scala/io/renku/search/solr/schema/Migrations.scala
index 6195c591..797a0339 100644
--- a/modules/search-solr-client/src/main/scala/io/renku/search/solr/schema/Migrations.scala
+++ b/modules/search-solr-client/src/main/scala/io/renku/search/solr/schema/Migrations.scala
@@ -30,6 +30,7 @@ object Migrations {
     SchemaMigration(version = 5L, EntityDocumentSchema.keywordField),
     SchemaMigration(version = 6L, EntityDocumentSchema.namespaceField),
     SchemaMigration(version = 7L, EntityDocumentSchema.editorAndViewerRoles),
-    SchemaMigration(version = 8L, EntityDocumentSchema.groupRoles)
+    SchemaMigration(version = 8L, EntityDocumentSchema.groupRoles),
+    SchemaMigration(version = 9L, EntityDocumentSchema.replaceTextTypes)
   )
 }
diff --git a/modules/search-solr-client/src/test/scala/io/renku/search/solr/client/SearchSolrClientSpec.scala b/modules/search-solr-client/src/test/scala/io/renku/search/solr/client/SearchSolrClientSpec.scala
index bb137d6b..b064d88e 100644
--- a/modules/search-solr-client/src/test/scala/io/renku/search/solr/client/SearchSolrClientSpec.scala
+++ b/modules/search-solr-client/src/test/scala/io/renku/search/solr/client/SearchSolrClientSpec.scala
@@ -179,3 +179,35 @@ class SearchSolrClientSpec extends CatsEffectSuite with SearchSolrSuite:
       _ = assertEquals(memberResult.responseBody.docs.head.id, project.id)
       _ = assertEquals(adminResult.responseBody.docs.head.id, project.id)
     yield ()
+
+  test("search partial words"):
+    for
+      client <- IO(searchSolrClient())
+      project <- IO(
+        projectDocumentGen(
+          "NeuroDesk",
+          "This is a Neurodesk project",
+          Gen.const(None),
+          Gen.const(None),
+          Gen.const(Visibility.Public)
+        ).generateOne
+      )
+      _ <- client.upsertSuccess(Seq(project))
+      result1 <- client.queryEntity(
+        SearchRole.anonymous,
+        Query(Query.Segment.text("neuro")),
+        1,
+        0
+      )
+      _ = assertEquals(result1.responseBody.docs.size, 1)
+      _ = assertEquals(result1.responseBody.docs.head.id, project.id)
+
+      result2 <- client.queryEntity(
+        SearchRole.anonymous,
+        Query(Query.Segment.nameIs("neuro")),
+        1,
+        0
+      )
+      _ = assertEquals(result2.responseBody.docs.size, 1)
+      _ = assertEquals(result2.responseBody.docs.head.id, project.id)
+    yield ()
diff --git a/modules/solr-client/src/main/scala/io/renku/solr/client/schema/Analyzer.scala b/modules/solr-client/src/main/scala/io/renku/solr/client/schema/Analyzer.scala
index 333d3558..2a8e1b38 100644
--- a/modules/solr-client/src/main/scala/io/renku/solr/client/schema/Analyzer.scala
+++ b/modules/solr-client/src/main/scala/io/renku/solr/client/schema/Analyzer.scala
@@ -19,31 +19,16 @@
 package io.renku.solr.client.schema
 
 // see https://solr.apache.org/guide/solr/latest/indexing-guide/analyzers.html
+// https://solr.apache.org/guide/solr/latest/indexing-guide/schema-api.html#add-a-new-field-type
 final case class Analyzer(
     tokenizer: Tokenizer,
-    `type`: Analyzer.AnalyzerType = Analyzer.AnalyzerType.None,
     filters: Seq[Filter] = Nil
 )
 
 object Analyzer:
-  enum AnalyzerType:
-    case Index
-    case Multiterm
-    case Query
-    case None
-
-  object AnalyzerType:
-    def fromString(str: String): Either[String, AnalyzerType] =
-      AnalyzerType.values
-        .find(_.productPrefix.equalsIgnoreCase(str))
-        .toRight(s"Invalid analyzer type: $str")
-
-  def index(tokenizer: Tokenizer, filters: Filter*): Analyzer =
-    Analyzer(tokenizer, AnalyzerType.Index, filters)
-
-  def query(tokenizer: Tokenizer, filters: Filter*): Analyzer =
-    Analyzer(tokenizer, AnalyzerType.Query, filters)
+  def create(tokenizer: Tokenizer, filters: Filter*): Analyzer =
+    Analyzer(tokenizer, filters)
 
   val classic: Analyzer = Analyzer(Tokenizer.classic, filters = List(Filter.classic))
diff --git a/modules/solr-client/src/main/scala/io/renku/solr/client/schema/FieldType.scala b/modules/solr-client/src/main/scala/io/renku/solr/client/schema/FieldType.scala
index dcb79766..f4a81d71 100644
--- a/modules/solr-client/src/main/scala/io/renku/solr/client/schema/FieldType.scala
+++ b/modules/solr-client/src/main/scala/io/renku/solr/client/schema/FieldType.scala
@@ -21,7 +21,8 @@ package io.renku.solr.client.schema
 final case class FieldType(
     name: TypeName,
     `class`: FieldTypeClass,
-    analyzer: Option[Analyzer] = None,
+    indexAnalyzer: Option[Analyzer] = None,
+    queryAnalyzer: Option[Analyzer] = None,
     required: Boolean = false,
     indexed: Boolean = true,
     stored: Boolean = true,
@@ -33,13 +34,22 @@ final case class FieldType(
   lazy val makeDocValue: FieldType = copy(docValues = true)
   lazy val makeMultiValued: FieldType = copy(multiValued = true)
 
+  def withQueryAnalyzer(a: Analyzer): FieldType =
+    copy(queryAnalyzer = Some(a))
+
+  def withIndexAnalyzer(a: Analyzer): FieldType =
+    copy(indexAnalyzer = Some(a))
+
+  def withAnalyzer(a: Analyzer): FieldType =
+    withQueryAnalyzer(a).withIndexAnalyzer(a)
+
 object FieldType:
   def id(name: TypeName): FieldType =
     FieldType(name, FieldTypeClass.Defaults.strField)
 
-  def text(name: TypeName, analyzer: Analyzer): FieldType =
-    FieldType(name, FieldTypeClass.Defaults.textField, analyzer = Some(analyzer))
+  def text(name: TypeName): FieldType =
+    FieldType(name, FieldTypeClass.Defaults.textField)
 
   def str(name: TypeName): FieldType =
     FieldType(name, FieldTypeClass.Defaults.strField)
diff --git a/modules/solr-client/src/main/scala/io/renku/solr/client/schema/Filter.scala b/modules/solr-client/src/main/scala/io/renku/solr/client/schema/Filter.scala
index a40c5579..53eefee0 100644
--- a/modules/solr-client/src/main/scala/io/renku/solr/client/schema/Filter.scala
+++ b/modules/solr-client/src/main/scala/io/renku/solr/client/schema/Filter.scala
@@ -18,9 +18,12 @@
 
 package io.renku.solr.client.schema
 
+import scala.compiletime.*
+import scala.deriving.Mirror
+
 // see https://solr.apache.org/guide/solr/latest/indexing-guide/filters.html
 
-final case class Filter(name: String)
+final case class Filter(name: String, settings: Option[Filter.Settings] = None)
 
 object Filter:
   val asciiFolding: Filter = Filter("asciiFolding")
@@ -30,3 +33,42 @@ object Filter:
   val classic: Filter = Filter("classic")
   val daitchMokotoffSoundex: Filter = Filter("daitchMokotoffSoundex")
   val doubleMetaphone: Filter = Filter("doubleMetaphone")
+  val nGram: Filter = Filter("nGram")
+
+  def edgeNGram(cfg: EdgeNGramSettings): Filter =
+    val settings = Macros.settingsOf(cfg)
+    Filter("edgeNGram", settings)
+
+  /** Settings specific to a filter */
+  opaque type Settings = Map[String, String]
+  object Settings {
+    def createFromMap(m: Map[String, String]): Option[Settings] =
+      if (m.isEmpty) None else Some(m)
+    extension (self: Settings)
+      def asMap: Map[String, String] = self
+      def get(key: String): Option[String] = self.get(key)
+  }
+
+  final case class EdgeNGramSettings(
+      minGramSize: Int = 3,
+      maxGramSize: Int = 6,
+      preserveOriginal: Boolean = true
+  )
+
+  // Solr encodes filter settings as strings. Schema requests accept both
+  // JSON numbers and JSON strings, but when querying the Solr schema it
+  // returns all filter settings as strings, i.e. `"maxGramSize":"6"`
+  // instead of `"maxGramSize":6`. So every setting is encoded here as a
+  // simple `Map[String, String]`. Creating such settings should always
+  // happen via a specific type, like `EdgeNGramSettings`.
+  private object Macros {
+    // This macro converts a case class into a Map[String, String] by
+    // putting each member into the map using its `toString` value.
+    inline def settingsOf[A <: Product](value: A)(using
+        m: Mirror.ProductOf[A]
+    ): Option[Settings] =
+      val values = value.asInstanceOf[Product].productIterator.toList
+      val labels = constValueTuple[m.MirroredElemLabels]
+      val kv = labels.toList.zip(values).map { case (k, v) => k.toString -> v.toString }
+      Settings.createFromMap(kv.toMap)
+  }
diff --git a/modules/solr-client/src/main/scala/io/renku/solr/client/schema/SchemaCommand.scala b/modules/solr-client/src/main/scala/io/renku/solr/client/schema/SchemaCommand.scala
index f82d10ca..c81f93f8 100644
--- a/modules/solr-client/src/main/scala/io/renku/solr/client/schema/SchemaCommand.scala
+++ b/modules/solr-client/src/main/scala/io/renku/solr/client/schema/SchemaCommand.scala
@@ -25,15 +25,20 @@ enum SchemaCommand:
   case DeleteField(name: FieldName)
   case DeleteType(name: TypeName)
   case DeleteDynamicField(name: FieldName)
+  case Replace(element: SchemaCommand.ReplaceElem)
 
   def commandName: String = this match
-    case Add(_: Field)            => "add-field"
-    case Add(_: FieldType)        => "add-field-type"
-    case Add(_: DynamicFieldRule) => "add-dynamic-field"
-    case Add(_: CopyFieldRule)    => "add-copy-field"
-    case _: DeleteField           => "delete-field"
-    case _: DeleteType            => "delete-field-type"
-    case _: DeleteDynamicField    => "delete-dynamic-field"
+    case Add(_: Field)                => "add-field"
+    case Add(_: FieldType)            => "add-field-type"
+    case Add(_: DynamicFieldRule)     => "add-dynamic-field"
+    case Add(_: CopyFieldRule)        => "add-copy-field"
+    case Replace(_: Field)            => "replace-field"
+    case Replace(_: FieldType)        => "replace-field-type"
+    case Replace(_: DynamicFieldRule) => "replace-dynamic-field"
+    case _: DeleteField               => "delete-field"
+    case _: DeleteType                => "delete-field-type"
+    case _: DeleteDynamicField        => "delete-dynamic-field"
 
 object SchemaCommand:
   type Element = FieldType | Field | DynamicFieldRule | CopyFieldRule
+  type ReplaceElem = FieldType | Field | DynamicFieldRule
diff --git a/modules/solr-client/src/main/scala/io/renku/solr/client/schema/SchemaJsonCodec.scala b/modules/solr-client/src/main/scala/io/renku/solr/client/schema/SchemaJsonCodec.scala
index 9c2d8c80..98bd6fff 100644
--- a/modules/solr-client/src/main/scala/io/renku/solr/client/schema/SchemaJsonCodec.scala
+++ b/modules/solr-client/src/main/scala/io/renku/solr/client/schema/SchemaJsonCodec.scala
@@ -19,6 +19,7 @@
 package io.renku.solr.client.schema
 
 import io.bullet.borer.NullOptions.given
+import io.bullet.borer.Reader
 import io.bullet.borer.derivation.MapBasedCodecs
 import io.bullet.borer.{Decoder, Encoder, Writer}
 import io.renku.solr.client.schema.SchemaCommand.Element
@@ -28,13 +29,24 @@ trait SchemaJsonCodec {
   given Encoder[Tokenizer] = MapBasedCodecs.deriveEncoder
   given Decoder[Tokenizer] = MapBasedCodecs.deriveDecoder
 
-  given Encoder[Filter] = MapBasedCodecs.deriveEncoder
-  given Decoder[Filter] = MapBasedCodecs.deriveDecoder
+  given Encoder[Filter] = { (w: Writer, value: Filter) =>
+    w.writeMapStart()
+    w.writeMapMember("name", value.name)
+    value.settings match {
+      case None => ()
+      case Some(s) =>
+        s.asMap.foreach { case (k, v) =>
+          w.writeMapMember(k, v)
+        }
+    }
+    w.writeMapClose()
+  }
 
-  given Encoder[Analyzer.AnalyzerType] =
-    Encoder.forString.contramap(_.productPrefix.toLowerCase)
-  given Decoder[Analyzer.AnalyzerType] =
-    Decoder.forString.mapEither(Analyzer.AnalyzerType.fromString)
+  given Decoder[Filter] = Decoder.forMap[String, String].mapOption { data =>
+    data.get("name").map { name =>
+      Filter(name, Filter.Settings.createFromMap(data.removed("name")))
+    }
+  }
 
   given Encoder[Analyzer] = MapBasedCodecs.deriveEncoder
   given Decoder[Analyzer] = MapBasedCodecs.deriveDecoder
@@ -72,6 +84,8 @@ trait SchemaJsonCodec {
       value match
         case SchemaCommand.Add(v) =>
           e.write(w, v)
+        case SchemaCommand.Replace(v) =>
+          e.write(w, v)
         case SchemaCommand.DeleteType(tn) =>
           w.writeMap(Map("name" -> tn))
         case SchemaCommand.DeleteField(fn) =>
diff --git a/modules/solr-client/src/test/scala/io/renku/solr/client/SearchCaseInsensitiveSpec.scala b/modules/solr-client/src/test/scala/io/renku/solr/client/SearchCaseInsensitiveSpec.scala
index 3ef33396..d3f2cf3d 100644
--- a/modules/solr-client/src/test/scala/io/renku/solr/client/SearchCaseInsensitiveSpec.scala
+++ b/modules/solr-client/src/test/scala/io/renku/solr/client/SearchCaseInsensitiveSpec.scala
@@ -36,7 +36,9 @@ class SearchCaseInsensitiveSpec extends CatsEffectSuite with SolrClientBaseSuite
     List(solrServer, solrClient)
 
   private val migrations = Seq(
-    SchemaCommand.Add(FieldType.text(TypeName("my_text_field"), Analyzer.defaultSearch)),
+    SchemaCommand.Add(
+      FieldType.text(TypeName("my_text_field")).withAnalyzer(Analyzer.defaultSearch)
+    ),
     SchemaCommand.Add(Field(FieldName("my_name"), TypeName("my_text_field")))
   )
 
diff --git a/modules/solr-client/src/test/scala/io/renku/solr/client/SolrClientSpec.scala b/modules/solr-client/src/test/scala/io/renku/solr/client/SolrClientSpec.scala
index 8f5fbd26..23a29cd2 100644
--- a/modules/solr-client/src/test/scala/io/renku/solr/client/SolrClientSpec.scala
+++ b/modules/solr-client/src/test/scala/io/renku/solr/client/SolrClientSpec.scala
@@ -78,7 +78,9 @@ class SolrClientSpec
 
   test("use schema for inserting and querying") {
     val cmds = Seq(
-      SchemaCommand.Add(FieldType.text(TypeName("roomText"), Analyzer.classic)),
+      SchemaCommand.Add(
+        FieldType.text(TypeName("roomText")).withAnalyzer(Analyzer.classic)
+      ),
       SchemaCommand.Add(FieldType.int(TypeName("roomInt"))),
       SchemaCommand.Add(Field(FieldName("roomName"), TypeName("roomText"))),
       SchemaCommand.Add(Field(FieldName("roomDescription"), TypeName("roomText"))),
diff --git a/modules/solr-client/src/test/scala/io/renku/solr/client/migration/SolrMigratorSpec.scala b/modules/solr-client/src/test/scala/io/renku/solr/client/migration/SolrMigratorSpec.scala
index 6cf59d67..294cae09 100644
--- a/modules/solr-client/src/test/scala/io/renku/solr/client/migration/SolrMigratorSpec.scala
+++ b/modules/solr-client/src/test/scala/io/renku/solr/client/migration/SolrMigratorSpec.scala
@@ -33,7 +33,10 @@ class SolrMigratorSpec extends CatsEffectSuite with SolrClientBaseSuite:
     List(solrServer, solrClient)
 
   private val migrations = Seq(
-    SchemaMigration(-5, Add(FieldType.text(TypeName("testText"), Analyzer.classic))),
+    SchemaMigration(
+      -5,
+      Add(FieldType.text(TypeName("testText")).withAnalyzer(Analyzer.classic))
+    ),
     SchemaMigration(-4, Add(FieldType.int(TypeName("testInt")))),
     SchemaMigration(-3, Add(Field(FieldName("testName"), TypeName("testText")))),
     SchemaMigration(-2, Add(Field(FieldName("testDescription"), TypeName("testText")))),
diff --git a/modules/solr-client/src/test/scala/io/renku/solr/client/schema/BorerJsonCodecTest.scala b/modules/solr-client/src/test/scala/io/renku/solr/client/schema/BorerJsonCodecTest.scala
index 57827367..8ca0f98f 100644
--- a/modules/solr-client/src/test/scala/io/renku/solr/client/schema/BorerJsonCodecTest.scala
+++ b/modules/solr-client/src/test/scala/io/renku/solr/client/schema/BorerJsonCodecTest.scala
@@ -63,4 +63,20 @@ class BorerJsonCodecTest extends FunSuite with SchemaJsonCodec {
     assertEquals(result.schema.fieldTypes.size, 73)
     assert(result.schema.fields.exists(_.name == FieldName("_kind")))
     assert(result.schema.copyFields.exists(_.source == FieldName("description")))
+
+  test("encode filter with settings"):
+    val cfg = Filter.EdgeNGramSettings()
+    val ft = Filter.edgeNGram(cfg)
+    val json = Json.encode(ft).toUtf8String
+    assertEquals(
+      json,
+      s"""{"name":"edgeNGram","minGramSize":"${cfg.minGramSize}","maxGramSize":"${cfg.maxGramSize}","preserveOriginal":"${cfg.preserveOriginal}"}"""
+    )
+
+  test("decode filter with settings"):
+    val jsonStr =
+      """{"name":"edgeNGram","minGramSize":"3","maxGramSize":"6","preserveOriginal":"true"}"""
+    val result = Json.decode(jsonStr.getBytes()).to[Filter].value
+    val expect = Filter.edgeNGram(Filter.EdgeNGramSettings(3, 6, true))
+    assertEquals(result, expect)
 }
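
Editor's note, not part of the patch: a minimal sketch of how the pieces above fit together. The edgeNGram filter is applied only by the index-time analyzer, so the index stores 2 to 8 character prefixes of each token, while the query-time analyzer leaves the search term untouched and matches it against those stored prefixes. The object name and TypeName below are made up for illustration; Analyzer, Tokenizer.uax29UrlEmail, Filter.lowercase, Filter.edgeNGram, FieldType.text, withIndexAnalyzer, withQueryAnalyzer and SchemaCommand.Replace come from this patch.

import io.renku.solr.client.schema.*

object EdgeNGramExample:
  // Index-time analyzer: emits 2 to 8 character prefixes per token, so
  // "neurodesk" is indexed as "ne", "neu", ..., plus the original token
  // because preserveOriginal = true.
  private val indexAnalyzer = Analyzer(
    tokenizer = Tokenizer.uax29UrlEmail,
    filters = Seq(
      Filter.lowercase,
      Filter.edgeNGram(
        Filter.EdgeNGramSettings(minGramSize = 2, maxGramSize = 8, preserveOriginal = true)
      )
    )
  )

  // Query-time analyzer: no edgeNGram filter, so a term like "neuro" is
  // matched as-is against the prefixes stored at index time.
  private val queryAnalyzer = Analyzer(
    tokenizer = Tokenizer.uax29UrlEmail,
    filters = Seq(Filter.lowercase)
  )

  // A prefix-searchable text type, analogous to SearchText/SearchTextAll.
  val exampleText: FieldType =
    FieldType
      .text(TypeName("ExampleText"))
      .withIndexAnalyzer(indexAnalyzer)
      .withQueryAnalyzer(queryAnalyzer)

  // Updating an existing field type in the live schema uses the new
  // Replace command, which serializes to "replace-field-type".
  val replaceExampleText: SchemaCommand = SchemaCommand.Replace(exampleText)

Sending such a Replace command is what migration version 9 above does for the SearchText and SearchTextAll types, swapping their analyzers in place instead of re-adding the fields.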