Fix: Prevent ElasticSearch from Skipping Records After Tombstone (#172)
* Fix: Prevent ElasticSearch from Skipping Records After Tombstone
Overview
This pull request addresses a critical bug in the ElasticSearch 6 (ES6) and 7 (ES7) sink connectors where records following a tombstone were inadvertently skipped during insertion. The issue stemmed from an erroneous return statement that halted the processing of all subsequent records.

Background
In the previous implementation, when a tombstone record was encountered within a batch of records being written to ElasticSearch, the insertion process exited prematurely because of a return statement inside the record-processing closure; in Scala, a return inside a nested closure exits the enclosing method, not just the closure, so every record after the tombstone was dropped. The result was incomplete data ingestion and potential inconsistencies within the ElasticSearch indices.
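
For context, here is a minimal standalone Scala sketch of the pitfall (illustrative names only, not the sink code): in Scala 2, a return inside a closure passed to flatMap aborts the enclosing method, so every element after the tombstone is silently dropped.

```scala
// Minimal sketch of the non-local-return pitfall; names are illustrative, not from the sink.
object NonLocalReturnPitfall extends App {

  val batch = Seq(Some("a"), Some("b"), None, Some("c"), Some("d"))

  // Old shape: `return None` inside the closure throws NonLocalReturnControl,
  // which unwinds out of insertAll entirely, so "c" and "d" are never indexed.
  def insertAll(records: Seq[Option[String]]): Unit = {
    val requests = records.flatMap {
      case Some(value) => Some(s"index $value")
      case None        => return None // tombstone with IGNORE behaviour: aborts the whole method
    }
    println(s"bulk request: $requests") // never reached when a tombstone is present
  }

  // New shape: the tombstone maps to None locally, so later records are still indexed.
  def insertAllFixed(records: Seq[Option[String]]): Unit = {
    val requests = records.flatMap {
      case Some(value) => Some(s"index $value")
      case None        => None // tombstone ignored for this record only
    }
    if (requests.nonEmpty) println(s"bulk request: $requests")
  }

  insertAll(batch)      // prints nothing: the method exits at the tombstone
  insertAllFixed(batch) // prints: bulk request: List(index a, index b, index c, index d)
}
```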

Changes Made
Refactored Insert Method:

Modularization: The original insert method has been decomposed into smaller, focused functions (processRecord, handleTombstone, handleResponse), improving readability and maintainability and making the logic easier to test.

Detailed Log Entries: Added log statements at key points within the insertion workflow (record counts per topic, tombstone handling, bulk response summaries).

ES Error Handling: Previously, failures reported in the ElasticSearch bulk response were ignored. With this change, if any item in a batch fails, the sink raises an exception (see the sketch after this list).

* Avoid sending empty requests

* Fix the unit tests
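
To summarise the new control flow from the bullets above (raise on batch failures, skip empty requests), here is a simplified, self-contained sketch. BulkItem, BulkResult and executeBulk are stand-ins for the elastic4s response types and client used by the real sink; only the behaviour mirrors the change.

```scala
// Self-contained sketch of the new behaviour: skip empty batches, fail on per-item errors.
object BulkErrorHandlingSketch extends App {

  // Stand-ins for the elastic4s bulk response types used by the real sink.
  final case class BulkItem(id: String, error: Option[String])
  final case class BulkResult(items: Seq[BulkItem]) {
    def errors: Seq[String] = items.flatMap(_.error)
  }

  // Pretend client: every id containing "bad" fails on the ElasticSearch side.
  def executeBulk(requests: Seq[String]): BulkResult =
    BulkResult(requests.map(id => BulkItem(id, if (id.contains("bad")) Some("mapper_parsing_exception") else None)))

  def write(batches: Seq[Seq[Option[String]]]): Unit = {
    val results = batches
      .map(_.flatten)     // tombstones that produced no request are dropped per record
      .filter(_.nonEmpty) // avoid sending empty bulk requests
      .map(executeBulk)

    val errors = results.flatMap(_.errors)
    if (errors.nonEmpty)
      // fail loudly instead of silently ignoring ElasticSearch failures
      throw new RuntimeException(s"Error writing to Elastic Search: ${errors.mkString(", ")}")
    println(s"Inserted ${results.map(_.items.size).sum} records")
  }

  write(Seq(Seq(Some("a"), None, Some("b")), Seq(None))) // prints: Inserted 2 records
  // write(Seq(Seq(Some("bad-1")))) // would throw, surfacing the per-item error reason
}
```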

---------

Co-authored-by: stheppi <[email protected]>
stheppi authored Dec 4, 2024
1 parent c272cd5 commit 5e03c59
Showing 7 changed files with 233 additions and 145 deletions.
@@ -25,17 +25,21 @@ import io.lenses.streamreactor.common.errors.ErrorHandler
import io.lenses.streamreactor.common.schemas.ConverterUtil
import io.lenses.streamreactor.connect.elastic6.config.ElasticSettings
import io.lenses.streamreactor.connect.elastic6.indexname.CreateIndex
import com.fasterxml.jackson.databind.JsonNode
import com.sksamuel.elastic4s.bulk.BulkCompatibleRequest
import com.sksamuel.elastic4s.delete.DeleteByIdRequest
import io.lenses.sql.Field
import com.sksamuel.elastic4s.Index
import com.sksamuel.elastic4s.Indexable
import com.sksamuel.elastic4s.http.ElasticDsl._
import com.sksamuel.elastic4s.http.Response
import com.sksamuel.elastic4s.http.bulk.BulkResponse
import com.typesafe.scalalogging.StrictLogging
import io.lenses.json.sql.JacksonJson
import io.lenses.streamreactor.connect.elastic6.NullValueBehavior.NullValueBehavior
import io.lenses.streamreactor.connect.elastic6.config.ElasticConfigConstants.BEHAVIOR_ON_NULL_VALUES_PROPERTY
import org.apache.kafka.connect.sink.SinkRecord

import scala.annotation.nowarn
import scala.collection.immutable
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
import scala.concurrent.Await
@@ -117,88 +121,126 @@ class ElasticJsonWriter(client: KElasticClient, settings: ElasticSettings)
* @param records A list of SinkRecords
*/
def insert(records: Map[String, Vector[SinkRecord]]): Unit = {
logger.info(s"Inserting ${records.size} records")
val fut = records.flatMap {
case (topic, sinkRecords) =>
val kcqls = topicKcqlMap.getOrElse(
logger.debug(s"Inserting ${sinkRecords.size} records from $topic")
val kcqls: Seq[Kcql] = topicKcqlMap.getOrElse(
topic,
throw new IllegalArgumentException(
s"$topic hasn't been configured in KCQL. Configured topics is ${topicKcqlMap.keys.mkString(",")}",
),
)

//we might have multiple inserts from the same Kafka Message
kcqls.flatMap { kcql =>
val kcqlValue = kcqlMap.get(kcql)
kcqls.flatMap { kcql: Kcql =>
val kcqlValue: KcqlValues = kcqlMap.get(kcql)
sinkRecords.grouped(settings.batchSize)
.map { batch =>
val indexes = batch.flatMap { r =>
val i = CreateIndex.getIndexName(kcql, r).leftMap(throw _).merge
val documentType = Option(kcql.getDocType).getOrElse(i)
val (json, pks) = if (kcqlValue.primaryKeysPath.isEmpty) {
(Transform(
kcqlValue.fields,
r.valueSchema(),
r.value(),
kcql.hasRetainStructure,
),
Seq.empty,
)
} else {
TransformAndExtractPK(
kcqlValue,
r.valueSchema(),
r.value(),
kcql.hasRetainStructure,
r.keySchema(),
r.key(),
r.headers(),
)
}
val idFromPk = pks.mkString(settings.pkJoinerSeparator)

if (json.isEmpty || json.exists(_.isEmpty)) {
(kcqlValue.behaviorOnNullValues) match {
case NullValueBehavior.DELETE =>
Some(deleteById(new Index(i), documentType, if (idFromPk.isEmpty) autoGenId(r) else idFromPk))

case NullValueBehavior.FAIL =>
throw new IllegalStateException(
s"$topic KCQL mapping is configured to fail on null value, yet it occurred.",
)

case NullValueBehavior.IGNORE =>
return None
}

} else {
kcql.getWriteMode match {
case WriteModeEnum.INSERT =>
Some(
indexInto(i / documentType)
.id(if (idFromPk.isEmpty) autoGenId(r) else idFromPk)
.pipeline(kcql.getPipeline)
.source(json.get.toString),
)

case WriteModeEnum.UPSERT =>
require(pks.nonEmpty, "Error extracting primary keys")
Some(update(idFromPk)
.in(i / documentType)
.docAsUpsert(json.get)(IndexableJsonNode))
}

}
batch.flatMap { r =>
processRecord(topic, kcql, kcqlValue, r)
}

client.execute(bulk(indexes).refreshImmediately)
}
.filter(_.nonEmpty)
.map { indexes =>
client.execute(bulk(indexes))
}
}
}

handleResponse(fut)
}

private def handleTombstone(
topic: String,
kcqlValue: KcqlValues,
r: SinkRecord,
i: String,
idFromPk: String,
documentType: String,
): Option[DeleteByIdRequest] =
kcqlValue.behaviorOnNullValues match {
case NullValueBehavior.DELETE =>
val identifier = if (idFromPk.isEmpty) autoGenId(r) else idFromPk
logger.debug(
s"Deleting tombstone record: ${r.topic()} ${r.kafkaPartition()} ${r.kafkaOffset()}. Index: $i, Identifier: $identifier",
)
Some(deleteById(new Index(i), documentType, identifier))

case NullValueBehavior.FAIL =>
logger.error(
s"Tombstone record received ${r.topic()} ${r.kafkaPartition()} ${r.kafkaOffset()}. $topic KCQL mapping is configured to fail on tombstone records.",
)
throw new IllegalStateException(
s"$topic KCQL mapping is configured to fail on tombstone records.",
)

case NullValueBehavior.IGNORE =>
logger.info(
s"Ignoring tombstone record received. for ${r.topic()} ${r.kafkaPartition()} ${r.kafkaOffset()}.",
)
None
}

private def processRecord(
topic: String,
kcql: Kcql,
kcqlValue: KcqlValues,
r: SinkRecord,
): Option[BulkCompatibleRequest] = {
val i = CreateIndex.getIndexName(kcql, r).leftMap(throw _).merge
val documentType = Option(kcql.getDocType).getOrElse(i)
val (json, pks) = if (kcqlValue.primaryKeysPath.isEmpty) {
(Transform(kcqlValue.fields, r.valueSchema(), r.value(), kcql.hasRetainStructure), Seq.empty)
} else {
TransformAndExtractPK(kcqlValue,
r.valueSchema(),
r.value(),
kcql.hasRetainStructure,
r.keySchema(),
r.key(),
r.headers(),
)
}
val idFromPk = pks.mkString(settings.pkJoinerSeparator)

json.filterNot(_.isEmpty) match {
case Some(value) =>
kcql.getWriteMode match {
case WriteModeEnum.INSERT =>
Some(
indexInto(i / documentType)
.id(if (idFromPk.isEmpty) autoGenId(r) else idFromPk)
.pipeline(kcql.getPipeline)
.source(value.toString),
)

case WriteModeEnum.UPSERT =>
require(pks.nonEmpty, "Error extracting primary keys")
Some(update(idFromPk)
.in(i / documentType)
.docAsUpsert(value.toString))
}
case None =>
handleTombstone(topic, kcqlValue, r, i, idFromPk, documentType)
}
}

private def handleResponse(fut: immutable.Iterable[Future[Response[BulkResponse]]]): Unit = {
handleTry(
Try(
Await.result(Future.sequence(fut), settings.writeTimeout.seconds),
),
Try {
val result: immutable.Iterable[Response[BulkResponse]] =
Await.result(Future.sequence(fut), settings.writeTimeout.seconds)
val errors = result.filter(_.isError).map(_.error)
if (errors.nonEmpty) {
logger.error(s"Error writing to Elastic Search: ${JacksonJson.asJson(errors)}")
throw new RuntimeException(s"Error writing to Elastic Search: ${errors.map(_.reason)}")
}
logger.info(
s"Inserted ${result.size} records. ${result.map { r =>
s"Items: ${r.result.items.size} took ${r.result.took}ms."
}.mkString(",")}",
)
result
},
)
()
}
@@ -220,6 +262,3 @@ case class KcqlValues(
primaryKeysPath: Seq[Vector[String]],
behaviorOnNullValues: NullValueBehavior,
)
case object IndexableJsonNode extends Indexable[JsonNode] {
override def json(t: JsonNode): String = t.toString
}
@@ -35,7 +35,7 @@ import scala.concurrent.Future
trait KElasticClient extends AutoCloseable {
def index(kcql: Kcql): Unit

def execute(definition: BulkRequest): Future[Any]
def execute(definition: BulkRequest): Future[Response[BulkResponse]]
}

object KElasticClient extends StrictLogging {
@@ -86,7 +86,8 @@ class ElasticJsonWriterTest extends TestBase with MockitoSugar {

val sourceTopic = "SOURCE"
val targetShard = "SHARD"
val kcql = Kcql.parse(s"INSERT INTO $targetShard SELECT * FROM $sourceTopic ")
val kcql =
Kcql.parse(s"INSERT INTO $targetShard SELECT * FROM $sourceTopic PROPERTIES('behavior.on.null.values'='IGNORE')")

val recordKey = "KEY"
val tombstoneValue: Null = null
@@ -161,7 +162,7 @@ class ElasticJsonWriterTest extends TestBase with MockitoSugar {
val exception = intercept[Exception](target.write(Vector(sinkRecord)))

exception shouldBe a[IllegalStateException]
exception.getMessage should be(s"$sourceTopic KCQL mapping is configured to fail on null value, yet it occurred.")
exception.getMessage should be(s"$sourceTopic KCQL mapping is configured to fail on tombstone records.")
}

}