Fix: Prevent ElasticSearch from Skipping Records After Tombstone (#172)
* Fix: Prevent ElasticSearch from Skipping Records After Tombstone
Overview
This pull request addresses a critical bug in the ElasticSearch 6 (ES6) and 7 (ES7) sink connectors where records following a tombstone were inadvertently skipped during insertion. The issue stemmed from an erroneous return statement that halted the processing of all subsequent records.

Background
In the previous implementation, when a tombstone record was encountered within a batch of records being written to ElasticSearch, the insertion process exited prematurely because of a return statement inside the record-processing closure; in Scala, a return inside a nested closure exits the enclosing method, not just the closure, so every record after the tombstone was dropped. The result was incomplete data ingestion and potential inconsistencies within the ElasticSearch indices.
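
For context, here is a minimal standalone Scala sketch of the pitfall (illustrative names only, not the sink code): in Scala 2, a return inside a closure passed to flatMap aborts the enclosing method, so every element after the tombstone is silently dropped.

```scala
// Minimal sketch of the non-local-return pitfall; names are illustrative, not from the sink.
object NonLocalReturnPitfall extends App {

  val batch = Seq(Some("a"), Some("b"), None, Some("c"), Some("d"))

  // Old shape: `return None` inside the closure throws NonLocalReturnControl,
  // which unwinds out of insertAll entirely, so "c" and "d" are never indexed.
  def insertAll(records: Seq[Option[String]]): Unit = {
    val requests = records.flatMap {
      case Some(value) => Some(s"index $value")
      case None        => return None // tombstone with IGNORE behaviour: aborts the whole method
    }
    println(s"bulk request: $requests") // never reached when a tombstone is present
  }

  // New shape: the tombstone maps to None locally, so later records are still indexed.
  def insertAllFixed(records: Seq[Option[String]]): Unit = {
    val requests = records.flatMap {
      case Some(value) => Some(s"index $value")
      case None        => None // tombstone ignored for this record only
    }
    if (requests.nonEmpty) println(s"bulk request: $requests")
  }

  insertAll(batch)      // prints nothing: the method exits at the tombstone
  insertAllFixed(batch) // prints: bulk request: List(index a, index b, index c, index d)
}
```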

Changes Made
Refactored Insert Method:

Modularization: The original insert method has been decomposed into smaller, focused functions (processRecord, handleTombstone, handleResponse), improving readability and maintainability and making the logic easier to test.

Detailed Log Entries: Added log statements at key points within the insertion workflow (record counts per topic, tombstone handling, bulk response summaries).

ES Error Handling: Previously, failures reported in the ElasticSearch bulk response were ignored. With this change, if any item in a batch fails, the sink raises an exception (see the sketch after this list).

* Avoid sending empty requests

* Fix the unit tests
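
To summarise the new control flow from the bullets above (raise on batch failures, skip empty requests), here is a simplified, self-contained sketch. BulkItem, BulkResult and executeBulk are stand-ins for the elastic4s response types and client used by the real sink; only the behaviour mirrors the change.

```scala
// Self-contained sketch of the new behaviour: skip empty batches, fail on per-item errors.
object BulkErrorHandlingSketch extends App {

  // Stand-ins for the elastic4s bulk response types used by the real sink.
  final case class BulkItem(id: String, error: Option[String])
  final case class BulkResult(items: Seq[BulkItem]) {
    def errors: Seq[String] = items.flatMap(_.error)
  }

  // Pretend client: every id containing "bad" fails on the ElasticSearch side.
  def executeBulk(requests: Seq[String]): BulkResult =
    BulkResult(requests.map(id => BulkItem(id, if (id.contains("bad")) Some("mapper_parsing_exception") else None)))

  def write(batches: Seq[Seq[Option[String]]]): Unit = {
    val results = batches
      .map(_.flatten)     // tombstones that produced no request are dropped per record
      .filter(_.nonEmpty) // avoid sending empty bulk requests
      .map(executeBulk)

    val errors = results.flatMap(_.errors)
    if (errors.nonEmpty)
      // fail loudly instead of silently ignoring ElasticSearch failures
      throw new RuntimeException(s"Error writing to Elastic Search: ${errors.mkString(", ")}")
    println(s"Inserted ${results.map(_.items.size).sum} records")
  }

  write(Seq(Seq(Some("a"), None, Some("b")), Seq(None))) // prints: Inserted 2 records
  // write(Seq(Seq(Some("bad-1")))) // would throw, surfacing the per-item error reason
}
```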

---------

Co-authored-by: stheppi <[email protected]>
stheppi authored Dec 4, 2024
1 parent c272cd5 commit 5e03c59
Showing 7 changed files with 233 additions and 145 deletions.
@@ -25,17 +25,21 @@ import io.lenses.streamreactor.common.errors.ErrorHandler
import io.lenses.streamreactor.common.schemas.ConverterUtil
import io.lenses.streamreactor.connect.elastic6.config.ElasticSettings
import io.lenses.streamreactor.connect.elastic6.indexname.CreateIndex
import com.fasterxml.jackson.databind.JsonNode
import com.sksamuel.elastic4s.bulk.BulkCompatibleRequest
import com.sksamuel.elastic4s.delete.DeleteByIdRequest
import io.lenses.sql.Field
import com.sksamuel.elastic4s.Index
import com.sksamuel.elastic4s.Indexable
import com.sksamuel.elastic4s.http.ElasticDsl._
import com.sksamuel.elastic4s.http.Response
import com.sksamuel.elastic4s.http.bulk.BulkResponse
import com.typesafe.scalalogging.StrictLogging
import io.lenses.json.sql.JacksonJson
import io.lenses.streamreactor.connect.elastic6.NullValueBehavior.NullValueBehavior
import io.lenses.streamreactor.connect.elastic6.config.ElasticConfigConstants.BEHAVIOR_ON_NULL_VALUES_PROPERTY
import org.apache.kafka.connect.sink.SinkRecord

import scala.annotation.nowarn
import scala.collection.immutable
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._
import scala.concurrent.Await
@@ -117,88 +121,126 @@ class ElasticJsonWriter(client: KElasticClient, settings: ElasticSettings)
* @param records A list of SinkRecords
*/
def insert(records: Map[String, Vector[SinkRecord]]): Unit = {
logger.info(s"Inserting ${records.size} records")
val fut = records.flatMap {
case (topic, sinkRecords) =>
val kcqls = topicKcqlMap.getOrElse(
logger.debug(s"Inserting ${sinkRecords.size} records from $topic")
val kcqls: Seq[Kcql] = topicKcqlMap.getOrElse(
topic,
throw new IllegalArgumentException(
s"$topic hasn't been configured in KCQL. Configured topics is ${topicKcqlMap.keys.mkString(",")}",
),
)

//we might have multiple inserts from the same Kafka Message
kcqls.flatMap { kcql =>
val kcqlValue = kcqlMap.get(kcql)
kcqls.flatMap { kcql: Kcql =>
val kcqlValue: KcqlValues = kcqlMap.get(kcql)
sinkRecords.grouped(settings.batchSize)
.map { batch =>
val indexes = batch.flatMap { r =>
val i = CreateIndex.getIndexName(kcql, r).leftMap(throw _).merge
val documentType = Option(kcql.getDocType).getOrElse(i)
val (json, pks) = if (kcqlValue.primaryKeysPath.isEmpty) {
(Transform(
kcqlValue.fields,
r.valueSchema(),
r.value(),
kcql.hasRetainStructure,
),
Seq.empty,
)
} else {
TransformAndExtractPK(
kcqlValue,
r.valueSchema(),
r.value(),
kcql.hasRetainStructure,
r.keySchema(),
r.key(),
r.headers(),
)
}
val idFromPk = pks.mkString(settings.pkJoinerSeparator)

if (json.isEmpty || json.exists(_.isEmpty)) {
(kcqlValue.behaviorOnNullValues) match {
case NullValueBehavior.DELETE =>
Some(deleteById(new Index(i), documentType, if (idFromPk.isEmpty) autoGenId(r) else idFromPk))

case NullValueBehavior.FAIL =>
throw new IllegalStateException(
s"$topic KCQL mapping is configured to fail on null value, yet it occurred.",
)

case NullValueBehavior.IGNORE =>
return None
}

} else {
kcql.getWriteMode match {
case WriteModeEnum.INSERT =>
Some(
indexInto(i / documentType)
.id(if (idFromPk.isEmpty) autoGenId(r) else idFromPk)
.pipeline(kcql.getPipeline)
.source(json.get.toString),
)

case WriteModeEnum.UPSERT =>
require(pks.nonEmpty, "Error extracting primary keys")
Some(update(idFromPk)
.in(i / documentType)
.docAsUpsert(json.get)(IndexableJsonNode))
}

}
batch.flatMap { r =>
processRecord(topic, kcql, kcqlValue, r)
}

client.execute(bulk(indexes).refreshImmediately)
}
.filter(_.nonEmpty)
.map { indexes =>
client.execute(bulk(indexes))
}
}
}

handleResponse(fut)
}

private def handleTombstone(
topic: String,
kcqlValue: KcqlValues,
r: SinkRecord,
i: String,
idFromPk: String,
documentType: String,
): Option[DeleteByIdRequest] =
kcqlValue.behaviorOnNullValues match {
case NullValueBehavior.DELETE =>
val identifier = if (idFromPk.isEmpty) autoGenId(r) else idFromPk
logger.debug(
s"Deleting tombstone record: ${r.topic()} ${r.kafkaPartition()} ${r.kafkaOffset()}. Index: $i, Identifier: $identifier",
)
Some(deleteById(new Index(i), documentType, identifier))

case NullValueBehavior.FAIL =>
logger.error(
s"Tombstone record received ${r.topic()} ${r.kafkaPartition()} ${r.kafkaOffset()}. $topic KCQL mapping is configured to fail on tombstone records.",
)
throw new IllegalStateException(
s"$topic KCQL mapping is configured to fail on tombstone records.",
)

case NullValueBehavior.IGNORE =>
logger.info(
s"Ignoring tombstone record received. for ${r.topic()} ${r.kafkaPartition()} ${r.kafkaOffset()}.",
)
None
}

private def processRecord(
topic: String,
kcql: Kcql,
kcqlValue: KcqlValues,
r: SinkRecord,
): Option[BulkCompatibleRequest] = {
val i = CreateIndex.getIndexName(kcql, r).leftMap(throw _).merge
val documentType = Option(kcql.getDocType).getOrElse(i)
val (json, pks) = if (kcqlValue.primaryKeysPath.isEmpty) {
(Transform(kcqlValue.fields, r.valueSchema(), r.value(), kcql.hasRetainStructure), Seq.empty)
} else {
TransformAndExtractPK(kcqlValue,
r.valueSchema(),
r.value(),
kcql.hasRetainStructure,
r.keySchema(),
r.key(),
r.headers(),
)
}
val idFromPk = pks.mkString(settings.pkJoinerSeparator)

json.filterNot(_.isEmpty) match {
case Some(value) =>
kcql.getWriteMode match {
case WriteModeEnum.INSERT =>
Some(
indexInto(i / documentType)
.id(if (idFromPk.isEmpty) autoGenId(r) else idFromPk)
.pipeline(kcql.getPipeline)
.source(value.toString),
)

case WriteModeEnum.UPSERT =>
require(pks.nonEmpty, "Error extracting primary keys")
Some(update(idFromPk)
.in(i / documentType)
.docAsUpsert(value.toString))
}
case None =>
handleTombstone(topic, kcqlValue, r, i, idFromPk, documentType)
}
}

private def handleResponse(fut: immutable.Iterable[Future[Response[BulkResponse]]]): Unit = {
handleTry(
Try(
Await.result(Future.sequence(fut), settings.writeTimeout.seconds),
),
Try {
val result: immutable.Iterable[Response[BulkResponse]] =
Await.result(Future.sequence(fut), settings.writeTimeout.seconds)
val errors = result.filter(_.isError).map(_.error)
if (errors.nonEmpty) {
logger.error(s"Error writing to Elastic Search: ${JacksonJson.asJson(errors)}")
throw new RuntimeException(s"Error writing to Elastic Search: ${errors.map(_.reason)}")
}
logger.info(
s"Inserted ${result.size} records. ${result.map { r =>
s"Items: ${r.result.items.size} took ${r.result.took}ms."
}.mkString(",")}",
)
result
},
)
()
}
@@ -220,6 +262,3 @@ case class KcqlValues(
primaryKeysPath: Seq[Vector[String]],
behaviorOnNullValues: NullValueBehavior,
)
case object IndexableJsonNode extends Indexable[JsonNode] {
override def json(t: JsonNode): String = t.toString
}
@@ -35,7 +35,7 @@ import scala.concurrent.Future
trait KElasticClient extends AutoCloseable {
def index(kcql: Kcql): Unit

def execute(definition: BulkRequest): Future[Any]
def execute(definition: BulkRequest): Future[Response[BulkResponse]]
}

object KElasticClient extends StrictLogging {
@@ -86,7 +86,8 @@ class ElasticJsonWriterTest extends TestBase with MockitoSugar {

val sourceTopic = "SOURCE"
val targetShard = "SHARD"
val kcql = Kcql.parse(s"INSERT INTO $targetShard SELECT * FROM $sourceTopic ")
val kcql =
Kcql.parse(s"INSERT INTO $targetShard SELECT * FROM $sourceTopic PROPERTIES('behavior.on.null.values'='IGNORE')")

val recordKey = "KEY"
val tombstoneValue: Null = null
@@ -161,7 +162,7 @@ class ElasticJsonWriterTest extends TestBase with MockitoSugar {
val exception = intercept[Exception](target.write(Vector(sinkRecord)))

exception shouldBe a[IllegalStateException]
exception.getMessage should be(s"$sourceTopic KCQL mapping is configured to fail on null value, yet it occurred.")
exception.getMessage should be(s"$sourceTopic KCQL mapping is configured to fail on tombstone records.")
}

}