From ea27181d8e3842ae1f66813f36cab14520a1f87f Mon Sep 17 00:00:00 2001 From: qingjun wu Date: Tue, 6 Aug 2024 17:46:01 +0000 Subject: [PATCH 1/5] Revising documentation about data-loading. --- modules/data-loading/examples/config-avro | 43 +++---------------- .../kafka/kafka-data-source-details.adoc | 3 +- .../kafka/kafka-example-loading-job.adoc | 2 +- 3 files changed, 9 insertions(+), 39 deletions(-) diff --git a/modules/data-loading/examples/config-avro b/modules/data-loading/examples/config-avro index 36f7a403..2050789a 100644 --- a/modules/data-loading/examples/config-avro +++ b/modules/data-loading/examples/config-avro @@ -1,8 +1,8 @@ connector.class=org.apache.kafka.connect.mirror.MirrorSourceConnector source.cluster.alias=hello target.cluster.alias=world -source.cluster.bootstrap.servers=source.kafka.server:9092 -target.cluster.bootstrap.servers=localhost:30002 +source.cluster.bootstrap.servers= +target.cluster.bootstrap.servers= source->target.enabled=true topics=avro-without-registry-topic replication.factor=1 @@ -18,41 +18,10 @@ emit.heartbeats.interval.seconds=5 world.scheduled.rebalance.max.delay.ms=35000 key.converter=org.apache.kafka.connect.converters.ByteArrayConverter header.converter=org.apache.kafka.connect.converters.ByteArrayConverter -value.converter=com.tigergraph.kafka.connect.converters.TigerGraphAvroConverterWithoutSchemaRegistry - -producer.security.protocol=SASL_SSL -producer.sasl.mechanism=GSSAPI -producer.sasl.kerberos.service.name=kafka -producer.sasl.jaas.config=com.sun.security.auth.module.Krb5LoginModule required useKeyTab=true storeKey=true keyTab=\"/path/to/kafka-producer.keytab\" principal=\"kafka-producer@TIGERGRAPH.COM\"; -producer.ssl.endpoint.identification.algorithm= -producer.ssl.keystore.location=/path/to/client.keystore.jks -producer.ssl.keystore.password=****** -producer.ssl.key.password=****** -producer.ssl.truststore.location=/path/to/client.truststore.jks -producer.ssl.truststore.password=****** - -consumer.security.protocol=SASL_SSL -consumer.sasl.mechanism=GSSAPI -consumer.sasl.kerberos.service.name=kafka -consumer.sasl.jaas.config=com.sun.security.auth.module.Krb5LoginModule required useKeyTab=true storeKey=true keyTab=\"/path/to/kafka-consumer.keytab\" principal=\"kafka-consumer@TIGERGRAPH.COM\"; -consumer.ssl.endpoint.identification.algorithm= -consumer.ssl.keystore.location=/path/to/client.keystore.jks -consumer.ssl.keystore.password=****** -consumer.ssl.key.password=****** -consumer.ssl.truststore.location=/path/to/client.truststore.jks -consumer.ssl.truststore.password=****** - -source.admin.security.protocol=SASL_SSL -source.admin.sasl.mechanism=GSSAPI -source.admin.sasl.kerberos.service.name=kafka -source.admin.sasl.jaas.config=com.sun.security.auth.module.Krb5LoginModule required useKeyTab=true storeKey=true keyTab=\"/path/to/kafka-admin.keytab\" principal=\"kafka-admin@TIGERGRAPH.COM\"; -source.admin.ssl.endpoint.identification.algorithm= -source.admin.ssl.keystore.location=/path/to/client.keystore.jks -source.admin.ssl.keystore.password=****** -source.admin.ssl.key.password=****** -source.admin.ssl.truststore.location=/path/to/client.truststore.jks -source.admin.ssl.truststore.password=****** +transforms=TigerGraphAvroTransform +transforms.TigerGraphAvroTransform.type=com.tigergraph.kafka.connect.transformations.TigergraphAvroWithoutSchemaRegistryTransformation +transforms.TigerGraphAvroTransform.errors.tolerance=none [connector_1] name=avro-test-without-registry -tasks.max=10 +tasks.max=10 \ No newline at end of file diff 
--git a/modules/data-loading/partials/kafka/kafka-data-source-details.adoc b/modules/data-loading/partials/kafka/kafka-data-source-details.adoc index f4a51d7c..c826c21b 100644 --- a/modules/data-loading/partials/kafka/kafka-data-source-details.adoc +++ b/modules/data-loading/partials/kafka/kafka-data-source-details.adoc @@ -25,8 +25,9 @@ If the source cluster is configured for SSL or SASL protocols, you need to provi
* If the source cluster uses SASL *and* SSL, you need to upload the keytab of each Kerberos principal, as well as the key store and truststore to every node of your TigerGraph cluster.
Each file must be at the same absolute path on all nodes.
-The following configurations are required for admin, producer and consumer. To supply the configuration for the corresponding component, replace `` with `source.admin`, `producer`, or `consumer`. +The following are generic configurations required for admin, producer, and consumer. To supply the configuration for the corresponding component, replace `<prefix>` with `source.admin`, `producer`, or `consumer`.
For example, to specify `GSSAPI` as the SASL mechanism for the consumer, include `"consumer.sasl.mechanism": "GSSAPI"` in the data source configuration.
+If the source cluster uses SSL, skip the generic configurations below and instead follow xref:tigergraph-server:data-loading:kafka-ssl-security-guide.adoc[] to set up the connector.
[%header,cols="1,2"]
|===
diff --git a/modules/data-loading/partials/kafka/kafka-example-loading-job.adoc b/modules/data-loading/partials/kafka/kafka-example-loading-job.adoc index 5cbf6a8c..2bec8d8b 100644 --- a/modules/data-loading/partials/kafka/kafka-example-loading-job.adoc +++ b/modules/data-loading/partials/kafka/kafka-example-loading-job.adoc @@ -2,7 +2,7 @@
The following is an example loading job from an external Kafka cluster.
-[source,php,linenums]
+[source,sql,linenums]
.Example loading job for BigQuery
----
USE GRAPH ldbc_snb
From adb820088d3de4048b5c955c2fca7fa53a876e4f Mon Sep 17 00:00:00 2001 From: qingjun wu Date: Wed, 7 Aug 2024 16:13:44 +0000 Subject: [PATCH 2/5] To make it simple, let's support SSL only, and not SASL (including SASL_SSL), in the doc. --- .../pages/data-loading-overview.adoc | 4 +- .../kafka/kafka-data-source-details.adoc | 54 +------------------ .../partials/load-part1-intro-and-schema.adoc | 2 +- .../load-part2-create-data-source.adoc | 6 +-- .../load-part3-create-loading-job.adoc | 2 +- .../load-part3A-define-filenames.adoc | 4 +- .../partials/load-part3B-specify-mapping.adoc | 4 +- .../load-part5-monitor-and-manage.adoc | 4 +- 8 files changed, 15 insertions(+), 65 deletions(-) diff --git a/modules/data-loading/pages/data-loading-overview.adoc b/modules/data-loading/pages/data-loading-overview.adoc index 93b81acf..48898e12 100644 --- a/modules/data-loading/pages/data-loading-overview.adoc +++ b/modules/data-loading/pages/data-loading-overview.adoc @@ -38,7 +38,7 @@ TigerGraph uses the same workflow for both local file and Kafka Connect loading:
. *Specify a graph*. Data is always loaded to exactly one graph (though that graph could have global vertices and edges which are shared with other graphs). For example:
+
-[source,php]
+[source,gsql]
USE GRAPH ldbc_snb
. If you are using Kafka Connect, *define a `DATA_SOURCE` object*.
+
@@ -64,7 +64,7 @@ image::data-loading:loading_arch_3.9.3.png[Architectural diagram showing support
== Loading Jobs
A loading job tells the database how to construct vertices and edges from data sources. 
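Putting the workflow above together, here is a minimal, hypothetical end-to-end sketch (the job name, file path, options, and column-to-attribute mapping are illustrative only and must match your actual schema):

[source,gsql]
----
USE GRAPH ldbc_snb

CREATE LOADING JOB load_person_sketch FOR GRAPH ldbc_snb {
    // file_Person is a placeholder; the real file is bound at run time
    DEFINE FILENAME file_Person;
    // map input columns to Person attributes (mapping must fit your schema)
    LOAD file_Person TO VERTEX Person
        VALUES ($0, $1, $2) USING separator="|", header="true";
}

// bind the placeholder to an actual file and run the job
RUN LOADING JOB load_person_sketch USING file_Person="/data/person.csv"
----

The formal syntax follows below.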
-[source,php]
+[source,gsql]
.CREATE LOADING JOB syntax
----
CREATE LOADING JOB <job_name> FOR GRAPH <graph_name> {
diff --git a/modules/data-loading/partials/kafka/kafka-data-source-details.adoc b/modules/data-loading/partials/kafka/kafka-data-source-details.adoc index c826c21b..f5ae661e 100644 --- a/modules/data-loading/partials/kafka/kafka-data-source-details.adoc +++ b/modules/data-loading/partials/kafka/kafka-data-source-details.adoc @@ -13,62 +13,12 @@ To configure the data source object, the minimum requirement is the address of t
.Data source configuration for external Kafka
----
{
-"type": "mirrormaker",
-"source.cluster.bootstrap.servers": "" + "type": "mirrormaker",
+ "source.cluster.bootstrap.servers": ""
}
----
-If the source cluster is configured for SSL or SASL protocols, you need to provide the following SSL/SASL credentials in order to communicate with the source cluster. -
-* If the source cluster uses SASL, you need to upload the keytab of each Kerberos principal to every node of your TigerGraph cluster at the same absolute path. * If the source cluster uses SSL, see our documentation xref:tigergraph-server:data-loading:kafka-ssl-security-guide.adoc[]
-* If the source cluster uses SASL *and* SSL, you need to upload the keytab of each Kerberos principal, as well as the key store and truststore to every node of your TigerGraph cluster.
-Each file must be at the same absolute path on all nodes.
-
-The following are generic configurations required for admin, producer and consumer. To supply the configuration for the corresponding component, replace `` with `source.admin`, `producer`, or `consumer`. -For example, to specify `GSSAPI` as the SASL mechanism for consumer, include `"consumer.sasl.mecahnism": "GSSAPI"` in the data source configuration. -But if the source cluster uses SSL, please skip the generic configurations below and follow this documentation to setup connector xref:tigergraph-server:data-loading:kafka-ssl-security-guide.adoc[] -
-[%header,cols="1,2"]
-|===
-| Field | Description
-
-| .security.protocol
-| Protocol used to communicate with brokers.
-Valid values are: `PLAINTEXT`, `SSL, `SASL_PLAINTEXT`, `SASL_SSL`.
-The default is `PLAINTEXT`.
-
-| .sasl.mechanism
-| SASL mechanism used for client connections.
-This may be any mechanism for which a security provider is available. GSSAPI is the default mechanism.
-
-| .sasl.kerberos.service.name
-| The Kerberos principal name used by your Kafka brokers.
-This could be defined in either JAAS configuration or Kafka’s configuration.
-
-| .sasl.jaas.config
-| JAAS login context parameters for SASL connections in the format used by JAAS configuration files.
-See https://docs.oracle.com/javase/8/docs/technotes/guides/security/jgss/tutorials/LoginConfigFile.html[JAAS Login Configuration File] for details.
-
-| .ssl.endpoint.identification.algorithm
-| The endpoint identification algorithm used to validate server hostname in the server certificate. Default is `https`.
-If the value is set to an empty string, this will disable server host name verification.
-
-| .ssl.keystore.location
-| The location of the key store file.
-
-| .ssl.keystore.password
-| The password of the key store file.
-
-| .ssl.key.password
-| The password of the private key in the key store file or the PEM key specified in `ssl.keystore.key`.
-
-| .ssl.truststore.location
-| The location of the trust store file.
-
-| .ssl.truststore.password
-| The password for the trust store file. 
-|===
If there is a https://docs.confluent.io/platform/current/schema-registry/index.html[schema registry service] containing the record schema of the source topic, please add it to the data source configuration:
diff --git a/modules/data-loading/partials/load-part1-intro-and-schema.adoc b/modules/data-loading/partials/load-part1-intro-and-schema.adoc index f5ea6db4..070ae426 100644 --- a/modules/data-loading/partials/load-part1-intro-and-schema.adoc +++ b/modules/data-loading/partials/load-part1-intro-and-schema.adoc @@ -6,7 +6,7 @@ We will call out whether a particular step is common for all loading or specific
== Example Schema
This example uses part of the LDBC_SNB schema:
-[source,php]
+[source,gsql]
.Example schema taken from LDBC_SNB
----
//Vertex Types:
diff --git a/modules/data-loading/partials/load-part2-create-data-source.adoc b/modules/data-loading/partials/load-part2-create-data-source.adoc index 5a4ec42a..c2c421f3 100644 --- a/modules/data-loading/partials/load-part2-create-data-source.adoc +++ b/modules/data-loading/partials/load-part2-create-data-source.adoc @@ -8,13 +8,13 @@ Inline mode is required when creating data sources for TigerGraph Cloud instance
In the following example, we create a data source named `s1`, and read its configuration information from a file called `ds_config.json`.
-[source,php]
+[source,gsql]
USE GRAPH ldbc_snb
CREATE DATA_SOURCE s1 = "ds_config.json" FOR GRAPH ldbc_snb
Older versions of TigerGraph required a keyword after `DATA_SOURCE` such as `STREAM` or `KAFKA`.
-[source,php]
+[source,gsql]
.Inline JSON data format when creating a data source
CREATE DATA_SOURCE s1 = "{
type: ,
@@ -24,7 +24,7 @@ key:
String literals can be enclosed with a double quote `"`, triple double quotes `"""`, or triple single quotes `'''`.
Double quotes `"` in the JSON can be omitted if the key name does not contain a colon `:` or comma `,`.
-[source,php]
+[source,gsql]
.Alternate quote syntax for inline JSON data
CREATE DATA_SOURCE s1 = """{
"type": "",
diff --git a/modules/data-loading/partials/load-part3-create-loading-job.adoc b/modules/data-loading/partials/load-part3-create-loading-job.adoc index dabf705e..33620ef6 100644 --- a/modules/data-loading/partials/load-part3-create-loading-job.adoc +++ b/modules/data-loading/partials/load-part3-create-loading-job.adoc @@ -8,7 +8,7 @@ These can refer to actual files or be placeholder names. The actual data sources
. LOAD statements specify how to take the data fields from files to construct vertices or edges.
////
-[source,php]
+[source,gsql]
.CREATE LOADING JOB syntax
----
CREATE LOADING JOB <job_name> FOR GRAPH <graph_name> {
diff --git a/modules/data-loading/partials/load-part3A-define-filenames.adoc b/modules/data-loading/partials/load-part3A-define-filenames.adoc index 8eebf474..b1ab24e5 100644 --- a/modules/data-loading/partials/load-part3A-define-filenames.adoc +++ b/modules/data-loading/partials/load-part3A-define-filenames.adoc @@ -4,7 +4,7 @@ First we define _filenames_, which are local variables referring to data files (
[NOTE]
The terms `FILENAME` and `filevar` are used for legacy reasons, but a `filevar` can also be an object in a data object store.
-[source,php]
+[source,gsql]
.DEFINE FILENAME syntax
----
DEFINE FILENAME filevar ["=" file_descriptor ];
@@ -13,7 +13,7 @@ The file descriptor can be specified at compile time or at runtime. 
Runtime settings override compile-time settings:
-[source,php]
+[source,gsql]
.Specifying file descriptor at runtime
----
RUN LOADING JOB job_name USING filevar=file_descriptor_override
diff --git a/modules/data-loading/partials/load-part3B-specify-mapping.adoc b/modules/data-loading/partials/load-part3B-specify-mapping.adoc index 1333fe63..f6d39d92 100644 --- a/modules/data-loading/partials/load-part3B-specify-mapping.adoc +++ b/modules/data-loading/partials/load-part3B-specify-mapping.adoc @@ -1,7 +1,7 @@
=== Specify the data mapping
Next, we use LOAD statements to describe how the incoming data will be loaded to attributes of vertices and edges. Each LOAD statement handles the data mapping, and optional data transformation and filtering, from one filename to one or more vertex and edge types.
-[source,php]
+[source,gsql]
.LOAD statement syntax
----
LOAD [ source_object|filevar|TEMP_TABLE table_name ]
@@ -12,7 +12,7 @@ LOAD [ source_object|filevar|TEMP_TABLE table_name ]
<1> As of v3.9.3, TAGS are deprecated.
Let's break down one of the LOAD statements in our example:
-[source,php]
+[source,gsql]
.Example loading job for local files
----
LOAD file_Person TO VERTEX Person
diff --git a/modules/data-loading/partials/load-part5-monitor-and-manage.adoc b/modules/data-loading/partials/load-part5-monitor-and-manage.adoc index 801aabaf..f12ad969 100644 --- a/modules/data-loading/partials/load-part5-monitor-and-manage.adoc +++ b/modules/data-loading/partials/load-part5-monitor-and-manage.adoc @@ -3,7 +3,7 @@
When a loading job starts, the GSQL server assigns it a job ID and displays it for the user to see.
There are three key commands to monitor and manage loading jobs:
-[source,php]
+[source,gsql]
----
SHOW LOADING STATUS job_id|ALL
ABORT LOADING JOB job_id|ALL
@@ -12,7 +12,7 @@ RESUME LOADING JOB job_id
`SHOW LOADING STATUS` shows the current status of either a specified loading job or all current jobs. This command should be run within the scope of a graph:
-[source,php]
+[source,gsql]
GSQL > USE GRAPH graph_name
GSQL > SHOW LOADING STATUS ALL
From 6de9366dfdc066c7a63fc68ff32f1647a94d8d4a Mon Sep 17 00:00:00 2001 From: qingjun wu Date: Mon, 12 Aug 2024 02:59:47 +0000 Subject: [PATCH 3/5] sql -> gsql --- .../data-loading/partials/kafka/kafka-example-loading-job.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/data-loading/partials/kafka/kafka-example-loading-job.adoc b/modules/data-loading/partials/kafka/kafka-example-loading-job.adoc index 2bec8d8b..b68cd829 100644 --- a/modules/data-loading/partials/kafka/kafka-example-loading-job.adoc +++ b/modules/data-loading/partials/kafka/kafka-example-loading-job.adoc @@ -2,7 +2,7 @@
The following is an example loading job from an external Kafka cluster.
-[source,sql,linenums]
+[source,gsql,linenums]
.Example loading job for BigQuery
----
USE GRAPH ldbc_snb
From a52c0ce8ddefd2ff3f42881ad9d16f3171b097fe Mon Sep 17 00:00:00 2001 From: qingjun wu Date: Mon, 12 Aug 2024 03:01:23 +0000 Subject: [PATCH 4/5] Addressing comments. 
--- .../data-loading/partials/kafka/kafka-example-loading-job.adoc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/data-loading/partials/kafka/kafka-example-loading-job.adoc b/modules/data-loading/partials/kafka/kafka-example-loading-job.adoc index b68cd829..48696f47 100644 --- a/modules/data-loading/partials/kafka/kafka-example-loading-job.adoc +++ b/modules/data-loading/partials/kafka/kafka-example-loading-job.adoc @@ -3,7 +3,7 @@
The following is an example loading job from an external Kafka cluster.
[source,gsql,linenums]
-.Example loading job for BigQuery
+.Example loading job from external Kafka
----
USE GRAPH ldbc_snb
CREATE DATA_SOURCE s1 = "ds_config.json" FOR GRAPH ldbc_snb
From 3101fbb0bc3b5692d2cd4245ead88e2c75bca075 Mon Sep 17 00:00:00 2001 From: qingjun wu Date: Mon, 19 Aug 2024 13:53:46 +0000 Subject: [PATCH 5/5] Resolving code review comments. --- .../kafka/kafka-data-source-details.adoc | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/modules/data-loading/partials/kafka/kafka-data-source-details.adoc b/modules/data-loading/partials/kafka/kafka-data-source-details.adoc index f5ae661e..8ac8ff8b 100644 --- a/modules/data-loading/partials/kafka/kafka-data-source-details.adoc +++ b/modules/data-loading/partials/kafka/kafka-data-source-details.adoc @@ -18,7 +18,64 @@ To configure the data source object, the minimum requirement is the address of t
}
----
+If the source cluster is configured for SSL or SASL protocols, you need to provide the following SSL/SASL credentials in order to communicate with the source cluster.
+
+* If the source cluster uses SASL, you need to upload the keytab of each Kerberos principal to every node of your TigerGraph cluster at the same absolute path. * If the source cluster uses SSL, see our documentation xref:tigergraph-server:data-loading:kafka-ssl-security-guide.adoc[]
+* If the source cluster uses SASL *and* SSL, you need to upload the keytab of each Kerberos principal, as well as the key store and truststore to every node of your TigerGraph cluster.
+Each file must be at the same absolute path on all nodes.
+
+The following configurations are required for admin, producer, and consumer. Kafka allows more specific SSL settings to override generic ones; security settings are applied in this order of precedence: generic.ssl.setting < source/target.cluster.ssl.setting < admin/producer/consumer.ssl.setting.
+
+If the source and target clusters share the same SSL settings, users can set generic settings that apply to both clusters and all roles (admin/producer/consumer). For example, users can set "ssl.keystore.location=/path/to/key/store" instead of "source.cluster.ssl.keystore.location=/path/to/key/store", "admin.ssl.keystore.location=/path/to/key/store", or even "source.cluster.admin.ssl.keystore.location=/path/to/key/store".
+
+If the source and target clusters have different SSL settings, users can keep things simple by setting cluster-wide SSL configs, e.g., "target.cluster.ssl.truststore.password=/password/for/trust/store", instead of role-specific ones such as "target.cluster.producer.ssl.truststore.password=/password/for/trust/store".
+
+To supply the configuration for the corresponding component, replace `<prefix>` with `source.cluster` or `target.cluster`; with `source.cluster.admin`, `source.cluster.producer`, or `source.cluster.consumer` (likewise for `target.cluster`); or with `admin`, `producer`, or `consumer`.
+For example, to specify `GSSAPI` as the SASL mechanism for the consumer, include `"consumer.sasl.mechanism": "GSSAPI"` in the data source configuration. 
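+
+As a sketch of the precedence rules above (all paths are placeholders, not working values), the three scopes combine as follows, with the most specific setting winning:
+
+[source,properties]
+----
+# Generic: applies to both clusters and all roles unless overridden
+ssl.truststore.location=/path/to/shared.truststore.jks
+
+# Cluster-scoped: overrides the generic setting for the target cluster only
+target.cluster.ssl.truststore.location=/path/to/target.truststore.jks
+
+# Role-scoped: overrides both of the above for the target cluster's producer
+target.cluster.producer.ssl.truststore.location=/path/to/producer.truststore.jks
+----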
+
+Note: SSL is now well supported by TigerGraph; we recommend setting up regular SSL rather than SASL + PLAINTEXT/SSL.
+
+[%header,cols="1,2"]
+|===
+| Field | Description
+
+| <prefix>.security.protocol
+| Protocol used to communicate with brokers.
+Valid values are: `PLAINTEXT`, `SSL`, `SASL_PLAINTEXT`, `SASL_SSL`.
+The default is `PLAINTEXT`.
+
+| <prefix>.sasl.mechanism
+| SASL mechanism used for client connections.
+This may be any mechanism for which a security provider is available. GSSAPI is the default mechanism.
+
+| <prefix>.sasl.kerberos.service.name
+| The Kerberos principal name used by your Kafka brokers.
+This could be defined in either JAAS configuration or Kafka’s configuration.
+
+| <prefix>.sasl.jaas.config
+| JAAS login context parameters for SASL connections in the format used by JAAS configuration files.
+See https://docs.oracle.com/javase/8/docs/technotes/guides/security/jgss/tutorials/LoginConfigFile.html[JAAS Login Configuration File] for details.
+
+| <prefix>.ssl.endpoint.identification.algorithm
+| The endpoint identification algorithm used to validate server hostname in the server certificate. Default is `https`.
+If the value is set to an empty string, this will disable server host name verification.
+
+| <prefix>.ssl.keystore.location
+| The location of the key store file.
+
+| <prefix>.ssl.keystore.password
+| The password of the key store file.
+
+| <prefix>.ssl.key.password
+| The password of the private key in the key store file or the PEM key specified in `ssl.keystore.key`.
+
+| <prefix>.ssl.truststore.location
+| The location of the trust store file.
+
+| <prefix>.ssl.truststore.password
+| The password for the trust store file.
+|===
If there is a https://docs.confluent.io/platform/current/schema-registry/index.html[schema registry service] containing the record schema of the source topic, please add it to the data source configuration:
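+
+// The sketch below is illustrative only: the value.converter.schema.registry.url
+// key is the Confluent-style converter setting and the URL is a placeholder;
+// verify the exact field name against your TigerGraph version.
+[source,json]
+----
+{
+    "type": "mirrormaker",
+    "source.cluster.bootstrap.servers": "",
+    "value.converter.schema.registry.url": "http://schema-registry:8081"
+}
+----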