diff --git a/.travis.yml b/.travis.yml index d97b432..9a9aab4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,10 +1,10 @@ +dist: trusty language: java jdk: - oraclejdk8 -sudo: false install: - - mvn test-compile -DskipTests=true -Dmaven.javadoc.skip=true -B -V + - mvn test-compile -DskipTests=true -Dmaven.javadoc.skip=true -B -V script: - - mvn test jacoco:report + - mvn test jacoco:report after_success: - - mvn coveralls:report \ No newline at end of file + - mvn coveralls:report diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..7ba7a9d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,7 @@ +FROM confluentinc/cp-kafka-connect-base:5.5.0 + +ARG PROJECT_VERSION +ENV CONNECT_PLUGIN_PATH="/usr/share/java,/usr/share/confluent-hub-components" + +COPY ./target/components/packages/mmolimar-kafka-connect-fs-${PROJECT_VERSION}.zip /tmp/kafka-connect-fs.zip +RUN confluent-hub install --no-prompt /tmp/kafka-connect-fs.zip && rm -rf /tmp/kafka-connect-fs.zip diff --git a/LICENSE b/LICENSE index d645695..1c358a6 100644 --- a/LICENSE +++ b/LICENSE @@ -187,7 +187,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2017 Mario Molina Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README.md b/README.md index 76d3961..40d1b27 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ # Kafka Connect FileSystem Connector [![Build Status](https://travis-ci.org/mmolimar/kafka-connect-fs.svg?branch=master)](https://travis-ci.org/mmolimar/kafka-connect-fs)[![Coverage Status](https://coveralls.io/repos/github/mmolimar/kafka-connect-fs/badge.svg?branch=master)](https://coveralls.io/github/mmolimar/kafka-connect-fs?branch=master) -**kafka-connect-fs** is a [Kafka Connector](http://kafka.apache.org/documentation.html#connect) +**kafka-connect-fs** is a [Kafka Connector](https://kafka.apache.org/documentation.html#connect) for reading records from files in the file systems specified and load them into Kafka. -Documentation for this connector can be found [here](http://kafka-connect-fs.readthedocs.io/). +Documentation for this connector can be found [here](https://kafka-connect-fs.readthedocs.io/). ## Development @@ -13,7 +13,7 @@ kafka-connect-fs with Maven using the standard lifecycle phases. 
## FAQ Some frequently asked questions on Kafka Connect FileSystem Connector can be found here - -http://kafka-connect-fs.readthedocs.io/en/latest/faq.html +https://kafka-connect-fs.readthedocs.io/en/latest/faq.html ## Contribute diff --git a/config/kafka-connect-fs.properties b/config/kafka-connect-fs.properties index 28ab531..aab1ae6 100644 --- a/config/kafka-connect-fs.properties +++ b/config/kafka-connect-fs.properties @@ -1,9 +1,9 @@ name=FsSourceConnector connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector tasks.max=1 -fs.uris=file:///data,hdfs://localhost:9000/ +fs.uris=file:///data,hdfs://localhost:8020/data topic=mytopic policy.class=com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy policy.recursive=true -policy.regexp=^[0-9]*\.txt$ +policy.regexp=^.*\.txt$ file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..e763372 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,77 @@ +version: '3' +services: + cp-zookeeper: + image: confluentinc/cp-zookeeper:5.5.0 + hostname: zookeeper + container_name: zookeeper + ports: + - "2181:2181" + environment: + ZOOKEEPER_CLIENT_PORT: 2181 + ZOOKEEPER_TICK_TIME: 2000 + + cp-kafka: + image: confluentinc/cp-kafka:5.5.0 + hostname: kafka + container_name: kafka + depends_on: + - cp-zookeeper + ports: + - "29092:29092" + - "9092:9092" + environment: + KAFKA_BROKER_ID: 1 + KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181' + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 + CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: kafka:29092 + CONFLUENT_METRICS_REPORTER_ZOOKEEPER_CONNECT: zookeeper:2181 + CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1 + CONFLUENT_METRICS_ENABLE: 'false' + + cp-schema-registry: + image: confluentinc/cp-schema-registry:5.5.0 + hostname: schema-registry + container_name: schema-registry + depends_on: + - cp-zookeeper + - cp-kafka + ports: + - "8081:8081" + environment: + SCHEMA_REGISTRY_HOST_NAME: schema-registry + SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181' + + connect-fs: + image: mmolimar/kafka-connect-fs:1.0.0 + container_name: connect + depends_on: + - cp-kafka + - cp-schema-registry + ports: + - "8083:8083" + - "8000:8000" + environment: + CONNECT_BOOTSTRAP_SERVERS: 'kafka:29092' + CONNECT_REST_ADVERTISED_HOST_NAME: connect + CONNECT_REST_PORT: 8083 + CONNECT_GROUP_ID: compose-connect-group + CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs + CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1 + CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000 + CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets + CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1 + CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status + CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1 + CONNECT_KEY_CONVERTER: org.apache.kafka.connect.storage.StringConverter + CONNECT_VALUE_CONVERTER: io.confluent.connect.avro.AvroConverter + CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: http://schema-registry:8081 + CONNECT_INTERNAL_KEY_CONVERTER: "org.apache.kafka.connect.json.JsonConverter" + CONNECT_INTERNAL_VALUE_CONVERTER: "org.apache.kafka.connect.json.JsonConverter" + CONNECT_ZOOKEEPER_CONNECT: 'zookeeper:2181' + CONNECT_PLUGIN_PATH: "/usr/share/java,/usr/share/confluent-hub-components/" + CONNECT_LOG4J_ROOT_LOGLEVEL: "INFO" + 
CONNECT_LOG4J_LOGGERS: org.apache.zookeeper=ERROR,org.I0Itec.zkclient=ERROR,org.reflections=ERROR + KAFKA_OPTS: "-agentlib:jdwp=transport=dt_socket,server=y,address=8000,suspend=n" diff --git a/docs/Makefile b/docs/Makefile index 4dea114..9aeda1f 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -17,4 +17,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source/conf.py b/docs/source/conf.py index d2ffa24..f6edf0c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -55,9 +55,9 @@ # built documents. # # The short X.Y version. -version = '0.1' +version = '1.0' # The full version, including alpha/beta/rc tags. -release = '0.1' +release = '1.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/source/config_options.rst b/docs/source/config_options.rst index 6eaf081..0a69105 100644 --- a/docs/source/config_options.rst +++ b/docs/source/config_options.rst @@ -48,53 +48,60 @@ General config properties for this connector. If you want to ingest data from dynamic directories, this is, directories created every day and avoiding to add new URIs or look for files from a parent directory, you can include expressions in the URIs to do that. For example, for this URI ``file:///data/${yyyy}``, it will be - converted to ``file:///data/2017`` (when executing whe policy). + converted to ``file:///data/2020`` (when executing whe policy). You can use as many as you like in the URIs, for instance: ``file:///data/${yyyy}/${MM}/${dd}/${HH}${mm}`` .. tip:: - If you want to ingest data from S3, you can add credentials with : + If you want to ingest data from S3, you can add credentials with: ``policy.fs.fs.s3a.access.key=`` and ``policy.fs.fs.s3a.secret.key=``   ``topic`` - Topic in which copy data. + Topic in which copy data to. * Type: string * Importance: high +``poll.interval.ms`` + Frequency in milliseconds to poll for new data. This config just applies when the policies have ended. + + * Type: int + * Default: ``10000`` + * Importance: medium + ``policy.class`` Policy class to apply (must implement ``com.github.mmolimar.kafka.connect.fs.policy.Policy`` interface). * Type: string * Importance: high -``policy.recursive`` - Flag to activate traversed recursion in subdirectories when listing files. - - * Type: boolean - * Default: false - * Importance: medium - ``policy.regexp`` Regular expression to filter files from the FS. * Type: string * Importance: high +``policy.recursive`` + Flag to activate traversed recursion in subdirectories when listing files. + + * Type: boolean + * Default: ``false`` + * Importance: medium + ``policy..`` - This represents the custom properties you can include based on the policy class specified. + This represents custom properties you can include based on the policy class specified. - * Type: depending on the policy. - * Importance: depending on the policy. + * Type: based on the policy. + * Importance: based on the policy. ``policy.fs.`` Custom properties to use for the FS. - * Type: depending on the FS. - * Importance: depending on the FS. + * Type: based on the FS. + * Importance: based on the FS. 
``file_reader.class`` File reader class to read files from the FS (must implement @@ -104,10 +111,10 @@ General config properties for this connector. * Importance: high ``file_reader..`` - This represents the custom properties you can include based on the file reader class specified. + This represents custom properties you can include based on the file reader class specified. - * Type: depending on the file reader. - * Importance: depending on the file reader. + * Type: based on the file reader. + * Importance: based on the file reader. .. _config_options-policies: @@ -142,7 +149,7 @@ In order to configure custom properties for this policy, the name you must use i Sleep fraction to divide the sleep time to allow interrupting the policy faster. * Type: long - * Default: 10 + * Default: ``10`` * Importance: medium ``policy.sleepy.max_execs`` @@ -150,15 +157,50 @@ In order to configure custom properties for this policy, the name you must use i An execution represents: listing files from the FS and its corresponding sleep time. * Type: long - * Default: -1 + * Default: ``-1`` + * Importance: medium + +.. _config_options-policies-cron: + +Cron +-------------------------------------------- + +In order to configure custom properties for this policy, the name you must use is ``cron``. + +``policy.cron.expression`` + Cron expression to schedule the policy. + + * Type: string + * Importance: high + +``policy.cron.end_date`` + End date to finish the policy with `ISO date-time `__ + format. + + * Type: date + * Default: ``null`` * Importance: medium .. _config_options-policies-hdfs: -Hdfs file watcher +HDFS file watcher -------------------------------------------- -This policy does not have any additional configuration. +In order to configure custom properties for this policy, the name you must use is ``hdfs_file_watcher``. + +``policy.hdfs_file_watcher.poll`` + Time to wait until the records retrieved from the file watcher will be sent to the source task. + + * Type: long + * Default: ``5000`` + * Importance: medium + +``policy.hdfs_file_watcher.retry`` + Sleep time to retry connections to HDFS in case of connection errors happened. + + * Type: long + * Default: ``20000`` + * Importance: medium .. _config_options-filereaders: @@ -176,7 +218,7 @@ Avro In order to configure custom properties for this reader, the name you must use is ``avro``. ``file_reader.avro.schema`` - AVRO schema in JSON format to use when reading a file. + Avro schema in JSON format to use when reading a file. If not specified, the reader will use the schema defined in the file. * Type: string @@ -190,13 +232,13 @@ Parquet In order to configure custom properties for this reader, the name you must use is ``parquet``. ``file_reader.parquet.schema`` - AVRO schema in JSON format to use when reading a file. + Avro schema in JSON format to use when reading a file. * Type: string * Importance: medium ``file_reader.parquet.projection`` - AVRO schema in JSON format to use for projecting fields from records in a file. + Avro schema in JSON format to use for projecting fields from records in a file. * Type: string * Importance: medium @@ -208,110 +250,666 @@ SequenceFile In order to configure custom properties for this reader, the name you must use is ``sequence``. +``file_reader.sequence.field_name.key`` + Custom field name for the output key to include in the Kafka message. 
+ + * Type: string + * Default: ``key`` + * Importance: medium + +``file_reader.sequence.field_name.value`` + Custom field name for the output value to include in the Kafka message. + + * Type: string + * Default: ``value`` + * Importance: medium + ``file_reader.sequence.buffer_size`` Custom buffer size to read data from the Sequence file. * Type: int - * Default: 4096 + * Default: ``4096`` + * Importance: low + +.. _config_options-filereaders-json: + +JSON +-------------------------------------------- + +To configure custom properties for this reader, the name you must use is ``json``. + +``file_reader.json.record_per_line`` + If enabled, the reader will read each line as a record. Otherwise, the reader will read the full + content of the file as a record. + + * Type: boolean + * Default: ``true`` * Importance: medium -``file_reader.sequence.field_name.key`` - Custom field name for the output key to include in the Kafka message. +``file_reader.json.deserialization.`` + Deserialization feature to use when reading a JSON file. You can add as much as you like + based on the ones defined `here. `__ + + * Type: boolean + * Importance: medium + +``file_reader.json.encoding`` + Encoding to use for reading a file. If not specified, the reader will use the default encoding. * Type: string - * Default: key + * Default: based on the locale and charset of the underlying operating system. + * Importance: medium + +``file_reader.json.compression.type`` + Compression type to use when reading a file. + + * Type: enum (available values ``bzip2``, ``gzip`` and ``none``) + * Default: ``none`` + * Importance: medium + +``file_reader.json.compression.concatenated`` + Flag to specify if the decompression of the reader will finish at the end of the file or after + the first compressed stream. + + * Type: boolean + * Default: ``true`` * Importance: low -``file_reader.sequence.field_name.value`` - Custom field name for the output value to include in the Kafka message. +.. _config_options-filereaders-csv: + +CSV +-------------------------------------------- + +To configure custom properties for this reader, the name you must use is ``delimited`` (even though it's for CSV). + +``file_reader.delimited.settings.format.delimiter`` + Field delimiter. * Type: string - * Default: value + * Default: ``,`` + * Importance: high + +``file_reader.delimited.settings.header`` + If the file contains header or not. + + * Type: boolean + * Default: ``false`` + * Importance: high + +``file_reader.delimited.settings.schema`` + A comma-separated list of ordered data types for each field in the file. Possible values: ``byte``, ``short``, + ``int``, ``long``, ``float``, ``double``, ``boolean``, ``bytes`` and ``string``) + + * Type: string[] + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.data_type_mapping_error`` + Flag to enable/disable throwing errors when mapping data types based on the schema is not possible. If disabled, + the returned value which could not be mapped will be ``null``. + + * Type: boolean + * Default: ``true`` + * Importance: medium + +``file_reader.delimited.settings.allow_nulls`` + If the schema supports nullable fields. If ``file_reader.delimited.settings.data_type_mapping_error`` config flag is + disabled, the value set for this config will be ignored and set to ``true``. + + * Type: boolean + * Default: ``false`` + * Importance: medium + +``file_reader.delimited.settings.header_names`` + A comma-separated list of ordered field names to set when reading a file. 
+ + * Type: string[] + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.null_value`` + Default value for ``null`` values. + + * Type: string + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.empty_value`` + Default value for empty values (empty values within quotes). + + * Type: string + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.format.line_separator`` + Line separator to be used. + + * Type: string + * Default: ``\n`` + * Importance: medium + +``file_reader.delimited.settings.max_columns`` + Default value for ``null`` values. + + * Type: int + * Default: ``512`` * Importance: low -.. _config_options-filereaders-text: +``file_reader.delimited.settings.max_chars_per_column`` + Default value for ``null`` values. -Text + * Type: int + * Default: ``4096`` + * Importance: low + +``file_reader.delimited.settings.rows_to_skip`` + Number of rows to skip. + + * Type: long + * Default: ``0`` + * Importance: low + +``file_reader.delimited.settings.line_separator_detection`` + If the reader should detect the line separator automatically. + + * Type: boolean + * Default: ``false`` + * Importance: low + +``file_reader.delimited.settings.delimiter_detection`` + If the reader should detect the delimiter automatically. + + * Type: boolean + * Default: ``false`` + * Importance: low + +``file_reader.delimited.settings.ignore_leading_whitespaces`` + Flag to enable/disable skipping leading whitespaces from values. + + * Type: boolean + * Default: ``true`` + * Importance: low + +``file_reader.delimited.settings.ignore_trailing_whitespaces`` + Flag to enable/disable skipping trailing whitespaces from values. + + * Type: boolean + * Default: ``true`` + * Importance: low + +``file_reader.delimited.settings.format.comment`` + Character that represents a line comment at the beginning of a line. + + * Type: char + * Default: ``#`` + * Importance: low + +``file_reader.delimited.settings.escape_unquoted`` + Flag to enable/disable processing escape sequences in unquoted values. + + * Type: boolean + * Default: ``false`` + * Importance: low + +``file_reader.delimited.settings.format.quote`` + Character used for escaping values where the field delimiter is part of the value. + + * Type: char + * Default: ``"`` + * Importance: low + +``file_reader.delimited.settings.format.quote_escape`` + Character used for escaping quotes inside an already quoted value. + + * Type: char + * Default: ``"`` + * Importance: low + +``file_reader.delimited.encoding`` + Encoding to use for reading a file. If not specified, the reader will use the default encoding. + + * Type: string + * Default: based on the locale and charset of the underlying operating system. + * Importance: medium + +``file_reader.delimited.compression.type`` + Compression type to use when reading a file. + + * Type: enum (available values ``bzip2``, ``gzip`` and ``none``) + * Default: ``none`` + * Importance: medium + +``file_reader.delimited.compression.concatenated`` + Flag to specify if the decompression of the reader will finish at the end of the file or after + the first compressed stream. + + * Type: boolean + * Default: ``true`` + * Importance: low + +.. _config_options-filereaders-tsv: + +TSV -------------------------------------------- -In order to configure custom properties for this reader, the name you must use is ``text``. +To configure custom properties for this reader, the name you must use is ``delimited`` (even though it's for TSV). 
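As a concrete illustration of these shared ``delimited`` settings (the CSV options above and the TSV options listed next reuse the same ``file_reader.delimited.`` prefix), the sketch below combines a few of the documented keys. The reader class name is an assumption — this section only defines the property prefix — and the delimiter, schema and compression values are arbitrary example choices, not defaults.

.. sourcecode:: properties

    # Assumed CSV reader class name; only the property keys below come from this documentation.
    file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.CsvFileReader
    # Treat the first line of each file as a header with the column names.
    file_reader.delimited.settings.header=true
    # Use a semicolon instead of the default comma as field delimiter.
    file_reader.delimited.settings.format.delimiter=;
    # Map the three columns to typed values instead of plain strings.
    file_reader.delimited.settings.schema=int,string,double
    # Return null instead of failing when a value cannot be mapped to its type.
    file_reader.delimited.settings.data_type_mapping_error=false
    # Read gzip-compressed files transparently.
    file_reader.delimited.compression.type=gzip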
-``file_reader.text.field_name.value`` - Custom field name for the output value to include in the Kafka message. +``file_reader.delimited.settings.header`` + If the file contains header or not. + + * Type: boolean + * Default: ``false`` + * Importance: high + +``file_reader.delimited.settings.schema`` + A comma-separated list of ordered data types for each field in the file. Possible values: ``byte``, ``short``, + ``int``, ``long``, ``float``, ``double``, ``boolean``, ``bytes`` and ``string``) + + * Type: string[] + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.data_type_mapping_error`` + Flag to enable/disable throwing errors when mapping data types based on the schema is not possible. If disabled, + the returned value which could not be mapped will be ``null``. + + * Type: boolean + * Default: ``true`` + * Importance: medium + +``file_reader.delimited.settings.allow_nulls`` + If the schema supports nullable fields. If ``file_reader.delimited.settings.data_type_mapping_error`` config flag is + disabled, the value set for this config will be ignored and set to ``true``. + + * Type: boolean + * Default: ``false`` + * Importance: medium + +``file_reader.delimited.settings.header_names`` + A comma-separated list of ordered field names to set when reading a file. + + * Type: string[] + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.null_value`` + Default value for ``null`` values. + + * Type: string + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.format.line_separator`` + Line separator to be used. * Type: string - * Default: value + * Default: ``\n`` + * Importance: medium + +``file_reader.delimited.settings.max_columns`` + Default value for ``null`` values. + + * Type: int + * Default: ``512`` * Importance: low -``file_reader.text.encoding`` +``file_reader.delimited.settings.max_chars_per_column`` + Default value for ``null`` values. + + * Type: int + * Default: ``4096`` + * Importance: low + +``file_reader.delimited.settings.rows_to_skip`` + Number of rows to skip. + + * Type: long + * Default: ``0`` + * Importance: low + +``file_reader.delimited.settings.line_separator_detection`` + If the reader should detect the line separator automatically. + + * Type: boolean + * Default: ``false`` + * Importance: low + +``file_reader.delimited.settings.line_joining`` + Identifies whether or lines ending with the escape character and followed by a line + separator character should be joined with the following line. + + * Type: boolean + * Default: ``true`` + * Importance: low + +``file_reader.delimited.settings.ignore_leading_whitespaces`` + Flag to enable/disable skipping leading whitespaces from values. + + * Type: boolean + * Default: ``true`` + * Importance: low + +``file_reader.delimited.settings.ignore_trailing_whitespaces`` + Flag to enable/disable skipping trailing whitespaces from values. + + * Type: boolean + * Default: ``true`` + * Importance: low + +``file_reader.delimited.settings.format.comment`` + Character that represents a line comment at the beginning of a line. + + * Type: char + * Default: ``#`` + * Importance: low + +``file_reader.delimited.settings.format.escape`` + Character used for escaping special characters. + + * Type: char + * Default: ``\`` + * Importance: low + +``file_reader.delimited.settings.format.escaped_char`` + Character used to represent an escaped tab. 
+ + * Type: char + * Default: ``t`` + * Importance: low + +``file_reader.delimited.encoding`` Encoding to use for reading a file. If not specified, the reader will use the default encoding. * Type: string + * Default: based on the locale and charset of the underlying operating system. + * Importance: medium + +``file_reader.delimited.compression.type`` + Compression type to use when reading a file. + + * Type: enum (available values ``bzip2``, ``gzip`` and ``none``) + * Default: ``none`` * Importance: medium -.. _config_options-filereaders-delimited: +``file_reader.delimited.compression.concatenated`` + Flag to specify if the decompression of the reader will finish at the end of the file or after + the first compressed stream. -Delimited text + * Type: boolean + * Default: ``true`` + * Importance: low + +.. _config_options-filereaders-fixedwidth: + +FixedWidth -------------------------------------------- -In order to configure custom properties for this reader, the name you must use is ``delimited``. +To configure custom properties for this reader, the name you must use is ``delimited`` (even though it's for FixedWidth). -``file_reader.delimited.token`` - The token delimiter for columns. +``file_reader.delimited.settings.field_lengths`` + A comma-separated ordered list of integers with the lengths of each field. - * Type: string + * Type: int[] * Importance: high -``file_reader.delimited.header`` +``file_reader.delimited.settings.header`` If the file contains header or not. * Type: boolean - * Default: false + * Default: ``false`` + * Importance: high + +``file_reader.delimited.settings.schema`` + A comma-separated list of ordered data types for each field in the file. Possible values: ``byte``, ``short``, + ``int``, ``long``, ``float``, ``double``, ``boolean``, ``bytes`` and ``string``) + + * Type: string[] + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.data_type_mapping_error`` + Flag to enable/disable throwing errors when mapping data types based on the schema is not possible. If disabled, + the returned value which could not be mapped will be ``null``. + + * Type: boolean + * Default: ``true`` + * Importance: medium + +``file_reader.delimited.settings.allow_nulls`` + If the schema supports nullable fields. If ``file_reader.delimited.settings.data_type_mapping_error`` config flag is + disabled, the value set for this config will be ignored and set to ``true``. + + * Type: boolean + * Default: ``false`` + * Importance: medium + +``file_reader.delimited.settings.header_names`` + A comma-separated list of ordered field names to set when reading a file. + + * Type: string[] + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.keep_padding`` + If the padding character should be kept in each value. + + * Type: boolean + * Default: ``false`` + * Importance: medium + +``file_reader.delimited.settings.padding_for_headers`` + If headers have the default padding specified. + + * Type: boolean + * Default: ``true`` + * Importance: medium + +``file_reader.delimited.settings.null_value`` + Default value for ``null`` values. + + * Type: string + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.format.ends_on_new_line`` + Line separator to be used. + + * Type: boolean + * Default: ``true`` * Importance: medium +``file_reader.delimited.settings.format.line_separator`` + Line separator to be used. 
+ + * Type: string + * Default: ``\n`` + * Importance: medium + +``file_reader.delimited.settings.format.padding`` + The padding character used to represent unwritten spaces. + + * Type: char + * Default: `` `` + * Importance: medium + +``file_reader.delimited.settings.max_columns`` + Default value for ``null`` values. + + * Type: int + * Default: ``512`` + * Importance: low + +``file_reader.delimited.settings.max_chars_per_column`` + Default value for ``null`` values. + + * Type: int + * Default: ``4096`` + * Importance: low + +``file_reader.delimited.settings.skip_trailing_chars`` + If the trailing characters beyond the record's length should be skipped. + + * Type: boolean + * Default: ``false`` + * Importance: low + +``file_reader.delimited.settings.rows_to_skip`` + Number of rows to skip. + + * Type: long + * Default: ``0`` + * Importance: low + +``file_reader.delimited.settings.line_separator_detection`` + If the reader should detect the line separator automatically. + + * Type: boolean + * Default: ``false`` + * Importance: low + +``file_reader.delimited.settings.ignore_leading_whitespaces`` + Flag to enable/disable skipping leading whitespaces from values. + + * Type: boolean + * Default: ``true`` + * Importance: low + +``file_reader.delimited.settings.ignore_trailing_whitespaces`` + Flag to enable/disable skipping trailing whitespaces from values. + + * Type: boolean + * Default: ``true`` + * Importance: low + +``file_reader.delimited.settings.format.comment`` + Character that represents a line comment at the beginning of a line. + + * Type: char + * Default: ``#`` + * Importance: low + ``file_reader.delimited.encoding`` Encoding to use for reading a file. If not specified, the reader will use the default encoding. * Type: string + * Default: based on the locale and charset of the underlying operating system. + * Importance: medium + +``file_reader.delimited.compression.type`` + Compression type to use when reading a file. + + * Type: enum (available values ``bzip2``, ``gzip`` and ``none``) + * Default: ``none`` * Importance: medium -``file_reader.delimited.default_value`` - Sets a default value in a column when its value is null. This is due to the record is malformed (it does not contain - all expected columns). +``file_reader.delimited.compression.concatenated`` + Flag to specify if the decompression of the reader will finish at the end of the file or after + the first compressed stream. + + * Type: boolean + * Default: ``true`` + * Importance: low + +.. _config_options-filereaders-text: + +Text +-------------------------------------------- + +To configure custom properties for this reader, the name you must use is ``text``. + +``file_reader.json.record_per_line`` + If enabled, the reader will read each line as a record. Otherwise, the reader will read the full + content of the file as a record. + + * Type: boolean + * Default: ``true`` + * Importance: medium + +``file_reader.text.field_name.value`` + Custom field name for the output value to include in the Kafka message. + + * Type: string + * Default: ``value`` + * Importance: medium + +``file_reader.text.encoding`` + Encoding to use for reading a file. If not specified, the reader will use the default encoding. * Type: string - * Default: null + * Default: based on the locale and charset of the underlying operating system. + * Importance: medium + +``file_reader.json.compression.type`` + Compression type to use when reading a file. 
+ + * Type: enum (available values ``bzip2``, ``gzip`` and ``none``) + * Default: ``none`` + * Importance: medium + +``file_reader.json.compression.concatenated`` + Flag to specify if the decompression of the reader will finish at the end of the file or after + the first compressed stream. + + * Type: boolean + * Default: ``true`` * Importance: low +.. _config_options-filereaders-agnostic: + Agnostic -------------------------------------------- -In order to configure custom properties for this reader, the name you must use is ``agnostic``. +To configure custom properties for this reader, the name you must use is ``agnostic``. ``file_reader.agnostic.extensions.parquet`` A comma-separated string list with the accepted extensions for Parquet files. * Type: string - * Default: parquet + * Default: ``parquet`` * Importance: medium ``file_reader.agnostic.extensions.avro`` A comma-separated string list with the accepted extensions for Avro files. * Type: string - * Default: avro + * Default: ``avro`` * Importance: medium ``file_reader.agnostic.extensions.sequence`` A comma-separated string list with the accepted extensions for Sequence files. * Type: string - * Default: seq + * Default: ``seq`` * Importance: medium -``file_reader.agnostic.extensions.delimited`` - A comma-separated string list with the accepted extensions for Delimited text files. +``file_reader.agnostic.extensions.json`` + A comma-separated string list with the accepted extensions for JSON files. * Type: string - * Default: tsv,csv + * Default: ``json`` * Importance: medium + +``file_reader.agnostic.extensions.csv`` + A comma-separated string list with the accepted extensions for CSV files. + + * Type: string + * Default: ``csv`` + * Importance: medium + +``file_reader.agnostic.extensions.tsv`` + A comma-separated string list with the accepted extensions for TSV files. + + * Type: string + * Default: ``tsv`` + * Importance: medium + +``file_reader.agnostic.extensions.fixed`` + A comma-separated string list with the accepted extensions for fixed-width files. + + * Type: string + * Default: ``fixed`` + * Importance: medium + +.. note:: The Agnostic reader uses the previous ones as inner readers. So, in case of using this + reader, you'll probably need to include also the specified properties for those + readers in the connector configuration as well. diff --git a/docs/source/connector.rst b/docs/source/connector.rst index 48cd0e0..476aa7b 100644 --- a/docs/source/connector.rst +++ b/docs/source/connector.rst @@ -12,9 +12,11 @@ of this abstraction and using it in a transparent way. Among others, these are some file systems it supports: * HDFS. -* WebHDFS. * S3. -* FTP and SFTP. +* Google Cloud Storage. +* Azure Blob Storage & Azure Data Lake Store. +* FTP. +* WebHDFS. * Local File System. * Hadoop Archive File System. @@ -24,8 +26,9 @@ Getting started Prerequisites -------------------------------------------- -- Confluent Platform 3.1.1 +- Apache Kafka 2.5.0 - Java 8 +- Confluent Schema Registry (recommended). 
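Before the build and run instructions that follow, here is a hedged sketch tying the Agnostic reader note above to concrete properties: the extension mappings and the inner readers' own settings sit side by side in the same connector configuration. The ``AgnosticFileReader`` class name is an assumption (the docs above only name the ``agnostic`` property prefix), and the ``jsonl``/``dat`` extensions are arbitrary illustration values; every property key is taken from the sections above.

.. sourcecode:: properties

    # Assumed class name for the Agnostic reader.
    file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader
    # Route *.json and *.jsonl files to the JSON inner reader.
    file_reader.agnostic.extensions.json=json,jsonl
    # Route *.csv and *.dat files to the CSV inner reader.
    file_reader.agnostic.extensions.csv=csv,dat
    # Settings for the inner readers are declared with their own prefixes.
    file_reader.json.record_per_line=true
    file_reader.delimited.settings.header=true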
Building from source -------------------------------------------- @@ -44,7 +47,7 @@ The ``kafka-connect-fs.properties`` file defines the following properties as req name=FsSourceConnector connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector tasks.max=1 - fs.uris=file:///data,hdfs://localhost:9000/ + fs.uris=file:///data,hdfs://localhost:8020/data topic=mytopic policy.class= policy.recursive=true @@ -67,18 +70,36 @@ The ``kafka-connect-fs.properties`` file defines the following properties as req A more detailed information about these properties can be found :ref:`here`. -Running in development +Running in local -------------------------------------------- .. sourcecode:: bash - export CONFLUENT_HOME=/path/to/confluent/install/dir + export KAFKA_HOME=/path/to/kafka/install/dir .. sourcecode:: bash mvn clean package export CLASSPATH="$(find target/ -type f -name '*.jar'| grep '\-package' | tr '\n' ':')" - $CONFLUENT_HOME/bin/connect-standalone $CONFLUENT_HOME/etc/schema-registry/connect-avro-standalone.properties config/kafka-connect-fs.properties + $KAFKA_HOME/bin/connect-standalone.sh $KAFKA_HOME/config/connect-standalone.properties config/kafka-connect-fs.properties + +Running in Docker +-------------------------------------------- + +.. sourcecode:: bash + + mvn clean package + +.. sourcecode:: bash + + docker build --build-arg PROJECT_VERSION= . + docker-compose build + docker-compose up -d + docker logs --tail="all" -f connect + +.. sourcecode:: bash + + curl -sX GET http://localhost:8083/connector-plugins | grep FsSourceConnector Components ============================================ @@ -91,7 +112,7 @@ Policies In order to ingest data from the FS(s), the connector needs a **policy** to define the rules to do it. -Basically, the policy tries to connect to each FS included in ``fs.uris`` connector property, list files +Basically, the policy tries to connect to each FS included in ``fs.uris`` connector property, lists files (and filter them using the regular expression provided in the ``policy.regexp`` property) and enables a file reader to read records from them. diff --git a/docs/source/faq.rst b/docs/source/faq.rst index 49e9ef7..1041bc4 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -4,7 +4,7 @@ FAQs ******************************************** -**My file was already processed and the connector, when it is executed again, +**My file was already processed and the connector, when it's executed again, processes the same records again.** If during the previous executions the records were sent successfully to Kafka, @@ -33,7 +33,7 @@ the connector everyday.** Don't do this! Take advantage of the dynamic URIs using expressions. -For instance, if you have this URI ``hdfs://host:9000/data/2017``, you can +For instance, if you have this URI ``hdfs://host:9000/data/2020``, you can use this URI ``hdfs://host:9000/data/${yyyy}`` instead. **The connector is too slow to process all URIs I have.** @@ -53,4 +53,3 @@ until throws an exception. It's a matter of time. But the main thing is that you don't have to worry about removing files from the FS when they are being processed. The connector tolerates errors when reading files and continues with the next file. - diff --git a/docs/source/filereaders.rst b/docs/source/filereaders.rst index 37c76f3..f887499 100644 --- a/docs/source/filereaders.rst +++ b/docs/source/filereaders.rst @@ -16,7 +16,7 @@ Parquet Reads files with `Parquet `__ format. 
The reader takes advantage of the Parquet-Avro API and uses the Parquet file -as if it were an Avro file, so the message sent to Kafka is built in the same +as if it was an Avro file, so the message sent to Kafka is built in the same way as the Avro file reader does. .. warning:: Seeking Parquet files is a heavy task because the reader has to @@ -38,28 +38,61 @@ by default but you can customize these field names. More information about properties of this file reader :ref:`here`. -Text +JSON ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Reads plain text files. - -Each line represents one record which will be in a field -named ``value`` in the message sent to Kafka by default but you can -customize these field names. +Reads JSON files which might contain multiple number of fields with their specified +data types. The schema for this sort of records is inferred reading the first record +and marked as optional in the schema all the fields contained. -More information about properties of this file reader :ref:`here`. +More information about properties of this file reader :ref:`here`. -Delimited text +CSV ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Text file reader using a custom token to distinguish different columns on each line. +CSV file reader using a custom token to distinguish different columns on each line. It allows to distinguish a header in the files and set the name of their columns in the message sent to Kafka. If there is no header, the value of each column will be in the field named ``column_N`` (**N** represents the column index) in the message. Also, the token delimiter for columns is configurable. -More information about properties of this file reader :ref:`here`. +This reader is based on the `Univocity CSV parser `__. + +More information about properties of this file reader :ref:`here`. + +TSV +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +TSV file reader using a tab (``\t``) to distinguish different columns on each line. + +Its behaviour is the same one for the CSV file reader regarding the header and the column names. + +This reader is based on the `Univocity TSV parser `__. + +More information about properties of this file reader :ref:`here`. + +FixedWidth +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +FixedWidth is a plain text file reader which distinguishes each column based on the length of each field. + +Its behaviour is the same one for the CSV/TSV file readers regarding the header and the column names. + +This reader is based on the `Univocity Fixed-Width parser `__. + +More information about properties of this file reader :ref:`here`. + +Text +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Reads plain text files. + +Each line represents one record (by default) which will be in a field +named ``value`` in the message sent to Kafka by default but you can +customize these field names. + +More information about properties of this file reader :ref:`here`. Agnostic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -67,14 +100,18 @@ Agnostic Actually, this reader is a wrapper of the readers listing above. It tries to read any kind of file format using an internal reader based on the file extension, -applying the proper one (Parquet, Avro, SecuenceFile, Text or Delimited text). In case of no +applying the proper one (Parquet, Avro, SequenceFile, CSV, TSV or Text). In case of no extension has been matched, the Text file reader will be applied. 
-Default extensions for each format: -* Parquet: .parquet -* Avro: .avro -* SequenceFile: .seq -* Delimited text: .tsv, .csv +Default extensions for each format (configurable): + +* Parquet: ``.parquet`` +* Avro: ``.avro`` +* SequenceFile: ``.seq`` +* JSON: ``.json`` +* CSV: ``.csv`` +* TSV: ``.tsv`` +* FixedWidth: ``.fixed`` * Text: any other sort of file extension. More information about properties of this file reader :ref:`here`. diff --git a/docs/source/policies.rst b/docs/source/policies.rst index abed625..1a5f654 100644 --- a/docs/source/policies.rst +++ b/docs/source/policies.rst @@ -1,10 +1,9 @@ Simple ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -It's a never-ending policy which just filters and processes files included in the corresponding URIs. +It's a policy which just filters and processes files included in the corresponding URIs one time. .. attention:: This policy is more oriented for testing purposes. - It never stops and Kafka Connect is continuously trying to poll data from the FS(s). Sleepy ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -14,14 +13,26 @@ and wait for the next one. Additionally, its custom properties allow to end it. You can learn more about the properties of this policy :ref:`here`. -Hdfs file watcher +Cron ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -It uses Hadoop notifications events and all create/append/close events will be reported -as new files to be ingested. +This policy is scheduled based on cron expressions, and the format to put in the configuration +follows the `Quartz Scheduler `__ library. + +After finishing each execution, the policy sleeps until the next scheduled execution, if applicable. + +You can learn more about the properties of this policy :ref:`here`. + +HDFS file watcher +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It uses Hadoop notification events, and all create/append/rename/close events will be reported +as files to be ingested. Just use it when you have HDFS URIs. -.. attention:: The URIs included in general property ``fs.uris`` will be filtered and only those +You can learn more about the properties of this policy :ref:`here`. + +.. attention:: The URIs included in the general property ``fs.uris`` will be filtered and only those ones which start with the prefix ``hdfs://`` will be watched. Also, this policy will only work for Hadoop versions 2.6.0 or higher.
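To make the Cron policy above more tangible, here is a minimal sketch of the relevant connector properties. The ``CronPolicy`` class name is an assumption (these docs only describe the ``policy.cron.*`` keys), and the Quartz-style expression and end date are hypothetical values chosen to fire every 15 minutes until the end of 2020.

.. sourcecode:: properties

    # Assumed class name for the Cron policy.
    policy.class=com.github.mmolimar.kafka.connect.fs.policy.CronPolicy
    # Quartz-style fields: second, minute, hour, day-of-month, month, day-of-week.
    policy.cron.expression=0 0/15 * * * ?
    # Optional ISO date-time after which the policy is no longer scheduled.
    policy.cron.end_date=2020-12-31T23:59:59

For the HDFS file watcher policy, the equivalent knobs are ``policy.hdfs_file_watcher.poll`` and ``policy.hdfs_file_watcher.retry``, described in the config options section above.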
diff --git a/pom.xml b/pom.xml index 8c2c90e..bd22791 100644 --- a/pom.xml +++ b/pom.xml @@ -4,26 +4,32 @@ com.github.mmolimar.kafka.connect kafka-connect-fs - 0.3-SNAPSHOT + 1.0.0 jar kafka-connect-fs UTF-8 - 0.10.1.0 - 3.1.1 - 2.9.0 - 1.8.1 - 1.9.0 - 4.12 - 3.4 - 1.6.6 - 3.0.2 - 3.6.1 - 3.0.0 - 0.7.9 - 4.3.0 + 2.5.0 + 5.5.0 + 3.2.1 + hadoop3-2.1.2 + 1.11.0 + 2.8.4 + 9.0.2 + 5.6.2 + 4.2 + 2.0.7 + 1.8 + ${maven-compiler.source} + 3.2.0 + 3.8.1 + 3.2.0 + 0.8.5 + 4.3.0 + 3.0.0-M4 + 0.11.3 @@ -37,7 +43,6 @@ io.confluent kafka-connect-avro-converter ${confluent.version} - provided org.apache.hadoop @@ -50,15 +55,19 @@ ${hadoop.version} - org.apache.avro - avro-tools - ${avro.version} - nodeps + org.apache.hadoop + hadoop-azure + ${hadoop.version} + + + org.apache.hadoop + hadoop-azure-datalake + ${hadoop.version} - org.apache.avro - avro - ${avro.version} + com.google.cloud.bigdataoss + gcs-connector + ${gcs-connector.version} org.apache.parquet @@ -66,9 +75,21 @@ ${parquet.version} - junit - junit - ${junit.version} + com.univocity + univocity-parsers + ${univocity.version} + + + com.cronutils + cron-utils + ${cron-utils.version} + + + + + org.junit.jupiter + junit-jupiter + ${junit-jupiter.version} test @@ -77,12 +98,6 @@ ${easymock.version} test - - org.powermock - powermock-module-junit4 - ${powermock.version} - test - org.powermock powermock-api-easymock @@ -118,11 +133,17 @@ ${maven-compiler-plugin.version} true - 1.8 - 1.8 + ${maven-compiler.source} + ${maven-compiler.target} + org.apache.maven.plugins + maven-surefire-plugin + ${maven-surfire-plugin.version} + + + org.apache.maven.plugins maven-assembly-plugin ${maven-assembly-plugin.version} @@ -144,7 +165,7 @@ org.jacoco jacoco-maven-plugin - ${jacoco-maven-plugin.version} + ${maven-jacoco-plugin.version} prepare-agent @@ -157,7 +178,65 @@ org.eluder.coveralls coveralls-maven-plugin - ${coveralls-maven-plugin.version} + ${maven-coveralls-plugin.version} + + + io.confluent + kafka-connect-maven-plugin + ${maven-kafka-connect-plugin.version} + + + + kafka-connect + + + kafka-connect-fs + Kafka Connect FileSystem + https://kafka-connect-fs.readthedocs.io/ + https://github.com/mmolimar/kafka-connect-fs + + Kafka Connect FileSystem Connector is a source connector for reading records from files + in the file systems specified and load them into Kafka. + + Mario Molina + This connector is supported by the open source community. 
+ https://github.com/mmolimar/kafka-connect-fs/issues + mmolimar + user + Mario Molina + https://github.com/mmolimar + mmolimar + kafka-connect-fs + ${project.version} + + source + + + filesystem + files + hadoop + hdfs + aws + s3 + google + gcs + azure + txt + csv + tsv + json + avro + parquet + sequence + + + + atLeastOnce + + true + + + @@ -180,4 +259,4 @@ http://packages.confluent.io/maven/ - \ No newline at end of file + diff --git a/src/main/assembly/package.xml b/src/main/assembly/package.xml index 7962c49..a1b9d19 100644 --- a/src/main/assembly/package.xml +++ b/src/main/assembly/package.xml @@ -36,9 +36,8 @@ org.apache.kafka:connect-api org.mortbay.jetty:* com.sun.jersey:* - org.eclipse.jetty.aggregate:jetty-all + org.eclipse.jetty:jetty-util com.sun.jersey.contribs:jersey-guice - com.google.guava:guava org.apache.zookeeper:zookeeper log4j:log4j org.slf4j:slf4j-api diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java index e6aab15..839477b 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java @@ -1,7 +1,5 @@ package com.github.mmolimar.kafka.connect.fs; -import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; -import com.github.mmolimar.kafka.connect.fs.policy.Policy; import com.github.mmolimar.kafka.connect.fs.util.Version; import org.apache.kafka.common.config.ConfigDef; import org.apache.kafka.common.config.ConfigException; @@ -16,7 +14,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.stream.Collectors; public class FsSourceConnector extends SourceConnector { @@ -34,13 +31,10 @@ public void start(Map properties) { log.info("Starting FsSourceConnector..."); try { config = new FsSourceConnectorConfig(properties); - } catch (ConfigException ce) { - log.error("Couldn't start FsSourceConnector:", ce); throw new ConnectException("Couldn't start FsSourceConnector due to configuration error.", ce); } catch (Exception ce) { - log.error("Couldn't start FsSourceConnector:", ce); - throw new ConnectException("An error has occurred when starting FsSourceConnector" + ce); + throw new ConnectException("An error has occurred when starting FsSourceConnector." 
+ ce); } } @@ -52,15 +46,16 @@ public Class taskClass() { @Override public List> taskConfigs(int maxTasks) { if (config == null) { - throw new ConnectException("Connector config has not been initialized"); + throw new ConnectException("Connector config has not been initialized."); } - List> taskConfigs = new ArrayList<>(); + final List> taskConfigs = new ArrayList<>(); - int groups = Math.min(config.getFsUris().size(), maxTasks); - ConnectorUtils.groupPartitions(config.getFsUris(), groups) + List fsUris = config.getFsUris(); + int groups = Math.min(fsUris.size(), maxTasks); + ConnectorUtils.groupPartitions(fsUris, groups) .forEach(dirs -> { Map taskProps = new HashMap<>(config.originalsStrings()); - taskProps.put(FsSourceConnectorConfig.FS_URIS, dirs.stream().collect(Collectors.joining(","))); + taskProps.put(FsSourceConnectorConfig.FS_URIS, String.join(",", dirs)); taskConfigs.add(taskProps); }); @@ -71,6 +66,7 @@ public List> taskConfigs(int maxTasks) { @Override public void stop() { + log.info("Stopping FsSourceConnector."); //Nothing to do } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnectorConfig.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnectorConfig.java index d20069f..3a3f1ad 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnectorConfig.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnectorConfig.java @@ -13,9 +13,13 @@ public class FsSourceConnectorConfig extends AbstractConfig { public static final String FS_URIS = "fs.uris"; private static final String FS_URIS_DOC = "Comma-separated URIs of the FS(s)."; + private static final String FS_URIS_DISPLAY = "File system URIs"; public static final String TOPIC = "topic"; private static final String TOPIC_DOC = "Topic to copy data to."; + private static final String TOPIC_DISPLAY = "Topic"; + + private static final String CONNECTOR_GROUP = "Connector"; public FsSourceConnectorConfig(ConfigDef config, Map parsedConfig) { super(config, parsedConfig); @@ -26,9 +30,29 @@ public FsSourceConnectorConfig(Map parsedConfig) { } public static ConfigDef conf() { + int order = 0; return new ConfigDef() - .define(FS_URIS, Type.LIST, Importance.HIGH, FS_URIS_DOC) - .define(TOPIC, Type.STRING, Importance.HIGH, TOPIC_DOC); + .define( + FS_URIS, + Type.LIST, + ConfigDef.NO_DEFAULT_VALUE, + Importance.HIGH, + FS_URIS_DOC, + CONNECTOR_GROUP, + ++order, + ConfigDef.Width.LONG, + FS_URIS_DISPLAY + ).define( + TOPIC, + Type.STRING, + ConfigDef.NO_DEFAULT_VALUE, + Importance.HIGH, + TOPIC_DOC, + CONNECTOR_GROUP, + ++order, + ConfigDef.Width.LONG, + TOPIC_DISPLAY + ); } public List getFsUris() { @@ -38,5 +62,4 @@ public List getFsUris() { public String getTopic() { return this.getString(TOPIC); } - -} \ No newline at end of file +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java index b7c97eb..bb2169a 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java @@ -1,12 +1,13 @@ package com.github.mmolimar.kafka.connect.fs; import com.github.mmolimar.kafka.connect.fs.file.FileMetadata; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.policy.Policy; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import 
com.github.mmolimar.kafka.connect.fs.util.Version; import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.common.utils.SystemTime; +import org.apache.kafka.common.utils.Time; import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.source.SourceRecord; @@ -22,11 +23,20 @@ import java.util.stream.StreamSupport; public class FsSourceTask extends SourceTask { + private static final Logger log = LoggerFactory.getLogger(FsSourceTask.class); - private AtomicBoolean stop; + private final AtomicBoolean stop; + private final Time time; + private FsSourceTaskConfig config; private Policy policy; + private int pollInterval; + + public FsSourceTask() { + this.stop = new AtomicBoolean(false); + this.time = new SystemTime(); + } @Override public String version() { @@ -35,65 +45,74 @@ public String version() { @Override public void start(Map properties) { + log.info("Starting FS source task..."); try { config = new FsSourceTaskConfig(properties); - if (config.getClass(FsSourceTaskConfig.POLICY_CLASS).isAssignableFrom(Policy.class)) { throw new ConfigException("Policy class " + - config.getClass(FsSourceTaskConfig.POLICY_CLASS) + "is not a sublass of " + Policy.class); + config.getClass(FsSourceTaskConfig.POLICY_CLASS) + " is not a subclass of " + Policy.class); } if (config.getClass(FsSourceTaskConfig.FILE_READER_CLASS).isAssignableFrom(FileReader.class)) { throw new ConfigException("FileReader class " + - config.getClass(FsSourceTaskConfig.FILE_READER_CLASS) + "is not a sublass of " + FileReader.class); + config.getClass(FsSourceTaskConfig.FILE_READER_CLASS) + " is not a subclass of " + FileReader.class); } Class policyClass = (Class) Class.forName(properties.get(FsSourceTaskConfig.POLICY_CLASS)); - FsSourceTaskConfig taskConfig = new FsSourceTaskConfig(properties); - policy = ReflectionUtils.makePolicy(policyClass, taskConfig); + policy = ReflectionUtils.makePolicy(policyClass, config); + pollInterval = config.getInt(FsSourceTaskConfig.POLL_INTERVAL_MS); } catch (ConfigException ce) { - log.error("Couldn't start FsSourceTask:", ce); - throw new ConnectException("Couldn't start FsSourceTask due to configuration error", ce); - } catch (Throwable t) { - log.error("Couldn't start FsSourceConnector:", t); - throw new ConnectException("A problem has occurred reading configuration:" + t.getMessage()); + log.error("Couldn't start FsSourceTask.", ce); + throw new ConnectException("Couldn't start FsSourceTask due to configuration error: " + ce.getMessage(), ce); + } catch (Exception e) { + log.error("Couldn't start FsSourceConnector.", e); + throw new ConnectException("A problem has occurred reading configuration: " + e.getMessage(), e); } - - stop = new AtomicBoolean(false); + log.info("FS source task started with policy [{}].", policy.getClass().getName()); } @Override - public List poll() throws InterruptedException { - while (stop != null && !stop.get() && !policy.hasEnded()) { - log.trace("Polling for new data"); + public List poll() { + while (!stop.get() && policy != null && !policy.hasEnded()) { + log.trace("Polling for new data..."); - final List results = new ArrayList<>(); - List files = filesToProcess(); - files.forEach(metadata -> { + List totalRecords = filesToProcess().map(metadata -> { + List records = new ArrayList<>(); try (FileReader reader = policy.offer(metadata, context.offsetStorageReader())) { - log.info("Processing records for file {}", metadata); + log.info("Processing records for file 
{}.", metadata); while (reader.hasNext()) { - results.add(convert(metadata, reader.currentOffset(), reader.next())); + records.add(convert(metadata, reader.currentOffset() + 1, reader.next())); } } catch (ConnectException | IOException e) { //when an exception happens reading a file, the connector continues - log.error("Error reading file from FS: " + metadata.getPath() + ". Keep going...", e); + log.error("Error reading file [{}]. Keep going...", metadata.getPath(), e); } - }); - return results; - } + log.debug("Read [{}] records from file [{}].", records.size(), metadata.getPath()); + + return records; + }).flatMap(Collection::stream).collect(Collectors.toList()); + log.debug("Returning [{}] records in execution number [{}] for policy [{}].", + totalRecords.size(), policy.getExecutions(), policy.getClass().getName()); + + return totalRecords; + } + if (pollInterval > 0) { + log.trace("Waiting [{}] ms for next poll.", pollInterval); + time.sleep(pollInterval); + } return null; } - private List filesToProcess() { + private Stream filesToProcess() { try { return asStream(policy.execute()) - .filter(metadata -> metadata.getLen() > 0) - .collect(Collectors.toList()); + .filter(metadata -> metadata.getLen() > 0); } catch (IOException | ConnectException e) { //when an exception happens executing the policy, the connector continues - log.error("Cannot retrive files to process from FS: " + policy.getURIs() + ". Keep going...", e); - return Collections.EMPTY_LIST; + log.error("Cannot retrieve files to process from the FS: {}. " + + "There was an error executing the policy but the task tolerates this and continues.", + policy.getURIs(), e); + return Stream.empty(); } } @@ -102,16 +121,10 @@ private Stream asStream(Iterator src) { return StreamSupport.stream(iterable.spliterator(), false); } - private SourceRecord convert(FileMetadata metadata, Offset offset, Struct struct) { + private SourceRecord convert(FileMetadata metadata, long offset, Struct struct) { return new SourceRecord( - new HashMap() { - { - put("path", metadata.getPath()); - //TODO manage blocks - //put("blocks", metadata.getBlocks().toString()); - } - }, - Collections.singletonMap("offset", offset.getRecordOffset()), + Collections.singletonMap("path", metadata.getPath()), + Collections.singletonMap("offset", offset), config.getTopic(), struct.schema(), struct @@ -120,11 +133,16 @@ private SourceRecord convert(FileMetadata metadata, Offset offset, Struct struct @Override public void stop() { - if (stop != null) { - stop.set(true); - } - if (policy != null) { - policy.interrupt(); + log.info("Stopping FS source task..."); + stop.set(true); + synchronized (this) { + if (policy != null) { + try { + policy.close(); + } catch (IOException ioe) { + log.warn("Error closing policy [{}].", policy.getClass().getName(), ioe); + } + } } } -} \ No newline at end of file +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java index 3b1f4a5..58231fd 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java @@ -11,17 +11,29 @@ public class FsSourceTaskConfig extends FsSourceConnectorConfig { public static final String POLICY_CLASS = POLICY_PREFIX + "class"; private static final String POLICY_CLASS_DOC = "Policy class to apply to this task."; + private static final String POLICY_CLASS_DISPLAY = "Policy"; public static final String 
POLICY_RECURSIVE = POLICY_PREFIX + "recursive"; private static final String POLICY_RECURSIVE_DOC = "Flag to activate traversed recursion in subdirectories when listing files."; + private static final String POLICY_RECURSIVE_DISPLAY = "Recursive directory listing"; public static final String POLICY_REGEXP = POLICY_PREFIX + "regexp"; private static final String POLICY_REGEXP_DOC = "Regular expression to filter files from the FS."; + private static final String POLICY_REGEXP_DISPLAY = "File filter regex"; public static final String POLICY_PREFIX_FS = POLICY_PREFIX + "fs."; public static final String FILE_READER_CLASS = FILE_READER_PREFIX + "class"; private static final String FILE_READER_CLASS_DOC = "File reader class to read files from the FS."; + private static final String FILE_READER_CLASS_DISPLAY = "File reader class"; + + public static final String POLL_INTERVAL_MS = "poll.interval.ms"; + private static final String POLL_INTERVAL_MS_DOC = "Frequency in ms to poll for new data."; + public static final int POLL_INTERVAL_MS_DEFAULT = 10000; + private static final String POLL_INTERVAL_MS_DISPLAY = "Poll Interval (ms)"; + + private static final String POLICY_GROUP = "Policy"; + private static final String CONNECTOR_GROUP = "Connector"; public FsSourceTaskConfig(ConfigDef config, Map parsedConfig) { super(config, parsedConfig); @@ -32,11 +44,58 @@ public FsSourceTaskConfig(Map parsedConfig) { } public static ConfigDef conf() { + int order = 0; return FsSourceConnectorConfig.conf() - .define(POLICY_CLASS, ConfigDef.Type.CLASS, ConfigDef.Importance.HIGH, POLICY_CLASS_DOC) - .define(POLICY_RECURSIVE, ConfigDef.Type.BOOLEAN, Boolean.TRUE, ConfigDef.Importance.LOW, POLICY_RECURSIVE_DOC) - .define(POLICY_REGEXP, ConfigDef.Type.STRING, ".*", ConfigDef.Importance.MEDIUM, POLICY_REGEXP_DOC) - .define(FILE_READER_CLASS, ConfigDef.Type.CLASS, ConfigDef.Importance.HIGH, FILE_READER_CLASS_DOC); + .define( + POLICY_CLASS, + ConfigDef.Type.CLASS, + ConfigDef.NO_DEFAULT_VALUE, + ConfigDef.Importance.HIGH, + POLICY_CLASS_DOC, + POLICY_GROUP, + ++order, + ConfigDef.Width.MEDIUM, + POLICY_CLASS_DISPLAY + ).define( + POLICY_RECURSIVE, + ConfigDef.Type.BOOLEAN, + Boolean.TRUE, + ConfigDef.Importance.MEDIUM, + POLICY_RECURSIVE_DOC, + POLICY_GROUP, + ++order, + ConfigDef.Width.SHORT, + POLICY_RECURSIVE_DISPLAY + ).define( + POLICY_REGEXP, + ConfigDef.Type.STRING, + ".*", + ConfigDef.Importance.MEDIUM, + POLICY_REGEXP_DOC, + POLICY_GROUP, + ++order, + ConfigDef.Width.MEDIUM, + POLICY_REGEXP_DISPLAY + ).define( + FILE_READER_CLASS, + ConfigDef.Type.CLASS, + ConfigDef.NO_DEFAULT_VALUE, + ConfigDef.Importance.HIGH, + FILE_READER_CLASS_DOC, + POLICY_GROUP, + ++order, + ConfigDef.Width.MEDIUM, + FILE_READER_CLASS_DISPLAY + ).define( + POLL_INTERVAL_MS, + ConfigDef.Type.INT, + POLL_INTERVAL_MS_DEFAULT, + ConfigDef.Importance.MEDIUM, + POLL_INTERVAL_MS_DOC, + CONNECTOR_GROUP, + ++order, + ConfigDef.Width.SHORT, + POLL_INTERVAL_MS_DISPLAY + ); } - } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/FileMetadata.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/FileMetadata.java index 45902e9..669b681 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/FileMetadata.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/FileMetadata.java @@ -36,12 +36,9 @@ public boolean equals(Object object) { if (!(object instanceof FileMetadata)) return false; FileMetadata metadata = (FileMetadata) object; - if (this.path.equals(metadata.getPath()) && + return 
this.path.equals(metadata.getPath()) && this.length == metadata.length && - this.blocks.equals(metadata.getBlocks())) { - return true; - } - return false; + this.blocks.equals(metadata.getBlocks()); } public int hashCode() { @@ -65,4 +62,4 @@ public String toString() { return String.format("[offset = %s, length = %s, corrupt = %s]", offset, length, corrupt); } } -} \ No newline at end of file +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/Offset.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/Offset.java deleted file mode 100644 index ca1d530..0000000 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/Offset.java +++ /dev/null @@ -1,7 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file; - -public interface Offset { - - long getRecordOffset(); - -} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java index 4e1b474..d63283f 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java @@ -3,33 +3,47 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.IOException; import java.util.Map; +import java.util.NoSuchElementException; import java.util.stream.Collectors; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; public abstract class AbstractFileReader implements FileReader { + protected final Logger log = LoggerFactory.getLogger(getClass()); + private final FileSystem fs; private final Path filePath; - private ReaderAdapter adapter; + private final ReaderAdapter adapter; + private long offset; - public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter, Map config) { + public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter, Map config) { if (fs == null || filePath == null) { - throw new IllegalArgumentException("fileSystem and filePath are required"); + throw new IllegalArgumentException("File system and file path are required."); } this.fs = fs; this.filePath = filePath; this.adapter = adapter; + this.offset = 0; + + configure(readerConfig(config)); + log.trace("Initialized file reader [{}] for file [{}].", getClass().getName(), filePath); + } - Map readerConf = config.entrySet().stream() + protected final Map readerConfig(Map config) { + return config.entrySet().stream() .filter(entry -> entry.getKey().startsWith(FILE_READER_PREFIX)) - .collect(Collectors.toMap(entry -> entry.getKey(), entry -> entry.getValue())); - configure(readerConf); + .filter(entry -> entry.getValue() != null) + .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().toString())); } - protected abstract void configure(Map config); + protected abstract void configure(Map config); protected FileSystem getFs() { return fs; @@ -40,14 +54,74 @@ public Path getFilePath() { return filePath; } + @Override public final Struct next() { - return adapter.apply(nextRecord()); + if (!hasNext()) { + throw new NoSuchElementException("There are no more records in file: " + getFilePath()); + } + try { + return adapter.apply(nextRecord()); + } catch (ConnectException ce) { + throw ce; + } catch (Exception e) { + throw new 
ConnectException("Error processing next record in file: " + getFilePath(), e); + } + } + + @Override + public long currentOffset() { + return offset; + } + + protected void incrementOffset() { + this.offset++; + } + + protected void setOffset(long offset) { + this.offset = offset; + } + + @Override + public final void seek(long offset) { + if (offset < 0) { + throw new IllegalArgumentException("Record offset must be greater than 0."); + } + checkClosed(); + try { + seekFile(offset); + } catch (IOException ioe) { + throw new ConnectException("Error seeking file: " + getFilePath(), ioe); + } } - protected abstract T nextRecord(); + @Override + public final boolean hasNext() { + checkClosed(); + try { + return hasNextRecord(); + } catch (ConnectException ce) { + throw ce; + } catch (Exception e) { + throw new ConnectException("Error when checking if the reader has more records.", e); + } + } protected ReaderAdapter getAdapter() { return adapter; } + private void checkClosed() { + if (isClosed()) { + throw new ConnectException("File stream is closed!"); + } + } + + protected abstract T nextRecord() throws IOException; + + protected abstract boolean hasNextRecord() throws IOException; + + protected abstract void seekFile(long offset) throws IOException; + + protected abstract boolean isClosed(); + } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java index 5e025da..2630762 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java @@ -1,15 +1,16 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; import java.io.IOException; import java.util.Arrays; -import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; @@ -17,76 +18,87 @@ public class AgnosticFileReader extends AbstractFileReader parquetExtensions, avroExtensions, sequenceExtensions, delimitedExtensions; + private final AbstractFileReader reader; + private Set parquetExtensions, avroExtensions, sequenceExtensions, + jsonExtensions, csvExtensions, tsvExtensions, fixedExtensions; - public AgnosticFileReader(FileSystem fs, Path filePath, Map config) throws IOException { + public AgnosticFileReader(FileSystem fs, Path filePath, Map config) throws Exception { super(fs, filePath, new AgnosticAdapter(), config); try { - reader = (AbstractFileReader) readerByExtension(fs, filePath, config); - } catch (RuntimeException | IOException e) { - throw e; - } catch (Throwable t) { - throw new IOException("An error has ocurred when creating a concrete reader", t); + reader = readerByExtension(fs, filePath, config); + } catch (ConnectException ce) { + throw (Exception) ce.getCause(); } } - private FileReader readerByExtension(FileSystem fs, Path filePath, Map config) - throws Throwable { + private AbstractFileReader readerByExtension(FileSystem fs, Path filePath, Map config) { int index = filePath.getName().lastIndexOf('.'); String extension = index == 
-1 || index == filePath.getName().length() - 1 ? "" : filePath.getName().substring(index + 1).toLowerCase(); - Class clz; + Class clz; if (parquetExtensions.contains(extension)) { clz = ParquetFileReader.class; } else if (avroExtensions.contains(extension)) { clz = AvroFileReader.class; } else if (sequenceExtensions.contains(extension)) { clz = SequenceFileReader.class; - } else if (delimitedExtensions.contains(extension)) { - clz = DelimitedTextFileReader.class; + } else if (jsonExtensions.contains(extension)) { + clz = JsonFileReader.class; + } else if (csvExtensions.contains(extension)) { + clz = CsvFileReader.class; + } else if (tsvExtensions.contains(extension)) { + clz = TsvFileReader.class; + } else if (fixedExtensions.contains(extension)) { + clz = FixedWidthFileReader.class; } else { clz = TextFileReader.class; } - return ReflectionUtils.makeReader(clz, fs, filePath, config); + return (AbstractFileReader) ReflectionUtils.makeReader(clz, fs, filePath, config); } @Override - protected void configure(Map config) { - this.parquetExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET) == null ? - Arrays.asList("parquet") : - Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET).toString().toLowerCase().split(",")); - this.avroExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_AVRO) == null ? - Arrays.asList("avro") : - Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_AVRO).toString().toLowerCase().split(",")); - this.sequenceExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE) == null ? - Arrays.asList("seq") : - Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE).toString().toLowerCase().split(",")); - this.delimitedExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED) == null ? 
- Arrays.asList("tsv", "csv") : - Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED).toString().toLowerCase().split(",")); + protected void configure(Map config) { + this.parquetExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, "parquet") + .toLowerCase().split(",")).collect(Collectors.toSet()); + this.avroExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, "avro") + .toLowerCase().split(",")).collect(Collectors.toSet()); + this.sequenceExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE, "seq") + .toLowerCase().split(",")).collect(Collectors.toSet()); + this.jsonExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_JSON, "json") + .toLowerCase().split(",")).collect(Collectors.toSet()); + this.csvExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_CSV, "csv") + .toLowerCase().split(",")).collect(Collectors.toSet()); + this.tsvExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_TSV, "tsv") + .toLowerCase().split(",")).collect(Collectors.toSet()); + this.fixedExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_FIXED, "fixed") + .toLowerCase().split(",")).collect(Collectors.toSet()); } @Override - public boolean hasNext() { + public boolean hasNextRecord() { return reader.hasNext(); } @Override - public void seek(Offset offset) { + public void seekFile(long offset) { reader.seek(offset); } @Override - public Offset currentOffset() { + public long currentOffset() { return reader.currentOffset(); } @@ -96,15 +108,17 @@ public void close() throws IOException { } @Override - protected AgnosticRecord nextRecord() { + public boolean isClosed() { + return reader.isClosed(); + } + + @Override + protected AgnosticRecord nextRecord() throws IOException { return new AgnosticRecord(reader.getAdapter(), reader.nextRecord()); } static class AgnosticAdapter implements ReaderAdapter { - public AgnosticAdapter() { - } - @Override public Struct apply(AgnosticRecord ag) { return ag.adapter.apply(ag.record); @@ -115,7 +129,7 @@ static class AgnosticRecord { private final ReaderAdapter adapter; private final Object record; - public AgnosticRecord(ReaderAdapter adapter, Object record) { + AgnosticRecord(ReaderAdapter adapter, Object record) { this.adapter = adapter; this.record = record; } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java index 46e5e9f..3db8e3c 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import io.confluent.connect.avro.AvroData; import org.apache.avro.Schema; import org.apache.avro.file.DataFileReader; @@ -11,10 +10,10 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.errors.ConnectException; import java.io.IOException; import java.util.Map; +import java.util.Optional; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; @@ -24,86 +23,66 @@ public class AvroFileReader extends AbstractFileReader { public static final String 
FILE_READER_AVRO_SCHEMA = FILE_READER_AVRO + "schema"; - private final AvroOffset offset; - private DataFileReader reader; + private final DataFileReader reader; private Schema schema; + private boolean closed; public AvroFileReader(FileSystem fs, Path filePath, Map config) throws IOException { super(fs, filePath, new GenericRecordToStruct(), config); AvroFSInput input = new AvroFSInput(FileContext.getFileContext(filePath.toUri()), filePath); - this.reader = new DataFileReader<>(input, new SpecificDatumReader<>(this.schema)); - this.offset = new AvroOffset(0); - } - - protected void configure(Map config) { - if (config.get(FILE_READER_AVRO_SCHEMA) != null) { - this.schema = new Schema.Parser().parse(config.get(FILE_READER_AVRO_SCHEMA).toString()); + if (this.schema == null) { + this.reader = new DataFileReader<>(input, new SpecificDatumReader<>()); } else { - this.schema = null; + this.reader = new DataFileReader<>(input, new SpecificDatumReader<>(this.schema)); } + this.closed = false; + } + + @Override + protected void configure(Map config) { + this.schema = Optional.ofNullable(config.get(FILE_READER_AVRO_SCHEMA)) + .map(c -> new Schema.Parser().parse(c)) + .orElse(null); } @Override - public boolean hasNext() { + public boolean hasNextRecord() { return reader.hasNext(); } @Override protected GenericRecord nextRecord() { GenericRecord record = reader.next(); - this.offset.inc(); + incrementOffset(); return record; } @Override - public void seek(Offset offset) { - try { - reader.sync(offset.getRecordOffset()); - this.offset.setOffset(reader.previousSync() - 15); - } catch (IOException ioe) { - throw new ConnectException("Error seeking file " + getFilePath(), ioe); - } - } - - @Override - public Offset currentOffset() { - return offset; + public void seekFile(long offset) throws IOException { + reader.sync(offset); + setOffset(reader.previousSync() - 16L); } @Override public void close() throws IOException { + closed = true; reader.sync(0); reader.close(); } - public static class AvroOffset implements Offset { - private long offset; - - public AvroOffset(long offset) { - this.offset = offset; - } - - public void setOffset(long offset) { - this.offset = offset; - } - - protected void inc() { - this.offset++; - } - - @Override - public long getRecordOffset() { - return offset; - } + @Override + public boolean isClosed() { + return closed; } static class GenericRecordToStruct implements ReaderAdapter { + private static final int CACHE_SIZE = 100; private final AvroData avroData; - public GenericRecordToStruct() { + GenericRecordToStruct() { this.avroData = new AvroData(CACHE_SIZE); } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CompressionType.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CompressionType.java new file mode 100644 index 0000000..9dade35 --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CompressionType.java @@ -0,0 +1,23 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +public enum CompressionType { + BZIP2, + GZIP, + NONE; + + private boolean concatenated; + + CompressionType() { + this.concatenated = true; + } + + public boolean isConcatenated() { + return concatenated; + } + + public static CompressionType fromName(String compression, boolean concatenated) { + CompressionType ct = CompressionType.valueOf(compression.trim().toUpperCase()); + ct.concatenated = concatenated; + return ct; + } +} diff --git 
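A short usage sketch for the new CompressionType helper: the lookup is case- and whitespace-insensitive, and the second argument records whether concatenated streams (for example multiple gzip members in one file) are expected.

import com.github.mmolimar.kafka.connect.fs.file.reader.CompressionType;

// Usage sketch for CompressionType.fromName.
class CompressionSketch {
    public static void main(String[] args) {
        CompressionType gzip = CompressionType.fromName(" gzip ", true);
        CompressionType none = CompressionType.fromName("NONE", false);
        System.out.println(gzip + " concatenated=" + gzip.isConcatenated()); // GZIP concatenated=true
        System.out.println(none + " concatenated=" + none.isConcatenated()); // NONE concatenated=false
    }
}

One design note: fromName stores the flag on the shared enum constant itself, so two readers requesting the same compression type with different concatenated values will see whichever value was written last.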
a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReader.java new file mode 100644 index 0000000..70388dc --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReader.java @@ -0,0 +1,43 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.univocity.parsers.common.AbstractParser; +import com.univocity.parsers.csv.CsvParser; +import com.univocity.parsers.csv.CsvParserSettings; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.Map; + +public class CsvFileReader extends UnivocityFileReader { + + public static final String FILE_READER_DELIMITED_SETTINGS_EMPTY_VALUE = FILE_READER_DELIMITED_SETTINGS + "empty_value"; + public static final String FILE_READER_DELIMITED_SETTINGS_DELIMITER_DETECTION = FILE_READER_DELIMITED_SETTINGS + "delimiter_detection"; + public static final String FILE_READER_DELIMITED_SETTINGS_ESCAPE_UNQUOTED = FILE_READER_DELIMITED_SETTINGS + "escape_unquoted"; + + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER = FILE_READER_DELIMITED_SETTINGS_FORMAT + "delimiter"; + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE = FILE_READER_DELIMITED_SETTINGS_FORMAT + "quote"; + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE_ESCAPE = FILE_READER_DELIMITED_SETTINGS_FORMAT + "quote_escape"; + + public CsvFileReader(FileSystem fs, Path filePath, Map config) throws IOException { + super(fs, filePath, config); + } + + @Override + protected CsvParserSettings parserSettings(Map config) { + CsvParserSettings settings = new CsvParserSettings(); + settings.setEmptyValue(config.get(FILE_READER_DELIMITED_SETTINGS_EMPTY_VALUE)); + settings.setDelimiterDetectionEnabled(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_DELIMITER_DETECTION, false)); + settings.setEscapeUnquotedValues(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_ESCAPE_UNQUOTED, false)); + settings.getFormat().setDelimiter(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER, ",")); + settings.getFormat().setQuote(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE, "\"").charAt(0)); + settings.getFormat().setQuoteEscape(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE_ESCAPE, "\"").charAt(0)); + + return settings; + } + + @Override + protected AbstractParser createParser(CsvParserSettings settings) { + return new CsvParser(settings); + } +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java deleted file mode 100644 index 542d3c0..0000000 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java +++ /dev/null @@ -1,159 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Schema; -import org.apache.kafka.connect.data.SchemaBuilder; -import org.apache.kafka.connect.data.Struct; - -import java.io.IOException; -import java.util.Map; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; - -public class 
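The CSV reader delegates parsing to univocity, so its behaviour is driven by the file_reader.delimited.settings.* properties above (the file_reader. prefix comes from the task's FILE_READER_PREFIX). A configuration sketch for semicolon-separated input, with example values only:

import java.util.HashMap;
import java.util.Map;

// Sketch of CsvFileReader options; values are examples only.
class CsvConfigSketch {
    static Map<String, Object> csvOptions() {
        Map<String, Object> props = new HashMap<>();
        props.put("file_reader.delimited.settings.format.delimiter", ";");
        props.put("file_reader.delimited.settings.format.quote", "\"");
        props.put("file_reader.delimited.settings.empty_value", "");
        props.put("file_reader.delimited.settings.delimiter_detection", "false");
        return props;
    }
}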
DelimitedTextFileReader extends AbstractFileReader { - private static final String FILE_READER_DELIMITED = FILE_READER_PREFIX + "delimited."; - public static final String FILE_READER_DELIMITED_HEADER = FILE_READER_DELIMITED + "header"; - public static final String FILE_READER_DELIMITED_TOKEN = FILE_READER_DELIMITED + "token"; - public static final String FILE_READER_DELIMITED_ENCODING = FILE_READER_DELIMITED + "encoding"; - public static final String FILE_READER_DELIMITED_DEFAULT_VALUE = FILE_READER_DELIMITED + "default_value"; - - private static final String DEFAULT_COLUMN_NAME = "column"; - - private final TextFileReader inner; - private final Schema schema; - private DelimitedTextOffset offset; - private String token; - private String defaultValue; - private boolean hasHeader; - - public DelimitedTextFileReader(FileSystem fs, Path filePath, Map config) throws IOException { - super(fs, filePath, new DelimitedTxtToStruct(), config); - - //mapping encoding for text file reader - if (config.get(FILE_READER_DELIMITED_ENCODING) != null) { - config.put(TextFileReader.FILE_READER_TEXT_ENCODING, config.get(FILE_READER_DELIMITED_ENCODING)); - } - this.inner = new TextFileReader(fs, filePath, config); - this.offset = new DelimitedTextOffset(0, hasHeader); - - SchemaBuilder schemaBuilder = SchemaBuilder.struct(); - if (hasNext()) { - String firstLine = inner.nextRecord().getValue(); - String columns[] = firstLine.split(token); - IntStream.range(0, columns.length).forEach(index -> { - String columnName = hasHeader ? columns[index] : DEFAULT_COLUMN_NAME + "_" + ++index; - schemaBuilder.field(columnName, SchemaBuilder.STRING_SCHEMA); - }); - - if (!hasHeader) { - //back to the first line - inner.seek(this.offset); - } - } - this.schema = schemaBuilder.build(); - } - - @Override - protected void configure(Map config) { - if (config.get(FILE_READER_DELIMITED_TOKEN) == null || - config.get(FILE_READER_DELIMITED_TOKEN).toString().equals("")) { - throw new IllegalArgumentException(FILE_READER_DELIMITED_TOKEN + " property cannot be empty for DelimitedTextFileReader"); - } - this.token = config.get(FILE_READER_DELIMITED_TOKEN).toString(); - this.defaultValue = config.get(FILE_READER_DELIMITED_DEFAULT_VALUE) == null ? - null : config.get(FILE_READER_DELIMITED_DEFAULT_VALUE).toString(); - this.hasHeader = Boolean.valueOf((String) config.get(FILE_READER_DELIMITED_HEADER)); - } - - @Override - protected DelimitedRecord nextRecord() { - offset.inc(); - String values[] = inner.nextRecord().getValue().split(token); - return new DelimitedRecord(schema, defaultValue != null ? fillNullValues(values) : values); - } - - private String[] fillNullValues(final String[] values) { - return IntStream.range(0, schema.fields().size()) - .mapToObj(index -> { - if (index < values.length) { - return values[index]; - } else { - return defaultValue; - } - }) - .collect(Collectors.toList()) - .toArray(new String[0]); - } - - @Override - public boolean hasNext() { - return inner.hasNext(); - } - - @Override - public void seek(Offset offset) { - inner.seek(offset); - this.offset.setOffset(inner.currentOffset().getRecordOffset()); - } - - @Override - public Offset currentOffset() { - return offset; - } - - @Override - public void close() throws IOException { - inner.close(); - } - - public static class DelimitedTextOffset implements Offset { - private long offset; - private boolean hasHeader; - - public DelimitedTextOffset(long offset, boolean hasHeader) { - this.hasHeader = hasHeader; - this.offset = hasHeader && offset >= 0 ? 
offset + 1 : offset; - } - - public void setOffset(long offset) { - this.offset = hasHeader && offset > 0 ? offset - 1 : offset; - } - - protected void inc() { - this.offset++; - } - - @Override - public long getRecordOffset() { - return offset; - } - } - - static class DelimitedTxtToStruct implements ReaderAdapter { - - @Override - public Struct apply(DelimitedRecord record) { - Struct struct = new Struct(record.schema); - IntStream.range(0, record.schema.fields().size()).forEach(index -> { - if (index < record.values.length) { - struct.put(record.schema.fields().get(index).name(), record.values[index]); - } - }); - return struct; - } - } - - static class DelimitedRecord { - private final Schema schema; - private final String[] values; - - public DelimitedRecord(Schema schema, String[] values) { - this.schema = schema; - this.values = values; - } - } -} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReader.java index 521ddbb..518e9f8 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReader.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; @@ -16,12 +15,12 @@ public interface FileReader extends Iterator, Closeable { Struct next(); - void seek(Offset offset); + void seek(long offset); - Offset currentOffset(); + long currentOffset(); } @FunctionalInterface interface ReaderAdapter extends Function { -} \ No newline at end of file +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReader.java new file mode 100644 index 0000000..52f4a95 --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReader.java @@ -0,0 +1,50 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.univocity.parsers.common.AbstractParser; +import com.univocity.parsers.fixed.FixedWidthFields; +import com.univocity.parsers.fixed.FixedWidthParser; +import com.univocity.parsers.fixed.FixedWidthParserSettings; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Map; +import java.util.Optional; + +public class FixedWidthFileReader extends UnivocityFileReader { + + public static final String FILE_READER_DELIMITED_SETTINGS_FIELD_LENGTHS = FILE_READER_DELIMITED_SETTINGS + "field_lengths"; + public static final String FILE_READER_DELIMITED_SETTINGS_KEEP_PADDING = FILE_READER_DELIMITED_SETTINGS + "keep_padding"; + public static final String FILE_READER_DELIMITED_SETTINGS_PADDING_FOR_HEADERS = FILE_READER_DELIMITED_SETTINGS + "padding_for_headers"; + public static final String FILE_READER_DELIMITED_SETTINGS_ENDS_ON_NEW_LINE = FILE_READER_DELIMITED_SETTINGS + "ends_on_new_line"; + public static final String FILE_READER_DELIMITED_SETTINGS_SKIP_TRAILING_CHARS = FILE_READER_DELIMITED_SETTINGS + "skip_trailing_chars"; + + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_PADDING = FILE_READER_DELIMITED_SETTINGS_FORMAT + "padding"; + + public FixedWidthFileReader(FileSystem fs, Path filePath, Map config) throws IOException { + super(fs, filePath, 
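Since the Offset interface is gone, the FileReader contract now exchanges plain longs for both seek and currentOffset. A consumption sketch, with reader construction left out and the reader instance passed in:

import java.io.IOException;

import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader;
import org.apache.kafka.connect.data.Struct;

// Consumption sketch for the simplified FileReader contract: currentOffset() and seek()
// now work with plain longs instead of the removed Offset interface.
class ReaderLoop {
    static void drainFrom(FileReader reader, long startOffset) throws IOException {
        reader.seek(startOffset);                 // position by record number
        while (reader.hasNext()) {
            Struct struct = reader.next();
            long offset = reader.currentOffset(); // plain long, stored as the source offset
            System.out.printf("offset=%d value=%s%n", offset, struct);
        }
        reader.close();
    }
}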
config); + } + + @Override + protected FixedWidthParserSettings parserSettings(Map config) { + FixedWidthFields fieldLengths = new FixedWidthFields(); + Optional.ofNullable(config.get(FILE_READER_DELIMITED_SETTINGS_FIELD_LENGTHS)) + .map(fl -> Arrays.stream(fl.split(","))) + .ifPresent(fl -> fl.forEach(field -> fieldLengths.addField(Integer.parseInt(field)))); + + FixedWidthParserSettings settings = new FixedWidthParserSettings(fieldLengths); + settings.setKeepPadding(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_KEEP_PADDING, false)); + settings.setUseDefaultPaddingForHeaders(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_PADDING_FOR_HEADERS, true)); + settings.setRecordEndsOnNewline(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_ENDS_ON_NEW_LINE, true)); + settings.setSkipTrailingCharsUntilNewline(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_SKIP_TRAILING_CHARS, false)); + settings.getFormat().setPadding(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_PADDING, " ").charAt(0)); + + return settings; + } + + @Override + protected AbstractParser createParser(FixedWidthParserSettings settings) { + return new FixedWidthParser(settings); + } +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java new file mode 100644 index 0000000..3fabc01 --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java @@ -0,0 +1,226 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; + +public class JsonFileReader extends AbstractFileReader { + + private static final String FILE_READER_JSON = FILE_READER_PREFIX + "json."; + private static final String FILE_READER_JSON_COMPRESSION = FILE_READER_JSON + "compression."; + + public static final String FILE_READER_JSON_RECORD_PER_LINE = FILE_READER_JSON + "record_per_line"; + public static final String FILE_READER_JSON_DESERIALIZATION_CONFIGS = FILE_READER_JSON + "deserialization."; + + public static final String FILE_READER_JSON_COMPRESSION_TYPE = FILE_READER_JSON_COMPRESSION + "type"; + public static final String FILE_READER_JSON_COMPRESSION_CONCATENATED = FILE_READER_JSON_COMPRESSION + "concatenated"; + public static final String FILE_READER_JSON_ENCODING = FILE_READER_JSON + "encoding"; + + private final TextFileReader inner; + private final Schema schema; + private ObjectMapper mapper; + + public JsonFileReader(FileSystem fs, Path filePath, Map config) throws IOException { + super(fs, filePath, new JsonToStruct(), config); + + config.put(TextFileReader.FILE_READER_TEXT_ENCODING, config.get(FILE_READER_JSON_ENCODING)); + config.put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, config.get(FILE_READER_JSON_RECORD_PER_LINE)); + config.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, 
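For the fixed-width reader, column boundaries come from a comma-separated list of field lengths, and the remaining settings map straight onto univocity's FixedWidthParserSettings. A configuration sketch with example values only:

import java.util.HashMap;
import java.util.Map;

// Sketch of FixedWidthFileReader options: three columns of 10, 5 and 8 characters.
class FixedWidthConfigSketch {
    static Map<String, Object> fixedWidthOptions() {
        Map<String, Object> props = new HashMap<>();
        props.put("file_reader.delimited.settings.field_lengths", "10,5,8");
        props.put("file_reader.delimited.settings.keep_padding", "false");
        props.put("file_reader.delimited.settings.ends_on_new_line", "true");
        return props;
    }
}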
config.get(FILE_READER_JSON_COMPRESSION_TYPE)); + config.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_CONCATENATED, config.get(FILE_READER_JSON_COMPRESSION_CONCATENATED)); + + this.inner = new TextFileReader(fs, filePath, config); + + if (hasNext()) { + String line = inner.nextRecord().getValue(); + this.schema = extractSchema(mapper.readTree(line)); + //back to the first line + inner.seek(0); + } else { + this.schema = SchemaBuilder.struct().build(); + } + } + + @Override + protected void configure(Map config) { + mapper = new ObjectMapper(); + Set deserializationFeatures = Arrays.stream(DeserializationFeature.values()) + .map(Enum::name) + .collect(Collectors.toSet()); + config.entrySet().stream() + .filter(entry -> entry.getKey().startsWith(FILE_READER_JSON_DESERIALIZATION_CONFIGS)) + .forEach(entry -> { + String feature = entry.getKey().replaceAll(FILE_READER_JSON_DESERIALIZATION_CONFIGS, ""); + if (deserializationFeatures.contains(feature)) { + mapper.configure(DeserializationFeature.valueOf(feature), + Boolean.parseBoolean(entry.getValue())); + } else { + log.warn("Ignoring deserialization configuration '{}' due to it does not exist.", feature); + } + }); + } + + @Override + protected JsonRecord nextRecord() throws IOException { + JsonNode value = mapper.readTree(inner.nextRecord().getValue()); + return new JsonRecord(schema, value); + } + + @Override + public boolean hasNextRecord() throws IOException { + return inner.hasNextRecord(); + } + + @Override + public void seekFile(long offset) throws IOException { + inner.seekFile(offset); + } + + @Override + public long currentOffset() { + return inner.currentOffset(); + } + + @Override + public void close() throws IOException { + inner.close(); + } + + @Override + public boolean isClosed() { + return inner.isClosed(); + } + + private static Schema extractSchema(JsonNode jsonNode) { + switch (jsonNode.getNodeType()) { + case BOOLEAN: + return Schema.OPTIONAL_BOOLEAN_SCHEMA; + case NUMBER: + if (jsonNode.isShort()) { + return Schema.OPTIONAL_INT8_SCHEMA; + } else if (jsonNode.isInt()) { + return Schema.OPTIONAL_INT32_SCHEMA; + } else if (jsonNode.isLong()) { + return Schema.OPTIONAL_INT64_SCHEMA; + } else if (jsonNode.isFloat()) { + return Schema.OPTIONAL_FLOAT32_SCHEMA; + } else if (jsonNode.isDouble()) { + return Schema.OPTIONAL_FLOAT64_SCHEMA; + } else if (jsonNode.isBigInteger()) { + return Schema.OPTIONAL_INT64_SCHEMA; + } else if (jsonNode.isBigDecimal()) { + return Schema.OPTIONAL_FLOAT64_SCHEMA; + } else { + return Schema.OPTIONAL_FLOAT64_SCHEMA; + } + case STRING: + return Schema.OPTIONAL_STRING_SCHEMA; + case BINARY: + return Schema.OPTIONAL_BYTES_SCHEMA; + case ARRAY: + Iterable elements = jsonNode::elements; + Schema arraySchema = StreamSupport.stream(elements.spliterator(), false) + .findFirst().map(JsonFileReader::extractSchema) + .orElse(SchemaBuilder.struct().build()); + return SchemaBuilder.array(arraySchema).build(); + case OBJECT: + SchemaBuilder builder = SchemaBuilder.struct(); + jsonNode.fields() + .forEachRemaining(field -> builder.field(field.getKey(), extractSchema(field.getValue()))); + return builder.build(); + default: + return SchemaBuilder.struct().optional().build(); + } + } + + static class JsonToStruct implements ReaderAdapter { + + @Override + public Struct apply(JsonRecord record) { + return toStruct(record.schema, record.value); + } + + private Struct toStruct(Schema schema, JsonNode jsonNode) { + if (jsonNode.isNull()) return null; + Struct struct = new Struct(schema); + jsonNode.fields() + 
.forEachRemaining(field -> struct.put(field.getKey(), + mapValue(struct.schema().field(field.getKey()).schema(), field.getValue()))); + return struct; + } + + private Object mapValue(Schema schema, JsonNode value) { + if (value == null) return null; + + switch (value.getNodeType()) { + case BOOLEAN: + return value.booleanValue(); + case NUMBER: + if (value.isShort()) { + return value.shortValue(); + } else if (value.isInt()) { + return value.intValue(); + } else if (value.isLong()) { + return value.longValue(); + } else if (value.isFloat()) { + return value.floatValue(); + } else if (value.isDouble()) { + return value.doubleValue(); + } else if (value.isBigInteger()) { + return value.bigIntegerValue(); + } else { + return value.numberValue(); + } + case STRING: + return value.asText(); + case BINARY: + try { + return value.binaryValue(); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + case OBJECT: + case POJO: + Struct struct = new Struct(schema); + Iterable> fields = value::fields; + StreamSupport.stream(fields.spliterator(), false) + .forEach(field -> struct.put(field.getKey(), + mapValue(extractSchema(field.getValue()), field.getValue())) + ); + return struct; + case ARRAY: + Iterable arrayElements = value::elements; + return StreamSupport.stream(arrayElements.spliterator(), false) + .map(elm -> mapValue(schema, elm)) + .collect(Collectors.toList()); + case NULL: + case MISSING: + default: + return null; + } + } + } + + static class JsonRecord { + private final Schema schema; + private final JsonNode value; + + JsonRecord(Schema schema, JsonNode value) { + this.schema = schema; + this.value = value; + } + } +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java index 76b71da..0657d0b 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import io.confluent.connect.avro.AvroData; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; @@ -9,14 +8,14 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.errors.ConnectException; import org.apache.parquet.avro.AvroParquetReader; import org.apache.parquet.avro.AvroReadSupport; import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; import java.io.IOException; import java.util.Map; -import java.util.NoSuchElementException; +import java.util.Optional; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; @@ -27,19 +26,15 @@ public class ParquetFileReader extends AbstractFileReader { public static final String FILE_READER_PARQUET_SCHEMA = FILE_READER_PARQUET + "schema"; public static final String FILE_READER_PARQUET_PROJECTION = FILE_READER_PARQUET + "projection"; - private final ParquetOffset offset; - private ParquetReader reader; private GenericRecord currentRecord; private Schema schema; private Schema projection; private boolean closed; - public ParquetFileReader(FileSystem fs, Path filePath, Map config) throws IOException { super(fs, filePath, new GenericRecordToStruct(), config); - this.offset = new ParquetOffset(0); 
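The JSON reader reuses TextFileReader underneath (one JSON document per line by default, optionally compressed) and infers the Connect schema from the first document in the file. Jackson DeserializationFeature flags can be toggled by appending the feature name to the deserialization prefix; unknown names are logged and ignored. A configuration sketch with example values:

import java.util.HashMap;
import java.util.Map;

// Sketch of JsonFileReader options; feature names must match Jackson's DeserializationFeature enum.
class JsonConfigSketch {
    static Map<String, Object> jsonOptions() {
        Map<String, Object> props = new HashMap<>();
        props.put("file_reader.json.record_per_line", "true");
        props.put("file_reader.json.compression.type", "gzip");
        props.put("file_reader.json.encoding", "UTF-8");
        props.put("file_reader.json.deserialization.ACCEPT_SINGLE_VALUE_AS_ARRAY", "true");
        props.put("file_reader.json.deserialization.FAIL_ON_UNKNOWN_PROPERTIES", "false");
        return props;
    }
}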
this.reader = initReader(); this.closed = false; } @@ -52,43 +47,30 @@ private ParquetReader initReader() throws IOException { if (this.projection != null) { AvroReadSupport.setRequestedProjection(configuration, this.projection); } - ParquetReader reader = AvroParquetReader.builder(getFilePath()) - .withConf(configuration).build(); - return reader; + return AvroParquetReader + .builder(HadoopInputFile.fromPath(getFilePath(), configuration)) + .build(); } - protected void configure(Map config) { - if (config.get(FILE_READER_PARQUET_SCHEMA) != null) { - this.schema = new Schema.Parser().parse(config.get(FILE_READER_PARQUET_SCHEMA).toString()); - } else { - this.schema = null; - } - if (config.get(FILE_READER_PARQUET_PROJECTION) != null) { - this.projection = new Schema.Parser().parse(config.get(FILE_READER_PARQUET_PROJECTION).toString()); - } else { - this.projection = null; - } + protected void configure(Map config) { + this.schema = Optional.ofNullable(config.get(FILE_READER_PARQUET_SCHEMA)) + .map(c -> new Schema.Parser().parse(c)) + .orElse(null); + this.projection = Optional.ofNullable(config.get(FILE_READER_PARQUET_PROJECTION)) + .map(c -> new Schema.Parser().parse(c)) + .orElse(null); } @Override - public boolean hasNext() { - if (closed) return false; + public boolean hasNextRecord() throws IOException { if (currentRecord == null) { - try { - currentRecord = reader.read(); - if (currentRecord != null) offset.inc(); - } catch (IOException ioe) { - throw new ConnectException("Error reading parquet record", ioe); - } + currentRecord = reader.read(); } return currentRecord != null; } @Override protected GenericRecord nextRecord() { - if (!hasNext()) { - throw new NoSuchElementException("There are no more records in file: " + getFilePath()); - } GenericRecord record; if (this.projection != null) { record = new GenericData.Record(this.projection); @@ -97,68 +79,38 @@ record = new GenericData.Record(this.projection); record = currentRecord; } currentRecord = null; + incrementOffset(); return record; } @Override - public void seek(Offset offset) { - if (closed) { - throw new ConnectException("Stream is closed!"); - } - if (offset.getRecordOffset() < 0) { - throw new IllegalArgumentException("Record offset must be greater than 0"); - } - if (this.offset.getRecordOffset() > offset.getRecordOffset()) { - try { - this.reader = initReader(); - this.offset.setOffset(0); - this.closed = false; - } catch (IOException ioe) { - throw new ConnectException("Error initializing parquet reader", ioe); - } + public void seekFile(long offset) throws IOException { + if (currentOffset() > offset) { + this.reader = initReader(); + this.closed = false; + setOffset(0); } - while (hasNext() && this.offset.getRecordOffset() <= offset.getRecordOffset()) { + while (hasNext() && currentOffset() < offset) { nextRecord(); } } - @Override - public Offset currentOffset() { - return offset; - } - @Override public void close() throws IOException { - this.closed = true; + closed = true; reader.close(); } - public static class ParquetOffset implements Offset { - private long offset; - - public ParquetOffset(long offset) { - this.offset = offset; - } - - public void setOffset(long offset) { - this.offset = offset; - } - - protected void inc() { - this.offset++; - } - - @Override - public long getRecordOffset() { - return offset; - } + @Override + public boolean isClosed() { + return closed; } static class GenericRecordToStruct implements ReaderAdapter { private static final int CACHE_SIZE = 100; private final AvroData 
avroData; - public GenericRecordToStruct() { + GenericRecordToStruct() { this.avroData = new AvroData(CACHE_SIZE); } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java index 013a680..3740db9 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.*; @@ -8,12 +7,10 @@ import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.data.SchemaBuilder; import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.errors.ConnectException; import java.io.EOFException; import java.io.IOException; import java.util.Map; -import java.util.NoSuchElementException; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; @@ -30,14 +27,13 @@ public class SequenceFileReader extends AbstractFileReader config) throws IOException { super(fs, filePath, new SeqToStruct(), config); @@ -51,28 +47,18 @@ public SequenceFileReader(FileSystem fs, Path filePath, Map conf .field(keyFieldName, getSchema(this.key)) .field(valueFieldName, getSchema(this.value)) .build(); - this.offset = new SeqOffset(0); this.recordIndex = this.hasNextIndex = -1; this.hasNext = false; + this.closed = false; } @Override - protected void configure(Map config) { - if (config.get(FILE_READER_SEQUENCE_FIELD_NAME_KEY) == null || - config.get(FILE_READER_SEQUENCE_FIELD_NAME_KEY).toString().equals("")) { - this.keyFieldName = FIELD_NAME_KEY_DEFAULT; - } else { - this.keyFieldName = config.get(FILE_READER_SEQUENCE_FIELD_NAME_KEY).toString(); - } - if (config.get(FILE_READER_SEQUENCE_FIELD_NAME_VALUE) == null || - config.get(FILE_READER_SEQUENCE_FIELD_NAME_VALUE).toString().equals("")) { - this.valueFieldName = FIELD_NAME_VALUE_DEFAULT; - } else { - this.valueFieldName = config.get(FILE_READER_SEQUENCE_FIELD_NAME_VALUE).toString(); - } + protected void configure(Map config) { + this.keyFieldName = config.getOrDefault(FILE_READER_SEQUENCE_FIELD_NAME_KEY, FIELD_NAME_KEY_DEFAULT); + this.valueFieldName = config.getOrDefault(FILE_READER_SEQUENCE_FIELD_NAME_VALUE, FIELD_NAME_VALUE_DEFAULT); } - private Schema getSchema(Writable writable) { + Schema getSchema(Writable writable) { if (writable instanceof ByteWritable) { return SchemaBuilder.INT8_SCHEMA; } else if (writable instanceof ShortWritable) { @@ -84,7 +70,7 @@ private Schema getSchema(Writable writable) { } else if (writable instanceof FloatWritable) { return SchemaBuilder.FLOAT32_SCHEMA; } else if (writable instanceof DoubleWritable) { - return SchemaBuilder.INT64_SCHEMA; + return SchemaBuilder.FLOAT64_SCHEMA; } else if (writable instanceof BytesWritable) { return SchemaBuilder.BYTES_SCHEMA; } else if (writable instanceof BooleanWritable) { @@ -94,74 +80,42 @@ private Schema getSchema(Writable writable) { } @Override - public boolean hasNext() { + public boolean hasNextRecord() throws IOException { try { if (hasNextIndex == -1 || hasNextIndex == recordIndex) { hasNextIndex++; - offset.inc(); - return hasNext = reader.next(key, value); + incrementOffset(); + hasNext = reader.next(key, value); } return hasNext; } catch (EOFException 
eofe) { return false; - } catch (IOException ioe) { - throw new ConnectException(ioe); } } @Override protected SequenceRecord nextRecord() { - if (!hasNext()) { - throw new NoSuchElementException("There are no more records in file: " + getFilePath()); - } recordIndex++; - return new SequenceRecord(schema, keyFieldName, key, valueFieldName, value); - } - - @Override - public void seek(Offset offset) { - if (offset.getRecordOffset() < 0) { - throw new IllegalArgumentException("Record offset must be greater than 0"); - } - try { - reader.sync(offset.getRecordOffset()); - hasNextIndex = recordIndex = offset.getRecordOffset(); - hasNext = false; - this.offset.setOffset(offset.getRecordOffset()); - } catch (IOException ioe) { - throw new ConnectException("Error seeking file " + getFilePath(), ioe); - } + return new SequenceRecord<>(schema, keyFieldName, key, valueFieldName, value); } @Override - public Offset currentOffset() { - return offset; + public void seekFile(long offset) throws IOException { + reader.sync(offset); + hasNextIndex = recordIndex = offset; + hasNext = false; + setOffset(offset - 1); } @Override public void close() throws IOException { + closed = true; reader.close(); } - public static class SeqOffset implements Offset { - private long offset; - - public SeqOffset(long offset) { - this.offset = offset; - } - - public void setOffset(long offset) { - this.offset = offset; - } - - protected void inc() { - this.offset++; - } - - @Override - public long getRecordOffset() { - return offset; - } + @Override + public boolean isClosed() { + return closed; } static class SeqToStruct implements ReaderAdapter> { @@ -173,7 +127,7 @@ public Struct apply(SequenceRecord record) { .put(record.valueFieldName, toSchemaValue(record.value)); } - private Object toSchemaValue(Writable writable) { + Object toSchemaValue(Writable writable) { if (writable instanceof ByteWritable) { return ((ByteWritable) writable).get(); } else if (writable instanceof ShortWritable) { @@ -196,13 +150,14 @@ private Object toSchemaValue(Writable writable) { } static class SequenceRecord { + private final Schema schema; private final String keyFieldName; private final T key; private final String valueFieldName; private final U value; - public SequenceRecord(Schema schema, String keyFieldName, T key, String valueFieldName, U value) { + SequenceRecord(Schema schema, String keyFieldName, T key, String valueFieldName, U value) { this.schema = schema; this.keyFieldName = keyFieldName; this.key = key; @@ -211,5 +166,4 @@ public SequenceRecord(Schema schema, String keyFieldName, T key, String valueFie } } - } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java index a5781af..56f5581 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java @@ -1,148 +1,138 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.data.SchemaBuilder; import org.apache.kafka.connect.data.Struct; -import 
org.apache.kafka.connect.errors.ConnectException; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.LineNumberReader; +import java.io.*; import java.nio.charset.Charset; +import java.util.List; import java.util.Map; -import java.util.NoSuchElementException; +import java.util.stream.Collectors; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; public class TextFileReader extends AbstractFileReader { - public static final String FIELD_NAME_VALUE_DEFAULT = "value"; - private static final String FILE_READER_TEXT = FILE_READER_PREFIX + "text."; - private static final String FILE_READER_SEQUENCE_FIELD_NAME_PREFIX = FILE_READER_TEXT + "field_name."; + private static final String FILE_READER_FIELD_NAME_PREFIX = FILE_READER_TEXT + "field_name."; + private static final String FILE_READER_TEXT_COMPRESSION = FILE_READER_TEXT + "compression."; + + public static final String FIELD_NAME_VALUE_DEFAULT = "value"; - public static final String FILE_READER_TEXT_FIELD_NAME_VALUE = FILE_READER_SEQUENCE_FIELD_NAME_PREFIX + "value"; + public static final String FILE_READER_TEXT_FIELD_NAME_VALUE = FILE_READER_FIELD_NAME_PREFIX + "value"; + public static final String FILE_READER_TEXT_RECORD_PER_LINE = FILE_READER_TEXT + "record_per_line"; + public static final String FILE_READER_TEXT_COMPRESSION_TYPE = FILE_READER_TEXT_COMPRESSION + "type"; + public static final String FILE_READER_TEXT_COMPRESSION_CONCATENATED = FILE_READER_TEXT_COMPRESSION + "concatenated"; public static final String FILE_READER_TEXT_ENCODING = FILE_READER_TEXT + "encoding"; - private final TextOffset offset; - private String currentLine; + private String current; private boolean finished = false; private LineNumberReader reader; private Schema schema; private Charset charset; + private CompressionType compression; + private boolean recordPerLine; + private boolean closed; public TextFileReader(FileSystem fs, Path filePath, Map config) throws IOException { super(fs, filePath, new TxtToStruct(), config); - this.reader = new LineNumberReader(new InputStreamReader(fs.open(filePath), this.charset)); - this.offset = new TextOffset(0); + this.reader = new LineNumberReader(getFileReader(fs.open(filePath))); + this.closed = false; } @Override - protected void configure(Map config) { - String valueFieldName; - if (config.get(FILE_READER_TEXT_FIELD_NAME_VALUE) == null || - config.get(FILE_READER_TEXT_FIELD_NAME_VALUE).toString().equals("")) { - valueFieldName = FIELD_NAME_VALUE_DEFAULT; - } else { - valueFieldName = config.get(FILE_READER_TEXT_FIELD_NAME_VALUE).toString(); - } + protected void configure(Map config) { this.schema = SchemaBuilder.struct() - .field(valueFieldName, Schema.STRING_SCHEMA) + .field(config.getOrDefault(FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE_DEFAULT), + Schema.STRING_SCHEMA) .build(); + this.recordPerLine = Boolean.parseBoolean(config.getOrDefault(FILE_READER_TEXT_RECORD_PER_LINE, "true")); + String cType = config.getOrDefault(FILE_READER_TEXT_COMPRESSION_TYPE, CompressionType.NONE.toString()); + boolean concatenated = Boolean.parseBoolean(config.getOrDefault(FILE_READER_TEXT_COMPRESSION_CONCATENATED, + "true")); + this.compression = CompressionType.fromName(cType, concatenated); + this.charset = Charset.forName(config.getOrDefault(FILE_READER_TEXT_ENCODING, Charset.defaultCharset().name())); + } - if (config.get(FILE_READER_TEXT_ENCODING) == null || - config.get(FILE_READER_TEXT_ENCODING).toString().equals("")) { - this.charset = 
Charset.defaultCharset(); - } else { - this.charset = Charset.forName(config.get(FILE_READER_TEXT_ENCODING).toString()); + private Reader getFileReader(InputStream inputStream) throws IOException { + final InputStreamReader isr; + switch (this.compression) { + case BZIP2: + isr = new InputStreamReader(new BZip2CompressorInputStream(inputStream, + this.compression.isConcatenated()), this.charset); + break; + case GZIP: + isr = new InputStreamReader(new GzipCompressorInputStream(inputStream, + this.compression.isConcatenated()), this.charset); + break; + default: + isr = new InputStreamReader(inputStream, this.charset); + break; } + return isr; } @Override - public boolean hasNext() { - if (currentLine != null) { + public boolean hasNextRecord() throws IOException { + if (current != null) { return true; } else if (finished) { return false; } else { - try { - while (true) { - String line = reader.readLine(); - offset.setOffset(reader.getLineNumber()); - if (line == null) { - finished = true; - return false; - } - currentLine = line; - return true; + if (!recordPerLine) { + List lines = new BufferedReader(reader).lines().collect(Collectors.toList()); + current = String.join("\n", lines); + finished = true; + return true; + } + for (; ; ) { + String line = reader.readLine(); + if (line == null) { + finished = true; + return false; } - } catch (IOException ioe) { - throw new IllegalStateException(ioe); + current = line; + return true; } } } @Override protected TextRecord nextRecord() { - if (!hasNext()) { - throw new NoSuchElementException("There are no more records in file: " + getFilePath()); - } - String aux = currentLine; - currentLine = null; - + String aux = current; + current = null; + incrementOffset(); return new TextRecord(schema, aux); } @Override - public void seek(Offset offset) { - if (offset.getRecordOffset() < 0) { - throw new IllegalArgumentException("Record offset must be greater than 0"); + public void seekFile(long offset) throws IOException { + current = null; + if (offset < reader.getLineNumber()) { + finished = false; + reader.close(); + reader = new LineNumberReader(getFileReader(getFs().open(getFilePath()))); } - try { - if (offset.getRecordOffset() < reader.getLineNumber()) { - this.reader = new LineNumberReader(new InputStreamReader(getFs().open(getFilePath()))); - currentLine = null; - } - while ((currentLine = reader.readLine()) != null) { - if (reader.getLineNumber() - 1 == offset.getRecordOffset()) { - this.offset.setOffset(reader.getLineNumber()); - return; - } - } - this.offset.setOffset(reader.getLineNumber()); - } catch (IOException ioe) { - throw new ConnectException("Error seeking file " + getFilePath(), ioe); + while (reader.getLineNumber() < offset) { + reader.readLine(); } - } - - @Override - public Offset currentOffset() { - return offset; + setOffset(reader.getLineNumber()); } @Override public void close() throws IOException { + closed = true; reader.close(); } - public static class TextOffset implements Offset { - private long offset; - - public TextOffset(long offset) { - this.offset = offset; - } - - public void setOffset(long offset) { - this.offset = offset; - } - - @Override - public long getRecordOffset() { - return offset; - } + @Override + public boolean isClosed() { + return closed; } static class TxtToStruct implements ReaderAdapter { @@ -158,7 +148,7 @@ static class TextRecord { private final Schema schema; private final String value; - public TextRecord(Schema schema, String value) { + TextRecord(Schema schema, String value) { this.schema = 
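The text reader now handles compressed input and can optionally treat the whole file as a single record. A configuration sketch for gzipped files read line by line (example values; the file_reader.text. prefix follows the constants defined above):

import java.util.HashMap;
import java.util.Map;

// Sketch of TextFileReader options: gzip input, one record per line, custom field name.
class TextConfigSketch {
    static Map<String, Object> textOptions() {
        Map<String, Object> props = new HashMap<>();
        props.put("file_reader.text.compression.type", "gzip");
        props.put("file_reader.text.compression.concatenated", "true");
        props.put("file_reader.text.record_per_line", "true"); // "false" folds the whole file into one record
        props.put("file_reader.text.encoding", "UTF-8");
        props.put("file_reader.text.field_name.value", "line");
        return props;
    }
}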
schema; this.value = value; } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReader.java new file mode 100644 index 0000000..f626a8e --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReader.java @@ -0,0 +1,37 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.univocity.parsers.common.AbstractParser; +import com.univocity.parsers.tsv.TsvParser; +import com.univocity.parsers.tsv.TsvParserSettings; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.Map; + +public class TsvFileReader extends UnivocityFileReader { + + public static final String FILE_READER_DELIMITED_SETTINGS_LINE_JOINING = FILE_READER_DELIMITED_SETTINGS + "line_joining"; + + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_ESCAPE = FILE_READER_DELIMITED_SETTINGS_FORMAT + "escape"; + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_ESCAPED_CHAR = FILE_READER_DELIMITED_SETTINGS_FORMAT + "escaped_char"; + + public TsvFileReader(FileSystem fs, Path filePath, Map config) throws IOException { + super(fs, filePath, config); + } + + @Override + protected TsvParserSettings parserSettings(Map config) { + TsvParserSettings settings = new TsvParserSettings(); + settings.setLineJoiningEnabled(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_LINE_JOINING, false)); + settings.getFormat().setEscapeChar(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_ESCAPE, "\"").charAt(0)); + settings.getFormat().setEscapedTabChar(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_ESCAPED_CHAR, "\"").charAt(0)); + + return settings; + } + + @Override + protected AbstractParser createParser(TsvParserSettings settings) { + return new TsvParser(settings); + } +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java new file mode 100644 index 0000000..25a685d --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java @@ -0,0 +1,313 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.univocity.parsers.common.AbstractParser; +import com.univocity.parsers.common.CommonParserSettings; +import com.univocity.parsers.common.ParsingContext; +import com.univocity.parsers.common.ResultIterator; +import com.univocity.parsers.common.record.Record; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; + +abstract class UnivocityFileReader> + extends AbstractFileReader { + + private static final String FILE_READER_DELIMITED = 
FILE_READER_PREFIX + "delimited."; + private static final String FILE_READER_COMPRESSION = FILE_READER_DELIMITED + "compression."; + + protected static final String FILE_READER_DELIMITED_SETTINGS = FILE_READER_DELIMITED + "settings."; + protected static final String FILE_READER_DELIMITED_SETTINGS_FORMAT = FILE_READER_DELIMITED_SETTINGS + "format."; + + public static final String FILE_READER_DELIMITED_SETTINGS_HEADER = FILE_READER_DELIMITED_SETTINGS + "header"; + public static final String FILE_READER_DELIMITED_SETTINGS_SCHEMA = FILE_READER_DELIMITED_SETTINGS + "schema"; + public static final String FILE_READER_DELIMITED_SETTINGS_DATA_TYPE_MAPPING_ERROR = FILE_READER_DELIMITED_SETTINGS + "data_type_mapping_error"; + public static final String FILE_READER_DELIMITED_SETTINGS_ALLOW_NULLS = FILE_READER_DELIMITED_SETTINGS + "allow_nulls"; + public static final String FILE_READER_DELIMITED_SETTINGS_HEADER_NAMES = FILE_READER_DELIMITED_SETTINGS + "header_names"; + public static final String FILE_READER_DELIMITED_SETTINGS_LINE_SEPARATOR_DETECTION = FILE_READER_DELIMITED_SETTINGS + "line_separator_detection"; + public static final String FILE_READER_DELIMITED_SETTINGS_NULL_VALUE = FILE_READER_DELIMITED_SETTINGS + "null_value"; + public static final String FILE_READER_DELIMITED_SETTINGS_MAX_COLUMNS = FILE_READER_DELIMITED_SETTINGS + "max_columns"; + public static final String FILE_READER_DELIMITED_SETTINGS_MAX_CHARS_PER_COLUMN = FILE_READER_DELIMITED_SETTINGS + "max_chars_per_column"; + public static final String FILE_READER_DELIMITED_SETTINGS_ROWS_TO_SKIP = FILE_READER_DELIMITED_SETTINGS + "rows_to_skip"; + public static final String FILE_READER_DELIMITED_SETTINGS_ILW = FILE_READER_DELIMITED_SETTINGS + "ignore_leading_whitespaces"; + public static final String FILE_READER_DELIMITED_SETTINGS_ITW = FILE_READER_DELIMITED_SETTINGS + "ignore_trailing_whitespaces"; + + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_LINE_SEP = FILE_READER_DELIMITED_SETTINGS_FORMAT + "line_separator"; + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_COMMENT = FILE_READER_DELIMITED_SETTINGS_FORMAT + "comment"; + + public static final String FILE_READER_DELIMITED_COMPRESSION_TYPE = FILE_READER_COMPRESSION + "type"; + public static final String FILE_READER_DELIMITED_COMPRESSION_CONCATENATED = FILE_READER_COMPRESSION + "concatenated"; + public static final String FILE_READER_DELIMITED_ENCODING = FILE_READER_DELIMITED + "encoding"; + + private static final String DEFAULT_COLUMN_NAME = "column_"; + + private T settings; + private Schema schema; + private Charset charset; + private CompressionType compression; + private boolean dataTypeMappingError; + private boolean allowNulls; + private boolean closed; + + private ResultIterator iterator; + + public enum DataType { + BYTE, + SHORT, + INT, + LONG, + FLOAT, + DOUBLE, + BOOLEAN, + BYTES, + STRING + } + + public UnivocityFileReader(FileSystem fs, Path filePath, Map config) throws IOException { + super(fs, filePath, new UnivocityToStruct(), config); + + this.iterator = iterateRecords(); + this.schema = buildSchema(this.iterator, settings.isHeaderExtractionEnabled(), config); + } + + private Schema buildSchema(ResultIterator it, boolean hasHeader, Map config) { + SchemaBuilder builder = SchemaBuilder.struct(); + if (it.hasNext() && !hasHeader) { + Record first = it.next(); + List dataTypes = getDataTypes(config, first.getValues()); + IntStream.range(0, first.getValues().length) + .forEach(index -> builder.field(DEFAULT_COLUMN_NAME + (index + 1), 
dataTypes.get(index))); + seek(0); + } else if (hasHeader) { + Optional.ofNullable(it.getContext().headers()).ifPresent(headers -> { + List dataTypes = getDataTypes(config, headers); + IntStream.range(0, headers.length) + .forEach(index -> builder.field(headers[index], dataTypes.get(index))); + }); + } + return builder.build(); + } + + @Override + protected void configure(Map config) { + String cType = config.getOrDefault(FILE_READER_DELIMITED_COMPRESSION_TYPE, CompressionType.NONE.toString()); + boolean concatenated = Boolean.parseBoolean(config.getOrDefault(FILE_READER_DELIMITED_COMPRESSION_CONCATENATED, + "true")); + this.compression = CompressionType.fromName(cType, concatenated); + this.charset = Charset.forName(config.getOrDefault(FILE_READER_DELIMITED_ENCODING, Charset.defaultCharset().name())); + this.settings = allSettings(config); + this.dataTypeMappingError = Boolean.parseBoolean( + config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_DATA_TYPE_MAPPING_ERROR, "true")); + if (this.dataTypeMappingError) { + this.allowNulls = Boolean.parseBoolean( + config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_ALLOW_NULLS, "false")); + } else { + this.allowNulls = true; + } + + } + + private List getDataTypes(Map config, String[] headers) { + List dataTypes = Arrays + .stream(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_SCHEMA, "").toString().split(",")) + .filter(dt -> !dt.trim().isEmpty()) + .map(this::strToSchema) + .collect(Collectors.toList()); + if (dataTypes.size() > 0 && dataTypes.size() != headers.length) { + throw new IllegalArgumentException("The schema defined in property '" + FILE_READER_DELIMITED_SETTINGS_SCHEMA + + "' does not match the number of fields inferred in the file."); + } else if (dataTypes.size() == 0) { + return IntStream.range(0, headers.length) + .mapToObj(index -> Schema.STRING_SCHEMA) + .collect(Collectors.toList()); + } + return dataTypes; + } + + private Schema strToSchema(String dataType) { + switch (DataType.valueOf(dataType.trim().toUpperCase())) { + case BYTE: + return dataTypeMappingError && !allowNulls ? Schema.INT8_SCHEMA : Schema.OPTIONAL_INT8_SCHEMA; + case SHORT: + return dataTypeMappingError && !allowNulls ? Schema.INT16_SCHEMA : Schema.OPTIONAL_INT16_SCHEMA; + case INT: + return dataTypeMappingError && !allowNulls ? Schema.INT32_SCHEMA : Schema.OPTIONAL_INT32_SCHEMA; + case LONG: + return dataTypeMappingError && !allowNulls ? Schema.INT64_SCHEMA : Schema.OPTIONAL_INT64_SCHEMA; + case FLOAT: + return dataTypeMappingError && !allowNulls ? Schema.FLOAT32_SCHEMA : Schema.OPTIONAL_FLOAT32_SCHEMA; + case DOUBLE: + return dataTypeMappingError && !allowNulls ? Schema.FLOAT64_SCHEMA : Schema.OPTIONAL_FLOAT64_SCHEMA; + case BOOLEAN: + return dataTypeMappingError && !allowNulls ? Schema.BOOLEAN_SCHEMA : Schema.OPTIONAL_BOOLEAN_SCHEMA; + case BYTES: + return dataTypeMappingError && !allowNulls ? Schema.BYTES_SCHEMA : Schema.OPTIONAL_BYTES_SCHEMA; + case STRING: + default: + return dataTypeMappingError && !allowNulls ? 
Schema.STRING_SCHEMA : Schema.OPTIONAL_STRING_SCHEMA; + } + } + + private T allSettings(Map config) { + T settings = parserSettings(config); + settings.setHeaderExtractionEnabled(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_HEADER, false)); + settings.setHeaders(Optional.ofNullable(config.get(FILE_READER_DELIMITED_SETTINGS_HEADER_NAMES)) + .map(headers -> headers.split(",")).orElse(null)); + settings.setLineSeparatorDetectionEnabled(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_LINE_SEPARATOR_DETECTION, false)); + settings.setNullValue(config.get(FILE_READER_DELIMITED_SETTINGS_NULL_VALUE)); + settings.setMaxColumns(Integer.parseInt(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_MAX_COLUMNS, "512"))); + settings.setMaxCharsPerColumn(Integer.parseInt(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_MAX_CHARS_PER_COLUMN, "4096"))); + settings.setNumberOfRowsToSkip(Long.parseLong(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_ROWS_TO_SKIP, "0"))); + settings.setIgnoreLeadingWhitespaces(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_ILW, true)); + settings.setIgnoreTrailingWhitespaces(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_ITW, true)); + settings.getFormat().setLineSeparator(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_LINE_SEP, "\n")); + settings.getFormat().setComment(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_COMMENT, "#").charAt(0)); + + return settings; + } + + protected boolean getBoolean(Map config, String property, boolean defaultValue) { + return Boolean.parseBoolean(config.getOrDefault(property, String.valueOf(defaultValue))); + } + + protected abstract T parserSettings(Map config); + + protected abstract AbstractParser createParser(T settings); + + private Reader getFileReader(InputStream is, CompressionType compression, Charset charset) throws IOException { + final InputStreamReader isr; + switch (compression) { + case BZIP2: + isr = new InputStreamReader(new BZip2CompressorInputStream(is, compression.isConcatenated()), charset); + break; + case GZIP: + isr = new InputStreamReader(new GzipCompressorInputStream(is, compression.isConcatenated()), charset); + break; + default: + isr = new InputStreamReader(is, charset); + break; + } + return isr; + } + + private ResultIterator iterateRecords() throws IOException { + return createParser(settings) + .iterateRecords(getFileReader(getFs().open(getFilePath()), this.compression, this.charset)) + .iterator(); + } + + @Override + protected final UnivocityRecord nextRecord() { + incrementOffset(); + return new UnivocityRecord(schema, iterator.next(), dataTypeMappingError); + } + + @Override + public final boolean hasNextRecord() { + return iterator.hasNext(); + } + + @Override + public final void seekFile(long offset) throws IOException { + if (offset > currentOffset()) { + iterator.hasNext(); + iterator.getContext().skipLines(offset - currentOffset() - 1); + iterator.next(); + } else { + iterator = iterateRecords(); + iterator.hasNext(); + iterator.getContext().skipLines(offset); + } + setOffset(offset); + } + + @Override + public final void close() { + iterator.getContext().stop(); + closed = true; + } + + @Override + public final boolean isClosed() { + return closed; + } + + static class UnivocityToStruct implements ReaderAdapter { + + @Override + public Struct apply(UnivocityRecord record) { + Struct struct = new Struct(record.schema); + IntStream.range(0, record.schema.fields().size()) + .filter(index -> index < record.value.getValues().length) + .forEach(index -> { + Schema.Type 
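// --------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the patch): the new
// Univocity-based delimited readers are driven entirely by the
// FILE_READER_DELIMITED_* properties defined in UnivocityFileReader above.
// The snippet shows one way a TsvFileReader could be exercised standalone.
// The sample TSV data, the temporary file and the main() wrapper are invented
// for the demo; the config map is assumed to be Map<String, Object> (type
// parameters were stripped from this rendering of the diff). Referencing the
// constants through the concrete reader class mirrors the project's own tests.
// --------------------------------------------------------------------------
package com.github.mmolimar.kafka.connect.fs.file.reader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.kafka.connect.data.Struct;

import java.io.File;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Map;

public class TsvFileReaderSketch {

    public static void main(String[] args) throws Exception {
        // Hypothetical sample file: three columns with a header row.
        File tsv = File.createTempFile("sample-", ".tsv");
        try (PrintWriter out = new PrintWriter(tsv, "UTF-8")) {
            out.println("id\tname\tactive");
            out.println("1\tAlice\ttrue");
            out.println("2\tBob\tfalse");
        }

        // Reader settings, using the public constants introduced in this patch:
        // extract the header row and map each column to a Kafka Connect type.
        Map<String, Object> readerConfig = new HashMap<>();
        readerConfig.put(TsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true");
        readerConfig.put(TsvFileReader.FILE_READER_DELIMITED_SETTINGS_SCHEMA, "int,string,boolean");
        readerConfig.put(TsvFileReader.FILE_READER_DELIMITED_ENCODING, "UTF-8");

        FileSystem fs = FileSystem.newInstance(tsv.getParentFile().toURI(), new Configuration());
        TsvFileReader reader = new TsvFileReader(fs, new Path(tsv.getAbsolutePath()), readerConfig);
        try {
            while (reader.hasNext()) {
                Struct record = reader.next();   // one Struct per row: id, name, active
                System.out.println(record);
            }
        } finally {
            reader.close();                      // readers are Closeable in this codebase
        }
    }
}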
type = record.schema.fields().get(index).schema().type(); + String fieldName = record.schema.fields().get(index).name(); + struct.put(fieldName, mapDatatype(type, record.value, index, record.dataTypeMappingError)); + }); + return struct; + } + + private Object mapDatatype(Schema.Type type, Record record, int fieldIndex, boolean dataTypeMappingError) { + try { + switch (type) { + case INT8: + return record.getByte(fieldIndex); + case INT16: + return record.getShort(fieldIndex); + case INT32: + return record.getInt(fieldIndex); + case INT64: + return record.getLong(fieldIndex); + case FLOAT32: + return record.getFloat(fieldIndex); + case FLOAT64: + return record.getDouble(fieldIndex); + case BOOLEAN: + return record.getBoolean(fieldIndex); + case BYTES: + return record.getString(fieldIndex).getBytes(); + case ARRAY: + case MAP: + case STRUCT: + case STRING: + default: + return record.getString(fieldIndex); + } + } catch (RuntimeException re) { + if (dataTypeMappingError) { + throw re; + } + return null; + } + } + } + + static class UnivocityRecord { + private final Schema schema; + private final Record value; + private final boolean dataTypeMappingError; + + UnivocityRecord(Schema schema, Record value, boolean dataTypeMappingError) { + this.schema = schema; + this.value = value; + this.dataTypeMappingError = dataTypeMappingError; + } + } +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index 2a6dbce..d250e76 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -4,6 +4,7 @@ import com.github.mmolimar.kafka.connect.fs.file.FileMetadata; import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; +import com.github.mmolimar.kafka.connect.fs.util.TailCall; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; @@ -20,7 +21,7 @@ import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.*; -import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -33,14 +34,14 @@ abstract class AbstractPolicy implements Policy { protected final Pattern fileRegexp; private final FsSourceTaskConfig conf; - private final AtomicInteger executions; + private final AtomicLong executions; private final boolean recursive; private boolean interrupted; public AbstractPolicy(FsSourceTaskConfig conf) throws IOException { this.fileSystems = new ArrayList<>(); this.conf = conf; - this.executions = new AtomicInteger(0); + this.executions = new AtomicLong(0); this.recursive = conf.getBoolean(FsSourceTaskConfig.POLICY_RECURSIVE); this.fileRegexp = Pattern.compile(conf.getString(FsSourceTaskConfig.POLICY_REGEXP)); this.interrupted = false; @@ -54,7 +55,7 @@ public AbstractPolicy(FsSourceTaskConfig conf) throws IOException { private Map customConfigs() { return conf.originals().entrySet().stream() .filter(entry -> entry.getKey().startsWith(FsSourceTaskConfig.POLICY_PREFIX)) - .collect(Collectors.toMap(entry -> entry.getKey(), entry -> entry.getValue())); + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } private void configFs(Map customConfigs) 
throws IOException { @@ -87,7 +88,6 @@ private String convert(String uri) { throw new IllegalArgumentException("Cannot convert dynamic URI: " + matcher.group(1), e); } } - return converted; } @@ -103,15 +103,15 @@ public List getURIs() { @Override public final Iterator execute() throws IOException { if (hasEnded()) { - throw new IllegalWorkerStateException("Policy has ended. Cannot be retried"); + throw new IllegalWorkerStateException("Policy has ended. Cannot be retried."); } preCheck(); + executions.incrementAndGet(); Iterator files = Collections.emptyIterator(); for (FileSystem fs : fileSystems) { files = concat(files, listFiles(fs)); } - executions.incrementAndGet(); postCheck(); @@ -133,31 +133,36 @@ public Iterator listFiles(FileSystem fs) throws IOException { return new Iterator() { RemoteIterator it = fs.listFiles(fs.getWorkingDirectory(), recursive); LocatedFileStatus current = null; - boolean previous = false; - @Override - public boolean hasNext() { + private TailCall hasNextRec() { try { if (current == null) { - if (!it.hasNext()) return false; + if (!it.hasNext()) { + return TailCall.done(false); + } current = it.next(); - return hasNext(); + return this::hasNextRec; } - if (current.isFile() && + if (current.isFile() & fileRegexp.matcher(current.getPath().getName()).find()) { - return true; + return TailCall.done(true); } current = null; - return hasNext(); + return this::hasNextRec; } catch (IOException ioe) { throw new ConnectException(ioe); } } + @Override + public boolean hasNext() { + return hasNextRec().invoke(); + } + @Override public FileMetadata next() { if (!hasNext() && current == null) { - throw new NoSuchElementException("There are no more items"); + throw new NoSuchElementException("There are no more items."); } FileMetadata metadata = toMetadata(current); current = null; @@ -173,50 +178,43 @@ public final boolean hasEnded() { protected abstract boolean isPolicyCompleted(); - public final int getExecutions() { + public final long getExecutions() { return executions.get(); } - protected FileMetadata toMetadata(LocatedFileStatus fileStatus) { - List blocks = new ArrayList<>(); + FileMetadata toMetadata(LocatedFileStatus fileStatus) { - blocks.addAll(Arrays.stream(fileStatus.getBlockLocations()) - .map(block -> - new FileMetadata.BlockInfo(block.getOffset(), block.getLength(), block.isCorrupt())) - .collect(Collectors.toList())); + List blocks = Arrays.stream(fileStatus.getBlockLocations()) + .map(block -> new FileMetadata.BlockInfo(block.getOffset(), block.getLength(), block.isCorrupt())) + .collect(Collectors.toList()); return new FileMetadata(fileStatus.getPath().toString(), fileStatus.getLen(), blocks); } @Override - public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorageReader) throws IOException { - Map partition = new HashMap() {{ - put("path", metadata.getPath()); - //TODO manage blocks - //put("blocks", metadata.getBlocks().toString()); - }}; - + public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorageReader) { FileSystem current = fileSystems.stream() .filter(fs -> metadata.getPath().startsWith(fs.getWorkingDirectory().toString())) - .findFirst().orElse(null); - - FileReader reader; + .findFirst() + .orElse(null); try { - reader = ReflectionUtils.makeReader((Class) conf.getClass(FsSourceTaskConfig.FILE_READER_CLASS), + FileReader reader = ReflectionUtils.makeReader( + (Class) conf.getClass(FsSourceTaskConfig.FILE_READER_CLASS), current, new Path(metadata.getPath()), conf.originals()); - } catch (Throwable t) 
{ - throw new ConnectException("An error has occurred when creating reader for file: " + metadata.getPath(), t); - } - - Map offset = offsetStorageReader.offset(partition); - if (offset != null && offset.get("offset") != null) { - reader.seek(() -> (Long) offset.get("offset")); + Map partition = Collections.singletonMap("path", metadata.getPath()); + Map offset = offsetStorageReader.offset(partition); + if (offset != null && offset.get("offset") != null) { + log.info("Seeking to offset [{}] for file [{}].", offset.get("offset"), metadata.getPath()); + reader.seek((Long) offset.get("offset")); + } + return reader; + } catch (Exception e) { + throw new ConnectException("An error has occurred when creating reader for file: " + metadata.getPath(), e); } - return reader; } - Iterator concat(final Iterator it1, - final Iterator it2) { + private Iterator concat(final Iterator it1, + final Iterator it2) { return new Iterator() { @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicy.java new file mode 100644 index 0000000..307fc23 --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicy.java @@ -0,0 +1,73 @@ +package com.github.mmolimar.kafka.connect.fs.policy; + +import com.cronutils.model.CronType; +import com.cronutils.model.definition.CronDefinitionBuilder; +import com.cronutils.model.time.ExecutionTime; +import com.cronutils.parser.CronParser; +import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; +import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.common.utils.SystemTime; +import org.apache.kafka.common.utils.Time; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.time.DateTimeException; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.ZonedDateTime; +import java.util.Date; +import java.util.Map; + +public class CronPolicy extends AbstractPolicy { + + private static final Logger log = LoggerFactory.getLogger(CronPolicy.class); + + private static final String CRON_POLICY_PREFIX = FsSourceTaskConfig.POLICY_PREFIX + "cron."; + + public static final String CRON_POLICY_EXPRESSION = CRON_POLICY_PREFIX + "expression"; + public static final String CRON_POLICY_END_DATE = CRON_POLICY_PREFIX + "end_date"; + + private final Time time; + private ExecutionTime executionTime; + private Date endDate; + + public CronPolicy(FsSourceTaskConfig conf) throws IOException { + super(conf); + this.time = new SystemTime(); + } + + @Override + protected void configPolicy(Map customConfigs) { + try { + if (customConfigs.get(CRON_POLICY_END_DATE) != null && + !customConfigs.get(CRON_POLICY_END_DATE).toString().equals("")) { + endDate = Date.from(LocalDateTime.parse(customConfigs.get(CRON_POLICY_END_DATE).toString().trim()) + .atZone(ZoneId.systemDefault()).toInstant()); + } + executionTime = ExecutionTime.forCron( + new CronParser(CronDefinitionBuilder.instanceDefinitionFor(CronType.QUARTZ)) + .parse(customConfigs.get(CRON_POLICY_EXPRESSION).toString()) + ); + } catch (DateTimeException dte) { + throw new ConfigException(CRON_POLICY_END_DATE + " property must have a proper value. Got: '" + + customConfigs.get(CRON_POLICY_END_DATE) + "'."); + } catch (IllegalArgumentException iae) { + throw new ConfigException(CRON_POLICY_EXPRESSION + " property must have a proper value. 
Got: '" + + customConfigs.get(CRON_POLICY_EXPRESSION) + "'."); + } + } + + @Override + protected void preCheck() { + executionTime.timeToNextExecution(ZonedDateTime.now()) + .ifPresent(next -> time.sleep(next.toMillis())); + } + + @Override + protected boolean isPolicyCompleted() { + return (endDate != null && + endDate.before(Date.from(LocalDateTime.now().atZone(ZoneId.systemDefault()).toInstant()))) || + !executionTime.timeToNextExecution(ZonedDateTime.now()).isPresent(); + } +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java index e928d13..8d2f0d6 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java @@ -10,12 +10,14 @@ import org.apache.hadoop.hdfs.client.HdfsAdmin; import org.apache.hadoop.hdfs.inotify.Event; import org.apache.hadoop.hdfs.inotify.EventBatch; +import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.common.utils.SystemTime; +import org.apache.kafka.common.utils.Time; import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.IllegalWorkerStateException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; import java.util.concurrent.ConcurrentLinkedQueue; @@ -25,40 +27,65 @@ public class HdfsFileWatcherPolicy extends AbstractPolicy { private static final Logger log = LoggerFactory.getLogger(HdfsFileWatcherPolicy.class); private static final String URI_PREFIX = "hdfs://"; + private static final long DEFAULT_POLL = 5000L; + private static final long DEFAULT_RETRY = 20000L; + private static final String HDFS_FILE_WATCHER_POLICY_PREFIX = FsSourceTaskConfig.POLICY_PREFIX + "hdfs_file_watcher."; + + public static final String HDFS_FILE_WATCHER_POLICY_POLL_MS = HDFS_FILE_WATCHER_POLICY_PREFIX + "poll"; + public static final String HDFS_FILE_WATCHER_POLICY_RETRY_MS = HDFS_FILE_WATCHER_POLICY_PREFIX + "retry"; + private final Queue fileQueue; + private final Time time; private Map fsEvenStream; + private long pollSleepMs; + private long retrySleepMs; public HdfsFileWatcherPolicy(FsSourceTaskConfig conf) throws IOException { super(conf); - this.fileQueue = new ConcurrentLinkedQueue(); + this.fileQueue = new ConcurrentLinkedQueue<>(); + this.time = new SystemTime(); startWatchers(); } @Override protected void configPolicy(Map customConfigs) { + try { + this.pollSleepMs = Long.parseLong((String) customConfigs + .getOrDefault(HDFS_FILE_WATCHER_POLICY_POLL_MS, String.valueOf(DEFAULT_POLL))); + } catch (NumberFormatException nfe) { + throw new ConfigException(HDFS_FILE_WATCHER_POLICY_POLL_MS + " property is required and must be a " + + "number (long). Got: " + customConfigs.get(HDFS_FILE_WATCHER_POLICY_POLL_MS)); + } + try { + this.retrySleepMs = Long.parseLong((String) customConfigs + .getOrDefault(HDFS_FILE_WATCHER_POLICY_RETRY_MS, String.valueOf(DEFAULT_RETRY))); + } catch (NumberFormatException nfe) { + throw new ConfigException(HDFS_FILE_WATCHER_POLICY_RETRY_MS + " property is required and must be a " + + "number (long). 
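// --------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the patch): the new CronPolicy
// above is configured through the two policy.cron.* properties it defines. Only
// those two keys come from this diff; the remaining keys (fs.uris, topic,
// policy.class, policy.regexp, file_reader.class) are assumed to be the
// connector's usual settings, and the URI, topic, expression and date are sample
// values. The expression must be a Quartz cron expression (CronType.QUARTZ) and
// the optional end date an ISO-8601 local date-time, as parsed in configPolicy().
// --------------------------------------------------------------------------
import com.github.mmolimar.kafka.connect.fs.policy.CronPolicy;

import java.util.HashMap;
import java.util.Map;

public class CronPolicyConfigSketch {

    public static Map<String, String> sampleProps() {
        Map<String, String> props = new HashMap<>();
        props.put("fs.uris", "file:///data");                                        // assumed standard key
        props.put("topic", "mytopic");                                               // assumed standard key
        props.put("policy.class", CronPolicy.class.getName());                       // assumed standard key
        props.put("policy.regexp", "^.*\\.txt$");                                    // assumed standard key
        props.put("file_reader.class",
                "com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader");  // assumed standard key
        // Poll the file systems every ten minutes, until the end of 2025.
        props.put(CronPolicy.CRON_POLICY_EXPRESSION, "0 0/10 * ? * *");
        props.put(CronPolicy.CRON_POLICY_END_DATE, "2025-12-31T23:59:59");
        return props;
    }

    public static void main(String[] args) {
        sampleProps().forEach((k, v) -> System.out.println(k + "=" + v));
    }
}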
Got: " + customConfigs.get(HDFS_FILE_WATCHER_POLICY_RETRY_MS)); + } this.fsEvenStream = new HashMap<>(); fileSystems.stream() .filter(fs -> fs.getWorkingDirectory().toString().startsWith(URI_PREFIX)) .forEach(fs -> { try { HdfsAdmin admin = new HdfsAdmin(fs.getWorkingDirectory().toUri(), fs.getConf()); - fsEvenStream.put(fs, new EventStreamThread(fs, admin)); + fsEvenStream.put(fs, new EventStreamThread(fs, admin, retrySleepMs)); } catch (IOException ioe) { - throw new ConnectException("Error creating admin for notifications", ioe); + throw new ConnectException("Error creating HDFS notifications.", ioe); } }); } private void startWatchers() { - fsEvenStream.values().forEach(stream -> stream.start()); + fsEvenStream.values().forEach(Thread::start); } private void stopWatchers() { - fsEvenStream.values().forEach(stream -> stream.interrupt()); + fsEvenStream.values().forEach(Thread::interrupt); } @Override - public Iterator listFiles(FileSystem fs) throws IOException { + public Iterator listFiles(FileSystem fs) { Set files = new HashSet<>(); FileMetadata metadata; while ((metadata = fileQueue.poll()) != null) { @@ -69,14 +96,7 @@ public Iterator listFiles(FileSystem fs) throws IOException { @Override protected boolean isPolicyCompleted() { - boolean hasRunningThreads = false; - for (EventStreamThread thread : fsEvenStream.values()) { - if (thread.isAlive()) { - hasRunningThreads = true; - break; - } - } - return !hasRunningThreads; + return fsEvenStream.values().stream().noneMatch(Thread::isAlive); } @Override @@ -85,6 +105,11 @@ public void interrupt() { super.interrupt(); } + @Override + public void postCheck() { + time.sleep(pollSleepMs); + } + @Override public void close() throws IOException { stopWatchers(); @@ -94,54 +119,76 @@ public void close() throws IOException { private class EventStreamThread extends Thread { private final FileSystem fs; private final HdfsAdmin admin; + private final long retrySleepMs; + private final Time time; - protected EventStreamThread(FileSystem fs, HdfsAdmin admin) { + EventStreamThread(FileSystem fs, HdfsAdmin admin, long retrySleepMs) { this.fs = fs; this.admin = admin; + this.retrySleepMs = retrySleepMs; + this.time = new SystemTime(); } @Override public void run() { - try { - DFSInotifyEventInputStream eventStream = admin.getInotifyEventStream(); - while (fs.getFileStatus(fs.getWorkingDirectory()) != null && - fs.exists(fs.getWorkingDirectory())) { - EventBatch batch = eventStream.poll(); - if (batch == null) continue; - - for (Event event : batch.getEvents()) { - switch (event.getEventType()) { - case CREATE: - enqueue(((Event.CreateEvent) event).getPath()); - break; - case APPEND: - enqueue(((Event.AppendEvent) event).getPath()); - break; - case CLOSE: - enqueue(((Event.CloseEvent) event).getPath()); - break; - default: - break; + while (true) { + try { + DFSInotifyEventInputStream eventStream = admin.getInotifyEventStream(); + if (fs.getFileStatus(fs.getWorkingDirectory()) != null && + fs.exists(fs.getWorkingDirectory())) { + EventBatch batch = eventStream.poll(); + if (batch == null) continue; + + for (Event event : batch.getEvents()) { + switch (event.getEventType()) { + case CREATE: + if (!((Event.CreateEvent) event).getPath().endsWith("._COPYING_")) { + enqueue(((Event.CreateEvent) event).getPath()); + } + break; + case APPEND: + if (!((Event.AppendEvent) event).getPath().endsWith("._COPYING_")) { + enqueue(((Event.AppendEvent) event).getPath()); + } + break; + case RENAME: + if (((Event.RenameEvent) event).getSrcPath().endsWith("._COPYING_")) 
{ + enqueue(((Event.RenameEvent) event).getDstPath()); + } + break; + case CLOSE: + if (!((Event.CloseEvent) event).getPath().endsWith("._COPYING_")) { + enqueue(((Event.CloseEvent) event).getPath()); + } + break; + default: + break; + } } } + } catch (IOException ioe) { + if (retrySleepMs > 0) { + time.sleep(retrySleepMs); + } else { + log.warn("Error watching path [{}]. Stopping it...", fs.getWorkingDirectory(), ioe); + throw new IllegalWorkerStateException(ioe); + } + } catch (Exception e) { + log.warn("Stopping watcher due to an unexpected exception when watching path [{}].", + fs.getWorkingDirectory(), e); + throw new IllegalWorkerStateException(e); } - } catch (FileNotFoundException fnfe) { - log.warn("Cannot find file in this FS {}. Stopping watcher...", fs.getWorkingDirectory(), fnfe); - } catch (IOException ioe) { - log.info("An interrupted exception has occurred. Path {} is not watched any more", fs.getWorkingDirectory()); - } catch (Exception ioe) { - log.warn("Exception watching path {}", fs.getWorkingDirectory(), ioe); - throw new IllegalWorkerStateException(ioe); } } private void enqueue(String path) throws IOException { Path filePath = new Path(path); if (!fs.exists(filePath) || fs.getFileStatus(filePath) == null) { - log.info("Cannot enqueue file {} because it does not exist but got an event from the FS", filePath.toString()); + log.info("Cannot enqueue file [{}] because it does not exist but got an event from the FS", filePath); return; } + log.debug("Enqueuing file to process [{}]", filePath); RemoteIterator it = fs.listFiles(filePath, false); while (it.hasNext()) { LocatedFileStatus status = it.next(); @@ -151,4 +198,3 @@ private void enqueue(String path) throws IOException { } } } - diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/Policy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/Policy.java index 8cb3232..370288f 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/Policy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/Policy.java @@ -19,5 +19,7 @@ public interface Policy extends Closeable { List getURIs(); + long getExecutions(); + void interrupt(); } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java index 4919c34..2a02884 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java @@ -31,14 +31,14 @@ public SleepyPolicy(FsSourceTaskConfig conf) throws IOException { @Override protected void configPolicy(Map customConfigs) { try { - this.sleep = Long.valueOf((String) customConfigs.get(SLEEPY_POLICY_SLEEP_MS)); + this.sleep = Long.parseLong((String) customConfigs.get(SLEEPY_POLICY_SLEEP_MS)); } catch (NumberFormatException nfe) { throw new ConfigException(SLEEPY_POLICY_SLEEP_MS + " property is required and must be a number(long). Got: " + customConfigs.get(SLEEPY_POLICY_SLEEP_MS)); } if (customConfigs.get(SLEEPY_POLICY_MAX_EXECS) != null) { try { - this.maxExecs = Long.valueOf((String) customConfigs.get(SLEEPY_POLICY_MAX_EXECS)); + this.maxExecs = Long.parseLong((String) customConfigs.get(SLEEPY_POLICY_MAX_EXECS)); } catch (NumberFormatException nfe) { throw new ConfigException(SLEEPY_POLICY_MAX_EXECS + " property must be a number(long). 
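// --------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the patch): the reworked
// HdfsFileWatcherPolicy above now sleeps between executions and retries failed
// inotify reads, controlled by the two policy.hdfs_file_watcher.* properties it
// defines (defaults 5000 ms and 20000 ms). Only those two keys come from this
// diff; the remaining keys and all values are assumed/sample ones.
// --------------------------------------------------------------------------
import com.github.mmolimar.kafka.connect.fs.policy.HdfsFileWatcherPolicy;

import java.util.HashMap;
import java.util.Map;

public class HdfsFileWatcherPolicyConfigSketch {

    public static void main(String[] args) {
        Map<String, String> props = new HashMap<>();
        props.put("fs.uris", "hdfs://localhost:8020/data");                          // assumed standard key, sample URI
        props.put("policy.class", HdfsFileWatcherPolicy.class.getName());            // assumed standard key
        props.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_POLL_MS, "2000");   // check the event queue every 2 s
        props.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_RETRY_MS, "30000"); // retry inotify errors after 30 s
        props.forEach((k, v) -> System.out.println(k + "=" + v));
    }
}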
Got: " + customConfigs.get(SLEEPY_POLICY_MAX_EXECS)); @@ -48,7 +48,7 @@ protected void configPolicy(Map customConfigs) { } if (customConfigs.get(SLEEPY_POLICY_SLEEP_FRACTION) != null) { try { - this.sleepFraction = Long.valueOf((String) customConfigs.get(SLEEPY_POLICY_SLEEP_FRACTION)); + this.sleepFraction = Long.parseLong((String) customConfigs.get(SLEEPY_POLICY_SLEEP_FRACTION)); } catch (NumberFormatException nfe) { throw new ConfigException(SLEEPY_POLICY_SLEEP_FRACTION + " property must be a number(long). Got: " + customConfigs.get(SLEEPY_POLICY_SLEEP_FRACTION)); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java index babe70c..04fa75c 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java @@ -3,9 +3,10 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.policy.Policy; -import org.apache.commons.lang.reflect.ConstructorUtils; +import org.apache.commons.lang3.reflect.ConstructorUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.errors.ConnectException; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; @@ -15,24 +16,24 @@ public class ReflectionUtils { public static FileReader makeReader(Class clazz, FileSystem fs, - Path path, Map config) throws Throwable { + Path path, Map config) { return make(clazz, fs, path, config); } - public static Policy makePolicy(Class clazz, FsSourceTaskConfig conf) throws Throwable { + public static Policy makePolicy(Class clazz, FsSourceTaskConfig conf) { return make(clazz, conf); } - private static T make(Class clazz, Object... args) throws Throwable { + private static T make(Class clazz, Object... 
args) {
         try {
-            Class[] constClasses = Arrays.stream(args).map(arg -> arg.getClass()).toArray(Class[]::new);
+            Class[] constClasses = Arrays.stream(args).map(Object::getClass).toArray(Class[]::new);
-            Constructor constructor = ConstructorUtils.getMatchingAccessibleConstructor(clazz, constClasses);
-            return (T) constructor.newInstance(args);
+            Constructor constructor = ConstructorUtils.getMatchingAccessibleConstructor(clazz, constClasses);
+            return constructor.newInstance(args);
         } catch (IllegalAccessException | InstantiationException | InvocationTargetException e) {
-            throw e.getCause();
+            throw new ConnectException(e.getCause());
         }
     }
 }
diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/TailCall.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/TailCall.java
new file mode 100644
index 0000000..5b82099
--- /dev/null
+++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/TailCall.java
@@ -0,0 +1,44 @@
+package com.github.mmolimar.kafka.connect.fs.util;
+
+import java.util.stream.Stream;
+
+@FunctionalInterface
+public interface TailCall {
+
+    TailCall apply();
+
+    default boolean completed() {
+        return false;
+    }
+
+    default T result() {
+        throw new IllegalStateException("Call does not have a value.");
+    }
+
+    default T invoke() {
+        return Stream.iterate(this, TailCall::apply)
+                .filter(TailCall::completed)
+                .findFirst()
+                .get()
+                .result();
+    }
+
+    static TailCall done(final T value) {
+        return new TailCall() {
+            @Override
+            public boolean completed() {
+                return true;
+            }
+
+            @Override
+            public T result() {
+                return value;
+            }
+
+            @Override
+            public TailCall apply() {
+                throw new IllegalStateException("Done cannot be applied.");
+            }
+        };
+    }
+}
diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Version.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Version.java
index 23d2312..7e94e04 100644
--- a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Version.java
+++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Version.java
@@ -22,4 +22,4 @@ public class Version {
     public static String getVersion() {
         return version;
     }
-}
\ No newline at end of file
+}
diff --git a/src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem b/src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem
new file mode 100644
index 0000000..de86f4a
--- /dev/null
+++ b/src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem
@@ -0,0 +1,15 @@
+org.apache.hadoop.fs.LocalFileSystem
+org.apache.hadoop.fs.viewfs.ViewFileSystem
+org.apache.hadoop.fs.HarFileSystem
+org.apache.hadoop.fs.http.HttpFileSystem
+org.apache.hadoop.fs.http.HttpsFileSystem
+org.apache.hadoop.fs.ftp.FTPFileSystem
+org.apache.hadoop.hdfs.DistributedFileSystem
+org.apache.hadoop.fs.s3a.S3AFileSystem
+org.apache.hadoop.fs.s3native.NativeS3FileSystem
+org.apache.hadoop.fs.adl.AdlFileSystem
+org.apache.hadoop.fs.azure.NativeAzureFileSystem
+org.apache.hadoop.fs.azure.NativeAzureFileSystem$Secure
+org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem
+org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem
+com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem
diff --git a/src/main/resources/kafka-connect-fs-version.properties b/src/main/resources/kafka-connect-fs-version.properties
index e5683df..defbd48 100644
--- a/src/main/resources/kafka-connect-fs-version.properties
+++ b/src/main/resources/kafka-connect-fs-version.properties
@@ -1 +1 @@
-version=${project.version}
\ No newline at end of file
+version=${project.version}
diff --git
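// --------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the patch): TailCall is the
// small trampoline used by AbstractPolicy.hasNextRec() earlier in this diff.
// Generic type parameters appear to have been stripped from this rendering of
// the patch; the sketch assumes the interface is TailCall<T>, with TailCall<T>
// apply(), T result(), T invoke() and static <T> TailCall<T> done(T value).
// The sum() example itself is invented for the demo.
// --------------------------------------------------------------------------
import com.github.mmolimar.kafka.connect.fs.util.TailCall;

public class TailCallSketch {

    // Stack-safe recursion: each step returns the next call as a lambda instead
    // of recursing directly; invoke() then iterates apply() until a done() value
    // is reached, so the call depth stays constant.
    private static TailCall<Long> sum(long n, long acc) {
        if (n == 0) {
            return TailCall.done(acc);
        }
        return () -> sum(n - 1, acc + n);
    }

    public static void main(String[] args) {
        // One million recursive steps without growing the stack.
        System.out.println(sum(1_000_000L, 0L).invoke());   // prints 500000500000
    }
}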
a/src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractHdfsFsConfig.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractHdfsFsConfig.java new file mode 100644 index 0000000..f3fef89 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractHdfsFsConfig.java @@ -0,0 +1,44 @@ +package com.github.mmolimar.kafka.connect.fs; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.hdfs.MiniDFSCluster; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.Files; + +public abstract class AbstractHdfsFsConfig implements FsTestConfig { + private MiniDFSCluster cluster; + private FileSystem fs; + private URI fsUri; + + @Override + public final void initFs() throws IOException { + Configuration clusterConfig = new Configuration(); + java.nio.file.Path hdfsDir = Files.createTempDirectory("test-"); + clusterConfig.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsDir.toAbsolutePath().toString()); + cluster = new MiniDFSCluster.Builder(clusterConfig).build(); + fsUri = URI.create("hdfs://localhost:" + cluster.getNameNodePort() + "/"); + fs = FileSystem.newInstance(fsUri, new Configuration()); + init(); + } + + protected abstract void init() throws IOException; + + @Override + public FileSystem getFs() { + return fs; + } + + @Override + public URI getFsUri() { + return fsUri; + } + + @Override + public void close() throws IOException { + fs.close(); + cluster.shutdown(true); + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractLocalFsConfig.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractLocalFsConfig.java new file mode 100644 index 0000000..dab5736 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractLocalFsConfig.java @@ -0,0 +1,41 @@ +package com.github.mmolimar.kafka.connect.fs; + +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.Files; + +public abstract class AbstractLocalFsConfig implements FsTestConfig { + private java.nio.file.Path localDir; + private FileSystem fs; + private URI fsUri; + + @Override + public final void initFs() throws IOException { + localDir = Files.createTempDirectory("test-"); + fsUri = localDir.toUri(); + fs = FileSystem.newInstance(fsUri, new Configuration()); + init(); + } + + protected abstract void init() throws IOException; + + @Override + public FileSystem getFs() { + return fs; + } + + @Override + public URI getFsUri() { + return fsUri; + } + + @Override + public void close() throws IOException { + fs.close(); + FileUtils.deleteDirectory(localDir.toFile()); + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/FsTestConfig.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/FsTestConfig.java new file mode 100644 index 0000000..64b9c4c --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/FsTestConfig.java @@ -0,0 +1,17 @@ +package com.github.mmolimar.kafka.connect.fs; + +import org.apache.hadoop.fs.FileSystem; + +import java.io.Closeable; +import java.io.IOException; +import java.net.URI; + +public interface FsTestConfig extends Closeable { + + void initFs() throws IOException; + + FileSystem getFs(); + + URI getFsUri(); + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorConfigTest.java 
b/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorConfigTest.java index 2a33262..5f0538e 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorConfigTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorConfigTest.java @@ -2,10 +2,10 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceConnectorConfig; import org.apache.kafka.common.config.ConfigDef; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class FsSourceConnectorConfigTest { @@ -13,9 +13,9 @@ public class FsSourceConnectorConfigTest { public void checkDocumentation() { ConfigDef config = FsSourceConnectorConfig.conf(); config.names().forEach(key -> { - assertFalse("Property " + key + " should be documented", - config.configKeys().get(key).documentation == null || - "".equals(config.configKeys().get(key).documentation.trim())); + assertFalse(config.configKeys().get(key).documentation == null || + "".equals(config.configKeys().get(key).documentation.trim()), + () -> "Property " + key + " should be documented"); }); } @@ -23,4 +23,4 @@ public void checkDocumentation() { public void toRst() { assertNotNull(FsSourceConnectorConfig.conf().toRst()); } -} \ No newline at end of file +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorTest.java index 5fc9c5e..a67a92e 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorTest.java @@ -4,55 +4,53 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceTask; import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import org.apache.kafka.connect.errors.ConnectException; -import org.junit.Before; -import org.junit.ClassRule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; import java.io.File; -import java.io.IOException; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.IntStream; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class FsSourceConnectorTest { - @ClassRule - public static final TemporaryFolder temporaryFolder = new TemporaryFolder(); + @TempDir + public static File temporaryFolder; private FsSourceConnector connector; private Map connProps; - @Before - public void setup() throws IOException { + @BeforeEach + public void setup() { connector = new FsSourceConnector(); Map cfg = new HashMap() {{ put(FsSourceTaskConfig.FS_URIS, String.join(",", - temporaryFolder.getRoot().toURI() + File.separator + "dir1", - temporaryFolder.getRoot().toURI() + File.separator + "dir2", - temporaryFolder.getRoot().toURI() + File.separator + "dir3")); + temporaryFolder.toURI() + File.separator + "dir1", + temporaryFolder.toURI() + File.separator + "dir2", + temporaryFolder.toURI() + File.separator + "dir3")); put(FsSourceTaskConfig.TOPIC, "topic_test"); }}; connProps = new HashMap<>(cfg); } - @Test(expected = ConnectException.class) + @Test public void nullProperties() { - 
connector.start(null); + assertThrows(ConnectException.class, () -> connector.start(null)); } - @Test(expected = ConnectException.class) + @Test public void expectedFsUris() { Map testProps = new HashMap<>(connProps); testProps.remove(FsSourceTaskConfig.FS_URIS); - connector.start(testProps); + assertThrows(ConnectException.class, () -> connector.start(testProps)); } @Test - public void minimunConfig() { + public void minimumConfig() { connector.start(connProps); connector.stop(); } @@ -62,15 +60,15 @@ public void checkTaskClass() { assertEquals(FsSourceTask.class, connector.taskClass()); } - @Test(expected = ConnectException.class) + @Test public void configTasksWithoutStart() { - connector.taskConfigs(1); + assertThrows(ConnectException.class, () -> connector.taskConfigs(1)); } - @Test(expected = IllegalArgumentException.class) + @Test public void invalidConfigTaskNumber() { connector.start(connProps); - connector.taskConfigs(0); + assertThrows(IllegalArgumentException.class, () -> connector.taskConfigs(0)); } @Test @@ -80,7 +78,7 @@ public void configTasks() { IntStream.range(1, connProps.get(FsSourceTaskConfig.FS_URIS).split(",").length + 1) .forEach(index -> { List> taskConfigs = connector.taskConfigs(index); - assertTrue(taskConfigs.size() == (index > uris ? uris : index)); + assertEquals(taskConfigs.size(), Math.min(index, uris)); }); connector.stop(); } @@ -95,5 +93,4 @@ public void checkVersion() { public void checkDefaultConf() { assertNotNull(connector.config()); } - -} \ No newline at end of file +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java new file mode 100644 index 0000000..ab44e27 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java @@ -0,0 +1,184 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.junit.jupiter.api.Nested; + +import java.util.Map; + +public class AgnosticFileReaderTest { + + private static final String FILE_EXTENSION = "test"; + + @Nested + class AgnosticTextFileReaderTest extends TextFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_TEXT, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + + @Nested + class AgnosticCsvFileReaderTest extends CsvFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_CSV, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + + @Nested + class AgnosticTsvFileReaderTest extends TsvFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_TSV, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + + @Nested + class AgnosticFixedWidthFileReaderTest extends 
FixedWidthFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_FIXED, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + + @Nested + class AgnosticJsonFileReaderTest extends JsonFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_JSON, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + + @Nested + class AgnosticAvroFileReaderTest extends AvroFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + + @Nested + class AgnosticParquetFileReaderTest extends ParquetFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + + @Nested + class AgnosticSequenceFileReaderTest extends SequenceFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE, getFileExtension()); + return config; + } + + @Override + public void schemaMapper(ReaderFsTestConfig fsConfig) { + + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java new file mode 100644 index 0000000..5e9d59e --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java @@ -0,0 +1,136 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.apache.avro.AvroTypeException; +import org.apache.avro.Schema; +import org.apache.avro.SchemaParseException; +import org.apache.avro.file.DataFileWriter; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumWriter; +import org.apache.avro.generic.GenericRecord; +import org.apache.avro.io.DatumWriter; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; 
+import java.util.UUID; +import java.util.stream.IntStream; + +import static org.junit.jupiter.api.Assertions.*; + +public class AvroFileReaderTest extends FileReaderTestBase { + + private static final String FIELD_INDEX = "index"; + private static final String FIELD_NAME = "name"; + private static final String FIELD_SURNAME = "surname"; + private static final String FILE_EXTENSION = "avr"; + + private static Schema schema; + + @BeforeAll + public static void setUp() throws IOException { + schema = new Schema.Parser().parse(AvroFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc")); + } + + @Override + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { + File avroFile = File.createTempFile("test-", "." + getFileExtension()); + DatumWriter writer = new GenericDatumWriter<>(schema); + try (DataFileWriter dataFileWriter = new DataFileWriter<>(writer)) { + dataFileWriter.setFlushOnEveryBlock(true); + dataFileWriter.setSyncInterval(32); + dataFileWriter.create(schema, avroFile); + + IntStream.range(0, NUM_RECORDS).forEach(index -> { + GenericRecord datum = new GenericData.Record(schema); + datum.put(FIELD_INDEX, index); + datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID())); + datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID())); + try { + fsConfig.offsetsByIndex().put(index, dataFileWriter.sync() - 16L); + dataFileWriter.append(datum); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + }); + } + Path path = new Path(new Path(fsConfig.getFsUri()), avroFile.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(avroFile.getAbsolutePath()), path); + return path; + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithSchema(ReaderFsTestConfig fsConfig) throws IOException { + Map readerConfig = getReaderConfig(); + readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, schema.toString()); + FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); + fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); + readAllData(fsConfig); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithInvalidSchema(ReaderFsTestConfig fsConfig) throws IOException { + Map readerConfig = getReaderConfig(); + readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, Schema.create(Schema.Type.STRING).toString()); + FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); + fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); + assertThrows(ConnectException.class, () -> readAllData(fsConfig)); + assertThrows(AvroTypeException.class, () -> { + try { + readAllData(fsConfig); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithUnparseableSchema(ReaderFsTestConfig fsConfig) throws IOException { + Map readerConfig = getReaderConfig(); + readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, "invalid schema"); + FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); + assertThrows(ConnectException.class, () -> getReader(testFs, fsConfig.getDataFile(), readerConfig)); + assertThrows(SchemaParseException.class, () -> { + try { + getReader(testFs, fsConfig.getDataFile(), readerConfig); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + + @Override + protected Class 
getReaderClass() { + return AvroFileReader.class; + } + + @Override + protected Map getReaderConfig() { + return new HashMap<>(); + } + + @Override + protected void checkData(Struct record, long index) { + assertAll( + () -> assertEquals((int) (Integer) record.get(FIELD_INDEX), index), + () -> assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")), + () -> assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")) + ); + } + + @Override + protected String getFileExtension() { + return FILE_EXTENSION; + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java new file mode 100644 index 0000000..3eba9c0 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java @@ -0,0 +1,95 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Struct; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.IntStream; + +import static org.junit.jupiter.api.Assertions.*; + +public class CsvFileReaderTest extends UnivocityFileReaderTest { + + @Override + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { + boolean header = args.length < 1 || (boolean) args[0]; + CompressionType compression = args.length < 2 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[1]; + File txtFile = File.createTempFile("test-", "." + getFileExtension()); + try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { + if (header) { + String headerValue = String.join("#", FIELD_COLUMN1, FIELD_COLUMN2, FIELD_COLUMN3, FIELD_COLUMN4, + FIELD_COLUMN5, FIELD_COLUMN6, FIELD_COLUMN7, FIELD_COLUMN8, FIELD_COLUMN9); + writer.append(headerValue + "\n"); + } + IntStream.range(0, NUM_RECORDS).forEach(index -> { + String value = String.format("%d#%d#%d#%d#%f#%f#%s#%s#%s\n", + (byte) 2, (short) 4, 8, 16L, 32.32f, 64.64d, + true, "test bytes", "test string"); + writer.append(value); + fsConfig.offsetsByIndex().put(index, (long) index); + }); + } + Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); + return path; + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readAllDataWithMalformedRows(ReaderFsTestConfig fsConfig) throws IOException { + File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); + try (FileWriter writer = new FileWriter(tmp)) { + String headerValue = String.join(",", FIELD_COLUMN1, FIELD_COLUMN2, FIELD_COLUMN3, FIELD_COLUMN4, + FIELD_COLUMN5, FIELD_COLUMN6, FIELD_COLUMN7, FIELD_COLUMN8, FIELD_COLUMN9); + writer.append(headerValue + "\n"); + writer.append(",\"\",,,,,true,test bytes,test string\n"); + writer.append("#comment\n"); + writer.append(",\"\",,,,,true,test bytes,test string\n"); + } + Map readerConfig = getReaderConfig(); + readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER, ","); + readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_EMPTY_VALUE, "10"); + readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_NULL_VALUE, "100"); + + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + FileReader reader = getReader(fsConfig.getFs(), path, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + assertAll( + () -> assertEquals(record.get(FIELD_COLUMN1), (byte) 100), + () -> assertEquals(record.get(FIELD_COLUMN2), (short) 10), + () -> assertEquals(record.get(FIELD_COLUMN3), 100), + () -> assertEquals(record.get(FIELD_COLUMN4), 100L), + () -> assertEquals(record.get(FIELD_COLUMN5), 100.00f), + () -> assertEquals(record.get(FIELD_COLUMN6), 100.00d), + () -> assertEquals(record.get(FIELD_COLUMN7), true), + () -> assertEquals(new String((byte[]) record.get(FIELD_COLUMN8)), "test bytes"), + () -> assertEquals(record.get(FIELD_COLUMN9), "test string") + ); + recordCount++; + } + assertEquals(2, recordCount, () -> "The number of records in the file does not match"); + } + + @Override + protected Map getReaderConfig() { + return new HashMap() {{ + put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER, "#"); + put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_SCHEMA, "byte,short,int,long,float,double,boolean,bytes,string"); + }}; + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java index e4aa2b4..f21cf49 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java @@ -1,92 +1,140 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.Test; +import org.apache.kafka.connect.errors.ConnectException; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import 
org.junit.jupiter.params.provider.MethodSource; import java.io.*; -import java.net.URI; -import java.util.HashMap; -import java.util.Map; -import java.util.NoSuchElementException; -import java.util.UUID; +import java.util.*; +import java.util.stream.Stream; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; -public abstract class FileReaderTestBase { +abstract class FileReaderTestBase { + private static final List TEST_FILE_SYSTEMS = Arrays.asList( + new LocalFsConfig(), + new HdfsFsConfig() + ); protected static final int NUM_RECORDS = 100; - protected static final Map OFFSETS_BY_INDEX = new HashMap<>(); - protected static Class readerClass; - protected static FileSystem fs; - protected static URI fsUri; - protected static Path dataFile; - protected static Map readerConfig; - protected static FileReader reader; + @BeforeAll + public static void initFs() throws IOException { + for (ReaderFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + fsConfig.initFs(); + } + } - @AfterClass - public static void tearDown() throws IOException { - fs.close(); + @AfterAll + public static void finishFs() throws IOException { + for (ReaderFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + fsConfig.close(); + } } - @Before - public void openReader() throws Throwable { - reader = getReader(fs, dataFile, readerConfig); - assertTrue(reader.getFilePath().equals(dataFile)); + @BeforeEach + public void openReader() throws IOException { + for (ReaderFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + fsConfig.setDataFile(createDataFile(fsConfig)); + FileReader reader = ReflectionUtils.makeReader(getReaderClass(), fsConfig.getFs(), + fsConfig.getDataFile(), getReaderConfig()); + assertEquals(reader.getFilePath(), fsConfig.getDataFile()); + fsConfig.setReader(reader); + } } - @After + @AfterEach public void closeReader() { - try { - reader.close(); - } catch (Exception e) { - //ignoring + for (ReaderFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + try { + fsConfig.getReader().close(); + } catch (Exception e) { + //ignoring + } } } - @Test(expected = IllegalArgumentException.class) - public void invalidArgs() throws Throwable { + private static Stream fileSystemConfigProvider() { + return TEST_FILE_SYSTEMS.stream().map(Arguments::of); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidArgs(ReaderFsTestConfig fsConfig) { try { - readerClass.getConstructor(FileSystem.class, Path.class, Map.class).newInstance(null, null, null); + fsConfig.getReader().getClass().getConstructor(FileSystem.class, Path.class, Map.class) + .newInstance(null, null, null); } catch (Exception e) { - throw e.getCause(); + assertThrows(IllegalArgumentException.class, () -> { + throw e.getCause(); + }); } } - @Test(expected = FileNotFoundException.class) - public void fileDoesNotExist() throws Throwable { - Path path = new Path(new Path(fsUri), UUID.randomUUID().toString()); - getReader(fs, path, readerConfig); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void fileDoesNotExist(ReaderFsTestConfig fsConfig) { + Path path = new Path(new Path(fsConfig.getFsUri()), UUID.randomUUID().toString()); + assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig())); + assertThrows(FileNotFoundException.class, () -> { + try { + getReader(fsConfig.getFs(), path, getReaderConfig()); + } catch (Exception e) { + throw e.getCause(); + } + }); } - @Test(expected = IOException.class) - public void emptyFile() throws Throwable { + 
@ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void emptyFile(ReaderFsTestConfig fsConfig) throws IOException { File tmp = File.createTempFile("test-", "." + getFileExtension()); - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig())); + assertThrows(IOException.class, () -> { + try { + getReader(fsConfig.getFs(), path, getReaderConfig()); + } catch (Exception e) { + throw e.getCause(); + } + }); } - @Test(expected = IOException.class) - public void invalidFileFormat() throws Throwable { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws IOException { File tmp = File.createTempFile("test-", "." + getFileExtension()); try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { writer.write("test"); } - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig())); + assertThrows(IOException.class, () -> { + try { + getReader(fsConfig.getFs(), path, getReaderConfig()); + } catch (Exception e) { + throw e.getCause(); + } + }); } - @Test - public void readAllData() { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readAllData(ReaderFsTestConfig fsConfig) { + FileReader reader = fsConfig.getReader(); assertTrue(reader.hasNext()); int recordCount = 0; @@ -95,61 +143,89 @@ public void readAllData() { checkData(record, recordCount); recordCount++; } - assertEquals("The number of records in the file does not match", NUM_RECORDS, recordCount); + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); } - @Test - public void seekFile() { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void seekFile(ReaderFsTestConfig fsConfig) { + FileReader reader = fsConfig.getReader(); int recordIndex = NUM_RECORDS / 2; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex))); + reader.seek(fsConfig.offsetsByIndex().get(recordIndex)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset()); checkData(reader.next(), recordIndex); recordIndex = 0; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex))); + reader.seek(fsConfig.offsetsByIndex().get(recordIndex)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset()); checkData(reader.next(), recordIndex); recordIndex = NUM_RECORDS - 3; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex))); + reader.seek(fsConfig.offsetsByIndex().get(recordIndex)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, 
reader.currentOffset().getRecordOffset()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset()); checkData(reader.next(), recordIndex); - reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1)); + reader.seek(fsConfig.offsetsByIndex().get(NUM_RECORDS - 1) + 1); assertFalse(reader.hasNext()); - } - @Test(expected = RuntimeException.class) - public void negativeSeek() { - reader.seek(getOffset(-1)); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void negativeSeek(ReaderFsTestConfig fsConfig) { + FileReader reader = fsConfig.getReader(); + assertThrows(IllegalArgumentException.class, () -> reader.seek(-1)); } - @Test(expected = NoSuchElementException.class) - public void exceededSeek() { - reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1)); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void exceededSeek(ReaderFsTestConfig fsConfig) { + FileReader reader = fsConfig.getReader(); + reader.seek(fsConfig.offsetsByIndex().get(NUM_RECORDS - 1) + 1); assertFalse(reader.hasNext()); - reader.next(); + assertThrows(NoSuchElementException.class, reader::next); } - @Test(expected = RuntimeException.class) - public void readFileAlreadyClosed() throws IOException { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readFileAlreadyClosed(ReaderFsTestConfig fsConfig) throws IOException { + FileReader reader = fsConfig.getReader(); reader.close(); - assertFalse(reader.hasNext()); - reader.seek(getOffset(0)); + assertThrows(ConnectException.class, reader::hasNext); + assertThrows(ConnectException.class, reader::next); + assertThrows(ConnectException.class, () -> reader.seek(1)); } - protected final FileReader getReader(FileSystem fs, Path path, Map config) throws Throwable { - return ReflectionUtils.makeReader(readerClass, fs, path, config); + protected final FileReader getReader(FileSystem fs, Path path, Map config) { + return ReflectionUtils.makeReader(getReaderClass(), fs, path, config); } - protected abstract Offset getOffset(long offset); + protected OutputStream getOutputStream(File file, CompressionType compression) throws IOException { + final OutputStream os; + switch (compression) { + case BZIP2: + os = new BZip2CompressorOutputStream(new FileOutputStream(file)); + break; + case GZIP: + os = new GzipCompressorOutputStream(new FileOutputStream(file)); + break; + default: + os = new FileOutputStream(file); + break; + } + return os; + } - protected abstract void checkData(Struct record, long index); + protected abstract Class getReaderClass(); + + protected abstract Path createDataFile(ReaderFsTestConfig fsConfig, Object... 
args) throws IOException; + + protected abstract Map getReaderConfig(); protected abstract String getFileExtension(); + protected abstract void checkData(Struct record, long index); + } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReaderTest.java new file mode 100644 index 0000000..8b1fedc --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReaderTest.java @@ -0,0 +1,64 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.apache.hadoop.fs.Path; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +public class FixedWidthFileReaderTest extends UnivocityFileReaderTest { + + private static final int[] fieldLengths = new int[]{45, 53, 71, 89, 14, 44, 67, 46, 75}; + + @Override + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { + boolean header = args.length < 1 || (boolean) args[0]; + CompressionType compression = args.length < 2 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[1]; + File txtFile = File.createTempFile("test-", "." + getFileExtension()); + try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { + if (header) { + writer.append(String.format("%-" + fieldLengths[0] + "s", FIELD_COLUMN1) + + String.format("%-" + fieldLengths[1] + "s", FIELD_COLUMN2) + + String.format("%-" + fieldLengths[2] + "s", FIELD_COLUMN3) + + String.format("%-" + fieldLengths[3] + "s", FIELD_COLUMN4) + + String.format("%-" + fieldLengths[4] + "s", FIELD_COLUMN5) + + String.format("%-" + fieldLengths[5] + "s", FIELD_COLUMN6) + + String.format("%-" + fieldLengths[6] + "s", FIELD_COLUMN7) + + String.format("%-" + fieldLengths[7] + "s", FIELD_COLUMN8) + + String.format("%-" + fieldLengths[8] + "s", FIELD_COLUMN9) + "\n"); + } + IntStream.range(0, NUM_RECORDS).forEach(index -> { + writer.append(String.format("%-" + fieldLengths[0] + "s", String.format("%d", (byte) 2)) + + String.format("%-" + fieldLengths[1] + "s", String.format("%d", (short) 4)) + + String.format("%-" + fieldLengths[2] + "s", String.format("%d", 8)) + + String.format("%-" + fieldLengths[3] + "s", String.format("%d", 16L)) + + String.format("%-" + fieldLengths[4] + "s", String.format("%f", 32.32f)) + + String.format("%-" + fieldLengths[5] + "s", String.format("%f", 64.64d)) + + String.format("%-" + fieldLengths[6] + "s", String.format("%s", true)) + + String.format("%-" + fieldLengths[7] + "s", String.format("%s", "test bytes")) + + String.format("%-" + fieldLengths[8] + "s", String.format("%s", "test string")) + "\n" + ); + fsConfig.offsetsByIndex().put(index, (long) index); + }); + } + Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); + return path; + } + + @Override + protected Map getReaderConfig() { + return new HashMap() {{ + put(FixedWidthFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + put(FixedWidthFileReader.FILE_READER_DELIMITED_SETTINGS_FIELD_LENGTHS, + Arrays.stream(fieldLengths).mapToObj(String::valueOf).collect(Collectors.joining(","))); + put(FixedWidthFileReader.FILE_READER_DELIMITED_SETTINGS_SCHEMA, "byte,short,int,long,float,double,boolean,bytes,string"); + 
}}; + } + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java new file mode 100644 index 0000000..98e7e5b --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java @@ -0,0 +1,208 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectWriter; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.charset.UnsupportedCharsetException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; +import java.util.stream.IntStream; + +import static org.junit.jupiter.api.Assertions.*; + +public class JsonFileReaderTest extends FileReaderTestBase { + + private static final String FIELD_INTEGER = "integerField"; + private static final String FIELD_LONG = "longField"; + private static final String FIELD_BOOLEAN = "booleanField"; + private static final String FIELD_STRING = "stringField"; + private static final String FIELD_DECIMAL = "decimalField"; + private static final String FIELD_ARRAY = "arrayField"; + private static final String FIELD_STRUCT = "structField"; + private static final String FIELD_NULL = "nullField"; + private static final String FILE_EXTENSION = "jsn"; + private static final CompressionType COMPRESSION_TYPE_DEFAULT = CompressionType.NONE; + + @Override + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { + int numRecords = args.length < 1 ? NUM_RECORDS : (int) args[0]; + boolean recordPerLine = args.length < 2 || (boolean) args[1]; + CompressionType compression = args.length < 3 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[2]; + File txtFile = File.createTempFile("test-", "." + getFileExtension()); + try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { + ObjectWriter jsonWriter = new ObjectMapper().writerWithDefaultPrettyPrinter(); + IntStream.range(0, numRecords).forEach(index -> { + ObjectNode json = JsonNodeFactory.instance.objectNode() + .put(FIELD_INTEGER, index) + .put(FIELD_LONG, Long.MAX_VALUE) + .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) + .put(FIELD_BOOLEAN, true) + .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) + .put(FIELD_NULL, (String) null); + json.putArray(FIELD_ARRAY) + .add("elm[" + index + "]") + .add("elm[" + (index + 1) + "]"); + json.putObject(FIELD_STRUCT) + .put(FIELD_INTEGER, (short) index) + .put(FIELD_LONG, Long.MAX_VALUE) + .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) + .put(FIELD_BOOLEAN, true) + .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) + .put(FIELD_NULL, (String) null); + try { + writer.append(recordPerLine ? 
json.toString() + "\n" : jsonWriter.writeValueAsString(json)); + } catch (JsonProcessingException jpe) { + throw new RuntimeException(jpe); + } + fsConfig.offsetsByIndex().put(index, (long) index); + }); + } + Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); + return path; + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void emptyFile(ReaderFsTestConfig fsConfig) throws IOException { + File tmp = File.createTempFile("test-", "." + getFileExtension()); + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + FileReader reader = getReader(fsConfig.getFs(), path, getReaderConfig()); + assertFalse(reader.hasNext()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void validFileEncoding(ReaderFsTestConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(JsonFileReader.FILE_READER_JSON_ENCODING, "Cp1252"); + fsConfig.setReader(getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); + readAllData(fsConfig); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidDeserializationConfig(ReaderFsTestConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS + "invalid", "false"); + fsConfig.setReader(getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); + readAllData(fsConfig); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidFileEncoding(ReaderFsTestConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(JsonFileReader.FILE_READER_JSON_ENCODING, "invalid_charset"); + assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); + assertThrows(UnsupportedCharsetException.class, () -> { + try { + getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readDataWithRecordPerLineDisabled(ReaderFsTestConfig fsConfig) throws IOException { + Path file = createDataFile(fsConfig, 1, false); + Map readerConfig = getReaderConfig(); + readerConfig.put(JsonFileReader.FILE_READER_JSON_RECORD_PER_LINE, "false"); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals(1, recordCount, "The number of records in the file does not match"); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { + Arrays.stream(CompressionType.values()).forEach(compressionType -> { + try { + Path file = createDataFile(fsConfig, NUM_RECORDS, true, compressionType); + Map readerConfig = getReaderConfig(); + readerConfig.put(JsonFileReader.FILE_READER_JSON_COMPRESSION_TYPE, compressionType.toString()); + readerConfig.put(JsonFileReader.FILE_READER_JSON_COMPRESSION_CONCATENATED, "true"); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record 
= reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + } + + @Override + protected Class getReaderClass() { + return JsonFileReader.class; + } + + @Override + protected Map getReaderConfig() { + return new HashMap() {{ + String deserializationConfig = DeserializationFeature.ACCEPT_EMPTY_ARRAY_AS_NULL_OBJECT.name(); + put(JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS + deserializationConfig, "true"); + }}; + } + + @Override + protected void checkData(Struct record, long index) { + Struct subrecord = record.getStruct(FIELD_STRUCT); + assertAll( + () -> assertEquals((int) (Integer) record.get(FIELD_INTEGER), index), + () -> assertEquals((long) (Long) record.get(FIELD_LONG), Long.MAX_VALUE), + () -> assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")), + () -> assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())), + () -> assertEquals((Double) record.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0), + () -> assertNull(record.get(FIELD_NULL)), + () -> assertNotNull(record.schema().field(FIELD_NULL)), + () -> assertEquals(record.get(FIELD_ARRAY), Arrays.asList("elm[" + index + "]", "elm[" + (index + 1) + "]")), + () -> assertEquals((int) (Integer) subrecord.get(FIELD_INTEGER), index), + () -> assertEquals((long) (Long) subrecord.get(FIELD_LONG), Long.MAX_VALUE), + () -> assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")), + () -> assertTrue(Boolean.parseBoolean(subrecord.get(FIELD_BOOLEAN).toString())), + () -> assertEquals((Double) subrecord.get(FIELD_DECIMAL), Double.parseDouble(index + "." 
+ index), 0), + () -> assertNull(subrecord.get(FIELD_NULL)), + () -> assertNotNull(subrecord.schema().field(FIELD_NULL)) + ); + } + + @Override + protected String getFileExtension() { + return FILE_EXTENSION; + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java new file mode 100644 index 0000000..30dd425 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java @@ -0,0 +1,205 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.apache.avro.AvroRuntimeException; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.SchemaParseException; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.errors.DataException; +import org.apache.parquet.avro.AvroParquetWriter; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.io.InvalidRecordException; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; +import java.util.stream.IntStream; + +import static org.junit.jupiter.api.Assertions.*; + +public class ParquetFileReaderTest extends FileReaderTestBase { + + private static final String FIELD_INDEX = "index"; + private static final String FIELD_NAME = "name"; + private static final String FIELD_SURNAME = "surname"; + private static final String FILE_EXTENSION = "parquet"; + + private static Schema readerSchema; + private static Schema projectionSchema; + + @BeforeAll + public static void setUp() throws IOException { + readerSchema = new Schema.Parser().parse( + ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc")); + projectionSchema = new Schema.Parser().parse( + ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people_projection.avsc")); + } + + @Override + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { + FileSystem fs = fsConfig.getFs(); + File parquetFile = File.createTempFile("test-", "." 
+ getFileExtension()); + + try (ParquetWriter writer = AvroParquetWriter.builder(new Path(parquetFile.toURI())) + .withConf(fs.getConf()).withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withSchema(readerSchema).build()) { + IntStream.range(0, NUM_RECORDS).forEach(index -> { + GenericRecord datum = new GenericData.Record(readerSchema); + datum.put(FIELD_INDEX, index); + String uuid = UUID.randomUUID().toString(); + datum.put(FIELD_NAME, String.format("%d_name_%s", index, uuid)); + datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, uuid)); + try { + fsConfig.offsetsByIndex().put(index, (long) index); + writer.write(datum); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + }); + } + Path path = new Path(new Path(fsConfig.getFsUri()), parquetFile.getName()); + fs.moveFromLocalFile(new Path(parquetFile.getAbsolutePath()), path); + return path; + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void emptyFile(ReaderFsTestConfig fsConfig) throws IOException { + File tmp = File.createTempFile("test-", "." + getFileExtension()); + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fsConfig.getFs(), path, getReaderConfig()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws IOException { + File tmp = File.createTempFile("test-", "." + getFileExtension()); + try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { + writer.write("test"); + } + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fsConfig.getFs(), path, getReaderConfig()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithSchema(ReaderFsTestConfig fsConfig) throws IOException { + Map readerConfig = getReaderConfig(); + readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, readerSchema.toString()); + readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); + FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); + fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); + readAllData(fsConfig); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithProjection(ReaderFsTestConfig fsConfig) throws IOException { + Map readerConfig = getReaderConfig(); + readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, projectionSchema.toString()); + readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); + fsConfig.setReader(getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); + while (fsConfig.getReader().hasNext()) { + Struct record = fsConfig.getReader().next(); + assertNotNull(record.schema().field(FIELD_INDEX)); + assertNotNull(record.schema().field(FIELD_NAME)); + assertNull(record.schema().field(FIELD_SURNAME)); + } + FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); + fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); + assertThrows(DataException.class, () -> readAllData(fsConfig)); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithInvalidProjection(ReaderFsTestConfig fsConfig) throws IOException { + Schema testSchema 
= SchemaBuilder.record("test_projection").namespace("test.avro") + .fields() + .name("field1").type("string").noDefault() + .endRecord(); + Map readerConfig = getReaderConfig(); + readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, testSchema.toString()); + readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); + FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); + fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); + try { + readAllData(fsConfig); + } catch (Exception e) { + assertEquals(ConnectException.class, e.getClass()); + assertEquals(InvalidRecordException.class, e.getCause().getClass()); + } + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithInvalidSchema(ReaderFsTestConfig fsConfig) throws IOException { + Map readerConfig = getReaderConfig(); + readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, Schema.create(Schema.Type.STRING).toString()); + readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); + FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); + fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); + try { + readAllData(fsConfig); + } catch (Exception e) { + assertEquals(ConnectException.class, e.getClass()); + assertEquals(AvroRuntimeException.class, e.getCause().getClass()); + } + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithUnparseableSchema(ReaderFsTestConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, "invalid schema"); + readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); + assertThrows(ConnectException.class, () -> + getReader(FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()), + fsConfig.getDataFile(), readerConfig)); + assertThrows(SchemaParseException.class, () -> { + try { + getReader(FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()), + fsConfig.getDataFile(), readerConfig); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + + @Override + protected Map getReaderConfig() { + return new HashMap<>(); + } + + @Override + protected Class getReaderClass() { + return ParquetFileReader.class; + } + + @Override + protected void checkData(Struct record, long index) { + assertEquals((int) (Integer) record.get(FIELD_INDEX), index); + assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); + assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); + } + + @Override + protected String getFileExtension() { + return FILE_EXTENSION; + } + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ReaderFsTestConfig.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ReaderFsTestConfig.java new file mode 100644 index 0000000..7fde007 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ReaderFsTestConfig.java @@ -0,0 +1,97 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.github.mmolimar.kafka.connect.fs.AbstractHdfsFsConfig; +import com.github.mmolimar.kafka.connect.fs.AbstractLocalFsConfig; +import com.github.mmolimar.kafka.connect.fs.FsTestConfig; +import org.apache.hadoop.fs.Path; + +import java.util.HashMap; +import java.util.Map; + +interface ReaderFsTestConfig extends 
FsTestConfig {
+
+    void setDataFile(Path dataFile);
+
+    Path getDataFile();
+
+    void setReader(FileReader reader);
+
+    FileReader getReader();
+
+    Map<Integer, Long> offsetsByIndex();
+
+}
+
+class LocalFsConfig extends AbstractLocalFsConfig implements ReaderFsTestConfig {
+    private Path dataFile;
+    private FileReader reader;
+    private Map<Integer, Long> offsetsByIndex;
+
+    @Override
+    public void init() {
+        offsetsByIndex = new HashMap<>();
+    }
+
+    @Override
+    public void setDataFile(Path dataFile) {
+        this.dataFile = dataFile;
+    }
+
+    @Override
+    public Path getDataFile() {
+        return dataFile;
+    }
+
+    @Override
+    public void setReader(FileReader reader) {
+        this.reader = reader;
+    }
+
+    @Override
+    public FileReader getReader() {
+        return reader;
+    }
+
+    @Override
+    public Map<Integer, Long> offsetsByIndex() {
+        return offsetsByIndex;
+    }
+
+}
+
+class HdfsFsConfig extends AbstractHdfsFsConfig implements ReaderFsTestConfig {
+    private Path dataFile;
+    private FileReader reader;
+    private Map<Integer, Long> offsetsByIndex;
+
+    @Override
+    public void init() {
+        offsetsByIndex = new HashMap<>();
+    }
+
+    @Override
+    public Path getDataFile() {
+        return dataFile;
+    }
+
+    @Override
+    public void setDataFile(Path dataFile) {
+        this.dataFile = dataFile;
+    }
+
+    @Override
+    public void setReader(FileReader reader) {
+        this.reader = reader;
+    }
+
+    @Override
+    public FileReader getReader() {
+        return reader;
+    }
+
+    @Override
+    public Map<Integer, Long> offsetsByIndex() {
+        return offsetsByIndex;
+    }
+
+}
diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java
new file mode 100644
index 0000000..e70d3dd
--- /dev/null
+++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java
@@ -0,0 +1,161 @@
+package com.github.mmolimar.kafka.connect.fs.file.reader;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.kafka.connect.data.SchemaBuilder;
+import org.apache.kafka.connect.data.Struct;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.UUID;
+import java.util.stream.IntStream;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+public class SequenceFileReaderTest extends FileReaderTestBase {
+
+    private static final String FIELD_NAME_KEY = "custom_field_key";
+    private static final String FIELD_NAME_VALUE = "custom_field_name";
+    private static final String FILE_EXTENSION = "sq";
+
+    @Override
+    protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException {
+        FileSystem fs = fsConfig.getFs();
+        File seqFile = File.createTempFile("test-", "."
+ getFileExtension()); + try (SequenceFile.Writer writer = SequenceFile.createWriter(fs.getConf(), + SequenceFile.Writer.file(new Path(seqFile.getAbsolutePath())), + SequenceFile.Writer.keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class))) { + IntStream.range(0, NUM_RECORDS).forEach(index -> { + Writable key = new IntWritable(index); + Writable value = new Text(String.format("%d_%s", index, UUID.randomUUID())); + try { + writer.append(key, value); + writer.sync(); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + }); + } + try (SequenceFile.Reader reader = new SequenceFile.Reader(fs.getConf(), + SequenceFile.Reader.file(new Path(seqFile.getAbsolutePath())))) { + Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), fs.getConf()); + Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), fs.getConf()); + int index = 0; + long pos = reader.getPosition() - 1; + while (reader.next(key, value)) { + fsConfig.offsetsByIndex().put(index++, pos); + pos = reader.getPosition(); + } + } + Path path = new Path(new Path(fsConfig.getFsUri()), seqFile.getName()); + fs.moveFromLocalFile(new Path(seqFile.getAbsolutePath()), path); + return path; + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void defaultFieldNames(ReaderFsTestConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_KEY, null); + readerConfig.put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_VALUE, null); + FileReader reader = getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); + assertEquals(reader.getFilePath(), fsConfig.getDataFile()); + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(SequenceFileReader.FIELD_NAME_KEY_DEFAULT, SequenceFileReader.FIELD_NAME_VALUE_DEFAULT, + record, recordCount); + recordCount++; + } + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void schemaMapper(ReaderFsTestConfig fsConfig) { + SequenceFileReader reader = (SequenceFileReader) fsConfig.getReader(); + + ByteWritable byteWritable = new ByteWritable((byte) 1); + ShortWritable shortWritable = new ShortWritable((short) 123); + IntWritable intWritable = new IntWritable(123); + LongWritable longWritable = new LongWritable(123L); + FloatWritable floatWritable = new FloatWritable(0.123F); + DoubleWritable doubleWritable = new DoubleWritable(0.123D); + BytesWritable bytesWritable = new BytesWritable(new byte[]{1, 2, 3}); + BooleanWritable booleanWritable = new BooleanWritable(true); + Text textWritable = new Text("123"); + + assertEquals(SchemaBuilder.INT8_SCHEMA, reader.getSchema(byteWritable)); + assertEquals(SchemaBuilder.INT16_SCHEMA, reader.getSchema(shortWritable)); + assertEquals(SchemaBuilder.INT32_SCHEMA, reader.getSchema(intWritable)); + assertEquals(SchemaBuilder.INT64_SCHEMA, reader.getSchema(longWritable)); + assertEquals(SchemaBuilder.FLOAT32_SCHEMA, reader.getSchema(floatWritable)); + assertEquals(SchemaBuilder.FLOAT64_SCHEMA, reader.getSchema(doubleWritable)); + assertEquals(SchemaBuilder.BYTES_SCHEMA, reader.getSchema(bytesWritable)); + assertEquals(SchemaBuilder.BOOLEAN_SCHEMA, reader.getSchema(booleanWritable)); + assertEquals(SchemaBuilder.STRING_SCHEMA, reader.getSchema(textWritable)); + assertEquals(SchemaBuilder.STRING_SCHEMA, 
reader.getSchema(new Writable() { + + @Override + public void write(DataOutput out) { + + } + + @Override + public void readFields(DataInput in) { + + } + })); + + SequenceFileReader.SeqToStruct seqToStruct = new SequenceFileReader.SeqToStruct(); + assertEquals(seqToStruct.toSchemaValue(byteWritable), byteWritable.get()); + assertEquals(seqToStruct.toSchemaValue(shortWritable), shortWritable.get()); + assertEquals(seqToStruct.toSchemaValue(intWritable), intWritable.get()); + assertEquals(seqToStruct.toSchemaValue(longWritable), longWritable.get()); + assertEquals(seqToStruct.toSchemaValue(floatWritable), floatWritable.get()); + assertEquals(seqToStruct.toSchemaValue(doubleWritable), doubleWritable.get()); + assertEquals(seqToStruct.toSchemaValue(bytesWritable), bytesWritable.getBytes()); + assertEquals(seqToStruct.toSchemaValue(booleanWritable), booleanWritable.get()); + assertEquals(seqToStruct.toSchemaValue(textWritable), textWritable.toString()); + } + + @Override + protected Class getReaderClass() { + return SequenceFileReader.class; + } + + @Override + protected Map getReaderConfig() { + return new HashMap() {{ + put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_KEY, FIELD_NAME_KEY); + put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + }}; + } + + @Override + protected void checkData(Struct record, long index) { + checkData(FIELD_NAME_KEY, FIELD_NAME_VALUE, record, index); + } + + private void checkData(String keyFieldName, String valueFieldName, Struct record, long index) { + assertAll( + () -> assertEquals((int) (Integer) record.get(keyFieldName), index), + () -> assertTrue(record.get(valueFieldName).toString().startsWith(index + "_")) + ); + } + + @Override + protected String getFileExtension() { + return FILE_EXTENSION; + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java new file mode 100644 index 0000000..5e56ac6 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java @@ -0,0 +1,144 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.charset.UnsupportedCharsetException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; +import java.util.stream.IntStream; + +import static org.junit.jupiter.api.Assertions.*; + +public class TextFileReaderTest extends FileReaderTestBase { + + private static final String FIELD_NAME_VALUE = "custom_field_name"; + private static final String FILE_EXTENSION = "txt"; + private static final CompressionType COMPRESSION_TYPE_DEFAULT = CompressionType.GZIP; + + @Override + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { + CompressionType compression = args.length < 1 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[0]; + File txtFile = File.createTempFile("test-", "." 
+ FILE_EXTENSION); + try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { + IntStream.range(0, NUM_RECORDS).forEach(index -> { + String value = String.format("%d_%s", index, UUID.randomUUID()); + writer.append(value + "\n"); + fsConfig.offsetsByIndex().put(index, (long) index); + }); + } + Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); + return path; + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void validFileEncoding(ReaderFsTestConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + readerConfig.put(TextFileReader.FILE_READER_TEXT_ENCODING, "Cp1252"); + readerConfig.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE_DEFAULT); + FileReader reader = getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); + fsConfig.setReader(reader); + readAllData(fsConfig); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidFileEncoding(ReaderFsTestConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + readerConfig.put(TextFileReader.FILE_READER_TEXT_ENCODING, "invalid_charset"); + readerConfig.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE_DEFAULT); + assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); + assertThrows(UnsupportedCharsetException.class, () -> { + try { + getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readDataWithRecordPerLineDisabled(ReaderFsTestConfig fsConfig) throws IOException { + Path file = createDataFile(fsConfig, COMPRESSION_TYPE_DEFAULT); + Map readerConfig = getReaderConfig(); + readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + readerConfig.put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, "false"); + readerConfig.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE_DEFAULT); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals(1, recordCount, () -> "The number of records in the file does not match"); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { + Arrays.stream(CompressionType.values()).forEach(compressionType -> { + try { + Path file = createDataFile(fsConfig, compressionType); + Map readerConfig = getReaderConfig(); + readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + readerConfig.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, compressionType); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not 
match"); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + } + + @Override + protected Class getReaderClass() { + return TextFileReader.class; + } + + @Override + protected Map getReaderConfig() { + return new HashMap() {{ + put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE_DEFAULT); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_CONCATENATED, "true"); + }}; + } + + @Override + protected void checkData(Struct record, long index) { + assertTrue(record.get(FIELD_NAME_VALUE).toString().startsWith(index + "_")); + } + + @Override + protected String getFileExtension() { + return FILE_EXTENSION; + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java new file mode 100644 index 0000000..d82a50e --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java @@ -0,0 +1,45 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.apache.hadoop.fs.Path; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.IntStream; + +public class TsvFileReaderTest extends UnivocityFileReaderTest { + + @Override + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { + boolean header = args.length < 1 || (boolean) args[0]; + CompressionType compression = args.length < 2 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[1]; + File txtFile = File.createTempFile("test-", "." + getFileExtension()); + try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { + if (header) { + String headerValue = String.join("\t", FIELD_COLUMN1, FIELD_COLUMN2, FIELD_COLUMN3, FIELD_COLUMN4, + FIELD_COLUMN5, FIELD_COLUMN6, FIELD_COLUMN7, FIELD_COLUMN8, FIELD_COLUMN9); + writer.append(headerValue + "\n"); + } + IntStream.range(0, NUM_RECORDS).forEach(index -> { + String value = String.format("%d\t%d\t%d\t%d\t%f\t%f\t%s\t%s\t%s\n", + (byte) 2, (short) 4, 8, 16L, 32.32f, 64.64d, + true, "test bytes", "test string"); + writer.append(value); + fsConfig.offsetsByIndex().put(index, (long) index); + }); + } + Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); + return path; + } + + @Override + protected Map getReaderConfig() { + return new HashMap() {{ + put(TsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + put(TsvFileReader.FILE_READER_DELIMITED_SETTINGS_SCHEMA, "byte,short,int,long,float,double,boolean,bytes,string"); + }}; + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java new file mode 100644 index 0000000..79663bc --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java @@ -0,0 +1,296 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.univocity.parsers.common.DataProcessingException; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; +import org.junit.jupiter.params.ParameterizedTest; 
+import org.junit.jupiter.params.provider.MethodSource;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.lang.reflect.ParameterizedType;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+abstract class UnivocityFileReaderTest<T extends UnivocityFileReader> extends FileReaderTestBase {
+
+    protected static final String FIELD_COLUMN1 = "column_1";
+    protected static final String FIELD_COLUMN2 = "column_2";
+    protected static final String FIELD_COLUMN3 = "column_3";
+    protected static final String FIELD_COLUMN4 = "column_4";
+    protected static final String FIELD_COLUMN5 = "column_5";
+    protected static final String FIELD_COLUMN6 = "column_6";
+    protected static final String FIELD_COLUMN7 = "column_7";
+    protected static final String FIELD_COLUMN8 = "column_8";
+    protected static final String FIELD_COLUMN9 = "column_9";
+    protected static final String FILE_EXTENSION = "tcsv";
+    protected static final CompressionType COMPRESSION_TYPE_DEFAULT = CompressionType.NONE;
+
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void emptyFile(ReaderFsTestConfig fsConfig) throws IOException {
+        File tmp = File.createTempFile("test-", "." + getFileExtension());
+        Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName());
+        fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path);
+        getReader(fsConfig.getFs(), path, getReaderConfig());
+    }
+
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws IOException {
+        File tmp = File.createTempFile("test-", "." + getFileExtension());
+        try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) {
+            writer.write("test");
+        }
+        Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName());
+        fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path);
+        assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig()));
+        assertThrows(IllegalArgumentException.class, () -> {
+            try {
+                getReader(fsConfig.getFs(), path, getReaderConfig());
+            } catch (Exception ce) {
+                throw ce.getCause();
+            }
+        });
+    }
+
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void invalidConfigArgs(ReaderFsTestConfig fsConfig) {
+        try {
+            getReaderClass().getConstructor(FileSystem.class, Path.class, Map.class)
+                    .newInstance(fsConfig.getFs(), fsConfig.getDataFile(), new HashMap<String, Object>());
+        } catch (Exception e) {
+            assertThrows(IllegalArgumentException.class, () -> {
+                throw e.getCause();
+            });
+        }
+    }
+
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void readAllDataWithoutHeader(ReaderFsTestConfig fsConfig) throws IOException {
+        Path file = createDataFile(fsConfig, false);
+        Map<String, Object> readerConfig = getReaderConfig();
+        readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "false");
+        FileReader reader = getReader(fsConfig.getFs(), file, readerConfig);
+
+        assertTrue(reader.hasNext());
+
+        int recordCount = 0;
+        while (reader.hasNext()) {
+            Struct record = reader.next();
+            checkData(record, recordCount);
+            recordCount++;
+        }
+        assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match");
+    }
+
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void readAllDataWithoutSchema(ReaderFsTestConfig fsConfig) throws IOException {
+        Path file =
createDataFile(fsConfig, true); + Map readerConfig = getReaderConfig(); + readerConfig.remove(T.FILE_READER_DELIMITED_SETTINGS_SCHEMA); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkDataString(record); + recordCount++; + } + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readAllDataWithMappingErrors(ReaderFsTestConfig fsConfig) throws IOException { + Path file = createDataFile(fsConfig, true); + Map readerConfig = getReaderConfig(); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_SCHEMA, "boolean,boolean,boolean,boolean,boolean,boolean,int,long,double"); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + try { + reader.next(); + } catch (Exception e) { + assertEquals(ConnectException.class, e.getClass()); + assertEquals(DataProcessingException.class, e.getCause().getClass()); + } + recordCount++; + } + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readAllDataToleratingMappingErrors(ReaderFsTestConfig fsConfig) throws IOException { + Path file = createDataFile(fsConfig, true); + Map readerConfig = getReaderConfig(); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_SCHEMA, "boolean,boolean,boolean,boolean,boolean,boolean,int,long,double"); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_DATA_TYPE_MAPPING_ERROR, "false"); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkDataNull(record); + recordCount++; + } + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { + Arrays.stream(CompressionType.values()).forEach(compressionType -> { + try { + Path file = createDataFile(fsConfig, true, compressionType); + Map readerConfig = getReaderConfig(); + readerConfig.put(T.FILE_READER_DELIMITED_COMPRESSION_TYPE, compressionType.toString()); + readerConfig.put(T.FILE_READER_DELIMITED_COMPRESSION_CONCATENATED, "true"); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void seekFileWithoutHeader(ReaderFsTestConfig fsConfig) throws IOException { + Path file = createDataFile(fsConfig, false); + Map readerConfig = getReaderConfig(); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "false"); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordIndex = NUM_RECORDS / 
2; + reader.seek(fsConfig.offsetsByIndex().get(recordIndex)); + assertTrue(reader.hasNext()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset()); + checkData(reader.next(), recordIndex); + + recordIndex = 0; + reader.seek(fsConfig.offsetsByIndex().get(recordIndex)); + assertTrue(reader.hasNext()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset()); + checkData(reader.next(), recordIndex); + + recordIndex = NUM_RECORDS - 3; + reader.seek(fsConfig.offsetsByIndex().get(recordIndex)); + assertTrue(reader.hasNext()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset()); + checkData(reader.next(), recordIndex); + + reader.seek(fsConfig.offsetsByIndex().get(NUM_RECORDS - 1) + 1); + assertFalse(reader.hasNext()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void validFileEncoding(ReaderFsTestConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + readerConfig.put(T.FILE_READER_DELIMITED_ENCODING, "Cp1252"); + getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidFileEncoding(ReaderFsTestConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + readerConfig.put(T.FILE_READER_DELIMITED_ENCODING, "invalid_charset"); + assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); + assertThrows(UnsupportedCharsetException.class, () -> { + try { + getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + + @Override + protected Class getReaderClass() { + return (Class) ((ParameterizedType) this.getClass().getGenericSuperclass()).getActualTypeArguments()[0]; + } + + @Override + protected void checkData(Struct record, long index) { + assertAll( + () -> assertEquals(record.get(FIELD_COLUMN1), (byte) 2), + () -> assertEquals(record.get(FIELD_COLUMN2), (short) 4), + () -> assertEquals(record.get(FIELD_COLUMN3), 8), + () -> assertEquals(record.get(FIELD_COLUMN4), 16L), + () -> assertEquals(record.get(FIELD_COLUMN5), 32.32f), + () -> assertEquals(record.get(FIELD_COLUMN6), 64.64d), + () -> assertEquals(record.get(FIELD_COLUMN7), true), + () -> assertEquals(new String((byte[]) record.get(FIELD_COLUMN8)), "test bytes"), + () -> assertEquals(record.get(FIELD_COLUMN9), "test string") + ); + } + + protected void checkDataString(Struct record) { + assertAll( + () -> assertEquals(record.get(FIELD_COLUMN1), "2"), + () -> assertEquals(record.get(FIELD_COLUMN2), "4"), + () -> assertEquals(record.get(FIELD_COLUMN3), "8"), + () -> assertEquals(record.get(FIELD_COLUMN4), "16"), + () -> assertEquals(record.get(FIELD_COLUMN5), "32.320000"), + () -> assertEquals(record.get(FIELD_COLUMN6), "64.640000"), + () -> assertEquals(record.get(FIELD_COLUMN7), "true"), + () -> assertEquals(record.get(FIELD_COLUMN8), "test bytes"), + () -> assertEquals(record.get(FIELD_COLUMN9), "test string") + ); + } + + protected void checkDataNull(Struct record) { + assertAll( + () -> assertEquals(record.get(FIELD_COLUMN1), null), + () -> assertEquals(record.get(FIELD_COLUMN2), null), + () -> assertEquals(record.get(FIELD_COLUMN3), null), + () -> assertEquals(record.get(FIELD_COLUMN4), null), + () -> assertEquals(record.get(FIELD_COLUMN5), null), + () -> 
assertEquals(record.get(FIELD_COLUMN6), null), + () -> assertEquals(record.get(FIELD_COLUMN7), null), + () -> assertEquals(record.get(FIELD_COLUMN8), null), + () -> assertEquals(record.get(FIELD_COLUMN9), null) + ); + } + + @Override + protected String getFileExtension() { + return FILE_EXTENSION; + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java deleted file mode 100644 index 67a772e..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java +++ /dev/null @@ -1,113 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.hdfs; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.AvroFileReader; -import org.apache.avro.AvroTypeException; -import org.apache.avro.Schema; -import org.apache.avro.SchemaParseException; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.DatumWriter; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.Assert.assertTrue; - -public class AvroFileReaderTest extends HdfsFileReaderTestBase { - - private static final String FIELD_INDEX = "index"; - private static final String FIELD_NAME = "name"; - private static final String FIELD_SURNAME = "surname"; - private static final String FILE_EXTENSION = "avro"; - - private static Schema schema; - - @BeforeClass - public static void setUp() throws IOException { - schema = new Schema.Parser().parse(AvroFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc")); - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap<>(); - } - - private static Path createDataFile() throws IOException { - File avroFile = File.createTempFile("test-", "." 
+ FILE_EXTENSION); - DatumWriter writer = new GenericDatumWriter<>(schema); - try (DataFileWriter dataFileWriter = new DataFileWriter<>(writer)) { - dataFileWriter.setFlushOnEveryBlock(true); - dataFileWriter.setSyncInterval(32); - dataFileWriter.create(schema, avroFile); - - IntStream.range(0, NUM_RECORDS).forEach(index -> { - GenericRecord datum = new GenericData.Record(schema); - datum.put(FIELD_INDEX, index); - datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID())); - datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID())); - try { - OFFSETS_BY_INDEX.put(index, dataFileWriter.sync() - 16L); - dataFileWriter.append(datum); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), avroFile.getName()); - fs.moveFromLocalFile(new Path(avroFile.getAbsolutePath()), path); - return path; - } - - @Test - public void readerWithSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(AvroFileReader.FILE_READER_AVRO_SCHEMA, schema.toString()); - }}; - reader = getReader(fs, dataFile, cfg); - readAllData(); - } - - @Test(expected = AvroTypeException.class) - public void readerWithInvalidSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(AvroFileReader.FILE_READER_AVRO_SCHEMA, Schema.create(Schema.Type.STRING).toString()); - }}; - reader = getReader(fs, dataFile, cfg); - readAllData(); - } - - @Test(expected = SchemaParseException.class) - public void readerWithUnparseableSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(AvroFileReader.FILE_READER_AVRO_SCHEMA, "invalid schema"); - }}; - getReader(fs, dataFile, cfg); - } - - @Override - protected Offset getOffset(long offset) { - return new AvroFileReader.AvroOffset(offset); - } - - @Override - protected void checkData(Struct record, long index) { - assertTrue((Integer) record.get(FIELD_INDEX) == index); - assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java deleted file mode 100644 index da5304d..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java +++ /dev/null @@ -1,210 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.hdfs; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.DelimitedTextFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.charset.UnsupportedCharsetException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.Assert.*; - -public class DelimitedTextFileReaderTest extends HdfsFileReaderTestBase { - - private static final String FIELD_COLUMN1 = "column_1"; - private static 
final String FIELD_COLUMN2 = "column_2"; - private static final String FIELD_COLUMN3 = "column_3"; - private static final String FIELD_COLUMN4 = "column_4"; - private static final String FILE_EXTENSION = "csv"; - - @BeforeClass - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(true); - readerConfig = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - }}; - } - - private static Path createDataFile(boolean header) throws IOException { - File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); - try (FileWriter writer = new FileWriter(txtFile)) { - - if (header) - writer.append(FIELD_COLUMN1 + "," + FIELD_COLUMN2 + "," + FIELD_COLUMN3 + "," + FIELD_COLUMN4 + "\n"); - IntStream.range(0, NUM_RECORDS).forEach(index -> { - String value = String.format("%d_%s", index, UUID.randomUUID()); - try { - writer.append(value + "," + value + "," + value + "," + value + "\n"); - if (header) OFFSETS_BY_INDEX.put(index, Long.valueOf(index++)); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), txtFile.getName()); - fs.moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); - return path; - } - - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) - public void emptyFile() throws Throwable { - super.emptyFile(); - } - - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) - public void invalidFileFormat() throws Throwable { - super.invalidFileFormat(); - } - - @Test(expected = IllegalArgumentException.class) - public void invaliConfigArgs() throws Throwable { - try { - readerClass.getConstructor(FileSystem.class, Path.class, Map.class).newInstance(fs, dataFile, new HashMap<>()); - } catch (Exception e) { - throw e.getCause(); - } - } - - @Test - public void readAllDataWithoutHeader() throws Throwable { - Path file = createDataFile(false); - FileReader reader = getReader(fs, file, new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "false"); - }}); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - checkData(record, recordCount); - recordCount++; - } - assertEquals("The number of records in the file does not match", NUM_RECORDS, recordCount); - - } - - @Test - public void readAllDataWithMalformedRows() throws Throwable { - File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); - try (FileWriter writer = new FileWriter(tmp)) { - writer.append(FIELD_COLUMN1 + "," + FIELD_COLUMN2 + "," + FIELD_COLUMN3 + "," + FIELD_COLUMN4 + "\n"); - writer.append("dummy\n"); - writer.append("dummy\n"); - } - Map cfg = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_DEFAULT_VALUE, "custom_value"); - }}; - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - reader = getReader(fs, path, cfg); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - assertTrue(record.get(FIELD_COLUMN1).equals("dummy")); - assertTrue(record.get(FIELD_COLUMN2).equals("custom_value")); - assertTrue(record.get(FIELD_COLUMN3).equals("custom_value")); - assertTrue(record.get(FIELD_COLUMN4).equals("custom_value")); - recordCount++; - } - assertEquals("The number of records in the file does not match", 2, recordCount); - } - - @Test - public void seekFileWithoutHeader() throws Throwable { - Path file = createDataFile(false); - FileReader reader = getReader(fs, file, new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "false"); - }}); - - assertTrue(reader.hasNext()); - - int recordIndex = NUM_RECORDS / 2; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); - assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); - checkData(reader.next(), recordIndex); - - recordIndex = 0; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); - assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); - checkData(reader.next(), recordIndex); - - recordIndex = NUM_RECORDS - 3; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); - assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); - checkData(reader.next(), recordIndex); - - reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1, false)); - assertFalse(reader.hasNext()); - - } - - @Test - public void validFileEncoding() throws Throwable { - Map cfg = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_ENCODING, "Cp1252"); - }}; - getReader(fs, dataFile, cfg); - } - - @Test(expected = UnsupportedCharsetException.class) - public void invalidFileEncoding() throws Throwable { - Map cfg = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_ENCODING, "invalid_charset"); - }}; - getReader(fs, dataFile, cfg); - } - - @Override - protected Offset getOffset(long offset) { - return getOffset(offset, true); - } - - private Offset getOffset(long offset, boolean hasHeader) { - return new DelimitedTextFileReader.DelimitedTextOffset(offset, hasHeader); - } - - @Override - protected void checkData(Struct record, long index) { - assertTrue(record.get(FIELD_COLUMN1).toString().startsWith(index + "_")); - 
assertTrue(record.get(FIELD_COLUMN2).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_COLUMN3).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_COLUMN4).toString().startsWith(index + "_")); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java deleted file mode 100644 index 5a7c1ba..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java +++ /dev/null @@ -1,35 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.hdfs; - -import com.github.mmolimar.kafka.connect.fs.file.reader.FileReaderTestBase; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.junit.AfterClass; -import org.junit.BeforeClass; - -import java.io.IOException; -import java.net.URI; -import java.nio.file.Files; -import java.nio.file.Path; - -public abstract class HdfsFileReaderTestBase extends FileReaderTestBase { - - private static MiniDFSCluster cluster; - private static Configuration clusterConfig; - private static Path hdfsDir; - - @BeforeClass - public static void initFs() throws IOException { - clusterConfig = new Configuration(); - hdfsDir = Files.createTempDirectory("test-"); - clusterConfig.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsDir.toAbsolutePath().toString()); - cluster = new MiniDFSCluster.Builder(clusterConfig).build(); - fsUri = URI.create("hdfs://localhost:" + cluster.getNameNodePort() + "/"); - fs = FileSystem.newInstance(fsUri, new Configuration()); - } - - @AfterClass - public static void finishFs() throws Exception { - cluster.shutdown(true); - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java deleted file mode 100644 index ae0e82c..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java +++ /dev/null @@ -1,151 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.hdfs; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.ParquetFileReader; -import org.apache.avro.AvroRuntimeException; -import org.apache.avro.Schema; -import org.apache.avro.SchemaBuilder; -import org.apache.avro.SchemaParseException; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.errors.DataException; -import org.apache.parquet.avro.AvroParquetWriter; -import org.apache.parquet.hadoop.ParquetFileWriter; -import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.io.InvalidRecordException; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.Assert.*; - -public class ParquetFileReaderTest extends 
HdfsFileReaderTestBase { - - private static final String FIELD_INDEX = "index"; - private static final String FIELD_NAME = "name"; - private static final String FIELD_SURNAME = "surname"; - private static final String FILE_EXTENSION = "parquet"; - - private static Schema readerSchema; - private static Schema projectionSchema; - - @BeforeClass - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap<>(); - } - - private static Path createDataFile() throws IOException { - File parquetFile = File.createTempFile("test-", "." + FILE_EXTENSION); - readerSchema = new Schema.Parser().parse( - ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc")); - projectionSchema = new Schema.Parser().parse( - ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people_projection.avsc")); - - try (ParquetWriter writer = AvroParquetWriter.builder(new Path(parquetFile.toURI())) - .withConf(fs.getConf()).withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withSchema(readerSchema).build()) { - - IntStream.range(0, NUM_RECORDS).forEach(index -> { - GenericRecord datum = new GenericData.Record(readerSchema); - datum.put(FIELD_INDEX, index); - datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID())); - datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID())); - try { - OFFSETS_BY_INDEX.put(index, Long.valueOf(index)); - writer.write(datum); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), parquetFile.getName()); - fs.moveFromLocalFile(new Path(parquetFile.getAbsolutePath()), path); - return path; - } - - @Test - public void readerWithSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, readerSchema.toString()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); - } - - @Test(expected = DataException.class) - public void readerWithProjection() throws Throwable { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, projectionSchema.toString()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - while (reader.hasNext()) { - Struct record = reader.next(); - assertNotNull(record.schema().field(FIELD_INDEX)); - assertNotNull(record.schema().field(FIELD_NAME)); - assertNull(record.schema().field(FIELD_SURNAME)); - } - - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); - } - - @Test(expected = InvalidRecordException.class) - public void readerWithInvalidProjection() throws Throwable { - Schema testSchema = SchemaBuilder.record("test_projection").namespace("test.avro") - .fields() - .name("field1").type("string").noDefault() - .endRecord(); - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, testSchema.toString()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); - } - - @Test(expected = AvroRuntimeException.class) - public void readerWithInvalidSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, Schema.create(Schema.Type.STRING).toString()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); - } - - @Test(expected = 
SchemaParseException.class) - public void readerWithUnparseableSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, "invalid schema"); - }}; - getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - } - - @Override - protected Offset getOffset(long offset) { - return new ParquetFileReader.ParquetOffset(offset); - } - - @Override - protected void checkData(Struct record, long index) { - assertTrue((Integer) record.get(FIELD_INDEX) == index); - assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } - -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java deleted file mode 100644 index d7e6ba0..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java +++ /dev/null @@ -1,111 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.hdfs; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.SequenceFileReader; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.util.ReflectionUtils; -import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -public class SequenceFileReaderTest extends HdfsFileReaderTestBase { - - private static final String FIELD_NAME_KEY = "key"; - private static final String FIELD_NAME_VALUE = "value"; - private static final String FILE_EXTENSION = "seq"; - - @BeforeClass - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap() {{ - put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_KEY, FIELD_NAME_KEY); - put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - }}; - } - - private static Path createDataFile() throws IOException { - File seqFile = File.createTempFile("test-", "." 
+ FILE_EXTENSION); - try (SequenceFile.Writer writer = SequenceFile.createWriter(fs.getConf(), SequenceFile.Writer.file(new Path(seqFile.getAbsolutePath())), - SequenceFile.Writer.keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class))) { - - IntStream.range(0, NUM_RECORDS).forEach(index -> { - Writable key = new IntWritable(index); - Writable value = new Text(String.format("%d_%s", index, UUID.randomUUID())); - try { - writer.append(key, value); - writer.sync(); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - try (SequenceFile.Reader reader = new SequenceFile.Reader(fs.getConf(), - SequenceFile.Reader.file(new Path(seqFile.getAbsolutePath())))) { - Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), fs.getConf()); - Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), fs.getConf()); - int index = 0; - long pos = reader.getPosition() - 1; - while (reader.next(key, value)) { - OFFSETS_BY_INDEX.put(index++, pos); - pos = reader.getPosition(); - } - } - Path path = new Path(new Path(fsUri), seqFile.getName()); - fs.moveFromLocalFile(new Path(seqFile.getAbsolutePath()), path); - return path; - } - - @Test - public void defaultFieldNames() throws Throwable { - Map customReaderCfg = new HashMap<>(); - reader = getReader(fs, dataFile, customReaderCfg); - assertTrue(reader.getFilePath().equals(dataFile)); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - checkData(SequenceFileReader.FIELD_NAME_KEY_DEFAULT, SequenceFileReader.FIELD_NAME_VALUE_DEFAULT, record, recordCount); - recordCount++; - } - assertEquals("The number of records in the file does not match", NUM_RECORDS, recordCount); - } - - @Override - protected Offset getOffset(long offset) { - return new SequenceFileReader.SeqOffset(offset); - } - - @Override - protected void checkData(Struct record, long index) { - checkData(FIELD_NAME_KEY, FIELD_NAME_VALUE, record, index); - } - - private void checkData(String keyFieldName, String valueFieldName, Struct record, long index) { - assertTrue((Integer) record.get(keyFieldName) == index); - assertTrue(record.get(valueFieldName).toString().startsWith(index + "_")); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } - -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java deleted file mode 100644 index 0c37d4d..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java +++ /dev/null @@ -1,102 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.hdfs; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.charset.UnsupportedCharsetException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.Assert.assertTrue; - -public class TextFileReaderTest extends HdfsFileReaderTestBase { - - private static 
final String FIELD_NAME_VALUE = "custom_field_name"; - private static final String FILE_EXTENSION = "txt"; - - @BeforeClass - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - }}; - } - - private static Path createDataFile() throws IOException { - File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); - try (FileWriter writer = new FileWriter(txtFile)) { - - IntStream.range(0, NUM_RECORDS).forEach(index -> { - String value = String.format("%d_%s", index, UUID.randomUUID()); - try { - writer.append(value + "\n"); - OFFSETS_BY_INDEX.put(index, Long.valueOf(index++)); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), txtFile.getName()); - fs.moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); - return path; - } - - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) - public void emptyFile() throws Throwable { - super.emptyFile(); - } - - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) - public void invalidFileFormat() throws Throwable { - super.invalidFileFormat(); - } - - @Test - public void validFileEncoding() throws Throwable { - Map cfg = new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(TextFileReader.FILE_READER_TEXT_ENCODING, "Cp1252"); - }}; - reader = getReader(fs, dataFile, cfg); - readAllData(); - } - - @Test(expected = UnsupportedCharsetException.class) - public void invalidFileEncoding() throws Throwable { - Map cfg = new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(TextFileReader.FILE_READER_TEXT_ENCODING, "invalid_charset"); - }}; - getReader(fs, dataFile, cfg); - } - - @Override - protected Offset getOffset(long offset) { - return new TextFileReader.TextOffset(offset); - } - - @Override - protected void checkData(Struct record, long index) { - assertTrue(record.get(FIELD_NAME_VALUE).toString().startsWith(index + "_")); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } - -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/AvroFileReaderTest.java deleted file mode 100644 index de4ed20..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/AvroFileReaderTest.java +++ /dev/null @@ -1,119 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.local; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.AvroFileReader; -import org.apache.avro.AvroTypeException; -import org.apache.avro.Schema; -import org.apache.avro.SchemaParseException; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.DatumWriter; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import 
java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.Assert.assertTrue; - -public class AvroFileReaderTest extends LocalFileReaderTestBase { - - private static final String FIELD_INDEX = "index"; - private static final String FIELD_NAME = "name"; - private static final String FIELD_SURNAME = "surname"; - private static final String FILE_EXTENSION = "avr"; - - private static Schema schema; - - @BeforeClass - public static void setUp() throws IOException { - schema = new Schema.Parser().parse(AvroFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc")); - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap() {{ - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, FILE_EXTENSION); - }}; - } - - private static Path createDataFile() throws IOException { - File avroFile = File.createTempFile("test-", "." + FILE_EXTENSION); - DatumWriter writer = new GenericDatumWriter<>(schema); - try (DataFileWriter dataFileWriter = new DataFileWriter<>(writer)) { - dataFileWriter.setFlushOnEveryBlock(true); - dataFileWriter.setSyncInterval(32); - dataFileWriter.create(schema, avroFile); - - IntStream.range(0, NUM_RECORDS).forEach(index -> { - GenericRecord datum = new GenericData.Record(schema); - datum.put(FIELD_INDEX, index); - datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID())); - datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID())); - try { - OFFSETS_BY_INDEX.put(index, dataFileWriter.sync() - 16L); - dataFileWriter.append(datum); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), avroFile.getName()); - fs.moveFromLocalFile(new Path(avroFile.getAbsolutePath()), path); - return path; - } - - @Test - public void readerWithSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(AvroFileReader.FILE_READER_AVRO_SCHEMA, schema.toString()); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, getFileExtension()); - }}; - reader = getReader(fs, dataFile, cfg); - readAllData(); - } - - @Test(expected = AvroTypeException.class) - public void readerWithInvalidSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(AvroFileReader.FILE_READER_AVRO_SCHEMA, Schema.create(Schema.Type.STRING).toString()); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, getFileExtension()); - }}; - reader = getReader(fs, dataFile, cfg); - readAllData(); - } - - @Test(expected = SchemaParseException.class) - public void readerWithUnparseableSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(AvroFileReader.FILE_READER_AVRO_SCHEMA, "invalid schema"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, getFileExtension()); - }}; - getReader(fs, dataFile, cfg); - } - - @Override - protected Offset getOffset(long offset) { - return new AvroFileReader.AvroOffset(offset); - } - - @Override - protected void checkData(Struct record, long index) { - assertTrue((Integer) record.get(FIELD_INDEX) == index); - assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } - -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java 
b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java deleted file mode 100644 index 5884240..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java +++ /dev/null @@ -1,219 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.local; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.DelimitedTextFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.charset.UnsupportedCharsetException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.Assert.*; - -public class DelimitedTextFileReaderTest extends LocalFileReaderTestBase { - - private static final String FIELD_COLUMN1 = "column_1"; - private static final String FIELD_COLUMN2 = "column_2"; - private static final String FIELD_COLUMN3 = "column_3"; - private static final String FIELD_COLUMN4 = "column_4"; - private static final String FILE_EXTENSION = "tcsv"; - - @BeforeClass - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(true); - readerConfig = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, FILE_EXTENSION); - }}; - } - - private static Path createDataFile(boolean header) throws IOException { - File txtFile = File.createTempFile("test-", "." 
+ FILE_EXTENSION); - try (FileWriter writer = new FileWriter(txtFile)) { - - if (header) - writer.append(FIELD_COLUMN1 + "," + FIELD_COLUMN2 + "," + FIELD_COLUMN3 + "," + FIELD_COLUMN4 + "\n"); - IntStream.range(0, NUM_RECORDS).forEach(index -> { - String value = String.format("%d_%s", index, UUID.randomUUID()); - try { - writer.append(value + "," + value + "," + value + "," + value + "\n"); - if (header) OFFSETS_BY_INDEX.put(index, Long.valueOf(index++)); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), txtFile.getName()); - fs.moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); - return path; - } - - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) - public void emptyFile() throws Throwable { - super.emptyFile(); - } - - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) - public void invalidFileFormat() throws Throwable { - super.invalidFileFormat(); - } - - @Test(expected = IllegalArgumentException.class) - public void invaliConfigArgs() throws Throwable { - try { - readerClass.getConstructor(FileSystem.class, Path.class, Map.class).newInstance(fs, dataFile, - new HashMap() {{ - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, FILE_EXTENSION); - }}); - } catch (Exception e) { - throw e.getCause(); - } - } - - @Test - public void readAllDataWithoutHeader() throws Throwable { - Path file = createDataFile(false); - FileReader reader = getReader(fs, file, new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "false"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, getFileExtension()); - }}); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - checkData(record, recordCount); - recordCount++; - } - assertEquals("The number of records in the file does not match", NUM_RECORDS, recordCount); - - } - - @Test - public void readAllDataWithMalformedRows() throws Throwable { - File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); - try (FileWriter writer = new FileWriter(tmp)) { - writer.append(FIELD_COLUMN1 + "," + FIELD_COLUMN2 + "," + FIELD_COLUMN3 + "," + FIELD_COLUMN4 + "\n"); - writer.append("dummy\n"); - writer.append("dummy\n"); - } - Map cfg = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_DEFAULT_VALUE, "custom_value"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, getFileExtension()); - }}; - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - reader = getReader(fs, path, cfg); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - assertTrue(record.get(FIELD_COLUMN1).equals("dummy")); - assertTrue(record.get(FIELD_COLUMN2).equals("custom_value")); - assertTrue(record.get(FIELD_COLUMN3).equals("custom_value")); - assertTrue(record.get(FIELD_COLUMN4).equals("custom_value")); - recordCount++; - } - assertEquals("The number of records in the file does not match", 2, recordCount); - } - - @Test - public void seekFileWithoutHeader() throws Throwable { - Path file = createDataFile(false); - FileReader reader = getReader(fs, file, new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "false"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, getFileExtension()); - }}); - - assertTrue(reader.hasNext()); - - int recordIndex = NUM_RECORDS / 2; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); - assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); - checkData(reader.next(), recordIndex); - - recordIndex = 0; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); - assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); - checkData(reader.next(), recordIndex); - - recordIndex = NUM_RECORDS - 3; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); - assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); - checkData(reader.next(), recordIndex); - - reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1, false)); - assertFalse(reader.hasNext()); - - } - - @Test - public void validFileEncoding() throws Throwable { - Map cfg = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_ENCODING, "Cp1252"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, getFileExtension()); - }}; - getReader(fs, dataFile, cfg); - } - - @Test(expected = UnsupportedCharsetException.class) - public void invalidFileEncoding() throws Throwable { - Map cfg = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_ENCODING, "invalid_charset"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, getFileExtension()); - }}; - getReader(fs, dataFile, cfg); - } - - @Override - protected Offset 
getOffset(long offset) { - return getOffset(offset, true); - } - - private Offset getOffset(long offset, boolean hasHeader) { - return new DelimitedTextFileReader.DelimitedTextOffset(offset, hasHeader); - } - - @Override - protected void checkData(Struct record, long index) { - assertTrue(record.get(FIELD_COLUMN1).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_COLUMN2).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_COLUMN3).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_COLUMN4).toString().startsWith(index + "_")); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/LocalFileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/LocalFileReaderTestBase.java deleted file mode 100644 index 6589e92..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/LocalFileReaderTestBase.java +++ /dev/null @@ -1,30 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.local; - -import com.github.mmolimar.kafka.connect.fs.file.reader.FileReaderTestBase; -import org.apache.commons.io.FileUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.junit.AfterClass; -import org.junit.BeforeClass; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -public abstract class LocalFileReaderTestBase extends FileReaderTestBase { - - private static Path localDir; - - @BeforeClass - public static void initFs() throws IOException { - localDir = Files.createTempDirectory("test-"); - fsUri = localDir.toUri(); - fs = FileSystem.newInstance(fsUri, new Configuration()); - } - - @AfterClass - public static void finishFs() throws IOException { - FileUtils.deleteDirectory(localDir.toFile()); - } - -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java deleted file mode 100644 index 91c1eb6..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java +++ /dev/null @@ -1,158 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.local; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.ParquetFileReader; -import org.apache.avro.AvroRuntimeException; -import org.apache.avro.Schema; -import org.apache.avro.SchemaBuilder; -import org.apache.avro.SchemaParseException; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.errors.DataException; -import org.apache.parquet.avro.AvroParquetWriter; -import org.apache.parquet.hadoop.ParquetFileWriter; -import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.io.InvalidRecordException; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.Assert.*; - -public class 
ParquetFileReaderTest extends LocalFileReaderTestBase { - - private static final String FIELD_INDEX = "index"; - private static final String FIELD_NAME = "name"; - private static final String FIELD_SURNAME = "surname"; - private static final String FILE_EXTENSION = "prqt"; - - private static Schema readerSchema; - private static Schema projectionSchema; - - @BeforeClass - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap() {{ - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, FILE_EXTENSION); - }}; - } - - private static Path createDataFile() throws IOException { - File parquetFile = File.createTempFile("test-", "." + FILE_EXTENSION); - readerSchema = new Schema.Parser().parse( - ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc")); - projectionSchema = new Schema.Parser().parse( - ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people_projection.avsc")); - - try (ParquetWriter writer = AvroParquetWriter.builder(new Path(parquetFile.toURI())) - .withConf(fs.getConf()).withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withSchema(readerSchema).build()) { - - IntStream.range(0, NUM_RECORDS).forEach(index -> { - GenericRecord datum = new GenericData.Record(readerSchema); - datum.put(FIELD_INDEX, index); - datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID())); - datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID())); - try { - OFFSETS_BY_INDEX.put(index, Long.valueOf(index)); - writer.write(datum); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), parquetFile.getName()); - fs.moveFromLocalFile(new Path(parquetFile.getAbsolutePath()), path); - return path; - } - - @Test - public void readerWithSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, readerSchema.toString()); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); - } - - @Test(expected = DataException.class) - public void readerWithProjection() throws Throwable { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, projectionSchema.toString()); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - while (reader.hasNext()) { - Struct record = reader.next(); - assertNotNull(record.schema().field(FIELD_INDEX)); - assertNotNull(record.schema().field(FIELD_NAME)); - assertNull(record.schema().field(FIELD_SURNAME)); - } - - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); - } - - @Test(expected = InvalidRecordException.class) - public void readerWithInvalidProjection() throws Throwable { - Schema testSchema = SchemaBuilder.record("test_projection").namespace("test.avro") - .fields() - .name("field1").type("string").noDefault() - .endRecord(); - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, testSchema.toString()); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); - } 
- - @Test(expected = AvroRuntimeException.class) - public void readerWithInvalidSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, Schema.create(Schema.Type.STRING).toString()); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); - } - - @Test(expected = SchemaParseException.class) - public void readerWithUnparseableSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, "invalid schema"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); - }}; - getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - } - - @Override - protected Offset getOffset(long offset) { - return new ParquetFileReader.ParquetOffset(offset); - } - - @Override - protected void checkData(Struct record, long index) { - assertTrue((Integer) record.get(FIELD_INDEX) == index); - assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } - -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/SequenceFileReaderTest.java deleted file mode 100644 index 8d53cb8..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/SequenceFileReaderTest.java +++ /dev/null @@ -1,113 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.local; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.SequenceFileReader; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.util.ReflectionUtils; -import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -public class SequenceFileReaderTest extends LocalFileReaderTestBase { - - private static final String FIELD_NAME_KEY = "custom_field_key"; - private static final String FIELD_NAME_VALUE = "custom_field_name"; - private static final String FILE_EXTENSION = "sq"; - - @BeforeClass - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap() {{ - put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_KEY, FIELD_NAME_KEY); - put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE, FILE_EXTENSION); - }}; - } - - private static Path createDataFile() throws IOException { - File seqFile = File.createTempFile("test-", "." 
+ FILE_EXTENSION); - try (SequenceFile.Writer writer = SequenceFile.createWriter(fs.getConf(), SequenceFile.Writer.file(new Path(seqFile.getAbsolutePath())), - SequenceFile.Writer.keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class))) { - - IntStream.range(0, NUM_RECORDS).forEach(index -> { - Writable key = new IntWritable(index); - Writable value = new Text(String.format("%d_%s", index, UUID.randomUUID())); - try { - writer.append(key, value); - writer.sync(); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - try (SequenceFile.Reader reader = new SequenceFile.Reader(fs.getConf(), - SequenceFile.Reader.file(new Path(seqFile.getAbsolutePath())))) { - Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), fs.getConf()); - Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), fs.getConf()); - int index = 0; - long pos = reader.getPosition() - 1; - while (reader.next(key, value)) { - OFFSETS_BY_INDEX.put(index++, pos); - pos = reader.getPosition(); - } - } - Path path = new Path(new Path(fsUri), seqFile.getName()); - fs.moveFromLocalFile(new Path(seqFile.getAbsolutePath()), path); - return path; - } - - @Test - public void defaultFieldNames() throws Throwable { - Map customReaderCfg = new HashMap() {{ - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE, getFileExtension()); - }}; - reader = getReader(fs, dataFile, customReaderCfg); - assertTrue(reader.getFilePath().equals(dataFile)); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - checkData(SequenceFileReader.FIELD_NAME_KEY_DEFAULT, SequenceFileReader.FIELD_NAME_VALUE_DEFAULT, record, recordCount); - recordCount++; - } - assertEquals("The number of records in the file does not match", NUM_RECORDS, recordCount); - } - - @Override - protected Offset getOffset(long offset) { - return new SequenceFileReader.SeqOffset(offset); - } - - @Override - protected void checkData(Struct record, long index) { - checkData(FIELD_NAME_KEY, FIELD_NAME_VALUE, record, index); - } - - private void checkData(String keyFieldName, String valueFieldName, Struct record, long index) { - assertTrue((Integer) record.get(keyFieldName) == index); - assertTrue(record.get(valueFieldName).toString().startsWith(index + "_")); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java deleted file mode 100644 index 53d9a98..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java +++ /dev/null @@ -1,102 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.local; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.charset.UnsupportedCharsetException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static 
org.junit.Assert.assertTrue; - -public class TextFileReaderTest extends LocalFileReaderTestBase { - - private static final String FIELD_NAME_VALUE = "custom_field_name"; - private static final String FILE_EXTENSION = "txt"; - - @BeforeClass - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - }}; - } - - private static Path createDataFile() throws IOException { - File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); - try (FileWriter writer = new FileWriter(txtFile)) { - - IntStream.range(0, NUM_RECORDS).forEach(index -> { - String value = String.format("%d_%s", index, UUID.randomUUID()); - try { - writer.append(value + "\n"); - OFFSETS_BY_INDEX.put(index, Long.valueOf(index++)); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), txtFile.getName()); - fs.moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); - return path; - } - - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) - public void emptyFile() throws Throwable { - super.emptyFile(); - } - - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) - public void invalidFileFormat() throws Throwable { - super.invalidFileFormat(); - } - - @Test - public void validFileEncoding() throws Throwable { - Map cfg = new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(TextFileReader.FILE_READER_TEXT_ENCODING, "Cp1252"); - }}; - reader = getReader(fs, dataFile, cfg); - readAllData(); - } - - @Test(expected = UnsupportedCharsetException.class) - public void invalidFileEncoding() throws Throwable { - Map cfg = new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(TextFileReader.FILE_READER_TEXT_ENCODING, "invalid_charset"); - }}; - getReader(fs, dataFile, cfg); - } - - @Override - protected Offset getOffset(long offset) { - return new TextFileReader.TextOffset(offset); - } - - @Override - protected void checkData(Struct record, long index) { - assertTrue(record.get(FIELD_NAME_VALUE).toString().startsWith(index + "_")); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } - -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java new file mode 100644 index 0000000..72bac98 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java @@ -0,0 +1,103 @@ +package com.github.mmolimar.kafka.connect.fs.policy; + +import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; +import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; +import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.errors.IllegalWorkerStateException; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.IOException; +import java.time.LocalDateTime; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.*; + +public 
class CronPolicyTest extends PolicyTestBase { + + @Override + protected FsSourceTaskConfig buildSourceTaskConfig(List directories) { + Map cfg = new HashMap() {{ + String[] uris = directories.stream().map(Path::toString) + .toArray(String[]::new); + put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); + put(FsSourceTaskConfig.TOPIC, "topic_test"); + put(FsSourceTaskConfig.POLICY_CLASS, CronPolicy.class.getName()); + put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); + put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); + put(CronPolicy.CRON_POLICY_EXPRESSION, "0/2 * * * * ?"); + put(CronPolicy.CRON_POLICY_END_DATE, LocalDateTime.now().plusDays(1).toString()); + }}; + return new FsSourceTaskConfig(cfg); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + @Override + public void execPolicyAlreadyEnded(PolicyFsTestConfig fsConfig) throws IOException { + fsConfig.getPolicy().execute(); + fsConfig.getPolicy().interrupt(); + assertTrue(fsConfig.getPolicy().hasEnded()); + assertThrows(IllegalWorkerStateException.class, () -> fsConfig.getPolicy().execute()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidCronExpression(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); + originals.put(CronPolicy.CRON_POLICY_EXPRESSION, "invalid"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConfigException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidEndDate(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); + originals.put(CronPolicy.CRON_POLICY_END_DATE, "invalid"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConfigException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void canBeInterrupted(PolicyFsTestConfig fsConfig) throws IOException { + Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), + fsConfig.getSourceTaskConfig()); + + for (int i = 0; i < 5; i++) { + assertFalse(policy.hasEnded()); + policy.execute(); + } + policy.interrupt(); + assertTrue(policy.hasEnded()); + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java new file mode 100644 index 0000000..a29ae5d --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java @@ -0,0 +1,139 @@ 
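[Editorial aside, not part of the patch] The invalid-config tests in CronPolicyTest above (and in the policy tests that follow) rely on ReflectionUtils.makePolicy wrapping the underlying ConfigException in a ConnectException; they assert the wrapper first and then rethrow e.getCause() inside a second assertThrows to check the cause type. A minimal sketch of the same check in condensed form, assuming makePolicy's signature as used in these tests and using only JUnit 5's assertThrows, which returns the caught exception:

    import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig;
    import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils;
    import org.apache.kafka.common.config.ConfigException;
    import org.apache.kafka.connect.errors.ConnectException;

    import static org.junit.jupiter.api.Assertions.assertThrows;
    import static org.junit.jupiter.api.Assertions.assertTrue;

    class PolicyFailureSketch {
        // 'policyClass' is kept raw to mirror the (Class) cast in the surrounding tests;
        // 'cfg' is assumed to carry an invalid setting, as built by those tests.
        static void assertPolicyCreationFails(Class policyClass, FsSourceTaskConfig cfg) {
            ConnectException wrapper = assertThrows(ConnectException.class,
                    () -> ReflectionUtils.makePolicy(policyClass, cfg));
            // makePolicy wraps the validation error, so the root cause is the ConfigException.
            assertTrue(wrapper.getCause() instanceof ConfigException);
        }
    }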
+package com.github.mmolimar.kafka.connect.fs.policy; + +import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; +import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; +import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.errors.IllegalWorkerStateException; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.*; + +public class HdfsFileWatcherPolicyTest extends PolicyTestBase { + + static { + TEST_FILE_SYSTEMS = Collections.singletonList( + new HdfsFsConfig() + ); + } + + @BeforeAll + public static void initFs() throws IOException { + for (PolicyFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + fsConfig.initFs(); + } + } + + @Override + protected FsSourceTaskConfig buildSourceTaskConfig(List directories) { + Map cfg = new HashMap() {{ + String[] uris = directories.stream().map(Path::toString) + .toArray(String[]::new); + put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); + put(FsSourceTaskConfig.TOPIC, "topic_test"); + put(FsSourceTaskConfig.POLICY_CLASS, HdfsFileWatcherPolicy.class.getName()); + put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); + put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); + }}; + return new FsSourceTaskConfig(cfg); + } + + //This policy does not throw any exception. Just stop watching those nonexistent dirs + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + @Override + public void invalidDirectory(PolicyFsTestConfig fsConfig) throws IOException { + for (Path dir : fsConfig.getDirectories()) { + fsConfig.getFs().delete(dir, true); + } + try { + fsConfig.getPolicy().execute(); + } finally { + for (Path dir : fsConfig.getDirectories()) { + fsConfig.getFs().mkdirs(dir); + } + } + } + + //This policy never ends. 
We have to interrupt it + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + @Override + public void execPolicyAlreadyEnded(PolicyFsTestConfig fsConfig) throws IOException { + fsConfig.getPolicy().execute(); + assertFalse(fsConfig.getPolicy().hasEnded()); + fsConfig.getPolicy().interrupt(); + assertTrue(fsConfig.getPolicy().hasEnded()); + assertThrows(IllegalWorkerStateException.class, () -> fsConfig.getPolicy().execute()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void notReachableFileSystem(PolicyFsTestConfig fsConfig) throws InterruptedException { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); + originals.put(FsSourceTaskConfig.FS_URIS, "hdfs://localhost:65432/data"); + originals.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_POLL_MS, "0"); + originals.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_RETRY_MS, "0"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + int count = 0; + while (!policy.hasEnded() && count < 10) { + Thread.sleep(500); + count++; + } + assertTrue(count < 10); + assertTrue(policy.hasEnded()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidPollTime(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); + originals.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_POLL_MS, "invalid"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConfigException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidRetryTime(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); + originals.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_RETRY_MS, "invalid"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConfigException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyFsTestConfig.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyFsTestConfig.java new file mode 100644 index 0000000..60382c9 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyFsTestConfig.java @@ -0,0 +1,112 @@ +package com.github.mmolimar.kafka.connect.fs.policy; + +import com.github.mmolimar.kafka.connect.fs.AbstractHdfsFsConfig; +import com.github.mmolimar.kafka.connect.fs.AbstractLocalFsConfig; +import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; +import com.github.mmolimar.kafka.connect.fs.FsTestConfig; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import 
java.util.UUID; + +interface PolicyFsTestConfig extends FsTestConfig { + + Policy getPolicy(); + + void setPolicy(Policy policy); + + FsSourceTaskConfig getSourceTaskConfig(); + + void setSourceTaskConfig(FsSourceTaskConfig sourceTaskConfig); + + List getDirectories(); + +} + +class LocalFsConfig extends AbstractLocalFsConfig implements PolicyFsTestConfig { + private Policy policy; + private FsSourceTaskConfig sourceTaskConfig; + private List directories; + + @Override + public void init() throws IOException { + directories = new ArrayList() {{ + add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); + add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); + }}; + for (Path dir : directories) { + getFs().mkdirs(dir); + } + } + + @Override + public Policy getPolicy() { + return policy; + } + + @Override + public void setPolicy(Policy policy) { + this.policy = policy; + } + + @Override + public FsSourceTaskConfig getSourceTaskConfig() { + return sourceTaskConfig; + } + + @Override + public void setSourceTaskConfig(FsSourceTaskConfig sourceTaskConfig) { + this.sourceTaskConfig = sourceTaskConfig; + } + + @Override + public List getDirectories() { + return directories; + } + +} + +class HdfsFsConfig extends AbstractHdfsFsConfig implements PolicyFsTestConfig { + private Policy policy; + private FsSourceTaskConfig sourceTaskConfig; + private List directories; + + @Override + public void init() throws IOException { + directories = new ArrayList() {{ + add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); + add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); + }}; + for (Path dir : directories) { + getFs().mkdirs(dir); + } + } + + @Override + public Policy getPolicy() { + return policy; + } + + @Override + public void setPolicy(Policy policy) { + this.policy = policy; + } + + @Override + public FsSourceTaskConfig getSourceTaskConfig() { + return sourceTaskConfig; + } + + @Override + public void setSourceTaskConfig(FsSourceTaskConfig sourceTaskConfig) { + this.sourceTaskConfig = sourceTaskConfig; + } + + @Override + public List getDirectories() { + return directories; + } + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java index 4f2bc24..ba77775 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java @@ -3,107 +3,138 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import com.github.mmolimar.kafka.connect.fs.file.FileMetadata; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; -import org.apache.commons.collections.map.HashedMap; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.IllegalWorkerStateException; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import java.io.FileNotFoundException; import java.io.IOException; 
-import java.net.URI; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.NoSuchElementException; +import java.util.*; +import java.util.stream.Stream; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; -public abstract class PolicyTestBase { +abstract class PolicyTestBase { - protected static FileSystem fs; - protected static Policy policy; - protected static List directories; - protected static FsSourceTaskConfig taskConfig; - protected static URI fsUri; + protected static List TEST_FILE_SYSTEMS = Arrays.asList( + new LocalFsConfig(), + new HdfsFsConfig() + ); - @AfterClass - public static void tearDown() throws Exception { - policy.close(); - fs.close(); + @BeforeAll + public static void initFs() throws IOException { + for (PolicyFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + fsConfig.initFs(); + } } - @Before - public void initPolicy() throws Throwable { - policy = ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - taskConfig); + @AfterAll + public static void finishFs() throws IOException { + for (PolicyFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + fsConfig.getPolicy().close(); + fsConfig.close(); + } } - @After - public void cleanDirs() throws IOException { - for (Path dir : directories) { - fs.delete(dir, true); - fs.mkdirs(dir); + @BeforeEach + public void initPolicy() { + for (PolicyFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + FsSourceTaskConfig sourceTaskConfig = buildSourceTaskConfig(fsConfig.getDirectories()); + Policy policy = ReflectionUtils.makePolicy((Class) sourceTaskConfig + .getClass(FsSourceTaskConfig.POLICY_CLASS), sourceTaskConfig); + fsConfig.setSourceTaskConfig(sourceTaskConfig); + fsConfig.setPolicy(policy); + } + } + + @AfterEach + public void cleanDirsAndClose() throws IOException { + for (PolicyFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + for (Path dir : fsConfig.getDirectories()) { + fsConfig.getFs().delete(dir, true); + fsConfig.getFs().mkdirs(dir); + } + fsConfig.getPolicy().close(); } - policy.close(); } - @Test(expected = IllegalArgumentException.class) - public void invalidArgs() throws Exception { - taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS).getConstructor(taskConfig.getClass()).newInstance(null); + private static Stream fileSystemConfigProvider() { + return TEST_FILE_SYSTEMS.stream().map(Arguments::of); } - @Test(expected = ConfigException.class) - public void invalidConfig() throws Throwable { - ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - new FsSourceTaskConfig(new HashedMap())); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidArgs(PolicyFsTestConfig fsConfig) { + assertThrows(IllegalArgumentException.class, () -> fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS) + .getConstructor(fsConfig.getSourceTaskConfig().getClass()).newInstance(null)); } - @Test - public void interruptPolicy() throws Throwable { - policy.execute(); - policy.interrupt(); - assertTrue(policy.hasEnded()); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidConfig(PolicyFsTestConfig fsConfig) { + assertThrows(ConfigException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), + new FsSourceTaskConfig(new HashMap<>()))); } - @Test(expected = 
FileNotFoundException.class) - public void invalidDirectory() throws IOException { - for (Path dir : directories) { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void interruptPolicy(PolicyFsTestConfig fsConfig) throws IOException { + fsConfig.getPolicy().execute(); + fsConfig.getPolicy().interrupt(); + assertTrue(fsConfig.getPolicy().hasEnded()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidDirectory(PolicyFsTestConfig fsConfig) throws IOException { + FileSystem fs = fsConfig.getFs(); + for (Path dir : fsConfig.getDirectories()) { fs.delete(dir, true); } try { - policy.execute(); + assertThrows(FileNotFoundException.class, () -> fsConfig.getPolicy().execute()); } finally { - for (Path dir : directories) { + for (Path dir : fsConfig.getDirectories()) { fs.mkdirs(dir); } } } - @Test(expected = NoSuchElementException.class) - public void listEmptyDirectories() throws IOException { - Iterator it = policy.execute(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void listEmptyDirectories(PolicyFsTestConfig fsConfig) throws IOException { + Iterator it = fsConfig.getPolicy().execute(); assertFalse(it.hasNext()); - it.next(); + assertThrows(NoSuchElementException.class, it::next); } - @Test - public void oneFilePerFs() throws IOException, InterruptedException { - for (Path dir : directories) { - fs.createNewFile(new Path(dir, String.valueOf(System.nanoTime() + ".txt"))); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void oneFilePerFs(PolicyFsTestConfig fsConfig) throws IOException, InterruptedException { + FileSystem fs = fsConfig.getFs(); + for (Path dir : fsConfig.getDirectories()) { + fs.createNewFile(new Path(dir, System.nanoTime() + ".txt")); //this file does not match the regexp - fs.createNewFile(new Path(dir, String.valueOf(System.nanoTime()) + ".invalid")); - } - //we wait till FS has registered the files - Thread.sleep(500); + fs.createNewFile(new Path(dir, System.nanoTime() + ".invalid")); - Iterator it = policy.execute(); + //we wait till FS has registered the files + Thread.sleep(3000); + } + Iterator it = fsConfig.getPolicy().execute(); assertTrue(it.hasNext()); it.next(); assertTrue(it.hasNext()); @@ -111,19 +142,21 @@ public void oneFilePerFs() throws IOException, InterruptedException { assertFalse(it.hasNext()); } - @Test - public void recursiveDirectory() throws IOException, InterruptedException { - for (Path dir : directories) { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void recursiveDirectory(PolicyFsTestConfig fsConfig) throws IOException, InterruptedException { + FileSystem fs = fsConfig.getFs(); + for (Path dir : fsConfig.getDirectories()) { Path tmpDir = new Path(dir, String.valueOf(System.nanoTime())); fs.mkdirs(tmpDir); - fs.createNewFile(new Path(tmpDir, String.valueOf(System.nanoTime() + ".txt"))); + fs.createNewFile(new Path(tmpDir, System.nanoTime() + ".txt")); //this file does not match the regexp - fs.createNewFile(new Path(tmpDir, String.valueOf(System.nanoTime()) + ".invalid")); - } - //we wait till FS has registered the files - Thread.sleep(500); + fs.createNewFile(new Path(tmpDir, System.nanoTime() + ".invalid")); - Iterator it = policy.execute(); + //we wait till FS has registered the files + Thread.sleep(3000); + } + Iterator it = fsConfig.getPolicy().execute(); assertTrue(it.hasNext()); it.next(); assertTrue(it.hasNext()); @@ -131,29 +164,26 @@ public void recursiveDirectory() throws IOException, 
InterruptedException { assertFalse(it.hasNext()); } - @Test - public void hasEnded() throws IOException { - policy.execute(); - assertTrue(policy.hasEnded()); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void execPolicyAlreadyEnded(PolicyFsTestConfig fsConfig) throws IOException { + fsConfig.getPolicy().execute(); + assertTrue(fsConfig.getPolicy().hasEnded()); + assertThrows(IllegalWorkerStateException.class, () -> fsConfig.getPolicy().execute()); } - @Test(expected = IllegalWorkerStateException.class) - public void execPolicyAlreadyEnded() throws IOException { - policy.execute(); - assertTrue(policy.hasEnded()); - policy.execute(); - } - - @Test - public void dynamicURIs() throws Throwable { - Path dynamic = new Path(fsUri.toString(), "${G}/${yyyy}/${MM}/${W}"); - fs.create(dynamic); - Map originals = taskConfig.originalsStrings(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void dynamicURIs(PolicyFsTestConfig fsConfig) throws IOException { + Path dynamic = new Path(fsConfig.getFsUri().toString(), "${G}/${yyyy}/${MM}/${W}"); + fsConfig.getFs().create(dynamic); + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(FsSourceTaskConfig.FS_URIS, dynamic.toString()); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - policy = ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - cfg); - assertEquals(1, policy.getURIs().size()); + Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + fsConfig.setPolicy(policy); + assertEquals(1, fsConfig.getPolicy().getURIs().size()); LocalDateTime dateTime = LocalDateTime.now(); DateTimeFormatter formatter = DateTimeFormatter.ofPattern("G"); @@ -167,19 +197,30 @@ public void dynamicURIs() throws Throwable { uri.append("/"); formatter = DateTimeFormatter.ofPattern("W"); uri.append(dateTime.format(formatter)); - assertTrue(policy.getURIs().get(0).endsWith(uri.toString())); - + assertTrue(fsConfig.getPolicy().getURIs().get(0).endsWith(uri.toString())); } - @Test(expected = IllegalArgumentException.class) - public void invalidDynamicURIs() throws Throwable { - Path dynamic = new Path(fsUri.toString(), "${yyyy}/${MM}/${mmmmmmm}"); - fs.create(dynamic); - Map originals = taskConfig.originalsStrings(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidDynamicURIs(PolicyFsTestConfig fsConfig) throws IOException { + Path dynamic = new Path(fsConfig.getFsUri().toString(), "${yyyy}/${MM}/${mmmmmmm}"); + fsConfig.getFs().create(dynamic); + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(FsSourceTaskConfig.FS_URIS, dynamic.toString()); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - policy = ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - cfg); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(IllegalArgumentException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); } + protected abstract FsSourceTaskConfig buildSourceTaskConfig(List directories); + } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java 
b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java new file mode 100644 index 0000000..279a775 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java @@ -0,0 +1,29 @@ +package com.github.mmolimar.kafka.connect.fs.policy; + +import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; +import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; +import org.apache.hadoop.fs.Path; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class SimplePolicyTest extends PolicyTestBase { + + @Override + protected FsSourceTaskConfig buildSourceTaskConfig(List directories) { + Map cfg = new HashMap() {{ + String[] uris = directories.stream().map(Path::toString) + .toArray(String[]::new); + put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); + put(FsSourceTaskConfig.TOPIC, "topic_test"); + put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName()); + put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); + put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test/"); + }}; + return new FsSourceTaskConfig(cfg); + } + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java new file mode 100644 index 0000000..65c41c7 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java @@ -0,0 +1,133 @@ +package com.github.mmolimar.kafka.connect.fs.policy; + +import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; +import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; +import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.connect.errors.ConnectException; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.*; + +public class SleepyPolicyTest extends PolicyTestBase { + + @Override + protected FsSourceTaskConfig buildSourceTaskConfig(List directories) { + Map cfg = new HashMap() {{ + String[] uris = directories.stream().map(Path::toString) + .toArray(String[]::new); + put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); + put(FsSourceTaskConfig.TOPIC, "topic_test"); + put(FsSourceTaskConfig.POLICY_CLASS, SleepyPolicy.class.getName()); + put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); + put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); + put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "100"); + put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "1"); + }}; + return new FsSourceTaskConfig(cfg); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidSleepTime(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); + originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "invalid"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + 
assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConfigException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidMaxExecs(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); + originals.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "invalid"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConfigException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidSleepFraction(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); + originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_FRACTION, "invalid"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConfigException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void sleepExecution(PolicyFsTestConfig fsConfig) throws IOException { + Map tConfig = fsConfig.getSourceTaskConfig().originalsStrings(); + tConfig.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "1000"); + tConfig.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "2"); + FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); + + Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); + assertFalse(policy.hasEnded()); + policy.execute(); + assertFalse(policy.hasEnded()); + policy.execute(); + assertTrue(policy.hasEnded()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void defaultExecutions(PolicyFsTestConfig fsConfig) throws IOException { + Map tConfig = fsConfig.getSourceTaskConfig().originalsStrings(); + tConfig.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "1"); + tConfig.remove(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS); + FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); + + Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); + + //it never ends + for (int i = 0; i < 100; i++) { + assertFalse(policy.hasEnded()); + policy.execute(); + } + policy.interrupt(); + assertTrue(policy.hasEnded()); + } + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java deleted file mode 100644 index d3e0d9a..0000000 --- 
a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java +++ /dev/null @@ -1,74 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.policy.hdfs; - -import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import com.github.mmolimar.kafka.connect.fs.policy.HdfsFileWatcherPolicy; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.errors.IllegalWorkerStateException; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; - -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -public class HdfsFileWatcherPolicyTest extends HdfsPolicyTestBase { - - @BeforeClass - public static void setUp() throws IOException { - directories = new ArrayList() {{ - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - }}; - for (Path dir : directories) { - fs.mkdirs(dir); - } - - Map cfg = new HashMap() {{ - String uris[] = directories.stream().map(dir -> dir.toString()) - .toArray(size -> new String[size]); - put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); - put(FsSourceTaskConfig.TOPIC, "topic_test"); - put(FsSourceTaskConfig.POLICY_CLASS, HdfsFileWatcherPolicy.class.getName()); - put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); - put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "test"); - }}; - taskConfig = new FsSourceTaskConfig(cfg); - } - - //This policy does not throw any exception. Just stop watching those nonexistent dirs - @Test - @Override - public void invalidDirectory() throws IOException { - super.invalidDirectory(); - } - - //This policy never ends at least all watchers die - @Test - @Override - public void hasEnded() throws IOException { - policy.execute(); - assertFalse(policy.hasEnded()); - policy.interrupt(); - assertTrue(policy.hasEnded()); - } - - //This policy never ends. 
We have to interrupt it - @Test(expected = IllegalWorkerStateException.class) - @Override - public void execPolicyAlreadyEnded() throws IOException { - policy.execute(); - assertFalse(policy.hasEnded()); - policy.interrupt(); - assertTrue(policy.hasEnded()); - policy.execute(); - } - -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsPolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsPolicyTestBase.java deleted file mode 100644 index 3cbe9a9..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsPolicyTestBase.java +++ /dev/null @@ -1,35 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.policy.hdfs; - -import com.github.mmolimar.kafka.connect.fs.policy.PolicyTestBase; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.junit.AfterClass; -import org.junit.BeforeClass; - -import java.io.IOException; -import java.net.URI; -import java.nio.file.Files; -import java.nio.file.Path; - -public abstract class HdfsPolicyTestBase extends PolicyTestBase { - - private static MiniDFSCluster cluster; - private static Configuration clusterConfig; - private static Path hdfsDir; - - @BeforeClass - public static void initFs() throws IOException { - clusterConfig = new Configuration(); - hdfsDir = Files.createTempDirectory("test-"); - clusterConfig.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsDir.toAbsolutePath().toString()); - cluster = new MiniDFSCluster.Builder(clusterConfig).build(); - fsUri = URI.create("hdfs://localhost:" + cluster.getNameNodePort() + "/"); - fs = FileSystem.newInstance(fsUri, new Configuration()); - } - - @AfterClass - public static void finishFs() throws Exception { - cluster.shutdown(true); - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java deleted file mode 100644 index 06f1db7..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java +++ /dev/null @@ -1,40 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.policy.hdfs; - -import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy; -import org.apache.hadoop.fs.Path; -import org.junit.BeforeClass; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; - -public class SimplePolicyTest extends HdfsPolicyTestBase { - - @BeforeClass - public static void setUp() throws IOException { - directories = new ArrayList() {{ - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - }}; - for (Path dir : directories) { - fs.mkdirs(dir); - } - - Map cfg = new HashMap() {{ - String uris[] = directories.stream().map(dir -> dir.toString()) - .toArray(size -> new String[size]); - put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); - put(FsSourceTaskConfig.TOPIC, "topic_test"); - put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName()); - put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); - put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); - 
put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "test"); - }}; - taskConfig = new FsSourceTaskConfig(cfg); - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java deleted file mode 100644 index edd5533..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java +++ /dev/null @@ -1,109 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.policy.hdfs; - -import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import com.github.mmolimar.kafka.connect.fs.policy.Policy; -import com.github.mmolimar.kafka.connect.fs.policy.SleepyPolicy; -import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.common.config.ConfigException; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; - -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -public class SleepyPolicyTest extends HdfsPolicyTestBase { - - @BeforeClass - public static void setUp() throws IOException { - directories = new ArrayList() {{ - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - }}; - for (Path dir : directories) { - fs.mkdirs(dir); - } - - Map cfg = new HashMap() {{ - String uris[] = directories.stream().map(dir -> dir.toString()) - .toArray(size -> new String[size]); - put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); - put(FsSourceTaskConfig.TOPIC, "topic_test"); - put(FsSourceTaskConfig.POLICY_CLASS, SleepyPolicy.class.getName()); - put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); - put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "test"); - put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "100"); - put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "1"); - }}; - taskConfig = new FsSourceTaskConfig(cfg); - } - - @Test(expected = ConfigException.class) - public void invalidSleepTime() throws Throwable { - Map originals = taskConfig.originalsStrings(); - originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "invalid"); - FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); - } - - @Test(expected = ConfigException.class) - public void invalidMaxExecs() throws Throwable { - Map originals = taskConfig.originalsStrings(); - originals.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "invalid"); - FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); - } - - @Test(expected = ConfigException.class) - public void invalidSleepFraction() throws Throwable { - Map originals = taskConfig.originalsStrings(); - originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_FRACTION, "invalid"); - FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); - } - - @Test - public void sleepExecution() throws Throwable { - Map tConfig = 
taskConfig.originalsStrings(); - tConfig.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "1000"); - tConfig.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "2"); - FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - - policy = ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - sleepConfig); - assertFalse(policy.hasEnded()); - policy.execute(); - assertFalse(policy.hasEnded()); - policy.execute(); - assertTrue(policy.hasEnded()); - } - - @Test - public void defaultExecutions() throws Throwable { - Map tConfig = taskConfig.originalsStrings(); - tConfig.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "1"); - tConfig.remove(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS); - FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - - policy = ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - sleepConfig); - - //it never ends - for (int i = 0; i < 100; i++) { - assertFalse(policy.hasEnded()); - policy.execute(); - } - policy.interrupt(); - assertTrue(policy.hasEnded()); - } - -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/LocalPolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/LocalPolicyTestBase.java deleted file mode 100644 index 6aa4cd5..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/LocalPolicyTestBase.java +++ /dev/null @@ -1,29 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.policy.local; - -import com.github.mmolimar.kafka.connect.fs.policy.PolicyTestBase; -import org.apache.commons.io.FileUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.junit.AfterClass; -import org.junit.BeforeClass; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -public abstract class LocalPolicyTestBase extends PolicyTestBase { - - private static Path localDir; - - @BeforeClass - public static void initFs() throws IOException { - localDir = Files.createTempDirectory("test-"); - fsUri = localDir.toUri(); - fs = FileSystem.newInstance(fsUri, new Configuration()); - } - - @AfterClass - public static void finishFs() throws IOException { - FileUtils.deleteDirectory(localDir.toFile()); - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SimplePolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SimplePolicyTest.java deleted file mode 100644 index 214849b..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SimplePolicyTest.java +++ /dev/null @@ -1,40 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.policy.local; - -import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy; -import org.apache.hadoop.fs.Path; -import org.junit.BeforeClass; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; - -public class SimplePolicyTest extends LocalPolicyTestBase { - - @BeforeClass - public static void setUp() throws IOException { - directories = new ArrayList() {{ - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - }}; - for (Path dir : directories) { - fs.mkdirs(dir); - } - - Map cfg = new HashMap() {{ - String uris[] = directories.stream().map(dir -> dir.toString()) - 
.toArray(size -> new String[size]); - put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); - put(FsSourceTaskConfig.TOPIC, "topic_test"); - put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName()); - put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); - put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "test"); - }}; - taskConfig = new FsSourceTaskConfig(cfg); - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SleepyPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SleepyPolicyTest.java deleted file mode 100644 index 2f907ae..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SleepyPolicyTest.java +++ /dev/null @@ -1,108 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.policy.local; - -import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import com.github.mmolimar.kafka.connect.fs.policy.Policy; -import com.github.mmolimar.kafka.connect.fs.policy.SleepyPolicy; -import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.common.config.ConfigException; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; - -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -public class SleepyPolicyTest extends LocalPolicyTestBase { - - @BeforeClass - public static void setUp() throws IOException { - directories = new ArrayList() {{ - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - }}; - for (Path dir : directories) { - fs.mkdirs(dir); - } - - Map cfg = new HashMap() {{ - String uris[] = directories.stream().map(dir -> dir.toString()) - .toArray(size -> new String[size]); - put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); - put(FsSourceTaskConfig.TOPIC, "topic_test"); - put(FsSourceTaskConfig.POLICY_CLASS, SleepyPolicy.class.getName()); - put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); - put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "test"); - put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "100"); - put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "1"); - }}; - taskConfig = new FsSourceTaskConfig(cfg); - } - - @Test(expected = ConfigException.class) - public void invalidSleepTime() throws Throwable { - Map originals = taskConfig.originalsStrings(); - originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "invalid"); - FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); - } - - @Test(expected = ConfigException.class) - public void invalidMaxExecs() throws Throwable { - Map originals = taskConfig.originalsStrings(); - originals.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "invalid"); - FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); - } - - @Test(expected = 
ConfigException.class) - public void invalidSleepFraction() throws Throwable { - Map originals = taskConfig.originalsStrings(); - originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_FRACTION, "invalid"); - FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); - } - - @Test - public void sleepExecution() throws Throwable { - Map tConfig = taskConfig.originalsStrings(); - tConfig.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "1000"); - tConfig.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "2"); - FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - - policy = ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - sleepConfig); - assertFalse(policy.hasEnded()); - policy.execute(); - assertFalse(policy.hasEnded()); - policy.execute(); - assertTrue(policy.hasEnded()); - } - - @Test - public void defaultExecutions() throws Throwable { - Map tConfig = taskConfig.originalsStrings(); - tConfig.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "1"); - tConfig.remove(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS); - FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - - policy = ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - sleepConfig); - - //it never ends - for (int i = 0; i < 100; i++) { - assertFalse(policy.hasEnded()); - policy.execute(); - } - policy.interrupt(); - assertTrue(policy.hasEnded()); - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskConfigTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskConfigTest.java index 6b0e619..5506baf 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskConfigTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskConfigTest.java @@ -3,10 +3,10 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceConnectorConfig; import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import org.apache.kafka.common.config.ConfigDef; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class FsSourceTaskConfigTest { @@ -14,9 +14,9 @@ public class FsSourceTaskConfigTest { public void checkDocumentation() { ConfigDef config = FsSourceTaskConfig.conf(); config.names().forEach(key -> { - assertFalse("Property " + key + " should be documented", - config.configKeys().get(key).documentation == null || - "".equals(config.configKeys().get(key).documentation.trim())); + assertFalse(config.configKeys().get(key).documentation == null || + "".equals(config.configKeys().get(key).documentation.trim()), + () -> "Property " + key + " should be documented"); }); } @@ -24,4 +24,4 @@ public void checkDocumentation() { public void toRst() { assertNotNull(FsSourceConnectorConfig.conf().toRst()); } -} \ No newline at end of file +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java index f0fbacc..b4b5a4e 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java @@ -2,99 +2,304 @@ import 
 import com.github.mmolimar.kafka.connect.fs.FsSourceTask;
 import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig;
+import com.github.mmolimar.kafka.connect.fs.file.reader.AvroFileReader;
 import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader;
+import com.github.mmolimar.kafka.connect.fs.policy.Policy;
 import com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.kafka.connect.data.Struct;
 import org.apache.kafka.connect.errors.ConnectException;
-import org.junit.Before;
-import org.junit.ClassRule;
-import org.junit.Test;
-import org.junit.rules.TemporaryFolder;
+import org.apache.kafka.connect.source.SourceRecord;
+import org.apache.kafka.connect.source.SourceTaskContext;
+import org.apache.kafka.connect.storage.OffsetStorageReader;
+import org.easymock.EasyMock;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+import org.powermock.api.easymock.PowerMock;
+import org.powermock.api.support.membermodification.MemberModifier;
 import java.io.File;
+import java.io.FileWriter;
 import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
+import java.io.PrintWriter;
+import java.util.*;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
-import static org.junit.Assert.*;
+import static org.junit.jupiter.api.Assertions.*;
 public class FsSourceTaskTest {
-    @ClassRule
-    public static final TemporaryFolder temporaryFolder = new TemporaryFolder();
-    private FsSourceTask task;
-    private Map taskConfig;
+    private static final List TEST_FILE_SYSTEMS = Arrays.asList(
+            new LocalFsConfig(),
+            new HdfsFsConfig()
+    );
+    private static final int NUM_RECORDS = 10;
-    @Before
-    public void setup() throws IOException {
-        task = new FsSourceTask();
+    @BeforeAll
+    public static void initFs() throws IOException {
+        for (TaskFsTestConfig fsConfig : TEST_FILE_SYSTEMS) {
+            fsConfig.initFs();
+        }
+    }
+
+    @AfterAll
+    public static void finishFs() throws IOException {
+        for (TaskFsTestConfig fsConfig : TEST_FILE_SYSTEMS) {
+            fsConfig.close();
+        }
+    }
+
+    @BeforeEach
+    public void initTask() {
+        for (TaskFsTestConfig fsConfig : TEST_FILE_SYSTEMS) {
+            Map taskConfig = new HashMap() {{
+                String[] uris = fsConfig.getDirectories().stream().map(Path::toString)
+                        .toArray(String[]::new);
+                put(FsSourceTaskConfig.FS_URIS, String.join(",", uris));
+                put(FsSourceTaskConfig.TOPIC, "topic_test");
+                put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName());
+                put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName());
+                put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$");
+            }};
+
+            //Mock initialization
+            SourceTaskContext taskContext = PowerMock.createMock(SourceTaskContext.class);
+            OffsetStorageReader offsetStorageReader = PowerMock.createMock(OffsetStorageReader.class);
+
+            EasyMock.expect(taskContext.offsetStorageReader())
+                    .andReturn(offsetStorageReader);
+
+            EasyMock.expect(taskContext.offsetStorageReader())
+                    .andReturn(offsetStorageReader);
+
+            EasyMock.expect(offsetStorageReader.offset(EasyMock.anyObject()))
+                    .andReturn(new HashMap() {{
+                        put("offset", (long) (NUM_RECORDS / 2));
+                    }});
+            EasyMock.expect(offsetStorageReader.offset(EasyMock.anyObject()))
+                    .andReturn(new HashMap() {{
+                        put("offset", (long) (NUM_RECORDS / 2));
+                    }});
+
+            EasyMock.checkOrder(taskContext, false);
+            EasyMock.replay(taskContext);
+
+            EasyMock.checkOrder(offsetStorageReader, false);
+            EasyMock.replay(offsetStorageReader);
-        taskConfig = new HashMap() {{
-            put(FsSourceTaskConfig.FS_URIS, String.join(",",
-                    temporaryFolder.getRoot().toURI() + File.separator + "dir1",
-                    temporaryFolder.getRoot().toURI() + File.separator + "dir2",
-                    temporaryFolder.getRoot().toURI() + File.separator + "dir3"));
-            put(FsSourceTaskConfig.TOPIC, "topic_test");
-            put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName());
-            put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName());
-        }};
+            FsSourceTask task = new FsSourceTask();
+            task.initialize(taskContext);
+
+            fsConfig.setTaskConfig(taskConfig);
+            fsConfig.setTask(task);
+        }
+    }
+
+    @AfterEach
+    public void cleanDirsAndStop() throws IOException {
+        for (TaskFsTestConfig fsConfig : TEST_FILE_SYSTEMS) {
+            for (Path dir : fsConfig.getDirectories()) {
+                fsConfig.getFs().delete(dir, true);
+                fsConfig.getFs().mkdirs(dir);
+            }
+            fsConfig.getTask().stop();
+        }
+    }
+
+    private static Stream fileSystemConfigProvider() {
+        return TEST_FILE_SYSTEMS.stream().map(Arguments::of);
     }
-    @Test(expected = ConnectException.class)
-    public void nullProperties() {
-        task.start(null);
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void pollNoData(TaskFsTestConfig fsConfig) {
+        fsConfig.getTask().start(fsConfig.getTaskConfig());
+        assertEquals(0, fsConfig.getTask().poll().size());
+        //policy has ended
+        assertNull(fsConfig.getTask().poll());
+    }
+
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void emptyFilesToProcess(TaskFsTestConfig fsConfig) throws IOException {
+        for (Path dir : fsConfig.getDirectories()) {
+            fsConfig.getFs().createNewFile(new Path(dir, System.nanoTime() + ".txt"));
+            //this file does not match the regexp
+            fsConfig.getFs().createNewFile(new Path(dir, String.valueOf(System.nanoTime())));
+        }
+        fsConfig.getTask().start(fsConfig.getTaskConfig());
+        assertEquals(0, fsConfig.getTask().poll().size());
+        //policy has ended
+        assertNull(fsConfig.getTask().poll());
+    }
+
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void oneFilePerFs(TaskFsTestConfig fsConfig) throws IOException {
+        for (Path dir : fsConfig.getDirectories()) {
+            Path dataFile = new Path(dir, System.nanoTime() + ".txt");
+            createDataFile(fsConfig.getFs(), dataFile);
+            //this file does not match the regexp
+            fsConfig.getFs().createNewFile(new Path(dir, String.valueOf(System.nanoTime())));
+        }
+
+        fsConfig.getTask().start(fsConfig.getTaskConfig());
+        List records = fsConfig.getTask().poll();
+        assertEquals((NUM_RECORDS * fsConfig.getDirectories().size()) / 2, records.size());
+        checkRecords(records);
+        //policy has ended
+        assertNull(fsConfig.getTask().poll());
     }
-    @Test(expected = ConnectException.class)
-    public void expectedFsUris() {
-        Map testProps = new HashMap<>(taskConfig);
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void nonExistentUri(TaskFsTestConfig fsConfig) {
+        Map props = new HashMap<>(fsConfig.getTaskConfig());
+        props.put(FsSourceTaskConfig.FS_URIS,
+                new Path(fsConfig.getFs().getWorkingDirectory(), UUID.randomUUID().toString()).toString());
+        fsConfig.getTask().start(props);
+        fsConfig.getTask().poll();
+    }
+
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void exceptionExecutingPolicy(TaskFsTestConfig fsConfig) throws IOException, IllegalAccessException {
+        Map props = new HashMap<>(fsConfig.getTaskConfig());
+        fsConfig.getTask().start(props);
+
+        Policy policy = EasyMock.createNiceMock(Policy.class);
+        EasyMock.expect(policy.hasEnded()).andReturn(Boolean.FALSE);
+        EasyMock.expect(policy.execute()).andThrow(new ConnectException("Exception from mock"));
+        EasyMock.expect(policy.getURIs()).andReturn(null);
+        EasyMock.checkOrder(policy, false);
+        EasyMock.replay(policy);
+        MemberModifier.field(FsSourceTask.class, "policy").set(fsConfig.getTask(), policy);
+
+        assertEquals(0, fsConfig.getTask().poll().size());
+    }
+
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void exceptionReadingFile(TaskFsTestConfig fsConfig) throws IOException {
+        Map props = new HashMap<>(fsConfig.getTaskConfig());
+        File tmp = File.createTempFile("test-", ".txt");
+        try (PrintWriter writer = new PrintWriter(tmp)) {
+            writer.append("txt");
+        }
+        Path dest = new Path(fsConfig.getDirectories().get(0).toString(), System.nanoTime() + ".txt");
+        fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), dest);
+        props.put(FsSourceTaskConfig.FILE_READER_CLASS, AvroFileReader.class.getName());
+        fsConfig.getTask().start(props);
+        assertEquals(0, fsConfig.getTask().poll().size());
+        fsConfig.getTask().stop();
+
+        fsConfig.getFs().delete(dest, false);
+    }
+
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void nullProperties(TaskFsTestConfig fsConfig) {
+        assertThrows(ConnectException.class, () -> fsConfig.getTask().start(null));
+    }
+
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void expectedFsUris(TaskFsTestConfig fsConfig) {
+        Map testProps = new HashMap<>(fsConfig.getTaskConfig());
         testProps.remove(FsSourceTaskConfig.FS_URIS);
-        task.start(testProps);
+        assertThrows(ConnectException.class, () -> fsConfig.getTask().start(testProps));
     }
-    @Test(expected = ConnectException.class)
-    public void expectedPolicyClass() {
-        Map testProps = new HashMap<>(taskConfig);
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void expectedPolicyClass(TaskFsTestConfig fsConfig) {
+        Map testProps = new HashMap<>(fsConfig.getTaskConfig());
         testProps.remove(FsSourceTaskConfig.POLICY_CLASS);
-        task.start(testProps);
+        assertThrows(ConnectException.class, () -> fsConfig.getTask().start(testProps));
     }
-    @Test(expected = ConnectException.class)
-    public void invalidPolicyClass() {
-        Map testProps = new HashMap<>(taskConfig);
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void invalidPolicyClass(TaskFsTestConfig fsConfig) {
+        Map testProps = new HashMap<>(fsConfig.getTaskConfig());
         testProps.put(FsSourceTaskConfig.POLICY_CLASS, Object.class.getName());
-        task.start(testProps);
+        assertThrows(ConnectException.class, () -> fsConfig.getTask().start(testProps));
     }
-    @Test(expected = ConnectException.class)
-    public void expectedReaderClass() {
-        Map testProps = new HashMap<>(taskConfig);
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void expectedReaderClass(TaskFsTestConfig fsConfig) {
+        Map testProps = new HashMap<>(fsConfig.getTaskConfig());
         testProps.remove(FsSourceTaskConfig.FILE_READER_CLASS);
-        task.start(testProps);
+        assertThrows(ConnectException.class, () -> fsConfig.getTask().start(testProps));
     }
-    @Test(expected = ConnectException.class)
-    public void invalidReaderClass() {
-        Map testProps = new HashMap<>(taskConfig);
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void invalidReaderClass(TaskFsTestConfig fsConfig) {
+        Map testProps = new HashMap<>(fsConfig.getTaskConfig());
         testProps.put(FsSourceTaskConfig.FILE_READER_CLASS, Object.class.getName());
-        task.start(testProps);
+        assertThrows(ConnectException.class, () -> fsConfig.getTask().start(testProps));
+    }
+
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void minimumConfig(TaskFsTestConfig fsConfig) {
+        fsConfig.getTask().start(fsConfig.getTaskConfig());
+        fsConfig.getTask().stop();
     }
-    @Test
-    public void minimunConfig() {
-        task.start(taskConfig);
-        task.stop();
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void pollWithoutStart(TaskFsTestConfig fsConfig) {
+        assertNull(fsConfig.getTask().poll());
+        fsConfig.getTask().stop();
     }
-    @Test
-    public void pollWithoutStart() throws InterruptedException {
-        assertNull(task.poll());
-        task.stop();
+    @ParameterizedTest
+    @MethodSource("fileSystemConfigProvider")
+    public void checkVersion(TaskFsTestConfig fsConfig) {
+        assertNotNull(fsConfig.getTask().version());
+        assertFalse("unknown".equalsIgnoreCase(fsConfig.getTask().version()));
     }
-    @Test
-    public void checkVersion() {
-        assertNotNull(task.version());
-        assertFalse("unknown".equalsIgnoreCase(task.version()));
+    protected void checkRecords(List records) {
+        records.forEach(record -> {
+            assertEquals("topic_test", record.topic());
+            assertNotNull(record.sourcePartition());
+            assertNotNull(record.sourceOffset());
+            assertNotNull(record.value());
+
+            assertNotNull(((Struct) record.value()).get(TextFileReader.FIELD_NAME_VALUE_DEFAULT));
+        });
+    }
+
+    protected void createDataFile(FileSystem fs, Path path) throws IOException {
+        File file = fillDataFile();
+        fs.moveFromLocalFile(new Path(file.getAbsolutePath()), path);
+    }
+
+    private File fillDataFile() throws IOException {
+        File txtFile = File.createTempFile("test-", ".txt");
+        try (FileWriter writer = new FileWriter(txtFile)) {
+
+            IntStream.range(0, NUM_RECORDS).forEach(index -> {
+                String value = String.format("%d_%s", index, UUID.randomUUID());
+                try {
+                    writer.append(value + "\n");
+                } catch (IOException ioe) {
+                    throw new RuntimeException(ioe);
+                }
+            });
+        }
+        return txtFile;
     }
-}
\ No newline at end of file
+}
diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTestBase.java
deleted file mode 100644
index 192b756..0000000
--- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTestBase.java
+++ /dev/null
@@ -1,187 +0,0 @@
-package com.github.mmolimar.kafka.connect.fs.task;
-
-import com.github.mmolimar.kafka.connect.fs.FsSourceTask;
-import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig;
-import com.github.mmolimar.kafka.connect.fs.file.reader.AvroFileReader;
-import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader;
-import com.github.mmolimar.kafka.connect.fs.policy.Policy;
-import com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.kafka.connect.errors.ConnectException;
-import org.apache.kafka.connect.source.SourceRecord;
-import org.apache.kafka.connect.source.SourceTaskContext;
-import org.apache.kafka.connect.storage.OffsetStorageReader;
-import org.easymock.EasyMock;
-import org.junit.After;
-import org.junit.AfterClass;
-import org.junit.Before;
-import org.junit.Test;
-import org.powermock.api.easymock.PowerMock;
-import org.powermock.api.support.membermodification.MemberModifier;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.net.URI;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.UUID;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNull;
-
-public abstract class FsSourceTaskTestBase {
-
-    protected static final int NUM_RECORDS = 10;
-
-    protected static FileSystem fs;
-    protected static List directories;
-    protected static URI fsUri;
-
-    protected FsSourceTask task;
-    protected Map taskConfig;
-    protected SourceTaskContext taskContext;
-    protected OffsetStorageReader offsetStorageReader;
-
-    @AfterClass
-    public static void tearDown() throws Exception {
-        fs.close();
-    }
-
-    @Before
-    public void initTask() {
-        task = new FsSourceTask();
-        taskConfig = new HashMap() {{
-            String uris[] = directories.stream().map(dir -> dir.toString())
-                    .toArray(size -> new String[size]);
-            put(FsSourceTaskConfig.FS_URIS, String.join(",", uris));
-            put(FsSourceTaskConfig.TOPIC, "topic_test");
-            put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName());
-            put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName());
-            put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$");
-        }};
-
-        //Mock initialization
-        taskContext = PowerMock.createMock(SourceTaskContext.class);
-        offsetStorageReader = PowerMock.createMock(OffsetStorageReader.class);
-
-        EasyMock.expect(taskContext.offsetStorageReader())
-                .andReturn(offsetStorageReader);
-
-        EasyMock.expect(taskContext.offsetStorageReader())
-                .andReturn(offsetStorageReader);
-
-        EasyMock.expect(offsetStorageReader.offset(EasyMock.anyObject()))
-                .andReturn(new HashMap() {{
-                    put("offset", 5L);
-                }});
-        EasyMock.expect(offsetStorageReader.offset(EasyMock.anyObject()))
-                .andReturn(new HashMap() {{
-                    put("offset", 5L);
-                }});
-
-        EasyMock.checkOrder(taskContext, false);
-        EasyMock.replay(taskContext);
-
-        EasyMock.checkOrder(offsetStorageReader, false);
-        EasyMock.replay(offsetStorageReader);
-
-        task.initialize(taskContext);
-
-    }
-
-    @After
-    public void cleanDirsAndStop() throws IOException {
-        for (Path dir : directories) {
-            fs.delete(dir, true);
-            fs.mkdirs(dir);
-        }
-        task.stop();
-    }
-
-    @Test
-    public void pollNoData() throws InterruptedException {
-        task.start(taskConfig);
-        assertEquals(0, task.poll().size());
-        //policy has ended
-        assertNull(task.poll());
-    }
-
-    @Test
-    public void emptyFilesToProcess() throws IOException, InterruptedException {
-        for (Path dir : directories) {
-            fs.createNewFile(new Path(dir, String.valueOf(System.nanoTime() + ".txt")));
-            //this file does not match the regexp
-            fs.createNewFile(new Path(dir, String.valueOf(System.nanoTime())));
-        }
-        task.start(taskConfig);
-        assertEquals(0, task.poll().size());
-        //policy has ended
-        assertNull(task.poll());
-    }
-
-    @Test
-    public void oneFilePerFs() throws IOException, InterruptedException {
-        for (Path dir : directories) {
-            Path dataFile = new Path(dir, String.valueOf(System.nanoTime() + ".txt"));
-            createDataFile(dataFile);
-            //this file does not match the regexp
-            fs.createNewFile(new Path(dir, String.valueOf(System.nanoTime())));
-        }
-
-        task.start(taskConfig);
-        List records = task.poll();
-        assertEquals(10, records.size());
-        checkRecords(records);
-        //policy has ended
-        assertNull(task.poll());
-    }
-
-    @Test
-    public void nonExistentUri() throws InterruptedException {
-        Map props = new HashMap<>(taskConfig);
-        props.put(FsSourceTaskConfig.FS_URIS, new Path(fs.getWorkingDirectory(), UUID.randomUUID().toString()).toString());
-        task.start(props);
-        task.poll();
-    }
-
-    @Test
-    public void exceptionExecutingPolicy() throws InterruptedException, IOException, IllegalAccessException {
-        Map props = new HashMap<>(taskConfig);
-        task.start(props);
-
-        Policy policy = EasyMock.createNiceMock(Policy.class);
-        EasyMock.expect(policy.hasEnded()).andReturn(Boolean.FALSE);
-        EasyMock.expect(policy.execute()).andThrow(new ConnectException("Exception from mock"));
-        EasyMock.expect(policy.getURIs()).andReturn(null);
-        EasyMock.checkOrder(policy, false);
-        EasyMock.replay(policy);
-        MemberModifier.field(FsSourceTask.class, "policy").set(task, policy);
-
-        assertEquals(0, task.poll().size());
-    }
-
-    @Test
-    public void exceptionReadingFile() throws InterruptedException, IOException {
-        Map props = new HashMap<>(taskConfig);
-        File tmp = File.createTempFile("test-", ".txt");
-        try (PrintWriter writer = new PrintWriter(tmp)) {
-            writer.append("txt");
-        }
-        Path dest = new Path(directories.get(0).toString(), System.nanoTime() + ".txt");
-        fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), dest);
-        props.put(FsSourceTaskConfig.FILE_READER_CLASS, AvroFileReader.class.getName());
-        task.start(props);
-        assertEquals(0, task.poll().size());
-        task.stop();
-
-        fs.delete(dest, false);
-    }
-
-    protected abstract void checkRecords(List records);
-
-    protected abstract void createDataFile(Path path) throws IOException;
-
-}
\ No newline at end of file
diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/TaskFsTestConfig.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/TaskFsTestConfig.java
new file mode 100644
index 0000000..1efe3b4
--- /dev/null
+++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/TaskFsTestConfig.java
@@ -0,0 +1,113 @@
+package com.github.mmolimar.kafka.connect.fs.task;
+
+import com.github.mmolimar.kafka.connect.fs.AbstractHdfsFsConfig;
+import com.github.mmolimar.kafka.connect.fs.AbstractLocalFsConfig;
+import com.github.mmolimar.kafka.connect.fs.FsSourceTask;
+import com.github.mmolimar.kafka.connect.fs.FsTestConfig;
+import org.apache.hadoop.fs.Path;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+
+interface TaskFsTestConfig extends FsTestConfig {
+
+    FsSourceTask getTask();
+
+    void setTask(FsSourceTask task);
+
+    Map getTaskConfig();
+
+    void setTaskConfig(Map taskConfig);
+
+    List getDirectories();
+
+}
+
+class LocalFsConfig extends AbstractLocalFsConfig implements TaskFsTestConfig {
+    private FsSourceTask task;
+    private Map taskConfig;
+    private List directories;
+
+    @Override
+    public void init() throws IOException {
+        directories = new ArrayList() {{
+            add(new Path(getFsUri().toString(), UUID.randomUUID().toString()));
+            add(new Path(getFsUri().toString(), UUID.randomUUID().toString()));
+        }};
+        for (Path dir : directories) {
+            getFs().mkdirs(dir);
+        }
+    }
+
+    @Override
+    public FsSourceTask getTask() {
+        return task;
+    }
+
+    @Override
+    public void setTask(FsSourceTask task) {
+        this.task = task;
+    }
+
+    @Override
+    public Map getTaskConfig() {
+        return taskConfig;
+    }
+
+    @Override
+    public void setTaskConfig(Map taskConfig) {
+        this.taskConfig = taskConfig;
+    }
+
+    @Override
+    public List getDirectories() {
+        return directories;
+    }
+
+}
+
+class HdfsFsConfig extends AbstractHdfsFsConfig implements TaskFsTestConfig {
+    private FsSourceTask task;
+    private Map taskConfig;
+    private List directories;
+
+    @Override
+    public void init() throws IOException {
+        directories = new ArrayList() {{
+            add(new Path(getFsUri().toString(), UUID.randomUUID().toString()));
+            add(new Path(getFsUri().toString(), UUID.randomUUID().toString()));
+        }};
+        for (Path dir : directories) {
+            getFs().mkdirs(dir);
+        }
+    }
+
+    @Override
+    public FsSourceTask getTask() {
+        return task;
+    }
+
+    @Override
+    public void setTask(FsSourceTask task) {
+        this.task = task;
+    }
+
+    @Override
+    public Map getTaskConfig() {
+        return taskConfig;
+    }
+
+    @Override
+    public void setTaskConfig(Map taskConfig) {
+        this.taskConfig = taskConfig;
+    }
+
+    @Override
+    public List getDirectories() {
+        return directories;
+    }
+
+}
diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTest.java
deleted file mode 100644
index 629a0f8..0000000
--- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTest.java
+++ /dev/null
@@ -1,66 +0,0 @@
-package com.github.mmolimar.kafka.connect.fs.task.hdfs;
-
-import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader;
-import org.apache.hadoop.fs.Path;
-import org.apache.kafka.connect.data.Struct;
-import org.apache.kafka.connect.source.SourceRecord;
-import org.junit.BeforeClass;
-
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.UUID;
-import java.util.stream.IntStream;
-
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-public class HdfsFsSourceTaskTest extends HdfsFsSourceTaskTestBase {
-
-    @BeforeClass
-    public static void setUp() throws IOException {
-        directories = new ArrayList() {{
-            add(new Path(fsUri.toString(), UUID.randomUUID().toString()));
-            add(new Path(fsUri.toString(), UUID.randomUUID().toString()));
-        }};
-        for (Path dir : directories) {
-            fs.mkdirs(dir);
-        }
-    }
-
-    @Override
-    protected void checkRecords(List records) {
-        records.forEach(record -> {
-            assertTrue(record.topic().equals("topic_test"));
-            assertNotNull(record.sourcePartition());
-            assertNotNull(record.sourceOffset());
-            assertNotNull(record.value());
-
-            assertNotNull(((Struct) record.value()).get(TextFileReader.FIELD_NAME_VALUE_DEFAULT));
-        });
-    }
-
-    @Override
-    protected void createDataFile(Path path) throws IOException {
-        File file = fillDataFile();
-        fs.moveFromLocalFile(new Path(file.getAbsolutePath()), path);
-    }
-
-    private File fillDataFile() throws IOException {
-        File txtFile = File.createTempFile("test-", ".txt");
-        try (FileWriter writer = new FileWriter(txtFile)) {
-
-            IntStream.range(0, NUM_RECORDS).forEach(index -> {
-                String value = String.format("%d_%s", index, UUID.randomUUID());
-                try {
-                    writer.append(value + "\n");
-                } catch (IOException ioe) {
-                    throw new RuntimeException(ioe);
-                }
-            });
-        }
-        return txtFile;
-    }
-}
diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTestBase.java
deleted file mode 100644
index fd8c3bd..0000000
--- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTestBase.java
+++ /dev/null
@@ -1,35 +0,0 @@
-package com.github.mmolimar.kafka.connect.fs.task.hdfs;
-
-import com.github.mmolimar.kafka.connect.fs.task.FsSourceTaskTestBase;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.hdfs.MiniDFSCluster;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-
-import java.io.IOException;
-import java.net.URI;
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-public abstract class HdfsFsSourceTaskTestBase extends FsSourceTaskTestBase {
-
-    private static MiniDFSCluster cluster;
-    private static Configuration clusterConfig;
-    private static Path hdfsDir;
-
-    @BeforeClass
-    public static void initFs() throws IOException {
-        clusterConfig = new Configuration();
-        hdfsDir = Files.createTempDirectory("test-");
-        clusterConfig.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsDir.toAbsolutePath().toString());
-        cluster = new MiniDFSCluster.Builder(clusterConfig).build();
-        fsUri = URI.create("hdfs://localhost:" + cluster.getNameNodePort() + "/");
-        fs = FileSystem.newInstance(fsUri, clusterConfig);
-    }
-
-    @AfterClass
-    public static void finishFs() throws Exception {
-        cluster.shutdown(true);
-    }
-}
diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTest.java
deleted file mode 100644
index bbacd9e..0000000
--- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTest.java
+++ /dev/null
@@ -1,66 +0,0 @@
-package com.github.mmolimar.kafka.connect.fs.task.local;
-
-import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader;
-import org.apache.hadoop.fs.Path;
-import org.apache.kafka.connect.data.Struct;
-import org.apache.kafka.connect.source.SourceRecord;
-import org.junit.BeforeClass;
-
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.UUID;
-import java.util.stream.IntStream;
-
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-
-public class LocalFsSourceTaskTest extends LocalFsSourceTaskTestBase {
-
-    @BeforeClass
-    public static void setUp() throws IOException {
-        directories = new ArrayList() {{
-            add(new Path(fsUri.toString(), UUID.randomUUID().toString()));
-            add(new Path(fsUri.toString(), UUID.randomUUID().toString()));
-        }};
-        for (Path dir : directories) {
-            fs.mkdirs(dir);
-        }
-    }
-
-    @Override
-    protected void checkRecords(List records) {
-        records.forEach(record -> {
-            assertTrue(record.topic().equals("topic_test"));
-            assertNotNull(record.sourcePartition());
-            assertNotNull(record.sourceOffset());
-            assertNotNull(record.value());
-
-            assertNotNull(((Struct) record.value()).get(TextFileReader.FIELD_NAME_VALUE_DEFAULT));
-        });
-    }
-
-    @Override
-    protected void createDataFile(Path path) throws IOException {
-        File file = fillDataFile();
-        fs.moveFromLocalFile(new Path(file.getAbsolutePath()), path);
-    }
-
-    private File fillDataFile() throws IOException {
-        File txtFile = File.createTempFile("test-", ".txt");
-        try (FileWriter writer = new FileWriter(txtFile)) {
-
-            IntStream.range(0, NUM_RECORDS).forEach(index -> {
-                String value = String.format("%d_%s", index, UUID.randomUUID());
-                try {
-                    writer.append(value + "\n");
-                } catch (IOException ioe) {
-                    throw new RuntimeException(ioe);
-                }
-            });
-        }
-        return txtFile;
-    }
-}
diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTestBase.java
deleted file mode 100644
index 569b623..0000000
--- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTestBase.java
+++ /dev/null
@@ -1,29 +0,0 @@
-package com.github.mmolimar.kafka.connect.fs.task.local;
-
-import com.github.mmolimar.kafka.connect.fs.task.FsSourceTaskTestBase;
-import org.apache.commons.io.FileUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.junit.AfterClass;
-import org.junit.BeforeClass;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-public abstract class LocalFsSourceTaskTestBase extends FsSourceTaskTestBase {
-
-    private static Path localDir;
-
-    @BeforeClass
-    public static void initFs() throws IOException {
-        localDir = Files.createTempDirectory("test-");
-        fsUri = localDir.toUri();
-        fs = FileSystem.newInstance(fsUri, new Configuration());
-    }
-
-    @AfterClass
-    public static void finishFs() throws IOException {
-        FileUtils.deleteDirectory(localDir.toFile());
-    }
-}
diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties
new file mode 100644
index 0000000..bb7782f
--- /dev/null
+++ b/src/test/resources/log4j.properties
@@ -0,0 +1,15 @@
+# Root logger option
+log4j.rootLogger=INFO, stdout
+
+# Direct log messages to stdout
+log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+log4j.appender.stdout.Target=System.out
+log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c:%L - %m%n
+
+log4j.logger.com.github.mmolimar.kafka.connect.fs=TRACE
+log4j.logger.org.apache.hadoop=ERROR
+log4j.logger.BlockStateChange=WARN
+log4j.logger.org.apache.parquet=WARN
+log4j.logger.org.eclipse.jetty=WARN
+log4j.logger.io.confluent.connect.avro=WARN