From e59bf9301339e4983853e544ee4759d68a968106 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Fri, 21 Feb 2020 18:48:57 -0600 Subject: [PATCH 01/51] Prepare for next version --- pom.xml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8c2c90e..88ae6e6 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ com.github.mmolimar.kafka.connect kafka-connect-fs - 0.3-SNAPSHOT + 1.0.0-SNAPSHOT jar kafka-connect-fs @@ -123,6 +123,7 @@ + org.apache.maven.plugins maven-assembly-plugin ${maven-assembly-plugin.version} From 29ce9273eb60dc3efac24e6282c2dcf3fc4eec57 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Fri, 21 Feb 2020 18:49:22 -0600 Subject: [PATCH 02/51] Upgrade dependencies --- pom.xml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pom.xml b/pom.xml index 88ae6e6..52c6b89 100644 --- a/pom.xml +++ b/pom.xml @@ -11,18 +11,18 @@ UTF-8 - 0.10.1.0 - 3.1.1 - 2.9.0 - 1.8.1 - 1.9.0 - 4.12 - 3.4 - 1.6.6 - 3.0.2 - 3.6.1 - 3.0.0 - 0.7.9 + 2.4.0 + 5.4.0 + 3.2.1 + 1.9.2 + 1.11.0 + 4.13 + 4.2 + 2.0.5 + 3.2.0 + 3.8.1 + 3.2.0 + 0.8.5 4.3.0 From 9af48436e879dd7e626c6f5e12fd972927a0d91a Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 22 Feb 2020 11:37:04 -0600 Subject: [PATCH 03/51] Compatibility with new version in deps --- .../kafka/connect/fs/FsSourceConnector.java | 5 +---- .../kafka/connect/fs/FsSourceTask.java | 19 +++++++------------ .../kafka/connect/fs/file/FileMetadata.java | 9 +++------ .../fs/file/reader/AbstractFileReader.java | 5 ++--- .../fs/file/reader/AgnosticFileReader.java | 13 +++++++------ .../fs/file/reader/AvroFileReader.java | 8 ++++++-- .../file/reader/DelimitedTextFileReader.java | 15 ++++++--------- .../fs/file/reader/ParquetFileReader.java | 11 ++++++----- .../fs/file/reader/SequenceFileReader.java | 4 ++-- .../fs/file/reader/TextFileReader.java | 2 +- .../connect/fs/policy/AbstractPolicy.java | 17 ++++++++--------- .../fs/policy/HdfsFileWatcherPolicy.java | 11 +++++------ .../kafka/connect/fs/policy/SleepyPolicy.java | 6 +++--- .../connect/fs/util/ReflectionUtils.java | 8 ++++---- .../kafka/connect/fs/util/Version.java | 2 +- 15 files changed, 62 insertions(+), 73 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java index e6aab15..0d4ad3e 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java @@ -1,7 +1,5 @@ package com.github.mmolimar.kafka.connect.fs; -import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; -import com.github.mmolimar.kafka.connect.fs.policy.Policy; import com.github.mmolimar.kafka.connect.fs.util.Version; import org.apache.kafka.common.config.ConfigDef; import org.apache.kafka.common.config.ConfigException; @@ -16,7 +14,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.stream.Collectors; public class FsSourceConnector extends SourceConnector { @@ -60,7 +57,7 @@ public List> taskConfigs(int maxTasks) { ConnectorUtils.groupPartitions(config.getFsUris(), groups) .forEach(dirs -> { Map taskProps = new HashMap<>(config.originalsStrings()); - taskProps.put(FsSourceConnectorConfig.FS_URIS, dirs.stream().collect(Collectors.joining(","))); + taskProps.put(FsSourceConnectorConfig.FS_URIS, String.join(",", dirs)); taskConfigs.add(taskProps); }); diff --git 
a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java index b7c97eb..971f6ee 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java @@ -24,7 +24,7 @@ public class FsSourceTask extends SourceTask { private static final Logger log = LoggerFactory.getLogger(FsSourceTask.class); - private AtomicBoolean stop; + private final AtomicBoolean stop = new AtomicBoolean(false); private FsSourceTaskConfig config; private Policy policy; @@ -37,7 +37,6 @@ public String version() { public void start(Map properties) { try { config = new FsSourceTaskConfig(properties); - if (config.getClass(FsSourceTaskConfig.POLICY_CLASS).isAssignableFrom(Policy.class)) { throw new ConfigException("Policy class " + config.getClass(FsSourceTaskConfig.POLICY_CLASS) + "is not a sublass of " + Policy.class); @@ -57,13 +56,11 @@ public void start(Map properties) { log.error("Couldn't start FsSourceConnector:", t); throw new ConnectException("A problem has occurred reading configuration:" + t.getMessage()); } - - stop = new AtomicBoolean(false); } @Override - public List poll() throws InterruptedException { - while (stop != null && !stop.get() && !policy.hasEnded()) { + public List poll() { + while (!stop.get() && policy != null && !policy.hasEnded()) { log.trace("Polling for new data"); final List results = new ArrayList<>(); @@ -92,8 +89,8 @@ private List filesToProcess() { .collect(Collectors.toList()); } catch (IOException | ConnectException e) { //when an exception happens executing the policy, the connector continues - log.error("Cannot retrive files to process from FS: " + policy.getURIs() + ". Keep going...", e); - return Collections.EMPTY_LIST; + log.error("Cannot retrieve files to process from FS: " + policy.getURIs() + ". 
Keep going...", e); + return Collections.emptyList(); } } @@ -120,11 +117,9 @@ private SourceRecord convert(FileMetadata metadata, Offset offset, Struct struct @Override public void stop() { - if (stop != null) { - stop.set(true); - } + stop.set(true); if (policy != null) { policy.interrupt(); } } -} \ No newline at end of file +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/FileMetadata.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/FileMetadata.java index 45902e9..669b681 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/FileMetadata.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/FileMetadata.java @@ -36,12 +36,9 @@ public boolean equals(Object object) { if (!(object instanceof FileMetadata)) return false; FileMetadata metadata = (FileMetadata) object; - if (this.path.equals(metadata.getPath()) && + return this.path.equals(metadata.getPath()) && this.length == metadata.length && - this.blocks.equals(metadata.getBlocks())) { - return true; - } - return false; + this.blocks.equals(metadata.getBlocks()); } public int hashCode() { @@ -65,4 +62,4 @@ public String toString() { return String.format("[offset = %s, length = %s, corrupt = %s]", offset, length, corrupt); } } -} \ No newline at end of file +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java index 4e1b474..1fbdddb 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java @@ -15,7 +15,7 @@ public abstract class AbstractFileReader implements FileReader { private final Path filePath; private ReaderAdapter adapter; - public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter, Map config) { + public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter, Map config) { if (fs == null || filePath == null) { throw new IllegalArgumentException("fileSystem and filePath are required"); } @@ -25,7 +25,7 @@ public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter, M Map readerConf = config.entrySet().stream() .filter(entry -> entry.getKey().startsWith(FILE_READER_PREFIX)) - .collect(Collectors.toMap(entry -> entry.getKey(), entry -> entry.getValue())); + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); configure(readerConf); } @@ -49,5 +49,4 @@ public final Struct next() { protected ReaderAdapter getAdapter() { return adapter; } - } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java index 5e025da..34f9670 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java @@ -8,6 +8,7 @@ import java.io.IOException; import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Map; @@ -33,7 +34,7 @@ public AgnosticFileReader(FileSystem fs, Path filePath, Map conf } catch (RuntimeException | IOException e) { throw e; } catch (Throwable t) { - throw new IOException("An error has ocurred when creating a concrete reader", t); + throw new IOException("An error has occurred when creating a concrete reader", t); } } @@ -62,13 +63,13 @@ 
private FileReader readerByExtension(FileSystem fs, Path filePath, Map config) { this.parquetExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET) == null ? - Arrays.asList("parquet") : + Collections.singletonList("parquet") : Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET).toString().toLowerCase().split(",")); this.avroExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_AVRO) == null ? - Arrays.asList("avro") : + Collections.singletonList("avro") : Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_AVRO).toString().toLowerCase().split(",")); this.sequenceExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE) == null ? - Arrays.asList("seq") : + Collections.singletonList("seq") : Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE).toString().toLowerCase().split(",")); this.delimitedExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED) == null ? Arrays.asList("tsv", "csv") : @@ -102,7 +103,7 @@ protected AgnosticRecord nextRecord() { static class AgnosticAdapter implements ReaderAdapter { - public AgnosticAdapter() { + AgnosticAdapter() { } @Override @@ -115,7 +116,7 @@ static class AgnosticRecord { private final ReaderAdapter adapter; private final Object record; - public AgnosticRecord(ReaderAdapter adapter, Object record) { + AgnosticRecord(ReaderAdapter adapter, Object record) { this.adapter = adapter; this.record = record; } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java index 46e5e9f..44ec3df 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java @@ -25,14 +25,18 @@ public class AvroFileReader extends AbstractFileReader { public static final String FILE_READER_AVRO_SCHEMA = FILE_READER_AVRO + "schema"; private final AvroOffset offset; - private DataFileReader reader; + private final DataFileReader reader; private Schema schema; public AvroFileReader(FileSystem fs, Path filePath, Map config) throws IOException { super(fs, filePath, new GenericRecordToStruct(), config); AvroFSInput input = new AvroFSInput(FileContext.getFileContext(filePath.toUri()), filePath); - this.reader = new DataFileReader<>(input, new SpecificDatumReader<>(this.schema)); + if (this.schema == null) { + this.reader = new DataFileReader<>(input, new SpecificDatumReader<>()); + } else { + this.reader = new DataFileReader<>(input, new SpecificDatumReader<>(this.schema)); + } this.offset = new AvroOffset(0); } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java index 542d3c0..b5f59ee 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java @@ -9,7 +9,6 @@ import java.io.IOException; import java.util.Map; -import java.util.stream.Collectors; import java.util.stream.IntStream; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; @@ -43,7 +42,7 @@ public DelimitedTextFileReader(FileSystem fs, Path filePath, Map SchemaBuilder schemaBuilder = SchemaBuilder.struct(); if (hasNext()) { String firstLine = inner.nextRecord().getValue(); - String columns[] = 
firstLine.split(token); + String[] columns = firstLine.split(token); IntStream.range(0, columns.length).forEach(index -> { String columnName = hasHeader ? columns[index] : DEFAULT_COLUMN_NAME + "_" + ++index; schemaBuilder.field(columnName, SchemaBuilder.STRING_SCHEMA); @@ -66,13 +65,13 @@ protected void configure(Map config) { this.token = config.get(FILE_READER_DELIMITED_TOKEN).toString(); this.defaultValue = config.get(FILE_READER_DELIMITED_DEFAULT_VALUE) == null ? null : config.get(FILE_READER_DELIMITED_DEFAULT_VALUE).toString(); - this.hasHeader = Boolean.valueOf((String) config.get(FILE_READER_DELIMITED_HEADER)); + this.hasHeader = Boolean.parseBoolean((String) config.get(FILE_READER_DELIMITED_HEADER)); } @Override protected DelimitedRecord nextRecord() { offset.inc(); - String values[] = inner.nextRecord().getValue().split(token); + String[] values = inner.nextRecord().getValue().split(token); return new DelimitedRecord(schema, defaultValue != null ? fillNullValues(values) : values); } @@ -84,9 +83,7 @@ private String[] fillNullValues(final String[] values) { } else { return defaultValue; } - }) - .collect(Collectors.toList()) - .toArray(new String[0]); + }).toArray(String[]::new); } @Override @@ -123,7 +120,7 @@ public void setOffset(long offset) { this.offset = hasHeader && offset > 0 ? offset - 1 : offset; } - protected void inc() { + void inc() { this.offset++; } @@ -151,7 +148,7 @@ static class DelimitedRecord { private final Schema schema; private final String[] values; - public DelimitedRecord(Schema schema, String[] values) { + DelimitedRecord(Schema schema, String[] values) { this.schema = schema; this.values = values; } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java index 76b71da..f6537f3 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java @@ -13,6 +13,7 @@ import org.apache.parquet.avro.AvroParquetReader; import org.apache.parquet.avro.AvroReadSupport; import org.apache.parquet.hadoop.ParquetReader; +import org.apache.parquet.hadoop.util.HadoopInputFile; import java.io.IOException; import java.util.Map; @@ -52,9 +53,9 @@ private ParquetReader initReader() throws IOException { if (this.projection != null) { AvroReadSupport.setRequestedProjection(configuration, this.projection); } - ParquetReader reader = AvroParquetReader.builder(getFilePath()) - .withConf(configuration).build(); - return reader; + return AvroParquetReader + .builder(HadoopInputFile.fromPath(getFilePath(), configuration)) + .build(); } protected void configure(Map config) { @@ -144,7 +145,7 @@ public void setOffset(long offset) { this.offset = offset; } - protected void inc() { + void inc() { this.offset++; } @@ -158,7 +159,7 @@ static class GenericRecordToStruct implements ReaderAdapter { private static final int CACHE_SIZE = 100; private final AvroData avroData; - public GenericRecordToStruct() { + GenericRecordToStruct() { this.avroData = new AvroData(CACHE_SIZE); } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java index 013a680..3432da7 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java +++ 
b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java @@ -154,7 +154,7 @@ public void setOffset(long offset) { this.offset = offset; } - protected void inc() { + void inc() { this.offset++; } @@ -202,7 +202,7 @@ static class SequenceRecord { private final String valueFieldName; private final U value; - public SequenceRecord(Schema schema, String keyFieldName, T key, String valueFieldName, U value) { + SequenceRecord(Schema schema, String keyFieldName, T key, String valueFieldName, U value) { this.schema = schema; this.keyFieldName = keyFieldName; this.key = key; diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java index a5781af..7ed0b80 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java @@ -158,7 +158,7 @@ static class TextRecord { private final Schema schema; private final String value; - public TextRecord(Schema schema, String value) { + TextRecord(Schema schema, String value) { this.schema = schema; this.value = value; } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index 2a6dbce..251987e 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -54,7 +54,7 @@ public AbstractPolicy(FsSourceTaskConfig conf) throws IOException { private Map customConfigs() { return conf.originals().entrySet().stream() .filter(entry -> entry.getKey().startsWith(FsSourceTaskConfig.POLICY_PREFIX)) - .collect(Collectors.toMap(entry -> entry.getKey(), entry -> entry.getValue())); + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } private void configFs(Map customConfigs) throws IOException { @@ -126,7 +126,7 @@ public void interrupt() { protected void preCheck() { } - protected void postCheck() { + private void postCheck() { } public Iterator listFiles(FileSystem fs) throws IOException { @@ -173,17 +173,16 @@ public final boolean hasEnded() { protected abstract boolean isPolicyCompleted(); - public final int getExecutions() { + final int getExecutions() { return executions.get(); } - protected FileMetadata toMetadata(LocatedFileStatus fileStatus) { - List blocks = new ArrayList<>(); + FileMetadata toMetadata(LocatedFileStatus fileStatus) { - blocks.addAll(Arrays.stream(fileStatus.getBlockLocations()) + List blocks = Arrays.stream(fileStatus.getBlockLocations()) .map(block -> new FileMetadata.BlockInfo(block.getOffset(), block.getLength(), block.isCorrupt())) - .collect(Collectors.toList())); + .collect(Collectors.toList()); return new FileMetadata(fileStatus.getPath().toString(), fileStatus.getLen(), blocks); } @@ -215,8 +214,8 @@ public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorage return reader; } - Iterator concat(final Iterator it1, - final Iterator it2) { + private Iterator concat(final Iterator it1, + final Iterator it2) { return new Iterator() { @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java index e928d13..dd558d6 100644 --- 
a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java @@ -30,7 +30,7 @@ public class HdfsFileWatcherPolicy extends AbstractPolicy { public HdfsFileWatcherPolicy(FsSourceTaskConfig conf) throws IOException { super(conf); - this.fileQueue = new ConcurrentLinkedQueue(); + this.fileQueue = new ConcurrentLinkedQueue<>(); startWatchers(); } @@ -50,15 +50,15 @@ protected void configPolicy(Map customConfigs) { } private void startWatchers() { - fsEvenStream.values().forEach(stream -> stream.start()); + fsEvenStream.values().forEach(Thread::start); } private void stopWatchers() { - fsEvenStream.values().forEach(stream -> stream.interrupt()); + fsEvenStream.values().forEach(Thread::interrupt); } @Override - public Iterator listFiles(FileSystem fs) throws IOException { + public Iterator listFiles(FileSystem fs) { Set files = new HashSet<>(); FileMetadata metadata; while ((metadata = fileQueue.poll()) != null) { @@ -95,7 +95,7 @@ private class EventStreamThread extends Thread { private final FileSystem fs; private final HdfsAdmin admin; - protected EventStreamThread(FileSystem fs, HdfsAdmin admin) { + EventStreamThread(FileSystem fs, HdfsAdmin admin) { this.fs = fs; this.admin = admin; } @@ -151,4 +151,3 @@ private void enqueue(String path) throws IOException { } } } - diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java index 4919c34..2a02884 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicy.java @@ -31,14 +31,14 @@ public SleepyPolicy(FsSourceTaskConfig conf) throws IOException { @Override protected void configPolicy(Map customConfigs) { try { - this.sleep = Long.valueOf((String) customConfigs.get(SLEEPY_POLICY_SLEEP_MS)); + this.sleep = Long.parseLong((String) customConfigs.get(SLEEPY_POLICY_SLEEP_MS)); } catch (NumberFormatException nfe) { throw new ConfigException(SLEEPY_POLICY_SLEEP_MS + " property is required and must be a number(long). Got: " + customConfigs.get(SLEEPY_POLICY_SLEEP_MS)); } if (customConfigs.get(SLEEPY_POLICY_MAX_EXECS) != null) { try { - this.maxExecs = Long.valueOf((String) customConfigs.get(SLEEPY_POLICY_MAX_EXECS)); + this.maxExecs = Long.parseLong((String) customConfigs.get(SLEEPY_POLICY_MAX_EXECS)); } catch (NumberFormatException nfe) { throw new ConfigException(SLEEPY_POLICY_MAX_EXECS + " property must be a number(long). Got: " + customConfigs.get(SLEEPY_POLICY_MAX_EXECS)); @@ -48,7 +48,7 @@ protected void configPolicy(Map customConfigs) { } if (customConfigs.get(SLEEPY_POLICY_SLEEP_FRACTION) != null) { try { - this.sleepFraction = Long.valueOf((String) customConfigs.get(SLEEPY_POLICY_SLEEP_FRACTION)); + this.sleepFraction = Long.parseLong((String) customConfigs.get(SLEEPY_POLICY_SLEEP_FRACTION)); } catch (NumberFormatException nfe) { throw new ConfigException(SLEEPY_POLICY_SLEEP_FRACTION + " property must be a number(long). 
Got: " + customConfigs.get(SLEEPY_POLICY_SLEEP_FRACTION)); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java index babe70c..6b84ca3 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java @@ -3,7 +3,7 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.policy.Policy; -import org.apache.commons.lang.reflect.ConstructorUtils; +import org.apache.commons.lang3.reflect.ConstructorUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -25,10 +25,10 @@ public static Policy makePolicy(Class clazz, FsSourceTaskConfi private static T make(Class clazz, Object... args) throws Throwable { try { - Class[] constClasses = Arrays.stream(args).map(arg -> arg.getClass()).toArray(Class[]::new); + Class[] constClasses = Arrays.stream(args).map(Object::getClass).toArray(Class[]::new); - Constructor constructor = ConstructorUtils.getMatchingAccessibleConstructor(clazz, constClasses); - return (T) constructor.newInstance(args); + Constructor constructor = ConstructorUtils.getMatchingAccessibleConstructor(clazz, constClasses); + return constructor.newInstance(args); } catch (IllegalAccessException | InstantiationException | InvocationTargetException e) { diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Version.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Version.java index 23d2312..7e94e04 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Version.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/Version.java @@ -22,4 +22,4 @@ public class Version { public static String getVersion() { return version; } -} \ No newline at end of file +} From 896b44378794f7d1c3ddf06a2eb29ccee085e24a Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 22 Feb 2020 11:37:52 -0600 Subject: [PATCH 04/51] Fix tests --- .../fs/file/reader/FileReaderTestBase.java | 8 ++++---- .../fs/file/reader/hdfs/AvroFileReaderTest.java | 3 ++- .../hdfs/DelimitedTextFileReaderTest.java | 16 ++++++++-------- .../reader/hdfs/HdfsFileReaderTestBase.java | 6 ++---- .../file/reader/hdfs/ParquetFileReaderTest.java | 17 +++++++++++++++-- .../reader/hdfs/SequenceFileReaderTest.java | 6 ++---- .../fs/file/reader/hdfs/TextFileReaderTest.java | 3 +-- .../file/reader/local/AvroFileReaderTest.java | 4 ++-- .../local/DelimitedTextFileReaderTest.java | 16 ++++++++-------- .../reader/local/ParquetFileReaderTest.java | 17 +++++++++++++++-- .../reader/local/SequenceFileReaderTest.java | 5 ++--- .../file/reader/local/TextFileReaderTest.java | 3 +-- .../policy/hdfs/HdfsFileWatcherPolicyTest.java | 5 ++--- .../fs/policy/hdfs/HdfsPolicyTestBase.java | 6 ++---- .../fs/policy/hdfs/SimplePolicyTest.java | 4 ++-- .../fs/policy/hdfs/SleepyPolicyTest.java | 5 ++--- .../fs/policy/local/SimplePolicyTest.java | 4 ++-- .../fs/policy/local/SleepyPolicyTest.java | 4 ++-- 18 files changed, 74 insertions(+), 58 deletions(-) diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java index e4aa2b4..238db17 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java 
+++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java @@ -39,7 +39,7 @@ public static void tearDown() throws IOException { @Before public void openReader() throws Throwable { reader = getReader(fs, dataFile, readerConfig); - assertTrue(reader.getFilePath().equals(dataFile)); + assertEquals(reader.getFilePath(), dataFile); } @After @@ -103,19 +103,19 @@ public void seekFile() { int recordIndex = NUM_RECORDS / 2; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex))); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = 0; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex))); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = NUM_RECORDS - 3; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex))); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1)); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java index 67a772e..f829ff1 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java @@ -23,6 +23,7 @@ import java.util.UUID; import java.util.stream.IntStream; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; public class AvroFileReaderTest extends HdfsFileReaderTestBase { @@ -101,7 +102,7 @@ protected Offset getOffset(long offset) { @Override protected void checkData(Struct record, long index) { - assertTrue((Integer) record.get(FIELD_INDEX) == index); + assertEquals((int) (Integer) record.get(FIELD_INDEX), index); assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java index da5304d..137eee1 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java @@ -50,7 +50,7 @@ private static Path createDataFile(boolean header) throws IOException { String value = String.format("%d_%s", index, UUID.randomUUID()); try { writer.append(value + "," + value + "," + value + "," + value + "\n"); - if (header) OFFSETS_BY_INDEX.put(index, Long.valueOf(index++)); + if (header) OFFSETS_BY_INDEX.put(index, (long) index); } catch (IOException ioe) { throw new RuntimeException(ioe); } @@ -124,10 +124,10 @@ public void 
readAllDataWithMalformedRows() throws Throwable { int recordCount = 0; while (reader.hasNext()) { Struct record = reader.next(); - assertTrue(record.get(FIELD_COLUMN1).equals("dummy")); - assertTrue(record.get(FIELD_COLUMN2).equals("custom_value")); - assertTrue(record.get(FIELD_COLUMN3).equals("custom_value")); - assertTrue(record.get(FIELD_COLUMN4).equals("custom_value")); + assertEquals("dummy", record.get(FIELD_COLUMN1)); + assertEquals("custom_value", record.get(FIELD_COLUMN2)); + assertEquals("custom_value", record.get(FIELD_COLUMN3)); + assertEquals("custom_value", record.get(FIELD_COLUMN4)); recordCount++; } assertEquals("The number of records in the file does not match", 2, recordCount); @@ -146,19 +146,19 @@ public void seekFileWithoutHeader() throws Throwable { int recordIndex = NUM_RECORDS / 2; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = 0; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = NUM_RECORDS - 3; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1, false)); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java index 5a7c1ba..f4f5183 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java @@ -15,13 +15,11 @@ public abstract class HdfsFileReaderTestBase extends FileReaderTestBase { private static MiniDFSCluster cluster; - private static Configuration clusterConfig; - private static Path hdfsDir; @BeforeClass public static void initFs() throws IOException { - clusterConfig = new Configuration(); - hdfsDir = Files.createTempDirectory("test-"); + Configuration clusterConfig = new Configuration(); + Path hdfsDir = Files.createTempDirectory("test-"); clusterConfig.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsDir.toAbsolutePath().toString()); cluster = new MiniDFSCluster.Builder(clusterConfig).build(); fsUri = URI.create("hdfs://localhost:" + cluster.getNameNodePort() + "/"); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java index ae0e82c..5b69bb3 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java @@ -19,6 +19,7 @@ 
import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.io.InvalidRecordException; import org.junit.BeforeClass; +import org.junit.Ignore; import org.junit.Test; import java.io.File; @@ -63,7 +64,7 @@ private static Path createDataFile() throws IOException { datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID())); datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID())); try { - OFFSETS_BY_INDEX.put(index, Long.valueOf(index)); + OFFSETS_BY_INDEX.put(index, (long) index); writer.write(datum); } catch (IOException ioe) { throw new RuntimeException(ioe); @@ -75,6 +76,18 @@ private static Path createDataFile() throws IOException { return path; } + @Ignore(value = "This test does not apply for parquet files") + @Test(expected = IOException.class) + public void emptyFile() throws Throwable { + super.emptyFile(); + } + + @Ignore(value = "This test does not apply for parquet files") + @Test(expected = IOException.class) + public void invalidFileFormat() throws Throwable { + super.invalidFileFormat(); + } + @Test public void readerWithSchema() throws Throwable { Map cfg = new HashMap() {{ @@ -138,7 +151,7 @@ protected Offset getOffset(long offset) { @Override protected void checkData(Struct record, long index) { - assertTrue((Integer) record.get(FIELD_INDEX) == index); + assertEquals((int) (Integer) record.get(FIELD_INDEX), index); assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java index d7e6ba0..23e1f8c 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java @@ -75,8 +75,7 @@ private static Path createDataFile() throws IOException { public void defaultFieldNames() throws Throwable { Map customReaderCfg = new HashMap<>(); reader = getReader(fs, dataFile, customReaderCfg); - assertTrue(reader.getFilePath().equals(dataFile)); - + assertEquals(reader.getFilePath(), dataFile); assertTrue(reader.hasNext()); int recordCount = 0; @@ -99,7 +98,7 @@ protected void checkData(Struct record, long index) { } private void checkData(String keyFieldName, String valueFieldName, Struct record, long index) { - assertTrue((Integer) record.get(keyFieldName) == index); + assertEquals((int) (Integer) record.get(keyFieldName), index); assertTrue(record.get(valueFieldName).toString().startsWith(index + "_")); } @@ -107,5 +106,4 @@ private void checkData(String keyFieldName, String valueFieldName, Struct record protected String getFileExtension() { return FILE_EXTENSION; } - } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java index 0c37d4d..9a063d3 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java @@ -42,7 +42,7 @@ private static Path createDataFile() throws IOException { String value = String.format("%d_%s", index, UUID.randomUUID()); try { writer.append(value + "\n"); - OFFSETS_BY_INDEX.put(index, 
Long.valueOf(index++)); + OFFSETS_BY_INDEX.put(index, (long) index); } catch (IOException ioe) { throw new RuntimeException(ioe); } @@ -98,5 +98,4 @@ protected void checkData(Struct record, long index) { protected String getFileExtension() { return FILE_EXTENSION; } - } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/AvroFileReaderTest.java index de4ed20..2dc0454 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/AvroFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/AvroFileReaderTest.java @@ -23,6 +23,7 @@ import java.util.UUID; import java.util.stream.IntStream; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; public class AvroFileReaderTest extends LocalFileReaderTestBase { @@ -106,7 +107,7 @@ protected Offset getOffset(long offset) { @Override protected void checkData(Struct record, long index) { - assertTrue((Integer) record.get(FIELD_INDEX) == index); + assertEquals((int) (Integer) record.get(FIELD_INDEX), index); assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); } @@ -115,5 +116,4 @@ protected void checkData(Struct record, long index) { protected String getFileExtension() { return FILE_EXTENSION; } - } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java index 5884240..763ce11 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java @@ -51,7 +51,7 @@ private static Path createDataFile(boolean header) throws IOException { String value = String.format("%d_%s", index, UUID.randomUUID()); try { writer.append(value + "," + value + "," + value + "," + value + "\n"); - if (header) OFFSETS_BY_INDEX.put(index, Long.valueOf(index++)); + if (header) OFFSETS_BY_INDEX.put(index, (long) index); } catch (IOException ioe) { throw new RuntimeException(ioe); } @@ -130,10 +130,10 @@ public void readAllDataWithMalformedRows() throws Throwable { int recordCount = 0; while (reader.hasNext()) { Struct record = reader.next(); - assertTrue(record.get(FIELD_COLUMN1).equals("dummy")); - assertTrue(record.get(FIELD_COLUMN2).equals("custom_value")); - assertTrue(record.get(FIELD_COLUMN3).equals("custom_value")); - assertTrue(record.get(FIELD_COLUMN4).equals("custom_value")); + assertEquals("dummy", record.get(FIELD_COLUMN1)); + assertEquals("custom_value", record.get(FIELD_COLUMN2)); + assertEquals("custom_value", record.get(FIELD_COLUMN3)); + assertEquals("custom_value", record.get(FIELD_COLUMN4)); recordCount++; } assertEquals("The number of records in the file does not match", 2, recordCount); @@ -153,19 +153,19 @@ public void seekFileWithoutHeader() throws Throwable { int recordIndex = NUM_RECORDS / 2; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex 
= 0; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = NUM_RECORDS - 3; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex).longValue() + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1, false)); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java index 91c1eb6..1cceebb 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java @@ -19,6 +19,7 @@ import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.io.InvalidRecordException; import org.junit.BeforeClass; +import org.junit.Ignore; import org.junit.Test; import java.io.File; @@ -65,7 +66,7 @@ private static Path createDataFile() throws IOException { datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID())); datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID())); try { - OFFSETS_BY_INDEX.put(index, Long.valueOf(index)); + OFFSETS_BY_INDEX.put(index, (long) index); writer.write(datum); } catch (IOException ioe) { throw new RuntimeException(ioe); @@ -77,6 +78,18 @@ private static Path createDataFile() throws IOException { return path; } + @Ignore(value = "This test does not apply for parquet files") + @Test(expected = IOException.class) + public void emptyFile() throws Throwable { + super.emptyFile(); + } + + @Ignore(value = "This test does not apply for parquet files") + @Test(expected = IOException.class) + public void invalidFileFormat() throws Throwable { + super.invalidFileFormat(); + } + @Test public void readerWithSchema() throws Throwable { Map cfg = new HashMap() {{ @@ -145,7 +158,7 @@ protected Offset getOffset(long offset) { @Override protected void checkData(Struct record, long index) { - assertTrue((Integer) record.get(FIELD_INDEX) == index); + assertEquals((int) (Integer) record.get(FIELD_INDEX), index); assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/SequenceFileReaderTest.java index 8d53cb8..48c4c4e 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/SequenceFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/SequenceFileReaderTest.java @@ -78,8 +78,7 @@ public void defaultFieldNames() throws Throwable { put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE, getFileExtension()); }}; reader = getReader(fs, dataFile, customReaderCfg); - assertTrue(reader.getFilePath().equals(dataFile)); - + 
assertEquals(reader.getFilePath(), dataFile); assertTrue(reader.hasNext()); int recordCount = 0; @@ -102,7 +101,7 @@ protected void checkData(Struct record, long index) { } private void checkData(String keyFieldName, String valueFieldName, Struct record, long index) { - assertTrue((Integer) record.get(keyFieldName) == index); + assertEquals((int) (Integer) record.get(keyFieldName), index); assertTrue(record.get(valueFieldName).toString().startsWith(index + "_")); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java index 53d9a98..39ae6e8 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java @@ -42,7 +42,7 @@ private static Path createDataFile() throws IOException { String value = String.format("%d_%s", index, UUID.randomUUID()); try { writer.append(value + "\n"); - OFFSETS_BY_INDEX.put(index, Long.valueOf(index++)); + OFFSETS_BY_INDEX.put(index, (long) index); } catch (IOException ioe) { throw new RuntimeException(ioe); } @@ -98,5 +98,4 @@ protected void checkData(Struct record, long index) { protected String getFileExtension() { return FILE_EXTENSION; } - } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java index d3e0d9a..0c32830 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java @@ -30,8 +30,8 @@ public static void setUp() throws IOException { } Map cfg = new HashMap() {{ - String uris[] = directories.stream().map(dir -> dir.toString()) - .toArray(size -> new String[size]); + String[] uris = directories.stream().map(Path::toString) + .toArray(String[]::new); put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); put(FsSourceTaskConfig.TOPIC, "topic_test"); put(FsSourceTaskConfig.POLICY_CLASS, HdfsFileWatcherPolicy.class.getName()); @@ -70,5 +70,4 @@ public void execPolicyAlreadyEnded() throws IOException { assertTrue(policy.hasEnded()); policy.execute(); } - } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsPolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsPolicyTestBase.java index 3cbe9a9..d046d0b 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsPolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsPolicyTestBase.java @@ -15,13 +15,11 @@ public abstract class HdfsPolicyTestBase extends PolicyTestBase { private static MiniDFSCluster cluster; - private static Configuration clusterConfig; - private static Path hdfsDir; @BeforeClass public static void initFs() throws IOException { - clusterConfig = new Configuration(); - hdfsDir = Files.createTempDirectory("test-"); + Configuration clusterConfig = new Configuration(); + Path hdfsDir = Files.createTempDirectory("test-"); clusterConfig.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsDir.toAbsolutePath().toString()); cluster = new MiniDFSCluster.Builder(clusterConfig).build(); fsUri = URI.create("hdfs://localhost:" + cluster.getNameNodePort() + "/"); diff --git 
a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java index 06f1db7..33ebe28 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java @@ -25,8 +25,8 @@ public static void setUp() throws IOException { } Map cfg = new HashMap() {{ - String uris[] = directories.stream().map(dir -> dir.toString()) - .toArray(size -> new String[size]); + String[] uris = directories.stream().map(Path::toString) + .toArray(String[]::new); put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); put(FsSourceTaskConfig.TOPIC, "topic_test"); put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName()); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java index edd5533..77d85a6 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java @@ -32,8 +32,8 @@ public static void setUp() throws IOException { } Map cfg = new HashMap() {{ - String uris[] = directories.stream().map(dir -> dir.toString()) - .toArray(size -> new String[size]); + String[] uris = directories.stream().map(Path::toString) + .toArray(String[]::new); put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); put(FsSourceTaskConfig.TOPIC, "topic_test"); put(FsSourceTaskConfig.POLICY_CLASS, SleepyPolicy.class.getName()); @@ -105,5 +105,4 @@ public void defaultExecutions() throws Throwable { policy.interrupt(); assertTrue(policy.hasEnded()); } - } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SimplePolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SimplePolicyTest.java index 214849b..c8a221a 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SimplePolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SimplePolicyTest.java @@ -25,8 +25,8 @@ public static void setUp() throws IOException { } Map cfg = new HashMap() {{ - String uris[] = directories.stream().map(dir -> dir.toString()) - .toArray(size -> new String[size]); + String[] uris = directories.stream().map(Path::toString) + .toArray(String[]::new); put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); put(FsSourceTaskConfig.TOPIC, "topic_test"); put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName()); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SleepyPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SleepyPolicyTest.java index 2f907ae..be6c58b 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SleepyPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SleepyPolicyTest.java @@ -32,8 +32,8 @@ public static void setUp() throws IOException { } Map cfg = new HashMap() {{ - String uris[] = directories.stream().map(dir -> dir.toString()) - .toArray(size -> new String[size]); + String[] uris = directories.stream().map(Path::toString) + .toArray(String[]::new); put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); put(FsSourceTaskConfig.TOPIC, "topic_test"); put(FsSourceTaskConfig.POLICY_CLASS, SleepyPolicy.class.getName()); From 
38e1bf050d9239d69180322d5c726edc426f124d Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 22 Feb 2020 11:52:49 -0600 Subject: [PATCH 05/51] Updating docs --- docs/Makefile | 2 +- docs/source/conf.py | 4 ++-- docs/source/config_options.rst | 2 +- docs/source/connector.rst | 2 +- docs/source/faq.rst | 1 - docs/source/filereaders.rst | 2 +- docs/source/policies.rst | 2 +- 7 files changed, 7 insertions(+), 8 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index 4dea114..9aeda1f 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -17,4 +17,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source/conf.py b/docs/source/conf.py index d2ffa24..f6edf0c 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -55,9 +55,9 @@ # built documents. # # The short X.Y version. -version = '0.1' +version = '1.0' # The full version, including alpha/beta/rc tags. -release = '0.1' +release = '1.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/source/config_options.rst b/docs/source/config_options.rst index 6eaf081..851419f 100644 --- a/docs/source/config_options.rst +++ b/docs/source/config_options.rst @@ -155,7 +155,7 @@ In order to configure custom properties for this policy, the name you must use i .. _config_options-policies-hdfs: -Hdfs file watcher +HDFS file watcher -------------------------------------------- This policy does not have any additional configuration. diff --git a/docs/source/connector.rst b/docs/source/connector.rst index 48cd0e0..d045f1e 100644 --- a/docs/source/connector.rst +++ b/docs/source/connector.rst @@ -24,7 +24,7 @@ Getting started Prerequisites -------------------------------------------- -- Confluent Platform 3.1.1 +- Confluent Platform 5.4.0 - Java 8 Building from source diff --git a/docs/source/faq.rst b/docs/source/faq.rst index 49e9ef7..a5077dc 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -53,4 +53,3 @@ until throws an exception. It's a matter of time. But the main thing is that you don't have to worry about removing files from the FS when they are being processed. The connector tolerates errors when reading files and continues with the next file. - diff --git a/docs/source/filereaders.rst b/docs/source/filereaders.rst index 37c76f3..75b349b 100644 --- a/docs/source/filereaders.rst +++ b/docs/source/filereaders.rst @@ -16,7 +16,7 @@ Parquet Reads files with `Parquet `__ format. The reader takes advantage of the Parquet-Avro API and uses the Parquet file -as if it were an Avro file, so the message sent to Kafka is built in the same +as if it was an Avro file, so the message sent to Kafka is built in the same way as the Avro file reader does. .. warning:: Seeking Parquet files is a heavy task because the reader has to diff --git a/docs/source/policies.rst b/docs/source/policies.rst index abed625..b2ceb86 100644 --- a/docs/source/policies.rst +++ b/docs/source/policies.rst @@ -14,7 +14,7 @@ and wait for the next one. Additionally, its custom properties allow to end it. You can learn more about the properties of this policy :ref:`here`. 
-Hdfs file watcher +HDFS file watcher ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ It uses Hadoop notifications events and all create/append/close events will be reported From f2e665313da7dba54e53d33d9eb6c9d651fe64b8 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Fri, 28 Feb 2020 23:04:28 -0600 Subject: [PATCH 06/51] Enable flag in TextFileReader to read a record per line --- .../fs/file/reader/AbstractFileReader.java | 4 ++ .../fs/file/reader/AvroFileReader.java | 5 +- .../file/reader/DelimitedTextFileReader.java | 8 ++-- .../fs/file/reader/SequenceFileReader.java | 5 +- .../fs/file/reader/TextFileReader.java | 48 ++++++++++++------- .../local/DelimitedTextFileReaderTest.java | 1 - .../reader/local/ParquetFileReaderTest.java | 1 - .../file/reader/local/TextFileReaderTest.java | 22 +++++++++ 8 files changed, 67 insertions(+), 27 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java index 1fbdddb..533b628 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java @@ -3,6 +3,8 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.Map; import java.util.stream.Collectors; @@ -10,6 +12,7 @@ import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; public abstract class AbstractFileReader implements FileReader { + protected final Logger log = LoggerFactory.getLogger(getClass()); private final FileSystem fs; private final Path filePath; @@ -25,6 +28,7 @@ public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter Map readerConf = config.entrySet().stream() .filter(entry -> entry.getKey().startsWith(FILE_READER_PREFIX)) + .filter(entry -> entry.getValue() != null) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); configure(readerConf); } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java index 44ec3df..1db7e01 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java @@ -93,7 +93,7 @@ public void setOffset(long offset) { this.offset = offset; } - protected void inc() { + void inc() { this.offset++; } @@ -104,10 +104,11 @@ public long getRecordOffset() { } static class GenericRecordToStruct implements ReaderAdapter { + private static final int CACHE_SIZE = 100; private final AvroData avroData; - public GenericRecordToStruct() { + GenericRecordToStruct() { this.avroData = new AvroData(CACHE_SIZE); } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java index b5f59ee..ff703aa 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java @@ -14,6 +14,7 @@ import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; public class DelimitedTextFileReader 
extends AbstractFileReader { + private static final String FILE_READER_DELIMITED = FILE_READER_PREFIX + "delimited."; public static final String FILE_READER_DELIMITED_HEADER = FILE_READER_DELIMITED + "header"; public static final String FILE_READER_DELIMITED_TOKEN = FILE_READER_DELIMITED + "token"; @@ -32,10 +33,9 @@ public class DelimitedTextFileReader extends AbstractFileReader config) throws IOException { super(fs, filePath, new DelimitedTxtToStruct(), config); - //mapping encoding for text file reader - if (config.get(FILE_READER_DELIMITED_ENCODING) != null) { - config.put(TextFileReader.FILE_READER_TEXT_ENCODING, config.get(FILE_READER_DELIMITED_ENCODING)); - } + config.put(TextFileReader.FILE_READER_TEXT_ENCODING, config.get(FILE_READER_DELIMITED_ENCODING)); + config.put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, "true"); + this.inner = new TextFileReader(fs, filePath, config); this.offset = new DelimitedTextOffset(0, hasHeader); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java index 3432da7..58d1e0e 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java @@ -30,7 +30,6 @@ public class SequenceFileReader extends AbstractFileReader nextRecord() { throw new NoSuchElementException("There are no more records in file: " + getFilePath()); } recordIndex++; - return new SequenceRecord(schema, keyFieldName, key, valueFieldName, value); + return new SequenceRecord<>(schema, keyFieldName, key, valueFieldName, value); } @Override @@ -196,6 +195,7 @@ private Object toSchemaValue(Writable writable) { } static class SequenceRecord { + private final Schema schema; private final String keyFieldName; private final T key; @@ -211,5 +211,4 @@ static class SequenceRecord { } } - } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java index 7ed0b80..c623275 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java @@ -8,31 +8,35 @@ import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.errors.ConnectException; +import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.nio.charset.Charset; +import java.util.List; import java.util.Map; import java.util.NoSuchElementException; +import java.util.stream.Collectors; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; public class TextFileReader extends AbstractFileReader { - public static final String FIELD_NAME_VALUE_DEFAULT = "value"; - private static final String FILE_READER_TEXT = FILE_READER_PREFIX + "text."; - private static final String FILE_READER_SEQUENCE_FIELD_NAME_PREFIX = FILE_READER_TEXT + "field_name."; + private static final String FILE_READER_FIELD_NAME_PREFIX = FILE_READER_TEXT + "field_name."; - public static final String FILE_READER_TEXT_FIELD_NAME_VALUE = FILE_READER_SEQUENCE_FIELD_NAME_PREFIX + "value"; + public static final String FIELD_NAME_VALUE_DEFAULT = "value"; + public static final String FILE_READER_TEXT_FIELD_NAME_VALUE = FILE_READER_FIELD_NAME_PREFIX + "value"; + 
public static final String FILE_READER_TEXT_RECORD_PER_LINE = FILE_READER_TEXT + "record_per_line"; public static final String FILE_READER_TEXT_ENCODING = FILE_READER_TEXT + "encoding"; private final TextOffset offset; - private String currentLine; + private String current; private boolean finished = false; private LineNumberReader reader; private Schema schema; private Charset charset; + private boolean recordPerLine; public TextFileReader(FileSystem fs, Path filePath, Map config) throws IOException { super(fs, filePath, new TxtToStruct(), config); @@ -49,34 +53,46 @@ protected void configure(Map config) { } else { valueFieldName = config.get(FILE_READER_TEXT_FIELD_NAME_VALUE).toString(); } - this.schema = SchemaBuilder.struct() - .field(valueFieldName, Schema.STRING_SCHEMA) - .build(); - if (config.get(FILE_READER_TEXT_ENCODING) == null || config.get(FILE_READER_TEXT_ENCODING).toString().equals("")) { this.charset = Charset.defaultCharset(); } else { this.charset = Charset.forName(config.get(FILE_READER_TEXT_ENCODING).toString()); } + if (config.get(FILE_READER_TEXT_RECORD_PER_LINE) == null || + config.get(FILE_READER_TEXT_RECORD_PER_LINE).toString().equals("")) { + this.recordPerLine = true; + } else { + this.recordPerLine = Boolean.parseBoolean(config.get(FILE_READER_TEXT_RECORD_PER_LINE).toString()); + } + this.schema = SchemaBuilder.struct() + .field(valueFieldName, Schema.STRING_SCHEMA) + .build(); } @Override public boolean hasNext() { - if (currentLine != null) { + if (current != null) { return true; } else if (finished) { return false; } else { try { - while (true) { + if (!recordPerLine) { + List lines = new BufferedReader(reader).lines().collect(Collectors.toList()); + offset.setOffset(lines.size() - 1); + current = String.join("\n", lines); + finished = true; + return true; + } + for (; ; ) { String line = reader.readLine(); offset.setOffset(reader.getLineNumber()); if (line == null) { finished = true; return false; } - currentLine = line; + current = line; return true; } } catch (IOException ioe) { @@ -90,8 +106,8 @@ protected TextRecord nextRecord() { if (!hasNext()) { throw new NoSuchElementException("There are no more records in file: " + getFilePath()); } - String aux = currentLine; - currentLine = null; + String aux = current; + current = null; return new TextRecord(schema, aux); } @@ -104,9 +120,9 @@ public void seek(Offset offset) { try { if (offset.getRecordOffset() < reader.getLineNumber()) { this.reader = new LineNumberReader(new InputStreamReader(getFs().open(getFilePath()))); - currentLine = null; + current = null; } - while ((currentLine = reader.readLine()) != null) { + while ((current = reader.readLine()) != null) { if (reader.getLineNumber() - 1 == offset.getRecordOffset()) { this.offset.setOffset(reader.getLineNumber()); return; diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java index 763ce11..679ef45 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java @@ -104,7 +104,6 @@ public void readAllDataWithoutHeader() throws Throwable { recordCount++; } assertEquals("The number of records in the file does not match", NUM_RECORDS, recordCount); - } @Test diff --git 
a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java index 1cceebb..da23677 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java @@ -59,7 +59,6 @@ private static Path createDataFile() throws IOException { try (ParquetWriter writer = AvroParquetWriter.builder(new Path(parquetFile.toURI())) .withConf(fs.getConf()).withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withSchema(readerSchema).build()) { - IntStream.range(0, NUM_RECORDS).forEach(index -> { GenericRecord datum = new GenericData.Record(readerSchema); datum.put(FIELD_INDEX, index); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java index 39ae6e8..7de8414 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java @@ -2,6 +2,7 @@ import com.github.mmolimar.kafka.connect.fs.file.Offset; import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; +import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; @@ -18,6 +19,7 @@ import java.util.UUID; import java.util.stream.IntStream; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; public class TextFileReaderTest extends LocalFileReaderTestBase { @@ -84,6 +86,26 @@ public void invalidFileEncoding() throws Throwable { getReader(fs, dataFile, cfg); } + @Test + public void readDataWithRecordPerLineDisabled() throws Throwable { + Path file = createDataFile(); + FileReader reader = getReader(fs, file, new HashMap() {{ + put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, "false"); + }}); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals("The number of records in the file does not match", 1, recordCount); + } + @Override protected Offset getOffset(long offset) { return new TextFileReader.TextOffset(offset); From 2cd530e5fda239308571fa4933229f916616ac99 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 29 Feb 2020 01:52:46 -0600 Subject: [PATCH 07/51] Fix offsets when seeking in text readers --- .../connect/fs/file/reader/TextFileReader.java | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java index c623275..4d03487 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java @@ -118,17 +118,15 @@ public void seek(Offset offset) { throw new IllegalArgumentException("Record offset must be greater than 0"); } try { + current = null; if 
(offset.getRecordOffset() < reader.getLineNumber()) { - this.reader = new LineNumberReader(new InputStreamReader(getFs().open(getFilePath()))); - current = null; + finished = false; + reader = new LineNumberReader(new InputStreamReader(getFs().open(getFilePath()))); } - while ((current = reader.readLine()) != null) { - if (reader.getLineNumber() - 1 == offset.getRecordOffset()) { - this.offset.setOffset(reader.getLineNumber()); - return; - } + while (reader.getLineNumber() < offset.getRecordOffset()) { + reader.readLine(); } - this.offset.setOffset(reader.getLineNumber()); + this.offset.setOffset(reader.getLineNumber() + 1); } catch (IOException ioe) { throw new ConnectException("Error seeking file " + getFilePath(), ioe); } From 09bf4afca529c6071bb33c3982fb67f3946f8d16 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Mon, 2 Mar 2020 02:58:49 +0100 Subject: [PATCH 08/51] New JSON file reader --- pom.xml | 16 +- .../fs/file/reader/AgnosticFileReader.java | 18 +- .../fs/file/reader/JsonFileReader.java | 222 ++++++++++++++++++ 3 files changed, 246 insertions(+), 10 deletions(-) create mode 100644 src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java diff --git a/pom.xml b/pom.xml index 52c6b89..c63e4fc 100644 --- a/pom.xml +++ b/pom.xml @@ -16,6 +16,7 @@ 3.2.1 1.9.2 1.11.0 + 2.10.2 4.13 4.2 2.0.5 @@ -51,20 +52,27 @@ org.apache.avro - avro-tools + avro ${avro.version} - nodeps org.apache.avro - avro + avro-tools ${avro.version} + nodeps org.apache.parquet parquet-avro ${parquet.version} + + com.fasterxml.jackson.core + jackson-core + ${fasterxml-jackson.version} + + + junit junit @@ -181,4 +189,4 @@ http://packages.confluent.io/maven/ - \ No newline at end of file + diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java index 34f9670..caa89a1 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java @@ -20,17 +20,18 @@ public class AgnosticFileReader extends AbstractFileReader parquetExtensions, avroExtensions, sequenceExtensions, delimitedExtensions; + private final AbstractFileReader reader; + private List parquetExtensions, avroExtensions, jsonExtensions, sequenceExtensions, delimitedExtensions; public AgnosticFileReader(FileSystem fs, Path filePath, Map config) throws IOException { super(fs, filePath, new AgnosticAdapter(), config); try { - reader = (AbstractFileReader) readerByExtension(fs, filePath, config); + reader = readerByExtension(fs, filePath, config); } catch (RuntimeException | IOException e) { throw e; } catch (Throwable t) { @@ -38,17 +39,19 @@ public AgnosticFileReader(FileSystem fs, Path filePath, Map conf } } - private FileReader readerByExtension(FileSystem fs, Path filePath, Map config) + private AbstractFileReader readerByExtension(FileSystem fs, Path filePath, Map config) throws Throwable { int index = filePath.getName().lastIndexOf('.'); String extension = index == -1 || index == filePath.getName().length() - 1 ? 
"" : filePath.getName().substring(index + 1).toLowerCase(); - Class clz; + Class clz; if (parquetExtensions.contains(extension)) { clz = ParquetFileReader.class; } else if (avroExtensions.contains(extension)) { clz = AvroFileReader.class; + } else if (jsonExtensions.contains(extension)) { + clz = JsonFileReader.class; } else if (sequenceExtensions.contains(extension)) { clz = SequenceFileReader.class; } else if (delimitedExtensions.contains(extension)) { @@ -57,7 +60,7 @@ private FileReader readerByExtension(FileSystem fs, Path filePath, Map) ReflectionUtils.makeReader(clz, fs, filePath, config); } @Override @@ -68,6 +71,9 @@ protected void configure(Map config) { this.avroExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_AVRO) == null ? Collections.singletonList("avro") : Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_AVRO).toString().toLowerCase().split(",")); + this.jsonExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_JSON) == null ? + Collections.singletonList("json") : + Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_JSON).toString().toLowerCase().split(",")); this.sequenceExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE) == null ? Collections.singletonList("seq") : Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE).toString().toLowerCase().split(",")); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java new file mode 100644 index 0000000..58230f9 --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java @@ -0,0 +1,222 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.github.mmolimar.kafka.connect.fs.file.Offset; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaBuilder; +import org.apache.kafka.connect.data.Struct; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; + +public class JsonFileReader extends AbstractFileReader { + + private static final String FILE_READER_JSON = FILE_READER_PREFIX + "json."; + + public static final String FILE_READER_JSON_DESERIALIZATION_CONFIGS = FILE_READER_JSON + "deserialization."; + public static final String FILE_READER_JSON_RECORD_PER_LINE = FILE_READER_JSON + "record_per_line"; + public static final String FILE_READER_JSON_ENCODING = FILE_READER_JSON + "encoding"; + + private final TextFileReader inner; + private final Schema schema; + private ObjectMapper mapper; + + public JsonFileReader(FileSystem fs, Path filePath, Map config) throws IOException { + super(fs, filePath, new JsonToStruct(), config); + + config.put(TextFileReader.FILE_READER_TEXT_ENCODING, config.get(FILE_READER_JSON_ENCODING)); + config.put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, config.get(FILE_READER_JSON_RECORD_PER_LINE)); + + this.inner = new TextFileReader(fs, filePath, config); + + if (hasNext()) { + String line = inner.nextRecord().getValue(); + 
this.schema = extractSchema(mapper.readTree(line)); + //back to the first line + inner.seek(() -> 0); + } else { + this.schema = SchemaBuilder.struct().build(); + } + } + + @Override + protected void configure(Map config) { + mapper = new ObjectMapper(); + Set deserializationFeatures = Arrays.stream(DeserializationFeature.values()) + .map(Enum::name) + .collect(Collectors.toSet()); + config.entrySet().stream() + .filter(entry -> entry.getValue() != null) + .filter(entry -> entry.getKey().startsWith(FILE_READER_JSON_DESERIALIZATION_CONFIGS)) + .forEach(entry -> { + String feature = entry.getKey().replaceAll(FILE_READER_JSON_DESERIALIZATION_CONFIGS, ""); + if (deserializationFeatures.contains(feature)) { + mapper.configure(DeserializationFeature.valueOf(feature), + Boolean.parseBoolean(entry.getValue().toString())); + } else { + log.warn("Ignoring deserialization configuration '" + feature + "' due to it does not exist."); + } + }); + } + + @Override + protected JsonRecord nextRecord() { + try { + JsonNode value = mapper.readTree(inner.nextRecord().getValue()); + return new JsonRecord(schema, value); + } catch (JsonProcessingException jpe) { + throw new IllegalStateException(jpe); + } + } + + @Override + public boolean hasNext() { + return inner.hasNext(); + } + + @Override + public void seek(Offset offset) { + inner.seek(offset); + } + + @Override + public Offset currentOffset() { + return inner.currentOffset(); + } + + @Override + public void close() throws IOException { + inner.close(); + } + + private static Schema extractSchema(JsonNode jsonNode) { + switch (jsonNode.getNodeType()) { + case BOOLEAN: + return Schema.OPTIONAL_BOOLEAN_SCHEMA; + case NUMBER: + if (jsonNode.isShort()) { + return Schema.OPTIONAL_INT8_SCHEMA; + } else if (jsonNode.isInt()) { + return Schema.OPTIONAL_INT32_SCHEMA; + } else if (jsonNode.isLong()) { + return Schema.OPTIONAL_INT64_SCHEMA; + } else if (jsonNode.isFloat()) { + return Schema.OPTIONAL_FLOAT32_SCHEMA; + } else if (jsonNode.isDouble()) { + return Schema.OPTIONAL_FLOAT64_SCHEMA; + } else if (jsonNode.isBigInteger()) { + return Schema.OPTIONAL_INT64_SCHEMA; + } else if (jsonNode.isBigDecimal()) { + return Schema.OPTIONAL_FLOAT64_SCHEMA; + } else { + return Schema.OPTIONAL_FLOAT64_SCHEMA; + } + case STRING: + return Schema.OPTIONAL_STRING_SCHEMA; + case BINARY: + return Schema.OPTIONAL_BYTES_SCHEMA; + case ARRAY: + Iterable elements = jsonNode::elements; + Schema arraySchema = StreamSupport.stream(elements.spliterator(), false) + .findFirst().map(JsonFileReader::extractSchema) + .orElse(SchemaBuilder.struct().build()); + return SchemaBuilder.array(arraySchema).build(); + case OBJECT: + SchemaBuilder builder = SchemaBuilder.struct(); + jsonNode.fields() + .forEachRemaining(field -> builder.field(field.getKey(), extractSchema(field.getValue()))); + return builder.build(); + default: + return SchemaBuilder.struct().optional().build(); + } + } + + static class JsonToStruct implements ReaderAdapter { + + @Override + public Struct apply(JsonRecord record) { + return toStruct(record.schema, record.value); + } + + private Struct toStruct(Schema schema, JsonNode jsonNode) { + if (jsonNode.isNull()) return null; + Struct struct = new Struct(schema); + jsonNode.fields() + .forEachRemaining(field -> struct.put(field.getKey(), + mapValue(struct.schema().field(field.getKey()).schema(), field.getValue()))); + return struct; + } + + private Object mapValue(Schema schema, JsonNode value) { + if (value == null) return null; + + switch (value.getNodeType()) { + case BOOLEAN: 
+ return value.booleanValue(); + case NUMBER: + if (value.isShort()) { + return value.shortValue(); + } else if (value.isInt()) { + return value.intValue(); + } else if (value.isLong()) { + return value.longValue(); + } else if (value.isFloat()) { + return value.floatValue(); + } else if (value.isDouble()) { + return value.doubleValue(); + } else if (value.isBigInteger()) { + return value.bigIntegerValue(); + } else { + return value.numberValue(); + } + case STRING: + return value.asText(); + case BINARY: + try { + return value.binaryValue(); + } catch (IOException ioe) { + throw new IllegalStateException(ioe); + } + case OBJECT: + Struct struct = new Struct(schema); + Iterable> fields = value::fields; + StreamSupport.stream(fields.spliterator(), false) + .forEach(field -> struct.put(field.getKey(), + mapValue(extractSchema(field.getValue()), field.getValue())) + ); + return struct; + case ARRAY: + Iterable arrayElements = value::elements; + return StreamSupport.stream(arrayElements.spliterator(), false) + .map(elm -> mapValue(schema, elm)) + .collect(Collectors.toList()); + case NULL: + case POJO: + case MISSING: + default: + return null; + } + } + } + + static class JsonRecord { + private final Schema schema; + private final JsonNode value; + + JsonRecord(Schema schema, JsonNode value) { + this.schema = schema; + this.value = value; + } + } +} From 9a35403f425367565814b822d585ac0b3b828057 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Mon, 2 Mar 2020 02:59:07 +0100 Subject: [PATCH 09/51] Tests for JSON file reader --- .../file/reader/hdfs/JsonFileReaderTest.java | 170 +++++++++++++++++ .../file/reader/local/JsonFileReaderTest.java | 174 ++++++++++++++++++ 2 files changed, 344 insertions(+) create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/JsonFileReaderTest.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/JsonFileReaderTest.java diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/JsonFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/JsonFileReaderTest.java new file mode 100644 index 0000000..4a82ede --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/JsonFileReaderTest.java @@ -0,0 +1,170 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader.hdfs; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.github.mmolimar.kafka.connect.fs.file.Offset; +import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; +import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; +import com.github.mmolimar.kafka.connect.fs.file.reader.JsonFileReader; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Struct; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.charset.UnsupportedCharsetException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; +import java.util.stream.IntStream; + +import static org.junit.Assert.*; + +public class JsonFileReaderTest extends HdfsFileReaderTestBase { + + private static final String FIELD_INTEGER = "integerField"; + private static final String FIELD_LONG = "longField"; + private static final String FIELD_BOOLEAN = 
"booleanField"; + private static final String FIELD_STRING = "stringField"; + private static final String FIELD_DECIMAL = "decimalField"; + private static final String FIELD_ARRAY = "arrayField"; + private static final String FIELD_STRUCT = "structField"; + private static final String FIELD_NULL = "nullField"; + private static final String FILE_EXTENSION = "json"; + + @BeforeClass + public static void setUp() throws IOException { + readerClass = AgnosticFileReader.class; + dataFile = createDataFile(); + readerConfig = new HashMap() {{ + String deserializationConfig = DeserializationFeature.ACCEPT_EMPTY_ARRAY_AS_NULL_OBJECT.name(); + put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_JSON, FILE_EXTENSION); + put(JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS + deserializationConfig, "true"); + put(JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS + "invalid", "false"); + }}; + } + + private static Path createDataFile() throws IOException { + return createDataFile(NUM_RECORDS, true); + } + + private static Path createDataFile(int numRecords, boolean recordPerLine) throws IOException { + File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); + try (FileWriter writer = new FileWriter(txtFile)) { + IntStream.range(0, numRecords).forEach(index -> { + ObjectNode json = JsonNodeFactory.instance.objectNode() + .put(FIELD_INTEGER, index) + .put(FIELD_LONG, Long.MAX_VALUE) + .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) + .put(FIELD_BOOLEAN, true) + .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) + .put(FIELD_NULL, (String) null); + json.putArray(FIELD_ARRAY) + .add("elm[" + index + "]") + .add("elm[" + index + "]"); + json.putObject(FIELD_STRUCT) + .put(FIELD_INTEGER, (short) index) + .put(FIELD_LONG, Long.MAX_VALUE) + .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) + .put(FIELD_BOOLEAN, true) + .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) + .put(FIELD_NULL, (String) null); + try { + writer.append(recordPerLine ? json.toString() + "\n" : json.toPrettyString()); + OFFSETS_BY_INDEX.put(index, (long) index); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + }); + } + Path path = new Path(new Path(fsUri), txtFile.getName()); + fs.moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); + return path; + } + + @Ignore(value = "This test does not apply for json files") + @Test(expected = IOException.class) + public void emptyFile() throws Throwable { + super.emptyFile(); + } + + @Test + public void readEmptyFile() throws Throwable { + File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + FileReader reader = getReader(fs, path, readerConfig); + assertFalse(reader.hasNext()); + } + + @Test + public void validFileEncoding() throws Throwable { + Map cfg = new HashMap() {{ + put(JsonFileReader.FILE_READER_JSON_ENCODING, "Cp1252"); + }}; + reader = getReader(fs, dataFile, cfg); + readAllData(); + } + + @Test(expected = UnsupportedCharsetException.class) + public void invalidFileEncoding() throws Throwable { + Map cfg = new HashMap() {{ + put(JsonFileReader.FILE_READER_JSON_ENCODING, "invalid_charset"); + }}; + getReader(fs, dataFile, cfg); + } + + @Test + public void readDataWithRecordPerLineDisabled() throws Throwable { + Path file = createDataFile(1, false); + FileReader reader = getReader(fs, file, new HashMap() {{ + put(JsonFileReader.FILE_READER_JSON_RECORD_PER_LINE, "false"); + }}); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals("The number of records in the file does not match", 1, recordCount); + } + + @Override + protected Offset getOffset(long offset) { + return () -> offset; + } + + @Override + protected void checkData(Struct record, long index) { + assertEquals((int) (Integer) record.get(FIELD_INTEGER), index); + assertEquals((long) (Long) record.get(FIELD_LONG), Long.MAX_VALUE); + assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")); + assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())); + assertEquals((Double) record.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0); + assertNull(record.get(FIELD_NULL)); + assertNotNull(record.schema().field(FIELD_NULL)); + assertEquals(record.get(FIELD_ARRAY), Arrays.asList("elm[" + index + "]", "elm[" + index + "]")); + Struct subrecord = record.getStruct(FIELD_STRUCT); + assertEquals((int) (Integer) subrecord.get(FIELD_INTEGER), index); + assertEquals((long) (Long) subrecord.get(FIELD_LONG), Long.MAX_VALUE); + assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")); + assertTrue(Boolean.parseBoolean(subrecord.get(FIELD_BOOLEAN).toString())); + assertEquals((Double) subrecord.get(FIELD_DECIMAL), Double.parseDouble(index + "." 
+ index), 0); + assertNull(subrecord.get(FIELD_NULL)); + assertNotNull(subrecord.schema().field(FIELD_NULL)); + } + + @Override + protected String getFileExtension() { + return FILE_EXTENSION; + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/JsonFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/JsonFileReaderTest.java new file mode 100644 index 0000000..16bf3eb --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/JsonFileReaderTest.java @@ -0,0 +1,174 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader.local; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.github.mmolimar.kafka.connect.fs.file.Offset; +import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; +import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; +import com.github.mmolimar.kafka.connect.fs.file.reader.JsonFileReader; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Struct; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.charset.UnsupportedCharsetException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; +import java.util.stream.IntStream; + +import static org.junit.Assert.*; + +public class JsonFileReaderTest extends LocalFileReaderTestBase { + + private static final String FIELD_INTEGER = "integerField"; + private static final String FIELD_LONG = "longField"; + private static final String FIELD_BOOLEAN = "booleanField"; + private static final String FIELD_STRING = "stringField"; + private static final String FIELD_DECIMAL = "decimalField"; + private static final String FIELD_ARRAY = "arrayField"; + private static final String FIELD_STRUCT = "structField"; + private static final String FIELD_NULL = "nullField"; + private static final String FILE_EXTENSION = "jsn"; + + @BeforeClass + public static void setUp() throws IOException { + readerClass = AgnosticFileReader.class; + dataFile = createDataFile(); + readerConfig = new HashMap() {{ + String deserializationConfig = DeserializationFeature.ACCEPT_EMPTY_ARRAY_AS_NULL_OBJECT.name(); + put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_JSON, FILE_EXTENSION); + put(JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS + deserializationConfig, "true"); + put(JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS + "invalid", "false"); + }}; + } + + private static Path createDataFile() throws IOException { + return createDataFile(NUM_RECORDS, true); + } + + private static Path createDataFile(int numRecords, boolean recordPerLine) throws IOException { + File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); + try (FileWriter writer = new FileWriter(txtFile)) { + IntStream.range(0, numRecords).forEach(index -> { + ObjectNode json = JsonNodeFactory.instance.objectNode() + .put(FIELD_INTEGER, index) + .put(FIELD_LONG, Long.MAX_VALUE) + .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) + .put(FIELD_BOOLEAN, true) + .put(FIELD_DECIMAL, Double.parseDouble(index + "." 
+ index)) + .put(FIELD_NULL, (String) null); + json.putArray(FIELD_ARRAY) + .add("elm[" + index + "]") + .add("elm[" + index + "]"); + json.putObject(FIELD_STRUCT) + .put(FIELD_INTEGER, (short) index) + .put(FIELD_LONG, Long.MAX_VALUE) + .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) + .put(FIELD_BOOLEAN, true) + .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) + .put(FIELD_NULL, (String) null); + try { + writer.append(recordPerLine ? json.toString() + "\n" : json.toPrettyString()); + OFFSETS_BY_INDEX.put(index, (long) index); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + }); + } + Path path = new Path(new Path(fsUri), txtFile.getName()); + fs.moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); + return path; + } + + @Ignore(value = "This test does not apply for json files") + @Test(expected = IOException.class) + public void emptyFile() throws Throwable { + super.emptyFile(); + } + + @Test + public void readEmptyFile() throws Throwable { + File tmp = File.createTempFile("test-", "." + getFileExtension()); + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + FileReader reader = getReader(fs, path, readerConfig); + assertFalse(reader.hasNext()); + } + + @Test + public void validFileEncoding() throws Throwable { + Map cfg = new HashMap() {{ + put(JsonFileReader.FILE_READER_JSON_ENCODING, "Cp1252"); + put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_JSON, getFileExtension()); + }}; + reader = getReader(fs, dataFile, cfg); + readAllData(); + } + + @Test(expected = UnsupportedCharsetException.class) + public void invalidFileEncoding() throws Throwable { + Map cfg = new HashMap() {{ + put(JsonFileReader.FILE_READER_JSON_ENCODING, "invalid_charset"); + put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_JSON, getFileExtension()); + }}; + getReader(fs, dataFile, cfg); + } + + @Test + public void readDataWithRecordPerLineDisabled() throws Throwable { + Path file = createDataFile(1, false); + FileReader reader = getReader(fs, file, new HashMap() {{ + put(JsonFileReader.FILE_READER_JSON_RECORD_PER_LINE, "false"); + put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_JSON, getFileExtension()); + }}); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals("The number of records in the file does not match", 1, recordCount); + } + + @Override + protected Offset getOffset(long offset) { + return () -> offset; + } + + @Override + protected void checkData(Struct record, long index) { + assertEquals((int) (Integer) record.get(FIELD_INTEGER), index); + assertEquals((long) (Long) record.get(FIELD_LONG), Long.MAX_VALUE); + assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")); + assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())); + assertEquals((Double) record.get(FIELD_DECIMAL), Double.parseDouble(index + "." 
+ index), 0); + assertNull(record.get(FIELD_NULL)); + assertNotNull(record.schema().field(FIELD_NULL)); + assertEquals(record.get(FIELD_ARRAY), Arrays.asList("elm[" + index + "]", "elm[" + index + "]")); + Struct subrecord = record.getStruct(FIELD_STRUCT); + assertEquals((int) (Integer) subrecord.get(FIELD_INTEGER), index); + assertEquals((long) (Long) subrecord.get(FIELD_LONG), Long.MAX_VALUE); + assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")); + assertTrue(Boolean.parseBoolean(subrecord.get(FIELD_BOOLEAN).toString())); + assertEquals((Double) subrecord.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0); + assertNull(subrecord.get(FIELD_NULL)); + assertNotNull(subrecord.schema().field(FIELD_NULL)); + } + + @Override + protected String getFileExtension() { + return FILE_EXTENSION; + } + +} From 528b2b044383fef361161e7d236b208c84e26e84 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Tue, 3 Mar 2020 00:18:35 +0100 Subject: [PATCH 10/51] Throw IllegalStateException in readers when reader is already closed --- .../fs/file/reader/AvroFileReader.java | 20 ++++++++++++++----- .../fs/file/reader/ParquetFileReader.java | 2 +- .../fs/file/reader/SequenceFileReader.java | 4 ++++ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java index 1db7e01..14b70a2 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java @@ -2,6 +2,7 @@ import com.github.mmolimar.kafka.connect.fs.file.Offset; import io.confluent.connect.avro.AvroData; +import org.apache.avro.AvroRuntimeException; import org.apache.avro.Schema; import org.apache.avro.file.DataFileReader; import org.apache.avro.generic.GenericRecord; @@ -41,7 +42,8 @@ public AvroFileReader(FileSystem fs, Path filePath, Map config) } protected void configure(Map config) { - if (config.get(FILE_READER_AVRO_SCHEMA) != null) { + if (config.get(FILE_READER_AVRO_SCHEMA) != null && + !config.get(FILE_READER_AVRO_SCHEMA).toString().trim().isEmpty()) { this.schema = new Schema.Parser().parse(config.get(FILE_READER_AVRO_SCHEMA).toString()); } else { this.schema = null; @@ -50,15 +52,23 @@ protected void configure(Map config) { @Override public boolean hasNext() { - return reader.hasNext(); + try { + return reader.hasNext(); + } catch (AvroRuntimeException are) { + throw new IllegalStateException(are); + } } @Override protected GenericRecord nextRecord() { - GenericRecord record = reader.next(); - this.offset.inc(); + try { + GenericRecord record = reader.next(); + this.offset.inc(); - return record; + return record; + } catch (AvroRuntimeException are) { + throw new IllegalStateException(are); + } } @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java index f6537f3..6afe74f 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java @@ -73,7 +73,7 @@ protected void configure(Map config) { @Override public boolean hasNext() { - if (closed) return false; + if (closed) throw new IllegalStateException("Reader already closed."); if (currentRecord == null) { try { 
currentRecord = reader.read(); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java index 58d1e0e..40a939a 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java @@ -37,6 +37,7 @@ public class SequenceFileReader extends AbstractFileReader config) throws IOException { super(fs, filePath, new SeqToStruct(), config); @@ -53,6 +54,7 @@ public SequenceFileReader(FileSystem fs, Path filePath, Map conf this.offset = new SeqOffset(0); this.recordIndex = this.hasNextIndex = -1; this.hasNext = false; + this.isClosed = false; } @Override @@ -94,6 +96,7 @@ private Schema getSchema(Writable writable) { @Override public boolean hasNext() { + if (isClosed) throw new IllegalStateException("Reader already closed."); try { if (hasNextIndex == -1 || hasNextIndex == recordIndex) { hasNextIndex++; @@ -139,6 +142,7 @@ public Offset currentOffset() { @Override public void close() throws IOException { + isClosed = true; reader.close(); } From 8da9f0ae911abd8f1480ae1e420f6a87553723df Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Tue, 3 Mar 2020 00:57:07 +0100 Subject: [PATCH 11/51] Test migration to JUnit 5 --- pom.xml | 34 +++++---- .../FsSourceConnectorConfigTest.java | 14 ++-- .../fs/connector/FsSourceConnectorTest.java | 47 ++++++------ .../fs/file/reader/FileReaderTestBase.java | 55 +++++++------- .../file/reader/hdfs/AvroFileReaderTest.java | 34 +++++---- .../hdfs/DelimitedTextFileReaderTest.java | 59 ++++++++------- .../reader/hdfs/HdfsFileReaderTestBase.java | 8 +- .../file/reader/hdfs/JsonFileReaderTest.java | 58 ++++++++------- .../reader/hdfs/ParquetFileReaderTest.java | 51 +++++++------ .../reader/hdfs/SequenceFileReaderTest.java | 17 +++-- .../file/reader/hdfs/TextFileReaderTest.java | 56 ++++++++++---- .../file/reader/local/AvroFileReaderTest.java | 34 +++++---- .../local/DelimitedTextFileReaderTest.java | 68 ++++++++++------- .../file/reader/local/JsonFileReaderTest.java | 59 ++++++++------- .../reader/local/LocalFileReaderTestBase.java | 9 +-- .../reader/local/ParquetFileReaderTest.java | 50 +++++++------ .../reader/local/SequenceFileReaderTest.java | 17 +++-- .../file/reader/local/TextFileReaderTest.java | 38 ++++++---- .../connect/fs/policy/PolicyTestBase.java | 74 +++++++++---------- .../hdfs/HdfsFileWatcherPolicyTest.java | 26 ++++--- .../fs/policy/hdfs/HdfsPolicyTestBase.java | 10 +-- .../fs/policy/hdfs/SimplePolicyTest.java | 6 +- .../fs/policy/hdfs/SleepyPolicyTest.java | 40 +++++----- .../fs/policy/local/LocalPolicyTestBase.java | 8 +- .../fs/policy/local/SimplePolicyTest.java | 6 +- .../fs/policy/local/SleepyPolicyTest.java | 40 +++++----- .../fs/task/FsSourceTaskConfigTest.java | 14 ++-- .../connect/fs/task/FsSourceTaskTest.java | 55 +++++++------- .../connect/fs/task/FsSourceTaskTestBase.java | 40 +++++----- .../fs/task/hdfs/HdfsFsSourceTaskTest.java | 10 +-- .../task/hdfs/HdfsFsSourceTaskTestBase.java | 16 ++-- .../fs/task/local/LocalFsSourceTaskTest.java | 11 ++- .../task/local/LocalFsSourceTaskTestBase.java | 8 +- src/test/resources/log4j.properties | 13 ++++ 34 files changed, 597 insertions(+), 488 deletions(-) create mode 100644 src/test/resources/log4j.properties diff --git a/pom.xml b/pom.xml index c63e4fc..606806a 100644 --- a/pom.xml +++ b/pom.xml @@ -17,14 +17,17 @@ 1.9.2 1.11.0 2.10.2 - 4.13 + 5.6.0 4.2 
2.0.5 + 1.8 + ${maven-compiler.source} 3.2.0 3.8.1 3.2.0 - 0.8.5 - 4.3.0 + 0.8.5 + 4.3.0 + 3.0.0-M4 @@ -74,9 +77,9 @@ - junit - junit - ${junit.version} + org.junit.jupiter + junit-jupiter-api + ${junit-jupiter.version} test @@ -85,12 +88,6 @@ ${easymock.version} test - - org.powermock - powermock-module-junit4 - ${powermock.version} - test - org.powermock powermock-api-easymock @@ -126,10 +123,15 @@ ${maven-compiler-plugin.version} true - 1.8 - 1.8 + ${maven-compiler.source} + ${maven-compiler.target} + + org.apache.maven.plugins + maven-surefire-plugin + ${maven-surfire-plugin.version} + org.apache.maven.plugins maven-assembly-plugin @@ -153,7 +155,7 @@ org.jacoco jacoco-maven-plugin - ${jacoco-maven-plugin.version} + ${maven-jacoco-plugin.version} prepare-agent @@ -166,7 +168,7 @@ org.eluder.coveralls coveralls-maven-plugin - ${coveralls-maven-plugin.version} + ${maven-coveralls-plugin.version} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorConfigTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorConfigTest.java index 2a33262..5f0538e 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorConfigTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorConfigTest.java @@ -2,10 +2,10 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceConnectorConfig; import org.apache.kafka.common.config.ConfigDef; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class FsSourceConnectorConfigTest { @@ -13,9 +13,9 @@ public class FsSourceConnectorConfigTest { public void checkDocumentation() { ConfigDef config = FsSourceConnectorConfig.conf(); config.names().forEach(key -> { - assertFalse("Property " + key + " should be documented", - config.configKeys().get(key).documentation == null || - "".equals(config.configKeys().get(key).documentation.trim())); + assertFalse(config.configKeys().get(key).documentation == null || + "".equals(config.configKeys().get(key).documentation.trim()), + () -> "Property " + key + " should be documented"); }); } @@ -23,4 +23,4 @@ public void checkDocumentation() { public void toRst() { assertNotNull(FsSourceConnectorConfig.conf().toRst()); } -} \ No newline at end of file +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorTest.java index 5fc9c5e..a67a92e 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/connector/FsSourceConnectorTest.java @@ -4,55 +4,53 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceTask; import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import org.apache.kafka.connect.errors.ConnectException; -import org.junit.Before; -import org.junit.ClassRule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; import java.io.File; -import java.io.IOException; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.IntStream; -import static org.junit.Assert.*; 
+import static org.junit.jupiter.api.Assertions.*; public class FsSourceConnectorTest { - @ClassRule - public static final TemporaryFolder temporaryFolder = new TemporaryFolder(); + @TempDir + public static File temporaryFolder; private FsSourceConnector connector; private Map connProps; - @Before - public void setup() throws IOException { + @BeforeEach + public void setup() { connector = new FsSourceConnector(); Map cfg = new HashMap() {{ put(FsSourceTaskConfig.FS_URIS, String.join(",", - temporaryFolder.getRoot().toURI() + File.separator + "dir1", - temporaryFolder.getRoot().toURI() + File.separator + "dir2", - temporaryFolder.getRoot().toURI() + File.separator + "dir3")); + temporaryFolder.toURI() + File.separator + "dir1", + temporaryFolder.toURI() + File.separator + "dir2", + temporaryFolder.toURI() + File.separator + "dir3")); put(FsSourceTaskConfig.TOPIC, "topic_test"); }}; connProps = new HashMap<>(cfg); } - @Test(expected = ConnectException.class) + @Test public void nullProperties() { - connector.start(null); + assertThrows(ConnectException.class, () -> connector.start(null)); } - @Test(expected = ConnectException.class) + @Test public void expectedFsUris() { Map testProps = new HashMap<>(connProps); testProps.remove(FsSourceTaskConfig.FS_URIS); - connector.start(testProps); + assertThrows(ConnectException.class, () -> connector.start(testProps)); } @Test - public void minimunConfig() { + public void minimumConfig() { connector.start(connProps); connector.stop(); } @@ -62,15 +60,15 @@ public void checkTaskClass() { assertEquals(FsSourceTask.class, connector.taskClass()); } - @Test(expected = ConnectException.class) + @Test public void configTasksWithoutStart() { - connector.taskConfigs(1); + assertThrows(ConnectException.class, () -> connector.taskConfigs(1)); } - @Test(expected = IllegalArgumentException.class) + @Test public void invalidConfigTaskNumber() { connector.start(connProps); - connector.taskConfigs(0); + assertThrows(IllegalArgumentException.class, () -> connector.taskConfigs(0)); } @Test @@ -80,7 +78,7 @@ public void configTasks() { IntStream.range(1, connProps.get(FsSourceTaskConfig.FS_URIS).split(",").length + 1) .forEach(index -> { List> taskConfigs = connector.taskConfigs(index); - assertTrue(taskConfigs.size() == (index > uris ? 
uris : index)); + assertEquals(taskConfigs.size(), Math.min(index, uris)); }); connector.stop(); } @@ -95,5 +93,4 @@ public void checkVersion() { public void checkDefaultConf() { assertNotNull(connector.config()); } - -} \ No newline at end of file +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java index 238db17..c8eec79 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java @@ -5,10 +5,10 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.*; import java.net.URI; @@ -17,7 +17,7 @@ import java.util.NoSuchElementException; import java.util.UUID; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public abstract class FileReaderTestBase { @@ -31,18 +31,18 @@ public abstract class FileReaderTestBase { protected static Map readerConfig; protected static FileReader reader; - @AfterClass + @AfterAll public static void tearDown() throws IOException { fs.close(); } - @Before + @BeforeEach public void openReader() throws Throwable { reader = getReader(fs, dataFile, readerConfig); assertEquals(reader.getFilePath(), dataFile); } - @After + @AfterEach public void closeReader() { try { reader.close(); @@ -51,30 +51,32 @@ public void closeReader() { } } - @Test(expected = IllegalArgumentException.class) - public void invalidArgs() throws Throwable { + @Test + public void invalidArgs() { try { readerClass.getConstructor(FileSystem.class, Path.class, Map.class).newInstance(null, null, null); } catch (Exception e) { - throw e.getCause(); + assertThrows(IllegalArgumentException.class, () -> { + throw e.getCause(); + }); } } - @Test(expected = FileNotFoundException.class) - public void fileDoesNotExist() throws Throwable { + @Test + public void fileDoesNotExist() { Path path = new Path(new Path(fsUri), UUID.randomUUID().toString()); - getReader(fs, path, readerConfig); + assertThrows(FileNotFoundException.class, () -> getReader(fs, path, readerConfig)); } - @Test(expected = IOException.class) + @Test public void emptyFile() throws Throwable { File tmp = File.createTempFile("test-", "." + getFileExtension()); Path path = new Path(new Path(fsUri), tmp.getName()); fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); + assertThrows(IOException.class, () -> getReader(fs, path, readerConfig)); } - @Test(expected = IOException.class) + @Test public void invalidFileFormat() throws Throwable { File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { @@ -82,7 +84,7 @@ public void invalidFileFormat() throws Throwable { } Path path = new Path(new Path(fsUri), tmp.getName()); fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); + assertThrows(IOException.class, () -> getReader(fs, path, readerConfig)); } @Test @@ -95,7 +97,7 @@ public void readAllData() { checkData(record, recordCount); recordCount++; } - assertEquals("The number of records in the file does not match", NUM_RECORDS, recordCount); + assertEquals(NUM_RECORDS, recordCount, () -> "The number of records in the file does not match"); } @Test @@ -120,26 +122,25 @@ public void seekFile() { reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1)); assertFalse(reader.hasNext()); - } - @Test(expected = RuntimeException.class) + @Test public void negativeSeek() { - reader.seek(getOffset(-1)); + assertThrows(RuntimeException.class, () -> reader.seek(getOffset(-1))); } - @Test(expected = NoSuchElementException.class) + @Test public void exceededSeek() { reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1)); assertFalse(reader.hasNext()); - reader.next(); + assertThrows(NoSuchElementException.class, () -> reader.next()); } - @Test(expected = RuntimeException.class) + @Test public void readFileAlreadyClosed() throws IOException { reader.close(); - assertFalse(reader.hasNext()); - reader.seek(getOffset(0)); + assertThrows(IllegalStateException.class, () -> reader.hasNext()); + assertThrows(IllegalStateException.class, () -> reader.next()); } protected final FileReader getReader(FileSystem fs, Path path, Map config) throws Throwable { diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java index f829ff1..b4ae9ae 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java @@ -13,8 +13,8 @@ import org.apache.avro.io.DatumWriter; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; @@ -23,8 +23,7 @@ import java.util.UUID; import java.util.stream.IntStream; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.*; public class AvroFileReaderTest extends HdfsFileReaderTestBase { @@ -35,7 +34,7 @@ public class AvroFileReaderTest extends HdfsFileReaderTestBase { private static Schema schema; - @BeforeClass + @BeforeAll public static void setUp() throws IOException { schema = new Schema.Parser().parse(AvroFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc")); readerClass = AgnosticFileReader.class; @@ -78,21 +77,28 @@ public void readerWithSchema() throws Throwable { readAllData(); } - @Test(expected = AvroTypeException.class) + @Test public void readerWithInvalidSchema() throws Throwable { Map cfg = new HashMap() {{ put(AvroFileReader.FILE_READER_AVRO_SCHEMA, Schema.create(Schema.Type.STRING).toString()); }}; reader = getReader(fs, dataFile, cfg); - readAllData(); + assertThrows(IllegalStateException.class, this::readAllData); + 
assertThrows(AvroTypeException.class, () -> { + try { + readAllData(); + } catch (Exception e) { + throw e.getCause(); + } + }); } - @Test(expected = SchemaParseException.class) - public void readerWithUnparseableSchema() throws Throwable { + @Test + public void readerWithUnparseableSchema() { Map cfg = new HashMap() {{ put(AvroFileReader.FILE_READER_AVRO_SCHEMA, "invalid schema"); }}; - getReader(fs, dataFile, cfg); + assertThrows(SchemaParseException.class, () -> getReader(fs, dataFile, cfg)); } @Override @@ -102,9 +108,11 @@ protected Offset getOffset(long offset) { @Override protected void checkData(Struct record, long index) { - assertEquals((int) (Integer) record.get(FIELD_INDEX), index); - assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); + assertAll( + () -> assertEquals((int) (Integer) record.get(FIELD_INDEX), index), + () -> assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")), + () -> assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")) + ); } @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java index 137eee1..f4b6c92 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java @@ -7,10 +7,10 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; @@ -20,7 +20,7 @@ import java.util.UUID; import java.util.stream.IntStream; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class DelimitedTextFileReaderTest extends HdfsFileReaderTestBase { @@ -30,7 +30,7 @@ public class DelimitedTextFileReaderTest extends HdfsFileReaderTestBase { private static final String FIELD_COLUMN4 = "column_4"; private static final String FILE_EXTENSION = "csv"; - @BeforeClass + @BeforeAll public static void setUp() throws IOException { readerClass = AgnosticFileReader.class; dataFile = createDataFile(true); @@ -61,24 +61,33 @@ private static Path createDataFile(boolean header) throws IOException { return path; } - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) + @Test public void emptyFile() throws Throwable { - super.emptyFile(); + File tmp = File.createTempFile("test-", "." + getFileExtension()); + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fs, path, readerConfig); } - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) + @Test public void invalidFileFormat() throws Throwable { - super.invalidFileFormat(); + File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); + try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { + writer.write("test"); + } + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fs, path, readerConfig); } - @Test(expected = IllegalArgumentException.class) - public void invaliConfigArgs() throws Throwable { + @Test + public void invaliConfigArgs() { try { readerClass.getConstructor(FileSystem.class, Path.class, Map.class).newInstance(fs, dataFile, new HashMap<>()); } catch (Exception e) { - throw e.getCause(); + assertThrows(IllegalArgumentException.class, () -> { + throw e.getCause(); + }); } } @@ -98,8 +107,7 @@ public void readAllDataWithoutHeader() throws Throwable { checkData(record, recordCount); recordCount++; } - assertEquals("The number of records in the file does not match", NUM_RECORDS, recordCount); - + assertEquals(NUM_RECORDS, recordCount, () -> "The number of records in the file does not match"); } @Test @@ -130,7 +138,7 @@ public void readAllDataWithMalformedRows() throws Throwable { assertEquals("custom_value", record.get(FIELD_COLUMN4)); recordCount++; } - assertEquals("The number of records in the file does not match", 2, recordCount); + assertEquals(2, recordCount, () -> "The number of records in the file does not match"); } @Test @@ -163,7 +171,6 @@ public void seekFileWithoutHeader() throws Throwable { reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1, false)); assertFalse(reader.hasNext()); - } @Test @@ -176,14 +183,14 @@ public void validFileEncoding() throws Throwable { getReader(fs, dataFile, cfg); } - @Test(expected = UnsupportedCharsetException.class) - public void invalidFileEncoding() throws Throwable { + @Test + public void invalidFileEncoding() { Map cfg = new HashMap() {{ put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); put(DelimitedTextFileReader.FILE_READER_DELIMITED_ENCODING, "invalid_charset"); }}; - getReader(fs, dataFile, cfg); + assertThrows(UnsupportedCharsetException.class, () -> getReader(fs, dataFile, cfg)); } @Override @@ -197,10 +204,12 @@ private Offset getOffset(long offset, boolean hasHeader) { @Override protected void checkData(Struct record, long index) { - assertTrue(record.get(FIELD_COLUMN1).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_COLUMN2).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_COLUMN3).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_COLUMN4).toString().startsWith(index + "_")); + assertAll( + () -> assertTrue(record.get(FIELD_COLUMN1).toString().startsWith(index + "_")), + () -> assertTrue(record.get(FIELD_COLUMN2).toString().startsWith(index + "_")), + () -> assertTrue(record.get(FIELD_COLUMN3).toString().startsWith(index + "_")), + () -> assertTrue(record.get(FIELD_COLUMN4).toString().startsWith(index + "_")) + ); } @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java index f4f5183..c60d0c3 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java @@ -4,8 +4,8 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import 
org.apache.hadoop.hdfs.MiniDFSCluster; -import org.junit.AfterClass; -import org.junit.BeforeClass; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import java.io.IOException; import java.net.URI; @@ -16,7 +16,7 @@ public abstract class HdfsFileReaderTestBase extends FileReaderTestBase { private static MiniDFSCluster cluster; - @BeforeClass + @BeforeAll public static void initFs() throws IOException { Configuration clusterConfig = new Configuration(); Path hdfsDir = Files.createTempDirectory("test-"); @@ -26,7 +26,7 @@ public static void initFs() throws IOException { fs = FileSystem.newInstance(fsUri, new Configuration()); } - @AfterClass + @AfterAll public static void finishFs() throws Exception { cluster.shutdown(true); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/JsonFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/JsonFileReaderTest.java index 4a82ede..188487a 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/JsonFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/JsonFileReaderTest.java @@ -9,9 +9,8 @@ import com.github.mmolimar.kafka.connect.fs.file.reader.JsonFileReader; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileWriter; @@ -23,7 +22,7 @@ import java.util.UUID; import java.util.stream.IntStream; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class JsonFileReaderTest extends HdfsFileReaderTestBase { @@ -37,7 +36,7 @@ public class JsonFileReaderTest extends HdfsFileReaderTestBase { private static final String FIELD_NULL = "nullField"; private static final String FILE_EXTENSION = "json"; - @BeforeClass + @BeforeAll public static void setUp() throws IOException { readerClass = AgnosticFileReader.class; dataFile = createDataFile(); @@ -87,10 +86,12 @@ private static Path createDataFile(int numRecords, boolean recordPerLine) throws return path; } - @Ignore(value = "This test does not apply for json files") - @Test(expected = IOException.class) + @Test public void emptyFile() throws Throwable { - super.emptyFile(); + File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fs, path, readerConfig); } @Test @@ -111,12 +112,12 @@ public void validFileEncoding() throws Throwable { readAllData(); } - @Test(expected = UnsupportedCharsetException.class) - public void invalidFileEncoding() throws Throwable { + @Test + public void invalidFileEncoding() { Map cfg = new HashMap() {{ put(JsonFileReader.FILE_READER_JSON_ENCODING, "invalid_charset"); }}; - getReader(fs, dataFile, cfg); + assertThrows(UnsupportedCharsetException.class, () -> getReader(fs, dataFile, cfg)); } @Test @@ -135,7 +136,7 @@ public void readDataWithRecordPerLineDisabled() throws Throwable { recordCount++; } reader.close(); - assertEquals("The number of records in the file does not match", 1, recordCount); + assertEquals(1, recordCount, () -> "The number of records in the file does not match"); } @Override @@ -145,22 +146,25 @@ protected Offset getOffset(long offset) { @Override protected void checkData(Struct record, long index) { - assertEquals((int) (Integer) record.get(FIELD_INTEGER), index); - assertEquals((long) (Long) record.get(FIELD_LONG), Long.MAX_VALUE); - assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")); - assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())); - assertEquals((Double) record.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0); - assertNull(record.get(FIELD_NULL)); - assertNotNull(record.schema().field(FIELD_NULL)); - assertEquals(record.get(FIELD_ARRAY), Arrays.asList("elm[" + index + "]", "elm[" + index + "]")); Struct subrecord = record.getStruct(FIELD_STRUCT); - assertEquals((int) (Integer) subrecord.get(FIELD_INTEGER), index); - assertEquals((long) (Long) subrecord.get(FIELD_LONG), Long.MAX_VALUE); - assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")); - assertTrue(Boolean.parseBoolean(subrecord.get(FIELD_BOOLEAN).toString())); - assertEquals((Double) subrecord.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0); - assertNull(subrecord.get(FIELD_NULL)); - assertNotNull(subrecord.schema().field(FIELD_NULL)); + assertAll( + () -> assertEquals((int) (Integer) record.get(FIELD_INTEGER), index), + () -> assertEquals((long) (Long) record.get(FIELD_LONG), Long.MAX_VALUE), + () -> assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")), + () -> assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())), + () -> assertEquals((Double) record.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0), + () -> assertNull(record.get(FIELD_NULL)), + () -> assertNotNull(record.schema().field(FIELD_NULL)), + () -> assertEquals(record.get(FIELD_ARRAY), Arrays.asList("elm[" + index + "]", "elm[" + index + "]")), + () -> assertEquals((int) (Integer) subrecord.get(FIELD_INTEGER), index), + () -> assertEquals((long) (Long) subrecord.get(FIELD_LONG), Long.MAX_VALUE), + () -> assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")), + () -> assertTrue(Boolean.parseBoolean(subrecord.get(FIELD_BOOLEAN).toString())), + () -> assertEquals((Double) subrecord.get(FIELD_DECIMAL), Double.parseDouble(index + "." 
+ index), 0), + () -> assertNull(subrecord.get(FIELD_NULL)), + () -> assertNotNull(subrecord.schema().field(FIELD_NULL)) + ); + } @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java index 5b69bb3..d08395d 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java @@ -18,18 +18,19 @@ import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.io.InvalidRecordException; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import java.io.BufferedWriter; import java.io.File; +import java.io.FileWriter; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.UUID; import java.util.stream.IntStream; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class ParquetFileReaderTest extends HdfsFileReaderTestBase { @@ -41,7 +42,7 @@ public class ParquetFileReaderTest extends HdfsFileReaderTestBase { private static Schema readerSchema; private static Schema projectionSchema; - @BeforeClass + @BeforeAll public static void setUp() throws IOException { readerClass = AgnosticFileReader.class; dataFile = createDataFile(); @@ -57,7 +58,6 @@ private static Path createDataFile() throws IOException { try (ParquetWriter writer = AvroParquetWriter.builder(new Path(parquetFile.toURI())) .withConf(fs.getConf()).withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withSchema(readerSchema).build()) { - IntStream.range(0, NUM_RECORDS).forEach(index -> { GenericRecord datum = new GenericData.Record(readerSchema); datum.put(FIELD_INDEX, index); @@ -76,16 +76,23 @@ private static Path createDataFile() throws IOException { return path; } - @Ignore(value = "This test does not apply for parquet files") - @Test(expected = IOException.class) + @Test public void emptyFile() throws Throwable { - super.emptyFile(); + File tmp = File.createTempFile("test-", "." + getFileExtension()); + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fs, path, readerConfig); } - @Ignore(value = "This test does not apply for parquet files") - @Test(expected = IOException.class) + @Test public void invalidFileFormat() throws Throwable { - super.invalidFileFormat(); + File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); + try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { + writer.write("test"); + } + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fs, path, readerConfig); } @Test @@ -97,7 +104,7 @@ public void readerWithSchema() throws Throwable { readAllData(); } - @Test(expected = DataException.class) + @Test public void readerWithProjection() throws Throwable { Map cfg = new HashMap() {{ put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, projectionSchema.toString()); @@ -111,10 +118,10 @@ public void readerWithProjection() throws Throwable { } reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); + assertThrows(DataException.class, this::readAllData); } - @Test(expected = InvalidRecordException.class) + @Test public void readerWithInvalidProjection() throws Throwable { Schema testSchema = SchemaBuilder.record("test_projection").namespace("test.avro") .fields() @@ -124,24 +131,25 @@ public void readerWithInvalidProjection() throws Throwable { put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, testSchema.toString()); }}; reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); + assertThrows(InvalidRecordException.class, this::readAllData); } - @Test(expected = AvroRuntimeException.class) + @Test public void readerWithInvalidSchema() throws Throwable { Map cfg = new HashMap() {{ put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, Schema.create(Schema.Type.STRING).toString()); }}; reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); + assertThrows(AvroRuntimeException.class, this::readAllData); } - @Test(expected = SchemaParseException.class) - public void readerWithUnparseableSchema() throws Throwable { + @Test + public void readerWithUnparseableSchema() { Map cfg = new HashMap() {{ put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, "invalid schema"); }}; - getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); + assertThrows(SchemaParseException.class, () -> + getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg)); } @Override @@ -160,5 +168,4 @@ protected void checkData(Struct record, long index) { protected String getFileExtension() { return FILE_EXTENSION; } - } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java index 23e1f8c..a4435bc 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java @@ -10,8 +10,8 @@ import org.apache.hadoop.io.Writable; import org.apache.hadoop.util.ReflectionUtils; import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; @@ -20,8 +20,7 @@ import java.util.UUID; import java.util.stream.IntStream; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.*; public class SequenceFileReaderTest extends HdfsFileReaderTestBase { @@ -29,7 +28,7 @@ public class SequenceFileReaderTest extends HdfsFileReaderTestBase { 
private static final String FIELD_NAME_VALUE = "value"; private static final String FILE_EXTENSION = "seq"; - @BeforeClass + @BeforeAll public static void setUp() throws IOException { readerClass = AgnosticFileReader.class; dataFile = createDataFile(); @@ -84,7 +83,7 @@ public void defaultFieldNames() throws Throwable { checkData(SequenceFileReader.FIELD_NAME_KEY_DEFAULT, SequenceFileReader.FIELD_NAME_VALUE_DEFAULT, record, recordCount); recordCount++; } - assertEquals("The number of records in the file does not match", NUM_RECORDS, recordCount); + assertEquals(NUM_RECORDS, recordCount, () -> "The number of records in the file does not match"); } @Override @@ -98,8 +97,10 @@ protected void checkData(Struct record, long index) { } private void checkData(String keyFieldName, String valueFieldName, Struct record, long index) { - assertEquals((int) (Integer) record.get(keyFieldName), index); - assertTrue(record.get(valueFieldName).toString().startsWith(index + "_")); + assertAll( + () -> assertEquals((int) (Integer) record.get(keyFieldName), index), + () -> assertTrue(record.get(valueFieldName).toString().startsWith(index + "_")) + ); } @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java index 9a063d3..8e932f2 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java @@ -2,13 +2,14 @@ import com.github.mmolimar.kafka.connect.fs.file.Offset; import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; +import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; @@ -18,14 +19,14 @@ import java.util.UUID; import java.util.stream.IntStream; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.*; public class TextFileReaderTest extends HdfsFileReaderTestBase { private static final String FIELD_NAME_VALUE = "custom_field_name"; private static final String FILE_EXTENSION = "txt"; - @BeforeClass + @BeforeAll public static void setUp() throws IOException { readerClass = AgnosticFileReader.class; dataFile = createDataFile(); @@ -53,16 +54,23 @@ private static Path createDataFile() throws IOException { return path; } - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) + @Test public void emptyFile() throws Throwable { - super.emptyFile(); + File tmp = File.createTempFile("test-", "." + getFileExtension()); + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fs, path, readerConfig); } - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) + @Test public void invalidFileFormat() throws Throwable { - super.invalidFileFormat(); + File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); + try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { + writer.write("test"); + } + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fs, path, readerConfig); } @Test @@ -75,13 +83,33 @@ public void validFileEncoding() throws Throwable { readAllData(); } - @Test(expected = UnsupportedCharsetException.class) - public void invalidFileEncoding() throws Throwable { + @Test + public void invalidFileEncoding() { Map cfg = new HashMap() {{ put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); put(TextFileReader.FILE_READER_TEXT_ENCODING, "invalid_charset"); }}; - getReader(fs, dataFile, cfg); + assertThrows(UnsupportedCharsetException.class, () -> getReader(fs, dataFile, cfg)); + } + + @Test + public void readDataWithRecordPerLineDisabled() throws Throwable { + Path file = createDataFile(); + FileReader reader = getReader(fs, file, new HashMap() {{ + put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, "false"); + }}); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals(1, recordCount, () -> "The number of records in the file does not match"); } @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/AvroFileReaderTest.java index 2dc0454..5c707e1 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/AvroFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/AvroFileReaderTest.java @@ -13,8 +13,8 @@ import org.apache.avro.io.DatumWriter; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; @@ -23,8 +23,7 @@ import java.util.UUID; import java.util.stream.IntStream; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.*; public class AvroFileReaderTest extends LocalFileReaderTestBase { @@ -35,7 +34,7 @@ public class AvroFileReaderTest extends LocalFileReaderTestBase { private static Schema schema; - @BeforeClass + @BeforeAll public static void setUp() throws IOException { schema = new Schema.Parser().parse(AvroFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc")); readerClass = AgnosticFileReader.class; @@ -81,23 +80,30 @@ public void readerWithSchema() throws Throwable { readAllData(); } - @Test(expected = AvroTypeException.class) + @Test public void readerWithInvalidSchema() throws Throwable { Map cfg = new HashMap() {{ put(AvroFileReader.FILE_READER_AVRO_SCHEMA, Schema.create(Schema.Type.STRING).toString()); put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, getFileExtension()); }}; reader = getReader(fs, dataFile, cfg); - readAllData(); + assertThrows(IllegalStateException.class, this::readAllData); + assertThrows(AvroTypeException.class, () -> { + try { + readAllData(); + } catch (Exception e) { + throw e.getCause(); + } + }); } - @Test(expected = SchemaParseException.class) - public void 
readerWithUnparseableSchema() throws Throwable { + @Test + public void readerWithUnparseableSchema() { Map cfg = new HashMap() {{ put(AvroFileReader.FILE_READER_AVRO_SCHEMA, "invalid schema"); put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, getFileExtension()); }}; - getReader(fs, dataFile, cfg); + assertThrows(SchemaParseException.class, () -> getReader(fs, dataFile, cfg)); } @Override @@ -107,9 +113,11 @@ protected Offset getOffset(long offset) { @Override protected void checkData(Struct record, long index) { - assertEquals((int) (Integer) record.get(FIELD_INDEX), index); - assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); + assertAll( + () -> assertEquals((int) (Integer) record.get(FIELD_INDEX), index), + () -> assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")), + () -> assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")) + ); } @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java index 679ef45..91f08e9 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java @@ -7,10 +7,10 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; @@ -20,7 +20,7 @@ import java.util.UUID; import java.util.stream.IntStream; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class DelimitedTextFileReaderTest extends LocalFileReaderTestBase { @@ -30,7 +30,7 @@ public class DelimitedTextFileReaderTest extends LocalFileReaderTestBase { private static final String FIELD_COLUMN4 = "column_4"; private static final String FILE_EXTENSION = "tcsv"; - @BeforeClass + @BeforeAll public static void setUp() throws IOException { readerClass = AgnosticFileReader.class; dataFile = createDataFile(true); @@ -62,27 +62,36 @@ private static Path createDataFile(boolean header) throws IOException { return path; } - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) + @Test public void emptyFile() throws Throwable { - super.emptyFile(); + File tmp = File.createTempFile("test-", "." + getFileExtension()); + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fs, path, readerConfig); } - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) + @Test public void invalidFileFormat() throws Throwable { - super.invalidFileFormat(); + File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); + try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { + writer.write("test"); + } + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fs, path, readerConfig); } - @Test(expected = IllegalArgumentException.class) - public void invaliConfigArgs() throws Throwable { + @Test + public void invaliConfigArgs() { try { readerClass.getConstructor(FileSystem.class, Path.class, Map.class).newInstance(fs, dataFile, new HashMap() {{ put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, FILE_EXTENSION); }}); } catch (Exception e) { - throw e.getCause(); + assertThrows(IllegalArgumentException.class, () -> { + throw e.getCause(); + }); } } @@ -103,7 +112,7 @@ public void readAllDataWithoutHeader() throws Throwable { checkData(record, recordCount); recordCount++; } - assertEquals("The number of records in the file does not match", NUM_RECORDS, recordCount); + assertEquals(NUM_RECORDS, recordCount, () -> "The number of records in the file does not match"); } @Test @@ -129,13 +138,15 @@ public void readAllDataWithMalformedRows() throws Throwable { int recordCount = 0; while (reader.hasNext()) { Struct record = reader.next(); - assertEquals("dummy", record.get(FIELD_COLUMN1)); - assertEquals("custom_value", record.get(FIELD_COLUMN2)); - assertEquals("custom_value", record.get(FIELD_COLUMN3)); - assertEquals("custom_value", record.get(FIELD_COLUMN4)); + assertAll( + () -> assertEquals("dummy", record.get(FIELD_COLUMN1)), + () -> assertEquals("custom_value", record.get(FIELD_COLUMN2)), + () -> assertEquals("custom_value", record.get(FIELD_COLUMN3)), + () -> assertEquals("custom_value", record.get(FIELD_COLUMN4)) + ); recordCount++; } - assertEquals("The number of records in the file does not match", 2, recordCount); + assertEquals(2, recordCount, () -> "The number of records in the file does not match"); } @Test @@ -169,7 +180,6 @@ public void seekFileWithoutHeader() throws Throwable { reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1, false)); assertFalse(reader.hasNext()); - } @Test @@ -183,15 +193,15 @@ public void validFileEncoding() throws Throwable { getReader(fs, dataFile, cfg); } - @Test(expected = UnsupportedCharsetException.class) - public void invalidFileEncoding() throws Throwable { + @Test + public void invalidFileEncoding() { Map cfg = new HashMap() {{ put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); put(DelimitedTextFileReader.FILE_READER_DELIMITED_ENCODING, "invalid_charset"); put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, getFileExtension()); }}; - getReader(fs, dataFile, cfg); + assertThrows(UnsupportedCharsetException.class, () -> getReader(fs, dataFile, cfg)); } @Override @@ -205,10 +215,12 @@ private Offset getOffset(long offset, boolean hasHeader) { @Override protected void checkData(Struct record, long index) { - assertTrue(record.get(FIELD_COLUMN1).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_COLUMN2).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_COLUMN3).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_COLUMN4).toString().startsWith(index + "_")); + assertAll( + () -> assertTrue(record.get(FIELD_COLUMN1).toString().startsWith(index + "_")), + () -> assertTrue(record.get(FIELD_COLUMN2).toString().startsWith(index + "_")), + () -> 
assertTrue(record.get(FIELD_COLUMN3).toString().startsWith(index + "_")), + () -> assertTrue(record.get(FIELD_COLUMN4).toString().startsWith(index + "_")) + ); } @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/JsonFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/JsonFileReaderTest.java index 16bf3eb..131e427 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/JsonFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/JsonFileReaderTest.java @@ -9,9 +9,8 @@ import com.github.mmolimar.kafka.connect.fs.file.reader.JsonFileReader; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.FileWriter; @@ -23,7 +22,7 @@ import java.util.UUID; import java.util.stream.IntStream; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class JsonFileReaderTest extends LocalFileReaderTestBase { @@ -37,7 +36,7 @@ public class JsonFileReaderTest extends LocalFileReaderTestBase { private static final String FIELD_NULL = "nullField"; private static final String FILE_EXTENSION = "jsn"; - @BeforeClass + @BeforeAll public static void setUp() throws IOException { readerClass = AgnosticFileReader.class; dataFile = createDataFile(); @@ -87,10 +86,12 @@ private static Path createDataFile(int numRecords, boolean recordPerLine) throws return path; } - @Ignore(value = "This test does not apply for json files") - @Test(expected = IOException.class) + @Test public void emptyFile() throws Throwable { - super.emptyFile(); + File tmp = File.createTempFile("test-", "." + getFileExtension()); + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fs, path, readerConfig); } @Test @@ -112,13 +113,13 @@ public void validFileEncoding() throws Throwable { readAllData(); } - @Test(expected = UnsupportedCharsetException.class) - public void invalidFileEncoding() throws Throwable { + @Test + public void invalidFileEncoding() { Map cfg = new HashMap() {{ put(JsonFileReader.FILE_READER_JSON_ENCODING, "invalid_charset"); put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_JSON, getFileExtension()); }}; - getReader(fs, dataFile, cfg); + assertThrows(UnsupportedCharsetException.class, () -> getReader(fs, dataFile, cfg)); } @Test @@ -138,7 +139,7 @@ public void readDataWithRecordPerLineDisabled() throws Throwable { recordCount++; } reader.close(); - assertEquals("The number of records in the file does not match", 1, recordCount); + assertEquals(1, recordCount, () -> "The number of records in the file does not match"); } @Override @@ -148,27 +149,29 @@ protected Offset getOffset(long offset) { @Override protected void checkData(Struct record, long index) { - assertEquals((int) (Integer) record.get(FIELD_INTEGER), index); - assertEquals((long) (Long) record.get(FIELD_LONG), Long.MAX_VALUE); - assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")); - assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())); - assertEquals((Double) record.get(FIELD_DECIMAL), Double.parseDouble(index + "." 
+ index), 0); - assertNull(record.get(FIELD_NULL)); - assertNotNull(record.schema().field(FIELD_NULL)); - assertEquals(record.get(FIELD_ARRAY), Arrays.asList("elm[" + index + "]", "elm[" + index + "]")); Struct subrecord = record.getStruct(FIELD_STRUCT); - assertEquals((int) (Integer) subrecord.get(FIELD_INTEGER), index); - assertEquals((long) (Long) subrecord.get(FIELD_LONG), Long.MAX_VALUE); - assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")); - assertTrue(Boolean.parseBoolean(subrecord.get(FIELD_BOOLEAN).toString())); - assertEquals((Double) subrecord.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0); - assertNull(subrecord.get(FIELD_NULL)); - assertNotNull(subrecord.schema().field(FIELD_NULL)); + assertAll( + () -> assertEquals((int) (Integer) record.get(FIELD_INTEGER), index), + () -> assertEquals((long) (Long) record.get(FIELD_LONG), Long.MAX_VALUE), + () -> assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")), + () -> assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())), + () -> assertEquals((Double) record.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0), + () -> assertNull(record.get(FIELD_NULL)), + () -> assertNotNull(record.schema().field(FIELD_NULL)), + () -> assertEquals(record.get(FIELD_ARRAY), Arrays.asList("elm[" + index + "]", "elm[" + index + "]")), + () -> assertEquals((int) (Integer) subrecord.get(FIELD_INTEGER), index), + () -> assertEquals((long) (Long) subrecord.get(FIELD_LONG), Long.MAX_VALUE), + () -> assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")), + () -> assertTrue(Boolean.parseBoolean(subrecord.get(FIELD_BOOLEAN).toString())), + () -> assertEquals((Double) subrecord.get(FIELD_DECIMAL), Double.parseDouble(index + "." 
+ index), 0), + () -> assertNull(subrecord.get(FIELD_NULL)), + () -> assertNotNull(subrecord.schema().field(FIELD_NULL)) + ); + } @Override protected String getFileExtension() { return FILE_EXTENSION; } - } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/LocalFileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/LocalFileReaderTestBase.java index 6589e92..f08bff7 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/LocalFileReaderTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/LocalFileReaderTestBase.java @@ -4,8 +4,8 @@ import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; -import org.junit.AfterClass; -import org.junit.BeforeClass; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import java.io.IOException; import java.nio.file.Files; @@ -15,16 +15,15 @@ public abstract class LocalFileReaderTestBase extends FileReaderTestBase { private static Path localDir; - @BeforeClass + @BeforeAll public static void initFs() throws IOException { localDir = Files.createTempDirectory("test-"); fsUri = localDir.toUri(); fs = FileSystem.newInstance(fsUri, new Configuration()); } - @AfterClass + @AfterAll public static void finishFs() throws IOException { FileUtils.deleteDirectory(localDir.toFile()); } - } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java index da23677..41060c6 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java @@ -18,18 +18,19 @@ import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.parquet.hadoop.ParquetWriter; import org.apache.parquet.io.InvalidRecordException; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import java.io.BufferedWriter; import java.io.File; +import java.io.FileWriter; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.UUID; import java.util.stream.IntStream; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class ParquetFileReaderTest extends LocalFileReaderTestBase { @@ -41,7 +42,7 @@ public class ParquetFileReaderTest extends LocalFileReaderTestBase { private static Schema readerSchema; private static Schema projectionSchema; - @BeforeClass + @BeforeAll public static void setUp() throws IOException { readerClass = AgnosticFileReader.class; dataFile = createDataFile(); @@ -77,16 +78,23 @@ private static Path createDataFile() throws IOException { return path; } - @Ignore(value = "This test does not apply for parquet files") - @Test(expected = IOException.class) + @Test public void emptyFile() throws Throwable { - super.emptyFile(); + File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fs, path, readerConfig); } - @Ignore(value = "This test does not apply for parquet files") - @Test(expected = IOException.class) + @Test public void invalidFileFormat() throws Throwable { - super.invalidFileFormat(); + File tmp = File.createTempFile("test-", "." + getFileExtension()); + try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { + writer.write("test"); + } + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fs, path, readerConfig); } @Test @@ -99,7 +107,7 @@ public void readerWithSchema() throws Throwable { readAllData(); } - @Test(expected = DataException.class) + @Test public void readerWithProjection() throws Throwable { Map cfg = new HashMap() {{ put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, projectionSchema.toString()); @@ -114,10 +122,10 @@ public void readerWithProjection() throws Throwable { } reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); + assertThrows(DataException.class, this::readAllData); } - @Test(expected = InvalidRecordException.class) + @Test public void readerWithInvalidProjection() throws Throwable { Schema testSchema = SchemaBuilder.record("test_projection").namespace("test.avro") .fields() @@ -128,26 +136,27 @@ public void readerWithInvalidProjection() throws Throwable { put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); }}; reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); + assertThrows(InvalidRecordException.class, this::readAllData); } - @Test(expected = AvroRuntimeException.class) + @Test public void readerWithInvalidSchema() throws Throwable { Map cfg = new HashMap() {{ put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, Schema.create(Schema.Type.STRING).toString()); put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); }}; reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); + assertThrows(AvroRuntimeException.class, this::readAllData); } - @Test(expected = SchemaParseException.class) - public void readerWithUnparseableSchema() throws Throwable { + @Test + public void readerWithUnparseableSchema() { Map cfg = new HashMap() {{ put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, "invalid schema"); put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); }}; - getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); + assertThrows(SchemaParseException.class, () -> + getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg)); } @Override @@ -166,5 +175,4 @@ protected void checkData(Struct record, long index) { protected String getFileExtension() { return FILE_EXTENSION; } - } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/SequenceFileReaderTest.java index 48c4c4e..411f647 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/SequenceFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/SequenceFileReaderTest.java @@ -10,8 +10,8 @@ import org.apache.hadoop.io.Writable; import org.apache.hadoop.util.ReflectionUtils; 
import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.File; import java.io.IOException; @@ -20,8 +20,7 @@ import java.util.UUID; import java.util.stream.IntStream; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.*; public class SequenceFileReaderTest extends LocalFileReaderTestBase { @@ -29,7 +28,7 @@ public class SequenceFileReaderTest extends LocalFileReaderTestBase { private static final String FIELD_NAME_VALUE = "custom_field_name"; private static final String FILE_EXTENSION = "sq"; - @BeforeClass + @BeforeAll public static void setUp() throws IOException { readerClass = AgnosticFileReader.class; dataFile = createDataFile(); @@ -87,7 +86,7 @@ public void defaultFieldNames() throws Throwable { checkData(SequenceFileReader.FIELD_NAME_KEY_DEFAULT, SequenceFileReader.FIELD_NAME_VALUE_DEFAULT, record, recordCount); recordCount++; } - assertEquals("The number of records in the file does not match", NUM_RECORDS, recordCount); + assertEquals(NUM_RECORDS, recordCount, () -> "The number of records in the file does not match"); } @Override @@ -101,8 +100,10 @@ protected void checkData(Struct record, long index) { } private void checkData(String keyFieldName, String valueFieldName, Struct record, long index) { - assertEquals((int) (Integer) record.get(keyFieldName), index); - assertTrue(record.get(valueFieldName).toString().startsWith(index + "_")); + assertAll( + () -> assertEquals((int) (Integer) record.get(keyFieldName), index), + () -> assertTrue(record.get(valueFieldName).toString().startsWith(index + "_")) + ); } @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java index 7de8414..a605b9f 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java @@ -6,10 +6,10 @@ import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; -import org.junit.BeforeClass; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; @@ -19,15 +19,14 @@ import java.util.UUID; import java.util.stream.IntStream; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.*; public class TextFileReaderTest extends LocalFileReaderTestBase { private static final String FIELD_NAME_VALUE = "custom_field_name"; private static final String FILE_EXTENSION = "txt"; - @BeforeClass + @BeforeAll public static void setUp() throws IOException { readerClass = AgnosticFileReader.class; dataFile = createDataFile(); @@ -55,16 +54,23 @@ private static Path createDataFile() throws IOException { return path; } - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) + @Test public void emptyFile() throws Throwable { - super.emptyFile(); + File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fs, path, readerConfig); } - @Ignore(value = "This test does not apply for txt files") - @Test(expected = IOException.class) + @Test public void invalidFileFormat() throws Throwable { - super.invalidFileFormat(); + File tmp = File.createTempFile("test-", "." + getFileExtension()); + try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { + writer.write("test"); + } + Path path = new Path(new Path(fsUri), tmp.getName()); + fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fs, path, readerConfig); } @Test @@ -77,13 +83,13 @@ public void validFileEncoding() throws Throwable { readAllData(); } - @Test(expected = UnsupportedCharsetException.class) - public void invalidFileEncoding() throws Throwable { + @Test + public void invalidFileEncoding() { Map cfg = new HashMap() {{ put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); put(TextFileReader.FILE_READER_TEXT_ENCODING, "invalid_charset"); }}; - getReader(fs, dataFile, cfg); + assertThrows(UnsupportedCharsetException.class, () -> getReader(fs, dataFile, cfg)); } @Test @@ -103,7 +109,7 @@ public void readDataWithRecordPerLineDisabled() throws Throwable { recordCount++; } reader.close(); - assertEquals("The number of records in the file does not match", 1, recordCount); + assertEquals(1, recordCount, () -> "The number of records in the file does not match"); } @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java index 4f2bc24..8c9eba9 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java @@ -3,27 +3,23 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import com.github.mmolimar.kafka.connect.fs.file.FileMetadata; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; -import org.apache.commons.collections.map.HashedMap; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.common.config.ConfigException; import org.apache.kafka.connect.errors.IllegalWorkerStateException; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import java.io.FileNotFoundException; import java.io.IOException; import java.net.URI; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.NoSuchElementException; +import java.util.*; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public abstract class PolicyTestBase { @@ -33,19 +29,19 @@ public abstract class PolicyTestBase { protected static FsSourceTaskConfig taskConfig; protected static URI fsUri; - @AfterClass + @AfterAll public static void tearDown() throws Exception { policy.close(); fs.close(); } - @Before + @BeforeEach public void initPolicy() throws Throwable { - policy = ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - taskConfig); + policy = ReflectionUtils.makePolicy( + (Class) 
taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), taskConfig); } - @After + @AfterEach public void cleanDirs() throws IOException { for (Path dir : directories) { fs.delete(dir, true); @@ -54,15 +50,17 @@ public void cleanDirs() throws IOException { policy.close(); } - @Test(expected = IllegalArgumentException.class) - public void invalidArgs() throws Exception { - taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS).getConstructor(taskConfig.getClass()).newInstance(null); + @Test + public void invalidArgs() { + assertThrows(IllegalArgumentException.class, () -> taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS) + .getConstructor(taskConfig.getClass()).newInstance(null)); } - @Test(expected = ConfigException.class) - public void invalidConfig() throws Throwable { - ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - new FsSourceTaskConfig(new HashedMap())); + @Test + public void invalidConfig() { + assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), + new FsSourceTaskConfig(new HashMap<>()))); } @Test @@ -72,13 +70,13 @@ public void interruptPolicy() throws Throwable { assertTrue(policy.hasEnded()); } - @Test(expected = FileNotFoundException.class) + @Test public void invalidDirectory() throws IOException { for (Path dir : directories) { fs.delete(dir, true); } try { - policy.execute(); + assertThrows(FileNotFoundException.class, () -> policy.execute()); } finally { for (Path dir : directories) { fs.mkdirs(dir); @@ -86,19 +84,19 @@ public void invalidDirectory() throws IOException { } } - @Test(expected = NoSuchElementException.class) + @Test public void listEmptyDirectories() throws IOException { Iterator it = policy.execute(); assertFalse(it.hasNext()); - it.next(); + assertThrows(NoSuchElementException.class, it::next); } @Test public void oneFilePerFs() throws IOException, InterruptedException { for (Path dir : directories) { - fs.createNewFile(new Path(dir, String.valueOf(System.nanoTime() + ".txt"))); + fs.createNewFile(new Path(dir, System.nanoTime() + ".txt")); //this file does not match the regexp - fs.createNewFile(new Path(dir, String.valueOf(System.nanoTime()) + ".invalid")); + fs.createNewFile(new Path(dir, System.nanoTime() + ".invalid")); } //we wait till FS has registered the files Thread.sleep(500); @@ -116,9 +114,9 @@ public void recursiveDirectory() throws IOException, InterruptedException { for (Path dir : directories) { Path tmpDir = new Path(dir, String.valueOf(System.nanoTime())); fs.mkdirs(tmpDir); - fs.createNewFile(new Path(tmpDir, String.valueOf(System.nanoTime() + ".txt"))); + fs.createNewFile(new Path(tmpDir, System.nanoTime() + ".txt")); //this file does not match the regexp - fs.createNewFile(new Path(tmpDir, String.valueOf(System.nanoTime()) + ".invalid")); + fs.createNewFile(new Path(tmpDir, System.nanoTime() + ".invalid")); } //we wait till FS has registered the files Thread.sleep(500); @@ -137,11 +135,11 @@ public void hasEnded() throws IOException { assertTrue(policy.hasEnded()); } - @Test(expected = IllegalWorkerStateException.class) + @Test public void execPolicyAlreadyEnded() throws IOException { policy.execute(); assertTrue(policy.hasEnded()); - policy.execute(); + assertThrows(IllegalWorkerStateException.class, () -> policy.execute()); } @Test @@ -151,8 +149,8 @@ public void dynamicURIs() throws Throwable { Map originals = taskConfig.originalsStrings(); originals.put(FsSourceTaskConfig.FS_URIS, dynamic.toString()); 
FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - policy = ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - cfg); + policy = ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); assertEquals(1, policy.getURIs().size()); LocalDateTime dateTime = LocalDateTime.now(); @@ -168,18 +166,16 @@ public void dynamicURIs() throws Throwable { formatter = DateTimeFormatter.ofPattern("W"); uri.append(dateTime.format(formatter)); assertTrue(policy.getURIs().get(0).endsWith(uri.toString())); - } - @Test(expected = IllegalArgumentException.class) + @Test public void invalidDynamicURIs() throws Throwable { Path dynamic = new Path(fsUri.toString(), "${yyyy}/${MM}/${mmmmmmm}"); fs.create(dynamic); Map originals = taskConfig.originalsStrings(); originals.put(FsSourceTaskConfig.FS_URIS, dynamic.toString()); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - policy = ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - cfg); + assertThrows(IllegalArgumentException.class, () -> ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); } - } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java index 0c32830..dca39be 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java @@ -5,8 +5,8 @@ import com.github.mmolimar.kafka.connect.fs.policy.HdfsFileWatcherPolicy; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.errors.IllegalWorkerStateException; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.ArrayList; @@ -14,12 +14,11 @@ import java.util.Map; import java.util.UUID; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.*; public class HdfsFileWatcherPolicyTest extends HdfsPolicyTestBase { - @BeforeClass + @BeforeAll public static void setUp() throws IOException { directories = new ArrayList() {{ add(new Path(fsUri.toString(), UUID.randomUUID().toString())); @@ -38,7 +37,7 @@ public static void setUp() throws IOException { put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "test"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); }}; taskConfig = new FsSourceTaskConfig(cfg); } @@ -47,7 +46,16 @@ public static void setUp() throws IOException { @Test @Override public void invalidDirectory() throws IOException { - super.invalidDirectory(); + for (Path dir : directories) { + fs.delete(dir, true); + } + try { + policy.execute(); + } finally { + for (Path dir : directories) { + fs.mkdirs(dir); + } + } } //This policy never ends at least all watchers die @@ -61,13 +69,13 @@ public void hasEnded() throws IOException { } //This policy never ends. 
We have to interrupt it - @Test(expected = IllegalWorkerStateException.class) + @Test @Override public void execPolicyAlreadyEnded() throws IOException { policy.execute(); assertFalse(policy.hasEnded()); policy.interrupt(); assertTrue(policy.hasEnded()); - policy.execute(); + assertThrows(IllegalWorkerStateException.class, () -> policy.execute()); } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsPolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsPolicyTestBase.java index d046d0b..522d1de 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsPolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsPolicyTestBase.java @@ -4,8 +4,8 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.junit.AfterClass; -import org.junit.BeforeClass; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import java.io.IOException; import java.net.URI; @@ -16,7 +16,7 @@ public abstract class HdfsPolicyTestBase extends PolicyTestBase { private static MiniDFSCluster cluster; - @BeforeClass + @BeforeAll public static void initFs() throws IOException { Configuration clusterConfig = new Configuration(); Path hdfsDir = Files.createTempDirectory("test-"); @@ -26,8 +26,8 @@ public static void initFs() throws IOException { fs = FileSystem.newInstance(fsUri, new Configuration()); } - @AfterClass - public static void finishFs() throws Exception { + @AfterAll + public static void finishFs() { cluster.shutdown(true); } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java index 33ebe28..5e0eb7f 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java @@ -4,7 +4,7 @@ import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; import com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy; import org.apache.hadoop.fs.Path; -import org.junit.BeforeClass; +import org.junit.jupiter.api.BeforeAll; import java.io.IOException; import java.util.ArrayList; @@ -14,7 +14,7 @@ public class SimplePolicyTest extends HdfsPolicyTestBase { - @BeforeClass + @BeforeAll public static void setUp() throws IOException { directories = new ArrayList() {{ add(new Path(fsUri.toString(), UUID.randomUUID().toString())); @@ -33,7 +33,7 @@ public static void setUp() throws IOException { put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "test"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); }}; taskConfig = new FsSourceTaskConfig(cfg); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java index 77d85a6..d47faae 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java @@ -7,8 +7,8 @@ import 
com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import org.apache.hadoop.fs.Path; import org.apache.kafka.common.config.ConfigException; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.ArrayList; @@ -16,12 +16,11 @@ import java.util.Map; import java.util.UUID; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.*; public class SleepyPolicyTest extends HdfsPolicyTestBase { - @BeforeClass + @BeforeAll public static void setUp() throws IOException { directories = new ArrayList() {{ add(new Path(fsUri.toString(), UUID.randomUUID().toString())); @@ -40,35 +39,38 @@ public static void setUp() throws IOException { put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "test"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "100"); put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "1"); }}; taskConfig = new FsSourceTaskConfig(cfg); } - @Test(expected = ConfigException.class) - public void invalidSleepTime() throws Throwable { + @Test + public void invalidSleepTime() { Map originals = taskConfig.originalsStrings(); originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); } - @Test(expected = ConfigException.class) - public void invalidMaxExecs() throws Throwable { + @Test + public void invalidMaxExecs() { Map originals = taskConfig.originalsStrings(); originals.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); } - @Test(expected = ConfigException.class) - public void invalidSleepFraction() throws Throwable { + @Test + public void invalidSleepFraction() { Map originals = taskConfig.originalsStrings(); originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_FRACTION, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); } @Test @@ -78,8 +80,8 @@ public void sleepExecution() throws Throwable { tConfig.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "2"); FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - policy = ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - sleepConfig); + policy = ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); assertFalse(policy.hasEnded()); policy.execute(); assertFalse(policy.hasEnded()); @@ -94,8 +96,8 @@ public void defaultExecutions() 
throws Throwable { tConfig.remove(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS); FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - policy = ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - sleepConfig); + policy = ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); //it never ends for (int i = 0; i < 100; i++) { diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/LocalPolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/LocalPolicyTestBase.java index 6aa4cd5..8c12b3a 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/LocalPolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/LocalPolicyTestBase.java @@ -4,8 +4,8 @@ import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; -import org.junit.AfterClass; -import org.junit.BeforeClass; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import java.io.IOException; import java.nio.file.Files; @@ -15,14 +15,14 @@ public abstract class LocalPolicyTestBase extends PolicyTestBase { private static Path localDir; - @BeforeClass + @BeforeAll public static void initFs() throws IOException { localDir = Files.createTempDirectory("test-"); fsUri = localDir.toUri(); fs = FileSystem.newInstance(fsUri, new Configuration()); } - @AfterClass + @AfterAll public static void finishFs() throws IOException { FileUtils.deleteDirectory(localDir.toFile()); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SimplePolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SimplePolicyTest.java index c8a221a..2de53e6 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SimplePolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SimplePolicyTest.java @@ -4,7 +4,7 @@ import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; import com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy; import org.apache.hadoop.fs.Path; -import org.junit.BeforeClass; +import org.junit.jupiter.api.BeforeAll; import java.io.IOException; import java.util.ArrayList; @@ -14,7 +14,7 @@ public class SimplePolicyTest extends LocalPolicyTestBase { - @BeforeClass + @BeforeAll public static void setUp() throws IOException { directories = new ArrayList() {{ add(new Path(fsUri.toString(), UUID.randomUUID().toString())); @@ -33,7 +33,7 @@ public static void setUp() throws IOException { put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "test"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test/"); }}; taskConfig = new FsSourceTaskConfig(cfg); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SleepyPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SleepyPolicyTest.java index be6c58b..93c9f09 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SleepyPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SleepyPolicyTest.java @@ -7,8 +7,8 @@ import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import 
org.apache.hadoop.fs.Path; import org.apache.kafka.common.config.ConfigException; -import org.junit.BeforeClass; -import org.junit.Test; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; import java.io.IOException; import java.util.ArrayList; @@ -16,12 +16,11 @@ import java.util.Map; import java.util.UUID; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.*; public class SleepyPolicyTest extends LocalPolicyTestBase { - @BeforeClass + @BeforeAll public static void setUp() throws IOException { directories = new ArrayList() {{ add(new Path(fsUri.toString(), UUID.randomUUID().toString())); @@ -40,35 +39,38 @@ public static void setUp() throws IOException { put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "test"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "100"); put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "1"); }}; taskConfig = new FsSourceTaskConfig(cfg); } - @Test(expected = ConfigException.class) - public void invalidSleepTime() throws Throwable { + @Test + public void invalidSleepTime() { Map originals = taskConfig.originalsStrings(); originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); } - @Test(expected = ConfigException.class) - public void invalidMaxExecs() throws Throwable { + @Test + public void invalidMaxExecs() { Map originals = taskConfig.originalsStrings(); originals.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); } - @Test(expected = ConfigException.class) - public void invalidSleepFraction() throws Throwable { + @Test + public void invalidSleepFraction() { Map originals = taskConfig.originalsStrings(); originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_FRACTION, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); } @Test @@ -78,8 +80,8 @@ public void sleepExecution() throws Throwable { tConfig.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "2"); FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - policy = ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - sleepConfig); + policy = ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); assertFalse(policy.hasEnded()); policy.execute(); assertFalse(policy.hasEnded()); @@ -94,8 +96,8 @@ public void defaultExecutions() throws Throwable { 
tConfig.remove(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS); FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - policy = ReflectionUtils.makePolicy((Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), - sleepConfig); + policy = ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); //it never ends for (int i = 0; i < 100; i++) { diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskConfigTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskConfigTest.java index 6b0e619..5506baf 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskConfigTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskConfigTest.java @@ -3,10 +3,10 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceConnectorConfig; import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import org.apache.kafka.common.config.ConfigDef; -import org.junit.Test; +import org.junit.jupiter.api.Test; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class FsSourceTaskConfigTest { @@ -14,9 +14,9 @@ public class FsSourceTaskConfigTest { public void checkDocumentation() { ConfigDef config = FsSourceTaskConfig.conf(); config.names().forEach(key -> { - assertFalse("Property " + key + " should be documented", - config.configKeys().get(key).documentation == null || - "".equals(config.configKeys().get(key).documentation.trim())); + assertFalse(config.configKeys().get(key).documentation == null || + "".equals(config.configKeys().get(key).documentation.trim()), + () -> "Property " + key + " should be documented"); }); } @@ -24,4 +24,4 @@ public void checkDocumentation() { public void toRst() { assertNotNull(FsSourceConnectorConfig.conf().toRst()); } -} \ No newline at end of file +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java index f0fbacc..6d4a823 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java @@ -5,88 +5,86 @@ import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; import com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy; import org.apache.kafka.connect.errors.ConnectException; -import org.junit.Before; -import org.junit.ClassRule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; import java.io.File; -import java.io.IOException; import java.util.HashMap; import java.util.Map; -import static org.junit.Assert.*; +import static org.junit.jupiter.api.Assertions.*; public class FsSourceTaskTest { - @ClassRule - public static final TemporaryFolder temporaryFolder = new TemporaryFolder(); + @TempDir + public static File temporaryFolder; private FsSourceTask task; private Map taskConfig; - @Before - public void setup() throws IOException { + @BeforeEach + public void setup() { task = new FsSourceTask(); taskConfig = new HashMap() {{ put(FsSourceTaskConfig.FS_URIS, String.join(",", - temporaryFolder.getRoot().toURI() + File.separator + "dir1", - temporaryFolder.getRoot().toURI() + 
File.separator + "dir2", - temporaryFolder.getRoot().toURI() + File.separator + "dir3")); + temporaryFolder.toURI() + File.separator + "dir1", + temporaryFolder.toURI() + File.separator + "dir2", + temporaryFolder.toURI() + File.separator + "dir3")); put(FsSourceTaskConfig.TOPIC, "topic_test"); put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName()); put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); }}; } - @Test(expected = ConnectException.class) + @Test public void nullProperties() { - task.start(null); + assertThrows(ConnectException.class, () -> task.start(null)); } - @Test(expected = ConnectException.class) + @Test public void expectedFsUris() { Map testProps = new HashMap<>(taskConfig); testProps.remove(FsSourceTaskConfig.FS_URIS); - task.start(testProps); + assertThrows(ConnectException.class, () -> task.start(testProps)); } - @Test(expected = ConnectException.class) + @Test public void expectedPolicyClass() { Map testProps = new HashMap<>(taskConfig); testProps.remove(FsSourceTaskConfig.POLICY_CLASS); - task.start(testProps); + assertThrows(ConnectException.class, () -> task.start(testProps)); } - @Test(expected = ConnectException.class) + @Test public void invalidPolicyClass() { Map testProps = new HashMap<>(taskConfig); testProps.put(FsSourceTaskConfig.POLICY_CLASS, Object.class.getName()); - task.start(testProps); + assertThrows(ConnectException.class, () -> task.start(testProps)); } - @Test(expected = ConnectException.class) + @Test public void expectedReaderClass() { Map testProps = new HashMap<>(taskConfig); testProps.remove(FsSourceTaskConfig.FILE_READER_CLASS); - task.start(testProps); + assertThrows(ConnectException.class, () -> task.start(testProps)); } - @Test(expected = ConnectException.class) + @Test public void invalidReaderClass() { Map testProps = new HashMap<>(taskConfig); testProps.put(FsSourceTaskConfig.FILE_READER_CLASS, Object.class.getName()); - task.start(testProps); + assertThrows(ConnectException.class, () -> task.start(testProps)); } @Test - public void minimunConfig() { + public void minimumConfig() { task.start(taskConfig); task.stop(); } @Test - public void pollWithoutStart() throws InterruptedException { + public void pollWithoutStart() { assertNull(task.poll()); task.stop(); } @@ -96,5 +94,4 @@ public void checkVersion() { assertNotNull(task.version()); assertFalse("unknown".equalsIgnoreCase(task.version())); } - -} \ No newline at end of file +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTestBase.java index 192b756..22d388c 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTestBase.java @@ -13,10 +13,10 @@ import org.apache.kafka.connect.source.SourceTaskContext; import org.apache.kafka.connect.storage.OffsetStorageReader; import org.easymock.EasyMock; -import org.junit.After; -import org.junit.AfterClass; -import org.junit.Before; -import org.junit.Test; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; import org.powermock.api.easymock.PowerMock; import org.powermock.api.support.membermodification.MemberModifier; @@ -29,8 +29,8 @@ import java.util.Map; import java.util.UUID; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNull; 
+import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; public abstract class FsSourceTaskTestBase { @@ -45,17 +45,17 @@ public abstract class FsSourceTaskTestBase { protected SourceTaskContext taskContext; protected OffsetStorageReader offsetStorageReader; - @AfterClass + @AfterAll public static void tearDown() throws Exception { fs.close(); } - @Before + @BeforeEach public void initTask() { task = new FsSourceTask(); taskConfig = new HashMap() {{ - String uris[] = directories.stream().map(dir -> dir.toString()) - .toArray(size -> new String[size]); + String[] uris = directories.stream().map(Path::toString) + .toArray(String[]::new); put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); put(FsSourceTaskConfig.TOPIC, "topic_test"); put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName()); @@ -92,7 +92,7 @@ public void initTask() { } - @After + @AfterEach public void cleanDirsAndStop() throws IOException { for (Path dir : directories) { fs.delete(dir, true); @@ -102,7 +102,7 @@ public void cleanDirsAndStop() throws IOException { } @Test - public void pollNoData() throws InterruptedException { + public void pollNoData() { task.start(taskConfig); assertEquals(0, task.poll().size()); //policy has ended @@ -110,9 +110,9 @@ public void pollNoData() throws InterruptedException { } @Test - public void emptyFilesToProcess() throws IOException, InterruptedException { + public void emptyFilesToProcess() throws IOException { for (Path dir : directories) { - fs.createNewFile(new Path(dir, String.valueOf(System.nanoTime() + ".txt"))); + fs.createNewFile(new Path(dir, System.nanoTime() + ".txt")); //this file does not match the regexp fs.createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); } @@ -123,9 +123,9 @@ public void emptyFilesToProcess() throws IOException, InterruptedException { } @Test - public void oneFilePerFs() throws IOException, InterruptedException { + public void oneFilePerFs() throws IOException { for (Path dir : directories) { - Path dataFile = new Path(dir, String.valueOf(System.nanoTime() + ".txt")); + Path dataFile = new Path(dir, System.nanoTime() + ".txt"); createDataFile(dataFile); //this file does not match the regexp fs.createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); @@ -140,7 +140,7 @@ public void oneFilePerFs() throws IOException, InterruptedException { } @Test - public void nonExistentUri() throws InterruptedException { + public void nonExistentUri() { Map props = new HashMap<>(taskConfig); props.put(FsSourceTaskConfig.FS_URIS, new Path(fs.getWorkingDirectory(), UUID.randomUUID().toString()).toString()); task.start(props); @@ -148,7 +148,7 @@ public void nonExistentUri() throws InterruptedException { } @Test - public void exceptionExecutingPolicy() throws InterruptedException, IOException, IllegalAccessException { + public void exceptionExecutingPolicy() throws IOException, IllegalAccessException { Map props = new HashMap<>(taskConfig); task.start(props); @@ -164,7 +164,7 @@ public void exceptionExecutingPolicy() throws InterruptedException, IOException, } @Test - public void exceptionReadingFile() throws InterruptedException, IOException { + public void exceptionReadingFile() throws IOException { Map props = new HashMap<>(taskConfig); File tmp = File.createTempFile("test-", ".txt"); try (PrintWriter writer = new PrintWriter(tmp)) { @@ -184,4 +184,4 @@ public void exceptionReadingFile() throws InterruptedException, IOException { protected abstract void 
createDataFile(Path path) throws IOException; -} \ No newline at end of file +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTest.java index 629a0f8..1e8b303 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTest.java @@ -4,7 +4,7 @@ import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.source.SourceRecord; -import org.junit.BeforeClass; +import org.junit.jupiter.api.BeforeAll; import java.io.File; import java.io.FileWriter; @@ -14,12 +14,12 @@ import java.util.UUID; import java.util.stream.IntStream; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class HdfsFsSourceTaskTest extends HdfsFsSourceTaskTestBase { - @BeforeClass + @BeforeAll public static void setUp() throws IOException { directories = new ArrayList() {{ add(new Path(fsUri.toString(), UUID.randomUUID().toString())); @@ -33,7 +33,7 @@ public static void setUp() throws IOException { @Override protected void checkRecords(List records) { records.forEach(record -> { - assertTrue(record.topic().equals("topic_test")); + assertEquals("topic_test", record.topic()); assertNotNull(record.sourcePartition()); assertNotNull(record.sourceOffset()); assertNotNull(record.value()); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTestBase.java index fd8c3bd..1132bc6 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTestBase.java @@ -4,8 +4,8 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.junit.AfterClass; -import org.junit.BeforeClass; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import java.io.IOException; import java.net.URI; @@ -15,21 +15,19 @@ public abstract class HdfsFsSourceTaskTestBase extends FsSourceTaskTestBase { private static MiniDFSCluster cluster; - private static Configuration clusterConfig; - private static Path hdfsDir; - @BeforeClass + @BeforeAll public static void initFs() throws IOException { - clusterConfig = new Configuration(); - hdfsDir = Files.createTempDirectory("test-"); + Configuration clusterConfig = new Configuration(); + Path hdfsDir = Files.createTempDirectory("test-"); clusterConfig.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsDir.toAbsolutePath().toString()); cluster = new MiniDFSCluster.Builder(clusterConfig).build(); fsUri = URI.create("hdfs://localhost:" + cluster.getNameNodePort() + "/"); fs = FileSystem.newInstance(fsUri, clusterConfig); } - @AfterClass - public static void finishFs() throws Exception { + @AfterAll + public static void finishFs() { cluster.shutdown(true); } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTest.java index bbacd9e..8623e05 100644 --- 
a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTest.java @@ -4,7 +4,7 @@ import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.source.SourceRecord; -import org.junit.BeforeClass; +import org.junit.jupiter.api.BeforeAll; import java.io.File; import java.io.FileWriter; @@ -14,12 +14,12 @@ import java.util.UUID; import java.util.stream.IntStream; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; public class LocalFsSourceTaskTest extends LocalFsSourceTaskTestBase { - @BeforeClass + @BeforeAll public static void setUp() throws IOException { directories = new ArrayList() {{ add(new Path(fsUri.toString(), UUID.randomUUID().toString())); @@ -33,11 +33,10 @@ public static void setUp() throws IOException { @Override protected void checkRecords(List records) { records.forEach(record -> { - assertTrue(record.topic().equals("topic_test")); + assertEquals("topic_test", record.topic()); assertNotNull(record.sourcePartition()); assertNotNull(record.sourceOffset()); assertNotNull(record.value()); - assertNotNull(((Struct) record.value()).get(TextFileReader.FIELD_NAME_VALUE_DEFAULT)); }); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTestBase.java index 569b623..4cf1074 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTestBase.java @@ -4,8 +4,8 @@ import org.apache.commons.io.FileUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; -import org.junit.AfterClass; -import org.junit.BeforeClass; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; import java.io.IOException; import java.nio.file.Files; @@ -15,14 +15,14 @@ public abstract class LocalFsSourceTaskTestBase extends FsSourceTaskTestBase { private static Path localDir; - @BeforeClass + @BeforeAll public static void initFs() throws IOException { localDir = Files.createTempDirectory("test-"); fsUri = localDir.toUri(); fs = FileSystem.newInstance(fsUri, new Configuration()); } - @AfterClass + @AfterAll public static void finishFs() throws IOException { FileUtils.deleteDirectory(localDir.toFile()); } diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties new file mode 100644 index 0000000..18e87a8 --- /dev/null +++ b/src/test/resources/log4j.properties @@ -0,0 +1,13 @@ +# Root logger option +log4j.rootLogger=INFO, stdout + +# Direct log messages to stdout +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target=System.out +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c:%L - %m%n + +log4j.logger.org.apache.hadoop=WARN +log4j.logger.org.apache.parquet=WARN +log4j.logger.org.eclipse.jetty=WARN +log4j.logger.io.confluent.connect.avro=WARN From 7eef49f8b0c40a3bfd568ec097a5dc8ed878595d Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Fri, 6 Mar 2020 17:20:09 +0100 Subject: [PATCH 12/51] Compression type and JSON 
reader configs in the docs --- docs/source/config_options.rst | 136 ++++++++++++++++++++++++++++----- docs/source/connector.rst | 2 +- docs/source/faq.rst | 4 +- docs/source/filereaders.rst | 12 ++- docs/source/policies.rst | 2 +- 5 files changed, 133 insertions(+), 23 deletions(-) diff --git a/docs/source/config_options.rst b/docs/source/config_options.rst index 851419f..962f572 100644 --- a/docs/source/config_options.rst +++ b/docs/source/config_options.rst @@ -176,7 +176,7 @@ Avro In order to configure custom properties for this reader, the name you must use is ``avro``. ``file_reader.avro.schema`` - AVRO schema in JSON format to use when reading a file. + Avro schema in JSON format to use when reading a file. If not specified, the reader will use the schema defined in the file. * Type: string @@ -190,13 +190,13 @@ Parquet In order to configure custom properties for this reader, the name you must use is ``parquet``. ``file_reader.parquet.schema`` - AVRO schema in JSON format to use when reading a file. + Avro schema in JSON format to use when reading a file. * Type: string * Importance: medium ``file_reader.parquet.projection`` - AVRO schema in JSON format to use for projecting fields from records in a file. + Avro schema in JSON format to use for projecting fields from records in a file. * Type: string * Importance: medium @@ -208,33 +208,99 @@ SequenceFile In order to configure custom properties for this reader, the name you must use is ``sequence``. -``file_reader.sequence.buffer_size`` - Custom buffer size to read data from the Sequence file. - - * Type: int - * Default: 4096 - * Importance: medium - ``file_reader.sequence.field_name.key`` Custom field name for the output key to include in the Kafka message. * Type: string * Default: key - * Importance: low + * Importance: medium ``file_reader.sequence.field_name.value`` Custom field name for the output value to include in the Kafka message. * Type: string * Default: value + * Importance: medium + +``file_reader.sequence.buffer_size`` + Custom buffer size to read data from the Sequence file. + + * Type: int + * Default: 4096 * Importance: low +.. _config_options-filereaders-json: + +JSON +-------------------------------------------- + +To configure custom properties for this reader, the name you must use is ``json``. + +``file_reader.json.record_per_line`` + If enabled, the reader will read each line as a record. Otherwise, the reader will read the full + content of the file as a record. + + * Type: boolean + * Default: true + * Importance: medium + +``file_reader.json.deserialization.`` + Deserialization feature to use when reading a JSON file. You can add as much as you like + based on the ones defined `here. `__ + + * Type: boolean + * Importance: medium + +``file_reader.json.compression.type`` + Compression type to use when reading a file. + + * Type: enum (available values ``bzip2``, ``gzip`` and ``none``) + * Default: none + * Importance: medium + +``file_reader.json.compression.concatenated`` + Flag to specify if the decompression of the reader will finish at the end of the file or after + the first compressed stream. + + * Type: boolean + * Default: true + * Importance: low + +``file_reader.json.encoding`` + Encoding to use for reading a file. If not specified, the reader will use the default encoding. + + * Type: string + * Importance: medium + .. _config_options-filereaders-text: Text -------------------------------------------- -In order to configure custom properties for this reader, the name you must use is ``text``. 
+To configure custom properties for this reader, the name you must use is ``text``. + +``file_reader.json.record_per_line`` + If enabled, the reader will read each line as a record. Otherwise, the reader will read the full + content of the file as a record. + + * Type: boolean + * Default: true + * Importance: medium + +``file_reader.json.compression.type`` + Compression type to use when reading a file. + + * Type: enum (available values ``bzip2``, ``gzip`` and ``none``) + * Default: none + * Importance: medium + +``file_reader.json.compression.concatenated`` + Flag to specify if the decompression of the reader will finish at the end of the file or after + the first compressed stream. + + * Type: boolean + * Default: true + * Importance: low ``file_reader.text.field_name.value`` Custom field name for the output value to include in the Kafka message. @@ -254,7 +320,7 @@ In order to configure custom properties for this reader, the name you must use i Delimited text -------------------------------------------- -In order to configure custom properties for this reader, the name you must use is ``delimited``. +To configure custom properties for this reader, the name you must use is ``delimited``. ``file_reader.delimited.token`` The token delimiter for columns. @@ -269,10 +335,12 @@ In order to configure custom properties for this reader, the name you must use i * Default: false * Importance: medium -``file_reader.delimited.encoding`` - Encoding to use for reading a file. If not specified, the reader will use the default encoding. +``file_reader.json.record_per_line`` + If enabled, the reader will read each line as a record. Otherwise, the reader will read the full + content of the file as a record. - * Type: string + * Type: boolean + * Default: true * Importance: medium ``file_reader.delimited.default_value`` @@ -280,13 +348,34 @@ In order to configure custom properties for this reader, the name you must use i all expected columns). * Type: string - * Default: null + * Default: ``null`` + * Importance: medium + +``file_reader.json.compression.type`` + Compression type to use when reading a file. + + * Type: enum (available values ``bzip2``, ``gzip`` and ``none``) + * Default: none + * Importance: medium + +``file_reader.json.compression.concatenated`` + Flag to specify if the decompression of the reader will finish at the end of the file or after + the first compressed stream. + + * Type: boolean + * Default: true * Importance: low +``file_reader.delimited.encoding`` + Encoding to use for reading a file. If not specified, the reader will use the default encoding. + + * Type: string + * Importance: medium + Agnostic -------------------------------------------- -In order to configure custom properties for this reader, the name you must use is ``agnostic``. +To configure custom properties for this reader, the name you must use is ``agnostic``. ``file_reader.agnostic.extensions.parquet`` A comma-separated string list with the accepted extensions for Parquet files. @@ -309,9 +398,20 @@ In order to configure custom properties for this reader, the name you must use i * Default: seq * Importance: medium +``file_reader.agnostic.extensions.json`` + A comma-separated string list with the accepted extensions for JSON files. + + * Type: string + * Default: json + * Importance: medium + ``file_reader.agnostic.extensions.delimited`` A comma-separated string list with the accepted extensions for Delimited text files. * Type: string * Default: tsv,csv * Importance: medium + +.. 
note:: The Agnostic reader uses the previous ones as inner readers. So, in case of using this + reader, you'll probably need to include also the specified properties for those + readers in the connector configuration as well. diff --git a/docs/source/connector.rst b/docs/source/connector.rst index d045f1e..8d2e305 100644 --- a/docs/source/connector.rst +++ b/docs/source/connector.rst @@ -91,7 +91,7 @@ Policies In order to ingest data from the FS(s), the connector needs a **policy** to define the rules to do it. -Basically, the policy tries to connect to each FS included in ``fs.uris`` connector property, list files +Basically, the policy tries to connect to each FS included in ``fs.uris`` connector property, lists files (and filter them using the regular expression provided in the ``policy.regexp`` property) and enables a file reader to read records from them. diff --git a/docs/source/faq.rst b/docs/source/faq.rst index a5077dc..1041bc4 100644 --- a/docs/source/faq.rst +++ b/docs/source/faq.rst @@ -4,7 +4,7 @@ FAQs ******************************************** -**My file was already processed and the connector, when it is executed again, +**My file was already processed and the connector, when it's executed again, processes the same records again.** If during the previous executions the records were sent successfully to Kafka, @@ -33,7 +33,7 @@ the connector everyday.** Don't do this! Take advantage of the dynamic URIs using expressions. -For instance, if you have this URI ``hdfs://host:9000/data/2017``, you can +For instance, if you have this URI ``hdfs://host:9000/data/2020``, you can use this URI ``hdfs://host:9000/data/${yyyy}`` instead. **The connector is too slow to process all URIs I have.** diff --git a/docs/source/filereaders.rst b/docs/source/filereaders.rst index 75b349b..0ea1560 100644 --- a/docs/source/filereaders.rst +++ b/docs/source/filereaders.rst @@ -38,12 +38,21 @@ by default but you can customize these field names. More information about properties of this file reader :ref:`here`. +JSON +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Reads JSON files which might contain multiple number of fields with their specified +data types. The schema for this sort of records is inferred reading the first record +and marked as optional in the schema all the fields contained. + +More information about properties of this file reader :ref:`here`. + Text ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Reads plain text files. -Each line represents one record which will be in a field +Each line represents one record (by default) which will be in a field named ``value`` in the message sent to Kafka by default but you can customize these field names. @@ -74,6 +83,7 @@ Default extensions for each format: * Parquet: .parquet * Avro: .avro * SequenceFile: .seq +* JSON: .json * Delimited text: .tsv, .csv * Text: any other sort of file extension. 
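As an illustrative aside to the file reader options documented above (this sketch is not part of the patch series itself): a minimal example of how the ``file_reader.*`` properties reach a reader instance, modeled on the test setup added in these patches. The input path is a placeholder, and passing the compression type as the ``CompressionType`` enum simply mirrors what the tests do.

    import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader;
    import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.kafka.connect.data.Struct;

    import java.net.URI;
    import java.util.HashMap;
    import java.util.Map;

    public class TextReaderConfigSketch {
        public static void main(String[] args) throws Exception {
            FileSystem fs = FileSystem.newInstance(URI.create("file:///"), new Configuration());
            Path file = new Path("file:///tmp/sample.txt.gz");  // placeholder input file

            Map<String, Object> readerCfg = new HashMap<>();
            // Keys match the reader options documented in config_options.rst above.
            readerCfg.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, "value");
            readerCfg.put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, "true");
            readerCfg.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, TextFileReader.CompressionType.GZIP);
            readerCfg.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_CONCATENATED, "true");

            FileReader reader = new TextFileReader(fs, file, readerCfg);
            while (reader.hasNext()) {
                Struct record = reader.next();  // one Struct per line by default
                System.out.println(record.getString("value"));
            }
            reader.close();
        }
    }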
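In the same spirit, a hedged sketch of a task-level configuration that ties together a policy, the regexp filter and the agnostic reader described above. It is modeled on the test configurations in these patches; the URI, the topic name and the extra ``jsonl`` extension are invented placeholder values.

    import com.github.mmolimar.kafka.connect.fs.FsSourceTask;
    import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig;
    import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader;
    import com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy;
    import org.apache.kafka.connect.source.SourceRecord;

    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class TaskConfigSketch {
        public static void main(String[] args) {
            Map<String, String> props = new HashMap<>();
            props.put(FsSourceTaskConfig.FS_URIS, "file:///tmp/data");  // placeholder URI
            props.put(FsSourceTaskConfig.TOPIC, "topic_test");
            props.put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName());
            props.put(FsSourceTaskConfig.POLICY_REGEXP, "^.*\\.(txt|csv|json)$");
            props.put(FsSourceTaskConfig.FILE_READER_CLASS, AgnosticFileReader.class.getName());
            // Extension mapping for the agnostic reader, as documented above ("jsonl" is hypothetical).
            props.put("file_reader.agnostic.extensions.json", "json,jsonl");

            FsSourceTask task = new FsSourceTask();
            task.start(props);
            List<SourceRecord> records = task.poll();
            if (records != null) {
                records.forEach(record -> System.out.println(record.value()));
            }
            task.stop();
        }
    }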
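Finally, since the FAQ above recommends dynamic URIs such as ``hdfs://host:9000/data/${yyyy}``, here is a rough sketch of how such ``java.time`` placeholders can be expanded. The connector's own resolution code is not shown in these patches, so this is only an approximation consistent with the ``DateTimeFormatter`` patterns exercised by the ``dynamicURIs`` test earlier in the series.

    import java.time.LocalDateTime;
    import java.time.format.DateTimeFormatter;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class DynamicUriSketch {
        // Expands ${<date-pattern>} placeholders using the given date/time.
        static String resolve(String uriTemplate, LocalDateTime now) {
            Matcher m = Pattern.compile("\\$\\{([^}]+)}").matcher(uriTemplate);
            StringBuffer resolved = new StringBuffer();
            while (m.find()) {
                String formatted = now.format(DateTimeFormatter.ofPattern(m.group(1)));
                m.appendReplacement(resolved, formatted);
            }
            m.appendTail(resolved);
            return resolved.toString();
        }

        public static void main(String[] args) {
            // Prints e.g. "hdfs://host:9000/data/2020/03/07" when run on 2020-03-07.
            System.out.println(resolve("hdfs://host:9000/data/${yyyy}/${MM}/${dd}", LocalDateTime.now()));
        }
    }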
diff --git a/docs/source/policies.rst b/docs/source/policies.rst index b2ceb86..2e496be 100644 --- a/docs/source/policies.rst +++ b/docs/source/policies.rst @@ -17,7 +17,7 @@ You can learn more about the properties of this policy :ref:`here Date: Fri, 6 Mar 2020 17:41:39 +0100 Subject: [PATCH 13/51] Enable different input streams in text readers --- .../file/reader/DelimitedTextFileReader.java | 1 + .../fs/file/reader/TextFileReader.java | 69 ++++++++++++-- .../file/reader/hdfs/TextFileReaderTest.java | 92 ++++++++++++------- .../file/reader/local/TextFileReaderTest.java | 92 ++++++++++++------- 4 files changed, 180 insertions(+), 74 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java index ff703aa..d6a8834 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java @@ -16,6 +16,7 @@ public class DelimitedTextFileReader extends AbstractFileReader { private static final String FILE_READER_DELIMITED = FILE_READER_PREFIX + "delimited."; + public static final String FILE_READER_DELIMITED_HEADER = FILE_READER_DELIMITED + "header"; public static final String FILE_READER_DELIMITED_TOKEN = FILE_READER_DELIMITED + "token"; public static final String FILE_READER_DELIMITED_ENCODING = FILE_READER_DELIMITED + "encoding"; diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java index 4d03487..fb5db46 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java @@ -1,6 +1,8 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; import com.github.mmolimar.kafka.connect.fs.file.Offset; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Schema; @@ -8,10 +10,7 @@ import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.errors.ConnectException; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.LineNumberReader; +import java.io.*; import java.nio.charset.Charset; import java.util.List; import java.util.Map; @@ -24,10 +23,14 @@ public class TextFileReader extends AbstractFileReader config) throws IOException { super(fs, filePath, new TxtToStruct(), config); - this.reader = new LineNumberReader(new InputStreamReader(fs.open(filePath), this.charset)); + this.reader = new LineNumberReader(getFileReader(fs.open(filePath))); this.offset = new TextOffset(0); } @@ -53,6 +79,18 @@ protected void configure(Map config) { } else { valueFieldName = config.get(FILE_READER_TEXT_FIELD_NAME_VALUE).toString(); } + if (config.get(FILE_READER_TEXT_COMPRESSION_TYPE) == null || + config.get(FILE_READER_TEXT_COMPRESSION_TYPE).toString().equals("")) { + this.compression = CompressionType.NONE; + } else { + boolean concatenated = true; + if (config.get(FILE_READER_TEXT_COMPRESSION_CONCATENATED) != null && + !config.get(FILE_READER_TEXT_COMPRESSION_CONCATENATED).toString().equals("")) { + 
concatenated = Boolean.parseBoolean(config.get(FILE_READER_TEXT_COMPRESSION_CONCATENATED) + .toString().trim()); + } + this.compression = CompressionType.fromName(config.get(FILE_READER_TEXT_COMPRESSION_TYPE).toString(), concatenated); + } if (config.get(FILE_READER_TEXT_ENCODING) == null || config.get(FILE_READER_TEXT_ENCODING).toString().equals("")) { this.charset = Charset.defaultCharset(); @@ -70,6 +108,24 @@ protected void configure(Map config) { .build(); } + private Reader getFileReader(InputStream inputStream) throws IOException { + final InputStreamReader isr; + switch (this.compression) { + case BZIP2: + isr = new InputStreamReader(new BZip2CompressorInputStream(inputStream, + this.compression.isConcatenated()), this.charset); + break; + case GZIP: + isr = new InputStreamReader(new GzipCompressorInputStream(inputStream, + this.compression.isConcatenated()), this.charset); + break; + default: + isr = new InputStreamReader(inputStream, this.charset); + break; + } + return isr; + } + @Override public boolean hasNext() { if (current != null) { @@ -121,7 +177,8 @@ public void seek(Offset offset) { current = null; if (offset.getRecordOffset() < reader.getLineNumber()) { finished = false; - reader = new LineNumberReader(new InputStreamReader(getFs().open(getFilePath()))); + reader.close(); + reader = new LineNumberReader(getFileReader(getFs().open(getFilePath()))); } while (reader.getLineNumber() < offset.getRecordOffset()) { reader.readLine(); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java index 8e932f2..79e8b88 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java @@ -4,16 +4,16 @@ import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; +import java.io.*; import java.nio.charset.UnsupportedCharsetException; +import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.UUID; @@ -25,28 +25,42 @@ public class TextFileReaderTest extends HdfsFileReaderTestBase { private static final String FIELD_NAME_VALUE = "custom_field_name"; private static final String FILE_EXTENSION = "txt"; + private static final TextFileReader.CompressionType COMPRESSION_TYPE = TextFileReader.CompressionType.GZIP; @BeforeAll public static void setUp() throws IOException { readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); + dataFile = createDataFile(COMPRESSION_TYPE); readerConfig = new HashMap() {{ put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_CONCATENATED, "true"); }}; } - private static Path createDataFile() throws IOException { - File txtFile 
= File.createTempFile("test-", "." + FILE_EXTENSION); - try (FileWriter writer = new FileWriter(txtFile)) { + private static OutputStream getOutputStream(File file, TextFileReader.CompressionType compression) throws IOException { + final OutputStream os; + switch (compression) { + case BZIP2: + os = new BZip2CompressorOutputStream(new FileOutputStream(file)); + break; + case GZIP: + os = new GzipCompressorOutputStream(new FileOutputStream(file)); + break; + default: + os = new FileOutputStream(file); + break; + } + return os; + } + private static Path createDataFile(TextFileReader.CompressionType compression) throws IOException { + File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); + try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { IntStream.range(0, NUM_RECORDS).forEach(index -> { String value = String.format("%d_%s", index, UUID.randomUUID()); - try { - writer.append(value + "\n"); - OFFSETS_BY_INDEX.put(index, (long) index); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } + writer.append(value + "\n"); + OFFSETS_BY_INDEX.put(index, (long) index); }); } Path path = new Path(new Path(fsUri), txtFile.getName()); @@ -54,30 +68,12 @@ private static Path createDataFile() throws IOException { return path; } - @Test - public void emptyFile() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); - } - - @Test - public void invalidFileFormat() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { - writer.write("test"); - } - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); - } - @Test public void validFileEncoding() throws Throwable { Map cfg = new HashMap() {{ put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); put(TextFileReader.FILE_READER_TEXT_ENCODING, "Cp1252"); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); }}; reader = getReader(fs, dataFile, cfg); readAllData(); @@ -88,16 +84,18 @@ public void invalidFileEncoding() { Map cfg = new HashMap() {{ put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); put(TextFileReader.FILE_READER_TEXT_ENCODING, "invalid_charset"); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); }}; assertThrows(UnsupportedCharsetException.class, () -> getReader(fs, dataFile, cfg)); } @Test public void readDataWithRecordPerLineDisabled() throws Throwable { - Path file = createDataFile(); + Path file = createDataFile(COMPRESSION_TYPE); FileReader reader = getReader(fs, file, new HashMap() {{ put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, "false"); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); }}); assertTrue(reader.hasNext()); @@ -112,6 +110,32 @@ public void readDataWithRecordPerLineDisabled() throws Throwable { assertEquals(1, recordCount, () -> "The number of records in the file does not match"); } + @Test + public void readDifferentCompressionTypes() { + Arrays.stream(TextFileReader.CompressionType.values()).forEach(compressionType -> { + try { + Path file = createDataFile(compressionType); + FileReader 
reader = getReader(fs, file, new HashMap() {{ + put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, compressionType); + }}); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals(NUM_RECORDS, recordCount, () -> "The number of records in the file does not match"); + } catch (Throwable e) { + throw new RuntimeException(e); + } + }); + } + @Override protected Offset getOffset(long offset) { return new TextFileReader.TextOffset(offset); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java index a605b9f..edb26e4 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java @@ -4,16 +4,16 @@ import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; +import java.io.*; import java.nio.charset.UnsupportedCharsetException; +import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.UUID; @@ -25,28 +25,42 @@ public class TextFileReaderTest extends LocalFileReaderTestBase { private static final String FIELD_NAME_VALUE = "custom_field_name"; private static final String FILE_EXTENSION = "txt"; + private static final TextFileReader.CompressionType COMPRESSION_TYPE = TextFileReader.CompressionType.GZIP; @BeforeAll public static void setUp() throws IOException { readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); + dataFile = createDataFile(COMPRESSION_TYPE); readerConfig = new HashMap() {{ put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_CONCATENATED, "true"); }}; } - private static Path createDataFile() throws IOException { - File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); - try (FileWriter writer = new FileWriter(txtFile)) { + private static OutputStream getOutputStream(File file, TextFileReader.CompressionType compression) throws IOException { + final OutputStream os; + switch (compression) { + case BZIP2: + os = new BZip2CompressorOutputStream(new FileOutputStream(file)); + break; + case GZIP: + os = new GzipCompressorOutputStream(new FileOutputStream(file)); + break; + default: + os = new FileOutputStream(file); + break; + } + return os; + } + private static Path createDataFile(TextFileReader.CompressionType compression) throws IOException { + File txtFile = File.createTempFile("test-", "." 
+ FILE_EXTENSION); + try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { IntStream.range(0, NUM_RECORDS).forEach(index -> { String value = String.format("%d_%s", index, UUID.randomUUID()); - try { - writer.append(value + "\n"); - OFFSETS_BY_INDEX.put(index, (long) index); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } + writer.append(value + "\n"); + OFFSETS_BY_INDEX.put(index, (long) index); }); } Path path = new Path(new Path(fsUri), txtFile.getName()); @@ -54,30 +68,12 @@ private static Path createDataFile() throws IOException { return path; } - @Test - public void emptyFile() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); - } - - @Test - public void invalidFileFormat() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { - writer.write("test"); - } - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); - } - @Test public void validFileEncoding() throws Throwable { Map cfg = new HashMap() {{ put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); put(TextFileReader.FILE_READER_TEXT_ENCODING, "Cp1252"); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); }}; reader = getReader(fs, dataFile, cfg); readAllData(); @@ -88,16 +84,18 @@ public void invalidFileEncoding() { Map cfg = new HashMap() {{ put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); put(TextFileReader.FILE_READER_TEXT_ENCODING, "invalid_charset"); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); }}; assertThrows(UnsupportedCharsetException.class, () -> getReader(fs, dataFile, cfg)); } @Test public void readDataWithRecordPerLineDisabled() throws Throwable { - Path file = createDataFile(); + Path file = createDataFile(COMPRESSION_TYPE); FileReader reader = getReader(fs, file, new HashMap() {{ put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, "false"); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); }}); assertTrue(reader.hasNext()); @@ -112,6 +110,32 @@ public void readDataWithRecordPerLineDisabled() throws Throwable { assertEquals(1, recordCount, () -> "The number of records in the file does not match"); } + @Test + public void readDifferentCompressionTypes() { + Arrays.stream(TextFileReader.CompressionType.values()).forEach(compressionType -> { + try { + Path file = createDataFile(compressionType); + FileReader reader = getReader(fs, file, new HashMap() {{ + put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, compressionType); + }}); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals(NUM_RECORDS, recordCount, () -> "The number of records in the file does not match"); + } catch (Throwable e) { + throw new RuntimeException(e); + } + }); + } + @Override protected Offset getOffset(long offset) { return new TextFileReader.TextOffset(offset); From 
1d439223f29d6104b968a24452dba2a024e6c3a3 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 7 Mar 2020 13:18:50 +0100 Subject: [PATCH 14/51] File readers configuration with string --- .../fs/file/reader/AbstractFileReader.java | 6 +-- .../fs/file/reader/AgnosticFileReader.java | 34 +++++++-------- .../fs/file/reader/AvroFileReader.java | 12 +++--- .../file/reader/DelimitedTextFileReader.java | 18 ++++---- .../fs/file/reader/JsonFileReader.java | 7 ++-- .../fs/file/reader/SequenceFileReader.java | 16 ++----- .../fs/file/reader/TextFileReader.java | 42 ++++--------------- 7 files changed, 46 insertions(+), 89 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java index 533b628..180b0e2 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java @@ -26,14 +26,14 @@ public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter this.filePath = filePath; this.adapter = adapter; - Map readerConf = config.entrySet().stream() + Map readerConf = config.entrySet().stream() .filter(entry -> entry.getKey().startsWith(FILE_READER_PREFIX)) .filter(entry -> entry.getValue() != null) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().toString())); configure(readerConf); } - protected abstract void configure(Map config); + protected abstract void configure(Map config); protected FileSystem getFs() { return fs; diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java index caa89a1..30a6371 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java @@ -8,7 +8,6 @@ import java.io.IOException; import java.util.Arrays; -import java.util.Collections; import java.util.List; import java.util.Map; @@ -25,7 +24,7 @@ public class AgnosticFileReader extends AbstractFileReader reader; - private List parquetExtensions, avroExtensions, jsonExtensions, sequenceExtensions, delimitedExtensions; + private List parquetExtensions, avroExtensions, sequenceExtensions, jsonExtensions, delimitedExtensions; public AgnosticFileReader(FileSystem fs, Path filePath, Map config) throws IOException { super(fs, filePath, new AgnosticAdapter(), config); @@ -50,10 +49,10 @@ private AbstractFileReader readerByExtension(FileSystem fs, Path filePat clz = ParquetFileReader.class; } else if (avroExtensions.contains(extension)) { clz = AvroFileReader.class; - } else if (jsonExtensions.contains(extension)) { - clz = JsonFileReader.class; } else if (sequenceExtensions.contains(extension)) { clz = SequenceFileReader.class; + } else if (jsonExtensions.contains(extension)) { + clz = JsonFileReader.class; } else if (delimitedExtensions.contains(extension)) { clz = DelimitedTextFileReader.class; } else { @@ -64,22 +63,17 @@ private AbstractFileReader readerByExtension(FileSystem fs, Path filePat } @Override - protected void configure(Map config) { - this.parquetExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET) == null ? 
- Collections.singletonList("parquet") : - Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET).toString().toLowerCase().split(",")); - this.avroExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_AVRO) == null ? - Collections.singletonList("avro") : - Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_AVRO).toString().toLowerCase().split(",")); - this.jsonExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_JSON) == null ? - Collections.singletonList("json") : - Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_JSON).toString().toLowerCase().split(",")); - this.sequenceExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE) == null ? - Collections.singletonList("seq") : - Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE).toString().toLowerCase().split(",")); - this.delimitedExtensions = config.get(FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED) == null ? - Arrays.asList("tsv", "csv") : - Arrays.asList(config.get(FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED).toString().toLowerCase().split(",")); + protected void configure(Map config) { + this.parquetExtensions = Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, "parquet") + .toLowerCase().split(",")); + this.avroExtensions = Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, "avro") + .toLowerCase().split(",")); + this.sequenceExtensions = Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE, "seq") + .toLowerCase().split(",")); + this.jsonExtensions = Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_JSON, "json") + .toLowerCase().split(",")); + this.delimitedExtensions = Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, "tsv,csv") + .toLowerCase().split(",")); } @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java index 14b70a2..040d837 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java @@ -16,6 +16,7 @@ import java.io.IOException; import java.util.Map; +import java.util.Optional; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; @@ -41,13 +42,10 @@ public AvroFileReader(FileSystem fs, Path filePath, Map config) this.offset = new AvroOffset(0); } - protected void configure(Map config) { - if (config.get(FILE_READER_AVRO_SCHEMA) != null && - !config.get(FILE_READER_AVRO_SCHEMA).toString().trim().isEmpty()) { - this.schema = new Schema.Parser().parse(config.get(FILE_READER_AVRO_SCHEMA).toString()); - } else { - this.schema = null; - } + protected void configure(Map config) { + this.schema = Optional.ofNullable(config.get(FILE_READER_AVRO_SCHEMA)) + .map(c -> new Schema.Parser().parse(c)) + .orElse(null); } @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java index d6a8834..0cae141 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java @@ -9,6 +9,7 @@ import java.io.IOException; import java.util.Map; +import java.util.Optional; import java.util.stream.IntStream; import static 
com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; @@ -58,15 +59,14 @@ public DelimitedTextFileReader(FileSystem fs, Path filePath, Map } @Override - protected void configure(Map config) { - if (config.get(FILE_READER_DELIMITED_TOKEN) == null || - config.get(FILE_READER_DELIMITED_TOKEN).toString().equals("")) { - throw new IllegalArgumentException(FILE_READER_DELIMITED_TOKEN + " property cannot be empty for DelimitedTextFileReader"); - } - this.token = config.get(FILE_READER_DELIMITED_TOKEN).toString(); - this.defaultValue = config.get(FILE_READER_DELIMITED_DEFAULT_VALUE) == null ? - null : config.get(FILE_READER_DELIMITED_DEFAULT_VALUE).toString(); - this.hasHeader = Boolean.parseBoolean((String) config.get(FILE_READER_DELIMITED_HEADER)); + protected void configure(Map config) { + this.token = Optional.ofNullable(config.get(FILE_READER_DELIMITED_TOKEN)) + .filter(t -> !t.isEmpty()) + .orElseThrow(() -> new IllegalArgumentException( + FILE_READER_DELIMITED_TOKEN + " property cannot be empty for DelimitedTextFileReader") + ); + this.defaultValue = config.get(FILE_READER_DELIMITED_DEFAULT_VALUE); + this.hasHeader = Boolean.parseBoolean(config.getOrDefault(FILE_READER_DELIMITED_HEADER, "false")); } @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java index 58230f9..3e8f411 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java @@ -51,19 +51,18 @@ public JsonFileReader(FileSystem fs, Path filePath, Map config) } @Override - protected void configure(Map config) { + protected void configure(Map config) { mapper = new ObjectMapper(); Set deserializationFeatures = Arrays.stream(DeserializationFeature.values()) .map(Enum::name) .collect(Collectors.toSet()); config.entrySet().stream() - .filter(entry -> entry.getValue() != null) .filter(entry -> entry.getKey().startsWith(FILE_READER_JSON_DESERIALIZATION_CONFIGS)) .forEach(entry -> { String feature = entry.getKey().replaceAll(FILE_READER_JSON_DESERIALIZATION_CONFIGS, ""); if (deserializationFeatures.contains(feature)) { mapper.configure(DeserializationFeature.valueOf(feature), - Boolean.parseBoolean(entry.getValue().toString())); + Boolean.parseBoolean(entry.getValue())); } else { log.warn("Ignoring deserialization configuration '" + feature + "' due to it does not exist."); } @@ -189,6 +188,7 @@ private Object mapValue(Schema schema, JsonNode value) { throw new IllegalStateException(ioe); } case OBJECT: + case POJO: Struct struct = new Struct(schema); Iterable> fields = value::fields; StreamSupport.stream(fields.spliterator(), false) @@ -202,7 +202,6 @@ private Object mapValue(Schema schema, JsonNode value) { .map(elm -> mapValue(schema, elm)) .collect(Collectors.toList()); case NULL: - case POJO: case MISSING: default: return null; diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java index 40a939a..595c340 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java @@ -58,19 +58,9 @@ public SequenceFileReader(FileSystem fs, Path filePath, Map conf } @Override - protected void configure(Map config) 
{ - if (config.get(FILE_READER_SEQUENCE_FIELD_NAME_KEY) == null || - config.get(FILE_READER_SEQUENCE_FIELD_NAME_KEY).toString().equals("")) { - this.keyFieldName = FIELD_NAME_KEY_DEFAULT; - } else { - this.keyFieldName = config.get(FILE_READER_SEQUENCE_FIELD_NAME_KEY).toString(); - } - if (config.get(FILE_READER_SEQUENCE_FIELD_NAME_VALUE) == null || - config.get(FILE_READER_SEQUENCE_FIELD_NAME_VALUE).toString().equals("")) { - this.valueFieldName = FIELD_NAME_VALUE_DEFAULT; - } else { - this.valueFieldName = config.get(FILE_READER_SEQUENCE_FIELD_NAME_VALUE).toString(); - } + protected void configure(Map config) { + this.keyFieldName = config.getOrDefault(FILE_READER_SEQUENCE_FIELD_NAME_KEY, FIELD_NAME_KEY_DEFAULT); + this.valueFieldName = config.getOrDefault(FILE_READER_SEQUENCE_FIELD_NAME_VALUE, FIELD_NAME_VALUE_DEFAULT); } private Schema getSchema(Writable writable) { diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java index fb5db46..28f5c48 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java @@ -71,41 +71,17 @@ public TextFileReader(FileSystem fs, Path filePath, Map config) } @Override - protected void configure(Map config) { - String valueFieldName; - if (config.get(FILE_READER_TEXT_FIELD_NAME_VALUE) == null || - config.get(FILE_READER_TEXT_FIELD_NAME_VALUE).toString().equals("")) { - valueFieldName = FIELD_NAME_VALUE_DEFAULT; - } else { - valueFieldName = config.get(FILE_READER_TEXT_FIELD_NAME_VALUE).toString(); - } - if (config.get(FILE_READER_TEXT_COMPRESSION_TYPE) == null || - config.get(FILE_READER_TEXT_COMPRESSION_TYPE).toString().equals("")) { - this.compression = CompressionType.NONE; - } else { - boolean concatenated = true; - if (config.get(FILE_READER_TEXT_COMPRESSION_CONCATENATED) != null && - !config.get(FILE_READER_TEXT_COMPRESSION_CONCATENATED).toString().equals("")) { - concatenated = Boolean.parseBoolean(config.get(FILE_READER_TEXT_COMPRESSION_CONCATENATED) - .toString().trim()); - } - this.compression = CompressionType.fromName(config.get(FILE_READER_TEXT_COMPRESSION_TYPE).toString(), concatenated); - } - if (config.get(FILE_READER_TEXT_ENCODING) == null || - config.get(FILE_READER_TEXT_ENCODING).toString().equals("")) { - this.charset = Charset.defaultCharset(); - } else { - this.charset = Charset.forName(config.get(FILE_READER_TEXT_ENCODING).toString()); - } - if (config.get(FILE_READER_TEXT_RECORD_PER_LINE) == null || - config.get(FILE_READER_TEXT_RECORD_PER_LINE).toString().equals("")) { - this.recordPerLine = true; - } else { - this.recordPerLine = Boolean.parseBoolean(config.get(FILE_READER_TEXT_RECORD_PER_LINE).toString()); - } + protected void configure(Map config) { this.schema = SchemaBuilder.struct() - .field(valueFieldName, Schema.STRING_SCHEMA) + .field(config.getOrDefault(FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE_DEFAULT), + Schema.STRING_SCHEMA) .build(); + this.recordPerLine = Boolean.parseBoolean(config.getOrDefault(FILE_READER_TEXT_RECORD_PER_LINE, "true")); + String cType = config.getOrDefault(FILE_READER_TEXT_COMPRESSION_TYPE, CompressionType.NONE.toString()); + boolean concatenated = Boolean.parseBoolean(config.getOrDefault(FILE_READER_TEXT_COMPRESSION_CONCATENATED, + "true")); + this.compression = CompressionType.fromName(cType, concatenated); + this.charset = 
Charset.forName(config.getOrDefault(FILE_READER_TEXT_ENCODING, Charset.defaultCharset().name())); } private Reader getFileReader(InputStream inputStream) throws IOException { From 6246f21bcd91ad404be63a343a5634861193150a Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 7 Mar 2020 13:57:42 +0100 Subject: [PATCH 15/51] New policy based on cron expressions --- pom.xml | 18 ++-- .../fs/file/reader/ParquetFileReader.java | 19 ++-- .../connect/fs/policy/AbstractPolicy.java | 8 +- .../kafka/connect/fs/policy/CronPolicy.java | 75 ++++++++++++++++ .../connect/fs/policy/PolicyTestBase.java | 6 -- .../fs/policy/hdfs/CronPolicyTest.java | 90 +++++++++++++++++++ .../fs/policy/local/CronPolicyTest.java | 90 +++++++++++++++++++ 7 files changed, 279 insertions(+), 27 deletions(-) create mode 100644 src/main/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicy.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/CronPolicyTest.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/CronPolicyTest.java diff --git a/pom.xml b/pom.xml index 606806a..11a210e 100644 --- a/pom.xml +++ b/pom.xml @@ -14,9 +14,10 @@ 2.4.0 5.4.0 3.2.1 - 1.9.2 1.11.0 + 1.9.2 2.10.2 + 9.0.2 5.6.0 4.2 2.0.5 @@ -53,6 +54,11 @@ hadoop-aws ${hadoop.version} + + org.apache.parquet + parquet-avro + ${parquet.version} + org.apache.avro avro @@ -64,16 +70,16 @@ ${avro.version} nodeps - - org.apache.parquet - parquet-avro - ${parquet.version} - com.fasterxml.jackson.core jackson-core ${fasterxml-jackson.version} + + com.cronutils + cron-utils + ${cron-utils.version} + diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java index 6afe74f..8e5fd33 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.util.Map; import java.util.NoSuchElementException; +import java.util.Optional; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; @@ -58,17 +59,13 @@ private ParquetReader initReader() throws IOException { .build(); } - protected void configure(Map config) { - if (config.get(FILE_READER_PARQUET_SCHEMA) != null) { - this.schema = new Schema.Parser().parse(config.get(FILE_READER_PARQUET_SCHEMA).toString()); - } else { - this.schema = null; - } - if (config.get(FILE_READER_PARQUET_PROJECTION) != null) { - this.projection = new Schema.Parser().parse(config.get(FILE_READER_PARQUET_PROJECTION).toString()); - } else { - this.projection = null; - } + protected void configure(Map config) { + this.schema = Optional.ofNullable(config.get(FILE_READER_PARQUET_SCHEMA)) + .map(c -> new Schema.Parser().parse(c)) + .orElse(null); + this.projection = Optional.ofNullable(config.get(FILE_READER_PARQUET_PROJECTION)) + .map(c -> new Schema.Parser().parse(c)) + .orElse(null); } @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index 251987e..849692a 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -126,14 +126,13 @@ public void interrupt() { protected void preCheck() { } - private 
void postCheck() { + protected void postCheck() { } public Iterator listFiles(FileSystem fs) throws IOException { return new Iterator() { RemoteIterator it = fs.listFiles(fs.getWorkingDirectory(), recursive); LocatedFileStatus current = null; - boolean previous = false; @Override public boolean hasNext() { @@ -188,7 +187,7 @@ FileMetadata toMetadata(LocatedFileStatus fileStatus) { } @Override - public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorageReader) throws IOException { + public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorageReader) { Map partition = new HashMap() {{ put("path", metadata.getPath()); //TODO manage blocks @@ -201,7 +200,8 @@ public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorage FileReader reader; try { - reader = ReflectionUtils.makeReader((Class) conf.getClass(FsSourceTaskConfig.FILE_READER_CLASS), + reader = ReflectionUtils.makeReader( + (Class) conf.getClass(FsSourceTaskConfig.FILE_READER_CLASS), current, new Path(metadata.getPath()), conf.originals()); } catch (Throwable t) { throw new ConnectException("An error has occurred when creating reader for file: " + metadata.getPath(), t); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicy.java new file mode 100644 index 0000000..0774789 --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicy.java @@ -0,0 +1,75 @@ +package com.github.mmolimar.kafka.connect.fs.policy; + +import com.cronutils.model.CronType; +import com.cronutils.model.definition.CronDefinitionBuilder; +import com.cronutils.model.time.ExecutionTime; +import com.cronutils.parser.CronParser; +import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; +import org.apache.kafka.common.config.ConfigException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.time.DateTimeException; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.ZonedDateTime; +import java.util.Date; +import java.util.Map; + +public class CronPolicy extends AbstractPolicy { + + private static final Logger log = LoggerFactory.getLogger(CronPolicy.class); + + private static final String CRON_POLICY_PREFIX = FsSourceTaskConfig.POLICY_PREFIX + "cron."; + + public static final String CRON_POLICY_EXPRESSION = CRON_POLICY_PREFIX + "expression"; + public static final String CRON_POLICY_END_DATE = CRON_POLICY_PREFIX + "end_date"; + + private ExecutionTime executionTime; + private Date endDate; + + public CronPolicy(FsSourceTaskConfig conf) throws IOException { + super(conf); + } + + @Override + protected void configPolicy(Map customConfigs) { + try { + if (customConfigs.get(CRON_POLICY_END_DATE) != null && + !customConfigs.get(CRON_POLICY_END_DATE).toString().equals("")) { + endDate = Date.from(LocalDateTime.parse(customConfigs.get(CRON_POLICY_END_DATE).toString().trim()) + .atZone(ZoneId.systemDefault()).toInstant()); + } + executionTime = ExecutionTime.forCron( + new CronParser(CronDefinitionBuilder.instanceDefinitionFor(CronType.QUARTZ)) + .parse(customConfigs.get(CRON_POLICY_EXPRESSION).toString()) + ); + } catch (DateTimeException dte) { + throw new ConfigException(CRON_POLICY_END_DATE + " property must have a proper value. 
Got: '" + + customConfigs.get(CRON_POLICY_END_DATE) + "'."); + } catch (IllegalArgumentException iae) { + throw new ConfigException(CRON_POLICY_EXPRESSION + " property must have a proper value. Got: '" + + customConfigs.get(CRON_POLICY_EXPRESSION) + "'."); + } + } + + @Override + protected void preCheck() { + executionTime.timeToNextExecution(ZonedDateTime.now()) + .ifPresent(next -> { + try { + Thread.sleep(next.toMillis()); + } catch (InterruptedException ie) { + log.warn("An interrupted exception has occurred.", ie); + } + }); + } + + @Override + protected boolean isPolicyCompleted() { + return (endDate != null && + endDate.before(Date.from(LocalDateTime.now().atZone(ZoneId.systemDefault()).toInstant()))) || + !executionTime.timeToNextExecution(ZonedDateTime.now()).isPresent(); + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java index 8c9eba9..4a1aa42 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java @@ -129,12 +129,6 @@ public void recursiveDirectory() throws IOException, InterruptedException { assertFalse(it.hasNext()); } - @Test - public void hasEnded() throws IOException { - policy.execute(); - assertTrue(policy.hasEnded()); - } - @Test public void execPolicyAlreadyEnded() throws IOException { policy.execute(); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/CronPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/CronPolicyTest.java new file mode 100644 index 0000000..26d20d1 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/CronPolicyTest.java @@ -0,0 +1,90 @@ +package com.github.mmolimar.kafka.connect.fs.policy.hdfs; + +import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; +import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; +import com.github.mmolimar.kafka.connect.fs.policy.CronPolicy; +import com.github.mmolimar.kafka.connect.fs.policy.Policy; +import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.connect.errors.IllegalWorkerStateException; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; + +import static org.junit.jupiter.api.Assertions.*; + +public class CronPolicyTest extends HdfsPolicyTestBase { + + @BeforeAll + public static void setUp() throws IOException { + directories = new ArrayList() {{ + add(new Path(fsUri.toString(), UUID.randomUUID().toString())); + add(new Path(fsUri.toString(), UUID.randomUUID().toString())); + }}; + for (Path dir : directories) { + fs.mkdirs(dir); + } + + Map cfg = new HashMap() {{ + String[] uris = directories.stream().map(Path::toString) + .toArray(String[]::new); + put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); + put(FsSourceTaskConfig.TOPIC, "topic_test"); + put(FsSourceTaskConfig.POLICY_CLASS, CronPolicy.class.getName()); + put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); + put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); + 
put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); + put(CronPolicy.CRON_POLICY_EXPRESSION, "0/2 * * * * ?"); + put(CronPolicy.CRON_POLICY_END_DATE, LocalDateTime.now().plusDays(1).toString()); + }}; + taskConfig = new FsSourceTaskConfig(cfg); + } + + @Test + @Override + public void execPolicyAlreadyEnded() throws IOException { + policy.execute(); + policy.interrupt(); + assertTrue(policy.hasEnded()); + assertThrows(IllegalWorkerStateException.class, () -> policy.execute()); + } + + @Test + public void invalidCronExpression() { + Map originals = taskConfig.originalsStrings(); + originals.put(CronPolicy.CRON_POLICY_EXPRESSION, "invalid"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + } + + @Test + public void invalidEndDate() { + Map originals = taskConfig.originalsStrings(); + originals.put(CronPolicy.CRON_POLICY_END_DATE, "invalid"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + } + + @Test + public void canBeInterrupted() throws Throwable { + policy = ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), taskConfig); + + for (int i = 0; i < 5; i++) { + assertFalse(policy.hasEnded()); + policy.execute(); + } + policy.interrupt(); + assertTrue(policy.hasEnded()); + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/CronPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/CronPolicyTest.java new file mode 100644 index 0000000..f054371 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/CronPolicyTest.java @@ -0,0 +1,90 @@ +package com.github.mmolimar.kafka.connect.fs.policy.local; + +import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; +import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; +import com.github.mmolimar.kafka.connect.fs.policy.CronPolicy; +import com.github.mmolimar.kafka.connect.fs.policy.Policy; +import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.connect.errors.IllegalWorkerStateException; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; + +import static org.junit.jupiter.api.Assertions.*; + +public class CronPolicyTest extends LocalPolicyTestBase { + + @BeforeAll + public static void setUp() throws IOException { + directories = new ArrayList() {{ + add(new Path(fsUri.toString(), UUID.randomUUID().toString())); + add(new Path(fsUri.toString(), UUID.randomUUID().toString())); + }}; + for (Path dir : directories) { + fs.mkdirs(dir); + } + + Map cfg = new HashMap() {{ + String[] uris = directories.stream().map(Path::toString) + .toArray(String[]::new); + put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); + put(FsSourceTaskConfig.TOPIC, "topic_test"); + put(FsSourceTaskConfig.POLICY_CLASS, CronPolicy.class.getName()); + put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); + put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); + 
put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); + put(CronPolicy.CRON_POLICY_EXPRESSION, "0/2 * * * * ?"); + put(CronPolicy.CRON_POLICY_END_DATE, LocalDateTime.now().plusDays(1).toString()); + }}; + taskConfig = new FsSourceTaskConfig(cfg); + } + + @Test + @Override + public void execPolicyAlreadyEnded() throws IOException { + policy.execute(); + policy.interrupt(); + assertTrue(policy.hasEnded()); + assertThrows(IllegalWorkerStateException.class, () -> policy.execute()); + } + + @Test + public void invalidCronExpression() { + Map originals = taskConfig.originalsStrings(); + originals.put(CronPolicy.CRON_POLICY_EXPRESSION, "invalid"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + } + + @Test + public void invalidEndDate() { + Map originals = taskConfig.originalsStrings(); + originals.put(CronPolicy.CRON_POLICY_END_DATE, "invalid"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + } + + @Test + public void canBeInterrupted() throws Throwable { + policy = ReflectionUtils.makePolicy( + (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), taskConfig); + + for (int i = 0; i < 5; i++) { + assertFalse(policy.hasEnded()); + policy.execute(); + } + policy.interrupt(); + assertTrue(policy.hasEnded()); + } +} From 21bd7616145fa831158070dc80916fa0ae047a4c Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 8 Mar 2020 14:39:35 +0100 Subject: [PATCH 16/51] Fix in HDFS file watcher policy to avoid events for files which are being copied --- .../fs/policy/HdfsFileWatcherPolicy.java | 17 ++++++++++++++--- .../policy/hdfs/HdfsFileWatcherPolicyTest.java | 10 ---------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java index dd558d6..a6505a3 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java @@ -112,13 +112,24 @@ public void run() { for (Event event : batch.getEvents()) { switch (event.getEventType()) { case CREATE: - enqueue(((Event.CreateEvent) event).getPath()); + if (!((Event.CreateEvent) event).getPath().endsWith("._COPYING_")) { + enqueue(((Event.CreateEvent) event).getPath()); + } break; case APPEND: - enqueue(((Event.AppendEvent) event).getPath()); + if (!((Event.AppendEvent) event).getPath().endsWith("._COPYING_")) { + enqueue(((Event.AppendEvent) event).getPath()); + } + break; + case RENAME: + if (((Event.RenameEvent) event).getSrcPath().endsWith("._COPYING_")) { + enqueue(((Event.RenameEvent) event).getDstPath()); + } break; case CLOSE: - enqueue(((Event.CloseEvent) event).getPath()); + if (!((Event.CloseEvent) event).getPath().endsWith("._COPYING_")) { + enqueue(((Event.CloseEvent) event).getPath()); + } break; default: break; diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java index dca39be..ec68d68 100644 
--- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java @@ -58,16 +58,6 @@ public void invalidDirectory() throws IOException { } } - //This policy never ends at least all watchers die - @Test - @Override - public void hasEnded() throws IOException { - policy.execute(); - assertFalse(policy.hasEnded()); - policy.interrupt(); - assertTrue(policy.hasEnded()); - } - //This policy never ends. We have to interrupt it @Test @Override From e5102e5fe803c8b518619359f5bbfe66dcd7f213 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 8 Mar 2020 14:41:33 +0100 Subject: [PATCH 17/51] Adding Cron policy to docs --- docs/source/config_options.rst | 65 ++++++++++++++++++++++------------ docs/source/policies.rst | 10 ++++++ 2 files changed, 53 insertions(+), 22 deletions(-) diff --git a/docs/source/config_options.rst b/docs/source/config_options.rst index 962f572..bc89498 100644 --- a/docs/source/config_options.rst +++ b/docs/source/config_options.rst @@ -75,7 +75,7 @@ General config properties for this connector. Flag to activate traversed recursion in subdirectories when listing files. * Type: boolean - * Default: false + * Default: ``false`` * Importance: medium ``policy.regexp`` @@ -142,7 +142,7 @@ In order to configure custom properties for this policy, the name you must use i Sleep fraction to divide the sleep time to allow interrupting the policy faster. * Type: long - * Default: 10 + * Default: ``10`` * Importance: medium ``policy.sleepy.max_execs`` @@ -150,7 +150,28 @@ In order to configure custom properties for this policy, the name you must use i An execution represents: listing files from the FS and its corresponding sleep time. * Type: long - * Default: -1 + * Default: ``-1`` + * Importance: medium + +.. _config_options-policies-cron: + +Cron +-------------------------------------------- + +In order to configure custom properties for this policy, the name you must use is ``cron``. + +``policy.cron.expression`` + Cron expression to schedule the policy. + + * Type: string + * Importance: high + +``policy.cron.end_date`` + End date to finish the policy with `ISO date-time `__ + format. + + * Type: date + * Default: ``null`` * Importance: medium .. _config_options-policies-hdfs: @@ -212,21 +233,21 @@ In order to configure custom properties for this reader, the name you must use i Custom field name for the output key to include in the Kafka message. * Type: string - * Default: key + * Default: ``key`` * Importance: medium ``file_reader.sequence.field_name.value`` Custom field name for the output value to include in the Kafka message. * Type: string - * Default: value + * Default: ``value`` * Importance: medium ``file_reader.sequence.buffer_size`` Custom buffer size to read data from the Sequence file. * Type: int - * Default: 4096 + * Default: ``4096`` * Importance: low .. _config_options-filereaders-json: @@ -241,7 +262,7 @@ To configure custom properties for this reader, the name you must use is ``json` content of the file as a record. * Type: boolean - * Default: true + * Default: ``true`` * Importance: medium ``file_reader.json.deserialization.`` @@ -255,7 +276,7 @@ To configure custom properties for this reader, the name you must use is ``json` Compression type to use when reading a file. 
* Type: enum (available values ``bzip2``, ``gzip`` and ``none``) - * Default: none + * Default: ``none`` * Importance: medium ``file_reader.json.compression.concatenated`` @@ -263,7 +284,7 @@ To configure custom properties for this reader, the name you must use is ``json` the first compressed stream. * Type: boolean - * Default: true + * Default: ``true`` * Importance: low ``file_reader.json.encoding`` @@ -284,14 +305,14 @@ To configure custom properties for this reader, the name you must use is ``text` content of the file as a record. * Type: boolean - * Default: true + * Default: ``true`` * Importance: medium ``file_reader.json.compression.type`` Compression type to use when reading a file. * Type: enum (available values ``bzip2``, ``gzip`` and ``none``) - * Default: none + * Default: ``none`` * Importance: medium ``file_reader.json.compression.concatenated`` @@ -299,14 +320,14 @@ To configure custom properties for this reader, the name you must use is ``text` the first compressed stream. * Type: boolean - * Default: true + * Default: ``true`` * Importance: low ``file_reader.text.field_name.value`` Custom field name for the output value to include in the Kafka message. * Type: string - * Default: value + * Default: ``value`` * Importance: low ``file_reader.text.encoding`` @@ -332,7 +353,7 @@ To configure custom properties for this reader, the name you must use is ``delim If the file contains header or not. * Type: boolean - * Default: false + * Default: ``false`` * Importance: medium ``file_reader.json.record_per_line`` @@ -340,7 +361,7 @@ To configure custom properties for this reader, the name you must use is ``delim content of the file as a record. * Type: boolean - * Default: true + * Default: ``true`` * Importance: medium ``file_reader.delimited.default_value`` @@ -355,7 +376,7 @@ To configure custom properties for this reader, the name you must use is ``delim Compression type to use when reading a file. * Type: enum (available values ``bzip2``, ``gzip`` and ``none``) - * Default: none + * Default: ``none`` * Importance: medium ``file_reader.json.compression.concatenated`` @@ -363,7 +384,7 @@ To configure custom properties for this reader, the name you must use is ``delim the first compressed stream. * Type: boolean - * Default: true + * Default: ``true`` * Importance: low ``file_reader.delimited.encoding`` @@ -381,35 +402,35 @@ To configure custom properties for this reader, the name you must use is ``agnos A comma-separated string list with the accepted extensions for Parquet files. * Type: string - * Default: parquet + * Default: ``parquet`` * Importance: medium ``file_reader.agnostic.extensions.avro`` A comma-separated string list with the accepted extensions for Avro files. * Type: string - * Default: avro + * Default: ``avro`` * Importance: medium ``file_reader.agnostic.extensions.sequence`` A comma-separated string list with the accepted extensions for Sequence files. * Type: string - * Default: seq + * Default: ``seq`` * Importance: medium ``file_reader.agnostic.extensions.json`` A comma-separated string list with the accepted extensions for JSON files. * Type: string - * Default: json + * Default: ``json`` * Importance: medium ``file_reader.agnostic.extensions.delimited`` A comma-separated string list with the accepted extensions for Delimited text files. * Type: string - * Default: tsv,csv + * Default: ``tsv,csv`` * Importance: medium .. note:: The Agnostic reader uses the previous ones as inner readers. 
So, in case of using this diff --git a/docs/source/policies.rst b/docs/source/policies.rst index 2e496be..dc0f607 100644 --- a/docs/source/policies.rst +++ b/docs/source/policies.rst @@ -14,6 +14,16 @@ and wait for the next one. Additionally, its custom properties allow to end it. You can learn more about the properties of this policy :ref:`here`. +Cron +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This policy is scheduled based on cron expressions; the expression format used in the configuration +follows the `Quartz Scheduler `__ library. + +After finishing each execution, the policy sleeps until the next scheduled execution, if applicable. + +You can learn more about the properties of this policy :ref:`here`. + HDFS file watcher ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From f0f105a9439903fd65b3b35c70daf1867995fdb0 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Mon, 9 Mar 2020 05:17:29 +0100 Subject: [PATCH 18/51] Fix offset indexes in readers --- .../connect/fs/file/reader/AvroFileReader.java | 2 +- .../file/reader/DelimitedTextFileReader.java | 6 +----- .../fs/file/reader/ParquetFileReader.java | 4 ++-- .../fs/file/reader/SequenceFileReader.java | 4 ++-- .../connect/fs/file/reader/TextFileReader.java | 10 ++++++---- .../fs/file/reader/FileReaderTestBase.java | 6 +++--- .../hdfs/DelimitedTextFileReaderTest.java | 18 ++++++++++-------- .../local/DelimitedTextFileReaderTest.java | 8 ++++---- 8 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java index 040d837..6dd8d8a 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java @@ -73,7 +73,7 @@ protected GenericRecord nextRecord() { public void seek(Offset offset) { try { reader.sync(offset.getRecordOffset()); - this.offset.setOffset(reader.previousSync() - 15); + this.offset.setOffset(reader.previousSync() - 16); } catch (IOException ioe) { throw new ConnectException("Error seeking file " + getFilePath(), ioe); } } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java index 0cae141..19fd83f 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java @@ -49,11 +49,7 @@ public DelimitedTextFileReader(FileSystem fs, Path filePath, Map String columnName = hasHeader ? 
columns[index] : DEFAULT_COLUMN_NAME + "_" + ++index; schemaBuilder.field(columnName, SchemaBuilder.STRING_SCHEMA); }); - - if (!hasHeader) { - //back to the first line - inner.seek(this.offset); - } + inner.seek(this.offset); } this.schema = schemaBuilder.build(); } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java index 8e5fd33..cf12483 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java @@ -74,7 +74,6 @@ public boolean hasNext() { if (currentRecord == null) { try { currentRecord = reader.read(); - if (currentRecord != null) offset.inc(); } catch (IOException ioe) { throw new ConnectException("Error reading parquet record", ioe); } @@ -95,6 +94,7 @@ record = new GenericData.Record(this.projection); record = currentRecord; } currentRecord = null; + offset.inc(); return record; } @@ -115,7 +115,7 @@ public void seek(Offset offset) { throw new ConnectException("Error initializing parquet reader", ioe); } } - while (hasNext() && this.offset.getRecordOffset() <= offset.getRecordOffset()) { + while (hasNext() && this.offset.getRecordOffset() < offset.getRecordOffset()) { nextRecord(); } } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java index 595c340..15a9d8f 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java @@ -91,7 +91,7 @@ public boolean hasNext() { if (hasNextIndex == -1 || hasNextIndex == recordIndex) { hasNextIndex++; offset.inc(); - return hasNext = reader.next(key, value); + hasNext = reader.next(key, value); } return hasNext; } catch (EOFException eofe) { @@ -119,7 +119,7 @@ public void seek(Offset offset) { reader.sync(offset.getRecordOffset()); hasNextIndex = recordIndex = offset.getRecordOffset(); hasNext = false; - this.offset.setOffset(offset.getRecordOffset()); + this.offset.setOffset(offset.getRecordOffset() - 1); } catch (IOException ioe) { throw new ConnectException("Error seeking file " + getFilePath(), ioe); } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java index 28f5c48..bb3d634 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java @@ -112,14 +112,12 @@ public boolean hasNext() { try { if (!recordPerLine) { List lines = new BufferedReader(reader).lines().collect(Collectors.toList()); - offset.setOffset(lines.size() - 1); current = String.join("\n", lines); finished = true; return true; } for (; ; ) { String line = reader.readLine(); - offset.setOffset(reader.getLineNumber()); if (line == null) { finished = true; return false; @@ -140,7 +138,7 @@ protected TextRecord nextRecord() { } String aux = current; current = null; - + offset.inc(); return new TextRecord(schema, aux); } @@ -159,7 +157,7 @@ public void seek(Offset offset) { while (reader.getLineNumber() < offset.getRecordOffset()) { reader.readLine(); } - this.offset.setOffset(reader.getLineNumber() + 1); + 
this.offset.setOffset(reader.getLineNumber()); } catch (IOException ioe) { throw new ConnectException("Error seeking file " + getFilePath(), ioe); } @@ -186,6 +184,10 @@ public void setOffset(long offset) { this.offset = offset; } + void inc() { + this.offset++; + } + @Override public long getRecordOffset() { return offset; diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java index c8eec79..d9cc9f4 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java @@ -105,19 +105,19 @@ public void seekFile() { int recordIndex = NUM_RECORDS / 2; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex))); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = 0; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex))); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = NUM_RECORDS - 3; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex))); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1)); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java index f4b6c92..7914f12 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java @@ -50,7 +50,7 @@ private static Path createDataFile(boolean header) throws IOException { String value = String.format("%d_%s", index, UUID.randomUUID()); try { writer.append(value + "," + value + "," + value + "," + value + "\n"); - if (header) OFFSETS_BY_INDEX.put(index, (long) index); + OFFSETS_BY_INDEX.put(index, (long) index); } catch (IOException ioe) { throw new RuntimeException(ioe); } @@ -132,10 +132,12 @@ public void readAllDataWithMalformedRows() throws Throwable { int recordCount = 0; while (reader.hasNext()) { Struct record = reader.next(); - assertEquals("dummy", record.get(FIELD_COLUMN1)); - assertEquals("custom_value", record.get(FIELD_COLUMN2)); - assertEquals("custom_value", record.get(FIELD_COLUMN3)); - assertEquals("custom_value", record.get(FIELD_COLUMN4)); + assertAll( + () -> assertEquals("dummy", record.get(FIELD_COLUMN1)), + () -> assertEquals("custom_value", record.get(FIELD_COLUMN2)), + () -> assertEquals("custom_value", record.get(FIELD_COLUMN3)), + () -> assertEquals("custom_value", record.get(FIELD_COLUMN4)) + ); recordCount++; } assertEquals(2, recordCount, () -> "The number of records in the file does not match"); @@ -154,19 +156,19 @@ public void 
seekFileWithoutHeader() throws Throwable { int recordIndex = NUM_RECORDS / 2; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = 0; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = NUM_RECORDS - 3; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1, false)); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java index 91f08e9..e8413ad 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java @@ -51,7 +51,7 @@ private static Path createDataFile(boolean header) throws IOException { String value = String.format("%d_%s", index, UUID.randomUUID()); try { writer.append(value + "," + value + "," + value + "," + value + "\n"); - if (header) OFFSETS_BY_INDEX.put(index, (long) index); + OFFSETS_BY_INDEX.put(index, (long) index); } catch (IOException ioe) { throw new RuntimeException(ioe); } @@ -163,19 +163,19 @@ public void seekFileWithoutHeader() throws Throwable { int recordIndex = NUM_RECORDS / 2; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = 0; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = NUM_RECORDS - 3; reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex) + 1, reader.currentOffset().getRecordOffset()); + assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1, false)); From 1a11a893b02f72b7c440e2adf5a345c4dad8f680 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Thu, 12 Mar 2020 20:15:40 -0600 Subject: [PATCH 19/51] Compression support for text-based file readers --- .../kafka/connect/fs/FsSourceConnector.java | 1 - .../fs/file/reader/AbstractFileReader.java | 9 +++++--- 
.../fs/file/reader/AvroFileReader.java | 1 + .../fs/file/reader/CompressionType.java | 23 +++++++++++++++++++ .../fs/file/reader/JsonFileReader.java | 5 ++++ .../fs/file/reader/TextFileReader.java | 22 ------------------ .../file/reader/hdfs/TextFileReaderTest.java | 9 ++++---- .../file/reader/local/TextFileReaderTest.java | 10 ++++---- 8 files changed, 46 insertions(+), 34 deletions(-) create mode 100644 src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CompressionType.java diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java index 0d4ad3e..0316acd 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java @@ -31,7 +31,6 @@ public void start(Map properties) { log.info("Starting FsSourceConnector..."); try { config = new FsSourceConnectorConfig(properties); - } catch (ConfigException ce) { log.error("Couldn't start FsSourceConnector:", ce); throw new ConnectException("Couldn't start FsSourceConnector due to configuration error.", ce); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java index 180b0e2..dae25af 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java @@ -16,7 +16,7 @@ public abstract class AbstractFileReader implements FileReader { private final FileSystem fs; private final Path filePath; - private ReaderAdapter adapter; + private final ReaderAdapter adapter; public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter, Map config) { if (fs == null || filePath == null) { @@ -26,11 +26,14 @@ public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter this.filePath = filePath; this.adapter = adapter; - Map readerConf = config.entrySet().stream() + configure(readerConfig(config)); + } + + protected final Map readerConfig(Map config) { + return config.entrySet().stream() .filter(entry -> entry.getKey().startsWith(FILE_READER_PREFIX)) .filter(entry -> entry.getValue() != null) .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().toString())); - configure(readerConf); } protected abstract void configure(Map config); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java index 6dd8d8a..2438f51 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java @@ -42,6 +42,7 @@ public AvroFileReader(FileSystem fs, Path filePath, Map config) this.offset = new AvroOffset(0); } + @Override protected void configure(Map config) { this.schema = Optional.ofNullable(config.get(FILE_READER_AVRO_SCHEMA)) .map(c -> new Schema.Parser().parse(c)) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CompressionType.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CompressionType.java new file mode 100644 index 0000000..9dade35 --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CompressionType.java @@ -0,0 +1,23 @@ +package 
com.github.mmolimar.kafka.connect.fs.file.reader; + +public enum CompressionType { + BZIP2, + GZIP, + NONE; + + private boolean concatenated; + + CompressionType() { + this.concatenated = true; + } + + public boolean isConcatenated() { + return concatenated; + } + + public static CompressionType fromName(String compression, boolean concatenated) { + CompressionType ct = CompressionType.valueOf(compression.trim().toUpperCase()); + ct.concatenated = concatenated; + return ct; + } +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java index 3e8f411..cf26a34 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java @@ -23,10 +23,13 @@ public class JsonFileReader extends AbstractFileReader { private static final String FILE_READER_JSON = FILE_READER_PREFIX + "json."; + private static final String FILE_READER_JSON_COMPRESSION = FILE_READER_JSON + "compression."; public static final String FILE_READER_JSON_DESERIALIZATION_CONFIGS = FILE_READER_JSON + "deserialization."; public static final String FILE_READER_JSON_RECORD_PER_LINE = FILE_READER_JSON + "record_per_line"; public static final String FILE_READER_JSON_ENCODING = FILE_READER_JSON + "encoding"; + public static final String FILE_READER_JSON_COMPRESSION_TYPE = FILE_READER_JSON_COMPRESSION + "type"; + public static final String FILE_READER_JSON_COMPRESSION_CONCATENATED = FILE_READER_JSON_COMPRESSION + "concatenated"; private final TextFileReader inner; private final Schema schema; @@ -37,6 +40,8 @@ public JsonFileReader(FileSystem fs, Path filePath, Map config) config.put(TextFileReader.FILE_READER_TEXT_ENCODING, config.get(FILE_READER_JSON_ENCODING)); config.put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, config.get(FILE_READER_JSON_RECORD_PER_LINE)); + config.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, config.get(FILE_READER_JSON_COMPRESSION_TYPE)); + config.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_CONCATENATED, config.get(FILE_READER_JSON_COMPRESSION_CONCATENATED)); this.inner = new TextFileReader(fs, filePath, config); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java index bb3d634..a12323e 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java @@ -42,28 +42,6 @@ public class TextFileReader extends AbstractFileReader config) throws IOException { super(fs, filePath, new TxtToStruct(), config); this.reader = new LineNumberReader(getFileReader(fs.open(filePath))); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java index 79e8b88..fdb3004 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java @@ -2,6 +2,7 @@ import com.github.mmolimar.kafka.connect.fs.file.Offset; import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; +import com.github.mmolimar.kafka.connect.fs.file.reader.CompressionType; import 
com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; @@ -25,7 +26,7 @@ public class TextFileReaderTest extends HdfsFileReaderTestBase { private static final String FIELD_NAME_VALUE = "custom_field_name"; private static final String FILE_EXTENSION = "txt"; - private static final TextFileReader.CompressionType COMPRESSION_TYPE = TextFileReader.CompressionType.GZIP; + private static final CompressionType COMPRESSION_TYPE = CompressionType.GZIP; @BeforeAll public static void setUp() throws IOException { @@ -38,7 +39,7 @@ public static void setUp() throws IOException { }}; } - private static OutputStream getOutputStream(File file, TextFileReader.CompressionType compression) throws IOException { + private static OutputStream getOutputStream(File file, CompressionType compression) throws IOException { final OutputStream os; switch (compression) { case BZIP2: @@ -54,7 +55,7 @@ private static OutputStream getOutputStream(File file, TextFileReader.Compressio return os; } - private static Path createDataFile(TextFileReader.CompressionType compression) throws IOException { + private static Path createDataFile(CompressionType compression) throws IOException { File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { IntStream.range(0, NUM_RECORDS).forEach(index -> { @@ -112,7 +113,7 @@ public void readDataWithRecordPerLineDisabled() throws Throwable { @Test public void readDifferentCompressionTypes() { - Arrays.stream(TextFileReader.CompressionType.values()).forEach(compressionType -> { + Arrays.stream(CompressionType.values()).forEach(compressionType -> { try { Path file = createDataFile(compressionType); FileReader reader = getReader(fs, file, new HashMap() {{ diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java index edb26e4..fd80931 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java @@ -2,6 +2,7 @@ import com.github.mmolimar.kafka.connect.fs.file.Offset; import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; +import com.github.mmolimar.kafka.connect.fs.file.reader.CompressionType; import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; @@ -25,7 +26,7 @@ public class TextFileReaderTest extends LocalFileReaderTestBase { private static final String FIELD_NAME_VALUE = "custom_field_name"; private static final String FILE_EXTENSION = "txt"; - private static final TextFileReader.CompressionType COMPRESSION_TYPE = TextFileReader.CompressionType.GZIP; + private static final CompressionType COMPRESSION_TYPE = CompressionType.GZIP; @BeforeAll public static void setUp() throws IOException { @@ -38,7 +39,7 @@ public static void setUp() throws IOException { }}; } - private static OutputStream getOutputStream(File file, TextFileReader.CompressionType compression) throws IOException { + private static OutputStream getOutputStream(File file, CompressionType compression) throws 
IOException { final OutputStream os; switch (compression) { case BZIP2: @@ -54,7 +55,8 @@ private static OutputStream getOutputStream(File file, TextFileReader.Compressio return os; } - private static Path createDataFile(TextFileReader.CompressionType compression) throws IOException { + + private static Path createDataFile(CompressionType compression) throws IOException { File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { IntStream.range(0, NUM_RECORDS).forEach(index -> { @@ -112,7 +114,7 @@ public void readDataWithRecordPerLineDisabled() throws Throwable { @Test public void readDifferentCompressionTypes() { - Arrays.stream(TextFileReader.CompressionType.values()).forEach(compressionType -> { + Arrays.stream(CompressionType.values()).forEach(compressionType -> { try { Path file = createDataFile(compressionType); FileReader reader = getReader(fs, file, new HashMap() {{ From 5ee8eb4c9b928a3f96e6e611a096effc8cd75645 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Thu, 12 Mar 2020 20:17:24 -0600 Subject: [PATCH 20/51] Avoid verbosity in logs from HDFS for tests --- src/test/resources/log4j.properties | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties index 18e87a8..493f160 100644 --- a/src/test/resources/log4j.properties +++ b/src/test/resources/log4j.properties @@ -7,7 +7,8 @@ log4j.appender.stdout.Target=System.out log4j.appender.stdout.layout=org.apache.log4j.PatternLayout log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c:%L - %m%n -log4j.logger.org.apache.hadoop=WARN +log4j.logger.org.apache.hadoop=ERROR +log4j.logger.BlockStateChange=WARN log4j.logger.org.apache.parquet=WARN log4j.logger.org.eclipse.jetty=WARN log4j.logger.io.confluent.connect.avro=WARN From 22de7392996c18a6b587ddec2fd3093167f06b17 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Fri, 13 Mar 2020 18:07:29 -0600 Subject: [PATCH 21/51] Removed delimited file reader --- .../file/reader/DelimitedTextFileReader.java | 153 ------------ .../fs/file/reader/SequenceFileReader.java | 8 +- .../hdfs/DelimitedTextFileReaderTest.java | 221 ----------------- .../local/DelimitedTextFileReaderTest.java | 230 ------------------ 4 files changed, 4 insertions(+), 608 deletions(-) delete mode 100644 src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java deleted file mode 100644 index 19fd83f..0000000 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java +++ /dev/null @@ -1,153 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Schema; -import org.apache.kafka.connect.data.SchemaBuilder; -import org.apache.kafka.connect.data.Struct; - -import java.io.IOException; -import java.util.Map; -import java.util.Optional; -import 
java.util.stream.IntStream; - -import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; - -public class DelimitedTextFileReader extends AbstractFileReader { - - private static final String FILE_READER_DELIMITED = FILE_READER_PREFIX + "delimited."; - - public static final String FILE_READER_DELIMITED_HEADER = FILE_READER_DELIMITED + "header"; - public static final String FILE_READER_DELIMITED_TOKEN = FILE_READER_DELIMITED + "token"; - public static final String FILE_READER_DELIMITED_ENCODING = FILE_READER_DELIMITED + "encoding"; - public static final String FILE_READER_DELIMITED_DEFAULT_VALUE = FILE_READER_DELIMITED + "default_value"; - - private static final String DEFAULT_COLUMN_NAME = "column"; - - private final TextFileReader inner; - private final Schema schema; - private DelimitedTextOffset offset; - private String token; - private String defaultValue; - private boolean hasHeader; - - public DelimitedTextFileReader(FileSystem fs, Path filePath, Map config) throws IOException { - super(fs, filePath, new DelimitedTxtToStruct(), config); - - config.put(TextFileReader.FILE_READER_TEXT_ENCODING, config.get(FILE_READER_DELIMITED_ENCODING)); - config.put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, "true"); - - this.inner = new TextFileReader(fs, filePath, config); - this.offset = new DelimitedTextOffset(0, hasHeader); - - SchemaBuilder schemaBuilder = SchemaBuilder.struct(); - if (hasNext()) { - String firstLine = inner.nextRecord().getValue(); - String[] columns = firstLine.split(token); - IntStream.range(0, columns.length).forEach(index -> { - String columnName = hasHeader ? columns[index] : DEFAULT_COLUMN_NAME + "_" + ++index; - schemaBuilder.field(columnName, SchemaBuilder.STRING_SCHEMA); - }); - inner.seek(this.offset); - } - this.schema = schemaBuilder.build(); - } - - @Override - protected void configure(Map config) { - this.token = Optional.ofNullable(config.get(FILE_READER_DELIMITED_TOKEN)) - .filter(t -> !t.isEmpty()) - .orElseThrow(() -> new IllegalArgumentException( - FILE_READER_DELIMITED_TOKEN + " property cannot be empty for DelimitedTextFileReader") - ); - this.defaultValue = config.get(FILE_READER_DELIMITED_DEFAULT_VALUE); - this.hasHeader = Boolean.parseBoolean(config.getOrDefault(FILE_READER_DELIMITED_HEADER, "false")); - } - - @Override - protected DelimitedRecord nextRecord() { - offset.inc(); - String[] values = inner.nextRecord().getValue().split(token); - return new DelimitedRecord(schema, defaultValue != null ? fillNullValues(values) : values); - } - - private String[] fillNullValues(final String[] values) { - return IntStream.range(0, schema.fields().size()) - .mapToObj(index -> { - if (index < values.length) { - return values[index]; - } else { - return defaultValue; - } - }).toArray(String[]::new); - } - - @Override - public boolean hasNext() { - return inner.hasNext(); - } - - @Override - public void seek(Offset offset) { - inner.seek(offset); - this.offset.setOffset(inner.currentOffset().getRecordOffset()); - } - - @Override - public Offset currentOffset() { - return offset; - } - - @Override - public void close() throws IOException { - inner.close(); - } - - public static class DelimitedTextOffset implements Offset { - private long offset; - private boolean hasHeader; - - public DelimitedTextOffset(long offset, boolean hasHeader) { - this.hasHeader = hasHeader; - this.offset = hasHeader && offset >= 0 ? offset + 1 : offset; - } - - public void setOffset(long offset) { - this.offset = hasHeader && offset > 0 ? 
offset - 1 : offset; - } - - void inc() { - this.offset++; - } - - @Override - public long getRecordOffset() { - return offset; - } - } - - static class DelimitedTxtToStruct implements ReaderAdapter { - - @Override - public Struct apply(DelimitedRecord record) { - Struct struct = new Struct(record.schema); - IntStream.range(0, record.schema.fields().size()).forEach(index -> { - if (index < record.values.length) { - struct.put(record.schema.fields().get(index).name(), record.values[index]); - } - }); - return struct; - } - } - - static class DelimitedRecord { - private final Schema schema; - private final String[] values; - - DelimitedRecord(Schema schema, String[] values) { - this.schema = schema; - this.values = values; - } - } -} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java index 15a9d8f..bdde95b 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java @@ -37,7 +37,7 @@ public class SequenceFileReader extends AbstractFileReader config) throws IOException { super(fs, filePath, new SeqToStruct(), config); @@ -54,7 +54,7 @@ public SequenceFileReader(FileSystem fs, Path filePath, Map conf this.offset = new SeqOffset(0); this.recordIndex = this.hasNextIndex = -1; this.hasNext = false; - this.isClosed = false; + this.closed = false; } @Override @@ -86,7 +86,7 @@ private Schema getSchema(Writable writable) { @Override public boolean hasNext() { - if (isClosed) throw new IllegalStateException("Reader already closed."); + if (closed) throw new IllegalStateException("Reader already closed."); try { if (hasNextIndex == -1 || hasNextIndex == recordIndex) { hasNextIndex++; @@ -132,7 +132,7 @@ public Offset currentOffset() { @Override public void close() throws IOException { - isClosed = true; + closed = true; reader.close(); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java deleted file mode 100644 index 7914f12..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/DelimitedTextFileReaderTest.java +++ /dev/null @@ -1,221 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.hdfs; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.DelimitedTextFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.charset.UnsupportedCharsetException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.jupiter.api.Assertions.*; - -public class DelimitedTextFileReaderTest extends HdfsFileReaderTestBase { - - private static final String FIELD_COLUMN1 = "column_1"; - private static final String FIELD_COLUMN2 = "column_2"; - private static final String FIELD_COLUMN3 = 
"column_3"; - private static final String FIELD_COLUMN4 = "column_4"; - private static final String FILE_EXTENSION = "csv"; - - @BeforeAll - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(true); - readerConfig = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - }}; - } - - private static Path createDataFile(boolean header) throws IOException { - File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); - try (FileWriter writer = new FileWriter(txtFile)) { - - if (header) - writer.append(FIELD_COLUMN1 + "," + FIELD_COLUMN2 + "," + FIELD_COLUMN3 + "," + FIELD_COLUMN4 + "\n"); - IntStream.range(0, NUM_RECORDS).forEach(index -> { - String value = String.format("%d_%s", index, UUID.randomUUID()); - try { - writer.append(value + "," + value + "," + value + "," + value + "\n"); - OFFSETS_BY_INDEX.put(index, (long) index); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), txtFile.getName()); - fs.moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); - return path; - } - - @Test - public void emptyFile() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); - } - - @Test - public void invalidFileFormat() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { - writer.write("test"); - } - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); - } - - @Test - public void invaliConfigArgs() { - try { - readerClass.getConstructor(FileSystem.class, Path.class, Map.class).newInstance(fs, dataFile, new HashMap<>()); - } catch (Exception e) { - assertThrows(IllegalArgumentException.class, () -> { - throw e.getCause(); - }); - } - } - - @Test - public void readAllDataWithoutHeader() throws Throwable { - Path file = createDataFile(false); - FileReader reader = getReader(fs, file, new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "false"); - }}); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - checkData(record, recordCount); - recordCount++; - } - assertEquals(NUM_RECORDS, recordCount, () -> "The number of records in the file does not match"); - } - - @Test - public void readAllDataWithMalformedRows() throws Throwable { - File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); - try (FileWriter writer = new FileWriter(tmp)) { - writer.append(FIELD_COLUMN1 + "," + FIELD_COLUMN2 + "," + FIELD_COLUMN3 + "," + FIELD_COLUMN4 + "\n"); - writer.append("dummy\n"); - writer.append("dummy\n"); - } - Map cfg = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_DEFAULT_VALUE, "custom_value"); - }}; - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - reader = getReader(fs, path, cfg); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - assertAll( - () -> assertEquals("dummy", record.get(FIELD_COLUMN1)), - () -> assertEquals("custom_value", record.get(FIELD_COLUMN2)), - () -> assertEquals("custom_value", record.get(FIELD_COLUMN3)), - () -> assertEquals("custom_value", record.get(FIELD_COLUMN4)) - ); - recordCount++; - } - assertEquals(2, recordCount, () -> "The number of records in the file does not match"); - } - - @Test - public void seekFileWithoutHeader() throws Throwable { - Path file = createDataFile(false); - FileReader reader = getReader(fs, file, new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "false"); - }}); - - assertTrue(reader.hasNext()); - - int recordIndex = NUM_RECORDS / 2; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); - assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); - checkData(reader.next(), recordIndex); - - recordIndex = 0; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); - assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); - checkData(reader.next(), recordIndex); - - recordIndex = NUM_RECORDS - 3; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); - assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); - checkData(reader.next(), recordIndex); - - reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1, false)); - assertFalse(reader.hasNext()); - } - - @Test - public void validFileEncoding() throws Throwable { - Map cfg = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_ENCODING, "Cp1252"); - }}; - getReader(fs, dataFile, cfg); - } - - @Test - public void invalidFileEncoding() { - Map cfg = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_ENCODING, "invalid_charset"); - }}; - assertThrows(UnsupportedCharsetException.class, () -> getReader(fs, dataFile, cfg)); - } - - @Override - protected Offset getOffset(long offset) { - return getOffset(offset, true); - } - - private Offset getOffset(long offset, boolean hasHeader) { - return new DelimitedTextFileReader.DelimitedTextOffset(offset, hasHeader); - } - - @Override - protected void checkData(Struct record, long index) { - assertAll( - () -> assertTrue(record.get(FIELD_COLUMN1).toString().startsWith(index + "_")), - () -> 
assertTrue(record.get(FIELD_COLUMN2).toString().startsWith(index + "_")), - () -> assertTrue(record.get(FIELD_COLUMN3).toString().startsWith(index + "_")), - () -> assertTrue(record.get(FIELD_COLUMN4).toString().startsWith(index + "_")) - ); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java deleted file mode 100644 index e8413ad..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/DelimitedTextFileReaderTest.java +++ /dev/null @@ -1,230 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.local; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.DelimitedTextFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.charset.UnsupportedCharsetException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.jupiter.api.Assertions.*; - -public class DelimitedTextFileReaderTest extends LocalFileReaderTestBase { - - private static final String FIELD_COLUMN1 = "column_1"; - private static final String FIELD_COLUMN2 = "column_2"; - private static final String FIELD_COLUMN3 = "column_3"; - private static final String FIELD_COLUMN4 = "column_4"; - private static final String FILE_EXTENSION = "tcsv"; - - @BeforeAll - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(true); - readerConfig = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, FILE_EXTENSION); - }}; - } - - private static Path createDataFile(boolean header) throws IOException { - File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); - try (FileWriter writer = new FileWriter(txtFile)) { - - if (header) - writer.append(FIELD_COLUMN1 + "," + FIELD_COLUMN2 + "," + FIELD_COLUMN3 + "," + FIELD_COLUMN4 + "\n"); - IntStream.range(0, NUM_RECORDS).forEach(index -> { - String value = String.format("%d_%s", index, UUID.randomUUID()); - try { - writer.append(value + "," + value + "," + value + "," + value + "\n"); - OFFSETS_BY_INDEX.put(index, (long) index); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), txtFile.getName()); - fs.moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); - return path; - } - - @Test - public void emptyFile() throws Throwable { - File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); - } - - @Test - public void invalidFileFormat() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { - writer.write("test"); - } - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); - } - - @Test - public void invaliConfigArgs() { - try { - readerClass.getConstructor(FileSystem.class, Path.class, Map.class).newInstance(fs, dataFile, - new HashMap() {{ - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, FILE_EXTENSION); - }}); - } catch (Exception e) { - assertThrows(IllegalArgumentException.class, () -> { - throw e.getCause(); - }); - } - } - - @Test - public void readAllDataWithoutHeader() throws Throwable { - Path file = createDataFile(false); - FileReader reader = getReader(fs, file, new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "false"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, getFileExtension()); - }}); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - checkData(record, recordCount); - recordCount++; - } - assertEquals(NUM_RECORDS, recordCount, () -> "The number of records in the file does not match"); - } - - @Test - public void readAllDataWithMalformedRows() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - try (FileWriter writer = new FileWriter(tmp)) { - writer.append(FIELD_COLUMN1 + "," + FIELD_COLUMN2 + "," + FIELD_COLUMN3 + "," + FIELD_COLUMN4 + "\n"); - writer.append("dummy\n"); - writer.append("dummy\n"); - } - Map cfg = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_DEFAULT_VALUE, "custom_value"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, getFileExtension()); - }}; - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - reader = getReader(fs, path, cfg); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - assertAll( - () -> assertEquals("dummy", record.get(FIELD_COLUMN1)), - () -> assertEquals("custom_value", record.get(FIELD_COLUMN2)), - () -> assertEquals("custom_value", record.get(FIELD_COLUMN3)), - () -> assertEquals("custom_value", record.get(FIELD_COLUMN4)) - ); - recordCount++; - } - assertEquals(2, recordCount, () -> "The number of records in the file does not match"); - } - - @Test - public void seekFileWithoutHeader() throws Throwable { - Path file = createDataFile(false); - FileReader reader = getReader(fs, file, new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "false"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, getFileExtension()); - }}); - - assertTrue(reader.hasNext()); - - int recordIndex = NUM_RECORDS / 2; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); - 
assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); - checkData(reader.next(), recordIndex); - - recordIndex = 0; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); - assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); - checkData(reader.next(), recordIndex); - - recordIndex = NUM_RECORDS - 3; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex), false)); - assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); - checkData(reader.next(), recordIndex); - - reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1, false)); - assertFalse(reader.hasNext()); - } - - @Test - public void validFileEncoding() throws Throwable { - Map cfg = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_ENCODING, "Cp1252"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, getFileExtension()); - }}; - getReader(fs, dataFile, cfg); - } - - @Test - public void invalidFileEncoding() { - Map cfg = new HashMap() {{ - put(DelimitedTextFileReader.FILE_READER_DELIMITED_TOKEN, ","); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_HEADER, "true"); - put(DelimitedTextFileReader.FILE_READER_DELIMITED_ENCODING, "invalid_charset"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, getFileExtension()); - }}; - assertThrows(UnsupportedCharsetException.class, () -> getReader(fs, dataFile, cfg)); - } - - @Override - protected Offset getOffset(long offset) { - return getOffset(offset, true); - } - - private Offset getOffset(long offset, boolean hasHeader) { - return new DelimitedTextFileReader.DelimitedTextOffset(offset, hasHeader); - } - - @Override - protected void checkData(Struct record, long index) { - assertAll( - () -> assertTrue(record.get(FIELD_COLUMN1).toString().startsWith(index + "_")), - () -> assertTrue(record.get(FIELD_COLUMN2).toString().startsWith(index + "_")), - () -> assertTrue(record.get(FIELD_COLUMN3).toString().startsWith(index + "_")), - () -> assertTrue(record.get(FIELD_COLUMN4).toString().startsWith(index + "_")) - ); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } -} From b7f15f30c77d03865c1cd04de4d3c08cfd29fec4 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Fri, 13 Mar 2020 18:16:42 -0600 Subject: [PATCH 22/51] Refactor tests for file readers --- .../{local => }/AvroFileReaderTest.java | 85 ++++---- .../fs/file/reader/FileReaderTestBase.java | 185 ++++++++++------ .../fs/file/reader/FileSystemConfig.java | 153 ++++++++++++++ .../fs/file/reader/JsonFileReaderTest.java | 199 ++++++++++++++++++ .../fs/file/reader/ParquetFileReaderTest.java | 192 +++++++++++++++++ .../{local => }/SequenceFileReaderTest.java | 69 +++--- .../fs/file/reader/TextFileReaderTest.java | 143 +++++++++++++ .../file/reader/hdfs/AvroFileReaderTest.java | 122 ----------- .../reader/hdfs/HdfsFileReaderTestBase.java | 33 --- .../file/reader/hdfs/JsonFileReaderTest.java | 174 --------------- .../reader/hdfs/ParquetFileReaderTest.java | 171 --------------- .../reader/hdfs/SequenceFileReaderTest.java | 110 ---------- .../file/reader/hdfs/TextFileReaderTest.java | 154 -------------- .../file/reader/local/JsonFileReaderTest.java | 177 ---------------- 
.../reader/local/LocalFileReaderTestBase.java | 29 --- .../reader/local/ParquetFileReaderTest.java | 178 ---------------- .../file/reader/local/TextFileReaderTest.java | 155 -------------- 17 files changed, 891 insertions(+), 1438 deletions(-) rename src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/{local => }/AvroFileReaderTest.java (53%) create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileSystemConfig.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java rename src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/{local => }/SequenceFileReaderTest.java (64%) create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/JsonFileReaderTest.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/JsonFileReaderTest.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/LocalFileReaderTestBase.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java similarity index 53% rename from src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/AvroFileReaderTest.java rename to src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java index 5c707e1..176b6dd 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/AvroFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java @@ -1,8 +1,6 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.local; +package com.github.mmolimar.kafka.connect.fs.file.reader; import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.AvroFileReader; import org.apache.avro.AvroTypeException; import org.apache.avro.Schema; import org.apache.avro.SchemaParseException; @@ -11,10 +9,13 @@ import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumWriter; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; +import 
org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; import java.io.File; import java.io.IOException; @@ -25,7 +26,7 @@ import static org.junit.jupiter.api.Assertions.*; -public class AvroFileReaderTest extends LocalFileReaderTestBase { +public class AvroFileReaderTest extends FileReaderTestBase { private static final String FIELD_INDEX = "index"; private static final String FIELD_NAME = "name"; @@ -37,15 +38,11 @@ public class AvroFileReaderTest extends LocalFileReaderTestBase { @BeforeAll public static void setUp() throws IOException { schema = new Schema.Parser().parse(AvroFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc")); - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap() {{ - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, FILE_EXTENSION); - }}; } - private static Path createDataFile() throws IOException { - File avroFile = File.createTempFile("test-", "." + FILE_EXTENSION); + @Override + protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException { + File avroFile = File.createTempFile("test-", "." + getFileExtension()); DatumWriter writer = new GenericDatumWriter<>(schema); try (DataFileWriter dataFileWriter = new DataFileWriter<>(writer)) { dataFileWriter.setFlushOnEveryBlock(true); @@ -58,52 +55,52 @@ private static Path createDataFile() throws IOException { datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID())); datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID())); try { - OFFSETS_BY_INDEX.put(index, dataFileWriter.sync() - 16L); + fsConfig.getOffsetsByIndex().put(index, dataFileWriter.sync() - 16L); dataFileWriter.append(datum); } catch (IOException ioe) { throw new RuntimeException(ioe); } }); } - Path path = new Path(new Path(fsUri), avroFile.getName()); - fs.moveFromLocalFile(new Path(avroFile.getAbsolutePath()), path); + Path path = new Path(new Path(fsConfig.getFsUri()), avroFile.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(avroFile.getAbsolutePath()), path); return path; } - @Test - public void readerWithSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(AvroFileReader.FILE_READER_AVRO_SCHEMA, schema.toString()); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, getFileExtension()); - }}; - reader = getReader(fs, dataFile, cfg); - readAllData(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithSchema(FileSystemConfig fsConfig) throws Throwable { + Map readerConfig = getReaderConfig(); + readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, schema.toString()); + FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); + fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); + readAllData(fsConfig); } - @Test - public void readerWithInvalidSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(AvroFileReader.FILE_READER_AVRO_SCHEMA, Schema.create(Schema.Type.STRING).toString()); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, getFileExtension()); - }}; - reader = getReader(fs, dataFile, cfg); - assertThrows(IllegalStateException.class, this::readAllData); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithInvalidSchema(FileSystemConfig fsConfig) throws Throwable { + Map readerConfig = getReaderConfig(); + readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, 
Schema.create(Schema.Type.STRING).toString()); + FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); + fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); + assertThrows(IllegalStateException.class, () -> readAllData(fsConfig)); assertThrows(AvroTypeException.class, () -> { try { - readAllData(); + readAllData(fsConfig); } catch (Exception e) { throw e.getCause(); } }); } - @Test - public void readerWithUnparseableSchema() { - Map cfg = new HashMap() {{ - put(AvroFileReader.FILE_READER_AVRO_SCHEMA, "invalid schema"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, getFileExtension()); - }}; - assertThrows(SchemaParseException.class, () -> getReader(fs, dataFile, cfg)); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithUnparseableSchema(FileSystemConfig fsConfig) throws IOException { + Map readerConfig = getReaderConfig(); + readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, "invalid schema"); + FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); + assertThrows(SchemaParseException.class, () -> getReader(testFs, fsConfig.getDataFile(), readerConfig)); } @Override @@ -111,6 +108,16 @@ protected Offset getOffset(long offset) { return new AvroFileReader.AvroOffset(offset); } + @Override + protected Class getReaderClass() { + return AvroFileReader.class; + } + + @Override + protected Map getReaderConfig() { + return new HashMap<>(); + } + @Override protected void checkData(Struct record, long index) { assertAll( diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java index d9cc9f4..188960e 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java @@ -2,59 +2,79 @@ import com.github.mmolimar.kafka.connect.fs.file.Offset; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import java.io.*; -import java.net.URI; -import java.util.HashMap; -import java.util.Map; -import java.util.NoSuchElementException; -import java.util.UUID; +import java.util.*; +import java.util.stream.Stream; import static org.junit.jupiter.api.Assertions.*; -public abstract class FileReaderTestBase { +abstract class FileReaderTestBase { + private static final List TEST_FILE_SYSTEMS = Arrays.asList( + new LocalFsConfig(), + new HdfsFsConfig() + ); protected static final int NUM_RECORDS = 100; - protected static final Map OFFSETS_BY_INDEX = new HashMap<>(); - protected static Class readerClass; - protected static FileSystem fs; - protected static URI fsUri; - protected static Path dataFile; - protected static Map readerConfig; - protected static FileReader reader; + 
@BeforeAll + public static void initFs() throws IOException { + for (FileSystemConfig fsConfig : TEST_FILE_SYSTEMS) { + fsConfig.initFs(); + } + } @AfterAll - public static void tearDown() throws IOException { - fs.close(); + public static void finishFs() throws IOException { + for (FileSystemConfig fsConfig : TEST_FILE_SYSTEMS) { + fsConfig.close(); + } } @BeforeEach public void openReader() throws Throwable { - reader = getReader(fs, dataFile, readerConfig); - assertEquals(reader.getFilePath(), dataFile); + for (FileSystemConfig fsConfig : TEST_FILE_SYSTEMS) { + fsConfig.setDataFile(createDataFile(fsConfig)); + FileReader reader = ReflectionUtils.makeReader(getReaderClass(), fsConfig.getFs(), + fsConfig.getDataFile(), getReaderConfig()); + assertEquals(reader.getFilePath(), fsConfig.getDataFile()); + fsConfig.setReader(reader); + } } @AfterEach public void closeReader() { - try { - reader.close(); - } catch (Exception e) { - //ignoring + for (FileSystemConfig fsConfig : TEST_FILE_SYSTEMS) { + try { + fsConfig.getReader().close(); + } catch (Exception e) { + //ignoring + } } } - @Test - public void invalidArgs() { + private static Stream fileSystemConfigProvider() { + return TEST_FILE_SYSTEMS.stream().map(Arguments::of); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidArgs(FileSystemConfig fsConfig) { try { - readerClass.getConstructor(FileSystem.class, Path.class, Map.class).newInstance(null, null, null); + fsConfig.getReader().getClass().getConstructor(FileSystem.class, Path.class, Map.class) + .newInstance(null, null, null); } catch (Exception e) { assertThrows(IllegalArgumentException.class, () -> { throw e.getCause(); @@ -62,33 +82,38 @@ public void invalidArgs() { } } - @Test - public void fileDoesNotExist() { - Path path = new Path(new Path(fsUri), UUID.randomUUID().toString()); - assertThrows(FileNotFoundException.class, () -> getReader(fs, path, readerConfig)); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void fileDoesNotExist(FileSystemConfig fsConfig) { + Path path = new Path(new Path(fsConfig.getFsUri()), UUID.randomUUID().toString()); + assertThrows(FileNotFoundException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig())); } - @Test - public void emptyFile() throws Throwable { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void emptyFile(FileSystemConfig fsConfig) throws Throwable { File tmp = File.createTempFile("test-", "." + getFileExtension()); - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - assertThrows(IOException.class, () -> getReader(fs, path, readerConfig)); + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + assertThrows(IOException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig())); } - @Test - public void invalidFileFormat() throws Throwable { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidFileFormat(FileSystemConfig fsConfig) throws Throwable { File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { writer.write("test"); } - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - assertThrows(IOException.class, () -> getReader(fs, path, readerConfig)); + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + assertThrows(IOException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig())); } - @Test - public void readAllData() { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readAllData(FileSystemConfig fsConfig) { + FileReader reader = fsConfig.getReader(); assertTrue(reader.hasNext()); int recordCount = 0; @@ -97,60 +122,92 @@ public void readAllData() { checkData(record, recordCount); recordCount++; } - assertEquals(NUM_RECORDS, recordCount, () -> "The number of records in the file does not match"); + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); } - @Test - public void seekFile() { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void seekFile(FileSystemConfig fsConfig) { + FileReader reader = fsConfig.getReader(); int recordIndex = NUM_RECORDS / 2; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex))); + reader.seek(getOffset(fsConfig.getOffsetsByIndex().get(recordIndex))); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); + assertEquals(fsConfig.getOffsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = 0; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex))); + reader.seek(getOffset(fsConfig.getOffsetsByIndex().get(recordIndex))); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); + assertEquals(fsConfig.getOffsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = NUM_RECORDS - 3; - reader.seek(getOffset(OFFSETS_BY_INDEX.get(recordIndex))); + reader.seek(getOffset(fsConfig.getOffsetsByIndex().get(recordIndex))); assertTrue(reader.hasNext()); - assertEquals(OFFSETS_BY_INDEX.get(recordIndex), reader.currentOffset().getRecordOffset()); + assertEquals(fsConfig.getOffsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); - reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1)); + reader.seek(getOffset(fsConfig.getOffsetsByIndex().get(NUM_RECORDS - 1) + 1)); assertFalse(reader.hasNext()); } - @Test - public void negativeSeek() { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void negativeSeek(FileSystemConfig fsConfig) { + FileReader reader = fsConfig.getReader(); assertThrows(RuntimeException.class, () -> reader.seek(getOffset(-1))); } - @Test - public void exceededSeek() { - reader.seek(getOffset(OFFSETS_BY_INDEX.get(NUM_RECORDS - 1) + 1)); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void exceededSeek(FileSystemConfig fsConfig) { + FileReader reader = fsConfig.getReader(); + reader.seek(getOffset(fsConfig.getOffsetsByIndex().get(NUM_RECORDS - 1) + 1)); assertFalse(reader.hasNext()); - assertThrows(NoSuchElementException.class, () -> reader.next()); + assertThrows(NoSuchElementException.class, 
reader::next); } - @Test - public void readFileAlreadyClosed() throws IOException { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readFileAlreadyClosed(FileSystemConfig fsConfig) throws IOException { + FileReader reader = fsConfig.getReader(); reader.close(); - assertThrows(IllegalStateException.class, () -> reader.hasNext()); - assertThrows(IllegalStateException.class, () -> reader.next()); + assertThrows(IllegalStateException.class, reader::hasNext); + assertThrows(IllegalStateException.class, reader::next); + } + + protected Offset getOffset(long offset) { + return () -> offset; } protected final FileReader getReader(FileSystem fs, Path path, Map config) throws Throwable { - return ReflectionUtils.makeReader(readerClass, fs, path, config); + return ReflectionUtils.makeReader(getReaderClass(), fs, path, config); } - protected abstract Offset getOffset(long offset); + protected OutputStream getOutputStream(File file, CompressionType compression) throws IOException { + final OutputStream os; + switch (compression) { + case BZIP2: + os = new BZip2CompressorOutputStream(new FileOutputStream(file)); + break; + case GZIP: + os = new GzipCompressorOutputStream(new FileOutputStream(file)); + break; + default: + os = new FileOutputStream(file); + break; + } + return os; + } - protected abstract void checkData(Struct record, long index); + protected abstract Class getReaderClass(); + + protected abstract Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException; + + protected abstract Map getReaderConfig(); protected abstract String getFileExtension(); + protected abstract void checkData(Struct record, long index); + } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileSystemConfig.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileSystemConfig.java new file mode 100644 index 0000000..c670e5f --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileSystemConfig.java @@ -0,0 +1,153 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.MiniDFSCluster; + +import java.io.Closeable; +import java.io.IOException; +import java.net.URI; +import java.nio.file.Files; +import java.util.HashMap; +import java.util.Map; + +interface FileSystemConfig extends Closeable { + + void initFs() throws IOException; + + FileSystem getFs(); + + URI getFsUri(); + + void setDataFile(Path dataFile); + + Path getDataFile(); + + void setReader(FileReader reader); + + FileReader getReader(); + + Map getOffsetsByIndex(); + +} + +class LocalFsConfig implements FileSystemConfig { + private java.nio.file.Path localDir; + private FileSystem fs; + private URI fsUri; + private Path dataFile; + private FileReader reader; + private Map offsetsByIndex; + + @Override + public void initFs() throws IOException { + localDir = Files.createTempDirectory("test-"); + fsUri = localDir.toUri(); + fs = FileSystem.newInstance(fsUri, new Configuration()); + offsetsByIndex = new HashMap<>(); + } + + @Override + public FileSystem getFs() { + return fs; + } + + @Override + public URI getFsUri() { + return fsUri; + } + + @Override + public void setDataFile(Path dataFile) { + this.dataFile = dataFile; + } + + @Override + public Path getDataFile() { + return dataFile; + } + + @Override + public void setReader(FileReader 
reader) { + this.reader = reader; + } + + @Override + public FileReader getReader() { + return reader; + } + + @Override + public Map getOffsetsByIndex() { + return offsetsByIndex; + } + + @Override + public void close() throws IOException { + fs.close(); + FileUtils.deleteDirectory(localDir.toFile()); + } +} + +class HdfsFsConfig implements FileSystemConfig { + private MiniDFSCluster cluster; + private FileSystem fs; + private URI fsUri; + private Path dataFile; + private FileReader reader; + private Map offsetsByIndex; + + @Override + public void initFs() throws IOException { + Configuration clusterConfig = new Configuration(); + java.nio.file.Path hdfsDir = Files.createTempDirectory("test-"); + clusterConfig.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsDir.toAbsolutePath().toString()); + cluster = new MiniDFSCluster.Builder(clusterConfig).build(); + fsUri = URI.create("hdfs://localhost:" + cluster.getNameNodePort() + "/"); + fs = FileSystem.newInstance(fsUri, new Configuration()); + offsetsByIndex = new HashMap<>(); + } + + @Override + public FileSystem getFs() { + return fs; + } + + @Override + public URI getFsUri() { + return fsUri; + } + + @Override + public Path getDataFile() { + return dataFile; + } + + @Override + public void setDataFile(Path dataFile) { + this.dataFile = dataFile; + } + + @Override + public void setReader(FileReader reader) { + this.reader = reader; + } + + @Override + public FileReader getReader() { + return reader; + } + + @Override + public Map getOffsetsByIndex() { + return offsetsByIndex; + } + + @Override + public void close() throws IOException { + fs.close(); + cluster.shutdown(true); + } +} \ No newline at end of file diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java new file mode 100644 index 0000000..9d05edf --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java @@ -0,0 +1,199 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.node.JsonNodeFactory; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.github.mmolimar.kafka.connect.fs.file.Offset; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Struct; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.charset.UnsupportedCharsetException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; +import java.util.stream.IntStream; + +import static org.junit.jupiter.api.Assertions.*; + +public class JsonFileReaderTest extends FileReaderTestBase { + + private static final String FIELD_INTEGER = "integerField"; + private static final String FIELD_LONG = "longField"; + private static final String FIELD_BOOLEAN = "booleanField"; + private static final String FIELD_STRING = "stringField"; + private static final String FIELD_DECIMAL = "decimalField"; + private static final String FIELD_ARRAY = "arrayField"; + private static final String FIELD_STRUCT = "structField"; + private static final String FIELD_NULL = "nullField"; + private static final String FILE_EXTENSION = "jsn"; + private static final CompressionType COMPRESSION_TYPE_DEFAULT = CompressionType.NONE; + + @Override + 
protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException { + int numRecords = args.length < 1 ? NUM_RECORDS : (int) args[0]; + boolean recordPerLine = args.length < 2 || (boolean) args[1]; + CompressionType compression = args.length < 3 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[2]; + File txtFile = File.createTempFile("test-", "." + getFileExtension()); + try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { + IntStream.range(0, numRecords).forEach(index -> { + ObjectNode json = JsonNodeFactory.instance.objectNode() + .put(FIELD_INTEGER, index) + .put(FIELD_LONG, Long.MAX_VALUE) + .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) + .put(FIELD_BOOLEAN, true) + .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) + .put(FIELD_NULL, (String) null); + json.putArray(FIELD_ARRAY) + .add("elm[" + index + "]") + .add("elm[" + index + "]"); + json.putObject(FIELD_STRUCT) + .put(FIELD_INTEGER, (short) index) + .put(FIELD_LONG, Long.MAX_VALUE) + .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) + .put(FIELD_BOOLEAN, true) + .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) + .put(FIELD_NULL, (String) null); + writer.append(recordPerLine ? json.toString() + "\n" : json.toPrettyString()); + fsConfig.getOffsetsByIndex().put(index, (long) index); + }); + } + Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); + return path; + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void emptyFile(FileSystemConfig fsConfig) throws Throwable { + File tmp = File.createTempFile("test-", "." + getFileExtension()); + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + FileReader reader = getReader(fsConfig.getFs(), path, getReaderConfig()); + assertFalse(reader.hasNext()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void validFileEncoding(FileSystemConfig fsConfig) throws Throwable { + Map readerConfig = getReaderConfig(); + readerConfig.put(JsonFileReader.FILE_READER_JSON_ENCODING, "Cp1252"); + fsConfig.setReader(getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); + readAllData(fsConfig); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidDeserializationConfig(FileSystemConfig fsConfig) throws Throwable { + Map readerConfig = getReaderConfig(); + readerConfig.put(JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS + "invalid", "false"); + fsConfig.setReader(getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); + readAllData(fsConfig); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidFileEncoding(FileSystemConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(JsonFileReader.FILE_READER_JSON_ENCODING, "invalid_charset"); + assertThrows(UnsupportedCharsetException.class, () -> getReader(fsConfig.getFs(), + fsConfig.getDataFile(), readerConfig)); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readDataWithRecordPerLineDisabled(FileSystemConfig fsConfig) throws Throwable { + Path file = createDataFile(fsConfig, 1, false); + Map readerConfig = getReaderConfig(); + readerConfig.put(JsonFileReader.FILE_READER_JSON_RECORD_PER_LINE, "false"); + FileReader reader = 
getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals(1, recordCount, "The number of records in the file does not match"); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readDifferentCompressionTypes(FileSystemConfig fsConfig) { + Arrays.stream(CompressionType.values()).forEach(compressionType -> { + try { + Path file = createDataFile(fsConfig, NUM_RECORDS, true, compressionType); + Map readerConfig = getReaderConfig(); + readerConfig.put(JsonFileReader.FILE_READER_JSON_COMPRESSION_TYPE, compressionType.toString()); + readerConfig.put(JsonFileReader.FILE_READER_JSON_COMPRESSION_CONCATENATED, "true"); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } catch (Throwable e) { + throw new RuntimeException(e); + } + }); + } + + @Override + protected Offset getOffset(long offset) { + return () -> offset; + } + + @Override + protected Class getReaderClass() { + return JsonFileReader.class; + } + + @Override + protected Map getReaderConfig() { + return new HashMap() {{ + String deserializationConfig = DeserializationFeature.ACCEPT_EMPTY_ARRAY_AS_NULL_OBJECT.name(); + put(JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS + deserializationConfig, "true"); + }}; + } + + @Override + protected void checkData(Struct record, long index) { + Struct subrecord = record.getStruct(FIELD_STRUCT); + assertAll( + () -> assertEquals((int) (Integer) record.get(FIELD_INTEGER), index), + () -> assertEquals((long) (Long) record.get(FIELD_LONG), Long.MAX_VALUE), + () -> assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")), + () -> assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())), + () -> assertEquals((Double) record.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0), + () -> assertNull(record.get(FIELD_NULL)), + () -> assertNotNull(record.schema().field(FIELD_NULL)), + () -> assertEquals(record.get(FIELD_ARRAY), Arrays.asList("elm[" + index + "]", "elm[" + index + "]")), + () -> assertEquals((int) (Integer) subrecord.get(FIELD_INTEGER), index), + () -> assertEquals((long) (Long) subrecord.get(FIELD_LONG), Long.MAX_VALUE), + () -> assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")), + () -> assertTrue(Boolean.parseBoolean(subrecord.get(FIELD_BOOLEAN).toString())), + () -> assertEquals((Double) subrecord.get(FIELD_DECIMAL), Double.parseDouble(index + "." 
+ index), 0), + () -> assertNull(subrecord.get(FIELD_NULL)), + () -> assertNotNull(subrecord.schema().field(FIELD_NULL)) + ); + } + + @Override + protected String getFileExtension() { + return FILE_EXTENSION; + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java new file mode 100644 index 0000000..672872e --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java @@ -0,0 +1,192 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.github.mmolimar.kafka.connect.fs.file.Offset; +import org.apache.avro.AvroRuntimeException; +import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; +import org.apache.avro.SchemaParseException; +import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericRecord; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.DataException; +import org.apache.parquet.avro.AvroParquetWriter; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.apache.parquet.hadoop.ParquetWriter; +import org.apache.parquet.io.InvalidRecordException; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; +import java.util.stream.IntStream; + +import static org.junit.jupiter.api.Assertions.*; + +public class ParquetFileReaderTest extends FileReaderTestBase { + + private static final String FIELD_INDEX = "index"; + private static final String FIELD_NAME = "name"; + private static final String FIELD_SURNAME = "surname"; + private static final String FILE_EXTENSION = "parquet"; + + private static Schema readerSchema; + private static Schema projectionSchema; + + @BeforeAll + public static void setUp() throws IOException { + readerSchema = new Schema.Parser().parse( + ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc")); + projectionSchema = new Schema.Parser().parse( + ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people_projection.avsc")); + } + + @Override + protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException { + FileSystem fs = fsConfig.getFs(); + File parquetFile = File.createTempFile("test-", "." 
+ getFileExtension()); + + try (ParquetWriter writer = AvroParquetWriter.builder(new Path(parquetFile.toURI())) + .withConf(fs.getConf()).withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withSchema(readerSchema).build()) { + IntStream.range(0, NUM_RECORDS).forEach(index -> { + GenericRecord datum = new GenericData.Record(readerSchema); + datum.put(FIELD_INDEX, index); + String uuid = UUID.randomUUID().toString(); + datum.put(FIELD_NAME, String.format("%d_name_%s", index, uuid)); + datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, uuid)); + try { + fsConfig.getOffsetsByIndex().put(index, (long) index); + writer.write(datum); + } catch (IOException ioe) { + throw new RuntimeException(ioe); + } + }); + } + Path path = new Path(new Path(fsConfig.getFsUri()), parquetFile.getName()); + fs.moveFromLocalFile(new Path(parquetFile.getAbsolutePath()), path); + return path; + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void emptyFile(FileSystemConfig fsConfig) throws Throwable { + File tmp = File.createTempFile("test-", "." + getFileExtension()); + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fsConfig.getFs(), path, getReaderConfig()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidFileFormat(FileSystemConfig fsConfig) throws Throwable { + File tmp = File.createTempFile("test-", "." + getFileExtension()); + try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { + writer.write("test"); + } + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fsConfig.getFs(), path, getReaderConfig()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithSchema(FileSystemConfig fsConfig) throws Throwable { + Map readerConfig = getReaderConfig(); + readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, readerSchema.toString()); + readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); + FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); + fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); + readAllData(fsConfig); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithProjection(FileSystemConfig fsConfig) throws Throwable { + Map readerConfig = getReaderConfig(); + readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, projectionSchema.toString()); + readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); + fsConfig.setReader(getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); + while (fsConfig.getReader().hasNext()) { + Struct record = fsConfig.getReader().next(); + assertNotNull(record.schema().field(FIELD_INDEX)); + assertNotNull(record.schema().field(FIELD_NAME)); + assertNull(record.schema().field(FIELD_SURNAME)); + } + FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); + fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); + assertThrows(DataException.class, () -> readAllData(fsConfig)); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithInvalidProjection(FileSystemConfig fsConfig) throws Throwable { + Schema testSchema = 
SchemaBuilder.record("test_projection").namespace("test.avro") + .fields() + .name("field1").type("string").noDefault() + .endRecord(); + Map readerConfig = getReaderConfig(); + readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, testSchema.toString()); + readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); + FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); + fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); + assertThrows(InvalidRecordException.class, () -> readAllData(fsConfig)); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithInvalidSchema(FileSystemConfig fsConfig) throws Throwable { + Map readerConfig = getReaderConfig(); + readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, Schema.create(Schema.Type.STRING).toString()); + readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); + FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); + fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); + assertThrows(AvroRuntimeException.class, () -> readAllData(fsConfig)); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readerWithUnparseableSchema(FileSystemConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, "invalid schema"); + readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); + assertThrows(SchemaParseException.class, () -> + getReader(FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()), + fsConfig.getDataFile(), readerConfig)); + } + + @Override + protected Map getReaderConfig() { + return new HashMap<>(); + } + + @Override + protected Offset getOffset(long offset) { + return new ParquetFileReader.ParquetOffset(offset); + } + + @Override + protected Class getReaderClass() { + return ParquetFileReader.class; + } + + @Override + protected void checkData(Struct record, long index) { + assertEquals((int) (Integer) record.get(FIELD_INDEX), index); + assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); + assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); + } + + @Override + protected String getFileExtension() { + return FILE_EXTENSION; + } + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java similarity index 64% rename from src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/SequenceFileReaderTest.java rename to src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java index 411f647..18377fa 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/SequenceFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java @@ -1,8 +1,7 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.local; +package com.github.mmolimar.kafka.connect.fs.file.reader; import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.SequenceFileReader; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import 
org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; @@ -10,8 +9,8 @@ import org.apache.hadoop.io.Writable; import org.apache.hadoop.util.ReflectionUtils; import org.apache.kafka.connect.data.Struct; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; import java.io.File; import java.io.IOException; @@ -22,28 +21,19 @@ import static org.junit.jupiter.api.Assertions.*; -public class SequenceFileReaderTest extends LocalFileReaderTestBase { +public class SequenceFileReaderTest extends FileReaderTestBase { private static final String FIELD_NAME_KEY = "custom_field_key"; private static final String FIELD_NAME_VALUE = "custom_field_name"; private static final String FILE_EXTENSION = "sq"; - @BeforeAll - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap() {{ - put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_KEY, FIELD_NAME_KEY); - put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE, FILE_EXTENSION); - }}; - } - - private static Path createDataFile() throws IOException { - File seqFile = File.createTempFile("test-", "." + FILE_EXTENSION); - try (SequenceFile.Writer writer = SequenceFile.createWriter(fs.getConf(), SequenceFile.Writer.file(new Path(seqFile.getAbsolutePath())), + @Override + protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException { + FileSystem fs = fsConfig.getFs(); + File seqFile = File.createTempFile("test-", "." + getFileExtension()); + try (SequenceFile.Writer writer = SequenceFile.createWriter(fs.getConf(), + SequenceFile.Writer.file(new Path(seqFile.getAbsolutePath())), SequenceFile.Writer.keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class))) { - IntStream.range(0, NUM_RECORDS).forEach(index -> { Writable key = new IntWritable(index); Writable value = new Text(String.format("%d_%s", index, UUID.randomUUID())); @@ -62,31 +52,33 @@ private static Path createDataFile() throws IOException { int index = 0; long pos = reader.getPosition() - 1; while (reader.next(key, value)) { - OFFSETS_BY_INDEX.put(index++, pos); + fsConfig.getOffsetsByIndex().put(index++, pos); pos = reader.getPosition(); } } - Path path = new Path(new Path(fsUri), seqFile.getName()); + Path path = new Path(new Path(fsConfig.getFsUri()), seqFile.getName()); fs.moveFromLocalFile(new Path(seqFile.getAbsolutePath()), path); return path; } - @Test - public void defaultFieldNames() throws Throwable { - Map customReaderCfg = new HashMap() {{ - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE, getFileExtension()); - }}; - reader = getReader(fs, dataFile, customReaderCfg); - assertEquals(reader.getFilePath(), dataFile); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void defaultFieldNames(FileSystemConfig fsConfig) throws Throwable { + Map readerConfig = getReaderConfig(); + readerConfig.put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_KEY, null); + readerConfig.put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_VALUE, null); + FileReader reader = getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); + assertEquals(reader.getFilePath(), fsConfig.getDataFile()); assertTrue(reader.hasNext()); int recordCount = 0; while (reader.hasNext()) { Struct record 
= reader.next(); - checkData(SequenceFileReader.FIELD_NAME_KEY_DEFAULT, SequenceFileReader.FIELD_NAME_VALUE_DEFAULT, record, recordCount); + checkData(SequenceFileReader.FIELD_NAME_KEY_DEFAULT, SequenceFileReader.FIELD_NAME_VALUE_DEFAULT, + record, recordCount); recordCount++; } - assertEquals(NUM_RECORDS, recordCount, () -> "The number of records in the file does not match"); + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); } @Override @@ -94,6 +86,19 @@ protected Offset getOffset(long offset) { return new SequenceFileReader.SeqOffset(offset); } + @Override + protected Class getReaderClass() { + return SequenceFileReader.class; + } + + @Override + protected Map getReaderConfig() { + return new HashMap() {{ + put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_KEY, FIELD_NAME_KEY); + put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + }}; + } + @Override protected void checkData(Struct record, long index) { checkData(FIELD_NAME_KEY, FIELD_NAME_VALUE, record, index); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java new file mode 100644 index 0000000..9220772 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java @@ -0,0 +1,143 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.github.mmolimar.kafka.connect.fs.file.Offset; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Struct; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.charset.UnsupportedCharsetException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; +import java.util.stream.IntStream; + +import static org.junit.jupiter.api.Assertions.*; + +public class TextFileReaderTest extends FileReaderTestBase { + + private static final String FIELD_NAME_VALUE = "custom_field_name"; + private static final String FILE_EXTENSION = "txt"; + private static final CompressionType COMPRESSION_TYPE_DEFAULT = CompressionType.GZIP; + + @Override + protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException { + CompressionType compression = args.length < 1 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[0]; + File txtFile = File.createTempFile("test-", "." 
+ FILE_EXTENSION); + try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { + IntStream.range(0, NUM_RECORDS).forEach(index -> { + String value = String.format("%d_%s", index, UUID.randomUUID()); + writer.append(value + "\n"); + fsConfig.getOffsetsByIndex().put(index, (long) index); + }); + } + Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); + return path; + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void validFileEncoding(FileSystemConfig fsConfig) throws Throwable { + Map readerConfig = getReaderConfig(); + readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + readerConfig.put(TextFileReader.FILE_READER_TEXT_ENCODING, "Cp1252"); + readerConfig.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE_DEFAULT); + FileReader reader = getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); + fsConfig.setReader(reader); + readAllData(fsConfig); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidFileEncoding(FileSystemConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + readerConfig.put(TextFileReader.FILE_READER_TEXT_ENCODING, "invalid_charset"); + readerConfig.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE_DEFAULT); + assertThrows(UnsupportedCharsetException.class, () -> getReader(fsConfig.getFs(), + fsConfig.getDataFile(), readerConfig)); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readDataWithRecordPerLineDisabled(FileSystemConfig fsConfig) throws Throwable { + Path file = createDataFile(fsConfig, COMPRESSION_TYPE_DEFAULT); + Map readerConfig = getReaderConfig(); + readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + readerConfig.put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, "false"); + readerConfig.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE_DEFAULT); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals(1, recordCount, () -> "The number of records in the file does not match"); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readDifferentCompressionTypes(FileSystemConfig fsConfig) { + Arrays.stream(CompressionType.values()).forEach(compressionType -> { + try { + Path file = createDataFile(fsConfig, compressionType); + Map readerConfig = getReaderConfig(); + readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + readerConfig.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, compressionType); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } catch (Throwable e) { + throw new RuntimeException(e); + } + }); + } + + @Override + protected Offset getOffset(long offset) { + return new 
TextFileReader.TextOffset(offset); + } + + @Override + protected Class getReaderClass() { + return TextFileReader.class; + } + + @Override + protected Map getReaderConfig() { + return new HashMap() {{ + put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE_DEFAULT); + put(TextFileReader.FILE_READER_TEXT_COMPRESSION_CONCATENATED, "true"); + }}; + } + + @Override + protected void checkData(Struct record, long index) { + assertTrue(record.get(FIELD_NAME_VALUE).toString().startsWith(index + "_")); + } + + @Override + protected String getFileExtension() { + return FILE_EXTENSION; + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java deleted file mode 100644 index b4ae9ae..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/AvroFileReaderTest.java +++ /dev/null @@ -1,122 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.hdfs; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.AvroFileReader; -import org.apache.avro.AvroTypeException; -import org.apache.avro.Schema; -import org.apache.avro.SchemaParseException; -import org.apache.avro.file.DataFileWriter; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericDatumWriter; -import org.apache.avro.generic.GenericRecord; -import org.apache.avro.io.DatumWriter; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.jupiter.api.Assertions.*; - -public class AvroFileReaderTest extends HdfsFileReaderTestBase { - - private static final String FIELD_INDEX = "index"; - private static final String FIELD_NAME = "name"; - private static final String FIELD_SURNAME = "surname"; - private static final String FILE_EXTENSION = "avro"; - - private static Schema schema; - - @BeforeAll - public static void setUp() throws IOException { - schema = new Schema.Parser().parse(AvroFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc")); - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap<>(); - } - - private static Path createDataFile() throws IOException { - File avroFile = File.createTempFile("test-", "." 
+ FILE_EXTENSION); - DatumWriter writer = new GenericDatumWriter<>(schema); - try (DataFileWriter dataFileWriter = new DataFileWriter<>(writer)) { - dataFileWriter.setFlushOnEveryBlock(true); - dataFileWriter.setSyncInterval(32); - dataFileWriter.create(schema, avroFile); - - IntStream.range(0, NUM_RECORDS).forEach(index -> { - GenericRecord datum = new GenericData.Record(schema); - datum.put(FIELD_INDEX, index); - datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID())); - datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID())); - try { - OFFSETS_BY_INDEX.put(index, dataFileWriter.sync() - 16L); - dataFileWriter.append(datum); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), avroFile.getName()); - fs.moveFromLocalFile(new Path(avroFile.getAbsolutePath()), path); - return path; - } - - @Test - public void readerWithSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(AvroFileReader.FILE_READER_AVRO_SCHEMA, schema.toString()); - }}; - reader = getReader(fs, dataFile, cfg); - readAllData(); - } - - @Test - public void readerWithInvalidSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(AvroFileReader.FILE_READER_AVRO_SCHEMA, Schema.create(Schema.Type.STRING).toString()); - }}; - reader = getReader(fs, dataFile, cfg); - assertThrows(IllegalStateException.class, this::readAllData); - assertThrows(AvroTypeException.class, () -> { - try { - readAllData(); - } catch (Exception e) { - throw e.getCause(); - } - }); - } - - @Test - public void readerWithUnparseableSchema() { - Map cfg = new HashMap() {{ - put(AvroFileReader.FILE_READER_AVRO_SCHEMA, "invalid schema"); - }}; - assertThrows(SchemaParseException.class, () -> getReader(fs, dataFile, cfg)); - } - - @Override - protected Offset getOffset(long offset) { - return new AvroFileReader.AvroOffset(offset); - } - - @Override - protected void checkData(Struct record, long index) { - assertAll( - () -> assertEquals((int) (Integer) record.get(FIELD_INDEX), index), - () -> assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")), - () -> assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")) - ); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java deleted file mode 100644 index c60d0c3..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/HdfsFileReaderTestBase.java +++ /dev/null @@ -1,33 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.hdfs; - -import com.github.mmolimar.kafka.connect.fs.file.reader.FileReaderTestBase; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; - -import java.io.IOException; -import java.net.URI; -import java.nio.file.Files; -import java.nio.file.Path; - -public abstract class HdfsFileReaderTestBase extends FileReaderTestBase { - - private static MiniDFSCluster cluster; - - @BeforeAll - public static void initFs() throws IOException { - Configuration clusterConfig = new Configuration(); - Path hdfsDir = Files.createTempDirectory("test-"); - clusterConfig.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, 
hdfsDir.toAbsolutePath().toString()); - cluster = new MiniDFSCluster.Builder(clusterConfig).build(); - fsUri = URI.create("hdfs://localhost:" + cluster.getNameNodePort() + "/"); - fs = FileSystem.newInstance(fsUri, new Configuration()); - } - - @AfterAll - public static void finishFs() throws Exception { - cluster.shutdown(true); - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/JsonFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/JsonFileReaderTest.java deleted file mode 100644 index 188487a..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/JsonFileReaderTest.java +++ /dev/null @@ -1,174 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.hdfs; - -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.node.JsonNodeFactory; -import com.fasterxml.jackson.databind.node.ObjectNode; -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.JsonFileReader; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.charset.UnsupportedCharsetException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.jupiter.api.Assertions.*; - -public class JsonFileReaderTest extends HdfsFileReaderTestBase { - - private static final String FIELD_INTEGER = "integerField"; - private static final String FIELD_LONG = "longField"; - private static final String FIELD_BOOLEAN = "booleanField"; - private static final String FIELD_STRING = "stringField"; - private static final String FIELD_DECIMAL = "decimalField"; - private static final String FIELD_ARRAY = "arrayField"; - private static final String FIELD_STRUCT = "structField"; - private static final String FIELD_NULL = "nullField"; - private static final String FILE_EXTENSION = "json"; - - @BeforeAll - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap() {{ - String deserializationConfig = DeserializationFeature.ACCEPT_EMPTY_ARRAY_AS_NULL_OBJECT.name(); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_JSON, FILE_EXTENSION); - put(JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS + deserializationConfig, "true"); - put(JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS + "invalid", "false"); - }}; - } - - private static Path createDataFile() throws IOException { - return createDataFile(NUM_RECORDS, true); - } - - private static Path createDataFile(int numRecords, boolean recordPerLine) throws IOException { - File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); - try (FileWriter writer = new FileWriter(txtFile)) { - IntStream.range(0, numRecords).forEach(index -> { - ObjectNode json = JsonNodeFactory.instance.objectNode() - .put(FIELD_INTEGER, index) - .put(FIELD_LONG, Long.MAX_VALUE) - .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) - .put(FIELD_BOOLEAN, true) - .put(FIELD_DECIMAL, Double.parseDouble(index + "." 
+ index)) - .put(FIELD_NULL, (String) null); - json.putArray(FIELD_ARRAY) - .add("elm[" + index + "]") - .add("elm[" + index + "]"); - json.putObject(FIELD_STRUCT) - .put(FIELD_INTEGER, (short) index) - .put(FIELD_LONG, Long.MAX_VALUE) - .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) - .put(FIELD_BOOLEAN, true) - .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) - .put(FIELD_NULL, (String) null); - try { - writer.append(recordPerLine ? json.toString() + "\n" : json.toPrettyString()); - OFFSETS_BY_INDEX.put(index, (long) index); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), txtFile.getName()); - fs.moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); - return path; - } - - @Test - public void emptyFile() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); - } - - @Test - public void readEmptyFile() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - FileReader reader = getReader(fs, path, readerConfig); - assertFalse(reader.hasNext()); - } - - @Test - public void validFileEncoding() throws Throwable { - Map cfg = new HashMap() {{ - put(JsonFileReader.FILE_READER_JSON_ENCODING, "Cp1252"); - }}; - reader = getReader(fs, dataFile, cfg); - readAllData(); - } - - @Test - public void invalidFileEncoding() { - Map cfg = new HashMap() {{ - put(JsonFileReader.FILE_READER_JSON_ENCODING, "invalid_charset"); - }}; - assertThrows(UnsupportedCharsetException.class, () -> getReader(fs, dataFile, cfg)); - } - - @Test - public void readDataWithRecordPerLineDisabled() throws Throwable { - Path file = createDataFile(1, false); - FileReader reader = getReader(fs, file, new HashMap() {{ - put(JsonFileReader.FILE_READER_JSON_RECORD_PER_LINE, "false"); - }}); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - checkData(record, recordCount); - recordCount++; - } - reader.close(); - assertEquals(1, recordCount, () -> "The number of records in the file does not match"); - } - - @Override - protected Offset getOffset(long offset) { - return () -> offset; - } - - @Override - protected void checkData(Struct record, long index) { - Struct subrecord = record.getStruct(FIELD_STRUCT); - assertAll( - () -> assertEquals((int) (Integer) record.get(FIELD_INTEGER), index), - () -> assertEquals((long) (Long) record.get(FIELD_LONG), Long.MAX_VALUE), - () -> assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")), - () -> assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())), - () -> assertEquals((Double) record.get(FIELD_DECIMAL), Double.parseDouble(index + "." 
+ index), 0), - () -> assertNull(record.get(FIELD_NULL)), - () -> assertNotNull(record.schema().field(FIELD_NULL)), - () -> assertEquals(record.get(FIELD_ARRAY), Arrays.asList("elm[" + index + "]", "elm[" + index + "]")), - () -> assertEquals((int) (Integer) subrecord.get(FIELD_INTEGER), index), - () -> assertEquals((long) (Long) subrecord.get(FIELD_LONG), Long.MAX_VALUE), - () -> assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")), - () -> assertTrue(Boolean.parseBoolean(subrecord.get(FIELD_BOOLEAN).toString())), - () -> assertEquals((Double) subrecord.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0), - () -> assertNull(subrecord.get(FIELD_NULL)), - () -> assertNotNull(subrecord.schema().field(FIELD_NULL)) - ); - - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java deleted file mode 100644 index d08395d..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/ParquetFileReaderTest.java +++ /dev/null @@ -1,171 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.hdfs; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.ParquetFileReader; -import org.apache.avro.AvroRuntimeException; -import org.apache.avro.Schema; -import org.apache.avro.SchemaBuilder; -import org.apache.avro.SchemaParseException; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.errors.DataException; -import org.apache.parquet.avro.AvroParquetWriter; -import org.apache.parquet.hadoop.ParquetFileWriter; -import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.io.InvalidRecordException; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.jupiter.api.Assertions.*; - -public class ParquetFileReaderTest extends HdfsFileReaderTestBase { - - private static final String FIELD_INDEX = "index"; - private static final String FIELD_NAME = "name"; - private static final String FIELD_SURNAME = "surname"; - private static final String FILE_EXTENSION = "parquet"; - - private static Schema readerSchema; - private static Schema projectionSchema; - - @BeforeAll - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap<>(); - } - - private static Path createDataFile() throws IOException { - File parquetFile = File.createTempFile("test-", "." 
+ FILE_EXTENSION); - readerSchema = new Schema.Parser().parse( - ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc")); - projectionSchema = new Schema.Parser().parse( - ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people_projection.avsc")); - - try (ParquetWriter writer = AvroParquetWriter.builder(new Path(parquetFile.toURI())) - .withConf(fs.getConf()).withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withSchema(readerSchema).build()) { - IntStream.range(0, NUM_RECORDS).forEach(index -> { - GenericRecord datum = new GenericData.Record(readerSchema); - datum.put(FIELD_INDEX, index); - datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID())); - datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID())); - try { - OFFSETS_BY_INDEX.put(index, (long) index); - writer.write(datum); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), parquetFile.getName()); - fs.moveFromLocalFile(new Path(parquetFile.getAbsolutePath()), path); - return path; - } - - @Test - public void emptyFile() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); - } - - @Test - public void invalidFileFormat() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { - writer.write("test"); - } - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); - } - - @Test - public void readerWithSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, readerSchema.toString()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); - } - - @Test - public void readerWithProjection() throws Throwable { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, projectionSchema.toString()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - while (reader.hasNext()) { - Struct record = reader.next(); - assertNotNull(record.schema().field(FIELD_INDEX)); - assertNotNull(record.schema().field(FIELD_NAME)); - assertNull(record.schema().field(FIELD_SURNAME)); - } - - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - assertThrows(DataException.class, this::readAllData); - } - - @Test - public void readerWithInvalidProjection() throws Throwable { - Schema testSchema = SchemaBuilder.record("test_projection").namespace("test.avro") - .fields() - .name("field1").type("string").noDefault() - .endRecord(); - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, testSchema.toString()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - assertThrows(InvalidRecordException.class, this::readAllData); - } - - @Test - public void readerWithInvalidSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, Schema.create(Schema.Type.STRING).toString()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - 
assertThrows(AvroRuntimeException.class, this::readAllData); - } - - @Test - public void readerWithUnparseableSchema() { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, "invalid schema"); - }}; - assertThrows(SchemaParseException.class, () -> - getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg)); - } - - @Override - protected Offset getOffset(long offset) { - return new ParquetFileReader.ParquetOffset(offset); - } - - @Override - protected void checkData(Struct record, long index) { - assertEquals((int) (Integer) record.get(FIELD_INDEX), index); - assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java deleted file mode 100644 index a4435bc..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java +++ /dev/null @@ -1,110 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.hdfs; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.SequenceFileReader; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.util.ReflectionUtils; -import org.apache.kafka.connect.data.Struct; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.jupiter.api.Assertions.*; - -public class SequenceFileReaderTest extends HdfsFileReaderTestBase { - - private static final String FIELD_NAME_KEY = "key"; - private static final String FIELD_NAME_VALUE = "value"; - private static final String FILE_EXTENSION = "seq"; - - @BeforeAll - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap() {{ - put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_KEY, FIELD_NAME_KEY); - put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - }}; - } - - private static Path createDataFile() throws IOException { - File seqFile = File.createTempFile("test-", "." 
+ FILE_EXTENSION); - try (SequenceFile.Writer writer = SequenceFile.createWriter(fs.getConf(), SequenceFile.Writer.file(new Path(seqFile.getAbsolutePath())), - SequenceFile.Writer.keyClass(IntWritable.class), SequenceFile.Writer.valueClass(Text.class))) { - - IntStream.range(0, NUM_RECORDS).forEach(index -> { - Writable key = new IntWritable(index); - Writable value = new Text(String.format("%d_%s", index, UUID.randomUUID())); - try { - writer.append(key, value); - writer.sync(); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - try (SequenceFile.Reader reader = new SequenceFile.Reader(fs.getConf(), - SequenceFile.Reader.file(new Path(seqFile.getAbsolutePath())))) { - Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), fs.getConf()); - Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), fs.getConf()); - int index = 0; - long pos = reader.getPosition() - 1; - while (reader.next(key, value)) { - OFFSETS_BY_INDEX.put(index++, pos); - pos = reader.getPosition(); - } - } - Path path = new Path(new Path(fsUri), seqFile.getName()); - fs.moveFromLocalFile(new Path(seqFile.getAbsolutePath()), path); - return path; - } - - @Test - public void defaultFieldNames() throws Throwable { - Map customReaderCfg = new HashMap<>(); - reader = getReader(fs, dataFile, customReaderCfg); - assertEquals(reader.getFilePath(), dataFile); - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - checkData(SequenceFileReader.FIELD_NAME_KEY_DEFAULT, SequenceFileReader.FIELD_NAME_VALUE_DEFAULT, record, recordCount); - recordCount++; - } - assertEquals(NUM_RECORDS, recordCount, () -> "The number of records in the file does not match"); - } - - @Override - protected Offset getOffset(long offset) { - return new SequenceFileReader.SeqOffset(offset); - } - - @Override - protected void checkData(Struct record, long index) { - checkData(FIELD_NAME_KEY, FIELD_NAME_VALUE, record, index); - } - - private void checkData(String keyFieldName, String valueFieldName, Struct record, long index) { - assertAll( - () -> assertEquals((int) (Integer) record.get(keyFieldName), index), - () -> assertTrue(record.get(valueFieldName).toString().startsWith(index + "_")) - ); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java deleted file mode 100644 index fdb3004..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/TextFileReaderTest.java +++ /dev/null @@ -1,154 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.hdfs; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.CompressionType; -import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; -import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.*; -import 
java.nio.charset.UnsupportedCharsetException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.jupiter.api.Assertions.*; - -public class TextFileReaderTest extends HdfsFileReaderTestBase { - - private static final String FIELD_NAME_VALUE = "custom_field_name"; - private static final String FILE_EXTENSION = "txt"; - private static final CompressionType COMPRESSION_TYPE = CompressionType.GZIP; - - @BeforeAll - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(COMPRESSION_TYPE); - readerConfig = new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); - put(TextFileReader.FILE_READER_TEXT_COMPRESSION_CONCATENATED, "true"); - }}; - } - - private static OutputStream getOutputStream(File file, CompressionType compression) throws IOException { - final OutputStream os; - switch (compression) { - case BZIP2: - os = new BZip2CompressorOutputStream(new FileOutputStream(file)); - break; - case GZIP: - os = new GzipCompressorOutputStream(new FileOutputStream(file)); - break; - default: - os = new FileOutputStream(file); - break; - } - return os; - } - - private static Path createDataFile(CompressionType compression) throws IOException { - File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); - try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { - IntStream.range(0, NUM_RECORDS).forEach(index -> { - String value = String.format("%d_%s", index, UUID.randomUUID()); - writer.append(value + "\n"); - OFFSETS_BY_INDEX.put(index, (long) index); - }); - } - Path path = new Path(new Path(fsUri), txtFile.getName()); - fs.moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); - return path; - } - - @Test - public void validFileEncoding() throws Throwable { - Map cfg = new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(TextFileReader.FILE_READER_TEXT_ENCODING, "Cp1252"); - put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); - }}; - reader = getReader(fs, dataFile, cfg); - readAllData(); - } - - @Test - public void invalidFileEncoding() { - Map cfg = new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(TextFileReader.FILE_READER_TEXT_ENCODING, "invalid_charset"); - put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); - }}; - assertThrows(UnsupportedCharsetException.class, () -> getReader(fs, dataFile, cfg)); - } - - @Test - public void readDataWithRecordPerLineDisabled() throws Throwable { - Path file = createDataFile(COMPRESSION_TYPE); - FileReader reader = getReader(fs, file, new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, "false"); - put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); - }}); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - checkData(record, recordCount); - recordCount++; - } - reader.close(); - assertEquals(1, recordCount, () -> "The number of records in the file does not match"); - } - - @Test - public void readDifferentCompressionTypes() { - Arrays.stream(CompressionType.values()).forEach(compressionType -> { - try { - Path file = 
createDataFile(compressionType); - FileReader reader = getReader(fs, file, new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, compressionType); - }}); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - checkData(record, recordCount); - recordCount++; - } - reader.close(); - assertEquals(NUM_RECORDS, recordCount, () -> "The number of records in the file does not match"); - } catch (Throwable e) { - throw new RuntimeException(e); - } - }); - } - - @Override - protected Offset getOffset(long offset) { - return new TextFileReader.TextOffset(offset); - } - - @Override - protected void checkData(Struct record, long index) { - assertTrue(record.get(FIELD_NAME_VALUE).toString().startsWith(index + "_")); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/JsonFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/JsonFileReaderTest.java deleted file mode 100644 index 131e427..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/JsonFileReaderTest.java +++ /dev/null @@ -1,177 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.local; - -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.node.JsonNodeFactory; -import com.fasterxml.jackson.databind.node.ObjectNode; -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.JsonFileReader; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.nio.charset.UnsupportedCharsetException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.jupiter.api.Assertions.*; - -public class JsonFileReaderTest extends LocalFileReaderTestBase { - - private static final String FIELD_INTEGER = "integerField"; - private static final String FIELD_LONG = "longField"; - private static final String FIELD_BOOLEAN = "booleanField"; - private static final String FIELD_STRING = "stringField"; - private static final String FIELD_DECIMAL = "decimalField"; - private static final String FIELD_ARRAY = "arrayField"; - private static final String FIELD_STRUCT = "structField"; - private static final String FIELD_NULL = "nullField"; - private static final String FILE_EXTENSION = "jsn"; - - @BeforeAll - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap() {{ - String deserializationConfig = DeserializationFeature.ACCEPT_EMPTY_ARRAY_AS_NULL_OBJECT.name(); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_JSON, FILE_EXTENSION); - put(JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS + deserializationConfig, "true"); - put(JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS + "invalid", "false"); - }}; - } - - private static Path createDataFile() throws IOException { 
- return createDataFile(NUM_RECORDS, true); - } - - private static Path createDataFile(int numRecords, boolean recordPerLine) throws IOException { - File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); - try (FileWriter writer = new FileWriter(txtFile)) { - IntStream.range(0, numRecords).forEach(index -> { - ObjectNode json = JsonNodeFactory.instance.objectNode() - .put(FIELD_INTEGER, index) - .put(FIELD_LONG, Long.MAX_VALUE) - .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) - .put(FIELD_BOOLEAN, true) - .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) - .put(FIELD_NULL, (String) null); - json.putArray(FIELD_ARRAY) - .add("elm[" + index + "]") - .add("elm[" + index + "]"); - json.putObject(FIELD_STRUCT) - .put(FIELD_INTEGER, (short) index) - .put(FIELD_LONG, Long.MAX_VALUE) - .put(FIELD_STRING, String.format("%d_%s", index, UUID.randomUUID())) - .put(FIELD_BOOLEAN, true) - .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) - .put(FIELD_NULL, (String) null); - try { - writer.append(recordPerLine ? json.toString() + "\n" : json.toPrettyString()); - OFFSETS_BY_INDEX.put(index, (long) index); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), txtFile.getName()); - fs.moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); - return path; - } - - @Test - public void emptyFile() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); - } - - @Test - public void readEmptyFile() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - FileReader reader = getReader(fs, path, readerConfig); - assertFalse(reader.hasNext()); - } - - @Test - public void validFileEncoding() throws Throwable { - Map cfg = new HashMap() {{ - put(JsonFileReader.FILE_READER_JSON_ENCODING, "Cp1252"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_JSON, getFileExtension()); - }}; - reader = getReader(fs, dataFile, cfg); - readAllData(); - } - - @Test - public void invalidFileEncoding() { - Map cfg = new HashMap() {{ - put(JsonFileReader.FILE_READER_JSON_ENCODING, "invalid_charset"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_JSON, getFileExtension()); - }}; - assertThrows(UnsupportedCharsetException.class, () -> getReader(fs, dataFile, cfg)); - } - - @Test - public void readDataWithRecordPerLineDisabled() throws Throwable { - Path file = createDataFile(1, false); - FileReader reader = getReader(fs, file, new HashMap() {{ - put(JsonFileReader.FILE_READER_JSON_RECORD_PER_LINE, "false"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_JSON, getFileExtension()); - }}); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - checkData(record, recordCount); - recordCount++; - } - reader.close(); - assertEquals(1, recordCount, () -> "The number of records in the file does not match"); - } - - @Override - protected Offset getOffset(long offset) { - return () -> offset; - } - - @Override - protected void checkData(Struct record, long index) { - Struct subrecord = record.getStruct(FIELD_STRUCT); - assertAll( - () -> assertEquals((int) (Integer) record.get(FIELD_INTEGER), 
index), - () -> assertEquals((long) (Long) record.get(FIELD_LONG), Long.MAX_VALUE), - () -> assertTrue(record.get(FIELD_STRING).toString().startsWith(index + "_")), - () -> assertTrue(Boolean.parseBoolean(record.get(FIELD_BOOLEAN).toString())), - () -> assertEquals((Double) record.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0), - () -> assertNull(record.get(FIELD_NULL)), - () -> assertNotNull(record.schema().field(FIELD_NULL)), - () -> assertEquals(record.get(FIELD_ARRAY), Arrays.asList("elm[" + index + "]", "elm[" + index + "]")), - () -> assertEquals((int) (Integer) subrecord.get(FIELD_INTEGER), index), - () -> assertEquals((long) (Long) subrecord.get(FIELD_LONG), Long.MAX_VALUE), - () -> assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")), - () -> assertTrue(Boolean.parseBoolean(subrecord.get(FIELD_BOOLEAN).toString())), - () -> assertEquals((Double) subrecord.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0), - () -> assertNull(subrecord.get(FIELD_NULL)), - () -> assertNotNull(subrecord.schema().field(FIELD_NULL)) - ); - - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/LocalFileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/LocalFileReaderTestBase.java deleted file mode 100644 index f08bff7..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/LocalFileReaderTestBase.java +++ /dev/null @@ -1,29 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.local; - -import com.github.mmolimar.kafka.connect.fs.file.reader.FileReaderTestBase; -import org.apache.commons.io.FileUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -public abstract class LocalFileReaderTestBase extends FileReaderTestBase { - - private static Path localDir; - - @BeforeAll - public static void initFs() throws IOException { - localDir = Files.createTempDirectory("test-"); - fsUri = localDir.toUri(); - fs = FileSystem.newInstance(fsUri, new Configuration()); - } - - @AfterAll - public static void finishFs() throws IOException { - FileUtils.deleteDirectory(localDir.toFile()); - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java deleted file mode 100644 index 41060c6..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/ParquetFileReaderTest.java +++ /dev/null @@ -1,178 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.local; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.ParquetFileReader; -import org.apache.avro.AvroRuntimeException; -import org.apache.avro.Schema; -import org.apache.avro.SchemaBuilder; -import org.apache.avro.SchemaParseException; -import org.apache.avro.generic.GenericData; -import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; 
-import org.apache.kafka.connect.errors.DataException; -import org.apache.parquet.avro.AvroParquetWriter; -import org.apache.parquet.hadoop.ParquetFileWriter; -import org.apache.parquet.hadoop.ParquetWriter; -import org.apache.parquet.io.InvalidRecordException; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.jupiter.api.Assertions.*; - -public class ParquetFileReaderTest extends LocalFileReaderTestBase { - - private static final String FIELD_INDEX = "index"; - private static final String FIELD_NAME = "name"; - private static final String FIELD_SURNAME = "surname"; - private static final String FILE_EXTENSION = "prqt"; - - private static Schema readerSchema; - private static Schema projectionSchema; - - @BeforeAll - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(); - readerConfig = new HashMap() {{ - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, FILE_EXTENSION); - }}; - } - - private static Path createDataFile() throws IOException { - File parquetFile = File.createTempFile("test-", "." + FILE_EXTENSION); - readerSchema = new Schema.Parser().parse( - ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people.avsc")); - projectionSchema = new Schema.Parser().parse( - ParquetFileReaderTest.class.getResourceAsStream("/file/reader/schemas/people_projection.avsc")); - - try (ParquetWriter writer = AvroParquetWriter.builder(new Path(parquetFile.toURI())) - .withConf(fs.getConf()).withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withSchema(readerSchema).build()) { - IntStream.range(0, NUM_RECORDS).forEach(index -> { - GenericRecord datum = new GenericData.Record(readerSchema); - datum.put(FIELD_INDEX, index); - datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID())); - datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID())); - try { - OFFSETS_BY_INDEX.put(index, (long) index); - writer.write(datum); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - Path path = new Path(new Path(fsUri), parquetFile.getName()); - fs.moveFromLocalFile(new Path(parquetFile.getAbsolutePath()), path); - return path; - } - - @Test - public void emptyFile() throws Throwable { - File tmp = File.createTempFile("test-", "." + getFileExtension()); - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); - } - - @Test - public void invalidFileFormat() throws Throwable { - File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); - try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { - writer.write("test"); - } - Path path = new Path(new Path(fsUri), tmp.getName()); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fs, path, readerConfig); - } - - @Test - public void readerWithSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, readerSchema.toString()); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - readAllData(); - } - - @Test - public void readerWithProjection() throws Throwable { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, projectionSchema.toString()); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - while (reader.hasNext()) { - Struct record = reader.next(); - assertNotNull(record.schema().field(FIELD_INDEX)); - assertNotNull(record.schema().field(FIELD_NAME)); - assertNull(record.schema().field(FIELD_SURNAME)); - } - - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - assertThrows(DataException.class, this::readAllData); - } - - @Test - public void readerWithInvalidProjection() throws Throwable { - Schema testSchema = SchemaBuilder.record("test_projection").namespace("test.avro") - .fields() - .name("field1").type("string").noDefault() - .endRecord(); - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, testSchema.toString()); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - assertThrows(InvalidRecordException.class, this::readAllData); - } - - @Test - public void readerWithInvalidSchema() throws Throwable { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, Schema.create(Schema.Type.STRING).toString()); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); - }}; - reader = getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg); - assertThrows(AvroRuntimeException.class, this::readAllData); - } - - @Test - public void readerWithUnparseableSchema() { - Map cfg = new HashMap() {{ - put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, "invalid schema"); - put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); - }}; - assertThrows(SchemaParseException.class, () -> - getReader(FileSystem.newInstance(fsUri, new Configuration()), dataFile, cfg)); - } - - @Override - protected Offset getOffset(long offset) { - return new ParquetFileReader.ParquetOffset(offset); - } - - @Override - protected void checkData(Struct record, long index) { - assertEquals((int) (Integer) record.get(FIELD_INDEX), index); - assertTrue(record.get(FIELD_NAME).toString().startsWith(index + "_")); - assertTrue(record.get(FIELD_SURNAME).toString().startsWith(index + "_")); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java deleted file mode 100644 index fd80931..0000000 
--- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/local/TextFileReaderTest.java +++ /dev/null @@ -1,155 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader.local; - -import com.github.mmolimar.kafka.connect.fs.file.Offset; -import com.github.mmolimar.kafka.connect.fs.file.reader.AgnosticFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.CompressionType; -import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; -import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.*; -import java.nio.charset.UnsupportedCharsetException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.jupiter.api.Assertions.*; - -public class TextFileReaderTest extends LocalFileReaderTestBase { - - private static final String FIELD_NAME_VALUE = "custom_field_name"; - private static final String FILE_EXTENSION = "txt"; - private static final CompressionType COMPRESSION_TYPE = CompressionType.GZIP; - - @BeforeAll - public static void setUp() throws IOException { - readerClass = AgnosticFileReader.class; - dataFile = createDataFile(COMPRESSION_TYPE); - readerConfig = new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); - put(TextFileReader.FILE_READER_TEXT_COMPRESSION_CONCATENATED, "true"); - }}; - } - - private static OutputStream getOutputStream(File file, CompressionType compression) throws IOException { - final OutputStream os; - switch (compression) { - case BZIP2: - os = new BZip2CompressorOutputStream(new FileOutputStream(file)); - break; - case GZIP: - os = new GzipCompressorOutputStream(new FileOutputStream(file)); - break; - default: - os = new FileOutputStream(file); - break; - } - return os; - } - - - private static Path createDataFile(CompressionType compression) throws IOException { - File txtFile = File.createTempFile("test-", "." 
+ FILE_EXTENSION); - try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { - IntStream.range(0, NUM_RECORDS).forEach(index -> { - String value = String.format("%d_%s", index, UUID.randomUUID()); - writer.append(value + "\n"); - OFFSETS_BY_INDEX.put(index, (long) index); - }); - } - Path path = new Path(new Path(fsUri), txtFile.getName()); - fs.moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); - return path; - } - - @Test - public void validFileEncoding() throws Throwable { - Map cfg = new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(TextFileReader.FILE_READER_TEXT_ENCODING, "Cp1252"); - put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); - }}; - reader = getReader(fs, dataFile, cfg); - readAllData(); - } - - @Test - public void invalidFileEncoding() { - Map cfg = new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(TextFileReader.FILE_READER_TEXT_ENCODING, "invalid_charset"); - put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); - }}; - assertThrows(UnsupportedCharsetException.class, () -> getReader(fs, dataFile, cfg)); - } - - @Test - public void readDataWithRecordPerLineDisabled() throws Throwable { - Path file = createDataFile(COMPRESSION_TYPE); - FileReader reader = getReader(fs, file, new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(TextFileReader.FILE_READER_TEXT_RECORD_PER_LINE, "false"); - put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE); - }}); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - checkData(record, recordCount); - recordCount++; - } - reader.close(); - assertEquals(1, recordCount, () -> "The number of records in the file does not match"); - } - - @Test - public void readDifferentCompressionTypes() { - Arrays.stream(CompressionType.values()).forEach(compressionType -> { - try { - Path file = createDataFile(compressionType); - FileReader reader = getReader(fs, file, new HashMap() {{ - put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); - put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, compressionType); - }}); - - assertTrue(reader.hasNext()); - - int recordCount = 0; - while (reader.hasNext()) { - Struct record = reader.next(); - checkData(record, recordCount); - recordCount++; - } - reader.close(); - assertEquals(NUM_RECORDS, recordCount, () -> "The number of records in the file does not match"); - } catch (Throwable e) { - throw new RuntimeException(e); - } - }); - } - - @Override - protected Offset getOffset(long offset) { - return new TextFileReader.TextOffset(offset); - } - - @Override - protected void checkData(Struct record, long index) { - assertTrue(record.get(FIELD_NAME_VALUE).toString().startsWith(index + "_")); - } - - @Override - protected String getFileExtension() { - return FILE_EXTENSION; - } -} From e4427c5836217374452b3f7d67c05adfdf37a43f Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Fri, 20 Mar 2020 17:48:57 -0600 Subject: [PATCH 23/51] New Univocity readers for TSV and CSV text files --- pom.xml | 8 +++- .../fs/file/reader/AgnosticFileReader.java | 21 +++++---- .../connect/fs/file/reader/CsvFileReader.java | 43 +++++++++++++++++++ .../connect/fs/file/reader/TsvFileReader.java | 37 ++++++++++++++++ 4 files changed, 100 insertions(+), 9 deletions(-) create mode 100644 
src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReader.java create mode 100644 src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReader.java diff --git a/pom.xml b/pom.xml index 11a210e..b575f0e 100644 --- a/pom.xml +++ b/pom.xml @@ -16,6 +16,7 @@ 3.2.1 1.11.0 1.9.2 + 2.8.4 2.10.2 9.0.2 5.6.0 @@ -75,6 +76,11 @@ jackson-core ${fasterxml-jackson.version} + + com.univocity + univocity-parsers + ${univocity.version} + com.cronutils cron-utils @@ -84,7 +90,7 @@ org.junit.jupiter - junit-jupiter-api + junit-jupiter ${junit-jupiter.version} test diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java index 30a6371..a096f5c 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java @@ -17,14 +17,18 @@ public class AgnosticFileReader extends AbstractFileReader reader; - private List parquetExtensions, avroExtensions, sequenceExtensions, jsonExtensions, delimitedExtensions; + private List parquetExtensions, avroExtensions, sequenceExtensions, + jsonExtensions, csvExtensions, tsvExtensions; public AgnosticFileReader(FileSystem fs, Path filePath, Map config) throws IOException { super(fs, filePath, new AgnosticAdapter(), config); @@ -53,8 +57,10 @@ private AbstractFileReader readerByExtension(FileSystem fs, Path filePat clz = SequenceFileReader.class; } else if (jsonExtensions.contains(extension)) { clz = JsonFileReader.class; - } else if (delimitedExtensions.contains(extension)) { - clz = DelimitedTextFileReader.class; + } else if (csvExtensions.contains(extension)) { + clz = CsvFileReader.class; + } else if (tsvExtensions.contains(extension)) { + clz = TsvFileReader.class; } else { clz = TextFileReader.class; } @@ -72,7 +78,9 @@ protected void configure(Map config) { .toLowerCase().split(",")); this.jsonExtensions = Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_JSON, "json") .toLowerCase().split(",")); - this.delimitedExtensions = Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_DELIMITED, "tsv,csv") + this.csvExtensions = Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_CSV, "csv") + .toLowerCase().split(",")); + this.tsvExtensions = Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_TSV, "tsv") .toLowerCase().split(",")); } @@ -103,9 +111,6 @@ protected AgnosticRecord nextRecord() { static class AgnosticAdapter implements ReaderAdapter { - AgnosticAdapter() { - } - @Override public Struct apply(AgnosticRecord ag) { return ag.adapter.apply(ag.record); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReader.java new file mode 100644 index 0000000..9442e54 --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReader.java @@ -0,0 +1,43 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.univocity.parsers.common.AbstractParser; +import com.univocity.parsers.csv.CsvParser; +import com.univocity.parsers.csv.CsvParserSettings; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.Map; + +public class CsvFileReader extends UnivocityFileReader { + + public static final String 
FILE_READER_DELIMITED_SETTINGS_DELIMITER_DETECTION = FILE_READER_DELIMITED_SETTINGS + "delimiter_detection"; + public static final String FILE_READER_DELIMITED_SETTINGS_EMPTY_VALUE = FILE_READER_DELIMITED_SETTINGS + "empty_value"; + public static final String FILE_READER_DELIMITED_SETTINGS_ESCAPE_UNQUOTED = FILE_READER_DELIMITED_SETTINGS + "escape_unquoted"; + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER = FILE_READER_DELIMITED_SETTINGS_FORMAT + "delimiter"; + + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE = FILE_READER_DELIMITED_SETTINGS_FORMAT + "quote"; + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE_ESCAPE = FILE_READER_DELIMITED_SETTINGS_FORMAT + "quote_scape"; + + public CsvFileReader(FileSystem fs, Path filePath, Map config) throws IOException { + super(fs, filePath, config); + } + + @Override + protected CsvParserSettings parserSettings(Map config) { + CsvParserSettings settings = new CsvParserSettings(); + settings.setDelimiterDetectionEnabled(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_DELIMITER_DETECTION, false)); + settings.setEmptyValue(config.get(FILE_READER_DELIMITED_SETTINGS_EMPTY_VALUE)); + settings.setEscapeUnquotedValues(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_ESCAPE_UNQUOTED, false)); + settings.getFormat().setDelimiter(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER, ",")); + settings.getFormat().setQuote(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE, "\"").charAt(0)); + settings.getFormat().setQuoteEscape(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE_ESCAPE, "\"").charAt(0)); + + return settings; + } + + @Override + protected AbstractParser createParser(CsvParserSettings settings) { + return new CsvParser(settings); + } +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReader.java new file mode 100644 index 0000000..f626a8e --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReader.java @@ -0,0 +1,37 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.univocity.parsers.common.AbstractParser; +import com.univocity.parsers.tsv.TsvParser; +import com.univocity.parsers.tsv.TsvParserSettings; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.Map; + +public class TsvFileReader extends UnivocityFileReader { + + public static final String FILE_READER_DELIMITED_SETTINGS_LINE_JOINING = FILE_READER_DELIMITED_SETTINGS + "line_joining"; + + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_ESCAPE = FILE_READER_DELIMITED_SETTINGS_FORMAT + "escape"; + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_ESCAPED_CHAR = FILE_READER_DELIMITED_SETTINGS_FORMAT + "escaped_char"; + + public TsvFileReader(FileSystem fs, Path filePath, Map config) throws IOException { + super(fs, filePath, config); + } + + @Override + protected TsvParserSettings parserSettings(Map config) { + TsvParserSettings settings = new TsvParserSettings(); + settings.setLineJoiningEnabled(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_LINE_JOINING, false)); + settings.getFormat().setEscapeChar(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_ESCAPE, "\"").charAt(0)); + settings.getFormat().setEscapedTabChar(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_ESCAPED_CHAR, 
"\"").charAt(0)); + + return settings; + } + + @Override + protected AbstractParser createParser(TsvParserSettings settings) { + return new TsvParser(settings); + } +} From 5007c745c96f7cb507ae7cfc68b8e4b54a6f684a Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Fri, 20 Mar 2020 18:26:57 -0600 Subject: [PATCH 24/51] Upgrade Kafka and Confluent versions --- pom.xml | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/pom.xml b/pom.xml index b575f0e..fcdad09 100644 --- a/pom.xml +++ b/pom.xml @@ -11,13 +11,11 @@ UTF-8 - 2.4.0 - 5.4.0 + 2.4.1 + 5.4.1 3.2.1 1.11.0 - 1.9.2 2.8.4 - 2.10.2 9.0.2 5.6.0 4.2 @@ -43,7 +41,6 @@ io.confluent kafka-connect-avro-converter ${confluent.version} - provided org.apache.hadoop @@ -60,22 +57,6 @@ parquet-avro ${parquet.version} - - org.apache.avro - avro - ${avro.version} - - - org.apache.avro - avro-tools - ${avro.version} - nodeps - - - com.fasterxml.jackson.core - jackson-core - ${fasterxml-jackson.version} - com.univocity univocity-parsers From 55b7a7308d1e7665dc10398ca26cac77548ec60b Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Fri, 20 Mar 2020 20:16:30 -0600 Subject: [PATCH 25/51] Tests for CSV and TSV file readers --- .../fs/file/reader/AgnosticFileReader.java | 4 +- .../connect/fs/file/reader/CsvFileReader.java | 8 +- .../fs/file/reader/JsonFileReader.java | 9 +- .../fs/file/reader/UnivocityFileReader.java | 236 ++++++++++++++++++ .../file/reader/AgnosticFileReaderTest.java | 158 ++++++++++++ .../fs/file/reader/AvroFileReaderTest.java | 2 +- .../fs/file/reader/CsvFileReaderTest.java | 84 +++++++ .../fs/file/reader/FileReaderTestBase.java | 16 +- .../fs/file/reader/FileSystemConfig.java | 8 +- .../fs/file/reader/JsonFileReaderTest.java | 16 +- .../fs/file/reader/ParquetFileReaderTest.java | 2 +- .../file/reader/SequenceFileReaderTest.java | 2 +- .../fs/file/reader/TextFileReaderTest.java | 2 +- .../fs/file/reader/TsvFileReaderTest.java | 41 +++ .../file/reader/UnivocityFileReaderTest.java | 187 ++++++++++++++ 15 files changed, 745 insertions(+), 30 deletions(-) create mode 100644 src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java index a096f5c..9ee8665 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java @@ -20,11 +20,11 @@ public class AgnosticFileReader extends AbstractFileReader reader; private List parquetExtensions, avroExtensions, sequenceExtensions, diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReader.java index 9442e54..70388dc 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReader.java +++ 
b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReader.java @@ -11,13 +11,13 @@ public class CsvFileReader extends UnivocityFileReader { - public static final String FILE_READER_DELIMITED_SETTINGS_DELIMITER_DETECTION = FILE_READER_DELIMITED_SETTINGS + "delimiter_detection"; public static final String FILE_READER_DELIMITED_SETTINGS_EMPTY_VALUE = FILE_READER_DELIMITED_SETTINGS + "empty_value"; + public static final String FILE_READER_DELIMITED_SETTINGS_DELIMITER_DETECTION = FILE_READER_DELIMITED_SETTINGS + "delimiter_detection"; public static final String FILE_READER_DELIMITED_SETTINGS_ESCAPE_UNQUOTED = FILE_READER_DELIMITED_SETTINGS + "escape_unquoted"; - public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER = FILE_READER_DELIMITED_SETTINGS_FORMAT + "delimiter"; + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER = FILE_READER_DELIMITED_SETTINGS_FORMAT + "delimiter"; public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE = FILE_READER_DELIMITED_SETTINGS_FORMAT + "quote"; - public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE_ESCAPE = FILE_READER_DELIMITED_SETTINGS_FORMAT + "quote_scape"; + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE_ESCAPE = FILE_READER_DELIMITED_SETTINGS_FORMAT + "quote_escape"; public CsvFileReader(FileSystem fs, Path filePath, Map config) throws IOException { super(fs, filePath, config); @@ -26,8 +26,8 @@ public CsvFileReader(FileSystem fs, Path filePath, Map config) t @Override protected CsvParserSettings parserSettings(Map config) { CsvParserSettings settings = new CsvParserSettings(); - settings.setDelimiterDetectionEnabled(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_DELIMITER_DETECTION, false)); settings.setEmptyValue(config.get(FILE_READER_DELIMITED_SETTINGS_EMPTY_VALUE)); + settings.setDelimiterDetectionEnabled(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_DELIMITER_DETECTION, false)); settings.setEscapeUnquotedValues(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_ESCAPE_UNQUOTED, false)); settings.getFormat().setDelimiter(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER, ",")); settings.getFormat().setQuote(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_QUOTE, "\"").charAt(0)); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java index cf26a34..76db116 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java @@ -25,11 +25,12 @@ public class JsonFileReader extends AbstractFileReader> + extends AbstractFileReader { + + private static final String FILE_READER_DELIMITED = FILE_READER_PREFIX + "delimited."; + private static final String FILE_READER_COMPRESSION = FILE_READER_DELIMITED + "compression."; + + protected static final String FILE_READER_DELIMITED_SETTINGS = FILE_READER_DELIMITED + "settings."; + protected static final String FILE_READER_DELIMITED_SETTINGS_FORMAT = FILE_READER_DELIMITED_SETTINGS + "format."; + + public static final String FILE_READER_DELIMITED_SETTINGS_HEADER = FILE_READER_DELIMITED_SETTINGS + "header"; + public static final String FILE_READER_DELIMITED_SETTINGS_LINE_SEPARATOR_DETECTION = FILE_READER_DELIMITED_SETTINGS + "line_separator_detection"; + public static final String FILE_READER_DELIMITED_SETTINGS_NULL_VALUE = 
FILE_READER_DELIMITED_SETTINGS + "null_value"; + public static final String FILE_READER_DELIMITED_SETTINGS_MAX_COLUMNS = FILE_READER_DELIMITED_SETTINGS + "max_columns"; + public static final String FILE_READER_DELIMITED_SETTINGS_MAX_CHARS_PER_COLUMN = FILE_READER_DELIMITED_SETTINGS + "max_chars_per_column"; + public static final String FILE_READER_DELIMITED_SETTINGS_ROWS_TO_SKIP = FILE_READER_DELIMITED_SETTINGS + "rows_to_skip"; + public static final String FILE_READER_DELIMITED_SETTINGS_ILW = FILE_READER_DELIMITED_SETTINGS + "ignore_leading_whitespaces"; + public static final String FILE_READER_DELIMITED_SETTINGS_ITW = FILE_READER_DELIMITED_SETTINGS + "ignore_trailing_whitespaces"; + + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_LINE_SEP = FILE_READER_DELIMITED_SETTINGS_FORMAT + "line_separator"; + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_COMMENT = FILE_READER_DELIMITED_SETTINGS_FORMAT + "comment"; + + public static final String FILE_READER_DELIMITED_COMPRESSION_TYPE = FILE_READER_COMPRESSION + "type"; + public static final String FILE_READER_DELIMITED_COMPRESSION_CONCATENATED = FILE_READER_COMPRESSION + "concatenated"; + public static final String FILE_READER_DELIMITED_ENCODING = FILE_READER_DELIMITED + "encoding"; + + private static final String DEFAULT_COLUMN_NAME = "column_"; + + private final UnivocityOffset offset; + private T settings; + private Schema schema; + private Charset charset; + private CompressionType compression; + private boolean closed; + + private ResultIterator iterator; + + public UnivocityFileReader(FileSystem fs, Path filePath, Map config) throws IOException { + super(fs, filePath, new UnivocityToStruct(), config); + + this.offset = new UnivocityOffset(0); + this.iterator = iterateRecords(); + this.schema = buildSchema(this.iterator, settings.isHeaderExtractionEnabled()); + } + + private Schema buildSchema(ResultIterator it, boolean hasHeader) { + SchemaBuilder builder = SchemaBuilder.struct(); + if (it.hasNext() && !hasHeader) { + Record first = it.next(); + IntStream.range(0, first.getValues().length) + .forEach(index -> builder.field(DEFAULT_COLUMN_NAME + ++index, SchemaBuilder.STRING_SCHEMA)); + seek(new UnivocityOffset(0)); + } else if (hasHeader) { + Optional.ofNullable(it.getContext().headers()).ifPresent(headers -> { + IntStream.range(0, headers.length) + .forEach(index -> builder.field(headers[index], SchemaBuilder.STRING_SCHEMA)); + }); + } + return builder.build(); + } + + @Override + protected void configure(Map config) { + String cType = config.getOrDefault(FILE_READER_DELIMITED_COMPRESSION_TYPE, CompressionType.NONE.toString()); + boolean concatenated = Boolean.parseBoolean(config.getOrDefault(FILE_READER_DELIMITED_COMPRESSION_CONCATENATED, + "true")); + this.compression = CompressionType.fromName(cType, concatenated); + this.charset = Charset.forName(config.getOrDefault(FILE_READER_DELIMITED_ENCODING, Charset.defaultCharset().name())); + this.settings = allSettings(config); + } + + private T allSettings(Map config) { + T settings = parserSettings(config); + settings.setHeaderExtractionEnabled(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_HEADER, false)); + settings.setLineSeparatorDetectionEnabled(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_LINE_SEPARATOR_DETECTION, false)); + settings.setNullValue(config.get(FILE_READER_DELIMITED_SETTINGS_NULL_VALUE)); + settings.setMaxColumns(Integer.parseInt(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_MAX_COLUMNS, "512"))); + 
settings.setMaxCharsPerColumn(Integer.parseInt(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_MAX_CHARS_PER_COLUMN, "4096"))); + settings.setNumberOfRowsToSkip(Long.parseLong(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_ROWS_TO_SKIP, "0"))); + settings.setIgnoreLeadingWhitespaces(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_ILW, true)); + settings.setIgnoreTrailingWhitespaces(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_ITW, true)); + settings.getFormat().setLineSeparator(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_LINE_SEP, "\n")); + settings.getFormat().setComment(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_COMMENT, "#").charAt(0)); + + return settings; + } + + protected boolean getBoolean(Map config, String property, boolean defaultValue) { + return Boolean.parseBoolean(config.getOrDefault(property, String.valueOf(defaultValue))); + } + + protected abstract T parserSettings(Map config); + + protected abstract AbstractParser createParser(T settings); + + private Reader getFileReader(InputStream is, CompressionType compression, Charset charset) throws IOException { + final InputStreamReader isr; + switch (compression) { + case BZIP2: + isr = new InputStreamReader(new BZip2CompressorInputStream(is, compression.isConcatenated()), charset); + break; + case GZIP: + isr = new InputStreamReader(new GzipCompressorInputStream(is, compression.isConcatenated()), charset); + break; + default: + isr = new InputStreamReader(is, charset); + break; + } + return isr; + } + + private ResultIterator iterateRecords() throws IOException { + return createParser(settings) + .iterateRecords(getFileReader(getFs().open(getFilePath()), this.compression, this.charset)) + .iterator(); + } + + @Override + protected final UnivocityRecord nextRecord() { + if (!hasNext()) throw new NoSuchElementException("There are no more records in file: " + getFilePath()); + + offset.inc(); + Record record = iterator.next(); + return new UnivocityRecord(schema, record.getValues()); + } + + @Override + public final boolean hasNext() { + if (closed) throw new IllegalStateException("Reader already closed."); + + return iterator.hasNext(); + } + + @Override + public final void seek(Offset offset) { + if (offset.getRecordOffset() < 0) { + throw new IllegalArgumentException("Record offset must be greater than 0"); + } + try { + if (offset.getRecordOffset() > this.offset.getRecordOffset()) { + iterator.hasNext(); + iterator.getContext().skipLines(offset.getRecordOffset() - this.offset.getRecordOffset() - 1); + iterator.next(); + } else { + iterator = iterateRecords(); + iterator.hasNext(); + iterator.getContext().skipLines(offset.getRecordOffset()); + } + this.offset.setOffset(offset.getRecordOffset()); + } catch (IOException ioe) { + throw new ConnectException("Error seeking file " + getFilePath(), ioe); + } + } + + @Override + public final Offset currentOffset() { + return offset; + } + + @Override + public final void close() { + iterator.getContext().stop(); + closed = true; + } + + public static class UnivocityOffset implements Offset { + private long offset; + + public UnivocityOffset(long offset) { + this.offset = offset; + } + + public void setOffset(long offset) { + this.offset = offset; + } + + void inc() { + this.offset++; + } + + @Override + public long getRecordOffset() { + return offset; + } + } + + static class UnivocityToStruct implements ReaderAdapter { + + @Override + public Struct apply(UnivocityRecord record) { + Struct struct = new Struct(record.schema); + 
IntStream.range(0, record.schema.fields().size()) + .filter(index -> index < record.values.length) + .forEach(index -> struct.put(record.schema.fields().get(index).name(), record.values[index])); + return struct; + } + } + + static class UnivocityRecord { + private final Schema schema; + private final String[] values; + + UnivocityRecord(Schema schema, String[] values) { + this.schema = schema; + this.values = values; + } + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java new file mode 100644 index 0000000..7f25e66 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java @@ -0,0 +1,158 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.junit.jupiter.api.Nested; + +import java.util.Map; + +public class AgnosticFileReaderTest { + + private static final String FILE_EXTENSION = "test"; + + @Nested + class AgnosticTextFileReaderTest extends TextFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_TEXT, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + + @Nested + class AgnosticCsvFileReaderTest extends CsvFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_CSV, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + + @Nested + class AgnosticTsvFileReaderTest extends TsvFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_TSV, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + + @Nested + class AgnosticJsonFileReaderTest extends JsonFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_JSON, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + + @Nested + class AgnosticAvroFileReaderTest extends AvroFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + + @Nested + class AgnosticParquetFileReaderTest extends ParquetFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); + return config; + } + + 
@Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + + @Nested + class AgnosticSequenceFileReaderTest extends SequenceFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java index 176b6dd..75fc8a2 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java @@ -55,7 +55,7 @@ protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws datum.put(FIELD_NAME, String.format("%d_name_%s", index, UUID.randomUUID())); datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, UUID.randomUUID())); try { - fsConfig.getOffsetsByIndex().put(index, dataFileWriter.sync() - 16L); + fsConfig.offsetsByIndex().put(index, dataFileWriter.sync() - 16L); dataFileWriter.append(datum); } catch (IOException ioe) { throw new RuntimeException(ioe); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java new file mode 100644 index 0000000..f4a0809 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java @@ -0,0 +1,84 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Struct; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; +import java.util.stream.IntStream; + +import static org.junit.jupiter.api.Assertions.*; + +public class CsvFileReaderTest extends UnivocityFileReaderTest { + + @Override + protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException { + boolean header = args.length < 1 || (boolean) args[0]; + CompressionType compression = args.length < 2 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[1]; + File txtFile = File.createTempFile("test-", "." 
+ getFileExtension()); + try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { + if (header) { + writer.append(FIELD_COLUMN1 + "#" + FIELD_COLUMN2 + "#" + FIELD_COLUMN3 + "#" + FIELD_COLUMN4 + "\n"); + } + IntStream.range(0, NUM_RECORDS).forEach(index -> { + String value = String.format("%d_%s", index, UUID.randomUUID()); + writer.append(value + "#" + value + "#" + value + "#" + value + "\n"); + fsConfig.offsetsByIndex().put(index, (long) index); + }); + } + Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); + return path; + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readAllDataWithMalformedRows(FileSystemConfig fsConfig) throws Throwable { + File tmp = File.createTempFile("test-", "." + getFileExtension()); + try (FileWriter writer = new FileWriter(tmp)) { + writer.append(FIELD_COLUMN1 + "," + FIELD_COLUMN2 + "," + FIELD_COLUMN3 + "," + FIELD_COLUMN4 + "\n"); + writer.append("dummy,\"\",,dummy\n"); + writer.append("#comment\n"); + writer.append("dummy,\"\",,dummy\n"); + } + Map readerConfig = getReaderConfig(); + readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER, ","); + readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_EMPTY_VALUE, "empty_value"); + readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_NULL_VALUE, "null_value"); + + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + FileReader reader = getReader(fsConfig.getFs(), path, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + assertAll( + () -> assertEquals("dummy", record.get(FIELD_COLUMN1)), + () -> assertEquals("empty_value", record.get(FIELD_COLUMN2)), + () -> assertEquals("null_value", record.get(FIELD_COLUMN3)), + () -> assertEquals("dummy", record.get(FIELD_COLUMN4)) + ); + recordCount++; + } + assertEquals(2, recordCount, () -> "The number of records in the file does not match"); + } + + @Override + protected Map getReaderConfig() { + return new HashMap() {{ + put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER, "#"); + put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + }}; + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java index 188960e..4e0d38b 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java @@ -130,24 +130,24 @@ public void readAllData(FileSystemConfig fsConfig) { public void seekFile(FileSystemConfig fsConfig) { FileReader reader = fsConfig.getReader(); int recordIndex = NUM_RECORDS / 2; - reader.seek(getOffset(fsConfig.getOffsetsByIndex().get(recordIndex))); + reader.seek(getOffset(fsConfig.offsetsByIndex().get(recordIndex))); assertTrue(reader.hasNext()); - assertEquals(fsConfig.getOffsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = 0; - 
reader.seek(getOffset(fsConfig.getOffsetsByIndex().get(recordIndex))); + reader.seek(getOffset(fsConfig.offsetsByIndex().get(recordIndex))); assertTrue(reader.hasNext()); - assertEquals(fsConfig.getOffsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); recordIndex = NUM_RECORDS - 3; - reader.seek(getOffset(fsConfig.getOffsetsByIndex().get(recordIndex))); + reader.seek(getOffset(fsConfig.offsetsByIndex().get(recordIndex))); assertTrue(reader.hasNext()); - assertEquals(fsConfig.getOffsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset()); checkData(reader.next(), recordIndex); - reader.seek(getOffset(fsConfig.getOffsetsByIndex().get(NUM_RECORDS - 1) + 1)); + reader.seek(getOffset(fsConfig.offsetsByIndex().get(NUM_RECORDS - 1) + 1)); assertFalse(reader.hasNext()); } @@ -162,7 +162,7 @@ public void negativeSeek(FileSystemConfig fsConfig) { @MethodSource("fileSystemConfigProvider") public void exceededSeek(FileSystemConfig fsConfig) { FileReader reader = fsConfig.getReader(); - reader.seek(getOffset(fsConfig.getOffsetsByIndex().get(NUM_RECORDS - 1) + 1)); + reader.seek(getOffset(fsConfig.offsetsByIndex().get(NUM_RECORDS - 1) + 1)); assertFalse(reader.hasNext()); assertThrows(NoSuchElementException.class, reader::next); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileSystemConfig.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileSystemConfig.java index c670e5f..a838251 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileSystemConfig.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileSystemConfig.java @@ -29,7 +29,7 @@ interface FileSystemConfig extends Closeable { FileReader getReader(); - Map getOffsetsByIndex(); + Map offsetsByIndex(); } @@ -80,7 +80,7 @@ public FileReader getReader() { } @Override - public Map getOffsetsByIndex() { + public Map offsetsByIndex() { return offsetsByIndex; } @@ -141,7 +141,7 @@ public FileReader getReader() { } @Override - public Map getOffsetsByIndex() { + public Map offsetsByIndex() { return offsetsByIndex; } @@ -150,4 +150,4 @@ public void close() throws IOException { fs.close(); cluster.shutdown(true); } -} \ No newline at end of file +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java index 9d05edf..da8994b 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java @@ -1,6 +1,9 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectWriter; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import com.github.mmolimar.kafka.connect.fs.file.Offset; @@ -41,6 +44,7 @@ protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws CompressionType compression = args.length < 3 ? 
COMPRESSION_TYPE_DEFAULT : (CompressionType) args[2]; File txtFile = File.createTempFile("test-", "." + getFileExtension()); try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { + ObjectWriter jsonWriter = new ObjectMapper().writerWithDefaultPrettyPrinter(); IntStream.range(0, numRecords).forEach(index -> { ObjectNode json = JsonNodeFactory.instance.objectNode() .put(FIELD_INTEGER, index) @@ -51,7 +55,7 @@ protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws .put(FIELD_NULL, (String) null); json.putArray(FIELD_ARRAY) .add("elm[" + index + "]") - .add("elm[" + index + "]"); + .add("elm[" + (index + 1) + "]"); json.putObject(FIELD_STRUCT) .put(FIELD_INTEGER, (short) index) .put(FIELD_LONG, Long.MAX_VALUE) @@ -59,8 +63,12 @@ protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws .put(FIELD_BOOLEAN, true) .put(FIELD_DECIMAL, Double.parseDouble(index + "." + index)) .put(FIELD_NULL, (String) null); - writer.append(recordPerLine ? json.toString() + "\n" : json.toPrettyString()); - fsConfig.getOffsetsByIndex().put(index, (long) index); + try { + writer.append(recordPerLine ? json.toString() + "\n" : jsonWriter.writeValueAsString(json)); + } catch (JsonProcessingException jpe) { + throw new RuntimeException(jpe); + } + fsConfig.offsetsByIndex().put(index, (long) index); }); } Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); @@ -181,7 +189,7 @@ protected void checkData(Struct record, long index) { () -> assertEquals((Double) record.get(FIELD_DECIMAL), Double.parseDouble(index + "." + index), 0), () -> assertNull(record.get(FIELD_NULL)), () -> assertNotNull(record.schema().field(FIELD_NULL)), - () -> assertEquals(record.get(FIELD_ARRAY), Arrays.asList("elm[" + index + "]", "elm[" + index + "]")), + () -> assertEquals(record.get(FIELD_ARRAY), Arrays.asList("elm[" + index + "]", "elm[" + (index + 1) + "]")), () -> assertEquals((int) (Integer) subrecord.get(FIELD_INTEGER), index), () -> assertEquals((long) (Long) subrecord.get(FIELD_LONG), Long.MAX_VALUE), () -> assertTrue(subrecord.get(FIELD_STRING).toString().startsWith(index + "_")), diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java index 672872e..ae21b88 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java @@ -63,7 +63,7 @@ protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws datum.put(FIELD_NAME, String.format("%d_name_%s", index, uuid)); datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, uuid)); try { - fsConfig.getOffsetsByIndex().put(index, (long) index); + fsConfig.offsetsByIndex().put(index, (long) index); writer.write(datum); } catch (IOException ioe) { throw new RuntimeException(ioe); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java index 18377fa..ae87901 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java @@ -52,7 +52,7 @@ protected Path createDataFile(FileSystemConfig fsConfig, Object... 
args) throws int index = 0; long pos = reader.getPosition() - 1; while (reader.next(key, value)) { - fsConfig.getOffsetsByIndex().put(index++, pos); + fsConfig.offsetsByIndex().put(index++, pos); pos = reader.getPosition(); } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java index 9220772..53ac900 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java @@ -32,7 +32,7 @@ protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IntStream.range(0, NUM_RECORDS).forEach(index -> { String value = String.format("%d_%s", index, UUID.randomUUID()); writer.append(value + "\n"); - fsConfig.getOffsetsByIndex().put(index, (long) index); + fsConfig.offsetsByIndex().put(index, (long) index); }); } Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java new file mode 100644 index 0000000..20c0dc0 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java @@ -0,0 +1,41 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.apache.hadoop.fs.Path; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; +import java.util.stream.IntStream; + +public class TsvFileReaderTest extends UnivocityFileReaderTest { + + @Override + protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException { + boolean header = args.length < 1 || (boolean) args[0]; + CompressionType compression = args.length < 2 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[1]; + File txtFile = File.createTempFile("test-", "." 
+ getFileExtension()); + try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { + if (header) { + writer.append(FIELD_COLUMN1 + "\t" + FIELD_COLUMN2 + "\t" + FIELD_COLUMN3 + "\t" + FIELD_COLUMN4 + "\n"); + } + IntStream.range(0, NUM_RECORDS).forEach(index -> { + String value = String.format("%d_%s", index, UUID.randomUUID()); + writer.append(value + "\t" + value + "\t" + value + "\t" + value + "\n"); + fsConfig.offsetsByIndex().put(index, (long) index); + }); + } + Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); + return path; + } + + @Override + protected Map getReaderConfig() { + return new HashMap() {{ + put(TsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + }}; + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java new file mode 100644 index 0000000..da18e0e --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java @@ -0,0 +1,187 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.github.mmolimar.kafka.connect.fs.file.Offset; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Struct; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.lang.reflect.ParameterizedType; +import java.nio.charset.UnsupportedCharsetException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.*; + +abstract class UnivocityFileReaderTest extends FileReaderTestBase { + + protected static final String FIELD_COLUMN1 = "column_1"; + protected static final String FIELD_COLUMN2 = "column_2"; + protected static final String FIELD_COLUMN3 = "column_3"; + protected static final String FIELD_COLUMN4 = "column_4"; + protected static final String FILE_EXTENSION = "tcsv"; + protected static final CompressionType COMPRESSION_TYPE_DEFAULT = CompressionType.NONE; + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void emptyFile(FileSystemConfig fsConfig) throws Throwable { + File tmp = File.createTempFile("test-", "." + getFileExtension()); + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fsConfig.getFs(), path, getReaderConfig()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidFileFormat(FileSystemConfig fsConfig) throws Throwable { + File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); + try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { + writer.write("test"); + } + Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); + getReader(fsConfig.getFs(), path, getReaderConfig()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invaliConfigArgs(FileSystemConfig fsConfig) { + try { + getReaderClass().getConstructor(FileSystem.class, Path.class, Map.class) + .newInstance(fsConfig.getFs(), fsConfig.getDataFile(), new HashMap()); + } catch (Exception e) { + assertThrows(IllegalArgumentException.class, () -> { + throw e.getCause(); + }); + } + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readAllDataWithoutHeader(FileSystemConfig fsConfig) throws Throwable { + Path file = createDataFile(fsConfig, false); + Map readerConfig = getReaderConfig(); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "false"); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readDifferentCompressionTypes(FileSystemConfig fsConfig) { + Arrays.stream(CompressionType.values()).forEach(compressionType -> { + try { + Path file = createDataFile(fsConfig, true, compressionType); + Map readerConfig = getReaderConfig(); + readerConfig.put(T.FILE_READER_DELIMITED_COMPRESSION_TYPE, compressionType.toString()); + readerConfig.put(T.FILE_READER_DELIMITED_COMPRESSION_CONCATENATED, "true"); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkData(record, recordCount); + recordCount++; + } + reader.close(); + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } catch (Throwable e) { + throw new RuntimeException(e); + } + }); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void seekFileWithoutHeader(FileSystemConfig fsConfig) throws Throwable { + Path file = createDataFile(fsConfig, false); + Map readerConfig = getReaderConfig(); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "false"); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordIndex = NUM_RECORDS / 2; + reader.seek(getOffset(fsConfig.offsetsByIndex().get(recordIndex))); + assertTrue(reader.hasNext()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset()); + checkData(reader.next(), recordIndex); + + recordIndex = 0; + reader.seek(getOffset(fsConfig.offsetsByIndex().get(recordIndex))); + assertTrue(reader.hasNext()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset()); + checkData(reader.next(), recordIndex); + + recordIndex = NUM_RECORDS - 3; + reader.seek(getOffset(fsConfig.offsetsByIndex().get(recordIndex))); + assertTrue(reader.hasNext()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), 
reader.currentOffset().getRecordOffset()); + checkData(reader.next(), recordIndex); + + reader.seek(getOffset(fsConfig.offsetsByIndex().get(NUM_RECORDS - 1) + 1)); + assertFalse(reader.hasNext()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void validFileEncoding(FileSystemConfig fsConfig) throws Throwable { + Map readerConfig = getReaderConfig(); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + readerConfig.put(T.FILE_READER_DELIMITED_ENCODING, "Cp1252"); + getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidFileEncoding(FileSystemConfig fsConfig) { + Map readerConfig = getReaderConfig(); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + readerConfig.put(T.FILE_READER_DELIMITED_ENCODING, "invalid_charset"); + assertThrows(UnsupportedCharsetException.class, () -> getReader(fsConfig.getFs(), + fsConfig.getDataFile(), readerConfig)); + } + + @Override + protected Offset getOffset(long offset) { + return new T.UnivocityOffset(offset); + } + + @Override + protected Class getReaderClass() { + return (Class) ((ParameterizedType) this.getClass().getGenericSuperclass()) + .getActualTypeArguments()[0]; + } + + @Override + protected void checkData(Struct record, long index) { + assertAll( + () -> assertTrue(record.get(FIELD_COLUMN1).toString().startsWith(index + "_")), + () -> assertTrue(record.get(FIELD_COLUMN2).toString().startsWith(index + "_")), + () -> assertTrue(record.get(FIELD_COLUMN3).toString().startsWith(index + "_")), + () -> assertTrue(record.get(FIELD_COLUMN4).toString().startsWith(index + "_")) + ); + } + + @Override + protected String getFileExtension() { + return FILE_EXTENSION; + } +} From f47a1d42aac877920f7883b3b1fd6af8891a0531 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 21 Mar 2020 12:05:21 -0600 Subject: [PATCH 26/51] Updating docs --- README.md | 6 +- docs/source/config_options.rst | 322 +++++++++++++++++++++++++++------ docs/source/connector.rst | 2 +- docs/source/filereaders.rst | 48 +++-- 4 files changed, 300 insertions(+), 78 deletions(-) diff --git a/README.md b/README.md index 76d3961..40d1b27 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,9 @@ # Kafka Connect FileSystem Connector [![Build Status](https://travis-ci.org/mmolimar/kafka-connect-fs.svg?branch=master)](https://travis-ci.org/mmolimar/kafka-connect-fs)[![Coverage Status](https://coveralls.io/repos/github/mmolimar/kafka-connect-fs/badge.svg?branch=master)](https://coveralls.io/github/mmolimar/kafka-connect-fs?branch=master) -**kafka-connect-fs** is a [Kafka Connector](http://kafka.apache.org/documentation.html#connect) +**kafka-connect-fs** is a [Kafka Connector](https://kafka.apache.org/documentation.html#connect) for reading records from files in the file systems specified and load them into Kafka. -Documentation for this connector can be found [here](http://kafka-connect-fs.readthedocs.io/). +Documentation for this connector can be found [here](https://kafka-connect-fs.readthedocs.io/). ## Development @@ -13,7 +13,7 @@ kafka-connect-fs with Maven using the standard lifecycle phases. 
## FAQ Some frequently asked questions on Kafka Connect FileSystem Connector can be found here - -http://kafka-connect-fs.readthedocs.io/en/latest/faq.html +https://kafka-connect-fs.readthedocs.io/en/latest/faq.html ## Contribute diff --git a/docs/source/config_options.rst b/docs/source/config_options.rst index bc89498..5b25ed1 100644 --- a/docs/source/config_options.rst +++ b/docs/source/config_options.rst @@ -48,7 +48,7 @@ General config properties for this connector. If you want to ingest data from dynamic directories, this is, directories created every day and avoiding to add new URIs or look for files from a parent directory, you can include expressions in the URIs to do that. For example, for this URI ``file:///data/${yyyy}``, it will be - converted to ``file:///data/2017`` (when executing whe policy). + converted to ``file:///data/2020`` (when executing whe policy). You can use as many as you like in the URIs, for instance: ``file:///data/${yyyy}/${MM}/${dd}/${HH}${mm}`` @@ -60,7 +60,7 @@ General config properties for this connector. ``policy.fs.fs.s3a.secret.key=``   ``topic`` - Topic in which copy data. + Topic in which copy data to. * Type: string * Importance: high @@ -71,6 +71,12 @@ General config properties for this connector. * Type: string * Importance: high +``policy.regexp`` + Regular expression to filter files from the FS. + + * Type: string + * Importance: high + ``policy.recursive`` Flag to activate traversed recursion in subdirectories when listing files. @@ -78,23 +84,17 @@ General config properties for this connector. * Default: ``false`` * Importance: medium -``policy.regexp`` - Regular expression to filter files from the FS. - - * Type: string - * Importance: high - ``policy..`` - This represents the custom properties you can include based on the policy class specified. + This represents custom properties you can include based on the policy class specified. - * Type: depending on the policy. - * Importance: depending on the policy. + * Type: based on the policy. + * Importance: based on the policy. ``policy.fs.`` Custom properties to use for the FS. - * Type: depending on the FS. - * Importance: depending on the FS. + * Type: based on the FS. + * Importance: based on the FS. ``file_reader.class`` File reader class to read files from the FS (must implement @@ -104,10 +104,10 @@ General config properties for this connector. * Importance: high ``file_reader..`` - This represents the custom properties you can include based on the file reader class specified. + This represents custom properties you can include based on the file reader class specified. - * Type: depending on the file reader. - * Importance: depending on the file reader. + * Type: based on the file reader. + * Importance: based on the file reader. .. _config_options-policies: @@ -272,6 +272,12 @@ To configure custom properties for this reader, the name you must use is ``json` * Type: boolean * Importance: medium +``file_reader.json.encoding`` + Encoding to use for reading a file. If not specified, the reader will use the default encoding. + + * Type: string + * Importance: medium + ``file_reader.json.compression.type`` Compression type to use when reading a file. @@ -287,35 +293,139 @@ To configure custom properties for this reader, the name you must use is ``json` * Default: ``true`` * Importance: low -``file_reader.json.encoding`` - Encoding to use for reading a file. If not specified, the reader will use the default encoding. +.. 
_config_options-filereaders-csv: + +CSV +-------------------------------------------- + +To configure custom properties for this reader, the name you must use is ``delimited`` (even though it's for CSV). + +``file_reader.delimited.settings.header`` + If the file contains header or not. + + * Type: boolean + * Default: ``false`` + * Importance: high + +``file_reader.delimited.settings.format.delimiter`` + Field delimiter. * Type: string + * Default: ``,`` + * Importance: high + +``file_reader.delimited.settings.null_value`` + Default value for ``null`` values. + + * Type: string + * Default: ``null`` * Importance: medium -.. _config_options-filereaders-text: +``file_reader.delimited.settings.empty_value`` + Default value for empty values (empty values within quotes). -Text --------------------------------------------- + * Type: string + * Default: ``null`` + * Importance: medium -To configure custom properties for this reader, the name you must use is ``text``. +``file_reader.delimited.settings.format.line_separator`` + Line separator to be used. -``file_reader.json.record_per_line`` - If enabled, the reader will read each line as a record. Otherwise, the reader will read the full - content of the file as a record. + * Type: string + * Default: ``\n`` + * Importance: medium + +``file_reader.delimited.settings.max_columns`` + Default value for ``null`` values. + + * Type: int + * Default: ``512`` + * Importance: low + +``file_reader.delimited.settings.max_chars_per_column`` + Default value for ``null`` values. + + * Type: int + * Default: ``4096`` + * Importance: low + +``file_reader.delimited.settings.rows_to_skip`` + Number of rows to skip. + + * Type: long + * Default: ``0`` + * Importance: low + +``file_reader.delimited.settings.line_separator_detection`` + If the reader should detect the line separator automatically. + + * Type: boolean + * Default: ``false`` + * Importance: medium + +``file_reader.delimited.settings.delimiter_detection`` + If the reader should detect the delimiter automatically. + + * Type: boolean + * Default: ``false`` + * Importance: medium + +``file_reader.delimited.settings.ignore_leading_whitespaces`` + Flag to enable/disable skipping leading whitespaces from values. * Type: boolean * Default: ``true`` + * Importance: low + +``file_reader.delimited.settings.ignore_trailing_whitespaces`` + Flag to enable/disable skipping trailing whitespaces from values. + + * Type: boolean + * Default: ``true`` + * Importance: low + +``file_reader.delimited.settings.format.comment`` + Character that represents a line comment at the beginning of a line. + + * Type: char + * Default: ``#`` + * Importance: low + +``file_reader.delimited.settings.escape_unquoted`` + Flag to enable/disable processing escape sequences in unquoted values. + + * Type: boolean + * Default: ``false`` + * Importance: low + +``file_reader.delimited.settings.format.quote`` + Character used for escaping values where the field delimiter is part of the value. + + * Type: char + * Default: ``"`` + * Importance: low + +``file_reader.delimited.settings.format.quote_escape`` + Character used for escaping quotes inside an already quoted value. + + * Type: char + * Default: ``"`` + * Importance: low + +``file_reader.delimited.encoding`` + Encoding to use for reading a file. If not specified, the reader will use the default encoding. + + * Type: string * Importance: medium -``file_reader.json.compression.type`` +``file_reader.delimited.compression.type`` Compression type to use when reading a file. 
* Type: enum (available values ``bzip2``, ``gzip`` and ``none``) * Default: ``none`` * Importance: medium -``file_reader.json.compression.concatenated`` +``file_reader.delimited.compression.concatenated`` Flag to specify if the decompression of the reader will finish at the end of the file or after the first compressed stream. @@ -323,39 +433,133 @@ To configure custom properties for this reader, the name you must use is ``text` * Default: ``true`` * Importance: low -``file_reader.text.field_name.value`` - Custom field name for the output value to include in the Kafka message. +.. _config_options-filereaders-tsv: + +TSV +-------------------------------------------- + +To configure custom properties for this reader, the name you must use is ``delimited`` (even though it's for TSV). + +``file_reader.delimited.settings.header`` + If the file contains header or not. + + * Type: boolean + * Default: ``false`` + * Importance: high + +``file_reader.delimited.settings.null_value`` + Default value for ``null`` values. * Type: string - * Default: ``value`` - * Importance: low + * Default: ``null`` + * Importance: medium -``file_reader.text.encoding`` - Encoding to use for reading a file. If not specified, the reader will use the default encoding. +``file_reader.delimited.settings.format.line_separator`` + Line separator to be used. * Type: string + * Default: ``\n`` * Importance: medium -.. _config_options-filereaders-delimited: +``file_reader.delimited.settings.max_columns`` + Default value for ``null`` values. -Delimited text --------------------------------------------- + * Type: int + * Default: ``512`` + * Importance: low -To configure custom properties for this reader, the name you must use is ``delimited``. +``file_reader.delimited.settings.max_chars_per_column`` + Default value for ``null`` values. -``file_reader.delimited.token`` - The token delimiter for columns. + * Type: int + * Default: ``4096`` + * Importance: low - * Type: string - * Importance: high +``file_reader.delimited.settings.rows_to_skip`` + Number of rows to skip. -``file_reader.delimited.header`` - If the file contains header or not. + * Type: long + * Default: ``0`` + * Importance: low + +``file_reader.delimited.settings.line_separator_detection`` + If the reader should detect the line separator automatically. + + * Type: boolean + * Default: ``false`` + * Importance: medium + +``file_reader.delimited.settings.line_separator_detection`` + If the reader should detect the line separator automatically. * Type: boolean * Default: ``false`` + * Importance: low + +``file_reader.delimited.settings.line_joining`` + Identifies whether or lines ending with the escape character and followed by a line + separator character should be joined with the following line. + + * Type: boolean + * Default: ``true`` + * Importance: low + +``file_reader.delimited.settings.ignore_trailing_whitespaces`` + Flag to enable/disable skipping trailing whitespaces from values. + + * Type: boolean + * Default: ``true`` + * Importance: low + +``file_reader.delimited.settings.format.comment`` + Character that represents a line comment at the beginning of a line. + + * Type: char + * Default: ``#`` + * Importance: low + +``file_reader.delimited.settings.format.escape`` + Character used for escaping special characters. + + * Type: char + * Default: ``\`` + * Importance: low + +``file_reader.delimited.settings.format.escaped_char`` + Character used to represent an escaped tab. 
+ + * Type: char + * Default: ``t`` + * Importance: low + +``file_reader.delimited.encoding`` + Encoding to use for reading a file. If not specified, the reader will use the default encoding. + + * Type: string * Importance: medium +``file_reader.delimited.compression.type`` + Compression type to use when reading a file. + + * Type: enum (available values ``bzip2``, ``gzip`` and ``none``) + * Default: ``none`` + * Importance: medium + +``file_reader.delimited.compression.concatenated`` + Flag to specify if the decompression of the reader will finish at the end of the file or after + the first compressed stream. + + * Type: boolean + * Default: ``true`` + * Importance: low + +.. _config_options-filereaders-text: + +Text +-------------------------------------------- + +To configure custom properties for this reader, the name you must use is ``text``. + ``file_reader.json.record_per_line`` If enabled, the reader will read each line as a record. Otherwise, the reader will read the full content of the file as a record. @@ -364,12 +568,17 @@ To configure custom properties for this reader, the name you must use is ``delim * Default: ``true`` * Importance: medium -``file_reader.delimited.default_value`` - Sets a default value in a column when its value is null. This is due to the record is malformed (it does not contain - all expected columns). +``file_reader.text.field_name.value`` + Custom field name for the output value to include in the Kafka message. + + * Type: string + * Default: ``value`` + * Importance: medium + +``file_reader.text.encoding`` + Encoding to use for reading a file. If not specified, the reader will use the default encoding. * Type: string - * Default: ``null`` * Importance: medium ``file_reader.json.compression.type`` @@ -387,11 +596,7 @@ To configure custom properties for this reader, the name you must use is ``delim * Default: ``true`` * Importance: low -``file_reader.delimited.encoding`` - Encoding to use for reading a file. If not specified, the reader will use the default encoding. - - * Type: string - * Importance: medium +.. _config_options-filereaders-agnostic: Agnostic -------------------------------------------- @@ -426,11 +631,18 @@ To configure custom properties for this reader, the name you must use is ``agnos * Default: ``json`` * Importance: medium -``file_reader.agnostic.extensions.delimited`` - A comma-separated string list with the accepted extensions for Delimited text files. +``file_reader.agnostic.extensions.csv`` + A comma-separated string list with the accepted extensions for CSV files. + + * Type: string + * Default: ``csv`` + * Importance: medium + +``file_reader.agnostic.extensions.tsv`` + A comma-separated string list with the accepted extensions for TSV files. * Type: string - * Default: ``tsv,csv`` + * Default: ``tsv`` * Importance: medium .. note:: The Agnostic reader uses the previous ones as inner readers. So, in case of using this diff --git a/docs/source/connector.rst b/docs/source/connector.rst index 8d2e305..6c79317 100644 --- a/docs/source/connector.rst +++ b/docs/source/connector.rst @@ -24,7 +24,7 @@ Getting started Prerequisites -------------------------------------------- -- Confluent Platform 5.4.0 +- Confluent Platform 5.4.1 - Java 8 Building from source diff --git a/docs/source/filereaders.rst b/docs/source/filereaders.rst index 0ea1560..8e52634 100644 --- a/docs/source/filereaders.rst +++ b/docs/source/filereaders.rst @@ -47,6 +47,27 @@ and marked as optional in the schema all the fields contained. 
More information about properties of this file reader :ref:`here`.

+CSV
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+CSV file reader using a custom token to distinguish different columns on each line.
+
+It allows distinguishing a header in the files and setting the names of their columns
+in the message sent to Kafka. If there is no header, the value of each column will be in
+the field named ``column_N`` (**N** represents the column index) in the message.
+Also, the token delimiter for columns is configurable.
+
+More information about properties of this file reader :ref:`here`.
+
+TSV
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+TSV file reader using a tab (``\t``) to distinguish different columns on each line.
+
+Its behaviour is the same as the CSV file reader's regarding the header and the column names.
+
+More information about properties of this file reader :ref:`here`.
+
 Text
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -58,33 +79,22 @@ customize these field names.

 More information about properties of this file reader :ref:`here`.

-Delimited text
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Text file reader using a custom token to distinguish different columns on each line.
-
-It allows to distinguish a header in the files and set the name of their columns
-in the message sent to Kafka. If there is no header, the value of each column will be in
-the field named ``column_N`` (**N** represents the column index) in the message.
-Also, the token delimiter for columns is configurable.
-
-More information about properties of this file reader :ref:`here`.
-
 Agnostic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Actually, this reader is a wrapper of the readers listed above.

 It tries to read any kind of file format using an internal reader based on the file extension,
-applying the proper one (Parquet, Avro, SecuenceFile, Text or Delimited text). In case of no
+applying the proper one (Parquet, Avro, SequenceFile, CSV, TSV or Text). In case no
 extension has been matched, the Text file reader will be applied.

-Default extensions for each format:
-* Parquet: .parquet
-* Avro: .avro
-* SequenceFile: .seq
-* JSON: .json
-* Delimited text: .tsv, .csv
+Default extensions for each format (configurable):
+* Parquet: ``.parquet``
+* Avro: ``.avro``
+* SequenceFile: ``.seq``
+* JSON: ``.json``
+* CSV: ``.csv``
+* TSV: ``.tsv``
 * Text: any other sort of file extension.

 More information about properties of this file reader :ref:`here`.
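The options documented in this patch combine into an ordinary Kafka Connect configuration. The sketch below is illustrative only and not part of the patch series: the property keys are the ones documented above in ``config_options.rst``, while the connector, policy and file reader class names are assumptions inferred from the classes these commits touch (``FsSourceConnector``, ``SimplePolicy``, ``CsvFileReader``), and ``fs.uris`` is assumed to be the key behind the ``FS_URIS`` constant.

    # Hypothetical connector config assembled from the documented options;
    # class names and the fs.uris key are assumptions, not taken from a shipped example.
    name=local-csv-source
    connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector
    tasks.max=1
    fs.uris=file:///data/${yyyy}/${MM}/${dd}
    topic=csv_topic
    # SimplePolicy is assumed from SimplePolicyTest in this series; any Policy implementation fits here.
    policy.class=com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy
    policy.recursive=true
    policy.regexp=^.*\.csv$
    # CsvFileReader is assumed from CsvFileReaderTest; it maps to the documented file_reader.delimited.* settings.
    file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.CsvFileReader
    file_reader.delimited.settings.header=true
    file_reader.delimited.settings.format.delimiter=,
    file_reader.delimited.compression.type=none

With the Agnostic reader instead, ``file_reader.class`` would point at that wrapper and the ``file_reader.agnostic.extensions.csv``/``file_reader.agnostic.extensions.tsv`` mappings documented above decide which inner reader handles each file by extension.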
From 372ecf6071275d18320a589e8cce0293440565dc Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 21 Mar 2020 19:13:52 -0600 Subject: [PATCH 27/51] Test refactor for tasks and policies --- .../kafka/connect/fs/FsSourceTask.java | 4 +- ...estBase.java => AbstractHdfsFsConfig.java} | 37 +- .../connect/fs/AbstractLocalFsConfig.java | 41 +++ .../kafka/connect/fs/FsTestConfig.java | 17 + .../fs/file/reader/AvroFileReaderTest.java | 8 +- .../fs/file/reader/CsvFileReaderTest.java | 4 +- .../fs/file/reader/FileReaderTestBase.java | 30 +- .../fs/file/reader/FileSystemConfig.java | 153 --------- .../fs/file/reader/JsonFileReaderTest.java | 14 +- .../fs/file/reader/ParquetFileReaderTest.java | 16 +- .../fs/file/reader/ReaderFsTestConfig.java | 97 ++++++ .../file/reader/SequenceFileReaderTest.java | 4 +- .../fs/file/reader/TextFileReaderTest.java | 10 +- .../fs/file/reader/TsvFileReaderTest.java | 2 +- .../file/reader/UnivocityFileReaderTest.java | 16 +- .../fs/policy/{local => }/CronPolicyTest.java | 68 ++-- .../fs/policy/HdfsFileWatcherPolicyTest.java | 79 +++++ .../connect/fs/policy/PolicyFsTestConfig.java | 112 +++++++ .../connect/fs/policy/PolicyTestBase.java | 172 ++++++---- .../policy/{local => }/SimplePolicyTest.java | 25 +- .../policy/{local => }/SleepyPolicyTest.java | 78 ++--- .../fs/policy/hdfs/CronPolicyTest.java | 90 ----- .../hdfs/HdfsFileWatcherPolicyTest.java | 71 ---- .../fs/policy/hdfs/SimplePolicyTest.java | 40 --- .../fs/policy/hdfs/SleepyPolicyTest.java | 110 ------ .../fs/policy/local/LocalPolicyTestBase.java | 29 -- .../connect/fs/task/FsSourceTaskTest.java | 316 +++++++++++++++--- .../connect/fs/task/FsSourceTaskTestBase.java | 187 ----------- .../connect/fs/task/TaskFsTestConfig.java | 113 +++++++ .../fs/task/hdfs/HdfsFsSourceTaskTest.java | 66 ---- .../task/hdfs/HdfsFsSourceTaskTestBase.java | 33 -- .../fs/task/local/LocalFsSourceTaskTest.java | 65 ---- .../task/local/LocalFsSourceTaskTestBase.java | 29 -- 33 files changed, 980 insertions(+), 1156 deletions(-) rename src/test/java/com/github/mmolimar/kafka/connect/fs/{policy/hdfs/HdfsPolicyTestBase.java => AbstractHdfsFsConfig.java} (50%) create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractLocalFsConfig.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/FsTestConfig.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileSystemConfig.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ReaderFsTestConfig.java rename src/test/java/com/github/mmolimar/kafka/connect/fs/policy/{local => }/CronPolicyTest.java (53%) create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyFsTestConfig.java rename src/test/java/com/github/mmolimar/kafka/connect/fs/policy/{local => }/SimplePolicyTest.java (57%) rename src/test/java/com/github/mmolimar/kafka/connect/fs/policy/{local => }/SleepyPolicyTest.java (54%) delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/CronPolicyTest.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java delete mode 100644 
src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/LocalPolicyTestBase.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTestBase.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/task/TaskFsTestConfig.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTest.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTestBase.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTest.java delete mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTestBase.java diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java index 971f6ee..1fdc9b5 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java @@ -89,7 +89,9 @@ private List filesToProcess() { .collect(Collectors.toList()); } catch (IOException | ConnectException e) { //when an exception happens executing the policy, the connector continues - log.error("Cannot retrieve files to process from FS: " + policy.getURIs() + ". Keep going...", e); + log.error("Cannot retrieve files to process from the FS: " + policy.getURIs() + ". " + + "There was an error executing the policy but the task tolerates this and continues. " + + "Error message: " + e.getMessage()); return Collections.emptyList(); } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsPolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractHdfsFsConfig.java similarity index 50% rename from src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsPolicyTestBase.java rename to src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractHdfsFsConfig.java index 522d1de..f3fef89 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsPolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractHdfsFsConfig.java @@ -1,33 +1,44 @@ -package com.github.mmolimar.kafka.connect.fs.policy.hdfs; +package com.github.mmolimar.kafka.connect.fs; -import com.github.mmolimar.kafka.connect.fs.policy.PolicyTestBase; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; import java.io.IOException; import java.net.URI; import java.nio.file.Files; -import java.nio.file.Path; -public abstract class HdfsPolicyTestBase extends PolicyTestBase { +public abstract class AbstractHdfsFsConfig implements FsTestConfig { + private MiniDFSCluster cluster; + private FileSystem fs; + private URI fsUri; - private static MiniDFSCluster cluster; - - @BeforeAll - public static void initFs() throws IOException { + @Override + public final void initFs() throws IOException { Configuration clusterConfig = new Configuration(); - Path hdfsDir = Files.createTempDirectory("test-"); + java.nio.file.Path hdfsDir = Files.createTempDirectory("test-"); clusterConfig.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsDir.toAbsolutePath().toString()); cluster = new MiniDFSCluster.Builder(clusterConfig).build(); fsUri = URI.create("hdfs://localhost:" + cluster.getNameNodePort() + "/"); fs = FileSystem.newInstance(fsUri, new Configuration()); 
+ init(); + } + + protected abstract void init() throws IOException; + + @Override + public FileSystem getFs() { + return fs; + } + + @Override + public URI getFsUri() { + return fsUri; } - @AfterAll - public static void finishFs() { + @Override + public void close() throws IOException { + fs.close(); cluster.shutdown(true); } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractLocalFsConfig.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractLocalFsConfig.java new file mode 100644 index 0000000..dab5736 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/AbstractLocalFsConfig.java @@ -0,0 +1,41 @@ +package com.github.mmolimar.kafka.connect.fs; + +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.Files; + +public abstract class AbstractLocalFsConfig implements FsTestConfig { + private java.nio.file.Path localDir; + private FileSystem fs; + private URI fsUri; + + @Override + public final void initFs() throws IOException { + localDir = Files.createTempDirectory("test-"); + fsUri = localDir.toUri(); + fs = FileSystem.newInstance(fsUri, new Configuration()); + init(); + } + + protected abstract void init() throws IOException; + + @Override + public FileSystem getFs() { + return fs; + } + + @Override + public URI getFsUri() { + return fsUri; + } + + @Override + public void close() throws IOException { + fs.close(); + FileUtils.deleteDirectory(localDir.toFile()); + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/FsTestConfig.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/FsTestConfig.java new file mode 100644 index 0000000..64b9c4c --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/FsTestConfig.java @@ -0,0 +1,17 @@ +package com.github.mmolimar.kafka.connect.fs; + +import org.apache.hadoop.fs.FileSystem; + +import java.io.Closeable; +import java.io.IOException; +import java.net.URI; + +public interface FsTestConfig extends Closeable { + + void initFs() throws IOException; + + FileSystem getFs(); + + URI getFsUri(); + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java index 75fc8a2..841c951 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java @@ -41,7 +41,7 @@ public static void setUp() throws IOException { } @Override - protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException { + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { File avroFile = File.createTempFile("test-", "." + getFileExtension()); DatumWriter writer = new GenericDatumWriter<>(schema); try (DataFileWriter dataFileWriter = new DataFileWriter<>(writer)) { @@ -69,7 +69,7 @@ protected Path createDataFile(FileSystemConfig fsConfig, Object... 
args) throws @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readerWithSchema(FileSystemConfig fsConfig) throws Throwable { + public void readerWithSchema(ReaderFsTestConfig fsConfig) throws Throwable { Map readerConfig = getReaderConfig(); readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, schema.toString()); FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); @@ -79,7 +79,7 @@ public void readerWithSchema(FileSystemConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readerWithInvalidSchema(FileSystemConfig fsConfig) throws Throwable { + public void readerWithInvalidSchema(ReaderFsTestConfig fsConfig) throws Throwable { Map readerConfig = getReaderConfig(); readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, Schema.create(Schema.Type.STRING).toString()); FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); @@ -96,7 +96,7 @@ public void readerWithInvalidSchema(FileSystemConfig fsConfig) throws Throwable @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readerWithUnparseableSchema(FileSystemConfig fsConfig) throws IOException { + public void readerWithUnparseableSchema(ReaderFsTestConfig fsConfig) throws IOException { Map readerConfig = getReaderConfig(); readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, "invalid schema"); FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java index f4a0809..a6b9fbf 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java @@ -19,7 +19,7 @@ public class CsvFileReaderTest extends UnivocityFileReaderTest { @Override - protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException { + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { boolean header = args.length < 1 || (boolean) args[0]; CompressionType compression = args.length < 2 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[1]; File txtFile = File.createTempFile("test-", "." + getFileExtension()); @@ -40,7 +40,7 @@ protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readAllDataWithMalformedRows(FileSystemConfig fsConfig) throws Throwable { + public void readAllDataWithMalformedRows(ReaderFsTestConfig fsConfig) throws Throwable { File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); try (FileWriter writer = new FileWriter(tmp)) { writer.append(FIELD_COLUMN1 + "," + FIELD_COLUMN2 + "," + FIELD_COLUMN3 + "," + FIELD_COLUMN4 + "\n"); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java index 4e0d38b..e691d87 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java @@ -23,7 +23,7 @@ abstract class FileReaderTestBase { - private static final List TEST_FILE_SYSTEMS = Arrays.asList( + private static final List TEST_FILE_SYSTEMS = Arrays.asList( new LocalFsConfig(), new HdfsFsConfig() ); @@ -31,21 +31,21 @@ abstract class FileReaderTestBase { @BeforeAll public static void initFs() throws IOException { - for (FileSystemConfig fsConfig : TEST_FILE_SYSTEMS) { + for (ReaderFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { fsConfig.initFs(); } } @AfterAll public static void finishFs() throws IOException { - for (FileSystemConfig fsConfig : TEST_FILE_SYSTEMS) { + for (ReaderFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { fsConfig.close(); } } @BeforeEach public void openReader() throws Throwable { - for (FileSystemConfig fsConfig : TEST_FILE_SYSTEMS) { + for (ReaderFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { fsConfig.setDataFile(createDataFile(fsConfig)); FileReader reader = ReflectionUtils.makeReader(getReaderClass(), fsConfig.getFs(), fsConfig.getDataFile(), getReaderConfig()); @@ -56,7 +56,7 @@ public void openReader() throws Throwable { @AfterEach public void closeReader() { - for (FileSystemConfig fsConfig : TEST_FILE_SYSTEMS) { + for (ReaderFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { try { fsConfig.getReader().close(); } catch (Exception e) { @@ -71,7 +71,7 @@ private static Stream fileSystemConfigProvider() { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void invalidArgs(FileSystemConfig fsConfig) { + public void invalidArgs(ReaderFsTestConfig fsConfig) { try { fsConfig.getReader().getClass().getConstructor(FileSystem.class, Path.class, Map.class) .newInstance(null, null, null); @@ -84,14 +84,14 @@ public void invalidArgs(FileSystemConfig fsConfig) { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void fileDoesNotExist(FileSystemConfig fsConfig) { + public void fileDoesNotExist(ReaderFsTestConfig fsConfig) { Path path = new Path(new Path(fsConfig.getFsUri()), UUID.randomUUID().toString()); assertThrows(FileNotFoundException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig())); } @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void emptyFile(FileSystemConfig fsConfig) throws Throwable { + public void emptyFile(ReaderFsTestConfig fsConfig) throws Throwable { File tmp = File.createTempFile("test-", "." + getFileExtension()); Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); @@ -100,7 +100,7 @@ public void emptyFile(FileSystemConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void invalidFileFormat(FileSystemConfig fsConfig) throws Throwable { + public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws Throwable { File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { writer.write("test"); @@ -112,7 +112,7 @@ public void invalidFileFormat(FileSystemConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readAllData(FileSystemConfig fsConfig) { + public void readAllData(ReaderFsTestConfig fsConfig) { FileReader reader = fsConfig.getReader(); assertTrue(reader.hasNext()); @@ -127,7 +127,7 @@ public void readAllData(FileSystemConfig fsConfig) { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void seekFile(FileSystemConfig fsConfig) { + public void seekFile(ReaderFsTestConfig fsConfig) { FileReader reader = fsConfig.getReader(); int recordIndex = NUM_RECORDS / 2; reader.seek(getOffset(fsConfig.offsetsByIndex().get(recordIndex))); @@ -153,14 +153,14 @@ public void seekFile(FileSystemConfig fsConfig) { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void negativeSeek(FileSystemConfig fsConfig) { + public void negativeSeek(ReaderFsTestConfig fsConfig) { FileReader reader = fsConfig.getReader(); assertThrows(RuntimeException.class, () -> reader.seek(getOffset(-1))); } @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void exceededSeek(FileSystemConfig fsConfig) { + public void exceededSeek(ReaderFsTestConfig fsConfig) { FileReader reader = fsConfig.getReader(); reader.seek(getOffset(fsConfig.offsetsByIndex().get(NUM_RECORDS - 1) + 1)); assertFalse(reader.hasNext()); @@ -169,7 +169,7 @@ public void exceededSeek(FileSystemConfig fsConfig) { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readFileAlreadyClosed(FileSystemConfig fsConfig) throws IOException { + public void readFileAlreadyClosed(ReaderFsTestConfig fsConfig) throws IOException { FileReader reader = fsConfig.getReader(); reader.close(); assertThrows(IllegalStateException.class, reader::hasNext); @@ -202,7 +202,7 @@ protected OutputStream getOutputStream(File file, CompressionType compression) t protected abstract Class getReaderClass(); - protected abstract Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException; + protected abstract Path createDataFile(ReaderFsTestConfig fsConfig, Object... 
args) throws IOException; protected abstract Map getReaderConfig(); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileSystemConfig.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileSystemConfig.java deleted file mode 100644 index a838251..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileSystemConfig.java +++ /dev/null @@ -1,153 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file.reader; - -import org.apache.commons.io.FileUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hdfs.MiniDFSCluster; - -import java.io.Closeable; -import java.io.IOException; -import java.net.URI; -import java.nio.file.Files; -import java.util.HashMap; -import java.util.Map; - -interface FileSystemConfig extends Closeable { - - void initFs() throws IOException; - - FileSystem getFs(); - - URI getFsUri(); - - void setDataFile(Path dataFile); - - Path getDataFile(); - - void setReader(FileReader reader); - - FileReader getReader(); - - Map offsetsByIndex(); - -} - -class LocalFsConfig implements FileSystemConfig { - private java.nio.file.Path localDir; - private FileSystem fs; - private URI fsUri; - private Path dataFile; - private FileReader reader; - private Map offsetsByIndex; - - @Override - public void initFs() throws IOException { - localDir = Files.createTempDirectory("test-"); - fsUri = localDir.toUri(); - fs = FileSystem.newInstance(fsUri, new Configuration()); - offsetsByIndex = new HashMap<>(); - } - - @Override - public FileSystem getFs() { - return fs; - } - - @Override - public URI getFsUri() { - return fsUri; - } - - @Override - public void setDataFile(Path dataFile) { - this.dataFile = dataFile; - } - - @Override - public Path getDataFile() { - return dataFile; - } - - @Override - public void setReader(FileReader reader) { - this.reader = reader; - } - - @Override - public FileReader getReader() { - return reader; - } - - @Override - public Map offsetsByIndex() { - return offsetsByIndex; - } - - @Override - public void close() throws IOException { - fs.close(); - FileUtils.deleteDirectory(localDir.toFile()); - } -} - -class HdfsFsConfig implements FileSystemConfig { - private MiniDFSCluster cluster; - private FileSystem fs; - private URI fsUri; - private Path dataFile; - private FileReader reader; - private Map offsetsByIndex; - - @Override - public void initFs() throws IOException { - Configuration clusterConfig = new Configuration(); - java.nio.file.Path hdfsDir = Files.createTempDirectory("test-"); - clusterConfig.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsDir.toAbsolutePath().toString()); - cluster = new MiniDFSCluster.Builder(clusterConfig).build(); - fsUri = URI.create("hdfs://localhost:" + cluster.getNameNodePort() + "/"); - fs = FileSystem.newInstance(fsUri, new Configuration()); - offsetsByIndex = new HashMap<>(); - } - - @Override - public FileSystem getFs() { - return fs; - } - - @Override - public URI getFsUri() { - return fsUri; - } - - @Override - public Path getDataFile() { - return dataFile; - } - - @Override - public void setDataFile(Path dataFile) { - this.dataFile = dataFile; - } - - @Override - public void setReader(FileReader reader) { - this.reader = reader; - } - - @Override - public FileReader getReader() { - return reader; - } - - @Override - public Map offsetsByIndex() { - return offsetsByIndex; - } - - @Override - public void close() throws IOException { - fs.close(); - 
cluster.shutdown(true); - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java index da8994b..1df1cd0 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java @@ -38,7 +38,7 @@ public class JsonFileReaderTest extends FileReaderTestBase { private static final CompressionType COMPRESSION_TYPE_DEFAULT = CompressionType.NONE; @Override - protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException { + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { int numRecords = args.length < 1 ? NUM_RECORDS : (int) args[0]; boolean recordPerLine = args.length < 2 || (boolean) args[1]; CompressionType compression = args.length < 3 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[2]; @@ -78,7 +78,7 @@ protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void emptyFile(FileSystemConfig fsConfig) throws Throwable { + public void emptyFile(ReaderFsTestConfig fsConfig) throws Throwable { File tmp = File.createTempFile("test-", "." + getFileExtension()); Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); @@ -88,7 +88,7 @@ public void emptyFile(FileSystemConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void validFileEncoding(FileSystemConfig fsConfig) throws Throwable { + public void validFileEncoding(ReaderFsTestConfig fsConfig) throws Throwable { Map readerConfig = getReaderConfig(); readerConfig.put(JsonFileReader.FILE_READER_JSON_ENCODING, "Cp1252"); fsConfig.setReader(getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); @@ -97,7 +97,7 @@ public void validFileEncoding(FileSystemConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void invalidDeserializationConfig(FileSystemConfig fsConfig) throws Throwable { + public void invalidDeserializationConfig(ReaderFsTestConfig fsConfig) throws Throwable { Map readerConfig = getReaderConfig(); readerConfig.put(JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS + "invalid", "false"); fsConfig.setReader(getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); @@ -106,7 +106,7 @@ public void invalidDeserializationConfig(FileSystemConfig fsConfig) throws Throw @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void invalidFileEncoding(FileSystemConfig fsConfig) { + public void invalidFileEncoding(ReaderFsTestConfig fsConfig) { Map readerConfig = getReaderConfig(); readerConfig.put(JsonFileReader.FILE_READER_JSON_ENCODING, "invalid_charset"); assertThrows(UnsupportedCharsetException.class, () -> getReader(fsConfig.getFs(), @@ -115,7 +115,7 @@ public void invalidFileEncoding(FileSystemConfig fsConfig) { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readDataWithRecordPerLineDisabled(FileSystemConfig fsConfig) throws Throwable { + public void readDataWithRecordPerLineDisabled(ReaderFsTestConfig fsConfig) throws Throwable { Path file = createDataFile(fsConfig, 1, false); Map readerConfig = getReaderConfig(); 
readerConfig.put(JsonFileReader.FILE_READER_JSON_RECORD_PER_LINE, "false"); @@ -135,7 +135,7 @@ public void readDataWithRecordPerLineDisabled(FileSystemConfig fsConfig) throws @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readDifferentCompressionTypes(FileSystemConfig fsConfig) { + public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { Arrays.stream(CompressionType.values()).forEach(compressionType -> { try { Path file = createDataFile(fsConfig, NUM_RECORDS, true, compressionType); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java index ae21b88..891eeec 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java @@ -50,7 +50,7 @@ public static void setUp() throws IOException { } @Override - protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException { + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { FileSystem fs = fsConfig.getFs(); File parquetFile = File.createTempFile("test-", "." + getFileExtension()); @@ -77,7 +77,7 @@ protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void emptyFile(FileSystemConfig fsConfig) throws Throwable { + public void emptyFile(ReaderFsTestConfig fsConfig) throws Throwable { File tmp = File.createTempFile("test-", "." + getFileExtension()); Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); @@ -86,7 +86,7 @@ public void emptyFile(FileSystemConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void invalidFileFormat(FileSystemConfig fsConfig) throws Throwable { + public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws Throwable { File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { writer.write("test"); @@ -98,7 +98,7 @@ public void invalidFileFormat(FileSystemConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readerWithSchema(FileSystemConfig fsConfig) throws Throwable { + public void readerWithSchema(ReaderFsTestConfig fsConfig) throws Throwable { Map readerConfig = getReaderConfig(); readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, readerSchema.toString()); readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); @@ -109,7 +109,7 @@ public void readerWithSchema(FileSystemConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readerWithProjection(FileSystemConfig fsConfig) throws Throwable { + public void readerWithProjection(ReaderFsTestConfig fsConfig) throws Throwable { Map readerConfig = getReaderConfig(); readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, projectionSchema.toString()); readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); @@ -127,7 +127,7 @@ public void readerWithProjection(FileSystemConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readerWithInvalidProjection(FileSystemConfig fsConfig) throws Throwable { + public void readerWithInvalidProjection(ReaderFsTestConfig fsConfig) throws Throwable { Schema testSchema = SchemaBuilder.record("test_projection").namespace("test.avro") .fields() .name("field1").type("string").noDefault() @@ -142,7 +142,7 @@ public void readerWithInvalidProjection(FileSystemConfig fsConfig) throws Throwa @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readerWithInvalidSchema(FileSystemConfig fsConfig) throws Throwable { + public void readerWithInvalidSchema(ReaderFsTestConfig fsConfig) throws Throwable { Map readerConfig = getReaderConfig(); readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, Schema.create(Schema.Type.STRING).toString()); readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); @@ -153,7 +153,7 @@ public void readerWithInvalidSchema(FileSystemConfig fsConfig) throws Throwable @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readerWithUnparseableSchema(FileSystemConfig fsConfig) { + public void readerWithUnparseableSchema(ReaderFsTestConfig fsConfig) { Map readerConfig = getReaderConfig(); readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, "invalid schema"); readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ReaderFsTestConfig.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ReaderFsTestConfig.java new file mode 100644 index 0000000..7fde007 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ReaderFsTestConfig.java @@ -0,0 +1,97 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.github.mmolimar.kafka.connect.fs.AbstractHdfsFsConfig; +import com.github.mmolimar.kafka.connect.fs.AbstractLocalFsConfig; +import com.github.mmolimar.kafka.connect.fs.FsTestConfig; +import org.apache.hadoop.fs.Path; + +import java.util.HashMap; +import java.util.Map; + +interface ReaderFsTestConfig extends FsTestConfig { + + void 
setDataFile(Path dataFile); + + Path getDataFile(); + + void setReader(FileReader reader); + + FileReader getReader(); + + Map offsetsByIndex(); + +} + +class LocalFsConfig extends AbstractLocalFsConfig implements ReaderFsTestConfig { + private Path dataFile; + private FileReader reader; + private Map offsetsByIndex; + + @Override + public void init() { + offsetsByIndex = new HashMap<>(); + } + + @Override + public void setDataFile(Path dataFile) { + this.dataFile = dataFile; + } + + @Override + public Path getDataFile() { + return dataFile; + } + + @Override + public void setReader(FileReader reader) { + this.reader = reader; + } + + @Override + public FileReader getReader() { + return reader; + } + + @Override + public Map offsetsByIndex() { + return offsetsByIndex; + } + +} + +class HdfsFsConfig extends AbstractHdfsFsConfig implements ReaderFsTestConfig { + private Path dataFile; + private FileReader reader; + private Map offsetsByIndex; + + @Override + public void init() { + offsetsByIndex = new HashMap<>(); + } + + @Override + public Path getDataFile() { + return dataFile; + } + + @Override + public void setDataFile(Path dataFile) { + this.dataFile = dataFile; + } + + @Override + public void setReader(FileReader reader) { + this.reader = reader; + } + + @Override + public FileReader getReader() { + return reader; + } + + @Override + public Map offsetsByIndex() { + return offsetsByIndex; + } + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java index ae87901..ee5bdd8 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java @@ -28,7 +28,7 @@ public class SequenceFileReaderTest extends FileReaderTestBase { private static final String FILE_EXTENSION = "sq"; @Override - protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException { + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { FileSystem fs = fsConfig.getFs(); File seqFile = File.createTempFile("test-", "." + getFileExtension()); try (SequenceFile.Writer writer = SequenceFile.createWriter(fs.getConf(), @@ -63,7 +63,7 @@ protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void defaultFieldNames(FileSystemConfig fsConfig) throws Throwable { + public void defaultFieldNames(ReaderFsTestConfig fsConfig) throws Throwable { Map readerConfig = getReaderConfig(); readerConfig.put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_KEY, null); readerConfig.put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_VALUE, null); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java index 53ac900..281bb24 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java @@ -25,7 +25,7 @@ public class TextFileReaderTest extends FileReaderTestBase { private static final CompressionType COMPRESSION_TYPE_DEFAULT = CompressionType.GZIP; @Override - protected Path createDataFile(FileSystemConfig fsConfig, Object... 
args) throws IOException { + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { CompressionType compression = args.length < 1 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[0]; File txtFile = File.createTempFile("test-", "." + FILE_EXTENSION); try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { @@ -42,7 +42,7 @@ protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void validFileEncoding(FileSystemConfig fsConfig) throws Throwable { + public void validFileEncoding(ReaderFsTestConfig fsConfig) throws Throwable { Map readerConfig = getReaderConfig(); readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); readerConfig.put(TextFileReader.FILE_READER_TEXT_ENCODING, "Cp1252"); @@ -54,7 +54,7 @@ public void validFileEncoding(FileSystemConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void invalidFileEncoding(FileSystemConfig fsConfig) { + public void invalidFileEncoding(ReaderFsTestConfig fsConfig) { Map readerConfig = getReaderConfig(); readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); readerConfig.put(TextFileReader.FILE_READER_TEXT_ENCODING, "invalid_charset"); @@ -65,7 +65,7 @@ public void invalidFileEncoding(FileSystemConfig fsConfig) { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readDataWithRecordPerLineDisabled(FileSystemConfig fsConfig) throws Throwable { + public void readDataWithRecordPerLineDisabled(ReaderFsTestConfig fsConfig) throws Throwable { Path file = createDataFile(fsConfig, COMPRESSION_TYPE_DEFAULT); Map readerConfig = getReaderConfig(); readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); @@ -87,7 +87,7 @@ public void readDataWithRecordPerLineDisabled(FileSystemConfig fsConfig) throws @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readDifferentCompressionTypes(FileSystemConfig fsConfig) { + public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { Arrays.stream(CompressionType.values()).forEach(compressionType -> { try { Path file = createDataFile(fsConfig, compressionType); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java index 20c0dc0..2f94f28 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java @@ -13,7 +13,7 @@ public class TsvFileReaderTest extends UnivocityFileReaderTest { @Override - protected Path createDataFile(FileSystemConfig fsConfig, Object... args) throws IOException { + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { boolean header = args.length < 1 || (boolean) args[0]; CompressionType compression = args.length < 2 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[1]; File txtFile = File.createTempFile("test-", "." 
+ getFileExtension()); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java index da18e0e..d224027 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java @@ -29,7 +29,7 @@ abstract class UnivocityFileReaderTest extends Fi @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void emptyFile(FileSystemConfig fsConfig) throws Throwable { + public void emptyFile(ReaderFsTestConfig fsConfig) throws Throwable { File tmp = File.createTempFile("test-", "." + getFileExtension()); Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); @@ -38,7 +38,7 @@ public void emptyFile(FileSystemConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void invalidFileFormat(FileSystemConfig fsConfig) throws Throwable { + public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws Throwable { File tmp = File.createTempFile("test-", "." + getFileExtension()); try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { writer.write("test"); @@ -50,7 +50,7 @@ public void invalidFileFormat(FileSystemConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void invaliConfigArgs(FileSystemConfig fsConfig) { + public void invalidConfigArgs(ReaderFsTestConfig fsConfig) { try { getReaderClass().getConstructor(FileSystem.class, Path.class, Map.class) .newInstance(fsConfig.getFs(), fsConfig.getDataFile(), new HashMap()); @@ -63,7 +63,7 @@ public void invaliConfigArgs(FileSystemConfig fsConfig) { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readAllDataWithoutHeader(FileSystemConfig fsConfig) throws Throwable { + public void readAllDataWithoutHeader(ReaderFsTestConfig fsConfig) throws Throwable { Path file = createDataFile(fsConfig, false); Map readerConfig = getReaderConfig(); readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "false"); @@ -82,7 +82,7 @@ public void readAllDataWithoutHeader(FileSystemConfig fsConfig) throws Throwable @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readDifferentCompressionTypes(FileSystemConfig fsConfig) { + public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { Arrays.stream(CompressionType.values()).forEach(compressionType -> { try { Path file = createDataFile(fsConfig, true, compressionType); @@ -110,7 +110,7 @@ public void readDifferentCompressionTypes(FileSystemConfig fsConfig) { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void seekFileWithoutHeader(FileSystemConfig fsConfig) throws Throwable { + public void seekFileWithoutHeader(ReaderFsTestConfig fsConfig) throws Throwable { Path file = createDataFile(fsConfig, false); Map readerConfig = getReaderConfig(); readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "false"); @@ -142,7 +142,7 @@ public void seekFileWithoutHeader(FileSystemConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void validFileEncoding(FileSystemConfig fsConfig) throws Throwable { + public void validFileEncoding(ReaderFsTestConfig fsConfig) throws Throwable { Map readerConfig = getReaderConfig();
readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); readerConfig.put(T.FILE_READER_DELIMITED_ENCODING, "Cp1252"); @@ -151,7 +151,7 @@ public void validFileEncoding(FileSystemConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void invalidFileEncoding(FileSystemConfig fsConfig) { + public void invalidFileEncoding(ReaderFsTestConfig fsConfig) { Map readerConfig = getReaderConfig(); readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); readerConfig.put(T.FILE_READER_DELIMITED_ENCODING, "invalid_charset"); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/CronPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java similarity index 53% rename from src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/CronPolicyTest.java rename to src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java index f054371..5de4e95 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/CronPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java @@ -1,37 +1,26 @@ -package com.github.mmolimar.kafka.connect.fs.policy.local; +package com.github.mmolimar.kafka.connect.fs.policy; import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import com.github.mmolimar.kafka.connect.fs.policy.CronPolicy; -import com.github.mmolimar.kafka.connect.fs.policy.Policy; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import org.apache.hadoop.fs.Path; import org.apache.kafka.common.config.ConfigException; import org.apache.kafka.connect.errors.IllegalWorkerStateException; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; import java.io.IOException; import java.time.LocalDateTime; -import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; -import java.util.UUID; import static org.junit.jupiter.api.Assertions.*; -public class CronPolicyTest extends LocalPolicyTestBase { - - @BeforeAll - public static void setUp() throws IOException { - directories = new ArrayList() {{ - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - }}; - for (Path dir : directories) { - fs.mkdirs(dir); - } +public class CronPolicyTest extends PolicyTestBase { + @Override + protected FsSourceTaskConfig buildSourceTaskConfig(List directories) { Map cfg = new HashMap() {{ String[] uris = directories.stream().map(Path::toString) .toArray(String[]::new); @@ -45,40 +34,45 @@ public static void setUp() throws IOException { put(CronPolicy.CRON_POLICY_EXPRESSION, "0/2 * * * * ?"); put(CronPolicy.CRON_POLICY_END_DATE, LocalDateTime.now().plusDays(1).toString()); }}; - taskConfig = new FsSourceTaskConfig(cfg); + return new FsSourceTaskConfig(cfg); } - @Test + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") @Override - public void execPolicyAlreadyEnded() throws IOException { - policy.execute(); - policy.interrupt(); - assertTrue(policy.hasEnded()); - assertThrows(IllegalWorkerStateException.class, () -> policy.execute()); + public void execPolicyAlreadyEnded(PolicyFsTestConfig fsConfig) throws IOException { + fsConfig.getPolicy().execute(); + fsConfig.getPolicy().interrupt(); 
+ assertTrue(fsConfig.getPolicy().hasEnded()); + assertThrows(IllegalWorkerStateException.class, () -> fsConfig.getPolicy().execute()); } - @Test - public void invalidCronExpression() { - Map originals = taskConfig.originalsStrings(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidCronExpression(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(CronPolicy.CRON_POLICY_EXPRESSION, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); } - @Test - public void invalidEndDate() { - Map originals = taskConfig.originalsStrings(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidEndDate(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(CronPolicy.CRON_POLICY_END_DATE, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); } - @Test - public void canBeInterrupted() throws Throwable { - policy = ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), taskConfig); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void canBeInterrupted(PolicyFsTestConfig fsConfig) throws Throwable { + Policy policy = ReflectionUtils.makePolicy( + (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), + fsConfig.getSourceTaskConfig()); for (int i = 0; i < 5; i++) { assertFalse(policy.hasEnded()); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java new file mode 100644 index 0000000..ddf69b7 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java @@ -0,0 +1,79 @@ +package com.github.mmolimar.kafka.connect.fs.policy; + +import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; +import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.errors.IllegalWorkerStateException; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.*; + +public class HdfsFileWatcherPolicyTest extends PolicyTestBase { + + static { + TEST_FILE_SYSTEMS = Collections.singletonList( + new HdfsFsConfig() + ); + } + + @BeforeAll + public static void initFs() throws IOException { + for (PolicyFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + fsConfig.initFs(); + } + } + + @Override + protected FsSourceTaskConfig buildSourceTaskConfig(List directories) { + Map cfg = new HashMap() {{ + String[] uris = directories.stream().map(Path::toString) + .toArray(String[]::new); + put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); + 
put(FsSourceTaskConfig.TOPIC, "topic_test"); + put(FsSourceTaskConfig.POLICY_CLASS, HdfsFileWatcherPolicy.class.getName()); + put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); + put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); + put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); + }}; + return new FsSourceTaskConfig(cfg); + } + + // This policy does not throw any exception; it just stops watching those nonexistent dirs. + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + @Override + public void invalidDirectory(PolicyFsTestConfig fsConfig) throws IOException { + for (Path dir : fsConfig.getDirectories()) { + fsConfig.getFs().delete(dir, true); + } + try { + fsConfig.getPolicy().execute(); + } finally { + for (Path dir : fsConfig.getDirectories()) { + fsConfig.getFs().mkdirs(dir); + } + } + } + + // This policy never ends, so we have to interrupt it. + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + @Override + public void execPolicyAlreadyEnded(PolicyFsTestConfig fsConfig) throws IOException { + fsConfig.getPolicy().execute(); + assertFalse(fsConfig.getPolicy().hasEnded()); + fsConfig.getPolicy().interrupt(); + assertTrue(fsConfig.getPolicy().hasEnded()); + assertThrows(IllegalWorkerStateException.class, () -> fsConfig.getPolicy().execute()); + } + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyFsTestConfig.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyFsTestConfig.java new file mode 100644 index 0000000..60382c9 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyFsTestConfig.java @@ -0,0 +1,112 @@ +package com.github.mmolimar.kafka.connect.fs.policy; + +import com.github.mmolimar.kafka.connect.fs.AbstractHdfsFsConfig; +import com.github.mmolimar.kafka.connect.fs.AbstractLocalFsConfig; +import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; +import com.github.mmolimar.kafka.connect.fs.FsTestConfig; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.UUID; + +interface PolicyFsTestConfig extends FsTestConfig { + + Policy getPolicy(); + + void setPolicy(Policy policy); + + FsSourceTaskConfig getSourceTaskConfig(); + + void setSourceTaskConfig(FsSourceTaskConfig sourceTaskConfig); + + List getDirectories(); + +} + +class LocalFsConfig extends AbstractLocalFsConfig implements PolicyFsTestConfig { + private Policy policy; + private FsSourceTaskConfig sourceTaskConfig; + private List directories; + + @Override + public void init() throws IOException { + directories = new ArrayList() {{ + add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); + add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); + }}; + for (Path dir : directories) { + getFs().mkdirs(dir); + } + } + + @Override + public Policy getPolicy() { + return policy; + } + + @Override + public void setPolicy(Policy policy) { + this.policy = policy; + } + + @Override + public FsSourceTaskConfig getSourceTaskConfig() { + return sourceTaskConfig; + } + + @Override + public void setSourceTaskConfig(FsSourceTaskConfig sourceTaskConfig) { + this.sourceTaskConfig = sourceTaskConfig; + } + + @Override + public List getDirectories() { + return directories; + } + +} + +class HdfsFsConfig extends AbstractHdfsFsConfig implements PolicyFsTestConfig { + private Policy policy; + private
FsSourceTaskConfig sourceTaskConfig; + private List directories; + + @Override + public void init() throws IOException { + directories = new ArrayList() {{ + add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); + add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); + }}; + for (Path dir : directories) { + getFs().mkdirs(dir); + } + } + + @Override + public Policy getPolicy() { + return policy; + } + + @Override + public void setPolicy(Policy policy) { + this.policy = policy; + } + + @Override + public FsSourceTaskConfig getSourceTaskConfig() { + return sourceTaskConfig; + } + + @Override + public void setSourceTaskConfig(FsSourceTaskConfig sourceTaskConfig) { + this.sourceTaskConfig = sourceTaskConfig; + } + + @Override + public List getDirectories() { + return directories; + } + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java index 4a1aa42..6aa77b1 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java @@ -9,91 +9,122 @@ import org.apache.kafka.connect.errors.IllegalWorkerStateException; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; import java.io.FileNotFoundException; import java.io.IOException; -import java.net.URI; import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.*; +import java.util.stream.Stream; import static org.junit.jupiter.api.Assertions.*; -public abstract class PolicyTestBase { +abstract class PolicyTestBase { - protected static FileSystem fs; - protected static Policy policy; - protected static List directories; - protected static FsSourceTaskConfig taskConfig; - protected static URI fsUri; + protected static List TEST_FILE_SYSTEMS = Arrays.asList( + new LocalFsConfig(), + new HdfsFsConfig() + ); + + @BeforeAll + public static void initFs() throws IOException { + for (PolicyFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + fsConfig.initFs(); + } + } @AfterAll - public static void tearDown() throws Exception { - policy.close(); - fs.close(); + public static void finishFs() throws IOException { + for (PolicyFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + fsConfig.getPolicy().close(); + fsConfig.close(); + } } @BeforeEach public void initPolicy() throws Throwable { - policy = ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), taskConfig); + for (PolicyFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + FsSourceTaskConfig sourceTaskConfig = buildSourceTaskConfig(fsConfig.getDirectories()); + Policy policy = ReflectionUtils.makePolicy( + (Class) sourceTaskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), sourceTaskConfig); + fsConfig.setSourceTaskConfig(sourceTaskConfig); + fsConfig.setPolicy(policy); + } } @AfterEach - public void cleanDirs() throws IOException { - for (Path dir : directories) { - fs.delete(dir, true); - fs.mkdirs(dir); + public void cleanDirsAndClose() throws IOException { + for (PolicyFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + for (Path dir : fsConfig.getDirectories()) { + fsConfig.getFs().delete(dir, 
true); + fsConfig.getFs().mkdirs(dir); + } + fsConfig.getPolicy().close(); } - policy.close(); } - @Test - public void invalidArgs() { - assertThrows(IllegalArgumentException.class, () -> taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS) - .getConstructor(taskConfig.getClass()).newInstance(null)); + private static Stream fileSystemConfigProvider() { + return TEST_FILE_SYSTEMS.stream().map(Arguments::of); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidArgs(PolicyFsTestConfig fsConfig) { + assertThrows(IllegalArgumentException.class, () -> fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS) + .getConstructor(fsConfig.getSourceTaskConfig().getClass()).newInstance(null)); } - @Test - public void invalidConfig() { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidConfig(PolicyFsTestConfig fsConfig) { assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), + (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), new FsSourceTaskConfig(new HashMap<>()))); } - @Test - public void interruptPolicy() throws Throwable { - policy.execute(); - policy.interrupt(); - assertTrue(policy.hasEnded()); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void interruptPolicy(PolicyFsTestConfig fsConfig) throws Throwable { + fsConfig.getPolicy().execute(); + fsConfig.getPolicy().interrupt(); + assertTrue(fsConfig.getPolicy().hasEnded()); } - @Test - public void invalidDirectory() throws IOException { - for (Path dir : directories) { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidDirectory(PolicyFsTestConfig fsConfig) throws IOException { + FileSystem fs = fsConfig.getFs(); + for (Path dir : fsConfig.getDirectories()) { fs.delete(dir, true); } try { - assertThrows(FileNotFoundException.class, () -> policy.execute()); + assertThrows(FileNotFoundException.class, () -> fsConfig.getPolicy().execute()); } finally { - for (Path dir : directories) { + for (Path dir : fsConfig.getDirectories()) { fs.mkdirs(dir); } } } - @Test - public void listEmptyDirectories() throws IOException { - Iterator it = policy.execute(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void listEmptyDirectories(PolicyFsTestConfig fsConfig) throws IOException { + Iterator it = fsConfig.getPolicy().execute(); assertFalse(it.hasNext()); assertThrows(NoSuchElementException.class, it::next); } - @Test - public void oneFilePerFs() throws IOException, InterruptedException { - for (Path dir : directories) { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void oneFilePerFs(PolicyFsTestConfig fsConfig) throws IOException, InterruptedException { + FileSystem fs = fsConfig.getFs(); + for (Path dir : fsConfig.getDirectories()) { fs.createNewFile(new Path(dir, System.nanoTime() + ".txt")); //this file does not match the regexp fs.createNewFile(new Path(dir, System.nanoTime() + ".invalid")); @@ -101,7 +132,7 @@ public void oneFilePerFs() throws IOException, InterruptedException { //we wait till FS has registered the files Thread.sleep(500); - Iterator it = policy.execute(); + Iterator it = fsConfig.getPolicy().execute(); assertTrue(it.hasNext()); it.next(); assertTrue(it.hasNext()); @@ -109,9 +140,11 @@ public void oneFilePerFs() throws IOException, InterruptedException { assertFalse(it.hasNext()); } - @Test - public void 
recursiveDirectory() throws IOException, InterruptedException { - for (Path dir : directories) { + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void recursiveDirectory(PolicyFsTestConfig fsConfig) throws IOException, InterruptedException { + FileSystem fs = fsConfig.getFs(); + for (Path dir : fsConfig.getDirectories()) { Path tmpDir = new Path(dir, String.valueOf(System.nanoTime())); fs.mkdirs(tmpDir); fs.createNewFile(new Path(tmpDir, System.nanoTime() + ".txt")); @@ -121,7 +154,7 @@ public void recursiveDirectory() throws IOException, InterruptedException { //we wait till FS has registered the files Thread.sleep(500); - Iterator it = policy.execute(); + Iterator it = fsConfig.getPolicy().execute(); assertTrue(it.hasNext()); it.next(); assertTrue(it.hasNext()); @@ -129,23 +162,26 @@ public void recursiveDirectory() throws IOException, InterruptedException { assertFalse(it.hasNext()); } - @Test - public void execPolicyAlreadyEnded() throws IOException { - policy.execute(); - assertTrue(policy.hasEnded()); - assertThrows(IllegalWorkerStateException.class, () -> policy.execute()); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void execPolicyAlreadyEnded(PolicyFsTestConfig fsConfig) throws IOException { + fsConfig.getPolicy().execute(); + assertTrue(fsConfig.getPolicy().hasEnded()); + assertThrows(IllegalWorkerStateException.class, () -> fsConfig.getPolicy().execute()); } - @Test - public void dynamicURIs() throws Throwable { - Path dynamic = new Path(fsUri.toString(), "${G}/${yyyy}/${MM}/${W}"); - fs.create(dynamic); - Map originals = taskConfig.originalsStrings(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void dynamicURIs(PolicyFsTestConfig fsConfig) throws Throwable { + Path dynamic = new Path(fsConfig.getFsUri().toString(), "${G}/${yyyy}/${MM}/${W}"); + fsConfig.getFs().create(dynamic); + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(FsSourceTaskConfig.FS_URIS, dynamic.toString()); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - policy = ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); - assertEquals(1, policy.getURIs().size()); + Policy policy = ReflectionUtils.makePolicy( + (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + fsConfig.setPolicy(policy); + assertEquals(1, fsConfig.getPolicy().getURIs().size()); LocalDateTime dateTime = LocalDateTime.now(); DateTimeFormatter formatter = DateTimeFormatter.ofPattern("G"); @@ -159,17 +195,21 @@ public void dynamicURIs() throws Throwable { uri.append("/"); formatter = DateTimeFormatter.ofPattern("W"); uri.append(dateTime.format(formatter)); - assertTrue(policy.getURIs().get(0).endsWith(uri.toString())); + assertTrue(fsConfig.getPolicy().getURIs().get(0).endsWith(uri.toString())); } - @Test - public void invalidDynamicURIs() throws Throwable { - Path dynamic = new Path(fsUri.toString(), "${yyyy}/${MM}/${mmmmmmm}"); - fs.create(dynamic); - Map originals = taskConfig.originalsStrings(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidDynamicURIs(PolicyFsTestConfig fsConfig) throws Throwable { + Path dynamic = new Path(fsConfig.getFsUri().toString(), "${yyyy}/${MM}/${mmmmmmm}"); + fsConfig.getFs().create(dynamic); + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(FsSourceTaskConfig.FS_URIS, dynamic.toString()); FsSourceTaskConfig cfg = new 
FsSourceTaskConfig(originals); assertThrows(IllegalArgumentException.class, () -> ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); } + + protected abstract FsSourceTaskConfig buildSourceTaskConfig(List directories); + } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SimplePolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java similarity index 57% rename from src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SimplePolicyTest.java rename to src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java index 2de53e6..279a775 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SimplePolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SimplePolicyTest.java @@ -1,29 +1,17 @@ -package com.github.mmolimar.kafka.connect.fs.policy.local; +package com.github.mmolimar.kafka.connect.fs.policy; import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy; import org.apache.hadoop.fs.Path; -import org.junit.jupiter.api.BeforeAll; -import java.io.IOException; -import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; -import java.util.UUID; -public class SimplePolicyTest extends LocalPolicyTestBase { - - @BeforeAll - public static void setUp() throws IOException { - directories = new ArrayList() {{ - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - }}; - for (Path dir : directories) { - fs.mkdirs(dir); - } +public class SimplePolicyTest extends PolicyTestBase { + @Override + protected FsSourceTaskConfig buildSourceTaskConfig(List directories) { Map cfg = new HashMap() {{ String[] uris = directories.stream().map(Path::toString) .toArray(String[]::new); @@ -35,6 +23,7 @@ public static void setUp() throws IOException { put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test/"); }}; - taskConfig = new FsSourceTaskConfig(cfg); + return new FsSourceTaskConfig(cfg); } + } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SleepyPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java similarity index 54% rename from src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SleepyPolicyTest.java rename to src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java index 93c9f09..9748d15 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/SleepyPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java @@ -1,35 +1,23 @@ -package com.github.mmolimar.kafka.connect.fs.policy.local; +package com.github.mmolimar.kafka.connect.fs.policy; import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import com.github.mmolimar.kafka.connect.fs.policy.Policy; -import com.github.mmolimar.kafka.connect.fs.policy.SleepyPolicy; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import org.apache.hadoop.fs.Path; import 
org.apache.kafka.common.config.ConfigException; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; -import java.io.IOException; -import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; -import java.util.UUID; import static org.junit.jupiter.api.Assertions.*; -public class SleepyPolicyTest extends LocalPolicyTestBase { - - @BeforeAll - public static void setUp() throws IOException { - directories = new ArrayList() {{ - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - }}; - for (Path dir : directories) { - fs.mkdirs(dir); - } +public class SleepyPolicyTest extends PolicyTestBase { + @Override + protected FsSourceTaskConfig buildSourceTaskConfig(List directories) { Map cfg = new HashMap() {{ String[] uris = directories.stream().map(Path::toString) .toArray(String[]::new); @@ -43,45 +31,49 @@ public static void setUp() throws IOException { put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "100"); put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "1"); }}; - taskConfig = new FsSourceTaskConfig(cfg); + return new FsSourceTaskConfig(cfg); } - @Test - public void invalidSleepTime() { - Map originals = taskConfig.originalsStrings(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidSleepTime(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); } - @Test - public void invalidMaxExecs() { - Map originals = taskConfig.originalsStrings(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidMaxExecs(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); } - @Test - public void invalidSleepFraction() { - Map originals = taskConfig.originalsStrings(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidSleepFraction(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_FRACTION, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); } - @Test - public void sleepExecution() throws Throwable { - Map tConfig = taskConfig.originalsStrings(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void sleepExecution(PolicyFsTestConfig fsConfig) throws Throwable { + Map tConfig = fsConfig.getSourceTaskConfig().originalsStrings(); 
tConfig.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "1000"); tConfig.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "2"); FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - policy = ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); + Policy policy = ReflectionUtils.makePolicy( + (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); assertFalse(policy.hasEnded()); policy.execute(); assertFalse(policy.hasEnded()); @@ -89,15 +81,16 @@ public void sleepExecution() throws Throwable { assertTrue(policy.hasEnded()); } - @Test - public void defaultExecutions() throws Throwable { - Map tConfig = taskConfig.originalsStrings(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void defaultExecutions(PolicyFsTestConfig fsConfig) throws Throwable { + Map tConfig = fsConfig.getSourceTaskConfig().originalsStrings(); tConfig.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "1"); tConfig.remove(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS); FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - policy = ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); + Policy policy = ReflectionUtils.makePolicy( + (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); //it never ends for (int i = 0; i < 100; i++) { @@ -107,4 +100,5 @@ public void defaultExecutions() throws Throwable { policy.interrupt(); assertTrue(policy.hasEnded()); } + } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/CronPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/CronPolicyTest.java deleted file mode 100644 index 26d20d1..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/CronPolicyTest.java +++ /dev/null @@ -1,90 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.policy.hdfs; - -import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import com.github.mmolimar.kafka.connect.fs.policy.CronPolicy; -import com.github.mmolimar.kafka.connect.fs.policy.Policy; -import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.common.config.ConfigException; -import org.apache.kafka.connect.errors.IllegalWorkerStateException; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.time.LocalDateTime; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; - -import static org.junit.jupiter.api.Assertions.*; - -public class CronPolicyTest extends HdfsPolicyTestBase { - - @BeforeAll - public static void setUp() throws IOException { - directories = new ArrayList() {{ - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - }}; - for (Path dir : directories) { - fs.mkdirs(dir); - } - - Map cfg = new HashMap() {{ - String[] uris = directories.stream().map(Path::toString) - .toArray(String[]::new); - put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); - put(FsSourceTaskConfig.TOPIC, "topic_test"); - put(FsSourceTaskConfig.POLICY_CLASS, CronPolicy.class.getName()); - put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); - put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); - 
put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); - put(CronPolicy.CRON_POLICY_EXPRESSION, "0/2 * * * * ?"); - put(CronPolicy.CRON_POLICY_END_DATE, LocalDateTime.now().plusDays(1).toString()); - }}; - taskConfig = new FsSourceTaskConfig(cfg); - } - - @Test - @Override - public void execPolicyAlreadyEnded() throws IOException { - policy.execute(); - policy.interrupt(); - assertTrue(policy.hasEnded()); - assertThrows(IllegalWorkerStateException.class, () -> policy.execute()); - } - - @Test - public void invalidCronExpression() { - Map originals = taskConfig.originalsStrings(); - originals.put(CronPolicy.CRON_POLICY_EXPRESSION, "invalid"); - FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); - } - - @Test - public void invalidEndDate() { - Map originals = taskConfig.originalsStrings(); - originals.put(CronPolicy.CRON_POLICY_END_DATE, "invalid"); - FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); - } - - @Test - public void canBeInterrupted() throws Throwable { - policy = ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), taskConfig); - - for (int i = 0; i < 5; i++) { - assertFalse(policy.hasEnded()); - policy.execute(); - } - policy.interrupt(); - assertTrue(policy.hasEnded()); - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java deleted file mode 100644 index ec68d68..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/HdfsFileWatcherPolicyTest.java +++ /dev/null @@ -1,71 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.policy.hdfs; - -import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import com.github.mmolimar.kafka.connect.fs.policy.HdfsFileWatcherPolicy; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.errors.IllegalWorkerStateException; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; - -import static org.junit.jupiter.api.Assertions.*; - -public class HdfsFileWatcherPolicyTest extends HdfsPolicyTestBase { - - @BeforeAll - public static void setUp() throws IOException { - directories = new ArrayList() {{ - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - }}; - for (Path dir : directories) { - fs.mkdirs(dir); - } - - Map cfg = new HashMap() {{ - String[] uris = directories.stream().map(Path::toString) - .toArray(String[]::new); - put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); - put(FsSourceTaskConfig.TOPIC, "topic_test"); - put(FsSourceTaskConfig.POLICY_CLASS, HdfsFileWatcherPolicy.class.getName()); - put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); - put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); - 
put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); - }}; - taskConfig = new FsSourceTaskConfig(cfg); - } - - //This policy does not throw any exception. Just stop watching those nonexistent dirs - @Test - @Override - public void invalidDirectory() throws IOException { - for (Path dir : directories) { - fs.delete(dir, true); - } - try { - policy.execute(); - } finally { - for (Path dir : directories) { - fs.mkdirs(dir); - } - } - } - - //This policy never ends. We have to interrupt it - @Test - @Override - public void execPolicyAlreadyEnded() throws IOException { - policy.execute(); - assertFalse(policy.hasEnded()); - policy.interrupt(); - assertTrue(policy.hasEnded()); - assertThrows(IllegalWorkerStateException.class, () -> policy.execute()); - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java deleted file mode 100644 index 5e0eb7f..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SimplePolicyTest.java +++ /dev/null @@ -1,40 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.policy.hdfs; - -import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy; -import org.apache.hadoop.fs.Path; -import org.junit.jupiter.api.BeforeAll; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; - -public class SimplePolicyTest extends HdfsPolicyTestBase { - - @BeforeAll - public static void setUp() throws IOException { - directories = new ArrayList() {{ - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - }}; - for (Path dir : directories) { - fs.mkdirs(dir); - } - - Map cfg = new HashMap() {{ - String[] uris = directories.stream().map(Path::toString) - .toArray(String[]::new); - put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); - put(FsSourceTaskConfig.TOPIC, "topic_test"); - put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName()); - put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); - put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); - }}; - taskConfig = new FsSourceTaskConfig(cfg); - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java deleted file mode 100644 index d47faae..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/hdfs/SleepyPolicyTest.java +++ /dev/null @@ -1,110 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.policy.hdfs; - -import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import com.github.mmolimar.kafka.connect.fs.policy.Policy; -import com.github.mmolimar.kafka.connect.fs.policy.SleepyPolicy; -import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.common.config.ConfigException; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.IOException; 
-import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.UUID; - -import static org.junit.jupiter.api.Assertions.*; - -public class SleepyPolicyTest extends HdfsPolicyTestBase { - - @BeforeAll - public static void setUp() throws IOException { - directories = new ArrayList() {{ - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - }}; - for (Path dir : directories) { - fs.mkdirs(dir); - } - - Map cfg = new HashMap() {{ - String[] uris = directories.stream().map(Path::toString) - .toArray(String[]::new); - put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); - put(FsSourceTaskConfig.TOPIC, "topic_test"); - put(FsSourceTaskConfig.POLICY_CLASS, SleepyPolicy.class.getName()); - put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); - put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "dfs.data.dir", "test"); - put(FsSourceTaskConfig.POLICY_PREFIX_FS + "fs.default.name", "hdfs://test"); - put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "100"); - put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "1"); - }}; - taskConfig = new FsSourceTaskConfig(cfg); - } - - @Test - public void invalidSleepTime() { - Map originals = taskConfig.originalsStrings(); - originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "invalid"); - FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); - } - - @Test - public void invalidMaxExecs() { - Map originals = taskConfig.originalsStrings(); - originals.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "invalid"); - FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); - } - - @Test - public void invalidSleepFraction() { - Map originals = taskConfig.originalsStrings(); - originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_FRACTION, "invalid"); - FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); - } - - @Test - public void sleepExecution() throws Throwable { - Map tConfig = taskConfig.originalsStrings(); - tConfig.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "1000"); - tConfig.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "2"); - FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - - policy = ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); - assertFalse(policy.hasEnded()); - policy.execute(); - assertFalse(policy.hasEnded()); - policy.execute(); - assertTrue(policy.hasEnded()); - } - - @Test - public void defaultExecutions() throws Throwable { - Map tConfig = taskConfig.originalsStrings(); - tConfig.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "1"); - tConfig.remove(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS); - FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - - policy = ReflectionUtils.makePolicy( - (Class) taskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); - - //it never ends - for (int i = 0; i < 100; i++) { - assertFalse(policy.hasEnded()); - policy.execute(); - } - policy.interrupt(); - assertTrue(policy.hasEnded()); - } -} diff --git 
a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/LocalPolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/LocalPolicyTestBase.java deleted file mode 100644 index 8c12b3a..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/local/LocalPolicyTestBase.java +++ /dev/null @@ -1,29 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.policy.local; - -import com.github.mmolimar.kafka.connect.fs.policy.PolicyTestBase; -import org.apache.commons.io.FileUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -public abstract class LocalPolicyTestBase extends PolicyTestBase { - - private static Path localDir; - - @BeforeAll - public static void initFs() throws IOException { - localDir = Files.createTempDirectory("test-"); - fsUri = localDir.toUri(); - fs = FileSystem.newInstance(fsUri, new Configuration()); - } - - @AfterAll - public static void finishFs() throws IOException { - FileUtils.deleteDirectory(localDir.toFile()); - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java index 6d4a823..8dd610a 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java @@ -2,96 +2,304 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceTask; import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; +import com.github.mmolimar.kafka.connect.fs.file.reader.AvroFileReader; import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; +import com.github.mmolimar.kafka.connect.fs.policy.Policy; import com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.errors.ConnectException; +import org.apache.kafka.connect.source.SourceRecord; +import org.apache.kafka.connect.source.SourceTaskContext; +import org.apache.kafka.connect.storage.OffsetStorageReader; +import org.easymock.EasyMock; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; +import org.powermock.api.easymock.PowerMock; +import org.powermock.api.support.membermodification.MemberModifier; import java.io.File; -import java.util.HashMap; -import java.util.Map; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.*; +import java.util.stream.IntStream; +import java.util.stream.Stream; import static org.junit.jupiter.api.Assertions.*; public class FsSourceTaskTest { - @TempDir - public static File temporaryFolder; - private FsSourceTask task; - private Map taskConfig; + private static final List TEST_FILE_SYSTEMS = Arrays.asList( + new LocalFsConfig(), + new HdfsFsConfig() + ); + private static final int NUM_RECORDS = 10; + + @BeforeAll + public static void initFs() throws IOException { + 
for (TaskFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + fsConfig.initFs(); + } + } + + @AfterAll + public static void finishFs() throws IOException { + for (TaskFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + fsConfig.close(); + } + } @BeforeEach - public void setup() { - task = new FsSourceTask(); + public void initTask() { + for (TaskFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + Map taskConfig = new HashMap() {{ + String[] uris = fsConfig.getDirectories().stream().map(Path::toString) + .toArray(String[]::new); + put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); + put(FsSourceTaskConfig.TOPIC, "topic_test"); + put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName()); + put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); + put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); + }}; + + //Mock initialization + SourceTaskContext taskContext = PowerMock.createMock(SourceTaskContext.class); + OffsetStorageReader offsetStorageReader = PowerMock.createMock(OffsetStorageReader.class); + + EasyMock.expect(taskContext.offsetStorageReader()) + .andReturn(offsetStorageReader); + + EasyMock.expect(taskContext.offsetStorageReader()) + .andReturn(offsetStorageReader); + + EasyMock.expect(offsetStorageReader.offset(EasyMock.anyObject())) + .andReturn(new HashMap() {{ + put("offset", 5L); + }}); + EasyMock.expect(offsetStorageReader.offset(EasyMock.anyObject())) + .andReturn(new HashMap() {{ + put("offset", 5L); + }}); + + EasyMock.checkOrder(taskContext, false); + EasyMock.replay(taskContext); + + EasyMock.checkOrder(offsetStorageReader, false); + EasyMock.replay(offsetStorageReader); + + FsSourceTask task = new FsSourceTask(); + task.initialize(taskContext); + + fsConfig.setTaskConfig(taskConfig); + fsConfig.setTask(task); + } + } + + @AfterEach + public void cleanDirsAndStop() throws IOException { + for (TaskFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { + for (Path dir : fsConfig.getDirectories()) { + fsConfig.getFs().delete(dir, true); + fsConfig.getFs().mkdirs(dir); + } + fsConfig.getTask().stop(); + } + } + + private static Stream fileSystemConfigProvider() { + return TEST_FILE_SYSTEMS.stream().map(Arguments::of); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void pollNoData(TaskFsTestConfig fsConfig) { + fsConfig.getTask().start(fsConfig.getTaskConfig()); + assertEquals(0, fsConfig.getTask().poll().size()); + //policy has ended + assertNull(fsConfig.getTask().poll()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void emptyFilesToProcess(TaskFsTestConfig fsConfig) throws IOException { + for (Path dir : fsConfig.getDirectories()) { + fsConfig.getFs().createNewFile(new Path(dir, System.nanoTime() + ".txt")); + //this file does not match the regexp + fsConfig.getFs().createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); + } + fsConfig.getTask().start(fsConfig.getTaskConfig()); + assertEquals(0, fsConfig.getTask().poll().size()); + //policy has ended + assertNull(fsConfig.getTask().poll()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void oneFilePerFs(TaskFsTestConfig fsConfig) throws IOException { + for (Path dir : fsConfig.getDirectories()) { + Path dataFile = new Path(dir, System.nanoTime() + ".txt"); + createDataFile(fsConfig.getFs(), dataFile); + //this file does not match the regexp + fsConfig.getFs().createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); + } + + fsConfig.getTask().start(fsConfig.getTaskConfig()); + List records = 
fsConfig.getTask().poll(); + assertEquals(10, records.size()); + checkRecords(records); + //policy has ended + assertNull(fsConfig.getTask().poll()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void nonExistentUri(TaskFsTestConfig fsConfig) { + Map props = new HashMap<>(fsConfig.getTaskConfig()); + props.put(FsSourceTaskConfig.FS_URIS, + new Path(fsConfig.getFs().getWorkingDirectory(), UUID.randomUUID().toString()).toString()); + fsConfig.getTask().start(props); + fsConfig.getTask().poll(); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void exceptionExecutingPolicy(TaskFsTestConfig fsConfig) throws IOException, IllegalAccessException { + Map props = new HashMap<>(fsConfig.getTaskConfig()); + fsConfig.getTask().start(props); + + Policy policy = EasyMock.createNiceMock(Policy.class); + EasyMock.expect(policy.hasEnded()).andReturn(Boolean.FALSE); + EasyMock.expect(policy.execute()).andThrow(new ConnectException("Exception from mock")); + EasyMock.expect(policy.getURIs()).andReturn(null); + EasyMock.checkOrder(policy, false); + EasyMock.replay(policy); + MemberModifier.field(FsSourceTask.class, "policy").set(fsConfig.getTask(), policy); + + assertEquals(0, fsConfig.getTask().poll().size()); + } - taskConfig = new HashMap() {{ - put(FsSourceTaskConfig.FS_URIS, String.join(",", - temporaryFolder.toURI() + File.separator + "dir1", - temporaryFolder.toURI() + File.separator + "dir2", - temporaryFolder.toURI() + File.separator + "dir3")); - put(FsSourceTaskConfig.TOPIC, "topic_test"); - put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName()); - put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); - }}; + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void exceptionReadingFile(TaskFsTestConfig fsConfig) throws IOException { + Map props = new HashMap<>(fsConfig.getTaskConfig()); + File tmp = File.createTempFile("test-", ".txt"); + try (PrintWriter writer = new PrintWriter(tmp)) { + writer.append("txt"); + } + Path dest = new Path(fsConfig.getDirectories().get(0).toString(), System.nanoTime() + ".txt"); + fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), dest); + props.put(FsSourceTaskConfig.FILE_READER_CLASS, AvroFileReader.class.getName()); + fsConfig.getTask().start(props); + assertEquals(0, fsConfig.getTask().poll().size()); + fsConfig.getTask().stop(); + + fsConfig.getFs().delete(dest, false); } - @Test - public void nullProperties() { - assertThrows(ConnectException.class, () -> task.start(null)); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void nullProperties(TaskFsTestConfig fsConfig) { + assertThrows(ConnectException.class, () -> fsConfig.getTask().start(null)); } - @Test - public void expectedFsUris() { - Map testProps = new HashMap<>(taskConfig); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void expectedFsUris(TaskFsTestConfig fsConfig) { + Map testProps = new HashMap<>(fsConfig.getTaskConfig()); testProps.remove(FsSourceTaskConfig.FS_URIS); - assertThrows(ConnectException.class, () -> task.start(testProps)); + assertThrows(ConnectException.class, () -> fsConfig.getTask().start(testProps)); } - @Test - public void expectedPolicyClass() { - Map testProps = new HashMap<>(taskConfig); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void expectedPolicyClass(TaskFsTestConfig fsConfig) { + Map testProps = new HashMap<>(fsConfig.getTaskConfig()); 
testProps.remove(FsSourceTaskConfig.POLICY_CLASS); - assertThrows(ConnectException.class, () -> task.start(testProps)); + assertThrows(ConnectException.class, () -> fsConfig.getTask().start(testProps)); } - @Test - public void invalidPolicyClass() { - Map testProps = new HashMap<>(taskConfig); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidPolicyClass(TaskFsTestConfig fsConfig) { + Map testProps = new HashMap<>(fsConfig.getTaskConfig()); testProps.put(FsSourceTaskConfig.POLICY_CLASS, Object.class.getName()); - assertThrows(ConnectException.class, () -> task.start(testProps)); + assertThrows(ConnectException.class, () -> fsConfig.getTask().start(testProps)); } - @Test - public void expectedReaderClass() { - Map testProps = new HashMap<>(taskConfig); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void expectedReaderClass(TaskFsTestConfig fsConfig) { + Map testProps = new HashMap<>(fsConfig.getTaskConfig()); testProps.remove(FsSourceTaskConfig.FILE_READER_CLASS); - assertThrows(ConnectException.class, () -> task.start(testProps)); + assertThrows(ConnectException.class, () -> fsConfig.getTask().start(testProps)); } - @Test - public void invalidReaderClass() { - Map testProps = new HashMap<>(taskConfig); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidReaderClass(TaskFsTestConfig fsConfig) { + Map testProps = new HashMap<>(fsConfig.getTaskConfig()); testProps.put(FsSourceTaskConfig.FILE_READER_CLASS, Object.class.getName()); - assertThrows(ConnectException.class, () -> task.start(testProps)); + assertThrows(ConnectException.class, () -> fsConfig.getTask().start(testProps)); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void minimumConfig(TaskFsTestConfig fsConfig) { + fsConfig.getTask().start(fsConfig.getTaskConfig()); + fsConfig.getTask().stop(); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void pollWithoutStart(TaskFsTestConfig fsConfig) { + assertNull(fsConfig.getTask().poll()); + fsConfig.getTask().stop(); } - @Test - public void minimumConfig() { - task.start(taskConfig); - task.stop(); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void checkVersion(TaskFsTestConfig fsConfig) { + assertNotNull(fsConfig.getTask().version()); + assertFalse("unknown".equalsIgnoreCase(fsConfig.getTask().version())); } - @Test - public void pollWithoutStart() { - assertNull(task.poll()); - task.stop(); + protected void checkRecords(List records) { + records.forEach(record -> { + assertEquals("topic_test", record.topic()); + assertNotNull(record.sourcePartition()); + assertNotNull(record.sourceOffset()); + assertNotNull(record.value()); + + assertNotNull(((Struct) record.value()).get(TextFileReader.FIELD_NAME_VALUE_DEFAULT)); + }); } - @Test - public void checkVersion() { - assertNotNull(task.version()); - assertFalse("unknown".equalsIgnoreCase(task.version())); + protected void createDataFile(FileSystem fs, Path path) throws IOException { + File file = fillDataFile(); + fs.moveFromLocalFile(new Path(file.getAbsolutePath()), path); } + + private File fillDataFile() throws IOException { + File txtFile = File.createTempFile("test-", ".txt"); + try (FileWriter writer = new FileWriter(txtFile)) { + + IntStream.range(0, NUM_RECORDS).forEach(index -> { + String value = String.format("%d_%s", index, UUID.randomUUID()); + try { + writer.append(value + "\n"); + } catch (IOException ioe) { + throw new RuntimeException(ioe); 
+ } + }); + } + return txtFile; + } + } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTestBase.java deleted file mode 100644 index 22d388c..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTestBase.java +++ /dev/null @@ -1,187 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.task; - -import com.github.mmolimar.kafka.connect.fs.FsSourceTask; -import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; -import com.github.mmolimar.kafka.connect.fs.file.reader.AvroFileReader; -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import com.github.mmolimar.kafka.connect.fs.policy.Policy; -import com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.errors.ConnectException; -import org.apache.kafka.connect.source.SourceRecord; -import org.apache.kafka.connect.source.SourceTaskContext; -import org.apache.kafka.connect.storage.OffsetStorageReader; -import org.easymock.EasyMock; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.powermock.api.easymock.PowerMock; -import org.powermock.api.support.membermodification.MemberModifier; - -import java.io.File; -import java.io.IOException; -import java.io.PrintWriter; -import java.net.URI; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.UUID; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNull; - -public abstract class FsSourceTaskTestBase { - - protected static final int NUM_RECORDS = 10; - - protected static FileSystem fs; - protected static List directories; - protected static URI fsUri; - - protected FsSourceTask task; - protected Map taskConfig; - protected SourceTaskContext taskContext; - protected OffsetStorageReader offsetStorageReader; - - @AfterAll - public static void tearDown() throws Exception { - fs.close(); - } - - @BeforeEach - public void initTask() { - task = new FsSourceTask(); - taskConfig = new HashMap() {{ - String[] uris = directories.stream().map(Path::toString) - .toArray(String[]::new); - put(FsSourceTaskConfig.FS_URIS, String.join(",", uris)); - put(FsSourceTaskConfig.TOPIC, "topic_test"); - put(FsSourceTaskConfig.POLICY_CLASS, SimplePolicy.class.getName()); - put(FsSourceTaskConfig.FILE_READER_CLASS, TextFileReader.class.getName()); - put(FsSourceTaskConfig.POLICY_REGEXP, "^[0-9]*\\.txt$"); - }}; - - //Mock initialization - taskContext = PowerMock.createMock(SourceTaskContext.class); - offsetStorageReader = PowerMock.createMock(OffsetStorageReader.class); - - EasyMock.expect(taskContext.offsetStorageReader()) - .andReturn(offsetStorageReader); - - EasyMock.expect(taskContext.offsetStorageReader()) - .andReturn(offsetStorageReader); - - EasyMock.expect(offsetStorageReader.offset(EasyMock.anyObject())) - .andReturn(new HashMap() {{ - put("offset", 5L); - }}); - EasyMock.expect(offsetStorageReader.offset(EasyMock.anyObject())) - .andReturn(new HashMap() {{ - put("offset", 5L); - }}); - - EasyMock.checkOrder(taskContext, false); - EasyMock.replay(taskContext); - - EasyMock.checkOrder(offsetStorageReader, false); - EasyMock.replay(offsetStorageReader); - - task.initialize(taskContext); - - } - - @AfterEach - public 
void cleanDirsAndStop() throws IOException { - for (Path dir : directories) { - fs.delete(dir, true); - fs.mkdirs(dir); - } - task.stop(); - } - - @Test - public void pollNoData() { - task.start(taskConfig); - assertEquals(0, task.poll().size()); - //policy has ended - assertNull(task.poll()); - } - - @Test - public void emptyFilesToProcess() throws IOException { - for (Path dir : directories) { - fs.createNewFile(new Path(dir, System.nanoTime() + ".txt")); - //this file does not match the regexp - fs.createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); - } - task.start(taskConfig); - assertEquals(0, task.poll().size()); - //policy has ended - assertNull(task.poll()); - } - - @Test - public void oneFilePerFs() throws IOException { - for (Path dir : directories) { - Path dataFile = new Path(dir, System.nanoTime() + ".txt"); - createDataFile(dataFile); - //this file does not match the regexp - fs.createNewFile(new Path(dir, String.valueOf(System.nanoTime()))); - } - - task.start(taskConfig); - List records = task.poll(); - assertEquals(10, records.size()); - checkRecords(records); - //policy has ended - assertNull(task.poll()); - } - - @Test - public void nonExistentUri() { - Map props = new HashMap<>(taskConfig); - props.put(FsSourceTaskConfig.FS_URIS, new Path(fs.getWorkingDirectory(), UUID.randomUUID().toString()).toString()); - task.start(props); - task.poll(); - } - - @Test - public void exceptionExecutingPolicy() throws IOException, IllegalAccessException { - Map props = new HashMap<>(taskConfig); - task.start(props); - - Policy policy = EasyMock.createNiceMock(Policy.class); - EasyMock.expect(policy.hasEnded()).andReturn(Boolean.FALSE); - EasyMock.expect(policy.execute()).andThrow(new ConnectException("Exception from mock")); - EasyMock.expect(policy.getURIs()).andReturn(null); - EasyMock.checkOrder(policy, false); - EasyMock.replay(policy); - MemberModifier.field(FsSourceTask.class, "policy").set(task, policy); - - assertEquals(0, task.poll().size()); - } - - @Test - public void exceptionReadingFile() throws IOException { - Map props = new HashMap<>(taskConfig); - File tmp = File.createTempFile("test-", ".txt"); - try (PrintWriter writer = new PrintWriter(tmp)) { - writer.append("txt"); - } - Path dest = new Path(directories.get(0).toString(), System.nanoTime() + ".txt"); - fs.moveFromLocalFile(new Path(tmp.getAbsolutePath()), dest); - props.put(FsSourceTaskConfig.FILE_READER_CLASS, AvroFileReader.class.getName()); - task.start(props); - assertEquals(0, task.poll().size()); - task.stop(); - - fs.delete(dest, false); - } - - protected abstract void checkRecords(List records); - - protected abstract void createDataFile(Path path) throws IOException; - -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/TaskFsTestConfig.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/TaskFsTestConfig.java new file mode 100644 index 0000000..1efe3b4 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/TaskFsTestConfig.java @@ -0,0 +1,113 @@ +package com.github.mmolimar.kafka.connect.fs.task; + +import com.github.mmolimar.kafka.connect.fs.AbstractHdfsFsConfig; +import com.github.mmolimar.kafka.connect.fs.AbstractLocalFsConfig; +import com.github.mmolimar.kafka.connect.fs.FsSourceTask; +import com.github.mmolimar.kafka.connect.fs.FsTestConfig; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.UUID; + +interface TaskFsTestConfig 
extends FsTestConfig { + + FsSourceTask getTask(); + + void setTask(FsSourceTask task); + + Map getTaskConfig(); + + void setTaskConfig(Map taskConfig); + + List getDirectories(); + +} + +class LocalFsConfig extends AbstractLocalFsConfig implements TaskFsTestConfig { + private FsSourceTask task; + private Map taskConfig; + private List directories; + + @Override + public void init() throws IOException { + directories = new ArrayList() {{ + add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); + add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); + }}; + for (Path dir : directories) { + getFs().mkdirs(dir); + } + } + + @Override + public FsSourceTask getTask() { + return task; + } + + @Override + public void setTask(FsSourceTask task) { + this.task = task; + } + + @Override + public Map getTaskConfig() { + return taskConfig; + } + + @Override + public void setTaskConfig(Map taskConfig) { + this.taskConfig = taskConfig; + } + + @Override + public List getDirectories() { + return directories; + } + +} + +class HdfsFsConfig extends AbstractHdfsFsConfig implements TaskFsTestConfig { + private FsSourceTask task; + private Map taskConfig; + private List directories; + + @Override + public void init() throws IOException { + directories = new ArrayList() {{ + add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); + add(new Path(getFsUri().toString(), UUID.randomUUID().toString())); + }}; + for (Path dir : directories) { + getFs().mkdirs(dir); + } + } + + @Override + public FsSourceTask getTask() { + return task; + } + + @Override + public void setTask(FsSourceTask task) { + this.task = task; + } + + @Override + public Map getTaskConfig() { + return taskConfig; + } + + @Override + public void setTaskConfig(Map taskConfig) { + this.taskConfig = taskConfig; + } + + @Override + public List getDirectories() { + return directories; + } + +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTest.java deleted file mode 100644 index 1e8b303..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTest.java +++ /dev/null @@ -1,66 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.task.hdfs; - -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.source.SourceRecord; -import org.junit.jupiter.api.BeforeAll; - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; - -public class HdfsFsSourceTaskTest extends HdfsFsSourceTaskTestBase { - - @BeforeAll - public static void setUp() throws IOException { - directories = new ArrayList() {{ - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - }}; - for (Path dir : directories) { - fs.mkdirs(dir); - } - } - - @Override - protected void checkRecords(List records) { - records.forEach(record -> { - assertEquals("topic_test", record.topic()); - assertNotNull(record.sourcePartition()); - assertNotNull(record.sourceOffset()); - assertNotNull(record.value()); - - assertNotNull(((Struct) 
record.value()).get(TextFileReader.FIELD_NAME_VALUE_DEFAULT)); - }); - } - - @Override - protected void createDataFile(Path path) throws IOException { - File file = fillDataFile(); - fs.moveFromLocalFile(new Path(file.getAbsolutePath()), path); - } - - private File fillDataFile() throws IOException { - File txtFile = File.createTempFile("test-", ".txt"); - try (FileWriter writer = new FileWriter(txtFile)) { - - IntStream.range(0, NUM_RECORDS).forEach(index -> { - String value = String.format("%d_%s", index, UUID.randomUUID()); - try { - writer.append(value + "\n"); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - return txtFile; - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTestBase.java deleted file mode 100644 index 1132bc6..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/hdfs/HdfsFsSourceTaskTestBase.java +++ /dev/null @@ -1,33 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.task.hdfs; - -import com.github.mmolimar.kafka.connect.fs.task.FsSourceTaskTestBase; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; - -import java.io.IOException; -import java.net.URI; -import java.nio.file.Files; -import java.nio.file.Path; - -public abstract class HdfsFsSourceTaskTestBase extends FsSourceTaskTestBase { - - private static MiniDFSCluster cluster; - - @BeforeAll - public static void initFs() throws IOException { - Configuration clusterConfig = new Configuration(); - Path hdfsDir = Files.createTempDirectory("test-"); - clusterConfig.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsDir.toAbsolutePath().toString()); - cluster = new MiniDFSCluster.Builder(clusterConfig).build(); - fsUri = URI.create("hdfs://localhost:" + cluster.getNameNodePort() + "/"); - fs = FileSystem.newInstance(fsUri, clusterConfig); - } - - @AfterAll - public static void finishFs() { - cluster.shutdown(true); - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTest.java deleted file mode 100644 index 8623e05..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTest.java +++ /dev/null @@ -1,65 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.task.local; - -import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; -import org.apache.hadoop.fs.Path; -import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.source.SourceRecord; -import org.junit.jupiter.api.BeforeAll; - -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.UUID; -import java.util.stream.IntStream; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; - -public class LocalFsSourceTaskTest extends LocalFsSourceTaskTestBase { - - @BeforeAll - public static void setUp() throws IOException { - directories = new ArrayList() {{ - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - add(new Path(fsUri.toString(), UUID.randomUUID().toString())); - }}; - for (Path dir : directories) { - fs.mkdirs(dir); - } - } - - @Override 
- protected void checkRecords(List records) { - records.forEach(record -> { - assertEquals("topic_test", record.topic()); - assertNotNull(record.sourcePartition()); - assertNotNull(record.sourceOffset()); - assertNotNull(record.value()); - assertNotNull(((Struct) record.value()).get(TextFileReader.FIELD_NAME_VALUE_DEFAULT)); - }); - } - - @Override - protected void createDataFile(Path path) throws IOException { - File file = fillDataFile(); - fs.moveFromLocalFile(new Path(file.getAbsolutePath()), path); - } - - private File fillDataFile() throws IOException { - File txtFile = File.createTempFile("test-", ".txt"); - try (FileWriter writer = new FileWriter(txtFile)) { - - IntStream.range(0, NUM_RECORDS).forEach(index -> { - String value = String.format("%d_%s", index, UUID.randomUUID()); - try { - writer.append(value + "\n"); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - }); - } - return txtFile; - } -} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTestBase.java deleted file mode 100644 index 4cf1074..0000000 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/local/LocalFsSourceTaskTestBase.java +++ /dev/null @@ -1,29 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.task.local; - -import com.github.mmolimar.kafka.connect.fs.task.FsSourceTaskTestBase; -import org.apache.commons.io.FileUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -public abstract class LocalFsSourceTaskTestBase extends FsSourceTaskTestBase { - - private static Path localDir; - - @BeforeAll - public static void initFs() throws IOException { - localDir = Files.createTempDirectory("test-"); - fsUri = localDir.toUri(); - fs = FileSystem.newInstance(fsUri, new Configuration()); - } - - @AfterAll - public static void finishFs() throws IOException { - FileUtils.deleteDirectory(localDir.toFile()); - } -} From 3655b714dcd9055db3ff488f10d6f0e2883149a3 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 21 Mar 2020 19:17:54 -0600 Subject: [PATCH 28/51] Change to Open JDK --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d97b432..2d90a0c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,6 @@ language: java jdk: - - oraclejdk8 + - openjdk8 sudo: false install: - mvn test-compile -DskipTests=true -Dmaven.javadoc.skip=true -B -V From e2429c54c6aac79bd0cdb35c04e590fdaa550306 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 21 Mar 2020 19:33:58 -0600 Subject: [PATCH 29/51] Fix in doc --- docs/source/config_options.rst | 4 ++++ docs/source/filereaders.rst | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/source/config_options.rst b/docs/source/config_options.rst index 5b25ed1..df4a946 100644 --- a/docs/source/config_options.rst +++ b/docs/source/config_options.rst @@ -276,6 +276,7 @@ To configure custom properties for this reader, the name you must use is ``json` Encoding to use for reading a file. If not specified, the reader will use the default encoding. * Type: string + * Default: based on the locale and charset of the underlying operating system. 
* Importance: medium ``file_reader.json.compression.type`` @@ -416,6 +417,7 @@ To configure custom properties for this reader, the name you must use is ``delim Encoding to use for reading a file. If not specified, the reader will use the default encoding. * Type: string + * Default: based on the locale and charset of the underlying operating system. * Importance: medium ``file_reader.delimited.compression.type`` @@ -536,6 +538,7 @@ To configure custom properties for this reader, the name you must use is ``delim Encoding to use for reading a file. If not specified, the reader will use the default encoding. * Type: string + * Default: based on the locale and charset of the underlying operating system. * Importance: medium ``file_reader.delimited.compression.type`` @@ -579,6 +582,7 @@ To configure custom properties for this reader, the name you must use is ``text` Encoding to use for reading a file. If not specified, the reader will use the default encoding. * Type: string + * Default: based on the locale and charset of the underlying operating system. * Importance: medium ``file_reader.json.compression.type`` diff --git a/docs/source/filereaders.rst b/docs/source/filereaders.rst index 8e52634..2b029c5 100644 --- a/docs/source/filereaders.rst +++ b/docs/source/filereaders.rst @@ -85,10 +85,11 @@ Agnostic Actually, this reader is a wrapper of the readers listing above. It tries to read any kind of file format using an internal reader based on the file extension, -applying the proper one (Parquet, Avro, SecuenceFile, CSV, TSV or Text). In case of no +applying the proper one (Parquet, Avro, SequenceFile, CSV, TSV or Text). In case of no extension has been matched, the Text file reader will be applied. Default extensions for each format (configurable): + * Parquet: ``.parquet`` * Avro: ``.avro`` * SequenceFile: ``.seq`` From cc2fe9fa9b4536bfebd231f44f10b24379cd4db4 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Mon, 23 Mar 2020 21:02:37 -0600 Subject: [PATCH 30/51] Enabled set header names in Univocity file readers --- .../kafka/connect/fs/file/reader/UnivocityFileReader.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java index caab986..dac8740 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java @@ -37,6 +37,7 @@ abstract class UnivocityFileReader> protected static final String FILE_READER_DELIMITED_SETTINGS_FORMAT = FILE_READER_DELIMITED_SETTINGS + "format."; public static final String FILE_READER_DELIMITED_SETTINGS_HEADER = FILE_READER_DELIMITED_SETTINGS + "header"; + public static final String FILE_READER_DELIMITED_SETTINGS_HEADER_NAMES = FILE_READER_DELIMITED_SETTINGS + "header_names"; public static final String FILE_READER_DELIMITED_SETTINGS_LINE_SEPARATOR_DETECTION = FILE_READER_DELIMITED_SETTINGS + "line_separator_detection"; public static final String FILE_READER_DELIMITED_SETTINGS_NULL_VALUE = FILE_READER_DELIMITED_SETTINGS + "null_value"; public static final String FILE_READER_DELIMITED_SETTINGS_MAX_COLUMNS = FILE_READER_DELIMITED_SETTINGS + "max_columns"; @@ -100,6 +101,8 @@ protected void configure(Map config) { private T allSettings(Map config) { T settings = parserSettings(config); settings.setHeaderExtractionEnabled(getBoolean(config, 
FILE_READER_DELIMITED_SETTINGS_HEADER, false)); + settings.setHeaders(Optional.ofNullable(config.get(FILE_READER_DELIMITED_SETTINGS_HEADER_NAMES)) + .map(headers -> headers.split(",")).orElse(null)); settings.setLineSeparatorDetectionEnabled(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_LINE_SEPARATOR_DETECTION, false)); settings.setNullValue(config.get(FILE_READER_DELIMITED_SETTINGS_NULL_VALUE)); settings.setMaxColumns(Integer.parseInt(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_MAX_COLUMNS, "512"))); From 0bb547bdf8eaffa8ab28a26c7b444f62e8a2f3d8 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Mon, 23 Mar 2020 21:02:58 -0600 Subject: [PATCH 31/51] New fixed-width file reader --- .../fs/file/reader/AgnosticFileReader.java | 36 +++++++------ .../fs/file/reader/FixedWidthFileReader.java | 50 +++++++++++++++++ .../file/reader/AgnosticFileReaderTest.java | 21 ++++++++ .../file/reader/FixedWidthFileReaderTest.java | 54 +++++++++++++++++++ 4 files changed, 146 insertions(+), 15 deletions(-) create mode 100644 src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReader.java create mode 100644 src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReaderTest.java diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java index 9ee8665..9f5930f 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java @@ -8,8 +8,9 @@ import java.io.IOException; import java.util.Arrays; -import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; @@ -24,11 +25,12 @@ public class AgnosticFileReader extends AbstractFileReader reader; - private List parquetExtensions, avroExtensions, sequenceExtensions, - jsonExtensions, csvExtensions, tsvExtensions; + private Set parquetExtensions, avroExtensions, sequenceExtensions, + jsonExtensions, csvExtensions, tsvExtensions, fixedExtensions; public AgnosticFileReader(FileSystem fs, Path filePath, Map config) throws IOException { super(fs, filePath, new AgnosticAdapter(), config); @@ -61,6 +63,8 @@ private AbstractFileReader readerByExtension(FileSystem fs, Path filePat clz = CsvFileReader.class; } else if (tsvExtensions.contains(extension)) { clz = TsvFileReader.class; + } else if (fixedExtensions.contains(extension)) { + clz = FixedWidthFileReader.class; } else { clz = TextFileReader.class; } @@ -70,18 +74,20 @@ private AbstractFileReader readerByExtension(FileSystem fs, Path filePat @Override protected void configure(Map config) { - this.parquetExtensions = Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, "parquet") - .toLowerCase().split(",")); - this.avroExtensions = Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, "avro") - .toLowerCase().split(",")); - this.sequenceExtensions = Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE, "seq") - .toLowerCase().split(",")); - this.jsonExtensions = Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_JSON, "json") - .toLowerCase().split(",")); - this.csvExtensions = Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_CSV, "csv") - .toLowerCase().split(",")); - this.tsvExtensions = 
Arrays.asList(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_TSV, "tsv") - .toLowerCase().split(",")); + this.parquetExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, "parquet") + .toLowerCase().split(",")).collect(Collectors.toSet()); + this.avroExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_AVRO, "avro") + .toLowerCase().split(",")).collect(Collectors.toSet()); + this.sequenceExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_SEQUENCE, "seq") + .toLowerCase().split(",")).collect(Collectors.toSet()); + this.jsonExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_JSON, "json") + .toLowerCase().split(",")).collect(Collectors.toSet()); + this.csvExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_CSV, "csv") + .toLowerCase().split(",")).collect(Collectors.toSet()); + this.tsvExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_TSV, "tsv") + .toLowerCase().split(",")).collect(Collectors.toSet()); + this.fixedExtensions = Arrays.stream(config.getOrDefault(FILE_READER_AGNOSTIC_EXTENSIONS_FIXED, "fixed") + .toLowerCase().split(",")).collect(Collectors.toSet()); } @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReader.java new file mode 100644 index 0000000..52f4a95 --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReader.java @@ -0,0 +1,50 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import com.univocity.parsers.common.AbstractParser; +import com.univocity.parsers.fixed.FixedWidthFields; +import com.univocity.parsers.fixed.FixedWidthParser; +import com.univocity.parsers.fixed.FixedWidthParserSettings; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Map; +import java.util.Optional; + +public class FixedWidthFileReader extends UnivocityFileReader { + + public static final String FILE_READER_DELIMITED_SETTINGS_FIELD_LENGTHS = FILE_READER_DELIMITED_SETTINGS + "field_lengths"; + public static final String FILE_READER_DELIMITED_SETTINGS_KEEP_PADDING = FILE_READER_DELIMITED_SETTINGS + "keep_padding"; + public static final String FILE_READER_DELIMITED_SETTINGS_PADDING_FOR_HEADERS = FILE_READER_DELIMITED_SETTINGS + "padding_for_headers"; + public static final String FILE_READER_DELIMITED_SETTINGS_ENDS_ON_NEW_LINE = FILE_READER_DELIMITED_SETTINGS + "ends_on_new_line"; + public static final String FILE_READER_DELIMITED_SETTINGS_SKIP_TRAILING_CHARS = FILE_READER_DELIMITED_SETTINGS + "skip_trailing_chars"; + + public static final String FILE_READER_DELIMITED_SETTINGS_FORMAT_PADDING = FILE_READER_DELIMITED_SETTINGS_FORMAT + "padding"; + + public FixedWidthFileReader(FileSystem fs, Path filePath, Map config) throws IOException { + super(fs, filePath, config); + } + + @Override + protected FixedWidthParserSettings parserSettings(Map config) { + FixedWidthFields fieldLengths = new FixedWidthFields(); + Optional.ofNullable(config.get(FILE_READER_DELIMITED_SETTINGS_FIELD_LENGTHS)) + .map(fl -> Arrays.stream(fl.split(","))) + .ifPresent(fl -> fl.forEach(field -> fieldLengths.addField(Integer.parseInt(field)))); + + FixedWidthParserSettings settings = new FixedWidthParserSettings(fieldLengths); + 
settings.setKeepPadding(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_KEEP_PADDING, false)); + settings.setUseDefaultPaddingForHeaders(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_PADDING_FOR_HEADERS, true)); + settings.setRecordEndsOnNewline(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_ENDS_ON_NEW_LINE, true)); + settings.setSkipTrailingCharsUntilNewline(getBoolean(config, FILE_READER_DELIMITED_SETTINGS_SKIP_TRAILING_CHARS, false)); + settings.getFormat().setPadding(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_FORMAT_PADDING, " ").charAt(0)); + + return settings; + } + + @Override + protected AbstractParser createParser(FixedWidthParserSettings settings) { + return new FixedWidthParser(settings); + } +} diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java index 7f25e66..632b13b 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java @@ -71,6 +71,27 @@ public String getFileExtension() { } } + @Nested + class AgnosticFixedWidthFileReaderTest extends FixedWidthFileReaderTest { + + @Override + protected Map getReaderConfig() { + Map config = super.getReaderConfig(); + config.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_FIXED, getFileExtension()); + return config; + } + + @Override + public Class getReaderClass() { + return AgnosticFileReader.class; + } + + @Override + public String getFileExtension() { + return FILE_EXTENSION; + } + } + @Nested class AgnosticJsonFileReaderTest extends JsonFileReaderTest { diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReaderTest.java new file mode 100644 index 0000000..6f0ff01 --- /dev/null +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReaderTest.java @@ -0,0 +1,54 @@ +package com.github.mmolimar.kafka.connect.fs.file.reader; + +import org.apache.hadoop.fs.Path; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.UUID; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +public class FixedWidthFileReaderTest extends UnivocityFileReaderTest { + + private static final int[] fieldLengths = new int[]{45, 53, 71, 89}; + + @Override + protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { + boolean header = args.length < 1 || (boolean) args[0]; + CompressionType compression = args.length < 2 ? COMPRESSION_TYPE_DEFAULT : (CompressionType) args[1]; + File txtFile = File.createTempFile("test-", "." 
+ getFileExtension()); + try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { + if (header) { + writer.append(String.format("%-" + fieldLengths[0] + "s", FIELD_COLUMN1) + + String.format("%-" + fieldLengths[1] + "s", FIELD_COLUMN2) + + String.format("%-" + fieldLengths[2] + "s", FIELD_COLUMN3) + + String.format("%-" + fieldLengths[3] + "s", FIELD_COLUMN4) + "\n"); + } + IntStream.range(0, NUM_RECORDS).forEach(index -> { + String value = String.format("%d_%s", index, UUID.randomUUID()); + writer.append(String.format("%-" + fieldLengths[0] + "s", value) + + String.format("%-" + fieldLengths[1] + "s", value) + + String.format("%-" + fieldLengths[2] + "s", value) + + String.format("%-" + fieldLengths[3] + "s", value) + "\n"); + fsConfig.offsetsByIndex().put(index, (long) index); + }); + } + Path path = new Path(new Path(fsConfig.getFsUri()), txtFile.getName()); + fsConfig.getFs().moveFromLocalFile(new Path(txtFile.getAbsolutePath()), path); + return path; + } + + @Override + protected Map getReaderConfig() { + return new HashMap() {{ + put(FixedWidthFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + put(FixedWidthFileReader.FILE_READER_DELIMITED_SETTINGS_FIELD_LENGTHS, + Arrays.stream(fieldLengths).mapToObj(String::valueOf).collect(Collectors.joining(","))); + }}; + } + +} From a797971d777f41e13c3cce8ebab68a273f899a1c Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Mon, 23 Mar 2020 21:19:10 -0600 Subject: [PATCH 32/51] Added new file reader to the documentation --- config/kafka-connect-fs.properties | 2 +- docs/source/config_options.rst | 196 ++++++++++++++++-- docs/source/filereaders.rst | 16 ++ .../kafka-connect-fs-version.properties | 2 +- 4 files changed, 200 insertions(+), 16 deletions(-) diff --git a/config/kafka-connect-fs.properties b/config/kafka-connect-fs.properties index 28ab531..67435af 100644 --- a/config/kafka-connect-fs.properties +++ b/config/kafka-connect-fs.properties @@ -5,5 +5,5 @@ fs.uris=file:///data,hdfs://localhost:9000/ topic=mytopic policy.class=com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy policy.recursive=true -policy.regexp=^[0-9]*\.txt$ +policy.regexp=^.*\.txt$ file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader diff --git a/docs/source/config_options.rst b/docs/source/config_options.rst index df4a946..70c8b6c 100644 --- a/docs/source/config_options.rst +++ b/docs/source/config_options.rst @@ -301,6 +301,13 @@ CSV To configure custom properties for this reader, the name you must use is ``delimited`` (even though it's for CSV). +``file_reader.delimited.settings.format.delimiter`` + Field delimiter. + + * Type: string + * Default: ``,`` + * Importance: high + ``file_reader.delimited.settings.header`` If the file contains header or not. @@ -308,12 +315,12 @@ To configure custom properties for this reader, the name you must use is ``delim * Default: ``false`` * Importance: high -``file_reader.delimited.settings.format.delimiter`` - Field delimiter. +``file_reader.delimited.settings.header_names`` + A comma-separated list of ordered field names to set when reading a file. - * Type: string - * Default: ``,`` - * Importance: high + * Type: string[] + * Default: ``null`` + * Importance: medium ``file_reader.delimited.settings.null_value`` Default value for ``null`` values. 
@@ -362,14 +369,14 @@ To configure custom properties for this reader, the name you must use is ``delim * Type: boolean * Default: ``false`` - * Importance: medium + * Importance: low ``file_reader.delimited.settings.delimiter_detection`` If the reader should detect the delimiter automatically. * Type: boolean * Default: ``false`` - * Importance: medium + * Importance: low ``file_reader.delimited.settings.ignore_leading_whitespaces`` Flag to enable/disable skipping leading whitespaces from values. @@ -449,6 +456,13 @@ To configure custom properties for this reader, the name you must use is ``delim * Default: ``false`` * Importance: high +``file_reader.delimited.settings.header_names`` + A comma-separated list of ordered field names to set when reading a file. + + * Type: string[] + * Default: ``null`` + * Importance: medium + ``file_reader.delimited.settings.null_value`` Default value for ``null`` values. @@ -484,13 +498,6 @@ To configure custom properties for this reader, the name you must use is ``delim * Default: ``0`` * Importance: low -``file_reader.delimited.settings.line_separator_detection`` - If the reader should detect the line separator automatically. - - * Type: boolean - * Default: ``false`` - * Importance: medium - ``file_reader.delimited.settings.line_separator_detection`` If the reader should detect the line separator automatically. @@ -506,6 +513,13 @@ To configure custom properties for this reader, the name you must use is ``delim * Default: ``true`` * Importance: low +``file_reader.delimited.settings.ignore_leading_whitespaces`` + Flag to enable/disable skipping leading whitespaces from values. + + * Type: boolean + * Default: ``true`` + * Importance: low + ``file_reader.delimited.settings.ignore_trailing_whitespaces`` Flag to enable/disable skipping trailing whitespaces from values. @@ -556,6 +570,153 @@ To configure custom properties for this reader, the name you must use is ``delim * Default: ``true`` * Importance: low +.. _config_options-filereaders-fixedwidth: + +FixedWidth +-------------------------------------------- + +To configure custom properties for this reader, the name you must use is ``delimited`` (even though it's for FixedWidth). + +``file_reader.delimited.settings.field_lengths`` + A comma-separated ordered list of integers with the lengths of each field. + + * Type: int[] + * Importance: high + +``file_reader.delimited.settings.header`` + If the file contains header or not. + + * Type: boolean + * Default: ``false`` + * Importance: high + +``file_reader.delimited.settings.header_names`` + A comma-separated list of ordered field names to set when reading a file. + + * Type: string[] + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.keep_padding`` + If the padding character should be kept in each value. + + * Type: boolean + * Default: ``false`` + * Importance: medium + +``file_reader.delimited.settings.padding_for_headers`` + If headers have the default padding specified. + + * Type: boolean + * Default: ``true`` + * Importance: medium + +``file_reader.delimited.settings.null_value`` + Default value for ``null`` values. + + * Type: string + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.format.ends_on_new_line`` + If each record ends when a new line is found. + + * Type: boolean + * Default: ``true`` + * Importance: medium + +``file_reader.delimited.settings.format.line_separator`` + Line separator to be used.
+ + * Type: string + * Default: ``\n`` + * Importance: medium + +``file_reader.delimited.settings.format.padding`` + The padding character used to represent unwritten spaces. + + * Type: char + * Default: `` `` + * Importance: medium + +``file_reader.delimited.settings.max_columns`` + Maximum number of columns allowed per record. + + * Type: int + * Default: ``512`` + * Importance: low + +``file_reader.delimited.settings.max_chars_per_column`` + Maximum number of characters allowed per column. + + * Type: int + * Default: ``4096`` + * Importance: low + +``file_reader.delimited.settings.skip_trailing_chars`` + If the trailing characters beyond the record's length should be skipped. + + * Type: boolean + * Default: ``false`` + * Importance: low + +``file_reader.delimited.settings.rows_to_skip`` + Number of rows to skip. + + * Type: long + * Default: ``0`` + * Importance: low + +``file_reader.delimited.settings.line_separator_detection`` + If the reader should detect the line separator automatically. + + * Type: boolean + * Default: ``false`` + * Importance: low + +``file_reader.delimited.settings.ignore_leading_whitespaces`` + Flag to enable/disable skipping leading whitespaces from values. + + * Type: boolean + * Default: ``true`` + * Importance: low + +``file_reader.delimited.settings.ignore_trailing_whitespaces`` + Flag to enable/disable skipping trailing whitespaces from values. + + * Type: boolean + * Default: ``true`` + * Importance: low + +``file_reader.delimited.settings.format.comment`` + Character that represents a line comment at the beginning of a line. + + * Type: char + * Default: ``#`` + * Importance: low + +``file_reader.delimited.encoding`` + Encoding to use for reading a file. If not specified, the reader will use the default encoding. + + * Type: string + * Default: based on the locale and charset of the underlying operating system. + * Importance: medium + +``file_reader.delimited.compression.type`` + Compression type to use when reading a file. + + * Type: enum (available values ``bzip2``, ``gzip`` and ``none``) + * Default: ``none`` + * Importance: medium + +``file_reader.delimited.compression.concatenated`` + Flag to specify if the decompression of the reader will finish at the end of the file or after + the first compressed stream. + + * Type: boolean + * Default: ``true`` + * Importance: low + .. _config_options-filereaders-text: Text @@ -649,6 +810,13 @@ To configure custom properties for this reader, the name you must use is ``agnos * Default: ``tsv`` * Importance: medium +``file_reader.agnostic.extensions.fixed`` + A comma-separated string list with the accepted extensions for fixed-width files. + + * Type: string + * Default: ``fixed`` + * Importance: medium + .. note:: The Agnostic reader uses the previous ones as inner readers. So, in case of using this reader, you'll probably need to include also the specified properties for those readers in the connector configuration as well. diff --git a/docs/source/filereaders.rst b/docs/source/filereaders.rst index 2b029c5..f887499 100644 --- a/docs/source/filereaders.rst +++ b/docs/source/filereaders.rst @@ -57,6 +57,8 @@ in the message sent to Kafka. If there is no header, the value of each column wi the field named ``column_N`` (**N** represents the column index) in the message. Also, the token delimiter for columns is configurable. +This reader is based on the `Univocity CSV parser `__. + More information about properties of this file reader :ref:`here`.
TSV @@ -66,8 +68,21 @@ TSV file reader using a tab (``\t``) to distinguish different columns on each li Its behaviour is the same one for the CSV file reader regarding the header and the column names. +This reader is based on the `Univocity TSV parser `__. + More information about properties of this file reader :ref:`here`. +FixedWidth +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +FixedWidth is a plain text file reader which distinguishes each column based on the length of each field. + +Its behaviour is the same one for the CSV/TSV file readers regarding the header and the column names. + +This reader is based on the `Univocity Fixed-Width parser `__. + +More information about properties of this file reader :ref:`here`. + Text ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -96,6 +111,7 @@ Default extensions for each format (configurable): * JSON: ``.json`` * CSV: ``.csv`` * TSV: ``.tsv`` +* FixedWidth: ``.fixed`` * Text: any other sort of file extension. More information about properties of this file reader :ref:`here`. diff --git a/src/main/resources/kafka-connect-fs-version.properties b/src/main/resources/kafka-connect-fs-version.properties index e5683df..defbd48 100644 --- a/src/main/resources/kafka-connect-fs-version.properties +++ b/src/main/resources/kafka-connect-fs-version.properties @@ -1 +1 @@ -version=${project.version} \ No newline at end of file +version=${project.version} From 7e15fb90c441f5a43d5ed7fa1f2c0890cdae5af3 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 5 Apr 2020 20:21:58 -0500 Subject: [PATCH 33/51] Prevent stack overflow error when listing files in the FS --- .../connect/fs/policy/AbstractPolicy.java | 21 ++++++--- .../kafka/connect/fs/util/TailCall.java | 44 +++++++++++++++++++ 2 files changed, 58 insertions(+), 7 deletions(-) create mode 100644 src/main/java/com/github/mmolimar/kafka/connect/fs/util/TailCall.java diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index 849692a..37da859 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -4,6 +4,7 @@ import com.github.mmolimar.kafka.connect.fs.file.FileMetadata; import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; +import com.github.mmolimar.kafka.connect.fs.util.TailCall; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; @@ -134,25 +135,31 @@ public Iterator listFiles(FileSystem fs) throws IOException { RemoteIterator it = fs.listFiles(fs.getWorkingDirectory(), recursive); LocatedFileStatus current = null; - @Override - public boolean hasNext() { + private TailCall hasNextRec() { try { if (current == null) { - if (!it.hasNext()) return false; + if (!it.hasNext()) { + return TailCall.done(false); + } current = it.next(); - return hasNext(); + return this::hasNextRec; } - if (current.isFile() && + if (current.isFile() & fileRegexp.matcher(current.getPath().getName()).find()) { - return true; + return TailCall.done(true); } current = null; - return hasNext(); + return this::hasNextRec; } catch (IOException ioe) { throw new ConnectException(ioe); } } + @Override + public boolean hasNext() { + return hasNextRec().invoke(); + } + @Override public FileMetadata next() { if (!hasNext() && current == null) 
{ diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/TailCall.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/TailCall.java new file mode 100644 index 0000000..5b82099 --- /dev/null +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/TailCall.java @@ -0,0 +1,44 @@ +package com.github.mmolimar.kafka.connect.fs.util; + +import java.util.stream.Stream; + +@FunctionalInterface +public interface TailCall { + + TailCall apply(); + + default boolean completed() { + return false; + } + + default T result() { + throw new IllegalStateException("Call does not have a value."); + } + + default T invoke() { + return Stream.iterate(this, TailCall::apply) + .filter(TailCall::completed) + .findFirst() + .get() + .result(); + } + + static TailCall done(final T value) { + return new TailCall() { + @Override + public boolean completed() { + return true; + } + + @Override + public T result() { + return value; + } + + @Override + public TailCall apply() { + throw new IllegalStateException("Done cannot be applied."); + } + }; + } +} From 709b3808904948352c0dccc5646f13945a333c2c Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 12 Apr 2020 15:07:01 -0500 Subject: [PATCH 34/51] Updating Maven deps --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index fcdad09..130728a 100644 --- a/pom.xml +++ b/pom.xml @@ -17,9 +17,9 @@ 1.11.0 2.8.4 9.0.2 - 5.6.0 + 5.6.2 4.2 - 2.0.5 + 2.0.7 1.8 ${maven-compiler.source} 3.2.0 From b7ff3e7cdbf6ed8008623c159764e71590db706c Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 12 Apr 2020 17:00:49 -0500 Subject: [PATCH 35/51] Refactor offset management --- .../kafka/connect/fs/FsSourceTask.java | 13 +--- .../kafka/connect/fs/file/Offset.java | 7 --- .../fs/file/reader/AbstractFileReader.java | 16 +++++ .../fs/file/reader/AgnosticFileReader.java | 5 +- .../fs/file/reader/AvroFileReader.java | 37 ++---------- .../connect/fs/file/reader/FileReader.java | 7 +-- .../fs/file/reader/JsonFileReader.java | 8 +-- .../fs/file/reader/ParquetFileReader.java | 43 ++------------ .../fs/file/reader/SequenceFileReader.java | 47 +++------------ .../fs/file/reader/TextFileReader.java | 41 ++----------- .../fs/file/reader/UnivocityFileReader.java | 45 +++----------- .../connect/fs/policy/AbstractPolicy.java | 27 ++++----- .../file/reader/AgnosticFileReaderTest.java | 5 ++ .../fs/file/reader/AvroFileReaderTest.java | 6 -- .../fs/file/reader/FileReaderTestBase.java | 23 +++----- .../fs/file/reader/JsonFileReaderTest.java | 6 -- .../fs/file/reader/ParquetFileReaderTest.java | 6 -- .../file/reader/SequenceFileReaderTest.java | 59 ++++++++++++++++--- .../fs/file/reader/TextFileReaderTest.java | 6 -- .../file/reader/UnivocityFileReaderTest.java | 20 +++---- 20 files changed, 142 insertions(+), 285 deletions(-) delete mode 100644 src/main/java/com/github/mmolimar/kafka/connect/fs/file/Offset.java diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java index 1fdc9b5..db8188f 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java @@ -1,7 +1,6 @@ package com.github.mmolimar.kafka.connect.fs; import com.github.mmolimar.kafka.connect.fs.file.FileMetadata; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import com.github.mmolimar.kafka.connect.fs.file.reader.FileReader; import 
com.github.mmolimar.kafka.connect.fs.policy.Policy; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; @@ -101,16 +100,10 @@ private Stream asStream(Iterator src) { return StreamSupport.stream(iterable.spliterator(), false); } - private SourceRecord convert(FileMetadata metadata, Offset offset, Struct struct) { + private SourceRecord convert(FileMetadata metadata, long offset, Struct struct) { return new SourceRecord( - new HashMap() { - { - put("path", metadata.getPath()); - //TODO manage blocks - //put("blocks", metadata.getBlocks().toString()); - } - }, - Collections.singletonMap("offset", offset.getRecordOffset()), + Collections.singletonMap("path", metadata.getPath()), + Collections.singletonMap("offset", offset), config.getTopic(), struct.schema(), struct diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/Offset.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/Offset.java deleted file mode 100644 index ca1d530..0000000 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/Offset.java +++ /dev/null @@ -1,7 +0,0 @@ -package com.github.mmolimar.kafka.connect.fs.file; - -public interface Offset { - - long getRecordOffset(); - -} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java index dae25af..a54814b 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java @@ -17,6 +17,7 @@ public abstract class AbstractFileReader implements FileReader { private final FileSystem fs; private final Path filePath; private final ReaderAdapter adapter; + private long offset; public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter, Map config) { if (fs == null || filePath == null) { @@ -25,6 +26,7 @@ public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter this.fs = fs; this.filePath = filePath; this.adapter = adapter; + this.offset = 0; configure(readerConfig(config)); } @@ -47,10 +49,24 @@ public Path getFilePath() { return filePath; } + @Override public final Struct next() { return adapter.apply(nextRecord()); } + @Override + public long currentOffset() { + return offset; + } + + protected void incrementOffset() { + this.offset++; + } + + protected void setOffset(long offset) { + this.offset = offset; + } + protected abstract T nextRecord(); protected ReaderAdapter getAdapter() { diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java index 9f5930f..478dacb 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -96,12 +95,12 @@ public boolean hasNext() { } @Override - public void seek(Offset offset) { + public void seek(long offset) { reader.seek(offset); } @Override - public Offset currentOffset() { + public long currentOffset() { return reader.currentOffset(); } diff --git 
a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java index 2438f51..589ded7 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import io.confluent.connect.avro.AvroData; import org.apache.avro.AvroRuntimeException; import org.apache.avro.Schema; @@ -26,7 +25,6 @@ public class AvroFileReader extends AbstractFileReader { public static final String FILE_READER_AVRO_SCHEMA = FILE_READER_AVRO + "schema"; - private final AvroOffset offset; private final DataFileReader reader; private Schema schema; @@ -39,7 +37,6 @@ public AvroFileReader(FileSystem fs, Path filePath, Map config) } else { this.reader = new DataFileReader<>(input, new SpecificDatumReader<>(this.schema)); } - this.offset = new AvroOffset(0); } @Override @@ -62,7 +59,7 @@ public boolean hasNext() { protected GenericRecord nextRecord() { try { GenericRecord record = reader.next(); - this.offset.inc(); + incrementOffset(); return record; } catch (AvroRuntimeException are) { @@ -71,47 +68,21 @@ protected GenericRecord nextRecord() { } @Override - public void seek(Offset offset) { + public void seek(long offset) { try { - reader.sync(offset.getRecordOffset()); - this.offset.setOffset(reader.previousSync() - 16); + reader.sync(offset); + setOffset(reader.previousSync() - 16L); } catch (IOException ioe) { throw new ConnectException("Error seeking file " + getFilePath(), ioe); } } - @Override - public Offset currentOffset() { - return offset; - } - @Override public void close() throws IOException { reader.sync(0); reader.close(); } - public static class AvroOffset implements Offset { - private long offset; - - public AvroOffset(long offset) { - this.offset = offset; - } - - public void setOffset(long offset) { - this.offset = offset; - } - - void inc() { - this.offset++; - } - - @Override - public long getRecordOffset() { - return offset; - } - } - static class GenericRecordToStruct implements ReaderAdapter { private static final int CACHE_SIZE = 100; diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReader.java index 521ddbb..518e9f8 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReader.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; @@ -16,12 +15,12 @@ public interface FileReader extends Iterator, Closeable { Struct next(); - void seek(Offset offset); + void seek(long offset); - Offset currentOffset(); + long currentOffset(); } @FunctionalInterface interface ReaderAdapter extends Function { -} \ No newline at end of file +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java index 76db116..700d38b 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java +++ 
b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java @@ -1,10 +1,8 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Schema; @@ -50,7 +48,7 @@ public JsonFileReader(FileSystem fs, Path filePath, Map config) String line = inner.nextRecord().getValue(); this.schema = extractSchema(mapper.readTree(line)); //back to the first line - inner.seek(() -> 0); + inner.seek(0); } else { this.schema = SchemaBuilder.struct().build(); } @@ -91,12 +89,12 @@ public boolean hasNext() { } @Override - public void seek(Offset offset) { + public void seek(long offset) { inner.seek(offset); } @Override - public Offset currentOffset() { + public long currentOffset() { return inner.currentOffset(); } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java index cf12483..7253a4b 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import io.confluent.connect.avro.AvroData; import org.apache.avro.Schema; import org.apache.avro.generic.GenericData; @@ -29,19 +28,15 @@ public class ParquetFileReader extends AbstractFileReader { public static final String FILE_READER_PARQUET_SCHEMA = FILE_READER_PARQUET + "schema"; public static final String FILE_READER_PARQUET_PROJECTION = FILE_READER_PARQUET + "projection"; - private final ParquetOffset offset; - private ParquetReader reader; private GenericRecord currentRecord; private Schema schema; private Schema projection; private boolean closed; - public ParquetFileReader(FileSystem fs, Path filePath, Map config) throws IOException { super(fs, filePath, new GenericRecordToStruct(), config); - this.offset = new ParquetOffset(0); this.reader = initReader(); this.closed = false; } @@ -94,64 +89,38 @@ record = new GenericData.Record(this.projection); record = currentRecord; } currentRecord = null; - offset.inc(); + incrementOffset(); return record; } @Override - public void seek(Offset offset) { + public void seek(long offset) { if (closed) { throw new ConnectException("Stream is closed!"); } - if (offset.getRecordOffset() < 0) { + if (offset < 0) { throw new IllegalArgumentException("Record offset must be greater than 0"); } - if (this.offset.getRecordOffset() > offset.getRecordOffset()) { + if (currentOffset() > offset) { try { this.reader = initReader(); - this.offset.setOffset(0); + setOffset(0); this.closed = false; } catch (IOException ioe) { throw new ConnectException("Error initializing parquet reader", ioe); } } - while (hasNext() && this.offset.getRecordOffset() < offset.getRecordOffset()) { + while (hasNext() && currentOffset() < offset) { nextRecord(); } } - @Override - public Offset currentOffset() { - return offset; - } - @Override public void close() throws IOException { this.closed = true; reader.close(); } - public static class ParquetOffset 
implements Offset { - private long offset; - - public ParquetOffset(long offset) { - this.offset = offset; - } - - public void setOffset(long offset) { - this.offset = offset; - } - - void inc() { - this.offset++; - } - - @Override - public long getRecordOffset() { - return offset; - } - } - static class GenericRecordToStruct implements ReaderAdapter { private static final int CACHE_SIZE = 100; private final AvroData avroData; diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java index bdde95b..e21bdf2 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.*; @@ -32,7 +31,6 @@ public class SequenceFileReader extends AbstractFileReader conf .field(keyFieldName, getSchema(this.key)) .field(valueFieldName, getSchema(this.value)) .build(); - this.offset = new SeqOffset(0); this.recordIndex = this.hasNextIndex = -1; this.hasNext = false; this.closed = false; @@ -63,7 +60,7 @@ protected void configure(Map config) { this.valueFieldName = config.getOrDefault(FILE_READER_SEQUENCE_FIELD_NAME_VALUE, FIELD_NAME_VALUE_DEFAULT); } - private Schema getSchema(Writable writable) { + Schema getSchema(Writable writable) { if (writable instanceof ByteWritable) { return SchemaBuilder.INT8_SCHEMA; } else if (writable instanceof ShortWritable) { @@ -75,7 +72,7 @@ private Schema getSchema(Writable writable) { } else if (writable instanceof FloatWritable) { return SchemaBuilder.FLOAT32_SCHEMA; } else if (writable instanceof DoubleWritable) { - return SchemaBuilder.INT64_SCHEMA; + return SchemaBuilder.FLOAT64_SCHEMA; } else if (writable instanceof BytesWritable) { return SchemaBuilder.BYTES_SCHEMA; } else if (writable instanceof BooleanWritable) { @@ -90,7 +87,7 @@ public boolean hasNext() { try { if (hasNextIndex == -1 || hasNextIndex == recordIndex) { hasNextIndex++; - offset.inc(); + incrementOffset(); hasNext = reader.next(key, value); } return hasNext; @@ -111,52 +108,26 @@ protected SequenceRecord nextRecord() { } @Override - public void seek(Offset offset) { - if (offset.getRecordOffset() < 0) { + public void seek(long offset) { + if (offset < 0) { throw new IllegalArgumentException("Record offset must be greater than 0"); } try { - reader.sync(offset.getRecordOffset()); - hasNextIndex = recordIndex = offset.getRecordOffset(); + reader.sync(offset); + hasNextIndex = recordIndex = offset; hasNext = false; - this.offset.setOffset(offset.getRecordOffset() - 1); + setOffset(offset - 1); } catch (IOException ioe) { throw new ConnectException("Error seeking file " + getFilePath(), ioe); } } - @Override - public Offset currentOffset() { - return offset; - } - @Override public void close() throws IOException { closed = true; reader.close(); } - public static class SeqOffset implements Offset { - private long offset; - - public SeqOffset(long offset) { - this.offset = offset; - } - - public void setOffset(long offset) { - this.offset = offset; - } - - void inc() { - this.offset++; - } - - @Override - public long getRecordOffset() { - return offset; - } - } - static class SeqToStruct implements ReaderAdapter> { @Override 
@@ -166,7 +137,7 @@ public Struct apply(SequenceRecord record) { .put(record.valueFieldName, toSchemaValue(record.value)); } - private Object toSchemaValue(Writable writable) { + Object toSchemaValue(Writable writable) { if (writable instanceof ByteWritable) { return ((ByteWritable) writable).get(); } else if (writable instanceof ShortWritable) { diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java index a12323e..060de36 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; import org.apache.hadoop.fs.FileSystem; @@ -33,7 +32,6 @@ public class TextFileReader extends AbstractFileReader config) throws IOException { super(fs, filePath, new TxtToStruct(), config); this.reader = new LineNumberReader(getFileReader(fs.open(filePath))); - this.offset = new TextOffset(0); } @Override @@ -116,62 +113,36 @@ protected TextRecord nextRecord() { } String aux = current; current = null; - offset.inc(); + incrementOffset(); return new TextRecord(schema, aux); } @Override - public void seek(Offset offset) { - if (offset.getRecordOffset() < 0) { + public void seek(long offset) { + if (offset < 0) { throw new IllegalArgumentException("Record offset must be greater than 0"); } try { current = null; - if (offset.getRecordOffset() < reader.getLineNumber()) { + if (offset < reader.getLineNumber()) { finished = false; reader.close(); reader = new LineNumberReader(getFileReader(getFs().open(getFilePath()))); } - while (reader.getLineNumber() < offset.getRecordOffset()) { + while (reader.getLineNumber() < offset) { reader.readLine(); } - this.offset.setOffset(reader.getLineNumber()); + setOffset(reader.getLineNumber()); } catch (IOException ioe) { throw new ConnectException("Error seeking file " + getFilePath(), ioe); } } - @Override - public Offset currentOffset() { - return offset; - } - @Override public void close() throws IOException { reader.close(); } - public static class TextOffset implements Offset { - private long offset; - - public TextOffset(long offset) { - this.offset = offset; - } - - public void setOffset(long offset) { - this.offset = offset; - } - - void inc() { - this.offset++; - } - - @Override - public long getRecordOffset() { - return offset; - } - } - static class TxtToStruct implements ReaderAdapter { @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java index dac8740..050ba4c 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import com.univocity.parsers.common.AbstractParser; import com.univocity.parsers.common.CommonParserSettings; import com.univocity.parsers.common.ParsingContext; @@ -55,7 +54,6 @@ abstract class 
UnivocityFileReader> private static final String DEFAULT_COLUMN_NAME = "column_"; - private final UnivocityOffset offset; private T settings; private Schema schema; private Charset charset; @@ -67,7 +65,6 @@ abstract class UnivocityFileReader> public UnivocityFileReader(FileSystem fs, Path filePath, Map config) throws IOException { super(fs, filePath, new UnivocityToStruct(), config); - this.offset = new UnivocityOffset(0); this.iterator = iterateRecords(); this.schema = buildSchema(this.iterator, settings.isHeaderExtractionEnabled()); } @@ -78,7 +75,7 @@ private Schema buildSchema(ResultIterator it, boolean ha Record first = it.next(); IntStream.range(0, first.getValues().length) .forEach(index -> builder.field(DEFAULT_COLUMN_NAME + ++index, SchemaBuilder.STRING_SCHEMA)); - seek(new UnivocityOffset(0)); + seek(0); } else if (hasHeader) { Optional.ofNullable(it.getContext().headers()).ifPresent(headers -> { IntStream.range(0, headers.length) @@ -150,7 +147,7 @@ private ResultIterator iterateRecords() throws IOExcepti protected final UnivocityRecord nextRecord() { if (!hasNext()) throw new NoSuchElementException("There are no more records in file: " + getFilePath()); - offset.inc(); + incrementOffset(); Record record = iterator.next(); return new UnivocityRecord(schema, record.getValues()); } @@ -163,58 +160,32 @@ public final boolean hasNext() { } @Override - public final void seek(Offset offset) { - if (offset.getRecordOffset() < 0) { + public final void seek(long offset) { + if (offset < 0) { throw new IllegalArgumentException("Record offset must be greater than 0"); } try { - if (offset.getRecordOffset() > this.offset.getRecordOffset()) { + if (offset > currentOffset()) { iterator.hasNext(); - iterator.getContext().skipLines(offset.getRecordOffset() - this.offset.getRecordOffset() - 1); + iterator.getContext().skipLines(offset - currentOffset() - 1); iterator.next(); } else { iterator = iterateRecords(); iterator.hasNext(); - iterator.getContext().skipLines(offset.getRecordOffset()); + iterator.getContext().skipLines(offset); } - this.offset.setOffset(offset.getRecordOffset()); + setOffset(offset); } catch (IOException ioe) { throw new ConnectException("Error seeking file " + getFilePath(), ioe); } } - @Override - public final Offset currentOffset() { - return offset; - } - @Override public final void close() { iterator.getContext().stop(); closed = true; } - public static class UnivocityOffset implements Offset { - private long offset; - - public UnivocityOffset(long offset) { - this.offset = offset; - } - - public void setOffset(long offset) { - this.offset = offset; - } - - void inc() { - this.offset++; - } - - @Override - public long getRecordOffset() { - return offset; - } - } - static class UnivocityToStruct implements ReaderAdapter { @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index 37da859..b57d0c2 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -186,8 +186,7 @@ final int getExecutions() { FileMetadata toMetadata(LocatedFileStatus fileStatus) { List blocks = Arrays.stream(fileStatus.getBlockLocations()) - .map(block -> - new FileMetadata.BlockInfo(block.getOffset(), block.getLength(), block.isCorrupt())) + .map(block -> new FileMetadata.BlockInfo(block.getOffset(), block.getLength(), block.isCorrupt())) 
.collect(Collectors.toList()); return new FileMetadata(fileStatus.getPath().toString(), fileStatus.getLen(), blocks); @@ -195,30 +194,24 @@ FileMetadata toMetadata(LocatedFileStatus fileStatus) { @Override public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorageReader) { - Map partition = new HashMap() {{ - put("path", metadata.getPath()); - //TODO manage blocks - //put("blocks", metadata.getBlocks().toString()); - }}; - FileSystem current = fileSystems.stream() .filter(fs -> metadata.getPath().startsWith(fs.getWorkingDirectory().toString())) - .findFirst().orElse(null); + .findFirst() + .orElse(null); - FileReader reader; try { - reader = ReflectionUtils.makeReader( + FileReader reader = ReflectionUtils.makeReader( (Class) conf.getClass(FsSourceTaskConfig.FILE_READER_CLASS), current, new Path(metadata.getPath()), conf.originals()); + Map partition = Collections.singletonMap("path", metadata.getPath()); + Map offset = offsetStorageReader.offset(partition); + if (offset != null && offset.get("offset") != null) { + reader.seek((Long) offset.get("offset")); + } + return reader; } catch (Throwable t) { throw new ConnectException("An error has occurred when creating reader for file: " + metadata.getPath(), t); } - - Map offset = offsetStorageReader.offset(partition); - if (offset != null && offset.get("offset") != null) { - reader.seek(() -> (Long) offset.get("offset")); - } - return reader; } private Iterator concat(final Iterator it1, diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java index 632b13b..ab44e27 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReaderTest.java @@ -165,6 +165,11 @@ protected Map getReaderConfig() { return config; } + @Override + public void schemaMapper(ReaderFsTestConfig fsConfig) { + + } + @Override public Class getReaderClass() { return AgnosticFileReader.class; diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java index 841c951..bebeff7 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import org.apache.avro.AvroTypeException; import org.apache.avro.Schema; import org.apache.avro.SchemaParseException; @@ -103,11 +102,6 @@ public void readerWithUnparseableSchema(ReaderFsTestConfig fsConfig) throws IOEx assertThrows(SchemaParseException.class, () -> getReader(testFs, fsConfig.getDataFile(), readerConfig)); } - @Override - protected Offset getOffset(long offset) { - return new AvroFileReader.AvroOffset(offset); - } - @Override protected Class getReaderClass() { return AvroFileReader.class; diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java index e691d87..f91e9af 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java +++ 
b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream; @@ -130,24 +129,24 @@ public void readAllData(ReaderFsTestConfig fsConfig) { public void seekFile(ReaderFsTestConfig fsConfig) { FileReader reader = fsConfig.getReader(); int recordIndex = NUM_RECORDS / 2; - reader.seek(getOffset(fsConfig.offsetsByIndex().get(recordIndex))); + reader.seek(fsConfig.offsetsByIndex().get(recordIndex)); assertTrue(reader.hasNext()); - assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset()); checkData(reader.next(), recordIndex); recordIndex = 0; - reader.seek(getOffset(fsConfig.offsetsByIndex().get(recordIndex))); + reader.seek(fsConfig.offsetsByIndex().get(recordIndex)); assertTrue(reader.hasNext()); - assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset()); checkData(reader.next(), recordIndex); recordIndex = NUM_RECORDS - 3; - reader.seek(getOffset(fsConfig.offsetsByIndex().get(recordIndex))); + reader.seek(fsConfig.offsetsByIndex().get(recordIndex)); assertTrue(reader.hasNext()); - assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset()); + assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset()); checkData(reader.next(), recordIndex); - reader.seek(getOffset(fsConfig.offsetsByIndex().get(NUM_RECORDS - 1) + 1)); + reader.seek(fsConfig.offsetsByIndex().get(NUM_RECORDS - 1) + 1); assertFalse(reader.hasNext()); } @@ -155,14 +154,14 @@ public void seekFile(ReaderFsTestConfig fsConfig) { @MethodSource("fileSystemConfigProvider") public void negativeSeek(ReaderFsTestConfig fsConfig) { FileReader reader = fsConfig.getReader(); - assertThrows(RuntimeException.class, () -> reader.seek(getOffset(-1))); + assertThrows(RuntimeException.class, () -> reader.seek(-1)); } @ParameterizedTest @MethodSource("fileSystemConfigProvider") public void exceededSeek(ReaderFsTestConfig fsConfig) { FileReader reader = fsConfig.getReader(); - reader.seek(getOffset(fsConfig.offsetsByIndex().get(NUM_RECORDS - 1) + 1)); + reader.seek(fsConfig.offsetsByIndex().get(NUM_RECORDS - 1) + 1); assertFalse(reader.hasNext()); assertThrows(NoSuchElementException.class, reader::next); } @@ -176,10 +175,6 @@ public void readFileAlreadyClosed(ReaderFsTestConfig fsConfig) throws IOExceptio assertThrows(IllegalStateException.class, reader::next); } - protected Offset getOffset(long offset) { - return () -> offset; - } - protected final FileReader getReader(FileSystem fs, Path path, Map config) throws Throwable { return ReflectionUtils.makeReader(getReaderClass(), fs, path, config); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java index 1df1cd0..fdc2422 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java +++ 
b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java @@ -6,7 +6,6 @@ import com.fasterxml.jackson.databind.ObjectWriter; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; import org.junit.jupiter.params.ParameterizedTest; @@ -160,11 +159,6 @@ public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { }); } - @Override - protected Offset getOffset(long offset) { - return () -> offset; - } - @Override protected Class getReaderClass() { return JsonFileReader.class; diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java index 891eeec..be5e831 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import org.apache.avro.AvroRuntimeException; import org.apache.avro.Schema; import org.apache.avro.SchemaBuilder; @@ -167,11 +166,6 @@ protected Map getReaderConfig() { return new HashMap<>(); } - @Override - protected Offset getOffset(long offset) { - return new ParquetFileReader.ParquetOffset(offset); - } - @Override protected Class getReaderClass() { return ParquetFileReader.class; diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java index ee5bdd8..cc62c0b 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java @@ -1,17 +1,16 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.*; import org.apache.hadoop.util.ReflectionUtils; +import org.apache.kafka.connect.data.SchemaBuilder; import org.apache.kafka.connect.data.Struct; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.MethodSource; +import java.io.DataInput; +import java.io.DataOutput; import java.io.File; import java.io.IOException; import java.util.HashMap; @@ -81,9 +80,53 @@ public void defaultFieldNames(ReaderFsTestConfig fsConfig) throws Throwable { assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); } - @Override - protected Offset getOffset(long offset) { - return new SequenceFileReader.SeqOffset(offset); + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void schemaMapper(ReaderFsTestConfig fsConfig) { + SequenceFileReader reader = (SequenceFileReader) fsConfig.getReader(); + + ByteWritable byteWritable = new ByteWritable((byte) 1); + ShortWritable shortWritable = new ShortWritable((short) 123); + IntWritable intWritable = new IntWritable(123); + LongWritable longWritable = 
new LongWritable(123L); + FloatWritable floatWritable = new FloatWritable(0.123F); + DoubleWritable doubleWritable = new DoubleWritable(0.123D); + BytesWritable bytesWritable = new BytesWritable(new byte[]{1, 2, 3}); + BooleanWritable booleanWritable = new BooleanWritable(true); + Text textWritable = new Text("123"); + + assertEquals(SchemaBuilder.INT8_SCHEMA, reader.getSchema(byteWritable)); + assertEquals(SchemaBuilder.INT16_SCHEMA, reader.getSchema(shortWritable)); + assertEquals(SchemaBuilder.INT32_SCHEMA, reader.getSchema(intWritable)); + assertEquals(SchemaBuilder.INT64_SCHEMA, reader.getSchema(longWritable)); + assertEquals(SchemaBuilder.FLOAT32_SCHEMA, reader.getSchema(floatWritable)); + assertEquals(SchemaBuilder.FLOAT64_SCHEMA, reader.getSchema(doubleWritable)); + assertEquals(SchemaBuilder.BYTES_SCHEMA, reader.getSchema(bytesWritable)); + assertEquals(SchemaBuilder.BOOLEAN_SCHEMA, reader.getSchema(booleanWritable)); + assertEquals(SchemaBuilder.STRING_SCHEMA, reader.getSchema(textWritable)); + assertEquals(SchemaBuilder.STRING_SCHEMA, reader.getSchema(new Writable() { + + @Override + public void write(DataOutput out) { + + } + + @Override + public void readFields(DataInput in) { + + } + })); + + SequenceFileReader.SeqToStruct seqToStruct = new SequenceFileReader.SeqToStruct(); + assertEquals(seqToStruct.toSchemaValue(byteWritable), byteWritable.get()); + assertEquals(seqToStruct.toSchemaValue(shortWritable), shortWritable.get()); + assertEquals(seqToStruct.toSchemaValue(intWritable), intWritable.get()); + assertEquals(seqToStruct.toSchemaValue(longWritable), longWritable.get()); + assertEquals(seqToStruct.toSchemaValue(floatWritable), floatWritable.get()); + assertEquals(seqToStruct.toSchemaValue(doubleWritable), doubleWritable.get()); + assertEquals(seqToStruct.toSchemaValue(bytesWritable), bytesWritable.getBytes()); + assertEquals(seqToStruct.toSchemaValue(booleanWritable), booleanWritable.get()); + assertEquals(seqToStruct.toSchemaValue(textWritable), textWritable.toString()); } @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java index 281bb24..5078d24 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; import org.junit.jupiter.params.ParameterizedTest; @@ -112,11 +111,6 @@ public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { }); } - @Override - protected Offset getOffset(long offset) { - return new TextFileReader.TextOffset(offset); - } - @Override protected Class getReaderClass() { return TextFileReader.class; diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java index d224027..438bb1e 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java @@ -1,6 +1,5 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; -import com.github.mmolimar.kafka.connect.fs.file.Offset; import 
org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.kafka.connect.data.Struct;
@@ -119,24 +118,24 @@ public void seekFileWithoutHeader(ReaderFsTestConfig fsConfig) throws Throwable
assertTrue(reader.hasNext());
int recordIndex = NUM_RECORDS / 2;
- reader.seek(getOffset(fsConfig.offsetsByIndex().get(recordIndex)));
+ reader.seek(fsConfig.offsetsByIndex().get(recordIndex));
assertTrue(reader.hasNext());
- assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset());
+ assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset());
checkData(reader.next(), recordIndex);
recordIndex = 0;
- reader.seek(getOffset(fsConfig.offsetsByIndex().get(recordIndex)));
+ reader.seek(fsConfig.offsetsByIndex().get(recordIndex));
assertTrue(reader.hasNext());
- assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset());
+ assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset());
checkData(reader.next(), recordIndex);
recordIndex = NUM_RECORDS - 3;
- reader.seek(getOffset(fsConfig.offsetsByIndex().get(recordIndex)));
+ reader.seek(fsConfig.offsetsByIndex().get(recordIndex));
assertTrue(reader.hasNext());
- assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset().getRecordOffset());
+ assertEquals(fsConfig.offsetsByIndex().get(recordIndex), reader.currentOffset());
checkData(reader.next(), recordIndex);
- reader.seek(getOffset(fsConfig.offsetsByIndex().get(NUM_RECORDS - 1) + 1));
+ reader.seek(fsConfig.offsetsByIndex().get(NUM_RECORDS - 1) + 1);
assertFalse(reader.hasNext());
}
@@ -159,11 +158,6 @@ public void invalidFileEncoding(ReaderFsTestConfig fsConfig) {
fsConfig.getDataFile(), readerConfig));
}
- @Override
- protected Offset getOffset(long offset) {
- return new T.UnivocityOffset(offset);
- }
-
@Override
protected Class getReaderClass() {
return (Class) ((ParameterizedType) this.getClass().getGenericSuperclass())
From 1d9b75c2257d729d23ae880fb3cc246a327c8841 Mon Sep 17 00:00:00 2001
From: Mario Molina
Date: Tue, 14 Apr 2020 20:27:08 -0500
Subject: [PATCH 36/51] Logging improvements
---
.../kafka/connect/fs/FsSourceConnector.java | 9 ++-
.../kafka/connect/fs/FsSourceTask.java | 37 +++++-----
.../fs/file/reader/AbstractFileReader.java | 61 ++++++++++++++++-
.../fs/file/reader/AgnosticFileReader.java | 23 ++++---
.../fs/file/reader/AvroFileReader.java | 38 +++++------
.../fs/file/reader/JsonFileReader.java | 27 ++++----
.../fs/file/reader/ParquetFileReader.java | 39 ++++-------
.../fs/file/reader/SequenceFileReader.java | 32 +++------
.../fs/file/reader/TextFileReader.java | 68 ++++++++-----------
.../fs/file/reader/UnivocityFileReader.java | 40 +++++------
.../connect/fs/policy/AbstractPolicy.java | 10 ++-
.../fs/policy/HdfsFileWatcherPolicy.java | 3 +-
.../connect/fs/util/ReflectionUtils.java | 9 +--
.../fs/file/reader/AvroFileReaderTest.java | 16 +++--
.../fs/file/reader/CsvFileReaderTest.java | 2 +-
.../fs/file/reader/FileReaderTestBase.java | 43 +++++++++---
.../fs/file/reader/JsonFileReaderTest.java | 21 ++++--
.../fs/file/reader/ParquetFileReaderTest.java | 37 +++++++---
.../file/reader/SequenceFileReaderTest.java | 2 +-
.../fs/file/reader/TextFileReaderTest.java | 17 +++--
.../file/reader/UnivocityFileReaderTest.java | 27 +++++---
.../connect/fs/policy/CronPolicyTest.java | 33 +++++++--
.../connect/fs/policy/PolicyTestBase.java | 37 ++++++----
.../connect/fs/policy/SleepyPolicyTest.java | 53 +++++++++++----
.../connect/fs/task/FsSourceTaskTest.java | 6 +-
src/test/resources/log4j.properties | 1 +
26 files changed, 418 insertions(+), 273 deletions(-)
diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java
index 0316acd..3689452 100644
--- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java
+++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java
@@ -32,11 +32,9 @@ public void start(Map properties) {
try {
config = new FsSourceConnectorConfig(properties);
} catch (ConfigException ce) {
- log.error("Couldn't start FsSourceConnector:", ce);
throw new ConnectException("Couldn't start FsSourceConnector due to configuration error.", ce);
} catch (Exception ce) {
- log.error("Couldn't start FsSourceConnector:", ce);
- throw new ConnectException("An error has occurred when starting FsSourceConnector" + ce);
+ throw new ConnectException("An error has occurred when starting FsSourceConnector." + ce);
}
}
@@ -48,9 +46,9 @@ public Class taskClass() {
@Override
public List> taskConfigs(int maxTasks) {
if (config == null) {
- throw new ConnectException("Connector config has not been initialized");
+ throw new ConnectException("Connector config has not been initialized.");
}
- List> taskConfigs = new ArrayList<>();
+ final List> taskConfigs = new ArrayList<>();
int groups = Math.min(config.getFsUris().size(), maxTasks);
ConnectorUtils.groupPartitions(config.getFsUris(), groups)
@@ -67,6 +65,7 @@ public List> taskConfigs(int maxTasks) {
@Override
public void stop() {
+ log.info("Stopping FsSourceConnector.");
//Nothing to do
}
diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java
index db8188f..51a9e3d 100644
--- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java
+++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java
@@ -21,6 +21,7 @@ import java.util.stream.StreamSupport;
public class FsSourceTask extends SourceTask {
+
private static final Logger log = LoggerFactory.getLogger(FsSourceTask.class);
private final AtomicBoolean stop = new AtomicBoolean(false);
@@ -34,15 +35,16 @@ public String version() {
@Override
public void start(Map properties) {
+ log.info("Starting FS source task...");
try {
config = new FsSourceTaskConfig(properties);
if (config.getClass(FsSourceTaskConfig.POLICY_CLASS).isAssignableFrom(Policy.class)) {
throw new ConfigException("Policy class " +
- config.getClass(FsSourceTaskConfig.POLICY_CLASS) + "is not a sublass of " + Policy.class);
+ config.getClass(FsSourceTaskConfig.POLICY_CLASS) + "is not a subclass of " + Policy.class);
}
if (config.getClass(FsSourceTaskConfig.FILE_READER_CLASS).isAssignableFrom(FileReader.class)) {
throw new ConfigException("FileReader class " +
- config.getClass(FsSourceTaskConfig.FILE_READER_CLASS) + "is not a sublass of " + FileReader.class);
+ config.getClass(FsSourceTaskConfig.FILE_READER_CLASS) + "is not a subclass of " + FileReader.class);
}
Class policyClass = (Class) Class.forName(properties.get(FsSourceTaskConfig.POLICY_CLASS));
@@ -51,10 +53,11 @@ public void start(Map properties) {
} catch (ConfigException ce) {
log.error("Couldn't start FsSourceTask:", ce);
throw new ConnectException("Couldn't start FsSourceTask due to configuration error", ce);
- } catch (Throwable t) {
- log.error("Couldn't start FsSourceConnector:", t);
- throw new ConnectException("A 
problem has occurred reading configuration:" + t.getMessage()); + } catch (Exception e) { + log.error("Couldn't start FsSourceConnector:", e); + throw new ConnectException("A problem has occurred reading configuration: " + e.getMessage()); } + log.info("FS source task started with policy {}", policy.getClass().getName()); } @Override @@ -62,36 +65,35 @@ public List poll() { while (!stop.get() && policy != null && !policy.hasEnded()) { log.trace("Polling for new data"); - final List results = new ArrayList<>(); - List files = filesToProcess(); - files.forEach(metadata -> { + return filesToProcess().map(metadata -> { + List records = new ArrayList<>(); try (FileReader reader = policy.offer(metadata, context.offsetStorageReader())) { log.info("Processing records for file {}", metadata); while (reader.hasNext()) { - results.add(convert(metadata, reader.currentOffset(), reader.next())); + records.add(convert(metadata, reader.currentOffset(), reader.next())); } } catch (ConnectException | IOException e) { //when an exception happens reading a file, the connector continues log.error("Error reading file from FS: " + metadata.getPath() + ". Keep going...", e); } - }); - return results; + return records; + }).flatMap(Collection::stream).collect(Collectors.toList()); } - return null; } - private List filesToProcess() { + private Stream filesToProcess() { try { return asStream(policy.execute()) .filter(metadata -> metadata.getLen() > 0) - .collect(Collectors.toList()); + .collect(Collectors.toList()) + .stream(); } catch (IOException | ConnectException e) { //when an exception happens executing the policy, the connector continues - log.error("Cannot retrieve files to process from the FS: " + policy.getURIs() + ". " + + log.error("Cannot retrieve files to process from the FS: {}. " + "There was an error executing the policy but the task tolerates this and continues. 
" + - "Error message: " + e.getMessage()); - return Collections.emptyList(); + e.getMessage(), policy.getURIs(), e); + return Stream.empty(); } } @@ -112,6 +114,7 @@ private SourceRecord convert(FileMetadata metadata, long offset, Struct struct) @Override public void stop() { + log.info("Stopping FS source task."); stop.set(true); if (policy != null) { policy.interrupt(); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java index a54814b..fec6b73 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java @@ -3,15 +3,19 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.util.Map; +import java.util.NoSuchElementException; import java.util.stream.Collectors; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; public abstract class AbstractFileReader implements FileReader { + protected final Logger log = LoggerFactory.getLogger(getClass()); private final FileSystem fs; @@ -21,7 +25,7 @@ public abstract class AbstractFileReader implements FileReader { public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter, Map config) { if (fs == null || filePath == null) { - throw new IllegalArgumentException("fileSystem and filePath are required"); + throw new IllegalArgumentException("File system and file path are required."); } this.fs = fs; this.filePath = filePath; @@ -29,6 +33,7 @@ public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter this.offset = 0; configure(readerConfig(config)); + log.trace("Initialized file reader {} for file {}", getClass(), filePath); } protected final Map readerConfig(Map config) { @@ -51,7 +56,16 @@ public Path getFilePath() { @Override public final Struct next() { - return adapter.apply(nextRecord()); + if (!hasNext()) { + throw new NoSuchElementException("There are no more records in file: " + getFilePath()); + } + try { + return adapter.apply(nextRecord()); + } catch (ConnectException ce) { + throw ce; + } catch (Exception e) { + throw new ConnectException("Error processing next record in file: " + getFilePath(), e); + } } @Override @@ -67,9 +81,50 @@ protected void setOffset(long offset) { this.offset = offset; } - protected abstract T nextRecord(); + @Override + public final void seek(long offset) { + if (offset < 0) { + throw new IllegalArgumentException("Record offset must be greater than 0."); + } + checkClosed(); + try { + log.debug("Seeking file {} to offset {}.", filePath, offset); + seekFile(offset); + } catch (ConnectException ce) { + throw ce; + } catch (IOException ioe) { + throw new ConnectException("Error seeking file: " + getFilePath(), ioe); + } + } + + @Override + public final boolean hasNext() { + checkClosed(); + try { + return hasNextRecord(); + } catch (ConnectException ce) { + throw ce; + } catch (Exception e) { + throw new ConnectException("Error when checking if the reader has more records.", e); + } + } protected ReaderAdapter getAdapter() { return adapter; } + + private void checkClosed() { + if (isClosed()) { + throw new ConnectException("File stream is closed!"); + } + } + + 
protected abstract T nextRecord() throws IOException; + + protected abstract boolean hasNextRecord() throws IOException; + + protected abstract void seekFile(long offset) throws IOException; + + protected abstract boolean isClosed(); + } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java index 478dacb..2630762 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AgnosticFileReader.java @@ -4,6 +4,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; import java.io.IOException; import java.util.Arrays; @@ -31,20 +32,17 @@ public class AgnosticFileReader extends AbstractFileReader parquetExtensions, avroExtensions, sequenceExtensions, jsonExtensions, csvExtensions, tsvExtensions, fixedExtensions; - public AgnosticFileReader(FileSystem fs, Path filePath, Map config) throws IOException { + public AgnosticFileReader(FileSystem fs, Path filePath, Map config) throws Exception { super(fs, filePath, new AgnosticAdapter(), config); try { reader = readerByExtension(fs, filePath, config); - } catch (RuntimeException | IOException e) { - throw e; - } catch (Throwable t) { - throw new IOException("An error has occurred when creating a concrete reader", t); + } catch (ConnectException ce) { + throw (Exception) ce.getCause(); } } - private AbstractFileReader readerByExtension(FileSystem fs, Path filePath, Map config) - throws Throwable { + private AbstractFileReader readerByExtension(FileSystem fs, Path filePath, Map config) { int index = filePath.getName().lastIndexOf('.'); String extension = index == -1 || index == filePath.getName().length() - 1 ? 
"" : filePath.getName().substring(index + 1).toLowerCase(); @@ -90,12 +88,12 @@ protected void configure(Map config) { } @Override - public boolean hasNext() { + public boolean hasNextRecord() { return reader.hasNext(); } @Override - public void seek(long offset) { + public void seekFile(long offset) { reader.seek(offset); } @@ -110,7 +108,12 @@ public void close() throws IOException { } @Override - protected AgnosticRecord nextRecord() { + public boolean isClosed() { + return reader.isClosed(); + } + + @Override + protected AgnosticRecord nextRecord() throws IOException { return new AgnosticRecord(reader.getAdapter(), reader.nextRecord()); } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java index 589ded7..3db8e3c 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java @@ -1,7 +1,6 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; import io.confluent.connect.avro.AvroData; -import org.apache.avro.AvroRuntimeException; import org.apache.avro.Schema; import org.apache.avro.file.DataFileReader; import org.apache.avro.generic.GenericRecord; @@ -11,7 +10,6 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.errors.ConnectException; import java.io.IOException; import java.util.Map; @@ -27,6 +25,7 @@ public class AvroFileReader extends AbstractFileReader { private final DataFileReader reader; private Schema schema; + private boolean closed; public AvroFileReader(FileSystem fs, Path filePath, Map config) throws IOException { super(fs, filePath, new GenericRecordToStruct(), config); @@ -37,6 +36,7 @@ public AvroFileReader(FileSystem fs, Path filePath, Map config) } else { this.reader = new DataFileReader<>(input, new SpecificDatumReader<>(this.schema)); } + this.closed = false; } @Override @@ -47,42 +47,36 @@ protected void configure(Map config) { } @Override - public boolean hasNext() { - try { - return reader.hasNext(); - } catch (AvroRuntimeException are) { - throw new IllegalStateException(are); - } + public boolean hasNextRecord() { + return reader.hasNext(); } @Override protected GenericRecord nextRecord() { - try { - GenericRecord record = reader.next(); - incrementOffset(); + GenericRecord record = reader.next(); + incrementOffset(); - return record; - } catch (AvroRuntimeException are) { - throw new IllegalStateException(are); - } + return record; } @Override - public void seek(long offset) { - try { - reader.sync(offset); - setOffset(reader.previousSync() - 16L); - } catch (IOException ioe) { - throw new ConnectException("Error seeking file " + getFilePath(), ioe); - } + public void seekFile(long offset) throws IOException { + reader.sync(offset); + setOffset(reader.previousSync() - 16L); } @Override public void close() throws IOException { + closed = true; reader.sync(0); reader.close(); } + @Override + public boolean isClosed() { + return closed; + } + static class GenericRecordToStruct implements ReaderAdapter { private static final int CACHE_SIZE = 100; diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java index 700d38b..3fabc01 100644 --- 
a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReader.java @@ -68,29 +68,25 @@ protected void configure(Map config) { mapper.configure(DeserializationFeature.valueOf(feature), Boolean.parseBoolean(entry.getValue())); } else { - log.warn("Ignoring deserialization configuration '" + feature + "' due to it does not exist."); + log.warn("Ignoring deserialization configuration '{}' due to it does not exist.", feature); } }); } @Override - protected JsonRecord nextRecord() { - try { - JsonNode value = mapper.readTree(inner.nextRecord().getValue()); - return new JsonRecord(schema, value); - } catch (IOException ioe) { - throw new IllegalStateException(ioe); - } + protected JsonRecord nextRecord() throws IOException { + JsonNode value = mapper.readTree(inner.nextRecord().getValue()); + return new JsonRecord(schema, value); } @Override - public boolean hasNext() { - return inner.hasNext(); + public boolean hasNextRecord() throws IOException { + return inner.hasNextRecord(); } @Override - public void seek(long offset) { - inner.seek(offset); + public void seekFile(long offset) throws IOException { + inner.seekFile(offset); } @Override @@ -103,6 +99,11 @@ public void close() throws IOException { inner.close(); } + @Override + public boolean isClosed() { + return inner.isClosed(); + } + private static Schema extractSchema(JsonNode jsonNode) { switch (jsonNode.getNodeType()) { case BOOLEAN: @@ -189,7 +190,7 @@ private Object mapValue(Schema schema, JsonNode value) { try { return value.binaryValue(); } catch (IOException ioe) { - throw new IllegalStateException(ioe); + throw new RuntimeException(ioe); } case OBJECT: case POJO: diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java index 7253a4b..0657d0b 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java @@ -8,7 +8,6 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.errors.ConnectException; import org.apache.parquet.avro.AvroParquetReader; import org.apache.parquet.avro.AvroReadSupport; import org.apache.parquet.hadoop.ParquetReader; @@ -16,7 +15,6 @@ import java.io.IOException; import java.util.Map; -import java.util.NoSuchElementException; import java.util.Optional; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; @@ -64,23 +62,15 @@ protected void configure(Map config) { } @Override - public boolean hasNext() { - if (closed) throw new IllegalStateException("Reader already closed."); + public boolean hasNextRecord() throws IOException { if (currentRecord == null) { - try { - currentRecord = reader.read(); - } catch (IOException ioe) { - throw new ConnectException("Error reading parquet record", ioe); - } + currentRecord = reader.read(); } return currentRecord != null; } @Override protected GenericRecord nextRecord() { - if (!hasNext()) { - throw new NoSuchElementException("There are no more records in file: " + getFilePath()); - } GenericRecord record; if (this.projection != null) { record = new GenericData.Record(this.projection); @@ -94,21 +84,11 @@ record = currentRecord; } @Override - public void seek(long offset) { - if 
(closed) { - throw new ConnectException("Stream is closed!"); - } - if (offset < 0) { - throw new IllegalArgumentException("Record offset must be greater than 0"); - } + public void seekFile(long offset) throws IOException { if (currentOffset() > offset) { - try { - this.reader = initReader(); - setOffset(0); - this.closed = false; - } catch (IOException ioe) { - throw new ConnectException("Error initializing parquet reader", ioe); - } + this.reader = initReader(); + this.closed = false; + setOffset(0); } while (hasNext() && currentOffset() < offset) { nextRecord(); @@ -117,10 +97,15 @@ public void seek(long offset) { @Override public void close() throws IOException { - this.closed = true; + closed = true; reader.close(); } + @Override + public boolean isClosed() { + return closed; + } + static class GenericRecordToStruct implements ReaderAdapter { private static final int CACHE_SIZE = 100; private final AvroData avroData; diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java index e21bdf2..3740db9 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java @@ -7,12 +7,10 @@ import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.data.SchemaBuilder; import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.errors.ConnectException; import java.io.EOFException; import java.io.IOException; import java.util.Map; -import java.util.NoSuchElementException; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; @@ -82,8 +80,7 @@ Schema getSchema(Writable writable) { } @Override - public boolean hasNext() { - if (closed) throw new IllegalStateException("Reader already closed."); + public boolean hasNextRecord() throws IOException { try { if (hasNextIndex == -1 || hasNextIndex == recordIndex) { hasNextIndex++; @@ -93,33 +90,21 @@ public boolean hasNext() { return hasNext; } catch (EOFException eofe) { return false; - } catch (IOException ioe) { - throw new ConnectException(ioe); } } @Override protected SequenceRecord nextRecord() { - if (!hasNext()) { - throw new NoSuchElementException("There are no more records in file: " + getFilePath()); - } recordIndex++; return new SequenceRecord<>(schema, keyFieldName, key, valueFieldName, value); } @Override - public void seek(long offset) { - if (offset < 0) { - throw new IllegalArgumentException("Record offset must be greater than 0"); - } - try { - reader.sync(offset); - hasNextIndex = recordIndex = offset; - hasNext = false; - setOffset(offset - 1); - } catch (IOException ioe) { - throw new ConnectException("Error seeking file " + getFilePath(), ioe); - } + public void seekFile(long offset) throws IOException { + reader.sync(offset); + hasNextIndex = recordIndex = offset; + hasNext = false; + setOffset(offset - 1); } @Override @@ -128,6 +113,11 @@ public void close() throws IOException { reader.close(); } + @Override + public boolean isClosed() { + return closed; + } + static class SeqToStruct implements ReaderAdapter> { @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java index 060de36..56f5581 100644 --- 
a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java @@ -7,13 +7,11 @@ import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.data.SchemaBuilder; import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.errors.ConnectException; import java.io.*; import java.nio.charset.Charset; import java.util.List; import java.util.Map; -import java.util.NoSuchElementException; import java.util.stream.Collectors; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; @@ -39,10 +37,12 @@ public class TextFileReader extends AbstractFileReader config) throws IOException { super(fs, filePath, new TxtToStruct(), config); this.reader = new LineNumberReader(getFileReader(fs.open(filePath))); + this.closed = false; } @Override @@ -78,39 +78,32 @@ private Reader getFileReader(InputStream inputStream) throws IOException { } @Override - public boolean hasNext() { + public boolean hasNextRecord() throws IOException { if (current != null) { return true; } else if (finished) { return false; } else { - try { - if (!recordPerLine) { - List lines = new BufferedReader(reader).lines().collect(Collectors.toList()); - current = String.join("\n", lines); + if (!recordPerLine) { + List lines = new BufferedReader(reader).lines().collect(Collectors.toList()); + current = String.join("\n", lines); + finished = true; + return true; + } + for (; ; ) { + String line = reader.readLine(); + if (line == null) { finished = true; - return true; - } - for (; ; ) { - String line = reader.readLine(); - if (line == null) { - finished = true; - return false; - } - current = line; - return true; + return false; } - } catch (IOException ioe) { - throw new IllegalStateException(ioe); + current = line; + return true; } } } @Override protected TextRecord nextRecord() { - if (!hasNext()) { - throw new NoSuchElementException("There are no more records in file: " + getFilePath()); - } String aux = current; current = null; incrementOffset(); @@ -118,31 +111,30 @@ protected TextRecord nextRecord() { } @Override - public void seek(long offset) { - if (offset < 0) { - throw new IllegalArgumentException("Record offset must be greater than 0"); + public void seekFile(long offset) throws IOException { + current = null; + if (offset < reader.getLineNumber()) { + finished = false; + reader.close(); + reader = new LineNumberReader(getFileReader(getFs().open(getFilePath()))); } - try { - current = null; - if (offset < reader.getLineNumber()) { - finished = false; - reader.close(); - reader = new LineNumberReader(getFileReader(getFs().open(getFilePath()))); - } - while (reader.getLineNumber() < offset) { - reader.readLine(); - } - setOffset(reader.getLineNumber()); - } catch (IOException ioe) { - throw new ConnectException("Error seeking file " + getFilePath(), ioe); + while (reader.getLineNumber() < offset) { + reader.readLine(); } + setOffset(reader.getLineNumber()); } @Override public void close() throws IOException { + closed = true; reader.close(); } + @Override + public boolean isClosed() { + return closed; + } + static class TxtToStruct implements ReaderAdapter { @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java index 050ba4c..490af2d 100644 --- 
a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java @@ -12,7 +12,6 @@ import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.data.SchemaBuilder; import org.apache.kafka.connect.data.Struct; -import org.apache.kafka.connect.errors.ConnectException; import java.io.IOException; import java.io.InputStream; @@ -20,7 +19,6 @@ import java.io.Reader; import java.nio.charset.Charset; import java.util.Map; -import java.util.NoSuchElementException; import java.util.Optional; import java.util.stream.IntStream; @@ -145,39 +143,28 @@ private ResultIterator iterateRecords() throws IOExcepti @Override protected final UnivocityRecord nextRecord() { - if (!hasNext()) throw new NoSuchElementException("There are no more records in file: " + getFilePath()); - incrementOffset(); Record record = iterator.next(); return new UnivocityRecord(schema, record.getValues()); } @Override - public final boolean hasNext() { - if (closed) throw new IllegalStateException("Reader already closed."); - + public final boolean hasNextRecord() { return iterator.hasNext(); } @Override - public final void seek(long offset) { - if (offset < 0) { - throw new IllegalArgumentException("Record offset must be greater than 0"); - } - try { - if (offset > currentOffset()) { - iterator.hasNext(); - iterator.getContext().skipLines(offset - currentOffset() - 1); - iterator.next(); - } else { - iterator = iterateRecords(); - iterator.hasNext(); - iterator.getContext().skipLines(offset); - } - setOffset(offset); - } catch (IOException ioe) { - throw new ConnectException("Error seeking file " + getFilePath(), ioe); + public final void seekFile(long offset) throws IOException { + if (offset > currentOffset()) { + iterator.hasNext(); + iterator.getContext().skipLines(offset - currentOffset() - 1); + iterator.next(); + } else { + iterator = iterateRecords(); + iterator.hasNext(); + iterator.getContext().skipLines(offset); } + setOffset(offset); } @Override @@ -186,6 +173,11 @@ public final void close() { closed = true; } + @Override + public final boolean isClosed() { + return closed; + } + static class UnivocityToStruct implements ReaderAdapter { @Override diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index b57d0c2..5908380 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -88,7 +88,6 @@ private String convert(String uri) { throw new IllegalArgumentException("Cannot convert dynamic URI: " + matcher.group(1), e); } } - return converted; } @@ -104,7 +103,7 @@ public List getURIs() { @Override public final Iterator execute() throws IOException { if (hasEnded()) { - throw new IllegalWorkerStateException("Policy has ended. Cannot be retried"); + throw new IllegalWorkerStateException("Policy has ended. 
Cannot be retried."); } preCheck(); @@ -163,7 +162,7 @@ public boolean hasNext() { @Override public FileMetadata next() { if (!hasNext() && current == null) { - throw new NoSuchElementException("There are no more items"); + throw new NoSuchElementException("There are no more items."); } FileMetadata metadata = toMetadata(current); current = null; @@ -198,7 +197,6 @@ public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorage .filter(fs -> metadata.getPath().startsWith(fs.getWorkingDirectory().toString())) .findFirst() .orElse(null); - try { FileReader reader = ReflectionUtils.makeReader( (Class) conf.getClass(FsSourceTaskConfig.FILE_READER_CLASS), @@ -209,8 +207,8 @@ public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorage reader.seek((Long) offset.get("offset")); } return reader; - } catch (Throwable t) { - throw new ConnectException("An error has occurred when creating reader for file: " + metadata.getPath(), t); + } catch (Exception e) { + throw new ConnectException("An error has occurred when creating reader for file: " + metadata.getPath(), e); } } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java index a6505a3..996d868 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java @@ -139,7 +139,7 @@ public void run() { } catch (FileNotFoundException fnfe) { log.warn("Cannot find file in this FS {}. Stopping watcher...", fs.getWorkingDirectory(), fnfe); } catch (IOException ioe) { - log.info("An interrupted exception has occurred. Path {} is not watched any more", fs.getWorkingDirectory()); + log.warn("An interrupted exception has occurred. Path {} is not watched any more", fs.getWorkingDirectory()); } catch (Exception ioe) { log.warn("Exception watching path {}", fs.getWorkingDirectory(), ioe); throw new IllegalWorkerStateException(ioe); @@ -153,6 +153,7 @@ private void enqueue(String path) throws IOException { return; } + log.debug("Enqueuing file to process {}", filePath); RemoteIterator it = fs.listFiles(filePath, false); while (it.hasNext()) { LocatedFileStatus status = it.next(); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java index 6b84ca3..04fa75c 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/util/ReflectionUtils.java @@ -6,6 +6,7 @@ import org.apache.commons.lang3.reflect.ConstructorUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.kafka.connect.errors.ConnectException; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; @@ -15,15 +16,15 @@ public class ReflectionUtils { public static FileReader makeReader(Class clazz, FileSystem fs, - Path path, Map config) throws Throwable { + Path path, Map config) { return make(clazz, fs, path, config); } - public static Policy makePolicy(Class clazz, FsSourceTaskConfig conf) throws Throwable { + public static Policy makePolicy(Class clazz, FsSourceTaskConfig conf) { return make(clazz, conf); } - private static T make(Class clazz, Object... args) throws Throwable { + private static T make(Class clazz, Object... 
args) { try { Class[] constClasses = Arrays.stream(args).map(Object::getClass).toArray(Class[]::new); @@ -32,7 +33,7 @@ private static T make(Class clazz, Object... args) throws Throwable { } catch (IllegalAccessException | InstantiationException | InvocationTargetException e) { - throw e.getCause(); + throw new ConnectException(e.getCause()); } } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java index bebeff7..5e9d59e 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReaderTest.java @@ -12,6 +12,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.MethodSource; @@ -68,7 +69,7 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throw @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readerWithSchema(ReaderFsTestConfig fsConfig) throws Throwable { + public void readerWithSchema(ReaderFsTestConfig fsConfig) throws IOException { Map readerConfig = getReaderConfig(); readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, schema.toString()); FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); @@ -78,12 +79,12 @@ public void readerWithSchema(ReaderFsTestConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readerWithInvalidSchema(ReaderFsTestConfig fsConfig) throws Throwable { + public void readerWithInvalidSchema(ReaderFsTestConfig fsConfig) throws IOException { Map readerConfig = getReaderConfig(); readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, Schema.create(Schema.Type.STRING).toString()); FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); - assertThrows(IllegalStateException.class, () -> readAllData(fsConfig)); + assertThrows(ConnectException.class, () -> readAllData(fsConfig)); assertThrows(AvroTypeException.class, () -> { try { readAllData(fsConfig); @@ -99,7 +100,14 @@ public void readerWithUnparseableSchema(ReaderFsTestConfig fsConfig) throws IOEx Map readerConfig = getReaderConfig(); readerConfig.put(AvroFileReader.FILE_READER_AVRO_SCHEMA, "invalid schema"); FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); - assertThrows(SchemaParseException.class, () -> getReader(testFs, fsConfig.getDataFile(), readerConfig)); + assertThrows(ConnectException.class, () -> getReader(testFs, fsConfig.getDataFile(), readerConfig)); + assertThrows(SchemaParseException.class, () -> { + try { + getReader(testFs, fsConfig.getDataFile(), readerConfig); + } catch (Exception e) { + throw e.getCause(); + } + }); } @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java index a6b9fbf..a1247d5 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java +++ 
b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java @@ -40,7 +40,7 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throw @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readAllDataWithMalformedRows(ReaderFsTestConfig fsConfig) throws Throwable { + public void readAllDataWithMalformedRows(ReaderFsTestConfig fsConfig) throws IOException { File tmp = File.createTempFile("test-", "." + getFileExtension()); try (FileWriter writer = new FileWriter(tmp)) { writer.append(FIELD_COLUMN1 + "," + FIELD_COLUMN2 + "," + FIELD_COLUMN3 + "," + FIELD_COLUMN4 + "\n"); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java index f91e9af..f21cf49 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FileReaderTestBase.java @@ -6,6 +6,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; @@ -43,7 +44,7 @@ public static void finishFs() throws IOException { } @BeforeEach - public void openReader() throws Throwable { + public void openReader() throws IOException { for (ReaderFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { fsConfig.setDataFile(createDataFile(fsConfig)); FileReader reader = ReflectionUtils.makeReader(getReaderClass(), fsConfig.getFs(), @@ -85,28 +86,49 @@ public void invalidArgs(ReaderFsTestConfig fsConfig) { @MethodSource("fileSystemConfigProvider") public void fileDoesNotExist(ReaderFsTestConfig fsConfig) { Path path = new Path(new Path(fsConfig.getFsUri()), UUID.randomUUID().toString()); - assertThrows(FileNotFoundException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig())); + assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig())); + assertThrows(FileNotFoundException.class, () -> { + try { + getReader(fsConfig.getFs(), path, getReaderConfig()); + } catch (Exception e) { + throw e.getCause(); + } + }); } @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void emptyFile(ReaderFsTestConfig fsConfig) throws Throwable { + public void emptyFile(ReaderFsTestConfig fsConfig) throws IOException { File tmp = File.createTempFile("test-", "." + getFileExtension()); Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - assertThrows(IOException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig())); + assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig())); + assertThrows(IOException.class, () -> { + try { + getReader(fsConfig.getFs(), path, getReaderConfig()); + } catch (Exception e) { + throw e.getCause(); + } + }); } @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws Throwable { + public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws IOException { File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { writer.write("test"); } Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - assertThrows(IOException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig())); + assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig())); + assertThrows(IOException.class, () -> { + try { + getReader(fsConfig.getFs(), path, getReaderConfig()); + } catch (Exception e) { + throw e.getCause(); + } + }); } @ParameterizedTest @@ -154,7 +176,7 @@ public void seekFile(ReaderFsTestConfig fsConfig) { @MethodSource("fileSystemConfigProvider") public void negativeSeek(ReaderFsTestConfig fsConfig) { FileReader reader = fsConfig.getReader(); - assertThrows(RuntimeException.class, () -> reader.seek(-1)); + assertThrows(IllegalArgumentException.class, () -> reader.seek(-1)); } @ParameterizedTest @@ -171,11 +193,12 @@ public void exceededSeek(ReaderFsTestConfig fsConfig) { public void readFileAlreadyClosed(ReaderFsTestConfig fsConfig) throws IOException { FileReader reader = fsConfig.getReader(); reader.close(); - assertThrows(IllegalStateException.class, reader::hasNext); - assertThrows(IllegalStateException.class, reader::next); + assertThrows(ConnectException.class, reader::hasNext); + assertThrows(ConnectException.class, reader::next); + assertThrows(ConnectException.class, () -> reader.seek(1)); } - protected final FileReader getReader(FileSystem fs, Path path, Map config) throws Throwable { + protected final FileReader getReader(FileSystem fs, Path path, Map config) { return ReflectionUtils.makeReader(getReaderClass(), fs, path, config); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java index fdc2422..98e7e5b 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/JsonFileReaderTest.java @@ -8,6 +8,7 @@ import com.fasterxml.jackson.databind.node.ObjectNode; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.MethodSource; @@ -77,7 +78,7 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throw @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void emptyFile(ReaderFsTestConfig fsConfig) throws Throwable { + public void emptyFile(ReaderFsTestConfig fsConfig) throws IOException { File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); @@ -87,7 +88,7 @@ public void emptyFile(ReaderFsTestConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void validFileEncoding(ReaderFsTestConfig fsConfig) throws Throwable { + public void validFileEncoding(ReaderFsTestConfig fsConfig) { Map readerConfig = getReaderConfig(); readerConfig.put(JsonFileReader.FILE_READER_JSON_ENCODING, "Cp1252"); fsConfig.setReader(getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); @@ -96,7 +97,7 @@ public void validFileEncoding(ReaderFsTestConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void invalidDeserializationConfig(ReaderFsTestConfig fsConfig) throws Throwable { + public void invalidDeserializationConfig(ReaderFsTestConfig fsConfig) { Map readerConfig = getReaderConfig(); readerConfig.put(JsonFileReader.FILE_READER_JSON_DESERIALIZATION_CONFIGS + "invalid", "false"); fsConfig.setReader(getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); @@ -108,13 +109,19 @@ public void invalidDeserializationConfig(ReaderFsTestConfig fsConfig) throws Thr public void invalidFileEncoding(ReaderFsTestConfig fsConfig) { Map readerConfig = getReaderConfig(); readerConfig.put(JsonFileReader.FILE_READER_JSON_ENCODING, "invalid_charset"); - assertThrows(UnsupportedCharsetException.class, () -> getReader(fsConfig.getFs(), - fsConfig.getDataFile(), readerConfig)); + assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); + assertThrows(UnsupportedCharsetException.class, () -> { + try { + getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); + } catch (Exception e) { + throw e.getCause(); + } + }); } @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readDataWithRecordPerLineDisabled(ReaderFsTestConfig fsConfig) throws Throwable { + public void readDataWithRecordPerLineDisabled(ReaderFsTestConfig fsConfig) throws IOException { Path file = createDataFile(fsConfig, 1, false); Map readerConfig = getReaderConfig(); readerConfig.put(JsonFileReader.FILE_READER_JSON_RECORD_PER_LINE, "false"); @@ -153,7 +160,7 @@ public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { } reader.close(); assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); - } catch (Throwable e) { + } catch (Exception e) { throw new RuntimeException(e); } }); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java index be5e831..30dd425 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReaderTest.java @@ -10,6 +10,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.DataException; import org.apache.parquet.avro.AvroParquetWriter; import org.apache.parquet.hadoop.ParquetFileWriter; @@ -76,7 +77,7 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... 
args) throw @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void emptyFile(ReaderFsTestConfig fsConfig) throws Throwable { + public void emptyFile(ReaderFsTestConfig fsConfig) throws IOException { File tmp = File.createTempFile("test-", "." + getFileExtension()); Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); @@ -85,7 +86,7 @@ public void emptyFile(ReaderFsTestConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws Throwable { + public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws IOException { File tmp = File.createTempFile("test-", "." + getFileExtension()); try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { writer.write("test"); @@ -97,7 +98,7 @@ public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readerWithSchema(ReaderFsTestConfig fsConfig) throws Throwable { + public void readerWithSchema(ReaderFsTestConfig fsConfig) throws IOException { Map readerConfig = getReaderConfig(); readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, readerSchema.toString()); readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); @@ -108,7 +109,7 @@ public void readerWithSchema(ReaderFsTestConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readerWithProjection(ReaderFsTestConfig fsConfig) throws Throwable { + public void readerWithProjection(ReaderFsTestConfig fsConfig) throws IOException { Map readerConfig = getReaderConfig(); readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_PROJECTION, projectionSchema.toString()); readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); @@ -126,7 +127,7 @@ public void readerWithProjection(ReaderFsTestConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readerWithInvalidProjection(ReaderFsTestConfig fsConfig) throws Throwable { + public void readerWithInvalidProjection(ReaderFsTestConfig fsConfig) throws IOException { Schema testSchema = SchemaBuilder.record("test_projection").namespace("test.avro") .fields() .name("field1").type("string").noDefault() @@ -136,18 +137,28 @@ public void readerWithInvalidProjection(ReaderFsTestConfig fsConfig) throws Thro readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); - assertThrows(InvalidRecordException.class, () -> readAllData(fsConfig)); + try { + readAllData(fsConfig); + } catch (Exception e) { + assertEquals(ConnectException.class, e.getClass()); + assertEquals(InvalidRecordException.class, e.getCause().getClass()); + } } @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readerWithInvalidSchema(ReaderFsTestConfig fsConfig) throws Throwable { + public void readerWithInvalidSchema(ReaderFsTestConfig fsConfig) throws IOException { Map readerConfig = getReaderConfig(); readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, Schema.create(Schema.Type.STRING).toString()); 
readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); FileSystem testFs = FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()); fsConfig.setReader(getReader(testFs, fsConfig.getDataFile(), readerConfig)); - assertThrows(AvroRuntimeException.class, () -> readAllData(fsConfig)); + try { + readAllData(fsConfig); + } catch (Exception e) { + assertEquals(ConnectException.class, e.getClass()); + assertEquals(AvroRuntimeException.class, e.getCause().getClass()); + } } @ParameterizedTest @@ -156,9 +167,17 @@ public void readerWithUnparseableSchema(ReaderFsTestConfig fsConfig) { Map readerConfig = getReaderConfig(); readerConfig.put(ParquetFileReader.FILE_READER_PARQUET_SCHEMA, "invalid schema"); readerConfig.put(AgnosticFileReader.FILE_READER_AGNOSTIC_EXTENSIONS_PARQUET, getFileExtension()); - assertThrows(SchemaParseException.class, () -> + assertThrows(ConnectException.class, () -> getReader(FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()), fsConfig.getDataFile(), readerConfig)); + assertThrows(SchemaParseException.class, () -> { + try { + getReader(FileSystem.newInstance(fsConfig.getFsUri(), new Configuration()), + fsConfig.getDataFile(), readerConfig); + } catch (Exception e) { + throw e.getCause(); + } + }); } @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java index cc62c0b..e70d3dd 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReaderTest.java @@ -62,7 +62,7 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throw @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void defaultFieldNames(ReaderFsTestConfig fsConfig) throws Throwable { + public void defaultFieldNames(ReaderFsTestConfig fsConfig) { Map readerConfig = getReaderConfig(); readerConfig.put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_KEY, null); readerConfig.put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_VALUE, null); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java index 5078d24..5e56ac6 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReaderTest.java @@ -2,6 +2,7 @@ import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.MethodSource; @@ -41,7 +42,7 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... 
args) throw @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void validFileEncoding(ReaderFsTestConfig fsConfig) throws Throwable { + public void validFileEncoding(ReaderFsTestConfig fsConfig) { Map readerConfig = getReaderConfig(); readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); readerConfig.put(TextFileReader.FILE_READER_TEXT_ENCODING, "Cp1252"); @@ -58,13 +59,19 @@ public void invalidFileEncoding(ReaderFsTestConfig fsConfig) { readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); readerConfig.put(TextFileReader.FILE_READER_TEXT_ENCODING, "invalid_charset"); readerConfig.put(TextFileReader.FILE_READER_TEXT_COMPRESSION_TYPE, COMPRESSION_TYPE_DEFAULT); - assertThrows(UnsupportedCharsetException.class, () -> getReader(fsConfig.getFs(), - fsConfig.getDataFile(), readerConfig)); + assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); + assertThrows(UnsupportedCharsetException.class, () -> { + try { + getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); + } catch (Exception e) { + throw e.getCause(); + } + }); } @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readDataWithRecordPerLineDisabled(ReaderFsTestConfig fsConfig) throws Throwable { + public void readDataWithRecordPerLineDisabled(ReaderFsTestConfig fsConfig) throws IOException { Path file = createDataFile(fsConfig, COMPRESSION_TYPE_DEFAULT); Map readerConfig = getReaderConfig(); readerConfig.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, FIELD_NAME_VALUE); @@ -105,7 +112,7 @@ public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { } reader.close(); assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); - } catch (Throwable e) { + } catch (Exception e) { throw new RuntimeException(e); } }); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java index 438bb1e..a5e8d9e 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java @@ -3,12 +3,14 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; +import org.apache.kafka.connect.errors.ConnectException; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.MethodSource; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; +import java.io.IOException; import java.lang.reflect.ParameterizedType; import java.nio.charset.UnsupportedCharsetException; import java.util.Arrays; @@ -28,7 +30,7 @@ abstract class UnivocityFileReaderTest extends Fi @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void emptyFile(ReaderFsTestConfig fsConfig) throws Throwable { + public void emptyFile(ReaderFsTestConfig fsConfig) throws IOException { File tmp = File.createTempFile("test-", "." 
+ getFileExtension()); Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); @@ -37,7 +39,7 @@ public void emptyFile(ReaderFsTestConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws Throwable { + public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws IOException { File tmp = File.createTempFile("test-", "." + getFileExtension()); try (BufferedWriter writer = new BufferedWriter(new FileWriter(tmp))) { writer.write("test"); @@ -62,7 +64,7 @@ public void invaliConfigArgs(ReaderFsTestConfig fsConfig) { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void readAllDataWithoutHeader(ReaderFsTestConfig fsConfig) throws Throwable { + public void readAllDataWithoutHeader(ReaderFsTestConfig fsConfig) throws IOException { Path file = createDataFile(fsConfig, false); Map readerConfig = getReaderConfig(); readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "false"); @@ -101,7 +103,7 @@ public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { } reader.close(); assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); - } catch (Throwable e) { + } catch (Exception e) { throw new RuntimeException(e); } }); @@ -109,7 +111,7 @@ public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void seekFileWithoutHeader(ReaderFsTestConfig fsConfig) throws Throwable { + public void seekFileWithoutHeader(ReaderFsTestConfig fsConfig) throws IOException { Path file = createDataFile(fsConfig, false); Map readerConfig = getReaderConfig(); readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "false"); @@ -141,7 +143,7 @@ public void seekFileWithoutHeader(ReaderFsTestConfig fsConfig) throws Throwable @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void validFileEncoding(ReaderFsTestConfig fsConfig) throws Throwable { + public void validFileEncoding(ReaderFsTestConfig fsConfig) { Map readerConfig = getReaderConfig(); readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); readerConfig.put(T.FILE_READER_DELIMITED_ENCODING, "Cp1252"); @@ -154,14 +156,19 @@ public void invalidFileEncoding(ReaderFsTestConfig fsConfig) { Map readerConfig = getReaderConfig(); readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); readerConfig.put(T.FILE_READER_DELIMITED_ENCODING, "invalid_charset"); - assertThrows(UnsupportedCharsetException.class, () -> getReader(fsConfig.getFs(), - fsConfig.getDataFile(), readerConfig)); + assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig)); + assertThrows(UnsupportedCharsetException.class, () -> { + try { + getReader(fsConfig.getFs(), fsConfig.getDataFile(), readerConfig); + } catch (Exception e) { + throw e.getCause(); + } + }); } @Override protected Class getReaderClass() { - return (Class) ((ParameterizedType) this.getClass().getGenericSuperclass()) - .getActualTypeArguments()[0]; + return (Class) ((ParameterizedType) this.getClass().getGenericSuperclass()).getActualTypeArguments()[0]; } @Override diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java index 5de4e95..72bac98 100644 --- 
a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicyTest.java @@ -5,6 +5,7 @@ import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import org.apache.hadoop.fs.Path; import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.IllegalWorkerStateException; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.MethodSource; @@ -53,8 +54,17 @@ public void invalidCronExpression(PolicyFsTestConfig fsConfig) { Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(CronPolicy.CRON_POLICY_EXPRESSION, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConfigException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); } @ParameterizedTest @@ -63,15 +73,24 @@ public void invalidEndDate(PolicyFsTestConfig fsConfig) { Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(CronPolicy.CRON_POLICY_END_DATE, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConfigException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); } @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void canBeInterrupted(PolicyFsTestConfig fsConfig) throws Throwable { - Policy policy = ReflectionUtils.makePolicy( - (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), + public void canBeInterrupted(PolicyFsTestConfig fsConfig) throws IOException { + Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), fsConfig.getSourceTaskConfig()); for (int i = 0; i < 5; i++) { diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java index 6aa77b1..6af841b 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java @@ -6,6 +6,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.IllegalWorkerStateException; import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; @@ -47,11 +48,11 @@ public static void 
finishFs() throws IOException { } @BeforeEach - public void initPolicy() throws Throwable { + public void initPolicy() { for (PolicyFsTestConfig fsConfig : TEST_FILE_SYSTEMS) { FsSourceTaskConfig sourceTaskConfig = buildSourceTaskConfig(fsConfig.getDirectories()); - Policy policy = ReflectionUtils.makePolicy( - (Class) sourceTaskConfig.getClass(FsSourceTaskConfig.POLICY_CLASS), sourceTaskConfig); + Policy policy = ReflectionUtils.makePolicy((Class) sourceTaskConfig + .getClass(FsSourceTaskConfig.POLICY_CLASS), sourceTaskConfig); fsConfig.setSourceTaskConfig(sourceTaskConfig); fsConfig.setPolicy(policy); } @@ -83,14 +84,15 @@ public void invalidArgs(PolicyFsTestConfig fsConfig) { @ParameterizedTest @MethodSource("fileSystemConfigProvider") public void invalidConfig(PolicyFsTestConfig fsConfig) { - assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), - new FsSourceTaskConfig(new HashMap<>()))); + assertThrows(ConfigException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), + new FsSourceTaskConfig(new HashMap<>()))); } @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void interruptPolicy(PolicyFsTestConfig fsConfig) throws Throwable { + public void interruptPolicy(PolicyFsTestConfig fsConfig) throws IOException { fsConfig.getPolicy().execute(); fsConfig.getPolicy().interrupt(); assertTrue(fsConfig.getPolicy().hasEnded()); @@ -172,14 +174,14 @@ public void execPolicyAlreadyEnded(PolicyFsTestConfig fsConfig) throws IOExcepti @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void dynamicURIs(PolicyFsTestConfig fsConfig) throws Throwable { + public void dynamicURIs(PolicyFsTestConfig fsConfig) throws IOException { Path dynamic = new Path(fsConfig.getFsUri().toString(), "${G}/${yyyy}/${MM}/${W}"); fsConfig.getFs().create(dynamic); Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(FsSourceTaskConfig.FS_URIS, dynamic.toString()); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - Policy policy = ReflectionUtils.makePolicy( - (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); fsConfig.setPolicy(policy); assertEquals(1, fsConfig.getPolicy().getURIs().size()); @@ -200,14 +202,23 @@ public void dynamicURIs(PolicyFsTestConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void invalidDynamicURIs(PolicyFsTestConfig fsConfig) throws Throwable { + public void invalidDynamicURIs(PolicyFsTestConfig fsConfig) throws IOException { Path dynamic = new Path(fsConfig.getFsUri().toString(), "${yyyy}/${MM}/${mmmmmmm}"); fsConfig.getFs().create(dynamic); Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(FsSourceTaskConfig.FS_URIS, dynamic.toString()); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - assertThrows(IllegalArgumentException.class, () -> ReflectionUtils.makePolicy( - (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(IllegalArgumentException.class, () -> { + try { + 
ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); } protected abstract FsSourceTaskConfig buildSourceTaskConfig(List directories); diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java index 9748d15..65c41c7 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/SleepyPolicyTest.java @@ -5,9 +5,11 @@ import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import org.apache.hadoop.fs.Path; import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.connect.errors.ConnectException; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.MethodSource; +import java.io.IOException; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -40,8 +42,17 @@ public void invalidSleepTime(PolicyFsTestConfig fsConfig) { Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConfigException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); } @ParameterizedTest @@ -50,8 +61,17 @@ public void invalidMaxExecs(PolicyFsTestConfig fsConfig) { Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConfigException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); } @ParameterizedTest @@ -60,20 +80,29 @@ public void invalidSleepFraction(PolicyFsTestConfig fsConfig) { Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); originals.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_FRACTION, "invalid"); FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); - assertThrows(ConfigException.class, () -> ReflectionUtils.makePolicy( - (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConfigException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); 
+ } catch (Exception e) { + throw e.getCause(); + } + }); } @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void sleepExecution(PolicyFsTestConfig fsConfig) throws Throwable { + public void sleepExecution(PolicyFsTestConfig fsConfig) throws IOException { Map tConfig = fsConfig.getSourceTaskConfig().originalsStrings(); tConfig.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "1000"); tConfig.put(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS, "2"); FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - Policy policy = ReflectionUtils.makePolicy( - (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); + Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); assertFalse(policy.hasEnded()); policy.execute(); assertFalse(policy.hasEnded()); @@ -83,14 +112,14 @@ public void sleepExecution(PolicyFsTestConfig fsConfig) throws Throwable { @ParameterizedTest @MethodSource("fileSystemConfigProvider") - public void defaultExecutions(PolicyFsTestConfig fsConfig) throws Throwable { + public void defaultExecutions(PolicyFsTestConfig fsConfig) throws IOException { Map tConfig = fsConfig.getSourceTaskConfig().originalsStrings(); tConfig.put(SleepyPolicy.SLEEPY_POLICY_SLEEP_MS, "1"); tConfig.remove(SleepyPolicy.SLEEPY_POLICY_MAX_EXECS); FsSourceTaskConfig sleepConfig = new FsSourceTaskConfig(tConfig); - Policy policy = ReflectionUtils.makePolicy( - (Class) fsConfig.getSourceTaskConfig().getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); + Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), sleepConfig); //it never ends for (int i = 0; i < 100; i++) { diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java index 8dd610a..b4b5a4e 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/task/FsSourceTaskTest.java @@ -81,11 +81,11 @@ public void initTask() { EasyMock.expect(offsetStorageReader.offset(EasyMock.anyObject())) .andReturn(new HashMap() {{ - put("offset", 5L); + put("offset", (long) (NUM_RECORDS / 2)); }}); EasyMock.expect(offsetStorageReader.offset(EasyMock.anyObject())) .andReturn(new HashMap() {{ - put("offset", 5L); + put("offset", (long) (NUM_RECORDS / 2)); }}); EasyMock.checkOrder(taskContext, false); @@ -152,7 +152,7 @@ public void oneFilePerFs(TaskFsTestConfig fsConfig) throws IOException { fsConfig.getTask().start(fsConfig.getTaskConfig()); List records = fsConfig.getTask().poll(); - assertEquals(10, records.size()); + assertEquals((NUM_RECORDS * fsConfig.getDirectories().size()) / 2, records.size()); checkRecords(records); //policy has ended assertNull(fsConfig.getTask().poll()); diff --git a/src/test/resources/log4j.properties b/src/test/resources/log4j.properties index 493f160..bb7782f 100644 --- a/src/test/resources/log4j.properties +++ b/src/test/resources/log4j.properties @@ -7,6 +7,7 @@ log4j.appender.stdout.Target=System.out log4j.appender.stdout.layout=org.apache.log4j.PatternLayout log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c:%L - %m%n +log4j.logger.com.github.mmolimar.kafka.connect.fs=TRACE log4j.logger.org.apache.hadoop=ERROR log4j.logger.BlockStateChange=WARN log4j.logger.org.apache.parquet=WARN From 
543a3a3b91634d1675316f86a9d797096afb3c40 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Fri, 17 Apr 2020 21:03:35 -0500 Subject: [PATCH 37/51] Schema support for Univocity file readers --- .../fs/file/reader/UnivocityFileReader.java | 124 ++++++++++++++++-- .../fs/file/reader/CsvFileReaderTest.java | 37 ++++-- .../file/reader/FixedWidthFileReaderTest.java | 26 ++-- .../fs/file/reader/TsvFileReaderTest.java | 12 +- .../file/reader/UnivocityFileReaderTest.java | 118 ++++++++++++++++- 5 files changed, 276 insertions(+), 41 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java index 490af2d..fb93116 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java @@ -18,8 +18,11 @@ import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.stream.Collectors; import java.util.stream.IntStream; import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX; @@ -34,6 +37,8 @@ abstract class UnivocityFileReader> protected static final String FILE_READER_DELIMITED_SETTINGS_FORMAT = FILE_READER_DELIMITED_SETTINGS + "format."; public static final String FILE_READER_DELIMITED_SETTINGS_HEADER = FILE_READER_DELIMITED_SETTINGS + "header"; + public static final String FILE_READER_DELIMITED_SETTINGS_SCHEMA = FILE_READER_DELIMITED_SETTINGS + "schema"; + public static final String FILE_READER_DELIMITED_SETTINGS_DATA_TYPE_MAPPING_ERROR = FILE_READER_DELIMITED_SETTINGS + "data_type_mapping_error"; public static final String FILE_READER_DELIMITED_SETTINGS_HEADER_NAMES = FILE_READER_DELIMITED_SETTINGS + "header_names"; public static final String FILE_READER_DELIMITED_SETTINGS_LINE_SEPARATOR_DETECTION = FILE_READER_DELIMITED_SETTINGS + "line_separator_detection"; public static final String FILE_READER_DELIMITED_SETTINGS_NULL_VALUE = FILE_READER_DELIMITED_SETTINGS + "null_value"; @@ -56,28 +61,43 @@ abstract class UnivocityFileReader> private Schema schema; private Charset charset; private CompressionType compression; + private boolean dataTypeMappingError; private boolean closed; private ResultIterator iterator; + public enum DataType { + BYTE, + SHORT, + INT, + LONG, + FLOAT, + DOUBLE, + BOOLEAN, + BYTES, + STRING + } + public UnivocityFileReader(FileSystem fs, Path filePath, Map config) throws IOException { super(fs, filePath, new UnivocityToStruct(), config); this.iterator = iterateRecords(); - this.schema = buildSchema(this.iterator, settings.isHeaderExtractionEnabled()); + this.schema = buildSchema(this.iterator, settings.isHeaderExtractionEnabled(), config); } - private Schema buildSchema(ResultIterator it, boolean hasHeader) { + private Schema buildSchema(ResultIterator it, boolean hasHeader, Map config) { SchemaBuilder builder = SchemaBuilder.struct(); if (it.hasNext() && !hasHeader) { Record first = it.next(); + List dataTypes = getDataTypes(config, first.getValues()); IntStream.range(0, first.getValues().length) - .forEach(index -> builder.field(DEFAULT_COLUMN_NAME + ++index, SchemaBuilder.STRING_SCHEMA)); + .forEach(index -> builder.field(DEFAULT_COLUMN_NAME + (index + 1), dataTypes.get(index))); seek(0); } else if (hasHeader) { 
Optional.ofNullable(it.getContext().headers()).ifPresent(headers -> { + List dataTypes = getDataTypes(config, headers); IntStream.range(0, headers.length) - .forEach(index -> builder.field(headers[index], SchemaBuilder.STRING_SCHEMA)); + .forEach(index -> builder.field(headers[index], dataTypes.get(index))); }); } return builder.build(); @@ -91,6 +111,49 @@ protected void configure(Map config) { this.compression = CompressionType.fromName(cType, concatenated); this.charset = Charset.forName(config.getOrDefault(FILE_READER_DELIMITED_ENCODING, Charset.defaultCharset().name())); this.settings = allSettings(config); + this.dataTypeMappingError = Boolean.parseBoolean( + config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_DATA_TYPE_MAPPING_ERROR, "true")); + } + + private List getDataTypes(Map config, String[] headers) { + List dataTypes = Arrays + .stream(config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_SCHEMA, "").toString().split(",")) + .filter(dt -> !dt.trim().isEmpty()) + .map(this::strToSchema) + .collect(Collectors.toList()); + if (dataTypes.size() > 0 && dataTypes.size() != headers.length) { + throw new IllegalArgumentException("The schema defined in property '" + FILE_READER_DELIMITED_SETTINGS_SCHEMA + + "' does not match the number of fields inferred in the file."); + } else if (dataTypes.size() == 0) { + return IntStream.range(0, headers.length) + .mapToObj(index -> Schema.STRING_SCHEMA) + .collect(Collectors.toList()); + } + return dataTypes; + } + + private Schema strToSchema(String dataType) { + switch (DataType.valueOf(dataType.trim().toUpperCase())) { + case BYTE: + return this.dataTypeMappingError ? Schema.INT8_SCHEMA : Schema.OPTIONAL_INT8_SCHEMA; + case SHORT: + return this.dataTypeMappingError ? Schema.INT16_SCHEMA : Schema.OPTIONAL_INT16_SCHEMA; + case INT: + return this.dataTypeMappingError ? Schema.INT32_SCHEMA : Schema.OPTIONAL_INT32_SCHEMA; + case LONG: + return this.dataTypeMappingError ? Schema.INT64_SCHEMA : Schema.OPTIONAL_INT64_SCHEMA; + case FLOAT: + return this.dataTypeMappingError ? Schema.FLOAT32_SCHEMA : Schema.OPTIONAL_FLOAT32_SCHEMA; + case DOUBLE: + return this.dataTypeMappingError ? Schema.FLOAT64_SCHEMA : Schema.OPTIONAL_FLOAT64_SCHEMA; + case BOOLEAN: + return this.dataTypeMappingError ? Schema.BOOLEAN_SCHEMA : Schema.OPTIONAL_BOOLEAN_SCHEMA; + case BYTES: + return this.dataTypeMappingError ? Schema.BYTES_SCHEMA : Schema.OPTIONAL_BYTES_SCHEMA; + case STRING: + default: + return this.dataTypeMappingError ? 
Schema.STRING_SCHEMA : Schema.OPTIONAL_STRING_SCHEMA; + } } private T allSettings(Map config) { @@ -144,8 +207,7 @@ private ResultIterator iterateRecords() throws IOExcepti @Override protected final UnivocityRecord nextRecord() { incrementOffset(); - Record record = iterator.next(); - return new UnivocityRecord(schema, record.getValues()); + return new UnivocityRecord(schema, iterator.next(), dataTypeMappingError); } @Override @@ -184,19 +246,59 @@ static class UnivocityToStruct implements ReaderAdapter { public Struct apply(UnivocityRecord record) { Struct struct = new Struct(record.schema); IntStream.range(0, record.schema.fields().size()) - .filter(index -> index < record.values.length) - .forEach(index -> struct.put(record.schema.fields().get(index).name(), record.values[index])); + .filter(index -> index < record.value.getValues().length) + .forEach(index -> { + Schema.Type type = record.schema.fields().get(index).schema().type(); + String fieldName = record.schema.fields().get(index).name(); + struct.put(fieldName, mapDatatype(type, record.value, index, record.dataTypeMappingError)); + }); return struct; } + + private Object mapDatatype(Schema.Type type, Record record, int fieldIndex, boolean dataTypeMappingError) { + try { + switch (type) { + case INT8: + return record.getByte(fieldIndex); + case INT16: + return record.getShort(fieldIndex); + case INT32: + return record.getInt(fieldIndex); + case INT64: + return record.getLong(fieldIndex); + case FLOAT32: + return record.getFloat(fieldIndex); + case FLOAT64: + return record.getDouble(fieldIndex); + case BOOLEAN: + return record.getBoolean(fieldIndex); + case BYTES: + return record.getString(fieldIndex).getBytes(); + case ARRAY: + case MAP: + case STRUCT: + case STRING: + default: + return record.getString(fieldIndex); + } + } catch (RuntimeException re) { + if (dataTypeMappingError) { + throw re; + } + return null; + } + } } static class UnivocityRecord { private final Schema schema; - private final String[] values; + private final Record value; + private final boolean dataTypeMappingError; - UnivocityRecord(Schema schema, String[] values) { + UnivocityRecord(Schema schema, Record value, boolean dataTypeMappingError) { this.schema = schema; - this.values = values; + this.value = value; + this.dataTypeMappingError = dataTypeMappingError; } } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java index a1247d5..3eba9c0 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/CsvFileReaderTest.java @@ -11,7 +11,6 @@ import java.io.PrintWriter; import java.util.HashMap; import java.util.Map; -import java.util.UUID; import java.util.stream.IntStream; import static org.junit.jupiter.api.Assertions.*; @@ -25,11 +24,15 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throw File txtFile = File.createTempFile("test-", "." 
+ getFileExtension()); try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { if (header) { - writer.append(FIELD_COLUMN1 + "#" + FIELD_COLUMN2 + "#" + FIELD_COLUMN3 + "#" + FIELD_COLUMN4 + "\n"); + String headerValue = String.join("#", FIELD_COLUMN1, FIELD_COLUMN2, FIELD_COLUMN3, FIELD_COLUMN4, + FIELD_COLUMN5, FIELD_COLUMN6, FIELD_COLUMN7, FIELD_COLUMN8, FIELD_COLUMN9); + writer.append(headerValue + "\n"); } IntStream.range(0, NUM_RECORDS).forEach(index -> { - String value = String.format("%d_%s", index, UUID.randomUUID()); - writer.append(value + "#" + value + "#" + value + "#" + value + "\n"); + String value = String.format("%d#%d#%d#%d#%f#%f#%s#%s#%s\n", + (byte) 2, (short) 4, 8, 16L, 32.32f, 64.64d, + true, "test bytes", "test string"); + writer.append(value); fsConfig.offsetsByIndex().put(index, (long) index); }); } @@ -43,16 +46,18 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throw public void readAllDataWithMalformedRows(ReaderFsTestConfig fsConfig) throws IOException { File tmp = File.createTempFile("test-", "." + getFileExtension()); try (FileWriter writer = new FileWriter(tmp)) { - writer.append(FIELD_COLUMN1 + "," + FIELD_COLUMN2 + "," + FIELD_COLUMN3 + "," + FIELD_COLUMN4 + "\n"); - writer.append("dummy,\"\",,dummy\n"); + String headerValue = String.join(",", FIELD_COLUMN1, FIELD_COLUMN2, FIELD_COLUMN3, FIELD_COLUMN4, + FIELD_COLUMN5, FIELD_COLUMN6, FIELD_COLUMN7, FIELD_COLUMN8, FIELD_COLUMN9); + writer.append(headerValue + "\n"); + writer.append(",\"\",,,,,true,test bytes,test string\n"); writer.append("#comment\n"); - writer.append("dummy,\"\",,dummy\n"); + writer.append(",\"\",,,,,true,test bytes,test string\n"); } Map readerConfig = getReaderConfig(); readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER, ","); readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); - readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_EMPTY_VALUE, "empty_value"); - readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_NULL_VALUE, "null_value"); + readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_EMPTY_VALUE, "10"); + readerConfig.put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_NULL_VALUE, "100"); Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); @@ -64,10 +69,15 @@ public void readAllDataWithMalformedRows(ReaderFsTestConfig fsConfig) throws IOE while (reader.hasNext()) { Struct record = reader.next(); assertAll( - () -> assertEquals("dummy", record.get(FIELD_COLUMN1)), - () -> assertEquals("empty_value", record.get(FIELD_COLUMN2)), - () -> assertEquals("null_value", record.get(FIELD_COLUMN3)), - () -> assertEquals("dummy", record.get(FIELD_COLUMN4)) + () -> assertEquals(record.get(FIELD_COLUMN1), (byte) 100), + () -> assertEquals(record.get(FIELD_COLUMN2), (short) 10), + () -> assertEquals(record.get(FIELD_COLUMN3), 100), + () -> assertEquals(record.get(FIELD_COLUMN4), 100L), + () -> assertEquals(record.get(FIELD_COLUMN5), 100.00f), + () -> assertEquals(record.get(FIELD_COLUMN6), 100.00d), + () -> assertEquals(record.get(FIELD_COLUMN7), true), + () -> assertEquals(new String((byte[]) record.get(FIELD_COLUMN8)), "test bytes"), + () -> assertEquals(record.get(FIELD_COLUMN9), "test string") ); recordCount++; } @@ -79,6 +89,7 @@ protected Map getReaderConfig() { return new HashMap() {{ put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_FORMAT_DELIMITER, "#"); 
put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + put(CsvFileReader.FILE_READER_DELIMITED_SETTINGS_SCHEMA, "byte,short,int,long,float,double,boolean,bytes,string"); }}; } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReaderTest.java index 6f0ff01..8b1fedc 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/FixedWidthFileReaderTest.java @@ -8,13 +8,12 @@ import java.util.Arrays; import java.util.HashMap; import java.util.Map; -import java.util.UUID; import java.util.stream.Collectors; import java.util.stream.IntStream; public class FixedWidthFileReaderTest extends UnivocityFileReaderTest { - private static final int[] fieldLengths = new int[]{45, 53, 71, 89}; + private static final int[] fieldLengths = new int[]{45, 53, 71, 89, 14, 44, 67, 46, 75}; @Override protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException { @@ -26,14 +25,24 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throw writer.append(String.format("%-" + fieldLengths[0] + "s", FIELD_COLUMN1) + String.format("%-" + fieldLengths[1] + "s", FIELD_COLUMN2) + String.format("%-" + fieldLengths[2] + "s", FIELD_COLUMN3) + - String.format("%-" + fieldLengths[3] + "s", FIELD_COLUMN4) + "\n"); + String.format("%-" + fieldLengths[3] + "s", FIELD_COLUMN4) + + String.format("%-" + fieldLengths[4] + "s", FIELD_COLUMN5) + + String.format("%-" + fieldLengths[5] + "s", FIELD_COLUMN6) + + String.format("%-" + fieldLengths[6] + "s", FIELD_COLUMN7) + + String.format("%-" + fieldLengths[7] + "s", FIELD_COLUMN8) + + String.format("%-" + fieldLengths[8] + "s", FIELD_COLUMN9) + "\n"); } IntStream.range(0, NUM_RECORDS).forEach(index -> { - String value = String.format("%d_%s", index, UUID.randomUUID()); - writer.append(String.format("%-" + fieldLengths[0] + "s", value) + - String.format("%-" + fieldLengths[1] + "s", value) + - String.format("%-" + fieldLengths[2] + "s", value) + - String.format("%-" + fieldLengths[3] + "s", value) + "\n"); + writer.append(String.format("%-" + fieldLengths[0] + "s", String.format("%d", (byte) 2)) + + String.format("%-" + fieldLengths[1] + "s", String.format("%d", (short) 4)) + + String.format("%-" + fieldLengths[2] + "s", String.format("%d", 8)) + + String.format("%-" + fieldLengths[3] + "s", String.format("%d", 16L)) + + String.format("%-" + fieldLengths[4] + "s", String.format("%f", 32.32f)) + + String.format("%-" + fieldLengths[5] + "s", String.format("%f", 64.64d)) + + String.format("%-" + fieldLengths[6] + "s", String.format("%s", true)) + + String.format("%-" + fieldLengths[7] + "s", String.format("%s", "test bytes")) + + String.format("%-" + fieldLengths[8] + "s", String.format("%s", "test string")) + "\n" + ); fsConfig.offsetsByIndex().put(index, (long) index); }); } @@ -48,6 +57,7 @@ protected Map getReaderConfig() { put(FixedWidthFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); put(FixedWidthFileReader.FILE_READER_DELIMITED_SETTINGS_FIELD_LENGTHS, Arrays.stream(fieldLengths).mapToObj(String::valueOf).collect(Collectors.joining(","))); + put(FixedWidthFileReader.FILE_READER_DELIMITED_SETTINGS_SCHEMA, "byte,short,int,long,float,double,boolean,bytes,string"); }}; } diff --git 
a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java index 2f94f28..d82a50e 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/TsvFileReaderTest.java @@ -7,7 +7,6 @@ import java.io.PrintWriter; import java.util.HashMap; import java.util.Map; -import java.util.UUID; import java.util.stream.IntStream; public class TsvFileReaderTest extends UnivocityFileReaderTest { @@ -19,11 +18,15 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throw File txtFile = File.createTempFile("test-", "." + getFileExtension()); try (PrintWriter writer = new PrintWriter(getOutputStream(txtFile, compression))) { if (header) { - writer.append(FIELD_COLUMN1 + "\t" + FIELD_COLUMN2 + "\t" + FIELD_COLUMN3 + "\t" + FIELD_COLUMN4 + "\n"); + String headerValue = String.join("\t", FIELD_COLUMN1, FIELD_COLUMN2, FIELD_COLUMN3, FIELD_COLUMN4, + FIELD_COLUMN5, FIELD_COLUMN6, FIELD_COLUMN7, FIELD_COLUMN8, FIELD_COLUMN9); + writer.append(headerValue + "\n"); } IntStream.range(0, NUM_RECORDS).forEach(index -> { - String value = String.format("%d_%s", index, UUID.randomUUID()); - writer.append(value + "\t" + value + "\t" + value + "\t" + value + "\n"); + String value = String.format("%d\t%d\t%d\t%d\t%f\t%f\t%s\t%s\t%s\n", + (byte) 2, (short) 4, 8, 16L, 32.32f, 64.64d, + true, "test bytes", "test string"); + writer.append(value); fsConfig.offsetsByIndex().put(index, (long) index); }); } @@ -36,6 +39,7 @@ protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throw protected Map getReaderConfig() { return new HashMap() {{ put(TsvFileReader.FILE_READER_DELIMITED_SETTINGS_HEADER, "true"); + put(TsvFileReader.FILE_READER_DELIMITED_SETTINGS_SCHEMA, "byte,short,int,long,float,double,boolean,bytes,string"); }}; } } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java index a5e8d9e..79663bc 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReaderTest.java @@ -1,5 +1,6 @@ package com.github.mmolimar.kafka.connect.fs.file.reader; +import com.univocity.parsers.common.DataProcessingException; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.kafka.connect.data.Struct; @@ -25,6 +26,11 @@ abstract class UnivocityFileReaderTest extends Fi protected static final String FIELD_COLUMN2 = "column_2"; protected static final String FIELD_COLUMN3 = "column_3"; protected static final String FIELD_COLUMN4 = "column_4"; + protected static final String FIELD_COLUMN5 = "column_5"; + protected static final String FIELD_COLUMN6 = "column_6"; + protected static final String FIELD_COLUMN7 = "column_7"; + protected static final String FIELD_COLUMN8 = "column_8"; + protected static final String FIELD_COLUMN9 = "column_9"; protected static final String FILE_EXTENSION = "tcsv"; protected static final CompressionType COMPRESSION_TYPE_DEFAULT = CompressionType.NONE; @@ -46,7 +52,14 @@ public void invalidFileFormat(ReaderFsTestConfig fsConfig) throws IOException { } Path path = new Path(new Path(fsConfig.getFsUri()), tmp.getName()); 
fsConfig.getFs().moveFromLocalFile(new Path(tmp.getAbsolutePath()), path); - getReader(fsConfig.getFs(), path, getReaderConfig()); + assertThrows(ConnectException.class, () -> getReader(fsConfig.getFs(), path, getReaderConfig())); + assertThrows(IllegalArgumentException.class, () -> { + try { + getReader(fsConfig.getFs(), path, getReaderConfig()); + } catch (Exception ce) { + throw ce.getCause(); + } + }); } @ParameterizedTest @@ -81,6 +94,68 @@ public void readAllDataWithoutHeader(ReaderFsTestConfig fsConfig) throws IOExcep assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); } + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readAllDataWithoutSchema(ReaderFsTestConfig fsConfig) throws IOException { + Path file = createDataFile(fsConfig, true); + Map readerConfig = getReaderConfig(); + readerConfig.remove(T.FILE_READER_DELIMITED_SETTINGS_SCHEMA); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkDataString(record); + recordCount++; + } + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readAllDataWithMappingErrors(ReaderFsTestConfig fsConfig) throws IOException { + Path file = createDataFile(fsConfig, true); + Map readerConfig = getReaderConfig(); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_SCHEMA, "boolean,boolean,boolean,boolean,boolean,boolean,int,long,double"); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + try { + reader.next(); + } catch (Exception e) { + assertEquals(ConnectException.class, e.getClass()); + assertEquals(DataProcessingException.class, e.getCause().getClass()); + } + recordCount++; + } + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void readAllDataToleratingMappingErrors(ReaderFsTestConfig fsConfig) throws IOException { + Path file = createDataFile(fsConfig, true); + Map readerConfig = getReaderConfig(); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_SCHEMA, "boolean,boolean,boolean,boolean,boolean,boolean,int,long,double"); + readerConfig.put(T.FILE_READER_DELIMITED_SETTINGS_DATA_TYPE_MAPPING_ERROR, "false"); + FileReader reader = getReader(fsConfig.getFs(), file, readerConfig); + + assertTrue(reader.hasNext()); + + int recordCount = 0; + while (reader.hasNext()) { + Struct record = reader.next(); + checkDataNull(record); + recordCount++; + } + assertEquals(NUM_RECORDS, recordCount, "The number of records in the file does not match"); + } + @ParameterizedTest @MethodSource("fileSystemConfigProvider") public void readDifferentCompressionTypes(ReaderFsTestConfig fsConfig) { @@ -174,10 +249,43 @@ protected Class getReaderClass() { @Override protected void checkData(Struct record, long index) { assertAll( - () -> assertTrue(record.get(FIELD_COLUMN1).toString().startsWith(index + "_")), - () -> assertTrue(record.get(FIELD_COLUMN2).toString().startsWith(index + "_")), - () -> assertTrue(record.get(FIELD_COLUMN3).toString().startsWith(index + "_")), - () -> assertTrue(record.get(FIELD_COLUMN4).toString().startsWith(index + "_")) + () -> assertEquals(record.get(FIELD_COLUMN1), (byte) 
2), + () -> assertEquals(record.get(FIELD_COLUMN2), (short) 4), + () -> assertEquals(record.get(FIELD_COLUMN3), 8), + () -> assertEquals(record.get(FIELD_COLUMN4), 16L), + () -> assertEquals(record.get(FIELD_COLUMN5), 32.32f), + () -> assertEquals(record.get(FIELD_COLUMN6), 64.64d), + () -> assertEquals(record.get(FIELD_COLUMN7), true), + () -> assertEquals(new String((byte[]) record.get(FIELD_COLUMN8)), "test bytes"), + () -> assertEquals(record.get(FIELD_COLUMN9), "test string") + ); + } + + protected void checkDataString(Struct record) { + assertAll( + () -> assertEquals(record.get(FIELD_COLUMN1), "2"), + () -> assertEquals(record.get(FIELD_COLUMN2), "4"), + () -> assertEquals(record.get(FIELD_COLUMN3), "8"), + () -> assertEquals(record.get(FIELD_COLUMN4), "16"), + () -> assertEquals(record.get(FIELD_COLUMN5), "32.320000"), + () -> assertEquals(record.get(FIELD_COLUMN6), "64.640000"), + () -> assertEquals(record.get(FIELD_COLUMN7), "true"), + () -> assertEquals(record.get(FIELD_COLUMN8), "test bytes"), + () -> assertEquals(record.get(FIELD_COLUMN9), "test string") + ); + } + + protected void checkDataNull(Struct record) { + assertAll( + () -> assertEquals(record.get(FIELD_COLUMN1), null), + () -> assertEquals(record.get(FIELD_COLUMN2), null), + () -> assertEquals(record.get(FIELD_COLUMN3), null), + () -> assertEquals(record.get(FIELD_COLUMN4), null), + () -> assertEquals(record.get(FIELD_COLUMN5), null), + () -> assertEquals(record.get(FIELD_COLUMN6), null), + () -> assertEquals(record.get(FIELD_COLUMN7), null), + () -> assertEquals(record.get(FIELD_COLUMN8), null), + () -> assertEquals(record.get(FIELD_COLUMN9), null) ); } From d8907091a1740151230475ff4697864f124001e7 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 19 Apr 2020 14:34:08 -0500 Subject: [PATCH 38/51] Allow nullable fields in Univocity file readers --- .../fs/file/reader/UnivocityFileReader.java | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java index fb93116..25a685d 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/UnivocityFileReader.java @@ -39,6 +39,7 @@ abstract class UnivocityFileReader> public static final String FILE_READER_DELIMITED_SETTINGS_HEADER = FILE_READER_DELIMITED_SETTINGS + "header"; public static final String FILE_READER_DELIMITED_SETTINGS_SCHEMA = FILE_READER_DELIMITED_SETTINGS + "schema"; public static final String FILE_READER_DELIMITED_SETTINGS_DATA_TYPE_MAPPING_ERROR = FILE_READER_DELIMITED_SETTINGS + "data_type_mapping_error"; + public static final String FILE_READER_DELIMITED_SETTINGS_ALLOW_NULLS = FILE_READER_DELIMITED_SETTINGS + "allow_nulls"; public static final String FILE_READER_DELIMITED_SETTINGS_HEADER_NAMES = FILE_READER_DELIMITED_SETTINGS + "header_names"; public static final String FILE_READER_DELIMITED_SETTINGS_LINE_SEPARATOR_DETECTION = FILE_READER_DELIMITED_SETTINGS + "line_separator_detection"; public static final String FILE_READER_DELIMITED_SETTINGS_NULL_VALUE = FILE_READER_DELIMITED_SETTINGS + "null_value"; @@ -62,6 +63,7 @@ abstract class UnivocityFileReader> private Charset charset; private CompressionType compression; private boolean dataTypeMappingError; + private boolean allowNulls; private boolean closed; private ResultIterator iterator; @@ 
-113,6 +115,13 @@ protected void configure(Map config) { this.settings = allSettings(config); this.dataTypeMappingError = Boolean.parseBoolean( config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_DATA_TYPE_MAPPING_ERROR, "true")); + if (this.dataTypeMappingError) { + this.allowNulls = Boolean.parseBoolean( + config.getOrDefault(FILE_READER_DELIMITED_SETTINGS_ALLOW_NULLS, "false")); + } else { + this.allowNulls = true; + } + } private List getDataTypes(Map config, String[] headers) { @@ -135,24 +144,24 @@ private List getDataTypes(Map config, String[] headers) private Schema strToSchema(String dataType) { switch (DataType.valueOf(dataType.trim().toUpperCase())) { case BYTE: - return this.dataTypeMappingError ? Schema.INT8_SCHEMA : Schema.OPTIONAL_INT8_SCHEMA; + return dataTypeMappingError && !allowNulls ? Schema.INT8_SCHEMA : Schema.OPTIONAL_INT8_SCHEMA; case SHORT: - return this.dataTypeMappingError ? Schema.INT16_SCHEMA : Schema.OPTIONAL_INT16_SCHEMA; + return dataTypeMappingError && !allowNulls ? Schema.INT16_SCHEMA : Schema.OPTIONAL_INT16_SCHEMA; case INT: - return this.dataTypeMappingError ? Schema.INT32_SCHEMA : Schema.OPTIONAL_INT32_SCHEMA; + return dataTypeMappingError && !allowNulls ? Schema.INT32_SCHEMA : Schema.OPTIONAL_INT32_SCHEMA; case LONG: - return this.dataTypeMappingError ? Schema.INT64_SCHEMA : Schema.OPTIONAL_INT64_SCHEMA; + return dataTypeMappingError && !allowNulls ? Schema.INT64_SCHEMA : Schema.OPTIONAL_INT64_SCHEMA; case FLOAT: - return this.dataTypeMappingError ? Schema.FLOAT32_SCHEMA : Schema.OPTIONAL_FLOAT32_SCHEMA; + return dataTypeMappingError && !allowNulls ? Schema.FLOAT32_SCHEMA : Schema.OPTIONAL_FLOAT32_SCHEMA; case DOUBLE: - return this.dataTypeMappingError ? Schema.FLOAT64_SCHEMA : Schema.OPTIONAL_FLOAT64_SCHEMA; + return dataTypeMappingError && !allowNulls ? Schema.FLOAT64_SCHEMA : Schema.OPTIONAL_FLOAT64_SCHEMA; case BOOLEAN: - return this.dataTypeMappingError ? Schema.BOOLEAN_SCHEMA : Schema.OPTIONAL_BOOLEAN_SCHEMA; + return dataTypeMappingError && !allowNulls ? Schema.BOOLEAN_SCHEMA : Schema.OPTIONAL_BOOLEAN_SCHEMA; case BYTES: - return this.dataTypeMappingError ? Schema.BYTES_SCHEMA : Schema.OPTIONAL_BYTES_SCHEMA; + return dataTypeMappingError && !allowNulls ? Schema.BYTES_SCHEMA : Schema.OPTIONAL_BYTES_SCHEMA; case STRING: default: - return this.dataTypeMappingError ? Schema.STRING_SCHEMA : Schema.OPTIONAL_STRING_SCHEMA; + return dataTypeMappingError && !allowNulls ? 
Schema.STRING_SCHEMA : Schema.OPTIONAL_STRING_SCHEMA; } } From b679d1536fcd48fa229127759069cc44954668be Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Tue, 21 Apr 2020 21:43:21 -0500 Subject: [PATCH 39/51] Support for Google Cloud Storage, Azure Blob Storage and Azure Data Lake Store --- pom.xml | 16 ++++++++++++++++ .../services/org.apache.hadoop.fs.FileSystem | 15 +++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem diff --git a/pom.xml b/pom.xml index 130728a..6ff4fbd 100644 --- a/pom.xml +++ b/pom.xml @@ -14,6 +14,7 @@ 2.4.1 5.4.1 3.2.1 + hadoop3-2.1.2 1.11.0 2.8.4 9.0.2 @@ -52,6 +53,21 @@ hadoop-aws ${hadoop.version} + + org.apache.hadoop + hadoop-azure + ${hadoop.version} + + + org.apache.hadoop + hadoop-azure-datalake + ${hadoop.version} + + + com.google.cloud.bigdataoss + gcs-connector + ${gcs-connector.version} + org.apache.parquet parquet-avro diff --git a/src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem b/src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem new file mode 100644 index 0000000..de86f4a --- /dev/null +++ b/src/main/resources/META-INF/services/org.apache.hadoop.fs.FileSystem @@ -0,0 +1,15 @@ +org.apache.hadoop.fs.LocalFileSystem +org.apache.hadoop.fs.viewfs.ViewFileSystem +org.apache.hadoop.fs.HarFileSystem +org.apache.hadoop.fs.http.HttpFileSystem +org.apache.hadoop.fs.http.HttpsFileSystem +org.apache.hadoop.fs.ftp.FTPFileSystem +org.apache.hadoop.hdfs.DistributedFileSystem +org.apache.hadoop.fs.s3a.S3AFileSystem +org.apache.hadoop.fs.s3native.NativeS3FileSystem +org.apache.hadoop.fs.adl.AdlFileSystem +org.apache.hadoop.fs.azure.NativeAzureFileSystem +org.apache.hadoop.fs.azure.NativeAzureFileSystem$Secure +org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem +org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem +com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem From 64624352b36744818427c11158486ccb1b8e5678 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Tue, 21 Apr 2020 21:44:34 -0500 Subject: [PATCH 40/51] Set next offset to source record to commit --- .../com/github/mmolimar/kafka/connect/fs/FsSourceTask.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java index 51a9e3d..a5ceb6c 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java @@ -70,7 +70,7 @@ public List poll() { try (FileReader reader = policy.offer(metadata, context.offsetStorageReader())) { log.info("Processing records for file {}", metadata); while (reader.hasNext()) { - records.add(convert(metadata, reader.currentOffset(), reader.next())); + records.add(convert(metadata, reader.currentOffset() + 1, reader.next())); } } catch (ConnectException | IOException e) { //when an exception happens reading a file, the connector continues @@ -85,9 +85,7 @@ public List poll() { private Stream filesToProcess() { try { return asStream(policy.execute()) - .filter(metadata -> metadata.getLen() > 0) - .collect(Collectors.toList()) - .stream(); + .filter(metadata -> metadata.getLen() > 0); } catch (IOException | ConnectException e) { //when an exception happens executing the policy, the connector continues log.error("Cannot retrieve files to process from the FS: {}. 
" + From c9372dfa696230b071b27b634f81f875cc5a457a Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Fri, 24 Apr 2020 19:31:40 -0500 Subject: [PATCH 41/51] Upgrade to Kafka 2.5.0 and Confluent 5.5.0 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 6ff4fbd..699c039 100644 --- a/pom.xml +++ b/pom.xml @@ -11,8 +11,8 @@ UTF-8 - 2.4.1 - 5.4.1 + 2.5.0 + 5.5.0 3.2.1 hadoop3-2.1.2 1.11.0 From 26b3f0ef7ab84e0f5f0fdf396d656d93866c77ed Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Fri, 24 Apr 2020 21:04:03 -0500 Subject: [PATCH 42/51] Updating documentation --- docs/source/config_options.rst | 72 ++++++++++++++++++++++++++++++++++ docs/source/connector.rst | 13 +++--- 2 files changed, 80 insertions(+), 5 deletions(-) diff --git a/docs/source/config_options.rst b/docs/source/config_options.rst index 70c8b6c..1cce34f 100644 --- a/docs/source/config_options.rst +++ b/docs/source/config_options.rst @@ -315,6 +315,30 @@ To configure custom properties for this reader, the name you must use is ``delim * Default: ``false`` * Importance: high +``file_reader.delimited.settings.schema`` + A comma-separated list of ordered data types for each field in the file. Possible values: ``byte``, ``short``, + ``int``, ``long``, ``float``, ``double``, ``boolean``, ``bytes`` and ``string``) + + * Type: string[] + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.data_type_mapping_error`` + Flag to enable/disable throwing errors when mapping data types based on the schema is not possible. If disabled, + the returned value which could not be mapped will be ``null``. + + * Type: boolean + * Default: ``true`` + * Importance: medium + +``file_reader.delimited.settings.allow_nulls`` + If the schema supports nullable fields. If ``file_reader.delimited.settings.data_type_mapping_error`` config flag is + disabled, the value set for this config will be ignored and set to ``true``. + + * Type: boolean + * Default: ``false`` + * Importance: medium + ``file_reader.delimited.settings.header_names`` A comma-separated list of ordered field names to set when reading a file. @@ -456,6 +480,30 @@ To configure custom properties for this reader, the name you must use is ``delim * Default: ``false`` * Importance: high +``file_reader.delimited.settings.schema`` + A comma-separated list of ordered data types for each field in the file. Possible values: ``byte``, ``short``, + ``int``, ``long``, ``float``, ``double``, ``boolean``, ``bytes`` and ``string``) + + * Type: string[] + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.data_type_mapping_error`` + Flag to enable/disable throwing errors when mapping data types based on the schema is not possible. If disabled, + the returned value which could not be mapped will be ``null``. + + * Type: boolean + * Default: ``true`` + * Importance: medium + +``file_reader.delimited.settings.allow_nulls`` + If the schema supports nullable fields. If ``file_reader.delimited.settings.data_type_mapping_error`` config flag is + disabled, the value set for this config will be ignored and set to ``true``. + + * Type: boolean + * Default: ``false`` + * Importance: medium + ``file_reader.delimited.settings.header_names`` A comma-separated list of ordered field names to set when reading a file. 
@@ -590,6 +638,30 @@ To configure custom properties for this reader, the name you must use is ``delim * Default: ``false`` * Importance: high +``file_reader.delimited.settings.schema`` + A comma-separated list of ordered data types for each field in the file. Possible values: ``byte``, ``short``, + ``int``, ``long``, ``float``, ``double``, ``boolean``, ``bytes`` and ``string``) + + * Type: string[] + * Default: ``null`` + * Importance: medium + +``file_reader.delimited.settings.data_type_mapping_error`` + Flag to enable/disable throwing errors when mapping data types based on the schema is not possible. If disabled, + the returned value which could not be mapped will be ``null``. + + * Type: boolean + * Default: ``true`` + * Importance: medium + +``file_reader.delimited.settings.allow_nulls`` + If the schema supports nullable fields. If ``file_reader.delimited.settings.data_type_mapping_error`` config flag is + disabled, the value set for this config will be ignored and set to ``true``. + + * Type: boolean + * Default: ``false`` + * Importance: medium + ``file_reader.delimited.settings.header_names`` A comma-separated list of ordered field names to set when reading a file. diff --git a/docs/source/connector.rst b/docs/source/connector.rst index 6c79317..0e02451 100644 --- a/docs/source/connector.rst +++ b/docs/source/connector.rst @@ -12,9 +12,11 @@ of this abstraction and using it in a transparent way. Among others, these are some file systems it supports: * HDFS. -* WebHDFS. * S3. -* FTP and SFTP. +* Google Cloud Storage. +* Azure Blob Storage & Azure Data Lake Store. +* FTP. +* WebHDFS. * Local File System. * Hadoop Archive File System. @@ -24,8 +26,9 @@ Getting started Prerequisites -------------------------------------------- -- Confluent Platform 5.4.1 +- Apache Kafka 2.5.0 - Java 8 +- Confluent Schema Registry (recommended). Building from source -------------------------------------------- @@ -72,13 +75,13 @@ Running in development .. sourcecode:: bash - export CONFLUENT_HOME=/path/to/confluent/install/dir + export KAFKA_HOME=/path/to/kafka/install/dir .. sourcecode:: bash mvn clean package export CLASSPATH="$(find target/ -type f -name '*.jar'| grep '\-package' | tr '\n' ':')" - $CONFLUENT_HOME/bin/connect-standalone $CONFLUENT_HOME/etc/schema-registry/connect-avro-standalone.properties config/kafka-connect-fs.properties + $KAFKA_HOME/bin/connect-distributed.sh config/kafka-connect-fs.properties Components ============================================ From 6da68ba3d10b7f92704d2a182eb7b967e8fa6ce8 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sat, 25 Apr 2020 14:11:21 -0500 Subject: [PATCH 43/51] Updating license file --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index d645695..1c358a6 100644 --- a/LICENSE +++ b/LICENSE @@ -187,7 +187,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2017 Mario Molina Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
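As a quick reference for the delimited reader settings documented above (``file_reader.delimited.settings.schema``, ``file_reader.delimited.settings.data_type_mapping_error`` and ``file_reader.delimited.settings.allow_nulls``), they can be combined in a connector configuration along the following lines. This is only a sketch in the style of config/kafka-connect-fs.properties: the URI, topic and regexp are placeholder values, and it assumes the CsvFileReader exercised by the tests above.

    name=FsSourceConnector
    connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector
    tasks.max=1
    fs.uris=file:///data
    topic=mytopic
    policy.class=com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy
    policy.recursive=true
    policy.regexp=.*
    file_reader.class=com.github.mmolimar.kafka.connect.fs.file.reader.CsvFileReader
    file_reader.delimited.settings.header=true
    # Ordered data types, one per field; if omitted, every field is read as a string.
    file_reader.delimited.settings.schema=byte,short,int,long,float,double,boolean,bytes,string
    # Keep the default (true) to fail on values that cannot be mapped to the declared type;
    # set it to false to get null for those values instead.
    file_reader.delimited.settings.data_type_mapping_error=true
    # Generate optional (nullable) field types in the Connect schema.
    file_reader.delimited.settings.allow_nulls=true

Note that disabling ``file_reader.delimited.settings.data_type_mapping_error`` forces ``allow_nulls`` to ``true``, as implemented in PATCH 38.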
From e90f36a60652b3e268506ec7f177b3d6cb0d069b Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 26 Apr 2020 17:32:42 -0500 Subject: [PATCH 44/51] Poll interval ms config --- .../kafka/connect/fs/FsSourceConnector.java | 5 +- .../connect/fs/FsSourceConnectorConfig.java | 31 +++++++-- .../kafka/connect/fs/FsSourceTask.java | 61 +++++++++++----- .../kafka/connect/fs/FsSourceTaskConfig.java | 69 +++++++++++++++++-- 4 files changed, 138 insertions(+), 28 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java index 3689452..839477b 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnector.java @@ -50,8 +50,9 @@ public List> taskConfigs(int maxTasks) { } final List> taskConfigs = new ArrayList<>(); - int groups = Math.min(config.getFsUris().size(), maxTasks); - ConnectorUtils.groupPartitions(config.getFsUris(), groups) + List fsUris = config.getFsUris(); + int groups = Math.min(fsUris.size(), maxTasks); + ConnectorUtils.groupPartitions(fsUris, groups) .forEach(dirs -> { Map taskProps = new HashMap<>(config.originalsStrings()); taskProps.put(FsSourceConnectorConfig.FS_URIS, String.join(",", dirs)); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnectorConfig.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnectorConfig.java index d20069f..3a3f1ad 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnectorConfig.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceConnectorConfig.java @@ -13,9 +13,13 @@ public class FsSourceConnectorConfig extends AbstractConfig { public static final String FS_URIS = "fs.uris"; private static final String FS_URIS_DOC = "Comma-separated URIs of the FS(s)."; + private static final String FS_URIS_DISPLAY = "File system URIs"; public static final String TOPIC = "topic"; private static final String TOPIC_DOC = "Topic to copy data to."; + private static final String TOPIC_DISPLAY = "Topic"; + + private static final String CONNECTOR_GROUP = "Connector"; public FsSourceConnectorConfig(ConfigDef config, Map parsedConfig) { super(config, parsedConfig); @@ -26,9 +30,29 @@ public FsSourceConnectorConfig(Map parsedConfig) { } public static ConfigDef conf() { + int order = 0; return new ConfigDef() - .define(FS_URIS, Type.LIST, Importance.HIGH, FS_URIS_DOC) - .define(TOPIC, Type.STRING, Importance.HIGH, TOPIC_DOC); + .define( + FS_URIS, + Type.LIST, + ConfigDef.NO_DEFAULT_VALUE, + Importance.HIGH, + FS_URIS_DOC, + CONNECTOR_GROUP, + ++order, + ConfigDef.Width.LONG, + FS_URIS_DISPLAY + ).define( + TOPIC, + Type.STRING, + ConfigDef.NO_DEFAULT_VALUE, + Importance.HIGH, + TOPIC_DOC, + CONNECTOR_GROUP, + ++order, + ConfigDef.Width.LONG, + TOPIC_DISPLAY + ); } public List getFsUris() { @@ -38,5 +62,4 @@ public List getFsUris() { public String getTopic() { return this.getString(TOPIC); } - -} \ No newline at end of file +} diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java index a5ceb6c..c3fa38f 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java @@ -6,6 +6,8 @@ import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import 
com.github.mmolimar.kafka.connect.fs.util.Version; import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.common.utils.SystemTime; +import org.apache.kafka.common.utils.Time; import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.source.SourceRecord; @@ -24,9 +26,17 @@ public class FsSourceTask extends SourceTask { private static final Logger log = LoggerFactory.getLogger(FsSourceTask.class); - private final AtomicBoolean stop = new AtomicBoolean(false); + private final AtomicBoolean stop; + private final Time time; + private FsSourceTaskConfig config; private Policy policy; + private int pollInterval; + + public FsSourceTask() { + this.stop = new AtomicBoolean(false); + this.time = new SystemTime(); + } @Override public String version() { @@ -48,36 +58,47 @@ public void start(Map properties) { } Class policyClass = (Class) Class.forName(properties.get(FsSourceTaskConfig.POLICY_CLASS)); - FsSourceTaskConfig taskConfig = new FsSourceTaskConfig(properties); - policy = ReflectionUtils.makePolicy(policyClass, taskConfig); + policy = ReflectionUtils.makePolicy(policyClass, config); + pollInterval = config.getInt(FsSourceTaskConfig.POLL_INTERVAL_MS); } catch (ConfigException ce) { - log.error("Couldn't start FsSourceTask:", ce); - throw new ConnectException("Couldn't start FsSourceTask due to configuration error", ce); + log.error("Couldn't start FsSourceTask.", ce); + throw new ConnectException("Couldn't start FsSourceTask due to configuration error: " + ce.getMessage(), ce); } catch (Exception e) { - log.error("Couldn't start FsSourceConnector:", e); - throw new ConnectException("A problem has occurred reading configuration: " + e.getMessage()); + log.error("Couldn't start FsSourceConnector.", e); + throw new ConnectException("A problem has occurred reading configuration: " + e.getMessage(), e); } - log.info("FS source task started with policy {}", policy.getClass().getName()); + log.info("FS source task started with policy [{}].", policy.getClass().getName()); } @Override public List poll() { while (!stop.get() && policy != null && !policy.hasEnded()) { - log.trace("Polling for new data"); + log.trace("Polling for new data..."); - return filesToProcess().map(metadata -> { + List totalRecords = filesToProcess().map(metadata -> { List records = new ArrayList<>(); try (FileReader reader = policy.offer(metadata, context.offsetStorageReader())) { - log.info("Processing records for file {}", metadata); + log.info("Processing records for file {}.", metadata); while (reader.hasNext()) { records.add(convert(metadata, reader.currentOffset() + 1, reader.next())); } } catch (ConnectException | IOException e) { //when an exception happens reading a file, the connector continues - log.error("Error reading file from FS: " + metadata.getPath() + ". Keep going...", e); + log.error("Error reading file [{}]. 
Keep going...", metadata.getPath(), e); } + log.debug("Read [{}] records from file [{}].", records.size(), metadata.getPath()); + return records; }).flatMap(Collection::stream).collect(Collectors.toList()); + + log.debug("Returning [{}] records in execution number [{}] for policy [{}].", + totalRecords.size(), policy.getExecutions(), policy.getClass().getName()); + + return totalRecords; + } + if (pollInterval > 0) { + log.trace("Waiting [{}] ms for next poll.", pollInterval); + time.sleep(pollInterval); } return null; } @@ -89,8 +110,8 @@ private Stream filesToProcess() { } catch (IOException | ConnectException e) { //when an exception happens executing the policy, the connector continues log.error("Cannot retrieve files to process from the FS: {}. " + - "There was an error executing the policy but the task tolerates this and continues. " + - e.getMessage(), policy.getURIs(), e); + "There was an error executing the policy but the task tolerates this and continues.", + policy.getURIs(), e); return Stream.empty(); } } @@ -112,10 +133,16 @@ private SourceRecord convert(FileMetadata metadata, long offset, Struct struct) @Override public void stop() { - log.info("Stopping FS source task."); + log.info("Stopping FS source task..."); stop.set(true); - if (policy != null) { - policy.interrupt(); + synchronized (this) { + if (policy != null) { + try { + policy.close(); + } catch (IOException ioe) { + log.warn("Error closing policy [{}].", policy.getClass().getName(), ioe); + } + } } } } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java index 3b1f4a5..58231fd 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTaskConfig.java @@ -11,17 +11,29 @@ public class FsSourceTaskConfig extends FsSourceConnectorConfig { public static final String POLICY_CLASS = POLICY_PREFIX + "class"; private static final String POLICY_CLASS_DOC = "Policy class to apply to this task."; + private static final String POLICY_CLASS_DISPLAY = "Policy"; public static final String POLICY_RECURSIVE = POLICY_PREFIX + "recursive"; private static final String POLICY_RECURSIVE_DOC = "Flag to activate traversed recursion in subdirectories when listing files."; + private static final String POLICY_RECURSIVE_DISPLAY = "Recursive directory listing"; public static final String POLICY_REGEXP = POLICY_PREFIX + "regexp"; private static final String POLICY_REGEXP_DOC = "Regular expression to filter files from the FS."; + private static final String POLICY_REGEXP_DISPLAY = "File filter regex"; public static final String POLICY_PREFIX_FS = POLICY_PREFIX + "fs."; public static final String FILE_READER_CLASS = FILE_READER_PREFIX + "class"; private static final String FILE_READER_CLASS_DOC = "File reader class to read files from the FS."; + private static final String FILE_READER_CLASS_DISPLAY = "File reader class"; + + public static final String POLL_INTERVAL_MS = "poll.interval.ms"; + private static final String POLL_INTERVAL_MS_DOC = "Frequency in ms to poll for new data."; + public static final int POLL_INTERVAL_MS_DEFAULT = 10000; + private static final String POLL_INTERVAL_MS_DISPLAY = "Poll Interval (ms)"; + + private static final String POLICY_GROUP = "Policy"; + private static final String CONNECTOR_GROUP = "Connector"; public FsSourceTaskConfig(ConfigDef config, Map parsedConfig) { super(config, parsedConfig); @@ -32,11 +44,58 
@@ public FsSourceTaskConfig(Map parsedConfig) { } public static ConfigDef conf() { + int order = 0; return FsSourceConnectorConfig.conf() - .define(POLICY_CLASS, ConfigDef.Type.CLASS, ConfigDef.Importance.HIGH, POLICY_CLASS_DOC) - .define(POLICY_RECURSIVE, ConfigDef.Type.BOOLEAN, Boolean.TRUE, ConfigDef.Importance.LOW, POLICY_RECURSIVE_DOC) - .define(POLICY_REGEXP, ConfigDef.Type.STRING, ".*", ConfigDef.Importance.MEDIUM, POLICY_REGEXP_DOC) - .define(FILE_READER_CLASS, ConfigDef.Type.CLASS, ConfigDef.Importance.HIGH, FILE_READER_CLASS_DOC); + .define( + POLICY_CLASS, + ConfigDef.Type.CLASS, + ConfigDef.NO_DEFAULT_VALUE, + ConfigDef.Importance.HIGH, + POLICY_CLASS_DOC, + POLICY_GROUP, + ++order, + ConfigDef.Width.MEDIUM, + POLICY_CLASS_DISPLAY + ).define( + POLICY_RECURSIVE, + ConfigDef.Type.BOOLEAN, + Boolean.TRUE, + ConfigDef.Importance.MEDIUM, + POLICY_RECURSIVE_DOC, + POLICY_GROUP, + ++order, + ConfigDef.Width.SHORT, + POLICY_RECURSIVE_DISPLAY + ).define( + POLICY_REGEXP, + ConfigDef.Type.STRING, + ".*", + ConfigDef.Importance.MEDIUM, + POLICY_REGEXP_DOC, + POLICY_GROUP, + ++order, + ConfigDef.Width.MEDIUM, + POLICY_REGEXP_DISPLAY + ).define( + FILE_READER_CLASS, + ConfigDef.Type.CLASS, + ConfigDef.NO_DEFAULT_VALUE, + ConfigDef.Importance.HIGH, + FILE_READER_CLASS_DOC, + POLICY_GROUP, + ++order, + ConfigDef.Width.MEDIUM, + FILE_READER_CLASS_DISPLAY + ).define( + POLL_INTERVAL_MS, + ConfigDef.Type.INT, + POLL_INTERVAL_MS_DEFAULT, + ConfigDef.Importance.MEDIUM, + POLL_INTERVAL_MS_DOC, + CONNECTOR_GROUP, + ++order, + ConfigDef.Width.SHORT, + POLL_INTERVAL_MS_DISPLAY + ); } - } From a8e4ca3a77ea76e1f6ffe9a54cf28c181a77f839 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 26 Apr 2020 17:34:03 -0500 Subject: [PATCH 45/51] Retrievable connections in HDFS file watcher --- .../fs/file/reader/AbstractFileReader.java | 5 +- .../connect/fs/policy/AbstractPolicy.java | 11 +- .../kafka/connect/fs/policy/CronPolicy.java | 12 +- .../fs/policy/HdfsFileWatcherPolicy.java | 139 +++++++++++------- .../kafka/connect/fs/policy/Policy.java | 2 + .../fs/policy/HdfsFileWatcherPolicyTest.java | 60 ++++++++ 6 files changed, 161 insertions(+), 68 deletions(-) diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java index fec6b73..d63283f 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AbstractFileReader.java @@ -33,7 +33,7 @@ public AbstractFileReader(FileSystem fs, Path filePath, ReaderAdapter adapter this.offset = 0; configure(readerConfig(config)); - log.trace("Initialized file reader {} for file {}", getClass(), filePath); + log.trace("Initialized file reader [{}] for file [{}].", getClass().getName(), filePath); } protected final Map readerConfig(Map config) { @@ -88,10 +88,7 @@ public final void seek(long offset) { } checkClosed(); try { - log.debug("Seeking file {} to offset {}.", filePath, offset); seekFile(offset); - } catch (ConnectException ce) { - throw ce; } catch (IOException ioe) { throw new ConnectException("Error seeking file: " + getFilePath(), ioe); } diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java index 5908380..d250e76 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java +++ 
b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/AbstractPolicy.java @@ -21,7 +21,7 @@ import java.time.LocalDateTime; import java.time.format.DateTimeFormatter; import java.util.*; -import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -34,14 +34,14 @@ abstract class AbstractPolicy implements Policy { protected final Pattern fileRegexp; private final FsSourceTaskConfig conf; - private final AtomicInteger executions; + private final AtomicLong executions; private final boolean recursive; private boolean interrupted; public AbstractPolicy(FsSourceTaskConfig conf) throws IOException { this.fileSystems = new ArrayList<>(); this.conf = conf; - this.executions = new AtomicInteger(0); + this.executions = new AtomicLong(0); this.recursive = conf.getBoolean(FsSourceTaskConfig.POLICY_RECURSIVE); this.fileRegexp = Pattern.compile(conf.getString(FsSourceTaskConfig.POLICY_REGEXP)); this.interrupted = false; @@ -107,11 +107,11 @@ public final Iterator execute() throws IOException { } preCheck(); + executions.incrementAndGet(); Iterator files = Collections.emptyIterator(); for (FileSystem fs : fileSystems) { files = concat(files, listFiles(fs)); } - executions.incrementAndGet(); postCheck(); @@ -178,7 +178,7 @@ public final boolean hasEnded() { protected abstract boolean isPolicyCompleted(); - final int getExecutions() { + public final long getExecutions() { return executions.get(); } @@ -204,6 +204,7 @@ public FileReader offer(FileMetadata metadata, OffsetStorageReader offsetStorage Map partition = Collections.singletonMap("path", metadata.getPath()); Map offset = offsetStorageReader.offset(partition); if (offset != null && offset.get("offset") != null) { + log.info("Seeking to offset [{}] for file [{}].", offset.get("offset"), metadata.getPath()); reader.seek((Long) offset.get("offset")); } return reader; diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicy.java index 0774789..307fc23 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/CronPolicy.java @@ -6,6 +6,8 @@ import com.cronutils.parser.CronParser; import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.common.utils.SystemTime; +import org.apache.kafka.common.utils.Time; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,11 +28,13 @@ public class CronPolicy extends AbstractPolicy { public static final String CRON_POLICY_EXPRESSION = CRON_POLICY_PREFIX + "expression"; public static final String CRON_POLICY_END_DATE = CRON_POLICY_PREFIX + "end_date"; + private final Time time; private ExecutionTime executionTime; private Date endDate; public CronPolicy(FsSourceTaskConfig conf) throws IOException { super(conf); + this.time = new SystemTime(); } @Override @@ -57,13 +61,7 @@ protected void configPolicy(Map customConfigs) { @Override protected void preCheck() { executionTime.timeToNextExecution(ZonedDateTime.now()) - .ifPresent(next -> { - try { - Thread.sleep(next.toMillis()); - } catch (InterruptedException ie) { - log.warn("An interrupted exception has occurred.", ie); - } - }); + .ifPresent(next -> time.sleep(next.toMillis())); } @Override diff --git 
a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java index 996d868..8d2f0d6 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicy.java @@ -10,12 +10,14 @@ import org.apache.hadoop.hdfs.client.HdfsAdmin; import org.apache.hadoop.hdfs.inotify.Event; import org.apache.hadoop.hdfs.inotify.EventBatch; +import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.common.utils.SystemTime; +import org.apache.kafka.common.utils.Time; import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.IllegalWorkerStateException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; import java.util.concurrent.ConcurrentLinkedQueue; @@ -25,26 +27,51 @@ public class HdfsFileWatcherPolicy extends AbstractPolicy { private static final Logger log = LoggerFactory.getLogger(HdfsFileWatcherPolicy.class); private static final String URI_PREFIX = "hdfs://"; + private static final long DEFAULT_POLL = 5000L; + private static final long DEFAULT_RETRY = 20000L; + private static final String HDFS_FILE_WATCHER_POLICY_PREFIX = FsSourceTaskConfig.POLICY_PREFIX + "hdfs_file_watcher."; + + public static final String HDFS_FILE_WATCHER_POLICY_POLL_MS = HDFS_FILE_WATCHER_POLICY_PREFIX + "poll"; + public static final String HDFS_FILE_WATCHER_POLICY_RETRY_MS = HDFS_FILE_WATCHER_POLICY_PREFIX + "retry"; + private final Queue fileQueue; + private final Time time; private Map fsEvenStream; + private long pollSleepMs; + private long retrySleepMs; public HdfsFileWatcherPolicy(FsSourceTaskConfig conf) throws IOException { super(conf); this.fileQueue = new ConcurrentLinkedQueue<>(); + this.time = new SystemTime(); startWatchers(); } @Override protected void configPolicy(Map customConfigs) { + try { + this.pollSleepMs = Long.parseLong((String) customConfigs + .getOrDefault(HDFS_FILE_WATCHER_POLICY_POLL_MS, String.valueOf(DEFAULT_POLL))); + } catch (NumberFormatException nfe) { + throw new ConfigException(HDFS_FILE_WATCHER_POLICY_POLL_MS + " property is required and must be a " + + "number (long). Got: " + customConfigs.get(HDFS_FILE_WATCHER_POLICY_POLL_MS)); + } + try { + this.retrySleepMs = Long.parseLong((String) customConfigs + .getOrDefault(HDFS_FILE_WATCHER_POLICY_RETRY_MS, String.valueOf(DEFAULT_RETRY))); + } catch (NumberFormatException nfe) { + throw new ConfigException(HDFS_FILE_WATCHER_POLICY_RETRY_MS + " property is required and must be a " + + "number (long). 
Got: " + customConfigs.get(HDFS_FILE_WATCHER_POLICY_RETRY_MS)); + } this.fsEvenStream = new HashMap<>(); fileSystems.stream() .filter(fs -> fs.getWorkingDirectory().toString().startsWith(URI_PREFIX)) .forEach(fs -> { try { HdfsAdmin admin = new HdfsAdmin(fs.getWorkingDirectory().toUri(), fs.getConf()); - fsEvenStream.put(fs, new EventStreamThread(fs, admin)); + fsEvenStream.put(fs, new EventStreamThread(fs, admin, retrySleepMs)); } catch (IOException ioe) { - throw new ConnectException("Error creating admin for notifications", ioe); + throw new ConnectException("Error creating HDFS notifications.", ioe); } }); } @@ -69,14 +96,7 @@ public Iterator listFiles(FileSystem fs) { @Override protected boolean isPolicyCompleted() { - boolean hasRunningThreads = false; - for (EventStreamThread thread : fsEvenStream.values()) { - if (thread.isAlive()) { - hasRunningThreads = true; - break; - } - } - return !hasRunningThreads; + return fsEvenStream.values().stream().noneMatch(Thread::isAlive); } @Override @@ -85,6 +105,11 @@ public void interrupt() { super.interrupt(); } + @Override + public void postCheck() { + time.sleep(pollSleepMs); + } + @Override public void close() throws IOException { stopWatchers(); @@ -94,66 +119,76 @@ public void close() throws IOException { private class EventStreamThread extends Thread { private final FileSystem fs; private final HdfsAdmin admin; + private final long retrySleepMs; + private final Time time; - EventStreamThread(FileSystem fs, HdfsAdmin admin) { + EventStreamThread(FileSystem fs, HdfsAdmin admin, long retrySleepMs) { this.fs = fs; this.admin = admin; + this.retrySleepMs = retrySleepMs; + this.time = new SystemTime(); } @Override public void run() { - try { - DFSInotifyEventInputStream eventStream = admin.getInotifyEventStream(); - while (fs.getFileStatus(fs.getWorkingDirectory()) != null && - fs.exists(fs.getWorkingDirectory())) { - EventBatch batch = eventStream.poll(); - if (batch == null) continue; - - for (Event event : batch.getEvents()) { - switch (event.getEventType()) { - case CREATE: - if (!((Event.CreateEvent) event).getPath().endsWith("._COPYING_")) { - enqueue(((Event.CreateEvent) event).getPath()); - } - break; - case APPEND: - if (!((Event.AppendEvent) event).getPath().endsWith("._COPYING_")) { - enqueue(((Event.AppendEvent) event).getPath()); - } - break; - case RENAME: - if (((Event.RenameEvent) event).getSrcPath().endsWith("._COPYING_")) { - enqueue(((Event.RenameEvent) event).getDstPath()); - } - break; - case CLOSE: - if (!((Event.CloseEvent) event).getPath().endsWith("._COPYING_")) { - enqueue(((Event.CloseEvent) event).getPath()); - } - break; - default: - break; + while (true) { + try { + DFSInotifyEventInputStream eventStream = admin.getInotifyEventStream(); + if (fs.getFileStatus(fs.getWorkingDirectory()) != null && + fs.exists(fs.getWorkingDirectory())) { + EventBatch batch = eventStream.poll(); + if (batch == null) continue; + + for (Event event : batch.getEvents()) { + switch (event.getEventType()) { + case CREATE: + if (!((Event.CreateEvent) event).getPath().endsWith("._COPYING_")) { + enqueue(((Event.CreateEvent) event).getPath()); + } + break; + case APPEND: + if (!((Event.AppendEvent) event).getPath().endsWith("._COPYING_")) { + enqueue(((Event.AppendEvent) event).getPath()); + } + break; + case RENAME: + if (((Event.RenameEvent) event).getSrcPath().endsWith("._COPYING_")) { + enqueue(((Event.RenameEvent) event).getDstPath()); + } + break; + case CLOSE: + if (!((Event.CloseEvent) event).getPath().endsWith("._COPYING_")) { + 
enqueue(((Event.CloseEvent) event).getPath()); + } + break; + default: + break; + } } } + } catch (IOException ioe) { + if (retrySleepMs > 0) { + time.sleep(retrySleepMs); + } else { + log.warn("Error watching path [{}]. Stopping it...", fs.getWorkingDirectory(), ioe); + throw new IllegalWorkerStateException(ioe); + } + } catch (Exception e) { + log.warn("Stopping watcher due to an unexpected exception when watching path [{}].", + fs.getWorkingDirectory(), e); + throw new IllegalWorkerStateException(e); } - } catch (FileNotFoundException fnfe) { - log.warn("Cannot find file in this FS {}. Stopping watcher...", fs.getWorkingDirectory(), fnfe); - } catch (IOException ioe) { - log.warn("An interrupted exception has occurred. Path {} is not watched any more", fs.getWorkingDirectory()); - } catch (Exception ioe) { - log.warn("Exception watching path {}", fs.getWorkingDirectory(), ioe); - throw new IllegalWorkerStateException(ioe); } } private void enqueue(String path) throws IOException { Path filePath = new Path(path); if (!fs.exists(filePath) || fs.getFileStatus(filePath) == null) { - log.info("Cannot enqueue file {} because it does not exist but got an event from the FS", filePath.toString()); + log.info("Cannot enqueue file [{}] because it does not exist but got an event from the FS", filePath); return; } - log.debug("Enqueuing file to process {}", filePath); + log.debug("Enqueuing file to process [{}]", filePath); RemoteIterator it = fs.listFiles(filePath, false); while (it.hasNext()) { LocatedFileStatus status = it.next(); diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/Policy.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/Policy.java index 8cb3232..370288f 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/Policy.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/policy/Policy.java @@ -19,5 +19,7 @@ public interface Policy extends Closeable { List getURIs(); + long getExecutions(); + void interrupt(); } diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java index ddf69b7..a29ae5d 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/HdfsFileWatcherPolicyTest.java @@ -2,7 +2,10 @@ import com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig; import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader; +import com.github.mmolimar.kafka.connect.fs.util.ReflectionUtils; import org.apache.hadoop.fs.Path; +import org.apache.kafka.common.config.ConfigException; +import org.apache.kafka.connect.errors.ConnectException; import org.apache.kafka.connect.errors.IllegalWorkerStateException; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.params.ParameterizedTest; @@ -76,4 +79,61 @@ public void execPolicyAlreadyEnded(PolicyFsTestConfig fsConfig) throws IOExcepti assertThrows(IllegalWorkerStateException.class, () -> fsConfig.getPolicy().execute()); } + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void notReachableFileSystem(PolicyFsTestConfig fsConfig) throws InterruptedException { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); + originals.put(FsSourceTaskConfig.FS_URIS, "hdfs://localhost:65432/data"); + originals.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_POLL_MS, "0"); + 
originals.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_RETRY_MS, "0"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + Policy policy = ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + int count = 0; + while (!policy.hasEnded() && count < 10) { + Thread.sleep(500); + count++; + } + assertTrue(count < 10); + assertTrue(policy.hasEnded()); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidPollTime(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); + originals.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_POLL_MS, "invalid"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConfigException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + + @ParameterizedTest + @MethodSource("fileSystemConfigProvider") + public void invalidRetryTime(PolicyFsTestConfig fsConfig) { + Map originals = fsConfig.getSourceTaskConfig().originalsStrings(); + originals.put(HdfsFileWatcherPolicy.HDFS_FILE_WATCHER_POLICY_RETRY_MS, "invalid"); + FsSourceTaskConfig cfg = new FsSourceTaskConfig(originals); + assertThrows(ConnectException.class, () -> + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg)); + assertThrows(ConfigException.class, () -> { + try { + ReflectionUtils.makePolicy((Class) fsConfig.getSourceTaskConfig() + .getClass(FsSourceTaskConfig.POLICY_CLASS), cfg); + } catch (Exception e) { + throw e.getCause(); + } + }); + } + } From b50d963ac9a49e5cb25967b2d0c52593059ad508 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 26 Apr 2020 17:37:02 -0500 Subject: [PATCH 46/51] Docker files --- Dockerfile | 7 ++++ docker-compose.yml | 77 +++++++++++++++++++++++++++++++++++ pom.xml | 59 +++++++++++++++++++++++++++ src/main/assembly/package.xml | 3 +- 4 files changed, 144 insertions(+), 2 deletions(-) create mode 100644 Dockerfile create mode 100644 docker-compose.yml diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..7ba7a9d --- /dev/null +++ b/Dockerfile @@ -0,0 +1,7 @@ +FROM confluentinc/cp-kafka-connect-base:5.5.0 + +ARG PROJECT_VERSION +ENV CONNECT_PLUGIN_PATH="/usr/share/java,/usr/share/confluent-hub-components" + +COPY ./target/components/packages/mmolimar-kafka-connect-fs-${PROJECT_VERSION}.zip /tmp/kafka-connect-fs.zip +RUN confluent-hub install --no-prompt /tmp/kafka-connect-fs.zip && rm -rf /tmp/kafka-connect-fs.zip diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..498b645 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,77 @@ +version: '3' +services: + cp-zookeeper: + image: confluentinc/cp-zookeeper:5.5.0 + hostname: zookeeper + container_name: zookeeper + ports: + - "2181:2181" + environment: + ZOOKEEPER_CLIENT_PORT: 2181 + ZOOKEEPER_TICK_TIME: 2000 + + cp-kafka: + image: confluentinc/cp-kafka:5.5.0 + hostname: kafka + container_name: kafka + depends_on: + - cp-zookeeper + ports: + - "29092:29092" + - "9092:9092" + environment: + KAFKA_BROKER_ID: 1 + KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181' + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: 
PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0 + CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: kafka:29092 + CONFLUENT_METRICS_REPORTER_ZOOKEEPER_CONNECT: zookeeper:2181 + CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1 + CONFLUENT_METRICS_ENABLE: 'false' + + cp-schema-registry: + image: confluentinc/cp-schema-registry:5.5.0 + hostname: schema-registry + container_name: schema-registry + depends_on: + - cp-zookeeper + - cp-kafka + ports: + - "8081:8081" + environment: + SCHEMA_REGISTRY_HOST_NAME: schema-registry + SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181' + + connect-fs: + image: mmolimar/kafka-connect-fs:1.0.0-SNAPSHOT + container_name: connect + depends_on: + - cp-kafka + - cp-schema-registry + ports: + - "8083:8083" + - "8000:8000" + environment: + CONNECT_BOOTSTRAP_SERVERS: 'kafka:29092' + CONNECT_REST_ADVERTISED_HOST_NAME: connect + CONNECT_REST_PORT: 8083 + CONNECT_GROUP_ID: compose-connect-group + CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs + CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1 + CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000 + CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets + CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1 + CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status + CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1 + CONNECT_KEY_CONVERTER: org.apache.kafka.connect.storage.StringConverter + CONNECT_VALUE_CONVERTER: io.confluent.connect.avro.AvroConverter + CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: http://schema-registry:8081 + CONNECT_INTERNAL_KEY_CONVERTER: "org.apache.kafka.connect.json.JsonConverter" + CONNECT_INTERNAL_VALUE_CONVERTER: "org.apache.kafka.connect.json.JsonConverter" + CONNECT_ZOOKEEPER_CONNECT: 'zookeeper:2181' + CONNECT_PLUGIN_PATH: "/usr/share/java,/usr/share/confluent-hub-components/" + CONNECT_LOG4J_ROOT_LOGLEVEL: "INFO" + CONNECT_LOG4J_LOGGERS: org.apache.zookeeper=ERROR,org.I0Itec.zkclient=ERROR,org.reflections=ERROR + KAFKA_OPTS: "-agentlib:jdwp=transport=dt_socket,server=y,address=8000,suspend=n" diff --git a/pom.xml b/pom.xml index 699c039..2a1ae2c 100644 --- a/pom.xml +++ b/pom.xml @@ -29,6 +29,7 @@ 0.8.5 4.3.0 3.0.0-M4 + 0.11.3 @@ -179,6 +180,64 @@ coveralls-maven-plugin ${maven-coveralls-plugin.version} + + io.confluent + kafka-connect-maven-plugin + ${maven-kafka-connect-plugin.version} + + + + kafka-connect + + + kafka-connect-fs + Kafka Connect FileSystem + https://kafka-connect-fs.readthedocs.io/ + https://github.com/mmolimar/kafka-connect-fs + + Kafka Connect FileSystem Connector is a source connector for reading records from files + in the file systems specified and load them into Kafka. + + Mario Molina + This connector is supported by the open source community. 
+ https://github.com/mmolimar/kafka-connect-fs/issues + mmolimar + user + Mario Molina + https://github.com/mmolimar + mmolimar + kafka-connect-fs + ${project.version} + + source + + + filesystem + files + hadoop + hdfs + aws + s3 + google + gcs + azure + txt + csv + tsv + json + avro + parquet + sequence + + + + atLeastOnce + + true + + + + diff --git a/src/main/assembly/package.xml b/src/main/assembly/package.xml index 7962c49..a1b9d19 100644 --- a/src/main/assembly/package.xml +++ b/src/main/assembly/package.xml @@ -36,9 +36,8 @@ org.apache.kafka:connect-api org.mortbay.jetty:* com.sun.jersey:* - org.eclipse.jetty.aggregate:jetty-all + org.eclipse.jetty:jetty-util com.sun.jersey.contribs:jersey-guice - com.google.guava:guava org.apache.zookeeper:zookeeper log4j:log4j org.slf4j:slf4j-api From 2eda92941b95df1f0a636e0bca1ba1d40e36b16f Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 26 Apr 2020 17:47:47 -0500 Subject: [PATCH 47/51] Updating docs --- config/kafka-connect-fs.properties | 2 +- docs/source/config_options.rst | 25 +++++++++++++++++++++++-- docs/source/connector.rst | 24 +++++++++++++++++++++--- 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/config/kafka-connect-fs.properties b/config/kafka-connect-fs.properties index 67435af..aab1ae6 100644 --- a/config/kafka-connect-fs.properties +++ b/config/kafka-connect-fs.properties @@ -1,7 +1,7 @@ name=FsSourceConnector connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector tasks.max=1 -fs.uris=file:///data,hdfs://localhost:9000/ +fs.uris=file:///data,hdfs://localhost:8020/data topic=mytopic policy.class=com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy policy.recursive=true diff --git a/docs/source/config_options.rst b/docs/source/config_options.rst index 1cce34f..0a69105 100644 --- a/docs/source/config_options.rst +++ b/docs/source/config_options.rst @@ -54,7 +54,7 @@ General config properties for this connector. ``file:///data/${yyyy}/${MM}/${dd}/${HH}${mm}`` .. tip:: - If you want to ingest data from S3, you can add credentials with : + If you want to ingest data from S3, you can add credentials with: ``policy.fs.fs.s3a.access.key=`` and ``policy.fs.fs.s3a.secret.key=`` @@ -65,6 +65,13 @@ General config properties for this connector. * Type: string * Importance: high +``poll.interval.ms`` + Frequency in milliseconds to poll for new data. This config only applies once the policies have ended. + + * Type: int + * Default: ``10000`` + * Importance: medium + ``policy.class`` Policy class to apply (must implement ``com.github.mmolimar.kafka.connect.fs.policy.Policy`` interface). @@ -179,7 +186,21 @@ In order to configure custom properties for this policy, the name you must use i HDFS file watcher -------------------------------------------- -This policy does not have any additional configuration. +In order to configure custom properties for this policy, the name you must use is ``hdfs_file_watcher``. + +``policy.hdfs_file_watcher.poll`` + Time to wait until the records retrieved by the file watcher are sent to the source task. + + * Type: long + * Default: ``5000`` + * Importance: medium + +``policy.hdfs_file_watcher.retry`` + Sleep time to retry connections to HDFS when connection errors occur. + + * Type: long + * Default: ``20000`` + * Importance: medium .. 
_config_options-filereaders: diff --git a/docs/source/connector.rst b/docs/source/connector.rst index 0e02451..476aa7b 100644 --- a/docs/source/connector.rst +++ b/docs/source/connector.rst @@ -47,7 +47,7 @@ The ``kafka-connect-fs.properties`` file defines the following properties as req name=FsSourceConnector connector.class=com.github.mmolimar.kafka.connect.fs.FsSourceConnector tasks.max=1 - fs.uris=file:///data,hdfs://localhost:9000/ + fs.uris=file:///data,hdfs://localhost:8020/data topic=mytopic policy.class= policy.recursive=true @@ -70,7 +70,7 @@ The ``kafka-connect-fs.properties`` file defines the following properties as req A more detailed information about these properties can be found :ref:`here`. -Running in development +Running in local -------------------------------------------- .. sourcecode:: bash @@ -81,7 +81,25 @@ Running in development mvn clean package export CLASSPATH="$(find target/ -type f -name '*.jar'| grep '\-package' | tr '\n' ':')" - $KAFKA_HOME/bin/connect-distributed.sh config/kafka-connect-fs.properties + $KAFKA_HOME/bin/connect-standalone.sh $KAFKA_HOME/config/connect-standalone.properties config/kafka-connect-fs.properties + +Running in Docker +-------------------------------------------- + +.. sourcecode:: bash + + mvn clean package + +.. sourcecode:: bash + + docker build --build-arg PROJECT_VERSION= . + docker-compose build + docker-compose up -d + docker logs --tail="all" -f connect + +.. sourcecode:: bash + + curl -sX GET http://localhost:8083/connector-plugins | grep FsSourceConnector Components ============================================ From 4455bd47fb00a5b184838640270717254aaa868b Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Sun, 26 Apr 2020 21:48:50 -0500 Subject: [PATCH 48/51] Change to Oracle JDK8 in Travis --- .travis.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 2d90a0c..9a9aab4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,10 +1,10 @@ +dist: trusty language: java jdk: - - openjdk8 -sudo: false + - oraclejdk8 install: - - mvn test-compile -DskipTests=true -Dmaven.javadoc.skip=true -B -V + - mvn test-compile -DskipTests=true -Dmaven.javadoc.skip=true -B -V script: - - mvn test jacoco:report + - mvn test jacoco:report after_success: - - mvn coveralls:report \ No newline at end of file + - mvn coveralls:report From aa3f45091e5bd84e886cfdac6e555625f1111e77 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Mon, 27 Apr 2020 07:54:58 -0500 Subject: [PATCH 49/51] Fix typo in doc --- docs/source/policies.rst | 9 +++++---- .../github/mmolimar/kafka/connect/fs/FsSourceTask.java | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/source/policies.rst b/docs/source/policies.rst index dc0f607..1a5f654 100644 --- a/docs/source/policies.rst +++ b/docs/source/policies.rst @@ -1,10 +1,9 @@ Simple ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -It's a never-ending policy which just filters and processes files included in the corresponding URIs. +It's a policy which just filters and processes files included in the corresponding URIs one time. .. attention:: This policy is more oriented for testing purposes. - It never stops and Kafka Connect is continuously trying to poll data from the FS(s). Sleepy ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -28,10 +27,12 @@ HDFS file watcher ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ It uses Hadoop notifications events and all create/append/rename/close events will be reported -as new files to be ingested. 
+as files to be ingested. Just use it when you have HDFS URIs. -.. attention:: The URIs included in general property ``fs.uris`` will be filtered and only those +You can learn more about the properties of this policy :ref:`here`. + +.. attention:: The URIs included in the general property ``fs.uris`` will be filtered and only those ones which start with the prefix ``hdfs://`` will be watched. Also, this policy will only work for Hadoop versions 2.6.0 or higher. diff --git a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java index c3fa38f..bb2169a 100644 --- a/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java +++ b/src/main/java/com/github/mmolimar/kafka/connect/fs/FsSourceTask.java @@ -50,11 +50,11 @@ public void start(Map properties) { config = new FsSourceTaskConfig(properties); if (config.getClass(FsSourceTaskConfig.POLICY_CLASS).isAssignableFrom(Policy.class)) { throw new ConfigException("Policy class " + - config.getClass(FsSourceTaskConfig.POLICY_CLASS) + "is not a subclass of " + Policy.class); + config.getClass(FsSourceTaskConfig.POLICY_CLASS) + " is not a subclass of " + Policy.class); } if (config.getClass(FsSourceTaskConfig.FILE_READER_CLASS).isAssignableFrom(FileReader.class)) { throw new ConfigException("FileReader class " + - config.getClass(FsSourceTaskConfig.FILE_READER_CLASS) + "is not a subclass of " + FileReader.class); + config.getClass(FsSourceTaskConfig.FILE_READER_CLASS) + " is not a subclass of " + FileReader.class); } Class policyClass = (Class) Class.forName(properties.get(FsSourceTaskConfig.POLICY_CLASS)); From 9de08e0ad96b6fe687da48a54f74fe7ed4043a64 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Mon, 27 Apr 2020 10:16:27 -0500 Subject: [PATCH 50/51] Wait to receive events in HDFS file watcher test policy --- .../kafka/connect/fs/policy/PolicyTestBase.java | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java index 6af841b..ba77775 100644 --- a/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java +++ b/src/test/java/com/github/mmolimar/kafka/connect/fs/policy/PolicyTestBase.java @@ -130,10 +130,10 @@ public void oneFilePerFs(PolicyFsTestConfig fsConfig) throws IOException, Interr fs.createNewFile(new Path(dir, System.nanoTime() + ".txt")); //this file does not match the regexp fs.createNewFile(new Path(dir, System.nanoTime() + ".invalid")); - } - //we wait till FS has registered the files - Thread.sleep(500); + //we wait till FS has registered the files + Thread.sleep(3000); + } Iterator it = fsConfig.getPolicy().execute(); assertTrue(it.hasNext()); it.next(); @@ -152,10 +152,10 @@ public void recursiveDirectory(PolicyFsTestConfig fsConfig) throws IOException, fs.createNewFile(new Path(tmpDir, System.nanoTime() + ".txt")); //this file does not match the regexp fs.createNewFile(new Path(tmpDir, System.nanoTime() + ".invalid")); - } - //we wait till FS has registered the files - Thread.sleep(500); + //we wait till FS has registered the files + Thread.sleep(3000); + } Iterator it = fsConfig.getPolicy().execute(); assertTrue(it.hasNext()); it.next(); From 175df2ae54b9ccc14cc540d678e74d6623501ab5 Mon Sep 17 00:00:00 2001 From: Mario Molina Date: Mon, 27 Apr 2020 10:17:01 -0500 Subject: [PATCH 51/51] Release version 1.0.0 --- docker-compose.yml | 2 +- 
pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 498b645..e763372 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -45,7 +45,7 @@ services: SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181' connect-fs: - image: mmolimar/kafka-connect-fs:1.0.0-SNAPSHOT + image: mmolimar/kafka-connect-fs:1.0.0 container_name: connect depends_on: - cp-kafka diff --git a/pom.xml b/pom.xml index 2a1ae2c..bd22791 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ com.github.mmolimar.kafka.connect kafka-connect-fs - 1.0.0-SNAPSHOT + 1.0.0 jar kafka-connect-fs
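
Editor's note: the documentation patches above describe the connector's runtime configuration (``fs.uris``, ``policy.class``, ``poll.interval.ms``, the ``hdfs_file_watcher`` policy settings) and a Docker setup that exposes the Connect worker on port 8083. As a quick way to exercise that configuration, the sketch below registers a connector instance through the standard Kafka Connect REST API. It is only an illustration: the connector name, topic, URIs, regexp, and file reader are hypothetical values mirroring the sample ``config/kafka-connect-fs.properties`` shown in these patches, not part of the commits themselves.

.. sourcecode:: bash

    # Register an FsSourceConnector instance against a Connect worker on localhost:8083
    # (the port used by the docker-compose setup above). All values are illustrative.
    curl -sX POST -H "Content-Type: application/json" http://localhost:8083/connectors -d '{
      "name": "fs-source-example",
      "config": {
        "connector.class": "com.github.mmolimar.kafka.connect.fs.FsSourceConnector",
        "tasks.max": "1",
        "fs.uris": "file:///data,hdfs://localhost:8020/data",
        "topic": "mytopic",
        "policy.class": "com.github.mmolimar.kafka.connect.fs.policy.SimplePolicy",
        "policy.recursive": "true",
        "policy.regexp": ".*\\.txt$",
        "file_reader.class": "com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader",
        "poll.interval.ms": "10000"
      }
    }'

    # Check the connector and task status afterwards.
    curl -sX GET http://localhost:8083/connectors/fs-source-example/status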