Skip to content

Commit

Permalink
WARC spout to emit captures into topology (implements apache#755)
Browse files Browse the repository at this point in the history
- emit content tuple with ID (URL)
  • Loading branch information
sebastian-nagel committed Jun 24, 2020
1 parent 9795f5b commit cd8f028
Showing 1 changed file with 1 addition and 4 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -71,7 +71,6 @@ public class WARCSpout extends FileSpout {

private int maxContentSize = -1;
private int contentBufferSize = 8192;
- private long sleepEmitFetched = 0;

private boolean storeHTTPHeaders = false;
private String protocolMDprefix = "";
Expand Down Expand Up @@ -356,8 +355,6 @@ record = Optional.empty();
false);
protocolMDprefix = ConfUtils.getString(conf,
ProtocolResponse.PROTOCOL_MD_PREFIX_PARAM, protocolMDprefix);
- sleepEmitFetched = ConfUtils.getLong(conf,
- "warc.spout.emit.fetched.sleep.ms", 0);

int metricsTimeBucketSecs = ConfUtils.getInt(conf,
"fetcher.metrics.time.bucket.secs", 10);
Expand Down Expand Up @@ -498,7 +495,7 @@ public void nextTuple() {

nextRecord(offset, metadata); // proceed and calculate length

- _collector.emit(new Values(url, content, metadata));
+ _collector.emit(new Values(url, content, metadata), url);

return;
}
Expand Down

0 comments on commit cd8f028

Please sign in to comment.