Skip to content
This repository has been archived by the owner on Jul 7, 2023. It is now read-only.

Commit

Permalink
Merge pull request iipc#12 from ukwa/master
Browse files Browse the repository at this point in the history
Check WARC/GZ via file extension
  • Loading branch information
anjackson committed Mar 7, 2014
2 parents 08175bf + 4bf2929 commit 0715bf9
Show file tree
Hide file tree
Showing 3 changed files with 3,201 additions and 3 deletions.
14 changes: 11 additions & 3 deletions src/main/java/org/archive/io/warc/WARCReaderFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,20 @@ public static ArchiveReader get(final String s, final InputStream is,
atFirstRecord);
}

/*
* Note that the ARC companion does this differently, with quite a lot of duplication.
*
* @see org.archive.io.arc.ARCReaderFactory#getArchiveReader(String, InputStream, boolean)
*/
/**
 * Creates a WARC reader for the given stream, choosing between the
 * compressed and uncompressed reader based on the file extension.
 *
 * @param f name of the archive the stream came from; a name ending in
 *        ".gz" (compared case-insensitively) selects the compressed reader
 * @param is stream to read the archive from
 * @param atFirstRecord passed through to the compressed reader only; the
 *        uncompressed reader does not take it
 * @return a {@code CompressedWARCReader} for ".gz" names, otherwise an
 *         {@code UncompressedWARCReader}
 * @throws IOException declared for failures while setting up the reader
 */
protected ArchiveReader getArchiveReader(final String f,
        final InputStream is, final boolean atFirstRecord)
        throws IOException {
    // Check if it's compressed, based on file extension. The comparison
    // ignores case so that names such as "X.WARC.GZ" are still treated as
    // gzip. regionMatches returns false (rather than throwing) when the
    // name is shorter than the suffix.
    final String gzSuffix = ".gz";
    final boolean compressed = f.regionMatches(true,
            f.length() - gzSuffix.length(), gzSuffix, 0, gzSuffix.length());
    if (compressed) {
        return new CompressedWARCReader(f, is, atFirstRecord);
    }
    return new UncompressedWARCReader(f, is);
}

public static WARCReader get(final URL arcUrl, final long offset)
Expand Down
34 changes: 34 additions & 0 deletions src/test/java/org/archive/io/warc/WARCReaderFactoryTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package org.archive.io.warc;

import java.io.FileInputStream;
import java.io.IOException;

import org.archive.format.warc.WARCConstants;
import org.archive.format.warc.WARCConstants.WARCRecordType;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;

import junit.framework.TestCase;

/**
 * Checks that {@link WARCReaderFactory} can open WARC files with and
 * without a ".gz" extension and yields a 'warcinfo' record first.
 */
public class WARCReaderFactoryTest extends TestCase {

    // Test files: one with a ".warc.gz" name and one with a plain ".warc" name.
    String[] files = new String[] {
        "src/test/resources/org/archive/format/gzip/IAH-urls-wget.warc.gz",
        "src/test/resources/org/archive/format/warc/IAH-urls-wget.warc"
    };

    /**
     * Opens each test file through
     * {@code WARCReaderFactory.get(String, InputStream, boolean)} and
     * verifies the type header of the first record.
     *
     * @throws IOException if a test file cannot be opened or read
     */
    public void testGetStringInputstreamBoolean() throws IOException {
        // Check the test files can be opened:
        for (String file : files) {
            FileInputStream is = new FileInputStream(file);
            try {
                ArchiveReader ar = WARCReaderFactory.get(file, is, true);
                ArchiveRecord r = ar.get();
                String type = (String) r.getHeader()
                        .getHeaderValue(WARCConstants.HEADER_KEY_TYPE);
                // Check the first record comes out as a 'warcinfo' record.
                assertEquals("Unexpected first record type in " + file,
                        WARCRecordType.warcinfo.name(), type);
            } finally {
                // Release the file handle even when an assertion fails.
                is.close();
            }
        }
    }

}
Loading

0 comments on commit 0715bf9

Please sign in to comment.