Skip to content

Commit

Permalink
Use pica dat instead of xml dump #462
Browse files Browse the repository at this point in the history
  • Loading branch information
TobiasNx committed Jul 18, 2023
1 parent a728939 commit 10be853
Show file tree
Hide file tree
Showing 12 changed files with 61 additions and 1,264 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ app/transformation/input/*.csv
.cache*
/bin/
application-log*.gz
app/transformation/input/*.dat
45 changes: 32 additions & 13 deletions app/transformation/TransformSigel.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import org.metafacture.metafix.Metafix;
import org.metafacture.triples.StreamToTriples;
import org.metafacture.biblio.pica.PicaXmlHandler;
import org.metafacture.io.LineReader;
import org.metafacture.biblio.pica.PicaDecoder;
import org.metafacture.xml.XmlDecoder;
import org.metafacture.triples.TripleFilter;
import org.metafacture.xml.XmlElementSplitter;
Expand Down Expand Up @@ -52,14 +54,30 @@ public class TransformSigel {

static void process(String startOfUpdates, int intervalSize,
final String outputPath, String geoLookupServer) throws IOException {
splitUpSigelDump();
final FileOpener splitFileOpener = new FileOpener();
final FileOpener dumpOpener = new FileOpener();

StreamToTriples streamToTriples = new StreamToTriples();
streamToTriples.setRedirect(true);
final TripleFilter tripleFilter = new TripleFilter();
tripleFilter.setSubjectPattern(".+"); // Remove entries without id
JsonEncoder encodeJson = new JsonEncoder();
encodeJson.setPrettyPrinting(true);
dumpOpener//
.setReceiver(new LineReader())//
.setReceiver(new PicaDecoder())//
.setReceiver(new Metafix("conf/fix-sigel.fix"))//
.setReceiver(streamToTriples)//
.setReceiver(tripleFilter)//
.setReceiver(new TripleCollect())//
.setReceiver(TransformAll.fixEnriched(geoLookupServer))//
.setReceiver(encodeJson)//
.setReceiver(TransformAll.esBulk())//
.setReceiver(new ObjectWriter<>(outputPath));
dumpOpener.process(TransformAll.DATA_INPUT_DIR + "sigil.dat");

ObjectWriter objectWriter = new ObjectWriter<>(outputPath);
objectWriter.setAppendIfFileExists(true);
splitFileOpener//
.setReceiver(new XmlDecoder())//
.setReceiver(new PicaXmlHandler())//
Expand All @@ -70,7 +88,7 @@ static void process(String startOfUpdates, int intervalSize,
.setReceiver(TransformAll.fixEnriched(geoLookupServer))//
.setReceiver(encodeJson)//
.setReceiver(TransformAll.esBulk())//
.setReceiver(new ObjectWriter<>(outputPath));
.setReceiver(objectWriter);
if (!startOfUpdates.isEmpty()) {
processSigelUpdates(startOfUpdates, intervalSize);
}
Expand All @@ -80,19 +98,20 @@ static void process(String startOfUpdates, int intervalSize,
.collect(Collectors.toList()).forEach(path -> {
splitFileOpener.process(path.toString());
});
splitFileOpener.closeStream();


}

private static void splitUpSigelDump() {
final FileOpener dumpFileOpener = new FileOpener();
dumpFileOpener//
.setReceiver(new XmlDecoder())//
.setReceiver(new XmlElementSplitter(DUMP_TOP_LEVEL_TAG, DUMP_ENTITY))//
.setReceiver(
xmlFilenameWriter(TransformAll.DATA_OUTPUT_DIR, DUMP_XPATH));
dumpFileOpener.process(TransformAll.DATA_INPUT_DIR + "sigel.xml");
dumpFileOpener.closeStream();
}
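// NOTE: disabled because the dump is now read directly as a PICA .dat file in process(), so splitting the XML dump is no longer needed.
// private static void splitUpSigelDump() {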
// final FileOpener dumpFileOpener = new FileOpener();
// dumpFileOpener//
// .setReceiver(new XmlDecoder())//
// .setReceiver(new XmlElementSplitter(DUMP_TOP_LEVEL_TAG, DUMP_ENTITY))//
// .setReceiver(
// xmlFilenameWriter(TransformAll.DATA_OUTPUT_DIR, DUMP_XPATH));
// dumpFileOpener.process(TransformAll.DATA_INPUT_DIR + "sigel.xml");
// dumpFileOpener.closeStream();
// }

private static void processSigelUpdates(String startOfUpdates,
int intervalSize) {
Expand Down
2 changes: 1 addition & 1 deletion conf/application.conf
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ index.es.port.tcp=7310

index.remote=[10.1.1.106,127.0.0.1]

transformation.updates.start="2013-06-01"
transformation.updates.start="2023-06-01"
transformation.updates.interval.size=50
transformation.geo.lookup.server="http://gaia.hbz-nrw.de:4000/v1/search"
transformation.geo.lookup.threshold=0.675
Expand Down
Loading

0 comments on commit 10be853

Please sign in to comment.