Skip to content

Commit

Permalink
Merge pull request #1138 from libris/feature/separate-scripts
Browse files Browse the repository at this point in the history
Put scripts in separate files to run normally with whelktool
  • Loading branch information
olovy authored Aug 22, 2022
2 parents de8fc25 + 49284d2 commit e40a509
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,6 @@ import whelk.Whelk
import whelk.util.Unicode

class Util {
static def whelk = Whelk.createLoadedCoreWhelk()

static def clusters = 'clusters'

static def contributionPath = ['@graph', 1, 'instanceOf', 'contribution']

static def titleComponents = ['mainTitle', 'titleRemainder', 'subtitle', 'hasPart', 'partNumber', 'partName', 'marc:parallelTitle', 'marc:equalTitle']

static def titleVariant = ['Title', 'ParallelTitle']
Expand Down Expand Up @@ -154,7 +148,7 @@ class Util {
.collect { it['flatTitle'] }
}

static String chipString(def thing) {
static String chipString(def thing, Whelk whelk) {
if (thing instanceof Integer) {
return thing
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@ import whelk.Document

import static datatool.scripts.mergeworks.Util.asList
import static datatool.scripts.mergeworks.Util.getPathSafe
import static datatool.scripts.mergeworks.Util.contributionPath
import static datatool.scripts.mergeworks.Util.clusters
import static datatool.scripts.mergeworks.Util.Relator

/**
Expand All @@ -15,10 +13,11 @@ import static datatool.scripts.mergeworks.Util.Relator

PrintWriter report = getReportWriter("report.txt")

def contributionPath = ['@graph', 1, 'instanceOf', 'contribution']
def ill = ['@id': Relator.ILLUSTRATOR.iri]
def pu = ['@id': Relator.PRIMARY_RIGHTS_HOLDER.iri]

new File(System.getProperty(clusters)).splitEachLine('\t') { cluster ->
new File(System.getProperty('clusters')).splitEachLine('\t') { cluster ->
incrementStats('add 9pu', 'clusters checked')

def docs = Collections.synchronizedList([])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ import whelk.Document

import java.util.regex.Pattern

import static datatool.scripts.mergeworks.Util.contributionPath
import static datatool.scripts.mergeworks.Util.clusters
import static datatool.scripts.mergeworks.Util.asList
import static datatool.scripts.mergeworks.Util.chipString
import static datatool.scripts.mergeworks.Util.getPathSafe
Expand All @@ -22,7 +20,11 @@ import static datatool.scripts.mergeworks.Util.Relator

PrintWriter report = getReportWriter("report.txt")

new File(System.getProperty(clusters)).splitEachLine('\t') { cluster ->
def whelk = getWhelk()

def contributionPath = ['@graph', 1, 'instanceOf', 'contribution']

new File(System.getProperty('clusters')).splitEachLine('\t') { cluster ->
incrementStats('fetch contribution from respStatement', 'clusters checked')

def docs = Collections.synchronizedList([])
Expand Down Expand Up @@ -92,7 +94,7 @@ new File(System.getProperty(clusters)).splitEachLine('\t') { cluster ->
def roleShort = r.iri.split('/').last()
incrementStats('fetch contribution from respStatement', "$roleShort roles specified")

report.println("${chipString(c)} (${d.shortId}) <- $roleShort")
report.println("${chipString(c, whelk)} (${d.shortId}) <- $roleShort")
}
}

Expand Down Expand Up @@ -124,7 +126,7 @@ new File(System.getProperty(clusters)).splitEachLine('\t') { cluster ->
def roleShort = it.getV1().iri.split('/').last()
incrementStats('fetch contribution from respStatement', "$roleShort found in cluster")
}
report.println("${d.shortId} <- ${chipString(matched)} (${other.shortId})")
report.println("${d.shortId} <- ${chipString(matched, whelk)} (${other.shortId})")
changed = true
break
}
Expand Down Expand Up @@ -243,3 +245,15 @@ private List<String> parseNames(Pattern namePattern, Pattern conjPattern, String

return names
}

/**
 * Obtains a Whelk handle for use by this script.
 *
 * Workaround: selects the single record 'https://id.kb.se/marc' and reads the
 * `whelk` property off the item handed to the selection closure.
 * NOTE(review): assumes selectByIds has invoked the closure before it returns — confirm.
 *
 * @return the object found in the selected item's `whelk` property
 * @throws RuntimeException if no truthy handle was captured
 */
def getWhelk() {
    def handle = null
    selectByIds(['https://id.kb.se/marc']) { item ->
        // Capture the handle exposed by the selected item.
        handle = item.whelk
    }
    if (handle) {
        return handle
    }
    throw new RuntimeException("Could not get Whelk")
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import groovy.transform.Memoized
import whelk.util.DocumentUtil

import static datatool.scripts.mergeworks.Util.getPathSafe
import static datatool.scripts.mergeworks.Util.clusters

/**
Example:
Expand All @@ -13,7 +12,7 @@ import static datatool.scripts.mergeworks.Util.clusters

PrintWriter report = getReportWriter("report.txt")

def ids = new File(System.getProperty(clusters))
def ids = new File(System.getProperty('clusters'))
.readLines()
.collect { it.split('\t').collect { it.trim()} }
.flatten()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,9 @@ package datatool.scripts.mergeworks.normalize

import groovy.transform.Memoized
import whelk.Document
import whelk.Whelk

import static datatool.scripts.mergeworks.Util.contributionPath
import static datatool.scripts.mergeworks.Util.asList
import static datatool.scripts.mergeworks.Util.chipString
import static datatool.scripts.mergeworks.Util.getClusters
import static datatool.scripts.mergeworks.Util.getPathSafe
import static datatool.scripts.mergeworks.Util.nameMatch
import static datatool.scripts.mergeworks.Util.Relator
Expand All @@ -19,9 +16,11 @@ import static datatool.scripts.mergeworks.Util.Relator

PrintWriter report = getReportWriter("report.txt")

def whelk = Whelk.createLoadedCoreWhelk()
def whelk = getWhelk()

new File(System.getProperty(clusters)).splitEachLine('\t') { cluster ->
def contributionPath = ['@graph', 1, 'instanceOf', 'contribution']

new File(System.getProperty('clusters')).splitEachLine('\t') { cluster ->
def docs = Collections.synchronizedList([])
selectByIds(cluster.collect { it.trim() }) {
docs << it.doc
Expand Down Expand Up @@ -58,12 +57,12 @@ new File(System.getProperty(clusters)).splitEachLine('\t') { cluster ->
agentMatches(c.agent, it) && (!c.role || it.roles.containsAll(c.role))
}
if (l) {
report.println("${d.shortId} ${chipString(c)} --> ${chipString(l)}")
report.println("${d.shortId} ${chipString(c, whelk)} --> ${chipString(l, whelk)}")
c.agent = ['@id': l['@id']]
changed = true
incrementStats('link contribution', 'agents linked')
} else {
report.println("${d.shortId} NO MATCH: ${chipString(c)} ??? ${linked.collect { chipString(it) }}")
report.println("${d.shortId} NO MATCH: ${chipString(c, whelk)} ??? ${linked.collect { chipString(it, whelk) }}")
}
}
if (c['@type'] == 'PrimaryContribution' && !c.role) {
Expand Down Expand Up @@ -111,4 +110,16 @@ static boolean yearMismatch(Map local, Map linked) {
b || d
}

/**
 * Returns a Whelk handle for the rest of the script to use.
 *
 * This is a workaround: it runs a selection on the known id
 * 'https://id.kb.se/marc' and picks up the `whelk` property from the item
 * passed to the closure.
 * NOTE(review): relies on selectByIds having run the closure by the time it returns — confirm.
 *
 * @return the value of the selected item's `whelk` property
 * @throws RuntimeException when nothing truthy was picked up
 */
def getWhelk() {
    def found = null
    selectByIds(['https://id.kb.se/marc']) { selected ->
        found = selected.whelk
    }
    // Fail loudly rather than hand back a null/falsy handle.
    if (found) {
        return found
    }
    throw new RuntimeException("Could not get Whelk")
}


0 comments on commit e40a509

Please sign in to comment.