Merge pull request #11 from kbrbe/development

New features for version 0.3.0
kbrbe · Jun 28, 2023 · 1fa9f19 · 1fa9f19
2 parents 319bde9 + 795bb21
commit 1fa9f19
Show file tree

Hide file tree

Showing 6 changed files with 72 additions and 18 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.3.0] - 2023-06-28
+
+As of this version, the tool can be installed via `pip` and can be used as a library and not only via the commandline.
+
+### Added
+
+- The possibility to install the tool via `pip` ([#7](https://github.com/kbrbe/enrich-authority-csv/issues/7))
+
+### Changed
+
+- The import of the lib module is no longer relative. This means that, to use the tool without installing it, one has to set the `PYTHONPATH` environment variable, for example `export PYTHONPATH=/home/youruser/repo/enrich-authority-csv`.
 
 ## [0.2.0] - 2023-06-23
 
@@ -31,3 +42,4 @@ Mainly because the script was generalized to handle more than just the ISNI SRU
 
 [0.1.0]: https://github.com/kbrbe/enrich-authority-csv/releases/tag/v0.1.0
 [0.2.0]: https://github.com/kbrbe/enrich-authority-csv/compare/v0.1.0...v0.2.0
+[0.3.0]: https://github.com/kbrbe/enrich-authority-csv/compare/v0.2.0...v0.3.0
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ ISNI is the [ISO 27729:2012](https://www.iso.org/standard/44292.html) standard n
 Given a CSV file where each row is a contributor to creative works, this script uses a specified identifier in one of the columns to
 fill data gaps in other specified columns based on data available via a specified SRU API.
 
-## Usage
+## Usage via the commandline
 
 Create and activate a Python virtual environment
 ```bash
@@ -83,8 +83,29 @@ The script will first provide some statistics of how many rows could possibly be
 by looping over the input file in a streaming fashion.
 Afterwards the script starts requesting data, progress is shown in a progress bar.
 
+## Usage as a library
 
-### Example output
+The tool can also be used as a library within another Python script or a Jupyter notebook.
+
+```python
+from enrich_authority_csv.enrich_authority_csv import main as enrich_authority_csv
+
+enrich_authority_csv(
+  configFile='config-example.json',
+  inputFile='input-file.csv',
+  outputFile='output-file.csv',
+  apiName='BnF',
+  query='aut.isni all',
+  recordSchema='unimarcxchange',
+  dataFields={'nationalities': 'nationality'},
+  delimiter=',',
+  secondsBetweenAPIRequests=0,
+  identifierColumnName='isniIDs')
+
+```
+
+
+## Example output
 
 ```bash
 In total, the file contains 299 lines from which 298 contain the identifier to lookup (99.67%)

diff --git a/enrich_authority_csv/__init__.py b/enrich_authority_csv/__init__.py
diff --git a/enrich_authority_csv/enrich_authority_csv.py b/enrich_authority_csv/enrich_authority_csv.py
@@ -1,16 +1,15 @@
 import os
 import csv
 from dotenv import load_dotenv
-from config_parser import ConfigParser
-import lib
+from enrich_authority_csv.config_parser import ConfigParser
+import enrich_authority_csv.lib as lib
 import time
 from tqdm import tqdm
 from argparse import ArgumentParser
 
 
 # -----------------------------------------------------------------------------
-def main():
-
+def parseArguments():
   parser = ArgumentParser(description='This script reads a CSV file and requests for each found lookup identifier (in the column specified with --column-name-lookup-identifier) the datafields specified with --data')
   parser.add_argument('-i', '--input-file', action='store', required=True, help='A CSV file that contains records about contributors')
   parser.add_argument('-o', '--output-file', action='store', required=True, help='The CSV file in which the enriched records are stored')
@@ -24,24 +23,22 @@ def main():
   parser.add_argument('-d', '--delimiter', action='store', default=',', help='The delimiter of the input CSV')
   args = parser.parse_args()
 
+  return args
 
-  config = ConfigParser(args.config)
-  apiName = args.api
-  query = args.query
-  recordSchema = args.record_schema
+# -----------------------------------------------------------------------------
+def main(configFile, inputFile, outputFile, apiName, query, recordSchema, dataFields, delimiter, secondsBetweenAPIRequests, identifierColumnName):
+
+
+  config = ConfigParser(configFile)
 
-  dataFields = dict(map(lambda s: s.split('='), args.data))
 
   # check if the requested data can be fetched based on the given API config
   lib.verifyTask(config, apiName, recordSchema, dataFields)
 
 
-  delimiter = args.delimiter
-  secondsBetweenAPIRequests = args.wait
-  identifierColumnName = args.column_name_lookup_identifier
 
-  with open(args.input_file, 'r') as inFile, \
-       open(args.output_file, 'w') as outFile:
+  with open(inputFile, 'r') as inFile, \
+       open(outputFile, 'w') as outFile:
 
 
     # Count some stats and reset the file pointer afterwards
@@ -172,4 +169,7 @@ def main():
       print()
       print(f'{lookupIdentifierName}: No missing values that would have a lookup identifier. So there is nothing to enrich')
 
-main()
+if __name__ == '__main__':
+  args = parseArguments()
+  dataFields = dict(map(lambda s: s.split('='), args.data))
+  main(args.config, args.input_file, args.output_file, args.api, args.query, args.record_schema, dataFields, args.delimiter, args.wait, args.column_name_lookup_identifier)
diff --git a/enrich_authority_csv/lib.py b/enrich_authority_csv/lib.py
@@ -198,7 +198,7 @@ def initializeCounters(countReader, identifiers, isniColumnName, nationalityColu
   ... {'kbrIDs':'','ntaIDs':'','isniIDs':'002;003'},
   ... {'kbrIDs':'123','ntaIDs':'456','isniIDs':'002;003'}]
   >>> initializeCounters(rows, {'kbrIDs':'KBR', 'ntaIDs':'NTA'}, 'isniIDs')
-  {'numberRows': 5, 'numberRowsHaveISNI': 3, 'numberISNIs': 5, 'numberRowsMissingAtLeastOneIdentifier': 4, 'KBR': {'numberMissingIdentifierRows': 3, 'numberISNIs': 5, 'numberRowsToBeEnrichedHaveISNI': 2, 'numberRowsThatCannotBeEnriched': 1}, 'NTA': {'numberMissingIdentifierRows': 4, 'numberISNIs': 5, 'numberRowsToBeEnrichedHaveISNI': 2, 'numberRowsThatCannotBeEnriched': 2}}
+  {'numberRows': 5, 'numberRowsHaveISNI': 3, 'numberISNIs': 5, 'numberRowsMissingAtLeastOneIdentifier': 4, 'numberRowsMissingAndPossibleToBeEnriched': 2, 'KBR': {'numberMissingIdentifierRows': 3, 'numberISNIs': 5, 'numberRowsToBeEnrichedHaveISNI': 2, 'numberRowsThatCannotBeEnriched': 1, 'numberFoundISNIRows': 0, 'numberFoundISNIs': 0}, 'NTA': {'numberMissingIdentifierRows': 4, 'numberISNIs': 5, 'numberRowsToBeEnrichedHaveISNI': 2, 'numberRowsThatCannotBeEnriched': 2, 'numberFoundISNIRows': 0, 'numberFoundISNIs': 0}}
   """
 
   # initialize counters

diff --git a/setup.py b/setup.py
@@ -0,0 +1,21 @@
+import os
+from setuptools import setup
+
+# Utility function to read the README file.
+# Used for the long_description.  It's nice, because now 1) we have a top level
+# README file and 2) it's easier to type in the README file than to put a raw
+# string in below ...
+def read(fname):
+    return open(os.path.join(os.path.dirname(__file__), fname)).read()
+
+setup(
+    name = "enrich_authority_csv",
+    version = "0.3.0",
+    author = "Sven Lieber",
+    author_email = "[email protected]",
+    description = ("A python script that uses SRU APIs to complete a CSV file with missing data based on an available identifier column that can be looked up in the SRU API"),
+    license = "AGPL-3.0",
+    keywords = "csv authority-control isni authority-files enriching",
+    packages=setuptools.find_packages(),
+    long_description=read('README.md')
+)