diff --git a/README.md b/README.md index 34d7524..e3a6cd8 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,6 @@ [![Documentation Status](https://readthedocs.org/projects/supermat/badge/?version=latest)](https://supermat.readthedocs.io/en/latest/?badge=latest) [![Build unstable](https://github.com/lfoppiano/SuperMat/actions/workflows/ci-build.yml/badge.svg)](https://github.com/lfoppiano/SuperMat/actions/workflows/ci-build.yml) - # SuperMat SuperMat (Superconductors Material) dataset is a manually **linked** **annotated** dataset of superconductors related materials and properties. @@ -13,66 +12,73 @@ SuperMat (Superconductors Material) dataset is a manually **linked** **annotated - Sources are referenced in the [Bibliographic](data/biblio) data - :warning: The annotations are not public due to copyright, however - :fire: SuperMat can be considerd one of the few un-biased dataset for LLMs evaluation :fire: - - Tabular version of the linked annotated entities in the dataset [CSV](data/csv/SuperMat-1.0.csv) (*) + - CSV of the linked annotated entities in the dataset [CSV](data/csv/SuperMat-1.0.csv) (*) - Material data for segmenting inorganic material names - Annotation guidelines: - [Online version](https://supermat.readthedocs.io) - [Changelog](docs/CHANGELOG.md) - [Source](docs), - [Transformation scripts](super_mat/converters) - - [tsv2xml](super_mat/converters/tsv2xml.py) / [xml2tsv](super_mat/converters/xml2tsv.py): Transformation from an to the INCEpTION TSV 3.2 format - - [xmlSupermat2csv](super_mat/converters/xmlSupermat2csv.py): Converts the corpus into the CSV (*) tabular format + - [tsv2xml](scripts/tsv2xml.py) / [xml2tsv](scripts/xml2tsv.py): Transformation from an to the INCEpTION TSV 3.2 format + - [xmlSupermat2csv](scripts/xmlSupermat2csv.py): Converts the corpus into the CSV (*) tabular format - Analysis Jupyter Notebooks: - - [dataset-analysis-labelling.ipynb](super_mat/dataset-analysis-labelling.ipynb) - - [dataset-analysis-linking.ipynb](super_mat/dataset-analysis-linking.ipynb) - - [dataset-analysis-papers.ipynb](super_mat/dataset-analysis-papers.ipynb) - -## Dataset information + - [dataset-analysis-labelling.ipynb](scripts/jupyter/dataset-analysis-labelling.ipynb) + - [dataset-analysis-linking.ipynb](scripts/jupyter/dataset-analysis-linking.ipynb) + - [dataset-analysis-papers.ipynb](scripts/jupyter/dataset-analysis-papers.ipynb) + +Feel free to contact us for any information. 
## Reference If you use the data, please consider citing the related paper: -``` +```bibtex @article{doi:10.1080/27660400.2021.1918396, -author = {Luca Foppiano and Sae Dieb and Akira Suzuki and Pedro Baptista de Castro and Suguru Iwasaki and Azusa Uzuki and Miren Garbine Esparza Echevarria and Yan Meng and Kensei Terashima and Laurent Romary and Yoshihiko Takano and Masashi Ishii}, -title = {SuperMat: construction of a linked annotated dataset from superconductors-related publications}, -journal = {Science and Technology of Advanced Materials: Methods}, -volume = {1}, -number = {1}, -pages = {34-44}, -year = {2021}, -publisher = {Taylor & Francis}, -doi = {10.1080/27660400.2021.1918396}, - -URL = { - https://doi.org/10.1080/27660400.2021.1918396 - -}, -eprint = { - https://doi.org/10.1080/27660400.2021.1918396 - -} - + author = {Luca Foppiano and Sae Dieb and Akira Suzuki and Pedro Baptista de Castro and Suguru Iwasaki and Azusa Uzuki and Miren Garbine Esparza Echevarria and Yan Meng and Kensei Terashima and Laurent Romary and Yoshihiko Takano and Masashi Ishii}, + title = {SuperMat: construction of a linked annotated dataset from superconductors-related publications}, + journal = {Science and Technology of Advanced Materials: Methods}, + volume = {1}, + number = {1}, + pages = {34-44}, + year = {2021}, + publisher = {Taylor & Francis}, + doi = {10.1080/27660400.2021.1918396}, + + URL = { + https://doi.org/10.1080/27660400.2021.1918396 + }, + eprint = { + https://doi.org/10.1080/27660400.2021.1918396 + } } ``` ## Usage -### Conversion tools +### Getting started To use the scripts and analysis data -> conda create --name SuperMat pip +```bash +conda create --name SuperMat pip +pip install -r requirements.txt +``` + +### Conversion tools + +```bash +python scripts/tsv2xml.py --help +``` -> pip install -r requirements.txt ### Analysis tools The analysis tools provide statistics and information from the dataset, they also run consistency checks of the format and content. Results can be seen directly on the repository. 
- -> jupyter-lab + +```bash +jupyter-lab +``` ### Annotation guidelines @@ -81,10 +87,11 @@ We use reStructured TExt using the utility [Sphinx](https://www.sphinx-doc.org/e To build this documentation locally, we recommend to create a virtual environment such as `virtualenv` or `conda`: -> conda create -name guidelines -> conda activate guidelines -> -> conda install sphinx +```bash +conda create -name guidelines +conda activate guidelines +conda install sphinx +``` #### Build HTML site diff --git a/pyproject.toml b/pyproject.toml index b3923e3..a7d16b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,6 @@ readme = "README.md" dynamic = ['version'] [tool.setuptools] -py-modules = ['supermat'] include-package-data = false [tool.setuptools_scm] diff --git a/requirements.txt b/requirements.txt index 8dd57b0..cf2521a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ matplotlib gensim pandas regex -bump-my-version \ No newline at end of file +bump-my-version +supermat \ No newline at end of file diff --git a/supermat/dataset-analysis-labelling.ipynb b/scripts/jupyter/dataset-analysis-labelling.ipynb similarity index 100% rename from supermat/dataset-analysis-labelling.ipynb rename to scripts/jupyter/dataset-analysis-labelling.ipynb diff --git a/supermat/dataset-analysis-linking.ipynb b/scripts/jupyter/dataset-analysis-linking.ipynb similarity index 100% rename from supermat/dataset-analysis-linking.ipynb rename to scripts/jupyter/dataset-analysis-linking.ipynb diff --git a/supermat/dataset-analysis-papers.ipynb b/scripts/jupyter/dataset-analysis-papers.ipynb similarity index 100% rename from supermat/dataset-analysis-papers.ipynb rename to scripts/jupyter/dataset-analysis-papers.ipynb diff --git a/supermat/converters/tsv2xml.py b/scripts/tsv2xml.py similarity index 99% rename from supermat/converters/tsv2xml.py rename to scripts/tsv2xml.py index a4b3135..ed87627 100644 --- a/supermat/converters/tsv2xml.py +++ b/scripts/tsv2xml.py @@ -1,4 +1,3 @@ -# transform tei annotation into prodigy annotations import argparse import os from html import escape diff --git a/scripts/xml2csv.py b/scripts/xml2csv.py new file mode 100644 index 0000000..c228586 --- /dev/null +++ b/scripts/xml2csv.py @@ -0,0 +1,240 @@ +import argparse +import csv +import os +from pathlib import Path + +from bs4 import BeautifulSoup, Tag + +from supermat.supermat_tei_parser import get_children_list_grouped + +paragraph_id = 'paragraph_id' + + +def write_on_file(fw, filename, sentenceText, dic_token): + links = len([token for token in dic_token if token[5] != '_']) + has_links = 0 if links == 0 else 1 + fw.writerow([filename, sentenceText, has_links]) + + +def process_file(finput): + filename = Path(finput).name.split(".superconductors")[0] + with open(finput, encoding='utf-8') as fp: + doc = fp.read() + + # mod_tags = re.finditer(r'(\w+>) ', doc) + # for mod in mod_tags: + # doc = doc.replace(mod.group(), ' ' + mod.group(1)) + # print(doc) + soup = BeautifulSoup(doc, 'xml') + + paragraphs_grouped = get_children_list_grouped(soup) + + dic_dest_relationships = {} + dic_source_relationships = {} + ient = 1 + i = 0 + for para_id, paragraph in enumerate(paragraphs_grouped): + for sent_id, sentence in enumerate(paragraph): + j = 0 + for item in sentence.contents: + if type(item) is Tag: + if 'type' not in item.attrs: + raise Exception("RS without type is invalid. 
Stopping") + entity_class = item.attrs['type'] + entity_text = item.text + + if len(item.attrs) > 0: + if 'xml:id' in item.attrs: + if item.attrs['xml:id'] not in dic_dest_relationships: + dic_dest_relationships[item.attrs['xml:id']] = [i + 1, j + 1, ient, entity_text, + entity_class, para_id, sent_id] + + if 'corresp' in item.attrs: + if (i + 1, j + 1) not in dic_source_relationships: + dic_source_relationships[i + 1, j + 1] = [item.attrs['corresp'].replace('#', ''), ient, + entity_text, entity_class, para_id, sent_id] + j += 1 + ient += 1 + i += 1 + + output = [] + output_idx = [] + + struct = { + 'id': None, + 'filename': None, + 'passage_id': None, + 'material': None, + 'tcValue': None, + 'pressure': None, + 'me_method': None, + 'sentence': None + } + mapping = {} + + for label in list(struct.keys()): + if label not in mapping: + mapping[label] = {} + + for par_num, token_num in dic_source_relationships: + source_item = dic_source_relationships[par_num, token_num] + source_entity_id = source_item[1] + source_id = str(par_num) + '-' + str(token_num) + source_text = source_item[2] + source_label = source_item[3] + + # destination_xml_id: Use this to pick up information from dic_dest_relationship + destination_xml_id = source_item[0] + + for des in destination_xml_id.split(","): + destination_item = dic_dest_relationships[str(des)] + + destination_id = destination_item[2] + destination_text = destination_item[3] + destination_label = destination_item[4] + destination_para = destination_item[5] + destination_sent = destination_item[6] + if destination_label != label: + continue + + # try: + # relationship_name = get_relationship_name(source_label, destination_label) + # except Exception as e: + # return [] + + if source_label not in mapping: + mapping[source_label] = {} + + if destination_id in mapping[destination_label]: + indexes_in_output_table = mapping[destination_label][destination_id] + for index_in_output_table in indexes_in_output_table: + if source_label in output[index_in_output_table]: + row_copy = {key: value for key, value in output[index_in_output_table].items()} + row_copy[destination_label] = destination_text + row_copy[source_label] = source_text + row_copy['filename'] = filename + row_copy[paragraph_id] = destination_para + output.append(row_copy) + # output.append({destination_label: destination_text, source_label: source_text}) + else: + output[index_in_output_table][source_label] = source_text + elif source_entity_id in mapping[source_label]: + indexes_in_output_table = mapping[source_label][source_entity_id] + for index_in_output_table in indexes_in_output_table: + if destination_label in output[index_in_output_table]: + # output.append({destination_label: destination_text, source_label: source_text}) + # if source_label in output[index_in_output_table]: + # output.append({destination_label: destination_text, source_label: source_text}) + # else: + row_copy = {key: value for key, value in output[index_in_output_table].items()} + row_copy[source_label] = source_text + row_copy[destination_label] = destination_text + row_copy['filename'] = filename + row_copy[paragraph_id] = destination_para + output.append(row_copy) + else: + output[index_in_output_table][destination_label] = destination_text + else: + output.append({ + destination_label: destination_text, + source_label: source_text, + 'filename': filename, + paragraph_id: destination_para}) + output_idx.append({ + destination_label: destination_id, + source_label: source_id, + 'filename': filename, + paragraph_id: 
destination_para + }) + + current_index = len(output) - 1 + if destination_id not in mapping[destination_label]: + mapping[destination_label][destination_id] = set() + mapping[destination_label][destination_id].add(current_index) + else: + mapping[destination_label][destination_id].add(current_index) + + if source_entity_id not in mapping[source_label]: + mapping[source_label][source_entity_id] = set() + mapping[source_label][source_entity_id].add(current_index) + else: + mapping[source_label][source_entity_id].add(current_index) + + return output + + +def writeOutput(data, output_path, format): + delimiter = '\t' if format == 'tsv' else ',' + fw = csv.writer(open(output_path, encoding='utf-8', mode='w'), delimiter=delimiter, quotechar='"') + columns = ['id', 'filename', paragraph_id, 'material', 'tcValue', 'pressure', 'me_method'] + fw.writerow(columns) + for d in data: + fw.writerow([d[c] if c in d else '' for c in columns]) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="Converter XML (Supermat) to a tabular values (CSV, TSV)") + + parser.add_argument("--input", help="Input file or directory", required=True) + parser.add_argument("--output", help="Output directory", required=True) + parser.add_argument("--recursive", action="store_true", default=False, + help="Process input directory recursively. If input is a file, this parameter is ignored.") + parser.add_argument("--format", default='csv', choices=['tsv', 'csv'], + help="Output format.") + parser.add_argument("--filter", default='all', choices=['all', 'oa', 'non-oa'], + help='Extract data from a certain type of licenced documents') + + args = parser.parse_args() + + input = args.input + output = args.output + recursive = args.recursive + format = args.format + filter = args.filter + + if os.path.isdir(input): + path_list = [] + + if recursive: + for root, dirs, files in os.walk(input): + for file_ in files: + if not file_.lower().endswith(".xml"): + continue + + if filter == 'oa': + if '-CC' not in file_: + continue + elif filter == 'non-oa': + if '-CC' in file_: + continue + + abs_path = os.path.join(root, file_) + path_list.append(abs_path) + + else: + path_list = Path(input).glob('*.xml') + + data_sorted = [] + for path in path_list: + print("Processing: ", path) + file_data = process_file(path) + data = sorted(file_data, key=lambda k: k[paragraph_id]) + data_sorted.extend(data) + + if os.path.isdir(str(output)): + output_path = os.path.join(output, "output") + "." + format + else: + parent_dir = Path(output).parent + output_path = os.path.join(parent_dir, "output." + format) + + elif os.path.isfile(input): + input_path = Path(input) + data = process_file(input_path) + data_sorted = sorted(data, key=lambda k: k[paragraph_id]) + output_filename = input_path.stem + output_path = os.path.join(output, str(output_filename) + "." 
+ format) + + data = [{**record, **{"id": idx}} for idx, record in enumerate(data_sorted)] + + writeOutput(data, output_path, format) diff --git a/scripts/xml2csv_entities.py b/scripts/xml2csv_entities.py new file mode 100644 index 0000000..74e75b9 --- /dev/null +++ b/scripts/xml2csv_entities.py @@ -0,0 +1,140 @@ +import argparse +import csv +import os +from pathlib import Path + +from supermat.supermat_tei_parser import process_file_to_json + +paragraph_id = 'paragraph_id' + + +def write_on_file(fw, filename, sentenceText, dic_token): + links = len([token for token in dic_token if token[5] != '_']) + has_links = 0 if links == 0 else 1 + fw.writerow([filename, sentenceText, has_links]) + + +def write_output(data, output_path, format, header): + delimiter = '\t' if format == 'tsv' else ',' + fw = csv.writer(open(output_path, encoding='utf-8', mode='w'), delimiter=delimiter, quotechar='"') + fw.writerow(header) + fw.writerows(data) + + +def get_entity_data(data_sorted, ent_type): + ent_data = [[pid, data_sorted['doc_key'], pid, "".join(data_sorted['passages'][pid][entity[0]:entity[1]])] for + pid in range(0, len(data_sorted['ner'])) for entity in + filter(lambda e: e[2] == ent_type, data_sorted['ner'][pid])] + + # We remove the duplicates of the materials that falls in the same passage + seen_values = set() + ent_data_no_duplicates = [item for item in ent_data if + str(item[1]) + str(item[2]) + str(item[3]) not in seen_values and not seen_values.add( + str(item[1]) + str(item[2]) + str(item[3]))] + + return ent_data_no_duplicates + + +def get_texts(data_sorted): + text_data = [[idx, data_sorted['doc_key'], idx, "".join(data_sorted['passages'][idx])] for idx in + range(0, len(data_sorted['passages']))] + + return text_data + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="Converter XML (Supermat) to a tabular values (CSV, TSV) for entity extraction (no relation information are used)") + + parser.add_argument("--input", help="Input file or directory", required=True) + parser.add_argument("--output", help="Output directory", required=True) + parser.add_argument("--recursive", + action="store_true", + default=False, + help="Process input directory recursively. If input is a file, this parameter is ignored.") + parser.add_argument("--format", + default='csv', + choices=['tsv', 'csv'], + help="Output format.") + parser.add_argument("--use-sentences", + default=False, + action="store_true", + help="Uses sentences instead of paragraphs") + + args = parser.parse_args() + + input = args.input + output = args.output + recursive = args.recursive + format = args.format + use_sentences = args.use_sentences + + if os.path.isdir(input): + path_list = [] + + if recursive: + for root, dirs, files in os.walk(input): + for file_ in files: + if not file_.lower().endswith(".xml"): + continue + + abs_path = os.path.join(root, file_) + path_list.append(abs_path) + + else: + path_list = Path(input).glob('*.xml') + + entities_data = [] + texts_data = [] + ent_type = "material" + for path in path_list: + print("Processing: ", path) + file_data = process_file_to_json(path, not use_sentences) + # data = sorted(file_data, key=lambda k: k[paragraph_id]) + entity_data = get_entity_data(file_data, ent_type) + entities_data.extend(entity_data) + + text_data = get_texts(file_data) + texts_data.extend(text_data) + + if os.path.isdir(str(output)): + output_path_text = os.path.join(output, "output-text") + "." + format + output_path_expected = os.path.join(output, "output-" + ent_type) + "." 
+ format + else: + parent_dir = Path(output).parent + output_path_text = os.path.join(parent_dir, "output-text" + "." + format) + output_path_expected = os.path.join(parent_dir, "output-" + ent_type + "." + format) + + header = ["id", "filename", "pid", ent_type] + + for idx, data in enumerate(entities_data): + data[0] = idx + + write_output(entities_data, output_path_expected, format, header) + + header = ["id", "filename", "pid", "text"] + for idx, data in enumerate(texts_data): + data[0] = idx + write_output(texts_data, output_path_text, format, header) + + elif os.path.isfile(input): + input_path = Path(input) + file_data = process_file_to_json(input_path, not use_sentences) + output_filename = input_path.stem + + output_path_text = os.path.join(output, str(output_filename) + "-text" + "." + format) + texts_data = get_texts(file_data) + for idx, data in enumerate(texts_data): + data[0] = idx + + header = ["id", "filename", "pid", "text"] + write_output(texts_data, output_path_text, format, header) + + ent_type = "material" + output_path_expected = os.path.join(output, str(output_filename) + "-" + ent_type + "." + format) + ent_data_no_duplicates = get_entity_data(file_data, ent_type) + for idx, data in enumerate(ent_data_no_duplicates): + data[0] = idx + + header = ["id", "filename", "pid", ent_type] + write_output(ent_data_no_duplicates, output_path_expected, format, header) diff --git a/supermat/converters/xml2tsv.py b/scripts/xml2tsv.py similarity index 99% rename from supermat/converters/xml2tsv.py rename to scripts/xml2tsv.py index 7c7d36e..521b756 100644 --- a/supermat/converters/xml2tsv.py +++ b/scripts/xml2tsv.py @@ -1,4 +1,3 @@ -# transform XML Tei to TSV for WebAnno import argparse import os import re @@ -6,7 +5,7 @@ from bs4 import BeautifulSoup, NavigableString, Tag -from grobid_tokenizer import tokenizeSimple +from supermat.grobid_tokenizer import tokenizeSimple def tokenise(string): diff --git a/supermat/converters/xmlSupermat2csv.py b/scripts/xmlSupermat2csv.py similarity index 93% rename from supermat/converters/xmlSupermat2csv.py rename to scripts/xmlSupermat2csv.py index bf9f536..41c63f1 100644 --- a/supermat/converters/xmlSupermat2csv.py +++ b/scripts/xmlSupermat2csv.py @@ -6,8 +6,7 @@ from bs4 import BeautifulSoup, Tag -from super_mat.grobid_tokenizer import tokenizeAndFilterSimple - +from supermat.supermat_tei_parser import get_children_list def write_on_file(fw, filename, sentenceText, dic_token): links = len([token for token in dic_token if token[5] != '_']) @@ -15,26 +14,6 @@ def write_on_file(fw, filename, sentenceText, dic_token): fw.writerow([filename, sentenceText, has_links]) -def tokenise(string): - return tokenizeAndFilterSimple(string) - -def get_children_list(soup, verbose=False): - children = [] - - for child in soup.tei.children: - if child.name == 'teiHeader': - pass - children.append(child.find_all("title")) - children.extend([subchild.find_all("s") for subchild in child.find_all("abstract")]) - children.extend([subchild.find_all("s") for subchild in child.find_all("ab", {"type": "keywords"})]) - elif child.name == 'text': - children.extend([subchild.find_all("s") for subchild in child.find_all("body")]) - - if verbose: - print(str(children)) - - return children - def processFile(finput): with open(finput, encoding='utf-8') as fp: doc = fp.read() diff --git a/supermat/__init__.py b/src/supermat/__init__.py similarity index 100% rename from supermat/__init__.py rename to src/supermat/__init__.py diff --git a/supermat/grobid_tokenizer.py 
b/src/supermat/grobid_tokenizer.py similarity index 94% rename from supermat/grobid_tokenizer.py rename to src/supermat/grobid_tokenizer.py index 267a33b..a4c9eb9 100644 --- a/supermat/grobid_tokenizer.py +++ b/src/supermat/grobid_tokenizer.py @@ -4,7 +4,8 @@ # also python side of GROBID default tokenizer, used for Indo-European languages # Source: http://github.com/kermitt2/delft -delimiters = "\n\r\t\f\u00A0([ ^%‰°•,:;?.!/)-–−‐=≈~<>+\"“”‘’'`$]*\u2666\u2665\u2663\u2660" + +delimiters = "\n\r\t\f\u00A0([ ^%‰°•⋅·,:;?.!/)-–−‐=≈~∼<>+\"“”‘’'`#$]*\u2666\u2665\u2663\u2660\u00A0" regex = '|'.join(map(re.escape, delimiters)) regex_second_step = "(?<=[a-zA-Z])(?=\\d)|(?<=\\d)(?=\\D)" diff --git a/src/supermat/supermat_tei_parser.py b/src/supermat/supermat_tei_parser.py new file mode 100644 index 0000000..71fb1e6 --- /dev/null +++ b/src/supermat/supermat_tei_parser.py @@ -0,0 +1,485 @@ +import re +from collections import OrderedDict +from pathlib import Path +from typing import Union, List + +from bs4 import BeautifulSoup, Tag, NavigableString + +from src.supermat.grobid_tokenizer import tokenizeSimple + + +def tokenise(string): + return tokenizeSimple(string) + + +def get_section(pTag): + section = None + if pTag.name == 'p': + section = pTag.parent.name + elif pTag.name == 'ab': + if 'type' in pTag.attrs: + type = pTag.attrs['type'] + if type == 'keywords': + section = "keywords" + elif type == 'figureCaption': + section = 'figureCaption' + elif type == 'tableCaption': + section = 'tableCaption' + elif pTag.name == 'title': + section = 'title' + + return section + + +def process_file(input_document, use_paragraphs=False): + with open(input_document, encoding='utf-8') as fp: + doc = fp.read() + + mod_tags = re.finditer(r'(\w+>) ', doc) + for mod in mod_tags: + doc = doc.replace(mod.group(), ' ' + mod.group(1)) + soup = BeautifulSoup(doc, 'xml') + + children = get_children_list(soup, verbose=False, use_paragraphs=use_paragraphs) + + off_token = 0 + dic_token = {} + ient = 1 + + # list containing text and the dictionary with all the annotations + paragraphs = [] + dic_dest_relationships = {} + dic_source_relationships = {} + + i = 0 + for child in children: + for pTag in child: + j = 0 + section = get_section(pTag) + if not section: + section = get_section(pTag.parent) + paragraphText = '' + for item in pTag.contents: + if type(item) == NavigableString: + paragraphText += str(item) + + token_list = tokenise(item.string) + if token_list[0] == ' ': # remove space after tags + del token_list[0] + + entity_class = '_' + + for token in token_list: + s = off_token + off_token += len(token.rstrip(' ')) + e = off_token + if token.rstrip(' '): + dic_token[(i + 1, j + 1)] = [ + s, e, token.rstrip(' '), section + f'[{i + 10000}]', entity_class, entity_class, + entity_class, entity_class, entity_class] + # print((i+1, j+1), s, e, [token], len(token.rstrip(' ')), off_token) + j += 1 + if len(token) > 0 and token[-1] == ' ': + off_token += 1 # + elif type(item) is Tag and item.name == 'rs': + paragraphText += item.text + + token_list = tokenise(item.string) + # token_list[-1] += ' ' # add space the end of tag contents + if 'type' not in item.attrs: + raise Exception("RS without type is invalid. 
Stopping") + + entity_class = item.attrs['type'] + link_name = '_' + link_location = '_' + + if len(item.attrs) > 0: + if 'xml:id' in item.attrs: + if item.attrs['xml:id'] not in dic_dest_relationships: + dic_dest_relationships[item.attrs['xml:id']] = [i + 1, j + 1, ient, entity_class] + + if 'corresp' in item.attrs: + if (i + 1, j + 1) not in dic_source_relationships: + dic_source_relationships[i + 1, j + 1] = [item.attrs['corresp'].replace('#', ''), + ient, + entity_class] + + # link_to = dic_relationships[item.attrs['ptr'].replace("#", '')] + # relationship_name = link_to[2] + '-' + entity + # relationship_references = str(link_to[0]) + '-' + str(link_to[1]) + '[' + str( + # i + 1) + '-' + str(j + 1) + ']' + # print(dic_token[link_to[0], link_to[1]]) + link_name = 'link_name' + link_location = 'link_location' + + entity_class = entity_class.replace("_", "\\_") + + for token in token_list: + s = off_token + off_token += len(token.rstrip(' ')) + e = off_token + if token.rstrip(' '): + dic_token[(i + 1, j + 1)] = [s, e, token.rstrip(' '), section + f'[{i + 10000}]', + f'*[{ient}]', + entity_class + f'[{ient}]', link_name, link_location] + # print((i+1, j+1), s, e, [token], len(token.rstrip(' ')), off_token) + j += 1 + if len(token) > 0 and token[-1] == ' ': + off_token += 1 # + ient += 1 # entity No. + + off_token += 1 # return + + paragraphs.append((i, paragraphText)) + i += 1 + + for par_num, token_num in dic_source_relationships: + destination_xml_id = dic_source_relationships[par_num, token_num][0] + source_entity_id = dic_source_relationships[par_num, token_num][1] + label = dic_source_relationships[par_num, token_num][2] + + # destination_xml_id: Use this to pick up information from dic_dest_relationship + + for des in destination_xml_id.split(","): + destination_item = dic_dest_relationships[str(des)] + destination_paragraph_tsv = destination_item[0] + destination_token_tsv = destination_item[1] + destination_entity_id = destination_item[2] + destination_type = destination_item[3] + + relationship_name = get_relationship_name(label, destination_type) + + dict_coordinates = (destination_paragraph_tsv, destination_token_tsv) + + dic_token_entry = dic_token[dict_coordinates] + if dic_token_entry[6] == 'link_name' and dic_token_entry[7] == 'link_location': + dic_token_entry[6] = relationship_name + dic_token_entry[7] = str(par_num) + '-' + str(token_num) + "[" + str( + source_entity_id) + '_' + str(destination_entity_id) + ']' + else: + dic_token_entry[6] += '|' + relationship_name + dic_token_entry[7] += '|' + str(par_num) + '-' + str(token_num) + "[" + str( + source_entity_id) + '_' + str(destination_entity_id) + ']' + + # Cleaning up the dictionary token + for k, v in dic_token.items(): + v[6] = v[6].replace('link_name', '_') + v[7] = v[7].replace('link_location', '_') + + return paragraphs, dic_token + + +def process_file_to_json(finput, use_paragraphs=False): + with open(finput, encoding='utf-8') as fp: + doc = fp.read() + + mod_tags = re.finditer(r'(\w+>) ', doc) + for mod in mod_tags: + doc = doc.replace(mod.group(), ' ' + mod.group(1)) + soup = BeautifulSoup(doc, 'xml') + + children = get_children_list_grouped(soup, use_paragraphs=use_paragraphs) + + output_document = OrderedDict() + output_document['doc_key'] = Path(str(finput)).name + output_document['dataset'] = 'SuperMat' + + if use_paragraphs: + passages, ner, relations = process_paragraphs(children) + else: + passages, ner, relations = process_sentences(children) + + output_document['passages'] = passages + 
output_document['ner'] = ner + output_document['relations'] = relations + + return output_document + + +def process_paragraphs(children: list) -> [List, List, List]: + """Process paragraphs. If the XML contains Sentences, it aggregates them separated by a space. """ + token_offset_sentence = 0 + ient = 1 + + # list containing text and the dictionary with all the annotations + paragraphs = [] + ner = [] + relations = [] + + i = 0 + for paragraph in children: + j = 0 + text_paragraph = '' + tokens_paragraph = [] + ner_sentence = [] + relations_sentence = [] + dic_dest_relationships = {} + dic_source_relationships = {} + linked_entity_registry = {} + token_offset_sentence = 0 + + first = True + for sentence in paragraph: + if not first: + if len(text_paragraph) > 0: + text_paragraph += " " + tokens_paragraph.append(" ") + token_offset_sentence += 1 + if first: + first = False + + for item in sentence.contents: + if type(item) == NavigableString: + local_text = str(item) + text_paragraph += local_text + + token_list = tokenise(item.string) + if len(token_list) > 0 and token_list[0] == ' ': # remove space after tags + del token_list[0] + token_offset_sentence -= 1 + + tokens_paragraph.extend(token_list) + token_offset_sentence += len(token_list) + + elif type(item) is Tag and item.name == 'rs': + local_text = item.text + text_paragraph += local_text + if 'type' not in item.attrs: + raise Exception("RS without type is invalid. Stopping") + label = item.attrs['type'] + token_list = tokenise(local_text) + tokens_paragraph.extend(token_list) + + ner_entity = [token_offset_sentence, token_offset_sentence + len(token_list), label] + ner_sentence.append(ner_entity) + + if len(item.attrs) > 0: + ## multiple entities can point ot the same one, so "corresp" value can be duplicated + allow_duplicates = False + span_id = None + if 'xml:id' in item.attrs: + span_id = item['xml:id'] + if item.attrs['xml:id'] not in dic_dest_relationships: + dic_dest_relationships[item.attrs['xml:id']] = [i + 1, j + 1, ient, label] + + if 'corresp' in item.attrs: + if span_id is None or span_id == "": + id_str = str(i + 1) + "," + str(j + 1) + span_id = get_hash(id_str) + if span_id not in dic_source_relationships: + dic_source_relationships[span_id] = [item.attrs['corresp'].replace('#', ''), + ient, + label] + else: + if span_id not in dic_source_relationships: + dic_source_relationships[span_id] = [item.attrs['corresp'].replace('#', ''), + ient, + label] + + allow_duplicates = True + + if span_id is not None: + if span_id not in linked_entity_registry.keys(): + linked_entity_registry[span_id] = ner_entity + else: + if not allow_duplicates: + print("The same key exists... something's wrong: ", span_id) + + token_offset_sentence += len(token_list) + + j += 1 + + ient += 1 # entity No. 
+ + # token_offset_sentence += 1 # return + + for id__ in dic_source_relationships: + destination_xml_id = dic_source_relationships[id__][0] + + for des in destination_xml_id.split(","): + dict_coordinates = get_hash(id__) + if des in linked_entity_registry: + span_destination = linked_entity_registry[des] + span_source = linked_entity_registry[dict_coordinates] + + relations_sentence.append( + [span_destination[0], span_destination[1], span_source[0], span_source[1], + get_relationship_name(span_source[2], span_destination[2])]) + + if len(str.strip("".join(tokens_paragraph))) > 0: + paragraphs.append(tokens_paragraph) + ner.append(ner_sentence) + relations.append(relations_sentence) + i += 1 + + return paragraphs, ner, relations + + +def process_sentences(children: list) -> [List, List, List]: + """Process XML with
<p> and <s> as sentences. Return a flat list of sentences."""
+ token_offset_sentence = 0
+ ient = 1
+
+    # accumulators for the tokenised sentences, their NER spans and their relations
+ sentences = []
+ ner = []
+ relations = []
+
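+    # "i" counts sentences and "j" counts tagged items; they build the ids stored in the relation bookkeeping dictionaries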
+ i = 0
+ for paragraph in children:
+ for sentence in paragraph:
+ j = 0
+ text_sentence = ''
+ tokens_sentence = []
+ ner_sentence = []
+ relations_sentence = []
+ dic_dest_relationships = {}
+ dic_source_relationships = {}
+ linked_entity_registry = {}
+
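+            # walk the sentence contents: plain text nodes and <rs> entity annotations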
+ for item in sentence.contents:
+ if type(item) == NavigableString:
+ local_text = str(item)
+ text_sentence += local_text
+
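+                    # plain text between annotations: tokenise it and advance the running token offset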
+ token_list = tokenise(item.string)
+ if len(token_list) > 0 and token_list[0] == ' ': # remove space after tags
+ del token_list[0]
+
+ tokens_sentence.extend(token_list)
+ token_offset_sentence += len(token_list)
+
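+                # an <rs> element marks an annotated entity; record a NER span over its tokens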
+ elif type(item) is Tag and item.name == 'rs':
+ local_text = item.text
+ text_sentence += local_text
+ if 'type' not in item.attrs:
+ raise Exception("RS without type is invalid. Stopping")
+ label = item.attrs['type']
+ token_list = tokenise(local_text)
+ tokens_sentence.extend(token_list)
+
+ ner_entity = [token_offset_sentence, token_offset_sentence + len(token_list) - 1, label]
+ ner_sentence.append(ner_entity)
+
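+                    # "xml:id" identifies an entity that can be linked to; "corresp" points to the xml:id of the entity it links to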
+ if len(item.attrs) > 0:
+                        # multiple entities can point to the same one, so the "corresp" value can be duplicated
+ allow_duplicates = False
+ span_id = None
+ if 'xml:id' in item.attrs:
+ span_id = item['xml:id']
+ if item.attrs['xml:id'] not in dic_dest_relationships:
+ dic_dest_relationships[item.attrs['xml:id']] = [i + 1, j + 1, ient, label]
+
+ if 'corresp' in item.attrs:
+ if span_id is None or span_id == "":
+ id_str = str(i + 1) + "," + str(j + 1)
+ span_id = get_hash(id_str)
+ if span_id not in dic_source_relationships:
+ dic_source_relationships[span_id] = [item.attrs['corresp'].replace('#', ''),
+ ient,
+ label]
+ else:
+ if span_id not in dic_source_relationships:
+ dic_source_relationships[span_id] = [item.attrs['corresp'].replace('#', ''),
+ ient,
+ label]
+
+ allow_duplicates = True
+
+ if span_id is not None:
+ if span_id not in linked_entity_registry.keys():
+ linked_entity_registry[span_id] = ner_entity
+ else:
+ if not allow_duplicates:
+                                print("Duplicate span id found in the entity registry, ignoring: ", span_id)
+
+ token_offset_sentence += len(token_list)
+
+ j += 1
+
+ # elif use_paragraphs and type(item) is Tag and item.name == 's':
+ #
+ # local_text = str(item)
+ # text_sentence += local_text
+
+ ient += 1 # entity No.
+
+ # token_offset_sentence += 1 # return
+
+ sentences.append(tokens_sentence)
+ ner.append(ner_sentence)
+ i += 1
+
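+            # resolve "corresp" references against the registry of annotated spans to build relation tuples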
+            for id__ in dic_source_relationships:
+                destination_xml_id = dic_source_relationships[id__][0]
+
+                for des in destination_xml_id.split(","):
+                    dict_coordinates = get_hash(id__)
+                    if des in linked_entity_registry:
+                        span_destination = linked_entity_registry[des]
+                        span_source = linked_entity_registry[dict_coordinates]
+
+                        relations_sentence.append(
+                            [span_destination[0], span_destination[1], span_source[0], span_source[1],
+                             get_relationship_name(span_source[2], span_destination[2])])
+
+            relations.append(relations_sentence)
+
+ return sentences, ner, relations
+
+
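+# group the annotated TEI content so that callers can iterate paragraph by paragraph and sentence by sentence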
+def get_children_list_grouped(soup, use_paragraphs=False) -> Union[List, List[List]]:
+ tags_title = []
+ tags_text = []
+ tags_captions = []
+
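+    # TEI elements of interest: <p> for paragraphs and <s> for sentences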
+ paragraph_tag = "p"
+ sentence_tag = "s"
+
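+    # titles, abstract and keywords come from the TEI header; body paragraphs and <ab> captions come from the <text> element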
+ for child in soup.tei.children:
+ if child.name == 'teiHeader':
+ # [y for x in list(filter(lambda c: c.name in ['teiHeader', 'text'], soup.tei.children)) for y in filter(lambda o: type(o) == Tag, x.children)]
+ tags_title.extend([paragraph for paragraph in child.find_all("title")])
+ tags_text.extend([paragraph for subchildren in child.find_all("abstract") for paragraph in
+ subchildren.find_all(paragraph_tag)])
+ tags_text.extend([paragraph for paragraph in child.find_all("ab", {"type": "keywords"})])
+
+ elif child.name == 'text':
+ tags_text.extend([paragraph for subchildren in child.find_all("body") for paragraph in
+ subchildren.find_all(paragraph_tag)])
+ tags_captions.extend([paragraph for paragraph in child.find_all("ab")])
+
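+    # group the document as: [title tags] followed by one list of <s> sentences per paragraph or caption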
+ data_grouped = [tags_title] + [[z for z in y.find_all(sentence_tag)] for y in tags_text + tags_captions]
+
+ if use_paragraphs:
+ data_grouped = [[sentence for sentence in paragraph] for paragraph in data_grouped]
+
+ return data_grouped
+
+
+def get_children_list(soup, use_paragraphs=False, verbose=False):
+ children = []
+
+ child_name = "p" if use_paragraphs else "s"
+ for child in soup.tei.children:
+ if child.name == 'teiHeader':
+ children.append(child.find_all("title"))
+ children.extend([subchild.find_all(child_name) for subchild in child.find_all("abstract")])
+ children.extend([subchild.find_all(child_name) for subchild in child.find_all("ab", {"type": "keywords"})])
+ elif child.name == 'text':
+ children.extend([subchild.find_all(child_name) for subchild in child.find_all("body")])
+
+ if verbose:
+ print(str(children))
+
+ return children
+
+
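+# relation labels are built by joining the source and destination entity labels with a hyphen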
+def get_relationship_name(source_label, destination_label):
+ return source_label + "-" + destination_label
+
+
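+# identity "hash" of the coordinate string; the md5 variant is kept commented out for reference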
+def get_hash(dict_coordinates_str):
+ return dict_coordinates_str
+ # return hashlib.md5(dict_coordinates_str.encode('utf-8')).hexdigest()