From 4c02935e7f649b0365da6754aba0e92ec532a26e Mon Sep 17 00:00:00 2001 From: saggu Date: Mon, 16 May 2022 15:49:11 -0700 Subject: [PATCH 01/21] add new property definitions --- kgtk-properties/kgtk.properties.tsv | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/kgtk-properties/kgtk.properties.tsv b/kgtk-properties/kgtk.properties.tsv index d80c55ed5..79a247e09 100644 --- a/kgtk-properties/kgtk.properties.tsv +++ b/kgtk-properties/kgtk.properties.tsv @@ -78,13 +78,13 @@ P279dwdstar P31 Q18616576 P279dwdstar-P31-Q18616576 P279dwdstar P31 Q28326461 P279dwdstar-P31-Q28326461 P279dwdstar P31 Q18647519 P279dwdstar-P31-Q18647519 P279dwdstar datatype wikibase-item P279dwdstar-datatype-643cc9 -P1963computed label 'properties for this type'@en P1963computed-label-69d08f +P1963computed label 'properties for this type'@en P1963computed-label-813bce P1963computed description 'The properties defined for a class, computed based on its instances'@en P1963computed-description-2e5ab8 P1963computed P31 Q19820110 P1963computed-P31-Q19820110 P1963computed P279 Q22582645 P1963computed-P279-Q22582645 P1963computed P1659 P1963 P1963computed-P1659-P1963 P1963computed P7482 Q108739856 P1963computed-P7482-Q108739856 -P1963computed datatype quantity P1963computed-datatype-1a7b30 +P1963computed datatype wikibase-item P1963computed-datatype-643cc9 Pproperty_domain label 'entity types used (computed)'@en Pproperty_domain-label-4ffb8a Pproperty_domain description 'List the classes that appear as values of a property'@en Pproperty_domain-description-7eb869 Pproperty_domain P31 Q19820110 Pproperty_domain-P31-Q19820110 @@ -121,11 +121,11 @@ P131country P31 Q18647519 P131country-P31-Q18647519 P131country P31 Q18647515 P131country-P31-Q18647515 P131country P31 Q18615777 P131country-P31-Q18615777 P131country P31 Q70564278 P131country-P31-Q70564278 -Pinstance_count label '# instances'@en Pinstance_count-label-a6705f +Pinstance_count label '# instances'@en 
Pinstance_count-label-34ffab Pinstance_count description 'The number of P31/P31x instances of a class'@en Pinstance_count-description-b06abb Pinstance_count P31 Q18616576 Pinstance_count-P31-Q18616576 Pinstance_count datatype quantity Pinstance_count-datatype-1a7b30 -Pinstance_count_star label '# instances including subclasses'@en Pinstance_count_star-label-c48ebd +Pinstance_count_star label '# instances including subclasses'@en Pinstance_count_star-label-b3a90f Pinstance_count_star description 'The number of P31/P31x instances of a class and its subclasses'@en Pinstance_count_star-description-bceb9a Pinstance_count_star P31 Q18616576 Pinstance_count_star-P31-Q18616576 Pinstance_count_star datatype quantity Pinstance_count_star-datatype-1a7b30 @@ -137,10 +137,22 @@ Pshort_abstract label 'short abstract from Wikipedia articles'@en Pshort_abstrac Pshort_abstract description 'text before the table of contents from Wikipedia articles shortened to 2-3 sentences'@en Pshort_abstract-description-d251e5 Pshort_abstract P31 Q18616576 Pshort_abstract-P31-Q18616576 Pshort_abstract datatype string Pshort_abstract-datatype-473287 -P1963computed_star label 'properties for this type including subclasses'@en P1963computed_star-label-69d08f -P1963computed_star description 'The properties defined for a class, computed based on its instances and instances of all subclasses'@en P1963computed_star-description-229afc +P1963computed_star label 'properties for this type including instances of subclasses'@en P1963computed_star-label-c1ae81 +P1963computed_star description 'The properties defined for a class, computed based on its instances and instances of all subclasses'@en P1963computed_star-description-fd0dc9 P1963computed_star P31 Q19820110 P1963computed_star-P31-Q19820110 P1963computed_star P279 Q22582645 P1963computed_star-P279-Q22582645 P1963computed_star P1659 P1963 P1963computed_star-P1659-P1963 P1963computed_star P7482 Q108739856 P1963computed_star-P7482-Q108739856 
-P1963computed_star datatype quantity P1963computed_star-datatype-1a7b30 +P1963computed_star datatype wikibase-item P1963computed_star-datatype-643cc9 +Psubclass_count_star label '# subclasses'@en Psubclass_count_star-label-0ea3bb +Psubclass_count_star description 'The number of P279* subclasses of a class'@en Psubclass_count_star-description-732aef +Psubclass_count_star P31 Q18616576 Psubclass_count_star-P31-Q18616576 +Psubclass_count_star datatype quantity Psubclass_count_star-datatype-1a7b30 +Psubclass_count_star P7482 Q108739856 Psubclass_count_star-P7482-Q108739856 +P1963subclass_star label 'properties for this type including subclasses'@en P1963subclass_star-label-173efe +P1963subclass_star description 'The properties defined for a class, computed based on its subclasses'@en P1963subclass_star-description-8f67d8 +P1963subclass_star P31 Q19820110 P1963subclass_star-P31-Q19820110 +P1963subclass_star P279 Q22582645 P1963subclass_star-P279-Q22582645 +P1963subclass_star P1659 P1963 P1963subclass_star-P1659-P1963 +P1963subclass_star P7482 Q108739856 P1963subclass_star-P7482-Q108739856 +P1963subclass_star datatype wikibase-item P1963subclass_star-datatype-643cc9 From a810f151c4f175f2a43528b7401cbb42bb6b28fa Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 18 May 2022 11:18:44 -0700 Subject: [PATCH 02/21] make pep 8 happier --- kgtk/utils/elasticsearch_manager.py | 59 ++++++++++++++++------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/kgtk/utils/elasticsearch_manager.py b/kgtk/utils/elasticsearch_manager.py index 09413762b..b83c9fdd5 100644 --- a/kgtk/utils/elasticsearch_manager.py +++ b/kgtk/utils/elasticsearch_manager.py @@ -14,33 +14,38 @@ class ElasticsearchManager(object): @staticmethod - def build_kgtk_search_input(kgtk_file_path, - label_fields, - mapping_file_path, - output_path, - alias_fields=None, - extra_alias_properties=None, - pagerank_fields=None, - add_text=False, - description_properties=None, - separate_languages=True, - 
property_datatype_file=None, - languages=None + def build_kgtk_search_input(kgtk_file_path: str, + label_fields: str, + mapping_file_path: str, + output_path: str, + alias_fields: str = None, + extra_alias_properties: str = None, + pagerank_fields: str = None, + add_text: bool = False, + description_properties: str = None, + separate_languages: bool = True, + property_datatype_file: str = None, + languages: set = None ): """ builds a json lines file and a mapping file to support retrieval of candidates - It is assumed that the file is sorted by subject and predicate, in order to be able to process it in a streaming fashion + It is assumed that the file is sorted by subject and predicate, in order to be able to process it in a + streaming fashion Args: - kgtk_file_path: a file in KGTK format - label_fields: field in the kgtk file to be used as labels - mapping_file_path: output mapping file path for elasticsearch - output_path: output json lines path, converted from the input kgtk file - alias_fields: field in the kgtk file to be used as aliases - pagerank_fields: field in the kgtk file to be used as pagerank - black_list_file_path: path to black list file - Returns: Nothing - + :param kgtk_file_path: input KGTK edge file + :param label_fields: comma separated properties to be used as labels + :param mapping_file_path: output file path for mapping json file + :param output_path: output json lines file path + :param alias_fields: comma separated properties to be used as aliases + :param extra_alias_properties: additional properties to be used as aliases + :param pagerank_fields: comma separated properties to be used as pagerank + :param add_text: concatenate english labels, aliases and descriptions in one text field + :param description_properties: comma separated properties to be used as descriptions + :param separate_languages: flag to store text in separate languages in different fields + :param property_datatype_file: input file with property datatype information 
+ :param languages: a set of languages, for labels, aliases and descriptions + :return: None """ if languages is None: @@ -340,7 +345,7 @@ def build_kgtk_search_input(kgtk_file_path, external_identifiers=_external_identifiers, external_identifiers_pairs=_external_identifiers_pairs ) - except: + except Exception: print(traceback.print_exc()) mapping_dict = ElasticsearchManager.create_mapping_es(languages=list(all_langs)) @@ -505,7 +510,7 @@ def create_all_text(labels, aliases, descriptions): def to_float(input_str): try: return float(input_str) - except: + except Exception: return None @staticmethod @@ -799,7 +804,7 @@ def load_elasticsearch_index(kgtk_jl_path, es_url, es_index, mapping_file_path=N es_pass=es_pass) if response.status_code >= 400: print(response.text) - except: + except Exception: print('Exception while loading a batch to es') print(response.text) print(response.status_code) @@ -967,11 +972,11 @@ def create_es_fields_part(field_type: str, es_fields: List[str]): @staticmethod def generate_abbreviations(name: str) -> List[str]: - ''' + """ Helper function to generate the abbreviation. 
Input: name_split: List of the words in a name Output: Abbreviated Name - ''' + """ name_split = name.split() abbreviated_names = set() From ad9b3be90658d05496ec8741876184d8cd2ae640 Mon Sep 17 00:00:00 2001 From: saggu Date: Thu, 19 May 2022 15:45:09 -0700 Subject: [PATCH 03/21] clean up, delete unused cells --- examples/partition-wikidata.ipynb | 4 +- .../Embeddings-Elasticsearch-&-Triples.ipynb | 375 +- use-cases/Table-Linker-Files.ipynb | 2 +- use-cases/Wikidata Subsets.ipynb | 586 +-- use-cases/Wikidata Useful Files-Copy1.ipynb | 1762 -------- use-cases/Wikidata Useful Files.ipynb | 448 +- use-cases/import-wikidata.ipynb | 3710 ++++++++++++++++- 7 files changed, 4275 insertions(+), 2612 deletions(-) delete mode 100644 use-cases/Wikidata Useful Files-Copy1.ipynb diff --git a/examples/partition-wikidata.ipynb b/examples/partition-wikidata.ipynb index c0a08478f..54b392c48 100644 --- a/examples/partition-wikidata.ipynb +++ b/examples/partition-wikidata.ipynb @@ -698,7 +698,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -712,7 +712,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.11" + "version": "3.9.7" } }, "nbformat": 4, diff --git a/use-cases/Embeddings-Elasticsearch-&-Triples.ipynb b/use-cases/Embeddings-Elasticsearch-&-Triples.ipynb index 29a1331e9..6a0985e7d 100644 --- a/use-cases/Embeddings-Elasticsearch-&-Triples.ipynb +++ b/use-cases/Embeddings-Elasticsearch-&-Triples.ipynb @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "departmental-connectivity", "metadata": { "tags": [ @@ -73,9 +73,9 @@ "source": [ "# Parameters\n", "\n", - "input_path = \"/data/amandeep/wikidata-20211027-dwd-v3\"\n", - "output_path = \"/data/amandeep/wikidata-20211027-dwd-v3\"\n", - "kgtk_path = \"/Users/amandeep/github/kgtk\"\n", + "input_path = 
\"/data/amandeep/wikidata-20220505-dwd-v4\"\n", + "output_path = \"/data/amandeep/wikidata-20220505-dwd-v4\"\n", + "kgtk_path = \"/Users/amandeep/Github/kgtk\"\n", "\n", "graph_cache_path = None\n", "\n", @@ -83,12 +83,15 @@ "\n", "languages = 'en,ru,es,zh-cn,de,it,nl,pl,fr,pt,sv'\n", "\n", - "files = 'label_all,alias_all,description_all'" + "files = 'label_all,alias_all,description_all'\n", + "compute_embeddings = False\n", + "generate_triples = False\n", + "datatype_property = \"datatype\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "f44d69f2-eca7-4ac6-8b63-1d7c42898f59", "metadata": {}, "outputs": [], @@ -98,20 +101,21 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "8e64367d-b01b-49f2-8c1e-dda2e0ceb2e2", - "metadata": {}, - "outputs": [], - "source": [ - "languages = [f\"'{x}'\" for x in languages.split(\",\")]" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "1b52a584-551e-43ad-becb-9314e95932fa", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User home: /nas/home/amandeep\n", + "Current dir: /data/amandeep/Github/kgtk/use-cases\n", + "KGTK dir: /Users/amandeep/Github/kgtk\n", + "Use-cases dir: /Users/amandeep/Github/kgtk/use-cases\n" + ] + } + ], "source": [ "ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n", "ck.configure_kgtk(input_graph_path=input_path,\n", @@ -122,98 +126,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "1ffdcaec-c0d7-468c-a207-186fad300d56", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "USE_CASES_DIR: /Users/amandeep/Github/kgtk/use-cases\n", + "TEMP: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples\n", + "EXAMPLES_DIR: /Users/amandeep/Github/kgtk/examples\n", + "KGTK_LABEL_FILE: 
/data/amandeep/wikidata-20220505-dwd-v4/labels.en.tsv.gz\n", + "GRAPH: /data/amandeep/wikidata-20220505-dwd-v4\n", + "kypher: kgtk query --graph-cache /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples/wikidata.sqlite3.db\n", + "KGTK_OPTION_DEBUG: false\n", + "kgtk: kgtk\n", + "KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples/wikidata.sqlite3.db\n", + "OUT: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples\n", + "STORE: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples/wikidata.sqlite3.db\n", + "label_all: /data/amandeep/wikidata-20220505-dwd-v4/labels.tsv.gz\n", + "alias_all: /data/amandeep/wikidata-20220505-dwd-v4/aliases.tsv.gz\n", + "description_all: /data/amandeep/wikidata-20220505-dwd-v4/descriptions.tsv.gz\n" + ] + } + ], "source": [ "ck.print_env_variables()" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "cab363fd-ba8c-433d-b2e7-d56164118c41", - "metadata": {}, - "outputs": [], - "source": [ - "if graph_cache_path is None:\n", - " ck.load_files_into_cache()" - ] - }, - { - "cell_type": "markdown", - "id": "a7473bed-c3fd-4dcd-a0ff-c7fa865003da", - "metadata": {}, - "source": [ - "## Filter the labels in user provided languages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2e887704-2b39-4dbb-8d5a-005c9cf8f47d", - "metadata": {}, - "outputs": [], - "source": [ - "kypher(f\"\"\"-i label_all \n", - " -o $OUT/labels.filtered.tsv.gz \n", - " --match '(n1)-[l:label]->(n2)' \n", - " --where 'n2.kgtk_lqstring_lang_suffix IN {languages}' \n", - " --return 'n1, l.label, n2, l.id'\n", - " \"\"\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d300c616-3ec8-4f98-8adb-dafcf6fac066", - "metadata": {}, - "outputs": [], - "source": [ - "## Filter the aliases in user provided 
languages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "986c35e1-429d-4059-bd05-6df6c6d4573f", - "metadata": {}, - "outputs": [], - "source": [ - "kypher(f\"\"\" -i alias_all \n", - " -o $OUT/aliases.filtered.tsv.gz \n", - " --match '(n1)-[l:alias]->(n2)' \n", - " --where 'n2.kgtk_lqstring_lang_suffix IN {languages}' \n", - " --return 'n1, l.label, n2, l.id'\n", - " \"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6d93085-b9b7-45db-9d6a-1ae09831f9d1", - "metadata": {}, - "outputs": [], - "source": [ - "## Filter the descriptions in user provided languages" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cdfa2639-36f2-4971-a988-bfc6c5ec937a", - "metadata": {}, - "outputs": [], - "source": [ - "kypher(f\"\"\" -i description_all \n", - " -o $OUT/descriptions.filtered.tsv.gz \n", - " --match '(n1)-[l:description]->(n2)'\n", - " --where 'n2.kgtk_lqstring_lang_suffix IN {languages}' \n", - " --return 'n1, l.label, n2, l.id'\n", - " \"\"\")\n" - ] - }, { "cell_type": "markdown", "id": "excellent-passenger", @@ -267,15 +208,16 @@ "metadata": {}, "outputs": [], "source": [ - "!kgtk graph-embeddings --verbose -i \"$ITEMS\" \\\n", - "-o $OUT/wikidatadwd.complEx.graph-embeddings.txt \\\n", - "--retain_temporary_data True \\\n", - "--operator ComplEx \\\n", - "--workers 24 \\\n", - "--log $TEMP_COMPLEX/ge.complex.log \\\n", - "-T $TEMP_COMPLEX \\\n", - "-ot w2v \\\n", - "-e 600" + "if compute_embeddings:\n", + " !kgtk graph-embeddings --verbose -i \"$ITEMS\" \\\n", + " -o $OUT/wikidatadwd.complEx.graph-embeddings.txt \\\n", + " --retain_temporary_data True \\\n", + " --operator ComplEx \\\n", + " --workers 24 \\\n", + " --log $TEMP_COMPLEX/ge.complex.log \\\n", + " -T $TEMP_COMPLEX \\\n", + " -ot w2v \\\n", + " -e 600" ] }, { @@ -323,15 +265,16 @@ "metadata": {}, "outputs": [], "source": [ - "!$kgtk graph-embeddings --verbose -i \"$ITEMS\" \\\n", - "-o 
$OUT/wikidatadwd.transE.graph-embeddings.txt \\\n", - "--retain_temporary_data True \\\n", - "--operator TransE \\\n", - "--workers 24 \\\n", - "--log $TEMP_TRANSE/ge.transE.log \\\n", - "-T $TEMP_TRANSE \\\n", - "-ot w2v \\\n", - "-e 600" + "if compute_embeddings:\n", + " !$kgtk graph-embeddings --verbose -i \"$ITEMS\" \\\n", + " -o $OUT/wikidatadwd.transE.graph-embeddings.txt \\\n", + " --retain_temporary_data True \\\n", + " --operator TransE \\\n", + " --workers 24 \\\n", + " --log $TEMP_TRANSE/ge.transE.log \\\n", + " -T $TEMP_TRANSE \\\n", + " -ot w2v \\\n", + " -e 600" ] }, { @@ -349,11 +292,12 @@ "metadata": {}, "outputs": [], "source": [ - "!$kgtk text-embedding -i $ALL \\\n", - "--model roberta-large-nli-mean-tokens \\\n", - "--property-labels-file $LABELS_EN \\\n", - "--isa-properties P31 P279 P106 P39 P1382 P373 P452 \\\n", - "--save-embedding-sentence > $OUT/wikidatadwd-text-embeddings-all.tsv" + "if compute_embeddings:\n", + " !$kgtk text-embedding -i $ALL \\\n", + " --model roberta-large-nli-mean-tokens \\\n", + " --property-labels-file $LABELS_EN \\\n", + " --isa-properties P31 P279 P106 P39 P1382 P373 P452 \\\n", + " --save-embedding-sentence > $OUT/wikidatadwd-text-embeddings-all.tsv" ] }, { @@ -366,27 +310,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "closed-yemen", "metadata": {}, "outputs": [], "source": [ "kgtk(\"\"\"cat -i $GRAPH/all.tsv.gz \n", " -i $GRAPH/derived.isastar.tsv.gz \n", - " -i $GRAPH/metadata.property.datatypes.tsv.gz \n", " -i $GRAPH/metadata.pagerank.undirected.tsv.gz\n", " -i $GRAPH/metadata.pagerank.directed.tsv.gz\n", - " -o $OUT/wikidata.dwd.all.kgtk.search.unsorted.tsv.gz\"\"\")" + " -o $TEMP/wikidata.dwd.all.kgtk.search.unsorted.tsv.gz\"\"\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "trained-typing", "metadata": {}, "outputs": [], "source": [ - "kgtk(f\"\"\"sort -i $OUT/wikidata.dwd.all.kgtk.search.unsorted.tsv.gz\n", + 
"kgtk(f\"\"\"sort -i $TEMP/wikidata.dwd.all.kgtk.search.unsorted.tsv.gz\n", " --columns node1\n", " --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'\n", " -o $OUT/wikidata.dwd.all.kgtk.search.sorted.tsv.gz\"\"\")" @@ -394,20 +337,117 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "6f2f5864-5dae-47ec-b4de-0726654de82c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Processed 1000000 lines...
0Processed 2000000 lines...
1Processed 3000000 lines...
2Processed 4000000 lines...
3Processed 5000000 lines...
4Processed 6000000 lines...
......
5080Processed 5082000000 lines...
5081Processed 5083000000 lines...
5082Processed 5084000000 lines...
5083Processed 5085000000 lines...
5084Done!
\n", + "

5085 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " Processed 1000000 lines...\n", + "0 Processed 2000000 lines...\n", + "1 Processed 3000000 lines...\n", + "2 Processed 4000000 lines...\n", + "3 Processed 5000000 lines...\n", + "4 Processed 6000000 lines...\n", + "... ...\n", + "5080 Processed 5082000000 lines...\n", + "5081 Processed 5083000000 lines...\n", + "5082 Processed 5084000000 lines...\n", + "5083 Processed 5085000000 lines...\n", + "5084 Done!\n", + "\n", + "[5085 rows x 1 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "kgtk(\"\"\"build-kgtk-search-input --input-file \"$OUT\"/wikidata.dwd.all.kgtk.search.sorted.tsv.gz\n", + "kgtk(f\"\"\"--debug build-kgtk-search-input --input-file \"$OUT\"/wikidata.dwd.all.kgtk.search.sorted.tsv.gz\n", "--output-file \"$OUT\"/wikidata.dwd.all.kgtk.search.sorted.jl \n", "--label-properties label \n", "--alias-properties alias \n", "--extra-alias-properties P1448,P1705,P1477,P1810,P742,P1449 \n", "--description-properties description \n", - "--pagerank-properties Pdirected_pagerank \n", + "--pagerank-properties Pundirected_pagerank \n", + "--languages {languages}\n", "--mapping-file \"$OUT\"/wikidata_dwd_v3_mapping.json \n", - "--property-datatype-file \"$OUT\"/metadata.property.datatypes.tsv.gz\"\"\")" + "--property-datatype-file \"$GRAPH\"/metadata.property.datatypes.tsv.gz\"\"\")" ] }, { @@ -425,13 +465,14 @@ "metadata": {}, "outputs": [], "source": [ - "!$kgtk cat \\\n", - "-i $OUT/wikidata.dwd.all.kgtk.search.sorted.tsv.gz \\\n", - "-i $OUT/derived.isa.tsv.gz \\\n", - "-i $OUT/derived.P279star.tsv.gz \\\n", - "-i $OUT/metadata.in_degree.tsv.gz \\\n", - "-i $OUT/metadata.out_degree.tsv.gz \\\n", - "-o $TEMP/wikidata.dwd.all.kgtk.triples.1.tsv.gz" + "if generate_triples:\n", + " !$kgtk cat \\\n", + " -i $OUT/wikidata.dwd.all.kgtk.search.sorted.tsv.gz \\\n", + " -i $OUT/derived.isa.tsv.gz \\\n", + " -i $OUT/derived.P279star.tsv.gz \\\n", + " -i 
$OUT/metadata.in_degree.tsv.gz \\\n", + " -i $OUT/metadata.out_degree.tsv.gz \\\n", + " -o $TEMP/wikidata.dwd.all.kgtk.triples.1.tsv.gz" ] }, { @@ -441,9 +482,10 @@ "metadata": {}, "outputs": [], "source": [ - "!$kgtk add-id -i $TEMP/wikidata.dwd.all.kgtk.triples.1.tsv.gz \\\n", - "--id-style wikidata \\\n", - "-o $TEMP/wikidata.dwd.all.kgtk.triples.2.tsv.gz" + "if generate_triples:\n", + " !$kgtk add-id -i $TEMP/wikidata.dwd.all.kgtk.triples.1.tsv.gz \\\n", + " --id-style wikidata \\\n", + " -o $TEMP/wikidata.dwd.all.kgtk.triples.2.tsv.gz" ] }, { @@ -453,10 +495,11 @@ "metadata": {}, "outputs": [], "source": [ - "!$kgtk sort -i $TEMP/wikidata.dwd.all.kgtk.triples.2.tsv.gz \\\n", - "--columns node1 \\\n", - " --extra '--parallel 24 --buffer-size 30% --temporary-directory ' + temp_folder_path \\\n", - "-o $OUT/wikidata.dwd.all.kgtk.triples.sorted.tsv.gz" + "if generate_triples:\n", + " !$kgtk sort -i $TEMP/wikidata.dwd.all.kgtk.triples.2.tsv.gz \\\n", + " --columns node1 \\\n", + " --extra '--parallel 24 --buffer-size 30% --temporary-directory ' + temp_folder_path \\\n", + " -o $OUT/wikidata.dwd.all.kgtk.triples.sorted.tsv.gz" ] }, { @@ -474,7 +517,8 @@ "metadata": {}, "outputs": [], "source": [ - "!mkdir -p $OUT/kgtk_triples_split" + "generate_triples:\n", + " !mkdir -p $OUT/kgtk_triples_split" ] }, { @@ -484,10 +528,11 @@ "metadata": {}, "outputs": [], "source": [ - "!$kgtk split -i $OUT/wikidata.dwd.all.kgtk.triples.sorted.tsv.gz \\\n", - "--output-path $OUT/kgtk_triples_split \\\n", - "--gzipped-output --lines 10000000 \\\n", - "--file-prefix kgtk_triples" + "if generate_triples:\n", + " !$kgtk split -i $OUT/wikidata.dwd.all.kgtk.triples.sorted.tsv.gz \\\n", + " --output-path $OUT/kgtk_triples_split \\\n", + " --gzipped-output --lines 10000000 \\\n", + " --file-prefix kgtk_triples" ] }, { @@ -497,7 +542,8 @@ "metadata": {}, "outputs": [], "source": [ - "!curl https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv -o 
$TEMP/kgtk-properties.tsv" + "if generate_triples:\n", + " !curl https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv -o $TEMP/kgtk-properties.tsv" ] }, { @@ -507,7 +553,8 @@ "metadata": {}, "outputs": [], "source": [ - "!$kgtk filter -p \";data_type;\" -i $TEMP/kgtk-properties.tsv -o $TEMP/kgtk-properties.datatype.tsv.gz" + "if generate_triples:\n", + " kgtk(f\"\"\"filter -p \";{datatype_property};\" -i $TEMP/kgtk-properties.tsv -o $TEMP/kgtk-properties.datatype.tsv.gz\"\"\")" ] }, { @@ -526,7 +573,8 @@ } ], "source": [ - "!$kgtk cat -i $TEMP/kgtk-properties.datatype.tsv.gz $OUT/metadata.property.datatypes.tsv.gz -o $OUT/metadata.property.datatypes.augmented.tsv.gz" + "if generate_triples:\n", + " !$kgtk cat -i $TEMP/kgtk-properties.datatype.tsv.gz $OUT/metadata.property.datatypes.tsv.gz -o $OUT/metadata.property.datatypes.augmented.tsv.gz" ] }, { @@ -536,16 +584,17 @@ "metadata": {}, "outputs": [], "source": [ - "ls $OUT/kgtk_triples_split/*.tsv.gz | parallel -j 18 'kgtk --debug generate-wikidata-triples -lp label -ap alias -dp description -pf $OUT/metadata.property.datatypes.augmented.tsv.gz --output-n-lines 100000 --generate-truthy --warning --use-id --log-path $TEMP/generate_triples_log.txt --error-action log -i {} -o {.}.ttl'\n", + "if generate_triples:\n", + " ls $OUT/kgtk_triples_split/*.tsv.gz | parallel -j 18 'kgtk --debug generate-wikidata-triples -lp label -ap alias -dp description -pf $OUT/metadata.property.datatypes.augmented.tsv.gz --output-n-lines 100000 --generate-truthy --warning --use-id --log-path $TEMP/generate_triples_log.txt --error-action log -i {} -o {.}.ttl'\n", "\n" ] } ], "metadata": { "kernelspec": { - "display_name": "kgtk-env", + "display_name": "kgtk-env-ckg07", "language": "python", - "name": "kgtk-env" + "name": "kgtk-env-ckg07" }, "language_info": { "codemirror_mode": { @@ -557,7 +606,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + 
"version": "3.9.12" } }, "nbformat": 4, diff --git a/use-cases/Table-Linker-Files.ipynb b/use-cases/Table-Linker-Files.ipynb index e38557cef..17e285f4b 100644 --- a/use-cases/Table-Linker-Files.ipynb +++ b/use-cases/Table-Linker-Files.ipynb @@ -867,7 +867,7 @@ "--alias-properties alias \n", "--extra-alias-properties P1448,P1705,P1477,P1810,P742,P1449 \n", "--description-properties description \n", - "--pagerank-properties Pdirected_pagerank \n", + "--pagerank-properties Pundirected_pagerank \n", "--mapping-file \"$OUT\"/wikidata_dwd.v2.table-linker.json \n", "--property-datatype-file \"$OUT\"/metadata.property.datatypes.augmented.tsv.gz\"\"\")" ] diff --git a/use-cases/Wikidata Subsets.ipynb b/use-cases/Wikidata Subsets.ipynb index f37a5d2b9..4b30491f4 100644 --- a/use-cases/Wikidata Subsets.ipynb +++ b/use-cases/Wikidata Subsets.ipynb @@ -59,13 +59,13 @@ "metadata": {}, "outputs": [], "source": [ - "input_path = \"/data/amandeep/wikidata-20211027\"\n", + "input_path = \"/data/amandeep/wikidata-20220505/import-wikidata/data\"\n", "output_path = \"/data/amandeep\"\n", - "kgtk_path = \"/data/amandeep/github/kgtk\"\n", + "kgtk_path = \"/data/amandeep/Github/kgtk\"\n", "\n", "graph_cache_path = None\n", "\n", - "project_name = \"wikidata-20211027-dwd-v3\"\n", + "project_name = \"wikidata-20220505-dwd-v4\"\n", "\n", "files = 'claims,label_all,alias_all,description_all,item,qualifiers,datatypes,types,isa,p279star'\n", "\n", @@ -100,9 +100,9 @@ "output_type": "stream", "text": [ "User home: /nas/home/amandeep\n", - "Current dir: /data/amandeep/github/kgtk/use-cases\n", - "KGTK dir: /data/amandeep/github/kgtk\n", - "Use-cases dir: /data/amandeep/github/kgtk/use-cases\n" + "Current dir: /data/amandeep/Github/kgtk/use-cases\n", + "KGTK dir: /data/amandeep/Github/kgtk\n", + "Use-cases dir: /data/amandeep/Github/kgtk/use-cases\n" ] } ], @@ -123,27 +123,27 @@ "name": "stdout", "output_type": "stream", "text": [ - "KGTK_LABEL_FILE: 
/data/amandeep/wikidata-20211027/labels.en.tsv.gz\n", - "STORE: /data/amandeep/wikidata-20211027-dwd-v3/temp.wikidata-20211027-dwd-v3/wikidata.sqlite3.db\n", - "KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20211027-dwd-v3/temp.wikidata-20211027-dwd-v3/wikidata.sqlite3.db\n", - "GRAPH: /data/amandeep/wikidata-20211027\n", - "USE_CASES_DIR: /data/amandeep/github/kgtk/use-cases\n", - "TEMP: /data/amandeep/wikidata-20211027-dwd-v3/temp.wikidata-20211027-dwd-v3\n", - "kypher: kgtk query --graph-cache /data/amandeep/wikidata-20211027-dwd-v3/temp.wikidata-20211027-dwd-v3/wikidata.sqlite3.db\n", + "kypher: kgtk query --graph-cache /data/amandeep/wikidata-20220505-dwd-v4/temp.wikidata-20220505-dwd-v4/wikidata.sqlite3.db\n", + "GRAPH: /data/amandeep/wikidata-20220505/import-wikidata/data\n", + "KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20220505-dwd-v4/temp.wikidata-20220505-dwd-v4/wikidata.sqlite3.db\n", + "STORE: /data/amandeep/wikidata-20220505-dwd-v4/temp.wikidata-20220505-dwd-v4/wikidata.sqlite3.db\n", + "OUT: /data/amandeep/wikidata-20220505-dwd-v4\n", "kgtk: kgtk\n", + "TEMP: /data/amandeep/wikidata-20220505-dwd-v4/temp.wikidata-20220505-dwd-v4\n", + "USE_CASES_DIR: /data/amandeep/Github/kgtk/use-cases\n", "KGTK_OPTION_DEBUG: false\n", - "OUT: /data/amandeep/wikidata-20211027-dwd-v3\n", - "EXAMPLES_DIR: /data/amandeep/github/kgtk/examples\n", - "claims: /data/amandeep/wikidata-20211027/claims.tsv.gz\n", - "label_all: /data/amandeep/wikidata-20211027/labels.tsv.gz\n", - "alias_all: /data/amandeep/wikidata-20211027/aliases.tsv.gz\n", - "description_all: /data/amandeep/wikidata-20211027/descriptions.tsv.gz\n", - "item: /data/amandeep/wikidata-20211027/claims.wikibase-item.tsv.gz\n", - "qualifiers: /data/amandeep/wikidata-20211027/qualifiers.tsv.gz\n", - "datatypes: /data/amandeep/wikidata-20211027/metadata.property.datatypes.tsv.gz\n", - "types: /data/amandeep/wikidata-20211027/metadata.types.tsv.gz\n", - "isa: /data/amandeep/wikidata-20211027/derived.isa.tsv.gz\n", - 
"p279star: /data/amandeep/wikidata-20211027/derived.P279star.tsv.gz\n" + "EXAMPLES_DIR: /data/amandeep/Github/kgtk/examples\n", + "KGTK_LABEL_FILE: /data/amandeep/wikidata-20220505/import-wikidata/data/labels.en.tsv.gz\n", + "claims: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\n", + "label_all: /data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz\n", + "alias_all: /data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz\n", + "description_all: /data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz\n", + "item: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-item.tsv.gz\n", + "qualifiers: /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz\n", + "datatypes: /data/amandeep/wikidata-20220505/import-wikidata/data/metadata.property.datatypes.tsv.gz\n", + "types: /data/amandeep/wikidata-20220505/import-wikidata/data/metadata.types.tsv.gz\n", + "isa: /data/amandeep/wikidata-20220505/import-wikidata/data/derived.isa.tsv.gz\n", + "p279star: /data/amandeep/wikidata-20220505/import-wikidata/data/derived.P279star.tsv.gz\n" ] } ], @@ -160,7 +160,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "kgtk query --graph-cache /data/amandeep/wikidata-20211027-dwd-v3/temp.wikidata-20211027-dwd-v3/wikidata.sqlite3.db -i \"/data/amandeep/wikidata-20211027/claims.tsv.gz\" --as claims -i \"/data/amandeep/wikidata-20211027/labels.tsv.gz\" --as label_all -i \"/data/amandeep/wikidata-20211027/aliases.tsv.gz\" --as alias_all -i \"/data/amandeep/wikidata-20211027/descriptions.tsv.gz\" --as description_all -i \"/data/amandeep/wikidata-20211027/claims.wikibase-item.tsv.gz\" --as item -i \"/data/amandeep/wikidata-20211027/qualifiers.tsv.gz\" --as qualifiers -i \"/data/amandeep/wikidata-20211027/metadata.property.datatypes.tsv.gz\" --as datatypes -i \"/data/amandeep/wikidata-20211027/metadata.types.tsv.gz\" --as types -i \"/data/amandeep/wikidata-20211027/derived.isa.tsv.gz\" 
--as isa -i \"/data/amandeep/wikidata-20211027/derived.P279star.tsv.gz\" --as p279star --limit 3\n", + "kgtk query --graph-cache /data/amandeep/wikidata-20220505-dwd-v4/temp.wikidata-20220505-dwd-v4/wikidata.sqlite3.db -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\" --as claims -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz\" --as label_all -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz\" --as alias_all -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz\" --as description_all -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-item.tsv.gz\" --as item -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz\" --as qualifiers -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.property.datatypes.tsv.gz\" --as datatypes -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.types.tsv.gz\" --as types -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/derived.isa.tsv.gz\" --as isa -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/derived.P279star.tsv.gz\" --as p279star --limit 3\n", "id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", "P10-P1628-32b85d-7927ece6-0\tP10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\tnormal\turl\n", "P10-P1628-acf60d-b8950832-0\tP10\tP1628\t\"https://schema.org/video\"\tnormal\turl\n", @@ -172,6 +172,84 @@ "ck.load_files_into_cache()" ] }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Graph Cache:\n", + "DB file: /data/amandeep/wikidata-20220505-dwd-v4/temp.wikidata-20220505-dwd-v4/wikidata.sqlite3.db\n", + " size: 535.50 GB \tfree: 0 Bytes \tmodified: 2022-05-14 13:07:10\n", + "\n", + "KGTK File Information:\n", + "/data/amandeep/wikidata-20220505-dwd-v4/temp.wikidata-20220505-dwd-v4/items.remove.tsv.gz:\n", + " size: 99.33 MB 
\tmodified: 2022-05-14 12:47:54 \tgraph: graph_11\n", + "alias_all:\n", + " size: 2.07 GB \tmodified: 2022-05-11 06:01:24 \tgraph: graph_3\n", + "claims:\n", + " size: 27.33 GB \tmodified: 2022-05-11 05:55:01 \tgraph: graph_1\n", + "datatypes:\n", + " size: 54.46 KB \tmodified: 2022-05-11 07:29:02 \tgraph: graph_7\n", + "description_all:\n", + " size: 23.66 GB \tmodified: 2022-05-11 07:08:19 \tgraph: graph_4\n", + "isa:\n", + " size: 303.01 MB \tmodified: 2022-05-11 14:42:27 \tgraph: graph_9\n", + "item:\n", + " size: 9.63 GB \tmodified: 2022-05-14 07:47:33 \tgraph: graph_5\n", + "label_all:\n", + " size: 7.88 GB \tmodified: 2022-05-11 07:21:33 \tgraph: graph_2\n", + "p279star:\n", + " size: 698.89 MB \tmodified: 2022-05-11 14:13:56 \tgraph: graph_10\n", + "qualifiers:\n", + " size: 5.36 GB \tmodified: 2022-05-11 05:59:30 \tgraph: graph_6\n", + "types:\n", + " size: 455.79 MB \tmodified: 2022-05-11 07:29:39 \tgraph: graph_8\n", + "\n", + "Graph Table Information:\n", + "graph_1:\n", + " size: 119.52 GB \tcreated: 2022-05-14 09:24:57\n", + " header: ['id', 'node1', 'label', 'node2', 'rank', 'node2;wikidatatype']\n", + "graph_10:\n", + " size: 8.74 GB \tcreated: 2022-05-14 12:06:03\n", + " header: ['node1', 'label', 'node2', 'id']\n", + "graph_11:\n", + " size: 1.54 GB \tcreated: 2022-05-14 13:07:10\n", + " header: ['node1', 'label', 'node2']\n", + "graph_2:\n", + " size: 54.66 GB \tcreated: 2022-05-14 09:47:37\n", + " header: ['id', 'node1', 'label', 'node2', 'lang']\n", + "graph_3:\n", + " size: 11.91 GB \tcreated: 2022-05-14 09:52:44\n", + " header: ['id', 'node1', 'label', 'node2', 'lang']\n", + "graph_4:\n", + " size: 238.50 GB \tcreated: 2022-05-14 11:24:07\n", + " header: ['id', 'node1', 'label', 'node2', 'lang']\n", + "graph_5:\n", + " size: 56.81 GB \tcreated: 2022-05-14 11:47:26\n", + " header: ['id', 'node1', 'label', 'node2', 'rank', 'node2;wikidatatype']\n", + "graph_6:\n", + " size: 33.64 GB \tcreated: 2022-05-14 11:59:40\n", + " header: ['id', 'node1', 
'label', 'node2', 'node2;wikidatatype']\n", + "graph_7:\n", + " size: 476.00 KB \tcreated: 2022-05-14 11:59:40\n", + " header: ['id', 'node1', 'label', 'node2']\n", + "graph_8:\n", + " size: 4.38 GB \tcreated: 2022-05-14 12:01:43\n", + " header: ['id', 'node1', 'label', 'node2']\n", + "graph_9:\n", + " size: 5.83 GB \tcreated: 2022-05-14 12:03:14\n", + " header: ['node1', 'label', 'node2']\n" + ] + } + ], + "source": [ + "!kgtk query --gc $STORE --show-cache" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -188,7 +266,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -225,38 +303,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Compute the items to be removed\n", - "\n", - "First look at the classes we will remove" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/bin/bash: wd: command not found\n" - ] - }, - { - "data": { - "text/plain": [ - "'wd u Q7318358 Q13442814'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cmd = \"wd u {}\".format(\" \".join(remove_classes.split(\",\")))\n", - "!{cmd}\n", - "cmd" + "### Compute the items to be removed" ] }, { @@ -268,7 +315,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -277,16 +324,16 @@ "text": [ "node1\tlabel\tnode2\n", "P10\tisa\tQ18610173\n", + "P10\tisa\tQ19847637\n", "P1000\tisa\tQ18608871\n", "P10000\tisa\tQ19833377\n", "P10000\tisa\tQ89560413\n", "P10001\tisa\tQ107738007\n", - "P10001\tisa\tQ64221137\n", - "P10002\tisa\tQ93433126\n", - "P10003\tisa\tQ108914651\n", - "P10003\tisa\tQ42396390\n", "\n", - "gzip: stdout: Broken pipe\n" + "gzip: P10001\tisa\tQ64221137\n", + "P10002\tisa\tQ93433126\n", + "stdout: Broken pipe\n", + "P10003\tisa\tQ108914651\n" ] } ], @@ -303,7 +350,7 @@ }, { 
"cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -312,7 +359,7 @@ "'\"Q7318358\", \"Q13442814\"'" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -325,15 +372,16 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "!$kypher -i isa -i p279star -o \"$TEMP\"/items.remove.tsv.gz \\\n", - "--match 'isa: (n1)-[:isa]->(c), p279star: (c)-[]->(class)' \\\n", - "--where 'class in [{classes}]' \\\n", - "--return 'distinct n1, \"p31_p279star\" as label, class as node2' \\\n", - "--order-by 'n1'" + "kypher(f\"\"\" -i isa -i p279star -o \"$TEMP\"/items.remove.tsv.gz \n", + " --match 'isa: (n1)-[:isa]->(c), p279star: (c)-[]->(class)' \n", + " --where 'class in [{classes}]' \n", + " --return 'distinct n1, \"p31_p279star\" as label, class as node2' \n", + " --order-by 'n1'\n", + " \"\"\")" ] }, { @@ -345,7 +393,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -353,10 +401,10 @@ "output_type": "stream", "text": [ "node1\tlabel\tnode2\n", + "Q100000005\tp31_p279star\tQ13442814\n", "\n", - "gzip: Q100000005\tp31_p279star\tQ13442814\n", + "gzip: Q100000009\tp31_p279star\tQ13442814\n", "stdout: Broken pipe\n", - "Q100000009\tp31_p279star\tQ13442814\n", "Q100000015\tp31_p279star\tQ13442814\n", "Q100000022\tp31_p279star\tQ13442814\n", "Q100000031\tp31_p279star\tQ13442814\n", @@ -373,14 +421,14 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "39548165 118644495 1303905996\n" + "39873936 119621808 1314915334\n" ] } ], @@ -397,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -662,17 +710,17 @@ "output_type": "stream", "text": [ "\n", - "gzip: stdout: 
Broken pipe\n", - "id node1 label node2 node2;wikidatatype\n", + "gzip: id node1 label node2 node2;wikidatatype\n", + "P10-P1630-53947a-fbe9093e-0-P407-Q20923490-0 P10-P1630-53947a-fbe9093e-0 P407 Q20923490 wikibase-item\n", + "stdout: Broken pipe\n", "P10-P1855-Q15075950-7eff6d65-0-P10-54b214-0 P10-P1855-Q15075950-7eff6d65-0 P10 \"Smoorverliefd 12 september.webm\" commonsMedia\n", "P10-P1855-Q15075950-7eff6d65-0-P3831-Q622550-0 P10-P1855-Q15075950-7eff6d65-0 P3831 Q622550 wikibase-item\n", "P10-P1855-Q4504-a69d2c73-0-P10-bef003-0 P10-P1855-Q4504-a69d2c73-0 P10 \"Komodo dragons video.ogv\" commonsMedia\n", "P10-P1855-Q69063653-c8cdb04c-0-P10-6fb08f-0 P10-P1855-Q69063653-c8cdb04c-0 P10 \"Couch Commander.webm\" commonsMedia\n", - "P10-P1855-Q7378-555592a4-0-P10-8a982d-0 P10-P1855-Q7378-555592a4-0 P10 \"Elephants Dream (2006).webm\" commonsMedia\n", - "P10-P2302-Q21502404-d012aef4-0-P1793-f4c2ed-0 P10-P2302-Q21502404-d012aef4-0 P1793 \"(?i).+\\\\.(webm\\|ogv\\|ogg\\|gif)\" string\n", + "P10-P1855-Q825197-555592a4-0-P10-8a982d-0 P10-P1855-Q825197-555592a4-0 P10 \"Elephants Dream (2006).webm\" commonsMedia\n", + "P10-P2302-Q21502404-d012aef4-0-P1793-1f3adb-0 P10-P2302-Q21502404-d012aef4-0 P1793 \"(?i).+\\\\.(webm\\|ogv\\|ogg\\|gif\\|svg)\" string\n", "P10-P2302-Q21502404-d012aef4-0-P2316-Q21502408-0 P10-P2302-Q21502404-d012aef4-0 P2316 Q21502408 wikibase-item\n", - "P10-P2302-Q21502404-d012aef4-0-P2916-cb0917-0 P10-P2302-Q21502404-d012aef4-0 P2916 'filename with extension: webm, ogg, ogv, or gif (case insensitive)'@en monolingualtext\n", - "P10-P2302-Q21510851-5224fe0b-0-P2306-P175-0 P10-P2302-Q21510851-5224fe0b-0 P2306 P175 wikibase-property\n" + "P10-P2302-Q21502404-d012aef4-0-P2916-cb0917-0 P10-P2302-Q21502404-d012aef4-0 P2916 'filename with extension: webm, ogg, ogv, or gif (case insensitive)'@en monolingualtext\n" ] } ], @@ -711,25 +759,25 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ + "\n", + "gzip: stdout: Broken pipe\n", "id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "P10-P1630-53947a-fbe9093e-0-P407-Q20923490-0\tP10-P1630-53947a-fbe9093e-0\tP407\tQ20923490\twikibase-item\n", "P10-P1855-Q15075950-7eff6d65-0-P10-54b214-0\tP10-P1855-Q15075950-7eff6d65-0\tP10\t\"Smoorverliefd 12 september.webm\"\tcommonsMedia\n", - "\n", - "gzip: P10-P1855-Q15075950-7eff6d65-0-P3831-Q622550-0\tP10-P1855-Q15075950-7eff6d65-0\tP3831\tQ622550 wikibase-item\n", - "stdout: Broken pipe\n", + "P10-P1855-Q15075950-7eff6d65-0-P3831-Q622550-0\tP10-P1855-Q15075950-7eff6d65-0\tP3831\tQ622550 wikibase-item\n", "P10-P1855-Q4504-a69d2c73-0-P10-bef003-0 P10-P1855-Q4504-a69d2c73-0\tP10\t\"Komodo dragons video.ogv\"\tcommonsMedia\n", "P10-P1855-Q69063653-c8cdb04c-0-P10-6fb08f-0\tP10-P1855-Q69063653-c8cdb04c-0\tP10\t\"Couch Commander.webm\"\tcommonsMedia\n", - "P10-P1855-Q7378-555592a4-0-P10-8a982d-0 P10-P1855-Q7378-555592a4-0\tP10\t\"Elephants Dream (2006).webm\"\tcommonsMedia\n", - "P10-P2302-Q21502404-d012aef4-0-P1793-f4c2ed-0\tP10-P2302-Q21502404-d012aef4-0\tP1793\t\"(?i).+\\\\.(webm\\|ogv\\|ogg\\|gif)\"\tstring\n", + "P10-P1855-Q825197-555592a4-0-P10-8a982d-0\tP10-P1855-Q825197-555592a4-0\tP10\t\"Elephants Dream (2006).webm\"\tcommonsMedia\n", + "P10-P2302-Q21502404-d012aef4-0-P1793-1f3adb-0\tP10-P2302-Q21502404-d012aef4-0\tP1793\t\"(?i).+\\\\.(webm\\|ogv\\|ogg\\|gif\\|svg)\"\tstring\n", "P10-P2302-Q21502404-d012aef4-0-P2316-Q21502408-0\tP10-P2302-Q21502404-d012aef4-0\tP2316\tQ21502408\twikibase-item\n", - "P10-P2302-Q21502404-d012aef4-0-P2916-cb0917-0\tP10-P2302-Q21502404-d012aef4-0\tP2916\t'filename with extension: webm, ogg, ogv, or gif (case insensitive)'@en monolingualtext\n", - "P10-P2302-Q21510851-5224fe0b-0-P2306-P175-0\tP10-P2302-Q21510851-5224fe0b-0\tP2306\tP175\twikibase-property\n" + "P10-P2302-Q21502404-d012aef4-0-P2916-cb0917-0\tP10-P2302-Q21502404-d012aef4-0\tP2916\t'filename with extension: webm, ogg, ogv, or gif (case 
insensitive)'@en monolingualtext\n" ] } ], @@ -739,20 +787,20 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "total 30982776\n", - "-rw-r--r-- 1 amandeep isdstaff 1445060824 Nov 19 01:10 aliases.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 11107210112 Nov 19 09:03 claims.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 12040923857 Nov 19 08:26 descriptions.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 4729262725 Nov 19 00:43 labels.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2403890930 Nov 19 10:07 qualifiers.tsv.gz\n", - "drwxr-xr-x 2 amandeep isdstaff 324 Nov 19 09:03 temp.wikidata-20211027-dwd-v3\n" + "total 34220224\n", + "-rw-r--r-- 1 amandeep isdstaff 2214529468 May 14 20:50 aliases.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 11594856613 May 15 04:31 claims.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 12667243225 May 15 03:52 descriptions.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 6007956701 May 14 20:09 labels.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 2556913530 May 15 05:28 qualifiers.tsv.gz\n", + "drwxr-xr-x 2 amandeep isdstaff 288 May 15 04:31 temp.wikidata-20220505-dwd-v4\n" ] } ], @@ -769,7 +817,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -785,7 +833,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -806,7 +854,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -828,7 +876,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -906,7 +954,10 @@ " parameters=dict(\n", " output_path = os.environ[\"OUT\"],\n", " input_path = os.environ[\"OUT\"],\n", - " kgtk_path = '/Users/amandeep/github/kgtk'\n", + " kgtk_path = 
'/data/amandeep/Github/kgtk',\n", + " compute_pagerank=True,\n", + " compute_degrees=True,\n", + " debug=False\n", " )\n", ")\n" ] @@ -951,280 +1002,87 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!ls -lh {out}/*.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## concatenate files to get the `all` file" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "!kgtk cat -i {out}/claims.tsv.gz \\\n", - "{lad_file_list} \\\n", - "{out}/qualifiers.tsv.gz \\\n", - "{out}/useful_files/metadata.pagerank.undirected.tsv.gz \\\n", - "{out}/useful_files/metadata.pagerank.directed.tsv.gz \\\n", - "{out}/useful_files/metadata.in_degree.tsv.gz \\\n", - "{out}/useful_files/metadata.out_degree.tsv.gz \\\n", - "-o {out}/wikidatadwd.all.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## concatenate files to get the `all for triples` file\n" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "!kgtk cat -i $OUT/wikidatadwd.all.tsv.gz \\\n", - "$OUT/useful_files/derived.isa.tsv.gz \\\n", - "$OUT/useful_files/derived.P279star.tsv.gz \\\n", - "-o $OUT/wikidatadwd.all.for.triples.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## concatenate files to get the `all for elasticsearch` file\n" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "!kgtk cat -i $OUT/wikidatadwd.all.tsv.gz \\\n", - "$OUT/useful_files/derived.P279.tsv.gz \\\n", - "$OUT/useful_files/derived.isastar.tsv.gz \\\n", - "-o $OUT/wikidatadwd.all.for.es.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### remove `somevalue,novalue,P9`" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "# this might not be needed in future: AS 
(03/11/2021)\n", - "!kgtk filter -i $OUT/wikidatadwd.all.for.es.tsv.gz \\\n", - " -o $OUT/wikidataos.all.for.es.filtered.tsv.gz \\\n", - " -p ';;somevalue,novalue,P9' --invert" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### add text and graph embeddings, augmented wikipedia and abbreviated human names for ES" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kgtk cat \\\n", - "-i $OUT/wikidatadwd.all.for.es.tsv.gz \\\n", - "-i $OUT/metadata.property.datatypes.tsv.gz \\\n", - "-i $OUT/graph-embeddings/wikidataos.complEx.graph-embeddings.tsv.gz \\\n", - "-i $OUT/graph-embeddings/wikidataos.transE.graph-embeddings.tsv.gz \\\n", - "-i $OUT/text-embeddings/text-embeddings-concatenated.tsv.gz \\\n", - "-i $OUT/derived_files_for_es/augmentation.wikipedia.anchors.tsv.gz \\\n", - "-i $OUT/derived_files_for_es/augmentation.wikipedia.redirect.tsv.gz \\\n", - "-i $OUT/derived_files_for_es/augmentation.wikipedia.tables.anchors.tsv.gz \\\n", - "-i $OUT/derived_files_for_es/derived.Q5.abbreviations.tsv.gz \\\n", - "-o $OUT/wikidatadwd.all.for.es.embeddings.augmented.unsorted.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### remove columns `id rank node2;wikidatatype url` as it is not required in the ES file and then sort the file by `node1,label`" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "!kgtk remove-columns --columns id,rank,node2;wikidatatype,url \\\n", - "-i $OUT/wikidatadwd.all.for.es.embeddings.augmented.unsorted.tsv.gz \\\n", - "-o $OUT/wikidatadwd.all.for.es.embeddings.augmented.unsorted.clean.tsv.gz\n" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'/data/amandeep/temp.wikidata-20210215-dwd'" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "temp" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kgtk sort -c node1,label \\\n", - "-i $OUT/wikidatadwd.all.for.es.embeddings.augmented.unsorted.tsv.gz \\\n", - "--extra '--parallel 24 --buffer-size 30% --temporary-directory /data/amandeep/temp.wikidata-20210215-dwd' \\\n", - "-o $OUT/wikidatadwd.all.for.es.embeddings.augmented.sorted.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Filter out `novalue`, `somevalue` and `P9`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kgtk filter -i $OUT/wikidataos.all.for.triples.tsv.gz \\\n", - " -o $OUT/wikidataos.all.for.triples.filtered.tsv.gz \\\n", - " -p ';;somevalue,novalue,P9' --invert" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add ids for any edge with missing id" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kgtk add-id -i $OUT/wikidataos.all.for.triples.filtered.tsv.gz \\\n", - "-o $OUT/wikidataos.all.for.triples.filtered.id.tsv.gz \\\n", - "--id-style wikidata" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Sort by `id`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!kgtk sort2 -i $OUT/wikidataos.all.for.triples.filtered.id.tsv.gz \\\n", - "-o $OUT/wikidataos.all.for.triples.filtered.id.sorted.tsv.gz \n", - "-c id" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run graph embeddings: complEx" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "In Processing, Please go to /data/amandeep/wikidata-20210215-dwd/graph-embeddings/temp/ge.complex.log to check details\n", - "Opening the input file: 
/data/amandeep/wikidata-20210215-dwd/parts/claims.wikibase-item.tsv.gz\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading gzip /data/amandeep/wikidata-20210215-dwd/parts/claims.wikibase-item.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\trank\n", - "input format: kgtk\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading an edge file.\n", - "Opening the output file: /data/amandeep/wikidata-20210215-dwd/graph-embeddings/temp/tmp_claims.wikibase-item.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing gzip /data/amandeep/wikidata-20210215-dwd/graph-embeddings/temp/tmp_claims.wikibase-item.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\trank\n", - "Processing the input records.\n", - "Processed 182246240 records.\n" + "-rw-r--r-- 1 amandeep isdstaff 175M May 16 04:59 /data/amandeep/wikidata-20220505-dwd-v4/aliases.en.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 2.0G May 16 01:22 /data/amandeep/wikidata-20220505-dwd-v4/aliases.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 39G May 15 22:08 /data/amandeep/wikidata-20220505-dwd-v4/all.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 184M May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.commonsMedia.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 2.5G May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.external-id.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 779K May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.geo-shape.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 227M May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.globe-coordinate.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 689K May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.math.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 295M May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.monolingualtext.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 28K May 16 07:06 
/data/amandeep/wikidata-20220505-dwd-v4/claims.musical-notation.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 88 May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.other.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 2.0G May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.quantity.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.1G May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.string.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 421K May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.tabular-data.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 301M May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.time.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 11G May 16 04:42 /data/amandeep/wikidata-20220505-dwd-v4/claims.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 123M May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.url.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 115K May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.wikibase-form.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 3.6G May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.wikibase-item.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 75K May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.wikibase-lexeme.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 643K May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.wikibase-property.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 965 May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.wikibase-sense.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 12G May 17 03:41 /data/amandeep/wikidata-20220505-dwd-v4/derived.isastar.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 189M May 16 13:27 /data/amandeep/wikidata-20220505-dwd-v4/derived.isa.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 699M May 16 13:05 /data/amandeep/wikidata-20220505-dwd-v4/derived.P279star.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 42M May 16 11:23 /data/amandeep/wikidata-20220505-dwd-v4/derived.P279.tsv.gz\n", + "-rw-r--r-- 1 
amandeep isdstaff 12G May 17 17:49 /data/amandeep/wikidata-20220505-dwd-v4/derived.P31P279star.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 717M May 16 11:22 /data/amandeep/wikidata-20220505-dwd-v4/derived.P31.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 395M May 16 06:01 /data/amandeep/wikidata-20220505-dwd-v4/descriptions.en.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 12G May 16 01:22 /data/amandeep/wikidata-20220505-dwd-v4/descriptions.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 640M May 16 06:30 /data/amandeep/wikidata-20220505-dwd-v4/labels.en.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 5.6G May 16 01:22 /data/amandeep/wikidata-20220505-dwd-v4/labels.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 79M May 17 21:15 /data/amandeep/wikidata-20220505-dwd-v4/metadata.in_degree.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 357M May 17 20:44 /data/amandeep/wikidata-20220505-dwd-v4/metadata.out_degree.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 559M May 17 18:52 /data/amandeep/wikidata-20220505-dwd-v4/metadata.pagerank.directed.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 770M May 17 19:59 /data/amandeep/wikidata-20220505-dwd-v4/metadata.pagerank.undirected.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 53K May 16 01:21 /data/amandeep/wikidata-20220505-dwd-v4/metadata.property.datatypes.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 271M May 16 01:22 /data/amandeep/wikidata-20220505-dwd-v4/metadata.types.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 16M May 16 07:12 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.commonsMedia.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 151M May 16 07:22 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.external-id.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 29K May 16 07:27 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.geo-shape.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 2.9M May 16 07:32 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.globe-coordinate.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 87K May 16 07:38 
/data/amandeep/wikidata-20220505-dwd-v4/qualifiers.math.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 6.8M May 16 07:43 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.monolingualtext.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.8K May 16 07:48 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.musical-notation.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 900M May 16 07:58 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.quantity.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 530M May 16 08:07 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.string.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 201K May 16 08:12 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.tabular-data.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 16M May 16 08:18 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.time.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 2.5G May 16 04:52 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 35M May 16 08:23 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.url.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.1K May 16 08:28 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.wikibase-form.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 695M May 16 08:44 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.wikibase-item.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 9.3K May 16 08:49 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.wikibase-lexeme.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 21K May 16 08:54 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.wikibase-property.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.6K May 16 08:58 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.wikibase-sense.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 88 May 16 06:33 /data/amandeep/wikidata-20220505-dwd-v4/sitelinks.en.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 99 May 16 06:33 /data/amandeep/wikidata-20220505-dwd-v4/sitelinks.qualifiers.en.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 96 May 16 01:22 
/data/amandeep/wikidata-20220505-dwd-v4/sitelinks.qualifiers.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.8G May 16 01:22 /data/amandeep/wikidata-20220505-dwd-v4/sitelinks.tsv.gz\n" ] } ], "source": [ - "# make sure the output directories are created\n", - "!kgtk --debug graph-embeddings --verbose -i $OUT/parts/claims.wikibase-item.tsv.gz \\\n", - "-o $OUT/graph-embeddings/wikidataos.complEx.graph-embeddings.txt \\\n", - "--retain_temporary_data True \\\n", - "--operator ComplEx \\\n", - "--workers 24 \\\n", - "--log $OUT/graph-embeddings/temp/ge.complex.log \\\n", - "-T $OUT/graph-embeddings/temp \\\n", - "-ot w2v \\\n", - "-e 600" + "!ls -lh $OUT/*.tsv.gz" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "kgtk-env", + "display_name": "kgtk-env-ckg07", "language": "python", - "name": "kgtk-env" + "name": "kgtk-env-ckg07" }, "language_info": { "codemirror_mode": { @@ -1236,7 +1094,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.10" + "version": "3.9.12" } }, "nbformat": 4, diff --git a/use-cases/Wikidata Useful Files-Copy1.ipynb b/use-cases/Wikidata Useful Files-Copy1.ipynb deleted file mode 100644 index 6c03e9b20..000000000 --- a/use-cases/Wikidata Useful Files-Copy1.ipynb +++ /dev/null @@ -1,1762 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generating Useful Wikidata Files\n", - "\n", - "This notebook generates files that contain derived data that is useful in many applications. The input to the notebook is the full Wikidata or a subset of Wikidata. 
It also works for arbutrary KGs as long as they follow the representation requirements of Wikidata:\n", - "\n", - "- the *instance of* relation is represented using the `P31` property\n", - "- the *subclass of* relation is represented using the `P279` property\n", - "- all properties declare a datatype, and the data types must be one of the datatypes in Wikidata.\n", - "\n", - "Inputs:\n", - "\n", - "- `claims_file`: contains all statements, which consist of edges `node1/label/node2` where `label` is a property in Wikidata (e.g., sitelinks, labels, aliases and description are not in the claims file.\n", - "- `item_file`: the subset of the `claims_file` consistin of edges for property of data type `wikibase-item`\n", - "- `label_file`, `alias_file` and `description_file` containing labels, aliases and descriptions. It is assume that these files contain the labels, aliases and descriptions of all nodes appearing in the claims file. Users may provide these files for specific languages only.\n", - "\n", - "Outputs:\n", - "\n", - "- **Instance of (P31):** `derived.P31.tsv.gz` contains all the `instance of (P31)` edges present in the claims file.\n", - "- **Subclass of (P279):** `derived.P279.tsv.gz` contains all the `subclass of (P279)` edges present in the claims file.\n", - "- **Is A (isa):** `derived.isa.tsv.gz` contains edges `node`isa/node2` where either `node1/P31/node2` or `node1/P279/node2`\n", - "- **Closure of subclass of (P279star):** `derived.P279star.tsv.gz` contains edges `node1/P279star/node2` where `node2` is reachable from `node1` via zero or more hops using the `P279` property. Note that for example, `Q44/P279star/Q44`. 
An example when this file is useful is when you want to find all the instance of a class, including instances of subclasses of the given class.\n", - "- **In/out degrees:** `metadata.out_degree.tsv.gz` contains the out degree of every node, and `metadata.in_degree.tsv.gz` contains the in degree of every node.\n", - "- **Pagerank:** outputs page rank on the directed graph in `metadata.pagerank.directed.tsv.gz` and page rank of the directed graph in `metadata.pagerank.undirected.tsv.gz`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Batch Invocation\n", - "Example batch command. The second argument is a notebook where the output will be stored. You can load it to see progress.\n", - "\n", - "```\n", - "papermill Wikidata\\ Useful\\ Files.ipynb useful-files.out.ipynb \\\n", - "-p claims_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/all.tsv.gz \\\n", - "-p label_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/part.label.en.tsv.gz \\\n", - "-p item_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/part.wikibase-item.tsv.gz \\\n", - "-p property_item_file = /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/part.property.wikibase-item.tsv.gz \\\n", - "-p output_path \\\n", - "-p output_folder useful_files_v4 \\\n", - "-p temp_folder temp.useful_files_v4 \\\n", - "-p delete_database no \n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "# Parameters\n", - "\n", - "# Folder on local machine where to create the output and temporary folders\n", - "output_path = \"/Users/pedroszekely/Downloads/kypher\"\n", - "\n", - "# The names of the output and temporary folders\n", - "output_folder = \"useful_wikidata_files_v4\"\n", - "temp_folder = \"temp.useful_wikidata_files_v4\"\n", - "\n", - "# The location of input 
files\n", - "wiki_root_folder = \"/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/\"\n", - "claims_file = \"claims.tsv.gz\"\n", - "label_file = \"labels.en.tsv.gz\"\n", - "alias_file = \"aliases.en.tsv.gz\"\n", - "description_file = \"descriptions.en.tsv.gz\"\n", - "item_file = \"claims.wikibase-item.tsv.gz\"\n", - "\n", - "# Location of the cache database for kypher\n", - "cache_path = \"/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4\"\n", - "\n", - "# Whether to delete the cache database\n", - "delete_database = False\n", - "\n", - "# Whether to compute pagerank as it may not run on the laptop\n", - "compute_pagerank = False" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import io\n", - "import os\n", - "import subprocess\n", - "import sys\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import altair as alt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set up environment and folders to store the files\n", - "\n", - "- `OUT` folder where the output files go\n", - "- `TEMP` folder to keep temporary files , including the database\n", - "- `kgtk` shortcut to invoke the kgtk software\n", - "- `kypher` shortcut to invoke `kgtk query with the cache database\n", - "- `CLAIMS` the `all.tsv` file of wikidata that contains all edges except label/alias/description\n", - "- `LABELS` the file with the English labels\n", - "- `ITEMS` the wikibase-item file (currently does not include node1 that are properties so for now we need the net file\n", - "- `STORE` location of the cache file" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "if cache_path:\n", - " os.environ['STORE'] = \"{}/wikidata.sqlite3.db\".format(cache_path)\n", - "else:\n", - " os.environ['STORE'] = \"{}/{}/wikidata.sqlite3.db\".format(output_path, temp_folder)\n", - "os.environ['OUT'] 
= \"{}/{}\".format(output_path, output_folder)\n", - "os.environ['TEMP'] = \"{}/{}\".format(output_path, temp_folder)\n", - "os.environ['kgtk'] = \"kgtk\"\n", - "os.environ['kgtk'] = \"time kgtk --debug\"\n", - "os.environ['kypher'] = \"time kgtk --debug query --graph-cache \" + os.environ['STORE']\n", - "os.environ['CLAIMS'] = wiki_root_folder + claims_file\n", - "os.environ['LABELS'] = wiki_root_folder + label_file\n", - "os.environ['ALIASES'] = wiki_root_folder + alias_file\n", - "os.environ['DESCRIPTIONS'] = wiki_root_folder + description_file\n", - "os.environ['ITEMS'] = wiki_root_folder + item_file" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Echo the variables to see if they are all set correctly" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4\n", - "/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4\n", - "time kgtk --debug\n", - "time kgtk --debug query --graph-cache /Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db\n", - "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz\n", - "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/labels.en.tsv.gz\n", - "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/aliases.en.tsv.gz\n", - "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/labels.en.tsv.gz\n", - "/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/descriptions.en.tsv.gz\n", - "/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db\n" - ] - } - ], - "source": [ - "!echo $OUT\n", - "!echo $TEMP\n", - "!echo $kgtk\n", - "!echo $kypher\n", - "!echo $CLAIMS\n", - "!echo $LABELS\n", - "!echo $ALIASES\n", - "!echo $LABELS\n", - "!echo $DESCRIPTIONS\n", - "!echo 
$STORE\n", - "!alias col=\"column -t -s $'\\t' \"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Go to the output directory and create the subfolders for the output files and the temporary files" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/Users/pedroszekely/Downloads/kypher\n" - ] - } - ], - "source": [ - "cd $output_path" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mkdir: /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4: File exists\n", - "mkdir: /Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4: File exists\n" - ] - } - ], - "source": [ - "!mkdir $OUT\n", - "!mkdir $TEMP" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Clean up the output and temp folders before we start" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# !rm $OUT/*.tsv $OUT/*.tsv.gz\n", - "# !rm $TEMP/*.tsv $TEMP/*.tsv.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "if delete_database:\n", - " print(\"Deleteddatabase\") \n", - " !rm $STORE" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 1888376\n", - "-rw-r--r-- 1 pedroszekely staff 38563332 Nov 14 00:50 all.P279.tsv.gz\n", - "-rw-r--r-- 1 pedroszekely staff 21 Nov 14 00:50 all.P31_P279.tsv.gz\n", - "-rw-r--r-- 1 pedroszekely staff 37 Nov 14 00:51 all.isa.tsv.gz\n", - "-rw-r--r-- 1 pedroszekely staff 38563336 Nov 14 08:14 derived.P279.tsv.gz\n", - "-rw-r--r-- 1 pedroszekely staff 876497386 Nov 14 08:39 derived.P31.tsv.gz\n", - "P279.n1.tsv.gz P279.roots.tsv isa.1.tsv.gz\n", - 
"P279.reachable.tsv.gz P279star.1.tsv.gz wikidata.sqlite3.db\n", - "P279.roots.1.tsv.gz P279star.2.tsv.gz\n", - "P279.roots.2.tsv.gz P31.n2.tsv.gz\n", - "-rw------- 1 pedroszekely staff 24260264435 Nov 10 21:52 /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz\n", - "-rw------- 1 pedroszekely staff 2142929019 Nov 10 22:19 /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/labels.en.tsv.gz\n", - "-rw------- 1 pedroszekely staff 129552943 Nov 10 21:55 /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/aliases.en.tsv.gz\n", - "-rw------- 1 pedroszekely staff 2142929019 Nov 10 22:19 /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/labels.en.tsv.gz\n", - "-rw------- 1 pedroszekely staff 587501239 Nov 10 22:15 /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/descriptions.en.tsv.gz\n", - "/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/wikidata.sqlite3.db\n" - ] - } - ], - "source": [ - "!ls -l $OUT\n", - "!ls $TEMP\n", - "!ls -l \"$CLAIMS\"\n", - "!ls -l \"$LABELS\"\n", - "!ls -l \"$ALIASES\"\n", - "!ls -l \"$LABELS\"\n", - "!ls -l \"$DESCRIPTIONS\"\n", - "!ls $STORE" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "zcat: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "error writing to output: P10-P1628-32b85d-7927ece6-0\tP10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\" normal\turl\n", - "Broken pipe\n", - "P10-P1628-acf60d-b8950832-0\tP10\tP1628\t\"https://schema.org/video\"\tnormal\turl\n", - "P10-P1629-Q34508-bcc39400-0\tP10\tP1629\tQ34508\tnormal\twikibase-item\n", - "P10-P1659-P1651-c4068028-0\tP10\tP1659\tP1651\tnormal\twikibase-property\n", - "P10-P1659-P18-5e4b9c4f-0\tP10\tP1659\tP18\tnormal\twikibase-property\n", - "P10-P1659-P4238-d21d1ac0-0\tP10\tP1659\tP4238\tnormal\twikibase-property\n", - 
"P10-P1659-P51-86aca4c5-0\tP10\tP1659\tP51\tnormal\twikibase-property\n", - "P10-P1855-Q15075950-7eff6d65-0\tP10\tP1855\tQ15075950\tnormal\twikibase-item\n", - "P10-P1855-Q4504-a69d2c73-0\tP10\tP1855\tQ4504\tnormal\twikibase-item\n" - ] - } - ], - "source": [ - "!zcat < \"$CLAIMS\" | head | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Preview the input files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It is always a good practice to peek a the files to make sure the column headings are what we expect" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 08:45:04 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT *\n", - " FROM graph_1 AS graph_1_c1\n", - " LIMIT ?\n", - " PARAS: [10]\n", - "---------------------------------------------\n", - " 0.80 real 0.53 user 0.14 sys\n", - "id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "P10-P1628-32b85d-7927ece6-0\tP10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\" normal\turl\n", - "P10-P1628-acf60d-b8950832-0\tP10\tP1628\t\"https://schema.org/video\"\tnormal\turl\n", - "P10-P1629-Q34508-bcc39400-0\tP10\tP1629\tQ34508\tnormal\twikibase-item\n", - "P10-P1659-P1651-c4068028-0\tP10\tP1659\tP1651\tnormal\twikibase-property\n", - "P10-P1659-P18-5e4b9c4f-0\tP10\tP1659\tP18\tnormal\twikibase-property\n", - "P10-P1659-P4238-d21d1ac0-0\tP10\tP1659\tP4238\tnormal\twikibase-property\n", - "P10-P1659-P51-86aca4c5-0\tP10\tP1659\tP51\tnormal\twikibase-property\n", - "P10-P1855-Q15075950-7eff6d65-0\tP10\tP1855\tQ15075950\tnormal\twikibase-item\n", - "P10-P1855-Q4504-a69d2c73-0\tP10\tP1855\tQ4504\tnormal\twikibase-item\n", - "P10-P1855-Q69063653-c8cdb04c-0\tP10\tP1855\tQ69063653\tnormal\twikibase-item\n" - ] - } - ], - "source": [ - "!$kypher -i \"$CLAIMS\" --limit 10 | col " - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "Force creation of the index on the label column" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 08:45:06 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT *\n", - " FROM graph_1 AS graph_1_c1\n", - " WHERE graph_1_c1.\"label\"=?\n", - " LIMIT ?\n", - " PARAS: ['P31', 5]\n", - "---------------------------------------------\n", - " 0.62 real 0.50 user 0.11 sys\n", - "id node1 label node2 rank node2;wikidatatype\n", - "P10-P31-Q18610173-85ef4d24-0 P10 P31 Q18610173 normal wikibase-item\n", - "P1000-P31-Q18608871-093affb5-0 P1000 P31 Q18608871 normal wikibase-item\n", - "P1001-P31-Q15720608-deeedec9-0 P1001 P31 Q15720608 normal wikibase-item\n", - "P1001-P31-Q22984026-8beb0cfe-0 P1001 P31 Q22984026 normal wikibase-item\n", - "P1001-P31-Q22997934-1e5b1a96-0 P1001 P31 Q22997934 normal wikibase-item\n" - ] - } - ], - "source": [ - "!$kypher -i \"$CLAIMS\" -o - \\\n", - "--match '(i)-[:P31]->(c)' \\\n", - "--limit 5 \\\n", - "| column -t -s $'\\t' " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Force creation of the index on the node2 column" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 08:45:09 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT *\n", - " FROM graph_1 AS graph_1_c1\n", - " WHERE graph_1_c1.\"node2\"=?\n", - " LIMIT ?\n", - " PARAS: ['Q5', 5]\n", - "---------------------------------------------\n", - " 0.62 real 0.50 user 0.11 sys\n", - "id node1 label node2 rank node2;wikidatatype\n", - "P1424-P1855-Q5-47bdcd17-0 P1424 P1855 Q5 normal wikibase-item\n", - "P1963-P1855-Q5-1ba43aca-0 P1963 P1855 Q5 normal wikibase-item\n", - 
"P3055-P1629-Q5-fb63cfeb-0 P3055 P1629 Q5 normal wikibase-item\n", - "P685-P1855-Q5-76c93460-0 P685 P1855 Q5 normal wikibase-item\n", - "P8168-P1855-Q5-1f792f8c-0 P8168 P1855 Q5 normal wikibase-item\n" - ] - } - ], - "source": [ - "!$kypher -i \"$CLAIMS\" -o - \\\n", - "--match '(i)-[r]->(:Q5)' \\\n", - "--limit 5 \\\n", - "| column -t -s $'\\t' " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Count the number of edges" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Counting takes a long time" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 08:04:54 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT count(graph_1_c1.\"id\") \"count\"\n", - " FROM graph_1 AS graph_1_c1\n", - " LIMIT ?\n", - " PARAS: [10]\n", - "---------------------------------------------\n", - "count\n", - "1102950183\n", - " 491.63 real 87.41 user 94.80 sys\n" - ] - } - ], - "source": [ - "!$kypher -i \"$CLAIMS\" \\\n", - "--match '()-[r]->()' \\\n", - "--return 'count(r) as count' \\\n", - "--limit 10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create the P31 and P279 files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create the `P31` file" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 08:45:19 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT graph_1_c1.\"id\", graph_1_c1.\"node1\", graph_1_c1.\"label\", graph_1_c1.\"node2\"\n", - " FROM graph_1 AS graph_1_c1\n", - " WHERE graph_1_c1.\"label\"=?\n", - " PARAS: ['P31']\n", - "---------------------------------------------\n", - " 1573.64 real 930.85 user 202.92 sys\n" - ] - } - ], - "source": 
[ - "!$kypher -i \"$CLAIMS\" -o $OUT/derived.P31.tsv.gz \\\n", - "--match '(n1)-[l:P31]->(n2)' \\\n", - "--return 'l, n1, l.label, n2' " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create the P279 file" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id\tnode1\tlabel\tnode2\n", - "P10-P31-Q18610173-85ef4d24-0\tP10\tP31\tQ18610173\n", - "gzcat: P1000-P31-Q18608871-093affb5-0\tP1000\tP31\tQ18608871\n", - "P1001-P31-Q15720608-deeedec9-0\tP1001\tP31\tQ15720608\n", - "P1001-P31-Q22984026-8beb0cfe-0\tP1001\tP31\tQ22984026\n", - "P1001-P31-Q22997934-1e5b1a96-0\tP1001\tP31\tQ22997934\n", - "P1001-P31-Q61719275-0ccc11a5-0\tP1001\tP31\tQ61719275\n", - "P1002-P31-Q22963600-b3a47587-0\tP1002\tP31\tQ22963600\n", - "error writing to outputP1003-P31-Q19595382-152d2cdd-0\tP1003\tP31\tQ19595382\n", - ": Broken pipe\n", - "P1003-P31-Q19833377-75138cf5-0\tP1003\tP31\tQ19833377\n", - "gzcat: /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/derived.P31.tsv.gz: uncompress failed\n" - ] - } - ], - "source": [ - "!gzcat $OUT/derived.P31.tsv.gz | head | col" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 09:11:34 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT graph_1_c1.\"id\", graph_1_c1.\"node1\", graph_1_c1.\"label\", graph_1_c1.\"node2\"\n", - " FROM graph_1 AS graph_1_c1\n", - " WHERE graph_1_c1.\"label\"=?\n", - " PARAS: ['P279']\n", - "---------------------------------------------\n", - " 102.82 real 38.44 user 18.03 sys\n" - ] - } - ], - "source": [ - "!$kypher -i \"$CLAIMS\" -o $OUT/derived.P279.tsv.gz \\\n", - " --match '(n1)-[l:P279]->(n2)' \\\n", - " --return 'l, n1, l.label, n2' " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - 
"### Create the file that contains all nodes reachable via P279 starting from a node2 in P31 or a node1 in P279" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First compute the roots" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 09:24:17 sqlstore]: IMPORT graph directly into table graph_3 from /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/derived.P279.tsv.gz ...\n", - "[2020-11-14 09:24:31 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT graph_3_c1.\"node1\" \"id\"\n", - " FROM graph_3 AS graph_3_c1\n", - " PARAS: []\n", - "---------------------------------------------\n", - " 27.08 real 34.74 user 0.84 sys\n" - ] - } - ], - "source": [ - "!$kypher -i $OUT/derived.P279.tsv.gz -o $TEMP/P279.n1.tsv.gz \\\n", - "--match '(n1)-[l]->()' \\\n", - "--return 'n1 as id' " - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 10:08:48 sqlstore]: IMPORT graph directly into table graph_4 from /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/derived.P31.tsv.gz ...\n", - "[2020-11-14 10:14:38 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT graph_4_c1.\"node2\" \"id\"\n", - " FROM graph_4 AS graph_4_c1\n", - " PARAS: []\n", - "---------------------------------------------\n", - " 526.29 real 735.22 user 21.42 sys\n" - ] - } - ], - "source": [ - "!$kypher -i $OUT/derived.P31.tsv.gz -o $TEMP/P31.n2.tsv.gz \\\n", - "--match '()-[l]->(n2)' \\\n", - "--return 'n2 as id' " - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "!$kgtk cat --mode NONE -i $TEMP/P31.n2.tsv.gz $TEMP/P279.n1.tsv.gz \\\n", - "| gzip > $TEMP/P279.roots.1.tsv.gz" - ] - 
}, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "!$kgtk sort2 --mode NONE --column id -i $TEMP/P279.roots.1.tsv.gz \\\n", - "| gzip > $TEMP/P279.roots.2.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have lots of duplicates" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id\n", - "Q1\n", - "Q1\n", - "Q1000032\n", - "Q1000032\n", - "Q1000039\n", - "Q1000064\n", - "Q1000084\n", - "Q1000108\n", - "Q1000116\n", - "zcat: error writing to output: Broken pipe\n" - ] - } - ], - "source": [ - "!zcat < $TEMP/P279.roots.2.tsv.gz | head" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "!$kgtk compact -i $TEMP/P279.roots.2.tsv.gz --mode NONE \\\n", - " --presorted \\\n", - " --columns id \\\n", - "> $TEMP/P279.roots.tsv" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can invoke the reachable-nodes command" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 4429.37 real 2866.14 user 1546.66 sys\n" - ] - } - ], - "source": [ - "!$kgtk reachable-nodes \\\n", - " --rootfile $TEMP/P279.roots.tsv \\\n", - " --selflink \\\n", - " -i $OUT/derived.P279.tsv.gz \\\n", - "| gzip > $TEMP/P279.reachable.tsv.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "zcat: node1\tlabel\tnode2\n", - "error writing to outputQ1000032\treachable\tQ1000032\n", - ": Broken pipe\n", - "Q1000032\treachable\tQ1813494\n", - "Q1000032\treachable\tQ1799072\n", - "Q1000032\treachable\tQ16686448\n", - "Q1000032\treachable\tQ35120\n", - "Q1000032\treachable\tnovalue\n", - 
"Q1000032\treachable\tQ2695280\n", - "Q1000032\treachable\tQ1914636\n", - "Q1000032\treachable\tQ20937557\n" - ] - } - ], - "source": [ - "!zcat < $TEMP/P279.reachable.tsv.gz | head | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The reachable-nodes command produces edges labeled `reachable`, so we need one command to rename them." - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 11:46:10 sqlstore]: IMPORT graph directly into table graph_5 from /Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/P279.reachable.tsv.gz ...\n", - "[2020-11-14 11:49:16 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT graph_5_c1.\"node1\", ? \"label\", graph_5_c1.\"node2\" \"node2\"\n", - " FROM graph_5 AS graph_5_c1\n", - " PARAS: ['P279star']\n", - "---------------------------------------------\n", - " 738.99 real 866.13 user 11.47 sys\n" - ] - } - ], - "source": [ - "!$kypher -i $TEMP/P279.reachable.tsv.gz -o $TEMP/P279star.1.tsv.gz \\\n", - "--match '(n1)-[]->(n2)' \\\n", - "--return 'n1, \"P279star\" as label, n2 as node2' " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can concatenate these files to produce the final output" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 239.06 real 232.07 user 47.03 sys\n" - ] - } - ], - "source": [ - "!$kgtk sort2 -i $TEMP/P279star.1.tsv.gz -o $TEMP/P279star.2.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure there are no duplicates" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 1372.23 real 1368.60 user 2.27 sys\n" - ] - } - ], - 
"source": [ - "!$kgtk compact --presorted -i $TEMP/P279star.2.tsv.gz -o $TEMP/P279star.3.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add ids" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 1273.07 real 1239.92 user 19.64 sys\n" - ] - } - ], - "source": [ - "!$kgtk add-id --id-style node1-label-node2-num -i $TEMP/P279star.3.tsv.gz -o $OUT/derived.P279star.tsv.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "zcat: node1\tlabel\tnode2\tid\n", - "error writing to outputQ1000032\tP279star\tQ1000032\tQ1000032-P279star-Q1000032-0000\n", - ": Broken pipe\n", - "Q1000032\tP279star\tQ1150070\tQ1000032-P279star-Q1150070-0000\n", - "Q1000032\tP279star\tQ1190554\tQ1000032-P279star-Q1190554-0000\n", - "Q1000032\tP279star\tQ133500 Q1000032-P279star-Q133500-0000\n", - "Q1000032\tP279star\tQ13878858\tQ1000032-P279star-Q13878858-0000\n", - "Q1000032\tP279star\tQ14819853\tQ1000032-P279star-Q14819853-0000\n", - "Q1000032\tP279star\tQ14912053\tQ1000032-P279star-Q14912053-0000\n", - "Q1000032\tP279star\tQ16686448\tQ1000032-P279star-Q16686448-0000\n", - "Q1000032\tP279star\tQ16722960\tQ1000032-P279star-Q16722960-0000\n" - ] - } - ], - "source": [ - "!zcat < $OUT/derived.P279star.tsv.gz | head | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is how we would do the typical `?item P31/P279* ?class` in Kypher. \n", - "The example shows how to get all the counts of instances of subclasses of city (Q515)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 12:54:48 sqlstore]: IMPORT graph directly into table graph_6 from /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/derived.P279star.tsv.gz ...\n", - "[2020-11-14 13:01:15 sqlstore]: IMPORT graph directly into table graph_7 from /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/labels.en.tsv.gz ...\n", - "[2020-11-14 13:10:32 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT DISTINCT graph_4_c1.\"node2\" \"class\", count(graph_4_c1.\"node2\") \"count\", graph_7_c4.\"node2\" \"class name\", graph_7_c3.\"node1\" \"instance\", graph_7_c3.\"node2\" \"label\"\n", - " FROM graph_4 AS graph_4_c1, graph_6 AS graph_6_c2, graph_7 AS graph_7_c3, graph_7 AS graph_7_c4\n", - " WHERE graph_4_c1.\"label\"=?\n", - " AND graph_6_c2.\"node2\"=?\n", - " AND graph_7_c3.\"label\"=?\n", - " AND graph_7_c4.\"label\"=?\n", - " AND graph_4_c1.\"node1\"=graph_7_c3.\"node1\"\n", - " AND graph_4_c1.\"node2\"=graph_6_c2.\"node1\"\n", - " AND graph_6_c2.\"node1\"=graph_7_c4.\"node1\"\n", - " GROUP BY class\n", - " ORDER BY count(graph_4_c1.\"node2\") DESC, graph_4_c1.\"node2\" ASC, graph_7_c3.\"node1\" ASC\n", - " LIMIT ?\n", - " PARAS: ['P31', 'Q515', 'label', 'label', 10]\n", - "---------------------------------------------\n", - "[2020-11-14 13:10:32 sqlstore]: CREATE INDEX on table graph_7 column label ...\n", - "[2020-11-14 13:11:47 sqlstore]: ANALYZE INDEX on table graph_7 column label ...\n", - "[2020-11-14 13:11:51 sqlstore]: CREATE INDEX on table graph_4 column label ...\n", - "[2020-11-14 13:12:52 sqlstore]: ANALYZE INDEX on table graph_4 column label ...\n", - "[2020-11-14 13:12:57 sqlstore]: CREATE INDEX on table graph_4 column node1 ...\n", - "[2020-11-14 13:13:52 sqlstore]: ANALYZE INDEX on table graph_4 column node1 ...\n", - 
"[2020-11-14 13:13:58 sqlstore]: CREATE INDEX on table graph_7 column node1 ...\n", - "[2020-11-14 13:15:12 sqlstore]: ANALYZE INDEX on table graph_7 column node1 ...\n", - "[2020-11-14 13:15:18 sqlstore]: CREATE INDEX on table graph_6 column node1 ...\n", - "[2020-11-14 13:16:25 sqlstore]: ANALYZE INDEX on table graph_6 column node1 ...\n", - "[2020-11-14 13:16:30 sqlstore]: CREATE INDEX on table graph_4 column node2 ...\n", - "[2020-11-14 13:17:54 sqlstore]: ANALYZE INDEX on table graph_4 column node2 ...\n", - "[2020-11-14 13:17:59 sqlstore]: CREATE INDEX on table graph_6 column node2 ...\n", - "[2020-11-14 13:19:48 sqlstore]: ANALYZE INDEX on table graph_6 column node2 ...\n", - " 1519.15 real 1848.63 user 126.54 sys\n", - "class\tcount\tclass name\tinstance\tlabel\n", - "Q1093829\t9783\t'city of the United States'@en\tQ100\t'Boston'@en\n", - "Q515\t8085\t'city'@en\tQ1000143\t'Güigüe'@en\n", - "Q918230 3179\t'Roman villa'@en\tQ10275577\t'Estação arqueológica romana da Praia da Luz'@en\n", - "Q42744322\t2086\t'urban municipality of Germany'@en\tQ1017\t'Aachen'@en\n", - "Q15661340\t1453\t'ancient city'@en\tQ1001370\t'Colossae'@en\n", - "Q56557504\t1234\t'city of Iran'@en\tQ1020637\t'Jahrom'@en\n", - "Q494721 712\t'city of Japan'@en\tQ1011145\t'Itoshima'@en\n", - "Q20541692\t504\t'settlement in Galicia, Spain'@en\tQ11232456\t'A Peneda'@en\n", - "Q11939023\t418\t'population nucleus (Spain)'@en Q1050185\t'Les Escaldes'@en\n", - "Q13539802\t385\t'place with town rights and privileges'@en\tQ100076 'Valkenburg'@en\n" - ] - } - ], - "source": [ - "!$kypher -i $OUT/derived.P31.tsv.gz -i $OUT/derived.P279star.tsv.gz -i \"$LABELS\" \\\n", - "--match 'P31: (n1)-[:P31]->(c), P279star: (c)-[]->(:Q515), label: (n1)-[:label]->(label), label: (c)-[:label]->(c_label)' \\\n", - "--return 'distinct c as class, count(c) as count, c_label as `class name`, n1 as instance, label as `label`' \\\n", - "--order-by 'count(c) desc, c, n1' \\\n", - "--limit 10 \\\n", - "| col" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "Illustrate that it is indeed `P279*`" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 13:44:45 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT DISTINCT graph_6_c2.\"node1\" \"class\", graph_7_c4.\"node2\" \"class name\", graph_7_c3.\"node1\" \"instance\", graph_7_c3.\"node2\" \"label\"\n", - " FROM graph_4 AS graph_4_c1, graph_6 AS graph_6_c2, graph_7 AS graph_7_c3, graph_7 AS graph_7_c4\n", - " WHERE graph_4_c1.\"label\"=?\n", - " AND graph_6_c2.\"node2\"=?\n", - " AND graph_7_c3.\"label\"=?\n", - " AND graph_7_c4.\"label\"=?\n", - " AND graph_4_c1.\"node1\"=graph_7_c3.\"node1\"\n", - " AND graph_4_c1.\"node2\"=graph_6_c2.\"node1\"\n", - " AND graph_4_c1.\"node2\"=graph_7_c4.\"node1\"\n", - " ORDER BY graph_6_c2.\"node1\" ASC, graph_7_c3.\"node1\" ASC\n", - " LIMIT ?\n", - " PARAS: ['P31', 'Q63440326', 'label', 'label', 10]\n", - "---------------------------------------------\n", - " 1.28 real 0.61 user 0.22 sys\n", - "class\tclass name\tinstance\tlabel\n", - "Q63440326\t'city of Oregon'@en\tQ1003672\t'Cascade Locks'@en\n", - "Q63440326\t'city of Oregon'@en\tQ1003826\t'Yamhill'@en\n", - "Q63440326\t'city of Oregon'@en\tQ1003838\t'La Pine'@en\n", - "Q63440326\t'city of Oregon'@en\tQ1007028\t'Cottage Grove'@en\n", - "Q63440326\t'city of Oregon'@en\tQ1012169\t'Mount Vernon'@en\n", - "Q63440326\t'city of Oregon'@en\tQ1012351\t'Sutherlin'@en\n", - "Q63440326\t'city of Oregon'@en\tQ1012371\t'Lafayette'@en\n", - "Q63440326\t'city of Oregon'@en\tQ1022986\t'Paisley'@en\n", - "Q63440326\t'city of Oregon'@en\tQ1065556\t'Gold Beach'@en\n", - "Q63440326\t'city of Oregon'@en\tQ1152219\t'Baker City'@en\n" - ] - } - ], - "source": [ - "!$kypher -i $OUT/derived.P31.tsv.gz -i $OUT/derived.P279star.tsv.gz -i \"$LABELS\" \\\n", - "--match 'P31: 
(n1)-[:P31]->(c), P279star: (c)-[]->(:Q63440326), label: (n1)-[:label]->(label), label: (c)-[:label]->(c_label)' \\\n", - "--return 'distinct c as class, c_label as `class name`, n1 as instance, label as `label`' \\\n", - "--order-by 'c, n1' \\\n", - "--limit 10 \\\n", - "| col " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Test that `P279star` is indeed star" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 14:58:59 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT *\n", - " FROM graph_6 AS graph_6_c1\n", - " WHERE graph_6_c1.\"label\"=?\n", - " AND graph_6_c1.\"node1\"=?\n", - " AND graph_6_c1.\"node2\"=?\n", - " PARAS: ['P279star', 'Q44', 'Q44']\n", - "---------------------------------------------\n", - "[2020-11-14 14:58:59 sqlstore]: CREATE INDEX on table graph_6 column label ...\n", - "[2020-11-14 15:00:08 sqlstore]: ANALYZE INDEX on table graph_6 column label ...\n", - "node1\tlabel\tnode2\tid\n", - "Q44\tP279star\tQ44\tQ44-P279star-Q44-0000\n", - " 78.41 real 38.58 user 16.33 sys\n" - ] - } - ], - "source": [ - "!$kypher -i $OUT/derived.P279star.tsv.gz \\\n", - "--match '(n1:Q44)-[:P279star]->(n2:Q44)'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a file to do generalized Is-A queries\n", - "The idea is that `(n1)-[:isa]->(n2)` when `(n1)-[:P31]->(n2)` or `(n1)-[:P279]->(n2)`\n", - "\n", - "We do this by concatenating the files and renaming the relation" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 435.98 real 431.80 user 2.86 sys\n" - ] - } - ], - "source": [ - "!$kgtk cat -i $OUT/derived.P31.tsv.gz $OUT/derived.P279.tsv.gz \\\n", - " | gzip > $TEMP/isa.1.tsv.gz" - ] - }, - { - "cell_type": "code", - 
"execution_count": 47, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 13:27:24 sqlstore]: IMPORT graph directly into table graph_8 from /Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4/isa.1.tsv.gz ...\n", - "[2020-11-14 13:33:32 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT graph_8_c1.\"node1\", ? \"label\", graph_8_c1.\"node2\"\n", - " FROM graph_8 AS graph_8_c1\n", - " PARAS: ['isa']\n", - "---------------------------------------------\n", - " 736.21 real 953.69 user 24.91 sys\n" - ] - } - ], - "source": [ - "!$kypher -i $TEMP/isa.1.tsv.gz -o $OUT/derived.isa.tsv.gz \\\n", - "--match '(n1)-[]->(n2)' \\\n", - "--return 'n1, \"isa\" as label, n2' " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Example of how to use the `isa` relation" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 13:39:41 sqlstore]: IMPORT graph directly into table graph_9 from /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/derived.isa.tsv.gz ...\n", - "[2020-11-14 13:42:09 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT DISTINCT graph_9_c1.\"node1\", graph_9_c1.\"label\", ? 
\"node2\", graph_7_c3.\"node2\" \"n1_label\"\n", - " FROM graph_6 AS graph_6_c2, graph_7 AS graph_7_c3, graph_9 AS graph_9_c1\n", - " WHERE graph_6_c2.\"node2\"=?\n", - " AND graph_7_c3.\"label\"=?\n", - " AND graph_9_c1.\"label\"=?\n", - " AND graph_6_c2.\"node1\"=graph_9_c1.\"node2\"\n", - " AND graph_7_c3.\"node1\"=graph_9_c1.\"node1\"\n", - " LIMIT ?\n", - " PARAS: ['Q44', 'Q44', 'label', 'isa', 10]\n", - "---------------------------------------------\n", - "[2020-11-14 13:42:09 sqlstore]: CREATE INDEX on table graph_9 column label ...\n", - "[2020-11-14 13:42:50 sqlstore]: ANALYZE INDEX on table graph_9 column label ...\n", - "[2020-11-14 13:42:56 sqlstore]: CREATE INDEX on table graph_9 column node1 ...\n", - "[2020-11-14 13:43:32 sqlstore]: ANALYZE INDEX on table graph_9 column node1 ...\n", - "[2020-11-14 13:43:38 sqlstore]: CREATE INDEX on table graph_9 column node2 ...\n", - "[2020-11-14 13:44:37 sqlstore]: ANALYZE INDEX on table graph_9 column node2 ...\n", - " 304.75 real 389.56 user 23.10 sys\n", - "node1\tlabel\tnode2\tn1_label\n", - "Q15875298\tisa\tQ44\t'Floreffe'@en\n", - "Q1917255\tisa\tQ44\t'St-Idesbald'@en\n", - "Q2004062\tisa\tQ44\t'Sancti Adalberti'@en\n", - "Q2006077\tisa\tQ44\t'Bonne-Espérance abbey'@en\n", - "Q2272636\tisa\tQ44\t'Ename beer'@en\n", - "Q2290730\tisa\tQ44\t'Ter Dolen (beer)'@en\n", - "Q3625571\tisa\tQ44\t'Herkenrode Tripel'@en\n", - "Q505815 isa\tQ44\t'Trappist beer'@en\n", - "Q747216 isa\tQ44\t'Bornem'@en\n", - "Q15985396\tisa\tQ44\t'Crabbelaer'@en\n" - ] - } - ], - "source": [ - "!$kypher -i $OUT/derived.isa.tsv.gz -i $OUT/derived.P279star.tsv.gz -i \"$LABELS\" -o - \\\n", - "--match 'isa: (n1)-[l:isa]->(c), P279star: (c)-[]->(:Q44), label: (n1)-[:label]->(label)' \\\n", - "--return 'distinct n1, l.label, \"Q44\" as node2, label as n1_label' \\\n", - "--limit 10 \\\n", - "| col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compute pagerank" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "Now compute pagerank. These commands will exceed 16GB memory for graphs containing over 25 million nodes." - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_pagerank:\n", - " !$kgtk graph-statistics -i \"$ITEMS\" -o $OUT/metadata.pagerank.directed.tsv.gz \\\n", - " --page-rank-property directed_pagerank \\\n", - " --pagerank --statistics-only \\\n", - " --log $TEMP/metadata.pagerank.directed.summary.txt " - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_pagerank:\n", - " !cat $TEMP/metadata.pagerank.directed.summary.txt" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_pagerank:\n", - " !$kgtk graph-statistics -i \"$ITEMS\" -o $OUT/metadata.pagerank.undirected.tsv.gz \\\n", - " --page-rank-property undirected_pagerank \\\n", - " --pagerank --statistics-only \\\n", - " --log $TEMP/metadata.pagerank.undirected.summary.txt " - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_pagerank:\n", - " !cat $TEMP/metadata.pagerank.undirected.summary.txt " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compute Degrees" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Kypher can compute the out degree by counting the node2s for each node1" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 18:33:05 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT DISTINCT graph_1_c1.\"node1\" \"node1\", count(graph_1_c1.\"id\") \"node2\", ? 
\"label\"\n", - " FROM graph_1 AS graph_1_c1\n", - " GROUP BY node1\n", - " PARAS: ['out_degree']\n", - "---------------------------------------------\n", - " 2160.01 real 986.18 user 826.28 sys\n" - ] - } - ], - "source": [ - "!$kypher -i \"$CLAIMS\" -o $TEMP/metadata.out_degree.tsv.gz \\\n", - "--match '(n1)-[l]->()' \\\n", - "--return 'distinct n1 as node1, count(l) as node2, \"out_degree\" as label' " - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 707.37 real 742.88 user 69.35 sys\n" - ] - } - ], - "source": [ - "!$kgtk add-id --id-style node1-label-node2-num -i $TEMP/metadata.out_degree.tsv.gz \\\n", - "/ sort2 -o $OUT/metadata.out_degree.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To count the in-degree we only care when the node2 is a wikibase-item" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 19:20:53 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT DISTINCT graph_1_c1.\"node2\" \"node1\", count(graph_1_c1.\"id\") \"node2\", ? 
\"label\"\n", - " FROM graph_1 AS graph_1_c1\n", - " WHERE graph_1_c1.\"node2;wikidatatype\"=?\n", - " GROUP BY node1\n", - " PARAS: ['in_degree', 'wikibase-item']\n", - "---------------------------------------------\n", - " 1342.16 real 458.20 user 498.16 sys\n" - ] - } - ], - "source": [ - "!$kypher -i \"$CLAIMS\" -o $TEMP/metadata.in_degree.tsv.gz \\\n", - "--match '()-[l {`node2;wikidatatype`:\"wikibase-item\"}]->(n2)' \\\n", - "--return 'distinct n2 as node1, count(l) as node2, \"in_degree\" as label' " - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 29.74 real 32.55 user 1.18 sys\n" - ] - } - ], - "source": [ - "!$kgtk add-id --id-style node1-label-node2-num -i $TEMP/metadata.in_degree.tsv.gz \\\n", - "/ sort2 -o $OUT/metadata.in_degree.tsv.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "node1\tnode2\tlabel\tid\n", - "Q1\t11\tin_degree\tQ1-in_degree-11-0000\n", - "Q1\t12\tin_degree\tQ1-in_degree-12-0000\n", - "Q1\t17\tin_degree\tQ1-in_degree-17-0000\n", - "Q1\t2\tin_degree\tQ1-in_degree-2-0000\n", - "Q1\t3\tin_degree\tQ1-in_degree-3-0000\n", - "zcat: Q1\t4\tin_degree\tQ1-in_degree-4-0000\n", - "Q1\t6\tin_degree\tQ1-in_degree-6-0000\n", - "Q100\t1\tin_degree\tQ100-in_degree-1-0000\n", - "Q100\t10\tin_degree\tQ100-in_degree-10-0000\n", - "error writing to output: Broken pipe\n" - ] - } - ], - "source": [ - "!zcat < $OUT/metadata.in_degree.tsv.gz | head | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculate the distribution so we can make a nice chart" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 19:43:45 sqlstore]: DROP graph data table graph_12 from 
/Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/metadata.in_degree.tsv.gz\n", - "[2020-11-14 19:45:40 sqlstore]: IMPORT graph directly into table graph_12 from /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/metadata.in_degree.tsv.gz ...\n", - "[2020-11-14 19:45:56 query]: SQL Translation:\n", - "---------------------------------------------\n", - " SELECT DISTINCT graph_12_c1.\"node2\" \"in_degree\", count(DISTINCT graph_12_c1.\"node1\") \"count\", ? \"label\"\n", - " FROM graph_12 AS graph_12_c1\n", - " GROUP BY in_degree\n", - " ORDER BY CAST(graph_12_c1.\"node2\" AS integer) ASC\n", - " PARAS: ['count']\n", - "---------------------------------------------\n", - " 135.41 real 33.74 user 43.07 sys\n" - ] - } - ], - "source": [ - "!$kypher -i $OUT/metadata.in_degree.tsv.gz -o $OUT/statistics.in_degree.distribution.tsv \\\n", - "--match '(n1)-[]->(n2)' \\\n", - "--return 'distinct n2 as in_degree, count(distinct n1) as count, \"count\" as label' \\\n", - "--order-by 'cast(n2, integer)' " - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "head: /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/metadata.in_degree.distribution.tsv: No such file or directory\n" - ] - } - ], - "source": [ - "!head $OUT/metadata.in_degree.distribution.tsv | col" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-14 19:46:01 sqlstore]: DROP graph data table graph_11 from /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/metadata.out_degree.tsv.gz\n", - "[2020-11-14 19:48:11 sqlstore]: IMPORT graph directly into table graph_11 from /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/metadata.out_degree.tsv.gz ...\n", - "[2020-11-14 19:53:34 query]: SQL Translation:\n", - 
"---------------------------------------------\n", - " SELECT DISTINCT graph_11_c1.\"node2\" \"out_degree\", count(DISTINCT graph_11_c1.\"node1\") \"count\", ? \"label\"\n", - " FROM graph_11 AS graph_11_c1\n", - " GROUP BY out_degree\n", - " ORDER BY CAST(graph_11_c1.\"node2\" AS integer) ASC\n", - " PARAS: ['count']\n", - "---------------------------------------------\n", - " 593.21 real 659.68 user 72.66 sys\n" - ] - } - ], - "source": [ - "!$kypher -i $OUT/metadata.out_degree.tsv.gz -o $OUT/statistics.out_degree.distribution.tsv \\\n", - "--match '(n1)-[]->(n2)' \\\n", - "--return 'distinct n2 as out_degree, count(distinct n1) as count, \"count\" as label' \\\n", - "--order-by 'cast(n2, integer)' " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Draw some charts" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 86, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data = pd.read_csv(\n", - " os.environ[\"OUT\"] + \"/statistics.in_degree.distribution.tsv\", sep=\"\\t\"\n", - ")\n", - "\n", - "alt.Chart(data).mark_circle(size=60).encode(\n", - " x=alt.X(\"in_degree\", scale=alt.Scale(type=\"log\")),\n", - " y=alt.Y(\"count\", scale=alt.Scale(type=\"log\"), title=\"count of nodes\"),\n", - " tooltip=[\"in_degree\", \"count\"],\n", - ").interactive().properties(title=\"Distribution of In Degree\")" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.Chart(...)" - ] - }, - "execution_count": 87, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "data = pd.read_csv(\n", - " os.environ[\"OUT\"] + \"/statistics.out_degree.distribution.tsv\", sep=\"\\t\"\n", - ")\n", - "\n", - "alt.Chart(data).mark_circle(size=60).encode(\n", - " x=alt.X(\"out_degree\", scale=alt.Scale(type=\"log\")),\n", - " y=alt.Y(\"count\", scale=alt.Scale(type=\"log\"), title=\"count of nodes\"),\n", - " tooltip=[\"out_degree\", \"count\"],\n", - ").interactive().properties(title=\"Distribution of Out Degree\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary of results" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-rw-r--r-- 1 pedroszekely staff 37M Nov 14 09:13 /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/derived.P279.tsv.gz\n", - "-rw-r--r-- 1 pedroszekely staff 500M Nov 14 12:54 /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/derived.P279star.tsv.gz\n", - "-rw-r--r-- 1 pedroszekely staff 973M Nov 14 09:11 /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/derived.P31.tsv.gz\n", - "-rw-r--r-- 1 pedroszekely staff 252M Nov 14 13:39 /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/derived.isa.tsv.gz\n", - "-rw-r--r-- 1 pedroszekely staff 21M Nov 14 19:43 /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/metadata.in_degree.tsv.gz\n", - "-rw-r--r-- 1 pedroszekely staff 512M Nov 14 19:20 /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/metadata.out_degree.tsv.gz\n", - "-rw-r--r-- 1 pedroszekely staff 11K Nov 14 19:46 /Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/statistics.in_degree.distribution.tsv\n", - "-rw-r--r-- 1 pedroszekely staff 21K Nov 14 19:55 
/Users/pedroszekely/Downloads/kypher/useful_wikidata_files_v4/statistics.out_degree.distribution.tsv\n" - ] - } - ], - "source": [ - "!ls -lh $OUT/*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Highest page rank" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 36.63 real 4.07 user 7.29 sys\n", - "node1 labe page_rank\n", - "Q81581 'Szeged'@en 9.99820910584327e-06\n", - "Q474406 'Tropiduchidae'@en 9.99775062441874e-06\n", - "Q102496 'parish'@en 9.989295648293259e-06\n", - "Q19830596 'Rubens'@en 9.98709688634465e-06\n", - "Q211661 'Jämtland'@en 9.983987961260548e-06\n", - "Q10361310 'Rick Bonadio'@en 9.983680487413594e-06\n", - "Q688275 'São Leopoldo'@en 9.972744274999134e-06\n", - "Q15008131 'Category:Acyrthosiphon'@en 9.971425871370554e-06\n", - "Q9876232 'Category:Colladonus'@en 9.971425871370554e-06\n", - "Q10387575 'registered historic monument'@en 9.963088508250605e-06\n" - ] - } - ], - "source": [ - "if compute_pagerank:\n", - " !$kypher -i $OUT/metadata.pagerank.undirected.tsv.gz -i \"$LABELS\" -o - \\\n", - " --match 'pagerank: (n1)-[:undirected_pagerank]->(page_rank), label: (n1)-[:label]->(label)' \\\n", - " --return 'distinct n1, label as label, page_rank as `undirected page rank' \\\n", - " --order-by 'cast(page_rank, float) desc' \\\n", - " --limit 10 \\\n", - " | col" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ls: : No such file or directory\n" - ] - } - ], - "source": [ - "!ls \"$QUALS\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "celltoolbar": "Tags", - "kernelspec": { - "display_name": "kgtk", - "language": "python", - "name": "kgtk" - }, - "language_info": { - "codemirror_mode": { - "name": 
"ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.8" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/use-cases/Wikidata Useful Files.ipynb b/use-cases/Wikidata Useful Files.ipynb index 63f3876dd..4536df177 100644 --- a/use-cases/Wikidata Useful Files.ipynb +++ b/use-cases/Wikidata Useful Files.ipynb @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 3, "metadata": { "tags": [ "parameters" @@ -78,8 +78,8 @@ "source": [ "# Parameters\n", "\n", - "input_path = \"/data/amandeep/wikidata-20211027-dwd-v3\"\n", - "output_path = \"/data/amandeep/wikidata-20211027-dwd-v3\"\n", + "input_path = \"/data/amandeep/wikidata-20220505/import-wikidata/data\"\n", + "output_path = \"/data/amandeep/wikidata-20220505/import-wikidata/data\"\n", "kgtk_path = \"/Users/amandeep/github/kgtk\"\n", "\n", "graph_cache_path = None\n", @@ -90,14 +90,14 @@ "\n", "files = 'claims,label_all,alias_all,description_all'\n", "\n", - "compute_pagerank = True\n", - "compute_degrees = True\n", + "compute_pagerank = False\n", + "compute_degrees = False\n", "debug = False" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -107,7 +107,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -115,7 +115,7 @@ "output_type": "stream", "text": [ "User home: /nas/home/amandeep\n", - "Current dir: /data/amandeep/github/kgtk/use-cases\n", + "Current dir: /data/amandeep/Github/kgtk/use-cases\n", "KGTK dir: /Users/amandeep/github/kgtk\n", "Use-cases dir: /Users/amandeep/github/kgtk/use-cases\n" ] @@ -131,28 +131,28 @@ }, { "cell_type": "code", - 
"execution_count": 52, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "KGTK_OPTION_DEBUG: false\n", - "TEMP: /data/amandeep/wikidata-20211027-dwd-v3/useful-files/temp.useful-files\n", + "KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20220505/import-wikidata/data/useful-files/temp.useful-files/wikidata.sqlite3.db\n", + "kypher: kgtk query --graph-cache /data/amandeep/wikidata-20220505/import-wikidata/data/useful-files/temp.useful-files/wikidata.sqlite3.db\n", "EXAMPLES_DIR: /Users/amandeep/github/kgtk/examples\n", - "KGTK_LABEL_FILE: /data/amandeep/wikidata-20211027-dwd-v3/labels.en.tsv.gz\n", - "STORE: /data/amandeep/wikidata-20211027-dwd-v3/useful-files/temp.useful-files/wikidata.sqlite3.db\n", - "kgtk: kgtk\n", - "KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20211027-dwd-v3/useful-files/temp.useful-files/wikidata.sqlite3.db\n", - "GRAPH: /data/amandeep/wikidata-20211027-dwd-v3\n", "USE_CASES_DIR: /Users/amandeep/github/kgtk/use-cases\n", - "OUT: /data/amandeep/wikidata-20211027-dwd-v3/useful-files\n", - "kypher: kgtk query --graph-cache /data/amandeep/wikidata-20211027-dwd-v3/useful-files/temp.useful-files/wikidata.sqlite3.db\n", - "claims: /data/amandeep/wikidata-20211027-dwd-v3/claims.tsv.gz\n", - "label_all: /data/amandeep/wikidata-20211027-dwd-v3/labels.tsv.gz\n", - "alias_all: /data/amandeep/wikidata-20211027-dwd-v3/aliases.tsv.gz\n", - "description_all: /data/amandeep/wikidata-20211027-dwd-v3/descriptions.tsv.gz\n" + "KGTK_LABEL_FILE: /data/amandeep/wikidata-20220505/import-wikidata/data/labels.en.tsv.gz\n", + "kgtk: kgtk\n", + "STORE: /data/amandeep/wikidata-20220505/import-wikidata/data/useful-files/temp.useful-files/wikidata.sqlite3.db\n", + "GRAPH: /data/amandeep/wikidata-20220505/import-wikidata/data\n", + "OUT: /data/amandeep/wikidata-20220505/import-wikidata/data/useful-files\n", + "TEMP: /data/amandeep/wikidata-20220505/import-wikidata/data/useful-files/temp.useful-files\n", + 
"KGTK_OPTION_DEBUG: false\n", + "claims: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\n", + "label_all: /data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz\n", + "alias_all: /data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz\n", + "description_all: /data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz\n" ] } ], @@ -162,14 +162,14 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "kgtk query --graph-cache /data/amandeep/wikidata-20211027-dwd-v3/useful-files/temp.useful-files/wikidata.sqlite3.db -i \"/data/amandeep/wikidata-20211027-dwd-v3/claims.tsv.gz\" --as claims -i \"/data/amandeep/wikidata-20211027-dwd-v3/labels.tsv.gz\" --as label_all -i \"/data/amandeep/wikidata-20211027-dwd-v3/aliases.tsv.gz\" --as alias_all -i \"/data/amandeep/wikidata-20211027-dwd-v3/descriptions.tsv.gz\" --as description_all --limit 3\n", + "kgtk query --graph-cache /data/amandeep/wikidata-20220505/import-wikidata/data/useful-files/temp.useful-files/wikidata.sqlite3.db -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\" --as claims -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz\" --as label_all -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz\" --as alias_all -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz\" --as description_all --limit 3\n", "id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", "P10-P1628-32b85d-7927ece6-0\tP10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\tnormal\turl\n", "P10-P1628-acf60d-b8950832-0\tP10\tP1628\t\"https://schema.org/video\"\tnormal\turl\n", @@ -182,6 +182,49 @@ " ck.load_files_into_cache()" ] }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Graph 
Cache:\n", + "DB file: /data/amandeep/wikidata-20220505/import-wikidata/data/useful-files/temp.useful-files/wikidata.sqlite3.db\n", + " size: 424.57 GB \tfree: 0 Bytes \tmodified: 2022-05-11 11:30:56\n", + "\n", + "KGTK File Information:\n", + "alias_all:\n", + " size: 2.07 GB \tmodified: 2022-05-11 06:01:24 \tgraph: graph_3\n", + "claims:\n", + " size: 27.33 GB \tmodified: 2022-05-11 05:55:01 \tgraph: graph_1\n", + "description_all:\n", + " size: 23.66 GB \tmodified: 2022-05-11 07:08:19 \tgraph: graph_4\n", + "label_all:\n", + " size: 7.88 GB \tmodified: 2022-05-11 07:21:33 \tgraph: graph_2\n", + "\n", + "Graph Table Information:\n", + "graph_1:\n", + " size: 119.52 GB \tcreated: 2022-05-11 09:35:56\n", + " header: ['id', 'node1', 'label', 'node2', 'rank', 'node2;wikidatatype']\n", + "graph_2:\n", + " size: 54.66 GB \tcreated: 2022-05-11 09:57:48\n", + " header: ['id', 'node1', 'label', 'node2', 'lang']\n", + "graph_3:\n", + " size: 11.91 GB \tcreated: 2022-05-11 10:02:43\n", + " header: ['id', 'node1', 'label', 'node2', 'lang']\n", + "graph_4:\n", + " size: 238.50 GB \tcreated: 2022-05-11 11:30:56\n", + " header: ['id', 'node1', 'label', 'node2', 'lang']\n" + ] + } + ], + "source": [ + "!kgtk query --show-cache" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -198,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -223,7 +266,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -250,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -284,7 +327,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -295,7 +338,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -304,16 +347,16 @@ "text": [ "id\tnode1\tlabel\tnode2\n", 
"P10-P31-Q18610173-85ef4d24-0\tP10\tP31\tQ18610173\n", + "P10-P31-Q19847637-e81ded71-0\tP10\tP31\tQ19847637\n", "P1000-P31-Q18608871-093affb5-0\tP1000\tP31\tQ18608871\n", "P10000-P31-Q19833377-f87f0d4c-0 P10000\tP31\tQ19833377\n", - "\n", - "gzip: P10000-P31-Q89560413-f555a944-0 P10000\tP31\tQ89560413\n", + "P10000-P31-Q89560413-f555a944-0 P10000\tP31\tQ89560413\n", "P10001-P31-Q107738007-c7725ce7-0\tP10001\tP31\tQ107738007\n", - "stdout: Broken pipe\n", "P10001-P31-Q64221137-d154ffd9-0 P10001\tP31\tQ64221137\n", "P10002-P31-Q93433126-dbd52b84-0 P10002\tP31\tQ93433126\n", "P10003-P31-Q108914651-f3644858-0\tP10003\tP31\tQ108914651\n", - "P10003-P31-Q42396390-7f1b5502-0 P10003\tP31\tQ42396390\n" + "\n", + "gzip: stdout: Broken pipe\n" ] } ], @@ -330,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -341,7 +384,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -349,17 +392,17 @@ "output_type": "stream", "text": [ "id\tnode1\tlabel\tnode2\n", + "P2217-P279-Q986260-6ee7fda9-0\tP2217\tP279\tQ986260\n", "Q100000030-P279-Q14748-30394205-0\tQ100000030\tP279\tQ14748\n", "Q100000058-P279-Q1622444-bd182663-0\tQ100000058\tP279\tQ1622444\n", "Q1000032-P279-Q1813494-0aa0f1dc-0\tQ1000032\tP279\tQ1813494\n", "Q1000032-P279-Q83602-482a1943-0 Q1000032\tP279\tQ83602\n", "Q1000039-P279-Q11555767-2dddfd86-0\tQ1000039\tP279\tQ11555767\n", "Q100004761-P279-Q100095237-3971e1cd-0\tQ100004761\tP279\tQ100095237\n", - "\n", - "gzip: Q100004761-P279-Q126793-77b1fce8-0\tQ100004761\tP279\tQ126793\n", - "stdout: Broken pipe\n", + "Q100004761-P279-Q126793-77b1fce8-0\tQ100004761\tP279\tQ126793\n", "Q100004761-P279-Q4544523-639fbe16-0\tQ100004761\tP279\tQ4544523\n", - "Q1000064-P279-Q11016-0ab23344-0 Q1000064\tP279\tQ11016\n" + "\n", + "gzip: stdout: Broken pipe\n" ] } ], @@ -383,7 +426,7 @@ }, { "cell_type": "code", - "execution_count": 61, + 
"execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -394,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -405,7 +448,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -417,7 +460,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -436,7 +479,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -444,15 +487,15 @@ "output_type": "stream", "text": [ "id\n", + "P2217\n", + "Q1\n", + "Q1\n", "Q100000030\n", "Q100000058\n", "Q1000017\n", "Q1000032\n", "Q1000032\n", "Q1000039\n", - "Q100004761\n", - "Q100004761\n", - "Q100004761\n", "\n", "gzip: stdout: Broken pipe\n" ] @@ -464,7 +507,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -485,7 +528,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -498,7 +541,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -506,17 +549,17 @@ "output_type": "stream", "text": [ "node1\tlabel\tnode2\n", - "Q100000030\treachable\tQ100000030\n", - "Q100000030\treachable\tQ14748\n", - "Q100000030\treachable\tQ14745\n", - "Q100000030\treachable\tQ1357761\n", - "Q100000030\treachable\tQ223557\n", - "Q100000030\treachable\tQ35459920\n", + "P2217\treachable\tP2217\n", + "P2217\treachable\tQ986260\n", + "P2217\treachable\tQ3711325\n", + "P2217\treachable\tQ107715\n", + "P2217\treachable\tQ309314\n", + "P2217\treachable\tQ246672\n", "\n", - "gzip: Q100000030\treachable\tQ488383\n", + "gzip: P2217\treachable\tQ7184903\n", "stdout: Broken pipe\n", - "Q100000030\treachable\tQ35120\n", - "Q100000030\treachable\tQ4406616\n" + 
"P2217\treachable\tQ488383\n", + "P2217\treachable\tQ35120\n" ] } ], @@ -533,7 +576,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -552,7 +595,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -561,25 +604,25 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\n", - "gzip: stdout: Broken pipe\n", "node1\tlabel\tnode2\tid\n", - "Q100000030\tP279star\tQ100000030\tQ100000030-P279star-Q100000030\n", - "Q100000030\tP279star\tQ14748\tQ100000030-P279star-Q14748\n", - "Q100000030\tP279star\tQ14745\tQ100000030-P279star-Q14745\n", - "Q100000030\tP279star\tQ1357761\tQ100000030-P279star-Q1357761\n", - "Q100000030\tP279star\tQ223557 Q100000030-P279star-Q223557\n", - "Q100000030\tP279star\tQ35459920\tQ100000030-P279star-Q35459920\n", - "Q100000030\tP279star\tQ488383 Q100000030-P279star-Q488383\n", - "Q100000030\tP279star\tQ35120\tQ100000030-P279star-Q35120\n", - "Q100000030\tP279star\tQ4406616\tQ100000030-P279star-Q4406616\n" + "P2217\tP279star\tP2217\tP2217-P279star-P2217\n", + "P2217\tP279star\tQ986260 P2217-P279star-Q986260\n", + "P2217\tP279star\tQ3711325\tP2217-P279star-Q3711325\n", + "P2217\tP279star\tQ107715 P2217-P279star-Q107715\n", + "P2217\tP279star\tQ309314 P2217-P279star-Q309314\n", + "P2217\tP279star\tQ246672 P2217-P279star-Q246672\n", + "P2217\tP279star\tQ7184903\tP2217-P279star-Q7184903\n", + "\n", + "gzip: P2217\tP279star\tQ488383 P2217-P279star-Q488383\n", + "stdout: Broken pipe\n", + "P2217\tP279star\tQ35120\tP2217-P279star-Q35120\n" ] } ], @@ -597,7 +640,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -619,7 +662,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 28, "metadata": {}, 
"outputs": [], "source": [ @@ -644,7 +687,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -656,7 +699,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -675,7 +718,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -699,7 +742,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -709,7 +752,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -731,7 +774,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -750,7 +793,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -772,7 +815,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -791,17 +834,9 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 36, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " 1704159 6816636 100144221\n" - ] - } - ], + "outputs": [], "source": [ "if debug:\n", " !zcat < \"$OUT\"/derived.P31P279star.tsv.gz | wc" @@ -823,7 +858,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -846,39 +881,39 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "graph loaded! It has 92915895 nodes and 654318387 edges\n", + "graph loaded! 
It has 94903511 nodes and 670635690 edges\n", "\n", "*** Top relations:\n", - "P2860\t284754818\n", - "P31\t97274091\n", - "P1433\t37574876\n", - "P50\t21623753\n", - "P921\t17012294\n", - "P17\t14282490\n", - "P407\t14064733\n", - "P131\t10893839\n", - "P106\t8872011\n", - "P6259\t8076591\n", + "P2860\t285098156\n", + "P31\t99559383\n", + "P1433\t37893478\n", + "P50\t22619544\n", + "P921\t21565587\n", + "P17\t14723889\n", + "P407\t14494498\n", + "P131\t11189895\n", + "P106\t9239520\n", + "P6259\t8076517\n", "\n", "*** Degrees:\n", - "in degree stats: mean=7.042050, std=0.460818, max=1\n", - "out degree stats: mean=7.042050, std=0.001477, max=1\n", - "total degree stats: mean=14.084100, std=0.460825, max=1\n", + "in degree stats: mean=7.066500, std=0.456495, max=1\n", + "out degree stats: mean=7.066500, std=0.001451, max=1\n", + "total degree stats: mean=14.133001, std=0.456502, max=1\n", "\n", "*** PageRank\n", "Max pageranks\n", - "3927\tQ4167836\t0.023031\n", - "42785\tQ13442814\t0.021860\n", - "1926\tQ1860\t0.007249\n", - "2472\tQ5\t0.006367\n", - "1263646\tQ35252665\t0.005490\n" + "7296\tQ4167836\t0.024407\n", + "30751\tQ13442814\t0.020599\n", + "2476\tQ1860\t0.007204\n", + "5853\tQ5\t0.006323\n", + "5852\tQ11266439\t0.005784\n" ] } ], @@ -889,7 +924,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -912,39 +947,39 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "graph loaded! It has 92915895 nodes and 654318387 edges\n", + "graph loaded! 
It has 94903511 nodes and 670635690 edges\n", "\n", "*** Top relations:\n", - "P2860\t284754818\n", - "P31\t97274091\n", - "P1433\t37574876\n", - "P50\t21623753\n", - "P921\t17012294\n", - "P17\t14282490\n", - "P407\t14064733\n", - "P131\t10893839\n", - "P106\t8872011\n", - "P6259\t8076591\n", + "P2860\t285098156\n", + "P31\t99559383\n", + "P1433\t37893478\n", + "P50\t22619544\n", + "P921\t21565587\n", + "P17\t14723889\n", + "P407\t14494498\n", + "P131\t11189895\n", + "P106\t9239520\n", + "P6259\t8076517\n", "\n", "*** Degrees:\n", "in degree stats: mean=0.000000, std=0.000000, max=1\n", - "out degree stats: mean=14.084100, std=0.460825, max=1\n", - "total degree stats: mean=14.084100, std=0.460825, max=1\n", + "out degree stats: mean=14.133001, std=0.456502, max=1\n", + "total degree stats: mean=14.133001, std=0.456502, max=1\n", "\n", "*** PageRank\n", "Max pageranks\n", - "42785\tQ13442814\t0.030297\n", - "126633\tQ1264450\t0.013443\n", - "3927\tQ4167836\t0.012639\n", - "2472\tQ5\t0.008565\n", - "1926\tQ1860\t0.006830\n" + "30751\tQ13442814\t0.029250\n", + "130053\tQ1264450\t0.013161\n", + "7296\tQ4167836\t0.012312\n", + "5853\tQ5\t0.008650\n", + "2476\tQ1860\t0.006818\n" ] } ], @@ -969,7 +1004,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -981,7 +1016,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -992,7 +1027,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -1000,17 +1035,17 @@ "output_type": "stream", "text": [ "node1\tnode2\tlabel\tid\n", - "P10\t19\tPout_degree\tP10-Pout_degree-9400f1\n", - "P1000\t10\tPout_degree\tP1000-Pout_degree-4a44dc\n", "\n", - "gzip: P10000\t23\tPout_degree\tP10000-Pout_degree-535fa3\n", + "gzip: P10\t20\tPout_degree\tP10-Pout_degree-f5ca38\n", "stdout: Broken pipe\n", - 
"P10001\t26\tPout_degree\tP10001-Pout_degree-5f9c4a\n", - "P10002\t20\tPout_degree\tP10002-Pout_degree-f5ca38\n", + "P1000\t10\tPout_degree\tP1000-Pout_degree-4a44dc\n", + "P10000\t25\tPout_degree\tP10000-Pout_degree-b7a568\n", + "P10001\t30\tPout_degree\tP10001-Pout_degree-624b60\n", + "P10002\t21\tPout_degree\tP10002-Pout_degree-6f4b66\n", "P10003\t20\tPout_degree\tP10003-Pout_degree-f5ca38\n", - "P10004\t21\tPout_degree\tP10004-Pout_degree-6f4b66\n", - "P10005\t19\tPout_degree\tP10005-Pout_degree-9400f1\n", - "P10006\t20\tPout_degree\tP10006-Pout_degree-f5ca38\n" + "P10004\t23\tPout_degree\tP10004-Pout_degree-535fa3\n", + "P10005\t21\tPout_degree\tP10005-Pout_degree-6f4b66\n", + "P10006\t25\tPout_degree\tP10006-Pout_degree-b7a568\n" ] } ], @@ -1027,7 +1062,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -1039,7 +1074,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -1050,7 +1085,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -1058,17 +1093,17 @@ "output_type": "stream", "text": [ "node1\tnode2\tlabel\tid\n", + "Q1\t104\tPin_degree\tQ1-Pin_degree-5ef6fd\n", + "Q100\t14133\tPin_degree\tQ100-Pin_degree-ef9f82\n", + "Q1000\t6812\tPin_degree\tQ1000-Pin_degree-7536db\n", "\n", - "gzip: Q1\t91\tPin_degree\tQ1-Pin_degree-1da51b\n", + "gzip: Q10000\t2\tPin_degree\tQ10000-Pin_degree-d4735e\n", "stdout: Broken pipe\n", - "Q100\t13492\tPin_degree\tQ100-Pin_degree-9ba93e\n", - "Q1000\t5423\tPin_degree\tQ1000-Pin_degree-f2069c\n", "Q100000 125\tPin_degree\tQ100000-Pin_degree-0f8ef3\n", "Q10000000\t1\tPin_degree\tQ10000000-Pin_degree-6b86b2\n", - "Q100000001\t3\tPin_degree\tQ100000001-Pin_degree-4e0740\n", + "Q100000001\t5\tPin_degree\tQ100000001-Pin_degree-ef2d12\n", "Q10000002\t1\tPin_degree\tQ10000002-Pin_degree-6b86b2\n", - 
"Q100000040\t4\tPin_degree\tQ100000040-Pin_degree-4b2277\n", - "Q10000005\t1\tPin_degree\tQ10000005-Pin_degree-6b86b2\n" + "Q100000040\t4\tPin_degree\tQ100000040-Pin_degree-4b2277\n" ] } ], @@ -1085,7 +1120,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -1097,7 +1132,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 48, "metadata": {}, "outputs": [ { @@ -1105,15 +1140,15 @@ "output_type": "stream", "text": [ "Pin_degree\tcount\tlabel\n", - "1\t7182787 count\n", - "2\t2146229 count\n", - "3\t903632\tcount\n", - "4\t454520\tcount\n", - "5\t317411\tcount\n", - "6\t214644\tcount\n", - "7\t169879\tcount\n", - "8\t115011\tcount\n", - "9\t91834\tcount\n" + "1\t12410535\tcount\n", + "2\t5079189 count\n", + "3\t2954842 count\n", + "4\t1981895 count\n", + "5\t1530432 count\n", + "6\t1212475 count\n", + "7\t1008174 count\n", + "8\t827467\tcount\n", + "9\t706367\tcount\n" ] } ], @@ -1123,7 +1158,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -1135,7 +1170,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -1143,15 +1178,15 @@ "output_type": "stream", "text": [ "Pout_degree\tcount\tlabel\n", - "1\t6167233 count\n", - "2\t2647513 count\n", - "3\t2853195 count\n", - "4\t3070920 count\n", - "5\t4297447 count\n", - "6\t5898946 count\n", - "7\t4787449 count\n", - "8\t3723237 count\n", - "9\t3343266 count\n" + "1\t6266209 count\n", + "2\t2622464 count\n", + "3\t2889122 count\n", + "4\t3106569 count\n", + "5\t4518981 count\n", + "6\t6059016 count\n", + "7\t5408942 count\n", + "8\t5105646 count\n", + "9\t6513341 count\n" ] } ], @@ -1211,36 +1246,43 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "-rw-r--r-- 1 amandeep 
isdstaff 19G Nov 16 02:53 /data/amandeep/wikidata-20210215-dwd-v3/useful-files/derived.isastar.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 295M Nov 11 18:13 /data/amandeep/wikidata-20210215-dwd-v3/useful-files/derived.isa.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 723M Nov 11 18:04 /data/amandeep/wikidata-20210215-dwd-v3/useful-files/derived.P279star.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 40M Nov 11 17:59 /data/amandeep/wikidata-20210215-dwd-v3/useful-files/derived.P279.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 18G Nov 17 11:52 /data/amandeep/wikidata-20210215-dwd-v3/useful-files/derived.P31P279star.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.2G Nov 11 18:12 /data/amandeep/wikidata-20210215-dwd-v3/useful-files/derived.P31.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 231M Nov 18 01:18 /data/amandeep/wikidata-20210215-dwd-v3/useful-files/metadata.in_degree.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 598M Nov 17 19:49 /data/amandeep/wikidata-20210215-dwd-v3/useful-files/metadata.out_degree.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2.6G Nov 17 14:25 /data/amandeep/wikidata-20210215-dwd-v3/useful-files/metadata.pagerank.directed.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.4G Nov 17 17:25 /data/amandeep/wikidata-20210215-dwd-v3/useful-files/metadata.pagerank.undirected.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 122K Nov 18 08:08 /data/amandeep/wikidata-20210215-dwd-v3/useful-files/statistics.in_degree.distribution.tsv\n", - "-rw-r--r-- 1 amandeep isdstaff 24K Nov 18 08:13 /data/amandeep/wikidata-20210215-dwd-v3/useful-files/statistics.out_degree.distribution.tsv\n", + "-rw-r--r-- 1 amandeep isdstaff 21G May 6 23:11 /data/amandeep/wikidata-20220409/useful-files/useful-files/derived.isastar.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 303M May 5 22:49 /data/amandeep/wikidata-20220409/useful-files/useful-files/derived.isa.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 710M May 5 21:57 
/data/amandeep/wikidata-20220409/useful-files/useful-files/derived.P279star.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 42M May 5 20:10 /data/amandeep/wikidata-20220409/useful-files/useful-files/derived.P279.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 22G May 8 00:24 /data/amandeep/wikidata-20220409/useful-files/useful-files/derived.P31P279star.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.2G May 5 20:09 /data/amandeep/wikidata-20220409/useful-files/useful-files/derived.P31.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 235M May 8 08:39 /data/amandeep/wikidata-20220409/useful-files/useful-files/metadata.in_degree.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 610M May 8 06:52 /data/amandeep/wikidata-20220409/useful-files/useful-files/metadata.out_degree.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.1G May 8 02:12 /data/amandeep/wikidata-20220409/useful-files/useful-files/metadata.pagerank.directed.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.5G May 8 05:02 /data/amandeep/wikidata-20220409/useful-files/useful-files/metadata.pagerank.undirected.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 126K May 8 08:41 /data/amandeep/wikidata-20220409/useful-files/useful-files/statistics.in_degree.distribution.tsv\n", + "-rw-r--r-- 1 amandeep isdstaff 24K May 8 08:46 /data/amandeep/wikidata-20220409/useful-files/useful-files/statistics.out_degree.distribution.tsv\n", "\n", - "/data/amandeep/wikidata-20210215-dwd-v3/useful-files/temp.useful-files:\n", - "total 502G\n", - "-rw-r--r-- 1 amandeep isdstaff 8.7G Nov 12 16:15 derived.isastar_1.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 18G Nov 16 00:57 derived.isastar_2.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 8.7G Nov 16 23:39 derived.P31P279star.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 105M Nov 18 01:11 metadata.in_degree.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 286M Nov 17 19:30 metadata.out_degree.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 530 Nov 17 15:19 metadata.pagerank.directed.summary.txt\n", - "-rw-r--r-- 1 
amandeep isdstaff 529 Nov 17 18:14 metadata.pagerank.undirected.summary.txt\n", - "-rw-r--r-- 1 amandeep isdstaff 467G Nov 18 08:11 wikidata.sqlite3.db\n" + "/data/amandeep/wikidata-20220409/useful-files/useful-files/temp.useful-files:\n", + "total 245G\n", + "-rw-r--r-- 1 amandeep isdstaff 11G May 6 08:56 derived.isastar_1.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 11G May 7 10:28 derived.P31P279star.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.2G May 5 22:38 isa.1.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 107M May 8 08:32 metadata.in_degree.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 292M May 8 06:35 metadata.out_degree.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 527 May 8 03:10 metadata.pagerank.directed.summary.txt\n", + "-rw-r--r-- 1 amandeep isdstaff 529 May 8 05:50 metadata.pagerank.undirected.summary.txt\n", + "-rw-r--r-- 1 amandeep isdstaff 5.9M May 5 20:10 P279.n1.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 352M May 5 21:11 P279.reachable.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 68M May 5 20:16 P279.roots.1.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 7.7M May 5 20:17 P279.roots.2.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 26M May 5 20:22 P279.roots.tsv\n", + "-rw-r--r-- 1 amandeep isdstaff 351M May 5 21:31 P279star.1.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 58M May 5 20:16 P31.n2.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 222G May 8 08:43 wikidata.sqlite3.db\n" ] } ], @@ -1281,9 +1323,9 @@ "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "kgtk-env", + "display_name": "kgtk-env-ckg07", "language": "python", - "name": "kgtk-env" + "name": "kgtk-env-ckg07" }, "language_info": { "codemirror_mode": { @@ -1295,7 +1337,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.12" } }, "nbformat": 4, diff --git a/use-cases/import-wikidata.ipynb b/use-cases/import-wikidata.ipynb index 019bf5aff..348a5c3f4 100644 --- a/use-cases/import-wikidata.ipynb +++ 
b/use-cases/import-wikidata.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": "c5f9d560-8293-4dec-9667-f7e08c6ccf52", "metadata": {}, "outputs": [], @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "1a6cc50d-2a13-4eca-95be-486767de63ec", "metadata": {}, "outputs": [], @@ -42,16 +42,18 @@ "\n", "# Folder on local machine where to create the output and temporary folders\n", "input_path = \"/Volumes/saggu-ssd/wikidata-2021-10-27\"\n", + "input_path = \"/data/amandeep/wikidata-20220505\"\n", "output_path = \"/Volumes/saggu-ssd/wikidata-2021-10-27-out\"\n", + "output_path = \"/data/amandeep/wikidata-20220505\"\n", "project_name = \"import-wikidata\"\n", "\n", - "kgtk_path = \"/Users/amandeep/Github/kgtk\"\n", + "kgtk_path = \"/data/amandeep/Github/kgtk\"\n", "wikidata_json_file = \"latest-all.json.bz2\"" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "id": "d56ac16c-ba43-4810-8760-2a0755bfbd5f", "metadata": {}, "outputs": [ @@ -59,10 +61,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "User home: /Users/amandeep\n", - "Current dir: /Users/amandeep/Github/kgtk/use-cases\n", - "KGTK dir: /Users/amandeep/Github/kgtk\n", - "Use-cases dir: /Users/amandeep/Github/kgtk/use-cases\n" + "User home: /nas/home/amandeep\n", + "Current dir: /data/amandeep/Github/kgtk/use-cases\n", + "KGTK dir: /data/amandeep/Github/kgtk\n", + "Use-cases dir: /data/amandeep/Github/kgtk/use-cases\n" ] } ], @@ -77,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "046a0b40-c0c1-4e9f-9b36-afcfac05edfe", "metadata": {}, "outputs": [ @@ -85,17 +87,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "kgtk: kgtk\n", - "GRAPH: /Volumes/saggu-ssd/wikidata-2021-10-27\n", - "KGTK_GRAPH_CACHE: /Volumes/saggu-ssd/wikidata-2021-10-27-out/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db\n", + "OUT: 
/data/amandeep/wikidata-20220505/import-wikidata\n", + "KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db\n", + "kypher: kgtk query --graph-cache /data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db\n", + "EXAMPLES_DIR: /data/amandeep/Github/kgtk/examples\n", "KGTK_OPTION_DEBUG: false\n", - "OUT: /Volumes/saggu-ssd/wikidata-2021-10-27-out/import-wikidata\n", - "kypher: kgtk query --graph-cache /Volumes/saggu-ssd/wikidata-2021-10-27-out/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db\n", - "KGTK_LABEL_FILE: /Volumes/saggu-ssd/wikidata-2021-10-27/labels.en.tsv.gz\n", - "EXAMPLES_DIR: /Users/amandeep/Github/kgtk/examples\n", - "USE_CASES_DIR: /Users/amandeep/Github/kgtk/use-cases\n", - "STORE: /Volumes/saggu-ssd/wikidata-2021-10-27-out/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db\n", - "TEMP: /Volumes/saggu-ssd/wikidata-2021-10-27-out/import-wikidata/temp.import-wikidata\n" + "USE_CASES_DIR: /data/amandeep/Github/kgtk/use-cases\n", + "STORE: /data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db\n", + "kgtk: kgtk\n", + "GRAPH: /data/amandeep/wikidata-20220505\n", + "TEMP: /data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata\n", + "KGTK_LABEL_FILE: /data/amandeep/wikidata-20220505/labels.en.tsv.gz\n" ] } ], @@ -113,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "48fb958a-ff91-4360-b73f-3a136797f056", "metadata": {}, "outputs": [], @@ -204,12 +206,13 @@ "\t\"metadata.types\"]\n", "\n", "\n", - "os.environ['SORT_COMMAND'] = \"gsort\"" + "os.environ['SORT_COMMAND'] = \"gsort\"\n", + "os.environ['SORT_COMMAND'] = \"sort\"" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "3218233f-37b7-4563-b6bf-aacaa73ecd08", "metadata": {}, "outputs": [], @@ -241,10 +244,1284 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 9, "id": "df8647da-650a-44a6-9d19-290c64765e31", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kgtk import-wikidata version: 2021-11-17T01:38:17.437678+00:00#9z/aARcXhiV2hPdyVXjAREcpZwh2MawWFp6numz8GZBCtAg2WypLYAFpHjP43k97Zj8VHVaoel0oEit9KHXH0w==\n", + "Starting main process (pid 118098).\n", + "Processing.\n", + "Processing wikidata file /data/amandeep/wikidata-20220505/latest-all.json.bz2\n", + "Decompressing (bz2)\n", + "Creating the collector queue.\n", + "The collector node queue has been created (maxsize=36).\n", + "Creating the node_collector.\n", + "Creating the node collector process.\n", + "Starting the node collector process.\n", + "Started the node collector process.\n", + "The node collector is starting (pid 118140).\n", + "The collector edge queue has been created (maxsize=36).\n", + "Creating the edge_collector.\n", + "Creating the edge collector process.\n", + "Starting the edge collector process.\n", + "Started the edge collector process.\n", + "The edge collector is starting (pid 118141).\n", + "The collector qual queue has been created (maxsize=36).\n", + "Creating the qual_collector.\n", + "Creating the qual collector process.\n", + "Starting the qual collector process.\n", + "Started the qual collector process.\n", + "The qual collector is starting (pid 118142).\n", + "The collector invalid edge queue has been created (maxsize=36).\n", + "Creating the invalid_edge_collector.\n", + "Creating the invalid edge collector process.\n", + "Starting the invalid edge collector process.\n", + "Started the invalid edge collector process.\n", + "The invalid edge collector is starting (pid 118143).\n", + "The collector invalid qual queue has been created (maxsize=36).\n", + "Creating the invalid_qual_collector.\n", + "Creating the invalid qual collector process.\n", + "Starting the invalid qual collector process.\n", + "Started the invalid qual collector process.\n", + 
"The invalid qual collector is starting (pid 118144).\n", + "The collector description queue has been created (maxsize=36).\n", + "Creating the description collector.\n", + "Creating the description collector process.\n", + "Starting the description collector process.\n", + "Started the description collector process.\n", + "The description collector is starting (pid 118145).\n", + "The collector sitelink queue has been created (maxsize=36).\n", + "Creating the sitelink collector.\n", + "Creating the sitelink collector process.\n", + "Starting the sitelink collector process.\n", + "Started the sitelink collector process.\n", + "Sending the node header to the collector.\n", + "The sitelink collector is starting (pid 118146).\n", + "Sent the node header to the collector.\n", + "Sending the minimal edge file header to the collector.\n", + "Sent the minimal edge file header to the collector.\n", + "Sending the alias file header to the collector.\n", + "Sent the alias file header to the collector.\n", + "Sending the English alias file header to the collector.\n", + "Sent the English alias file header to the collector.\n", + "Sending the datatype file header to the collector.\n", + "Opening the node file in the node collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.node.unsorted.tsv.gz\n", + "Sent the datatype file header to the collector.\n", + "Sending the description file header to the collector.\n", + "Sent the description file header to the collector.\n", + "Sending the English description file header to the collector.\n", + "Sent the English description file header to the collector.\n", + "Sending the label file header to the collector.\n", + "Opening the minimal edge file in the edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz\n", + "Sent the label file header to the collector.\n", + "Sending the English label file header to the collector.\n", + "Sent the English 
label file header to the collector.\n", + "Sending the sitelink file header to the collector.\n", + "Sent the sitelink file header to the collector.\n", + "Sending the English sitelink file header to the collector.\n", + "Sent the English sitelink file header to the collector.\n", + "Sending the entry type file header to the collector.\n", + "Sent the entry type file header to the collector.\n", + "Sending the minimal invalid edge header to the collector.\n", + "Opening the description file in the description collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz\n", + "Sent the minimal invalid edge header to the collector.\n", + "Sending the minimal qual file header to the collector.\n", + "Sent the minimal qual file header to the collector.\n", + "Sending the minimal invalid qual header to the collector.\n", + "Sent the minimal invalid qual header to the collector.\n", + "Opening the wikipedia_sitelink file in the sitelink collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz\n", + "Creating parallel processor for /data/amandeep/wikidata-20220505/latest-all.json.bz2\n", + "Opening the invalid edge file in the invalid edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz\n", + "Opening the minimal qual file in the qual collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz\n", + "Opening the qual file in the invalid qual collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalue.unsorted.tsv.gz\n", + "Opening the alias file in the edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz\n", + "Opening the English description file in the description collector with KgtkWriter: 
/data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.en.unsorted.tsv.gz\n", + "Opening the English wikipedia_sitelink file in the sitelink collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz\n", + "Opening the English alias file in the edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.en.unsorted.tsv.gz\n", + "Opening the datatype file in the edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz\n", + "Opening the label file in the edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz\n", + "Start parallel processing\n", + "Opening the English label file in the edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.en.unsorted.tsv.gz\n", + "Opening the type file in the edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz\n", + "Starting worker process 0 (pid 118147).\n", + "Starting worker process 1 (pid 118148).\n", + "Starting worker process 2 (pid 118149).\n", + "Starting worker process 3 (pid 118150).\n", + "Starting worker process 4 (pid 118151).\n", + "Starting worker process 5 (pid 118152).\n", + "Starting worker process 6 (pid 118153).\n", + "Starting worker process 7 (pid 118154).\n", + "Starting worker process 8 (pid 118155).\n", + "Starting worker process 9 (pid 118156).\n", + "Starting worker process 10 (pid 118157).\n", + "Starting worker process 11 (pid 118158).\n", + "\n", + "*** Sitelink collision #1 detected for Q7580-wikipedia_sitelink-dcda22 (http://lv.wikipedia.org/wiki/1743._gads)\n", + "\n", + "*** Qualifier collision #1 detected for Q37062-P26-Q2028843-b2e6740f-0-P580-6f4356 (^1411-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q61814-P26-Q66516-1fa99291-0-P580-d435a1 
(^1502-04-10T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q62481-P26-Q2086776-87b8910e-0-P580-360391 (^1561-10-12T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q70789-P26-Q935411-28987fd8-0-P580-941716 (^1463-05-10T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q89405-P26-Q101877-d20a377b-0-P580-2b9eed (^1560-07-01T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q150611-P26-Q233335-575116d2-0-P580-29c809 (^1521-05-25T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q165284-P26-Q353-84a8ff47-0-P580-776c43 (^1200-05-23T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q169992-P26-Q235487-0e315055-0-P580-7a47d9 (^1332-07-28T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q183698-P26-Q256222-4322595e-0-P580-1fecee (^1684-01-09T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q235447-P26-Q161958-0d89305f-0-P580-52c362 (^1406-10-26T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q256222-P26-Q183698-415fc5b0-0-P580-1fecee (^1684-01-09T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q380373-P26-Q1141121-48bebee4-0-P580-2e184a (^1294-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q380868-P26-Q384941-46f6240f-0-P580-4b742f (^1533-08-25T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q453771-P26-Q443876-84acba5b-0-P580-84a26a (^1446-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q477343-P26-Q3374718-c7014aa0-0-P580-a95d2d (^1573-10-27T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q702835-wikipedia_sitelink-6ce2fd (http://uk.wikipedia.org/wiki/Бессі_Купер)\n", + "\n", + "*** Qualifier collision #1 detected for Q1834423-P26-Q322841-6c85598c-0-P580-876067 (^1559-06-16T00:00:00Z/11)\n", + "\n", + "*** Alias collision #1 detected for Q2336516-alias-es-f24d14 ('Elecciones al 
Parlamento Europeo de 1989 en Dinamarca'@es)\n", + "\n", + "*** Qualifier collision #1 detected for Q3007367-P26-Q430782-f64d3af2-0-P580-5b468d (^1555-02-07T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q4299475-wikipedia_sitelink-39fa76 (http://.wikipedia.org/wiki/Template:Bot)\n", + "\n", + "*** Sitelink collision #1 detected for Q4847311-wikipedia_sitelink-c4b491 (http://.wikipedia.org/wiki/Template:Delete)\n", + "\n", + "*** Sitelink collision #1 detected for Q5406510-wikipedia_sitelink-c7418e (http://.wikipedia.org/wiki/Template:=)\n", + "\n", + "*** Sitelink collision #1 detected for Q5412328-wikipedia_sitelink-6bc2e1 (http://.wikipedia.org/wiki/Template:Trim)\n", + "\n", + "*** Sitelink collision #1 detected for Q5607945-wikipedia_sitelink-6795ff (http://mr.wikipedia.org/wiki/वर्ग:मार्गक्रमण_साचे)\n", + "\n", + "*** Sitelink collision #1 detected for Q5621274-wikipedia_sitelink-126246 (http://.wikipedia.org/wiki/Template:Column-count)\n", + "\n", + "*** Sitelink collision #1 detected for Q5882248-wikipedia_sitelink-1cc4bd (http://.wikipedia.org/wiki/Template:Documentation_subpage)\n", + "\n", + "*** Qualifier collision #1 detected for Q7529231-P26-Q6792225-896048a6-0-P580-5a896b (^1508-00-00T00:00:00Z/9)\n", + "\n", + "*** Sitelink collision #1 detected for Q9737782-wikipedia_sitelink-0b4a66 (http://.wikipedia.org/wiki/Category:User_templates)\n", + "\n", + "*** Sitelink collision #1 detected for Q10560270-wikipedia_sitelink-2fe24f (http://.wikipedia.org/wiki/Template:Under_construction)\n", + "\n", + "*** Qualifier collision #1 detected for Q13058108-P159-Q1354-267a1462-0-P625-cb2660 (@23.728063/90.419591)\n", + "\n", + "*** Sitelink collision #1 detected for Q13156670-wikipedia_sitelink-c35c37 (http://.wikipedia.org/wiki/Template:Interwiki_redirect)\n", + "\n", + "*** Sitelink collision #1 detected for Q14511701-wikipedia_sitelink-5b1836 (http://.wikipedia.org/wiki/Template:TemplateData_header)\n", + "\n", + "*** Sitelink collision #1 
detected for Q14635514-wikipedia_sitelink-156619 (http://.wikipedia.org/wiki/Template:Reply_to)\n", + "\n", + "*** Sitelink collision #1 detected for Q7253814-wikipedia_sitelink-9e2840 (http://.wikipedia.org/wiki/Module:String)\n", + "\n", + "*** Sitelink collision #1 detected for Q7348344-wikipedia_sitelink-d451bf (http://.wikipedia.org/wiki/Module:Coordinates)\n", + "\n", + "*** Sitelink collision #1 detected for Q15818920-wikipedia_sitelink-fa275b (http://.wikipedia.org/wiki/Template:Autoarchive_resolved_section)\n", + "The node collector called 500000 times: 2500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 500000 times: 0 nrows, 60662629 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 500000 times: 0 nrows, 81273275 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The sitelink collector called 500000 times: 0 nrows, 33927067 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q29053400-wikipedia_sitelink-b11f2e (http://.wikipedia.org/wiki/Category:Pages_with_template_loops)\n", + "The qual collector called 500000 times: 0 nrows, 0 erows, 7571086 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 1000000 times: 5000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 1000000 times: 0 nrows, 133466230 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 1000000 times: 0 nrows, 200314003 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "500000 lines processed by processor 5\n", + "500000 lines processed by processor 1\n", + "500000 lines processed by processor 9\n", + "500000 lines processed by processor 4\n", + "500000 lines processed by processor 10\n", + "500000 lines processed by processor 6\n", + "500000 lines processed by processor 7\n", + "500000 lines processed by processor 2\n", + "500000 lines processed 
by processor 11\n", + "500000 lines processed by processor 8\n", + "500000 lines processed by processor 3\n", + "\n", + "*** Qualifier collision #1 detected for Q55579391-P26-Q121846-1952d1ff-0-P580-cae35d (^1284-00-00T00:00:00Z/9)\n", + "500000 lines processed by processor 0\n", + "\n", + "*** Sitelink collision #1 detected for Q58832772-wikipedia_sitelink-ea0ae7 (http://.wikipedia.org/wiki/Module:LangSwitch)\n", + "The qual collector called 1000000 times: 0 nrows, 0 erows, 18531429 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 1500000 times: 7500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 1500000 times: 0 nrows, 183700332 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 1500000 times: 0 nrows, 268671270 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The qual collector called 1500000 times: 0 nrows, 0 erows, 28002684 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 2000000 times: 10000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 2000000 times: 0 nrows, 243125009 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 2000000 times: 0 nrows, 300937648 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q90722487-wikipedia_sitelink-24b73c (http://.wikipedia.org/wiki/Category:Pages_using_deprecated_source_tags)\n", + "\n", + "*** Sitelink collision #1 detected for Q99735928-wikipedia_sitelink-a62902 (http://.wikipedia.org/wiki/Template:BCP47)\n", + "The sitelink collector called 1000000 times: 0 nrows, 40653120 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "1000000 lines processed by processor 5\n", + "\n", + "*** Sitelink collision #1 detected for Q109249671-wikipedia_sitelink-f0183d (http://.wikipedia.org/wiki/Template:None)\n", + "1000000 lines processed by processor 8\n", + 
"1000000 lines processed by processor 10\n", + "1000000 lines processed by processor 7\n", + "1000000 lines processed by processor 0\n", + "1000000 lines processed by processor 6\n", + "1000000 lines processed by processor 11\n", + "1000000 lines processed by processor 9\n", + "1000000 lines processed by processor 4\n", + "1000000 lines processed by processor 1\n", + "1000000 lines processed by processor 2\n", + "1000000 lines processed by processor 3\n", + "The qual collector called 2000000 times: 0 nrows, 0 erows, 38508407 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q8058-P26-Q254085-4eab60ab-0-P580-8df26d (^1436-06-24T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q8384-P26-Q70590-6edd7354-0-P580-e23c66 (^1305-09-23T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q13167-P348-99b09e-08cc7a6d-0-P577-07f6e3 (^2016-07-12T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q57161-P26-Q441394-f0d02358-0-P580-77780b (^1308-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q57161-P26-Q467019-4ee33344-0-P580-9268e9 (^1324-02-25T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q74019-P26-Q540767-c098df36-0-P580-5774e5 (^1422-07-25T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q121130-P26-Q119431-af2d7776-0-P580-10c067 (^1197-05-25T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q151587-P26-Q7996-4448a491-0-P580-62d46c (^1572-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q153319-P26-Q57852-a80af489-0-P580-56d3ba (^1725-06-01T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q154998-P26-Q234549-77a9d927-0-P580-45ce34 (^1525-10-29T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q184868-P26-Q390071-b34d3d54-0-P580-90dfde (^1680-07-18T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for 
Q203647-P26-Q2284422-aed54bb0-0-P580-246002 (^1045-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q232137-P26-Q41847-0dcc4fd6-0-P580-fe3abc (^0956-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q263474-P26-Q3044-90d4ea9f-0-P580-9b0b8a (^0770-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q333359-P26-Q3052486-9cbd9d9e-0-P580-355ae9 (^0960-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q454810-P26-Q702209-74f88753-0-P580-e60df9 (^1476-08-25T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q598906-P26-Q1635933-0d160adf-0-P580-ec2def (^1236-11-01T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q672446-P26-Q2912335-f19e5091-0-P580-93d3bd (^1447-12-14T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q674931-P26-Q19601994-f7d507fb-0-P580-9b41a5 (^1222-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q694351-P26-Q329555-c88da6e5-0-P580-15a1f0 (^1381-09-02T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q702602-P26-Q79176-0f28ed9a-0-P580-676c21 (^1431-06-03T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q719501-P26-Q69462-4f695a08-0-P580-79dbc8 (^1512-07-06T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q921009-wikipedia_sitelink-53a877 (http://ceb.wikipedia.org/wiki/Balod_(lungsod_sa_Indiya))\n", + "\n", + "*** Sitelink collision #1 detected for Q1007634-wikipedia_sitelink-ed9b51 (http://ko.wikipedia.org/wiki/에스텔리)\n", + "\n", + "*** Sitelink collision #1 detected for Q1071820-wikipedia_sitelink-aab9e2 (http://br.wikipedia.org/wiki/Lagostomus)\n", + "\n", + "*** Qualifier collision #1 detected for Q1106184-P26-Q4331742-bc45332b-0-P580-e7880a (^1555-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q1381324-P26-Q535528-9a0e7ede-0-P580-c9b00e (^1221-06-19T00:00:00Z/11)\n", + "\n", + "*** Qualifier 
collision #1 detected for Q3139317-P159-Q9799-e865820c-0-P625-be8120 (@50.8802/5.9595)\n", + "The node collector called 2500000 times: 12500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 2500000 times: 0 nrows, 314121442 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q4115450-P159-Q191204-8fdad044-0-P625-a98823 (@35.569778/45.352163)\n", + "The description collector called 2500000 times: 0 nrows, 345747666 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q5411705-wikipedia_sitelink-7111a4 (http://.wikipedia.org/wiki/Template:Clear)\n", + "\n", + "*** Sitelink collision #1 detected for Q5459259-wikipedia_sitelink-044e48 (http://.wikipedia.org/wiki/Template:Center)\n", + "\n", + "*** Sitelink collision #1 detected for Q5622198-wikipedia_sitelink-a3c8ed (http://.wikipedia.org/wiki/Template:Done)\n", + "\n", + "*** Sitelink collision #1 detected for Q5646673-wikipedia_sitelink-162b1a (http://.wikipedia.org/wiki/Template:Pp-template)\n", + "\n", + "*** Sitelink collision #1 detected for Q6063221-wikipedia_sitelink-4fe51c (http://.wikipedia.org/wiki/Template:Mbox)\n", + "\n", + "*** Sitelink collision #1 detected for Q6133158-wikipedia_sitelink-62de50 (http://.wikipedia.org/wiki/Template:@)\n", + "\n", + "*** Qualifier collision #1 detected for Q6867218-P159-Q9268849-32a831ad-0-P625-51c420 (@52.223817/21.005108)\n", + "\n", + "*** Qualifier collision #1 detected for Q9061646-P39-Q84701409-5a714518-0-P580-3e1e37 (^1116-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q9061646-P39-Q84701409-5a714518-0-P582-ac0fb1 (^1154-00-00T00:00:00Z/9)\n", + "\n", + "*** Sitelink collision #1 detected for Q16051502-wikipedia_sitelink-1ba00f (http://arz.wikipedia.org/wiki/صوت_الصمت_2013)\n", + "\n", + "*** Sitelink collision #1 detected for Q16748603-wikipedia_sitelink-1c7ff7 
(http://.wikipedia.org/wiki/Module:No_globals)\n", + "\n", + "*** Sitelink collision #1 detected for Q17347205-wikipedia_sitelink-9e0e56 (http://.wikipedia.org/wiki/Module:Category_handler/config)\n", + "The sitelink collector called 1500000 times: 0 nrows, 73918436 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q26878179-P26-Q55169081-de1c53f2-0-P580-01b412 (^1571-09-08T00:00:00Z/11)\n", + "The node collector called 3000000 times: 15000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 3000000 times: 0 nrows, 363934028 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q28941166-wikipedia_sitelink-d4f303 (http://fr.wikipedia.org/wiki/Tempête_de_neige_de_la_mi-mars_2017_dans_l'est_de_l'Amérique_du_Nord)\n", + "The description collector called 3000000 times: 0 nrows, 435008293 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The qual collector called 2500000 times: 0 nrows, 0 erows, 45967305 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 3500000 times: 17500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 3500000 times: 0 nrows, 439374610 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 3500000 times: 0 nrows, 558743824 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "1500000 lines processed by processor 5\n", + "1500000 lines processed by processor 7\n", + "1500000 lines processed by processor 4\n", + "1500000 lines processed by processor 9\n", + "1500000 lines processed by processor 11\n", + "1500000 lines processed by processor 8\n", + "1500000 lines processed by processor 0\n", + "1500000 lines processed by processor 1\n", + "1500000 lines processed by processor 10\n", + "1500000 lines processed by processor 6\n", + "1500000 lines processed by processor 2\n", + "\n", + "*** Qualifier 
collision #1 detected for Q54902946-P26-Q31191593-fb18c102-0-P580-d109bb (^1560-12-15T00:00:00Z/11)\n", + "1500000 lines processed by processor 3\n", + "\n", + "*** Sitelink collision #1 detected for Q56528384-wikipedia_sitelink-563962 (http://.wikipedia.org/wiki/Module:I18n/date)\n", + "\n", + "*** Qualifier collision #1 detected for Q56582849-P26-Q72922-06e7a6cd-0-P580-c16f56 (^1499-01-21T00:00:00Z/11)\n", + "The qual collector called 3000000 times: 0 nrows, 0 erows, 56981081 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 4000000 times: 20000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 4000000 times: 0 nrows, 486362243 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 4000000 times: 0 nrows, 604767527 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q75458516-P26-Q7324457-79a267cb-0-P580-221dc5 (^1568-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q75552262-P26-Q75552257-6fa3779f-0-P580-04284b (^1556-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q75552257-P26-Q75552262-2af17717-0-P580-04284b (^1556-00-00T00:00:00Z/9)\n", + "The qual collector called 3500000 times: 0 nrows, 0 erows, 66478883 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 4500000 times: 22500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 4500000 times: 0 nrows, 548812363 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 4500000 times: 0 nrows, 643030994 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The sitelink collector called 2000000 times: 0 nrows, 81240831 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "2000000 lines processed by processor 5\n", + "2000000 lines processed by processor 7\n", + "2000000 lines processed by processor 4\n", + 
"2000000 lines processed by processor 11\n", + "2000000 lines processed by processor 8\n", + "2000000 lines processed by processor 2\n", + "2000000 lines processed by processor 9\n", + "2000000 lines processed by processor 10\n", + "2000000 lines processed by processor 0\n", + "2000000 lines processed by processor 1\n", + "2000000 lines processed by processor 6\n", + "2000000 lines processed by processor 3\n", + "The qual collector called 4000000 times: 0 nrows, 0 erows, 77003327 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q2323-wikipedia_sitelink-8e4ee7 (http://yo.wikipedia.org/wiki/8_February)\n", + "\n", + "*** Sitelink collision #1 detected for Q8877-wikipedia_sitelink-dcbe09 (http://scn.wikipedia.org/wiki/Steven_Spielberg)\n", + "\n", + "*** Sitelink collision #1 detected for Q9696-wikipedia_sitelink-c8fdf8 (http://haw.wikipedia.org/wiki/John_Fitzgerald_Kennedy)\n", + "\n", + "*** Qualifier collision #1 detected for Q40433-P26-Q463669-cd43ed58-0-P580-480b99 (^1550-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q57654-P26-Q154041-8d52292f-0-P580-3b3df4 (^1572-07-20T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q66270-P26-Q325505-28bc872e-0-P580-08d4a0 (^1478-05-29T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q66516-P26-Q61814-43ebfd75-0-P580-d435a1 (^1502-04-10T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q101877-P26-Q89405-6e0cba4d-0-P580-2b9eed (^1560-07-01T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q155167-P26-Q269586-cc56bab6-0-P580-c54274 (^1334-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q168669-P26-Q193658-6bff08d2-0-P580-e8a3ec (^0939-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q241797-P26-Q7731-b7834ae7-0-P580-a01064 (^1671-02-01T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for 
Q262059-P26-Q187312-c501aba2-0-P580-7e48ad (^1302-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q267483-P26-Q57920-80635ac2-0-P580-c0fc4c (^1570-01-08T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q271799-P26-Q169319-a97c2304-0-P580-0d082c (^1523-12-11T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q326738-P26-Q684224-2df6ee20-0-P580-ff2137 (^1524-01-17T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q327572-P26-Q68952-ae5f6316-0-P580-5906e2 (^1563-05-10T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q374210-P26-Q4768218-c9e0eacd-0-P580-ef8382 (^1571-12-19T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q384941-P26-Q380868-4ca9581a-0-P580-4b742f (^1533-08-25T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q536174-P26-Q551752-c9a99a5e-0-P580-16c9b2 (^1229-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q574718-P26-Q21153658-ffa49040-0-P580-03dd18 (^1319-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q623188-P26-Q553289-7323bb58-0-P580-d8d288 (^1090-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q947423-P26-Q5358431-95b068e2-0-P580-6b2ce5 (^1152-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q2039358-P26-Q13474657-3f305fc3-0-P580-593f4e (^1558-00-00T00:00:00Z/9)\n", + "\n", + "*** Sitelink collision #1 detected for Q6428609-wikipedia_sitelink-660dd5 (http://nah.wikipedia.org/wiki/Neneuhcāyōtl:Tlatequitiltilīlli_pt-1)\n", + "\n", + "*** Sitelink collision #1 detected for Q6705618-wikipedia_sitelink-4b5e22 (http://.wikipedia.org/wiki/Template:Autotranslate)\n", + "The node collector called 5000000 times: 25000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 5000000 times: 0 nrows, 619728420 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** 
Sitelink collision #1 detected for Q7221363-wikipedia_sitelink-78630c (http://.wikipedia.org/wiki/Category:Lua-based_templates)\n", + "\n", + "*** Qualifier collision #1 detected for Q7324457-P26-Q75567328-84b7c804-0-P580-4e6c67 (^1553-11-24T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q7324457-P26-Q75458516-fee1a551-0-P580-221dc5 (^1568-00-00T00:00:00Z/9)\n", + "\n", + "*** Sitelink collision #1 detected for Q7453435-wikipedia_sitelink-c016e9 (http://ku.wikipedia.org/wiki/Kategorî:Ewrasya)\n", + "The description collector called 5000000 times: 0 nrows, 710063872 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q10350561-wikipedia_sitelink-5f2907 (http://.wikipedia.org/wiki/Template:Lua)\n", + "\n", + "*** Sitelink collision #1 detected for Q15116966-wikipedia_sitelink-74d712 (http://.wikipedia.org/wiki/Module:Message_box)\n", + "\n", + "*** Sitelink collision #1 detected for Q15212145-wikipedia_sitelink-907e39 (http://.wikipedia.org/wiki/Template:LangSwitch)\n", + "\n", + "*** Sitelink collision #1 detected for Q17121869-wikipedia_sitelink-582caf (http://.wikipedia.org/wiki/Module:Lua_banner)\n", + "The sitelink collector called 2500000 times: 0 nrows, 113956702 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q21153658-P26-Q574718-39f28f24-0-P580-03dd18 (^1319-00-00T00:00:00Z/9)\n", + "\n", + "*** Sitelink collision #1 detected for Q26905108-wikipedia_sitelink-51f8f2 (http://.wikipedia.org/wiki/Module:I18n/complex_date)\n", + "The node collector called 5500000 times: 27500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 5500000 times: 0 nrows, 669593652 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 5500000 times: 0 nrows, 797514432 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The qual collector called 4500000 times: 0 nrows, 0 erows, 
84435753 qrows, 0 invalid erows, 0 invalid qrows\n", + "2500000 lines processed by processor 5\n", + "2500000 lines processed by processor 7\n", + "2500000 lines processed by processor 4\n", + "2500000 lines processed by processor 11\n", + "2500000 lines processed by processor 0\n", + "2500000 lines processed by processor 8\n", + "2500000 lines processed by processor 9\n", + "The node collector called 6000000 times: 30000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 6000000 times: 0 nrows, 742817342 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "2500000 lines processed by processor 2\n", + "2500000 lines processed by processor 10\n", + "2500000 lines processed by processor 6\n", + "2500000 lines processed by processor 1\n", + "2500000 lines processed by processor 3\n", + "The description collector called 6000000 times: 0 nrows, 918682552 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The qual collector called 5000000 times: 0 nrows, 0 erows, 95556082 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 6500000 times: 0 nrows, 788430809 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 6500000 times: 32500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 6500000 times: 0 nrows, 944911355 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The qual collector called 5500000 times: 0 nrows, 0 erows, 104971336 qrows, 0 invalid erows, 0 invalid qrows\n", + "The sitelink collector called 3000000 times: 0 nrows, 121890140 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 7000000 times: 35000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 7000000 times: 0 nrows, 852807659 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 7000000 times: 0 nrows, 985723776 erows, 0 qrows, 0 invalid erows, 
0 invalid qrows\n", + "3000000 lines processed by processor 5\n", + "3000000 lines processed by processor 7\n", + "3000000 lines processed by processor 2\n", + "3000000 lines processed by processor 11\n", + "3000000 lines processed by processor 10\n", + "3000000 lines processed by processor 0\n", + "3000000 lines processed by processor 8\n", + "3000000 lines processed by processor 9\n", + "3000000 lines processed by processor 4\n", + "3000000 lines processed by processor 6\n", + "3000000 lines processed by processor 3\n", + "3000000 lines processed by processor 1\n", + "The qual collector called 6000000 times: 0 nrows, 0 erows, 115587156 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q3044-P26-Q263474-631d88d0-0-P580-9b0b8a (^0770-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q3044-P26-Q261866-27b1ed09-0-P580-3fbd66 (^0794-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q27932-P26-Q287503-29306074-0-P580-11c3a9 (^1237-04-25T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q38370-P26-Q80823-ae3ce4e4-0-P580-c9d352 (^1533-01-25T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q38370-P26-Q182637-8103e2ff-0-P580-7524c3 (^1536-05-30T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q38370-P26-Q57126-cb76b09d-0-P580-c55b0a (^1540-01-06T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q38370-P26-Q188926-259757b1-0-P580-3301d6 (^1540-07-28T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q38370-P26-Q192943-4b53adeb-0-P580-1ea2b6 (^1543-07-12T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q43682-wikipedia_sitelink-7b59de (http://ms.wikipedia.org/wiki/Philipp_Lahm)\n", + "\n", + "*** Qualifier collision #1 detected for Q60563-P26-Q2915743-f5fdee07-0-P580-cade68 (^1169-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for 
Q65946-P26-Q462536-89b54878-0-P580-48f754 (^1407-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q66888-P26-Q3721846-b7243730-0-P580-4e0bc1 (^1571-01-09T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q66888-P26-Q23771111-34bc78ba-0-P580-713f01 (^1560-03-03T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q68304-P26-Q539111-dfcad6f4-0-P580-d0edbb (^1545-05-17T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q95627-P26-Q354945-873a167d-0-P580-3b86a9 (^1276-11-24T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q121846-P26-Q55579391-afdbc2b3-0-P580-cae35d (^1284-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q122794-P26-Q430950-e085ea2d-0-P580-94ae3a (^1577-10-20T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q124682-P26-Q337057-4fb67536-0-P580-db5ec5 (^1389-08-17T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q130005-P26-Q259564-c738415f-0-P580-a4a595 (^1045-01-23T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q132545-P26-Q131552-2fbc7eb5-0-P580-e56690 (^1533-10-28T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q134452-P26-Q201143-a2079e30-0-P580-7c0e43 (^1491-12-06T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q160349-P26-Q154064-ec5ff971-0-P580-7a7cba (^1385-07-17T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q220845-P26-Q936976-0f99833d-0-P580-5eeb19 (^1572-08-18T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q234257-P26-Q170398-56a0eb9a-0-P580-850b4d (^1816-01-24T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q259564-P26-Q130005-bd5ab415-0-P580-a4a595 (^1045-01-23T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q325583-P26-Q527486-704144b1-0-P580-f981af (^1577-05-19T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for 
Q326449-P26-Q23682783-85a9914e-0-P580-13178a (^1736-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q440132-P26-Q506527-db15118a-0-P580-2bef25 (^1524-11-06T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q469389-P26-Q1924994-36c61689-0-P580-017942 (^1377-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q658714-P26-Q20498980-28fdf4a6-0-P580-0672b7 (^1409-01-30T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q682736-P26-Q68285-f3f03090-0-P580-eae385 (^1460-11-19T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q684276-P26-Q61576937-87fbff2c-0-P580-2b5632 (^1217-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q684224-P26-Q326738-18c31ccf-0-P580-ff2137 (^1524-01-17T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q703249-P26-Q1309296-09047836-0-P580-a97c74 (^1228-00-00T00:00:00Z/9)\n", + "\n", + "*** Sitelink collision #1 detected for Q1581723-wikipedia_sitelink-ff37b1 (http://eu.wikipedia.org/wiki/The_Love_Parade)\n", + "\n", + "*** Qualifier collision #1 detected for Q2028843-P26-Q37062-259ae253-0-P580-6f4356 (^1411-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q6244699-P26-Q76366716-754c9057-0-P580-d7261a (^1579-04-27T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q7020051-wikipedia_sitelink-033b05 (http://fi.wikipedia.org/wiki/Luokka:Palkitut)\n", + "The node collector called 7500000 times: 37500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 7500000 times: 0 nrows, 922083165 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q15117391-wikipedia_sitelink-00ef81 (http://.wikipedia.org/wiki/Module:Message_box/configuration)\n", + "The description collector called 7500000 times: 0 nrows, 1062631793 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The sitelink 
collector called 3500000 times: 0 nrows, 153793624 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q25713407-wikipedia_sitelink-ff4697 (http://.wikipedia.org/wiki/Template:CURRENTCONTENTLANGUAGE)\n", + "The node collector called 8000000 times: 40000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 8000000 times: 0 nrows, 974216636 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The qual collector called 6500000 times: 0 nrows, 0 erows, 122984270 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 8000000 times: 0 nrows, 1155675148 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "3500000 lines processed by processor 0\n", + "3500000 lines processed by processor 5\n", + "3500000 lines processed by processor 7\n", + "3500000 lines processed by processor 11\n", + "3500000 lines processed by processor 2\n", + "3500000 lines processed by processor 10\n", + "3500000 lines processed by processor 8\n", + "3500000 lines processed by processor 9\n", + "3500000 lines processed by processor 4\n", + "3500000 lines processed by processor 3\n", + "3500000 lines processed by processor 6\n", + "3500000 lines processed by processor 1\n", + "The node collector called 8500000 times: 42500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 8500000 times: 0 nrows, 1047004832 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The qual collector called 7000000 times: 0 nrows, 0 erows, 134121504 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 8500000 times: 0 nrows, 1254745369 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q64944842-wikipedia_sitelink-8258da (http://.wikipedia.org/wiki/Module:Portal_navigation)\n", + "\n", + "*** Qualifier collision #1 detected for Q65617406-P26-Q265478-6faeca05-0-P580-dc9c16 
(^1884-05-30T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q76366716-P26-Q6244699-b5d45f0b-0-P580-d7261a (^1579-04-27T00:00:00Z/11)\n", + "The edge collector called 9000000 times: 0 nrows, 1092386056 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 9000000 times: 45000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The qual collector called 7500000 times: 0 nrows, 0 erows, 143533696 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 9000000 times: 0 nrows, 1286483409 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The sitelink collector called 4000000 times: 0 nrows, 162571004 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 9500000 times: 47500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 9500000 times: 0 nrows, 1155205313 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q102226589-wikipedia_sitelink-c65fff (http://.wikipedia.org/wiki/Template:User_mnw)\n", + "4000000 lines processed by processor 7\n", + "4000000 lines processed by processor 0\n", + "4000000 lines processed by processor 11\n", + "4000000 lines processed by processor 5\n", + "4000000 lines processed by processor 10\n", + "4000000 lines processed by processor 8\n", + "4000000 lines processed by processor 2\n", + "The description collector called 9500000 times: 0 nrows, 1331240615 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "4000000 lines processed by processor 9\n", + "4000000 lines processed by processor 4\n", + "4000000 lines processed by processor 3\n", + "4000000 lines processed by processor 1\n", + "4000000 lines processed by processor 6\n", + "The qual collector called 8000000 times: 0 nrows, 0 erows, 154200276 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q3740-wikipedia_sitelink-7aaed8 
(http://.wikipedia.org/wiki/Category:Templates)\n", + "\n", + "*** Sitelink collision #1 detected for Q8079-wikipedia_sitelink-ade6e5 (http://ga.wikipedia.org/wiki/Nintendo_Wii)\n", + "\n", + "*** Qualifier collision #1 detected for Q41847-P26-Q232137-573ea212-0-P580-fe3abc (^0956-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q60386-P26-Q157776-b68a50b9-0-P580-e55fcf (^1478-09-06T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q60211-P26-Q264709-a5d5e20b-0-P580-7f1413 (^1564-12-17T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q70828-P26-Q110845-78948fbb-0-P580-189c4f (^1282-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q79176-P26-Q702602-bcda292d-0-P580-676c21 (^1431-06-03T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q112783-wikipedia_sitelink-d7b2c3 (http://uk.wikipedia.org/wiki/Вілкокс_(округ))\n", + "\n", + "*** Qualifier collision #1 detected for Q119050-P26-Q26882160-c09f6014-0-P580-78cd0b (^1567-01-13T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q120365-P26-Q69620-c54a3667-0-P580-05429a (^1116-07-13T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q167782-P26-Q231794-aef59aa3-0-P580-7e2e98 (^1350-04-08T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q170586-P26-Q231742-11295529-0-P580-90c66e (^1313-07-00T00:00:00Z/10)\n", + "\n", + "*** Qualifier collision #1 detected for Q172203-P26-Q229419-b442326a-0-P580-a50c51 (^1262-05-28T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q174964-P26-Q231798-bd2d3d6b-0-P580-dc0f7a (^1322-09-21T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q202566-P26-Q688471-440b6399-0-P580-283d12 (^1531-09-20T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q232801-P26-Q721680-fa26b14e-0-P580-70598b (^1473-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for 
Q270234-P26-Q210569-6b693078-0-P580-f6928a (^1446-06-20T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q325824-P26-Q547225-762b0607-0-P580-df29d7 (^1467-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q441394-P26-Q57161-47bffbac-0-P580-77780b (^1308-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q454769-P26-Q76956-91d862f6-0-P580-981a99 (^1245-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q455201-P26-Q152148-9eb66558-0-P580-e1de94 (^1389-05-02T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q479538-P26-Q98010-40ca7cda-0-P580-31ff5b (^1582-11-10T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q513315-P26-Q87066-cd6b2f7c-0-P580-3f638b (^1551-03-10T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q513315-P26-Q70019-5c7fa382-0-P580-f9548c (^1558-08-01T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q547225-P26-Q325824-31db3890-0-P580-df29d7 (^1467-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q567378-P26-Q434771-205319b2-0-P580-4d06ab (^1509-11-20T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q571597-P26-Q327750-b0a44162-0-P580-f16789 (^1555-09-10T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q687028-P26-Q2334373-525f829d-0-P580-a6af64 (^1556-02-16T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q1070853-P26-Q2467970-c7d5f6fa-0-P580-d71f7b (^1358-09-04T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q1141121-P26-Q380373-eeba5d95-0-P580-2e184a (^1294-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q1916706-P26-Q80714-35bbccc5-0-P580-d3fce7 (^1109-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q2049198-P26-Q63291-be046904-0-P580-f3b88a (^1372-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for 
Q2334373-P26-Q687028-b6b9f398-0-P580-a6af64 (^1556-02-16T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q2334373-P26-Q328693-3f939052-0-P580-0d15c6 (^1543-08-26T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q3926051-wikipedia_sitelink-a027cd (http://.wikipedia.org/wiki/Template:Tl)\n", + "\n", + "*** Sitelink collision #1 detected for Q4481730-wikipedia_sitelink-1d5954 (http://.wikipedia.org/wiki/Template:Tracked)\n", + "\n", + "*** Sitelink collision #1 detected for Q4989282-wikipedia_sitelink-8a2fcc (http://.wikipedia.org/wiki/Category:Pages_with_broken_file_links)\n", + "\n", + "*** Sitelink collision #1 detected for Q5070586-wikipedia_sitelink-b1e4d1 (http://.wikipedia.org/wiki/Template:Shortcut)\n", + "\n", + "*** Sitelink collision #1 detected for Q6027565-wikipedia_sitelink-91a43e (http://.wikipedia.org/wiki/Template:Tag)\n", + "\n", + "*** Qualifier collision #1 detected for Q6940461-P159-Q61302-c45d5aa7-0-P625-dc88d7 (@28.6386/-106.0756)\n", + "\n", + "*** Sitelink collision #1 detected for Q7643575-wikipedia_sitelink-d4d012 (http://.wikipedia.org/wiki/Template:Colon)\n", + "\n", + "*** Qualifier collision #1 detected for Q9150575-P26-Q679083-79dd46a6-0-P580-5d5db4 (^1320-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q13972091-P26-Q75389849-5b19ecc3-0-P1319-532ed8 (^1509-07-04T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q13972091-P26-Q6469914-b9869239-0-P1319-839147 (^1520-00-00T00:00:00Z/9)\n", + "The node collector called 10000000 times: 50000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 10000000 times: 0 nrows, 1225496761 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q15281133-P26-Q75240211-8a7057f8-0-P580-97ad08 (^1526-07-20T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q15605797-wikipedia_sitelink-d3c200 
(http://.wikipedia.org/wiki/Module:List)\n", + "The sitelink collector called 4500000 times: 0 nrows, 193397096 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q17347230-wikipedia_sitelink-be73cc (http://.wikipedia.org/wiki/Module:Category_handler/blacklist)\n", + "\n", + "*** Qualifier collision #1 detected for Q20202663-P26-Q299612-893fda0a-0-P580-b50376 (^1080-00-00T00:00:00Z/9)\n", + "The description collector called 10000000 times: 0 nrows, 1410220232 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q20819962-wikipedia_sitelink-377adb (http://.wikipedia.org/wiki/Module:Fallback)\n", + "\n", + "*** Qualifier collision #1 detected for Q26877297-P26-Q542751-a70d423c-0-P580-d584ea (^1488-02-17T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q27031232-P26-Q55217321-0fe60a4f-0-P580-7606e7 (^1280-00-00T00:00:00Z/9)\n", + "The qual collector called 8500000 times: 0 nrows, 0 erows, 161593444 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 10500000 times: 52500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 10500000 times: 0 nrows, 1291455380 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 10500000 times: 0 nrows, 1518697994 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q44191792-P26-Q54862322-4c83e8e6-0-P580-a27cd1 (^1567-08-21T00:00:00Z/11)\n", + "4500000 lines processed by processor 7\n", + "4500000 lines processed by processor 0\n", + "4500000 lines processed by processor 11\n", + "4500000 lines processed by processor 5\n", + "4500000 lines processed by processor 10\n", + "4500000 lines processed by processor 8\n", + "4500000 lines processed by processor 9\n", + "4500000 lines processed by processor 2\n", + "4500000 lines processed by processor 4\n", + "4500000 lines 
processed by processor 3\n", + "4500000 lines processed by processor 1\n", + "4500000 lines processed by processor 6\n", + "The qual collector called 9000000 times: 0 nrows, 0 erows, 172525914 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 11000000 times: 55000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 11000000 times: 0 nrows, 1351553522 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q64506586-P26-Q262726-b670dee9-0-P580-4a9d3d (^1298-00-00T00:00:00Z/9)\n", + "The description collector called 11000000 times: 0 nrows, 1600991120 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The sitelink collector called 5000000 times: 0 nrows, 203335940 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q75395291-P26-Q76157640-e3d697ee-0-P580-54254d (^1578-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q75420332-P26-Q208922-3b5559ee-0-P580-447dca (^1559-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q76157640-P26-Q75395291-d75eee5e-0-P580-54254d (^1578-00-00T00:00:00Z/9)\n", + "The edge collector called 11500000 times: 0 nrows, 1398071827 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 11500000 times: 57500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The qual collector called 9500000 times: 0 nrows, 0 erows, 181961851 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 11500000 times: 0 nrows, 1631823417 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "5000000 lines processed by processor 0\n", + "5000000 lines processed by processor 11\n", + "5000000 lines processed by processor 7\n", + "5000000 lines processed by processor 8\n", + "5000000 lines processed by processor 10\n", + "5000000 lines processed by processor 5\n", + "The node collector 
called 12000000 times: 60000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 12000000 times: 0 nrows, 1457551225 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "5000000 lines processed by processor 9\n", + "5000000 lines processed by processor 4\n", + "5000000 lines processed by processor 2\n", + "5000000 lines processed by processor 3\n", + "5000000 lines processed by processor 6\n", + "5000000 lines processed by processor 1\n", + "The qual collector called 10000000 times: 0 nrows, 0 erows, 192683479 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q7731-P26-Q259907-7f7cc241-0-P580-8d5052 (^1648-01-26T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q7731-P26-Q241797-ff9269a2-0-P580-a01064 (^1671-02-01T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q57920-P26-Q267483-a2460de3-0-P580-c0fc4c (^1570-01-08T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q62483-P26-Q229286-18b62769-0-P580-bebb21 (^1541-06-14T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q62483-P26-Q261905-fc01d066-0-P580-7aecc7 (^1546-07-18T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q69334-P26-Q2419674-1dc5e587-0-P580-e1ff18 (^1183-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q72922-P26-Q56582849-77ca7313-0-P580-c16f56 (^1499-01-21T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q76956-P26-Q454769-cf7fc40d-0-P580-981a99 (^1245-00-00T00:00:00Z/9)\n", + "\n", + "*** Sitelink collision #1 detected for Q80235-wikipedia_sitelink-03e5ac (http://is.wikipedia.org/wiki/Tamarind)\n", + "\n", + "*** Qualifier collision #1 detected for Q86055-P26-Q24661944-f75c4596-0-P580-54820b (^1472-10-19T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q151327-wikipedia_sitelink-9044c0 (http://oc.wikipedia.org/wiki/(333)_Badenia)\n", + "\n", + "*** 
Qualifier collision #1 detected for Q168664-P26-Q15193-1b533b05-0-P580-a310ca (^1793-10-09T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q276526-P26-Q10855916-3e70b907-0-P580-f18c2a (^1392-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q287503-P26-Q316828-d4637da7-0-P580-9879f5 (^1261-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q316831-P26-Q238609-208f7dcc-0-P580-92ae06 (^1153-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q316831-P26-Q450971-656d5797-0-P580-5ed4f3 (^1177-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q336754-P26-Q2084307-30a93eb5-0-P580-b520a9 (^1318-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q370902-P26-Q75289133-2d7df0e9-0-P580-83a193 (^1275-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q430782-P26-Q3007367-9502d33f-0-P580-5b468d (^1555-02-07T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q443876-P26-Q453771-bbc80f51-0-P580-84a26a (^1446-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q563792-P26-Q4958342-a85e5b57-0-P580-acfb1b (^1391-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q588852-P26-Q58514-55e81240-0-P580-ae0480 (^1514-10-09T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q651948-P26-Q9165680-bf5d7e43-0-P580-a08da9 (^1396-03-06T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q679083-P26-Q9150575-c56910ae-0-P580-5d5db4 (^1320-00-00T00:00:00Z/9)\n", + "\n", + "*** Sitelink collision #1 detected for Q929200-wikipedia_sitelink-7842a9 (http://vi.wikipedia.org/wiki/Ilicura_militaris)\n", + "\n", + "*** Qualifier collision #1 detected for Q936976-P26-Q220845-281a5972-0-P580-5eeb19 (^1572-08-18T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q1166728-P26-Q1494018-db61e006-0-P580-3550f9 (^1285-00-00T00:00:00Z/9)\n", + 
"The description collector called 12000000 times: 0 nrows, 1675439151 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q1494018-P26-Q1166728-5c17988d-0-P580-3550f9 (^1285-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q1524640-P26-Q166853-40fa3891-0-P580-515f76 (^1375-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q2465095-P26-Q1167368-4ffb7291-0-P580-e64863 (^1257-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q2834394-P39-Q84701409-f487718d-0-P580-ac0fb1 (^1154-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q2834394-P39-Q84701409-f487718d-0-P582-35fc60 (^1173-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q2844574-P793-Q2238935-db1dea90-0-P585-ab2ece (^1350-00-00T00:00:00Z/7)\n", + "\n", + "*** Alias collision #1 detected for Q4493910-alias-ru-70f749 ('Фёдоров А. В.'@ru)\n", + "\n", + "*** Sitelink collision #1 detected for Q4608595-wikipedia_sitelink-4a0154 (http://.wikipedia.org/wiki/Template:Documentation)\n", + "\n", + "*** Sitelink collision #1 detected for Q5611978-wikipedia_sitelink-3b808e (http://.wikipedia.org/wiki/Template:Welcome)\n", + "\n", + "*** Qualifier collision #1 detected for Q6129540-P106-Q25393460-4c72cbac-0-P580-9eefc6 (^1552-07-17T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q6426831-wikipedia_sitelink-0d77ef (http://.wikipedia.org/wiki/Template:Edit_filter_warning)\n", + "\n", + "*** Sitelink collision #1 detected for Q7605021-wikipedia_sitelink-3a136d (http://.wikipedia.org/wiki/Template:Comment)\n", + "The sitelink collector called 5500000 times: 0 nrows, 232340004 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q15098140-wikipedia_sitelink-bfa017 (http://.wikipedia.org/wiki/Module:Yesno)\n", + "\n", + "*** Sitelink collision #1 detected for 
Q15117218-wikipedia_sitelink-4b5db5 (http://.wikipedia.org/wiki/Module:Category_handler)\n", + "\n", + "*** Sitelink collision #1 detected for Q15506579-wikipedia_sitelink-c363ea (http://.wikipedia.org/wiki/Module:Documentation/config)\n", + "\n", + "*** Sitelink collision #1 detected for Q8244473-wikipedia_sitelink-0e32ac (http://.wikipedia.org/wiki/Module:InfoboxImage)\n", + "The node collector called 12500000 times: 62500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 12500000 times: 0 nrows, 1530628094 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q20962109-wikipedia_sitelink-0f5e03 (http://.wikipedia.org/wiki/Module:ISOdate)\n", + "\n", + "*** Sitelink collision #1 detected for Q22910717-wikipedia_sitelink-4da401 (http://.wikipedia.org/wiki/Template:Sandbox_other)\n", + "The description collector called 12500000 times: 0 nrows, 1757625754 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q25714577-wikipedia_sitelink-5c85f4 (http://.wikipedia.org/wiki/Module:WikidataIB)\n", + "\n", + "*** Qualifier collision #1 detected for Q26877285-P26-Q828710-08b99587-0-P580-c30f0a (^1566-02-16T00:00:00Z/11)\n", + "The qual collector called 10500000 times: 0 nrows, 0 erows, 200088433 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 13000000 times: 65000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 13000000 times: 0 nrows, 1599577883 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 13000000 times: 0 nrows, 1879410082 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "5500000 lines processed by processor 0\n", + "5500000 lines processed by processor 11\n", + "5500000 lines processed by processor 7\n", + "5500000 lines processed by processor 8\n", + "5500000 lines processed by processor 10\n", + "5500000 
lines processed by processor 5\n", + "5500000 lines processed by processor 9\n", + "5500000 lines processed by processor 2\n", + "5500000 lines processed by processor 4\n", + "5500000 lines processed by processor 3\n", + "5500000 lines processed by processor 6\n", + "5500000 lines processed by processor 1\n", + "The qual collector called 11000000 times: 0 nrows, 0 erows, 210987903 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q61140016-P26-Q44192051-99f17a00-0-P580-7a76d2 (^1575-01-30T00:00:00Z/11)\n", + "The sitelink collector called 6000000 times: 0 nrows, 243922942 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 13500000 times: 67500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 13500000 times: 0 nrows, 1654081431 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 13500000 times: 0 nrows, 1939040832 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q75390802-P26-Q6135465-d640c900-0-P580-91ce5e (^1422-10-20T00:00:00Z/11)\n", + "The qual collector called 11500000 times: 0 nrows, 0 erows, 220410486 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 14000000 times: 70000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 14000000 times: 0 nrows, 1703202908 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 14000000 times: 0 nrows, 1972258264 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "6000000 lines processed by processor 11\n", + "6000000 lines processed by processor 0\n", + "6000000 lines processed by processor 7\n", + "6000000 lines processed by processor 8\n", + "6000000 lines processed by processor 10\n", + "6000000 lines processed by processor 5\n", + "6000000 lines processed by processor 9\n", + "6000000 lines processed by 
processor 4\n", + "6000000 lines processed by processor 2\n", + "6000000 lines processed by processor 3\n", + "6000000 lines processed by processor 6\n", + "6000000 lines processed by processor 1\n", + "The node collector called 14500000 times: 72500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 14500000 times: 0 nrows, 1766161278 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The qual collector called 12000000 times: 0 nrows, 0 erows, 231223257 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q15193-P26-Q168664-80106bb4-0-P580-a310ca (^1793-10-09T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q53459-P26-Q93408-6fe4810b-0-P580-df7c8b (^1454-02-10T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q56226-wikipedia_sitelink-ff2d61 (http://sl.wikipedia.org/wiki/Kim_Džong-un)\n", + "\n", + "*** Qualifier collision #1 detected for Q58514-P26-Q236220-01ae7b47-0-P580-766cef (^1476-09-08T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q58514-P26-Q201143-bc6f20e4-0-P580-5a4f65 (^1499-01-08T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q58514-P26-Q588852-cdd0895d-0-P580-ae0480 (^1514-10-09T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q69620-P26-Q120365-10ec8d2b-0-P580-05429a (^1116-07-13T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q91003-P26-Q72789-6b43d0ab-0-P580-4be813 (^1150-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q130969-P26-Q229189-a7f573de-0-P580-aacd85 (^1284-08-16T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q131412-P26-Q132548-dcd19f44-0-P580-3d6c00 (^1558-04-24T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q157789-P26-Q260926-aa804d7f-0-P580-d435a1 (^1502-04-10T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for 
Q157789-P26-Q233790-625e10c2-0-P580-4cdfa7 (^1518-10-09T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q160165-P26-Q63494-d339ca47-0-P580-b1f503 (^1710-11-11T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q178525-P26-Q134259-aaf86e95-0-P580-d2c4ca (^1137-07-25T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q178525-P26-Q102140-ab1e10d9-0-P580-8b694f (^1152-05-18T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q208420-wikipedia_sitelink-87d729 (http://pl.wikipedia.org/wiki/Triera)\n", + "\n", + "*** Qualifier collision #1 detected for Q231476-P26-Q161866-fa54b200-0-P580-3b4b8f (^1403-02-07T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q231476-P26-Q449008-63ea5e99-0-P580-6fd386 (^1386-09-11T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q254927-P26-Q367001-9bafa4c7-0-P580-a91848 (^1112-02-03T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q259907-P26-Q7731-5a5cc2ce-0-P580-8d5052 (^1648-01-26T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q261847-P26-Q767582-bc9963e7-0-P580-5c7f0d (^1456-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q266309-P26-Q510987-645ea879-0-P580-5513d1 (^1272-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q283548-P26-Q450768-eb0b2e92-0-P580-26c3cf (^1572-08-17T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q397733-wikipedia_sitelink-d5cb47 (http://ar.wikipedia.org/wiki/سلوني)\n", + "\n", + "*** Qualifier collision #1 detected for Q505918-P26-Q274025-fb8c2108-0-P580-e7b518 (^1545-02-15T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q540767-P26-Q74019-da034a0b-0-P580-5774e5 (^1422-07-25T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q702209-P26-Q454810-36c82016-0-P580-e60df9 (^1476-08-25T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for 
Q935411-P26-Q70789-de6fac79-0-P580-941716 (^1463-05-10T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q1360375-P26-Q4726173-be500fdb-0-P580-98b695 (^1253-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q2318556-P26-Q317621-b5da07f6-0-P580-a8e531 (^1575-06-14T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q3721525-P26-Q571632-481d6ba3-0-P580-10eca4 (^1436-02-12T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q3904375-P26-Q57231616-7c1080b1-0-P580-df5c65 (^1281-00-00T00:00:00Z/9)\n", + "\n", + "*** Sitelink collision #1 detected for Q4048908-wikipedia_sitelink-0836ac (http://.wikipedia.org/wiki/Category:Hidden_categories)\n", + "\n", + "*** Sitelink collision #1 detected for Q5324375-wikipedia_sitelink-91df00 (http://.wikipedia.org/wiki/Category:Maintenance)\n", + "\n", + "*** Sitelink collision #1 detected for Q5626735-wikipedia_sitelink-7430af (http://.wikipedia.org/wiki/Template:Infobox)\n", + "\n", + "*** Qualifier collision #1 detected for Q6135465-P26-Q75390802-f0852539-0-P580-91ce5e (^1422-10-20T00:00:00Z/11)\n", + "The description collector called 14500000 times: 0 nrows, 2017690517 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q6469914-P26-Q4497270-0dd2e8d0-0-P580-bd95e9 (^1511-08-28T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q8213590-wikipedia_sitelink-5ee1a4 (http://.wikipedia.org/wiki/Template:Sister_project)\n", + "\n", + "*** Qualifier collision #1 detected for Q9264442-P159-Q270-adc8754c-0-P625-bdfc28 (@52.228472/21.013139)\n", + "The sitelink collector called 6500000 times: 0 nrows, 271891952 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q12118776-wikipedia_sitelink-2380bc (http://tt.wikipedia.org/wiki/Луминица_(Констанца))\n", + "\n", + "*** Sitelink collision #1 detected for Q15408619-wikipedia_sitelink-e2b771 
(http://.wikipedia.org/wiki/Module:TableTools)\n", + "\n", + "*** Sitelink collision #1 detected for Q13107716-wikipedia_sitelink-595446 (http://.wikipedia.org/wiki/Module:Infobox)\n", + "\n", + "*** Sitelink collision #1 detected for Q15818852-wikipedia_sitelink-659cea (http://.wikipedia.org/wiki/Template:Section_resolved)\n", + "\n", + "*** Qualifier collision #1 detected for Q16566720-P26-Q319870-602b0c96-0-P580-ee5a8a (^1572-09-08T00:00:00Z/11)\n", + "\n", + "*** Sitelink collision #1 detected for Q16830095-wikipedia_sitelink-064884 (http://.wikipedia.org/wiki/Module:Check_for_unknown_parameters)\n", + "\n", + "*** Sitelink collision #1 detected for Q17347215-wikipedia_sitelink-aada88 (http://.wikipedia.org/wiki/Module:Category_handler/data)\n", + "\n", + "*** Sitelink collision #1 detected for Q18123834-wikipedia_sitelink-e4e0f9 (http://.wikipedia.org/wiki/Template:Mono)\n", + "\n", + "*** Sitelink collision #1 detected for Q18338361-wikipedia_sitelink-a89d12 (http://.wikipedia.org/wiki/Category:Pages_using_duplicate_arguments_in_template_calls)\n", + "The node collector called 15000000 times: 75000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 15000000 times: 0 nrows, 1833984215 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q23769960-P1416-Q26222380-85afc868-0-P580-896ff2 (^1999-12-01T00:00:00Z/11)\n", + "The description collector called 15000000 times: 0 nrows, 2108547178 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The qual collector called 12500000 times: 0 nrows, 0 erows, 238549976 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 15500000 times: 77500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 15500000 times: 0 nrows, 1905410846 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "6500000 lines processed by processor 11\n", + "6500000 lines processed by processor 
7\n", + "6500000 lines processed by processor 0\n", + "6500000 lines processed by processor 8\n", + "6500000 lines processed by processor 5\n", + "6500000 lines processed by processor 10\n", + "6500000 lines processed by processor 9\n", + "6500000 lines processed by processor 4\n", + "6500000 lines processed by processor 2\n", + "6500000 lines processed by processor 3\n", + "6500000 lines processed by processor 6\n", + "6500000 lines processed by processor 1\n", + "The description collector called 15500000 times: 0 nrows, 2236189283 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The sitelink collector called 7000000 times: 0 nrows, 284558497 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q56528363-wikipedia_sitelink-12db12 (http://.wikipedia.org/wiki/Module:DateI18n)\n", + "The qual collector called 13000000 times: 0 nrows, 0 erows, 249491175 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q64944768-wikipedia_sitelink-f8a890 (http://.wikipedia.org/wiki/Template:Portal_navigation)\n", + "The node collector called 16000000 times: 80000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 16000000 times: 0 nrows, 1956689637 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 16000000 times: 0 nrows, 2275342115 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q75273844-P26-Q75273846-b5720745-0-P580-3f9e86 (^1468-07-08T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q75389849-P26-Q13972091-b1b4193f-0-P1319-532ed8 (^1509-07-04T00:00:00Z/11)\n", + "The qual collector called 13500000 times: 0 nrows, 0 erows, 258909784 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 16500000 times: 82500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 16500000 
times: 0 nrows, 2011778907 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 16500000 times: 0 nrows, 2312084116 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "7000000 lines processed by processor 11\n", + "7000000 lines processed by processor 0\n", + "7000000 lines processed by processor 7\n", + "7000000 lines processed by processor 8\n", + "7000000 lines processed by processor 5\n", + "7000000 lines processed by processor 10\n", + "7000000 lines processed by processor 4\n", + "7000000 lines processed by processor 9\n", + "7000000 lines processed by processor 2\n", + "7000000 lines processed by processor 3\n", + "\n", + "*** Sitelink collision #1 detected for Q105429923-wikipedia_sitelink-61eaae (http://.wikipedia.org/wiki/Special:RecentChanges)\n", + "7000000 lines processed by processor 6\n", + "7000000 lines processed by processor 1\n", + "The qual collector called 14000000 times: 0 nrows, 0 erows, 269782940 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q42305-P26-Q229807-78d461dc-0-P580-34f596 (^1191-05-12T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q51089-P26-Q378756-91878f66-0-P580-78aa9d (^1350-09-27T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q64222-P26-Q969770-fd31fc8c-0-P580-684a59 (^1433-11-12T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q68567-P26-Q53441-de9c32df-0-P580-4d0846 (^1115-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q68952-P26-Q327572-72aafaf0-0-P580-5906e2 (^1563-05-10T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q132548-P26-Q131412-dba1192f-0-P580-3d6c00 (^1558-04-24T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q155581-P26-Q61261-80392b57-0-P580-3550f9 (^1285-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q166853-P26-Q1524640-013a0a25-0-P580-515f76 
(^1375-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q234549-P26-Q154998-45a333f4-0-P580-45ce34 (^1525-10-29T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q235484-P26-Q105378-affa9a7c-0-P580-e8c2d5 (^1168-02-01T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q238609-P26-Q316831-e3e16df6-0-P580-92ae06 (^1153-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q242636-P26-Q129308-cfe073cc-0-P580-c6fd17 (^1189-08-29T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q242636-P26-Q1502979-64af4f58-0-P580-366f19 (^1214-01-20T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q242636-P26-Q1381324-c86a9785-0-P580-762370 (^1217-09-00T00:00:00Z/10)\n", + "\n", + "*** Qualifier collision #1 detected for Q260926-P26-Q157789-f6d156ff-0-P580-d435a1 (^1502-04-10T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q264709-P26-Q60211-e9a2f9fc-0-P580-7f1413 (^1564-12-17T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q266025-P26-Q312110-2dbf2ebf-0-P580-189c4f (^1282-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q273307-P26-Q1772833-b61045de-0-P580-a25d98 (^1271-00-00T00:00:00Z/9)\n", + "\n", + "*** Sitelink collision #1 detected for Q282380-wikipedia_sitelink-bf04f9 (http://et.wikipedia.org/wiki/Jedwabne)\n", + "\n", + "*** Qualifier collision #1 detected for Q325041-P26-Q2309561-25e0a31e-0-P580-f872f9 (^1515-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q435726-P26-Q719626-f865d7a4-0-P580-5a49c5 (^1496-11-21T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q450971-P26-Q316831-e5799852-0-P580-5ed4f3 (^1177-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q463669-P26-Q40433-6767e474-0-P580-480b99 (^1550-00-00T00:00:00Z/9)\n", + "\n", + "*** Qualifier collision #1 detected for Q506527-P26-Q440132-e6b92d24-0-P580-2bef25 
(^1524-11-06T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q553550-P26-Q465382-752c3e78-0-P580-6fd04d (^1540-02-08T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q926335-P26-Q2419674-95719c4f-0-P580-d5e047 (^1190-00-00T00:00:00Z/9)\n", + "\n", + "*** Sitelink collision #1 detected for Q956852-wikipedia_sitelink-44d82a (http://zh-min-nan.wikipedia.org/wiki/Buffalo_(Missouri))\n", + "The node collector called 17000000 times: 85000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 17000000 times: 0 nrows, 2082839182 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q2299423-P159-Q1715-d2db66d9-0-P625-ceaec4 (@52.3704/9.7734)\n", + "\n", + "*** Qualifier collision #1 detected for Q2467970-P26-Q1070853-48f1aeab-0-P580-d71f7b (^1358-09-04T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q2517901-P106-Q25393460-25aee660-0-P580-e5e8e2 (^1558-03-05T00:00:00Z/11)\n", + "\n", + "*** Qualifier collision #1 detected for Q3997398-P159-Q101500-ee2c35e9-0-P625-79ba3e (@45.718139/9.715862)\n", + "\n", + "*** Sitelink collision #1 detected for Q5640659-wikipedia_sitelink-53102e (http://.wikipedia.org/wiki/Template:Ombox)\n", + "\n", + "*** Sitelink collision #1 detected for Q5843835-wikipedia_sitelink-6be4da (http://.wikipedia.org/wiki/Template:Fmbox)\n", + "\n", + "*** Sitelink collision #1 detected for Q7009036-wikipedia_sitelink-904fb9 (http://lij.wikipedia.org/wiki/Categorîa:Bahrain)\n", + "The sitelink collector called 7500000 times: 0 nrows, 311160195 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 17000000 times: 0 nrows, 2383647839 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q13889180-wikipedia_sitelink-61fce0 (http://ceb.wikipedia.org/wiki/Neotanais_minimus)\n", + "\n", + "*** Sitelink collision #1 detected for 
Q14357839-wikipedia_sitelink-379650 (http://.wikipedia.org/wiki/Module:Documentation)\n", + "\n", + "*** Sitelink collision #1 detected for Q15379728-wikipedia_sitelink-f395bc (http://.wikipedia.org/wiki/Module:Arguments)\n", + "\n", + "*** Sitelink collision #1 detected for Q16746551-wikipedia_sitelink-3b6af8 (http://.wikipedia.org/wiki/Template:Bulleted_list)\n", + "\n", + "*** Sitelink collision #1 detected for Q17347224-wikipedia_sitelink-98d4f5 (http://.wikipedia.org/wiki/Module:Category_handler/shared)\n", + "\n", + "*** Sitelink collision #1 detected for Q18577165-wikipedia_sitelink-79293a (http://.wikipedia.org/wiki/Template:Translatable_template)\n", + "\n", + "*** Sitelink collision #1 detected for Q18577187-wikipedia_sitelink-8b452d (http://.wikipedia.org/wiki/Template:Translatable_template_name)\n", + "The node collector called 17500000 times: 87500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 17500000 times: 0 nrows, 2136259786 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Sitelink collision #1 detected for Q26905045-wikipedia_sitelink-969685 (http://.wikipedia.org/wiki/Module:Complex_date)\n", + "\n", + "*** Sitelink collision #1 detected for Q28132212-wikipedia_sitelink-62565d (http://.wikipedia.org/wiki/Module:TNT)\n", + "\n", + "*** Qualifier collision #1 detected for Q31191558-P26-Q61139836-6de8c7ca-0-P580-70d445 (^1577-02-18T00:00:00Z/11)\n", + "The qual collector called 14500000 times: 0 nrows, 0 erows, 277058427 qrows, 0 invalid erows, 0 invalid qrows\n", + "The description collector called 17500000 times: 0 nrows, 2471569069 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "7500000 lines processed by processor 11\n", + "7500000 lines processed by processor 0\n", + "7500000 lines processed by processor 8\n", + "7500000 lines processed by processor 7\n", + "7500000 lines processed by processor 10\n", + "7500000 lines processed by processor 5\n", + "The node collector 
called 18000000 times: 90000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 18000000 times: 0 nrows, 2210319612 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "7500000 lines processed by processor 9\n", + "7500000 lines processed by processor 4\n", + "7500000 lines processed by processor 2\n", + "7500000 lines processed by processor 6\n", + "7500000 lines processed by processor 3\n", + "The sitelink collector called 8000000 times: 0 nrows, 324984687 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "7500000 lines processed by processor 1\n", + "\n", + "*** Qualifier collision #1 detected for Q54882974-P26-Q26205746-cdf8483f-0-P580-d027aa (^1549-02-16T00:00:00Z/11)\n", + "The description collector called 18000000 times: 0 nrows, 2585990048 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The qual collector called 15000000 times: 0 nrows, 0 erows, 287910963 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 18500000 times: 92500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 18500000 times: 0 nrows, 2259350521 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "\n", + "*** Qualifier collision #1 detected for Q75273846-P26-Q75273844-b3554203-0-P580-3f9e86 (^1468-07-08T00:00:00Z/11)\n", + "The description collector called 18500000 times: 0 nrows, 2615091276 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The qual collector called 15500000 times: 0 nrows, 0 erows, 297353277 qrows, 0 invalid erows, 0 invalid qrows\n", + "The node collector called 19000000 times: 95000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "The edge collector called 19000000 times: 0 nrows, 2320200322 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "8000000 lines processed by processor 11\n", + "8000000 lines processed by processor 0\n", + "8000000 lines processed by processor 10\n", + "8000000 lines processed by processor 
5\n", + "8000000 lines processed by processor 7\n", + "8000000 lines processed by processor 8\n", + "8000000 lines processed by processor 9\n", + "8000000 lines processed by processor 4\n", + "8000000 lines processed by processor 2\n", + "The description collector called 19000000 times: 0 nrows, 2654687085 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", + "8000000 lines processed by processor 6\n", + "8000000 lines processed by processor 3\n", + "8000000 lines processed by processor 1\n", + "The qual collector called 16000000 times: 0 nrows, 0 erows, 308264519 qrows, 0 invalid erows, 0 invalid qrows\n", + "Done processing /data/amandeep/wikidata-20220505/latest-all.json.bz2\n", + "Telling the workers to shut down.\n", + "Exiting worker process 4 (pid 118151).\n", + "Exiting worker process 0 (pid 118147).\n", + "Exiting worker process 10 (pid 118157).\n", + "Exiting worker process 8 (pid 118155).\n", + "Waiting for the workers to shut down.\n", + "Exiting worker process 6 (pid 118153).\n", + "Exiting worker process 1 (pid 118148).\n", + "Exiting worker process 5 (pid 118152).\n", + "Exiting worker process 9 (pid 118156).\n", + "Exiting worker process 11 (pid 118158).\n", + "Exiting worker process 3 (pid 118150).\n", + "Exiting worker process 7 (pid 118154).\n", + "Exiting worker process 2 (pid 118149).\n", + "Worker shut down is complete.\n", + "Telling the node collector to shut down.\n", + "Waiting for the node collector to shut down.\n", + "Exiting the node collector (pid 118140).\n", + "The node collector has closed its output files.\n", + "Node collector shut down is complete.\n", + "Telling the edge collector to shut down.\n", + "Waiting for the edge collector to shut down.\n", + "Exiting the edge collector (pid 118141).\n", + "The edge collector has closed its output files.\n", + "Edge collector shut down is complete.\n", + "Telling the qual collector to shut down.\n", + "Waiting for the qual collector to shut down.\n", + "Exiting the qual collector (pid 
118142).\n", + "The qual collector has closed its output files.\n", + "Qual collector shut down is complete.\n", + "Telling the invalid edge collector to shut down.\n", + "Waiting for the invalid edge collector to shut down.\n", + "Exiting the invalid edge collector (pid 118143).\n", + "The invalid edge collector has closed its output files.\n", + "Invalid edge collector shut down is complete.\n", + "Telling the invalid qual collector to shut down.\n", + "Waiting for the invalid qual collector to shut down.\n", + "Exiting the invalid qual collector (pid 118144).\n", + "The invalid qual collector has closed its output files.\n", + "Invalid qual collector shut down is complete.\n", + "Telling the description collector to shut down.\n", + "Waiting for the description collector to shut down.\n", + "Exiting the description collector (pid 118145).\n", + "The description collector has closed its output files.\n", + "Description collector shut down is complete.\n", + "Telling the sitelink collector to shut down.\n", + "Waiting for the sitelink collector to shut down.\n", + "Exiting the sitelink collector (pid 118146).\n", + "The sitelink collector has closed its output files.\n", + "Sitelink collector shut down is complete.\n", + "import complete\n", + "time taken : 45465.164197444916s\n", + "Timing: elapsed=12:37:48.283348 CPU=10:19:31.139152 ( 81.8%): import-wikidata -i /data/amandeep/wikidata-20220505/latest-all.json.bz2 --node-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.node.unsorted.tsv.gz --minimal-edge-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz --minimal-qual-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz --invalid-edge-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz --invalid-qual-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalue.unsorted.tsv.gz --node-file-id-only --explode-values 
False --all-languages True --lang en --alias-edges True --split-alias-file /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz --split-en-alias-file /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.en.unsorted.tsv.gz --description-edges True --split-description-file /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz --split-en-description-file /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.en.unsorted.tsv.gz --label-edges True --split-label-file /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz --split-en-label-file /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.en.unsorted.tsv.gz --datatype-edges True --split-datatype-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz --entry-type-edges True --split-type-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz --sitelink-edges True --sitelink-verbose-edges True --split-sitelink-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz --split-en-sitelink-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz --value-hash-width 6 --claim-id-hash-width 8 --use-kgtkwriter True --use-mgzip-for-input False --use-mgzip-for-output False --use-shm True --procs 12 --mapper-batch-size 5 --max-size-per-mapper-queue 3 --single-mapper-queue True --collect-results True --collect-seperately True --collector-batch-size 5 --collector-queue-per-proc-size 3 --progress-interval 500000 --clean --allow-end-of-day False --repair-month-or-day-zero --minimum-valid-year 1 --maximum-valid-year 9999 --validate-fromisoformat --repair-lax-coordinates --allow-language-suffixes --allow-wikidata-lq-strings\n" + ] + } + ], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " import-wikidata \\\n", @@ -256,7 +1533,7 @@ " --invalid-qual-file 
${TEMPDIR}/qualifiers.badvalue.${UNSORTED_KGTK} \\\n", " --node-file-id-only \\\n", " --explode-values False \\\n", - " --all-languages False \\\n", + " --all-languages True \\\n", " --lang en \\\n", " --alias-edges True \\\n", " --split-alias-file ${TEMPDIR}/aliases.${UNSORTED_KGTK} \\\n", @@ -281,7 +1558,7 @@ " --use-mgzip-for-input False \\\n", " --use-mgzip-for-output False \\\n", " --use-shm True \\\n", - " --procs 6 \\\n", + " --procs 12 \\\n", " --mapper-batch-size 5 \\\n", " --max-size-per-mapper-queue 3 \\\n", " --single-mapper-queue True \\\n", @@ -312,10 +1589,50 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "6aad9a3c-c27b-4858-802b-b633c40dbb5d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", + "Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the output file: 
/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the reject file: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Applying a dispatched multiple-output object filter\n", + "Read 1362524112 rows, rejected 1361968102 rows, wrote 556010 rows.\n", + "Closing output files.\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "All output files have been closed.\n", + "Timing: elapsed=1:21:07.331541 CPU=4:34:39.248156 (338.6%): filter --verbose --use-mgzip TRUE --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz --first-match-only --pattern ;; novalue -o /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz --pattern ;; somevalue -o /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz --reject-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz\n" + ] + } + ], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " filter ${VERBOSE} --use-mgzip ${USE_MGZIP} \\\n", @@ -337,10 +1654,174 @@ }, { 
"cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "7538ad9d-018a-45ea-95f8-46467e68affe", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: -\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", + "Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: -\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '-' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading stdin\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz\n", + "Graph 
cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '-' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading stdin\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: -\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '-' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading stdin\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalue.unsorted.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalue.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalue.unsorted.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalue.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Opening the reject file: -\n", + "KgtkWriter: writing stdout\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Applying a dispatched multiple-output object filter\n", + "header: 
id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading gzip /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalueClaims.unsorted.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing gzip /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalueClaims.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Opening the reject file: -\n", + "KgtkWriter: writing stdout\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing by cacheing the filter file's key set.\n", + "Building the filter key set from /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz\n", + 
"There are 124282 entries in the filter key set.\n", + "Filtering records from -\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading gzip /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalueClaims.unsorted.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing gzip /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalueClaims.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Opening the reject file: -\n", + "KgtkWriter: writing stdout\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing by cacheing the filter file's key set.\n", + "Building the 
filter key set from /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz\n", + "There are 431728 entries in the filter key set.\n", + "Filtering records from -\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading gzip /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalueClaims.unsorted.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing gzip /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalueClaims.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Opening the reject file: 
/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing gzip /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing by cacheing the filter file's key set.\n", + "Building the filter key set from /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz\n", + "There are 50133 entries in the filter key set.\n", + "Filtering records from -\n", + "Read 308638139 rows, rejected 308457119 rows, wrote 181020 rows.\n", + "Closing output files.\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: not closing standard output\n", + "All output files have been closed.\n", + "Timing: elapsed=1:10:16.268229 CPU=0:19:02.118320 ( 27.1%): filter --verbose --use-mgzip TRUE --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz --first-match-only --pattern ;; novalue -o /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalue.unsorted.tsv.gz --pattern ;; somevalue -o /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalue.unsorted.tsv.gz --reject-file -\n", + "Read 308457119 input records, accepted 39972 records, rejected 308417147 records.\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: not closing standard output\n", + "Timing: elapsed=1:10:16.404495 CPU=0:16:26.828742 ( 23.4%): ifexists --verbose --input-keys node1 --filter-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz --filter-keys id --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalueClaims.unsorted.tsv.gz --reject-file -\n", + "Read 308417147 input records, accepted 368548 records, rejected 308048599 records.\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: not closing standard output\n", 
+ "Timing: elapsed=1:10:16.610469 CPU=0:16:06.036942 ( 22.9%): ifexists --verbose --input-keys node1 --filter-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz --filter-keys id --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalueClaims.unsorted.tsv.gz --reject-file -\n", + "Read 308048599 input records, accepted 14219 records, rejected 308034380 records.\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=1:10:16.726708 CPU=1:09:22.133053 ( 98.7%): ifexists --verbose --input-keys node1 --filter-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz --filter-keys id --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalueClaims.unsorted.tsv.gz --reject-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n", + "Timing: elapsed=1:10:29.856671 CPU=0:00:07.875724 ( 0.2%): filter --verbose --use-mgzip TRUE --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz --first-match-only --pattern ;; novalue -o /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalue.unsorted.tsv.gz --pattern ;; somevalue -o /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalue.unsorted.tsv.gz --reject-file - / ifexists --verbose --input-keys node1 --filter-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz --filter-keys id --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalueClaims.unsorted.tsv.gz --reject-file - / ifexists --verbose --input-keys node1 --filter-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz --filter-keys id --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalueClaims.unsorted.tsv.gz --reject-file - / ifexists --verbose 
--input-keys node1 --filter-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz --filter-keys id --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalueClaims.unsorted.tsv.gz --reject-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n" + ] + } + ], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " filter ${VERBOSE} --use-mgzip ${USE_MGZIP} \\\n", @@ -380,10 +1861,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "261a84b8-c671-4134-afaa-12af4c4a7762", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", + "Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tlang\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz\n", + "File_path.suffix: .gz\n", 
+ "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tlang\n", + "Opening the reject file: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tlang\n", + "Applying a single general filter\n", + "Read 328865880 rows, rejected 82326179 rows, wrote 246539701 rows.\n", + "Keep counts: subject=0, predicate=246539701, object=0.\n", + "Reject counts: subject=0, predicate=82326179, object=0.\n", + "Closing output files.\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "All output files have been closed.\n", + "Timing: elapsed=0:23:09.677401 CPU=1:32:45.933341 (400.5%): filter --verbose --use-mgzip=TRUE --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz --pattern ; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ; --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz --reject-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz\n" + ] + } + ], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " filter ${VERBOSE} --use-mgzip=${USE_MGZIP} \\\n", @@ -404,10 +1922,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "9bce9d69-6032-4e2f-aa04-12da7998d508", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", + "Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz\n", 
+ "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.qualifiers.unsorted.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.qualifiers.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\n", + "Opening the reject file: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.unsorted.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\n", + "Applying a single general filter\n", + "Read 55048589 rows, rejected 13745591 rows, wrote 41302998 rows.\n", + "Keep counts: subject=0, predicate=41302998, object=0.\n", + "Reject counts: subject=0, predicate=13745591, object=0.\n", + "Closing output files.\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "All output files have been closed.\n", + "Timing: elapsed=0:04:06.637752 CPU=0:16:28.598107 (400.8%): filter 
--verbose --use-mgzip=TRUE --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz --pattern ; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ; --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.qualifiers.unsorted.tsv.gz --reject-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.unsorted.tsv.gz\n" + ] + } + ], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " filter ${VERBOSE} --use-mgzip=${USE_MGZIP} \\\n", @@ -428,10 +1983,841 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "90b70419-1894-4dcb-954d-5b83d4c80d48", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sort the claims file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz'\n", + "Monitoring the cat command (pid=175774).\n", + "Running the sort script (pid=175778).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using 
KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 rank node2;wikidatatype\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Monitoring the sort command (pid=175786)\n", + "Cleanup.\n", + "Timing: elapsed=0:38:27.177785 CPU=0:00:06.266422 ( 0.3%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the claims.badvalue file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.badvalue.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.badvalue.tsv.gz'\n", + "gunzip input file: 
'/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.badvalue.tsv.gz'\n", + "Monitoring the cat command (pid=180475).\n", + "Running the sort script (pid=180479).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 rank node2;wikidatatype\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Exception looking for sort command: \n", + "\n", + " RAN: /usr/bin/pgrep -g 180479 --newest sort\n", + "\n", + " STDOUT:\n", + "\n", + "\n", + " STDERR:\n", + "\n", + "Cleanup.\n", + "Timing: elapsed=0:00:06.927161 CPU=0:00:06.170400 ( 89.1%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz --output-file 
/data/amandeep/wikidata-20220505/import-wikidata/data/claims.badvalue.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the claims.novalue file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.novalue.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.novalue.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.novalue.tsv.gz'\n", + "Monitoring the cat command (pid=180571).\n", + "Running the sort script (pid=180575).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 
node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 rank node2;wikidatatype\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Exception looking for sort command: \n", + "\n", + " RAN: /usr/bin/pgrep -g 180575 --newest sort\n", + "\n", + " STDOUT:\n", + "\n", + "\n", + " STDERR:\n", + "\n", + "Cleanup.\n", + "Timing: elapsed=0:00:02.236354 CPU=0:00:05.542310 (247.8%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.novalue.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the claims.somevalue file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.somevalue.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.somevalue.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > 
'/data/amandeep/wikidata-20220505/import-wikidata/data/claims.somevalue.tsv.gz'\n", + "Monitoring the cat command (pid=180673).\n", + "Running the sort script (pid=180677).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 rank node2;wikidatatype\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Exception looking for sort command: \n", + "\n", + " RAN: /usr/bin/pgrep -g 180677 --newest sort\n", + "\n", + " STDOUT:\n", + "\n", + "\n", + " STDERR:\n", + "\n", + "Cleanup.\n", + "Timing: elapsed=0:00:02.156390 CPU=0:00:05.279872 (244.8%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.somevalue.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the qualifiers file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip 
output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz'\n", + "Monitoring the cat command (pid=180779).\n", + "Running the sort script (pid=180783).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 node2;wikidatatype\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Monitoring the sort command 
(pid=180791)\n", + "Cleanup.\n", + "Timing: elapsed=0:04:15.085880 CPU=0:00:05.685372 ( 2.2%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the qualifiers.badvalue file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalue.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalue.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalue.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalue.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalue.tsv.gz'\n", + "Monitoring the cat command (pid=181176).\n", + "Running the sort script (pid=181180).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading 
file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 node2;wikidatatype\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Exception looking for sort command: \n", + "\n", + " RAN: /usr/bin/pgrep -g 181180 --newest sort\n", + "\n", + " STDOUT:\n", + "\n", + "\n", + " STDERR:\n", + "\n", + "Cleanup.\n", + "Timing: elapsed=0:00:03.547380 CPU=0:00:06.041493 (170.3%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalue.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalue.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the qualifiers.badvalueClaims file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalueClaims.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalueClaims.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalueClaims.unsorted.tsv.gz'\n", + "full command: pigz -dc 
'/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalueClaims.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalueClaims.tsv.gz'\n", + "Monitoring the cat command (pid=181268).\n", + "Running the sort script (pid=181272).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 node2;wikidatatype\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Exception looking for sort command: \n", + "\n", + " RAN: /usr/bin/pgrep -g 181272 --newest sort\n", + "\n", + " STDOUT:\n", + "\n", + "\n", + " STDERR:\n", + "\n", + "Cleanup.\n", + "Timing: elapsed=0:00:01.955742 CPU=0:00:05.388535 (275.5%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalueClaims.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalueClaims.tsv.gz --gzip-command pigz 
--sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the qualifiers.novalue file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalue.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalue.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalue.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalue.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalue.tsv.gz'\n", + "Monitoring the cat command (pid=181345).\n", + "Running the sort script (pid=181349).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + 
"KGTK header: id node1 label node2 node2;wikidatatype\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Exception looking for sort command: \n", + "\n", + " RAN: /usr/bin/pgrep -g 181349 --newest sort\n", + "\n", + " STDOUT:\n", + "\n", + "\n", + " STDERR:\n", + "\n", + "Cleanup.\n", + "Timing: elapsed=0:00:02.107548 CPU=0:00:05.496728 (260.8%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalue.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalue.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the qualifiers.novalueClaims file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalueClaims.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalueClaims.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalueClaims.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalueClaims.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalueClaims.tsv.gz'\n", + "Monitoring the cat 
command (pid=181447).\n", + "Running the sort script (pid=181453).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 node2;wikidatatype\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Exception looking for sort command: \n", + "\n", + " RAN: /usr/bin/pgrep -g 181453 --newest sort\n", + "\n", + " STDOUT:\n", + "\n", + "\n", + " STDERR:\n", + "\n", + "Cleanup.\n", + "Timing: elapsed=0:00:02.208194 CPU=0:00:05.602475 (253.7%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalueClaims.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalueClaims.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the qualifiers.somevalue file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: 
'/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalue.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalue.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalue.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalue.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalue.tsv.gz'\n", + "Monitoring the cat command (pid=181543).\n", + "Running the sort script (pid=181547).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 node2;wikidatatype\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input 
file\n", + "Exception looking for sort command: \n", + "\n", + " RAN: /usr/bin/pgrep -g 181547 --newest sort\n", + "\n", + " STDOUT:\n", + "\n", + "\n", + " STDERR:\n", + "\n", + "Cleanup.\n", + "Timing: elapsed=0:00:02.095882 CPU=0:00:05.336625 (254.6%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalue.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalue.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the qualifiers.somevalueClaims file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalueClaims.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalueClaims.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalueClaims.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalueClaims.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalueClaims.tsv.gz'\n", + "Monitoring the cat command (pid=181645).\n", + "Running the sort script (pid=181651).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using 
KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 node2;wikidatatype\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Exception looking for sort command: \n", + "\n", + " RAN: /usr/bin/pgrep -g 181651 --newest sort\n", + "\n", + " STDOUT:\n", + "\n", + "\n", + " STDERR:\n", + "\n", + "Cleanup.\n", + "Timing: elapsed=0:00:02.544968 CPU=0:00:05.689815 (223.6%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalueClaims.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalueClaims.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the aliases file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - 
> '/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz'\n", + "Monitoring the cat command (pid=181750).\n", + "Running the sort script (pid=181754).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\tlang\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 lang\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Monitoring the sort command (pid=181762)\n", + "Cleanup.\n", + "Timing: elapsed=0:01:36.071250 CPU=0:00:05.613290 ( 5.8%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz --gzip-command pigz --sort-command 
sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the aliases.en file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.en.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.en.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.en.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.en.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.en.tsv.gz'\n", + "Monitoring the cat command (pid=181952).\n", + "Running the sort script (pid=181956).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2\n", + "sort options: --parallel 6 --buffer-size 50% 
-T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Monitoring the sort command (pid=181964)\n", + "Cleanup.\n", + "Timing: elapsed=0:00:13.705462 CPU=0:00:05.731224 ( 41.8%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.en.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/aliases.en.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the descriptions file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz'\n", + "Monitoring the cat command (pid=182059).\n", + "Running the sort script (pid=182063).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache 
'/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\tlang\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 lang\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Monitoring the sort command (pid=182071)\n", + "Cleanup.\n", + "Timing: elapsed=1:06:40.955939 CPU=0:00:06.018089 ( 0.2%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the descriptions.en file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.en.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.en.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.en.unsorted.tsv.gz'\n", + "full command: pigz -dc 
'/data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.en.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.en.tsv.gz'\n", + "Monitoring the cat command (pid=186189).\n", + "Running the sort script (pid=186193).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Monitoring the sort command (pid=186201)\n", + "Cleanup.\n", + "Timing: elapsed=0:00:46.518554 CPU=0:00:04.968905 ( 10.7%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.en.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.en.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the labels file.\n", + "Using the sort command 'sort'\n", + "header pipe: 
read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz'\n", + "Monitoring the cat command (pid=186298).\n", + "Running the sort script (pid=186302).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\tlang\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 lang\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + 
"Monitoring the sort command (pid=186310)\n", + "Cleanup.\n", + "Timing: elapsed=0:12:25.459447 CPU=0:00:05.751043 ( 0.8%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the labels.en file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/labels.en.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/labels.en.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/labels.en.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/labels.en.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/labels.en.tsv.gz'\n", + "Monitoring the cat command (pid=187128).\n", + "Running the sort script (pid=187132).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: 
id\tnode1\tlabel\tnode2\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Monitoring the sort command (pid=187140)\n", + "Cleanup.\n", + "Timing: elapsed=0:01:16.965917 CPU=0:00:06.148691 ( 8.0%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.en.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/labels.en.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the sitelinks file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.tsv.gz'\n", + 
"Monitoring the cat command (pid=187343).\n", + "Running the sort script (pid=187347).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\tlang\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 lang\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Monitoring the sort command (pid=187355)\n", + "Cleanup.\n", + "Timing: elapsed=0:01:27.116634 CPU=0:00:06.156457 ( 7.1%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the sitelinks.en file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' 
$options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.tsv.gz'\n", + "Monitoring the cat command (pid=187715).\n", + "Running the sort script (pid=187719).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Monitoring the sort command (pid=187735)\n", + "Cleanup.\n", + "Timing: elapsed=0:00:17.925203 CPU=0:00:05.605642 ( 31.3%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.unsorted.tsv.gz --output-file 
/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the sitelinks.en.qualifiers file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.qualifiers.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.qualifiers.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.qualifiers.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.qualifiers.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.qualifiers.tsv.gz'\n", + "Monitoring the cat command (pid=187877).\n", + "Running the sort script (pid=187881).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special 
columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Monitoring the sort command (pid=187889)\n", + "Cleanup.\n", + "Timing: elapsed=0:00:33.135455 CPU=0:00:05.715740 ( 17.2%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.qualifiers.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.qualifiers.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the sitelinks.qualifiers file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.qualifiers.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.qualifiers.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.qualifiers.tsv.gz'\n", + "Monitoring the cat command (pid=188131).\n", + "Running the sort 
script (pid=188135).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\tlang\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 lang\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Monitoring the sort command (pid=188144)\n", + "Cleanup.\n", + "Timing: elapsed=0:03:19.657764 CPU=0:00:05.317021 ( 2.7%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.qualifiers.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the metadata.node file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.node.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > 
'/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.node.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.node.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.node.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.node.tsv.gz'\n", + "Monitoring the cat command (pid=189086).\n", + "Running the sort script (pid=189090).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\n", + "node1 column not found, assuming this is a KGTK node file\n", + "KgtkReader: is_edge_file=False is_node_file=True\n", + "KgtkReader: Special columns: node1=-1 label=-1 node2=-1 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Monitoring the sort command (pid=189098)\n", + "Cleanup.\n", + "Timing: elapsed=0:00:28.235616 CPU=0:00:05.945252 ( 21.1%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.node.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/metadata.node.tsv.gz --gzip-command pigz --sort-command sort --extra 
--parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the metadata.property.datatypes file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.property.datatypes.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.property.datatypes.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.property.datatypes.tsv.gz'\n", + "Monitoring the cat command (pid=189196).\n", + "Running the sort script (pid=189200).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", 
+ "KGTK header: id node1 label node2\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Exception looking for sort command: \n", + "\n", + " RAN: /usr/bin/pgrep -g 189200 --newest sort\n", + "\n", + " STDOUT:\n", + "\n", + "\n", + " STDERR:\n", + "\n", + "Cleanup.\n", + "Timing: elapsed=0:00:02.100170 CPU=0:00:05.558517 (264.7%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/metadata.property.datatypes.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", + "Sort the metadata.types file.\n", + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.types.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.types.tsv.gz'\n", + "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz'\n", + "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.types.tsv.gz'\n", + "Monitoring the cat command (pid=189262).\n", + "Running the sort script 
(pid=189268).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "\n", + "Done reading the input file\n", + "Monitoring the sort command (pid=189274)\n", + "Cleanup.\n", + "Timing: elapsed=0:00:36.869484 CPU=0:00:05.592611 ( 15.2%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/metadata.types.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n" + ] + } + ], "source": [ "for TARGET in WIKIDATA_IMPORT_SPLIT_FILES:\n", " print(f\"Sort the {TARGET} file.\")\n", @@ -458,10 +2844,205 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "47560fa3-d87e-4840-800e-ecdf7d1d4341", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using the sort command 'sort'\n", + "header pipe: read_fd=4 write_fd=5\n", + "sort options pipe: read_fd=6 write_fd=7\n", + 
"gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/all.tsv.gz'\n", + "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/all.tsv.gz'\n", + "Running the sort script (pid=158825).\n", + "Reading the KGTK input file header line with KgtkReader\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Starting kgtkcat pid=158741\n", + "Opening the 9 input files.\n", + "Opening file 1: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: reading file descriptor 4\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "The output file will be an edge file.\n", + "Mapping the 6 column names in 
/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz.\n", + "Opening file 2: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Mapping the 5 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz.\n", + "Opening file 3: /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tlang\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: 
is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Mapping the 5 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz.\n", + "Opening file 4: /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tlang\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Mapping the 5 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz.\n", + "Opening file 5: /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: 
/data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tlang\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Mapping the 5 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz.\n", + "Opening file 6: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tlang\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Mapping the 5 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz.\n", + "Opening file 7: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file 
'/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tlang\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Mapping the 5 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz.\n", + "Opening file 8: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Mapping the 4 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz.\n", + "Opening file 9: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz\n", + "input format: 
kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Mapping the 4 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz.\n", + "There are 7 merged columns.\n", + "Opening the output edge file: -\n", + "KgtkWriter: writing stdout\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\tlang\n", + "Copying data from file 1: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz\n", + "Shortcut not possible: len(kr.column_names)=6 != len(kw.column_names)=7\n", + "Row by row file copy\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\tlang\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "KGTK header: id node1 label node2 rank node2;wikidatatype lang\n", + "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", + "\n", + "Waiting for the sort command to complete.\n", + "\n", + "Read 
1361968102 data lines from file 1: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz\n", + "Copying data from file 2: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n", + "Shortcut not possible: len(kr.column_names)=5 != len(kw.column_names)=7\n", + "Row by row file copy with a shuffle list: 0 1 2 3 5\n", + "Read 308034380 data lines from file 2: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n", + "Copying data from file 3: /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz\n", + "Shortcut not possible: len(kr.column_names)=5 != len(kw.column_names)=7\n", + "Row by row file copy with a shuffle list: 0 1 2 3 6\n", + "Read 170178120 data lines from file 3: /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz\n", + "Copying data from file 4: /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz\n", + "Shortcut not possible: len(kr.column_names)=5 != len(kw.column_names)=7\n", + "Row by row file copy with a shuffle list: 0 1 2 3 6\n", + "Read 2670247344 data lines from file 4: /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz\n", + "Copying data from file 5: /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz\n", + "Shortcut not possible: len(kr.column_names)=5 != len(kw.column_names)=7\n", + "Row by row file copy with a shuffle list: 0 1 2 3 6\n", + "Read 739125735 data lines from file 5: /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz\n", + "Copying data from file 6: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz\n", + "Shortcut not possible: len(kr.column_names)=5 != len(kw.column_names)=7\n", + "Row by row file copy with a shuffle list: 0 1 2 3 6\n", + "Read 82326179 data lines from file 6: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz\n", 
+ "Copying data from file 7: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz\n", + "Shortcut not possible: len(kr.column_names)=5 != len(kw.column_names)=7\n", + "Row by row file copy with a shuffle list: 0 1 2 3 6\n", + "Read 246539701 data lines from file 7: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz\n", + "Copying data from file 8: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz\n", + "Shortcut not possible: len(kr.column_names)=4 != len(kw.column_names)=7\n", + "Row by row file copy\n", + "Read 96951235 data lines from file 8: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz\n", + "Copying data from file 9: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz\n", + "Shortcut not possible: len(kr.column_names)=4 != len(kw.column_names)=7\n", + "Row by row file copy\n", + "Read 9984 data lines from file 9: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz\n", + "Wrote 5675380780 lines total from 9 files\n", + "KgtkWriter: not closing standard output\n", + "Timing: elapsed=7:05:38.707032 CPU=6:22:33.602783 ( 89.9%): cat --verbose --use-mgzip=TRUE --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz --input-file 
/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz\n", + "Cleanup.\n", + "Timing: elapsed=8:15:48.511512 CPU=0:00:06.981474 ( 0.0%): sort --verbose --gzip-command pigz --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/all.tsv.gz\n", + "Timing: elapsed=8:15:54.400740 CPU=0:00:10.688431 ( 0.0%): cat --verbose --use-mgzip=TRUE --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz / sort --verbose --gzip-command pigz --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/all.tsv.gz\n" + ] + } + ], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " cat ${VERBOSE} --use-mgzip=${USE_MGZIP} \\\n", @@ -491,10 +3072,53 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "68f89d90-9094-4bff-91ee-911bd2c7773d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True 
is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.unclaimed.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.unclaimed.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 308034380 input records, accepted 0 records, rejected 308034380 records.\n", + "Read 1361968102 filter records, 271770825 found matching input records, 1090197277 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=1:04:24.282472 CPU=1:01:51.930236 ( 96.1%): ifnotexists --verbose --use-mgzip=TRUE --presorted --input-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz --input-keys node1 --filter-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz --filter-keys id --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.unclaimed.tsv.gz\n" + ] + } + ], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " ifnotexists $VERBOSE --use-mgzip=$USE_MGZIP --presorted \\\n", @@ -516,10 +3140,125 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "8dfffa80-b5c7-4999-96be-0611401c4ffc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", + "Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\n", + "input format: kgtk\n", + "Using 
KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.commonsMedia.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.commonsMedia.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.external-id.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.external-id.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.geo-shape.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.geo-shape.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.globe-coordinate.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: 
writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.globe-coordinate.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.math.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.math.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.monolingualtext.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.monolingualtext.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.musical-notation.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.musical-notation.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.quantity.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.quantity.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.string.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.string.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tabular-data.tsv.gz\n", + "File_path.suffix: .gz\n", + 
"KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tabular-data.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.time.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.time.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.url.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.url.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-form.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-form.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-item.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-item.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-lexeme.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-lexeme.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-property.tsv.gz\n", + "File_path.suffix: 
.gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-property.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-sense.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-sense.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Opening the reject file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.other.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.other.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Applying a dispatched multiple-output object filter\n", + "Read 1361968102 rows, rejected 0 rows, wrote 1361968102 rows.\n", + "Closing output files.\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "KgtkWriter: closing the output file\n", + "All output files have been closed.\n", + "Timing: elapsed=1:28:16.121467 CPU=4:51:32.610135 (330.3%): filter --verbose --input-file 
/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz --obj node2;wikidatatype --first-match-only --pattern ;;commonsMedia --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.commonsMedia.tsv.gz --pattern ;;external-id --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.external-id.tsv.gz --pattern ;;geo-shape --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.geo-shape.tsv.gz --pattern ;;globe-coordinate --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.globe-coordinate.tsv.gz --pattern ;;math --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.math.tsv.gz --pattern ;;monolingualtext --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.monolingualtext.tsv.gz --pattern ;;musical-notation --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.musical-notation.tsv.gz --pattern ;;quantity --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.quantity.tsv.gz --pattern ;;string --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.string.tsv.gz --pattern ;;tabular-data --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tabular-data.tsv.gz --pattern ;;time --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.time.tsv.gz --pattern ;;url --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.url.tsv.gz --pattern ;;wikibase-form --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-form.tsv.gz --pattern ;;wikibase-item --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-item.tsv.gz --pattern ;;wikibase-lexeme --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-lexeme.tsv.gz --pattern ;;wikibase-property --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-property.tsv.gz 
--pattern ;;wikibase-sense --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-sense.tsv.gz --reject-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.other.tsv.gz --use-mgzip TRUE\n" + ] + } + ], "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " filter ${VERBOSE} \\\n", @@ -575,10 +3314,683 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "05788888-c90a-4841-a943-35fcdee72668", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extract any qualifiers for the properties in claims.commonsMedia\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.commonsMedia.tsv.gz\n", + "input format: 
kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.commonsMedia.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.commonsMedia.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.commonsMedia.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.commonsMedia.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 464056 records, rejected 305315947 records.\n", + "Read 5426154 filter records, 376326 found matching input records, 5049828 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:10:47.085673 CPU=0:10:50.451530 (100.5%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.commonsMedia.tsv.gz --output-file 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.commonsMedia.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.external-id\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.external-id.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.external-id.tsv.gz' not found in the cache.\n", + 
"KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.external-id.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.external-id.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.external-id.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 6309903 records, rejected 299470100 records.\n", + "Read 188875219 filter records, 3510610 found matching input records, 185364608 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:16:52.937288 CPU=0:19:32.692672 (115.8%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.external-id.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.external-id.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.geo-shape\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.geo-shape.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.geo-shape.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.geo-shape.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special 
columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.geo-shape.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.geo-shape.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 1754 records, rejected 305778249 records.\n", + "Read 28215 filter records, 1396 found matching input records, 26819 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:10:18.439706 CPU=0:10:17.922182 ( 99.9%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.geo-shape.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.geo-shape.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.globe-coordinate\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not 
found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.globe-coordinate.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.globe-coordinate.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.globe-coordinate.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.globe-coordinate.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.globe-coordinate.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 162832 records, rejected 305617171 records.\n", + "Read 9156940 filter records, 155142 found matching input records, 9001798 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:10:32.063434 CPU=0:10:39.611307 (101.2%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.globe-coordinate.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.globe-coordinate.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.math\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge 
file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.math.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.math.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.math.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.math.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.math.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 3807 records, rejected 305776196 records.\n", + "Read 24996 filter records, 3726 found matching input records, 21270 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:10:43.502118 CPU=0:10:46.397159 (100.4%): 
ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.math.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.math.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.monolingualtext\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.monolingualtext.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache 
'/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.monolingualtext.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.monolingualtext.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.monolingualtext.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.monolingualtext.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 317870 records, rejected 305462133 records.\n", + "Read 47753791 filter records, 231442 found matching input records, 47522349 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:12:29.399306 CPU=0:13:21.059256 (106.9%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.monolingualtext.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.monolingualtext.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any 
qualifiers for the properties in claims.musical-notation\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.musical-notation.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.musical-notation.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.musical-notation.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.musical-notation.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.musical-notation.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 62 records, rejected 305779941 records.\n", + "Read 942 filter records, 38 found matching input records, 904 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:10:41.608581 CPU=0:10:44.486667 (100.4%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.musical-notation.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.musical-notation.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.quantity\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using 
KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.quantity.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.quantity.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.quantity.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: 
node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.quantity.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.quantity.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 54751054 records, rejected 251028949 records.\n", + "Read 86267605 filter records, 49747714 found matching input records, 36519891 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:15:38.545722 CPU=0:28:59.081318 (185.3%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.quantity.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.quantity.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.string\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + 
"KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.string.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.string.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.string.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.string.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.string.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 
166537462 records, rejected 139242541 records.\n", + "Read 286774252 filter records, 163568733 found matching input records, 123205519 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:26:46.648710 CPU=1:07:50.078828 (253.3%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.string.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.string.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.tabular-data\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tabular-data.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tabular-data.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tabular-data.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tabular-data.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tabular-data.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 12355 records, rejected 305767648 records.\n", + "Read 22880 filter records, 12334 found matching input records, 10546 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:10:29.056178 CPU=0:10:29.203481 (100.0%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tabular-data.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tabular-data.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.time\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.time.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file 
'/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.time.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.time.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.time.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.time.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 1048220 records, rejected 304731783 records.\n", + "Read 54361593 filter records, 751395 found matching input records, 53610197 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:12:35.679876 CPU=0:13:09.415386 (104.5%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.time.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.time.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.url\n", + "KgtkIfEfexists version: 
2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.url.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.url.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.url.tsv.gz\n", + "header: 
id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.url.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.url.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 2382690 records, rejected 303397313 records.\n", + "Read 8328249 filter records, 1750479 found matching input records, 6577770 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:11:04.391601 CPU=0:11:27.257304 (103.4%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.url.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.url.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.wikibase-form\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache 
'/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-form.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-form.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-form.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-form.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-form.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 28 records, rejected 305779975 records.\n", + "Read 8241 filter records, 25 found matching input records, 8216 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:10:17.631748 CPU=0:10:14.786283 ( 99.5%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-form.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-form.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.wikibase-item\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-item.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-item.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-item.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-item.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-item.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 73786828 
records, rejected 231993175 records.\n", + "Read 670635690 filter records, 50007257 found matching input records, 620628432 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:34:14.591949 CPU=0:52:31.248502 (153.4%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-item.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-item.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.wikibase-lexeme\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-lexeme.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-lexeme.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-lexeme.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-lexeme.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-lexeme.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 338 records, rejected 305779665 records.\n", + "Read 4524 filter records, 279 found matching input records, 4245 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:10:08.115616 CPU=0:10:05.726132 ( 99.6%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-lexeme.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-lexeme.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.wikibase-property\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-property.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file 
'/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-property.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-property.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-property.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-property.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 683 records, rejected 305779320 records.\n", + "Read 39288 filter records, 552 found matching input records, 38736 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:10:21.397888 CPU=0:10:20.181629 ( 99.8%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-property.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-property.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.wikibase-sense\n", + "KgtkIfEfexists version: 
2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-sense.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-sense.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-sense.tsv.gz\n", + "header: 
id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-sense.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-sense.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 61 records, rejected 305779942 records.\n", + "Read 47 filter records, 46 found matching input records, 1 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:09:44.827663 CPU=0:09:42.992229 ( 99.7%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-sense.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-sense.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", + "Extract any qualifiers for the properties in claims.other\n", + "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache 
'/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.other.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.other.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.other.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Input key columns: node1\n", + "Filter key columns: id\n", + "Opening the output file: 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.other.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.other.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Processing presorted files.\n", + "Read 305780003 input records, accepted 0 records, rejected 305780003 records.\n", + "Read 0 filter records, 0 found matching input records, 0 did not find matches.\n", + "KgtkWriter: closing the output file\n", + "Timing: elapsed=0:10:16.915168 CPU=0:10:14.315237 ( 99.6%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.other.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.other.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n" + ] + } + ], "source": [ "for TARGET in WIKIDATATYPES:\n", " print(f\"Extract any qualifiers for the properties in claims.{TARGET}\")\n", @@ -605,10 +4017,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "d48fdaf8-d5cd-4e2b-a2c5-24be1d82d41b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file 
'/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.properties.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.properties.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", + "Applying a multiple-output general regex match filter\n", + "Read 1357708626 rows, rejected 1357510220 rows, wrote 198406 rows.\n", + "Keep counts: subject=198406, predicate=0, object=0.\n", + "Reject counts: subject=1357510220, predicate=0, object=0.\n", + "Closing output files.\n", + "KgtkWriter: closing the output file\n", + "All output files have been closed.\n", + "Timing: elapsed=0:51:41.926642 CPU=0:51:39.721178 ( 99.9%): filter --verbose --use-mgzip=TRUE --regex --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tsv.gz -p ^P ;; -o /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.properties.tsv.gz\n" + ] + } + ], "source": [ "!kgtk $KGTK_FLAGS filter $VERBOSE --use-mgzip=$USE_MGZIP --regex\\\n", " --input-file $DATADIR/claims.$SORTED_KGTK \\\n", @@ -626,10 +4070,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "d25d8095-dae2-406e-9a28-bc6ab74429f6", "metadata": {}, - 
"outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", + "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "input format: kgtk\n", + "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", + "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", + "KgtkReader: OK to use the fast read path.\n", + "KgtkReader: File_path.suffix: .gz\n", + "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "node1 column found, this is a KGTK edge file\n", + "KgtkReader: is_edge_file=True is_node_file=False\n", + "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", + "KgtkReader: Reading a kgtk file using the fast path.\n", + "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.properties.tsv.gz\n", + "File_path.suffix: .gz\n", + "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.properties.tsv.gz\n", + "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", + "Applying a multiple-output general regex match filter\n", + "Read 305780003 rows, rejected 305620319 rows, wrote 159684 rows.\n", + "Keep counts: subject=159684, predicate=0, object=0.\n", + "Reject counts: subject=305620319, predicate=0, object=0.\n", + "Closing output files.\n", + "KgtkWriter: closing the output file\n", + "All 
output files have been closed.\n", + "Timing: elapsed=0:11:44.249832 CPU=0:11:37.552965 ( 99.0%): filter --verbose --use-mgzip=TRUE --regex --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz -p ^P ;; -o /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.properties.tsv.gz\n" + ] + } + ], "source": [ "!kgtk $KGTK_FLAGS filter $VERBOSE --use-mgzip=$USE_MGZIP --regex \\\n", " --input-file $DATADIR/qualifiers.$SORTED_KGTK \\\n", @@ -647,7 +4123,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 13, "id": "44cb612c-a71a-43e9-a3dc-44410e68fdd2", "metadata": {}, "outputs": [ @@ -655,71 +4131,71 @@ "name": "stdout", "output_type": "stream", "text": [ - "total 213464\n", - "-rw-r--r-- 1 amandeep staff 9.2M Nov 2 15:38 claims.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 4.7K Nov 2 15:38 claims.badvalue.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 3.2K Nov 2 15:38 claims.novalue.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 1.4K Nov 2 15:38 claims.somevalue.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 2.5M Nov 2 15:38 qualifiers.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 2.0K Nov 2 15:38 qualifiers.badvalue.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 2.3K Nov 2 15:38 qualifiers.badvalueClaims.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 1.9K Nov 2 15:38 qualifiers.novalue.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 1.4K Nov 2 15:38 qualifiers.novalueClaims.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 4.9K Nov 2 15:38 qualifiers.somevalue.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 1.7K Nov 2 15:38 qualifiers.somevalueClaims.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 2.4M Nov 2 15:38 aliases.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 190K Nov 2 15:38 aliases.en.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 4.2M Nov 2 15:38 descriptions.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 174K Nov 2 15:38 descriptions.en.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 5.7M Nov 2 15:38 labels.tsv.gz\n", - "-rw-r--r-- 1 
amandeep staff 147K Nov 2 15:38 labels.en.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 7.2M Nov 2 15:38 sitelinks.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 347K Nov 2 15:38 sitelinks.en.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 662K Nov 2 15:38 sitelinks.en.qualifiers.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 14M Nov 2 15:38 sitelinks.qualifiers.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 23K Nov 2 15:38 metadata.node.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 37B Nov 2 15:38 metadata.property.datatypes.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 47K Nov 2 15:38 metadata.types.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 46M Nov 2 15:58 all.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 87B Nov 2 15:59 qualifiers.unclaimed.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 474K Nov 2 16:01 claims.commonsMedia.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 3.7M Nov 2 16:01 claims.external-id.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 6.3K Nov 2 16:01 claims.geo-shape.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 166K Nov 2 16:01 claims.globe-coordinate.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 1.5K Nov 2 16:01 claims.math.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 250K Nov 2 16:01 claims.monolingualtext.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 359B Nov 2 16:01 claims.musical-notation.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 659K Nov 2 16:01 claims.quantity.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 434K Nov 2 16:01 claims.string.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 217B Nov 2 16:01 claims.tabular-data.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 176K Nov 2 16:01 claims.time.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 125K Nov 2 16:01 claims.url.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 175B Nov 2 16:01 claims.wikibase-form.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 2.7M Nov 2 16:01 claims.wikibase-item.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 315B Nov 2 16:01 claims.wikibase-lexeme.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 2.6K Nov 2 16:01 claims.wikibase-property.tsv.gz\n", - "-rw-r--r-- 1 amandeep 
staff 93B Nov 2 16:01 claims.wikibase-sense.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 84B Nov 2 16:01 claims.other.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 96K Nov 2 16:05 qualifiers.commonsMedia.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 373K Nov 2 16:05 qualifiers.external-id.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 175B Nov 2 16:05 qualifiers.geo-shape.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 3.2K Nov 2 16:05 qualifiers.globe-coordinate.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 744B Nov 2 16:05 qualifiers.math.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 29K Nov 2 16:05 qualifiers.monolingualtext.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 165B Nov 2 16:05 qualifiers.musical-notation.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 767K Nov 2 16:05 qualifiers.quantity.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 78K Nov 2 16:06 qualifiers.string.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 90B Nov 2 16:06 qualifiers.tabular-data.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 16K Nov 2 16:06 qualifiers.time.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 28K Nov 2 16:06 qualifiers.url.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 91B Nov 2 16:06 qualifiers.wikibase-form.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 1.0M Nov 2 16:06 qualifiers.wikibase-item.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 93B Nov 2 16:06 qualifiers.wikibase-lexeme.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 304B Nov 2 16:06 qualifiers.wikibase-property.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 92B Nov 2 16:06 qualifiers.wikibase-sense.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 83B Nov 2 16:06 qualifiers.other.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 89B Nov 2 16:08 claims.properties.tsv.gz\n", - "-rw-r--r-- 1 amandeep staff 88B Nov 2 16:09 qualifiers.properties.tsv.gz\n" + "total 115G\n", + "-rw-r--r-- 1 amandeep isdstaff 28G Apr 15 11:15 claims.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 994K Apr 15 11:16 claims.badvalue.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.7M Apr 15 11:16 claims.novalue.tsv.gz\n", + "-rw-r--r-- 
1 amandeep isdstaff 5.4M Apr 15 11:16 claims.somevalue.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 5.4G Apr 15 11:20 qualifiers.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 592K Apr 15 11:20 qualifiers.badvalue.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 242K Apr 15 11:20 qualifiers.badvalueClaims.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 862K Apr 15 11:20 qualifiers.novalue.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 573K Apr 15 11:20 qualifiers.novalueClaims.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 2.1M Apr 15 11:20 qualifiers.somevalue.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 6.7M Apr 15 11:20 qualifiers.somevalueClaims.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 184M Apr 15 11:20 aliases.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 181M Apr 15 11:20 aliases.en.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 694M Apr 15 11:21 descriptions.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 692M Apr 15 11:22 descriptions.en.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 2.2G Apr 15 11:23 labels.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 2.2G Apr 15 11:25 labels.en.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.8G Apr 15 11:26 sitelinks.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 287M Apr 15 11:27 sitelinks.en.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 507M Apr 15 11:27 sitelinks.en.qualifiers.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 3.3G Apr 15 11:30 sitelinks.qualifiers.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 204M Apr 15 11:31 metadata.node.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 54K Apr 15 11:31 metadata.property.datatypes.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 455M Apr 15 11:31 metadata.types.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 40G Apr 15 14:41 all.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 87 Apr 15 15:51 qualifiers.unclaimed.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 183M Apr 15 17:20 claims.commonsMedia.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 4.1G Apr 15 17:20 claims.external-id.tsv.gz\n", + "-rw-r--r-- 1 
amandeep isdstaff 778K Apr 15 17:20 claims.geo-shape.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 226M Apr 15 17:20 claims.globe-coordinate.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 686K Apr 15 17:20 claims.math.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 2.3G Apr 15 17:20 claims.monolingualtext.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 28K Apr 15 17:20 claims.musical-notation.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 2.0G Apr 15 17:21 claims.quantity.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 5.7G Apr 15 17:21 claims.string.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 420K Apr 15 17:21 claims.tabular-data.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 850M Apr 15 17:21 claims.time.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 205M Apr 15 17:21 claims.url.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 115K Apr 15 17:21 claims.wikibase-form.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 9.6G Apr 15 17:21 claims.wikibase-item.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 74K Apr 15 17:21 claims.wikibase-lexeme.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 634K Apr 15 17:21 claims.wikibase-property.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 960 Apr 15 17:21 claims.wikibase-sense.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 84 Apr 15 17:21 claims.other.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 15M Apr 15 17:32 qualifiers.commonsMedia.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 150M Apr 15 17:49 qualifiers.external-id.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 29K Apr 15 17:59 qualifiers.geo-shape.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 2.8M Apr 15 18:09 qualifiers.globe-coordinate.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 85K Apr 15 18:20 qualifiers.math.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 6.9M Apr 15 18:33 qualifiers.monolingualtext.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.8K Apr 15 18:43 qualifiers.musical-notation.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 891M Apr 15 18:59 qualifiers.quantity.tsv.gz\n", + "-rw-r--r-- 1 amandeep 
isdstaff 2.7G Apr 15 19:26 qualifiers.string.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 201K Apr 15 19:36 qualifiers.tabular-data.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 16M Apr 15 19:49 qualifiers.time.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 39M Apr 15 20:00 qualifiers.url.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.1K Apr 15 20:10 qualifiers.wikibase-form.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.3G Apr 15 20:45 qualifiers.wikibase-item.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 9.2K Apr 15 20:55 qualifiers.wikibase-lexeme.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 18K Apr 15 21:05 qualifiers.wikibase-property.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 1.6K Apr 15 21:15 qualifiers.wikibase-sense.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 83 Apr 15 21:25 qualifiers.other.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 3.8M Apr 15 22:17 claims.properties.tsv.gz\n", + "-rw-r--r-- 1 amandeep isdstaff 2.8M Apr 15 22:29 qualifiers.properties.tsv.gz\n" ] } ], @@ -738,9 +4214,9 @@ ], "metadata": { "kernelspec": { - "display_name": "kgtk-env", + "display_name": "kgtk-env-ckg07", "language": "python", - "name": "kgtk-env" + "name": "kgtk-env-ckg07" }, "language_info": { "codemirror_mode": { @@ -752,7 +4228,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.12" } }, "nbformat": 4, From 4799c05ba294746bd5b2f242a03b1bb67df9c215 Mon Sep 17 00:00:00 2001 From: saggu Date: Mon, 23 May 2022 11:06:04 -0700 Subject: [PATCH 04/21] use https for sitelinks, PEP 8 fixes --- kgtk/cli/import_wikidata.py | 737 ++++++++++++++++++++---------------- 1 file changed, 421 insertions(+), 316 deletions(-) diff --git a/kgtk/cli/import_wikidata.py b/kgtk/cli/import_wikidata.py index 8f4ac1999..efb0b628f 100644 --- a/kgtk/cli/import_wikidata.py +++ b/kgtk/cli/import_wikidata.py @@ -24,6 +24,7 @@ import typing from kgtk.cli_argparse import KGTKArgumentParser, KGTKFiles + def parser(): return { 'help': 
'Import an wikidata file into KGTK file' @@ -40,7 +41,7 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names from kgtk.io.kgtkreader import KgtkReaderOptions from kgtk.io.kgtkwriter import KgtkWriter from kgtk.value.kgtkvalueoptions import KgtkValueOptions - + _expert: bool = parsed_shared_args._expert parser.add_input_file(positional=True, who='input path file (may be .bz2)') @@ -77,7 +78,8 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names const=True, default=False, metavar="True/False", - help="If true, use a single queue for worker tasks. If false, each worker has its own task queue. (default=%(default)s).", + help="If true, use a single queue for worker tasks. " + "If false, each worker has its own task queue. (default=%(default)s).", ) parser.add_argument( @@ -88,7 +90,8 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names const=True, default=False, metavar="True/False", - help="If true, collect the results before writing to disk. If false, write results to disk, then concatenate. (default=%(default)s).", + help="If true, collect the results before writing to disk. " + "If false, write results to disk, then concatenate. (default=%(default)s).", ) parser.add_argument( @@ -99,7 +102,8 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names const=True, default=False, metavar="True/False", - help="If true, collect the node, edge, and qualifier results using seperate processes. If false, collect the results with a single process. (default=%(default)s).", + help="If true, collect the node, edge, and qualifier results using seperate processes. " + "If false, collect the results with a single process. 
(default=%(default)s).", ) parser.add_argument( @@ -296,7 +300,7 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names action="store_true", dest="deprecated", help='option to include deprecated statements, not included by default') - + parser.add_argument( "--explode-values", nargs='?', @@ -375,7 +379,7 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names ) parser.add_argument( - "--alias-edges", + "--alias-edges", nargs='?', type=optional_bool, dest="alias_edges", @@ -385,9 +389,8 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names help="If true, create edge records for aliases. (default=%(default)s).", ) - parser.add_argument( - "--datatype-edges", + "--datatype-edges", nargs='?', type=optional_bool, dest="datatype_edges", @@ -397,9 +400,8 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names help="If true, create edge records for property datatypes. (default=%(default)s).", ) - parser.add_argument( - "--description-edges", + "--description-edges", nargs='?', type=optional_bool, dest="descr_edges", @@ -409,7 +411,6 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names help="If true, create edge records for descriptions. (default=%(default)s).", ) - parser.add_argument( "--label-edges", nargs='?', @@ -475,7 +476,7 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names metavar="True/False", help="If true, parse descriptions. (default=%(default)s).", ) - + parser.add_argument( "--parse-labels", nargs='?', @@ -528,9 +529,10 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names const=True, default=False, metavar="True/False", - help="If true, override --lang and import aliases, dscriptions, and labels in all languages. (default=%(default)s).", + help="If true, override --lang and import aliases, dscriptions, and labels in all languages. 
" + "(default=%(default)s).", ) - + parser.add_argument( "--warn-if-missing", nargs='?', @@ -549,7 +551,7 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names dest="progress_interval", default=500000, help='How often to report progress. (default=%(default)d)') - + parser.add_argument( "--use-kgtkwriter", nargs='?', @@ -608,7 +610,8 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names type=int, dest="claim_id_hash_width", default=0, - help='How many characters should be used to hash the claim ID? 0 means do not hash the claim ID. (default=%(default)d)') + help='How many characters should be used to hash the claim ID? 0 means do not hash the claim ID. ' + '(default=%(default)d)') parser.add_argument( "--clean", @@ -655,12 +658,14 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names default=False, metavar="True/False", help="If true, skip output record validation. (default=%(default)s).", - ) - + ) + KgtkValueOptions.add_arguments(parser, expert=_expert) -def custom_progress()->bool: - return True # We want to start a custom progress monitor. + +def custom_progress() -> bool: + return True # We want to start a custom progress monitor. + def run(input_file: KGTKFiles, procs: int, @@ -731,9 +736,8 @@ def run(input_file: KGTKFiles, clean_input_values: bool, clean_verbose: bool, skip_validation: bool, - **kwargs # Whatever KgtkValueOptions wants. + **kwargs # Whatever KgtkValueOptions wants. 
): - # import modules locally import bz2 import simplejson as json @@ -756,7 +760,7 @@ def run(input_file: KGTKFiles, value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) - languages=lang.split(',') + languages = lang.split(',') ADDL_SITELINK_LABEL: str = "addl_wikipedia_sitelink" ALIAS_LABEL: str = "alias" @@ -771,11 +775,11 @@ def run(input_file: KGTKFiles, TYPE_LABEL: str = "type" SNAKTYPE_NOVALUE: str = "novalue" - SNAKTYPE_SOMEVALUE: str = "somevalue" + SNAKTYPE_SOMEVALUE: str = "somevalue" SNAKTYPE_VALUE: str = "value" NOVALUE_VALUE: str = "novalue" - SOMEVALUE_VALUE: str = "somevalue" + SOMEVALUE_VALUE: str = "somevalue" CLAIM_TYPE_STATEMENT: str = "statement" @@ -806,14 +810,13 @@ class MyMapper(pyrallel.Mapper): def enter(self): print("Starting worker process {} (pid {}).".format(self._idx, os.getpid()), file=sys.stderr, flush=True) - self.first=True - self.cnt=0 - self.write_mode='w' + self.first = True + self.cnt = 0 + self.write_mode = 'w' - self.node_f = None if node_file and not collect_results: - self.node_f = open(node_file+'_{}'.format(self._idx), self.write_mode, newline='') + self.node_f = open(node_file + '_{}'.format(self._idx), self.write_mode, newline='') self.node_wr = csv.writer( self.node_f, quoting=csv.QUOTE_NONE, @@ -821,10 +824,10 @@ def enter(self): escapechar="\n", quotechar='', lineterminator=csv_line_terminator) - + self.edge_f = None if detailed_edge_file and not collect_results: - self.edge_f = open(detailed_edge_file+'_{}'.format(self._idx), self.write_mode, newline='') + self.edge_f = open(detailed_edge_file + '_{}'.format(self._idx), self.write_mode, newline='') self.edge_wr = csv.writer( self.edge_f, quoting=csv.QUOTE_NONE, @@ -832,10 +835,10 @@ def enter(self): escapechar="\n", quotechar='', lineterminator=csv_line_terminator) - + self.qual_f = None if detailed_qual_file and not collect_results: - self.qual_f = open(detailed_qual_file+'_{}'.format(self._idx), self.write_mode, newline='') + self.qual_f = 
open(detailed_qual_file + '_{}'.format(self._idx), self.write_mode, newline='') self.qual_wr = csv.writer( self.qual_f, quoting=csv.QUOTE_NONE, @@ -846,7 +849,7 @@ def enter(self): self.invalid_edge_f = None if invalid_edge_file and not collect_results: - self.invalid_edge_f = open(invalid_edge_file+'_{}'.format(self._idx), self.write_mode, newline='') + self.invalid_edge_f = open(invalid_edge_file + '_{}'.format(self._idx), self.write_mode, newline='') self.invalid_edge_wr = csv.writer( self.invalid_edge_f, quoting=csv.QUOTE_NONE, @@ -854,10 +857,10 @@ def enter(self): escapechar="\n", quotechar='', lineterminator=csv_line_terminator) - + self.invalid_qual_f = None if invalid_qual_file and not collect_results: - self.invalid_qual_f = open(invalid_qual_file+'_{}'.format(self._idx), self.write_mode, newline='') + self.invalid_qual_f = open(invalid_qual_file + '_{}'.format(self._idx), self.write_mode, newline='') self.invalid_qual_wr = csv.writer( self.invalid_qual_f, quoting=csv.QUOTE_NONE, @@ -868,14 +871,14 @@ def enter(self): if collect_results and collector_batch_size > 1: self.collector_batch_cnt = 0 - self.collector_nrows_batch = [ ] - self.collector_erows_batch = [ ] - self.collector_qrows_batch = [ ] - self.collector_invalid_erows_batch = [ ] - self.collector_invalid_qrows_batch = [ ] + self.collector_nrows_batch = [] + self.collector_erows_batch = [] + self.collector_qrows_batch = [] + self.collector_invalid_erows_batch = [] + self.collector_invalid_qrows_batch = [] - self.collector_description_erows_batch = [ ] - self.collector_sitelink_erows_batch = [ ] + self.collector_description_erows_batch = [] + self.collector_sitelink_erows_batch = [] self.process_row_data = \ node_file or \ @@ -889,10 +892,10 @@ def exit(self, *args, **kwargs): if collect_results: if collector_batch_size > 1: if len(self.collector_nrows_batch) > 0 or \ - len(self.collector_erows_batch) > 0 or \ - len(self.collector_qrows_batch) > 0 or \ - len(self.collector_invalid_erows_batch) > 0 
or \ - len(self.collector_invalid_qrows_batch) > 0: + len(self.collector_erows_batch) > 0 or \ + len(self.collector_qrows_batch) > 0 or \ + len(self.collector_invalid_erows_batch) > 0 or \ + len(self.collector_invalid_qrows_batch) > 0: if collect_seperately: if len(self.collector_nrows_batch) > 0: node_collector_q.put(("rows", self.collector_nrows_batch, [], [], [], [], None)) @@ -901,14 +904,18 @@ def exit(self, *args, **kwargs): if len(self.collector_qrows_batch) > 0: qual_collector_q.put(("rows", [], [], self.collector_qrows_batch, [], [], None)) if len(self.collector_invalid_erows_batch) > 0: - invalid_edge_collector_q.put(("rows", [], [], [], self.collector_invalid_erows_batch, [], None)) + invalid_edge_collector_q.put( + ("rows", [], [], [], self.collector_invalid_erows_batch, [], None)) if len(self.collector_invalid_qrows_batch) > 0: - invalid_qual_collector_q.put(("rows", [], [], [], [], self.collector_invalid_qrows_batch, None)) + invalid_qual_collector_q.put( + ("rows", [], [], [], [], self.collector_invalid_qrows_batch, None)) if len(self.collector_description_erows_batch) > 0: - description_collector_q.put(("rows", [], self.collector_description_erows_batch, [], [], [], None)) + description_collector_q.put( + ("rows", [], self.collector_description_erows_batch, [], [], [], None)) if len(self.collector_sitelink_erows_batch) > 0: - sitelink_collector_q.put(("rows", [], self.collector_sitelink_erows_batch, [], [], [], None)) + sitelink_collector_q.put( + ("rows", [], self.collector_sitelink_erows_batch, [], [], [], None)) else: collector_q.put(("rows", self.collector_nrows_batch, @@ -917,7 +924,7 @@ def exit(self, *args, **kwargs): self.collector_invalid_erows_batch, self.collector_invalid_qrows_batch, None)) - + else: if self.node_f is not None: self.node_f.close() @@ -954,7 +961,7 @@ def erows_append(self, erows, edge_id, node1, label, node2, calendar="", entrylang="", invalid_erows=None, - )->bool: + ) -> bool: if len(claim_type) > 0 and claim_type != 
"statement": raise ValueError("Unexpected claim type %s" % claim_type) @@ -983,8 +990,9 @@ def erows_append(self, erows, edge_id, node1, label, node2, erows = invalid_erows if not values_are_valid and clean_verbose: - print("Value validation error in edge %s: %s" % ("|".join([repr(edge_id), repr(node1), repr(label), repr(node2)]), - error_buffer.getvalue().rstrip()), + print("Value validation error in edge %s: %s" % ( + "|".join([repr(edge_id), repr(node1), repr(label), repr(node2)]), + error_buffer.getvalue().rstrip()), file=sys.stderr, flush=True) error_buffer.close() @@ -1046,7 +1054,7 @@ def qrows_append(self, qrows, edge_id, node1, label, node2, invalid_qrows=None, erows=None, invalid_erows=None, - )->bool: + ) -> bool: values_are_valid: bool = True if clean_input_values: @@ -1073,11 +1081,12 @@ def qrows_append(self, qrows, edge_id, node1, label, node2, qrows = invalid_qrows if not values_are_valid and clean_verbose: - print("Value validation error in qual %s: %s" % ("|".join([repr(edge_id), repr(node1), repr(label), repr(node2)]), - error_buffer.getvalue().rstrip()), + print("Value validation error in qual %s: %s" % ( + "|".join([repr(edge_id), repr(node1), repr(label), repr(node2)]), + error_buffer.getvalue().rstrip()), file=sys.stderr, flush=True) error_buffer.close() - + if minimal_qual_file is not None or detailed_qual_file is not None: if explode_values: qrows.append([edge_id, @@ -1096,7 +1105,7 @@ def qrows_append(self, qrows, edge_id, node1, label, node2, calendar, entity_type, wikidatatype, - ]) + ]) else: qrows.append([edge_id, node1, @@ -1108,9 +1117,8 @@ def qrows_append(self, qrows, edge_id, node1, label, node2, datahash, precision, calendar, - ]) - - + ]) + if interleave: self.erows_append(erows, edge_id=edge_id, @@ -1132,16 +1140,16 @@ def qrows_append(self, qrows, edge_id, node1, label, node2, calendar=calendar, invalid_erows=invalid_erows) return values_are_valid - + # def process(self,line,node_file,edge_file,qual_file,languages,source): def 
process(self, line): - if progress_interval > 0 and self.cnt % progress_interval == 0 and self.cnt>0: - print("{} lines processed by processor {}".format(self.cnt,self._idx), file=sys.stderr, flush=True) - self.cnt+=1 + if progress_interval > 0 and self.cnt % progress_interval == 0 and self.cnt > 0: + print("{} lines processed by processor {}".format(self.cnt, self._idx), file=sys.stderr, flush=True) + self.cnt += 1 # csv_line_terminator = "\n" if os.name == 'posix' else "\r\n" - nrows=[] - erows=[] - qrows=[] + nrows = [] + erows = [] + qrows = [] invalid_erows = [] if invalid_edge_file is not None else None invalid_qrows = [] if invalid_qual_file is not None else None @@ -1180,7 +1188,7 @@ def process(self, line): raise KGTKException("Qnode %s is missing its labels" % qnode) elif warn_if_missing: print("Object id {} has no labels.".format(qnode), file=sys.stderr, flush=True) - label_list=[] + label_list = [] if labels: if all_languages: label_languages = labels.keys() @@ -1191,14 +1199,13 @@ def process(self, line): if lang_label: # We needn't worry about duplicate label entries if this check passes. 
if lang_label['language'] != lang: - print("*** Conflicting language key %s for the %s label for %s" % (repr(lang_label['language']), repr(lang), qnode), + print("*** Conflicting language key %s for the %s label for %s" % ( + repr(lang_label['language']), repr(lang), qnode), file=sys.stderr, flush=True) - # lang_label['value']=lang_label['value'].replace('|','\\|') - # label_list.append('\'' + lang_label['value'].replace("'","\\'") + '\'' + "@" + lang) value = KgtkFormat.stringify(lang_label['value'], language=lang) label_list.append(value) - + if label_edges: langid: str = qnode + '-' + LABEL_LABEL + '-' + lang self.erows_append(erows, @@ -1209,16 +1216,15 @@ def process(self, line): entrylang=lang, invalid_erows=invalid_erows) - if not node_id_only: - if len(label_list)>0: + if len(label_list) > 0: row.append("|".join(label_list)) else: row.append("") if not node_id_only: row.append(entry_type) - + if entry_type_edges: typeid: str = qnode + '-' + TYPE_LABEL + '-' + entry_type self.erows_append(erows, @@ -1235,7 +1241,7 @@ def process(self, line): raise KGTKException("Qnode %s is missing its descriptions" % qnode) elif warn_if_missing: print("Object id {} has no descriptions.".format(qnode), file=sys.stderr, flush=True) - descr_list=[] + descr_list = [] if descriptions: if all_languages: desc_languages = descriptions.keys() @@ -1246,10 +1252,10 @@ def process(self, line): if lang_descr: # We needn't worry about duplicate description entries if this check passes. 
if lang_descr['language'] != lang: - print("*** Conflicting language key %s for the %s description for %s" % (repr(lang_descr['language']), repr(lang), qnode), + print("*** Conflicting language key %s for the %s description for %s" % ( + repr(lang_descr['language']), repr(lang), qnode), file=sys.stderr, flush=True) - # lang_descr['value']=lang_descr['value'].replace('|','\\|') - # descr_list.append('\'' + lang_descr['value'].replace("'","\\'") + '\'' + "@" + lang) + value = KgtkFormat.stringify(lang_descr['value'], language=lang) descr_list.append(value) if descr_edges: @@ -1263,7 +1269,7 @@ def process(self, line): invalid_erows=invalid_erows) if not node_id_only: - if len(descr_list)>0: + if len(descr_list) > 0: row.append("|".join(descr_list)) else: row.append("") @@ -1292,12 +1298,14 @@ def process(self, line): if alias_edges: # Hash the value to save space and avoid syntactic difficulties. # Take a subset of the hash value to save space. - alias_value_hash: str = hashlib.sha256(value.encode('utf-8')).hexdigest()[:value_hash_width] + alias_value_hash: str = hashlib.sha256(value.encode('utf-8')).hexdigest()[ + :value_hash_width] aliasid = qnode + '-' + ALIAS_LABEL + "-" + lang + '-' + alias_value_hash - alias_seq_no: int # In case of hash collision + alias_seq_no: int # In case of hash collision if aliasid in alias_id_collision_map: alias_seq_no = alias_id_collision_map[aliasid] - print("\n*** Alias collision #%d detected for %s (%s)" % (alias_seq_no, aliasid, value), file=sys.stderr, flush=True) + print("\n*** Alias collision #%d detected for %s (%s)" % ( + alias_seq_no, aliasid, value), file=sys.stderr, flush=True) else: alias_seq_no = 0 alias_id_collision_map[aliasid] = alias_seq_no + 1 @@ -1310,14 +1318,12 @@ def process(self, line): entrylang=lang, invalid_erows=invalid_erows) - if not node_id_only: - if len(alias_list)>0: + if len(alias_list) > 0: row.append("|".join(alias_list)) else: row.append("") - datatype = obj.get("datatype", "") if not 
node_id_only: row.append(datatype) @@ -1331,8 +1337,8 @@ def process(self, line): label=DATATYPE_LABEL, node2=datatype, invalid_erows=invalid_erows) - - #row.append(source) + + # row.append(source) if node_file: nrows.append(row) @@ -1340,8 +1346,9 @@ def process(self, line): if fail_if_missing: raise KGTKException("Qnode %s is missing its claims" % obj.get("id", "")) elif warn_if_missing: - print("Object id {} is missing its claims.".format(obj.get("id", "")), file=sys.stderr, flush=True) - + print("Object id {} is missing its claims.".format(obj.get("id", "")), file=sys.stderr, + flush=True) + if parse_claims and "claims" in obj: claims = obj["claims"] if keep: @@ -1351,23 +1358,25 @@ def process(self, line): raise KGTKException("A claim is missing its Qnode id.") elif warn_if_missing: print("A claim is missing its Qnode id", file=sys.stderr, flush=True) - qnode = "UNKNOWN" # This will cause trouble down the line. + qnode = "UNKNOWN" # This will cause trouble down the line. for prop, claim_property in claims.items(): for cp in claim_property: if (deprecated or cp['rank'] != 'deprecated'): mainsnak = cp['mainsnak'] snaktype = mainsnak.get(MAINSNAK_SNAKTYPE) - rank=cp['rank'] + rank = cp['rank'] claim_id = cp['id'] claim_type = cp['type'] if claim_type != CLAIM_TYPE_STATEMENT: - print("Unknown claim type %s, ignoring claim_property for (%s, %s)." % (repr(claim_type), repr(qnode), repr(prop)), + print("Unknown claim type %s, ignoring claim_property for (%s, %s)." % ( + repr(claim_type), repr(qnode), repr(prop)), file=sys.stderr, flush=True) continue if snaktype is None: - print("Mainsnak without snaktype, ignoring claim_property for (%s, %s)." % (repr(qnode), repr(prop)), + print("Mainsnak without snaktype, ignoring claim_property for (%s, %s)." 
% ( + repr(qnode), repr(prop)), file=sys.stderr, flush=True) continue if snaktype == SNAKTYPE_VALUE: @@ -1377,12 +1386,19 @@ def process(self, line): if val is not None: if val_type in ("string", "wikibase-unmapped-entityid"): if not isinstance(val, str): - print("Value type is %s but the value is not a string, ignoring claim_property for (%s, %s)." % (repr(val_type), repr(qnode), repr(prop)), + print("Value type is %s but the value is not a string, " + "ignoring claim_property for (%s, %s)." % (repr(val_type), + repr(qnode), + repr(prop)), file=sys.stderr, flush=True) continue elif not isinstance(val, dict): - print("Value type %s is not a known string type and value is not a dict, ignoring claim_property for (%s, %s)." % (repr(val_type), repr(qnode), repr(prop)), - file=sys.stderr, flush=True) + print( + "Value type %s is not a known string type and value is not a dict, " + "ignoring claim_property for (%s, %s)." % (repr(val_type), + repr(qnode), + repr(prop)), + file=sys.stderr, flush=True) continue elif snaktype == SNAKTYPE_SOMEVALUE: @@ -1394,13 +1410,15 @@ def process(self, line): val_type = NOVALUE_VALUE else: - print("Unknown snaktype %s, ignoring claim_property for (%s, %s)." % (repr(snaktype), repr(qnode), repr(prop)), - file=sys.stderr, flush=True) + print("Unknown snaktype %s, ignoring claim_property for (%s, %s)." 
% ( + repr(snaktype), repr(qnode), repr(prop)), + file=sys.stderr, flush=True) continue - + typ = mainsnak.get(MAINSNAK_DATATYPE) if typ is None: - print("Mainsnak without datatype, ignoring claim_property for (%s, %s)" % (repr(qnode), repr(prop)), + print("Mainsnak without datatype, ignoring claim_property for (%s, %s)" % ( + repr(qnode), repr(prop)), file=sys.stderr, flush=True) continue # if typ != val_type: @@ -1409,8 +1427,8 @@ def process(self, line): value = '' mag = '' unit = '' - date='' - item='' + date = '' + item = '' lower = '' upper = '' precision = '' @@ -1438,7 +1456,9 @@ def process(self, line): if isinstance(val, dict) and 'numeric-id' in val: numeric_id = str(val['numeric-id']) else: - raise ValueError("No numeric ID for datatype %s, entity type %s, in (%s, %s)." % (repr(typ), repr(enttype), repr(qnode), repr(prop))) + raise ValueError( + "No numeric ID for datatype %s, entity type %s, in (%s, %s)." % ( + repr(typ), repr(enttype), repr(qnode), repr(prop))) if enttype == "item": value = 'Q' + numeric_id @@ -1447,8 +1467,10 @@ def process(self, line): elif enttype == "lexeme": value = 'L' + numeric_id else: - raise ValueError('Unknown entity type %s for datatype %s in (%s, %s).' % (repr(enttype), repr(typ), repr(qnode), repr(prop))) - item=value + raise ValueError( + 'Unknown entity type %s for datatype %s in (%s, %s).' % ( + repr(enttype), repr(typ), repr(qnode), repr(prop))) + item = value elif typ == DATATYPE_QUANTITY: # Strip whitespace from the numeric fields. Some older Wikidata dumps @@ -1457,16 +1479,13 @@ def process(self, line): # and trailing whitespace. value = str(val['amount']).strip() mag = value - if val.get( - 'upperBound', - None) or val.get( - 'lowerBound', - None): + if val.get('upperBound', None) or val.get('lowerBound', None): lower = str(val.get('lowerBound', '')).strip() upper = str(val.get('upperBound', '')).strip() value += '[' + lower + \ - ',' + upper + ']' - # TODO: Don't lose the single-character unit code. 
At a minimum, verify that it is the value "1". + ',' + upper + ']' + # TODO: Don't lose the single-character unit code. + # At a minimum, verify that it is the value "1". if len(val.get('unit')) > 1: unit = val.get( 'unit').split('/')[-1] @@ -1483,10 +1502,10 @@ def process(self, line): # TODO: what about "globe"? elif typ == DATATYPE_TIME: - if val['time'][0]=='-': - pre="^-" + if val['time'][0] == '-': + pre = "^-" else: - pre="^" + pre = "^" # TODO: Maybe strip leading and traiming whitespace here? date = pre + val['time'][1:] # Cautiously strip leading and trailing whitespace from precision? @@ -1495,12 +1514,9 @@ def process(self, line): value = date + '/' + precision elif typ == DATATYPE_MONOLINGUALTEXT: - # value = '\'' + \ - # val['text'].replace("'","\\'").replace("|", "\\|") + '\'' + '@' + val['language'] value = KgtkFormat.stringify(val['text'], language=val['language']) else: - # value = '\"' + val.replace('"','\\"').replace("|", "\\|") + '\"' value = KgtkFormat.stringify(val) if minimal_edge_file is not None or detailed_edge_file is not None: @@ -1508,16 +1524,19 @@ def process(self, line): if value.startswith(('P', 'Q')): prop_value_hash = value else: - prop_value_hash = hashlib.sha256(value.encode('utf-8')).hexdigest()[:value_hash_width] + prop_value_hash = hashlib.sha256(value.encode('utf-8')).hexdigest()[ + :value_hash_width] edgeid: str = qnode + '-' + prop + '-' + prop_value_hash + '-' if claim_id_hash_width == 0: edgeid += claim_id.lower() else: - edgeid += hashlib.sha256(claim_id.lower().encode('utf-8')).hexdigest()[:claim_id_hash_width] - prop_seq_no: int # In case of hash collision + edgeid += hashlib.sha256(claim_id.lower().encode('utf-8')).hexdigest()[ + :claim_id_hash_width] + prop_seq_no: int # In case of hash collision if edgeid in edge_id_collision_map: prop_seq_no = edge_id_collision_map[edgeid] - print("\n*** Edge collision #%d detected for %s (%s)" % (prop_seq_no, edgeid, value), file=sys.stderr, flush=True) + print("\n*** Edge 
collision #%d detected for %s (%s)" % ( + prop_seq_no, edgeid, value), file=sys.stderr, flush=True) else: prop_seq_no = 0 edge_id_collision_map[edgeid] = prop_seq_no + 1 @@ -1545,7 +1564,6 @@ def process(self, line): calendar=calendar, invalid_erows=invalid_erows) - if minimal_qual_file is not None or detailed_qual_file is not None or interleave: if cp.get('qualifiers', None): quals = cp['qualifiers'] @@ -1567,14 +1585,15 @@ def process(self, line): val_type = NOVALUE_VALUE else: - raise ValueError("Unknown qualifier snaktype %s" % repr(snaktype)) + raise ValueError( + "Unknown qualifier snaktype %s" % repr(snaktype)) if True: value = '' mag = '' unit = '' - date= '' - item='' + date = '' + item = '' lower = '' upper = '' precision = '' @@ -1586,14 +1605,28 @@ def process(self, line): typ = qcp.get(MAINSNAK_DATATYPE) if typ is None: if fail_if_missing: - raise KGTKException("Found qualifier %s without a datatype for (%s, %s)" % (repr(qual_prop), repr(qnode), repr(prop))) + raise KGTKException( + "Found qualifier %s without a datatype for (%s, %s)" + % (repr(qual_prop), repr(qnode), repr(prop))) elif warn_if_missing: if val_type == SOMEVALUE_VALUE: - print("Somevalue qualifier %s without a datatype for (%s, %s)" % (repr(qual_prop), repr(qnode), repr(prop)), file=sys.stderr, flush=True) + print("Somevalue qualifier %s without a datatype " + "for (%s, %s)" % (repr(qual_prop), + repr(qnode), + repr(prop)), + file=sys.stderr, flush=True) elif val_type == NOVALUE_VALUE: - print("Novalue qualifier %s without a datatype for (%s, %s)" % (repr(qual_prop), repr(qnode), repr(prop)), file=sys.stderr, flush=True) + print("Novalue qualifier %s without a datatype " + "for (%s, %s)" % (repr(qual_prop), + repr(qnode), + repr(prop)), + file=sys.stderr, flush=True) else: - print("Found qualifier %s without a datatype for (%s, %s)" % (repr(qual_prop), repr(qnode), repr(prop)), file=sys.stderr, flush=True) + print("Found qualifier %s without a datatype " + "for (%s, %s)" % 
(repr(qual_prop), + repr(qnode), + repr(prop)), + file=sys.stderr, flush=True) continue if val is None: @@ -1615,8 +1648,12 @@ def process(self, line): if isinstance(val, dict) and 'numeric-id' in val: numeric_id = str(val['numeric-id']) else: - raise ValueError("No numeric ID for datatype %s, entity type %s, in (%s, %s)." % (repr(typ), repr(enttype), repr(qnode), repr(prop))) - + raise ValueError("No numeric ID for datatype %s, " + "entity type %s, in (%s, %s)." % ( + repr(typ), repr(enttype), + repr(qnode), + repr(prop))) + if enttype == "item": value = 'Q' + numeric_id elif enttype == "property": @@ -1624,24 +1661,25 @@ def process(self, line): elif enttype == "lexeme": value = 'L' + numeric_id else: - raise ValueError('Unknown entity type %s for datatype %s in (%s, %s).' % (repr(enttype), repr(typ), repr(qnode), repr(prop))) + raise ValueError('Unknown entity type %s for ' + 'datatype %s in (%s, %s).' % ( + repr(enttype), repr(typ), + repr(qnode), + repr(prop))) - item=value + item = value elif typ == DATATYPE_QUANTITY: value = val['amount'] mag = val['amount'] - if val.get( - 'upperBound', - None) or val.get( - 'lowerBound', - None): + if val.get('upperBound', None) or \ + val.get('lowerBound', None): lower = val.get( 'lowerBound', '') upper = val.get( 'upperBound', '') value += '[' + lower + \ - ',' + upper + ']' + ',' + upper + ']' if len( val.get('unit')) > 1: unit = val.get( @@ -1658,23 +1696,18 @@ def process(self, line): value = '@' + lat + '/' + long elif typ == DATATYPE_TIME: - if val['time'][0]=='-': - pre="^-" + if val['time'][0] == '-': + pre = "^-" else: - pre="^" - date = pre + \ - val['time'][1:] - precision = str( - val['precision']) - calendar = val.get( - 'calendarmodel', '').split('/')[-1] - value = pre + \ - val['time'][1:] + '/' + str(val['precision']) + pre = "^" + date = pre + val['time'][1:] + precision = str(val['precision']) + calendar = val.get('calendarmodel', '').split('/')[-1] + value = pre + val['time'][1:] + '/' + 
str(val['precision']) elif typ == DATATYPE_MONOLINGUALTEXT: - # value = '\'' + \ - # val['text'].replace("'","\\'") + '\'' + '@' + val['language'] - value = KgtkFormat.stringify(val['text'], language=val['language']) + value = KgtkFormat.stringify(val['text'], + language=val['language']) else: # value = '\"' + val.replace('"','\\"') + '\"' value = KgtkFormat.stringify(val) @@ -1683,12 +1716,16 @@ def process(self, line): if value.startswith(('P', 'Q')): qual_value_hash = value else: - qual_value_hash = hashlib.sha256(value.encode('utf-8')).hexdigest()[:value_hash_width] - qualid: str = edgeid + '-' + qual_prop + '-' + qual_value_hash - qual_seq_no: int # In case of hash collision + qual_value_hash = hashlib.sha256( + value.encode('utf-8')).hexdigest()[:value_hash_width] + qualid: str = edgeid + '-' + qual_prop + '-' + qual_value_hash + qual_seq_no: int # In case of hash collision if qualid in qual_id_collision_map: qual_seq_no = qual_id_collision_map[qualid] - print("\n*** Qualifier collision #%d detected for %s (%s)" % (qual_seq_no, qualid, value), file=sys.stderr, flush=True) + print( + "\n*** Qualifier collision #%d detected for %s (%s)" % ( + qual_seq_no, qualid, value), file=sys.stderr, + flush=True) else: qual_seq_no = 0 qual_id_collision_map[qualid] = qual_seq_no + 1 @@ -1714,9 +1751,9 @@ def process(self, line): invalid_qrows=invalid_qrows, erows=erows, invalid_erows=invalid_erows) - + if parse_sitelinks: - sitelinks=obj.get('sitelinks',None) + sitelinks = obj.get('sitelinks', None) else: sitelinks = None if sitelinks: @@ -1725,36 +1762,38 @@ def process(self, line): # to make the sitetitle safe for KGTK. if link.endswith('wiki') and link not in ('commonswiki', 'simplewiki'): linklabel = SITELINK_LABEL - sitetitle='_'.join(sitelinks[link]['title'].split()) + sitetitle = '_'.join(sitelinks[link]['title'].split()) # The following leads to ambuiguity if there are both # "afwiki" and "afwikibooks". # # TODO: Need to research the sitelink structure more fully. 
- sitelang=link.split('wiki')[0].replace('_','-') + sitelang = link.split('wiki')[0].replace('_', '-') - sitelink='http://'+sitelang+'.wikipedia.org/wiki/'+sitetitle + sitelink = 'https://' + sitelang + '.wikipedia.org/wiki/' + sitetitle else: linklabel = ADDL_SITELINK_LABEL - sitetitle='_'.join(sitelinks[link]['title'].split()) + sitetitle = '_'.join(sitelinks[link]['title'].split()) if "wiki" in link: # TODO: needs more work here. - sitelang=link.split("wiki")[0] + sitelang = link.split("wiki")[0] if sitelang in ("commons", "simple"): - sitelang = "en" # TODO: Need to retain the distinction we lose here. + sitelang = "en" # TODO: Need to retain the distinction we lose here. else: - sitelang="" - sitehost=link+'.org' # TODO: Needs more work here - sitelink = 'http://'+sitehost+'/wiki/'+sitetitle + sitelang = "" + sitehost = link + '.org' # TODO: Needs more work here + sitelink = 'https://' + sitehost + '/wiki/' + sitetitle if sitelink is not None: serows = sitelink_erows if collect_seperately else erows - sitelink_value_hash: str = hashlib.sha256(sitelink.encode('utf-8')).hexdigest()[:value_hash_width] + sitelink_value_hash: str = hashlib.sha256(sitelink.encode('utf-8')).hexdigest()[ + :value_hash_width] sitelinkid: str = qnode + '-' + linklabel + '-' + sitelink_value_hash sitelink_seq_no: int = 0 if sitelinkid in sitelink_id_collision_map: sitelink_seq_no = sitelink_id_collision_map[sitelinkid] - print("\n*** Sitelink collision #%d detected for %s (%s)" % (sitelink_seq_no, sitelinkid, sitelink), file=sys.stderr, flush=True) + print("\n*** Sitelink collision #%d detected for %s (%s)" % ( + sitelink_seq_no, sitelinkid, sitelink), file=sys.stderr, flush=True) else: sitelink_seq_no = 0 sitelink_id_collision_map[sitelinkid] = sitelink_seq_no + 1 @@ -1778,7 +1817,7 @@ def process(self, line): node2=sitelang, entrylang=sitelang, invalid_erows=invalid_erows) - + self.erows_append(serows, edge_id=sitelinkid + '-site-0', node1=sitelinkid, @@ -1814,7 +1853,7 @@ def 
process(self, line): invalid_qrows=invalid_qrows, erows=erows, invalid_erows=invalid_erows) - + self.qrows_append(qrows, edge_id=sitelinkid + '-site-0', node1=sitelinkid, @@ -1837,7 +1876,7 @@ def process(self, line): badgeid = sitelinkid + '-badge-' + badge self.qrows_append(qrows, edge_id=badgeid, - node1=sielinkid, + node1=sitelinkid, label=SITELINK_BADGE_LABEL, node2=badge, invalid_qrows=invalid_qrows, @@ -1845,12 +1884,12 @@ def process(self, line): invalid_erows=invalid_erows) if len(nrows) > 0 or \ - len(erows) > 0 or \ - len(qrows) > 0 or \ - (invalid_erows is not None and len(invalid_erows) > 0) or \ - (invalid_qrows is not None and len(invalid_qrows) > 0) or \ - len(description_erows) > 0 or \ - len(sitelink_erows) > 0: + len(erows) > 0 or \ + len(qrows) > 0 or \ + (invalid_erows is not None and len(invalid_erows) > 0) or \ + (invalid_qrows is not None and len(invalid_qrows) > 0) or \ + len(description_erows) > 0 or \ + len(sitelink_erows) > 0: if collect_results: if collector_batch_size == 1: if collect_seperately: @@ -1863,10 +1902,12 @@ def process(self, line): if len(qrows) > 0 and qual_collector_q is not None: qual_collector_q.put(("rows", [], [], qrows, [], [], None)) - if invalid_erows is not None and len(invalid_erows) > 0 and invalid_edge_collector_q is not None: + if invalid_erows is not None and len( + invalid_erows) > 0 and invalid_edge_collector_q is not None: invalid_edge_collector_q.put(("rows", [], [], [], invalid_erows, [], None)) - if invalid_qrows is not None and len(invalid_qrows) > 0 and invalid_qual_collector_q is not None: + if invalid_qrows is not None and len( + invalid_qrows) > 0 and invalid_qual_collector_q is not None: invalid_qual_collector_q.put(("rows", [], [], [], [], invalid_qrows, None)) if len(description_erows) > 0 and description_collector_q is not None: @@ -1888,9 +1929,9 @@ def process(self, line): if collect_seperately: self.collector_description_erows_batch.extend(description_erows) 
self.collector_sitelink_erows_batch.extend(sitelink_erows) - + self.collector_batch_cnt += 1 - + if self.collector_batch_cnt >= collector_batch_size: if collect_seperately: if len(self.collector_nrows_batch) > 0 and node_collector_q is not None: @@ -1903,19 +1944,24 @@ def process(self, line): qual_collector_q.put(("rows", [], [], self.collector_qrows_batch, [], [], None)) if len(self.collector_invalid_erows_batch) > 0 and invalid_edge_collector_q is not None: - invalid_edge_collector_q.put(("rows", [], [], [], self.collector_invalid_erows_batch, [], None)) + invalid_edge_collector_q.put( + ("rows", [], [], [], self.collector_invalid_erows_batch, [], None)) if len(self.collector_invalid_qrows_batch) > 0 and invalid_qual_collector_q is not None: - invalid_qual_collector_q.put(("rows", [], [], [], [], self.collector_invalid_qrows_batch, None)) + invalid_qual_collector_q.put( + ("rows", [], [], [], [], self.collector_invalid_qrows_batch, None)) - if len(self.collector_description_erows_batch) > 0 and description_collector_q is not None: - description_collector_q.put(("rows", [], self.collector_description_erows_batch, [], [], [], None)) + if len(self.collector_description_erows_batch) > 0 and \ + description_collector_q is not None: + description_collector_q.put( + ("rows", [], self.collector_description_erows_batch, [], [], [], None)) self.collector_description_erows_batch.clear() if len(self.collector_sitelink_erows_batch) > 0 and sitelink_collector_q is not None: - sitelink_collector_q.put(("rows", [], self.collector_sitelink_erows_batch, [], [], [], None)) + sitelink_collector_q.put( + ("rows", [], self.collector_sitelink_erows_batch, [], [], [], None)) self.collector_sitelink_erows_batch.clear() - + elif collector_q is not None: collector_q.put(("rows", self.collector_nrows_batch, @@ -1946,7 +1992,7 @@ def process(self, line): for row in qrows: if skip_validation or validate(row, "detailed qual uncollected"): self.qual_wr.writerow(row) - + if invalid_edge_file 
and invalid_erows is not None: for row in invalid_erows: self.invalid_edge_wr.writerow(row) @@ -1954,7 +2000,7 @@ def process(self, line): if invalid_qual_file and invalid_qrows is not None: for row in invalid_qrows: self.invalid_qual_wr.writerow(row) - + class MyCollector: def __init__(self): @@ -2044,7 +2090,7 @@ def run(self, collector_q, who: str): print("The %s collector is starting (pid %d)." % (who, os.getpid()), file=sys.stderr, flush=True) - + while True: action, nrows, erows, qrows, invalid_erows, invalid_qrows, header = collector_q.get() # print("Collector action %s." % action, file=sys.stderr, flush=True) @@ -2133,12 +2179,15 @@ def _open_file(self, the_file: typing.Optional[str], header: typing.List[str], f wr: typing.Any if use_kgtkwriter: from kgtk.io.kgtkwriter import KgtkWriter - print("Opening the %s file in the %s collector with KgtkWriter: %s" % (file_type, who, the_file), file=sys.stderr, flush=True) - wr = KgtkWriter.open(header, Path(the_file), who=who + " collector", use_mgzip=use_mgzip_for_output, mgzip_threads=mgzip_threads_for_output) + print("Opening the %s file in the %s collector with KgtkWriter: %s" % (file_type, who, the_file), + file=sys.stderr, flush=True) + wr = KgtkWriter.open(header, Path(the_file), who=who + " collector", use_mgzip=use_mgzip_for_output, + mgzip_threads=mgzip_threads_for_output) return None, wr - + else: - print("Opening the %s file in the %s collector with csv.writer." % (file_type, who), file=sys.stderr, flush=True) + print("Opening the %s file in the %s collector with csv.writer." 
% (file_type, who), file=sys.stderr, + flush=True) csv_line_terminator = "\n" if os.name == 'posix' else "\r\n" f = open(the_file, "w", newline='') wr = csv.writer( @@ -2158,55 +2207,67 @@ def open_minimal_edge_file(self, header: typing.List[str], who: str): self.minimal_edge_f, self.minimal_edge_wr = self._open_file(minimal_edge_file, header, "minimal edge", who) def open_detailed_edge_file(self, header: typing.List[str], who: str): - self.detailed_edge_f, self.detailed_edge_wr = self._open_file(detailed_edge_file, header, "detailed edge", who) + self.detailed_edge_f, self.detailed_edge_wr = self._open_file(detailed_edge_file, header, "detailed edge", + who) def open_minimal_qual_file(self, header: typing.List[str], who: str): self.minimal_qual_f, self.minimal_qual_wr = self._open_file(minimal_qual_file, header, "minimal qual", who) - + def open_detailed_qual_file(self, header: typing.List[str], who: str): self.detailed_qual_f, self.detailed_qual_wr = self._open_file(detailed_qual_file, header, "qual", who) - + def open_invalid_edge_file(self, header: typing.List[str], who: str): self.invalid_edge_f, self.invalid_edge_wr = self._open_file(invalid_edge_file, header, "invalid edge", who) def open_invalid_qual_file(self, header: typing.List[str], who: str): self.invalid_qual_f, self.invalid_qual_wr = self._open_file(invalid_qual_file, header, "qual", who) - + def open_split_alias_file(self, header: typing.List[str], who: str): self.split_alias_f, self.split_alias_wr = self._open_file(split_alias_file, header, ALIAS_LABEL, who) def open_split_en_alias_file(self, header: typing.List[str], who: str): - self.split_en_alias_f, self.split_en_alias_wr = self._open_file(split_en_alias_file, header, "English " + ALIAS_LABEL, who) + self.split_en_alias_f, self.split_en_alias_wr = self._open_file(split_en_alias_file, header, + "English " + ALIAS_LABEL, who) def open_split_datatype_file(self, header: typing.List[str], who: str): - self.split_datatype_f, self.split_datatype_wr = 
self._open_file(split_datatype_file, header, DATATYPE_LABEL, who) + self.split_datatype_f, self.split_datatype_wr = self._open_file(split_datatype_file, header, DATATYPE_LABEL, + who) def open_split_description_file(self, header: typing.List[str], who: str): - self.split_description_f, self.split_description_wr = self._open_file(split_description_file, header, DESCRIPTION_LABEL, who) + self.split_description_f, self.split_description_wr = self._open_file(split_description_file, header, + DESCRIPTION_LABEL, who) def open_split_en_description_file(self, header: typing.List[str], who: str): - self.split_en_description_f, self.split_en_description_wr = self._open_file(split_en_description_file, header, "English " + DESCRIPTION_LABEL, who) + self.split_en_description_f, self.split_en_description_wr = self._open_file(split_en_description_file, + header, + "English " + DESCRIPTION_LABEL, + who) def open_split_label_file(self, header: typing.List[str], who: str): self.split_label_f, self.split_label_wr = self._open_file(split_label_file, header, LABEL_LABEL, who) def open_split_en_label_file(self, header: typing.List[str], who: str): - self.split_en_label_f, self.split_en_label_wr = self._open_file(split_en_label_file, header, "English " + LABEL_LABEL, who) + self.split_en_label_f, self.split_en_label_wr = self._open_file(split_en_label_file, header, + "English " + LABEL_LABEL, who) def open_split_sitelink_file(self, header: typing.List[str], who: str): - self.split_sitelink_f, self.split_sitelink_wr = self._open_file(split_sitelink_file, header, SITELINK_LABEL, who) + self.split_sitelink_f, self.split_sitelink_wr = self._open_file(split_sitelink_file, header, SITELINK_LABEL, + who) def open_split_en_sitelink_file(self, header: typing.List[str], who: str): - self.split_en_sitelink_f, self.split_en_sitelink_wr = self._open_file(split_en_sitelink_file, header, "English " + SITELINK_LABEL, who) + self.split_en_sitelink_f, self.split_en_sitelink_wr = 
self._open_file(split_en_sitelink_file, header, + "English " + SITELINK_LABEL, who) def open_split_type_file(self, header: typing.List[str], who: str): self.split_type_f, self.split_type_wr = self._open_file(split_type_file, header, TYPE_LABEL, who) def open_split_property_edge_file(self, header: typing.List[str], who: str): - self.split_property_edge_f, self.split_property_edge_wr = self._open_file(split_property_edge_file, header, "property edge", who) + self.split_property_edge_f, self.split_property_edge_wr = self._open_file(split_property_edge_file, header, + "property edge", who) def open_split_property_qual_file(self, header: typing.List[str], who: str): - self.split_property_qual_f, self.split_property_qual_wr = self._open_file(split_property_qual_file, header, "property qual", who) + self.split_property_qual_f, self.split_property_qual_wr = self._open_file(split_property_qual_file, header, + "property qual", who) def shutdown(self, who: str): print("Exiting the %s collector (pid %d)." % (who, os.getpid()), file=sys.stderr, flush=True) @@ -2344,14 +2405,17 @@ def collect(self, self.cnt += 1 if progress_interval > 0 and self.cnt % progress_interval == 0: - print("The {} collector called {} times: {} nrows, {} erows, {} qrows, {} invalid erows, {} invalid qrows".format(who, - self.cnt, - self.nrows, - self.erows, - self.qrows, - self.invalid_erows, - self.invalid_qrows), - file=sys.stderr, flush=True) + print( + "The {} collector called {} times: {} nrows, {} erows, {} qrows, {} invalid erows, " + "{} invalid qrows".format( + who, + self.cnt, + self.nrows, + self.erows, + self.qrows, + self.invalid_erows, + self.invalid_qrows), + file=sys.stderr, flush=True) row: typing.List[str] if len(nrows) > 0: if self.node_wr is None: @@ -2374,22 +2438,27 @@ def collect(self, else: for row in erows: split: bool = False - label: str = row[2] # Hack: knows the structure of the row. 
- method: typing.Optional[typing.Callable[[typing.List[str]], bool]] = self.split_dispatcher.get(label) + label: str = row[2] # Hack: knows the structure of the row. + method: typing.Optional[ + typing.Callable[[typing.List[str]], bool]] = self.split_dispatcher.get(label) if method is not None: split = method(row) if not split: - if self.minimal_edge_wr is None and self.detailed_edge_wr is None and self.split_property_edge_wr is None: + if self.minimal_edge_wr is None and self.detailed_edge_wr is None and \ + self.split_property_edge_wr is None: raise ValueError("Unexpected %s edge rows in the %s collector." % (label, who)) - if self.split_property_edge_wr is not None and row[1].startswith("P"): # Hack: knows the structure of the row. + if self.split_property_edge_wr is not None and row[1].startswith( + "P"): # Hack: knows the structure of the row. # For now, split property files are minimal. if skip_validation or validate(row, "split property edge"): - self.split_property_edge_wr.write((row[0], row[1], row[2], row[3], row[4], row[5])) # Hack: knows the structure of the row. + self.split_property_edge_wr.write((row[0], row[1], row[2], row[3], row[4], row[ + 5])) # Hack: knows the structure of the row. elif self.minimal_edge_wr is not None: if skip_validation or validate(row, "minimal edge"): - self.minimal_edge_wr.write((row[0], row[1], row[2], row[3], row[4], row[5])) # Hack: knows the structure of the row. + self.minimal_edge_wr.write((row[0], row[1], row[2], row[3], row[4], + row[5])) # Hack: knows the structure of the row. if self.detailed_edge_wr is not None: if skip_validation or validate(row, "split detailed edge"): @@ -2409,15 +2478,18 @@ def collect(self, if use_kgtkwriter: if self.minimal_qual_wr is None and self.detailed_qual_wr is None: raise ValueError("Unexpected qual rows in the %s collector." % who) - + for row in qrows: - if self.split_property_qual_wr is not None and row[0].startswith("P"): # Hack: knows the structure of the row. 
+ if self.split_property_qual_wr is not None and row[0].startswith( + "P"): # Hack: knows the structure of the row. if skip_validation or validate(row, "split property qual"): - self.split_property_qual_wr.write((row[0], row[1], row[2], row[3], row[4])) # Hack: knows the structure of the row. - + self.split_property_qual_wr.write( + (row[0], row[1], row[2], row[3], row[4])) # Hack: knows the structure of the row. + elif self.minimal_qual_wr is not None: if skip_validation or validate(row, "minimal qual"): - self.minimal_qual_wr.write((row[0], row[1], row[2], row[3], row[4])) # Hack: knows the structure of the row. + self.minimal_qual_wr.write( + (row[0], row[1], row[2], row[3], row[4])) # Hack: knows the structure of the row. if self.detailed_qual_wr is not None: if skip_validation or validate(row, "detailed qual"): @@ -2439,26 +2511,28 @@ def collect(self, if use_kgtkwriter: for row in invalid_erows: - if minimal_edge_file is not None: # messy - self.invalid_edge_wr.write((row[0], row[1], row[2], row[3], row[4], row[5])) # Hack: knows the structure of the row. + if minimal_edge_file is not None: # messy + self.invalid_edge_wr.write((row[0], row[1], row[2], row[3], row[4], + row[5])) # Hack: knows the structure of the row. else: self.invalid_edge_wr.write(row) else: self.invalid_edge_wr.writerows(invalid_erows) - + if len(invalid_qrows) > 0: if self.invalid_qual_wr is None: raise ValueError("Unexpected invalid qual rows in the %s collector." % who) if use_kgtkwriter: for row in invalid_qrows: - if minimal_qual_file is not None: # messy - self.invalid_qual_wr.write((row[0], row[1], row[2], row[3], row[4])) # Hack: knows the structure of the row. + if minimal_qual_file is not None: # messy + self.invalid_qual_wr.write( + (row[0], row[1], row[2], row[3], row[4])) # Hack: knows the structure of the row. 
else: self.invalid_qual_wr.write(row) else: self.invalid_qual_wr.writerows(invalid_qrows) - + def setup_split_dispatcher(self): self.split_dispatcher: typing.MutableMapping[str, typing.Callable[[typing.List[str]], bool]] = dict() self.split_dispatcher[ADDL_SITELINK_LABEL] = self.split_sitelink @@ -2473,94 +2547,99 @@ def setup_split_dispatcher(self): self.split_dispatcher[SITELINK_TITLE_LABEL] = self.split_sitelink self.split_dispatcher[TYPE_LABEL] = self.split_type - def split_alias(self, row: typing.List[str])->bool: + def split_alias(self, row: typing.List[str]) -> bool: split: bool = False - lang: str = row[-1] # Hack: knows the structure of the row. + lang: str = row[-1] # Hack: knows the structure of the row. if self.split_alias_wr is not None: - self.split_alias_wr.write((row[0], row[1], row[2], row[3], lang)) # Hack: knows the structure of the row. - split= True - + self.split_alias_wr.write( + (row[0], row[1], row[2], row[3], lang)) # Hack: knows the structure of the row. + split = True + if self.split_en_alias_wr is not None and lang == "en": - self.split_en_alias_wr.write((row[0], row[1], row[2], row[3])) # Hack: knows the structure of the row. + self.split_en_alias_wr.write((row[0], row[1], row[2], row[3])) # Hack: knows the structure of the row. split = True return split - def split_datatype(self, row: typing.List[str])->bool: + def split_datatype(self, row: typing.List[str]) -> bool: split: bool = False if self.split_datatype_wr is not None: - self.split_datatype_wr.write((row[0], row[1], row[2], row[3])) # Hack: knows the structure of the row. + self.split_datatype_wr.write((row[0], row[1], row[2], row[3])) # Hack: knows the structure of the row. split = True return split - def split_description(self, row: typing.List[str])->bool: + def split_description(self, row: typing.List[str]) -> bool: split: bool = False - lang: str = row[-1] # Hack: knows the structure of the row. + lang: str = row[-1] # Hack: knows the structure of the row. 
if self.split_description_wr is not None: - self.split_description_wr.write((row[0], row[1], row[2], row[3], lang)) # Hack: knows the structure of the row. + self.split_description_wr.write( + (row[0], row[1], row[2], row[3], lang)) # Hack: knows the structure of the row. split = True if self.split_en_description_wr is not None and lang == "en": - self.split_en_description_wr.write((row[0], row[1], row[2], row[3])) # Hack: knows the structure of the row. + self.split_en_description_wr.write( + (row[0], row[1], row[2], row[3])) # Hack: knows the structure of the row. split = True - + return split - def split_label(self, row: typing.List[str])->bool: + def split_label(self, row: typing.List[str]) -> bool: split: bool = False - lang: str = row[-1] # Hack: knows the structure of the row. + lang: str = row[-1] # Hack: knows the structure of the row. if self.split_label_wr is not None: - self.split_label_wr.write((row[0], row[1], row[2], row[3], lang)) # Hack: knows the structure of the row. + self.split_label_wr.write( + (row[0], row[1], row[2], row[3], lang)) # Hack: knows the structure of the row. split = True if self.split_en_label_wr is not None and lang == "en": - self.split_en_label_wr.write((row[0], row[1], row[2], row[3])) # Hack: knows the structure of the row. + self.split_en_label_wr.write((row[0], row[1], row[2], row[3])) # Hack: knows the structure of the row. split = True return split - def split_sitelink(self, row: typing.List[str])->bool: + def split_sitelink(self, row: typing.List[str]) -> bool: split: bool = False - lang: str = row[-1] # Hack: knows the structure of the row. + lang: str = row[-1] # Hack: knows the structure of the row. if self.split_sitelink_wr is not None: - self.split_sitelink_wr.write((row[0], row[1], row[2], row[3], lang)) # Hack: knows the structure of the row. + self.split_sitelink_wr.write( + (row[0], row[1], row[2], row[3], lang)) # Hack: knows the structure of the row. 
split = True if self.split_en_sitelink_wr is not None and lang == "en": - self.split_en_sitelink_wr.write((row[0], row[1], row[2], row[3])) # Hack: knows the structure of the row. + self.split_en_sitelink_wr.write( + (row[0], row[1], row[2], row[3])) # Hack: knows the structure of the row. split = True return split - def split_type(self, row: typing.List[str])->bool: + def split_type(self, row: typing.List[str]) -> bool: split: bool = False if self.split_type_wr is not None: - self.split_type_wr.write((row[0], row[1], row[2], row[3])) # Hack: knows the structure of the row. + self.split_type_wr.write((row[0], row[1], row[2], row[3])) # Hack: knows the structure of the row. split = True return split - try: UPDATE_VERSION: str = "2021-11-17T01:38:17.437678+00:00#9z/aARcXhiV2hPdyVXjAREcpZwh2MawWFp6numz8GZBCtAg2WypLYAFpHjP43k97Zj8VHVaoel0oEit9KHXH0w==" print("kgtk import-wikidata version: %s" % UPDATE_VERSION, file=sys.stderr, flush=True) print("Starting main process (pid %d)." % os.getpid(), file=sys.stderr, flush=True) inp_path = KGTKArgumentParser.get_input_file(input_file) - + csv_line_terminator = "\n" if os.name == 'posix' else "\r\n" - - start=time.time() + + start = time.time() if not skip_processing: from gzip import GzipFile @@ -2579,8 +2658,8 @@ def split_type(self, row: typing.List[str])->bool: else: print('Processing wikidata file %s' % str(inp_path), file=sys.stderr, flush=True) input_f = open(inp_path, mode='rb') - progress_startup(fd=input_f.fileno()) # Start the custom progress monitor. - + progress_startup(fd=input_f.fileno()) # Start the custom progress monitor. + if str(inp_path).endswith(".bz2"): print('Decompressing (bz2)', file=sys.stderr, flush=True) # TODO: Optionally use a system decompression program. 
@@ -2610,13 +2689,14 @@ def split_type(self, row: typing.List[str])->bool: if collect_results: print("Creating the collector queue.", file=sys.stderr, flush=True) # collector_q = pyrallel.ShmQueue() - collector_q_maxsize = procs*collector_queue_per_proc_size + collector_q_maxsize = procs * collector_queue_per_proc_size if collect_seperately: if node_file is not None: node_collector_q = pyrallel.ShmQueue(maxsize=collector_q_maxsize) - print("The collector node queue has been created (maxsize=%d)." % collector_q_maxsize, file=sys.stderr, flush=True) - + print("The collector node queue has been created (maxsize=%d)." % collector_q_maxsize, + file=sys.stderr, flush=True) + print("Creating the node_collector.", file=sys.stderr, flush=True) node_collector: MyCollector = MyCollector() print("Creating the node collector process.", file=sys.stderr, flush=True) @@ -2627,7 +2707,8 @@ def split_type(self, row: typing.List[str])->bool: if minimal_edge_file is not None or detailed_edge_file is not None: edge_collector_q = pyrallel.ShmQueue(maxsize=collector_q_maxsize) - print("The collector edge queue has been created (maxsize=%d)." % collector_q_maxsize, file=sys.stderr, flush=True) + print("The collector edge queue has been created (maxsize=%d)." % collector_q_maxsize, + file=sys.stderr, flush=True) print("Creating the edge_collector.", file=sys.stderr, flush=True) edge_collector: MyCollector = MyCollector() @@ -2639,7 +2720,8 @@ def split_type(self, row: typing.List[str])->bool: if minimal_qual_file is not None or detailed_qual_file is not None: qual_collector_q = pyrallel.ShmQueue(maxsize=collector_q_maxsize) - print("The collector qual queue has been created (maxsize=%d)." % collector_q_maxsize, file=sys.stderr, flush=True) + print("The collector qual queue has been created (maxsize=%d)." 
% collector_q_maxsize, + file=sys.stderr, flush=True) print("Creating the qual_collector.", file=sys.stderr, flush=True) qual_collector: MyCollector = MyCollector() @@ -2651,56 +2733,65 @@ def split_type(self, row: typing.List[str])->bool: if invalid_edge_file is not None: invalid_edge_collector_q = pyrallel.ShmQueue(maxsize=collector_q_maxsize) - print("The collector invalid edge queue has been created (maxsize=%d)." % collector_q_maxsize, file=sys.stderr, flush=True) + print("The collector invalid edge queue has been created (maxsize=%d)." % collector_q_maxsize, + file=sys.stderr, flush=True) print("Creating the invalid_edge_collector.", file=sys.stderr, flush=True) invalid_edge_collector: MyCollector = MyCollector() print("Creating the invalid edge collector process.", file=sys.stderr, flush=True) - invalid_edge_collector_p = mp.Process(target=invalid_edge_collector.run, args=(invalid_edge_collector_q, "invalid edge")) + invalid_edge_collector_p = mp.Process(target=invalid_edge_collector.run, + args=(invalid_edge_collector_q, "invalid edge")) print("Starting the invalid edge collector process.", file=sys.stderr, flush=True) invalid_edge_collector_p.start() print("Started the invalid edge collector process.", file=sys.stderr, flush=True) if invalid_qual_file is not None: invalid_qual_collector_q = pyrallel.ShmQueue(maxsize=collector_q_maxsize) - print("The collector invalid qual queue has been created (maxsize=%d)." % collector_q_maxsize, file=sys.stderr, flush=True) + print("The collector invalid qual queue has been created (maxsize=%d)." 
% collector_q_maxsize, + file=sys.stderr, flush=True) print("Creating the invalid_qual_collector.", file=sys.stderr, flush=True) invalid_qual_collector: MyCollector = MyCollector() print("Creating the invalid qual collector process.", file=sys.stderr, flush=True) - invalid_qual_collector_p = mp.Process(target=invalid_qual_collector.run, args=(invalid_qual_collector_q, "invalid qual")) + invalid_qual_collector_p = mp.Process(target=invalid_qual_collector.run, + args=(invalid_qual_collector_q, "invalid qual")) print("Starting the invalid qual collector process.", file=sys.stderr, flush=True) invalid_qual_collector_p.start() print("Started the invalid qual collector process.", file=sys.stderr, flush=True) if split_description_file is not None: description_collector_q = pyrallel.ShmQueue(maxsize=collector_q_maxsize) - print("The collector description queue has been created (maxsize=%d)." % collector_q_maxsize, file=sys.stderr, flush=True) + print("The collector description queue has been created (maxsize=%d)." % collector_q_maxsize, + file=sys.stderr, flush=True) print("Creating the description collector.", file=sys.stderr, flush=True) description_collector: MyCollector = MyCollector() print("Creating the description collector process.", file=sys.stderr, flush=True) - description_collector_p = mp.Process(target=description_collector.run, args=(description_collector_q, "description")) + description_collector_p = mp.Process(target=description_collector.run, + args=(description_collector_q, "description")) print("Starting the description collector process.", file=sys.stderr, flush=True) description_collector_p.start() print("Started the description collector process.", file=sys.stderr, flush=True) if split_sitelink_file is not None: sitelink_collector_q = pyrallel.ShmQueue(maxsize=collector_q_maxsize) - print("The collector sitelink queue has been created (maxsize=%d)." 
% collector_q_maxsize, file=sys.stderr, flush=True) + print("The collector sitelink queue has been created (maxsize=%d)." % collector_q_maxsize, + file=sys.stderr, flush=True) print("Creating the sitelink collector.", file=sys.stderr, flush=True) sitelink_collector: MyCollector = MyCollector() print("Creating the sitelink collector process.", file=sys.stderr, flush=True) - sitelink_collector_p = mp.Process(target=sitelink_collector.run, args=(sitelink_collector_q, "sitelink")) + sitelink_collector_p = mp.Process(target=sitelink_collector.run, + args=(sitelink_collector_q, "sitelink")) print("Starting the sitelink collector process.", file=sys.stderr, flush=True) sitelink_collector_p.start() print("Started the sitelink collector process.", file=sys.stderr, flush=True) else: collector_q = pyrallel.ShmQueue(maxsize=collector_q_maxsize) - print("The common collector queue has been created (maxsize=%d)." % collector_q_maxsize, file=sys.stderr, flush=True) - + print("The common collector queue has been created (maxsize=%d)." 
% collector_q_maxsize, + file=sys.stderr, flush=True) + print("Creating the common collector.", file=sys.stderr, flush=True) collector: MyCollector = MyCollector() print("Creating the common collector process.", file=sys.stderr, flush=True) @@ -2713,7 +2804,7 @@ def split_type(self, row: typing.List[str])->bool: if node_id_only: node_file_header = ['id'] else: - node_file_header = ['id','label','type','description','alias','datatype'] + node_file_header = ['id', 'label', 'type', 'description', 'alias', 'datatype'] ncq = collector_q if collector_q is not None else node_collector_q if ncq is not None: @@ -2722,7 +2813,7 @@ def split_type(self, row: typing.List[str])->bool: print("Sent the node header to the collector.", file=sys.stderr, flush=True) else: - with open(node_file+'_header', 'w', newline='') as myfile: + with open(node_file + '_header', 'w', newline='') as myfile: wr = csv.writer( myfile, quoting=csv.QUOTE_NONE, @@ -2733,10 +2824,12 @@ def split_type(self, row: typing.List[str])->bool: wr.writerow(node_file_header) if explode_values: - edge_file_header = ['id','node1','label','node2','rank','node2;magnitude','node2;unit','node2;date','node2;item','node2;lower','node2;upper', - 'node2;latitude','node2;longitude','node2;precision','node2;calendar','node2;entity-type','node2;wikidatatype', 'lang'] + edge_file_header = ['id', 'node1', 'label', 'node2', 'rank', 'node2;magnitude', 'node2;unit', + 'node2;date', 'node2;item', 'node2;lower', 'node2;upper', + 'node2;latitude', 'node2;longitude', 'node2;precision', 'node2;calendar', + 'node2;entity-type', 'node2;wikidatatype', 'lang'] else: - edge_file_header = ['id','node1','label','node2', + edge_file_header = ['id', 'node1', 'label', 'node2', 'rank', 'node2;wikidatatype', 'claim_id', 'val_type', 'entity_type', 'datahash', 'precision', 'calendar', 'lang'] @@ -2748,7 +2841,7 @@ def split_type(self, row: typing.List[str])->bool: print("Sent the detailed edge header to the collector.", file=sys.stderr, flush=True) 
else: - with open(detailed_edge_file+'_header', 'w', newline='') as myfile: + with open(detailed_edge_file + '_header', 'w', newline='') as myfile: wr = csv.writer( myfile, quoting=csv.QUOTE_NONE, @@ -2833,13 +2926,15 @@ def split_type(self, row: typing.List[str])->bool: if invalid_edge_file and invalid_edge_collector_q is not None: if detailed_edge_file: print("Sending the detailed invalid edge header to the collector.", file=sys.stderr, flush=True) - invalid_edge_collector_q.put(("invalid_edge_header", None, None, None, None, None, edge_file_header)) + invalid_edge_collector_q.put( + ("invalid_edge_header", None, None, None, None, None, edge_file_header)) print("Sent the detailed invalid edge header to the collector.", file=sys.stderr, flush=True) elif minimal_edge_file: print("Sending the minimal invalid edge header to the collector.", file=sys.stderr, flush=True) - invalid_edge_collector_q.put(("invalid_edge_header", None, None, None, None, None, edge_file_header[0:6])) + invalid_edge_collector_q.put( + ("invalid_edge_header", None, None, None, None, None, edge_file_header[0:6])) print("Sent the minimal invalid edge header to the collector.", file=sys.stderr, flush=True) - + if minimal_qual_file is not None or detailed_qual_file is not None or split_property_qual_file is not None: qual_file_header = edge_file_header.copy() if "rank" in qual_file_header: @@ -2860,7 +2955,7 @@ def split_type(self, row: typing.List[str])->bool: print("Sent the detailed qual file header to the collector.", file=sys.stderr, flush=True) else: - with open(detailed_qual_file+'_header', 'w', newline='') as myfile: + with open(detailed_qual_file + '_header', 'w', newline='') as myfile: wr = csv.writer( myfile, quoting=csv.QUOTE_NONE, @@ -2873,7 +2968,7 @@ def split_type(self, row: typing.List[str])->bool: print("Sending the minimal qual file header to the collector.", file=sys.stderr, flush=True) qcq.put(("minimal_qual_header", None, None, None, None, None, qual_file_header[0:5])) 
print("Sent the minimal qual file header to the collector.", file=sys.stderr, flush=True) - + if split_property_qual_file and qcq is not None: print("Sending the property qual file header to the collector.", file=sys.stderr, flush=True) qcq.put(("split_property_qual_header", None, None, None, None, None, qual_file_header[0:5])) @@ -2882,20 +2977,25 @@ def split_type(self, row: typing.List[str])->bool: if invalid_qual_file and invalid_qual_collector_q is not None: if detailed_qual_file: print("Sending the detailed invalid qual header to the collector.", file=sys.stderr, flush=True) - invalid_qual_collector_q.put(("invalid_qual_header", None, None, None, None, None, qual_file_header)) + invalid_qual_collector_q.put( + ("invalid_qual_header", None, None, None, None, None, qual_file_header)) print("Sent the detailed invalid qual header to the collector.", file=sys.stderr, flush=True) elif minimal_qual_file: print("Sending the minimal invalid qual header to the collector.", file=sys.stderr, flush=True) - invalid_qual_collector_q.put(("invalid_qual_header", None, None, None, None, None, qual_file_header[0:5])) + invalid_qual_collector_q.put( + ("invalid_qual_header", None, None, None, None, None, qual_file_header[0:5])) print("Sent the minimal invalid qual header to the collector.", file=sys.stderr, flush=True) - + print('Creating parallel processor for {}'.format(str(inp_path)), file=sys.stderr, flush=True) if use_shm or single_mapper_queue: - pp = pyrallel.ParallelProcessor(procs, MyMapper,enable_process_id=True, max_size_per_mapper_queue=max_size_per_mapper_queue, - use_shm=use_shm, enable_collector_queues=False, batch_size=mapper_batch_size, + pp = pyrallel.ParallelProcessor(procs, MyMapper, enable_process_id=True, + max_size_per_mapper_queue=max_size_per_mapper_queue, + use_shm=use_shm, enable_collector_queues=False, + batch_size=mapper_batch_size, single_mapper_queue=single_mapper_queue) else: - pp = pyrallel.ParallelProcessor(procs, 
MyMapper,enable_process_id=True, max_size_per_mapper_queue=max_size_per_mapper_queue, + pp = pyrallel.ParallelProcessor(procs, MyMapper, enable_process_id=True, + max_size_per_mapper_queue=max_size_per_mapper_queue, batch_size=mapper_batch_size) print('Start parallel processing', file=sys.stderr, flush=True) pp.start() @@ -2907,7 +3007,7 @@ def split_type(self, row: typing.List[str])->bool: print('Done processing {}'.format(str(inp_path)), file=sys.stderr, flush=True) input_f.close() - + print('Telling the workers to shut down.', file=sys.stderr, flush=True) pp.task_done() print('Waiting for the workers to shut down.', file=sys.stderr, flush=True) @@ -3004,32 +3104,36 @@ def split_type(self, row: typing.List[str])->bool: # should provide the simplest, highest-performing solution. if node_file: print('Combining the node file fragments', file=sys.stderr, flush=True) - node_file_fragments=[node_file+'_header'] + node_file_fragments = [node_file + '_header'] for n in range(procs): - node_file_fragments.append(node_file+'_'+str(n)) - platform_cat(node_file_fragments, node_file, remove=not keep_temp_files, use_python_cat=use_python_cat, verbose=True) + node_file_fragments.append(node_file + '_' + str(n)) + platform_cat(node_file_fragments, node_file, remove=not keep_temp_files, use_python_cat=use_python_cat, + verbose=True) if detailed_edge_file: print('Combining the edge file fragments', file=sys.stderr, flush=True) - edge_file_fragments=[detailed_edge_file+'_header'] + edge_file_fragments = [detailed_edge_file + '_header'] for n in range(procs): - edge_file_fragments.append(detailed_edge_file+'_'+str(n)) - platform_cat(edge_file_fragments, detailed_edge_file, remove=not keep_temp_files, use_python_cat=use_python_cat, verbose=True) + edge_file_fragments.append(detailed_edge_file + '_' + str(n)) + platform_cat(edge_file_fragments, detailed_edge_file, remove=not keep_temp_files, + use_python_cat=use_python_cat, verbose=True) if detailed_qual_file: print('Combining the 
qualifier file fragments', file=sys.stderr, flush=True) - qual_file_fragments=[detailed_qual_file+'_header'] + qual_file_fragments = [detailed_qual_file + '_header'] for n in range(procs): - qual_file_fragments.append(detailed_qual_file+'_'+str(n)) - platform_cat(qual_file_fragments, detailed_qual_file, remove=not keep_temp_files, use_python_cat=use_python_cat, verbose=True) + qual_file_fragments.append(detailed_qual_file + '_' + str(n)) + platform_cat(qual_file_fragments, detailed_qual_file, remove=not keep_temp_files, + use_python_cat=use_python_cat, verbose=True) print('import complete', file=sys.stderr, flush=True) - end=time.time() - print('time taken : {}s'.format(end-start), file=sys.stderr, flush=True) + end = time.time() + print('time taken : {}s'.format(end - start), file=sys.stderr, flush=True) except Exception as e: raise KGTKException(str(e)) -def validate(row: typing.List[str], who: str)->bool: + +def validate(row: typing.List[str], who: str) -> bool: """Ensure that output edge rows meet minimal validation criteria.""" import sys @@ -3040,8 +3144,9 @@ def validate(row: typing.List[str], who: str)->bool: # Ensure that the first four fields (id, node1, label, node2) are all # non-empty. 
- if len(row[0]) == 0 or len(row[1]) == 0 or len(row[2]) == 0 or len(row[3]) ==0: - print("Invalid %s row: (%s, %s, %s, %s)" % (who, repr(row[0]), repr(row[1]), repr(row[2]), repr(row[3])), file=sys.stderr, flush=True) + if len(row[0]) == 0 or len(row[1]) == 0 or len(row[2]) == 0 or len(row[3]) == 0: + print("Invalid %s row: (%s, %s, %s, %s)" % (who, repr(row[0]), repr(row[1]), repr(row[2]), repr(row[3])), + file=sys.stderr, flush=True) return False return True From fa3b366f4ac34d9d37b320a32f5ea2ea0d77b4cc Mon Sep 17 00:00:00 2001 From: saggu Date: Mon, 23 May 2022 16:33:21 -0700 Subject: [PATCH 05/21] setup notebooks to run fortnighlty imports --- kgtk/configure_kgtk_notebooks.py | 16 +- use-cases/Wikidata Subsets-Copy1.ipynb | 1377 ----------------- ...a Subsets.ipynb => Wikidata-Subsets.ipynb} | 102 +- ...iles.ipynb => Wikidata-Useful-Files.ipynb} | 166 +- use-cases/create-wikidata-dwd.ipynb | 221 +++ use-cases/import-wikidata.ipynb | 231 ++- 6 files changed, 419 insertions(+), 1694 deletions(-) delete mode 100644 use-cases/Wikidata Subsets-Copy1.ipynb rename use-cases/{Wikidata Subsets.ipynb => Wikidata-Subsets.ipynb} (90%) rename use-cases/{Wikidata Useful Files.ipynb => Wikidata-Useful-Files.ipynb} (89%) create mode 100644 use-cases/create-wikidata-dwd.ipynb diff --git a/kgtk/configure_kgtk_notebooks.py b/kgtk/configure_kgtk_notebooks.py index 9f1e6d6ec..2f8e698eb 100644 --- a/kgtk/configure_kgtk_notebooks.py +++ b/kgtk/configure_kgtk_notebooks.py @@ -24,6 +24,7 @@ class ConfigureKGTK(object): def __init__(self, file_list: List[str], kgtk_path: str = None, input_files_url: str = None): + self.graph_files = files_config self.files = file_list self.INPUT_FILES_URL = "https://github.com/usc-isi-i2/kgtk-tutorial-files/raw/main/datasets/arnold" \ if input_files_url is None else input_files_url @@ -64,6 +65,8 @@ def configure_kgtk(self, debug=False): """ configures the environment for a jupyter notebook. 
+ :param debug: add --debug flag in the kypher command + :param additional_files: additional files in dict format to be used :param input_graph_path: path to the input graph files. By default it'll create a folder "isi-kgtk-tutorial" in user home and download files from github :param output_path: path where the output and temp files will be created. By default, "isi-kgtk-tutorial-out" in @@ -76,8 +79,6 @@ def configure_kgtk(self, :return: """ - self.graph_files = files_config - if json_config_file is not None: try: _files_config = json.load(open(json_config_file)) @@ -139,7 +140,7 @@ def configure_kgtk(self, os.environ['kypher'] = kypher self.kgtk_environment_variables.add('kypher') - def download_tutorial_files(self, graph_path): + def download_tutorial_files(self, graph_path: str): if not graph_path.endswith('/'): graph_path += '/' @@ -159,20 +160,23 @@ def print_env_variables(self): for key in self.files: print(f"{key}: {os.environ[key]}") - def load_files_into_cache(self): + def load_files_into_cache(self, files: List[str] = None): """ Loads files into graph cache. 
The keys in this list should be in json_config_file :return: """ kypher_command = f"{os.environ['kypher']}" - for f_key in self.files: + if files is None: + files = self.files + + for f_key in files: kypher_command += f" -i \"{os.environ[f_key]}\" --as {f_key} " kypher_command += " --limit 3" print(kypher_command) print(subprocess.getoutput(kypher_command)) - def print_kgtk_file_names(self, file_names=None): + def print_kgtk_file_names(self, file_names: List[str] = None): if file_names is not None: if not type(file_names) == list: file_names = [file_names] diff --git a/use-cases/Wikidata Subsets-Copy1.ipynb b/use-cases/Wikidata Subsets-Copy1.ipynb deleted file mode 100644 index fe7d48b87..000000000 --- a/use-cases/Wikidata Subsets-Copy1.ipynb +++ /dev/null @@ -1,1377 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generating Subsets of Wikidata" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Batch Invocation\n", - "Example batch command. The second argument is a notebook where the output will be stored. 
You can load it to see progress.\n", - "\n", - "UPDATE EXAMPLE INVOCATION\n", - "\n", - "\n", - "```\n", - "papermill Wikidata\\ Useful\\ Files.ipynb useful-files.out.ipynb \\\n", - "-p wiki_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v3/all.tsv.gz \\\n", - "-p label_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v3/part.label.en.tsv.gz \\\n", - "-p item_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v3/part.wikibase-item.tsv.gz \\\n", - "-p property_item_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v3/part.property.wikibase-item.tsv.gz \\\n", - "-p qual_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v3/qual.tsv.gz \\\n", - "-p output_path \\\n", - "-p output_folder useful_files_v4 \\\n", - "-p temp_folder temp.useful_files_v4 \\\n", - "-p delete_database no \n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "# Parameters\n", - "\n", - "# Folder on local machine where to create the output and temporary folders\n", - "output_path = \"/Users/pedroszekely/Downloads/kypher\"\n", - "\n", - "# The names of the output and temporary folders\n", - "output_folder = \"wikidata_os_v5\"\n", - "temp_folder = \"temp.wikidata_os_v5\"\n", - "\n", - "# Classes to remove\n", - "remove_classes = \"Q13442814, Q523, Q16521, Q318, Q7318358, Q7187, Q11173, Q8054\"\n", - "\n", - "# The location of input files\n", - "wiki_root_folder = \"/Volumes/GoogleDrive/Shared\\ drives/KGTK/datasets/wikidata-20200803-v4/\"\n", - "claims_file = \"claims.tsv.gz\"\n", - "label_file = \"labels.en.tsv.gz\"\n", - "alias_file = \"aliases.en.tsv.gz\"\n", - "description_file = \"descriptions.en.tsv.gz\"\n", - "item_file = \"claims.wikibase-item.tsv.gz\"\n", - "qual_file = \"qualifiers.tsv.gz\"\n", - "property_datatypes_file = 
\"metadata.property.datatypes.tsv.gz\"\n", - "isa_file = \"derived.isa.tsv.gz\"\n", - "p279star_file = \"derived.P279star.tsv.gz\"\n", - "\n", - "# Location of the cache database for kypher\n", - "cache_path = \"/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4\"\n", - "\n", - "# Whether to delete the cache database\n", - "delete_database = False\n", - "\n", - "# shortcuts to commands\n", - "kgtk = \"time kgtk --debug\"\n", - "# kgtk = \"kgtk --debug\"\n", - "\n", - "# Useful files Jupyter notebook\n", - "useful_files_notebook = \"Wikidata Useful Files.ipynb\"\n", - "notebooks_folder = \"/Users/pedroszekely/Documents/GitHub/kgtk/examples/\"" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "import io\n", - "import os\n", - "import subprocess\n", - "import sys\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import altair as alt\n", - "\n", - "import papermill as pm" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Set up variables for files" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "if cache_path:\n", - " os.environ['STORE'] = \"{}/wikidata.sqlite3.db\".format(cache_path)\n", - "else:\n", - " os.environ['STORE'] = \"{}/{}/wikidata.sqlite3.db\".format(output_path, temp_folder)\n", - " \n", - "if cache_path:\n", - " store = \"{}/wikidata.sqlite3.db\".format(cache_path)\n", - "else:\n", - " store = \"{}/{}/wikidata.sqlite3.db\".format(output_path, temp_folder)\n", - "\n", - "out = \"{}/{}\".format(output_path, output_folder)\n", - "temp = \"{}/{}\".format(output_path, temp_folder)\n", - "\n", - "kypher = \"kgtk query --debug --graph-cache \" + store\n", - "\n", - "claims = wiki_root_folder + claims_file\n", - "items = wiki_root_folder + item_file\n", - "isa = wiki_root_folder + isa_file\n", - "quals = wiki_root_folder + qual_file\n", - "datatypes = wiki_root_folder 
+ property_datatypes_file\n", - "p279star = wiki_root_folder + p279star_file\n", - "\n", - "labels = wiki_root_folder + label_file\n", - "aliases = wiki_root_folder + alias_file\n", - "descriptions = wiki_root_folder + description_file" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Go to the output directory and create the subfolders for the output files and the temporary files" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Errno 2] No such file or directory: 'output_path'\n", - "/Users/pedroszekely/Documents/GitHub/kgtk/use-cases\n" - ] - } - ], - "source": [ - "cd output_path" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "mkdir: /Users/pedroszekely/Downloads/kypher/wikidata_os_v5: File exists\n", - "mkdir: /Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5: File exists\n" - ] - } - ], - "source": [ - "!mkdir {out}\n", - "!mkdir {temp}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Clean up the output and temp folders before we start" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# !rm {out}/*.tsv {out}/*.tsv.gz\n", - "# !rm {temp}/*.tsv {temp}/*.tsv.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "if delete_database:\n", - " !rm {out}/*.tsv {out}/*.tsv.gz\n", - " !rm {temp}/*.tsv {temp}/*.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Preview the input files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It is always a good practice to peek a the files to make sure the column headings are what we expect" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": 
{}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "P10-P1628-32b85d-7927ece6-0\tP10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\tnormal\turl\n", - "P10-P1628-acf60d-b8950832-0\tP10\tP1628\t\"https://schema.org/video\"\tnormal\turl\n", - "P10-P1629-Q34508-bcc39400-0\tP10\tP1629\tQ34508\tnormal\twikibase-item\n", - "P10-P1659-P1651-c4068028-0\tP10\tP1659\tP1651\tnormal\twikibase-property\n", - "P10-P1659-P18-5e4b9c4f-0\tP10\tP1659\tP18\tnormal\twikibase-property\n", - "P10-P1659-P4238-d21d1ac0-0\tP10\tP1659\tP4238\tnormal\twikibase-property\n", - "P10-P1659-P51-86aca4c5-0\tP10\tP1659\tP51\tnormal\twikibase-property\n", - "P10-P1855-Q15075950-7eff6d65-0\tP10\tP1855\tQ15075950\tnormal\twikibase-item\n", - "P10-P1855-Q4504-a69d2c73-0\tP10\tP1855\tQ4504\tnormal\twikibase-item\n", - "gzcat: error writing to output: Broken pipe\n", - "gzcat: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz: uncompress failed\n" - ] - } - ], - "source": [ - "!gzcat {claims} | head" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-19 16:34:30 sqlstore]: IMPORT graph directly into table graph_18 from /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz ...\n", - "Exception in thread background thread for pid 34908:\n", - "Traceback (most recent call last):\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/threading.py\", line 926, in _bootstrap_inner\n", - " self.run()\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/threading.py\", line 870, in run\n", - " self._target(*self._args, **self._kwargs)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 1662, in wrap\n", - " fn(*args, **kwargs)\n", - " 
File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 2606, in background_thread\n", - " handle_exit_code(exit_code)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 2304, in fn\n", - " return self.command.handle_command_exit_code(exit_code)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 877, in handle_command_exit_code\n", - " raise exc\n", - "sh.ErrorReturnCode_1: \n", - "\n", - " RAN: /usr/bin/gunzip -c '/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz'\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "gunzip: failed to read stdin: Input/output error\n", - "gunzip: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz: uncompress failed\n", - "\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/cli/query.py\", line 148, in run\n", - " index=options.get('index'))\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/query.py\", line 182, in __init__\n", - " store.add_graph(file)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 565, in add_graph\n", - " self.import_graph_data_via_import(table, file)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 652, in import_graph_data_via_import\n", - " sqlproc.wait()\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 856, in wait\n", - " self.process._stdin_process.command.wait()\n", - " File 
\"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 856, in wait\n", - " self.process._stdin_process.command.wait()\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 849, in wait\n", - " self.handle_command_exit_code(exit_code)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 877, in handle_command_exit_code\n", - " raise exc\n", - "sh.ErrorReturnCode_1: \n", - "\n", - " RAN: /usr/bin/gunzip -c '/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz'\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "gunzip: failed to read stdin: Input/output error\n", - "gunzip: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz: uncompress failed\n", - "\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/exceptions.py\", line 46, in __call__\n", - " return_code = func(*args, **kwargs) or 0\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/cli/query.py\", line 180, in run\n", - " raise KGTKException(str(e) + '\\n')\n", - "kgtk.exceptions.KGTKException: \n", - "\n", - " RAN: /usr/bin/gunzip -c '/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz'\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "gunzip: failed to read stdin: Input/output error\n", - "gunzip: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz: uncompress failed\n", - "\n", - "\n", - "\n", - "\n", - " RAN: /usr/bin/gunzip -c '/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz'\n", - 
"\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "gunzip: failed to read stdin: Input/output error\n", - "gunzip: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz: uncompress failed\n", - "\n", - "\n" - ] - } - ], - "source": [ - "!{kypher} -i {claims} \\\n", - "--match '()-[]-()' \\\n", - "--limit 10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating a list of all the items we want to remove" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Compute the items to be removed\n", - "\n", - "First look at the classes we will remove" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[90mid\u001b[39m Q13442814\n", - "\u001b[42mLabel\u001b[49m scholarly article\n", - "\u001b[44mDescription\u001b[49m article in an academic publication, usually peer reviewed\n", - "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mscholarly publication \u001b[90m(Q591041)\u001b[39m | article \u001b[90m(Q191067)\u001b[39m | scholarly work \u001b[90m(Q55915575)\u001b[39m\n", - "\n", - "\u001b[90mid\u001b[39m Q523\n", - "\u001b[42mLabel\u001b[49m star\n", - "\u001b[44mDescription\u001b[49m astronomical object consisting of a luminous spheroid of plasma held together by its own gravity\n", - "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39m astronomical object type \u001b[90m(Q17444909)\u001b[39m\n", - "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mastronomical object \u001b[90m(Q6999)\u001b[39m | fusor \u001b[90m(Q1027098)\u001b[39m\n", - "\n", - "\u001b[90mid\u001b[39m Q16521\n", - "\u001b[42mLabel\u001b[49m taxon\n", - "\u001b[44mDescription\u001b[49m group of one or more organism(s), which a taxonomist adjudges to be 
a unit\n", - "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39m first-order metaclass \u001b[90m(Q24017414)\u001b[39m\n", - "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mliving organism class \u001b[90m(Q21871294)\u001b[39m\n", - "\n", - "\u001b[90mid\u001b[39m Q318\n", - "\u001b[42mLabel\u001b[49m galaxy\n", - "\u001b[44mDescription\u001b[49m astronomical structure\n", - "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39m astronomical object type \u001b[90m(Q17444909)\u001b[39m\n", - "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mdeep-sky object \u001b[90m(Q249389)\u001b[39m\n", - "\n", - "\u001b[90mid\u001b[39m Q7318358\n", - "\u001b[42mLabel\u001b[49m review article\n", - "\u001b[44mDescription\u001b[49m article that summarizes the current state of understanding on a topic\n", - "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39m genre \u001b[90m(Q483394)\u001b[39m\n", - "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mscholarly article \u001b[90m(Q13442814)\u001b[39m\n", - "\n", - "\u001b[90mid\u001b[39m Q7187\n", - "\u001b[42mLabel\u001b[49m gene\n", - "\u001b[44mDescription\u001b[49m basic physical and functional unit of heredity\n", - "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mnucleic acid sequence \u001b[90m(Q863908)\u001b[39m | biological region \u001b[90m(Q50365914)\u001b[39m | biological sequence \u001b[90m(Q3511065)\u001b[39m\n", - "\n", - "\u001b[90mid\u001b[39m Q11173\n", - "\u001b[42mLabel\u001b[49m chemical compound\n", - "\u001b[44mDescription\u001b[49m pure chemical substance consisting of two or more different chemical elements\n", - "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m 
\u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39m group or class of chemical substances \u001b[90m(Q17339814)\u001b[39m | first-order metaclass \u001b[90m(Q24017414)\u001b[39m\n", - "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mpure substance \u001b[90m(Q578779)\u001b[39m | chemical component \u001b[90m(Q20026787)\u001b[39m\n", - "\n", - "\u001b[90mid\u001b[39m Q8054\n", - "\u001b[42mLabel\u001b[49m protein\n", - "\u001b[44mDescription\u001b[49m biological molecule consisting of chains of amino acid residues\n", - "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39m group or class of chemical substances \u001b[90m(Q17339814)\u001b[39m | first-order metaclass \u001b[90m(Q24017414)\u001b[39m\n", - "\u001b[30m\u001b[47msubclass of\u001b[49m\u001b[39m \u001b[90m(P279)\u001b[39m\u001b[90m: \u001b[39mbiopolymer \u001b[90m(Q422649)\u001b[39m | nutrient \u001b[90m(Q181394)\u001b[39m | gene product \u001b[90m(Q424689)\u001b[39m | polyamide \u001b[90m(Q145273)\u001b[39m | biological macromolecule \u001b[90m(Q66560214)\u001b[39m\n" - ] - } - ], - "source": [ - "cmd = \"wd u {}\".format(\" \".join(remove_classes.split(\",\")))\n", - "!{cmd}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Compose the kypher command to remove the classes" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "zcat: failed to read stdin: Input/output error\n" - ] - } - ], - "source": [ - "!zcat < {isa} | head | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the command, the items to remove will be in file `{temp}/items.remove.tsv.gz`" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-19 09:32:42 sqlstore]: IMPORT 
graph directly into table graph_15 from /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/derived.isa.tsv.gz ...\n", - "Exception in thread background thread for pid 30155:\n", - "Traceback (most recent call last):\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/threading.py\", line 926, in _bootstrap_inner\n", - " self.run()\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/threading.py\", line 870, in run\n", - " self._target(*self._args, **self._kwargs)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 1662, in wrap\n", - " fn(*args, **kwargs)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 2606, in background_thread\n", - " handle_exit_code(exit_code)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 2304, in fn\n", - " return self.command.handle_command_exit_code(exit_code)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 877, in handle_command_exit_code\n", - " raise exc\n", - "sh.ErrorReturnCode_1: \n", - "\n", - " RAN: /usr/bin/gunzip -c '/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/derived.isa.tsv.gz'\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "gunzip: failed to read stdin: Input/output error\n", - "gunzip: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/derived.isa.tsv.gz: uncompress failed\n", - "\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/cli/query.py\", line 148, in run\n", - " index=options.get('index'))\n", - " File 
\"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/query.py\", line 182, in __init__\n", - " store.add_graph(file)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 565, in add_graph\n", - " self.import_graph_data_via_import(table, file)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 652, in import_graph_data_via_import\n", - " sqlproc.wait()\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 856, in wait\n", - " self.process._stdin_process.command.wait()\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 856, in wait\n", - " self.process._stdin_process.command.wait()\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 849, in wait\n", - " self.handle_command_exit_code(exit_code)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/sh-1.13.1-py3.7.egg/sh.py\", line 877, in handle_command_exit_code\n", - " raise exc\n", - "sh.ErrorReturnCode_1: \n", - "\n", - " RAN: /usr/bin/gunzip -c '/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/derived.isa.tsv.gz'\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "gunzip: failed to read stdin: Input/output error\n", - "gunzip: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/derived.isa.tsv.gz: uncompress failed\n", - "\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/exceptions.py\", line 46, in 
__call__\n", - " return_code = func(*args, **kwargs) or 0\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/cli/query.py\", line 180, in run\n", - " raise KGTKException(str(e) + '\\n')\n", - "kgtk.exceptions.KGTKException: \n", - "\n", - " RAN: /usr/bin/gunzip -c '/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/derived.isa.tsv.gz'\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "gunzip: failed to read stdin: Input/output error\n", - "gunzip: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/derived.isa.tsv.gz: uncompress failed\n", - "\n", - "\n", - "\n", - "\n", - " RAN: /usr/bin/gunzip -c '/Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/derived.isa.tsv.gz'\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "gunzip: failed to read stdin: Input/output error\n", - "gunzip: /Volumes/GoogleDrive/Shared drives/KGTK/datasets/wikidata-20200803-v4/derived.isa.tsv.gz: uncompress failed\n", - "\n", - "\n" - ] - } - ], - "source": [ - "classes = \", \".join(list(map(lambda x: '\"{}\"'.format(x), remove_classes.replace(\" \", \"\").split(\",\"))))\n", - "!{kypher} -i {isa} -i {p279star} -o {temp}/items.remove.tsv.gz \\\n", - "--match 'isa: (n1)-[:isa]->(c), P279star: (c)-[]->(class)' \\\n", - "--where 'class in [{classes}]' \\\n", - "--return 'distinct n1, \"p31_p279star\" as label, class as node2'\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Preview the file" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "!zcat < {temp}/items.remove.tsv.gz | head | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Collect all the classes of items we will remove, just as a sanity check" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "Traceback (most recent call last):\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/cli/query.py\", line 148, in run\n", - " index=options.get('index'))\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/query.py\", line 182, in __init__\n", - " store.add_graph(file)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 565, in add_graph\n", - " self.import_graph_data_via_import(table, file)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 630, in import_graph_data_via_import\n", - " if header.endswith('\\r\\n'):\n", - "TypeError: endswith first arg must be bytes or a tuple of bytes, not str\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/exceptions.py\", line 46, in __call__\n", - " return_code = func(*args, **kwargs) or 0\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/cli/query.py\", line 180, in run\n", - " raise KGTKException(str(e) + '\\n')\n", - "kgtk.exceptions.KGTKException: endswith first arg must be bytes or a tuple of bytes, not str\n", - "\n", - "endswith first arg must be bytes or a tuple of bytes, not str\n", - "\n" - ] - } - ], - "source": [ - "!{kypher} -i {temp}/items.remove.tsv.gz \\\n", - "--match '()-[]->(n2)' \\\n", - "--return 'distinct n2' \\\n", - "--limit 10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create the reduced edges file" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - 
"### Remove the items from the all.tsv and the label, alias and description files\n", - "We will be left with `reduced` files where the edges do not have the unwanted items. We have to remove them from the node1 and node2 positions, so we need to run the ifnotexists commands twice.\n", - "\n", - "Before we start preview the files to see the column headings and check whether they look sorted." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: kgtk [options] command [ / command]*\n", - "kgtk: error: unrecognized arguments: sort2\n" - ] - } - ], - "source": [ - "!$kgtk sort2 -i {temp}/items.remove.tsv.gz -o {temp}/items.remove.sorted.tsv.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/bin/bash: /Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/items.remove.sorted.tsv.gz: No such file or directory\n" - ] - } - ], - "source": [ - "!zcat < {temp}/items.remove.sorted.tsv.gz | head | col" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/bin/bash: /Volumes/GoogleDrive/Shared\\ drives/KGTK/datasets/wikidata-20200803-v4/claims.tsv.gz: No such file or directory\n" - ] - } - ], - "source": [ - "!zcat < \"{claims}\" | head -5 | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Remove from the full set of edges those edges that have a `node1` present in `items.remove.sorted.tsv`" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: kgtk [options] command [ / command]*\n", - "kgtk: error: unrecognized arguments: ifnotexists --filter-on 
/Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/items.remove.sorted.tsv.gz --input-keys node1 --filter-keys node1 --presorted\n" - ] - } - ], - "source": [ - "!$kgtk ifnotexists -i \"{claims}\" -o {temp}/item.edges.reduced.tsv.gz \\\n", - "--filter-on {temp}/items.remove.sorted.tsv.gz \\\n", - "--input-keys node1 \\\n", - "--filter-keys node1 \\\n", - "--presorted " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the remaining edges, remove those that have a `node2` present in `items.remove.sorted.tsv`" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: kgtk [options] command [ / command]*\n", - "kgtk: error: unrecognized arguments: sort2 --columns node2 label node1 id\n" - ] - } - ], - "source": [ - "!$kgtk sort2 -i {temp}/item.edges.reduced.tsv.gz -o {temp}/item.edges.reduced.sorted.tsv.gz \\\n", - "--columns node2 label node1 id" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: kgtk [options] command [ / command]*\n", - "kgtk: error: unrecognized arguments: ifnotexists --filter-on /Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/items.remove.sorted.tsv.gz --input-keys node2 --filter-keys node1 --presorted\n" - ] - } - ], - "source": [ - "!$kgtk ifnotexists -i {temp}/item.edges.reduced.sorted.tsv.gz -o {temp}/item.edges.reduced.2.tsv.gz \\\n", - "--filter-on {temp}/items.remove.sorted.tsv.gz \\\n", - "--input-keys node2 \\\n", - "--filter-keys node1 \\\n", - "--presorted " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a file with the labels" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: kgtk [options] command [ / command]*\n", - 
"kgtk: error: unrecognized arguments: ifnotexists --filter-on /Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/items.remove.sorted.tsv.gz --input-keys node1 --filter-keys node1 --presorted\n" - ] - } - ], - "source": [ - "!$kgtk ifnotexists -i {labels} -o {temp}/label.edges.reduced.tsv.gz \\\n", - "--filter-on {temp}/items.remove.sorted.tsv.gz \\\n", - "--input-keys node1 \\\n", - "--filter-keys node1 \\\n", - "--presorted" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a file with the aliases" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: kgtk [options] command [ / command]*\n", - "kgtk: error: unrecognized arguments: ifnotexists --filter-on /Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/items.remove.sorted.tsv.gz --input-keys node1 --filter-keys node1 --presorted\n" - ] - } - ], - "source": [ - "!$kgtk ifnotexists -i {aliases} -o {temp}/alias.edges.reduced.tsv.gz \\\n", - "--filter-on {temp}/items.remove.sorted.tsv.gz \\\n", - "--input-keys node1 \\\n", - "--filter-keys node1 \\\n", - "--presorted" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a file with the descriptions" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: kgtk [options] command [ / command]*\n", - "kgtk: error: unrecognized arguments: ifnotexists --filter-on /Users/pedroszekely/Downloads/kypher/temp.wikidata_os_v5/items.remove.sorted.tsv.gz --input-keys node1 --filter-keys node1 --presorted\n" - ] - } - ], - "source": [ - "!$kgtk ifnotexists -i {descriptions} -o {temp}/description.edges.reduced.tsv.gz \\\n", - "--filter-on {temp}/items.remove.sorted.tsv.gz \\\n", - "--input-keys node1 \\\n", - "--filter-keys node1 \\\n", - "--presorted" - ] - }, - { - "cell_type": "markdown", 
- "metadata": {}, - "source": [ - "### Produce the output files for claims, labels, aliases and descriptions" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: kgtk [options] command [ / command]*\n", - "kgtk: error: unrecognized arguments: sort2\n" - ] - } - ], - "source": [ - "!$kgtk sort2 -i {temp}/item.edges.reduced.2.tsv.gz -o {out}/claims.tsv.gz " - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: kgtk [options] command [ / command]*\n", - "kgtk: error: unrecognized arguments: sort2\n" - ] - } - ], - "source": [ - "!$kgtk sort2 -i {temp}/label.edges.reduced.tsv.gz -o {out}/labels.en.tsv.gz " - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: kgtk [options] command [ / command]*\n", - "kgtk: error: unrecognized arguments: sort2\n" - ] - } - ], - "source": [ - "!$kgtk sort2 -i {temp}/alias.edges.reduced.tsv.gz -o {out}/aliases.en.tsv.gz " - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: kgtk [options] command [ / command]*\n", - "kgtk: error: unrecognized arguments: sort2\n" - ] - } - ], - "source": [ - "!$kgtk sort2 -i {temp}/description.edges.reduced.tsv.gz -o {out}/descriptions.en.tsv.gz " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sanity checks to see if it looks reasonable" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create the reduced qualifiers file\n", - "We do this by finding all the ids of the reduced edges file, and then selecting out from `qual.tsv`\n", - "\n", - "We need to join by id, so we need to sort both files by id, 
node1, label, node2:\n", - "\n", - "- `{quals}` \n", - "- `{out}/claims.tsv.gz` " - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/bin/bash: /Volumes/GoogleDrive/Shared\\ drives/KGTK/datasets/wikidata-20200803-v4/qualifiers.tsv.gz: No such file or directory\n" - ] - } - ], - "source": [ - "!zcat < \"{quals}\" | head | column -t -s $'\\t' " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run `ifexists` to select out the quals for the edges in `{out}/wikidataos.qual.tsv.gz`. Note that we use `node1` in the qualifier file, matching to `id` in the `wikidataos.all.tsv` file." - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: kgtk [options] command [ / command]*\n", - "kgtk: error: unrecognized arguments: ifexists --filter-on /Users/pedroszekely/Downloads/kypher/wikidata_os_v5/claims.tsv.gz --input-keys node1 --filter-keys id --presorted\n" - ] - } - ], - "source": [ - "!$kgtk ifexists -i \"{quals}\" -o {out}/qualifiers.tsv.gz \\\n", - "--filter-on {out}/claims.tsv.gz \\\n", - "--input-keys node1 \\\n", - "--filter-keys id \\\n", - "--presorted" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Look at the final output for qualifiers" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/bin/bash: /Users/pedroszekely/Downloads/kypher/wikidata_os_v5/qualifiers.tsv.gz: No such file or directory\n" - ] - } - ], - "source": [ - "!zcat < {out}/qualifiers.tsv.gz | head | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Sanity checks" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-19 09:33:14 sqlstore]: IMPORT graph via csv.reader into table graph_16 from /Users/pedroszekely/Downloads/kypher/wikidata_os_v5/claims.tsv.gz ...\n", - "Traceback (most recent call last):\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 565, in add_graph\n", - " self.import_graph_data_via_import(table, file)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 613, in import_graph_data_via_import\n", - " raise KGTKException('only implemented for existing, named files')\n", - "kgtk.exceptions.KGTKException: only implemented for existing, named files\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/cli/query.py\", line 148, in run\n", - " index=options.get('index'))\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/query.py\", line 182, in __init__\n", - " store.add_graph(file)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 568, in add_graph\n", - " self.import_graph_data_via_csv(table, file)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 596, in import_graph_data_via_csv\n", - " with open_to_read(file) as inp:\n", - " File 
\"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 66, in open_to_read\n", - " return gzip.open(file, mode, encoding=enc)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/gzip.py\", line 58, in open\n", - " binary_file = GzipFile(filename, gz_mode, compresslevel)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/gzip.py\", line 168, in __init__\n", - " fileobj = self.myfileobj = builtins.open(filename, mode or 'rb')\n", - "FileNotFoundError: [Errno 2] No such file or directory: '/Users/pedroszekely/Downloads/kypher/wikidata_os_v5/claims.tsv.gz'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/exceptions.py\", line 46, in __call__\n", - " return_code = func(*args, **kwargs) or 0\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/cli/query.py\", line 180, in run\n", - " raise KGTKException(str(e) + '\\n')\n", - "kgtk.exceptions.KGTKException: [Errno 2] No such file or directory: '/Users/pedroszekely/Downloads/kypher/wikidata_os_v5/claims.tsv.gz'\n", - "\n", - "[Errno 2] No such file or directory: '/Users/pedroszekely/Downloads/kypher/wikidata_os_v5/claims.tsv.gz'\n", - "\n" - ] - } - ], - "source": [ - "!{kypher} -i {out}/claims.tsv.gz \\\n", - "--match '(n1:Q368441)-[l]->(n2)' \\\n", - "--limit 10 \\\n", - "| col" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2020-11-19 09:33:15 sqlstore]: IMPORT graph via csv.reader into table graph_16 from /Users/pedroszekely/Downloads/kypher/wikidata_os_v5/claims.tsv.gz ...\n", - "Traceback (most recent call last):\n", - " File 
\"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 565, in add_graph\n", - " self.import_graph_data_via_import(table, file)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 613, in import_graph_data_via_import\n", - " raise KGTKException('only implemented for existing, named files')\n", - "kgtk.exceptions.KGTKException: only implemented for existing, named files\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/cli/query.py\", line 148, in run\n", - " index=options.get('index'))\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/query.py\", line 182, in __init__\n", - " store.add_graph(file)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 568, in add_graph\n", - " self.import_graph_data_via_csv(table, file)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 596, in import_graph_data_via_csv\n", - " with open_to_read(file) as inp:\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/kypher/sqlstore.py\", line 66, in open_to_read\n", - " return gzip.open(file, mode, encoding=enc)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/gzip.py\", line 58, in open\n", - " binary_file = GzipFile(filename, gz_mode, compresslevel)\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/gzip.py\", line 168, in __init__\n", - " fileobj = self.myfileobj = builtins.open(filename, 
mode or 'rb')\n", - "FileNotFoundError: [Errno 2] No such file or directory: '/Users/pedroszekely/Downloads/kypher/wikidata_os_v5/claims.tsv.gz'\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/exceptions.py\", line 46, in __call__\n", - " return_code = func(*args, **kwargs) or 0\n", - " File \"/Users/pedroszekely/opt/anaconda3/envs/kgtk/lib/python3.7/site-packages/kgtk-0.4.0-py3.7.egg/kgtk/cli/query.py\", line 180, in run\n", - " raise KGTKException(str(e) + '\\n')\n", - "kgtk.exceptions.KGTKException: [Errno 2] No such file or directory: '/Users/pedroszekely/Downloads/kypher/wikidata_os_v5/claims.tsv.gz'\n", - "\n", - "[Errno 2] No such file or directory: '/Users/pedroszekely/Downloads/kypher/wikidata_os_v5/claims.tsv.gz'\n", - "\n" - ] - } - ], - "source": [ - "!{kypher} -i {out}/claims.tsv.gz \\\n", - "--match '(n1:P131)-[l]->(n2)' \\\n", - "--limit 10 \\\n", - "| col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compute the derived files using the `Wikidata Useful Files` Jupyter notebook" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Compute `claims.wikibase-item.tsv.gz` which would be computed by the Wikidata partitioner, but we are not using it here yet" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/bin/bash: /Volumes/GoogleDrive/Shared\\ drives/KGTK/datasets/wikidata-20200803-v4/metadata.property.datatypes.tsv.gz: No such file or directory\n" - ] - } - ], - "source": [ - "!zcat < \"{datatypes}\" | head | col" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/bin/bash: {kgtk}: 
command not found\n" - ] - } - ], - "source": [ - "!{kypher} -i {out}/claims.tsv.gz -i \"{datatypes}\" -o {out}/claims.wikibase-item.tsv.gz \\\n", - "--match 'claims: (n1)-[l {label: p}]->(n2), datatypes: (p)-[:datatype]->(:`wikibase-item`)' \\\n", - "--return 'l as id, n1 as node1, p as label, n2 as node2' \\\n", - "--order-by 'l' " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To compute the derived files we use papermill to run the `Wikidata Useful Files` notebook." - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "ename": "TypeError", - "evalue": "unsupported operand type(s) for +: 'set' and 'str'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m pm.execute_notebook(\n\u001b[1;32m 2\u001b[0m \u001b[0mnotebooks_folder\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0museful_files_notebook\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;34m{\u001b[0m\u001b[0mtemp\u001b[0m\u001b[0;34m}\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"/useful_files_notebook_output.ipynb\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m parameters=dict(\n\u001b[1;32m 5\u001b[0m \u001b[0moutput_path\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"/Users/pedroszekely/Downloads/kypher\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for +: 'set' and 'str'" - ] - } - ], - "source": [ - "pm.execute_notebook(\n", - " notebooks_folder + useful_files_notebook,\n", - " {temp} + 
\"/useful_files_notebook_output.ipynb\",\n", - " parameters=dict(\n", - " output_path=\"/Users/pedroszekely/Downloads/kypher\",\n", - " output_folder=\"wikidata_os_v1\",\n", - " temp_folder=\"temp.wikidata_os_v1\",\n", - " wiki_root_folder=\"/Users/pedroszekely/Downloads/kypher/wikidata_os_v1/\",\n", - " claims_file=\"claims.tsv.gz\",\n", - " label_file=\"labels.en.tsv.gz\",\n", - " alias_file=\"aliases.en.tsv.gz\",\n", - " description_file=\"descriptions.en.tsv.gz\",\n", - " item_file=\"claims.wikibase-item.tsv.gz\",\n", - " cache_path=\"/Users/pedroszekely/Downloads/kypher/temp.useful_wikidata_files_v4\",\n", - " delete_database=False,\n", - " compute_pagerank=False\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Look at the columns so we know how to construct the kypher query" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary of results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!ls -lh {out}/*wikidataos.*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!zcat < {out}/wikidataos.all.tsv.gz | wc" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Verification" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The edges file must contain edges for properties, this is not the case on 2020-11-10`\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk} -i \"{claims}\" \\\n", - "--match '(:P10)-[l]->(n2)' \\\n", - "--limit 10" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [] - } - ], - "metadata": { - "celltoolbar": "Tags", - "kernelspec": { - "display_name": "kgtk", - "language": "python", - "name": "kgtk" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/use-cases/Wikidata Subsets.ipynb b/use-cases/Wikidata-Subsets.ipynb similarity index 90% rename from use-cases/Wikidata Subsets.ipynb rename to use-cases/Wikidata-Subsets.ipynb index 4b30491f4..d62873e81 100644 --- a/use-cases/Wikidata Subsets.ipynb +++ b/use-cases/Wikidata-Subsets.ipynb @@ -56,7 +56,11 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "tags": [ + "parameters" + ] + }, "outputs": [], "source": [ "input_path = \"/data/amandeep/wikidata-20220505/import-wikidata/data\"\n", @@ -72,9 +76,7 @@ "# Classes to remove\n", "remove_classes = \"Q7318358,Q13442814\"\n", "\n", - "compute_pagerank = True\n", - "\n", - "useful_files_notebook = \"Wikidata Useful Files.ipynb\"\n", + "useful_files_notebook = \"Wikidata-Useful-Files.ipynb\"\n", "notebooks_folder = f\"{kgtk_path}/use-cases\"\n", "\n", "languages = \"en,ru,es,zh-cn,de,it,nl,pl,fr,pt,sv\"\n" @@ -172,84 +174,6 @@ "ck.load_files_into_cache()" ] }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Graph Cache:\n", - "DB file: /data/amandeep/wikidata-20220505-dwd-v4/temp.wikidata-20220505-dwd-v4/wikidata.sqlite3.db\n", - " size: 535.50 GB \tfree: 0 Bytes \tmodified: 2022-05-14 13:07:10\n", - "\n", - "KGTK File Information:\n", - "/data/amandeep/wikidata-20220505-dwd-v4/temp.wikidata-20220505-dwd-v4/items.remove.tsv.gz:\n", - " size: 99.33 MB \tmodified: 2022-05-14 12:47:54 \tgraph: graph_11\n", - 
"alias_all:\n", - " size: 2.07 GB \tmodified: 2022-05-11 06:01:24 \tgraph: graph_3\n", - "claims:\n", - " size: 27.33 GB \tmodified: 2022-05-11 05:55:01 \tgraph: graph_1\n", - "datatypes:\n", - " size: 54.46 KB \tmodified: 2022-05-11 07:29:02 \tgraph: graph_7\n", - "description_all:\n", - " size: 23.66 GB \tmodified: 2022-05-11 07:08:19 \tgraph: graph_4\n", - "isa:\n", - " size: 303.01 MB \tmodified: 2022-05-11 14:42:27 \tgraph: graph_9\n", - "item:\n", - " size: 9.63 GB \tmodified: 2022-05-14 07:47:33 \tgraph: graph_5\n", - "label_all:\n", - " size: 7.88 GB \tmodified: 2022-05-11 07:21:33 \tgraph: graph_2\n", - "p279star:\n", - " size: 698.89 MB \tmodified: 2022-05-11 14:13:56 \tgraph: graph_10\n", - "qualifiers:\n", - " size: 5.36 GB \tmodified: 2022-05-11 05:59:30 \tgraph: graph_6\n", - "types:\n", - " size: 455.79 MB \tmodified: 2022-05-11 07:29:39 \tgraph: graph_8\n", - "\n", - "Graph Table Information:\n", - "graph_1:\n", - " size: 119.52 GB \tcreated: 2022-05-14 09:24:57\n", - " header: ['id', 'node1', 'label', 'node2', 'rank', 'node2;wikidatatype']\n", - "graph_10:\n", - " size: 8.74 GB \tcreated: 2022-05-14 12:06:03\n", - " header: ['node1', 'label', 'node2', 'id']\n", - "graph_11:\n", - " size: 1.54 GB \tcreated: 2022-05-14 13:07:10\n", - " header: ['node1', 'label', 'node2']\n", - "graph_2:\n", - " size: 54.66 GB \tcreated: 2022-05-14 09:47:37\n", - " header: ['id', 'node1', 'label', 'node2', 'lang']\n", - "graph_3:\n", - " size: 11.91 GB \tcreated: 2022-05-14 09:52:44\n", - " header: ['id', 'node1', 'label', 'node2', 'lang']\n", - "graph_4:\n", - " size: 238.50 GB \tcreated: 2022-05-14 11:24:07\n", - " header: ['id', 'node1', 'label', 'node2', 'lang']\n", - "graph_5:\n", - " size: 56.81 GB \tcreated: 2022-05-14 11:47:26\n", - " header: ['id', 'node1', 'label', 'node2', 'rank', 'node2;wikidatatype']\n", - "graph_6:\n", - " size: 33.64 GB \tcreated: 2022-05-14 11:59:40\n", - " header: ['id', 'node1', 'label', 'node2', 'node2;wikidatatype']\n", - 
"graph_7:\n", - " size: 476.00 KB \tcreated: 2022-05-14 11:59:40\n", - " header: ['id', 'node1', 'label', 'node2']\n", - "graph_8:\n", - " size: 4.38 GB \tcreated: 2022-05-14 12:01:43\n", - " header: ['id', 'node1', 'label', 'node2']\n", - "graph_9:\n", - " size: 5.83 GB \tcreated: 2022-05-14 12:03:14\n", - " header: ['node1', 'label', 'node2']\n" - ] - } - ], - "source": [ - "!kgtk query --gc $STORE --show-cache" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -949,14 +873,16 @@ "outputs": [], "source": [ "pm.execute_notebook(\n", - " os.environ[\"USE_CASES_DIR\"] + \"/Wikidata Useful Files.ipynb\",\n", - " os.environ[\"TEMP\"] + \"/Wikidata Useful Files Out.ipynb\",\n", + " f'{os.environ[\"USE_CASES_DIR\"]}/{useful_files_notebook}',\n", + " os.environ[\"TEMP\"] + \"/Wikidata-Useful-Files-Out.ipynb\",\n", " parameters=dict(\n", " output_path = os.environ[\"OUT\"],\n", " input_path = os.environ[\"OUT\"],\n", - " kgtk_path = '/data/amandeep/Github/kgtk',\n", + " kgtk_path = kgtk_path,\n", " compute_pagerank=True,\n", " compute_degrees=True,\n", + " compute_isa_star=True,\n", + " compute_p31p279_star=True,\n", " debug=False\n", " )\n", ")\n" @@ -1080,9 +1006,9 @@ "metadata": { "celltoolbar": "Tags", "kernelspec": { - "display_name": "kgtk-env-ckg07", + "display_name": "kgtk-env", "language": "python", - "name": "kgtk-env-ckg07" + "name": "kgtk-env" }, "language_info": { "codemirror_mode": { @@ -1094,7 +1020,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.9.7" } }, "nbformat": 4, diff --git a/use-cases/Wikidata Useful Files.ipynb b/use-cases/Wikidata-Useful-Files.ipynb similarity index 89% rename from use-cases/Wikidata Useful Files.ipynb rename to use-cases/Wikidata-Useful-Files.ipynb index 4536df177..1e92f1f69 100644 --- a/use-cases/Wikidata Useful Files.ipynb +++ b/use-cases/Wikidata-Useful-Files.ipynb @@ -92,7 +92,10 @@ "\n", "compute_pagerank = False\n", "compute_degrees = False\n", 
- "debug = False" + "debug = False\n", + "compute_isa_star = False\n", + "compute_p31p279_star = False\n", + "files_for_cache = None" ] }, { @@ -102,7 +105,11 @@ "outputs": [], "source": [ "files = files.split(',')\n", - "languages = languages.split(',')" + "languages = languages.split(',')\n", + "if files_for_cache is None:\n", + " files_for_cache = files\n", + "else:\n", + " files_for_cache = files_for_cache.split(\",\")" ] }, { @@ -179,50 +186,7 @@ ], "source": [ "if graph_cache_path is None:\n", - " ck.load_files_into_cache()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Graph Cache:\n", - "DB file: /data/amandeep/wikidata-20220505/import-wikidata/data/useful-files/temp.useful-files/wikidata.sqlite3.db\n", - " size: 424.57 GB \tfree: 0 Bytes \tmodified: 2022-05-11 11:30:56\n", - "\n", - "KGTK File Information:\n", - "alias_all:\n", - " size: 2.07 GB \tmodified: 2022-05-11 06:01:24 \tgraph: graph_3\n", - "claims:\n", - " size: 27.33 GB \tmodified: 2022-05-11 05:55:01 \tgraph: graph_1\n", - "description_all:\n", - " size: 23.66 GB \tmodified: 2022-05-11 07:08:19 \tgraph: graph_4\n", - "label_all:\n", - " size: 7.88 GB \tmodified: 2022-05-11 07:21:33 \tgraph: graph_2\n", - "\n", - "Graph Table Information:\n", - "graph_1:\n", - " size: 119.52 GB \tcreated: 2022-05-11 09:35:56\n", - " header: ['id', 'node1', 'label', 'node2', 'rank', 'node2;wikidatatype']\n", - "graph_2:\n", - " size: 54.66 GB \tcreated: 2022-05-11 09:57:48\n", - " header: ['id', 'node1', 'label', 'node2', 'lang']\n", - "graph_3:\n", - " size: 11.91 GB \tcreated: 2022-05-11 10:02:43\n", - " header: ['id', 'node1', 'label', 'node2', 'lang']\n", - "graph_4:\n", - " size: 238.50 GB \tcreated: 2022-05-11 11:30:56\n", - " header: ['id', 'node1', 'label', 'node2', 'lang']\n" - ] - } - ], - "source": [ - "!kgtk query --show-cache" + " ck.load_files_into_cache(files=files_for_cache)" ] }, { @@ 
-756,13 +720,14 @@ "metadata": {}, "outputs": [], "source": [ - "!$kypher -i \"$P279STAR\" --as P279star -i \"$ISA\" --as isa \\\n", - "--match '\\\n", - " isa: (n1)-[]->(n2), \\\n", - " P279star: (n2)-[]->(n3)' \\\n", - "--return 'distinct n1 as node1, \"isa_star\" as label, n3 as node2' \\\n", - "--order-by 'n1' \\\n", - "-o \"$TEMP\"/derived.isastar_1.tsv.gz" + "if compute_isa_star:\n", + " !$kypher -i \"$P279STAR\" --as P279star -i \"$ISA\" --as isa \\\n", + " --match '\\\n", + " isa: (n1)-[]->(n2), \\\n", + " P279star: (n2)-[]->(n3)' \\\n", + " --return 'distinct n1 as node1, \"isa_star\" as label, n3 as node2' \\\n", + " --order-by 'n1' \\\n", + " -o \"$TEMP\"/derived.isastar_1.tsv.gz" ] }, { @@ -778,10 +743,11 @@ "metadata": {}, "outputs": [], "source": [ - "kgtk(\"\"\"add-id \n", - " --id-style wikidata \n", - " -i \"$TEMP\"/derived.isastar_1.tsv.gz \n", - " -o \"$OUT\"/derived.isastar.tsv.gz\"\"\")" + "if compute_isa_star:\n", + " kgtk(\"\"\"add-id \n", + " --id-style wikidata \n", + " -i \"$TEMP\"/derived.isastar_1.tsv.gz \n", + " -o \"$OUT\"/derived.isastar.tsv.gz\"\"\")" ] }, { @@ -797,13 +763,14 @@ "metadata": {}, "outputs": [], "source": [ - "!$kypher -i claims -i \"$P279STAR\" --as P279star \\\n", - "--match '\\\n", - " claims: (n1)-[:P31]->(n2), \\\n", - " P279star: (n2)-[]->(n3)' \\\n", - "--return 'distinct n1 as node1, \"P31P279star\" as label, n3 as node2' \\\n", - "--order-by 'n1' \\\n", - "-o \"$TEMP\"/derived.P31P279star.tsv.gz" + "if compute_p31p279_star:\n", + " !$kypher -i claims -i \"$P279STAR\" --as P279star \\\n", + " --match '\\\n", + " claims: (n1)-[:P31]->(n2), \\\n", + " P279star: (n2)-[]->(n3)' \\\n", + " --return 'distinct n1 as node1, \"P31P279star\" as label, n3 as node2' \\\n", + " --order-by 'n1' \\\n", + " -o \"$TEMP\"/derived.P31P279star.tsv.gz" ] }, { @@ -819,10 +786,11 @@ "metadata": {}, "outputs": [], "source": [ - "kgtk(\"\"\"add-id \n", - " --id-style wikidata \n", - " -i \"$TEMP\"/derived.P31P279star.tsv.gz\n", - " -o 
\"$OUT\"/derived.P31P279star.tsv.gz\"\"\")" + "if compute_p31p279_star:\n", + " kgtk(\"\"\"add-id \n", + " --id-style wikidata \n", + " -i \"$TEMP\"/derived.P31P279star.tsv.gz\n", + " -o \"$OUT\"/derived.P31P279star.tsv.gz\"\"\")" ] }, { @@ -1008,10 +976,11 @@ "metadata": {}, "outputs": [], "source": [ - "!$kypher -i claims -o $TEMP/metadata.out_degree.tsv.gz \\\n", - "--match '(n1)-[l]->()' \\\n", - "--order-by 'n1' \\\n", - "--return 'distinct n1 as node1, count(distinct l) as node2, \"Pout_degree\" as label' " + "if compute_degrees:\n", + " !$kypher -i claims -o $TEMP/metadata.out_degree.tsv.gz \\\n", + " --match '(n1)-[l]->()' \\\n", + " --order-by 'n1' \\\n", + " --return 'distinct n1 as node1, count(distinct l) as node2, \"Pout_degree\" as label' " ] }, { @@ -1020,9 +989,10 @@ "metadata": {}, "outputs": [], "source": [ - "kgtk(\"\"\"add-id --id-style wikidata \n", - " -i $TEMP/metadata.out_degree.tsv.gz \n", - " -o $OUT/metadata.out_degree.tsv.gz\"\"\")" + "if compute_degrees:\n", + " kgtk(\"\"\"add-id --id-style wikidata \n", + " -i $TEMP/metadata.out_degree.tsv.gz \n", + " -o $OUT/metadata.out_degree.tsv.gz\"\"\")" ] }, { @@ -1050,7 +1020,8 @@ } ], "source": [ - "!zcat < $OUT/metadata.out_degree.tsv.gz | head | col" + "if compute_degrees:\n", + " !zcat < $OUT/metadata.out_degree.tsv.gz | head | col" ] }, { @@ -1066,10 +1037,11 @@ "metadata": {}, "outputs": [], "source": [ - "!$kypher -i claims -o $TEMP/metadata.in_degree.tsv.gz \\\n", - "--match '()-[l]->(n2 {`wikidatatype`:\"wikibase-item\"})' \\\n", - "--return 'distinct n2 as node1, count(distinct l) as node2, \"Pin_degree\" as label' \\\n", - "--order-by 'n2'" + "if compute_degrees:\n", + " !$kypher -i claims -o $TEMP/metadata.in_degree.tsv.gz \\\n", + " --match '()-[l]->(n2 {`wikidatatype`:\"wikibase-item\"})' \\\n", + " --return 'distinct n2 as node1, count(distinct l) as node2, \"Pin_degree\" as label' \\\n", + " --order-by 'n2'" ] }, { @@ -1078,9 +1050,10 @@ "metadata": {}, "outputs": [], "source": 
[ - "kgtk(\"\"\"add-id --id-style wikidata \n", - " -i $TEMP/metadata.in_degree.tsv.gz\n", - " -o $OUT/metadata.in_degree.tsv.gz\"\"\")" + "if compute_degrees:\n", + " kgtk(\"\"\"add-id --id-style wikidata \n", + " -i $TEMP/metadata.in_degree.tsv.gz\n", + " -o $OUT/metadata.in_degree.tsv.gz\"\"\")" ] }, { @@ -1108,7 +1081,8 @@ } ], "source": [ - "!zcat < $OUT/metadata.in_degree.tsv.gz | head | col" + "if compute_degrees:\n", + " !zcat < $OUT/metadata.in_degree.tsv.gz | head | col" ] }, { @@ -1124,10 +1098,11 @@ "metadata": {}, "outputs": [], "source": [ - "!$kypher -i $OUT/metadata.in_degree.tsv.gz -o $OUT/statistics.in_degree.distribution.tsv \\\n", - "--match '(n1)-[]->(n2)' \\\n", - "--return 'distinct n2 as Pin_degree, count(distinct n1) as count, \"count\" as label' \\\n", - "--order-by 'cast(n2, integer)' " + "if compute_degrees:\n", + " !$kypher -i $OUT/metadata.in_degree.tsv.gz -o $OUT/statistics.in_degree.distribution.tsv \\\n", + " --match '(n1)-[]->(n2)' \\\n", + " --return 'distinct n2 as Pin_degree, count(distinct n1) as count, \"count\" as label' \\\n", + " --order-by 'cast(n2, integer)' " ] }, { @@ -1153,7 +1128,8 @@ } ], "source": [ - "!head $OUT/statistics.in_degree.distribution.tsv | col" + "if compute_degrees:\n", + " !head $OUT/statistics.in_degree.distribution.tsv | col" ] }, { @@ -1162,10 +1138,11 @@ "metadata": {}, "outputs": [], "source": [ - "!$kypher -i $OUT/metadata.out_degree.tsv.gz -o $OUT/statistics.out_degree.distribution.tsv \\\n", - "--match '(n1)-[]->(n2)' \\\n", - "--return 'distinct n2 as Pout_degree, count(distinct n1) as count, \"count\" as label' \\\n", - "--order-by 'cast(n2, integer)' " + "if compute_degrees:\n", + " !$kypher -i $OUT/metadata.out_degree.tsv.gz -o $OUT/statistics.out_degree.distribution.tsv \\\n", + " --match '(n1)-[]->(n2)' \\\n", + " --return 'distinct n2 as Pout_degree, count(distinct n1) as count, \"count\" as label' \\\n", + " --order-by 'cast(n2, integer)' " ] }, { @@ -1191,7 +1168,8 @@ } ], "source": [ 
- "!head $OUT/statistics.out_degree.distribution.tsv | col" + "if compute_degrees:\n", + " !head $OUT/statistics.out_degree.distribution.tsv | col" ] }, { diff --git a/use-cases/create-wikidata-dwd.ipynb b/use-cases/create-wikidata-dwd.ipynb new file mode 100644 index 000000000..2003f588f --- /dev/null +++ b/use-cases/create-wikidata-dwd.ipynb @@ -0,0 +1,221 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "93f651b9-d27d-40bb-b531-cfabad740521", + "metadata": {}, + "outputs": [], + "source": [ + "import papermill as pm\n", + "\n", + "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", + "from kgtk.functions import kgtk, kypher" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "217db6b6-3e26-47f0-ba62-cbce9270021b", + "metadata": {}, + "outputs": [], + "source": [ + "input_path = \"/data/amandeep\"\n", + "output_path = \"/data/amandeep\"\n", + "project_name = \"create-wikidata-dwd\"\n", + "\n", + "kgtk_path = \"/data/amandeep/Github/kgtk\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1c90cb1-baab-4387-bad5-62d703d84ec1", + "metadata": {}, + "outputs": [], + "source": [ + "ck = ConfigureKGTK([], kgtk_path=kgtk_path)\n", + "ck.configure_kgtk(input_graph_path=input_path,\n", + " output_path=output_path,\n", + " project_name=project_name,\n", + " graph_cache_path=None)" + ] + }, + { + "cell_type": "markdown", + "id": "7f51bc57-e01e-4bf5-bcd6-d3c0b50b0c84", + "metadata": {}, + "source": [ + "## Run the Import Wikidata Notebook " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f247bcc7-29a6-4756-9b4e-73a84693af20", + "metadata": {}, + "outputs": [], + "source": [ + "# Parameters for Import Wikidata\n", + "json_file_path = \"/data/amandeep/wikidata-20220519\"\n", + "import_wikidata_path = \"/data/amandeep/wikidata-20220519\"\n", + "wikidata_project_name = \"import-wikidata\"\n", + "wikidata_json_file = \"latest-all.json.bz2\"\n", + "sort_command = 'sort'" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "id": "8a8a6cc0-1c9b-42f2-9f50-9366820dbf58", + "metadata": {}, + "outputs": [], + "source": [ + "pm.execute_notebook(\n", + " \"import-wikidata.ipynb\",\n", + " os.environ[\"TEMP\"] + \"/import-wikidata.out.ipynb\",\n", + " parameters=dict(\n", + " input_path = json_file_path,\n", + " output_path = import_wikidata_path,\n", + " project_name = wikidata_project_name,\n", + " wikidata_json_file = wikidata_json_file,\n", + " kgtk_path = kgtk_path,\n", + " sort_command = sort_command\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7373d403-c73b-47ee-a7d2-13054f5e1516", + "metadata": {}, + "source": [ + "## Run the Useful Files Notebook to compute `isa` and `p279star` files only" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3d42da9-ac53-4f77-85aa-fe889b87c8f0", + "metadata": {}, + "outputs": [], + "source": [ + "# Parameters for First run on Useful Files\n", + "first_useful_files_input_path = f\"{import_wikidata_path}/{wikidata_project_name}\"\n", + "first_useful_files_output_path = import_wikidata_path\n", + "first_useful_files_project_name = \"useful-files\"\n", + "first_useful_files = 'claims,label_all,alias_all,description_all'\n", + "first_useful_files_for_cache = 'claims'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31ad8208-eb1a-41f8-99b4-354f6dcbec07", + "metadata": {}, + "outputs": [], + "source": [ + "pm.execute_notebook(\n", + " \"Wikidata-Useful-Files.ipynb\",\n", + " os.environ[\"TEMP\"] + \"/Wikidata-Useful-Files.out.ipynb\",\n", + " parameters=dict(\n", + " input_path = first_useful_files_input_path,\n", + " output_path = first_useful_files_output_path,\n", + " project_name = first_useful_files_project_name,\n", + " kgtk_path = kgtk_path,\n", + " files = first_useful_files,\n", + " files_for_cache=first_useful_files_for_cache,\n", + " compute_pagerank=False,\n", + " compute_degrees=False,\n", + " debug=False,\n", + " 
compute_isa_star=False,\n", + " compute_p31p279_star=False\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b14d680-9631-401a-8903-7245a9cad80d", + "metadata": {}, + "outputs": [], + "source": [ + "!cp $import_wikidata_path/$first_useful_files_project_name/derived.isa.tsv.gz $import_wikidata_path\n", + "!cp $import_wikidata_path/$first_useful_files_project_name/derived.P279star.tsv.gz $import_wikidata_path" + ] + }, + { + "cell_type": "markdown", + "id": "54f7dab3-1528-4049-9e8a-e9b6aeff5aa1", + "metadata": {}, + "source": [ + "## Run Wikidata Subsets Notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6341b690-c846-4482-adcc-ba30dc69f267", + "metadata": {}, + "outputs": [], + "source": [ + "subset_input_path = import_wikidata_path\n", + "subset_output_path = \"/data/amandeep\"\n", + "\n", + "\n", + "subset_project_name = \"wikidata-20220519-dwd-v5\"\n", + "\n", + "subset_files = 'claims,label_all,alias_all,description_all,item,qualifiers,datatypes,types,isa,p279star'\n", + "\n", + "# Classes to remove\n", + "remove_classes = \"Q7318358,Q13442814\"\n", + "\n", + "languages = \"en,ru,es,zh-cn,de,it,nl,pl,fr,pt,sv\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce354040-80ce-42bc-a868-dafe324131d1", + "metadata": {}, + "outputs": [], + "source": [ + "pm.execute_notebook(\n", + " \"Wikidata-Subsets.ipynb\",\n", + " os.environ[\"TEMP\"] + \"/Wikidata-Subsets.out.ipynb\",\n", + " parameters=dict(\n", + " input_path = subset_input_path,\n", + " output_path = subset_output_path,\n", + " project_name = subset_project_name,\n", + " kgtk_path = kgtk_path,\n", + " files = subset_files,\n", + " remove_classes = remove_classes,\n", + " languages = languages\n", + " )\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "kgtk-env", + "language": "python", + "name": "kgtk-env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 
3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/use-cases/import-wikidata.ipynb b/use-cases/import-wikidata.ipynb index 348a5c3f4..5156c6c31 100644 --- a/use-cases/import-wikidata.ipynb +++ b/use-cases/import-wikidata.ipynb @@ -35,20 +35,24 @@ "cell_type": "code", "execution_count": 4, "id": "1a6cc50d-2a13-4eca-95be-486767de63ec", - "metadata": {}, + "metadata": { + "tags": [ + "parameters" + ] + }, "outputs": [], "source": [ "# Parameters\n", "\n", "# Folder on local machine where to create the output and temporary folders\n", - "input_path = \"/Volumes/saggu-ssd/wikidata-2021-10-27\"\n", "input_path = \"/data/amandeep/wikidata-20220505\"\n", - "output_path = \"/Volumes/saggu-ssd/wikidata-2021-10-27-out\"\n", "output_path = \"/data/amandeep/wikidata-20220505\"\n", "project_name = \"import-wikidata\"\n", "\n", "kgtk_path = \"/data/amandeep/Github/kgtk\"\n", - "wikidata_json_file = \"latest-all.json.bz2\"" + "wikidata_json_file = \"latest-all.json.bz2\"\n", + "# sort_command = 'gsort'\n", + "sort_command = 'sort'" ] }, { @@ -120,23 +124,6 @@ "metadata": {}, "outputs": [], "source": [ - "os.environ['PATTERNDIR'] = f\"{kgtk_path}/wikidata/patterns\"\n", - "\n", - "os.environ['DATADIR'] = f\"{os.environ['OUT']}/data\"\n", - "\n", - "# Temporary files (unsorted) will be stored in in:\n", - "os.environ['TEMPDIR'] = f\"{os.environ['OUT']}/temp\"\n", - "\n", - "# The working log files will be stored in:\n", - "os.environ['LOGDIR'] = f\"{os.environ['OUT']}/logs\"\n", - "\n", - "# The count validation files will be stored in:\n", - "os.environ['COUNTDIR'] = f\"{os.environ['OUT']}/counts\"\n", - "\n", - "# Completed data products will be stored in:\n", - "os.environ['PRODUCTDIR'] = f\"{os.environ['OUT']}/product\"\n", - "\n", - "\n", "os.environ['WIKIDATA_ALL_JSON'] = 
f\"{os.environ['GRAPH']}/{wikidata_json_file}\"\n", "\n", "# Work file extensions\n", @@ -155,7 +142,7 @@ "#KGTK_FLAGS=\"--debug --timing --progress --progress-tty `tty`\"\n", "os.environ['KGTK_FLAGS'] = \"--debug --timing\"\n", "os.environ['VERBOSE'] = \"--verbose\"\n", - "os.environ['SORT_EXTRAS'] = f\"--parallel 6 --buffer-size 50% -T {os.environ['TEMPDIR']}\"\n", + "os.environ['SORT_EXTRAS'] = f\"--parallel 6 --buffer-size 50% -T {os.environ['TEMP']}\"\n", "\n", "# The Wikidata datatypes:\n", "WIKIDATATYPES = [ \n", @@ -206,21 +193,7 @@ "\t\"metadata.types\"]\n", "\n", "\n", - "os.environ['SORT_COMMAND'] = \"gsort\"\n", - "os.environ['SORT_COMMAND'] = \"sort\"" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "3218233f-37b7-4563-b6bf-aacaa73ecd08", - "metadata": {}, - "outputs": [], - "source": [ - "!mkdir -p ${DATADIR}\n", - "!mkdir -p ${TEMPDIR}\n", - "!mkdir -p ${LOGDIR}\n", - "!mkdir -p ${COUNTDIR}" + "os.environ['SORT_COMMAND'] = sort_command" ] }, { @@ -1526,32 +1499,32 @@ "!kgtk ${KGTK_FLAGS} \\\n", " import-wikidata \\\n", " -i ${WIKIDATA_ALL_JSON} \\\n", - " --node-file ${TEMPDIR}/metadata.node.${UNSORTED_KGTK} \\\n", - " --minimal-edge-file ${TEMPDIR}/claims.raw.${UNSORTED_KGTK} \\\n", - " --minimal-qual-file ${TEMPDIR}/qualifiers.raw.${UNSORTED_KGTK} \\\n", - " --invalid-edge-file ${TEMPDIR}/claims.badvalue.${UNSORTED_KGTK} \\\n", - " --invalid-qual-file ${TEMPDIR}/qualifiers.badvalue.${UNSORTED_KGTK} \\\n", + " --node-file ${TEMP}/metadata.node.${UNSORTED_KGTK} \\\n", + " --minimal-edge-file ${TEMP}/claims.raw.${UNSORTED_KGTK} \\\n", + " --minimal-qual-file ${TEMP}/qualifiers.raw.${UNSORTED_KGTK} \\\n", + " --invalid-edge-file ${TEMP}/claims.badvalue.${UNSORTED_KGTK} \\\n", + " --invalid-qual-file ${TEMP}/qualifiers.badvalue.${UNSORTED_KGTK} \\\n", " --node-file-id-only \\\n", " --explode-values False \\\n", " --all-languages True \\\n", " --lang en \\\n", " --alias-edges True \\\n", - " --split-alias-file 
${TEMPDIR}/aliases.${UNSORTED_KGTK} \\\n", - " --split-en-alias-file ${TEMPDIR}/aliases.en.${UNSORTED_KGTK} \\\n", + " --split-alias-file ${TEMP}/aliases.${UNSORTED_KGTK} \\\n", + " --split-en-alias-file ${TEMP}/aliases.en.${UNSORTED_KGTK} \\\n", " --description-edges True \\\n", - " --split-description-file ${TEMPDIR}/descriptions.${UNSORTED_KGTK} \\\n", - " --split-en-description-file ${TEMPDIR}/descriptions.en.${UNSORTED_KGTK} \\\n", + " --split-description-file ${TEMP}/descriptions.${UNSORTED_KGTK} \\\n", + " --split-en-description-file ${TEMP}/descriptions.en.${UNSORTED_KGTK} \\\n", " --label-edges True \\\n", - " --split-label-file ${TEMPDIR}/labels.${UNSORTED_KGTK} \\\n", - " --split-en-label-file ${TEMPDIR}/labels.en.${UNSORTED_KGTK} \\\n", + " --split-label-file ${TEMP}/labels.${UNSORTED_KGTK} \\\n", + " --split-en-label-file ${TEMP}/labels.en.${UNSORTED_KGTK} \\\n", " --datatype-edges True \\\n", - " --split-datatype-file ${TEMPDIR}/metadata.property.datatypes.${UNSORTED_KGTK} \\\n", + " --split-datatype-file ${TEMP}/metadata.property.datatypes.${UNSORTED_KGTK} \\\n", " --entry-type-edges True \\\n", - " --split-type-file ${TEMPDIR}/metadata.types.${UNSORTED_KGTK} \\\n", + " --split-type-file ${TEMP}/metadata.types.${UNSORTED_KGTK} \\\n", " --sitelink-edges True \\\n", " --sitelink-verbose-edges True \\\n", - " --split-sitelink-file ${TEMPDIR}/sitelinks.raw.${UNSORTED_KGTK} \\\n", - " --split-en-sitelink-file ${TEMPDIR}/sitelinks.en.raw.${UNSORTED_KGTK} \\\n", + " --split-sitelink-file ${TEMP}/sitelinks.raw.${UNSORTED_KGTK} \\\n", + " --split-en-sitelink-file ${TEMP}/sitelinks.en.raw.${UNSORTED_KGTK} \\\n", " --value-hash-width 6 \\\n", " --claim-id-hash-width 8 \\\n", " --use-kgtkwriter True \\\n", @@ -1576,7 +1549,7 @@ " --repair-lax-coordinates \\\n", " --allow-language-suffixes \\\n", " --allow-wikidata-lq-strings \\\n", - " | tee ${LOGDIR}/import-split-wikidata.log\n" + " | tee ${TEMP}/import-split-wikidata.log\n" ] }, { @@ -1636,12 +1609,12 @@ 
"source": [ "!kgtk ${KGTK_FLAGS} \\\n", " filter ${VERBOSE} --use-mgzip ${USE_MGZIP} \\\n", - " --input-file ${TEMPDIR}/claims.raw.${UNSORTED_KGTK} \\\n", + " --input-file ${TEMP}/claims.raw.${UNSORTED_KGTK} \\\n", " --first-match-only \\\n", - " --pattern \";; novalue\" -o ${TEMPDIR}/claims.novalue.${UNSORTED_KGTK} \\\n", - " --pattern \";; somevalue\" -o ${TEMPDIR}/claims.somevalue.${UNSORTED_KGTK} \\\n", - " --reject-file ${TEMPDIR}/claims.${UNSORTED_KGTK} \\\n", - " | tee ${LOGDIR}/split-claims-missing-values.log" + " --pattern \";; novalue\" -o ${TEMP}/claims.novalue.${UNSORTED_KGTK} \\\n", + " --pattern \";; somevalue\" -o ${TEMP}/claims.somevalue.${UNSORTED_KGTK} \\\n", + " --reject-file ${TEMP}/claims.${UNSORTED_KGTK} \\\n", + " | tee ${TEMP}/split-claims-missing-values.log" ] }, { @@ -1825,30 +1798,30 @@ "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " filter ${VERBOSE} --use-mgzip ${USE_MGZIP} \\\n", - " --input-file ${TEMPDIR}/qualifiers.raw.${UNSORTED_KGTK} \\\n", + " --input-file ${TEMP}/qualifiers.raw.${UNSORTED_KGTK} \\\n", " --first-match-only \\\n", - " --pattern \";; novalue\" -o ${TEMPDIR}/qualifiers.novalue.${UNSORTED_KGTK} \\\n", - " --pattern \";; somevalue\" -o ${TEMPDIR}/qualifiers.somevalue.${UNSORTED_KGTK} \\\n", + " --pattern \";; novalue\" -o ${TEMP}/qualifiers.novalue.${UNSORTED_KGTK} \\\n", + " --pattern \";; somevalue\" -o ${TEMP}/qualifiers.somevalue.${UNSORTED_KGTK} \\\n", " --reject-file - \\\n", " / ifexists ${VERBOSE} \\\n", " --input-keys node1 \\\n", - " --filter-file ${TEMPDIR}/claims.novalue.${UNSORTED_KGTK} \\\n", + " --filter-file ${TEMP}/claims.novalue.${UNSORTED_KGTK} \\\n", " --filter-keys id \\\n", - " --output-file ${TEMPDIR}/qualifiers.novalueClaims.${UNSORTED_KGTK} \\\n", + " --output-file ${TEMP}/qualifiers.novalueClaims.${UNSORTED_KGTK} \\\n", " --reject-file - \\\n", " / ifexists ${VERBOSE} \\\n", " --input-keys node1 \\\n", - " --filter-file ${TEMPDIR}/claims.somevalue.${UNSORTED_KGTK} \\\n", + " --filter-file 
${TEMP}/claims.somevalue.${UNSORTED_KGTK} \\\n", " --filter-keys id \\\n", - " --output-file ${TEMPDIR}/qualifiers.somevalueClaims.${UNSORTED_KGTK} \\\n", + " --output-file ${TEMP}/qualifiers.somevalueClaims.${UNSORTED_KGTK} \\\n", " --reject-file - \\\n", " / ifexists ${VERBOSE} \\\n", " --input-keys node1 \\\n", - " --filter-file ${TEMPDIR}/claims.badvalue.${UNSORTED_KGTK} \\\n", + " --filter-file ${TEMP}/claims.badvalue.${UNSORTED_KGTK} \\\n", " --filter-keys id \\\n", - " --output-file ${TEMPDIR}/qualifiers.badvalueClaims.${UNSORTED_KGTK} \\\n", - " --reject-file ${TEMPDIR}/qualifiers.${UNSORTED_KGTK} \\\n", - " | tee ${LOGDIR}/split-qualifiers-missing-values.log" + " --output-file ${TEMP}/qualifiers.badvalueClaims.${UNSORTED_KGTK} \\\n", + " --reject-file ${TEMP}/qualifiers.${UNSORTED_KGTK} \\\n", + " | tee ${TEMP}/split-qualifiers-missing-values.log" ] }, { @@ -1905,11 +1878,11 @@ "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " filter ${VERBOSE} --use-mgzip=${USE_MGZIP} \\\n", - " --input-file ${TEMPDIR}/sitelinks.raw.${UNSORTED_KGTK} \\\n", + " --input-file ${TEMP}/sitelinks.raw.${UNSORTED_KGTK} \\\n", " --pattern \"; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;\" \\\n", - " --output-file ${TEMPDIR}/sitelinks.qualifiers.${UNSORTED_KGTK} \\\n", - " --reject-file ${TEMPDIR}/sitelinks.${UNSORTED_KGTK} \\\n", - " | tee ${LOGDIR}/split-sitelink-qualifiers.log" + " --output-file ${TEMP}/sitelinks.qualifiers.${UNSORTED_KGTK} \\\n", + " --reject-file ${TEMP}/sitelinks.${UNSORTED_KGTK} \\\n", + " | tee ${TEMP}/split-sitelink-qualifiers.log" ] }, { @@ -1966,11 +1939,11 @@ "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " filter ${VERBOSE} --use-mgzip=${USE_MGZIP} \\\n", - " --input-file ${TEMPDIR}/sitelinks.en.raw.${UNSORTED_KGTK} \\\n", + " --input-file ${TEMP}/sitelinks.en.raw.${UNSORTED_KGTK} \\\n", " --pattern \"; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;\" \\\n", - " --output-file ${TEMPDIR}/sitelinks.en.qualifiers.${UNSORTED_KGTK} 
\\\n", - " --reject-file ${TEMPDIR}/sitelinks.en.${UNSORTED_KGTK} \\\n", - " | tee ${LOGDIR}/split-sitelink-en-qualifiers.log" + " --output-file ${TEMP}/sitelinks.en.qualifiers.${UNSORTED_KGTK} \\\n", + " --reject-file ${TEMP}/sitelinks.en.${UNSORTED_KGTK} \\\n", + " | tee ${TEMP}/split-sitelink-en-qualifiers.log" ] }, { @@ -1978,7 +1951,7 @@ "id": "9275fecc-98db-435c-863a-7f4d780f64c9", "metadata": {}, "source": [ - "## Sort the files from `TEMPDIR` to `DATADIR` folder" + "## Sort the files from `TEMP` to `OUT` folder" ] }, { @@ -2821,9 +2794,9 @@ "source": [ "for TARGET in WIKIDATA_IMPORT_SPLIT_FILES:\n", " print(f\"Sort the {TARGET} file.\")\n", - " input_file = f\"{os.environ['TEMPDIR']}/{TARGET}.{os.environ['UNSORTED_KGTK']}\"\n", - " output_file = f\"{os.environ['DATADIR']}/{TARGET}.{os.environ['SORTED_KGTK']}\"\n", - " logfile = f\"{os.environ['LOGDIR']}/{TARGET}-sorted.log\"\n", + " input_file = f\"{os.environ['TEMP']}/{TARGET}.{os.environ['UNSORTED_KGTK']}\"\n", + " output_file = f\"{os.environ['OUT']}/{TARGET}.{os.environ['SORTED_KGTK']}\"\n", + " logfile = f\"{os.environ['TEMP']}/{TARGET}-sorted.log\"\n", " sort_command = f\"\"\"kgtk {os.environ['KGTK_FLAGS']} \\\n", " sort {os.environ['VERBOSE']} \\\n", " --input-file {input_file} \\\n", @@ -3046,20 +3019,20 @@ "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " cat ${VERBOSE} --use-mgzip=${USE_MGZIP} \\\n", - " --input-file ${TEMPDIR}/claims.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMPDIR}/qualifiers.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMPDIR}/aliases.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMPDIR}/descriptions.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMPDIR}/labels.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMPDIR}/sitelinks.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMPDIR}/sitelinks.qualifiers.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMPDIR}/metadata.types.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMPDIR}/metadata.property.datatypes.${UNSORTED_KGTK} \\\n", + " --input-file 
${TEMP}/claims.${UNSORTED_KGTK} \\\n", + " --input-file ${TEMP}/qualifiers.${UNSORTED_KGTK} \\\n", + " --input-file ${TEMP}/aliases.${UNSORTED_KGTK} \\\n", + " --input-file ${TEMP}/descriptions.${UNSORTED_KGTK} \\\n", + " --input-file ${TEMP}/labels.${UNSORTED_KGTK} \\\n", + " --input-file ${TEMP}/sitelinks.${UNSORTED_KGTK} \\\n", + " --input-file ${TEMP}/sitelinks.qualifiers.${UNSORTED_KGTK} \\\n", + " --input-file ${TEMP}/metadata.types.${UNSORTED_KGTK} \\\n", + " --input-file ${TEMP}/metadata.property.datatypes.${UNSORTED_KGTK} \\\n", " / sort ${VERBOSE} \\\n", " --gzip-command ${GZIP_CMD} \\\n", " --extra \"${SORT_EXTRAS}\" \\\n", - " --output-file ${DATADIR}/all.${SORTED_KGTK} \\\n", - "| tee ${LOGDIR}/build-all-edges.log" + " --output-file ${OUT}/all.${SORTED_KGTK} \\\n", + "| tee ${TEMP}/build-all-edges.log" ] }, { @@ -3122,12 +3095,12 @@ "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " ifnotexists $VERBOSE --use-mgzip=$USE_MGZIP --presorted \\\n", - " --input-file ${DATADIR}/qualifiers.${SORTED_KGTK} \\\n", + " --input-file ${OUT}/qualifiers.${SORTED_KGTK} \\\n", " --input-keys node1 \\\n", - " --filter-file ${DATADIR}/claims.${SORTED_KGTK} \\\n", + " --filter-file ${OUT}/claims.${SORTED_KGTK} \\\n", " --filter-keys id \\\n", - " --output-file ${DATADIR}/qualifiers.unclaimed.${SORTED_KGTK} \\\n", - "| tee ${LOGDIR}/qualifiers.unclaimed.log" + " --output-file ${OUT}/qualifiers.unclaimed.${SORTED_KGTK} \\\n", + "| tee ${TEMP}/qualifiers.unclaimed.log" ] }, { @@ -3262,46 +3235,46 @@ "source": [ "!kgtk ${KGTK_FLAGS} \\\n", " filter ${VERBOSE} \\\n", - " --input-file ${DATADIR}/claims.${SORTED_KGTK} \\\n", + " --input-file ${OUT}/claims.${SORTED_KGTK} \\\n", " --obj \"node2;wikidatatype\" \\\n", " --first-match-only \\\n", " --pattern \";;commonsMedia\" \\\n", - " --output-file ${DATADIR}/claims.commonsMedia.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.commonsMedia.${SORTED_KGTK} \\\n", " --pattern \";;external-id\" \\\n", - " --output-file 
${DATADIR}/claims.external-id.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.external-id.${SORTED_KGTK} \\\n", " --pattern \";;geo-shape\" \\\n", - " --output-file ${DATADIR}/claims.geo-shape.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.geo-shape.${SORTED_KGTK} \\\n", " --pattern \";;globe-coordinate\" \\\n", - " --output-file ${DATADIR}/claims.globe-coordinate.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.globe-coordinate.${SORTED_KGTK} \\\n", " --pattern \";;math\" \\\n", - " --output-file ${DATADIR}/claims.math.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.math.${SORTED_KGTK} \\\n", " --pattern \";;monolingualtext\" \\\n", - " --output-file ${DATADIR}/claims.monolingualtext.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.monolingualtext.${SORTED_KGTK} \\\n", " --pattern \";;musical-notation\" \\\n", - " --output-file ${DATADIR}/claims.musical-notation.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.musical-notation.${SORTED_KGTK} \\\n", " --pattern \";;quantity\" \\\n", - " --output-file ${DATADIR}/claims.quantity.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.quantity.${SORTED_KGTK} \\\n", " --pattern \";;string\" \\\n", - " --output-file ${DATADIR}/claims.string.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.string.${SORTED_KGTK} \\\n", " --pattern \";;tabular-data\" \\\n", - " --output-file ${DATADIR}/claims.tabular-data.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.tabular-data.${SORTED_KGTK} \\\n", " --pattern \";;time\" \\\n", - " --output-file ${DATADIR}/claims.time.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.time.${SORTED_KGTK} \\\n", " --pattern \";;url\" \\\n", - " --output-file ${DATADIR}/claims.url.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.url.${SORTED_KGTK} \\\n", " --pattern \";;wikibase-form\" \\\n", - " --output-file ${DATADIR}/claims.wikibase-form.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.wikibase-form.${SORTED_KGTK} \\\n", " --pattern 
\";;wikibase-item\" \\\n", - " --output-file ${DATADIR}/claims.wikibase-item.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.wikibase-item.${SORTED_KGTK} \\\n", " --pattern \";;wikibase-lexeme\" \\\n", - " --output-file ${DATADIR}/claims.wikibase-lexeme.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.wikibase-lexeme.${SORTED_KGTK} \\\n", " --pattern \";;wikibase-property\" \\\n", - " --output-file ${DATADIR}/claims.wikibase-property.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.wikibase-property.${SORTED_KGTK} \\\n", " --pattern \";;wikibase-sense\" \\\n", - " --output-file ${DATADIR}/claims.wikibase-sense.${SORTED_KGTK} \\\n", - " --reject-file ${DATADIR}/claims.other.${SORTED_KGTK} \\\n", + " --output-file ${OUT}/claims.wikibase-sense.${SORTED_KGTK} \\\n", + " --reject-file ${OUT}/claims.other.${SORTED_KGTK} \\\n", " --use-mgzip ${USE_MGZIP} \\\n", - " | tee ${LOGDIR}/edge-datatype-split.log" + " | tee ${TEMP}/edge-datatype-split.log" ] }, { @@ -3997,14 +3970,14 @@ " os.environ['TARGET'] = TARGET\n", " !kgtk ${KGTK_FLAGS} \\\n", "\t ifexists ${VERBOSE} \\\n", - "\t --input-file ${DATADIR}/qualifiers.${SORTED_KGTK} \\\n", - "\t --filter-on ${DATADIR}/claims.${TARGET}.${SORTED_KGTK} \\\n", - "\t --output-file ${DATADIR}/qualifiers.${TARGET}.${SORTED_KGTK} \\\n", + "\t --input-file ${OUT}/qualifiers.${SORTED_KGTK} \\\n", + "\t --filter-on ${OUT}/claims.${TARGET}.${SORTED_KGTK} \\\n", + "\t --output-file ${OUT}/qualifiers.${TARGET}.${SORTED_KGTK} \\\n", "\t --input-keys node1 \\\n", "\t --filter-keys id \\\n", "\t --presorted \\\n", "\t --use-mgzip ${USE_MGZIP} \\\n", - "\t| tee ${LOGDIR}/qualifiers.${TARGET}.log" + "\t| tee ${TEMP}/qualifiers.${TARGET}.log" ] }, { @@ -4055,9 +4028,9 @@ ], "source": [ "!kgtk $KGTK_FLAGS filter $VERBOSE --use-mgzip=$USE_MGZIP --regex\\\n", - " --input-file $DATADIR/claims.$SORTED_KGTK \\\n", - " -p '^P ;;' -o $DATADIR/claims.properties.$SORTED_KGTK \\\n", - " | tee ${LOGDIR}/claims.properties.log" + " --input-file 
$OUT/claims.$SORTED_KGTK \\\n", + " -p '^P ;;' -o $OUT/claims.properties.$SORTED_KGTK \\\n", + " | tee ${TEMP}/claims.properties.log" ] }, { @@ -4108,9 +4081,9 @@ ], "source": [ "!kgtk $KGTK_FLAGS filter $VERBOSE --use-mgzip=$USE_MGZIP --regex \\\n", - " --input-file $DATADIR/qualifiers.$SORTED_KGTK \\\n", - " -p '^P ;;' -o $DATADIR/qualifiers.properties.$SORTED_KGTK \\\n", - " | tee ${LOGDIR}/qualifiers.properties.log" + " --input-file $OUT/qualifiers.$SORTED_KGTK \\\n", + " -p '^P ;;' -o $OUT/qualifiers.properties.$SORTED_KGTK \\\n", + " | tee ${TEMP}/qualifiers.properties.log" ] }, { @@ -4214,9 +4187,9 @@ ], "metadata": { "kernelspec": { - "display_name": "kgtk-env-ckg07", + "display_name": "kgtk-env", "language": "python", - "name": "kgtk-env-ckg07" + "name": "kgtk-env" }, "language_info": { "codemirror_mode": { @@ -4228,7 +4201,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.9.7" } }, "nbformat": 4, From ca0da119c47320e20da924c2f6738933a8c25a37 Mon Sep 17 00:00:00 2001 From: saggu Date: Mon, 23 May 2022 16:37:48 -0700 Subject: [PATCH 06/21] parameter to specify kernel name --- use-cases/create-wikidata-dwd.ipynb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/use-cases/create-wikidata-dwd.ipynb b/use-cases/create-wikidata-dwd.ipynb index 2003f588f..549c8bfc4 100644 --- a/use-cases/create-wikidata-dwd.ipynb +++ b/use-cases/create-wikidata-dwd.ipynb @@ -24,7 +24,8 @@ "output_path = \"/data/amandeep\"\n", "project_name = \"create-wikidata-dwd\"\n", "\n", - "kgtk_path = \"/data/amandeep/Github/kgtk\"" + "kgtk_path = \"/data/amandeep/Github/kgtk\"\n", + "kernel_name = \"kgtk-env-ckg07\"" ] }, { @@ -74,6 +75,7 @@ "pm.execute_notebook(\n", " \"import-wikidata.ipynb\",\n", " os.environ[\"TEMP\"] + \"/import-wikidata.out.ipynb\",\n", + " kernel_name=kernel_name,\n", " parameters=dict(\n", " input_path = json_file_path,\n", " output_path = import_wikidata_path,\n", @@ 
-118,6 +120,7 @@ "pm.execute_notebook(\n", " \"Wikidata-Useful-Files.ipynb\",\n", " os.environ[\"TEMP\"] + \"/Wikidata-Useful-Files.out.ipynb\",\n", + " kernel_name=kernel_name,\n", " parameters=dict(\n", " input_path = first_useful_files_input_path,\n", " output_path = first_useful_files_output_path,\n", @@ -184,6 +187,7 @@ "pm.execute_notebook(\n", " \"Wikidata-Subsets.ipynb\",\n", " os.environ[\"TEMP\"] + \"/Wikidata-Subsets.out.ipynb\",\n", + " kernel_name=kernel_name,\n", " parameters=dict(\n", " input_path = subset_input_path,\n", " output_path = subset_output_path,\n", From 5b1d361bcbd9d7a83ce5e29610b1cd12f507f1ce Mon Sep 17 00:00:00 2001 From: saggu Date: Tue, 24 May 2022 18:01:15 -0700 Subject: [PATCH 07/21] end-to-end import wikidata --- .../Embeddings-Elasticsearch-&-Triples.ipynb | 614 ------------------ use-cases/Wikidata-Subsets.ipynb | 27 +- use-cases/create-wikidata-dwd.ipynb | 379 ++++++++++- 3 files changed, 392 insertions(+), 628 deletions(-) delete mode 100644 use-cases/Embeddings-Elasticsearch-&-Triples.ipynb diff --git a/use-cases/Embeddings-Elasticsearch-&-Triples.ipynb b/use-cases/Embeddings-Elasticsearch-&-Triples.ipynb deleted file mode 100644 index 6a0985e7d..000000000 --- a/use-cases/Embeddings-Elasticsearch-&-Triples.ipynb +++ /dev/null @@ -1,614 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "naval-vulnerability", - "metadata": {}, - "source": [ - "# Computes Graph and Text Embeddings, Elasticsearch Ready KGTK File and RDF Triples for Blazegraph\n", - "\n", - "This notebook computes the following:\n", - "\n", - "- `complEx` graph embeddings\n", - "- `transE` graph embeddings\n", - "- `BERT` text embeddings\n", - "- `elasticsearch` ready KGTK edge for [KGTK Search](https://kgtk.isi.edu/search/)\n", - "- `elasticsearch` ready KGTK edge file for Table Linker\n", - "- `RDF Triples` to be loaded into blazegraph\n", - "\n", - "Inputs:\n", - "\n", - "- `item_file`: the subset of the `claims_file` consistin of edges for 
property of data type `wikibase-item`\n", - "- `label_file`, `alias_file` and `description_file` containing labels, aliases and descriptions. It is assume that these files contain the labels, aliases and descriptions of all nodes appearing in the claims file. Users may provide these files for specific languages only.\n" - ] - }, - { - "cell_type": "markdown", - "id": "endless-exemption", - "metadata": {}, - "source": [ - "### Batch Invocation\n", - "Example batch command. The second argument is a notebook where the output will be stored. You can load it to see progress.\n", - "\n", - "```\n", - "papermill Embeddings-Elasticsearch-&-Triples.ipynb Embeddings-Elasticsearch-&-Triples.out.ipynb \\\n", - "-p claims_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/all.tsv.gz \\\n", - "-p label_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/part.label.en.tsv.gz \\\n", - "-p item_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/part.wikibase-item.tsv.gz \\\n", - "-p property_item_file = /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/part.property.wikibase-item.tsv.gz \\\n", - "-p output_path \\\n", - "-p output_folder useful_files_v4 \\\n", - "-p temp_folder temp.useful_files_v4 \\\n", - "-p delete_database no \n", - "-p languages es,ru,zh-cn\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "bb3e0847-155b-4251-821a-34e27d75c8a6", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "\n", - "import pandas as pd\n", - " \n", - "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", - "from kgtk.functions import kgtk, kypher" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "departmental-connectivity", - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "# Parameters\n", - "\n", - "input_path = 
\"/data/amandeep/wikidata-20220505-dwd-v4\"\n", - "output_path = \"/data/amandeep/wikidata-20220505-dwd-v4\"\n", - "kgtk_path = \"/Users/amandeep/Github/kgtk\"\n", - "\n", - "graph_cache_path = None\n", - "\n", - "project_name = \"embeddings-elasticsearch-triples\"\n", - "\n", - "languages = 'en,ru,es,zh-cn,de,it,nl,pl,fr,pt,sv'\n", - "\n", - "files = 'label_all,alias_all,description_all'\n", - "compute_embeddings = False\n", - "generate_triples = False\n", - "datatype_property = \"datatype\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "f44d69f2-eca7-4ac6-8b63-1d7c42898f59", - "metadata": {}, - "outputs": [], - "source": [ - "files = files.split(',')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "1b52a584-551e-43ad-becb-9314e95932fa", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "User home: /nas/home/amandeep\n", - "Current dir: /data/amandeep/Github/kgtk/use-cases\n", - "KGTK dir: /Users/amandeep/Github/kgtk\n", - "Use-cases dir: /Users/amandeep/Github/kgtk/use-cases\n" - ] - } - ], - "source": [ - "ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n", - "ck.configure_kgtk(input_graph_path=input_path,\n", - " output_path=output_path,\n", - " project_name=project_name,\n", - " graph_cache_path=graph_cache_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "1ffdcaec-c0d7-468c-a207-186fad300d56", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "USE_CASES_DIR: /Users/amandeep/Github/kgtk/use-cases\n", - "TEMP: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples\n", - "EXAMPLES_DIR: /Users/amandeep/Github/kgtk/examples\n", - "KGTK_LABEL_FILE: /data/amandeep/wikidata-20220505-dwd-v4/labels.en.tsv.gz\n", - "GRAPH: /data/amandeep/wikidata-20220505-dwd-v4\n", - "kypher: kgtk query --graph-cache 
/data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples/wikidata.sqlite3.db\n", - "KGTK_OPTION_DEBUG: false\n", - "kgtk: kgtk\n", - "KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples/wikidata.sqlite3.db\n", - "OUT: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples\n", - "STORE: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples/wikidata.sqlite3.db\n", - "label_all: /data/amandeep/wikidata-20220505-dwd-v4/labels.tsv.gz\n", - "alias_all: /data/amandeep/wikidata-20220505-dwd-v4/aliases.tsv.gz\n", - "description_all: /data/amandeep/wikidata-20220505-dwd-v4/descriptions.tsv.gz\n" - ] - } - ], - "source": [ - "ck.print_env_variables()" - ] - }, - { - "cell_type": "markdown", - "id": "excellent-passenger", - "metadata": {}, - "source": [ - "## Graph Embeddings" - ] - }, - { - "cell_type": "markdown", - "id": "integrated-slide", - "metadata": {}, - "source": [ - "### complEx" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "eleven-tribe", - "metadata": {}, - "outputs": [], - "source": [ - "complex_temp_folder = f\"{wikidata_root_folder}/temp.graph-embeddings.complex\"" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "found-traffic", - "metadata": {}, - "outputs": [], - "source": [ - "!mkdir -p {complex_temp_folder}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "gentle-wheat", - "metadata": {}, - "outputs": [], - "source": [ - "os.environ['TEMP_COMPLEX'] = complex_temp_folder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "attached-texture", - "metadata": {}, - "outputs": [], - "source": [ - "if compute_embeddings:\n", - " !kgtk graph-embeddings --verbose -i \"$ITEMS\" \\\n", - " -o $OUT/wikidatadwd.complEx.graph-embeddings.txt \\\n", - " --retain_temporary_data True 
\\\n", - " --operator ComplEx \\\n", - " --workers 24 \\\n", - " --log $TEMP_COMPLEX/ge.complex.log \\\n", - " -T $TEMP_COMPLEX \\\n", - " -ot w2v \\\n", - " -e 600" - ] - }, - { - "cell_type": "markdown", - "id": "piano-thousand", - "metadata": {}, - "source": [ - "### transE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "serial-landscape", - "metadata": {}, - "outputs": [], - "source": [ - "transe_temp_folder = f\"{wikidata_root_folder}/temp.graph-embeddings.transe\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "naval-morgan", - "metadata": {}, - "outputs": [], - "source": [ - "!mkdir -p {transe_temp_folder}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "little-dietary", - "metadata": {}, - "outputs": [], - "source": [ - "os.environ['TEMP_TRANSE'] = transe_temp_folder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "annoying-council", - "metadata": {}, - "outputs": [], - "source": [ - "if compute_embeddings:\n", - " !$kgtk graph-embeddings --verbose -i \"$ITEMS\" \\\n", - " -o $OUT/wikidatadwd.transE.graph-embeddings.txt \\\n", - " --retain_temporary_data True \\\n", - " --operator TransE \\\n", - " --workers 24 \\\n", - " --log $TEMP_TRANSE/ge.transE.log \\\n", - " -T $TEMP_TRANSE \\\n", - " -ot w2v \\\n", - " -e 600" - ] - }, - { - "cell_type": "markdown", - "id": "speaking-torture", - "metadata": {}, - "source": [ - "### BERT Embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "patient-times", - "metadata": {}, - "outputs": [], - "source": [ - "if compute_embeddings:\n", - " !$kgtk text-embedding -i $ALL \\\n", - " --model roberta-large-nli-mean-tokens \\\n", - " --property-labels-file $LABELS_EN \\\n", - " --isa-properties P31 P279 P106 P39 P1382 P373 P452 \\\n", - " --save-embedding-sentence > $OUT/wikidatadwd-text-embeddings-all.tsv" - ] - }, - { - "cell_type": "markdown", - "id": "similar-bidder", - "metadata": {}, - 
"source": [ - "### Build KGTK edge file for KGTK Search" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "closed-yemen", - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"cat -i $GRAPH/all.tsv.gz \n", - " -i $GRAPH/derived.isastar.tsv.gz \n", - " -i $GRAPH/metadata.pagerank.undirected.tsv.gz\n", - " -i $GRAPH/metadata.pagerank.directed.tsv.gz\n", - " -o $TEMP/wikidata.dwd.all.kgtk.search.unsorted.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "trained-typing", - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(f\"\"\"sort -i $TEMP/wikidata.dwd.all.kgtk.search.unsorted.tsv.gz\n", - " --columns node1\n", - " --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'\n", - " -o $OUT/wikidata.dwd.all.kgtk.search.sorted.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "6f2f5864-5dae-47ec-b4de-0726654de82c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Processed 1000000 lines...
0Processed 2000000 lines...
1Processed 3000000 lines...
2Processed 4000000 lines...
3Processed 5000000 lines...
4Processed 6000000 lines...
......
5080Processed 5082000000 lines...
5081Processed 5083000000 lines...
5082Processed 5084000000 lines...
5083Processed 5085000000 lines...
5084Done!
\n", - "

5085 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " Processed 1000000 lines...\n", - "0 Processed 2000000 lines...\n", - "1 Processed 3000000 lines...\n", - "2 Processed 4000000 lines...\n", - "3 Processed 5000000 lines...\n", - "4 Processed 6000000 lines...\n", - "... ...\n", - "5080 Processed 5082000000 lines...\n", - "5081 Processed 5083000000 lines...\n", - "5082 Processed 5084000000 lines...\n", - "5083 Processed 5085000000 lines...\n", - "5084 Done!\n", - "\n", - "[5085 rows x 1 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "kgtk(f\"\"\"--debug build-kgtk-search-input --input-file \"$OUT\"/wikidata.dwd.all.kgtk.search.sorted.tsv.gz\n", - "--output-file \"$OUT\"/wikidata.dwd.all.kgtk.search.sorted.jl \n", - "--label-properties label \n", - "--alias-properties alias \n", - "--extra-alias-properties P1448,P1705,P1477,P1810,P742,P1449 \n", - "--description-properties description \n", - "--pagerank-properties Pundirected_pagerank \n", - "--languages {languages}\n", - "--mapping-file \"$OUT\"/wikidata_dwd_v3_mapping.json \n", - "--property-datatype-file \"$GRAPH\"/metadata.property.datatypes.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "markdown", - "id": "instant-bidder", - "metadata": {}, - "source": [ - "### Build KGTK edge file for Triple generation\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "rolled-poker", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_triples:\n", - " !$kgtk cat \\\n", - " -i $OUT/wikidata.dwd.all.kgtk.search.sorted.tsv.gz \\\n", - " -i $OUT/derived.isa.tsv.gz \\\n", - " -i $OUT/derived.P279star.tsv.gz \\\n", - " -i $OUT/metadata.in_degree.tsv.gz \\\n", - " -i $OUT/metadata.out_degree.tsv.gz \\\n", - " -o $TEMP/wikidata.dwd.all.kgtk.triples.1.tsv.gz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aggressive-fleet", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_triples:\n", - " !$kgtk add-id -i 
$TEMP/wikidata.dwd.all.kgtk.triples.1.tsv.gz \\\n", - " --id-style wikidata \\\n", - " -o $TEMP/wikidata.dwd.all.kgtk.triples.2.tsv.gz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "assumed-ready", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_triples:\n", - " !$kgtk sort -i $TEMP/wikidata.dwd.all.kgtk.triples.2.tsv.gz \\\n", - " --columns node1 \\\n", - " --extra '--parallel 24 --buffer-size 30% --temporary-directory ' + temp_folder_path \\\n", - " -o $OUT/wikidata.dwd.all.kgtk.triples.sorted.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "id": "wireless-headquarters", - "metadata": {}, - "source": [ - "Split the triples file to parallelize triple generation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "single-gardening", - "metadata": {}, - "outputs": [], - "source": [ - "generate_triples:\n", - " !mkdir -p $OUT/kgtk_triples_split" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "original-charles", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_triples:\n", - " !$kgtk split -i $OUT/wikidata.dwd.all.kgtk.triples.sorted.tsv.gz \\\n", - " --output-path $OUT/kgtk_triples_split \\\n", - " --gzipped-output --lines 10000000 \\\n", - " --file-prefix kgtk_triples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "auburn-elephant", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_triples:\n", - " !curl https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv -o $TEMP/kgtk-properties.tsv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "compressed-sight", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_triples:\n", - " kgtk(f\"\"\"filter -p \";{datatype_property};\" -i $TEMP/kgtk-properties.tsv -o $TEMP/kgtk-properties.datatype.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "threaded-confusion", - "metadata": {}, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cat: illegal option -- i\n", - "usage: cat [-benstuv] [file ...]\n" - ] - } - ], - "source": [ - "if generate_triples:\n", - " !$kgtk cat -i $TEMP/kgtk-properties.datatype.tsv.gz $OUT/metadata.property.datatypes.tsv.gz -o $OUT/metadata.property.datatypes.augmented.tsv.gz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "disciplinary-violation", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_triples:\n", - " ls $OUT/kgtk_triples_split/*.tsv.gz | parallel -j 18 'kgtk --debug generate-wikidata-triples -lp label -ap alias -dp description -pf $OUT/metadata.property.datatypes.augmented.tsv.gz --output-n-lines 100000 --generate-truthy --warning --use-id --log-path $TEMP/generate_triples_log.txt --error-action log -i {} -o {.}.ttl'\n", - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "kgtk-env-ckg07", - "language": "python", - "name": "kgtk-env-ckg07" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/use-cases/Wikidata-Subsets.ipynb b/use-cases/Wikidata-Subsets.ipynb index d62873e81..76766bd17 100644 --- a/use-cases/Wikidata-Subsets.ipynb +++ b/use-cases/Wikidata-Subsets.ipynb @@ -79,7 +79,8 @@ "useful_files_notebook = \"Wikidata-Useful-Files.ipynb\"\n", "notebooks_folder = f\"{kgtk_path}/use-cases\"\n", "\n", - "languages = \"en,ru,es,zh-cn,de,it,nl,pl,fr,pt,sv\"\n" + "languages = \"en,ru,es,zh-cn,de,it,nl,pl,fr,pt,sv\"\n", + "debug = False" ] }, { @@ -649,7 +650,8 @@ } ], "source": [ - "!zcat < \"$qualifiers\" | head | column -t -s $'\\t' " + "if debug:\n", + " !zcat < \"$qualifiers\" | head | column -t -s $'\\t' " ] }, { @@ -706,7 +708,8 @@ } ], "source": [ - "!zcat 
$OUT/qualifiers.tsv.gz | head | col" + "if debug:\n", + " !zcat $OUT/qualifiers.tsv.gz | head | col" ] }, { @@ -901,10 +904,11 @@ "metadata": {}, "outputs": [], "source": [ - "!$kypher -i $OUT/claims.tsv.gz \\\n", - "--match '(n1:Q368441)-[l]->(n2)' \\\n", - "--limit 10 \\\n", - "| col" + "if debug:\n", + " !$kypher -i $OUT/claims.tsv.gz \\\n", + " --match '(n1:Q368441)-[l]->(n2)' \\\n", + " --limit 10 \\\n", + " | col" ] }, { @@ -913,10 +917,11 @@ "metadata": {}, "outputs": [], "source": [ - "!$kypher -i $OUT/claims.tsv.gz \\\n", - "--match '(n1:P131)-[l]->(n2)' \\\n", - "--limit 10 \\\n", - "| col" + "if debug:\n", + " !$kypher -i $OUT/claims.tsv.gz \\\n", + " --match '(n1:P131)-[l]->(n2)' \\\n", + " --limit 10 \\\n", + " | col" ] }, { diff --git a/use-cases/create-wikidata-dwd.ipynb b/use-cases/create-wikidata-dwd.ipynb index 549c8bfc4..cb993539e 100644 --- a/use-cases/create-wikidata-dwd.ipynb +++ b/use-cases/create-wikidata-dwd.ipynb @@ -25,6 +25,8 @@ "project_name = \"create-wikidata-dwd\"\n", "\n", "kgtk_path = \"/data/amandeep/Github/kgtk\"\n", + "kgtk_notebooks_path = \"/data/amandeep/Github/kgtk-notebooks\"\n", + "kgtk_browser_path = \"/data/amandeep/Github/kgtk-browser\"\n", "kernel_name = \"kgtk-env-ckg07\"" ] }, @@ -144,8 +146,8 @@ "metadata": {}, "outputs": [], "source": [ - "!cp $import_wikidata_path/$first_useful_files_project_name/derived.isa.tsv.gz $import_wikidata_path\n", - "!cp $import_wikidata_path/$first_useful_files_project_name/derived.P279star.tsv.gz $import_wikidata_path" + "!cp $import_wikidata_path/$first_useful_files_project_name/derived.isa.tsv.gz $import_wikidata_path/$wikidata_project_name\n", + "!cp $import_wikidata_path/$first_useful_files_project_name/derived.P279star.tsv.gz $import_wikidata_path/$wikidata_project_name" ] }, { @@ -156,6 +158,25 @@ "## Run Wikidata Subsets Notebook" ] }, + { + "cell_type": "markdown", + "id": "beb2d229-ea42-4309-aa8f-3c5dda6faeff", + "metadata": {}, + "source": [ + "The following notebook will run 
the following notebooks:\n", + "\n", + "1. `../examples/partition-wikidata.ipynb`\n", + "The output will be at the path (example according to the parameters specified in the cell below):\n", + "`/data/amandeep/wikidata-20220519-dwd-v5/parts`\n", + "\n", + "2. `./Wikidata-Useful-Files.ipynb`\n", + "\n", + "The output will be at the path:\n", + "`/data/amandeep/wikidata-20220519-dwd-v5/useful-files`\n", + "\n", + "We will move the output files from the above 2 notebooks to the path `/data/amandeep/wikidata-20220519-dwd-v5` at the end of execution of the `Wikidata-Subsets.ipynb` notebook." + ] + }, { "cell_type": "code", "execution_count": null, @@ -163,7 +184,7 @@ "metadata": {}, "outputs": [], "source": [ - "subset_input_path = import_wikidata_path\n", + "subset_input_path = f\"{import_wikidata_path}/{wikidata_project_name}\"\n", "subset_output_path = \"/data/amandeep\"\n", "\n", "\n", @@ -199,6 +220,358 @@ " )\n", ")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6a5a06c-cdc1-41c7-9c11-634b76fdcc60", + "metadata": {}, + "outputs": [], + "source": [ + "!mv $subset_output_path/$subset_project_name/parts/*tsv.gz $subset_output_path/$subset_project_name\n", + "!mv $subset_output_path/$subset_project_name/useful-files/*tsv.gz $subset_output_path/$subset_project_name" + ] + }, + { + "cell_type": "markdown", + "id": "dce13d8b-65b3-4e39-bfa4-d69fe736b43a", + "metadata": {}, + "source": [ + "## Create and Load ES Index for KGTK-Search" + ] + }, + { + "cell_type": "markdown", + "id": "83c1d87f-8279-43a9-8041-dc027b5070f8", + "metadata": {}, + "source": [ + "The following notebook will create the following file:\n", + "\n", + "`/data/amandeep/wikidata-20220519-dwd-v5/kgtk-search/wikidata.dwd.all.kgtk.search.sorted.jl`\n", + "\n", + "We will then split the JSON Lines file into 1M-line partitions and load them into the ES index." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55f19a42-9659-4e76-aa53-e22338fd93dc", + "metadata": {}, +
"outputs": [], + "source": [ + "search_input_path = f\"{subset_output_path}/{subset_project_name}\"\n", + "search_output_path = f\"{subset_output_path}/{subset_project_name}\"\n", + "\n", + "search_project_name = \"kgtk-search\"\n", + "\n", + "compute_embeddings = False\n", + "generate_triples = False\n", + "generate_kgtk_search = True\n", + "datatype_property = \"datatype\"\n", + "\n", + "es_url=\"http://ckg07:9200\"\n", + "es_index=\"wikidata-dwd-kgtk-search-03\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cac5c471-8ba6-4769-8068-5fe86750cb93", + "metadata": {}, + "outputs": [], + "source": [ + "pm.execute_notebook(\n", + " \"Embeddings-Elasticsearch-Triples.ipynb\",\n", + " os.environ[\"TEMP\"] + \"/Embeddings-Elasticsearch-Triples.out.ipynb\",\n", + " kernel_name=kernel_name,\n", + " parameters=dict(\n", + " input_path = search_input_path,\n", + " output_path = search_output_path,\n", + " project_name = search_project_name,\n", + " kgtk_path = kgtk_path,\n", + " compute_embeddings = compute_embeddings,\n", + " generate_triples = generate_triples,\n", + " generate_kgtk_search = generate_kgtk_search,\n", + " datatype_property = datatype_property,\n", + " languages = languages\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b4fdd4fa-3c5f-430e-b489-3dbbe19c5bd8", + "metadata": {}, + "source": [ + "### Split the output json lines file to 1M lines partitions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4933fcc1-da1d-48a4-b474-05ecfd688919", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p $search_output_path/kgtk-search/es_split/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38c599e6-ea05-4a1a-8280-c2c27f2ae95f", + "metadata": {}, + "outputs": [], + "source": [ + "!split $search_output_path/kgtk-search/wikidata.dwd.all.kgtk.search.sorted.jl \\\n", + " -l 1000000 \\\n", + " $search_output_path/kgtk-search/es_split/" + ] + }, + { + "cell_type": 
"markdown", + "id": "91d94f34-088a-499b-865d-7e5b93bb948e", + "metadata": {}, + "source": [ + "### Load the file into ES" + ] + }, + { + "cell_type": "markdown", + "id": "67666184-21df-4bf1-9f06-8f6fccc3f01f", + "metadata": {}, + "source": [ + "**Make sure [table-linker](https://github.com/usc-isi-i2/table-linker) is installed as well**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77c6851f-1ddb-4250-8bb8-b42f1616b21c", + "metadata": {}, + "outputs": [], + "source": [ + "for f in $search_output_path/kgtk-search/es_split/* ;\n", + "do\n", + " tl load-elasticsearch-index --es-url $es_url --es-index $es_index --es-version 7 --kgtk-jl-path $f\n", + " sleep 60\n", + "done" + ] + }, + { + "cell_type": "markdown", + "id": "5c0ba14f-8a36-4109-add2-f158cb54da39", + "metadata": {}, + "source": [ + "## Run Properties-for-this-type-notebook" + ] + }, + { + "cell_type": "markdown", + "id": "94266d0a-79a2-4566-9af1-51305fe67aaf", + "metadata": {}, + "source": [ + "This notebook is in the [kgtk-notebooks](https://github.com/usc-isi-i2/kgtk-notebooks) repo." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc82bc30-7cf9-4144-924c-8ac69bda2a01", + "metadata": {}, + "outputs": [], + "source": [ + "p_input_path=f\"{subset_output_path}/{subset_project_name}\"\n", + "p_output_path = f\"{subset_output_path}/{subset_project_name}\"\n", + "\n", + "# we will re use graph cache from the useful-files notebook\n", + "# at this point it already has the `claims` file loaded into cache.\n", + "# we will only load the required files into the cache, save time\n", + "p_graph_cache_path = f\"{subset_output_path}/{subset_project_name}/useful-files/temp.useful-files/wikidata.sqlite3.db\"\n", + "files_for_cache=\"item,datatypes,p279,p279star\"\n", + "\n", + "p_project_name = \"p1963\"\n", + "debug = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ebc1423-c731-4953-a49f-57deca5887aa", + "metadata": {}, + "outputs": [], + "source": [ + "pm.execute_notebook(\n", + " f\"{kgtk_notebooks_path}/use-cases/properties-for-this-type.ipynb\",\n", + " os.environ[\"TEMP\"] + \"/properties-for-this-type.out.ipynb\",\n", + " kernel_name=kernel_name,\n", + " parameters=dict(\n", + " input_path = p_input_path,\n", + " output_path = p_output_path,\n", + " project_name = p_project_name,\n", + " graph_cache_path = p_graph_cache_path,\n", + " debug = debug,\n", + " files_for_cache=files_for_cache \n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0d6c97a0-f61c-45dc-b352-04b6c997547b", + "metadata": {}, + "source": [ + "**move the files out into the root folder**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50870bc1-a835-4ff7-8b13-52128ff8ac0c", + "metadata": {}, + "outputs": [], + "source": [ + "!mv $p_output_path/$p_project_name/*tsv.gz $p_output_path" + ] + }, + { + "cell_type": "markdown", + "id": "c209583e-501d-462d-abeb-d0eb54c4ceb6", + "metadata": {}, + "source": [ + "## Run class-visualization notebook" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "f9f164ce-e4ca-4646-8ce4-097c9c4c1e3c", + "metadata": {}, + "outputs": [], + "source": [ + "c_input_path = f\"{subset_output_path}/{subset_project_name}\"\n", + "c_output_path = f\"{subset_output_path}/{subset_project_name}\"\n", + "c_project_name = \"class-visualization\"\n", + "\n", + "# re use the graph cache, at this point the cache has the following files loaded\n", + "# claims,item,datatypes,p279,p279star\n", + "# we only need to load label\n", + "\n", + "c_graph_cache_path = p_graph_cache_path\n", + "files_for_cache = \"label\"\n", + "debug = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67597816-1e22-4ea6-b599-109b6720dff3", + "metadata": {}, + "outputs": [], + "source": [ + "pm.execute_notebook(\n", + " f\"{kgtk_notebooks_path}/use-cases/class-visualization.ipynb\",\n", + " os.environ[\"TEMP\"] + \"/class-visualization.out.ipynb\",\n", + " kernel_name=kernel_name,\n", + " parameters=dict(\n", + " input_path = p_input_path,\n", + " output_path = p_output_path,\n", + " project_name = p_project_name,\n", + " graph_cache_path = p_graph_cache_path,\n", + " debug = debug,\n", + " files_for_cache=files_for_cache \n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fffe2e16-f871-4e39-aa18-151f7a6ee9cd", + "metadata": {}, + "outputs": [], + "source": [ + "!mv $c_output_path/$c_project_name/*tsv.gz $c_output_path" + ] + }, + { + "cell_type": "markdown", + "id": "13565322-b325-4b55-b215-7ada4cb7a435", + "metadata": {}, + "source": [ + "## Run Create-claims-augmented-for-browser notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a4a60f3-aad1-4e01-9382-c0ff82e13289", + "metadata": {}, + "outputs": [], + "source": [ + "a_input_path = f\"{subset_output_path}/{subset_project_name}\"\n", + "a_output_path = f\"{subset_output_path}/{subset_project_name}\"\n", + "a_project_name = \"browser-claims-file\"" + ] + }, + { + "cell_type": "code", 
+ "execution_count": null, + "id": "37270353-4e03-40ff-829a-36fc690b45e1", + "metadata": {}, + "outputs": [], + "source": [ + "pm.execute_notebook(\n", + " f\"{kgtk_browser_path}/Create-claims-augmented-for-browser.ipynb\",\n", + " os.environ[\"TEMP\"] + \"/Create-claims-augmented-for-browser.out.ipynb\",\n", + " kernel_name=kernel_name,\n", + " parameters=dict(\n", + " input_path = a_input_path,\n", + " output_path = a_output_path,\n", + " project_name = a_project_name\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "4d7e10c3-2b7a-4602-a0d0-33a465bda60f", + "metadata": {}, + "source": [ + "## Run KGTK-Query-Text-Search-Setup Notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a11211f-47a5-4224-8dfc-3c9b6dfe09dc", + "metadata": {}, + "outputs": [], + "source": [ + "q_input_path = f\"{a_output_path}/{a_project_name}\"\n", + "q_output_path = f\"{a_output_path}/{a_project_name}\"\n", + "\n", + "q_project_name = \"kgtk-browser-files\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d20ab059-fa30-475e-a2be-96e72a29b80c", + "metadata": {}, + "outputs": [], + "source": [ + "pm.execute_notebook(\n", + " f\"{kgtk_browser_path}/KGTK-Query-Text-Search-Setup.ipynb\",\n", + " os.environ[\"TEMP\"] + \"/KGTK-Query-Text-Search-Setup.ipynb\",\n", + " kernel_name=kernel_name,\n", + " parameters=dict(\n", + " input_path = q_input_path,\n", + " output_path = q_output_path,\n", + " project_name = q_project_name\n", + " )\n", + ")" + ] } ], "metadata": { From 136045dff87fbe79ed5142cbaa1c208ae210801f Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 25 May 2022 10:10:12 -0700 Subject: [PATCH 08/21] PEP 8 --- kgtk/cli/reachable_nodes.py | 241 ++++++++++++++++++++++-------------- 1 file changed, 145 insertions(+), 96 deletions(-) diff --git a/kgtk/cli/reachable_nodes.py b/kgtk/cli/reachable_nodes.py index 29f6eb00e..2abd4d08b 100644 --- a/kgtk/cli/reachable_nodes.py +++ b/kgtk/cli/reachable_nodes.py @@ -12,6 +12,7 @@ 
from kgtk.cli_argparse import KGTKArgumentParser, KGTKFiles + def parser(): return { 'help': 'Find reachable nodes in a graph.' @@ -33,69 +34,88 @@ def add_arguments_extended(parser: KGTKArgumentParser, parsed_shared_args: Names parser.add_input_file(positional=True, who="The KGTK file to find connected components in.") parser.add_output_file() - # parser.add_argument(action="store", type=str, dest="filename", metavar='filename', help='input filename here') - # parser.add_argument('-o', '--out', action='store', type=str, dest='output', help='File to output the reachable nodes,if empty will be written out to standard output',default=None) - - parser.add_argument('--root',action='store',dest='root',type=str, nargs="*", + parser.add_argument('--root', action='store', dest='root', type=str, nargs="*", help='Set of root nodes to use, space- or comma-separated strings. (default=None)') - parser.add_argument('--root-file', '--rootfile',action='store',dest='rootfile',help='Option to specify a file containing the set of root nodes',default=None) - parser.add_argument('--rootfilecolumn',action='store',type=str,dest='rootfilecolumn', - help='Specify the name or number of the root file column with the root nodes. (default=node1 or its alias if edge file, id if node file)') - parser.add_argument("--subj", action="store", type=str, dest="subject_column_name", help='Name of the subject column. (default: node1 or its alias)') - parser.add_argument("--obj", action="store", type=str, dest="object_column_name", help='Name of the object column. (default: label or its alias)') - parser.add_argument("--pred",action="store" ,type=str, dest="predicate_column_name",help='Name of the predicate column. 
(default: node2 or its alias)') + parser.add_argument('--root-file', '--rootfile', action='store', dest='rootfile', + help='Option to specify a file containing the set of root nodes', default=None) + parser.add_argument('--rootfilecolumn', action='store', type=str, dest='rootfilecolumn', + help='Specify the name or number of the root file column with the root nodes. ' + '(default=node1 or its alias if edge file, id if node file)') + parser.add_argument("--subj", action="store", type=str, dest="subject_column_name", + help='Name of the subject column. (default: node1 or its alias)') + parser.add_argument("--obj", action="store", type=str, dest="object_column_name", + help='Name of the object column. (default: label or its alias)') + parser.add_argument("--pred", action="store", type=str, dest="predicate_column_name", + help='Name of the predicate column. (default: node2 or its alias)') parser.add_argument("--prop", "--props", action="store", type=str, dest="props", nargs="*", - help='Properties to consider while finding reachable nodes, space- or comma-separated string. (default: all properties)',default=None) + help='Properties to consider while finding reachable nodes, space- or comma-separated string. ' + '(default: all properties)', + default=None) parser.add_argument('--props-file', action='store', dest='props_file', - help='Option to specify a file containing the set of properties',default=None) + help='Option to specify a file containing the set of properties', default=None) parser.add_argument('--propsfilecolumn', action='store', type=str, dest='propsfilecolumn', default=None, - help='Specify the name or number of the props file column with the property names. (default=node1 or its alias if edge file, id if node file)') + help='Specify the name or number of the props file column with the property names. 
' + '(default=node1 or its alias if edge file, id if node file)') parser.add_argument('--inverted', dest="inverted", - help="When True, and when --undirected is False, invert the source and target nodes in the graph. (default=%(default)s)", + help="When True, and when --undirected is False, invert the source and target nodes in the " + "graph. (default=%(default)s)", type=optional_bool, nargs='?', const=True, default=False, metavar="True|False") - parser.add_argument("--inverted-prop", "--inverted-props", action="store", type=str, dest="inverted_props", nargs="*", - help='Properties to invert, space- or comma-separated string. (default: no properties)',default=None) + parser.add_argument("--inverted-prop", "--inverted-props", action="store", type=str, dest="inverted_props", + nargs="*", + help='Properties to invert, space- or comma-separated string. (default: no properties)', + default=None) parser.add_argument('--inverted-props-file', action='store', dest='inverted_props_file', - help='Option to specify a file containing the set of inverted properties',default=None) - parser.add_argument('--invertedpropsfilecolumn', action='store', type=str, dest='invertedpropsfilecolumn', default=None, - help='Specify the name or number of the inverted props file column with the property names. (default=node1 or its alias if edge file, id if node file)') + help='Option to specify a file containing the set of inverted properties', default=None) + parser.add_argument('--invertedpropsfilecolumn', action='store', type=str, dest='invertedpropsfilecolumn', + default=None, + help='Specify the name or number of the inverted props file column with the property names. ' + '(default=node1 or its alias if edge file, id if node file)') parser.add_argument('--undirected', dest="undirected", help="When True, specify graph as undirected. 
(default=%(default)s)", type=optional_bool, nargs='?', const=True, default=False, metavar="True|False") - parser.add_argument("--undirected-prop", "--undirected-props", action="store", type=str, dest="undirected_props", nargs="*", - help='Properties to treat as undirected, space- or comma-separated string. (default: no properties)',default=None) + parser.add_argument("--undirected-prop", "--undirected-props", action="store", type=str, dest="undirected_props", + nargs="*", + help='Properties to treat as undirected, space- or comma-separated string. ' + '(default: no properties)', + default=None) parser.add_argument('--undirected-props-file', action='store', dest='undirected_props_file', - help='Option to specify a file containing the set of undirected properties',default=None) - parser.add_argument('--undirectedpropsfilecolumn', action='store', type=str, dest='undirectedpropsfilecolumn', default=None, - help='Specify the name or number of the undirected props file column with the property names. (default=node1 or its alias if edge file, id if node file)') - - parser.add_argument('--label', action='store', type=str, dest='label', help='The label for the reachable relationship. (default: %(default)s)',default="reachable") - parser.add_argument('--selflink',dest='selflink_bool', + help='Option to specify a file containing the set of undirected properties', default=None) + parser.add_argument('--undirectedpropsfilecolumn', action='store', type=str, dest='undirectedpropsfilecolumn', + default=None, + help='Specify the name or number of the undirected props file column with the property names. ' + '(default=node1 or its alias if edge file, id if node file)') + + parser.add_argument('--label', action='store', type=str, dest='label', + help='The label for the reachable relationship. (default: %(default)s)', default="reachable") + parser.add_argument('--selflink', dest='selflink_bool', help='When True, include a link from each output node to itself. 
(default=%(default)s)', type=optional_bool, nargs='?', const=True, default=False, metavar="True|False") - parser.add_argument('--show-properties',dest='show_properties', + parser.add_argument('--show-properties', dest='show_properties', help='When True, show the graph properties. (default=%(default)s)', type=optional_bool, nargs='?', const=True, default=False, metavar="True|False") - parser.add_argument('--breadth-first',dest='breadth_first', - help='When True, search the graph breadth first. When false, search depth first. (default=%(default)s)', + parser.add_argument('--breadth-first', dest='breadth_first', + help='When True, search the graph breadth first. When false, search depth first. ' + '(default=%(default)s)', type=optional_bool, nargs='?', const=True, default=False, metavar="True|False") - parser.add_argument('--depth-limit',dest='depth_limit', + parser.add_argument('--depth-limit', dest='depth_limit', help='An optional depth limit for breadth-first searches. (default=%(default)s)', type=int, default=None) - parser.add_argument('--show-distance',dest='show_distance', - help='When True, also given breadth first true, append another column showing the shortest distance, default col name is distance', + parser.add_argument('--show-distance', dest='show_distance', + help='When True, also given breadth first true, append another column showing the ' + 'shortest distance, default col name is distance', type=optional_bool, nargs='?', const=True, default=False, metavar="True|False") - parser.add_argument('--dist-col-name', action='store', type=str, dest='dist_col_name', help='The column name for distance, default is distance',default="distance") + parser.add_argument('--dist-col-name', action='store', type=str, dest='dist_col_name', + help='The column name for distance, default is distance', default="distance") KgtkReader.add_debug_arguments(parser, expert=_expert) KgtkReaderOptions.add_arguments(parser, @@ -109,6 +129,7 @@ def add_arguments_extended(parser: 
KGTKArgumentParser, parsed_shared_args: Names KgtkReaderOptions.add_arguments(parser, mode_options=True, who="inverted_props", expert=_expert, defaults=False) KgtkValueOptions.add_arguments(parser, expert=_expert) + def run(input_file: KGTKFiles, output_file: KGTKFiles, @@ -147,36 +168,32 @@ def run(input_file: KGTKFiles, show_distance: bool, dist_col_name: str, - **kwargs, # Whatever KgtkFileOptions and KgtkValueOptions want. + **kwargs, # Whatever KgtkFileOptions and KgtkValueOptions want. ): import sys - import csv from pathlib import Path - import time from graph_tool.search import dfs_iterator, bfs_iterator, bfs_search, BFSVisitor - # from graph_tool import load_graph_from_csv from graph_tool.util import find_edge from kgtk.exceptions import KGTKException from kgtk.cli_argparse import KGTKArgumentParser - from kgtk.gt.gt_load import load_graph_from_kgtk from kgtk.io.kgtkwriter import KgtkWriter from kgtk.io.kgtkreader import KgtkReader, KgtkReaderOptions from kgtk.value.kgtkvalueoptions import KgtkValueOptions - #Graph-tool names columns that are not subject or object c0, c1... This function finds the number that graph tool assigned to the predicate column - def find_pred_position(sub,pred,obj): + # Graph-tool names columns that are not subject or object c0, c1... 
+ # This function finds the number that graph tool assigned to the predicate column + def find_pred_position(sub, pred, obj): if pred < sub and pred < obj: return pred - elif (pred > sub and pred < obj) or (predobj): - return pred-1 + elif (pred > sub and pred < obj) or (pred < sub and pred > obj): + return pred - 1 else: - return pred-2 + return pred - 2 def get_edges_by_edge_prop(g, p, v): return find_edge(g, prop=g.properties[('e', p)], match=v) - input_kgtk_file: Path = KGTKArgumentParser.get_input_file(input_file) output_kgtk_file: Path = KGTKArgumentParser.get_output_file(output_file) @@ -187,21 +204,23 @@ def get_edges_by_edge_prop(g, p, v): input_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="input", fallback=True) root_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="root", fallback=True) props_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="props", fallback=True) - undirected_props_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="undirected_props", fallback=True) - inverted_props_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="inverted_props", fallback=True) + undirected_props_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="undirected_props", + fallback=True) + inverted_props_reader_options: KgtkReaderOptions = KgtkReaderOptions.from_dict(kwargs, who="inverted_props", + fallback=True) value_options: KgtkValueOptions = KgtkValueOptions.from_dict(kwargs) if root is None: - root = [ ] # This simplifies matters. + root = [] # This simplifies matters. if props is None: - props = [ ] # This simplifies matters. + props = [] # This simplifies matters. if undirected_props is None: - undirected_props = [ ] # This simplifies matters. + undirected_props = [] # This simplifies matters. if inverted_props is None: - inverted_props = [ ] # This simplifies matters. 
+ inverted_props = [] # This simplifies matters. if show_options: if root is not None: @@ -275,7 +294,7 @@ def get_edges_by_edge_prop(g, p, v): if rootfile is not None: if verbose: - print("Reading the root file %s" % repr(rootfile), file=error_file, flush=True) + print("Reading the root file %s" % repr(rootfile), file=error_file, flush=True) try: root_kr: KgtkReader = KgtkReader.open(Path(rootfile), error_file=error_file, @@ -290,14 +309,24 @@ def get_edges_by_edge_prop(g, p, v): rootcol: int if root_kr.is_edge_file: - rootcol = int(rootfilecolumn) if rootfilecolumn is not None and rootfilecolumn.isdigit() else root_kr.get_node1_column_index(rootfilecolumn) + rootcol = int( + rootfilecolumn) \ + if rootfilecolumn is not None and rootfilecolumn.isdigit() \ + else root_kr.get_node1_column_index(rootfilecolumn) elif root_kr.is_node_file: - rootcol = int(rootfilecolumn) if rootfilecolumn is not None and rootfilecolumn.isdigit() else root_kr.get_id_column_index(rootfilecolumn) + rootcol = int( + rootfilecolumn) \ + if rootfilecolumn is not None and rootfilecolumn.isdigit() \ + else root_kr.get_id_column_index(rootfilecolumn) elif rootfilecolumn is not None: - rootcol = int(rootfilecolumn) if rootfilecolumn is not None and rootfilecolumn.isdigit() else root_kr.column_name_map.get(rootfilecolumn, -1) + rootcol = int( + rootfilecolumn) \ + if rootfilecolumn is not None and rootfilecolumn.isdigit() \ + else root_kr.column_name_map.get(rootfilecolumn, -1) else: root_kr.close() - raise KGTKException("The root file is neither an edge nor a node file and the root column name was not supplied.") + raise KGTKException( + "The root file is neither an edge nor a node file and the root column name was not supplied.") if rootcol < 0: root_kr.close() @@ -307,10 +336,10 @@ def get_edges_by_edge_prop(g, p, v): rootnode: str = row[rootcol] root_set.add(rootnode) root_kr.close() - + if len(root) > 0: if verbose: - print ("Adding root nodes from the command line.", file=error_file, 
flush=True) + print("Adding root nodes from the command line.", file=error_file, flush=True) root_group: str for root_group in root: r: str @@ -323,11 +352,10 @@ def get_edges_by_edge_prop(g, p, v): elif verbose: print("%d nodes in the root set." % len(root_set), file=error_file, flush=True) - property_set: typing.Set[str] = set() if props_file is not None: if verbose: - print("Reading the root file %s" % repr(props_file), file=error_file, flush=True) + print("Reading the root file %s" % repr(props_file), file=error_file, flush=True) try: props_kr: KgtkReader = KgtkReader.open(Path(props_file), error_file=error_file, @@ -342,14 +370,24 @@ def get_edges_by_edge_prop(g, p, v): propscol: int if props_kr.is_edge_file: - propscol = int(propsfilecolumn) if propsfilecolumn is not None and propsfilecolumn.isdigit() else props_kr.get_node1_column_index(propsfilecolumn) + propscol = int( + propsfilecolumn) \ + if propsfilecolumn is not None and propsfilecolumn.isdigit() \ + else props_kr.get_node1_column_index(propsfilecolumn) elif props_kr.is_node_file: - propscol = int(propsfilecolumn) if propsfilecolumn is not None and propsfilecolumn.isdigit() else props_kr.get_id_column_index(propsfilecolumn) + propscol = int( + propsfilecolumn) \ + if propsfilecolumn is not None and propsfilecolumn.isdigit() \ + else props_kr.get_id_column_index(propsfilecolumn) elif propsfilecolumn is not None: - propscol = int(propsfilecolumn) if propsfilecolumn is not None and propsfilecolumn.isdigit() else props_kr.column_name_map.get(propsfilecolumn, -1) + propscol = int( + propsfilecolumn) \ + if propsfilecolumn is not None and propsfilecolumn.isdigit() \ + else props_kr.column_name_map.get(propsfilecolumn, -1) else: props_kr.close() - raise KGTKException("The props file is neither an edge nor a node file and the root column name was not supplied.") + raise KGTKException( + "The props file is neither an edge nor a node file and the root column name was not supplied.") if propscol < 0: 
props_kr.close() @@ -359,7 +397,7 @@ def get_edges_by_edge_prop(g, p, v): property_name: str = row[propscol] property_set.add(property_name) props_kr.close() - + if len(props) > 0: # Filter the graph, G, to include only edges where the predicate (label) # column contains one of the selected properties. @@ -370,13 +408,13 @@ def get_edges_by_edge_prop(g, p, v): for prop in prop_group.split(','): property_set.add(prop) if verbose and len(property_set) > 0: - print("property set=%s" % " ".join(sorted(list(property_set))), file=error_file, flush=True) - + print("property set=%s" % " ".join(sorted(list(property_set))), file=error_file, flush=True) undirected_property_set: typing.Set[str] = set() if undirected_props_file is not None: if verbose: - print("Reading the undirected properties file %s" % repr(undirected_props_file), file=error_file, flush=True) + print("Reading the undirected properties file %s" % repr(undirected_props_file), file=error_file, + flush=True) try: undirected_props_kr: KgtkReader = KgtkReader.open(Path(undirected_props_file), error_file=error_file, @@ -385,20 +423,27 @@ def get_edges_by_edge_prop(g, p, v): value_options=value_options, verbose=verbose, very_verbose=very_verbose, - ) + ) except SystemExit: raise KGTKException("Exiting.") undirected_props_col: int if undirected_props_kr.is_edge_file: - undirected_props_col = int(undirectedpropsfilecolumn) if undirectedpropsfilecolumn is not None and undirectedpropsfilecolumn.isdigit() else undirected_props_kr.get_node1_column_index(undirectedpropsfilecolumn) + undirected_props_col = int(undirectedpropsfilecolumn) \ + if undirectedpropsfilecolumn is not None and undirectedpropsfilecolumn.isdigit() \ + else undirected_props_kr.get_node1_column_index(undirectedpropsfilecolumn) elif undirected_props_kr.is_node_file: - undirected_props_col = int(undirectedpropsfilecolumn) if undirectedpropsfilecolumn is not None and undirectedpropsfilecolumn.isdigit() else 
undirected_props_kr.get_id_column_index(undirectedpropsfilecolumn) + undirected_props_col = int(undirectedpropsfilecolumn) \ + if undirectedpropsfilecolumn is not None and undirectedpropsfilecolumn.isdigit() \ + else undirected_props_kr.get_id_column_index(undirectedpropsfilecolumn) elif undirectedpropsfilecolumn is not None: - undirected_props_col = int(undirectedpropsfilecolumn) if undirectedpropsfilecolumn is not None and undirectedpropsfilecolumn.isdigit() else undirected_props_kr.column_name_map.get(undirectedpropsfilecolumn, -1) + undirected_props_col = int(undirectedpropsfilecolumn) \ + if undirectedpropsfilecolumn is not None and undirectedpropsfilecolumn.isdigit() \ + else undirected_props_kr.column_name_map.get(undirectedpropsfilecolumn, -1) else: undirected_props_kr.close() - raise KGTKException("The undirected props file is neither an edge nor a node file and the root column name was not supplied.") + raise KGTKException("The undirected props file is neither an edge nor a node file and the root column " + "name was not supplied.") if undirected_props_col < 0: undirected_props_kr.close() @@ -418,13 +463,13 @@ def get_edges_by_edge_prop(g, p, v): for und_prop in und_prop_group.split(','): undirected_property_set.add(und_prop) if verbose and len(undirected_property_set) > 0: - print("undirected property set=%s" % " ".join(sorted(list(undirected_property_set))), file=error_file, flush=True) - + print("undirected property set=%s" % " ".join(sorted(list(undirected_property_set))), file=error_file, + flush=True) inverted_property_set: typing.Set[str] = set() if inverted_props_file is not None: if verbose: - print("Reading the inverted properties file %s" % repr(inverted_props_file), file=error_file, flush=True) + print("Reading the inverted properties file %s" % repr(inverted_props_file), file=error_file, flush=True) try: inverted_props_kr: KgtkReader = KgtkReader.open(Path(inverted_props_file), error_file=error_file, @@ -433,20 +478,27 @@ def 
get_edges_by_edge_prop(g, p, v): value_options=value_options, verbose=verbose, very_verbose=very_verbose, - ) + ) except SystemExit: raise KGTKException("Exiting.") inverted_props_col: int if inverted_props_kr.is_edge_file: - inverted_props_col = int(invertedpropsfilecolumn) if invertedpropsfilecolumn is not None and invertedpropsfilecolumn.isdigit() else inverted_props_kr.get_node1_column_index(invertedpropsfilecolumn) + inverted_props_col = int(invertedpropsfilecolumn) \ + if invertedpropsfilecolumn is not None and invertedpropsfilecolumn.isdigit() \ + else inverted_props_kr.get_node1_column_index(invertedpropsfilecolumn) elif inverted_props_kr.is_node_file: - inverted_props_col = int(invertedpropsfilecolumn) if invertedpropsfilecolumn is not None and invertedpropsfilecolumn.isdigit() else inverted_props_kr.get_id_column_index(invertedpropsfilecolumn) + inverted_props_col = int(invertedpropsfilecolumn) \ + if invertedpropsfilecolumn is not None and invertedpropsfilecolumn.isdigit() \ + else inverted_props_kr.get_id_column_index(invertedpropsfilecolumn) elif invertedpropsfilecolumn is not None: - inverted_props_col = int(invertedpropsfilecolumn) if invertedpropsfilecolumn is not None and invertedpropsfilecolumn.isdigit() else inverted_props_kr.column_name_map.get(invertedpropsfilecolumn, -1) + inverted_props_col = int(invertedpropsfilecolumn) \ + if invertedpropsfilecolumn is not None and invertedpropsfilecolumn.isdigit() \ + else inverted_props_kr.column_name_map.get(invertedpropsfilecolumn, -1) else: inverted_props_kr.close() - raise KGTKException("The inverted props file is neither an edge nor a node file and the root column name was not supplied.") + raise KGTKException( + "The inverted props file is neither an edge nor a node file and the root column name was not supplied.") if inverted_props_col < 0: inverted_props_kr.close() @@ -456,7 +508,7 @@ def get_edges_by_edge_prop(g, p, v): inverted_property_name: str = row[inverted_props_col] 
inverted_property_set.add(inverted_property_name) inverted_props_kr.close() - + if len(inverted_props) > 0: # Edges where the predicate (label) column contains one of the selected # properties will have the source and target columns swapped. @@ -467,8 +519,7 @@ def get_edges_by_edge_prop(g, p, v): for inv_prop in inv_prop_group.split(','): inverted_property_set.add(inv_prop) if verbose and len(inverted_property_set): - print("inverted property set=%s" % " ".join(sorted(list(inverted_property_set))), file=error_file, flush=True) - + print("inverted property set=%s" % " ".join(sorted(list(inverted_property_set))), file=error_file, flush=True) try: kr: KgtkReader = KgtkReader.open(input_kgtk_file, @@ -481,7 +532,7 @@ def get_edges_by_edge_prop(g, p, v): ) except SystemExit: raise KGTKException("Exiting.") - + sub: int = kr.get_node1_column_index(subject_column_name) if sub < 0: print("Unknown subject column %s" % repr(subject_column_name), file=error_file, flush=True) @@ -499,9 +550,8 @@ def get_edges_by_edge_prop(g, p, v): raise KGTKException("Exiting due to unknown column.") if verbose: - print("special columns: sub=%d pred=%d obj=%d" % (sub, pred, obj), file=error_file, flush=True) + print("special columns: sub=%d pred=%d obj=%d" % (sub, pred, obj), file=error_file, flush=True) - # G = load_graph_from_csv(filename,not(undirected),skip_first=not(header_bool),hashed=True,csv_options={'delimiter': '\t'},ecols=(sub,obj)) G = load_graph_from_kgtk(kr, directed=not undirected, inverted=inverted, @@ -513,11 +563,11 @@ def get_edges_by_edge_prop(g, p, v): verbose=verbose, out=error_file) - name = G.vp["name"] # Get the vertex name property map (vertex to ndoe1 (subject) name) + name = G.vp["name"] # Get the vertex name property map (vertex to ndoe1 (subject) name) if show_properties: print("Graph name=%s" % repr(name), file=error_file, flush=True) - print("Graph properties:" , file=error_file, flush=True) + print("Graph properties:", file=error_file, flush=True) key: 
typing.Any for key in G.properties: print(" %s: %s" % (repr(key), repr(G.properties[key])), file=error_file, flush=True) @@ -532,9 +582,9 @@ def get_edges_by_edge_prop(g, p, v): print("%d root nodes found in the graph." % len(index_list), file=error_file, flush=True) if breadth_first and show_distance: - output_header: typing.List[str] = ['node1','label','node2', dist_col_name] + output_header: typing.List[str] = ['node1', 'label', 'node2', dist_col_name] else: - output_header: typing.List[str] = ['node1','label','node2'] + output_header: typing.List[str] = ['node1', 'label', 'node2'] try: kw: KgtkWriter = KgtkWriter.open(output_header, @@ -553,7 +603,7 @@ def get_edges_by_edge_prop(g, p, v): kw.writerow([name[index], label, name[index], 0]) elif selflink_bool and not show_distance: kw.writerow([name[index], label, name[index]]) - + if breadth_first: if depth_limit is None: if show_distance: @@ -563,7 +613,7 @@ def get_edges_by_edge_prop(g, p, v): if e.source() in past: count += 1 past = set() - kw.writerow([name[index], label, name[e.target()], count+1]) + kw.writerow([name[index], label, name[e.target()], count + 1]) past.add(e.target()) else: @@ -586,7 +636,7 @@ def tree_edge(self, e): newdist = self.dist[e.source()] + 1 if depth_limit is not None and newdist > depth_limit: - raise DepthExceeded + raise DepthExceeded self.dist[e.target()] = newdist kw.writerow([name[index], label, name[e.target()], newdist]) @@ -613,7 +663,7 @@ def tree_edge(self, e): self.pred[e.target()] = int(e.source()) newdist = self.dist[e.source()] + 1 if depth_limit is not None and newdist > depth_limit: - raise DepthExceeded + raise DepthExceeded self.dist[e.targt()] = newdist kw.writerow([name[index], label, name[e.target()]]) @@ -630,4 +680,3 @@ def tree_edge(self, e): kw.close() kr.close() - From 78e1c0c5c0490c75ee720f335ff86c17679650be Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 25 May 2022 10:10:32 -0700 Subject: [PATCH 09/21] use the --label option in reachable-nodes --- 
use-cases/Wikidata-Useful-Files.ipynb | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/use-cases/Wikidata-Useful-Files.ipynb b/use-cases/Wikidata-Useful-Files.ipynb index 1e92f1f69..3e5339890 100644 --- a/use-cases/Wikidata-Useful-Files.ipynb +++ b/use-cases/Wikidata-Useful-Files.ipynb @@ -500,6 +500,7 @@ " --rootfile $TEMP/P279.roots.tsv\n", " --selflink \n", " -i $OUT/derived.P279.tsv.gz\n", + " --label P279star\n", " -o $TEMP/P279.reachable.tsv.gz\"\"\")" ] }, @@ -539,10 +540,8 @@ ] }, { - "cell_type": "code", - "execution_count": 24, + "cell_type": "raw", "metadata": {}, - "outputs": [], "source": [ "!$kypher -i $TEMP/P279.reachable.tsv.gz -o $TEMP/P279star.1.tsv.gz \\\n", "--match '(n1)-[]->(n2)' \\\n", @@ -563,7 +562,7 @@ "metadata": {}, "outputs": [], "source": [ - "!$kgtk add-id --id-style wikidata -i $TEMP/P279star.1.tsv.gz -o $OUT/derived.P279star.tsv.gz" + "!$kgtk add-id --id-style wikidata -i $TEMP/P279.reachable.tsv.gz -o $OUT/derived.P279star.tsv.gz" ] }, { From 4ea12adbfa7dd42228842ffa44a4c61189d09f9e Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 25 May 2022 10:54:31 -0700 Subject: [PATCH 10/21] rename notebook --- .../Embeddings-Elasticsearch-Triples.ipynb | 624 ++++++++++++++++++ 1 file changed, 624 insertions(+) create mode 100644 use-cases/Embeddings-Elasticsearch-Triples.ipynb diff --git a/use-cases/Embeddings-Elasticsearch-Triples.ipynb b/use-cases/Embeddings-Elasticsearch-Triples.ipynb new file mode 100644 index 000000000..f5fd0375c --- /dev/null +++ b/use-cases/Embeddings-Elasticsearch-Triples.ipynb @@ -0,0 +1,624 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "naval-vulnerability", + "metadata": {}, + "source": [ + "# Computes Graph and Text Embeddings, Elasticsearch Ready KGTK File and RDF Triples for Blazegraph\n", + "\n", + "This notebook computes the following:\n", + "\n", + "- `complEx` graph embeddings\n", + "- `transE` graph embeddings\n", + "- `BERT` text embeddings\n", + "- `elasticsearch` ready 
KGTK edge file for [KGTK Search](https://kgtk.isi.edu/search/)\n", + "- `elasticsearch` ready KGTK edge file for Table Linker\n", + "- `RDF Triples` to be loaded into blazegraph\n", + "\n", + "Inputs:\n", + "\n", + "- `item_file`: the subset of the `claims_file` consisting of edges for properties of data type `wikibase-item`\n", + "- `label_file`, `alias_file` and `description_file` containing labels, aliases and descriptions. It is assumed that these files contain the labels, aliases and descriptions of all nodes appearing in the claims file. Users may provide these files for specific languages only.\n" + ] + }, + { + "cell_type": "markdown", + "id": "endless-exemption", + "metadata": {}, + "source": [ + "### Batch Invocation\n", + "Example batch command. The second argument is a notebook where the output will be stored. You can load it to see progress.\n", + "\n", + "```\n", + "papermill Embeddings-Elasticsearch-Triples.ipynb Embeddings-Elasticsearch-Triples.out.ipynb \\\n", + "-p claims_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/all.tsv.gz \\\n", + "-p label_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/part.label.en.tsv.gz \\\n", + "-p item_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/part.wikibase-item.tsv.gz \\\n", + "-p property_item_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/part.property.wikibase-item.tsv.gz \\\n", + "-p output_path \\\n", + "-p output_folder useful_files_v4 \\\n", + "-p temp_folder temp.useful_files_v4 \\\n", + "-p delete_database no \\\n", + "-p languages es,ru,zh-cn\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "bb3e0847-155b-4251-821a-34e27d75c8a6", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "\n", + "import pandas as pd\n", + " \n", + "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", + "from kgtk.functions 
import kgtk, kypher" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "departmental-connectivity", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "# Parameters\n", + "\n", + "input_path = \"/data/amandeep/wikidata-20220505-dwd-v4\"\n", + "output_path = \"/data/amandeep/wikidata-20220505-dwd-v4\"\n", + "kgtk_path = \"/Users/amandeep/Github/kgtk\"\n", + "\n", + "graph_cache_path = None\n", + "\n", + "project_name = \"embeddings-elasticsearch-triples\"\n", + "\n", + "languages = 'en,ru,es,zh-cn,de,it,nl,pl,fr,pt,sv'\n", + "\n", + "files = 'label_all,alias_all,description_all'\n", + "compute_embeddings = False\n", + "generate_triples = False\n", + "generate_kgtk_search = True\n", + "datatype_property = \"datatype\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f44d69f2-eca7-4ac6-8b63-1d7c42898f59", + "metadata": {}, + "outputs": [], + "source": [ + "files = files.split(',')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1b52a584-551e-43ad-becb-9314e95932fa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User home: /nas/home/amandeep\n", + "Current dir: /data/amandeep/Github/kgtk/use-cases\n", + "KGTK dir: /Users/amandeep/Github/kgtk\n", + "Use-cases dir: /Users/amandeep/Github/kgtk/use-cases\n" + ] + } + ], + "source": [ + "ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n", + "ck.configure_kgtk(input_graph_path=input_path,\n", + " output_path=output_path,\n", + " project_name=project_name,\n", + " graph_cache_path=graph_cache_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1ffdcaec-c0d7-468c-a207-186fad300d56", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "USE_CASES_DIR: /Users/amandeep/Github/kgtk/use-cases\n", + "TEMP: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples\n", + 
"EXAMPLES_DIR: /Users/amandeep/Github/kgtk/examples\n", + "KGTK_LABEL_FILE: /data/amandeep/wikidata-20220505-dwd-v4/labels.en.tsv.gz\n", + "GRAPH: /data/amandeep/wikidata-20220505-dwd-v4\n", + "kypher: kgtk query --graph-cache /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples/wikidata.sqlite3.db\n", + "KGTK_OPTION_DEBUG: false\n", + "kgtk: kgtk\n", + "KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples/wikidata.sqlite3.db\n", + "OUT: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples\n", + "STORE: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples/wikidata.sqlite3.db\n", + "label_all: /data/amandeep/wikidata-20220505-dwd-v4/labels.tsv.gz\n", + "alias_all: /data/amandeep/wikidata-20220505-dwd-v4/aliases.tsv.gz\n", + "description_all: /data/amandeep/wikidata-20220505-dwd-v4/descriptions.tsv.gz\n" + ] + } + ], + "source": [ + "ck.print_env_variables()" + ] + }, + { + "cell_type": "markdown", + "id": "excellent-passenger", + "metadata": {}, + "source": [ + "## Graph Embeddings" + ] + }, + { + "cell_type": "markdown", + "id": "integrated-slide", + "metadata": {}, + "source": [ + "### complEx" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "eleven-tribe", + "metadata": {}, + "outputs": [], + "source": [ + "if compute_embeddings:\n", + " complex_temp_folder = f\"{wikidata_root_folder}/temp.graph-embeddings.complex\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "found-traffic", + "metadata": {}, + "outputs": [], + "source": [ + "if compute_embeddings:\n", + " !mkdir -p {complex_temp_folder}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "gentle-wheat", + "metadata": {}, + "outputs": [], + "source": [ + "if compute_embeddings:\n", + " os.environ['TEMP_COMPLEX'] = complex_temp_folder" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "attached-texture", + "metadata": {}, + "outputs": [], + "source": [ + "if compute_embeddings:\n", + " !kgtk graph-embeddings --verbose -i \"$ITEMS\" \\\n", + " -o $OUT/wikidatadwd.complEx.graph-embeddings.txt \\\n", + " --retain_temporary_data True \\\n", + " --operator ComplEx \\\n", + " --workers 24 \\\n", + " --log $TEMP_COMPLEX/ge.complex.log \\\n", + " -T $TEMP_COMPLEX \\\n", + " -ot w2v \\\n", + " -e 600" + ] + }, + { + "cell_type": "markdown", + "id": "piano-thousand", + "metadata": {}, + "source": [ + "### transE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "serial-landscape", + "metadata": {}, + "outputs": [], + "source": [ + "if compute_embeddings:\n", + " transe_temp_folder = f\"{wikidata_root_folder}/temp.graph-embeddings.transe\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "naval-morgan", + "metadata": {}, + "outputs": [], + "source": [ + "if compute_embeddings:\n", + " !mkdir -p {transe_temp_folder}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "little-dietary", + "metadata": {}, + "outputs": [], + "source": [ + "if compute_embeddings:\n", + " os.environ['TEMP_TRANSE'] = transe_temp_folder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "annoying-council", + "metadata": {}, + "outputs": [], + "source": [ + "if compute_embeddings:\n", + " !$kgtk graph-embeddings --verbose -i \"$ITEMS\" \\\n", + " -o $OUT/wikidatadwd.transE.graph-embeddings.txt \\\n", + " --retain_temporary_data True \\\n", + " --operator TransE \\\n", + " --workers 24 \\\n", + " --log $TEMP_TRANSE/ge.transE.log \\\n", + " -T $TEMP_TRANSE \\\n", + " -ot w2v \\\n", + " -e 600" + ] + }, + { + "cell_type": "markdown", + "id": "speaking-torture", + "metadata": {}, + "source": [ + "### BERT Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "patient-times", + "metadata": {}, + "outputs": [], 
+ "source": [ + "if compute_embeddings:\n", + " !$kgtk text-embedding -i $ALL \\\n", + " --model roberta-large-nli-mean-tokens \\\n", + " --property-labels-file $LABELS_EN \\\n", + " --isa-properties P31 P279 P106 P39 P1382 P373 P452 \\\n", + " --save-embedding-sentence > $OUT/wikidatadwd-text-embeddings-all.tsv" + ] + }, + { + "cell_type": "markdown", + "id": "similar-bidder", + "metadata": {}, + "source": [ + "### Build KGTK edge file for KGTK Search" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "closed-yemen", + "metadata": {}, + "outputs": [], + "source": [ + "if generate_kgtk_search:\n", + " kgtk(\"\"\"cat -i $GRAPH/all.tsv.gz \n", + " -i $GRAPH/derived.isastar.tsv.gz \n", + " -i $GRAPH/metadata.pagerank.undirected.tsv.gz\n", + " -i $GRAPH/metadata.pagerank.directed.tsv.gz\n", + " -o $TEMP/wikidata.dwd.all.kgtk.search.unsorted.tsv.gz\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "trained-typing", + "metadata": {}, + "outputs": [], + "source": [ + "if generate_kgtk_search:\n", + " kgtk(f\"\"\"sort -i $TEMP/wikidata.dwd.all.kgtk.search.unsorted.tsv.gz\n", + " --columns node1\n", + " --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'\n", + " -o $OUT/wikidata.dwd.all.kgtk.search.sorted.tsv.gz\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "6f2f5864-5dae-47ec-b4de-0726654de82c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Processed 1000000 lines...
0Processed 2000000 lines...
1Processed 3000000 lines...
2Processed 4000000 lines...
3Processed 5000000 lines...
4Processed 6000000 lines...
......
5080Processed 5082000000 lines...
5081Processed 5083000000 lines...
5082Processed 5084000000 lines...
5083Processed 5085000000 lines...
5084Done!
\n", + "

5085 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " Processed 1000000 lines...\n", + "0 Processed 2000000 lines...\n", + "1 Processed 3000000 lines...\n", + "2 Processed 4000000 lines...\n", + "3 Processed 5000000 lines...\n", + "4 Processed 6000000 lines...\n", + "... ...\n", + "5080 Processed 5082000000 lines...\n", + "5081 Processed 5083000000 lines...\n", + "5082 Processed 5084000000 lines...\n", + "5083 Processed 5085000000 lines...\n", + "5084 Done!\n", + "\n", + "[5085 rows x 1 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "if generate_kgtk_search:\n", + " kgtk(f\"\"\"--debug build-kgtk-search-input --input-file \"$OUT\"/wikidata.dwd.all.kgtk.search.sorted.tsv.gz\n", + " --output-file \"$OUT\"/wikidata.dwd.all.kgtk.search.sorted.jl \n", + " --label-properties label \n", + " --alias-properties alias \n", + " --extra-alias-properties P1448,P1705,P1477,P1810,P742,P1449 \n", + " --description-properties description \n", + " --pagerank-properties Pundirected_pagerank \n", + " --languages {languages}\n", + " --mapping-file \"$OUT\"/wikidata_dwd_v3_mapping.json \n", + " --property-datatype-file \"$GRAPH\"/metadata.property.datatypes.tsv.gz\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "instant-bidder", + "metadata": {}, + "source": [ + "### Build KGTK edge file for Triple generation\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "rolled-poker", + "metadata": {}, + "outputs": [], + "source": [ + "if generate_triples:\n", + " !$kgtk cat \\\n", + " -i $OUT/wikidata.dwd.all.kgtk.search.sorted.tsv.gz \\\n", + " -i $OUT/derived.isa.tsv.gz \\\n", + " -i $OUT/derived.P279star.tsv.gz \\\n", + " -i $OUT/metadata.in_degree.tsv.gz \\\n", + " -i $OUT/metadata.out_degree.tsv.gz \\\n", + " -o $TEMP/wikidata.dwd.all.kgtk.triples.1.tsv.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aggressive-fleet", + "metadata": {}, + "outputs": [], + "source": [ + "if 
generate_triples:\n", + " !$kgtk add-id -i $TEMP/wikidata.dwd.all.kgtk.triples.1.tsv.gz \\\n", + " --id-style wikidata \\\n", + " -o $TEMP/wikidata.dwd.all.kgtk.triples.2.tsv.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "assumed-ready", + "metadata": {}, + "outputs": [], + "source": [ + "if generate_triples:\n", + " !$kgtk sort -i $TEMP/wikidata.dwd.all.kgtk.triples.2.tsv.gz \\\n", + " --columns node1 \\\n", + " --extra '--parallel 24 --buffer-size 30% --temporary-directory ' + temp_folder_path \\\n", + " -o $OUT/wikidata.dwd.all.kgtk.triples.sorted.tsv.gz" + ] + }, + { + "cell_type": "markdown", + "id": "wireless-headquarters", + "metadata": {}, + "source": [ + "Split the triples file to parallelize triple generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "single-gardening", + "metadata": {}, + "outputs": [], + "source": [ + "if generate_triples:\n", + " !mkdir -p $OUT/kgtk_triples_split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "original-charles", + "metadata": {}, + "outputs": [], + "source": [ + "if generate_triples:\n", + " !$kgtk split -i $OUT/wikidata.dwd.all.kgtk.triples.sorted.tsv.gz \\\n", + " --output-path $OUT/kgtk_triples_split \\\n", + " --gzipped-output --lines 10000000 \\\n", + " --file-prefix kgtk_triples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "auburn-elephant", + "metadata": {}, + "outputs": [], + "source": [ + "if generate_triples:\n", + " !curl https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv -o $TEMP/kgtk-properties.tsv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "compressed-sight", + "metadata": {}, + "outputs": [], + "source": [ + "if generate_triples:\n", + " kgtk(f\"\"\"filter -p \";{datatype_property};\" -i $TEMP/kgtk-properties.tsv -o $TEMP/kgtk-properties.datatype.tsv.gz\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": 
"threaded-confusion", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cat: illegal option -- i\n", + "usage: cat [-benstuv] [file ...]\n" + ] + } + ], + "source": [ + "if generate_triples:\n", + " !$kgtk cat -i $TEMP/kgtk-properties.datatype.tsv.gz $OUT/metadata.property.datatypes.tsv.gz -o $OUT/metadata.property.datatypes.augmented.tsv.gz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "disciplinary-violation", + "metadata": {}, + "outputs": [], + "source": [ + "if generate_triples:\n", + " ls $OUT/kgtk_triples_split/*.tsv.gz | parallel -j 18 'kgtk --debug generate-wikidata-triples -lp label -ap alias -dp description -pf $OUT/metadata.property.datatypes.augmented.tsv.gz --output-n-lines 100000 --generate-truthy --warning --use-id --log-path $TEMP/generate_triples_log.txt --error-action log -i {} -o {.}.ttl'\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "kgtk-env-ckg07", + "language": "python", + "name": "kgtk-env-ckg07" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From cab387b5b64cd17b853c04d237add0cb4f809e68 Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 25 May 2022 11:00:36 -0700 Subject: [PATCH 11/21] add unidecode --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 4b8130159..5fd6e56eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,3 +13,4 @@ plac==1.1.3 parsley>=1.3 odictliteral torchbiggraph +unidecode From a13edbb85470d4b1ae2b6aff0bf1a09d249cd833 Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 25 May 2022 11:08:34 -0700 Subject: [PATCH 12/21] only load the required files --- use-cases/Wikidata-Subsets.ipynb | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/use-cases/Wikidata-Subsets.ipynb b/use-cases/Wikidata-Subsets.ipynb index 76766bd17..65e083e0c 100644 --- a/use-cases/Wikidata-Subsets.ipynb +++ b/use-cases/Wikidata-Subsets.ipynb @@ -71,7 +71,7 @@ "\n", "project_name = \"wikidata-20220505-dwd-v4\"\n", "\n", - "files = 'claims,label_all,alias_all,description_all,item,qualifiers,datatypes,types,isa,p279star'\n", + "files = 'isa,p279star'\n", "\n", "# Classes to remove\n", "remove_classes = \"Q7318358,Q13442814\"\n", From 5b26ec3f00c70e7acd984f396859ca8d34920403 Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 25 May 2022 11:09:47 -0700 Subject: [PATCH 13/21] only load the required files --- use-cases/create-wikidata-dwd.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/use-cases/create-wikidata-dwd.ipynb b/use-cases/create-wikidata-dwd.ipynb index cb993539e..942517866 100644 --- a/use-cases/create-wikidata-dwd.ipynb +++ b/use-cases/create-wikidata-dwd.ipynb @@ -190,7 +190,7 @@ "\n", "subset_project_name = \"wikidata-20220519-dwd-v5\"\n", "\n", - "subset_files = 'claims,label_all,alias_all,description_all,item,qualifiers,datatypes,types,isa,p279star'\n", + "subset_files = 'isa,p279star'\n", "\n", "# Classes to remove\n", "remove_classes = \"Q7318358,Q13442814\"\n", From 2029459314f1df3f67dbee9a2da7485dd0012826 Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 1 Jun 2022 10:51:00 -0700 Subject: [PATCH 14/21] end to end run for create wikidata --- use-cases/create-wikidata-dwd.ipynb | 250 ++++++++++++++++++++-------- 1 file changed, 183 insertions(+), 67 deletions(-) diff --git a/use-cases/create-wikidata-dwd.ipynb b/use-cases/create-wikidata-dwd.ipynb index 942517866..9212dc8ad 100644 --- a/use-cases/create-wikidata-dwd.ipynb +++ b/use-cases/create-wikidata-dwd.ipynb @@ -2,22 +2,51 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 16, "id": "93f651b9-d27d-40bb-b531-cfabad740521", "metadata": {}, - "outputs": [], - 
"source": [ + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 61 µs, sys: 26 µs, total: 87 µs\n", + "Wall time: 108 µs\n" + ] + } + ], + "source": [ + "%%time\n", "import papermill as pm\n", "\n", "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", - "from kgtk.functions import kgtk, kypher" + "from kgtk.functions import kgtk, kypher\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "id": "82e1ae83-6541-4816-aad5-c0ae3b4cc5be", + "metadata": {}, + "source": [ + "**NOTE: downloaded.wikipedia.short_abstracts.tsv.gz**\n", + "\n", + "This file is available to be downloaded from `https://drive.google.com/drive/folders/1UkvFFLWbfjJtSw767IKYPfZiFsqUFu5n`\n", + "\n", + "The location on `ckg07` is `/data/amandeep/downloaded.wikipedia.short_abstracts.tsv.gz`\n", + "\n", + "This file is required for building the cache file for KGTK Browser." ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "217db6b6-3e26-47f0-ba62-cbce9270021b", - "metadata": {}, + "metadata": { + "tags": [ + "parameters" + ] + }, "outputs": [], "source": [ "input_path = \"/data/amandeep\"\n", @@ -27,16 +56,29 @@ "kgtk_path = \"/data/amandeep/Github/kgtk\"\n", "kgtk_notebooks_path = \"/data/amandeep/Github/kgtk-notebooks\"\n", "kgtk_browser_path = \"/data/amandeep/Github/kgtk-browser\"\n", - "kernel_name = \"kgtk-env-ckg07\"" + "kernel_name = \"kgtk-env-ckg07\"\n", + "wikipedia_short_abstracts_path = '/data/amandeep/downloaded.wikipedia.short_abstracts.tsv.gz'" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "b1c90cb1-baab-4387-bad5-62d703d84ec1", "metadata": {}, - "outputs": [], - "source": [ + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User home: /nas/home/amandeep\n", + "Current dir: /data/amandeep/Github/kgtk/use-cases\n", + "KGTK dir: /data/amandeep/Github/kgtk\n", + "Use-cases dir: /data/amandeep/Github/kgtk/use-cases\n" + ] + } + ], 
+ "source": [ + "%%time\n", "ck = ConfigureKGTK([], kgtk_path=kgtk_path)\n", "ck.configure_kgtk(input_graph_path=input_path,\n", " output_path=output_path,\n", @@ -54,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "f247bcc7-29a6-4756-9b4e-73a84693af20", "metadata": {}, "outputs": [], @@ -74,6 +116,7 @@ "metadata": {}, "outputs": [], "source": [ + "%%time\n", "pm.execute_notebook(\n", " \"import-wikidata.ipynb\",\n", " os.environ[\"TEMP\"] + \"/import-wikidata.out.ipynb\",\n", @@ -99,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "f3d42da9-ac53-4f77-85aa-fe889b87c8f0", "metadata": {}, "outputs": [], @@ -119,6 +162,7 @@ "metadata": {}, "outputs": [], "source": [ + "%%time\n", "pm.execute_notebook(\n", " \"Wikidata-Useful-Files.ipynb\",\n", " os.environ[\"TEMP\"] + \"/Wikidata-Useful-Files.out.ipynb\",\n", @@ -179,7 +223,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "6341b690-c846-4482-adcc-ba30dc69f267", "metadata": {}, "outputs": [], @@ -205,6 +249,7 @@ "metadata": {}, "outputs": [], "source": [ + "%%time\n", "pm.execute_notebook(\n", " \"Wikidata-Subsets.ipynb\",\n", " os.environ[\"TEMP\"] + \"/Wikidata-Subsets.out.ipynb\",\n", @@ -216,14 +261,15 @@ " kgtk_path = kgtk_path,\n", " files = subset_files,\n", " remove_classes = remove_classes,\n", - " languages = languages\n", + " languages = languages,\n", + " kernel_name = kernel_name\n", " )\n", ")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "c6a5a06c-cdc1-41c7-9c11-634b76fdcc60", "metadata": {}, "outputs": [], @@ -237,7 +283,7 @@ "id": "dce13d8b-65b3-4e39-bfa4-d69fe736b43a", "metadata": {}, "source": [ - "## Create and Load ES Index for KGTK-Search" + "## Create JSON file for KGTK-Search" ] }, { @@ -254,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "55f19a42-9659-4e76-aa53-e22338fd93dc", 
"metadata": {}, "outputs": [], @@ -270,7 +316,7 @@ "datatype_property = \"datatype\"\n", "\n", "es_url=\"http://ckg07:9200\"\n", - "es_index=\"wikidata-dwd-kgtk-search-03\"" + "es_index=\"wikidata-dwd-kgtk-search-04\"" ] }, { @@ -280,6 +326,7 @@ "metadata": {}, "outputs": [], "source": [ + "%%time\n", "pm.execute_notebook(\n", " \"Embeddings-Elasticsearch-Triples.ipynb\",\n", " os.environ[\"TEMP\"] + \"/Embeddings-Elasticsearch-Triples.out.ipynb\",\n", @@ -308,54 +355,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "4933fcc1-da1d-48a4-b474-05ecfd688919", "metadata": {}, "outputs": [], "source": [ - "!mkdir -p $search_output_path/kgtk-search/es_split/" + "!mkdir -p $search_output_path/$search_project_name/es_split/" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "id": "38c599e6-ea05-4a1a-8280-c2c27f2ae95f", "metadata": {}, "outputs": [], "source": [ - "!split $search_output_path/kgtk-search/wikidata.dwd.all.kgtk.search.sorted.jl \\\n", + "!split $search_output_path/$search_project_name/wikidata.dwd.all.kgtk.search.sorted.jl \\\n", " -l 1000000 \\\n", - " $search_output_path/kgtk-search/es_split/" - ] - }, - { - "cell_type": "markdown", - "id": "91d94f34-088a-499b-865d-7e5b93bb948e", - "metadata": {}, - "source": [ - "### Load the file into ES" - ] - }, - { - "cell_type": "markdown", - "id": "67666184-21df-4bf1-9f06-8f6fccc3f01f", - "metadata": {}, - "source": [ - "**Make sure [table-linker](https://github.com/usc-isi-i2/table-linker) is installed as well**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "77c6851f-1ddb-4250-8bb8-b42f1616b21c", - "metadata": {}, - "outputs": [], - "source": [ - "for f in $search_output_path/kgtk-search/es_split/* ;\n", - "do\n", - " tl load-elasticsearch-index --es-url $es_url --es-index $es_index --es-version 7 --kgtk-jl-path $f\n", - " sleep 60\n", - "done" + " $search_output_path/$search_project_name/es_split/" ] }, { @@ -376,7 +393,7 
@@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "bc82bc30-7cf9-4144-924c-8ac69bda2a01", "metadata": {}, "outputs": [], @@ -401,6 +418,7 @@ "metadata": {}, "outputs": [], "source": [ + "%%time\n", "pm.execute_notebook(\n", " f\"{kgtk_notebooks_path}/use-cases/properties-for-this-type.ipynb\",\n", " os.environ[\"TEMP\"] + \"/properties-for-this-type.out.ipynb\",\n", @@ -426,7 +444,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "50870bc1-a835-4ff7-8b13-52128ff8ac0c", "metadata": {}, "outputs": [], @@ -444,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "f9f164ce-e4ca-4646-8ce4-097c9c4c1e3c", "metadata": {}, "outputs": [], @@ -469,15 +487,16 @@ "metadata": {}, "outputs": [], "source": [ + "%%time\n", "pm.execute_notebook(\n", " f\"{kgtk_notebooks_path}/use-cases/class-visualization.ipynb\",\n", " os.environ[\"TEMP\"] + \"/class-visualization.out.ipynb\",\n", " kernel_name=kernel_name,\n", " parameters=dict(\n", - " input_path = p_input_path,\n", - " output_path = p_output_path,\n", - " project_name = p_project_name,\n", - " graph_cache_path = p_graph_cache_path,\n", + " input_path = c_input_path,\n", + " output_path = c_output_path,\n", + " project_name = c_project_name,\n", + " graph_cache_path = c_graph_cache_path,\n", " debug = debug,\n", " files_for_cache=files_for_cache \n", " )\n", @@ -486,12 +505,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "fffe2e16-f871-4e39-aa18-151f7a6ee9cd", "metadata": {}, "outputs": [], "source": [ - "!mv $c_output_path/$c_project_name/*tsv.gz $c_output_path" + "!mv $c_output_path/$c_project_name/class-visualization.node.tsv.gz $c_output_path\n", + "!mv $c_output_path/$c_project_name/class-visualization.edge.tsv.gz $c_output_path" ] }, { @@ -504,7 +524,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": 
"0a4a60f3-aad1-4e01-9382-c0ff82e13289", "metadata": {}, "outputs": [], @@ -514,6 +534,16 @@ "a_project_name = \"browser-claims-file\"" ] }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ba61fd58-d461-402e-b447-40f76a4f36cd", + "metadata": {}, + "outputs": [], + "source": [ + "!cp $wikipedia_short_abstracts_path $subset_output_path/$subset_project_name" + ] + }, { "cell_type": "code", "execution_count": null, @@ -521,6 +551,7 @@ "metadata": {}, "outputs": [], "source": [ + "%%time\n", "pm.execute_notebook(\n", " f\"{kgtk_browser_path}/Create-claims-augmented-for-browser.ipynb\",\n", " os.environ[\"TEMP\"] + \"/Create-claims-augmented-for-browser.out.ipynb\",\n", @@ -543,7 +574,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "1a11211f-47a5-4224-8dfc-3c9b6dfe09dc", "metadata": {}, "outputs": [], @@ -554,6 +585,18 @@ "q_project_name = \"kgtk-browser-files\"" ] }, + { + "cell_type": "code", + "execution_count": 15, + "id": "1417e7e7-c526-4f88-a7f3-21d8bc10650e", + "metadata": {}, + "outputs": [], + "source": [ + "!cp $subset_output_path/$subset_project_name/class-visualization.edge.tsv.gz $q_input_path\n", + "!cp $subset_output_path/$subset_project_name/class-visualization.node.tsv.gz $q_input_path\n", + "!cp $subset_output_path/$subset_project_name/metadata.pagerank.undirected.tsv.gz $q_input_path" + ] + }, { "cell_type": "code", "execution_count": null, @@ -561,6 +604,7 @@ "metadata": {}, "outputs": [], "source": [ + "%%time\n", "pm.execute_notebook(\n", " f\"{kgtk_browser_path}/KGTK-Query-Text-Search-Setup.ipynb\",\n", " os.environ[\"TEMP\"] + \"/KGTK-Query-Text-Search-Setup.ipynb\",\n", @@ -572,13 +616,85 @@ " )\n", ")" ] + }, + { + "cell_type": "markdown", + "id": "b411d124-08e3-40cf-8753-2bf490df7b0f", + "metadata": {}, + "source": [ + "## LOAD ES Index" + ] + }, + { + "cell_type": "markdown", + "id": "2024399e-6fcc-43a9-a3f8-a6f7886df756", + "metadata": {}, + "source": [ + "**Make sure 
[table-linker](https://github.com/usc-isi-i2/table-linker) is installed**" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "a08085a8-a93a-4419-939d-06373a228c30", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\"acknowledged\":true,\"shards_acknowledged\":true,\"index\":\"wikidata-dwd-kgtk-search-04\"}" + ] + } + ], + "source": [ + "!curl -H \"Content-Type: application/json\" \\\n", + "-XPUT $es_url/$es_index -d @$search_output_path/$search_project_name/wikidata_dwd_v3_mapping.json" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "2b8032ba-1ce6-480c-9d71-675edfb878f2", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ['search_output_path']=search_output_path\n", + "os.environ['search_project_name']=search_project_name\n", + "os.environ['es_url']=es_url\n", + "os.environ['es_index']=es_index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3522d6a0-d7d8-426f-aaa4-c9811951ea1e", + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "for f in $search_output_path/$search_project_name/es_split/* ;\n", + "do\n", + " echo $f \n", + " tl load-elasticsearch-index --es-url $es_url --es-index $es_index --es-version 7 --kgtk-jl-path $f > $TEMP/load_es.log\n", + " sleep 60\n", + "done" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c0619b92-6a75-4977-b2c8-a0ca4f3d6f82", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "kgtk-env", + "display_name": "kgtk-env-ckg07", "language": "python", - "name": "kgtk-env" + "name": "kgtk-env-ckg07" }, "language_info": { "codemirror_mode": { @@ -590,7 +706,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.9.12" } }, "nbformat": 4, From 023c27a06c80b9d2dd58dc1f1a0163137884389f Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 1 Jun 2022 11:10:27 -0700 
Subject: [PATCH 15/21] move notebooks to kgtk-notebooks --- examples/partition-wikidata.ipynb | 720 --- .../Embeddings-Elasticsearch-Triples.ipynb | 624 --- use-cases/Wikidata-Subsets.ipynb | 1033 ---- use-cases/Wikidata-Useful-Files.ipynb | 1322 ------ use-cases/create-wikidata-dwd.ipynb | 714 --- use-cases/import-wikidata.ipynb | 4209 ----------------- 6 files changed, 8622 deletions(-) delete mode 100644 examples/partition-wikidata.ipynb delete mode 100644 use-cases/Embeddings-Elasticsearch-Triples.ipynb delete mode 100644 use-cases/Wikidata-Subsets.ipynb delete mode 100644 use-cases/Wikidata-Useful-Files.ipynb delete mode 100644 use-cases/create-wikidata-dwd.ipynb delete mode 100644 use-cases/import-wikidata.ipynb diff --git a/examples/partition-wikidata.ipynb b/examples/partition-wikidata.ipynb deleted file mode 100644 index 54b392c48..000000000 --- a/examples/partition-wikidata.ipynb +++ /dev/null @@ -1,720 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Partitioning a subset of Wikidata\n", - "\n", - "This notebook illustrates how to partition a Wikidata KGTK edges file.\n", - "\n", - "Parameters are set up in the first cell so that we can run this notebook in batch mode. 
Example invocation command:\n", - "\n", - "```\n", - "papermill partition-wikidata.ipynb partition-wikidata.out.ipynb \\\n", - "-p wikidata_input_path /data3/rogers/kgtk/gd/kgtk_public_graphs/cache/wikidata-20201130/data/all.tsv.gz \\\n", - "-p wikidata_parts_path /data3/rogers/kgtk/gd/kgtk_public_graphs/cache/wikidata-20201130/parts \\\n", - "```\n", - "\n", - "Here is a sample of the records that might appear in the input KGTK file:\n", - "```\n", - "id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\tlang\n", - "Q1-P1036-418bc4-78f5a565-0\tQ1\tP1036\t\"113\"\tnormal\texternal-id\t\n", - "Q1-P1343-Q19190511-ab132b87-0 Q1 P1343 Q19190511 normal wikibase-item \n", - "Q1-P18-92a7b3-0dcac501-0 Q1 P18 \"Hubble ultra deep field.jpg\" normal commonsMedia \n", - "Q1-P2386-cedfb0-0fdbd641-0 Q1 P2386 +880000000000000000000000Q828224 normal quantity \n", - "Q1-P580-a2fccf-63cf4743-0 Q1 P580 ^-13798000000-00-00T00:00:00Z/3 normal time \n", - "Q1-P920-47c0f2-52689c4e-0 Q1 P920 \"LEM201201756\" normal string \n", - "Q1-P1343-Q19190511-ab132b87-0-P805-Q84065667-0 Q1-P1343-Q19190511-ab132b87-0 P805 Q84065667 wikibase-item \n", - "Q1-P1343-Q88672152-5080b9e2-0-P304-5724c3-0 Q1-P1343-Q88672152-5080b9e2-0 P304 \"13-36\" string \n", - "Q1-P2670-Q18343-030eb87e-0-P1107-ce87f8-0 Q1-P2670-Q18343-030eb87e-0 P1107 +0.70 quantity \n", - "Q1-P793-Q273508-1900d69c-0-P585-a2fccf-0 Q1-P793-Q273508-1900d69c-0 P585 ^-13798000000-00-00T00:00:00Z/3 time \n", - "P10-alias-en-282226-0 P10 alias 'gif'@en\n", - "P10-description-en P10 description 'relevant video. For images, use the property P18. 
For film trailers, qualify with \\\"object has role\\\" (P3831)=\\\"trailer\\\" (Q622550)'@en en\n", - "P10-label-en P10 label 'video'@en en\n", - "Q1-addl_wikipedia_sitelink-19e42a-0 Q1 addl_wikipedia_sitelink http://enwikiquote.org/wiki/Universe en\n", - "Q1-addl_wikipedia_sitelink-19e42a-0-language-0 Q1-addl_wikipedia_sitelink-19e42a-0 sitelink-language en en\n", - "Q1-addl_wikipedia_sitelink-19e42a-0-site-0 Q1-addl_wikipedia_sitelink-19e42a-0 sitelink-site enwikiquote en\n", - "Q1-addl_wikipedia_sitelink-19e42a-0-title-0 Q1-addl_wikipedia_sitelink-19e42a-0 sitelink-title \"Universe\" en\n", - "Q1-wikipedia_sitelink-5e459a-0 Q1 wikipedia_sitelink http://en.wikipedia.org/wiki/Universe en\n", - "Q1-wikipedia_sitelink-5e459a-0-badge-Q17437798 Q1-wikipedia_sitelink-5e459a-0 sitelink-badge Q17437798 en\n", - "Q1-wikipedia_sitelink-5e459a-0-language-0 Q1-wikipedia_sitelink-5e459a-0 sitelink-language en en\n", - "Q1-wikipedia_sitelink-5e459a-0-site-0 Q1-wikipedia_sitelink-5e459a-0 sitelink-site enwiki en\n", - "Q1-wikipedia_sitelink-5e459a-0-title-0 Q1-wikipedia_sitelink-5e459a-0 sitelink-title \"Universe\" en\n", - "```\n", - "Here are some contraints on the contents of the input file:\n", - "- The input file starts with a KGTK header record.\n", - " - In addition to the `id`, `node1`, `label`, and `node2` columns, the file may contain the `node2;wikidatatype` column.\n", - " - The `node2;wikidatatype` column is used to partition claims by Wikidata property datatype.\n", - " - If it does not exist, it will be created during the partitioning process and populated using `datatype` relationships.\n", - " - If it does exist, any empty values in the column will be populated using `datatype` relationships.\n", - "- The `id` column must contain a nonempty value.\n", - "- The first section of an `id` value must be the `node` value for the record.\n", - " - The qualifier extraction operations depend upon this constraint. 
\n", - "- In addition to the claims and qualifiers, the input file is expected to contain:\n", - " - English language labels for all property entities appearing in the file.\n", - "- The input file ought to contain the following:\n", - " - claims records,\n", - " - qualifier records,\n", - " - alias records in appropriate languages,\n", - " - description records in appropriate languages,\n", - " - label records in appropriate languages, and\n", - " - sitelink records in appropriate languages.\n", - " - `datatype` records that map Wikidata property entities to Wikidata property datatypes. These records are required if the input file does not contain the `node2;wikidatatype` column.\n", - "- Additionally, this script provides for the appearance of `type` records in the input file.\n", - " - `type` records that list all `entityId` values and identify them as properties or items. These records provides a correctness check on the operation of `kgtk import-wikidata`, and may be deprecated in the future.\n", - "- The input file is assumed to be unsorted. If it is already sorted on the (`id` `node1` `label` `node2`) columns , then set the `presorted` parameter to `True` to shorten the execution time of this script." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Parameters for invoking the notebook\n", - "\n", - "| Parameter | Description | Default |\n", - "| --------- | ----------- | ------- |\n", - "| `wikidata_input_path` | A folder containing the Wikidata KGTK edges to partition. | '/data4/rogers/elicit/cache/datasets/wikidata-20200803/data/all.tsv.gz' |\n", - "| `wikidata_parts_path` | A folder to receive the partitioned Wikidata files, such as `part.wikibase-item.tsv.gz` | '/data4/rogers/elicit/cache/datasets/wikidata-20200803/parts' |\n", - "| `temp_folder_path` | A folder that may be used for temporary files. | wikidata_parts_path + '/temp' |\n", - "| `gzip_command` | The compression command for sorting. 
| 'pigz' (Note: use version 2.4 or later)|\n", - "| `kgtk_command` | The kgtk commmand. | 'time kgtk' |\n", - "| `kgtk_options` | The kgtk commmand options. | '--debug --timing' |\n", - "| `kgtk_extension` | The file extension for generated KGTK files. Appending `.gz` implies gzip compression. | 'tsv.gz' |\n", - "| `presorted` | When True, the input file is already sorted on the (`id` `node1` `label` `node2`) columns. | 'False' |\n", - "| `sort_extras` | Extra parameters for the sort program. The default specifies a path for temporary files. Other useful parameters include '--buffer-size' and '--parallel'. | '--parallel 24 --buffer-size 30% --temporary-directory ' + temp_folder_path |\n", - "| `use_mgzip` | When True, use the mgzip program where appropriate for faster compression. | 'True' |\n", - "| `verbose` | When True, produce additional feedback messages. | 'True' |\n", - "\n", - "Note: if `pigz` version 2.4 (or later) is not available on your system, use `gzip`.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "# Parameters\n", - "wikidata_input_path = '/data3/rogers/kgtk/gd/kgtk_public_graphs/cache/wikidata-20201130/data/all.tsv.gz'\n", - "wikidata_parts_path = '/data3/rogers/kgtk/gd/kgtk_public_graphs/cache/wikidata-20201130/parts'\n", - "temp_folder_path = wikidata_parts_path + '/temp'\n", - "gzip_command = 'pigz'\n", - "kgtk_command = 'time kgtk'\n", - "kgtk_options = '--debug --timing'\n", - "kgtk_extension = 'tsv.gz'\n", - "presorted = 'False'\n", - "sort_extras = '--parallel 24 --buffer-size 30% --temporary-directory ' + temp_folder_path\n", - "use_mgzip = 'True'\n", - "verbose = 'True'\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "print('wikidata_input_path = %s' % repr(wikidata_input_path))\n", - "print('wikidata_parts_path = %s' % repr(wikidata_parts_path))\n", - 
"print('temp_folder_path = %s' % repr(temp_folder_path))\n", - "print('gzip_command = %s' % repr(gzip_command))\n", - "print('kgtk_command = %s' % repr(kgtk_command))\n", - "print('kgtk_options = %s' % repr(kgtk_options))\n", - "print('kgtk_extension = %s' % repr(kgtk_extension))\n", - "print('presorted = %s' % repr(presorted))\n", - "print('sort_extras = %s' % repr(sort_extras))\n", - "print('use_mgzip = %s' % repr(use_mgzip))\n", - "print('verbose = %s' % repr(verbose))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create working folders and empty them" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "!mkdir {wikidata_parts_path}\n", - "!mkdir {temp_folder_path}" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "!rm {wikidata_parts_path}/*.tsv {wikidata_parts_path}/*.tsv.gz\n", - "!rm {temp_folder_path}/*.tsv {temp_folder_path}/*.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sort the Input Data Unless Presorted\n", - "Sort the input data file by (id, node1, label, node2).\n", - "This may take a while." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if presorted.lower() == \"true\": \n", - " print('Using a presorted input file %s.' % repr(wikidata_input_path))\n", - " partition_input_file = wikidata_input_path \n", - "else: \n", - " print('Sorting the input file %s.' % repr(wikidata_input_path))\n", - " partition_input_file = wikidata_parts_path + '/all.' 
+ kgtk_extension \n", - " !{kgtk_command} {kgtk_options} sort2 --verbose={verbose} --gzip-command={gzip_command} \\\n", - " --input-file {wikidata_input_path} \\\n", - " --output-file {partition_input_file} \\\n", - " --columns id node1 label node2 \\\n", - " --extra \"{sort_extras}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Partition the Claims, Qualifiers, and Entity Data\n", - "Split out the entity data (alias, description, label, and sitelinks) and additional metadata (datatype, type). Separate the qualifiers from the claims.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} filter --verbose={verbose} --use-mgzip={use_mgzip} --first-match-only \\\n", - " --input-file {partition_input_file} \\\n", - " -p '; datatype ;' -o {wikidata_parts_path}/metadata.property.datatypes.{kgtk_extension} \\\n", - " -p '; alias ;' -o {wikidata_parts_path}/aliases.{kgtk_extension} \\\n", - " -p '; description ;' -o {wikidata_parts_path}/descriptions.{kgtk_extension} \\\n", - " -p '; label ;' -o {wikidata_parts_path}/labels.{kgtk_extension} \\\n", - " -p '; addl_wikipedia_sitelink,wikipedia_sitelink ;' \\\n", - " -o {wikidata_parts_path}/sitelinks.{kgtk_extension} \\\n", - " -p '; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;' \\\n", - " -o {wikidata_parts_path}/sitelinks.qualifiers.{kgtk_extension} \\\n", - " -p '; type ;' -o {wikidata_parts_path}/metadata.types.{kgtk_extension} \\\n", - " --reject-file {temp_folder_path}/claims-and-qualifiers.sorted-by-id.{kgtk_extension}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sort the claims and qualifiers on Node1\n", - "Sort the combined claims and qualifiers file by the node1 column.\n", - "This may take a while." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} sort2 --verbose={verbose} --gzip-command={gzip_command} \\\n", - " --input-file {temp_folder_path}/claims-and-qualifiers.sorted-by-id.{kgtk_extension} \\\n", - " --output-file {temp_folder_path}/claims-and-qualifiers.sorted-by-node1.{kgtk_extension}\\\n", - " --columns node1 \\\n", - " --extra \"{sort_extras}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Split the claims and qualifiers\n", - "If row A's node1 value matches some other row's id value, the then row A is a qualifier." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {temp_folder_path}/claims-and-qualifiers.sorted-by-node1.{kgtk_extension} \\\n", - " --filter-file {temp_folder_path}/claims-and-qualifiers.sorted-by-id.{kgtk_extension} \\\n", - " --output-file {temp_folder_path}/qualifiers.sorted-by-node1.{kgtk_extension}\\\n", - " --reject-file {temp_folder_path}/claims.sorted-by-node1.{kgtk_extension}\\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sort the claims by ID\n", - "Sort the split claims by id, node1, label, node2.\n", - "This may take a while." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} sort2 --verbose={verbose} --gzip-command={gzip_command} \\\n", - " --input-file {temp_folder_path}/claims.sorted-by-node1.{kgtk_extension} \\\n", - " --output-file {temp_folder_path}/claims.no-datatype.{kgtk_extension}\\\n", - " --columns id node1 label node2 \\\n", - " --extra \"{sort_extras}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Merge the Wikidata Property Datatypes into the claims\n", - "Merge the Wikidata Property Datatypes into the claims row as node2;wikidatatype. This column will be used to partition the claims by Wikidata Property Datatype ina later step. If the claims file already has a node2;wikidatatype column, lift only when that column has an empty value.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} lift --verbose={verbose} --use-mgzip={use_mgzip} \\\n", - " --input-file {temp_folder_path}/claims.no-datatype.{kgtk_extension} \\\n", - " --columns-to-lift label \\\n", - " --overwrite False \\\n", - " --label-file {wikidata_parts_path}/metadata.property.datatypes.{kgtk_extension}\\\n", - " --label-value datatype \\\n", - " --output-file {wikidata_parts_path}/claims.{kgtk_extension}\\\n", - " --columns-to-write 'node2;wikidatatype'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sort the qualifiers by ID\n", - "Sort the split qualifiers by id, node1, label, node2.\n", - "This may take a while." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} sort2 --verbose={verbose} --gzip-command={gzip_command} \\\n", - " --input-file {temp_folder_path}/qualifiers.sorted-by-node1.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.{kgtk_extension}\\\n", - " --columns id node1 label node2 \\\n", - " --extra \"{sort_extras}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Extract the English aliases, descriptions, labels, and sitelinks.\n", - "Aliases, descriptions, and labels are extracted by selecting rows where the `node2` value ends in the language suffix for English (`@en`) in a KGTK language-qualified string. This is an abbreviated pattern; a more general pattern would include the single quotes used to delimit a KGTK language-qualified string. If `kgtk import-wikidata` has executed properly, the abbreviated pattern should be sufficient.\n", - "\n", - "Sitelink rows do not have a language-specific marker in the `node2` value. We use the `lang` column to provide the language code for English ('en'). The `lang` column is an additional column created by `kgtk import-wikidata`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} filter --verbose={verbose} --use-mgzip={use_mgzip} --regex \\\n", - " --input-file {wikidata_parts_path}/aliases.{kgtk_extension} \\\n", - " -p ';; ^.*@en$' -o {wikidata_parts_path}/aliases.en.{kgtk_extension}" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} filter --verbose={verbose} --use-mgzip={use_mgzip} --regex \\\n", - " --input-file {wikidata_parts_path}/descriptions.{kgtk_extension} \\\n", - " -p ';; ^.*@en$' -o {wikidata_parts_path}/descriptions.en.{kgtk_extension}" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} filter --verbose={verbose} --use-mgzip={use_mgzip} --regex \\\n", - " --input-file {wikidata_parts_path}/labels.{kgtk_extension} \\\n", - " -p ';; ^.*@en$' -o {wikidata_parts_path}/labels.en.{kgtk_extension}" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} filter --verbose={verbose} --use-mgzip={use_mgzip} \\\n", - " --input-file {wikidata_parts_path}/sitelinks.qualifiers.{kgtk_extension} \\\n", - " -p '; sitelink-language ; en' -o {temp_folder_path}/sitelinks.language.en.{kgtk_extension}" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/sitelinks.{kgtk_extension} \\\n", - " --filter-on {temp_folder_path}/sitelinks.language.en.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/sitelinks.en.{kgtk_extension} \\\n", - " --input-keys id \\\n", - " --filter-keys node1" - ] - }, - { - "cell_type": "code", - 
"execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/sitelinks.qualifiers.{kgtk_extension} \\\n", - " --filter-on {temp_folder_path}/sitelinks.language.en.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/sitelinks.qualifiers.en.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys node1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Partition the claims by Wikidata Property Datatype\n", - "Wikidata has two names for each Wikidata property datatype: the name that appears in the JSON dump file, and the name that appears in the TTL dump file. `kgtk import-wikidata` currently imports rows from Wikikdata JSON dump files, and these are the names that appear below.\n", - "\n", - "The `part.other` file catches any records that have an unknown Wikidata property datatype. Additional Wikidata property datatypes may occur when processing from certain Wikidata extensions." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} filter --verbose={verbose} --use-mgzip={use_mgzip} --first-match-only \\\n", - " --input-file {wikidata_parts_path}/claims.{kgtk_extension} \\\n", - " --obj 'node2;wikidatatype' \\\n", - " -p ';; commonsMedia' -o {wikidata_parts_path}/claims.commonsMedia.{kgtk_extension} \\\n", - " -p ';; external-id' -o {wikidata_parts_path}/claims.external-id.{kgtk_extension} \\\n", - " -p ';; geo-shape' -o {wikidata_parts_path}/claims.geo-shape.{kgtk_extension} \\\n", - " -p ';; globe-coordinate' -o {wikidata_parts_path}/claims.globe-coordinate.{kgtk_extension} \\\n", - " -p ';; math' -o {wikidata_parts_path}/claims.math.{kgtk_extension} \\\n", - " -p ';; monolingualtext' -o {wikidata_parts_path}/claims.monolingualtext.{kgtk_extension} \\\n", - " -p ';; musical-notation' -o {wikidata_parts_path}/claims.musical-notation.{kgtk_extension} \\\n", - " -p ';; quantity' -o {wikidata_parts_path}/claims.quantity.{kgtk_extension} \\\n", - " -p ';; string' -o {wikidata_parts_path}/claims.string.{kgtk_extension} \\\n", - " -p ';; tabular-data' -o {wikidata_parts_path}/claims.tabular-data.{kgtk_extension} \\\n", - " -p ';; time' -o {wikidata_parts_path}/claims.time.{kgtk_extension} \\\n", - " -p ';; url' -o {wikidata_parts_path}/claims.url.{kgtk_extension} \\\n", - " -p ';; wikibase-form' -o {wikidata_parts_path}/claims.wikibase-form.{kgtk_extension} \\\n", - " -p ';; wikibase-item' -o {wikidata_parts_path}/claims.wikibase-item.{kgtk_extension} \\\n", - " -p ';; wikibase-lexeme' -o {wikidata_parts_path}/claims.wikibase-lexeme.{kgtk_extension} \\\n", - " -p ';; wikibase-property' -o {wikidata_parts_path}/claims.wikibase-property.{kgtk_extension} \\\n", - " -p ';; wikibase-sense' -o {wikidata_parts_path}/claims.wikibase-sense.{kgtk_extension} \\\n", - " --reject-file {wikidata_parts_path}/claims.other.{kgtk_extension}" - ] - }, - { - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "### Partition the qualifiers\n", - "Extract the qualifier records for each of the Wikidata property datatype partition files." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.commonsMedia.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.commonsMedia.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.external-id.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.external-id.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.geo-shape.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.geo-shape.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " 
--filter-on {wikidata_parts_path}/claims.globe-coordinate.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.globe-coordinate.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.math.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.math.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.monolingualtext.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.monolingualtext.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.musical-notation.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.musical-notation.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file 
{wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.quantity.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.quantity.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.string.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.string.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.tabular-data.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.tabular-data.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.time.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.time.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " 
--input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.url.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.url.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.wikibase-form.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.wikibase-form.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.wikibase-item.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.wikibase-item.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.wikibase-lexeme.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.wikibase-lexeme.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} 
--use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.wikibase-property.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.wikibase-property.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "!{kgtk_command} {kgtk_options} ifexists --verbose={verbose} --use-mgzip={use_mgzip} --presorted \\\n", - " --input-file {wikidata_parts_path}/qualifiers.{kgtk_extension} \\\n", - " --filter-on {wikidata_parts_path}/claims.wikibase-sense.{kgtk_extension} \\\n", - " --output-file {wikidata_parts_path}/qualifiers.wikibase-sense.{kgtk_extension} \\\n", - " --input-keys node1 \\\n", - " --filter-keys id" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/use-cases/Embeddings-Elasticsearch-Triples.ipynb b/use-cases/Embeddings-Elasticsearch-Triples.ipynb deleted file mode 100644 index f5fd0375c..000000000 --- a/use-cases/Embeddings-Elasticsearch-Triples.ipynb +++ /dev/null @@ -1,624 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "naval-vulnerability", - "metadata": {}, - "source": [ - "# Computes Graph and Text Embeddings, Elasticsearch Ready KGTK File and RDF Triples for Blazegraph\n", - "\n", - "This notebook computes the following:\n", - "\n", - "- `complEx` graph embeddings\n", - "- `transE` graph embeddings\n", - "- `BERT` text embeddings\n", - "- `elasticsearch` ready KGTK edge for [KGTK 
Search](https://kgtk.isi.edu/search/)\n", - "- `elasticsearch` ready KGTK edge file for Table Linker\n", - "- `RDF Triples` to be loaded into blazegraph\n", - "\n", - "Inputs:\n", - "\n", - "- `item_file`: the subset of the `claims_file` consistin of edges for property of data type `wikibase-item`\n", - "- `label_file`, `alias_file` and `description_file` containing labels, aliases and descriptions. It is assume that these files contain the labels, aliases and descriptions of all nodes appearing in the claims file. Users may provide these files for specific languages only.\n" - ] - }, - { - "cell_type": "markdown", - "id": "endless-exemption", - "metadata": {}, - "source": [ - "### Batch Invocation\n", - "Example batch command. The second argument is a notebook where the output will be stored. You can load it to see progress.\n", - "\n", - "```\n", - "papermill Embeddings-Elasticsearch-&-Triples.ipynb Embeddings-Elasticsearch-&-Triples.out.ipynb \\\n", - "-p claims_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/all.tsv.gz \\\n", - "-p label_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/part.label.en.tsv.gz \\\n", - "-p item_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/part.wikibase-item.tsv.gz \\\n", - "-p property_item_file = /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v4/part.property.wikibase-item.tsv.gz \\\n", - "-p output_path \\\n", - "-p output_folder useful_files_v4 \\\n", - "-p temp_folder temp.useful_files_v4 \\\n", - "-p delete_database no \n", - "-p languages es,ru,zh-cn\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "bb3e0847-155b-4251-821a-34e27d75c8a6", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import sys\n", - "\n", - "import pandas as pd\n", - " \n", - "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", - "from kgtk.functions import kgtk, 
kypher" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "departmental-connectivity", - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "# Parameters\n", - "\n", - "input_path = \"/data/amandeep/wikidata-20220505-dwd-v4\"\n", - "output_path = \"/data/amandeep/wikidata-20220505-dwd-v4\"\n", - "kgtk_path = \"/Users/amandeep/Github/kgtk\"\n", - "\n", - "graph_cache_path = None\n", - "\n", - "project_name = \"embeddings-elasticsearch-triples\"\n", - "\n", - "languages = 'en,ru,es,zh-cn,de,it,nl,pl,fr,pt,sv'\n", - "\n", - "files = 'label_all,alias_all,description_all'\n", - "compute_embeddings = False\n", - "generate_triples = False\n", - "generate_kgtk_search = True\n", - "datatype_property = \"datatype\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "f44d69f2-eca7-4ac6-8b63-1d7c42898f59", - "metadata": {}, - "outputs": [], - "source": [ - "files = files.split(',')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "1b52a584-551e-43ad-becb-9314e95932fa", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "User home: /nas/home/amandeep\n", - "Current dir: /data/amandeep/Github/kgtk/use-cases\n", - "KGTK dir: /Users/amandeep/Github/kgtk\n", - "Use-cases dir: /Users/amandeep/Github/kgtk/use-cases\n" - ] - } - ], - "source": [ - "ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n", - "ck.configure_kgtk(input_graph_path=input_path,\n", - " output_path=output_path,\n", - " project_name=project_name,\n", - " graph_cache_path=graph_cache_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "1ffdcaec-c0d7-468c-a207-186fad300d56", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "USE_CASES_DIR: /Users/amandeep/Github/kgtk/use-cases\n", - "TEMP: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples\n", - 
"EXAMPLES_DIR: /Users/amandeep/Github/kgtk/examples\n", - "KGTK_LABEL_FILE: /data/amandeep/wikidata-20220505-dwd-v4/labels.en.tsv.gz\n", - "GRAPH: /data/amandeep/wikidata-20220505-dwd-v4\n", - "kypher: kgtk query --graph-cache /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples/wikidata.sqlite3.db\n", - "KGTK_OPTION_DEBUG: false\n", - "kgtk: kgtk\n", - "KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples/wikidata.sqlite3.db\n", - "OUT: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples\n", - "STORE: /data/amandeep/wikidata-20220505-dwd-v4/embeddings-elasticsearch-triples/temp.embeddings-elasticsearch-triples/wikidata.sqlite3.db\n", - "label_all: /data/amandeep/wikidata-20220505-dwd-v4/labels.tsv.gz\n", - "alias_all: /data/amandeep/wikidata-20220505-dwd-v4/aliases.tsv.gz\n", - "description_all: /data/amandeep/wikidata-20220505-dwd-v4/descriptions.tsv.gz\n" - ] - } - ], - "source": [ - "ck.print_env_variables()" - ] - }, - { - "cell_type": "markdown", - "id": "excellent-passenger", - "metadata": {}, - "source": [ - "## Graph Embeddings" - ] - }, - { - "cell_type": "markdown", - "id": "integrated-slide", - "metadata": {}, - "source": [ - "### complEx" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "eleven-tribe", - "metadata": {}, - "outputs": [], - "source": [ - "if compute_embeddings:\n", - " complex_temp_folder = f\"{wikidata_root_folder}/temp.graph-embeddings.complex\"" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "found-traffic", - "metadata": {}, - "outputs": [], - "source": [ - "if compute_embeddings:\n", - " !mkdir -p {complex_temp_folder}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "gentle-wheat", - "metadata": {}, - "outputs": [], - "source": [ - "if compute_embeddings:\n", - " os.environ['TEMP_COMPLEX'] = complex_temp_folder" - ] - 
}, - { - "cell_type": "code", - "execution_count": null, - "id": "attached-texture", - "metadata": {}, - "outputs": [], - "source": [ - "if compute_embeddings:\n", - " !kgtk graph-embeddings --verbose -i \"$ITEMS\" \\\n", - " -o $OUT/wikidatadwd.complEx.graph-embeddings.txt \\\n", - " --retain_temporary_data True \\\n", - " --operator ComplEx \\\n", - " --workers 24 \\\n", - " --log $TEMP_COMPLEX/ge.complex.log \\\n", - " -T $TEMP_COMPLEX \\\n", - " -ot w2v \\\n", - " -e 600" - ] - }, - { - "cell_type": "markdown", - "id": "piano-thousand", - "metadata": {}, - "source": [ - "### transE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "serial-landscape", - "metadata": {}, - "outputs": [], - "source": [ - "if compute_embeddings:\n", - " transe_temp_folder = f\"{wikidata_root_folder}/temp.graph-embeddings.transe\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "naval-morgan", - "metadata": {}, - "outputs": [], - "source": [ - "if compute_embeddings:\n", - " !mkdir -p {transe_temp_folder}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "little-dietary", - "metadata": {}, - "outputs": [], - "source": [ - "if compute_embeddings:\n", - " os.environ['TEMP_TRANSE'] = transe_temp_folder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "annoying-council", - "metadata": {}, - "outputs": [], - "source": [ - "if compute_embeddings:\n", - " !$kgtk graph-embeddings --verbose -i \"$ITEMS\" \\\n", - " -o $OUT/wikidatadwd.transE.graph-embeddings.txt \\\n", - " --retain_temporary_data True \\\n", - " --operator TransE \\\n", - " --workers 24 \\\n", - " --log $TEMP_TRANSE/ge.transE.log \\\n", - " -T $TEMP_TRANSE \\\n", - " -ot w2v \\\n", - " -e 600" - ] - }, - { - "cell_type": "markdown", - "id": "speaking-torture", - "metadata": {}, - "source": [ - "### BERT Embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "patient-times", - "metadata": {}, - "outputs": [], 
- "source": [ - "if compute_embeddings:\n", - " !$kgtk text-embedding -i $ALL \\\n", - " --model roberta-large-nli-mean-tokens \\\n", - " --property-labels-file $LABELS_EN \\\n", - " --isa-properties P31 P279 P106 P39 P1382 P373 P452 \\\n", - " --save-embedding-sentence > $OUT/wikidatadwd-text-embeddings-all.tsv" - ] - }, - { - "cell_type": "markdown", - "id": "similar-bidder", - "metadata": {}, - "source": [ - "### Build KGTK edge file for KGTK Search" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "closed-yemen", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_kgtk_search:\n", - " kgtk(\"\"\"cat -i $GRAPH/all.tsv.gz \n", - " -i $GRAPH/derived.isastar.tsv.gz \n", - " -i $GRAPH/metadata.pagerank.undirected.tsv.gz\n", - " -i $GRAPH/metadata.pagerank.directed.tsv.gz\n", - " -o $TEMP/wikidata.dwd.all.kgtk.search.unsorted.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "trained-typing", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_kgtk_search:\n", - " kgtk(f\"\"\"sort -i $TEMP/wikidata.dwd.all.kgtk.search.unsorted.tsv.gz\n", - " --columns node1\n", - " --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'\n", - " -o $OUT/wikidata.dwd.all.kgtk.search.sorted.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "6f2f5864-5dae-47ec-b4de-0726654de82c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Processed 1000000 lines...
0Processed 2000000 lines...
1Processed 3000000 lines...
2Processed 4000000 lines...
3Processed 5000000 lines...
4Processed 6000000 lines...
......
5080Processed 5082000000 lines...
5081Processed 5083000000 lines...
5082Processed 5084000000 lines...
5083Processed 5085000000 lines...
5084Done!
\n", - "

5085 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " Processed 1000000 lines...\n", - "0 Processed 2000000 lines...\n", - "1 Processed 3000000 lines...\n", - "2 Processed 4000000 lines...\n", - "3 Processed 5000000 lines...\n", - "4 Processed 6000000 lines...\n", - "... ...\n", - "5080 Processed 5082000000 lines...\n", - "5081 Processed 5083000000 lines...\n", - "5082 Processed 5084000000 lines...\n", - "5083 Processed 5085000000 lines...\n", - "5084 Done!\n", - "\n", - "[5085 rows x 1 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "if generate_kgtk_search:\n", - " kgtk(f\"\"\"--debug build-kgtk-search-input --input-file \"$OUT\"/wikidata.dwd.all.kgtk.search.sorted.tsv.gz\n", - " --output-file \"$OUT\"/wikidata.dwd.all.kgtk.search.sorted.jl \n", - " --label-properties label \n", - " --alias-properties alias \n", - " --extra-alias-properties P1448,P1705,P1477,P1810,P742,P1449 \n", - " --description-properties description \n", - " --pagerank-properties Pundirected_pagerank \n", - " --languages {languages}\n", - " --mapping-file \"$OUT\"/wikidata_dwd_v3_mapping.json \n", - " --property-datatype-file \"$GRAPH\"/metadata.property.datatypes.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "markdown", - "id": "instant-bidder", - "metadata": {}, - "source": [ - "### Build KGTK edge file for Triple generation\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "rolled-poker", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_triples:\n", - " !$kgtk cat \\\n", - " -i $OUT/wikidata.dwd.all.kgtk.search.sorted.tsv.gz \\\n", - " -i $OUT/derived.isa.tsv.gz \\\n", - " -i $OUT/derived.P279star.tsv.gz \\\n", - " -i $OUT/metadata.in_degree.tsv.gz \\\n", - " -i $OUT/metadata.out_degree.tsv.gz \\\n", - " -o $TEMP/wikidata.dwd.all.kgtk.triples.1.tsv.gz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aggressive-fleet", - "metadata": {}, - "outputs": [], - "source": [ - "if 
generate_triples:\n", - " !$kgtk add-id -i $TEMP/wikidata.dwd.all.kgtk.triples.1.tsv.gz \\\n", - " --id-style wikidata \\\n", - " -o $TEMP/wikidata.dwd.all.kgtk.triples.2.tsv.gz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "assumed-ready", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_triples:\n", - " !$kgtk sort -i $TEMP/wikidata.dwd.all.kgtk.triples.2.tsv.gz \\\n", - " --columns node1 \\\n", - " --extra '--parallel 24 --buffer-size 30% --temporary-directory ' + temp_folder_path \\\n", - " -o $OUT/wikidata.dwd.all.kgtk.triples.sorted.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "id": "wireless-headquarters", - "metadata": {}, - "source": [ - "Split the triples file to parallelize triple generation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "single-gardening", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_triples:\n", - " !mkdir -p $OUT/kgtk_triples_split" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "original-charles", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_triples:\n", - " !$kgtk split -i $OUT/wikidata.dwd.all.kgtk.triples.sorted.tsv.gz \\\n", - " --output-path $OUT/kgtk_triples_split \\\n", - " --gzipped-output --lines 10000000 \\\n", - " --file-prefix kgtk_triples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "auburn-elephant", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_triples:\n", - " !curl https://raw.githubusercontent.com/usc-isi-i2/kgtk/dev/kgtk-properties/kgtk.properties.tsv -o $TEMP/kgtk-properties.tsv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "compressed-sight", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_triples:\n", - " kgtk(f\"\"\"filter -p \";{datatype_property};\" -i $TEMP/kgtk-properties.tsv -o $TEMP/kgtk-properties.datatype.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": 
"threaded-confusion", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cat: illegal option -- i\n", - "usage: cat [-benstuv] [file ...]\n" - ] - } - ], - "source": [ - "if generate_triples:\n", - " !$kgtk cat -i $TEMP/kgtk-properties.datatype.tsv.gz $OUT/metadata.property.datatypes.tsv.gz -o $OUT/metadata.property.datatypes.augmented.tsv.gz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "disciplinary-violation", - "metadata": {}, - "outputs": [], - "source": [ - "if generate_triples:\n", - " ls $OUT/kgtk_triples_split/*.tsv.gz | parallel -j 18 'kgtk --debug generate-wikidata-triples -lp label -ap alias -dp description -pf $OUT/metadata.property.datatypes.augmented.tsv.gz --output-n-lines 100000 --generate-truthy --warning --use-id --log-path $TEMP/generate_triples_log.txt --error-action log -i {} -o {.}.ttl'\n", - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "kgtk-env-ckg07", - "language": "python", - "name": "kgtk-env-ckg07" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/use-cases/Wikidata-Subsets.ipynb b/use-cases/Wikidata-Subsets.ipynb deleted file mode 100644 index 65e083e0c..000000000 --- a/use-cases/Wikidata-Subsets.ipynb +++ /dev/null @@ -1,1033 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generating Subsets of Wikidata" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Batch Invocation\n", - "Example batch command. The second argument is a notebook where the output will be stored. 
You can load it to see progress.\n", - "\n", - "UPDATE EXAMPLE INVOCATION\n", - "\n", - "\n", - "```\n", - "papermill Wikidata\\ Useful\\ Files.ipynb useful-files.out.ipynb \\\n", - "-p wiki_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v3/all.tsv.gz \\\n", - "-p label_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v3/part.label.en.tsv.gz \\\n", - "-p item_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v3/part.wikibase-item.tsv.gz \\\n", - "-p property_item_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v3/part.property.wikibase-item.tsv.gz \\\n", - "-p qual_file /Volumes/GoogleDrive/Shared\\ drives/KGTK-public-graphs/wikidata-20200803-v3/qual.tsv.gz \\\n", - "-p output_path \\\n", - "-p output_folder useful_files_v4 \\\n", - "-p temp_folder temp.useful_files_v4 \\\n", - "-p delete_database no \\\n", - "-p compute_pagerank no \\\n", - "-p languages es,ru,zh-cn \n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import io\n", - "import os\n", - "import subprocess\n", - "import sys\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import papermill as pm\n", - "\n", - "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", - "from kgtk.functions import kgtk, kypher" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "input_path = \"/data/amandeep/wikidata-20220505/import-wikidata/data\"\n", - "output_path = \"/data/amandeep\"\n", - "kgtk_path = \"/data/amandeep/Github/kgtk\"\n", - "\n", - "graph_cache_path = None\n", - "\n", - "project_name = \"wikidata-20220505-dwd-v4\"\n", - "\n", - "files = 'isa,p279star'\n", - "\n", - "# Classes to remove\n", - "remove_classes = \"Q7318358,Q13442814\"\n", - "\n", - "useful_files_notebook = 
\"Wikidata-Useful-Files.ipynb\"\n", - "notebooks_folder = f\"{kgtk_path}/use-cases\"\n", - "\n", - "languages = \"en,ru,es,zh-cn,de,it,nl,pl,fr,pt,sv\"\n", - "debug = False" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "files = files.split(',')\n", - "languages = languages.split(',')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "User home: /nas/home/amandeep\n", - "Current dir: /data/amandeep/Github/kgtk/use-cases\n", - "KGTK dir: /data/amandeep/Github/kgtk\n", - "Use-cases dir: /data/amandeep/Github/kgtk/use-cases\n" - ] - } - ], - "source": [ - "ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n", - "ck.configure_kgtk(input_graph_path=input_path,\n", - " output_path=output_path,\n", - " project_name=project_name,\n", - " graph_cache_path=graph_cache_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "kypher: kgtk query --graph-cache /data/amandeep/wikidata-20220505-dwd-v4/temp.wikidata-20220505-dwd-v4/wikidata.sqlite3.db\n", - "GRAPH: /data/amandeep/wikidata-20220505/import-wikidata/data\n", - "KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20220505-dwd-v4/temp.wikidata-20220505-dwd-v4/wikidata.sqlite3.db\n", - "STORE: /data/amandeep/wikidata-20220505-dwd-v4/temp.wikidata-20220505-dwd-v4/wikidata.sqlite3.db\n", - "OUT: /data/amandeep/wikidata-20220505-dwd-v4\n", - "kgtk: kgtk\n", - "TEMP: /data/amandeep/wikidata-20220505-dwd-v4/temp.wikidata-20220505-dwd-v4\n", - "USE_CASES_DIR: /data/amandeep/Github/kgtk/use-cases\n", - "KGTK_OPTION_DEBUG: false\n", - "EXAMPLES_DIR: /data/amandeep/Github/kgtk/examples\n", - "KGTK_LABEL_FILE: /data/amandeep/wikidata-20220505/import-wikidata/data/labels.en.tsv.gz\n", - "claims: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\n", 
- "label_all: /data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz\n", - "alias_all: /data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz\n", - "description_all: /data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz\n", - "item: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-item.tsv.gz\n", - "qualifiers: /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz\n", - "datatypes: /data/amandeep/wikidata-20220505/import-wikidata/data/metadata.property.datatypes.tsv.gz\n", - "types: /data/amandeep/wikidata-20220505/import-wikidata/data/metadata.types.tsv.gz\n", - "isa: /data/amandeep/wikidata-20220505/import-wikidata/data/derived.isa.tsv.gz\n", - "p279star: /data/amandeep/wikidata-20220505/import-wikidata/data/derived.P279star.tsv.gz\n" - ] - } - ], - "source": [ - "ck.print_env_variables()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "kgtk query --graph-cache /data/amandeep/wikidata-20220505-dwd-v4/temp.wikidata-20220505-dwd-v4/wikidata.sqlite3.db -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\" --as claims -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz\" --as label_all -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz\" --as alias_all -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz\" --as description_all -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-item.tsv.gz\" --as item -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz\" --as qualifiers -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.property.datatypes.tsv.gz\" --as datatypes -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.types.tsv.gz\" --as types -i 
\"/data/amandeep/wikidata-20220505/import-wikidata/data/derived.isa.tsv.gz\" --as isa -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/derived.P279star.tsv.gz\" --as p279star --limit 3\n", - "id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "P10-P1628-32b85d-7927ece6-0\tP10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\tnormal\turl\n", - "P10-P1628-acf60d-b8950832-0\tP10\tP1628\t\"https://schema.org/video\"\tnormal\turl\n", - "P10-P1629-Q34508-bcc39400-0\tP10\tP1629\tQ34508\tnormal\twikibase-item\n" - ] - } - ], - "source": [ - "ck.load_files_into_cache()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Preview the input files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It is always a good practice to peek a the files to make sure the column headings are what we expect" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "P10-P1628-32b85d-7927ece6-0\tP10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\tnormal\turl\n", - "P10-P1628-acf60d-b8950832-0\tP10\tP1628\t\"https://schema.org/video\"\tnormal\turl\n", - "P10-P1629-Q34508-bcc39400-0\tP10\tP1629\tQ34508\tnormal\twikibase-item\n", - "P10-P1630-53947a-fbe9093e-0\tP10\tP1630\t\"https://commons.wikimedia.org/wiki/File:$1\"\tnormal\tstring\n", - "P10-P1659-P1651-c4068028-0\tP10\tP1659\tP1651\tnormal\twikibase-property\n", - "P10-P1659-P18-5e4b9c4f-0\tP10\tP1659\tP18\tnormal\twikibase-property\n", - "P10-P1659-P4238-d21d1ac0-0\tP10\tP1659\tP4238\tnormal\twikibase-property\n", - "P10-P1659-P51-86aca4c5-0\tP10\tP1659\tP51\tnormal\twikibase-property\n", - "P10-P1855-Q15075950-7eff6d65-0\tP10\tP1855\tQ15075950\tnormal\twikibase-item\n", - "\n", - "gzip: stdout: Broken pipe\n" - ] - } - ], - "source": [ - "!zcat $claims | head" - ] - }, - { - "cell_type": "markdown", - 
"metadata": {}, - "source": [ - "## Creating a list of all the items we want to remove" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Compute the items to be removed" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Compose the kypher command to remove the classes" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "node1\tlabel\tnode2\n", - "P10\tisa\tQ18610173\n", - "P10\tisa\tQ19847637\n", - "P1000\tisa\tQ18608871\n", - "P10000\tisa\tQ19833377\n", - "P10000\tisa\tQ89560413\n", - "P10001\tisa\tQ107738007\n", - "\n", - "gzip: P10001\tisa\tQ64221137\n", - "P10002\tisa\tQ93433126\n", - "stdout: Broken pipe\n", - "P10003\tisa\tQ108914651\n" - ] - } - ], - "source": [ - "!zcat $isa | head | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run the command, the items to remove will be in file `{temp}/items.remove.tsv.gz`" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\"Q7318358\", \"Q13442814\"'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "classes = \", \".join(list(map(lambda x: '\"{}\"'.format(x), remove_classes.replace(\" \", \"\").split(\",\"))))\n", - "\n", - "classes" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "kypher(f\"\"\" -i isa -i p279star -o \"$TEMP\"/items.remove.tsv.gz \n", - " --match 'isa: (n1)-[:isa]->(c), p279star: (c)-[]->(class)' \n", - " --where 'class in [{classes}]' \n", - " --return 'distinct n1, \"p31_p279star\" as label, class as node2' \n", - " --order-by 'n1'\n", - " \"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Preview the file" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - 
"metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "node1\tlabel\tnode2\n", - "Q100000005\tp31_p279star\tQ13442814\n", - "\n", - "gzip: Q100000009\tp31_p279star\tQ13442814\n", - "stdout: Broken pipe\n", - "Q100000015\tp31_p279star\tQ13442814\n", - "Q100000022\tp31_p279star\tQ13442814\n", - "Q100000031\tp31_p279star\tQ13442814\n", - "Q100000044\tp31_p279star\tQ13442814\n", - "Q100000056\tp31_p279star\tQ13442814\n", - "Q100000066\tp31_p279star\tQ13442814\n", - "Q100000074\tp31_p279star\tQ13442814\n" - ] - } - ], - "source": [ - "!zcat < \"$TEMP\"/items.remove.tsv.gz | head | col" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "39873936 119621808 1314915334\n" - ] - } - ], - "source": [ - "!zcat < \"$TEMP\"/items.remove.tsv.gz | wc" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Collect all the classes of items we will remove, just as a sanity check" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "node2\n", - "Q13442814\n", - "Q7318358\n" - ] - } - ], - "source": [ - "!$kypher -i \"$TEMP\"/items.remove.tsv.gz \\\n", - "--match '()-[]->(n2)' \\\n", - "--return 'distinct n2' \\\n", - "--limit 10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create the reduced edges file" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Remove the items from the all.tsv and the label, alias and description files\n", - "We will be left with `reduced` files where the edges do not have the unwanted items. We have to remove them from the node1 and node2 positions, so we need to run the ifnotexists commands twice.\n", - "\n", - "Before we start preview the files to see the column headings and check whether they look sorted." 
- ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "node1\tlabel\tnode2\n", - "\n", - "gzip: Q100000005\tp31_p279star\tQ13442814\n", - "Q100000009\tp31_p279star\tQ13442814\n", - "stdout: Broken pipe\n", - "Q100000015\tp31_p279star\tQ13442814\n", - "Q100000022\tp31_p279star\tQ13442814\n", - "Q100000031\tp31_p279star\tQ13442814\n", - "Q100000044\tp31_p279star\tQ13442814\n", - "Q100000056\tp31_p279star\tQ13442814\n", - "Q100000066\tp31_p279star\tQ13442814\n", - "Q100000074\tp31_p279star\tQ13442814\n" - ] - } - ], - "source": [ - "!zcat \"$TEMP\"/items.remove.tsv.gz | head | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Remove from the full set of edges those edges that have a `node1` present in `items.remove.tsv`" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"ifnotexists \n", - " -i $claims \n", - " -o \"$TEMP\"/item.edges.reduced.tsv.gz\n", - " --filter-on \"$TEMP\"/items.remove.tsv.gz\n", - " --input-keys node1\n", - " --filter-keys node1\n", - " --presorted\n", - " \"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "From the remaining edges, remove those that have a `node2` present in `items.remove.tsv`" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(f\"\"\"sort \n", - " -i \"$TEMP\"/item.edges.reduced.tsv.gz \n", - " -o \"$TEMP\"/item.edges.reduced.sorted.tsv.gz\n", - " --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'\n", - " --columns node2 label node1 id\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"ifnotexists \n", - " -i $TEMP/item.edges.reduced.sorted.tsv.gz \n", - " -o $TEMP/item.edges.reduced.2.tsv.gz\n", - " --filter-on 
$TEMP/items.remove.tsv.gz\n", - " --input-keys node2\n", - " --filter-keys node1\n", - " --presorted\"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a file with the labels, for all the languages specified, **FIX THIS**" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"ifnotexists -i $label_all \n", - " -o \"$TEMP\"/label.all.edges.reduced.tsv.gz\n", - " --filter-on \"$TEMP\"/items.remove.tsv.gz\n", - " --input-keys node1\n", - " --filter-keys node1\n", - " --presorted\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(f\"\"\"sort \n", - " -i $TEMP/label.all.edges.reduced.tsv.gz \n", - " --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'\n", - " -o $OUT/labels.tsv.gz\"\"\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a file with the aliases, for all the languages specified" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"ifnotexists -i $alias_all\n", - " -o $TEMP/alias.all.edges.reduced.tsv.gz\n", - " --filter-on $TEMP/items.remove.tsv.gz\n", - " --input-keys node1\n", - " --filter-keys node1\n", - " --presorted\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(f\"\"\"sort \n", - " -i $TEMP/alias.all.edges.reduced.tsv.gz \n", - " --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'\n", - " -o $OUT/aliases.tsv.gz\"\"\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a file with the descriptions, for all the languages specified" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"ifnotexists \n", - " -i 
$description_all\n", - " -o $TEMP/description.all.edges.reduced.tsv.gz\n", - " --filter-on $TEMP/items.remove.tsv.gz\n", - " --input-keys node1\n", - " --filter-keys node1\n", - " --presorted\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(f\"\"\"sort \n", - " -i $TEMP/description.all.edges.reduced.tsv.gz \n", - " --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'\n", - " -o $OUT/descriptions.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Produce the output files for claims, labels, aliases and descriptions" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(f\"\"\"sort \n", - " -i $TEMP/item.edges.reduced.2.tsv.gz\n", - " --extra '--parallel 24 --buffer-size 30% --temporary-directory {os.environ['TEMP']}'\n", - " -o $OUT/claims.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create the reduced qualifiers file\n", - "We do this by finding all the ids of the reduced edges file, and then selecting out from `qual.tsv`\n", - "\n", - "We need to join by id, so we need to sort both files by id, node1, label, node2:\n", - "\n", - "- `$qualifiers` \n", - "- `$OUT/claims.tsv.gz` " - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "gzip: id node1 label node2 node2;wikidatatype\n", - "P10-P1630-53947a-fbe9093e-0-P407-Q20923490-0 P10-P1630-53947a-fbe9093e-0 P407 Q20923490 wikibase-item\n", - "stdout: Broken pipe\n", - "P10-P1855-Q15075950-7eff6d65-0-P10-54b214-0 P10-P1855-Q15075950-7eff6d65-0 P10 \"Smoorverliefd 12 september.webm\" commonsMedia\n", - "P10-P1855-Q15075950-7eff6d65-0-P3831-Q622550-0 P10-P1855-Q15075950-7eff6d65-0 P3831 Q622550 wikibase-item\n", - 
"P10-P1855-Q4504-a69d2c73-0-P10-bef003-0 P10-P1855-Q4504-a69d2c73-0 P10 \"Komodo dragons video.ogv\" commonsMedia\n", - "P10-P1855-Q69063653-c8cdb04c-0-P10-6fb08f-0 P10-P1855-Q69063653-c8cdb04c-0 P10 \"Couch Commander.webm\" commonsMedia\n", - "P10-P1855-Q825197-555592a4-0-P10-8a982d-0 P10-P1855-Q825197-555592a4-0 P10 \"Elephants Dream (2006).webm\" commonsMedia\n", - "P10-P2302-Q21502404-d012aef4-0-P1793-1f3adb-0 P10-P2302-Q21502404-d012aef4-0 P1793 \"(?i).+\\\\.(webm\\|ogv\\|ogg\\|gif\\|svg)\" string\n", - "P10-P2302-Q21502404-d012aef4-0-P2316-Q21502408-0 P10-P2302-Q21502404-d012aef4-0 P2316 Q21502408 wikibase-item\n", - "P10-P2302-Q21502404-d012aef4-0-P2916-cb0917-0 P10-P2302-Q21502404-d012aef4-0 P2916 'filename with extension: webm, ogg, ogv, or gif (case insensitive)'@en monolingualtext\n" - ] - } - ], - "source": [ - "if debug:\n", - " !zcat < \"$qualifiers\" | head | column -t -s $'\\t' " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Run `ifexists` to select out the quals for the edges in `{out}/wikidataos.qual.tsv.gz`. Note that we use `node1` in the qualifier file, matching to `id` in the `wikidataos.all.tsv` file." 
- ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"ifexists \n", - " -i $qualifiers \n", - " -o $OUT/qualifiers.tsv.gz\n", - " --filter-on $OUT/claims.tsv.gz\n", - " --input-keys node1\n", - " --filter-keys id\n", - " --presorted\"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Look at the final output for qualifiers" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "gzip: stdout: Broken pipe\n", - "id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "P10-P1630-53947a-fbe9093e-0-P407-Q20923490-0\tP10-P1630-53947a-fbe9093e-0\tP407\tQ20923490\twikibase-item\n", - "P10-P1855-Q15075950-7eff6d65-0-P10-54b214-0\tP10-P1855-Q15075950-7eff6d65-0\tP10\t\"Smoorverliefd 12 september.webm\"\tcommonsMedia\n", - "P10-P1855-Q15075950-7eff6d65-0-P3831-Q622550-0\tP10-P1855-Q15075950-7eff6d65-0\tP3831\tQ622550 wikibase-item\n", - "P10-P1855-Q4504-a69d2c73-0-P10-bef003-0 P10-P1855-Q4504-a69d2c73-0\tP10\t\"Komodo dragons video.ogv\"\tcommonsMedia\n", - "P10-P1855-Q69063653-c8cdb04c-0-P10-6fb08f-0\tP10-P1855-Q69063653-c8cdb04c-0\tP10\t\"Couch Commander.webm\"\tcommonsMedia\n", - "P10-P1855-Q825197-555592a4-0-P10-8a982d-0\tP10-P1855-Q825197-555592a4-0\tP10\t\"Elephants Dream (2006).webm\"\tcommonsMedia\n", - "P10-P2302-Q21502404-d012aef4-0-P1793-1f3adb-0\tP10-P2302-Q21502404-d012aef4-0\tP1793\t\"(?i).+\\\\.(webm\\|ogv\\|ogg\\|gif\\|svg)\"\tstring\n", - "P10-P2302-Q21502404-d012aef4-0-P2316-Q21502408-0\tP10-P2302-Q21502404-d012aef4-0\tP2316\tQ21502408\twikibase-item\n", - "P10-P2302-Q21502404-d012aef4-0-P2916-cb0917-0\tP10-P2302-Q21502404-d012aef4-0\tP2916\t'filename with extension: webm, ogg, ogv, or gif (case insensitive)'@en monolingualtext\n" - ] - } - ], - "source": [ - "if debug:\n", - " !zcat $OUT/qualifiers.tsv.gz | head | col" - ] - }, - { - "cell_type": 
"code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 34220224\n", - "-rw-r--r-- 1 amandeep isdstaff 2214529468 May 14 20:50 aliases.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 11594856613 May 15 04:31 claims.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 12667243225 May 15 03:52 descriptions.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 6007956701 May 14 20:09 labels.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2556913530 May 15 05:28 qualifiers.tsv.gz\n", - "drwxr-xr-x 2 amandeep isdstaff 288 May 15 04:31 temp.wikidata-20220505-dwd-v4\n" - ] - } - ], - "source": [ - "!ls -l \"$OUT\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copy the property datatypes and metadata types file over" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "!cp $datatypes $OUT/metadata.property.datatypes.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Filter out edges from metdata types file" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"ifexists \n", - " -i \"$types\" -o $OUT/metadata.types.tsv.gz\n", - " --filter-on $OUT/claims.tsv.gz\n", - " --input-keys node1\n", - " --filter-keys node1\n", - " --presorted\"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get the sitelinks as well, the sitelinks are not in claims.tsv.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"ifexists \n", - " -i \"$GRAPH/sitelinks.tsv.gz\" \n", - " -o \"$OUT/sitelinks.tsv.gz\"\n", - " --filter-on \"$OUT/claims.tsv.gz\"\n", - " --input-keys node1\n", - " --filter-keys node1\n", - " --presorted\"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Contruct the cat command to generate 
`all.tsv.gz`" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"cat -i \"$OUT/labels.tsv.gz\"\n", - " -i \"$OUT/aliases.tsv.gz\"\n", - " -i \"$OUT/descriptions.tsv.gz\"\n", - " -i \"$OUT/claims.tsv.gz\"\n", - " -i \"$OUT/qualifiers.tsv.gz\"\n", - " -i \"$OUT/metadata.property.datatypes.tsv.gz\"\n", - " -i \"$OUT/metadata.types.tsv.gz\"\n", - " -i \"$OUT/sitelinks.tsv.gz\"\n", - " -o \"$OUT/all.tsv.gz\"\n", - " \"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run the Partitions Notebook" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pm.execute_notebook(\n", - " os.environ[\"EXAMPLES_DIR\"] + \"/partition-wikidata.ipynb\",\n", - " os.environ[\"TEMP\"] + \"/partition-wikidata.out.ipynb\",\n", - " parameters=dict(\n", - " wikidata_input_path = os.environ[\"OUT\"] + \"/all.tsv.gz\",\n", - " wikidata_parts_path = os.environ[\"OUT\"] + \"/parts\",\n", - " temp_folder_path = os.environ[\"OUT\"] + \"/parts/temp\",\n", - " sort_extras = \"--buffer-size 30% --temporary-directory $OUT/parts/temp\",\n", - " verbose = False,\n", - " gzip_command = 'gzip'\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### copy the `claims.wikibase-item.tsv` file from the `parts` folder" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "!cp $OUT/parts/claims.wikibase-item.tsv.gz $OUT" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### RUN the Useful Files notebook" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pm.execute_notebook(\n", - " f'{os.environ[\"USE_CASES_DIR\"]}/{useful_files_notebook}',\n", - " os.environ[\"TEMP\"] + \"/Wikidata-Useful-Files-Out.ipynb\",\n", - " parameters=dict(\n", - " output_path = 
os.environ[\"OUT\"],\n", - " input_path = os.environ[\"OUT\"],\n", - " kgtk_path = kgtk_path,\n", - " compute_pagerank=True,\n", - " compute_degrees=True,\n", - " compute_isa_star=True,\n", - " compute_p31p279_star=True,\n", - " debug=False\n", - " )\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Sanity checks" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if debug:\n", - " !$kypher -i $OUT/claims.tsv.gz \\\n", - " --match '(n1:Q368441)-[l]->(n2)' \\\n", - " --limit 10 \\\n", - " | col" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "if debug:\n", - " !$kypher -i $OUT/claims.tsv.gz \\\n", - " --match '(n1:P131)-[l]->(n2)' \\\n", - " --limit 10 \\\n", - " | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary of results" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-rw-r--r-- 1 amandeep isdstaff 175M May 16 04:59 /data/amandeep/wikidata-20220505-dwd-v4/aliases.en.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2.0G May 16 01:22 /data/amandeep/wikidata-20220505-dwd-v4/aliases.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 39G May 15 22:08 /data/amandeep/wikidata-20220505-dwd-v4/all.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 184M May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.commonsMedia.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2.5G May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.external-id.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 779K May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.geo-shape.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 227M May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.globe-coordinate.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 689K May 16 07:06 
/data/amandeep/wikidata-20220505-dwd-v4/claims.math.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 295M May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.monolingualtext.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 28K May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.musical-notation.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 88 May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.other.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2.0G May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.quantity.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.1G May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.string.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 421K May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.tabular-data.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 301M May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.time.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 11G May 16 04:42 /data/amandeep/wikidata-20220505-dwd-v4/claims.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 123M May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.url.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 115K May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.wikibase-form.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 3.6G May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.wikibase-item.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 75K May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.wikibase-lexeme.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 643K May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.wikibase-property.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 965 May 16 07:06 /data/amandeep/wikidata-20220505-dwd-v4/claims.wikibase-sense.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 12G May 17 03:41 /data/amandeep/wikidata-20220505-dwd-v4/derived.isastar.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 189M May 16 13:27 /data/amandeep/wikidata-20220505-dwd-v4/derived.isa.tsv.gz\n", - "-rw-r--r-- 1 
amandeep isdstaff 699M May 16 13:05 /data/amandeep/wikidata-20220505-dwd-v4/derived.P279star.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 42M May 16 11:23 /data/amandeep/wikidata-20220505-dwd-v4/derived.P279.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 12G May 17 17:49 /data/amandeep/wikidata-20220505-dwd-v4/derived.P31P279star.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 717M May 16 11:22 /data/amandeep/wikidata-20220505-dwd-v4/derived.P31.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 395M May 16 06:01 /data/amandeep/wikidata-20220505-dwd-v4/descriptions.en.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 12G May 16 01:22 /data/amandeep/wikidata-20220505-dwd-v4/descriptions.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 640M May 16 06:30 /data/amandeep/wikidata-20220505-dwd-v4/labels.en.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 5.6G May 16 01:22 /data/amandeep/wikidata-20220505-dwd-v4/labels.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 79M May 17 21:15 /data/amandeep/wikidata-20220505-dwd-v4/metadata.in_degree.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 357M May 17 20:44 /data/amandeep/wikidata-20220505-dwd-v4/metadata.out_degree.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 559M May 17 18:52 /data/amandeep/wikidata-20220505-dwd-v4/metadata.pagerank.directed.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 770M May 17 19:59 /data/amandeep/wikidata-20220505-dwd-v4/metadata.pagerank.undirected.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 53K May 16 01:21 /data/amandeep/wikidata-20220505-dwd-v4/metadata.property.datatypes.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 271M May 16 01:22 /data/amandeep/wikidata-20220505-dwd-v4/metadata.types.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 16M May 16 07:12 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.commonsMedia.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 151M May 16 07:22 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.external-id.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 29K May 16 07:27 
/data/amandeep/wikidata-20220505-dwd-v4/qualifiers.geo-shape.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2.9M May 16 07:32 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.globe-coordinate.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 87K May 16 07:38 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.math.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 6.8M May 16 07:43 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.monolingualtext.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.8K May 16 07:48 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.musical-notation.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 900M May 16 07:58 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.quantity.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 530M May 16 08:07 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.string.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 201K May 16 08:12 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.tabular-data.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 16M May 16 08:18 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.time.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2.5G May 16 04:52 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 35M May 16 08:23 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.url.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.1K May 16 08:28 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.wikibase-form.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 695M May 16 08:44 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.wikibase-item.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 9.3K May 16 08:49 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.wikibase-lexeme.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 21K May 16 08:54 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.wikibase-property.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.6K May 16 08:58 /data/amandeep/wikidata-20220505-dwd-v4/qualifiers.wikibase-sense.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 88 May 16 06:33 
/data/amandeep/wikidata-20220505-dwd-v4/sitelinks.en.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 99 May 16 06:33 /data/amandeep/wikidata-20220505-dwd-v4/sitelinks.qualifiers.en.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 96 May 16 01:22 /data/amandeep/wikidata-20220505-dwd-v4/sitelinks.qualifiers.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.8G May 16 01:22 /data/amandeep/wikidata-20220505-dwd-v4/sitelinks.tsv.gz\n" - ] - } - ], - "source": [ - "!ls -lh $OUT/*.tsv.gz" - ] - } - ], - "metadata": { - "celltoolbar": "Tags", - "kernelspec": { - "display_name": "kgtk-env", - "language": "python", - "name": "kgtk-env" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/use-cases/Wikidata-Useful-Files.ipynb b/use-cases/Wikidata-Useful-Files.ipynb deleted file mode 100644 index 3e5339890..000000000 --- a/use-cases/Wikidata-Useful-Files.ipynb +++ /dev/null @@ -1,1322 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generating Useful Wikidata Files\n", - "\n", - "This notebook generates files that contain derived data that is useful in many applications. The input to the notebook is the full Wikidata or a subset of Wikidata. 
It also works for arbitrary KGs as long as they follow the representation requirements of Wikidata:\n", - "\n", - "- the *instance of* relation is represented using the `P31` property\n", - "- the *subclass of* relation is represented using the `P279` property\n", - "- all properties declare a datatype, and the data types must be one of the datatypes in Wikidata.\n", - "\n", - "Inputs:\n", - "\n", - "- `claims_file`: contains all statements, which consist of edges `node1/label/node2` where `label` is a property in Wikidata (e.g., sitelinks, labels, aliases and description are not in the claims file.\n", - "- `item_file`: the subset of the `claims_file` consistin of edges for property of data type `wikibase-item`\n", - "- `label_file`, `alias_file` and `description_file` containing labels, aliases and descriptions. It is assume that these files contain the labels, aliases and descriptions of all nodes appearing in the claims file. Users may provide these files for specific languages only.\n", - "\n", - "Outputs:\n", - "\n", - "- **Instance of (P31):** `derived.P31.tsv.gz` contains all the `instance of (P31)` edges present in the claims file.\n", - "- **Subclass of (P279):** `derived.P279.tsv.gz` contains all the `subclass of (P279)` edges present in the claims file.\n", - "- **Is A (isa):** `derived.isa.tsv.gz` contains edges `node`isa/node2` where either `node1/P31/node2` or `node1/P279/node2`\n", - "- **Closure of subclass of (P279star):** `derived.P279star.tsv.gz` contains edges `node1/P279star/node2` where `node2` is reachable from `node1` via zero or more hops using the `P279` property. Note that for example, `Q44/P279star/Q44`. 
An example when this file is useful is when you want to find all the instance of a class, including instances of subclasses of the given class.\n", - "- **In/out degrees:** `metadata.out_degree.tsv.gz` contains the out degree of every node, and `metadata.in_degree.tsv.gz` contains the in degree of every node.\n", - "- **Pagerank:** outputs page rank on the directed graph in `metadata.pagerank.directed.tsv.gz` and page rank of the directed graph in `metadata.pagerank.undirected.tsv.gz`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Batch Invocation\n", - "Example batch command. The second argument is a notebook where the output will be stored. You can load it to see progress.\n", - "\n", - "```\n", - "papermill Wikidata\\ Useful\\ Files.ipynb useful-files.out.ipynb \\\n", - "-p input_path /data/amandeep/wikidata-20211027-dwd-v3 \\\n", - "-p output_path /data/amandeep/wikidata-20211027-dwd-v3 \\\n", - "-p kgtk_path /Users/amandeep/github/kgtk \\\n", - "-p project_name useful-files \\\n", - "-p languages en,es \\\n", - "-p files claims,label_all,alias_all,description_all \\\n", - "-p compute_pagerank True \\\n", - "-p compute_degrees True \\\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import io\n", - "import os\n", - "import subprocess\n", - "import sys\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - " \n", - "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", - "from kgtk.functions import kgtk, kypher" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "# Parameters\n", - "\n", - "input_path = \"/data/amandeep/wikidata-20220505/import-wikidata/data\"\n", - "output_path = \"/data/amandeep/wikidata-20220505/import-wikidata/data\"\n", - "kgtk_path = \"/Users/amandeep/github/kgtk\"\n", - "\n", - "graph_cache_path = None\n", - "\n", - 
"project_name = \"useful-files\"\n", - "\n", - "languages = 'en,ru,es,zh-cn,de,it,nl,pl,fr,pt,sv'\n", - "\n", - "files = 'claims,label_all,alias_all,description_all'\n", - "\n", - "compute_pagerank = False\n", - "compute_degrees = False\n", - "debug = False\n", - "compute_isa_star = False\n", - "compute_p31p279_star = False\n", - "files_for_cache = None" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "files = files.split(',')\n", - "languages = languages.split(',')\n", - "if files_for_cache is None:\n", - " files_for_cache = files\n", - "else:\n", - " files_for_cache = files_for_cache.split(\",\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "User home: /nas/home/amandeep\n", - "Current dir: /data/amandeep/Github/kgtk/use-cases\n", - "KGTK dir: /Users/amandeep/github/kgtk\n", - "Use-cases dir: /Users/amandeep/github/kgtk/use-cases\n" - ] - } - ], - "source": [ - "ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n", - "ck.configure_kgtk(input_graph_path=input_path,\n", - " output_path=output_path,\n", - " project_name=project_name,\n", - " graph_cache_path=graph_cache_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20220505/import-wikidata/data/useful-files/temp.useful-files/wikidata.sqlite3.db\n", - "kypher: kgtk query --graph-cache /data/amandeep/wikidata-20220505/import-wikidata/data/useful-files/temp.useful-files/wikidata.sqlite3.db\n", - "EXAMPLES_DIR: /Users/amandeep/github/kgtk/examples\n", - "USE_CASES_DIR: /Users/amandeep/github/kgtk/use-cases\n", - "KGTK_LABEL_FILE: /data/amandeep/wikidata-20220505/import-wikidata/data/labels.en.tsv.gz\n", - "kgtk: kgtk\n", - "STORE: 
/data/amandeep/wikidata-20220505/import-wikidata/data/useful-files/temp.useful-files/wikidata.sqlite3.db\n", - "GRAPH: /data/amandeep/wikidata-20220505/import-wikidata/data\n", - "OUT: /data/amandeep/wikidata-20220505/import-wikidata/data/useful-files\n", - "TEMP: /data/amandeep/wikidata-20220505/import-wikidata/data/useful-files/temp.useful-files\n", - "KGTK_OPTION_DEBUG: false\n", - "claims: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\n", - "label_all: /data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz\n", - "alias_all: /data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz\n", - "description_all: /data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz\n" - ] - } - ], - "source": [ - "ck.print_env_variables()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "kgtk query --graph-cache /data/amandeep/wikidata-20220505/import-wikidata/data/useful-files/temp.useful-files/wikidata.sqlite3.db -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\" --as claims -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz\" --as label_all -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz\" --as alias_all -i \"/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz\" --as description_all --limit 3\n", - "id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "P10-P1628-32b85d-7927ece6-0\tP10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\tnormal\turl\n", - "P10-P1628-acf60d-b8950832-0\tP10\tP1628\t\"https://schema.org/video\"\tnormal\turl\n", - "P10-P1629-Q34508-bcc39400-0\tP10\tP1629\tQ34508\tnormal\twikibase-item\n" - ] - } - ], - "source": [ - "if graph_cache_path is None:\n", - " ck.load_files_into_cache(files=files_for_cache)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Preview 
the input files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It is always a good practice to peek a the files to make sure the column headings are what we expect" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/bin/bash: function: No such file or directory\n", - "\n" - ] - } - ], - "source": [ - "!$kypher -i claims --limit 10 | col " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Force creation of the index on the label column" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/bin/bash: function: No such file or directory\n" - ] - } - ], - "source": [ - "!$kypher -i claims -o - \\\n", - "--match '(i)-[:P31]->(c)' \\\n", - "--limit 5 \\\n", - "| column -t -s $'\\t' " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Force creation of the index on the node2 column" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/bin/bash: function: No such file or directory\n" - ] - } - ], - "source": [ - "!$kypher -i claims -o - \\\n", - "--match '(i)-[r]->(:Q5)' \\\n", - "--limit 5 \\\n", - "| column -t -s $'\\t' " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create the P31 and P279 files" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create the `P31` file" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "!$kypher -i claims -o $OUT/derived.P31.tsv.gz \\\n", - "--match '(n1)-[l:P31]->(n2)' \\\n", - "--return 'l, n1, l.label, n2' " - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", 
- "output_type": "stream", - "text": [ - "id\tnode1\tlabel\tnode2\n", - "P10-P31-Q18610173-85ef4d24-0\tP10\tP31\tQ18610173\n", - "P10-P31-Q19847637-e81ded71-0\tP10\tP31\tQ19847637\n", - "P1000-P31-Q18608871-093affb5-0\tP1000\tP31\tQ18608871\n", - "P10000-P31-Q19833377-f87f0d4c-0 P10000\tP31\tQ19833377\n", - "P10000-P31-Q89560413-f555a944-0 P10000\tP31\tQ89560413\n", - "P10001-P31-Q107738007-c7725ce7-0\tP10001\tP31\tQ107738007\n", - "P10001-P31-Q64221137-d154ffd9-0 P10001\tP31\tQ64221137\n", - "P10002-P31-Q93433126-dbd52b84-0 P10002\tP31\tQ93433126\n", - "P10003-P31-Q108914651-f3644858-0\tP10003\tP31\tQ108914651\n", - "\n", - "gzip: stdout: Broken pipe\n" - ] - } - ], - "source": [ - "!zcat < $OUT/derived.P31.tsv.gz | head | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create the P279 file" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "!$kypher -i claims -o $OUT/derived.P279.tsv.gz \\\n", - " --match '(n1)-[l:P279]->(n2)' \\\n", - " --return 'l, n1, l.label, n2' " - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id\tnode1\tlabel\tnode2\n", - "P2217-P279-Q986260-6ee7fda9-0\tP2217\tP279\tQ986260\n", - "Q100000030-P279-Q14748-30394205-0\tQ100000030\tP279\tQ14748\n", - "Q100000058-P279-Q1622444-bd182663-0\tQ100000058\tP279\tQ1622444\n", - "Q1000032-P279-Q1813494-0aa0f1dc-0\tQ1000032\tP279\tQ1813494\n", - "Q1000032-P279-Q83602-482a1943-0 Q1000032\tP279\tQ83602\n", - "Q1000039-P279-Q11555767-2dddfd86-0\tQ1000039\tP279\tQ11555767\n", - "Q100004761-P279-Q100095237-3971e1cd-0\tQ100004761\tP279\tQ100095237\n", - "Q100004761-P279-Q126793-77b1fce8-0\tQ100004761\tP279\tQ126793\n", - "Q100004761-P279-Q4544523-639fbe16-0\tQ100004761\tP279\tQ4544523\n", - "\n", - "gzip: stdout: Broken pipe\n" - ] - } - ], - "source": [ - "!zcat < $OUT/derived.P279.tsv.gz | head | col" - ] 
- }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create the file that contains all nodes reachable via P279 starting from a node2 in P31 or a node1 in P279" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First compute the roots" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "!$kypher -i $OUT/derived.P279.tsv.gz -o $TEMP/P279.n1.tsv.gz \\\n", - "--match '(n1)-[l]->()' \\\n", - "--return 'n1 as id' " - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "!$kypher -i $OUT/derived.P31.tsv.gz -o $TEMP/P31.n2.tsv.gz \\\n", - "--match '()-[l]->(n2)' \\\n", - "--return 'n2 as id' " - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"cat --mode NONE \n", - " -i $TEMP/P31.n2.tsv.gz\n", - " -i $TEMP/P279.n1.tsv.gz\n", - " -o $TEMP/P279.roots.1.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"sort --mode NONE \n", - " --column id \n", - " -i $TEMP/P279.roots.1.tsv.gz \n", - " -o $TEMP/P279.roots.2.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have lots of duplicates" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id\n", - "P2217\n", - "Q1\n", - "Q1\n", - "Q100000030\n", - "Q100000058\n", - "Q1000017\n", - "Q1000032\n", - "Q1000032\n", - "Q1000039\n", - "\n", - "gzip: stdout: Broken pipe\n" - ] - } - ], - "source": [ - "!zcat < $TEMP/P279.roots.2.tsv.gz | head" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"compact \n", - " -i $TEMP/P279.roots.2.tsv.gz \n", - " --mode NONE\n", - " --presorted \n", - " --columns 
id\n", - " -o $TEMP/P279.roots.tsv\"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can invoke the reachable-nodes command" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"reachable-nodes\n", - " --rootfile $TEMP/P279.roots.tsv\n", - " --selflink \n", - " -i $OUT/derived.P279.tsv.gz\n", - " --label P279star\n", - " -o $TEMP/P279.reachable.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "node1\tlabel\tnode2\n", - "P2217\treachable\tP2217\n", - "P2217\treachable\tQ986260\n", - "P2217\treachable\tQ3711325\n", - "P2217\treachable\tQ107715\n", - "P2217\treachable\tQ309314\n", - "P2217\treachable\tQ246672\n", - "\n", - "gzip: P2217\treachable\tQ7184903\n", - "stdout: Broken pipe\n", - "P2217\treachable\tQ488383\n", - "P2217\treachable\tQ35120\n" - ] - } - ], - "source": [ - "!zcat < $TEMP/P279.reachable.tsv.gz | head | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The reachable-nodes command produces edges labeled `reachable`, so we need one command to rename them." 
- ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "!$kypher -i $TEMP/P279.reachable.tsv.gz -o $TEMP/P279star.1.tsv.gz \\\n", - "--match '(n1)-[]->(n2)' \\\n", - "--return 'distinct n1, \"P279star\" as label, n2 as node2' \\\n", - "--order-by 'n1'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add ids" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "!$kgtk add-id --id-style wikidata -i $TEMP/P279.reachable.tsv.gz -o $OUT/derived.P279star.tsv.gz" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "node1\tlabel\tnode2\tid\n", - "P2217\tP279star\tP2217\tP2217-P279star-P2217\n", - "P2217\tP279star\tQ986260 P2217-P279star-Q986260\n", - "P2217\tP279star\tQ3711325\tP2217-P279star-Q3711325\n", - "P2217\tP279star\tQ107715 P2217-P279star-Q107715\n", - "P2217\tP279star\tQ309314 P2217-P279star-Q309314\n", - "P2217\tP279star\tQ246672 P2217-P279star-Q246672\n", - "P2217\tP279star\tQ7184903\tP2217-P279star-Q7184903\n", - "\n", - "gzip: P2217\tP279star\tQ488383 P2217-P279star-Q488383\n", - "stdout: Broken pipe\n", - "P2217\tP279star\tQ35120\tP2217-P279star-Q35120\n" - ] - } - ], - "source": [ - "!zcat < $OUT/derived.P279star.tsv.gz | head | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is how we would do the typical `?item P31/P279* ?class` in Kypher. \n", - "The example shows how to get all the counts of instances of subclasses of city (Q515)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "if debug:\n", - " !$kypher -i $OUT/derived.P31.tsv.gz -i $OUT/derived.P279star.tsv.gz -i label \\\n", - " --match 'P31: (n1)-[:P31]->(c), P279star: (c)-[]->(:Q515), label: (n1)-[:label]->(label), label: (c)-[:label]->(c_label)' \\\n", - " --return 'distinct c as class, count(c) as count, c_label as `class name`, n1 as instance, label as `label`' \\\n", - " --order-by 'count(c) desc, c, n1' \\\n", - " --limit 10 \\\n", - " | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Illustrate that it is indeed `P279*`" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "if debug:\n", - " !$kypher -i $OUT/derived.P31.tsv.gz -i $OUT/derived.P279star.tsv.gz -i label \\\n", - " --match 'P31: (n1)-[:P31]->(c), P279star: (c)-[]->(:Q63440326), label: (n1)-[:label]->(label), label: (c)-[:label]->(c_label)' \\\n", - " --return 'distinct c as class, c_label as `class name`, n1 as instance, label as `label`' \\\n", - " --order-by 'c, n1' \\\n", - " --limit 10 \\\n", - " | col " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a file to do generalized Is-A queries\n", - "The idea is that `(n1)-[:isa]->(n2)` when `(n1)-[:P31]->(n2)` or `(n1)-[:P279]->(n2)`\n", - "\n", - "We do this by concatenating the files and renaming the relation" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "kgtk(\"\"\"cat \n", - " -i $OUT/derived.P31.tsv.gz \n", - " -i $OUT/derived.P279.tsv.gz\n", - " -o $TEMP/isa.1.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "!$kypher -i $TEMP/isa.1.tsv.gz -o $OUT/derived.isa.tsv.gz \\\n", - "--match '(n1)-[]->(n2)' \\\n", - "--return 'n1, \"isa\" as label, n2' \\\n", - "--order-by 'n1'" - ] - }, 
- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Example of how to use the `isa` relation" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "if debug:\n", - " !$kypher -i $OUT/derived.isa.tsv.gz -i $OUT/derived.P279star.tsv.gz -i label -o - \\\n", - " --match 'isa: (n1)-[l:isa]->(c), P279star: (c)-[]->(:Q44), label: (n1)-[:label]->(label)' \\\n", - " --return 'distinct n1, l.label, \"Q44\" as node2, label as n1_label' \\\n", - " --limit 10 \\\n", - " | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create files with `isa/P279* and P31/P279*` \n", - "This file is useful to find all nodes that are below a q-node via P279 or isa.\n", - "\n", - "> These files are very large and take many hours to compute" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "os.environ['P279STAR'] = f\"{os.environ['OUT']}/derived.P279star.tsv.gz\"\n", - "os.environ['ISA'] = f\"{os.environ['OUT']}/derived.isa.tsv.gz\"" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_isa_star:\n", - " !$kypher -i \"$P279STAR\" --as P279star -i \"$ISA\" --as isa \\\n", - " --match '\\\n", - " isa: (n1)-[]->(n2), \\\n", - " P279star: (n2)-[]->(n3)' \\\n", - " --return 'distinct n1 as node1, \"isa_star\" as label, n3 as node2' \\\n", - " --order-by 'n1' \\\n", - " -o \"$TEMP\"/derived.isastar_1.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now add ids" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_isa_star:\n", - " kgtk(\"\"\"add-id \n", - " --id-style wikidata \n", - " -i \"$TEMP\"/derived.isastar_1.tsv.gz \n", - " -o \"$OUT\"/derived.isastar.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Also calculate 
the same file by for P31/P279*" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_p31p279_star:\n", - " !$kypher -i claims -i \"$P279STAR\" --as P279star \\\n", - " --match '\\\n", - " claims: (n1)-[:P31]->(n2), \\\n", - " P279star: (n2)-[]->(n3)' \\\n", - " --return 'distinct n1 as node1, \"P31P279star\" as label, n3 as node2' \\\n", - " --order-by 'n1' \\\n", - " -o \"$TEMP\"/derived.P31P279star.tsv.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add ids" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_p31p279_star:\n", - " kgtk(\"\"\"add-id \n", - " --id-style wikidata \n", - " -i \"$TEMP\"/derived.P31P279star.tsv.gz\n", - " -o \"$OUT\"/derived.P31P279star.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It is also very big" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "if debug:\n", - " !zcat < \"$OUT\"/derived.P31P279star.tsv.gz | wc" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compute pagerank" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now compute pagerank. These commands will exceed 16GB memory for graphs containing over 25 million nodes." 
- ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_pagerank:\n", - " kgtk(\"\"\"graph-statistics \n", - " -i \"$GRAPH/claims.wikibase-item.tsv.gz\" \n", - " -o $OUT/metadata.pagerank.directed.tsv.gz \n", - " --compute-pagerank True \n", - " --compute-hits False \n", - " --page-rank-property Pdirected_pagerank \n", - " --output-degrees False \n", - " --use-mgzip True \n", - " --mgzip-threads 12 \n", - " --output-pagerank True \n", - " --output-hits False \n", - " --output-statistics-only \n", - " --undirected False \n", - " --log-file $TEMP/metadata.pagerank.directed.summary.txt\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "graph loaded! It has 94903511 nodes and 670635690 edges\n", - "\n", - "*** Top relations:\n", - "P2860\t285098156\n", - "P31\t99559383\n", - "P1433\t37893478\n", - "P50\t22619544\n", - "P921\t21565587\n", - "P17\t14723889\n", - "P407\t14494498\n", - "P131\t11189895\n", - "P106\t9239520\n", - "P6259\t8076517\n", - "\n", - "*** Degrees:\n", - "in degree stats: mean=7.066500, std=0.456495, max=1\n", - "out degree stats: mean=7.066500, std=0.001451, max=1\n", - "total degree stats: mean=14.133001, std=0.456502, max=1\n", - "\n", - "*** PageRank\n", - "Max pageranks\n", - "7296\tQ4167836\t0.024407\n", - "30751\tQ13442814\t0.020599\n", - "2476\tQ1860\t0.007204\n", - "5853\tQ5\t0.006323\n", - "5852\tQ11266439\t0.005784\n" - ] - } - ], - "source": [ - "if compute_pagerank:\n", - " !cat $TEMP/metadata.pagerank.directed.summary.txt" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_pagerank:\n", - " kgtk(\"\"\"graph-statistics \n", - " -i \"$GRAPH/claims.wikibase-item.tsv.gz\" \n", - " -o $OUT/metadata.pagerank.undirected.tsv.gz \n", - " --compute-pagerank True \n", - " --compute-hits False 
\n", - " --page-rank-property Pundirected_pagerank\n", - " --use-mgzip True \n", - " --mgzip-threads 12\n", - " --output-degrees False \n", - " --output-pagerank True \n", - " --output-hits False \n", - " --output-statistics-only \n", - " --undirected True \n", - " --log-file $TEMP/metadata.pagerank.undirected.summary.txt\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "graph loaded! It has 94903511 nodes and 670635690 edges\n", - "\n", - "*** Top relations:\n", - "P2860\t285098156\n", - "P31\t99559383\n", - "P1433\t37893478\n", - "P50\t22619544\n", - "P921\t21565587\n", - "P17\t14723889\n", - "P407\t14494498\n", - "P131\t11189895\n", - "P106\t9239520\n", - "P6259\t8076517\n", - "\n", - "*** Degrees:\n", - "in degree stats: mean=0.000000, std=0.000000, max=1\n", - "out degree stats: mean=14.133001, std=0.456502, max=1\n", - "total degree stats: mean=14.133001, std=0.456502, max=1\n", - "\n", - "*** PageRank\n", - "Max pageranks\n", - "30751\tQ13442814\t0.029250\n", - "130053\tQ1264450\t0.013161\n", - "7296\tQ4167836\t0.012312\n", - "5853\tQ5\t0.008650\n", - "2476\tQ1860\t0.006818\n" - ] - } - ], - "source": [ - "if compute_pagerank:\n", - " !cat $TEMP/metadata.pagerank.undirected.summary.txt " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compute Degrees" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Kypher can compute the out degree by counting the node2s for each node1" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_degrees:\n", - " !$kypher -i claims -o $TEMP/metadata.out_degree.tsv.gz \\\n", - " --match '(n1)-[l]->()' \\\n", - " --order-by 'n1' \\\n", - " --return 'distinct n1 as node1, count(distinct l) as node2, \"Pout_degree\" as label' " - ] - }, - { - "cell_type": "code", - "execution_count": 42, - 
"metadata": {}, - "outputs": [], - "source": [ - "if compute_degrees:\n", - " kgtk(\"\"\"add-id --id-style wikidata \n", - " -i $TEMP/metadata.out_degree.tsv.gz \n", - " -o $OUT/metadata.out_degree.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "node1\tnode2\tlabel\tid\n", - "\n", - "gzip: P10\t20\tPout_degree\tP10-Pout_degree-f5ca38\n", - "stdout: Broken pipe\n", - "P1000\t10\tPout_degree\tP1000-Pout_degree-4a44dc\n", - "P10000\t25\tPout_degree\tP10000-Pout_degree-b7a568\n", - "P10001\t30\tPout_degree\tP10001-Pout_degree-624b60\n", - "P10002\t21\tPout_degree\tP10002-Pout_degree-6f4b66\n", - "P10003\t20\tPout_degree\tP10003-Pout_degree-f5ca38\n", - "P10004\t23\tPout_degree\tP10004-Pout_degree-535fa3\n", - "P10005\t21\tPout_degree\tP10005-Pout_degree-6f4b66\n", - "P10006\t25\tPout_degree\tP10006-Pout_degree-b7a568\n" - ] - } - ], - "source": [ - "if compute_degrees:\n", - " !zcat < $OUT/metadata.out_degree.tsv.gz | head | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To count the in-degree we only care when the node2 is a wikibase-item" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_degrees:\n", - " !$kypher -i claims -o $TEMP/metadata.in_degree.tsv.gz \\\n", - " --match '()-[l]->(n2 {`wikidatatype`:\"wikibase-item\"})' \\\n", - " --return 'distinct n2 as node1, count(distinct l) as node2, \"Pin_degree\" as label' \\\n", - " --order-by 'n2'" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_degrees:\n", - " kgtk(\"\"\"add-id --id-style wikidata \n", - " -i $TEMP/metadata.in_degree.tsv.gz\n", - " -o $OUT/metadata.in_degree.tsv.gz\"\"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "node1\tnode2\tlabel\tid\n", - "Q1\t104\tPin_degree\tQ1-Pin_degree-5ef6fd\n", - "Q100\t14133\tPin_degree\tQ100-Pin_degree-ef9f82\n", - "Q1000\t6812\tPin_degree\tQ1000-Pin_degree-7536db\n", - "\n", - "gzip: Q10000\t2\tPin_degree\tQ10000-Pin_degree-d4735e\n", - "stdout: Broken pipe\n", - "Q100000 125\tPin_degree\tQ100000-Pin_degree-0f8ef3\n", - "Q10000000\t1\tPin_degree\tQ10000000-Pin_degree-6b86b2\n", - "Q100000001\t5\tPin_degree\tQ100000001-Pin_degree-ef2d12\n", - "Q10000002\t1\tPin_degree\tQ10000002-Pin_degree-6b86b2\n", - "Q100000040\t4\tPin_degree\tQ100000040-Pin_degree-4b2277\n" - ] - } - ], - "source": [ - "if compute_degrees:\n", - " !zcat < $OUT/metadata.in_degree.tsv.gz | head | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Calculate the distribution so we can make a nice chart" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_degrees:\n", - " !$kypher -i $OUT/metadata.in_degree.tsv.gz -o $OUT/statistics.in_degree.distribution.tsv \\\n", - " --match '(n1)-[]->(n2)' \\\n", - " --return 'distinct n2 as Pin_degree, count(distinct n1) as count, \"count\" as label' \\\n", - " --order-by 'cast(n2, integer)' " - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Pin_degree\tcount\tlabel\n", - "1\t12410535\tcount\n", - "2\t5079189 count\n", - "3\t2954842 count\n", - "4\t1981895 count\n", - "5\t1530432 count\n", - "6\t1212475 count\n", - "7\t1008174 count\n", - "8\t827467\tcount\n", - "9\t706367\tcount\n" - ] - } - ], - "source": [ - "if compute_degrees:\n", - " !head $OUT/statistics.in_degree.distribution.tsv | col" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [], - "source": [ - "if compute_degrees:\n", - " !$kypher -i $OUT/metadata.out_degree.tsv.gz -o 
$OUT/statistics.out_degree.distribution.tsv \\\n", - " --match '(n1)-[]->(n2)' \\\n", - " --return 'distinct n2 as Pout_degree, count(distinct n1) as count, \"count\" as label' \\\n", - " --order-by 'cast(n2, integer)' " - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Pout_degree\tcount\tlabel\n", - "1\t6266209 count\n", - "2\t2622464 count\n", - "3\t2889122 count\n", - "4\t3106569 count\n", - "5\t4518981 count\n", - "6\t6059016 count\n", - "7\t5408942 count\n", - "8\t5105646 count\n", - "9\t6513341 count\n" - ] - } - ], - "source": [ - "if compute_degrees:\n", - " !head $OUT/statistics.out_degree.distribution.tsv | col" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Draw some charts" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "if debug:\n", - " data = pd.read_csv(\n", - " os.environ[\"OUT\"] + \"/statistics.in_degree.distribution.tsv\", sep=\"\\t\"\n", - " )\n", - "\n", - " alt.Chart(data).mark_circle(size=60).encode(\n", - " x=alt.X(\"in_degree\", scale=alt.Scale(type=\"log\")),\n", - " y=alt.Y(\"count\", scale=alt.Scale(type=\"log\"), title=\"count of nodes\"),\n", - " tooltip=[\"in_degree\", \"count\"],\n", - " ).interactive().properties(title=\"Distribution of In Degree\")" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "metadata": {}, - "outputs": [], - "source": [ - "if debug:\n", - " data = pd.read_csv(\n", - " os.environ[\"OUT\"] + \"/statistics.out_degree.distribution.tsv\", sep=\"\\t\"\n", - " )\n", - "\n", - " alt.Chart(data).mark_circle(size=60).encode(\n", - " x=alt.X(\"out_degree\", scale=alt.Scale(type=\"log\")),\n", - " y=alt.Y(\"count\", scale=alt.Scale(type=\"log\"), title=\"count of nodes\"),\n", - " tooltip=[\"out_degree\", \"count\"],\n", - " ).interactive().properties(title=\"Distribution of Out Degree\")" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary of results" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-rw-r--r-- 1 amandeep isdstaff 21G May 6 23:11 /data/amandeep/wikidata-20220409/useful-files/useful-files/derived.isastar.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 303M May 5 22:49 /data/amandeep/wikidata-20220409/useful-files/useful-files/derived.isa.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 710M May 5 21:57 /data/amandeep/wikidata-20220409/useful-files/useful-files/derived.P279star.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 42M May 5 20:10 /data/amandeep/wikidata-20220409/useful-files/useful-files/derived.P279.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 22G May 8 00:24 /data/amandeep/wikidata-20220409/useful-files/useful-files/derived.P31P279star.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.2G May 5 20:09 /data/amandeep/wikidata-20220409/useful-files/useful-files/derived.P31.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 235M May 8 08:39 /data/amandeep/wikidata-20220409/useful-files/useful-files/metadata.in_degree.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 610M May 8 06:52 /data/amandeep/wikidata-20220409/useful-files/useful-files/metadata.out_degree.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.1G May 8 02:12 /data/amandeep/wikidata-20220409/useful-files/useful-files/metadata.pagerank.directed.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.5G May 8 05:02 /data/amandeep/wikidata-20220409/useful-files/useful-files/metadata.pagerank.undirected.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 126K May 8 08:41 /data/amandeep/wikidata-20220409/useful-files/useful-files/statistics.in_degree.distribution.tsv\n", - "-rw-r--r-- 1 amandeep isdstaff 24K May 8 08:46 /data/amandeep/wikidata-20220409/useful-files/useful-files/statistics.out_degree.distribution.tsv\n", - "\n", - 
"/data/amandeep/wikidata-20220409/useful-files/useful-files/temp.useful-files:\n", - "total 245G\n", - "-rw-r--r-- 1 amandeep isdstaff 11G May 6 08:56 derived.isastar_1.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 11G May 7 10:28 derived.P31P279star.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.2G May 5 22:38 isa.1.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 107M May 8 08:32 metadata.in_degree.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 292M May 8 06:35 metadata.out_degree.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 527 May 8 03:10 metadata.pagerank.directed.summary.txt\n", - "-rw-r--r-- 1 amandeep isdstaff 529 May 8 05:50 metadata.pagerank.undirected.summary.txt\n", - "-rw-r--r-- 1 amandeep isdstaff 5.9M May 5 20:10 P279.n1.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 352M May 5 21:11 P279.reachable.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 68M May 5 20:16 P279.roots.1.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 7.7M May 5 20:17 P279.roots.2.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 26M May 5 20:22 P279.roots.tsv\n", - "-rw-r--r-- 1 amandeep isdstaff 351M May 5 21:31 P279star.1.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 58M May 5 20:16 P31.n2.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 222G May 8 08:43 wikidata.sqlite3.db\n" - ] - } - ], - "source": [ - "!ls -lh $OUT/*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Highest page rank" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [], - "source": [ - "if debug:\n", - " if compute_pagerank:\n", - " !$kypher -i $OUT/metadata.pagerank.undirected.tsv.gz -i label \\\n", - " --match 'pagerank: (n1)-[:Pundirected_pagerank]->(page_rank), label: (n1)-[:label]->(label)' \\\n", - " --return 'distinct n1, label as label, page_rank as `undirected page rank`' \\\n", - " --order-by 'page_rank desc' \\\n", - " --limit 10 " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - 
"metadata": { - "celltoolbar": "Tags", - "kernelspec": { - "display_name": "kgtk-env-ckg07", - "language": "python", - "name": "kgtk-env-ckg07" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/use-cases/create-wikidata-dwd.ipynb b/use-cases/create-wikidata-dwd.ipynb deleted file mode 100644 index 9212dc8ad..000000000 --- a/use-cases/create-wikidata-dwd.ipynb +++ /dev/null @@ -1,714 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 16, - "id": "93f651b9-d27d-40bb-b531-cfabad740521", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 61 µs, sys: 26 µs, total: 87 µs\n", - "Wall time: 108 µs\n" - ] - } - ], - "source": [ - "%%time\n", - "import papermill as pm\n", - "\n", - "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", - "from kgtk.functions import kgtk, kypher\n", - "import os" - ] - }, - { - "cell_type": "markdown", - "id": "82e1ae83-6541-4816-aad5-c0ae3b4cc5be", - "metadata": {}, - "source": [ - "**NOTE: downloaded.wikipedia.short_abstracts.tsv.gz**\n", - "\n", - "This file is available to be downloaded from `https://drive.google.com/drive/folders/1UkvFFLWbfjJtSw767IKYPfZiFsqUFu5n`\n", - "\n", - "The location on `ckg07` is `/data/amandeep/downloaded.wikipedia.short_abstracts.tsv.gz`\n", - "\n", - "This file is required for building the cache file for KGTK Browser." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "217db6b6-3e26-47f0-ba62-cbce9270021b", - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "input_path = \"/data/amandeep\"\n", - "output_path = \"/data/amandeep\"\n", - "project_name = \"create-wikidata-dwd\"\n", - "\n", - "kgtk_path = \"/data/amandeep/Github/kgtk\"\n", - "kgtk_notebooks_path = \"/data/amandeep/Github/kgtk-notebooks\"\n", - "kgtk_browser_path = \"/data/amandeep/Github/kgtk-browser\"\n", - "kernel_name = \"kgtk-env-ckg07\"\n", - "wikipedia_short_abstracts_path = '/data/amandeep/downloaded.wikipedia.short_abstracts.tsv.gz'" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "b1c90cb1-baab-4387-bad5-62d703d84ec1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "User home: /nas/home/amandeep\n", - "Current dir: /data/amandeep/Github/kgtk/use-cases\n", - "KGTK dir: /data/amandeep/Github/kgtk\n", - "Use-cases dir: /data/amandeep/Github/kgtk/use-cases\n" - ] - } - ], - "source": [ - "%%time\n", - "ck = ConfigureKGTK([], kgtk_path=kgtk_path)\n", - "ck.configure_kgtk(input_graph_path=input_path,\n", - " output_path=output_path,\n", - " project_name=project_name,\n", - " graph_cache_path=None)" - ] - }, - { - "cell_type": "markdown", - "id": "7f51bc57-e01e-4bf5-bcd6-d3c0b50b0c84", - "metadata": {}, - "source": [ - "## Run the Import Wikidata Notebook " - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "f247bcc7-29a6-4756-9b4e-73a84693af20", - "metadata": {}, - "outputs": [], - "source": [ - "# Parameters for Import Wikidata\n", - "json_file_path = \"/data/amandeep/wikidata-20220519\"\n", - "import_wikidata_path = \"/data/amandeep/wikidata-20220519\"\n", - "wikidata_project_name = \"import-wikidata\"\n", - "wikidata_json_file = \"latest-all.json.bz2\"\n", - "sort_command = 'sort'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"8a8a6cc0-1c9b-42f2-9f50-9366820dbf58", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "pm.execute_notebook(\n", - " \"import-wikidata.ipynb\",\n", - " os.environ[\"TEMP\"] + \"/import-wikidata.out.ipynb\",\n", - " kernel_name=kernel_name,\n", - " parameters=dict(\n", - " input_path = json_file_path,\n", - " output_path = import_wikidata_path,\n", - " project_name = wikidata_project_name,\n", - " wikidata_json_file = wikidata_json_file,\n", - " kgtk_path = kgtk_path,\n", - " sort_command = sort_command\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "7373d403-c73b-47ee-a7d2-13054f5e1516", - "metadata": {}, - "source": [ - "## Run the Useful Files Notebook to compute `isa` and `p279star` files only" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "f3d42da9-ac53-4f77-85aa-fe889b87c8f0", - "metadata": {}, - "outputs": [], - "source": [ - "# Parameters for First run on Useful Files\n", - "first_useful_files_input_path = f\"{import_wikidata_path}/{wikidata_project_name}\"\n", - "first_useful_files_output_path = import_wikidata_path\n", - "first_useful_files_project_name = \"useful-files\"\n", - "first_useful_files = 'claims,label_all,alias_all,description_all'\n", - "first_useful_files_for_cache = 'claims'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "31ad8208-eb1a-41f8-99b4-354f6dcbec07", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "pm.execute_notebook(\n", - " \"Wikidata-Useful-Files.ipynb\",\n", - " os.environ[\"TEMP\"] + \"/Wikidata-Useful-Files.out.ipynb\",\n", - " kernel_name=kernel_name,\n", - " parameters=dict(\n", - " input_path = first_useful_files_input_path,\n", - " output_path = first_useful_files_output_path,\n", - " project_name = first_useful_files_project_name,\n", - " kgtk_path = kgtk_path,\n", - " files = first_useful_files,\n", - " files_for_cache=first_useful_files_for_cache,\n", - " compute_pagerank=False,\n", - " compute_degrees=False,\n", - " 
debug=False,\n", - " compute_isa_star=False,\n", - " compute_p31p279_star=False\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b14d680-9631-401a-8903-7245a9cad80d", - "metadata": {}, - "outputs": [], - "source": [ - "!cp $import_wikidata_path/$first_useful_files_project_name/derived.isa.tsv.gz $import_wikidata_path/$wikidata_project_name\n", - "!cp $import_wikidata_path/$first_useful_files_project_name/derived.P279star.tsv.gz $import_wikidata_path/$wikidata_project_name" - ] - }, - { - "cell_type": "markdown", - "id": "54f7dab3-1528-4049-9e8a-e9b6aeff5aa1", - "metadata": {}, - "source": [ - "## Run Wikidata Subsets Notebook" - ] - }, - { - "cell_type": "markdown", - "id": "beb2d229-ea42-4309-aa8f-3c5dda6faeff", - "metadata": {}, - "source": [ - "The following notebook will run the following notebooks ,\n", - "\n", - "1. `../examples/partition-wikidata.ipynb`\n", - "The output will be at the path (example accoding to the parameters specified in the below cell) ,\n", - "`/data/amandeep/wikidata-20220519-dwd-v5/parts`\n", - "\n", - "2. `./Wikidata-Useful-Files.ipynb`\n", - "\n", - "The output will be at the path,\n", - "`/data/amandeep/wikidata-20220519-dwd-v5/useful-files`\n", - "\n", - "We will move the output files from the above 2 notebooks to the path `/data/amandeep/wikidata-20220519-dwd-v5` at the end of execution of the `Wikidata-Subsets.ipynb` notebook." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "6341b690-c846-4482-adcc-ba30dc69f267", - "metadata": {}, - "outputs": [], - "source": [ - "subset_input_path = f\"{import_wikidata_path}/{wikidata_project_name}\"\n", - "subset_output_path = \"/data/amandeep\"\n", - "\n", - "\n", - "subset_project_name = \"wikidata-20220519-dwd-v5\"\n", - "\n", - "subset_files = 'isa,p279star'\n", - "\n", - "# Classes to remove\n", - "remove_classes = \"Q7318358,Q13442814\"\n", - "\n", - "languages = \"en,ru,es,zh-cn,de,it,nl,pl,fr,pt,sv\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ce354040-80ce-42bc-a868-dafe324131d1", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "pm.execute_notebook(\n", - " \"Wikidata-Subsets.ipynb\",\n", - " os.environ[\"TEMP\"] + \"/Wikidata-Subsets.out.ipynb\",\n", - " kernel_name=kernel_name,\n", - " parameters=dict(\n", - " input_path = subset_input_path,\n", - " output_path = subset_output_path,\n", - " project_name = subset_project_name,\n", - " kgtk_path = kgtk_path,\n", - " files = subset_files,\n", - " remove_classes = remove_classes,\n", - " languages = languages,\n", - " kernel_name = kernel_name\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "c6a5a06c-cdc1-41c7-9c11-634b76fdcc60", - "metadata": {}, - "outputs": [], - "source": [ - "!mv $subset_output_path/$subset_project_name/parts/*tsv.gz $subset_output_path/$subset_project_name\n", - "!mv $subset_output_path/$subset_project_name/useful-files/*tsv.gz $subset_output_path/$subset_project_name" - ] - }, - { - "cell_type": "markdown", - "id": "dce13d8b-65b3-4e39-bfa4-d69fe736b43a", - "metadata": {}, - "source": [ - "## Create JSON file for KGTK-Search" - ] - }, - { - "cell_type": "markdown", - "id": "83c1d87f-8279-43a9-8041-dc027b5070f8", - "metadata": {}, - "source": [ - "The following notebook will create following file ,\n", - "\n", - 
"`/data/amandeep/wikidata-20220519-dwd-v5/kgtk-search/wikidata.dwd.all.kgtk.search.sorted.jl`\n", - "\n", - "We will then split the json lines file into 1M line partitions and load it into the ES index" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "55f19a42-9659-4e76-aa53-e22338fd93dc", - "metadata": {}, - "outputs": [], - "source": [ - "search_input_path = f\"{subset_output_path}/{subset_project_name}\"\n", - "search_output_path = f\"{subset_output_path}/{subset_project_name}\"\n", - "\n", - "search_project_name = \"kgtk-search\"\n", - "\n", - "compute_embeddings = False\n", - "generate_triples = False\n", - "generate_kgtk_search = True\n", - "datatype_property = \"datatype\"\n", - "\n", - "es_url=\"http://ckg07:9200\"\n", - "es_index=\"wikidata-dwd-kgtk-search-04\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cac5c471-8ba6-4769-8068-5fe86750cb93", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "pm.execute_notebook(\n", - " \"Embeddings-Elasticsearch-Triples.ipynb\",\n", - " os.environ[\"TEMP\"] + \"/Embeddings-Elasticsearch-Triples.out.ipynb\",\n", - " kernel_name=kernel_name,\n", - " parameters=dict(\n", - " input_path = search_input_path,\n", - " output_path = search_output_path,\n", - " project_name = search_project_name,\n", - " kgtk_path = kgtk_path,\n", - " compute_embeddings = compute_embeddings,\n", - " generate_triples = generate_triples,\n", - " generate_kgtk_search = generate_kgtk_search,\n", - " datatype_property = datatype_property,\n", - " languages = languages\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "b4fdd4fa-3c5f-430e-b489-3dbbe19c5bd8", - "metadata": {}, - "source": [ - "### Split the output json lines file to 1M lines partitions" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "4933fcc1-da1d-48a4-b474-05ecfd688919", - "metadata": {}, - "outputs": [], - "source": [ - "!mkdir -p $search_output_path/$search_project_name/es_split/" - 
] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "38c599e6-ea05-4a1a-8280-c2c27f2ae95f", - "metadata": {}, - "outputs": [], - "source": [ - "!split $search_output_path/$search_project_name/wikidata.dwd.all.kgtk.search.sorted.jl \\\n", - " -l 1000000 \\\n", - " $search_output_path/$search_project_name/es_split/" - ] - }, - { - "cell_type": "markdown", - "id": "5c0ba14f-8a36-4109-add2-f158cb54da39", - "metadata": {}, - "source": [ - "## Run Properties-for-this-type-notebook" - ] - }, - { - "cell_type": "markdown", - "id": "94266d0a-79a2-4566-9af1-51305fe67aaf", - "metadata": {}, - "source": [ - "This notebook is in the [kgtk-notebooks](https://github.com/usc-isi-i2/kgtk-notebooks) repo." - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "bc82bc30-7cf9-4144-924c-8ac69bda2a01", - "metadata": {}, - "outputs": [], - "source": [ - "p_input_path=f\"{subset_output_path}/{subset_project_name}\"\n", - "p_output_path = f\"{subset_output_path}/{subset_project_name}\"\n", - "\n", - "# we will re use graph cache from the useful-files notebook\n", - "# at this point it already has the `claims` file loaded into cache.\n", - "# we will only load the required files into the cache, save time\n", - "p_graph_cache_path = f\"{subset_output_path}/{subset_project_name}/useful-files/temp.useful-files/wikidata.sqlite3.db\"\n", - "files_for_cache=\"item,datatypes,p279,p279star\"\n", - "\n", - "p_project_name = \"p1963\"\n", - "debug = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ebc1423-c731-4953-a49f-57deca5887aa", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "pm.execute_notebook(\n", - " f\"{kgtk_notebooks_path}/use-cases/properties-for-this-type.ipynb\",\n", - " os.environ[\"TEMP\"] + \"/properties-for-this-type.out.ipynb\",\n", - " kernel_name=kernel_name,\n", - " parameters=dict(\n", - " input_path = p_input_path,\n", - " output_path = p_output_path,\n", - " project_name = p_project_name,\n", - " 
graph_cache_path = p_graph_cache_path,\n", - " debug = debug,\n", - " files_for_cache=files_for_cache \n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "0d6c97a0-f61c-45dc-b352-04b6c997547b", - "metadata": {}, - "source": [ - "**move the files out into the root folder**" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "50870bc1-a835-4ff7-8b13-52128ff8ac0c", - "metadata": {}, - "outputs": [], - "source": [ - "!mv $p_output_path/$p_project_name/*tsv.gz $p_output_path" - ] - }, - { - "cell_type": "markdown", - "id": "c209583e-501d-462d-abeb-d0eb54c4ceb6", - "metadata": {}, - "source": [ - "## Run class-visualization notebook" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "f9f164ce-e4ca-4646-8ce4-097c9c4c1e3c", - "metadata": {}, - "outputs": [], - "source": [ - "c_input_path = f\"{subset_output_path}/{subset_project_name}\"\n", - "c_output_path = f\"{subset_output_path}/{subset_project_name}\"\n", - "c_project_name = \"class-visualization\"\n", - "\n", - "# re use the graph cache, at this point the cache has the following files loaded\n", - "# claims,item,datatypes,p279,p279star\n", - "# we only need to load label\n", - "\n", - "c_graph_cache_path = p_graph_cache_path\n", - "files_for_cache = \"label\"\n", - "debug = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67597816-1e22-4ea6-b599-109b6720dff3", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "pm.execute_notebook(\n", - " f\"{kgtk_notebooks_path}/use-cases/class-visualization.ipynb\",\n", - " os.environ[\"TEMP\"] + \"/class-visualization.out.ipynb\",\n", - " kernel_name=kernel_name,\n", - " parameters=dict(\n", - " input_path = c_input_path,\n", - " output_path = c_output_path,\n", - " project_name = c_project_name,\n", - " graph_cache_path = c_graph_cache_path,\n", - " debug = debug,\n", - " files_for_cache=files_for_cache \n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, 
- "id": "fffe2e16-f871-4e39-aa18-151f7a6ee9cd", - "metadata": {}, - "outputs": [], - "source": [ - "!mv $c_output_path/$c_project_name/class-visualization.node.tsv.gz $c_output_path\n", - "!mv $c_output_path/$c_project_name/class-visualization.edge.tsv.gz $c_output_path" - ] - }, - { - "cell_type": "markdown", - "id": "13565322-b325-4b55-b215-7ada4cb7a435", - "metadata": {}, - "source": [ - "## Run Create-claims-augmented-for-browser notebook" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "0a4a60f3-aad1-4e01-9382-c0ff82e13289", - "metadata": {}, - "outputs": [], - "source": [ - "a_input_path = f\"{subset_output_path}/{subset_project_name}\"\n", - "a_output_path = f\"{subset_output_path}/{subset_project_name}\"\n", - "a_project_name = \"browser-claims-file\"" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "ba61fd58-d461-402e-b447-40f76a4f36cd", - "metadata": {}, - "outputs": [], - "source": [ - "!cp $wikipedia_short_abstracts_path $subset_output_path/$subset_project_name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "37270353-4e03-40ff-829a-36fc690b45e1", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "pm.execute_notebook(\n", - " f\"{kgtk_browser_path}/Create-claims-augmented-for-browser.ipynb\",\n", - " os.environ[\"TEMP\"] + \"/Create-claims-augmented-for-browser.out.ipynb\",\n", - " kernel_name=kernel_name,\n", - " parameters=dict(\n", - " input_path = a_input_path,\n", - " output_path = a_output_path,\n", - " project_name = a_project_name\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "4d7e10c3-2b7a-4602-a0d0-33a465bda60f", - "metadata": {}, - "source": [ - "## Run KGTK-Query-Text-Search-Setup Notebook" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "1a11211f-47a5-4224-8dfc-3c9b6dfe09dc", - "metadata": {}, - "outputs": [], - "source": [ - "q_input_path = f\"{a_output_path}/{a_project_name}\"\n", - "q_output_path = 
f\"{a_output_path}/{a_project_name}\"\n", - "\n", - "q_project_name = \"kgtk-browser-files\"" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "1417e7e7-c526-4f88-a7f3-21d8bc10650e", - "metadata": {}, - "outputs": [], - "source": [ - "!cp $subset_output_path/$subset_project_name/class-visualization.edge.tsv.gz $q_input_path\n", - "!cp $subset_output_path/$subset_project_name/class-visualization.node.tsv.gz $q_input_path\n", - "!cp $subset_output_path/$subset_project_name/metadata.pagerank.undirected.tsv.gz $q_input_path" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d20ab059-fa30-475e-a2be-96e72a29b80c", - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "pm.execute_notebook(\n", - " f\"{kgtk_browser_path}/KGTK-Query-Text-Search-Setup.ipynb\",\n", - " os.environ[\"TEMP\"] + \"/KGTK-Query-Text-Search-Setup.ipynb\",\n", - " kernel_name=kernel_name,\n", - " parameters=dict(\n", - " input_path = q_input_path,\n", - " output_path = q_output_path,\n", - " project_name = q_project_name\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "b411d124-08e3-40cf-8753-2bf490df7b0f", - "metadata": {}, - "source": [ - "## LOAD ES Index" - ] - }, - { - "cell_type": "markdown", - "id": "2024399e-6fcc-43a9-a3f8-a6f7886df756", - "metadata": {}, - "source": [ - "**Make sure [table-linker](https://github.com/usc-isi-i2/table-linker) is installed**" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "a08085a8-a93a-4419-939d-06373a228c30", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\"acknowledged\":true,\"shards_acknowledged\":true,\"index\":\"wikidata-dwd-kgtk-search-04\"}" - ] - } - ], - "source": [ - "!curl -H \"Content-Type: application/json\" \\\n", - "-XPUT $es_url/$es_index -d @$search_output_path/$search_project_name/wikidata_dwd_v3_mapping.json" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": 
"2b8032ba-1ce6-480c-9d71-675edfb878f2", - "metadata": {}, - "outputs": [], - "source": [ - "os.environ['search_output_path']=search_output_path\n", - "os.environ['search_project_name']=search_project_name\n", - "os.environ['es_url']=es_url\n", - "os.environ['es_index']=es_index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3522d6a0-d7d8-426f-aaa4-c9811951ea1e", - "metadata": {}, - "outputs": [], - "source": [ - "%%bash\n", - "for f in $search_output_path/$search_project_name/es_split/* ;\n", - "do\n", - " echo $f \n", - " tl load-elasticsearch-index --es-url $es_url --es-index $es_index --es-version 7 --kgtk-jl-path $f > $TEMP/load_es.log\n", - " sleep 60\n", - "done" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c0619b92-6a75-4977-b2c8-a0ca4f3d6f82", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "kgtk-env-ckg07", - "language": "python", - "name": "kgtk-env-ckg07" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/use-cases/import-wikidata.ipynb b/use-cases/import-wikidata.ipynb deleted file mode 100644 index 5156c6c31..000000000 --- a/use-cases/import-wikidata.ipynb +++ /dev/null @@ -1,4209 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "0929e7bb-51f8-4ab2-ad69-07130a2e368e", - "metadata": {}, - "source": [ - "# Import Wikidata" - ] - }, - { - "cell_type": "markdown", - "id": "6303f2a2-babb-4a1b-9ab1-cc08bf4771ee", - "metadata": {}, - "source": [ - "This notebook assumes the file `latest-all.json.bz2` is already [downloaded](https://dumps.wikimedia.org/wikidatawiki/entities/) and stored in the `input_path` in the cell marked as #Parameters.\n", - "\n", - "You can 
download the `gz` version as well, please update the variable `wikidata_json_file` with correct file name." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "c5f9d560-8293-4dec-9667-f7e08c6ccf52", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", - "from kgtk.functions import kgtk, kypher" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "1a6cc50d-2a13-4eca-95be-486767de63ec", - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "# Parameters\n", - "\n", - "# Folder on local machine where to create the output and temporary folders\n", - "input_path = \"/data/amandeep/wikidata-20220505\"\n", - "output_path = \"/data/amandeep/wikidata-20220505\"\n", - "project_name = \"import-wikidata\"\n", - "\n", - "kgtk_path = \"/data/amandeep/Github/kgtk\"\n", - "wikidata_json_file = \"latest-all.json.bz2\"\n", - "# sort_command = 'gsort'\n", - "sort_command = 'sort'" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d56ac16c-ba43-4810-8760-2a0755bfbd5f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "User home: /nas/home/amandeep\n", - "Current dir: /data/amandeep/Github/kgtk/use-cases\n", - "KGTK dir: /data/amandeep/Github/kgtk\n", - "Use-cases dir: /data/amandeep/Github/kgtk/use-cases\n" - ] - } - ], - "source": [ - "files = []\n", - "\n", - "ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n", - "ck.configure_kgtk(input_graph_path=input_path,\n", - " output_path=output_path,\n", - " project_name=project_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "046a0b40-c0c1-4e9f-9b36-afcfac05edfe", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "OUT: /data/amandeep/wikidata-20220505/import-wikidata\n", - "KGTK_GRAPH_CACHE: 
/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db\n", - "kypher: kgtk query --graph-cache /data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db\n", - "EXAMPLES_DIR: /data/amandeep/Github/kgtk/examples\n", - "KGTK_OPTION_DEBUG: false\n", - "USE_CASES_DIR: /data/amandeep/Github/kgtk/use-cases\n", - "STORE: /data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db\n", - "kgtk: kgtk\n", - "GRAPH: /data/amandeep/wikidata-20220505\n", - "TEMP: /data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata\n", - "KGTK_LABEL_FILE: /data/amandeep/wikidata-20220505/labels.en.tsv.gz\n" - ] - } - ], - "source": [ - "ck.print_env_variables()" - ] - }, - { - "cell_type": "markdown", - "id": "4c9ec80b-48fe-48cc-8984-6cf94e69c2d0", - "metadata": {}, - "source": [ - "## Define some ENV Variables, users can simply run this step, no changes required" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "48fb958a-ff91-4360-b73f-3a136797f056", - "metadata": {}, - "outputs": [], - "source": [ - "os.environ['WIKIDATA_ALL_JSON'] = f\"{os.environ['GRAPH']}/{wikidata_json_file}\"\n", - "\n", - "# Work file extensions\n", - "os.environ['UNSORTED_KGTK'] = \"unsorted.tsv.gz\"\n", - "os.environ['SORTED_KGTK'] = \"tsv.gz\"\n", - "\n", - "# Use mgzip in some cases?\n", - "os.environ['USE_MGZIP'] = \"TRUE\"\n", - "\n", - "# Select on of the following gzip implementations:\n", - "# GZIP_CMD=bzip\n", - "os.environ['GZIP_CMD'] = \"pigz\"\n", - "\n", - "\n", - "# Some common flags:\n", - "#KGTK_FLAGS=\"--debug --timing --progress --progress-tty `tty`\"\n", - "os.environ['KGTK_FLAGS'] = \"--debug --timing\"\n", - "os.environ['VERBOSE'] = \"--verbose\"\n", - "os.environ['SORT_EXTRAS'] = f\"--parallel 6 --buffer-size 50% -T {os.environ['TEMP']}\"\n", - "\n", - "# The Wikidata datatypes:\n", - "WIKIDATATYPES = [ \n", - " \"commonsMedia\",\n", - " \"external-id\",\n", - " 
\"geo-shape\",\n", - " \"globe-coordinate\",\n", - " \"math\",\n", - " \"monolingualtext\",\n", - " \"musical-notation\",\n", - " \"quantity\",\n", - " \"string\",\n", - " \"tabular-data\",\n", - " \"time\",\n", - " \"url\",\n", - " \"wikibase-form\",\n", - " \"wikibase-item\",\n", - " \"wikibase-lexeme\",\n", - " \"wikibase-property\",\n", - " \"wikibase-sense\",\n", - " \"other\"\n", - " ]\n", - "\n", - "# The wikidata import split files to be sorted:\n", - "WIKIDATA_IMPORT_SPLIT_FILES = [ \"claims\",\n", - "\t\"claims.badvalue\",\n", - "\t\"claims.novalue\",\n", - "\t\"claims.somevalue\",\n", - "\t\"qualifiers\",\n", - "\t\"qualifiers.badvalue\",\n", - "\t\"qualifiers.badvalueClaims\",\n", - "\t\"qualifiers.novalue\",\n", - "\t\"qualifiers.novalueClaims\",\n", - "\t\"qualifiers.somevalue\",\n", - "\t\"qualifiers.somevalueClaims\",\n", - "\t\"aliases\",\n", - "\t\"aliases.en\",\n", - "\t\"descriptions\",\n", - "\t\"descriptions.en\",\n", - "\t\"labels\",\n", - "\t\"labels.en\",\n", - "\t\"sitelinks\",\n", - "\t\"sitelinks.en\",\n", - "\t\"sitelinks.en.qualifiers\",\n", - "\t\"sitelinks.qualifiers\",\n", - "\t\"metadata.node\",\n", - "\t\"metadata.property.datatypes\",\n", - "\t\"metadata.types\"]\n", - "\n", - "\n", - "os.environ['SORT_COMMAND'] = sort_command" - ] - }, - { - "cell_type": "markdown", - "id": "5b91bcd0-50f0-415a-903b-f9da7394a4f4", - "metadata": {}, - "source": [ - "## Run the `import-wikidata` command" - ] - }, - { - "cell_type": "markdown", - "id": "926b6689-9fae-4bb9-b5c8-c2daead96f4f", - "metadata": {}, - "source": [ - "**NOTE**:\n", - "This command is set to import only english labels/aliases/descriptions, controlled by parameters `--all-languages False` and `--lang en`.\n", - "\n", - "If you wish to import all languages, simple set `--all-languages True`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "df8647da-650a-44a6-9d19-290c64765e31", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "kgtk import-wikidata version: 2021-11-17T01:38:17.437678+00:00#9z/aARcXhiV2hPdyVXjAREcpZwh2MawWFp6numz8GZBCtAg2WypLYAFpHjP43k97Zj8VHVaoel0oEit9KHXH0w==\n", - "Starting main process (pid 118098).\n", - "Processing.\n", - "Processing wikidata file /data/amandeep/wikidata-20220505/latest-all.json.bz2\n", - "Decompressing (bz2)\n", - "Creating the collector queue.\n", - "The collector node queue has been created (maxsize=36).\n", - "Creating the node_collector.\n", - "Creating the node collector process.\n", - "Starting the node collector process.\n", - "Started the node collector process.\n", - "The node collector is starting (pid 118140).\n", - "The collector edge queue has been created (maxsize=36).\n", - "Creating the edge_collector.\n", - "Creating the edge collector process.\n", - "Starting the edge collector process.\n", - "Started the edge collector process.\n", - "The edge collector is starting (pid 118141).\n", - "The collector qual queue has been created (maxsize=36).\n", - "Creating the qual_collector.\n", - "Creating the qual collector process.\n", - "Starting the qual collector process.\n", - "Started the qual collector process.\n", - "The qual collector is starting (pid 118142).\n", - "The collector invalid edge queue has been created (maxsize=36).\n", - "Creating the invalid_edge_collector.\n", - "Creating the invalid edge collector process.\n", - "Starting the invalid edge collector process.\n", - "Started the invalid edge collector process.\n", - "The invalid edge collector is starting (pid 118143).\n", - "The collector invalid qual queue has been created (maxsize=36).\n", - "Creating the invalid_qual_collector.\n", - "Creating the invalid qual collector process.\n", - "Starting the invalid qual collector process.\n", - "Started the invalid qual 
collector process.\n", - "The invalid qual collector is starting (pid 118144).\n", - "The collector description queue has been created (maxsize=36).\n", - "Creating the description collector.\n", - "Creating the description collector process.\n", - "Starting the description collector process.\n", - "Started the description collector process.\n", - "The description collector is starting (pid 118145).\n", - "The collector sitelink queue has been created (maxsize=36).\n", - "Creating the sitelink collector.\n", - "Creating the sitelink collector process.\n", - "Starting the sitelink collector process.\n", - "Started the sitelink collector process.\n", - "Sending the node header to the collector.\n", - "The sitelink collector is starting (pid 118146).\n", - "Sent the node header to the collector.\n", - "Sending the minimal edge file header to the collector.\n", - "Sent the minimal edge file header to the collector.\n", - "Sending the alias file header to the collector.\n", - "Sent the alias file header to the collector.\n", - "Sending the English alias file header to the collector.\n", - "Sent the English alias file header to the collector.\n", - "Sending the datatype file header to the collector.\n", - "Opening the node file in the node collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.node.unsorted.tsv.gz\n", - "Sent the datatype file header to the collector.\n", - "Sending the description file header to the collector.\n", - "Sent the description file header to the collector.\n", - "Sending the English description file header to the collector.\n", - "Sent the English description file header to the collector.\n", - "Sending the label file header to the collector.\n", - "Opening the minimal edge file in the edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz\n", - "Sent the label file header to the collector.\n", - "Sending the English label file header to the 
collector.\n", - "Sent the English label file header to the collector.\n", - "Sending the sitelink file header to the collector.\n", - "Sent the sitelink file header to the collector.\n", - "Sending the English sitelink file header to the collector.\n", - "Sent the English sitelink file header to the collector.\n", - "Sending the entry type file header to the collector.\n", - "Sent the entry type file header to the collector.\n", - "Sending the minimal invalid edge header to the collector.\n", - "Opening the description file in the description collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz\n", - "Sent the minimal invalid edge header to the collector.\n", - "Sending the minimal qual file header to the collector.\n", - "Sent the minimal qual file header to the collector.\n", - "Sending the minimal invalid qual header to the collector.\n", - "Sent the minimal invalid qual header to the collector.\n", - "Opening the wikipedia_sitelink file in the sitelink collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz\n", - "Creating parallel processor for /data/amandeep/wikidata-20220505/latest-all.json.bz2\n", - "Opening the invalid edge file in the invalid edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz\n", - "Opening the minimal qual file in the qual collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz\n", - "Opening the qual file in the invalid qual collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalue.unsorted.tsv.gz\n", - "Opening the alias file in the edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz\n", - "Opening the English description file in the description collector with KgtkWriter: 
/data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.en.unsorted.tsv.gz\n", - "Opening the English wikipedia_sitelink file in the sitelink collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz\n", - "Opening the English alias file in the edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.en.unsorted.tsv.gz\n", - "Opening the datatype file in the edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz\n", - "Opening the label file in the edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz\n", - "Start parallel processing\n", - "Opening the English label file in the edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.en.unsorted.tsv.gz\n", - "Opening the type file in the edge collector with KgtkWriter: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz\n", - "Starting worker process 0 (pid 118147).\n", - "Starting worker process 1 (pid 118148).\n", - "Starting worker process 2 (pid 118149).\n", - "Starting worker process 3 (pid 118150).\n", - "Starting worker process 4 (pid 118151).\n", - "Starting worker process 5 (pid 118152).\n", - "Starting worker process 6 (pid 118153).\n", - "Starting worker process 7 (pid 118154).\n", - "Starting worker process 8 (pid 118155).\n", - "Starting worker process 9 (pid 118156).\n", - "Starting worker process 10 (pid 118157).\n", - "Starting worker process 11 (pid 118158).\n", - "\n", - "*** Sitelink collision #1 detected for Q7580-wikipedia_sitelink-dcda22 (http://lv.wikipedia.org/wiki/1743._gads)\n", - "\n", - "*** Qualifier collision #1 detected for Q37062-P26-Q2028843-b2e6740f-0-P580-6f4356 (^1411-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q61814-P26-Q66516-1fa99291-0-P580-d435a1 
(^1502-04-10T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q62481-P26-Q2086776-87b8910e-0-P580-360391 (^1561-10-12T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q70789-P26-Q935411-28987fd8-0-P580-941716 (^1463-05-10T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q89405-P26-Q101877-d20a377b-0-P580-2b9eed (^1560-07-01T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q150611-P26-Q233335-575116d2-0-P580-29c809 (^1521-05-25T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q165284-P26-Q353-84a8ff47-0-P580-776c43 (^1200-05-23T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q169992-P26-Q235487-0e315055-0-P580-7a47d9 (^1332-07-28T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q183698-P26-Q256222-4322595e-0-P580-1fecee (^1684-01-09T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q235447-P26-Q161958-0d89305f-0-P580-52c362 (^1406-10-26T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q256222-P26-Q183698-415fc5b0-0-P580-1fecee (^1684-01-09T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q380373-P26-Q1141121-48bebee4-0-P580-2e184a (^1294-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q380868-P26-Q384941-46f6240f-0-P580-4b742f (^1533-08-25T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q453771-P26-Q443876-84acba5b-0-P580-84a26a (^1446-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q477343-P26-Q3374718-c7014aa0-0-P580-a95d2d (^1573-10-27T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q702835-wikipedia_sitelink-6ce2fd (http://uk.wikipedia.org/wiki/Бессі_Купер)\n", - "\n", - "*** Qualifier collision #1 detected for Q1834423-P26-Q322841-6c85598c-0-P580-876067 (^1559-06-16T00:00:00Z/11)\n", - "\n", - "*** Alias collision #1 detected for Q2336516-alias-es-f24d14 ('Elecciones al 
Parlamento Europeo de 1989 en Dinamarca'@es)\n", - "\n", - "*** Qualifier collision #1 detected for Q3007367-P26-Q430782-f64d3af2-0-P580-5b468d (^1555-02-07T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q4299475-wikipedia_sitelink-39fa76 (http://.wikipedia.org/wiki/Template:Bot)\n", - "\n", - "*** Sitelink collision #1 detected for Q4847311-wikipedia_sitelink-c4b491 (http://.wikipedia.org/wiki/Template:Delete)\n", - "\n", - "*** Sitelink collision #1 detected for Q5406510-wikipedia_sitelink-c7418e (http://.wikipedia.org/wiki/Template:=)\n", - "\n", - "*** Sitelink collision #1 detected for Q5412328-wikipedia_sitelink-6bc2e1 (http://.wikipedia.org/wiki/Template:Trim)\n", - "\n", - "*** Sitelink collision #1 detected for Q5607945-wikipedia_sitelink-6795ff (http://mr.wikipedia.org/wiki/वर्ग:मार्गक्रमण_साचे)\n", - "\n", - "*** Sitelink collision #1 detected for Q5621274-wikipedia_sitelink-126246 (http://.wikipedia.org/wiki/Template:Column-count)\n", - "\n", - "*** Sitelink collision #1 detected for Q5882248-wikipedia_sitelink-1cc4bd (http://.wikipedia.org/wiki/Template:Documentation_subpage)\n", - "\n", - "*** Qualifier collision #1 detected for Q7529231-P26-Q6792225-896048a6-0-P580-5a896b (^1508-00-00T00:00:00Z/9)\n", - "\n", - "*** Sitelink collision #1 detected for Q9737782-wikipedia_sitelink-0b4a66 (http://.wikipedia.org/wiki/Category:User_templates)\n", - "\n", - "*** Sitelink collision #1 detected for Q10560270-wikipedia_sitelink-2fe24f (http://.wikipedia.org/wiki/Template:Under_construction)\n", - "\n", - "*** Qualifier collision #1 detected for Q13058108-P159-Q1354-267a1462-0-P625-cb2660 (@23.728063/90.419591)\n", - "\n", - "*** Sitelink collision #1 detected for Q13156670-wikipedia_sitelink-c35c37 (http://.wikipedia.org/wiki/Template:Interwiki_redirect)\n", - "\n", - "*** Sitelink collision #1 detected for Q14511701-wikipedia_sitelink-5b1836 (http://.wikipedia.org/wiki/Template:TemplateData_header)\n", - "\n", - "*** Sitelink collision #1 
detected for Q14635514-wikipedia_sitelink-156619 (http://.wikipedia.org/wiki/Template:Reply_to)\n", - "\n", - "*** Sitelink collision #1 detected for Q7253814-wikipedia_sitelink-9e2840 (http://.wikipedia.org/wiki/Module:String)\n", - "\n", - "*** Sitelink collision #1 detected for Q7348344-wikipedia_sitelink-d451bf (http://.wikipedia.org/wiki/Module:Coordinates)\n", - "\n", - "*** Sitelink collision #1 detected for Q15818920-wikipedia_sitelink-fa275b (http://.wikipedia.org/wiki/Template:Autoarchive_resolved_section)\n", - "The node collector called 500000 times: 2500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 500000 times: 0 nrows, 60662629 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 500000 times: 0 nrows, 81273275 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The sitelink collector called 500000 times: 0 nrows, 33927067 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q29053400-wikipedia_sitelink-b11f2e (http://.wikipedia.org/wiki/Category:Pages_with_template_loops)\n", - "The qual collector called 500000 times: 0 nrows, 0 erows, 7571086 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 1000000 times: 5000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 1000000 times: 0 nrows, 133466230 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 1000000 times: 0 nrows, 200314003 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "500000 lines processed by processor 5\n", - "500000 lines processed by processor 1\n", - "500000 lines processed by processor 9\n", - "500000 lines processed by processor 4\n", - "500000 lines processed by processor 10\n", - "500000 lines processed by processor 6\n", - "500000 lines processed by processor 7\n", - "500000 lines processed by processor 2\n", - "500000 lines processed 
by processor 11\n", - "500000 lines processed by processor 8\n", - "500000 lines processed by processor 3\n", - "\n", - "*** Qualifier collision #1 detected for Q55579391-P26-Q121846-1952d1ff-0-P580-cae35d (^1284-00-00T00:00:00Z/9)\n", - "500000 lines processed by processor 0\n", - "\n", - "*** Sitelink collision #1 detected for Q58832772-wikipedia_sitelink-ea0ae7 (http://.wikipedia.org/wiki/Module:LangSwitch)\n", - "The qual collector called 1000000 times: 0 nrows, 0 erows, 18531429 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 1500000 times: 7500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 1500000 times: 0 nrows, 183700332 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 1500000 times: 0 nrows, 268671270 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The qual collector called 1500000 times: 0 nrows, 0 erows, 28002684 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 2000000 times: 10000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 2000000 times: 0 nrows, 243125009 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 2000000 times: 0 nrows, 300937648 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q90722487-wikipedia_sitelink-24b73c (http://.wikipedia.org/wiki/Category:Pages_using_deprecated_source_tags)\n", - "\n", - "*** Sitelink collision #1 detected for Q99735928-wikipedia_sitelink-a62902 (http://.wikipedia.org/wiki/Template:BCP47)\n", - "The sitelink collector called 1000000 times: 0 nrows, 40653120 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "1000000 lines processed by processor 5\n", - "\n", - "*** Sitelink collision #1 detected for Q109249671-wikipedia_sitelink-f0183d (http://.wikipedia.org/wiki/Template:None)\n", - "1000000 lines processed by processor 8\n", - 
"1000000 lines processed by processor 10\n", - "1000000 lines processed by processor 7\n", - "1000000 lines processed by processor 0\n", - "1000000 lines processed by processor 6\n", - "1000000 lines processed by processor 11\n", - "1000000 lines processed by processor 9\n", - "1000000 lines processed by processor 4\n", - "1000000 lines processed by processor 1\n", - "1000000 lines processed by processor 2\n", - "1000000 lines processed by processor 3\n", - "The qual collector called 2000000 times: 0 nrows, 0 erows, 38508407 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q8058-P26-Q254085-4eab60ab-0-P580-8df26d (^1436-06-24T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q8384-P26-Q70590-6edd7354-0-P580-e23c66 (^1305-09-23T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q13167-P348-99b09e-08cc7a6d-0-P577-07f6e3 (^2016-07-12T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q57161-P26-Q441394-f0d02358-0-P580-77780b (^1308-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q57161-P26-Q467019-4ee33344-0-P580-9268e9 (^1324-02-25T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q74019-P26-Q540767-c098df36-0-P580-5774e5 (^1422-07-25T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q121130-P26-Q119431-af2d7776-0-P580-10c067 (^1197-05-25T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q151587-P26-Q7996-4448a491-0-P580-62d46c (^1572-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q153319-P26-Q57852-a80af489-0-P580-56d3ba (^1725-06-01T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q154998-P26-Q234549-77a9d927-0-P580-45ce34 (^1525-10-29T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q184868-P26-Q390071-b34d3d54-0-P580-90dfde (^1680-07-18T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for 
Q203647-P26-Q2284422-aed54bb0-0-P580-246002 (^1045-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q232137-P26-Q41847-0dcc4fd6-0-P580-fe3abc (^0956-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q263474-P26-Q3044-90d4ea9f-0-P580-9b0b8a (^0770-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q333359-P26-Q3052486-9cbd9d9e-0-P580-355ae9 (^0960-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q454810-P26-Q702209-74f88753-0-P580-e60df9 (^1476-08-25T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q598906-P26-Q1635933-0d160adf-0-P580-ec2def (^1236-11-01T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q672446-P26-Q2912335-f19e5091-0-P580-93d3bd (^1447-12-14T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q674931-P26-Q19601994-f7d507fb-0-P580-9b41a5 (^1222-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q694351-P26-Q329555-c88da6e5-0-P580-15a1f0 (^1381-09-02T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q702602-P26-Q79176-0f28ed9a-0-P580-676c21 (^1431-06-03T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q719501-P26-Q69462-4f695a08-0-P580-79dbc8 (^1512-07-06T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q921009-wikipedia_sitelink-53a877 (http://ceb.wikipedia.org/wiki/Balod_(lungsod_sa_Indiya))\n", - "\n", - "*** Sitelink collision #1 detected for Q1007634-wikipedia_sitelink-ed9b51 (http://ko.wikipedia.org/wiki/에스텔리)\n", - "\n", - "*** Sitelink collision #1 detected for Q1071820-wikipedia_sitelink-aab9e2 (http://br.wikipedia.org/wiki/Lagostomus)\n", - "\n", - "*** Qualifier collision #1 detected for Q1106184-P26-Q4331742-bc45332b-0-P580-e7880a (^1555-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q1381324-P26-Q535528-9a0e7ede-0-P580-c9b00e (^1221-06-19T00:00:00Z/11)\n", - "\n", - "*** Qualifier 
collision #1 detected for Q3139317-P159-Q9799-e865820c-0-P625-be8120 (@50.8802/5.9595)\n", - "The node collector called 2500000 times: 12500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 2500000 times: 0 nrows, 314121442 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q4115450-P159-Q191204-8fdad044-0-P625-a98823 (@35.569778/45.352163)\n", - "The description collector called 2500000 times: 0 nrows, 345747666 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q5411705-wikipedia_sitelink-7111a4 (http://.wikipedia.org/wiki/Template:Clear)\n", - "\n", - "*** Sitelink collision #1 detected for Q5459259-wikipedia_sitelink-044e48 (http://.wikipedia.org/wiki/Template:Center)\n", - "\n", - "*** Sitelink collision #1 detected for Q5622198-wikipedia_sitelink-a3c8ed (http://.wikipedia.org/wiki/Template:Done)\n", - "\n", - "*** Sitelink collision #1 detected for Q5646673-wikipedia_sitelink-162b1a (http://.wikipedia.org/wiki/Template:Pp-template)\n", - "\n", - "*** Sitelink collision #1 detected for Q6063221-wikipedia_sitelink-4fe51c (http://.wikipedia.org/wiki/Template:Mbox)\n", - "\n", - "*** Sitelink collision #1 detected for Q6133158-wikipedia_sitelink-62de50 (http://.wikipedia.org/wiki/Template:@)\n", - "\n", - "*** Qualifier collision #1 detected for Q6867218-P159-Q9268849-32a831ad-0-P625-51c420 (@52.223817/21.005108)\n", - "\n", - "*** Qualifier collision #1 detected for Q9061646-P39-Q84701409-5a714518-0-P580-3e1e37 (^1116-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q9061646-P39-Q84701409-5a714518-0-P582-ac0fb1 (^1154-00-00T00:00:00Z/9)\n", - "\n", - "*** Sitelink collision #1 detected for Q16051502-wikipedia_sitelink-1ba00f (http://arz.wikipedia.org/wiki/صوت_الصمت_2013)\n", - "\n", - "*** Sitelink collision #1 detected for Q16748603-wikipedia_sitelink-1c7ff7 
(http://.wikipedia.org/wiki/Module:No_globals)\n", - "\n", - "*** Sitelink collision #1 detected for Q17347205-wikipedia_sitelink-9e0e56 (http://.wikipedia.org/wiki/Module:Category_handler/config)\n", - "The sitelink collector called 1500000 times: 0 nrows, 73918436 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q26878179-P26-Q55169081-de1c53f2-0-P580-01b412 (^1571-09-08T00:00:00Z/11)\n", - "The node collector called 3000000 times: 15000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 3000000 times: 0 nrows, 363934028 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q28941166-wikipedia_sitelink-d4f303 (http://fr.wikipedia.org/wiki/Tempête_de_neige_de_la_mi-mars_2017_dans_l'est_de_l'Amérique_du_Nord)\n", - "The description collector called 3000000 times: 0 nrows, 435008293 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The qual collector called 2500000 times: 0 nrows, 0 erows, 45967305 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 3500000 times: 17500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 3500000 times: 0 nrows, 439374610 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 3500000 times: 0 nrows, 558743824 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "1500000 lines processed by processor 5\n", - "1500000 lines processed by processor 7\n", - "1500000 lines processed by processor 4\n", - "1500000 lines processed by processor 9\n", - "1500000 lines processed by processor 11\n", - "1500000 lines processed by processor 8\n", - "1500000 lines processed by processor 0\n", - "1500000 lines processed by processor 1\n", - "1500000 lines processed by processor 10\n", - "1500000 lines processed by processor 6\n", - "1500000 lines processed by processor 2\n", - "\n", - "*** Qualifier 
collision #1 detected for Q54902946-P26-Q31191593-fb18c102-0-P580-d109bb (^1560-12-15T00:00:00Z/11)\n", - "1500000 lines processed by processor 3\n", - "\n", - "*** Sitelink collision #1 detected for Q56528384-wikipedia_sitelink-563962 (http://.wikipedia.org/wiki/Module:I18n/date)\n", - "\n", - "*** Qualifier collision #1 detected for Q56582849-P26-Q72922-06e7a6cd-0-P580-c16f56 (^1499-01-21T00:00:00Z/11)\n", - "The qual collector called 3000000 times: 0 nrows, 0 erows, 56981081 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 4000000 times: 20000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 4000000 times: 0 nrows, 486362243 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 4000000 times: 0 nrows, 604767527 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q75458516-P26-Q7324457-79a267cb-0-P580-221dc5 (^1568-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q75552262-P26-Q75552257-6fa3779f-0-P580-04284b (^1556-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q75552257-P26-Q75552262-2af17717-0-P580-04284b (^1556-00-00T00:00:00Z/9)\n", - "The qual collector called 3500000 times: 0 nrows, 0 erows, 66478883 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 4500000 times: 22500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 4500000 times: 0 nrows, 548812363 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 4500000 times: 0 nrows, 643030994 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The sitelink collector called 2000000 times: 0 nrows, 81240831 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "2000000 lines processed by processor 5\n", - "2000000 lines processed by processor 7\n", - "2000000 lines processed by processor 4\n", - 
"2000000 lines processed by processor 11\n", - "2000000 lines processed by processor 8\n", - "2000000 lines processed by processor 2\n", - "2000000 lines processed by processor 9\n", - "2000000 lines processed by processor 10\n", - "2000000 lines processed by processor 0\n", - "2000000 lines processed by processor 1\n", - "2000000 lines processed by processor 6\n", - "2000000 lines processed by processor 3\n", - "The qual collector called 4000000 times: 0 nrows, 0 erows, 77003327 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q2323-wikipedia_sitelink-8e4ee7 (http://yo.wikipedia.org/wiki/8_February)\n", - "\n", - "*** Sitelink collision #1 detected for Q8877-wikipedia_sitelink-dcbe09 (http://scn.wikipedia.org/wiki/Steven_Spielberg)\n", - "\n", - "*** Sitelink collision #1 detected for Q9696-wikipedia_sitelink-c8fdf8 (http://haw.wikipedia.org/wiki/John_Fitzgerald_Kennedy)\n", - "\n", - "*** Qualifier collision #1 detected for Q40433-P26-Q463669-cd43ed58-0-P580-480b99 (^1550-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q57654-P26-Q154041-8d52292f-0-P580-3b3df4 (^1572-07-20T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q66270-P26-Q325505-28bc872e-0-P580-08d4a0 (^1478-05-29T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q66516-P26-Q61814-43ebfd75-0-P580-d435a1 (^1502-04-10T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q101877-P26-Q89405-6e0cba4d-0-P580-2b9eed (^1560-07-01T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q155167-P26-Q269586-cc56bab6-0-P580-c54274 (^1334-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q168669-P26-Q193658-6bff08d2-0-P580-e8a3ec (^0939-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q241797-P26-Q7731-b7834ae7-0-P580-a01064 (^1671-02-01T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for 
Q262059-P26-Q187312-c501aba2-0-P580-7e48ad (^1302-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q267483-P26-Q57920-80635ac2-0-P580-c0fc4c (^1570-01-08T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q271799-P26-Q169319-a97c2304-0-P580-0d082c (^1523-12-11T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q326738-P26-Q684224-2df6ee20-0-P580-ff2137 (^1524-01-17T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q327572-P26-Q68952-ae5f6316-0-P580-5906e2 (^1563-05-10T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q374210-P26-Q4768218-c9e0eacd-0-P580-ef8382 (^1571-12-19T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q384941-P26-Q380868-4ca9581a-0-P580-4b742f (^1533-08-25T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q536174-P26-Q551752-c9a99a5e-0-P580-16c9b2 (^1229-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q574718-P26-Q21153658-ffa49040-0-P580-03dd18 (^1319-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q623188-P26-Q553289-7323bb58-0-P580-d8d288 (^1090-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q947423-P26-Q5358431-95b068e2-0-P580-6b2ce5 (^1152-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q2039358-P26-Q13474657-3f305fc3-0-P580-593f4e (^1558-00-00T00:00:00Z/9)\n", - "\n", - "*** Sitelink collision #1 detected for Q6428609-wikipedia_sitelink-660dd5 (http://nah.wikipedia.org/wiki/Neneuhcāyōtl:Tlatequitiltilīlli_pt-1)\n", - "\n", - "*** Sitelink collision #1 detected for Q6705618-wikipedia_sitelink-4b5e22 (http://.wikipedia.org/wiki/Template:Autotranslate)\n", - "The node collector called 5000000 times: 25000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 5000000 times: 0 nrows, 619728420 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** 
Sitelink collision #1 detected for Q7221363-wikipedia_sitelink-78630c (http://.wikipedia.org/wiki/Category:Lua-based_templates)\n", - "\n", - "*** Qualifier collision #1 detected for Q7324457-P26-Q75567328-84b7c804-0-P580-4e6c67 (^1553-11-24T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q7324457-P26-Q75458516-fee1a551-0-P580-221dc5 (^1568-00-00T00:00:00Z/9)\n", - "\n", - "*** Sitelink collision #1 detected for Q7453435-wikipedia_sitelink-c016e9 (http://ku.wikipedia.org/wiki/Kategorî:Ewrasya)\n", - "The description collector called 5000000 times: 0 nrows, 710063872 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q10350561-wikipedia_sitelink-5f2907 (http://.wikipedia.org/wiki/Template:Lua)\n", - "\n", - "*** Sitelink collision #1 detected for Q15116966-wikipedia_sitelink-74d712 (http://.wikipedia.org/wiki/Module:Message_box)\n", - "\n", - "*** Sitelink collision #1 detected for Q15212145-wikipedia_sitelink-907e39 (http://.wikipedia.org/wiki/Template:LangSwitch)\n", - "\n", - "*** Sitelink collision #1 detected for Q17121869-wikipedia_sitelink-582caf (http://.wikipedia.org/wiki/Module:Lua_banner)\n", - "The sitelink collector called 2500000 times: 0 nrows, 113956702 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q21153658-P26-Q574718-39f28f24-0-P580-03dd18 (^1319-00-00T00:00:00Z/9)\n", - "\n", - "*** Sitelink collision #1 detected for Q26905108-wikipedia_sitelink-51f8f2 (http://.wikipedia.org/wiki/Module:I18n/complex_date)\n", - "The node collector called 5500000 times: 27500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 5500000 times: 0 nrows, 669593652 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 5500000 times: 0 nrows, 797514432 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The qual collector called 4500000 times: 0 nrows, 0 erows, 
84435753 qrows, 0 invalid erows, 0 invalid qrows\n", - "2500000 lines processed by processor 5\n", - "2500000 lines processed by processor 7\n", - "2500000 lines processed by processor 4\n", - "2500000 lines processed by processor 11\n", - "2500000 lines processed by processor 0\n", - "2500000 lines processed by processor 8\n", - "2500000 lines processed by processor 9\n", - "The node collector called 6000000 times: 30000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 6000000 times: 0 nrows, 742817342 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "2500000 lines processed by processor 2\n", - "2500000 lines processed by processor 10\n", - "2500000 lines processed by processor 6\n", - "2500000 lines processed by processor 1\n", - "2500000 lines processed by processor 3\n", - "The description collector called 6000000 times: 0 nrows, 918682552 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The qual collector called 5000000 times: 0 nrows, 0 erows, 95556082 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 6500000 times: 0 nrows, 788430809 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 6500000 times: 32500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 6500000 times: 0 nrows, 944911355 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The qual collector called 5500000 times: 0 nrows, 0 erows, 104971336 qrows, 0 invalid erows, 0 invalid qrows\n", - "The sitelink collector called 3000000 times: 0 nrows, 121890140 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 7000000 times: 35000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 7000000 times: 0 nrows, 852807659 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 7000000 times: 0 nrows, 985723776 erows, 0 qrows, 0 invalid erows, 
0 invalid qrows\n", - "3000000 lines processed by processor 5\n", - "3000000 lines processed by processor 7\n", - "3000000 lines processed by processor 2\n", - "3000000 lines processed by processor 11\n", - "3000000 lines processed by processor 10\n", - "3000000 lines processed by processor 0\n", - "3000000 lines processed by processor 8\n", - "3000000 lines processed by processor 9\n", - "3000000 lines processed by processor 4\n", - "3000000 lines processed by processor 6\n", - "3000000 lines processed by processor 3\n", - "3000000 lines processed by processor 1\n", - "The qual collector called 6000000 times: 0 nrows, 0 erows, 115587156 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q3044-P26-Q263474-631d88d0-0-P580-9b0b8a (^0770-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q3044-P26-Q261866-27b1ed09-0-P580-3fbd66 (^0794-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q27932-P26-Q287503-29306074-0-P580-11c3a9 (^1237-04-25T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q38370-P26-Q80823-ae3ce4e4-0-P580-c9d352 (^1533-01-25T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q38370-P26-Q182637-8103e2ff-0-P580-7524c3 (^1536-05-30T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q38370-P26-Q57126-cb76b09d-0-P580-c55b0a (^1540-01-06T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q38370-P26-Q188926-259757b1-0-P580-3301d6 (^1540-07-28T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q38370-P26-Q192943-4b53adeb-0-P580-1ea2b6 (^1543-07-12T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q43682-wikipedia_sitelink-7b59de (http://ms.wikipedia.org/wiki/Philipp_Lahm)\n", - "\n", - "*** Qualifier collision #1 detected for Q60563-P26-Q2915743-f5fdee07-0-P580-cade68 (^1169-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for 
Q65946-P26-Q462536-89b54878-0-P580-48f754 (^1407-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q66888-P26-Q3721846-b7243730-0-P580-4e0bc1 (^1571-01-09T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q66888-P26-Q23771111-34bc78ba-0-P580-713f01 (^1560-03-03T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q68304-P26-Q539111-dfcad6f4-0-P580-d0edbb (^1545-05-17T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q95627-P26-Q354945-873a167d-0-P580-3b86a9 (^1276-11-24T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q121846-P26-Q55579391-afdbc2b3-0-P580-cae35d (^1284-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q122794-P26-Q430950-e085ea2d-0-P580-94ae3a (^1577-10-20T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q124682-P26-Q337057-4fb67536-0-P580-db5ec5 (^1389-08-17T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q130005-P26-Q259564-c738415f-0-P580-a4a595 (^1045-01-23T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q132545-P26-Q131552-2fbc7eb5-0-P580-e56690 (^1533-10-28T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q134452-P26-Q201143-a2079e30-0-P580-7c0e43 (^1491-12-06T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q160349-P26-Q154064-ec5ff971-0-P580-7a7cba (^1385-07-17T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q220845-P26-Q936976-0f99833d-0-P580-5eeb19 (^1572-08-18T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q234257-P26-Q170398-56a0eb9a-0-P580-850b4d (^1816-01-24T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q259564-P26-Q130005-bd5ab415-0-P580-a4a595 (^1045-01-23T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q325583-P26-Q527486-704144b1-0-P580-f981af (^1577-05-19T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for 
Q326449-P26-Q23682783-85a9914e-0-P580-13178a (^1736-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q440132-P26-Q506527-db15118a-0-P580-2bef25 (^1524-11-06T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q469389-P26-Q1924994-36c61689-0-P580-017942 (^1377-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q658714-P26-Q20498980-28fdf4a6-0-P580-0672b7 (^1409-01-30T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q682736-P26-Q68285-f3f03090-0-P580-eae385 (^1460-11-19T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q684276-P26-Q61576937-87fbff2c-0-P580-2b5632 (^1217-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q684224-P26-Q326738-18c31ccf-0-P580-ff2137 (^1524-01-17T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q703249-P26-Q1309296-09047836-0-P580-a97c74 (^1228-00-00T00:00:00Z/9)\n", - "\n", - "*** Sitelink collision #1 detected for Q1581723-wikipedia_sitelink-ff37b1 (http://eu.wikipedia.org/wiki/The_Love_Parade)\n", - "\n", - "*** Qualifier collision #1 detected for Q2028843-P26-Q37062-259ae253-0-P580-6f4356 (^1411-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q6244699-P26-Q76366716-754c9057-0-P580-d7261a (^1579-04-27T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q7020051-wikipedia_sitelink-033b05 (http://fi.wikipedia.org/wiki/Luokka:Palkitut)\n", - "The node collector called 7500000 times: 37500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 7500000 times: 0 nrows, 922083165 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q15117391-wikipedia_sitelink-00ef81 (http://.wikipedia.org/wiki/Module:Message_box/configuration)\n", - "The description collector called 7500000 times: 0 nrows, 1062631793 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The sitelink 
collector called 3500000 times: 0 nrows, 153793624 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q25713407-wikipedia_sitelink-ff4697 (http://.wikipedia.org/wiki/Template:CURRENTCONTENTLANGUAGE)\n", - "The node collector called 8000000 times: 40000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 8000000 times: 0 nrows, 974216636 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The qual collector called 6500000 times: 0 nrows, 0 erows, 122984270 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 8000000 times: 0 nrows, 1155675148 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "3500000 lines processed by processor 0\n", - "3500000 lines processed by processor 5\n", - "3500000 lines processed by processor 7\n", - "3500000 lines processed by processor 11\n", - "3500000 lines processed by processor 2\n", - "3500000 lines processed by processor 10\n", - "3500000 lines processed by processor 8\n", - "3500000 lines processed by processor 9\n", - "3500000 lines processed by processor 4\n", - "3500000 lines processed by processor 3\n", - "3500000 lines processed by processor 6\n", - "3500000 lines processed by processor 1\n", - "The node collector called 8500000 times: 42500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 8500000 times: 0 nrows, 1047004832 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The qual collector called 7000000 times: 0 nrows, 0 erows, 134121504 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 8500000 times: 0 nrows, 1254745369 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q64944842-wikipedia_sitelink-8258da (http://.wikipedia.org/wiki/Module:Portal_navigation)\n", - "\n", - "*** Qualifier collision #1 detected for Q65617406-P26-Q265478-6faeca05-0-P580-dc9c16 
(^1884-05-30T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q76366716-P26-Q6244699-b5d45f0b-0-P580-d7261a (^1579-04-27T00:00:00Z/11)\n", - "The edge collector called 9000000 times: 0 nrows, 1092386056 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 9000000 times: 45000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The qual collector called 7500000 times: 0 nrows, 0 erows, 143533696 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 9000000 times: 0 nrows, 1286483409 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The sitelink collector called 4000000 times: 0 nrows, 162571004 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 9500000 times: 47500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 9500000 times: 0 nrows, 1155205313 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q102226589-wikipedia_sitelink-c65fff (http://.wikipedia.org/wiki/Template:User_mnw)\n", - "4000000 lines processed by processor 7\n", - "4000000 lines processed by processor 0\n", - "4000000 lines processed by processor 11\n", - "4000000 lines processed by processor 5\n", - "4000000 lines processed by processor 10\n", - "4000000 lines processed by processor 8\n", - "4000000 lines processed by processor 2\n", - "The description collector called 9500000 times: 0 nrows, 1331240615 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "4000000 lines processed by processor 9\n", - "4000000 lines processed by processor 4\n", - "4000000 lines processed by processor 3\n", - "4000000 lines processed by processor 1\n", - "4000000 lines processed by processor 6\n", - "The qual collector called 8000000 times: 0 nrows, 0 erows, 154200276 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q3740-wikipedia_sitelink-7aaed8 
(http://.wikipedia.org/wiki/Category:Templates)\n", - "\n", - "*** Sitelink collision #1 detected for Q8079-wikipedia_sitelink-ade6e5 (http://ga.wikipedia.org/wiki/Nintendo_Wii)\n", - "\n", - "*** Qualifier collision #1 detected for Q41847-P26-Q232137-573ea212-0-P580-fe3abc (^0956-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q60386-P26-Q157776-b68a50b9-0-P580-e55fcf (^1478-09-06T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q60211-P26-Q264709-a5d5e20b-0-P580-7f1413 (^1564-12-17T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q70828-P26-Q110845-78948fbb-0-P580-189c4f (^1282-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q79176-P26-Q702602-bcda292d-0-P580-676c21 (^1431-06-03T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q112783-wikipedia_sitelink-d7b2c3 (http://uk.wikipedia.org/wiki/Вілкокс_(округ))\n", - "\n", - "*** Qualifier collision #1 detected for Q119050-P26-Q26882160-c09f6014-0-P580-78cd0b (^1567-01-13T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q120365-P26-Q69620-c54a3667-0-P580-05429a (^1116-07-13T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q167782-P26-Q231794-aef59aa3-0-P580-7e2e98 (^1350-04-08T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q170586-P26-Q231742-11295529-0-P580-90c66e (^1313-07-00T00:00:00Z/10)\n", - "\n", - "*** Qualifier collision #1 detected for Q172203-P26-Q229419-b442326a-0-P580-a50c51 (^1262-05-28T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q174964-P26-Q231798-bd2d3d6b-0-P580-dc0f7a (^1322-09-21T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q202566-P26-Q688471-440b6399-0-P580-283d12 (^1531-09-20T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q232801-P26-Q721680-fa26b14e-0-P580-70598b (^1473-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for 
Q270234-P26-Q210569-6b693078-0-P580-f6928a (^1446-06-20T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q325824-P26-Q547225-762b0607-0-P580-df29d7 (^1467-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q441394-P26-Q57161-47bffbac-0-P580-77780b (^1308-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q454769-P26-Q76956-91d862f6-0-P580-981a99 (^1245-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q455201-P26-Q152148-9eb66558-0-P580-e1de94 (^1389-05-02T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q479538-P26-Q98010-40ca7cda-0-P580-31ff5b (^1582-11-10T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q513315-P26-Q87066-cd6b2f7c-0-P580-3f638b (^1551-03-10T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q513315-P26-Q70019-5c7fa382-0-P580-f9548c (^1558-08-01T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q547225-P26-Q325824-31db3890-0-P580-df29d7 (^1467-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q567378-P26-Q434771-205319b2-0-P580-4d06ab (^1509-11-20T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q571597-P26-Q327750-b0a44162-0-P580-f16789 (^1555-09-10T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q687028-P26-Q2334373-525f829d-0-P580-a6af64 (^1556-02-16T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q1070853-P26-Q2467970-c7d5f6fa-0-P580-d71f7b (^1358-09-04T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q1141121-P26-Q380373-eeba5d95-0-P580-2e184a (^1294-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q1916706-P26-Q80714-35bbccc5-0-P580-d3fce7 (^1109-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q2049198-P26-Q63291-be046904-0-P580-f3b88a (^1372-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for 
Q2334373-P26-Q687028-b6b9f398-0-P580-a6af64 (^1556-02-16T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q2334373-P26-Q328693-3f939052-0-P580-0d15c6 (^1543-08-26T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q3926051-wikipedia_sitelink-a027cd (http://.wikipedia.org/wiki/Template:Tl)\n", - "\n", - "*** Sitelink collision #1 detected for Q4481730-wikipedia_sitelink-1d5954 (http://.wikipedia.org/wiki/Template:Tracked)\n", - "\n", - "*** Sitelink collision #1 detected for Q4989282-wikipedia_sitelink-8a2fcc (http://.wikipedia.org/wiki/Category:Pages_with_broken_file_links)\n", - "\n", - "*** Sitelink collision #1 detected for Q5070586-wikipedia_sitelink-b1e4d1 (http://.wikipedia.org/wiki/Template:Shortcut)\n", - "\n", - "*** Sitelink collision #1 detected for Q6027565-wikipedia_sitelink-91a43e (http://.wikipedia.org/wiki/Template:Tag)\n", - "\n", - "*** Qualifier collision #1 detected for Q6940461-P159-Q61302-c45d5aa7-0-P625-dc88d7 (@28.6386/-106.0756)\n", - "\n", - "*** Sitelink collision #1 detected for Q7643575-wikipedia_sitelink-d4d012 (http://.wikipedia.org/wiki/Template:Colon)\n", - "\n", - "*** Qualifier collision #1 detected for Q9150575-P26-Q679083-79dd46a6-0-P580-5d5db4 (^1320-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q13972091-P26-Q75389849-5b19ecc3-0-P1319-532ed8 (^1509-07-04T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q13972091-P26-Q6469914-b9869239-0-P1319-839147 (^1520-00-00T00:00:00Z/9)\n", - "The node collector called 10000000 times: 50000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 10000000 times: 0 nrows, 1225496761 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q15281133-P26-Q75240211-8a7057f8-0-P580-97ad08 (^1526-07-20T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q15605797-wikipedia_sitelink-d3c200 
(http://.wikipedia.org/wiki/Module:List)\n", - "The sitelink collector called 4500000 times: 0 nrows, 193397096 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q17347230-wikipedia_sitelink-be73cc (http://.wikipedia.org/wiki/Module:Category_handler/blacklist)\n", - "\n", - "*** Qualifier collision #1 detected for Q20202663-P26-Q299612-893fda0a-0-P580-b50376 (^1080-00-00T00:00:00Z/9)\n", - "The description collector called 10000000 times: 0 nrows, 1410220232 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q20819962-wikipedia_sitelink-377adb (http://.wikipedia.org/wiki/Module:Fallback)\n", - "\n", - "*** Qualifier collision #1 detected for Q26877297-P26-Q542751-a70d423c-0-P580-d584ea (^1488-02-17T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q27031232-P26-Q55217321-0fe60a4f-0-P580-7606e7 (^1280-00-00T00:00:00Z/9)\n", - "The qual collector called 8500000 times: 0 nrows, 0 erows, 161593444 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 10500000 times: 52500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 10500000 times: 0 nrows, 1291455380 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 10500000 times: 0 nrows, 1518697994 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q44191792-P26-Q54862322-4c83e8e6-0-P580-a27cd1 (^1567-08-21T00:00:00Z/11)\n", - "4500000 lines processed by processor 7\n", - "4500000 lines processed by processor 0\n", - "4500000 lines processed by processor 11\n", - "4500000 lines processed by processor 5\n", - "4500000 lines processed by processor 10\n", - "4500000 lines processed by processor 8\n", - "4500000 lines processed by processor 9\n", - "4500000 lines processed by processor 2\n", - "4500000 lines processed by processor 4\n", - "4500000 lines 
processed by processor 3\n", - "4500000 lines processed by processor 1\n", - "4500000 lines processed by processor 6\n", - "The qual collector called 9000000 times: 0 nrows, 0 erows, 172525914 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 11000000 times: 55000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 11000000 times: 0 nrows, 1351553522 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q64506586-P26-Q262726-b670dee9-0-P580-4a9d3d (^1298-00-00T00:00:00Z/9)\n", - "The description collector called 11000000 times: 0 nrows, 1600991120 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The sitelink collector called 5000000 times: 0 nrows, 203335940 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q75395291-P26-Q76157640-e3d697ee-0-P580-54254d (^1578-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q75420332-P26-Q208922-3b5559ee-0-P580-447dca (^1559-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q76157640-P26-Q75395291-d75eee5e-0-P580-54254d (^1578-00-00T00:00:00Z/9)\n", - "The edge collector called 11500000 times: 0 nrows, 1398071827 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 11500000 times: 57500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The qual collector called 9500000 times: 0 nrows, 0 erows, 181961851 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 11500000 times: 0 nrows, 1631823417 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "5000000 lines processed by processor 0\n", - "5000000 lines processed by processor 11\n", - "5000000 lines processed by processor 7\n", - "5000000 lines processed by processor 8\n", - "5000000 lines processed by processor 10\n", - "5000000 lines processed by processor 5\n", - "The node collector 
called 12000000 times: 60000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 12000000 times: 0 nrows, 1457551225 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "5000000 lines processed by processor 9\n", - "5000000 lines processed by processor 4\n", - "5000000 lines processed by processor 2\n", - "5000000 lines processed by processor 3\n", - "5000000 lines processed by processor 6\n", - "5000000 lines processed by processor 1\n", - "The qual collector called 10000000 times: 0 nrows, 0 erows, 192683479 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q7731-P26-Q259907-7f7cc241-0-P580-8d5052 (^1648-01-26T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q7731-P26-Q241797-ff9269a2-0-P580-a01064 (^1671-02-01T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q57920-P26-Q267483-a2460de3-0-P580-c0fc4c (^1570-01-08T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q62483-P26-Q229286-18b62769-0-P580-bebb21 (^1541-06-14T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q62483-P26-Q261905-fc01d066-0-P580-7aecc7 (^1546-07-18T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q69334-P26-Q2419674-1dc5e587-0-P580-e1ff18 (^1183-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q72922-P26-Q56582849-77ca7313-0-P580-c16f56 (^1499-01-21T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q76956-P26-Q454769-cf7fc40d-0-P580-981a99 (^1245-00-00T00:00:00Z/9)\n", - "\n", - "*** Sitelink collision #1 detected for Q80235-wikipedia_sitelink-03e5ac (http://is.wikipedia.org/wiki/Tamarind)\n", - "\n", - "*** Qualifier collision #1 detected for Q86055-P26-Q24661944-f75c4596-0-P580-54820b (^1472-10-19T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q151327-wikipedia_sitelink-9044c0 (http://oc.wikipedia.org/wiki/(333)_Badenia)\n", - "\n", - "*** 
Qualifier collision #1 detected for Q168664-P26-Q15193-1b533b05-0-P580-a310ca (^1793-10-09T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q276526-P26-Q10855916-3e70b907-0-P580-f18c2a (^1392-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q287503-P26-Q316828-d4637da7-0-P580-9879f5 (^1261-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q316831-P26-Q238609-208f7dcc-0-P580-92ae06 (^1153-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q316831-P26-Q450971-656d5797-0-P580-5ed4f3 (^1177-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q336754-P26-Q2084307-30a93eb5-0-P580-b520a9 (^1318-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q370902-P26-Q75289133-2d7df0e9-0-P580-83a193 (^1275-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q430782-P26-Q3007367-9502d33f-0-P580-5b468d (^1555-02-07T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q443876-P26-Q453771-bbc80f51-0-P580-84a26a (^1446-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q563792-P26-Q4958342-a85e5b57-0-P580-acfb1b (^1391-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q588852-P26-Q58514-55e81240-0-P580-ae0480 (^1514-10-09T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q651948-P26-Q9165680-bf5d7e43-0-P580-a08da9 (^1396-03-06T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q679083-P26-Q9150575-c56910ae-0-P580-5d5db4 (^1320-00-00T00:00:00Z/9)\n", - "\n", - "*** Sitelink collision #1 detected for Q929200-wikipedia_sitelink-7842a9 (http://vi.wikipedia.org/wiki/Ilicura_militaris)\n", - "\n", - "*** Qualifier collision #1 detected for Q936976-P26-Q220845-281a5972-0-P580-5eeb19 (^1572-08-18T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q1166728-P26-Q1494018-db61e006-0-P580-3550f9 (^1285-00-00T00:00:00Z/9)\n", - 
"The description collector called 12000000 times: 0 nrows, 1675439151 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q1494018-P26-Q1166728-5c17988d-0-P580-3550f9 (^1285-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q1524640-P26-Q166853-40fa3891-0-P580-515f76 (^1375-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q2465095-P26-Q1167368-4ffb7291-0-P580-e64863 (^1257-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q2834394-P39-Q84701409-f487718d-0-P580-ac0fb1 (^1154-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q2834394-P39-Q84701409-f487718d-0-P582-35fc60 (^1173-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q2844574-P793-Q2238935-db1dea90-0-P585-ab2ece (^1350-00-00T00:00:00Z/7)\n", - "\n", - "*** Alias collision #1 detected for Q4493910-alias-ru-70f749 ('Фёдоров А. В.'@ru)\n", - "\n", - "*** Sitelink collision #1 detected for Q4608595-wikipedia_sitelink-4a0154 (http://.wikipedia.org/wiki/Template:Documentation)\n", - "\n", - "*** Sitelink collision #1 detected for Q5611978-wikipedia_sitelink-3b808e (http://.wikipedia.org/wiki/Template:Welcome)\n", - "\n", - "*** Qualifier collision #1 detected for Q6129540-P106-Q25393460-4c72cbac-0-P580-9eefc6 (^1552-07-17T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q6426831-wikipedia_sitelink-0d77ef (http://.wikipedia.org/wiki/Template:Edit_filter_warning)\n", - "\n", - "*** Sitelink collision #1 detected for Q7605021-wikipedia_sitelink-3a136d (http://.wikipedia.org/wiki/Template:Comment)\n", - "The sitelink collector called 5500000 times: 0 nrows, 232340004 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q15098140-wikipedia_sitelink-bfa017 (http://.wikipedia.org/wiki/Module:Yesno)\n", - "\n", - "*** Sitelink collision #1 detected for 
Q15117218-wikipedia_sitelink-4b5db5 (http://.wikipedia.org/wiki/Module:Category_handler)\n", - "\n", - "*** Sitelink collision #1 detected for Q15506579-wikipedia_sitelink-c363ea (http://.wikipedia.org/wiki/Module:Documentation/config)\n", - "\n", - "*** Sitelink collision #1 detected for Q8244473-wikipedia_sitelink-0e32ac (http://.wikipedia.org/wiki/Module:InfoboxImage)\n", - "The node collector called 12500000 times: 62500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 12500000 times: 0 nrows, 1530628094 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q20962109-wikipedia_sitelink-0f5e03 (http://.wikipedia.org/wiki/Module:ISOdate)\n", - "\n", - "*** Sitelink collision #1 detected for Q22910717-wikipedia_sitelink-4da401 (http://.wikipedia.org/wiki/Template:Sandbox_other)\n", - "The description collector called 12500000 times: 0 nrows, 1757625754 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q25714577-wikipedia_sitelink-5c85f4 (http://.wikipedia.org/wiki/Module:WikidataIB)\n", - "\n", - "*** Qualifier collision #1 detected for Q26877285-P26-Q828710-08b99587-0-P580-c30f0a (^1566-02-16T00:00:00Z/11)\n", - "The qual collector called 10500000 times: 0 nrows, 0 erows, 200088433 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 13000000 times: 65000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 13000000 times: 0 nrows, 1599577883 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 13000000 times: 0 nrows, 1879410082 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "5500000 lines processed by processor 0\n", - "5500000 lines processed by processor 11\n", - "5500000 lines processed by processor 7\n", - "5500000 lines processed by processor 8\n", - "5500000 lines processed by processor 10\n", - "5500000 
lines processed by processor 5\n", - "5500000 lines processed by processor 9\n", - "5500000 lines processed by processor 2\n", - "5500000 lines processed by processor 4\n", - "5500000 lines processed by processor 3\n", - "5500000 lines processed by processor 6\n", - "5500000 lines processed by processor 1\n", - "The qual collector called 11000000 times: 0 nrows, 0 erows, 210987903 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q61140016-P26-Q44192051-99f17a00-0-P580-7a76d2 (^1575-01-30T00:00:00Z/11)\n", - "The sitelink collector called 6000000 times: 0 nrows, 243922942 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 13500000 times: 67500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 13500000 times: 0 nrows, 1654081431 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 13500000 times: 0 nrows, 1939040832 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q75390802-P26-Q6135465-d640c900-0-P580-91ce5e (^1422-10-20T00:00:00Z/11)\n", - "The qual collector called 11500000 times: 0 nrows, 0 erows, 220410486 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 14000000 times: 70000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 14000000 times: 0 nrows, 1703202908 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 14000000 times: 0 nrows, 1972258264 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "6000000 lines processed by processor 11\n", - "6000000 lines processed by processor 0\n", - "6000000 lines processed by processor 7\n", - "6000000 lines processed by processor 8\n", - "6000000 lines processed by processor 10\n", - "6000000 lines processed by processor 5\n", - "6000000 lines processed by processor 9\n", - "6000000 lines processed by 
processor 4\n", - "6000000 lines processed by processor 2\n", - "6000000 lines processed by processor 3\n", - "6000000 lines processed by processor 6\n", - "6000000 lines processed by processor 1\n", - "The node collector called 14500000 times: 72500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 14500000 times: 0 nrows, 1766161278 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The qual collector called 12000000 times: 0 nrows, 0 erows, 231223257 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q15193-P26-Q168664-80106bb4-0-P580-a310ca (^1793-10-09T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q53459-P26-Q93408-6fe4810b-0-P580-df7c8b (^1454-02-10T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q56226-wikipedia_sitelink-ff2d61 (http://sl.wikipedia.org/wiki/Kim_Džong-un)\n", - "\n", - "*** Qualifier collision #1 detected for Q58514-P26-Q236220-01ae7b47-0-P580-766cef (^1476-09-08T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q58514-P26-Q201143-bc6f20e4-0-P580-5a4f65 (^1499-01-08T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q58514-P26-Q588852-cdd0895d-0-P580-ae0480 (^1514-10-09T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q69620-P26-Q120365-10ec8d2b-0-P580-05429a (^1116-07-13T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q91003-P26-Q72789-6b43d0ab-0-P580-4be813 (^1150-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q130969-P26-Q229189-a7f573de-0-P580-aacd85 (^1284-08-16T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q131412-P26-Q132548-dcd19f44-0-P580-3d6c00 (^1558-04-24T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q157789-P26-Q260926-aa804d7f-0-P580-d435a1 (^1502-04-10T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for 
Q157789-P26-Q233790-625e10c2-0-P580-4cdfa7 (^1518-10-09T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q160165-P26-Q63494-d339ca47-0-P580-b1f503 (^1710-11-11T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q178525-P26-Q134259-aaf86e95-0-P580-d2c4ca (^1137-07-25T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q178525-P26-Q102140-ab1e10d9-0-P580-8b694f (^1152-05-18T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q208420-wikipedia_sitelink-87d729 (http://pl.wikipedia.org/wiki/Triera)\n", - "\n", - "*** Qualifier collision #1 detected for Q231476-P26-Q161866-fa54b200-0-P580-3b4b8f (^1403-02-07T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q231476-P26-Q449008-63ea5e99-0-P580-6fd386 (^1386-09-11T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q254927-P26-Q367001-9bafa4c7-0-P580-a91848 (^1112-02-03T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q259907-P26-Q7731-5a5cc2ce-0-P580-8d5052 (^1648-01-26T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q261847-P26-Q767582-bc9963e7-0-P580-5c7f0d (^1456-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q266309-P26-Q510987-645ea879-0-P580-5513d1 (^1272-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q283548-P26-Q450768-eb0b2e92-0-P580-26c3cf (^1572-08-17T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q397733-wikipedia_sitelink-d5cb47 (http://ar.wikipedia.org/wiki/سلوني)\n", - "\n", - "*** Qualifier collision #1 detected for Q505918-P26-Q274025-fb8c2108-0-P580-e7b518 (^1545-02-15T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q540767-P26-Q74019-da034a0b-0-P580-5774e5 (^1422-07-25T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q702209-P26-Q454810-36c82016-0-P580-e60df9 (^1476-08-25T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for 
Q935411-P26-Q70789-de6fac79-0-P580-941716 (^1463-05-10T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q1360375-P26-Q4726173-be500fdb-0-P580-98b695 (^1253-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q2318556-P26-Q317621-b5da07f6-0-P580-a8e531 (^1575-06-14T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q3721525-P26-Q571632-481d6ba3-0-P580-10eca4 (^1436-02-12T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q3904375-P26-Q57231616-7c1080b1-0-P580-df5c65 (^1281-00-00T00:00:00Z/9)\n", - "\n", - "*** Sitelink collision #1 detected for Q4048908-wikipedia_sitelink-0836ac (http://.wikipedia.org/wiki/Category:Hidden_categories)\n", - "\n", - "*** Sitelink collision #1 detected for Q5324375-wikipedia_sitelink-91df00 (http://.wikipedia.org/wiki/Category:Maintenance)\n", - "\n", - "*** Sitelink collision #1 detected for Q5626735-wikipedia_sitelink-7430af (http://.wikipedia.org/wiki/Template:Infobox)\n", - "\n", - "*** Qualifier collision #1 detected for Q6135465-P26-Q75390802-f0852539-0-P580-91ce5e (^1422-10-20T00:00:00Z/11)\n", - "The description collector called 14500000 times: 0 nrows, 2017690517 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q6469914-P26-Q4497270-0dd2e8d0-0-P580-bd95e9 (^1511-08-28T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q8213590-wikipedia_sitelink-5ee1a4 (http://.wikipedia.org/wiki/Template:Sister_project)\n", - "\n", - "*** Qualifier collision #1 detected for Q9264442-P159-Q270-adc8754c-0-P625-bdfc28 (@52.228472/21.013139)\n", - "The sitelink collector called 6500000 times: 0 nrows, 271891952 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q12118776-wikipedia_sitelink-2380bc (http://tt.wikipedia.org/wiki/Луминица_(Констанца))\n", - "\n", - "*** Sitelink collision #1 detected for Q15408619-wikipedia_sitelink-e2b771 
(http://.wikipedia.org/wiki/Module:TableTools)\n", - "\n", - "*** Sitelink collision #1 detected for Q13107716-wikipedia_sitelink-595446 (http://.wikipedia.org/wiki/Module:Infobox)\n", - "\n", - "*** Sitelink collision #1 detected for Q15818852-wikipedia_sitelink-659cea (http://.wikipedia.org/wiki/Template:Section_resolved)\n", - "\n", - "*** Qualifier collision #1 detected for Q16566720-P26-Q319870-602b0c96-0-P580-ee5a8a (^1572-09-08T00:00:00Z/11)\n", - "\n", - "*** Sitelink collision #1 detected for Q16830095-wikipedia_sitelink-064884 (http://.wikipedia.org/wiki/Module:Check_for_unknown_parameters)\n", - "\n", - "*** Sitelink collision #1 detected for Q17347215-wikipedia_sitelink-aada88 (http://.wikipedia.org/wiki/Module:Category_handler/data)\n", - "\n", - "*** Sitelink collision #1 detected for Q18123834-wikipedia_sitelink-e4e0f9 (http://.wikipedia.org/wiki/Template:Mono)\n", - "\n", - "*** Sitelink collision #1 detected for Q18338361-wikipedia_sitelink-a89d12 (http://.wikipedia.org/wiki/Category:Pages_using_duplicate_arguments_in_template_calls)\n", - "The node collector called 15000000 times: 75000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 15000000 times: 0 nrows, 1833984215 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q23769960-P1416-Q26222380-85afc868-0-P580-896ff2 (^1999-12-01T00:00:00Z/11)\n", - "The description collector called 15000000 times: 0 nrows, 2108547178 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The qual collector called 12500000 times: 0 nrows, 0 erows, 238549976 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 15500000 times: 77500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 15500000 times: 0 nrows, 1905410846 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "6500000 lines processed by processor 11\n", - "6500000 lines processed by processor 
7\n", - "6500000 lines processed by processor 0\n", - "6500000 lines processed by processor 8\n", - "6500000 lines processed by processor 5\n", - "6500000 lines processed by processor 10\n", - "6500000 lines processed by processor 9\n", - "6500000 lines processed by processor 4\n", - "6500000 lines processed by processor 2\n", - "6500000 lines processed by processor 3\n", - "6500000 lines processed by processor 6\n", - "6500000 lines processed by processor 1\n", - "The description collector called 15500000 times: 0 nrows, 2236189283 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The sitelink collector called 7000000 times: 0 nrows, 284558497 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q56528363-wikipedia_sitelink-12db12 (http://.wikipedia.org/wiki/Module:DateI18n)\n", - "The qual collector called 13000000 times: 0 nrows, 0 erows, 249491175 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q64944768-wikipedia_sitelink-f8a890 (http://.wikipedia.org/wiki/Template:Portal_navigation)\n", - "The node collector called 16000000 times: 80000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 16000000 times: 0 nrows, 1956689637 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 16000000 times: 0 nrows, 2275342115 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q75273844-P26-Q75273846-b5720745-0-P580-3f9e86 (^1468-07-08T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q75389849-P26-Q13972091-b1b4193f-0-P1319-532ed8 (^1509-07-04T00:00:00Z/11)\n", - "The qual collector called 13500000 times: 0 nrows, 0 erows, 258909784 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 16500000 times: 82500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 16500000 
times: 0 nrows, 2011778907 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 16500000 times: 0 nrows, 2312084116 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "7000000 lines processed by processor 11\n", - "7000000 lines processed by processor 0\n", - "7000000 lines processed by processor 7\n", - "7000000 lines processed by processor 8\n", - "7000000 lines processed by processor 5\n", - "7000000 lines processed by processor 10\n", - "7000000 lines processed by processor 4\n", - "7000000 lines processed by processor 9\n", - "7000000 lines processed by processor 2\n", - "7000000 lines processed by processor 3\n", - "\n", - "*** Sitelink collision #1 detected for Q105429923-wikipedia_sitelink-61eaae (http://.wikipedia.org/wiki/Special:RecentChanges)\n", - "7000000 lines processed by processor 6\n", - "7000000 lines processed by processor 1\n", - "The qual collector called 14000000 times: 0 nrows, 0 erows, 269782940 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q42305-P26-Q229807-78d461dc-0-P580-34f596 (^1191-05-12T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q51089-P26-Q378756-91878f66-0-P580-78aa9d (^1350-09-27T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q64222-P26-Q969770-fd31fc8c-0-P580-684a59 (^1433-11-12T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q68567-P26-Q53441-de9c32df-0-P580-4d0846 (^1115-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q68952-P26-Q327572-72aafaf0-0-P580-5906e2 (^1563-05-10T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q132548-P26-Q131412-dba1192f-0-P580-3d6c00 (^1558-04-24T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q155581-P26-Q61261-80392b57-0-P580-3550f9 (^1285-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q166853-P26-Q1524640-013a0a25-0-P580-515f76 
(^1375-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q234549-P26-Q154998-45a333f4-0-P580-45ce34 (^1525-10-29T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q235484-P26-Q105378-affa9a7c-0-P580-e8c2d5 (^1168-02-01T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q238609-P26-Q316831-e3e16df6-0-P580-92ae06 (^1153-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q242636-P26-Q129308-cfe073cc-0-P580-c6fd17 (^1189-08-29T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q242636-P26-Q1502979-64af4f58-0-P580-366f19 (^1214-01-20T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q242636-P26-Q1381324-c86a9785-0-P580-762370 (^1217-09-00T00:00:00Z/10)\n", - "\n", - "*** Qualifier collision #1 detected for Q260926-P26-Q157789-f6d156ff-0-P580-d435a1 (^1502-04-10T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q264709-P26-Q60211-e9a2f9fc-0-P580-7f1413 (^1564-12-17T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q266025-P26-Q312110-2dbf2ebf-0-P580-189c4f (^1282-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q273307-P26-Q1772833-b61045de-0-P580-a25d98 (^1271-00-00T00:00:00Z/9)\n", - "\n", - "*** Sitelink collision #1 detected for Q282380-wikipedia_sitelink-bf04f9 (http://et.wikipedia.org/wiki/Jedwabne)\n", - "\n", - "*** Qualifier collision #1 detected for Q325041-P26-Q2309561-25e0a31e-0-P580-f872f9 (^1515-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q435726-P26-Q719626-f865d7a4-0-P580-5a49c5 (^1496-11-21T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q450971-P26-Q316831-e5799852-0-P580-5ed4f3 (^1177-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q463669-P26-Q40433-6767e474-0-P580-480b99 (^1550-00-00T00:00:00Z/9)\n", - "\n", - "*** Qualifier collision #1 detected for Q506527-P26-Q440132-e6b92d24-0-P580-2bef25 
(^1524-11-06T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q553550-P26-Q465382-752c3e78-0-P580-6fd04d (^1540-02-08T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q926335-P26-Q2419674-95719c4f-0-P580-d5e047 (^1190-00-00T00:00:00Z/9)\n", - "\n", - "*** Sitelink collision #1 detected for Q956852-wikipedia_sitelink-44d82a (http://zh-min-nan.wikipedia.org/wiki/Buffalo_(Missouri))\n", - "The node collector called 17000000 times: 85000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 17000000 times: 0 nrows, 2082839182 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q2299423-P159-Q1715-d2db66d9-0-P625-ceaec4 (@52.3704/9.7734)\n", - "\n", - "*** Qualifier collision #1 detected for Q2467970-P26-Q1070853-48f1aeab-0-P580-d71f7b (^1358-09-04T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q2517901-P106-Q25393460-25aee660-0-P580-e5e8e2 (^1558-03-05T00:00:00Z/11)\n", - "\n", - "*** Qualifier collision #1 detected for Q3997398-P159-Q101500-ee2c35e9-0-P625-79ba3e (@45.718139/9.715862)\n", - "\n", - "*** Sitelink collision #1 detected for Q5640659-wikipedia_sitelink-53102e (http://.wikipedia.org/wiki/Template:Ombox)\n", - "\n", - "*** Sitelink collision #1 detected for Q5843835-wikipedia_sitelink-6be4da (http://.wikipedia.org/wiki/Template:Fmbox)\n", - "\n", - "*** Sitelink collision #1 detected for Q7009036-wikipedia_sitelink-904fb9 (http://lij.wikipedia.org/wiki/Categorîa:Bahrain)\n", - "The sitelink collector called 7500000 times: 0 nrows, 311160195 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 17000000 times: 0 nrows, 2383647839 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q13889180-wikipedia_sitelink-61fce0 (http://ceb.wikipedia.org/wiki/Neotanais_minimus)\n", - "\n", - "*** Sitelink collision #1 detected for 
Q14357839-wikipedia_sitelink-379650 (http://.wikipedia.org/wiki/Module:Documentation)\n", - "\n", - "*** Sitelink collision #1 detected for Q15379728-wikipedia_sitelink-f395bc (http://.wikipedia.org/wiki/Module:Arguments)\n", - "\n", - "*** Sitelink collision #1 detected for Q16746551-wikipedia_sitelink-3b6af8 (http://.wikipedia.org/wiki/Template:Bulleted_list)\n", - "\n", - "*** Sitelink collision #1 detected for Q17347224-wikipedia_sitelink-98d4f5 (http://.wikipedia.org/wiki/Module:Category_handler/shared)\n", - "\n", - "*** Sitelink collision #1 detected for Q18577165-wikipedia_sitelink-79293a (http://.wikipedia.org/wiki/Template:Translatable_template)\n", - "\n", - "*** Sitelink collision #1 detected for Q18577187-wikipedia_sitelink-8b452d (http://.wikipedia.org/wiki/Template:Translatable_template_name)\n", - "The node collector called 17500000 times: 87500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 17500000 times: 0 nrows, 2136259786 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Sitelink collision #1 detected for Q26905045-wikipedia_sitelink-969685 (http://.wikipedia.org/wiki/Module:Complex_date)\n", - "\n", - "*** Sitelink collision #1 detected for Q28132212-wikipedia_sitelink-62565d (http://.wikipedia.org/wiki/Module:TNT)\n", - "\n", - "*** Qualifier collision #1 detected for Q31191558-P26-Q61139836-6de8c7ca-0-P580-70d445 (^1577-02-18T00:00:00Z/11)\n", - "The qual collector called 14500000 times: 0 nrows, 0 erows, 277058427 qrows, 0 invalid erows, 0 invalid qrows\n", - "The description collector called 17500000 times: 0 nrows, 2471569069 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "7500000 lines processed by processor 11\n", - "7500000 lines processed by processor 0\n", - "7500000 lines processed by processor 8\n", - "7500000 lines processed by processor 7\n", - "7500000 lines processed by processor 10\n", - "7500000 lines processed by processor 5\n", - "The node collector 
called 18000000 times: 90000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 18000000 times: 0 nrows, 2210319612 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "7500000 lines processed by processor 9\n", - "7500000 lines processed by processor 4\n", - "7500000 lines processed by processor 2\n", - "7500000 lines processed by processor 6\n", - "7500000 lines processed by processor 3\n", - "The sitelink collector called 8000000 times: 0 nrows, 324984687 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "7500000 lines processed by processor 1\n", - "\n", - "*** Qualifier collision #1 detected for Q54882974-P26-Q26205746-cdf8483f-0-P580-d027aa (^1549-02-16T00:00:00Z/11)\n", - "The description collector called 18000000 times: 0 nrows, 2585990048 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The qual collector called 15000000 times: 0 nrows, 0 erows, 287910963 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 18500000 times: 92500000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 18500000 times: 0 nrows, 2259350521 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "\n", - "*** Qualifier collision #1 detected for Q75273846-P26-Q75273844-b3554203-0-P580-3f9e86 (^1468-07-08T00:00:00Z/11)\n", - "The description collector called 18500000 times: 0 nrows, 2615091276 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The qual collector called 15500000 times: 0 nrows, 0 erows, 297353277 qrows, 0 invalid erows, 0 invalid qrows\n", - "The node collector called 19000000 times: 95000000 nrows, 0 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "The edge collector called 19000000 times: 0 nrows, 2320200322 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "8000000 lines processed by processor 11\n", - "8000000 lines processed by processor 0\n", - "8000000 lines processed by processor 10\n", - "8000000 lines processed by processor 
5\n", - "8000000 lines processed by processor 7\n", - "8000000 lines processed by processor 8\n", - "8000000 lines processed by processor 9\n", - "8000000 lines processed by processor 4\n", - "8000000 lines processed by processor 2\n", - "The description collector called 19000000 times: 0 nrows, 2654687085 erows, 0 qrows, 0 invalid erows, 0 invalid qrows\n", - "8000000 lines processed by processor 6\n", - "8000000 lines processed by processor 3\n", - "8000000 lines processed by processor 1\n", - "The qual collector called 16000000 times: 0 nrows, 0 erows, 308264519 qrows, 0 invalid erows, 0 invalid qrows\n", - "Done processing /data/amandeep/wikidata-20220505/latest-all.json.bz2\n", - "Telling the workers to shut down.\n", - "Exiting worker process 4 (pid 118151).\n", - "Exiting worker process 0 (pid 118147).\n", - "Exiting worker process 10 (pid 118157).\n", - "Exiting worker process 8 (pid 118155).\n", - "Waiting for the workers to shut down.\n", - "Exiting worker process 6 (pid 118153).\n", - "Exiting worker process 1 (pid 118148).\n", - "Exiting worker process 5 (pid 118152).\n", - "Exiting worker process 9 (pid 118156).\n", - "Exiting worker process 11 (pid 118158).\n", - "Exiting worker process 3 (pid 118150).\n", - "Exiting worker process 7 (pid 118154).\n", - "Exiting worker process 2 (pid 118149).\n", - "Worker shut down is complete.\n", - "Telling the node collector to shut down.\n", - "Waiting for the node collector to shut down.\n", - "Exiting the node collector (pid 118140).\n", - "The node collector has closed its output files.\n", - "Node collector shut down is complete.\n", - "Telling the edge collector to shut down.\n", - "Waiting for the edge collector to shut down.\n", - "Exiting the edge collector (pid 118141).\n", - "The edge collector has closed its output files.\n", - "Edge collector shut down is complete.\n", - "Telling the qual collector to shut down.\n", - "Waiting for the qual collector to shut down.\n", - "Exiting the qual collector (pid 
118142).\n", - "The qual collector has closed its output files.\n", - "Qual collector shut down is complete.\n", - "Telling the invalid edge collector to shut down.\n", - "Waiting for the invalid edge collector to shut down.\n", - "Exiting the invalid edge collector (pid 118143).\n", - "The invalid edge collector has closed its output files.\n", - "Invalid edge collector shut down is complete.\n", - "Telling the invalid qual collector to shut down.\n", - "Waiting for the invalid qual collector to shut down.\n", - "Exiting the invalid qual collector (pid 118144).\n", - "The invalid qual collector has closed its output files.\n", - "Invalid qual collector shut down is complete.\n", - "Telling the description collector to shut down.\n", - "Waiting for the description collector to shut down.\n", - "Exiting the description collector (pid 118145).\n", - "The description collector has closed its output files.\n", - "Description collector shut down is complete.\n", - "Telling the sitelink collector to shut down.\n", - "Waiting for the sitelink collector to shut down.\n", - "Exiting the sitelink collector (pid 118146).\n", - "The sitelink collector has closed its output files.\n", - "Sitelink collector shut down is complete.\n", - "import complete\n", - "time taken : 45465.164197444916s\n", - "Timing: elapsed=12:37:48.283348 CPU=10:19:31.139152 ( 81.8%): import-wikidata -i /data/amandeep/wikidata-20220505/latest-all.json.bz2 --node-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.node.unsorted.tsv.gz --minimal-edge-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz --minimal-qual-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz --invalid-edge-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz --invalid-qual-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalue.unsorted.tsv.gz --node-file-id-only --explode-values 
False --all-languages True --lang en --alias-edges True --split-alias-file /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz --split-en-alias-file /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.en.unsorted.tsv.gz --description-edges True --split-description-file /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz --split-en-description-file /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.en.unsorted.tsv.gz --label-edges True --split-label-file /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz --split-en-label-file /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.en.unsorted.tsv.gz --datatype-edges True --split-datatype-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz --entry-type-edges True --split-type-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz --sitelink-edges True --sitelink-verbose-edges True --split-sitelink-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz --split-en-sitelink-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz --value-hash-width 6 --claim-id-hash-width 8 --use-kgtkwriter True --use-mgzip-for-input False --use-mgzip-for-output False --use-shm True --procs 12 --mapper-batch-size 5 --max-size-per-mapper-queue 3 --single-mapper-queue True --collect-results True --collect-seperately True --collector-batch-size 5 --collector-queue-per-proc-size 3 --progress-interval 500000 --clean --allow-end-of-day False --repair-month-or-day-zero --minimum-valid-year 1 --maximum-valid-year 9999 --validate-fromisoformat --repair-lax-coordinates --allow-language-suffixes --allow-wikidata-lq-strings\n" - ] - } - ], - "source": [ - "!kgtk ${KGTK_FLAGS} \\\n", - " import-wikidata \\\n", - " -i ${WIKIDATA_ALL_JSON} \\\n", - " --node-file 
${TEMP}/metadata.node.${UNSORTED_KGTK} \\\n", - " --minimal-edge-file ${TEMP}/claims.raw.${UNSORTED_KGTK} \\\n", - " --minimal-qual-file ${TEMP}/qualifiers.raw.${UNSORTED_KGTK} \\\n", - " --invalid-edge-file ${TEMP}/claims.badvalue.${UNSORTED_KGTK} \\\n", - " --invalid-qual-file ${TEMP}/qualifiers.badvalue.${UNSORTED_KGTK} \\\n", - " --node-file-id-only \\\n", - " --explode-values False \\\n", - " --all-languages True \\\n", - " --lang en \\\n", - " --alias-edges True \\\n", - " --split-alias-file ${TEMP}/aliases.${UNSORTED_KGTK} \\\n", - " --split-en-alias-file ${TEMP}/aliases.en.${UNSORTED_KGTK} \\\n", - " --description-edges True \\\n", - " --split-description-file ${TEMP}/descriptions.${UNSORTED_KGTK} \\\n", - " --split-en-description-file ${TEMP}/descriptions.en.${UNSORTED_KGTK} \\\n", - " --label-edges True \\\n", - " --split-label-file ${TEMP}/labels.${UNSORTED_KGTK} \\\n", - " --split-en-label-file ${TEMP}/labels.en.${UNSORTED_KGTK} \\\n", - " --datatype-edges True \\\n", - " --split-datatype-file ${TEMP}/metadata.property.datatypes.${UNSORTED_KGTK} \\\n", - " --entry-type-edges True \\\n", - " --split-type-file ${TEMP}/metadata.types.${UNSORTED_KGTK} \\\n", - " --sitelink-edges True \\\n", - " --sitelink-verbose-edges True \\\n", - " --split-sitelink-file ${TEMP}/sitelinks.raw.${UNSORTED_KGTK} \\\n", - " --split-en-sitelink-file ${TEMP}/sitelinks.en.raw.${UNSORTED_KGTK} \\\n", - " --value-hash-width 6 \\\n", - " --claim-id-hash-width 8 \\\n", - " --use-kgtkwriter True \\\n", - " --use-mgzip-for-input False \\\n", - " --use-mgzip-for-output False \\\n", - " --use-shm True \\\n", - " --procs 12 \\\n", - " --mapper-batch-size 5 \\\n", - " --max-size-per-mapper-queue 3 \\\n", - " --single-mapper-queue True \\\n", - " --collect-results True \\\n", - " --collect-seperately True\\\n", - " --collector-batch-size 5 \\\n", - " --collector-queue-per-proc-size 3 \\\n", - " --progress-interval 500000 \\\n", - " --clean \\\n", - " --allow-end-of-day False \\\n", - " 
--repair-month-or-day-zero \\\n", - " --minimum-valid-year 1 \\\n", - " --maximum-valid-year 9999 \\\n", - " --validate-fromisoformat \\\n", - " --repair-lax-coordinates \\\n", - " --allow-language-suffixes \\\n", - " --allow-wikidata-lq-strings \\\n", - " | tee ${TEMP}/import-split-wikidata.log\n" - ] - }, - { - "cell_type": "markdown", - "id": "3d4a8e15-8826-4d50-83c9-9a17346eb206", - "metadata": {}, - "source": [ - "## Split `somevalue` and `novalue` from `claims.raw.unsorted.tsv.gz`" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "6aad9a3c-c27b-4858-802b-b633c40dbb5d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", - "Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz\n", - "File_path.suffix: .gz\n", - 
"KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the reject file: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Applying a dispatched multiple-output object filter\n", - "Read 1362524112 rows, rejected 1361968102 rows, wrote 556010 rows.\n", - "Closing output files.\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "All output files have been closed.\n", - "Timing: elapsed=1:21:07.331541 CPU=4:34:39.248156 (338.6%): filter --verbose --use-mgzip TRUE --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.raw.unsorted.tsv.gz --first-match-only --pattern ;; novalue -o /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz --pattern ;; somevalue -o /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz --reject-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz\n" - ] - } - ], - "source": [ - "!kgtk ${KGTK_FLAGS} \\\n", - " filter ${VERBOSE} --use-mgzip ${USE_MGZIP} \\\n", - " --input-file ${TEMP}/claims.raw.${UNSORTED_KGTK} \\\n", - " --first-match-only \\\n", - " --pattern \";; novalue\" -o 
${TEMP}/claims.novalue.${UNSORTED_KGTK} \\\n", - " --pattern \";; somevalue\" -o ${TEMP}/claims.somevalue.${UNSORTED_KGTK} \\\n", - " --reject-file ${TEMP}/claims.${UNSORTED_KGTK} \\\n", - " | tee ${TEMP}/split-claims-missing-values.log" - ] - }, - { - "cell_type": "markdown", - "id": "d7d9a090-de83-4e9f-84d9-93fb49f486b8", - "metadata": {}, - "source": [ - " ## Split `somevalue` and `novalue` from `qualifiers.raw.tsv.gz`" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "7538ad9d-018a-45ea-95f8-46467e68affe", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: -\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", - "Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: -\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '-' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading stdin\n", - "Graph cache 
'/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '-' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading stdin\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: -\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '-' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading stdin\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalue.unsorted.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalue.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalue.unsorted.tsv.gz\n", - 
"File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalue.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Opening the reject file: -\n", - "KgtkWriter: writing stdout\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Applying a dispatched multiple-output object filter\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading gzip /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalueClaims.unsorted.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing gzip 
/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalueClaims.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Opening the reject file: -\n", - "KgtkWriter: writing stdout\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing by cacheing the filter file's key set.\n", - "Building the filter key set from /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz\n", - "There are 124282 entries in the filter key set.\n", - "Filtering records from -\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading gzip /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: 
/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalueClaims.unsorted.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing gzip /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalueClaims.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Opening the reject file: -\n", - "KgtkWriter: writing stdout\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing by cacheing the filter file's key set.\n", - "Building the filter key set from /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz\n", - "There are 431728 entries in the filter key set.\n", - "Filtering records from -\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading gzip /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the 
fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalueClaims.unsorted.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing gzip /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalueClaims.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Opening the reject file: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing gzip /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing by cacheing the filter file's key set.\n", - "Building the filter key set from /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz\n", - "There are 50133 entries in the filter key set.\n", - "Filtering records from -\n", - "Read 308638139 rows, rejected 308457119 rows, wrote 181020 rows.\n", - "Closing output files.\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: not closing standard output\n", - "All output files have been closed.\n", - "Timing: elapsed=1:10:16.268229 CPU=0:19:02.118320 ( 27.1%): filter --verbose --use-mgzip TRUE --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz --first-match-only --pattern ;; novalue -o /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalue.unsorted.tsv.gz --pattern ;; somevalue -o /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalue.unsorted.tsv.gz --reject-file -\n", - "Read 308457119 input records, accepted 39972 records, rejected 308417147 records.\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: not closing standard output\n", - "Timing: elapsed=1:10:16.404495 CPU=0:16:26.828742 ( 23.4%): ifexists 
--verbose --input-keys node1 --filter-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz --filter-keys id --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalueClaims.unsorted.tsv.gz --reject-file -\n", - "Read 308417147 input records, accepted 368548 records, rejected 308048599 records.\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: not closing standard output\n", - "Timing: elapsed=1:10:16.610469 CPU=0:16:06.036942 ( 22.9%): ifexists --verbose --input-keys node1 --filter-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz --filter-keys id --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalueClaims.unsorted.tsv.gz --reject-file -\n", - "Read 308048599 input records, accepted 14219 records, rejected 308034380 records.\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=1:10:16.726708 CPU=1:09:22.133053 ( 98.7%): ifexists --verbose --input-keys node1 --filter-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz --filter-keys id --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalueClaims.unsorted.tsv.gz --reject-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n", - "Timing: elapsed=1:10:29.856671 CPU=0:00:07.875724 ( 0.2%): filter --verbose --use-mgzip TRUE --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.raw.unsorted.tsv.gz --first-match-only --pattern ;; novalue -o /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalue.unsorted.tsv.gz --pattern ;; somevalue -o /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalue.unsorted.tsv.gz --reject-file - / ifexists --verbose --input-keys node1 --filter-file 
/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz --filter-keys id --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalueClaims.unsorted.tsv.gz --reject-file - / ifexists --verbose --input-keys node1 --filter-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz --filter-keys id --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalueClaims.unsorted.tsv.gz --reject-file - / ifexists --verbose --input-keys node1 --filter-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz --filter-keys id --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalueClaims.unsorted.tsv.gz --reject-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n" - ] - } - ], - "source": [ - "!kgtk ${KGTK_FLAGS} \\\n", - " filter ${VERBOSE} --use-mgzip ${USE_MGZIP} \\\n", - " --input-file ${TEMP}/qualifiers.raw.${UNSORTED_KGTK} \\\n", - " --first-match-only \\\n", - " --pattern \";; novalue\" -o ${TEMP}/qualifiers.novalue.${UNSORTED_KGTK} \\\n", - " --pattern \";; somevalue\" -o ${TEMP}/qualifiers.somevalue.${UNSORTED_KGTK} \\\n", - " --reject-file - \\\n", - " / ifexists ${VERBOSE} \\\n", - " --input-keys node1 \\\n", - " --filter-file ${TEMP}/claims.novalue.${UNSORTED_KGTK} \\\n", - " --filter-keys id \\\n", - " --output-file ${TEMP}/qualifiers.novalueClaims.${UNSORTED_KGTK} \\\n", - " --reject-file - \\\n", - " / ifexists ${VERBOSE} \\\n", - " --input-keys node1 \\\n", - " --filter-file ${TEMP}/claims.somevalue.${UNSORTED_KGTK} \\\n", - " --filter-keys id \\\n", - " --output-file ${TEMP}/qualifiers.somevalueClaims.${UNSORTED_KGTK} \\\n", - " --reject-file - \\\n", - " / ifexists ${VERBOSE} \\\n", - " --input-keys node1 \\\n", - " --filter-file ${TEMP}/claims.badvalue.${UNSORTED_KGTK} \\\n", - " --filter-keys id \\\n", - " --output-file 
${TEMP}/qualifiers.badvalueClaims.${UNSORTED_KGTK} \\\n", - " --reject-file ${TEMP}/qualifiers.${UNSORTED_KGTK} \\\n", - " | tee ${TEMP}/split-qualifiers-missing-values.log" - ] - }, - { - "cell_type": "markdown", - "id": "d6f67ef6-4570-4786-b41f-a1355fb63981", - "metadata": {}, - "source": [ - "## Split `sitelinks.raw.unsorted.tsv.gz`" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "261a84b8-c671-4134-afaa-12af4c4a7762", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", - "Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tlang\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz\n", - "header: 
id\tnode1\tlabel\tnode2\tlang\n", - "Opening the reject file: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tlang\n", - "Applying a single general filter\n", - "Read 328865880 rows, rejected 82326179 rows, wrote 246539701 rows.\n", - "Keep counts: subject=0, predicate=246539701, object=0.\n", - "Reject counts: subject=0, predicate=82326179, object=0.\n", - "Closing output files.\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "All output files have been closed.\n", - "Timing: elapsed=0:23:09.677401 CPU=1:32:45.933341 (400.5%): filter --verbose --use-mgzip=TRUE --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.raw.unsorted.tsv.gz --pattern ; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ; --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz --reject-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz\n" - ] - } - ], - "source": [ - "!kgtk ${KGTK_FLAGS} \\\n", - " filter ${VERBOSE} --use-mgzip=${USE_MGZIP} \\\n", - " --input-file ${TEMP}/sitelinks.raw.${UNSORTED_KGTK} \\\n", - " --pattern \"; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;\" \\\n", - " --output-file ${TEMP}/sitelinks.qualifiers.${UNSORTED_KGTK} \\\n", - " --reject-file ${TEMP}/sitelinks.${UNSORTED_KGTK} \\\n", - " | tee ${TEMP}/split-sitelink-qualifiers.log" - ] - }, - { - "cell_type": "markdown", - "id": "e7d55217-89c7-42e9-b01b-aaf3e0426234", - "metadata": {}, - "source": [ - "## Split `sitelinks.en.raw.unsorted.tsv.gz`" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "9bce9d69-6032-4e2f-aa04-12da7998d508", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", - "Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.qualifiers.unsorted.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.qualifiers.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\n", - "Opening the reject file: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.unsorted.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\n", - "Applying a single general filter\n", - "Read 55048589 rows, rejected 13745591 rows, wrote 41302998 rows.\n", - "Keep counts: subject=0, predicate=41302998, 
object=0.\n", - "Reject counts: subject=0, predicate=13745591, object=0.\n", - "Closing output files.\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "All output files have been closed.\n", - "Timing: elapsed=0:04:06.637752 CPU=0:16:28.598107 (400.8%): filter --verbose --use-mgzip=TRUE --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.raw.unsorted.tsv.gz --pattern ; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ; --output-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.qualifiers.unsorted.tsv.gz --reject-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.unsorted.tsv.gz\n" - ] - } - ], - "source": [ - "!kgtk ${KGTK_FLAGS} \\\n", - " filter ${VERBOSE} --use-mgzip=${USE_MGZIP} \\\n", - " --input-file ${TEMP}/sitelinks.en.raw.${UNSORTED_KGTK} \\\n", - " --pattern \"; sitelink-badge,sitelink-language,sitelink-site,sitelink-title ;\" \\\n", - " --output-file ${TEMP}/sitelinks.en.qualifiers.${UNSORTED_KGTK} \\\n", - " --reject-file ${TEMP}/sitelinks.en.${UNSORTED_KGTK} \\\n", - " | tee ${TEMP}/split-sitelink-en-qualifiers.log" - ] - }, - { - "cell_type": "markdown", - "id": "9275fecc-98db-435c-863a-7f4d780f64c9", - "metadata": {}, - "source": [ - "## Sort the files from `TEMP` to `OUT` folder" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "90b70419-1894-4dcb-954d-5b83d4c80d48", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sort the claims file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > 
'/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz'\n", - "Monitoring the cat command (pid=175774).\n", - "Running the sort script (pid=175778).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 rank node2;wikidatatype\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Monitoring the sort command (pid=175786)\n", - "Cleanup.\n", - "Timing: elapsed=0:38:27.177785 CPU=0:00:06.266422 ( 0.3%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz 
--gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the claims.badvalue file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.badvalue.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.badvalue.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.badvalue.tsv.gz'\n", - "Monitoring the cat command (pid=180475).\n", - "Running the sort script (pid=180479).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast 
path.\n", - "KGTK header: id node1 label node2 rank node2;wikidatatype\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Exception looking for sort command: \n", - "\n", - " RAN: /usr/bin/pgrep -g 180479 --newest sort\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "\n", - "Cleanup.\n", - "Timing: elapsed=0:00:06.927161 CPU=0:00:06.170400 ( 89.1%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.badvalue.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.badvalue.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the claims.novalue file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.novalue.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.novalue.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.novalue.tsv.gz'\n", - "Monitoring the cat command (pid=180571).\n", - "Running the sort 
script (pid=180575).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 rank node2;wikidatatype\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Exception looking for sort command: \n", - "\n", - " RAN: /usr/bin/pgrep -g 180575 --newest sort\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "\n", - "Cleanup.\n", - "Timing: elapsed=0:00:02.236354 CPU=0:00:05.542310 (247.8%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.novalue.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.novalue.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the claims.somevalue file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.somevalue.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf 
\"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.somevalue.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.somevalue.tsv.gz'\n", - "Monitoring the cat command (pid=180673).\n", - "Running the sort script (pid=180677).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 rank node2;wikidatatype\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Exception looking for sort command: \n", - "\n", - " RAN: /usr/bin/pgrep -g 180677 --newest sort\n", - "\n", - " STDOUT:\n", - "\n", - 
"\n", - " STDERR:\n", - "\n", - "Cleanup.\n", - "Timing: elapsed=0:00:02.156390 CPU=0:00:05.279872 (244.8%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.somevalue.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.somevalue.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the qualifiers file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz'\n", - "Monitoring the cat command (pid=180779).\n", - "Running the sort script (pid=180783).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - 
"header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 node2;wikidatatype\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Monitoring the sort command (pid=180791)\n", - "Cleanup.\n", - "Timing: elapsed=0:04:15.085880 CPU=0:00:05.685372 ( 2.2%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the qualifiers.badvalue file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalue.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalue.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalue.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalue.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; 
} | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalue.tsv.gz'\n", - "Monitoring the cat command (pid=181176).\n", - "Running the sort script (pid=181180).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 node2;wikidatatype\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Exception looking for sort command: \n", - "\n", - " RAN: /usr/bin/pgrep -g 181180 --newest sort\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "\n", - "Cleanup.\n", - "Timing: elapsed=0:00:03.547380 CPU=0:00:06.041493 (170.3%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalue.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalue.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the qualifiers.badvalueClaims file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 
write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalueClaims.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalueClaims.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalueClaims.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalueClaims.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalueClaims.tsv.gz'\n", - "Monitoring the cat command (pid=181268).\n", - "Running the sort script (pid=181272).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 node2;wikidatatype\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to 
complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Exception looking for sort command: \n", - "\n", - " RAN: /usr/bin/pgrep -g 181272 --newest sort\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "\n", - "Cleanup.\n", - "Timing: elapsed=0:00:01.955742 CPU=0:00:05.388535 (275.5%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.badvalueClaims.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.badvalueClaims.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the qualifiers.novalue file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalue.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalue.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalue.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalue.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalue.tsv.gz'\n", - "Monitoring the cat command (pid=181345).\n", - "Running the sort script (pid=181349).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using 
KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 node2;wikidatatype\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Exception looking for sort command: \n", - "\n", - " RAN: /usr/bin/pgrep -g 181349 --newest sort\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "\n", - "Cleanup.\n", - "Timing: elapsed=0:00:02.107548 CPU=0:00:05.496728 (260.8%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalue.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalue.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the qualifiers.novalueClaims file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalueClaims.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' 
$options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalueClaims.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalueClaims.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalueClaims.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalueClaims.tsv.gz'\n", - "Monitoring the cat command (pid=181447).\n", - "Running the sort script (pid=181453).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 node2;wikidatatype\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Exception looking for sort command: \n", - "\n", - " RAN: /usr/bin/pgrep -g 181453 --newest sort\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "\n", - "Cleanup.\n", - "Timing: elapsed=0:00:02.208194 
CPU=0:00:05.602475 (253.7%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.novalueClaims.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.novalueClaims.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the qualifiers.somevalue file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalue.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalue.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalue.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalue.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalue.tsv.gz'\n", - "Monitoring the cat command (pid=181543).\n", - "Running the sort script (pid=181547).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: 
id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 node2;wikidatatype\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Exception looking for sort command: \n", - "\n", - " RAN: /usr/bin/pgrep -g 181547 --newest sort\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "\n", - "Cleanup.\n", - "Timing: elapsed=0:00:02.095882 CPU=0:00:05.336625 (254.6%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalue.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalue.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the qualifiers.somevalueClaims file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalueClaims.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalueClaims.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalueClaims.unsorted.tsv.gz'\n", - "full command: pigz -dc 
'/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalueClaims.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalueClaims.tsv.gz'\n", - "Monitoring the cat command (pid=181645).\n", - "Running the sort script (pid=181651).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 node2;wikidatatype\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Exception looking for sort command: \n", - "\n", - " RAN: /usr/bin/pgrep -g 181651 --newest sort\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "\n", - "Cleanup.\n", - "Timing: elapsed=0:00:02.544968 CPU=0:00:05.689815 (223.6%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.somevalueClaims.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.somevalueClaims.tsv.gz --gzip-command pigz 
--sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the aliases file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz'\n", - "Monitoring the cat command (pid=181750).\n", - "Running the sort script (pid=181754).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\tlang\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 lang\n", - "sort options: --parallel 6 
--buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Monitoring the sort command (pid=181762)\n", - "Cleanup.\n", - "Timing: elapsed=0:01:36.071250 CPU=0:00:05.613290 ( 5.8%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/aliases.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the aliases.en file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.en.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.en.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.en.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.en.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/aliases.en.tsv.gz'\n", - "Monitoring the cat command (pid=181952).\n", - "Running the sort script (pid=181956).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache 
'/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Monitoring the sort command (pid=181964)\n", - "Cleanup.\n", - "Timing: elapsed=0:00:13.705462 CPU=0:00:05.731224 ( 41.8%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.en.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/aliases.en.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the descriptions file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz' | { 
IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz'\n", - "Monitoring the cat command (pid=182059).\n", - "Running the sort script (pid=182063).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\tlang\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 lang\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Monitoring the sort command (pid=182071)\n", - "Cleanup.\n", - "Timing: elapsed=1:06:40.955939 CPU=0:00:06.018089 ( 0.2%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the descriptions.en file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip 
output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.en.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.en.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.en.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.en.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.en.tsv.gz'\n", - "Monitoring the cat command (pid=186189).\n", - "Running the sort script (pid=186193).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Monitoring the sort command (pid=186201)\n", 
- "Cleanup.\n", - "Timing: elapsed=0:00:46.518554 CPU=0:00:04.968905 ( 10.7%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.en.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/descriptions.en.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the labels file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz'\n", - "Monitoring the cat command (pid=186298).\n", - "Running the sort script (pid=186302).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\tlang\n", - "node1 column 
found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 lang\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Monitoring the sort command (pid=186310)\n", - "Cleanup.\n", - "Timing: elapsed=0:12:25.459447 CPU=0:00:05.751043 ( 0.8%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/labels.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the labels.en file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/labels.en.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/labels.en.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/labels.en.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/labels.en.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/labels.en.tsv.gz'\n", - "Monitoring the cat command (pid=187128).\n", - 
"Running the sort script (pid=187132).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Monitoring the sort command (pid=187140)\n", - "Cleanup.\n", - "Timing: elapsed=0:01:16.965917 CPU=0:00:06.148691 ( 8.0%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.en.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/labels.en.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the sitelinks file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > 
'/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.tsv.gz'\n", - "Monitoring the cat command (pid=187343).\n", - "Running the sort script (pid=187347).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\tlang\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 lang\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Monitoring the sort command (pid=187355)\n", - "Cleanup.\n", - "Timing: elapsed=0:01:27.116634 CPU=0:00:06.156457 ( 7.1%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.tsv.gz --gzip-command pigz 
--sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the sitelinks.en file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.tsv.gz'\n", - "Monitoring the cat command (pid=187715).\n", - "Running the sort script (pid=187719).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2\n", - "sort options: 
--parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Monitoring the sort command (pid=187735)\n", - "Cleanup.\n", - "Timing: elapsed=0:00:17.925203 CPU=0:00:05.605642 ( 31.3%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the sitelinks.en.qualifiers file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.qualifiers.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.qualifiers.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.qualifiers.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.qualifiers.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.qualifiers.tsv.gz'\n", - "Monitoring the cat command (pid=187877).\n", - "Running the sort script (pid=187881).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using 
KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Monitoring the sort command (pid=187889)\n", - "Cleanup.\n", - "Timing: elapsed=0:00:33.135455 CPU=0:00:05.715740 ( 17.2%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.en.qualifiers.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.en.qualifiers.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the sitelinks.qualifiers file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.qualifiers.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.qualifiers.tsv.gz'\n", - "gunzip input file: 
'/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.qualifiers.tsv.gz'\n", - "Monitoring the cat command (pid=188131).\n", - "Running the sort script (pid=188135).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\tlang\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 lang\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Monitoring the sort command (pid=188144)\n", - "Cleanup.\n", - "Timing: elapsed=0:03:19.657764 CPU=0:00:05.317021 ( 2.7%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/sitelinks.qualifiers.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 
--buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the metadata.node file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.node.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.node.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.node.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.node.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.node.tsv.gz'\n", - "Monitoring the cat command (pid=189086).\n", - "Running the sort script (pid=189090).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\n", - "node1 column not found, assuming this is a KGTK node file\n", - "KgtkReader: is_edge_file=False is_node_file=True\n", - "KgtkReader: Special columns: node1=-1 label=-1 node2=-1 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id\n", - "sort options: --parallel 6 --buffer-size 50% -T 
/data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Monitoring the sort command (pid=189098)\n", - "Cleanup.\n", - "Timing: elapsed=0:00:28.235616 CPU=0:00:05.945252 ( 21.1%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.node.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/metadata.node.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the metadata.property.datatypes file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.property.datatypes.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.property.datatypes.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.property.datatypes.tsv.gz'\n", - "Monitoring the cat command (pid=189196).\n", - "Running the sort script (pid=189200).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using 
KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Exception looking for sort command: \n", - "\n", - " RAN: /usr/bin/pgrep -g 189200 --newest sort\n", - "\n", - " STDOUT:\n", - "\n", - "\n", - " STDERR:\n", - "\n", - "Cleanup.\n", - "Timing: elapsed=0:00:02.100170 CPU=0:00:05.558517 (264.7%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/metadata.property.datatypes.tsv.gz --gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n", - "Sort the metadata.types file.\n", - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.types.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > 
'/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.types.tsv.gz'\n", - "gunzip input file: '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz'\n", - "full command: pigz -dc '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz' | { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/metadata.types.tsv.gz'\n", - "Monitoring the cat command (pid=189262).\n", - "Running the sort script (pid=189268).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "\n", - "Done reading the input file\n", - "Monitoring the sort command (pid=189274)\n", - "Cleanup.\n", - "Timing: elapsed=0:00:36.869484 CPU=0:00:05.592611 ( 15.2%): sort --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/metadata.types.tsv.gz 
--gzip-command pigz --sort-command sort --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp\n" - ] - } - ], - "source": [ - "for TARGET in WIKIDATA_IMPORT_SPLIT_FILES:\n", - " print(f\"Sort the {TARGET} file.\")\n", - " input_file = f\"{os.environ['TEMP']}/{TARGET}.{os.environ['UNSORTED_KGTK']}\"\n", - " output_file = f\"{os.environ['OUT']}/{TARGET}.{os.environ['SORTED_KGTK']}\"\n", - " logfile = f\"{os.environ['TEMP']}/{TARGET}-sorted.log\"\n", - " sort_command = f\"\"\"kgtk {os.environ['KGTK_FLAGS']} \\\n", - " sort {os.environ['VERBOSE']} \\\n", - " --input-file {input_file} \\\n", - " --output-file {output_file} \\\n", - " --gzip-command {os.environ['GZIP_CMD']} \\\n", - " --sort-command {os.environ['SORT_COMMAND']} \\\n", - " --extra '{os.environ['SORT_EXTRAS']}' | tee {logfile}\"\"\"\n", - " !$sort_command\n" - ] - }, - { - "cell_type": "markdown", - "id": "22ea12ca-5cb1-4e51-82ae-6cb733f4a555", - "metadata": {}, - "source": [ - "## Build the `all.tsv.gz file`" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "47560fa3-d87e-4840-800e-ecdf7d1d4341", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using the sort command 'sort'\n", - "header pipe: read_fd=4 write_fd=5\n", - "sort options pipe: read_fd=6 write_fd=7\n", - "gzip output file: '/data/amandeep/wikidata-20220505/import-wikidata/data/all.tsv.gz'\n", - "sort command: { IFS= read -r header ; { printf \"%s\\n\" \"$header\" >&5 ; } ; printf \"%s\\n\" \"$header\" ; IFS= read -u 6 -r options ; LC_ALL=C sort -t '\t' $options ; } | pigz - > '/data/amandeep/wikidata-20220505/import-wikidata/data/all.tsv.gz'\n", - "Running the sort script (pid=158825).\n", - "Reading the KGTK input file header line with KgtkReader\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Starting kgtkcat pid=158741\n", - 
"Opening the 9 input files.\n", - "Opening file 1: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '<4' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: reading file descriptor 4\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "The output file will be an edge file.\n", - "Mapping the 6 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz.\n", - "Opening file 2: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: 
reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Mapping the 5 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz.\n", - "Opening file 3: /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tlang\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Mapping the 5 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz.\n", - "Opening file 4: /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file 
'/data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tlang\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Mapping the 5 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz.\n", - "Opening file 5: /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tlang\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Mapping the 5 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz.\n", - "Opening file 6: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using 
KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tlang\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Mapping the 5 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz.\n", - "Opening file 7: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tlang\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Mapping the 5 column names in 
/data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz.\n", - "Opening file 8: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Mapping the 4 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz.\n", - "Opening file 9: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\n", - "node1 
column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Mapping the 4 column names in /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz.\n", - "There are 7 merged columns.\n", - "Opening the output edge file: -\n", - "KgtkWriter: writing stdout\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\tlang\n", - "Copying data from file 1: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz\n", - "Shortcut not possible: len(kr.column_names)=6 != len(kw.column_names)=7\n", - "Row by row file copy\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\tlang\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "KGTK header: id node1 label node2 rank node2;wikidatatype lang\n", - "sort options: --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp -k 1,1 -k 2,2 -k 3,3 -k 4,4\n", - "\n", - "Waiting for the sort command to complete.\n", - "\n", - "Read 1361968102 data lines from file 1: /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz\n", - "Copying data from file 2: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n", - "Shortcut not possible: len(kr.column_names)=5 != len(kw.column_names)=7\n", - "Row by row file copy with a shuffle list: 0 1 2 3 5\n", - "Read 308034380 data lines from file 2: /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz\n", - "Copying data from file 3: /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz\n", - "Shortcut not possible: len(kr.column_names)=5 != 
len(kw.column_names)=7\n", - "Row by row file copy with a shuffle list: 0 1 2 3 6\n", - "Read 170178120 data lines from file 3: /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz\n", - "Copying data from file 4: /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz\n", - "Shortcut not possible: len(kr.column_names)=5 != len(kw.column_names)=7\n", - "Row by row file copy with a shuffle list: 0 1 2 3 6\n", - "Read 2670247344 data lines from file 4: /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz\n", - "Copying data from file 5: /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz\n", - "Shortcut not possible: len(kr.column_names)=5 != len(kw.column_names)=7\n", - "Row by row file copy with a shuffle list: 0 1 2 3 6\n", - "Read 739125735 data lines from file 5: /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz\n", - "Copying data from file 6: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz\n", - "Shortcut not possible: len(kr.column_names)=5 != len(kw.column_names)=7\n", - "Row by row file copy with a shuffle list: 0 1 2 3 6\n", - "Read 82326179 data lines from file 6: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz\n", - "Copying data from file 7: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz\n", - "Shortcut not possible: len(kr.column_names)=5 != len(kw.column_names)=7\n", - "Row by row file copy with a shuffle list: 0 1 2 3 6\n", - "Read 246539701 data lines from file 7: /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz\n", - "Copying data from file 8: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz\n", - "Shortcut not possible: len(kr.column_names)=4 != len(kw.column_names)=7\n", - "Row by row file copy\n", - "Read 96951235 data lines from 
file 8: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz\n", - "Copying data from file 9: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz\n", - "Shortcut not possible: len(kr.column_names)=4 != len(kw.column_names)=7\n", - "Row by row file copy\n", - "Read 9984 data lines from file 9: /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz\n", - "Wrote 5675380780 lines total from 9 files\n", - "KgtkWriter: not closing standard output\n", - "Timing: elapsed=7:05:38.707032 CPU=6:22:33.602783 ( 89.9%): cat --verbose --use-mgzip=TRUE --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz\n", - "Cleanup.\n", - "Timing: elapsed=8:15:48.511512 CPU=0:00:06.981474 ( 0.0%): sort --verbose --gzip-command pigz --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/all.tsv.gz\n", - "Timing: elapsed=8:15:54.400740 CPU=0:00:10.688431 ( 0.0%): cat --verbose --use-mgzip=TRUE --input-file 
/data/amandeep/wikidata-20220505/import-wikidata/temp/claims.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/qualifiers.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/aliases.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/descriptions.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/labels.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/sitelinks.qualifiers.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.types.unsorted.tsv.gz --input-file /data/amandeep/wikidata-20220505/import-wikidata/temp/metadata.property.datatypes.unsorted.tsv.gz / sort --verbose --gzip-command pigz --extra --parallel 6 --buffer-size 50% -T /data/amandeep/wikidata-20220505/import-wikidata/temp --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/all.tsv.gz\n" - ] - } - ], - "source": [ - "!kgtk ${KGTK_FLAGS} \\\n", - " cat ${VERBOSE} --use-mgzip=${USE_MGZIP} \\\n", - " --input-file ${TEMP}/claims.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMP}/qualifiers.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMP}/aliases.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMP}/descriptions.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMP}/labels.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMP}/sitelinks.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMP}/sitelinks.qualifiers.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMP}/metadata.types.${UNSORTED_KGTK} \\\n", - " --input-file ${TEMP}/metadata.property.datatypes.${UNSORTED_KGTK} \\\n", - " / sort ${VERBOSE} \\\n", - " --gzip-command ${GZIP_CMD} \\\n", - " --extra \"${SORT_EXTRAS}\" \\\n", - " --output-file ${OUT}/all.${SORTED_KGTK} \\\n", - "| tee ${TEMP}/build-all-edges.log" - ] - }, - { - "cell_type": "markdown", - "id": 
"b1da3135-a165-452e-a87c-12b9c68bc55a", - "metadata": {}, - "source": [ - "## Check for unclaimed qualifiers" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "68f89d90-9094-4bff-91ee-911bd2c7773d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - 
"KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.unclaimed.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.unclaimed.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 308034380 input records, accepted 0 records, rejected 308034380 records.\n", - "Read 1361968102 filter records, 271770825 found matching input records, 1090197277 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=1:04:24.282472 CPU=1:01:51.930236 ( 96.1%): ifnotexists --verbose --use-mgzip=TRUE --presorted --input-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.tsv.gz --input-keys node1 --filter-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz --filter-keys id --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/qualifiers.unclaimed.tsv.gz\n" - ] - } - ], - "source": [ - "!kgtk ${KGTK_FLAGS} \\\n", - " ifnotexists $VERBOSE --use-mgzip=$USE_MGZIP --presorted \\\n", - " --input-file ${OUT}/qualifiers.${SORTED_KGTK} \\\n", - " --input-keys node1 \\\n", - " --filter-file ${OUT}/claims.${SORTED_KGTK} \\\n", - " --filter-keys id \\\n", - " --output-file ${OUT}/qualifiers.unclaimed.${SORTED_KGTK} \\\n", - "| tee ${TEMP}/qualifiers.unclaimed.log" - ] - }, - { - "cell_type": "markdown", - "id": "39507315-fde7-44dc-81cb-12343ca41bc0", - 
"metadata": {}, - "source": [ - "## Split edges by datatype" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "8dfffa80-b5c7-4999-96be-0611401c4ffc", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", - "Opening the input file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220505/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.commonsMedia.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.commonsMedia.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.external-id.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.external-id.tsv.gz\n", - "header: 
id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.geo-shape.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.geo-shape.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.globe-coordinate.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.globe-coordinate.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.math.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.math.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.monolingualtext.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.monolingualtext.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.musical-notation.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.musical-notation.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.quantity.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: 
/data/amandeep/wikidata-20220505/import-wikidata/data/claims.quantity.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.string.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.string.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tabular-data.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tabular-data.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.time.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.time.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.url.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.url.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-form.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-form.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-item.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: 
/data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-item.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-lexeme.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-lexeme.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-property.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-property.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the output file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-sense.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-sense.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Opening the reject file: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.other.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220505/import-wikidata/data/claims.other.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Applying a dispatched multiple-output object filter\n", - "Read 1361968102 rows, rejected 0 rows, wrote 1361968102 rows.\n", - "Closing output files.\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output 
file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "KgtkWriter: closing the output file\n", - "All output files have been closed.\n", - "Timing: elapsed=1:28:16.121467 CPU=4:51:32.610135 (330.3%): filter --verbose --input-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tsv.gz --obj node2;wikidatatype --first-match-only --pattern ;;commonsMedia --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.commonsMedia.tsv.gz --pattern ;;external-id --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.external-id.tsv.gz --pattern ;;geo-shape --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.geo-shape.tsv.gz --pattern ;;globe-coordinate --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.globe-coordinate.tsv.gz --pattern ;;math --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.math.tsv.gz --pattern ;;monolingualtext --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.monolingualtext.tsv.gz --pattern ;;musical-notation --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.musical-notation.tsv.gz --pattern ;;quantity --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.quantity.tsv.gz --pattern ;;string --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.string.tsv.gz --pattern ;;tabular-data --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.tabular-data.tsv.gz --pattern ;;time --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.time.tsv.gz --pattern ;;url --output-file 
/data/amandeep/wikidata-20220505/import-wikidata/data/claims.url.tsv.gz --pattern ;;wikibase-form --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-form.tsv.gz --pattern ;;wikibase-item --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-item.tsv.gz --pattern ;;wikibase-lexeme --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-lexeme.tsv.gz --pattern ;;wikibase-property --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-property.tsv.gz --pattern ;;wikibase-sense --output-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.wikibase-sense.tsv.gz --reject-file /data/amandeep/wikidata-20220505/import-wikidata/data/claims.other.tsv.gz --use-mgzip TRUE\n" - ] - } - ], - "source": [ - "!kgtk ${KGTK_FLAGS} \\\n", - " filter ${VERBOSE} \\\n", - " --input-file ${OUT}/claims.${SORTED_KGTK} \\\n", - " --obj \"node2;wikidatatype\" \\\n", - " --first-match-only \\\n", - " --pattern \";;commonsMedia\" \\\n", - " --output-file ${OUT}/claims.commonsMedia.${SORTED_KGTK} \\\n", - " --pattern \";;external-id\" \\\n", - " --output-file ${OUT}/claims.external-id.${SORTED_KGTK} \\\n", - " --pattern \";;geo-shape\" \\\n", - " --output-file ${OUT}/claims.geo-shape.${SORTED_KGTK} \\\n", - " --pattern \";;globe-coordinate\" \\\n", - " --output-file ${OUT}/claims.globe-coordinate.${SORTED_KGTK} \\\n", - " --pattern \";;math\" \\\n", - " --output-file ${OUT}/claims.math.${SORTED_KGTK} \\\n", - " --pattern \";;monolingualtext\" \\\n", - " --output-file ${OUT}/claims.monolingualtext.${SORTED_KGTK} \\\n", - " --pattern \";;musical-notation\" \\\n", - " --output-file ${OUT}/claims.musical-notation.${SORTED_KGTK} \\\n", - " --pattern \";;quantity\" \\\n", - " --output-file ${OUT}/claims.quantity.${SORTED_KGTK} \\\n", - " --pattern \";;string\" \\\n", - " --output-file ${OUT}/claims.string.${SORTED_KGTK} \\\n", - " --pattern \";;tabular-data\" \\\n", - " 
--output-file ${OUT}/claims.tabular-data.${SORTED_KGTK} \\\n", - " --pattern \";;time\" \\\n", - " --output-file ${OUT}/claims.time.${SORTED_KGTK} \\\n", - " --pattern \";;url\" \\\n", - " --output-file ${OUT}/claims.url.${SORTED_KGTK} \\\n", - " --pattern \";;wikibase-form\" \\\n", - " --output-file ${OUT}/claims.wikibase-form.${SORTED_KGTK} \\\n", - " --pattern \";;wikibase-item\" \\\n", - " --output-file ${OUT}/claims.wikibase-item.${SORTED_KGTK} \\\n", - " --pattern \";;wikibase-lexeme\" \\\n", - " --output-file ${OUT}/claims.wikibase-lexeme.${SORTED_KGTK} \\\n", - " --pattern \";;wikibase-property\" \\\n", - " --output-file ${OUT}/claims.wikibase-property.${SORTED_KGTK} \\\n", - " --pattern \";;wikibase-sense\" \\\n", - " --output-file ${OUT}/claims.wikibase-sense.${SORTED_KGTK} \\\n", - " --reject-file ${OUT}/claims.other.${SORTED_KGTK} \\\n", - " --use-mgzip ${USE_MGZIP} \\\n", - " | tee ${TEMP}/edge-datatype-split.log" - ] - }, - { - "cell_type": "markdown", - "id": "a768d04d-e984-40c8-afbb-680cfb0c0232", - "metadata": {}, - "source": [ - "## Extract qualifiers for edge datatype splits" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "05788888-c90a-4841-a943-35fcdee72668", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Extract any qualifiers for the properties in claims.commonsMedia\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file 
'/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.commonsMedia.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.commonsMedia.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.commonsMedia.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.commonsMedia.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.commonsMedia.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 464056 records, rejected 305315947 records.\n", - "Read 5426154 filter records, 376326 found matching input records, 5049828 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:10:47.085673 CPU=0:10:50.451530 (100.5%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.commonsMedia.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.commonsMedia.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.external-id\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge 
file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.external-id.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.external-id.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.external-id.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.external-id.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.external-id.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 6309903 records, rejected 299470100 records.\n", - "Read 188875219 filter records, 3510610 found matching input records, 185364608 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: 
elapsed=0:16:52.937288 CPU=0:19:32.692672 (115.8%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.external-id.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.external-id.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.geo-shape\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.geo-shape.tsv.gz\n", - "input format: kgtk\n", - "Using 
KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.geo-shape.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.geo-shape.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.geo-shape.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.geo-shape.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 1754 records, rejected 305778249 records.\n", - "Read 28215 filter records, 1396 found matching input records, 26819 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:10:18.439706 CPU=0:10:17.922182 ( 99.9%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.geo-shape.tsv.gz --output-file 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.geo-shape.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.globe-coordinate\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.globe-coordinate.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.globe-coordinate.tsv.gz' not found in the 
cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.globe-coordinate.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.globe-coordinate.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.globe-coordinate.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 162832 records, rejected 305617171 records.\n", - "Read 9156940 filter records, 155142 found matching input records, 9001798 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:10:32.063434 CPU=0:10:39.611307 (101.2%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.globe-coordinate.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.globe-coordinate.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.math\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.math.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.math.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.math.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: 
node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.math.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.math.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 3807 records, rejected 305776196 records.\n", - "Read 24996 filter records, 3726 found matching input records, 21270 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:10:43.502118 CPU=0:10:46.397159 (100.4%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.math.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.math.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.monolingualtext\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - 
"KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.monolingualtext.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.monolingualtext.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.monolingualtext.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.monolingualtext.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.monolingualtext.tsv.gz\n", - 
"header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 317870 records, rejected 305462133 records.\n", - "Read 47753791 filter records, 231442 found matching input records, 47522349 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:12:29.399306 CPU=0:13:21.059256 (106.9%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.monolingualtext.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.monolingualtext.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.musical-notation\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: 
node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.musical-notation.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.musical-notation.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.musical-notation.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.musical-notation.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.musical-notation.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 62 records, rejected 305779941 records.\n", - "Read 942 filter records, 38 found matching input records, 904 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:10:41.608581 CPU=0:10:44.486667 (100.4%): ifexists --verbose --input-file 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.musical-notation.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.musical-notation.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.quantity\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.quantity.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache 
'/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.quantity.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.quantity.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.quantity.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.quantity.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 54751054 records, rejected 251028949 records.\n", - "Read 86267605 filter records, 49747714 found matching input records, 36519891 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:15:38.545722 CPU=0:28:59.081318 (185.3%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.quantity.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.quantity.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in 
claims.string\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.string.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.string.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.string.tsv.gz\n", - 
"header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.string.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.string.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 166537462 records, rejected 139242541 records.\n", - "Read 286774252 filter records, 163568733 found matching input records, 123205519 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:26:46.648710 CPU=1:07:50.078828 (253.3%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.string.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.string.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.tabular-data\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache 
'/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tabular-data.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tabular-data.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tabular-data.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tabular-data.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tabular-data.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 12355 records, rejected 305767648 records.\n", - "Read 22880 filter records, 12334 found matching input records, 10546 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:10:29.056178 CPU=0:10:29.203481 (100.0%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tabular-data.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tabular-data.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.time\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.time.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.time.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.time.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.time.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.time.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 1048220 records, rejected 304731783 records.\n", - "Read 
54361593 filter records, 751395 found matching input records, 53610197 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:12:35.679876 CPU=0:13:09.415386 (104.5%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.time.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.time.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.url\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.url.tsv.gz\n", - "input 
format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.url.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.url.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.url.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.url.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 2382690 records, rejected 303397313 records.\n", - "Read 8328249 filter records, 1750479 found matching input records, 6577770 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:11:04.391601 CPU=0:11:27.257304 (103.4%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.url.tsv.gz --output-file 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.url.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.wikibase-form\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-form.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-form.tsv.gz' not found in the cache.\n", - 
"KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-form.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-form.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-form.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 28 records, rejected 305779975 records.\n", - "Read 8241 filter records, 25 found matching input records, 8216 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:10:17.631748 CPU=0:10:14.786283 ( 99.5%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-form.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-form.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.wikibase-item\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-item.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-item.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-item.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - 
"KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-item.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-item.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 73786828 records, rejected 231993175 records.\n", - "Read 670635690 filter records, 50007257 found matching input records, 620628432 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:34:14.591949 CPU=0:52:31.248502 (153.4%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-item.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-item.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.wikibase-lexeme\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file 
'/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-lexeme.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-lexeme.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-lexeme.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-lexeme.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 
threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-lexeme.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 338 records, rejected 305779665 records.\n", - "Read 4524 filter records, 279 found matching input records, 4245 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:10:08.115616 CPU=0:10:05.726132 ( 99.6%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-lexeme.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-lexeme.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.wikibase-property\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a 
KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-property.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-property.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-property.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-property.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-property.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 683 records, rejected 305779320 records.\n", - "Read 39288 filter records, 552 found matching input records, 38736 did not find matches.\n", - "KgtkWriter: closing the output 
file\n", - "Timing: elapsed=0:10:21.397888 CPU=0:10:20.181629 ( 99.8%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-property.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-property.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.wikibase-sense\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-sense.tsv.gz\n", - "input format: kgtk\n", - "Using 
KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-sense.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-sense.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-sense.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-sense.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 61 records, rejected 305779942 records.\n", - "Read 47 filter records, 46 found matching input records, 1 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:09:44.827663 CPU=0:09:42.992229 ( 99.7%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.wikibase-sense.tsv.gz --output-file 
/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.wikibase-sense.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n", - "Extract any qualifiers for the properties in claims.other\n", - "KgtkIfEfexists version: 2020-12-03T17:23:24.872146+00:00#U5P2iPrj3w+Az10+UMbGGMcK/SHBl0wuwe3R1sFky9gXILt9e5oSjHFhPMQEWYVnQtoPd7FUqsZZqR3PfFWaAg==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the filter input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.other.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.other.tsv.gz' not found in the cache.\n", - "KgtkReader: OK 
to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.other.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Input key columns: node1\n", - "Filter key columns: id\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.other.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.other.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Processing presorted files.\n", - "Read 305780003 input records, accepted 0 records, rejected 305780003 records.\n", - "Read 0 filter records, 0 found matching input records, 0 did not find matches.\n", - "KgtkWriter: closing the output file\n", - "Timing: elapsed=0:10:16.915168 CPU=0:10:14.315237 ( 99.6%): ifexists --verbose --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz --filter-on /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.other.tsv.gz --output-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.other.tsv.gz --input-keys node1 --filter-keys id --presorted --use-mgzip TRUE\n" - ] - } - ], - "source": [ - "for TARGET in WIKIDATATYPES:\n", - " print(f\"Extract any qualifiers for the properties in claims.{TARGET}\")\n", - " os.environ['TARGET'] = TARGET\n", - " !kgtk ${KGTK_FLAGS} \\\n", - "\t ifexists ${VERBOSE} \\\n", - "\t --input-file ${OUT}/qualifiers.${SORTED_KGTK} \\\n", - "\t --filter-on ${OUT}/claims.${TARGET}.${SORTED_KGTK} 
\\\n", - "\t --output-file ${OUT}/qualifiers.${TARGET}.${SORTED_KGTK} \\\n", - "\t --input-keys node1 \\\n", - "\t --filter-keys id \\\n", - "\t --presorted \\\n", - "\t --use-mgzip ${USE_MGZIP} \\\n", - "\t| tee ${TEMP}/qualifiers.${TARGET}.log" - ] - }, - { - "cell_type": "markdown", - "id": "2c990274-cc12-4d1e-a4e4-ea32314590d4", - "metadata": {}, - "source": [ - "## Extract claims with a property in the node1 column" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "d48fdaf8-d5cd-4e2b-a2c5-24be1d82d41b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.properties.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: 
writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.properties.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n", - "Applying a multiple-output general regex match filter\n", - "Read 1357708626 rows, rejected 1357510220 rows, wrote 198406 rows.\n", - "Keep counts: subject=198406, predicate=0, object=0.\n", - "Reject counts: subject=1357510220, predicate=0, object=0.\n", - "Closing output files.\n", - "KgtkWriter: closing the output file\n", - "All output files have been closed.\n", - "Timing: elapsed=0:51:41.926642 CPU=0:51:39.721178 ( 99.9%): filter --verbose --use-mgzip=TRUE --regex --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.tsv.gz -p ^P ;; -o /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/claims.properties.tsv.gz\n" - ] - } - ], - "source": [ - "!kgtk $KGTK_FLAGS filter $VERBOSE --use-mgzip=$USE_MGZIP --regex\\\n", - " --input-file $OUT/claims.$SORTED_KGTK \\\n", - " -p '^P ;;' -o $OUT/claims.properties.$SORTED_KGTK \\\n", - " | tee ${TEMP}/claims.properties.log" - ] - }, - { - "cell_type": "markdown", - "id": "c4e9c4f7-bf23-4d6d-a19b-3a2a38279e8a", - "metadata": {}, - "source": [ - "## Extract qualifiers for claims with a property in node1 column" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "d25d8095-dae2-406e-9a28-bc6ab74429f6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "kgtk filter version: 2021-09-24T02:35:27.840163+00:00#gysblgql6Q7482L14Zozt/ne8Owd497FJa7MVp92+UbmixJKElkfg/GY5UmGBsog86NPtmYy+dXWa6PRMIyuIw==\n", - "Opening the input file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "input format: kgtk\n", - "Using KGTK_GRAPH_CACHE='/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db'\n", - "Graph cache 
'/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/temp.import-wikidata/wikidata.sqlite3.db': file '/data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz' not found in the cache.\n", - "KgtkReader: OK to use the fast read path.\n", - "KgtkReader: File_path.suffix: .gz\n", - "KgtkReader: reading mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "node1 column found, this is a KGTK edge file\n", - "KgtkReader: is_edge_file=True is_node_file=False\n", - "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", - "KgtkReader: Reading a kgtk file using the fast path.\n", - "Opening the output file: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.properties.tsv.gz\n", - "File_path.suffix: .gz\n", - "KgtkWriter: writing mgzip with 3 threads: /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.properties.tsv.gz\n", - "header: id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", - "Applying a multiple-output general regex match filter\n", - "Read 305780003 rows, rejected 305620319 rows, wrote 159684 rows.\n", - "Keep counts: subject=159684, predicate=0, object=0.\n", - "Reject counts: subject=305620319, predicate=0, object=0.\n", - "Closing output files.\n", - "KgtkWriter: closing the output file\n", - "All output files have been closed.\n", - "Timing: elapsed=0:11:44.249832 CPU=0:11:37.552965 ( 99.0%): filter --verbose --use-mgzip=TRUE --regex --input-file /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.tsv.gz -p ^P ;; -o /data/amandeep/wikidata-20220409/wikidata-dwd-v4/import-wikidata/data/qualifiers.properties.tsv.gz\n" - ] - } - ], - "source": [ - "!kgtk $KGTK_FLAGS filter $VERBOSE --use-mgzip=$USE_MGZIP --regex \\\n", - " --input-file $OUT/qualifiers.$SORTED_KGTK \\\n", - " -p '^P ;;' -o 
$OUT/qualifiers.properties.$SORTED_KGTK \\\n", - " | tee ${TEMP}/qualifiers.properties.log" - ] - }, - { - "cell_type": "markdown", - "id": "ed09fd50-8cc4-464f-8b5e-b7b5097bc0f8", - "metadata": {}, - "source": [ - "## Files in the output data Folder" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "44cb612c-a71a-43e9-a3dc-44410e68fdd2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 115G\n", - "-rw-r--r-- 1 amandeep isdstaff 28G Apr 15 11:15 claims.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 994K Apr 15 11:16 claims.badvalue.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.7M Apr 15 11:16 claims.novalue.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 5.4M Apr 15 11:16 claims.somevalue.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 5.4G Apr 15 11:20 qualifiers.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 592K Apr 15 11:20 qualifiers.badvalue.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 242K Apr 15 11:20 qualifiers.badvalueClaims.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 862K Apr 15 11:20 qualifiers.novalue.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 573K Apr 15 11:20 qualifiers.novalueClaims.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2.1M Apr 15 11:20 qualifiers.somevalue.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 6.7M Apr 15 11:20 qualifiers.somevalueClaims.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 184M Apr 15 11:20 aliases.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 181M Apr 15 11:20 aliases.en.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 694M Apr 15 11:21 descriptions.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 692M Apr 15 11:22 descriptions.en.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2.2G Apr 15 11:23 labels.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2.2G Apr 15 11:25 labels.en.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.8G Apr 15 11:26 sitelinks.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 287M Apr 15 11:27 sitelinks.en.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 
507M Apr 15 11:27 sitelinks.en.qualifiers.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 3.3G Apr 15 11:30 sitelinks.qualifiers.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 204M Apr 15 11:31 metadata.node.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 54K Apr 15 11:31 metadata.property.datatypes.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 455M Apr 15 11:31 metadata.types.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 40G Apr 15 14:41 all.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 87 Apr 15 15:51 qualifiers.unclaimed.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 183M Apr 15 17:20 claims.commonsMedia.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 4.1G Apr 15 17:20 claims.external-id.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 778K Apr 15 17:20 claims.geo-shape.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 226M Apr 15 17:20 claims.globe-coordinate.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 686K Apr 15 17:20 claims.math.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2.3G Apr 15 17:20 claims.monolingualtext.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 28K Apr 15 17:20 claims.musical-notation.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2.0G Apr 15 17:21 claims.quantity.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 5.7G Apr 15 17:21 claims.string.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 420K Apr 15 17:21 claims.tabular-data.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 850M Apr 15 17:21 claims.time.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 205M Apr 15 17:21 claims.url.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 115K Apr 15 17:21 claims.wikibase-form.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 9.6G Apr 15 17:21 claims.wikibase-item.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 74K Apr 15 17:21 claims.wikibase-lexeme.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 634K Apr 15 17:21 claims.wikibase-property.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 960 Apr 15 17:21 claims.wikibase-sense.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 84 Apr 15 17:21 claims.other.tsv.gz\n", - 
"-rw-r--r-- 1 amandeep isdstaff 15M Apr 15 17:32 qualifiers.commonsMedia.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 150M Apr 15 17:49 qualifiers.external-id.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 29K Apr 15 17:59 qualifiers.geo-shape.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2.8M Apr 15 18:09 qualifiers.globe-coordinate.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 85K Apr 15 18:20 qualifiers.math.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 6.9M Apr 15 18:33 qualifiers.monolingualtext.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.8K Apr 15 18:43 qualifiers.musical-notation.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 891M Apr 15 18:59 qualifiers.quantity.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2.7G Apr 15 19:26 qualifiers.string.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 201K Apr 15 19:36 qualifiers.tabular-data.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 16M Apr 15 19:49 qualifiers.time.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 39M Apr 15 20:00 qualifiers.url.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.1K Apr 15 20:10 qualifiers.wikibase-form.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.3G Apr 15 20:45 qualifiers.wikibase-item.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 9.2K Apr 15 20:55 qualifiers.wikibase-lexeme.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 18K Apr 15 21:05 qualifiers.wikibase-property.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 1.6K Apr 15 21:15 qualifiers.wikibase-sense.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 83 Apr 15 21:25 qualifiers.other.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 3.8M Apr 15 22:17 claims.properties.tsv.gz\n", - "-rw-r--r-- 1 amandeep isdstaff 2.8M Apr 15 22:29 qualifiers.properties.tsv.gz\n" - ] - } - ], - "source": [ - "!ls -lrth $OUT/data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b105f781-0a64-4f43-a780-82cbf9dedad8", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "kgtk-env", - "language": 
"python", - "name": "kgtk-env" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From 89f89c1a0110e9aa7ce63d2d407d46d870139ef9 Mon Sep 17 00:00:00 2001 From: saggu Date: Thu, 2 Jun 2022 13:47:00 -0700 Subject: [PATCH 16/21] fix typo in description --- kgtk-properties/kgtk.properties.tsv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kgtk-properties/kgtk.properties.tsv b/kgtk-properties/kgtk.properties.tsv index 79a247e09..d6ad74fcd 100644 --- a/kgtk-properties/kgtk.properties.tsv +++ b/kgtk-properties/kgtk.properties.tsv @@ -33,14 +33,14 @@ P31P279star P31 Q18647521 P31P279star-P31-Q18647521 P31P279star datatype wikibase-item P31P279star-datatype-643cc9 Pdirected_pagerank label 'pagerank (directed)'@en Pdirected_pagerank-label-c415d3 Pdirected_pagerank alias 'page rank (directed)'@en Pdirected_pagerank-alias-ffb26c -Pdirected_pagerank description 'pagerank canculated on the directed graph'@en Pdirected_pagerank-description-73cf78 +Pdirected_pagerank description 'pagerank calculated on the directed graph'@en Pdirected_pagerank-description-73cf78 Pdirected_pagerank P31 Q18616576 Pdirected_pagerank-P31-Q18616576 Pdirected_pagerank P31 Q47512165 Pdirected_pagerank-P31-Q47512165 Pdirected_pagerank P1629 Q184316 Pdirected_pagerank-P1629-Q184316 Pdirected_pagerank datatype quantity Pdirected_pagerank-datatype-1a7b30 Pundirected_pagerank label 'pagerank (undirected)'@en Pundirected_pagerank-label-75f812 Pundirected_pagerank alias 'page rank (undirected)'@en Pundirected_pagerank-alias-ce271c -Pundirected_pagerank description 'pagerank canculated on the undirected graph'@en Pundirected_pagerank-description-fc33dc +Pundirected_pagerank description 'pagerank calculated on the undirected graph'@en 
Pundirected_pagerank-description-fc33dc Pundirected_pagerank P31 Q18616576 Pundirected_pagerank-P31-Q18616576 Pundirected_pagerank P31 Q47512165 Pundirected_pagerank-P31-Q47512165 Pundirected_pagerank P1629 Q184316 Pundirected_pagerank-P1629-Q184316 From 2dc5d7476830f6d138bbbc50a72eb8c9158a4fb7 Mon Sep 17 00:00:00 2001 From: saggu Date: Thu, 2 Jun 2022 14:25:03 -0700 Subject: [PATCH 17/21] add kgtk as source --- kgtk-properties/kgtk.properties.tsv | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kgtk-properties/kgtk.properties.tsv b/kgtk-properties/kgtk.properties.tsv index d6ad74fcd..86da8fdbd 100644 --- a/kgtk-properties/kgtk.properties.tsv +++ b/kgtk-properties/kgtk.properties.tsv @@ -33,16 +33,18 @@ P31P279star P31 Q18647521 P31P279star-P31-Q18647521 P31P279star datatype wikibase-item P31P279star-datatype-643cc9 Pdirected_pagerank label 'pagerank (directed)'@en Pdirected_pagerank-label-c415d3 Pdirected_pagerank alias 'page rank (directed)'@en Pdirected_pagerank-alias-ffb26c -Pdirected_pagerank description 'pagerank calculated on the directed graph'@en Pdirected_pagerank-description-73cf78 +Pdirected_pagerank description 'pagerank calculated on the directed graph'@en Pdirected_pagerank-description-e27151 Pdirected_pagerank P31 Q18616576 Pdirected_pagerank-P31-Q18616576 Pdirected_pagerank P31 Q47512165 Pdirected_pagerank-P31-Q47512165 +Pdirected_pagerank P7482 Q108739856 Pdirected_pagerank-P7482-Q108739856 Pdirected_pagerank P1629 Q184316 Pdirected_pagerank-P1629-Q184316 Pdirected_pagerank datatype quantity Pdirected_pagerank-datatype-1a7b30 Pundirected_pagerank label 'pagerank (undirected)'@en Pundirected_pagerank-label-75f812 Pundirected_pagerank alias 'page rank (undirected)'@en Pundirected_pagerank-alias-ce271c -Pundirected_pagerank description 'pagerank calculated on the undirected graph'@en Pundirected_pagerank-description-fc33dc +Pundirected_pagerank description 'pagerank calculated on the undirected graph'@en 
Pundirected_pagerank-description-9bad0d Pundirected_pagerank P31 Q18616576 Pundirected_pagerank-P31-Q18616576 Pundirected_pagerank P31 Q47512165 Pundirected_pagerank-P31-Q47512165 +Pundirected_pagerank P7482 Q108739856 Pundirected_pagerank-P7482-Q108739856 Pundirected_pagerank P1629 Q184316 Pundirected_pagerank-P1629-Q184316 Pundirected_pagerank datatype quantity Pundirected_pagerank-datatype-1a7b30 Pin_degree label 'degree (incoming)'@en Pin_degree-label-51d6f6 From fe110eec02bdc5ecb34de15b2d9d2a80f6ac113e Mon Sep 17 00:00:00 2001 From: saggu Date: Wed, 8 Jun 2022 15:49:24 -0700 Subject: [PATCH 18/21] rename propertiy labels --- kgtk-properties/kgtk.properties.tsv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kgtk-properties/kgtk.properties.tsv b/kgtk-properties/kgtk.properties.tsv index 86da8fdbd..79be731da 100644 --- a/kgtk-properties/kgtk.properties.tsv +++ b/kgtk-properties/kgtk.properties.tsv @@ -80,7 +80,7 @@ P279dwdstar P31 Q18616576 P279dwdstar-P31-Q18616576 P279dwdstar P31 Q28326461 P279dwdstar-P31-Q28326461 P279dwdstar P31 Q18647519 P279dwdstar-P31-Q18647519 P279dwdstar datatype wikibase-item P279dwdstar-datatype-643cc9 -P1963computed label 'properties for this type'@en P1963computed-label-813bce +P1963computed label 'properties in instances of this class'@en P1963computed-label-c451a6 P1963computed description 'The properties defined for a class, computed based on its instances'@en P1963computed-description-2e5ab8 P1963computed P31 Q19820110 P1963computed-P31-Q19820110 P1963computed P279 Q22582645 P1963computed-P279-Q22582645 @@ -139,7 +139,7 @@ Pshort_abstract label 'short abstract from Wikipedia articles'@en Pshort_abstrac Pshort_abstract description 'text before the table of contents from Wikipedia articles shortened to 2-3 sentences'@en Pshort_abstract-description-d251e5 Pshort_abstract P31 Q18616576 Pshort_abstract-P31-Q18616576 Pshort_abstract datatype string Pshort_abstract-datatype-473287 -P1963computed_star label 
'properties for this type including instances of subclasses'@en P1963computed_star-label-c1ae81 +P1963computed_star label 'properties in instances and subclasses of this class'@en P1963computed_star-label-10788b P1963computed_star description 'The properties defined for a class, computed based on its instances and instances of all subclasses'@en P1963computed_star-description-fd0dc9 P1963computed_star P31 Q19820110 P1963computed_star-P31-Q19820110 P1963computed_star P279 Q22582645 P1963computed_star-P279-Q22582645 From e3123e6228a67f72adbda5cbc40960326f029d09 Mon Sep 17 00:00:00 2001 From: saggu Date: Thu, 9 Jun 2022 09:59:32 -0700 Subject: [PATCH 19/21] fix ast bug --- kgtk/visualize/visualize_api.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/kgtk/visualize/visualize_api.py b/kgtk/visualize/visualize_api.py index 4db2dddbe..7590ccb56 100644 --- a/kgtk/visualize/visualize_api.py +++ b/kgtk/visualize/visualize_api.py @@ -210,20 +210,26 @@ def process_edge_file(self): # incase node file is not present or a node is missing from the node file node1_label = row[node1_label_idx] node2_label = row[node2_label_idx] + print(row) + print(node1_label_idx) + print(node2_label_idx) if '@' in node1_label: - clean_node1_label, _, _ = kgtk_format.destringify(node1_label) + # clean_node1_label, _, _ = kgtk_format.destringify(node1_label) + clean_node1_label = node1_label.split('@')[0] else: clean_node1_label = node1_label nodes.add((row[node1_idx], clean_node1_label)) if '@' in node2_label: - clean_node2_label, _, _ = kgtk_format.destringify(node2_label) + # clean_node2_label, _, _ = kgtk_format.destringify(node2_label) + clean_node2_label = node2_label.split('@')[0] else: clean_node2_label = node2_label nodes.add((row[node2_idx], clean_node2_label)) if '@' in row[label_label_idx]: - _label_label, _, _ = kgtk_format.destringify(row[label_label_idx]) + # _label_label, _, _ = kgtk_format.destringify(row[label_label_idx]) + _label_label = 
row[label_label_idx].split('@')[0] else: _label_label = row[label_label_idx] From 29fcb98f718b0e629659ef9fbe79593f87ce6b0b Mon Sep 17 00:00:00 2001 From: Amandeep Singh Date: Thu, 9 Jun 2022 10:08:32 -0700 Subject: [PATCH 20/21] Update __init__.py --- kgtk/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kgtk/__init__.py b/kgtk/__init__.py index 98d186bed..4e7c72a59 100644 --- a/kgtk/__init__.py +++ b/kgtk/__init__.py @@ -1 +1 @@ -__version__ = '1.4.2' +__version__ = '1.4.3' From cc08d5105418988cd4a5e1590e1e092aaffb727d Mon Sep 17 00:00:00 2001 From: saggu Date: Thu, 9 Jun 2022 10:18:32 -0700 Subject: [PATCH 21/21] remove print statements --- kgtk/visualize/visualize_api.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/kgtk/visualize/visualize_api.py b/kgtk/visualize/visualize_api.py index 7590ccb56..1803a3545 100644 --- a/kgtk/visualize/visualize_api.py +++ b/kgtk/visualize/visualize_api.py @@ -210,9 +210,6 @@ def process_edge_file(self): # incase node file is not present or a node is missing from the node file node1_label = row[node1_label_idx] node2_label = row[node2_label_idx] - print(row) - print(node1_label_idx) - print(node2_label_idx) if '@' in node1_label: # clean_node1_label, _, _ = kgtk_format.destringify(node1_label) clean_node1_label = node1_label.split('@')[0]