From a0475a3d00daa6ba95c28f879bc27cf46c10dd52 Mon Sep 17 00:00:00 2001 From: Emanuela Boros Date: Mon, 14 Oct 2024 13:42:36 +0200 Subject: [PATCH 01/14] new branch + added again the entity shcema --- json/entities/entities-backup.schema.json | 144 ++++++++++++++++++++++ json/entities/entities.schema.json | 126 +++++++++---------- 2 files changed, 202 insertions(+), 68 deletions(-) create mode 100644 json/entities/entities-backup.schema.json diff --git a/json/entities/entities-backup.schema.json b/json/entities/entities-backup.schema.json new file mode 100644 index 0000000..7b7515c --- /dev/null +++ b/json/entities/entities-backup.schema.json @@ -0,0 +1,144 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://impresso.github.io/impresso-schemas/json/entities.schema.json", + "title": "Named Entity JSON Schema", + "description": "Definition of the output representation of entity processing, before indexing. Named entity mentions are expressed as offline annotations with character offsets relative to content items. Essentially, the NE output is a list of JSON documents (in json line format), where each document corresponds to a content item that has a list of NE mentions (no output for CI with no mentions). The tagset corresponds to impresso-HIPE.", + "type": "object", + "properties": { + "id": { + "type": "string", + "description": "impresso content item id." + }, + "ts": { + "type": "string", + "description": "Timestamp of creation of the JSON file (e.g. '2018-09-18T08:00:08Z')" + }, + "sys_id": { + "type": "string", + "description": "An alias for the system or model that produced this output (preferably short, but still understandable), used for transparency and traceability. Should be unique and thus include elements that distinguish one model from another, such as a base name, a version, the language, e.g. bert-xxxx-xxxx-fr." + }, + "nes": { + "type": "array", + "description": "The list of named entity mentions identified in the document", + "minItems": 1, + "items": { + "type": "object", + "properties": { + "type": { + "type": "string", + "description":"NE type", + "enum": [ + "building", + "loc", + "loc.add", + "loc.add.elec", + "loc.add.phys", + "loc.adm", + "loc.adm.nat", + "loc.adm.reg", + "loc.adm.sup", + "loc.adm.town", + "loc.admin.sup", + "loc.fac", + "loc.oro", + "loc.phys", + "loc.phys.astro", + "loc.phys.geo", + "loc.phys.hydro", + "loc.unk", + "org", + "org.adm", + "org.ent", + "org.ent.pressagency", + "per", + "per.author", + "pers", + "pers.coll", + "pers.ind", + "pers.ind.articleauthor", + "prod", + "prod.doctr", + "prod.media", + "street", + "time", + "time.date.abs" + ] + }, + "surface": { + "type": "string", + "description":"The (string) surface of the named entity mention, as it appears in the text" + }, + "name": { + "type": "string", + "description":"In case of a person mention, the entity component of type name." + }, + "lOffset": { + "type": "integer", + "description":"The left character offset of the named entity with respect to the content item, as in the rebuilt format." + }, + "rOffset": { + "type": "integer", + "description":"The right character offset of the named entity with respect to the content item, as in the rebuilt format." + }, + "firstname": { + "type": "string", + "description":"In case of a person mention and if available, the first name." + }, + "surname": { + "type": "string", + "description":"In case of a person mention and if available, the surname." + }, + "title": { + "type": "string", + "description":"In case of a person mention, the entity component of type 'title'." + }, + "function": { + "type": "string", + "description":"In case of a person mention, the entity component of type 'function'." + }, + "demonym": { + "type": "string", + "description":"In case of a person mention, the entity component of type 'demonym'." + }, + "nested": { + "type": "boolean", + "description":"In case of a nested mention, this property should be set to true. Can be ignored if not." + }, + "wkd_id": { + "type": "string", + "description":"If exists, wikidata QID" + }, + "wkpedia_pagename": { + "type": "string", + "description":"If exists, wikipedia page name or, if not possible, wikipedia URL, in the language the NE recognition is made (e.g. page name 'Etats-Unis' if EL performed against French wikipedia, and 'United_States' is against English Wikipedia. " + }, + "confidence": { + "type": "string", + "enum": [ + "low", + "medium", + "high" + ] + }, + "id": { + "type": "string", + "description":"The id of the named entity mention composed of the following set of values concatenated with a colon (':') : content item id + loffset + roffset + type + sys_id (e.g. 'LLE-1989-04-04-a-i0195:56:69:person:bert-xxxx-xxxx-fr'." + } + }, + "required": [ + "type", + "surface", + "lOffset", + "rOffset", + "id" + ] + } + } + }, + "required": [ + "id", + "ts", + "sys_id", + "nes" + ] +} \ No newline at end of file diff --git a/json/entities/entities.schema.json b/json/entities/entities.schema.json index 7b7515c..d6cdb25 100644 --- a/json/entities/entities.schema.json +++ b/json/entities/entities.schema.json @@ -7,122 +7,112 @@ "properties": { "id": { "type": "string", - "description": "impresso content item id." + "description": "Impresso content item ID." }, "ts": { "type": "string", - "description": "Timestamp of creation of the JSON file (e.g. '2018-09-18T08:00:08Z')" + "description": "Timestamp of creation of the JSON file (e.g. '2024-05-26T09:48:01Z')." }, "sys_id": { "type": "string", - "description": "An alias for the system or model that produced this output (preferably short, but still understandable), used for transparency and traceability. Should be unique and thus include elements that distinguish one model from another, such as a base name, a version, the language, e.g. bert-xxxx-xxxx-fr." + "description": "An alias for the system or model that produced this output, used for transparency and traceability. It should include distinguishing elements like a base name, version, and language." }, "nes": { "type": "array", - "description": "The list of named entity mentions identified in the document", - "minItems": 1, + "description": "The list of named entity mentions identified in the document.", + "minItems": 0, "items": { "type": "object", "properties": { "type": { "type": "string", - "description":"NE type", + "description": "NE type (coarse-grained and fine-grained).", "enum": [ - "building", - "loc", - "loc.add", - "loc.add.elec", - "loc.add.phys", - "loc.adm", - "loc.adm.nat", - "loc.adm.reg", - "loc.adm.sup", - "loc.adm.town", - "loc.admin.sup", - "loc.fac", - "loc.oro", - "loc.phys", - "loc.phys.astro", - "loc.phys.geo", - "loc.phys.hydro", - "loc.unk", + "pers", + "pers.ind", + "pers.coll", + "pers.ind.articleauthor", "org", "org.adm", "org.ent", "org.ent.pressagency", - "per", - "per.author", - "pers", - "pers.coll", - "pers.ind", - "pers.ind.articleauthor", "prod", - "prod.doctr", "prod.media", - "street", + "prod.doctr", "time", - "time.date.abs" + "time.date.abs", + "loc", + "loc.adm.town", + "loc.adm.reg", + "loc.adm.nat", + "loc.adm.sup", + "loc.phys.geo", + "loc.phys.hydro", + "loc.phys.astro", + "loc.oro", + "loc.fac", + "loc.add.phys", + "loc.add.elec", + "loc.unk", + "comp.name", + "comp.title", + "comp.qualifier", + "comp.function", + "comp.demonym" ] }, "surface": { "type": "string", - "description":"The (string) surface of the named entity mention, as it appears in the text" - }, - "name": { - "type": "string", - "description":"In case of a person mention, the entity component of type name." + "description": "The surface form of the named entity mention, as it appears in the text." }, "lOffset": { "type": "integer", - "description":"The left character offset of the named entity with respect to the content item, as in the rebuilt format." + "description": "The left character offset of the named entity with respect to the content item." }, "rOffset": { "type": "integer", - "description":"The right character offset of the named entity with respect to the content item, as in the rebuilt format." - }, - "firstname": { - "type": "string", - "description":"In case of a person mention and if available, the first name." + "description": "The right character offset of the named entity with respect to the content item." }, - "surname": { - "type": "string", - "description":"In case of a person mention and if available, the surname." + "nested": { + "type": "boolean", + "description": "Indicates whether the mention is nested." }, - "title": { - "type": "string", - "description":"In case of a person mention, the entity component of type 'title'." + "confidence_ner": { + "type": "number", + "description": "Confidence score of the Named Entity Recognition process." }, - "function": { + "confidence_nel": { + "type": "number", + "description": "Confidence score of the Named Entity Linking process." + }, + "wkd_id": { "type": "string", - "description":"In case of a person mention, the entity component of type 'function'." + "description": "Wikidata QID if available." }, - "demonym": { + "wkpedia_pagename": { "type": "string", - "description":"In case of a person mention, the entity component of type 'demonym'." + "description": "Wikipedia page name or URL in the relevant language." }, - "nested": { - "type": "boolean", - "description":"In case of a nested mention, this property should be set to true. Can be ignored if not." + "name": { + "type": "string", + "description": "Full name of the entity, including first, middle, and last names as well as initials and nicknames, if applicable." }, - "wkd_id": { + "title": { "type": "string", - "description":"If exists, wikidata QID" + "description": "Title or designator of a person (e.g., 'Herr Chirac', 'Son Altesse le prince Rainier'), if applicable." }, - "wkpedia_pagename": { + "function": { "type": "string", - "description":"If exists, wikipedia page name or, if not possible, wikipedia URL, in the language the NE recognition is made (e.g. page name 'Etats-Unis' if EL performed against French wikipedia, and 'United_States' is against English Wikipedia. " + "description": "Function or job of a named person (e.g., 'Bürgermeister Ann Hidalgo von Paris'), if applicable." }, - "confidence": { + "demonym": { "type": "string", - "enum": [ - "low", - "medium", - "high" - ] + "description": "Demonym or nationality of a person, if applicable." }, "id": { "type": "string", - "description":"The id of the named entity mention composed of the following set of values concatenated with a colon (':') : content item id + loffset + roffset + type + sys_id (e.g. 'LLE-1989-04-04-a-i0195:56:69:person:bert-xxxx-xxxx-fr'." + "description": "The unique identifier of the named entity mention: \n[Document ID]:[Left Offset]:[Right Offset]:[Entity Type]:[NER Model]|[NEL Model]", + "pattern": "^[a-zA-Z0-9_]+:[0-9]+:[0-9]+:[a-zA-Z0-9_]+:[a-zA-Z0-9_]+$" } }, "required": [ From fc5cf962ff215aaeada60d65a95730784c842d01 Mon Sep 17 00:00:00 2001 From: Emanuela Boros Date: Mon, 14 Oct 2024 13:44:23 +0200 Subject: [PATCH 02/14] modified examples --- examples/entities/example0.json | 7 +++++-- examples/entities/example1.json | 6 ++++-- examples/entities/example2.json | 9 +++++++-- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/examples/entities/example0.json b/examples/entities/example0.json index 7ef10bb..0389bed 100644 --- a/examples/entities/example0.json +++ b/examples/entities/example0.json @@ -12,7 +12,8 @@ "name": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "wkd_id": "Q683672", "wkpedia_pagename": "Société_du_Grütli", - "confidence": "medium" + "confidence_ner": 50.45, + "confidence_nel": 50.45 }, { "type": "pers.ind", @@ -24,7 +25,9 @@ "function": "mécanicien à Echallens", "wkd_id": "NIL", "wkpedia_pagename": "NIL", - "confidence": "high" + "confidence_ner": 50.45, + "confidence_nel": 50.45, + "function": "mécanicien à Echallens" } ] } diff --git a/examples/entities/example1.json b/examples/entities/example1.json index cccf567..5a5894a 100644 --- a/examples/entities/example1.json +++ b/examples/entities/example1.json @@ -14,7 +14,8 @@ "function": "ancien ministre du gouvernement cantonais", "wkd_id": "NIL", "wkpedia_pagename": "NIL", - "confidence": "medium" + "confidence_ner": 50.45, + "confidence_nel": 50.45 }, { "type": "loc.adm.nat", @@ -24,7 +25,8 @@ "id": "EXP-1888-01-09-a-i0035:32:42:loc.adm.nat:bert-fr", "wkd_id": "Q30", "wkpedia_pagename": "États-Unis", - "confidence": "medium" + "confidence_ner": 50.45, + "confidence_nel": 50.45 } ] } \ No newline at end of file diff --git a/examples/entities/example2.json b/examples/entities/example2.json index 3f640b6..abf7849 100644 --- a/examples/entities/example2.json +++ b/examples/entities/example2.json @@ -12,7 +12,10 @@ "name": "Sylvie Maurial", "function": "championne de France de ski nautique", "wkd_id": "Q20993704", - "wkpedia_pagename": "Sylvie_Maurial" + "wkpedia_pagename": "Sylvie_Maurial", + "confidence_ner": 50.45, + "confidence_nel": 50.45 + }, { "type": "loc.adm.nat", @@ -23,7 +26,9 @@ "nested": true, "wkd_id": "Q20993704", "wkpedia_pagename": "France", - "confidence": "medium" + "confidence": "medium", + "confidence_ner": 50.45, + "confidence_nel": 50.45 } ] } From fe81fd0610694013882a0e6167b383c405afcf11 Mon Sep 17 00:00:00 2001 From: Emanuela Boros Date: Mon, 14 Oct 2024 13:47:16 +0200 Subject: [PATCH 03/14] jsonschema all passed --- json/entities/entities.schema.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/json/entities/entities.schema.json b/json/entities/entities.schema.json index d6cdb25..037dad3 100644 --- a/json/entities/entities.schema.json +++ b/json/entities/entities.schema.json @@ -111,8 +111,7 @@ }, "id": { "type": "string", - "description": "The unique identifier of the named entity mention: \n[Document ID]:[Left Offset]:[Right Offset]:[Entity Type]:[NER Model]|[NEL Model]", - "pattern": "^[a-zA-Z0-9_]+:[0-9]+:[0-9]+:[a-zA-Z0-9_]+:[a-zA-Z0-9_]+$" + "description": "The unique identifier of the named entity mention: [Document ID]:[Left Offset]:[Right Offset]:[Entity Type]:[NER Model]|[NEL Model]" } }, "required": [ From bb26fd12679f6db5b6062867d6f933e7ae256959 Mon Sep 17 00:00:00 2001 From: Emanuela Boros Date: Sat, 19 Oct 2024 16:53:54 +0200 Subject: [PATCH 04/14] solved review https://github.com/impresso/impresso-schemas/pull/39/files/fe81fd0610694013882a0e6167b383c405afcf11#r1806755805 --- json/entities/entities.schema.json | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/json/entities/entities.schema.json b/json/entities/entities.schema.json index 037dad3..bc01538 100644 --- a/json/entities/entities.schema.json +++ b/json/entities/entities.schema.json @@ -53,12 +53,7 @@ "loc.fac", "loc.add.phys", "loc.add.elec", - "loc.unk", - "comp.name", - "comp.title", - "comp.qualifier", - "comp.function", - "comp.demonym" + "loc.unk" ] }, "surface": { @@ -73,10 +68,6 @@ "type": "integer", "description": "The right character offset of the named entity with respect to the content item." }, - "nested": { - "type": "boolean", - "description": "Indicates whether the mention is nested." - }, "confidence_ner": { "type": "number", "description": "Confidence score of the Named Entity Recognition process." @@ -91,23 +82,27 @@ }, "wkpedia_pagename": { "type": "string", - "description": "Wikipedia page name or URL in the relevant language." + "description": "Wikipedia page name, i.e. the last part of the wikipedia URL (e.g. United_States)" + }, + "wkpedia_url": { + "type": "string", + "description": "Wikipedia page URL, e.g. https://en.wikipedia.org/wiki/United_States" }, "name": { "type": "string", - "description": "Full name of the entity, including first, middle, and last names as well as initials and nicknames, if applicable." + "description": "Full name of the entity, including first, middle, and last names as well as initials and nicknames, if applicable. For other types of entities (ie. location, organization), this field can be used to store the full surface of the entity." }, "title": { "type": "string", - "description": "Title or designator of a person (e.g., 'Herr Chirac', 'Son Altesse le prince Rainier'), if applicable." + "description": "Title or designator of a person (e.g., 'Herr Chirac', 'Son Altesse le prince Rainier'), if applicable. N/A if it is not applicable." }, "function": { "type": "string", - "description": "Function or job of a named person (e.g., 'Bürgermeister Ann Hidalgo von Paris'), if applicable." + "description": "Function or job of a named person (e.g., 'Bürgermeister Ann Hidalgo von Paris'), if applicable. N/A if it is not applicable." }, "demonym": { "type": "string", - "description": "Demonym or nationality of a person, if applicable." + "description": "Demonym or nationality of a person, if applicable. N/A if it is not applicable." }, "id": { "type": "string", From 6658062063db407270ddc23b3a8d8fbe7e3322a6 Mon Sep 17 00:00:00 2001 From: Emanuela Boros Date: Sat, 19 Oct 2024 16:57:27 +0200 Subject: [PATCH 05/14] all refs to hipe guidelines https://github.com/impresso/impresso-schemas/pull/39/files/fe81fd0610694013882a0e6167b383c405afcf11#r1806770074 --- json/entities/entities.schema.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/json/entities/entities.schema.json b/json/entities/entities.schema.json index bc01538..ad8c9ed 100644 --- a/json/entities/entities.schema.json +++ b/json/entities/entities.schema.json @@ -90,19 +90,19 @@ }, "name": { "type": "string", - "description": "Full name of the entity, including first, middle, and last names as well as initials and nicknames, if applicable. For other types of entities (ie. location, organization), this field can be used to store the full surface of the entity." + "description":"In case of a person mention, the entity component of type 'name', as defined in the Impresso HIPE NE Annotation guidelines (https://zenodo.org/records/3585750). For technical reasons, all entities have a name component, but it is only relevant for person mentions." }, "title": { "type": "string", - "description": "Title or designator of a person (e.g., 'Herr Chirac', 'Son Altesse le prince Rainier'), if applicable. N/A if it is not applicable." + "description":"In case of a person mention, the entity component of type 'title', as defined in the Impresso HIPE NE Annotation guidelines (https://zenodo.org/records/3585750). " }, "function": { "type": "string", - "description": "Function or job of a named person (e.g., 'Bürgermeister Ann Hidalgo von Paris'), if applicable. N/A if it is not applicable." + "description":"In case of a person mention, the entity component of type 'function', as defined in the Impresso HIPE NE Annotation guidelines (https://zenodo.org/records/3585750). " }, "demonym": { "type": "string", - "description": "Demonym or nationality of a person, if applicable. N/A if it is not applicable." + "description":"In case of a person mention, the entity component of type 'demonym', as defined in the Impresso HIPE NE Annotation guidelines (https://zenodo.org/records/3585750). " }, "id": { "type": "string", From cf9cb00f4d3b2a9ce6f5294492849e5680747d07 Mon Sep 17 00:00:00 2001 From: Emanuela Boros Date: Sat, 19 Oct 2024 17:00:39 +0200 Subject: [PATCH 06/14] all refs to hipe guidelines https://github.com/impresso/impresso-schemas/pull/39/files/fe81fd0610694013882a0e6167b383c405afcf11#r1806770074 --- examples/entities/example0.json | 13 +++++++------ examples/entities/example1.json | 7 ++++--- json/entities/entities.schema.json | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/examples/entities/example0.json b/examples/entities/example0.json index 0389bed..87cf4f6 100644 --- a/examples/entities/example0.json +++ b/examples/entities/example0.json @@ -4,30 +4,31 @@ "sys_id": "bert-fr", "nes": [ { + "id": "EXP-1888-01-09-a-i0035:32:56:org.adm:bert-fr", "type": "org.adm", "surface": "Société suisse du Grutli", "lOffset": 32, "rOffset": 56, - "id": "EXP-1888-01-09-a-i0035:32:56:org.adm:bert-fr", - "name": "ABCDEFGHIJKLMNOPQRSTUVWXYZ", + "name": "Société suisse du Grutli", "wkd_id": "Q683672", "wkpedia_pagename": "Société_du_Grütli", + "wkpedia_url": "https://fr.wikipedia.org/wiki/Soci%C3%A9t%C3%A9_du_Gr%C3%BCtli", "confidence_ner": 50.45, "confidence_nel": 50.45 }, { "type": "pers.ind", - "surface": "Bovat, mécanicien à Echallens", + "surface": "Mr. Bovat, mécanicien à Echallens", "lOffset": 156, "rOffset": 178, "id": "EXP-1888-01-09-a-i0035:156:178:pers.ind:bert-fr", - "name": "EXP-1888-01-09-a-i0035:156:178:pers.ind:bert-fr", - "function": "mécanicien à Echallens", + "name": "Bovat", "wkd_id": "NIL", "wkpedia_pagename": "NIL", "confidence_ner": 50.45, "confidence_nel": 50.45, - "function": "mécanicien à Echallens" + "function": "mécanicien à Echallens", + "title": "Mr." } ] } diff --git a/examples/entities/example1.json b/examples/entities/example1.json index 5a5894a..1f5bd43 100644 --- a/examples/entities/example1.json +++ b/examples/entities/example1.json @@ -4,11 +4,11 @@ "sys_id": "bert-fr", "nes": [ { + "id": "EXP-1928-05-15-a-i0009:50:99:loc.adm.nat:bert-fr", "type": "pers.ind", "surface": "M. Wou, ancien ministre du gouvernement cantonais", "lOffset": 50, "rOffset": 99, - "id": "EXP-1928-05-15-a-i0009:50:99:loc.adm.nat:bert-fr", "name": "Wou", "title": "M.", "function": "ancien ministre du gouvernement cantonais", @@ -18,15 +18,16 @@ "confidence_nel": 50.45 }, { + "id": "EXP-1888-01-09-a-i0035:32:42:loc.adm.nat:bert-fr", "type": "loc.adm.nat", "surface": "Etats-Unis", "lOffset": 32, "rOffset": 42, - "id": "EXP-1888-01-09-a-i0035:32:42:loc.adm.nat:bert-fr", "wkd_id": "Q30", "wkpedia_pagename": "États-Unis", "confidence_ner": 50.45, - "confidence_nel": 50.45 + "confidence_nel": 50.45, + "name": "Etats-Unis" } ] } \ No newline at end of file diff --git a/json/entities/entities.schema.json b/json/entities/entities.schema.json index ad8c9ed..d0832cc 100644 --- a/json/entities/entities.schema.json +++ b/json/entities/entities.schema.json @@ -90,7 +90,7 @@ }, "name": { "type": "string", - "description":"In case of a person mention, the entity component of type 'name', as defined in the Impresso HIPE NE Annotation guidelines (https://zenodo.org/records/3585750). For technical reasons, all entities have a name component, but it is only relevant for person mentions." + "description":"In case of a person mention, the entity component of type 'name', as defined in the Impresso HIPE NE Annotation guidelines (https://zenodo.org/records/3585750)." }, "title": { "type": "string", From b45a12799d257a3baa04e27dc3b8f928fbd04d34 Mon Sep 17 00:00:00 2001 From: Emanuela Boros Date: Sat, 19 Oct 2024 17:08:42 +0200 Subject: [PATCH 07/14] solved entity examples https://github.com/impresso/impresso-schemas/pull/39/files/fe81fd0610694013882a0e6167b383c405afcf11#r1807354257 https://github.com/impresso/impresso-schemas/pull/39/files/fe81fd0610694013882a0e6167b383c405afcf11#r1807367513 and https://github.com/impresso/impresso-schemas/pull/39/files/fe81fd0610694013882a0e6167b383c405afcf11#r1807366105 --- examples/entities/example0.json | 10 +++++----- examples/entities/example1.json | 3 +-- examples/entities/example2.json | 19 +++++++++---------- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/examples/entities/example0.json b/examples/entities/example0.json index 87cf4f6..07afdd1 100644 --- a/examples/entities/example0.json +++ b/examples/entities/example0.json @@ -4,7 +4,6 @@ "sys_id": "bert-fr", "nes": [ { - "id": "EXP-1888-01-09-a-i0035:32:56:org.adm:bert-fr", "type": "org.adm", "surface": "Société suisse du Grutli", "lOffset": 32, @@ -14,21 +13,22 @@ "wkpedia_pagename": "Société_du_Grütli", "wkpedia_url": "https://fr.wikipedia.org/wiki/Soci%C3%A9t%C3%A9_du_Gr%C3%BCtli", "confidence_ner": 50.45, - "confidence_nel": 50.45 + "confidence_nel": 50.45, + "id": "EXP-1888-01-09-a-i0035:32:56:org.adm:bert-fr" }, { "type": "pers.ind", "surface": "Mr. Bovat, mécanicien à Echallens", "lOffset": 156, "rOffset": 178, - "id": "EXP-1888-01-09-a-i0035:156:178:pers.ind:bert-fr", - "name": "Bovat", "wkd_id": "NIL", "wkpedia_pagename": "NIL", "confidence_ner": 50.45, "confidence_nel": 50.45, + "name": "Bovat", + "title": "Mr.", "function": "mécanicien à Echallens", - "title": "Mr." + "id": "EXP-1888-01-09-a-i0035:156:178:pers.ind:bert-fr" } ] } diff --git a/examples/entities/example1.json b/examples/entities/example1.json index 1f5bd43..3697a82 100644 --- a/examples/entities/example1.json +++ b/examples/entities/example1.json @@ -4,7 +4,7 @@ "sys_id": "bert-fr", "nes": [ { - "id": "EXP-1928-05-15-a-i0009:50:99:loc.adm.nat:bert-fr", + "id": "EXP-1928-05-15-a-i0009:50:99:pers.ind:bert-fr", "type": "pers.ind", "surface": "M. Wou, ancien ministre du gouvernement cantonais", "lOffset": 50, @@ -27,7 +27,6 @@ "wkpedia_pagename": "États-Unis", "confidence_ner": 50.45, "confidence_nel": 50.45, - "name": "Etats-Unis" } ] } \ No newline at end of file diff --git a/examples/entities/example2.json b/examples/entities/example2.json index abf7849..39de0e7 100644 --- a/examples/entities/example2.json +++ b/examples/entities/example2.json @@ -8,27 +8,26 @@ "surface": "championne de France de ski nautique, Sylvie Maurial", "lOffset": 50, "rOffset": 102, - "id": "EXP-1968-02-23-a-i0262:50:102:loc.adm.nat:bert-fr", - "name": "Sylvie Maurial", - "function": "championne de France de ski nautique", + "confidence_ner": 50.45, + "confidence_nel": 50.45, "wkd_id": "Q20993704", "wkpedia_pagename": "Sylvie_Maurial", - "confidence_ner": 50.45, - "confidence_nel": 50.45 - + "wkpedia_url": "https://fr.wikipedia.org/wiki/Sylvie_Maurial", + "name": "Sylvie Maurial", + "function": "championne de France de ski nautique", + "id": "EXP-1968-02-23-a-i0262:50:102:pers.ind:bert-fr" }, { "type": "loc.adm.nat", "surface": "France", "lOffset": 64, "rOffset": 70, - "id": "EXP-1888-01-09-a-i0035:64:70:loc.adm.nat:bert-fr", - "nested": true, "wkd_id": "Q20993704", "wkpedia_pagename": "France", - "confidence": "medium", + "wkpedia_url": "https://fr.wikipedia.org/wiki/France", "confidence_ner": 50.45, - "confidence_nel": 50.45 + "confidence_nel": 50.45, + "id": "EXP-1888-01-09-a-i0035:64:70:loc.adm.nat:bert-fr" } ] } From 4d81a93e76aa744bbb43b0a5f5be82a96c15fbb7 Mon Sep 17 00:00:00 2001 From: Emanuela Boros Date: Sat, 19 Oct 2024 17:21:38 +0200 Subject: [PATCH 08/14] added wkpedia_url N/A for NIL --- examples/entities/example0.json | 1 + examples/entities/example1.json | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/entities/example0.json b/examples/entities/example0.json index 07afdd1..e3335d8 100644 --- a/examples/entities/example0.json +++ b/examples/entities/example0.json @@ -23,6 +23,7 @@ "rOffset": 178, "wkd_id": "NIL", "wkpedia_pagename": "NIL", + "wkpedia_url": "N/A", "confidence_ner": 50.45, "confidence_nel": 50.45, "name": "Bovat", diff --git a/examples/entities/example1.json b/examples/entities/example1.json index 3697a82..cbdeb37 100644 --- a/examples/entities/example1.json +++ b/examples/entities/example1.json @@ -14,6 +14,7 @@ "function": "ancien ministre du gouvernement cantonais", "wkd_id": "NIL", "wkpedia_pagename": "NIL", + "wkpedia_url": "N/A", "confidence_ner": 50.45, "confidence_nel": 50.45 }, @@ -25,8 +26,9 @@ "rOffset": 42, "wkd_id": "Q30", "wkpedia_pagename": "États-Unis", + "wkpedia_url": "https://fr.wikipedia.org/wiki/%C3%89tats-Unis", "confidence_ner": 50.45, - "confidence_nel": 50.45, + "confidence_nel": 50.45 } ] } \ No newline at end of file From b4fc9cb4cf35b54dda350e2f32dda86dea8c02e6 Mon Sep 17 00:00:00 2001 From: Emanuela Boros Date: Wed, 23 Oct 2024 18:40:09 +0200 Subject: [PATCH 09/14] entity shcema reviewed --- json/entities/entities.schema.json | 36 +++++------------------------- 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/json/entities/entities.schema.json b/json/entities/entities.schema.json index d0832cc..b903c7e 100644 --- a/json/entities/entities.schema.json +++ b/json/entities/entities.schema.json @@ -18,6 +18,10 @@ "description": "An alias for the system or model that produced this output, used for transparency and traceability. It should include distinguishing elements like a base name, version, and language." }, "nes": { + "id": { + "type": "string", + "description": "The unique identifier of the named entity mention: [Document ID]:[Left Offset]:[Right Offset]:[Entity Type]:[NER Model]|[NEL Model]" + }, "type": "array", "description": "The list of named entity mentions identified in the document.", "minItems": 0, @@ -29,31 +33,11 @@ "description": "NE type (coarse-grained and fine-grained).", "enum": [ "pers", - "pers.ind", - "pers.coll", - "pers.ind.articleauthor", "org", - "org.adm", - "org.ent", - "org.ent.pressagency", "prod", - "prod.media", - "prod.doctr", "time", - "time.date.abs", "loc", - "loc.adm.town", - "loc.adm.reg", - "loc.adm.nat", - "loc.adm.sup", - "loc.phys.geo", - "loc.phys.hydro", - "loc.phys.astro", - "loc.oro", - "loc.fac", - "loc.add.phys", - "loc.add.elec", - "loc.unk" + "unk" ] }, "surface": { @@ -99,22 +83,14 @@ "function": { "type": "string", "description":"In case of a person mention, the entity component of type 'function', as defined in the Impresso HIPE NE Annotation guidelines (https://zenodo.org/records/3585750). " - }, - "demonym": { - "type": "string", - "description":"In case of a person mention, the entity component of type 'demonym', as defined in the Impresso HIPE NE Annotation guidelines (https://zenodo.org/records/3585750). " - }, - "id": { - "type": "string", - "description": "The unique identifier of the named entity mention: [Document ID]:[Left Offset]:[Right Offset]:[Entity Type]:[NER Model]|[NEL Model]" } }, "required": [ + "id", "type", "surface", "lOffset", "rOffset", - "id" ] } } From 01af8483011d64f36e66907fece46ad90460228d Mon Sep 17 00:00:00 2001 From: Emanuela Boros Date: Wed, 23 Oct 2024 18:42:48 +0200 Subject: [PATCH 10/14] entity shcema reviewed --- json/entities/entities.schema.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/json/entities/entities.schema.json b/json/entities/entities.schema.json index b903c7e..7658f0f 100644 --- a/json/entities/entities.schema.json +++ b/json/entities/entities.schema.json @@ -36,6 +36,7 @@ "org", "prod", "time", + "date", "loc", "unk" ] @@ -90,7 +91,7 @@ "type", "surface", "lOffset", - "rOffset", + "rOffset" ] } } From 485a9f05bb88df225376fd22971d559c69545ae8 Mon Sep 17 00:00:00 2001 From: Emanuela Boros Date: Wed, 23 Oct 2024 18:56:03 +0200 Subject: [PATCH 11/14] I propose to add all types of entities and newsagency [to be also used further in the API] --- json/entities/entities.schema.json | 64 ++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/json/entities/entities.schema.json b/json/entities/entities.schema.json index 7658f0f..00b8b6e 100644 --- a/json/entities/entities.schema.json +++ b/json/entities/entities.schema.json @@ -32,12 +32,70 @@ "type": "string", "description": "NE type (coarse-grained and fine-grained).", "enum": [ - "pers", + "comp.demonym", + "comp.function", + "comp.name", + "comp.qualifier", + "comp.title", + "loc", + "loc.add.elec", + "loc.add.phys", + "loc.adm.nat", + "loc.adm.reg", + "loc.adm.sup", + "loc.adm.town", + "loc.fac", + "loc.oro", + "loc.phys.astro", + "loc.phys.geo", + "loc.phys.hydro", + "loc.unk", "org", + "org.adm", + "org.ent", + "org.ent.pressagency", + "pers", + "pers.coll", + "pers.ind", + "pers.ind.articleauthor", "prod", + "prod.doctr", + "prod.media", "time", - "date", - "loc", + "time.date.abs", + "time.hour.abs", + "org.ent.pressagency.Reuters", + "org.ent.pressagency.Stefani", + "org.ent.pressagency.Extel", + "org.ent.pressagency.Havas", + "org.ent.pressagency.Xinhua", + "org.ent.pressagency.Domei", + "org.ent.pressagency.Belga", + "org.ent.pressagency.CTK", + "org.ent.pressagency.ANSA", + "org.ent.pressagency.DNB", + "pers.ind.articleauthor", + "org.ent.pressagency.Wolff", + "org.ent.pressagency.unk", + "org.ent.pressagency.UP-UPI", + "org.ent.pressagency.ATS-SDA", + "org.ent.pressagency.DPA", + "org.ent.pressagency.AFP", + "pers.ind.articleauthor", + "org.ent.pressagency.Kipa", + "org.ent.pressagency.ag", + "org.ent.pressagency.Extel", + "org.ent.pressagency.ATS-SDA", + "org.ent.pressagency.Havas", + "org.ent.pressagency.Reuters", + "org.ent.pressagency.Xinhua", + "org.ent.pressagency.AP", + "org.ent.pressagency.APA", + "org.ent.pressagency.ANSA", + "org.ent.pressagency.DDP-DAPD", + "org.ent.pressagency.TASS", + "org.ent.pressagency.Europapress", + "org.ent.pressagency.SPK-SMP", "unk" ] }, From 699ed36d4130f5839f839cf5cb4657d833b2f074 Mon Sep 17 00:00:00 2001 From: Emanuela Boros Date: Tue, 5 Nov 2024 14:04:51 +0100 Subject: [PATCH 12/14] model_id + ci_id + ci_type (?) --- json/entities/entities.schema.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/json/entities/entities.schema.json b/json/entities/entities.schema.json index 00b8b6e..232df88 100644 --- a/json/entities/entities.schema.json +++ b/json/entities/entities.schema.json @@ -5,7 +5,7 @@ "description": "Definition of the output representation of entity processing, before indexing. Named entity mentions are expressed as offline annotations with character offsets relative to content items. Essentially, the NE output is a list of JSON documents (in json line format), where each document corresponds to a content item that has a list of NE mentions (no output for CI with no mentions). The tagset corresponds to impresso-HIPE.", "type": "object", "properties": { - "id": { + "ci_id": { "type": "string", "description": "Impresso content item ID." }, @@ -13,7 +13,7 @@ "type": "string", "description": "Timestamp of creation of the JSON file (e.g. '2024-05-26T09:48:01Z')." }, - "sys_id": { + "model_id": { "type": "string", "description": "An alias for the system or model that produced this output, used for transparency and traceability. It should include distinguishing elements like a base name, version, and language." }, @@ -145,8 +145,8 @@ } }, "required": [ - "id", - "type", + "ci_id", + "ci_type", "surface", "lOffset", "rOffset" From 80f0625ec1f2f58beb31625984c0fe37b5ab4c99 Mon Sep 17 00:00:00 2001 From: Emanuela Boros Date: Tue, 5 Nov 2024 14:06:13 +0100 Subject: [PATCH 13/14] modified entity examples --- examples/entities/example0.json | 4 ++-- examples/entities/example1.json | 4 ++-- examples/entities/example2.json | 4 ++-- json/entities/entities.schema.json | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/entities/example0.json b/examples/entities/example0.json index e3335d8..1523708 100644 --- a/examples/entities/example0.json +++ b/examples/entities/example0.json @@ -1,7 +1,7 @@ { - "id": "EXP-1888-01-09-a-i0035", + "ci_id": "EXP-1888-01-09-a-i0035", "ts": "2019-10-17T11:49:50Z", - "sys_id": "bert-fr", + "model_id": "bert-fr", "nes": [ { "type": "org.adm", diff --git a/examples/entities/example1.json b/examples/entities/example1.json index cbdeb37..cdf00c6 100644 --- a/examples/entities/example1.json +++ b/examples/entities/example1.json @@ -1,7 +1,7 @@ { - "id": "EXP-1928-05-15-a-i0009", + "ci_id": "EXP-1928-05-15-a-i0009", "ts": "2019-10-17T11:49:50Z", - "sys_id": "bert-fr", + "model_id": "bert-fr", "nes": [ { "id": "EXP-1928-05-15-a-i0009:50:99:pers.ind:bert-fr", diff --git a/examples/entities/example2.json b/examples/entities/example2.json index 39de0e7..20ca320 100644 --- a/examples/entities/example2.json +++ b/examples/entities/example2.json @@ -1,7 +1,7 @@ { - "id": "EXP-1968-02-23-a-i0262", + "ci_id": "EXP-1968-02-23-a-i0262", "ts": "2019-10-17T11:49:50Z", - "sys_id": "bert-fr", + "model_id": "bert-fr", "nes": [ { "type": "pers.ind", diff --git a/json/entities/entities.schema.json b/json/entities/entities.schema.json index 232df88..28abb3c 100644 --- a/json/entities/entities.schema.json +++ b/json/entities/entities.schema.json @@ -146,7 +146,7 @@ }, "required": [ "ci_id", - "ci_type", + "type", "surface", "lOffset", "rOffset" @@ -157,7 +157,7 @@ "required": [ "id", "ts", - "sys_id", + "model_id", "nes" ] } \ No newline at end of file From cb963f23fa83fb4342fee048040d11e59b5466c6 Mon Sep 17 00:00:00 2001 From: Emanuela Boros Date: Fri, 15 Nov 2024 16:01:03 +0100 Subject: [PATCH 14/14] added ci_type --- json/entities/entities.schema.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/json/entities/entities.schema.json b/json/entities/entities.schema.json index 28abb3c..f11b0d1 100644 --- a/json/entities/entities.schema.json +++ b/json/entities/entities.schema.json @@ -9,6 +9,11 @@ "type": "string", "description": "Impresso content item ID." }, + "ci_type": + { + "type": "string", + "description": "Impresso content item type." + }, "ts": { "type": "string", "description": "Timestamp of creation of the JSON file (e.g. '2024-05-26T09:48:01Z')."