Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

conllu2folia + folia2annotatedtxt #44

Open
jwijffels opened this issue Aug 25, 2021 · 0 comments
Open

conllu2folia + folia2annotatedtxt #44

jwijffels opened this issue Aug 25, 2021 · 0 comments

Comments

@jwijffels
Copy link

I'm testing the conversion from CoNLL-U to FoLiA (conllu2folia), followed by the conversion from FoLiA to annotated text (folia2annotatedtxt), using the following CoNLL-U file called traindata.conllu:

# newdoc id = doc1
# newpar
# sent_id = 1
# text = Ik ben de weg kwijt, kunt u me zeggen waar de Lange Wapper ligt?
1	Ik	ik	PRON	Pron|per|1|ev|nom	Case=Nom|Number=Sing|Person=1|PronType=Prs	5	nsubj	_	_
2	ben	ben	AUX	V|hulpofkopp|ott|1|ev	Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin	5	cop	_	_
3	de	de	DET	Art|bep|zijdofmv|neut	Definite=Def|PronType=Art	4	det	_	_
4	weg	weg	NOUN	N|soort|ev|neut	Number=Sing	5	obj	_	_
5	kwijt	kwijt	ADJ	Adj|attr|stell|onverv	Degree=Pos	0	root	_	SpaceAfter=No
6	,	,	PUNCT	Punc|komma	PunctType=Comm	5	punct	_	_
7	kunt	kan	VERB	V|hulp|ott|2|ev	Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin|VerbType=Mod	5	parataxis	_	_
8	u	u	PRON	Pron|per|2|ev|nom	Case=Nom|Number=Sing|Person=2|PronType=Prs	7	nsubj	_	_
9	me	me	PRON	Pron|per|1|ev|datofacc	Case=Acc,Dat|Number=Sing|Person=1|PronType=Prs	10	obj	_	_
10	zeggen	zeg	VERB	V|trans|inf	Subcat=Tran|VerbForm=Inf	7	xcomp	_	_
11	waar	waar	ADV	Adv|gew|vrag	PronType=Int	15	mark	_	_
12	de	de	DET	Art|bep|zijdofmv|neut	Definite=Def|PronType=Art	13	det	_	_
13	Lange	Lange	PROPN	N_N|eigen|ev|neut_eigen|ev|neut	_	15	nsubj	_	_
14	Wapper	Wapper	PROPN	PROPN	_	13	flat	_	_
15	ligt	lig	VERB	V|intrans|ott|3|ev	Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Intr|Tense=Pres|VerbForm=Fin	10	acl	_	SpaceAfter=No
16	?	?	PUNCT	Punc|vraag	PunctType=Qest	5	punct	_	_

# sent_id = 2
# text = Jazeker meneer
1	Jazeker	zeker	ADJ	Adj|attr|stell|onverv	Degree=Pos	2	amod	_	_
2	meneer	meneer	NOUN	N|soort|ev|neut	Number=Sing	0	root	_	SpacesAfter=\n

# newdoc id = doc2
# newpar
# sent_id = 1
# text = Het gaat vooruit, het gaat verbazend goed vooruit
1	Het	het	PRON	Pron|onbep|neut|zelfst	PronType=Ind	2	nsubj	_	_
2	gaat	ga	VERB	V|intrans|ott|3|ev	Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Intr|Tense=Pres|VerbForm=Fin	0	root	_	_
3	vooruit	vooruit	ADV	Adv|gew|geenfunc|stell|onverv	Degree=Pos	2	advmod	_	SpaceAfter=No
4	,	,	PUNCT	Punc|komma	PunctType=Comm	2	punct	_	_
5	het	het	PRON	Pron|onbep|neut|zelfst	PronType=Ind	6	nsubj	_	_
6	gaat	ga	VERB	V|intrans|ott|3|ev	Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Subcat=Intr|Tense=Pres|VerbForm=Fin	2	parataxis	_	_
7	verbazend	verbazend	VERB	V|intrans|tegdw|onverv	Subcat=Intr|Tense=Pres|VerbForm=Part	6	advcl	_	_
8	goed	goed	ADJ	Adj|adv|stell|onverv	Degree=Pos|Variant=Short	6	obl	_	_
9	vooruit	vooruit	ADV	Adv|gew|geenfunc|stell|onverv	Degree=Pos	6	compound:prt	_	SpacesAfter=\n
Jan@bnosac MINGW64 ~/Dropbox/Work/RForgeBNOSAC/BNOSAC/udpipe/inst/dummydata (master)
$ conllu2folia traindata.conllu
Wrote doc1.folia.xml
Wrote doc2.folia.xml

Jan@bnosac MINGW64 ~/Dropbox/Work/RForgeBNOSAC/BNOSAC/udpipe/inst/dummydata (master)
$ folia2annotatedtxt -c text,pos,lemma doc1.folia.xml > test.tmp
Processing doc1.folia.xml
Traceback (most recent call last):
  File "c:\python39\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\python39\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Python39\Scripts\folia2annotatedtxt.exe\__main__.py", line 7, in <module>
  File "c:\python39\lib\site-packages\foliatools\folia2annotatedtxt.py", line 117, in main
    process(x, outputfile)
  File "c:\python39\lib\site-packages\foliatools\folia2annotatedtxt.py", line 174, in process
    if w.paragraph() != prevpar and i > 0:
  File "c:\python39\lib\site-packages\folia\main.py", line 3844, in paragraph
    return self.ancestor(Paragraph)
  File "c:\python39\lib\site-packages\folia\main.py", line 2528, in ancestor
    raise NoSuchAnnotation
folia.main.NoSuchAnnotation

The file causing the failure (doc1.folia.xml) looks like this:

<?xml version='1.0' encoding='utf-8'?>
<FoLiA xmlns="http://ilk.uvt.nl/folia" xmlns:xlink="http://www.w3.org/1999/xlink" xml:id="doc1" version="2.5.1" generator="foliapy-v2.5.6">
  <metadata type="native">
    <annotations>
      <text-annotation set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/text.foliaset.ttl">
        <annotator processor="proc.conllu2folia.5e385a4e"/>
      </text-annotation>
      <sentence-annotation>
        <annotator processor="proc.conllu2folia.5e385a4e"/>
      </sentence-annotation>
      <token-annotation>
        <annotator processor="proc.conllu2folia.5e385a4e"/>
      </token-annotation>
      <pos-annotation set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl">
        <annotator processor="proc.conllu2folia.5e385a4e"/>
      </pos-annotation>
      <pos-annotation set="undefined">
        <annotator processor="proc.conllu2folia.5e385a4e"/>
      </pos-annotation>
      <lemma-annotation set="undefined">
        <annotator processor="proc.conllu2folia.5e385a4e"/>
      </lemma-annotation>
      <dependency-annotation set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-dependencies.foliaset.ttl">
        <annotator processor="proc.conllu2folia.5e385a4e"/>
      </dependency-annotation>
    </annotations>
    <provenance>
      <processor xml:id="proc.conllu2folia.5e385a4e" name="conllu2folia" type="auto" version="2.5.2" folia_version="2.5.1" command="conllu2folia traindata.conllu" host="bnosac" begindatetime="2021-08-25T17:34:37">
        <processor xml:id="proc.conllu2folia.5e385a4e.generator" name="foliapy" type="generator" version="2.5.6" folia_version="2.5.1" src="https://github.com/proycon/foliapy"/>
      </processor>
    </provenance>
  </metadata>
  <text xml:id="doc1.text">
    <s xml:id="doc1.s.1">
      <t class="original">Ik ben de weg kwijt, kunt u me zeggen waar de Lange Wapper ligt?</t>
      <w xml:id="doc1.s.1.w.1">
        <t>Ik</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="PRON">
          <feat subset="Case" class="Nom"/>
          <feat subset="Number" class="Sing"/>
          <feat subset="Person" class="1"/>
          <feat subset="PronType" class="Prs"/>
        </pos>
        <pos set="undefined" class="Pron|per|1|ev|nom"/>
        <lemma class="ik"/>
      </w>
      <w xml:id="doc1.s.1.w.2">
        <t>ben</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="AUX">
          <feat subset="Aspect" class="Imp"/>
          <feat subset="Mood" class="Ind"/>
          <feat subset="Number" class="Sing"/>
          <feat subset="Person" class="1"/>
          <feat subset="Tense" class="Pres"/>
          <feat subset="VerbForm" class="Fin"/>
        </pos>
        <pos set="undefined" class="V|hulpofkopp|ott|1|ev"/>
        <lemma class="ben"/>
      </w>
      <w xml:id="doc1.s.1.w.3">
        <t>de</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="DET">
          <feat subset="Definite" class="Def"/>
          <feat subset="PronType" class="Art"/>
        </pos>
        <pos set="undefined" class="Art|bep|zijdofmv|neut"/>
        <lemma class="de"/>
      </w>
      <w xml:id="doc1.s.1.w.4">
        <t>weg</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="NOUN">
          <feat subset="Number" class="Sing"/>
        </pos>
        <pos set="undefined" class="N|soort|ev|neut"/>
        <lemma class="weg"/>
      </w>
      <w xml:id="doc1.s.1.w.5" space="no">
        <t>kwijt</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="ADJ">
          <feat subset="Degree" class="Pos"/>
        </pos>
        <pos set="undefined" class="Adj|attr|stell|onverv"/>
        <lemma class="kwijt"/>
      </w>
      <w xml:id="doc1.s.1.w.6">
        <t>,</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="PUNCT">
          <feat subset="PunctType" class="Comm"/>
        </pos>
        <pos set="undefined" class="Punc|komma"/>
        <lemma class=","/>
      </w>
      <w xml:id="doc1.s.1.w.7">
        <t>kunt</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="VERB">
          <feat subset="Aspect" class="Imp"/>
          <feat subset="Mood" class="Ind"/>
          <feat subset="Number" class="Sing"/>
          <feat subset="Person" class="2"/>
          <feat subset="Tense" class="Pres"/>
          <feat subset="VerbForm" class="Fin"/>
          <feat subset="VerbType" class="Mod"/>
        </pos>
        <pos set="undefined" class="V|hulp|ott|2|ev"/>
        <lemma class="kan"/>
      </w>
      <w xml:id="doc1.s.1.w.8">
        <t>u</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="PRON">
          <feat subset="Case" class="Nom"/>
          <feat subset="Number" class="Sing"/>
          <feat subset="Person" class="2"/>
          <feat subset="PronType" class="Prs"/>
        </pos>
        <pos set="undefined" class="Pron|per|2|ev|nom"/>
        <lemma class="u"/>
      </w>
      <w xml:id="doc1.s.1.w.9">
        <t>me</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="PRON">
          <feat subset="Case" class="Acc,Dat"/>
          <feat subset="Number" class="Sing"/>
          <feat subset="Person" class="1"/>
          <feat subset="PronType" class="Prs"/>
        </pos>
        <pos set="undefined" class="Pron|per|1|ev|datofacc"/>
        <lemma class="me"/>
      </w>
      <w xml:id="doc1.s.1.w.10">
        <t>zeggen</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="VERB">
          <feat subset="Subcat" class="Tran"/>
          <feat subset="VerbForm" class="Inf"/>
        </pos>
        <pos set="undefined" class="V|trans|inf"/>
        <lemma class="zeg"/>
      </w>
      <w xml:id="doc1.s.1.w.11">
        <t>waar</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="ADV">
          <feat subset="PronType" class="Int"/>
        </pos>
        <pos set="undefined" class="Adv|gew|vrag"/>
        <lemma class="waar"/>
      </w>
      <w xml:id="doc1.s.1.w.12">
        <t>de</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="DET">
          <feat subset="Definite" class="Def"/>
          <feat subset="PronType" class="Art"/>
        </pos>
        <pos set="undefined" class="Art|bep|zijdofmv|neut"/>
        <lemma class="de"/>
      </w>
      <w xml:id="doc1.s.1.w.13">
        <t>Lange</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="PROPN"/>
        <pos set="undefined" class="N_N|eigen|ev|neut_eigen|ev|neut"/>
        <lemma class="Lange"/>
      </w>
      <w xml:id="doc1.s.1.w.14">
        <t>Wapper</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="PROPN"/>
        <pos set="undefined" class="PROPN"/>
        <lemma class="Wapper"/>
      </w>
      <w xml:id="doc1.s.1.w.15" space="no">
        <t>ligt</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="VERB">
          <feat subset="Aspect" class="Imp"/>
          <feat subset="Mood" class="Ind"/>
          <feat subset="Number" class="Sing"/>
          <feat subset="Person" class="3"/>
          <feat subset="Subcat" class="Intr"/>
          <feat subset="Tense" class="Pres"/>
          <feat subset="VerbForm" class="Fin"/>
        </pos>
        <pos set="undefined" class="V|intrans|ott|3|ev"/>
        <lemma class="lig"/>
      </w>
      <w xml:id="doc1.s.1.w.16">
        <t>?</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="PUNCT">
          <feat subset="PunctType" class="Qest"/>
        </pos>
        <pos set="undefined" class="Punc|vraag"/>
        <lemma class="?"/>
      </w>
      <dependencies>
        <dependency class="nsubj">
          <dep>
            <wref id="doc1.s.1.w.1" t="Ik"/>
          </dep>
          <hd>
            <wref id="doc1.s.1.w.5" t="kwijt"/>
          </hd>
        </dependency>
        <dependency class="cop">
          <dep>
            <wref id="doc1.s.1.w.2" t="ben"/>
          </dep>
          <hd>
            <wref id="doc1.s.1.w.5" t="kwijt"/>
          </hd>
        </dependency>
        <dependency class="det">
          <dep>
            <wref id="doc1.s.1.w.3" t="de"/>
          </dep>
          <hd>
            <wref id="doc1.s.1.w.4" t="weg"/>
          </hd>
        </dependency>
        <dependency class="obj">
          <dep>
            <wref id="doc1.s.1.w.4" t="weg"/>
          </dep>
          <hd>
            <wref id="doc1.s.1.w.5" t="kwijt"/>
          </hd>
        </dependency>
        <dependency class="punct">
          <hd>
            <wref id="doc1.s.1.w.5" t="kwijt"/>
          </hd>
          <dep>
            <wref id="doc1.s.1.w.6" t=","/>
          </dep>
        </dependency>
        <dependency class="parataxis">
          <hd>
            <wref id="doc1.s.1.w.5" t="kwijt"/>
          </hd>
          <dep>
            <wref id="doc1.s.1.w.7" t="kunt"/>
          </dep>
        </dependency>
        <dependency class="nsubj">
          <hd>
            <wref id="doc1.s.1.w.7" t="kunt"/>
          </hd>
          <dep>
            <wref id="doc1.s.1.w.8" t="u"/>
          </dep>
        </dependency>
        <dependency class="obj">
          <dep>
            <wref id="doc1.s.1.w.9" t="me"/>
          </dep>
          <hd>
            <wref id="doc1.s.1.w.10" t="zeggen"/>
          </hd>
        </dependency>
        <dependency class="xcomp">
          <hd>
            <wref id="doc1.s.1.w.7" t="kunt"/>
          </hd>
          <dep>
            <wref id="doc1.s.1.w.10" t="zeggen"/>
          </dep>
        </dependency>
        <dependency class="mark">
          <dep>
            <wref id="doc1.s.1.w.11" t="waar"/>
          </dep>
          <hd>
            <wref id="doc1.s.1.w.15" t="ligt"/>
          </hd>
        </dependency>
        <dependency class="det">
          <dep>
            <wref id="doc1.s.1.w.12" t="de"/>
          </dep>
          <hd>
            <wref id="doc1.s.1.w.13" t="Lange"/>
          </hd>
        </dependency>
        <dependency class="nsubj">
          <dep>
            <wref id="doc1.s.1.w.13" t="Lange"/>
          </dep>
          <hd>
            <wref id="doc1.s.1.w.15" t="ligt"/>
          </hd>
        </dependency>
        <dependency class="flat">
          <hd>
            <wref id="doc1.s.1.w.13" t="Lange"/>
          </hd>
          <dep>
            <wref id="doc1.s.1.w.14" t="Wapper"/>
          </dep>
        </dependency>
        <dependency class="acl">
          <hd>
            <wref id="doc1.s.1.w.10" t="zeggen"/>
          </hd>
          <dep>
            <wref id="doc1.s.1.w.15" t="ligt"/>
          </dep>
        </dependency>
        <dependency class="punct">
          <hd>
            <wref id="doc1.s.1.w.5" t="kwijt"/>
          </hd>
          <dep>
            <wref id="doc1.s.1.w.16" t="?"/>
          </dep>
        </dependency>
      </dependencies>
    </s>
    <s xml:id="doc1.s.2">
      <t class="original">Jazeker meneer</t>
      <w xml:id="doc1.s.2.w.1">
        <t>Jazeker</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="ADJ">
          <feat subset="Degree" class="Pos"/>
        </pos>
        <pos set="undefined" class="Adj|attr|stell|onverv"/>
        <lemma class="zeker"/>
      </w>
      <w xml:id="doc1.s.2.w.2">
        <t>meneer</t>
        <pos set="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/universal-pos.foliaset.ttl" class="NOUN">
          <feat subset="Number" class="Sing"/>
        </pos>
        <pos set="undefined" class="N|soort|ev|neut"/>
        <lemma class="meneer"/>
      </w>
      <dependencies>
        <dependency class="amod">
          <dep>
            <wref id="doc1.s.2.w.1" t="Jazeker"/>
          </dep>
          <hd>
            <wref id="doc1.s.2.w.2" t="meneer"/>
          </hd>
        </dependency>
      </dependencies>
    </s>
  </text>
</FoLiA>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant