Merge pull request #3595 from flairNLP/documents_as_sentences
Add options to load full documents as Sentence objects
alanakbik authored Jan 11, 2025
2 parents c9ece7d + 06a5c0c commit ba41223
Showing 5 changed files with 213 additions and 1 deletion.
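In short, ColumnCorpus (and the underlying ColumnDataset) gains two new flags: documents_as_sentences loads everything between document separator tokens as one Sentence object, and every_sentence_is_independent marks each sentence as its own document boundary. A minimal usage sketch, mirroring the new test added below (the dataset path is a placeholder, not part of this change):

import flair.datasets

# load each -DOCSTART--delimited document as a single Sentence object
corpus = flair.datasets.ColumnCorpus(
    "path/to/column_corpus",  # placeholder: any CoNLL-style column-formatted dataset
    column_format={0: "text", 1: "ner"},
    document_separator_token="-DOCSTART-",
    documents_as_sentences=True,
)

# or keep per-sentence segmentation, but treat every sentence as its own
# document so that no cross-sentence context is used
corpus = flair.datasets.ColumnCorpus(
    "path/to/column_corpus",
    column_format={0: "text", 1: "ner"},
    every_sentence_is_independent=True,
)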
31 changes: 30 additions & 1 deletion flair/datasets/sequence_labeling.py
@@ -326,6 +326,8 @@ def __init__(
    label_name_map: Optional[dict[str, str]] = None,
    banned_sentences: Optional[list[str]] = None,
    default_whitespace_after: int = 1,
    every_sentence_is_independent: bool = False,
    documents_as_sentences: bool = False,
    **corpusargs,
) -> None:
    r"""Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.
@@ -361,6 +363,8 @@ def __init__(
        skip_first_line=skip_first_line,
        label_name_map=label_name_map,
        default_whitespace_after=default_whitespace_after,
        every_sentence_is_independent=every_sentence_is_independent,
        documents_as_sentences=documents_as_sentences,
    )
    for train_file in train_files
]
@@ -385,6 +389,8 @@ def __init__(
        skip_first_line=skip_first_line,
        label_name_map=label_name_map,
        default_whitespace_after=default_whitespace_after,
        every_sentence_is_independent=every_sentence_is_independent,
        documents_as_sentences=documents_as_sentences,
    )
    for test_file in test_files
]
@@ -409,6 +415,8 @@ def __init__(
        skip_first_line=skip_first_line,
        label_name_map=label_name_map,
        default_whitespace_after=default_whitespace_after,
        every_sentence_is_independent=every_sentence_is_independent,
        documents_as_sentences=documents_as_sentences,
    )
    for dev_file in dev_files
]
@@ -481,10 +489,12 @@ def __init__(
    banned_sentences: Optional[list[str]] = None,
    in_memory: bool = True,
    document_separator_token: Optional[str] = None,
    every_sentence_is_independent: bool = False,
    encoding: str = "utf-8",
    skip_first_line: bool = False,
    label_name_map: Optional[dict[str, str]] = None,
    default_whitespace_after: int = 1,
    documents_as_sentences: bool = False,
) -> None:
    r"""Instantiates a column dataset.
@@ -505,9 +515,17 @@ def __init__(
    self.column_delimiter = re.compile(column_delimiter)
    self.comment_symbol = comment_symbol
    self.document_separator_token = document_separator_token
    self.every_sentence_is_independent = every_sentence_is_independent
    self.label_name_map = label_name_map
    self.banned_sentences = banned_sentences
    self.default_whitespace_after = default_whitespace_after
    self.documents_as_sentences = documents_as_sentences

    # documents_as_sentences needs a separator token to know where documents end
    if documents_as_sentences and not document_separator_token:
        log.error(
            "documents_as_sentences was set to True, but no document_separator_token was provided. Please set "
            "a value for document_separator_token in order to enable the documents_as_sentences functionality."
        )

    # store either Sentence objects in memory, or only file offsets
    self.in_memory = in_memory
@@ -702,6 +720,9 @@ def _convert_lines_to_sentence(
    if sentence.to_original_text() == self.document_separator_token:
        sentence.is_document_boundary = True

    # if every sentence is independent, or whole documents are loaded as single
    # sentences, mark each resulting Sentence as its own document boundary
    if self.every_sentence_is_independent or self.documents_as_sentences:
        sentence.is_document_boundary = True

    # add span labels
    if span_level_tag_columns:
        for span_column in span_level_tag_columns:
@@ -818,6 +839,13 @@ def _remap_label(self, tag):
    return tag

def __line_completes_sentence(self, line: str) -> bool:
    # when loading whole documents as single Sentence objects, only the
    # document separator token completes a "sentence"; blank lines do not
    if self.documents_as_sentences and self.document_separator_token:
        return line.startswith(self.document_separator_token)

    sentence_completed = line.isspace() or line == ""
    return sentence_completed

@@ -5035,7 +5063,8 @@ def __init__(
    test_file=None,
    column_format=columns,
    in_memory=in_memory,
-   sample_missing_splits=False,  # No test data is available, so do not shrink dev data for shared task preparation!
+   sample_missing_splits=False,
+   # No test data is available, so do not shrink dev data for shared task preparation!
    **corpusargs,
)
corpora.append(corpus)
37 changes: 37 additions & 0 deletions (new test fixture)
@@ -0,0 +1,37 @@
this O
is O
New B-LOC
York I-LOC

here O
is O
New B-LOC
York I-LOC

I O
like O
New B-LOC
York I-LOC

we O
like O
New B-LOC
York I-LOC

-DOCSTART-

this O
is O
Berlin B-LOC

here O
is O
Berlin B-LOC

I O
like O
Berlin B-LOC

we O
like O
Berlin B-LOC
39 changes: 39 additions & 0 deletions (new test fixture)
@@ -0,0 +1,39 @@
this O
is O
New B-LOC
York I-LOC

here O
is O
New B-LOC
York I-LOC

I O
like O
New B-LOC
York I-LOC

we O
like O
New B-LOC
York I-LOC

-DOCSTART-

this O
is O
Berlin B-LOC

here O
is O
Berlin B-LOC

I O
like O
Berlin B-LOC

we O
like O
Berlin B-LOC

-DOCSTART-
59 changes: 59 additions & 0 deletions (new test fixture)
@@ -0,0 +1,59 @@
this O
is O
New B-LOC
York I-LOC

here O
is O
New B-LOC
York I-LOC

I O
like O
New B-LOC
York I-LOC

we O
like O
New B-LOC
York I-LOC

-DOCSTART-

this O
is O
Berlin B-LOC

here O
is O
Berlin B-LOC

I O
like O
Berlin B-LOC

we O
like O
Berlin B-LOC

-DOCSTART-

this O
is O
New B-LOC
York I-LOC

here O
is O
New B-LOC
York I-LOC

I O
like O
New B-LOC
York I-LOC

we O
like O
New B-LOC
York I-LOC
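Counting blank-line-separated blocks (each -DOCSTART- line loads as its own single-token sentence), the three fixture files above contain 9, 10, and 14 sentences respectively, which matches the dev, test, and train assertions in the new test below.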
48 changes: 48 additions & 0 deletions tests/test_datasets.py
@@ -75,6 +75,54 @@ def test_load_sequence_labeling_data(tasks_base_path):
    assert len(corpus.test) == 1


def test_load_sequence_labeling_data_with_boundaries(tasks_base_path):
    # get training, test and dev data
    corpus = flair.datasets.ColumnCorpus(
        tasks_base_path / "trivial" / "trivial_bioes_with_boundaries", column_format={0: "text", 1: "ner"}
    )

    assert len(corpus.train) == 14
    assert len(corpus.dev) == 9
    assert len(corpus.test) == 10

    # now exclude -DOCSTART- sentences
    corpus = flair.datasets.ColumnCorpus(
        tasks_base_path / "trivial" / "trivial_bioes_with_boundaries",
        column_format={0: "text", 1: "ner"},
        banned_sentences=["-DOCSTART-"],
    )

    assert len(corpus.train) == 12
    assert len(corpus.dev) == 8
    assert len(corpus.test) == 8

    assert len(corpus.train[0].right_context(5)) == 5

    # now load whole documents as sentences
    corpus = flair.datasets.ColumnCorpus(
        tasks_base_path / "trivial" / "trivial_bioes_with_boundaries",
        column_format={0: "text", 1: "ner"},
        document_separator_token="-DOCSTART-",
        documents_as_sentences=True,
    )

    assert len(corpus.train) == 3
    assert len(corpus.dev) == 2
    assert len(corpus.test) == 2

    assert len(corpus.train[0].right_context(5)) == 0

    # ban each boundary but set each sentence to be independent
    corpus = flair.datasets.ColumnCorpus(
        tasks_base_path / "trivial" / "trivial_bioes_with_boundaries",
        column_format={0: "text", 1: "ner"},
        banned_sentences=["-DOCSTART-"],
        every_sentence_is_independent=True,
    )

    assert len(corpus.train[0].right_context(5)) == 0
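The right_context assertions pin down the intended effect of the new flags: with default loading, the first training sentence can still draw five tokens of right context from the sentence that follows it; once documents_as_sentences or every_sentence_is_independent is set, the sentence is marked as a document boundary and right_context(5) returns nothing.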


def test_load_sequence_labeling_whitespace_after(tasks_base_path):
    # get training, test and dev data
    corpus = flair.datasets.ColumnCorpus(
