Merge pull request #3595 from flairNLP/documents_as_sentences
Add options to load full documents as Sentence objects
alanakbik authored Jan 11, 2025
2 parents c9ece7d + 06a5c0c commit ba41223
Showing 5 changed files with 213 additions and 1 deletion.
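In short, ColumnCorpus (and the underlying ColumnDataset) gains two new flags: documents_as_sentences loads everything between document separator tokens as one Sentence object, and every_sentence_is_independent marks each sentence as its own document boundary. A minimal usage sketch, mirroring the new test added below (the dataset path is a placeholder, not part of this change):

import flair.datasets

# load each -DOCSTART--delimited document as a single Sentence object
corpus = flair.datasets.ColumnCorpus(
    "path/to/column_corpus",  # placeholder: any CoNLL-style column-formatted dataset
    column_format={0: "text", 1: "ner"},
    document_separator_token="-DOCSTART-",
    documents_as_sentences=True,
)

# or keep per-sentence segmentation, but treat every sentence as its own
# document so that no cross-sentence context is used
corpus = flair.datasets.ColumnCorpus(
    "path/to/column_corpus",
    column_format={0: "text", 1: "ner"},
    every_sentence_is_independent=True,
)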
31 changes: 30 additions & 1 deletion flair/datasets/sequence_labeling.py
@@ -326,6 +326,8 @@ def __init__(
    label_name_map: Optional[dict[str, str]] = None,
    banned_sentences: Optional[list[str]] = None,
    default_whitespace_after: int = 1,
    every_sentence_is_independent: bool = False,
    documents_as_sentences: bool = False,
    **corpusargs,
) -> None:
    r"""Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.
@@ -361,6 +363,8 @@ def __init__(
        skip_first_line=skip_first_line,
        label_name_map=label_name_map,
        default_whitespace_after=default_whitespace_after,
        every_sentence_is_independent=every_sentence_is_independent,
        documents_as_sentences=documents_as_sentences,
    )
    for train_file in train_files
]
@@ -385,6 +389,8 @@ def __init__(
        skip_first_line=skip_first_line,
        label_name_map=label_name_map,
        default_whitespace_after=default_whitespace_after,
        every_sentence_is_independent=every_sentence_is_independent,
        documents_as_sentences=documents_as_sentences,
    )
    for test_file in test_files
]
@@ -409,6 +415,8 @@ def __init__(
        skip_first_line=skip_first_line,
        label_name_map=label_name_map,
        default_whitespace_after=default_whitespace_after,
        every_sentence_is_independent=every_sentence_is_independent,
        documents_as_sentences=documents_as_sentences,
    )
    for dev_file in dev_files
]
@@ -481,10 +489,12 @@ def __init__(
    banned_sentences: Optional[list[str]] = None,
    in_memory: bool = True,
    document_separator_token: Optional[str] = None,
    every_sentence_is_independent: bool = False,
    encoding: str = "utf-8",
    skip_first_line: bool = False,
    label_name_map: Optional[dict[str, str]] = None,
    default_whitespace_after: int = 1,
    documents_as_sentences: bool = False,
) -> None:
    r"""Instantiates a column dataset.
@@ -505,9 +515,17 @@ def __init__(
    self.column_delimiter = re.compile(column_delimiter)
    self.comment_symbol = comment_symbol
    self.document_separator_token = document_separator_token
    self.every_sentence_is_independent = every_sentence_is_independent
    self.label_name_map = label_name_map
    self.banned_sentences = banned_sentences
    self.default_whitespace_after = default_whitespace_after
    self.documents_as_sentences = documents_as_sentences

    # documents_as_sentences needs a separator token to know where documents end
    if documents_as_sentences and not document_separator_token:
        log.error(
            "documents_as_sentences was set to True, but no document_separator_token was provided. Please set "
            "a value for document_separator_token in order to enable the documents_as_sentences functionality."
        )

    # store either Sentence objects in memory, or only file offsets
    self.in_memory = in_memory
@@ -702,6 +720,9 @@ def _convert_lines_to_sentence(
    if sentence.to_original_text() == self.document_separator_token:
        sentence.is_document_boundary = True

    # if every sentence is independent, or whole documents are loaded as single
    # sentences, mark each resulting Sentence as its own document boundary
    if self.every_sentence_is_independent or self.documents_as_sentences:
        sentence.is_document_boundary = True

    # add span labels
    if span_level_tag_columns:
        for span_column in span_level_tag_columns:
@@ -818,6 +839,13 @@ def _remap_label(self, tag):
    return tag

def __line_completes_sentence(self, line: str) -> bool:
    # when loading whole documents as single Sentence objects, only the
    # document separator token completes a "sentence"; blank lines do not
    if self.documents_as_sentences and self.document_separator_token:
        return line.startswith(self.document_separator_token)

    sentence_completed = line.isspace() or line == ""
    return sentence_completed

@@ -5035,7 +5063,8 @@ def __init__(
    test_file=None,
    column_format=columns,
    in_memory=in_memory,
-   sample_missing_splits=False,  # No test data is available, so do not shrink dev data for shared task preparation!
+   sample_missing_splits=False,
+   # No test data is available, so do not shrink dev data for shared task preparation!
    **corpusargs,
)
corpora.append(corpus)
37 changes: 37 additions & 0 deletions (new test fixture)
@@ -0,0 +1,37 @@
this O
is O
New B-LOC
York I-LOC

here O
is O
New B-LOC
York I-LOC

I O
like O
New B-LOC
York I-LOC

we O
like O
New B-LOC
York I-LOC

-DOCSTART-

this O
is O
Berlin B-LOC

here O
is O
Berlin B-LOC

I O
like O
Berlin B-LOC

we O
like O
Berlin B-LOC
39 changes: 39 additions & 0 deletions (new test fixture)
@@ -0,0 +1,39 @@
this O
is O
New B-LOC
York I-LOC

here O
is O
New B-LOC
York I-LOC

I O
like O
New B-LOC
York I-LOC

we O
like O
New B-LOC
York I-LOC

-DOCSTART-

this O
is O
Berlin B-LOC

here O
is O
Berlin B-LOC

I O
like O
Berlin B-LOC

we O
like O
Berlin B-LOC

-DOCSTART-
59 changes: 59 additions & 0 deletions (new test fixture)
@@ -0,0 +1,59 @@
this O
is O
New B-LOC
York I-LOC

here O
is O
New B-LOC
York I-LOC

I O
like O
New B-LOC
York I-LOC

we O
like O
New B-LOC
York I-LOC

-DOCSTART-

this O
is O
Berlin B-LOC

here O
is O
Berlin B-LOC

I O
like O
Berlin B-LOC

we O
like O
Berlin B-LOC

-DOCSTART-

this O
is O
New B-LOC
York I-LOC

here O
is O
New B-LOC
York I-LOC

I O
like O
New B-LOC
York I-LOC

we O
like O
New B-LOC
York I-LOC
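Counting blank-line-separated blocks (each -DOCSTART- line loads as its own single-token sentence), the three fixture files above contain 9, 10, and 14 sentences respectively, which matches the dev, test, and train assertions in the new test below.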
48 changes: 48 additions & 0 deletions tests/test_datasets.py
@@ -75,6 +75,54 @@ def test_load_sequence_labeling_data(tasks_base_path):
    assert len(corpus.test) == 1


def test_load_sequence_labeling_data_with_boundaries(tasks_base_path):
    # get training, test and dev data
    corpus = flair.datasets.ColumnCorpus(
        tasks_base_path / "trivial" / "trivial_bioes_with_boundaries", column_format={0: "text", 1: "ner"}
    )

    assert len(corpus.train) == 14
    assert len(corpus.dev) == 9
    assert len(corpus.test) == 10

    # now exclude -DOCSTART- sentences
    corpus = flair.datasets.ColumnCorpus(
        tasks_base_path / "trivial" / "trivial_bioes_with_boundaries",
        column_format={0: "text", 1: "ner"},
        banned_sentences=["-DOCSTART-"],
    )

    assert len(corpus.train) == 12
    assert len(corpus.dev) == 8
    assert len(corpus.test) == 8

    assert len(corpus.train[0].right_context(5)) == 5

    # now load whole documents as sentences
    corpus = flair.datasets.ColumnCorpus(
        tasks_base_path / "trivial" / "trivial_bioes_with_boundaries",
        column_format={0: "text", 1: "ner"},
        document_separator_token="-DOCSTART-",
        documents_as_sentences=True,
    )

    assert len(corpus.train) == 3
    assert len(corpus.dev) == 2
    assert len(corpus.test) == 2

    assert len(corpus.train[0].right_context(5)) == 0

    # ban each boundary but set each sentence to be independent
    corpus = flair.datasets.ColumnCorpus(
        tasks_base_path / "trivial" / "trivial_bioes_with_boundaries",
        column_format={0: "text", 1: "ner"},
        banned_sentences=["-DOCSTART-"],
        every_sentence_is_independent=True,
    )

    assert len(corpus.train[0].right_context(5)) == 0
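The right_context assertions pin down the intended effect of the new flags: with default loading, the first training sentence can still draw five tokens of right context from the sentence that follows it; once documents_as_sentences or every_sentence_is_independent is set, the sentence is marked as a document boundary and right_context(5) returns nothing.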


def test_load_sequence_labeling_whitespace_after(tasks_base_path):
    # get training, test and dev data
    corpus = flair.datasets.ColumnCorpus(
