From d88fb8d7ad0167f3da10b638f9b62d395788486d Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Thu, 20 Jun 2024 11:49:27 +0300 Subject: [PATCH 1/7] change fasttext and pycld2 requirements optional --- docs/installation.md | 9 +++++++-- setup.py | 20 ++++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 92e86b3..e04de8b 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -18,14 +18,12 @@ versions from 3.8 to 3.11. * beautifulsoup4 * opus-fast-mosestokenizer -* fasttext * graphviz * langid * matplotlib * morfessor * OpusTools * pandas -* pycld2 * rapidfuzz * ruamel.yaml * regex @@ -41,6 +39,13 @@ See `setup.py` for possible version requirements. ## Optional libraries and tools +### FastText and PyCLD2 language detection + +* fasttext +* pycld2 + +These libararies are no longer updated? + ### Jieba and MeCab word segmentation For Chinese tokenization (word segmentation), you can use the diff --git a/setup.py b/setup.py index 183b980..a39f586 100644 --- a/setup.py +++ b/setup.py @@ -8,14 +8,12 @@ "numpy<2.0.0", "opustools", "beautifulsoup4>=4.8.0", - "fasttext", "graphviz", "langid", "matplotlib", "morfessor", "opus-fast-mosestokenizer>=0.0.8.5", "pandas>=1.0.0", - "pycld2", "xxhash>=3.2.0", "sentence-splitter", "rapidfuzz", @@ -28,6 +26,14 @@ "lingua-language-detector>=1.3.0" ] +pycld2_require = [ + "pycld2" +] + +fasttext_require = [ + "fasttext" +] + eflomal_require = [ 'eflomal>=2.0.0' ] @@ -60,7 +66,8 @@ 'sphinxcontrib-bibtex' ] -all_require = eflomal_require + jieba_require + mecab_require + laser_require + varikn_require + tests_require + docs_require +all_require = pycld2_require + fasttext_require + eflomal_require + jieba_require + \ + mecab_require + laser_require + varikn_require + tests_require + docs_require setuptools.setup( name="opusfilter", @@ -78,9 +85,10 @@ "bin/opusfilter-scores", "bin/opusfilter-test"], install_requires=install_requires, 
tests_require=tests_require, - extras_require={'test': tests_require, 'eflomal': eflomal_require, 'jieba': jieba_require, - 'mecab': mecab_require, 'laser': laser_require, 'varikn': varikn_require, - 'docs': docs_require, 'all': all_require}, + extras_require={'test': tests_require, 'pycld2': pycld2_require, 'fasttext': fasttext_require, + 'eflomal': eflomal_require, 'jieba': jieba_require, 'mecab': mecab_require, + 'laser': laser_require, 'varikn': varikn_require, 'docs': docs_require, + 'all': all_require}, classifiers=( "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", From 2e8d1e1f2ba14fe6cbba6b70c9b11406da85cfad Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Thu, 20 Jun 2024 17:37:46 +0300 Subject: [PATCH 2/7] fix unittests for new optional libraries --- .github/workflows/ci.yml | 6 +- docs/CHANGELOG.md | 4 ++ opusfilter/autogen.py | 6 +- opusfilter/filters.py | 12 +++- requirements.txt | 8 +-- setup.py | 2 +- tests/test_autogen.py | 2 +- tests/test_filters.py | 128 --------------------------------- tests/test_lid.py | 151 +++++++++++++++++++++++++++++++++++++++ 9 files changed, 177 insertions(+), 142 deletions(-) create mode 100644 tests/test_lid.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b5715ac..ec43b71 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,10 +32,12 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | + python -m ensurepip --upgrade + python -m pip install --upgrade setuptools python -m pip install --upgrade pip python -m pip install flake8 pytest wheel - pip install -r ${{ matrix.requirements-file }} - python setup.py install + python -m pip install -r ${{ matrix.requirements-file }} + python -m pip install . 
- name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 836436c..04e1096 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- make pycld2 and fasttext libraries optional + ## [3.1.0] - 2024-06-05 ### Added diff --git a/opusfilter/autogen.py b/opusfilter/autogen.py index b718486..520da2b 100644 --- a/opusfilter/autogen.py +++ b/opusfilter/autogen.py @@ -217,7 +217,7 @@ class DefaultParameterFilters(AutoFiltersABC): 'AverageWordLengthFilter', 'AlphabetRatioFilter', 'TerminalPunctuationFilter', 'NonZeroNumeralsFilter', 'LongestCommonSubstringFilter', 'SimilarityFilter', 'RepetitionFilter', - 'CharacterScoreFilter', ('LanguageIDFilter', {'id_method': 'cld2'})] + 'CharacterScoreFilter', ('LanguageIDFilter', {'id_method': 'lingua'})] def set_filter_thresholds(self): """Set filter thresholds""" @@ -272,7 +272,7 @@ class PercentileFilters(DataBasedFiltersABC): 'AverageWordLengthFilter', 'AlphabetRatioFilter', 'TerminalPunctuationFilter', 'NonZeroNumeralsFilter', 'LongestCommonSubstringFilter', 'SimilarityFilter', 'RepetitionFilter', - 'CharacterScoreFilter', ('LanguageIDFilter', {'id_method': 'cld2'})] + 'CharacterScoreFilter', ('LanguageIDFilter', {'id_method': 'lingua'})] def __init__(self, files, excluded_percentile=0.001, **kwargs): super().__init__(files, **kwargs) @@ -512,7 +512,7 @@ class ClusterFilters(DataBasedFiltersABC): ('LengthRatioFilter.word', {'unit': 'word'}), 'NonZeroNumeralsFilter', 'CharacterScoreFilter', - ('LanguageIDFilter', {'id_method': 'cld2'}), + ('LanguageIDFilter', {'id_method': 'lingua'}), 'TerminalPunctuationFilter'] def __init__(self, files, k=2, max_length=150, **kwargs): diff --git a/opusfilter/filters.py b/opusfilter/filters.py index 92511cf..13f1461 100644 --- a/opusfilter/filters.py +++ 
b/opusfilter/filters.py @@ -344,7 +344,11 @@ def init_fastttext(self, fasttext_model_path): if not fasttext_model_path: raise ConfigurationError("FastText language ID method was choosen without specifying " "any path to fasttext model") - import fasttext + try: + import fasttext + except ImportError: + logger.warning("Could not import fasttext. Select another id_method for LanguageIDFilter.") + raise self.fasttext_model = fasttext.load_model(os.path.join(self.workdir, fasttext_model_path)) def init_lingua(self, lingua_mode): @@ -366,7 +370,11 @@ def confidence(self, sentence: str, lan: str) -> float: return 1.0 if self.id_method == 'cld2': - import pycld2 + try: + import pycld2 + except ImportError: + logger.warning("Could not import pycld2. Select another id_method for LanguageIDFilter.") + raise try: clddetails = pycld2.detect(sentence, **self.cld2_options) except pycld2.error as err: diff --git a/requirements.txt b/requirements.txt index b82f47b..623a808 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -setuptools==65.5.1 -setuptools_scm==6.4.2 -numpy<2.0.0 +setuptools>=65.5.1 +setuptools_scm>=6.4.2 +numpy>=1.24.4 opustools jieba>=0.42 beautifulsoup4>=4.8.2 @@ -9,7 +9,6 @@ langid==1.1.6 matplotlib>=3.3.0 opus-fast-mosestokenizer>=0.0.8.5 pandas>=1.0.0 -pycld2==0.41 xxhash==3.2.0 rapidfuzz>=2.0.5 regex>=2019.11.1 @@ -18,7 +17,6 @@ ruamel.yaml>=0.15.0 scikit-learn>=0.24.0 sentence-splitter==1.4 tqdm>=4.38.0 -fasttext==0.9.2 mecab-python3>=1.0.8 unidic-lite==1.0.8 subword-nmt==0.3.8 diff --git a/setup.py b/setup.py index a39f586..6161b90 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,6 @@ install_requires = [ "setuptools", - "numpy<2.0.0", "opustools", "beautifulsoup4>=4.8.0", "graphviz", @@ -31,6 +30,7 @@ ] fasttext_require = [ + "numpy<2.0.0", "fasttext" ] diff --git a/tests/test_autogen.py b/tests/test_autogen.py index 5fecd5f..dd19c53 100644 --- a/tests/test_autogen.py +++ b/tests/test_autogen.py @@ -103,7 +103,7 @@ class 
TestThresholdFinder(unittest.TestCase): {'LengthRatioFilter': {'name': 'word', 'threshold': 1, 'unit': 'word'}}, {'NonZeroNumeralsFilter': {'threshold': 1}}, {'CharacterScoreFilter': {'scripts': ['latin', 'latin'], 'thresholds': [1, 1]}}, - {'LanguageIDFilter': {'id_method': 'cld2', 'languages': ['en', 'de'], 'thresholds': [1, 1]}}, + {'LanguageIDFilter': {'id_method': 'lingua', 'languages': ['en', 'de'], 'thresholds': [1, 1]}}, {'TerminalPunctuationFilter': {'threshold': 1}} ] diff --git a/tests/test_filters.py b/tests/test_filters.py index b0b548f..554cf9e 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -5,9 +5,7 @@ import tempfile import unittest -from opusfilter import ConfigurationError from opusfilter.filters import * -from opusfilter.util import file_download class TestLengthFilter(unittest.TestCase): @@ -248,132 +246,6 @@ def test_trilingual_any(self): self.assertSequenceEqual(result, correct) -class TestLangIDMethod(unittest.TestCase): - - pairs_inputs = [ - ("This sentence is in english", "Je suis une phrase en français"), - ("me llamo bernardo", "je m'appelle Bernard") - ] - - -class TestLangId(TestLangIDMethod): - - def test_accept(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99]) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - def test_accept_with_set_languages(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99], - langid_languages=['fr', 'de']) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [False, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - -class TestCLD2(TestLangIDMethod): - - pairs_inputs = [ - ("This sentence is in english", "Je suis une phrase en 
français"), - ("me llamo bernardo", "je m'appelle Bernard"), - ("english sentence", "phrase français") - ] - - def test_accept(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9]) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - def test_accept_with_options(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9], - cld2_options={'bestEffort': True}) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False, True] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - logging.info('%s %s', pair_score, pair_expected) - self.assertEqual(model.accept(pair_score), pair_expected) - - -class TestFasttext(TestLangIDMethod): - - fasttext_inputs = ["This sentence is in english", "Je suis une phrase en français"] - model_url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz' - - @classmethod - def setUpClass(self): - self.tempdir = tempfile.mkdtemp() - self.testmodel = os.path.join(self.tempdir, 'model.ftz') - try: - file_download(self.model_url, self.testmodel) - except requests.exceptions.ConnectionError: - self.testmodel = None - - @classmethod - def tearDownClass(self): - shutil.rmtree(self.tempdir) - - def test_missing_model(self): - with self.assertRaises(ConfigurationError): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99]) - - def test_wrong_method_with_model(self): - with self.assertRaises(ConfigurationError): - model = LanguageIDFilter( - languages=['en', 'fr'], thresholds=[0.8, 0.99], fasttext_model_path=self.tempdir) - - def test_fasttext_predict_lang(self): - if self.testmodel is None: - self.skipTest("Failed to download test resources") - model = LanguageIDFilter( - languages=['en', 'fr'], 
id_method='fasttext', thresholds=[0.8, 0.99], - fasttext_model_path=self.testmodel) - expected = ['en', 'fr'] - results = [model._fasttext_predict_lang(fasttext_input)[0] - for fasttext_input in self.fasttext_inputs] - self.assertSequenceEqual(expected, results) - - def test_accept(self): - if self.testmodel is None: - self.skipTest("Failed to download test resources") - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99], - fasttext_model_path=self.testmodel) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - -class TestLingua(TestLangIDMethod): - - def test_accept(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99]) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - def test_accept_high(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='lingua', lingua_mode="high", thresholds=[0.5, 0.7]) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - - class TestRepetitionFilter(unittest.TestCase): def test_get_repetition(self): diff --git a/tests/test_lid.py b/tests/test_lid.py new file mode 100644 index 0000000..ef0eefe --- /dev/null +++ b/tests/test_lid.py @@ -0,0 +1,151 @@ +import logging +import os +import shutil +import tempfile +import unittest + +import requests + +from opusfilter import ConfigurationError +from opusfilter.filters import * +from opusfilter.util import file_download + + +try: + import fasttext +except ImportError: + logging.warning("Could not import fasttext") 
+ +try: + import pycld2 +except ImportError: + logging.warning("Could not import pycld2") + + +class TestLangIDMethod(unittest.TestCase): + + pairs_inputs = [ + ("This sentence is in english", "Je suis une phrase en français"), + ("me llamo bernardo", "je m'appelle Bernard") + ] + + +class TestLangId(TestLangIDMethod): + + def test_accept(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99]) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + def test_accept_with_set_languages(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99], + langid_languages=['fr', 'de']) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [False, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + +class TestCLD2(TestLangIDMethod): + + pairs_inputs = [ + ("This sentence is in english", "Je suis une phrase en français"), + ("me llamo bernardo", "je m'appelle Bernard"), + ("english sentence", "phrase français") + ] + + @unittest.skipIf('pycld2' not in globals(), 'pycld2 not installed') + def test_accept(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9]) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + @unittest.skipIf('pycld2' not in globals(), 'pycld2 not installed') + def test_accept_with_options(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9], + cld2_options={'bestEffort': True}) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False, 
True] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + logging.info('%s %s', pair_score, pair_expected) + self.assertEqual(model.accept(pair_score), pair_expected) + + +class TestFasttext(TestLangIDMethod): + + fasttext_inputs = ["This sentence is in english", "Je suis une phrase en français"] + model_url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz' + + @classmethod + def setUpClass(self): + self.tempdir = tempfile.mkdtemp() + if 'fasttext' not in globals(): + raise unittest.SkipTest('fasttext not installed') + self.testmodel = os.path.join(self.tempdir, 'model.ftz') + try: + file_download(self.model_url, self.testmodel) + except requests.exceptions.ConnectionError: + self.testmodel = None + + @classmethod + def tearDownClass(self): + shutil.rmtree(self.tempdir) + + def test_missing_model(self): + with self.assertRaises(ConfigurationError): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99]) + + def test_wrong_method_with_model(self): + with self.assertRaises(ConfigurationError): + model = LanguageIDFilter( + languages=['en', 'fr'], thresholds=[0.8, 0.99], fasttext_model_path=self.tempdir) + + def test_fasttext_predict_lang(self): + if self.testmodel is None: + self.skipTest("Failed to download test resources") + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99], + fasttext_model_path=self.testmodel) + expected = ['en', 'fr'] + results = [model._fasttext_predict_lang(fasttext_input)[0] + for fasttext_input in self.fasttext_inputs] + self.assertSequenceEqual(expected, results) + + def test_accept(self): + if self.testmodel is None: + self.skipTest("Failed to download test resources") + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99], + fasttext_model_path=self.testmodel) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False] + for pair_score, 
pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + +class TestLingua(TestLangIDMethod): + + def test_accept(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99]) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + def test_accept_high(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='lingua', lingua_mode="high", thresholds=[0.5, 0.7]) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) From 8c6f2ebdf2a38d5537ce33155a3837a29b1b4a0b Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 26 Jun 2024 09:36:39 +0300 Subject: [PATCH 3/7] update documentation --- docs/CHANGELOG.md | 1 + ...ipt_and_language_identification_filters.md | 1 + docs/installation.md | 29 +++++++++++++------ 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 04e1096..b11bd69 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - make pycld2 and fasttext libraries optional +- update github workflows and include Python 3.12 tests ## [3.1.0] - 2024-06-05 diff --git a/docs/filters/script_and_language_identification_filters.md b/docs/filters/script_and_language_identification_filters.md index 961898e..020c69d 100644 --- a/docs/filters/script_and_language_identification_filters.md +++ b/docs/filters/script_and_language_identification_filters.md @@ -48,3 +48,4 @@ See [langid.py](https://github.com/saffsd/langid.py) and [pycld2](https://github.com/aboSamoor/pycld2) for the method-specific 
options. A pretrained `fasttext` model can be downloaded from [fasttext.cc/docs/en/language-identification.html](https://fasttext.cc/docs/en/language-identification.html). +The `cld2` and `fasttext` methods require [installing optional libraries](../installation.md). diff --git a/docs/installation.md b/docs/installation.md index e04de8b..d9e0473 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -39,31 +39,42 @@ See `setup.py` for possible version requirements. ## Optional libraries and tools -### FastText and PyCLD2 language detection +### FastText and PyCLD2 language identification -* fasttext -* pycld2 +The language identification methods currently supported out-of-the-box +are [langid](https://github.com/saffsd/langid.py) and +[lingua](https://github.com/pemistahl/lingua-py). The support for for +[pycld2](https://github.com/aboSamoor/pycld2) and +[fasttext models](https://fasttext.cc/docs/en/language-identification.html) +have been changed to optional due to the lack of support especially +for newer Python versions. -These libararies are no longer updated? +The PyCLD2 support can be installed automatically with pip by +including the extras `[pycld2]` or `[all]` (e.g. +`pip install opusfilter[pycld2]`). + +The support for FastText models can be installed automatically with +pip by including the extras `[fasttext]` or `[all]` (e.g. +`pip install opusfilter[fasttext]`). ### Jieba and MeCab word segmentation For Chinese tokenization (word segmentation), you can use the [jieba](https://github.com/fxsjy/jieba) library. It can be installed automatically with pip by including the extras `[jieba]` or `[all]` -(e.g. `pip install opusfilter[all]`). +(e.g. `pip install opusfilter[jieba]`). For Japanese tokenization (word segmentation), you can use the [MeCab](https://github.com/SamuraiT/mecab-python3) library. It can be installed automatically with pip by including the extras `[mecab]` or `[all]` -(e.g. `pip install opusfilter[all]`). +(e.g. 
`pip install opusfilter[mecab]`). ### LASER sentence embeddings For using sentence embeddings filters, you need to install `laserembeddings` (https://github.com/yannvgn/laserembeddings). It can be installed automatically with pip by including the extras `[laser]` -or `[all]` (e.g. `pip install opusfilter[all]`). The package will also +or `[all]` (e.g. `pip install opusfilter[laser]`). The package will also require a number of additional libraries, including PyTorch, jieba, and MeCab. Note that you need also to download the prebuild models with `python -m laserembeddings download-models`. @@ -73,12 +84,12 @@ with `python -m laserembeddings download-models`. For using n-gram language model filters, you need to install the Python wrapper for VariKN (https://github.com/vsiivola/variKN). It can be installed automatically with pip by including the extras `[varikn]` -or `[all]` (e.g. `pip install opusfilter[all]`). +or `[all]` (e.g. `pip install opusfilter[varikn]`). ### Eflomal word alignment For using word alignment filters, you need to install elfomal (https://github.com/robertostling/eflomal). It can be installed automatically with pip by including the extras `[eflomal]` or `[all]` -(e.g. `pip install opusfilter[all]`). Note that you will need `Cython` +(e.g. `pip install opusfilter[eflomal]`). Note that you will need `Cython` for the installation. 
From 9fbe7d063e676a5a995ba181480d80a799de784b Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 26 Jun 2024 10:06:26 +0300 Subject: [PATCH 4/7] fix tests failing in some configurations --- tests/test_filter_pipeline.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/tests/test_filter_pipeline.py b/tests/test_filter_pipeline.py index ee37855..4a5884f 100644 --- a/tests/test_filter_pipeline.py +++ b/tests/test_filter_pipeline.py @@ -1,10 +1,26 @@ import copy import unittest +from numpy.testing import assert_almost_equal + from opusfilter.pipeline import FilterPipeline -class TestFilterPipeline(unittest.TestCase): +class TestFilterPipelineBase(unittest.TestCase): + + def assert_scores_equal(self, sdict1, sdict2): + self.assertEqual(set(sdict1), set(sdict2)) # same keys + for key, val1 in sdict1.items(): + val2 = sdict2[key] + if isinstance(val1, list): + self.assertEqual(len(val1), len(val2), msg=f"Scores do not match for {key}: {val1} {val2}") + for item1, item2 in zip(val1, val2): + self.assertAlmostEqual(item1, item2, msg=f"Scores do not match for {key}: {val1} {val2}") + else: + self.assertAlmostEqual(val1, val2, msg=f"Scores do not match for {key}: {val1} {val2}") + + +class TestFilterPipeline(TestFilterPipelineBase): @classmethod def setUpClass(self): @@ -46,7 +62,7 @@ def test_score(self): ('1245..', '12345.....'), ('', '')] scores = list(fp.score(pairs)) - self.assertEqual( + self.assert_scores_equal( scores[0], {'LengthFilter': [5, 9], 'LengthRatioFilter': 1.8, @@ -57,7 +73,7 @@ def test_score(self): 'LanguageIDFilter': [1.0, 1.0], 'TerminalPunctuationFilter': -0.0, 'NonZeroNumeralsFilter': [1.0]}) - self.assertEqual( + self.assert_scores_equal( scores[1], {'LengthFilter': [1, 1], 'LengthRatioFilter': 1.0, @@ -68,7 +84,7 @@ def test_score(self): 'LanguageIDFilter': [0.17, 0.0], 'TerminalPunctuationFilter': -2.1972245773362196, 'NonZeroNumeralsFilter': [0.8888888888888888]}) - self.assertEqual( + 
self.assert_scores_equal( scores[2], {'LengthFilter': [0, 0], 'LengthRatioFilter': 0, @@ -132,7 +148,7 @@ def test_filter_empty(self): filtered, [('', ''), ('this is English', 'det är Svenska'), ('', '')]) -class TestFilterPipelineScoreNames(unittest.TestCase): +class TestFilterPipelineScoreNames(TestFilterPipelineBase): def test_without_names(self): config = [ @@ -152,10 +168,10 @@ def test_without_names(self): ('1245..', '12345.....')] scores = list(fp.score(pairs)) - self.assertEqual( + self.assert_scores_equal( scores[0], {'LengthFilter': {'1': [5, 9], '2': [34, 65]}}) - self.assertEqual( + self.assert_scores_equal( scores[1], {'LengthFilter': {'1': [1, 1], '2': [6, 10]}}) @@ -177,9 +193,9 @@ def test_with_names(self): ('1245..', '12345.....')] scores = list(fp.score(pairs)) - self.assertEqual( + self.assert_scores_equal( scores[0], {'LengthFilter': {'words': [5, 9], 'chars': [34, 65]}}) - self.assertEqual( + self.assert_scores_equal( scores[1], {'LengthFilter': {'words': [1, 1], 'chars': [6, 10]}}) From 55ec9a7c8135060ccb4fa801c958346079903f65 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 26 Jun 2024 10:28:51 +0300 Subject: [PATCH 5/7] replace langid.py with py3langid --- README.md | 2 +- docs/CHANGELOG.md | 3 ++- docs/CONTRIBUTING.md | 4 ++-- ...ipt_and_language_identification_filters.md | 19 +++++++++++++------ docs/installation.md | 12 ++++++------ opusfilter/filters.py | 4 ++-- requirements.txt | 2 +- setup.py | 3 ++- 8 files changed, 29 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index b19b768..adcd196 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Install from source: ### Troubleshooting -OpusFilter should generally work fine on Python 3.8 to 3.11. In the case of troubles, try installing the exact versions in `requirements.txt`: +OpusFilter should generally work fine on Python 3.8 to 3.12. 
In the case of troubles, try installing the exact versions in `requirements.txt`: * `pip install -r requirements.txt` diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index b11bd69..745e705 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -9,7 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- make pycld2 and fasttext libraries optional +- make `pycld2` and `fasttext` libraries optional +- replace `langid.py` library with `py3langid` - update github workflows and include Python 3.12 tests ## [3.1.0] - 2024-06-05 diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 506eb41..9a0de65 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -5,7 +5,7 @@ issues page. We are also happy to consider pull requests. There are a few rules for pull requests: * Make a pull request to the `develop` branch instead of `master`. -* The code should support at least Python versions from 3.8 to 3.11. +* The code should support at least Python versions from 3.8 to 3.12. * Please follow [PEP 8](https://www.python.org/dev/peps/pep-0008/). Exception: The maximum line length is 127 characters instead of 79. * Especially for new features, please include test cases for unit testing. @@ -20,7 +20,7 @@ skips the respective tests if not.) GitHub workflows defined in the project run automatically `flake8` checks and unit testing with `pytest` using Python 3.8, 3.9, 3.10, -and 3.11. +3.11, and 3.12. Especially for larger contributions, consider using a code analysis tool like [Pylint](https://github.com/PyCQA/pylint). Install it diff --git a/docs/filters/script_and_language_identification_filters.md b/docs/filters/script_and_language_identification_filters.md index 020c69d..cf1a665 100644 --- a/docs/filters/script_and_language_identification_filters.md +++ b/docs/filters/script_and_language_identification_filters.md @@ -35,7 +35,7 @@ Filter segments based on their language identification confidence scores. 
Parameters: * `languages`: expected languages (ISO639 language codes) for the segments -* `id_method`: language indentification method (`langid` for using the `langid` library, `cld2` for using the `cld2` library, or `fasttext` for using a `fasttext` model; the default is `langid`) +* `id_method`: language identification method (`langid`, `lingua`, `cld2`, `fasttext`; default `langid`) * `thresholds`: minimum identification confidence score for the segments (a single float or a list of floats per language) * `fasttext_model_path`: path for a `fasttext` model (required only for the `fasttext` method; default `null`) * `langid_languages`: limit detection to a list of possible languages (valid only for the `langid` method; default `null`) @@ -44,8 +44,15 @@ Parameters: Returned scores are the language identification confidence scores from a given identification method for the segments. The scores range from 0 to 1. In filtering, all values have to be greater than the minimum thresholds. Negative threshold can be used to skip filtering for a language. -See [langid.py](https://github.com/saffsd/langid.py) and -[pycld2](https://github.com/aboSamoor/pycld2) for the method-specific -options. A pretrained `fasttext` model can be downloaded from -[fasttext.cc/docs/en/language-identification.html](https://fasttext.cc/docs/en/language-identification.html). -The `cld2` and `fasttext` methods require [installing optional libraries](../installation.md). +Currently the following identification methods are supported: + +* `langid` (default) :cite:`lui-baldwin-2012-langid` + * See https://github.com/adbar/py3langid +* `lingua` + * See https://github.com/pemistahl/lingua-py +* `cld2` + * See https://github.com/CLD2Owners/cld2 + * Requires [installing optional libraries](../installation.md). 
+* `fasttext` :cite:`joulin-etal-2016-fasttext` and :cite:`joulin-etal-2017-bag` + * A pretrained model can be downloaded from [fasttext.cc/docs/en/language-identification.html](https://fasttext.cc/docs/en/language-identification.html). + * Requires [installing optional libraries](../installation.md). diff --git a/docs/installation.md b/docs/installation.md index d9e0473..c14bee3 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -12,14 +12,14 @@ Install from source: Note that all required libraries are not available to install via PyPI on Windows OS. On Linux and MacOS, it should work directly for Python -versions from 3.8 to 3.11. +versions from 3.8 to 3.12. ## Required libraries * beautifulsoup4 * opus-fast-mosestokenizer * graphviz -* langid +* py3langid * matplotlib * morfessor * OpusTools @@ -41,11 +41,11 @@ See `setup.py` for possible version requirements. ### FastText and PyCLD2 language identification -The language identification methods currently supported out-of-the-box -are [langid](https://github.com/saffsd/langid.py) and +The language identification libraries currently supported out-of-the-box +are [py3langid](https://github.com/adbar/py3langid) and [lingua](https://github.com/pemistahl/lingua-py). The support for for -[pycld2](https://github.com/aboSamoor/pycld2) and -[fasttext models](https://fasttext.cc/docs/en/language-identification.html) +[PyCLD2](https://github.com/aboSamoor/pycld2) and +[FastText models](https://fasttext.cc/docs/en/language-identification.html) have been changed to optional due to the lack of support especially for newer Python versions. 
diff --git a/opusfilter/filters.py b/opusfilter/filters.py index 13f1461..bf2058f 100644 --- a/opusfilter/filters.py +++ b/opusfilter/filters.py @@ -334,8 +334,8 @@ def __init__(self, languages=None, id_method='langid', thresholds=None, def init_langid(self, langid_languages): """Initialize langid identifier""" - from langid.langid import LanguageIdentifier, model - self.identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True) + from py3langid.langid import LanguageIdentifier, MODEL_FILE + self.identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True) if langid_languages: self.identifier.set_languages(langid_languages) diff --git a/requirements.txt b/requirements.txt index 623a808..44a157f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ opustools jieba>=0.42 beautifulsoup4>=4.8.2 graphviz>=0.16 -langid==1.1.6 +py3langid==0.3.0 matplotlib>=3.3.0 opus-fast-mosestokenizer>=0.0.8.5 pandas>=1.0.0 diff --git a/setup.py b/setup.py index 6161b90..74dab3e 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ "opustools", "beautifulsoup4>=4.8.0", "graphviz", - "langid", + "py3langid>=0.2.2", "matplotlib", "morfessor", "opus-fast-mosestokenizer>=0.0.8.5", @@ -30,6 +30,7 @@ ] fasttext_require = [ + "py3langid<0.3.0", # 0.3.0 requires numpy 2.0.0 "numpy<2.0.0", "fasttext" ] From b661b39fd10a3281833ac23b7457a0abcb25f247 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 26 Jun 2024 10:53:21 +0300 Subject: [PATCH 6/7] convert langid confidences to floats --- opusfilter/filters.py | 2 +- opusfilter/opusfilter.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/opusfilter/filters.py b/opusfilter/filters.py index bf2058f..46aeb27 100644 --- a/opusfilter/filters.py +++ b/opusfilter/filters.py @@ -388,7 +388,7 @@ def confidence(self, sentence: str, lan: str) -> float: if self.id_method == 'langid': lidetails = self.identifier.classify(sentence) - lilan, liconf = lidetails[0], round(lidetails[1], 2) 
+ lilan, liconf = lidetails[0], round(float(lidetails[1]), 2) if lilan != lan: liconf = 0.0 return liconf diff --git a/opusfilter/opusfilter.py b/opusfilter/opusfilter.py index d415b8c..8490e7d 100644 --- a/opusfilter/opusfilter.py +++ b/opusfilter/opusfilter.py @@ -556,7 +556,11 @@ def _write_jsonl(objects, fname): """Write objects to file as JSON lines""" with file_open(fname, 'w') as fobj: for obj in objects: - fobj.write(json.dumps(obj, sort_keys=True)+'\n') + try: + fobj.write(json.dumps(obj, sort_keys=True)+'\n') + except TypeError as err: + logger.error("Could not convert to JSON: %s", obj) + raise err @staticmethod def _read_jsonl(fname): From 01b7e12e830295cef09b4994d8736436418c9190 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 26 Jun 2024 11:01:00 +0300 Subject: [PATCH 7/7] test --no-cache-dir in github pipeline --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ec43b71..ddbc398 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,7 +36,7 @@ jobs: python -m pip install --upgrade setuptools python -m pip install --upgrade pip python -m pip install flake8 pytest wheel - python -m pip install -r ${{ matrix.requirements-file }} + python -m pip install --no-cache-dir -r ${{ matrix.requirements-file }} python -m pip install . - name: Lint with flake8 run: |