From d88fb8d7ad0167f3da10b638f9b62d395788486d Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Thu, 20 Jun 2024 11:49:27 +0300 Subject: [PATCH 1/7] change fasttext and pycld2 requirements optional --- docs/installation.md | 9 +++++++-- setup.py | 20 ++++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index 92e86b3..e04de8b 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -18,14 +18,12 @@ versions from 3.8 to 3.11. * beautifulsoup4 * opus-fast-mosestokenizer -* fasttext * graphviz * langid * matplotlib * morfessor * OpusTools * pandas -* pycld2 * rapidfuzz * ruamel.yaml * regex @@ -41,6 +39,13 @@ See `setup.py` for possible version requirements. ## Optional libraries and tools +### FastText and PyCLD2 language detection + +* fasttext +* pycld2 + +These libararies are no longer updated? + ### Jieba and MeCab word segmentation For Chinese tokenization (word segmentation), you can use the diff --git a/setup.py b/setup.py index 183b980..a39f586 100644 --- a/setup.py +++ b/setup.py @@ -8,14 +8,12 @@ "numpy<2.0.0", "opustools", "beautifulsoup4>=4.8.0", - "fasttext", "graphviz", "langid", "matplotlib", "morfessor", "opus-fast-mosestokenizer>=0.0.8.5", "pandas>=1.0.0", - "pycld2", "xxhash>=3.2.0", "sentence-splitter", "rapidfuzz", @@ -28,6 +26,14 @@ "lingua-language-detector>=1.3.0" ] +pycld2_require = [ + "pycld2" +] + +fasttext_require = [ + "fasttext" +] + eflomal_require = [ 'eflomal>=2.0.0' ] @@ -60,7 +66,8 @@ 'sphinxcontrib-bibtex' ] -all_require = eflomal_require + jieba_require + mecab_require + laser_require + varikn_require + tests_require + docs_require +all_require = pycld2_require + fasttext_require + eflomal_require + jieba_require + \ + mecab_require + laser_require + varikn_require + tests_require + docs_require setuptools.setup( name="opusfilter", @@ -78,9 +85,10 @@ "bin/opusfilter-scores", "bin/opusfilter-test"], install_requires=install_requires, 
tests_require=tests_require, - extras_require={'test': tests_require, 'eflomal': eflomal_require, 'jieba': jieba_require, - 'mecab': mecab_require, 'laser': laser_require, 'varikn': varikn_require, - 'docs': docs_require, 'all': all_require}, + extras_require={'test': tests_require, 'pycld2': pycld2_require, 'fasttext': fasttext_require, + 'eflomal': eflomal_require, 'jieba': jieba_require, 'mecab': mecab_require, + 'laser': laser_require, 'varikn': varikn_require, 'docs': docs_require, + 'all': all_require}, classifiers=( "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", From 2e8d1e1f2ba14fe6cbba6b70c9b11406da85cfad Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Thu, 20 Jun 2024 17:37:46 +0300 Subject: [PATCH 2/7] fix unittests for new optional libraries --- .github/workflows/ci.yml | 6 +- docs/CHANGELOG.md | 4 ++ opusfilter/autogen.py | 6 +- opusfilter/filters.py | 12 +++- requirements.txt | 8 +-- setup.py | 2 +- tests/test_autogen.py | 2 +- tests/test_filters.py | 128 --------------------------------- tests/test_lid.py | 151 +++++++++++++++++++++++++++++++++++++++ 9 files changed, 177 insertions(+), 142 deletions(-) create mode 100644 tests/test_lid.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b5715ac..ec43b71 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,10 +32,12 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | + python -m ensurepip --upgrade + python -m pip install --upgrade setuptools python -m pip install --upgrade pip python -m pip install flake8 pytest wheel - pip install -r ${{ matrix.requirements-file }} - python setup.py install + python -m pip install -r ${{ matrix.requirements-file }} + python -m pip install . 
- name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 836436c..04e1096 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- make pycld2 and fasttext libraries optional + ## [3.1.0] - 2024-06-05 ### Added diff --git a/opusfilter/autogen.py b/opusfilter/autogen.py index b718486..520da2b 100644 --- a/opusfilter/autogen.py +++ b/opusfilter/autogen.py @@ -217,7 +217,7 @@ class DefaultParameterFilters(AutoFiltersABC): 'AverageWordLengthFilter', 'AlphabetRatioFilter', 'TerminalPunctuationFilter', 'NonZeroNumeralsFilter', 'LongestCommonSubstringFilter', 'SimilarityFilter', 'RepetitionFilter', - 'CharacterScoreFilter', ('LanguageIDFilter', {'id_method': 'cld2'})] + 'CharacterScoreFilter', ('LanguageIDFilter', {'id_method': 'lingua'})] def set_filter_thresholds(self): """Set filter thresholds""" @@ -272,7 +272,7 @@ class PercentileFilters(DataBasedFiltersABC): 'AverageWordLengthFilter', 'AlphabetRatioFilter', 'TerminalPunctuationFilter', 'NonZeroNumeralsFilter', 'LongestCommonSubstringFilter', 'SimilarityFilter', 'RepetitionFilter', - 'CharacterScoreFilter', ('LanguageIDFilter', {'id_method': 'cld2'})] + 'CharacterScoreFilter', ('LanguageIDFilter', {'id_method': 'lingua'})] def __init__(self, files, excluded_percentile=0.001, **kwargs): super().__init__(files, **kwargs) @@ -512,7 +512,7 @@ class ClusterFilters(DataBasedFiltersABC): ('LengthRatioFilter.word', {'unit': 'word'}), 'NonZeroNumeralsFilter', 'CharacterScoreFilter', - ('LanguageIDFilter', {'id_method': 'cld2'}), + ('LanguageIDFilter', {'id_method': 'lingua'}), 'TerminalPunctuationFilter'] def __init__(self, files, k=2, max_length=150, **kwargs): diff --git a/opusfilter/filters.py b/opusfilter/filters.py index 92511cf..13f1461 100644 --- a/opusfilter/filters.py +++ 
b/opusfilter/filters.py @@ -344,7 +344,11 @@ def init_fastttext(self, fasttext_model_path): if not fasttext_model_path: raise ConfigurationError("FastText language ID method was choosen without specifying " "any path to fasttext model") - import fasttext + try: + import fasttext + except ImportError: + logger.warning("Could not import fasttext. Select another id_method for LanguageIDFilter.") + raise self.fasttext_model = fasttext.load_model(os.path.join(self.workdir, fasttext_model_path)) def init_lingua(self, lingua_mode): @@ -366,7 +370,11 @@ def confidence(self, sentence: str, lan: str) -> float: return 1.0 if self.id_method == 'cld2': - import pycld2 + try: + import pycld2 + except ImportError: + logger.warning("Could not import pycld2. Select another id_method for LanguageIDFilter.") + raise try: clddetails = pycld2.detect(sentence, **self.cld2_options) except pycld2.error as err: diff --git a/requirements.txt b/requirements.txt index b82f47b..623a808 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -setuptools==65.5.1 -setuptools_scm==6.4.2 -numpy<2.0.0 +setuptools>=65.5.1 +setuptools_scm>=6.4.2 +numpy>=1.24.4 opustools jieba>=0.42 beautifulsoup4>=4.8.2 @@ -9,7 +9,6 @@ langid==1.1.6 matplotlib>=3.3.0 opus-fast-mosestokenizer>=0.0.8.5 pandas>=1.0.0 -pycld2==0.41 xxhash==3.2.0 rapidfuzz>=2.0.5 regex>=2019.11.1 @@ -18,7 +17,6 @@ ruamel.yaml>=0.15.0 scikit-learn>=0.24.0 sentence-splitter==1.4 tqdm>=4.38.0 -fasttext==0.9.2 mecab-python3>=1.0.8 unidic-lite==1.0.8 subword-nmt==0.3.8 diff --git a/setup.py b/setup.py index a39f586..6161b90 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,6 @@ install_requires = [ "setuptools", - "numpy<2.0.0", "opustools", "beautifulsoup4>=4.8.0", "graphviz", @@ -31,6 +30,7 @@ ] fasttext_require = [ + "numpy<2.0.0", "fasttext" ] diff --git a/tests/test_autogen.py b/tests/test_autogen.py index 5fecd5f..dd19c53 100644 --- a/tests/test_autogen.py +++ b/tests/test_autogen.py @@ -103,7 +103,7 @@ class 
TestThresholdFinder(unittest.TestCase): {'LengthRatioFilter': {'name': 'word', 'threshold': 1, 'unit': 'word'}}, {'NonZeroNumeralsFilter': {'threshold': 1}}, {'CharacterScoreFilter': {'scripts': ['latin', 'latin'], 'thresholds': [1, 1]}}, - {'LanguageIDFilter': {'id_method': 'cld2', 'languages': ['en', 'de'], 'thresholds': [1, 1]}}, + {'LanguageIDFilter': {'id_method': 'lingua', 'languages': ['en', 'de'], 'thresholds': [1, 1]}}, {'TerminalPunctuationFilter': {'threshold': 1}} ] diff --git a/tests/test_filters.py b/tests/test_filters.py index b0b548f..554cf9e 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -5,9 +5,7 @@ import tempfile import unittest -from opusfilter import ConfigurationError from opusfilter.filters import * -from opusfilter.util import file_download class TestLengthFilter(unittest.TestCase): @@ -248,132 +246,6 @@ def test_trilingual_any(self): self.assertSequenceEqual(result, correct) -class TestLangIDMethod(unittest.TestCase): - - pairs_inputs = [ - ("This sentence is in english", "Je suis une phrase en français"), - ("me llamo bernardo", "je m'appelle Bernard") - ] - - -class TestLangId(TestLangIDMethod): - - def test_accept(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99]) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - def test_accept_with_set_languages(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99], - langid_languages=['fr', 'de']) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [False, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - -class TestCLD2(TestLangIDMethod): - - pairs_inputs = [ - ("This sentence is in english", "Je suis une phrase en 
français"), - ("me llamo bernardo", "je m'appelle Bernard"), - ("english sentence", "phrase français") - ] - - def test_accept(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9]) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - def test_accept_with_options(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9], - cld2_options={'bestEffort': True}) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False, True] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - logging.info('%s %s', pair_score, pair_expected) - self.assertEqual(model.accept(pair_score), pair_expected) - - -class TestFasttext(TestLangIDMethod): - - fasttext_inputs = ["This sentence is in english", "Je suis une phrase en français"] - model_url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz' - - @classmethod - def setUpClass(self): - self.tempdir = tempfile.mkdtemp() - self.testmodel = os.path.join(self.tempdir, 'model.ftz') - try: - file_download(self.model_url, self.testmodel) - except requests.exceptions.ConnectionError: - self.testmodel = None - - @classmethod - def tearDownClass(self): - shutil.rmtree(self.tempdir) - - def test_missing_model(self): - with self.assertRaises(ConfigurationError): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99]) - - def test_wrong_method_with_model(self): - with self.assertRaises(ConfigurationError): - model = LanguageIDFilter( - languages=['en', 'fr'], thresholds=[0.8, 0.99], fasttext_model_path=self.tempdir) - - def test_fasttext_predict_lang(self): - if self.testmodel is None: - self.skipTest("Failed to download test resources") - model = LanguageIDFilter( - languages=['en', 'fr'], 
id_method='fasttext', thresholds=[0.8, 0.99], - fasttext_model_path=self.testmodel) - expected = ['en', 'fr'] - results = [model._fasttext_predict_lang(fasttext_input)[0] - for fasttext_input in self.fasttext_inputs] - self.assertSequenceEqual(expected, results) - - def test_accept(self): - if self.testmodel is None: - self.skipTest("Failed to download test resources") - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99], - fasttext_model_path=self.testmodel) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - -class TestLingua(TestLangIDMethod): - - def test_accept(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99]) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - def test_accept_high(self): - model = LanguageIDFilter( - languages=['en', 'fr'], id_method='lingua', lingua_mode="high", thresholds=[0.5, 0.7]) - pair_scores = model.score(self.pairs_inputs) - pair_expecteds = [True, False] - for pair_score, pair_expected in zip(pair_scores, pair_expecteds): - self.assertEqual(model.accept(pair_score), pair_expected) - - - class TestRepetitionFilter(unittest.TestCase): def test_get_repetition(self): diff --git a/tests/test_lid.py b/tests/test_lid.py new file mode 100644 index 0000000..ef0eefe --- /dev/null +++ b/tests/test_lid.py @@ -0,0 +1,151 @@ +import logging +import os +import shutil +import tempfile +import unittest + +import requests + +from opusfilter import ConfigurationError +from opusfilter.filters import * +from opusfilter.util import file_download + + +try: + import fasttext +except ImportError: + logging.warning("Could not import fasttext") 
+ +try: + import pycld2 +except ImportError: + logging.warning("Could not import pycld2") + + +class TestLangIDMethod(unittest.TestCase): + + pairs_inputs = [ + ("This sentence is in english", "Je suis une phrase en français"), + ("me llamo bernardo", "je m'appelle Bernard") + ] + + +class TestLangId(TestLangIDMethod): + + def test_accept(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99]) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + def test_accept_with_set_languages(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='langid', thresholds=[0.8, 0.99], + langid_languages=['fr', 'de']) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [False, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + +class TestCLD2(TestLangIDMethod): + + pairs_inputs = [ + ("This sentence is in english", "Je suis une phrase en français"), + ("me llamo bernardo", "je m'appelle Bernard"), + ("english sentence", "phrase français") + ] + + @unittest.skipIf('pycld2' not in globals(), 'pycld2 not installed') + def test_accept(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9]) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + @unittest.skipIf('pycld2' not in globals(), 'pycld2 not installed') + def test_accept_with_options(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='cld2', thresholds=[0.9, 0.9], + cld2_options={'bestEffort': True}) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False, 
True] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + logging.info('%s %s', pair_score, pair_expected) + self.assertEqual(model.accept(pair_score), pair_expected) + + +class TestFasttext(TestLangIDMethod): + + fasttext_inputs = ["This sentence is in english", "Je suis une phrase en français"] + model_url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz' + + @classmethod + def setUpClass(self): + self.tempdir = tempfile.mkdtemp() + if 'fasttext' not in globals(): + raise unittest.SkipTest('fasttext not installed') + self.testmodel = os.path.join(self.tempdir, 'model.ftz') + try: + file_download(self.model_url, self.testmodel) + except requests.exceptions.ConnectionError: + self.testmodel = None + + @classmethod + def tearDownClass(self): + shutil.rmtree(self.tempdir) + + def test_missing_model(self): + with self.assertRaises(ConfigurationError): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99]) + + def test_wrong_method_with_model(self): + with self.assertRaises(ConfigurationError): + model = LanguageIDFilter( + languages=['en', 'fr'], thresholds=[0.8, 0.99], fasttext_model_path=self.tempdir) + + def test_fasttext_predict_lang(self): + if self.testmodel is None: + self.skipTest("Failed to download test resources") + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99], + fasttext_model_path=self.testmodel) + expected = ['en', 'fr'] + results = [model._fasttext_predict_lang(fasttext_input)[0] + for fasttext_input in self.fasttext_inputs] + self.assertSequenceEqual(expected, results) + + def test_accept(self): + if self.testmodel is None: + self.skipTest("Failed to download test resources") + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='fasttext', thresholds=[0.8, 0.99], + fasttext_model_path=self.testmodel) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False] + for pair_score, 
pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + +class TestLingua(TestLangIDMethod): + + def test_accept(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='lingua', thresholds=[0.4, 0.99]) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) + + def test_accept_high(self): + model = LanguageIDFilter( + languages=['en', 'fr'], id_method='lingua', lingua_mode="high", thresholds=[0.5, 0.7]) + pair_scores = model.score(self.pairs_inputs) + pair_expecteds = [True, False] + for pair_score, pair_expected in zip(pair_scores, pair_expecteds): + self.assertEqual(model.accept(pair_score), pair_expected) From 8c6f2ebdf2a38d5537ce33155a3837a29b1b4a0b Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 26 Jun 2024 09:36:39 +0300 Subject: [PATCH 3/7] update documentation --- docs/CHANGELOG.md | 1 + ...ipt_and_language_identification_filters.md | 1 + docs/installation.md | 29 +++++++++++++------ 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 04e1096..b11bd69 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - make pycld2 and fasttext libraries optional +- update github workflows and include Python 3.12 tests ## [3.1.0] - 2024-06-05 diff --git a/docs/filters/script_and_language_identification_filters.md b/docs/filters/script_and_language_identification_filters.md index 961898e..020c69d 100644 --- a/docs/filters/script_and_language_identification_filters.md +++ b/docs/filters/script_and_language_identification_filters.md @@ -48,3 +48,4 @@ See [langid.py](https://github.com/saffsd/langid.py) and [pycld2](https://github.com/aboSamoor/pycld2) for the method-specific 
options. A pretrained `fasttext` model can be downloaded from [fasttext.cc/docs/en/language-identification.html](https://fasttext.cc/docs/en/language-identification.html). +The `cld2` and `fasttext` methods require [installing optional libraries](../installation.md). diff --git a/docs/installation.md b/docs/installation.md index e04de8b..d9e0473 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -39,31 +39,42 @@ See `setup.py` for possible version requirements. ## Optional libraries and tools -### FastText and PyCLD2 language detection +### FastText and PyCLD2 language identification -* fasttext -* pycld2 +The language identification methods currently supported out-of-the-box +are [langid](https://github.com/saffsd/langid.py) and +[lingua](https://github.com/pemistahl/lingua-py). The support for for +[pycld2](https://github.com/aboSamoor/pycld2) and +[fasttext models](https://fasttext.cc/docs/en/language-identification.html) +have been changed to optional due to the lack of support especially +for newer Python versions. -These libararies are no longer updated? +The PyCLD2 support can be installed automatically with pip by +including the extras `[pycld2]` or `[all]` (e.g. +`pip install opusfilter[pycld2]`). + +The support for FastText models can be installed automatically with +pip by including the extras `[fasttext]` or `[all]` (e.g. +`pip install opusfilter[fasttext]`). ### Jieba and MeCab word segmentation For Chinese tokenization (word segmentation), you can use the [jieba](https://github.com/fxsjy/jieba) library. It can be installed automatically with pip by including the extras `[jieba]` or `[all]` -(e.g. `pip install opusfilter[all]`). +(e.g. `pip install opusfilter[jieba]`). For Japanese tokenization (word segmentation), you can use the [MeCab](https://github.com/SamuraiT/mecab-python3) library. It can be installed automatically with pip by including the extras `[mecab]` or `[all]` -(e.g. `pip install opusfilter[all]`). +(e.g. 
`pip install opusfilter[mecab]`). ### LASER sentence embeddings For using sentence embeddings filters, you need to install `laserembeddings` (https://github.com/yannvgn/laserembeddings). It can be installed automatically with pip by including the extras `[laser]` -or `[all]` (e.g. `pip install opusfilter[all]`). The package will also +or `[all]` (e.g. `pip install opusfilter[laser]`). The package will also require a number of additional libraries, including PyTorch, jieba, and MeCab. Note that you need also to download the prebuild models with `python -m laserembeddings download-models`. @@ -73,12 +84,12 @@ with `python -m laserembeddings download-models`. For using n-gram language model filters, you need to install the Python wrapper for VariKN (https://github.com/vsiivola/variKN). It can be installed automatically with pip by including the extras `[varikn]` -or `[all]` (e.g. `pip install opusfilter[all]`). +or `[all]` (e.g. `pip install opusfilter[varikn]`). ### Eflomal word alignment For using word alignment filters, you need to install elfomal (https://github.com/robertostling/eflomal). It can be installed automatically with pip by including the extras `[eflomal]` or `[all]` -(e.g. `pip install opusfilter[all]`). Note that you will need `Cython` +(e.g. `pip install opusfilter[eflomal]`). Note that you will need `Cython` for the installation. 
From 9fbe7d063e676a5a995ba181480d80a799de784b Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 26 Jun 2024 10:06:26 +0300 Subject: [PATCH 4/7] fix tests failing in some configurations --- tests/test_filter_pipeline.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/tests/test_filter_pipeline.py b/tests/test_filter_pipeline.py index ee37855..4a5884f 100644 --- a/tests/test_filter_pipeline.py +++ b/tests/test_filter_pipeline.py @@ -1,10 +1,26 @@ import copy import unittest +from numpy.testing import assert_almost_equal + from opusfilter.pipeline import FilterPipeline -class TestFilterPipeline(unittest.TestCase): +class TestFilterPipelineBase(unittest.TestCase): + + def assert_scores_equal(self, sdict1, sdict2): + self.assertEqual(set(sdict1), set(sdict2)) # same keys + for key, val1 in sdict1.items(): + val2 = sdict2[key] + if isinstance(val1, list): + self.assertEqual(len(val1), len(val2), msg=f"Scores do not match for {key}: {val1} {val2}") + for item1, item2 in zip(val1, val2): + self.assertAlmostEqual(item1, item2, msg=f"Scores do not match for {key}: {val1} {val2}") + else: + self.assertAlmostEqual(val1, val2, msg=f"Scores do not match for {key}: {val1} {val2}") + + +class TestFilterPipeline(TestFilterPipelineBase): @classmethod def setUpClass(self): @@ -46,7 +62,7 @@ def test_score(self): ('1245..', '12345.....'), ('', '')] scores = list(fp.score(pairs)) - self.assertEqual( + self.assert_scores_equal( scores[0], {'LengthFilter': [5, 9], 'LengthRatioFilter': 1.8, @@ -57,7 +73,7 @@ def test_score(self): 'LanguageIDFilter': [1.0, 1.0], 'TerminalPunctuationFilter': -0.0, 'NonZeroNumeralsFilter': [1.0]}) - self.assertEqual( + self.assert_scores_equal( scores[1], {'LengthFilter': [1, 1], 'LengthRatioFilter': 1.0, @@ -68,7 +84,7 @@ def test_score(self): 'LanguageIDFilter': [0.17, 0.0], 'TerminalPunctuationFilter': -2.1972245773362196, 'NonZeroNumeralsFilter': [0.8888888888888888]}) - self.assertEqual( + 
self.assert_scores_equal( scores[2], {'LengthFilter': [0, 0], 'LengthRatioFilter': 0, @@ -132,7 +148,7 @@ def test_filter_empty(self): filtered, [('', ''), ('this is English', 'det är Svenska'), ('', '')]) -class TestFilterPipelineScoreNames(unittest.TestCase): +class TestFilterPipelineScoreNames(TestFilterPipelineBase): def test_without_names(self): config = [ @@ -152,10 +168,10 @@ def test_without_names(self): ('1245..', '12345.....')] scores = list(fp.score(pairs)) - self.assertEqual( + self.assert_scores_equal( scores[0], {'LengthFilter': {'1': [5, 9], '2': [34, 65]}}) - self.assertEqual( + self.assert_scores_equal( scores[1], {'LengthFilter': {'1': [1, 1], '2': [6, 10]}}) @@ -177,9 +193,9 @@ def test_with_names(self): ('1245..', '12345.....')] scores = list(fp.score(pairs)) - self.assertEqual( + self.assert_scores_equal( scores[0], {'LengthFilter': {'words': [5, 9], 'chars': [34, 65]}}) - self.assertEqual( + self.assert_scores_equal( scores[1], {'LengthFilter': {'words': [1, 1], 'chars': [6, 10]}}) From 55ec9a7c8135060ccb4fa801c958346079903f65 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 26 Jun 2024 10:28:51 +0300 Subject: [PATCH 5/7] replace langid.py with py3langid --- README.md | 2 +- docs/CHANGELOG.md | 3 ++- docs/CONTRIBUTING.md | 4 ++-- ...ipt_and_language_identification_filters.md | 19 +++++++++++++------ docs/installation.md | 12 ++++++------ opusfilter/filters.py | 4 ++-- requirements.txt | 2 +- setup.py | 3 ++- 8 files changed, 29 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index b19b768..adcd196 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ Install from source: ### Troubleshooting -OpusFilter should generally work fine on Python 3.8 to 3.11. In the case of troubles, try installing the exact versions in `requirements.txt`: +OpusFilter should generally work fine on Python 3.8 to 3.12. 
In the case of troubles, try installing the exact versions in `requirements.txt`: * `pip install -r requirements.txt` diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index b11bd69..745e705 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -9,7 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- make pycld2 and fasttext libraries optional +- make `pycld2` and `fasttext` libraries optional +- replace `langid.py` library with `py3langid` - update github workflows and include Python 3.12 tests ## [3.1.0] - 2024-06-05 diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 506eb41..9a0de65 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -5,7 +5,7 @@ issues page. We are also happy to consider pull requests. There are a few rules for pull requests: * Make a pull request to the `develop` branch instead of `master`. -* The code should support at least Python versions from 3.8 to 3.11. +* The code should support at least Python versions from 3.8 to 3.12. * Please follow [PEP 8](https://www.python.org/dev/peps/pep-0008/). Exception: The maximum line length is 127 characters instead of 79. * Especially for new features, please include test cases for unit testing. @@ -20,7 +20,7 @@ skips the respective tests if not.) GitHub workflows defined in the project run automatically `flake8` checks and unit testing with `pytest` using Python 3.8, 3.9, 3.10, -and 3.11. +3.11, and 3.12. Especially for larger contributions, consider using a code analysis tool like [Pylint](https://github.com/PyCQA/pylint). Install it diff --git a/docs/filters/script_and_language_identification_filters.md b/docs/filters/script_and_language_identification_filters.md index 020c69d..cf1a665 100644 --- a/docs/filters/script_and_language_identification_filters.md +++ b/docs/filters/script_and_language_identification_filters.md @@ -35,7 +35,7 @@ Filter segments based on their language identification confidence scores. 
Parameters: * `languages`: expected languages (ISO639 language codes) for the segments -* `id_method`: language indentification method (`langid` for using the `langid` library, `cld2` for using the `cld2` library, or `fasttext` for using a `fasttext` model; the default is `langid`) +* `id_method`: language identification method (`langid`, `lingua`, `cld2`, `fasttext`; default `langid`) * `thresholds`: minimum identification confidence score for the segments (a single float or a list of floats per language) * `fasttext_model_path`: path for a `fasttext` model (required only for the `fasttext` method; default `null`) * `langid_languages`: limit detection to a list of possible languages (valid only for the `langid` method; default `null`) @@ -44,8 +44,15 @@ Parameters: Returned scores are the language identification confidence scores from a given identification method for the segments. The scores range from 0 to 1. In filtering, all values have to be greater than the minimum thresholds. Negative threshold can be used to skip filtering for a language. -See [langid.py](https://github.com/saffsd/langid.py) and -[pycld2](https://github.com/aboSamoor/pycld2) for the method-specific -options. A pretrained `fasttext` model can be downloaded from -[fasttext.cc/docs/en/language-identification.html](https://fasttext.cc/docs/en/language-identification.html). -The `cld2` and `fasttext` methods require [installing optional libraries](../installation.md). +Currently the following identification methods are supported: + +* `langid` (default) :cite:`lui-baldwin-2012-langid` + * See https://github.com/adbar/py3langid +* `lingua` + * See https://github.com/pemistahl/lingua-py +* `cld2` + * See https://github.com/CLD2Owners/cld2 + * Requires [installing optional libraries](../installation.md). 
+* `fasttext` :cite:`joulin-etal-2016-fasttext` and :cite:`joulin-etal-2017-bag` + * A pretrained model can be downloaded from [fasttext.cc/docs/en/language-identification.html](https://fasttext.cc/docs/en/language-identification.html). + * Requires [installing optional libraries](../installation.md). diff --git a/docs/installation.md b/docs/installation.md index d9e0473..c14bee3 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -12,14 +12,14 @@ Install from source: Note that all required libraries are not available to install via PyPI on Windows OS. On Linux and MacOS, it should work directly for Python -versions from 3.8 to 3.11. +versions from 3.8 to 3.12. ## Required libraries * beautifulsoup4 * opus-fast-mosestokenizer * graphviz -* langid +* py3langid * matplotlib * morfessor * OpusTools @@ -41,11 +41,11 @@ See `setup.py` for possible version requirements. ### FastText and PyCLD2 language identification -The language identification methods currently supported out-of-the-box -are [langid](https://github.com/saffsd/langid.py) and +The language identification libraries currently supported out-of-the-box +are [py3langid](https://github.com/adbar/py3langid) and [lingua](https://github.com/pemistahl/lingua-py). The support for for -[pycld2](https://github.com/aboSamoor/pycld2) and -[fasttext models](https://fasttext.cc/docs/en/language-identification.html) +[PyCLD2](https://github.com/aboSamoor/pycld2) and +[FastText models](https://fasttext.cc/docs/en/language-identification.html) have been changed to optional due to the lack of support especially for newer Python versions. 
diff --git a/opusfilter/filters.py b/opusfilter/filters.py index 13f1461..bf2058f 100644 --- a/opusfilter/filters.py +++ b/opusfilter/filters.py @@ -334,8 +334,8 @@ def __init__(self, languages=None, id_method='langid', thresholds=None, def init_langid(self, langid_languages): """Initialize langid identifier""" - from langid.langid import LanguageIdentifier, model - self.identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True) + from py3langid.langid import LanguageIdentifier, MODEL_FILE + self.identifier = LanguageIdentifier.from_pickled_model(MODEL_FILE, norm_probs=True) if langid_languages: self.identifier.set_languages(langid_languages) diff --git a/requirements.txt b/requirements.txt index 623a808..44a157f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ opustools jieba>=0.42 beautifulsoup4>=4.8.2 graphviz>=0.16 -langid==1.1.6 +py3langid==0.3.0 matplotlib>=3.3.0 opus-fast-mosestokenizer>=0.0.8.5 pandas>=1.0.0 diff --git a/setup.py b/setup.py index 6161b90..74dab3e 100644 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ "opustools", "beautifulsoup4>=4.8.0", "graphviz", - "langid", + "py3langid>=0.2.2", "matplotlib", "morfessor", "opus-fast-mosestokenizer>=0.0.8.5", @@ -30,6 +30,7 @@ ] fasttext_require = [ + "py3langid<0.3.0", # 0.3.0 requires numpy 2.0.0 "numpy<2.0.0", "fasttext" ] From b661b39fd10a3281833ac23b7457a0abcb25f247 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 26 Jun 2024 10:53:21 +0300 Subject: [PATCH 6/7] convert langid confidences to floats --- opusfilter/filters.py | 2 +- opusfilter/opusfilter.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/opusfilter/filters.py b/opusfilter/filters.py index bf2058f..46aeb27 100644 --- a/opusfilter/filters.py +++ b/opusfilter/filters.py @@ -388,7 +388,7 @@ def confidence(self, sentence: str, lan: str) -> float: if self.id_method == 'langid': lidetails = self.identifier.classify(sentence) - lilan, liconf = lidetails[0], round(lidetails[1], 2) 
+ lilan, liconf = lidetails[0], round(float(lidetails[1]), 2) if lilan != lan: liconf = 0.0 return liconf diff --git a/opusfilter/opusfilter.py b/opusfilter/opusfilter.py index d415b8c..8490e7d 100644 --- a/opusfilter/opusfilter.py +++ b/opusfilter/opusfilter.py @@ -556,7 +556,11 @@ def _write_jsonl(objects, fname): """Write objects to file as JSON lines""" with file_open(fname, 'w') as fobj: for obj in objects: - fobj.write(json.dumps(obj, sort_keys=True)+'\n') + try: + fobj.write(json.dumps(obj, sort_keys=True)+'\n') + except TypeError as err: + logger.error("Could not convert to JSON: %s", obj) + raise err @staticmethod def _read_jsonl(fname): From 01b7e12e830295cef09b4994d8736436418c9190 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 26 Jun 2024 11:01:00 +0300 Subject: [PATCH 7/7] test --no-cache-dir in github pipeline --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ec43b71..ddbc398 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,7 +36,7 @@ jobs: python -m pip install --upgrade setuptools python -m pip install --upgrade pip python -m pip install flake8 pytest wheel - python -m pip install -r ${{ matrix.requirements-file }} + python -m pip install --no-cache-dir -r ${{ matrix.requirements-file }} python -m pip install . - name: Lint with flake8 run: |