diff --git a/.buildinfo b/.buildinfo index 2e55811..db6596a 100644 --- a/.buildinfo +++ b/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: c1d81864d9a486aace2e262a23a8d628 +config: 049ffd4e121bdafbe656d7a50163aab1 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/.doctrees/CHANGELOG.doctree b/.doctrees/CHANGELOG.doctree index ee2720f..a603529 100644 Binary files a/.doctrees/CHANGELOG.doctree and b/.doctrees/CHANGELOG.doctree differ diff --git a/.doctrees/CONTRIBUTING.doctree b/.doctrees/CONTRIBUTING.doctree index 6de23a5..92bed06 100644 Binary files a/.doctrees/CONTRIBUTING.doctree and b/.doctrees/CONTRIBUTING.doctree differ diff --git a/.doctrees/environment.pickle b/.doctrees/environment.pickle index 560452d..194aaa4 100644 Binary files a/.doctrees/environment.pickle and b/.doctrees/environment.pickle differ diff --git a/.doctrees/filters/script_and_language_identification_filters.doctree b/.doctrees/filters/script_and_language_identification_filters.doctree index 6d1999f..79c2452 100644 Binary files a/.doctrees/filters/script_and_language_identification_filters.doctree and b/.doctrees/filters/script_and_language_identification_filters.doctree differ diff --git a/.doctrees/functions/downloading_and_selecting_data.doctree b/.doctrees/functions/downloading_and_selecting_data.doctree index 92937f2..e0eafbd 100644 Binary files a/.doctrees/functions/downloading_and_selecting_data.doctree and b/.doctrees/functions/downloading_and_selecting_data.doctree differ diff --git a/.doctrees/installation.doctree b/.doctrees/installation.doctree index 05675fe..0e7412d 100644 Binary files a/.doctrees/installation.doctree and b/.doctrees/installation.doctree differ diff --git a/CHANGELOG.html b/CHANGELOG.html index 5ec470c..2f56fb3 100644 --- a/CHANGELOG.html +++ b/CHANGELOG.html @@ -4,7 +4,7 @@ - Changelog — OpusFilter 3.1.0 documentation + 
Changelog — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -36,7 +36,7 @@ OpusFilter
- 3.1 + 3.2
@@ -89,79 +89,84 @@
  • Contributing
  • Changelog
    • Unreleased
    • -
    • 3.1.0 - 2024-06-05
        -
      • Added
      • -
      • Removed
      • +
      • 3.2.0 - 2024-08-14
      • -
      • 3.0.0 - 2023-10-11
          -
        • Added
        • -
        • Changed
        • -
        • Removed
        • -
        • Fixed
        • +
        • 3.1.0 - 2024-06-05
        • -
        • 2.6.0 - 2022-11-30
            -
          • Added
          • -
          • Changed
          • -
          • Fixed
          • +
          • 3.0.0 - 2023-10-11
          • -
          • 2.5.1 - 2022-09-28
              -
            • Fixed
            • +
            • 2.6.0 - 2022-11-30
            • -
            • 2.5.0 - 2022-09-28
                -
              • Added
              • -
              • Changed
              • -
              • Fixed
              • +
              • 2.5.1 - 2022-09-28
              • -
              • 2.4.0 - 2022-04-05
                  -
                • Added
                • -
                • Changed
                • +
                • 2.5.0 - 2022-09-28
                • -
                • 2.3.1 - 2022-01-28
                    -
                  • Fixed
                  • +
                  • 2.4.0 - 2022-04-05
                  • -
                  • 2.3.0 - 2022-01-18
                      -
                    • Added
                    • -
                    • Changed
                    • +
                    • 2.3.1 - 2022-01-28
                    • -
                    • 2.2.0 - 2021-11-23
                        +
                      • 2.3.0 - 2022-01-18
                      • -
                      • 2.1.2 - 2021-11-11
                          -
                        • Fixed
                        • +
                        • 2.2.0 - 2021-11-23
                        • -
                        • 2.1.1 - 2021-10-19
                            -
                          • Changed
                          • +
                          • 2.1.2 - 2021-11-11
                          • -
                          • 2.1.0 - 2021-08-31
                              +
                            • 2.1.1 - 2021-10-19
                            • -
                            • 2.0.0 - 2021-06-01 @@ -195,10 +200,27 @@

                              ChangelogKeep a Changelog, and this project adheres to Semantic Versioning.

                              -

                              Unreleased

                              +

                              Unreleased

                              -

                              3.1.0 - 2024-06-05

                              +

                              3.2.0 - 2024-08-14

                              +
                              +

                              Changed

                              +
                                +
                              • make pycld2 and fasttext libraries optional

                              • +
                              • replace langid.py library with py3langid

                              • +
                              • update github workflows and include Python 3.12 tests

                              • +
                              +
                              +
                              +

                              Fixed

                              +
                                +
                              • OpusRead interface using moses format (requires opustools >= 1.6.2)

                              • +
                              +
                              +
                              +
                              +

                              3.1.0 - 2024-06-05

                              Added

                                @@ -211,25 +233,25 @@

                                Removed -

                                Fixed

                                +
                                +

                                Fixed

                                • fix score method in SentenceEmbeddingFilter (https://github.com/Helsinki-NLP/OpusFilter/pull/71)

                                • fix filter and filterfalse methods in SentenceEmbeddingFilter

                              -
                              -

                              3.0.0 - 2023-10-11

                              -
                              -

                              Added

                              +
                              +

                              3.0.0 - 2023-10-11

                              +
                              +

                              Added

                              • opusfilter-autogen script for automatic filter config generation

                              • score_direction, accept_threshold, and reject_threshold properties for filters

                              -
                              -

                              Changed

                              +
                              +

                              Changed

                              • refactor code and move auxiliary methods to opusfilter.util

                              • update varikn installation instructions (installable from PyPI)

                              • @@ -240,30 +262,30 @@

                                Changed -

                                Removed

                                +
                                +

                                Removed

                                • Python 3.6 support

                                -
                                -

                                Fixed

                                +
                                +

                                Fixed

                                • catch NotImplementedError from beautifulsoup 4.11.2

                                • catch ParserRejectedMarkup from beautifulsoup 4.12.0

                                -
                                -

                                2.6.0 - 2022-11-30

                                -
                                -

                                Added

                                +
                                +

                                2.6.0 - 2022-11-30

                                +
                                +

                                Added

                                • add slice missing from the enabled steps

                                -
                                -

                                Changed

                                +
                                +

                                Changed

                                -
                                -

                                Fixed

                                +
                                +

                                Fixed

                                • allow float thresholds for AverageWordLengthFilter

                                • remove unnecessary code from RegExpSub

                                • @@ -280,42 +302,42 @@

                                  Fixed

                                -
                                -

                                2.5.1 - 2022-09-28

                                -
                                -

                                Fixed

                                +
                                +

                                2.5.1 - 2022-09-28

                                +
                                +

                                Fixed

                                • add missing document file

                                -
                                -

                                2.5.0 - 2022-09-28

                                -
                                -

                                Added

                                +
                                +

                                2.5.0 - 2022-09-28

                                +
                                +

                                Added

                                • map_space_to option for Jieba and MeCab tokenizers to preserve existing space characters in input

                                • parallel processing options for filter, score, and preprocess steps

                                -
                                -

                                Changed

                                +
                                +

                                Changed

                                • re-organize documentation and support building it with sphinx

                                -
                                -

                                Fixed

                                +
                                +

                                Fixed

                                • catch TypeError exceptions from BeautifulSoup in HtmlTagFilter

                                -
                                -

                                2.4.0 - 2022-04-05

                                -
                                -

                                Added

                                +
                                +

                                2.4.0 - 2022-04-05

                                +
                                +

                                Added

                                -
                                -

                                Changed

                                +
                                +

                                Changed

                                • allow per-language parameters for LengthFilter, LengthRatioFilter, LongWordFilter, and AverageWordLengthFilter

                                • fix documentation for train_alignment parameters

                                -
                                -

                                2.3.1 - 2022-01-28

                                -
                                -

                                Fixed

                                +
                                +

                                2.3.1 - 2022-01-28

                                +
                                +

                                Fixed

                                • fix bug in classifier training without development set

                                -
                                -

                                2.3.0 - 2022-01-18

                                -
                                -

                                Added

                                +
                                +

                                2.3.0 - 2022-01-18

                                +
                                +

                                Added

                                -
                                -

                                Changed

                                +
                                +

                                Changed

                                • add workdir attribute to the FilterABC base class and change that the filters should use it for any file parameters

                                • @@ -365,50 +387,50 @@

                                  Changed

                                -
                                -

                                2.2.0 - 2021-11-23

                                -
                                -

                                Added

                                +
                                +

                                2.2.0 - 2021-11-23

                                +
                                +

                                Added

                                • support for Chinese word segmentation using jieba as a tokenizer (https://github.com/Helsinki-NLP/OpusFilter/pull/27)

                                -
                                -

                                2.1.2 - 2021-11-11

                                -
                                -

                                Fixed

                                +
                                +

                                2.1.2 - 2021-11-11

                                +
                                +

                                Fixed

                                • fix wrong keyword argument name in opusfilter-duplicates

                                -
                                -

                                2.1.1 - 2021-10-19

                                -
                                -

                                Changed

                                +
                                +

                                2.1.1 - 2021-10-19

                                +
                                +

                                Changed

                                • move “How to contribute” to docs/CONTRIBUTING.md

                                -
                                -

                                Fixed

                                +
                                +

                                Fixed

                                • fix setuptools requirement (https://github.com/Helsinki-NLP/OpusFilter/issues/21)

                                • fix version requirement for pandas (>=1.0.0)

                                -
                                -

                                2.1.0 - 2021-08-31

                                -
                                -

                                Changed

                                +
                                +

                                2.1.0 - 2021-08-31

                                +
                                +

                                Changed

                                • replace PyYAML with ruamel.yaml

                                -
                                -

                                Added

                                +
                                +

                                Added

                                • support for variables in the YAML configuration (https://github.com/Helsinki-NLP/OpusFilter/pull/13)

                                • support for fasttext-based language detection (https://github.com/Helsinki-NLP/OpusFilter/pull/20)

                                • @@ -420,17 +442,17 @@

                                  Added

                                -
                                -

                                2.0.0 - 2021-06-01

                                -
                                -

                                Changed

                                +
                                +

                                2.0.0 - 2021-06-01

                                +
                                +

                                Changed

                                • extend to n-lingual parallel data instead of just bilingual data

                                • switch tokenizer to fast-mosestokenizer

                                -
                                -

                                Added

                                +
                                +

                                Added

                                -
                                -

                                Fixed

                                +
                                +

                                Fixed

                                • behaviour of simple filters on empty segments

                                -
                                -

                                1.0.1 - 2020-05-25

                                -
                                -

                                Added

                                +
                                +

                                1.0.1 - 2020-05-25

                                +
                                +

                                Added

                                • improved logging, documentation, and project files

                                -
                                -

                                Fixed

                                +
                                +

                                Fixed

                                • prevent UnboundLocalError for empty output after filter

                                -
                                -

                                1.0.0 - 2020-04-10

                                +
                                +

                                1.0.0 - 2020-04-10

                                First tagged version.

                                diff --git a/CONTRIBUTING.html b/CONTRIBUTING.html index 7067da8..f0de894 100644 --- a/CONTRIBUTING.html +++ b/CONTRIBUTING.html @@ -4,7 +4,7 @@ - Contributing — OpusFilter 3.1.0 documentation + Contributing — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                                - 3.1 + 3.2
                                @@ -121,7 +121,7 @@

                                Contributing
                              • Make a pull request to the develop branch instead of master.

                              • -
                              • The code should support at least Python versions from 3.8 to 3.11.

                              • +
                              • The code should support at least Python versions from 3.8 to 3.12.

                              • Please follow PEP 8. Exception: The maximum line length is 127 characters instead of 79.

                              • Especially for new features, please include test cases for unit testing.

                              @@ -134,7 +134,7 @@

                              Contributingflake8 checks and unit testing with pytest using Python 3.8, 3.9, 3.10, -and 3.11.

                              +3.11, and 3.12.

                              Especially for larger contributions, consider using a code analysis tool like Pylint. Install it e.g. via pip, run pylint opusfilter/ in the project root and fix diff --git a/_sources/CHANGELOG.md.txt b/_sources/CHANGELOG.md.txt index 836436c..4a496b5 100644 --- a/_sources/CHANGELOG.md.txt +++ b/_sources/CHANGELOG.md.txt @@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [3.2.0] - 2024-08-14 + +### Changed + +- make `pycld2` and `fasttext` libraries optional +- replace `langid.py` library with `py3langid` +- update github workflows and include Python 3.12 tests + +### Fixed + +- `OpusRead` interface using `moses` format (requires `opustools >= 1.6.2`) + ## [3.1.0] - 2024-06-05 ### Added @@ -204,7 +216,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 First tagged version. -[Unreleased]: https://github.com/Helsinki-NLP/OpusFilter/compare/3.1.0...develop +[Unreleased]: https://github.com/Helsinki-NLP/OpusFilter/compare/3.2.0...develop +[3.2.0]: https://github.com/Helsinki-NLP/OpusFilter/compare/3.1.0...3.2.0 [3.1.0]: https://github.com/Helsinki-NLP/OpusFilter/compare/3.0.0...3.1.0 [3.0.0]: https://github.com/Helsinki-NLP/OpusFilter/compare/2.6.0...3.0.0 [2.6.0]: https://github.com/Helsinki-NLP/OpusFilter/compare/2.5.1...2.6.0 diff --git a/_sources/CONTRIBUTING.md.txt b/_sources/CONTRIBUTING.md.txt index 506eb41..9a0de65 100644 --- a/_sources/CONTRIBUTING.md.txt +++ b/_sources/CONTRIBUTING.md.txt @@ -5,7 +5,7 @@ issues page. We are also happy to consider pull requests. There are a few rules for pull requests: * Make a pull request to the `develop` branch instead of `master`. -* The code should support at least Python versions from 3.8 to 3.11. +* The code should support at least Python versions from 3.8 to 3.12. * Please follow [PEP 8](https://www.python.org/dev/peps/pep-0008/). 
Exception: The maximum line length is 127 characters instead of 79. * Especially for new features, please include test cases for unit testing. @@ -20,7 +20,7 @@ skips the respective tests if not.) GitHub workflows defined in the project run automatically `flake8` checks and unit testing with `pytest` using Python 3.8, 3.9, 3.10, -and 3.11. +3.11, and 3.12. Especially for larger contributions, consider using a code analysis tool like [Pylint](https://github.com/PyCQA/pylint). Install it diff --git a/_sources/filters/script_and_language_identification_filters.md.txt b/_sources/filters/script_and_language_identification_filters.md.txt index 961898e..cf1a665 100644 --- a/_sources/filters/script_and_language_identification_filters.md.txt +++ b/_sources/filters/script_and_language_identification_filters.md.txt @@ -35,7 +35,7 @@ Filter segments based on their language identification confidence scores. Parameters: * `languages`: expected languages (ISO639 language codes) for the segments -* `id_method`: language indentification method (`langid` for using the `langid` library, `cld2` for using the `cld2` library, or `fasttext` for using a `fasttext` model; the default is `langid`) +* `id_method`: language identification method (`langid`, `lingua`, `cld2`, `fasttext`; default `langid`) * `thresholds`: minimum identification confidence score for the segments (a single float or a list of floats per language) * `fasttext_model_path`: path for a `fasttext` model (required only for the `fasttext` method; default `null`) * `langid_languages`: limit detection to a list of possible languages (valid only for the `langid` method; default `null`) @@ -44,7 +44,15 @@ Parameters: Returned scores are the language identification confidence scores from a given identification method for the segments. The scores range from 0 to 1. In filtering, all values have to be greater than the minimum thresholds. Negative threshold can be used to skip filtering for a language. 
-See [langid.py](https://github.com/saffsd/langid.py) and -[pycld2](https://github.com/aboSamoor/pycld2) for the method-specific -options. A pretrained `fasttext` model can be downloaded from -[fasttext.cc/docs/en/language-identification.html](https://fasttext.cc/docs/en/language-identification.html). +Currently the following identification methods are supported: + +* `langid` (default) :cite:`lui-baldwin-2012-langid` + * See https://github.com/adbar/py3langid +* `lingua` + * See https://github.com/pemistahl/lingua-py +* `cld2` + * See https://github.com/CLD2Owners/cld2 + * Requires [installing optional libraries](../installation.md). +* `fasttext` :cite:`joulin-etal-2016-fasttext` and :cite:`joulin-etal-2017-bag` + * A pretrained model can be downloaded from [fasttext.cc/docs/en/language-identification.html](https://fasttext.cc/docs/en/language-identification.html). + * Requires [installing optional libraries](../installation.md). diff --git a/_sources/functions/downloading_and_selecting_data.md.txt b/_sources/functions/downloading_and_selecting_data.md.txt index cad15ae..95d35a0 100644 --- a/_sources/functions/downloading_and_selecting_data.md.txt +++ b/_sources/functions/downloading_and_selecting_data.md.txt @@ -11,11 +11,16 @@ Parameters: * `source_language`: language code for the source language * `target_language`: language code for the target language * `release`: version of the corpus in OPUS -* `preprocessing`: `raw` for untokenized and `xml` for tokenized segments +* `preprocessing`: `moses` or `raw` for untokenized and `xml` for tokenized segments * `src_output`: output file for source language * `tgt_output`: output file for target language * `suppress_prompts`: `false` (default) prompts user to confirm before download, `true` to download without prompting +The `moses` preprocessing type (available with `OpusTools` version +1.6.2 and above) is recommended for those corpora for which it +exists. 
The output is equivalent to `raw`, but in some cases it can +significantly reduce the amount of data downloaded in the process. + ## concatenate Concatenate two or more text files. diff --git a/_sources/installation.md.txt b/_sources/installation.md.txt index 92e86b3..c14bee3 100644 --- a/_sources/installation.md.txt +++ b/_sources/installation.md.txt @@ -12,20 +12,18 @@ Install from source: Note that all required libraries are not available to install via PyPI on Windows OS. On Linux and MacOS, it should work directly for Python -versions from 3.8 to 3.11. +versions from 3.8 to 3.12. ## Required libraries * beautifulsoup4 * opus-fast-mosestokenizer -* fasttext * graphviz -* langid +* py3langid * matplotlib * morfessor * OpusTools * pandas -* pycld2 * rapidfuzz * ruamel.yaml * regex @@ -41,24 +39,42 @@ See `setup.py` for possible version requirements. ## Optional libraries and tools +### FastText and PyCLD2 language identification + +The language identification libraries currently supported out-of-the-box +are [py3langid](https://github.com/adbar/py3langid) and +[lingua](https://github.com/pemistahl/lingua-py). The support for +[PyCLD2](https://github.com/aboSamoor/pycld2) and +[FastText models](https://fasttext.cc/docs/en/language-identification.html) +has been changed to optional due to the lack of support especially +for newer Python versions. + +The PyCLD2 support can be installed automatically with pip by +including the extras `[pycld2]` or `[all]` (e.g. +`pip install opusfilter[pycld2]`). + +The support for FastText models can be installed automatically with +pip by including the extras `[fasttext]` or `[all]` (e.g. +`pip install opusfilter[fasttext]`). + ### Jieba and MeCab word segmentation For Chinese tokenization (word segmentation), you can use the [jieba](https://github.com/fxsjy/jieba) library. It can be installed automatically with pip by including the extras `[jieba]` or `[all]` -(e.g. `pip install opusfilter[all]`). +(e.g. 
`pip install opusfilter[jieba]`). For Japanese tokenization (word segmentation), you can use the [MeCab](https://github.com/SamuraiT/mecab-python3) library. It can be installed automatically with pip by including the extras `[mecab]` or `[all]` -(e.g. `pip install opusfilter[all]`). +(e.g. `pip install opusfilter[mecab]`). ### LASER sentence embeddings For using sentence embeddings filters, you need to install `laserembeddings` (https://github.com/yannvgn/laserembeddings). It can be installed automatically with pip by including the extras `[laser]` -or `[all]` (e.g. `pip install opusfilter[all]`). The package will also +or `[all]` (e.g. `pip install opusfilter[laser]`). The package will also require a number of additional libraries, including PyTorch, jieba, and MeCab. Note that you need also to download the prebuilt models with `python -m laserembeddings download-models`. @@ -68,12 +84,12 @@ with `python -m laserembeddings download-models`. For using n-gram language model filters, you need to install the Python wrapper for VariKN (https://github.com/vsiivola/variKN). It can be installed automatically with pip by including the extras `[varikn]` -or `[all]` (e.g. `pip install opusfilter[all]`). +or `[all]` (e.g. `pip install opusfilter[varikn]`). ### Eflomal word alignment For using word alignment filters, you need to install eflomal (https://github.com/robertostling/eflomal). It can be installed automatically with pip by including the extras `[eflomal]` or `[all]` -(e.g. `pip install opusfilter[all]`). Note that you will need `Cython` +(e.g. `pip install opusfilter[eflomal]`). Note that you will need `Cython` for the installation. 
diff --git a/_static/documentation_options.js b/_static/documentation_options.js index 6ba7fc6..db3470d 100644 --- a/_static/documentation_options.js +++ b/_static/documentation_options.js @@ -1,6 +1,6 @@ var DOCUMENTATION_OPTIONS = { URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), - VERSION: '3.1.0', + VERSION: '3.2.0', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/automatic_configuration.html b/automatic_configuration.html index 7a0f801..515813f 100644 --- a/automatic_configuration.html +++ b/automatic_configuration.html @@ -4,7 +4,7 @@ - Automatic configuration generation — OpusFilter 3.1.0 documentation + Automatic configuration generation — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter

                              - 3.1 + 3.2
                              diff --git a/command_line_tools.html b/command_line_tools.html index 944c49a..b9b24d5 100644 --- a/command_line_tools.html +++ b/command_line_tools.html @@ -4,7 +4,7 @@ - Command line tools for analysis — OpusFilter 3.1.0 documentation + Command line tools for analysis — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                              - 3.1 + 3.2
                              diff --git a/filters/alignment_model_filters.html b/filters/alignment_model_filters.html index f9eff9e..7cc5a70 100644 --- a/filters/alignment_model_filters.html +++ b/filters/alignment_model_filters.html @@ -4,7 +4,7 @@ - Alignment model filters — OpusFilter 3.1.0 documentation + Alignment model filters — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                              - 3.1 + 3.2
                              diff --git a/filters/custom_filters.html b/filters/custom_filters.html index 5a090f8..9891e12 100644 --- a/filters/custom_filters.html +++ b/filters/custom_filters.html @@ -4,7 +4,7 @@ - Custom filters — OpusFilter 3.1.0 documentation + Custom filters — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                              - 3.1 + 3.2
                              diff --git a/filters/language_model_filters.html b/filters/language_model_filters.html index b9b722c..8e4df4d 100644 --- a/filters/language_model_filters.html +++ b/filters/language_model_filters.html @@ -4,7 +4,7 @@ - Language model filters — OpusFilter 3.1.0 documentation + Language model filters — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                              - 3.1 + 3.2
                              diff --git a/filters/length_filters.html b/filters/length_filters.html index c97eb48..bfd9d8f 100644 --- a/filters/length_filters.html +++ b/filters/length_filters.html @@ -4,7 +4,7 @@ - Length filters — OpusFilter 3.1.0 documentation + Length filters — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                              - 3.1 + 3.2
                              diff --git a/filters/script_and_language_identification_filters.html b/filters/script_and_language_identification_filters.html index 6d03967..95a8ce8 100644 --- a/filters/script_and_language_identification_filters.html +++ b/filters/script_and_language_identification_filters.html @@ -4,7 +4,7 @@ - Script and language identification filters — OpusFilter 3.1.0 documentation + Script and language identification filters — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                              - 3.1 + 3.2
                              @@ -152,7 +152,7 @@

                              LanguageIDFilter
                            • languages: expected languages (ISO639 language codes) for the segments

                            • -
                            • id_method: language indentification method (langid for using the langid library, cld2 for using the cld2 library, or fasttext for using a fasttext model; the default is langid)

                            • +
                            • id_method: language identification method (langid, lingua, cld2, fasttext; default langid)

                            • thresholds: minimum identification confidence score for the segments (a single float or a list of floats per language)

                            • fasttext_model_path: path for a fasttext model (required only for the fasttext method; default null)

                            • langid_languages: limit detection to a list of possible languages (valid only for the langid method; default null)

                            • @@ -160,10 +160,31 @@

                              LanguageIDFilterlingua_mode: a string specifying whether to use lingua’s high or low accuracy mode

                            Returned scores are the language identification confidence scores from a given identification method for the segments. The scores range from 0 to 1. In filtering, all values have to be greater than the minimum thresholds. Negative threshold can be used to skip filtering for a language.

                            -

                            See langid.py and -pycld2 for the method-specific -options. A pretrained fasttext model can be downloaded from -fasttext.cc/docs/en/language-identification.html.

                            +

                            Currently the following identification methods are supported:

                            + diff --git a/filters/sentence_embedding_filters.html b/filters/sentence_embedding_filters.html index a71f995..1d56ae7 100644 --- a/filters/sentence_embedding_filters.html +++ b/filters/sentence_embedding_filters.html @@ -4,7 +4,7 @@ - Sentence embedding filters — OpusFilter 3.1.0 documentation + Sentence embedding filters — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                            - 3.1 + 3.2
                            diff --git a/filters/special_character_and_similarity_filters.html b/filters/special_character_and_similarity_filters.html index 372c0c0..00c6be6 100644 --- a/filters/special_character_and_similarity_filters.html +++ b/filters/special_character_and_similarity_filters.html @@ -4,7 +4,7 @@ - Special character and similarity filters — OpusFilter 3.1.0 documentation + Special character and similarity filters — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                            - 3.1 + 3.2
                            diff --git a/functions/downloading_and_selecting_data.html b/functions/downloading_and_selecting_data.html index ae888ff..828fc6a 100644 --- a/functions/downloading_and_selecting_data.html +++ b/functions/downloading_and_selecting_data.html @@ -4,7 +4,7 @@ - Downloading and selecting data — OpusFilter 3.1.0 documentation + Downloading and selecting data — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                            - 3.1 + 3.2
                          • target_language: language code for the target language

                          • release: version of the corpus in OPUS

                          • -
                          • preprocessing: raw for untokenized and xml for tokenized segments

                          • +
                          • preprocessing: moses or raw for untokenized and xml for tokenized segments

                          • src_output: output file for source language

                          • tgt_output: output file for target language

                          • suppress_prompts: false (default) prompts user to confirm before download, true to download without prompting

                          +

                          The moses preprocessing type (available with OpusTools version +1.6.2 and above) is recommended for those corpora for which it +exists. The output is equivalent to raw, but in some cases it can +significantly reduce the amount of data downloaded in the process.

                          concatenate

                          diff --git a/functions/filtering_and_scoring.html b/functions/filtering_and_scoring.html index 318f093..99b2005 100644 --- a/functions/filtering_and_scoring.html +++ b/functions/filtering_and_scoring.html @@ -4,7 +4,7 @@ - Filtering and scoring — OpusFilter 3.1.0 documentation + Filtering and scoring — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/functions/preprocessing_text.html b/functions/preprocessing_text.html index d4b6584..624ab42 100644 --- a/functions/preprocessing_text.html +++ b/functions/preprocessing_text.html @@ -4,7 +4,7 @@ - Preprocessing text — OpusFilter 3.1.0 documentation + Preprocessing text — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/functions/training_and_using_classifiers.html b/functions/training_and_using_classifiers.html index 7162e7a..af7ed59 100644 --- a/functions/training_and_using_classifiers.html +++ b/functions/training_and_using_classifiers.html @@ -4,7 +4,7 @@ - Training and using classifiers — OpusFilter 3.1.0 documentation + Training and using classifiers — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/functions/training_language_and_alignment_models.html b/functions/training_language_and_alignment_models.html index 9f152ea..ca3063a 100644 --- a/functions/training_language_and_alignment_models.html +++ b/functions/training_language_and_alignment_models.html @@ -4,7 +4,7 @@ - Training language and alignment models — OpusFilter 3.1.0 documentation + Training language and alignment models — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/functions/using_score_files.html b/functions/using_score_files.html index cbcd797..010810b 100644 --- a/functions/using_score_files.html +++ b/functions/using_score_files.html @@ -4,7 +4,7 @@ - Using score files — OpusFilter 3.1.0 documentation + Using score files — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/genindex.html b/genindex.html index 98b8e32..1ee8683 100644 --- a/genindex.html +++ b/genindex.html @@ -3,7 +3,7 @@ - Index — OpusFilter 3.1.0 documentation + Index — OpusFilter 3.2.0 documentation @@ -14,7 +14,7 @@ - + @@ -34,7 +34,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/index.html b/index.html index 5a0d8c6..5b95760 100644 --- a/index.html +++ b/index.html @@ -4,7 +4,7 @@ - OpusFilter — OpusFilter 3.1.0 documentation + OpusFilter — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -36,7 +36,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/installation.html b/installation.html index 88c0f97..864cd51 100644 --- a/installation.html +++ b/installation.html @@ -4,7 +4,7 @@ - Installation — OpusFilter 3.1.0 documentation + Installation — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          @@ -52,6 +52,7 @@
                        • Installation
                        • diff --git a/objects.inv b/objects.inv index 8582661..72fc5dc 100644 Binary files a/objects.inv and b/objects.inv differ diff --git a/preprocessors/bpe_segmentation.html b/preprocessors/bpe_segmentation.html index 5df3bd9..f155043 100644 --- a/preprocessors/bpe_segmentation.html +++ b/preprocessors/bpe_segmentation.html @@ -4,7 +4,7 @@ - BPESegmentation — OpusFilter 3.1.0 documentation + BPESegmentation — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/preprocessors/custom_preprocessors.html b/preprocessors/custom_preprocessors.html index 1906844..6e236ab 100644 --- a/preprocessors/custom_preprocessors.html +++ b/preprocessors/custom_preprocessors.html @@ -4,7 +4,7 @@ - Custom preprocessors — OpusFilter 3.1.0 documentation + Custom preprocessors — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/preprocessors/detokenizer.html b/preprocessors/detokenizer.html index aec81e2..c006ec5 100644 --- a/preprocessors/detokenizer.html +++ b/preprocessors/detokenizer.html @@ -4,7 +4,7 @@ - Detokenizer — OpusFilter 3.1.0 documentation + Detokenizer — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/preprocessors/monolingual_sentence_splitter.html b/preprocessors/monolingual_sentence_splitter.html index 1bd1b63..68a224e 100644 --- a/preprocessors/monolingual_sentence_splitter.html +++ b/preprocessors/monolingual_sentence_splitter.html @@ -4,7 +4,7 @@ - MonolingualSentenceSplitter — OpusFilter 3.1.0 documentation + MonolingualSentenceSplitter — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/preprocessors/morfessor_segmentation.html b/preprocessors/morfessor_segmentation.html index 77ba96e..62de82c 100644 --- a/preprocessors/morfessor_segmentation.html +++ b/preprocessors/morfessor_segmentation.html @@ -4,7 +4,7 @@ - MorfessorSegmentation — OpusFilter 3.1.0 documentation + MorfessorSegmentation — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/preprocessors/reg_exp_sub.html b/preprocessors/reg_exp_sub.html index b889a72..7d3b4aa 100644 --- a/preprocessors/reg_exp_sub.html +++ b/preprocessors/reg_exp_sub.html @@ -4,7 +4,7 @@ - RegExpSub — OpusFilter 3.1.0 documentation + RegExpSub — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/preprocessors/tokenizer.html b/preprocessors/tokenizer.html index 499baa7..aecdda2 100644 --- a/preprocessors/tokenizer.html +++ b/preprocessors/tokenizer.html @@ -4,7 +4,7 @@ - Tokenizer — OpusFilter 3.1.0 documentation + Tokenizer — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/preprocessors/whitespaceNormalizer.html b/preprocessors/whitespaceNormalizer.html index 6637b3b..49d0ae3 100644 --- a/preprocessors/whitespaceNormalizer.html +++ b/preprocessors/whitespaceNormalizer.html @@ -4,7 +4,7 @@ - WhitespaceNormalizer — OpusFilter 3.1.0 documentation + WhitespaceNormalizer — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/references.html b/references.html index 4dc42aa..6e4e4b6 100644 --- a/references.html +++ b/references.html @@ -4,7 +4,7 @@ - Citing and references — OpusFilter 3.1.0 documentation + Citing and references — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/search.html b/search.html index 64de7a8..7792a39 100644 --- a/search.html +++ b/search.html @@ -3,7 +3,7 @@ - Search — OpusFilter 3.1.0 documentation + Search — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2
                          diff --git a/searchindex.js b/searchindex.js index 0356061..5e40dc8 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["CHANGELOG", "CONTRIBUTING", "automatic_configuration", "command_line_tools", "filters/alignment_model_filters", "filters/custom_filters", "filters/language_model_filters", "filters/length_filters", "filters/script_and_language_identification_filters", "filters/sentence_embedding_filters", "filters/special_character_and_similarity_filters", "functions/downloading_and_selecting_data", "functions/filtering_and_scoring", "functions/preprocessing_text", "functions/training_and_using_classifiers", "functions/training_language_and_alignment_models", "functions/using_score_files", "index", "installation", "preprocessors/bpe_segmentation", "preprocessors/custom_preprocessors", "preprocessors/detokenizer", "preprocessors/monolingual_sentence_splitter", "preprocessors/morfessor_segmentation", "preprocessors/reg_exp_sub", "preprocessors/tokenizer", "preprocessors/whitespaceNormalizer", "references", "usage"], "filenames": ["CHANGELOG.md", "CONTRIBUTING.md", "automatic_configuration.md", "command_line_tools.md", "filters/alignment_model_filters.md", "filters/custom_filters.md", "filters/language_model_filters.md", "filters/length_filters.md", "filters/script_and_language_identification_filters.md", "filters/sentence_embedding_filters.md", "filters/special_character_and_similarity_filters.md", "functions/downloading_and_selecting_data.md", "functions/filtering_and_scoring.md", "functions/preprocessing_text.md", "functions/training_and_using_classifiers.md", "functions/training_language_and_alignment_models.md", "functions/using_score_files.md", "index.rst", "installation.md", "preprocessors/bpe_segmentation.md", "preprocessors/custom_preprocessors.md", "preprocessors/detokenizer.md", "preprocessors/monolingual_sentence_splitter.md", "preprocessors/morfessor_segmentation.md", 
"preprocessors/reg_exp_sub.md", "preprocessors/tokenizer.md", "preprocessors/whitespaceNormalizer.md", "references.rst", "usage.md"], "titles": ["Changelog", "Contributing", "Automatic configuration generation", "Command line tools for analysis", "Alignment model filters", "Custom filters", "Language model filters", "Length filters", "Script and language identification filters", "Sentence embedding filters", "Special character and similarity filters", "Downloading and selecting data", "Filtering and scoring", "Preprocessing text", "Training and using classifiers", "Training language and alignment models", "Using score files", "OpusFilter", "Installation", "BPESegmentation", "Custom preprocessors", "Detokenizer", "MonolingualSentenceSplitter", "MorfessorSegmentation", "RegExpSub", "Tokenizer", "WhitespaceNormalizer", "Citing and references", "Basic usage"], "terms": {"all": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 15, 18, 19, 23, 24, 25, 27, 28], "notabl": 0, "thi": [0, 2, 3, 6, 9, 10, 12, 15, 16, 19, 20, 22, 28], "project": [0, 1, 2], "document": [0, 15, 17, 19, 23, 25, 27], "file": [0, 2, 3, 4, 5, 9, 11, 12, 13, 14, 15, 17, 19, 20, 22, 23, 25, 28], "The": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 22, 24, 25, 26, 27, 28], "format": [0, 3, 6, 12, 14, 16, 17, 28], "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 22, 24, 25, 26, 28], "base": [0, 2, 3, 5, 6, 7, 8, 10, 11, 12, 14, 15, 16, 17, 20, 27], "keep": [0, 25], "adher": 0, "semant": 0, "version": [0, 1, 11, 18], "support": [0, 1, 2, 4, 9, 22, 25], "lingua": [0, 8, 18], "languag": [0, 2, 4, 5, 7, 9, 10, 11, 12, 17, 18, 20, 21, 22, 24, 25, 27, 28], "detect": [0, 8], "http": [0, 18, 27], "github": [0, 1, 4, 18], "com": [0, 4, 18], "helsinki": [0, 27], "nlp": 0, "opusfilt": [0, 1, 2, 5, 16, 18, 20, 25, 27], "pull": [0, 1], "65": 0, "python": [0, 1, 5, 10, 17, 18, 27, 28], "7": 0, "score": [0, 2, 4, 5, 6, 7, 8, 9, 10, 14, 15, 17, 28], "method": [0, 2, 4, 5, 6, 8, 12, 20, 22, 
28], "sentenceembeddingfilt": [0, 15], "71": 0, "filter": [0, 3, 11, 14, 18, 20, 27, 28], "filterfals": [0, 4, 5, 12, 28], "autogen": [0, 2, 3], "script": [0, 2, 3, 17], "automat": [0, 1, 5, 17, 18], "config": [0, 2, 28], "gener": [0, 3, 5, 10, 11, 12, 15, 17, 20, 28], "score_direct": [0, 5], "accept_threshold": [0, 5], "reject_threshold": [0, 5], "properti": [0, 5], "refactor": 0, "code": [0, 1, 2, 4, 5, 6, 8, 9, 11, 15, 19, 20, 21, 22, 25, 27], "move": 0, "auxiliari": 0, "util": 0, "updat": [0, 10, 16], "varikn": [0, 1, 15], "instal": [0, 1, 17, 25], "instruct": [0, 1], "from": [0, 1, 2, 3, 5, 6, 8, 10, 11, 12, 14, 15, 16, 17, 18, 20, 22, 24, 28], "pypi": [0, 18], "workflow": [0, 1], "includ": [0, 1, 4, 5, 6, 10, 12, 13, 15, 18, 28], "test": [0, 1, 9, 11, 12], "librari": [0, 3, 8, 10, 17, 22, 24, 26], "requir": [0, 5, 8, 10, 11, 15, 28], "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 22, 23, 24, 25, 27, 28], "xxhash": [0, 11, 12, 18], "instead": [0, 1, 2, 3, 6, 10, 12, 15], "pyhash": 0, "hash": [0, 3, 11, 12], "function": [0, 3, 11, 12, 14, 25, 28], "opu": [0, 11, 17, 18, 25, 27, 28], "fast": [0, 18, 25], "mosestoken": [0, 18, 25], "eflom": [0, 1, 4, 15], "new": [0, 1, 15, 22, 28], "interfac": [0, 11, 27], "wordalignfilt": [0, 15], "catch": 0, "notimplementederror": 0, "beautifulsoup": 0, "parserrejectedmarkup": 0, "12": [0, 27], "add": [0, 2, 3, 5], "slice": 0, "miss": 0, "enabl": [0, 4, 9, 20], "step": [0, 2, 5, 11, 12, 15, 20, 25, 26, 28], "improv": 0, "import": [0, 5, 20, 22], "slow": [0, 9], "onli": [0, 2, 3, 5, 8, 10, 11, 12, 14, 15, 22, 25, 28], "when": [0, 6, 14, 24, 28], "need": [0, 5, 11, 15, 18, 28], "chunk": [0, 4, 28], "repetitionfilt": 0, "singl": [0, 2, 3, 5, 8, 10, 11, 12, 13, 22, 26], "consist": [0, 26], "threshold": [0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 28], "allow": [0, 2, 7, 8, 15], "float": [0, 2, 8, 16], "averagewordlengthfilt": 0, "unnecessari": 0, "regexpsub": [0, 17], "setuptool": 0, "map_space_to": [0, 25], "option": 
[0, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 19, 20, 21, 22, 23, 25, 28], "jieba": [0, 25], "mecab": [0, 25], "token": [0, 4, 6, 11, 15, 17, 18, 20, 21], "preserv": [0, 10], "exist": [0, 2, 16, 28], "space": [0, 10, 25, 26], "charact": [0, 1, 5, 6, 7, 8, 12, 15, 17, 25, 26], "input": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 21, 22, 25, 28], "parallel": [0, 2, 3, 4, 5, 7, 10, 11, 12, 13, 17, 20, 21, 22, 25, 27, 28], "process": [0, 4, 9, 11, 12, 13, 15, 17, 20, 27, 28], "preprocess": [0, 10, 11, 17, 20, 25, 28], "re": [0, 24, 26, 28], "organ": 0, "build": 0, "sphinx": 0, "typeerror": 0, "except": [0, 1, 5, 11, 12, 22, 28], "htmltagfilt": 0, "an": [0, 2, 3, 4, 5, 6, 11, 12, 14, 16, 19, 20, 22, 25, 27, 28], "write": [0, 3, 14, 15, 28], "alphabetratiofilt": [0, 2], "regexpfilt": 0, "similarityfilt": 0, "japanes": [0, 18, 25], "word": [0, 4, 6, 7, 10, 12, 15, 17, 19, 23, 27, 28], "segment": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 20, 22, 23, 27, 28], "subword": [0, 6, 11, 15, 19, 23, 27], "bpesegment": [0, 15, 17], "morfessorsegment": [0, 15, 17], "n": [0, 6, 10, 11, 15, 17, 27, 28], "gram": [0, 6, 15, 17, 27], "model": [0, 5, 8, 9, 10, 11, 14, 17, 19, 23, 27], "per": [0, 5, 7, 8, 11, 15, 16, 22, 28], "paramet": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 19, 21, 22, 23, 24, 25, 28], "lengthfilt": [0, 28], "lengthratiofilt": [0, 2, 28], "longwordfilt": 0, "train_alig": [0, 4], "bug": [0, 1], "classifi": [0, 6, 12, 17], "train": [0, 4, 6, 9, 11, 12, 17, 19, 23, 27], "without": [0, 11], "develop": [0, 1, 22], "set": [0, 1, 2, 6, 11, 12, 13, 15, 16, 28], "opusfilterruntimeerror": 0, "have": [0, 1, 5, 6, 7, 8, 10, 11, 12, 14, 16, 28], "e": [0, 1, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 15, 16, 17, 18, 24, 27, 28], "g": [0, 1, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 15, 16, 17, 18, 24, 28], "empti": [0, 2, 3, 4, 6, 11, 12, 15, 25], "data": [0, 2, 3, 4, 5, 6, 10, 12, 14, 15, 17, 22, 27, 28], "save": [0, 2, 5, 14, 15], "creat": [0, 6, 11, 15, 16, 28], 
"alig": [0, 4, 15, 17], "prior": [0, 4, 15, 23], "repeat": [0, 10], "substr": [0, 10], "preprocessor": [0, 13, 15], "sentenc": [0, 2, 5, 6, 11, 12, 14, 15, 16, 17, 22, 27], "split": [0, 15, 19, 22, 23], "monolingu": [0, 3, 22], "specif": [0, 6, 8, 12, 13, 24], "languageidfilt": [0, 2], "chunksiz": [0, 4, 9, 28], "common": [0, 10, 11, 12, 13, 17, 28], "section": [0, 12, 13, 28], "lmclassifierfilt": 0, "classif": [0, 6, 14, 27], "workdir": [0, 5], "attribut": [0, 5], "filterabc": [0, 5], "class": [0, 2, 3, 5, 6, 12, 13, 15, 20, 25], "should": [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 15, 16, 18, 20, 25, 28], "ani": [0, 5, 7, 10, 12, 17, 19, 26, 28], "increas": [0, 2, 28], "default": [0, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 19, 22, 23, 24, 25, 28], "filterpipelin": 0, "10000": [0, 15], "100000": [0, 2, 4, 28], "clean": [0, 2, 5, 6, 14, 28], "up": [0, 1, 6, 15, 28], "chines": [0, 18, 25], "27": 0, "wrong": 0, "keyword": [0, 5], "argument": [0, 2, 3, 5, 12, 20, 24, 28], "name": [0, 2, 3, 5, 11, 12, 13, 14, 15, 28], "duplic": [0, 2, 11, 12], "how": [0, 2], "contribut": [0, 17], "doc": [0, 8], "md": 0, "issu": [0, 1, 2, 11], "21": 0, "panda": [0, 12, 18], "replac": [0, 24, 26, 28], "pyyaml": 0, "ruamel": [0, 18], "yaml": [0, 3, 6, 17, 18, 28], "variabl": [0, 5], "configur": [0, 3, 5, 6, 17, 20, 27], "13": 0, "fasttext": [0, 8, 18, 27], "20": [0, 7, 27], "suppress_prompt": [0, 11], "opus_read": [0, 28], "download": [0, 8, 17, 18, 28], "readm": 0, "bibliographi": [0, 27], "refer": [0, 9, 17, 28], "extend": [0, 17], "lingual": [0, 10], "just": [0, 28], "bilingu": [0, 10], "switch": 0, "command": [0, 15, 17], "diagram": 0, "longestcommonsubstringfilt": 0, "latest": [0, 18], "corpu": [0, 2, 3, 9, 11, 12, 15, 17, 22, 27, 28], "releas": [0, 11, 18, 28], "overlap": [0, 3, 12], "remove_dupl": [0, 3], "lower": [0, 4, 5, 6, 7], "crossentropyfilt": [0, 15], "ci": 0, "flake8": [0, 1], "unittest": 0, "behaviour": 0, "simpl": [0, 1, 3, 5, 17, 28], "log": [0, 6, 10, 15], "prevent": 
[0, 11], "unboundlocalerror": 0, "output": [0, 2, 3, 5, 11, 12, 13, 14, 15, 16, 22, 28], "after": [0, 6, 8], "first": [0, 2, 3, 6, 10, 11, 12, 24, 26, 28], "tag": [0, 6, 10, 15, 28], "question": 1, "report": [1, 27], "featur": [1, 2, 14, 17, 25, 27], "wish": 1, "ar": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 22, 24, 25, 26, 27, 28], "welcom": [1, 17], "page": [1, 2, 27], "we": [1, 2], "also": [1, 2, 3, 5, 7, 8, 11, 16, 18, 20, 24, 28], "happi": 1, "consid": [1, 2, 5, 15, 28], "request": [1, 18], "There": [1, 10], "few": [1, 2], "rule": [1, 15], "make": [1, 2, 12, 28], "branch": 1, "master": 1, "least": [1, 10, 12, 15], "3": [1, 4, 10, 15, 18, 27, 28], "8": [1, 18, 28], "11": [1, 18, 27], "pleas": [1, 2, 25, 27], "follow": [1, 2, 4, 5, 10, 28], "pep": 1, "maximum": [1, 5, 6, 7, 10, 14], "line": [1, 11, 12, 14, 16, 17, 22, 28], "length": [1, 5, 6, 10, 12, 17, 25, 28], "127": 1, "79": [1, 27], "especi": [1, 9, 12, 28], "case": [1, 6, 7, 12, 22, 28], "unit": [1, 7, 10, 12, 19, 23, 27, 28], "compat": [1, 4], "can": [1, 2, 3, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 20, 22, 25, 28], "check": [1, 28], "via": [1, 18], "pip": [1, 18], "run": [1, 3, 4, 11, 12, 13, 24], "root": 1, "locat": [1, 2, 5], "directori": [1, 2, 5, 28], "To": 1, "them": [1, 3, 6, 11, 12, 16, 28], "pytest": 1, "nosetest": 1, "work": [1, 2, 10, 12, 15, 18, 28], "you": [1, 2, 3, 4, 5, 6, 10, 11, 12, 15, 16, 18, 20, 22, 24, 25, 27, 28], "skip": [1, 8, 11, 15, 28], "respect": [1, 5, 6, 10, 11, 12, 28], "defin": [1, 2, 3, 5, 11, 12, 13, 14, 15, 20, 22, 28], "9": [1, 10], "10": [1, 5, 6, 15, 27], "larger": [1, 4, 15], "analysi": [1, 17], "tool": [1, 5, 16, 17, 27], "like": [1, 2, 9, 16, 28], "pylint": 1, "fix": 1, "everyth": 1, "note": [1, 2, 6, 11, 12, 15, 18, 28], "current": [1, 2, 9, 20, 28], "yield": [1, 5, 12, 20], "warn": [1, 22], "ad": [2, 3, 14], "determin": 2, "usag": [2, 17], "descript": [2, 21], "h": [2, 27], "textfil": 2, "lang": 2, "langcod": 2, "percentil": 2, "cluster": 2, 
"sampl": [2, 11], "size": [2, 4, 11, 12, 15, 28], "sample_s": 2, "noisi": [2, 11, 14], "noisy_percentil": 2, "dir": 2, "work_dir": 2, "inter": 2, "inter_dir": 2, "plot": [2, 3], "list": [2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 21, 24, 25, 28], "json": [2, 3, 12, 14, 16, 28], "overwrit": [2, 28], "o": [2, 5, 18, 27], "configfil": 2, "initi": [2, 10, 14], "text": [2, 11, 15, 17, 20, 21, 22, 23, 25, 27, 28], "help": [2, 9, 28], "show": [2, 5], "messag": 2, "exit": 2, "": [2, 3, 6, 8, 10, 11, 12, 15, 17, 22, 26, 27, 28], "correspond": [2, 4, 6, 9, 10, 14, 15, 16, 24, 28], "If": [2, 3, 5, 6, 9, 10, 11, 12, 13, 15, 16, 25, 27, 28], "omit": 2, "alphabet": [2, 8], "latin": 2, "characterscorefilt": 2, "int": [2, 16], "max": [2, 10, 14], "number": [2, 3, 5, 9, 10, 11, 12, 13, 15, 16, 18, 22, 25, 27, 28], "pair": [2, 4, 5, 6, 7, 9, 10, 12, 14, 15, 28], "proport": [2, 3, 5, 8, 11], "0": [2, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 19, 23, 24, 27], "001": [2, 15], "k": [2, 9, 11, 27], "try": 2, "too": [2, 15], "much": 2, "2": [2, 4, 6, 7, 10, 11, 12, 15, 23, 27, 28], "sourc": [2, 4, 6, 7, 10, 11, 12, 15, 16, 18, 28], "target": [2, 4, 6, 7, 10, 11, 12, 15, 16, 28], "intermedi": 2, "temporari": [2, 5], "given": [2, 3, 5, 7, 8, 11, 12, 14, 15, 24, 25, 28], "scatter": [2, 3], "histogram": [2, 3], "distribut": [2, 12], "quit": 2, "object": [2, 3, 5, 12, 15, 16, 28], "mai": [2, 4, 5, 9, 10, 11, 12, 15, 24, 27, 28], "dot": [2, 3, 16], "uniqu": 2, "identifi": [2, 6], "order": [2, 7, 8, 10, 11, 15, 24, 25, 28], "multipl": [2, 3, 5, 11, 12, 14, 16, 21, 22, 24, 25, 28], "same": [2, 3, 4, 9, 11, 12, 20, 28], "exampl": [2, 4, 5, 6, 11, 12, 14, 15, 16], "cld2": [2, 8], "id_method": [2, 8], "assum": [2, 5, 28], "each": [2, 5, 6, 10, 11, 12, 13, 14, 15, 16, 21, 22, 25, 28], "independ": 2, "most": [2, 11, 25, 28], "three": [2, 5, 10], "describ": [2, 3], "more": [2, 9, 11, 12, 15, 16, 25, 26, 28], "detail": [2, 4, 15, 19, 23], "below": [2, 5, 6, 7, 9, 10, 12, 13], "howev": 2, "applic": 2, 
"limit": [2, 8, 10, 15], "implement": [2, 5, 9, 17, 20, 27], "introduc": 2, "aulamo": [2, 11, 17, 27], "et": [2, 6, 9, 10, 11, 15, 17, 19, 23, 27], "al": [2, 6, 9, 10, 11, 15, 17, 19, 23, 27], "2023": [2, 27], "It": [2, 12, 17, 18, 20, 28], "take": [2, 3, 5, 11, 20, 28], "tri": 2, "separ": [2, 6, 7, 10, 11, 12, 15, 16, 19, 23, 24, 28], "nonzeronumeralsfilt": 2, "terminalpunctuationfilt": 2, "expand": [2, 28], "made": [2, 12, 13], "flexibl": 2, "futur": 2, "remov": [2, 3, 8, 10, 12, 26], "next": 2, "subset": 2, "100k": 2, "produc": [2, 3, 4, 10, 11, 12, 14, 19, 28], "previous": 2, "mention": 2, "These": [2, 5], "mean": [2, 15, 16], "group": 2, "valu": [2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 25, 28], "center": 2, "look": [2, 15, 28], "mani": [2, 12, 15, 17, 19, 28], "figur": 2, "want": [2, 11, 12, 15, 16, 24, 28], "sure": [2, 11, 12, 28], "experiment": 2, "expect": [2, 4, 6, 8, 16, 22], "give": [2, 4, 5, 11, 12, 15, 25], "good": [2, 5, 6, 11], "result": [2, 4, 5, 12], "corpora": [2, 3, 6, 15, 17], "feedback": 2, "apart": [3, 15, 16], "main": [3, 28], "cmd": [3, 28], "packag": [3, 5, 18, 20, 25, 28], "provid": [3, 4, 9, 10, 16, 19, 22, 25, 28], "analyz": [3, 12], "draw": 3, "direct": [3, 4, 14], "acycl": 3, "graph": 3, "graphviz": [3, 18], "rankdir": 3, "tb": 3, "lr": 3, "chang": [3, 6, 15, 25, 28], "left": 3, "right": 3, "top": [3, 12, 13, 28], "bottom": 3, "end": [3, 4, 6, 11, 15, 28], "raw": [3, 11, 17, 28], "otherwis": [3, 10], "render": 3, "indic": [3, 5, 10, 11, 12, 14, 28], "extens": [3, 27], "pdf": [3, 27], "png": 3, "print": 3, "out": [3, 9, 10, 12, 15], "statist": [3, 27], "entri": [3, 5, 7, 14, 20, 27], "either": [3, 5, 7, 10, 11, 12, 15, 28], "one": [3, 5, 6, 10, 12, 13, 15, 16, 22, 25, 26, 28], "calcul": [3, 5, 6, 7, 9, 10, 12], "two": [3, 4, 5, 10, 11, 15, 16, 26, 28], "between": [3, 5, 6, 7, 10, 15, 19], "syntax": [3, 28], "letter": [3, 4, 5], "lowercas": [3, 10, 23], "essenti": 3, "ha": [3, 5, 7, 10, 11, 12, 14, 15, 16, 17, 20, 28], "sever": 
[3, 11], "subcommand": 3, "column": 3, "basic": [3, 17], "corr": [3, 27], "correl": 3, "matrix": 3, "hist": 3, "amount": 3, "would": [3, 22], "similar": [3, 6, 9, 15, 17], "definit": [3, 28], "which": [3, 7, 11, 12, 14, 15, 20, 28], "second": [3, 12, 26, 28], "For": [3, 4, 5, 6, 8, 10, 11, 12, 15, 16, 18, 28], "dictionari": [3, 5, 6, 8, 12, 13, 14, 15, 16, 21, 24, 25, 28], "total": 3, "In": [3, 4, 6, 7, 8, 10, 12, 25, 27, 28], "addit": [3, 15, 18, 20], "possibl": [3, 4, 6, 8, 10, 12, 18, 28], "collect": [3, 11, 12, 17], "similarli": [3, 20], "\u00f6": [4, 15, 27], "stling": [4, 15, 27], "tiedemann": [4, 11, 15, 17, 27], "2016": [4, 15, 19, 27], "src_threshold": 4, "tgt_threshold": 4, "src_token": [4, 15], "null": [4, 6, 8, 9, 11, 12, 15, 16, 19, 22, 25], "tgt_token": [4, 15], "type": [4, 5, 7, 10, 14, 15, 16, 21, 25, 27, 28], "score_for_empti": [4, 6], "100": [4, 7, 10, 28], "A": [4, 6, 8, 11, 24, 27, 28], "accept": [4, 5, 6, 7, 10, 12], "both": [4, 10, 11, 12, 28], "than": [4, 5, 8, 9, 10, 25], "mose": [4, 11, 25], "tupl": [4, 5, 24], "contain": [4, 5, 10, 11, 12, 14, 15, 16, 24, 28], "appropri": [4, 25], "en": [4, 8, 28], "english": [4, 27, 28], "1": [4, 5, 7, 8, 9, 10, 11, 12, 14, 15, 19, 27, 28], "ibm1": 4, "hmm": 4, "fertil": 4, "see": [4, 6, 8, 9, 11, 12, 13, 15, 18, 19, 21, 23, 25, 28], "robertostl": [4, 18], "caveat": 4, "stochast": 4, "exactli": [4, 11], "thu": [4, 27], "your": [4, 5, 11, 12, 17, 20, 27], "pipelin": [4, 17], "full": [4, 27], "replic": 4, "moreov": 4, "estim": [4, 15], "even": [4, 11, 20], "consequ": [4, 11], "matter": 4, "onc": [4, 12], "wors": 4, "n_job": [4, 12, 13, 28], "regardless": 4, "other": [4, 11, 15, 25], "own": [5, 17, 20], "modul": [5, 10, 12, 13, 20], "kei": [5, 12, 13, 15, 16, 20, 28], "inherit": [5, 20], "abstract": [5, 20], "thei": [5, 10, 12, 25, 28], "addition": [5, 12, 13], "adjust": 5, "recommend": [5, 6, 12, 28], "iter": [5, 20], "over": [5, 6, 20], "return": [5, 6, 7, 8, 10, 11, 15], "whether": [5, 6, 8, 10], 
"constant": [5, 24], "depend": [5, 11, 15, 28], "clean_low": 5, "clean_high": 5, "abov": [5, 10, 15, 28], "clean_between": 5, "minimum": [5, 6, 7, 8, 10, 14, 28], "clean_tru": 5, "true": [5, 6, 7, 8, 10, 11, 12, 15, 16], "clean_fals": 5, "fals": [5, 6, 7, 8, 10, 11, 12, 15, 16, 22, 23], "__init__": 5, "arbitrari": [5, 10, 24], "kwarg": 5, "call": 5, "remain": [5, 28], "reserv": 5, "non": [5, 6, 10, 12, 19, 22], "temprari": 5, "compabl": 5, "global": [5, 28], "forc": [5, 15, 16, 28], "reject": [5, 10, 12], "That": 5, "sensibl": 5, "alwai": [5, 6, 7, 11, 22], "upper": [5, 6], "min_threshold": 5, "max_threshold": 5, "min_length": [5, 7, 10, 28], "max_length": [5, 7, 10, 28], "decis": [5, 11], "redefin": 5, "reason": [5, 15], "uppercas": 5, "less": 5, "50": [5, 6], "uppercasefilt": 5, "6": 5, "def": 5, "self": 5, "5": [5, 6, 9, 10, 27], "super": 5, "uppercase_ratio": 5, "len": 5, "sum": [5, 6], "char": [5, 7, 10, 15], "sent": 5, "isupp": 5, "ratio": [5, 7, 8, 10, 28], "customfilt": 5, "eviron": 5, "py": [5, 8, 18, 27], "pythonpath": 5, "environ": 5, "select": [5, 6, 12, 14, 16, 17, 27], "extern": [5, 16], "resourc": [5, 27], "store": [5, 11, 12, 28], "itself": 5, "path": [5, 8, 28], "join": [5, 11], "rel": [5, 10], "probabl": [6, 14, 19], "lm_param": 6, "score_typ": 6, "cross": 6, "entropi": 6, "perplixti": 6, "perplex": 6, "neg": [6, 8, 14], "logprob": 6, "low_threshold": 6, "diff_threshold": 6, "absolut": [6, 7, 10, 15], "differ": [6, 7, 8, 9, 10, 11, 12, 14, 28], "manual": [6, 12, 28], "filenam": [6, 15], "arpa": [6, 15], "lm": [6, 15], "binari": [6, 15, 23], "unk": 6, "unknown": 6, "symbol": [6, 15], "sensit": 6, "include_unk": 6, "cc": [6, 8], "context": 6, "cue": 6, "ignor": 6, "mb": [6, 15], "morph": 6, "boundari": [6, 14, 15], "mark": [6, 10, 15], "wb": [6, 15], "w": [6, 15], "init_hist": 6, "interpol": 6, "weight": [6, 10, 15], "train_ngram": 6, "match": [6, 10, 12, 15, 19, 22, 25, 26], "do": [6, 10, 12, 15, 16, 22], "unless": [6, 12], "know": 6, "what": [6, 
8], "moor": [6, 27], "lewi": [6, 27], "2010": [6, 27], "id_lm_param": 6, "domain": 6, "nd_lm_param": 6, "content": [6, 10, 11], "naiv": 6, "bay": 6, "label": [6, 14], "map": [6, 28], "relative_scor": 6, "normal": [6, 9, 10, 26], "largest": 6, "likelihood": 6, "divid": [6, 7, 14, 15], "get": [6, 15, 16], "custom": [6, 12, 13, 17, 22, 28], "vatanen": [6, 27], "discount": [6, 15], "4": [6, 15, 24, 27], "so": [6, 12, 15, 28], "idea": 6, "small": [6, 9, 15], "unigram": 6, "background": 6, "coeffici": 6, "found": [6, 10, 24], "example_config": 6, "qed_lm_langid": 6, "whitespac": [7, 8, 10, 26], "pass_empti": 7, "zero": [7, 10], "nth": 7, "appli": [7, 12, 13, 24], "higher": [7, 14, 15], "infin": 7, "averag": [7, 9], "40": 7, "longest": [7, 10], "across": 7, "75": 8, "exclude_whitespac": 8, "exclud": 8, "equal": [8, 10, 11, 15], "greater": [8, 10], "valid": [8, 14, 28], "www": [8, 27], "regular": [8, 10, 15, 24], "express": [8, 10, 15, 24, 28], "info": 8, "unicod": [8, 16, 26], "html": [8, 10], "confid": 8, "iso639": 8, "indentif": 8, "langid": [8, 18, 27], "fasttext_model_path": 8, "langid_languag": 8, "cld2_option": 8, "lingua_mod": 8, "string": [8, 10, 11, 12, 15, 16, 25, 28], "specifi": [8, 11, 14, 15], "high": [8, 14, 15], "low": [8, 10, 14, 27], "accuraci": [8, 15], "mode": 8, "rang": 8, "pycld2": [8, 18], "pretrain": 8, "nn_model": 9, "nearest": [9, 15], "neighbor": [9, 15], "time": [9, 12, 15, 28], "200": 9, "multilingu": [9, 17, 27], "laser": 9, "propos": 9, "artetx": [9, 27], "schwenk": [9, 27], "2018": [9, 27], "chaudhari": [9, 27], "2019": [9, 10, 27], "cosin": [9, 15], "train_nearest_neighbor": 9, "With": [9, 28], "closer": 9, "suitabl": [9, 15], "enough": 9, "gpu": 9, "comput": [9, 15, 27], "pytorch": [9, 18], "laserembed": [9, 18], "boolean": 10, "none": [10, 12, 15], "penalti": 10, "co": 10, "occurr": 10, "thermin": 10, "punctuat": 10, "v": [10, 27], "\u00e1": [10, 27], "zquez": [10, 27], "formul": 10, "termin": 10, "count": [10, 12, 24], "increment": 10, 
"beyond": 10, "occur": [10, 12], "final": [10, 12, 19, 27, 28], "greatest": 10, "smaller": [10, 15], "measur": 10, "numer": [10, 12, 16], "extract": 10, "sequenc": [10, 24, 26], "sequencematch": 10, "difflib": 10, "require_al": 10, "reach": 10, "c": [10, 27], "pairwis": 10, "shorter": 10, "compar": [10, 11, 12, 16], "levenshtein": 10, "distanc": [10, 15], "integ": [10, 11, 12, 16], "cost": [10, 15, 28], "edit": 10, "oper": [10, 11, 16, 17, 19], "insert": [10, 16], "delet": 10, "substitut": [10, 24, 28], "qualiti": 10, "nmt": [10, 15, 19], "repetit": 10, "activ": 10, "were": [10, 28], "longer": [10, 15], "cannot": [10, 12, 16, 28], "start": [10, 11, 15, 24], "regexp": 10, "accept_match": 10, "experss": 10, "regex": [10, 18, 19], "read": [11, 16, 28], "2012": [11, 17, 27], "opustool": [11, 17, 18, 27], "2020": [11, 17, 27], "corpus_nam": [11, 28], "source_languag": [11, 28], "target_languag": [11, 28], "untoken": 11, "xml": 11, "src_output": [11, 28], "tgt_output": [11, 28], "prompt": 11, "user": [11, 28], "confirm": 11, "befor": [11, 25], "url": [11, 27], "last": [11, 15, 28], "memori": [11, 12, 17, 28], "index": [11, 20, 24], "stop": [11, 15, 28], "until": 11, "part": [11, 16], "approxim": 11, "fraction": 11, "outputs_2": 11, "rest": 11, "divisor": 11, "modulo": 11, "algorithm": [11, 12, 15, 22], "xxh64": [11, 12], "seed": [11, 15], "condit": 11, "where": [11, 28], "hold": 11, "written": [11, 12, 14, 17, 28], "doe": 11, "random": [11, 15], "benefit": 11, "approach": 11, "fulli": 11, "determinist": 11, "ident": 11, "goe": 11, "origin": [11, 25], "downsid": 11, "care": [11, 12], "consecut": 11, "themselv": 11, "unexpect": 11, "distinct": 11, "prime": 11, "choic": [11, 28], "ensur": 11, "shuffle_subset": 11, "shuffl": 11, "cartesian": 11, "skip_empti": 11, "skip_dupl": 11, "item": [11, 14, 16], "combin": [11, 12, 13, 17], "altern": 11, "translat": [11, 27], "meaning": 11, "variat": 11, "segmenat": 11, "b": [11, 27], "m": [11, 18, 27], "x": [11, 16, 27, 28], "style": 
11, "tab": 11, "convert": [11, 25], "alreadi": [11, 28], "mostli": [11, 12, 13, 15, 22], "comparison": 12, "crawl": 12, "web": 12, "been": [12, 14, 17, 28], "exact": 12, "cryptograph": 12, "reduc": 12, "consumpt": 12, "veri": [12, 15, 28], "larg": [12, 15, 17], "concaten": [12, 28], "togeth": 12, "64": 12, "bit": [12, 28], "fine": 12, "practic": 12, "about": 12, "extra": [12, 18, 25], "collis": 12, "disabl": [12, 22, 25], "sub": [12, 13, 15, 24], "job": [12, 13, 28], "default_n_job": [12, 13, 28], "pass": [12, 15, 25], "repres": [12, 13, 15], "level": [12, 13, 28], "typic": [12, 13, 15], "lenghtfilt": 12, "special": [12, 13, 17, 22], "under": [12, 13, 16, 28], "avail": [12, 13, 18, 25], "readi": [12, 13], "effect": [12, 27], "those": [12, 15], "opposit": 12, "manner": [12, 28], "simpli": 12, "its": [12, 25, 28], "anoth": [12, 28], "instanc": 12, "easi": 12, "load": 12, "datafram": 12, "json_norm": 12, "whitespacenorm": [13, 17], "sklearn": 14, "training_scor": 14, "criterion": 14, "optim": [14, 15], "ce": 14, "roc_auc": 14, "sse": 14, "aic": 14, "bic": 14, "dev_scor": 14, "model_typ": 14, "logisticregress": 14, "model_paramet": 14, "examplefilt": 14, "quantil": 14, "min": 14, "init": 14, "posit": 14, "achiev": 14, "highest": 14, "assign": 14, "output_prob": 14, "cleanest": 14, "noisiest": 14, "output_label": 14, "varigram": 15, "siivola": [15, 27], "2007": [15, 27], "crossentropydifferencefilt": 15, "optdata": 15, "leav": 15, "norder": 15, "dscale": 15, "scale": 15, "factor": 15, "dscale2": 15, "dure": 15, "prune": [15, 27], "use_3nz": 15, "kneser": 15, "nei": 15, "smooth": [15, 27], "cutoff": 15, "intern": [15, 27], "unsur": 15, "balanc": 15, "thumb": 15, "doubl": 15, "bpe": [15, 19], "morfessor": [15, 18, 23, 27], "latter": [15, 28], "prefix": [15, 22], "postfix": 15, "preceed": 15, "break": [15, 22], "tation": 15, "ation": 15, "src_data": 15, "tgt_data": 15, "unsupervis": [15, 27], "search": 15, "embed": [15, 17, 27], "n_neighbor": 15, "neightbor": 15, "queri": 
15, "brute": 15, "metric": 15, "wrapper": [15, 18], "scikit": [15, 18], "learn": [15, 18, 19], "nearestneighbor": 15, "inform": 15, "proper": 15, "caution": 15, "sennrich": [15, 19, 27], "min_frequ": 15, "frequenc": 15, "num_work": 15, "processor": 15, "multiprocess": 15, "cpu_count": 15, "virpioja": [15, 23, 27], "2013": [15, 23, 27], "corpusweight": 15, "dampen": 15, "logarithm": 15, "ones": 15, "use_skip": 15, "frequent": 15, "seen": 15, "compound": 15, "speed": [15, 28], "forcesplit_list": 15, "nosplit_r": 15, "surround": 15, "deeper": 16, "hierarch": 16, "y": 16, "overwritten": [16, 28], "myscor": 16, "src": 16, "tgt": 16, "jsonl": 16, "gz": [16, 28], "old": 16, "now": 16, "revers": 16, "descend": 16, "field": 16, "convers": 16, "str": 16, "interpret": 16, "plain": 16, "access": 16, "effici": [17, 27], "identif": [17, 27], "present": 17, "acl": [17, 27], "system": [17, 27], "demonstr": [17, 27], "align": [17, 27], "detoken": 17, "monolingualsentencesplitt": 17, "cite": 17, "changelog": 17, "setup": [18, 28], "window": 18, "On": [18, 27], "linux": 18, "maco": 18, "directli": 18, "beautifulsoup4": 18, "matplotlib": 18, "rapidfuzz": 18, "splitter": [18, 22], "subword_nmt": 18, "tqdm": 18, "detector": 18, "yannvgn": 18, "prebuild": 18, "vsiivola": 18, "elfom": 18, "cython": 18, "merg": 19, "vocab": 19, "vocabulari": 19, "revert": 19, "oov": 19, "glossari": 19, "affect": 19, "dropout": 19, "train_bp": 19, "preprocessorabc": 20, "modifi": 20, "f_idx": 20, "being": 20, "vari": 20, "tokenzi": [21, 25], "non_breaking_prefix_fil": 22, "overrid": [22, 28], "enable_parallel": 22, "rais": [22, 28], "heurist": 22, "philipp": [22, 27], "koehn": [22, 27], "josh": 22, "schroeder": 22, "europarl": [22, 27], "2005": [22, 27], "european": [22, 27], "intend": [22, 28], "becaus": 22, "viterbi_max_len": 23, "30": [23, 27], "viterbi_smooth": 23, "train_morfessor": 23, "pattern": 24, "lang_pattern": 24, "flag": 24, "compil": 24, "fork": 25, "avali": 25, "zh": 25, "zh_cn": 25, "track": 
25, "cut": 25, "jp": 25, "By": [25, 28], "unid": 25, "lite": 25, "mecab_arg": 25, "again": [25, 28], "standard": [26, 28], "lead": 26, "trail": 26, "research": 27, "our": 27, "paper": 27, "inproceed": 27, "etal": 27, "titl": 27, "pu": 27, "f": 27, "ilter": 27, "toolbox": 27, "author": 27, "mikko": 27, "sami": 27, "j": 27, "rg": 27, "booktitl": 27, "proceed": 27, "58th": 27, "annual": 27, "meet": 27, "associ": 27, "linguist": 27, "month": 27, "jul": 27, "year": 27, "publish": 27, "aclweb": 27, "org": 27, "anthologi": 27, "demo": 27, "doi": 27, "18653": 27, "v1": 27, "150": 27, "156": 27, "margin": 27, "mikel": 27, "holger": 27, "mine": 27, "arxiv": 27, "ab": 27, "1811": 27, "01136": 27, "ona": 27, "de": 27, "gibert": 27, "24th": 27, "confer": 27, "machin": 27, "31": 27, "38": 27, "tamper": 27, "finland": 27, "june": 27, "aclanthologi": 27, "eamt": 27, "umut": 27, "sulubacak": 27, "t": 27, "ool": 27, "diagnost": 27, "12th": 27, "evalu": 27, "3782": 27, "3789": 27, "marseil": 27, "franc": 27, "lrec": 27, "467": 27, "juli": 27, "vishrav": 27, "yuqe": 27, "tang": 27, "francisco": 27, "guzm": 27, "fourth": 27, "volum": 27, "share": 27, "task": 27, "dai": 27, "261": 27, "266": 27, "florenc": 27, "itali": 27, "august": 27, "w19": 27, "5435": 27, "joulin": 27, "armand": 27, "edouard": 27, "grave": 27, "piotr": 27, "bojanowski": 27, "matthij": 27, "douz": 27, "herv": 27, "\u00e9": 27, "gou": 27, "tom": 27, "mikolov": 27, "zip": 27, "compress": [27, 28], "1612": 27, "03651": 27, "2017": 27, "bag": 27, "toma": 27, "trick": 27, "15th": 27, "uropean": 27, "chapter": 27, "short": 27, "427": 27, "431": 27, "valencia": 27, "spain": 27, "april": 27, "e17": 27, "2068": 27, "uroparl": 27, "summit": 27, "86": 27, "phuket": 27, "thailand": 27, "septemb": 27, "mtsummit": 27, "lui": 27, "baldwin": 27, "marco": 27, "timothi": 27, "off": 27, "shelf": 27, "25": 27, "jeju": 27, "island": 27, "korea": 27, "p12": 27, "3005": 27, "intellig": 27, "robert": 27, "william": 27, "220": 27, "224": 27, 
"uppsala": 27, "sweden": 27, "p10": 27, "2041": 27, "neural": 27, "rico": 27, "barri": 27, "haddow": 27, "alexandra": 27, "birch": 27, "rare": 27, "54th": 27, "long": 27, "1715": 27, "1725": 27, "berlin": 27, "germani": 27, "p16": 27, "1162": 27, "grow": 27, "vesa": 27, "teemu": 27, "hirsim\u00e4ki": 27, "neser": 27, "ei": 27, "ieee": 27, "transact": 27, "audio": 27, "speech": 27, "15": 27, "1617": 27, "1624": 27, "1109": 27, "tasl": 27, "896666": 27, "eighth": 27, "2214": 27, "2218": 27, "istanbul": 27, "turkei": 27, "elra": 27, "conf": 27, "lrec2012": 27, "463_paper": 27, "tommi": 27, "jaakko": 27, "\u00e4": 27, "yrynen": 27, "nicoletta": 27, "calzolari": 27, "khalid": 27, "choukri": 27, "bent": 27, "maegaard": 27, "joseph": 27, "mariani": 27, "jan": 27, "odjik": 27, "stelio": 27, "piperidi": 27, "mike": 27, "rosner": 27, "daniel": 27, "tapia": 27, "editor": 27, "seventh": 27, "peter": 27, "smit": 27, "stig": 27, "arn": 27, "gr": 27, "nroo": 27, "kurimo": 27, "orfessor": 27, "aselin": 27, "aalto": 27, "univers": 27, "public": 27, "seri": 27, "scienc": 27, "technologi": 27, "depart": 27, "signal": 27, "acoust": 27, "vazquez": 27, "ra": 27, "\u00fa": 27, "l": 27, "u": 27, "nivers": 27, "elsinki": 27, "submiss": 27, "wmt": [27, 28], "19": 27, "294": 27, "300": 27, "5441": 27, "ostl": 27, "arkov": 27, "hain": 27, "ont": 27, "arlo": 27, "pragu": 27, "bulletin": 27, "mathemat": 27, "106": 27, "125": 27, "146": 27, "octob": 27, "ufal": 27, "mff": 27, "cuni": 27, "cz": 27, "pbml": 27, "art": 27, "here": [27, 28], "articl": 27, "journal": 27, "eprinttyp": 27, "eprint": 27, "timestamp": 27, "22": 27, "nov": 27, "17": 27, "58": 27, "0100": 27, "biburl": 27, "dblp": 27, "rec": 27, "bib": 27, "bibsourc": 27, "2020a": 27, "address": 27, "isbn": 27, "979": 27, "95546": 27, "34": 27, "2020b": 27, "jun": 27, "aug": 27, "archiveprefix": 27, "mon": 27, "28": 27, "dec": 27, "02": 27, "joulingbdjm16": 27, "apr": 27, "sep": 27, "\u00f6stling": 27, "owner": 27, "08": 27, "26": 27, 
"hirsim": 27, "aki": 27, "v\u00e1zquez": 27, "techreport": 27, "institut": 27, "eng": 27, "At": 28, "point": 28, "output_directori": 28, "thing": 28, "finnish": 28, "paracrawl": 28, "v4": 28, "fi": 28, "utf": 28, "gzip": 28, "bz2": 28, "bzip2": 28, "complex": 28, "v2019": 28, "node": 28, "anchor": 28, "previou": 28, "paracrawl_filt": 28, "myfilt": 28, "wmt_filter": 28, "bitext": 28, "lot": 28, "while": 28, "programmat": 28, "coupl": 28, "var": 28, "scope": 28, "kind": 28, "varstr": 28, "within": 28, "l1": 28, "l2": 28, "txt": 28, "templat": 28, "quot": 28, "loader": 28, "insid": 28, "brace": 28, "individu": 28, "local": 28, "place": 28, "file1": 28, "file2": 28, "sv": 28, "substep": 28, "exploit": 28, "wihtout": 28, "complet": 28, "conveni": 28, "outputdir": 28, "outdir": 28, "wai": 28, "former": 28, "perform": 28, "difficult": 28, "pars": 28, "notat": 28, "dash": 28, "underscor": 28, "still": 28, "shown": 28, "easier": 28}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"changelog": 0, "unreleas": 0, "3": 0, "1": 0, "0": 0, "2024": 0, "06": 0, "05": 0, "ad": 0, "remov": 0, "fix": 0, "2023": 0, "10": 0, "11": 0, "chang": 0, "2": 0, "6": 0, "2022": 0, "30": 0, "5": 0, "09": 0, "28": 0, "4": 0, "04": 0, "01": 0, "18": 0, "2021": 0, "23": 0, "19": 0, "08": 0, "31": 0, "2020": 0, "25": 0, "contribut": 1, "automat": 2, "configur": [2, 28], "gener": 2, "unsupervis": 2, "threshold": 2, "select": [2, 11], "filter": [2, 4, 5, 6, 7, 8, 9, 10, 12, 17], "command": [3, 28], "line": 3, "tool": [3, 18], "analysi": 3, "opusfilt": [3, 17, 28], "diagram": 3, "duplic": 3, "score": [3, 12, 16], "test": 3, "align": [4, 15, 18], "model": [4, 6, 15, 18], "wordalignfilt": 4, "custom": [5, 20], "languag": [6, 8, 15], "crossentropyfilt": 6, "crossentropydifferencefilt": 6, "lmclassifierfilt": 6, "length": 7, "lengthfilt": 7, "lengthratiofilt": 7, "averagewordlengthfilt": 7, "longwordfilt": 7, "script": [8, 28], "identif": 8, "alphabetratiofilt": 8, "characterscorefilt": 8, 
"languageidfilt": 8, "sentenc": [9, 18], "embed": [9, 18], "sentenceembeddingfilt": 9, "special": 10, "charact": 10, "similar": 10, "htmltagfilt": 10, "terminalpunctuationfilt": 10, "nonzeronumeralsfilt": 10, "longestcommonsubstringfilt": 10, "similarityfilt": 10, "repetitionfilt": 10, "regexpfilt": 10, "download": 11, "data": 11, "opus_read": 11, "concaten": 11, "head": 11, "tail": 11, "slice": 11, "split": 11, "subset": 11, "product": 11, "unzip": 11, "write": 11, "remove_dupl": 12, "preprocess": 13, "text": 13, "train": [14, 15], "us": [14, 16], "classifi": 14, "train_classifi": 14, "train_ngram": 15, "train_alig": 15, "train_nearest_neighbor": 15, "train_bp": 15, "train_morfessor": 15, "file": 16, "join": 16, "sort": 16, "get": 17, "start": 17, "avail": 17, "function": 17, "preprocessor": [17, 20], "other": 17, "inform": 17, "instal": 18, "requir": 18, "librari": 18, "option": 18, "jieba": 18, "mecab": 18, "word": 18, "segment": 18, "laser": 18, "varikn": 18, "n": 18, "gram": 18, "eflom": 18, "bpesegment": 19, "detoken": 21, "monolingualsentencesplitt": 22, "morfessorsegment": 23, "regexpsub": 24, "token": 25, "whitespacenorm": 26, "cite": 27, "refer": 27, "bibtex": 27, "basic": 28, "usag": 28, "exampl": 28, "variabl": 28, "constant": 28, "run": 28, "singl": 28}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinxcontrib.bibtex": 9, "sphinx": 58}, "alltitles": {"Changelog": [[0, "changelog"]], "Unreleased": [[0, "unreleased"]], "3.1.0 - 2024-06-05": [[0, "id1"]], "Added": [[0, "added"], [0, "id3"], [0, "id7"], [0, "id13"], [0, "id17"], [0, "id22"], [0, "id25"], [0, "id33"], [0, "id36"], [0, "id39"]], "Removed": [[0, "removed"], [0, "id4"]], "Fixed": [[0, "fixed"], [0, "id5"], [0, "id9"], [0, "id11"], [0, "id15"], [0, 
"id20"], [0, "id27"], [0, "id30"], [0, "id37"], [0, "id40"]], "3.0.0 - 2023-10-11": [[0, "id2"]], "Changed": [[0, "changed"], [0, "id8"], [0, "id14"], [0, "id18"], [0, "id23"], [0, "id29"], [0, "id32"], [0, "id35"]], "2.6.0 - 2022-11-30": [[0, "id6"]], "2.5.1 - 2022-09-28": [[0, "id10"]], "2.5.0 - 2022-09-28": [[0, "id12"]], "2.4.0 - 2022-04-05": [[0, "id16"]], "2.3.1 - 2022-01-28": [[0, "id19"]], "2.3.0 - 2022-01-18": [[0, "id21"]], "2.2.0 - 2021-11-23": [[0, "id24"]], "2.1.2 - 2021-11-11": [[0, "id26"]], "2.1.1 - 2021-10-19": [[0, "id28"]], "2.1.0 - 2021-08-31": [[0, "id31"]], "2.0.0 - 2021-06-01": [[0, "id34"]], "1.0.1 - 2020-05-25": [[0, "id38"]], "1.0.0 - 2020-04-10": [[0, "id41"]], "Contributing": [[1, "contributing"]], "Automatic configuration generation": [[2, "automatic-configuration-generation"]], "Unsupervised threshold selection for filters": [[2, "unsupervised-threshold-selection-for-filters"]], "Command line tools for analysis": [[3, "command-line-tools-for-analysis"]], "opusfilter-diagram": [[3, "opusfilter-diagram"]], "opusfilter-duplicates": [[3, "opusfilter-duplicates"]], "opusfilter-scores": [[3, "opusfilter-scores"]], "opusfilter-test": [[3, "opusfilter-test"]], "Alignment model filters": [[4, "alignment-model-filters"]], "WordAlignFilter": [[4, "wordalignfilter"]], "Custom filters": [[5, "custom-filters"]], "Language model filters": [[6, "language-model-filters"]], "CrossEntropyFilter": [[6, "crossentropyfilter"]], "CrossEntropyDifferenceFilter": [[6, "crossentropydifferencefilter"]], "LMClassifierFilter": [[6, "lmclassifierfilter"]], "Length filters": [[7, "length-filters"]], "LengthFilter": [[7, "lengthfilter"]], "LengthRatioFilter": [[7, "lengthratiofilter"]], "AverageWordLengthFilter": [[7, "averagewordlengthfilter"]], "LongWordFilter": [[7, "longwordfilter"]], "Script and language identification filters": [[8, "script-and-language-identification-filters"]], "AlphabetRatioFilter": [[8, "alphabetratiofilter"]], "CharacterScoreFilter": [[8, 
"characterscorefilter"]], "LanguageIDFilter": [[8, "languageidfilter"]], "Sentence embedding filters": [[9, "sentence-embedding-filters"]], "SentenceEmbeddingFilter": [[9, "sentenceembeddingfilter"]], "Special character and similarity filters": [[10, "special-character-and-similarity-filters"]], "HtmlTagFilter": [[10, "htmltagfilter"]], "TerminalPunctuationFilter": [[10, "terminalpunctuationfilter"]], "NonZeroNumeralsFilter": [[10, "nonzeronumeralsfilter"]], "LongestCommonSubstringFilter": [[10, "longestcommonsubstringfilter"]], "SimilarityFilter": [[10, "similarityfilter"]], "RepetitionFilter": [[10, "repetitionfilter"]], "RegExpFilter": [[10, "regexpfilter"]], "Downloading and selecting data": [[11, "downloading-and-selecting-data"]], "opus_read": [[11, "opus-read"]], "concatenate": [[11, "concatenate"]], "download": [[11, "download"]], "head": [[11, "head"]], "tail": [[11, "tail"]], "slice": [[11, "slice"]], "split": [[11, "split"]], "subset": [[11, "subset"]], "product": [[11, "product"]], "unzip": [[11, "unzip"]], "write": [[11, "write"]], "Filtering and scoring": [[12, "filtering-and-scoring"]], "remove_duplicates": [[12, "remove-duplicates"]], "filter": [[12, "filter"]], "score": [[12, "score"]], "Preprocessing text": [[13, "preprocessing-text"]], "preprocess": [[13, "preprocess"]], "Training and using classifiers": [[14, "training-and-using-classifiers"]], "train_classifier": [[14, "train-classifier"]], "classify": [[14, "classify"]], "Training language and alignment models": [[15, "training-language-and-alignment-models"]], "train_ngram": [[15, "train-ngram"]], "train_aligment": [[15, "train-aligment"]], "train_nearest_neighbors": [[15, "train-nearest-neighbors"]], "train_bpe": [[15, "train-bpe"]], "train_morfessor": [[15, "train-morfessor"]], "Using score files": [[16, "using-score-files"]], "join": [[16, "join"]], "sort": [[16, "sort"]], "OpusFilter": [[17, "opusfilter"]], "Get started": [[17, null]], "Available functions": [[17, null]], "Available 
filters": [[17, null]], "Available preprocessors": [[17, null]], "Other information": [[17, null]], "Installation": [[18, "installation"]], "Required libraries": [[18, "required-libraries"]], "Optional libraries and tools": [[18, "optional-libraries-and-tools"]], "Jieba and MeCab word segmentation": [[18, "jieba-and-mecab-word-segmentation"]], "LASER sentence embeddings": [[18, "laser-sentence-embeddings"]], "VariKN n-gram models": [[18, "varikn-n-gram-models"]], "Eflomal word alignment": [[18, "eflomal-word-alignment"]], "BPESegmentation": [[19, "bpesegmentation"]], "Custom preprocessors": [[20, "custom-preprocessors"]], "Detokenizer": [[21, "detokenizer"]], "MonolingualSentenceSplitter": [[22, "monolingualsentencesplitter"]], "MorfessorSegmentation": [[23, "morfessorsegmentation"]], "RegExpSub": [[24, "regexpsub"]], "Tokenizer": [[25, "tokenizer"]], "WhitespaceNormalizer": [[26, "whitespacenormalizer"]], "Citing and references": [[27, "citing-and-references"]], "Citing": [[27, "citing"]], "References": [[27, "references"]], "References as BibTeX": [[27, "references-as-bibtex"]], "Basic usage": [[28, "basic-usage"]], "opusfilter script": [[28, "opusfilter-script"]], "Configuration examples": [[28, "configuration-examples"]], "Variables and constants": [[28, "variables-and-constants"]], "Running a single command": [[28, "running-a-single-command"]]}, "indexentries": {}}) \ No newline at end of file +Search.setIndex({"docnames": ["CHANGELOG", "CONTRIBUTING", "automatic_configuration", "command_line_tools", "filters/alignment_model_filters", "filters/custom_filters", "filters/language_model_filters", "filters/length_filters", "filters/script_and_language_identification_filters", "filters/sentence_embedding_filters", "filters/special_character_and_similarity_filters", "functions/downloading_and_selecting_data", "functions/filtering_and_scoring", "functions/preprocessing_text", "functions/training_and_using_classifiers", 
"functions/training_language_and_alignment_models", "functions/using_score_files", "index", "installation", "preprocessors/bpe_segmentation", "preprocessors/custom_preprocessors", "preprocessors/detokenizer", "preprocessors/monolingual_sentence_splitter", "preprocessors/morfessor_segmentation", "preprocessors/reg_exp_sub", "preprocessors/tokenizer", "preprocessors/whitespaceNormalizer", "references", "usage"], "filenames": ["CHANGELOG.md", "CONTRIBUTING.md", "automatic_configuration.md", "command_line_tools.md", "filters/alignment_model_filters.md", "filters/custom_filters.md", "filters/language_model_filters.md", "filters/length_filters.md", "filters/script_and_language_identification_filters.md", "filters/sentence_embedding_filters.md", "filters/special_character_and_similarity_filters.md", "functions/downloading_and_selecting_data.md", "functions/filtering_and_scoring.md", "functions/preprocessing_text.md", "functions/training_and_using_classifiers.md", "functions/training_language_and_alignment_models.md", "functions/using_score_files.md", "index.rst", "installation.md", "preprocessors/bpe_segmentation.md", "preprocessors/custom_preprocessors.md", "preprocessors/detokenizer.md", "preprocessors/monolingual_sentence_splitter.md", "preprocessors/morfessor_segmentation.md", "preprocessors/reg_exp_sub.md", "preprocessors/tokenizer.md", "preprocessors/whitespaceNormalizer.md", "references.rst", "usage.md"], "titles": ["Changelog", "Contributing", "Automatic configuration generation", "Command line tools for analysis", "Alignment model filters", "Custom filters", "Language model filters", "Length filters", "Script and language identification filters", "Sentence embedding filters", "Special character and similarity filters", "Downloading and selecting data", "Filtering and scoring", "Preprocessing text", "Training and using classifiers", "Training language and alignment models", "Using score files", "OpusFilter", "Installation", "BPESegmentation", "Custom 
preprocessors", "Detokenizer", "MonolingualSentenceSplitter", "MorfessorSegmentation", "RegExpSub", "Tokenizer", "WhitespaceNormalizer", "Citing and references", "Basic usage"], "terms": {"all": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 15, 18, 19, 23, 24, 25, 27, 28], "notabl": 0, "thi": [0, 2, 3, 6, 9, 10, 12, 15, 16, 19, 20, 22, 28], "project": [0, 1, 2], "document": [0, 15, 17, 19, 23, 25, 27], "file": [0, 2, 3, 4, 5, 9, 11, 12, 13, 14, 15, 17, 19, 20, 22, 23, 25, 28], "The": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 20, 22, 24, 25, 26, 27, 28], "format": [0, 3, 6, 12, 14, 16, 17, 28], "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20, 22, 24, 25, 26, 28], "base": [0, 2, 3, 5, 6, 7, 8, 10, 11, 12, 14, 15, 16, 17, 20, 27], "keep": [0, 25], "adher": 0, "semant": 0, "version": [0, 1, 11, 18], "make": [0, 1, 2, 12, 28], "pycld2": 0, "fasttext": [0, 8, 27], "librari": [0, 3, 8, 10, 17, 22, 24, 26], "option": [0, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 19, 20, 21, 22, 23, 25, 28], "replac": [0, 24, 26, 28], "langid": [0, 8, 27], "py": [0, 5, 8, 18, 27], "py3langid": [0, 8, 18], "updat": [0, 10, 16], "github": [0, 1, 4, 8, 18], "workflow": [0, 1], "includ": [0, 1, 4, 5, 6, 10, 12, 13, 15, 18, 28], "python": [0, 1, 5, 10, 17, 18, 27, 28], "12": [0, 1, 18, 27], "test": [0, 1, 9, 11, 12], "opusread": 0, "interfac": [0, 11, 27], "us": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 22, 23, 24, 25, 27, 28], "mose": [0, 4, 11, 25], "requir": [0, 5, 8, 10, 11, 15, 28], "opustool": [0, 11, 17, 18, 27], "support": [0, 1, 2, 4, 8, 9, 18, 22, 25], "lingua": [0, 8, 18], "languag": [0, 2, 4, 5, 7, 9, 10, 11, 12, 17, 20, 21, 22, 24, 25, 27, 28], "detect": [0, 8], "http": [0, 8, 18, 27], "com": [0, 4, 8, 18], "helsinki": [0, 27], "nlp": 0, "opusfilt": [0, 1, 2, 5, 16, 18, 20, 25, 27], "pull": [0, 1], "65": 0, "7": 0, "score": [0, 2, 4, 5, 6, 7, 8, 9, 10, 14, 15, 17, 28], "method": [0, 2, 4, 5, 6, 8, 12, 20, 22, 28], 
"sentenceembeddingfilt": [0, 15], "71": 0, "filter": [0, 3, 11, 14, 18, 20, 27, 28], "filterfals": [0, 4, 5, 12, 28], "autogen": [0, 2, 3], "script": [0, 2, 3, 17], "automat": [0, 1, 5, 17, 18], "config": [0, 2, 28], "gener": [0, 3, 5, 10, 11, 12, 15, 17, 20, 28], "score_direct": [0, 5], "accept_threshold": [0, 5], "reject_threshold": [0, 5], "properti": [0, 5], "refactor": 0, "code": [0, 1, 2, 4, 5, 6, 8, 9, 11, 15, 19, 20, 21, 22, 25, 27], "move": 0, "auxiliari": 0, "util": 0, "varikn": [0, 1, 15], "instal": [0, 1, 8, 17, 25], "instruct": [0, 1], "from": [0, 1, 2, 3, 5, 6, 8, 10, 11, 12, 14, 15, 16, 17, 18, 20, 22, 24, 28], "pypi": [0, 18], "xxhash": [0, 11, 12, 18], "instead": [0, 1, 2, 3, 6, 10, 12, 15], "pyhash": 0, "hash": [0, 3, 11, 12], "function": [0, 3, 11, 12, 14, 25, 28], "opu": [0, 11, 17, 18, 25, 27, 28], "fast": [0, 18, 25], "mosestoken": [0, 18, 25], "eflom": [0, 1, 4, 15], "new": [0, 1, 15, 22, 28], "wordalignfilt": [0, 15], "catch": 0, "notimplementederror": 0, "beautifulsoup": 0, "parserrejectedmarkup": 0, "add": [0, 2, 3, 5], "slice": 0, "miss": 0, "enabl": [0, 4, 9, 20], "step": [0, 2, 5, 11, 12, 15, 20, 25, 26, 28], "improv": 0, "import": [0, 5, 20, 22], "slow": [0, 9], "onli": [0, 2, 3, 5, 8, 10, 11, 12, 14, 15, 22, 25, 28], "when": [0, 6, 14, 24, 28], "need": [0, 5, 11, 15, 18, 28], "chunk": [0, 4, 28], "repetitionfilt": 0, "singl": [0, 2, 3, 5, 8, 10, 11, 12, 13, 22, 26], "consist": [0, 26], "threshold": [0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 28], "allow": [0, 2, 7, 8, 15], "float": [0, 2, 8, 16], "averagewordlengthfilt": 0, "unnecessari": 0, "regexpsub": [0, 17], "setuptool": 0, "map_space_to": [0, 25], "jieba": [0, 25], "mecab": [0, 25], "token": [0, 4, 6, 11, 15, 17, 18, 20, 21], "preserv": [0, 10], "exist": [0, 2, 11, 16, 28], "space": [0, 10, 25, 26], "charact": [0, 1, 5, 6, 7, 8, 12, 15, 17, 25, 26], "input": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 21, 22, 25, 28], "parallel": [0, 2, 3, 4, 5, 7, 10, 11, 12, 13, 17, 20, 21, 
22, 25, 27, 28], "process": [0, 4, 9, 11, 12, 13, 15, 17, 20, 27, 28], "preprocess": [0, 10, 11, 17, 20, 25, 28], "re": [0, 24, 26, 28], "organ": 0, "build": 0, "sphinx": 0, "typeerror": 0, "except": [0, 1, 5, 11, 12, 22, 28], "htmltagfilt": 0, "an": [0, 2, 3, 4, 5, 6, 11, 12, 14, 16, 19, 20, 22, 25, 27, 28], "write": [0, 3, 14, 15, 28], "alphabetratiofilt": [0, 2], "regexpfilt": 0, "similarityfilt": 0, "japanes": [0, 18, 25], "word": [0, 4, 6, 7, 10, 12, 15, 17, 19, 23, 27, 28], "segment": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 20, 22, 23, 27, 28], "subword": [0, 6, 11, 15, 19, 23, 27], "bpesegment": [0, 15, 17], "morfessorsegment": [0, 15, 17], "n": [0, 6, 10, 11, 15, 17, 27, 28], "gram": [0, 6, 15, 17, 27], "model": [0, 5, 8, 9, 10, 11, 14, 17, 19, 23, 27], "per": [0, 5, 7, 8, 11, 15, 16, 22, 28], "paramet": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 19, 21, 22, 23, 24, 25, 28], "lengthfilt": [0, 28], "lengthratiofilt": [0, 2, 28], "longwordfilt": 0, "train_alig": [0, 4], "bug": [0, 1], "classifi": [0, 6, 12, 17], "train": [0, 4, 6, 9, 11, 12, 17, 19, 23, 27], "without": [0, 11], "develop": [0, 1, 22], "set": [0, 1, 2, 6, 11, 12, 13, 15, 16, 28], "opusfilterruntimeerror": 0, "have": [0, 1, 5, 6, 7, 8, 10, 11, 12, 14, 16, 18, 28], "e": [0, 1, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 15, 16, 17, 18, 24, 27, 28], "g": [0, 1, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 15, 16, 17, 18, 24, 28], "empti": [0, 2, 3, 4, 6, 11, 12, 15, 25], "data": [0, 2, 3, 4, 5, 6, 10, 12, 14, 15, 17, 22, 27, 28], "save": [0, 2, 5, 14, 15], "creat": [0, 6, 11, 15, 16, 28], "alig": [0, 4, 15, 17], "prior": [0, 4, 15, 23], "repeat": [0, 10], "substr": [0, 10], "preprocessor": [0, 13, 15], "sentenc": [0, 2, 5, 6, 11, 12, 14, 15, 16, 17, 22, 27], "split": [0, 15, 19, 22, 23], "monolingu": [0, 3, 22], "specif": [0, 6, 12, 13, 24], "languageidfilt": [0, 2], "chunksiz": [0, 4, 9, 28], "common": [0, 10, 11, 12, 13, 17, 28], "section": [0, 12, 13, 28], "lmclassifierfilt": 0, "classif": [0, 6, 
14, 27], "workdir": [0, 5], "attribut": [0, 5], "filterabc": [0, 5], "class": [0, 2, 3, 5, 6, 12, 13, 15, 20, 25], "should": [0, 1, 2, 3, 4, 5, 6, 11, 12, 13, 15, 16, 18, 20, 25, 28], "ani": [0, 5, 7, 10, 12, 17, 19, 26, 28], "increas": [0, 2, 28], "default": [0, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 19, 22, 23, 24, 25, 28], "filterpipelin": 0, "10000": [0, 15], "100000": [0, 2, 4, 28], "clean": [0, 2, 5, 6, 14, 28], "up": [0, 1, 6, 15, 28], "chines": [0, 18, 25], "27": 0, "wrong": 0, "keyword": [0, 5], "argument": [0, 2, 3, 5, 12, 20, 24, 28], "name": [0, 2, 3, 5, 11, 12, 13, 14, 15, 28], "duplic": [0, 2, 11, 12], "how": [0, 2], "contribut": [0, 17], "doc": [0, 8], "md": 0, "issu": [0, 1, 2, 11], "21": 0, "panda": [0, 12, 18], "pyyaml": 0, "ruamel": [0, 18], "yaml": [0, 3, 6, 17, 18, 28], "variabl": [0, 5], "configur": [0, 3, 5, 6, 17, 20, 27], "13": 0, "20": [0, 7, 27], "suppress_prompt": [0, 11], "opus_read": [0, 28], "download": [0, 8, 17, 18, 28], "readm": 0, "bibliographi": [0, 27], "refer": [0, 9, 17, 28], "extend": [0, 17], "lingual": [0, 10], "just": [0, 28], "bilingu": [0, 10], "switch": 0, "command": [0, 15, 17], "diagram": 0, "longestcommonsubstringfilt": 0, "latest": [0, 18], "corpu": [0, 2, 3, 9, 11, 12, 15, 17, 22, 27, 28], "releas": [0, 11, 18, 28], "overlap": [0, 3, 12], "remove_dupl": [0, 3], "lower": [0, 4, 5, 6, 7], "crossentropyfilt": [0, 15], "ci": 0, "flake8": [0, 1], "unittest": 0, "behaviour": 0, "simpl": [0, 1, 3, 5, 17, 28], "log": [0, 6, 10, 15], "prevent": [0, 11], "unboundlocalerror": 0, "output": [0, 2, 3, 5, 11, 12, 13, 14, 15, 16, 22, 28], "after": [0, 6, 8], "first": [0, 2, 3, 6, 10, 11, 12, 24, 26, 28], "tag": [0, 6, 10, 15, 28], "question": 1, "report": [1, 27], "featur": [1, 2, 14, 17, 25, 27], "wish": 1, "ar": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 18, 22, 24, 25, 26, 27, 28], "welcom": [1, 17], "page": [1, 2, 27], "we": [1, 2], "also": [1, 2, 3, 5, 7, 8, 11, 16, 18, 20, 24, 28], "happi": 1, 
"consid": [1, 2, 5, 15, 28], "request": [1, 18], "There": [1, 10], "few": [1, 2], "rule": [1, 15], "branch": 1, "master": 1, "least": [1, 10, 12, 15], "3": [1, 4, 10, 15, 18, 27, 28], "8": [1, 18, 28], "pleas": [1, 2, 25, 27], "follow": [1, 2, 4, 5, 8, 10, 28], "pep": 1, "maximum": [1, 5, 6, 7, 10, 14], "line": [1, 11, 12, 14, 16, 17, 22, 28], "length": [1, 5, 6, 10, 12, 17, 25, 28], "127": 1, "79": [1, 27], "especi": [1, 9, 12, 18, 28], "case": [1, 6, 7, 11, 12, 22, 28], "unit": [1, 7, 10, 12, 19, 23, 27, 28], "compat": [1, 4], "can": [1, 2, 3, 5, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 20, 22, 25, 28], "check": [1, 28], "via": [1, 18], "pip": [1, 18], "run": [1, 3, 4, 11, 12, 13, 24], "root": 1, "locat": [1, 2, 5], "directori": [1, 2, 5, 28], "To": 1, "them": [1, 3, 6, 11, 12, 16, 28], "pytest": 1, "nosetest": 1, "work": [1, 2, 10, 12, 15, 18, 28], "you": [1, 2, 3, 4, 5, 6, 10, 11, 12, 15, 16, 18, 20, 22, 24, 25, 27, 28], "skip": [1, 8, 11, 15, 28], "respect": [1, 5, 6, 10, 11, 12, 28], "defin": [1, 2, 3, 5, 11, 12, 13, 14, 15, 20, 22, 28], "9": [1, 10], "10": [1, 5, 6, 15, 27], "11": [1, 27], "larger": [1, 4, 15], "analysi": [1, 17], "tool": [1, 5, 16, 17, 27], "like": [1, 2, 9, 16, 28], "pylint": 1, "fix": 1, "everyth": 1, "note": [1, 2, 6, 11, 12, 15, 18, 28], "current": [1, 2, 8, 9, 18, 20, 28], "yield": [1, 5, 12, 20], "warn": [1, 22], "ad": [2, 3, 14], "determin": 2, "usag": [2, 17], "descript": [2, 21], "h": [2, 27], "textfil": 2, "lang": 2, "langcod": 2, "percentil": 2, "cluster": 2, "sampl": [2, 11], "size": [2, 4, 11, 12, 15, 28], "sample_s": 2, "noisi": [2, 11, 14], "noisy_percentil": 2, "dir": 2, "work_dir": 2, "inter": 2, "inter_dir": 2, "plot": [2, 3], "list": [2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 21, 24, 25, 28], "json": [2, 3, 12, 14, 16, 28], "overwrit": [2, 28], "o": [2, 5, 18, 27], "configfil": 2, "initi": [2, 10, 14], "text": [2, 11, 15, 17, 20, 21, 22, 23, 25, 27, 28], "help": [2, 9, 28], "show": [2, 5], "messag": 2, "exit": 2, 
"": [2, 3, 6, 8, 10, 11, 12, 15, 17, 22, 26, 27, 28], "correspond": [2, 4, 6, 9, 10, 14, 15, 16, 24, 28], "If": [2, 3, 5, 6, 9, 10, 11, 12, 13, 15, 16, 25, 27, 28], "omit": 2, "alphabet": [2, 8], "latin": 2, "characterscorefilt": 2, "int": [2, 16], "max": [2, 10, 14], "number": [2, 3, 5, 9, 10, 11, 12, 13, 15, 16, 18, 22, 25, 27, 28], "pair": [2, 4, 5, 6, 7, 9, 10, 12, 14, 15, 28], "proport": [2, 3, 5, 8, 11], "0": [2, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 19, 23, 24, 27], "001": [2, 15], "k": [2, 9, 11, 27], "try": 2, "too": [2, 15], "much": 2, "2": [2, 4, 6, 7, 10, 11, 12, 15, 23, 27, 28], "sourc": [2, 4, 6, 7, 10, 11, 12, 15, 16, 18, 28], "target": [2, 4, 6, 7, 10, 11, 12, 15, 16, 28], "intermedi": 2, "temporari": [2, 5], "given": [2, 3, 5, 7, 8, 11, 12, 14, 15, 24, 25, 28], "scatter": [2, 3], "histogram": [2, 3], "distribut": [2, 12], "quit": 2, "object": [2, 3, 5, 12, 15, 16, 28], "mai": [2, 4, 5, 9, 10, 11, 12, 15, 24, 27, 28], "dot": [2, 3, 16], "uniqu": 2, "identifi": [2, 6], "order": [2, 7, 8, 10, 11, 15, 24, 25, 28], "multipl": [2, 3, 5, 11, 12, 14, 16, 21, 22, 24, 25, 28], "same": [2, 3, 4, 9, 11, 12, 20, 28], "exampl": [2, 4, 5, 6, 11, 12, 14, 15, 16], "cld2": [2, 8], "id_method": [2, 8], "assum": [2, 5, 28], "each": [2, 5, 6, 10, 11, 12, 13, 14, 15, 16, 21, 22, 25, 28], "independ": 2, "most": [2, 11, 25, 28], "three": [2, 5, 10], "describ": [2, 3], "more": [2, 9, 11, 12, 15, 16, 25, 26, 28], "detail": [2, 4, 15, 19, 23], "below": [2, 5, 6, 7, 9, 10, 12, 13], "howev": 2, "applic": 2, "limit": [2, 8, 10, 15], "implement": [2, 5, 9, 17, 20, 27], "introduc": 2, "aulamo": [2, 11, 17, 27], "et": [2, 6, 9, 10, 11, 15, 17, 19, 23, 27], "al": [2, 6, 9, 10, 11, 15, 17, 19, 23, 27], "2023": [2, 27], "It": [2, 12, 17, 18, 20, 28], "take": [2, 3, 5, 11, 20, 28], "tri": 2, "separ": [2, 6, 7, 10, 11, 12, 15, 16, 19, 23, 24, 28], "nonzeronumeralsfilt": 2, "terminalpunctuationfilt": 2, "expand": [2, 28], "made": [2, 12, 13], "flexibl": 2, "futur": 2, "remov": [2, 3, 8, 
10, 12, 26], "next": 2, "subset": 2, "100k": 2, "produc": [2, 3, 4, 10, 11, 12, 14, 19, 28], "previous": 2, "mention": 2, "These": [2, 5], "mean": [2, 15, 16], "group": 2, "valu": [2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 25, 28], "center": 2, "look": [2, 15, 28], "mani": [2, 12, 15, 17, 19, 28], "figur": 2, "want": [2, 11, 12, 15, 16, 24, 28], "sure": [2, 11, 12, 28], "experiment": 2, "expect": [2, 4, 6, 8, 16, 22], "give": [2, 4, 5, 11, 12, 15, 25], "good": [2, 5, 6, 11], "result": [2, 4, 5, 12], "corpora": [2, 3, 6, 11, 15, 17], "feedback": 2, "apart": [3, 15, 16], "main": [3, 28], "cmd": [3, 28], "packag": [3, 5, 18, 20, 25, 28], "provid": [3, 4, 9, 10, 16, 19, 22, 25, 28], "analyz": [3, 12], "draw": 3, "direct": [3, 4, 14], "acycl": 3, "graph": 3, "graphviz": [3, 18], "rankdir": 3, "tb": 3, "lr": 3, "chang": [3, 6, 15, 18, 25, 28], "left": 3, "right": 3, "top": [3, 12, 13, 28], "bottom": 3, "end": [3, 4, 6, 11, 15, 28], "raw": [3, 11, 17, 28], "otherwis": [3, 10], "render": 3, "indic": [3, 5, 10, 11, 12, 14, 28], "extens": [3, 27], "pdf": [3, 27], "png": 3, "print": 3, "out": [3, 9, 10, 12, 15, 18], "statist": [3, 27], "entri": [3, 5, 7, 14, 20, 27], "either": [3, 5, 7, 10, 11, 12, 15, 28], "one": [3, 5, 6, 10, 12, 13, 15, 16, 22, 25, 26, 28], "calcul": [3, 5, 6, 7, 9, 10, 12], "two": [3, 4, 5, 10, 11, 15, 16, 26, 28], "between": [3, 5, 6, 7, 10, 15, 19], "syntax": [3, 28], "letter": [3, 4, 5], "lowercas": [3, 10, 23], "essenti": 3, "ha": [3, 5, 7, 10, 11, 12, 14, 15, 16, 17, 20, 28], "sever": [3, 11], "subcommand": 3, "column": 3, "basic": [3, 17], "corr": [3, 27], "correl": 3, "matrix": 3, "hist": 3, "amount": [3, 11], "would": [3, 22], "similar": [3, 6, 9, 15, 17], "definit": [3, 28], "which": [3, 7, 11, 12, 14, 15, 20, 28], "second": [3, 12, 26, 28], "For": [3, 4, 5, 6, 8, 10, 11, 12, 15, 16, 18, 28], "dictionari": [3, 5, 6, 8, 12, 13, 14, 15, 16, 21, 24, 25, 28], "total": 3, "In": [3, 4, 6, 7, 8, 10, 12, 25, 27, 28], "addit": [3, 15, 18, 20], 
"possibl": [3, 4, 6, 8, 10, 12, 18, 28], "collect": [3, 11, 12, 17], "similarli": [3, 20], "\u00f6": [4, 15, 27], "stling": [4, 15, 27], "tiedemann": [4, 11, 15, 17, 27], "2016": [4, 8, 15, 19, 27], "src_threshold": 4, "tgt_threshold": 4, "src_token": [4, 15], "null": [4, 6, 8, 9, 11, 12, 15, 16, 19, 22, 25], "tgt_token": [4, 15], "type": [4, 5, 7, 10, 11, 14, 15, 16, 21, 25, 27, 28], "score_for_empti": [4, 6], "100": [4, 7, 10, 28], "A": [4, 6, 8, 11, 24, 27, 28], "accept": [4, 5, 6, 7, 10, 12], "both": [4, 10, 11, 12, 28], "than": [4, 5, 8, 9, 10, 25], "tupl": [4, 5, 24], "contain": [4, 5, 10, 11, 12, 14, 15, 16, 24, 28], "appropri": [4, 25], "en": [4, 8, 28], "english": [4, 27, 28], "1": [4, 5, 7, 8, 9, 10, 11, 12, 14, 15, 19, 27, 28], "ibm1": 4, "hmm": 4, "fertil": 4, "see": [4, 6, 8, 9, 11, 12, 13, 15, 18, 19, 21, 23, 25, 28], "robertostl": [4, 18], "caveat": 4, "stochast": 4, "exactli": [4, 11], "thu": [4, 27], "your": [4, 5, 11, 12, 17, 20, 27], "pipelin": [4, 17], "full": [4, 27], "replic": 4, "moreov": 4, "estim": [4, 15], "even": [4, 11, 20], "consequ": [4, 11], "matter": 4, "onc": [4, 12], "wors": 4, "n_job": [4, 12, 13, 28], "regardless": 4, "other": [4, 11, 15, 25], "own": [5, 17, 20], "modul": [5, 10, 12, 13, 20], "kei": [5, 12, 13, 15, 16, 20, 28], "inherit": [5, 20], "abstract": [5, 20], "thei": [5, 10, 12, 25, 28], "addition": [5, 12, 13], "adjust": 5, "recommend": [5, 6, 11, 12, 28], "iter": [5, 20], "over": [5, 6, 20], "return": [5, 6, 7, 8, 10, 11, 15], "whether": [5, 6, 8, 10], "constant": [5, 24], "depend": [5, 11, 15, 28], "clean_low": 5, "clean_high": 5, "abov": [5, 10, 11, 15, 28], "clean_between": 5, "minimum": [5, 6, 7, 8, 10, 14, 28], "clean_tru": 5, "true": [5, 6, 7, 8, 10, 11, 12, 15, 16], "clean_fals": 5, "fals": [5, 6, 7, 8, 10, 11, 12, 15, 16, 22, 23], "__init__": 5, "arbitrari": [5, 10, 24], "kwarg": 5, "call": 5, "remain": [5, 28], "reserv": 5, "non": [5, 6, 10, 12, 19, 22], "temprari": 5, "compabl": 5, "global": [5, 28], "forc": 
[5, 15, 16, 28], "reject": [5, 10, 12], "That": 5, "sensibl": 5, "alwai": [5, 6, 7, 11, 22], "upper": [5, 6], "min_threshold": 5, "max_threshold": 5, "min_length": [5, 7, 10, 28], "max_length": [5, 7, 10, 28], "decis": [5, 11], "redefin": 5, "reason": [5, 15], "uppercas": 5, "less": 5, "50": [5, 6], "uppercasefilt": 5, "6": [5, 11], "def": 5, "self": 5, "5": [5, 6, 9, 10, 27], "super": 5, "uppercase_ratio": 5, "len": 5, "sum": [5, 6], "char": [5, 7, 10, 15], "sent": 5, "isupp": 5, "ratio": [5, 7, 8, 10, 28], "customfilt": 5, "eviron": 5, "pythonpath": 5, "environ": 5, "select": [5, 6, 12, 14, 16, 17, 27], "extern": [5, 16], "resourc": [5, 27], "store": [5, 11, 12, 28], "itself": 5, "path": [5, 8, 28], "join": [5, 11], "rel": [5, 10], "probabl": [6, 14, 19], "lm_param": 6, "score_typ": 6, "cross": 6, "entropi": 6, "perplixti": 6, "perplex": 6, "neg": [6, 8, 14], "logprob": 6, "low_threshold": 6, "diff_threshold": 6, "absolut": [6, 7, 10, 15], "differ": [6, 7, 8, 9, 10, 11, 12, 14, 28], "manual": [6, 12, 28], "filenam": [6, 15], "arpa": [6, 15], "lm": [6, 15], "binari": [6, 15, 23], "unk": 6, "unknown": 6, "symbol": [6, 15], "sensit": 6, "include_unk": 6, "cc": [6, 8], "context": 6, "cue": 6, "ignor": 6, "mb": [6, 15], "morph": 6, "boundari": [6, 14, 15], "mark": [6, 10, 15], "wb": [6, 15], "w": [6, 15], "init_hist": 6, "interpol": 6, "weight": [6, 10, 15], "train_ngram": 6, "match": [6, 10, 12, 15, 19, 22, 25, 26], "do": [6, 10, 12, 15, 16, 22], "unless": [6, 12], "know": 6, "what": [6, 8], "moor": [6, 27], "lewi": [6, 27], "2010": [6, 27], "id_lm_param": 6, "domain": 6, "nd_lm_param": 6, "content": [6, 10, 11], "naiv": 6, "bay": 6, "label": [6, 14], "map": [6, 28], "relative_scor": 6, "normal": [6, 9, 10, 26], "largest": 6, "likelihood": 6, "divid": [6, 7, 14, 15], "get": [6, 15, 16], "custom": [6, 12, 13, 17, 22, 28], "vatanen": [6, 27], "discount": [6, 15], "4": [6, 15, 24, 27], "so": [6, 12, 15, 28], "idea": 6, "small": [6, 9, 15], "unigram": 6, "background": 6, 
"coeffici": 6, "found": [6, 10, 24], "example_config": 6, "qed_lm_langid": 6, "whitespac": [7, 8, 10, 26], "pass_empti": 7, "zero": [7, 10], "nth": 7, "appli": [7, 12, 13, 24], "higher": [7, 14, 15], "infin": 7, "averag": [7, 9], "40": 7, "longest": [7, 10], "across": 7, "75": 8, "exclude_whitespac": 8, "exclud": 8, "equal": [8, 10, 11, 15], "greater": [8, 10], "valid": [8, 14, 28], "www": [8, 27], "regular": [8, 10, 15, 24], "express": [8, 10, 15, 24, 28], "info": 8, "unicod": [8, 16, 26], "html": [8, 10], "confid": 8, "iso639": 8, "indentif": 8, "fasttext_model_path": 8, "langid_languag": 8, "cld2_option": 8, "lingua_mod": 8, "string": [8, 10, 11, 12, 15, 16, 25, 28], "specifi": [8, 11, 14, 15], "high": [8, 14, 15], "low": [8, 10, 14, 27], "accuraci": [8, 15], "mode": 8, "rang": 8, "cite": [8, 17], "lui": [8, 27], "baldwin": [8, 27], "2012": [8, 11, 17, 27], "adbar": 8, "pemistahl": 8, "cld2owner": 8, "joulin": [8, 27], "etal": [8, 27], "2017": [8, 27], "bag": [8, 27], "pretrain": 8, "nn_model": 9, "nearest": [9, 15], "neighbor": [9, 15], "time": [9, 12, 15, 28], "200": 9, "multilingu": [9, 17, 27], "laser": 9, "propos": 9, "artetx": [9, 27], "schwenk": [9, 27], "2018": [9, 27], "chaudhari": [9, 27], "2019": [9, 10, 27], "cosin": [9, 15], "train_nearest_neighbor": 9, "With": [9, 28], "closer": 9, "suitabl": [9, 15], "enough": 9, "gpu": 9, "comput": [9, 15, 27], "pytorch": [9, 18], "laserembed": [9, 18], "boolean": 10, "none": [10, 12, 15], "penalti": 10, "co": 10, "occurr": 10, "thermin": 10, "punctuat": 10, "v": [10, 27], "\u00e1": [10, 27], "zquez": [10, 27], "formul": 10, "termin": 10, "count": [10, 12, 24], "increment": 10, "beyond": 10, "occur": [10, 12], "final": [10, 12, 19, 27, 28], "greatest": 10, "smaller": [10, 15], "measur": 10, "numer": [10, 12, 16], "extract": 10, "sequenc": [10, 24, 26], "sequencematch": 10, "difflib": 10, "require_al": 10, "reach": 10, "c": [10, 27], "pairwis": 10, "shorter": 10, "compar": [10, 11, 12, 16], "levenshtein": 10, 
"distanc": [10, 15], "integ": [10, 11, 12, 16], "cost": [10, 15, 28], "edit": 10, "oper": [10, 11, 16, 17, 19], "insert": [10, 16], "delet": 10, "substitut": [10, 24, 28], "qualiti": 10, "nmt": [10, 15, 19], "repetit": 10, "activ": 10, "were": [10, 28], "longer": [10, 15], "cannot": [10, 12, 16, 28], "start": [10, 11, 15, 24], "regexp": 10, "accept_match": 10, "experss": 10, "regex": [10, 18, 19], "read": [11, 16, 28], "2020": [11, 17, 27], "corpus_nam": [11, 28], "source_languag": [11, 28], "target_languag": [11, 28], "untoken": 11, "xml": 11, "src_output": [11, 28], "tgt_output": [11, 28], "prompt": 11, "user": [11, 28], "confirm": 11, "befor": [11, 25], "avail": [11, 12, 13, 18, 25], "those": [11, 12, 15], "equival": 11, "some": 11, "significantli": 11, "reduc": [11, 12], "url": [11, 27], "last": [11, 15, 28], "memori": [11, 12, 17, 28], "index": [11, 20, 24], "stop": [11, 15, 28], "until": 11, "part": [11, 16], "approxim": 11, "fraction": 11, "outputs_2": 11, "rest": 11, "divisor": 11, "modulo": 11, "algorithm": [11, 12, 15, 22], "xxh64": [11, 12], "seed": [11, 15], "condit": 11, "where": [11, 28], "hold": 11, "written": [11, 12, 14, 17, 28], "doe": 11, "random": [11, 15], "benefit": 11, "approach": 11, "fulli": 11, "determinist": 11, "ident": 11, "goe": 11, "origin": [11, 25], "downsid": 11, "care": [11, 12], "consecut": 11, "themselv": 11, "unexpect": 11, "distinct": 11, "prime": 11, "choic": [11, 28], "ensur": 11, "shuffle_subset": 11, "shuffl": 11, "cartesian": 11, "skip_empti": 11, "skip_dupl": 11, "item": [11, 14, 16], "combin": [11, 12, 13, 17], "altern": 11, "translat": [11, 27], "meaning": 11, "variat": 11, "segmenat": 11, "b": [11, 27], "m": [11, 18, 27], "x": [11, 16, 27, 28], "style": 11, "tab": 11, "convert": [11, 25], "alreadi": [11, 28], "mostli": [11, 12, 13, 15, 22], "comparison": 12, "crawl": 12, "web": 12, "been": [12, 14, 17, 18, 28], "exact": 12, "cryptograph": 12, "consumpt": 12, "veri": [12, 15, 28], "larg": [12, 15, 17], "concaten": [12, 
28], "togeth": 12, "64": 12, "bit": [12, 28], "fine": 12, "practic": 12, "about": 12, "extra": [12, 18, 25], "collis": 12, "disabl": [12, 22, 25], "sub": [12, 13, 15, 24], "job": [12, 13, 28], "default_n_job": [12, 13, 28], "pass": [12, 15, 25], "repres": [12, 13, 15], "level": [12, 13, 28], "typic": [12, 13, 15], "lenghtfilt": 12, "special": [12, 13, 17, 22], "under": [12, 13, 16, 28], "readi": [12, 13], "effect": [12, 27], "opposit": 12, "manner": [12, 28], "simpli": 12, "its": [12, 25, 28], "anoth": [12, 28], "instanc": 12, "easi": 12, "load": 12, "datafram": 12, "json_norm": 12, "whitespacenorm": [13, 17], "sklearn": 14, "training_scor": 14, "criterion": 14, "optim": [14, 15], "ce": 14, "roc_auc": 14, "sse": 14, "aic": 14, "bic": 14, "dev_scor": 14, "model_typ": 14, "logisticregress": 14, "model_paramet": 14, "examplefilt": 14, "quantil": 14, "min": 14, "init": 14, "posit": 14, "achiev": 14, "highest": 14, "assign": 14, "output_prob": 14, "cleanest": 14, "noisiest": 14, "output_label": 14, "varigram": 15, "siivola": [15, 27], "2007": [15, 27], "crossentropydifferencefilt": 15, "optdata": 15, "leav": 15, "norder": 15, "dscale": 15, "scale": 15, "factor": 15, "dscale2": 15, "dure": 15, "prune": [15, 27], "use_3nz": 15, "kneser": 15, "nei": 15, "smooth": [15, 27], "cutoff": 15, "intern": [15, 27], "unsur": 15, "balanc": 15, "thumb": 15, "doubl": 15, "bpe": [15, 19], "morfessor": [15, 18, 23, 27], "latter": [15, 28], "prefix": [15, 22], "postfix": 15, "preceed": 15, "break": [15, 22], "tation": 15, "ation": 15, "src_data": 15, "tgt_data": 15, "unsupervis": [15, 27], "search": 15, "embed": [15, 17, 27], "n_neighbor": 15, "neightbor": 15, "queri": 15, "brute": 15, "metric": 15, "wrapper": [15, 18], "scikit": [15, 18], "learn": [15, 18, 19], "nearestneighbor": 15, "inform": 15, "proper": 15, "caution": 15, "sennrich": [15, 19, 27], "min_frequ": 15, "frequenc": 15, "num_work": 15, "processor": 15, "multiprocess": 15, "cpu_count": 15, "virpioja": [15, 23, 27], "2013": 
[15, 23, 27], "corpusweight": 15, "dampen": 15, "logarithm": 15, "ones": 15, "use_skip": 15, "frequent": 15, "seen": 15, "compound": 15, "speed": [15, 28], "forcesplit_list": 15, "nosplit_r": 15, "surround": 15, "deeper": 16, "hierarch": 16, "y": 16, "overwritten": [16, 28], "myscor": 16, "src": 16, "tgt": 16, "jsonl": 16, "gz": [16, 28], "old": 16, "now": 16, "revers": 16, "descend": 16, "field": 16, "convers": 16, "str": 16, "interpret": 16, "plain": 16, "access": 16, "effici": [17, 27], "identif": [17, 27], "present": 17, "acl": [17, 27], "system": [17, 27], "demonstr": [17, 27], "align": [17, 27], "detoken": 17, "monolingualsentencesplitt": 17, "changelog": 17, "setup": [18, 28], "window": 18, "On": [18, 27], "linux": 18, "maco": 18, "directli": 18, "beautifulsoup4": 18, "matplotlib": 18, "rapidfuzz": 18, "splitter": [18, 22], "subword_nmt": 18, "tqdm": 18, "detector": 18, "box": 18, "due": 18, "lack": 18, "newer": 18, "yannvgn": 18, "prebuild": 18, "vsiivola": 18, "elfom": 18, "cython": 18, "merg": 19, "vocab": 19, "vocabulari": 19, "revert": 19, "oov": 19, "glossari": 19, "affect": 19, "dropout": 19, "train_bp": 19, "preprocessorabc": 20, "modifi": 20, "f_idx": 20, "being": 20, "vari": 20, "tokenzi": [21, 25], "non_breaking_prefix_fil": 22, "overrid": [22, 28], "enable_parallel": 22, "rais": [22, 28], "heurist": 22, "philipp": [22, 27], "koehn": [22, 27], "josh": 22, "schroeder": 22, "europarl": [22, 27], "2005": [22, 27], "european": [22, 27], "intend": [22, 28], "becaus": 22, "viterbi_max_len": 23, "30": [23, 27], "viterbi_smooth": 23, "train_morfessor": 23, "pattern": 24, "lang_pattern": 24, "flag": 24, "compil": 24, "fork": 25, "avali": 25, "zh": 25, "zh_cn": 25, "track": 25, "cut": 25, "jp": 25, "By": [25, 28], "unid": 25, "lite": 25, "mecab_arg": 25, "again": [25, 28], "standard": [26, 28], "lead": 26, "trail": 26, "research": 27, "our": 27, "paper": 27, "inproceed": 27, "titl": 27, "pu": 27, "f": 27, "ilter": 27, "toolbox": 27, "author": 27, "mikko": 
27, "sami": 27, "j": 27, "rg": 27, "booktitl": 27, "proceed": 27, "58th": 27, "annual": 27, "meet": 27, "associ": 27, "linguist": 27, "month": 27, "jul": 27, "year": 27, "publish": 27, "aclweb": 27, "org": 27, "anthologi": 27, "demo": 27, "doi": 27, "18653": 27, "v1": 27, "150": 27, "156": 27, "margin": 27, "mikel": 27, "holger": 27, "mine": 27, "arxiv": 27, "ab": 27, "1811": 27, "01136": 27, "ona": 27, "de": 27, "gibert": 27, "24th": 27, "confer": 27, "machin": 27, "31": 27, "38": 27, "tamper": 27, "finland": 27, "june": 27, "aclanthologi": 27, "eamt": 27, "umut": 27, "sulubacak": 27, "t": 27, "ool": 27, "diagnost": 27, "12th": 27, "evalu": 27, "3782": 27, "3789": 27, "marseil": 27, "franc": 27, "lrec": 27, "467": 27, "juli": 27, "vishrav": 27, "yuqe": 27, "tang": 27, "francisco": 27, "guzm": 27, "fourth": 27, "volum": 27, "share": 27, "task": 27, "dai": 27, "261": 27, "266": 27, "florenc": 27, "itali": 27, "august": 27, "w19": 27, "5435": 27, "armand": 27, "edouard": 27, "grave": 27, "piotr": 27, "bojanowski": 27, "matthij": 27, "douz": 27, "herv": 27, "\u00e9": 27, "gou": 27, "tom": 27, "mikolov": 27, "zip": 27, "compress": [27, 28], "1612": 27, "03651": 27, "toma": 27, "trick": 27, "15th": 27, "uropean": 27, "chapter": 27, "short": 27, "427": 27, "431": 27, "valencia": 27, "spain": 27, "april": 27, "e17": 27, "2068": 27, "uroparl": 27, "summit": 27, "86": 27, "phuket": 27, "thailand": 27, "septemb": 27, "mtsummit": 27, "marco": 27, "timothi": 27, "off": 27, "shelf": 27, "25": 27, "jeju": 27, "island": 27, "korea": 27, "p12": 27, "3005": 27, "intellig": 27, "robert": 27, "william": 27, "220": 27, "224": 27, "uppsala": 27, "sweden": 27, "p10": 27, "2041": 27, "neural": 27, "rico": 27, "barri": 27, "haddow": 27, "alexandra": 27, "birch": 27, "rare": 27, "54th": 27, "long": 27, "1715": 27, "1725": 27, "berlin": 27, "germani": 27, "p16": 27, "1162": 27, "grow": 27, "vesa": 27, "teemu": 27, "hirsim\u00e4ki": 27, "neser": 27, "ei": 27, "ieee": 27, "transact": 27, 
"audio": 27, "speech": 27, "15": 27, "1617": 27, "1624": 27, "1109": 27, "tasl": 27, "896666": 27, "eighth": 27, "2214": 27, "2218": 27, "istanbul": 27, "turkei": 27, "elra": 27, "conf": 27, "lrec2012": 27, "463_paper": 27, "tommi": 27, "jaakko": 27, "\u00e4": 27, "yrynen": 27, "nicoletta": 27, "calzolari": 27, "khalid": 27, "choukri": 27, "bent": 27, "maegaard": 27, "joseph": 27, "mariani": 27, "jan": 27, "odjik": 27, "stelio": 27, "piperidi": 27, "mike": 27, "rosner": 27, "daniel": 27, "tapia": 27, "editor": 27, "seventh": 27, "peter": 27, "smit": 27, "stig": 27, "arn": 27, "gr": 27, "nroo": 27, "kurimo": 27, "orfessor": 27, "aselin": 27, "aalto": 27, "univers": 27, "public": 27, "seri": 27, "scienc": 27, "technologi": 27, "depart": 27, "signal": 27, "acoust": 27, "vazquez": 27, "ra": 27, "\u00fa": 27, "l": 27, "u": 27, "nivers": 27, "elsinki": 27, "submiss": 27, "wmt": [27, 28], "19": 27, "294": 27, "300": 27, "5441": 27, "ostl": 27, "arkov": 27, "hain": 27, "ont": 27, "arlo": 27, "pragu": 27, "bulletin": 27, "mathemat": 27, "106": 27, "125": 27, "146": 27, "octob": 27, "ufal": 27, "mff": 27, "cuni": 27, "cz": 27, "pbml": 27, "art": 27, "here": [27, 28], "articl": 27, "journal": 27, "eprinttyp": 27, "eprint": 27, "timestamp": 27, "22": 27, "nov": 27, "17": 27, "58": 27, "0100": 27, "biburl": 27, "dblp": 27, "rec": 27, "bib": 27, "bibsourc": 27, "2020a": 27, "address": 27, "isbn": 27, "979": 27, "95546": 27, "34": 27, "2020b": 27, "jun": 27, "aug": 27, "archiveprefix": 27, "mon": 27, "28": 27, "dec": 27, "02": 27, "joulingbdjm16": 27, "apr": 27, "sep": 27, "\u00f6stling": 27, "owner": 27, "08": 27, "26": 27, "hirsim": 27, "aki": 27, "v\u00e1zquez": 27, "techreport": 27, "institut": 27, "eng": 27, "At": 28, "point": 28, "output_directori": 28, "thing": 28, "finnish": 28, "paracrawl": 28, "v4": 28, "fi": 28, "utf": 28, "gzip": 28, "bz2": 28, "bzip2": 28, "complex": 28, "v2019": 28, "node": 28, "anchor": 28, "previou": 28, "paracrawl_filt": 28, "myfilt": 28, 
"wmt_filter": 28, "bitext": 28, "lot": 28, "while": 28, "programmat": 28, "coupl": 28, "var": 28, "scope": 28, "kind": 28, "varstr": 28, "within": 28, "l1": 28, "l2": 28, "txt": 28, "templat": 28, "quot": 28, "loader": 28, "insid": 28, "brace": 28, "individu": 28, "local": 28, "place": 28, "file1": 28, "file2": 28, "sv": 28, "substep": 28, "exploit": 28, "wihtout": 28, "complet": 28, "conveni": 28, "outputdir": 28, "outdir": 28, "wai": 28, "former": 28, "perform": 28, "difficult": 28, "pars": 28, "notat": 28, "dash": 28, "underscor": 28, "still": 28, "shown": 28, "easier": 28}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"changelog": 0, "unreleas": 0, "3": 0, "2": 0, "0": 0, "2024": 0, "08": 0, "14": 0, "chang": 0, "fix": 0, "1": 0, "06": 0, "05": 0, "ad": 0, "remov": 0, "2023": 0, "10": 0, "11": 0, "6": 0, "2022": 0, "30": 0, "5": 0, "09": 0, "28": 0, "4": 0, "04": 0, "01": 0, "18": 0, "2021": 0, "23": 0, "19": 0, "31": 0, "2020": 0, "25": 0, "contribut": 1, "automat": 2, "configur": [2, 28], "gener": 2, "unsupervis": 2, "threshold": 2, "select": [2, 11], "filter": [2, 4, 5, 6, 7, 8, 9, 10, 12, 17], "command": [3, 28], "line": 3, "tool": [3, 18], "analysi": 3, "opusfilt": [3, 17, 28], "diagram": 3, "duplic": 3, "score": [3, 12, 16], "test": 3, "align": [4, 15, 18], "model": [4, 6, 15, 18], "wordalignfilt": 4, "custom": [5, 20], "languag": [6, 8, 15, 18], "crossentropyfilt": 6, "crossentropydifferencefilt": 6, "lmclassifierfilt": 6, "length": 7, "lengthfilt": 7, "lengthratiofilt": 7, "averagewordlengthfilt": 7, "longwordfilt": 7, "script": [8, 28], "identif": [8, 18], "alphabetratiofilt": 8, "characterscorefilt": 8, "languageidfilt": 8, "sentenc": [9, 18], "embed": [9, 18], "sentenceembeddingfilt": 9, "special": 10, "charact": 10, "similar": 10, "htmltagfilt": 10, "terminalpunctuationfilt": 10, "nonzeronumeralsfilt": 10, "longestcommonsubstringfilt": 10, "similarityfilt": 10, "repetitionfilt": 10, "regexpfilt": 10, "download": 11, "data": 11, 
"opus_read": 11, "concaten": 11, "head": 11, "tail": 11, "slice": 11, "split": 11, "subset": 11, "product": 11, "unzip": 11, "write": 11, "remove_dupl": 12, "preprocess": 13, "text": 13, "train": [14, 15], "us": [14, 16], "classifi": 14, "train_classifi": 14, "train_ngram": 15, "train_alig": 15, "train_nearest_neighbor": 15, "train_bp": 15, "train_morfessor": 15, "file": 16, "join": 16, "sort": 16, "get": 17, "start": 17, "avail": 17, "function": 17, "preprocessor": [17, 20], "other": 17, "inform": 17, "instal": 18, "requir": 18, "librari": 18, "option": 18, "fasttext": 18, "pycld2": 18, "jieba": 18, "mecab": 18, "word": 18, "segment": 18, "laser": 18, "varikn": 18, "n": 18, "gram": 18, "eflom": 18, "bpesegment": 19, "detoken": 21, "monolingualsentencesplitt": 22, "morfessorsegment": 23, "regexpsub": 24, "token": 25, "whitespacenorm": 26, "cite": 27, "refer": 27, "bibtex": 27, "basic": 28, "usag": 28, "exampl": 28, "variabl": 28, "constant": 28, "run": 28, "singl": 28}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinxcontrib.bibtex": 9, "sphinx": 58}, "alltitles": {"Changelog": [[0, "changelog"]], "Unreleased": [[0, "unreleased"]], "3.2.0 - 2024-08-14": [[0, "id1"]], "Changed": [[0, "changed"], [0, "id6"], [0, "id11"], [0, "id17"], [0, "id21"], [0, "id26"], [0, "id32"], [0, "id35"], [0, "id38"]], "Fixed": [[0, "fixed"], [0, "id3"], [0, "id8"], [0, "id12"], [0, "id14"], [0, "id18"], [0, "id23"], [0, "id30"], [0, "id33"], [0, "id40"], [0, "id43"]], "3.1.0 - 2024-06-05": [[0, "id2"]], "Added": [[0, "added"], [0, "id5"], [0, "id10"], [0, "id16"], [0, "id20"], [0, "id25"], [0, "id28"], [0, "id36"], [0, "id39"], [0, "id42"]], "Removed": [[0, "removed"], [0, "id7"]], "3.0.0 - 2023-10-11": [[0, "id4"]], "2.6.0 - 2022-11-30": [[0, 
"id9"]], "2.5.1 - 2022-09-28": [[0, "id13"]], "2.5.0 - 2022-09-28": [[0, "id15"]], "2.4.0 - 2022-04-05": [[0, "id19"]], "2.3.1 - 2022-01-28": [[0, "id22"]], "2.3.0 - 2022-01-18": [[0, "id24"]], "2.2.0 - 2021-11-23": [[0, "id27"]], "2.1.2 - 2021-11-11": [[0, "id29"]], "2.1.1 - 2021-10-19": [[0, "id31"]], "2.1.0 - 2021-08-31": [[0, "id34"]], "2.0.0 - 2021-06-01": [[0, "id37"]], "1.0.1 - 2020-05-25": [[0, "id41"]], "1.0.0 - 2020-04-10": [[0, "id44"]], "Contributing": [[1, "contributing"]], "Automatic configuration generation": [[2, "automatic-configuration-generation"]], "Unsupervised threshold selection for filters": [[2, "unsupervised-threshold-selection-for-filters"]], "Command line tools for analysis": [[3, "command-line-tools-for-analysis"]], "opusfilter-diagram": [[3, "opusfilter-diagram"]], "opusfilter-duplicates": [[3, "opusfilter-duplicates"]], "opusfilter-scores": [[3, "opusfilter-scores"]], "opusfilter-test": [[3, "opusfilter-test"]], "Alignment model filters": [[4, "alignment-model-filters"]], "WordAlignFilter": [[4, "wordalignfilter"]], "Custom filters": [[5, "custom-filters"]], "Language model filters": [[6, "language-model-filters"]], "CrossEntropyFilter": [[6, "crossentropyfilter"]], "CrossEntropyDifferenceFilter": [[6, "crossentropydifferencefilter"]], "LMClassifierFilter": [[6, "lmclassifierfilter"]], "Length filters": [[7, "length-filters"]], "LengthFilter": [[7, "lengthfilter"]], "LengthRatioFilter": [[7, "lengthratiofilter"]], "AverageWordLengthFilter": [[7, "averagewordlengthfilter"]], "LongWordFilter": [[7, "longwordfilter"]], "Script and language identification filters": [[8, "script-and-language-identification-filters"]], "AlphabetRatioFilter": [[8, "alphabetratiofilter"]], "CharacterScoreFilter": [[8, "characterscorefilter"]], "LanguageIDFilter": [[8, "languageidfilter"]], "Sentence embedding filters": [[9, "sentence-embedding-filters"]], "SentenceEmbeddingFilter": [[9, "sentenceembeddingfilter"]], "Special character and similarity filters": 
[[10, "special-character-and-similarity-filters"]], "HtmlTagFilter": [[10, "htmltagfilter"]], "TerminalPunctuationFilter": [[10, "terminalpunctuationfilter"]], "NonZeroNumeralsFilter": [[10, "nonzeronumeralsfilter"]], "LongestCommonSubstringFilter": [[10, "longestcommonsubstringfilter"]], "SimilarityFilter": [[10, "similarityfilter"]], "RepetitionFilter": [[10, "repetitionfilter"]], "RegExpFilter": [[10, "regexpfilter"]], "Downloading and selecting data": [[11, "downloading-and-selecting-data"]], "opus_read": [[11, "opus-read"]], "concatenate": [[11, "concatenate"]], "download": [[11, "download"]], "head": [[11, "head"]], "tail": [[11, "tail"]], "slice": [[11, "slice"]], "split": [[11, "split"]], "subset": [[11, "subset"]], "product": [[11, "product"]], "unzip": [[11, "unzip"]], "write": [[11, "write"]], "Filtering and scoring": [[12, "filtering-and-scoring"]], "remove_duplicates": [[12, "remove-duplicates"]], "filter": [[12, "filter"]], "score": [[12, "score"]], "Preprocessing text": [[13, "preprocessing-text"]], "preprocess": [[13, "preprocess"]], "Training and using classifiers": [[14, "training-and-using-classifiers"]], "train_classifier": [[14, "train-classifier"]], "classify": [[14, "classify"]], "Training language and alignment models": [[15, "training-language-and-alignment-models"]], "train_ngram": [[15, "train-ngram"]], "train_aligment": [[15, "train-aligment"]], "train_nearest_neighbors": [[15, "train-nearest-neighbors"]], "train_bpe": [[15, "train-bpe"]], "train_morfessor": [[15, "train-morfessor"]], "Using score files": [[16, "using-score-files"]], "join": [[16, "join"]], "sort": [[16, "sort"]], "OpusFilter": [[17, "opusfilter"]], "Get started": [[17, null]], "Available functions": [[17, null]], "Available filters": [[17, null]], "Available preprocessors": [[17, null]], "Other information": [[17, null]], "Installation": [[18, "installation"]], "Required libraries": [[18, "required-libraries"]], "Optional libraries and tools": [[18, 
"optional-libraries-and-tools"]], "FastText and PyCLD2 language identification": [[18, "fasttext-and-pycld2-language-identification"]], "Jieba and MeCab word segmentation": [[18, "jieba-and-mecab-word-segmentation"]], "LASER sentence embeddings": [[18, "laser-sentence-embeddings"]], "VariKN n-gram models": [[18, "varikn-n-gram-models"]], "Eflomal word alignment": [[18, "eflomal-word-alignment"]], "BPESegmentation": [[19, "bpesegmentation"]], "Custom preprocessors": [[20, "custom-preprocessors"]], "Detokenizer": [[21, "detokenizer"]], "MonolingualSentenceSplitter": [[22, "monolingualsentencesplitter"]], "MorfessorSegmentation": [[23, "morfessorsegmentation"]], "RegExpSub": [[24, "regexpsub"]], "Tokenizer": [[25, "tokenizer"]], "WhitespaceNormalizer": [[26, "whitespacenormalizer"]], "Citing and references": [[27, "citing-and-references"]], "Citing": [[27, "citing"]], "References": [[27, "references"]], "References as BibTeX": [[27, "references-as-bibtex"]], "Basic usage": [[28, "basic-usage"]], "opusfilter script": [[28, "opusfilter-script"]], "Configuration examples": [[28, "configuration-examples"]], "Variables and constants": [[28, "variables-and-constants"]], "Running a single command": [[28, "running-a-single-command"]]}, "indexentries": {}}) \ No newline at end of file diff --git a/usage.html b/usage.html index b0af93d..01479a4 100644 --- a/usage.html +++ b/usage.html @@ -4,7 +4,7 @@ - Basic usage — OpusFilter 3.1.0 documentation + Basic usage — OpusFilter 3.2.0 documentation @@ -15,7 +15,7 @@ - + @@ -37,7 +37,7 @@ OpusFilter
                          - 3.1 + 3.2