From 5fc495bddf8e6bda8b1c0d1c15dc00d0e2a7024a Mon Sep 17 00:00:00 2001 From: Tim Schopf Date: Mon, 29 Apr 2024 10:50:21 +0200 Subject: [PATCH] fix docs Signed-off-by: Tim Schopf --- .readthedocs.yaml | 3 +-- README.md | 14 +++++++------- docs/requirements.txt | 1 + 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index e5df02f..8b07701 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -18,7 +18,6 @@ formats: all python: install: - requirements: docs/requirements.txt - - requirements: requirements.txt - method: pip path: . extra_requirements: @@ -27,7 +26,7 @@ python: build: os: ubuntu-22.04 tools: - python: "3.7" + python: "3.8" submodules: include: all diff --git a/README.md b/README.md index 21dd684..3dd9aae 100644 --- a/README.md +++ b/README.md @@ -127,9 +127,12 @@ vectorizer = KeyphraseCountVectorizer() # Print parameters print(vectorizer.get_params()) +``` +```plaintext >>> {'binary': False, 'dtype': <class 'numpy.int64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '<J.*>*<N.*>+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1} ``` + By default, the vectorizer is initialized for the English language. That means, an English `spacy_pipeline` is specified, English `stop_words` are removed, and the `pos_pattern` extracts keywords that have 0 or more adjectives, followed by 1 or more nouns using the English spaCy part-of-speech tags. 
In addition, the spaCy pipeline @@ -255,14 +258,11 @@ vectorizer = KeyphraseTfidfVectorizer() # Print parameters print(vectorizer.get_params()) ->>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': < - - -class 'numpy.int64'>, 'lowercase': True, 'max_df': None - -, 'min_df': None, 'pos_pattern': '<J.*>*<N.*>+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner', - 'textcat'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1} ``` +```plaintext +>>> {'binary': False, 'custom_pos_tagger': None, 'decay': None, 'delete_min_df': None, 'dtype': <class 'numpy.int64'>, 'lowercase': True, 'max_df': None, 'min_df': None, 'pos_pattern': '<J.*>*<N.*>+', 'spacy_exclude': ['parser', 'attribute_ruler', 'lemmatizer', 'ner','textcat'], 'spacy_pipeline': 'en_core_web_sm', 'stop_words': 'english', 'workers': 1} +``` + To calculate tf values instead, set `use_idf=False`. diff --git a/docs/requirements.txt b/docs/requirements.txt index 8a3ea7e..0186de7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -14,6 +14,7 @@ docutils>=0.16 numpy>=1.18.5 spacy>=3.0.1 spacy-transformers>=1.1.6 +spacy-curated-transformers>=0.2.2 nltk>=3.6.1 scikit-learn>=1.0 scipy>=1.7.3