Merge pull request #110 from AnFreTh/main
Releasing version 0.2.0
AnFreTh authored Jan 4, 2025
2 parents dc2d4db + 50da9a3 commit bc3ddfb
Showing 10 changed files with 119 additions and 79 deletions.
24 changes: 15 additions & 9 deletions README.md
@@ -73,22 +73,28 @@ You can install STREAM directly from PyPI or from the GitHub repository:

1. **PyPI (Recommended)**:
```bash
pip install stream_topic
pip install stream-topic
```

2. **GitHub**:
```bash
pip install git+https://github.com/AnFreTh/STREAM.git
```

3. **Download NLTK Resources**:
Ensure you have the necessary NLTK resources installed:
```python
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
3. **Install requirements for add-ons**:
To use STREAM's visualizations, simply run:
```bash
pip install stream-topic[plotting]
```

For BERTopic, run:
```bash
pip install stream-topic[hdbscan]
```

For DCTE:
```bash
pip install stream-topic[dcte]
```

# 📦 Available Models
42 changes: 14 additions & 28 deletions requirements.txt
@@ -1,42 +1,28 @@
# basics
numpy<=1.26.4
numpy
pandas
pyarrow
scikit-learn==1.1.0
scipy==1.10.1
scikit-learn
nltk
datasets

# dl
lightning==2.3.3
torch==2.4.0
lightning
torch
sentence_transformers

# nlp
transformers==4.40.2
setfit==1.0.3
gensim==4.2.0
umap-learn==0.5.6
wordcloud==1.9.3
transformers
gensim
umap-learn


community
networkx==3.3
networkx
python_louvain
langdetect
hdbscan==0.8.37
huggingface_hub==0.23.5
# nltk
# datasets==2.20.0
# sentence-transformers==3.0.1


# plotting
dash
plotly
matplotlib

# misc
loguru
ipywidgets
ipykernel<6.22.0
# tqdm
# pre-commit
optuna==3.6.1
optuna-integration==3.6.0
optuna
optuna-integration
36 changes: 33 additions & 3 deletions setup.py
@@ -2,8 +2,8 @@
# -*- coding: utf-8 -*-
import os
from pathlib import Path

from setuptools import find_packages, setup
from setuptools.command.install import install

# Package meta-data.
NAME = "stream_topic"
@@ -12,7 +12,27 @@
DOCS = "https://stream.readthedocs.io/en/"
EMAIL = "[email protected]"
AUTHOR = "Anton Thielmann"
REQUIRES_PYTHON = ">=3.6, <=3.11"
REQUIRES_PYTHON = ">=3.6"


class PostInstallCommand(install):
"""Post-installation for downloading NLTK resources."""

def run(self):
install.run(self)
try:
import nltk

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt_tab")
nltk.download("brown")
nltk.download("averaged_perceptron_tagger_eng")
except ImportError:
print(
"NLTK not installed. Ensure it is listed in install_requires or installed separately."
)


# Load the package's version file and its content.
ROOT_DIR = Path(__file__).resolve().parent
@@ -30,6 +50,13 @@
if not line.startswith("#") and not line.startswith("git+")
]

extras_require = {
"plotting": ["dash", "plotly", "matplotlib", "wordcloud"],
"bertopic": ["hdbscan"],
"dcte": ["pyarrow", "setfit"],
}


# get long description from readme file
with open(os.path.join(ROOT_DIR, "README.md")) as f:
LONG_DESCRIPTION = f.read()
@@ -45,7 +72,7 @@
author_email=EMAIL,
python_requires=REQUIRES_PYTHON,
install_requires=install_reqs,
# extras_require=extras_reqs,
extras_require=extras_require,
license="MIT", # adapt based on your needs
packages=find_packages(exclude=["examples", "examples.*", "tests", "tests.*"]),
include_package_data=True,
@@ -65,4 +92,7 @@
],
project_urls={"Documentation": DOCS},
url=HOMEPAGE,
cmdclass={
"install": PostInstallCommand,
},
)
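For reference, the NLTK resources fetched by the new `PostInstallCommand` can also be downloaded manually; the sketch below assumes NLTK is already installed and reuses the resource names from the hook above:

```python
# Manual equivalent of PostInstallCommand.run(), assuming NLTK is installed.
import nltk

for resource in [
    "stopwords",
    "wordnet",
    "punkt_tab",
    "brown",
    "averaged_perceptron_tagger_eng",
]:
    nltk.download(resource)
```

Note that setuptools `cmdclass` hooks typically run only for source installs; installs from a prebuilt wheel may skip the hook, in which case the manual download above still applies.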
2 changes: 1 addition & 1 deletion stream_topic/__version__.py
@@ -1,4 +1,4 @@
"""Version information."""

# The following line *must* be the last in the module, exactly as formatted:
__version__ = "0.1.9"
__version__ = "0.2.0"
33 changes: 20 additions & 13 deletions stream_topic/metrics/coherence_metrics.py
@@ -355,6 +355,26 @@ def __init__(

self.n_words = n_words

def get_info(self):
"""
Get information about the metric.
Returns
-------
dict
Dictionary containing the metric name, the number of top words
considered, and a short description of the metric.
"""

info = {
"metric_name": "Embedding Coherence",
"n_words": self.n_words,
"description": "Embedding Coherence coherence",
}

return info

def score_per_topic(self, topics):
"""
Calculates coherence scores for each topic individually based on embedding similarities.
@@ -414,16 +434,3 @@ def score(self, topics):
"""
res = self.score_per_topic(topics).values()
return sum(res) / len(res)


def _load_default_texts():
"""
Loads default general texts
Returns
-------
result : default 20newsgroup texts
"""
dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")
return dataset.get_corpus()
24 changes: 19 additions & 5 deletions stream_topic/models/DCTE.py
@@ -4,12 +4,8 @@
import pyarrow as pa
from datasets import Dataset
from loguru import logger
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, TrainingArguments
from setfit import Trainer as SetfitTrainer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from ..commons.check_steps import check_dataset_steps
from ..preprocessor._tf_idf import c_tf_idf, extract_tfidf_topics
from ..utils.dataset import TMDataset
@@ -21,6 +17,19 @@
# logger.add(f"{MODEL_NAME}_{time}.log", backtrace=True, diagnose=True)


def import_setfit():
try:
from setfit import SetFitModel, TrainingArguments
from setfit import Trainer as SetfitTrainer
from sentence_transformers.losses import CosineSimilarityLoss

return SetFitModel, TrainingArguments, SetfitTrainer, CosineSimilarityLoss
except ImportError as e:
raise ImportError(
"Setfit is not installed. Please install it by running 'pip install setfit'."
) from e


class DCTE(BaseModel):
"""
A document classification and topic extraction class that utilizes the SetFitModel for
@@ -62,6 +71,9 @@ def __init__(
)
self.n_topics = None

# Lazy import SetFit components
SetFitModel, _, _, _ = import_setfit()

self.model = SetFitModel.from_pretrained(f"sentence-transformers/{model}")
self._status = TrainingStatus.NOT_STARTED
self.n_topics = None
@@ -122,7 +134,7 @@ def _get_topic_representation(self, predict_df: pd.DataFrame, top_words: int):
n=top_words,
)

one_hot_encoder = OneHotEncoder(sparse=False)
one_hot_encoder = OneHotEncoder(sparse_output=False)
predictions_one_hot = one_hot_encoder.fit_transform(predict_df[["predictions"]])

beta = tfidf
@@ -154,6 +166,8 @@ def fit(
dict: A dictionary containing the extracted topics and the topic-word matrix.
"""

_, TrainingArguments, SetfitTrainer, CosineSimilarityLoss = import_setfit()

assert isinstance(
dataset, TMDataset
), "The dataset must be an instance of TMDataset."
2 changes: 1 addition & 1 deletion stream_topic/models/KmeansTM.py
@@ -220,7 +220,7 @@ def fit(
)
self.topic_dict = extract_tfidf_topics(tfidf, count, docs_per_topic, n=100)

one_hot_encoder = OneHotEncoder(sparse=False)
one_hot_encoder = OneHotEncoder(sparse_output=False)
predictions_one_hot = one_hot_encoder.fit_transform(
self.dataframe[["predictions"]]
)
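The change from `sparse=False` to `sparse_output=False` here and in the other models follows scikit-learn's rename of the `OneHotEncoder` parameter (deprecated in 1.2, removed in 1.4). A minimal sketch of the updated call, using hypothetical cluster assignments:

```python
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Hypothetical cluster assignments standing in for the models' "predictions" column.
df = pd.DataFrame({"predictions": [0, 2, 1, 2]})

# scikit-learn >= 1.2: sparse_output=False returns a dense numpy array.
encoder = OneHotEncoder(sparse_output=False)
one_hot = encoder.fit_transform(df[["predictions"]])
print(one_hot.shape)  # (4, 3), one column per distinct label
```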
7 changes: 4 additions & 3 deletions stream_topic/models/abstract_helper_models/base.py
@@ -3,12 +3,9 @@
import pickle
from abc import ABC, abstractmethod
from enum import Enum

import optuna
import torch.nn as nn
import umap.umap_ as umap
from loguru import logger
from optuna.integration import PyTorchLightningPruningCallback


class BaseModel(ABC):
@@ -340,6 +337,10 @@ def optimize_hyperparameters(
dict
Dictionary containing the best parameters and the optimal number of topics.
"""
import importlib

optuna = importlib.import_module("optuna")

assert criterion in [
"aic",
"bic",
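The `importlib.import_module` call above, like the matching one in `bertopicTM.py` below, defers importing a heavy optional dependency until it is actually needed. A generic sketch of the same pattern follows; the `require` helper and its error message are illustrative and not part of stream_topic:

```python
import importlib


def require(module_name: str, extra: str):
    """Lazily import an optional dependency, pointing users at the matching extra if it is missing.

    Illustrative helper only, not part of the stream_topic codebase.
    """
    try:
        return importlib.import_module(module_name)
    except ImportError as exc:
        raise ImportError(
            f"'{module_name}' is required for this feature. "
            f"Install it with: pip install stream-topic[{extra}]"
        ) from exc


# Example usage, assuming the extras defined in setup.py above:
# hdbscan = require("hdbscan", "bertopic")
```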
8 changes: 5 additions & 3 deletions stream_topic/models/bertopicTM.py
@@ -1,6 +1,4 @@
from datetime import datetime

import hdbscan
import numpy as np
from loguru import logger
from sklearn.preprocessing import OneHotEncoder
@@ -121,6 +119,10 @@ def _clustering(self):
Applies K-Means clustering to the reduced embeddings.
"""

import importlib

hdbscan = importlib.import_module("hdbscan")

assert (
hasattr(self, "reduced_embeddings") and self.reduced_embeddings is not None
), "Reduced embeddings must be generated before clustering."
@@ -192,7 +194,7 @@ def fit(self, dataset, n_topics=None):

self.topic_dict = extract_tfidf_topics(tfidf, count, docs_per_topic, n=100)

one_hot_encoder = OneHotEncoder(sparse=False)
one_hot_encoder = OneHotEncoder(sparse_output=False)
predictions_one_hot = one_hot_encoder.fit_transform(
self.dataframe[["predictions"]]
)
20 changes: 7 additions & 13 deletions stream_topic/models/cbc.py
@@ -9,8 +9,7 @@

from ..commons.check_steps import check_dataset_steps
from ..preprocessor import c_tf_idf, extract_tfidf_topics
from ..utils.cbc_utils import (DocumentCoherence,
get_top_tfidf_words_per_document)
from ..utils.cbc_utils import DocumentCoherence, get_top_tfidf_words_per_document
from ..utils.dataset import TMDataset
from .abstract_helper_models.base import BaseModel, TrainingStatus

@@ -189,12 +188,10 @@ def fit(
clusters = self.cluster_documents()

num_clusters = len(clusters)
print(
f"Iteration {iteration}: {num_clusters} clusters formed.")
print(f"Iteration {iteration}: {num_clusters} clusters formed.")

# Prepare for the next iteration
combined_documents = self.combine_documents(
current_documents, clusters)
combined_documents = self.combine_documents(current_documents, clusters)
current_documents = combined_documents
iteration += 1

@@ -247,8 +244,7 @@ def fit(
self.labels += 1

# Update the 'predictions' column in the dataframe with -1 where NaN was present
self.dataframe["predictions"] = self.dataframe["predictions"].fillna(
-1)
self.dataframe["predictions"] = self.dataframe["predictions"].fillna(-1)
self.dataframe["predictions"] += 1
print("--- replaced NaN values with 0 in topics ---")
print(
@@ -259,13 +255,11 @@
{"text": " ".join}
)
logger.info("--- Extract topics ---")
tfidf, count = c_tf_idf(
docs_per_topic["text"].values, m=len(self.dataframe))
self.topic_dict = extract_tfidf_topics(
tfidf, count, docs_per_topic, n=10)
tfidf, count = c_tf_idf(docs_per_topic["text"].values, m=len(self.dataframe))
self.topic_dict = extract_tfidf_topics(tfidf, count, docs_per_topic, n=10)

one_hot_encoder = OneHotEncoder(
sparse=False
sparse_output=False
) # Use sparse_output=False to get a dense array
predictions_one_hot = one_hot_encoder.fit_transform(
self.dataframe[["predictions"]]
