Isolated the generation of document vectors for external usage. #336

Open · wants to merge 1 commit into base: master
top2vec/Top2Vec.py (80 changes: 58 additions & 22 deletions)
@@ -1503,6 +1503,58 @@ def get_documents_topics(self, doc_ids, reduced=False, num_topics=1):
 
         return doc_topics, doc_dist, topic_words, topic_word_scores
 
+    def get_document_vectors(self,
+                             documents,
+                             tokenizer=None,
+                             use_embedding_model_tokenizer=False,
+                             embedding_batch_size=32):
+        """
+        Infer a document vector for each of the given documents.
+
+        The vectors are computed with the model's current embedding model.
+        The model itself is not modified: no existing document, word or
+        topic vectors are changed, and topic sizes are not updated.
+
+        If the documents contain a largely new vocabulary relative to the
+        training corpus, the inferred vectors may be unreliable; training
+        a new model will give the best results.
+
+        Parameters
+        ----------
+        documents: List of str
+            The documents to infer vectors for.
+
+        tokenizer: callable (Optional, default None)
+            Override the default tokenization method. If None then
+            gensim.utils.simple_preprocess will be used.
+
+        use_embedding_model_tokenizer: bool (Optional, default False)
+            If using an embedding model other than doc2vec, use the model's
+            tokenizer for document embedding.
+
+        embedding_batch_size: int (default=32)
+            Batch size for documents being embedded.
+
+        Returns
+        -------
+        document_vectors: 2D array of shape (num_documents, vector_dim)
+            The inferred vector of each document, in input order.
+        """
+
+        # if tokenizer is not passed use default
+        if tokenizer is None:
+            tokenizer = default_tokenizer
+
+        if self.embedding_model == "doc2vec":
+            docs_processed = [tokenizer(doc) for doc in documents]
+            document_vectors = np.vstack([self.model.infer_vector(doc_words=doc,
+                                                                  alpha=0.025,
+                                                                  min_alpha=0.01,
+                                                                  epochs=100) for doc in docs_processed])
+            document_vectors = self._l2_normalize(document_vectors)
+
+        else:
+            if use_embedding_model_tokenizer:
+                docs_training = documents
+            else:
+                docs_processed = [tokenizer(doc) for doc in documents]
+                docs_training = [' '.join(doc) for doc in docs_processed]
+            document_vectors = self._embed_documents(docs_training, embedding_batch_size)
+
+        return document_vectors
 
     def add_documents(self,
                       documents,
                       doc_ids=None,
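Note: a minimal usage sketch for the new method, assuming a previously trained and saved model (the load path, the example documents, and the similarity step are illustrative, not part of this PR):

```python
import numpy as np
from top2vec import Top2Vec

# Assumed: a model trained earlier and saved under this hypothetical name.
model = Top2Vec.load("top2vec_model")

# Infer vectors for unseen documents; the model itself is left unchanged.
new_docs = ["vector search over scientific abstracts",
            "semantic similarity between support tickets"]
vectors = model.get_document_vectors(new_docs)

# Example external use: compare new documents to the learned topics.
# For doc2vec models both sides are L2-normalized, so the inner product
# is cosine similarity; for other embedding models this is an assumption.
similarities = np.inner(vectors, model.topic_vectors)
print(similarities.argmax(axis=1))  # closest topic per new document
```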
@@ -1540,9 +1592,6 @@ def add_documents(self,
         embedding_batch_size: int (default=32)
             Batch size for documents being embedded.
         """
-        # if tokenizer is not passed use default
-        if tokenizer is None:
-            tokenizer = default_tokenizer
 
         # add documents
         self._validate_documents(documents)
@@ -1566,24 +1615,11 @@
         else:
             raise ValueError("doc_ids cannot be used because they were not provided to model during training.")
 
-        if self.embedding_model == "doc2vec":
-            docs_processed = [tokenizer(doc) for doc in documents]
-            document_vectors = np.vstack([self.model.infer_vector(doc_words=doc,
-                                                                  alpha=0.025,
-                                                                  min_alpha=0.01,
-                                                                  epochs=100) for doc in docs_processed])
-
-            document_vectors = self._l2_normalize(document_vectors)
-            self.document_vectors = np.vstack([self.document_vectors, document_vectors])
-
-        else:
-            if use_embedding_model_tokenizer:
-                docs_training = documents
-            else:
-                docs_processed = [tokenizer(doc) for doc in documents]
-                docs_training = [' '.join(doc) for doc in docs_processed]
-            document_vectors = self._embed_documents(docs_training, embedding_batch_size)
-            self.document_vectors = np.vstack([self.document_vectors, document_vectors])
+        # get document vectors
+        document_vectors = self.get_document_vectors(
+            documents, tokenizer, use_embedding_model_tokenizer, embedding_batch_size
+        )
+        self.document_vectors = np.vstack([self.document_vectors, document_vectors])
 
         # update index
         if self.documents_indexed:
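Since add_documents now delegates to get_document_vectors, both paths produce vectors the same way. A small sanity-check sketch (reusing the assumed model and new_docs from the note above; omitting doc_ids assumes the model was trained without them):

```python
# Hypothetical shape check: add_documents appends one vector per new
# document. Only shapes are compared, since doc2vec inference is
# stochastic and repeated calls can yield slightly different vectors.
n_before = model.document_vectors.shape[0]
model.add_documents(new_docs)
assert model.document_vectors.shape[0] == n_before + len(new_docs)
```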
@@ -2736,4 +2772,4 @@ def generate_topic_wordcloud(self, topic_num, background_color="black", reduced=
             WordCloud(width=1600,
                       height=400,
                       background_color=background_color).generate_from_frequencies(word_score_dict))
-        plt.title("Topic " + str(topic_num), loc='left', fontsize=25, pad=20)
\ No newline at end of file
+        plt.title("Topic " + str(topic_num), loc='left', fontsize=25, pad=20)