Added vectorizer & predict_weights
KanishkNavale committed Aug 31, 2021
1 parent 664b47a commit 3d5a8fd
Showing 10 changed files with 211 additions and 11 deletions.
45 changes: 44 additions & 1 deletion README.md
@@ -28,7 +28,7 @@ from irtm.toolbox import *
>>> 'M466'
```

-2. Tokenizer: Convert a sequence of characters into a sequence of tokens.
+2. Tokenizer: Converts a sequence of characters into a sequence of tokens.

```python
print(tokenize('LINUX'))
@@ -39,3 +39,46 @@ from irtm.toolbox import *
>>> ['linux']
>>> ['text', 'mining']
```

3. Vectorize: Converts a string to a token-based weight tensor.

```python
vector = vectorize([
    'texts ([string]): a multiline or a single line string.',
    'dict ([list], optional): list of tokens. Defaults to None.',
    'enable_Idf (bool, optional): use IDF or not. Defaults to True.',
    'normalize (str, optional): normalization of vector. Defaults to l2.',
    'max_dim ([int], optional): dimension of vector. Defaults to None.',
    'smooth (bool, optional): restricts value >0. Defaults to True.',
    'weightedTf (bool, optional): Tf = 1+log(Tf). Defaults to True.',
    'return_features (bool, optional): feature vector. Defaults to False.'
])

print(f'Vector Shape={vector.shape}')
```

```bash
>>> Vector Shape=(8, 37)
```
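
   Since `return_features=True` is part of the signature, the same call can also hand back the vocabulary behind each matrix column. A minimal sketch (the two sentences are made-up inputs, so the printed values are illustrative only):

```python
vector, features = vectorize([
    'information retrieval ranks documents by relevance.',
    'text mining extracts patterns from raw text.'
], return_features=True)

print(features)                            # tokens backing each matrix column
print(vector[:, features.index('text')])  # per-document weight of one token
```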

4. Predict Token Weights: Computes the importance of a token based on classification optimization.

```python
import numpy as np

dictionary = ['vector', 'string', 'bool']
vector = vectorize([
    'X ([np.array]): vectorized matrix columns arranged as per the dictionary.',
    'y ([labels]): True classification labels.',
    'epochs ([int]): Optimization epochs.',
    'verbose (bool, optional): Enable verbose outputs. Defaults to False.',
    'dict ([type], optional): list of tokens. Defaults to None.'
], dict=dictionary)

labels = np.random.randint(2, size=(vector.shape[0], 1))  # random binary labels
weights = predict_weights(vector, labels, 100, dict=dictionary)
```

```bash
>>> Token-Weights Mappings: {'vector': 0.22097790924850977,
                             'string': 0.39296369957440075,
                             'bool': 0.689853175081446}
```
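
   The mapping comes back as a plain `dict`, so ranking tokens by learned importance is a one-liner; a small sketch (the weights vary run to run because of the random initialization and labels):

```python
ranked = sorted(weights.items(), key=lambda kv: kv[1], reverse=True)
for token, weight in ranked:
    print(f'{token}: {weight:.4f}')
```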
Binary file removed dist/irtm-0.0.2-py3-none-any.whl
Binary file removed dist/irtm-0.0.2.tar.gz
Binary file added dist/irtm-0.0.3-py3-none-any.whl
Binary file added dist/irtm-0.0.3.tar.gz
4 changes: 3 additions & 1 deletion requirements.txt
@@ -1 +1,3 @@
-nltk==3.5
+numpy
+nltk
+scikit_learn
8 changes: 5 additions & 3 deletions setup.py
@@ -10,10 +10,10 @@
name='irtm',

# Version ID
-version='0.0.2',
+version='0.0.3',

# Package Descriptions
-description='A toolbox for Information Retreival & Text Mining.',
+description='A toolbox for Information Retrieval & Text Mining.',
long_description=long_description,
long_description_content_type="text/markdown",
url='https://github.com/KanishkNavale/IRTM-Toolbox.git',
@@ -27,7 +27,9 @@

# Setup Prerequisites
install_requires=[
-    'nltk==3.5'
+    'numpy',
+    'nltk',
+    'scikit_learn'
],

# Package Build Source Pointer
49 changes: 46 additions & 3 deletions src/irtm.egg-info/PKG-INFO
@@ -1,7 +1,7 @@
Metadata-Version: 2.1
Name: irtm
-Version: 0.0.2
-Summary: A toolbox for Information Retreival & Text Mining.
+Version: 0.0.3
+Summary: A toolbox for Information Retrieval & Text Mining.
Home-page: https://github.com/KanishkNavale/IRTM-Toolbox.git
Author: Kanishk Navale
Author-email: [email protected]
@@ -44,7 +44,7 @@ from irtm.toolbox import *
>>> 'M466'
```

-2. Tokenizer: Convert a sequence of characters into a sequence of tokens.
+2. Tokenizer: Converts a sequence of characters into a sequence of tokens.

```python
print(tokenize('LINUX'))
@@ -56,4 +56,47 @@ from irtm.toolbox import *
>>> ['text', 'mining']
```

3. Vectorize: Converts a string to a token-based weight tensor.

```python
vector = vectorize([
    'texts ([string]): a multiline or a single line string.',
    'dict ([list], optional): list of tokens. Defaults to None.',
    'enable_Idf (bool, optional): use IDF or not. Defaults to True.',
    'normalize (str, optional): normalization of vector. Defaults to l2.',
    'max_dim ([int], optional): dimension of vector. Defaults to None.',
    'smooth (bool, optional): restricts value >0. Defaults to True.',
    'weightedTf (bool, optional): Tf = 1+log(Tf). Defaults to True.',
    'return_features (bool, optional): feature vector. Defaults to False.'
])

print(f'Vector Shape={vector.shape}')
```

```bash
>>> Vector Shape=(8, 37)
```

4. Predict Token Weights: Computes the importance of a token based on classification optimization.

```python
import numpy as np

dictionary = ['vector', 'string', 'bool']
vector = vectorize([
    'X ([np.array]): vectorized matrix columns arranged as per the dictionary.',
    'y ([labels]): True classification labels.',
    'epochs ([int]): Optimization epochs.',
    'verbose (bool, optional): Enable verbose outputs. Defaults to False.',
    'dict ([type], optional): list of tokens. Defaults to None.'
], dict=dictionary)

labels = np.random.randint(2, size=(vector.shape[0], 1))  # random binary labels
weights = predict_weights(vector, labels, 100, dict=dictionary)
```

```bash
>>> Token-Weights Mappings: {'vector': 0.22097790924850977,
                             'string': 0.39296369957440075,
                             'bool': 0.689853175081446}
```


4 changes: 3 additions & 1 deletion src/irtm.egg-info/requires.txt
@@ -1 +1,3 @@
-nltk==3.5
+numpy
+nltk
+scikit_learn
112 changes: 110 additions & 2 deletions src/irtm/toolbox.py
@@ -4,6 +4,9 @@
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+import numpy as np
+from collections import deque


###############################################################################
@@ -18,7 +21,6 @@ def soundex(word):
    Args:
        word (string): string for conversion.
    """
-
    if word.isalpha():

        # Clip the first value
@@ -60,7 +62,7 @@ def soundex(word):
        for i in range(len(word)):
            try:
                word.remove('0')
-            except Exception as e:
+            except:
                pass

        # Add the header in
@@ -89,6 +91,16 @@ def soundex(word):
###############################################################################

def tokenize(word):
    """
    Description:
        Tokenizes a string.

    Args:
        word ([str]): input string.

    Returns:
        [list]: list of tokens.
    """
    tokens = word_tokenize(word)
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word.lower() for word in tokens]
@@ -97,3 +109,99 @@ def tokenize(word):
    tokens = [lemma.lemmatize(word, pos="v") for word in tokens]
    tokens = [lemma.lemmatize(word, pos="n") for word in tokens]
    return tokens


###############################################################################
# Vectorizer
###############################################################################

def vectorize(texts, dict=None, enable_Idf=True,
              normalize='l2', max_dim=None,
              smooth=True, weightedTf=True, return_features=False):
    """
    Description:
        Creates weights tensor based on parsed string.

    Args:
        texts ([string]): a multiline or a single line string.
        dict ([list], optional): list of tokens. Defaults to None.
        enable_Idf (bool, optional): use IDF or not. Defaults to True.
        normalize (str, optional): normalization of vector. Defaults to 'l2'.
        max_dim ([int], optional): dimension of vector. Defaults to None.
        smooth (bool, optional): restricts value >0. Defaults to True.
        weightedTf (bool, optional): Tf = 1+log(Tf). Defaults to True.
        return_features (bool, optional): feature vector. Defaults to False.

    Returns:
        [np.matrix]: vectorized weight matrix
        [list]: feature vectors
    """
    if dict is None:
        vectorizer = TfidfVectorizer(use_idf=enable_Idf,
                                     norm=normalize, max_features=max_dim,
                                     sublinear_tf=weightedTf,
                                     smooth_idf=smooth)
    else:
        vectorizer = TfidfVectorizer(vocabulary=dict, use_idf=enable_Idf,
                                     norm=normalize, max_features=max_dim,
                                     sublinear_tf=weightedTf,
                                     smooth_idf=smooth)

    vector = vectorizer.fit_transform(texts)

    if return_features:
        return vector.todense(), vectorizer.get_feature_names()
    else:
        return vector.todense()
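
# Note: when `dict` is a list, scikit-learn keeps the tokens in list order, so
# the columns of the returned matrix line up one-to-one with `dict` -- the
# layout that `predict_weights` below assumes for its `X` argument.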


###############################################################################
# PREDICT WEIGHTS
###############################################################################
def predict_weights(X, y, epochs, verbose=False, dict=None):
    """
    Description:
        Predicts importance of a token based on classification optimization.

    Args:
        X ([np.array]): vectorized matrix columns arranged as per the dictionary.
        y ([labels]): True classification labels.
        epochs ([int]): Optimization epochs.
        verbose (bool, optional): Enable verbose outputs. Defaults to False.
        dict ([type], optional): list of tokens. Defaults to None.

    Returns:
        [dictionary]: Mapping of tokens & their weights
    """

    W = np.random.uniform(0, 1, (1, X.shape[1]))
    v = np.zeros(W.shape)
    loss_log = deque(maxlen=3)

    for i in range(loss_log.maxlen):
        loss_log.append(0)

    for i in range(int(epochs)):
        # Sigmoid of the linear scores.
        pred_y = 1.0 / (1.0 + np.exp(-X @ W.T))
        loss = -np.mean(np.log(pred_y.T) @ y)
        loss_log.append(loss)
        # Logistic-loss gradient with an L2 penalty, then a momentum step.
        gradient = (pred_y - y).T @ X + (2.0 * 0.1 * W)
        v = (0.9 * v) + (1e-3 * gradient)
        W = W - v

        if verbose:
            if i % 100 == 0:
                print(f'Epoch={i} \t Loss={loss}')
            if np.mean(loss_log) == loss:
                print('Loss is not decreasing enough!')
                break
        else:
            if np.mean(loss_log) == loss:
                break

    mapping = {}
    weights = np.ravel(W)
    for i in range(len(dict)):
        mapping[dict[i]] = weights[i]

    return mapping
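
Read as math, the training loop above is, in effect, logistic regression with an L2 penalty trained by momentum gradient descent; in the code's notation, with its hard-wired constants (penalty λ = 0.1, momentum 0.9, learning rate 1e-3):

```latex
\hat{y} = \sigma(X W^{\top}), \qquad
\nabla_W = (\hat{y} - y)^{\top} X + 2\lambda W, \qquad
v_t = 0.9\, v_{t-1} + 10^{-3}\, \nabla_W, \qquad
W_t = W_{t-1} - v_t
```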
