Added vectorizer & predict_weights
KanishkNavale committed Aug 31, 2021
1 parent 664b47a commit 3d5a8fd
Showing 10 changed files with 211 additions and 11 deletions.
45 changes: 44 additions & 1 deletion README.md
@@ -28,7 +28,7 @@ from irtm.toolbox import *
>>> 'M466'
```

-2. Tokenizer: Convert a sequence of characters into a sequence of tokens.
+2. Tokenizer: Converts a sequence of characters into a sequence of tokens.

```python
print(tokenize('LINUX'))
@@ -39,3 +39,46 @@ from irtm.toolbox import *
>>> ['linux']
>>> ['text', 'mining']
```

3. Vectorize: Converts a string to a token-based weight tensor.

```python
vector = vectorize([
    'texts ([string]): a multiline or a single line string.',
    'dict ([list], optional): list of tokens. Defaults to None.',
    'enable_Idf (bool, optional): use IDF or not. Defaults to True.',
    'normalize (str, optional): normalization of vector. Defaults to l2.',
    'max_dim ([int], optional): dimension of vector. Defaults to None.',
    'smooth (bool, optional): restricts value >0. Defaults to True.',
    'weightedTf (bool, optional): Tf = 1+log(Tf). Defaults to True.',
    'return_features (bool, optional): feature vector. Defaults to False.'
])

print(f'Vector Shape={vector.shape}')
```

```bash
>>> Vector Shape=(8, 37)
```
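
   Since `return_features=True` is part of the signature, the same call can also hand back the vocabulary behind each matrix column. A minimal sketch (the two sentences are made-up inputs, so the printed values are illustrative only):

```python
vector, features = vectorize([
    'information retrieval ranks documents by relevance.',
    'text mining extracts patterns from raw text.'
], return_features=True)

print(features)                            # tokens backing each matrix column
print(vector[:, features.index('text')])  # per-document weight of one token
```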

4. Predict Token Weights: Computes the importance of a token based on classification optimization.

```python
import numpy as np

dictionary = ['vector', 'string', 'bool']
vector = vectorize([
    'X ([np.array]): vectorized matrix columns arranged as per the dictionary.',
    'y ([labels]): True classification labels.',
    'epochs ([int]): Optimization epochs.',
    'verbose (bool, optional): Enable verbose outputs. Defaults to False.',
    'dict ([type], optional): list of tokens. Defaults to None.'
], dict=dictionary)

labels = np.random.randint(2, size=(vector.shape[0], 1))  # random binary labels
weights = predict_weights(vector, labels, 100, dict=dictionary)
```

```bash
>>> Token-Weights Mappings: {'vector': 0.22097790924850977,
                             'string': 0.39296369957440075,
                             'bool': 0.689853175081446}
```
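
   The mapping comes back as a plain `dict`, so ranking tokens by learned importance is a one-liner; a small sketch (the weights vary run to run because of the random initialization and labels):

```python
ranked = sorted(weights.items(), key=lambda kv: kv[1], reverse=True)
for token, weight in ranked:
    print(f'{token}: {weight:.4f}')
```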
Binary file removed dist/irtm-0.0.2-py3-none-any.whl
Binary file removed dist/irtm-0.0.2.tar.gz
Binary file added dist/irtm-0.0.3-py3-none-any.whl
Binary file added dist/irtm-0.0.3.tar.gz
4 changes: 3 additions & 1 deletion requirements.txt
@@ -1 +1,3 @@
-nltk==3.5
+numpy
+nltk
+scikit_learn
8 changes: 5 additions & 3 deletions setup.py
@@ -10,10 +10,10 @@
name='irtm',

# Version ID
-version='0.0.2',
+version='0.0.3',

# Package Descriptions
-description='A toolbox for Information Retreival & Text Mining.',
+description='A toolbox for Information Retrieval & Text Mining.',
long_description=long_description,
long_description_content_type="text/markdown",
url='https://github.com/KanishkNavale/IRTM-Toolbox.git',
@@ -27,7 +27,9 @@

# Setup Prerequisites
install_requires=[
-    'nltk==3.5'
+    'numpy',
+    'nltk',
+    'scikit_learn'
],

# Package Build Source Pointer
49 changes: 46 additions & 3 deletions src/irtm.egg-info/PKG-INFO
@@ -1,7 +1,7 @@
Metadata-Version: 2.1
Name: irtm
-Version: 0.0.2
-Summary: A toolbox for Information Retreival & Text Mining.
+Version: 0.0.3
+Summary: A toolbox for Information Retrieval & Text Mining.
Home-page: https://github.com/KanishkNavale/IRTM-Toolbox.git
Author: Kanishk Navale
Author-email: [email protected]
@@ -44,7 +44,7 @@ from irtm.toolbox import *
>>> 'M466'
```

-2. Tokenizer: Convert a sequence of characters into a sequence of tokens.
+2. Tokenizer: Converts a sequence of characters into a sequence of tokens.

```python
print(tokenize('LINUX'))
@@ -56,4 +56,47 @@ from irtm.toolbox import *
>>> ['text', 'mining']
```

3. Vectorize: Converts a string to a token-based weight tensor.

```python
vector = vectorize([
    'texts ([string]): a multiline or a single line string.',
    'dict ([list], optional): list of tokens. Defaults to None.',
    'enable_Idf (bool, optional): use IDF or not. Defaults to True.',
    'normalize (str, optional): normalization of vector. Defaults to l2.',
    'max_dim ([int], optional): dimension of vector. Defaults to None.',
    'smooth (bool, optional): restricts value >0. Defaults to True.',
    'weightedTf (bool, optional): Tf = 1+log(Tf). Defaults to True.',
    'return_features (bool, optional): feature vector. Defaults to False.'
])

print(f'Vector Shape={vector.shape}')
```

```bash
>>> Vector Shape=(8, 37)
```

4. Predict Token Weights: Computes the importance of a token based on classification optimization.

```python
import numpy as np

dictionary = ['vector', 'string', 'bool']
vector = vectorize([
    'X ([np.array]): vectorized matrix columns arranged as per the dictionary.',
    'y ([labels]): True classification labels.',
    'epochs ([int]): Optimization epochs.',
    'verbose (bool, optional): Enable verbose outputs. Defaults to False.',
    'dict ([type], optional): list of tokens. Defaults to None.'
], dict=dictionary)

labels = np.random.randint(2, size=(vector.shape[0], 1))  # random binary labels
weights = predict_weights(vector, labels, 100, dict=dictionary)
```

```bash
>>> Token-Weights Mappings: {'vector': 0.22097790924850977,
                             'string': 0.39296369957440075,
                             'bool': 0.689853175081446}
```


4 changes: 3 additions & 1 deletion src/irtm.egg-info/requires.txt
@@ -1 +1,3 @@
-nltk==3.5
+numpy
+nltk
+scikit_learn
112 changes: 110 additions & 2 deletions src/irtm/toolbox.py
@@ -4,6 +4,9 @@
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+import numpy as np
+from collections import deque


###############################################################################
@@ -18,7 +21,6 @@ def soundex(word):
    Args:
        word (string): string for conversion.
    """
-
    if word.isalpha():

        # Clip the first value
@@ -60,7 +62,7 @@ def soundex(word):
        for i in range(len(word)):
            try:
                word.remove('0')
-            except Exception as e:
+            except:
                pass

        # Add the header in
@@ -89,6 +91,16 @@ def soundex(word):
###############################################################################

def tokenize(word):
    """
    Description:
        Tokenizes a string.

    Args:
        word ([str]): input string.

    Returns:
        [list]: list of tokens.
    """
    tokens = word_tokenize(word)
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word.lower() for word in tokens]
@@ -97,3 +109,99 @@ def tokenize(word):
    tokens = [lemma.lemmatize(word, pos="v") for word in tokens]
    tokens = [lemma.lemmatize(word, pos="n") for word in tokens]
    return tokens


###############################################################################
# Vectorizer
###############################################################################

def vectorize(texts, dict=None, enable_Idf=True,
              normalize='l2', max_dim=None,
              smooth=True, weightedTf=True, return_features=False):
    """
    Description:
        Creates weights tensor based on parsed string.

    Args:
        texts ([string]): a multiline or a single line string.
        dict ([list], optional): list of tokens. Defaults to None.
        enable_Idf (bool, optional): use IDF or not. Defaults to True.
        normalize (str, optional): normalization of vector. Defaults to 'l2'.
        max_dim ([int], optional): dimension of vector. Defaults to None.
        smooth (bool, optional): restricts value >0. Defaults to True.
        weightedTf (bool, optional): Tf = 1+log(Tf). Defaults to True.
        return_features (bool, optional): feature vector. Defaults to False.

    Returns:
        [np.matrix]: vectorized weight matrix
        [list]: feature vectors
    """
    if dict is None:
        vectorizer = TfidfVectorizer(use_idf=enable_Idf,
                                     norm=normalize, max_features=max_dim,
                                     sublinear_tf=weightedTf,
                                     smooth_idf=smooth)
    else:
        vectorizer = TfidfVectorizer(vocabulary=dict, use_idf=enable_Idf,
                                     norm=normalize, max_features=max_dim,
                                     sublinear_tf=weightedTf,
                                     smooth_idf=smooth)

    vector = vectorizer.fit_transform(texts)

    if return_features:
        return vector.todense(), vectorizer.get_feature_names()
    else:
        return vector.todense()
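
# Note: when `dict` is a list, scikit-learn keeps the tokens in list order, so
# the columns of the returned matrix line up one-to-one with `dict` -- the
# layout that `predict_weights` below assumes for its `X` argument.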


###############################################################################
# PREDICT WEIGHTS
###############################################################################
def predict_weights(X, y, epochs, verbose=False, dict=None):
    """
    Description:
        Predicts importance of a token based on classification optimization.

    Args:
        X ([np.array]): vectorized matrix columns arranged as per the dictionary.
        y ([labels]): True classification labels.
        epochs ([int]): Optimization epochs.
        verbose (bool, optional): Enable verbose outputs. Defaults to False.
        dict ([type], optional): list of tokens. Defaults to None.

    Returns:
        [dictionary]: Mapping of tokens & their weights
    """

    W = np.random.uniform(0, 1, (1, X.shape[1]))
    v = np.zeros(W.shape)
    loss_log = deque(maxlen=3)

    for i in range(loss_log.maxlen):
        loss_log.append(0)

    for i in range(int(epochs)):
        # Sigmoid of the linear scores.
        pred_y = 1.0 / (1.0 + np.exp(-X @ W.T))
        loss = -np.mean(np.log(pred_y.T) @ y)
        loss_log.append(loss)
        # Logistic-loss gradient with an L2 penalty, then a momentum step.
        gradient = (pred_y - y).T @ X + (2.0 * 0.1 * W)
        v = (0.9 * v) + (1e-3 * gradient)
        W = W - v

        if verbose:
            if i % 100 == 0:
                print(f'Epoch={i} \t Loss={loss}')
            if np.mean(loss_log) == loss:
                print('Loss is not decreasing enough!')
                break
        else:
            if np.mean(loss_log) == loss:
                break

    mapping = {}
    weights = np.ravel(W)
    for i in range(len(dict)):
        mapping[dict[i]] = weights[i]

    return mapping
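
Read as math, the training loop above is, in effect, logistic regression with an L2 penalty trained by momentum gradient descent; in the code's notation, with its hard-wired constants (penalty λ = 0.1, momentum 0.9, learning rate 1e-3):

```latex
\hat{y} = \sigma(X W^{\top}), \qquad
\nabla_W = (\hat{y} - y)^{\top} X + 2\lambda W, \qquad
v_t = 0.9\, v_{t-1} + 10^{-3}\, \nabla_W, \qquad
W_t = W_{t-1} - v_t
```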
