Skip to content

Commit

Permalink
using naive bayes classifier
Browse files Browse the repository at this point in the history
  • Loading branch information
SumukhSKashyap committed May 14, 2024
1 parent eef33c6 commit 2cad496
Show file tree
Hide file tree
Showing 2 changed files with 38,013 additions and 38,041 deletions.
Original file line number Diff line number Diff line change
@@ -1,38 +1,11 @@
from collections import Counter
from pathlib import Path

import pandas as pd
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from tqdm import tqdm

from tira.rest_api_client import Client
from tira.third_party_integrations import get_output_directory

def generate_ngrams(text, n=3):
    """Return the list of overlapping character n-grams of *text*.

    Parameters
    ----------
    text : str
        Input string.
    n : int
        N-gram length (default 3).

    Returns
    -------
    list[str]
        All ``len(text) - n + 1`` substrings of length *n*, in order.
        Texts shorter than *n* yield an empty list.
    """
    # Comprehension replaces the manual append loop; same order, same contents.
    return [text[i:i + n] for i in range(len(text) - n + 1)]

def count_ngrams(text, n=3):
    """Return a dict mapping each character n-gram of *text* to its frequency.

    Parameters
    ----------
    text : str
        Input string.
    n : int
        N-gram length (default 3).

    Returns
    -------
    dict[str, int]
        Frequency table of the n-grams; empty for texts shorter than *n*.
    """
    # Counter replaces the hand-rolled if/else counting; the generator is
    # inlined (same slices generate_ngrams produces) so this function stands
    # alone.  Wrapped in dict() to preserve the original plain-dict return type.
    return dict(Counter(text[i:i + n] for i in range(len(text) - n + 1)))

# Function to extract n-gram features
def extract_features(texts, n=3):
    """Build a sparse (len(texts), len(lang_ids)) matrix of n-gram counts.

    NOTE(review): reads the module-level ``lang_ids`` list that is only
    assigned inside the ``__main__`` section — calling this earlier raises
    NameError.
    NOTE(review): each column j holds the frequency of the *language-code
    string itself* (e.g. "de") among the text's n-grams.  Every lang_id is
    2 characters, so with the default n=3 no n-gram can ever match and the
    matrix is all zeros — presumably why this path was replaced by the
    CountVectorizer/MultinomialNB pipeline; confirm before reusing.
    """
    num_texts = len(texts)
    # lil_matrix: cheap incremental cell assignment for a sparse result.
    features = lil_matrix((num_texts, len(lang_ids)), dtype=int)
    for i, text in enumerate(texts):
        ngram_counts = count_ngrams(text, n)
        for j, lang_id in enumerate(lang_ids):
            # Count of the lang_id string among this text's n-grams (0 if absent).
            features[i, j] = ngram_counts.get(lang_id, 0)
    return features

if __name__ == "__main__":

    # TIRA REST client; presumably used to load the validation dataset in
    # the collapsed region that follows — confirm against the full file.
    tira = Client()
Expand All @@ -48,20 +21,19 @@ def extract_features(texts, n=3):
    # Define language IDs
    # Closed set of 20 language codes the classifier predicts.
    lang_ids = ["af", "az", "bg", "cs", "da", "de", "el", "en", "es", "fi", "fr", "hr", "it", "ko", "nl", "no", "pl", "ru", "ur", "zh"]

    # NOTE(review): this span interleaves both sides of a rendered diff.
    # The two lines below (n, features) are the OLD extract_features path;
    # their result is never used after `prediction = clf.predict(X)` further
    # down, so they only waste work — keep exactly one pipeline.
    n = 3  # Adjust n-gram size as needed
    features = extract_features(text_validation['text'], n)
    # NEW path: bag of character trigrams over the validation texts.
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 3))
    X = vectorizer.fit_transform(text_validation['text'])

    # Train Naive Bayes classifier on the trigram counts.
    # NOTE(review): it fits and predicts on the same validation split, so any
    # reported accuracy is optimistic — confirm this is intended.
    clf = MultinomialNB()
    clf.fit(X, targets_validation['lang'])

    # OLD prediction loop (argmax over the extract_features matrix, which is
    # all zeros for n=3 — see extract_features); the `prediction` list it
    # builds is immediately overwritten by clf.predict below.
    prediction = []
    for i in tqdm(range(features.shape[0])):
        max_lang_index = features[i].todense().argmax(axis=1)
        max_lang = lang_ids[max_lang_index[0, 0]]
        prediction.append(max_lang)
    # NEW path: predict a language code for every validation text.
    prediction = clf.predict(X)

    # Create DataFrame for predictions: one (lang, id) row per text.
    # NOTE(review): `text_validation` / `targets_validation` are loaded in a
    # collapsed region not shown here — presumably via the tira client.
    prediction_df = pd.DataFrame({'lang': prediction, 'id': text_validation['id']})
    print(prediction_df)

    # saving the prediction
    output_directory = get_output_directory(str(Path(__file__).parent))
Expand Down
Loading

0 comments on commit 2cad496

Please sign in to comment.