Skip to content

Commit

Permalink
using naive bayes classifier
Browse files Browse the repository at this point in the history
  • Loading branch information
SumukhSKashyap committed May 14, 2024
1 parent eef33c6 commit 2cad496
Show file tree
Hide file tree
Showing 2 changed files with 38,013 additions and 38,041 deletions.
Original file line number Diff line number Diff line change
@@ -1,38 +1,11 @@
from collections import Counter
from pathlib import Path

import pandas as pd
from scipy.sparse import lil_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from tqdm import tqdm

from tira.rest_api_client import Client
from tira.third_party_integrations import get_output_directory

def generate_ngrams(text, n=3):
    """Return the list of overlapping character n-grams of *text*.

    Parameters
    ----------
    text : str
        Input string.
    n : int
        N-gram length (default 3).

    Returns
    -------
    list[str]
        All ``len(text) - n + 1`` substrings of length *n*, in order.
        Texts shorter than *n* yield an empty list.
    """
    # Comprehension replaces the manual append loop; same order, same contents.
    return [text[i:i + n] for i in range(len(text) - n + 1)]

def count_ngrams(text, n=3):
    """Return a dict mapping each character n-gram of *text* to its frequency.

    Parameters
    ----------
    text : str
        Input string.
    n : int
        N-gram length (default 3).

    Returns
    -------
    dict[str, int]
        Frequency table of the n-grams; empty for texts shorter than *n*.
    """
    # Counter replaces the hand-rolled if/else counting; the generator is
    # inlined (same slices generate_ngrams produces) so this function stands
    # alone.  Wrapped in dict() to preserve the original plain-dict return type.
    return dict(Counter(text[i:i + n] for i in range(len(text) - n + 1)))

# Function to extract n-gram features
def extract_features(texts, n=3):
    """Build a sparse (len(texts), len(lang_ids)) matrix of n-gram counts.

    NOTE(review): reads the module-level ``lang_ids`` list that is only
    assigned inside the ``__main__`` section — calling this earlier raises
    NameError.
    NOTE(review): each column j holds the frequency of the *language-code
    string itself* (e.g. "de") among the text's n-grams.  Every lang_id is
    2 characters, so with the default n=3 no n-gram can ever match and the
    matrix is all zeros — presumably why this path was replaced by the
    CountVectorizer/MultinomialNB pipeline; confirm before reusing.
    """
    num_texts = len(texts)
    # lil_matrix: cheap incremental cell assignment for a sparse result.
    features = lil_matrix((num_texts, len(lang_ids)), dtype=int)
    for i, text in enumerate(texts):
        ngram_counts = count_ngrams(text, n)
        for j, lang_id in enumerate(lang_ids):
            # Count of the lang_id string among this text's n-grams (0 if absent).
            features[i, j] = ngram_counts.get(lang_id, 0)
    return features

if __name__ == "__main__":

    # TIRA REST client; presumably used to load the validation dataset in
    # the collapsed region that follows — confirm against the full file.
    tira = Client()
Expand All @@ -48,20 +21,19 @@ def extract_features(texts, n=3):
    # Define language IDs
    # Closed set of 20 language codes the classifier predicts.
    lang_ids = ["af", "az", "bg", "cs", "da", "de", "el", "en", "es", "fi", "fr", "hr", "it", "ko", "nl", "no", "pl", "ru", "ur", "zh"]

    # NOTE(review): this span interleaves both sides of a rendered diff.
    # The two lines below (n, features) are the OLD extract_features path;
    # their result is never used after `prediction = clf.predict(X)` further
    # down, so they only waste work — keep exactly one pipeline.
    n = 3  # Adjust n-gram size as needed
    features = extract_features(text_validation['text'], n)
    # NEW path: bag of character trigrams over the validation texts.
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(3, 3))
    X = vectorizer.fit_transform(text_validation['text'])

    # Train Naive Bayes classifier on the trigram counts.
    # NOTE(review): it fits and predicts on the same validation split, so any
    # reported accuracy is optimistic — confirm this is intended.
    clf = MultinomialNB()
    clf.fit(X, targets_validation['lang'])

    # OLD prediction loop (argmax over the extract_features matrix, which is
    # all zeros for n=3 — see extract_features); the `prediction` list it
    # builds is immediately overwritten by clf.predict below.
    prediction = []
    for i in tqdm(range(features.shape[0])):
        max_lang_index = features[i].todense().argmax(axis=1)
        max_lang = lang_ids[max_lang_index[0, 0]]
        prediction.append(max_lang)
    # NEW path: predict a language code for every validation text.
    prediction = clf.predict(X)

    # Create DataFrame for predictions: one (lang, id) row per text.
    # NOTE(review): `text_validation` / `targets_validation` are loaded in a
    # collapsed region not shown here — presumably via the tira client.
    prediction_df = pd.DataFrame({'lang': prediction, 'id': text_validation['id']})
    print(prediction_df)

    # saving the prediction
    output_directory = get_output_directory(str(Path(__file__).parent))
Expand Down
Loading

0 comments on commit 2cad496

Please sign in to comment.