From 7a1e4f3a11fb42fc749249e04e905613980edb46 Mon Sep 17 00:00:00 2001
From: Pradeep18102003
Date: Thu, 27 Jun 2024 15:10:03 +0530
Subject: [PATCH] created a better model

---
 projects/5-sentiment-analysis/train.py | 117 ++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 68 insertions(+), 49 deletions(-)

diff --git a/projects/5-sentiment-analysis/train.py b/projects/5-sentiment-analysis/train.py
index 0714a3810..29bc2614c 100644
--- a/projects/5-sentiment-analysis/train.py
+++ b/projects/5-sentiment-analysis/train.py
@@ -1,62 +1,81 @@
-# This is a sentiment classifier on a fairly small dataset.
-# This code adds the incredibly useful scikit learn package
-# to your toolkit, which is especially useful for processing text data.
-#
-# This uses a "naive bayes" classifier instead of a neural net.
-# You can add keras to do the classification as an additional challenge
-# but the goal here is to improve the 66% validation accuracy to above
-# 68%. One approach is to use TfidfVectorizer instead of CountVectorizer
-# and then SGDClassifier instead of naive bayes ("MultinomialNB", but there
-# are many other ways.
-#
-# Check out examples/scikit for inspiration.
-
 import pandas as pd
 import numpy as np
-import wandb
-
-wandb.init()
+import re
+import nltk
+from nltk.corpus import stopwords
+from nltk.stem.porter import PorterStemmer
 
+# Fetch the NLTK stopword list up front; this is a no-op if already cached.
+nltk.download('stopwords', quiet=True)
+ps = PorterStemmer()
+stop_words = set(stopwords.words('english'))  # build the set once, not per token
 
-# Get a pandas DataFrame object of all the data in the csv file:
-df = pd.read_csv('tweets.csv')
+text = pd.read_csv('tweets.csv')
+text.columns = ['Tweets', 'device', 'Emotion']
+text = text.dropna(how='any')
 
-# Get pandas Series object of the "tweet text" column:
-text = df['tweet_text']
+def cleantext(text):
+    text = str(text).lower()
+    #text = re.sub(r'https?://\S+|www\.\S+', '', text)
+    text = re.sub(r'<.*?>+', ' ', text)     # strip HTML tags before dropping punctuation
+    text = re.sub(r'[^a-zA-Z]', ' ', text)  # keep letters only (this also removes newlines)
+    text = re.sub(r' +', ' ', text)
+    text = text.split()
+    text = [ps.stem(word) for word in text if word not in stop_words]
+    text = ' '.join(text)
+    return text
 
-# Get pandas Series object of the "emotion" column:
-target = df['is_there_an_emotion_directed_at_a_brand_or_product']
+text['Tweets'] = text['Tweets'].apply(cleantext)
 
-# Remove the blank rows from the series:
-target = target[pd.notnull(text)]
-text = text[pd.notnull(text)]
+x = text['Tweets']
+y = text['Emotion']
 
-# Perform feature extraction
-# Try changing this!
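+# Modeling: encode the labels, hold out 20% of the tweets for testing, and
+# compare three classifiers built on the same bag-of-words + tf-idf features.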
-from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
-count_vect = CountVectorizer()
-count_vect.fit(text)
-counts = count_vect.transform(text)
 
-# Train with this data with a Naive Bayes classifier:
-from sklearn.naive_bayes import MultinomialNB
-from sklearn.linear_model import SGDClassifier
-clf = MultinomialNB()
+from sklearn.preprocessing import LabelEncoder
+le = LabelEncoder()
+y = le.fit_transform(y)  # keep the result; fit_transform returns the encoded labels
 
-# (Tweets 0 to 5999 are used for training data)
-clf.fit(counts[0:6000], target[0:6000])
+from sklearn.model_selection import train_test_split
+x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
 
-# See what the classifier predicts for some new tweets:
-# (Tweets 6000 to 9092 are used for testing)
-predictions = clf.predict(counts[6000:9092])
-correct_predictions = sum(predictions == target[6000:9092])
-incorrect_predictions = (9092 - 6000) - correct_predictions
-
-train_predictions = clf.predict(counts[0:6000])
-train_correct_predictions = sum(train_predictions == target[0:6000])
-train_incorrect_predictions = 6000 - train_correct_predictions
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.svm import SVC
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score
+from sklearn.pipeline import Pipeline
+# Three candidate classifiers over identical CountVectorizer + tf-idf features:
+Model_1 = Pipeline([
+    ('vectorizer', CountVectorizer()),
+    ('tfidf', TfidfTransformer()),
+    ('model', MultinomialNB())
+])
+Model_2 = Pipeline([
+    ('vectorizer', CountVectorizer()),
+    ('tfidf', TfidfTransformer()),
+    ('model', LogisticRegression())
+])
+Model_3 = Pipeline([
+    ('vectorizer', CountVectorizer()),
+    ('tfidf', TfidfTransformer()),
+    ('model', SVC())
+])
 
-train_accuracy = train_correct_predictions/(train_correct_predictions+train_incorrect_predictions)
-val_accuracy = correct_predictions/(correct_predictions+incorrect_predictions)
+Model_1.fit(x_train, y_train)
+Model_2.fit(x_train, y_train)
+Model_3.fit(x_train, y_train)
 
-wandb.log({"val_accuracy": val_accuracy, "train_accuracy": train_accuracy})
+y_pred_1 = Model_1.predict(x_test)
+y_pred_2 = Model_2.predict(x_test)
+y_pred_3 = Model_3.predict(x_test)
+print('Naive Bayes:', accuracy_score(y_test, y_pred_1))
+print('Logistic Regression:', accuracy_score(y_test, y_pred_2))
+print('SVC:', accuracy_score(y_test, y_pred_3))
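+
+# A sketch of per-class metrics: accuracy alone can hide weak classes when the
+# label distribution is uneven (Model_1 is just an example pick here).
+from sklearn.metrics import classification_report
+print(classification_report(y_test, y_pred_1, target_names=le.classes_))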