Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

created a better model #78

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 58 additions & 49 deletions projects/5-sentiment-analysis/train.py
Original file line number Diff line number Diff line change
@@ -1,62 +1,71 @@
# This is a sentiment classifier on a fairly small dataset.
# This code adds the incredibly useful scikit learn package
# to your toolkit, which is especially useful for processing text data.
#
# This uses a "naive bayes" classifier instead of a neural net.
# You can add keras to do the classification as an additional challenge
# but the goal here is to improve the 66% validation accuracy to above
# 68%. One approach is to use TfidfVectorizer instead of CountVectorizer
# and then SGDClassifier instead of naive bayes ("MultinomialNB"), but there
# are many other ways.
#
# Check out examples/scikit for inspiration.

import pandas as pd
import numpy as np
import wandb

wandb.init()
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

# Get a pandas DataFrame object of all the data in the csv file:
df = pd.read_csv('tweets.csv')
text = pd.read_csv('tweets.csv')
text.columns = ['Tweets', 'device', 'Emotion']
text = text.dropna(how='any')

# Get pandas Series object of the "tweet text" column:
text = df['tweet_text']
# Matches every character that is not an ASCII letter.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')

# Lazily-filled cache so the stop-word corpus is read only once per process.
_STOPWORD_CACHE = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        _STOPWORD_CACHE = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE


def cleantext(text):
    """Normalize one tweet into a space-joined string of stemmed tokens.

    The input is converted with ``str()``, lower-cased, and every character
    that is not an ASCII letter is replaced by a space. The result is split
    on whitespace, English stop words are dropped, and each remaining word is
    stemmed with the module-level Porter stemmer ``ps``.

    Args:
        text: The raw tweet; any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        The cleaned tokens joined by single spaces (an empty string when no
        token survives).
    """
    # NOTE: the earlier version also applied '<.*?>+', ' +' and '\n'
    # substitutions *after* this step. Once only letters and spaces remain
    # those patterns can never change the token list, so they were dropped;
    # the output is unchanged.
    letters_only = _NON_LETTER_RE.sub(' ', str(text).lower())

    # Look the stop words up once per call (and load them once per process)
    # instead of re-reading the corpus for every single word.
    stop_words = _english_stopwords()

    return ' '.join(
        ps.stem(word)
        for word in letters_only.split()
        if word not in stop_words
    )

# Get pandas Series object of the "emotion" column:
target = df['is_there_an_emotion_directed_at_a_brand_or_product']
text['Tweets'] = text['Tweets'].apply(cleantext)

# Remove the blank rows from the series:
target = target[pd.notnull(text)]
text = text[pd.notnull(text)]
x = text['Tweets']
y = text['Emotion']

# Perform feature extraction
# Try changing this!
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
count_vect = CountVectorizer()
count_vect.fit(text)
counts = count_vect.transform(text)

# Train with this data with a Naive Bayes classifier:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
clf = MultinomialNB()
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit_transform(y)

# (Tweets 0 to 5999 are used for training data)
clf.fit(counts[0:6000], target[0:6000])
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# See what the classifier predicts for some new tweets:
# (Tweets 6000 to 9091 are used for testing)
predictions = clf.predict(counts[6000:9092])
correct_predictions = sum(predictions == target[6000:9092])
incorrect_predictions = (9092 - 6000) - correct_predictions

train_predictions = clf.predict(counts[0:6000])
train_correct_predictions = sum(train_predictions == target[0:6000])
train_incorrect_predictions = 6000 - train_correct_predictions
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
def _tfidf_pipeline(classifier):
    """Build a bag-of-words -> TF-IDF -> classifier pipeline.

    Each call creates fresh CountVectorizer and TfidfTransformer instances so
    the returned pipelines do not share any fitted state.
    """
    steps = [
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('model', classifier),
    ]
    return Pipeline(steps)


# Three candidate models that differ only in the final estimator.
Model_1 = _tfidf_pipeline(MultinomialNB())
Model_2 = _tfidf_pipeline(LogisticRegression())
Model_3 = _tfidf_pipeline(SVC())

train_accuracy = train_correct_predictions/(train_correct_predictions+train_incorrect_predictions)
val_accuracy = correct_predictions/(correct_predictions+incorrect_predictions)
Model_1.fit(x_train, y_train)
Model_2.fit(x_train, y_train)
Model_3.fit(x_train, y_train)

wandb.log({"val_accuracy": val_accuracy, "train_accuracy": train_accuracy})
y_pred_1 = Model_1.predict(x_test)
y_pred_2 = Model_2.predict(x_test)
y_pred_3 = Model_3.predict(x_test)

print('Naive Bayes:', accuracy_score(y_test, y_pred_1))
print('Logistic Regression:', accuracy_score(y_test, y_pred_2))
print('SVC:',accuracy_score(y_test, y_pred_3))