import numpy as np
import pandas as pd

# Load the tab-separated movie review dataset:
df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\t')
df.head()

len(df)

# Preview the first review, rendered as a Markdown blockquote:
from IPython.display import Markdown, display
display(Markdown('> '+df['review'][0]))

# Check for the existence of NaN values in a cell:
df.isnull().sum()

# Drop the rows that contain NaN values:
df.dropna(inplace=True)
len(df)

# Detect reviews that consist only of whitespace:
blanks = []  # start with an empty list

for i,lb,rv in df.itertuples():  # iterate over the DataFrame as (index, label, review)
    if type(rv)==str:            # avoid NaN values
        if rv.isspace():         # test 'review' for whitespace
            blanks.append(i)     # add matching index numbers to the list

print(len(blanks), 'blanks: ', blanks)

# Remove the whitespace-only reviews:
df.drop(blanks, inplace=True)
len(df)
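
As a side note, the same whitespace check can be done without the explicit loop; the sketch below uses pandas string methods and assumes the 'review' column contains only strings once the NaN rows are gone.

# Vectorized equivalent (sketch) of the loop above:
blank_idx = df[df['review'].str.isspace()].index
# df.drop(blank_idx, inplace=True)   # would mirror df.drop(blanks, inplace=True)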

# Check the balance of the sentiment labels:
df['label'].value_counts()

# Split the data into training and test sets:
from sklearn.model_selection import train_test_split

X = df['review']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
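
A quick sanity check on the split can be useful at this point; the optional sketch below only prints the partition sizes and label proportions and is not used by anything that follows.

# Optional sanity check (sketch):
print(len(X_train), len(X_test))
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))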

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
                       ])

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                          ('clf', LinearSVC()),
                         ])
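
Since both pipelines expose their settings through the usual step__parameter naming, they could also be tuned with a grid search; the parameter values below are illustrative assumptions, not settings used elsewhere in this notebook.

# Optional tuning sketch (illustrative grid, not required for the steps below):
from sklearn.model_selection import GridSearchCV

param_grid = {'tfidf__ngram_range': [(1,1), (1,2)],   # unigrams vs. unigrams+bigrams
              'clf__alpha': [0.1, 1.0]}               # MultinomialNB smoothing strength
grid = GridSearchCV(text_clf_nb, param_grid, cv=5)
# grid.fit(X_train, y_train); grid.best_params_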

# Train the Naïve Bayes pipeline:
text_clf_nb.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf_nb.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

# Print a classification report
print(metrics.classification_report(y_test,predictions))

# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))
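
The raw confusion matrix prints without row or column labels; the optional sketch below simply wraps it in a DataFrame indexed by the fitted class names so it is easier to read.

# Optional (sketch): label the confusion matrix with the class names
labels = text_clf_nb.classes_
print(pd.DataFrame(metrics.confusion_matrix(y_test, predictions, labels=labels),
                   index=labels, columns=labels))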

# Train the Linear SVC pipeline:
text_clf_lsvc.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf_lsvc.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

# Print a classification report
print(metrics.classification_report(y_test,predictions))

# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))
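
A single train/test split provides only one estimate of performance; the optional sketch below runs 5-fold cross-validation over the full cleaned data for a more stable read (the fold count is an arbitrary choice).

# Optional (sketch): 5-fold cross-validation of the Linear SVC pipeline
from sklearn.model_selection import cross_val_score

scores = cross_val_score(text_clf_lsvc, X, y, cv=5)
print(scores.mean(), scores.std())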

stopwords = ['a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can',
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his',
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or',
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this',
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']
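
For comparison, scikit-learn ships its own, larger English stop word list; the sketch below shows how to inspect it or plug it in with stop_words='english' instead of the hand-written list above.

# Optional (sketch): scikit-learn's built-in English stop word list
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(len(ENGLISH_STOP_WORDS))              # noticeably longer than the custom list above
# TfidfVectorizer(stop_words='english')     # would use the built-in list instead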

# YOU DO NOT NEED TO RUN THIS CELL UNLESS YOU HAVE
# RECENTLY OPENED THIS NOTEBOOK OR RESTARTED THE KERNEL:

import numpy as np
import pandas as pd

df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\t')
df.dropna(inplace=True)
blanks = []
for i,lb,rv in df.itertuples():
    if type(rv)==str:
        if rv.isspace():
            blanks.append(i)
df.drop(blanks, inplace=True)

from sklearn.model_selection import train_test_split
X = df['review']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics


# RUN THIS CELL TO ADD STOPWORDS TO THE LINEAR SVC PIPELINE:
text_clf_lsvc2 = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopwords)),
                           ('clf', LinearSVC()),
                          ])
text_clf_lsvc2.fit(X_train, y_train)

predictions = text_clf_lsvc2.predict(X_test)
print(metrics.confusion_matrix(y_test,predictions))

print(metrics.classification_report(y_test,predictions))

print(metrics.accuracy_score(y_test,predictions))
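
It can also be interesting to see how large a vocabulary the stop-word-filtered vectorizer actually keeps; this optional sketch reads it straight off the fitted pipeline.

# Optional (sketch): vocabulary size after stop word removal
print(len(text_clf_lsvc2.named_steps['tfidf'].vocabulary_))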

# YOU DO NOT NEED TO RUN THIS CELL UNLESS YOU HAVE
# RECENTLY OPENED THIS NOTEBOOK OR RESTARTED THE KERNEL:

import numpy as np
import pandas as pd

df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\t')
df.dropna(inplace=True)
blanks = []
for i,lb,rv in df.itertuples():
    if type(rv)==str:
        if rv.isspace():
            blanks.append(i)
df.drop(blanks, inplace=True)

from sklearn.model_selection import train_test_split
X = df['review']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics

# Naïve Bayes Model:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
                       ])

# Linear SVC Model:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                          ('clf', LinearSVC()),
                         ])

# Train both models on the moviereviews.tsv training set:
text_clf_nb.fit(X_train, y_train)
text_clf_lsvc.fit(X_train, y_train)

# Next, feed a new, unseen review to the trained models:
myreview = "A movie I really wanted to love was terrible. \
I'm sure the producers had the best intentions, but the execution was lacking."

# Use this space to write your own review. Experiment with different lengths and writing styles.
myreview =

print(text_clf_nb.predict([myreview]))     # be sure to put "myreview" inside square brackets

print(text_clf_lsvc.predict([myreview]))
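
LinearSVC has no predict_proba, but its decision_function returns the signed distance from the separating hyperplane, which can serve as a rough confidence score; a short sketch:

# Optional (sketch): signed distance from the Linear SVC decision boundary
# (larger magnitude suggests the model is more certain about its call)
print(text_clf_lsvc.decision_function([myreview]))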