297 lines
5.3 KiB
Plaintext

import numpy as np
import pandas as pd
df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\t')
df.head()
len(df)
from IPython.display import Markdown, display
display(Markdown('> '+df['review'][0]))
# Check for the existence of NaN values in a cell:
df.isnull().sum()
df.dropna(inplace=True)
len(df)
blanks = [] # start with an empty list
for i,lb,rv in df.itertuples(): # iterate over the DataFrame
if type(rv)==str: # avoid NaN values
if rv.isspace(): # test 'review' for whitespace
blanks.append(i) # add matching index numbers to the list
print(len(blanks), 'blanks: ', blanks)
df.drop(blanks, inplace=True)
len(df)
df['label'].value_counts()
from sklearn.model_selection import train_test_split
X = df['review']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
('clf', MultinomialNB()),
])
# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
('clf', LinearSVC()),
])
text_clf_nb.fit(X_train, y_train)
# Form a prediction set
predictions = text_clf_nb.predict(X_test)
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))
# Print a classification report
print(metrics.classification_report(y_test,predictions))
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))
text_clf_lsvc.fit(X_train, y_train)
# Form a prediction set
predictions = text_clf_lsvc.predict(X_test)
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))
# Print a classification report
print(metrics.classification_report(y_test,predictions))
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))
stopwords = ['a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', \
'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', \
'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', \
'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']
# YOU DO NOT NEED TO RUN THIS CELL UNLESS YOU HAVE
# RECENTLY OPENED THIS NOTEBOOK OR RESTARTED THE KERNEL:
import numpy as np
import pandas as pd
df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\t')
df.dropna(inplace=True)
blanks = []
for i,lb,rv in df.itertuples():
if type(rv)==str:
if rv.isspace():
blanks.append(i)
df.drop(blanks, inplace=True)
from sklearn.model_selection import train_test_split
X = df['review']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics
# RUN THIS CELL TO ADD STOPWORDS TO THE LINEAR SVC PIPELINE:
text_clf_lsvc2 = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopwords)),
('clf', LinearSVC()),
])
text_clf_lsvc2.fit(X_train, y_train)
predictions = text_clf_lsvc2.predict(X_test)
print(metrics.confusion_matrix(y_test,predictions))
print(metrics.classification_report(y_test,predictions))
print(metrics.accuracy_score(y_test,predictions))
# YOU DO NOT NEED TO RUN THIS CELL UNLESS YOU HAVE
# RECENTLY OPENED THIS NOTEBOOK OR RESTARTED THE KERNEL:
import numpy as np
import pandas as pd
df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\t')
df.dropna(inplace=True)
blanks = []
for i,lb,rv in df.itertuples():
if type(rv)==str:
if rv.isspace():
blanks.append(i)
df.drop(blanks, inplace=True)
from sklearn.model_selection import train_test_split
X = df['review']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics
# Naïve Bayes Model:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
('clf', MultinomialNB()),
])
# Linear SVC Model:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
('clf', LinearSVC()),
])
# Train both models on the moviereviews.tsv training set:
text_clf_nb.fit(X_train, y_train)
text_clf_lsvc.fit(X_train, y_train)
myreview = "A movie I really wanted to love was terrible. \
I'm sure the producers had the best intentions, but the execution was lacking."
# Use this space to write your own review. Experiment with different lengths and writing styles.
myreview =
print(text_clf_nb.predict([myreview])) # be sure to put "myreview" inside square brackets
print(text_clf_lsvc.predict([myreview]))