"""Classify movie reviews with TF-IDF features and two linear text models.

The script reads a tab-separated file with ``label`` and ``review`` columns,
removes rows that contain NaN values or whitespace-only reviews, splits the
data into training and test sets, and then fits and evaluates:

* a Multinomial Naive Bayes pipeline,
* a Linear SVC pipeline,
* a Linear SVC pipeline whose vectorizer ignores a custom stopword list.

Finally, the first two fitted pipelines are used to classify a free-text
review.
"""
import numpy as np
import pandas as pd

# Location of the tab-separated review data, relative to the working directory.
DATA_PATH = '../TextFiles/moviereviews.tsv'

# Words the stopword-aware pipeline drops before computing TF-IDF weights.
stopwords = [
    'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by',
    'can', 'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he',
    'her', 'hers', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its',
    'just', 'me', 'my', 'of', 'on', 'or', 'see', 'seen', 'she', 'so', 'than',
    'that', 'the', 'their', 'there', 'they', 'this', 'to', 'was', 'we',
    'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you',
]


def find_blank_reviews(frame: pd.DataFrame, column: str = 'review') -> list:
    """Return the index labels of rows whose *column* is whitespace-only.

    Non-string cells (for example NaN) are ignored rather than treated as
    blank. Only *column* is inspected, so the frame may carry any number of
    other columns.

    Args:
        frame: DataFrame to inspect.
        column: Name of the text column to test.

    Returns:
        Index labels, in row order, suitable for ``frame.drop(...)``.
    """
    return [
        idx
        for idx, text in frame[column].items()
        if isinstance(text, str) and text.isspace()
    ]


def make_text_classifier(classifier, stop_words=None):
    """Build a ``tfidf`` -> ``clf`` pipeline around *classifier*.

    Args:
        classifier: Unfitted scikit-learn estimator used as the final step.
        stop_words: Passed through to ``TfidfVectorizer``; ``None`` keeps the
            vectorizer's default of not filtering any words.

    Returns:
        An unfitted ``sklearn.pipeline.Pipeline``.
    """
    # Imported here so that the pandas-only helper above stays importable
    # in environments without scikit-learn.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import Pipeline

    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words=stop_words)),
        ('clf', classifier),
    ])


def print_evaluation(y_true, y_pred):
    """Print the confusion matrix, classification report and accuracy."""
    from sklearn import metrics

    print(metrics.confusion_matrix(y_true, y_pred))
    print(metrics.classification_report(y_true, y_pred))
    print(metrics.accuracy_score(y_true, y_pred))


if __name__ == "__main__":
    from IPython.display import Markdown, display
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.svm import LinearSVC

    # --- Load and clean the data -------------------------------------------
    df = pd.read_csv(DATA_PATH, sep='\t')

    # Show the first review as a Markdown block quote.
    display(Markdown('> ' + df['review'][0]))

    # Drop rows with NaN cells, then rows whose review is only whitespace.
    df.dropna(inplace=True)
    blanks = find_blank_reviews(df)
    print(len(blanks), 'blanks: ', blanks)
    df.drop(blanks, inplace=True)

    # --- Train/test split --------------------------------------------------
    X = df['review']
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

    # --- Naïve Bayes -------------------------------------------------------
    text_clf_nb = make_text_classifier(MultinomialNB())
    text_clf_nb.fit(X_train, y_train)
    predictions = text_clf_nb.predict(X_test)
    print_evaluation(y_test, predictions)

    # --- Linear SVC --------------------------------------------------------
    text_clf_lsvc = make_text_classifier(LinearSVC())
    text_clf_lsvc.fit(X_train, y_train)
    predictions = text_clf_lsvc.predict(X_test)
    print_evaluation(y_test, predictions)

    # --- Linear SVC with the custom stopword list --------------------------
    text_clf_lsvc2 = make_text_classifier(LinearSVC(), stop_words=stopwords)
    text_clf_lsvc2.fit(X_train, y_train)
    predictions = text_clf_lsvc2.predict(X_test)
    print_evaluation(y_test, predictions)

    # --- Classify a hand-written review ------------------------------------
    # Use this space to write your own review. Experiment with different
    # lengths and writing styles by replacing the text below.
    myreview = (
        "A movie I really wanted to love was terrible. "
        "I'm sure the producers had the best intentions, "
        "but the execution was lacking."
    )

    # predict() expects an iterable of documents, so be sure to put
    # "myreview" inside square brackets.
    print(text_clf_nb.predict([myreview]))
    print(text_clf_lsvc.predict([myreview]))