import numpy as np
import pandas as pd

# Load the tab-separated movie review dataset:
df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\t')
df.head()

len(df)

# Preview the first review, rendered as a Markdown blockquote:
from IPython.display import Markdown, display
display(Markdown('> '+df['review'][0]))

# Check for the existence of NaN values in a cell:
df.isnull().sum()

# Drop the rows that contain NaN values:
df.dropna(inplace=True)
len(df)

# Detect reviews that consist only of whitespace:
blanks = []  # start with an empty list

for i,lb,rv in df.itertuples():  # iterate over the DataFrame as (index, label, review)
    if type(rv)==str:            # avoid NaN values
        if rv.isspace():         # test 'review' for whitespace
            blanks.append(i)     # add matching index numbers to the list

print(len(blanks), 'blanks: ', blanks)

# Remove the whitespace-only reviews:
df.drop(blanks, inplace=True)
len(df)
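
As a side note, the same whitespace check can be done without the explicit loop; the sketch below uses pandas string methods and assumes the 'review' column contains only strings once the NaN rows are gone.

# Vectorized equivalent (sketch) of the loop above:
blank_idx = df[df['review'].str.isspace()].index
# df.drop(blank_idx, inplace=True)   # would mirror df.drop(blanks, inplace=True)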

# Check the balance of the sentiment labels:
df['label'].value_counts()

# Split the data into training and test sets:
from sklearn.model_selection import train_test_split

X = df['review']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
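
A quick sanity check on the split can be useful at this point; the optional sketch below only prints the partition sizes and label proportions and is not used by anything that follows.

# Optional sanity check (sketch):
print(len(X_train), len(X_test))
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))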

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
                       ])

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                          ('clf', LinearSVC()),
                         ])
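
Since both pipelines expose their settings through the usual step__parameter naming, they could also be tuned with a grid search; the parameter values below are illustrative assumptions, not settings used elsewhere in this notebook.

# Optional tuning sketch (illustrative grid, not required for the steps below):
from sklearn.model_selection import GridSearchCV

param_grid = {'tfidf__ngram_range': [(1,1), (1,2)],   # unigrams vs. unigrams+bigrams
              'clf__alpha': [0.1, 1.0]}               # MultinomialNB smoothing strength
grid = GridSearchCV(text_clf_nb, param_grid, cv=5)
# grid.fit(X_train, y_train); grid.best_params_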

# Train the Naïve Bayes pipeline:
text_clf_nb.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf_nb.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

# Print a classification report
print(metrics.classification_report(y_test,predictions))

# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))
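
The raw confusion matrix prints without row or column labels; the optional sketch below simply wraps it in a DataFrame indexed by the fitted class names so it is easier to read.

# Optional (sketch): label the confusion matrix with the class names
labels = text_clf_nb.classes_
print(pd.DataFrame(metrics.confusion_matrix(y_test, predictions, labels=labels),
                   index=labels, columns=labels))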

# Train the Linear SVC pipeline:
text_clf_lsvc.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf_lsvc.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

# Print a classification report
print(metrics.classification_report(y_test,predictions))

# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))
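
A single train/test split provides only one estimate of performance; the optional sketch below runs 5-fold cross-validation over the full cleaned data for a more stable read (the fold count is an arbitrary choice).

# Optional (sketch): 5-fold cross-validation of the Linear SVC pipeline
from sklearn.model_selection import cross_val_score

scores = cross_val_score(text_clf_lsvc, X, y, cv=5)
print(scores.mean(), scores.std())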

stopwords = ['a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can',
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his',
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or',
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this',
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']
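
For comparison, scikit-learn ships its own, larger English stop word list; the sketch below shows how to inspect it or plug it in with stop_words='english' instead of the hand-written list above.

# Optional (sketch): scikit-learn's built-in English stop word list
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
print(len(ENGLISH_STOP_WORDS))              # noticeably longer than the custom list above
# TfidfVectorizer(stop_words='english')     # would use the built-in list instead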

# YOU DO NOT NEED TO RUN THIS CELL UNLESS YOU HAVE
# RECENTLY OPENED THIS NOTEBOOK OR RESTARTED THE KERNEL:

import numpy as np
import pandas as pd

df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\t')
df.dropna(inplace=True)
blanks = []
for i,lb,rv in df.itertuples():
    if type(rv)==str:
        if rv.isspace():
            blanks.append(i)
df.drop(blanks, inplace=True)

from sklearn.model_selection import train_test_split
X = df['review']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics


# RUN THIS CELL TO ADD STOPWORDS TO THE LINEAR SVC PIPELINE:
text_clf_lsvc2 = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopwords)),
                           ('clf', LinearSVC()),
                          ])
text_clf_lsvc2.fit(X_train, y_train)

predictions = text_clf_lsvc2.predict(X_test)
print(metrics.confusion_matrix(y_test,predictions))

print(metrics.classification_report(y_test,predictions))

print(metrics.accuracy_score(y_test,predictions))
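
It can also be interesting to see how large a vocabulary the stop-word-filtered vectorizer actually keeps; this optional sketch reads it straight off the fitted pipeline.

# Optional (sketch): vocabulary size after stop word removal
print(len(text_clf_lsvc2.named_steps['tfidf'].vocabulary_))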

# YOU DO NOT NEED TO RUN THIS CELL UNLESS YOU HAVE
# RECENTLY OPENED THIS NOTEBOOK OR RESTARTED THE KERNEL:

import numpy as np
import pandas as pd

df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\t')
df.dropna(inplace=True)
blanks = []
for i,lb,rv in df.itertuples():
    if type(rv)==str:
        if rv.isspace():
            blanks.append(i)
df.drop(blanks, inplace=True)

from sklearn.model_selection import train_test_split
X = df['review']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics

# Naïve Bayes Model:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
                       ])

# Linear SVC Model:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                          ('clf', LinearSVC()),
                         ])

# Train both models on the moviereviews.tsv training set:
text_clf_nb.fit(X_train, y_train)
text_clf_lsvc.fit(X_train, y_train)

# Next, feed a new, unseen review to the trained models:
myreview = "A movie I really wanted to love was terrible. \
I'm sure the producers had the best intentions, but the execution was lacking."

# Use this space to write your own review. Experiment with different lengths and writing styles.
myreview =

print(text_clf_nb.predict([myreview]))     # be sure to put "myreview" inside square brackets

print(text_clf_lsvc.predict([myreview]))
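
LinearSVC has no predict_proba, but its decision_function returns the signed distance from the separating hyperplane, which can serve as a rough confidence score; a short sketch:

# Optional (sketch): signed distance from the Linear SVC decision boundary
# (larger magnitude suggests the model is more certain about its call)
print(text_clf_lsvc.decision_function([myreview]))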