%%writefile 1.txt
This is a story about cats
our feline pets
Cats are furry animals

%%writefile 2.txt
This story is about surfing
Catching waves is fun
Surfing is a popular water sport

# Build a vocabulary that maps each unique word to a number, starting with 1.txt:
vocab = {}
i = 1

with open('1.txt') as f:
    x = f.read().lower().split()

for word in x:
    if word not in vocab:
        vocab[word] = i
        i += 1

print(vocab)

# Extend the same vocabulary with any new words found in 2.txt:
with open('2.txt') as f:
    x = f.read().lower().split()

for word in x:
    if word not in vocab:
        vocab[word] = i
        i += 1

print(vocab)

# Create an empty vector with space for each word in the vocabulary
# (slot 0 holds the document name; the word counts start at index 1):
one = ['1.txt'] + [0]*len(vocab)
one

# Map the frequency of each word in 1.txt onto our vector:
with open('1.txt') as f:
    x = f.read().lower().split()

for word in x:
    one[vocab[word]] += 1

one

# Do the same for the second document:
two = ['2.txt'] + [0]*len(vocab)

with open('2.txt') as f:
    x = f.read().lower().split()

for word in x:
    two[vocab[word]] += 1

# Compare the two vectors:
print(f'{one}\n{two}')
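
The raw counts show which words the two documents share. A common single-number comparison for bag-of-words vectors is cosine similarity; the sketch below is an addition for illustration (not part of the original lesson) and uses only the two vectors built above:

import math

# Skip the document-name slot at index 0 and keep only the counts:
v1, v2 = one[1:], two[1:]

dot = sum(a*b for a, b in zip(v1, v2))
norm1 = math.sqrt(sum(a*a for a in v1))
norm2 = math.sqrt(sum(b*b for b in v2))

# 1.0 means identical direction, 0.0 means no shared vocabulary:
print(dot / (norm1 * norm2))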

# Perform imports and load the SMS spam collection dataset:
import numpy as np
import pandas as pd

df = pd.read_csv('../TextFiles/smsspamcollection.tsv', sep='\t')
df.head()

# Check for missing values:
df.isnull().sum()

# Check how many messages are labeled ham vs. spam:
df['label'].value_counts()

# Split the data into training and test sets:
from sklearn.model_selection import train_test_split

X = df['message']  # this time we want to look at the text
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# CountVectorizer builds the vocabulary and converts the text to
# word-count vectors in a single fit_transform step:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape
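
The result is a sparse matrix with one row per message and one column per vocabulary term. As a quick illustration (added here, not in the original lesson), the fitted vectorizer exposes a vocabulary_ dict mapping each term to its column index:

# Look up the columns assigned to a couple of terms; exact indices
# depend on the fitted vocabulary:
for term in ['free', 'txt']:
    if term in count_vect.vocabulary_:
        print(term, '-> column', count_vect.vocabulary_[term])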

# TfidfTransformer re-weights the counts, downweighting terms that
# appear in many documents:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape
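
To make the weighting concrete, here is a minimal sketch of what TfidfTransformer computes with its defaults (smooth_idf=True, norm='l2', sublinear_tf=False), applied to a tiny hand-built count matrix; the numbers are illustrative, not drawn from the dataset:

import numpy as np

counts = np.array([[3, 0, 1],
                   [2, 0, 0],
                   [0, 1, 1]], dtype=float)   # rows = documents, columns = terms
n_docs = counts.shape[0]

doc_freq = (counts > 0).sum(axis=0)              # how many documents contain each term
idf = np.log((1 + n_docs) / (1 + doc_freq)) + 1  # sklearn's smoothed idf
tfidf = counts * idf                             # raw term frequency times idf
tfidf /= np.linalg.norm(tfidf, axis=1, keepdims=True)  # L2-normalize each row
print(tfidf)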

# TfidfVectorizer combines CountVectorizer and TfidfTransformer into one step:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)  # remember to use the original X_train set
X_train_tfidf.shape
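
As a sanity check (an addition, not part of the original lesson), the one-step vectorizer should reproduce the two-step CountVectorizer + TfidfTransformer result on the same data:

# Transform the training text both ways and compare; the largest absolute
# difference should be zero, or at worst near machine epsilon:
two_step = tfidf_transformer.transform(count_vect.transform(X_train))
one_step = vectorizer.transform(X_train)
print(abs(two_step - one_step).max())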

# Train a Linear Support Vector Classifier on the TF-IDF features:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)
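
One caveat worth showing here (a sketch added for illustration): to evaluate this classifier directly, the test messages must pass through the same fitted vectorizer, using transform rather than fit_transform, so the feature columns line up with what the model saw during training:

X_test_tfidf = vectorizer.transform(X_test)  # transform only; never refit on test data
print(clf.score(X_test_tfidf, y_test))       # mean accuracy on the test set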

# A Pipeline chains the vectorizer and classifier into a single estimator:
from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

# Feed the training data through the pipeline:
text_clf.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predictions))

# Print a classification report
print(metrics.classification_report(y_test, predictions))

# Print the overall accuracy
print(metrics.accuracy_score(y_test, predictions))
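
Finally, a quick sketch of using the fitted pipeline on brand-new text. These example messages are invented for illustration; a reasonably trained model should label the first spam and the second ham:

# The pipeline vectorizes internally, so raw strings go straight in:
print(text_clf.predict(["Congratulations! You've won a free prize, text WIN to claim it"]))
print(text_clf.predict(["Hey, are we still meeting for lunch today?"]))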