%%writefile 1.txt
This is a story about cats
our feline pets
Cats are furry animals

%%writefile 2.txt
This story is about surfing
Catching waves is fun
Surfing is a popular water sport

# Build a vocabulary that maps each unique word to a number, starting with 1.txt:
vocab = {}
i = 1

with open('1.txt') as f:
    x = f.read().lower().split()

for word in x:
    if word not in vocab:
        vocab[word] = i
        i += 1

print(vocab)

# Extend the same vocabulary with any new words found in 2.txt:
with open('2.txt') as f:
    x = f.read().lower().split()

for word in x:
    if word not in vocab:
        vocab[word] = i
        i += 1

print(vocab)

# Create an empty vector with space for each word in the vocabulary
# (slot 0 holds the document name; the word counts start at index 1):
one = ['1.txt'] + [0]*len(vocab)
one

# Map the frequency of each word in 1.txt onto our vector:
with open('1.txt') as f:
    x = f.read().lower().split()

for word in x:
    one[vocab[word]] += 1

one

# Do the same for the second document:
two = ['2.txt'] + [0]*len(vocab)

with open('2.txt') as f:
    x = f.read().lower().split()

for word in x:
    two[vocab[word]] += 1

# Compare the two vectors:
print(f'{one}\n{two}')
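
The raw counts show which words the two documents share. A common single-number comparison for bag-of-words vectors is cosine similarity; the sketch below is an addition for illustration (not part of the original lesson) and uses only the two vectors built above:

import math

# Skip the document-name slot at index 0 and keep only the counts:
v1, v2 = one[1:], two[1:]

dot = sum(a*b for a, b in zip(v1, v2))
norm1 = math.sqrt(sum(a*a for a in v1))
norm2 = math.sqrt(sum(b*b for b in v2))

# 1.0 means identical direction, 0.0 means no shared vocabulary:
print(dot / (norm1 * norm2))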

# Perform imports and load the SMS spam collection dataset:
import numpy as np
import pandas as pd

df = pd.read_csv('../TextFiles/smsspamcollection.tsv', sep='\t')
df.head()

# Check for missing values:
df.isnull().sum()

# Check how many messages are labeled ham vs. spam:
df['label'].value_counts()

# Split the data into training and test sets:
from sklearn.model_selection import train_test_split

X = df['message']  # this time we want to look at the text
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# CountVectorizer builds the vocabulary and converts the text to
# word-count vectors in a single fit_transform step:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape
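
The result is a sparse matrix with one row per message and one column per vocabulary term. As a quick illustration (added here, not in the original lesson), the fitted vectorizer exposes a vocabulary_ dict mapping each term to its column index:

# Look up the columns assigned to a couple of terms; exact indices
# depend on the fitted vocabulary:
for term in ['free', 'txt']:
    if term in count_vect.vocabulary_:
        print(term, '-> column', count_vect.vocabulary_[term])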

# TfidfTransformer re-weights the counts, downweighting terms that
# appear in many documents:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape
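
To make the weighting concrete, here is a minimal sketch of what TfidfTransformer computes with its defaults (smooth_idf=True, norm='l2', sublinear_tf=False), applied to a tiny hand-built count matrix; the numbers are illustrative, not drawn from the dataset:

import numpy as np

counts = np.array([[3, 0, 1],
                   [2, 0, 0],
                   [0, 1, 1]], dtype=float)   # rows = documents, columns = terms
n_docs = counts.shape[0]

doc_freq = (counts > 0).sum(axis=0)              # how many documents contain each term
idf = np.log((1 + n_docs) / (1 + doc_freq)) + 1  # sklearn's smoothed idf
tfidf = counts * idf                             # raw term frequency times idf
tfidf /= np.linalg.norm(tfidf, axis=1, keepdims=True)  # L2-normalize each row
print(tfidf)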

# TfidfVectorizer combines CountVectorizer and TfidfTransformer into one step:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)  # remember to use the original X_train set
X_train_tfidf.shape
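
As a sanity check (an addition, not part of the original lesson), the one-step vectorizer should reproduce the two-step CountVectorizer + TfidfTransformer result on the same data:

# Transform the training text both ways and compare; the largest absolute
# difference should be zero, or at worst near machine epsilon:
two_step = tfidf_transformer.transform(count_vect.transform(X_train))
one_step = vectorizer.transform(X_train)
print(abs(two_step - one_step).max())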

# Train a Linear Support Vector Classifier on the TF-IDF features:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)
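
One caveat worth showing here (a sketch added for illustration): to evaluate this classifier directly, the test messages must pass through the same fitted vectorizer, using transform rather than fit_transform, so the feature columns line up with what the model saw during training:

X_test_tfidf = vectorizer.transform(X_test)  # transform only; never refit on test data
print(clf.score(X_test_tfidf, y_test))       # mean accuracy on the test set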

# A Pipeline chains the vectorizer and classifier into a single estimator:
from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

# Feed the training data through the pipeline:
text_clf.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predictions))

# Print a classification report
print(metrics.classification_report(y_test, predictions))

# Print the overall accuracy
print(metrics.accuracy_score(y_test, predictions))
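
Finally, a quick sketch of using the fitted pipeline on brand-new text. These example messages are invented for illustration; a reasonably trained model should label the first spam and the second ham:

# The pipeline vectorizes internally, so raw strings go straight in:
print(text_clf.predict(["Congratulations! You've won a free prize, text WIN to claim it"]))
print(text_clf.predict(["Hey, are we still meeting for lunch today?"]))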