Feedforward ANN Text classification

This commit is contained in:
Arif Dwiyanto 2025-11-15 05:08:25 +00:00
parent fd6d17f1ab
commit fc0b273149
16 changed files with 2104 additions and 2 deletions

View File

@ -1 +1,8 @@
# Practicum Materials Compilation
## Odd Semester 2025/2026
- NLP
- Machine Learning
- Big Data
- Data Mining
- Data Management

View File

@ -0,0 +1,75 @@
# Input the number of documents
import pandas as pd

n = int(input("Masukkan jumlah dokumen yang ingin dimasukkan: "))

# Input the document texts one by one
documents = []
for i in range(n):
    teks = input(f"Masukkan teks untuk dokumen ke-{i+1}: ")
    documents.append(teks)

print("\n=== Dokumen yang Dimasukkan ===")
for i, doc in enumerate(documents):
    print(f"Doc {i+1}: {doc}")

# Tokenization step
tokenized_docs = []
for doc in documents:
    tokens = doc.lower().split()
    tokenized_docs.append(tokens)

print("\n=== Hasil Tokenisasi ===")
for i, tokens in enumerate(tokenized_docs):
    print(f"Doc {i+1}: {tokens}")

# Build the corpus (all words from all documents)
corpus_all = [word for doc in tokenized_docs for word in doc]

print("\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===")
print(corpus_all)
print(f"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}")

# Build the vocabulary (unique words)
vocabulary = sorted(set(corpus_all))

print("\n=== Vocabulary (Kata Unik) ===")
print(vocabulary)
print(f"Jumlah kata unik (vocabulary size): {len(vocabulary)}")

# Print the vocabulary again as a numbered list
print("\n=== Vocabulary (Kata Unik) ===")
for idx, word in enumerate(vocabulary, start=1):
    print(f"{idx:>2}. {word}")
print(f"\nJumlah kata unik (vocabulary size): {len(vocabulary)}")

# Numeric representation (Bag-of-Words matrix)
bow_matrix = []
for doc in tokenized_docs:
    vector = [doc.count(word) for word in vocabulary]
    bow_matrix.append(vector)

df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)
df_bow.index = [f"D{i}" for i in range(1, len(documents)+1)]  # relabel the index as D1, D2, D3, ...

print("\n=== Matriks Bag of Words ===")
print(df_bow)

# Word-frequency table (totals across all documents)
word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()
word_frequencies.columns = ["Kata", "Frekuensi"]

print("\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===")
print(word_frequencies)
print(f"Frekuensi kata: {len(word_frequencies)}")

View File

@ -0,0 +1,84 @@
# ---------------------------------------------------------
# Text Classification with TF-IDF + a Feedforward Neural Network
# ---------------------------------------------------------
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# -----------------------------------------
# 1. Example Dataset
# -----------------------------------------
# You can replace this with any other dataset (CSV, JSON, etc.)
data = {
    "text": [
        "Saya suka produk ini, luar biasa",
        "Layanannya buruk, sangat kecewa",
        "Pembelian terbaik yang pernah saya lakukan",
        "Saya benci produk ini, buang-buang uang",
        "Kualitasnya sangat bagus, direkomendasikan",
        "Pengalaman buruk, tidak akan membeli lagi"
    ],
    "label": ["positive", "negative", "positive", "negative", "positive", "negative"]
}
df = pd.DataFrame(data)

# -----------------------------------------
# 2. Train & Test Split
# -----------------------------------------
# Note: with only six samples the split (and the evaluation below) is highly unstable;
# use a real dataset for meaningful scores.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.3, random_state=42
)

# -----------------------------------------
# 3. TF-IDF Vectorization
# -----------------------------------------
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# -----------------------------------------
# 4. Feedforward ANN (MLPClassifier)
# -----------------------------------------
model = MLPClassifier(
    hidden_layer_sizes=(256, 64),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42
)
model.fit(X_train_tfidf, y_train)

# -----------------------------------------
# 5. Model Evaluation
# -----------------------------------------
y_pred = model.predict(X_test_tfidf)
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

# -----------------------------------------
# 6. Predicting New Text
# -----------------------------------------
# sample_text = ["barang bagus luar biasa"]  # alternative (positive) example; the line below overrides it
sample_text = ["barang buruk, saya kecewa"]
sample_vec = tfidf.transform(sample_text)
prediction = model.predict(sample_vec)
print("\nPrediksi untuk:", sample_text[0])
print("Hasil:", prediction[0])

View File

@ -0,0 +1,209 @@
from collections import Counter
from IPython.display import clear_output
import math

# 1. Read a sentence and tokenize it
kalimat = input("Masukkan kalimat: ").strip()

# Clear the prompt output (notebook environments only)
try:
    clear_output()
except Exception:
    pass

print(f"Corpus: {kalimat}")

# Tokenize
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Count unigram frequencies
unigram_counts = Counter(tokens)
total_tokens = sum(unigram_counts.values())

print("\nFrekuensi Unigram dalam kalimat")
for pair, count in unigram_counts.items():
    print(f" ('{pair}'): {count}")
print(f"\nTotal unigram dalam 1 kalimat: {total_tokens}")

# 3. Unigram probability: P(wi) = Count(wi) / total number of words
unigram_probabilities = {}
for word, count in unigram_counts.items():
    prob = count / total_tokens
    unigram_probabilities[word] = prob

print("\nProbabilitas masing-masing unigram:")
for word, prob in unigram_probabilities.items():
    print(f" P({word}) = {prob:.2f} ({prob*100:.2f}%)")

# 4. Whole-sentence probability: P(sentence) = P(w1) * P(w2) * ...
p_kalimat = 1
prob_parts = []

# Accumulate the total probability and build the detailed formula string
for word in tokens:
    prob_value = unigram_probabilities[word]
    p_kalimat *= prob_value
    # Format: P(word)=prob_value
    prob_parts.append(f"P({word})={prob_value:.2f}")

# Join the formula parts into the detailed prob_str
prob_str = " x ".join(prob_parts)

print("\nProbabilitas Keseluruhan Kalimat (Model Unigram):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)")
from collections import Counter
from IPython.display import clear_output
import math

# 1. Read a sentence and tokenize it
kalimat = input("Masukkan kalimat: ").strip()

# Clear the prompt output (notebook environments only)
try:
    clear_output()
except Exception:
    pass

print(f"Corpus: {kalimat}")

# Tokenization
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Count unigram and bigram frequencies
unigram_counts = Counter(tokens)
bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]
bigram_counts = Counter(bigrams)

print("\nFrekuensi Bigram dalam kalimat:")
for pair, count in bigram_counts.items():
    print(f" {pair}: {count}")
print(f"\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}")

# 3. Bigram probability: P(w2 | w1) = Count(w1,w2) / Count(w1)
bigram_probabilities = {}
for (w1, w2), count in bigram_counts.items():
    prob = count / unigram_counts[w1]
    bigram_probabilities[(w1, w2)] = prob

print("\nProbabilitas masing-masing bigram:")
for (w1, w2), prob in bigram_probabilities.items():
    print(f" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)")

# 4. Whole-sentence probability (bigram model)
# P(sentence) = P(w1) * P(w2|w1) * P(w3|w2) * ...
total_tokens = sum(unigram_counts.values())
p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens  # P(w1)
p_kalimat = p_w1  # initialize with P(w1)

prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]  # add P(w1) to the formula

for i in range(1, len(tokens)):
    pair = (tokens[i-1], tokens[i])
    p = bigram_probabilities.get(pair, 0)
    p_kalimat *= p
    prob_str_parts.append(f"P({pair[1]}|{pair[0]})={p:.2f}")

# Join the multiplication formula for display
prob_str = " x ".join(prob_str_parts)

print("\nProbabilitas Keseluruhan Kalimat (Model Bigram):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")
from collections import Counter
from IPython.display import clear_output
import math

# 1. Read a sentence and tokenize it
kalimat = input("Masukkan kalimat: ").strip()

# Clear the prompt output (notebook environments only)
try:
    clear_output()
except Exception:
    pass

print(f"Corpus: {kalimat}")

# Tokenization
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Count bigram and trigram frequencies
bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]
trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]
bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)

print("\nFrekuensi Trigram dalam kalimat:")
for tg, count in trigram_counts.items():
    print(f" {tg}: {count}")
print(f"\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}")

# 3. Trigram probability: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)
trigram_probabilities = {}
for (w1, w2, w3), count in trigram_counts.items():
    # Avoid division by zero (in case a bigram never occurs)
    if bigram_counts[(w1, w2)] > 0:
        prob = count / bigram_counts[(w1, w2)]
    else:
        prob = 0
    trigram_probabilities[(w1, w2, w3)] = prob

print("\nProbabilitas masing-masing trigram:")
for (w1, w2, w3), prob in trigram_probabilities.items():
    print(f" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)")

# Unigram counts are also needed (for P(w1) and P(w2|w1))
unigram_counts = Counter(tokens)
total_tokens = sum(unigram_counts.values())

# 4. Whole-sentence probability (trigram model)
# P(sentence) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...

# a. P(w1)
p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0

# b. P(w2|w1) (bigram estimate, no smoothing)
if len(tokens) > 1:
    count_w1 = unigram_counts.get(tokens[0], 1)  # avoid division by zero
    p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1
else:
    p_w2_w1 = 1.0  # single-word sentence

p_kalimat = p_w1 * p_w2_w1  # initialize with P(w1) * P(w2|w1)

# Formula parts for display
prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]
if len(tokens) > 1:
    prob_str_parts.append(f"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}")

# c. Multiply in the trigram terms P(wi | wi-2, wi-1) for i >= 3
for i in range(len(tokens) - 2):
    triplet = (tokens[i], tokens[i+1], tokens[i+2])
    p = trigram_probabilities.get(triplet, 0)
    p_kalimat *= p
    prob_str_parts.append(f"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}")

prob_str = " x ".join(prob_str_parts)

print("\nProbabilitas Keseluruhan Kalimat (Model Trigram):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")

View File

@ -0,0 +1,213 @@
name = 'Fred'
# Using the old .format() method:
print('His name is {var}.'.format(var=name))
# Using f-strings:
print(f'His name is {name}.')
print(f'His name is {name!r}')
d = {'a':123,'b':456}
# Reusing the same quote style inside the f-string is a SyntaxError on Python versions before 3.12:
print(f'Address: {d['a']} Main Street')
# Fix: use different quotation marks for the outer string and the dictionary key
d = {'a':123,'b':456}
print(f"Address: {d['a']} Main Street")
library = [('Author', 'Topic', 'Pages'), ('Twain', 'Rafting', 601), ('Feynman', 'Physics', 95), ('Hamilton', 'Mythology', 144)]
for book in library:
    print(f'{book[0]:{10}} {book[1]:{8}} {book[2]:{7}}')
for book in library:
    print(f'{book[0]:{10}} {book[1]:{10}} {book[2]:.>{7}}') # here .> was added
from datetime import datetime
today = datetime(year=2018, month=1, day=27)
print(f'{today:%B %d, %Y}')
%%writefile test.txt
Hello, this is a quick test file.
This is the second line of the file.
# Trying to open a file that does not exist raises a FileNotFoundError:
myfile = open('whoops.txt')
# pwd is an IPython magic that prints the current working directory
pwd
# Open the test.txt file we created earlier
my_file = open('test.txt')
my_file
# We can now read the file
my_file.read()
# But what happens if we try to read it again?
my_file.read()
# Seek to the start of file (index 0)
my_file.seek(0)
# Now read again
my_file.read()
# Readlines returns a list of the lines in the file
my_file.seek(0)
my_file.readlines()
my_file.close()
# Add a second argument to the function, 'w' which stands for write.
# Passing 'w+' lets us read and write to the file
my_file = open('test.txt','w+')
# Write to the file
my_file.write('This is a new first line')
# Read the file
my_file.seek(0)
my_file.read()
my_file.close() # always do this when you're done with a file
my_file = open('test.txt','a+')
my_file.write('\nThis line is being appended to test.txt')
my_file.write('\nAnd another line here.')
my_file.seek(0)
print(my_file.read())
my_file.close()
%%writefile -a test.txt
This is more text being appended to test.txt
And another line here.
with open('test.txt','r') as txt:
    first_line = txt.readlines()[0]
print(first_line)
# The file is closed once the with-block ends, so reading it again raises a ValueError:
txt.read()
with open('test.txt','r') as txt:
    for line in txt:
        print(line, end='')  # the end='' argument removes extra linebreaks
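# Hedged add-on sketch, not part of the original material: pathlib offers a compact way to
# read a whole file without managing open/seek/close by hand (assumes test.txt exists from
# the cells above).
from pathlib import Path
print(Path('test.txt').read_text())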

View File

@ -0,0 +1,145 @@
# Import spaCy and load the language library
import spacy
nlp = spacy.load('en_core_web_sm')
# Create a Doc object
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')
# Print each token separately
for token in doc:
    print(token.text, token.pos_, token.dep_)
nlp.pipeline
nlp.pipe_names
doc2 = nlp(u"Tesla isn't looking into startups anymore.")
for token in doc2:
    print(token.text, token.pos_, token.dep_)
doc2
doc2[0]
type(doc2)
doc2[0].pos_
doc2[0].dep_
spacy.explain('PROPN')
spacy.explain('nsubj')
# Lemmas (the base form of the word):
print(doc2[4].text)
print(doc2[4].lemma_)
# Simple Parts-of-Speech & Detailed Tags:
print(doc2[4].pos_)
print(doc2[4].tag_ + ' / ' + spacy.explain(doc2[4].tag_))
# Word Shapes:
print(doc2[0].text+': '+doc2[0].shape_)
print(doc[5].text+' : '+doc[5].shape_)
# Boolean Values:
print(doc2[0].is_alpha)
print(doc2[0].is_stop)
doc3 = nlp(u'Although commonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')
life_quote = doc3[16:30]
print(life_quote)
type(life_quote)
doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')
for sent in doc4.sents:
    print(sent)
doc4[6].is_sent_start
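# Hedged add-on sketch, not part of the original material (assumes nlp and doc2 from above
# are in scope): a compact per-token summary combining the attributes shown piecemeal above.
for token in doc2:
    print(f'{token.text:<12} {token.lemma_:<12} {token.pos_:<6} {token.tag_:<6} {token.dep_:<10} {token.shape_:<8} {str(token.is_alpha):<6} {token.is_stop}')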

View File

@ -0,0 +1,188 @@
# Import spaCy and load the language library
import spacy
nlp = spacy.load('en_core_web_sm')
# Create a string that includes opening and closing quotation marks
mystring = '"We\'re moving to L.A.!"'
print(mystring)
# Create a Doc object and explore tokens
doc = nlp(mystring)
for token in doc:
    print(token.text, end=' | ')
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")
for t in doc2:
    print(t)
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')
for t in doc3:
    print(t)
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")
for t in doc4:
    print(t)
len(doc)
len(doc.vocab)
doc5 = nlp(u'It is better to give than to receive.')
# Retrieve the third token:
doc5[2]
# Retrieve three tokens from the middle:
doc5[2:5]
# Retrieve the last four tokens:
doc5[-4:]
doc6 = nlp(u'My dinner was horrible.')
doc7 = nlp(u'Your dinner was delicious.')
# Try to change "My dinner was horrible" to "My dinner was delicious"
# (spaCy Doc objects do not support item assignment, so this raises an error):
doc6[3] = doc7[3]
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')
for token in doc8:
    print(token.text, end=' | ')
print('\n----')
for ent in doc8.ents:
    print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
len(doc8.ents)
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")
for chunk in doc9.noun_chunks:
    print(chunk.text)
doc10 = nlp(u"Red cars do not carry higher insurance rates.")
for chunk in doc10.noun_chunks:
    print(chunk.text)
doc11 = nlp(u"He was a one-eyed, one-horned, flying, purple people-eater.")
for chunk in doc11.noun_chunks:
    print(chunk.text)
from spacy import displacy
doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(doc, style='ent', jupyter=True)
doc = nlp(u'This is a sentence.')
# displacy.serve starts a local web server and blocks the cell; inside Jupyter, displacy.render is usually the better choice
displacy.serve(doc, style='dep')
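# Hedged add-on sketch, not part of the original material (assumes nlp and displacy from
# above): with jupyter=False, displacy.render returns markup that can be written to a file
# instead of being displayed; 'sentence_dep.svg' is a hypothetical output file name.
svg = displacy.render(nlp(u'This is a sentence.'), style='dep', jupyter=False)
with open('sentence_dep.svg', 'w', encoding='utf-8') as f:
    f.write(svg)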

View File

@ -0,0 +1,107 @@
# RUN THIS CELL to perform standard imports:
import spacy
nlp = spacy.load('en_core_web_sm')
# Enter your code here:
with open('../TextFiles/owlcreek.txt') as f:
    doc = nlp(f.read())
# Run this cell to verify it worked:
doc[:36]
len(doc)
sents = [sent for sent in doc.sents]
len(sents)
print(sents[1].text)
# NORMAL SOLUTION:
for token in sents[1]:
    print(token.text, token.pos_, token.dep_, token.lemma_)
# CHALLENGE SOLUTION:
for token in sents[1]:
    print(f'{token.text:{15}} {token.pos_:{5}} {token.dep_:{10}} {token.lemma_:{15}}')
# Import the Matcher library:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
# Create a pattern and add it to matcher:
pattern = [{'LOWER': 'swimming'}, {'IS_SPACE': True, 'OP':'*'}, {'LOWER': 'vigorously'}]
matcher.add('Swimming', None, pattern)  # spaCy v2 signature; with spaCy v3 this becomes matcher.add('Swimming', [pattern])
# Create a list of matches called "found_matches" and print the list:
found_matches = matcher(doc)
print(found_matches)
print(doc[1265:1290])
print(doc[3600:3615])
for sent in sents:
    if found_matches[0][1] < sent.end:
        print(sent)
        break
for sent in sents:
    if found_matches[1][1] < sent.end:
        print(sent)
        break
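# Hedged add-on sketch, not part of the original material (assumes nlp, doc, and
# found_matches from above are in scope): each match tuple is (match_id, start, end);
# the pattern name and the matched span can be recovered like this.
for match_id, start, end in found_matches:
    print(nlp.vocab.strings[match_id], '->', doc[start:end].text)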

View File

@ -0,0 +1,107 @@
# Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')
# Create a simple Doc object
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
# Print the full text:
print(doc.text)
# Print the fifth word and associated tags:
print(doc[4].text, doc[4].pos_, doc[4].tag_, spacy.explain(doc[4].tag_))
for token in doc:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')
doc = nlp(u'I read books on NLP.')
r = doc[1]
print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')
doc = nlp(u'I read a book on NLP.')
r = doc[1]
print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
# Count the frequencies of different coarse-grained POS tags:
POS_counts = doc.count_by(spacy.attrs.POS)
POS_counts
doc.vocab[83].text
for k,v in sorted(POS_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{5}}: {v}')
# Count the different fine-grained tags:
TAG_counts = doc.count_by(spacy.attrs.TAG)
for k,v in sorted(TAG_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{4}}: {v}')
# Count the different dependencies:
DEP_counts = doc.count_by(spacy.attrs.DEP)
for k,v in sorted(DEP_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{4}}: {v}')
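# Hedged add-on sketch, not part of the original material (assumes doc and POS_counts from
# above are in scope): the numeric attribute IDs can be mapped to readable tag names in one
# step with a dict comprehension.
readable_pos_counts = {doc.vocab[k].text: v for k, v in POS_counts.items()}
print(readable_pos_counts)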

View File

@ -0,0 +1,35 @@
# Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')
# Import the game script
import game
# Enter your text here:
text = u"The quick brown fox jumped over the lazy dog's back."
# Make your Doc object and pass it into the scorer:
doc = nlp(text)
print(game.scorer(doc))
# For practice, visualize your fine-grained POS tags (shown in the third column):
print(f"{'TOKEN':{10}} {'COARSE':{8}} {'FINE':{6}} {'DESCRIPTION'}")
print(f"{'-----':{10}} {'------':{8}} {'----':{6}} {'-----------'}")
for token in doc:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

View File

@ -0,0 +1,84 @@
# ---------------------------------------------------------
# Text Classification with TF-IDF + a Feedforward Neural Network
# ---------------------------------------------------------
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# -----------------------------------------
# 1. Example Dataset
# -----------------------------------------
# You can replace this with any other dataset (CSV, JSON, etc.)
data = {
    "text": [
        "Saya suka produk ini, luar biasa",
        "Layanannya buruk, sangat kecewa",
        "Pembelian terbaik yang pernah saya lakukan",
        "Saya benci produk ini, buang-buang uang",
        "Kualitasnya sangat bagus, direkomendasikan",
        "Pengalaman buruk, tidak akan membeli lagi"
    ],
    "label": ["positive", "negative", "positive", "negative", "positive", "negative"]
}
df = pd.DataFrame(data)

# -----------------------------------------
# 2. Train & Test Split
# -----------------------------------------
# Note: with only six samples the split (and the evaluation below) is highly unstable;
# use a real dataset for meaningful scores.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.3, random_state=42
)

# -----------------------------------------
# 3. TF-IDF Vectorization
# -----------------------------------------
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# -----------------------------------------
# 4. Feedforward ANN (MLPClassifier)
# -----------------------------------------
model = MLPClassifier(
    hidden_layer_sizes=(256, 64),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42
)
model.fit(X_train_tfidf, y_train)

# -----------------------------------------
# 5. Model Evaluation
# -----------------------------------------
y_pred = model.predict(X_test_tfidf)
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

# -----------------------------------------
# 6. Predicting New Text
# -----------------------------------------
# sample_text = ["barang bagus luar biasa"]  # alternative (positive) example; the line below overrides it
sample_text = ["barang buruk, saya kecewa"]
sample_vec = tfidf.transform(sample_text)
prediction = model.predict(sample_vec)
print("\nPrediksi untuk:", sample_text[0])
print("Hasil:", prediction[0])

View File

@ -0,0 +1,310 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qBYcPYAb059g",
"outputId": "9f57b704-da1b-4495-d366-24c30586dc76"
},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Masukkan jumlah dokumen yang ingin dimasukkan: 3\n"
]
}
],
"source": [
"# Input jumlah dokumen\n",
"import pandas as pd\n",
"n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mo-yt5Ob1N8j",
"outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Masukkan teks untuk dokumen ke-1: saya belajar nlp di kampus\n",
"Masukkan teks untuk dokumen ke-2: saya suka belajar ai\n",
"Masukkan teks untuk dokumen ke-3: mahasiswa belajar data science dan nlp\n",
"\n",
"=== Dokumen yang Dimasukkan ===\n",
"Doc 1: saya belajar nlp di kampus\n",
"Doc 2: saya suka belajar ai\n",
"Doc 3: mahasiswa belajar data science dan nlp\n"
]
}
],
"source": [
"# Input teks dokumen satu per satu\n",
"documents = []\n",
"for i in range(n):\n",
" teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n",
" documents.append(teks)\n",
"\n",
"print(\"\\n=== Dokumen yang Dimasukkan ===\")\n",
"for i, doc in enumerate(documents):\n",
" print(f\"Doc {i+1}: {doc}\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FkmxRAFq1oDK",
"outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Hasil Tokenisasi ===\n",
"Doc 1: ['saya', 'belajar', 'nlp', 'di', 'kampus']\n",
"Doc 2: ['saya', 'suka', 'belajar', 'ai']\n",
"Doc 3: ['mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n"
]
}
],
"source": [
"# Tahap Tokenisasi\n",
"tokenized_docs = []\n",
"for doc in documents:\n",
" tokens = doc.lower().split()\n",
" tokenized_docs.append(tokens)\n",
"\n",
"print(\"\\n=== Hasil Tokenisasi ===\")\n",
"for i, tokens in enumerate(tokenized_docs):\n",
" print(f\"Doc {i+1}: {tokens}\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ybC1Vo2C_c3q",
"outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
"['saya', 'belajar', 'nlp', 'di', 'kampus', 'saya', 'suka', 'belajar', 'ai', 'mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n",
"Jumlah total kata dalam seluruh dokumen: 15\n"
]
}
],
"source": [
"# Pembuatan Corpus\n",
"corpus_all = [word for doc in tokenized_docs for word in doc]\n",
"\n",
"print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n",
"print(corpus_all)\n",
"print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "s6S-Ma4R1xuq",
"outputId": "98c3685b-1798-4038-d17e-6e45ca419b51"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Vocabulary (Kata Unik) ===\n",
"['ai', 'belajar', 'dan', 'data', 'di', 'kampus', 'mahasiswa', 'nlp', 'saya', 'science', 'suka']\n",
"Jumlah kata unik (vocabulary size): 11\n",
"\n",
"=== Vocabulary (Kata Unik) ===\n",
" 1. ai\n",
" 2. belajar\n",
" 3. dan\n",
" 4. data\n",
" 5. di\n",
" 6. kampus\n",
" 7. mahasiswa\n",
" 8. nlp\n",
" 9. saya\n",
"10. science\n",
"11. suka\n",
"\n",
"Jumlah kata unik (vocabulary size): 11\n"
]
}
],
"source": [
"# Pembuatan Vocabulary\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"print(vocabulary)\n",
"print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n",
"\n",
"\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"for idx, word in enumerate(vocabulary, start=1):\n",
" print(f\"{idx:>2}. {word}\")\n",
"print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "ShevCTva2Fg9"
},
"outputs": [],
"source": [
"# Representasi Numerik (Matriks BoW)\n",
"bow_matrix = []\n",
"for doc in tokenized_docs:\n",
" vector = [doc.count(word) for word in vocabulary]\n",
" bow_matrix.append(vector)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-yB6D2pY2M0E",
"outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Matriks Bag of Words ===\n",
" ai belajar dan data di kampus mahasiswa nlp saya science suka\n",
"D1 0 1 0 0 1 1 0 1 1 0 0\n",
"D2 1 1 0 0 0 0 0 0 1 0 1\n",
"D3 0 1 1 1 0 0 1 1 0 1 0\n"
]
}
],
"source": [
"df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)\n",
"df_bow.index = [f\"D{i}\" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3\n",
"\n",
"print(\"\\n=== Matriks Bag of Words ===\")\n",
"print(df_bow)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8ruf5vKL2rGD",
"outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n",
" Kata Frekuensi\n",
"0 belajar 3\n",
"1 nlp 2\n",
"2 saya 2\n",
"3 dan 1\n",
"4 ai 1\n",
"5 data 1\n",
"6 di 1\n",
"7 mahasiswa 1\n",
"8 kampus 1\n",
"9 science 1\n",
"10 suka 1\n",
"Frekuensi kata: 11\n"
]
}
],
"source": [
"# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)\n",
"word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()\n",
"word_frequencies.columns = [\"Kata\", \"Frekuensi\"]\n",
"\n",
"print(\"\\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\")\n",
"print(word_frequencies)\n",
"print(f\"Frekuensi kata: {len(word_frequencies)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NQjExannHuj0"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,380 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "JVPdWpz3hhbj"
},
"source": [
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4Mvva3v65h1v"
},
"source": [
"# **UNIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1cub_VJnUJMl",
"outputId": "1889eb61-4f3f-4780-f42e-02368076cce3"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: saya suka makan nasi\n",
"Tokens (4): ['saya', 'suka', 'makan', 'nasi']\n",
"\n",
"Frekuensi Unigram dalam kalimat\n",
" ('saya'): 1\n",
" ('suka'): 1\n",
" ('makan'): 1\n",
" ('nasi'): 1\n",
"\n",
"Total unigram dalam 1 kalimat: 4\n",
"\n",
"Probabilitas masing-masing unigram:\n",
" P(saya) = 0.25 (25.00%)\n",
" P(suka) = 0.25 (25.00%)\n",
" P(makan) = 0.25 (25.00%)\n",
" P(nasi) = 0.25 (25.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
" P(saya suka makan nasi) = P(saya)=0.25 x P(suka)=0.25 x P(makan)=0.25 x P(nasi)=0.25 = 0.0039 (0.39%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenize\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Unigram\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"print(\"\\nFrekuensi Unigram dalam kalimat\")\n",
"for pair, count in unigram_counts.items():\n",
" print(f\" ('{pair}'): {count}\")\n",
"print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
"\n",
"# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata\n",
"unigram_probabilities = {}\n",
"for word, count in unigram_counts.items():\n",
" prob = count / total_tokens\n",
" unigram_probabilities[word] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing unigram:\")\n",
"for word, prob in unigram_probabilities.items():\n",
" print(f\" P({word}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (P(kalimat) = P(w1) * P(w2) * ...)\n",
"p_kalimat = 1\n",
"prob_parts = []\n",
"\n",
"# Loop untuk menghitung probabilitas total dan membangun string rumus detail\n",
"for word in tokens:\n",
" prob_value = unigram_probabilities[word]\n",
" p_kalimat *= prob_value\n",
" # Format: P(word)=prob_value\n",
" prob_parts.append(f\"P({word})={prob_value:.2f}\")\n",
"\n",
"# Gabungkan bagian-bagian rumus untuk mendapatkan prob_str detail\n",
"prob_str = \" x \".join(prob_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Vstwt996-FrS"
},
"source": [
"# **BIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XRIY4qgTVbjl",
"outputId": "ea6e62ce-45a0-40c9-ca98-1fcc30558479"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: saya makan nasi dan saya makan roti\n",
"Tokens (7): ['saya', 'makan', 'nasi', 'dan', 'saya', 'makan', 'roti']\n",
"\n",
"Frekuensi Bigram dalam kalimat:\n",
" ('saya', 'makan'): 2\n",
" ('makan', 'nasi'): 1\n",
" ('nasi', 'dan'): 1\n",
" ('dan', 'saya'): 1\n",
" ('makan', 'roti'): 1\n",
"\n",
"Total bigram dalam 1 kalimat: 6\n",
"\n",
"Probabilitas masing-masing bigram:\n",
" P(makan|saya) = 1.00 (100.00%)\n",
" P(nasi|makan) = 0.50 (50.00%)\n",
" P(dan|nasi) = 1.00 (100.00%)\n",
" P(saya|dan) = 1.00 (100.00%)\n",
" P(roti|makan) = 0.50 (50.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
" P(saya makan nasi dan saya makan roti) = P(saya)=0.29 x P(makan|saya)=1.00 x P(nasi|makan)=0.50 x P(dan|nasi)=1.00 x P(saya|dan)=1.00 x P(makan|saya)=1.00 x P(roti|makan)=0.50 = 0.071429 (7.14%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenisasi\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Unigram dan Bigram\n",
"unigram_counts = Counter(tokens)\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"bigram_counts = Counter(bigrams)\n",
"\n",
"print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
"for pair, count in bigram_counts.items():\n",
" print(f\" {pair}: {count}\")\n",
"print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n",
"\n",
"# 3. Hitung Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)\n",
"bigram_probabilities = {}\n",
"for (w1, w2), count in bigram_counts.items():\n",
" prob = count / unigram_counts[w1]\n",
" bigram_probabilities[(w1, w2)] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing bigram:\")\n",
"for (w1, w2), prob in bigram_probabilities.items():\n",
" print(f\" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Bigram)\n",
"# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w2) * ...\n",
"total_tokens = sum(unigram_counts.values())\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens # P(w1)\n",
"p_kalimat = p_w1 # Inisialisasi dengan P(w1)\n",
"\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"] # Tambahkan P(w1) ke rumus\n",
"\n",
"for i in range(1, len(tokens)):\n",
" pair = (tokens[i-1], tokens[i])\n",
" p = bigram_probabilities.get(pair, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.2f}\")\n",
"\n",
"# Gabungkan rumus perkalian untuk ditampilkan\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "E6n1IU8X-G9S"
},
"source": [
"# **TRIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BIRARsj2FHJg",
"outputId": "968d420e-9370-40e5-e7e1-148e1d351d62"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning\n",
"Tokens (7): ['mahasiswa', 'mengerjakan', 'tugas', 'kemudian', 'mahasiswa', 'upload', 'e-learning']\n",
"\n",
"Frekuensi Trigram dalam kalimat:\n",
" ('mahasiswa', 'mengerjakan', 'tugas'): 1\n",
" ('mengerjakan', 'tugas', 'kemudian'): 1\n",
" ('tugas', 'kemudian', 'mahasiswa'): 1\n",
" ('kemudian', 'mahasiswa', 'upload'): 1\n",
" ('mahasiswa', 'upload', 'e-learning'): 1\n",
"\n",
"Total trigram dalam 1 kalimat: 5\n",
"\n",
"Probabilitas masing-masing trigram:\n",
" P(tugas|mahasiswa,mengerjakan) = 1.00 (100.00%)\n",
" P(kemudian|mengerjakan,tugas) = 1.00 (100.00%)\n",
" P(mahasiswa|tugas,kemudian) = 1.00 (100.00%)\n",
" P(upload|kemudian,mahasiswa) = 1.00 (100.00%)\n",
" P(e-learning|mahasiswa,upload) = 1.00 (100.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
" P(mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning) = P(mahasiswa)=0.29 x P(mengerjakan|mahasiswa)=0.50 x P(tugas|mahasiswa,mengerjakan)=1.00 x P(kemudian|mengerjakan,tugas)=1.00 x P(mahasiswa|tugas,kemudian)=1.00 x P(upload|kemudian,mahasiswa)=1.00 x P(e-learning|mahasiswa,upload)=1.00 = 0.142857 (14.29%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenisasi\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Bigram dan Trigram\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]\n",
"\n",
"bigram_counts = Counter(bigrams)\n",
"trigram_counts = Counter(trigrams)\n",
"\n",
"print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
"for tg, count in trigram_counts.items():\n",
" print(f\" {tg}: {count}\")\n",
"print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n",
"\n",
"# 3. Hitung Probabilitas Trigram: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)\n",
"trigram_probabilities = {}\n",
"for (w1, w2, w3), count in trigram_counts.items():\n",
" # Hindari pembagian dengan nol (jika ada bigram yang tidak muncul)\n",
" if bigram_counts[(w1, w2)] > 0:\n",
" prob = count / bigram_counts[(w1, w2)]\n",
" else:\n",
" prob = 0\n",
" trigram_probabilities[(w1, w2, w3)] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing trigram:\")\n",
"for (w1, w2, w3), prob in trigram_probabilities.items():\n",
" print(f\" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# Tambahkan perhitungan Unigram Count (dibutuhkan untuk P(w1) dan P(w2|w1))\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Trigram)\n",
"# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...\n",
"\n",
"# a. P(w1)\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0\n",
"\n",
"# b. P(w2|w1) (Menggunakan Bigram tanpa smoothing)\n",
"if len(tokens) > 1:\n",
" count_w1 = unigram_counts.get(tokens[0], 1) # Hindari pembagian dengan 0\n",
" p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1\n",
"else:\n",
" p_w2_w1 = 1.0 # Jika hanya 1 kata\n",
"\n",
"p_kalimat = p_w1 * p_w2_w1 # Inisialisasi dengan P(w1) * P(w2|w1)\n",
"\n",
"# Daftar bagian rumus untuk ditampilkan\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]\n",
"if len(tokens) > 1:\n",
" prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}\")\n",
"\n",
"# c. Perkalian Trigram P(wi | wi-2, wi-1) untuk i >= 3\n",
"for i in range(len(tokens) - 2):\n",
" triplet = (tokens[i], tokens[i+1], tokens[i+2])\n",
" p = trigram_probabilities.get(triplet, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}\")\n",
"\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -0,0 +1,151 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f4a1399a-f23d-4060-a07e-bce5a5c7ddac",
"metadata": {},
"source": [
"# Klasifikasi Teks\n",
"## Arif R Dwiyanto"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "53a214ae-c9cf-4d46-925d-068f1685537b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== Classification Report ===\n",
" precision recall f1-score support\n",
"\n",
" negative 0.00 0.00 0.00 1.0\n",
" positive 0.00 0.00 0.00 1.0\n",
"\n",
" accuracy 0.00 2.0\n",
" macro avg 0.00 0.00 0.00 2.0\n",
"weighted avg 0.00 0.00 0.00 2.0\n",
"\n",
"=== Confusion Matrix ===\n",
"[[0 1]\n",
" [1 0]]\n",
"\n",
"Prediksi untuk: barang buruk, saya kecewa\n",
"Hasil: negative\n"
]
}
],
"source": [
"# ---------------------------------------------------------\n",
"# Klasifikasi Teks dengan TF-IDF + Feedforward Neural Network\n",
"# ---------------------------------------------------------\n",
"\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.neural_network import MLPClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"\n",
"# -----------------------------------------\n",
"# 1. Contoh Dataset\n",
"# -----------------------------------------\n",
"# Anda bisa mengganti dataset ini dengan dataset lain (CSV, JSON, dll)\n",
"\n",
"data = {\n",
" \"text\": [\n",
" \"Saya suka produk ini, luar biasa\",\n",
" \"Layanannya buruk, sangat kecewa\",\n",
" \"Pembelian terbaik yang pernah saya lakukan\",\n",
" \"Saya benci produk ini, buang-buang uang\",\n",
" \"Kualitasnya sangat bagus, direkomendasikan\",\n",
" \"Pengalaman buruk, tidak akan membeli lagi\"\n",
" ],\n",
" \"label\": [\"positive\", \"negative\", \"positive\", \"negative\", \"positive\", \"negative\"]\n",
"}\n",
"\n",
"df = pd.DataFrame(data)\n",
"\n",
"# -----------------------------------------\n",
"# 2. Split Train & Test\n",
"# -----------------------------------------\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" df[\"text\"], df[\"label\"], test_size=0.3, random_state=42\n",
")\n",
"\n",
"# -----------------------------------------\n",
"# 3. TF-IDF Vectorization\n",
"# -----------------------------------------\n",
"tfidf = TfidfVectorizer(max_features=5000)\n",
"X_train_tfidf = tfidf.fit_transform(X_train)\n",
"X_test_tfidf = tfidf.transform(X_test)\n",
"\n",
"# -----------------------------------------\n",
"# 4. Feedforward ANN (MLPClassifier)\n",
"# -----------------------------------------\n",
"model = MLPClassifier(\n",
" hidden_layer_sizes=(256, 64),\n",
" activation='relu',\n",
" solver='adam',\n",
" max_iter=500,\n",
" random_state=42\n",
")\n",
"\n",
"model.fit(X_train_tfidf, y_train)\n",
"\n",
"# -----------------------------------------\n",
"# 5. Evaluasi Model\n",
"# -----------------------------------------\n",
"y_pred = model.predict(X_test_tfidf)\n",
"\n",
"print(\"=== Classification Report ===\")\n",
"print(classification_report(y_test, y_pred))\n",
"\n",
"print(\"=== Confusion Matrix ===\")\n",
"print(confusion_matrix(y_test, y_pred))\n",
"\n",
"# -----------------------------------------\n",
"# 6. Prediksi Teks Baru\n",
"# -----------------------------------------\n",
"sample_text = [\"barang bagus luar biasa\"]\n",
"sample_text = [\"barang buruk, saya kecewa\"]\n",
"sample_vec = tfidf.transform(sample_text)\n",
"prediction = model.predict(sample_vec)\n",
"\n",
"print(\"\\nPrediksi untuk:\", sample_text[0])\n",
"print(\"Hasil:\", prediction[0])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f7d90fe-4af4-446c-9547-c9312bfa6fc7",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -4,4 +4,5 @@
- NLP
- Machine Learning
- Big Data
- Data Mining
- Data Management