Feedforward ANN Text classification

2025-11-15 05:08:25 +00:00 · 2025-11-15 05:08:25 +00:00 · fc0b273149
commit fc0b273149
parent fd6d17f1ab
16 changed files with 2104 additions and 2 deletions
--- a/.ipynb_checkpoints/README-checkpoint.md
+++ b/.ipynb_checkpoints/README-checkpoint.md
@ -1 +1,8 @@
 # Kompilasi Materi Praktikum
+## Ganjil 2025/2026
+
+- NLP
+- Machine Learning
+- Big Data
+- Data Mining
+- Data Management
--- a/.virtual_documents/NLP/Fitur_Ekstraksi_BOW.ipynb
+++ b/.virtual_documents/NLP/Fitur_Ekstraksi_BOW.ipynb
@ -0,0 +1,75 @@
+# Bag-of-Words feature extraction for a small corpus typed in by the user.
+from collections import Counter
+
+import pandas as pd
+
+# Number of documents to read.
+n = int(input("Masukkan jumlah dokumen yang ingin dimasukkan: "))
+
+
+# Read the document texts one by one.
+documents = [
+    input(f"Masukkan teks untuk dokumen ke-{i+1}: ") for i in range(n)
+]
+
+print("\n=== Dokumen yang Dimasukkan ===")
+for i, doc in enumerate(documents, start=1):
+    print(f"Doc {i}: {doc}")
+
+
+# Tokenization: lowercase, then split on whitespace.
+tokenized_docs = [doc.lower().split() for doc in documents]
+
+print("\n=== Hasil Tokenisasi ===")
+for i, tokens in enumerate(tokenized_docs, start=1):
+    print(f"Doc {i}: {tokens}")
+
+
+# Corpus: every token of every document, in reading order.
+corpus_all = [word for tokens in tokenized_docs for word in tokens]
+
+print("\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===")
+print(corpus_all)
+print(f"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}")
+
+
+# Vocabulary: the sorted unique tokens. It is computed once and then shown
+# in two layouts (plain list, then numbered list).
+vocabulary = sorted(set(corpus_all))
+
+print("\n=== Vocabulary (Kata Unik) ===")
+print(vocabulary)
+print(f"Jumlah kata unik (vocabulary size): {len(vocabulary)}")
+
+print("\n=== Vocabulary (Kata Unik) ===")
+for idx, word in enumerate(vocabulary, start=1):
+    print(f"{idx:>2}. {word}")
+print(f"\nJumlah kata unik (vocabulary size): {len(vocabulary)}")
+
+
+# Numeric representation (BoW matrix): one row per document, one column per
+# vocabulary word. Each document is counted once with a Counter instead of
+# rescanning its token list for every vocabulary word; a Counter returns 0
+# for words the document does not contain.
+bow_matrix = []
+for tokens in tokenized_docs:
+    counts = Counter(tokens)
+    bow_matrix.append([counts[word] for word in vocabulary])
+
+
+df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)
+df_bow.index = [f"D{i}" for i in range(1, len(documents)+1)]  # relabel rows as D1, D2, D3
+
+print("\n=== Matriks Bag of Words ===")
+print(df_bow)
+
+
+# Word-frequency table: column sums of the BoW matrix (totals over all
+# documents), sorted from most to least frequent.
+word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()
+word_frequencies.columns = ["Kata", "Frekuensi"]
+
+print("\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===")
+print(word_frequencies)
+# The value printed here is the number of rows in the table, i.e. the number
+# of distinct words.
+print(f"Frekuensi kata: {len(word_frequencies)}")
+
+
+
--- a/.virtual_documents/NLP/Klasifikasi
+++ b/.virtual_documents/NLP/Klasifikasi
@ -0,0 +1,84 @@
+
+
+
+# ---------------------------------------------------------
+# Text classification with TF-IDF + a feedforward neural network
+# ---------------------------------------------------------
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.neural_network import MLPClassifier
+from sklearn.metrics import classification_report, confusion_matrix
+
+# -----------------------------------------
+# 1. Example dataset
+# -----------------------------------------
+# This toy dataset can be replaced with any other source (CSV, JSON, ...).
+
+data = {
+    "text": [
+        "Saya suka produk ini, luar biasa",
+        "Layanannya buruk, sangat kecewa",
+        "Pembelian terbaik yang pernah saya lakukan",
+        "Saya benci produk ini, buang-buang uang",
+        "Kualitasnya sangat bagus, direkomendasikan",
+        "Pengalaman buruk, tidak akan membeli lagi"
+    ],
+    "label": ["positive", "negative", "positive", "negative", "positive", "negative"]
+}
+
+df = pd.DataFrame(data)
+
+# -----------------------------------------
+# 2. Train / test split
+# -----------------------------------------
+# Stratify on the label: with only six rows an unstratified split can put a
+# single class into the test set, which makes the evaluation meaningless.
+X_train, X_test, y_train, y_test = train_test_split(
+    df["text"],
+    df["label"],
+    test_size=0.3,
+    random_state=42,
+    stratify=df["label"],
+)
+
+# -----------------------------------------
+# 3. TF-IDF vectorization
+# -----------------------------------------
+# The vectorizer is fitted on the training texts only and then reused for
+# the test texts and for new samples.
+tfidf = TfidfVectorizer(max_features=5000)
+X_train_tfidf = tfidf.fit_transform(X_train)
+X_test_tfidf = tfidf.transform(X_test)
+
+# -----------------------------------------
+# 4. Feedforward ANN (MLPClassifier)
+# -----------------------------------------
+model = MLPClassifier(
+    hidden_layer_sizes=(256, 64),
+    activation='relu',
+    solver='adam',
+    max_iter=500,
+    random_state=42
+)
+
+model.fit(X_train_tfidf, y_train)
+
+# -----------------------------------------
+# 5. Model evaluation
+# -----------------------------------------
+y_pred = model.predict(X_test_tfidf)
+
+print("=== Classification Report ===")
+print(classification_report(y_test, y_pred))
+
+print("=== Confusion Matrix ===")
+print(confusion_matrix(y_test, y_pred))
+
+# -----------------------------------------
+# 6. Predict new texts
+# -----------------------------------------
+# Both sample sentences are predicted. Previously the first one was
+# overwritten by the second before it was ever used.
+sample_text = [
+    "barang bagus luar biasa",
+    "barang buruk, saya kecewa",
+]
+sample_vec = tfidf.transform(sample_text)
+prediction = model.predict(sample_vec)
+
+for text, label in zip(sample_text, prediction):
+    print("\nPrediksi untuk:", text)
+    print("Hasil:", label)
+
+
+
+
--- a/.virtual_documents/NLP/N-Gram.ipynb
+++ b/.virtual_documents/NLP/N-Gram.ipynb
@ -0,0 +1,209 @@
+
+
+
+
+
+
+from collections import Counter
+from IPython.display import clear_output
+import math
+
+# Unigram language model estimated from a single sentence typed by the user.
+
+# 1. Read the sentence and tokenize it
+kalimat = input("Masukkan kalimat: ").strip()
+
+# Clear the cell output. This is best-effort cosmetics, so a failure is
+# ignored, but only ordinary errors are caught: KeyboardInterrupt and
+# SystemExit still propagate.
+try:
+    clear_output()
+except Exception:
+    pass
+
+print(f"Corpus: {kalimat}")
+
+# Tokenize: lowercase, then split on whitespace
+tokens = kalimat.lower().split()
+print(f"Tokens ({len(tokens)}): {tokens}")
+
+if not tokens:
+    # With no tokens the product in step 4 would be empty and would be
+    # reported as a sentence probability of 1 (100%), so stop with a message.
+    print("\nKalimat kosong: tidak ada token untuk dihitung.")
+else:
+    # 2. Unigram frequencies
+    unigram_counts = Counter(tokens)
+    total_tokens = sum(unigram_counts.values())
+
+    print("\nFrekuensi Unigram dalam kalimat")
+    for word, count in unigram_counts.items():
+        print(f" ('{word}'): {count}")
+    print(f"\nTotal unigram dalam 1 kalimat: {total_tokens}")
+
+    # 3. Unigram probabilities: P(wi) = Count(wi) / total number of tokens
+    unigram_probabilities = {
+        word: count / total_tokens for word, count in unigram_counts.items()
+    }
+
+    print("\nProbabilitas masing-masing unigram:")
+    for word, prob in unigram_probabilities.items():
+        print(f" P({word}) = {prob:.2f} ({prob*100:.2f}%)")
+
+    # 4. Sentence probability: P(sentence) = P(w1) * P(w2) * ...
+    #    One factor per token occurrence, in sentence order.
+    p_kalimat = math.prod(unigram_probabilities[word] for word in tokens)
+
+    # Human-readable form of the same product, e.g. "P(a)=0.50 x P(b)=0.50"
+    prob_str = " x ".join(
+        f"P({word})={unigram_probabilities[word]:.2f}" for word in tokens
+    )
+
+    print("\nProbabilitas Keseluruhan Kalimat (Model Unigram):")
+    print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)")
+
+
+
+
+
+from collections import Counter
+from IPython.display import clear_output
+import math
+
+# Bigram language model estimated from a single sentence typed by the user.
+
+# 1. Read the sentence and tokenize it
+kalimat = input("Masukkan kalimat: ").strip()
+
+# Clear the cell output. This is best-effort cosmetics, so a failure is
+# ignored, but only ordinary errors are caught: KeyboardInterrupt and
+# SystemExit still propagate.
+try:
+    clear_output()
+except Exception:
+    pass
+
+print(f"Corpus: {kalimat}")
+
+# Tokenize: lowercase, then split on whitespace
+tokens = kalimat.lower().split()
+print(f"Tokens ({len(tokens)}): {tokens}")
+
+if not tokens:
+    # Step 4 reads tokens[0] and divides by the token count, so an empty
+    # sentence would fail with IndexError; stop with a message instead.
+    print("\nKalimat kosong: tidak ada token untuk dihitung.")
+else:
+    # 2. Unigram and bigram frequencies
+    unigram_counts = Counter(tokens)
+    bigrams = list(zip(tokens, tokens[1:]))  # adjacent token pairs, in order
+    bigram_counts = Counter(bigrams)
+
+    print("\nFrekuensi Bigram dalam kalimat:")
+    for pair, count in bigram_counts.items():
+        print(f" {pair}: {count}")
+    print(f"\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}")
+
+    # 3. Bigram probabilities: P(w2 | w1) = Count(w1, w2) / Count(w1)
+    bigram_probabilities = {
+        (w1, w2): count / unigram_counts[w1]
+        for (w1, w2), count in bigram_counts.items()
+    }
+
+    print("\nProbabilitas masing-masing bigram:")
+    for (w1, w2), prob in bigram_probabilities.items():
+        print(f" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)")
+
+    # 4. Sentence probability (bigram model)
+    #    P(sentence) = P(w1) * P(w2|w1) * P(w3|w2) * ...
+    total_tokens = sum(unigram_counts.values())
+    p_w1 = unigram_counts[tokens[0]] / total_tokens  # P(w1)
+    p_kalimat = p_w1  # start the product with P(w1)
+
+    prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]
+
+    # Every pair in `bigrams` was counted above, so the lookup cannot miss.
+    for pair in bigrams:
+        p = bigram_probabilities[pair]
+        p_kalimat *= p
+        prob_str_parts.append(f"P({pair[1]}|{pair[0]})={p:.2f}")
+
+    # Join the factors for display
+    prob_str = " x ".join(prob_str_parts)
+
+    print("\nProbabilitas Keseluruhan Kalimat (Model Bigram):")
+    print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")
+
+
+
+
+
+from collections import Counter
+from IPython.display import clear_output
+import math
+
+# Trigram language model estimated from a single sentence typed by the user.
+
+# 1. Read the sentence and tokenize it
+kalimat = input("Masukkan kalimat: ").strip()
+
+# Clear the cell output. This is best-effort cosmetics, so a failure is
+# ignored, but only ordinary errors are caught: KeyboardInterrupt and
+# SystemExit still propagate.
+try:
+    clear_output()
+except Exception:
+    pass
+
+print(f"Corpus: {kalimat}")
+
+# Tokenize: lowercase, then split on whitespace
+tokens = kalimat.lower().split()
+print(f"Tokens ({len(tokens)}): {tokens}")
+
+if not tokens:
+    # Step 4 reads tokens[0], so an empty sentence would fail with
+    # IndexError; stop with a message instead.
+    print("\nKalimat kosong: tidak ada token untuk dihitung.")
+else:
+    # 2. Unigram, bigram and trigram frequencies
+    bigrams = list(zip(tokens, tokens[1:]))
+    trigrams = list(zip(tokens, tokens[1:], tokens[2:]))
+
+    unigram_counts = Counter(tokens)
+    bigram_counts = Counter(bigrams)
+    trigram_counts = Counter(trigrams)
+    total_tokens = sum(unigram_counts.values())
+
+    print("\nFrekuensi Trigram dalam kalimat:")
+    for tg, count in trigram_counts.items():
+        print(f" {tg}: {count}")
+    print(f"\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}")
+
+    # 3. Trigram probabilities: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)
+    #    Every observed trigram starts with an observed bigram, so the
+    #    denominator is always at least 1 and needs no zero check.
+    trigram_probabilities = {
+        (w1, w2, w3): count / bigram_counts[(w1, w2)]
+        for (w1, w2, w3), count in trigram_counts.items()
+    }
+
+    print("\nProbabilitas masing-masing trigram:")
+    for (w1, w2, w3), prob in trigram_probabilities.items():
+        print(f" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)")
+
+    # 4. Sentence probability (trigram model)
+    #    P(sentence) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...
+
+    # a. P(w1)
+    p_w1 = unigram_counts[tokens[0]] / total_tokens
+
+    # b. P(w2|w1), from the unsmoothed bigram counts; a one-word sentence
+    #    has no second word, so this factor is the neutral element 1.0.
+    if len(tokens) > 1:
+        p_w2_w1 = bigram_counts[(tokens[0], tokens[1])] / unigram_counts[tokens[0]]
+    else:
+        p_w2_w1 = 1.0
+
+    p_kalimat = p_w1 * p_w2_w1  # start the product with P(w1) * P(w2|w1)
+
+    # Factors of the product, collected for display
+    prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]
+    if len(tokens) > 1:
+        prob_str_parts.append(f"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}")
+
+    # c. Multiply in P(wi | wi-2, wi-1) for every trigram in sentence order.
+    #    Every triplet in `trigrams` was counted above, so the lookup cannot miss.
+    for triplet in trigrams:
+        p = trigram_probabilities[triplet]
+        p_kalimat *= p
+        prob_str_parts.append(f"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}")
+
+    prob_str = " x ".join(prob_str_parts)
+
+    print("\nProbabilitas Keseluruhan Kalimat (Model Trigram):")
+    print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")
+
--- a/Code/00-Python-Text-Basics/00-Working-with-Text-Files.ipynb
+++ b/Code/00-Python-Text-Basics/00-Working-with-Text-Files.ipynb
@ -0,0 +1,213 @@
+
+
+
+
+
+
+
+
+
+
+
+
+name = 'Fred'
+
+# Using the old .format() method:
+print('His name is {var}.'.format(var=name))
+
+# Using f-strings:
+print(f'His name is {name}.')
+
+
+
+
+
+print(f'His name is {name!r}')
+
+
+
+
+
+d = {'a':123,'b':456}
+
+print(f'Address: {d['a']} Main Street')
+
+
+
+
+
+d = {'a':123,'b':456}
+
+print(f"Address: {d['a']} Main Street")
+
+
+
+
+
+library = [('Author', 'Topic', 'Pages'), ('Twain', 'Rafting', 601), ('Feynman', 'Physics', 95), ('Hamilton', 'Mythology', 144)]
+
+for book in library:
+    print(f'{book[0]:{10}} {book[1]:{8}} {book[2]:{7}}')
+
+
+
+
+
+for book in library:
+    print(f'{book[0]:{10}} {book[1]:{10}} {book[2]:.>{7}}') # here .> was added
+
+
+
+
+
+from datetime import datetime
+
+today = datetime(year=2018, month=1, day=27)
+
+print(f'{today:%B %d, %Y}')
+
+
+
+
+
+
+
+
+%%writefile test.txt
+Hello, this is a quick test file.
+This is the second line of the file.
+
+
+
+
+
+myfile = open('whoops.txt')
+
+
+
+
+
+pwd
+
+
+
+
+
+# Open the test.txt file we created earlier
+my_file = open('test.txt')
+
+
+my_file
+
+
+
+
+
+# We can now read the file
+my_file.read()
+
+
+# But what happens if we try to read it again?
+my_file.read()
+
+
+
+
+
+# Seek to the start of file (index 0)
+my_file.seek(0)
+
+
+# Now read again
+my_file.read()
+
+
+
+
+
+# Readlines returns a list of the lines in the file
+my_file.seek(0)
+my_file.readlines()
+
+
+
+
+
+my_file.close()
+
+
+
+
+
+# Add a second argument to the function, 'w' which stands for write.
+# Passing 'w+' lets us read and write to the file
+
+my_file = open('test.txt','w+')
+
+
+
+
+
+# Write to the file
+my_file.write('This is a new first line')
+
+
+# Read the file
+my_file.seek(0)
+my_file.read()
+
+
+my_file.close()  # always do this when you're done with a file
+
+
+
+
+
+my_file = open('test.txt','a+')
+my_file.write('\nThis line is being appended to test.txt')
+my_file.write('\nAnd another line here.')
+
+
+my_file.seek(0)
+print(my_file.read())
+
+
+my_file.close()
+
+
+
+
+
+%%writefile -a test.txt
+
+This is more text being appended to test.txt
+And another line here.
+
+
+
+
+
+
+
+
+with open('test.txt','r') as txt:
+    first_line = txt.readlines()[0]
+    
+print(first_line)
+
+
+
+
+
+txt.read()
+
+
+
+
+
+with open('test.txt','r') as txt:
+    for line in txt:
+        print(line, end='')  # the end='' argument removes extra linebreaks
+
+
+
--- a/Code/01-NLP-Python-Basics/00-Spacy-Basics.ipynb
+++ b/Code/01-NLP-Python-Basics/00-Spacy-Basics.ipynb
@ -0,0 +1,145 @@
+
+
+
+
+
+
+
+
+
+
+
+
+# Import spaCy and load the language library
+import spacy
+nlp = spacy.load('en_core_web_sm')
+
+# Create a Doc object
+doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')
+
+# Print each token separately
+for token in doc:
+    print(token.text, token.pos_, token.dep_)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+nlp.pipeline
+
+
+nlp.pipe_names
+
+
+
+
+
+doc2 = nlp(u"Tesla isn't   looking into startups anymore.")
+
+for token in doc2:
+    print(token.text, token.pos_, token.dep_)
+
+
+
+
+
+doc2
+
+
+doc2[0]
+
+
+type(doc2)
+
+
+
+
+
+doc2[0].pos_
+
+
+
+
+
+doc2[0].dep_
+
+
+
+
+
+spacy.explain('PROPN')
+
+
+spacy.explain('nsubj')
+
+
+
+
+
+
+
+
+# Lemmas (the base form of the word):
+print(doc2[4].text)
+print(doc2[4].lemma_)
+
+
+# Simple Parts-of-Speech & Detailed Tags:
+print(doc2[4].pos_)
+print(doc2[4].tag_ + ' / ' + spacy.explain(doc2[4].tag_))
+
+
+# Word Shapes:
+print(doc2[0].text+': '+doc2[0].shape_)
+print(doc[5].text+' : '+doc[5].shape_)
+
+
+# Boolean Values:
+print(doc2[0].is_alpha)
+print(doc2[0].is_stop)
+
+
+
+
+
+doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \
+the phrase "Life is what happens to us while we are making other plans" was written by \
+cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')
+
+
+life_quote = doc3[16:30]
+print(life_quote)
+
+
+type(life_quote)
+
+
+
+
+
+
+
+
+doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')
+
+
+for sent in doc4.sents:
+    print(sent)
+
+
+doc4[6].is_sent_start
+
+
+
--- a/Code/01-NLP-Python-Basics/01-Tokenization.ipynb
+++ b/Code/01-NLP-Python-Basics/01-Tokenization.ipynb
@ -0,0 +1,188 @@
+
+
+
+
+
+
+# Import spaCy and load the language library
+import spacy
+nlp = spacy.load('en_core_web_sm')
+
+
+# Create a string that includes opening and closing quotation marks
+mystring = '"We\'re moving to L.A.!"'
+print(mystring)
+
+
+# Create a Doc object and explore tokens
+doc = nlp(mystring)
+
+for token in doc:
+    print(token.text, end=' | ')
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")
+
+for t in doc2:
+    print(t)
+
+
+
+
+
+doc3 = nlp(u'A 5km NYC cab ride costs $10.30')
+
+for t in doc3:
+    print(t)
+
+
+
+
+
+
+
+
+doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")
+
+for t in doc4:
+    print(t)
+
+
+
+
+
+
+
+
+len(doc)
+
+
+
+
+
+len(doc.vocab)
+
+
+
+
+
+
+
+
+doc5 = nlp(u'It is better to give than to receive.')
+
+# Retrieve the third token:
+doc5[2]
+
+
+# Retrieve three tokens from the middle:
+doc5[2:5]
+
+
+# Retrieve the last four tokens:
+doc5[-4:]
+
+
+
+
+
+doc6 = nlp(u'My dinner was horrible.')
+doc7 = nlp(u'Your dinner was delicious.')
+
+
+# Try to change "My dinner was horrible" to "My dinner was delicious"
+doc6[3] = doc7[3]
+
+
+
+
+
+doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')
+
+for token in doc8:
+    print(token.text, end=' | ')
+
+print('\n----')
+
+for ent in doc8.ents:
+    print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
+
+
+
+
+
+len(doc8.ents)
+
+
+
+
+
+
+
+
+doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")
+
+for chunk in doc9.noun_chunks:
+    print(chunk.text)
+
+
+doc10 = nlp(u"Red cars do not carry higher insurance rates.")
+
+for chunk in doc10.noun_chunks:
+    print(chunk.text)
+
+
+doc11 = nlp(u"He was a one-eyed, one-horned, flying, purple people-eater.")
+
+for chunk in doc11.noun_chunks:
+    print(chunk.text)
+
+
+
+
+
+
+
+
+
+
+
+from spacy import displacy
+
+doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
+displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})
+
+
+
+
+
+
+
+
+doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
+displacy.render(doc, style='ent', jupyter=True)
+
+
+
+
+
+doc = nlp(u'This is a sentence.')
+displacy.serve(doc, style='dep')
+
+
+
+
+
+
--- a/Code/01-NLP-Python-Basics/07-NLP-Basics-Assessment-Solution.ipynb
+++ b/Code/01-NLP-Python-Basics/07-NLP-Basics-Assessment-Solution.ipynb
@ -0,0 +1,107 @@
+
+
+
+
+
+
+
+
+
+# RUN THIS CELL to perform standard imports:
+import spacy
+nlp = spacy.load('en_core_web_sm')
+
+
+
+
+
+# Enter your code here:
+
+with open('../TextFiles/owlcreek.txt') as f:
+    doc = nlp(f.read())
+
+
+# Run this cell to verify it worked:
+
+doc[:36]
+
+
+
+
+
+len(doc)
+
+
+
+
+
+sents = [sent for sent in doc.sents]
+len(sents)
+
+
+
+
+
+print(sents[1].text)
+
+
+
+
+
+# NORMAL SOLUTION:
+for token in sents[1]:
+    print(token.text, token.pos_, token.dep_, token.lemma_)
+
+
+# CHALLENGE SOLUTION:
+for token in sents[1]:
+    print(f'{token.text:{15}} {token.pos_:{5}} {token.dep_:{10}} {token.lemma_:{15}}')
+
+
+
+
+
+# Import the Matcher library:
+
+from spacy.matcher import Matcher
+matcher = Matcher(nlp.vocab)
+
+
+# Create a pattern and add it to matcher:
+# "swimming", then any number of whitespace tokens, then "vigorously"
+# (matched case-insensitively via LOWER).
+
+pattern = [{'LOWER': 'swimming'}, {'IS_SPACE': True, 'OP':'*'}, {'LOWER': 'vigorously'}]
+
+# Pass the patterns as a list: the older form with a positional `None`
+# callback, matcher.add(key, None, pattern), is rejected by spaCy v3.
+matcher.add('Swimming', [pattern])
+
+
+# Create a list of matches called "found_matches" and print the list:
+
+found_matches = matcher(doc)
+print(found_matches)
+
+
+
+
+
+print(doc[1265:1290])
+
+
+print(doc[3600:3615])
+
+
+
+
+
+# Print the sentence that contains each match. Each match is a
+# (match_id, start, end) tuple; the containing sentence is the first one
+# whose end lies past the match's start token. Looping over all matches
+# avoids hard-coding indices 0 and 1, which fails with IndexError when
+# fewer than two matches are found.
+for _, match_start, _ in found_matches:
+    for sent in sents:
+        if match_start < sent.end:
+            print(sent)
+            break
+
+
+
--- a/Code/02-Parts-of-Speech-Tagging/00-POS-Basics.ipynb
+++ b/Code/02-Parts-of-Speech-Tagging/00-POS-Basics.ipynb
@ -0,0 +1,107 @@
+
+
+
+
+
+
+# Perform standard imports
+import spacy
+nlp = spacy.load('en_core_web_sm')
+
+
+# Create a simple Doc object
+doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
+
+
+
+
+
+# Print the full text:
+print(doc.text)
+
+
+# Print the fifth word and associated tags:
+print(doc[4].text, doc[4].pos_, doc[4].tag_, spacy.explain(doc[4].tag_))
+
+
+
+
+
+for token in doc:
+    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+doc = nlp(u'I read books on NLP.')
+r = doc[1]
+
+print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')
+
+
+doc = nlp(u'I read a book on NLP.')
+r = doc[1]
+
+print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')
+
+
+
+
+
+
+
+
+doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
+
+# Count the frequencies of different coarse-grained POS tags:
+POS_counts = doc.count_by(spacy.attrs.POS)
+POS_counts
+
+
+
+
+
+doc.vocab[83].text
+
+
+
+
+
+for k,v in sorted(POS_counts.items()):
+    print(f'{k}. {doc.vocab[k].text:{5}}: {v}')
+
+
+# Count the different fine-grained tags:
+TAG_counts = doc.count_by(spacy.attrs.TAG)
+
+for k,v in sorted(TAG_counts.items()):
+    print(f'{k}. {doc.vocab[k].text:{4}}: {v}')
+
+
+
+
+
+# Count the different dependencies:
+DEP_counts = doc.count_by(spacy.attrs.DEP)
+
+for k,v in sorted(DEP_counts.items()):
+    print(f'{k}. {doc.vocab[k].text:{4}}: {v}')
+
+
+
+
+
+
+
+
+
--- a/Code/02-Parts-of-Speech-Tagging/07-POS-Challenge
+++ b/Code/02-Parts-of-Speech-Tagging/07-POS-Challenge
@ -0,0 +1,35 @@
+
+
+
+
+
+
+# Perform standard imports
+import spacy
+nlp = spacy.load('en_core_web_sm')
+
+# Import the game script
+import game
+
+
+# Enter your text here:
+text = u"The quick brown fox jumped over the lazy dog's back."
+
+
+# Make your Doc object and pass it into the scorer:
+doc = nlp(text)
+print(game.scorer(doc))
+
+
+
+
+
+# For practice, visualize your fine-grained POS tags (shown in the third column):
+print(f"{'TOKEN':{10}} {'COARSE':{8}} {'FINE':{6}} {'DESCRIPTION'}")
+print(f"{'-----':{10}} {'------':{8}} {'----':{6}} {'-----------'}")
+
+for token in doc:
+    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')
+
+
+
--- a/.virtual_documents/NLP/Untitled.ipynb
+++ b/.virtual_documents/NLP/Untitled.ipynb
@ -0,0 +1,84 @@
+
+
+
+# ---------------------------------------------------------
+# Text classification with TF-IDF + a feedforward neural network
+# ---------------------------------------------------------
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.neural_network import MLPClassifier
+from sklearn.metrics import classification_report, confusion_matrix
+
+# -----------------------------------------
+# 1. Example dataset
+# -----------------------------------------
+# This toy dataset can be replaced with any other source (CSV, JSON, ...).
+
+data = {
+    "text": [
+        "Saya suka produk ini, luar biasa",
+        "Layanannya buruk, sangat kecewa",
+        "Pembelian terbaik yang pernah saya lakukan",
+        "Saya benci produk ini, buang-buang uang",
+        "Kualitasnya sangat bagus, direkomendasikan",
+        "Pengalaman buruk, tidak akan membeli lagi"
+    ],
+    "label": ["positive", "negative", "positive", "negative", "positive", "negative"]
+}
+
+df = pd.DataFrame(data)
+
+# -----------------------------------------
+# 2. Train / test split
+# -----------------------------------------
+# Stratify on the label: with only six rows an unstratified split can put a
+# single class into the test set, which makes the evaluation meaningless.
+X_train, X_test, y_train, y_test = train_test_split(
+    df["text"],
+    df["label"],
+    test_size=0.3,
+    random_state=42,
+    stratify=df["label"],
+)
+
+# -----------------------------------------
+# 3. TF-IDF vectorization
+# -----------------------------------------
+# The vectorizer is fitted on the training texts only and then reused for
+# the test texts and for new samples.
+tfidf = TfidfVectorizer(max_features=5000)
+X_train_tfidf = tfidf.fit_transform(X_train)
+X_test_tfidf = tfidf.transform(X_test)
+
+# -----------------------------------------
+# 4. Feedforward ANN (MLPClassifier)
+# -----------------------------------------
+model = MLPClassifier(
+    hidden_layer_sizes=(256, 64),
+    activation='relu',
+    solver='adam',
+    max_iter=500,
+    random_state=42
+)
+
+model.fit(X_train_tfidf, y_train)
+
+# -----------------------------------------
+# 5. Model evaluation
+# -----------------------------------------
+y_pred = model.predict(X_test_tfidf)
+
+print("=== Classification Report ===")
+print(classification_report(y_test, y_pred))
+
+print("=== Confusion Matrix ===")
+print(confusion_matrix(y_test, y_pred))
+
+# -----------------------------------------
+# 6. Predict new texts
+# -----------------------------------------
+# Both sample sentences are predicted. Previously the first one was
+# overwritten by the second before it was ever used.
+sample_text = [
+    "barang bagus luar biasa",
+    "barang buruk, saya kecewa",
+]
+sample_vec = tfidf.transform(sample_text)
+prediction = model.predict(sample_vec)
+
+for text, label in zip(sample_text, prediction):
+    print("\nPrediksi untuk:", text)
+    print("Hasil:", label)
+
+
+
+
--- a/NLP/.ipynb_checkpoints/Fitur_Ekstraksi_BOW-checkpoint.ipynb
+++ b/NLP/.ipynb_checkpoints/Fitur_Ekstraksi_BOW-checkpoint.ipynb
@ -0,0 +1,310 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "qBYcPYAb059g",
+    "outputId": "9f57b704-da1b-4495-d366-24c30586dc76"
+   },
+   "outputs": [
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "Masukkan jumlah dokumen yang ingin dimasukkan:  3\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Input jumlah dokumen\n",
+    "import pandas as pd\n",
+    "n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "mo-yt5Ob1N8j",
+    "outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Masukkan teks untuk dokumen ke-1: saya belajar nlp di kampus\n",
+      "Masukkan teks untuk dokumen ke-2: saya suka belajar ai\n",
+      "Masukkan teks untuk dokumen ke-3: mahasiswa belajar data science dan nlp\n",
+      "\n",
+      "=== Dokumen yang Dimasukkan ===\n",
+      "Doc 1: saya belajar nlp di kampus\n",
+      "Doc 2: saya suka belajar ai\n",
+      "Doc 3: mahasiswa belajar data science dan nlp\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Input teks dokumen satu per satu\n",
+    "documents = []\n",
+    "for i in range(n):\n",
+    "    teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n",
+    "    documents.append(teks)\n",
+    "\n",
+    "print(\"\\n=== Dokumen yang Dimasukkan ===\")\n",
+    "for i, doc in enumerate(documents):\n",
+    "    print(f\"Doc {i+1}: {doc}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "FkmxRAFq1oDK",
+    "outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "=== Hasil Tokenisasi ===\n",
+      "Doc 1: ['saya', 'belajar', 'nlp', 'di', 'kampus']\n",
+      "Doc 2: ['saya', 'suka', 'belajar', 'ai']\n",
+      "Doc 3: ['mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Tahap Tokenisasi\n",
+    "tokenized_docs = []\n",
+    "for doc in documents:\n",
+    "    tokens = doc.lower().split()\n",
+    "    tokenized_docs.append(tokens)\n",
+    "\n",
+    "print(\"\\n=== Hasil Tokenisasi ===\")\n",
+    "for i, tokens in enumerate(tokenized_docs):\n",
+    "    print(f\"Doc {i+1}: {tokens}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "ybC1Vo2C_c3q",
+    "outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
+      "['saya', 'belajar', 'nlp', 'di', 'kampus', 'saya', 'suka', 'belajar', 'ai', 'mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n",
+      "Jumlah total kata dalam seluruh dokumen: 15\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Pembuatan Corpus\n",
+    "corpus_all = [word for doc in tokenized_docs for word in doc]\n",
+    "\n",
+    "print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n",
+    "print(corpus_all)\n",
+    "print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "s6S-Ma4R1xuq",
+    "outputId": "98c3685b-1798-4038-d17e-6e45ca419b51"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "=== Vocabulary (Kata Unik) ===\n",
+      "['ai', 'belajar', 'dan', 'data', 'di', 'kampus', 'mahasiswa', 'nlp', 'saya', 'science', 'suka']\n",
+      "Jumlah kata unik (vocabulary size): 11\n",
+      "\n",
+      "=== Vocabulary (Kata Unik) ===\n",
+      " 1. ai\n",
+      " 2. belajar\n",
+      " 3. dan\n",
+      " 4. data\n",
+      " 5. di\n",
+      " 6. kampus\n",
+      " 7. mahasiswa\n",
+      " 8. nlp\n",
+      " 9. saya\n",
+      "10. science\n",
+      "11. suka\n",
+      "\n",
+      "Jumlah kata unik (vocabulary size): 11\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Pembuatan Vocabulary\n",
+    "vocabulary = sorted(set(corpus_all))\n",
+    "\n",
+    "print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
+    "print(vocabulary)\n",
+    "print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n",
+    "\n",
+    "\n",
+    "vocabulary = sorted(set(corpus_all))\n",
+    "\n",
+    "print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
+    "for idx, word in enumerate(vocabulary, start=1):\n",
+    "    print(f\"{idx:>2}. {word}\")\n",
+    "print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "id": "ShevCTva2Fg9"
+   },
+   "outputs": [],
+   "source": [
+    "# Representasi Numerik (Matriks BoW)\n",
+    "bow_matrix = []\n",
+    "for doc in tokenized_docs:\n",
+    "    vector = [doc.count(word) for word in vocabulary]\n",
+    "    bow_matrix.append(vector)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "-yB6D2pY2M0E",
+    "outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "=== Matriks Bag of Words ===\n",
+      "    ai  belajar  dan  data  di  kampus  mahasiswa  nlp  saya  science  suka\n",
+      "D1   0        1    0     0   1       1          0    1     1        0     0\n",
+      "D2   1        1    0     0   0       0          0    0     1        0     1\n",
+      "D3   0        1    1     1   0       0          1    1     0        1     0\n"
+     ]
+    }
+   ],
+   "source": [
+    "df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)\n",
+    "df_bow.index = [f\"D{i}\" for i in range(1, len(documents)+1)]  # ubah label indeks jadi D1, D2, D3\n",
+    "\n",
+    "print(\"\\n=== Matriks Bag of Words ===\")\n",
+    "print(df_bow)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "8ruf5vKL2rGD",
+    "outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n",
+      "         Kata  Frekuensi\n",
+      "0     belajar          3\n",
+      "1         nlp          2\n",
+      "2        saya          2\n",
+      "3         dan          1\n",
+      "4          ai          1\n",
+      "5        data          1\n",
+      "6          di          1\n",
+      "7   mahasiswa          1\n",
+      "8      kampus          1\n",
+      "9     science          1\n",
+      "10       suka          1\n",
+      "Frekuensi kata: 11\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)\n",
+    "word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()\n",
+    "word_frequencies.columns = [\"Kata\", \"Frekuensi\"]\n",
+    "\n",
+    "print(\"\\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\")\n",
+    "print(word_frequencies)\n",
+    "print(f\"Frekuensi kata: {len(word_frequencies)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "NQjExannHuj0"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/NLP/.ipynb_checkpoints/Klasifikasi
+++ b/NLP/.ipynb_checkpoints/Klasifikasi
@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/NLP/.ipynb_checkpoints/N-Gram-checkpoint.ipynb
+++ b/NLP/.ipynb_checkpoints/N-Gram-checkpoint.ipynb
@ -0,0 +1,380 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "JVPdWpz3hhbj"
+   },
+   "source": [
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "4Mvva3v65h1v"
+   },
+   "source": [
+    "# **UNIGRAM**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "1cub_VJnUJMl",
+    "outputId": "1889eb61-4f3f-4780-f42e-02368076cce3"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Corpus: saya suka makan nasi\n",
+      "Tokens (4): ['saya', 'suka', 'makan', 'nasi']\n",
+      "\n",
+      "Frekuensi Unigram dalam kalimat\n",
+      " ('saya'): 1\n",
+      " ('suka'): 1\n",
+      " ('makan'): 1\n",
+      " ('nasi'): 1\n",
+      "\n",
+      "Total unigram dalam 1 kalimat: 4\n",
+      "\n",
+      "Probabilitas masing-masing unigram:\n",
+      " P(saya) = 0.25 (25.00%)\n",
+      " P(suka) = 0.25 (25.00%)\n",
+      " P(makan) = 0.25 (25.00%)\n",
+      " P(nasi) = 0.25 (25.00%)\n",
+      "\n",
+      "Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
+      " P(saya suka makan nasi) = P(saya)=0.25 x P(suka)=0.25 x P(makan)=0.25 x P(nasi)=0.25 = 0.0039 (0.39%)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from collections import Counter\n",
+    "from IPython.display import clear_output\n",
+    "import math\n",
+    "\n",
+    "# 1. Read the sentence and tokenize it\n",
+    "kalimat = input(\"Masukkan kalimat: \").strip()\n",
+    "\n",
+    "# Clear the input prompt from the cell output (best effort, notebook only)\n",
+    "try:\n",
+    "    clear_output()\n",
+    "except Exception:\n",
+    "    pass\n",
+    "\n",
+    "print(f\"Corpus: {kalimat}\")\n",
+    "\n",
+    "# Lowercase and split on whitespace\n",
+    "tokens = kalimat.lower().split()\n",
+    "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
+    "\n",
+    "# 2. Count unigram frequencies\n",
+    "unigram_counts = Counter(tokens)\n",
+    "total_tokens = sum(unigram_counts.values())\n",
+    "\n",
+    "print(\"\\nFrekuensi Unigram dalam kalimat\")\n",
+    "for word, count in unigram_counts.items():\n",
+    "    print(f\" ('{word}'): {count}\")\n",
+    "print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
+    "\n",
+    "# 3. Unigram probability: P(wi) = Count(wi) / total number of tokens\n",
+    "unigram_probabilities = {\n",
+    "    word: count / total_tokens\n",
+    "    for word, count in unigram_counts.items()\n",
+    "}\n",
+    "\n",
+    "print(\"\\nProbabilitas masing-masing unigram:\")\n",
+    "for word, prob in unigram_probabilities.items():\n",
+    "    print(f\" P({word}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
+    "\n",
+    "# 4. Sentence probability: P(kalimat) = P(w1) * P(w2) * ...\n",
+    "p_kalimat = 1\n",
+    "prob_parts = []\n",
+    "\n",
+    "# Multiply token by token and collect each factor for the printed formula\n",
+    "for word in tokens:\n",
+    "    prob_value = unigram_probabilities[word]\n",
+    "    p_kalimat *= prob_value\n",
+    "    # Each factor is rendered as P(word)=value\n",
+    "    prob_parts.append(f\"P({word})={prob_value:.2f}\")\n",
+    "\n",
+    "# Join the factors into the displayed product\n",
+    "prob_str = \" x \".join(prob_parts)\n",
+    "\n",
+    "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n",
+    "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Vstwt996-FrS"
+   },
+   "source": [
+    "# **BIGRAM**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "XRIY4qgTVbjl",
+    "outputId": "ea6e62ce-45a0-40c9-ca98-1fcc30558479"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Corpus: saya makan nasi dan saya makan roti\n",
+      "Tokens (7): ['saya', 'makan', 'nasi', 'dan', 'saya', 'makan', 'roti']\n",
+      "\n",
+      "Frekuensi Bigram dalam kalimat:\n",
+      " ('saya', 'makan'): 2\n",
+      " ('makan', 'nasi'): 1\n",
+      " ('nasi', 'dan'): 1\n",
+      " ('dan', 'saya'): 1\n",
+      " ('makan', 'roti'): 1\n",
+      "\n",
+      "Total bigram dalam 1 kalimat: 6\n",
+      "\n",
+      "Probabilitas masing-masing bigram:\n",
+      " P(makan|saya) = 1.00 (100.00%)\n",
+      " P(nasi|makan) = 0.50 (50.00%)\n",
+      " P(dan|nasi) = 1.00 (100.00%)\n",
+      " P(saya|dan) = 1.00 (100.00%)\n",
+      " P(roti|makan) = 0.50 (50.00%)\n",
+      "\n",
+      "Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
+      " P(saya makan nasi dan saya makan roti) = P(saya)=0.29 x P(makan|saya)=1.00 x P(nasi|makan)=0.50 x P(dan|nasi)=1.00 x P(saya|dan)=1.00 x P(makan|saya)=1.00 x P(roti|makan)=0.50 = 0.071429 (7.14%)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from collections import Counter\n",
+    "from IPython.display import clear_output\n",
+    "import math\n",
+    "\n",
+    "# 1. Read the sentence and tokenize it\n",
+    "kalimat = input(\"Masukkan kalimat: \").strip()\n",
+    "\n",
+    "# Clear the input prompt from the cell output (best effort, notebook only)\n",
+    "try:\n",
+    "    clear_output()\n",
+    "except Exception:\n",
+    "    pass\n",
+    "\n",
+    "print(f\"Corpus: {kalimat}\")\n",
+    "\n",
+    "# Lowercase and split on whitespace; P(w1) is undefined for an empty sentence, so reject it\n",
+    "tokens = kalimat.lower().split()\n",
+    "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
+    "if not tokens:\n",
+    "    raise ValueError(\"Kalimat kosong: masukkan minimal satu kata.\")\n",
+    "\n",
+    "# 2. Count unigrams and bigrams (adjacent token pairs)\n",
+    "unigram_counts = Counter(tokens)\n",
+    "bigrams = list(zip(tokens, tokens[1:]))\n",
+    "bigram_counts = Counter(bigrams)\n",
+    "\n",
+    "print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
+    "for pair, count in bigram_counts.items():\n",
+    "    print(f\" {pair}: {count}\")\n",
+    "print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n",
+    "\n",
+    "# 3. Bigram probability: P(w2 | w1) = Count(w1,w2) / Count(w1)\n",
+    "bigram_probabilities = {\n",
+    "    (w1, w2): count / unigram_counts[w1] for (w1, w2), count in bigram_counts.items()\n",
+    "}\n",
+    "\n",
+    "print(\"\\nProbabilitas masing-masing bigram:\")\n",
+    "for (w1, w2), prob in bigram_probabilities.items():\n",
+    "    print(f\" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
+    "\n",
+    "# 4. Sentence probability under the bigram model\n",
+    "#    P(kalimat) = P(w1) * P(w2|w1) * P(w3|w2) * ...\n",
+    "total_tokens = sum(unigram_counts.values())\n",
+    "p_w1 = unigram_counts[tokens[0]] / total_tokens  # P(w1)\n",
+    "p_kalimat = p_w1  # start the product with P(w1)\n",
+    "\n",
+    "prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]  # first factor of the printed formula\n",
+    "\n",
+    "# Every adjacent pair was counted above, so each lookup is guaranteed to hit\n",
+    "for w1, w2 in bigrams:\n",
+    "    p = bigram_probabilities[(w1, w2)]\n",
+    "    p_kalimat *= p\n",
+    "    prob_str_parts.append(f\"P({w2}|{w1})={p:.2f}\")\n",
+    "\n",
+    "prob_str = \" x \".join(prob_str_parts)  # product formula shown in the output\n",
+    "\n",
+    "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n",
+    "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "E6n1IU8X-G9S"
+   },
+   "source": [
+    "# **TRIGRAM**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "BIRARsj2FHJg",
+    "outputId": "968d420e-9370-40e5-e7e1-148e1d351d62"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Corpus: mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning\n",
+      "Tokens (7): ['mahasiswa', 'mengerjakan', 'tugas', 'kemudian', 'mahasiswa', 'upload', 'e-learning']\n",
+      "\n",
+      "Frekuensi Trigram dalam kalimat:\n",
+      " ('mahasiswa', 'mengerjakan', 'tugas'): 1\n",
+      " ('mengerjakan', 'tugas', 'kemudian'): 1\n",
+      " ('tugas', 'kemudian', 'mahasiswa'): 1\n",
+      " ('kemudian', 'mahasiswa', 'upload'): 1\n",
+      " ('mahasiswa', 'upload', 'e-learning'): 1\n",
+      "\n",
+      "Total trigram dalam 1 kalimat: 5\n",
+      "\n",
+      "Probabilitas masing-masing trigram:\n",
+      " P(tugas|mahasiswa,mengerjakan) = 1.00 (100.00%)\n",
+      " P(kemudian|mengerjakan,tugas) = 1.00 (100.00%)\n",
+      " P(mahasiswa|tugas,kemudian) = 1.00 (100.00%)\n",
+      " P(upload|kemudian,mahasiswa) = 1.00 (100.00%)\n",
+      " P(e-learning|mahasiswa,upload) = 1.00 (100.00%)\n",
+      "\n",
+      "Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
+      " P(mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning) = P(mahasiswa)=0.29 x P(mengerjakan|mahasiswa)=0.50 x P(tugas|mahasiswa,mengerjakan)=1.00 x P(kemudian|mengerjakan,tugas)=1.00 x P(mahasiswa|tugas,kemudian)=1.00 x P(upload|kemudian,mahasiswa)=1.00 x P(e-learning|mahasiswa,upload)=1.00 = 0.142857 (14.29%)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from collections import Counter\n",
+    "from IPython.display import clear_output\n",
+    "import math\n",
+    "\n",
+    "# 1. Read the sentence and tokenize it\n",
+    "kalimat = input(\"Masukkan kalimat: \").strip()\n",
+    "\n",
+    "# Clear the input prompt from the cell output (best effort, notebook only)\n",
+    "try:\n",
+    "    clear_output()\n",
+    "except Exception:\n",
+    "    pass\n",
+    "\n",
+    "print(f\"Corpus: {kalimat}\")\n",
+    "\n",
+    "# Lowercase and split on whitespace\n",
+    "tokens = kalimat.lower().split()\n",
+    "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
+    "if not tokens:\n",
+    "    # P(w1) is undefined for an empty sentence; stop with a clear message\n",
+    "    raise ValueError(\"Kalimat kosong: masukkan minimal satu kata.\")\n",
+    "\n",
+    "# 2. Count bigrams and trigrams (sliding windows of 2 and 3 tokens)\n",
+    "bigrams = list(zip(tokens, tokens[1:]))\n",
+    "trigrams = list(zip(tokens, tokens[1:], tokens[2:]))\n",
+    "\n",
+    "bigram_counts = Counter(bigrams)\n",
+    "trigram_counts = Counter(trigrams)\n",
+    "\n",
+    "print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
+    "for tg, count in trigram_counts.items():\n",
+    "    print(f\" {tg}: {count}\")\n",
+    "print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n",
+    "\n",
+    "# 3. Trigram probability: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)\n",
+    "#    Every trigram starts with a counted bigram, so the denominator is never zero\n",
+    "trigram_probabilities = {\n",
+    "    (w1, w2, w3): count / bigram_counts[(w1, w2)]\n",
+    "    for (w1, w2, w3), count in trigram_counts.items()\n",
+    "}\n",
+    "\n",
+    "print(\"\\nProbabilitas masing-masing trigram:\")\n",
+    "for (w1, w2, w3), prob in trigram_probabilities.items():\n",
+    "    print(f\" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
+    "\n",
+    "# Unigram counts are needed for P(w1) and P(w2|w1)\n",
+    "unigram_counts = Counter(tokens)\n",
+    "total_tokens = sum(unigram_counts.values())\n",
+    "\n",
+    "# 4. Sentence probability under the trigram model\n",
+    "#    P(kalimat) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...\n",
+    "\n",
+    "# a. P(w1)\n",
+    "p_w1 = unigram_counts[tokens[0]] / total_tokens\n",
+    "\n",
+    "# b. P(w2|w1) from the bigram counts (no smoothing)\n",
+    "#    A one-word sentence has no second factor, so it contributes 1.0\n",
+    "if len(tokens) > 1:\n",
+    "    p_w2_w1 = bigram_counts[(tokens[0], tokens[1])] / unigram_counts[tokens[0]]\n",
+    "else:\n",
+    "    p_w2_w1 = 1.0\n",
+    "\n",
+    "p_kalimat = p_w1 * p_w2_w1  # start the product with P(w1) * P(w2|w1)\n",
+    "\n",
+    "# Factors of the printed formula\n",
+    "prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]\n",
+    "if len(tokens) > 1:\n",
+    "    prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}\")\n",
+    "\n",
+    "# c. Multiply in P(wi | wi-2, wi-1) for every i >= 3\n",
+    "#    Every window was counted above, so each lookup is guaranteed to hit\n",
+    "for w1, w2, w3 in trigrams:\n",
+    "    p = trigram_probabilities[(w1, w2, w3)]\n",
+    "    p_kalimat *= p\n",
+    "    prob_str_parts.append(f\"P({w3}|{w1},{w2})={p:.2f}\")\n",
+    "\n",
+    "prob_str = \" x \".join(prob_str_parts)\n",
+    "\n",
+    "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n",
+    "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--- a/NLP/Klasifikasi
+++ b/NLP/Klasifikasi
@ -0,0 +1,151 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "f4a1399a-f23d-4060-a07e-bce5a5c7ddac",
+   "metadata": {},
+   "source": [
+    "# Klasifikasi Teks\n",
+    "## Arif R Dwiyanto"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "53a214ae-c9cf-4d46-925d-068f1685537b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "=== Classification Report ===\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "    negative       0.00      0.00      0.00       1.0\n",
+      "    positive       0.00      0.00      0.00       1.0\n",
+      "\n",
+      "    accuracy                           0.00       2.0\n",
+      "   macro avg       0.00      0.00      0.00       2.0\n",
+      "weighted avg       0.00      0.00      0.00       2.0\n",
+      "\n",
+      "=== Confusion Matrix ===\n",
+      "[[0 1]\n",
+      " [1 0]]\n",
+      "\n",
+      "Prediksi untuk: barang buruk, saya kecewa\n",
+      "Hasil: negative\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ---------------------------------------------------------\n",
+    "# Text classification with TF-IDF + a feedforward neural network\n",
+    "# ---------------------------------------------------------\n",
+    "\n",
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.neural_network import MLPClassifier\n",
+    "from sklearn.metrics import classification_report, confusion_matrix\n",
+    "\n",
+    "# -----------------------------------------\n",
+    "# 1. Example dataset\n",
+    "# -----------------------------------------\n",
+    "# You can replace this dataset with another one (CSV, JSON, etc.)\n",
+    "\n",
+    "data = {\n",
+    "    \"text\": [\n",
+    "        \"Saya suka produk ini, luar biasa\",\n",
+    "        \"Layanannya buruk, sangat kecewa\",\n",
+    "        \"Pembelian terbaik yang pernah saya lakukan\",\n",
+    "        \"Saya benci produk ini, buang-buang uang\",\n",
+    "        \"Kualitasnya sangat bagus, direkomendasikan\",\n",
+    "        \"Pengalaman buruk, tidak akan membeli lagi\"\n",
+    "    ],\n",
+    "    \"label\": [\"positive\", \"negative\", \"positive\", \"negative\", \"positive\", \"negative\"]\n",
+    "}\n",
+    "\n",
+    "df = pd.DataFrame(data)\n",
+    "\n",
+    "# -----------------------------------------\n",
+    "# 2. Train/test split\n",
+    "# -----------------------------------------\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    df[\"text\"], df[\"label\"], test_size=0.3, random_state=42\n",
+    ")\n",
+    "\n",
+    "# -----------------------------------------\n",
+    "# 3. TF-IDF vectorization (fit on the training texts only)\n",
+    "# -----------------------------------------\n",
+    "tfidf = TfidfVectorizer(max_features=5000)\n",
+    "X_train_tfidf = tfidf.fit_transform(X_train)\n",
+    "X_test_tfidf = tfidf.transform(X_test)\n",
+    "\n",
+    "# -----------------------------------------\n",
+    "# 4. Feedforward ANN (MLPClassifier)\n",
+    "# -----------------------------------------\n",
+    "model = MLPClassifier(\n",
+    "    hidden_layer_sizes=(256, 64),\n",
+    "    activation='relu',\n",
+    "    solver='adam',\n",
+    "    max_iter=500,\n",
+    "    random_state=42\n",
+    ")\n",
+    "\n",
+    "model.fit(X_train_tfidf, y_train)\n",
+    "\n",
+    "# -----------------------------------------\n",
+    "# 5. Evaluate the model (only 2 test samples here, so the metrics are illustrative)\n",
+    "# -----------------------------------------\n",
+    "y_pred = model.predict(X_test_tfidf)\n",
+    "\n",
+    "print(\"=== Classification Report ===\")\n",
+    "print(classification_report(y_test, y_pred))\n",
+    "\n",
+    "print(\"=== Confusion Matrix ===\")\n",
+    "print(confusion_matrix(y_test, y_pred))\n",
+    "\n",
+    "# -----------------------------------------\n",
+    "# 6. Predict a new text\n",
+    "# -----------------------------------------\n",
+    "# To try a positive example instead, use: [\"barang bagus luar biasa\"]\n",
+    "sample_text = [\"barang buruk, saya kecewa\"]\n",
+    "sample_vec = tfidf.transform(sample_text)\n",
+    "prediction = model.predict(sample_vec)\n",
+    "\n",
+    "print(\"\\nPrediksi untuk:\", sample_text[0])\n",
+    "print(\"Hasil:\", prediction[0])\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9f7d90fe-4af4-446c-9547-c9312bfa6fc7",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/README.md
+++ b/README.md
@ -5,3 +5,4 @@
 - Machine Learning
 - Big Data
 - Data Mining
+- Data Management