Feedforward ANN Text classification

This commit is contained in:
Arif Dwiyanto 2025-11-15 05:08:25 +00:00
parent fd6d17f1ab
commit fc0b273149
16 changed files with 2104 additions and 2 deletions

View File

@ -1 +1,8 @@
# Practicum Materials Compilation
## Odd Semester 2025/2026
- NLP
- Machine Learning
- Big Data
- Data Mining
- Data Management

View File

@ -0,0 +1,75 @@
# Input the number of documents
import pandas as pd

n = int(input("Masukkan jumlah dokumen yang ingin dimasukkan: "))

# Input the document texts one by one
documents = []
for i in range(n):
    teks = input(f"Masukkan teks untuk dokumen ke-{i+1}: ")
    documents.append(teks)

print("\n=== Dokumen yang Dimasukkan ===")
for i, doc in enumerate(documents):
    print(f"Doc {i+1}: {doc}")

# Tokenization step
tokenized_docs = []
for doc in documents:
    tokens = doc.lower().split()
    tokenized_docs.append(tokens)

print("\n=== Hasil Tokenisasi ===")
for i, tokens in enumerate(tokenized_docs):
    print(f"Doc {i+1}: {tokens}")

# Build the corpus (all words from all documents)
corpus_all = [word for doc in tokenized_docs for word in doc]

print("\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===")
print(corpus_all)
print(f"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}")

# Build the vocabulary (unique words)
vocabulary = sorted(set(corpus_all))

print("\n=== Vocabulary (Kata Unik) ===")
print(vocabulary)
print(f"Jumlah kata unik (vocabulary size): {len(vocabulary)}")

# Print the vocabulary again as a numbered list
print("\n=== Vocabulary (Kata Unik) ===")
for idx, word in enumerate(vocabulary, start=1):
    print(f"{idx:>2}. {word}")
print(f"\nJumlah kata unik (vocabulary size): {len(vocabulary)}")

# Numeric representation (Bag-of-Words matrix)
bow_matrix = []
for doc in tokenized_docs:
    vector = [doc.count(word) for word in vocabulary]
    bow_matrix.append(vector)

df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)
df_bow.index = [f"D{i}" for i in range(1, len(documents)+1)]  # relabel the index as D1, D2, D3, ...

print("\n=== Matriks Bag of Words ===")
print(df_bow)

# Word-frequency table (totals across all documents)
word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()
word_frequencies.columns = ["Kata", "Frekuensi"]

print("\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===")
print(word_frequencies)
print(f"Frekuensi kata: {len(word_frequencies)}")

View File

@ -0,0 +1,84 @@
# ---------------------------------------------------------
# Text Classification with TF-IDF + a Feedforward Neural Network
# ---------------------------------------------------------
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# -----------------------------------------
# 1. Example Dataset
# -----------------------------------------
# You can replace this with any other dataset (CSV, JSON, etc.)
data = {
    "text": [
        "Saya suka produk ini, luar biasa",
        "Layanannya buruk, sangat kecewa",
        "Pembelian terbaik yang pernah saya lakukan",
        "Saya benci produk ini, buang-buang uang",
        "Kualitasnya sangat bagus, direkomendasikan",
        "Pengalaman buruk, tidak akan membeli lagi"
    ],
    "label": ["positive", "negative", "positive", "negative", "positive", "negative"]
}
df = pd.DataFrame(data)

# -----------------------------------------
# 2. Train & Test Split
# -----------------------------------------
# Note: with only six samples the split (and the evaluation below) is highly unstable;
# use a real dataset for meaningful scores.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.3, random_state=42
)

# -----------------------------------------
# 3. TF-IDF Vectorization
# -----------------------------------------
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# -----------------------------------------
# 4. Feedforward ANN (MLPClassifier)
# -----------------------------------------
model = MLPClassifier(
    hidden_layer_sizes=(256, 64),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42
)
model.fit(X_train_tfidf, y_train)

# -----------------------------------------
# 5. Model Evaluation
# -----------------------------------------
y_pred = model.predict(X_test_tfidf)
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

# -----------------------------------------
# 6. Predicting New Text
# -----------------------------------------
# sample_text = ["barang bagus luar biasa"]  # alternative (positive) example; the line below overrides it
sample_text = ["barang buruk, saya kecewa"]
sample_vec = tfidf.transform(sample_text)
prediction = model.predict(sample_vec)
print("\nPrediksi untuk:", sample_text[0])
print("Hasil:", prediction[0])

View File

@ -0,0 +1,209 @@
from collections import Counter
from IPython.display import clear_output
import math

# 1. Read a sentence and tokenize it
kalimat = input("Masukkan kalimat: ").strip()

# Clear the prompt output (notebook environments only)
try:
    clear_output()
except Exception:
    pass

print(f"Corpus: {kalimat}")

# Tokenize
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Count unigram frequencies
unigram_counts = Counter(tokens)
total_tokens = sum(unigram_counts.values())

print("\nFrekuensi Unigram dalam kalimat")
for pair, count in unigram_counts.items():
    print(f" ('{pair}'): {count}")
print(f"\nTotal unigram dalam 1 kalimat: {total_tokens}")

# 3. Unigram probability: P(wi) = Count(wi) / total number of words
unigram_probabilities = {}
for word, count in unigram_counts.items():
    prob = count / total_tokens
    unigram_probabilities[word] = prob

print("\nProbabilitas masing-masing unigram:")
for word, prob in unigram_probabilities.items():
    print(f" P({word}) = {prob:.2f} ({prob*100:.2f}%)")

# 4. Whole-sentence probability: P(sentence) = P(w1) * P(w2) * ...
p_kalimat = 1
prob_parts = []

# Accumulate the total probability and build the detailed formula string
for word in tokens:
    prob_value = unigram_probabilities[word]
    p_kalimat *= prob_value
    # Format: P(word)=prob_value
    prob_parts.append(f"P({word})={prob_value:.2f}")

# Join the formula parts into the detailed prob_str
prob_str = " x ".join(prob_parts)

print("\nProbabilitas Keseluruhan Kalimat (Model Unigram):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)")
from collections import Counter
from IPython.display import clear_output
import math

# 1. Read a sentence and tokenize it
kalimat = input("Masukkan kalimat: ").strip()

# Clear the prompt output (notebook environments only)
try:
    clear_output()
except Exception:
    pass

print(f"Corpus: {kalimat}")

# Tokenization
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Count unigram and bigram frequencies
unigram_counts = Counter(tokens)
bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]
bigram_counts = Counter(bigrams)

print("\nFrekuensi Bigram dalam kalimat:")
for pair, count in bigram_counts.items():
    print(f" {pair}: {count}")
print(f"\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}")

# 3. Bigram probability: P(w2 | w1) = Count(w1,w2) / Count(w1)
bigram_probabilities = {}
for (w1, w2), count in bigram_counts.items():
    prob = count / unigram_counts[w1]
    bigram_probabilities[(w1, w2)] = prob

print("\nProbabilitas masing-masing bigram:")
for (w1, w2), prob in bigram_probabilities.items():
    print(f" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)")

# 4. Whole-sentence probability (bigram model)
# P(sentence) = P(w1) * P(w2|w1) * P(w3|w2) * ...
total_tokens = sum(unigram_counts.values())
p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens  # P(w1)
p_kalimat = p_w1  # initialize with P(w1)

prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]  # add P(w1) to the formula

for i in range(1, len(tokens)):
    pair = (tokens[i-1], tokens[i])
    p = bigram_probabilities.get(pair, 0)
    p_kalimat *= p
    prob_str_parts.append(f"P({pair[1]}|{pair[0]})={p:.2f}")

# Join the multiplication formula for display
prob_str = " x ".join(prob_str_parts)

print("\nProbabilitas Keseluruhan Kalimat (Model Bigram):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")
from collections import Counter
from IPython.display import clear_output
import math

# 1. Read a sentence and tokenize it
kalimat = input("Masukkan kalimat: ").strip()

# Clear the prompt output (notebook environments only)
try:
    clear_output()
except Exception:
    pass

print(f"Corpus: {kalimat}")

# Tokenization
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Count bigram and trigram frequencies
bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]
trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]
bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)

print("\nFrekuensi Trigram dalam kalimat:")
for tg, count in trigram_counts.items():
    print(f" {tg}: {count}")
print(f"\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}")

# 3. Trigram probability: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)
trigram_probabilities = {}
for (w1, w2, w3), count in trigram_counts.items():
    # Avoid division by zero (in case a bigram never occurs)
    if bigram_counts[(w1, w2)] > 0:
        prob = count / bigram_counts[(w1, w2)]
    else:
        prob = 0
    trigram_probabilities[(w1, w2, w3)] = prob

print("\nProbabilitas masing-masing trigram:")
for (w1, w2, w3), prob in trigram_probabilities.items():
    print(f" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)")

# Unigram counts are also needed (for P(w1) and P(w2|w1))
unigram_counts = Counter(tokens)
total_tokens = sum(unigram_counts.values())

# 4. Whole-sentence probability (trigram model)
# P(sentence) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...

# a. P(w1)
p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0

# b. P(w2|w1) (bigram estimate, no smoothing)
if len(tokens) > 1:
    count_w1 = unigram_counts.get(tokens[0], 1)  # avoid division by zero
    p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1
else:
    p_w2_w1 = 1.0  # single-word sentence

p_kalimat = p_w1 * p_w2_w1  # initialize with P(w1) * P(w2|w1)

# Formula parts for display
prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]
if len(tokens) > 1:
    prob_str_parts.append(f"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}")

# c. Multiply in the trigram terms P(wi | wi-2, wi-1) for i >= 3
for i in range(len(tokens) - 2):
    triplet = (tokens[i], tokens[i+1], tokens[i+2])
    p = trigram_probabilities.get(triplet, 0)
    p_kalimat *= p
    prob_str_parts.append(f"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}")

prob_str = " x ".join(prob_str_parts)

print("\nProbabilitas Keseluruhan Kalimat (Model Trigram):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")

View File

@ -0,0 +1,213 @@
name = 'Fred'
# Using the old .format() method:
print('His name is {var}.'.format(var=name))
# Using f-strings:
print(f'His name is {name}.')
print(f'His name is {name!r}')
d = {'a':123,'b':456}
# Reusing the same quote style inside the f-string is a SyntaxError on Python versions before 3.12:
print(f'Address: {d['a']} Main Street')
# Fix: use different quotation marks for the outer string and the dictionary key
d = {'a':123,'b':456}
print(f"Address: {d['a']} Main Street")
library = [('Author', 'Topic', 'Pages'), ('Twain', 'Rafting', 601), ('Feynman', 'Physics', 95), ('Hamilton', 'Mythology', 144)]
for book in library:
    print(f'{book[0]:{10}} {book[1]:{8}} {book[2]:{7}}')
for book in library:
    print(f'{book[0]:{10}} {book[1]:{10}} {book[2]:.>{7}}') # here .> was added
from datetime import datetime
today = datetime(year=2018, month=1, day=27)
print(f'{today:%B %d, %Y}')
%%writefile test.txt
Hello, this is a quick test file.
This is the second line of the file.
# Trying to open a file that does not exist raises a FileNotFoundError:
myfile = open('whoops.txt')
# pwd is an IPython magic that prints the current working directory
pwd
# Open the test.txt file we created earlier
my_file = open('test.txt')
my_file
# We can now read the file
my_file.read()
# But what happens if we try to read it again?
my_file.read()
# Seek to the start of file (index 0)
my_file.seek(0)
# Now read again
my_file.read()
# Readlines returns a list of the lines in the file
my_file.seek(0)
my_file.readlines()
my_file.close()
# Add a second argument to the function, 'w' which stands for write.
# Passing 'w+' lets us read and write to the file
my_file = open('test.txt','w+')
# Write to the file
my_file.write('This is a new first line')
# Read the file
my_file.seek(0)
my_file.read()
my_file.close() # always do this when you're done with a file
my_file = open('test.txt','a+')
my_file.write('\nThis line is being appended to test.txt')
my_file.write('\nAnd another line here.')
my_file.seek(0)
print(my_file.read())
my_file.close()
%%writefile -a test.txt
This is more text being appended to test.txt
And another line here.
with open('test.txt','r') as txt:
    first_line = txt.readlines()[0]
print(first_line)
# The file is closed once the with-block ends, so reading it again raises a ValueError:
txt.read()
with open('test.txt','r') as txt:
    for line in txt:
        print(line, end='')  # the end='' argument removes extra linebreaks
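# Hedged add-on sketch, not part of the original material: pathlib offers a compact way to
# read a whole file without managing open/seek/close by hand (assumes test.txt exists from
# the cells above).
from pathlib import Path
print(Path('test.txt').read_text())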

View File

@ -0,0 +1,145 @@
# Import spaCy and load the language library
import spacy
nlp = spacy.load('en_core_web_sm')
# Create a Doc object
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')
# Print each token separately
for token in doc:
    print(token.text, token.pos_, token.dep_)
nlp.pipeline
nlp.pipe_names
doc2 = nlp(u"Tesla isn't looking into startups anymore.")
for token in doc2:
    print(token.text, token.pos_, token.dep_)
doc2
doc2[0]
type(doc2)
doc2[0].pos_
doc2[0].dep_
spacy.explain('PROPN')
spacy.explain('nsubj')
# Lemmas (the base form of the word):
print(doc2[4].text)
print(doc2[4].lemma_)
# Simple Parts-of-Speech & Detailed Tags:
print(doc2[4].pos_)
print(doc2[4].tag_ + ' / ' + spacy.explain(doc2[4].tag_))
# Word Shapes:
print(doc2[0].text+': '+doc2[0].shape_)
print(doc[5].text+' : '+doc[5].shape_)
# Boolean Values:
print(doc2[0].is_alpha)
print(doc2[0].is_stop)
doc3 = nlp(u'Although commonly attributed to John Lennon from his song "Beautiful Boy", \
the phrase "Life is what happens to us while we are making other plans" was written by \
cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.')
life_quote = doc3[16:30]
print(life_quote)
type(life_quote)
doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')
for sent in doc4.sents:
    print(sent)
doc4[6].is_sent_start
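# Hedged add-on sketch, not part of the original material (assumes nlp and doc2 from above
# are in scope): a compact per-token summary combining the attributes shown piecemeal above.
for token in doc2:
    print(f'{token.text:<12} {token.lemma_:<12} {token.pos_:<6} {token.tag_:<6} {token.dep_:<10} {token.shape_:<8} {str(token.is_alpha):<6} {token.is_stop}')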

View File

@ -0,0 +1,188 @@
# Import spaCy and load the language library
import spacy
nlp = spacy.load('en_core_web_sm')
# Create a string that includes opening and closing quotation marks
mystring = '"We\'re moving to L.A.!"'
print(mystring)
# Create a Doc object and explore tokens
doc = nlp(mystring)
for token in doc:
    print(token.text, end=' | ')
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")
for t in doc2:
    print(t)
doc3 = nlp(u'A 5km NYC cab ride costs $10.30')
for t in doc3:
    print(t)
doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.")
for t in doc4:
    print(t)
len(doc)
len(doc.vocab)
doc5 = nlp(u'It is better to give than to receive.')
# Retrieve the third token:
doc5[2]
# Retrieve three tokens from the middle:
doc5[2:5]
# Retrieve the last four tokens:
doc5[-4:]
doc6 = nlp(u'My dinner was horrible.')
doc7 = nlp(u'Your dinner was delicious.')
# Try to change "My dinner was horrible" to "My dinner was delicious"
# (spaCy Doc objects do not support item assignment, so this raises an error):
doc6[3] = doc7[3]
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')
for token in doc8:
    print(token.text, end=' | ')
print('\n----')
for ent in doc8.ents:
    print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
len(doc8.ents)
doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")
for chunk in doc9.noun_chunks:
    print(chunk.text)
doc10 = nlp(u"Red cars do not carry higher insurance rates.")
for chunk in doc10.noun_chunks:
    print(chunk.text)
doc11 = nlp(u"He was a one-eyed, one-horned, flying, purple people-eater.")
for chunk in doc11.noun_chunks:
    print(chunk.text)
from spacy import displacy
doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(doc, style='ent', jupyter=True)
doc = nlp(u'This is a sentence.')
# displacy.serve starts a local web server and blocks the cell; inside Jupyter, displacy.render is usually the better choice
displacy.serve(doc, style='dep')
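# Hedged add-on sketch, not part of the original material (assumes nlp and displacy from
# above): with jupyter=False, displacy.render returns markup that can be written to a file
# instead of being displayed; 'sentence_dep.svg' is a hypothetical output file name.
svg = displacy.render(nlp(u'This is a sentence.'), style='dep', jupyter=False)
with open('sentence_dep.svg', 'w', encoding='utf-8') as f:
    f.write(svg)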

View File

@ -0,0 +1,107 @@
# RUN THIS CELL to perform standard imports:
import spacy
nlp = spacy.load('en_core_web_sm')
# Enter your code here:
with open('../TextFiles/owlcreek.txt') as f:
    doc = nlp(f.read())
# Run this cell to verify it worked:
doc[:36]
len(doc)
sents = [sent for sent in doc.sents]
len(sents)
print(sents[1].text)
# NORMAL SOLUTION:
for token in sents[1]:
    print(token.text, token.pos_, token.dep_, token.lemma_)
# CHALLENGE SOLUTION:
for token in sents[1]:
    print(f'{token.text:{15}} {token.pos_:{5}} {token.dep_:{10}} {token.lemma_:{15}}')
# Import the Matcher library:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
# Create a pattern and add it to matcher:
pattern = [{'LOWER': 'swimming'}, {'IS_SPACE': True, 'OP':'*'}, {'LOWER': 'vigorously'}]
matcher.add('Swimming', None, pattern)  # spaCy v2 signature; with spaCy v3 this becomes matcher.add('Swimming', [pattern])
# Create a list of matches called "found_matches" and print the list:
found_matches = matcher(doc)
print(found_matches)
print(doc[1265:1290])
print(doc[3600:3615])
for sent in sents:
    if found_matches[0][1] < sent.end:
        print(sent)
        break
for sent in sents:
    if found_matches[1][1] < sent.end:
        print(sent)
        break
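# Hedged add-on sketch, not part of the original material (assumes nlp, doc, and
# found_matches from above are in scope): each match tuple is (match_id, start, end);
# the pattern name and the matched span can be recovered like this.
for match_id, start, end in found_matches:
    print(nlp.vocab.strings[match_id], '->', doc[start:end].text)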

View File

@ -0,0 +1,107 @@
# Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')
# Create a simple Doc object
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
# Print the full text:
print(doc.text)
# Print the fifth word and associated tags:
print(doc[4].text, doc[4].pos_, doc[4].tag_, spacy.explain(doc[4].tag_))
for token in doc:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')
doc = nlp(u'I read books on NLP.')
r = doc[1]
print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')
doc = nlp(u'I read a book on NLP.')
r = doc[1]
print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
# Count the frequencies of different coarse-grained POS tags:
POS_counts = doc.count_by(spacy.attrs.POS)
POS_counts
doc.vocab[83].text
for k,v in sorted(POS_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{5}}: {v}')
# Count the different fine-grained tags:
TAG_counts = doc.count_by(spacy.attrs.TAG)
for k,v in sorted(TAG_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{4}}: {v}')
# Count the different dependencies:
DEP_counts = doc.count_by(spacy.attrs.DEP)
for k,v in sorted(DEP_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{4}}: {v}')
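# Hedged add-on sketch, not part of the original material (assumes doc and POS_counts from
# above are in scope): the numeric attribute IDs can be mapped to readable tag names in one
# step with a dict comprehension.
readable_pos_counts = {doc.vocab[k].text: v for k, v in POS_counts.items()}
print(readable_pos_counts)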

View File

@ -0,0 +1,35 @@
# Perform standard imports
import spacy
nlp = spacy.load('en_core_web_sm')
# Import the game script
import game
# Enter your text here:
text = u"The quick brown fox jumped over the lazy dog's back."
# Make your Doc object and pass it into the scorer:
doc = nlp(text)
print(game.scorer(doc))
# For practice, visualize your fine-grained POS tags (shown in the third column):
print(f"{'TOKEN':{10}} {'COARSE':{8}} {'FINE':{6}} {'DESCRIPTION'}")
print(f"{'-----':{10}} {'------':{8}} {'----':{6}} {'-----------'}")
for token in doc:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')

View File

@ -0,0 +1,84 @@
# ---------------------------------------------------------
# Text Classification with TF-IDF + a Feedforward Neural Network
# ---------------------------------------------------------
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# -----------------------------------------
# 1. Example Dataset
# -----------------------------------------
# You can replace this with any other dataset (CSV, JSON, etc.)
data = {
    "text": [
        "Saya suka produk ini, luar biasa",
        "Layanannya buruk, sangat kecewa",
        "Pembelian terbaik yang pernah saya lakukan",
        "Saya benci produk ini, buang-buang uang",
        "Kualitasnya sangat bagus, direkomendasikan",
        "Pengalaman buruk, tidak akan membeli lagi"
    ],
    "label": ["positive", "negative", "positive", "negative", "positive", "negative"]
}
df = pd.DataFrame(data)

# -----------------------------------------
# 2. Train & Test Split
# -----------------------------------------
# Note: with only six samples the split (and the evaluation below) is highly unstable;
# use a real dataset for meaningful scores.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.3, random_state=42
)

# -----------------------------------------
# 3. TF-IDF Vectorization
# -----------------------------------------
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# -----------------------------------------
# 4. Feedforward ANN (MLPClassifier)
# -----------------------------------------
model = MLPClassifier(
    hidden_layer_sizes=(256, 64),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=42
)
model.fit(X_train_tfidf, y_train)

# -----------------------------------------
# 5. Model Evaluation
# -----------------------------------------
y_pred = model.predict(X_test_tfidf)
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

# -----------------------------------------
# 6. Predicting New Text
# -----------------------------------------
# sample_text = ["barang bagus luar biasa"]  # alternative (positive) example; the line below overrides it
sample_text = ["barang buruk, saya kecewa"]
sample_vec = tfidf.transform(sample_text)
prediction = model.predict(sample_vec)
print("\nPrediksi untuk:", sample_text[0])
print("Hasil:", prediction[0])

View File

@ -0,0 +1,310 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qBYcPYAb059g",
"outputId": "9f57b704-da1b-4495-d366-24c30586dc76"
},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Masukkan jumlah dokumen yang ingin dimasukkan: 3\n"
]
}
],
"source": [
"# Input jumlah dokumen\n",
"import pandas as pd\n",
"n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mo-yt5Ob1N8j",
"outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Masukkan teks untuk dokumen ke-1: saya belajar nlp di kampus\n",
"Masukkan teks untuk dokumen ke-2: saya suka belajar ai\n",
"Masukkan teks untuk dokumen ke-3: mahasiswa belajar data science dan nlp\n",
"\n",
"=== Dokumen yang Dimasukkan ===\n",
"Doc 1: saya belajar nlp di kampus\n",
"Doc 2: saya suka belajar ai\n",
"Doc 3: mahasiswa belajar data science dan nlp\n"
]
}
],
"source": [
"# Input teks dokumen satu per satu\n",
"documents = []\n",
"for i in range(n):\n",
" teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n",
" documents.append(teks)\n",
"\n",
"print(\"\\n=== Dokumen yang Dimasukkan ===\")\n",
"for i, doc in enumerate(documents):\n",
" print(f\"Doc {i+1}: {doc}\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FkmxRAFq1oDK",
"outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Hasil Tokenisasi ===\n",
"Doc 1: ['saya', 'belajar', 'nlp', 'di', 'kampus']\n",
"Doc 2: ['saya', 'suka', 'belajar', 'ai']\n",
"Doc 3: ['mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n"
]
}
],
"source": [
"# Tahap Tokenisasi\n",
"tokenized_docs = []\n",
"for doc in documents:\n",
" tokens = doc.lower().split()\n",
" tokenized_docs.append(tokens)\n",
"\n",
"print(\"\\n=== Hasil Tokenisasi ===\")\n",
"for i, tokens in enumerate(tokenized_docs):\n",
" print(f\"Doc {i+1}: {tokens}\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ybC1Vo2C_c3q",
"outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
"['saya', 'belajar', 'nlp', 'di', 'kampus', 'saya', 'suka', 'belajar', 'ai', 'mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n",
"Jumlah total kata dalam seluruh dokumen: 15\n"
]
}
],
"source": [
"# Pembuatan Corpus\n",
"corpus_all = [word for doc in tokenized_docs for word in doc]\n",
"\n",
"print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n",
"print(corpus_all)\n",
"print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "s6S-Ma4R1xuq",
"outputId": "98c3685b-1798-4038-d17e-6e45ca419b51"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Vocabulary (Kata Unik) ===\n",
"['ai', 'belajar', 'dan', 'data', 'di', 'kampus', 'mahasiswa', 'nlp', 'saya', 'science', 'suka']\n",
"Jumlah kata unik (vocabulary size): 11\n",
"\n",
"=== Vocabulary (Kata Unik) ===\n",
" 1. ai\n",
" 2. belajar\n",
" 3. dan\n",
" 4. data\n",
" 5. di\n",
" 6. kampus\n",
" 7. mahasiswa\n",
" 8. nlp\n",
" 9. saya\n",
"10. science\n",
"11. suka\n",
"\n",
"Jumlah kata unik (vocabulary size): 11\n"
]
}
],
"source": [
"# Pembuatan Vocabulary\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"print(vocabulary)\n",
"print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n",
"\n",
"\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"for idx, word in enumerate(vocabulary, start=1):\n",
" print(f\"{idx:>2}. {word}\")\n",
"print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "ShevCTva2Fg9"
},
"outputs": [],
"source": [
"# Representasi Numerik (Matriks BoW)\n",
"bow_matrix = []\n",
"for doc in tokenized_docs:\n",
" vector = [doc.count(word) for word in vocabulary]\n",
" bow_matrix.append(vector)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-yB6D2pY2M0E",
"outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Matriks Bag of Words ===\n",
" ai belajar dan data di kampus mahasiswa nlp saya science suka\n",
"D1 0 1 0 0 1 1 0 1 1 0 0\n",
"D2 1 1 0 0 0 0 0 0 1 0 1\n",
"D3 0 1 1 1 0 0 1 1 0 1 0\n"
]
}
],
"source": [
"df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)\n",
"df_bow.index = [f\"D{i}\" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3\n",
"\n",
"print(\"\\n=== Matriks Bag of Words ===\")\n",
"print(df_bow)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8ruf5vKL2rGD",
"outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n",
" Kata Frekuensi\n",
"0 belajar 3\n",
"1 nlp 2\n",
"2 saya 2\n",
"3 dan 1\n",
"4 ai 1\n",
"5 data 1\n",
"6 di 1\n",
"7 mahasiswa 1\n",
"8 kampus 1\n",
"9 science 1\n",
"10 suka 1\n",
"Frekuensi kata: 11\n"
]
}
],
"source": [
"# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)\n",
"word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()\n",
"word_frequencies.columns = [\"Kata\", \"Frekuensi\"]\n",
"\n",
"print(\"\\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\")\n",
"print(word_frequencies)\n",
"print(f\"Frekuensi kata: {len(word_frequencies)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NQjExannHuj0"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,380 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "JVPdWpz3hhbj"
},
"source": [
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4Mvva3v65h1v"
},
"source": [
"# **UNIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1cub_VJnUJMl",
"outputId": "1889eb61-4f3f-4780-f42e-02368076cce3"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: saya suka makan nasi\n",
"Tokens (4): ['saya', 'suka', 'makan', 'nasi']\n",
"\n",
"Frekuensi Unigram dalam kalimat\n",
" ('saya'): 1\n",
" ('suka'): 1\n",
" ('makan'): 1\n",
" ('nasi'): 1\n",
"\n",
"Total unigram dalam 1 kalimat: 4\n",
"\n",
"Probabilitas masing-masing unigram:\n",
" P(saya) = 0.25 (25.00%)\n",
" P(suka) = 0.25 (25.00%)\n",
" P(makan) = 0.25 (25.00%)\n",
" P(nasi) = 0.25 (25.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
" P(saya suka makan nasi) = P(saya)=0.25 x P(suka)=0.25 x P(makan)=0.25 x P(nasi)=0.25 = 0.0039 (0.39%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenize\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Unigram\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"print(\"\\nFrekuensi Unigram dalam kalimat\")\n",
"for pair, count in unigram_counts.items():\n",
" print(f\" ('{pair}'): {count}\")\n",
"print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
"\n",
"# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata\n",
"unigram_probabilities = {}\n",
"for word, count in unigram_counts.items():\n",
" prob = count / total_tokens\n",
" unigram_probabilities[word] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing unigram:\")\n",
"for word, prob in unigram_probabilities.items():\n",
" print(f\" P({word}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (P(kalimat) = P(w1) * P(w2) * ...)\n",
"p_kalimat = 1\n",
"prob_parts = []\n",
"\n",
"# Loop untuk menghitung probabilitas total dan membangun string rumus detail\n",
"for word in tokens:\n",
" prob_value = unigram_probabilities[word]\n",
" p_kalimat *= prob_value\n",
" # Format: P(word)=prob_value\n",
" prob_parts.append(f\"P({word})={prob_value:.2f}\")\n",
"\n",
"# Gabungkan bagian-bagian rumus untuk mendapatkan prob_str detail\n",
"prob_str = \" x \".join(prob_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Vstwt996-FrS"
},
"source": [
"# **BIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XRIY4qgTVbjl",
"outputId": "ea6e62ce-45a0-40c9-ca98-1fcc30558479"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: saya makan nasi dan saya makan roti\n",
"Tokens (7): ['saya', 'makan', 'nasi', 'dan', 'saya', 'makan', 'roti']\n",
"\n",
"Frekuensi Bigram dalam kalimat:\n",
" ('saya', 'makan'): 2\n",
" ('makan', 'nasi'): 1\n",
" ('nasi', 'dan'): 1\n",
" ('dan', 'saya'): 1\n",
" ('makan', 'roti'): 1\n",
"\n",
"Total bigram dalam 1 kalimat: 6\n",
"\n",
"Probabilitas masing-masing bigram:\n",
" P(makan|saya) = 1.00 (100.00%)\n",
" P(nasi|makan) = 0.50 (50.00%)\n",
" P(dan|nasi) = 1.00 (100.00%)\n",
" P(saya|dan) = 1.00 (100.00%)\n",
" P(roti|makan) = 0.50 (50.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
" P(saya makan nasi dan saya makan roti) = P(saya)=0.29 x P(makan|saya)=1.00 x P(nasi|makan)=0.50 x P(dan|nasi)=1.00 x P(saya|dan)=1.00 x P(makan|saya)=1.00 x P(roti|makan)=0.50 = 0.071429 (7.14%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenisasi\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Unigram dan Bigram\n",
"unigram_counts = Counter(tokens)\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"bigram_counts = Counter(bigrams)\n",
"\n",
"print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
"for pair, count in bigram_counts.items():\n",
" print(f\" {pair}: {count}\")\n",
"print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n",
"\n",
"# 3. Hitung Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)\n",
"bigram_probabilities = {}\n",
"for (w1, w2), count in bigram_counts.items():\n",
" prob = count / unigram_counts[w1]\n",
" bigram_probabilities[(w1, w2)] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing bigram:\")\n",
"for (w1, w2), prob in bigram_probabilities.items():\n",
" print(f\" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Bigram)\n",
"# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w2) * ...\n",
"total_tokens = sum(unigram_counts.values())\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens # P(w1)\n",
"p_kalimat = p_w1 # Inisialisasi dengan P(w1)\n",
"\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"] # Tambahkan P(w1) ke rumus\n",
"\n",
"for i in range(1, len(tokens)):\n",
" pair = (tokens[i-1], tokens[i])\n",
" p = bigram_probabilities.get(pair, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.2f}\")\n",
"\n",
"# Gabungkan rumus perkalian untuk ditampilkan\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "E6n1IU8X-G9S"
},
"source": [
"# **TRIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BIRARsj2FHJg",
"outputId": "968d420e-9370-40e5-e7e1-148e1d351d62"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning\n",
"Tokens (7): ['mahasiswa', 'mengerjakan', 'tugas', 'kemudian', 'mahasiswa', 'upload', 'e-learning']\n",
"\n",
"Frekuensi Trigram dalam kalimat:\n",
" ('mahasiswa', 'mengerjakan', 'tugas'): 1\n",
" ('mengerjakan', 'tugas', 'kemudian'): 1\n",
" ('tugas', 'kemudian', 'mahasiswa'): 1\n",
" ('kemudian', 'mahasiswa', 'upload'): 1\n",
" ('mahasiswa', 'upload', 'e-learning'): 1\n",
"\n",
"Total trigram dalam 1 kalimat: 5\n",
"\n",
"Probabilitas masing-masing trigram:\n",
" P(tugas|mahasiswa,mengerjakan) = 1.00 (100.00%)\n",
" P(kemudian|mengerjakan,tugas) = 1.00 (100.00%)\n",
" P(mahasiswa|tugas,kemudian) = 1.00 (100.00%)\n",
" P(upload|kemudian,mahasiswa) = 1.00 (100.00%)\n",
" P(e-learning|mahasiswa,upload) = 1.00 (100.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
" P(mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning) = P(mahasiswa)=0.29 x P(mengerjakan|mahasiswa)=0.50 x P(tugas|mahasiswa,mengerjakan)=1.00 x P(kemudian|mengerjakan,tugas)=1.00 x P(mahasiswa|tugas,kemudian)=1.00 x P(upload|kemudian,mahasiswa)=1.00 x P(e-learning|mahasiswa,upload)=1.00 = 0.142857 (14.29%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenisasi\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Bigram dan Trigram\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]\n",
"\n",
"bigram_counts = Counter(bigrams)\n",
"trigram_counts = Counter(trigrams)\n",
"\n",
"print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
"for tg, count in trigram_counts.items():\n",
" print(f\" {tg}: {count}\")\n",
"print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n",
"\n",
"# 3. Hitung Probabilitas Trigram: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)\n",
"trigram_probabilities = {}\n",
"for (w1, w2, w3), count in trigram_counts.items():\n",
" # Hindari pembagian dengan nol (jika ada bigram yang tidak muncul)\n",
" if bigram_counts[(w1, w2)] > 0:\n",
" prob = count / bigram_counts[(w1, w2)]\n",
" else:\n",
" prob = 0\n",
" trigram_probabilities[(w1, w2, w3)] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing trigram:\")\n",
"for (w1, w2, w3), prob in trigram_probabilities.items():\n",
" print(f\" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# Tambahkan perhitungan Unigram Count (dibutuhkan untuk P(w1) dan P(w2|w1))\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Trigram)\n",
"# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...\n",
"\n",
"# a. P(w1)\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0\n",
"\n",
"# b. P(w2|w1) (Menggunakan Bigram tanpa smoothing)\n",
"if len(tokens) > 1:\n",
" count_w1 = unigram_counts.get(tokens[0], 1) # Hindari pembagian dengan 0\n",
" p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1\n",
"else:\n",
" p_w2_w1 = 1.0 # Jika hanya 1 kata\n",
"\n",
"p_kalimat = p_w1 * p_w2_w1 # Inisialisasi dengan P(w1) * P(w2|w1)\n",
"\n",
"# Daftar bagian rumus untuk ditampilkan\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]\n",
"if len(tokens) > 1:\n",
" prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}\")\n",
"\n",
"# c. Perkalian Trigram P(wi | wi-2, wi-1) untuk i >= 3\n",
"for i in range(len(tokens) - 2):\n",
" triplet = (tokens[i], tokens[i+1], tokens[i+2])\n",
" p = trigram_probabilities.get(triplet, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}\")\n",
"\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -0,0 +1,151 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f4a1399a-f23d-4060-a07e-bce5a5c7ddac",
"metadata": {},
"source": [
"# Klasifikasi Teks\n",
"## Arif R Dwiyanto"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "53a214ae-c9cf-4d46-925d-068f1685537b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== Classification Report ===\n",
" precision recall f1-score support\n",
"\n",
" negative 0.00 0.00 0.00 1.0\n",
" positive 0.00 0.00 0.00 1.0\n",
"\n",
" accuracy 0.00 2.0\n",
" macro avg 0.00 0.00 0.00 2.0\n",
"weighted avg 0.00 0.00 0.00 2.0\n",
"\n",
"=== Confusion Matrix ===\n",
"[[0 1]\n",
" [1 0]]\n",
"\n",
"Prediksi untuk: barang buruk, saya kecewa\n",
"Hasil: negative\n"
]
}
],
"source": [
"# ---------------------------------------------------------\n",
"# Klasifikasi Teks dengan TF-IDF + Feedforward Neural Network\n",
"# ---------------------------------------------------------\n",
"\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.neural_network import MLPClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"\n",
"# -----------------------------------------\n",
"# 1. Contoh Dataset\n",
"# -----------------------------------------\n",
"# Anda bisa mengganti dataset ini dengan dataset lain (CSV, JSON, dll)\n",
"\n",
"data = {\n",
" \"text\": [\n",
" \"Saya suka produk ini, luar biasa\",\n",
" \"Layanannya buruk, sangat kecewa\",\n",
" \"Pembelian terbaik yang pernah saya lakukan\",\n",
" \"Saya benci produk ini, buang-buang uang\",\n",
" \"Kualitasnya sangat bagus, direkomendasikan\",\n",
" \"Pengalaman buruk, tidak akan membeli lagi\"\n",
" ],\n",
" \"label\": [\"positive\", \"negative\", \"positive\", \"negative\", \"positive\", \"negative\"]\n",
"}\n",
"\n",
"df = pd.DataFrame(data)\n",
"\n",
"# -----------------------------------------\n",
"# 2. Split Train & Test\n",
"# -----------------------------------------\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" df[\"text\"], df[\"label\"], test_size=0.3, random_state=42\n",
")\n",
"\n",
"# -----------------------------------------\n",
"# 3. TF-IDF Vectorization\n",
"# -----------------------------------------\n",
"tfidf = TfidfVectorizer(max_features=5000)\n",
"X_train_tfidf = tfidf.fit_transform(X_train)\n",
"X_test_tfidf = tfidf.transform(X_test)\n",
"\n",
"# -----------------------------------------\n",
"# 4. Feedforward ANN (MLPClassifier)\n",
"# -----------------------------------------\n",
"model = MLPClassifier(\n",
" hidden_layer_sizes=(256, 64),\n",
" activation='relu',\n",
" solver='adam',\n",
" max_iter=500,\n",
" random_state=42\n",
")\n",
"\n",
"model.fit(X_train_tfidf, y_train)\n",
"\n",
"# -----------------------------------------\n",
"# 5. Evaluasi Model\n",
"# -----------------------------------------\n",
"y_pred = model.predict(X_test_tfidf)\n",
"\n",
"print(\"=== Classification Report ===\")\n",
"print(classification_report(y_test, y_pred))\n",
"\n",
"print(\"=== Confusion Matrix ===\")\n",
"print(confusion_matrix(y_test, y_pred))\n",
"\n",
"# -----------------------------------------\n",
"# 6. Prediksi Teks Baru\n",
"# -----------------------------------------\n",
"sample_text = [\"barang bagus luar biasa\"]\n",
"sample_text = [\"barang buruk, saya kecewa\"]\n",
"sample_vec = tfidf.transform(sample_text)\n",
"prediction = model.predict(sample_vec)\n",
"\n",
"print(\"\\nPrediksi untuk:\", sample_text[0])\n",
"print(\"Hasil:\", prediction[0])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f7d90fe-4af4-446c-9547-c9312bfa6fc7",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -4,4 +4,5 @@
- NLP
- Machine Learning
- Big Data
- Data Mining
- Data Management