From fc0b27314950baf8fd4c4ffcdec9362c4984d6d8 Mon Sep 17 00:00:00 2001 From: Arif Dwiyanto Date: Sat, 15 Nov 2025 05:08:25 +0000 Subject: [PATCH] Feedforward ANN Text classification --- .ipynb_checkpoints/README-checkpoint.md | 9 +- .../NLP/Fitur_Ekstraksi_BOW.ipynb | 75 ++++ .../NLP/Klasifikasi Teks FNN.ipynb | 84 ++++ .virtual_documents/NLP/N-Gram.ipynb | 209 ++++++++++ .../00-Working-with-Text-Files.ipynb | 213 ++++++++++ .../00-Spacy-Basics.ipynb | 145 +++++++ .../01-Tokenization.ipynb | 188 +++++++++ .../07-NLP-Basics-Assessment-Solution.ipynb | 107 +++++ .../00-POS-Basics.ipynb | 107 +++++ .../07-POS-Challenge (optional).ipynb | 35 ++ .virtual_documents/NLP/Untitled.ipynb | 84 ++++ .../Fitur_Ekstraksi_BOW-checkpoint.ipynb | 310 ++++++++++++++ .../Klasifikasi Teks FNN-checkpoint.ipynb | 6 + .../N-Gram-checkpoint.ipynb | 380 ++++++++++++++++++ NLP/Klasifikasi Teks FNN.ipynb | 151 +++++++ README.md | 3 +- 16 files changed, 2104 insertions(+), 2 deletions(-) create mode 100644 .virtual_documents/NLP/Fitur_Ekstraksi_BOW.ipynb create mode 100644 .virtual_documents/NLP/Klasifikasi Teks FNN.ipynb create mode 100644 .virtual_documents/NLP/N-Gram.ipynb create mode 100644 .virtual_documents/NLP/Praktikum Python Code/00-Python-Text-Basics/00-Working-with-Text-Files.ipynb create mode 100644 .virtual_documents/NLP/Praktikum Python Code/01-NLP-Python-Basics/00-Spacy-Basics.ipynb create mode 100644 .virtual_documents/NLP/Praktikum Python Code/01-NLP-Python-Basics/01-Tokenization.ipynb create mode 100644 .virtual_documents/NLP/Praktikum Python Code/01-NLP-Python-Basics/07-NLP-Basics-Assessment-Solution.ipynb create mode 100644 .virtual_documents/NLP/Praktikum Python Code/02-Parts-of-Speech-Tagging/00-POS-Basics.ipynb create mode 100644 .virtual_documents/NLP/Praktikum Python Code/02-Parts-of-Speech-Tagging/07-POS-Challenge (optional).ipynb create mode 100644 .virtual_documents/NLP/Untitled.ipynb create mode 100644 NLP/.ipynb_checkpoints/Fitur_Ekstraksi_BOW-checkpoint.ipynb create mode 100644 NLP/.ipynb_checkpoints/Klasifikasi Teks FNN-checkpoint.ipynb create mode 100644 NLP/.ipynb_checkpoints/N-Gram-checkpoint.ipynb create mode 100644 NLP/Klasifikasi Teks FNN.ipynb diff --git a/.ipynb_checkpoints/README-checkpoint.md b/.ipynb_checkpoints/README-checkpoint.md index a2c9334..6a8a662 100644 --- a/.ipynb_checkpoints/README-checkpoint.md +++ b/.ipynb_checkpoints/README-checkpoint.md @@ -1 +1,8 @@ -# Kompilasi Materi Praktikum \ No newline at end of file +# Kompilasi Materi Praktikum +## Ganjil 2025/2026 + +- NLP +- Machine Learning +- Big Data +- Data Mining +- Data Management \ No newline at end of file diff --git a/.virtual_documents/NLP/Fitur_Ekstraksi_BOW.ipynb b/.virtual_documents/NLP/Fitur_Ekstraksi_BOW.ipynb new file mode 100644 index 0000000..97fc63e --- /dev/null +++ b/.virtual_documents/NLP/Fitur_Ekstraksi_BOW.ipynb @@ -0,0 +1,75 @@ +# Input jumlah dokumen +import pandas as pd +n = int(input("Masukkan jumlah dokumen yang ingin dimasukkan: ")) + + +# Input teks dokumen satu per satu +documents = [] +for i in range(n): + teks = input(f"Masukkan teks untuk dokumen ke-{i+1}: ") + documents.append(teks) + +print("\n=== Dokumen yang Dimasukkan ===") +for i, doc in enumerate(documents): + print(f"Doc {i+1}: {doc}") + + +# Tahap Tokenisasi +tokenized_docs = [] +for doc in documents: + tokens = doc.lower().split() + tokenized_docs.append(tokens) + +print("\n=== Hasil Tokenisasi ===") +for i, tokens in enumerate(tokenized_docs): + print(f"Doc {i+1}: {tokens}") + + +# Pembuatan Corpus +corpus_all = [word for doc in tokenized_docs for word in doc] + +print("\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===") +print(corpus_all) +print(f"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}") + + +# Pembuatan Vocabulary +vocabulary = sorted(set(corpus_all)) + +print("\n=== Vocabulary (Kata Unik) ===") +print(vocabulary) +print(f"Jumlah kata unik (vocabulary size): {len(vocabulary)}") + + +vocabulary = sorted(set(corpus_all)) + +print("\n=== Vocabulary (Kata Unik) ===") +for idx, word in enumerate(vocabulary, start=1): + print(f"{idx:>2}. {word}") +print(f"\nJumlah kata unik (vocabulary size): {len(vocabulary)}") + + +# Representasi Numerik (Matriks BoW) +bow_matrix = [] +for doc in tokenized_docs: + vector = [doc.count(word) for word in vocabulary] + bow_matrix.append(vector) + + +df_bow = pd.DataFrame(bow_matrix, columns=vocabulary) +df_bow.index = [f"D{i}" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3 + +print("\n=== Matriks Bag of Words ===") +print(df_bow) + + +# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen) +word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index() +word_frequencies.columns = ["Kata", "Frekuensi"] + +print("\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===") +print(word_frequencies) +print(f"Frekuensi kata: {len(word_frequencies)}") + + + diff --git a/.virtual_documents/NLP/Klasifikasi Teks FNN.ipynb b/.virtual_documents/NLP/Klasifikasi Teks FNN.ipynb new file mode 100644 index 0000000..1ebdb71 --- /dev/null +++ b/.virtual_documents/NLP/Klasifikasi Teks FNN.ipynb @@ -0,0 +1,84 @@ + + + +# --------------------------------------------------------- +# Klasifikasi Teks dengan TF-IDF + Feedforward Neural Network +# --------------------------------------------------------- + +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.neural_network import MLPClassifier +from sklearn.metrics import classification_report, confusion_matrix + +# ----------------------------------------- +# 1. Contoh Dataset +# ----------------------------------------- +# Anda bisa mengganti dataset ini dengan dataset lain (CSV, JSON, dll) + +data = { + "text": [ + "Saya suka produk ini, luar biasa", + "Layanannya buruk, sangat kecewa", + "Pembelian terbaik yang pernah saya lakukan", + "Saya benci produk ini, buang-buang uang", + "Kualitasnya sangat bagus, direkomendasikan", + "Pengalaman buruk, tidak akan membeli lagi" + ], + "label": ["positive", "negative", "positive", "negative", "positive", "negative"] +} + +df = pd.DataFrame(data) + +# ----------------------------------------- +# 2. Split Train & Test +# ----------------------------------------- +X_train, X_test, y_train, y_test = train_test_split( + df["text"], df["label"], test_size=0.3, random_state=42 +) + +# ----------------------------------------- +# 3. TF-IDF Vectorization +# ----------------------------------------- +tfidf = TfidfVectorizer(max_features=5000) +X_train_tfidf = tfidf.fit_transform(X_train) +X_test_tfidf = tfidf.transform(X_test) + +# ----------------------------------------- +# 4. Feedforward ANN (MLPClassifier) +# ----------------------------------------- +model = MLPClassifier( + hidden_layer_sizes=(256, 64), + activation='relu', + solver='adam', + max_iter=500, + random_state=42 +) + +model.fit(X_train_tfidf, y_train) + +# ----------------------------------------- +# 5. Evaluasi Model +# ----------------------------------------- +y_pred = model.predict(X_test_tfidf) + +print("=== Classification Report ===") +print(classification_report(y_test, y_pred)) + +print("=== Confusion Matrix ===") +print(confusion_matrix(y_test, y_pred)) + +# ----------------------------------------- +# 6. Prediksi Teks Baru +# ----------------------------------------- +sample_text = ["barang bagus luar biasa"] +sample_text = ["barang buruk, saya kecewa"] +sample_vec = tfidf.transform(sample_text) +prediction = model.predict(sample_vec) + +print("\nPrediksi untuk:", sample_text[0]) +print("Hasil:", prediction[0]) + + + + diff --git a/.virtual_documents/NLP/N-Gram.ipynb b/.virtual_documents/NLP/N-Gram.ipynb new file mode 100644 index 0000000..cf12efa --- /dev/null +++ b/.virtual_documents/NLP/N-Gram.ipynb @@ -0,0 +1,209 @@ + + + + + + +from collections import Counter +from IPython.display import clear_output +import math + +# 1. Input Kalimat dan Tokenisasi +kalimat = input("Masukkan kalimat: ").strip() + +# Bersihkan output (khusus lingkungan notebook) +try: + clear_output() +except: + pass + +print(f"Corpus: {kalimat}") + +# Tokenize +tokens = kalimat.lower().split() +print(f"Tokens ({len(tokens)}): {tokens}") + +# 2. Hitung Frekuensi Unigram +unigram_counts = Counter(tokens) +total_tokens = sum(unigram_counts.values()) + +print("\nFrekuensi Unigram dalam kalimat") +for pair, count in unigram_counts.items(): + print(f" ('{pair}'): {count}") +print(f"\nTotal unigram dalam 1 kalimat: {total_tokens}") + +# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata +unigram_probabilities = {} +for word, count in unigram_counts.items(): + prob = count / total_tokens + unigram_probabilities[word] = prob + +print("\nProbabilitas masing-masing unigram:") +for word, prob in unigram_probabilities.items(): + print(f" P({word}) = {prob:.2f} ({prob*100:.2f}%)") + +# 4. Hitung Probabilitas Kalimat Keseluruhan (P(kalimat) = P(w1) * P(w2) * ...) +p_kalimat = 1 +prob_parts = [] + +# Loop untuk menghitung probabilitas total dan membangun string rumus detail +for word in tokens: + prob_value = unigram_probabilities[word] + p_kalimat *= prob_value + # Format: P(word)=prob_value + prob_parts.append(f"P({word})={prob_value:.2f}") + +# Gabungkan bagian-bagian rumus untuk mendapatkan prob_str detail +prob_str = " x ".join(prob_parts) + +print("\nProbabilitas Keseluruhan Kalimat (Model Unigram):") +print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)") + + + + + +from collections import Counter +from IPython.display import clear_output +import math + +# 1. Input Kalimat dan Tokenisasi +kalimat = input("Masukkan kalimat: ").strip() + +# Bersihkan output (khusus lingkungan notebook) +try: + clear_output() +except: + pass + +print(f"Corpus: {kalimat}") + +# Tokenisasi +tokens = kalimat.lower().split() +print(f"Tokens ({len(tokens)}): {tokens}") + +# 2. Hitung Frekuensi Unigram dan Bigram +unigram_counts = Counter(tokens) +bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)] +bigram_counts = Counter(bigrams) + +print("\nFrekuensi Bigram dalam kalimat:") +for pair, count in bigram_counts.items(): + print(f" {pair}: {count}") +print(f"\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}") + +# 3. Hitung Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1) +bigram_probabilities = {} +for (w1, w2), count in bigram_counts.items(): + prob = count / unigram_counts[w1] + bigram_probabilities[(w1, w2)] = prob + +print("\nProbabilitas masing-masing bigram:") +for (w1, w2), prob in bigram_probabilities.items(): + print(f" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)") + +# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Bigram) +# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w2) * ... +total_tokens = sum(unigram_counts.values()) +p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens # P(w1) +p_kalimat = p_w1 # Inisialisasi dengan P(w1) + +prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"] # Tambahkan P(w1) ke rumus + +for i in range(1, len(tokens)): + pair = (tokens[i-1], tokens[i]) + p = bigram_probabilities.get(pair, 0) + p_kalimat *= p + prob_str_parts.append(f"P({pair[1]}|{pair[0]})={p:.2f}") + +# Gabungkan rumus perkalian untuk ditampilkan +prob_str = " x ".join(prob_str_parts) + +print("\nProbabilitas Keseluruhan Kalimat (Model Bigram):") +print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)") + + + + + +from collections import Counter +from IPython.display import clear_output +import math + +# 1. Input Kalimat dan Tokenisasi +kalimat = input("Masukkan kalimat: ").strip() + +# Bersihkan output (khusus lingkungan notebook) +try: + clear_output() +except: + pass + +print(f"Corpus: {kalimat}") + +# Tokenisasi +tokens = kalimat.lower().split() +print(f"Tokens ({len(tokens)}): {tokens}") + +# 2. Hitung Frekuensi Bigram dan Trigram +bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)] +trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)] + +bigram_counts = Counter(bigrams) +trigram_counts = Counter(trigrams) + +print("\nFrekuensi Trigram dalam kalimat:") +for tg, count in trigram_counts.items(): + print(f" {tg}: {count}") +print(f"\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}") + +# 3. Hitung Probabilitas Trigram: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2) +trigram_probabilities = {} +for (w1, w2, w3), count in trigram_counts.items(): + # Hindari pembagian dengan nol (jika ada bigram yang tidak muncul) + if bigram_counts[(w1, w2)] > 0: + prob = count / bigram_counts[(w1, w2)] + else: + prob = 0 + trigram_probabilities[(w1, w2, w3)] = prob + +print("\nProbabilitas masing-masing trigram:") +for (w1, w2, w3), prob in trigram_probabilities.items(): + print(f" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)") + +# Tambahkan perhitungan Unigram Count (dibutuhkan untuk P(w1) dan P(w2|w1)) +unigram_counts = Counter(tokens) +total_tokens = sum(unigram_counts.values()) + +# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Trigram) +# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ... + +# a. P(w1) +p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0 + +# b. P(w2|w1) (Menggunakan Bigram tanpa smoothing) +if len(tokens) > 1: + count_w1 = unigram_counts.get(tokens[0], 1) # Hindari pembagian dengan 0 + p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1 +else: + p_w2_w1 = 1.0 # Jika hanya 1 kata + +p_kalimat = p_w1 * p_w2_w1 # Inisialisasi dengan P(w1) * P(w2|w1) + +# Daftar bagian rumus untuk ditampilkan +prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"] +if len(tokens) > 1: + prob_str_parts.append(f"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}") + +# c. Perkalian Trigram P(wi | wi-2, wi-1) untuk i >= 3 +for i in range(len(tokens) - 2): + triplet = (tokens[i], tokens[i+1], tokens[i+2]) + p = trigram_probabilities.get(triplet, 0) + p_kalimat *= p + prob_str_parts.append(f"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}") + +prob_str = " x ".join(prob_str_parts) + +print("\nProbabilitas Keseluruhan Kalimat (Model Trigram):") +print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)") + diff --git a/.virtual_documents/NLP/Praktikum Python Code/00-Python-Text-Basics/00-Working-with-Text-Files.ipynb b/.virtual_documents/NLP/Praktikum Python Code/00-Python-Text-Basics/00-Working-with-Text-Files.ipynb new file mode 100644 index 0000000..dae8303 --- /dev/null +++ b/.virtual_documents/NLP/Praktikum Python Code/00-Python-Text-Basics/00-Working-with-Text-Files.ipynb @@ -0,0 +1,213 @@ + + + + + + + + + + + + +name = 'Fred' + +# Using the old .format() method: +print('His name is {var}.'.format(var=name)) + +# Using f-strings: +print(f'His name is {name}.') + + + + + +print(f'His name is {name!r}') + + + + + +d = {'a':123,'b':456} + +print(f'Address: {d['a']} Main Street') + + + + + +d = {'a':123,'b':456} + +print(f"Address: {d['a']} Main Street") + + + + + +library = [('Author', 'Topic', 'Pages'), ('Twain', 'Rafting', 601), ('Feynman', 'Physics', 95), ('Hamilton', 'Mythology', 144)] + +for book in library: + print(f'{book[0]:{10}} {book[1]:{8}} {book[2]:{7}}') + + + + + +for book in library: + print(f'{book[0]:{10}} {book[1]:{10}} {book[2]:.>{7}}') # here .> was added + + + + + +from datetime import datetime + +today = datetime(year=2018, month=1, day=27) + +print(f'{today:%B %d, %Y}') + + + + + + + + +%%writefile test.txt +Hello, this is a quick test file. +This is the second line of the file. + + + + + +myfile = open('whoops.txt') + + + + + +pwd + + + + + +# Open the text.txt file we created earlier +my_file = open('test.txt') + + +my_file + + + + + +# We can now read the file +my_file.read() + + +# But what happens if we try to read it again? +my_file.read() + + + + + +# Seek to the start of file (index 0) +my_file.seek(0) + + +# Now read again +my_file.read() + + + + + +# Readlines returns a list of the lines in the file +my_file.seek(0) +my_file.readlines() + + + + + +my_file.close() + + + + + +# Add a second argument to the function, 'w' which stands for write. +# Passing 'w+' lets us read and write to the file + +my_file = open('test.txt','w+') + + + + + +# Write to the file +my_file.write('This is a new first line') + + +# Read the file +my_file.seek(0) +my_file.read() + + +my_file.close() # always do this when you're done with a file + + + + + +my_file = open('test.txt','a+') +my_file.write('\nThis line is being appended to test.txt') +my_file.write('\nAnd another line here.') + + +my_file.seek(0) +print(my_file.read()) + + +my_file.close() + + + + + +%%writefile -a test.txt + +This is more text being appended to test.txt +And another line here. + + + + + + + + +with open('test.txt','r') as txt: + first_line = txt.readlines()[0] + +print(first_line) + + + + + +txt.read() + + + + + +with open('test.txt','r') as txt: + for line in txt: + print(line, end='') # the end='' argument removes extra linebreaks + + + diff --git a/.virtual_documents/NLP/Praktikum Python Code/01-NLP-Python-Basics/00-Spacy-Basics.ipynb b/.virtual_documents/NLP/Praktikum Python Code/01-NLP-Python-Basics/00-Spacy-Basics.ipynb new file mode 100644 index 0000000..ad13190 --- /dev/null +++ b/.virtual_documents/NLP/Praktikum Python Code/01-NLP-Python-Basics/00-Spacy-Basics.ipynb @@ -0,0 +1,145 @@ + + + + + + + + + + + + +# Import spaCy and load the language library +import spacy +nlp = spacy.load('en_core_web_sm') + +# Create a Doc object +doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million') + +# Print each token separately +for token in doc: + print(token.text, token.pos_, token.dep_) + + + + + + + + + + + + + + + + + +nlp.pipeline + + +nlp.pipe_names + + + + + +doc2 = nlp(u"Tesla isn't looking into startups anymore.") + +for token in doc2: + print(token.text, token.pos_, token.dep_) + + + + + +doc2 + + +doc2[0] + + +type(doc2) + + + + + +doc2[0].pos_ + + + + + +doc2[0].dep_ + + + + + +spacy.explain('PROPN') + + +spacy.explain('nsubj') + + + + + + + + +# Lemmas (the base form of the word): +print(doc2[4].text) +print(doc2[4].lemma_) + + +# Simple Parts-of-Speech & Detailed Tags: +print(doc2[4].pos_) +print(doc2[4].tag_ + ' / ' + spacy.explain(doc2[4].tag_)) + + +# Word Shapes: +print(doc2[0].text+': '+doc2[0].shape_) +print(doc[5].text+' : '+doc[5].shape_) + + +# Boolean Values: +print(doc2[0].is_alpha) +print(doc2[0].is_stop) + + + + + +doc3 = nlp(u'Although commmonly attributed to John Lennon from his song "Beautiful Boy", \ +the phrase "Life is what happens to us while we are making other plans" was written by \ +cartoonist Allen Saunders and published in Reader\'s Digest in 1957, when Lennon was 17.') + + +life_quote = doc3[16:30] +print(life_quote) + + +type(life_quote) + + + + + + + + +doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.') + + +for sent in doc4.sents: + print(sent) + + +doc4[6].is_sent_start + + + diff --git a/.virtual_documents/NLP/Praktikum Python Code/01-NLP-Python-Basics/01-Tokenization.ipynb b/.virtual_documents/NLP/Praktikum Python Code/01-NLP-Python-Basics/01-Tokenization.ipynb new file mode 100644 index 0000000..d0c8426 --- /dev/null +++ b/.virtual_documents/NLP/Praktikum Python Code/01-NLP-Python-Basics/01-Tokenization.ipynb @@ -0,0 +1,188 @@ + + + + + + +# Import spaCy and load the language library +import spacy +nlp = spacy.load('en_core_web_sm') + + +# Create a string that includes opening and closing quotation marks +mystring = '"We\'re moving to L.A.!"' +print(mystring) + + +# Create a Doc object and explore tokens +doc = nlp(mystring) + +for token in doc: + print(token.text, end=' | ') + + + + + + + + + + + + + + +doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!") + +for t in doc2: + print(t) + + + + + +doc3 = nlp(u'A 5km NYC cab ride costs $10.30') + +for t in doc3: + print(t) + + + + + + + + +doc4 = nlp(u"Let's visit St. Louis in the U.S. next year.") + +for t in doc4: + print(t) + + + + + + + + +len(doc) + + + + + +len(doc.vocab) + + + + + + + + +doc5 = nlp(u'It is better to give than to receive.') + +# Retrieve the third token: +doc5[2] + + +# Retrieve three tokens from the middle: +doc5[2:5] + + +# Retrieve the last four tokens: +doc5[-4:] + + + + + +doc6 = nlp(u'My dinner was horrible.') +doc7 = nlp(u'Your dinner was delicious.') + + +# Try to change "My dinner was horrible" to "My dinner was delicious" +doc6[3] = doc7[3] + + + + + +doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million') + +for token in doc8: + print(token.text, end=' | ') + +print('\n----') + +for ent in doc8.ents: + print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_))) + + + + + +len(doc8.ents) + + + + + + + + +doc9 = nlp(u"Autonomous cars shift insurance liability toward manufacturers.") + +for chunk in doc9.noun_chunks: + print(chunk.text) + + +doc10 = nlp(u"Red cars do not carry higher insurance rates.") + +for chunk in doc10.noun_chunks: + print(chunk.text) + + +doc11 = nlp(u"He was a one-eyed, one-horned, flying, purple people-eater.") + +for chunk in doc11.noun_chunks: + print(chunk.text) + + + + + + + + + + + +from spacy import displacy + +doc = nlp(u'Apple is going to build a U.K. factory for $6 million.') +displacy.render(doc, style='dep', jupyter=True, options={'distance': 110}) + + + + + + + + +doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.') +displacy.render(doc, style='ent', jupyter=True) + + + + + +doc = nlp(u'This is a sentence.') +displacy.serve(doc, style='dep') + + + + + + diff --git a/.virtual_documents/NLP/Praktikum Python Code/01-NLP-Python-Basics/07-NLP-Basics-Assessment-Solution.ipynb b/.virtual_documents/NLP/Praktikum Python Code/01-NLP-Python-Basics/07-NLP-Basics-Assessment-Solution.ipynb new file mode 100644 index 0000000..fd4aa21 --- /dev/null +++ b/.virtual_documents/NLP/Praktikum Python Code/01-NLP-Python-Basics/07-NLP-Basics-Assessment-Solution.ipynb @@ -0,0 +1,107 @@ + + + + + + + + + +# RUN THIS CELL to perform standard imports: +import spacy +nlp = spacy.load('en_core_web_sm') + + + + + +# Enter your code here: + +with open('../TextFiles/owlcreek.txt') as f: + doc = nlp(f.read()) + + +# Run this cell to verify it worked: + +doc[:36] + + + + + +len(doc) + + + + + +sents = [sent for sent in doc.sents] +len(sents) + + + + + +print(sents[1].text) + + + + + +# NORMAL SOLUTION: +for token in sents[1]: + print(token.text, token.pos_, token.dep_, token.lemma_) + + +# CHALLENGE SOLUTION: +for token in sents[1]: + print(f'{token.text:{15}} {token.pos_:{5}} {token.dep_:{10}} {token.lemma_:{15}}') + + + + + +# Import the Matcher library: + +from spacy.matcher import Matcher +matcher = Matcher(nlp.vocab) + + +# Create a pattern and add it to matcher: + +pattern = [{'LOWER': 'swimming'}, {'IS_SPACE': True, 'OP':'*'}, {'LOWER': 'vigorously'}] + +matcher.add('Swimming', None, pattern) + + +# Create a list of matches called "found_matches" and print the list: + +found_matches = matcher(doc) +print(found_matches) + + + + + +print(doc[1265:1290]) + + +print(doc[3600:3615]) + + + + + +for sent in sents: + if found_matches[0][1] < sent.end: + print(sent) + break + + +for sent in sents: + if found_matches[1][1] < sent.end: + print(sent) + break + + + diff --git a/.virtual_documents/NLP/Praktikum Python Code/02-Parts-of-Speech-Tagging/00-POS-Basics.ipynb b/.virtual_documents/NLP/Praktikum Python Code/02-Parts-of-Speech-Tagging/00-POS-Basics.ipynb new file mode 100644 index 0000000..b9c2eaf --- /dev/null +++ b/.virtual_documents/NLP/Praktikum Python Code/02-Parts-of-Speech-Tagging/00-POS-Basics.ipynb @@ -0,0 +1,107 @@ + + + + + + +# Perform standard imports +import spacy +nlp = spacy.load('en_core_web_sm') + + +# Create a simple Doc object +doc = nlp(u"The quick brown fox jumped over the lazy dog's back.") + + + + + +# Print the full text: +print(doc.text) + + +# Print the fifth word and associated tags: +print(doc[4].text, doc[4].pos_, doc[4].tag_, spacy.explain(doc[4].tag_)) + + + + + +for token in doc: + print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}') + + + + + + + + + + + + + + +doc = nlp(u'I read books on NLP.') +r = doc[1] + +print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}') + + +doc = nlp(u'I read a book on NLP.') +r = doc[1] + +print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}') + + + + + + + + +doc = nlp(u"The quick brown fox jumped over the lazy dog's back.") + +# Count the frequencies of different coarse-grained POS tags: +POS_counts = doc.count_by(spacy.attrs.POS) +POS_counts + + + + + +doc.vocab[83].text + + + + + +for k,v in sorted(POS_counts.items()): + print(f'{k}. {doc.vocab[k].text:{5}}: {v}') + + +# Count the different fine-grained tags: +TAG_counts = doc.count_by(spacy.attrs.TAG) + +for k,v in sorted(TAG_counts.items()): + print(f'{k}. {doc.vocab[k].text:{4}}: {v}') + + + + + +# Count the different dependencies: +DEP_counts = doc.count_by(spacy.attrs.DEP) + +for k,v in sorted(DEP_counts.items()): + print(f'{k}. {doc.vocab[k].text:{4}}: {v}') + + + + + + + + + diff --git a/.virtual_documents/NLP/Praktikum Python Code/02-Parts-of-Speech-Tagging/07-POS-Challenge (optional).ipynb b/.virtual_documents/NLP/Praktikum Python Code/02-Parts-of-Speech-Tagging/07-POS-Challenge (optional).ipynb new file mode 100644 index 0000000..41609aa --- /dev/null +++ b/.virtual_documents/NLP/Praktikum Python Code/02-Parts-of-Speech-Tagging/07-POS-Challenge (optional).ipynb @@ -0,0 +1,35 @@ + + + + + + +# Perform standard imports +import spacy +nlp = spacy.load('en_core_web_sm') + +# Import the game script +import game + + +# Enter your text here: +text = u"The quick brown fox jumped over the lazy dog's back." + + +# Make your Doc object and pass it into the scorer: +doc = nlp(text) +print(game.scorer(doc)) + + + + + +# For practice, visualize your fine-grained POS tags (shown in the third column): +print(f"{'TOKEN':{10}} {'COARSE':{8}} {'FINE':{6}} {'DESCRIPTION'}") +print(f"{'-----':{10}} {'------':{8}} {'----':{6}} {'-----------'}") + +for token in doc: + print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}') + + + diff --git a/.virtual_documents/NLP/Untitled.ipynb b/.virtual_documents/NLP/Untitled.ipynb new file mode 100644 index 0000000..1ebdb71 --- /dev/null +++ b/.virtual_documents/NLP/Untitled.ipynb @@ -0,0 +1,84 @@ + + + +# --------------------------------------------------------- +# Klasifikasi Teks dengan TF-IDF + Feedforward Neural Network +# --------------------------------------------------------- + +import pandas as pd +from sklearn.model_selection import train_test_split +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.neural_network import MLPClassifier +from sklearn.metrics import classification_report, confusion_matrix + +# ----------------------------------------- +# 1. Contoh Dataset +# ----------------------------------------- +# Anda bisa mengganti dataset ini dengan dataset lain (CSV, JSON, dll) + +data = { + "text": [ + "Saya suka produk ini, luar biasa", + "Layanannya buruk, sangat kecewa", + "Pembelian terbaik yang pernah saya lakukan", + "Saya benci produk ini, buang-buang uang", + "Kualitasnya sangat bagus, direkomendasikan", + "Pengalaman buruk, tidak akan membeli lagi" + ], + "label": ["positive", "negative", "positive", "negative", "positive", "negative"] +} + +df = pd.DataFrame(data) + +# ----------------------------------------- +# 2. Split Train & Test +# ----------------------------------------- +X_train, X_test, y_train, y_test = train_test_split( + df["text"], df["label"], test_size=0.3, random_state=42 +) + +# ----------------------------------------- +# 3. TF-IDF Vectorization +# ----------------------------------------- +tfidf = TfidfVectorizer(max_features=5000) +X_train_tfidf = tfidf.fit_transform(X_train) +X_test_tfidf = tfidf.transform(X_test) + +# ----------------------------------------- +# 4. Feedforward ANN (MLPClassifier) +# ----------------------------------------- +model = MLPClassifier( + hidden_layer_sizes=(256, 64), + activation='relu', + solver='adam', + max_iter=500, + random_state=42 +) + +model.fit(X_train_tfidf, y_train) + +# ----------------------------------------- +# 5. Evaluasi Model +# ----------------------------------------- +y_pred = model.predict(X_test_tfidf) + +print("=== Classification Report ===") +print(classification_report(y_test, y_pred)) + +print("=== Confusion Matrix ===") +print(confusion_matrix(y_test, y_pred)) + +# ----------------------------------------- +# 6. Prediksi Teks Baru +# ----------------------------------------- +sample_text = ["barang bagus luar biasa"] +sample_text = ["barang buruk, saya kecewa"] +sample_vec = tfidf.transform(sample_text) +prediction = model.predict(sample_vec) + +print("\nPrediksi untuk:", sample_text[0]) +print("Hasil:", prediction[0]) + + + + diff --git a/NLP/.ipynb_checkpoints/Fitur_Ekstraksi_BOW-checkpoint.ipynb b/NLP/.ipynb_checkpoints/Fitur_Ekstraksi_BOW-checkpoint.ipynb new file mode 100644 index 0000000..188217b --- /dev/null +++ b/NLP/.ipynb_checkpoints/Fitur_Ekstraksi_BOW-checkpoint.ipynb @@ -0,0 +1,310 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qBYcPYAb059g", + "outputId": "9f57b704-da1b-4495-d366-24c30586dc76" + }, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Masukkan jumlah dokumen yang ingin dimasukkan: 3\n" + ] + } + ], + "source": [ + "# Input jumlah dokumen\n", + "import pandas as pd\n", + "n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mo-yt5Ob1N8j", + "outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Masukkan teks untuk dokumen ke-1: saya belajar nlp di kampus\n", + "Masukkan teks untuk dokumen ke-2: saya suka belajar ai\n", + "Masukkan teks untuk dokumen ke-3: mahasiswa belajar data science dan nlp\n", + "\n", + "=== Dokumen yang Dimasukkan ===\n", + "Doc 1: saya belajar nlp di kampus\n", + "Doc 2: saya suka belajar ai\n", + "Doc 3: mahasiswa belajar data science dan nlp\n" + ] + } + ], + "source": [ + "# Input teks dokumen satu per satu\n", + "documents = []\n", + "for i in range(n):\n", + " teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n", + " documents.append(teks)\n", + "\n", + "print(\"\\n=== Dokumen yang Dimasukkan ===\")\n", + "for i, doc in enumerate(documents):\n", + " print(f\"Doc {i+1}: {doc}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FkmxRAFq1oDK", + "outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Hasil Tokenisasi ===\n", + "Doc 1: ['saya', 'belajar', 'nlp', 'di', 'kampus']\n", + "Doc 2: ['saya', 'suka', 'belajar', 'ai']\n", + "Doc 3: ['mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n" + ] + } + ], + "source": [ + "# Tahap Tokenisasi\n", + "tokenized_docs = []\n", + "for doc in documents:\n", + " tokens = doc.lower().split()\n", + " tokenized_docs.append(tokens)\n", + "\n", + "print(\"\\n=== Hasil Tokenisasi ===\")\n", + "for i, tokens in enumerate(tokenized_docs):\n", + " print(f\"Doc {i+1}: {tokens}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ybC1Vo2C_c3q", + "outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n", + "['saya', 'belajar', 'nlp', 'di', 'kampus', 'saya', 'suka', 'belajar', 'ai', 'mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n", + "Jumlah total kata dalam seluruh dokumen: 15\n" + ] + } + ], + "source": [ + "# Pembuatan Corpus\n", + "corpus_all = [word for doc in tokenized_docs for word in doc]\n", + "\n", + "print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n", + "print(corpus_all)\n", + "print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "s6S-Ma4R1xuq", + "outputId": "98c3685b-1798-4038-d17e-6e45ca419b51" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Vocabulary (Kata Unik) ===\n", + "['ai', 'belajar', 'dan', 'data', 'di', 'kampus', 'mahasiswa', 'nlp', 'saya', 'science', 'suka']\n", + "Jumlah kata unik (vocabulary size): 11\n", + "\n", + "=== Vocabulary (Kata Unik) ===\n", + " 1. ai\n", + " 2. belajar\n", + " 3. dan\n", + " 4. data\n", + " 5. di\n", + " 6. kampus\n", + " 7. mahasiswa\n", + " 8. nlp\n", + " 9. saya\n", + "10. science\n", + "11. suka\n", + "\n", + "Jumlah kata unik (vocabulary size): 11\n" + ] + } + ], + "source": [ + "# Pembuatan Vocabulary\n", + "vocabulary = sorted(set(corpus_all))\n", + "\n", + "print(\"\\n=== Vocabulary (Kata Unik) ===\")\n", + "print(vocabulary)\n", + "print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n", + "\n", + "\n", + "vocabulary = sorted(set(corpus_all))\n", + "\n", + "print(\"\\n=== Vocabulary (Kata Unik) ===\")\n", + "for idx, word in enumerate(vocabulary, start=1):\n", + " print(f\"{idx:>2}. {word}\")\n", + "print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "ShevCTva2Fg9" + }, + "outputs": [], + "source": [ + "# Representasi Numerik (Matriks BoW)\n", + "bow_matrix = []\n", + "for doc in tokenized_docs:\n", + " vector = [doc.count(word) for word in vocabulary]\n", + " bow_matrix.append(vector)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-yB6D2pY2M0E", + "outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Matriks Bag of Words ===\n", + " ai belajar dan data di kampus mahasiswa nlp saya science suka\n", + "D1 0 1 0 0 1 1 0 1 1 0 0\n", + "D2 1 1 0 0 0 0 0 0 1 0 1\n", + "D3 0 1 1 1 0 0 1 1 0 1 0\n" + ] + } + ], + "source": [ + "df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)\n", + "df_bow.index = [f\"D{i}\" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3\n", + "\n", + "print(\"\\n=== Matriks Bag of Words ===\")\n", + "print(df_bow)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8ruf5vKL2rGD", + "outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n", + " Kata Frekuensi\n", + "0 belajar 3\n", + "1 nlp 2\n", + "2 saya 2\n", + "3 dan 1\n", + "4 ai 1\n", + "5 data 1\n", + "6 di 1\n", + "7 mahasiswa 1\n", + "8 kampus 1\n", + "9 science 1\n", + "10 suka 1\n", + "Frekuensi kata: 11\n" + ] + } + ], + "source": [ + "# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)\n", + "word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()\n", + "word_frequencies.columns = [\"Kata\", \"Frekuensi\"]\n", + "\n", + "print(\"\\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\")\n", + "print(word_frequencies)\n", + "print(f\"Frekuensi kata: {len(word_frequencies)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NQjExannHuj0" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/NLP/.ipynb_checkpoints/Klasifikasi Teks FNN-checkpoint.ipynb b/NLP/.ipynb_checkpoints/Klasifikasi Teks FNN-checkpoint.ipynb new file mode 100644 index 0000000..363fcab --- /dev/null +++ b/NLP/.ipynb_checkpoints/Klasifikasi Teks FNN-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/NLP/.ipynb_checkpoints/N-Gram-checkpoint.ipynb b/NLP/.ipynb_checkpoints/N-Gram-checkpoint.ipynb new file mode 100644 index 0000000..affd42d --- /dev/null +++ b/NLP/.ipynb_checkpoints/N-Gram-checkpoint.ipynb @@ -0,0 +1,380 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "JVPdWpz3hhbj" + }, + "source": [ + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4Mvva3v65h1v" + }, + "source": [ + "# **UNIGRAM**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1cub_VJnUJMl", + "outputId": "1889eb61-4f3f-4780-f42e-02368076cce3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Corpus: saya suka makan nasi\n", + "Tokens (4): ['saya', 'suka', 'makan', 'nasi']\n", + "\n", + "Frekuensi Unigram dalam kalimat\n", + " ('saya'): 1\n", + " ('suka'): 1\n", + " ('makan'): 1\n", + " ('nasi'): 1\n", + "\n", + "Total unigram dalam 1 kalimat: 4\n", + "\n", + "Probabilitas masing-masing unigram:\n", + " P(saya) = 0.25 (25.00%)\n", + " P(suka) = 0.25 (25.00%)\n", + " P(makan) = 0.25 (25.00%)\n", + " P(nasi) = 0.25 (25.00%)\n", + "\n", + "Probabilitas Keseluruhan Kalimat (Model Unigram):\n", + " P(saya suka makan nasi) = P(saya)=0.25 x P(suka)=0.25 x P(makan)=0.25 x P(nasi)=0.25 = 0.0039 (0.39%)\n" + ] + } + ], + "source": [ + "from collections import Counter\n", + "from IPython.display import clear_output\n", + "import math\n", + "\n", + "# 1. Input Kalimat dan Tokenisasi\n", + "kalimat = input(\"Masukkan kalimat: \").strip()\n", + "\n", + "# Bersihkan output (khusus lingkungan notebook)\n", + "try:\n", + " clear_output()\n", + "except:\n", + " pass\n", + "\n", + "print(f\"Corpus: {kalimat}\")\n", + "\n", + "# Tokenize\n", + "tokens = kalimat.lower().split()\n", + "print(f\"Tokens ({len(tokens)}): {tokens}\")\n", + "\n", + "# 2. Hitung Frekuensi Unigram\n", + "unigram_counts = Counter(tokens)\n", + "total_tokens = sum(unigram_counts.values())\n", + "\n", + "print(\"\\nFrekuensi Unigram dalam kalimat\")\n", + "for pair, count in unigram_counts.items():\n", + " print(f\" ('{pair}'): {count}\")\n", + "print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n", + "\n", + "# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata\n", + "unigram_probabilities = {}\n", + "for word, count in unigram_counts.items():\n", + " prob = count / total_tokens\n", + " unigram_probabilities[word] = prob\n", + "\n", + "print(\"\\nProbabilitas masing-masing unigram:\")\n", + "for word, prob in unigram_probabilities.items():\n", + " print(f\" P({word}) = {prob:.2f} ({prob*100:.2f}%)\")\n", + "\n", + "# 4. Hitung Probabilitas Kalimat Keseluruhan (P(kalimat) = P(w1) * P(w2) * ...)\n", + "p_kalimat = 1\n", + "prob_parts = []\n", + "\n", + "# Loop untuk menghitung probabilitas total dan membangun string rumus detail\n", + "for word in tokens:\n", + " prob_value = unigram_probabilities[word]\n", + " p_kalimat *= prob_value\n", + " # Format: P(word)=prob_value\n", + " prob_parts.append(f\"P({word})={prob_value:.2f}\")\n", + "\n", + "# Gabungkan bagian-bagian rumus untuk mendapatkan prob_str detail\n", + "prob_str = \" x \".join(prob_parts)\n", + "\n", + "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n", + "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Vstwt996-FrS" + }, + "source": [ + "# **BIGRAM**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XRIY4qgTVbjl", + "outputId": "ea6e62ce-45a0-40c9-ca98-1fcc30558479" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Corpus: saya makan nasi dan saya makan roti\n", + "Tokens (7): ['saya', 'makan', 'nasi', 'dan', 'saya', 'makan', 'roti']\n", + "\n", + "Frekuensi Bigram dalam kalimat:\n", + " ('saya', 'makan'): 2\n", + " ('makan', 'nasi'): 1\n", + " ('nasi', 'dan'): 1\n", + " ('dan', 'saya'): 1\n", + " ('makan', 'roti'): 1\n", + "\n", + "Total bigram dalam 1 kalimat: 6\n", + "\n", + "Probabilitas masing-masing bigram:\n", + " P(makan|saya) = 1.00 (100.00%)\n", + " P(nasi|makan) = 0.50 (50.00%)\n", + " P(dan|nasi) = 1.00 (100.00%)\n", + " P(saya|dan) = 1.00 (100.00%)\n", + " P(roti|makan) = 0.50 (50.00%)\n", + "\n", + "Probabilitas Keseluruhan Kalimat (Model Bigram):\n", + " P(saya makan nasi dan saya makan roti) = P(saya)=0.29 x P(makan|saya)=1.00 x P(nasi|makan)=0.50 x P(dan|nasi)=1.00 x P(saya|dan)=1.00 x P(makan|saya)=1.00 x P(roti|makan)=0.50 = 0.071429 (7.14%)\n" + ] + } + ], + "source": [ + "from collections import Counter\n", + "from IPython.display import clear_output\n", + "import math\n", + "\n", + "# 1. Input Kalimat dan Tokenisasi\n", + "kalimat = input(\"Masukkan kalimat: \").strip()\n", + "\n", + "# Bersihkan output (khusus lingkungan notebook)\n", + "try:\n", + " clear_output()\n", + "except:\n", + " pass\n", + "\n", + "print(f\"Corpus: {kalimat}\")\n", + "\n", + "# Tokenisasi\n", + "tokens = kalimat.lower().split()\n", + "print(f\"Tokens ({len(tokens)}): {tokens}\")\n", + "\n", + "# 2. Hitung Frekuensi Unigram dan Bigram\n", + "unigram_counts = Counter(tokens)\n", + "bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n", + "bigram_counts = Counter(bigrams)\n", + "\n", + "print(\"\\nFrekuensi Bigram dalam kalimat:\")\n", + "for pair, count in bigram_counts.items():\n", + " print(f\" {pair}: {count}\")\n", + "print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n", + "\n", + "# 3. Hitung Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)\n", + "bigram_probabilities = {}\n", + "for (w1, w2), count in bigram_counts.items():\n", + " prob = count / unigram_counts[w1]\n", + " bigram_probabilities[(w1, w2)] = prob\n", + "\n", + "print(\"\\nProbabilitas masing-masing bigram:\")\n", + "for (w1, w2), prob in bigram_probabilities.items():\n", + " print(f\" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)\")\n", + "\n", + "# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Bigram)\n", + "# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w2) * ...\n", + "total_tokens = sum(unigram_counts.values())\n", + "p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens # P(w1)\n", + "p_kalimat = p_w1 # Inisialisasi dengan P(w1)\n", + "\n", + "prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"] # Tambahkan P(w1) ke rumus\n", + "\n", + "for i in range(1, len(tokens)):\n", + " pair = (tokens[i-1], tokens[i])\n", + " p = bigram_probabilities.get(pair, 0)\n", + " p_kalimat *= p\n", + " prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.2f}\")\n", + "\n", + "# Gabungkan rumus perkalian untuk ditampilkan\n", + "prob_str = \" x \".join(prob_str_parts)\n", + "\n", + "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n", + "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "E6n1IU8X-G9S" + }, + "source": [ + "# **TRIGRAM**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BIRARsj2FHJg", + "outputId": "968d420e-9370-40e5-e7e1-148e1d351d62" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Corpus: mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning\n", + "Tokens (7): ['mahasiswa', 'mengerjakan', 'tugas', 'kemudian', 'mahasiswa', 'upload', 'e-learning']\n", + "\n", + "Frekuensi Trigram dalam kalimat:\n", + " ('mahasiswa', 'mengerjakan', 'tugas'): 1\n", + " ('mengerjakan', 'tugas', 'kemudian'): 1\n", + " ('tugas', 'kemudian', 'mahasiswa'): 1\n", + " ('kemudian', 'mahasiswa', 'upload'): 1\n", + " ('mahasiswa', 'upload', 'e-learning'): 1\n", + "\n", + "Total trigram dalam 1 kalimat: 5\n", + "\n", + "Probabilitas masing-masing trigram:\n", + " P(tugas|mahasiswa,mengerjakan) = 1.00 (100.00%)\n", + " P(kemudian|mengerjakan,tugas) = 1.00 (100.00%)\n", + " P(mahasiswa|tugas,kemudian) = 1.00 (100.00%)\n", + " P(upload|kemudian,mahasiswa) = 1.00 (100.00%)\n", + " P(e-learning|mahasiswa,upload) = 1.00 (100.00%)\n", + "\n", + "Probabilitas Keseluruhan Kalimat (Model Trigram):\n", + " P(mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning) = P(mahasiswa)=0.29 x P(mengerjakan|mahasiswa)=0.50 x P(tugas|mahasiswa,mengerjakan)=1.00 x P(kemudian|mengerjakan,tugas)=1.00 x P(mahasiswa|tugas,kemudian)=1.00 x P(upload|kemudian,mahasiswa)=1.00 x P(e-learning|mahasiswa,upload)=1.00 = 0.142857 (14.29%)\n" + ] + } + ], + "source": [ + "from collections import Counter\n", + "from IPython.display import clear_output\n", + "import math\n", + "\n", + "# 1. Input Kalimat dan Tokenisasi\n", + "kalimat = input(\"Masukkan kalimat: \").strip()\n", + "\n", + "# Bersihkan output (khusus lingkungan notebook)\n", + "try:\n", + " clear_output()\n", + "except:\n", + " pass\n", + "\n", + "print(f\"Corpus: {kalimat}\")\n", + "\n", + "# Tokenisasi\n", + "tokens = kalimat.lower().split()\n", + "print(f\"Tokens ({len(tokens)}): {tokens}\")\n", + "\n", + "# 2. Hitung Frekuensi Bigram dan Trigram\n", + "bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n", + "trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]\n", + "\n", + "bigram_counts = Counter(bigrams)\n", + "trigram_counts = Counter(trigrams)\n", + "\n", + "print(\"\\nFrekuensi Trigram dalam kalimat:\")\n", + "for tg, count in trigram_counts.items():\n", + " print(f\" {tg}: {count}\")\n", + "print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n", + "\n", + "# 3. Hitung Probabilitas Trigram: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)\n", + "trigram_probabilities = {}\n", + "for (w1, w2, w3), count in trigram_counts.items():\n", + " # Hindari pembagian dengan nol (jika ada bigram yang tidak muncul)\n", + " if bigram_counts[(w1, w2)] > 0:\n", + " prob = count / bigram_counts[(w1, w2)]\n", + " else:\n", + " prob = 0\n", + " trigram_probabilities[(w1, w2, w3)] = prob\n", + "\n", + "print(\"\\nProbabilitas masing-masing trigram:\")\n", + "for (w1, w2, w3), prob in trigram_probabilities.items():\n", + " print(f\" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)\")\n", + "\n", + "# Tambahkan perhitungan Unigram Count (dibutuhkan untuk P(w1) dan P(w2|w1))\n", + "unigram_counts = Counter(tokens)\n", + "total_tokens = sum(unigram_counts.values())\n", + "\n", + "# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Trigram)\n", + "# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...\n", + "\n", + "# a. P(w1)\n", + "p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0\n", + "\n", + "# b. P(w2|w1) (Menggunakan Bigram tanpa smoothing)\n", + "if len(tokens) > 1:\n", + " count_w1 = unigram_counts.get(tokens[0], 1) # Hindari pembagian dengan 0\n", + " p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1\n", + "else:\n", + " p_w2_w1 = 1.0 # Jika hanya 1 kata\n", + "\n", + "p_kalimat = p_w1 * p_w2_w1 # Inisialisasi dengan P(w1) * P(w2|w1)\n", + "\n", + "# Daftar bagian rumus untuk ditampilkan\n", + "prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]\n", + "if len(tokens) > 1:\n", + " prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}\")\n", + "\n", + "# c. Perkalian Trigram P(wi | wi-2, wi-1) untuk i >= 3\n", + "for i in range(len(tokens) - 2):\n", + " triplet = (tokens[i], tokens[i+1], tokens[i+2])\n", + " p = trigram_probabilities.get(triplet, 0)\n", + " p_kalimat *= p\n", + " prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}\")\n", + "\n", + "prob_str = \" x \".join(prob_str_parts)\n", + "\n", + "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n", + "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/NLP/Klasifikasi Teks FNN.ipynb b/NLP/Klasifikasi Teks FNN.ipynb new file mode 100644 index 0000000..cae22a7 --- /dev/null +++ b/NLP/Klasifikasi Teks FNN.ipynb @@ -0,0 +1,151 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f4a1399a-f23d-4060-a07e-bce5a5c7ddac", + "metadata": {}, + "source": [ + "# Klasifikasi Teks\n", + "## Arif R Dwiyanto" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "53a214ae-c9cf-4d46-925d-068f1685537b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Classification Report ===\n", + " precision recall f1-score support\n", + "\n", + " negative 0.00 0.00 0.00 1.0\n", + " positive 0.00 0.00 0.00 1.0\n", + "\n", + " accuracy 0.00 2.0\n", + " macro avg 0.00 0.00 0.00 2.0\n", + "weighted avg 0.00 0.00 0.00 2.0\n", + "\n", + "=== Confusion Matrix ===\n", + "[[0 1]\n", + " [1 0]]\n", + "\n", + "Prediksi untuk: barang buruk, saya kecewa\n", + "Hasil: negative\n" + ] + } + ], + "source": [ + "# ---------------------------------------------------------\n", + "# Klasifikasi Teks dengan TF-IDF + Feedforward Neural Network\n", + "# ---------------------------------------------------------\n", + "\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.neural_network import MLPClassifier\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "\n", + "# -----------------------------------------\n", + "# 1. Contoh Dataset\n", + "# -----------------------------------------\n", + "# Anda bisa mengganti dataset ini dengan dataset lain (CSV, JSON, dll)\n", + "\n", + "data = {\n", + " \"text\": [\n", + " \"Saya suka produk ini, luar biasa\",\n", + " \"Layanannya buruk, sangat kecewa\",\n", + " \"Pembelian terbaik yang pernah saya lakukan\",\n", + " \"Saya benci produk ini, buang-buang uang\",\n", + " \"Kualitasnya sangat bagus, direkomendasikan\",\n", + " \"Pengalaman buruk, tidak akan membeli lagi\"\n", + " ],\n", + " \"label\": [\"positive\", \"negative\", \"positive\", \"negative\", \"positive\", \"negative\"]\n", + "}\n", + "\n", + "df = pd.DataFrame(data)\n", + "\n", + "# -----------------------------------------\n", + "# 2. Split Train & Test\n", + "# -----------------------------------------\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " df[\"text\"], df[\"label\"], test_size=0.3, random_state=42\n", + ")\n", + "\n", + "# -----------------------------------------\n", + "# 3. TF-IDF Vectorization\n", + "# -----------------------------------------\n", + "tfidf = TfidfVectorizer(max_features=5000)\n", + "X_train_tfidf = tfidf.fit_transform(X_train)\n", + "X_test_tfidf = tfidf.transform(X_test)\n", + "\n", + "# -----------------------------------------\n", + "# 4. Feedforward ANN (MLPClassifier)\n", + "# -----------------------------------------\n", + "model = MLPClassifier(\n", + " hidden_layer_sizes=(256, 64),\n", + " activation='relu',\n", + " solver='adam',\n", + " max_iter=500,\n", + " random_state=42\n", + ")\n", + "\n", + "model.fit(X_train_tfidf, y_train)\n", + "\n", + "# -----------------------------------------\n", + "# 5. Evaluasi Model\n", + "# -----------------------------------------\n", + "y_pred = model.predict(X_test_tfidf)\n", + "\n", + "print(\"=== Classification Report ===\")\n", + "print(classification_report(y_test, y_pred))\n", + "\n", + "print(\"=== Confusion Matrix ===\")\n", + "print(confusion_matrix(y_test, y_pred))\n", + "\n", + "# -----------------------------------------\n", + "# 6. Prediksi Teks Baru\n", + "# -----------------------------------------\n", + "sample_text = [\"barang bagus luar biasa\"]\n", + "sample_text = [\"barang buruk, saya kecewa\"]\n", + "sample_vec = tfidf.transform(sample_text)\n", + "prediction = model.predict(sample_vec)\n", + "\n", + "print(\"\\nPrediksi untuk:\", sample_text[0])\n", + "print(\"Hasil:\", prediction[0])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f7d90fe-4af4-446c-9547-c9312bfa6fc7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/README.md b/README.md index 06b5c87..6a8a662 100644 --- a/README.md +++ b/README.md @@ -4,4 +4,5 @@ - NLP - Machine Learning - Big Data -- Data Mining \ No newline at end of file +- Data Mining +- Data Management \ No newline at end of file