From e871ab77ad58e850c6cc74a12c007ff01f799171 Mon Sep 17 00:00:00 2001 From: 202210715288 FATAH SABILA ROSYAD <202210715288@mhs.ubharajaya.ac.id> Date: Tue, 2 Dec 2025 23:47:39 +0700 Subject: [PATCH] Update tiga file praktikum NLP (Fatah): - Fitur Ekstraksi BOW - Klasifikasi Teks TF-IDF + ANN Tugas praktikum Sudah saya edit sesuai instruksi dosen. Signed-off-by: 202210715288 FATAH SABILA ROSYAD <202210715288@mhs.ubharajaya.ac.id> --- Fitur_Ekstraksi_BOW.ipynb | 742 +++++++++++++++++++++++++++++++++++++ Klasifikasi Teks FNN.ipynb | 360 ++++++++++++++++++ N-Gram.ipynb | 362 ++++++++++++++++++ 3 files changed, 1464 insertions(+) create mode 100644 Fitur_Ekstraksi_BOW.ipynb create mode 100644 Klasifikasi Teks FNN.ipynb create mode 100644 N-Gram.ipynb diff --git a/Fitur_Ekstraksi_BOW.ipynb b/Fitur_Ekstraksi_BOW.ipynb new file mode 100644 index 0000000..67ba5c9 --- /dev/null +++ b/Fitur_Ekstraksi_BOW.ipynb @@ -0,0 +1,742 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fitur Ekstraksi Bag-of-Words (BOW)\n", + "\n", + "**Nama:** Fatah Sabila Rosyad \n", + "**NIM:** **202210715288** \n", + "**Kelas:** F7B2 \n", + "**MK:** NLP \n", + "\n", + "**Tujuan praktikum:** \n", + "Melakukan ekstraksi fitur teks menggunakan Bag-of-Words dengan variasi parameter, yaitu: \n", + "- Mengubah contoh teks \n", + "- Mengubah jumlah fitur (`max_features`) \n", + "- Menggunakan rentang n-gram baru (`ngram_range = (1,3)`) \n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Masukkan jumlah dokumen yang ingin dimasukkan: 3\n" + ] + } + ], + "source": [ + "# Input jumlah dokumen\n", + "n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mo-yt5Ob1N8j", + "outputId": 
"362ac3e0-d84b-4014-db96-cc3b10ecdb32" + }, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Masukkan teks untuk dokumen ke-1: Saya Fatah Sabila Rosyad, mahasiswa informatika yang sedang mempelajari NLP dan machine learning\n", + "Masukkan teks untuk dokumen ke-2: Fatah melakukan eksperimen Bag-of-Words untuk melihat bagaimana fitur teks direpresentasikan sebagai angka\n", + "Masukkan teks untuk dokumen ke-3: Pada tahap ini Fatah menggunakan n-gram untuk menguji kombinasi kata yang sering muncul bersama\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Dokumen yang Dimasukkan ===\n", + "Doc 1: Saya Fatah Sabila Rosyad, mahasiswa informatika yang sedang mempelajari NLP dan machine learning\n", + "Doc 2: Fatah melakukan eksperimen Bag-of-Words untuk melihat bagaimana fitur teks direpresentasikan sebagai angka\n", + "Doc 3: Pada tahap ini Fatah menggunakan n-gram untuk menguji kombinasi kata yang sering muncul bersama\n" + ] + } + ], + "source": [ + "# Input teks dokumen satu per satu\n", + "documents = []\n", + "for i in range(n):\n", + " teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n", + " documents.append(teks)\n", + "\n", + "print(\"\\n=== Dokumen yang Dimasukkan ===\")\n", + "for i, doc in enumerate(documents):\n", + " print(f\"Doc {i+1}: {doc}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FkmxRAFq1oDK", + "outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Hasil Tokenisasi ===\n", + "Doc 1: ['saya', 'fatah', 'sabila', 'rosyad,', 'mahasiswa', 'informatika', 'yang', 'sedang', 'mempelajari', 'nlp', 'dan', 'machine', 'learning']\n", + "Doc 2: ['fatah', 'melakukan', 'eksperimen', 'bag-of-words', 'untuk', 'melihat', 'bagaimana', 'fitur', 'teks', 'direpresentasikan', 'sebagai', 'angka']\n", + 
"Doc 3: ['pada', 'tahap', 'ini', 'fatah', 'menggunakan', 'n-gram', 'untuk', 'menguji', 'kombinasi', 'kata', 'yang', 'sering', 'muncul', 'bersama']\n" + ] + } + ], + "source": [ + "# Tahap Tokenisasi\n", + "tokenized_docs = []\n", + "for doc in documents:\n", + " tokens = doc.lower().split()\n", + " tokenized_docs.append(tokens)\n", + "\n", + "print(\"\\n=== Hasil Tokenisasi ===\")\n", + "for i, tokens in enumerate(tokenized_docs):\n", + " print(f\"Doc {i+1}: {tokens}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ybC1Vo2C_c3q", + "outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n", + "['saya', 'fatah', 'sabila', 'rosyad,', 'mahasiswa', 'informatika', 'yang', 'sedang', 'mempelajari', 'nlp', 'dan', 'machine', 'learning', 'fatah', 'melakukan', 'eksperimen', 'bag-of-words', 'untuk', 'melihat', 'bagaimana', 'fitur', 'teks', 'direpresentasikan', 'sebagai', 'angka', 'pada', 'tahap', 'ini', 'fatah', 'menggunakan', 'n-gram', 'untuk', 'menguji', 'kombinasi', 'kata', 'yang', 'sering', 'muncul', 'bersama']\n", + "Jumlah total kata dalam seluruh dokumen: 39\n" + ] + } + ], + "source": [ + "# Pembuatan Corpus\n", + "corpus_all = [word for doc in tokenized_docs for word in doc]\n", + "\n", + "print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n", + "print(corpus_all)\n", + "print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Hasil BOW Manual (Frekuensi Kata) ===\n" + ] + }, + { + "data": { + "text/plain": [ + "{'fatah': 3,\n", + " 'yang': 2,\n", + " 'untuk': 2,\n", + " 'saya': 1,\n", + " 'sabila': 1,\n", + " 'rosyad,': 
1,\n", + " 'mahasiswa': 1,\n", + " 'informatika': 1,\n", + " 'sedang': 1,\n", + " 'mempelajari': 1,\n", + " 'nlp': 1,\n", + " 'dan': 1,\n", + " 'machine': 1,\n", + " 'learning': 1,\n", + " 'melakukan': 1,\n", + " 'eksperimen': 1,\n", + " 'bag-of-words': 1,\n", + " 'melihat': 1,\n", + " 'bagaimana': 1,\n", + " 'fitur': 1,\n", + " 'teks': 1,\n", + " 'direpresentasikan': 1,\n", + " 'sebagai': 1,\n", + " 'angka': 1,\n", + " 'pada': 1,\n", + " 'tahap': 1,\n", + " 'ini': 1,\n", + " 'menggunakan': 1,\n", + " 'n-gram': 1,\n", + " 'menguji': 1,\n", + " 'kombinasi': 1,\n", + " 'kata': 1,\n", + " 'sering': 1,\n", + " 'muncul': 1,\n", + " 'bersama': 1}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 4) Membuat Bag-of-Words manual (frekuensi kata)\n", + "bow_manual = {}\n", + "for tokens in tokenized_docs:\n", + " for token in tokens:\n", + " bow_manual[token] = bow_manual.get(token, 0) + 1\n", + "\n", + "print(\"\\n=== Hasil BOW Manual (Frekuensi Kata) ===\")\n", + "# Tampilkan dictionary secara sorted by frequency (desc)\n", + "bow_sorted = dict(sorted(bow_manual.items(), key=lambda x: x[1], reverse=True))\n", + "bow_sorted" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "s6S-Ma4R1xuq", + "outputId": "98c3685b-1798-4038-d17e-6e45ca419b51" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Vocabulary (Kata Unik) ===\n", + "['angka', 'bag-of-words', 'bagaimana', 'bersama', 'dan', 'direpresentasikan', 'eksperimen', 'fatah', 'fitur', 'informatika', 'ini', 'kata', 'kombinasi', 'learning', 'machine', 'mahasiswa', 'melakukan', 'melihat', 'mempelajari', 'menggunakan', 'menguji', 'muncul', 'n-gram', 'nlp', 'pada', 'rosyad,', 'sabila', 'saya', 'sebagai', 'sedang', 'sering', 'tahap', 'teks', 'untuk', 'yang']\n", + "Jumlah kata unik (vocabulary size): 35\n", + "\n", + "=== 
Vocabulary (Kata Unik) ===\n", + " 1. angka\n", + " 2. bag-of-words\n", + " 3. bagaimana\n", + " 4. bersama\n", + " 5. dan\n", + " 6. direpresentasikan\n", + " 7. eksperimen\n", + " 8. fatah\n", + " 9. fitur\n", + "10. informatika\n", + "11. ini\n", + "12. kata\n", + "13. kombinasi\n", + "14. learning\n", + "15. machine\n", + "16. mahasiswa\n", + "17. melakukan\n", + "18. melihat\n", + "19. mempelajari\n", + "20. menggunakan\n", + "21. menguji\n", + "22. muncul\n", + "23. n-gram\n", + "24. nlp\n", + "25. pada\n", + "26. rosyad,\n", + "27. sabila\n", + "28. saya\n", + "29. sebagai\n", + "30. sedang\n", + "31. sering\n", + "32. tahap\n", + "33. teks\n", + "34. untuk\n", + "35. yang\n", + "\n", + "Jumlah kata unik (vocabulary size): 35\n" + ] + } + ], + "source": [ + "# Pembuatan Vocabulary\n", + "vocabulary = sorted(set(corpus_all))\n", + "\n", + "print(\"\\n=== Vocabulary (Kata Unik) ===\")\n", + "print(vocabulary)\n", + "print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n", + "\n", + "\n", + "vocabulary = sorted(set(corpus_all))\n", + "\n", + "print(\"\\n=== Vocabulary (Kata Unik) ===\")\n", + "for idx, word in enumerate(vocabulary, start=1):\n", + " print(f\"{idx:>2}. 
{word}\")\n", + "print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Jumlah dokumen: 3\n", + "Jumlah tokenized_docs: 3\n", + "Jumlah kata di vocabulary (unique): 35\n", + "Contoh 10 kata pertama vocabulary: ['angka', 'bag-of-words', 'bagaimana', 'bersama', 'dan', 'direpresentasikan', 'eksperimen', 'fatah', 'fitur', 'informatika']\n" + ] + } + ], + "source": [ + "print(\"Jumlah dokumen:\", len(documents))\n", + "print(\"Jumlah tokenized_docs:\", len(tokenized_docs))\n", + "print(\"Jumlah kata di vocabulary (unique):\", len(vocabulary))\n", + "print(\"Contoh 10 kata pertama vocabulary:\", vocabulary[:10])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Info CountVectorizer ===\n", + "n-gram range: (1, 3)\n", + "max_features: 500\n", + "Jumlah fitur (vocabulary size): 110\n", + "\n", + "Contoh 20 fitur pertama:\n", + "['angka' 'bag' 'bag of' 'bag of words' 'bagaimana' 'bagaimana fitur'\n", + " 'bagaimana fitur teks' 'bersama' 'dan' 'dan machine'\n", + " 'dan machine learning' 'direpresentasikan' 'direpresentasikan sebagai'\n", + " 'direpresentasikan sebagai angka' 'eksperimen' 'eksperimen bag'\n", + " 'eksperimen bag of' 'fatah' 'fatah melakukan'\n", + " 'fatah melakukan eksperimen']\n" + ] + } + ], + "source": [ + "# BOW modern: CountVectorizer dengan n-gram dan max_features \n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "# Ubah di sini bila mau nilai lain:\n", + "NGRAM_RANGE = (1, 3) # ubah n-gram (contoh: (1,2) atau (1,4))\n", + "MAX_FEATURES = 500 # ubah jumlah fitur (contoh: 200, 1000)\n", + "\n", + "vectorizer = CountVectorizer(ngram_range=NGRAM_RANGE, max_features=MAX_FEATURES)\n", + "X = vectorizer.fit_transform(documents)\n", + 
"\n", + "print(\"\\n=== Info CountVectorizer ===\")\n", + "print(\"n-gram range:\", NGRAM_RANGE)\n", + "print(\"max_features:\", MAX_FEATURES)\n", + "print(\"Jumlah fitur (vocabulary size):\", len(vectorizer.vocabulary_))\n", + "print(\"\\nContoh 20 fitur pertama:\")\n", + "print(vectorizer.get_feature_names_out()[:20])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
angkabagbag ofbag of wordsbagaimanabagaimana fiturbagaimana fitur teksbersamadandan machine...untuk mengujiuntuk menguji kombinasiwordswords untukwords untuk melihatyangyang sedangyang sedang mempelajariyang seringyang sering muncul
00000000011...0000011100
11111111000...0011100000
20000000100...1100010011
\n", + "

3 rows × 110 columns

\n", + "
" + ], + "text/plain": [ + " angka bag bag of bag of words bagaimana bagaimana fitur \\\n", + "0 0 0 0 0 0 0 \n", + "1 1 1 1 1 1 1 \n", + "2 0 0 0 0 0 0 \n", + "\n", + " bagaimana fitur teks bersama dan dan machine ... untuk menguji \\\n", + "0 0 0 1 1 ... 0 \n", + "1 1 0 0 0 ... 0 \n", + "2 0 1 0 0 ... 1 \n", + "\n", + " untuk menguji kombinasi words words untuk words untuk melihat yang \\\n", + "0 0 0 0 0 1 \n", + "1 0 1 1 1 0 \n", + "2 1 0 0 0 1 \n", + "\n", + " yang sedang yang sedang mempelajari yang sering yang sering muncul \n", + "0 1 1 0 0 \n", + "1 0 0 0 0 \n", + "2 0 0 1 1 \n", + "\n", + "[3 rows x 110 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# 6) Konversi hasil ke DataFrame\n", + "import pandas as pd\n", + "\n", + "df_features = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())\n", + "display(df_features) # di Jupyter ini akan tampil tabel\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "ShevCTva2Fg9" + }, + "outputs": [], + "source": [ + "# Representasi Numerik (Matriks BoW)\n", + "bow_matrix = []\n", + "for doc in tokenized_docs:\n", + " vector = [doc.count(word) for word in vocabulary]\n", + " bow_matrix.append(vector)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
angkabag-of-wordsbagaimanabersamadandirepresentasikaneksperimenfatahfiturinformatika...rosyad,sabilasayasebagaisedangseringtahapteksuntukyang
00000100101...1110100001
11110011110...0001000110
20001000100...0000011011
\n", + "

3 rows × 35 columns

\n", + "
" + ], + "text/plain": [ + " angka bag-of-words bagaimana bersama dan direpresentasikan \\\n", + "0 0 0 0 0 1 0 \n", + "1 1 1 1 0 0 1 \n", + "2 0 0 0 1 0 0 \n", + "\n", + " eksperimen fatah fitur informatika ... rosyad, sabila saya sebagai \\\n", + "0 0 1 0 1 ... 1 1 1 0 \n", + "1 1 1 1 0 ... 0 0 0 1 \n", + "2 0 1 0 0 ... 0 0 0 0 \n", + "\n", + " sedang sering tahap teks untuk yang \n", + "0 1 0 0 0 0 1 \n", + "1 0 0 0 1 1 0 \n", + "2 0 1 1 0 1 1 \n", + "\n", + "[3 rows x 35 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Shape (dokumen x fitur): (3, 35)\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# Buat matrix manual berdasarkan vocabulary yang sudah kamu buat\n", + "bow_matrix = []\n", + "for tokens in tokenized_docs:\n", + " # hitung frekuensi tiap kata pada vocabulary pada dokumen ini\n", + " vector = [tokens.count(word) for word in vocabulary] # tokens adalah list kata\n", + " bow_matrix.append(vector)\n", + "\n", + "# Konversi ke DataFrame agar rapi saat ditampilkan\n", + "df_bow_manual = pd.DataFrame(bow_matrix, columns=vocabulary)\n", + "display(df_bow_manual) # di Jupyter ini akan tampil tabel\n", + "print(\"\\nShape (dokumen x fitur):\", df_bow_manual.shape)\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Klasifikasi Teks FNN.ipynb b/Klasifikasi Teks FNN.ipynb new file mode 100644 index 0000000..ef47aba --- /dev/null +++ b/Klasifikasi Teks FNN.ipynb @@ -0,0 +1,360 @@ +{ + "cells": [ + { + 
"cell_type": "markdown", + "id": "f4a1399a-f23d-4060-a07e-bce5a5c7ddac", + "metadata": {}, + "source": [ + "# Klasifikasi Teks menggunakan ANN (TF-IDF + FNN)\n", + "\n", + "**Nama:** Fatah Sabila Rosyad \n", + "**NIM:** 202210715288 \n", + "**Kelas:** F7B2 \n", + "**MK:** NLP \n", + "\n", + "**Tujuan praktikum:**\n", + "Menerapkan klasifikasi teks sentimen sederhana menggunakan TF-IDF dan Feedforward Neural Network (MLPClassifier), dengan:\n", + "- Mengubah contoh teks (menggunakan kalimat yang dibuat sendiri)\n", + "- Mengubah parameter TF-IDF (`max_features`, `ngram_range`)\n", + "- Mengubah arsitektur dan parameter model ANN (`hidden_layer_sizes`, `max_iter`, `learning_rate_init`)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "4c395092-326a-4abc-b308-067392277cfa", + "metadata": {}, + "outputs": [], + "source": [ + "# ---------------------------------------------------------\n", + "# Klasifikasi Teks dengan TF-IDF + Feedforward Neural Network\n", + "# ---------------------------------------------------------\n", + "\n", + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.neural_network import MLPClassifier\n", + "from sklearn.metrics import classification_report, confusion_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4ac91b0c-e6af-4766-8933-db10ebf69140", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textlabel
0Saya Fatah Sabila Rosyad merasa sangat puas de...positive
1Sebagai pelanggan, Fatah kecewa karena pelayan...negative
2Pengalaman belanja Fatah kali ini menyenangkan...positive
3Fatah benci produk ini karena mudah rusak dan ...negative
4Menurut Fatah kualitas produk ini sangat bagus...positive
5Fatah tidak akan membeli lagi di sini karena p...negative
\n", + "
" + ], + "text/plain": [ + " text label\n", + "0 Saya Fatah Sabila Rosyad merasa sangat puas de... positive\n", + "1 Sebagai pelanggan, Fatah kecewa karena pelayan... negative\n", + "2 Pengalaman belanja Fatah kali ini menyenangkan... positive\n", + "3 Fatah benci produk ini karena mudah rusak dan ... negative\n", + "4 Menurut Fatah kualitas produk ini sangat bagus... positive\n", + "5 Fatah tidak akan membeli lagi di sini karena p... negative" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# -----------------------------------------\n", + "# 1. Contoh Dataset (teks buatan Fatah)\n", + "# -----------------------------------------\n", + "\n", + "data = {\n", + " \"text\": [\n", + " \"Saya Fatah Sabila Rosyad merasa sangat puas dengan kualitas produk ini.\",\n", + " \"Sebagai pelanggan, Fatah kecewa karena pelayanan toko sangat lambat.\",\n", + " \"Pengalaman belanja Fatah kali ini menyenangkan, proses cepat dan barang sesuai.\",\n", + " \"Fatah benci produk ini karena mudah rusak dan tidak sesuai deskripsi.\",\n", + " \"Menurut Fatah kualitas produk ini sangat bagus dan layak direkomendasikan.\",\n", + " \"Fatah tidak akan membeli lagi di sini karena pelayanan buruk dan respon yang lambat.\"\n", + " ],\n", + " \"label\": [\"positive\", \"negative\", \"positive\", \"negative\", \"positive\", \"negative\"]\n", + "}\n", + "\n", + "df = pd.DataFrame(data)\n", + "df\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6dab8e80-c225-4de8-aecc-8b457153c3ee", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Jumlah data latih : 3\n", + "Jumlah data uji : 3\n" + ] + } + ], + "source": [ + "# -----------------------------------------\n", + "# 2. 
Split Train & Test (PERUBAHAN: test_size & random_state)\n", + "# -----------------------------------------\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " df[\"text\"],\n", + " df[\"label\"],\n", + " test_size=0.34, # semula 0.3\n", + " random_state=7 # semula 42\n", + ")\n", + "\n", + "print(\"Jumlah data latih :\", len(X_train))\n", + "print(\"Jumlah data uji :\", len(X_test))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "2cb05f0c-b497-4e9e-87bc-25d167f0c0ee", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape X_train_tfidf: (3, 52)\n", + "Shape X_test_tfidf : (3, 52)\n" + ] + } + ], + "source": [ + "# -----------------------------------------\n", + "# 3. TF-IDF Vectorization (PERUBAHAN PARAMETER)\n", + "# -----------------------------------------\n", + "\n", + "tfidf = TfidfVectorizer(\n", + " max_features=1000, # semula 5000\n", + " ngram_range=(1, 2) # tambahan: gunakan unigram + bigram\n", + ")\n", + "\n", + "X_train_tfidf = tfidf.fit_transform(X_train)\n", + "X_test_tfidf = tfidf.transform(X_test)\n", + "\n", + "print(\"Shape X_train_tfidf:\", X_train_tfidf.shape)\n", + "print(\"Shape X_test_tfidf :\", X_test_tfidf.shape)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0cb99708-b1bd-43a7-84b9-4e4925bf2914", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model selesai dilatih.\n" + ] + } + ], + "source": [ + "# -----------------------------------------\n", + "# 4. 
Feedforward ANN (MLPClassifier) (PERUBAHAN PARAMETER)\n", + "# -----------------------------------------\n", + "\n", + "model = MLPClassifier(\n", + " hidden_layer_sizes=(128, 32), # semula (256, 64)\n", + " activation='relu',\n", + " solver='adam',\n", + " learning_rate_init=0.001, # tambahan\n", + " max_iter=300, # semula 500\n", + " random_state=7\n", + ")\n", + "\n", + "model.fit(X_train_tfidf, y_train)\n", + "print(\"Model selesai dilatih.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d388afdf-0f08-48ea-92d1-e03390dee1d9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Classification Report ===\n", + " precision recall f1-score support\n", + "\n", + " negative 1.00 0.50 0.67 2\n", + " positive 0.50 1.00 0.67 1\n", + "\n", + " accuracy 0.67 3\n", + " macro avg 0.75 0.75 0.67 3\n", + "weighted avg 0.83 0.67 0.67 3\n", + "\n", + "=== Confusion Matrix ===\n", + "[[1 1]\n", + " [0 1]]\n" + ] + } + ], + "source": [ + "# -----------------------------------------\n", + "# 5. Evaluasi Model\n", + "# -----------------------------------------\n", + "\n", + "y_pred = model.predict(X_test_tfidf)\n", + "\n", + "print(\"=== Classification Report ===\")\n", + "print(classification_report(y_test, y_pred))\n", + "\n", + "print(\"=== Confusion Matrix ===\")\n", + "print(confusion_matrix(y_test, y_pred))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "64141093-c8fd-4118-aaf3-6e48454c5e76", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediksi untuk: Menurut Fatah, pengalaman belanja kali ini sangat memuaskan.\n", + "Hasil: positive\n" + ] + } + ], + "source": [ + "# -----------------------------------------\n", + "# 6. 
Prediksi Teks Baru (contoh 1 - positif)\n", + "# -----------------------------------------\n", + "sample_text = [\"Menurut Fatah, pengalaman belanja kali ini sangat memuaskan.\"]\n", + "sample_vec = tfidf.transform(sample_text)\n", + "prediction = model.predict(sample_vec)\n", + "\n", + "print(\"Prediksi untuk:\", sample_text[0])\n", + "print(\"Hasil:\", prediction[0])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d4bf8434-fe3b-4a88-a294-207fa731de7d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Prediksi untuk: Saya Fatah merasa kecewa karena layanan toko sangat buruk.\n", + "Hasil: negative\n" + ] + } + ], + "source": [ + "# -----------------------------------------\n", + "# 6. Prediksi Teks Baru (contoh 2 - negatif)\n", + "# -----------------------------------------\n", + "sample_text = [\"Saya Fatah merasa kecewa karena layanan toko sangat buruk.\"]\n", + "sample_vec = tfidf.transform(sample_text)\n", + "prediction = model.predict(sample_vec)\n", + "\n", + "print(\"Prediksi untuk:\", sample_text[0])\n", + "print(\"Hasil:\", prediction[0])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "724617be-aa1d-41bd-ad39-e6517fbcf837", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/N-Gram.ipynb b/N-Gram.ipynb new file mode 100644 index 0000000..0d74337 --- /dev/null +++ b/N-Gram.ipynb @@ -0,0 +1,362 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model Unigram, Bigram, dan Trigram\n", + 
"\n", + "**Nama:** Fatah Sabila Rosyad \n", + "**NIM:** 202210715288 \n", + "**Kelas:** F7B2 \n", + "\n", + "**Tujuan praktikum:** \n", + "Memahami cara menghitung frekuensi dan probabilitas kalimat menggunakan model Unigram, Bigram, dan Trigram pada teks bahasa Indonesia.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import Counter\n", + "from IPython.display import clear_output\n", + "import math" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Corpus: fatah suka olahraga lari dan suka olahraga badminton\n", + "Tokens (8): ['fatah', 'suka', 'olahraga', 'lari', 'dan', 'suka', 'olahraga', 'badminton']\n", + "\n", + "Frekuensi Unigram dalam kalimat:\n", + " ('fatah'): 1\n", + " ('suka'): 2\n", + " ('olahraga'): 2\n", + " ('lari'): 1\n", + " ('dan'): 1\n", + " ('badminton'): 1\n", + "\n", + "Total unigram dalam 1 kalimat: 8\n", + "\n", + "Probabilitas masing-masing unigram:\n", + " P(fatah) = 0.1250 (12.50%)\n", + " P(suka) = 0.2500 (25.00%)\n", + " P(olahraga) = 0.2500 (25.00%)\n", + " P(lari) = 0.1250 (12.50%)\n", + " P(dan) = 0.1250 (12.50%)\n", + " P(badminton) = 0.1250 (12.50%)\n", + "\n", + "Probabilitas Keseluruhan Kalimat (Model Unigram):\n", + " P(fatah suka olahraga lari dan suka olahraga badminton) = P(fatah)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(lari)=0.1250 x P(dan)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(badminton)=0.1250 = 0.00000095 (0.000095%)\n" + ] + } + ], + "source": [ + "# ================= UNIGRAM =================\n", + "\n", + "# 1. 
Input Kalimat dan Tokenisasi\n", + "kalimat = input(\"Masukkan kalimat: \").strip()\n", + "\n", + "# Bersihkan output (khusus lingkungan notebook)\n", + "try:\n", + " clear_output()\n", + "except:\n", + " pass\n", + "\n", + "print(f\"Corpus: {kalimat}\")\n", + "\n", + "# Tokenize\n", + "tokens = kalimat.lower().split()\n", + "print(f\"Tokens ({len(tokens)}): {tokens}\")\n", + "\n", + "# 2. Hitung Frekuensi Unigram\n", + "unigram_counts = Counter(tokens)\n", + "total_tokens = sum(unigram_counts.values())\n", + "\n", + "print(\"\\nFrekuensi Unigram dalam kalimat:\")\n", + "for pair, count in unigram_counts.items():\n", + " print(f\" ('{pair}'): {count}\")\n", + "print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n", + "\n", + "# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata\n", + "unigram_probabilities = {}\n", + "for word, count in unigram_counts.items():\n", + " prob = count / total_tokens\n", + " unigram_probabilities[word] = prob\n", + "\n", + "print(\"\\nProbabilitas masing-masing unigram:\")\n", + "for word, prob in unigram_probabilities.items():\n", + " print(f\" P({word}) = {prob:.4f} ({prob*100:.2f}%)\")\n", + "\n", + "# 4. 
Probabilitas Kalimat Keseluruhan (P(w1) * P(w2) * ...)\n", + "p_kalimat = 1\n", + "prob_parts = []\n", + "\n", + "for word in tokens:\n", + " prob_value = unigram_probabilities[word]\n", + " p_kalimat *= prob_value\n", + " prob_parts.append(f\"P({word})={prob_value:.4f}\")\n", + "\n", + "prob_str = \" x \".join(prob_parts)\n", + "\n", + "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n", + "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Corpus: Fatah sedang belajar model bigram untuk menghitung probabilitas kalimat dan fatah sangat suka belajar\n", + "Tokens (14): ['fatah', 'sedang', 'belajar', 'model', 'bigram', 'untuk', 'menghitung', 'probabilitas', 'kalimat', 'dan', 'fatah', 'sangat', 'suka', 'belajar']\n", + "\n", + "Frekuensi Bigram dalam kalimat:\n", + " ('fatah', 'sedang'): 1\n", + " ('sedang', 'belajar'): 1\n", + " ('belajar', 'model'): 1\n", + " ('model', 'bigram'): 1\n", + " ('bigram', 'untuk'): 1\n", + " ('untuk', 'menghitung'): 1\n", + " ('menghitung', 'probabilitas'): 1\n", + " ('probabilitas', 'kalimat'): 1\n", + " ('kalimat', 'dan'): 1\n", + " ('dan', 'fatah'): 1\n", + " ('fatah', 'sangat'): 1\n", + " ('sangat', 'suka'): 1\n", + " ('suka', 'belajar'): 1\n", + "\n", + "Total bigram dalam 1 kalimat: 13\n", + "\n", + "Probabilitas masing-masing bigram:\n", + " P(sedang|fatah) = 0.5000 (50.00%)\n", + " P(belajar|sedang) = 1.0000 (100.00%)\n", + " P(model|belajar) = 0.5000 (50.00%)\n", + " P(bigram|model) = 1.0000 (100.00%)\n", + " P(untuk|bigram) = 1.0000 (100.00%)\n", + " P(menghitung|untuk) = 1.0000 (100.00%)\n", + " P(probabilitas|menghitung) = 1.0000 (100.00%)\n", + " P(kalimat|probabilitas) = 1.0000 (100.00%)\n", + " P(dan|kalimat) = 1.0000 (100.00%)\n", + " P(fatah|dan) = 1.0000 (100.00%)\n", + " P(sangat|fatah) = 0.5000 
# ================= BIGRAM =================

def hitung_probabilitas_bigram(kalimat):
    """Compute bigram statistics for one sentence and print a full report.

    The sentence itself is the corpus: unigram and bigram counts are taken
    from its own tokens, P(w2|w1) = Count(w1,w2) / Count(w1), and the
    sentence probability is P(w1) * product of the bigram probabilities.

    Parameters
    ----------
    kalimat : str
        Input sentence; tokenized by lowercasing + whitespace split.

    Returns
    -------
    tuple[dict, float]
        (bigram_probabilities, p_kalimat). Returns ({}, 0.0) for an
        empty/whitespace-only sentence instead of crashing on tokens[0].
    """
    print(f"Corpus: {kalimat}")

    # 1. Tokenization: lowercase + whitespace split (punctuation is kept).
    tokens = kalimat.lower().split()
    print(f"Tokens ({len(tokens)}): {tokens}")

    # Guard: an empty sentence would otherwise raise IndexError on tokens[0]
    # and ZeroDivisionError on total_tokens.
    if not tokens:
        print("Kalimat kosong: tidak ada token untuk dihitung.")
        return {}, 0.0

    # 2. Unigram and bigram frequencies.
    unigram_counts = Counter(tokens)
    bigram_counts = Counter(zip(tokens, tokens[1:]))

    print("\nFrekuensi Bigram dalam kalimat:")
    for pair, count in bigram_counts.items():
        print(f" {pair}: {count}")
    print(f"\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}")

    # 3. Bigram probability: P(w2 | w1) = Count(w1,w2) / Count(w1).
    bigram_probabilities = {
        (w1, w2): count / unigram_counts[w1]
        for (w1, w2), count in bigram_counts.items()
    }

    print("\nProbabilitas masing-masing bigram:")
    for (w1, w2), prob in bigram_probabilities.items():
        print(f" P({w2}|{w1}) = {prob:.4f} ({prob*100:.2f}%)")

    # 4. Sentence probability (bigram model): P(w1) * prod_i P(w_i | w_{i-1}).
    total_tokens = len(tokens)
    p_w1 = unigram_counts[tokens[0]] / total_tokens
    p_kalimat = p_w1
    prob_str_parts = [f"P({tokens[0]})={p_w1:.4f}"]

    # Every adjacent pair of the sentence is necessarily in the table built
    # above, so direct indexing is safe (no .get fallback needed).
    for w1, w2 in zip(tokens, tokens[1:]):
        p = bigram_probabilities[(w1, w2)]
        p_kalimat *= p
        prob_str_parts.append(f"P({w2}|{w1})={p:.4f}")

    prob_str = " x ".join(prob_str_parts)
    print("\nProbabilitas Keseluruhan Kalimat (Model Bigram):")
    print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)")
    return bigram_probabilities, p_kalimat


if __name__ == "__main__":  # true inside a notebook cell; skipped on import
    kalimat = input("Masukkan kalimat: ").strip()
    try:
        # Best-effort screen clear; clear_output comes from IPython.display
        # (imported in an earlier cell). NameError means we are not running
        # inside Jupyter — narrowed from the original bare `except:`, which
        # also swallowed KeyboardInterrupt/SystemExit.
        clear_output()
    except NameError:
        pass
    hitung_probabilitas_bigram(kalimat)
# ================= TRIGRAM =================

def hitung_probabilitas_trigram(kalimat):
    """Compute trigram statistics for one sentence and print a full report.

    The sentence itself is the corpus. Trigram probability is
    P(w3|w1,w2) = Count(w1,w2,w3) / Count(w1,w2), and the sentence
    probability follows the trigram chain rule:
    P(w1) * P(w2|w1) * prod_i P(w_i | w_{i-2}, w_{i-1}).

    Parameters
    ----------
    kalimat : str
        Input sentence; tokenized by lowercasing + whitespace split.

    Returns
    -------
    tuple[dict, float]
        (trigram_probabilities, p_kalimat). Returns ({}, 0.0) for an
        empty/whitespace-only sentence instead of crashing on tokens[0].
    """
    print(f"Corpus: {kalimat}")

    # Tokenization: lowercase + whitespace split (punctuation is kept).
    tokens = kalimat.lower().split()
    print(f"Tokens ({len(tokens)}): {tokens}")

    # Guard: an empty sentence would otherwise raise IndexError on tokens[0].
    if not tokens:
        print("Kalimat kosong: tidak ada token untuk dihitung.")
        return {}, 0.0

    # 2. Unigram, bigram and trigram frequencies.
    unigram_counts = Counter(tokens)
    bigram_counts = Counter(zip(tokens, tokens[1:]))
    trigram_counts = Counter(zip(tokens, tokens[1:], tokens[2:]))

    print("\nFrekuensi Trigram dalam kalimat:")
    for tg, count in trigram_counts.items():
        print(f" {tg}: {count}")
    print(f"\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}")

    # 3. Trigram probability: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2).
    # The leading pair of every observed trigram is by construction a counted
    # bigram, so the denominator is always >= 1 (the original `> 0` branch
    # was unreachable).
    trigram_probabilities = {
        (w1, w2, w3): count / bigram_counts[(w1, w2)]
        for (w1, w2, w3), count in trigram_counts.items()
    }

    print("\nProbabilitas masing-masing trigram:")
    for (w1, w2, w3), prob in trigram_probabilities.items():
        print(f" P({w3}|{w1},{w2}) = {prob:.4f} ({prob*100:.2f}%)")

    # 4. Sentence probability (trigram model).
    total_tokens = len(tokens)
    p_w1 = unigram_counts[tokens[0]] / total_tokens  # P(w1); tokens is non-empty

    # P(w2|w1); for a one-token sentence the factor is neutral (1.0).
    if len(tokens) > 1:
        p_w2_w1 = bigram_counts[(tokens[0], tokens[1])] / unigram_counts[tokens[0]]
    else:
        p_w2_w1 = 1.0

    p_kalimat = p_w1 * p_w2_w1
    prob_str_parts = [f"P({tokens[0]})={p_w1:.4f}"]
    if len(tokens) > 1:
        prob_str_parts.append(f"P({tokens[1]}|{tokens[0]})={p_w2_w1:.4f}")

    # Trigram factors P(w_i | w_{i-2}, w_{i-1}) for i >= 3; every sliding
    # triple of the sentence is in the table built above.
    for w1, w2, w3 in zip(tokens, tokens[1:], tokens[2:]):
        p = trigram_probabilities[(w1, w2, w3)]
        p_kalimat *= p
        prob_str_parts.append(f"P({w3}|{w1},{w2})={p:.4f}")

    prob_str = " x ".join(prob_str_parts)
    print("\nProbabilitas Keseluruhan Kalimat (Model Trigram):")
    print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)")
    return trigram_probabilities, p_kalimat


if __name__ == "__main__":  # true inside a notebook cell; skipped on import
    kalimat = input("Masukkan kalimat: ").strip()
    try:
        # Best-effort screen clear; clear_output comes from IPython.display
        # (imported in an earlier cell). NameError means we are not running
        # inside Jupyter — narrowed from the original bare `except:`.
        clear_output()
    except NameError:
        pass
    hitung_probabilitas_trigram(kalimat)
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}