{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "JVPdWpz3hhbj"
   },
   "source": [
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "4Mvva3v65h1v"
   },
   "source": [
    "# **UNIGRAM**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "1cub_VJnUJMl",
    "outputId": "1889eb61-4f3f-4780-f42e-02368076cce3"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Corpus: saya suka makan nasi\n",
      "Tokens (4): ['saya', 'suka', 'makan', 'nasi']\n",
      "\n",
      "Frekuensi Unigram dalam kalimat\n",
      " ('saya'): 1\n",
      " ('suka'): 1\n",
      " ('makan'): 1\n",
      " ('nasi'): 1\n",
      "\n",
      "Total unigram dalam 1 kalimat: 4\n",
      "\n",
      "Probabilitas masing-masing unigram:\n",
      " P(saya) = 0.25 (25.00%)\n",
      " P(suka) = 0.25 (25.00%)\n",
      " P(makan) = 0.25 (25.00%)\n",
      " P(nasi) = 0.25 (25.00%)\n",
      "\n",
      "Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
      " P(saya suka makan nasi) = P(saya)=0.25 x P(suka)=0.25 x P(makan)=0.25 x P(nasi)=0.25 = 0.0039 (0.39%)\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "from IPython.display import clear_output\n",
    "import math\n",
    "\n",
    "# 1. Read the sentence and tokenize it\n",
    "kalimat = input(\"Masukkan kalimat: \").strip()\n",
    "\n",
    "# Clear the input prompt from the cell output (notebook environments only).\n",
    "# Best effort: a display failure must not abort the analysis. `except Exception`\n",
    "# (not a bare `except`) lets KeyboardInterrupt / SystemExit propagate.\n",
    "try:\n",
    "    clear_output()\n",
    "except Exception:\n",
    "    pass\n",
    "\n",
    "print(f\"Corpus: {kalimat}\")\n",
    "\n",
    "# Tokenize: lowercase, then split on whitespace\n",
    "tokens = kalimat.lower().split()\n",
    "if not tokens:\n",
    "    # With no tokens the product in step 4 stays at 1, i.e. the empty\n",
    "    # sentence would be reported with probability 100%. Reject it instead.\n",
    "    raise ValueError(\"Kalimat kosong: masukkan minimal satu kata.\")\n",
    "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
    "\n",
    "# 2. Count unigram frequencies\n",
    "unigram_counts = Counter(tokens)\n",
    "total_tokens = sum(unigram_counts.values())\n",
    "\n",
    "print(\"\\nFrekuensi Unigram dalam kalimat\")\n",
    "for word, count in unigram_counts.items():\n",
    "    print(f\" ('{word}'): {count}\")\n",
    "print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
    "\n",
    "# 3. Unigram probability: P(wi) = Count(wi) / total number of tokens\n",
    "unigram_probabilities = {\n",
    "    word: count / total_tokens for word, count in unigram_counts.items()\n",
    "}\n",
    "\n",
    "print(\"\\nProbabilitas masing-masing unigram:\")\n",
    "for word, prob in unigram_probabilities.items():\n",
    "    print(f\" P({word}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
    "\n",
    "# 4. Sentence probability under the unigram model:\n",
    "#    P(sentence) = P(w1) * P(w2) * ...\n",
    "# The running product and the human-readable formula are built in one pass\n",
    "# over the tokens (repeated words contribute one factor per occurrence).\n",
    "p_kalimat = 1\n",
    "prob_parts = []\n",
    "for word in tokens:\n",
    "    prob_value = unigram_probabilities[word]\n",
    "    p_kalimat *= prob_value\n",
    "    prob_parts.append(f\"P({word})={prob_value:.2f}\")\n",
    "\n",
    "# Join the factors into the displayed formula\n",
    "prob_str = \" x \".join(prob_parts)\n",
    "\n",
    "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n",
    "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Vstwt996-FrS"
   },
   "source": [
    "# **BIGRAM**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "XRIY4qgTVbjl",
    "outputId": "ea6e62ce-45a0-40c9-ca98-1fcc30558479"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Corpus: saya makan nasi dan saya makan roti\n",
      "Tokens (7): ['saya', 'makan', 'nasi', 'dan', 'saya', 'makan', 'roti']\n",
      "\n",
      "Frekuensi Bigram dalam kalimat:\n",
      " ('saya', 'makan'): 2\n",
      " ('makan', 'nasi'): 1\n",
      " ('nasi', 'dan'): 1\n",
      " ('dan', 'saya'): 1\n",
      " ('makan', 'roti'): 1\n",
      "\n",
      "Total bigram dalam 1 kalimat: 6\n",
      "\n",
      "Probabilitas masing-masing bigram:\n",
      " P(makan|saya) = 1.00 (100.00%)\n",
      " P(nasi|makan) = 0.50 (50.00%)\n",
      " P(dan|nasi) = 1.00 (100.00%)\n",
      " P(saya|dan) = 1.00 (100.00%)\n",
      " P(roti|makan) = 0.50 (50.00%)\n",
      "\n",
      "Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
      " P(saya makan nasi dan saya makan roti) = P(saya)=0.29 x P(makan|saya)=1.00 x P(nasi|makan)=0.50 x P(dan|nasi)=1.00 x P(saya|dan)=1.00 x P(makan|saya)=1.00 x P(roti|makan)=0.50 = 0.071429 (7.14%)\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "from IPython.display import clear_output\n",
    "import math\n",
    "\n",
    "# 1. Read the sentence and tokenize it\n",
    "kalimat = input(\"Masukkan kalimat: \").strip()\n",
    "\n",
    "# Clear the input prompt from the cell output (notebook environments only).\n",
    "# Best effort: a display failure must not abort the analysis. `except Exception`\n",
    "# (not a bare `except`) lets KeyboardInterrupt / SystemExit propagate.\n",
    "try:\n",
    "    clear_output()\n",
    "except Exception:\n",
    "    pass\n",
    "\n",
    "print(f\"Corpus: {kalimat}\")\n",
    "\n",
    "# Tokenize: lowercase, then split on whitespace\n",
    "tokens = kalimat.lower().split()\n",
    "if not tokens:\n",
    "    # Step 4 reads tokens[0]; fail early with a clear message rather than\n",
    "    # an IndexError after half of the report has been printed.\n",
    "    raise ValueError(\"Kalimat kosong: masukkan minimal satu kata.\")\n",
    "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
    "\n",
    "# 2. Count unigram and bigram frequencies\n",
    "unigram_counts = Counter(tokens)\n",
    "# Adjacent token pairs, in sentence order (empty for a one-word sentence)\n",
    "bigrams = list(zip(tokens, tokens[1:]))\n",
    "bigram_counts = Counter(bigrams)\n",
    "\n",
    "print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
    "for pair, count in bigram_counts.items():\n",
    "    print(f\" {pair}: {count}\")\n",
    "print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n",
    "\n",
    "# 3. Bigram probability: P(w2 | w1) = Count(w1, w2) / Count(w1)\n",
    "# Count(w1) >= 1 for every counted bigram, so the division is always safe.\n",
    "bigram_probabilities = {\n",
    "    (w1, w2): count / unigram_counts[w1]\n",
    "    for (w1, w2), count in bigram_counts.items()\n",
    "}\n",
    "\n",
    "print(\"\\nProbabilitas masing-masing bigram:\")\n",
    "for (w1, w2), prob in bigram_probabilities.items():\n",
    "    print(f\" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
    "\n",
    "# 4. Sentence probability under the bigram model:\n",
    "#    P(sentence) = P(w1) * P(w2|w1) * P(w3|w2) * ...\n",
    "total_tokens = sum(unigram_counts.values())\n",
    "p_w1 = unigram_counts[tokens[0]] / total_tokens  # P(w1), unigram estimate\n",
    "p_kalimat = p_w1  # start the product with P(w1)\n",
    "\n",
    "prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]  # first factor of the formula\n",
    "\n",
    "# One conditional factor per adjacent pair, in sentence order. Every pair was\n",
    "# counted in step 2, so the lookup cannot miss.\n",
    "for pair in bigrams:\n",
    "    p = bigram_probabilities[pair]\n",
    "    p_kalimat *= p\n",
    "    prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.2f}\")\n",
    "\n",
    "# Join the factors into the displayed formula\n",
    "prob_str = \" x \".join(prob_str_parts)\n",
    "\n",
    "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n",
    "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "E6n1IU8X-G9S"
   },
   "source": [
    "# **TRIGRAM**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "BIRARsj2FHJg",
    "outputId": "968d420e-9370-40e5-e7e1-148e1d351d62"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Corpus: mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning\n",
      "Tokens (7): ['mahasiswa', 'mengerjakan', 'tugas', 'kemudian', 'mahasiswa', 'upload', 'e-learning']\n",
      "\n",
      "Frekuensi Trigram dalam kalimat:\n",
      " ('mahasiswa', 'mengerjakan', 'tugas'): 1\n",
      " ('mengerjakan', 'tugas', 'kemudian'): 1\n",
      " ('tugas', 'kemudian', 'mahasiswa'): 1\n",
      " ('kemudian', 'mahasiswa', 'upload'): 1\n",
      " ('mahasiswa', 'upload', 'e-learning'): 1\n",
      "\n",
      "Total trigram dalam 1 kalimat: 5\n",
      "\n",
      "Probabilitas masing-masing trigram:\n",
      " P(tugas|mahasiswa,mengerjakan) = 1.00 (100.00%)\n",
      " P(kemudian|mengerjakan,tugas) = 1.00 (100.00%)\n",
      " P(mahasiswa|tugas,kemudian) = 1.00 (100.00%)\n",
      " P(upload|kemudian,mahasiswa) = 1.00 (100.00%)\n",
      " P(e-learning|mahasiswa,upload) = 1.00 (100.00%)\n",
      "\n",
      "Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
      " P(mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning) = P(mahasiswa)=0.29 x P(mengerjakan|mahasiswa)=0.50 x P(tugas|mahasiswa,mengerjakan)=1.00 x P(kemudian|mengerjakan,tugas)=1.00 x P(mahasiswa|tugas,kemudian)=1.00 x P(upload|kemudian,mahasiswa)=1.00 x P(e-learning|mahasiswa,upload)=1.00 = 0.142857 (14.29%)\n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "from IPython.display import clear_output\n",
    "import math\n",
    "\n",
    "# 1. Read the sentence and tokenize it\n",
    "kalimat = input(\"Masukkan kalimat: \").strip()\n",
    "\n",
    "# Clear the input prompt from the cell output (notebook environments only).\n",
    "# Best effort: a display failure must not abort the analysis. `except Exception`\n",
    "# (not a bare `except`) lets KeyboardInterrupt / SystemExit propagate.\n",
    "try:\n",
    "    clear_output()\n",
    "except Exception:\n",
    "    pass\n",
    "\n",
    "print(f\"Corpus: {kalimat}\")\n",
    "\n",
    "# Tokenize: lowercase, then split on whitespace\n",
    "tokens = kalimat.lower().split()\n",
    "if not tokens:\n",
    "    # Step 4 reads tokens[0]; fail early with a clear message rather than\n",
    "    # an IndexError after half of the report has been printed.\n",
    "    raise ValueError(\"Kalimat kosong: masukkan minimal satu kata.\")\n",
    "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
    "\n",
    "# 2. Count bigram and trigram frequencies\n",
    "# zip over shifted slices yields the adjacent windows in sentence order and\n",
    "# is simply empty when the sentence is shorter than the window.\n",
    "bigrams = list(zip(tokens, tokens[1:]))\n",
    "trigrams = list(zip(tokens, tokens[1:], tokens[2:]))\n",
    "\n",
    "bigram_counts = Counter(bigrams)\n",
    "trigram_counts = Counter(trigrams)\n",
    "\n",
    "print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
    "for tg, count in trigram_counts.items():\n",
    "    print(f\" {tg}: {count}\")\n",
    "print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n",
    "\n",
    "# 3. Trigram probability: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)\n",
    "# The leading bigram of every counted trigram was counted from the same\n",
    "# tokens, so Count(w1,w2) >= 1 and no zero-division guard is needed.\n",
    "trigram_probabilities = {\n",
    "    (w1, w2, w3): count / bigram_counts[(w1, w2)]\n",
    "    for (w1, w2, w3), count in trigram_counts.items()\n",
    "}\n",
    "\n",
    "print(\"\\nProbabilitas masing-masing trigram:\")\n",
    "for (w1, w2, w3), prob in trigram_probabilities.items():\n",
    "    print(f\" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
    "\n",
    "# Unigram counts are needed for the first two factors, P(w1) and P(w2|w1)\n",
    "unigram_counts = Counter(tokens)\n",
    "total_tokens = sum(unigram_counts.values())\n",
    "\n",
    "# 4. Sentence probability under the trigram model:\n",
    "#    P(sentence) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...\n",
    "\n",
    "# a. P(w1): unigram estimate for the first word\n",
    "p_w1 = unigram_counts[tokens[0]] / total_tokens\n",
    "prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]\n",
    "\n",
    "# b. P(w2|w1): bigram estimate without smoothing\n",
    "if len(tokens) > 1:\n",
    "    count_w1 = unigram_counts[tokens[0]]  # >= 1 because tokens[0] was counted\n",
    "    p_w2_w1 = bigram_counts[(tokens[0], tokens[1])] / count_w1\n",
    "    prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}\")\n",
    "else:\n",
    "    p_w2_w1 = 1.0  # one-word sentence: neutral factor, not shown in the formula\n",
    "\n",
    "p_kalimat = p_w1 * p_w2_w1  # start the product with P(w1) * P(w2|w1)\n",
    "\n",
    "# c. One trigram factor P(wi | wi-2, wi-1) per window, from the third word on.\n",
    "# Every window was counted in step 2, so the lookup cannot miss.\n",
    "for triplet in trigrams:\n",
    "    p = trigram_probabilities[triplet]\n",
    "    p_kalimat *= p\n",
    "    prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}\")\n",
    "\n",
    "# Join the factors into the displayed formula\n",
    "prob_str = \" x \".join(prob_str_parts)\n",
    "\n",
    "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n",
    "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")\n"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}