{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model Unigram, Bigram, dan Trigram\n",
    "\n",
    "**Nama:** Fatah Sabila Rosyad  \n",
    "**NIM:** 202210715288  \n",
    "**Kelas:** F7B2  \n",
    "\n",
    "**Tujuan praktikum:**  \n",
    "Memahami cara menghitung frekuensi dan probabilitas kalimat menggunakan model Unigram, Bigram, dan Trigram pada teks bahasa Indonesia.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "from IPython.display import clear_output\n",
    "import math"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ================= HELPERS =================\n",
    "\n",
    "def baca_tokens(prompt=\"Masukkan kalimat: \"):\n",
    "    \"\"\"Prompt for a sentence, echo it, and return its lowercase tokens.\n",
    "\n",
    "    Tokens come from a plain whitespace split of the lowercased input, so\n",
    "    punctuation stays attached to the neighbouring word.\n",
    "    \"\"\"\n",
    "    kalimat = input(prompt).strip()\n",
    "\n",
    "    # Remove the input prompt from the cell output. This is purely cosmetic,\n",
    "    # so a display failure is ignored rather than aborting the analysis.\n",
    "    try:\n",
    "        clear_output()\n",
    "    except Exception:\n",
    "        pass\n",
    "\n",
    "    print(f\"Corpus: {kalimat}\")\n",
    "    tokens = kalimat.lower().split()\n",
    "    print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
    "    return tokens\n",
    "\n",
    "\n",
    "def buat_ngram(tokens, n):\n",
    "    \"\"\"Return every run of n consecutive tokens as a list of tuples.\"\"\"\n",
    "    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]\n",
    "\n",
    "\n",
    "def cetak_probabilitas_kalimat(nama_model, tokens, faktor):\n",
    "    \"\"\"Print the chain of factors and the resulting sentence probability.\n",
    "\n",
    "    faktor is a list of (label, probability) pairs in multiplication order.\n",
    "    \"\"\"\n",
    "    p_kalimat = math.prod(p for _, p in faktor)\n",
    "    prob_str = \" x \".join(f\"{label}={p:.4f}\" for label, p in faktor)\n",
    "\n",
    "    print(f\"\\nProbabilitas Keseluruhan Kalimat (Model {nama_model}):\")\n",
    "    print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Corpus: fatah suka olahraga lari dan suka olahraga badminton\n",
      "Tokens (8): ['fatah', 'suka', 'olahraga', 'lari', 'dan', 'suka', 'olahraga', 'badminton']\n",
      "\n",
      "Frekuensi Unigram dalam kalimat:\n",
      " ('fatah'): 1\n",
      " ('suka'): 2\n",
      " ('olahraga'): 2\n",
      " ('lari'): 1\n",
      " ('dan'): 1\n",
      " ('badminton'): 1\n",
      "\n",
      "Total unigram dalam 1 kalimat: 8\n",
      "\n",
      "Probabilitas masing-masing unigram:\n",
      " P(fatah) = 0.1250 (12.50%)\n",
      " P(suka) = 0.2500 (25.00%)\n",
      " P(olahraga) = 0.2500 (25.00%)\n",
      " P(lari) = 0.1250 (12.50%)\n",
      " P(dan) = 0.1250 (12.50%)\n",
      " P(badminton) = 0.1250 (12.50%)\n",
      "\n",
      "Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
      " P(fatah suka olahraga lari dan suka olahraga badminton) = P(fatah)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(lari)=0.1250 x P(dan)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(badminton)=0.1250 = 0.00000095 (0.000095%)\n"
     ]
    }
   ],
   "source": [
    "# ================= UNIGRAM =================\n",
    "\n",
    "def model_unigram(tokens):\n",
    "    \"\"\"Print unigram counts, probabilities, and the sentence probability.\"\"\"\n",
    "    if not tokens:\n",
    "        # No tokens means no counts to normalise; report it and stop.\n",
    "        print(\"\\nKalimat kosong: tidak ada unigram yang dapat dihitung.\")\n",
    "        return\n",
    "\n",
    "    # 2. Unigram frequencies\n",
    "    unigram_counts = Counter(tokens)\n",
    "    total_tokens = len(tokens)\n",
    "\n",
    "    print(\"\\nFrekuensi Unigram dalam kalimat:\")\n",
    "    for word, count in unigram_counts.items():\n",
    "        print(f\" ('{word}'): {count}\")\n",
    "    print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
    "\n",
    "    # 3. Unigram probability: P(wi) = Count(wi) / total number of tokens\n",
    "    unigram_probabilities = {\n",
    "        word: count / total_tokens for word, count in unigram_counts.items()\n",
    "    }\n",
    "\n",
    "    print(\"\\nProbabilitas masing-masing unigram:\")\n",
    "    for word, prob in unigram_probabilities.items():\n",
    "        print(f\" P({word}) = {prob:.4f} ({prob*100:.2f}%)\")\n",
    "\n",
    "    # 4. Sentence probability: P(w1) * P(w2) * ... * P(wn)\n",
    "    faktor = [(f\"P({word})\", unigram_probabilities[word]) for word in tokens]\n",
    "    cetak_probabilitas_kalimat(\"Unigram\", tokens, faktor)\n",
    "\n",
    "\n",
    "# 1. Read and tokenize the sentence, then run the unigram model\n",
    "model_unigram(baca_tokens())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Corpus: Fatah sedang belajar model bigram untuk menghitung probabilitas kalimat dan fatah sangat suka belajar\n",
      "Tokens (14): ['fatah', 'sedang', 'belajar', 'model', 'bigram', 'untuk', 'menghitung', 'probabilitas', 'kalimat', 'dan', 'fatah', 'sangat', 'suka', 'belajar']\n",
      "\n",
      "Frekuensi Bigram dalam kalimat:\n",
      " ('fatah', 'sedang'): 1\n",
      " ('sedang', 'belajar'): 1\n",
      " ('belajar', 'model'): 1\n",
      " ('model', 'bigram'): 1\n",
      " ('bigram', 'untuk'): 1\n",
      " ('untuk', 'menghitung'): 1\n",
      " ('menghitung', 'probabilitas'): 1\n",
      " ('probabilitas', 'kalimat'): 1\n",
      " ('kalimat', 'dan'): 1\n",
      " ('dan', 'fatah'): 1\n",
      " ('fatah', 'sangat'): 1\n",
      " ('sangat', 'suka'): 1\n",
      " ('suka', 'belajar'): 1\n",
      "\n",
      "Total bigram dalam 1 kalimat: 13\n",
      "\n",
      "Probabilitas masing-masing bigram:\n",
      " P(sedang|fatah) = 0.5000 (50.00%)\n",
      " P(belajar|sedang) = 1.0000 (100.00%)\n",
      " P(model|belajar) = 0.5000 (50.00%)\n",
      " P(bigram|model) = 1.0000 (100.00%)\n",
      " P(untuk|bigram) = 1.0000 (100.00%)\n",
      " P(menghitung|untuk) = 1.0000 (100.00%)\n",
      " P(probabilitas|menghitung) = 1.0000 (100.00%)\n",
      " P(kalimat|probabilitas) = 1.0000 (100.00%)\n",
      " P(dan|kalimat) = 1.0000 (100.00%)\n",
      " P(fatah|dan) = 1.0000 (100.00%)\n",
      " P(sangat|fatah) = 0.5000 (50.00%)\n",
      " P(suka|sangat) = 1.0000 (100.00%)\n",
      " P(belajar|suka) = 1.0000 (100.00%)\n",
      "\n",
      "Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
      " P(fatah sedang belajar model bigram untuk menghitung probabilitas kalimat dan fatah sangat suka belajar) = P(fatah)=0.1429 x P(sedang|fatah)=0.5000 x P(belajar|sedang)=1.0000 x P(model|belajar)=0.5000 x P(bigram|model)=1.0000 x P(untuk|bigram)=1.0000 x P(menghitung|untuk)=1.0000 x P(probabilitas|menghitung)=1.0000 x P(kalimat|probabilitas)=1.0000 x P(dan|kalimat)=1.0000 x P(fatah|dan)=1.0000 x P(sangat|fatah)=0.5000 x P(suka|sangat)=1.0000 x P(belajar|suka)=1.0000 = 0.01785714 (1.785714%)\n"
     ]
    }
   ],
   "source": [
    "# ================= BIGRAM =================\n",
    "\n",
    "def model_bigram(tokens):\n",
    "    \"\"\"Print bigram counts, conditional probabilities, and the sentence probability.\"\"\"\n",
    "    if not tokens:\n",
    "        # tokens[0] is needed for P(w1) below, so an empty sentence stops here.\n",
    "        print(\"\\nKalimat kosong: tidak ada bigram yang dapat dihitung.\")\n",
    "        return\n",
    "\n",
    "    # 2. Unigram and bigram frequencies\n",
    "    unigram_counts = Counter(tokens)\n",
    "    bigram_counts = Counter(buat_ngram(tokens, 2))\n",
    "\n",
    "    print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
    "    for pair, count in bigram_counts.items():\n",
    "        print(f\" {pair}: {count}\")\n",
    "    print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n",
    "\n",
    "    # 3. Bigram probability: P(w2 | w1) = Count(w1, w2) / Count(w1)\n",
    "    bigram_probabilities = {\n",
    "        (w1, w2): count / unigram_counts[w1]\n",
    "        for (w1, w2), count in bigram_counts.items()\n",
    "    }\n",
    "\n",
    "    print(\"\\nProbabilitas masing-masing bigram:\")\n",
    "    for (w1, w2), prob in bigram_probabilities.items():\n",
    "        print(f\" P({w2}|{w1}) = {prob:.4f} ({prob*100:.2f}%)\")\n",
    "\n",
    "    # 4. Sentence probability: P(w1) times P(wi | wi-1) for every later token\n",
    "    p_w1 = unigram_counts[tokens[0]] / len(tokens)\n",
    "    faktor = [(f\"P({tokens[0]})\", p_w1)]\n",
    "    for w1, w2 in zip(tokens, tokens[1:]):\n",
    "        faktor.append((f\"P({w2}|{w1})\", bigram_probabilities[(w1, w2)]))\n",
    "    cetak_probabilitas_kalimat(\"Bigram\", tokens, faktor)\n",
    "\n",
    "\n",
    "# 1. Read and tokenize the sentence, then run the bigram model\n",
    "model_bigram(baca_tokens())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Corpus: Pada praktikum ini Fatah sedang mempelajari model trigram untuk kalimat bahasa Indonesia\n",
      "Tokens (12): ['pada', 'praktikum', 'ini', 'fatah', 'sedang', 'mempelajari', 'model', 'trigram', 'untuk', 'kalimat', 'bahasa', 'indonesia']\n",
      "\n",
      "Frekuensi Trigram dalam kalimat:\n",
      " ('pada', 'praktikum', 'ini'): 1\n",
      " ('praktikum', 'ini', 'fatah'): 1\n",
      " ('ini', 'fatah', 'sedang'): 1\n",
      " ('fatah', 'sedang', 'mempelajari'): 1\n",
      " ('sedang', 'mempelajari', 'model'): 1\n",
      " ('mempelajari', 'model', 'trigram'): 1\n",
      " ('model', 'trigram', 'untuk'): 1\n",
      " ('trigram', 'untuk', 'kalimat'): 1\n",
      " ('untuk', 'kalimat', 'bahasa'): 1\n",
      " ('kalimat', 'bahasa', 'indonesia'): 1\n",
      "\n",
      "Total trigram dalam 1 kalimat: 10\n",
      "\n",
      "Probabilitas masing-masing trigram:\n",
      " P(ini|pada,praktikum) = 1.0000 (100.00%)\n",
      " P(fatah|praktikum,ini) = 1.0000 (100.00%)\n",
      " P(sedang|ini,fatah) = 1.0000 (100.00%)\n",
      " P(mempelajari|fatah,sedang) = 1.0000 (100.00%)\n",
      " P(model|sedang,mempelajari) = 1.0000 (100.00%)\n",
      " P(trigram|mempelajari,model) = 1.0000 (100.00%)\n",
      " P(untuk|model,trigram) = 1.0000 (100.00%)\n",
      " P(kalimat|trigram,untuk) = 1.0000 (100.00%)\n",
      " P(bahasa|untuk,kalimat) = 1.0000 (100.00%)\n",
      " P(indonesia|kalimat,bahasa) = 1.0000 (100.00%)\n",
      "\n",
      "Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
      " P(pada praktikum ini fatah sedang mempelajari model trigram untuk kalimat bahasa indonesia) = P(pada)=0.0833 x P(praktikum|pada)=1.0000 x P(ini|pada,praktikum)=1.0000 x P(fatah|praktikum,ini)=1.0000 x P(sedang|ini,fatah)=1.0000 x P(mempelajari|fatah,sedang)=1.0000 x P(model|sedang,mempelajari)=1.0000 x P(trigram|mempelajari,model)=1.0000 x P(untuk|model,trigram)=1.0000 x P(kalimat|trigram,untuk)=1.0000 x P(bahasa|untuk,kalimat)=1.0000 x P(indonesia|kalimat,bahasa)=1.0000 = 0.08333333 (8.333333%)\n"
     ]
    }
   ],
   "source": [
    "# ================= TRIGRAM =================\n",
    "\n",
    "def model_trigram(tokens):\n",
    "    \"\"\"Print trigram counts, conditional probabilities, and the sentence probability.\"\"\"\n",
    "    if not tokens:\n",
    "        # tokens[0] is needed for P(w1) below, so an empty sentence stops here.\n",
    "        print(\"\\nKalimat kosong: tidak ada trigram yang dapat dihitung.\")\n",
    "        return\n",
    "\n",
    "    # 2. Unigram, bigram, and trigram frequencies\n",
    "    unigram_counts = Counter(tokens)\n",
    "    bigram_counts = Counter(buat_ngram(tokens, 2))\n",
    "    trigram_counts = Counter(buat_ngram(tokens, 3))\n",
    "\n",
    "    print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
    "    for tg, count in trigram_counts.items():\n",
    "        print(f\" {tg}: {count}\")\n",
    "    print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n",
    "\n",
    "    # 3. Trigram probability: P(w3 | w1, w2) = Count(w1, w2, w3) / Count(w1, w2).\n",
    "    # Each trigram's leading bigram is counted from the same tokens, so the\n",
    "    # denominator is always at least 1.\n",
    "    trigram_probabilities = {\n",
    "        (w1, w2, w3): count / bigram_counts[(w1, w2)]\n",
    "        for (w1, w2, w3), count in trigram_counts.items()\n",
    "    }\n",
    "\n",
    "    print(\"\\nProbabilitas masing-masing trigram:\")\n",
    "    for (w1, w2, w3), prob in trigram_probabilities.items():\n",
    "        print(f\" P({w3}|{w1},{w2}) = {prob:.4f} ({prob*100:.2f}%)\")\n",
    "\n",
    "    # 4. Sentence probability: P(w1) * P(w2 | w1), then P(wi | wi-2, wi-1)\n",
    "    # for the third token onward.\n",
    "    p_w1 = unigram_counts[tokens[0]] / len(tokens)\n",
    "    faktor = [(f\"P({tokens[0]})\", p_w1)]\n",
    "\n",
    "    # A one-word sentence has no second token, so only P(w1) contributes.\n",
    "    if len(tokens) > 1:\n",
    "        p_w2_w1 = bigram_counts[(tokens[0], tokens[1])] / unigram_counts[tokens[0]]\n",
    "        faktor.append((f\"P({tokens[1]}|{tokens[0]})\", p_w2_w1))\n",
    "\n",
    "    for w1, w2, w3 in zip(tokens, tokens[1:], tokens[2:]):\n",
    "        faktor.append((f\"P({w3}|{w1},{w2})\", trigram_probabilities[(w1, w2, w3)]))\n",
    "\n",
    "    cetak_probabilitas_kalimat(\"Trigram\", tokens, faktor)\n",
    "\n",
    "\n",
    "# 1. Read and tokenize the sentence, then run the trigram model\n",
    "model_trigram(baca_tokens())\n"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}