From 058e263d0e5c04558b7d3daa17f23ce0ff3db671 Mon Sep 17 00:00:00 2001
From: 202210715229 ALPRIAN BAHARAJA SITORUS
 <202210715229@mhs.ubharajaya.ac.id>
Date: Wed, 21 Jan 2026 01:50:34 +0700
Subject: [PATCH] Upload files to "/"

---
 N-Gram.ipynb | 395 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 395 insertions(+)
 create mode 100644 N-Gram.ipynb

diff --git a/N-Gram.ipynb b/N-Gram.ipynb
new file mode 100644
index 0000000..1b197d6
--- /dev/null
+++ b/N-Gram.ipynb
@@ -0,0 +1,395 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "JVPdWpz3hhbj"
+   },
+   "source": [
+    "# **N-GRAM LANGUAGE MODELS**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "4Mvva3v65h1v"
+   },
+   "source": [
+    "# **UNIGRAM**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "1cub_VJnUJMl",
+    "outputId": "1889eb61-4f3f-4780-f42e-02368076cce3"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Corpus: Saya sangat mencintai pacar saya\n",
+      "Tokens (5): ['saya', 'sangat', 'mencintai', 'pacar', 'saya']\n",
+      "\n",
+      "Frekuensi Unigram dalam kalimat\n",
+      " ('saya'): 2\n",
+      " ('sangat'): 1\n",
+      " ('mencintai'): 1\n",
+      " ('pacar'): 1\n",
+      "\n",
+      "Total unigram dalam 1 kalimat: 5\n",
+      "\n",
+      "Probabilitas masing-masing unigram:\n",
+      " P(saya) = 0.40 (40.00%)\n",
+      " P(sangat) = 0.20 (20.00%)\n",
+      " P(mencintai) = 0.20 (20.00%)\n",
+      " P(pacar) = 0.20 (20.00%)\n",
+      "\n",
+      "Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
+      " P(saya sangat mencintai pacar saya) = P(saya)=0.40 x P(sangat)=0.20 x P(mencintai)=0.20 x P(pacar)=0.20 x P(saya)=0.40 = 0.0013 (0.13%)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from collections import Counter\n",
+    "from IPython.display import clear_output\n",
+    "import math\n",
+    "\n",
+    "# 1. Read the sentence; tokens are the lowercased, whitespace-separated words.\n",
+    "kalimat = input(\"Masukkan kalimat: \").strip()\n",
+    "tokens = kalimat.lower().split()\n",
+    "\n",
+    "# An empty sentence has no unigrams, and the empty product in step 4 would\n",
+    "# otherwise be reported as a probability of 1, so reject it up front.\n",
+    "if not tokens:\n",
+    "    raise ValueError(\"Kalimat tidak boleh kosong.\")\n",
+    "\n",
+    "# Clear the input prompt from the cell output (best effort, notebook only).\n",
+    "try:\n",
+    "    clear_output()\n",
+    "except Exception:  # not a bare except: KeyboardInterrupt / SystemExit must still propagate\n",
+    "    pass\n",
+    "\n",
+    "print(f\"Corpus: {kalimat}\")\n",
+    "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
+    "\n",
+    "# 2. Count how often each unigram (single word) occurs.\n",
+    "unigram_counts = Counter(tokens)\n",
+    "total_tokens = len(tokens)\n",
+    "\n",
+    "print(\"\\nFrekuensi Unigram dalam kalimat\")\n",
+    "for word, count in unigram_counts.items():\n",
+    "    print(f\" ('{word}'): {count}\")\n",
+    "print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
+    "\n",
+    "# 3. Unigram probability: P(wi) = Count(wi) / total number of tokens.\n",
+    "unigram_probabilities = {\n",
+    "    word: count / total_tokens for word, count in unigram_counts.items()\n",
+    "}\n",
+    "\n",
+    "print(\"\\nProbabilitas masing-masing unigram:\")\n",
+    "for word, prob in unigram_probabilities.items():\n",
+    "    print(f\" P({word}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
+    "\n",
+    "# 4. Sentence probability under the unigram model: P(w1) * P(w2) * ...\n",
+    "#    Each factor is also kept as text so the full product can be printed.\n",
+    "p_kalimat = 1\n",
+    "prob_parts = []\n",
+    "for word in tokens:\n",
+    "    prob_value = unigram_probabilities[word]\n",
+    "    p_kalimat *= prob_value\n",
+    "    prob_parts.append(f\"P({word})={prob_value:.2f}\")\n",
+    "\n",
+    "# Join the factors into the multiplication formula shown to the user.\n",
+    "prob_str = \" x \".join(prob_parts)\n",
+    "\n",
+    "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n",
+    "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Vstwt996-FrS"
+   },
+   "source": [
+    "# **BIGRAM**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "XRIY4qgTVbjl",
+    "outputId": "ea6e62ce-45a0-40c9-ca98-1fcc30558479"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Corpus: Saya adalah pemain liga sepak bola terbaik di dunia\n",
+      "Tokens (9): ['saya', 'adalah', 'pemain', 'liga', 'sepak', 'bola', 'terbaik', 'di', 'dunia']\n",
+      "\n",
+      "Frekuensi Bigram dalam kalimat:\n",
+      " ('saya', 'adalah'): 1\n",
+      " ('adalah', 'pemain'): 1\n",
+      " ('pemain', 'liga'): 1\n",
+      " ('liga', 'sepak'): 1\n",
+      " ('sepak', 'bola'): 1\n",
+      " ('bola', 'terbaik'): 1\n",
+      " ('terbaik', 'di'): 1\n",
+      " ('di', 'dunia'): 1\n",
+      "\n",
+      "Total bigram dalam 1 kalimat: 8\n",
+      "\n",
+      "Probabilitas masing-masing bigram:\n",
+      " P(adalah|saya) = 1.00 (100.00%)\n",
+      " P(pemain|adalah) = 1.00 (100.00%)\n",
+      " P(liga|pemain) = 1.00 (100.00%)\n",
+      " P(sepak|liga) = 1.00 (100.00%)\n",
+      " P(bola|sepak) = 1.00 (100.00%)\n",
+      " P(terbaik|bola) = 1.00 (100.00%)\n",
+      " P(di|terbaik) = 1.00 (100.00%)\n",
+      " P(dunia|di) = 1.00 (100.00%)\n",
+      "\n",
+      "Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
+      " P(saya adalah pemain liga sepak bola terbaik di dunia) = P(saya)=0.11 x P(adalah|saya)=1.00 x P(pemain|adalah)=1.00 x P(liga|pemain)=1.00 x P(sepak|liga)=1.00 x P(bola|sepak)=1.00 x P(terbaik|bola)=1.00 x P(di|terbaik)=1.00 x P(dunia|di)=1.00 = 0.111111 (11.11%)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from collections import Counter\n",
+    "from IPython.display import clear_output\n",
+    "import math\n",
+    "\n",
+    "# 1. Read the sentence; tokens are the lowercased, whitespace-separated words.\n",
+    "kalimat = input(\"Masukkan kalimat: \").strip()\n",
+    "tokens = kalimat.lower().split()\n",
+    "\n",
+    "# P(w1) in step 4 reads tokens[0] and divides by the token count, so reject\n",
+    "# empty input here with a clear message instead of an IndexError later.\n",
+    "if not tokens:\n",
+    "    raise ValueError(\"Kalimat tidak boleh kosong.\")\n",
+    "\n",
+    "# Clear the input prompt from the cell output (best effort, notebook only).\n",
+    "try:\n",
+    "    clear_output()\n",
+    "except Exception:  # not a bare except: KeyboardInterrupt / SystemExit must still propagate\n",
+    "    pass\n",
+    "\n",
+    "print(f\"Corpus: {kalimat}\")\n",
+    "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
+    "\n",
+    "# 2. Count unigrams and bigrams (each word paired with its successor).\n",
+    "unigram_counts = Counter(tokens)\n",
+    "bigrams = list(zip(tokens, tokens[1:]))\n",
+    "bigram_counts = Counter(bigrams)\n",
+    "\n",
+    "print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
+    "for pair, count in bigram_counts.items():\n",
+    "    print(f\" {pair}: {count}\")\n",
+    "print(f\"\\nTotal bigram dalam 1 kalimat: {len(bigrams)}\")\n",
+    "\n",
+    "# 3. Bigram probability: P(w2 | w1) = Count(w1, w2) / Count(w1).\n",
+    "bigram_probabilities = {\n",
+    "    (w1, w2): count / unigram_counts[w1] for (w1, w2), count in bigram_counts.items()\n",
+    "}\n",
+    "\n",
+    "print(\"\\nProbabilitas masing-masing bigram:\")\n",
+    "for (w1, w2), prob in bigram_probabilities.items():\n",
+    "    print(f\" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
+    "\n",
+    "# 4. Sentence probability under the bigram model:\n",
+    "#    P(sentence) = P(w1) * P(w2|w1) * P(w3|w2) * ...\n",
+    "total_tokens = len(tokens)\n",
+    "p_w1 = unigram_counts[tokens[0]] / total_tokens  # P(w1)\n",
+    "p_kalimat = p_w1  # the product starts with P(w1)\n",
+    "prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]  # text form of each factor\n",
+    "\n",
+    "# Every adjacent pair was counted in step 2, so a direct lookup cannot miss.\n",
+    "for pair in bigrams:\n",
+    "    p = bigram_probabilities[pair]\n",
+    "    p_kalimat *= p\n",
+    "    prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.2f}\")\n",
+    "\n",
+    "prob_str = \" x \".join(prob_str_parts)\n",
+    "\n",
+    "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n",
+    "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "E6n1IU8X-G9S"
+   },
+   "source": [
+    "# **TRIGRAM**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "BIRARsj2FHJg",
+    "outputId": "968d420e-9370-40e5-e7e1-148e1d351d62"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Corpus: Saya adalah anak dari ibu dan bapak saya\n",
+      "Tokens (8): ['saya', 'adalah', 'anak', 'dari', 'ibu', 'dan', 'bapak', 'saya']\n",
+      "\n",
+      "Frekuensi Trigram dalam kalimat:\n",
+      " ('saya', 'adalah', 'anak'): 1\n",
+      " ('adalah', 'anak', 'dari'): 1\n",
+      " ('anak', 'dari', 'ibu'): 1\n",
+      " ('dari', 'ibu', 'dan'): 1\n",
+      " ('ibu', 'dan', 'bapak'): 1\n",
+      " ('dan', 'bapak', 'saya'): 1\n",
+      "\n",
+      "Total trigram dalam 1 kalimat: 6\n",
+      "\n",
+      "Probabilitas masing-masing trigram:\n",
+      " P(anak|saya,adalah) = 1.00 (100.00%)\n",
+      " P(dari|adalah,anak) = 1.00 (100.00%)\n",
+      " P(ibu|anak,dari) = 1.00 (100.00%)\n",
+      " P(dan|dari,ibu) = 1.00 (100.00%)\n",
+      " P(bapak|ibu,dan) = 1.00 (100.00%)\n",
+      " P(saya|dan,bapak) = 1.00 (100.00%)\n",
+      "\n",
+      "Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
+      " P(saya adalah anak dari ibu dan bapak saya) = P(saya)=0.25 x P(adalah|saya)=0.50 x P(anak|saya,adalah)=1.00 x P(dari|adalah,anak)=1.00 x P(ibu|anak,dari)=1.00 x P(dan|dari,ibu)=1.00 x P(bapak|ibu,dan)=1.00 x P(saya|dan,bapak)=1.00 = 0.125000 (12.50%)\n"
+     ]
+    }
+   ],
+   "source": [
+    "from collections import Counter\n",
+    "from IPython.display import clear_output\n",
+    "import math\n",
+    "\n",
+    "# 1. Read the sentence; tokens are the lowercased, whitespace-separated words.\n",
+    "kalimat = input(\"Masukkan kalimat: \").strip()\n",
+    "tokens = kalimat.lower().split()\n",
+    "\n",
+    "# P(w1) in step 4 needs a first word, so reject empty input here with a\n",
+    "# clear message instead of letting tokens[0] raise IndexError later.\n",
+    "if not tokens:\n",
+    "    raise ValueError(\"Kalimat tidak boleh kosong.\")\n",
+    "\n",
+    "# Clear the input prompt from the cell output (best effort, notebook only).\n",
+    "try:\n",
+    "    clear_output()\n",
+    "except Exception:  # not a bare except: KeyboardInterrupt / SystemExit must still propagate\n",
+    "    pass\n",
+    "\n",
+    "print(f\"Corpus: {kalimat}\")\n",
+    "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
+    "\n",
+    "# 2. Count unigrams, bigrams and trigrams (sliding windows over the tokens).\n",
+    "#    Unigram and bigram counts are needed for P(w1) and P(w2|w1) in step 4.\n",
+    "unigram_counts = Counter(tokens)\n",
+    "total_tokens = len(tokens)\n",
+    "\n",
+    "bigrams = list(zip(tokens, tokens[1:]))\n",
+    "trigrams = list(zip(tokens, tokens[1:], tokens[2:]))\n",
+    "\n",
+    "bigram_counts = Counter(bigrams)\n",
+    "trigram_counts = Counter(trigrams)\n",
+    "\n",
+    "print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
+    "for tg, count in trigram_counts.items():\n",
+    "    print(f\" {tg}: {count}\")\n",
+    "print(f\"\\nTotal trigram dalam 1 kalimat: {len(trigrams)}\")\n",
+    "\n",
+    "# 3. Trigram probability: P(w3 | w1, w2) = Count(w1, w2, w3) / Count(w1, w2).\n",
+    "#    The first two words of every observed trigram are themselves an observed\n",
+    "#    bigram, so the denominator is always >= 1 and needs no zero check.\n",
+    "trigram_probabilities = {\n",
+    "    (w1, w2, w3): count / bigram_counts[(w1, w2)]\n",
+    "    for (w1, w2, w3), count in trigram_counts.items()\n",
+    "}\n",
+    "\n",
+    "print(\"\\nProbabilitas masing-masing trigram:\")\n",
+    "for (w1, w2, w3), prob in trigram_probabilities.items():\n",
+    "    print(f\" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
+    "\n",
+    "# 4. Sentence probability under the trigram model:\n",
+    "#    P(sentence) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...\n",
+    "\n",
+    "# a. P(w1): unigram probability of the first word.\n",
+    "p_w1 = unigram_counts[tokens[0]] / total_tokens\n",
+    "# Text form of each factor, collected for the printed formula.\n",
+    "prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]\n",
+    "\n",
+    "# b. P(w2|w1): bigram probability of the second word (no smoothing).\n",
+    "if len(tokens) > 1:\n",
+    "    p_w2_w1 = bigram_counts[(tokens[0], tokens[1])] / unigram_counts[tokens[0]]\n",
+    "    prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}\")\n",
+    "else:\n",
+    "    p_w2_w1 = 1.0  # a one-word sentence has no second factor\n",
+    "\n",
+    "# Start the product with P(w1) * P(w2|w1).\n",
+    "p_kalimat = p_w1 * p_w2_w1\n",
+    "\n",
+    "# c. Multiply in P(wi | wi-2, wi-1) for every word from the third one onward.\n",
+    "#    Every window was counted in step 2, so a direct lookup cannot miss.\n",
+    "for triplet in trigrams:\n",
+    "    p = trigram_probabilities[triplet]\n",
+    "    p_kalimat *= p\n",
+    "    prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}\")\n",
+    "\n",
+    "# Join the factors into the multiplication formula shown to the user.\n",
+    "prob_str = \" x \".join(prob_str_parts)\n",
+    "\n",
+    "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n",
+    "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}