Praktikum-NLP/N-Gram.ipynb
202210715288 FATAH SABILA ROSYAD e871ab77ad Update tiga file praktikum NLP (Fatah): - Fitur Ekstraksi BOW - Klasifikasi Teks TF-IDF + ANN
Tugas praktikum Sudah saya edit sesuai instruksi dosen.

Signed-off-by: 202210715288 FATAH SABILA ROSYAD <202210715288@mhs.ubharajaya.ac.id>
2025-12-02 23:47:39 +07:00

363 lines
13 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Model Unigram, Bigram, dan Trigram\n",
"\n",
"**Nama:** Fatah Sabila Rosyad<br>\n",
"**NIM:** 202210715288<br>\n",
"**Kelas:** F7B2<br>\n",
"\n",
"**Tujuan praktikum:** \n",
"Memahami cara menghitung frekuensi dan probabilitas kalimat menggunakan model Unigram, Bigram, dan Trigram pada teks bahasa Indonesia.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: fatah suka olahraga lari dan suka olahraga badminton\n",
"Tokens (8): ['fatah', 'suka', 'olahraga', 'lari', 'dan', 'suka', 'olahraga', 'badminton']\n",
"\n",
"Frekuensi Unigram dalam kalimat:\n",
" ('fatah'): 1\n",
" ('suka'): 2\n",
" ('olahraga'): 2\n",
" ('lari'): 1\n",
" ('dan'): 1\n",
" ('badminton'): 1\n",
"\n",
"Total unigram dalam 1 kalimat: 8\n",
"\n",
"Probabilitas masing-masing unigram:\n",
" P(fatah) = 0.1250 (12.50%)\n",
" P(suka) = 0.2500 (25.00%)\n",
" P(olahraga) = 0.2500 (25.00%)\n",
" P(lari) = 0.1250 (12.50%)\n",
" P(dan) = 0.1250 (12.50%)\n",
" P(badminton) = 0.1250 (12.50%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
" P(fatah suka olahraga lari dan suka olahraga badminton) = P(fatah)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(lari)=0.1250 x P(dan)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(badminton)=0.1250 = 0.00000095 (0.000095%)\n"
]
}
],
"source": [
"# ================= UNIGRAM =================\n",
"# Unigram model: tokens are treated as independent, so\n",
"# P(w1 ... wn) = P(w1) * P(w2) * ... * P(wn), with P(w) = Count(w) / total tokens.\n",
"\n",
"# 1. Read the sentence and tokenize it\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Best-effort clearing of the cell output (only meaningful inside a notebook).\n",
"# Exception is caught instead of a bare except so that KeyboardInterrupt and\n",
"# SystemExit are not swallowed.\n",
"try:\n",
"    clear_output()\n",
"except Exception:\n",
"    pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Lowercase, then split on whitespace (punctuation stays attached to its word)\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"if not tokens:\n",
"    # Nothing to count: skip the computation instead of printing an empty\n",
"    # product (probability 1) for an empty sentence.\n",
"    print(\"\\nKalimat kosong: tidak ada token yang bisa dihitung.\")\n",
"else:\n",
"    # 2. Unigram frequencies\n",
"    unigram_counts = Counter(tokens)\n",
"    total_tokens = len(tokens)\n",
"\n",
"    print(\"\\nFrekuensi Unigram dalam kalimat:\")\n",
"    for word, count in unigram_counts.items():\n",
"        print(f\" ('{word}'): {count}\")\n",
"    print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
"\n",
"    # 3. Unigram probability: P(wi) = Count(wi) / total number of tokens\n",
"    unigram_probabilities = {\n",
"        word: count / total_tokens for word, count in unigram_counts.items()\n",
"    }\n",
"\n",
"    print(\"\\nProbabilitas masing-masing unigram:\")\n",
"    for word, prob in unigram_probabilities.items():\n",
"        print(f\" P({word}) = {prob:.4f} ({prob*100:.2f}%)\")\n",
"\n",
"    # 4. Sentence probability: product of the unigram probabilities of every\n",
"    #    token, in sentence order (repeated words contribute once per occurrence)\n",
"    p_kalimat = 1\n",
"    prob_parts = []\n",
"\n",
"    for word in tokens:\n",
"        prob_value = unigram_probabilities[word]\n",
"        p_kalimat *= prob_value\n",
"        prob_parts.append(f\"P({word})={prob_value:.4f}\")\n",
"\n",
"    prob_str = \" x \".join(prob_parts)\n",
"\n",
"    print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n",
"    print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\")\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: Fatah sedang belajar model bigram untuk menghitung probabilitas kalimat dan fatah sangat suka belajar\n",
"Tokens (14): ['fatah', 'sedang', 'belajar', 'model', 'bigram', 'untuk', 'menghitung', 'probabilitas', 'kalimat', 'dan', 'fatah', 'sangat', 'suka', 'belajar']\n",
"\n",
"Frekuensi Bigram dalam kalimat:\n",
" ('fatah', 'sedang'): 1\n",
" ('sedang', 'belajar'): 1\n",
" ('belajar', 'model'): 1\n",
" ('model', 'bigram'): 1\n",
" ('bigram', 'untuk'): 1\n",
" ('untuk', 'menghitung'): 1\n",
" ('menghitung', 'probabilitas'): 1\n",
" ('probabilitas', 'kalimat'): 1\n",
" ('kalimat', 'dan'): 1\n",
" ('dan', 'fatah'): 1\n",
" ('fatah', 'sangat'): 1\n",
" ('sangat', 'suka'): 1\n",
" ('suka', 'belajar'): 1\n",
"\n",
"Total bigram dalam 1 kalimat: 13\n",
"\n",
"Probabilitas masing-masing bigram:\n",
" P(sedang|fatah) = 0.5000 (50.00%)\n",
" P(belajar|sedang) = 1.0000 (100.00%)\n",
" P(model|belajar) = 0.5000 (50.00%)\n",
" P(bigram|model) = 1.0000 (100.00%)\n",
" P(untuk|bigram) = 1.0000 (100.00%)\n",
" P(menghitung|untuk) = 1.0000 (100.00%)\n",
" P(probabilitas|menghitung) = 1.0000 (100.00%)\n",
" P(kalimat|probabilitas) = 1.0000 (100.00%)\n",
" P(dan|kalimat) = 1.0000 (100.00%)\n",
" P(fatah|dan) = 1.0000 (100.00%)\n",
" P(sangat|fatah) = 0.5000 (50.00%)\n",
" P(suka|sangat) = 1.0000 (100.00%)\n",
" P(belajar|suka) = 1.0000 (100.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
" P(fatah sedang belajar model bigram untuk menghitung probabilitas kalimat dan fatah sangat suka belajar) = P(fatah)=0.1429 x P(sedang|fatah)=0.5000 x P(belajar|sedang)=1.0000 x P(model|belajar)=0.5000 x P(bigram|model)=1.0000 x P(untuk|bigram)=1.0000 x P(menghitung|untuk)=1.0000 x P(probabilitas|menghitung)=1.0000 x P(kalimat|probabilitas)=1.0000 x P(dan|kalimat)=1.0000 x P(fatah|dan)=1.0000 x P(sangat|fatah)=0.5000 x P(suka|sangat)=1.0000 x P(belajar|suka)=1.0000 = 0.01785714 (1.785714%)\n"
]
}
],
"source": [
"# ================= BIGRAM =================\n",
"# Bigram model: P(w1 ... wn) = P(w1) * P(w2|w1) * ... * P(wn|wn-1),\n",
"# with P(w2|w1) = Count(w1, w2) / Count(w1) estimated from the sentence itself.\n",
"\n",
"# 1. Read the sentence and tokenize it\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Best-effort clearing of the cell output (only meaningful inside a notebook).\n",
"# Exception is caught instead of a bare except so that KeyboardInterrupt and\n",
"# SystemExit are not swallowed.\n",
"try:\n",
"    clear_output()\n",
"except Exception:\n",
"    pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Lowercase, then split on whitespace\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"if not tokens:\n",
"    # Guard: tokens[0] and the division by the token count below both need\n",
"    # at least one token.\n",
"    print(\"\\nKalimat kosong: tidak ada token yang bisa dihitung.\")\n",
"else:\n",
"    # 2. Unigram and bigram frequencies\n",
"    unigram_counts = Counter(tokens)\n",
"    bigrams = list(zip(tokens, tokens[1:]))  # adjacent (previous, current) pairs\n",
"    bigram_counts = Counter(bigrams)\n",
"\n",
"    print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
"    for pair, count in bigram_counts.items():\n",
"        print(f\" {pair}: {count}\")\n",
"    print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n",
"\n",
"    # 3. Bigram probability: P(w2 | w1) = Count(w1, w2) / Count(w1)\n",
"    bigram_probabilities = {\n",
"        (w1, w2): count / unigram_counts[w1]\n",
"        for (w1, w2), count in bigram_counts.items()\n",
"    }\n",
"\n",
"    print(\"\\nProbabilitas masing-masing bigram:\")\n",
"    for (w1, w2), prob in bigram_probabilities.items():\n",
"        print(f\" P({w2}|{w1}) = {prob:.4f} ({prob*100:.2f}%)\")\n",
"\n",
"    # 4. Sentence probability: the first word uses its unigram probability,\n",
"    #    every following word is conditioned on the word before it\n",
"    total_tokens = len(tokens)\n",
"    p_w1 = unigram_counts[tokens[0]] / total_tokens\n",
"    p_kalimat = p_w1\n",
"\n",
"    prob_str_parts = [f\"P({tokens[0]})={p_w1:.4f}\"]\n",
"\n",
"    for pair in bigrams:\n",
"        # Every pair in `bigrams` was counted above, so the lookup cannot miss\n",
"        p = bigram_probabilities[pair]\n",
"        p_kalimat *= p\n",
"        prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.4f}\")\n",
"\n",
"    prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"    print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n",
"    print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\")\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: Pada praktikum ini Fatah sedang mempelajari model trigram untuk kalimat bahasa Indonesia\n",
"Tokens (12): ['pada', 'praktikum', 'ini', 'fatah', 'sedang', 'mempelajari', 'model', 'trigram', 'untuk', 'kalimat', 'bahasa', 'indonesia']\n",
"\n",
"Frekuensi Trigram dalam kalimat:\n",
" ('pada', 'praktikum', 'ini'): 1\n",
" ('praktikum', 'ini', 'fatah'): 1\n",
" ('ini', 'fatah', 'sedang'): 1\n",
" ('fatah', 'sedang', 'mempelajari'): 1\n",
" ('sedang', 'mempelajari', 'model'): 1\n",
" ('mempelajari', 'model', 'trigram'): 1\n",
" ('model', 'trigram', 'untuk'): 1\n",
" ('trigram', 'untuk', 'kalimat'): 1\n",
" ('untuk', 'kalimat', 'bahasa'): 1\n",
" ('kalimat', 'bahasa', 'indonesia'): 1\n",
"\n",
"Total trigram dalam 1 kalimat: 10\n",
"\n",
"Probabilitas masing-masing trigram:\n",
" P(ini|pada,praktikum) = 1.0000 (100.00%)\n",
" P(fatah|praktikum,ini) = 1.0000 (100.00%)\n",
" P(sedang|ini,fatah) = 1.0000 (100.00%)\n",
" P(mempelajari|fatah,sedang) = 1.0000 (100.00%)\n",
" P(model|sedang,mempelajari) = 1.0000 (100.00%)\n",
" P(trigram|mempelajari,model) = 1.0000 (100.00%)\n",
" P(untuk|model,trigram) = 1.0000 (100.00%)\n",
" P(kalimat|trigram,untuk) = 1.0000 (100.00%)\n",
" P(bahasa|untuk,kalimat) = 1.0000 (100.00%)\n",
" P(indonesia|kalimat,bahasa) = 1.0000 (100.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
" P(pada praktikum ini fatah sedang mempelajari model trigram untuk kalimat bahasa indonesia) = P(pada)=0.0833 x P(praktikum|pada)=1.0000 x P(ini|pada,praktikum)=1.0000 x P(fatah|praktikum,ini)=1.0000 x P(sedang|ini,fatah)=1.0000 x P(mempelajari|fatah,sedang)=1.0000 x P(model|sedang,mempelajari)=1.0000 x P(trigram|mempelajari,model)=1.0000 x P(untuk|model,trigram)=1.0000 x P(kalimat|trigram,untuk)=1.0000 x P(bahasa|untuk,kalimat)=1.0000 x P(indonesia|kalimat,bahasa)=1.0000 = 0.08333333 (8.333333%)\n"
]
}
],
"source": [
"# ================= TRIGRAM =================\n",
"# Trigram model: P(w1 ... wn) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ... ,\n",
"# with P(w3|w1,w2) = Count(w1, w2, w3) / Count(w1, w2) estimated from the sentence itself.\n",
"\n",
"# 1. Read the sentence and tokenize it\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Best-effort clearing of the cell output (only meaningful inside a notebook).\n",
"# Exception is caught instead of a bare except so that KeyboardInterrupt and\n",
"# SystemExit are not swallowed.\n",
"try:\n",
"    clear_output()\n",
"except Exception:\n",
"    pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Lowercase, then split on whitespace\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"if not tokens:\n",
"    # Guard: tokens[0] is needed below, so an empty sentence cannot be scored.\n",
"    print(\"\\nKalimat kosong: tidak ada token yang bisa dihitung.\")\n",
"else:\n",
"    # 2. Bigram and trigram frequencies\n",
"    bigrams = list(zip(tokens, tokens[1:]))\n",
"    trigrams = list(zip(tokens, tokens[1:], tokens[2:]))\n",
"\n",
"    bigram_counts = Counter(bigrams)\n",
"    trigram_counts = Counter(trigrams)\n",
"\n",
"    print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
"    for tg, count in trigram_counts.items():\n",
"        print(f\" {tg}: {count}\")\n",
"    print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n",
"\n",
"    # 3. Trigram probability: P(w3 | w1, w2) = Count(w1, w2, w3) / Count(w1, w2).\n",
"    #    The first two words of every trigram are themselves a counted bigram,\n",
"    #    so the denominator is always at least 1.\n",
"    trigram_probabilities = {\n",
"        (w1, w2, w3): count / bigram_counts[(w1, w2)]\n",
"        for (w1, w2, w3), count in trigram_counts.items()\n",
"    }\n",
"\n",
"    print(\"\\nProbabilitas masing-masing trigram:\")\n",
"    for (w1, w2, w3), prob in trigram_probabilities.items():\n",
"        print(f\" P({w3}|{w1},{w2}) = {prob:.4f} ({prob*100:.2f}%)\")\n",
"\n",
"    # 4. Sentence probability (trigram model)\n",
"    unigram_counts = Counter(tokens)\n",
"    total_tokens = len(tokens)\n",
"\n",
"    # P(w1): unigram probability of the first word\n",
"    p_w1 = unigram_counts[tokens[0]] / total_tokens\n",
"    prob_str_parts = [f\"P({tokens[0]})={p_w1:.4f}\"]\n",
"\n",
"    # P(w2|w1): bigram probability of the second word; with a single-token\n",
"    # sentence there is no second word, so the factor is neutral (1.0)\n",
"    if len(tokens) > 1:\n",
"        p_w2_w1 = bigram_counts[(tokens[0], tokens[1])] / unigram_counts[tokens[0]]\n",
"        prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.4f}\")\n",
"    else:\n",
"        p_w2_w1 = 1.0\n",
"\n",
"    p_kalimat = p_w1 * p_w2_w1\n",
"\n",
"    # Multiply in one trigram term for the third token onward\n",
"    for triplet in trigrams:\n",
"        # Every triplet in `trigrams` was counted above, so the lookup cannot miss\n",
"        p = trigram_probabilities[triplet]\n",
"        p_kalimat *= p\n",
"        prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.4f}\")\n",
"\n",
"    prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"    print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n",
"    print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\")\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}