Initial Commit

Arif Dwiyanto 2025-11-14 16:01:27 +00:00
commit 4489eb5e9b
12 changed files with 1896 additions and 0 deletions

View File

@ -0,0 +1,33 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "28217e47-db17-4572-853d-151630b47bc8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,3 @@
[Trash Info]
Path=Untitled.ipynb
DeletionDate=2025-11-14T15:53:34

View File

@ -0,0 +1,300 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qBYcPYAb059g",
"outputId": "9f57b704-da1b-4495-d366-24c30586dc76"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Masukkan jumlah dokumen yang ingin dimasukkan: 3\n"
]
}
],
"source": [
"# Input jumlah dokumen\n",
"import pandas as pd\n",
"n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))"
]
},
{
"cell_type": "code",
"source": [
"# Input teks dokumen satu per satu\n",
"documents = []\n",
"for i in range(n):\n",
" teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n",
" documents.append(teks)\n",
"\n",
"print(\"\\n=== Dokumen yang Dimasukkan ===\")\n",
"for i, doc in enumerate(documents):\n",
" print(f\"Doc {i+1}: {doc}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mo-yt5Ob1N8j",
"outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Masukkan teks untuk dokumen ke-1: saya belajar nlp di kampus\n",
"Masukkan teks untuk dokumen ke-2: saya suka belajar ai\n",
"Masukkan teks untuk dokumen ke-3: mahasiswa belajar data science dan nlp\n",
"\n",
"=== Dokumen yang Dimasukkan ===\n",
"Doc 1: saya belajar nlp di kampus\n",
"Doc 2: saya suka belajar ai\n",
"Doc 3: mahasiswa belajar data science dan nlp\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Tahap Tokenisasi\n",
"tokenized_docs = []\n",
"for doc in documents:\n",
" tokens = doc.lower().split()\n",
" tokenized_docs.append(tokens)\n",
"\n",
"print(\"\\n=== Hasil Tokenisasi ===\")\n",
"for i, tokens in enumerate(tokenized_docs):\n",
" print(f\"Doc {i+1}: {tokens}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FkmxRAFq1oDK",
"outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"=== Hasil Tokenisasi ===\n",
"Doc 1: ['saya', 'belajar', 'nlp', 'di', 'kampus']\n",
"Doc 2: ['saya', 'suka', 'belajar', 'ai']\n",
"Doc 3: ['mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Pembuatan Corpus\n",
"corpus_all = [word for doc in tokenized_docs for word in doc]\n",
"\n",
"print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n",
"print(corpus_all)\n",
"print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ybC1Vo2C_c3q",
"outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
"['saya', 'belajar', 'nlp', 'di', 'kampus', 'saya', 'suka', 'belajar', 'ai', 'mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n",
"Jumlah total kata dalam seluruh dokumen: 15\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Pembuatan Vocabulary\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"print(vocabulary)\n",
"print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n",
"\n",
"\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"for idx, word in enumerate(vocabulary, start=1):\n",
" print(f\"{idx:>2}. {word}\")\n",
"print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "s6S-Ma4R1xuq",
"outputId": "98c3685b-1798-4038-d17e-6e45ca419b51"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"=== Vocabulary (Kata Unik) ===\n",
"['ai', 'belajar', 'dan', 'data', 'di', 'kampus', 'mahasiswa', 'nlp', 'saya', 'science', 'suka']\n",
"Jumlah kata unik (vocabulary size): 11\n",
"\n",
"=== Vocabulary (Kata Unik) ===\n",
" 1. ai\n",
" 2. belajar\n",
" 3. dan\n",
" 4. data\n",
" 5. di\n",
" 6. kampus\n",
" 7. mahasiswa\n",
" 8. nlp\n",
" 9. saya\n",
"10. science\n",
"11. suka\n",
"\n",
"Jumlah kata unik (vocabulary size): 11\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Representasi Numerik (Matriks BoW)\n",
"bow_matrix = []\n",
"for doc in tokenized_docs:\n",
" vector = [doc.count(word) for word in vocabulary]\n",
" bow_matrix.append(vector)"
],
"metadata": {
"id": "ShevCTva2Fg9"
},
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"source": [
"df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)\n",
"df_bow.index = [f\"D{i}\" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3\n",
"\n",
"print(\"\\n=== Matriks Bag of Words ===\")\n",
"print(df_bow)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-yB6D2pY2M0E",
"outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"=== Matriks Bag of Words ===\n",
" ai belajar dan data di kampus mahasiswa nlp saya science suka\n",
"D1 0 1 0 0 1 1 0 1 1 0 0\n",
"D2 1 1 0 0 0 0 0 0 1 0 1\n",
"D3 0 1 1 1 0 0 1 1 0 1 0\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)\n",
"word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()\n",
"word_frequencies.columns = [\"Kata\", \"Frekuensi\"]\n",
"\n",
"print(\"\\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\")\n",
"print(word_frequencies)\n",
"print(f\"Frekuensi kata: {len(word_frequencies)}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8ruf5vKL2rGD",
"outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2"
},
"execution_count": 8,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n",
" Kata Frekuensi\n",
"0 belajar 3\n",
"1 nlp 2\n",
"2 saya 2\n",
"3 dan 1\n",
"4 ai 1\n",
"5 data 1\n",
"6 di 1\n",
"7 mahasiswa 1\n",
"8 kampus 1\n",
"9 science 1\n",
"10 suka 1\n",
"Frekuensi kata: 11\n"
]
}
]
},
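{
"cell_type": "code",
"source": [
"# --- Illustrative cross-check (editor's sketch, not part of the original\n",
"# notebook): rebuild the BoW matrix with scikit-learn's CountVectorizer and\n",
"# compare it with the hand-built df_bow. Assumes scikit-learn is available\n",
"# in the runtime.\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"# token_pattern=r\"\\S+\" mimics the str.split() tokenization used above\n",
"vectorizer = CountVectorizer(lowercase=True, token_pattern=r\"\\S+\")\n",
"X = vectorizer.fit_transform(documents)\n",
"\n",
"df_check = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())\n",
"df_check = df_check.reindex(columns=vocabulary, fill_value=0)\n",
"df_check.index = df_bow.index\n",
"\n",
"print(df_check.equals(df_bow))  # True when both pipelines agree"
],
"metadata": {},
"execution_count": null,
"outputs": []
},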
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "NQjExannHuj0"
},
"execution_count": null,
"outputs": []
}
]
}

View File

@ -0,0 +1,374 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# **PENGOLAH BAHASA ALAMI F7A1 | Pertemuan ke-5 - Jum'at, 17 Oktober 2025**\n",
"## **Tugas:** Membuat N-GRAM (Unigram, Bigram, & Trigram)\n",
"### **Dosen Pengampu:** Muhammad Yasir, S.Si., M.Kom.\n",
"#### **Disusun Oleh:** Mega Gloria (202210715173)\n",
"\n"
],
"metadata": {
"id": "JVPdWpz3hhbj"
}
},
{
"cell_type": "markdown",
"source": [
"# **UNIGRAM**"
],
"metadata": {
"id": "4Mvva3v65h1v"
}
},
{
"cell_type": "code",
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenize\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Unigram\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"print(\"\\nFrekuensi Unigram dalam kalimat\")\n",
"for pair, count in unigram_counts.items():\n",
" print(f\" ('{pair}'): {count}\")\n",
"print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
"\n",
"# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata\n",
"unigram_probabilities = {}\n",
"for word, count in unigram_counts.items():\n",
" prob = count / total_tokens\n",
" unigram_probabilities[word] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing unigram:\")\n",
"for word, prob in unigram_probabilities.items():\n",
" print(f\" P({word}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (P(kalimat) = P(w1) * P(w2) * ...)\n",
"p_kalimat = 1\n",
"prob_parts = []\n",
"\n",
"# Loop untuk menghitung probabilitas total dan membangun string rumus detail\n",
"for word in tokens:\n",
" prob_value = unigram_probabilities[word]\n",
" p_kalimat *= prob_value\n",
" # Format: P(word)=prob_value\n",
" prob_parts.append(f\"P({word})={prob_value:.2f}\")\n",
"\n",
"# Gabungkan bagian-bagian rumus untuk mendapatkan prob_str detail\n",
"prob_str = \" x \".join(prob_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1cub_VJnUJMl",
"outputId": "1889eb61-4f3f-4780-f42e-02368076cce3"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Corpus: saya suka makan nasi\n",
"Tokens (4): ['saya', 'suka', 'makan', 'nasi']\n",
"\n",
"Frekuensi Unigram dalam kalimat\n",
" ('saya'): 1\n",
" ('suka'): 1\n",
" ('makan'): 1\n",
" ('nasi'): 1\n",
"\n",
"Total unigram dalam 1 kalimat: 4\n",
"\n",
"Probabilitas masing-masing unigram:\n",
" P(saya) = 0.25 (25.00%)\n",
" P(suka) = 0.25 (25.00%)\n",
" P(makan) = 0.25 (25.00%)\n",
" P(nasi) = 0.25 (25.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
" P(saya suka makan nasi) = P(saya)=0.25 x P(suka)=0.25 x P(makan)=0.25 x P(nasi)=0.25 = 0.0039 (0.39%)\n"
]
}
]
},
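{
"cell_type": "code",
"source": [
"# --- Illustrative sketch (editor's addition, not part of the assignment):\n",
"# multiplying many probabilities underflows for long sentences, so language\n",
"# models usually sum logs instead. Reuses unigram_probabilities, tokens, and\n",
"# the math import from the cell above.\n",
"log_p = sum(math.log(unigram_probabilities[w]) for w in tokens)\n",
"print(f\"log P(sentence) = {log_p:.4f}\")\n",
"print(f\"P(sentence) = {math.exp(log_p):.4f}\")  # matches p_kalimat"
],
"metadata": {},
"execution_count": null,
"outputs": []
},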
{
"cell_type": "markdown",
"source": [
"# **BIGRAM**"
],
"metadata": {
"id": "Vstwt996-FrS"
}
},
{
"cell_type": "code",
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenisasi\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Unigram dan Bigram\n",
"unigram_counts = Counter(tokens)\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"bigram_counts = Counter(bigrams)\n",
"\n",
"print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
"for pair, count in bigram_counts.items():\n",
" print(f\" {pair}: {count}\")\n",
"print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n",
"\n",
"# 3. Hitung Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)\n",
"bigram_probabilities = {}\n",
"for (w1, w2), count in bigram_counts.items():\n",
" prob = count / unigram_counts[w1]\n",
" bigram_probabilities[(w1, w2)] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing bigram:\")\n",
"for (w1, w2), prob in bigram_probabilities.items():\n",
" print(f\" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Bigram)\n",
"# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w2) * ...\n",
"total_tokens = sum(unigram_counts.values())\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens # P(w1)\n",
"p_kalimat = p_w1 # Inisialisasi dengan P(w1)\n",
"\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"] # Tambahkan P(w1) ke rumus\n",
"\n",
"for i in range(1, len(tokens)):\n",
" pair = (tokens[i-1], tokens[i])\n",
" p = bigram_probabilities.get(pair, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.2f}\")\n",
"\n",
"# Gabungkan rumus perkalian untuk ditampilkan\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XRIY4qgTVbjl",
"outputId": "ea6e62ce-45a0-40c9-ca98-1fcc30558479"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Corpus: saya makan nasi dan saya makan roti\n",
"Tokens (7): ['saya', 'makan', 'nasi', 'dan', 'saya', 'makan', 'roti']\n",
"\n",
"Frekuensi Bigram dalam kalimat:\n",
" ('saya', 'makan'): 2\n",
" ('makan', 'nasi'): 1\n",
" ('nasi', 'dan'): 1\n",
" ('dan', 'saya'): 1\n",
" ('makan', 'roti'): 1\n",
"\n",
"Total bigram dalam 1 kalimat: 6\n",
"\n",
"Probabilitas masing-masing bigram:\n",
" P(makan|saya) = 1.00 (100.00%)\n",
" P(nasi|makan) = 0.50 (50.00%)\n",
" P(dan|nasi) = 1.00 (100.00%)\n",
" P(saya|dan) = 1.00 (100.00%)\n",
" P(roti|makan) = 0.50 (50.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
" P(saya makan nasi dan saya makan roti) = P(saya)=0.29 x P(makan|saya)=1.00 x P(nasi|makan)=0.50 x P(dan|nasi)=1.00 x P(saya|dan)=1.00 x P(makan|saya)=1.00 x P(roti|makan)=0.50 = 0.071429 (7.14%)\n"
]
}
]
},
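{
"cell_type": "code",
"source": [
"# --- Illustrative sketch (editor's addition, not part of the assignment):\n",
"# the bigram model above assigns probability 0 to any unseen pair. Add-one\n",
"# (Laplace) smoothing is a common fix: P(w2|w1) = (C(w1,w2)+1) / (C(w1)+V).\n",
"# Reuses bigram_counts and unigram_counts from the cell above; the example\n",
"# words assume the recorded sample sentence.\n",
"V = len(unigram_counts)  # vocabulary size\n",
"\n",
"def p_laplace(w1, w2):\n",
"    return (bigram_counts.get((w1, w2), 0) + 1) / (unigram_counts.get(w1, 0) + V)\n",
"\n",
"print(f\"P(makan|saya) with smoothing = {p_laplace('saya', 'makan'):.2f}\")\n",
"print(f\"P(tidur|saya), unseen pair   = {p_laplace('saya', 'tidur'):.2f}\")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},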
{
"cell_type": "markdown",
"source": [
"# **TRIGRAM**"
],
"metadata": {
"id": "E6n1IU8X-G9S"
}
},
{
"cell_type": "code",
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenisasi\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Bigram dan Trigram\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]\n",
"\n",
"bigram_counts = Counter(bigrams)\n",
"trigram_counts = Counter(trigrams)\n",
"\n",
"print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
"for tg, count in trigram_counts.items():\n",
" print(f\" {tg}: {count}\")\n",
"print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n",
"\n",
"# 3. Hitung Probabilitas Trigram: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)\n",
"trigram_probabilities = {}\n",
"for (w1, w2, w3), count in trigram_counts.items():\n",
" # Hindari pembagian dengan nol (jika ada bigram yang tidak muncul)\n",
" if bigram_counts[(w1, w2)] > 0:\n",
" prob = count / bigram_counts[(w1, w2)]\n",
" else:\n",
" prob = 0\n",
" trigram_probabilities[(w1, w2, w3)] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing trigram:\")\n",
"for (w1, w2, w3), prob in trigram_probabilities.items():\n",
" print(f\" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# Tambahkan perhitungan Unigram Count (dibutuhkan untuk P(w1) dan P(w2|w1))\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Trigram)\n",
"# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...\n",
"\n",
"# a. P(w1)\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0\n",
"\n",
"# b. P(w2|w1) (Menggunakan Bigram tanpa smoothing)\n",
"if len(tokens) > 1:\n",
" count_w1 = unigram_counts.get(tokens[0], 1) # Hindari pembagian dengan 0\n",
" p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1\n",
"else:\n",
" p_w2_w1 = 1.0 # Jika hanya 1 kata\n",
"\n",
"p_kalimat = p_w1 * p_w2_w1 # Inisialisasi dengan P(w1) * P(w2|w1)\n",
"\n",
"# Daftar bagian rumus untuk ditampilkan\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]\n",
"if len(tokens) > 1:\n",
" prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}\")\n",
"\n",
"# c. Perkalian Trigram P(wi | wi-2, wi-1) untuk i >= 3\n",
"for i in range(len(tokens) - 2):\n",
" triplet = (tokens[i], tokens[i+1], tokens[i+2])\n",
" p = trigram_probabilities.get(triplet, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}\")\n",
"\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BIRARsj2FHJg",
"outputId": "968d420e-9370-40e5-e7e1-148e1d351d62"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Corpus: mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning\n",
"Tokens (7): ['mahasiswa', 'mengerjakan', 'tugas', 'kemudian', 'mahasiswa', 'upload', 'e-learning']\n",
"\n",
"Frekuensi Trigram dalam kalimat:\n",
" ('mahasiswa', 'mengerjakan', 'tugas'): 1\n",
" ('mengerjakan', 'tugas', 'kemudian'): 1\n",
" ('tugas', 'kemudian', 'mahasiswa'): 1\n",
" ('kemudian', 'mahasiswa', 'upload'): 1\n",
" ('mahasiswa', 'upload', 'e-learning'): 1\n",
"\n",
"Total trigram dalam 1 kalimat: 5\n",
"\n",
"Probabilitas masing-masing trigram:\n",
" P(tugas|mahasiswa,mengerjakan) = 1.00 (100.00%)\n",
" P(kemudian|mengerjakan,tugas) = 1.00 (100.00%)\n",
" P(mahasiswa|tugas,kemudian) = 1.00 (100.00%)\n",
" P(upload|kemudian,mahasiswa) = 1.00 (100.00%)\n",
" P(e-learning|mahasiswa,upload) = 1.00 (100.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
" P(mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning) = P(mahasiswa)=0.29 x P(mengerjakan|mahasiswa)=0.50 x P(tugas|mahasiswa,mengerjakan)=1.00 x P(kemudian|mengerjakan,tugas)=1.00 x P(mahasiswa|tugas,kemudian)=1.00 x P(upload|kemudian,mahasiswa)=1.00 x P(e-learning|mahasiswa,upload)=1.00 = 0.142857 (14.29%)\n"
]
}
]
}
]
}

View File

@ -0,0 +1 @@
# Practicum Materials Compilation

View File

@ -0,0 +1,75 @@
# Input the number of documents
import pandas as pd

n = int(input("Enter the number of documents: "))

# Input document texts one by one
documents = []
for i in range(n):
    teks = input(f"Enter text for document {i+1}: ")
    documents.append(teks)

print("\n=== Entered Documents ===")
for i, doc in enumerate(documents):
    print(f"Doc {i+1}: {doc}")

# Tokenization step
tokenized_docs = []
for doc in documents:
    tokens = doc.lower().split()
    tokenized_docs.append(tokens)

print("\n=== Tokenization Results ===")
for i, tokens in enumerate(tokenized_docs):
    print(f"Doc {i+1}: {tokens}")

# Build the corpus
corpus_all = [word for doc in tokenized_docs for word in doc]

print("\n=== Full Corpus (All Words from All Documents) ===")
print(corpus_all)
print(f"Total number of words across all documents: {len(corpus_all)}")

# Build the vocabulary
vocabulary = sorted(set(corpus_all))

print("\n=== Vocabulary (Unique Words) ===")
print(vocabulary)
print(f"Number of unique words (vocabulary size): {len(vocabulary)}")

# Print the same vocabulary again, one numbered word per line
vocabulary = sorted(set(corpus_all))

print("\n=== Vocabulary (Unique Words) ===")
for idx, word in enumerate(vocabulary, start=1):
    print(f"{idx:>2}. {word}")
print(f"\nNumber of unique words (vocabulary size): {len(vocabulary)}")

# Numeric representation (BoW matrix)
bow_matrix = []
for doc in tokenized_docs:
    vector = [doc.count(word) for word in vocabulary]
    bow_matrix.append(vector)

df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)
df_bow.index = [f"D{i}" for i in range(1, len(documents)+1)]  # relabel the index as D1, D2, D3

print("\n=== Bag of Words Matrix ===")
print(df_bow)

# Build the word frequency table (totals across all documents)
word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()
word_frequencies.columns = ["Word", "Frequency"]

print("\n=== Word Frequency Table (All Documents) ===")
print(word_frequencies)
print(f"Number of distinct words: {len(word_frequencies)}")

View File

@ -0,0 +1,209 @@
from collections import Counter
from IPython.display import clear_output
import math

# 1. Input the sentence and tokenize
kalimat = input("Enter a sentence: ").strip()

# Clear the echoed prompt (notebook environments only)
try:
    clear_output()
except Exception:
    pass

print(f"Corpus: {kalimat}")

# Tokenize
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Count unigram frequencies
unigram_counts = Counter(tokens)
total_tokens = sum(unigram_counts.values())

print("\nUnigram frequencies in the sentence")
for pair, count in unigram_counts.items():
    print(f" ('{pair}'): {count}")
print(f"\nTotal unigrams in the sentence: {total_tokens}")

# 3. Compute unigram probabilities: P(wi) = Count(wi) / total words
unigram_probabilities = {}
for word, count in unigram_counts.items():
    prob = count / total_tokens
    unigram_probabilities[word] = prob

print("\nProbability of each unigram:")
for word, prob in unigram_probabilities.items():
    print(f" P({word}) = {prob:.2f} ({prob*100:.2f}%)")

# 4. Compute the overall sentence probability: P(sentence) = P(w1) * P(w2) * ...
p_kalimat = 1
prob_parts = []

# Accumulate the total probability and build the detailed formula string
for word in tokens:
    prob_value = unigram_probabilities[word]
    p_kalimat *= prob_value
    # Format: P(word)=prob_value
    prob_parts.append(f"P({word})={prob_value:.2f}")

# Join the formula parts into a single display string
prob_str = " x ".join(prob_parts)

print("\nOverall Sentence Probability (Unigram Model):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)")
from collections import Counter
from IPython.display import clear_output
import math

# 1. Input the sentence and tokenize
kalimat = input("Enter a sentence: ").strip()

# Clear the echoed prompt (notebook environments only)
try:
    clear_output()
except Exception:
    pass

print(f"Corpus: {kalimat}")

# Tokenization
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Count unigram and bigram frequencies
unigram_counts = Counter(tokens)
bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]
bigram_counts = Counter(bigrams)

print("\nBigram frequencies in the sentence:")
for pair, count in bigram_counts.items():
    print(f" {pair}: {count}")
print(f"\nTotal bigrams in the sentence: {sum(bigram_counts.values())}")

# 3. Compute bigram probabilities: P(w2 | w1) = Count(w1,w2) / Count(w1)
bigram_probabilities = {}
for (w1, w2), count in bigram_counts.items():
    prob = count / unigram_counts[w1]
    bigram_probabilities[(w1, w2)] = prob

print("\nProbability of each bigram:")
for (w1, w2), prob in bigram_probabilities.items():
    print(f" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)")

# 4. Compute the overall sentence probability (bigram model)
# P(sentence) = P(w1) * P(w2|w1) * P(w3|w2) * ...
total_tokens = sum(unigram_counts.values())
p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens  # P(w1)
p_kalimat = p_w1  # initialize with P(w1)

prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]  # add P(w1) to the formula

for i in range(1, len(tokens)):
    pair = (tokens[i-1], tokens[i])
    p = bigram_probabilities.get(pair, 0)
    p_kalimat *= p
    prob_str_parts.append(f"P({pair[1]}|{pair[0]})={p:.2f}")

# Join the product formula for display
prob_str = " x ".join(prob_str_parts)

print("\nOverall Sentence Probability (Bigram Model):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")
from collections import Counter
from IPython.display import clear_output
import math

# 1. Input the sentence and tokenize
kalimat = input("Enter a sentence: ").strip()

# Clear the echoed prompt (notebook environments only)
try:
    clear_output()
except Exception:
    pass

print(f"Corpus: {kalimat}")

# Tokenization
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Count bigram and trigram frequencies
bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]
trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]

bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)

print("\nTrigram frequencies in the sentence:")
for tg, count in trigram_counts.items():
    print(f" {tg}: {count}")
print(f"\nTotal trigrams in the sentence: {sum(trigram_counts.values())}")

# 3. Compute trigram probabilities: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)
trigram_probabilities = {}
for (w1, w2, w3), count in trigram_counts.items():
    # Avoid division by zero (in case a bigram never occurs)
    if bigram_counts[(w1, w2)] > 0:
        prob = count / bigram_counts[(w1, w2)]
    else:
        prob = 0
    trigram_probabilities[(w1, w2, w3)] = prob

print("\nProbability of each trigram:")
for (w1, w2, w3), prob in trigram_probabilities.items():
    print(f" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)")

# Unigram counts (needed for P(w1) and P(w2|w1))
unigram_counts = Counter(tokens)
total_tokens = sum(unigram_counts.values())

# 4. Compute the overall sentence probability (trigram model)
# P(sentence) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...

# a. P(w1)
p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0

# b. P(w2|w1) (bigram, no smoothing)
if len(tokens) > 1:
    count_w1 = unigram_counts.get(tokens[0], 1)  # avoid division by zero
    p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1
else:
    p_w2_w1 = 1.0  # only one word in the sentence

p_kalimat = p_w1 * p_w2_w1  # initialize with P(w1) * P(w2|w1)

# Formula parts for display
prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]
if len(tokens) > 1:
    prob_str_parts.append(f"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}")

# c. Multiply in the trigram terms P(wi | wi-2, wi-1) for i >= 3
for i in range(len(tokens) - 2):
    triplet = (tokens[i], tokens[i+1], tokens[i+2])
    p = trigram_probabilities.get(triplet, 0)
    p_kalimat *= p
    prob_str_parts.append(f"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}")

prob_str = " x ".join(prob_str_parts)

print("\nOverall Sentence Probability (Trigram Model):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")

View File

@ -0,0 +1,209 @@
from collections import Counter
from IPython.display import clear_output
import math

# 1. Input the sentence and tokenize
kalimat = input("Enter a sentence: ").strip()

# Clear the echoed prompt (notebook environments only)
try:
    clear_output()
except Exception:
    pass

print(f"Corpus: {kalimat}")

# Tokenize
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Count unigram frequencies
unigram_counts = Counter(tokens)
total_tokens = sum(unigram_counts.values())

print("\nUnigram frequencies in the sentence")
for pair, count in unigram_counts.items():
    print(f" ('{pair}'): {count}")
print(f"\nTotal unigrams in the sentence: {total_tokens}")

# 3. Compute unigram probabilities: P(wi) = Count(wi) / total words
unigram_probabilities = {}
for word, count in unigram_counts.items():
    prob = count / total_tokens
    unigram_probabilities[word] = prob

print("\nProbability of each unigram:")
for word, prob in unigram_probabilities.items():
    print(f" P({word}) = {prob:.2f} ({prob*100:.2f}%)")

# 4. Compute the overall sentence probability: P(sentence) = P(w1) * P(w2) * ...
p_kalimat = 1
prob_parts = []

# Accumulate the total probability and build the detailed formula string
for word in tokens:
    prob_value = unigram_probabilities[word]
    p_kalimat *= prob_value
    # Format: P(word)=prob_value
    prob_parts.append(f"P({word})={prob_value:.2f}")

# Join the formula parts into a single display string
prob_str = " x ".join(prob_parts)

print("\nOverall Sentence Probability (Unigram Model):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)")
from collections import Counter
from IPython.display import clear_output
import math

# 1. Input the sentence and tokenize
kalimat = input("Enter a sentence: ").strip()

# Clear the echoed prompt (notebook environments only)
try:
    clear_output()
except Exception:
    pass

print(f"Corpus: {kalimat}")

# Tokenization
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Count unigram and bigram frequencies
unigram_counts = Counter(tokens)
bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]
bigram_counts = Counter(bigrams)

print("\nBigram frequencies in the sentence:")
for pair, count in bigram_counts.items():
    print(f" {pair}: {count}")
print(f"\nTotal bigrams in the sentence: {sum(bigram_counts.values())}")

# 3. Compute bigram probabilities: P(w2 | w1) = Count(w1,w2) / Count(w1)
bigram_probabilities = {}
for (w1, w2), count in bigram_counts.items():
    prob = count / unigram_counts[w1]
    bigram_probabilities[(w1, w2)] = prob

print("\nProbability of each bigram:")
for (w1, w2), prob in bigram_probabilities.items():
    print(f" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)")

# 4. Compute the overall sentence probability (bigram model)
# P(sentence) = P(w1) * P(w2|w1) * P(w3|w2) * ...
total_tokens = sum(unigram_counts.values())
p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens  # P(w1)
p_kalimat = p_w1  # initialize with P(w1)

prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]  # add P(w1) to the formula

for i in range(1, len(tokens)):
    pair = (tokens[i-1], tokens[i])
    p = bigram_probabilities.get(pair, 0)
    p_kalimat *= p
    prob_str_parts.append(f"P({pair[1]}|{pair[0]})={p:.2f}")

# Join the product formula for display
prob_str = " x ".join(prob_str_parts)

print("\nOverall Sentence Probability (Bigram Model):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")
from collections import Counter
from IPython.display import clear_output
import math

# 1. Input the sentence and tokenize
kalimat = input("Enter a sentence: ").strip()

# Clear the echoed prompt (notebook environments only)
try:
    clear_output()
except Exception:
    pass

print(f"Corpus: {kalimat}")

# Tokenization
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Count bigram and trigram frequencies
bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]
trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]

bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)

print("\nTrigram frequencies in the sentence:")
for tg, count in trigram_counts.items():
    print(f" {tg}: {count}")
print(f"\nTotal trigrams in the sentence: {sum(trigram_counts.values())}")

# 3. Compute trigram probabilities: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)
trigram_probabilities = {}
for (w1, w2, w3), count in trigram_counts.items():
    # Avoid division by zero (in case a bigram never occurs)
    if bigram_counts[(w1, w2)] > 0:
        prob = count / bigram_counts[(w1, w2)]
    else:
        prob = 0
    trigram_probabilities[(w1, w2, w3)] = prob

print("\nProbability of each trigram:")
for (w1, w2, w3), prob in trigram_probabilities.items():
    print(f" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)")

# Unigram counts (needed for P(w1) and P(w2|w1))
unigram_counts = Counter(tokens)
total_tokens = sum(unigram_counts.values())

# 4. Compute the overall sentence probability (trigram model)
# P(sentence) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...

# a. P(w1)
p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0

# b. P(w2|w1) (bigram, no smoothing)
if len(tokens) > 1:
    count_w1 = unigram_counts.get(tokens[0], 1)  # avoid division by zero
    p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1
else:
    p_w2_w1 = 1.0  # only one word in the sentence

p_kalimat = p_w1 * p_w2_w1  # initialize with P(w1) * P(w2|w1)

# Formula parts for display
prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]
if len(tokens) > 1:
    prob_str_parts.append(f"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}")

# c. Multiply in the trigram terms P(wi | wi-2, wi-1) for i >= 3
for i in range(len(tokens) - 2):
    triplet = (tokens[i], tokens[i+1], tokens[i+2])
    p = trigram_probabilities.get(triplet, 0)
    p_kalimat *= p
    prob_str_parts.append(f"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}")

prob_str = " x ".join(prob_str_parts)

print("\nOverall Sentence Probability (Trigram Model):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,310 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qBYcPYAb059g",
"outputId": "9f57b704-da1b-4495-d366-24c30586dc76"
},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Masukkan jumlah dokumen yang ingin dimasukkan: 3\n"
]
}
],
"source": [
"# Input jumlah dokumen\n",
"import pandas as pd\n",
"n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mo-yt5Ob1N8j",
"outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Masukkan teks untuk dokumen ke-1: saya belajar nlp di kampus\n",
"Masukkan teks untuk dokumen ke-2: saya suka belajar ai\n",
"Masukkan teks untuk dokumen ke-3: mahasiswa belajar data science dan nlp\n",
"\n",
"=== Dokumen yang Dimasukkan ===\n",
"Doc 1: saya belajar nlp di kampus\n",
"Doc 2: saya suka belajar ai\n",
"Doc 3: mahasiswa belajar data science dan nlp\n"
]
}
],
"source": [
"# Input teks dokumen satu per satu\n",
"documents = []\n",
"for i in range(n):\n",
" teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n",
" documents.append(teks)\n",
"\n",
"print(\"\\n=== Dokumen yang Dimasukkan ===\")\n",
"for i, doc in enumerate(documents):\n",
" print(f\"Doc {i+1}: {doc}\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FkmxRAFq1oDK",
"outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Hasil Tokenisasi ===\n",
"Doc 1: ['saya', 'belajar', 'nlp', 'di', 'kampus']\n",
"Doc 2: ['saya', 'suka', 'belajar', 'ai']\n",
"Doc 3: ['mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n"
]
}
],
"source": [
"# Tahap Tokenisasi\n",
"tokenized_docs = []\n",
"for doc in documents:\n",
" tokens = doc.lower().split()\n",
" tokenized_docs.append(tokens)\n",
"\n",
"print(\"\\n=== Hasil Tokenisasi ===\")\n",
"for i, tokens in enumerate(tokenized_docs):\n",
" print(f\"Doc {i+1}: {tokens}\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ybC1Vo2C_c3q",
"outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
"['saya', 'belajar', 'nlp', 'di', 'kampus', 'saya', 'suka', 'belajar', 'ai', 'mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n",
"Jumlah total kata dalam seluruh dokumen: 15\n"
]
}
],
"source": [
"# Pembuatan Corpus\n",
"corpus_all = [word for doc in tokenized_docs for word in doc]\n",
"\n",
"print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n",
"print(corpus_all)\n",
"print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "s6S-Ma4R1xuq",
"outputId": "98c3685b-1798-4038-d17e-6e45ca419b51"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Vocabulary (Kata Unik) ===\n",
"['ai', 'belajar', 'dan', 'data', 'di', 'kampus', 'mahasiswa', 'nlp', 'saya', 'science', 'suka']\n",
"Jumlah kata unik (vocabulary size): 11\n",
"\n",
"=== Vocabulary (Kata Unik) ===\n",
" 1. ai\n",
" 2. belajar\n",
" 3. dan\n",
" 4. data\n",
" 5. di\n",
" 6. kampus\n",
" 7. mahasiswa\n",
" 8. nlp\n",
" 9. saya\n",
"10. science\n",
"11. suka\n",
"\n",
"Jumlah kata unik (vocabulary size): 11\n"
]
}
],
"source": [
"# Pembuatan Vocabulary\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"print(vocabulary)\n",
"print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n",
"\n",
"\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"for idx, word in enumerate(vocabulary, start=1):\n",
" print(f\"{idx:>2}. {word}\")\n",
"print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "ShevCTva2Fg9"
},
"outputs": [],
"source": [
"# Representasi Numerik (Matriks BoW)\n",
"bow_matrix = []\n",
"for doc in tokenized_docs:\n",
" vector = [doc.count(word) for word in vocabulary]\n",
" bow_matrix.append(vector)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-yB6D2pY2M0E",
"outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Matriks Bag of Words ===\n",
" ai belajar dan data di kampus mahasiswa nlp saya science suka\n",
"D1 0 1 0 0 1 1 0 1 1 0 0\n",
"D2 1 1 0 0 0 0 0 0 1 0 1\n",
"D3 0 1 1 1 0 0 1 1 0 1 0\n"
]
}
],
"source": [
"df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)\n",
"df_bow.index = [f\"D{i}\" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3\n",
"\n",
"print(\"\\n=== Matriks Bag of Words ===\")\n",
"print(df_bow)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8ruf5vKL2rGD",
"outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n",
" Kata Frekuensi\n",
"0 belajar 3\n",
"1 nlp 2\n",
"2 saya 2\n",
"3 dan 1\n",
"4 ai 1\n",
"5 data 1\n",
"6 di 1\n",
"7 mahasiswa 1\n",
"8 kampus 1\n",
"9 science 1\n",
"10 suka 1\n",
"Frekuensi kata: 11\n"
]
}
],
"source": [
"# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)\n",
"word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()\n",
"word_frequencies.columns = [\"Kata\", \"Frekuensi\"]\n",
"\n",
"print(\"\\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\")\n",
"print(word_frequencies)\n",
"print(f\"Frekuensi kata: {len(word_frequencies)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NQjExannHuj0"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

NLP/N-Gram.ipynb Normal file
View File

@ -0,0 +1,380 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "JVPdWpz3hhbj"
},
"source": [
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4Mvva3v65h1v"
},
"source": [
"# **UNIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1cub_VJnUJMl",
"outputId": "1889eb61-4f3f-4780-f42e-02368076cce3"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: saya suka makan nasi\n",
"Tokens (4): ['saya', 'suka', 'makan', 'nasi']\n",
"\n",
"Frekuensi Unigram dalam kalimat\n",
" ('saya'): 1\n",
" ('suka'): 1\n",
" ('makan'): 1\n",
" ('nasi'): 1\n",
"\n",
"Total unigram dalam 1 kalimat: 4\n",
"\n",
"Probabilitas masing-masing unigram:\n",
" P(saya) = 0.25 (25.00%)\n",
" P(suka) = 0.25 (25.00%)\n",
" P(makan) = 0.25 (25.00%)\n",
" P(nasi) = 0.25 (25.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
" P(saya suka makan nasi) = P(saya)=0.25 x P(suka)=0.25 x P(makan)=0.25 x P(nasi)=0.25 = 0.0039 (0.39%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenize\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Unigram\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"print(\"\\nFrekuensi Unigram dalam kalimat\")\n",
"for pair, count in unigram_counts.items():\n",
" print(f\" ('{pair}'): {count}\")\n",
"print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
"\n",
"# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata\n",
"unigram_probabilities = {}\n",
"for word, count in unigram_counts.items():\n",
" prob = count / total_tokens\n",
" unigram_probabilities[word] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing unigram:\")\n",
"for word, prob in unigram_probabilities.items():\n",
" print(f\" P({word}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (P(kalimat) = P(w1) * P(w2) * ...)\n",
"p_kalimat = 1\n",
"prob_parts = []\n",
"\n",
"# Loop untuk menghitung probabilitas total dan membangun string rumus detail\n",
"for word in tokens:\n",
" prob_value = unigram_probabilities[word]\n",
" p_kalimat *= prob_value\n",
" # Format: P(word)=prob_value\n",
" prob_parts.append(f\"P({word})={prob_value:.2f}\")\n",
"\n",
"# Gabungkan bagian-bagian rumus untuk mendapatkan prob_str detail\n",
"prob_str = \" x \".join(prob_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Vstwt996-FrS"
},
"source": [
"# **BIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XRIY4qgTVbjl",
"outputId": "ea6e62ce-45a0-40c9-ca98-1fcc30558479"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: saya makan nasi dan saya makan roti\n",
"Tokens (7): ['saya', 'makan', 'nasi', 'dan', 'saya', 'makan', 'roti']\n",
"\n",
"Frekuensi Bigram dalam kalimat:\n",
" ('saya', 'makan'): 2\n",
" ('makan', 'nasi'): 1\n",
" ('nasi', 'dan'): 1\n",
" ('dan', 'saya'): 1\n",
" ('makan', 'roti'): 1\n",
"\n",
"Total bigram dalam 1 kalimat: 6\n",
"\n",
"Probabilitas masing-masing bigram:\n",
" P(makan|saya) = 1.00 (100.00%)\n",
" P(nasi|makan) = 0.50 (50.00%)\n",
" P(dan|nasi) = 1.00 (100.00%)\n",
" P(saya|dan) = 1.00 (100.00%)\n",
" P(roti|makan) = 0.50 (50.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
" P(saya makan nasi dan saya makan roti) = P(saya)=0.29 x P(makan|saya)=1.00 x P(nasi|makan)=0.50 x P(dan|nasi)=1.00 x P(saya|dan)=1.00 x P(makan|saya)=1.00 x P(roti|makan)=0.50 = 0.071429 (7.14%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenisasi\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Unigram dan Bigram\n",
"unigram_counts = Counter(tokens)\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"bigram_counts = Counter(bigrams)\n",
"\n",
"print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
"for pair, count in bigram_counts.items():\n",
" print(f\" {pair}: {count}\")\n",
"print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n",
"\n",
"# 3. Hitung Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)\n",
"bigram_probabilities = {}\n",
"for (w1, w2), count in bigram_counts.items():\n",
" prob = count / unigram_counts[w1]\n",
" bigram_probabilities[(w1, w2)] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing bigram:\")\n",
"for (w1, w2), prob in bigram_probabilities.items():\n",
" print(f\" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Bigram)\n",
"# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w2) * ...\n",
"total_tokens = sum(unigram_counts.values())\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens # P(w1)\n",
"p_kalimat = p_w1 # Inisialisasi dengan P(w1)\n",
"\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"] # Tambahkan P(w1) ke rumus\n",
"\n",
"for i in range(1, len(tokens)):\n",
" pair = (tokens[i-1], tokens[i])\n",
" p = bigram_probabilities.get(pair, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.2f}\")\n",
"\n",
"# Gabungkan rumus perkalian untuk ditampilkan\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "E6n1IU8X-G9S"
},
"source": [
"# **TRIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BIRARsj2FHJg",
"outputId": "968d420e-9370-40e5-e7e1-148e1d351d62"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning\n",
"Tokens (7): ['mahasiswa', 'mengerjakan', 'tugas', 'kemudian', 'mahasiswa', 'upload', 'e-learning']\n",
"\n",
"Frekuensi Trigram dalam kalimat:\n",
" ('mahasiswa', 'mengerjakan', 'tugas'): 1\n",
" ('mengerjakan', 'tugas', 'kemudian'): 1\n",
" ('tugas', 'kemudian', 'mahasiswa'): 1\n",
" ('kemudian', 'mahasiswa', 'upload'): 1\n",
" ('mahasiswa', 'upload', 'e-learning'): 1\n",
"\n",
"Total trigram dalam 1 kalimat: 5\n",
"\n",
"Probabilitas masing-masing trigram:\n",
" P(tugas|mahasiswa,mengerjakan) = 1.00 (100.00%)\n",
" P(kemudian|mengerjakan,tugas) = 1.00 (100.00%)\n",
" P(mahasiswa|tugas,kemudian) = 1.00 (100.00%)\n",
" P(upload|kemudian,mahasiswa) = 1.00 (100.00%)\n",
" P(e-learning|mahasiswa,upload) = 1.00 (100.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
" P(mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning) = P(mahasiswa)=0.29 x P(mengerjakan|mahasiswa)=0.50 x P(tugas|mahasiswa,mengerjakan)=1.00 x P(kemudian|mengerjakan,tugas)=1.00 x P(mahasiswa|tugas,kemudian)=1.00 x P(upload|kemudian,mahasiswa)=1.00 x P(e-learning|mahasiswa,upload)=1.00 = 0.142857 (14.29%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenisasi\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Bigram dan Trigram\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]\n",
"\n",
"bigram_counts = Counter(bigrams)\n",
"trigram_counts = Counter(trigrams)\n",
"\n",
"print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
"for tg, count in trigram_counts.items():\n",
" print(f\" {tg}: {count}\")\n",
"print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n",
"\n",
"# 3. Hitung Probabilitas Trigram: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)\n",
"trigram_probabilities = {}\n",
"for (w1, w2, w3), count in trigram_counts.items():\n",
" # Hindari pembagian dengan nol (jika ada bigram yang tidak muncul)\n",
" if bigram_counts[(w1, w2)] > 0:\n",
" prob = count / bigram_counts[(w1, w2)]\n",
" else:\n",
" prob = 0\n",
" trigram_probabilities[(w1, w2, w3)] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing trigram:\")\n",
"for (w1, w2, w3), prob in trigram_probabilities.items():\n",
" print(f\" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# Tambahkan perhitungan Unigram Count (dibutuhkan untuk P(w1) dan P(w2|w1))\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Trigram)\n",
"# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...\n",
"\n",
"# a. P(w1)\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0\n",
"\n",
"# b. P(w2|w1) (Menggunakan Bigram tanpa smoothing)\n",
"if len(tokens) > 1:\n",
" count_w1 = unigram_counts.get(tokens[0], 1) # Hindari pembagian dengan 0\n",
" p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1\n",
"else:\n",
" p_w2_w1 = 1.0 # Jika hanya 1 kata\n",
"\n",
"p_kalimat = p_w1 * p_w2_w1 # Inisialisasi dengan P(w1) * P(w2|w1)\n",
"\n",
"# Daftar bagian rumus untuk ditampilkan\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]\n",
"if len(tokens) > 1:\n",
" prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}\")\n",
"\n",
"# c. Perkalian Trigram P(wi | wi-2, wi-1) untuk i >= 3\n",
"for i in range(len(tokens) - 2):\n",
" triplet = (tokens[i], tokens[i+1], tokens[i+2])\n",
" p = trigram_probabilities.get(triplet, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}\")\n",
"\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

README.md Normal file
View File

@ -0,0 +1 @@
# Practicum Materials Compilation