Repositori-NLP/Ngram-nlp.ipynb

390 lines
20 KiB
Plaintext

{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "kWlKIkcVOrKy"
},
"outputs": [],
"source": [
"{\n",
" \"cells\": [\n",
" {\n",
" \"cell_type\": \"markdown\",\n",
" \"metadata\": {},\n",
" \"source\": [\n",
" \"# Model Unigram, Bigram, dan Trigram\\n\",\n",
" \"\\n\",\n",
" \"**Nama:** Alya Priscilla \\n\",\n",
" \"**NIM:** 202210715016 \\n\",\n",
" \"**Kelas:** F7B2 \\n\",\n",
" \"\\n\",\n",
" \"**Tujuan praktikum:** \\n\",\n",
" \"Memahami cara menghitung frekuensi dan probabilitas kalimat menggunakan model Unigram, Bigram, dan Trigram pada teks bahasa Indonesia.\\n\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": null,\n",
" \"metadata\": {},\n",
" \"outputs\": [],\n",
" \"source\": [\n",
" \"from collections import Counter\\n\",\n",
" \"from IPython.display import clear_output\\n\",\n",
" \"import math\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 11,\n",
" \"metadata\": {},\n",
" \"outputs\": [\n",
" {\n",
" \"name\": \"stdout\",\n",
" \"output_type\": \"stream\",\n",
" \"text\": [\n",
" \"Corpus: alya suka olahraga lari dan suka olahraga badminton\\n\",\n",
" \"Tokens (8): ['alya', 'suka', 'olahraga', 'lari', 'dan', 'suka', 'olahraga', 'badminton']\\n\",\n",
" \"\\n\",\n",
" \"Frekuensi Unigram dalam kalimat:\\n\",\n",
" \" ('alya'): 1\\n\",\n",
" \" ('suka'): 2\\n\",\n",
" \" ('olahraga'): 2\\n\",\n",
" \" ('lari'): 1\\n\",\n",
" \" ('dan'): 1\\n\",\n",
" \" ('badminton'): 1\\n\",\n",
" \"\\n\",\n",
" \"Total unigram dalam 1 kalimat: 8\\n\",\n",
" \"\\n\",\n",
" \"Probabilitas masing-masing unigram:\\n\",\n",
" \" P(alya) = 0.1250 (12.50%)\\n\",\n",
" \" P(suka) = 0.2500 (25.00%)\\n\",\n",
" \" P(olahraga) = 0.2500 (25.00%)\\n\",\n",
" \" P(lari) = 0.1250 (12.50%)\\n\",\n",
" \" P(dan) = 0.1250 (12.50%)\\n\",\n",
" \" P(badminton) = 0.1250 (12.50%)\\n\",\n",
" \"\\n\",\n",
" \"Probabilitas Keseluruhan Kalimat (Model Unigram):\\n\",\n",
" \" P(alya suka olahraga lari dan suka olahraga badminton) = P(alya)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(lari)=0.1250 x P(dan)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(badminton)=0.1250 = 0.00000095 (0.000095%)\\n\"\n",
" ]\n",
" }\n",
" ],\n",
" \"source\": [\n",
" \"# ================= UNIGRAM =================\\n\",\n",
" \"\\n\",\n",
" \"# 1. Input Kalimat dan Tokenisasi\\n\",\n",
" \"kalimat = input(\\\"Masukkan kalimat: \\\").strip()\\n\",\n",
" \"\\n\",\n",
" \"# Bersihkan output (khusus lingkungan notebook)\\n\",\n",
" \"try:\\n\",\n",
" \" clear_output()\\n\",\n",
" \"except:\\n\",\n",
" \" pass\\n\",\n",
" \"\\n\",\n",
" \"print(f\\\"Corpus: {kalimat}\\\")\\n\",\n",
" \"\\n\",\n",
" \"# Tokenize\\n\",\n",
" \"tokens = kalimat.lower().split()\\n\",\n",
" \"print(f\\\"Tokens ({len(tokens)}): {tokens}\\\")\\n\",\n",
" \"\\n\",\n",
" \"# 2. Hitung Frekuensi Unigram\\n\",\n",
" \"unigram_counts = Counter(tokens)\\n\",\n",
" \"total_tokens = sum(unigram_counts.values())\\n\",\n",
" \"\\n\",\n",
" \"print(\\\"\\\\nFrekuensi Unigram dalam kalimat:\\\")\\n\",\n",
" \"for pair, count in unigram_counts.items():\\n\",\n",
" \" print(f\\\" ('{pair}'): {count}\\\")\\n\",\n",
" \"print(f\\\"\\\\nTotal unigram dalam 1 kalimat: {total_tokens}\\\")\\n\",\n",
" \"\\n\",\n",
" \"# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata\\n\",\n",
" \"unigram_probabilities = {}\\n\",\n",
" \"for word, count in unigram_counts.items():\\n\",\n",
" \" prob = count / total_tokens\\n\",\n",
" \" unigram_probabilities[word] = prob\\n\",\n",
" \"\\n\",\n",
" \"print(\\\"\\\\nProbabilitas masing-masing unigram:\\\")\\n\",\n",
" \"for word, prob in unigram_probabilities.items():\\n\",\n",
" \" print(f\\\" P({word}) = {prob:.4f} ({prob*100:.2f}%)\\\")\\n\",\n",
" \"\\n\",\n",
" \"# 4. Probabilitas Kalimat Keseluruhan (P(w1) * P(w2) * ...)\\n\",\n",
" \"p_kalimat = 1\\n\",\n",
" \"prob_parts = []\\n\",\n",
" \"\\n\",\n",
" \"for word in tokens:\\n\",\n",
" \" prob_value = unigram_probabilities[word]\\n\",\n",
" \" p_kalimat *= prob_value\\n\",\n",
" \" prob_parts.append(f\\\"P({word})={prob_value:.4f}\\\")\\n\",\n",
" \"\\n\",\n",
" \"prob_str = \\\" x \\\".join(prob_parts)\\n\",\n",
" \"\\n\",\n",
" \"print(\\\"\\\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\\\")\\n\",\n",
" \"print(f\\\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\\\")\\n\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 12,\n",
" \"metadata\": {},\n",
" \"outputs\": [\n",
" {\n",
" \"name\": \"stdout\",\n",
" \"output_type\": \"stream\",\n",
" \"text\": [\n",
" \"Corpus: Alya sedang belajar model bigram untuk menghitung probabilitas kalimat dan fatah sangat suka belajar\\n\",\n",
" \"Tokens (14): ['alya', 'sedang', 'belajar', 'model', 'bigram', 'untuk', 'menghitung', 'probabilitas', 'kalimat', 'dan', 'alya', 'sangat', 'suka', 'belajar']\\n\",\n",
" \"\\n\",\n",
" \"Frekuensi Bigram dalam kalimat:\\n\",\n",
" \" ('alya', 'sedang'): 1\\n\",\n",
" \" ('sedang', 'belajar'): 1\\n\",\n",
" \" ('belajar', 'model'): 1\\n\",\n",
" \" ('model', 'bigram'): 1\\n\",\n",
" \" ('bigram', 'untuk'): 1\\n\",\n",
" \" ('untuk', 'menghitung'): 1\\n\",\n",
" \" ('menghitung', 'probabilitas'): 1\\n\",\n",
" \" ('probabilitas', 'kalimat'): 1\\n\",\n",
" \" ('kalimat', 'dan'): 1\\n\",\n",
" \" ('dan', 'alya'): 1\\n\",\n",
" \" ('alya', 'sangat'): 1\\n\",\n",
" \" ('sangat', 'suka'): 1\\n\",\n",
" \" ('suka', 'belajar'): 1\\n\",\n",
" \"\\n\",\n",
" \"Total bigram dalam 1 kalimat: 13\\n\",\n",
" \"\\n\",\n",
" \"Probabilitas masing-masing bigram:\\n\",\n",
" \" P(sedang|alya) = 0.5000 (50.00%)\\n\",\n",
" \" P(belajar|sedang) = 1.0000 (100.00%)\\n\",\n",
" \" P(model|belajar) = 0.5000 (50.00%)\\n\",\n",
" \" P(bigram|model) = 1.0000 (100.00%)\\n\",\n",
" \" P(untuk|bigram) = 1.0000 (100.00%)\\n\",\n",
" \" P(menghitung|untuk) = 1.0000 (100.00%)\\n\",\n",
" \" P(probabilitas|menghitung) = 1.0000 (100.00%)\\n\",\n",
" \" P(kalimat|probabilitas) = 1.0000 (100.00%)\\n\",\n",
" \" P(dan|kalimat) = 1.0000 (100.00%)\\n\",\n",
" \" P(alya|dan) = 1.0000 (100.00%)\\n\",\n",
" \" P(sangat|alya) = 0.5000 (50.00%)\\n\",\n",
" \" P(suka|sangat) = 1.0000 (100.00%)\\n\",\n",
" \" P(belajar|suka) = 1.0000 (100.00%)\\n\",\n",
" \"\\n\",\n",
" \"Probabilitas Keseluruhan Kalimat (Model Bigram):\\n\",\n",
" \" P(alya sedang belajar model bigram untuk menghitung probabilitas kalimat dan alya sangat suka belajar) = P(alya)=0.1429 x P(sedang|fatah)=0.5000 x P(belajar|sedang)=1.0000 x P(model|belajar)=0.5000 x P(bigram|model)=1.0000 x P(untuk|bigram)=1.0000 x P(menghitung|untuk)=1.0000 x P(probabilitas|menghitung)=1.0000 x P(kalimat|probabilitas)=1.0000 x P(dan|kalimat)=1.0000 x P(fatah|dan)=1.0000 x P(sangat|fatah)=0.5000 x P(suka|sangat)=1.0000 x P(belajar|suka)=1.0000 = 0.01785714 (1.785714%)\\n\"\n",
" ]\n",
" }\n",
" ],\n",
" \"source\": [\n",
" \"# ================= BIGRAM =================\\n\",\n",
" \"\\n\",\n",
" \"kalimat = input(\\\"Masukkan kalimat: \\\").strip()\\n\",\n",
" \"\\n\",\n",
" \"try:\\n\",\n",
" \" clear_output()\\n\",\n",
" \"except:\\n\",\n",
" \" pass\\n\",\n",
" \"\\n\",\n",
" \"print(f\\\"Corpus: {kalimat}\\\")\\n\",\n",
" \"\\n\",\n",
" \"# Tokenisasi\\n\",\n",
" \"tokens = kalimat.lower().split()\\n\",\n",
" \"print(f\\\"Tokens ({len(tokens)}): {tokens}\\\")\\n\",\n",
" \"\\n\",\n",
" \"# 2. Frekuensi Unigram dan Bigram\\n\",\n",
" \"unigram_counts = Counter(tokens)\\n\",\n",
" \"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\\n\",\n",
" \"bigram_counts = Counter(bigrams)\\n\",\n",
" \"\\n\",\n",
" \"print(\\\"\\\\nFrekuensi Bigram dalam kalimat:\\\")\\n\",\n",
" \"for pair, count in bigram_counts.items():\\n\",\n",
" \" print(f\\\" {pair}: {count}\\\")\\n\",\n",
" \"print(f\\\"\\\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\\\")\\n\",\n",
" \"\\n\",\n",
" \"# 3. Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)\\n\",\n",
" \"bigram_probabilities = {}\\n\",\n",
" \"for (w1, w2), count in bigram_counts.items():\\n\",\n",
" \" prob = count / unigram_counts[w1]\\n\",\n",
" \" bigram_probabilities[(w1, w2)] = prob\\n\",\n",
" \"\\n\",\n",
" \"print(\\\"\\\\nProbabilitas masing-masing bigram:\\\")\\n\",\n",
" \"for (w1, w2), prob in bigram_probabilities.items():\\n\",\n",
" \" print(f\\\" P({w2}|{w1}) = {prob:.4f} ({prob*100:.2f}%)\\\")\\n\",\n",
" \"\\n\",\n",
" \"# 4. Probabilitas Kalimat (Model Bigram)\\n\",\n",
" \"total_tokens = sum(unigram_counts.values())\\n\",\n",
" \"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens\\n\",\n",
" \"p_kalimat = p_w1\\n\",\n",
" \"\\n\",\n",
" \"prob_str_parts = [f\\\"P({tokens[0]})={p_w1:.4f}\\\"]\\n\",\n",
" \"\\n\",\n",
" \"for i in range(1, len(tokens)):\\n\",\n",
" \" pair = (tokens[i-1], tokens[i])\\n\",\n",
" \" p = bigram_probabilities.get(pair, 0)\\n\",\n",
" \" p_kalimat *= p\\n\",\n",
" \" prob_str_parts.append(f\\\"P({pair[1]}|{pair[0]})={p:.4f}\\\")\\n\",\n",
" \"\\n\",\n",
" \"prob_str = \\\" x \\\".join(prob_str_parts)\\n\",\n",
" \"\\n\",\n",
" \"print(\\\"\\\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\\\")\\n\",\n",
" \"print(f\\\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\\\")\\n\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 13,\n",
" \"metadata\": {},\n",
" \"outputs\": [\n",
" {\n",
" \"name\": \"stdout\",\n",
" \"output_type\": \"stream\",\n",
" \"text\": [\n",
" \"Corpus: Pada praktikum ini alya sedang mempelajari model trigram untuk kalimat bahasa Indonesia\\n\",\n",
" \"Tokens (12): ['pada', 'praktikum', 'ini', 'alya', 'sedang', 'mempelajari', 'model', 'trigram', 'untuk', 'kalimat', 'bahasa', 'indonesia']\\n\",\n",
" \"\\n\",\n",
" \"Frekuensi Trigram dalam kalimat:\\n\",\n",
" \" ('pada', 'praktikum', 'ini'): 1\\n\",\n",
" \" ('praktikum', 'ini', 'alya'): 1\\n\",\n",
" \" ('ini', 'alya', 'sedang'): 1\\n\",\n",
" \" ('alya', 'sedang', 'mempelajari'): 1\\n\",\n",
" \" ('sedang', 'mempelajari', 'model'): 1\\n\",\n",
" \" ('mempelajari', 'model', 'trigram'): 1\\n\",\n",
" \" ('model', 'trigram', 'untuk'): 1\\n\",\n",
" \" ('trigram', 'untuk', 'kalimat'): 1\\n\",\n",
" \" ('untuk', 'kalimat', 'bahasa'): 1\\n\",\n",
" \" ('kalimat', 'bahasa', 'indonesia'): 1\\n\",\n",
" \"\\n\",\n",
" \"Total trigram dalam 1 kalimat: 10\\n\",\n",
" \"\\n\",\n",
" \"Probabilitas masing-masing trigram:\\n\",\n",
" \" P(ini|pada,praktikum) = 1.0000 (100.00%)\\n\",\n",
" \" P(alya|praktikum,ini) = 1.0000 (100.00%)\\n\",\n",
" \" P(sedang|ini,alya) = 1.0000 (100.00%)\\n\",\n",
" \" P(mempelajari|alya,sedang) = 1.0000 (100.00%)\\n\",\n",
" \" P(model|sedang,mempelajari) = 1.0000 (100.00%)\\n\",\n",
" \" P(trigram|mempelajari,model) = 1.0000 (100.00%)\\n\",\n",
" \" P(untuk|model,trigram) = 1.0000 (100.00%)\\n\",\n",
" \" P(kalimat|trigram,untuk) = 1.0000 (100.00%)\\n\",\n",
" \" P(bahasa|untuk,kalimat) = 1.0000 (100.00%)\\n\",\n",
" \" P(indonesia|kalimat,bahasa) = 1.0000 (100.00%)\\n\",\n",
" \"\\n\",\n",
" \"Probabilitas Keseluruhan Kalimat (Model Trigram):\\n\",\n",
" \" P(pada praktikum ini alya sedang mempelajari model trigram untuk kalimat bahasa indonesia) = P(pada)=0.0833 x P(praktikum|pada)=1.0000 x P(ini|pada,praktikum)=1.0000 x P(fatah|praktikum,ini)=1.0000 x P(sedang|ini,fatah)=1.0000 x P(mempelajari|fatah,sedang)=1.0000 x P(model|sedang,mempelajari)=1.0000 x P(trigram|mempelajari,model)=1.0000 x P(untuk|model,trigram)=1.0000 x P(kalimat|trigram,untuk)=1.0000 x P(bahasa|untuk,kalimat)=1.0000 x P(indonesia|kalimat,bahasa)=1.0000 = 0.08333333 (8.333333%)\\n\"\n",
" ]\n",
" }\n",
" ],\n",
" \"source\": [\n",
" \"# ================= TRIGRAM =================\\n\",\n",
" \"\\n\",\n",
" \"kalimat = input(\\\"Masukkan kalimat: \\\").strip()\\n\",\n",
" \"\\n\",\n",
" \"try:\\n\",\n",
" \" clear_output()\\n\",\n",
" \"except:\\n\",\n",
" \" pass\\n\",\n",
" \"\\n\",\n",
" \"print(f\\\"Corpus: {kalimat}\\\")\\n\",\n",
" \"\\n\",\n",
" \"# Tokenisasi\\n\",\n",
" \"tokens = kalimat.lower().split()\\n\",\n",
" \"print(f\\\"Tokens ({len(tokens)}): {tokens}\\\")\\n\",\n",
" \"\\n\",\n",
" \"# 2. Frekuensi Bigram dan Trigram\\n\",\n",
" \"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\\n\",\n",
" \"trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]\\n\",\n",
" \"\\n\",\n",
" \"bigram_counts = Counter(bigrams)\\n\",\n",
" \"trigram_counts = Counter(trigrams)\\n\",\n",
" \"\\n\",\n",
" \"print(\\\"\\\\nFrekuensi Trigram dalam kalimat:\\\")\\n\",\n",
" \"for tg, count in trigram_counts.items():\\n\",\n",
" \" print(f\\\" {tg}: {count}\\\")\\n\",\n",
" \"print(f\\\"\\\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\\\")\\n\",\n",
" \"\\n\",\n",
" \"# 3. Probabilitas Trigram: P(w3 | w1, w2)\\n\",\n",
" \"trigram_probabilities = {}\\n\",\n",
" \"for (w1, w2, w3), count in trigram_counts.items():\\n\",\n",
" \" if bigram_counts[(w1, w2)] > 0:\\n\",\n",
" \" prob = count / bigram_counts[(w1, w2)]\\n\",\n",
" \" else:\\n\",\n",
" \" prob = 0\\n\",\n",
" \" trigram_probabilities[(w1, w2, w3)] = prob\\n\",\n",
" \"\\n\",\n",
" \"print(\\\"\\\\nProbabilitas masing-masing trigram:\\\")\\n\",\n",
" \"for (w1, w2, w3), prob in trigram_probabilities.items():\\n\",\n",
" \" print(f\\\" P({w3}|{w1},{w2}) = {prob:.4f} ({prob*100:.2f}%)\\\")\\n\",\n",
" \"\\n\",\n",
" \"# 4. Probabilitas Kalimat (Model Trigram)\\n\",\n",
" \"\\n\",\n",
" \"unigram_counts = Counter(tokens)\\n\",\n",
" \"total_tokens = sum(unigram_counts.values())\\n\",\n",
" \"\\n\",\n",
" \"# P(w1)\\n\",\n",
" \"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0\\n\",\n",
" \"\\n\",\n",
" \"# P(w2|w1)\\n\",\n",
" \"if len(tokens) > 1:\\n\",\n",
" \" count_w1 = unigram_counts.get(tokens[0], 1)\\n\",\n",
" \" p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1\\n\",\n",
" \"else:\\n\",\n",
" \" p_w2_w1 = 1.0\\n\",\n",
" \"\\n\",\n",
" \"p_kalimat = p_w1 * p_w2_w1\\n\",\n",
" \"\\n\",\n",
" \"prob_str_parts = [f\\\"P({tokens[0]})={p_w1:.4f}\\\"]\\n\",\n",
" \"if len(tokens) > 1:\\n\",\n",
" \" prob_str_parts.append(f\\\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.4f}\\\")\\n\",\n",
" \"\\n\",\n",
" \"# Perkalian trigram untuk i >= 3\\n\",\n",
" \"for i in range(len(tokens) - 2):\\n\",\n",
" \" triplet = (tokens[i], tokens[i+1], tokens[i+2])\\n\",\n",
" \" p = trigram_probabilities.get(triplet, 0)\\n\",\n",
" \" p_kalimat *= p\\n\",\n",
" \" prob_str_parts.append(f\\\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.4f}\\\")\\n\",\n",
" \"\\n\",\n",
" \"prob_str = \\\" x \\\".join(prob_str_parts)\\n\",\n",
" \"\\n\",\n",
" \"print(\\\"\\\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\\\")\\n\",\n",
" \"print(f\\\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\\\")\\n\"\n",
" ]\n",
" }\n",
" ],\n",
" \"metadata\": {\n",
" \"colab\": {\n",
" \"provenance\": []\n",
" },\n",
" \"kernelspec\": {\n",
" \"display_name\": \"Python 3 (ipykernel)\",\n",
" \"language\": \"python\",\n",
" \"name\": \"python3\"\n",
" },\n",
" \"language_info\": {\n",
" \"codemirror_mode\": {\n",
" \"name\": \"ipython\",\n",
" \"version\": 3\n",
" },\n",
" \"file_extension\": \".py\",\n",
" \"mimetype\": \"text/x-python\",\n",
" \"name\": \"python\",\n",
" \"nbconvert_exporter\": \"python\",\n",
" \"pygments_lexer\": \"ipython3\",\n",
" \"version\": \"3.13.5\"\n",
" }\n",
" },\n",
" \"nbformat\": 4,\n",
" \"nbformat_minor\": 4\n",
"}\n"
]
}
]
}