{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model Unigram, Bigram, dan Trigram\n",
    "\n",
    "**Nama:** Alya Priscilla  \n",
    "**NIM:** 202210715016  \n",
    "**Kelas:** F7B2  \n",
    "\n",
    "**Tujuan praktikum:**  \n",
    "Memahami cara menghitung frekuensi dan probabilitas kalimat menggunakan model Unigram, Bigram, dan Trigram pada teks bahasa Indonesia.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "from IPython.display import clear_output\n",
    "import math"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fungsi bantu\n",
    "\n",
    "Fungsi dan kalimat contoh yang dipakai bersama oleh ketiga model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Example sentences used when the prompt is left blank, so every model cell\n",
    "# still has at least one token to work with instead of failing on tokens[0].\n",
    "CONTOH_UNIGRAM = \"fatah suka olahraga lari dan suka olahraga badminton\"\n",
    "CONTOH_BIGRAM = \"Alya sedang belajar model bigram untuk menghitung probabilitas kalimat dan alya sangat suka belajar\"\n",
    "CONTOH_TRIGRAM = \"Pada praktikum ini Alya sedang mempelajari model trigram untuk kalimat bahasa Indonesia\"\n",
    "\n",
    "\n",
    "def baca_kalimat(contoh):\n",
    "    \"\"\"Prompt for a sentence and return it without surrounding whitespace.\n",
    "\n",
    "    Args:\n",
    "        contoh: Fallback sentence returned when the entry is blank.\n",
    "\n",
    "    Returns:\n",
    "        The entered sentence, or `contoh` if nothing was entered.\n",
    "    \"\"\"\n",
    "    kalimat = input(\"Masukkan kalimat: \").strip()\n",
    "    try:\n",
    "        # Clear the cell output so only the report printed afterwards remains.\n",
    "        clear_output()\n",
    "    except Exception:\n",
    "        # Best effort only: a failure to clear must not stop the analysis.\n",
    "        pass\n",
    "    return kalimat or contoh\n",
    "\n",
    "\n",
    "def tokenisasi(kalimat):\n",
    "    \"\"\"Lowercase `kalimat` and split it on whitespace into a list of tokens.\"\"\"\n",
    "    return kalimat.lower().split()\n",
    "\n",
    "\n",
    "def buat_ngram(tokens, n):\n",
    "    \"\"\"Return every run of `n` consecutive tokens as a list of tuples.\n",
    "\n",
    "    The list is empty when `tokens` holds fewer than `n` items.\n",
    "    \"\"\"\n",
    "    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model Unigram\n",
    "\n",
    "Probabilitas kalimat dihitung sebagai hasil kali probabilitas setiap kata."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Corpus: fatah suka olahraga lari dan suka olahraga badminton\n",
      "Tokens (8): ['fatah', 'suka', 'olahraga', 'lari', 'dan', 'suka', 'olahraga', 'badminton']\n",
      "\n",
      "Frekuensi Unigram dalam kalimat:\n",
      " ('fatah'): 1\n",
      " ('suka'): 2\n",
      " ('olahraga'): 2\n",
      " ('lari'): 1\n",
      " ('dan'): 1\n",
      " ('badminton'): 1\n",
      "\n",
      "Total unigram dalam 1 kalimat: 8\n",
      "\n",
      "Probabilitas masing-masing unigram:\n",
      " P(fatah) = 0.1250 (12.50%)\n",
      " P(suka) = 0.2500 (25.00%)\n",
      " P(olahraga) = 0.2500 (25.00%)\n",
      " P(lari) = 0.1250 (12.50%)\n",
      " P(dan) = 0.1250 (12.50%)\n",
      " P(badminton) = 0.1250 (12.50%)\n",
      "\n",
      "Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
      " P(fatah suka olahraga lari dan suka olahraga badminton) = P(fatah)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(lari)=0.1250 x P(dan)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(badminton)=0.1250 = 0.00000095 (0.000095%)\n"
     ]
    }
   ],
   "source": [
    "# ================= UNIGRAM =================\n",
    "\n",
    "# 1. Read the sentence and tokenize it\n",
    "kalimat = baca_kalimat(CONTOH_UNIGRAM)\n",
    "print(f\"Corpus: {kalimat}\")\n",
    "\n",
    "tokens = tokenisasi(kalimat)\n",
    "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
    "\n",
    "# 2. Unigram frequencies\n",
    "unigram_counts = Counter(tokens)\n",
    "total_tokens = sum(unigram_counts.values())\n",
    "\n",
    "print(\"\\nFrekuensi Unigram dalam kalimat:\")\n",
    "for word, count in unigram_counts.items():\n",
    "    print(f\" ('{word}'): {count}\")\n",
    "print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
    "\n",
    "# 3. Unigram probability: P(wi) = Count(wi) / total number of tokens\n",
    "unigram_probabilities = {\n",
    "    word: count / total_tokens for word, count in unigram_counts.items()\n",
    "}\n",
    "\n",
    "print(\"\\nProbabilitas masing-masing unigram:\")\n",
    "for word, prob in unigram_probabilities.items():\n",
    "    print(f\" P({word}) = {prob:.4f} ({prob*100:.2f}%)\")\n",
    "\n",
    "# 4. Sentence probability: P(w1) * P(w2) * ... * P(wn)\n",
    "p_kalimat = 1\n",
    "prob_parts = []\n",
    "for word in tokens:\n",
    "    prob_value = unigram_probabilities[word]\n",
    "    p_kalimat *= prob_value\n",
    "    prob_parts.append(f\"P({word})={prob_value:.4f}\")\n",
    "\n",
    "prob_str = \" x \".join(prob_parts)\n",
    "\n",
    "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n",
    "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model Bigram\n",
    "\n",
    "Setiap kata dikondisikan pada satu kata sebelumnya."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Corpus: Alya sedang belajar model bigram untuk menghitung probabilitas kalimat dan alya sangat suka belajar\n",
      "Tokens (14): ['alya', 'sedang', 'belajar', 'model', 'bigram', 'untuk', 'menghitung', 'probabilitas', 'kalimat', 'dan', 'alya', 'sangat', 'suka', 'belajar']\n",
      "\n",
      "Frekuensi Bigram dalam kalimat:\n",
      " ('alya', 'sedang'): 1\n",
      " ('sedang', 'belajar'): 1\n",
      " ('belajar', 'model'): 1\n",
      " ('model', 'bigram'): 1\n",
      " ('bigram', 'untuk'): 1\n",
      " ('untuk', 'menghitung'): 1\n",
      " ('menghitung', 'probabilitas'): 1\n",
      " ('probabilitas', 'kalimat'): 1\n",
      " ('kalimat', 'dan'): 1\n",
      " ('dan', 'alya'): 1\n",
      " ('alya', 'sangat'): 1\n",
      " ('sangat', 'suka'): 1\n",
      " ('suka', 'belajar'): 1\n",
      "\n",
      "Total bigram dalam 1 kalimat: 13\n",
      "\n",
      "Probabilitas masing-masing bigram:\n",
      " P(sedang|alya) = 0.5000 (50.00%)\n",
      " P(belajar|sedang) = 1.0000 (100.00%)\n",
      " P(model|belajar) = 0.5000 (50.00%)\n",
      " P(bigram|model) = 1.0000 (100.00%)\n",
      " P(untuk|bigram) = 1.0000 (100.00%)\n",
      " P(menghitung|untuk) = 1.0000 (100.00%)\n",
      " P(probabilitas|menghitung) = 1.0000 (100.00%)\n",
      " P(kalimat|probabilitas) = 1.0000 (100.00%)\n",
      " P(dan|kalimat) = 1.0000 (100.00%)\n",
      " P(alya|dan) = 1.0000 (100.00%)\n",
      " P(sangat|alya) = 0.5000 (50.00%)\n",
      " P(suka|sangat) = 1.0000 (100.00%)\n",
      " P(belajar|suka) = 1.0000 (100.00%)\n",
      "\n",
      "Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
      " P(alya sedang belajar model bigram untuk menghitung probabilitas kalimat dan alya sangat suka belajar) = P(alya)=0.1429 x P(sedang|alya)=0.5000 x P(belajar|sedang)=1.0000 x P(model|belajar)=0.5000 x P(bigram|model)=1.0000 x P(untuk|bigram)=1.0000 x P(menghitung|untuk)=1.0000 x P(probabilitas|menghitung)=1.0000 x P(kalimat|probabilitas)=1.0000 x P(dan|kalimat)=1.0000 x P(alya|dan)=1.0000 x P(sangat|alya)=0.5000 x P(suka|sangat)=1.0000 x P(belajar|suka)=1.0000 = 0.01785714 (1.785714%)\n"
     ]
    }
   ],
   "source": [
    "# ================= BIGRAM =================\n",
    "\n",
    "# 1. Read the sentence and tokenize it\n",
    "kalimat = baca_kalimat(CONTOH_BIGRAM)\n",
    "print(f\"Corpus: {kalimat}\")\n",
    "\n",
    "tokens = tokenisasi(kalimat)\n",
    "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
    "\n",
    "# 2. Unigram and bigram frequencies\n",
    "unigram_counts = Counter(tokens)\n",
    "bigrams = buat_ngram(tokens, 2)\n",
    "bigram_counts = Counter(bigrams)\n",
    "\n",
    "print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
    "for pair, count in bigram_counts.items():\n",
    "    print(f\" {pair}: {count}\")\n",
    "print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n",
    "\n",
    "# 3. Bigram probability: P(w2 | w1) = Count(w1, w2) / Count(w1)\n",
    "bigram_probabilities = {\n",
    "    (w1, w2): count / unigram_counts[w1]\n",
    "    for (w1, w2), count in bigram_counts.items()\n",
    "}\n",
    "\n",
    "print(\"\\nProbabilitas masing-masing bigram:\")\n",
    "for (w1, w2), prob in bigram_probabilities.items():\n",
    "    print(f\" P({w2}|{w1}) = {prob:.4f} ({prob*100:.2f}%)\")\n",
    "\n",
    "# 4. Sentence probability: P(w1) * P(w2|w1) * ... * P(wn|wn-1)\n",
    "total_tokens = sum(unigram_counts.values())\n",
    "p_w1 = unigram_counts[tokens[0]] / total_tokens\n",
    "p_kalimat = p_w1\n",
    "\n",
    "prob_str_parts = [f\"P({tokens[0]})={p_w1:.4f}\"]\n",
    "\n",
    "for pair in bigrams:\n",
    "    p = bigram_probabilities.get(pair, 0)\n",
    "    p_kalimat *= p\n",
    "    prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.4f}\")\n",
    "\n",
    "prob_str = \" x \".join(prob_str_parts)\n",
    "\n",
    "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n",
    "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model Trigram\n",
    "\n",
    "Setiap kata dikondisikan pada dua kata sebelumnya."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Corpus: Pada praktikum ini Alya sedang mempelajari model trigram untuk kalimat bahasa Indonesia\n",
      "Tokens (12): ['pada', 'praktikum', 'ini', 'alya', 'sedang', 'mempelajari', 'model', 'trigram', 'untuk', 'kalimat', 'bahasa', 'indonesia']\n",
      "\n",
      "Frekuensi Trigram dalam kalimat:\n",
      " ('pada', 'praktikum', 'ini'): 1\n",
      " ('praktikum', 'ini', 'alya'): 1\n",
      " ('ini', 'alya', 'sedang'): 1\n",
      " ('alya', 'sedang', 'mempelajari'): 1\n",
      " ('sedang', 'mempelajari', 'model'): 1\n",
      " ('mempelajari', 'model', 'trigram'): 1\n",
      " ('model', 'trigram', 'untuk'): 1\n",
      " ('trigram', 'untuk', 'kalimat'): 1\n",
      " ('untuk', 'kalimat', 'bahasa'): 1\n",
      " ('kalimat', 'bahasa', 'indonesia'): 1\n",
      "\n",
      "Total trigram dalam 1 kalimat: 10\n",
      "\n",
      "Probabilitas masing-masing trigram:\n",
      " P(ini|pada,praktikum) = 1.0000 (100.00%)\n",
      " P(alya|praktikum,ini) = 1.0000 (100.00%)\n",
      " P(sedang|ini,alya) = 1.0000 (100.00%)\n",
      " P(mempelajari|alya,sedang) = 1.0000 (100.00%)\n",
      " P(model|sedang,mempelajari) = 1.0000 (100.00%)\n",
      " P(trigram|mempelajari,model) = 1.0000 (100.00%)\n",
      " P(untuk|model,trigram) = 1.0000 (100.00%)\n",
      " P(kalimat|trigram,untuk) = 1.0000 (100.00%)\n",
      " P(bahasa|untuk,kalimat) = 1.0000 (100.00%)\n",
      " P(indonesia|kalimat,bahasa) = 1.0000 (100.00%)\n",
      "\n",
      "Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
      " P(pada praktikum ini alya sedang mempelajari model trigram untuk kalimat bahasa indonesia) = P(pada)=0.0833 x P(praktikum|pada)=1.0000 x P(ini|pada,praktikum)=1.0000 x P(alya|praktikum,ini)=1.0000 x P(sedang|ini,alya)=1.0000 x P(mempelajari|alya,sedang)=1.0000 x P(model|sedang,mempelajari)=1.0000 x P(trigram|mempelajari,model)=1.0000 x P(untuk|model,trigram)=1.0000 x P(kalimat|trigram,untuk)=1.0000 x P(bahasa|untuk,kalimat)=1.0000 x P(indonesia|kalimat,bahasa)=1.0000 = 0.08333333 (8.333333%)\n"
     ]
    }
   ],
   "source": [
    "# ================= TRIGRAM =================\n",
    "\n",
    "# 1. Read the sentence and tokenize it\n",
    "kalimat = baca_kalimat(CONTOH_TRIGRAM)\n",
    "print(f\"Corpus: {kalimat}\")\n",
    "\n",
    "tokens = tokenisasi(kalimat)\n",
    "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
    "\n",
    "# 2. Bigram and trigram frequencies\n",
    "bigrams = buat_ngram(tokens, 2)\n",
    "trigrams = buat_ngram(tokens, 3)\n",
    "\n",
    "bigram_counts = Counter(bigrams)\n",
    "trigram_counts = Counter(trigrams)\n",
    "\n",
    "print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
    "for tg, count in trigram_counts.items():\n",
    "    print(f\" {tg}: {count}\")\n",
    "print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n",
    "\n",
    "# 3. Trigram probability: P(w3 | w1, w2) = Count(w1, w2, w3) / Count(w1, w2)\n",
    "# Every trigram starts with a bigram built from the same token list, so the\n",
    "# denominator is always at least 1 and needs no zero check.\n",
    "trigram_probabilities = {\n",
    "    (w1, w2, w3): count / bigram_counts[(w1, w2)]\n",
    "    for (w1, w2, w3), count in trigram_counts.items()\n",
    "}\n",
    "\n",
    "print(\"\\nProbabilitas masing-masing trigram:\")\n",
    "for (w1, w2, w3), prob in trigram_probabilities.items():\n",
    "    print(f\" P({w3}|{w1},{w2}) = {prob:.4f} ({prob*100:.2f}%)\")\n",
    "\n",
    "# 4. Sentence probability: P(w1) * P(w2|w1) * product of P(wi | wi-2, wi-1)\n",
    "unigram_counts = Counter(tokens)\n",
    "total_tokens = sum(unigram_counts.values())\n",
    "\n",
    "# P(w1)\n",
    "p_w1 = unigram_counts[tokens[0]] / total_tokens\n",
    "p_kalimat = p_w1\n",
    "prob_str_parts = [f\"P({tokens[0]})={p_w1:.4f}\"]\n",
    "\n",
    "# P(w2 | w1); a one-word sentence has no second token, so the factor is skipped\n",
    "if len(tokens) > 1:\n",
    "    p_w2_w1 = bigram_counts[(tokens[0], tokens[1])] / unigram_counts[tokens[0]]\n",
    "    p_kalimat *= p_w2_w1\n",
    "    prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.4f}\")\n",
    "\n",
    "# Multiply in the trigram factor for the third token onward\n",
    "for triplet in trigrams:\n",
    "    p = trigram_probabilities.get(triplet, 0)\n",
    "    p_kalimat *= p\n",
    "    prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.4f}\")\n",
    "\n",
    "prob_str = \" x \".join(prob_str_parts)\n",
    "\n",
    "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n",
    "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}