418 lines
28 KiB
Plaintext
418 lines
28 KiB
Plaintext
{
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0,
|
|
"metadata": {
|
|
"colab": {
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"name": "python3",
|
|
"display_name": "Python 3"
|
|
},
|
|
"language_info": {
|
|
"name": "python"
|
|
}
|
|
},
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "qiOnctzkPKXj"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"{\n",
|
|
" \"nbformat\": 4,\n",
|
|
" \"nbformat_minor\": 0,\n",
|
|
" \"metadata\": {\n",
|
|
" \"colab\": {\n",
|
|
" \"provenance\": []\n",
|
|
" },\n",
|
|
" \"kernelspec\": {\n",
|
|
" \"name\": \"python3\",\n",
|
|
" \"display_name\": \"Python 3\"\n",
|
|
" },\n",
|
|
" \"language_info\": {\n",
|
|
" \"name\": \"python\"\n",
|
|
" }\n",
|
|
" },\n",
|
|
" \"cells\": [\n",
|
|
" {\n",
|
|
" \"cell_type\": \"code\",\n",
|
|
" \"execution_count\": null,\n",
|
|
" \"metadata\": {\n",
|
|
" \"id\": \"kWlKIkcVOrKy\"\n",
|
|
" },\n",
|
|
" \"outputs\": [],\n",
|
|
" \"source\": [\n",
|
|
" \"{\\n\",\n",
|
|
" \" \\\"cells\\\": [\\n\",\n",
|
|
" \" {\\n\",\n",
|
|
" \" \\\"cell_type\\\": \\\"markdown\\\",\\n\",\n",
|
|
" \" \\\"metadata\\\": {},\\n\",\n",
|
|
" \" \\\"source\\\": [\\n\",\n",
|
|
" \" \\\"# Model Unigram, Bigram, dan Trigram\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"**Nama:** Wildanul Jannah \\\\n\\\",\\n\",\n",
|
|
" \" \\\"**NIM:** 202210715061 \\\\n\\\",\\n\",\n",
|
|
" \" \\\"**Kelas:** F7B2 \\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"**Tujuan praktikum:** \\\\n\\\",\\n\",\n",
|
|
" \" \\\"Memahami cara menghitung frekuensi dan probabilitas kalimat menggunakan model Unigram, Bigram, dan Trigram pada teks bahasa Indonesia.\\\\n\\\"\\n\",\n",
|
|
" \" ]\\n\",\n",
|
|
" \" },\\n\",\n",
|
|
" \" {\\n\",\n",
|
|
" \" \\\"cell_type\\\": \\\"code\\\",\\n\",\n",
|
|
" \" \\\"execution_count\\\": null,\\n\",\n",
|
|
" \" \\\"metadata\\\": {},\\n\",\n",
|
|
" \" \\\"outputs\\\": [],\\n\",\n",
|
|
" \" \\\"source\\\": [\\n\",\n",
|
|
" \" \\\"from collections import Counter\\\\n\\\",\\n\",\n",
|
|
" \" \\\"from IPython.display import clear_output\\\\n\\\",\\n\",\n",
|
|
" \" \\\"import math\\\"\\n\",\n",
|
|
" \" ]\\n\",\n",
|
|
" \" },\\n\",\n",
|
|
" \" {\\n\",\n",
|
|
" \" \\\"cell_type\\\": \\\"code\\\",\\n\",\n",
|
|
" \" \\\"execution_count\\\": 11,\\n\",\n",
|
|
" \" \\\"metadata\\\": {},\\n\",\n",
|
|
" \" \\\"outputs\\\": [\\n\",\n",
|
|
" \" {\\n\",\n",
|
|
" \" \\\"name\\\": \\\"stdout\\\",\\n\",\n",
|
|
" \" \\\"output_type\\\": \\\"stream\\\",\\n\",\n",
|
|
" \" \\\"text\\\": [\\n\",\n",
|
|
" \" \\\"Corpus: Wilda suka olahraga lari dan suka olahraga badminton\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Tokens (8): ['Wilda', 'suka', 'olahraga', 'lari', 'dan', 'suka', 'olahraga', 'badminton']\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Frekuensi Unigram dalam kalimat:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('Wilda'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('suka'): 2\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('olahraga'): 2\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('lari'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('dan'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('badminton'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Total unigram dalam 1 kalimat: 8\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Probabilitas masing-masing unigram:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(Wilda) = 0.1250 (12.50%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(suka) = 0.2500 (25.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(olahraga) = 0.2500 (25.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(lari) = 0.1250 (12.50%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(dan) = 0.1250 (12.50%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(badminton) = 0.1250 (12.50%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Probabilitas Keseluruhan Kalimat (Model Unigram):\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(Wilda suka olahraga lari dan suka olahraga badminton) = P(Wilda)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(lari)=0.1250 x P(dan)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(badminton)=0.1250 = 0.00000095 (0.000095%)\\\\n\\\"\\n\",\n",
|
|
" \" ]\\n\",\n",
|
|
" \" }\\n\",\n",
|
|
" \" ],\\n\",\n",
|
|
" \" \\\"source\\\": [\\n\",\n",
|
|
" \" \\\"# ================= UNIGRAM =================\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# 1. Input Kalimat dan Tokenisasi\\\\n\\\",\\n\",\n",
|
|
" \" \\\"kalimat = input(\\\\\\\"Masukkan kalimat: \\\\\\\").strip()\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# Bersihkan output (khusus lingkungan notebook)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"try:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" clear_output()\\\\n\\\",\\n\",\n",
|
|
" \" \\\"except:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" pass\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(f\\\\\\\"Corpus: {kalimat}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# Tokenize\\\\n\\\",\\n\",\n",
|
|
" \" \\\"tokens = kalimat.lower().split()\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(f\\\\\\\"Tokens ({len(tokens)}): {tokens}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# 2. Hitung Frekuensi Unigram\\\\n\\\",\\n\",\n",
|
|
" \" \\\"unigram_counts = Counter(tokens)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"total_tokens = sum(unigram_counts.values())\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(\\\\\\\"\\\\\\\\nFrekuensi Unigram dalam kalimat:\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"for pair, count in unigram_counts.items():\\\\n\\\",\\n\",\n",
|
|
" \" \\\" print(f\\\\\\\" ('{pair}'): {count}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(f\\\\\\\"\\\\\\\\nTotal unigram dalam 1 kalimat: {total_tokens}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata\\\\n\\\",\\n\",\n",
|
|
" \" \\\"unigram_probabilities = {}\\\\n\\\",\\n\",\n",
|
|
" \" \\\"for word, count in unigram_counts.items():\\\\n\\\",\\n\",\n",
|
|
" \" \\\" prob = count / total_tokens\\\\n\\\",\\n\",\n",
|
|
" \" \\\" unigram_probabilities[word] = prob\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(\\\\\\\"\\\\\\\\nProbabilitas masing-masing unigram:\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"for word, prob in unigram_probabilities.items():\\\\n\\\",\\n\",\n",
|
|
" \" \\\" print(f\\\\\\\" P({word}) = {prob:.4f} ({prob*100:.2f}%)\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# 4. Probabilitas Kalimat Keseluruhan (P(w1) * P(w2) * ...)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"p_kalimat = 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\"prob_parts = []\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"for word in tokens:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" prob_value = unigram_probabilities[word]\\\\n\\\",\\n\",\n",
|
|
" \" \\\" p_kalimat *= prob_value\\\\n\\\",\\n\",\n",
|
|
" \" \\\" prob_parts.append(f\\\\\\\"P({word})={prob_value:.4f}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"prob_str = \\\\\\\" x \\\\\\\".join(prob_parts)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(\\\\\\\"\\\\\\\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(f\\\\\\\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\\\\\\\")\\\\n\\\"\\n\",\n",
|
|
" \" ]\\n\",\n",
|
|
" \" },\\n\",\n",
|
|
" \" {\\n\",\n",
|
|
" \" \\\"cell_type\\\": \\\"code\\\",\\n\",\n",
|
|
" \" \\\"execution_count\\\": 12,\\n\",\n",
|
|
" \" \\\"metadata\\\": {},\\n\",\n",
|
|
" \" \\\"outputs\\\": [\\n\",\n",
|
|
" \" {\\n\",\n",
|
|
" \" \\\"name\\\": \\\"stdout\\\",\\n\",\n",
|
|
" \" \\\"output_type\\\": \\\"stream\\\",\\n\",\n",
|
|
" \" \\\"text\\\": [\\n\",\n",
|
|
" \" \\\"Corpus: Wilda sedang belajar model bigram untuk menghitung probabilitas kalimat dan Wilda sangat suka belajar\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Tokens (14): ['Wilda', 'sedang', 'belajar', 'model', 'bigram', 'untuk', 'menghitung', 'probabilitas', 'kalimat', 'dan', 'Wilda', 'sangat', 'suka', 'belajar']\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Frekuensi Bigram dalam kalimat:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('Wilda', 'sedang'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('sedang', 'belajar'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('belajar', 'model'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('model', 'bigram'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('bigram', 'untuk'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('untuk', 'menghitung'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('menghitung', 'probabilitas'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('probabilitas', 'kalimat'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('kalimat', 'dan'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('dan', 'Wilda'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('Wilda', 'sangat'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('sangat', 'suka'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('suka', 'belajar'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Total bigram dalam 1 kalimat: 13\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Probabilitas masing-masing bigram:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(sedang|Wilda) = 0.5000 (50.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(belajar|sedang) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(model|belajar) = 0.5000 (50.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(bigram|model) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(untuk|bigram) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(menghitung|untuk) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(probabilitas|menghitung) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(kalimat|probabilitas) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(dan|kalimat) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(Wilda|dan) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(sangat|Wilda) = 0.5000 (50.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(suka|sangat) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(belajar|suka) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Probabilitas Keseluruhan Kalimat (Model Bigram):\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(Wilda sedang belajar model bigram untuk menghitung probabilitas kalimat dan Wilda sangat suka belajar) = P(Wilda)=0.1429 x P(sedang|Wilda)=0.5000 x P(belajar|sedang)=1.0000 x P(model|belajar)=0.5000 x P(bigram|model)=1.0000 x P(untuk|bigram)=1.0000 x P(menghitung|untuk)=1.0000 x P(probabilitas|menghitung)=1.0000 x P(kalimat|probabilitas)=1.0000 x P(dan|kalimat)=1.0000 x P(Wilda|dan)=1.0000 x P(sangat|Wilda)=0.5000 x P(suka|sangat)=1.0000 x P(belajar|suka)=1.0000 = 0.01785714 (1.785714%)\\\\n\\\"\\n\",\n",
|
|
" \" ]\\n\",\n",
|
|
" \" }\\n\",\n",
|
|
" \" ],\\n\",\n",
|
|
" \" \\\"source\\\": [\\n\",\n",
|
|
" \" \\\"# ================= BIGRAM =================\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"kalimat = input(\\\\\\\"Masukkan kalimat: \\\\\\\").strip()\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"try:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" clear_output()\\\\n\\\",\\n\",\n",
|
|
" \" \\\"except:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" pass\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(f\\\\\\\"Corpus: {kalimat}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# Tokenisasi\\\\n\\\",\\n\",\n",
|
|
" \" \\\"tokens = kalimat.lower().split()\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(f\\\\\\\"Tokens ({len(tokens)}): {tokens}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# 2. Frekuensi Unigram dan Bigram\\\\n\\\",\\n\",\n",
|
|
" \" \\\"unigram_counts = Counter(tokens)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\\\\n\\\",\\n\",\n",
|
|
" \" \\\"bigram_counts = Counter(bigrams)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(\\\\\\\"\\\\\\\\nFrekuensi Bigram dalam kalimat:\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"for pair, count in bigram_counts.items():\\\\n\\\",\\n\",\n",
|
|
" \" \\\" print(f\\\\\\\" {pair}: {count}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(f\\\\\\\"\\\\\\\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# 3. Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"bigram_probabilities = {}\\\\n\\\",\\n\",\n",
|
|
" \" \\\"for (w1, w2), count in bigram_counts.items():\\\\n\\\",\\n\",\n",
|
|
" \" \\\" prob = count / unigram_counts[w1]\\\\n\\\",\\n\",\n",
|
|
" \" \\\" bigram_probabilities[(w1, w2)] = prob\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(\\\\\\\"\\\\\\\\nProbabilitas masing-masing bigram:\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"for (w1, w2), prob in bigram_probabilities.items():\\\\n\\\",\\n\",\n",
|
|
" \" \\\" print(f\\\\\\\" P({w2}|{w1}) = {prob:.4f} ({prob*100:.2f}%)\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# 4. Probabilitas Kalimat (Model Bigram)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"total_tokens = sum(unigram_counts.values())\\\\n\\\",\\n\",\n",
|
|
" \" \\\"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens\\\\n\\\",\\n\",\n",
|
|
" \" \\\"p_kalimat = p_w1\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"prob_str_parts = [f\\\\\\\"P({tokens[0]})={p_w1:.4f}\\\\\\\"]\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"for i in range(1, len(tokens)):\\\\n\\\",\\n\",\n",
|
|
" \" \\\" pair = (tokens[i-1], tokens[i])\\\\n\\\",\\n\",\n",
|
|
" \" \\\" p = bigram_probabilities.get(pair, 0)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" p_kalimat *= p\\\\n\\\",\\n\",\n",
|
|
" \" \\\" prob_str_parts.append(f\\\\\\\"P({pair[1]}|{pair[0]})={p:.4f}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"prob_str = \\\\\\\" x \\\\\\\".join(prob_str_parts)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(\\\\\\\"\\\\\\\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(f\\\\\\\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\\\\\\\")\\\\n\\\"\\n\",\n",
|
|
" \" ]\\n\",\n",
|
|
" \" },\\n\",\n",
|
|
" \" {\\n\",\n",
|
|
" \" \\\"cell_type\\\": \\\"code\\\",\\n\",\n",
|
|
" \" \\\"execution_count\\\": 13,\\n\",\n",
|
|
" \" \\\"metadata\\\": {},\\n\",\n",
|
|
" \" \\\"outputs\\\": [\\n\",\n",
|
|
" \" {\\n\",\n",
|
|
" \" \\\"name\\\": \\\"stdout\\\",\\n\",\n",
|
|
" \" \\\"output_type\\\": \\\"stream\\\",\\n\",\n",
|
|
" \" \\\"text\\\": [\\n\",\n",
|
|
" \" \\\"Corpus: Pada praktikum ini Wilda sedang mempelajari model trigram untuk kalimat bahasa Indonesia\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Tokens (12): ['pada', 'praktikum', 'ini', 'Wilda', 'sedang', 'mempelajari', 'model', 'trigram', 'untuk', 'kalimat', 'bahasa', 'indonesia']\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Frekuensi Trigram dalam kalimat:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('pada', 'praktikum', 'ini'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('praktikum', 'ini', 'Wilda'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('ini', 'Wilda', 'sedang'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('Wilda', 'sedang', 'mempelajari'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('sedang', 'mempelajari', 'model'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('mempelajari', 'model', 'trigram'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('model', 'trigram', 'untuk'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('trigram', 'untuk', 'kalimat'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('untuk', 'kalimat', 'bahasa'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\" ('kalimat', 'bahasa', 'indonesia'): 1\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Total trigram dalam 1 kalimat: 10\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Probabilitas masing-masing trigram:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(ini|pada,praktikum) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(Wilda|praktikum,ini) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(sedang|ini,Wilda) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(mempelajari|Wilda,sedang) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(model|sedang,mempelajari) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(trigram|mempelajari,model) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(untuk|model,trigram) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(kalimat|trigram,untuk) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(bahasa|untuk,kalimat) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(indonesia|kalimat,bahasa) = 1.0000 (100.00%)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"Probabilitas Keseluruhan Kalimat (Model Trigram):\\\\n\\\",\\n\",\n",
|
|
" \" \\\" P(pada praktikum ini Wilda sedang mempelajari model trigram untuk kalimat bahasa indonesia) = P(pada)=0.0833 x P(praktikum|pada)=1.0000 x P(ini|pada,praktikum)=1.0000 x P(Wilda|praktikum,ini)=1.0000 x P(sedang|ini,Wilda)=1.0000 x P(mempelajari|Wilda,sedang)=1.0000 x P(model|sedang,mempelajari)=1.0000 x P(trigram|mempelajari,model)=1.0000 x P(untuk|model,trigram)=1.0000 x P(kalimat|trigram,untuk)=1.0000 x P(bahasa|untuk,kalimat)=1.0000 x P(indonesia|kalimat,bahasa)=1.0000 = 0.08333333 (8.333333%)\\\\n\\\"\\n\",\n",
|
|
" \" ]\\n\",\n",
|
|
" \" }\\n\",\n",
|
|
" \" ],\\n\",\n",
|
|
" \" \\\"source\\\": [\\n\",\n",
|
|
" \" \\\"# ================= TRIGRAM =================\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"kalimat = input(\\\\\\\"Masukkan kalimat: \\\\\\\").strip()\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"try:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" clear_output()\\\\n\\\",\\n\",\n",
|
|
" \" \\\"except:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" pass\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(f\\\\\\\"Corpus: {kalimat}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# Tokenisasi\\\\n\\\",\\n\",\n",
|
|
" \" \\\"tokens = kalimat.lower().split()\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(f\\\\\\\"Tokens ({len(tokens)}): {tokens}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# 2. Frekuensi Bigram dan Trigram\\\\n\\\",\\n\",\n",
|
|
" \" \\\"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\\\\n\\\",\\n\",\n",
|
|
" \" \\\"trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"bigram_counts = Counter(bigrams)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"trigram_counts = Counter(trigrams)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(\\\\\\\"\\\\\\\\nFrekuensi Trigram dalam kalimat:\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"for tg, count in trigram_counts.items():\\\\n\\\",\\n\",\n",
|
|
" \" \\\" print(f\\\\\\\" {tg}: {count}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(f\\\\\\\"\\\\\\\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# 3. Probabilitas Trigram: P(w3 | w1, w2)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"trigram_probabilities = {}\\\\n\\\",\\n\",\n",
|
|
" \" \\\"for (w1, w2, w3), count in trigram_counts.items():\\\\n\\\",\\n\",\n",
|
|
" \" \\\" if bigram_counts[(w1, w2)] > 0:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" prob = count / bigram_counts[(w1, w2)]\\\\n\\\",\\n\",\n",
|
|
" \" \\\" else:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" prob = 0\\\\n\\\",\\n\",\n",
|
|
" \" \\\" trigram_probabilities[(w1, w2, w3)] = prob\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(\\\\\\\"\\\\\\\\nProbabilitas masing-masing trigram:\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"for (w1, w2, w3), prob in trigram_probabilities.items():\\\\n\\\",\\n\",\n",
|
|
" \" \\\" print(f\\\\\\\" P({w3}|{w1},{w2}) = {prob:.4f} ({prob*100:.2f}%)\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# 4. Probabilitas Kalimat (Model Trigram)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"unigram_counts = Counter(tokens)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"total_tokens = sum(unigram_counts.values())\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# P(w1)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# P(w2|w1)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"if len(tokens) > 1:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" count_w1 = unigram_counts.get(tokens[0], 1)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1\\\\n\\\",\\n\",\n",
|
|
" \" \\\"else:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" p_w2_w1 = 1.0\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"p_kalimat = p_w1 * p_w2_w1\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"prob_str_parts = [f\\\\\\\"P({tokens[0]})={p_w1:.4f}\\\\\\\"]\\\\n\\\",\\n\",\n",
|
|
" \" \\\"if len(tokens) > 1:\\\\n\\\",\\n\",\n",
|
|
" \" \\\" prob_str_parts.append(f\\\\\\\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.4f}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"# Perkalian trigram mulai dari kata ke-3 (indeks loop i mulai dari 0)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"for i in range(len(tokens) - 2):\\\\n\\\",\\n\",\n",
|
|
" \" \\\" triplet = (tokens[i], tokens[i+1], tokens[i+2])\\\\n\\\",\\n\",\n",
|
|
" \" \\\" p = trigram_probabilities.get(triplet, 0)\\\\n\\\",\\n\",\n",
|
|
" \" \\\" p_kalimat *= p\\\\n\\\",\\n\",\n",
|
|
" \" \\\" prob_str_parts.append(f\\\\\\\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.4f}\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"prob_str = \\\\\\\" x \\\\\\\".join(prob_str_parts)\\\\n\\\",\\n\",\n",
|
|
" \" \\\"\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(\\\\\\\"\\\\\\\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\\\\\\\")\\\\n\\\",\\n\",\n",
|
|
" \" \\\"print(f\\\\\\\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\\\\\\\")\\\\n\\\"\\n\",\n",
|
|
" \" ]\\n\",\n",
|
|
" \" }\\n\",\n",
|
|
" \" ],\\n\",\n",
|
|
" \" \\\"metadata\\\": {\\n\",\n",
|
|
" \" \\\"colab\\\": {\\n\",\n",
|
|
" \" \\\"provenance\\\": []\\n\",\n",
|
|
" \" },\\n\",\n",
|
|
" \" \\\"kernelspec\\\": {\\n\",\n",
|
|
" \" \\\"display_name\\\": \\\"Python 3 (ipykernel)\\\",\\n\",\n",
|
|
" \" \\\"language\\\": \\\"python\\\",\\n\",\n",
|
|
" \" \\\"name\\\": \\\"python3\\\"\\n\",\n",
|
|
" \" },\\n\",\n",
|
|
" \" \\\"language_info\\\": {\\n\",\n",
|
|
" \" \\\"codemirror_mode\\\": {\\n\",\n",
|
|
" \" \\\"name\\\": \\\"ipython\\\",\\n\",\n",
|
|
" \" \\\"version\\\": 3\\n\",\n",
|
|
" \" },\\n\",\n",
|
|
" \" \\\"file_extension\\\": \\\".py\\\",\\n\",\n",
|
|
" \" \\\"mimetype\\\": \\\"text/x-python\\\",\\n\",\n",
|
|
" \" \\\"name\\\": \\\"python\\\",\\n\",\n",
|
|
" \" \\\"nbconvert_exporter\\\": \\\"python\\\",\\n\",\n",
|
|
" \" \\\"pygments_lexer\\\": \\\"ipython3\\\",\\n\",\n",
|
|
" \" \\\"version\\\": \\\"3.13.5\\\"\\n\",\n",
|
|
" \" }\\n\",\n",
|
|
" \" },\\n\",\n",
|
|
" \" \\\"nbformat\\\": 4,\\n\",\n",
|
|
" \" \\\"nbformat_minor\\\": 4\\n\",\n",
|
|
" \"}\\n\"\n",
|
|
" ]\n",
|
|
" }\n",
|
|
" ]\n",
|
|
"}"
|
|
]
|
|
}
|
|
]
|
|
} |