# 210 lines
# 6.1 KiB
# Plaintext

from collections import Counter
from IPython.display import clear_output
import math
# Unigram language-model demo.
#
# Reads one sentence, estimates unigram probabilities from that sentence alone
# (plain relative frequencies, no smoothing) and prints the counts, the
# per-word probabilities and the probability of the whole sentence.


def unigram_model(tokens):
    """Estimate unigram probabilities from a token list.

    Args:
        tokens: Iterable of hashable word tokens.

    Returns:
        A ``(unigram_counts, unigram_probabilities)`` tuple.
        ``unigram_counts`` is a ``Counter`` of token frequencies and
        ``unigram_probabilities`` maps each word to
        ``Count(word) / total number of tokens``. Both are empty when
        ``tokens`` is empty.
    """
    unigram_counts = Counter(tokens)
    total_tokens = sum(unigram_counts.values())
    # The comprehension body never runs for an empty Counter, so an empty
    # token list cannot trigger a division by zero here.
    unigram_probabilities = {
        word: count / total_tokens for word, count in unigram_counts.items()
    }
    return unigram_counts, unigram_probabilities


def unigram_sentence_probability(tokens, unigram_probabilities):
    """Return P(w1) * P(w2) * ... for ``tokens`` under a unigram model.

    Args:
        tokens: Sequence of word tokens making up the sentence.
        unigram_probabilities: Mapping from word to its probability.

    Returns:
        The product of the probabilities of every token in order.

    Raises:
        ValueError: If ``tokens`` is empty (there is no sentence to score).
        KeyError: If a token is missing from ``unigram_probabilities``.
    """
    if not tokens:
        raise ValueError("tokens must not be empty")
    probability = 1
    for word in tokens:
        probability *= unigram_probabilities[word]
    return probability


# The interactive part is guarded so the helpers above can be imported without
# prompting. Notebook cells and scripts run with __name__ == "__main__", so
# they still execute it.
if __name__ == "__main__":
    # 1. Read the sentence and tokenize it.
    kalimat = input("Masukkan kalimat: ").strip()

    # Clear the prompt output (only has an effect inside a notebook).
    try:
        clear_output()
    except Exception:
        # Best effort only: failing to clear the display must not abort the
        # demo, but Ctrl-C / SystemExit are no longer swallowed.
        pass

    print(f"Corpus: {kalimat}")

    # Lower-case and split on whitespace.
    tokens = kalimat.lower().split()
    print(f"Tokens ({len(tokens)}): {tokens}")

    if not tokens:
        # Empty or whitespace-only input: nothing to count or score.
        print("\nKalimat kosong: tidak ada token untuk dihitung.")
    else:
        # 2. Unigram frequencies.
        unigram_counts, unigram_probabilities = unigram_model(tokens)
        total_tokens = sum(unigram_counts.values())

        print("\nFrekuensi Unigram dalam kalimat")
        for word, count in unigram_counts.items():
            print(f" ('{word}'): {count}")
        print(f"\nTotal unigram dalam 1 kalimat: {total_tokens}")

        # 3. Unigram probabilities: P(wi) = Count(wi) / total tokens.
        print("\nProbabilitas masing-masing unigram:")
        for word, prob in unigram_probabilities.items():
            print(f" P({word}) = {prob:.2f} ({prob*100:.2f}%)")

        # 4. Sentence probability: P(sentence) = P(w1) * P(w2) * ...
        p_kalimat = unigram_sentence_probability(tokens, unigram_probabilities)

        # One "P(word)=value" term per token, joined into the displayed formula.
        prob_parts = [
            f"P({word})={unigram_probabilities[word]:.2f}" for word in tokens
        ]
        prob_str = " x ".join(prob_parts)

        print("\nProbabilitas Keseluruhan Kalimat (Model Unigram):")
        print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)")
from collections import Counter
from IPython.display import clear_output
import math
# Bigram language-model demo.
#
# Reads one sentence, estimates bigram probabilities from that sentence alone
# (plain relative frequencies, no smoothing) and prints the bigram counts, the
# conditional probabilities and the probability of the whole sentence.


def bigram_model(tokens):
    """Estimate bigram probabilities from a token list.

    Args:
        tokens: Sequence (e.g. list) of hashable word tokens.

    Returns:
        A ``(unigram_counts, bigram_counts, bigram_probabilities)`` tuple.
        The first two are ``Counter`` objects; ``bigram_probabilities`` maps
        each adjacent pair ``(w1, w2)`` to ``Count(w1, w2) / Count(w1)``.
        With fewer than two tokens the bigram structures are empty.
    """
    unigram_counts = Counter(tokens)
    # zip pairs every token with its successor; it stops at the shorter input,
    # so zero or one token simply yields no bigrams.
    bigram_counts = Counter(zip(tokens, tokens[1:]))
    bigram_probabilities = {
        (w1, w2): count / unigram_counts[w1]
        for (w1, w2), count in bigram_counts.items()
    }
    return unigram_counts, bigram_counts, bigram_probabilities


def bigram_sentence_probability(tokens, unigram_counts, bigram_probabilities):
    """Score ``tokens`` as P(w1) * P(w2|w1) * P(w3|w2) * ...

    Args:
        tokens: Non-empty sequence of word tokens.
        unigram_counts: Mapping from word to its frequency; its values must
            sum to a positive number.
        bigram_probabilities: Mapping from ``(w1, w2)`` to ``P(w2 | w1)``.
            A pair that is missing contributes probability 0.

    Returns:
        A ``(p_first, p_sentence)`` tuple: the unigram probability of the
        first token and the probability of the whole sentence.

    Raises:
        ValueError: If ``tokens`` is empty (there is no first word to score).
    """
    if not tokens:
        raise ValueError("tokens must not be empty")
    total_tokens = sum(unigram_counts.values())
    p_first = unigram_counts.get(tokens[0], 0) / total_tokens
    probability = p_first
    for pair in zip(tokens, tokens[1:]):
        probability *= bigram_probabilities.get(pair, 0)
    return p_first, probability


# The interactive part is guarded so the helpers above can be imported without
# prompting. Notebook cells and scripts run with __name__ == "__main__", so
# they still execute it.
if __name__ == "__main__":
    # 1. Read the sentence and tokenize it.
    kalimat = input("Masukkan kalimat: ").strip()

    # Clear the prompt output (only has an effect inside a notebook).
    try:
        clear_output()
    except Exception:
        # Best effort only: failing to clear the display must not abort the
        # demo, but Ctrl-C / SystemExit are no longer swallowed.
        pass

    print(f"Corpus: {kalimat}")

    # Lower-case and split on whitespace.
    tokens = kalimat.lower().split()
    print(f"Tokens ({len(tokens)}): {tokens}")

    if not tokens:
        # Empty or whitespace-only input: indexing tokens[0] below would fail.
        print("\nKalimat kosong: tidak ada token untuk dihitung.")
    else:
        # 2. Unigram and bigram frequencies.
        bigrams = list(zip(tokens, tokens[1:]))
        unigram_counts, bigram_counts, bigram_probabilities = bigram_model(tokens)

        print("\nFrekuensi Bigram dalam kalimat:")
        for pair, count in bigram_counts.items():
            print(f" {pair}: {count}")
        print(f"\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}")

        # 3. Bigram probabilities: P(w2 | w1) = Count(w1, w2) / Count(w1).
        print("\nProbabilitas masing-masing bigram:")
        for (w1, w2), prob in bigram_probabilities.items():
            print(f" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)")

        # 4. Sentence probability (bigram model):
        #    P(sentence) = P(w1) * P(w2|w1) * P(w3|w2) * ...
        p_w1, p_kalimat = bigram_sentence_probability(
            tokens, unigram_counts, bigram_probabilities
        )

        # Displayed formula: P(w1) first, then one conditional term per bigram.
        prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]
        prob_str_parts.extend(
            f"P({w2}|{w1})={bigram_probabilities[(w1, w2)]:.2f}"
            for w1, w2 in bigrams
        )
        prob_str = " x ".join(prob_str_parts)

        print("\nProbabilitas Keseluruhan Kalimat (Model Bigram):")
        print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")
from collections import Counter
from IPython.display import clear_output
import math
# Trigram language-model demo.
#
# Reads one sentence, estimates trigram probabilities from that sentence alone
# (plain relative frequencies, no smoothing) and prints the trigram counts, the
# conditional probabilities and the probability of the whole sentence.


def trigram_model(tokens):
    """Estimate trigram probabilities from a token list.

    Args:
        tokens: Sequence (e.g. list) of hashable word tokens.

    Returns:
        A ``(unigram_counts, bigram_counts, trigram_counts,
        trigram_probabilities)`` tuple. The first three are ``Counter``
        objects; ``trigram_probabilities`` maps each ``(w1, w2, w3)`` to
        ``Count(w1, w2, w3) / Count(w1, w2)``. With fewer than three tokens
        the trigram structures are empty.
    """
    unigram_counts = Counter(tokens)
    bigram_counts = Counter(zip(tokens, tokens[1:]))
    trigram_counts = Counter(zip(tokens, tokens[1:], tokens[2:]))
    # Every trigram (w1, w2, w3) starts with the bigram (w1, w2), which was
    # therefore counted at least once: the divisor can never be zero.
    trigram_probabilities = {
        (w1, w2, w3): count / bigram_counts[(w1, w2)]
        for (w1, w2, w3), count in trigram_counts.items()
    }
    return unigram_counts, bigram_counts, trigram_counts, trigram_probabilities


def trigram_sentence_probability(
    tokens, unigram_counts, bigram_counts, trigram_probabilities
):
    """Score ``tokens`` as P(w1) * P(w2|w1) * P(w3|w1,w2) * P(w4|w2,w3) * ...

    Args:
        tokens: Non-empty sequence of word tokens.
        unigram_counts: Mapping from word to its frequency; its values must
            sum to a positive number.
        bigram_counts: Mapping from ``(w1, w2)`` to its frequency.
        trigram_probabilities: Mapping from ``(w1, w2, w3)`` to
            ``P(w3 | w1, w2)``. A missing triple contributes probability 0.

    Returns:
        A ``(p_first, p_second_given_first, p_sentence)`` tuple.
        ``p_second_given_first`` is ``1.0`` when the sentence has one token.

    Raises:
        ValueError: If ``tokens`` is empty (there is no first word to score).
    """
    if not tokens:
        raise ValueError("tokens must not be empty")

    # a. P(w1) from unigram relative frequency.
    total_tokens = sum(unigram_counts.values())
    p_first = unigram_counts.get(tokens[0], 0) / total_tokens

    # b. P(w2 | w1) from the bigram estimate (no smoothing).
    if len(tokens) > 1:
        # Default of 1 keeps the division defined if w1 is absent from the
        # supplied counts.
        count_first = unigram_counts.get(tokens[0], 1)
        p_second = bigram_counts.get((tokens[0], tokens[1]), 0) / count_first
    else:
        p_second = 1.0  # single-word sentence: no second factor

    # c. Multiply in P(wi | wi-2, wi-1) for every token from the third onward.
    probability = p_first * p_second
    for triplet in zip(tokens, tokens[1:], tokens[2:]):
        probability *= trigram_probabilities.get(triplet, 0)
    return p_first, p_second, probability


# The interactive part is guarded so the helpers above can be imported without
# prompting. Notebook cells and scripts run with __name__ == "__main__", so
# they still execute it.
if __name__ == "__main__":
    # 1. Read the sentence and tokenize it.
    kalimat = input("Masukkan kalimat: ").strip()

    # Clear the prompt output (only has an effect inside a notebook).
    try:
        clear_output()
    except Exception:
        # Best effort only: failing to clear the display must not abort the
        # demo, but Ctrl-C / SystemExit are no longer swallowed.
        pass

    print(f"Corpus: {kalimat}")

    # Lower-case and split on whitespace.
    tokens = kalimat.lower().split()
    print(f"Tokens ({len(tokens)}): {tokens}")

    if not tokens:
        # Empty or whitespace-only input: indexing tokens[0] below would fail.
        print("\nKalimat kosong: tidak ada token untuk dihitung.")
    else:
        # 2. Unigram, bigram and trigram frequencies.
        trigrams = list(zip(tokens, tokens[1:], tokens[2:]))
        (
            unigram_counts,
            bigram_counts,
            trigram_counts,
            trigram_probabilities,
        ) = trigram_model(tokens)

        print("\nFrekuensi Trigram dalam kalimat:")
        for tg, count in trigram_counts.items():
            print(f" {tg}: {count}")
        print(f"\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}")

        # 3. Trigram probabilities:
        #    P(w3 | w1, w2) = Count(w1, w2, w3) / Count(w1, w2).
        print("\nProbabilitas masing-masing trigram:")
        for (w1, w2, w3), prob in trigram_probabilities.items():
            print(f" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)")

        # 4. Sentence probability (trigram model):
        #    P(sentence) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...
        p_w1, p_w2_w1, p_kalimat = trigram_sentence_probability(
            tokens, unigram_counts, bigram_counts, trigram_probabilities
        )

        # Displayed formula: P(w1), then P(w2|w1) if there is a second word,
        # then one conditional term per trigram.
        prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]
        if len(tokens) > 1:
            prob_str_parts.append(f"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}")
        prob_str_parts.extend(
            f"P({w3}|{w1},{w2})={trigram_probabilities[(w1, w2, w3)]:.2f}"
            for w1, w2, w3 in trigrams
        )
        prob_str = " x ".join(prob_str_parts)

        print("\nProbabilitas Keseluruhan Kalimat (Model Trigram):")
        print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")