210 lines
6.1 KiB
Plaintext
210 lines
6.1 KiB
Plaintext
|
|
|
|
|
|
|
|
|
|
|
|
from collections import Counter
|
|
from IPython.display import clear_output
|
|
import math
|
|
|
|
def unigram_sentence_probability(tokens):
    """Score a token list with a unigram model estimated from that same list.

    Args:
        tokens: List of word strings (already lower-cased and split).

    Returns:
        A ``(probabilities, sentence_probability)`` tuple.
        ``probabilities`` maps each distinct word to
        Count(word) / len(tokens); ``sentence_probability`` is the product
        of P(word) over every token, in order. An empty token list yields
        ``({}, 0.0)`` instead of the misleading empty product 1.
    """
    if not tokens:
        return {}, 0.0

    total = len(tokens)
    probabilities = {
        word: count / total for word, count in Counter(tokens).items()
    }

    sentence_probability = 1.0
    for word in tokens:
        sentence_probability *= probabilities[word]
    return probabilities, sentence_probability


if __name__ == "__main__":
    # 1. Read the sentence and tokenise it.
    kalimat = input("Masukkan kalimat: ").strip()

    # Clear earlier output (notebook environments only). This is best-effort:
    # any ordinary failure is ignored, but KeyboardInterrupt / SystemExit are
    # no longer swallowed as they were with a bare ``except``.
    try:
        clear_output()
    except Exception:
        pass

    print(f"Corpus: {kalimat}")

    # Tokenise: lower-case, then split on whitespace.
    tokens = kalimat.lower().split()
    print(f"Tokens ({len(tokens)}): {tokens}")

    # 2. Unigram frequencies.
    unigram_counts = Counter(tokens)
    total_tokens = sum(unigram_counts.values())

    print("\nFrekuensi Unigram dalam kalimat")
    for word, count in unigram_counts.items():
        print(f" ('{word}'): {count}")
    print(f"\nTotal unigram dalam 1 kalimat: {total_tokens}")

    # 3. Unigram probabilities, P(wi) = Count(wi) / total words, and
    # 4. the sentence probability, P(sentence) = P(w1) * P(w2) * ...
    unigram_probabilities, p_kalimat = unigram_sentence_probability(tokens)

    print("\nProbabilitas masing-masing unigram:")
    for word, prob in unigram_probabilities.items():
        print(f" P({word}) = {prob:.2f} ({prob*100:.2f}%)")

    # Human-readable expansion of the product, one factor per token.
    prob_parts = [
        f"P({word})={unigram_probabilities[word]:.2f}" for word in tokens
    ]
    prob_str = " x ".join(prob_parts)

    print("\nProbabilitas Keseluruhan Kalimat (Model Unigram):")
    if tokens:
        print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)")
    else:
        # Nothing to score: say so rather than print an empty formula.
        print(" Kalimat kosong: tidak ada token untuk dihitung.")
|
|
|
|
|
|
|
|
|
|
|
|
from collections import Counter
|
|
from IPython.display import clear_output
|
|
import math
|
|
|
|
def bigram_sentence_probability(tokens):
    """Score a token list with a bigram model estimated from that same list.

    Args:
        tokens: List of word strings (already lower-cased and split).

    Returns:
        A ``(bigram_probabilities, p_w1, sentence_probability)`` tuple.
        ``bigram_probabilities`` maps each distinct ``(w1, w2)`` pair to
        Count(w1, w2) / Count(w1). ``p_w1`` is the unigram probability of
        the first token. ``sentence_probability`` is
        P(w1) * P(w2|w1) * P(w3|w2) * ... with no smoothing.
        An empty token list yields ``({}, 0.0, 0.0)`` instead of raising
        ``IndexError`` on ``tokens[0]``.
    """
    if not tokens:
        return {}, 0.0, 0.0

    unigram_counts = Counter(tokens)
    pairs = list(zip(tokens, tokens[1:]))
    bigram_probabilities = {
        (w1, w2): count / unigram_counts[w1]
        for (w1, w2), count in Counter(pairs).items()
    }

    p_w1 = unigram_counts[tokens[0]] / len(tokens)
    sentence_probability = p_w1
    for pair in pairs:
        sentence_probability *= bigram_probabilities[pair]
    return bigram_probabilities, p_w1, sentence_probability


if __name__ == "__main__":
    # 1. Read the sentence and tokenise it.
    kalimat = input("Masukkan kalimat: ").strip()

    # Clear earlier output (notebook environments only). This is best-effort:
    # any ordinary failure is ignored, but KeyboardInterrupt / SystemExit are
    # no longer swallowed as they were with a bare ``except``.
    try:
        clear_output()
    except Exception:
        pass

    print(f"Corpus: {kalimat}")

    # Tokenise: lower-case, then split on whitespace.
    tokens = kalimat.lower().split()
    print(f"Tokens ({len(tokens)}): {tokens}")

    # 2. Unigram and bigram frequencies.
    unigram_counts = Counter(tokens)
    bigrams = list(zip(tokens, tokens[1:]))
    bigram_counts = Counter(bigrams)

    print("\nFrekuensi Bigram dalam kalimat:")
    for pair, count in bigram_counts.items():
        print(f" {pair}: {count}")
    print(f"\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}")

    # 3. Bigram probabilities, P(w2 | w1) = Count(w1, w2) / Count(w1), and
    # 4. the sentence probability, P(w1) * P(w2|w1) * P(w3|w2) * ...
    total_tokens = sum(unigram_counts.values())
    bigram_probabilities, p_w1, p_kalimat = bigram_sentence_probability(tokens)

    print("\nProbabilitas masing-masing bigram:")
    for (w1, w2), prob in bigram_probabilities.items():
        print(f" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)")

    print("\nProbabilitas Keseluruhan Kalimat (Model Bigram):")
    if tokens:
        # Human-readable expansion: P(w1) first, then one factor per bigram.
        prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]
        for pair in bigrams:
            p = bigram_probabilities.get(pair, 0)
            prob_str_parts.append(f"P({pair[1]}|{pair[0]})={p:.2f}")
        prob_str = " x ".join(prob_str_parts)
        print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")
    else:
        # Nothing to score: say so rather than crash on tokens[0].
        prob_str_parts = []
        prob_str = ""
        print(" Kalimat kosong: tidak ada token untuk dihitung.")
|
|
|
|
|
|
|
|
|
|
|
|
from collections import Counter
|
|
from IPython.display import clear_output
|
|
import math
|
|
|
|
def trigram_sentence_probability(tokens):
    """Score a token list with a trigram model estimated from that same list.

    Args:
        tokens: List of word strings (already lower-cased and split).

    Returns:
        A ``(trigram_probabilities, p_w1, p_w2_w1, sentence_probability)``
        tuple. ``trigram_probabilities`` maps each distinct
        ``(w1, w2, w3)`` to Count(w1, w2, w3) / Count(w1, w2). ``p_w1`` is
        the unigram probability of the first token. ``p_w2_w1`` is the
        unsmoothed bigram probability of the second token given the first,
        or 1.0 when there are fewer than two tokens.
        ``sentence_probability`` is
        P(w1) * P(w2|w1) * P(w3|w1,w2) * ... over every trigram in order.
        An empty token list yields ``({}, 0.0, 1.0, 0.0)`` instead of
        raising ``IndexError`` on ``tokens[0]``.
    """
    if not tokens:
        return {}, 0.0, 1.0, 0.0

    unigram_counts = Counter(tokens)
    bigram_counts = Counter(zip(tokens, tokens[1:]))
    triplets = list(zip(tokens, tokens[1:], tokens[2:]))

    # Every trigram's (w1, w2) prefix is itself a counted bigram, so the
    # denominator is always >= 1 and needs no zero guard.
    trigram_probabilities = {
        (w1, w2, w3): count / bigram_counts[(w1, w2)]
        for (w1, w2, w3), count in Counter(triplets).items()
    }

    first = tokens[0]
    p_w1 = unigram_counts[first] / len(tokens)
    if len(tokens) > 1:
        p_w2_w1 = bigram_counts[(first, tokens[1])] / unigram_counts[first]
    else:
        p_w2_w1 = 1.0  # single-word sentence: no bigram factor

    sentence_probability = p_w1 * p_w2_w1
    for triplet in triplets:
        sentence_probability *= trigram_probabilities[triplet]
    return trigram_probabilities, p_w1, p_w2_w1, sentence_probability


if __name__ == "__main__":
    # 1. Read the sentence and tokenise it.
    kalimat = input("Masukkan kalimat: ").strip()

    # Clear earlier output (notebook environments only). This is best-effort:
    # any ordinary failure is ignored, but KeyboardInterrupt / SystemExit are
    # no longer swallowed as they were with a bare ``except``.
    try:
        clear_output()
    except Exception:
        pass

    print(f"Corpus: {kalimat}")

    # Tokenise: lower-case, then split on whitespace.
    tokens = kalimat.lower().split()
    print(f"Tokens ({len(tokens)}): {tokens}")

    # 2. Bigram and trigram frequencies.
    bigrams = list(zip(tokens, tokens[1:]))
    trigrams = list(zip(tokens, tokens[1:], tokens[2:]))

    bigram_counts = Counter(bigrams)
    trigram_counts = Counter(trigrams)

    print("\nFrekuensi Trigram dalam kalimat:")
    for tg, count in trigram_counts.items():
        print(f" {tg}: {count}")
    print(f"\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}")

    # Unigram counts are needed for P(w1) and P(w2|w1).
    unigram_counts = Counter(tokens)
    total_tokens = sum(unigram_counts.values())

    # 3. Trigram probabilities,
    #    P(w3 | w1, w2) = Count(w1, w2, w3) / Count(w1, w2), and
    # 4. the sentence probability,
    #    P(w1) * P(w2|w1) * P(w3|w1,w2) * ...
    (trigram_probabilities, p_w1, p_w2_w1,
     p_kalimat) = trigram_sentence_probability(tokens)

    print("\nProbabilitas masing-masing trigram:")
    for (w1, w2, w3), prob in trigram_probabilities.items():
        print(f" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)")

    print("\nProbabilitas Keseluruhan Kalimat (Model Trigram):")
    if tokens:
        # Human-readable expansion: P(w1), then P(w2|w1) if present, then
        # one factor per trigram.
        prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]
        if len(tokens) > 1:
            prob_str_parts.append(f"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}")
        for triplet in trigrams:
            p = trigram_probabilities.get(triplet, 0)
            prob_str_parts.append(f"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}")
        prob_str = " x ".join(prob_str_parts)
        print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")
    else:
        # Nothing to score: say so rather than crash on tokens[0].
        prob_str_parts = []
        prob_str = ""
        print(" Kalimat kosong: tidak ada token untuk dihitung.")
|
|
|