# Model Unigram, Bigram, dan Trigram

**Nama:** Fatah Sabila Rosyad 
**NIM:** 202210715288 
**Kelas:** F7B2 

**Tujuan praktikum:** 
Memahami cara menghitung frekuensi dan probabilitas kalimat menggunakan model Unigram, Bigram, dan Trigram pada teks bahasa Indonesia.


In [None]:
from collections import Counter
from IPython.display import clear_output
import math

In [11]:
# ================= UNIGRAM =================


def unigram_model(tokens):
    """Build a maximum-likelihood unigram model from one tokenized sentence.

    Args:
        tokens: List of word tokens in sentence order; may be empty.

    Returns:
        A tuple ``(counts, probabilities, factors, sentence_probability)``.
        ``counts`` is a ``Counter`` of token frequencies. ``probabilities``
        maps each distinct token to ``Count(w) / len(tokens)``. ``factors``
        holds one ``(label, value)`` pair per token in sentence order, e.g.
        ``("P(suka)", 0.25)``. ``sentence_probability`` is the product of
        those values, or ``None`` when ``tokens`` is empty because there is
        no sentence to score.
    """
    counts = Counter(tokens)
    total = len(tokens)
    # With no tokens ``counts`` is empty, so the division below never runs.
    probabilities = {word: count / total for word, count in counts.items()}
    factors = [(f"P({word})", probabilities[word]) for word in tokens]
    if not factors:
        return counts, probabilities, factors, None

    sentence_probability = 1.0
    for _, value in factors:
        sentence_probability *= value
    return counts, probabilities, factors, sentence_probability


# Runs when executed as a notebook cell or script (where __name__ is
# "__main__"), but not on import, so unigram_model() can be reused without
# triggering the interactive prompt.
if __name__ == "__main__":
    # 1. Read the sentence and tokenize it
    kalimat = input("Masukkan kalimat: ").strip()

    # Clearing the prompt is cosmetic and notebook-only, so a failure here
    # must not abort the calculation. Catching Exception instead of using a
    # bare except still lets KeyboardInterrupt / SystemExit propagate.
    try:
        clear_output()
    except Exception:
        pass

    print(f"Corpus: {kalimat}")

    tokens = kalimat.lower().split()
    print(f"Tokens ({len(tokens)}): {tokens}")

    # 2-4. Frequencies, per-word probabilities and whole-sentence probability
    unigram_counts, unigram_probabilities, factors, p_kalimat = unigram_model(tokens)
    total_tokens = sum(unigram_counts.values())
    prob_parts = [f"{label}={value:.4f}" for label, value in factors]
    prob_str = " x ".join(prob_parts)

    if not tokens:
        # Nothing to count: report it instead of printing an empty product
        # that would read as a 100% probability.
        print("\nKalimat kosong: tidak ada unigram yang dapat dihitung.")
    else:
        print("\nFrekuensi Unigram dalam kalimat:")
        for word, count in unigram_counts.items():
            print(f" ('{word}'): {count}")
        print(f"\nTotal unigram dalam 1 kalimat: {total_tokens}")

        # P(wi) = Count(wi) / total number of words
        print("\nProbabilitas masing-masing unigram:")
        for word, prob in unigram_probabilities.items():
            print(f" P({word}) = {prob:.4f} ({prob*100:.2f}%)")

        # P(sentence) = P(w1) * P(w2) * ...
        print("\nProbabilitas Keseluruhan Kalimat (Model Unigram):")
        print(
            f" P({' '.join(tokens)}) = {prob_str} = "
            f"{p_kalimat:.8f} ({p_kalimat*100:.6f}%)"
        )


Corpus: fatah suka olahraga lari dan suka olahraga badminton
Tokens (8): ['fatah', 'suka', 'olahraga', 'lari', 'dan', 'suka', 'olahraga', 'badminton']

Frekuensi Unigram dalam kalimat:
 ('fatah'): 1
 ('suka'): 2
 ('olahraga'): 2
 ('lari'): 1
 ('dan'): 1
 ('badminton'): 1

Total unigram dalam 1 kalimat: 8

Probabilitas masing-masing unigram:
 P(fatah) = 0.1250 (12.50%)
 P(suka) = 0.2500 (25.00%)
 P(olahraga) = 0.2500 (25.00%)
 P(lari) = 0.1250 (12.50%)
 P(dan) = 0.1250 (12.50%)
 P(badminton) = 0.1250 (12.50%)

Probabilitas Keseluruhan Kalimat (Model Unigram):
 P(fatah suka olahraga lari dan suka olahraga badminton) = P(fatah)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(lari)=0.1250 x P(dan)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(badminton)=0.1250 = 0.00000095 (0.000095%)


In [12]:
# ================= BIGRAM =================


def bigram_model(tokens):
    """Build a maximum-likelihood bigram model from one tokenized sentence.

    Args:
        tokens: List of word tokens in sentence order; may be empty.

    Returns:
        A tuple ``(unigram_counts, bigram_counts, bigram_probabilities,
        factors, sentence_probability)``. The two counts are ``Counter``
        objects over single tokens and over adjacent token pairs.
        ``bigram_probabilities`` maps ``(w1, w2)`` to
        ``Count(w1, w2) / Count(w1)``. ``factors`` is the chain used for the
        sentence probability as ``(label, value)`` pairs: first ``P(w1)``
        (its unigram relative frequency), then one ``P(w_i|w_{i-1})`` per
        following token. ``sentence_probability`` is the product of those
        values, or ``None`` when ``tokens`` is empty because there is no
        sentence to score.
    """
    unigram_counts = Counter(tokens)
    bigram_counts = Counter(zip(tokens, tokens[1:]))
    bigram_probabilities = {
        (w1, w2): count / unigram_counts[w1]
        for (w1, w2), count in bigram_counts.items()
    }
    if not tokens:
        # No first word exists, so P(w1) cannot be formed.
        return unigram_counts, bigram_counts, bigram_probabilities, [], None

    first = tokens[0]
    factors = [(f"P({first})", unigram_counts[first] / len(tokens))]
    # Every adjacent pair was counted above, so the lookup cannot miss.
    factors.extend(
        (f"P({w2}|{w1})", bigram_probabilities[(w1, w2)])
        for w1, w2 in zip(tokens, tokens[1:])
    )

    sentence_probability = 1.0
    for _, value in factors:
        sentence_probability *= value
    return unigram_counts, bigram_counts, bigram_probabilities, factors, sentence_probability


# Runs when executed as a notebook cell or script (where __name__ is
# "__main__"), but not on import, so bigram_model() can be reused without
# triggering the interactive prompt.
if __name__ == "__main__":
    kalimat = input("Masukkan kalimat: ").strip()

    # Clearing the prompt is cosmetic and notebook-only, so a failure here
    # must not abort the calculation. Catching Exception instead of using a
    # bare except still lets KeyboardInterrupt / SystemExit propagate.
    try:
        clear_output()
    except Exception:
        pass

    print(f"Corpus: {kalimat}")

    # Tokenize
    tokens = kalimat.lower().split()
    print(f"Tokens ({len(tokens)}): {tokens}")

    # 2-4. Frequencies, conditional probabilities and sentence probability
    bigrams = list(zip(tokens, tokens[1:]))
    (
        unigram_counts,
        bigram_counts,
        bigram_probabilities,
        factors,
        p_kalimat,
    ) = bigram_model(tokens)
    total_tokens = sum(unigram_counts.values())
    prob_str_parts = [f"{label}={value:.4f}" for label, value in factors]
    prob_str = " x ".join(prob_str_parts)

    if not tokens:
        # Empty input used to crash on tokens[0]; report it instead.
        print("\nKalimat kosong: tidak ada bigram yang dapat dihitung.")
    else:
        p_w1 = factors[0][1]

        print("\nFrekuensi Bigram dalam kalimat:")
        for pair, count in bigram_counts.items():
            print(f" {pair}: {count}")
        print(f"\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}")

        # P(w2 | w1) = Count(w1, w2) / Count(w1)
        print("\nProbabilitas masing-masing bigram:")
        for (w1, w2), prob in bigram_probabilities.items():
            print(f" P({w2}|{w1}) = {prob:.4f} ({prob*100:.2f}%)")

        # P(sentence) = P(w1) * P(w2|w1) * P(w3|w2) * ...
        print("\nProbabilitas Keseluruhan Kalimat (Model Bigram):")
        print(
            f" P({' '.join(tokens)}) = {prob_str} = "
            f"{p_kalimat:.8f} ({p_kalimat*100:.6f}%)"
        )


Corpus: Fatah sedang belajar model bigram untuk menghitung probabilitas kalimat dan fatah sangat suka belajar
Tokens (14): ['fatah', 'sedang', 'belajar', 'model', 'bigram', 'untuk', 'menghitung', 'probabilitas', 'kalimat', 'dan', 'fatah', 'sangat', 'suka', 'belajar']

Frekuensi Bigram dalam kalimat:
 ('fatah', 'sedang'): 1
 ('sedang', 'belajar'): 1
 ('belajar', 'model'): 1
 ('model', 'bigram'): 1
 ('bigram', 'untuk'): 1
 ('untuk', 'menghitung'): 1
 ('menghitung', 'probabilitas'): 1
 ('probabilitas', 'kalimat'): 1
 ('kalimat', 'dan'): 1
 ('dan', 'fatah'): 1
 ('fatah', 'sangat'): 1
 ('sangat', 'suka'): 1
 ('suka', 'belajar'): 1

Total bigram dalam 1 kalimat: 13

Probabilitas masing-masing bigram:
 P(sedang|fatah) = 0.5000 (50.00%)
 P(belajar|sedang) = 1.0000 (100.00%)
 P(model|belajar) = 0.5000 (50.00%)
 P(bigram|model) = 1.0000 (100.00%)
 P(untuk|bigram) = 1.0000 (100.00%)
 P(menghitung|untuk) = 1.0000 (100.00%)
 P(probabilitas|menghitung) = 1.0000 (100.00%)
 P(kalimat|probabilitas) = 1.0000 (100.00%)

In [13]:
# ================= TRIGRAM =================


def trigram_model(tokens):
    """Build a maximum-likelihood trigram model from one tokenized sentence.

    Args:
        tokens: List of word tokens in sentence order; may be empty.

    Returns:
        A tuple ``(bigram_counts, trigram_counts, trigram_probabilities,
        factors, sentence_probability)``. The two counts are ``Counter``
        objects over adjacent token pairs and triples.
        ``trigram_probabilities`` maps ``(w1, w2, w3)`` to
        ``Count(w1, w2, w3) / Count(w1, w2)``. ``factors`` is the chain used
        for the sentence probability as ``(label, value)`` pairs: ``P(w1)``
        (unigram relative frequency), then ``P(w2|w1)`` when a second token
        exists, then one ``P(w_i|w_{i-2},w_{i-1})`` per remaining token.
        ``sentence_probability`` is the product of those values, or ``None``
        when ``tokens`` is empty because there is no sentence to score.
    """
    bigram_counts = Counter(zip(tokens, tokens[1:]))
    trigram_counts = Counter(zip(tokens, tokens[1:], tokens[2:]))
    # The (w1, w2) prefix of every observed trigram is itself an observed
    # bigram, so the denominator is always at least 1.
    trigram_probabilities = {
        (w1, w2, w3): count / bigram_counts[(w1, w2)]
        for (w1, w2, w3), count in trigram_counts.items()
    }
    if not tokens:
        # No first word exists, so P(w1) cannot be formed.
        return bigram_counts, trigram_counts, trigram_probabilities, [], None

    first = tokens[0]
    first_count = tokens.count(first)
    factors = [(f"P({first})", first_count / len(tokens))]
    if len(tokens) > 1:
        second = tokens[1]
        factors.append(
            (f"P({second}|{first})", bigram_counts[(first, second)] / first_count)
        )
    # Trigram factors cover the third token onwards.
    factors.extend(
        (f"P({w3}|{w1},{w2})", trigram_probabilities[(w1, w2, w3)])
        for w1, w2, w3 in zip(tokens, tokens[1:], tokens[2:])
    )

    sentence_probability = 1.0
    for _, value in factors:
        sentence_probability *= value
    return bigram_counts, trigram_counts, trigram_probabilities, factors, sentence_probability


# Runs when executed as a notebook cell or script (where __name__ is
# "__main__"), but not on import, so trigram_model() can be reused without
# triggering the interactive prompt.
if __name__ == "__main__":
    kalimat = input("Masukkan kalimat: ").strip()

    # Clearing the prompt is cosmetic and notebook-only, so a failure here
    # must not abort the calculation. Catching Exception instead of using a
    # bare except still lets KeyboardInterrupt / SystemExit propagate.
    try:
        clear_output()
    except Exception:
        pass

    print(f"Corpus: {kalimat}")

    # Tokenize
    tokens = kalimat.lower().split()
    print(f"Tokens ({len(tokens)}): {tokens}")

    # 2-4. Frequencies, conditional probabilities and sentence probability
    bigrams = list(zip(tokens, tokens[1:]))
    trigrams = list(zip(tokens, tokens[1:], tokens[2:]))
    (
        bigram_counts,
        trigram_counts,
        trigram_probabilities,
        factors,
        p_kalimat,
    ) = trigram_model(tokens)
    unigram_counts = Counter(tokens)
    total_tokens = sum(unigram_counts.values())
    prob_str_parts = [f"{label}={value:.4f}" for label, value in factors]
    prob_str = " x ".join(prob_str_parts)

    if not tokens:
        # Empty input used to crash on tokens[0]; report it instead.
        print("\nKalimat kosong: tidak ada trigram yang dapat dihitung.")
    else:
        p_w1 = factors[0][1]
        # With a single token there is no P(w2|w1) factor; 1.0 is neutral.
        p_w2_w1 = factors[1][1] if len(tokens) > 1 else 1.0

        print("\nFrekuensi Trigram dalam kalimat:")
        for tg, count in trigram_counts.items():
            print(f" {tg}: {count}")
        print(f"\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}")

        # P(w3 | w1, w2) = Count(w1, w2, w3) / Count(w1, w2)
        print("\nProbabilitas masing-masing trigram:")
        for (w1, w2, w3), prob in trigram_probabilities.items():
            print(f" P({w3}|{w1},{w2}) = {prob:.4f} ({prob*100:.2f}%)")

        # P(sentence) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...
        print("\nProbabilitas Keseluruhan Kalimat (Model Trigram):")
        print(
            f" P({' '.join(tokens)}) = {prob_str} = "
            f"{p_kalimat:.8f} ({p_kalimat*100:.6f}%)"
        )


Corpus: Pada praktikum ini Fatah sedang mempelajari model trigram untuk kalimat bahasa Indonesia
Tokens (12): ['pada', 'praktikum', 'ini', 'fatah', 'sedang', 'mempelajari', 'model', 'trigram', 'untuk', 'kalimat', 'bahasa', 'indonesia']

Frekuensi Trigram dalam kalimat:
 ('pada', 'praktikum', 'ini'): 1
 ('praktikum', 'ini', 'fatah'): 1
 ('ini', 'fatah', 'sedang'): 1
 ('fatah', 'sedang', 'mempelajari'): 1
 ('sedang', 'mempelajari', 'model'): 1
 ('mempelajari', 'model', 'trigram'): 1
 ('model', 'trigram', 'untuk'): 1
 ('trigram', 'untuk', 'kalimat'): 1
 ('untuk', 'kalimat', 'bahasa'): 1
 ('kalimat', 'bahasa', 'indonesia'): 1

Total trigram dalam 1 kalimat: 10

Probabilitas masing-masing trigram:
 P(ini|pada,praktikum) = 1.0000 (100.00%)
 P(fatah|praktikum,ini) = 1.0000 (100.00%)
 P(sedang|ini,fatah) = 1.0000 (100.00%)
 P(mempelajari|fatah,sedang) = 1.0000 (100.00%)
 P(model|sedang,mempelajari) = 1.0000 (100.00%)
 P(trigram|mempelajari,model) = 1.0000 (100.00%)
 P(untuk|model,trigram) = 1.