From dbb192d6e1ed247d844a211a1a99a84d59560dc3 Mon Sep 17 00:00:00 2001 From: 202210715016 ALYA PRISCILLA PUTRI <202210715016@mhs.ubharajaya.ac.id> Date: Wed, 21 Jan 2026 01:35:48 +0700 Subject: [PATCH] Upload files to "/" --- Ngram-nlp.ipynb | 390 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 390 insertions(+) create mode 100644 Ngram-nlp.ipynb diff --git a/Ngram-nlp.ipynb b/Ngram-nlp.ipynb new file mode 100644 index 0000000..7645915 --- /dev/null +++ b/Ngram-nlp.ipynb @@ -0,0 +1,390 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kWlKIkcVOrKy" + }, + "outputs": [], + "source": [ + "{\n", + " \"cells\": [\n", + " {\n", + " \"cell_type\": \"markdown\",\n", + " \"metadata\": {},\n", + " \"source\": [\n", + " \"# Model Unigram, Bigram, dan Trigram\\n\",\n", + " \"\\n\",\n", + " \"**Nama:** Alya Priscilla \\n\",\n", + " \"**NIM:** 202210715016 \\n\",\n", + " \"**Kelas:** F7B2 \\n\",\n", + " \"\\n\",\n", + " \"**Tujuan praktikum:** \\n\",\n", + " \"Memahami cara menghitung frekuensi dan probabilitas kalimat menggunakan model Unigram, Bigram, dan Trigram pada teks bahasa Indonesia.\\n\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": null,\n", + " \"metadata\": {},\n", + " \"outputs\": [],\n", + " \"source\": [\n", + " \"from collections import Counter\\n\",\n", + " \"from IPython.display import clear_output\\n\",\n", + " \"import math\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 11,\n", + " \"metadata\": {},\n", + " \"outputs\": [\n", + " {\n", + " \"name\": \"stdout\",\n", + " \"output_type\": \"stream\",\n", + " \"text\": [\n", + " \"Corpus: alya suka olahraga lari dan suka olahraga 
badminton\\n\",\n", + " \"Tokens (8): ['alya', 'suka', 'olahraga', 'lari', 'dan', 'suka', 'olahraga', 'badminton']\\n\",\n", + " \"\\n\",\n", + " \"Frekuensi Unigram dalam kalimat:\\n\",\n", + " \" ('alya'): 1\\n\",\n", + " \" ('suka'): 2\\n\",\n", + " \" ('olahraga'): 2\\n\",\n", + " \" ('lari'): 1\\n\",\n", + " \" ('dan'): 1\\n\",\n", + " \" ('badminton'): 1\\n\",\n", + " \"\\n\",\n", + " \"Total unigram dalam 1 kalimat: 8\\n\",\n", + " \"\\n\",\n", + " \"Probabilitas masing-masing unigram:\\n\",\n", + " \" P(alya) = 0.1250 (12.50%)\\n\",\n", + " \" P(suka) = 0.2500 (25.00%)\\n\",\n", + " \" P(olahraga) = 0.2500 (25.00%)\\n\",\n", + " \" P(lari) = 0.1250 (12.50%)\\n\",\n", + " \" P(dan) = 0.1250 (12.50%)\\n\",\n", + " \" P(badminton) = 0.1250 (12.50%)\\n\",\n", + " \"\\n\",\n", + " \"Probabilitas Keseluruhan Kalimat (Model Unigram):\\n\",\n", + " \" P(alya suka olahraga lari dan suka olahraga badminton) = P(alya)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(lari)=0.1250 x P(dan)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(badminton)=0.1250 = 0.00000095 (0.000095%)\\n\"\n", + " ]\n", + " }\n", + " ],\n", + " \"source\": [\n", + " \"# ================= UNIGRAM =================\\n\",\n", + " \"\\n\",\n", + " \"# 1. Input Kalimat dan Tokenisasi\\n\",\n", + " \"kalimat = input(\\\"Masukkan kalimat: \\\").strip()\\n\",\n", + " \"\\n\",\n", + " \"# Bersihkan output (khusus lingkungan notebook)\\n\",\n", + " \"try:\\n\",\n", + " \" clear_output()\\n\",\n", + " \"except:\\n\",\n", + " \" pass\\n\",\n", + " \"\\n\",\n", + " \"print(f\\\"Corpus: {kalimat}\\\")\\n\",\n", + " \"\\n\",\n", + " \"# Tokenize\\n\",\n", + " \"tokens = kalimat.lower().split()\\n\",\n", + " \"print(f\\\"Tokens ({len(tokens)}): {tokens}\\\")\\n\",\n", + " \"\\n\",\n", + " \"# 2. 
Hitung Frekuensi Unigram\\n\",\n", + " \"unigram_counts = Counter(tokens)\\n\",\n", + " \"total_tokens = sum(unigram_counts.values())\\n\",\n", + " \"\\n\",\n", + " \"print(\\\"\\\\nFrekuensi Unigram dalam kalimat:\\\")\\n\",\n", + " \"for pair, count in unigram_counts.items():\\n\",\n", + " \" print(f\\\" ('{pair}'): {count}\\\")\\n\",\n", + " \"print(f\\\"\\\\nTotal unigram dalam 1 kalimat: {total_tokens}\\\")\\n\",\n", + " \"\\n\",\n", + " \"# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata\\n\",\n", + " \"unigram_probabilities = {}\\n\",\n", + " \"for word, count in unigram_counts.items():\\n\",\n", + " \" prob = count / total_tokens\\n\",\n", + " \" unigram_probabilities[word] = prob\\n\",\n", + " \"\\n\",\n", + " \"print(\\\"\\\\nProbabilitas masing-masing unigram:\\\")\\n\",\n", + " \"for word, prob in unigram_probabilities.items():\\n\",\n", + " \" print(f\\\" P({word}) = {prob:.4f} ({prob*100:.2f}%)\\\")\\n\",\n", + " \"\\n\",\n", + " \"# 4. Probabilitas Kalimat Keseluruhan (P(w1) * P(w2) * ...)\\n\",\n", + " \"p_kalimat = 1\\n\",\n", + " \"prob_parts = []\\n\",\n", + " \"\\n\",\n", + " \"for word in tokens:\\n\",\n", + " \" prob_value = unigram_probabilities[word]\\n\",\n", + " \" p_kalimat *= prob_value\\n\",\n", + " \" prob_parts.append(f\\\"P({word})={prob_value:.4f}\\\")\\n\",\n", + " \"\\n\",\n", + " \"prob_str = \\\" x \\\".join(prob_parts)\\n\",\n", + " \"\\n\",\n", + " \"print(\\\"\\\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\\\")\\n\",\n", + " \"print(f\\\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\\\")\\n\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 12,\n", + " \"metadata\": {},\n", + " \"outputs\": [\n", + " {\n", + " \"name\": \"stdout\",\n", + " \"output_type\": \"stream\",\n", + " \"text\": [\n", + " \"Corpus: Alya sedang belajar model bigram untuk menghitung probabilitas kalimat dan alya sangat suka belajar\\n\",\n", + " \"Tokens 
(14): ['alya', 'sedang', 'belajar', 'model', 'bigram', 'untuk', 'menghitung', 'probabilitas', 'kalimat', 'dan', 'alya', 'sangat', 'suka', 'belajar']\\n\",\n", + " \"\\n\",\n", + " \"Frekuensi Bigram dalam kalimat:\\n\",\n", + " \" ('alya', 'sedang'): 1\\n\",\n", + " \" ('sedang', 'belajar'): 1\\n\",\n", + " \" ('belajar', 'model'): 1\\n\",\n", + " \" ('model', 'bigram'): 1\\n\",\n", + " \" ('bigram', 'untuk'): 1\\n\",\n", + " \" ('untuk', 'menghitung'): 1\\n\",\n", + " \" ('menghitung', 'probabilitas'): 1\\n\",\n", + " \" ('probabilitas', 'kalimat'): 1\\n\",\n", + " \" ('kalimat', 'dan'): 1\\n\",\n", + " \" ('dan', 'alya'): 1\\n\",\n", + " \" ('alya', 'sangat'): 1\\n\",\n", + " \" ('sangat', 'suka'): 1\\n\",\n", + " \" ('suka', 'belajar'): 1\\n\",\n", + " \"\\n\",\n", + " \"Total bigram dalam 1 kalimat: 13\\n\",\n", + " \"\\n\",\n", + " \"Probabilitas masing-masing bigram:\\n\",\n", + " \" P(sedang|alya) = 0.5000 (50.00%)\\n\",\n", + " \" P(belajar|sedang) = 1.0000 (100.00%)\\n\",\n", + " \" P(model|belajar) = 0.5000 (50.00%)\\n\",\n", + " \" P(bigram|model) = 1.0000 (100.00%)\\n\",\n", + " \" P(untuk|bigram) = 1.0000 (100.00%)\\n\",\n", + " \" P(menghitung|untuk) = 1.0000 (100.00%)\\n\",\n", + " \" P(probabilitas|menghitung) = 1.0000 (100.00%)\\n\",\n", + " \" P(kalimat|probabilitas) = 1.0000 (100.00%)\\n\",\n", + " \" P(dan|kalimat) = 1.0000 (100.00%)\\n\",\n", + " \" P(alya|dan) = 1.0000 (100.00%)\\n\",\n", + " \" P(sangat|alya) = 0.5000 (50.00%)\\n\",\n", + " \" P(suka|sangat) = 1.0000 (100.00%)\\n\",\n", + " \" P(belajar|suka) = 1.0000 (100.00%)\\n\",\n", + " \"\\n\",\n", + " \"Probabilitas Keseluruhan Kalimat (Model Bigram):\\n\",\n", + " \" P(alya sedang belajar model bigram untuk menghitung probabilitas kalimat dan alya sangat suka belajar) = P(alya)=0.1429 x P(sedang|alya)=0.5000 x P(belajar|sedang)=1.0000 x P(model|belajar)=0.5000 x P(bigram|model)=1.0000 x P(untuk|bigram)=1.0000 x P(menghitung|untuk)=1.0000 x P(probabilitas|menghitung)=1.0000 x 
P(kalimat|probabilitas)=1.0000 x P(dan|kalimat)=1.0000 x P(alya|dan)=1.0000 x P(sangat|alya)=0.5000 x P(suka|sangat)=1.0000 x P(belajar|suka)=1.0000 = 0.01785714 (1.785714%)\\n\"\n", + " ]\n", + " }\n", + " ],\n", + " \"source\": [\n", + " \"# ================= BIGRAM =================\\n\",\n", + " \"\\n\",\n", + " \"kalimat = input(\\\"Masukkan kalimat: \\\").strip()\\n\",\n", + " \"\\n\",\n", + " \"try:\\n\",\n", + " \" clear_output()\\n\",\n", + " \"except:\\n\",\n", + " \" pass\\n\",\n", + " \"\\n\",\n", + " \"print(f\\\"Corpus: {kalimat}\\\")\\n\",\n", + " \"\\n\",\n", + " \"# Tokenisasi\\n\",\n", + " \"tokens = kalimat.lower().split()\\n\",\n", + " \"print(f\\\"Tokens ({len(tokens)}): {tokens}\\\")\\n\",\n", + " \"\\n\",\n", + " \"# 2. Frekuensi Unigram dan Bigram\\n\",\n", + " \"unigram_counts = Counter(tokens)\\n\",\n", + " \"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\\n\",\n", + " \"bigram_counts = Counter(bigrams)\\n\",\n", + " \"\\n\",\n", + " \"print(\\\"\\\\nFrekuensi Bigram dalam kalimat:\\\")\\n\",\n", + " \"for pair, count in bigram_counts.items():\\n\",\n", + " \" print(f\\\" {pair}: {count}\\\")\\n\",\n", + " \"print(f\\\"\\\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\\\")\\n\",\n", + " \"\\n\",\n", + " \"# 3. Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)\\n\",\n", + " \"bigram_probabilities = {}\\n\",\n", + " \"for (w1, w2), count in bigram_counts.items():\\n\",\n", + " \" prob = count / unigram_counts[w1]\\n\",\n", + " \" bigram_probabilities[(w1, w2)] = prob\\n\",\n", + " \"\\n\",\n", + " \"print(\\\"\\\\nProbabilitas masing-masing bigram:\\\")\\n\",\n", + " \"for (w1, w2), prob in bigram_probabilities.items():\\n\",\n", + " \" print(f\\\" P({w2}|{w1}) = {prob:.4f} ({prob*100:.2f}%)\\\")\\n\",\n", + " \"\\n\",\n", + " \"# 4. 
Probabilitas Kalimat (Model Bigram)\\n\",\n", + " \"total_tokens = sum(unigram_counts.values())\\n\",\n", + " \"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens\\n\",\n", + " \"p_kalimat = p_w1\\n\",\n", + " \"\\n\",\n", + " \"prob_str_parts = [f\\\"P({tokens[0]})={p_w1:.4f}\\\"]\\n\",\n", + " \"\\n\",\n", + " \"for i in range(1, len(tokens)):\\n\",\n", + " \" pair = (tokens[i-1], tokens[i])\\n\",\n", + " \" p = bigram_probabilities.get(pair, 0)\\n\",\n", + " \" p_kalimat *= p\\n\",\n", + " \" prob_str_parts.append(f\\\"P({pair[1]}|{pair[0]})={p:.4f}\\\")\\n\",\n", + " \"\\n\",\n", + " \"prob_str = \\\" x \\\".join(prob_str_parts)\\n\",\n", + " \"\\n\",\n", + " \"print(\\\"\\\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\\\")\\n\",\n", + " \"print(f\\\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\\\")\\n\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 13,\n", + " \"metadata\": {},\n", + " \"outputs\": [\n", + " {\n", + " \"name\": \"stdout\",\n", + " \"output_type\": \"stream\",\n", + " \"text\": [\n", + " \"Corpus: Pada praktikum ini alya sedang mempelajari model trigram untuk kalimat bahasa Indonesia\\n\",\n", + " \"Tokens (12): ['pada', 'praktikum', 'ini', 'alya', 'sedang', 'mempelajari', 'model', 'trigram', 'untuk', 'kalimat', 'bahasa', 'indonesia']\\n\",\n", + " \"\\n\",\n", + " \"Frekuensi Trigram dalam kalimat:\\n\",\n", + " \" ('pada', 'praktikum', 'ini'): 1\\n\",\n", + " \" ('praktikum', 'ini', 'alya'): 1\\n\",\n", + " \" ('ini', 'alya', 'sedang'): 1\\n\",\n", + " \" ('alya', 'sedang', 'mempelajari'): 1\\n\",\n", + " \" ('sedang', 'mempelajari', 'model'): 1\\n\",\n", + " \" ('mempelajari', 'model', 'trigram'): 1\\n\",\n", + " \" ('model', 'trigram', 'untuk'): 1\\n\",\n", + " \" ('trigram', 'untuk', 'kalimat'): 1\\n\",\n", + " \" ('untuk', 'kalimat', 'bahasa'): 1\\n\",\n", + " \" ('kalimat', 'bahasa', 'indonesia'): 1\\n\",\n", + " \"\\n\",\n", + " \"Total trigram dalam 
1 kalimat: 10\\n\",\n", + " \"\\n\",\n", + " \"Probabilitas masing-masing trigram:\\n\",\n", + " \" P(ini|pada,praktikum) = 1.0000 (100.00%)\\n\",\n", + " \" P(alya|praktikum,ini) = 1.0000 (100.00%)\\n\",\n", + " \" P(sedang|ini,alya) = 1.0000 (100.00%)\\n\",\n", + " \" P(mempelajari|alya,sedang) = 1.0000 (100.00%)\\n\",\n", + " \" P(model|sedang,mempelajari) = 1.0000 (100.00%)\\n\",\n", + " \" P(trigram|mempelajari,model) = 1.0000 (100.00%)\\n\",\n", + " \" P(untuk|model,trigram) = 1.0000 (100.00%)\\n\",\n", + " \" P(kalimat|trigram,untuk) = 1.0000 (100.00%)\\n\",\n", + " \" P(bahasa|untuk,kalimat) = 1.0000 (100.00%)\\n\",\n", + " \" P(indonesia|kalimat,bahasa) = 1.0000 (100.00%)\\n\",\n", + " \"\\n\",\n", + " \"Probabilitas Keseluruhan Kalimat (Model Trigram):\\n\",\n", + " \" P(pada praktikum ini alya sedang mempelajari model trigram untuk kalimat bahasa indonesia) = P(pada)=0.0833 x P(praktikum|pada)=1.0000 x P(ini|pada,praktikum)=1.0000 x P(alya|praktikum,ini)=1.0000 x P(sedang|ini,alya)=1.0000 x P(mempelajari|alya,sedang)=1.0000 x P(model|sedang,mempelajari)=1.0000 x P(trigram|mempelajari,model)=1.0000 x P(untuk|model,trigram)=1.0000 x P(kalimat|trigram,untuk)=1.0000 x P(bahasa|untuk,kalimat)=1.0000 x P(indonesia|kalimat,bahasa)=1.0000 = 0.08333333 (8.333333%)\\n\"\n", + " ]\n", + " }\n", + " ],\n", + " \"source\": [\n", + " \"# ================= TRIGRAM =================\\n\",\n", + " \"\\n\",\n", + " \"kalimat = input(\\\"Masukkan kalimat: \\\").strip()\\n\",\n", + " \"\\n\",\n", + " \"try:\\n\",\n", + " \" clear_output()\\n\",\n", + " \"except:\\n\",\n", + " \" pass\\n\",\n", + " \"\\n\",\n", + " \"print(f\\\"Corpus: {kalimat}\\\")\\n\",\n", + " \"\\n\",\n", + " \"# Tokenisasi\\n\",\n", + " \"tokens = kalimat.lower().split()\\n\",\n", + " \"print(f\\\"Tokens ({len(tokens)}): {tokens}\\\")\\n\",\n", + " \"\\n\",\n", + " \"# 2. 
Frekuensi Bigram dan Trigram\\n\",\n", + " \"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\\n\",\n", + " \"trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]\\n\",\n", + " \"\\n\",\n", + " \"bigram_counts = Counter(bigrams)\\n\",\n", + " \"trigram_counts = Counter(trigrams)\\n\",\n", + " \"\\n\",\n", + " \"print(\\\"\\\\nFrekuensi Trigram dalam kalimat:\\\")\\n\",\n", + " \"for tg, count in trigram_counts.items():\\n\",\n", + " \" print(f\\\" {tg}: {count}\\\")\\n\",\n", + " \"print(f\\\"\\\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\\\")\\n\",\n", + " \"\\n\",\n", + " \"# 3. Probabilitas Trigram: P(w3 | w1, w2)\\n\",\n", + " \"trigram_probabilities = {}\\n\",\n", + " \"for (w1, w2, w3), count in trigram_counts.items():\\n\",\n", + " \" if bigram_counts[(w1, w2)] > 0:\\n\",\n", + " \" prob = count / bigram_counts[(w1, w2)]\\n\",\n", + " \" else:\\n\",\n", + " \" prob = 0\\n\",\n", + " \" trigram_probabilities[(w1, w2, w3)] = prob\\n\",\n", + " \"\\n\",\n", + " \"print(\\\"\\\\nProbabilitas masing-masing trigram:\\\")\\n\",\n", + " \"for (w1, w2, w3), prob in trigram_probabilities.items():\\n\",\n", + " \" print(f\\\" P({w3}|{w1},{w2}) = {prob:.4f} ({prob*100:.2f}%)\\\")\\n\",\n", + " \"\\n\",\n", + " \"# 4. 
Probabilitas Kalimat (Model Trigram)\\n\",\n", + " \"\\n\",\n", + " \"unigram_counts = Counter(tokens)\\n\",\n", + " \"total_tokens = sum(unigram_counts.values())\\n\",\n", + " \"\\n\",\n", + " \"# P(w1)\\n\",\n", + " \"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0\\n\",\n", + " \"\\n\",\n", + " \"# P(w2|w1)\\n\",\n", + " \"if len(tokens) > 1:\\n\",\n", + " \" count_w1 = unigram_counts.get(tokens[0], 1)\\n\",\n", + " \" p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1\\n\",\n", + " \"else:\\n\",\n", + " \" p_w2_w1 = 1.0\\n\",\n", + " \"\\n\",\n", + " \"p_kalimat = p_w1 * p_w2_w1\\n\",\n", + " \"\\n\",\n", + " \"prob_str_parts = [f\\\"P({tokens[0]})={p_w1:.4f}\\\"]\\n\",\n", + " \"if len(tokens) > 1:\\n\",\n", + " \" prob_str_parts.append(f\\\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.4f}\\\")\\n\",\n", + " \"\\n\",\n", + " \"# Perkalian trigram untuk i >= 3\\n\",\n", + " \"for i in range(len(tokens) - 2):\\n\",\n", + " \" triplet = (tokens[i], tokens[i+1], tokens[i+2])\\n\",\n", + " \" p = trigram_probabilities.get(triplet, 0)\\n\",\n", + " \" p_kalimat *= p\\n\",\n", + " \" prob_str_parts.append(f\\\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.4f}\\\")\\n\",\n", + " \"\\n\",\n", + " \"prob_str = \\\" x \\\".join(prob_str_parts)\\n\",\n", + " \"\\n\",\n", + " \"print(\\\"\\\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\\\")\\n\",\n", + " \"print(f\\\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\\\")\\n\"\n", + " ]\n", + " }\n", + " ],\n", + " \"metadata\": {\n", + " \"colab\": {\n", + " \"provenance\": []\n", + " },\n", + " \"kernelspec\": {\n", + " \"display_name\": \"Python 3 (ipykernel)\",\n", + " \"language\": \"python\",\n", + " \"name\": \"python3\"\n", + " },\n", + " \"language_info\": {\n", + " \"codemirror_mode\": {\n", + " \"name\": \"ipython\",\n", + " \"version\": 3\n", + " },\n", + " \"file_extension\": \".py\",\n", + " \"mimetype\": \"text/x-python\",\n", + " 
\"name\": \"python\",\n", + " \"nbconvert_exporter\": \"python\",\n", + " \"pygments_lexer\": \"ipython3\",\n", + " \"version\": \"3.13.5\"\n", + " }\n", + " },\n", + " \"nbformat\": 4,\n", + " \"nbformat_minor\": 4\n", + "}\n" + ] + } + ] +} \ No newline at end of file