materi-praktikum/.virtual_documents/NLP/Fitur_Ekstraksi_BOW.ipynb

# Prompt for the number of documents
import pandas as pd

n = int(input("Enter the number of documents to input: "))

# Read the document texts one by one
documents = []
for i in range(n):
    teks = input(f"Enter the text for document {i+1}: ")
    documents.append(teks)

print("\n=== Documents Entered ===")
for i, doc in enumerate(documents):
    print(f"Doc {i+1}: {doc}")
# Tokenization step: lowercase each document and split on whitespace
tokenized_docs = []
for doc in documents:
    tokens = doc.lower().split()
    tokenized_docs.append(tokens)

print("\n=== Tokenization Results ===")
for i, tokens in enumerate(tokenized_docs):
    print(f"Doc {i+1}: {tokens}")
# Build the corpus: flatten all tokens from all documents into one list
corpus_all = [word for doc in tokenized_docs for word in doc]
print("\n=== Full Corpus (All Words from All Documents) ===")
print(corpus_all)
print(f"Total number of words across all documents: {len(corpus_all)}")
# Build the vocabulary: unique words, sorted alphabetically
vocabulary = sorted(set(corpus_all))
print("\n=== Vocabulary (Unique Words) ===")
print(vocabulary)
print(f"Number of unique words (vocabulary size): {len(vocabulary)}")

# Print the vocabulary again as a numbered list
print("\n=== Vocabulary (Indexed) ===")
for idx, word in enumerate(vocabulary, start=1):
    print(f"{idx:>2}. {word}")
print(f"\nNumber of unique words (vocabulary size): {len(vocabulary)}")
# Numeric representation (Bag-of-Words matrix): one row per document,
# one column per vocabulary word, cell value = count of that word in the document
bow_matrix = []
for doc in tokenized_docs:
    vector = [doc.count(word) for word in vocabulary]
    bow_matrix.append(vector)

df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)
df_bow.index = [f"D{i}" for i in range(1, len(documents) + 1)]  # label rows D1, D2, D3, ...
print("\n=== Bag of Words Matrix ===")
print(df_bow)
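
As a cross-check, scikit-learn's CountVectorizer builds the same kind of count matrix; note that its default token pattern drops single-character tokens and punctuation, so the counts can differ slightly from the manual split()-based version above. A sketch, assuming scikit-learn is installed:

from sklearn.feature_extraction.text import CountVectorizer

# Sketch: BoW matrix via scikit-learn, for comparison with df_bow
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
df_sklearn = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=[f"D{i}" for i in range(1, len(documents) + 1)],
)
print(df_sklearn)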
# Word frequency table (totals across all documents)
word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()
word_frequencies.columns = ["Word", "Frequency"]
print("\n=== Word Frequency Table (All Documents) ===")
print(word_frequencies)
print(f"Number of distinct words counted: {len(word_frequencies)}")