76 lines
1.9 KiB
Plaintext
76 lines
1.9 KiB
Plaintext
# Input jumlah dokumen
|
|
import pandas as pd
|
|
n = int(input("Masukkan jumlah dokumen yang ingin dimasukkan: "))
|
|
|
|
|
|
# Input teks dokumen satu per satu
|
|
documents = []
|
|
for i in range(n):
|
|
teks = input(f"Masukkan teks untuk dokumen ke-{i+1}: ")
|
|
documents.append(teks)
|
|
|
|
print("\n=== Dokumen yang Dimasukkan ===")
|
|
for i, doc in enumerate(documents):
|
|
print(f"Doc {i+1}: {doc}")
|
|
|
|
|
|
# Tahap Tokenisasi
|
|
tokenized_docs = []
|
|
for doc in documents:
|
|
tokens = doc.lower().split()
|
|
tokenized_docs.append(tokens)
|
|
|
|
print("\n=== Hasil Tokenisasi ===")
|
|
for i, tokens in enumerate(tokenized_docs):
|
|
print(f"Doc {i+1}: {tokens}")
|
|
|
|
|
|
# Pembuatan Corpus
|
|
corpus_all = [word for doc in tokenized_docs for word in doc]
|
|
|
|
print("\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===")
|
|
print(corpus_all)
|
|
print(f"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}")
|
|
|
|
|
|
# Pembuatan Vocabulary
|
|
vocabulary = sorted(set(corpus_all))
|
|
|
|
print("\n=== Vocabulary (Kata Unik) ===")
|
|
print(vocabulary)
|
|
print(f"Jumlah kata unik (vocabulary size): {len(vocabulary)}")
|
|
|
|
|
|
vocabulary = sorted(set(corpus_all))
|
|
|
|
print("\n=== Vocabulary (Kata Unik) ===")
|
|
for idx, word in enumerate(vocabulary, start=1):
|
|
print(f"{idx:>2}. {word}")
|
|
print(f"\nJumlah kata unik (vocabulary size): {len(vocabulary)}")
|
|
|
|
|
|
# Representasi Numerik (Matriks BoW)
|
|
bow_matrix = []
|
|
for doc in tokenized_docs:
|
|
vector = [doc.count(word) for word in vocabulary]
|
|
bow_matrix.append(vector)
|
|
|
|
|
|
df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)
|
|
df_bow.index = [f"D{i}" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3
|
|
|
|
print("\n=== Matriks Bag of Words ===")
|
|
print(df_bow)
|
|
|
|
|
|
# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)
|
|
word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()
|
|
word_frequencies.columns = ["Kata", "Frekuensi"]
|
|
|
|
print("\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===")
|
|
print(word_frequencies)
|
|
print(f"Frekuensi kata: {len(word_frequencies)}")
|
|
|
|
|
|
|