"""Interactive Bag-of-Words demo.

Reads a number of documents from standard input, tokenizes them, builds the
corpus and vocabulary, and prints the Bag-of-Words matrix together with a
table of total word frequencies.
"""

from collections import Counter

import pandas as pd


def tokenize_documents(documents: list) -> list:
    """Return one token list per document.

    Each document is lower-cased and split on runs of whitespace; punctuation
    is left attached to the neighbouring word.
    """
    return [doc.lower().split() for doc in documents]


def build_corpus(tokenized_docs: list) -> list:
    """Return every token of every document as one flat list, in order."""
    return [word for doc in tokenized_docs for word in doc]


def build_vocabulary(corpus_all: list) -> list:
    """Return the unique words of the corpus, sorted ascending."""
    return sorted(set(corpus_all))


def build_bow_matrix(tokenized_docs: list, vocabulary: list) -> pd.DataFrame:
    """Return the Bag-of-Words count matrix as a DataFrame.

    Rows are documents labelled ``D1``, ``D2``, ...; columns follow the order
    of ``vocabulary``; each cell is the number of times the word occurs in
    the document.
    """
    bow_matrix = []
    for doc in tokenized_docs:
        # Count all tokens in one pass instead of rescanning the document
        # with list.count() for every vocabulary word.
        counts = Counter(doc)
        # A Counter yields 0 for words the document does not contain.
        bow_matrix.append([counts[word] for word in vocabulary])

    df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)
    # Relabel the index as D1, D2, D3, ...
    df_bow.index = [f"D{i}" for i in range(1, len(tokenized_docs) + 1)]
    return df_bow


def word_frequency_table(df_bow: pd.DataFrame) -> pd.DataFrame:
    """Return total word counts over all documents, most frequent first.

    The result has two columns: ``"Kata"`` (the word) and ``"Frekuensi"``
    (its column sum in ``df_bow``).
    """
    word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()
    word_frequencies.columns = ["Kata", "Frekuensi"]
    return word_frequencies


def main() -> None:
    """Prompt for the documents and print every stage of the BoW pipeline."""
    # Number of documents to read.
    n = int(input("Masukkan jumlah dokumen yang ingin dimasukkan: "))

    # Read the document texts one by one.
    documents = []
    for i in range(n):
        teks = input(f"Masukkan teks untuk dokumen ke-{i+1}: ")
        documents.append(teks)

    print("\n=== Dokumen yang Dimasukkan ===")
    for i, doc in enumerate(documents):
        print(f"Doc {i+1}: {doc}")

    # Tokenization stage.
    tokenized_docs = tokenize_documents(documents)

    print("\n=== Hasil Tokenisasi ===")
    for i, tokens in enumerate(tokenized_docs):
        print(f"Doc {i+1}: {tokens}")

    # Corpus: all words from all documents.
    corpus_all = build_corpus(tokenized_docs)

    print("\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===")
    print(corpus_all)
    print(f"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}")

    # Vocabulary: computed once, then shown both as a raw list and as a
    # numbered listing (the output of both sections is kept as before).
    vocabulary = build_vocabulary(corpus_all)

    print("\n=== Vocabulary (Kata Unik) ===")
    print(vocabulary)
    print(f"Jumlah kata unik (vocabulary size): {len(vocabulary)}")

    print("\n=== Vocabulary (Kata Unik) ===")
    for idx, word in enumerate(vocabulary, start=1):
        print(f"{idx:>2}. {word}")
    print(f"\nJumlah kata unik (vocabulary size): {len(vocabulary)}")

    # Numeric representation (BoW matrix).
    df_bow = build_bow_matrix(tokenized_docs, vocabulary)

    print("\n=== Matriks Bag of Words ===")
    print(df_bow)

    # Word-frequency table (totals across all documents).
    word_frequencies = word_frequency_table(df_bow)

    print("\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===")
    print(word_frequencies)
    # NOTE(review): this prints the number of rows in the table (i.e. the
    # number of distinct words), not a frequency; label kept unchanged.
    print(f"Frekuensi kata: {len(word_frequencies)}")


if __name__ == "__main__":
    main()