Upload files to "/"

This commit is contained in:
202210715061 WILDANUL JANNAH 2026-01-21 23:31:55 +07:00
parent 890a0e9e2d
commit 7b47fc5b27
3 changed files with 609 additions and 0 deletions

28
app (1).py Normal file
View File

@ -0,0 +1,28 @@
import streamlit as st
import joblib
import re
# Load model & vectorizer
@st.cache_resource
def _load_artifacts():
    """Load the trained classifier and the fitted vectorizer from disk once.

    Streamlit re-executes the whole script on every widget interaction, so
    loading the two pickle files at module level repeats the disk read and
    deserialization on each rerun. Caching the loaded objects as a resource
    keeps a single copy alive for the lifetime of the server process.

    Returns:
        A ``(model, vectorizer)`` tuple, in that order, as returned by
        ``joblib.load`` for "model_nb.pkl" and "tfidf_vectorizer.pkl".
    """
    # NOTE: joblib.load unpickles arbitrary Python objects; only deploy
    # artifact files that were produced by a trusted training run.
    return joblib.load("model_nb.pkl"), joblib.load("tfidf_vectorizer.pkl")


model, vectorizer = _load_artifacts()

# Page header, a short instruction, and the free-text box that holds the
# article to classify.
st.title("📰 Klasifikasi Topik Berita (NLP)")
st.write("Masukkan teks berita berbahasa Indonesia")
text = st.text_area("Teks Berita", height=200)
def preprocess_text(text):
    """Normalize raw article text before it is vectorized.

    The text is lowercased and then passed through three regex
    substitutions, in order: tokens starting with "http" are removed, every
    character that is neither an ASCII letter nor whitespace is replaced by
    a space, and runs of whitespace are collapsed to a single space. Leading
    and trailing whitespace is stripped from the result.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; empty when the input contains no letters.
    """
    # Order matters: URLs must go before punctuation is blanked out,
    # otherwise their fragments would survive as separate words.
    substitutions = (
        (r"http\S+", ""),
        (r"[^a-zA-Z\s]", " "),
        (r"\s+", " "),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
# Classify the entered text when the user presses the button.
if st.button("Klasifikasikan"):
    # NOTE(review): the data-preparation notebook additionally applies
    # Sastrawi stopword removal and stemming after this cleaning step. If the
    # vectorizer was fitted on that output, the same steps are needed here to
    # avoid a training/serving mismatch -- confirm against the training code.
    clean_text = preprocess_text(text)

    if not text.strip():
        st.warning("Teks tidak boleh kosong!")
    elif not clean_text:
        # The input held only digits, symbols or URLs, so nothing is left
        # after cleaning; a prediction made from it would not reflect the
        # input, so warn instead of reporting a topic.
        st.warning("Teks tidak mengandung kata yang dapat diklasifikasikan!")
    else:
        # transform() expects an iterable of documents, hence the
        # single-item list; predict() returns one label per document.
        text_tfidf = vectorizer.transform([clean_text])
        prediction = model.predict(text_tfidf)[0]
        st.success(f"Prediksi Topik: **{prediction}**")

View File

@ -0,0 +1,581 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "0a533e2e-9fee-4f5a-a4a2-2e3faa18d85a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: requests in c:\\users\\rosyad\\lib\\site-packages (2.32.3)Note: you may need to restart the kernel to use updated packages.\n",
"\n",
"Requirement already satisfied: pandas in c:\\users\\rosyad\\lib\\site-packages (2.2.3)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\rosyad\\lib\\site-packages (from requests) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\rosyad\\lib\\site-packages (from requests) (3.7)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\rosyad\\lib\\site-packages (from requests) (2.3.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\rosyad\\lib\\site-packages (from requests) (2025.4.26)\n",
"Requirement already satisfied: numpy>=1.26.0 in c:\\users\\rosyad\\lib\\site-packages (from pandas) (2.1.3)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\rosyad\\lib\\site-packages (from pandas) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\rosyad\\lib\\site-packages (from pandas) (2024.1)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\rosyad\\lib\\site-packages (from pandas) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\rosyad\\lib\\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n"
]
}
],
"source": [
"pip install requests pandas"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "3cbceaf7-0932-4f21-9cac-6fb2ecb6e37c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Politik: 50\n"
]
}
],
"source": [
"import requests\n",
"import pandas as pd\n",
"\n",
"API_KEY = \"ef2ecb89fbe0f7a22d4d075da633640c\"\n",
"url = \"https://gnews.io/api/v4/search\"\n",
"\n",
"rows = []\n",
"\n",
"for page in range(1, 6):\n",
" params = {\n",
" \"q\": \"pemerintah OR presiden OR DPR OR kebijakan\",\n",
" \"lang\": \"id\",\n",
" \"max\": 20,\n",
" \"page\": page,\n",
" \"token\": API_KEY\n",
" }\n",
"\n",
" r = requests.get(url, params=params)\n",
" data = r.json()\n",
"\n",
" for article in data[\"articles\"]:\n",
" text = article[\"title\"] + \" \" + article[\"description\"]\n",
" rows.append({\"text\": text, \"label\": \"Politik\"})\n",
"\n",
"df = pd.DataFrame(rows).head(100)\n",
"df.to_csv(\"berita_politik.csv\", index=False)\n",
"print(\"Politik:\", len(df))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "ecad8de9-3f98-4a51-b17b-8c0ca960bf4f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Olahraga: 12\n"
]
}
],
"source": [
"import requests\n",
"import pandas as pd\n",
"\n",
"API_KEY = \"ef2ecb89fbe0f7a22d4d075da633640c\"\n",
"url = \"https://gnews.io/api/v4/search\"\n",
"\n",
"rows = []\n",
"\n",
"for page in range(1, 6):\n",
" params = {\n",
" \"q\": \"sepak bola OR timnas OR liga indonesia OR olahraga\",\n",
" \"lang\": \"id\",\n",
" \"max\": 20,\n",
" \"page\": page,\n",
" \"token\": API_KEY\n",
" }\n",
"\n",
" r = requests.get(url, params=params)\n",
" data = r.json()\n",
"\n",
" for article in data[\"articles\"]:\n",
" text = article[\"title\"] + \" \" + article[\"description\"]\n",
" rows.append({\"text\": text, \"label\": \"Olahraga\"})\n",
"\n",
"df = pd.DataFrame(rows).head(100)\n",
"df.to_csv(\"berita_olahraga.csv\", index=False)\n",
"print(\"Olahraga:\", len(df))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a3746155-cbcd-4df1-aed7-06db697a4e17",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Teknologi: 50\n"
]
}
],
"source": [
"import requests\n",
"import pandas as pd\n",
"\n",
"API_KEY = \"ef2ecb89fbe0f7a22d4d075da633640c\"\n",
"url = \"https://gnews.io/api/v4/search\"\n",
"\n",
"rows = []\n",
"\n",
"for page in range(1, 6):\n",
" params = {\n",
" \"q\": \"teknologi OR startup OR aplikasi OR AI OR gadget\",\n",
" \"lang\": \"id\",\n",
" \"max\": 20,\n",
" \"page\": page,\n",
" \"token\": API_KEY\n",
" }\n",
"\n",
" r = requests.get(url, params=params)\n",
" data = r.json()\n",
"\n",
" for article in data[\"articles\"]:\n",
" text = article[\"title\"] + \" \" + article[\"description\"]\n",
" rows.append({\"text\": text, \"label\": \"Teknologi\"})\n",
"\n",
"df = pd.DataFrame(rows).head(100)\n",
"df.to_csv(\"berita_teknologi.csv\", index=False)\n",
"print(\"Teknologi:\", len(df))"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "c9c6de7e-1dbd-490a-aff4-9e647e1e62b8",
"metadata": {},
"outputs": [],
"source": [
"df_olahraga = pd.read_csv(\"berita_olahraga.csv\", encoding=\"latin1\")\n",
"df_olahraga.to_csv(\"berita_olahraga_utf8.csv\", index=False, encoding=\"utf-8\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "73658334-e371-4975-8f5f-e98f7b3da55d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"label\n",
"Politik 50\n",
"Olahraga 50\n",
"Teknologi 50\n",
"Name: count, dtype: int64"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.concat([\n",
" pd.read_csv(\"berita_politik.csv\"),\n",
" pd.read_csv(\"berita_olahraga_utf8.csv\"),\n",
" pd.read_csv(\"berita_teknologi.csv\")\n",
"])\n",
"\n",
"df[\"label\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "56cd79d3-256f-4e89-be56-2d440969c7cf",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"label\n",
"Politik 50\n",
"Olahraga 50\n",
"Teknologi 50\n",
"Name: count, dtype: int64"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[\"label\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "85fa5276-a690-4796-ac9c-f49583dcff19",
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"dataset_berita_final.csv\", index=False, encoding=\"utf-8\")"
]
},
{
"cell_type": "markdown",
"id": "38aa3a7d-b0d1-4ec4-bbbe-f00393119a6e",
"metadata": {},
"source": [
"PREPROCESSING"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "b52a028e-24cd-4cb3-9e54-d9479981fb2c",
"metadata": {},
"outputs": [],
"source": [
"#IMPORT LIBRARY\n",
"import pandas as pd\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "fb25cc8c-bf70-4181-85f9-686c8a48ca9e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Dewan Perdamaian Gaza Resmi Dibentuk Trump, Ba...</td>\n",
" <td>Politik</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Puasa 2026 Tanggal Berapa? Ini Jadwal Versi Pe...</td>\n",
" <td>Politik</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Ini Dia Presiden yang Menang Piplres hingga 7 ...</td>\n",
" <td>Politik</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Solidaritas Antar Provinsi: Sumsel Kirim Semba...</td>\n",
" <td>Politik</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Tinjau Lokasi Asap Tambang Pongkor, Adian Mint...</td>\n",
" <td>Politik</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text label\n",
"0 Dewan Perdamaian Gaza Resmi Dibentuk Trump, Ba... Politik\n",
"1 Puasa 2026 Tanggal Berapa? Ini Jadwal Versi Pe... Politik\n",
"2 Ini Dia Presiden yang Menang Piplres hingga 7 ... Politik\n",
"3 Solidaritas Antar Provinsi: Sumsel Kirim Semba... Politik\n",
"4 Tinjau Lokasi Asap Tambang Pongkor, Adian Mint... Politik"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#LOAD DATASET\n",
"df = pd.read_csv(\"dataset_berita_final.csv\")\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "2600ea09-49fa-45d7-8309-f9f159653d70",
"metadata": {},
"outputs": [],
"source": [
"#FUNGSI PREPROCESSING\n",
"def preprocess_text(text):\n",
" text = text.lower() # lowercase\n",
" text = re.sub(r\"http\\S+\", \"\", text) # hapus URL\n",
" text = re.sub(r\"[^a-zA-Z\\s]\", \" \", text) # hapus angka & simbol\n",
" text = re.sub(r\"\\s+\", \" \", text).strip() # hapus spasi berlebih\n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "01164d05-2aaa-4fa6-9664-a86c35bcc6d5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>clean_text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Dewan Perdamaian Gaza Resmi Dibentuk Trump, Ba...</td>\n",
" <td>dewan perdamaian gaza resmi dibentuk trump bak...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Puasa 2026 Tanggal Berapa? Ini Jadwal Versi Pe...</td>\n",
" <td>puasa tanggal berapa ini jadwal versi pemerint...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Ini Dia Presiden yang Menang Piplres hingga 7 ...</td>\n",
" <td>ini dia presiden yang menang piplres hingga ka...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Solidaritas Antar Provinsi: Sumsel Kirim Semba...</td>\n",
" <td>solidaritas antar provinsi sumsel kirim sembak...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Tinjau Lokasi Asap Tambang Pongkor, Adian Mint...</td>\n",
" <td>tinjau lokasi asap tambang pongkor adian minta...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" text \\\n",
"0 Dewan Perdamaian Gaza Resmi Dibentuk Trump, Ba... \n",
"1 Puasa 2026 Tanggal Berapa? Ini Jadwal Versi Pe... \n",
"2 Ini Dia Presiden yang Menang Piplres hingga 7 ... \n",
"3 Solidaritas Antar Provinsi: Sumsel Kirim Semba... \n",
"4 Tinjau Lokasi Asap Tambang Pongkor, Adian Mint... \n",
"\n",
" clean_text \n",
"0 dewan perdamaian gaza resmi dibentuk trump bak... \n",
"1 puasa tanggal berapa ini jadwal versi pemerint... \n",
"2 ini dia presiden yang menang piplres hingga ka... \n",
"3 solidaritas antar provinsi sumsel kirim sembak... \n",
"4 tinjau lokasi asap tambang pongkor adian minta... "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#TERAPKAN KE DATASET\n",
"df[\"clean_text\"] = df[\"text\"].apply(preprocess_text)\n",
"df[[\"text\", \"clean_text\"]].head()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "9c9b6282-2b86-4a80-b449-4e7d5c99338f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: Sastrawi in c:\\users\\rosyad\\lib\\site-packages (1.0.1)Note: you may need to restart the kernel to use updated packages.\n",
"\n"
]
}
],
"source": [
"#INSTALL SASTRAWI\n",
"pip install Sastrawi"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "ce7659a9-c512-470b-a3d5-8ccbd0cf985e",
"metadata": {},
"outputs": [],
"source": [
"#IMPORT & STOPWORD\n",
"from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory\n",
"\n",
"factory = StopWordRemoverFactory()\n",
"stopword_remover = factory.create_stop_word_remover()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "30ae25a0-8d86-4052-879c-a57fa466741f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 dewan perdamaian gaza resmi dibentuk trump bak...\n",
"1 puasa tanggal berapa jadwal versi pemerintah n...\n",
"2 dia presiden menang piplres hingga kali yoweri...\n",
"3 solidaritas antar provinsi sumsel kirim sembak...\n",
"4 tinjau lokasi asap tambang pongkor adian minta...\n",
"Name: clean_text, dtype: object"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#TERAPKAN\n",
"df[\"clean_text\"] = df[\"clean_text\"].apply(lambda x: stopword_remover.remove(x))\n",
"df[\"clean_text\"].head()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "c50305b0-5dce-4d86-ad36-76e44d002c79",
"metadata": {},
"outputs": [],
"source": [
"#IMPORT STEMMER\n",
"from Sastrawi.Stemmer.StemmerFactory import StemmerFactory\n",
"\n",
"stemmer = StemmerFactory().create_stemmer()"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "fb5ddd48-6615-4cfa-9785-6a377d4325c6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 dewan damai gaza resmi bentuk trump bakal jadi...\n",
"1 puasa tanggal berapa jadwal versi perintah nu ...\n",
"2 dia presiden menang piplres hingga kali yoweri...\n",
"3 solidaritas antar provinsi sumsel kirim sembak...\n",
"4 tinjau lokasi asap tambang pongkor adi minta a...\n",
"Name: clean_text, dtype: object"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#TERAPKAN\n",
"df[\"clean_text\"] = df[\"clean_text\"].apply(lambda x: stemmer.stem(x))\n",
"df[\"clean_text\"].head()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "dbb677b1-3126-4389-8a49-10df9cae83b5",
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"dataset_berita_preprocessed.csv\", index=False, encoding=\"utf-8\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}