diff --git a/data_preparation_&_preprocessing.ipynb b/data_preparation_&_preprocessing.ipynb
new file mode 100644
index 0000000..28d0c8b
--- /dev/null
+++ b/data_preparation_&_preprocessing.ipynb
@@ -0,0 +1,581 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "0a533e2e-9fee-4f5a-a4a2-2e3faa18d85a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: requests in c:\\users\\rosyad\\lib\\site-packages (2.32.3)Note: you may need to restart the kernel to use updated packages.\n",
+ "\n",
+ "Requirement already satisfied: pandas in c:\\users\\rosyad\\lib\\site-packages (2.2.3)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\rosyad\\lib\\site-packages (from requests) (3.3.2)\n",
+ "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\rosyad\\lib\\site-packages (from requests) (3.7)\n",
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\rosyad\\lib\\site-packages (from requests) (2.3.0)\n",
+ "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\rosyad\\lib\\site-packages (from requests) (2025.4.26)\n",
+ "Requirement already satisfied: numpy>=1.26.0 in c:\\users\\rosyad\\lib\\site-packages (from pandas) (2.1.3)\n",
+ "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\rosyad\\lib\\site-packages (from pandas) (2.9.0.post0)\n",
+ "Requirement already satisfied: pytz>=2020.1 in c:\\users\\rosyad\\lib\\site-packages (from pandas) (2024.1)\n",
+ "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\rosyad\\lib\\site-packages (from pandas) (2025.2)\n",
+ "Requirement already satisfied: six>=1.5 in c:\\users\\rosyad\\lib\\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Install the HTTP client and DataFrame library into the running kernel's environment.\n",
+    "%pip install requests pandas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "3cbceaf7-0932-4f21-9cac-6fb2ecb6e37c",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Politik: 50\n"
+ ]
+ }
+ ],
+ "source": [
+    "import os\n",
+    "\n",
+    "import requests\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Read the GNews token from the environment so the secret is never stored in the notebook.\n",
+    "# NOTE(review): the token that used to be hardcoded in this cell has been committed and\n",
+    "# should be revoked/rotated.\n",
+    "API_KEY = os.environ.get(\"GNEWS_API_KEY\")\n",
+    "if not API_KEY:\n",
+    "    raise RuntimeError(\"Set the GNEWS_API_KEY environment variable before running this cell.\")\n",
+    "url = \"https://gnews.io/api/v4/search\"\n",
+    "\n",
+    "rows = []\n",
+    "\n",
+    "# Request result pages 1-5 of Indonesian-language articles matching the politics keywords.\n",
+    "for page in range(1, 6):\n",
+    "    params = {\n",
+    "        \"q\": \"pemerintah OR presiden OR DPR OR kebijakan\",\n",
+    "        \"lang\": \"id\",\n",
+    "        \"max\": 20,\n",
+    "        \"page\": page,\n",
+    "        \"token\": API_KEY\n",
+    "    }\n",
+    "\n",
+    "    # requests has no default timeout; set one so a stalled connection cannot hang the cell.\n",
+    "    r = requests.get(url, params=params, timeout=30)\n",
+    "    r.raise_for_status()  # fail fast on HTTP 4xx/5xx before parsing the body\n",
+    "    data = r.json()\n",
+    "\n",
+    "    for article in data[\"articles\"]:\n",
+    "        # A missing or null title/description would make the concatenation raise TypeError,\n",
+    "        # so fall back to an empty string for either field.\n",
+    "        title = article.get(\"title\") or \"\"\n",
+    "        description = article.get(\"description\") or \"\"\n",
+    "        text = title + \" \" + description\n",
+    "        rows.append({\"text\": text, \"label\": \"Politik\"})\n",
+    "\n",
+    "# Keep at most 100 rows; explicit columns keep the CSV header even when nothing was fetched.\n",
+    "df = pd.DataFrame(rows, columns=[\"text\", \"label\"]).head(100)\n",
+    "df.to_csv(\"berita_politik.csv\", index=False)\n",
+    "print(\"Politik:\", len(df))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "ecad8de9-3f98-4a51-b17b-8c0ca960bf4f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Olahraga: 12\n"
+ ]
+ }
+ ],
+ "source": [
+    "import os\n",
+    "\n",
+    "import requests\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Read the GNews token from the environment so the secret is never stored in the notebook.\n",
+    "# NOTE(review): the token that used to be hardcoded in this cell has been committed and\n",
+    "# should be revoked/rotated.\n",
+    "API_KEY = os.environ.get(\"GNEWS_API_KEY\")\n",
+    "if not API_KEY:\n",
+    "    raise RuntimeError(\"Set the GNEWS_API_KEY environment variable before running this cell.\")\n",
+    "url = \"https://gnews.io/api/v4/search\"\n",
+    "\n",
+    "rows = []\n",
+    "\n",
+    "# Request result pages 1-5 of Indonesian-language articles matching the sports keywords.\n",
+    "for page in range(1, 6):\n",
+    "    params = {\n",
+    "        \"q\": \"sepak bola OR timnas OR liga indonesia OR olahraga\",\n",
+    "        \"lang\": \"id\",\n",
+    "        \"max\": 20,\n",
+    "        \"page\": page,\n",
+    "        \"token\": API_KEY\n",
+    "    }\n",
+    "\n",
+    "    # requests has no default timeout; set one so a stalled connection cannot hang the cell.\n",
+    "    r = requests.get(url, params=params, timeout=30)\n",
+    "    r.raise_for_status()  # fail fast on HTTP 4xx/5xx before parsing the body\n",
+    "    data = r.json()\n",
+    "\n",
+    "    for article in data[\"articles\"]:\n",
+    "        # A missing or null title/description would make the concatenation raise TypeError,\n",
+    "        # so fall back to an empty string for either field.\n",
+    "        title = article.get(\"title\") or \"\"\n",
+    "        description = article.get(\"description\") or \"\"\n",
+    "        text = title + \" \" + description\n",
+    "        rows.append({\"text\": text, \"label\": \"Olahraga\"})\n",
+    "\n",
+    "# Keep at most 100 rows; explicit columns keep the CSV header even when nothing was fetched.\n",
+    "df = pd.DataFrame(rows, columns=[\"text\", \"label\"]).head(100)\n",
+    "df.to_csv(\"berita_olahraga.csv\", index=False)\n",
+    "print(\"Olahraga:\", len(df))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "a3746155-cbcd-4df1-aed7-06db697a4e17",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Teknologi: 50\n"
+ ]
+ }
+ ],
+ "source": [
+    "import os\n",
+    "\n",
+    "import requests\n",
+    "import pandas as pd\n",
+    "\n",
+    "# Read the GNews token from the environment so the secret is never stored in the notebook.\n",
+    "# NOTE(review): the token that used to be hardcoded in this cell has been committed and\n",
+    "# should be revoked/rotated.\n",
+    "API_KEY = os.environ.get(\"GNEWS_API_KEY\")\n",
+    "if not API_KEY:\n",
+    "    raise RuntimeError(\"Set the GNEWS_API_KEY environment variable before running this cell.\")\n",
+    "url = \"https://gnews.io/api/v4/search\"\n",
+    "\n",
+    "rows = []\n",
+    "\n",
+    "# Request result pages 1-5 of Indonesian-language articles matching the technology keywords.\n",
+    "for page in range(1, 6):\n",
+    "    params = {\n",
+    "        \"q\": \"teknologi OR startup OR aplikasi OR AI OR gadget\",\n",
+    "        \"lang\": \"id\",\n",
+    "        \"max\": 20,\n",
+    "        \"page\": page,\n",
+    "        \"token\": API_KEY\n",
+    "    }\n",
+    "\n",
+    "    # requests has no default timeout; set one so a stalled connection cannot hang the cell.\n",
+    "    r = requests.get(url, params=params, timeout=30)\n",
+    "    r.raise_for_status()  # fail fast on HTTP 4xx/5xx before parsing the body\n",
+    "    data = r.json()\n",
+    "\n",
+    "    for article in data[\"articles\"]:\n",
+    "        # A missing or null title/description would make the concatenation raise TypeError,\n",
+    "        # so fall back to an empty string for either field.\n",
+    "        title = article.get(\"title\") or \"\"\n",
+    "        description = article.get(\"description\") or \"\"\n",
+    "        text = title + \" \" + description\n",
+    "        rows.append({\"text\": text, \"label\": \"Teknologi\"})\n",
+    "\n",
+    "# Keep at most 100 rows; explicit columns keep the CSV header even when nothing was fetched.\n",
+    "df = pd.DataFrame(rows, columns=[\"text\", \"label\"]).head(100)\n",
+    "df.to_csv(\"berita_teknologi.csv\", index=False)\n",
+    "print(\"Teknologi:\", len(df))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "c9c6de7e-1dbd-490a-aff4-9e647e1e62b8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Re-read the sports CSV decoding it as Latin-1, then write a UTF-8 copy of it.\n",
+    "# NOTE(review): the fetch cell printed 12 Olahraga rows but the merged data reports 50, so\n",
+    "# berita_olahraga.csv was presumably replaced outside this notebook -- confirm its source\n",
+    "# and real encoding (decoding a UTF-8 file as Latin-1 would garble non-ASCII characters).\n",
+    "df_olahraga = pd.read_csv(\"berita_olahraga.csv\", encoding=\"latin1\")\n",
+    "df_olahraga.to_csv(\"berita_olahraga_utf8.csv\", index=False, encoding=\"utf-8\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "73658334-e371-4975-8f5f-e98f7b3da55d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "label\n",
+ "Politik 50\n",
+ "Olahraga 50\n",
+ "Teknologi 50\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# Stack the three per-category CSVs into one frame. ignore_index=True gives the result a\n",
+    "# fresh 0..n-1 index instead of repeating each file's own row labels.\n",
+    "df = pd.concat([\n",
+    "    pd.read_csv(\"berita_politik.csv\"),\n",
+    "    pd.read_csv(\"berita_olahraga_utf8.csv\"),\n",
+    "    pd.read_csv(\"berita_teknologi.csv\")\n",
+    "], ignore_index=True)\n",
+    "\n",
+    "# Show how many rows each label contributes.\n",
+    "df[\"label\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "56cd79d3-256f-4e89-be56-2d440969c7cf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "label\n",
+ "Politik 50\n",
+ "Olahraga 50\n",
+ "Teknologi 50\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# Count rows per label in the current df (class-balance check before saving).\n",
+    "df[\"label\"].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "85fa5276-a690-4796-ac9c-f49583dcff19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Write the current df to disk as UTF-8 CSV, leaving out the index column.\n",
+    "df.to_csv(\"dataset_berita_final.csv\", index=False, encoding=\"utf-8\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "38aa3a7d-b0d1-4ec4-bbbe-f00393119a6e",
+ "metadata": {},
+ "source": [
+    "## PREPROCESSING"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "b52a028e-24cd-4cb3-9e54-d9479981fb2c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#IMPORT LIBRARY\n",
+ "import pandas as pd\n",
+ "import re"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "fb25cc8c-bf70-4181-85f9-686c8a48ca9e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " text | \n",
+ " label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Dewan Perdamaian Gaza Resmi Dibentuk Trump, Ba... | \n",
+ " Politik | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Puasa 2026 Tanggal Berapa? Ini Jadwal Versi Pe... | \n",
+ " Politik | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Ini Dia Presiden yang Menang Piplres hingga 7 ... | \n",
+ " Politik | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Solidaritas Antar Provinsi: Sumsel Kirim Semba... | \n",
+ " Politik | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Tinjau Lokasi Asap Tambang Pongkor, Adian Mint... | \n",
+ " Politik | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " text label\n",
+ "0 Dewan Perdamaian Gaza Resmi Dibentuk Trump, Ba... Politik\n",
+ "1 Puasa 2026 Tanggal Berapa? Ini Jadwal Versi Pe... Politik\n",
+ "2 Ini Dia Presiden yang Menang Piplres hingga 7 ... Politik\n",
+ "3 Solidaritas Antar Provinsi: Sumsel Kirim Semba... Politik\n",
+ "4 Tinjau Lokasi Asap Tambang Pongkor, Adian Mint... Politik"
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# LOAD DATASET: read the saved CSV back from disk and preview its first rows.\n",
+    "df = pd.read_csv(\"dataset_berita_final.csv\")\n",
+    "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "2600ea09-49fa-45d7-8309-f9f159653d70",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# PREPROCESSING FUNCTION\n",
+    "def preprocess_text(text):\n",
+    "    \"\"\"Normalize one raw news text into lowercase, letters-only words.\n",
+    "\n",
+    "    Lowercases the input, deletes URLs, replaces every character that is not\n",
+    "    an ASCII letter or whitespace with a space, and collapses whitespace runs.\n",
+    "\n",
+    "    Args:\n",
+    "        text: Raw text. Non-string values (for example the NaN pandas uses for\n",
+    "            an empty CSV cell, or None) are treated as empty text.\n",
+    "\n",
+    "    Returns:\n",
+    "        The cleaned string, or \"\" when the input is not a string or nothing\n",
+    "        remains after cleaning.\n",
+    "    \"\"\"\n",
+    "    # Missing values arrive as float NaN / None and have no .lower(); map them to empty text.\n",
+    "    if not isinstance(text, str):\n",
+    "        return \"\"\n",
+    "    text = text.lower()  # lowercase\n",
+    "    text = re.sub(r\"http\\S+\", \"\", text)   # remove URLs\n",
+    "    text = re.sub(r\"[^a-zA-Z\\s]\", \" \", text)  # replace digits & symbols with spaces\n",
+    "    text = re.sub(r\"\\s+\", \" \", text).strip()  # collapse extra whitespace\n",
+    "    return text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "01164d05-2aaa-4fa6-9664-a86c35bcc6d5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " text | \n",
+ " clean_text | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Dewan Perdamaian Gaza Resmi Dibentuk Trump, Ba... | \n",
+ " dewan perdamaian gaza resmi dibentuk trump bak... | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Puasa 2026 Tanggal Berapa? Ini Jadwal Versi Pe... | \n",
+ " puasa tanggal berapa ini jadwal versi pemerint... | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Ini Dia Presiden yang Menang Piplres hingga 7 ... | \n",
+ " ini dia presiden yang menang piplres hingga ka... | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Solidaritas Antar Provinsi: Sumsel Kirim Semba... | \n",
+ " solidaritas antar provinsi sumsel kirim sembak... | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Tinjau Lokasi Asap Tambang Pongkor, Adian Mint... | \n",
+ " tinjau lokasi asap tambang pongkor adian minta... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " text \\\n",
+ "0 Dewan Perdamaian Gaza Resmi Dibentuk Trump, Ba... \n",
+ "1 Puasa 2026 Tanggal Berapa? Ini Jadwal Versi Pe... \n",
+ "2 Ini Dia Presiden yang Menang Piplres hingga 7 ... \n",
+ "3 Solidaritas Antar Provinsi: Sumsel Kirim Semba... \n",
+ "4 Tinjau Lokasi Asap Tambang Pongkor, Adian Mint... \n",
+ "\n",
+ " clean_text \n",
+ "0 dewan perdamaian gaza resmi dibentuk trump bak... \n",
+ "1 puasa tanggal berapa ini jadwal versi pemerint... \n",
+ "2 ini dia presiden yang menang piplres hingga ka... \n",
+ "3 solidaritas antar provinsi sumsel kirim sembak... \n",
+ "4 tinjau lokasi asap tambang pongkor adian minta... "
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# APPLY TO DATASET: run preprocess_text on every value of \"text\", store the result in a\n",
+    "# \"clean_text\" column, then preview raw and cleaned text side by side.\n",
+    "df[\"clean_text\"] = df[\"text\"].apply(preprocess_text)\n",
+    "df[[\"text\", \"clean_text\"]].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "9c9b6282-2b86-4a80-b449-4e7d5c99338f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Requirement already satisfied: Sastrawi in c:\\users\\rosyad\\lib\\site-packages (1.0.1)Note: you may need to restart the kernel to use updated packages.\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+    "# INSTALL SASTRAWI into the running kernel's environment.\n",
+    "%pip install Sastrawi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "ce7659a9-c512-470b-a3d5-8ccbd0cf985e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# IMPORT & STOPWORD: build a stop-word remover from Sastrawi's factory with default settings.\n",
+    "from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory\n",
+    "\n",
+    "factory = StopWordRemoverFactory()\n",
+    "stopword_remover = factory.create_stop_word_remover()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "30ae25a0-8d86-4052-879c-a57fa466741f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 dewan perdamaian gaza resmi dibentuk trump bak...\n",
+ "1 puasa tanggal berapa jadwal versi pemerintah n...\n",
+ "2 dia presiden menang piplres hingga kali yoweri...\n",
+ "3 solidaritas antar provinsi sumsel kirim sembak...\n",
+ "4 tinjau lokasi asap tambang pongkor adian minta...\n",
+ "Name: clean_text, dtype: object"
+ ]
+ },
+ "execution_count": 34,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# APPLY: pass each clean_text value through the stop-word remover, overwriting the column.\n",
+    "# Re-running this cell feeds the already-filtered text through the remover again.\n",
+    "df[\"clean_text\"] = df[\"clean_text\"].apply(stopword_remover.remove)\n",
+    "df[\"clean_text\"].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "c50305b0-5dce-4d86-ad36-76e44d002c79",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# IMPORT STEMMER: build a stemmer from Sastrawi's factory with default settings.\n",
+    "from Sastrawi.Stemmer.StemmerFactory import StemmerFactory\n",
+    "\n",
+    "stemmer = StemmerFactory().create_stemmer()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "fb5ddd48-6615-4cfa-9785-6a377d4325c6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 dewan damai gaza resmi bentuk trump bakal jadi...\n",
+ "1 puasa tanggal berapa jadwal versi perintah nu ...\n",
+ "2 dia presiden menang piplres hingga kali yoweri...\n",
+ "3 solidaritas antar provinsi sumsel kirim sembak...\n",
+ "4 tinjau lokasi asap tambang pongkor adi minta a...\n",
+ "Name: clean_text, dtype: object"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# APPLY: stem each clean_text value, overwriting the column with the stemmed text.\n",
+    "# Re-running this cell stems the already-stemmed text again.\n",
+    "df[\"clean_text\"] = df[\"clean_text\"].apply(stemmer.stem)\n",
+    "df[\"clean_text\"].head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "dbb677b1-3126-4389-8a49-10df9cae83b5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# Write the current df (all columns, including clean_text) as UTF-8 CSV without the index.\n",
+    "df.to_csv(\"dataset_berita_preprocessed.csv\", index=False, encoding=\"utf-8\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}