# =========================
# NLP – Information Extraction (MODIFIED VERSION)
# =========================
# Rule-based extraction of a "product launch" event (organization, date,
# location) from a short Indonesian news snippet using regular expressions.

import re
from datetime import datetime
from typing import Optional

try:
    import nltk  # imported by the original notebook; nothing below uses it
except ImportError:
    # Keep the cell runnable on a fresh kernel where NLTK is not installed.
    nltk = None

# =========================
# 1. TEXT DATA (MODIFIED)
# =========================
text = """
PT Maju Jaya resmi meluncurkan produk terbaru mereka pada 12 Agustus 2023
di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan
disaksikan oleh ratusan undangan.
"""

print("Teks sumber:")
print(text)


# =========================
# 2. ADDITIONAL PREPROCESSING
# =========================
def clean_text(text: str) -> str:
    """Trim ``text`` and collapse every run of whitespace into one space.

    Args:
        text: Raw input string; may span several lines.

    Returns:
        The text on a single line with single spaces between tokens.
    """
    return re.sub(r"\s+", " ", text.strip())


# From here on ``text`` is the cleaned, single-line version; the regexes below
# use a single ``\s`` between tokens and rely on that.
text = clean_text(text)

# =========================
# 3. DATE EXTRACTION (MORE THAN ONE FORMAT)
# =========================
# Indonesian month name -> two-digit month number. Defined before the patterns
# because the month alternation in the first pattern is built from its keys.
bulan_map = {
    "Januari": "01", "Februari": "02", "Maret": "03",
    "April": "04", "Mei": "05", "Juni": "06",
    "Juli": "07", "Agustus": "08", "September": "09",
    "Oktober": "10", "November": "11", "Desember": "12"
}

# The (?<!\d) / (?!\d) guards stop a pattern from matching a slice of a longer
# digit run (e.g. "12 Agustus 2023" inside "112 Agustus 20234").
date_patterns = [
    r"(?<!\d)\d{1,2}\s(?:" + "|".join(bulan_map) + r")\s\d{4}(?!\d)",
    r"(?<!\d)\d{4}-\d{2}-\d{2}(?!\d)",
]


def extract_dates(source: str, patterns: Optional[list] = None) -> list:
    """Return all date-like substrings of ``source`` in order of appearance.

    Args:
        source: Text to scan.
        patterns: Regex strings to try; defaults to ``date_patterns``.

    Returns:
        The matched substrings, sorted by where they start in ``source`` so
        that element 0 is the first date mentioned, whichever pattern found it.
    """
    active_patterns = date_patterns if patterns is None else patterns
    hits = []
    for pattern in active_patterns:
        hits.extend(
            (match.start(), match.group(0))
            for match in re.finditer(pattern, source)
        )
    hits.sort(key=lambda hit: hit[0])
    return [value for _, value in hits]


dates = extract_dates(text)


# =========================
# 4. DATE NORMALIZATION
# =========================
def normalize_date(date_str) -> Optional[str]:
    """Convert an extracted date string to ``YYYY-MM-DD``.

    Accepts both formats produced by ``date_patterns``: "D Month YYYY" with an
    Indonesian month name from ``bulan_map``, and strings that already are
    ``YYYY-MM-DD``.

    Args:
        date_str: The date string to normalize.

    Returns:
        The normalized date string, or ``None`` when the input is not a
        string, has an unknown month name or non-numeric parts, or does not
        denote a real calendar date (e.g. "31 Februari 2023").
    """
    if not isinstance(date_str, str):
        return None

    candidate = date_str.strip()
    iso_match = re.fullmatch(r"(\d{4})-(\d{2})-(\d{2})", candidate)
    try:
        if iso_match:
            year, month, day = iso_match.groups()
        else:
            # Unpacking raises ValueError when fewer than three parts exist.
            day, month_name, year = candidate.split()[:3]
            month = bulan_map[month_name]
        # Constructing a datetime raises ValueError for impossible dates.
        datetime(int(year), int(month), int(day))
    except (KeyError, ValueError):
        return None
    return f"{year}-{month}-{day.zfill(2)}"


normalized_dates = [normalize_date(d) for d in dates]

# =========================
# 5. LOCATION EXTRACTION (RULE-BASED)
# =========================
# The leading \b makes "di" match only as a standalone word; without it the
# tail of words such as "menjadi" is taken for the preposition.
location_pattern = r"\bdi\s([A-Z][a-zA-Z]+)"
location_match = re.search(location_pattern, text)
location = location_match.group(1) if location_match else None

# =========================
# 6. ORGANIZATION EXTRACTION (MODIFIED)
# =========================
# Matches "PT" as a standalone token followed by exactly two capitalized words.
org_pattern = r"\bPT\s[A-Z][a-zA-Z]+\s[A-Z][a-zA-Z]+"
organization = re.findall(org_pattern, text)

# =========================
# 7. EVENT TEMPLATE (MORE COMPLETE)
# =========================
# Only the first organization and the first date are used; a field is None
# when nothing was extracted for it.
event_template = {
    "EventType": "Product Launch",
    "Organization": organization[0] if organization else None,
    "LaunchDateOriginal": dates[0] if dates else None,
    "LaunchDateISO": normalized_dates[0] if normalized_dates else None,
    "Location": location,
    "SourceText": text
}

# =========================
# 8. OUTPUT
# =========================
print("\nHASIL INFORMATION EXTRACTION:")
for key, value in event_template.items():
    print(f"{key}: {value}")