167 lines
5.9 KiB
Plaintext
167 lines
5.9 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "7c7601d6-3c91-453e-8c29-706528237596",
|
||
"metadata": {
|
||
"scrolled": true,
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "7c7601d6-3c91-453e-8c29-706528237596",
|
||
"outputId": "df473be6-c537-431b-8bc9-66b1ca1d64b1"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"Teks sumber:\n",
|
||
"\n",
|
||
"PT Maju Jaya resmi meluncurkan produk terbaru mereka pada 12 Agustus 2023\n",
|
||
"di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan\n",
|
||
"disaksikan oleh ratusan undangan.\n",
|
||
"\n",
|
||
"\n",
|
||
"HASIL INFORMATION EXTRACTION:\n",
|
||
"EventType: Product Launch\n",
|
||
"Organization: PT Maju Jaya\n",
|
||
"LaunchDateOriginal: 12 Agustus 2023\n",
|
||
"LaunchDateISO: 2023-08-12\n",
|
||
"Location: Jakarta\n",
|
||
"SourceText: PT Maju Jaya resmi meluncurkan produk terbaru mereka pada 12 Agustus 2023 di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan disaksikan oleh ratusan undangan.\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# =========================\n",
|
||
"# NLP – Information Extraction (MODIFIED VERSION)\n",
|
||
"# =========================\n",
|
||
"\n",
|
||
"import re\n",
|
||
"import nltk\n",
|
||
"from datetime import datetime\n",
|
||
"\n",
|
||
"# =========================\n",
|
||
"# 1. DATA TEKS (DIMODIFIKASI)\n",
|
||
"# =========================\n",
|
"# Raw announcement text; the triple-quoted literal keeps its leading and trailing newline.\n",
"text = \"\"\"\n",
"PT Maju Jaya resmi meluncurkan produk terbaru mereka pada 12 Agustus 2023\n",
"di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan\n",
"disaksikan oleh ratusan undangan.\n",
"\"\"\"\n",
"\n",
"# Echo the unmodified input so the extracted fields can be checked against it.\n",
"print(\"Teks sumber:\", text, sep=\"\\n\")\n",
|
||
"\n",
|
||
"# =========================\n",
|
||
"# 2. PREPROCESSING TAMBAHAN\n",
|
||
"# =========================\n",
|
"def clean_text(text):\n",
"    \"\"\"Return ``text`` trimmed, with every run of whitespace collapsed to one space.\"\"\"\n",
"    trimmed = text.strip()\n",
"    return re.sub(r\"\\s+\", \" \", trimmed)\n",
|
||
"\n",
|
||
"text = clean_text(text)\n",
|
||
"\n",
|
||
"# =========================\n",
|
||
"# 3. EKSTRAKSI TANGGAL (LEBIH DARI 1 FORMAT)\n",
|
||
"# =========================\n",
|
"# Supported surface forms: Indonesian long dates (\"12 Agustus 2023\") and ISO dates.\n",
"# The word-boundary anchors stop a match from starting or ending inside a longer\n",
"# digit run, e.g. \"2012 Agustus 2023\" must not yield \"12 Agustus 2023\".\n",
"date_patterns = [\n",
"    r\"\\b\\d{1,2}\\s(?:Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\\s\\d{4}\\b\",\n",
"    r\"\\b\\d{4}-\\d{2}-\\d{2}\\b\",\n",
"]\n",
"\n",
"# Matches are grouped by pattern (long dates first), not by position in the text.\n",
"dates = [found for pattern in date_patterns for found in re.findall(pattern, text)]\n",
|
||
"\n",
|
||
"# =========================\n",
|
||
"# 4. NORMALISASI TANGGAL\n",
|
||
"# =========================\n",
|
"# Indonesian month name -> zero-padded month number.\n",
"bulan_map = {\n",
"    \"Januari\": \"01\", \"Februari\": \"02\", \"Maret\": \"03\",\n",
"    \"April\": \"04\", \"Mei\": \"05\", \"Juni\": \"06\",\n",
"    \"Juli\": \"07\", \"Agustus\": \"08\", \"September\": \"09\",\n",
"    \"Oktober\": \"10\", \"November\": \"11\", \"Desember\": \"12\"\n",
"}\n",
"\n",
"def normalize_date(date_str):\n",
"    \"\"\"Convert an extracted date string to ISO 8601 (``YYYY-MM-DD``).\n",
"\n",
"    Accepts both formats produced by ``date_patterns``: the Indonesian long\n",
"    form (``\"12 Agustus 2023\"``) and strings that are already ISO formatted,\n",
"    which are returned unchanged.\n",
"\n",
"    Args:\n",
"        date_str: Date text as found in the source document.\n",
"\n",
"    Returns:\n",
"        The ISO date string, or ``None`` when ``date_str`` is not a string, has\n",
"        fewer than three parts, uses an unknown month name, or has a\n",
"        non-numeric day or year.\n",
"    \"\"\"\n",
"    if not isinstance(date_str, str):\n",
"        return None\n",
"    candidate = date_str.strip()\n",
"    # ISO dates need no conversion; previously they fell through to None.\n",
"    if re.fullmatch(r\"\\d{4}-\\d{2}-\\d{2}\", candidate):\n",
"        return candidate\n",
"    parts = candidate.split()\n",
"    if len(parts) < 3:\n",
"        return None\n",
"    day, month_name, year = parts[:3]\n",
"    month = bulan_map.get(month_name)\n",
"    if month is None or not (day.isdigit() and year.isdigit()):\n",
"        return None\n",
"    return f\"{year}-{month}-{day.zfill(2)}\"\n",
|
||
"\n",
|
||
"normalized_dates = [normalize_date(d) for d in dates]\n",
|
||
"\n",
|
||
"# =========================\n",
|
||
"# 5. EKSTRAKSI LOKASI (RULE-BASED)\n",
|
||
"# =========================\n",
|
"# Location: the first capitalised word after the standalone preposition \"di\".\n",
"# The word boundary keeps the rule from firing on words that merely end in\n",
"# \"di\" (\"menjadi Besar\" must not produce the location \"Besar\").\n",
"location_pattern = r\"\\bdi\\s([A-Z][a-zA-Z]+)\"\n",
"location_match = re.search(location_pattern, text)\n",
"location = location_match.group(1) if location_match else None\n",
"\n",
"# =========================\n",
"# 6. ORGANIZATION EXTRACTION (MODIFIED)\n",
"# =========================\n",
"# Company names of the form \"PT <Word> <Word>\" with both words capitalised;\n",
"# the word boundary requires \"PT\" to be a token of its own.\n",
"org_pattern = r\"\\bPT\\s[A-Z][a-zA-Z]+\\s[A-Z][a-zA-Z]+\"\n",
"organization = re.findall(org_pattern, text)\n",
|
||
"\n",
|
||
"# =========================\n",
|
||
"# 7. EVENT TEMPLATE (LEBIH LENGKAP)\n",
|
||
"# =========================\n",
|
"# One flat record for the detected event; each field keeps the first hit of its\n",
"# extractor, or None when that extractor found nothing.\n",
"event_template = dict(\n",
"    EventType=\"Product Launch\",\n",
"    Organization=organization[0] if organization else None,\n",
"    LaunchDateOriginal=dates[0] if dates else None,\n",
"    LaunchDateISO=normalized_dates[0] if normalized_dates else None,\n",
"    Location=location,\n",
"    SourceText=text,\n",
")\n",
"\n",
"# =========================\n",
"# 8. OUTPUT\n",
"# =========================\n",
"print(\"\\nHASIL INFORMATION EXTRACTION:\")\n",
"for field, extracted in event_template.items():\n",
"    print(field, extracted, sep=\": \")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "1e128113-af1e-45a1-8586-48c4acf578b4",
|
||
"metadata": {
|
||
"id": "1e128113-af1e-45a1-8586-48c4acf578b4"
|
||
},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.12.2"
|
||
},
|
||
"colab": {
|
||
"provenance": []
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
} |