Upload files to "/"
This commit is contained in:
parent
ba1061b563
commit
c3e95e66d9
167
Information_Extraction.ipynb
Normal file
167
Information_Extraction.ipynb
Normal file
@ -0,0 +1,167 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "7c7601d6-3c91-453e-8c29-706528237596",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true,
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "7c7601d6-3c91-453e-8c29-706528237596",
|
||||||
|
"outputId": "df473be6-c537-431b-8bc9-66b1ca1d64b1"
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Teks sumber:\n",
|
||||||
|
"\n",
|
||||||
|
"PT Maju Jaya resmi meluncurkan produk terbaru mereka pada 12 Agustus 2023\n",
|
||||||
|
"di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan\n",
|
||||||
|
"disaksikan oleh ratusan undangan.\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"HASIL INFORMATION EXTRACTION:\n",
|
||||||
|
"EventType: Product Launch\n",
|
||||||
|
"Organization: PT Maju Jaya\n",
|
||||||
|
"LaunchDateOriginal: 12 Agustus 2023\n",
|
||||||
|
"LaunchDateISO: 2023-08-12\n",
|
||||||
|
"Location: Jakarta\n",
|
||||||
|
"SourceText: PT Maju Jaya resmi meluncurkan produk terbaru mereka pada 12 Agustus 2023 di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan disaksikan oleh ratusan undangan.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# =========================\n",
|
||||||
|
"# NLP – Information Extraction (MODIFIED VERSION)\n",
|
||||||
|
"# =========================\n",
|
||||||
|
"\n",
|
||||||
|
"import re\n",
|
||||||
|
"import nltk\n",
|
||||||
|
"from datetime import datetime\n",
|
||||||
|
"\n",
|
||||||
# =========================
# 1. SOURCE TEXT (MODIFIED)
# =========================
# Sample Indonesian news snippet used as the input for the extraction steps.
# The leading and trailing newlines are part of the raw value printed below.
text = (
    "\n"
    "PT Maju Jaya resmi meluncurkan produk terbaru mereka pada 12 Agustus 2023\n"
    "di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan\n"
    "disaksikan oleh ratusan undangan.\n"
)

# Echo the raw text under its heading (heading and text on separate lines).
print("Teks sumber:", text, sep="\n")
|
||||||
|
"\n",
|
||||||
# =========================
# 2. ADDITIONAL PREPROCESSING
# =========================
def clean_text(text):
    """Return ``text`` trimmed, with each whitespace run collapsed to one space.

    Args:
        text: The string to normalize.

    Returns:
        The stripped string in which every run of whitespace characters
        (spaces, tabs, newlines, ...) has been replaced by a single space.
    """
    trimmed = text.strip()
    return re.sub(r"\s+", " ", trimmed)
|
||||||
|
"\n",
|
||||||
# Collapse whitespace runs first: the date patterns below expect exactly one
# whitespace character between day, month name and year.
text = clean_text(text)

# =========================
# 3. DATE EXTRACTION (MORE THAN ONE FORMAT)
# =========================
# Supported formats:
#   * "<day> <Indonesian month name> <year>", e.g. "12 Agustus 2023"
#   * ISO-style "YYYY-MM-DD", e.g. "2023-08-12"
# The \b anchors stop a pattern from matching inside a longer run of digits
# (e.g. "112 Agustus 2023" or "12023-08-12").
date_patterns = [
    r"\b\d{1,2}\s(?:Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\s\d{4}\b",
    r"\b\d{4}-\d{2}-\d{2}\b",
]

# Record each match together with its start offset so the final list follows
# the order in which the dates appear in the text, regardless of which pattern
# found them. That makes dates[0] the first date mentioned.
date_hits = []
for pattern in date_patterns:
    date_hits.extend(
        (match.start(), match.group(0)) for match in re.finditer(pattern, text)
    )

dates = [matched for _, matched in sorted(date_hits)]
|
||||||
|
"\n",
|
||||||
# =========================
# 4. DATE NORMALIZATION
# =========================
# Indonesian month name -> zero-padded month number.
bulan_map = {
    "Januari": "01", "Februari": "02", "Maret": "03",
    "April": "04", "Mei": "05", "Juni": "06",
    "Juli": "07", "Agustus": "08", "September": "09",
    "Oktober": "10", "November": "11", "Desember": "12"
}


def normalize_date(date_str):
    """Convert an extracted date string to ISO 8601 form (``YYYY-MM-DD``).

    Two input shapes are understood:

    * ``"<day> <Indonesian month name> <year>"`` such as ``"12 Agustus 2023"``
      (the month name is matched case-insensitively against ``bulan_map``);
    * an already ISO-formatted ``"YYYY-MM-DD"`` string, which is validated and
      returned unchanged.

    Args:
        date_str: The date text to normalize.

    Returns:
        The ISO date string, or ``None`` when ``date_str`` is not a string, is
        in neither supported shape, or does not name a real calendar date
        (for example ``"31 Februari 2023"``).
    """
    if not isinstance(date_str, str):
        return None

    candidate = date_str.strip()

    # ISO input: nothing to convert, but confirm it is a real calendar date.
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", candidate):
        try:
            datetime.strptime(candidate, "%Y-%m-%d")
        except ValueError:
            return None
        return candidate

    parts = candidate.split()
    if len(parts) < 3:
        return None

    # Only the first three tokens are used; anything after them is ignored.
    day, month_name, year = parts[:3]
    month = bulan_map.get(month_name.capitalize())
    if month is None:
        return None

    try:
        # Building a datetime rejects non-numeric parts and impossible dates.
        parsed = datetime(int(year), int(month), int(day))
    except ValueError:
        return None

    return parsed.date().isoformat()
|
||||||
|
"\n",
|
||||||
# Convert every extracted date to ISO form. Entries that normalize_date cannot
# convert come back as None, so this list stays index-aligned with `dates`.
normalized_dates = [normalize_date(d) for d in dates]

# =========================
# 5. LOCATION EXTRACTION (RULE-BASED)
# =========================
# Rule: the first capitalized word that follows the preposition "di".
# The leading \b requires "di" to be a standalone word; without it the pattern
# also fires on words that merely end in "di" (e.g. "menjadi Direktur" would
# yield the location "Direktur").
location_pattern = r"\bdi\s([A-Z][a-zA-Z]+)"
location_match = re.search(location_pattern, text)
location = location_match.group(1) if location_match else None
|
||||||
|
"\n",
|
||||||
# =========================
# 6. ORGANIZATION EXTRACTION (MODIFIED)
# =========================
# Rule: the company prefix "PT" followed by exactly two capitalized words,
# e.g. "PT Maju Jaya".
# The leading \b requires "PT" to start a word; without it the pattern also
# fires on the tail of longer tokens (e.g. "SPT Tahunan Pajak" would yield
# "PT Tahunan Pajak").
org_pattern = r"\bPT\s[A-Z][a-zA-Z]+\s[A-Z][a-zA-Z]+"

# All matches are kept, in order of appearance in the text.
organization = re.findall(org_pattern, text)
|
||||||
|
"\n",
|
||||||
# =========================
# 7. EVENT TEMPLATE (MORE COMPLETE)
# =========================
# Each list-valued extraction contributes only its first entry; an empty list
# leaves the corresponding slot as None.
event_template = dict(
    EventType="Product Launch",
    Organization=(organization[0] if organization else None),
    LaunchDateOriginal=(dates[0] if dates else None),
    LaunchDateISO=(normalized_dates[0] if normalized_dates else None),
    Location=location,
    SourceText=text,
)

# =========================
# 8. OUTPUT
# =========================
# One "Field: value" line per template slot, under a heading preceded by a
# blank line.
print("\nHASIL INFORMATION EXTRACTION:")
for key, value in event_template.items():
    print(key, value, sep=": ")
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "1e128113-af1e-45a1-8586-48c4acf578b4",
|
||||||
|
"metadata": {
|
||||||
|
"id": "1e128113-af1e-45a1-8586-48c4acf578b4"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.2"
|
||||||
|
},
|
||||||
|
"colab": {
|
||||||
|
"provenance": []
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user