Upload files to "/"
This commit is contained in:
parent
0af3226a9f
commit
1dbfae5442
195
information_extraxtion.ipynb
Normal file
195
information_extraxtion.ipynb
Normal file
@ -0,0 +1,195 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "8kSq7ukiTzaw"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"{\n",
|
||||
" \"cells\": [\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"code\",\n",
|
||||
" \"execution_count\": 6,\n",
|
||||
" \"id\": \"7c7601d6-3c91-453e-8c29-706528237596\",\n",
|
||||
" \"metadata\": {\n",
|
||||
" \"scrolled\": true,\n",
|
||||
" \"colab\": {\n",
|
||||
" \"base_uri\": \"https://localhost:8080/\"\n",
|
||||
" },\n",
|
||||
" \"id\": \"7c7601d6-3c91-453e-8c29-706528237596\",\n",
|
||||
" \"outputId\": \"df473be6-c537-431b-8bc9-66b1ca1d64b1\"\n",
|
||||
" },\n",
|
||||
" \"outputs\": [\n",
|
||||
" {\n",
|
||||
" \"output_type\": \"stream\",\n",
|
||||
" \"name\": \"stdout\",\n",
|
||||
" \"text\": [\n",
|
||||
" \"Teks sumber:\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"PT Abadi resmi meluncurkan produk terbaru mereka pada 12 Agustus 2026\\n\",\n",
|
||||
" \"di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan\\n\",\n",
|
||||
" \"disaksikan oleh ratusan undangan.\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"HASIL INFORMATION EXTRACTION:\\n\",\n",
|
||||
" \"EventType: Product Launch\\n\",\n",
|
||||
" \"Organization: PT Abadi\\n\",\n",
|
||||
" \"LaunchDateOriginal: 12 Agustus 2026\\n\",\n",
|
||||
" \"LaunchDateISO: 2026-08-12\\n\",\n",
|
||||
" \"Location: Jakarta\\n\",\n",
|
||||
" \"SourceText: PT Abadi resmi meluncurkan produk terbaru mereka pada 12 Agustus 2026 di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan disaksikan oleh ratusan undangan.\\n\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"source\": [\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"# NLP – Information Extraction (MODIFIED VERSION)\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"import re\\n\",\n",
|
||||
" \"import nltk\\n\",\n",
|
||||
" \"from datetime import datetime\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"# 1. DATA TEKS (DIMODIFIKASI)\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"text = \\\"\\\"\\\"\\n\",\n",
|
||||
" \"PT Abadi resmi meluncurkan produk terbaru mereka pada 12 Agustus 2026\\n\",\n",
|
||||
" \"di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan\\n\",\n",
|
||||
" \"disaksikan oleh ratusan undangan.\\n\",\n",
|
||||
" \"\\\"\\\"\\\"\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"print(\\\"Teks sumber:\\\")\\n\",\n",
|
||||
" \"print(text)\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"# 2. PREPROCESSING TAMBAHAN\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"def clean_text(text):\\n\",\n",
|
||||
" \" text = text.strip()\\n\",\n",
|
||||
" \" text = re.sub(r\\\"\\\\s+\\\", \\\" \\\", text)\\n\",\n",
|
||||
" \" return text\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"text = clean_text(text)\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"# 3. EKSTRAKSI TANGGAL (LEBIH DARI 1 FORMAT)\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"date_patterns = [\\n\",\n",
|
||||
" \" r\\\"\\\\d{1,2}\\\\s(?:Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\\\\s\\\\d{4}\\\",\\n\",\n",
|
||||
" \" r\\\"\\\\d{4}-\\\\d{2}-\\\\d{2}\\\"\\n\",\n",
|
||||
" \"]\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"dates = []\\n\",\n",
|
||||
" \"for pattern in date_patterns:\\n\",\n",
|
||||
" \" dates.extend(re.findall(pattern, text))\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"# 4. NORMALISASI TANGGAL\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"bulan_map = {\\n\",\n",
|
||||
" \" \\\"Januari\\\": \\\"01\\\", \\\"Februari\\\": \\\"02\\\", \\\"Maret\\\": \\\"03\\\",\\n\",\n",
|
||||
" \" \\\"April\\\": \\\"04\\\", \\\"Mei\\\": \\\"05\\\", \\\"Juni\\\": \\\"06\\\",\\n\",\n",
|
||||
" \" \\\"Juli\\\": \\\"07\\\", \\\"Agustus\\\": \\\"08\\\", \\\"September\\\": \\\"09\\\",\\n\",\n",
|
||||
" \" \\\"Oktober\\\": \\\"10\\\", \\\"November\\\": \\\"11\\\", \\\"Desember\\\": \\\"12\\\"\\n\",\n",
|
||||
" \"}\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"def normalize_date(date_str):\\n\",\n",
|
||||
" \" try:\\n\",\n",
|
||||
" \" parts = date_str.split()\\n\",\n",
|
||||
" \" return f\\\"{parts[2]}-{bulan_map[parts[1]]}-{parts[0].zfill(2)}\\\"\\n\",\n",
|
||||
" \" except:\\n\",\n",
|
||||
" \" return None\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"normalized_dates = [normalize_date(d) for d in dates]\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"# 5. EKSTRAKSI LOKASI (RULE-BASED)\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"location_pattern = r\\\"di\\\\s([A-Z][a-zA-Z]+)\\\"\\n\",\n",
|
||||
" \"location_match = re.search(location_pattern, text)\\n\",\n",
|
||||
" \"location = location_match.group(1) if location_match else None\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"# 6. EKSTRAKSI ORGANISASI (MODIFIKASI)\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"org_pattern = r\\\"PT\\\\s[A-Z][a-zA-Z]+\\\\s[A-Z][a-zA-Z]+\\\"\\n\",\n",
|
||||
" \"organization = re.findall(org_pattern, text)\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"# 7. EVENT TEMPLATE (LEBIH LENGKAP)\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"event_template = {\\n\",\n",
|
||||
" \" \\\"EventType\\\": \\\"Product Launch\\\",\\n\",\n",
|
||||
" \" \\\"Organization\\\": organization[0] if organization else None,\\n\",\n",
|
||||
" \" \\\"LaunchDateOriginal\\\": dates[0] if dates else None,\\n\",\n",
|
||||
" \" \\\"LaunchDateISO\\\": normalized_dates[0] if normalized_dates else None,\\n\",\n",
|
||||
" \" \\\"Location\\\": location,\\n\",\n",
|
||||
" \" \\\"SourceText\\\": text\\n\",\n",
|
||||
" \"}\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"# 8. OUTPUT\\n\",\n",
|
||||
" \"# =========================\\n\",\n",
|
||||
" \"print(\\\"\\\\nHASIL INFORMATION EXTRACTION:\\\")\\n\",\n",
|
||||
" \"for key, value in event_template.items():\\n\",\n",
|
||||
" \" print(f\\\"{key}: {value}\\\")\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"code\",\n",
|
||||
" \"execution_count\": 6,\n",
|
||||
" \"id\": \"1e128113-af1e-45a1-8586-48c4acf578b4\",\n",
|
||||
" \"metadata\": {\n",
|
||||
" \"id\": \"1e128113-af1e-45a1-8586-48c4acf578b4\"\n",
|
||||
" },\n",
|
||||
" \"outputs\": [],\n",
|
||||
" \"source\": []\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"metadata\": {\n",
|
||||
" \"kernelspec\": {\n",
|
||||
" \"display_name\": \"Python 3 (ipykernel)\",\n",
|
||||
" \"language\": \"python\",\n",
|
||||
" \"name\": \"python3\"\n",
|
||||
" },\n",
|
||||
" \"language_info\": {\n",
|
||||
" \"codemirror_mode\": {\n",
|
||||
" \"name\": \"ipython\",\n",
|
||||
" \"version\": 3\n",
|
||||
" },\n",
|
||||
" \"file_extension\": \".py\",\n",
|
||||
" \"mimetype\": \"text/x-python\",\n",
|
||||
" \"name\": \"python\",\n",
|
||||
" \"nbconvert_exporter\": \"python\",\n",
|
||||
" \"pygments_lexer\": \"ipython3\",\n",
|
||||
" \"version\": \"3.12.2\"\n",
|
||||
" },\n",
|
||||
" \"colab\": {\n",
|
||||
" \"provenance\": []\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"nbformat\": 4,\n",
|
||||
" \"nbformat_minor\": 5\n",
|
||||
"}"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user