Upload files to "/"
This commit is contained in:
parent
86c32a9499
commit
890a0e9e2d
223
information_extraxtion(Wildanul_Jannah).ipynb
Normal file
223
information_extraxtion(Wildanul_Jannah).ipynb
Normal file
@ -0,0 +1,223 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "ky8yGDf_kyx8"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"{\n",
|
||||
" \"nbformat\": 4,\n",
|
||||
" \"nbformat_minor\": 0,\n",
|
||||
" \"metadata\": {\n",
|
||||
" \"colab\": {\n",
|
||||
" \"provenance\": []\n",
|
||||
" },\n",
|
||||
" \"kernelspec\": {\n",
|
||||
" \"name\": \"python3\",\n",
|
||||
" \"display_name\": \"Python 3\"\n",
|
||||
" },\n",
|
||||
" \"language_info\": {\n",
|
||||
" \"name\": \"python\"\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"cells\": [\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"code\",\n",
|
||||
" \"execution_count\": null,\n",
|
||||
" \"metadata\": {\n",
|
||||
" \"id\": \"8kSq7ukiTzaw\"\n",
|
||||
" },\n",
|
||||
" \"outputs\": [],\n",
|
||||
" \"source\": [\n",
|
||||
" \"{\\n\",\n",
|
||||
" \" \\\"cells\\\": [\\n\",\n",
|
||||
" \" {\\n\",\n",
|
||||
" \" \\\"cell_type\\\": \\\"code\\\",\\n\",\n",
|
||||
" \" \\\"execution_count\\\": 6,\\n\",\n",
|
||||
" \" \\\"id\\\": \\\"7c7601d6-3c91-453e-8c29-706528237596\\\",\\n\",\n",
|
||||
" \" \\\"metadata\\\": {\\n\",\n",
|
||||
" \" \\\"scrolled\\\": true,\\n\",\n",
|
||||
" \" \\\"colab\\\": {\\n\",\n",
|
||||
" \" \\\"base_uri\\\": \\\"https://localhost:8080/\\\"\\n\",\n",
|
||||
" \" },\\n\",\n",
|
||||
" \" \\\"id\\\": \\\"7c7601d6-3c91-453e-8c29-706528237596\\\",\\n\",\n",
|
||||
" \" \\\"outputId\\\": \\\"df473be6-c537-431b-8bc9-66b1ca1d64b1\\\"\\n\",\n",
|
||||
" \" },\\n\",\n",
|
||||
" \" \\\"outputs\\\": [\\n\",\n",
|
||||
" \" {\\n\",\n",
|
||||
" \" \\\"output_type\\\": \\\"stream\\\",\\n\",\n",
|
||||
" \" \\\"name\\\": \\\"stdout\\\",\\n\",\n",
|
||||
" \" \\\"text\\\": [\\n\",\n",
|
||||
" \" \\\"Teks sumber:\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"PT Tempo Grub resmi meluncurkan produk terbaru mereka pada 28 maret 2026\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"di Bandung. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"disaksikan oleh ratusan undangan.\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"HASIL INFORMATION EXTRACTION:\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"EventType: Product Launch\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"Organization: PT Abadi\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"LaunchDateOriginal: 12 Agustus 2026\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"LaunchDateISO: 2026-03-28\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"Location: Bandung\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"SourceText: PT Tempo Grub resmi meluncurkan produk terbaru mereka pada 12 Agustus 2026 di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan disaksikan oleh ratusan undangan.\\\\n\\\"\\n\",\n",
|
||||
" \" ]\\n\",\n",
|
||||
" \" }\\n\",\n",
|
||||
" \" ],\\n\",\n",
|
||||
" \" \\\"source\\\": [\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# NLP – Information Extraction (MODIFIED VERSION)\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"import re\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"import nltk\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"from datetime import datetime\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# 1. DATA TEKS (DIMODIFIKASI)\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"text = \\\\\\\"\\\\\\\"\\\\\\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"PT Tempo Grub resmi meluncurkan produk terbaru mereka pada 28 maret 2026\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"di Bandung. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"disaksikan oleh ratusan undangan.\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\\\\"\\\\\\\"\\\\\\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"print(\\\\\\\"Teks sumber:\\\\\\\")\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"print(text)\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# 2. PREPROCESSING TAMBAHAN\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"def clean_text(text):\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" text = text.strip()\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" text = re.sub(r\\\\\\\"\\\\\\\\s+\\\\\\\", \\\\\\\" \\\\\\\", text)\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" return text\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"text = clean_text(text)\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# 3. EKSTRAKSI TANGGAL (LEBIH DARI 1 FORMAT)\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"date_patterns = [\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" r\\\\\\\"\\\\\\\\d{2,8}\\\\\\\\s(?:Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\\\\\\\\s\\\\\\\\d{4}\\\\\\\",\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" r\\\\\\\"\\\\\\\\d{4}-\\\\\\\\d{2}-\\\\\\\\d{2}\\\\\\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"]\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"dates = []\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"for pattern in date_patterns:\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" dates.extend(re.findall(pattern, text))\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# 4. NORMALISASI TANGGAL\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"bulan_map = {\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" \\\\\\\"Januari\\\\\\\": \\\\\\\"01\\\\\\\", \\\\\\\"Februari\\\\\\\": \\\\\\\"02\\\\\\\", \\\\\\\"Maret\\\\\\\": \\\\\\\"03\\\\\\\",\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" \\\\\\\"April\\\\\\\": \\\\\\\"04\\\\\\\", \\\\\\\"Mei\\\\\\\": \\\\\\\"05\\\\\\\", \\\\\\\"Juni\\\\\\\": \\\\\\\"06\\\\\\\",\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" \\\\\\\"Juli\\\\\\\": \\\\\\\"07\\\\\\\", \\\\\\\"Agustus\\\\\\\": \\\\\\\"08\\\\\\\", \\\\\\\"September\\\\\\\": \\\\\\\"09\\\\\\\",\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" \\\\\\\"Oktober\\\\\\\": \\\\\\\"10\\\\\\\", \\\\\\\"November\\\\\\\": \\\\\\\"11\\\\\\\", \\\\\\\"Desember\\\\\\\": \\\\\\\"12\\\\\\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"}\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"def normalize_date(date_str):\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" try:\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" parts = date_str.split()\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" return f\\\\\\\"{parts[2]}-{bulan_map[parts[1]]}-{parts[0].zfill(2)}\\\\\\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" except:\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" return None\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"normalized_dates = [normalize_date(d) for d in dates]\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# 5. EKSTRAKSI LOKASI (RULE-BASED)\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"location_pattern = r\\\\\\\"di\\\\\\\\s([A-Z][a-zA-Z]+)\\\\\\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"location_match = re.search(location_pattern, text)\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"location = location_match.group(1) if location_match else None\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# 6. EKSTRAKSI ORGANISASI (MODIFIKASI)\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"org_pattern = r\\\\\\\"PT\\\\\\\\s[A-Z][a-zA-Z]+\\\\\\\\s[A-Z][a-zA-Z]+\\\\\\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"organization = re.findall(org_pattern, text)\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# 7. EVENT TEMPLATE (LEBIH LENGKAP)\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"event_template = {\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" \\\\\\\"EventType\\\\\\\": \\\\\\\"Product Launch\\\\\\\",\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" \\\\\\\"Organization\\\\\\\": organization[0] if organization else None,\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" \\\\\\\"LaunchDateOriginal\\\\\\\": dates[0] if dates else None,\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" \\\\\\\"LaunchDateISO\\\\\\\": normalized_dates[0] if normalized_dates else None,\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" \\\\\\\"Location\\\\\\\": location,\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" \\\\\\\"SourceText\\\\\\\": text\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"}\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# 8. OUTPUT\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"# =========================\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"print(\\\\\\\"\\\\\\\\nHASIL INFORMATION EXTRACTION:\\\\\\\")\\\\n\\\",\\n\",\n",
|
||||
" \" \\\"for key, value in event_template.items():\\\\n\\\",\\n\",\n",
|
||||
" \" \\\" print(f\\\\\\\"{key}: {value}\\\\\\\")\\\"\\n\",\n",
|
||||
" \" ]\\n\",\n",
|
||||
" \" },\\n\",\n",
|
||||
" \" {\\n\",\n",
|
||||
" \" \\\"cell_type\\\": \\\"code\\\",\\n\",\n",
|
||||
" \" \\\"execution_count\\\": 6,\\n\",\n",
|
||||
" \" \\\"id\\\": \\\"1e128113-af1e-45a1-8586-48c4acf578b4\\\",\\n\",\n",
|
||||
" \" \\\"metadata\\\": {\\n\",\n",
|
||||
" \" \\\"id\\\": \\\"1e128113-af1e-45a1-8586-48c4acf578b4\\\"\\n\",\n",
|
||||
" \" },\\n\",\n",
|
||||
" \" \\\"outputs\\\": [],\\n\",\n",
|
||||
" \" \\\"source\\\": []\\n\",\n",
|
||||
" \" }\\n\",\n",
|
||||
" \" ],\\n\",\n",
|
||||
" \" \\\"metadata\\\": {\\n\",\n",
|
||||
" \" \\\"kernelspec\\\": {\\n\",\n",
|
||||
" \" \\\"display_name\\\": \\\"Python 3 (ipykernel)\\\",\\n\",\n",
|
||||
" \" \\\"language\\\": \\\"python\\\",\\n\",\n",
|
||||
" \" \\\"name\\\": \\\"python3\\\"\\n\",\n",
|
||||
" \" },\\n\",\n",
|
||||
" \" \\\"language_info\\\": {\\n\",\n",
|
||||
" \" \\\"codemirror_mode\\\": {\\n\",\n",
|
||||
" \" \\\"name\\\": \\\"ipython\\\",\\n\",\n",
|
||||
" \" \\\"version\\\": 3\\n\",\n",
|
||||
" \" },\\n\",\n",
|
||||
" \" \\\"file_extension\\\": \\\".py\\\",\\n\",\n",
|
||||
" \" \\\"mimetype\\\": \\\"text/x-python\\\",\\n\",\n",
|
||||
" \" \\\"name\\\": \\\"python\\\",\\n\",\n",
|
||||
" \" \\\"nbconvert_exporter\\\": \\\"python\\\",\\n\",\n",
|
||||
" \" \\\"pygments_lexer\\\": \\\"ipython3\\\",\\n\",\n",
|
||||
" \" \\\"version\\\": \\\"3.12.2\\\"\\n\",\n",
|
||||
" \" },\\n\",\n",
|
||||
" \" \\\"colab\\\": {\\n\",\n",
|
||||
" \" \\\"provenance\\\": []\\n\",\n",
|
||||
" \" }\\n\",\n",
|
||||
" \" },\\n\",\n",
|
||||
" \" \\\"nbformat\\\": 4,\\n\",\n",
|
||||
" \" \\\"nbformat_minor\\\": 5\\n\",\n",
|
||||
" \"}\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
" ]\n",
|
||||
"}"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user