Praktikum_NLP/Information_Extraction_tugas_.ipynb

218 lines
8.8 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"id": "7c7601d6-3c91-453e-8c29-706528237596",
"metadata": {
"scrolled": true,
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "7c7601d6-3c91-453e-8c29-706528237596",
"outputId": "ff077c13-80fa-4370-da43-0da9c9f0d603"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]\n",
"NLTK : 3.9.1\n",
"Resource punkt siap dipakai\n",
"TEKS CONTOH:\n",
"\n",
"Ketika orang mulai menulis, ada gagasan bahwa Anda harus melakukan semuanya dengan benar sejak awal, setiap kalimat harus sempurna, setiap paragraf harus sempurna, setiap bab harus sempurna, tetapi apa yang Anda lakukan bukanlah semacam pertunjukan publik, sampai Anda siap untuk itu.\n",
"\n",
"\n",
"TOKENS:\n",
"['Ketika', 'orang', 'mulai', 'menulis', ',', 'ada', 'gagasan', 'bahwa', 'Anda', 'harus', 'melakukan', 'semuanya', 'dengan', 'benar', 'sejak', 'awal', ',', 'setiap', 'kalimat', 'harus', 'sempurna', ',', 'setiap', 'paragraf', 'harus', 'sempurna', ',', 'setiap', 'bab', 'harus', 'sempurna', ',', 'tetapi', 'apa', 'yang', 'Anda', 'lakukan', 'bukanlah', 'semacam', 'pertunjukan', 'publik', ',', 'sampai', 'Anda', 'siap', 'untuk', 'itu', '.']\n",
"\n",
"POS TAGS (30 pertama):\n",
"[('Ketika', 'NNP'), ('orang', 'MD'), ('mulai', 'VB'), ('menulis', 'NN'), (',', ','), ('ada', 'JJ'), ('gagasan', 'JJ'), ('bahwa', 'NN'), ('Anda', 'NNP'), ('harus', 'NN'), ('melakukan', 'NN'), ('semuanya', 'NN'), ('dengan', 'JJ'), ('benar', 'NN'), ('sejak', 'NN'), ('awal', 'NN'), (',', ','), ('setiap', 'JJ'), ('kalimat', 'NN'), ('harus', 'NN'), ('sempurna', 'NN'), (',', ','), ('setiap', 'JJ'), ('paragraf', 'NN'), ('harus', 'NN'), ('sempurna', 'NN'), (',', ','), ('setiap', 'JJ'), ('bab', 'NN'), ('harus', 'NN')]\n",
"\n",
"CANDIDATE ENTITIES (sederhana):\n",
"- Ketika\n",
"- Anda\n",
"- Anda\n",
"- Anda\n",
"\n",
"TANGGAL YANG DITEMUKAN:\n",
"\n",
"TANGGAL (FORMAT ISO 8601):\n",
"\n",
"Pola event peluncuran produk tidak ditemukan.\n",
"\n",
"TEMPLATE EVENT (hasil akhir IE):\n",
"{'Company': None, 'Product': None, 'LaunchDateOriginal': None, 'LaunchDateISO': None, 'Location': None}\n"
]
}
],
"source": [
"# =========================\n",
"# NLP Information Extraction (IE)\n",
"# Praktikum\n",
"# =========================\n",
"\n",
"# 1. Cek versi & instalasi dasar\n",
"import sys\n",
"import nltk\n",
"print(\"Python:\", sys.version)\n",
"print(\"NLTK :\", nltk.__version__)\n",
"\n",
"# Jika di environment belum ada NLTK terbaru, aktifkan baris di bawah:\n",
"# !{sys.executable} -m pip install --upgrade nltk\n",
"\n",
"# 2. Import pustaka\n",
"import re\n",
"from nltk.tokenize import wordpunct_tokenize\n",
"from nltk import pos_tag\n",
"\n",
"# 3. Download resource NLTK yang stabil\n",
"nltk.download('averaged_perceptron_tagger', quiet=True)\n",
"try:\n",
" nltk.download('punkt', quiet=True)\n",
" print(\"Resource punkt siap dipakai\")\n",
"except:\n",
" print(\"Gagal download punkt, akan tetap pakai wordpunct_tokenize\")\n",
"\n",
"# 4. Contoh teks (bisa diganti)\n",
"text = \"\"\"\n",
"Ketika orang mulai menulis, ada gagasan bahwa Anda harus melakukan semuanya dengan benar sejak awal, setiap kalimat harus sempurna, setiap paragraf harus sempurna, setiap bab harus sempurna, tetapi apa yang Anda lakukan bukanlah semacam pertunjukan publik, sampai Anda siap untuk itu.\n",
"\"\"\"\n",
"print(\"TEKS CONTOH:\")\n",
"print(text)\n",
"\n",
"# 5. Tokenisasi dan POS tagging\n",
"tokens = wordpunct_tokenize(text)\n",
"print(\"\\nTOKENS:\")\n",
"print(tokens)\n",
"\n",
"pos_tags = pos_tag(tokens)\n",
"print(\"\\nPOS TAGS (30 pertama):\")\n",
"print(pos_tags[:30])\n",
"\n",
"# 6. Ekstraksi “entitas” sederhana (berbasis huruf kapital)\n",
"def simple_capital_ner(tokens):\n",
" entities = []\n",
" current = []\n",
" for tok in tokens:\n",
" if tok.istitle() and tok.isalpha():\n",
" current.append(tok)\n",
" else:\n",
" if current:\n",
" entities.append(\" \".join(current))\n",
" current = []\n",
" if current:\n",
" entities.append(\" \".join(current))\n",
" return entities\n",
"\n",
"candidate_entities = simple_capital_ner(tokens)\n",
"print(\"\\nCANDIDATE ENTITIES (sederhana):\")\n",
"for e in candidate_entities:\n",
" print(\"-\", e)\n",
"\n",
"# 7. Ekstraksi tanggal (Ekstraksi Waktu) dengan RegEx\n",
"bulan_id = r\"(Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\"\n",
"date_pattern = rf\"\\b(\\d{{1,2}})\\s+{bulan_id}\\s+(\\d{{4}})\\b\"\n",
"\n",
"dates = re.findall(date_pattern, text)\n",
"print(\"\\nTANGGAL YANG DITEMUKAN:\")\n",
"for d in dates:\n",
" print(\"-\", \" \".join(d))\n",
"\n",
"# 8. Normalisasi tanggal ke format ISO 8601 (YYYY-MM-DD)\n",
"bulan_map = {\n",
" \"Januari\": \"01\",\n",
" \"Februari\": \"02\",\n",
" \"Maret\": \"03\",\n",
" \"April\": \"04\",\n",
" \"Mei\": \"05\",\n",
" \"Juni\": \"06\",\n",
" \"Juli\": \"07\",\n",
" \"Agustus\": \"08\",\n",
" \"September\": \"09\",\n",
" \"Oktober\": \"10\",\n",
" \"November\": \"11\",\n",
" \"Desember\": \"12\",\n",
"}\n",
"\n",
"normalized_dates = []\n",
"for hari, bulan, tahun in dates:\n",
" bulan_num = bulan_map[bulan]\n",
" hari_num = hari.zfill(2)\n",
" iso = f\"{tahun}-{bulan_num}-{hari_num}\"\n",
" normalized_dates.append(iso)\n",
"\n",
"print(\"\\nTANGGAL (FORMAT ISO 8601):\")\n",
"for iso in normalized_dates:\n",
" print(\"-\", iso)\n",
"\n",
"# 9. Ekstraksi event peluncuran produk (pattern-based)\n",
"pattern = r\"(.+?) meluncurkan (.+?) pada (\\d{1,2} \" + bulan_id + r\" \\d{4}) di (.+?)\\.\"\n",
"match = re.search(pattern, text)\n",
"\n",
"if match:\n",
" company = match.group(1).strip(\" ,\")\n",
" product = match.group(2).strip(\" ,\")\n",
" tgl_str = match.group(3)\n",
" location = match.group(4).strip(\" ,\")\n",
"\n",
" print(\"\\nEVENT PELUNCURAN PRODUK:\")\n",
" print(\" Perusahaan :\", company)\n",
" print(\" Produk :\", product)\n",
" print(\" Tanggal :\", tgl_str)\n",
" print(\" Lokasi :\", location)\n",
"else:\n",
" company = product = tgl_str = location = None\n",
" print(\"\\nPola event peluncuran produk tidak ditemukan.\")\n",
"\n",
"# 10. Template IE (Template Extraction)\n",
"event_template = {\n",
" \"Company\": company,\n",
" \"Product\": product,\n",
" \"LaunchDateOriginal\": tgl_str,\n",
" \"LaunchDateISO\": normalized_dates[0] if normalized_dates else None,\n",
" \"Location\": location,\n",
"}\n",
"\n",
"print(\"\\nTEMPLATE EVENT (hasil akhir IE):\")\n",
"print(event_template)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e128113-af1e-45a1-8586-48c4acf578b4",
"metadata": {
"id": "1e128113-af1e-45a1-8586-48c4acf578b4"
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
},
"colab": {
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 5
}