materi-praktikum/NLP/Information Extraction.ipynb
2025-12-02 02:51:49 +00:00

220 lines
7.7 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "7c7601d6-3c91-453e-8c29-706528237596",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Python: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0]\n",
"NLTK : 3.9.2\n",
"Resource punkt siap dipakai\n",
"TEKS CONTOH:\n",
"\n",
"CEO Apple, Tim Cook, meluncurkan iPhone 15 pada 12 September 2023 di Cupertino, California.\n",
"Pada hari berikutnya, ia menghadiri pertemuan investor di San Francisco.\n",
"\n",
"\n",
"TOKENS:\n",
"['CEO', 'Apple', ',', 'Tim', 'Cook', ',', 'meluncurkan', 'iPhone', '15', 'pada', '12', 'September', '2023', 'di', 'Cupertino', ',', 'California', '.', 'Pada', 'hari', 'berikutnya', ',', 'ia', 'menghadiri', 'pertemuan', 'investor', 'di', 'San', 'Francisco', '.']\n",
"\n",
"POS TAGS (30 pertama):\n",
"[('CEO', 'NNP'), ('Apple', 'NNP'), (',', ','), ('Tim', 'NNP'), ('Cook', 'NNP'), (',', ','), ('meluncurkan', 'VBD'), ('iPhone', 'NN'), ('15', 'CD'), ('pada', 'NN'), ('12', 'CD'), ('September', 'NNP'), ('2023', 'CD'), ('di', 'NN'), ('Cupertino', 'NNP'), (',', ','), ('California', 'NNP'), ('.', '.'), ('Pada', 'NNP'), ('hari', 'NN'), ('berikutnya', 'NN'), (',', ','), ('ia', 'JJ'), ('menghadiri', 'NN'), ('pertemuan', 'JJ'), ('investor', 'NN'), ('di', 'JJ'), ('San', 'NNP'), ('Francisco', 'NNP'), ('.', '.')]\n",
"\n",
"CANDIDATE ENTITIES (sederhana):\n",
"- Apple\n",
"- Tim Cook\n",
"- September\n",
"- Cupertino\n",
"- California\n",
"- Pada\n",
"- San Francisco\n",
"\n",
"TANGGAL YANG DITEMUKAN:\n",
"- 12 September 2023\n",
"\n",
"TANGGAL (FORMAT ISO 8601):\n",
"- 2023-09-12\n",
"\n",
"EVENT PELUNCURAN PRODUK:\n",
" Perusahaan : CEO Apple, Tim Cook\n",
" Produk : iPhone 15\n",
" Tanggal : 12 September 2023\n",
" Lokasi : September\n",
"\n",
"TEMPLATE EVENT (hasil akhir IE):\n",
"{'Company': 'CEO Apple, Tim Cook', 'Product': 'iPhone 15', 'LaunchDateOriginal': '12 September 2023', 'LaunchDateISO': '2023-09-12', 'Location': 'September'}\n"
]
}
],
"source": [
"# =========================\n",
"# NLP Information Extraction (IE)\n",
"# Praktikum\n",
"# =========================\n",
"\n",
"# 1. Cek versi & instalasi dasar\n",
"import sys\n",
"import nltk\n",
"print(\"Python:\", sys.version)\n",
"print(\"NLTK :\", nltk.__version__)\n",
"\n",
"# Jika di environment belum ada NLTK terbaru, aktifkan baris di bawah:\n",
"# !{sys.executable} -m pip install --upgrade nltk\n",
"\n",
"# 2. Import pustaka\n",
"import re\n",
"from nltk.tokenize import wordpunct_tokenize\n",
"from nltk import pos_tag\n",
"\n",
"# 3. Download the NLTK resources used below.\n",
"# nltk.download() reports a failed download through its return value (False)\n",
"# rather than by raising, so the result is checked explicitly.\n",
"# NLTK 3.9+ loads the English tagger from 'averaged_perceptron_tagger_eng';\n",
"# the older 'averaged_perceptron_tagger' id is kept for earlier releases.\n",
"for resource in ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):\n",
"    if not nltk.download(resource, quiet=True):\n",
"        print(f\"Gagal download {resource}\")\n",
"\n",
"try:\n",
"    punkt_ready = nltk.download('punkt', quiet=True)\n",
"except Exception:\n",
"    # Stay best-effort: punkt is optional here because tokenization below\n",
"    # uses wordpunct_tokenize.\n",
"    punkt_ready = False\n",
"\n",
"if punkt_ready:\n",
"    print(\"Resource punkt siap dipakai\")\n",
"else:\n",
"    print(\"Gagal download punkt, akan tetap pakai wordpunct_tokenize\")\n",
"\n",
"# 4. Sample text (replace freely)\n",
"text = \"\"\"\n",
"CEO Apple, Tim Cook, meluncurkan iPhone 15 pada 12 September 2023 di Cupertino, California.\n",
"Pada hari berikutnya, ia menghadiri pertemuan investor di San Francisco.\n",
"\"\"\"\n",
"# One print call per section: heading and payload separated by a newline.\n",
"print(\"TEKS CONTOH:\", text, sep=\"\\n\")\n",
"\n",
"# 5. Tokenization and POS tagging\n",
"tokens = wordpunct_tokenize(text)\n",
"print(\"\\nTOKENS:\", tokens, sep=\"\\n\")\n",
"\n",
"pos_tags = pos_tag(tokens)\n",
"# Only the first 30 (token, tag) pairs are shown.\n",
"print(\"\\nPOS TAGS (30 pertama):\", pos_tags[:30], sep=\"\\n\")\n",
"\n",
"# 6. Ekstraksi 'entitas' sederhana (berbasis huruf kapital)\n",
"def simple_capital_ner(tokens):\n",
"    \"\"\"Collect runs of consecutive capitalized words as candidate entities.\n",
"\n",
"    A token joins a run when it is purely alphabetic and title-cased\n",
"    (str.isalpha() and str.istitle()). All-caps tokens such as 'CEO' and\n",
"    mixed-case tokens such as 'iPhone' therefore end a run instead of\n",
"    joining it.\n",
"\n",
"    Args:\n",
"        tokens: iterable of token strings, in document order.\n",
"\n",
"    Returns:\n",
"        list of str: each run joined with single spaces, in order of appearance.\n",
"    \"\"\"\n",
"    found = []\n",
"    run = []\n",
"    # The trailing empty string is a sentinel: it is never title-cased, so it\n",
"    # flushes a run that reaches the end of the token list.\n",
"    for token in [*tokens, \"\"]:\n",
"        if token.isalpha() and token.istitle():\n",
"            run.append(token)\n",
"        elif run:\n",
"            found.append(\" \".join(run))\n",
"            run = []\n",
"    return found\n",
"\n",
"candidate_entities = simple_capital_ner(tokens)\n",
"# Heading first, then one \"- <entity>\" line per candidate.\n",
"print(\"\\nCANDIDATE ENTITIES (sederhana):\")\n",
"for entity in candidate_entities:\n",
"    print(\"-\", entity)\n",
"\n",
"# 7. Date extraction (temporal expressions) with a regular expression\n",
"# bulan_id is itself one capturing group, so every findall() hit below is a\n",
"# (day, month name, year) tuple.\n",
"bulan_id = r\"(Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\"\n",
"date_pattern = r\"\\b(\\d{1,2})\\s+\" + bulan_id + r\"\\s+(\\d{4})\\b\"\n",
"\n",
"dates = re.findall(date_pattern, text)\n",
"print(\"\\nTANGGAL YANG DITEMUKAN:\")\n",
"for date_parts in dates:\n",
"    print(\"-\", \" \".join(date_parts))\n",
"\n",
"# 8. Normalize dates to ISO 8601 (YYYY-MM-DD)\n",
"# Indonesian month name -> two-digit month number.\n",
"bulan_map = {\n",
"    \"Januari\": \"01\", \"Februari\": \"02\", \"Maret\": \"03\",\n",
"    \"April\": \"04\", \"Mei\": \"05\", \"Juni\": \"06\",\n",
"    \"Juli\": \"07\", \"Agustus\": \"08\", \"September\": \"09\",\n",
"    \"Oktober\": \"10\", \"November\": \"11\", \"Desember\": \"12\",\n",
"}\n",
"\n",
"# Each entry of dates is a (day, month name, year) tuple of strings; the day\n",
"# is left-padded with a zero to two digits.\n",
"normalized_dates = [\n",
"    f\"{year}-{bulan_map[month_name]}-{day.zfill(2)}\"\n",
"    for day, month_name, year in dates\n",
"]\n",
"\n",
"print(\"\\nTANGGAL (FORMAT ISO 8601):\")\n",
"for iso_date in normalized_dates:\n",
"    print(\"-\", iso_date)\n",
"\n",
"# 9. Product-launch event extraction (pattern-based)\n",
"# bulan_id contributes a capturing group of its own, so a positional lookup\n",
"# for the field after the date lands on the month name instead of the\n",
"# location. Named groups make each lookup independent of how many groups the\n",
"# month pattern adds.\n",
"pattern = (\n",
"    r\"(?P<company>.+?) meluncurkan (?P<product>.+?) pada \"\n",
"    r\"(?P<date>\\d{1,2} \" + bulan_id + r\" \\d{4}) di (?P<location>.+?)\\.\"\n",
")\n",
"match = re.search(pattern, text)\n",
"\n",
"if match:\n",
"    # Drop the spaces and commas left around each captured field.\n",
"    company = match.group(\"company\").strip(\" ,\")\n",
"    product = match.group(\"product\").strip(\" ,\")\n",
"    tgl_str = match.group(\"date\")\n",
"    location = match.group(\"location\").strip(\" ,\")\n",
"\n",
"    print(\"\\nEVENT PELUNCURAN PRODUK:\")\n",
"    print(\" Perusahaan :\", company)\n",
"    print(\" Produk :\", product)\n",
"    print(\" Tanggal :\", tgl_str)\n",
"    print(\" Lokasi :\", location)\n",
"else:\n",
"    # Keep every field defined so the template cell below still runs.\n",
"    company = product = tgl_str = location = None\n",
"    print(\"\\nPola event peluncuran produk tidak ditemukan.\")\n",
"\n",
"# 10. IE template (template extraction)\n",
"# Derive the ISO date from the matched launch date itself. The first entry of\n",
"# normalized_dates is only the first date found anywhere in the text, which\n",
"# need not be the launch date and can exist even when no event was matched.\n",
"launch_date_iso = None\n",
"if tgl_str:\n",
"    # tgl_str has the form \"<day> <month name> <year>\".\n",
"    launch_day, launch_month, launch_year = tgl_str.split()\n",
"    launch_date_iso = f\"{launch_year}-{bulan_map[launch_month]}-{launch_day.zfill(2)}\"\n",
"\n",
"event_template = {\n",
"    \"Company\": company,\n",
"    \"Product\": product,\n",
"    \"LaunchDateOriginal\": tgl_str,\n",
"    \"LaunchDateISO\": launch_date_iso,\n",
"    \"Location\": location,\n",
"}\n",
"\n",
"print(\"\\nTEMPLATE EVENT (hasil akhir IE):\")\n",
"print(event_template)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e128113-af1e-45a1-8586-48c4acf578b4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}