Praktikum-NLP/Information Extraction.ipynb

{
"cells": [
{
"cell_type": "markdown",
"id": "e48b73eb-5463-4c81-99e5-f4eb5439380c",
"metadata": {},
"source": [
"# =========================\n",
"# NLP Information Extraction (IE)\n",
"# Praktikum Fatah\n",
"#\n",
"# Nama : Fatah Sabila Rosyad\n",
"# NIM : 202210715288\n",
"# Kelas : F7B2\n",
"# ========================="
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "36757708-dd4b-4984-8ea6-8fd75685448a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] C:\\Users\\Fatah Sabila\n",
"[nltk_data] Rosyad\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
"[nltk_data] date!\n",
"[nltk_data] Downloading package averaged_perceptron_tagger_eng to\n",
"[nltk_data] C:\\Users\\Fatah Sabila\n",
"[nltk_data] Rosyad\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Unzipping taggers\\averaged_perceptron_tagger_eng.zip.\n",
"[nltk_data] Downloading package punkt to C:\\Users\\Fatah Sabila\n",
"[nltk_data] Rosyad\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nltk\n",
"\n",
"# Download resource yang dibutuhkan untuk POS Tagging (versi terbaru)\n",
"nltk.download('averaged_perceptron_tagger')\n",
"nltk.download('averaged_perceptron_tagger_eng')\n",
"nltk.download('punkt')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7c7601d6-3c91-453e-8c29-706528237596",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Python: 3.13.5 | packaged by Anaconda, Inc. | (main, Jun 12 2025, 16:37:03) [MSC v.1929 64 bit (AMD64)]\n",
"NLTK : 3.9.1\n",
"Resource punkt siap dipakai\n",
"TEKS CONTOH:\n",
"\n",
"Mahasiswa informatika bernama Fatah Sabila Rosyad melakukan praktikum NLP\n",
"tentang Information Extraction pada 22 November 2025 di Universitas Bhayangkara Jakarta Raya.\n",
"Pada kegiatan tersebut, Fatah mempelajari ekstraksi entitas, tanggal, dan event berbasis teks.\n",
"\n",
"\n",
"TOKENS:\n",
"['Mahasiswa', 'informatika', 'bernama', 'Fatah', 'Sabila', 'Rosyad', 'melakukan', 'praktikum', 'NLP', 'tentang', 'Information', 'Extraction', 'pada', '22', 'November', '2025', 'di', 'Universitas', 'Bhayangkara', 'Jakarta', 'Raya', '.', 'Pada', 'kegiatan', 'tersebut', ',', 'Fatah', 'mempelajari', 'ekstraksi', 'entitas', ',', 'tanggal', ',', 'dan', 'event', 'berbasis', 'teks', '.']\n",
"\n",
"POS TAGS (30 pertama):\n",
"[('Mahasiswa', 'NNP'), ('informatika', 'JJ'), ('bernama', 'NN'), ('Fatah', 'NNP'), ('Sabila', 'NNP'), ('Rosyad', 'NNP'), ('melakukan', 'NN'), ('praktikum', 'NN'), ('NLP', 'NNP'), ('tentang', 'NN'), ('Information', 'NNP'), ('Extraction', 'NNP'), ('pada', 'VBD'), ('22', 'CD'), ('November', 'NNP'), ('2025', 'CD'), ('di', 'NN'), ('Universitas', 'NNP'), ('Bhayangkara', 'NNP'), ('Jakarta', 'NNP'), ('Raya', 'NNP'), ('.', '.'), ('Pada', 'NNP'), ('kegiatan', 'NN'), ('tersebut', 'NN'), (',', ','), ('Fatah', 'NNP'), ('mempelajari', 'NNP'), ('ekstraksi', 'NN'), ('entitas', 'NNS')]\n",
"\n",
"CANDIDATE ENTITIES (sederhana):\n",
"- Mahasiswa\n",
"- Fatah Sabila Rosyad\n",
"- Information Extraction\n",
"- November\n",
"- Universitas Bhayangkara Jakarta Raya\n",
"- Pada\n",
"- Fatah\n",
"\n",
"TANGGAL YANG DITEMUKAN:\n",
"- 22 November 2025\n",
"\n",
"TANGGAL (FORMAT ISO 8601):\n",
"- 2025-11-22\n",
"\n",
"Pola event peluncuran produk tidak ditemukan.\n",
"\n",
"TEMPLATE EVENT (hasil akhir IE):\n",
"{'Company': None, 'Product': None, 'LaunchDateOriginal': None, 'LaunchDateISO': '2025-11-22', 'Location': None}\n"
]
}
],
"source": [
"# =========================\n",
"# NLP Information Extraction (IE)\n",
"# Praktikum\n",
"# =========================\n",
"\n",
"# 1. Cek versi & instalasi dasar\n",
"import sys\n",
"import nltk\n",
"print(\"Python:\", sys.version)\n",
"print(\"NLTK :\", nltk.__version__)\n",
"\n",
"# Jika di environment belum ada NLTK terbaru, aktifkan baris di bawah:\n",
"# !{sys.executable} -m pip install --upgrade nltk\n",
"\n",
"# 2. Import pustaka\n",
"import re\n",
"from nltk.tokenize import wordpunct_tokenize\n",
"from nltk import pos_tag\n",
"\n",
"# 3. Download resource NLTK yang stabil\n",
"nltk.download('averaged_perceptron_tagger', quiet=True)\n",
"try:\n",
" nltk.download('punkt', quiet=True)\n",
" print(\"Resource punkt siap dipakai\")\n",
"except:\n",
" print(\"Gagal download punkt, akan tetap pakai wordpunct_tokenize\")\n",
"\n",
"# 4. Contoh teks (bisa diganti)\n",
"text = \"\"\"\n",
"Mahasiswa informatika bernama Fatah Sabila Rosyad melakukan praktikum NLP\n",
"tentang Information Extraction pada 22 November 2025 di Universitas Bhayangkara Jakarta Raya.\n",
"Pada kegiatan tersebut, Fatah mempelajari ekstraksi entitas, tanggal, dan event berbasis teks.\n",
"\"\"\"\n",
"print(\"TEKS CONTOH:\")\n",
"print(text)\n",
"\n",
"# 5. Tokenisasi dan POS tagging\n",
"tokens = wordpunct_tokenize(text)\n",
"print(\"\\nTOKENS:\")\n",
"print(tokens)\n",
"\n",
"pos_tags = pos_tag(tokens)\n",
"print(\"\\nPOS TAGS (30 pertama):\")\n",
"print(pos_tags[:30])\n",
"\n",
"# 6. Ekstraksi “entitas” sederhana (berbasis huruf kapital)\n",
"def simple_capital_ner(tokens):\n",
" entities = []\n",
" current = []\n",
" for tok in tokens:\n",
" if tok.istitle() and tok.isalpha():\n",
" current.append(tok)\n",
" else:\n",
" if current:\n",
" entities.append(\" \".join(current))\n",
" current = []\n",
" if current:\n",
" entities.append(\" \".join(current))\n",
" return entities\n",
"\n",
"candidate_entities = simple_capital_ner(tokens)\n",
"print(\"\\nCANDIDATE ENTITIES (sederhana):\")\n",
"for e in candidate_entities:\n",
" print(\"-\", e)\n",
"\n",
"# 7. Ekstraksi tanggal (Ekstraksi Waktu) dengan RegEx\n",
"bulan_id = r\"(Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\"\n",
"date_pattern = rf\"\\b(\\d{{1,2}})\\s+{bulan_id}\\s+(\\d{{4}})\\b\"\n",
"\n",
"dates = re.findall(date_pattern, text)\n",
"print(\"\\nTANGGAL YANG DITEMUKAN:\")\n",
"for d in dates:\n",
" print(\"-\", \" \".join(d))\n",
"\n",
"# 8. Normalisasi tanggal ke format ISO 8601 (YYYY-MM-DD)\n",
"bulan_map = {\n",
" \"Januari\": \"01\",\n",
" \"Februari\": \"02\",\n",
" \"Maret\": \"03\",\n",
" \"April\": \"04\",\n",
" \"Mei\": \"05\",\n",
" \"Juni\": \"06\",\n",
" \"Juli\": \"07\",\n",
" \"Agustus\": \"08\",\n",
" \"September\": \"09\",\n",
" \"Oktober\": \"10\",\n",
" \"November\": \"11\",\n",
" \"Desember\": \"12\",\n",
"}\n",
"\n",
"normalized_dates = []\n",
"for hari, bulan, tahun in dates:\n",
" bulan_num = bulan_map[bulan]\n",
" hari_num = hari.zfill(2)\n",
" iso = f\"{tahun}-{bulan_num}-{hari_num}\"\n",
" normalized_dates.append(iso)\n",
"\n",
"print(\"\\nTANGGAL (FORMAT ISO 8601):\")\n",
"for iso in normalized_dates:\n",
" print(\"-\", iso)\n",
"\n",
"# 9. Ekstraksi event peluncuran produk (pattern-based)\n",
"pattern = r\"(.+?) meluncurkan (.+?) pada (\\d{1,2} \" + bulan_id + r\" \\d{4}) di (.+?)\\.\"\n",
"match = re.search(pattern, text)\n",
"\n",
"if match:\n",
" company = match.group(1).strip(\" ,\")\n",
" product = match.group(2).strip(\" ,\")\n",
" tgl_str = match.group(3)\n",
" location = match.group(4).strip(\" ,\")\n",
"\n",
" print(\"\\nEVENT PELUNCURAN PRODUK:\")\n",
" print(\" Perusahaan :\", company)\n",
" print(\" Produk :\", product)\n",
" print(\" Tanggal :\", tgl_str)\n",
" print(\" Lokasi :\", location)\n",
"else:\n",
" company = product = tgl_str = location = None\n",
" print(\"\\nPola event peluncuran produk tidak ditemukan.\")\n",
"\n",
"# 10. Template IE (Template Extraction)\n",
"event_template = {\n",
" \"Company\": company,\n",
" \"Product\": product,\n",
" \"LaunchDateOriginal\": tgl_str,\n",
" \"LaunchDateISO\": normalized_dates[0] if normalized_dates else None,\n",
" \"Location\": location,\n",
"}\n",
"\n",
"print(\"\\nTEMPLATE EVENT (hasil akhir IE):\")\n",
"print(event_template)\n"
]
},
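{
"cell_type": "markdown",
"id": "launch-event-demo-note",
"metadata": {},
"source": [
"**Illustrative sketch:** the launch-event pattern in step 9 never fires on the sample text, so the cell below applies the same pattern to an invented sentence to show the event template being filled. The sentence, company name, and product name are hypothetical, and the cell assumes the previous cell has already been run so that `re`, `bulan_id`, and `bulan_map` are defined."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "launch-event-demo",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical sentence, invented purely for illustration.\n",
"demo_text = \"PT Contoh Teknologi meluncurkan produk SmartLamp X1 pada 5 Desember 2025 di Jakarta.\"\n",
"\n",
"# Same pattern as step 9; bulan_id contributes the inner capturing group (group 4).\n",
"demo_pattern = r\"(.+?) meluncurkan (.+?) pada (\\d{1,2} \" + bulan_id + r\" \\d{4}) di (.+?)\\.\"\n",
"m = re.search(demo_pattern, demo_text)\n",
"\n",
"if m:\n",
"    hari, bulan, tahun = m.group(3).split()\n",
"    demo_template = {\n",
"        \"Company\": m.group(1).strip(\" ,\"),\n",
"        \"Product\": m.group(2).strip(\" ,\"),\n",
"        \"LaunchDateOriginal\": m.group(3),\n",
"        \"LaunchDateISO\": f\"{tahun}-{bulan_map[bulan]}-{hari.zfill(2)}\",\n",
"        \"Location\": m.group(5).strip(\" ,\"),\n",
"    }\n",
"    print(demo_template)\n",
"else:\n",
"    print(\"Pattern did not match the demo sentence.\")"
]
},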
{
"cell_type": "code",
"execution_count": 5,
"id": "1e128113-af1e-45a1-8586-48c4acf578b4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Proses Information Extraction selesai dijalankan oleh Fatah.\n"
]
}
],
"source": [
"print(\"\\nProses Information Extraction selesai dijalankan oleh Fatah.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}