Add Information Extraction notebook (NLP Practicum)
This commit is contained in:
parent
e871ab77ad
commit
988dcd59b6
287
Information Extraction.ipynb
Normal file
287
Information Extraction.ipynb
Normal file
@ -0,0 +1,287 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e48b73eb-5463-4c81-99e5-f4eb5439380c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"***\n",
|
||||
"# NLP – Information Extraction (IE)\n",
|
||||
"## Praktikum Fatah\n",
|
||||
"\n",
|
||||
"- **Nama:** Fatah Sabila Rosyad\n",
|
||||
"- **NIM:** 202210715288\n",
|
||||
"- **Kelas:** F7B2\n",
|
||||
"***"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "36757708-dd4b-4984-8ea6-8fd75685448a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
|
||||
"[nltk_data] C:\\Users\\Fatah Sabila\n",
|
||||
"[nltk_data] Rosyad\\AppData\\Roaming\\nltk_data...\n",
|
||||
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
|
||||
"[nltk_data] date!\n",
|
||||
"[nltk_data] Downloading package averaged_perceptron_tagger_eng to\n",
|
||||
"[nltk_data] C:\\Users\\Fatah Sabila\n",
|
||||
"[nltk_data] Rosyad\\AppData\\Roaming\\nltk_data...\n",
|
||||
"[nltk_data] Unzipping taggers\\averaged_perceptron_tagger_eng.zip.\n",
|
||||
"[nltk_data] Downloading package punkt to C:\\Users\\Fatah Sabila\n",
|
||||
"[nltk_data] Rosyad\\AppData\\Roaming\\nltk_data...\n",
|
||||
"[nltk_data] Package punkt is already up-to-date!\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import nltk\n",
|
||||
"\n",
|
||||
"# Download resource yang dibutuhkan untuk POS Tagging (versi terbaru)\n",
|
||||
"nltk.download('averaged_perceptron_tagger')\n",
|
||||
"nltk.download('averaged_perceptron_tagger_eng')\n",
|
||||
"nltk.download('punkt')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "7c7601d6-3c91-453e-8c29-706528237596",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Python: 3.13.5 | packaged by Anaconda, Inc. | (main, Jun 12 2025, 16:37:03) [MSC v.1929 64 bit (AMD64)]\n",
|
||||
"NLTK : 3.9.1\n",
|
||||
"Resource punkt siap dipakai\n",
|
||||
"TEKS CONTOH:\n",
|
||||
"\n",
|
||||
"Mahasiswa informatika bernama Fatah Sabila Rosyad melakukan praktikum NLP\n",
|
||||
"tentang Information Extraction pada 22 November 2025 di Universitas Bhayangkara Jakarta Raya.\n",
|
||||
"Pada kegiatan tersebut, Fatah mempelajari ekstraksi entitas, tanggal, dan event berbasis teks.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"TOKENS:\n",
|
||||
"['Mahasiswa', 'informatika', 'bernama', 'Fatah', 'Sabila', 'Rosyad', 'melakukan', 'praktikum', 'NLP', 'tentang', 'Information', 'Extraction', 'pada', '22', 'November', '2025', 'di', 'Universitas', 'Bhayangkara', 'Jakarta', 'Raya', '.', 'Pada', 'kegiatan', 'tersebut', ',', 'Fatah', 'mempelajari', 'ekstraksi', 'entitas', ',', 'tanggal', ',', 'dan', 'event', 'berbasis', 'teks', '.']\n",
|
||||
"\n",
|
||||
"POS TAGS (30 pertama):\n",
|
||||
"[('Mahasiswa', 'NNP'), ('informatika', 'JJ'), ('bernama', 'NN'), ('Fatah', 'NNP'), ('Sabila', 'NNP'), ('Rosyad', 'NNP'), ('melakukan', 'NN'), ('praktikum', 'NN'), ('NLP', 'NNP'), ('tentang', 'NN'), ('Information', 'NNP'), ('Extraction', 'NNP'), ('pada', 'VBD'), ('22', 'CD'), ('November', 'NNP'), ('2025', 'CD'), ('di', 'NN'), ('Universitas', 'NNP'), ('Bhayangkara', 'NNP'), ('Jakarta', 'NNP'), ('Raya', 'NNP'), ('.', '.'), ('Pada', 'NNP'), ('kegiatan', 'NN'), ('tersebut', 'NN'), (',', ','), ('Fatah', 'NNP'), ('mempelajari', 'NNP'), ('ekstraksi', 'NN'), ('entitas', 'NNS')]\n",
|
||||
"\n",
|
||||
"CANDIDATE ENTITIES (sederhana):\n",
|
||||
"- Mahasiswa\n",
|
||||
"- Fatah Sabila Rosyad\n",
|
||||
"- Information Extraction\n",
|
||||
"- November\n",
|
||||
"- Universitas Bhayangkara Jakarta Raya\n",
|
||||
"- Pada\n",
|
||||
"- Fatah\n",
|
||||
"\n",
|
||||
"TANGGAL YANG DITEMUKAN:\n",
|
||||
"- 22 November 2025\n",
|
||||
"\n",
|
||||
"TANGGAL (FORMAT ISO 8601):\n",
|
||||
"- 2025-11-22\n",
|
||||
"\n",
|
||||
"Pola event peluncuran produk tidak ditemukan.\n",
|
||||
"\n",
|
||||
"TEMPLATE EVENT (hasil akhir IE):\n",
|
||||
"{'Company': None, 'Product': None, 'LaunchDateOriginal': None, 'LaunchDateISO': '2025-11-22', 'Location': None}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# =========================\n",
|
||||
"# NLP – Information Extraction (IE)\n",
|
||||
"# Praktikum\n",
|
||||
"# =========================\n",
|
||||
"\n",
|
||||
"# 1. Cek versi & instalasi dasar\n",
|
||||
"import sys\n",
|
||||
"import nltk\n",
|
||||
"print(\"Python:\", sys.version)\n",
|
||||
"print(\"NLTK :\", nltk.__version__)\n",
|
||||
"\n",
|
||||
"# Jika di environment belum ada NLTK terbaru, aktifkan baris di bawah:\n",
|
||||
"# !{sys.executable} -m pip install --upgrade nltk\n",
|
||||
"\n",
|
||||
"# 2. Import pustaka\n",
|
||||
"import re\n",
|
||||
"from nltk.tokenize import wordpunct_tokenize\n",
|
||||
"from nltk import pos_tag\n",
|
||||
"\n",
|
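||||
"# 3. Download the NLTK resources used below. NOTE(review): only 'averaged_perceptron_tagger' is fetched here; NLTK >= 3.9 pos_tag appears to need 'averaged_perceptron_tagger_eng', which only the previous cell downloads - confirm this cell runs on a fresh kernel.\n",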
|
||||
"nltk.download('averaged_perceptron_tagger', quiet=True)\n",
|
||||
"try:\n",
|
||||
" nltk.download('punkt', quiet=True)\n",
|
||||
" print(\"Resource punkt siap dipakai\")\n",
|
||||
"except:\n",
|
||||
" print(\"Gagal download punkt, akan tetap pakai wordpunct_tokenize\")\n",
|
||||
"\n",
|
||||
"# 4. Contoh teks (bisa diganti)\n",
|
||||
"text = \"\"\"\n",
|
||||
"Mahasiswa informatika bernama Fatah Sabila Rosyad melakukan praktikum NLP\n",
|
||||
"tentang Information Extraction pada 22 November 2025 di Universitas Bhayangkara Jakarta Raya.\n",
|
||||
"Pada kegiatan tersebut, Fatah mempelajari ekstraksi entitas, tanggal, dan event berbasis teks.\n",
|
||||
"\"\"\"\n",
|
||||
"print(\"TEKS CONTOH:\")\n",
|
||||
"print(text)\n",
|
||||
"\n",
|
||||
"# 5. Tokenisasi dan POS tagging\n",
|
||||
"tokens = wordpunct_tokenize(text)\n",
|
||||
"print(\"\\nTOKENS:\")\n",
|
||||
"print(tokens)\n",
|
||||
"\n",
|
||||
"pos_tags = pos_tag(tokens)\n",
|
||||
"print(\"\\nPOS TAGS (30 pertama):\")\n",
|
||||
"print(pos_tags[:30])\n",
|
||||
"\n",
|
||||
"# 6. Ekstraksi “entitas” sederhana (berbasis huruf kapital)\n",
|
||||
"def simple_capital_ner(tokens):\n",
|
||||
" entities = []\n",
|
||||
" current = []\n",
|
||||
" for tok in tokens:\n",
|
||||
" if tok.istitle() and tok.isalpha():\n",
|
||||
" current.append(tok)\n",
|
||||
" else:\n",
|
||||
" if current:\n",
|
||||
" entities.append(\" \".join(current))\n",
|
||||
" current = []\n",
|
||||
" if current:\n",
|
||||
" entities.append(\" \".join(current))\n",
|
||||
" return entities\n",
|
||||
"\n",
|
||||
"candidate_entities = simple_capital_ner(tokens)\n",
|
||||
"print(\"\\nCANDIDATE ENTITIES (sederhana):\")\n",
|
||||
"for e in candidate_entities:\n",
|
||||
" print(\"-\", e)\n",
|
||||
"\n",
|
||||
"# 7. Ekstraksi tanggal (Ekstraksi Waktu) dengan RegEx\n",
|
||||
"bulan_id = r\"(Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\"\n",
|
||||
"date_pattern = rf\"\\b(\\d{{1,2}})\\s+{bulan_id}\\s+(\\d{{4}})\\b\"\n",
|
||||
"\n",
|
||||
"dates = re.findall(date_pattern, text)\n",
|
||||
"print(\"\\nTANGGAL YANG DITEMUKAN:\")\n",
|
||||
"for d in dates:\n",
|
||||
" print(\"-\", \" \".join(d))\n",
|
||||
"\n",
|
||||
"# 8. Normalisasi tanggal ke format ISO 8601 (YYYY-MM-DD)\n",
|
||||
"bulan_map = {\n",
|
||||
" \"Januari\": \"01\",\n",
|
||||
" \"Februari\": \"02\",\n",
|
||||
" \"Maret\": \"03\",\n",
|
||||
" \"April\": \"04\",\n",
|
||||
" \"Mei\": \"05\",\n",
|
||||
" \"Juni\": \"06\",\n",
|
||||
" \"Juli\": \"07\",\n",
|
||||
" \"Agustus\": \"08\",\n",
|
||||
" \"September\": \"09\",\n",
|
||||
" \"Oktober\": \"10\",\n",
|
||||
" \"November\": \"11\",\n",
|
||||
" \"Desember\": \"12\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"normalized_dates = []\n",
|
||||
"for hari, bulan, tahun in dates:\n",
|
||||
" bulan_num = bulan_map[bulan]\n",
|
||||
" hari_num = hari.zfill(2)\n",
|
||||
" iso = f\"{tahun}-{bulan_num}-{hari_num}\"\n",
|
||||
" normalized_dates.append(iso)\n",
|
||||
"\n",
|
||||
"print(\"\\nTANGGAL (FORMAT ISO 8601):\")\n",
|
||||
"for iso in normalized_dates:\n",
|
||||
" print(\"-\", iso)\n",
|
||||
"\n",
|
||||
"# 9. Ekstraksi event peluncuran produk (pattern-based)\n",
|
||||
"pattern = r\"(.+?) meluncurkan (.+?) pada (\\d{1,2} \" + bulan_id + r\" \\d{4}) di (.+?)\\.\"\n",
|
||||
"match = re.search(pattern, text)\n",
|
||||
"\n",
|
||||
"if match:\n",
|
||||
" company = match.group(1).strip(\" ,\")\n",
|
||||
" product = match.group(2).strip(\" ,\")\n",
|
||||
" tgl_str = match.group(3)\n",
|
||||
" location = match.group(4).strip(\" ,\")\n",
|
||||
"\n",
|
||||
" print(\"\\nEVENT PELUNCURAN PRODUK:\")\n",
|
||||
" print(\" Perusahaan :\", company)\n",
|
||||
" print(\" Produk :\", product)\n",
|
||||
" print(\" Tanggal :\", tgl_str)\n",
|
||||
" print(\" Lokasi :\", location)\n",
|
||||
"else:\n",
|
||||
" company = product = tgl_str = location = None\n",
|
||||
" print(\"\\nPola event peluncuran produk tidak ditemukan.\")\n",
|
||||
"\n",
|
||||
"# 10. Template IE (Template Extraction)\n",
|
||||
"event_template = {\n",
|
||||
" \"Company\": company,\n",
|
||||
" \"Product\": product,\n",
|
||||
" \"LaunchDateOriginal\": tgl_str,\n",
|
||||
" \"LaunchDateISO\": normalized_dates[0] if normalized_dates else None,\n",
|
||||
" \"Location\": location,\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"print(\"\\nTEMPLATE EVENT (hasil akhir IE):\")\n",
|
||||
"print(event_template)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "1e128113-af1e-45a1-8586-48c4acf578b4",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Proses Information Extraction selesai dijalankan oleh Fatah.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"\\nProses Information Extraction selesai dijalankan oleh Fatah.\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user