import nltk

# NLTK data packages requested by this notebook: the perceptron POS tagger
# (both package names are requested, as in the original cell) and the punkt
# tokenizer models.
NLTK_RESOURCES = (
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "punkt",
)

# Keep every download result instead of discarding all but the last one, so a
# failed download is reported here rather than surfacing later as a
# LookupError when the tagger is first used.
# NOTE(review): relies on nltk.download() returning a truthy value on success
# and a falsy one on failure -- see the NLTK downloader documentation.
download_ok = {resource: nltk.download(resource) for resource in NLTK_RESOURCES}

missing_resources = [name for name, ok in download_ok.items() if not ok]
if missing_resources:
    print("Resource NLTK gagal diunduh:", ", ".join(missing_resources))

# Cell result (shown as Out[...]): True only when every resource is available.
all(download_ok.values())
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python: 3.13.5 | packaged by Anaconda, Inc. | (main, Jun 12 2025, 16:37:03) [MSC v.1929 64 bit (AMD64)]\n", + "NLTK : 3.9.1\n", + "Resource punkt siap dipakai\n", + "TEKS CONTOH:\n", + "\n", + "Mahasiswa informatika bernama Fatah Sabila Rosyad melakukan praktikum NLP\n", + "tentang Information Extraction pada 22 November 2025 di Universitas Bhayangkara Jakarta Raya.\n", + "Pada kegiatan tersebut, Fatah mempelajari ekstraksi entitas, tanggal, dan event berbasis teks.\n", + "\n", + "\n", + "TOKENS:\n", + "['Mahasiswa', 'informatika', 'bernama', 'Fatah', 'Sabila', 'Rosyad', 'melakukan', 'praktikum', 'NLP', 'tentang', 'Information', 'Extraction', 'pada', '22', 'November', '2025', 'di', 'Universitas', 'Bhayangkara', 'Jakarta', 'Raya', '.', 'Pada', 'kegiatan', 'tersebut', ',', 'Fatah', 'mempelajari', 'ekstraksi', 'entitas', ',', 'tanggal', ',', 'dan', 'event', 'berbasis', 'teks', '.']\n", + "\n", + "POS TAGS (30 pertama):\n", + "[('Mahasiswa', 'NNP'), ('informatika', 'JJ'), ('bernama', 'NN'), ('Fatah', 'NNP'), ('Sabila', 'NNP'), ('Rosyad', 'NNP'), ('melakukan', 'NN'), ('praktikum', 'NN'), ('NLP', 'NNP'), ('tentang', 'NN'), ('Information', 'NNP'), ('Extraction', 'NNP'), ('pada', 'VBD'), ('22', 'CD'), ('November', 'NNP'), ('2025', 'CD'), ('di', 'NN'), ('Universitas', 'NNP'), ('Bhayangkara', 'NNP'), ('Jakarta', 'NNP'), ('Raya', 'NNP'), ('.', '.'), ('Pada', 'NNP'), ('kegiatan', 'NN'), ('tersebut', 'NN'), (',', ','), ('Fatah', 'NNP'), ('mempelajari', 'NNP'), ('ekstraksi', 'NN'), ('entitas', 'NNS')]\n", + "\n", + "CANDIDATE ENTITIES (sederhana):\n", + "- Mahasiswa\n", + "- Fatah Sabila Rosyad\n", + "- Information Extraction\n", + "- November\n", + "- Universitas Bhayangkara Jakarta Raya\n", + "- Pada\n", + "- Fatah\n", + "\n", + "TANGGAL YANG DITEMUKAN:\n", + "- 22 November 2025\n", + "\n", + "TANGGAL (FORMAT ISO 8601):\n", + "- 2025-11-22\n", + "\n", + "Pola event peluncuran produk tidak 
# =========================
# NLP – Information Extraction (IE)
# Lab exercise
# =========================

# 1. Standard-library imports. The NLTK imports live in the driver section at
#    the bottom so that the pure helpers below can be imported (and tested)
#    without NLTK or network access.
import datetime
import re
import sys

# 2. Sample text (can be replaced)
text = """
Mahasiswa informatika bernama Fatah Sabila Rosyad melakukan praktikum NLP
tentang Information Extraction pada 22 November 2025 di Universitas Bhayangkara Jakarta Raya.
Pada kegiatan tersebut, Fatah mempelajari ekstraksi entitas, tanggal, dan event berbasis teks.
"""

# 3. Patterns and lookup tables
# Indonesian month name -> two-digit month number.
bulan_map = {
    "Januari": "01",
    "Februari": "02",
    "Maret": "03",
    "April": "04",
    "Mei": "05",
    "Juni": "06",
    "Juli": "07",
    "Agustus": "08",
    "September": "09",
    "Oktober": "10",
    "November": "11",
    "Desember": "12",
}

# Month alternation derived from bulan_map so the regex and the lookup table
# cannot drift apart. bulan_id keeps its capturing group because date_pattern
# relies on it to return (day, month, year) triples from re.findall.
_month_alternation = "|".join(bulan_map)
bulan_id = "(" + _month_alternation + ")"
date_pattern = rf"\b(\d{{1,2}})\s+{bulan_id}\s+(\d{{4}})\b"

# Product-launch event: "<company> meluncurkan <product> pada <date> di <location>."
# Named groups are used on purpose: the original pattern embedded bulan_id's
# capturing group inside the date, which shifted the location to group 5 while
# the code read group 4 (the month name).
pattern = (
    r"(?P<company>.+?) meluncurkan (?P<product>.+?) pada "
    r"(?P<date>(?P<day>\d{1,2}) (?P<month>" + _month_alternation + r") (?P<year>\d{4}))"
    r" di (?P<location>.+?)\."
)


def simple_capital_ner(tokens):
    """Group consecutive capitalised alphabetic tokens into candidate entities.

    A token takes part in an entity when ``tok.istitle() and tok.isalpha()``
    is true; each maximal run of such tokens is joined with single spaces.
    All-caps tokens (e.g. "NLP") and tokens containing digits or punctuation
    end the current run.

    Args:
        tokens: iterable of string tokens.

    Returns:
        List of entity strings in order of appearance (empty for no tokens).
    """
    entities = []
    current = []
    for tok in tokens:
        if tok.istitle() and tok.isalpha():
            current.append(tok)
        elif current:
            entities.append(" ".join(current))
            current = []
    # Flush a run that reaches the end of the token stream.
    if current:
        entities.append(" ".join(current))
    return entities


def extract_dates(source_text):
    """Return every ``(day, month_name, year)`` triple matched by date_pattern."""
    return re.findall(date_pattern, source_text)


def to_iso_date(hari, bulan, tahun):
    """Convert an Indonesian date to ISO 8601 (``YYYY-MM-DD``).

    Args:
        hari: day of month as a digit string.
        bulan: month name, expected to be a key of ``bulan_map``.
        tahun: year as a digit string.

    Returns:
        The ISO date string, or ``None`` when the month name is unknown or the
        combination is not a real calendar date (e.g. "31 Februari 2024").
    """
    try:
        return datetime.date(int(tahun), int(bulan_map[bulan]), int(hari)).isoformat()
    except (KeyError, ValueError):
        return None


def build_event_template(match):
    """Build the IE event template from a launch-event match.

    Args:
        match: result of ``re.search(pattern, ...)``; may be ``None``.

    Returns:
        Dict with keys ``Company``, ``Product``, ``LaunchDateOriginal``,
        ``LaunchDateISO`` and ``Location``. Every value is ``None`` when
        ``match`` is ``None``. ``LaunchDateISO`` is derived from the date of
        the matched event itself, not from unrelated dates elsewhere in the
        text.
    """
    if match is None:
        return {
            "Company": None,
            "Product": None,
            "LaunchDateOriginal": None,
            "LaunchDateISO": None,
            "Location": None,
        }
    return {
        "Company": match.group("company").strip(" ,"),
        "Product": match.group("product").strip(" ,"),
        "LaunchDateOriginal": match.group("date"),
        "LaunchDateISO": to_iso_date(
            match.group("day"), match.group("month"), match.group("year")
        ),
        "Location": match.group("location").strip(" ,"),
    }


# Driver section. A Jupyter kernel executes cells with __name__ == "__main__",
# so this runs in the notebook exactly like top-level code (and every name
# assigned here is still a notebook global); the guard only keeps the NLTK
# import and downloads from running when the helpers are imported elsewhere.
if __name__ == "__main__":
    import nltk
    from nltk import pos_tag
    from nltk.tokenize import wordpunct_tokenize

    # 4. Version check
    print("Python:", sys.version)
    print("NLTK :", nltk.__version__)

    # If the environment does not have a recent NLTK, enable the line below:
    # !{sys.executable} -m pip install --upgrade nltk

    # 5. Download NLTK resources
    # The "_eng" tagger package is also fetched by the first notebook cell; it
    # is repeated here so this cell does not depend on that cell having run.
    for tagger_resource in ("averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"):
        nltk.download(tagger_resource, quiet=True)

    # Report success only when the download actually succeeded.
    # NOTE(review): relies on nltk.download() returning a falsy value on
    # failure; the except clause covers the case where it raises instead.
    try:
        punkt_ready = nltk.download('punkt', quiet=True)
    except Exception:
        punkt_ready = False
    if punkt_ready:
        print("Resource punkt siap dipakai")
    else:
        print("Gagal download punkt, akan tetap pakai wordpunct_tokenize")

    print("TEKS CONTOH:")
    print(text)

    # 6. Tokenisation and POS tagging
    tokens = wordpunct_tokenize(text)
    print("\nTOKENS:")
    print(tokens)

    # NOTE(review): pos_tag is called with its default settings on Indonesian
    # text; the stored output tags e.g. 'pada' as VBD, so treat these tags as
    # illustrative only.
    pos_tags = pos_tag(tokens)
    print("\nPOS TAGS (30 pertama):")
    print(pos_tags[:30])

    # 7. Simple capital-letter-based "entity" extraction
    candidate_entities = simple_capital_ner(tokens)
    print("\nCANDIDATE ENTITIES (sederhana):")
    for entity in candidate_entities:
        print("-", entity)

    # 8. Date extraction with a regular expression
    dates = extract_dates(text)
    print("\nTANGGAL YANG DITEMUKAN:")
    for date_parts in dates:
        print("-", " ".join(date_parts))

    # 9. Normalise dates to ISO 8601 (YYYY-MM-DD); impossible dates are skipped
    normalized_dates = []
    for hari, bulan, tahun in dates:
        iso = to_iso_date(hari, bulan, tahun)
        if iso is not None:
            normalized_dates.append(iso)

    print("\nTANGGAL (FORMAT ISO 8601):")
    for iso in normalized_dates:
        print("-", iso)

    # 10. Pattern-based product-launch event extraction + IE template
    match = re.search(pattern, text)
    event_template = build_event_template(match)
    company = event_template["Company"]
    product = event_template["Product"]
    tgl_str = event_template["LaunchDateOriginal"]
    location = event_template["Location"]

    if match:
        print("\nEVENT PELUNCURAN PRODUK:")
        print(" Perusahaan :", company)
        print(" Produk :", product)
        print(" Tanggal :", tgl_str)
        print(" Lokasi :", location)
    else:
        print("\nPola event peluncuran produk tidak ditemukan.")

    print("\nTEMPLATE EVENT (hasil akhir IE):")
    print(event_template)

    # Closing cell of the notebook
    print("\nProses Information Extraction selesai dijalankan oleh Fatah.")