{ "cells": [ { "cell_type": "code", "execution_count": 4, "id": "7c7601d6-3c91-453e-8c29-706528237596", "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Python: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0]\n", "NLTK : 3.9.2\n", "Resource punkt siap dipakai\n", "TEKS CONTOH:\n", "\n", "CEO Apple, Tim Cook, meluncurkan iPhone 15 pada 12 September 2023 di Cupertino, California.\n", "Pada hari berikutnya, ia menghadiri pertemuan investor di San Francisco.\n", "\n", "\n", "TOKENS:\n", "['CEO', 'Apple', ',', 'Tim', 'Cook', ',', 'meluncurkan', 'iPhone', '15', 'pada', '12', 'September', '2023', 'di', 'Cupertino', ',', 'California', '.', 'Pada', 'hari', 'berikutnya', ',', 'ia', 'menghadiri', 'pertemuan', 'investor', 'di', 'San', 'Francisco', '.']\n", "\n", "POS TAGS (30 pertama):\n", "[('CEO', 'NNP'), ('Apple', 'NNP'), (',', ','), ('Tim', 'NNP'), ('Cook', 'NNP'), (',', ','), ('meluncurkan', 'VBD'), ('iPhone', 'NN'), ('15', 'CD'), ('pada', 'NN'), ('12', 'CD'), ('September', 'NNP'), ('2023', 'CD'), ('di', 'NN'), ('Cupertino', 'NNP'), (',', ','), ('California', 'NNP'), ('.', '.'), ('Pada', 'NNP'), ('hari', 'NN'), ('berikutnya', 'NN'), (',', ','), ('ia', 'JJ'), ('menghadiri', 'NN'), ('pertemuan', 'JJ'), ('investor', 'NN'), ('di', 'JJ'), ('San', 'NNP'), ('Francisco', 'NNP'), ('.', '.')]\n", "\n", "CANDIDATE ENTITIES (sederhana):\n", "- Apple\n", "- Tim Cook\n", "- September\n", "- Cupertino\n", "- California\n", "- Pada\n", "- San Francisco\n", "\n", "TANGGAL YANG DITEMUKAN:\n", "- 12 September 2023\n", "\n", "TANGGAL (FORMAT ISO 8601):\n", "- 2023-09-12\n", "\n", "EVENT PELUNCURAN PRODUK:\n", " Perusahaan : CEO Apple, Tim Cook\n", " Produk : iPhone 15\n", " Tanggal : 12 September 2023\n", " Lokasi : September\n", "\n", "TEMPLATE EVENT (hasil akhir IE):\n", "{'Company': 'CEO Apple, Tim Cook', 'Product': 'iPhone 15', 'LaunchDateOriginal': '12 September 2023', 'LaunchDateISO': '2023-09-12', 'Location': 'September'}\n" ] } ], "source": [ "# =========================\n", "# NLP – Information Extraction (IE)\n", "# Praktikum\n", "# =========================\n", "\n", "# 1. Cek versi & instalasi dasar\n", "import sys\n", "import nltk\n", "print(\"Python:\", sys.version)\n", "print(\"NLTK :\", nltk.__version__)\n", "\n", "# Jika di environment belum ada NLTK terbaru, aktifkan baris di bawah:\n", "# !{sys.executable} -m pip install --upgrade nltk\n", "\n", "# 2. Import pustaka\n", "import re\n", "from nltk.tokenize import wordpunct_tokenize\n", "from nltk import pos_tag\n", "\n", "# 3. Download resource NLTK yang stabil\n", "nltk.download('averaged_perceptron_tagger', quiet=True)\n", "try:\n", " nltk.download('punkt', quiet=True)\n", " print(\"Resource punkt siap dipakai\")\n", "except:\n", " print(\"Gagal download punkt, akan tetap pakai wordpunct_tokenize\")\n", "\n", "# 4. Contoh teks (bisa diganti)\n", "text = \"\"\"\n", "CEO Apple, Tim Cook, meluncurkan iPhone 15 pada 12 September 2023 di Cupertino, California.\n", "Pada hari berikutnya, ia menghadiri pertemuan investor di San Francisco.\n", "\"\"\"\n", "print(\"TEKS CONTOH:\")\n", "print(text)\n", "\n", "# 5. Tokenisasi dan POS tagging\n", "tokens = wordpunct_tokenize(text)\n", "print(\"\\nTOKENS:\")\n", "print(tokens)\n", "\n", "pos_tags = pos_tag(tokens)\n", "print(\"\\nPOS TAGS (30 pertama):\")\n", "print(pos_tags[:30])\n", "\n", "# 6. Ekstraksi “entitas” sederhana (berbasis huruf kapital)\n", "def simple_capital_ner(tokens):\n", " entities = []\n", " current = []\n", " for tok in tokens:\n", " if tok.istitle() and tok.isalpha():\n", " current.append(tok)\n", " else:\n", " if current:\n", " entities.append(\" \".join(current))\n", " current = []\n", " if current:\n", " entities.append(\" \".join(current))\n", " return entities\n", "\n", "candidate_entities = simple_capital_ner(tokens)\n", "print(\"\\nCANDIDATE ENTITIES (sederhana):\")\n", "for e in candidate_entities:\n", " print(\"-\", e)\n", "\n", "# 7. Ekstraksi tanggal (Ekstraksi Waktu) dengan RegEx\n", "bulan_id = r\"(Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\"\n", "date_pattern = rf\"\\b(\\d{{1,2}})\\s+{bulan_id}\\s+(\\d{{4}})\\b\"\n", "\n", "dates = re.findall(date_pattern, text)\n", "print(\"\\nTANGGAL YANG DITEMUKAN:\")\n", "for d in dates:\n", " print(\"-\", \" \".join(d))\n", "\n", "# 8. Normalisasi tanggal ke format ISO 8601 (YYYY-MM-DD)\n", "bulan_map = {\n", " \"Januari\": \"01\",\n", " \"Februari\": \"02\",\n", " \"Maret\": \"03\",\n", " \"April\": \"04\",\n", " \"Mei\": \"05\",\n", " \"Juni\": \"06\",\n", " \"Juli\": \"07\",\n", " \"Agustus\": \"08\",\n", " \"September\": \"09\",\n", " \"Oktober\": \"10\",\n", " \"November\": \"11\",\n", " \"Desember\": \"12\",\n", "}\n", "\n", "normalized_dates = []\n", "for hari, bulan, tahun in dates:\n", " bulan_num = bulan_map[bulan]\n", " hari_num = hari.zfill(2)\n", " iso = f\"{tahun}-{bulan_num}-{hari_num}\"\n", " normalized_dates.append(iso)\n", "\n", "print(\"\\nTANGGAL (FORMAT ISO 8601):\")\n", "for iso in normalized_dates:\n", " print(\"-\", iso)\n", "\n", "# 9. Ekstraksi event peluncuran produk (pattern-based)\n", "pattern = r\"(.+?) meluncurkan (.+?) pada (\\d{1,2} \" + bulan_id + r\" \\d{4}) di (.+?)\\.\"\n", "match = re.search(pattern, text)\n", "\n", "if match:\n", " company = match.group(1).strip(\" ,\")\n", " product = match.group(2).strip(\" ,\")\n", " tgl_str = match.group(3)\n", " location = match.group(4).strip(\" ,\")\n", "\n", " print(\"\\nEVENT PELUNCURAN PRODUK:\")\n", " print(\" Perusahaan :\", company)\n", " print(\" Produk :\", product)\n", " print(\" Tanggal :\", tgl_str)\n", " print(\" Lokasi :\", location)\n", "else:\n", " company = product = tgl_str = location = None\n", " print(\"\\nPola event peluncuran produk tidak ditemukan.\")\n", "\n", "# 10. Template IE (Template Extraction)\n", "event_template = {\n", " \"Company\": company,\n", " \"Product\": product,\n", " \"LaunchDateOriginal\": tgl_str,\n", " \"LaunchDateISO\": normalized_dates[0] if normalized_dates else None,\n", " \"Location\": location,\n", "}\n", "\n", "print(\"\\nTEMPLATE EVENT (hasil akhir IE):\")\n", "print(event_template)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "1e128113-af1e-45a1-8586-48c4acf578b4", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 5 }