# K-Means Clustering: Number of Senior High School (SMA) Educators, 2024
# Groups regions by their educator and school counts.
#
# The script is written to run top-to-bottom (e.g. in a Google Colab cell):
# it builds a sample dataset, explores it, standardises two features,
# evaluates K = 2..10, fits K-Means with `optimal_k` clusters, labels and
# plots the clusters, prints a recommendation per cluster and exports a CSV.
#
# DataFrame.to_markdown() needs the optional `tabulate` package.

# ==========================================
# 1. IMPORT LIBRARIES
# ==========================================
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.preprocessing import StandardScaler

# google.colab only exists inside Google Colab. Importing it unconditionally
# would abort the whole analysis elsewhere, before the download fallback at
# the end of the script could ever run, so the import is made optional.
try:
    from google.colab import files  # Still needed for Colab functionality
except ImportError:
    files = None

# Silence warnings and set the plot style
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

SEPARATOR = "=" * 60
OUTPUT_CSV = 'hasil_clustering_pendidik_sma_2024.csv'
DOWNLOAD_FAILED_MSG = (
    "Gagal auto-download. Pastikan Anda menjalankan kode di Google Colab."
)


def _print_banner(title, leading_blank=True):
    """Print a section title framed by two separator lines.

    Args:
        title: Text printed between the separator lines.
        leading_blank: When true, an empty line is printed before the banner.
    """
    print(("\n" if leading_blank else "") + SEPARATOR)
    print(title)
    print(SEPARATOR)


# ==========================================
# 2. LOAD DATA
# ==========================================
# Sample data (remove this if you load the data from an uploaded file)
data_contoh = {
    'wilayah': [
        'DKI Jakarta', 'Jawa Barat', 'Jawa Tengah', 'Jawa Timur',
        'Sumatera Utara', 'Banten', 'Sulawesi Selatan', 'Kalimantan Timur',
        'Papua', 'Maluku', 'Bali', 'NTB',
        'Lampung', 'Riau', 'Sulawesi Utara', 'Kalimantan Selatan',
        'Jambi', 'Bengkulu', 'Aceh', 'Sumatera Barat',
        'NTT', 'Papua Barat', 'Gorontalo', 'Maluku Utara',
    ],
    'jumlah_pendidik': [
        15000, 25000, 20000, 23000, 12000, 10000, 8000, 5000,
        3000, 2500, 6000, 4500, 7000, 6500, 4000, 5500,
        4200, 3500, 8500, 7500, 3800, 2000, 1800, 2200,
    ],
    'jumlah_sekolah': [
        450, 850, 750, 800, 400, 350, 300, 200,
        150, 120, 250, 180, 280, 260, 170, 220,
        190, 160, 320, 300, 190, 100, 90, 110,
    ],
}
df = pd.DataFrame(data_contoh)

# Compute the teacher-per-school ratio
df['rasio_guru_per_sekolah'] = df['jumlah_pendidik'] / df['jumlah_sekolah']

# ==========================================
# 3. BRIEF DATA EXPLORATION
# ==========================================
_print_banner("EKSPLORASI DATA AWAL", leading_blank=False)

# Count missing cells instead of asserting that there are none; for the
# sample data above the count is 0 and the printed line is unchanged.
missing_total = int(df.isna().sum().sum())
if missing_total == 0:
    missing_note = "Tidak ada missing value."
else:
    missing_note = f"Terdapat {missing_total} missing value."
print(f"\nJumlah data: {len(df)} wilayah. {missing_note}\n")

print(df.head().to_markdown(index=False, numalign="left", stralign="left"))
print(
    "\nStatistik Deskriptif:\n",
    df.describe().round(1).to_markdown(numalign="left", stralign="left"),
)

# ==========================================
# 4. INITIAL DATA VISUALISATION (simplified)
# ==========================================
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Eksplorasi Data Pendidik SMA 2024', fontsize=16, fontweight='bold')

# Plot 1: scatter plot of educators against schools
axes[0].scatter(df['jumlah_sekolah'], df['jumlah_pendidik'], alpha=0.7, s=100)
axes[0].set_xlabel('Jumlah Sekolah')
axes[0].set_ylabel('Jumlah Pendidik')
axes[0].set_title('Distribusi Pendidik vs Sekolah')

# Plot 2: distribution of the teacher-per-school ratio
sns.histplot(df['rasio_guru_per_sekolah'], bins=15, ax=axes[1], kde=True)
axes[1].set_xlabel('Rasio Guru per Sekolah')
axes[1].set_title('Distribusi Rasio Guru per Sekolah')

# Plot 3: box plot of educator and school counts (seaborn for convenience)
sns.boxplot(
    data=df[['jumlah_pendidik', 'jumlah_sekolah']],
    ax=axes[2],
    orient='v',
    palette=['#FFA07A', '#98D8C8'],
)
axes[2].set_title('Box Plot Outlier')

# Leave room at the top of the figure for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

# ==========================================
# 5. DATA PREPARATION FOR CLUSTERING
# ==========================================
_print_banner("PERSIAPAN DATA")

# Select the features and standardise them for K-Means
features = ['jumlah_pendidik', 'jumlah_sekolah']
X = df[features].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"\nFitur yang digunakan: {features}")
print(f"Shape data: {X.shape}")
print("Data telah dinormalisasi menggunakan StandardScaler.")

# ==========================================
# 6. CHOOSING THE NUMBER OF CLUSTERS
# ==========================================
_print_banner("MENENTUKAN JUMLAH CLUSTER OPTIMAL")

# Number of clusters used for the final model and marked on the plots below.
# NOTE(review): in the sample run recorded at the end of this file the
# silhouette score peaks at K=2 (0.73 vs 0.59 for K=3); K=3 is kept as the
# original choice - confirm it is intended.
optimal_k = 3
recommend_label = f'K={optimal_k} (Rekomendasi)'

inertias = []
silhouette_scores = []
# silhouette_score needs at most n_samples - 1 clusters, so cap the upper
# bound for small datasets (for the 24 sample rows this is still 2..10).
max_k = min(10, len(df) - 1)
K_range = range(2, max_k + 1)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

# Visualise the elbow method and the silhouette score
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Elbow method
axes[0].plot(K_range, inertias, 'bo-')
axes[0].set_xlabel('Jumlah Cluster (K)')
axes[0].set_ylabel('Inertia')
axes[0].set_title('Elbow Method')
axes[0].axvline(x=optimal_k, color='r', linestyle='--', label=recommend_label)
axes[0].legend()

# Silhouette score
axes[1].plot(K_range, silhouette_scores, 'go-')
axes[1].set_xlabel('Jumlah Cluster (K)')
axes[1].set_ylabel('Silhouette Score')
axes[1].set_title('Silhouette Score')
axes[1].axvline(x=optimal_k, color='r', linestyle='--', label=recommend_label)
axes[1].legend()
plt.show()

# Show the evaluation table
eval_df = pd.DataFrame({
    'K': list(K_range),
    'Inertia': inertias,
    'Silhouette Score': silhouette_scores,
})
print("\nEvaluasi untuk berbagai K:\n", eval_df.to_markdown(index=False))

# ==========================================
# 7. K-MEANS CLUSTERING
# ==========================================
_print_banner(f"IMPLEMENTASI K-MEANS CLUSTERING (K={optimal_k})")

kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init='auto')
df['cluster'] = kmeans.fit_predict(X_scaled)

# Final evaluation metrics
s_score = silhouette_score(X_scaled, df['cluster'])
db_score = davies_bouldin_score(X_scaled, df['cluster'])

print(f"\nClustering selesai! Iterasi: {kmeans.n_iter_}")
print(
    f"Inertia: {kmeans.inertia_:.2f} | Silhouette Score: {s_score:.3f} | "
    f"Davies-Bouldin Index: {db_score:.3f}"
)

# ==========================================
# 8. BRIEF ANALYSIS OF THE CLUSTERING RESULT
# ==========================================
_print_banner("ANALISIS HASIL CLUSTERING")

# Build the cluster labels from the mean educator count of each cluster.
# Two clusters can fall into the same band and then share a label.
cluster_labels = {}
avg_pendidik = df.groupby('cluster')['jumlah_pendidik'].mean()
for i in range(optimal_k):
    if avg_pendidik[i] > 15000:
        cluster_labels[i] = 'Kepadatan Tinggi'
    elif avg_pendidik[i] > 7000:
        cluster_labels[i] = 'Kepadatan Sedang'
    else:
        cluster_labels[i] = 'Kepadatan Rendah'

df['label_cluster'] = df['cluster'].map(cluster_labels)

# Per-cluster summary; the index is renamed from cluster id to label for display
summary = df.groupby('cluster').agg(
    wilayah=('wilayah', 'count'),
    pendidik_avg=('jumlah_pendidik', 'mean'),
    sekolah_avg=('jumlah_sekolah', 'mean'),
    rasio_avg=('rasio_guru_per_sekolah', 'mean'),
).round(2).rename(index=cluster_labels)

print(
    "\nRingkasan Statistik per Cluster:\n",
    summary.to_markdown(numalign="left", stralign="left"),
)

# List the regions in each cluster
print("\nWilayah dalam setiap Cluster:")
for i in range(optimal_k):
    wilayah_list = df[df['cluster'] == i]['wilayah'].values
    print(f" • Cluster {i} ({cluster_labels[i]}): {', '.join(wilayah_list)}")

# ==========================================
# 9. VISUALISATION OF THE CLUSTERING RESULT (simplified)
# ==========================================
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle('Hasil K-Means Clustering: Pendidik SMA 2024', fontsize=16, fontweight='bold')

# Plot 1: scatter plot coloured by cluster.
# The centroids are mapped back to the original units; their columns follow
# `features`, i.e. column 0 = jumlah_pendidik (y) and column 1 = jumlah_sekolah (x).
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
sns.scatterplot(
    data=df,
    x='jumlah_sekolah',
    y='jumlah_pendidik',
    hue='label_cluster',
    style='label_cluster',
    s=150,
    ax=axes[0],
    palette='viridis',
    legend='full',
    edgecolor='black',
)
axes[0].scatter(
    centroids[:, 1],
    centroids[:, 0],
    marker='X',
    s=400,
    c='red',
    edgecolors='black',
    linewidth=2,
    label='Centroid',
    zorder=5,
)
axes[0].set_xlabel('Jumlah Sekolah')
axes[0].set_ylabel('Jumlah Pendidik')
axes[0].set_title('Scatter Plot: Hasil Clustering (Pendidik vs Sekolah)')
axes[0].legend(title='Cluster')

# Plot 2: box plot of the teacher-per-school ratio for each cluster
sns.boxplot(
    data=df,
    x='label_cluster',
    y='rasio_guru_per_sekolah',
    ax=axes[1],
    palette='viridis',
)
axes[1].set_xlabel('Cluster')
axes[1].set_ylabel('Rasio Guru per Sekolah')
axes[1].set_title('Perbandingan Rasio Guru per Sekolah per Cluster')

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

# ==========================================
# 10. BRIEF INTERPRETATION AND RECOMMENDATION
# ==========================================
_print_banner("INTERPRETASI & REKOMENDASI")

# Look the mean ratio up by cluster id, not by label: `summary` is indexed by
# label, and when two clusters share a label `summary.loc[label, ...]` returns
# a Series, which breaks the `:.2f` formatting below. The values are rounded
# to two decimals exactly like the `rasio_avg` column of `summary`.
rasio_per_cluster = (
    df.groupby('cluster')['rasio_guru_per_sekolah'].mean().round(2)
)

# NOTE(review): the first message quotes an ideal range of 25-30, while the
# code treats every ratio from 25 up to 35 as ideal - confirm the thresholds.
for cluster, label in cluster_labels.items():
    avg_rasio = rasio_per_cluster[cluster]
    print(f"\nCluster: {cluster} - {label} (Rasio rata-rata: {avg_rasio:.2f})")
    if avg_rasio < 25:
        print(" REKOMENDASI: **PENAMBAHAN guru** (Rasio di bawah standar ideal 25-30)")
    elif avg_rasio > 35:
        print(" REKOMENDASI: **SURPLUS guru** (Pertimbangkan redistribusi)")
    else:
        print(" ✓ STATUS: Rasio guru sudah dalam **batas ideal**.")

# ==========================================
# 11. EXPORT THE RESULT
# ==========================================
_print_banner("EXPORT HASIL")

hasil_df = df[[
    'wilayah',
    'jumlah_pendidik',
    'jumlah_sekolah',
    'rasio_guru_per_sekolah',
    'cluster',
    'label_cluster',
]].sort_values('cluster')
hasil_df.to_csv(OUTPUT_CSV, index=False)
print(f"Hasil clustering telah disimpan ke: {OUTPUT_CSV}")

# The browser download is best-effort: the CSV is already on disk, so a
# failure here is reported and the script continues.
if files is None:
    print(DOWNLOAD_FAILED_MSG)
else:
    try:
        files.download(OUTPUT_CSV)
    except Exception:
        print(DOWNLOAD_FAILED_MSG)

_print_banner("ANALISIS SELESAI!")

# ------------------------------------------------------------------
# OUTPUT
# Console output of a run on the sample data, as pasted with the script.
# Line breaks are restored; spacing inside table rows is as found.
# ------------------------------------------------------------------
# ============================================================
# EKSPLORASI DATA AWAL
# ============================================================
#
# Jumlah data: 24 wilayah. Tidak ada missing value.
#
# | wilayah | jumlah_pendidik | jumlah_sekolah | rasio_guru_per_sekolah |
# |:---------------|:------------------|:-----------------|:-------------------------|
# | DKI Jakarta | 15000 | 450 | 33.3333 |
# | Jawa Barat | 25000 | 850 | 29.4118 |
# | Jawa Tengah | 20000 | 750 | 26.6667 |
# | Jawa Timur | 23000 | 800 | 28.75 |
# | Sumatera Utara | 12000 | 400 | 30 |
#
# Statistik Deskriptif:
# | | jumlah_pendidik | jumlah_sekolah | rasio_guru_per_sekolah |
# |:------|:------------------|:-----------------|:-------------------------|
# | count | 24 | 24 | 24 |
# | mean | 7937.5 | 299.6 | 24.7 |
# | std | 6573.4 | 214.8 | 3.7 |
# | min | 1800 | 90 | 20 |
# | 25% | 3725 | 167.5 | 21.6 |
# | 50% | 5750 | 235 | 25 |
# | 75% | 8875 | 327.5 | 26.7 |
# | max | 25000 | 850 | 33.3 |
#
# ============================================================
# PERSIAPAN DATA
# ============================================================
#
# Fitur yang digunakan: ['jumlah_pendidik', 'jumlah_sekolah']
# Shape data: (24, 2)
# Data telah dinormalisasi menggunakan StandardScaler.
#
# ============================================================
# MENENTUKAN JUMLAH CLUSTER OPTIMAL
# ============================================================
#
# Evaluasi untuk berbagai K:
# | K | Inertia | Silhouette Score |
# |----:|----------:|-------------------:|
# | 2 | 10.4674 | 0.731571 |
# | 3 | 3.67937 | 0.594546 |
# | 4 | 1.82605 | 0.585692 |
# | 5 | 1.17329 | 0.557479 |
# | 6 | 0.830915 | 0.501176 |
# | 7 | 0.532124 | 0.472502 |
# | 8 | 0.399343 | 0.420903 |
# | 9 | 0.322782 | 0.386624 |
# | 10 | 0.249896 | 0.353665 |
#
# ============================================================
# IMPLEMENTASI K-MEANS CLUSTERING (K=3)
# ============================================================
#
# Clustering selesai! Iterasi: 6
# Inertia: 3.68 | Silhouette Score: 0.595 | Davies-Bouldin Index: 0.464
#
# ============================================================
# ANALISIS HASIL CLUSTERING
# ============================================================
#
# Ringkasan Statistik per Cluster:
# | cluster | wilayah | pendidik_avg | sekolah_avg | rasio_avg |
# |:-----------------|:----------|:---------------|:--------------|:------------|
# | Kepadatan Rendah | 14 | 3892.86 | 170.71 | 22.31 |
# | Kepadatan Tinggi | 3 | 22666.7 | 800 | 28.28 |
# | Kepadatan Sedang | 7 | 9714.29 | 342.86 | 27.88 |
#
# Wilayah dalam setiap Cluster:
#  • Cluster 0 (Kepadatan Rendah): Kalimantan Timur, Papua, Maluku, Bali, NTB, Riau, Sulawesi Utara, Kalimantan Selatan, Jambi, Bengkulu, NTT, Papua Barat, Gorontalo, Maluku Utara
#  • Cluster 1 (Kepadatan Tinggi): Jawa Barat, Jawa Tengah, Jawa Timur
#  • Cluster 2 (Kepadatan Sedang): DKI Jakarta, Sumatera Utara, Banten, Sulawesi Selatan, Lampung, Aceh, Sumatera Barat
#
# ============================================================
# INTERPRETASI & REKOMENDASI
# ============================================================
#
# Cluster: 0 - Kepadatan Rendah (Rasio rata-rata: 22.31)
#  REKOMENDASI: **PENAMBAHAN guru** (Rasio di bawah standar ideal 25-30)
#
# Cluster: 1 - Kepadatan Tinggi (Rasio rata-rata: 28.28)
#  ✓ STATUS: Rasio guru sudah dalam **batas ideal**.
#
# Cluster: 2 - Kepadatan Sedang (Rasio rata-rata: 27.88)
#  ✓ STATUS: Rasio guru sudah dalam **batas ideal**.
#
# ============================================================
# EXPORT HASIL
# ============================================================
# Hasil clustering telah disimpan ke: hasil_clustering_pendidik_sma_2024.csv
#
# ============================================================
# ANALISIS SELESAI!
# ============================================================