Menambahkan File Tugas
This commit is contained in:
commit
e37fb7d850
173
File Tugas/DecisionTree.ipynb
Normal file
173
File Tugas/DecisionTree.ipynb
Normal file
@ -0,0 +1,173 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "e29b569c-b6a4-4eff-898d-ba939193228d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Sedang memproses data...\n",
|
||||
"Sedang melatih model Decision Tree...\n",
|
||||
"\n",
|
||||
"========================================\n",
|
||||
"HASIL EVALUASI (DECISION TREE)\n",
|
||||
"========================================\n",
|
||||
"1. Single Split Test:\n",
|
||||
" - R2 Score (Akurasi) : 0.8059 (80.59%)\n",
|
||||
" - RMSE (Error Kuadrat): 1.0580\n",
|
||||
" - MAE (Rata-rata Error): 0.7046 poin\n",
|
||||
"\n",
|
||||
"2. Cross Validation (5-Fold):\n",
|
||||
" - Skor per fold : [0.5092657 0.74560943 0.78916584 0.80808243 0.81677625]\n",
|
||||
" - Rata-rata R2 : 0.7338\n",
|
||||
" - Kestabilan : +/- 0.1149\n",
|
||||
"\n",
|
||||
"========================================\n",
|
||||
"Contoh Prediksi:\n",
|
||||
" - Rating Asli : 7.0\n",
|
||||
" - Prediksi Model : 6.19\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.model_selection import train_test_split, cross_val_score\n",
|
||||
"from sklearn.tree import DecisionTreeRegressor\n",
|
||||
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
|
||||
"from sklearn.preprocessing import LabelEncoder\n",
|
||||
"\n",
|
||||
"# ==========================================\n",
|
||||
"# 1. LOAD DATA & PREPROCESSING\n",
|
||||
"# ==========================================\n",
|
||||
"print(\"Sedang memproses data...\")\n",
|
||||
"\n",
|
||||
"df = pd.read_csv('Latest 2025 movies Datasets.csv')\n",
|
||||
"\n",
|
||||
"# Membersihkan data: pastikan kolom penting tidak kosong\n",
|
||||
"required_cols = ['release_date', 'vote_average', 'popularity', 'vote_count', 'original_language']\n",
|
||||
"df = df.dropna(subset=required_cols)\n",
|
||||
"\n",
|
||||
"# Konversi release_date ke datetime\n",
|
||||
"df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')\n",
|
||||
"df = df.dropna(subset=['release_date']) # hapus yang gagal konversi\n",
|
||||
"\n",
|
||||
"# Feature Engineering\n",
|
||||
"df['release_year'] = df['release_date'].dt.year\n",
|
||||
"df['release_month'] = df['release_date'].dt.month\n",
|
||||
"\n",
|
||||
"# Encoding original_language\n",
|
||||
"le = LabelEncoder()\n",
|
||||
"df['original_language_encoded'] = le.fit_transform(df['original_language'])\n",
|
||||
"\n",
|
||||
"# Menentukan Fitur & Target\n",
|
||||
"features = ['popularity', 'vote_count', 'release_year', 'release_month', 'original_language_encoded']\n",
|
||||
"X = df[features]\n",
|
||||
"y = df['vote_average']\n",
|
||||
"\n",
|
||||
"# Split Data (80% train, 20% test)\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
||||
"\n",
|
||||
"# ==========================================\n",
|
||||
"# 2. TRAINING MODEL (DECISION TREE)\n",
|
||||
"# ==========================================\n",
|
||||
"print(\"Sedang melatih model Decision Tree...\")\n",
|
||||
"\n",
|
||||
"model = DecisionTreeRegressor(max_depth=5, random_state=42)\n",
|
||||
"model.fit(X_train, y_train)\n",
|
||||
"\n",
|
||||
"# ==========================================\n",
|
||||
"# 3. EVALUASI LENGKAP & CROSS VALIDATION\n",
|
||||
"# ==========================================\n",
|
||||
"print(\"\\n\" + \"=\"*40)\n",
|
||||
"print(\"HASIL EVALUASI (DECISION TREE)\")\n",
|
||||
"print(\"=\"*40)\n",
|
||||
"\n",
|
||||
"# A. Evaluasi Single Split (Test Set)\n",
|
||||
"y_pred = model.predict(X_test)\n",
|
||||
"\n",
|
||||
"r2 = r2_score(y_test, y_pred)\n",
|
||||
"rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
|
||||
"mae = mean_absolute_error(y_test, y_pred)\n",
|
||||
"\n",
|
||||
"print(f\"1. Single Split Test:\")\n",
|
||||
"print(f\" - R2 Score (Akurasi) : {r2:.4f} ({r2*100:.2f}%)\")\n",
|
||||
"print(f\" - RMSE (Error Kuadrat): {rmse:.4f}\")\n",
|
||||
"print(f\" - MAE (Rata-rata Error): {mae:.4f} poin\")\n",
|
||||
"\n",
|
||||
"# B. Cross Validation (5-Fold)\n",
|
||||
"print(f\"\\n2. Cross Validation (5-Fold):\")\n",
|
||||
"cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')\n",
|
||||
"print(f\" - Skor per fold : {cv_scores}\")\n",
|
||||
"print(f\" - Rata-rata R2 : {cv_scores.mean():.4f}\")\n",
|
||||
"print(f\" - Kestabilan : +/- {cv_scores.std():.4f}\")\n",
|
||||
"\n",
|
||||
"# C. Contoh Prediksi\n",
|
||||
"print(\"\\n\" + \"=\"*40)\n",
|
||||
"sample_index = y_test.index[0] # pastikan index asli\n",
|
||||
"print(f\"Contoh Prediksi:\")\n",
|
||||
"print(f\" - Rating Asli : {y_test.loc[sample_index]}\")\n",
|
||||
"print(f\" - Prediksi Model : {y_pred[0]:.2f}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "e15a4120-6a82-4d24-a90b-a0b6df3e59db",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5d8d987a-7a3a-4601-a22c-7ed3b486d288",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1ce678cc-f5cb-461f-aaad-b9a25ce0ec40",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2c7ad7ba-191e-472a-97a9-3870b5ee7f93",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
163
File Tugas/KNN.ipynb
Normal file
163
File Tugas/KNN.ipynb
Normal file
@ -0,0 +1,163 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "c0efb2f2-b8bd-43a3-bef6-b8ffbd5b8844",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Sedang memproses data...\n",
|
||||
"Sedang melatih model KNN (15 Tetangga)...\n",
|
||||
"\n",
|
||||
"========================================\n",
|
||||
"HASIL EVALUASI (KNN)\n",
|
||||
"========================================\n",
|
||||
"1. Single Split Test:\n",
|
||||
" - R2 Score (Akurasi) : 0.2060 (20.60%)\n",
|
||||
" - RMSE (Error Kuadrat): 2.1397\n",
|
||||
" - MAE (Rata-rata Error): 1.5157 poin\n",
|
||||
"\n",
|
||||
"2. Cross Validation (5-Fold):\n",
|
||||
" - Skor per tes : [0.10580954 0.07685483 0.12809619 0.14718973 0.0873096 ]\n",
|
||||
" - Rata-rata R2 : 0.1091\n",
|
||||
" - Kestabilan : +/- 0.0258\n",
|
||||
"\n",
|
||||
"========================================\n",
|
||||
"Contoh Prediksi: Rating Asli 7.0 | Prediksi KNN 5.32\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.model_selection import train_test_split, cross_val_score\n",
|
||||
"from sklearn.neighbors import KNeighborsRegressor\n",
|
||||
"from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
|
||||
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
|
||||
"\n",
|
||||
"# ==========================================\n",
|
||||
"# 1. LOAD DATA & PREPROCESSING\n",
|
||||
"# ==========================================\n",
|
||||
"print(\"Sedang memproses data...\")\n",
|
||||
"df = pd.read_csv('Latest 2025 movies Datasets.csv')\n",
|
||||
"\n",
|
||||
"# Membersihkan data\n",
|
||||
"df = df.dropna(subset=['release_date', 'vote_average', 'popularity', 'vote_count'])\n",
|
||||
"df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')\n",
|
||||
"df = df.dropna(subset=['release_date'])\n",
|
||||
"\n",
|
||||
"# Feature Engineering\n",
|
||||
"df['release_year'] = df['release_date'].dt.year\n",
|
||||
"df['release_month'] = df['release_date'].dt.month\n",
|
||||
"\n",
|
||||
"# Encoding Bahasa\n",
|
||||
"le = LabelEncoder()\n",
|
||||
"df['original_language_encoded'] = le.fit_transform(df['original_language'])\n",
|
||||
"\n",
|
||||
"# Fitur & Target\n",
|
||||
"features = ['popularity', 'vote_count', 'release_year', 'release_month', 'original_language_encoded']\n",
|
||||
"X = df[features]\n",
|
||||
"y = df['vote_average']\n",
|
||||
"\n",
|
||||
"# Split Data\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
||||
"\n",
|
||||
"# --- SCALING (Wajib untuk KNN) ---\n",
|
||||
"scaler = StandardScaler()\n",
|
||||
"X_train_scaled = scaler.fit_transform(X_train)\n",
|
||||
"X_test_scaled = scaler.transform(X_test)\n",
|
||||
"\n",
|
||||
"# Kita juga perlu scale X full untuk Cross Validation nanti\n",
|
||||
"X_scaled = scaler.transform(X)\n",
|
||||
"\n",
|
||||
"# ==========================================\n",
|
||||
"# 2. TRAINING MODEL (KNN)\n",
|
||||
"# ==========================================\n",
|
||||
"print(\"Sedang melatih model KNN (15 Tetangga)...\")\n",
|
||||
"model = KNeighborsRegressor(n_neighbors=15)\n",
|
||||
"model.fit(X_train_scaled, y_train)\n",
|
||||
"\n",
|
||||
"# ==========================================\n",
|
||||
"# 3. EVALUASI LENGKAP\n",
|
||||
"# ==========================================\n",
|
||||
"print(\"\\n\" + \"=\"*40)\n",
|
||||
"print(\"HASIL EVALUASI (KNN)\")\n",
|
||||
"print(\"=\"*40)\n",
|
||||
"\n",
|
||||
"# A. Single Split Test\n",
|
||||
"y_pred = model.predict(X_test_scaled)\n",
|
||||
"\n",
|
||||
"r2 = r2_score(y_test, y_pred)\n",
|
||||
"rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
|
||||
"mae = mean_absolute_error(y_test, y_pred)\n",
|
||||
"\n",
|
||||
"print(f\"1. Single Split Test:\")\n",
|
||||
"print(f\" - R2 Score (Akurasi) : {r2:.4f} ({r2*100:.2f}%)\")\n",
|
||||
"print(f\" - RMSE (Error Kuadrat): {rmse:.4f}\")\n",
|
||||
"print(f\" - MAE (Rata-rata Error): {mae:.4f} poin\")\n",
|
||||
"\n",
|
||||
"# B. Cross Validation (5-Fold)\n",
|
||||
"print(f\"\\n2. Cross Validation (5-Fold):\")\n",
|
||||
"# Kita gunakan X_scaled (data penuh yang sudah di-scale)\n",
|
||||
"cv_scores = cross_val_score(model, X_scaled, y, cv=5, scoring='r2')\n",
|
||||
"\n",
|
||||
"print(f\" - Skor per tes : {cv_scores}\")\n",
|
||||
"print(f\" - Rata-rata R2 : {cv_scores.mean():.4f}\")\n",
|
||||
"print(f\" - Kestabilan : +/- {cv_scores.std():.4f}\")\n",
|
||||
"\n",
|
||||
"# Contoh Prediksi\n",
|
||||
"print(\"\\n\" + \"=\"*40)\n",
|
||||
"print(f\"Contoh Prediksi: Rating Asli {y_test.iloc[0]} | Prediksi KNN {y_pred[0]:.2f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "784b48b0-d367-47da-912e-b75c91541665",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ffe5d513-95fe-452f-ba94-2ccd24e455b9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fab61e94-fe4a-425f-81c8-ea32d2738733",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
10001
File Tugas/Latest 2025 movies Datasets.csv
Normal file
10001
File Tugas/Latest 2025 movies Datasets.csv
Normal file
File diff suppressed because it is too large
Load Diff
165
File Tugas/PolynomialRegression.ipynb
Normal file
165
File Tugas/PolynomialRegression.ipynb
Normal file
@ -0,0 +1,165 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "02e1d686-6bb5-42ad-87a2-40036c54b9e0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Sedang memproses data...\n",
|
||||
"Sedang melatih model Polynomial Regression (Degree 2)...\n",
|
||||
"\n",
|
||||
"========================================\n",
|
||||
"HASIL EVALUASI (POLYNOMIAL DEGREE 2)\n",
|
||||
"========================================\n",
|
||||
"1. Single Split Test:\n",
|
||||
" - R2 Score (Akurasi) : -0.3654\n",
|
||||
" - RMSE (Error Kuadrat): 2.8060\n",
|
||||
" - MAE (Rata-rata Error): 1.6331 poin\n",
|
||||
"\n",
|
||||
"2. Cross Validation (5-Fold):\n",
|
||||
" - Skor per tes : [-1.46479088e+04 7.39100795e-02 1.21529017e-01 1.10146144e-01\n",
|
||||
" 5.71513075e-02]\n",
|
||||
" - Rata-rata R2 : -2929.5092\n",
|
||||
" - Kestabilan : +/- 5859.1998\n",
|
||||
"\n",
|
||||
"========================================\n",
|
||||
"Contoh Prediksi: Rating Asli 7.0 | Prediksi Poly 5.28\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.model_selection import train_test_split, cross_val_score\n",
|
||||
"from sklearn.linear_model import LinearRegression\n",
|
||||
"from sklearn.preprocessing import PolynomialFeatures, LabelEncoder\n",
|
||||
"from sklearn.pipeline import make_pipeline\n",
|
||||
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
|
||||
"\n",
|
||||
"# ==========================================\n",
|
||||
"# 1. LOAD DATA & PREPROCESSING\n",
|
||||
"# ==========================================\n",
|
||||
"print(\"Sedang memproses data...\")\n",
|
||||
"df = pd.read_csv('Latest 2025 movies Datasets.csv')\n",
|
||||
"# take a look at the dataset\n",
|
||||
"df.head()\n",
|
||||
"\n",
|
||||
"df = df.dropna(subset=['release_date', 'vote_average', 'popularity', 'vote_count'])\n",
|
||||
"df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')\n",
|
||||
"df = df.dropna(subset=['release_date'])\n",
|
||||
"\n",
|
||||
"df['release_year'] = df['release_date'].dt.year\n",
|
||||
"df['release_month'] = df['release_date'].dt.month\n",
|
||||
"\n",
|
||||
"le = LabelEncoder()\n",
|
||||
"df['original_language_encoded'] = le.fit_transform(df['original_language'])\n",
|
||||
"\n",
|
||||
"features = ['popularity', 'vote_count', 'release_year', 'release_month', 'original_language_encoded']\n",
|
||||
"X = df[features]\n",
|
||||
"y = df['vote_average']\n",
|
||||
"\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
||||
"\n",
|
||||
"# ==========================================\n",
|
||||
"# 2. TRAINING MODEL (POLYNOMIAL DEGREE 2)\n",
|
||||
"# ==========================================\n",
|
||||
"degree = 2\n",
|
||||
"print(f\"Sedang melatih model Polynomial Regression (Degree {degree})...\")\n",
|
||||
"# Pipeline: Buat fitur pangkat -> Lalu Regresi Linear\n",
|
||||
"model = make_pipeline(PolynomialFeatures(degree), LinearRegression())\n",
|
||||
"model.fit(X_train, y_train)\n",
|
||||
"\n",
|
||||
"# ==========================================\n",
|
||||
"# 3. EVALUASI LENGKAP\n",
|
||||
"# ==========================================\n",
|
||||
"print(\"\\n\" + \"=\"*40)\n",
|
||||
"print(f\"HASIL EVALUASI (POLYNOMIAL DEGREE {degree})\")\n",
|
||||
"print(\"=\"*40)\n",
|
||||
"\n",
|
||||
"# Prediksi data test\n",
|
||||
"y_pred = model.predict(X_test)\n",
|
||||
"\n",
|
||||
"r2 = r2_score(y_test, y_pred)\n",
|
||||
"rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
|
||||
"mae = mean_absolute_error(y_test, y_pred)\n",
|
||||
"\n",
|
||||
"print(f\"1. Single Split Test:\")\n",
|
||||
"print(f\" - R2 Score (Akurasi) : {r2:.4f}\")\n",
|
||||
"print(f\" - RMSE (Error Kuadrat): {rmse:.4f}\")\n",
|
||||
"print(f\" - MAE (Rata-rata Error): {mae:.4f} poin\")\n",
|
||||
"\n",
|
||||
"# Cross Validation (5-Fold)\n",
|
||||
"print(f\"\\n2. Cross Validation (5-Fold):\")\n",
|
||||
"# Hati-hati: Polynomial CV bisa agak lambat dibanding Linear biasa\n",
|
||||
"cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')\n",
|
||||
"\n",
|
||||
"print(f\" - Skor per tes : {cv_scores}\")\n",
|
||||
"print(f\" - Rata-rata R2 : {cv_scores.mean():.4f}\")\n",
|
||||
"print(f\" - Kestabilan : +/- {cv_scores.std():.4f}\")\n",
|
||||
"\n",
|
||||
"# Contoh Prediksi\n",
|
||||
"print(\"\\n\" + \"=\"*40)\n",
|
||||
"print(f\"Contoh Prediksi: Rating Asli {y_test.iloc[0]} | Prediksi Poly {y_pred[0]:.2f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "8d83a184-b95f-4b73-a67d-1d523923ee1f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'pd' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mNameError\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m df = \u001b[43mpd\u001b[49m.read_csv(\u001b[33m\"\u001b[39m\u001b[33mLatest 2025 movies Datasets.csv\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 3\u001b[39m \u001b[38;5;66;03m# take a look at the dataset\u001b[39;00m\n\u001b[32m 4\u001b[39m df.head()\n",
|
||||
"\u001b[31mNameError\u001b[39m: name 'pd' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df = pd.read_csv(\"Latest 2025 movies Datasets.csv\")\n",
|
||||
"\n",
|
||||
"# take a look at the dataset\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9c933e26-cbc2-47e4-9f25-efbb65ef1d92",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
162
File Tugas/RandomForest.ipynb
Normal file
162
File Tugas/RandomForest.ipynb
Normal file
@ -0,0 +1,162 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "7d1ad2c6-b86a-48f3-be70-4223a592e8f8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Sedang memproses data...\n",
|
||||
"Sedang melatih model Random Forest (100 pohon)...\n",
|
||||
"\n",
|
||||
"========================================\n",
|
||||
"HASIL EVALUASI MODEL\n",
|
||||
"========================================\n",
|
||||
"1. Single Split Test:\n",
|
||||
" - R2 Score (Akurasi) : 0.8569 (85.69%)\n",
|
||||
" - RMSE (Error Kuadrat): 0.9084\n",
|
||||
" - MAE (Rata-rata Error): 0.5492 poin\n",
|
||||
"\n",
|
||||
"2. Cross Validation (5-Fold):\n",
|
||||
" - Skor per tes : [0.54879999 0.77793875 0.82384791 0.84153289 0.8144776 ]\n",
|
||||
" - Rata-rata R2 : 0.7613\n",
|
||||
" - Kestabilan : +/- 0.1083\n",
|
||||
"\n",
|
||||
"========================================\n",
|
||||
"FAKTOR PENENTU RATING (Feature Importance)\n",
|
||||
"========================================\n",
|
||||
" Fitur Kepentingan\n",
|
||||
" vote_count 0.833263\n",
|
||||
" popularity 0.066340\n",
|
||||
" release_year 0.048906\n",
|
||||
" release_month 0.027821\n",
|
||||
"original_language_encoded 0.023670\n",
|
||||
"\n",
|
||||
"========================================\n",
|
||||
"Contoh Prediksi: Rating Asli 7.0 | Prediksi Model 5.17\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"from sklearn.model_selection import train_test_split, cross_val_score\n",
|
||||
"from sklearn.ensemble import RandomForestRegressor\n",
|
||||
"from sklearn.preprocessing import LabelEncoder\n",
|
||||
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
|
||||
"\n",
|
||||
"# ==========================================\n",
|
||||
"# 1. LOAD DATA & PREPROCESSING\n",
|
||||
"# ==========================================\n",
|
||||
"print(\"Sedang memproses data...\")\n",
|
||||
"df = pd.read_csv('Latest 2025 movies Datasets.csv')\n",
|
||||
"\n",
|
||||
"# Membersihkan nilai kosong (NaN)\n",
|
||||
"df = df.dropna(subset=['release_date', 'vote_average', 'popularity', 'vote_count'])\n",
|
||||
"df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')\n",
|
||||
"df = df.dropna(subset=['release_date'])\n",
|
||||
"\n",
|
||||
"# Feature Engineering: Mengambil Tahun & Bulan\n",
|
||||
"df['release_year'] = df['release_date'].dt.year\n",
|
||||
"df['release_month'] = df['release_date'].dt.month\n",
|
||||
"\n",
|
||||
"# Mengubah Bahasa (teks) menjadi Angka\n",
|
||||
"le = LabelEncoder()\n",
|
||||
"df['original_language_encoded'] = le.fit_transform(df['original_language'])\n",
|
||||
"\n",
|
||||
"# Menentukan Fitur (X) dan Target (y)\n",
|
||||
"features = ['popularity', 'vote_count', 'release_year', 'release_month', 'original_language_encoded']\n",
|
||||
"X = df[features]\n",
|
||||
"y = df['vote_average']\n",
|
||||
"\n",
|
||||
"# Membagi Data (80% Latih, 20% Uji)\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
|
||||
"\n",
|
||||
"# ==========================================\n",
|
||||
"# 2. TRAINING MODEL (RANDOM FOREST)\n",
|
||||
"# ==========================================\n",
|
||||
"print(\"Sedang melatih model Random Forest (100 pohon)...\")\n",
|
||||
"# Kita menggunakan algoritma JUARA kita\n",
|
||||
"model = RandomForestRegressor(n_estimators=100, random_state=42)\n",
|
||||
"model.fit(X_train, y_train)\n",
|
||||
"\n",
|
||||
"# ==========================================\n",
|
||||
"# 3. EVALUASI LENGKAP\n",
|
||||
"# ==========================================\n",
|
||||
"print(\"\\n\" + \"=\"*40)\n",
|
||||
"print(\"HASIL EVALUASI MODEL\")\n",
|
||||
"print(\"=\"*40)\n",
|
||||
"\n",
|
||||
"# Prediksi data test\n",
|
||||
"y_pred = model.predict(X_test)\n",
|
||||
"\n",
|
||||
"# Menghitung Metrik\n",
|
||||
"r2 = r2_score(y_test, y_pred)\n",
|
||||
"rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
|
||||
"mae = mean_absolute_error(y_test, y_pred)\n",
|
||||
"\n",
|
||||
"print(f\"1. Single Split Test:\")\n",
|
||||
"print(f\" - R2 Score (Akurasi) : {r2:.4f} ({r2*100:.2f}%)\")\n",
|
||||
"print(f\" - RMSE (Error Kuadrat): {rmse:.4f}\")\n",
|
||||
"print(f\" - MAE (Rata-rata Error): {mae:.4f} poin\")\n",
|
||||
"\n",
|
||||
"# Cross Validation (5-Fold)\n",
|
||||
"print(f\"\\n2. Cross Validation (5-Fold):\")\n",
|
||||
"cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')\n",
|
||||
"print(f\" - Skor per tes : {cv_scores}\")\n",
|
||||
"print(f\" - Rata-rata R2 : {cv_scores.mean():.4f}\")\n",
|
||||
"print(f\" - Kestabilan : +/- {cv_scores.std():.4f}\")\n",
|
||||
"\n",
|
||||
"# ==========================================\n",
|
||||
"# 4. FEATURE IMPORTANCE (RAHASIA MODEL)\n",
|
||||
"# ==========================================\n",
|
||||
"print(\"\\n\" + \"=\"*40)\n",
|
||||
"print(\"FAKTOR PENENTU RATING (Feature Importance)\")\n",
|
||||
"print(\"=\"*40)\n",
|
||||
"\n",
|
||||
"importances = model.feature_importances_\n",
|
||||
"feature_importance_df = pd.DataFrame({'Fitur': features, 'Kepentingan': importances})\n",
|
||||
"feature_importance_df = feature_importance_df.sort_values(by='Kepentingan', ascending=False)\n",
|
||||
"\n",
|
||||
"print(feature_importance_df.to_string(index=False))\n",
|
||||
"\n",
|
||||
"# Contoh Prediksi\n",
|
||||
"print(\"\\n\" + \"=\"*40)\n",
|
||||
"print(f\"Contoh Prediksi: Rating Asli {y_test.iloc[0]} | Prediksi Model {y_pred[0]:.2f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "af168121-0540-4186-a725-e6c493397535",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.0"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
94
README.md
Normal file
94
README.md
Normal file
@ -0,0 +1,94 @@
|
||||
Anggota Kelompok :
|
||||
1. Aryo Saputro (202310715049)
|
||||
2. Fawaz Irwan Ramadhan (202310715161)
|
||||
3. Putra Al Rifki (202310715112)
|
||||
<br>
|
||||
<br>
|
||||
|
||||
**Judul: Analisis Prediktif Rating Film 2025 Menggunakan Pendekatan Ensemble Learning**
|
||||
|
||||
BAB 1: PENDAHULUAN
|
||||
|
||||
1.1 Latar Belakang Masalah
|
||||
|
||||
Industri perfilman modern menghasilkan volume data yang masif setiap tahunnya. Keberhasilan sebuah film sering kali diukur tidak hanya dari pendapatan, tetapi juga dari penerimaan penonton yang tercermin dalam skor rating (vote average). Bagi pemangku kepentingan seperti rumah produksi dan investor, kemampuan untuk mengestimasi rating film—baik sebelum rilis maupun pada tahap awal penayangan—sangat krusial untuk menentukan strategi pemasaran dan proyeksi kesuksesan jangka panjang.
|
||||
Namun, perilaku penonton sangat kompleks dan tidak selalu linear. Film dengan biaya promosi tinggi (popularitas tinggi) tidak menjamin rating yang baik. Oleh karena itu, diperlukan pendekatan berbasis data (data-driven) menggunakan Machine Learning untuk memodelkan pola penilaian penonton yang rumit ini.
|
||||
|
||||
1.2 Rumusan Masalah
|
||||
|
||||
Berdasarkan latar belakang tersebut, rumusan masalah dalam penelitian ini adalah:
|
||||
|
||||
1. Bagaimana karakteristik fitur-fitur yang mempengaruhi penilaian penonton terhadap sebuah film?
|
||||
2. Algoritma Machine Learning manakah yang paling akurat dalam memprediksi skor rating film (vote_average)?
|
||||
3. Seberapa signifikan pengaruh interaksi penonton (vote count) dibandingkan popularitas (popularity) terhadap rating akhir?
|
||||
|
||||
1.3 Tujuan Penelitian
|
||||
|
||||
1. Membangun model prediksi rating film menggunakan dataset "Latest 2025 Movies Datasets" yang diperoleh dari situs Kaggle.
|
||||
2. Mengevaluasi dan membandingkan kinerja lima algoritma regresi: Multiple Linear Regression, Polynomial Regression, K-Nearest Neighbors (KNN), Decision Tree, dan Random Forest.
|
||||
3. Menganalisis fitur terpenting (feature importance) yang menjadi penentu utama tinggi rendahnya rating film.
|
||||
|
||||
BAB 2: METODOLOGI PENELITIAN
|
||||
|
||||
2.1 Sumber Data
|
||||
|
||||
Data yang digunakan dalam penelitian ini adalah dataset sekunder "Latest 2025 Movies Datasets". Dataset ini mencakup informasi komprehensif mengenai film-film yang dirilis atau dijadwalkan rilis sekitar tahun 2025, meliputi atribut: Judul, Tanggal Rilis, Bahasa Asli, Popularitas, Jumlah Vote, Rata-rata Vote, dan Sinopsis.
|
||||
|
||||
2.2 Pra-pemrosesan Data (Data Preprocessing)
|
||||
|
||||
Untuk menjamin kualitas model, dilakukan tahapan pembersihan dan penyiapan data sebagai berikut:
|
||||
|
||||
1. Penanganan Nilai Hilang (Missing Values): Baris data yang memiliki nilai kosong pada kolom krusial (release_date, vote_average, popularity, vote_count) dihapus (dropped) untuk mencegah bias pada pelatihan model.
|
||||
2. Rekayasa Fitur (Feature Engineering):
|
||||
|
||||
• Ekstraksi Waktu: Kolom release_date dipecah menjadi fitur numerik release_year (Tahun) dan release_month (Bulan) untuk menangkap potensi pola musiman (misalnya: film liburan akhir tahun mungkin memiliki tren rating berbeda).
|
||||
|
||||
• Encoding Variabel Kategorikal: Kolom original_language yang berisi teks (seperti 'en', 'fr') dikonversi menjadi angka menggunakan teknik Label Encoding. Hal ini mutlak diperlukan karena algoritma regresi matematis hanya dapat memproses input numerik.
|
||||
3. Pemisahan Data (Data Splitting): Dataset dibagi dengan rasio 80:20 (80% Data Latih, 20% Data Uji). Pembagian ini bertujuan untuk menguji keandalan model pada data yang belum pernah dilihat sebelumnya (unseen data).
|
||||
|
||||
BAB 3: PENGEMBANGAN MODEL
|
||||
|
||||
Penelitian ini mengadopsi pendekatan eksperimental dengan membandingkan model parametrik dan non-parametrik:
|
||||
|
||||
1. Random Forest Regressor: Pengembangan dari Decision Tree yang menerapkan konsep Ensemble Learning. Dengan menggabungkan hasil prediksi dari 100 pohon keputusan yang berbeda, model ini bertujuan mengurangi variansi dan risiko overfitting dari pohon tunggal.
|
||||
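2. Polynomial Regression: Percobaan untuk menangkap hubungan non-linear (melengkung) dengan membentuk fitur polinomial berderajat 2 dari fitur asli, lalu melatih Regresi Linear pada fitur hasil perluasan tersebut.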
|
||||
3. K-Nearest Neighbors (KNN): Pendekatan berbasis instansi yang memprediksi rating berdasarkan kemiripan karakteristik dengan film lain ("tetangga terdekat"). Sebelum pemodelan, dilakukan Standard Scaling agar fitur dengan satuan besar (seperti Popularitas) tidak mendominasi perhitungan jarak.
|
||||
4. Decision Tree Regressor: Algoritma berbasis pohon keputusan yang memecah data berdasarkan aturan logika (If-Then). Metode ini sangat cocok untuk menangkap pola non-linear yang terputus-putus.
|
||||
|
||||
|
||||
BAB 4: HASIL DAN PEMBAHASAN
|
||||
|
||||
4.1 Evaluasi Kinerja Model
|
||||
|
||||
Berikut adalah perbandingan performa model berdasarkan pengujian pada Test Set (20% data):
|
||||
| No | Nama Algoritma | R2 Score (Akurasi) | RMSE (Tingkat Error) | Analisis Performa |
|
||||
|----|------------------------ |------------------------ |--------------------|----------------------------------------------------------------------------------|
|
||||
| 1 | Polynomial Regression | ~-0.37 (negatif) | ~2.81 | Gagal. Model mengalami Overfitting ekstrem, berusaha terlalu keras menghafal pola latih hingga gagal total pada data uji. |
|
||||
| 2 | Linear Regression | ~0.10 (10%) | ~2.27 | Buruk. Indikasi Underfitting. Hubungan antara fitur dan rating ternyata tidak bersifat linear sederhana. |
|
||||
| 3 | KNN (15 Neighbors) | ~0.20 (20%) | ~2.14 | Kurang Optimal. Model terlalu sensitif terhadap outlier dan kesulitan menemukan pola general pada dimensi tinggi. |
|
||||
| 4 | Decision Tree | ~0.80 (80%) | ~1.06 | Baik. Mampu memetakan logika rating dengan akurat. |
|
||||
| 5 | Random Forest | ~0.85 (85%) | ~0.91 | Sangat Baik. Metode ensemble terbukti paling stabil dan akurat. |
|
||||
|
||||
4.2 Validasi Model Terbaik (Random Forest)
|
||||
|
||||
Untuk memastikan model Random Forest benar-benar robust, dilakukan uji 5-Fold Cross Validation.
|
||||
|
||||
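• Hasil: Skor R2 rata-rata pada 5 fold adalah 0.7613 dengan standar deviasi ±0.1083. Empat fold berada pada kisaran 0.78–0.84, sedangkan satu fold turun ke 0.55, sehingga performa model secara umum baik tetapi tidak sepenuhnya seragam di semua bagian data.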
|
||||
|
||||
• Interpretasi Error (MAE): Nilai Mean Absolute Error pada data uji adalah sekitar 0.55 poin. Artinya, jika model memprediksi rating 7.0, rating aslinya secara rata-rata meleset sekitar 0.55 poin (misal berada di rentang 6.45 - 7.55), yang masih dapat diterima untuk kebutuhan bisnis.
|
||||
|
||||
BAB 5: ANALISIS FAKTOR PENENTU (INSIGHTS)
|
||||
|
||||
Berdasarkan analisis Feature Importance dari model Random Forest, ditemukan hierarki pengaruh sebagai berikut:
|
||||
|
||||
1. Dominasi Mutlak vote_count (83%): Jumlah partisipasi penonton adalah indikator terkuat kualitas film. Temuan ini mengindikasikan fenomena psikologis massa: film yang berkualitas (rating tinggi) cenderung memotivasi lebih banyak orang untuk memberikan suara (engagement). Sebaliknya, film dengan sedikit vote cenderung memiliki rating yang bias atau rendah.
|
||||
|
||||
2. Peran popularity yang Lemah (sekitar 7%): Popularitas (yang sering kali didorong oleh pemasaran/iklan) ternyata tidak berkorelasi kuat dengan kualitas rating. Film populer bisa saja mendapatkan rating buruk ("viral tapi jelek"), dan film indie yang kurang populer bisa mendapatkan rating sempurna.
|
||||
|
||||
3. Faktor Waktu dan Bahasa (<5%): Tahun rilis, bulan rilis, dan bahasa asal film memiliki dampak yang sangat kecil. Ini menunjukkan bahwa penonton global lebih mementingkan konten film itu sendiri daripada kapan atau dari mana film itu berasal.
|
||||
|
||||
BAB 6: KESIMPULAN DAN REKOMENDASI
|
||||
|
||||
Kesimpulan
|
||||
|
||||
Penelitian ini berhasil membuktikan bahwa pendekatan Machine Learning non-linear, khususnya Random Forest Regressor, sangat efektif untuk memprediksi rating film dengan skor R2 sekitar 0.85 (85%) pada data uji. Karakteristik data penilaian film yang kompleks dan berbasis aturan logika manusia tidak dapat ditangani dengan baik oleh metode statistik linear biasa. Faktor interaksi penonton (vote count) teridentifikasi sebagai kunci utama dalam prediksi ini.
|
||||
Loading…
x
Reference in New Issue
Block a user