This notebook demonstrates how to predict diabetes progression using various machine learning models. We will perform data preprocessing, exploratory data analysis (EDA), and implement three models: Linear Regression, Random Forest Regressor, and Gradient Boosting Regressor. We will compare the models using Mean Squared Error (MSE) and R² score, and visualize key findings.
## Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.datasets import load_diabetes
# Load the scikit-learn diabetes dataset and assemble one DataFrame holding
# the ten standardized features plus the disease-progression target column.
diabetes_data = load_diabetes()
feature_frame = pd.DataFrame(diabetes_data.data, columns=diabetes_data.feature_names)
df = feature_frame.assign(target=diabetes_data.target)
# Peek at the first few rows to sanity-check the load
df.head()
age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.038076 | 0.050680 | 0.061696 | 0.021872 | -0.044223 | -0.034821 | -0.043401 | -0.002592 | 0.019908 | -0.017646 | 151.0 |
1 | -0.001882 | -0.044642 | -0.051474 | -0.026328 | -0.008449 | -0.019163 | 0.074412 | -0.039493 | -0.068330 | -0.092204 | 75.0 |
2 | 0.085299 | 0.050680 | 0.044451 | -0.005671 | -0.045599 | -0.034194 | -0.032356 | -0.002592 | 0.002864 | -0.025930 | 141.0 |
3 | -0.089063 | -0.044642 | -0.011595 | -0.036656 | 0.012191 | 0.024991 | -0.036038 | 0.034309 | 0.022692 | -0.009362 | 206.0 |
4 | 0.005383 | -0.044642 | -0.036385 | 0.021872 | 0.003935 | 0.015596 | 0.008142 | -0.002592 | -0.031991 | -0.046641 | 135.0 |
# Summary statistics for every column; the features are mean-centered and
# scaled by the dataset loader, so only 'target' is on its natural scale.
df.describe()
age | sex | bmi | bp | s1 | s2 | s3 | s4 | s5 | s6 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 4.420000e+02 | 442.000000 |
mean | -3.634285e-16 | 1.308343e-16 | -8.045349e-16 | 1.281655e-16 | -8.835316e-17 | 1.327024e-16 | -4.574646e-16 | 3.777301e-16 | -3.830854e-16 | -3.412882e-16 | 152.133484 |
std | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 4.761905e-02 | 77.093005 |
min | -1.072256e-01 | -4.464164e-02 | -9.027530e-02 | -1.123996e-01 | -1.267807e-01 | -1.156131e-01 | -1.023071e-01 | -7.639450e-02 | -1.260974e-01 | -1.377672e-01 | 25.000000 |
25% | -3.729927e-02 | -4.464164e-02 | -3.422907e-02 | -3.665645e-02 | -3.424784e-02 | -3.035840e-02 | -3.511716e-02 | -3.949338e-02 | -3.324879e-02 | -3.317903e-02 | 87.000000 |
50% | 5.383060e-03 | -4.464164e-02 | -7.283766e-03 | -5.670611e-03 | -4.320866e-03 | -3.819065e-03 | -6.584468e-03 | -2.592262e-03 | -1.947634e-03 | -1.077698e-03 | 140.500000 |
75% | 3.807591e-02 | 5.068012e-02 | 3.124802e-02 | 3.564384e-02 | 2.835801e-02 | 2.984439e-02 | 2.931150e-02 | 3.430886e-02 | 3.243323e-02 | 2.791705e-02 | 211.500000 |
max | 1.107267e-01 | 5.068012e-02 | 1.705552e-01 | 1.320442e-01 | 1.539137e-01 | 1.987880e-01 | 1.811791e-01 | 1.852344e-01 | 1.335990e-01 | 1.356118e-01 | 346.000000 |
# Visualize pairwise correlations between the features and the target.
corr_matrix = df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap of Features')
plt.show()
# Histogram (with KDE overlay) showing how the target values are spread.
sns.histplot(df['target'], kde=True)
plt.title('Distribution of Diabetes Progression')
plt.show()
# Separate the predictors from the response column, then hold out 20% of the
# rows as a test set (fixed seed so every run uses the same split).
predictor_cols = [col for col in df.columns if col != 'target']
X = df[predictor_cols]
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Fit an ordinary least-squares baseline and score it on the held-out set.
lin_reg = LinearRegression().fit(X_train, y_train)
y_pred_lin = lin_reg.predict(X_test)
# Evaluate with MSE (lower is better) and R² (higher is better).
mse_lin = mean_squared_error(y_test, y_pred_lin)
r2_lin = r2_score(y_test, y_pred_lin)
print(f'Linear Regression - MSE: {mse_lin:.2f}, R²: {r2_lin:.2f}')
Linear Regression - MSE: 2900.17, R²: 0.45
# Fit a Random Forest (default hyperparameters, seeded for reproducibility)
# and score it on the same held-out set as the other models.
rf_reg = RandomForestRegressor(random_state=42).fit(X_train, y_train)
y_pred_rf = rf_reg.predict(X_test)
# Same two metrics as the linear baseline for a fair comparison.
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f'Random Forest - MSE: {mse_rf:.2f}, R²: {r2_rf:.2f}')
Random Forest - MSE: 2945.29, R²: 0.44
# Fit a Gradient Boosting regressor (default hyperparameters, seeded) and
# score it on the same held-out set as the other models.
gb_reg = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
y_pred_gb = gb_reg.predict(X_test)
# Same two metrics as the other models for a fair comparison.
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)
print(f'Gradient Boosting - MSE: {mse_gb:.2f}, R²: {r2_gb:.2f}')
Gradient Boosting - MSE: 2906.46, R²: 0.45
# Collect each model's scores so the two metrics can be charted side by side.
models = ['Linear Regression', 'Random Forest', 'Gradient Boosting']
mse_values = [mse_lin, mse_rf, mse_gb]
r2_values = [r2_lin, r2_rf, r2_gb]
# One bar chart per metric, drawn from a (values, title, ylabel) spec so the
# subplot code is not duplicated.
plt.figure(figsize=(15, 5))
panel_specs = [
    (mse_values, 'Mean Squared Error Comparison', 'MSE'),
    (r2_values, 'R² Score Comparison', 'R²'),
]
for panel_idx, (values, title, ylabel) in enumerate(panel_specs, start=1):
    plt.subplot(1, 2, panel_idx)
    sns.barplot(x=models, y=values)
    plt.title(title)
    plt.ylabel(ylabel)
plt.show()