165 lines
4.8 KiB
Python
Executable File
165 lines
4.8 KiB
Python
Executable File
import pandas as pd
|
|
import numpy as np
|
|
from sklearn.linear_model import LinearRegression
|
|
from sklearn.feature_selection import SequentialFeatureSelector
|
|
from ISLP import load_data
|
|
|
|
###
|
|
# Forward stepwise selection
|
|
###
|
|
# Load Hitters dataset from ISLP
|
|
Hitters = load_data('Hitters')
|
|
|
|
# Remove missing values
|
|
Hitters = Hitters.dropna()
|
|
|
|
# Create dummy variables for categorical columns
|
|
Hitters = pd.get_dummies(Hitters, drop_first=True)
|
|
|
|
# Separate response (target) and predictors
|
|
y = Hitters['Salary']
|
|
X = Hitters.drop(columns=['Salary'])
|
|
|
|
# Define the linear regression model
|
|
model = LinearRegression()
|
|
|
|
# Perform forward stepwise selection using SequentialFeatureSelector
|
|
#sfs = SequentialFeatureSelector(model, n_features_to_select=15, direction='forward', cv=5)
|
|
sfs = SequentialFeatureSelector(model, n_features_to_select=15, direction='forward')
|
|
|
|
# Fit the model to the data
|
|
sfs.fit(X, y)
|
|
|
|
# Get the selected features
|
|
selected_features = X.columns[sfs.get_support()]
|
|
|
|
# Fit the model with the selected features
|
|
model.fit(X[selected_features], y)
|
|
|
|
# Coefficients of the selected features
|
|
coefficients = pd.DataFrame({
|
|
'Feature': selected_features,
|
|
'Coefficient': model.coef_
|
|
})
|
|
|
|
# Printing short summary - intercept, coefficients and $R^{2}$
|
|
print("\nIntercept:")
|
|
print(model.intercept_)
|
|
|
|
print("\nCoefficients:")
|
|
print(coefficients)
|
|
|
|
print("\nR-squared:")
|
|
print(model.score(X[selected_features], y))
|
|
|
|
|
|
###
|
|
# Validation errors for FSS
|
|
###
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import mean_squared_error as MSE
|
|
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
|
|
import statsmodels.api as sm
|
|
|
|
# Split the data into training and validation sets based on row indices
|
|
train_data = Hitters.iloc[:184] # First 184 rows for training data
|
|
val_data = Hitters.iloc[184:263] # Rows 185 to 263 for validation data
|
|
|
|
# Define X and y for both training and validation sets
|
|
X_train = train_data.drop(columns=['Salary'])
|
|
y_train = train_data['Salary']
|
|
X_val = val_data.drop(columns=['Salary'])
|
|
y_val = val_data['Salary']
|
|
|
|
# Ensure that all categorical variables are encoded as numeric
|
|
X_train = pd.get_dummies(X_train, drop_first=True).astype(float)
|
|
X_val = pd.get_dummies(X_val, drop_first=True).astype(float)
|
|
|
|
# Align columns of validation set to match training set
|
|
X_val = X_val.reindex(columns=X_train.columns, fill_value=0).astype(float)
|
|
|
|
# Convert validation data to matrix form (for statsmodels)
|
|
val_data = sm.add_constant(X_val)
|
|
|
|
# Ensure target variable is numeric
|
|
y_train_np = np.asarray(y_train).astype(float)
|
|
y_val_np = np.asarray(y_val).astype(float)
|
|
|
|
|
|
# Run forward stepwise selection using sklearn's SequentialFeatureSelector
|
|
model2 = LinearRegression()
|
|
|
|
sfs2 = SFS(model2,
|
|
k_features=15,
|
|
forward=True,
|
|
floating=False,
|
|
scoring='neg_mean_squared_error',
|
|
cv=0) # No cross-validation
|
|
|
|
sfs2.fit(X_train, y_train)
|
|
|
|
# Extract selected features for each number of features (1 to 15)
|
|
#selected_features = list(sfs2.subsets_)
|
|
selected_features = sfs2.subsets_
|
|
|
|
# Compute validation mean squared errors for each model
|
|
val_err = np.zeros(15)
|
|
for i in range(1, 16):
|
|
# Get the selected feature names for this step
|
|
feature_names = selected_features[i]['feature_names']
|
|
|
|
# Select the corresponding features from X_train
|
|
X_train_selected = X_train[list(feature_names)]
|
|
|
|
# Add constant (intercept) term
|
|
X_train_selected = sm.add_constant(X_train_selected).astype(float)
|
|
|
|
# Ensure the selected features are numeric
|
|
X_train_selected_np = np.asarray(X_train_selected).astype(float)
|
|
|
|
# Fit OLS model
|
|
model = sm.OLS(y_train_np, X_train_selected_np).fit()
|
|
|
|
# Predict on validation set
|
|
X_val_selected = val_data[list(feature_names)]
|
|
X_val_selected_np = sm.add_constant(X_val_selected).astype(float) # Ensure numpy array is float
|
|
|
|
y_pred_val = model.predict(X_val_selected_np)
|
|
|
|
# Compute MSE for validation set
|
|
val_err[i - 1] = MSE(y_val_np, y_pred_val)
|
|
|
|
# Print validation errors for each model size
|
|
print("Validation Errors for each model size (1 to 15 features):")
|
|
print(val_err)
|
|
|
|
print("\nMin val_err: ", min(val_err))
|
|
|
|
|
|
##
|
|
# PLOT results
|
|
##
|
|
import matplotlib.pyplot as plt
|
|
# Assuming 'val_err' contains the validation MSE values
|
|
|
|
# Find the index of the minimum validation error
|
|
min_index = np.argmin(val_err) + 1 # +1 because index starts from 0, but variables start from 1
|
|
|
|
# Plot the validation errors
|
|
plt.figure(figsize=(8, 5))
|
|
plt.plot(range(1, 16), val_err, marker='o', linestyle='--', color='black')
|
|
|
|
# Highlight the minimum MSE with a red vertical line
|
|
plt.axvline(x=min_index, color='red', linestyle='-', linewidth=1.5)
|
|
|
|
# Label the axes
|
|
plt.xlabel("# Variables", fontsize=12)
|
|
plt.ylabel("Validation MSE", fontsize=12)
|
|
|
|
# Title for the plot (optional)
|
|
plt.title("Validation MSE vs Number of Variables", fontsize=14)
|
|
|
|
# Show the plot
|
|
plt.tight_layout()
|
|
plt.show()
|