Files

165 lines
4.8 KiB
Python
Executable File

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from ISLP import load_data
###
# Forward stepwise selection
###
# Load Hitters dataset from ISLP
Hitters = load_data('Hitters')
# Remove missing values
Hitters = Hitters.dropna()
# Create dummy variables for categorical columns
Hitters = pd.get_dummies(Hitters, drop_first=True)
# Separate response (target) and predictors
y = Hitters['Salary']
X = Hitters.drop(columns=['Salary'])
# Define the linear regression model
model = LinearRegression()
# Perform forward stepwise selection using SequentialFeatureSelector
#sfs = SequentialFeatureSelector(model, n_features_to_select=15, direction='forward', cv=5)
sfs = SequentialFeatureSelector(model, n_features_to_select=15, direction='forward')
# Fit the model to the data
sfs.fit(X, y)
# Get the selected features
selected_features = X.columns[sfs.get_support()]
# Fit the model with the selected features
model.fit(X[selected_features], y)
# Coefficients of the selected features
coefficients = pd.DataFrame({
'Feature': selected_features,
'Coefficient': model.coef_
})
# Printing short summary - intercept, coefficients and $R^{2}$
print("\nIntercept:")
print(model.intercept_)
print("\nCoefficients:")
print(coefficients)
print("\nR-squared:")
print(model.score(X[selected_features], y))
###
# Validation errors for FSS
###
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import statsmodels.api as sm
# Split the data into training and validation sets based on row indices
train_data = Hitters.iloc[:184] # First 184 rows for training data
val_data = Hitters.iloc[184:263] # Rows 185 to 263 for validation data
# Define X and y for both training and validation sets
X_train = train_data.drop(columns=['Salary'])
y_train = train_data['Salary']
X_val = val_data.drop(columns=['Salary'])
y_val = val_data['Salary']
# Ensure that all categorical variables are encoded as numeric
X_train = pd.get_dummies(X_train, drop_first=True).astype(float)
X_val = pd.get_dummies(X_val, drop_first=True).astype(float)
# Align columns of validation set to match training set
X_val = X_val.reindex(columns=X_train.columns, fill_value=0).astype(float)
# Convert validation data to matrix form (for statsmodels)
val_data = sm.add_constant(X_val)
# Ensure target variable is numeric
y_train_np = np.asarray(y_train).astype(float)
y_val_np = np.asarray(y_val).astype(float)
# Run forward stepwise selection using sklearn's SequentialFeatureSelector
model2 = LinearRegression()
sfs2 = SFS(model2,
k_features=15,
forward=True,
floating=False,
scoring='neg_mean_squared_error',
cv=0) # No cross-validation
sfs2.fit(X_train, y_train)
# Extract selected features for each number of features (1 to 15)
#selected_features = list(sfs2.subsets_)
selected_features = sfs2.subsets_
# Compute validation mean squared errors for each model
val_err = np.zeros(15)
for i in range(1, 16):
# Get the selected feature names for this step
feature_names = selected_features[i]['feature_names']
# Select the corresponding features from X_train
X_train_selected = X_train[list(feature_names)]
# Add constant (intercept) term
X_train_selected = sm.add_constant(X_train_selected).astype(float)
# Ensure the selected features are numeric
X_train_selected_np = np.asarray(X_train_selected).astype(float)
# Fit OLS model
model = sm.OLS(y_train_np, X_train_selected_np).fit()
# Predict on validation set
X_val_selected = val_data[list(feature_names)]
X_val_selected_np = sm.add_constant(X_val_selected).astype(float) # Ensure numpy array is float
y_pred_val = model.predict(X_val_selected_np)
# Compute MSE for validation set
val_err[i - 1] = MSE(y_val_np, y_pred_val)
# Print validation errors for each model size
print("Validation Errors for each model size (1 to 15 features):")
print(val_err)
print("\nMin val_err: ", min(val_err))
##
# PLOT results
##
import matplotlib.pyplot as plt
# Assuming 'val_err' contains the validation MSE values
# Find the index of the minimum validation error
min_index = np.argmin(val_err) + 1 # +1 because index starts from 0, but variables start from 1
# Plot the validation errors
plt.figure(figsize=(8, 5))
plt.plot(range(1, 16), val_err, marker='o', linestyle='--', color='black')
# Highlight the minimum MSE with a red vertical line
plt.axvline(x=min_index, color='red', linestyle='-', linewidth=1.5)
# Label the axes
plt.xlabel("# Variables", fontsize=12)
plt.ylabel("Validation MSE", fontsize=12)
# Title for the plot (optional)
plt.title("Validation MSE vs Number of Variables", fontsize=14)
# Show the plot
plt.tight_layout()
plt.show()