econometrics-and-machine-le…/Machine Learning for Economics and Finance/04_Subset Selection & Shrinkage/Script04_codes_FSS.py

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from ISLP import load_data

###
# Forward stepwise selection
###
# Hitters data from ISLP: drop rows with missing values, then dummy-encode
# the categorical columns (the first level of each is dropped)
Hitters = pd.get_dummies(load_data('Hitters').dropna(), drop_first=True)

# Salary is the response; every other column is a candidate predictor
y = Hitters['Salary']
X = Hitters.drop(columns=['Salary'])

# Base estimator handed to the selector and refit below on the chosen columns
model = LinearRegression()

# Forward search up to 15 predictors; `cv` is left at the selector's default
# (5-fold cross-validation per the scikit-learn documentation)
sfs = SequentialFeatureSelector(
    model,
    n_features_to_select=15,
    direction='forward',
)
sfs.fit(X, y)

# Column names flagged by the selector's boolean support mask
selected_features = X.columns[sfs.get_support()]

# Refit the linear model on the selected columns only
X_selected = X[selected_features]
model.fit(X_selected, y)

# One row per selected feature with its fitted coefficient
coefficients = pd.DataFrame({
    'Feature': selected_features,
    'Coefficient': model.coef_,
})

# Short summary - intercept, coefficients and $R^{2}$ on the fitting data
print("\nIntercept:", model.intercept_, sep="\n")
print("\nCoefficients:", coefficients, sep="\n")
print("\nR-squared:", model.score(X_selected, y), sep="\n")


###
# Validation errors for FSS
###
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import statsmodels.api as sm

# Split the data into training and validation sets by row position.
# NOTE: this is a positional split (no shuffling), so the result depends on
# the row order of Hitters.
train_data = Hitters.iloc[:184]   # rows 1-184 (1-based) for training
val_data = Hitters.iloc[184:263]  # rows 185-263 (1-based) for validation

# Define X and y for both sets. Hitters was already dummy-encoded when it was
# built, so both splits share the same columns; only a cast to float is needed
# so that statsmodels receives a purely numeric design matrix.
X_train = train_data.drop(columns=['Salary']).astype(float)
y_train = train_data['Salary']
X_val = val_data.drop(columns=['Salary']).astype(float)
y_val = val_data['Salary']

# Numeric response vectors for statsmodels / the MSE computation
y_train_np = np.asarray(y_train, dtype=float)
y_val_np = np.asarray(y_val, dtype=float)

# Largest model size explored by the forward search (single source of truth
# for the selector, the error array and the loop below)
max_features = 15

# Run forward stepwise selection using mlxtend's SequentialFeatureSelector
model2 = LinearRegression()

sfs2 = SFS(model2,
           k_features=max_features,
           forward=True,
           floating=False,
           scoring='neg_mean_squared_error',
           cv=0)  # No cross-validation: subsets are scored on the training data

sfs2.fit(X_train, y_train)

# Selected subset for each model size; indexed below by the number of
# features (1..max_features). Stored under its own name so the sklearn
# results from the first section (`selected_features`, `model`) are not
# overwritten.
subsets = sfs2.subsets_

# Compute the validation mean squared error for each model size
val_err = np.zeros(max_features)
for k in range(1, max_features + 1):
    # Feature names chosen at this step of the forward search
    feature_names = list(subsets[k]['feature_names'])

    # Design matrices with an intercept column. has_constant='add' forces the
    # intercept to be added on both sides: the default ('skip') omits it when
    # a selected column happens to be constant within a split, which would
    # leave the training and validation designs with different widths.
    X_train_design = sm.add_constant(
        X_train[feature_names], has_constant='add'
    ).to_numpy()
    X_val_design = sm.add_constant(
        X_val[feature_names], has_constant='add'
    ).to_numpy()

    # Fit OLS on the training rows and predict the validation rows
    ols_fit = sm.OLS(y_train_np, X_train_design).fit()
    y_pred_val = ols_fit.predict(X_val_design)

    # Validation MSE for the k-feature model
    val_err[k - 1] = MSE(y_val_np, y_pred_val)

# Print validation errors for each model size
print(f"Validation Errors for each model size (1 to {max_features} features):")
print(val_err)

print("\nMin val_err: ", val_err.min())

##
# PLOT results
##
import matplotlib.pyplot as plt
# 'val_err' holds the validation MSE values, one entry per model size
# (entry 0 is the 1-variable model). The x-axis is derived from its length
# so the plot stays consistent if the number of model sizes changes.
model_sizes = range(1, len(val_err) + 1)

# Model size with the minimum validation error
min_index = np.argmin(val_err) + 1  # +1 because argmin is 0-based, sizes start at 1

# Plot the validation errors
plt.figure(figsize=(8, 5))
plt.plot(model_sizes, val_err, marker='o', linestyle='--', color='black')

# Highlight the minimum MSE with a red vertical line
plt.axvline(x=min_index, color='red', linestyle='-', linewidth=1.5)

# Label the axes
plt.xlabel("# Variables", fontsize=12)
plt.ylabel("Validation MSE", fontsize=12)

# Title for the plot (optional)
plt.title("Validation MSE vs Number of Variables", fontsize=14)

# Show the plot
plt.tight_layout()
plt.show()