"""Forward stepwise selection on the ISLP Hitters data.

The script has two parts:

1. Select predictors of ``Salary`` with scikit-learn's
   ``SequentialFeatureSelector`` on the full data set and print a short
   summary (intercept, coefficients, R-squared) of the refitted model.
2. Re-run forward selection on a training split with mlxtend's
   ``SequentialFeatureSelector``, refit every model size by OLS and record
   its mean squared error on a held-out validation split in ``val_err``.
"""

import numpy as np
import pandas as pd
import statsmodels.api as sm
from ISLP import load_data
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split  # noqa: F401 - imported by the original script, currently unused

# Largest model size explored by both selection runs.
N_FEATURES = 15
# Row positions of the split: [0, N_TRAIN) is training data and
# [N_TRAIN, VAL_END) is validation data (rows 185 to 263, counting from 1).
N_TRAIN = 184
VAL_END = 263

###
# Forward stepwise selection
###

# Load Hitters dataset from ISLP
Hitters = load_data('Hitters')

# Remove missing values
Hitters = Hitters.dropna()

# Create dummy variables for categorical columns
Hitters = pd.get_dummies(Hitters, drop_first=True)

# Separate response (target) and predictors
y = Hitters['Salary']
X = Hitters.drop(columns=['Salary'])

# Define the linear regression model
model = LinearRegression()

# Perform forward stepwise selection using SequentialFeatureSelector.
# No `cv` argument is passed, so the selector's default applies.
sfs = SequentialFeatureSelector(
    model,
    n_features_to_select=N_FEATURES,
    direction='forward',
)

# Fit the selector to the data
sfs.fit(X, y)

# Get the selected features
selected_features = X.columns[sfs.get_support()]

# Fit the model with the selected features
model.fit(X[selected_features], y)

# Coefficients of the selected features
coefficients = pd.DataFrame({
    'Feature': selected_features,
    'Coefficient': model.coef_,
})

# Printing short summary - intercept, coefficients and R^2
print("\nIntercept:")
print(model.intercept_)
print("\nCoefficients:")
print(coefficients)
print("\nR-squared:")
print(model.score(X[selected_features], y))

###
# Validation errors for FSS
###

# Split the data into training and validation sets based on row positions
train_data = Hitters.iloc[:N_TRAIN]
val_data = Hitters.iloc[N_TRAIN:VAL_END]

# Define X and y for both training and validation sets.
# Hitters was already dummy-encoded above and both splits come from the same
# frame, so their columns match; only a cast to float is needed here.
X_train = train_data.drop(columns=['Salary']).astype(float)
y_train = train_data['Salary']
X_val = val_data.drop(columns=['Salary']).astype(float)
y_val = val_data['Salary']

# Ensure target variable is numeric
y_train_np = np.asarray(y_train, dtype=float)
y_val_np = np.asarray(y_val, dtype=float)

# Run forward stepwise selection using mlxtend's SequentialFeatureSelector
model2 = LinearRegression()
sfs2 = SFS(
    model2,
    k_features=N_FEATURES,
    forward=True,
    floating=False,
    scoring='neg_mean_squared_error',
    cv=0,  # No cross-validation
)
sfs2.fit(X_train, y_train)

# Selected subset for each number of features, keyed by model size
subsets = sfs2.subsets_

# Compute validation mean squared errors for each model size
val_err = np.zeros(N_FEATURES)
for size in range(1, N_FEATURES + 1):
    # Feature names chosen for this model size
    feature_names = list(subsets[size]['feature_names'])

    # Add the intercept column to both design matrices.
    # has_constant='add' forces the column in even when a selected feature is
    # constant within a split; the default ('skip') would drop the intercept
    # from that split only and break predict() with a column-count mismatch.
    design_train = sm.add_constant(X_train[feature_names], has_constant='add')
    design_val = sm.add_constant(X_val[feature_names], has_constant='add')

    # Fit OLS on the training split
    ols_fit = sm.OLS(y_train_np, np.asarray(design_train, dtype=float)).fit()

    # Predict on the validation split
    y_pred_val = ols_fit.predict(np.asarray(design_val, dtype=float))

    # Compute MSE for the validation split
    val_err[size - 1] = MSE(y_val_np, y_pred_val)

# Print validation errors for each model size
print(f"Validation Errors for each model size (1 to {N_FEATURES} features):")
print(val_err)
# Report the smallest validation MSE across all model sizes
print("\nMin val_err: ", min(val_err))

##
# PLOT results
##

import matplotlib.pyplot as plt

# 'val_err' holds one validation MSE per model size; entry k belongs to the
# model with k + 1 variables, so the x-axis runs from 1 to len(val_err).
model_sizes = range(1, len(val_err) + 1)

# Model size with the minimum validation error
# (+1 because argmin is 0-based while model sizes start at 1)
min_index = np.argmin(val_err) + 1

# Plot the validation errors
plt.figure(figsize=(8, 5))
plt.plot(model_sizes, val_err, marker='o', linestyle='--', color='black')

# Highlight the minimum MSE with a red vertical line
plt.axvline(x=min_index, color='red', linestyle='-', linewidth=1.5)

# Label the axes
plt.xlabel("# Variables", fontsize=12)
plt.ylabel("Validation MSE", fontsize=12)

# Title for the plot (optional)
plt.title("Validation MSE vs Number of Variables", fontsize=14)

# Show the plot
plt.tight_layout()
plt.show()