import numpy as np
import pandas as pd
from ISLP import load_data
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Load and preprocess data
Carseats = load_data('Carseats').dropna()

# Create qualitative response "High": 'Yes' if Sales > 8, else 'No'
Carseats['High'] = np.where(Carseats['Sales'] <= 8, 'No', 'Yes')
Carseats['High'] = Carseats['High'].astype('category')

# Drop 'Sales' from the predictors and encode categorical predictors as dummy variables
X = Carseats.drop(columns=['Sales', 'High'])
X = pd.get_dummies(X, drop_first=True)
y = Carseats['High']

# Train/test split: 200 observations for training, the remaining 200 for testing
np.random.seed(2)
train_idx = np.random.choice(len(Carseats), size=200, replace=False)
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.drop(index=train_idx), y.drop(index=train_idx)

# Fit a classification tree using the entropy criterion
tree_model = DecisionTreeClassifier(criterion='entropy', random_state=2)
tree_model.fit(X_train, y_train)

# Summary
print(f"Tree depth: {tree_model.get_depth()}, Terminal nodes: {tree_model.get_n_leaves()}")

# Plot the unpruned tree
plt.figure(figsize=(16, 8))
plot_tree(tree_model, filled=True, feature_names=X.columns,
          class_names=tree_model.classes_, fontsize=8)
plt.title("Classification Tree")
plt.show()

# Test error rate of the unpruned tree
y_pred = tree_model.predict(X_test)
error_rate_test = np.mean(y_pred != y_test)
print(f"Test Error (Unpruned Tree): {error_rate_test:.3f}")

# Cross-validation over the cost-complexity pruning path
path = tree_model.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas[:-1]  # exclude the last alpha (trivial single-node tree)
cv_errors = []
for alpha in ccp_alphas:
    clf = DecisionTreeClassifier(criterion='entropy', random_state=2, ccp_alpha=alpha)
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    cv_errors.append(1 - scores.mean())

# Plot CV error against tree complexity
plt.figure(figsize=(8, 5))
plt.plot(ccp_alphas, cv_errors, marker='o')
plt.xlabel("ccp_alpha")
plt.ylabel("Cross-Validated Classification Error")
plt.title("CV Error vs. Tree Complexity")
plt.show()

# Prune the tree using the alpha that minimizes CV error
optimal_alpha = ccp_alphas[np.argmin(cv_errors)]
pruned_tree = DecisionTreeClassifier(criterion='entropy', random_state=2,
                                     ccp_alpha=optimal_alpha)
pruned_tree.fit(X_train, y_train)

# Plot the pruned tree
plt.figure(figsize=(16, 8))
plot_tree(pruned_tree, filled=True, feature_names=X.columns,
          class_names=pruned_tree.classes_, fontsize=8)
plt.title("Pruned Classification Tree")
plt.show()

# Test error of the pruned tree
y_pred_pruned = pruned_tree.predict(X_test)
error_rate_pruned = np.mean(y_pred_pruned != y_test)
print(f"Test Error (Pruned Tree): {error_rate_pruned:.3f}")

# Fit a random forest: 500 trees, 3 candidate features considered at each split
rf_model = RandomForestClassifier(n_estimators=500, max_features=3,
                                  oob_score=True, random_state=2)
rf_model.fit(X_train, y_train)

# Out-of-bag error: oob_score_ is the OOB accuracy, so the error is its complement
oob_error = 1 - rf_model.oob_score_
print(f"OOB Error Rate: {oob_error:.3f}")

# Test error of the random forest
rf_pred = rf_model.predict(X_test)
error_rate_rf = np.mean(rf_pred != y_test)
print(f"Test Error (Random Forest): {error_rate_rf:.3f}")

# Variable importance plot
importances = pd.Series(rf_model.feature_importances_, index=X.columns)
importances.sort_values(ascending=True).plot(kind='barh', figsize=(10, 8),
                                             title="Variable Importance")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()