major upload of (python) course material & solutions
@@ -0,0 +1,101 @@
import numpy as np
import pandas as pd
from ISLP import load_data
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Load and preprocess data
Carseats = load_data('Carseats').dropna()

# Create qualitative variable "High" vs "Low" Sales
Carseats['High'] = np.where(Carseats['Sales'] <= 8, 'No', 'Yes')
Carseats['High'] = Carseats['High'].astype('category')

# Drop 'Sales' from predictors
X = Carseats.drop(columns=['Sales', 'High'])
X = pd.get_dummies(X, drop_first=True)  # Convert categorical to dummy variables
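# (Note: drop_first=True mirrors the regression convention; trees are not
#  hurt by the collinearity of a full dummy set, so keeping all levels
#  would also be fine here.)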
y = Carseats['High']

# Train/test split (200 obs each)
np.random.seed(2)
train_idx = np.random.choice(len(Carseats), size=200, replace=False)
X_train = X.iloc[train_idx]
X_test = X.drop(index=train_idx)
y_train = y.iloc[train_idx]
y_test = y.drop(index=train_idx)
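
# Aside (a sketch, not executed here): the already-imported train_test_split
# would produce an equivalent 200/200 split with the index handling built in:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, train_size=200, random_state=2)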

# Fit classification tree
tree_model = DecisionTreeClassifier(criterion='entropy', random_state=2)
tree_model.fit(X_train, y_train)

# Summary
print(f"Tree depth: {tree_model.get_depth()}, Terminal nodes: {tree_model.get_n_leaves()}")

# Plot tree
plt.figure(figsize=(16, 8))
plot_tree(tree_model, filled=True, feature_names=X.columns,
          class_names=tree_model.classes_, fontsize=8)
plt.title("Classification Tree")
plt.show()

# Test error rate
y_pred = tree_model.predict(X_test)
error_rate_test = np.mean(y_pred != y_test)
print(f"Test Error (Unpruned Tree): {error_rate_test:.3f}")

# Cross-validation to choose the cost-complexity pruning parameter
path = tree_model.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas[:-1]  # exclude the last alpha, which prunes to the root
cv_errors = []

for alpha in ccp_alphas:
    clf = DecisionTreeClassifier(random_state=2, ccp_alpha=alpha)
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    cv_errors.append(1 - scores.mean())
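
# Alternative (a sketch, not executed here): GridSearchCV runs the same
# alpha search in one call:
# from sklearn.model_selection import GridSearchCV
# grid = GridSearchCV(DecisionTreeClassifier(random_state=2),
#                     param_grid={'ccp_alpha': ccp_alphas},
#                     cv=5, scoring='accuracy')
# grid.fit(X_train, y_train)  # grid.best_params_['ccp_alpha'] is the winner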

# Plot CV errors
plt.figure(figsize=(8, 5))
plt.plot(ccp_alphas, cv_errors, marker='o')
plt.xlabel("ccp_alpha")
plt.ylabel("Cross-Validated Classification Error")
plt.title("CV Error vs. Tree Complexity")
plt.show()

# Prune tree with the alpha that minimizes CV error
optimal_alpha = ccp_alphas[np.argmin(cv_errors)]
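print(f"Selected ccp_alpha: {optimal_alpha:.4f}")  # added so the chosen alpha is visible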
pruned_tree = DecisionTreeClassifier(random_state=2, ccp_alpha=optimal_alpha)
pruned_tree.fit(X_train, y_train)

# Plot pruned tree
plt.figure(figsize=(16, 8))
plot_tree(pruned_tree, filled=True, feature_names=X.columns,
          class_names=pruned_tree.classes_, fontsize=8)
plt.title("Pruned Classification Tree")
plt.show()

# Test error of pruned tree
y_pred_pruned = pruned_tree.predict(X_test)
error_rate_pruned = np.mean(y_pred_pruned != y_test)
print(f"Test Error (Pruned Tree): {error_rate_pruned:.3f}")

# Fit Random Forest
rf_model = RandomForestClassifier(n_estimators=500, max_features=3,
                                  oob_score=True, random_state=2)
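# (max_features=3 is roughly sqrt(p) for the ~11 dummy-encoded predictors,
#  the conventional choice for classification forests.)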
rf_model.fit(X_train, y_train)

# OOB error: each tree never sees the observations left out of its bootstrap
# sample, so they give a built-in validation estimate (oob_score_ is OOB accuracy)
oob_error = 1 - rf_model.oob_score_
print(f"OOB Error Rate: {oob_error:.3f}")

# Test error of RF
rf_pred = rf_model.predict(X_test)
error_rate_rf = np.mean(rf_pred != y_test)
print(f"Test Error (Random Forest): {error_rate_rf:.3f}")

# Feature importance
importances = pd.Series(rf_model.feature_importances_, index=X.columns)
importances.sort_values(ascending=True).plot(kind='barh', figsize=(10, 8),
                                             title="Variable Importance")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()
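
# Alternative (a sketch, not executed here): permutation importance on the
# held-out set is less biased toward high-cardinality features:
# from sklearn.inspection import permutation_importance
# perm = permutation_importance(rf_model, X_test, y_test,
#                               n_repeats=10, random_state=2)
# pd.Series(perm.importances_mean, index=X.columns).sort_values()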