Major upload of (Python) course material & solutions

This commit is contained in:
2025-12-03 14:39:45 +01:00
parent 52552e20cb
commit e95a0b2ecc
39 changed files with 13598 additions and 0 deletions

@@ -0,0 +1,267 @@
{
"cells": [
{
"cell_type": "raw",
"id": "03c68072-8fd9-4c26-9f8b-e6f6e24fd583",
"metadata": {},
"source": [
"\\vspace{-4cm}\n",
"\\begin{center}\n",
" \\LARGE{Machine Learning for Economics and Finance}\\\\[0.5cm]\n",
" \\Large{\\textbf{01\\_Auto\\_data\\_1}}\\\\[1.0cm]\n",
" \\large{Ole Wilms}\\\\[0.5cm]\n",
" \\large{July 29, 2024}\\\\\n",
"\\end{center}"
]
},
{
"cell_type": "raw",
"id": "4e117807-3711-444b-838d-775303383d93",
"metadata": {},
"source": [
"\\setcounter{secnumdepth}{0}"
]
},
{
"cell_type": "markdown",
"id": "e0c43ee7-0ede-4d7e-9966-f00493b33f0a",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"**Get and Set working directory**:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "57ab9acc-8d99-4165-8930-db6ae2be39a9",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/mnt/ds/home/UHH_MLSJ_2024/Code/Python/01_SupLearn_Regression\n"
]
}
],
"source": [
"import os # Package to access system related information \n",
"print(os.getcwd()) # Prints the current working directory\n",
"path = os.getcwd()\n",
"os.chdir(path) # Set the working directory"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4424246b-bee1-4b9e-a5ac-79c20e4b4c26",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>mpg</th>\n",
" <th>cylinders</th>\n",
" <th>displacement</th>\n",
" <th>horsepower</th>\n",
" <th>weight</th>\n",
" <th>acceleration</th>\n",
" <th>year</th>\n",
" <th>origin</th>\n",
" <th>name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>18.0</td>\n",
" <td>8</td>\n",
" <td>307.0</td>\n",
" <td>130</td>\n",
" <td>3504</td>\n",
" <td>12.0</td>\n",
" <td>70</td>\n",
" <td>1</td>\n",
" <td>chevrolet chevelle malibu</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>15.0</td>\n",
" <td>8</td>\n",
" <td>350.0</td>\n",
" <td>165</td>\n",
" <td>3693</td>\n",
" <td>11.5</td>\n",
" <td>70</td>\n",
" <td>1</td>\n",
" <td>buick skylark 320</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>18.0</td>\n",
" <td>8</td>\n",
" <td>318.0</td>\n",
" <td>150</td>\n",
" <td>3436</td>\n",
" <td>11.0</td>\n",
" <td>70</td>\n",
" <td>1</td>\n",
" <td>plymouth satellite</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>16.0</td>\n",
" <td>8</td>\n",
" <td>304.0</td>\n",
" <td>150</td>\n",
" <td>3433</td>\n",
" <td>12.0</td>\n",
" <td>70</td>\n",
" <td>1</td>\n",
" <td>amc rebel sst</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>17.0</td>\n",
" <td>8</td>\n",
" <td>302.0</td>\n",
" <td>140</td>\n",
" <td>3449</td>\n",
" <td>10.5</td>\n",
" <td>70</td>\n",
" <td>1</td>\n",
" <td>ford torino</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" mpg cylinders displacement horsepower weight acceleration year \\\n",
"0 18.0 8 307.0 130 3504 12.0 70 \n",
"1 15.0 8 350.0 165 3693 11.5 70 \n",
"2 18.0 8 318.0 150 3436 11.0 70 \n",
"3 16.0 8 304.0 150 3433 12.0 70 \n",
"4 17.0 8 302.0 140 3449 10.5 70 \n",
"\n",
" origin name \n",
"0 1 chevrolet chevelle malibu \n",
"1 1 buick skylark 320 \n",
"2 1 plymouth satellite \n",
"3 1 amc rebel sst \n",
"4 1 ford torino "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from ISLP import load_data # Package which contains the data\n",
"Auto = load_data('Auto') # Loading the data\n",
"Auto.head() # Showing the first 5 Lines of Data."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "141f0257-39a4-4e21-9be0-78dfd645445a",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import statsmodels.formula.api as smf\n",
"\n",
"# fit model on training data and calculate training MSE\n",
"fit_lm = smf.ols(formula='mpg ~ horsepower', data = Auto).fit()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4ae9bf59-3b73-4020-b039-885948f6cbbd",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"==============================================================================\n",
" coef std err t P>|t| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"Intercept 39.9359 0.717 55.660 0.000 38.525 41.347\n",
"horsepower -0.1578 0.006 -24.489 0.000 -0.171 -0.145\n",
"==============================================================================\n"
]
}
],
"source": [
"print(fit_lm.summary().tables[1])"
]
},
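{
"cell_type": "code",
"execution_count": null,
"id": "sketch-training-mse",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch (not part of the original notebook) that computes the\n",
"# training MSE promised in the comment above, from the OLS residuals:\n",
"import numpy as np\n",
"\n",
"mse_train = np.mean(fit_lm.resid ** 2)  # mean of the squared in-sample residuals\n",
"print(f'Training MSE: {mse_train:.4f}')"
]
},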
{
"cell_type": "code",
"execution_count": null,
"id": "77170717-6eb1-41fa-a2b6-fdbf5e9193cf",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"date": " ",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
},
"title": " ",
"toc-autonumbering": false,
"toc-showcode": false,
"toc-showmarkdowntxt": false,
"toc-showtags": false
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -0,0 +1,184 @@
{
"cells": [
{
"cell_type": "raw",
"id": "6cbef61b-0897-42bf-b456-c0a409b87c41",
"metadata": {},
"source": [
"\\vspace{-4cm}\n",
"\\begin{center}\n",
" \\LARGE{Machine Learning for Economics and Finance}\\\\\n",
" \\Large{Task 1: Logistic Regressions}\\\\[0.5cm]\n",
" \\Large{\\textbf{02\\_Default\\_data}}\\\\[1.0cm]\n",
" \\large{Ole Wilms}\\\\[0.5cm]\n",
" \\large{July 29, 2024}\\\\\n",
"\\end{center}"
]
},
{
"cell_type": "raw",
"id": "13be77f3-44f0-4983-b4cb-bd3e4b5dba8b",
"metadata": {},
"source": [
"\\setcounter{secnumdepth}{0}"
]
},
{
"cell_type": "markdown",
"id": "72f918a4-cdd4-4b46-a88f-f4b43c3c3a88",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"## Task 1: Logistic Regressions"
]
},
{
"cell_type": "markdown",
"id": "0b3f9fc6-db4f-47b0-9dfa-e41d9f85a5ba",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"1.1 Randomly split the data into $7000$ observations for training and $3000$ observations for testing and set the seed to $1$ before sampling the data. Call these two datasets *train_data* and *test_data* respectively. (Hint: use the code to split the data from 01 Auto_data_2.R or Auto_data_2.Rmd)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "335aa198-5a94-4c5a-8ad8-67c78bcf71f5",
"metadata": {},
"outputs": [],
"source": []
},
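{
"cell_type": "code",
"execution_count": null,
"id": "sketch-task-1-1-split",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch for Task 1.1 (not the official solution), mirroring the\n",
"# permutation-based split used in the 03_Default_data solutions:\n",
"import numpy as np\n",
"from ISLP import load_data\n",
"\n",
"default_data = load_data('Default')\n",
"np.random.seed(1)                              # seed required by the task\n",
"idx = np.random.permutation(len(default_data)) # shuffle the 10000 rows\n",
"train_data = default_data.iloc[idx[:7000]]     # 7000 observations for training\n",
"test_data = default_data.iloc[idx[7000:]]      # remaining 3000 for testing"
]
},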
{
"cell_type": "markdown",
"id": "116c466d-0627-43d6-adbe-a937ac846a28",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"1.2 Fit a logistic regression of default on *income* using the *train_data*. Analyze the significance of\n",
"the estimated coefficients."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2e38a201-7f2d-4999-beab-5739217a9318",
"metadata": {},
"outputs": [],
"source": []
},
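{
"cell_type": "code",
"execution_count": null,
"id": "sketch-task-1-2-logit",
"metadata": {},
"outputs": [],
"source": [
"# Sketch for Task 1.2 (assumes train_data from Task 1.1): logistic regression of\n",
"# default on income via statsmodels GLM, with default recoded to 0/1 first.\n",
"import statsmodels.api as sm\n",
"\n",
"y_train = (train_data['default'] == 'Yes').astype(int)\n",
"X_train = sm.add_constant(train_data[['income']])  # add an intercept term\n",
"glm_income = sm.GLM(y_train, X_train, family=sm.families.Binomial()).fit()\n",
"print(glm_income.summary().tables[1])              # coefficients with p-values"
]
},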
{
"cell_type": "markdown",
"id": "43c6dade-5a22-476a-b3bf-bfd1b880038d",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"1.3 Compute the *out-of-sample accuracy* and *error rate* and compare to the *in-sample statistics*. Do\n",
"you think this is a good model to predict default?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "44028726-1eff-436f-bc47-04a6786ae3ad",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "c28971ef-8bee-462d-9612-88f1534bfcb5",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"1.4 Add balance as a predictor and compute the *out-of-sample error rate* and *accuracy*. Do you\n",
"think this is a good model to predict *default*?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a7216df-adf5-4df0-9593-69c1a7649f64",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "f267ef66-1775-42a8-a1e9-45fda849f4d9",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"1.5 Compare the results for Task $1.4$ to a model with only balance as a predictor. Which model\n",
"would you choose?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "28082bd5-8fe1-4160-aec0-1a92aebfa671",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "7ccad70f-5ef5-42c8-8c2e-22e76943d281",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"1.6 Take the model from Task $1.4$ but now re-estimate the model using different *seeds* to draw your\n",
"*training* and *test data*. Does your *test error rate* change with the seed? Whats going on here?"
]
},
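{
"cell_type": "code",
"execution_count": null,
"id": "sketch-task-1-6-seeds",
"metadata": {},
"outputs": [],
"source": [
"# Sketch for Task 1.6 (assumes default_data as loaded in Task 1.1): re-draw the\n",
"# train/test split for several seeds and recompute the test error of the\n",
"# income + balance model from Task 1.4.\n",
"import numpy as np\n",
"import statsmodels.api as sm\n",
"\n",
"for seed in [1, 2, 3, 4, 5]:\n",
"    np.random.seed(seed)\n",
"    idx = np.random.permutation(len(default_data))\n",
"    tr, te = default_data.iloc[idx[:7000]], default_data.iloc[idx[7000:]]\n",
"    fit = sm.GLM((tr['default'] == 'Yes').astype(int),\n",
"                 sm.add_constant(tr[['income', 'balance']]),\n",
"                 family=sm.families.Binomial()).fit()\n",
"    pred = (fit.predict(sm.add_constant(te[['income', 'balance']])) > 0.5).astype(int)\n",
"    err = np.mean(pred != (te['default'] == 'Yes').astype(int))\n",
"    print(f'seed {seed}: test error rate {err:.5f}')"
]
},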
{
"cell_type": "code",
"execution_count": null,
"id": "9ab2f559-83b1-4a66-b1dc-8799b8301d85",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"date": " ",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.1"
},
"title": " ",
"toc-autonumbering": false,
"toc-showcode": false,
"toc-showmarkdowntxt": false,
"toc-showtags": false
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -0,0 +1,465 @@
{
"cells": [
{
"cell_type": "raw",
"id": "6cbef61b-0897-42bf-b456-c0a409b87c41",
"metadata": {},
"source": [
"\\vspace{-4cm}\n",
"\\begin{center}\n",
" \\LARGE{Machine Learning for Economics and Finance}\\\\[0.5cm]\n",
" \\Large{\\textbf{03\\_Default\\_data}}\\\\[1.0cm]\n",
" \\large{Ole Wilms}\\\\[0.5cm]\n",
" \\large{July 29, 2024}\\\\\n",
"\\end{center}"
]
},
{
"cell_type": "raw",
"id": "13be77f3-44f0-4983-b4cb-bd3e4b5dba8b",
"metadata": {},
"source": [
"\\setcounter{secnumdepth}{0}"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "335aa198-5a94-4c5a-8ad8-67c78bcf71f5",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/mnt/ds/home/UHH_MLSJ_2024/Code/Python/03-CrossValidation\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>default</th>\n",
" <th>student</th>\n",
" <th>balance</th>\n",
" <th>income</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>729.526495</td>\n",
" <td>44361.625074</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>817.180407</td>\n",
" <td>12106.134700</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>1073.549164</td>\n",
" <td>31767.138947</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>529.250605</td>\n",
" <td>35704.493935</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>785.655883</td>\n",
" <td>38463.495879</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" default student balance income\n",
"0 No No 729.526495 44361.625074\n",
"1 No Yes 817.180407 12106.134700\n",
"2 No No 1073.549164 31767.138947\n",
"3 No No 529.250605 35704.493935\n",
"4 No No 785.655883 38463.495879"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os # Package to access system related information \n",
"print(os.getcwd()) # Prints the current working directory\n",
"path = os.getcwd()\n",
"os.chdir(path) # Set the working directory\n",
"\n",
"from ISLP import load_data # Package which contains the data\n",
"default_data = load_data('Default') # Loading the data\n",
"default_data.head() # Showing the first 5 Lines of Data."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "2e38a201-7f2d-4999-beab-5739217a9318",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 10000 entries, 0 to 9999\n",
"Data columns (total 4 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 default 10000 non-null object \n",
" 1 student 10000 non-null object \n",
" 2 balance 10000 non-null float64\n",
" 3 income 10000 non-null float64\n",
"dtypes: float64(2), object(2)\n",
"memory usage: 312.6+ KB\n",
"None\n"
]
}
],
"source": [
"print(default_data.info())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7dd29324-cd54-415c-ba83-56c0d9f74159",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" balance income\n",
"count 10000.000000 10000.000000\n",
"mean 835.374886 33516.981876\n",
"std 483.714985 13336.639563\n",
"min 0.000000 771.967729\n",
"25% 481.731105 21340.462903\n",
"50% 823.636973 34552.644802\n",
"75% 1166.308386 43807.729272\n",
"max 2654.322576 73554.233495\n"
]
}
],
"source": [
"print(default_data.describe())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3debf6d8-efda-4414-bcca-dd758dc65512",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# set seed\n",
"np.random.seed(1)\n",
"\n",
"# Number of observations in the dataset\n",
"n = len(default_data)\n",
"\n",
"# Shuffle the dataset using np.random.permutation\n",
"shuffled_indices = np.random.permutation(n)\n",
"\n",
"# Compute training and validation sample sizes\n",
"nT = int(0.7 * n) # Training sample size\n",
"\n",
"# Split the shuffled dataset based on the shuffled indices\n",
"train_data = default_data.iloc[shuffled_indices[:nT]] # First 70% for training\n",
"test_data = default_data.iloc[shuffled_indices[nT:]] # Remaining 30% for validation"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e1b2a560-2a8e-4881-8d51-f3d96c3b05fe",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train data percentage of defaulting: 0.03157\n",
"Test data percentage of defaulting: 0.03733\n"
]
}
],
"source": [
"defaulting_train = (train_data['default'] == 'Yes').mean()\n",
"defaulting_test = (test_data['default'] == 'Yes').mean()\n",
"# The \"train_data$default == \"Yes\": creates a logical vector where each element is TRUE \n",
"# if the corresponding element.\n",
"# The outer mean() function than calculates the proportion of TRUE values \n",
"# in the logical vector.\n",
"\n",
"# Output the results\n",
"print(f\"Train data percentage of defaulting: {round(defaulting_train, 5)}\")\n",
"print(f\"Test data percentage of defaulting: {round(defaulting_test, 5)}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f9a25057-a631-48dc-883f-643bd09d0999",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: default No. Observations: 7000\n",
"Model: GLM Df Residuals: 6997\n",
"Model Family: Binomial Df Model: 2\n",
"Link Function: Logit Scale: 1.0000\n",
"Method: IRLS Log-Likelihood: -542.14\n",
"Date: Sat, 19 Oct 2024 Deviance: 1084.3\n",
"Time: 16:53:00 Pearson chi2: 5.42e+03\n",
"No. Iterations: 9 Pseudo R-squ. (CS): 0.1179\n",
"Covariance Type: nonrobust \n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -11.3514 0.515 -22.060 0.000 -12.360 -10.343\n",
"income 1.847e-05 5.98e-06 3.091 0.002 6.76e-06 3.02e-05\n",
"balance 0.0055 0.000 20.428 0.000 0.005 0.006\n",
"==============================================================================\n"
]
}
],
"source": [
"import statsmodels.api as sm\n",
"\n",
"train_data_copy = train_data.copy()\n",
"train_data_copy['default'] = train_data_copy['default'].map({'No': 0, 'Yes': 1})\n",
"\n",
"test_data_copy = test_data.copy()\n",
"test_data_copy['default'] = test_data_copy['default'].map({'No': 0, 'Yes': 1})\n",
"\n",
"# Logistic regression model:\n",
"X_train = train_data_copy[['income','balance']]\n",
"X_train = sm.add_constant(X_train) # Adds an intercept term to the model\n",
"X_test = test_data_copy[['income','balance']]\n",
"X_test = sm.add_constant(X_test) # Adds an intercept term to the model\n",
"y_train = train_data_copy['default']\n",
"\n",
"# Fit the logistic regression model\n",
"glm_fit = sm.GLM(y_train, X_train, family=sm.families.Binomial()).fit()\n",
"print(glm_fit.summary())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "b5c7de71-463d-455b-a596-923cfcddcefb",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"const -11.351394\n",
"income 0.000018\n",
"balance 0.005536\n",
"dtype: float64\n"
]
}
],
"source": [
"print(glm_fit.params) # print coefficients"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6b8fb99c-d172-4398-92e5-89324c1787f8",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"K-fold Cross-Validation Error Rate: 0.02571\n"
]
}
],
"source": [
"from sklearn.metrics import accuracy_score\n",
"from sklearn.model_selection import KFold\n",
"\n",
"# ---- K-Fold Cross-Validation ----\n",
"folds = 10\n",
"kf = KFold(n_splits=folds, shuffle=True, random_state=12)\n",
"cv_errors = []\n",
"\n",
"for train_index, test_index in kf.split(X_train):\n",
" X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]\n",
" y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]\n",
" \n",
" # Fit model on this fold\n",
" glm_fold = sm.GLM(y_train_fold, X_train_fold, family=sm.families.Binomial()).fit()\n",
" \n",
" # Compute the out-of-sample error for this fold\n",
" preds_fold = glm_fold.predict(X_test_fold)\n",
" pred_labels_fold = [1 if p > 0.5 else 0 for p in preds_fold]\n",
" fold_error = np.mean(pred_labels_fold != y_test_fold)\n",
" \n",
" cv_errors.append(fold_error)\n",
"\n",
"cv_error_rate = np.mean(cv_errors)\n",
"print(f\"K-fold Cross-Validation Error Rate: {cv_error_rate:.5f}\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "06091455-d874-4a10-9919-78c8c9ddfbed",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In-sample accuracy: 0.97486\n",
"In-sample error rate: 0.02514\n"
]
}
],
"source": [
"# ---- In-sample predictions ----\n",
"glm_probs_train = glm_fit.predict(X_train)\n",
"glm_pred_train = np.where(glm_probs_train > 0.5, 1, 0) # ternary operator\n",
"\n",
"# Compute in-sample accuracy and error rate\n",
"accuracy_train = accuracy_score(y_train, glm_pred_train)\n",
"error_rate_train = np.mean(glm_pred_train != y_train)\n",
"\n",
"print(f\"In-sample accuracy: {round(accuracy_train, 5)}\")\n",
"print(f\"In-sample error rate: {round(error_rate_train, 5)}\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "9d115c02-9520-41d5-b04b-e8cbe84b0277",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# ---- Out-of-sample predictions ----\n",
"glm_probs_test = glm_fit.predict(X_test)\n",
"glm_pred_test = np.where(glm_probs_test > 0.5, 1, 0) # ternary operator"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "80aadaaf-e914-4e70-9ea3-411965a8d9d7",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Out-of-sample accuracy: 0.97067\n",
"Out-of-sample error rate: 0.02933\n"
]
}
],
"source": [
"# Compute out-of-sample accuracy and error rate\n",
"accuracy_test = accuracy_score(test_data_copy['default'], glm_pred_test)\n",
"error_rate_test = np.mean(glm_pred_test != test_data_copy['default'])\n",
"\n",
"print(f\"Out-of-sample accuracy: {round(accuracy_test, 5)}\")\n",
"print(f\"Out-of-sample error rate: {round(error_rate_test, 5)}\")"
]
}
],
"metadata": {
"date": " ",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
},
"title": " ",
"toc-autonumbering": false,
"toc-showcode": false,
"toc-showmarkdowntxt": false,
"toc-showtags": false
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -0,0 +1,196 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "fb0c424f-1667-4fb2-baab-2d88d8abb387",
"metadata": {},
"source": [
"# Preliminary setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "de6396ca-e17d-4c95-8f96-1f78a09e9ce2",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from ISLP import load_data\n",
"from matplotlib.pyplot import subplots, show\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"# Load and preprocess data\n",
"Hitters = load_data('Hitters').dropna()"
]
},
{
"cell_type": "markdown",
"id": "87902d82-5336-456b-bec8-403530c75f00",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"# Task"
]
},
{
"cell_type": "markdown",
"id": "0ce8adda-23e7-498f-9ff3-26c138903b88",
"metadata": {},
"source": [
"1. Use the final model (tuning parameter) obtained from 10-fold CV and fit the model again using the full dataset and display the corresponding coefficients."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac884445-bc95-4659-b656-d9c5f821bf52",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "05635216-4afb-4d0d-982a-a2af35d6bf3a",
"metadata": {},
"source": [
"2. Multiply the feature Errors by $1/1000$ and again fit the model from Task 1. Display the coefficients and interpret. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70bc0da8-6134-4d4d-ad1f-e43ea26fae3c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "b6e19093-51bf-4e68-aba6-01c34905b5e4",
"metadata": {},
"source": [
"3. Redo Task 2 BUT without the normalizing (standardize) the data. Refit the same model again and display the coefficients. Interpret. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5a38add3-642e-41a8-8b80-c3d01a63e538",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "df85262d-8a38-4bf9-9dfa-0a001e117d33",
"metadata": {},
"source": [
"4. Split the dataset into a training set using $80\\%$ of the observations and validation set using all other observations."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0a152a8-395e-49e2-973d-252b88cd379c",
"metadata": {},
"outputs": [],
"source": []
},
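{
"cell_type": "code",
"execution_count": null,
"id": "sketch-task-4-split",
"metadata": {},
"outputs": [],
"source": [
"# Sketch for Task 4 (assumes Hitters from the setup cell; the random_state is an\n",
"# arbitrary choice, not prescribed by the task):\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"X = pd.get_dummies(Hitters.drop(columns=['Salary']), drop_first=True)\n",
"y = Hitters['Salary']\n",
"X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)"
]
},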
{
"cell_type": "markdown",
"id": "e1e3e60e-0d5a-4340-ae29-9153ffdad7c8",
"metadata": {},
"source": [
"5. Set up a grid for the tuning parameter $\\lambda$ and fit Lasso regressions for all tuning parameters using the training data. Make sure that you choose the mininmum and maximum values of $\\lambda$ so that it allows you to determine the optimal $\\lambda$ parameter in the next task (you might need to play with the grid size a bit). "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5e0cff0-6782-40a3-8d7f-891c19bb5f4d",
"metadata": {},
"outputs": [],
"source": []
},
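{
"cell_type": "code",
"execution_count": null,
"id": "sketch-task-5-lasso-grid",
"metadata": {},
"outputs": [],
"source": [
"# Sketch for Task 5: standardize the features and fit one Lasso per lambda on a\n",
"# log-spaced grid. The grid bounds are assumptions; widen them if the minimum in\n",
"# Task 6 ends up at a boundary.\n",
"import numpy as np\n",
"from sklearn.linear_model import Lasso\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"scaler = StandardScaler().fit(X_train)\n",
"X_train_s = scaler.transform(X_train)\n",
"lambdas = 10 ** np.linspace(4, -2, 100)\n",
"lasso_fits = [Lasso(alpha=lam, max_iter=10000).fit(X_train_s, y_train) for lam in lambdas]"
]
},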
{
"cell_type": "markdown",
"id": "21ba53c0-def1-4059-9872-27e6b437b8af",
"metadata": {},
"source": [
"6. For each model (tuning parameter), compute the mean squared prediction error in the validation dataset. Plot the validation error as a function of $\\lambda$ and find the best model which minimizes the validation error. Display the estimated coefficients for the best model and check whether some features are not selected in the final regression. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8323ce02-17fe-4f54-820d-030f198a34fe",
"metadata": {},
"outputs": [],
"source": []
},
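{
"cell_type": "code",
"execution_count": null,
"id": "sketch-task-6-validation",
"metadata": {},
"outputs": [],
"source": [
"# Sketch for Task 6 (assumes lambdas, lasso_fits, scaler, X_val, y_val from above):\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.metrics import mean_squared_error\n",
"\n",
"X_val_s = scaler.transform(X_val)\n",
"val_mse = [mean_squared_error(y_val, m.predict(X_val_s)) for m in lasso_fits]\n",
"best = int(np.argmin(val_mse))\n",
"plt.plot(np.log(lambdas), val_mse)\n",
"plt.xlabel('log(lambda)')\n",
"plt.ylabel('Validation MSE')\n",
"plt.show()\n",
"print('Best lambda:', lambdas[best])\n",
"print(pd.Series(lasso_fits[best].coef_, index=X.columns))  # zeros = dropped features"
]
},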
{
"cell_type": "markdown",
"id": "19f07912-bffd-4a19-9a92-aa1a2dc48c75",
"metadata": {},
"source": [
"7. Finally compare the best Lasso model obtained from the validation set approach from Task 6 to the best Lasso model obtained by 5-fold cross-validation. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e0166113-9d31-4e42-a8df-69f2048b65af",
"metadata": {},
"outputs": [],
"source": []
},
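{
"cell_type": "code",
"execution_count": null,
"id": "sketch-task-7-lassocv",
"metadata": {},
"outputs": [],
"source": [
"# Sketch for Task 7: let LassoCV pick lambda by 5-fold CV on the training data\n",
"# and compare it to the validation-set choice from Task 6:\n",
"from sklearn.linear_model import LassoCV\n",
"\n",
"lasso_cv = LassoCV(cv=5, max_iter=10000).fit(X_train_s, y_train)\n",
"print('5-fold CV lambda:', lasso_cv.alpha_, '| validation-set lambda:', lambdas[best])"
]
},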
{
"cell_type": "markdown",
"id": "8fd306c8-2247-4343-8c30-5dd99393c9d0",
"metadata": {},
"source": [
"8. Compare the best model from Task 7 to the best ridge regression obtained from 5-fold cross validation. How do the coefficients of the two models differ?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c70e9bd-78d9-4a91-a28f-588fca65c616",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"date": " ",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
},
"title": " ",
"toc-autonumbering": false,
"toc-showcode": false,
"toc-showmarkdowntxt": false,
"toc-showtags": false
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -0,0 +1,164 @@
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from ISLP import load_data
###
# Forward stepwise selection
###
# Load Hitters dataset from ISLP
Hitters = load_data('Hitters')
# Remove missing values
Hitters = Hitters.dropna()
# Create dummy variables for categorical columns
Hitters = pd.get_dummies(Hitters, drop_first=True)
# Separate response (target) and predictors
y = Hitters['Salary']
X = Hitters.drop(columns=['Salary'])
# Define the linear regression model
model = LinearRegression()
# Perform forward stepwise selection using SequentialFeatureSelector
#sfs = SequentialFeatureSelector(model, n_features_to_select=15, direction='forward', cv=5)
sfs = SequentialFeatureSelector(model, n_features_to_select=15, direction='forward')
# Fit the model to the data
sfs.fit(X, y)
# Get the selected features
selected_features = X.columns[sfs.get_support()]
# Fit the model with the selected features
model.fit(X[selected_features], y)
# Coefficients of the selected features
coefficients = pd.DataFrame({
'Feature': selected_features,
'Coefficient': model.coef_
})
# Print a short summary: intercept, coefficients and R^2
print("\nIntercept:")
print(model.intercept_)
print("\nCoefficients:")
print(coefficients)
print("\nR-squared:")
print(model.score(X[selected_features], y))
###
# Validation errors for FSS
###
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import statsmodels.api as sm
# Split the data into training and validation sets based on row indices
train_data = Hitters.iloc[:184] # First 184 rows for training data
val_data = Hitters.iloc[184:263] # Rows 185 to 263 for validation data
# Define X and y for both training and validation sets
X_train = train_data.drop(columns=['Salary'])
y_train = train_data['Salary']
X_val = val_data.drop(columns=['Salary'])
y_val = val_data['Salary']
# Ensure that all categorical variables are encoded as numeric
X_train = pd.get_dummies(X_train, drop_first=True).astype(float)
X_val = pd.get_dummies(X_val, drop_first=True).astype(float)
# Align columns of validation set to match training set
X_val = X_val.reindex(columns=X_train.columns, fill_value=0).astype(float)
# Add an intercept column to the validation data (for statsmodels)
val_data = sm.add_constant(X_val)
# Ensure target variable is numeric
y_train_np = np.asarray(y_train).astype(float)
y_val_np = np.asarray(y_val).astype(float)
# Run forward stepwise selection using sklearn's SequentialFeatureSelector
model2 = LinearRegression()
sfs2 = SFS(model2,
k_features=15,
forward=True,
floating=False,
scoring='neg_mean_squared_error',
cv=0) # No cross-validation
sfs2.fit(X_train, y_train)
# Extract selected features for each number of features (1 to 15)
#selected_features = list(sfs2.subsets_)
selected_features = sfs2.subsets_
# Compute validation mean squared errors for each model
val_err = np.zeros(15)
for i in range(1, 16):
# Get the selected feature names for this step
feature_names = selected_features[i]['feature_names']
# Select the corresponding features from X_train
X_train_selected = X_train[list(feature_names)]
# Add constant (intercept) term
X_train_selected = sm.add_constant(X_train_selected).astype(float)
# Ensure the selected features are numeric
X_train_selected_np = np.asarray(X_train_selected).astype(float)
# Fit OLS model
model = sm.OLS(y_train_np, X_train_selected_np).fit()
# Predict on validation set
X_val_selected = val_data[list(feature_names)]
X_val_selected_np = sm.add_constant(X_val_selected).astype(float)  # Add intercept column and cast to float
y_pred_val = model.predict(X_val_selected_np)
# Compute MSE for validation set
val_err[i - 1] = MSE(y_val_np, y_pred_val)
# Print validation errors for each model size
print("Validation Errors for each model size (1 to 15 features):")
print(val_err)
print("\nMin val_err: ", min(val_err))
##
# PLOT results
##
import matplotlib.pyplot as plt
# Assuming 'val_err' contains the validation MSE values
# Find the index of the minimum validation error
min_index = np.argmin(val_err) + 1 # +1 because index starts from 0, but variables start from 1
# Plot the validation errors
plt.figure(figsize=(8, 5))
plt.plot(range(1, 16), val_err, marker='o', linestyle='--', color='black')
# Highlight the minimum MSE with a red vertical line
plt.axvline(x=min_index, color='red', linestyle='-', linewidth=1.5)
# Label the axes
plt.xlabel("# Variables", fontsize=12)
plt.ylabel("Validation MSE", fontsize=12)
# Title for the plot (optional)
plt.title("Validation MSE vs Number of Variables", fontsize=14)
# Show the plot
plt.tight_layout()
plt.show()

@@ -0,0 +1,50 @@
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, RidgeCV, LassoCV
from sklearn.preprocessing import StandardScaler
from ISLP import load_data
# === Setup ===
# Load and preprocess data
Hitters = load_data('Hitters').dropna()
Hitters = pd.get_dummies(Hitters, drop_first=True)
y = Hitters['Salary']
X = Hitters.drop(columns=['Salary'])
# Standardize predictors
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# === SLIDE 1: Ridge regression with fixed lambda ===
ridge_fixed = Ridge(alpha=100)
ridge_fixed.fit(X_scaled, y)
ridge_fixed_coeffs = ridge_fixed.coef_
ridge_fixed_preds = ridge_fixed.predict(X_scaled[:5])
# === SLIDE 2: Ridge regression with cross-validation to find best lambda ===
lambdas = 10**np.linspace(10, -2, 100) * 0.5 # Equivalent to R's lambda grid
ridge_cv = RidgeCV(alphas=lambdas, scoring='neg_mean_squared_error', cv=10)
ridge_cv.fit(X_scaled, y)
best_lambda_ridge = ridge_cv.alpha_
ridge_cv_coeffs = ridge_cv.coef_
ridge_cv_preds = ridge_cv.predict(X_scaled[:5])
# === SLIDE 3: Lasso regression with cross-validation ===
lasso_cv = LassoCV(cv=10, max_iter=10000)
lasso_cv.fit(X_scaled, y)
best_lambda_lasso = lasso_cv.alpha_
lasso_cv_coeffs = lasso_cv.coef_
lasso_cv_preds = lasso_cv.predict(X_scaled[:5])
# === Create summary DataFrame ===
summary = pd.DataFrame({
'Model': ['Ridge (lambda=100)', 'RidgeCV (best lambda)', 'LassoCV (best lambda)'],
'Best Lambda': [100, best_lambda_ridge, best_lambda_lasso],
'Non-zero Coefficients': [
np.sum(ridge_fixed_coeffs != 0),
np.sum(ridge_cv_coeffs != 0),
np.sum(lasso_cv_coeffs != 0)
]
})
print(summary)

@@ -0,0 +1,101 @@
import numpy as np
import pandas as pd
from ISLP import load_data
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
# Load and preprocess data
Carseats = load_data('Carseats').dropna()
# Create qualitative variable "High" vs "Low" Sales
Carseats['High'] = np.where(Carseats['Sales'] <= 8, 'No', 'Yes')
Carseats['High'] = Carseats['High'].astype('category')
# Drop 'Sales' from predictors
X = Carseats.drop(columns=['Sales', 'High'])
X = pd.get_dummies(X, drop_first=True) # Convert categorical to dummy variables
y = Carseats['High']
# Train/test split (200 obs each)
np.random.seed(2)
train_idx = np.random.choice(len(Carseats), size=200, replace=False)
X_train = X.iloc[train_idx]
X_test = X.drop(train_idx)
y_train = y.iloc[train_idx]
y_test = y.drop(train_idx)
# Fit classification tree
tree_model = DecisionTreeClassifier(criterion='entropy', random_state=2)
tree_model.fit(X_train, y_train)
# Summary
print(f"Tree depth: {tree_model.get_depth()}, Terminal nodes: {tree_model.get_n_leaves()}")
# Plot tree
plt.figure(figsize=(16, 8))
plot_tree(tree_model, filled=True, feature_names=X.columns, class_names=tree_model.classes_, fontsize=8)
plt.title("Classification Tree")
plt.show()
# Test error rate
y_pred = tree_model.predict(X_test)
error_rate_test = np.mean(y_pred != y_test)
print(f"Test Error (Unpruned Tree): {error_rate_test:.3f}")
# Cross-validation to find optimal pruning parameter using cost-complexity pruning
path = tree_model.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas[:-1] # exclude the last (trivial) alpha
cv_errors = []
for alpha in ccp_alphas:
clf = DecisionTreeClassifier(random_state=2, ccp_alpha=alpha)
scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
cv_errors.append(1 - scores.mean())
# Plot CV errors
plt.figure(figsize=(8, 5))
plt.plot(ccp_alphas, cv_errors, marker='o')
plt.xlabel("ccp_alpha")
plt.ylabel("Cross-Validated Classification Error")
plt.title("CV Error vs. Tree Complexity")
plt.show()
# Prune tree with optimal alpha (min CV error)
optimal_alpha = ccp_alphas[np.argmin(cv_errors)]
pruned_tree = DecisionTreeClassifier(random_state=2, ccp_alpha=optimal_alpha)
pruned_tree.fit(X_train, y_train)
# Plot pruned tree
plt.figure(figsize=(16, 8))
plot_tree(pruned_tree, filled=True, feature_names=X.columns, class_names=pruned_tree.classes_, fontsize=8)
plt.title("Pruned Classification Tree")
plt.show()
# Test error of pruned tree
y_pred_pruned = pruned_tree.predict(X_test)
error_rate_pruned = np.mean(y_pred_pruned != y_test)
print(f"Test Error (Pruned Tree): {error_rate_pruned:.3f}")
# Fit Random Forest
rf_model = RandomForestClassifier(n_estimators=500, max_features=3, oob_score=True, random_state=2)
rf_model.fit(X_train, y_train)
# OOB Error
oob_error = 1 - rf_model.oob_score_  # oob_score_ is the OOB accuracy, so this is the OOB error rate
print(f"OOB Error Rate: {oob_error:.3f}")
# Test error of RF
rf_pred = rf_model.predict(X_test)
error_rate_rf = np.mean(rf_pred != y_test)
print(f"Test Error (Random Forest): {error_rate_rf:.3f}")
# Feature importance
importances = pd.Series(rf_model.feature_importances_, index=X.columns)
importances.sort_values(ascending=True).plot(kind='barh', figsize=(10, 8), title="Variable Importance")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()

@@ -0,0 +1,293 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "67cd5699-6111-4576-9386-0fe46130f060",
"metadata": {},
"source": [
"# Preliminary setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0ea9c10a-5919-467d-8aca-efa3f2bc05e3",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from ISLP import load_data\n",
"from matplotlib.pyplot import subplots, show\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"# Load and preprocess data\n",
"Hitters = load_data('Hitters').dropna()"
]
},
{
"cell_type": "markdown",
"id": "ce3b15bc-bebb-48cb-b0ab-8754b5004796",
"metadata": {},
"source": [
"# Task 1"
]
},
{
"cell_type": "markdown",
"id": "a277a01e-5932-4376-9771-ca735b510eab",
"metadata": {},
"source": [
"1. Use the Hitters data and remove all rows that contain missing values. Create a new\n",
"variable that is the log of Salary and provide histograms for Salary and Log(Salary).\n",
"Interpret."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bcc5d1a2-c5b8-401d-b854-dd0ff5837704",
"metadata": {},
"outputs": [],
"source": []
},
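{
"cell_type": "code",
"execution_count": null,
"id": "sketch-task1-histograms",
"metadata": {},
"outputs": [],
"source": [
"# Sketch for Task 1 (Hitters already has missing rows removed in the setup cell):\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"Hitters['LogSalary'] = np.log(Hitters['Salary'])\n",
"fig, axes = plt.subplots(1, 2, figsize=(10, 4))\n",
"Hitters['Salary'].hist(ax=axes[0])\n",
"axes[0].set_title('Salary')\n",
"Hitters['LogSalary'].hist(ax=axes[1])\n",
"axes[1].set_title('Log(Salary)')\n",
"plt.show()"
]
},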
{
"cell_type": "markdown",
"id": "5ce10e96-7257-4e74-b4dd-61eadc98090a",
"metadata": {},
"source": [
"2. Split the sample into a training dataset consisting of the first 200 observations and a\n",
"test dataset containing the remaining observations."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1c39b34-4e4e-42bb-a915-ff7d9edc2bb5",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "2cffb0ba-7e62-4cff-b79d-ef5e027a62ec",
"metadata": {},
"source": [
"3. Fit a large, unpruned regression tree to predigt Log(Salary). Which features are used\n",
"to construct the tree, which features are the most important and how many terminal\n",
"nodes does the tree have? You might want to plot the tree for this exercise."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "425892e5-ba65-4be4-b103-5d1968973cf5",
"metadata": {},
"outputs": [],
"source": []
},
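{
"cell_type": "code",
"execution_count": null,
"id": "sketch-task3-tree",
"metadata": {},
"outputs": [],
"source": [
"# Sketch for Task 3 (includes the Task 2 split: first 200 observations for\n",
"# training, the rest for testing; assumes LogSalary from Task 1):\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.tree import DecisionTreeRegressor, plot_tree\n",
"\n",
"X = pd.get_dummies(Hitters.drop(columns=['Salary', 'LogSalary']), drop_first=True)\n",
"y = Hitters['LogSalary']\n",
"X_train, y_train = X.iloc[:200], y.iloc[:200]\n",
"X_test, y_test = X.iloc[200:], y.iloc[200:]\n",
"tree = DecisionTreeRegressor(random_state=2).fit(X_train, y_train)\n",
"print('Terminal nodes:', tree.get_n_leaves())\n",
"plt.figure(figsize=(16, 8))\n",
"plot_tree(tree, feature_names=list(X.columns), fontsize=6)\n",
"plt.show()"
]
},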
{
"cell_type": "markdown",
"id": "0c19dc38-6d3d-4d83-8e77-eab071883a1e",
"metadata": {},
"source": [
"4. Compute the mean squared prediction error for the test data."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "eb73ed7b-6730-4a98-b04e-0d12c0c7125d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "dbae3448-f484-4fe2-afd1-40a741b8ef9e",
"metadata": {},
"source": [
"5. Lets try to improve predictions using k-fold CV. Set the seed to 2 and run 5-fold cross\n",
"validation. Plot the mean squared cross validation error against the tree size and\n",
"report the tree size and the pruning parameter α that minimize the mean squared\n",
"cross validation error."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "31280859-0b4f-4b8d-9aeb-4e9c83bd008a",
"metadata": {},
"outputs": [],
"source": []
},
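{
"cell_type": "code",
"execution_count": null,
"id": "sketch-task5-cv-pruning",
"metadata": {},
"outputs": [],
"source": [
"# Sketch for Task 5 (assumes tree, X_train, y_train from Task 3): cost-complexity\n",
"# pruning path plus 5-fold CV over the candidate alphas, following the pattern of\n",
"# the Carseats classification script; alpha is the pruning parameter.\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.model_selection import cross_val_score\n",
"\n",
"np.random.seed(2)\n",
"path = tree.cost_complexity_pruning_path(X_train, y_train)\n",
"ccp_alphas = path.ccp_alphas[:-1]  # drop the trivial root-only alpha\n",
"cv_mse = [-cross_val_score(DecisionTreeRegressor(random_state=2, ccp_alpha=a),\n",
"                           X_train, y_train, cv=5,\n",
"                           scoring='neg_mean_squared_error').mean()\n",
"          for a in ccp_alphas]\n",
"best_alpha = ccp_alphas[int(np.argmin(cv_mse))]\n",
"plt.plot(ccp_alphas, cv_mse, marker='o')\n",
"plt.xlabel('ccp_alpha')\n",
"plt.ylabel('5-fold CV MSE')\n",
"plt.show()\n",
"print('Optimal alpha:', best_alpha)"
]
},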
{
"cell_type": "markdown",
"id": "37322a0e-a542-4b10-88e3-eb88d7b1f2ac",
"metadata": {},
"source": [
"6. Use the pruning parameter from the previous task to prune the tree. Plot the tree and\n",
"report the most important variables."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b8bf40b3-8cba-4335-92e2-686ba0a93185",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "67496351-580b-4e9f-9b17-2776f2c55843",
"metadata": {},
"source": [
"7. Compute the test mean squared prediction error for pruned tree and compare to the\n",
"results from Task 4."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3104831-7607-4eab-a0a2-861adde2658d",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "30021421-8807-4481-b28d-6ea23cb06b82",
"metadata": {},
"source": [
"8. Use random forest to improve the predictions. Fit $500$ trees using $m = \\sqrt(p)$ (round to the nearest integer)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c907edbf-5755-4a5c-bd12-ea80a2358358",
"metadata": {},
"outputs": [],
"source": []
},
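{
"cell_type": "code",
"execution_count": null,
"id": "sketch-task8-rf",
"metadata": {},
"outputs": [],
"source": [
"# Sketch for Task 8: a random forest with 500 trees and m = round(sqrt(p))\n",
"# features considered at each split (assumes X_train, y_train from Task 3):\n",
"import numpy as np\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"\n",
"m = int(round(np.sqrt(X_train.shape[1])))\n",
"rf = RandomForestRegressor(n_estimators=500, max_features=m,\n",
"                           oob_score=True, random_state=2).fit(X_train, y_train)\n",
"print('m =', m, '| OOB R^2:', rf.oob_score_)"
]
},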
{
"cell_type": "markdown",
"id": "4b014396-e91b-4f72-9b58-85fa80805eb0",
"metadata": {},
"source": [
"9. Do you think it was necessary to fit $500$ trees or would have fewer trees be sufficient? Determine the number of trees that provides the lowest OOB error."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77cb58bd-6d3d-4b0d-ad5e-e18737501cb8",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "2cea0e71-cc51-4890-b776-e4f03d7af94d",
"metadata": {},
"source": [
"10. Compute the OOB estimate of the out-of-sample error and compare it to best pruned model from CV of Task 5. Interpret the outcomes."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6aafe1d3-b54c-4bca-9070-ea62ac27f885",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "992771aa-1fec-44d0-b3f5-e8525bd1ce79",
"metadata": {},
"source": [
"11. Which are the most important variables used in the random forest?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85841a9e-4df5-4d14-ae2b-107002042fd8",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "bc5eee45-8c48-41dd-ba38-7f78c4bcd036",
"metadata": {},
"source": [
"12. Lets try to improve the random forest by trying out different values for $m$. Set up a grid for m going from $1$ to $p$. Write a loop that fits a random forest for each $m$. Explain which model you would choose."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0361acc5-041d-46b1-848d-eadea0ce717b",
"metadata": {},
"outputs": [],
"source": []
},
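{
"cell_type": "code",
"execution_count": null,
"id": "sketch-task12-m-grid",
"metadata": {},
"outputs": [],
"source": [
"# Sketch for Task 12: refit the forest for every m from 1 to p and track the OOB\n",
"# MSE; note that m = p corresponds to bagging, which Task 14 asks about.\n",
"import numpy as np\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"\n",
"p = X_train.shape[1]\n",
"oob_mse = []\n",
"for m in range(1, p + 1):\n",
"    rf_m = RandomForestRegressor(n_estimators=500, max_features=m,\n",
"                                 oob_score=True, random_state=2).fit(X_train, y_train)\n",
"    oob_mse.append(np.mean((rf_m.oob_prediction_ - y_train) ** 2))\n",
"print('Best m:', int(np.argmin(oob_mse)) + 1)"
]
},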
{
"cell_type": "markdown",
"id": "6f38e2e4-8242-46c6-9c49-69b7ee73be1e",
"metadata": {},
"source": [
"13. For the best model, compute the test errors and compare them to the best pruned model from Task 7."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d31d199b-116f-4585-8e4d-e40d4b6ff685",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "d6f1407e-5ad1-4690-bf9e-ecc36c4a50e5",
"metadata": {},
"source": [
"14. What is the OOB error obtained from bagging (you can infer the answer from the previous task)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7ed7a03-8520-4fba-b2ff-500979e92496",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -0,0 +1,62 @@
import pandas as pd
import numpy as np
from ISLP import load_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
#from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam
# === Setup ===
# Load and preprocess Hitters data
Hitters = load_data('Hitters').dropna()
# Convert target to binary classification (Salary >= 500 as good income)
print(Hitters[["Salary"]].describe())
y = np.where(Hitters['Salary'] >= 500, 1, 0)
# Convert categorical variables into numerical variables (if needed)
Hitters = pd.get_dummies(Hitters.drop(columns=['Salary']), drop_first=True)
# Extract feature matrix after one-hot encoding
X = Hitters
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
## Build the Neural Network
model = Sequential([
Dense(units=64, input_shape=(X_train.shape[1],), activation='relu'), # Input and hidden layer
Dense(units=32, activation='relu'), # Hidden layer
Dense(units=1, activation='sigmoid') # Output layer
])
## Compile the Model (Adam optimizer and binary_crossentropy loss)
model.compile(optimizer=Adam(learning_rate=0.001),
loss='binary_crossentropy',
metrics=['accuracy'])
## Train the Model
history = model.fit(X_train, y_train, epochs=50, batch_size=16, validation_split=0.1, verbose=1)
## Evaluate the Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")
## Visualize Training Progress
import matplotlib.pyplot as plt
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

@@ -0,0 +1,354 @@
{
"cells": [
{
"cell_type": "raw",
"id": "77f76980-cc4f-4837-867f-218c92a7deae",
"metadata": {},
"source": [
"\\vspace{-4cm}\n",
"\\begin{center}\n",
" \\LARGE{Machine Learning for Economics and Finance}\\\\[0.5cm]\n",
" \\Large{\\textbf{Problem Set 2}}\\\\[1.0cm]\n",
" \\large{Ole Wilms}\\\\[0.5cm]\n",
" \\large{July 29, 2024}\\\\\n",
"\\end{center}"
]
},
{
"cell_type": "raw",
"id": "2c3a2d4e-1e5a-4fe3-88be-abd9b9152def",
"metadata": {},
"source": [
"\\setcounter{secnumdepth}{0}"
]
},
{
"cell_type": "markdown",
"id": "040dc2a4-910e-4cf5-9d1e-62fe7d0a8efd",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": [],
"user_expressions": []
},
"source": [
"## Important Instructions\n",
"\n",
"- In this problem set you are asked to apply the machine learning techniques we covered in the past weeks\n",
"- In case you struggle with some problems, please post your questions on the OpenOlat discussion board.\n",
"- We will discuss the solutions for the problem set on `MONTH DAY`"
]
},
{
"cell_type": "markdown",
"id": "baac6966-d67a-4a66-acec-8ef6411c4f66",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": [],
"user_expressions": []
},
"source": [
"## Setup\n",
"\n",
"Assume the same setup as in *Problem Set 1* but now you try to improve the return predictions using\n",
"the machine learning approaches we have discussed in class. For this you are asked to use the same\n",
"training and test datasets we constructed in *Problem Set 1*."
]
},
{
"cell_type": "raw",
"id": "156ee566-f0eb-4206-a443-34a63bc6dbd8",
"metadata": {},
"source": [
"\\newpage"
]
},
{
"cell_type": "markdown",
"id": "87902d82-5336-456b-bec8-403530c75f00",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": [],
"user_expressions": []
},
"source": [
"## Question 1: Shrinkage Methods\n",
"\n",
"1. Fit a ridge regression using the training data. Determine the optimal penalty parameter $\\lambda$ using $5$-fold cross validation (set the seed to $2$ before you run the CV). Provide a plot of the cross-validation MSE as a function of log($\\lambda$) and interpret the outome."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0770500d-74fe-48df-841c-20b9aef42883",
"metadata": {},
"outputs": [],
"source": []
},
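{
"cell_type": "code",
"execution_count": null,
"id": "sketch-q1-1-ridge-cv",
"metadata": {},
"outputs": [],
"source": [
"# Sketch for Q1.1 (assumes the X_train, y_train return-prediction data built in\n",
"# Problem Set 1; the lambda grid bounds are an assumption):\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.linear_model import Ridge\n",
"from sklearn.model_selection import KFold, cross_val_score\n",
"\n",
"np.random.seed(2)\n",
"lambdas = 10 ** np.linspace(4, -4, 100)\n",
"kf = KFold(n_splits=5, shuffle=True, random_state=2)\n",
"cv_mse = [-cross_val_score(Ridge(alpha=lam), X_train, y_train, cv=kf,\n",
"                           scoring='neg_mean_squared_error').mean() for lam in lambdas]\n",
"plt.plot(np.log(lambdas), cv_mse)\n",
"plt.xlabel('log(lambda)')\n",
"plt.ylabel('5-fold CV MSE')\n",
"plt.show()\n",
"print('Best lambda:', lambdas[int(np.argmin(cv_mse))])"
]
},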
{
"cell_type": "markdown",
"id": "73330b81-0e43-43ac-911f-4086a9f9788f",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"2. Prepare a slide with a table that reports training MSE and test MSE for different models. Fill in the MSE from the linear model using all features from Problem Set 1. Now compute the training and test MSE for the ridge regression with the optimal penalty parameter $\\lambda$ from *Q1.1*."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1b13abd-80b1-4805-b108-55d403b7ab5c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "80e4160e-374a-43e1-a159-45077703658e",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"3. Redo the two tasks above using Lasso instead of Ridge. Again fix the seed to $2$. Provide a plot of the cross-validation MSE as a function of log($\\lambda$) and interpret. Provide a table that shows the coefficient of the Lasso with the optimal penalty parameter $\\lambda$. Compute the training and test MSE of this Lasso model and add it to the table from *Q1.2*."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a214f453-68d3-4b6f-bc36-dbabf5536fc3",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "03d19235-25ee-4c3b-b7bf-97cdf27d41b2",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"4. Now suppose your boss tells you that he only trusts sparse models with few variables. Use the Lasso and choose the tuning parameter $\\lambda$ such that the model only considers $3$ out of the six variables. Report the coefficients and compare them to the coefficients from the optimal model from *Q1.3* and interpret. Compute the training and test MSE of this Lasso model and add it to the table from *Q1.2*. Interpret."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e53d846-19a3-46d9-b103-f42e75a87c20",
"metadata": {},
"outputs": [],
"source": []
},
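{
"cell_type": "code",
"execution_count": null,
"id": "sketch-q1-4-sparse-lasso",
"metadata": {},
"outputs": [],
"source": [
"# Sketch for Q1.4 (assumes X_train, y_train and a lambda grid as in Q1.1/Q1.3):\n",
"# walk the grid from small to large lambda (stronger shrinkage drops more\n",
"# variables) and stop at the first fit with exactly 3 non-zero coefficients;\n",
"# refine the grid if no lambda hits exactly 3.\n",
"import numpy as np\n",
"from sklearn.linear_model import Lasso\n",
"\n",
"for lam in np.sort(lambdas):\n",
"    fit3 = Lasso(alpha=lam, max_iter=10000).fit(X_train, y_train)\n",
"    if np.sum(fit3.coef_ != 0) == 3:\n",
"        print('lambda:', lam)\n",
"        print(dict(zip(X_train.columns, fit3.coef_)))\n",
"        break"
]
},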
{
"cell_type": "markdown",
"id": "e715dd42-7021-466d-a9c1-0c0b4efeee78",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": [],
"user_expressions": []
},
"source": [
"## Question 2: Tree-Based Methods\n",
"\n",
"1. Fit a large regression tree using the training data. Report the number of terminal nodes as well as the most important variables for splitting the tree."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0207f3f9-c389-4e50-abeb-5316857ab2da",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "3069027d-f53f-4348-8c0c-0885483dc8d9",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"2. Compute the training and test MSE of the tree and add it to the table from *Q1.2*."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f65211c4-6864-4749-8b94-eaeea96c9cbf",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "581f7631-9c99-4143-b87e-11b43c243dd0",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"3. Again set the seed to $2$ and use $5$-fold cross validation to determine the optimal pruning parameter for the large tree. Provide a plot of the prediction error against the size of the tree. Report the optimal tree size and provide a plot of the pruned tree. Which variables are important for splitting the pruned tree?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9801c9a3-85ba-4b70-82b6-a9bbbfcfaec4",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "18a9a179-4226-4734-8bcf-554671ce85e9",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"4. Compute the training and test MSE of the pruned tree and add it to the table from *Q1.2*."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0272ea3-971d-4881-8308-9b41c38b05bd",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "5a7e1a79-340c-4b61-9e74-e06b4f455904",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"5. Finally, use random forest to improve the predictions. Motivate your choice for the tuning parameters. Report the training and test MSE and add it to the table from *Q1.2*. Which variables are most important in the random forest?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9731a27-c811-4cf2-a53d-7d49a48e1d5b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "ccecdd74-9faf-4b7a-bd23-9d3f81dcda60",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"6. Supposed it is the beginning of $2020$ and you have access to both the in-sample and out-of-sample errors for the different methods. Which model do you choose to predict stock markets in the future and why?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "151e7ae9-1f4d-47f9-87d1-9da0b030da50",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "raw",
"id": "2419d990-f478-4bda-8dbc-3144fbdfc917",
"metadata": {},
"source": [
"\\newpage"
]
},
{
"cell_type": "markdown",
"id": "81cbfae3-7385-40a2-8d0d-d7db7ae9a9f5",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": [],
"user_expressions": []
},
"source": [
"## Appendix\n",
"The dataset contains the following variables:\n",
"\n",
" - **ret**: the quarterly return of the US stock market (a number of 0.01 is a $1\\%$ return per quarter)\n",
" - **date**: the date in format $yyyyq$ ($19941$ means the first quarter of $1994$)\n",
" - **DP**: the dividend to price ratio of the stock market (a valuation measure whether prices are high or low relative to the dividends payed)\n",
" - **CS**: the credit spread defined as the difference in yields between high rated corporate bonds (save investments) and low rated corporate bonds (corporations that might go bankrupt). CS measures the additional return investors require to invest in risky firms compared to well established firms with lower risks\n",
" - **ntis**: A measure for corporate issuing activity (IPOs, stock repurchases,...)\n",
" - **cay**: a measure of the wealth-to-consumption ratio (how much is consumed relative to total wealth)\n",
" - **TS**: the term spread is the difference between the long term yield on government bonds and short term yields.\n",
" - **svar**: a measure for the stock market variance\n",
"\n",
"For a full description of the data, see *Welch und Goyal* ($2007$). Google is also very helpful if you are interested in obtaining more intuition about the variables.\n"
]
},
{
"cell_type": "markdown",
"id": "db90f03c-18a4-4e7f-a31c-56f206baf5cc",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": [],
"user_expressions": []
},
"source": [
"## References\n",
"\n",
"Welch, I. and A. Goyal ($2007$, $03$). A Comprehensive Look at The Empirical Performance of Equity\n",
"Premium Prediction. *The Review of Financial Studies 21* ($4$), $1455$ $1508$."
]
}
],
"metadata": {
"date": " ",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
},
"title": " ",
"toc-autonumbering": false,
"toc-showcode": false,
"toc-showmarkdowntxt": false,
"toc-showtags": false
},
"nbformat": 4,
"nbformat_minor": 5
}
