major upload of (python) course material & solutions
This commit is contained in:
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -0,0 +1,267 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"id": "03c68072-8fd9-4c26-9f8b-e6f6e24fd583",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\\vspace{-4cm}\n",
|
||||
"\\begin{center}\n",
|
||||
" \\LARGE{Machine Learning for Economics and Finance}\\\\[0.5cm]\n",
|
||||
" \\Large{\\textbf{01\\_Auto\\_data\\_1}}\\\\[1.0cm]\n",
|
||||
" \\large{Ole Wilms}\\\\[0.5cm]\n",
|
||||
" \\large{July 29, 2024}\\\\\n",
|
||||
"\\end{center}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"id": "4e117807-3711-444b-838d-775303383d93",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\\setcounter{secnumdepth}{0}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e0c43ee7-0ede-4d7e-9966-f00493b33f0a",
|
||||
"metadata": {
|
||||
"tags": [],
|
||||
"user_expressions": []
|
||||
},
|
||||
"source": [
|
||||
"**Get and Set working directory**:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "57ab9acc-8d99-4165-8930-db6ae2be39a9",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/mnt/ds/home/UHH_MLSJ_2024/Code/Python/01_SupLearn_Regression\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os # Package to access system related information \n",
|
||||
"print(os.getcwd()) # Prints the current working directory\n",
|
||||
"path = os.getcwd()\n",
|
||||
"os.chdir(path) # Set the working directory"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "4424246b-bee1-4b9e-a5ac-79c20e4b4c26",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>mpg</th>\n",
|
||||
" <th>cylinders</th>\n",
|
||||
" <th>displacement</th>\n",
|
||||
" <th>horsepower</th>\n",
|
||||
" <th>weight</th>\n",
|
||||
" <th>acceleration</th>\n",
|
||||
" <th>year</th>\n",
|
||||
" <th>origin</th>\n",
|
||||
" <th>name</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>18.0</td>\n",
|
||||
" <td>8</td>\n",
|
||||
" <td>307.0</td>\n",
|
||||
" <td>130</td>\n",
|
||||
" <td>3504</td>\n",
|
||||
" <td>12.0</td>\n",
|
||||
" <td>70</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>chevrolet chevelle malibu</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>15.0</td>\n",
|
||||
" <td>8</td>\n",
|
||||
" <td>350.0</td>\n",
|
||||
" <td>165</td>\n",
|
||||
" <td>3693</td>\n",
|
||||
" <td>11.5</td>\n",
|
||||
" <td>70</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>buick skylark 320</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>18.0</td>\n",
|
||||
" <td>8</td>\n",
|
||||
" <td>318.0</td>\n",
|
||||
" <td>150</td>\n",
|
||||
" <td>3436</td>\n",
|
||||
" <td>11.0</td>\n",
|
||||
" <td>70</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>plymouth satellite</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>16.0</td>\n",
|
||||
" <td>8</td>\n",
|
||||
" <td>304.0</td>\n",
|
||||
" <td>150</td>\n",
|
||||
" <td>3433</td>\n",
|
||||
" <td>12.0</td>\n",
|
||||
" <td>70</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>amc rebel sst</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>17.0</td>\n",
|
||||
" <td>8</td>\n",
|
||||
" <td>302.0</td>\n",
|
||||
" <td>140</td>\n",
|
||||
" <td>3449</td>\n",
|
||||
" <td>10.5</td>\n",
|
||||
" <td>70</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>ford torino</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" mpg cylinders displacement horsepower weight acceleration year \\\n",
|
||||
"0 18.0 8 307.0 130 3504 12.0 70 \n",
|
||||
"1 15.0 8 350.0 165 3693 11.5 70 \n",
|
||||
"2 18.0 8 318.0 150 3436 11.0 70 \n",
|
||||
"3 16.0 8 304.0 150 3433 12.0 70 \n",
|
||||
"4 17.0 8 302.0 140 3449 10.5 70 \n",
|
||||
"\n",
|
||||
" origin name \n",
|
||||
"0 1 chevrolet chevelle malibu \n",
|
||||
"1 1 buick skylark 320 \n",
|
||||
"2 1 plymouth satellite \n",
|
||||
"3 1 amc rebel sst \n",
|
||||
"4 1 ford torino "
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from ISLP import load_data # Package which contains the data\n",
|
||||
"Auto = load_data('Auto') # Loading the data\n",
|
||||
"Auto.head() # Showing the first 5 Lines of Data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "141f0257-39a4-4e21-9be0-78dfd645445a",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import statsmodels.formula.api as smf\n",
|
||||
"\n",
|
||||
"# fit model on training data and calculate training MSE\n",
|
||||
"fit_lm = smf.ols(formula='mpg ~ horsepower', data = Auto).fit()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "4ae9bf59-3b73-4020-b039-885948f6cbbd",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"==============================================================================\n",
|
||||
" coef std err t P>|t| [0.025 0.975]\n",
|
||||
"------------------------------------------------------------------------------\n",
|
||||
"Intercept 39.9359 0.717 55.660 0.000 38.525 41.347\n",
|
||||
"horsepower -0.1578 0.006 -24.489 0.000 -0.171 -0.145\n",
|
||||
"==============================================================================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(fit_lm.summary().tables[1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "77170717-6eb1-41fa-a2b6-fdbf5e9193cf",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"date": " ",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.7"
|
||||
},
|
||||
"title": " ",
|
||||
"toc-autonumbering": false,
|
||||
"toc-showcode": false,
|
||||
"toc-showmarkdowntxt": false,
|
||||
"toc-showtags": false
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
|
After Width: | Height: | Size: 48 KiB |
@@ -0,0 +1,184 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"id": "6cbef61b-0897-42bf-b456-c0a409b87c41",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\\vspace{-4cm}\n",
|
||||
"\\begin{center}\n",
|
||||
" \\LARGE{Machine Learning for Economics and Finance}\\\\\n",
|
||||
" \\Large{Task 1: Logistic Regressions}\\\\[0.5cm]\n",
|
||||
" \\Large{\\textbf{02\\_Default\\_data}}\\\\[1.0cm]\n",
|
||||
" \\large{Ole Wilms}\\\\[0.5cm]\n",
|
||||
" \\large{July 29, 2024}\\\\\n",
|
||||
"\\end{center}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"id": "13be77f3-44f0-4983-b4cb-bd3e4b5dba8b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\\setcounter{secnumdepth}{0}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "72f918a4-cdd4-4b46-a88f-f4b43c3c3a88",
|
||||
"metadata": {
|
||||
"tags": [],
|
||||
"user_expressions": []
|
||||
},
|
||||
"source": [
|
||||
"## Task 1: Logistic Regressions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0b3f9fc6-db4f-47b0-9dfa-e41d9f85a5ba",
|
||||
"metadata": {
|
||||
"tags": [],
|
||||
"user_expressions": []
|
||||
},
|
||||
"source": [
|
||||
"1.1 Randomly split the data into $7000$ observations for training and $3000$ observations for testing and set the seed to $1$ before sampling the data. Call these two datasets *train_data* and *test_data* respectively. (Hint: use the code to split the data from 01 Auto_data_2.R or Auto_data_2.Rmd)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "335aa198-5a94-4c5a-8ad8-67c78bcf71f5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "116c466d-0627-43d6-adbe-a937ac846a28",
|
||||
"metadata": {
|
||||
"tags": [],
|
||||
"user_expressions": []
|
||||
},
|
||||
"source": [
|
||||
"1.2 Fit a logistic regression of default on *income* using the *train_data*. Analyze the significance of\n",
|
||||
"the estimated coefficients."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "2e38a201-7f2d-4999-beab-5739217a9318",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "43c6dade-5a22-476a-b3bf-bfd1b880038d",
|
||||
"metadata": {
|
||||
"tags": [],
|
||||
"user_expressions": []
|
||||
},
|
||||
"source": [
|
||||
"1.3 Compute the *out-of-sample accuracy* and *error rate* and compare to the *in-sample statistics*. Do\n",
|
||||
"you think this is a good model to predict default?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "44028726-1eff-436f-bc47-04a6786ae3ad",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c28971ef-8bee-462d-9612-88f1534bfcb5",
|
||||
"metadata": {
|
||||
"tags": [],
|
||||
"user_expressions": []
|
||||
},
|
||||
"source": [
|
||||
"1.4 Add balance as a predictor and compute the *out-of-sample error rate* and *accuracy*. Do you\n",
|
||||
"think this is a good model to predict *default*?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3a7216df-adf5-4df0-9593-69c1a7649f64",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f267ef66-1775-42a8-a1e9-45fda849f4d9",
|
||||
"metadata": {
|
||||
"tags": [],
|
||||
"user_expressions": []
|
||||
},
|
||||
"source": [
|
||||
"1.5 Compare the results for Task $1.4$ to a model with only balance as a predictor. Which model\n",
|
||||
"would you choose?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "28082bd5-8fe1-4160-aec0-1a92aebfa671",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7ccad70f-5ef5-42c8-8c2e-22e76943d281",
|
||||
"metadata": {
|
||||
"tags": [],
|
||||
"user_expressions": []
|
||||
},
|
||||
"source": [
|
||||
"1.6 Take the model from Task $1.4$ but now re-estimate the model using different *seeds* to draw your\n",
|
||||
"*training* and *test data*. Does your *test error rate* change with the seed? What’s going on here?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9ab2f559-83b1-4a66-b1dc-8799b8301d85",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"date": " ",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.1"
|
||||
},
|
||||
"title": " ",
|
||||
"toc-autonumbering": false,
|
||||
"toc-showcode": false,
|
||||
"toc-showmarkdowntxt": false,
|
||||
"toc-showtags": false
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -0,0 +1,465 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"id": "6cbef61b-0897-42bf-b456-c0a409b87c41",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\\vspace{-4cm}\n",
|
||||
"\\begin{center}\n",
|
||||
" \\LARGE{Machine Learning for Economics and Finance}\\\\[0.5cm]\n",
|
||||
" \\Large{\\textbf{03\\_Default\\_data}}\\\\[1.0cm]\n",
|
||||
" \\large{Ole Wilms}\\\\[0.5cm]\n",
|
||||
" \\large{July 29, 2024}\\\\\n",
|
||||
"\\end{center}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"id": "13be77f3-44f0-4983-b4cb-bd3e4b5dba8b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\\setcounter{secnumdepth}{0}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "335aa198-5a94-4c5a-8ad8-67c78bcf71f5",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/mnt/ds/home/UHH_MLSJ_2024/Code/Python/03-CrossValidation\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>default</th>\n",
|
||||
" <th>student</th>\n",
|
||||
" <th>balance</th>\n",
|
||||
" <th>income</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>729.526495</td>\n",
|
||||
" <td>44361.625074</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>817.180407</td>\n",
|
||||
" <td>12106.134700</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>1073.549164</td>\n",
|
||||
" <td>31767.138947</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>529.250605</td>\n",
|
||||
" <td>35704.493935</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>785.655883</td>\n",
|
||||
" <td>38463.495879</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" default student balance income\n",
|
||||
"0 No No 729.526495 44361.625074\n",
|
||||
"1 No Yes 817.180407 12106.134700\n",
|
||||
"2 No No 1073.549164 31767.138947\n",
|
||||
"3 No No 529.250605 35704.493935\n",
|
||||
"4 No No 785.655883 38463.495879"
|
||||
]
|
||||
},
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os # Package to access system related information \n",
|
||||
"print(os.getcwd()) # Prints the current working directory\n",
|
||||
"path = os.getcwd()\n",
|
||||
"os.chdir(path) # Set the working directory\n",
|
||||
"\n",
|
||||
"from ISLP import load_data # Package which contains the data\n",
|
||||
"default_data = load_data('Default') # Loading the data\n",
|
||||
"default_data.head() # Showing the first 5 Lines of Data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "2e38a201-7f2d-4999-beab-5739217a9318",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 10000 entries, 0 to 9999\n",
|
||||
"Data columns (total 4 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 default 10000 non-null object \n",
|
||||
" 1 student 10000 non-null object \n",
|
||||
" 2 balance 10000 non-null float64\n",
|
||||
" 3 income 10000 non-null float64\n",
|
||||
"dtypes: float64(2), object(2)\n",
|
||||
"memory usage: 312.6+ KB\n",
|
||||
"None\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(default_data.info())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "7dd29324-cd54-415c-ba83-56c0d9f74159",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" balance income\n",
|
||||
"count 10000.000000 10000.000000\n",
|
||||
"mean 835.374886 33516.981876\n",
|
||||
"std 483.714985 13336.639563\n",
|
||||
"min 0.000000 771.967729\n",
|
||||
"25% 481.731105 21340.462903\n",
|
||||
"50% 823.636973 34552.644802\n",
|
||||
"75% 1166.308386 43807.729272\n",
|
||||
"max 2654.322576 73554.233495\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(default_data.describe())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "3debf6d8-efda-4414-bcca-dd758dc65512",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"# set seed\n",
|
||||
"np.random.seed(1)\n",
|
||||
"\n",
|
||||
"# Number of observations in the dataset\n",
|
||||
"n = len(default_data)\n",
|
||||
"\n",
|
||||
"# Shuffle the dataset using np.random.permutation\n",
|
||||
"shuffled_indices = np.random.permutation(n)\n",
|
||||
"\n",
|
||||
"# Compute training and validation sample sizes\n",
|
||||
"nT = int(0.7 * n) # Training sample size\n",
|
||||
"\n",
|
||||
"# Split the shuffled dataset based on the shuffled indices\n",
|
||||
"train_data = default_data.iloc[shuffled_indices[:nT]] # First 70% for training\n",
|
||||
"test_data = default_data.iloc[shuffled_indices[nT:]] # Remaining 30% for validation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "e1b2a560-2a8e-4881-8d51-f3d96c3b05fe",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Train data percentage of defaulting: 0.03157\n",
|
||||
"Test data percentage of defaulting: 0.03733\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"defaulting_train = (train_data['default'] == 'Yes').mean()\n",
|
||||
"defaulting_test = (test_data['default'] == 'Yes').mean()\n",
|
||||
"# The \"train_data$default == \"Yes\": creates a logical vector where each element is TRUE \n",
|
||||
"# if the corresponding element.\n",
|
||||
"# The outer mean() function than calculates the proportion of TRUE values \n",
|
||||
"# in the logical vector.\n",
|
||||
"\n",
|
||||
"# Output the results\n",
|
||||
"print(f\"Train data percentage of defaulting: {round(defaulting_train, 5)}\")\n",
|
||||
"print(f\"Test data percentage of defaulting: {round(defaulting_test, 5)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "f9a25057-a631-48dc-883f-643bd09d0999",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Generalized Linear Model Regression Results \n",
|
||||
"==============================================================================\n",
|
||||
"Dep. Variable: default No. Observations: 7000\n",
|
||||
"Model: GLM Df Residuals: 6997\n",
|
||||
"Model Family: Binomial Df Model: 2\n",
|
||||
"Link Function: Logit Scale: 1.0000\n",
|
||||
"Method: IRLS Log-Likelihood: -542.14\n",
|
||||
"Date: Sat, 19 Oct 2024 Deviance: 1084.3\n",
|
||||
"Time: 16:53:00 Pearson chi2: 5.42e+03\n",
|
||||
"No. Iterations: 9 Pseudo R-squ. (CS): 0.1179\n",
|
||||
"Covariance Type: nonrobust \n",
|
||||
"==============================================================================\n",
|
||||
" coef std err z P>|z| [0.025 0.975]\n",
|
||||
"------------------------------------------------------------------------------\n",
|
||||
"const -11.3514 0.515 -22.060 0.000 -12.360 -10.343\n",
|
||||
"income 1.847e-05 5.98e-06 3.091 0.002 6.76e-06 3.02e-05\n",
|
||||
"balance 0.0055 0.000 20.428 0.000 0.005 0.006\n",
|
||||
"==============================================================================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import statsmodels.api as sm\n",
|
||||
"\n",
|
||||
"train_data_copy = train_data.copy()\n",
|
||||
"train_data_copy['default'] = train_data_copy['default'].map({'No': 0, 'Yes': 1})\n",
|
||||
"\n",
|
||||
"test_data_copy = test_data.copy()\n",
|
||||
"test_data_copy['default'] = test_data_copy['default'].map({'No': 0, 'Yes': 1})\n",
|
||||
"\n",
|
||||
"# Logistic regression model:\n",
|
||||
"X_train = train_data_copy[['income','balance']]\n",
|
||||
"X_train = sm.add_constant(X_train) # Adds an intercept term to the model\n",
|
||||
"X_test = test_data_copy[['income','balance']]\n",
|
||||
"X_test = sm.add_constant(X_test) # Adds an intercept term to the model\n",
|
||||
"y_train = train_data_copy['default']\n",
|
||||
"\n",
|
||||
"# Fit the logistic regression model\n",
|
||||
"glm_fit = sm.GLM(y_train, X_train, family=sm.families.Binomial()).fit()\n",
|
||||
"print(glm_fit.summary())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "b5c7de71-463d-455b-a596-923cfcddcefb",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"const -11.351394\n",
|
||||
"income 0.000018\n",
|
||||
"balance 0.005536\n",
|
||||
"dtype: float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(glm_fit.params) # print coefficients"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "6b8fb99c-d172-4398-92e5-89324c1787f8",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"K-fold Cross-Validation Error Rate: 0.02571\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.metrics import accuracy_score\n",
|
||||
"from sklearn.model_selection import KFold\n",
|
||||
"\n",
|
||||
"# ---- K-Fold Cross-Validation ----\n",
|
||||
"folds = 10\n",
|
||||
"kf = KFold(n_splits=folds, shuffle=True, random_state=12)\n",
|
||||
"cv_errors = []\n",
|
||||
"\n",
|
||||
"for train_index, test_index in kf.split(X_train):\n",
|
||||
" X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]\n",
|
||||
" y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]\n",
|
||||
" \n",
|
||||
" # Fit model on this fold\n",
|
||||
" glm_fold = sm.GLM(y_train_fold, X_train_fold, family=sm.families.Binomial()).fit()\n",
|
||||
" \n",
|
||||
" # Compute the out-of-sample error for this fold\n",
|
||||
" preds_fold = glm_fold.predict(X_test_fold)\n",
|
||||
" pred_labels_fold = [1 if p > 0.5 else 0 for p in preds_fold]\n",
|
||||
" fold_error = np.mean(pred_labels_fold != y_test_fold)\n",
|
||||
" \n",
|
||||
" cv_errors.append(fold_error)\n",
|
||||
"\n",
|
||||
"cv_error_rate = np.mean(cv_errors)\n",
|
||||
"print(f\"K-fold Cross-Validation Error Rate: {cv_error_rate:.5f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "06091455-d874-4a10-9919-78c8c9ddfbed",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"In-sample accuracy: 0.97486\n",
|
||||
"In-sample error rate: 0.02514\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# ---- In-sample predictions ----\n",
|
||||
"glm_probs_train = glm_fit.predict(X_train)\n",
|
||||
"glm_pred_train = np.where(glm_probs_train > 0.5, 1, 0) # ternary operator\n",
|
||||
"\n",
|
||||
"# Compute in-sample accuracy and error rate\n",
|
||||
"accuracy_train = accuracy_score(y_train, glm_pred_train)\n",
|
||||
"error_rate_train = np.mean(glm_pred_train != y_train)\n",
|
||||
"\n",
|
||||
"print(f\"In-sample accuracy: {round(accuracy_train, 5)}\")\n",
|
||||
"print(f\"In-sample error rate: {round(error_rate_train, 5)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "9d115c02-9520-41d5-b04b-e8cbe84b0277",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ---- Out-of-sample predictions ----\n",
|
||||
"glm_probs_test = glm_fit.predict(X_test)\n",
|
||||
"glm_pred_test = np.where(glm_probs_test > 0.5, 1, 0) # ternary operator"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "80aadaaf-e914-4e70-9ea3-411965a8d9d7",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Out-of-sample accuracy: 0.97067\n",
|
||||
"Out-of-sample error rate: 0.02933\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Compute out-of-sample accuracy and error rate\n",
|
||||
"accuracy_test = accuracy_score(test_data_copy['default'], glm_pred_test)\n",
|
||||
"error_rate_test = np.mean(glm_pred_test != test_data_copy['default'])\n",
|
||||
"\n",
|
||||
"print(f\"Out-of-sample accuracy: {round(accuracy_test, 5)}\")\n",
|
||||
"print(f\"Out-of-sample error rate: {round(error_rate_test, 5)}\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"date": " ",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.7"
|
||||
},
|
||||
"title": " ",
|
||||
"toc-autonumbering": false,
|
||||
"toc-showcode": false,
|
||||
"toc-showmarkdowntxt": false,
|
||||
"toc-showtags": false
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
@@ -0,0 +1,196 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "fb0c424f-1667-4fb2-baab-2d88d8abb387",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Preliminary setup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "de6396ca-e17d-4c95-8f96-1f78a09e9ce2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from ISLP import load_data\n",
|
||||
"from matplotlib.pyplot import subplots, show\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Load and preprocess data\n",
|
||||
"Hitters = load_data('Hitters').dropna()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "87902d82-5336-456b-bec8-403530c75f00",
|
||||
"metadata": {
|
||||
"tags": [],
|
||||
"user_expressions": []
|
||||
},
|
||||
"source": [
|
||||
"# Task"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0ce8adda-23e7-498f-9ff3-26c138903b88",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"1. Use the final model (tuning parameter) obtained from 10-fold CV and fit the model again using the full dataset and display the corresponding coefficients."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ac884445-bc95-4659-b656-d9c5f821bf52",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "05635216-4afb-4d0d-982a-a2af35d6bf3a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. Multiply the feature Errors by $1/1000$ and again fit the model from Task 1. Display the coefficients and interpret. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "70bc0da8-6134-4d4d-ad1f-e43ea26fae3c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b6e19093-51bf-4e68-aba6-01c34905b5e4",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"3. Redo Task 2 BUT without the normalizing (standardize) the data. Refit the same model again and display the coefficients. Interpret. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "5a38add3-642e-41a8-8b80-c3d01a63e538",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "df85262d-8a38-4bf9-9dfa-0a001e117d33",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"4. Split the dataset into a training set using $80\\%$ of the observations and validation set using all other observations."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b0a152a8-395e-49e2-973d-252b88cd379c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e1e3e60e-0d5a-4340-ae29-9153ffdad7c8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"5. Set up a grid for the tuning parameter $\\lambda$ and fit Lasso regressions for all tuning parameters using the training data. Make sure that you choose the mininmum and maximum values of $\\lambda$ so that it allows you to determine the optimal $\\lambda$ parameter in the next task (you might need to play with the grid size a bit). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b5e0cff0-6782-40a3-8d7f-891c19bb5f4d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "21ba53c0-def1-4059-9872-27e6b437b8af",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"6. For each model (tuning parameter), compute the mean squared prediction error in the validation dataset. Plot the validation error as a function of $\\lambda$ and find the best model which minimizes the validation error. Display the estimated coefficients for the best model and check whether some features are not selected in the final regression. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8323ce02-17fe-4f54-820d-030f198a34fe",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "19f07912-bffd-4a19-9a92-aa1a2dc48c75",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"7. Finally compare the best Lasso model obtained from the validation set approach from Task 6 to the best Lasso model obtained by 5-fold cross-validation. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e0166113-9d31-4e42-a8df-69f2048b65af",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8fd306c8-2247-4343-8c30-5dd99393c9d0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"8. Compare the best model from Task 7 to the best ridge regression obtained from 5-fold cross validation. How do the coefficients of the two models differ?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3c70e9bd-78d9-4a91-a28f-588fca65c616",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"date": " ",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.3"
|
||||
},
|
||||
"title": " ",
|
||||
"toc-autonumbering": false,
|
||||
"toc-showcode": false,
|
||||
"toc-showmarkdowntxt": false,
|
||||
"toc-showtags": false
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -0,0 +1,164 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.feature_selection import SequentialFeatureSelector
|
||||
from ISLP import load_data
|
||||
|
||||
###
|
||||
# Forward stepwise selection
|
||||
###
|
||||
# Load Hitters dataset from ISLP
|
||||
Hitters = load_data('Hitters')
|
||||
|
||||
# Remove missing values
|
||||
Hitters = Hitters.dropna()
|
||||
|
||||
# Create dummy variables for categorical columns
|
||||
Hitters = pd.get_dummies(Hitters, drop_first=True)
|
||||
|
||||
# Separate response (target) and predictors
|
||||
y = Hitters['Salary']
|
||||
X = Hitters.drop(columns=['Salary'])
|
||||
|
||||
# Define the linear regression model
|
||||
model = LinearRegression()
|
||||
|
||||
# Perform forward stepwise selection using SequentialFeatureSelector
|
||||
#sfs = SequentialFeatureSelector(model, n_features_to_select=15, direction='forward', cv=5)
|
||||
sfs = SequentialFeatureSelector(model, n_features_to_select=15, direction='forward')
|
||||
|
||||
# Fit the model to the data
|
||||
sfs.fit(X, y)
|
||||
|
||||
# Get the selected features
|
||||
selected_features = X.columns[sfs.get_support()]
|
||||
|
||||
# Fit the model with the selected features
|
||||
model.fit(X[selected_features], y)
|
||||
|
||||
# Coefficients of the selected features
|
||||
coefficients = pd.DataFrame({
|
||||
'Feature': selected_features,
|
||||
'Coefficient': model.coef_
|
||||
})
|
||||
|
||||
# Printing short summary - intercept, coefficients and $R^{2}$
|
||||
print("\nIntercept:")
|
||||
print(model.intercept_)
|
||||
|
||||
print("\nCoefficients:")
|
||||
print(coefficients)
|
||||
|
||||
print("\nR-squared:")
|
||||
print(model.score(X[selected_features], y))
|
||||
|
||||
|
||||
###
|
||||
# Validation errors for FSS
|
||||
###
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import mean_squared_error as MSE
|
||||
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
|
||||
import statsmodels.api as sm
|
||||
|
||||
# Split the data into training and validation sets based on row indices
|
||||
train_data = Hitters.iloc[:184] # First 184 rows for training data
|
||||
val_data = Hitters.iloc[184:263] # Rows 185 to 263 for validation data
|
||||
|
||||
# Define X and y for both training and validation sets
|
||||
X_train = train_data.drop(columns=['Salary'])
|
||||
y_train = train_data['Salary']
|
||||
X_val = val_data.drop(columns=['Salary'])
|
||||
y_val = val_data['Salary']
|
||||
|
||||
# Ensure that all categorical variables are encoded as numeric
|
||||
X_train = pd.get_dummies(X_train, drop_first=True).astype(float)
|
||||
X_val = pd.get_dummies(X_val, drop_first=True).astype(float)
|
||||
|
||||
# Align columns of validation set to match training set
|
||||
X_val = X_val.reindex(columns=X_train.columns, fill_value=0).astype(float)
|
||||
|
||||
# Convert validation data to matrix form (for statsmodels)
|
||||
val_data = sm.add_constant(X_val)
|
||||
|
||||
# Ensure target variable is numeric
|
||||
y_train_np = np.asarray(y_train).astype(float)
|
||||
y_val_np = np.asarray(y_val).astype(float)
|
||||
|
||||
|
||||
# Run forward stepwise selection using sklearn's SequentialFeatureSelector
|
||||
model2 = LinearRegression()
|
||||
|
||||
sfs2 = SFS(model2,
|
||||
k_features=15,
|
||||
forward=True,
|
||||
floating=False,
|
||||
scoring='neg_mean_squared_error',
|
||||
cv=0) # No cross-validation
|
||||
|
||||
sfs2.fit(X_train, y_train)
|
||||
|
||||
# Extract selected features for each number of features (1 to 15)
|
||||
#selected_features = list(sfs2.subsets_)
|
||||
selected_features = sfs2.subsets_
|
||||
|
||||
# Compute validation mean squared errors for each model
|
||||
val_err = np.zeros(15)
|
||||
for i in range(1, 16):
|
||||
# Get the selected feature names for this step
|
||||
feature_names = selected_features[i]['feature_names']
|
||||
|
||||
# Select the corresponding features from X_train
|
||||
X_train_selected = X_train[list(feature_names)]
|
||||
|
||||
# Add constant (intercept) term
|
||||
X_train_selected = sm.add_constant(X_train_selected).astype(float)
|
||||
|
||||
# Ensure the selected features are numeric
|
||||
X_train_selected_np = np.asarray(X_train_selected).astype(float)
|
||||
|
||||
# Fit OLS model
|
||||
model = sm.OLS(y_train_np, X_train_selected_np).fit()
|
||||
|
||||
# Predict on validation set
|
||||
X_val_selected = val_data[list(feature_names)]
|
||||
X_val_selected_np = sm.add_constant(X_val_selected).astype(float) # Ensure numpy array is float
|
||||
|
||||
y_pred_val = model.predict(X_val_selected_np)
|
||||
|
||||
# Compute MSE for validation set
|
||||
val_err[i - 1] = MSE(y_val_np, y_pred_val)
|
||||
|
||||
# Print validation errors for each model size
|
||||
print("Validation Errors for each model size (1 to 15 features):")
|
||||
print(val_err)
|
||||
|
||||
print("\nMin val_err: ", min(val_err))
|
||||
|
||||
|
||||
##
|
||||
# PLOT results
|
||||
##
|
||||
import matplotlib.pyplot as plt
|
||||
# Assuming 'val_err' contains the validation MSE values
|
||||
|
||||
# Find the index of the minimum validation error
|
||||
min_index = np.argmin(val_err) + 1 # +1 because index starts from 0, but variables start from 1
|
||||
|
||||
# Plot the validation errors
|
||||
plt.figure(figsize=(8, 5))
|
||||
plt.plot(range(1, 16), val_err, marker='o', linestyle='--', color='black')
|
||||
|
||||
# Highlight the minimum MSE with a red vertical line
|
||||
plt.axvline(x=min_index, color='red', linestyle='-', linewidth=1.5)
|
||||
|
||||
# Label the axes
|
||||
plt.xlabel("# Variables", fontsize=12)
|
||||
plt.ylabel("Validation MSE", fontsize=12)
|
||||
|
||||
# Title for the plot (optional)
|
||||
plt.title("Validation MSE vs Number of Variables", fontsize=14)
|
||||
|
||||
# Show the plot
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
@@ -0,0 +1,50 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.linear_model import Ridge, RidgeCV, LassoCV
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from ISLP import load_data
|
||||
|
||||
# === Setup ===
|
||||
# Load and preprocess data
|
||||
Hitters = load_data('Hitters').dropna()
|
||||
Hitters = pd.get_dummies(Hitters, drop_first=True)
|
||||
y = Hitters['Salary']
|
||||
X = Hitters.drop(columns=['Salary'])
|
||||
|
||||
# Standardize predictors
|
||||
scaler = StandardScaler()
|
||||
X_scaled = scaler.fit_transform(X)
|
||||
|
||||
# === SLIDE 1: Ridge regression with fixed lambda ===
|
||||
ridge_fixed = Ridge(alpha=100)
|
||||
ridge_fixed.fit(X_scaled, y)
|
||||
ridge_fixed_coeffs = ridge_fixed.coef_
|
||||
ridge_fixed_preds = ridge_fixed.predict(X_scaled[:5])
|
||||
|
||||
# === SLIDE 2: Ridge regression with cross-validation to find best lambda ===
|
||||
lambdas = 10**np.linspace(10, -2, 100) * 0.5 # Equivalent to R's lambda grid
|
||||
ridge_cv = RidgeCV(alphas=lambdas, scoring='neg_mean_squared_error', cv=10)
|
||||
ridge_cv.fit(X_scaled, y)
|
||||
best_lambda_ridge = ridge_cv.alpha_
|
||||
ridge_cv_coeffs = ridge_cv.coef_
|
||||
ridge_cv_preds = ridge_cv.predict(X_scaled[:5])
|
||||
|
||||
# === SLIDE 3: Lasso regression with cross-validation ===
|
||||
lasso_cv = LassoCV(cv=10, max_iter=10000)
|
||||
lasso_cv.fit(X_scaled, y)
|
||||
best_lambda_lasso = lasso_cv.alpha_
|
||||
lasso_cv_coeffs = lasso_cv.coef_
|
||||
lasso_cv_preds = lasso_cv.predict(X_scaled[:5])
|
||||
|
||||
# === Create summary DataFrame ===
|
||||
summary = pd.DataFrame({
|
||||
'Model': ['Ridge (lambda=100)', 'RidgeCV (best lambda)', 'LassoCV (best lambda)'],
|
||||
'Best Lambda': [100, best_lambda_ridge, best_lambda_lasso],
|
||||
'Non-zero Coefficients': [
|
||||
np.sum(ridge_fixed_coeffs != 0),
|
||||
np.sum(ridge_cv_coeffs != 0),
|
||||
np.sum(lasso_cv_coeffs != 0)
|
||||
]
|
||||
})
|
||||
|
||||
print(summary)
|
||||
@@ -0,0 +1,101 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from ISLP import load_data
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.tree import DecisionTreeClassifier, plot_tree
|
||||
from sklearn.metrics import accuracy_score
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.model_selection import cross_val_score
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
# Load and preprocess data
|
||||
Carseats = load_data('Carseats').dropna()
|
||||
|
||||
# Create qualitative variable "High" vs "Low" Sales
|
||||
Carseats['High'] = np.where(Carseats['Sales'] <= 8, 'No', 'Yes')
|
||||
Carseats['High'] = Carseats['High'].astype('category')
|
||||
|
||||
# Drop 'Sales' from predictors
|
||||
X = Carseats.drop(columns=['Sales', 'High'])
|
||||
X = pd.get_dummies(X, drop_first=True) # Convert categorical to dummy variables
|
||||
y = Carseats['High']
|
||||
|
||||
# Train/test split (200 obs each)
|
||||
np.random.seed(2)
|
||||
train_idx = np.random.choice(len(Carseats), size=200, replace=False)
|
||||
X_train = X.iloc[train_idx]
|
||||
X_test = X.drop(train_idx)
|
||||
y_train = y.iloc[train_idx]
|
||||
y_test = y.drop(train_idx)
|
||||
|
||||
# Fit classification tree
|
||||
tree_model = DecisionTreeClassifier(criterion='entropy', random_state=2)
|
||||
tree_model.fit(X_train, y_train)
|
||||
|
||||
# Summary
|
||||
print(f"Tree depth: {tree_model.get_depth()}, Terminal nodes: {tree_model.get_n_leaves()}")
|
||||
|
||||
# Plot tree
|
||||
plt.figure(figsize=(16, 8))
|
||||
plot_tree(tree_model, filled=True, feature_names=X.columns, class_names=tree_model.classes_, fontsize=8)
|
||||
plt.title("Classification Tree")
|
||||
plt.show()
|
||||
|
||||
# Test error rate
|
||||
y_pred = tree_model.predict(X_test)
|
||||
error_rate_test = np.mean(y_pred != y_test)
|
||||
print(f"Test Error (Unpruned Tree): {error_rate_test:.3f}")
|
||||
|
||||
# Cross-validation to find optimal pruning parameter using cost-complexity pruning
|
||||
path = tree_model.cost_complexity_pruning_path(X_train, y_train)
|
||||
ccp_alphas = path.ccp_alphas[:-1] # exclude the last (trivial) alpha
|
||||
cv_errors = []
|
||||
|
||||
for alpha in ccp_alphas:
|
||||
clf = DecisionTreeClassifier(random_state=2, ccp_alpha=alpha)
|
||||
scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
|
||||
cv_errors.append(1 - scores.mean())
|
||||
|
||||
# Plot CV errors
|
||||
plt.figure(figsize=(8, 5))
|
||||
plt.plot(ccp_alphas, cv_errors, marker='o')
|
||||
plt.xlabel("ccp_alpha")
|
||||
plt.ylabel("Cross-Validated Classification Error")
|
||||
plt.title("CV Error vs. Tree Complexity")
|
||||
plt.show()
|
||||
|
||||
# Prune tree with optimal alpha (min CV error)
|
||||
optimal_alpha = ccp_alphas[np.argmin(cv_errors)]
|
||||
pruned_tree = DecisionTreeClassifier(random_state=2, ccp_alpha=optimal_alpha)
|
||||
pruned_tree.fit(X_train, y_train)
|
||||
|
||||
# Plot pruned tree
|
||||
plt.figure(figsize=(16, 8))
|
||||
plot_tree(pruned_tree, filled=True, feature_names=X.columns, class_names=pruned_tree.classes_, fontsize=8)
|
||||
plt.title("Pruned Classification Tree")
|
||||
plt.show()
|
||||
|
||||
# Test error of pruned tree
|
||||
y_pred_pruned = pruned_tree.predict(X_test)
|
||||
error_rate_pruned = np.mean(y_pred_pruned != y_test)
|
||||
print(f"Test Error (Pruned Tree): {error_rate_pruned:.3f}")
|
||||
|
||||
# Fit Random Forest
|
||||
rf_model = RandomForestClassifier(n_estimators=500, max_features=3, oob_score=True, random_state=2)
|
||||
rf_model.fit(X_train, y_train)
|
||||
|
||||
# OOB Error
|
||||
oob_error = 1 - rf_model.oob_score_ if rf_model.oob_score else "OOB not enabled"
|
||||
print(f"OOB Error Rate: {oob_error}")
|
||||
|
||||
# Test error of RF
|
||||
rf_pred = rf_model.predict(X_test)
|
||||
error_rate_rf = np.mean(rf_pred != y_test)
|
||||
print(f"Test Error (Random Forest): {error_rate_rf:.3f}")
|
||||
|
||||
# Feature importance
|
||||
importances = pd.Series(rf_model.feature_importances_, index=X.columns)
|
||||
importances.sort_values(ascending=True).plot(kind='barh', figsize=(10, 8), title="Variable Importance")
|
||||
plt.xlabel("Importance")
|
||||
plt.tight_layout()
|
||||
plt.show()
|
||||
@@ -0,0 +1,293 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "67cd5699-6111-4576-9386-0fe46130f060",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Preliminary setup"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0ea9c10a-5919-467d-8aca-efa3f2bc05e3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from ISLP import load_data\n",
|
||||
"from matplotlib.pyplot import subplots, show\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Load and preprocess data\n",
|
||||
"Hitters = load_data('Hitters').dropna()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "ce3b15bc-bebb-48cb-b0ab-8754b5004796",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Task 1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a277a01e-5932-4376-9771-ca735b510eab",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"1. Use the Hitters data and remove all rows that contain missing values. Create a new\n",
|
||||
"variable that is the log of Salary and provide histograms for Salary and Log(Salary).\n",
|
||||
"Interpret."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bcc5d1a2-c5b8-401d-b854-dd0ff5837704",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5ce10e96-7257-4e74-b4dd-61eadc98090a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"2. Split the sample into a training dataset consisting of the first 200 observations and a\n",
|
||||
"test dataset containing the remaining observations."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e1c39b34-4e4e-42bb-a915-ff7d9edc2bb5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2cffb0ba-7e62-4cff-b79d-ef5e027a62ec",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"3. Fit a large, unpruned regression tree to predigt Log(Salary). Which features are used\n",
|
||||
"to construct the tree, which features are the most important and how many terminal\n",
|
||||
"nodes does the tree have? You might want to plot the tree for this exercise."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "425892e5-ba65-4be4-b103-5d1968973cf5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0c19dc38-6d3d-4d83-8e77-eab071883a1e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"4. Compute the mean squared prediction error for the test data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "eb73ed7b-6730-4a98-b04e-0d12c0c7125d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "dbae3448-f484-4fe2-afd1-40a741b8ef9e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"5. Let’s try to improve predictions using k-fold CV. Set the seed to 2 and run 5-fold cross\n",
|
||||
"validation. Plot the mean squared cross validation error against the tree size and\n",
|
||||
"report the tree size and the pruning parameter α that minimize the mean squared\n",
|
||||
"cross validation error."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "31280859-0b4f-4b8d-9aeb-4e9c83bd008a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "37322a0e-a542-4b10-88e3-eb88d7b1f2ac",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"6. Use the pruning parameter from the previous task to prune the tree. Plot the tree and\n",
|
||||
"report the most important variables."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b8bf40b3-8cba-4335-92e2-686ba0a93185",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "67496351-580b-4e9f-9b17-2776f2c55843",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"7. Compute the test mean squared prediction error for pruned tree and compare to the\n",
|
||||
"results from Task 4."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c3104831-7607-4eab-a0a2-861adde2658d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "30021421-8807-4481-b28d-6ea23cb06b82",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"8. Use random forest to improve the predictions. Fit $500$ trees using $m = \\sqrt(p)$ (round to the nearest integer)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c907edbf-5755-4a5c-bd12-ea80a2358358",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4b014396-e91b-4f72-9b58-85fa80805eb0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"9. Do you think it was necessary to fit $500$ trees or would have fewer trees be sufficient? Determine the number of trees that provides the lowest OOB error."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "77cb58bd-6d3d-4b0d-ad5e-e18737501cb8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2cea0e71-cc51-4890-b776-e4f03d7af94d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"10. Compute the OOB estimate of the out-of-sample error and compare it to best pruned model from CV of Task 5. Interpret the outcomes."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6aafe1d3-b54c-4bca-9070-ea62ac27f885",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "992771aa-1fec-44d0-b3f5-e8525bd1ce79",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"11. Which are the most important variables used in the random forest?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "85841a9e-4df5-4d14-ae2b-107002042fd8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bc5eee45-8c48-41dd-ba38-7f78c4bcd036",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"12. Let’s try to improve the random forest by trying out different values for $m$. Set up a grid for m going from $1$ to $p$. Write a loop that fits a random forest for each $m$. Explain which model you would choose."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0361acc5-041d-46b1-848d-eadea0ce717b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6f38e2e4-8242-46c6-9c49-69b7ee73be1e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"13. For the best model, compute the test errors and compare them to the best pruned model from Task 7."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d31d199b-116f-4585-8e4d-e40d4b6ff685",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d6f1407e-5ad1-4690-bf9e-ecc36c4a50e5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"14. What is the OOB error obtained from bagging (you can infer the answer from the previous task)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d7ed7a03-8520-4fba-b2ff-500979e92496",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -0,0 +1,62 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ISLP import load_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
#from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam

# === Setup ===
# Load the Hitters data and drop rows with missing values
Hitters = load_data('Hitters').dropna()

# Convert the target to a binary classification problem (Salary >= 500 coded as 1)
print(Hitters[["Salary"]].describe())
y = np.where(Hitters['Salary'] >= 500, 1, 0)

# Convert categorical variables into numerical dummy variables
Hitters = pd.get_dummies(Hitters.drop(columns=['Salary']), drop_first=True)

# Feature matrix after one-hot encoding
X = Hitters

# Standardize the features (zero mean, unit variance)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

## Build the Neural Network
model = Sequential([
    Dense(units=64, input_shape=(X_train.shape[1],), activation='relu'),  # input and first hidden layer
    Dense(units=32, activation='relu'),                                   # second hidden layer
    Dense(units=1, activation='sigmoid')                                  # output layer (probability)
])

## Compile the Model (Adam optimizer and binary cross-entropy loss)
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

## Train the Model (10% of the training data is held out for validation)
history = model.fit(X_train, y_train, epochs=50, batch_size=16, validation_split=0.1, verbose=1)

## Evaluate the Model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

## Visualize Training Progress
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
Binary file not shown.
After Width: | Height: | Size: 33 KiB |
3688
Machine Learning for Economics and Finance/Problem Set 1/ProblemSet1_solution.ipynb
Executable file
File diff suppressed because one or more lines are too long
Binary file not shown.
354
Machine Learning for Economics and Finance/Problem Set 2/ProblemSet2.ipynb
Executable file
@@ -0,0 +1,354 @@
{
"cells": [
{
"cell_type": "raw",
"id": "77f76980-cc4f-4837-867f-218c92a7deae",
"metadata": {},
"source": [
"\\vspace{-4cm}\n",
"\\begin{center}\n",
" \\LARGE{Machine Learning for Economics and Finance}\\\\[0.5cm]\n",
" \\Large{\\textbf{Problem Set 2}}\\\\[1.0cm]\n",
" \\large{Ole Wilms}\\\\[0.5cm]\n",
" \\large{July 29, 2024}\\\\\n",
"\\end{center}"
]
},
{
"cell_type": "raw",
"id": "2c3a2d4e-1e5a-4fe3-88be-abd9b9152def",
"metadata": {},
"source": [
"\\setcounter{secnumdepth}{0}"
]
},
{
"cell_type": "markdown",
"id": "040dc2a4-910e-4cf5-9d1e-62fe7d0a8efd",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": [],
"user_expressions": []
},
"source": [
"## Important Instructions\n",
"\n",
"- In this problem set you are asked to apply the machine learning techniques we covered in the past weeks.\n",
"- In case you struggle with some problems, please post your questions on the OpenOlat discussion board.\n",
"- We will discuss the solutions for the problem set on `MONTH DAY`."
]
},
{
"cell_type": "markdown",
"id": "baac6966-d67a-4a66-acec-8ef6411c4f66",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": [],
"user_expressions": []
},
"source": [
"## Setup\n",
"\n",
"Assume the same setup as in *Problem Set 1*, but now try to improve the return predictions using\n",
"the machine learning approaches we have discussed in class. For this, use the same\n",
"training and test datasets we constructed in *Problem Set 1*."
]
},
{
"cell_type": "raw",
"id": "156ee566-f0eb-4206-a443-34a63bc6dbd8",
"metadata": {},
"source": [
"\\newpage"
]
},
{
"cell_type": "markdown",
"id": "87902d82-5336-456b-bec8-403530c75f00",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": [],
"user_expressions": []
},
"source": [
"## Question 1: Shrinkage Methods\n",
"\n",
"1. Fit a ridge regression using the training data. Determine the optimal penalty parameter $\\lambda$ using $5$-fold cross-validation (set the seed to $2$ before you run the CV). Provide a plot of the cross-validation MSE as a function of log($\\lambda$) and interpret the outcome."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0770500d-74fe-48df-841c-20b9aef42883",
"metadata": {},
"outputs": [],
"source": []
},
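{
"cell_type": "markdown",
"id": "sketch-q1-1",
"metadata": {},
"source": [
"*A minimal sketch, assuming the Problem Set 1 training data as `X_train`/`y_train`: cross-validate a grid of penalties with scikit-learn's `Ridge` and plot the CV MSE against $\\log(\\lambda)$.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sketch-q1-1-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch, assuming X_train, y_train from Problem Set 1\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.linear_model import Ridge\n",
"from sklearn.model_selection import KFold, cross_val_score\n",
"\n",
"lambdas = np.logspace(-4, 4, 100)  # illustrative penalty grid\n",
"cv = KFold(n_splits=5, shuffle=True, random_state=2)  # seed = 2\n",
"cv_mse = [-cross_val_score(Ridge(alpha=lam), X_train, y_train,\n",
"                           scoring='neg_mean_squared_error', cv=cv).mean()\n",
"          for lam in lambdas]\n",
"\n",
"plt.plot(np.log(lambdas), cv_mse)\n",
"plt.xlabel('log(lambda)')\n",
"plt.ylabel('CV MSE')\n",
"plt.show()\n",
"lam_opt = lambdas[int(np.argmin(cv_mse))]\n",
"print(f'Optimal lambda: {lam_opt:.4f}')"
]
},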
{
"cell_type": "markdown",
"id": "73330b81-0e43-43ac-911f-4086a9f9788f",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"2. Prepare a slide with a table that reports training MSE and test MSE for different models. Fill in the MSE from the linear model using all features from Problem Set 1. Now compute the training and test MSE for the ridge regression with the optimal penalty parameter $\\lambda$ from *Q1.1*."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f1b13abd-80b1-4805-b108-55d403b7ab5c",
"metadata": {},
"outputs": [],
"source": []
},
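{
"cell_type": "markdown",
"id": "sketch-q1-2",
"metadata": {},
"source": [
"*One way to fill the table (a sketch, assuming the splits from Problem Set 1 and `lam_opt` from Q1.1); the helper can be reused for the later models.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sketch-q1-2-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch, assuming X_train, X_test, y_train, y_test and lam_opt from Q1.1\n",
"from sklearn.linear_model import Ridge\n",
"from sklearn.metrics import mean_squared_error\n",
"\n",
"def train_test_mse(model):\n",
"    \"\"\"Return (training MSE, test MSE) of a fitted model.\"\"\"\n",
"    return (mean_squared_error(y_train, model.predict(X_train)),\n",
"            mean_squared_error(y_test, model.predict(X_test)))\n",
"\n",
"ridge = Ridge(alpha=lam_opt).fit(X_train, y_train)\n",
"print('Ridge (train MSE, test MSE):', train_test_mse(ridge))"
]
},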
{
"cell_type": "markdown",
"id": "80e4160e-374a-43e1-a159-45077703658e",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"3. Redo the two tasks above using the Lasso instead of Ridge. Again fix the seed to $2$. Provide a plot of the cross-validation MSE as a function of log($\\lambda$) and interpret it. Provide a table that shows the coefficients of the Lasso with the optimal penalty parameter $\\lambda$. Compute the training and test MSE of this Lasso model and add it to the table from *Q1.2*."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a214f453-68d3-4b6f-bc36-dbabf5536fc3",
"metadata": {},
"outputs": [],
"source": []
},
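{
"cell_type": "markdown",
"id": "sketch-q1-3",
"metadata": {},
"source": [
"*A sketch using scikit-learn's `LassoCV`; `feature_names` is a hypothetical list of the predictor names.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sketch-q1-3-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch, assuming X_train, y_train; feature_names is a hypothetical name list\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.linear_model import LassoCV\n",
"from sklearn.model_selection import KFold\n",
"\n",
"cv = KFold(n_splits=5, shuffle=True, random_state=2)  # seed = 2\n",
"lasso = LassoCV(alphas=np.logspace(-4, 1, 100), cv=cv).fit(X_train, y_train)\n",
"\n",
"# CV MSE (averaged over folds) as a function of log(lambda)\n",
"plt.plot(np.log(lasso.alphas_), lasso.mse_path_.mean(axis=1))\n",
"plt.xlabel('log(lambda)')\n",
"plt.ylabel('CV MSE')\n",
"plt.show()\n",
"\n",
"print(f'Optimal lambda: {lasso.alpha_:.4f}')\n",
"print(pd.Series(lasso.coef_, index=feature_names))  # coefficient table"
]
},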
{
"cell_type": "markdown",
"id": "03d19235-25ee-4c3b-b7bf-97cdf27d41b2",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"4. Now suppose your boss tells you that he only trusts sparse models with few variables. Use the Lasso and choose the tuning parameter $\\lambda$ such that the model keeps only $3$ of the $6$ variables. Report the coefficients, compare them to the coefficients from the optimal model from *Q1.3*, and interpret. Compute the training and test MSE of this Lasso model and add it to the table from *Q1.2*. Interpret."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e53d846-19a3-46d9-b103-f42e75a87c20",
"metadata": {},
"outputs": [],
"source": []
},
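{
"cell_type": "markdown",
"id": "sketch-q1-4",
"metadata": {},
"source": [
"*A sketch: walk up the regularization path until exactly $3$ coefficients remain nonzero (assuming such a $\\lambda$ exists on the grid).*"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sketch-q1-4-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: increase lambda until the Lasso keeps exactly 3 variables\n",
"import numpy as np\n",
"from sklearn.linear_model import Lasso\n",
"\n",
"for lam in np.logspace(-4, 1, 200):\n",
"    coefs = Lasso(alpha=lam).fit(X_train, y_train).coef_\n",
"    if np.sum(coefs != 0) == 3:\n",
"        print(f'lambda = {lam:.4f}')\n",
"        print(coefs)\n",
"        break"
]
},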
{
"cell_type": "markdown",
"id": "e715dd42-7021-466d-a9c1-0c0b4efeee78",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": [],
"user_expressions": []
},
"source": [
"## Question 2: Tree-Based Methods\n",
"\n",
"1. Fit a large regression tree using the training data. Report the number of terminal nodes as well as the most important variables for splitting the tree."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0207f3f9-c389-4e50-abeb-5316857ab2da",
"metadata": {},
"outputs": [],
"source": []
},
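{
"cell_type": "markdown",
"id": "sketch-q2-1",
"metadata": {},
"source": [
"*A sketch with scikit-learn's `DecisionTreeRegressor`; the `min_samples_leaf` choice is illustrative, and `feature_names` is again a hypothetical list of predictor names.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sketch-q2-1-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: grow a large tree, then inspect its size and split variables\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"\n",
"tree = DecisionTreeRegressor(min_samples_leaf=5, random_state=2)\n",
"tree.fit(X_train, y_train)\n",
"print(f'Terminal nodes: {tree.get_n_leaves()}')\n",
"# Importance scores show which variables drive the splits\n",
"for name, imp in sorted(zip(feature_names, tree.feature_importances_),\n",
"                        key=lambda t: -t[1]):\n",
"    print(f'{name}: {imp:.3f}')"
]
},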
{
"cell_type": "markdown",
"id": "3069027d-f53f-4348-8c0c-0885483dc8d9",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"2. Compute the training and test MSE of the tree and add it to the table from *Q1.2*."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f65211c4-6864-4749-8b94-eaeea96c9cbf",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "581f7631-9c99-4143-b87e-11b43c243dd0",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"3. Again set the seed to $2$ and use $5$-fold cross-validation to determine the optimal pruning parameter for the large tree. Provide a plot of the prediction error against the size of the tree. Report the optimal tree size and provide a plot of the pruned tree. Which variables are important for splitting the pruned tree?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9801c9a3-85ba-4b70-82b6-a9bbbfcfaec4",
"metadata": {},
"outputs": [],
"source": []
},
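{
"cell_type": "markdown",
"id": "sketch-q2-3",
"metadata": {},
"source": [
"*A sketch using cost-complexity pruning, where `ccp_alpha` plays the role of the pruning parameter; it reuses the large `tree` from Q2.1.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sketch-q2-3-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: 5-fold CV over the cost-complexity pruning path of the large tree\n",
"import numpy as np\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.model_selection import KFold, cross_val_score\n",
"\n",
"path = tree.cost_complexity_pruning_path(X_train, y_train)\n",
"cv = KFold(n_splits=5, shuffle=True, random_state=2)  # seed = 2\n",
"cv_mse = [-cross_val_score(DecisionTreeRegressor(ccp_alpha=a, random_state=2),\n",
"                           X_train, y_train,\n",
"                           scoring='neg_mean_squared_error', cv=cv).mean()\n",
"          for a in path.ccp_alphas]\n",
"\n",
"best_alpha = path.ccp_alphas[int(np.argmin(cv_mse))]\n",
"pruned = DecisionTreeRegressor(ccp_alpha=best_alpha, random_state=2)\n",
"pruned.fit(X_train, y_train)\n",
"print(f'Optimal tree size: {pruned.get_n_leaves()} terminal nodes')"
]
},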
{
"cell_type": "markdown",
"id": "18a9a179-4226-4734-8bcf-554671ce85e9",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"4. Compute the training and test MSE of the pruned tree and add it to the table from *Q1.2*."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0272ea3-971d-4881-8308-9b41c38b05bd",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "5a7e1a79-340c-4b61-9e74-e06b4f455904",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"5. Finally, use a random forest to improve the predictions. Motivate your choice of tuning parameters. Report the training and test MSE and add it to the table from *Q1.2*. Which variables are most important in the random forest?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9731a27-c811-4cf2-a53d-7d49a48e1d5b",
"metadata": {},
"outputs": [],
"source": []
},
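{
"cell_type": "markdown",
"id": "sketch-q2-5",
"metadata": {},
"source": [
"*A sketch; the tuning choices are illustrative ($m = 2 \\approx p/3$ is the usual default for regression forests with $p = 6$ predictors, and $500$ trees are typically enough for the error to stabilize).*"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "sketch-q2-5-code",
"metadata": {},
"outputs": [],
"source": [
"# Sketch: random forest with max_features = 2 (about p/3 for p = 6)\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.metrics import mean_squared_error\n",
"\n",
"rf = RandomForestRegressor(n_estimators=500, max_features=2,\n",
"                           random_state=2).fit(X_train, y_train)\n",
"print('Train MSE:', mean_squared_error(y_train, rf.predict(X_train)))\n",
"print('Test MSE: ', mean_squared_error(y_test, rf.predict(X_test)))\n",
"# feature_names is a hypothetical list of predictor names\n",
"print(dict(zip(feature_names, rf.feature_importances_.round(3))))"
]
},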
{
"cell_type": "markdown",
"id": "ccecdd74-9faf-4b7a-bd23-9d3f81dcda60",
"metadata": {
"tags": [],
"user_expressions": []
},
"source": [
"6. Suppose it is the beginning of $2020$ and you have access to both the in-sample and out-of-sample errors of the different methods. Which model do you choose to predict stock market returns in the future, and why?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "151e7ae9-1f4d-47f9-87d1-9da0b030da50",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "raw",
"id": "2419d990-f478-4bda-8dbc-3144fbdfc917",
"metadata": {},
"source": [
"\\newpage"
]
},
{
"cell_type": "markdown",
"id": "81cbfae3-7385-40a2-8d0d-d7db7ae9a9f5",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": [],
"user_expressions": []
},
"source": [
"## Appendix\n",
"The dataset contains the following variables:\n",
"\n",
" - **ret**: the quarterly return of the US stock market (a value of $0.01$ is a $1\\%$ return per quarter)\n",
" - **date**: the date in the format $yyyyq$ ($19941$ means the first quarter of $1994$)\n",
" - **DP**: the dividend-to-price ratio of the stock market (a valuation measure indicating whether prices are high or low relative to the dividends paid)\n",
" - **CS**: the credit spread, defined as the difference in yields between low-rated corporate bonds (corporations that might go bankrupt) and highly rated corporate bonds (safe investments). CS measures the additional return investors require to invest in risky firms compared to well-established firms with lower risk\n",
" - **ntis**: a measure of corporate issuing activity (IPOs, stock repurchases, ...)\n",
" - **cay**: a measure of the consumption-to-wealth ratio (how much is consumed relative to total wealth)\n",
" - **TS**: the term spread, i.e., the difference between long-term and short-term yields on government bonds\n",
" - **svar**: a measure of the stock market variance\n",
"\n",
"For a full description of the data, see *Welch and Goyal* ($2007$). Google is also very helpful if you are interested in building more intuition about the variables.\n"
]
},
{
"cell_type": "markdown",
"id": "db90f03c-18a4-4e7f-a31c-56f206baf5cc",
"metadata": {
"editable": true,
"slideshow": {
"slide_type": ""
},
"tags": [],
"user_expressions": []
},
"source": [
"## References\n",
"\n",
"Welch, I. and A. Goyal ($2007$). A Comprehensive Look at The Empirical Performance of Equity\n",
"Premium Prediction. *The Review of Financial Studies* $21$($4$), $1455$–$1508$."
]
}
],
"metadata": {
"date": " ",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
},
"title": " ",
"toc-autonumbering": false,
"toc-showcode": false,
"toc-showmarkdowntxt": false,
"toc-showtags": false
},
"nbformat": 4,
"nbformat_minor": 5
}
BIN
Machine Learning for Economics and Finance/Problem Set 2/ProblemSet2.pdf
Executable file
Binary file not shown.
2092
Machine Learning for Economics and Finance/Problem Set 2/ProblemSet2_solution.ipynb
Executable file
File diff suppressed because one or more lines are too long
Binary file not shown.