major upload of (python) course material & solutions

2025-12-03 14:39:45 +01:00
parent 52552e20cb
commit e95a0b2ecc
39 changed files with 13598 additions and 0 deletions
--- a/Validation/03_Auto_data_CV.ipynb
+++ b/Validation/03_Auto_data_CV.ipynb
--- a/Validation/03_Auto_data_CV.pdf
+++ b/Validation/03_Auto_data_CV.pdf
--- a/Validation/03_Auto_data_val_set.ipynb
+++ b/Validation/03_Auto_data_val_set.ipynb
--- a/Validation/03_Auto_data_val_set.pdf
+++ b/Validation/03_Auto_data_val_set.pdf
--- a/Validation/03_Default_data.ipynb
+++ b/Validation/03_Default_data.ipynb
@@ -0,0 +1,465 @@
+{
+ "cells": [
+  {
+   "cell_type": "raw",
+   "id": "6cbef61b-0897-42bf-b456-c0a409b87c41",
+   "metadata": {},
+   "source": [
+    "\\vspace{-4cm}\n",
+    "\\begin{center}\n",
+    "  \\LARGE{Machine Learning for Economics and Finance}\\\\[0.5cm]\n",
+    "  \\Large{\\textbf{03\\_Default\\_data}}\\\\[1.0cm]\n",
+    "  \\large{Ole Wilms}\\\\[0.5cm]\n",
+    "  \\large{July 29, 2024}\\\\\n",
+    "\\end{center}"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "id": "13be77f3-44f0-4983-b4cb-bd3e4b5dba8b",
+   "metadata": {},
+   "source": [
+    "\\setcounter{secnumdepth}{0}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "335aa198-5a94-4c5a-8ad8-67c78bcf71f5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/mnt/ds/home/UHH_MLSJ_2024/Code/Python/03-CrossValidation\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>default</th>\n",
+       "      <th>student</th>\n",
+       "      <th>balance</th>\n",
+       "      <th>income</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>729.526495</td>\n",
+       "      <td>44361.625074</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>No</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>817.180407</td>\n",
+       "      <td>12106.134700</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>1073.549164</td>\n",
+       "      <td>31767.138947</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>529.250605</td>\n",
+       "      <td>35704.493935</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
+       "      <td>785.655883</td>\n",
+       "      <td>38463.495879</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  default student      balance        income\n",
+       "0      No      No   729.526495  44361.625074\n",
+       "1      No     Yes   817.180407  12106.134700\n",
+       "2      No      No  1073.549164  31767.138947\n",
+       "3      No      No   529.250605  35704.493935\n",
+       "4      No      No   785.655883  38463.495879"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import os  # standard-library interface to the operating system\n",
+    "\n",
+    "from ISLP import load_data  # ISLP provides the data set used below\n",
+    "\n",
+    "# Print the current working directory. Changing into that same directory\n",
+    "# afterwards leaves the working directory as it was.\n",
+    "path = os.getcwd()\n",
+    "print(path)\n",
+    "os.chdir(path)\n",
+    "\n",
+    "# Load the 'Default' data set and display its first five rows.\n",
+    "default_data = load_data('Default')\n",
+    "default_data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2e38a201-7f2d-4999-beab-5739217a9318",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 10000 entries, 0 to 9999\n",
+      "Data columns (total 4 columns):\n",
+      " #   Column   Non-Null Count  Dtype  \n",
+      "---  ------   --------------  -----  \n",
+      " 0   default  10000 non-null  object \n",
+      " 1   student  10000 non-null  object \n",
+      " 2   balance  10000 non-null  float64\n",
+      " 3   income   10000 non-null  float64\n",
+      "dtypes: float64(2), object(2)\n",
+      "memory usage: 312.6+ KB\n",
+      "None\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(default_data.info())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "7dd29324-cd54-415c-ba83-56c0d9f74159",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "            balance        income\n",
+      "count  10000.000000  10000.000000\n",
+      "mean     835.374886  33516.981876\n",
+      "std      483.714985  13336.639563\n",
+      "min        0.000000    771.967729\n",
+      "25%      481.731105  21340.462903\n",
+      "50%      823.636973  34552.644802\n",
+      "75%     1166.308386  43807.729272\n",
+      "max     2654.322576  73554.233495\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Summary statistics of the numeric columns (balance and income)\n",
+    "summary_stats = default_data.describe()\n",
+    "print(summary_stats)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "3debf6d8-efda-4414-bcca-dd758dc65512",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "# Fix the state of NumPy's global random generator so that the split below\n",
+    "# is the same on every run.\n",
+    "np.random.seed(1)\n",
+    "\n",
+    "# Sample size and the number of observations assigned to training (70%)\n",
+    "n = len(default_data)\n",
+    "nT = int(0.7 * n)\n",
+    "\n",
+    "# Random ordering of the row positions 0, ..., n-1\n",
+    "shuffled_indices = np.random.permutation(n)\n",
+    "\n",
+    "# The first nT shuffled positions form the training sample; the remaining\n",
+    "# positions form the held-out sample used for validation.\n",
+    "train_positions, holdout_positions = np.split(shuffled_indices, [nT])\n",
+    "train_data = default_data.iloc[train_positions]\n",
+    "test_data = default_data.iloc[holdout_positions]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "e1b2a560-2a8e-4881-8d51-f3d96c3b05fe",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train data percentage of defaulting: 0.03157\n",
+      "Test data percentage of defaulting:  0.03733\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Comparing the 'default' column with 'Yes' gives a boolean column that is\n",
+    "# True exactly where the entry is 'Yes'. Its mean is the share of True values,\n",
+    "# because True counts as 1 and False as 0.\n",
+    "is_default_train = train_data['default'] == 'Yes'\n",
+    "is_default_test = test_data['default'] == 'Yes'\n",
+    "defaulting_train = is_default_train.mean()\n",
+    "defaulting_test = is_default_test.mean()\n",
+    "\n",
+    "# Report both shares rounded to five decimals. The values are fractions\n",
+    "# between 0 and 1; they are not multiplied by 100.\n",
+    "print(f\"Train data percentage of defaulting: {round(defaulting_train, 5)}\")\n",
+    "print(f\"Test data percentage of defaulting:  {round(defaulting_test, 5)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "f9a25057-a631-48dc-883f-643bd09d0999",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                 Generalized Linear Model Regression Results                  \n",
+      "==============================================================================\n",
+      "Dep. Variable:                default   No. Observations:                 7000\n",
+      "Model:                            GLM   Df Residuals:                     6997\n",
+      "Model Family:                Binomial   Df Model:                            2\n",
+      "Link Function:                  Logit   Scale:                          1.0000\n",
+      "Method:                          IRLS   Log-Likelihood:                -542.14\n",
+      "Date:                Sat, 19 Oct 2024   Deviance:                       1084.3\n",
+      "Time:                        16:53:00   Pearson chi2:                 5.42e+03\n",
+      "No. Iterations:                     9   Pseudo R-squ. (CS):             0.1179\n",
+      "Covariance Type:            nonrobust                                         \n",
+      "==============================================================================\n",
+      "                 coef    std err          z      P>|z|      [0.025      0.975]\n",
+      "------------------------------------------------------------------------------\n",
+      "const        -11.3514      0.515    -22.060      0.000     -12.360     -10.343\n",
+      "income      1.847e-05   5.98e-06      3.091      0.002    6.76e-06    3.02e-05\n",
+      "balance        0.0055      0.000     20.428      0.000       0.005       0.006\n",
+      "==============================================================================\n"
+     ]
+    }
+   ],
+   "source": [
+    "import statsmodels.api as sm\n",
+    "\n",
+    "# Recode the response from the labels 'No'/'Yes' to 0/1 on copies, leaving\n",
+    "# train_data and test_data themselves untouched.\n",
+    "label_codes = {'No': 0, 'Yes': 1}\n",
+    "train_data_copy = train_data.assign(default=train_data['default'].map(label_codes))\n",
+    "test_data_copy = test_data.assign(default=test_data['default'].map(label_codes))\n",
+    "\n",
+    "# Design matrices: the two predictors plus the constant column added by\n",
+    "# sm.add_constant, which serves as the intercept of the model.\n",
+    "predictors = ['income', 'balance']\n",
+    "X_train = sm.add_constant(train_data_copy[predictors])\n",
+    "X_test = sm.add_constant(test_data_copy[predictors])\n",
+    "y_train = train_data_copy['default']\n",
+    "\n",
+    "# Logistic regression: a GLM with binomial family, estimated on the training sample\n",
+    "binomial_glm = sm.GLM(y_train, X_train, family=sm.families.Binomial())\n",
+    "glm_fit = binomial_glm.fit()\n",
+    "print(glm_fit.summary())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "b5c7de71-463d-455b-a596-923cfcddcefb",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "const     -11.351394\n",
+      "income      0.000018\n",
+      "balance     0.005536\n",
+      "dtype: float64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Estimated coefficients of the fitted model\n",
+    "coefficients = glm_fit.params\n",
+    "print(coefficients)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "6b8fb99c-d172-4398-92e5-89324c1787f8",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "K-fold Cross-Validation Error Rate: 0.02571\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.metrics import accuracy_score  # used by the accuracy cells further down\n",
+    "from sklearn.model_selection import KFold\n",
+    "\n",
+    "# ---- K-Fold Cross-Validation ----\n",
+    "# The training sample is cut into `folds` shuffled parts. Each part is held\n",
+    "# out once while the model is re-estimated on the remaining parts.\n",
+    "folds = 10\n",
+    "kf = KFold(n_splits=folds, shuffle=True, random_state=12)\n",
+    "cv_errors = []\n",
+    "\n",
+    "for fit_rows, holdout_rows in kf.split(X_train):\n",
+    "    # Re-estimate the logistic regression without the held-out fold\n",
+    "    fold_model = sm.GLM(\n",
+    "        y_train.iloc[fit_rows],\n",
+    "        X_train.iloc[fit_rows],\n",
+    "        family=sm.families.Binomial(),\n",
+    "    )\n",
+    "    fold_fit = fold_model.fit()\n",
+    "\n",
+    "    # Label a held-out observation 1 when its predicted probability exceeds 0.5\n",
+    "    holdout_probs = fold_fit.predict(X_train.iloc[holdout_rows])\n",
+    "    holdout_labels = np.where(holdout_probs > 0.5, 1, 0)\n",
+    "\n",
+    "    # Misclassification rate on the held-out fold\n",
+    "    holdout_truth = y_train.iloc[holdout_rows]\n",
+    "    cv_errors.append(np.mean(holdout_labels != holdout_truth))\n",
+    "\n",
+    "# The cross-validation estimate is the average of the per-fold error rates\n",
+    "cv_error_rate = np.mean(cv_errors)\n",
+    "print(f\"K-fold Cross-Validation Error Rate: {cv_error_rate:.5f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "06091455-d874-4a10-9919-78c8c9ddfbed",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "In-sample accuracy: 0.97486\n",
+      "In-sample error rate: 0.02514\n"
+     ]
+    }
+   ],
+   "source": [
+    "# ---- In-sample predictions ----\n",
+    "# Fitted probabilities on the training sample, turned into 0/1 labels:\n",
+    "# np.where selects 1 element-wise where the probability exceeds 0.5, else 0.\n",
+    "glm_probs_train = glm_fit.predict(X_train)\n",
+    "above_cutoff_train = glm_probs_train > 0.5\n",
+    "glm_pred_train = np.where(above_cutoff_train, 1, 0)\n",
+    "\n",
+    "# Share of correctly and of incorrectly classified training observations\n",
+    "accuracy_train = accuracy_score(y_train, glm_pred_train)\n",
+    "misclassified_train = glm_pred_train != y_train\n",
+    "error_rate_train = np.mean(misclassified_train)\n",
+    "\n",
+    "print(f\"In-sample accuracy: {round(accuracy_train, 5)}\")\n",
+    "print(f\"In-sample error rate: {round(error_rate_train, 5)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "9d115c02-9520-41d5-b04b-e8cbe84b0277",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# ---- Out-of-sample predictions ----\n",
+    "# Predicted probabilities for the held-out sample from the model fitted on the\n",
+    "# training sample; np.where selects 1 element-wise where they exceed 0.5, else 0.\n",
+    "glm_probs_test = glm_fit.predict(X_test)\n",
+    "above_cutoff_test = glm_probs_test > 0.5\n",
+    "glm_pred_test = np.where(above_cutoff_test, 1, 0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "80aadaaf-e914-4e70-9ea3-411965a8d9d7",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Out-of-sample accuracy: 0.97067\n",
+      "Out-of-sample error rate: 0.02933\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Share of correctly and of incorrectly classified held-out observations\n",
+    "y_test = test_data_copy['default']\n",
+    "accuracy_test = accuracy_score(y_test, glm_pred_test)\n",
+    "misclassified_test = glm_pred_test != y_test\n",
+    "error_rate_test = np.mean(misclassified_test)\n",
+    "\n",
+    "print(f\"Out-of-sample accuracy: {round(accuracy_test, 5)}\")\n",
+    "print(f\"Out-of-sample error rate: {round(error_rate_test, 5)}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "date": " ",
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  },
+  "title": " ",
+  "toc-autonumbering": false,
+  "toc-showcode": false,
+  "toc-showmarkdowntxt": false,
+  "toc-showtags": false
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/Validation/03_Default_data.pdf
+++ b/Validation/03_Default_data.pdf