major upload of (python) course material & solutions

This commit is contained in:
2025-12-03 14:39:45 +01:00
parent 52552e20cb
commit e95a0b2ecc
39 changed files with 13598 additions and 0 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,465 @@
{
"cells": [
{
"cell_type": "raw",
"id": "6cbef61b-0897-42bf-b456-c0a409b87c41",
"metadata": {},
"source": [
"\\vspace{-4cm}\n",
"\\begin{center}\n",
" \\LARGE{Machine Learning for Economics and Finance}\\\\[0.5cm]\n",
" \\Large{\\textbf{03\\_Default\\_data}}\\\\[1.0cm]\n",
" \\large{Ole Wilms}\\\\[0.5cm]\n",
" \\large{July 29, 2024}\\\\\n",
"\\end{center}"
]
},
{
"cell_type": "raw",
"id": "13be77f3-44f0-4983-b4cb-bd3e4b5dba8b",
"metadata": {},
"source": [
"\\setcounter{secnumdepth}{0}"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "335aa198-5a94-4c5a-8ad8-67c78bcf71f5",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/mnt/ds/home/UHH_MLSJ_2024/Code/Python/03-CrossValidation\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>default</th>\n",
" <th>student</th>\n",
" <th>balance</th>\n",
" <th>income</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>729.526495</td>\n",
" <td>44361.625074</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>No</td>\n",
" <td>Yes</td>\n",
" <td>817.180407</td>\n",
" <td>12106.134700</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>1073.549164</td>\n",
" <td>31767.138947</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>529.250605</td>\n",
" <td>35704.493935</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>No</td>\n",
" <td>No</td>\n",
" <td>785.655883</td>\n",
" <td>38463.495879</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" default student balance income\n",
"0 No No 729.526495 44361.625074\n",
"1 No Yes 817.180407 12106.134700\n",
"2 No No 1073.549164 31767.138947\n",
"3 No No 529.250605 35704.493935\n",
"4 No No 785.655883 38463.495879"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import os # Package to access system related information \n",
"print(os.getcwd()) # Prints the current working directory\n",
"path = os.getcwd()\n",
"os.chdir(path) # Set the working directory\n",
"\n",
"from ISLP import load_data # Package which contains the data\n",
"default_data = load_data('Default') # Loading the data\n",
"default_data.head() # Showing the first 5 Lines of Data."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "2e38a201-7f2d-4999-beab-5739217a9318",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 10000 entries, 0 to 9999\n",
"Data columns (total 4 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 default 10000 non-null object \n",
" 1 student 10000 non-null object \n",
" 2 balance 10000 non-null float64\n",
" 3 income 10000 non-null float64\n",
"dtypes: float64(2), object(2)\n",
"memory usage: 312.6+ KB\n",
"None\n"
]
}
],
"source": [
"print(default_data.info())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7dd29324-cd54-415c-ba83-56c0d9f74159",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" balance income\n",
"count 10000.000000 10000.000000\n",
"mean 835.374886 33516.981876\n",
"std 483.714985 13336.639563\n",
"min 0.000000 771.967729\n",
"25% 481.731105 21340.462903\n",
"50% 823.636973 34552.644802\n",
"75% 1166.308386 43807.729272\n",
"max 2654.322576 73554.233495\n"
]
}
],
"source": [
"print(default_data.describe())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "3debf6d8-efda-4414-bcca-dd758dc65512",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# set seed\n",
"np.random.seed(1)\n",
"\n",
"# Number of observations in the dataset\n",
"n = len(default_data)\n",
"\n",
"# Shuffle the dataset using np.random.permutation\n",
"shuffled_indices = np.random.permutation(n)\n",
"\n",
"# Compute training and validation sample sizes\n",
"nT = int(0.7 * n) # Training sample size\n",
"\n",
"# Split the shuffled dataset based on the shuffled indices\n",
"train_data = default_data.iloc[shuffled_indices[:nT]] # First 70% for training\n",
"test_data = default_data.iloc[shuffled_indices[nT:]] # Remaining 30% for validation"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e1b2a560-2a8e-4881-8d51-f3d96c3b05fe",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train data percentage of defaulting: 0.03157\n",
"Test data percentage of defaulting: 0.03733\n"
]
}
],
"source": [
"defaulting_train = (train_data['default'] == 'Yes').mean()\n",
"defaulting_test = (test_data['default'] == 'Yes').mean()\n",
"# The \"train_data$default == \"Yes\": creates a logical vector where each element is TRUE \n",
"# if the corresponding element.\n",
"# The outer mean() function than calculates the proportion of TRUE values \n",
"# in the logical vector.\n",
"\n",
"# Output the results\n",
"print(f\"Train data percentage of defaulting: {round(defaulting_train, 5)}\")\n",
"print(f\"Test data percentage of defaulting: {round(defaulting_test, 5)}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "f9a25057-a631-48dc-883f-643bd09d0999",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Generalized Linear Model Regression Results \n",
"==============================================================================\n",
"Dep. Variable: default No. Observations: 7000\n",
"Model: GLM Df Residuals: 6997\n",
"Model Family: Binomial Df Model: 2\n",
"Link Function: Logit Scale: 1.0000\n",
"Method: IRLS Log-Likelihood: -542.14\n",
"Date: Sat, 19 Oct 2024 Deviance: 1084.3\n",
"Time: 16:53:00 Pearson chi2: 5.42e+03\n",
"No. Iterations: 9 Pseudo R-squ. (CS): 0.1179\n",
"Covariance Type: nonrobust \n",
"==============================================================================\n",
" coef std err z P>|z| [0.025 0.975]\n",
"------------------------------------------------------------------------------\n",
"const -11.3514 0.515 -22.060 0.000 -12.360 -10.343\n",
"income 1.847e-05 5.98e-06 3.091 0.002 6.76e-06 3.02e-05\n",
"balance 0.0055 0.000 20.428 0.000 0.005 0.006\n",
"==============================================================================\n"
]
}
],
"source": [
"import statsmodels.api as sm\n",
"\n",
"train_data_copy = train_data.copy()\n",
"train_data_copy['default'] = train_data_copy['default'].map({'No': 0, 'Yes': 1})\n",
"\n",
"test_data_copy = test_data.copy()\n",
"test_data_copy['default'] = test_data_copy['default'].map({'No': 0, 'Yes': 1})\n",
"\n",
"# Logistic regression model:\n",
"X_train = train_data_copy[['income','balance']]\n",
"X_train = sm.add_constant(X_train) # Adds an intercept term to the model\n",
"X_test = test_data_copy[['income','balance']]\n",
"X_test = sm.add_constant(X_test) # Adds an intercept term to the model\n",
"y_train = train_data_copy['default']\n",
"\n",
"# Fit the logistic regression model\n",
"glm_fit = sm.GLM(y_train, X_train, family=sm.families.Binomial()).fit()\n",
"print(glm_fit.summary())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "b5c7de71-463d-455b-a596-923cfcddcefb",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"const -11.351394\n",
"income 0.000018\n",
"balance 0.005536\n",
"dtype: float64\n"
]
}
],
"source": [
"print(glm_fit.params) # print coefficients"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "6b8fb99c-d172-4398-92e5-89324c1787f8",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"K-fold Cross-Validation Error Rate: 0.02571\n"
]
}
],
"source": [
"from sklearn.metrics import accuracy_score\n",
"from sklearn.model_selection import KFold\n",
"\n",
"# ---- K-Fold Cross-Validation ----\n",
"folds = 10\n",
"kf = KFold(n_splits=folds, shuffle=True, random_state=12)\n",
"cv_errors = []\n",
"\n",
"for train_index, test_index in kf.split(X_train):\n",
" X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]\n",
" y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]\n",
" \n",
" # Fit model on this fold\n",
" glm_fold = sm.GLM(y_train_fold, X_train_fold, family=sm.families.Binomial()).fit()\n",
" \n",
" # Compute the out-of-sample error for this fold\n",
" preds_fold = glm_fold.predict(X_test_fold)\n",
" pred_labels_fold = [1 if p > 0.5 else 0 for p in preds_fold]\n",
" fold_error = np.mean(pred_labels_fold != y_test_fold)\n",
" \n",
" cv_errors.append(fold_error)\n",
"\n",
"cv_error_rate = np.mean(cv_errors)\n",
"print(f\"K-fold Cross-Validation Error Rate: {cv_error_rate:.5f}\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "06091455-d874-4a10-9919-78c8c9ddfbed",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In-sample accuracy: 0.97486\n",
"In-sample error rate: 0.02514\n"
]
}
],
"source": [
"# ---- In-sample predictions ----\n",
"glm_probs_train = glm_fit.predict(X_train)\n",
"glm_pred_train = np.where(glm_probs_train > 0.5, 1, 0) # ternary operator\n",
"\n",
"# Compute in-sample accuracy and error rate\n",
"accuracy_train = accuracy_score(y_train, glm_pred_train)\n",
"error_rate_train = np.mean(glm_pred_train != y_train)\n",
"\n",
"print(f\"In-sample accuracy: {round(accuracy_train, 5)}\")\n",
"print(f\"In-sample error rate: {round(error_rate_train, 5)}\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "9d115c02-9520-41d5-b04b-e8cbe84b0277",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# ---- Out-of-sample predictions ----\n",
"glm_probs_test = glm_fit.predict(X_test)\n",
"glm_pred_test = np.where(glm_probs_test > 0.5, 1, 0) # ternary operator"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "80aadaaf-e914-4e70-9ea3-411965a8d9d7",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Out-of-sample accuracy: 0.97067\n",
"Out-of-sample error rate: 0.02933\n"
]
}
],
"source": [
"# Compute out-of-sample accuracy and error rate\n",
"accuracy_test = accuracy_score(test_data_copy['default'], glm_pred_test)\n",
"error_rate_test = np.mean(glm_pred_test != test_data_copy['default'])\n",
"\n",
"print(f\"Out-of-sample accuracy: {round(accuracy_test, 5)}\")\n",
"print(f\"Out-of-sample error rate: {round(error_rate_test, 5)}\")"
]
}
],
"metadata": {
"date": " ",
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
},
"title": " ",
"toc-autonumbering": false,
"toc-showcode": false,
"toc-showmarkdowntxt": false,
"toc-showtags": false
},
"nbformat": 4,
"nbformat_minor": 5
}