major upload of (python) course material & solutions
This commit is contained in:
File diff suppressed because one or more lines are too long
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -0,0 +1,465 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"id": "6cbef61b-0897-42bf-b456-c0a409b87c41",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\\vspace{-4cm}\n",
|
||||
"\\begin{center}\n",
|
||||
" \\LARGE{Machine Learning for Economics and Finance}\\\\[0.5cm]\n",
|
||||
" \\Large{\\textbf{03\\_Default\\_data}}\\\\[1.0cm]\n",
|
||||
" \\large{Ole Wilms}\\\\[0.5cm]\n",
|
||||
" \\large{July 29, 2024}\\\\\n",
|
||||
"\\end{center}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"id": "13be77f3-44f0-4983-b4cb-bd3e4b5dba8b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\\setcounter{secnumdepth}{0}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "335aa198-5a94-4c5a-8ad8-67c78bcf71f5",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/mnt/ds/home/UHH_MLSJ_2024/Code/Python/03-CrossValidation\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>default</th>\n",
|
||||
" <th>student</th>\n",
|
||||
" <th>balance</th>\n",
|
||||
" <th>income</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>729.526495</td>\n",
|
||||
" <td>44361.625074</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>Yes</td>\n",
|
||||
" <td>817.180407</td>\n",
|
||||
" <td>12106.134700</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>1073.549164</td>\n",
|
||||
" <td>31767.138947</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>529.250605</td>\n",
|
||||
" <td>35704.493935</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>No</td>\n",
|
||||
" <td>785.655883</td>\n",
|
||||
" <td>38463.495879</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" default student balance income\n",
|
||||
"0 No No 729.526495 44361.625074\n",
|
||||
"1 No Yes 817.180407 12106.134700\n",
|
||||
"2 No No 1073.549164 31767.138947\n",
|
||||
"3 No No 529.250605 35704.493935\n",
|
||||
"4 No No 785.655883 38463.495879"
|
||||
]
|
||||
},
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import os # Package to access system related information \n",
|
||||
"print(os.getcwd()) # Prints the current working directory\n",
|
||||
"path = os.getcwd()\n",
|
||||
"os.chdir(path) # Set the working directory\n",
|
||||
"\n",
|
||||
"from ISLP import load_data # Package which contains the data\n",
|
||||
"default_data = load_data('Default') # Loading the data\n",
|
||||
"default_data.head() # Showing the first 5 Lines of Data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "2e38a201-7f2d-4999-beab-5739217a9318",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"RangeIndex: 10000 entries, 0 to 9999\n",
|
||||
"Data columns (total 4 columns):\n",
|
||||
" # Column Non-Null Count Dtype \n",
|
||||
"--- ------ -------------- ----- \n",
|
||||
" 0 default 10000 non-null object \n",
|
||||
" 1 student 10000 non-null object \n",
|
||||
" 2 balance 10000 non-null float64\n",
|
||||
" 3 income 10000 non-null float64\n",
|
||||
"dtypes: float64(2), object(2)\n",
|
||||
"memory usage: 312.6+ KB\n",
|
||||
"None\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(default_data.info())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "7dd29324-cd54-415c-ba83-56c0d9f74159",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" balance income\n",
|
||||
"count 10000.000000 10000.000000\n",
|
||||
"mean 835.374886 33516.981876\n",
|
||||
"std 483.714985 13336.639563\n",
|
||||
"min 0.000000 771.967729\n",
|
||||
"25% 481.731105 21340.462903\n",
|
||||
"50% 823.636973 34552.644802\n",
|
||||
"75% 1166.308386 43807.729272\n",
|
||||
"max 2654.322576 73554.233495\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(default_data.describe())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "3debf6d8-efda-4414-bcca-dd758dc65512",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"# set seed\n",
|
||||
"np.random.seed(1)\n",
|
||||
"\n",
|
||||
"# Number of observations in the dataset\n",
|
||||
"n = len(default_data)\n",
|
||||
"\n",
|
||||
"# Shuffle the dataset using np.random.permutation\n",
|
||||
"shuffled_indices = np.random.permutation(n)\n",
|
||||
"\n",
|
||||
"# Compute the training sample size (the remaining observations form the validation sample)\n",
|
||||
"nT = int(0.7 * n) # Training sample size\n",
|
||||
"\n",
|
||||
"# Split the shuffled dataset based on the shuffled indices\n",
|
||||
"train_data = default_data.iloc[shuffled_indices[:nT]] # First 70% for training\n",
|
||||
"test_data = default_data.iloc[shuffled_indices[nT:]] # Remaining 30% for validation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "e1b2a560-2a8e-4881-8d51-f3d96c3b05fe",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Train data percentage of defaulting: 0.03157\n",
|
||||
"Test data percentage of defaulting: 0.03733\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"defaulting_train = (train_data['default'] == 'Yes').mean()\n",
|
||||
"defaulting_test = (test_data['default'] == 'Yes').mean()\n",
|
||||
"# The comparison train_data['default'] == 'Yes' creates a boolean Series where each element is True \n",
|
||||
"# if the corresponding observation defaulted.\n",
|
||||
"# Calling .mean() on that Series then gives the proportion of True values, \n",
|
||||
"# i.e. the share of defaulting observations.\n",
|
||||
"\n",
|
||||
"# Output the results\n",
|
||||
"print(f\"Train data percentage of defaulting: {round(defaulting_train, 5)}\")\n",
|
||||
"print(f\"Test data percentage of defaulting: {round(defaulting_test, 5)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "f9a25057-a631-48dc-883f-643bd09d0999",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Generalized Linear Model Regression Results \n",
|
||||
"==============================================================================\n",
|
||||
"Dep. Variable: default No. Observations: 7000\n",
|
||||
"Model: GLM Df Residuals: 6997\n",
|
||||
"Model Family: Binomial Df Model: 2\n",
|
||||
"Link Function: Logit Scale: 1.0000\n",
|
||||
"Method: IRLS Log-Likelihood: -542.14\n",
|
||||
"Date: Sat, 19 Oct 2024 Deviance: 1084.3\n",
|
||||
"Time: 16:53:00 Pearson chi2: 5.42e+03\n",
|
||||
"No. Iterations: 9 Pseudo R-squ. (CS): 0.1179\n",
|
||||
"Covariance Type: nonrobust \n",
|
||||
"==============================================================================\n",
|
||||
" coef std err z P>|z| [0.025 0.975]\n",
|
||||
"------------------------------------------------------------------------------\n",
|
||||
"const -11.3514 0.515 -22.060 0.000 -12.360 -10.343\n",
|
||||
"income 1.847e-05 5.98e-06 3.091 0.002 6.76e-06 3.02e-05\n",
|
||||
"balance 0.0055 0.000 20.428 0.000 0.005 0.006\n",
|
||||
"==============================================================================\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import statsmodels.api as sm\n",
|
||||
"\n",
|
||||
"train_data_copy = train_data.copy()\n",
|
||||
"train_data_copy['default'] = train_data_copy['default'].map({'No': 0, 'Yes': 1})\n",
|
||||
"\n",
|
||||
"test_data_copy = test_data.copy()\n",
|
||||
"test_data_copy['default'] = test_data_copy['default'].map({'No': 0, 'Yes': 1})\n",
|
||||
"\n",
|
||||
"# Logistic regression model:\n",
|
||||
"X_train = train_data_copy[['income','balance']]\n",
|
||||
"X_train = sm.add_constant(X_train) # Adds an intercept term to the model\n",
|
||||
"X_test = test_data_copy[['income','balance']]\n",
|
||||
"X_test = sm.add_constant(X_test) # Adds an intercept term to the model\n",
|
||||
"y_train = train_data_copy['default']\n",
|
||||
"\n",
|
||||
"# Fit the logistic regression model\n",
|
||||
"glm_fit = sm.GLM(y_train, X_train, family=sm.families.Binomial()).fit()\n",
|
||||
"print(glm_fit.summary())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "b5c7de71-463d-455b-a596-923cfcddcefb",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"const -11.351394\n",
|
||||
"income 0.000018\n",
|
||||
"balance 0.005536\n",
|
||||
"dtype: float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(glm_fit.params) # print coefficients"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "6b8fb99c-d172-4398-92e5-89324c1787f8",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"K-fold Cross-Validation Error Rate: 0.02571\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.metrics import accuracy_score\n",
|
||||
"from sklearn.model_selection import KFold\n",
|
||||
"\n",
|
||||
"# ---- K-Fold Cross-Validation ----\n",
|
||||
"folds = 10\n",
|
||||
"kf = KFold(n_splits=folds, shuffle=True, random_state=12)\n",
|
||||
"cv_errors = []\n",
|
||||
"\n",
|
||||
"for train_index, test_index in kf.split(X_train):\n",
|
||||
" X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]\n",
|
||||
" y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]\n",
|
||||
" \n",
|
||||
" # Fit model on this fold\n",
|
||||
" glm_fold = sm.GLM(y_train_fold, X_train_fold, family=sm.families.Binomial()).fit()\n",
|
||||
" \n",
|
||||
" # Compute the out-of-sample error for this fold\n",
|
||||
" preds_fold = glm_fold.predict(X_test_fold)\n",
|
||||
" pred_labels_fold = [1 if p > 0.5 else 0 for p in preds_fold]\n",
|
||||
" fold_error = np.mean(pred_labels_fold != y_test_fold)\n",
|
||||
" \n",
|
||||
" cv_errors.append(fold_error)\n",
|
||||
"\n",
|
||||
"cv_error_rate = np.mean(cv_errors)\n",
|
||||
"print(f\"K-fold Cross-Validation Error Rate: {cv_error_rate:.5f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "06091455-d874-4a10-9919-78c8c9ddfbed",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"In-sample accuracy: 0.97486\n",
|
||||
"In-sample error rate: 0.02514\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# ---- In-sample predictions ----\n",
|
||||
"glm_probs_train = glm_fit.predict(X_train)\n",
|
||||
"glm_pred_train = np.where(glm_probs_train > 0.5, 1, 0) # vectorized if/else: 1 where the predicted probability exceeds 0.5, else 0\n",
|
||||
"\n",
|
||||
"# Compute in-sample accuracy and error rate\n",
|
||||
"accuracy_train = accuracy_score(y_train, glm_pred_train)\n",
|
||||
"error_rate_train = np.mean(glm_pred_train != y_train)\n",
|
||||
"\n",
|
||||
"print(f\"In-sample accuracy: {round(accuracy_train, 5)}\")\n",
|
||||
"print(f\"In-sample error rate: {round(error_rate_train, 5)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "9d115c02-9520-41d5-b04b-e8cbe84b0277",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ---- Out-of-sample predictions ----\n",
|
||||
"glm_probs_test = glm_fit.predict(X_test)\n",
|
||||
"glm_pred_test = np.where(glm_probs_test > 0.5, 1, 0) # vectorized if/else: 1 where the predicted probability exceeds 0.5, else 0"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "80aadaaf-e914-4e70-9ea3-411965a8d9d7",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Out-of-sample accuracy: 0.97067\n",
|
||||
"Out-of-sample error rate: 0.02933\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Compute out-of-sample accuracy and error rate\n",
|
||||
"accuracy_test = accuracy_score(test_data_copy['default'], glm_pred_test)\n",
|
||||
"error_rate_test = np.mean(glm_pred_test != test_data_copy['default'])\n",
|
||||
"\n",
|
||||
"print(f\"Out-of-sample accuracy: {round(accuracy_test, 5)}\")\n",
|
||||
"print(f\"Out-of-sample error rate: {round(error_rate_test, 5)}\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"date": " ",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.7"
|
||||
},
|
||||
"title": " ",
|
||||
"toc-autonumbering": false,
|
||||
"toc-showcode": false,
|
||||
"toc-showmarkdowntxt": false,
|
||||
"toc-showtags": false
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Binary file not shown.
Reference in New Issue
Block a user