498 lines
13 KiB
Plaintext
498 lines
13 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "d7e90f45",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# pandas provides the DataFrame/Series data structures\n",
|
|
"import pandas as pd\n",
|
|
"# statsmodels contains modules for regression and time series analysis\n",
|
|
"import statsmodels.api as sm\n",
|
|
"# numpy is for numerical computing on arrays and matrices\n",
|
|
"import numpy as np\n",
|
|
"# Matplotlib, Seaborn: plotting packages\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns \n",
|
|
"# show matplotlib plots inline, directly below the cell that produces them\n",
|
|
"%matplotlib inline\n",
|
|
"import warnings\n",
|
|
"warnings.filterwarnings('ignore')\n",
|
|
"# basic statistics package\n",
|
|
"import scipy.stats as stats\n",
|
|
"from statsmodels.stats.outliers_influence import variance_inflation_factor\n",
|
|
"import datetime"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "5159ee37",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# functions from last lab\n",
|
|
"def four_in_one(dataframe,model):\n",
|
|
" fitted_y = model.fittedvalues\n",
|
|
" studentized_residuals = model.get_influence().resid_studentized_internal\n",
|
|
" plt.figure(figsize=(10,10))\n",
|
|
" ax1 = plt.subplot(221)\n",
|
|
" stats.probplot(studentized_residuals, dist=\"norm\", plot=plt)\n",
|
|
" ax1.set_title('Normal Q-Q')\n",
|
|
" ax1.set_xlabel('Normal Quantiles')\n",
|
|
" ax1.set_ylabel('Studentized Residuals');\n",
|
|
"\n",
|
|
" ax2 = plt.subplot(222)\n",
|
|
" ax2.hist(studentized_residuals)\n",
|
|
" ax2.set_xlabel('Studentized Residuals')\n",
|
|
" ax2.set_ylabel('Count')\n",
|
|
" ax2.set_title('Histogram')\n",
|
|
"\n",
|
|
" ax3 = plt.subplot(223)\n",
|
|
" t = range(dataframe.shape[0])\n",
|
|
" ax3.scatter(t, studentized_residuals)\n",
|
|
" ax3.set_xlabel('Observation order')\n",
|
|
" ax3.set_ylabel('Residuals')\n",
|
|
" ax3.set_title('Time series plot of studentized residuals')\n",
|
|
"\n",
|
|
" ax4 = plt.subplot(224)\n",
|
|
" temp = pd.DataFrame({'fitted_y':fitted_y,'studentized_residuals':studentized_residuals})\n",
|
|
" ax4 = sns.residplot(data=temp,x=fitted_y, y=studentized_residuals,\n",
|
|
" lowess=True,\n",
|
|
" scatter_kws={'alpha': 0.5},\n",
|
|
" line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})\n",
|
|
" ax4.set_title('Internally Studentized Residuals vs Fitted values')\n",
|
|
" ax4.set_xlabel('Fitted values')\n",
|
|
" ax4.set_ylabel('Studentized Residuals');\n",
|
|
" \n",
|
|
"def getvif(X):\n",
|
|
" X = sm.add_constant(X)\n",
|
|
" vif = pd.DataFrame()\n",
|
|
" vif[\"VIF\"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]\n",
|
|
" vif[\"Predictors\"] = X.columns\n",
|
|
" return(vif.drop(index = 0).round(2)) "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "16326102",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>rBH</th>\n",
|
|
" <th>rSP</th>\n",
|
|
" <th>SmB</th>\n",
|
|
" <th>HmL</th>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>Date</th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" <th></th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>1/2/2009</th>\n",
|
|
" <td>-0.121807</td>\n",
|
|
" <td>-0.109931</td>\n",
|
|
" <td>0.0005</td>\n",
|
|
" <td>-0.0695</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1/3/2009</th>\n",
|
|
" <td>0.103053</td>\n",
|
|
" <td>0.085404</td>\n",
|
|
" <td>0.0004</td>\n",
|
|
" <td>0.0348</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1/4/2009</th>\n",
|
|
" <td>0.084198</td>\n",
|
|
" <td>0.093925</td>\n",
|
|
" <td>0.0539</td>\n",
|
|
" <td>0.0536</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1/5/2009</th>\n",
|
|
" <td>-0.025532</td>\n",
|
|
" <td>0.053081</td>\n",
|
|
" <td>-0.0252</td>\n",
|
|
" <td>0.0027</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1/6/2009</th>\n",
|
|
" <td>-0.017467</td>\n",
|
|
" <td>0.000196</td>\n",
|
|
" <td>0.0263</td>\n",
|
|
" <td>-0.0273</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" rBH rSP SmB HmL\n",
|
|
"Date \n",
|
|
"1/2/2009 -0.121807 -0.109931 0.0005 -0.0695\n",
|
|
"1/3/2009 0.103053 0.085404 0.0004 0.0348\n",
|
|
"1/4/2009 0.084198 0.093925 0.0539 0.0536\n",
|
|
"1/5/2009 -0.025532 0.053081 -0.0252 0.0027\n",
|
|
"1/6/2009 -0.017467 0.000196 0.0263 -0.0273"
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data = pd.read_csv(\"BH2009-2022.csv\",index_col=0)\n",
|
|
"data.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "7fd1d118",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(167, 4)"
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"data.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "09cc26c8",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Part I: CAPM model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "ebaa4598-7164-4b6c-ac8d-7d674fa4ee4f",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Task 1: Split the data into a training set (first 155 observations) and a test set (remaining 12 observations)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "095d1d13",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"train = $$code here$$\n",
|
|
"test = $$code here$$"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c0a06748",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"train.shape, test.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "abff4aee-07cb-4ddd-8cd2-7f909b217b41",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"Y = train[\"rBH\"]\n",
|
|
"X = train[\"rSP\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f84493d2-6484-4e62-a196-888c72c657f4",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Task 2: Using the training set, fit a simple linear regression model (SLR). Report the adjusted R-squared of the model."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "76e5007b-ec3f-4271-9c25-afd7cb2bd028",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"SLR = $$code here$$\n",
|
|
"print(SLR.summary())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b9fd7ede-dabc-47d9-918f-021ee1fae9b4",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Report the adjusted R-squared of the model.\n",
|
|
" "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "3af44ff8",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Part II: Multiple Regression Model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "3f8158d8-1c42-4226-99c3-0d04a026df5b",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Task 3: Using the training set, fit a multiple regression model (MLR) with SmB and HmL as explanatory variables in addition to rSP. Report the adjusted R-squared of the model."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1f2dfad6",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"X = $$code here$$\n",
|
|
"MLR = $$code here$$\n",
|
|
"print(MLR.summary())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "18e59124-7f05-414a-9284-fde621aa94cc",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Report the adjusted R-squared of the model.\n",
|
|
" "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "fcb66422-678b-4d10-9aea-e56ae1c0adfa",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Task 4: Check for multicollinearity among rSP, SmB and HmL using \n",
|
|
" i) Scatter plot matrix \n",
|
|
" ii) VIF. \n",
|
|
"#### Does a multicollinearity problem exist?"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "284873a2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"$$code here$$ #<--code for scatter plot matrix"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "cad9bd49-5030-4a95-a5b5-ae8677092fbe",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"$$code here$$ #<--code for VIF"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "aea04a9e",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Does a multicollinearity problem exist?"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "88eb43de",
|
|
"metadata": {},
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "ece7bda1-23e4-47e5-84ba-d0ab6eff9f06",
|
|
"metadata": {
|
|
"tags": []
|
|
},
|
|
"source": [
|
|
"### Task 5: From the fitted multiple regression model in Task 3\n",
|
|
" i) Is the model as a whole useful at the 5% significance level? \n",
|
|
" ii) Which of the explanatory variables is not useful at the 5% significance level?"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "413b5d9a-44c6-4f4e-91c4-3f82317b2b00",
|
|
"metadata": {},
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "fe0773af-c6ee-4dce-97c6-fe33af6310b8",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Task 6: Run model diagnostics on the model fitted in Task 3 using the “four_in_one” function. Comment on the normality and constant-variance assumptions.\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "0b372cd7",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"$$code here$$ #<--code for “four_in_one” function"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "919fcacf",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Comment on the normality and constant-variance assumptions."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "52ff227e",
|
|
"metadata": {},
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "5b40b212",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Part III: Model Performance"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "8cc559dd-6a77-4289-a451-ede8d00bbf90",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Task 7: Compare the predictive power of SLR and MLR using the test set. Which one performs better in prediction?"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "6e74fa4c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"Test_X_SLR = test['rSP']\n",
|
|
"Test_X_MLR = $$code here$$\n",
|
|
"\n",
|
|
"Test_Y_SLR = SLR.predict(sm.add_constant(Test_X_SLR))\n",
|
|
"Test_Y_MLR = $$code here$$"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d5af214f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"Test_Y = test[\"rBH\"]\n",
|
|
"\n",
|
|
"from sklearn.metrics import mean_squared_error\n",
|
|
"rmse_SLR = np.sqrt(mean_squared_error(Test_Y, Test_Y_SLR))\n",
|
|
"rmse_MLR = $$code here$$\n",
|
|
"print(\"RMSE for test set (SLR): \", rmse_SLR)\n",
|
|
"print(\"RMSE for test set (MLR): \", rmse_MLR)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "bd0c8483",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Which one performs better in prediction?"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "36d05636",
|
|
"metadata": {},
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f203f98c-2e57-4262-8fd8-5209825817af",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.9"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|