Files
004_comission/bettyphan789/Assignment2_student.ipynb
louiscklaw 843c590c8b update,
2025-01-31 19:28:53 +08:00

498 lines
13 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d7e90f45",
"metadata": {},
"outputs": [],
"source": [
"#### Pandas is for using data structures\n",
"import pandas as pd\n",
"# statsmodels contain modules for regression and time series analysis\n",
"import statsmodels.api as sm\n",
"# numpy is for numerical computing of array and mayatrix\n",
"import numpy as np\n",
"# Matplotlib, Seaborn: plotting package\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns \n",
"# matplotlib Showing the plot right after the current code \n",
"%matplotlib inline\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"# basic statistics package\n",
"import scipy.stats as stats\n",
"from statsmodels.stats.outliers_influence import variance_inflation_factor\n",
"import datetime"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "5159ee37",
"metadata": {},
"outputs": [],
"source": [
"# functions from last lab\n",
"def four_in_one(dataframe,model):\n",
" fitted_y = model.fittedvalues\n",
" studentized_residuals = model.get_influence().resid_studentized_internal\n",
" plt.figure(figsize=(10,10))\n",
" ax1 = plt.subplot(221)\n",
" stats.probplot(studentized_residuals, dist=\"norm\", plot=plt)\n",
" ax1.set_title('Normal Q-Q')\n",
" ax1.set_xlabel('Normal Quantiles')\n",
" ax1.set_ylabel('Studentized Residuals');\n",
"\n",
" ax2 = plt.subplot(222)\n",
" ax2.hist(studentized_residuals)\n",
" ax2.set_xlabel('Studentized Residuals')\n",
" ax2.set_ylabel('Count')\n",
" ax2.set_title('Histogram')\n",
"\n",
" ax3 = plt.subplot(223)\n",
" t = range(dataframe.shape[0])\n",
" ax3.scatter(t, studentized_residuals)\n",
" ax3.set_xlabel('Observation order')\n",
" ax3.set_ylabel('Residuals')\n",
" ax3.set_title('Time series plot of studentized residuals')\n",
"\n",
" ax4 = plt.subplot(224)\n",
" temp = pd.DataFrame({'fitted_y':fitted_y,'studentized_residuals':studentized_residuals})\n",
" ax4 = sns.residplot(data=temp,x=fitted_y, y=studentized_residuals,\n",
" lowess=True,\n",
" scatter_kws={'alpha': 0.5},\n",
" line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})\n",
" ax4.set_title('Internally Studentized Residuals vs Fitted values')\n",
" ax4.set_xlabel('Fitted values')\n",
" ax4.set_ylabel('Studentized Residuals');\n",
" \n",
"def getvif(X):\n",
" X = sm.add_constant(X)\n",
" vif = pd.DataFrame()\n",
" vif[\"VIF\"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]\n",
" vif[\"Predictors\"] = X.columns\n",
" return(vif.drop(index = 0).round(2)) "
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "16326102",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>rBH</th>\n",
" <th>rSP</th>\n",
" <th>SmB</th>\n",
" <th>HmL</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Date</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1/2/2009</th>\n",
" <td>-0.121807</td>\n",
" <td>-0.109931</td>\n",
" <td>0.0005</td>\n",
" <td>-0.0695</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1/3/2009</th>\n",
" <td>0.103053</td>\n",
" <td>0.085404</td>\n",
" <td>0.0004</td>\n",
" <td>0.0348</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1/4/2009</th>\n",
" <td>0.084198</td>\n",
" <td>0.093925</td>\n",
" <td>0.0539</td>\n",
" <td>0.0536</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1/5/2009</th>\n",
" <td>-0.025532</td>\n",
" <td>0.053081</td>\n",
" <td>-0.0252</td>\n",
" <td>0.0027</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1/6/2009</th>\n",
" <td>-0.017467</td>\n",
" <td>0.000196</td>\n",
" <td>0.0263</td>\n",
" <td>-0.0273</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" rBH rSP SmB HmL\n",
"Date \n",
"1/2/2009 -0.121807 -0.109931 0.0005 -0.0695\n",
"1/3/2009 0.103053 0.085404 0.0004 0.0348\n",
"1/4/2009 0.084198 0.093925 0.0539 0.0536\n",
"1/5/2009 -0.025532 0.053081 -0.0252 0.0027\n",
"1/6/2009 -0.017467 0.000196 0.0263 -0.0273"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.read_csv(\"BH2009-2022.csv\",index_col=0)\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "7fd1d118",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(167, 4)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.shape"
]
},
{
"cell_type": "markdown",
"id": "09cc26c8",
"metadata": {},
"source": [
"# Part I: CAPM model"
]
},
{
"cell_type": "markdown",
"id": "ebaa4598-7164-4b6c-ac8d-7d674fa4ee4f",
"metadata": {},
"source": [
"### Task 1: Split the data into train (first 155 observations) and test (remaining 12 observations) set"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "095d1d13",
"metadata": {},
"outputs": [],
"source": [
"train = $$code here$$\n",
"test = $$code here$$"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c0a06748",
"metadata": {},
"outputs": [],
"source": [
"train.shape, test.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "abff4aee-07cb-4ddd-8cd2-7f909b217b41",
"metadata": {},
"outputs": [],
"source": [
"Y = train[\"rBH\"]\n",
"X = train[\"rSP\"]"
]
},
{
"cell_type": "markdown",
"id": "f84493d2-6484-4e62-a196-888c72c657f4",
"metadata": {},
"source": [
"### Task 2: Using training set, fit a simple regression model(SLR). Report the adjusted R-square of the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "76e5007b-ec3f-4271-9c25-afd7cb2bd028",
"metadata": {},
"outputs": [],
"source": [
"SLR = $$code here$$\n",
"print(SLR.summary())"
]
},
{
"cell_type": "markdown",
"id": "b9fd7ede-dabc-47d9-918f-021ee1fae9b4",
"metadata": {},
"source": [
"### Report the adjusted R-square of the model.\n",
" "
]
},
{
"cell_type": "markdown",
"id": "3af44ff8",
"metadata": {},
"source": [
"# Part II: Multiple Regression Model"
]
},
{
"cell_type": "markdown",
"id": "3f8158d8-1c42-4226-99c3-0d04a026df5b",
"metadata": {},
"source": [
"### Task 3: Using training set, fit a multiple regression model with SmB and HmL explanatory variables in addition to rSP (MLR). Report the adjusted R-square of the model."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1f2dfad6",
"metadata": {},
"outputs": [],
"source": [
"X = $$code here$$\n",
"MLR = $$code here$$\n",
"print(MLR.summary())"
]
},
{
"cell_type": "markdown",
"id": "18e59124-7f05-414a-9284-fde621aa94cc",
"metadata": {},
"source": [
"### Report the adjusted R-square of the model.\n",
" "
]
},
{
"cell_type": "markdown",
"id": "fcb66422-678b-4d10-9aea-e56ae1c0adfa",
"metadata": {},
"source": [
"### Task 4: Checking the multicollinearity problem among rSP, SmB and HmL by \n",
" i) Scatter plot matrix \n",
" ii) VIF. \n",
"#### Is the multicollinearity problem exist?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "284873a2",
"metadata": {},
"outputs": [],
"source": [
"$$code here$$ #<--code for scatter plot matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cad9bd49-5030-4a95-a5b5-ae8677092fbe",
"metadata": {},
"outputs": [],
"source": [
"$$code here$$ #<--code for VIF"
]
},
{
"cell_type": "markdown",
"id": "aea04a9e",
"metadata": {},
"source": [
"### Is the multicollinearity problem exist?"
]
},
{
"cell_type": "markdown",
"id": "88eb43de",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "ece7bda1-23e4-47e5-84ba-d0ab6eff9f06",
"metadata": {
"tags": []
},
"source": [
"### Task 5: From the fitted multiple regression model in Task 3\n",
" i) Is the model as a whole useful at 5% significant level? \n",
" ii) Which of them is not an useful explanatory variable at 5% significant level?"
]
},
{
"cell_type": "markdown",
"id": "413b5d9a-44c6-4f4e-91c4-3f82317b2b00",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "fe0773af-c6ee-4dce-97c6-fe33af6310b8",
"metadata": {},
"source": [
"### Task 6: Execute model diagnostic on the model fitted from Task3 using the “four_in_one” function. Comment on the normality, constant variance assumption.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b372cd7",
"metadata": {},
"outputs": [],
"source": [
"$$code here$$ #<--code for “four_in_one” function"
]
},
{
"cell_type": "markdown",
"id": "919fcacf",
"metadata": {},
"source": [
"### Comment on the normality, constant variance assumption."
]
},
{
"cell_type": "markdown",
"id": "52ff227e",
"metadata": {},
"source": []
},
{
"cell_type": "markdown",
"id": "5b40b212",
"metadata": {},
"source": [
"# Part IV: Model Performance"
]
},
{
"cell_type": "markdown",
"id": "8cc559dd-6a77-4289-a451-ede8d00bbf90",
"metadata": {},
"source": [
"### Task 7: Compare the predictive power between SLR and MLR using the test set. Which one perform better in prediction?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e74fa4c",
"metadata": {},
"outputs": [],
"source": [
"Test_X_SLR = test['rSP']\n",
"Test_X_MLR = $$code here$$\n",
"\n",
"Test_Y_SLR = SLR.predict(sm.add_constant(Test_X_SLR))\n",
"Test_Y_MLR = $$code here$$"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d5af214f",
"metadata": {},
"outputs": [],
"source": [
"Test_Y = test[\"rBH\"]\n",
"\n",
"from sklearn.metrics import mean_squared_error\n",
"rmse_SLR = np.sqrt(mean_squared_error(Test_Y, Test_Y_SLR))\n",
"rmse_MLR = $$code here$$\n",
"print(\"RMSE for test set (SLR): \", rmse_SLR)\n",
"print(\"RMSE for test set (MLR): \", rmse_MLR)"
]
},
{
"cell_type": "markdown",
"id": "bd0c8483",
"metadata": {},
"source": [
"### Which one perform better in prediction?"
]
},
{
"cell_type": "markdown",
"id": "36d05636",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "f203f98c-2e57-4262-8fd8-5209825817af",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}