{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "d7e90f45", "metadata": {}, "outputs": [], "source": [
 "#### Pandas is for using data structures\n",
 "import pandas as pd\n",
 "# statsmodels contains modules for regression and time series analysis\n",
 "import statsmodels.api as sm\n",
 "# numpy is for numerical computing on arrays and matrices\n",
 "import numpy as np\n",
 "# Matplotlib, Seaborn: plotting packages\n",
 "import matplotlib.pyplot as plt\n",
 "import seaborn as sns \n",
 "# show each plot right below the cell that produces it\n",
 "%matplotlib inline\n",
 "import warnings\n",
 "warnings.filterwarnings('ignore')\n",
 "# basic statistics package\n",
 "import scipy.stats as stats\n",
 "from statsmodels.stats.outliers_influence import variance_inflation_factor\n",
 "import datetime" ] },
 { "cell_type": "code", "execution_count": 2, "id": "5159ee37", "metadata": {}, "outputs": [], "source": [
 "# functions from last lab\n",
 "def four_in_one(dataframe, model):\n",
 "    \"\"\"Draw a 2x2 panel of residual diagnostics for a fitted regression model.\n",
 "\n",
 "    The panels are a normal Q-Q plot, a histogram, an observation-order plot\n",
 "    and a residuals-vs-fitted plot, all built from the internally studentized\n",
 "    residuals of `model`.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    dataframe : pandas.DataFrame\n",
 "        Data the model was fitted on. Kept so existing calls keep working; the\n",
 "        observation order is taken from the residuals themselves, so a frame\n",
 "        whose row count differs from the model cannot break the scatter plot.\n",
 "    model : fitted statsmodels results object\n",
 "        Must provide `fittedvalues` and `get_influence()`.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    None\n",
 "        Everything is drawn on a new 10x10 inch matplotlib figure.\n",
 "    \"\"\"\n",
 "    fitted_y = model.fittedvalues\n",
 "    studentized_residuals = model.get_influence().resid_studentized_internal\n",
 "\n",
 "    # Explicit axes instead of the pyplot state machine: every call below\n",
 "    # names the panel it draws on.\n",
 "    _, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(10, 10))\n",
 "\n",
 "    # probplot sets its own title and labels, so ours are applied afterwards.\n",
 "    stats.probplot(studentized_residuals, dist=\"norm\", plot=ax1)\n",
 "    ax1.set_title('Normal Q-Q')\n",
 "    ax1.set_xlabel('Normal Quantiles')\n",
 "    ax1.set_ylabel('Studentized Residuals')\n",
 "\n",
 "    ax2.hist(studentized_residuals)\n",
 "    ax2.set_xlabel('Studentized Residuals')\n",
 "    ax2.set_ylabel('Count')\n",
 "    ax2.set_title('Histogram')\n",
 "\n",
 "    # Size the x axis from the residuals, not from `dataframe`, so both\n",
 "    # vectors passed to scatter always have the same length.\n",
 "    t = np.arange(len(studentized_residuals))\n",
 "    ax3.scatter(t, studentized_residuals)\n",
 "    ax3.set_xlabel('Observation order')\n",
 "    ax3.set_ylabel('Studentized Residuals')\n",
 "    ax3.set_title('Time series plot of studentized residuals')\n",
 "\n",
 "    # Refer to the columns by name so the frame handed to seaborn is the\n",
 "    # data that actually gets plotted.\n",
 "    diagnostics = pd.DataFrame({'fitted_y': np.asarray(fitted_y),\n",
 "                                'studentized_residuals': studentized_residuals})\n",
 "    sns.residplot(data=diagnostics, x='fitted_y', y='studentized_residuals',\n",
 "                  lowess=True,\n",
 "                  scatter_kws={'alpha': 0.5},\n",
 "                  line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8},\n",
 "                  ax=ax4)\n",
 "    ax4.set_title('Internally Studentized Residuals vs Fitted values')\n",
 "    ax4.set_xlabel('Fitted values')\n",
 "    ax4.set_ylabel('Studentized Residuals')\n",
 "\n",
 "\n",
 "def getvif(X):\n",
 "    \"\"\"Return the variance inflation factor (VIF) of every predictor in X.\n",
 "\n",
 "    Parameters\n",
 "    ----------\n",
 "    X : pandas.DataFrame, pandas.Series or 2-D array\n",
 "        Explanatory variables. An intercept column is added unless X already\n",
 "        contains a constant column.\n",
 "\n",
 "    Returns\n",
 "    -------\n",
 "    pandas.DataFrame\n",
 "        Columns `VIF` (rounded to 2 decimals) and `Predictors`, one row per\n",
 "        non-constant column of the design matrix.\n",
 "    \"\"\"\n",
 "    # add_constant leaves X unchanged when it already has a constant column.\n",
 "    design = sm.add_constant(pd.DataFrame(X))\n",
 "    values = design.to_numpy(dtype=float)\n",
 "    vif = pd.DataFrame({\n",
 "        \"VIF\": [variance_inflation_factor(values, i) for i in range(values.shape[1])],\n",
 "        \"Predictors\": design.columns,\n",
 "    })\n",
 "    # Remove the intercept by content, not by position: dropping row 0 would\n",
 "    # discard a real predictor whenever no intercept was prepended.\n",
 "    is_constant = (np.ptp(values, axis=0) == 0) & (values != 0).all(axis=0)\n",
 "    return vif[~is_constant].round(2)" ] },
 { "cell_type": "code", "execution_count": 3, "id": "16326102", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rBHrSPSmBHmL
Date
1/2/2009-0.121807-0.1099310.0005-0.0695
1/3/20090.1030530.0854040.00040.0348
1/4/20090.0841980.0939250.05390.0536
1/5/2009-0.0255320.053081-0.02520.0027
1/6/2009-0.0174670.0001960.0263-0.0273
\n", "
" ], "text/plain": [ " rBH rSP SmB HmL\n", "Date \n", "1/2/2009 -0.121807 -0.109931 0.0005 -0.0695\n", "1/3/2009 0.103053 0.085404 0.0004 0.0348\n", "1/4/2009 0.084198 0.093925 0.0539 0.0536\n", "1/5/2009 -0.025532 0.053081 -0.0252 0.0027\n", "1/6/2009 -0.017467 0.000196 0.0263 -0.0273" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.read_csv(\"BH2009-2022.csv\",index_col=0)\n", "data.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "7fd1d118", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(167, 4)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.shape" ] }, { "cell_type": "markdown", "id": "09cc26c8", "metadata": {}, "source": [ "# Part I: CAPM model" ] }, { "cell_type": "markdown", "id": "ebaa4598-7164-4b6c-ac8d-7d674fa4ee4f", "metadata": {}, "source": [ "### Task 1: Split the data into train (first 155 observations) and test (remaining 12 observations) set" ] }, { "cell_type": "code", "execution_count": null, "id": "095d1d13", "metadata": {}, "outputs": [], "source": [ "train = $$code here$$\n", "test = $$code here$$" ] }, { "cell_type": "code", "execution_count": null, "id": "c0a06748", "metadata": {}, "outputs": [], "source": [ "train.shape, test.shape" ] }, { "cell_type": "code", "execution_count": null, "id": "abff4aee-07cb-4ddd-8cd2-7f909b217b41", "metadata": {}, "outputs": [], "source": [ "Y = train[\"rBH\"]\n", "X = train[\"rSP\"]" ] }, { "cell_type": "markdown", "id": "f84493d2-6484-4e62-a196-888c72c657f4", "metadata": {}, "source": [ "### Task 2: Using training set, fit a simple regression model(SLR). Report the adjusted R-square of the model." 
] }, { "cell_type": "code", "execution_count": null, "id": "76e5007b-ec3f-4271-9c25-afd7cb2bd028", "metadata": {}, "outputs": [], "source": [ "SLR = $$code here$$\n", "print(SLR.summary())" ] }, { "cell_type": "markdown", "id": "b9fd7ede-dabc-47d9-918f-021ee1fae9b4", "metadata": {}, "source": [ "### Report the adjusted R-squared of the model.\n", " " ] }, { "cell_type": "markdown", "id": "3af44ff8", "metadata": {}, "source": [ "# Part II: Multiple Regression Model" ] }, { "cell_type": "markdown", "id": "3f8158d8-1c42-4226-99c3-0d04a026df5b", "metadata": {}, "source": [ "### Task 3: Using the training set, fit a multiple regression model (MLR) with SmB and HmL as explanatory variables in addition to rSP. Report the adjusted R-squared of the model." ] }, { "cell_type": "code", "execution_count": null, "id": "1f2dfad6", "metadata": {}, "outputs": [], "source": [ "X = $$code here$$\n", "MLR = $$code here$$\n", "print(MLR.summary())" ] }, { "cell_type": "markdown", "id": "18e59124-7f05-414a-9284-fde621aa94cc", "metadata": {}, "source": [ "### Report the adjusted R-squared of the model.\n", " " ] }, { "cell_type": "markdown", "id": "fcb66422-678b-4d10-9aea-e56ae1c0adfa", "metadata": {}, "source": [ "### Task 4: Check for multicollinearity among rSP, SmB and HmL using \n", " i) Scatter plot matrix \n", " ii) VIF. \n", "#### Does a multicollinearity problem exist?" ] }, { "cell_type": "code", "execution_count": null, "id": "284873a2", "metadata": {}, "outputs": [], "source": [ "$$code here$$ #<--code for scatter plot matrix" ] }, { "cell_type": "code", "execution_count": null, "id": "cad9bd49-5030-4a95-a5b5-ae8677092fbe", "metadata": {}, "outputs": [], "source": [ "$$code here$$ #<--code for VIF" ] }, { "cell_type": "markdown", "id": "aea04a9e", "metadata": {}, "source": [ "### Does a multicollinearity problem exist?" 
] }, { "cell_type": "markdown", "id": "88eb43de", "metadata": {}, "source": [] }, { "cell_type": "markdown", "id": "ece7bda1-23e4-47e5-84ba-d0ab6eff9f06", "metadata": { "tags": [] }, "source": [ "### Task 5: From the fitted multiple regression model in Task 3\n", " i) Is the model as a whole useful at the 5% significance level? \n", " ii) Which explanatory variable is not useful at the 5% significance level?" ] }, { "cell_type": "markdown", "id": "413b5d9a-44c6-4f4e-91c4-3f82317b2b00", "metadata": {}, "source": [] }, { "cell_type": "markdown", "id": "fe0773af-c6ee-4dce-97c6-fe33af6310b8", "metadata": {}, "source": [ "### Task 6: Run model diagnostics on the model fitted in Task 3 using the `four_in_one` function. Comment on the normality and constant-variance assumptions.\n" ] }, { "cell_type": "code", "execution_count": null, "id": "0b372cd7", "metadata": {}, "outputs": [], "source": [ "$$code here$$ #<--code for “four_in_one” function" ] }, { "cell_type": "markdown", "id": "919fcacf", "metadata": {}, "source": [ "### Comment on the normality and constant-variance assumptions." ] }, { "cell_type": "markdown", "id": "52ff227e", "metadata": {}, "source": [] }, { "cell_type": "markdown", "id": "5b40b212", "metadata": {}, "source": [ "# Part IV: Model Performance" ] }, { "cell_type": "markdown", "id": "8cc559dd-6a77-4289-a451-ede8d00bbf90", "metadata": {}, "source": [ "### Task 7: Compare the predictive power of SLR and MLR using the test set. Which one performs better in prediction?" 
] }, { "cell_type": "code", "execution_count": null, "id": "6e74fa4c", "metadata": {}, "outputs": [], "source": [ "Test_X_SLR = test['rSP']\n", "Test_X_MLR = $$code here$$\n", "\n", "Test_Y_SLR = SLR.predict(sm.add_constant(Test_X_SLR))\n", "Test_Y_MLR = $$code here$$" ] }, { "cell_type": "code", "execution_count": null, "id": "d5af214f", "metadata": {}, "outputs": [], "source": [ "Test_Y = test[\"rBH\"]\n", "\n", "from sklearn.metrics import mean_squared_error\n", "rmse_SLR = np.sqrt(mean_squared_error(Test_Y, Test_Y_SLR))\n", "rmse_MLR = $$code here$$\n", "print(\"RMSE for test set (SLR): \", rmse_SLR)\n", "print(\"RMSE for test set (MLR): \", rmse_MLR)" ] }, { "cell_type": "markdown", "id": "bd0c8483", "metadata": {}, "source": [ "### Which one performs better in prediction?" ] }, { "cell_type": "markdown", "id": "36d05636", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": null, "id": "f203f98c-2e57-4262-8fd8-5209825817af", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 }