bettyphan789/Assignment2_student.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d7e90f45",
   "metadata": {},
   "outputs": [],
   "source": [
    "#### Pandas is for using data structures\n",
    "import pandas as pd\n",
    "# statsmodels contain modules for regression and time series analysis\n",
    "import statsmodels.api as sm\n",
    "# numpy is for numerical computing of array and mayatrix\n",
    "import numpy as np\n",
    "# Matplotlib, Seaborn: plotting package\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns \n",
    "# matplotlib Showing the plot right after the current code  \n",
    "%matplotlib inline\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "# basic statistics package\n",
    "import scipy.stats as stats\n",
    "from statsmodels.stats.outliers_influence import variance_inflation_factor\n",
    "import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5159ee37",
   "metadata": {},
   "outputs": [],
   "source": [
    "# functions from last lab\n",
    "def four_in_one(dataframe,model):\n",
    "    fitted_y = model.fittedvalues\n",
    "    studentized_residuals = model.get_influence().resid_studentized_internal\n",
    "    plt.figure(figsize=(10,10))\n",
    "    ax1 = plt.subplot(221)\n",
    "    stats.probplot(studentized_residuals, dist=\"norm\", plot=plt)\n",
    "    ax1.set_title('Normal Q-Q')\n",
    "    ax1.set_xlabel('Normal Quantiles')\n",
    "    ax1.set_ylabel('Studentized Residuals');\n",
    "\n",
    "    ax2 = plt.subplot(222)\n",
    "    ax2.hist(studentized_residuals)\n",
    "    ax2.set_xlabel('Studentized Residuals')\n",
    "    ax2.set_ylabel('Count')\n",
    "    ax2.set_title('Histogram')\n",
    "\n",
    "    ax3 = plt.subplot(223)\n",
    "    t = range(dataframe.shape[0])\n",
    "    ax3.scatter(t, studentized_residuals)\n",
    "    ax3.set_xlabel('Observation order')\n",
    "    ax3.set_ylabel('Residuals')\n",
    "    ax3.set_title('Time series plot of studentized residuals')\n",
    "\n",
    "    ax4 = plt.subplot(224)\n",
    "    temp = pd.DataFrame({'fitted_y':fitted_y,'studentized_residuals':studentized_residuals})\n",
    "    ax4 = sns.residplot(data=temp,x=fitted_y, y=studentized_residuals,\n",
    "                              lowess=True,\n",
    "                              scatter_kws={'alpha': 0.5},\n",
    "                              line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})\n",
    "    ax4.set_title('Internally Studentized Residuals vs Fitted values')\n",
    "    ax4.set_xlabel('Fitted values')\n",
    "    ax4.set_ylabel('Studentized Residuals');\n",
    "    \n",
    "def getvif(X):\n",
    "    X = sm.add_constant(X)\n",
    "    vif = pd.DataFrame()\n",
    "    vif[\"VIF\"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]\n",
    "    vif[\"Predictors\"] = X.columns\n",
    "    return(vif.drop(index = 0).round(2)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "16326102",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rBH</th>\n",
       "      <th>rSP</th>\n",
       "      <th>SmB</th>\n",
       "      <th>HmL</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Date</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1/2/2009</th>\n",
       "      <td>-0.121807</td>\n",
       "      <td>-0.109931</td>\n",
       "      <td>0.0005</td>\n",
       "      <td>-0.0695</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1/3/2009</th>\n",
       "      <td>0.103053</td>\n",
       "      <td>0.085404</td>\n",
       "      <td>0.0004</td>\n",
       "      <td>0.0348</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1/4/2009</th>\n",
       "      <td>0.084198</td>\n",
       "      <td>0.093925</td>\n",
       "      <td>0.0539</td>\n",
       "      <td>0.0536</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1/5/2009</th>\n",
       "      <td>-0.025532</td>\n",
       "      <td>0.053081</td>\n",
       "      <td>-0.0252</td>\n",
       "      <td>0.0027</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1/6/2009</th>\n",
       "      <td>-0.017467</td>\n",
       "      <td>0.000196</td>\n",
       "      <td>0.0263</td>\n",
       "      <td>-0.0273</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               rBH       rSP     SmB     HmL\n",
       "Date                                        \n",
       "1/2/2009 -0.121807 -0.109931  0.0005 -0.0695\n",
       "1/3/2009  0.103053  0.085404  0.0004  0.0348\n",
       "1/4/2009  0.084198  0.093925  0.0539  0.0536\n",
       "1/5/2009 -0.025532  0.053081 -0.0252  0.0027\n",
       "1/6/2009 -0.017467  0.000196  0.0263 -0.0273"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.read_csv(\"BH2009-2022.csv\",index_col=0)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7fd1d118",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(167, 4)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09cc26c8",
   "metadata": {},
   "source": [
    "# Part I: CAPM model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ebaa4598-7164-4b6c-ac8d-7d674fa4ee4f",
   "metadata": {},
   "source": [
    "### Task 1: Split the data into train (first 155 observations) and test (remaining 12 observations) set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "095d1d13",
   "metadata": {},
   "outputs": [],
   "source": [
    "train = $$code here$$\n",
    "test = $$code here$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0a06748",
   "metadata": {},
   "outputs": [],
   "source": [
    "train.shape, test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "abff4aee-07cb-4ddd-8cd2-7f909b217b41",
   "metadata": {},
   "outputs": [],
   "source": [
    "Y = train[\"rBH\"]\n",
    "X = train[\"rSP\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f84493d2-6484-4e62-a196-888c72c657f4",
   "metadata": {},
   "source": [
    "### Task 2: Using training set, fit a simple regression model(SLR). Report the adjusted R-square of the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76e5007b-ec3f-4271-9c25-afd7cb2bd028",
   "metadata": {},
   "outputs": [],
   "source": [
    "SLR = $$code here$$\n",
    "print(SLR.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b9fd7ede-dabc-47d9-918f-021ee1fae9b4",
   "metadata": {},
   "source": [
    "### Report the adjusted R-square of the model.\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3af44ff8",
   "metadata": {},
   "source": [
    "# Part II: Multiple Regression Model"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f8158d8-1c42-4226-99c3-0d04a026df5b",
   "metadata": {},
   "source": [
    "### Task 3: Using training set, fit a multiple regression model with SmB and HmL explanatory variables in addition to rSP (MLR). Report the adjusted R-square of the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1f2dfad6",
   "metadata": {},
   "outputs": [],
   "source": [
    "X = $$code here$$\n",
    "MLR = $$code here$$\n",
    "print(MLR.summary())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "18e59124-7f05-414a-9284-fde621aa94cc",
   "metadata": {},
   "source": [
    "### Report the adjusted R-square of the model.\n",
    "   "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fcb66422-678b-4d10-9aea-e56ae1c0adfa",
   "metadata": {},
   "source": [
    "### Task 4: Checking the multicollinearity problem among  rSP, SmB and HmL by \n",
    "    i) Scatter plot matrix \n",
    "    ii) VIF. \n",
    "#### Is the multicollinearity problem exist?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "284873a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "$$code here$$ #<--code for scatter plot matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cad9bd49-5030-4a95-a5b5-ae8677092fbe",
   "metadata": {},
   "outputs": [],
   "source": [
    "$$code here$$ #<--code for VIF"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aea04a9e",
   "metadata": {},
   "source": [
    "### Is the multicollinearity problem exist?"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "88eb43de",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "ece7bda1-23e4-47e5-84ba-d0ab6eff9f06",
   "metadata": {
    "tags": []
   },
   "source": [
    "### Task 5: From the fitted multiple regression model in Task 3\n",
    "    i) Is the model as a whole useful at 5% significant level? \n",
    "    ii) Which of them is not an useful explanatory variable at 5% significant level?"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "413b5d9a-44c6-4f4e-91c4-3f82317b2b00",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "fe0773af-c6ee-4dce-97c6-fe33af6310b8",
   "metadata": {},
   "source": [
    "### Task 6: Execute model diagnostic on the model fitted from Task3 using the “four_in_one” function. Comment on the normality, constant variance assumption.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0b372cd7",
   "metadata": {},
   "outputs": [],
   "source": [
    "$$code here$$ #<--code for “four_in_one” function"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "919fcacf",
   "metadata": {},
   "source": [
    "### Comment on the normality, constant variance assumption."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "52ff227e",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "5b40b212",
   "metadata": {},
   "source": [
    "# Part IV: Model Performance"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8cc559dd-6a77-4289-a451-ede8d00bbf90",
   "metadata": {},
   "source": [
    "### Task 7: Compare the predictive power between SLR and MLR using the test set. Which one perform better in prediction?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e74fa4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "Test_X_SLR = test['rSP']\n",
    "Test_X_MLR = $$code here$$\n",
    "\n",
    "Test_Y_SLR = SLR.predict(sm.add_constant(Test_X_SLR))\n",
    "Test_Y_MLR = $$code here$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5af214f",
   "metadata": {},
   "outputs": [],
   "source": [
    "Test_Y = test[\"rBH\"]\n",
    "\n",
    "from sklearn.metrics import mean_squared_error\n",
    "rmse_SLR = np.sqrt(mean_squared_error(Test_Y, Test_Y_SLR))\n",
    "rmse_MLR = $$code here$$\n",
    "print(\"RMSE for test set (SLR): \", rmse_SLR)\n",
    "print(\"RMSE for test set (MLR): \", rmse_MLR)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bd0c8483",
   "metadata": {},
   "source": [
    "### Which one perform better in prediction?"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "36d05636",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f203f98c-2e57-4262-8fd8-5209825817af",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}