615 lines
114 KiB
Plaintext
615 lines
114 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Histogram"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import matplotlib.pyplot as plt\n",
|
|
"%matplotlib inline\n",
|
|
"\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"plt.style.use('default')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Iris Data\n",
|
|
"> The data set is bundled with ```sklearn``` and is loaded as a dictionary-like object. \n",
|
|
"> "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])"
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn import datasets\n",
|
|
"iris = datasets.load_iris()\n",
|
|
"iris.keys()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"The feature names are: \n",
|
|
" ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']\n",
|
|
"---------------------\n",
|
|
"The head of feature data: \n",
|
|
" [[5.1 3.5 1.4 0.2]\n",
|
|
" [4.9 3. 1.4 0.2]\n",
|
|
" [4.7 3.2 1.3 0.2]\n",
|
|
" [4.6 3.1 1.5 0.2]\n",
|
|
" [5. 3.6 1.4 0.2]]\n",
|
|
"---------------------\n",
|
|
"The head of target: \n",
|
|
" [0 0 0 0 0]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# What are the items in iris?\n",
|
|
"print('The feature names are: \\n', iris['feature_names'])\n",
|
|
"print('---------------------')\n",
|
|
"print('The head of feature data: \\n', iris['data'][:5,])\n",
|
|
"print('---------------------')\n",
|
|
"print('The head of target: \\n', iris['target'][:5])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Combine Feature and Target and Convert to DataFrame"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(150, 4)\n",
|
|
"---------------------\n",
|
|
"(150, 1)\n",
|
|
"---------------------\n",
|
|
"[[6.7 3. 5.2 2.3 2. ]\n",
|
|
" [6.3 2.5 5. 1.9 2. ]\n",
|
|
" [6.5 3. 5.2 2. 2. ]\n",
|
|
" [6.2 3.4 5.4 2.3 2. ]\n",
|
|
" [5.9 3. 5.1 1.8 2. ]]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# np.c_[iris['data'],iris['target']] combines feature and target more efficiently\n",
|
|
"\n",
|
|
"print( iris['data'].shape )\n",
|
|
"print('---------------------')\n",
|
|
"\n",
|
|
"print( iris['target'].reshape((150,1)).shape )\n",
|
|
"print('---------------------')\n",
|
|
"\n",
|
|
"x = np.hstack([iris['data'],iris['target'].reshape((150,1))])\n",
|
|
"print(x[-5:])\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>sepal length (cm)</th>\n",
|
|
" <th>sepal width (cm)</th>\n",
|
|
" <th>petal length (cm)</th>\n",
|
|
" <th>petal width (cm)</th>\n",
|
|
" <th>species</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>145</th>\n",
|
|
" <td>6.7</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" <td>5.2</td>\n",
|
|
" <td>2.3</td>\n",
|
|
" <td>2.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>146</th>\n",
|
|
" <td>6.3</td>\n",
|
|
" <td>2.5</td>\n",
|
|
" <td>5.0</td>\n",
|
|
" <td>1.9</td>\n",
|
|
" <td>2.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>147</th>\n",
|
|
" <td>6.5</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" <td>5.2</td>\n",
|
|
" <td>2.0</td>\n",
|
|
" <td>2.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>148</th>\n",
|
|
" <td>6.2</td>\n",
|
|
" <td>3.4</td>\n",
|
|
" <td>5.4</td>\n",
|
|
" <td>2.3</td>\n",
|
|
" <td>2.0</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>149</th>\n",
|
|
" <td>5.9</td>\n",
|
|
" <td>3.0</td>\n",
|
|
" <td>5.1</td>\n",
|
|
" <td>1.8</td>\n",
|
|
" <td>2.0</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n",
|
|
"145 6.7 3.0 5.2 2.3 \n",
|
|
"146 6.3 2.5 5.0 1.9 \n",
|
|
"147 6.5 3.0 5.2 2.0 \n",
|
|
"148 6.2 3.4 5.4 2.3 \n",
|
|
"149 5.9 3.0 5.1 1.8 \n",
|
|
"\n",
|
|
" species \n",
|
|
"145 2.0 \n",
|
|
"146 2.0 \n",
|
|
"147 2.0 \n",
|
|
"148 2.0 \n",
|
|
"149 2.0 "
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"iris = pd.DataFrame(data=x, columns=iris['feature_names']+['species'])\n",
|
|
"iris.tail()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Special Functions in NumPy / pandas\n",
|
|
"> ```np.where(condition, x, y)``` returns x where the condition is True and y where it is False. \n",
|
|
"> ```str.replace(a, b)``` replaces every occurrence of 'a' in a string with 'b'. "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"iris.species = np.where(iris.species == 0.0, 'setosa', \n",
|
|
" np.where(iris.species == 1.0,'versicolor','virginica'))\n",
|
|
"\n",
|
|
"iris.columns = iris.columns.str.replace(' ','') #Remove space bar"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000021030A85F60>,\n",
|
|
" <matplotlib.axes._subplots.AxesSubplot object at 0x0000021030ACD0B8>],\n",
|
|
" [<matplotlib.axes._subplots.AxesSubplot object at 0x00000210308E8668>,\n",
|
|
" <matplotlib.axes._subplots.AxesSubplot object at 0x0000021030919C18>]],\n",
|
|
" dtype=object)"
|
|
]
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "\n",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 4 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Pandas plot\n",
|
|
"iris.hist()\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## GDP Data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>Country</th>\n",
|
|
" <th>Code</th>\n",
|
|
" <th>Population</th>\n",
|
|
" <th>GDP per Capita</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>Afghanistan</td>\n",
|
|
" <td>AFG</td>\n",
|
|
" <td>32526562.0</td>\n",
|
|
" <td>594.323081</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>Albania</td>\n",
|
|
" <td>ALB</td>\n",
|
|
" <td>2889167.0</td>\n",
|
|
" <td>3945.217582</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>Algeria</td>\n",
|
|
" <td>ALG</td>\n",
|
|
" <td>39666519.0</td>\n",
|
|
" <td>4206.031232</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>American Samoa*</td>\n",
|
|
" <td>ASA</td>\n",
|
|
" <td>55538.0</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>Andorra</td>\n",
|
|
" <td>AND</td>\n",
|
|
" <td>70473.0</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" Country Code Population GDP per Capita\n",
|
|
"0 Afghanistan AFG 32526562.0 594.323081\n",
|
|
"1 Albania ALB 2889167.0 3945.217582\n",
|
|
"2 Algeria ALG 39666519.0 4206.031232\n",
|
|
"3 American Samoa* ASA 55538.0 NaN\n",
|
|
"4 Andorra AND 70473.0 NaN"
|
|
]
|
|
},
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df = pd.read_csv(\"countryGDP.csv\")\n",
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"<matplotlib.axes._subplots.AxesSubplot at 0x21030b51be0>"
|
|
]
|
|
},
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "\n",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"df1 = df['GDP per Capita'].dropna() \n",
|
|
"df1.hist()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Set Bins\n",
|
|
"> One way is to pass a NumPy array of chosen bin edges. \n",
|
|
"> Another way is to specify the number of bins. "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[ 277.06830917 10277.06830917 20277.06830917 30277.06830917\n",
|
|
" 40277.06830917 50277.06830917 60277.06830917 70277.06830917\n",
|
|
" 80277.06830917 90277.06830917 100277.06830917]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"b = np.arange(data1.min(), data1.max(), 10000) # Fixed bin size\n",
|
|
"print(b)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"Text(0, 0.5, 'Frequency')"
|
|
]
|
|
},
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "\n",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Matplotlib plots\n",
|
|
"\n",
|
|
"plt.hist(df1, bins=b, alpha=0.5, color='blue')\n",
|
|
"plt.title(\"Countrys' GDP Level\")\n",
|
|
"plt.xlabel(\"GDP in Dollars\")\n",
|
|
"plt.ylabel(\"Frequency\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"(array([89., 28., 14., 11., 5., 5., 1., 5., 7., 2., 2., 2., 1.,\n",
|
|
" 0., 2., 1., 0., 0., 0., 1.]),\n",
|
|
" array([ 277.06830917, 5335.71330211, 10394.35829506, 15453.003288 ,\n",
|
|
" 20511.64828095, 25570.29327389, 30628.93826683, 35687.58325978,\n",
|
|
" 40746.22825272, 45804.87324566, 50863.51823861, 55922.16323155,\n",
|
|
" 60980.80822449, 66039.45321744, 71098.09821038, 76156.74320332,\n",
|
|
" 81215.38819627, 86274.03318921, 91332.67818215, 96391.3231751 ,\n",
|
|
" 101449.96816804]),\n",
|
|
" <a list of 1 Patch objects>)"
|
|
]
|
|
},
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "\n",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 1 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"plt.hist(df1, bins=20, color='blue', histtype='step')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"Text(0.5, 1.0, \"Population Histogram ('000')\")"
|
|
]
|
|
},
|
|
"execution_count": 19,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "\n",
|
|
"text/plain": [
|
|
"<Figure size 640x480 with 2 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"import matplotlib.pyplot as plt\n",
|
|
"\n",
|
|
"data1 = df['GDP per Capita'].dropna() \n",
|
|
"data2 = df['Population'].dropna()/1000 \n",
|
|
"\n",
|
|
"plt.subplot(1,2,1)\n",
|
|
"plt.hist(data1, color='red')\n",
|
|
"plt.xlabel('GDP')\n",
|
|
"plt.title('GDP Histogram')\n",
|
|
"\n",
|
|
"plt.subplot(1,2,2)\n",
|
|
"plt.hist(data2, color='blue')\n",
|
|
"plt.xlabel('Population')\n",
|
|
"plt.title('''Population Histogram ('000')''')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Activity 1\n",
|
|
"> The code starts with \n",
|
|
"```python\n",
|
|
"fig, axes = plt.subplots(1,2)\n",
|
|
"```\n",
|
|
"> Complete the plots so that the first subplot shows GDP and the second shows Population. \n",
|
|
"> The number of bins is fixed at 20 (```bins=20```). \n",
|
|
"> Set xlim for GDP to [0,75000], and for Population to [0,200000]. "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.11.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|