Files
Man1130/jupyter/Man1130-python-comission/course_materials/Note/2. Joining Data Example.ipynb
louiscklaw e44aead3d5 update,
2025-02-01 01:58:19 +08:00

1394 lines
38 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Example on Data Preparation: US States Data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 57935 100 57935 0 0 115k 0 --:--:-- --:--:-- --:--:-- 115k\n"
]
}
],
"source": [
"!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state/region</th>\n",
" <th>ages</th>\n",
" <th>year</th>\n",
" <th>population</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2012</td>\n",
" <td>1117489.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AL</td>\n",
" <td>total</td>\n",
" <td>2012</td>\n",
" <td>4817528.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2010</td>\n",
" <td>1130966.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AL</td>\n",
" <td>total</td>\n",
" <td>2010</td>\n",
" <td>4785570.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2011</td>\n",
" <td>1125763.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state/region ages year population\n",
"0 AL under18 2012 1117489.0\n",
"1 AL total 2012 4817528.0\n",
"2 AL under18 2010 1130966.0\n",
"3 AL total 2010 4785570.0\n",
"4 AL under18 2011 1125763.0"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop = pd.read_csv('state-population.csv')\n",
"pop.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state/region</th>\n",
" <th>ages</th>\n",
" <th>year</th>\n",
" <th>population</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>500</th>\n",
" <td>GA</td>\n",
" <td>total</td>\n",
" <td>2004</td>\n",
" <td>8769252.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>501</th>\n",
" <td>GA</td>\n",
" <td>under18</td>\n",
" <td>2004</td>\n",
" <td>2308855.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>502</th>\n",
" <td>GA</td>\n",
" <td>total</td>\n",
" <td>2001</td>\n",
" <td>8377038.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>503</th>\n",
" <td>GA</td>\n",
" <td>under18</td>\n",
" <td>2001</td>\n",
" <td>2215390.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>504</th>\n",
" <td>GA</td>\n",
" <td>total</td>\n",
" <td>2002</td>\n",
" <td>8508256.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>695</th>\n",
" <td>IN</td>\n",
" <td>under18</td>\n",
" <td>2001</td>\n",
" <td>1579527.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>696</th>\n",
" <td>IN</td>\n",
" <td>total</td>\n",
" <td>2002</td>\n",
" <td>6155967.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>697</th>\n",
" <td>IN</td>\n",
" <td>under18</td>\n",
" <td>2002</td>\n",
" <td>1580814.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>698</th>\n",
" <td>IN</td>\n",
" <td>total</td>\n",
" <td>1999</td>\n",
" <td>6044970.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>699</th>\n",
" <td>IN</td>\n",
" <td>under18</td>\n",
" <td>1999</td>\n",
" <td>1566079.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>200 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" state/region ages year population\n",
"500 GA total 2004 8769252.0\n",
"501 GA under18 2004 2308855.0\n",
"502 GA total 2001 8377038.0\n",
"503 GA under18 2001 2215390.0\n",
"504 GA total 2002 8508256.0\n",
".. ... ... ... ...\n",
"695 IN under18 2001 1579527.0\n",
"696 IN total 2002 6155967.0\n",
"697 IN under18 2002 1580814.0\n",
"698 IN total 1999 6044970.0\n",
"699 IN under18 1999 1566079.0\n",
"\n",
"[200 rows x 4 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop[500:700]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 835 100 835 0 0 2221 0 --:--:-- --:--:-- --:--:-- 2220\n"
]
}
],
"source": [
"!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state</th>\n",
" <th>area (sq. mi)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Alabama</td>\n",
" <td>52423</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Alaska</td>\n",
" <td>656425</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Arizona</td>\n",
" <td>114006</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Arkansas</td>\n",
" <td>53182</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>California</td>\n",
" <td>163707</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state area (sq. mi)\n",
"0 Alabama 52423\n",
"1 Alaska 656425\n",
"2 Arizona 114006\n",
"3 Arkansas 53182\n",
"4 California 163707"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"areas = pd.read_csv('state-areas.csv')\n",
"areas.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" % Total % Received % Xferd Average Speed Time Time Time Current\n",
" Dload Upload Total Spent Left Speed\n",
"100 872 100 872 0 0 2273 0 --:--:-- --:--:-- --:--:-- 2276\n"
]
}
],
"source": [
"!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state</th>\n",
" <th>abbreviation</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Alaska</td>\n",
" <td>AK</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Arizona</td>\n",
" <td>AZ</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Arkansas</td>\n",
" <td>AR</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>California</td>\n",
" <td>CA</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state abbreviation\n",
"0 Alabama AL\n",
"1 Alaska AK\n",
"2 Arizona AZ\n",
"3 Arkansas AR\n",
"4 California CA"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"abbrevs = pd.read_csv('state-abbrevs.csv')\n",
"abbrevs.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Merge the Population Data with the State Abbreviations"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state/region</th>\n",
" <th>ages</th>\n",
" <th>year</th>\n",
" <th>population</th>\n",
" <th>state</th>\n",
" <th>abbreviation</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2012</td>\n",
" <td>1117489.0</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AL</td>\n",
" <td>total</td>\n",
" <td>2012</td>\n",
" <td>4817528.0</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2010</td>\n",
" <td>1130966.0</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AL</td>\n",
" <td>total</td>\n",
" <td>2010</td>\n",
" <td>4785570.0</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2011</td>\n",
" <td>1125763.0</td>\n",
" <td>Alabama</td>\n",
" <td>AL</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state/region ages year population state abbreviation\n",
"0 AL under18 2012 1117489.0 Alabama AL\n",
"1 AL total 2012 4817528.0 Alabama AL\n",
"2 AL under18 2010 1130966.0 Alabama AL\n",
"3 AL total 2010 4785570.0 Alabama AL\n",
"4 AL under18 2011 1125763.0 Alabama AL"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop_ab = pd.merge(pop, abbrevs, how='outer',\n",
" left_on='state/region', \n",
" right_on='abbreviation')\n",
"pop_ab.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state/region</th>\n",
" <th>ages</th>\n",
" <th>year</th>\n",
" <th>population</th>\n",
" <th>state</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2012</td>\n",
" <td>1117489.0</td>\n",
" <td>Alabama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AL</td>\n",
" <td>total</td>\n",
" <td>2012</td>\n",
" <td>4817528.0</td>\n",
" <td>Alabama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2010</td>\n",
" <td>1130966.0</td>\n",
" <td>Alabama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AL</td>\n",
" <td>total</td>\n",
" <td>2010</td>\n",
" <td>4785570.0</td>\n",
" <td>Alabama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2011</td>\n",
" <td>1125763.0</td>\n",
" <td>Alabama</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state/region ages year population state\n",
"0 AL under18 2012 1117489.0 Alabama\n",
"1 AL total 2012 4817528.0 Alabama\n",
"2 AL under18 2010 1130966.0 Alabama\n",
"3 AL total 2010 4785570.0 Alabama\n",
"4 AL under18 2011 1125763.0 Alabama"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop_ab = pop_ab.drop('abbreviation',axis=1) \n",
"pop_ab.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fill in Missing Values for States"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"state/region False\n",
"ages False\n",
"year False\n",
"population True\n",
"state True\n",
"dtype: bool"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop_ab.isnull().any()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state/region</th>\n",
" <th>ages</th>\n",
" <th>year</th>\n",
" <th>population</th>\n",
" <th>state</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2448</th>\n",
" <td>PR</td>\n",
" <td>under18</td>\n",
" <td>1990</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2449</th>\n",
" <td>PR</td>\n",
" <td>total</td>\n",
" <td>1990</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2450</th>\n",
" <td>PR</td>\n",
" <td>total</td>\n",
" <td>1991</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2451</th>\n",
" <td>PR</td>\n",
" <td>under18</td>\n",
" <td>1991</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2452</th>\n",
" <td>PR</td>\n",
" <td>total</td>\n",
" <td>1993</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state/region ages year population state\n",
"2448 PR under18 1990 NaN NaN\n",
"2449 PR total 1990 NaN NaN\n",
"2450 PR total 1991 NaN NaN\n",
"2451 PR under18 1991 NaN NaN\n",
"2452 PR total 1993 NaN NaN"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop_ab[pop_ab['state'].isnull()].head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['PR', 'USA'], dtype=object)"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Choose ALL null rows, and column 'state/region'. Read its unique values.\n",
"pop_ab.loc[pop_ab['state'].isnull(), 'state/region'].unique()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"state/region False\n",
"ages False\n",
"year False\n",
"population True\n",
"state False\n",
"dtype: bool"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# data.loc[row condition, variable_to_be_updated] = 'new value'\n",
"\n",
"pop_ab.loc[pop_ab['state/region'] == 'PR', 'state'] = 'Puerto Rico'\n",
"pop_ab.loc[pop_ab['state/region'] == 'USA', 'state'] = 'United States'\n",
"\n",
"pop_ab.isnull().any()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Merge the Third Dataset: State Areas"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state/region</th>\n",
" <th>ages</th>\n",
" <th>year</th>\n",
" <th>population</th>\n",
" <th>state</th>\n",
" <th>area (sq. mi)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2012</td>\n",
" <td>1117489.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AL</td>\n",
" <td>total</td>\n",
" <td>2012</td>\n",
" <td>4817528.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2010</td>\n",
" <td>1130966.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AL</td>\n",
" <td>total</td>\n",
" <td>2010</td>\n",
" <td>4785570.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2011</td>\n",
" <td>1125763.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state/region ages year population state area (sq. mi)\n",
"0 AL under18 2012 1117489.0 Alabama 52423.0\n",
"1 AL total 2012 4817528.0 Alabama 52423.0\n",
"2 AL under18 2010 1130966.0 Alabama 52423.0\n",
"3 AL total 2010 4785570.0 Alabama 52423.0\n",
"4 AL under18 2011 1125763.0 Alabama 52423.0"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final = pd.merge(pop_ab, areas, on='state', how='left')\n",
"final.head()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"state/region False\n",
"ages False\n",
"year False\n",
"population True\n",
"state False\n",
"area (sq. mi) True\n",
"dtype: bool"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final.isnull().any()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state/region</th>\n",
" <th>ages</th>\n",
" <th>year</th>\n",
" <th>population</th>\n",
" <th>state</th>\n",
" <th>area (sq. mi)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2012</td>\n",
" <td>1117489.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AL</td>\n",
" <td>total</td>\n",
" <td>2012</td>\n",
" <td>4817528.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2010</td>\n",
" <td>1130966.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AL</td>\n",
" <td>total</td>\n",
" <td>2010</td>\n",
" <td>4785570.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2011</td>\n",
" <td>1125763.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state/region ages year population state area (sq. mi)\n",
"0 AL under18 2012 1117489.0 Alabama 52423.0\n",
"1 AL total 2012 4817528.0 Alabama 52423.0\n",
"2 AL under18 2010 1130966.0 Alabama 52423.0\n",
"3 AL total 2010 4785570.0 Alabama 52423.0\n",
"4 AL under18 2011 1125763.0 Alabama 52423.0"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final.dropna(inplace=True)\n",
"final.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Creating New Variables"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state/region</th>\n",
" <th>ages</th>\n",
" <th>year</th>\n",
" <th>population</th>\n",
" <th>state</th>\n",
" <th>area (sq. mi)</th>\n",
" <th>density</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2012</td>\n",
" <td>1117489.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" <td>21.316769</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>AL</td>\n",
" <td>total</td>\n",
" <td>2012</td>\n",
" <td>4817528.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" <td>91.897221</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2010</td>\n",
" <td>1130966.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" <td>21.573851</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AL</td>\n",
" <td>total</td>\n",
" <td>2010</td>\n",
" <td>4785570.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" <td>91.287603</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>AL</td>\n",
" <td>under18</td>\n",
" <td>2011</td>\n",
" <td>1125763.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" <td>21.474601</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state/region ages year population state area (sq. mi) density\n",
"0 AL under18 2012 1117489.0 Alabama 52423.0 21.316769\n",
"1 AL total 2012 4817528.0 Alabama 52423.0 91.897221\n",
"2 AL under18 2010 1130966.0 Alabama 52423.0 21.573851\n",
"3 AL total 2010 4785570.0 Alabama 52423.0 91.287603\n",
"4 AL under18 2011 1125763.0 Alabama 52423.0 21.474601"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final['density'] = final['population'] / final['area (sq. mi)']\n",
"final.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Subsetting the Data"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>state/region</th>\n",
" <th>ages</th>\n",
" <th>year</th>\n",
" <th>population</th>\n",
" <th>state</th>\n",
" <th>area (sq. mi)</th>\n",
" <th>density</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>AL</td>\n",
" <td>total</td>\n",
" <td>2010</td>\n",
" <td>4785570.0</td>\n",
" <td>Alabama</td>\n",
" <td>52423.0</td>\n",
" <td>91.287603</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>AK</td>\n",
" <td>total</td>\n",
" <td>2010</td>\n",
" <td>713868.0</td>\n",
" <td>Alaska</td>\n",
" <td>656425.0</td>\n",
" <td>1.087509</td>\n",
" </tr>\n",
" <tr>\n",
" <th>101</th>\n",
" <td>AZ</td>\n",
" <td>total</td>\n",
" <td>2010</td>\n",
" <td>6408790.0</td>\n",
" <td>Arizona</td>\n",
" <td>114006.0</td>\n",
" <td>56.214497</td>\n",
" </tr>\n",
" <tr>\n",
" <th>189</th>\n",
" <td>AR</td>\n",
" <td>total</td>\n",
" <td>2010</td>\n",
" <td>2922280.0</td>\n",
" <td>Arkansas</td>\n",
" <td>53182.0</td>\n",
" <td>54.948667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>197</th>\n",
" <td>CA</td>\n",
" <td>total</td>\n",
" <td>2010</td>\n",
" <td>37333601.0</td>\n",
" <td>California</td>\n",
" <td>163707.0</td>\n",
" <td>228.051342</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" state/region ages year population state area (sq. mi) \\\n",
"3 AL total 2010 4785570.0 Alabama 52423.0 \n",
"91 AK total 2010 713868.0 Alaska 656425.0 \n",
"101 AZ total 2010 6408790.0 Arizona 114006.0 \n",
"189 AR total 2010 2922280.0 Arkansas 53182.0 \n",
"197 CA total 2010 37333601.0 California 163707.0 \n",
"\n",
" density \n",
"3 91.287603 \n",
"91 1.087509 \n",
"101 56.214497 \n",
"189 54.948667 \n",
"197 228.051342 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data2010 = final.query(\" year == 2010 & ages == 'total' \")\n",
"data2010.head()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Index the 2010 snapshot by state name. The frame is rebuilt from `final`\n",
"# instead of being modified in place, so this cell can be re-run safely:\n",
"# a second in-place set_index('state') would raise KeyError because the\n",
"# column has already been moved into the index.\n",
"data2010 = final.query(\" year == 2010 & ages == 'total' \").set_index('state')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"data2010"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}