{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Example on Data Preparation: US States Data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 57935 100 57935 0 0 115k 0 --:--:-- --:--:-- --:--:-- 115k\n" ] } ], "source": [ "!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-population.csv" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
state/regionagesyearpopulation
0ALunder1820121117489.0
1ALtotal20124817528.0
2ALunder1820101130966.0
3ALtotal20104785570.0
4ALunder1820111125763.0
\n", "
" ], "text/plain": [ " state/region ages year population\n", "0 AL under18 2012 1117489.0\n", "1 AL total 2012 4817528.0\n", "2 AL under18 2010 1130966.0\n", "3 AL total 2010 4785570.0\n", "4 AL under18 2011 1125763.0" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop = pd.read_csv('state-population.csv')\n", "pop.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
state/regionagesyearpopulation
500GAtotal20048769252.0
501GAunder1820042308855.0
502GAtotal20018377038.0
503GAunder1820012215390.0
504GAtotal20028508256.0
...............
695INunder1820011579527.0
696INtotal20026155967.0
697INunder1820021580814.0
698INtotal19996044970.0
699INunder1819991566079.0
\n", "

200 rows × 4 columns

\n", "
" ], "text/plain": [ " state/region ages year population\n", "500 GA total 2004 8769252.0\n", "501 GA under18 2004 2308855.0\n", "502 GA total 2001 8377038.0\n", "503 GA under18 2001 2215390.0\n", "504 GA total 2002 8508256.0\n", ".. ... ... ... ...\n", "695 IN under18 2001 1579527.0\n", "696 IN total 2002 6155967.0\n", "697 IN under18 2002 1580814.0\n", "698 IN total 1999 6044970.0\n", "699 IN under18 1999 1566079.0\n", "\n", "[200 rows x 4 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop[500:700]" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 835 100 835 0 0 2221 0 --:--:-- --:--:-- --:--:-- 2220\n" ] } ], "source": [ "!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-areas.csv" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
statearea (sq. mi)
0Alabama52423
1Alaska656425
2Arizona114006
3Arkansas53182
4California163707
\n", "
" ], "text/plain": [ " state area (sq. mi)\n", "0 Alabama 52423\n", "1 Alaska 656425\n", "2 Arizona 114006\n", "3 Arkansas 53182\n", "4 California 163707" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "areas = pd.read_csv('state-areas.csv')\n", "areas.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " % Total % Received % Xferd Average Speed Time Time Time Current\n", " Dload Upload Total Spent Left Speed\n", "100 872 100 872 0 0 2273 0 --:--:-- --:--:-- --:--:-- 2276\n" ] } ], "source": [ "!curl -O https://raw.githubusercontent.com/jakevdp/data-USstates/master/state-abbrevs.csv" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
stateabbreviation
0AlabamaAL
1AlaskaAK
2ArizonaAZ
3ArkansasAR
4CaliforniaCA
\n", "
" ], "text/plain": [ " state abbreviation\n", "0 Alabama AL\n", "1 Alaska AK\n", "2 Arizona AZ\n", "3 Arkansas AR\n", "4 California CA" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "abbrevs = pd.read_csv('state-abbrevs.csv')\n", "abbrevs.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Merge Population Data with State Abbreviations" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
state/regionagesyearpopulationstateabbreviation
0ALunder1820121117489.0AlabamaAL
1ALtotal20124817528.0AlabamaAL
2ALunder1820101130966.0AlabamaAL
3ALtotal20104785570.0AlabamaAL
4ALunder1820111125763.0AlabamaAL
\n", "
" ], "text/plain": [ " state/region ages year population state abbreviation\n", "0 AL under18 2012 1117489.0 Alabama AL\n", "1 AL total 2012 4817528.0 Alabama AL\n", "2 AL under18 2010 1130966.0 Alabama AL\n", "3 AL total 2010 4785570.0 Alabama AL\n", "4 AL under18 2011 1125763.0 Alabama AL" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop_ab = pd.merge(pop, abbrevs, how='outer',\n", " left_on='state/region', \n", " right_on='abbreviation')\n", "pop_ab.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
state/regionagesyearpopulationstate
0ALunder1820121117489.0Alabama
1ALtotal20124817528.0Alabama
2ALunder1820101130966.0Alabama
3ALtotal20104785570.0Alabama
4ALunder1820111125763.0Alabama
\n", "
" ], "text/plain": [ " state/region ages year population state\n", "0 AL under18 2012 1117489.0 Alabama\n", "1 AL total 2012 4817528.0 Alabama\n", "2 AL under18 2010 1130966.0 Alabama\n", "3 AL total 2010 4785570.0 Alabama\n", "4 AL under18 2011 1125763.0 Alabama" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop_ab = pop_ab.drop('abbreviation',axis=1) \n", "pop_ab.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Fill in Missing Values for States" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "state/region False\n", "ages False\n", "year False\n", "population True\n", "state True\n", "dtype: bool" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop_ab.isnull().any()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
state/regionagesyearpopulationstate
2448PRunder181990NaNNaN
2449PRtotal1990NaNNaN
2450PRtotal1991NaNNaN
2451PRunder181991NaNNaN
2452PRtotal1993NaNNaN
\n", "
" ], "text/plain": [ " state/region ages year population state\n", "2448 PR under18 1990 NaN NaN\n", "2449 PR total 1990 NaN NaN\n", "2450 PR total 1991 NaN NaN\n", "2451 PR under18 1991 NaN NaN\n", "2452 PR total 1993 NaN NaN" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop_ab[pop_ab['state'].isnull()].head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['PR', 'USA'], dtype=object)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Select the rows where 'state' is null, take their 'state/region' column, and list its unique values.\n", "pop_ab.loc[pop_ab['state'].isnull(), 'state/region'].unique()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "state/region False\n", "ages False\n", "year False\n", "population True\n", "state False\n", "dtype: bool" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# data.loc[row condition, variable_to_be_updated] = 'new value'\n", "\n", "pop_ab.loc[pop_ab['state/region'] == 'PR', 'state'] = 'Puerto Rico'\n", "pop_ab.loc[pop_ab['state/region'] == 'USA', 'state'] = 'United States'\n", "\n", "pop_ab.isnull().any()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Merge the Third Dataset (State Areas)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
state/regionagesyearpopulationstatearea (sq. mi)
0ALunder1820121117489.0Alabama52423.0
1ALtotal20124817528.0Alabama52423.0
2ALunder1820101130966.0Alabama52423.0
3ALtotal20104785570.0Alabama52423.0
4ALunder1820111125763.0Alabama52423.0
\n", "
" ], "text/plain": [ " state/region ages year population state area (sq. mi)\n", "0 AL under18 2012 1117489.0 Alabama 52423.0\n", "1 AL total 2012 4817528.0 Alabama 52423.0\n", "2 AL under18 2010 1130966.0 Alabama 52423.0\n", "3 AL total 2010 4785570.0 Alabama 52423.0\n", "4 AL under18 2011 1125763.0 Alabama 52423.0" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final = pd.merge(pop_ab, areas, on='state', how='left')\n", "final.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "state/region False\n", "ages False\n", "year False\n", "population True\n", "state False\n", "area (sq. mi) True\n", "dtype: bool" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final.isnull().any()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
state/regionagesyearpopulationstatearea (sq. mi)
0ALunder1820121117489.0Alabama52423.0
1ALtotal20124817528.0Alabama52423.0
2ALunder1820101130966.0Alabama52423.0
3ALtotal20104785570.0Alabama52423.0
4ALunder1820111125763.0Alabama52423.0
\n", "
" ], "text/plain": [ " state/region ages year population state area (sq. mi)\n", "0 AL under18 2012 1117489.0 Alabama 52423.0\n", "1 AL total 2012 4817528.0 Alabama 52423.0\n", "2 AL under18 2010 1130966.0 Alabama 52423.0\n", "3 AL total 2010 4785570.0 Alabama 52423.0\n", "4 AL under18 2011 1125763.0 Alabama 52423.0" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final.dropna(inplace=True)\n", "final.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Creating New Variables" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
state/regionagesyearpopulationstatearea (sq. mi)density
0ALunder1820121117489.0Alabama52423.021.316769
1ALtotal20124817528.0Alabama52423.091.897221
2ALunder1820101130966.0Alabama52423.021.573851
3ALtotal20104785570.0Alabama52423.091.287603
4ALunder1820111125763.0Alabama52423.021.474601
\n", "
" ], "text/plain": [ " state/region ages year population state area (sq. mi) density\n", "0 AL under18 2012 1117489.0 Alabama 52423.0 21.316769\n", "1 AL total 2012 4817528.0 Alabama 52423.0 91.897221\n", "2 AL under18 2010 1130966.0 Alabama 52423.0 21.573851\n", "3 AL total 2010 4785570.0 Alabama 52423.0 91.287603\n", "4 AL under18 2011 1125763.0 Alabama 52423.0 21.474601" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final['density'] = final['population'] / final['area (sq. mi)']\n", "final.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Subsetting the Data" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
state/regionagesyearpopulationstatearea (sq. mi)density
3ALtotal20104785570.0Alabama52423.091.287603
91AKtotal2010713868.0Alaska656425.01.087509
101AZtotal20106408790.0Arizona114006.056.214497
189ARtotal20102922280.0Arkansas53182.054.948667
197CAtotal201037333601.0California163707.0228.051342
\n", "
" ], "text/plain": [ " state/region ages year population state area (sq. mi) \\\n", "3 AL total 2010 4785570.0 Alabama 52423.0 \n", "91 AK total 2010 713868.0 Alaska 656425.0 \n", "101 AZ total 2010 6408790.0 Arizona 114006.0 \n", "189 AR total 2010 2922280.0 Arkansas 53182.0 \n", "197 CA total 2010 37333601.0 California 163707.0 \n", "\n", " density \n", "3 91.287603 \n", "91 1.087509 \n", "101 56.214497 \n", "189 54.948667 \n", "197 228.051342 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data2010 = final.query(\" year == 2010 & ages == 'total' \")\n", "data2010.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Index the 2010 subset by state name. Rebind the result instead of using\n", "# inplace=True so the frame produced by the query cell is not mutated in place.\n", "data2010 = data2010.set_index('state')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "data2010" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.0" } }, "nbformat": 4, "nbformat_minor": 2 }