310 lines
6.4 KiB
Plaintext
310 lines
6.4 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "e5905a69",
|
|
"metadata": {},
|
|
"source": [
|
|
"# CSE6242 - HW3 - Q1"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "09289981",
|
|
"metadata": {},
|
|
"source": [
|
|
"PySpark Imports"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "139318cb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"### DO NOT MODIFY THIS CELL ###\n",
|
|
"import pyspark\n",
|
|
"from pyspark.sql import SQLContext\n",
|
|
"from pyspark.sql.functions import hour, when, col, date_format, to_timestamp, round, coalesce"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "3fd9e0f8",
|
|
"metadata": {},
|
|
"source": [
|
|
"Initialize PySpark Context"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b0c18c6c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"### DO NOT MODIFY THIS CELL ###\n",
|
|
"sc = pyspark.SparkContext(appName=\"HW3-Q1\")\n",
|
|
"sqlContext = SQLContext(sc)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "d68ae314",
|
|
"metadata": {},
|
|
"source": [
|
|
"Define function for loading data"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "7e5bbdda",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"### DO NOT MODIFY THIS CELL ###\n",
|
|
"def load_data():\n",
|
|
" df = sqlContext.read.option(\"header\",True) \\\n",
|
|
" .csv(\"yellow_tripdata_2019-01_short.csv\")\n",
|
|
" return df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "0d52409d",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Q1.a"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "e43f6e00",
|
|
"metadata": {},
|
|
"source": [
|
|
"Perform data casting to clean incoming dataset"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "11f801b4",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def clean_data(df):\n",
|
|
" '''\n",
|
|
" input: df a dataframe\n",
|
|
" output: df a dataframe with all the original columns\n",
|
|
" '''\n",
|
|
" \n",
|
|
" # START YOUR CODE HERE ---------\n",
|
|
" \n",
|
|
"\n",
|
|
" # END YOUR CODE HERE -----------\n",
|
|
" \n",
|
|
" return df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "d4f565d0",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Q1.b"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "72b4f712",
|
|
"metadata": {},
|
|
"source": [
|
|
"Find the rate per person based on how many passengers travel between pickup and dropoff locations."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4e115152",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def common_pair(df):\n",
|
|
" '''\n",
|
|
" input: df a dataframe\n",
|
|
" output: df a dataframe with following columns:\n",
|
|
" - PULocationID\n",
|
|
" - DOLocationID\n",
|
|
" - passenger_count\n",
|
|
" - per_person_rate\n",
|
|
" \n",
|
|
" per_person_rate is the total_amount per person for a given pair.\n",
|
|
" \n",
|
|
" '''\n",
|
|
" \n",
|
|
" # START YOUR CODE HERE ---------\n",
|
|
" \n",
|
|
" # END YOUR CODE HERE -----------\n",
|
|
" \n",
|
|
" return df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "127574ab",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Q1.c"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "36a8fd27",
|
|
"metadata": {},
|
|
"source": [
|
|
"Find which trip distances generate the highest tip percentage."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "376c981c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def distance_with_most_tip(df):\n",
|
|
" '''\n",
|
|
" input: df a dataframe\n",
|
|
" output: df a dataframe with following columns:\n",
|
|
" - trip_distance\n",
|
|
" - tip_percent\n",
|
|
" \n",
|
|
" tip_percent is the percent of tip out of fare_amount\n",
|
|
" \n",
|
|
" '''\n",
|
|
" \n",
|
|
" # START YOUR CODE HERE ---------\n",
|
|
" \n",
|
|
" # END YOUR CODE HERE -----------\n",
|
|
" \n",
|
|
" return df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f0172fe6",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Q1.d"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "4613c906",
|
|
"metadata": {},
|
|
"source": [
|
|
"Determine the average speed at different times of day."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "abff9e24",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def time_with_most_traffic(df):\n",
|
|
" '''\n",
|
|
" input: df a dataframe\n",
|
|
" output: df a dataframe with following columns:\n",
|
|
" - time_of_day\n",
|
|
" - am_avg_speed\n",
|
|
" - pm_avg_speed\n",
|
|
" \n",
|
|
" am_avg_speed and pm_avg_speed are the average speeds at each time_of_day\n",
|
|
" \n",
|
|
" '''\n",
|
|
" \n",
|
|
" # START YOUR CODE HERE ---------\n",
|
|
"\n",
|
|
" # END YOUR CODE HERE -----------\n",
|
|
" \n",
|
|
" return df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "34cbd7b9",
|
|
"metadata": {},
|
|
"source": [
|
|
"### The below cells are for you to investigate your solutions and will not be graded\n",
|
|
"## Ensure they are commented out prior to submitting to Gradescope to avoid errors"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "bf9abefb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# df = load_data()\n",
|
|
"# df = clean_data(df)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "cfa96f41",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# common_pair(df).show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "8e42b46a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# distance_with_most_tip(df).show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4f558c64",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# time_with_most_traffic(df).show()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.4"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|