Files
louiscklaw 9035c1312b update,
2025-02-01 02:09:32 +08:00

356 lines
11 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "e5905a69",
"metadata": {},
"source": [
"# CSE6242 - HW3 - Q1"
]
},
{
"cell_type": "markdown",
"id": "09289981",
"metadata": {},
"source": [
"Pyspark Imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "139318cb",
"metadata": {},
"outputs": [],
"source": [
"### DO NOT MODIFY THIS CELL ###\n",
"import pyspark\n",
"from pyspark.sql import SQLContext\n",
"from pyspark.sql.functions import hour, when, col, date_format, to_timestamp, round, coalesce\n",
"from pyspark.sql.functions import *"
]
},
{
"cell_type": "markdown",
"id": "3fd9e0f8",
"metadata": {},
"source": [
"Initialize PySpark Context"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "b0c18c6c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Setting default log level to \"WARN\".\n",
"To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
"23/10/18 14:58:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
"23/10/18 14:58:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n",
"23/10/18 14:58:22 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n",
"23/10/18 14:58:22 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.\n",
"/usr/local/lib/python3.9/dist-packages/pyspark/sql/context.py:113: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.\n",
" warnings.warn(\n"
]
}
],
"source": [
"### DO NOT MODIFY THIS CELL ###\n",
"sc = pyspark.SparkContext(appName=\"HW3-Q1\")\n",
"sqlContext = SQLContext(sc)"
]
},
{
"cell_type": "markdown",
"id": "d68ae314",
"metadata": {},
"source": [
"Define function for loading data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7e5bbdda",
"metadata": {},
"outputs": [],
"source": [
"### DO NOT MODIFY THIS CELL ###\n",
"def load_data():\n",
" df = sqlContext.read.option(\"header\",True) \\\n",
" .csv(\"yellow_tripdata_2019-01_short.csv\")\n",
" return df"
]
},
{
"cell_type": "markdown",
"id": "0d52409d",
"metadata": {},
"source": [
"### Q1.a"
]
},
{
"cell_type": "markdown",
"id": "e43f6e00",
"metadata": {},
"source": [
"Perform data casting to clean incoming dataset"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "11f801b4",
"metadata": {},
"outputs": [],
"source": [
"def clean_data(df):\n",
"    '''\n",
"    Cast selected columns of the trip dataframe to numeric / timestamp types.\n",
"\n",
"    input: df a dataframe\n",
"    output: df a dataframe with all the original columns, where\n",
"        - passenger_count is cast to integer\n",
"        - total_amount, tip_amount, trip_distance, fare_amount are cast to float\n",
"        - tpep_pickup_datetime, tpep_dropoff_datetime are cast to timestamp\n",
"    '''\n",
"\n",
"    # START YOUR CODE HERE ---------\n",
"    # (column name, Spark SQL type name) pairs, applied in this order.\n",
"    column_types = [\n",
"        ('passenger_count', 'int'),\n",
"        ('total_amount', 'float'),\n",
"        ('tip_amount', 'float'),\n",
"        ('trip_distance', 'float'),\n",
"        ('fare_amount', 'float'),\n",
"        ('tpep_pickup_datetime', 'timestamp'),\n",
"        ('tpep_dropoff_datetime', 'timestamp'),\n",
"    ]\n",
"\n",
"    # withColumn with an existing name replaces that column with the cast one.\n",
"    for column_name, type_name in column_types:\n",
"        df = df.withColumn(column_name, df[column_name].cast(type_name))\n",
"    # END YOUR CODE HERE -----------\n",
"\n",
"    return df"
]
},
{
"cell_type": "markdown",
"id": "d4f565d0",
"metadata": {},
"source": [
"### Q1.b"
]
},
{
"cell_type": "markdown",
"id": "72b4f712",
"metadata": {},
"source": [
"Find the per-person rate based on how many passengers travel between each pickup and dropoff location pair."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "4e115152",
"metadata": {},
"outputs": [],
"source": [
"def common_pair(df):\n",
"    '''\n",
"    Rank pickup/dropoff location pairs by the number of passengers carried.\n",
"\n",
"    input: df a dataframe\n",
"    output: df a dataframe with following columns:\n",
"        - PULocationID\n",
"        - DOLocationID\n",
"        - passenger_count\n",
"        - per_person_rate\n",
"\n",
"    Trips whose pickup and dropoff location are equal are ignored.\n",
"    passenger_count is the total number of passengers over all remaining\n",
"    trips of a pair. per_person_rate is the pair's summed total_amount\n",
"    divided by that passenger_count. Only the 10 pairs with the highest\n",
"    passenger_count are returned; ties are ordered by descending\n",
"    per_person_rate.\n",
"    '''\n",
"\n",
"    # START YOUR CODE HERE ---------\n",
"    # Explicit alias: a bare sum() would only work because the notebook's\n",
"    # wildcard import of pyspark.sql.functions shadows Python's builtin sum.\n",
"    from pyspark.sql import functions as F\n",
"\n",
"    pair_cols = ['PULocationID', 'DOLocationID']\n",
"\n",
"    # Filter out any trips that have the same pick-up and drop-off location.\n",
"    cross_location_trips = df.filter(df.PULocationID != df.DOLocationID)\n",
"\n",
"    # One aggregation per location pair yields both totals directly, so no\n",
"    # window functions, distinct() or helper join are needed.\n",
"    pair_totals = cross_location_trips.groupBy(*pair_cols).agg(\n",
"        F.sum('passenger_count').alias('passenger_count'),\n",
"        F.sum('total_amount').alias('total_amount_sum'),\n",
"    )\n",
"\n",
"    # Average amount per passenger for the pair, then keep the top 10 pairs:\n",
"    # most passengers first, higher per_person_rate first among ties.\n",
"    df_output = (\n",
"        pair_totals\n",
"        .withColumn('per_person_rate', F.col('total_amount_sum') / F.col('passenger_count'))\n",
"        .select(*pair_cols, 'passenger_count', 'per_person_rate')\n",
"        .orderBy(F.col('passenger_count').desc(), F.col('per_person_rate').desc())\n",
"        .limit(10)\n",
"    )\n",
"    # END YOUR CODE HERE -----------\n",
"\n",
"    return df_output"
]
},
{
"cell_type": "markdown",
"id": "127574ab",
"metadata": {},
"source": [
"### Q1.c"
]
},
{
"cell_type": "markdown",
"id": "36a8fd27",
"metadata": {},
"source": [
"Find which trip distances generate the highest tip percentage."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "376c981c",
"metadata": {},
"outputs": [],
"source": [
"def distance_with_most_tip(df):\n",
" '''\n",
" input: df a dataframe\n",
" output: df a dataframe with following columns:\n",
" - trip_distance\n",
" - tip_percent\n",
" \n",
" trip_percent is the percent of tip out of fare_amount\n",
" \n",
" '''\n",
" \n",
" # START YOUR CODE HERE ---------\n",
" \n",
" # END YOUR CODE HERE -----------\n",
" \n",
" return df"
]
},
{
"cell_type": "markdown",
"id": "f0172fe6",
"metadata": {},
"source": [
"### Q1.d"
]
},
{
"cell_type": "markdown",
"id": "4613c906",
"metadata": {},
"source": [
"Determine the average speed at different times of day."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "abff9e24",
"metadata": {},
"outputs": [],
"source": [
"def time_with_most_traffic(df):\n",
" '''\n",
" input: df a dataframe\n",
" output: df a dataframe with following columns:\n",
" - time_of_day\n",
" - am_avg_speed\n",
" - pm_avg_speed\n",
" \n",
" trip_percent is the percent of tip out of fare_amount\n",
" \n",
" '''\n",
" \n",
" # START YOUR CODE HERE ---------\n",
"\n",
" # END YOUR CODE HERE -----------\n",
" \n",
" return df"
]
},
{
"cell_type": "markdown",
"id": "34cbd7b9",
"metadata": {},
"source": [
"### The below cells are for you to investigate your solutions and will not be graded\n",
"## Ensure they are commented out prior to submitting to Gradescope to avoid errors"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "bf9abefb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+------------+---------------+------------------+\n",
"|PULocationID|DOLocationID|passenger_count| per_person_rate|\n",
"+------------+------------+---------------+------------------+\n",
"| 239| 238| 62| 4.26274198870505|\n",
"| 237| 236| 60| 4.482500068346659|\n",
"| 263| 141| 52|3.4190384974846473|\n",
"| 161| 236| 42| 5.368571440378825|\n",
"| 148| 79| 42| 4.711904752822149|\n",
"| 142| 238| 39| 5.05487182812813|\n",
"| 141| 236| 37| 4.355675723101641|\n",
"| 239| 143| 37| 4.252162224537617|\n",
"| 239| 142| 35| 3.817714350564139|\n",
"| 79| 170| 34| 6.394705884596881|\n",
"+------------+------------+---------------+------------------+\n",
"\n"
]
}
],
"source": [
"# df = load_data()\n",
"# df = clean_data(df)\n",
"# common_pair(df).show()\n",
"# distance_with_most_tip(df).show()\n",
"# time_with_most_traffic(df).show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}