{ "cells": [ { "cell_type": "markdown", "id": "e5905a69", "metadata": {}, "source": [ "# CSE6242 - HW3 - Q1" ] }, { "cell_type": "markdown", "id": "09289981", "metadata": {}, "source": [ "Pyspark Imports" ] }, { "cell_type": "code", "execution_count": 1, "id": "139318cb", "metadata": {}, "outputs": [], "source": [ "### DO NOT MODIFY THIS CELL ###\n", "import pyspark\n", "from pyspark.sql import SQLContext\n", "from pyspark.sql.functions import hour, when, col, date_format, to_timestamp, round, coalesce" ] }, { "cell_type": "markdown", "id": "3fd9e0f8", "metadata": {}, "source": [ "Initialize PySpark Context" ] }, { "cell_type": "code", "execution_count": 2, "id": "b0c18c6c", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Setting default log level to \"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", "23/10/18 14:31:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", "23/10/18 14:31:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n", "23/10/18 14:31:43 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n", "/usr/local/lib/python3.9/dist-packages/pyspark/sql/context.py:113: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.\n", " warnings.warn(\n" ] } ], "source": [ "### DO NOT MODIFY THIS CELL ###\n", "sc = pyspark.SparkContext(appName=\"HW3-Q1\")\n", "sqlContext = SQLContext(sc)" ] }, { "cell_type": "markdown", "id": "d68ae314", "metadata": {}, "source": [ "Define function for loading data" ] }, { "cell_type": "code", "execution_count": 3, "id": "7e5bbdda", "metadata": {}, "outputs": [], "source": [ "### DO NOT MODIFY THIS CELL ###\n", "def load_data():\n", " df = sqlContext.read.option(\"header\",True) \\\n", " .csv(\"yellow_tripdata_2019-01_short.csv\")\n", " return df" ] }, { "cell_type": "markdown", "id": "0d52409d", "metadata": {}, "source": [ "### Q1.a" ] }, { "cell_type": "markdown", "id": "e43f6e00", "metadata": {}, "source": [ "Perform data casting to clean incoming dataset" ] }, { "cell_type": "code", "execution_count": 4, "id": "11f801b4", "metadata": {}, "outputs": [], "source": [ "def clean_data(df):\n", " '''\n", " input: df a dataframe\n", " output: df a dataframe with the all the original columns\n", " '''\n", " \n", " # START YOUR CODE HERE ---------\n", " from pyspark.sql.types import StructField, StructType, IntegerType, TimestampType, FloatType, StringType\n", "\n", " df = df.withColumn(\"passenger_count\", df[\"passenger_count\"].cast(IntegerType()))\n", " df = df.withColumn(\"total_amount\", df[\"total_amount\"].cast(FloatType()))\n", " df = df.withColumn(\"tip_amount\", df[\"tip_amount\"].cast(FloatType()))\n", " df = df.withColumn(\"trip_distance\", df[\"trip_distance\"].cast(FloatType()))\n", " df = df.withColumn(\"fare_amount\", df[\"fare_amount\"].cast(FloatType()))\n", " df = df.withColumn(\"tpep_pickup_datetime\", df[\"tpep_pickup_datetime\"].cast(TimestampType()))\n", " df = df.withColumn(\"tpep_dropoff_datetime\", df[\"tpep_dropoff_datetime\"].cast(TimestampType()))\n", "\n", " # END YOUR CODE HERE -----------\n", " \n", " return df" ] }, { "cell_type": "markdown", "id": "d4f565d0", "metadata": {}, "source": [ "### Q1.b" ] }, { "cell_type": "markdown", "id": "72b4f712", "metadata": {}, "source": [ "Find rate per person for based on how many passengers travel between pickup and dropoff locations. " ] }, { "cell_type": "code", "execution_count": 5, "id": "4e115152", "metadata": {}, "outputs": [], "source": [ "def common_pair(df):\n", " '''\n", " input: df a dataframe\n", " output: df a dataframe with following columns:\n", " - PULocationID\n", " - DOLocationID\n", " - passenger_count\n", " - per_person_rate\n", " \n", " per_person_rate is the total_amount per person for a given pair.\n", " \n", " '''\n", " \n", " # START YOUR CODE HERE ---------\n", " \n", " # END YOUR CODE HERE -----------\n", " \n", " return df" ] }, { "cell_type": "markdown", "id": "127574ab", "metadata": {}, "source": [ "### Q1.c" ] }, { "cell_type": "markdown", "id": "36a8fd27", "metadata": {}, "source": [ "Find trips which trip distances generate the highest tip percentage." ] }, { "cell_type": "code", "execution_count": 6, "id": "376c981c", "metadata": {}, "outputs": [], "source": [ "def distance_with_most_tip(df):\n", " '''\n", " input: df a dataframe\n", " output: df a dataframe with following columns:\n", " - trip_distance\n", " - tip_percent\n", " \n", " trip_percent is the percent of tip out of fare_amount\n", " \n", " '''\n", " \n", " # START YOUR CODE HERE ---------\n", " \n", " # END YOUR CODE HERE -----------\n", " \n", " return df" ] }, { "cell_type": "markdown", "id": "f0172fe6", "metadata": {}, "source": [ "### Q1.d" ] }, { "cell_type": "markdown", "id": "4613c906", "metadata": {}, "source": [ "Determine the average speed at different times of day." ] }, { "cell_type": "code", "execution_count": 7, "id": "abff9e24", "metadata": {}, "outputs": [], "source": [ "def time_with_most_traffic(df):\n", " '''\n", " input: df a dataframe\n", " output: df a dataframe with following columns:\n", " - time_of_day\n", " - am_avg_speed\n", " - pm_avg_speed\n", " \n", " trip_percent is the percent of tip out of fare_amount\n", " \n", " '''\n", " \n", " # START YOUR CODE HERE ---------\n", "\n", " # END YOUR CODE HERE -----------\n", " \n", " return df" ] }, { "cell_type": "markdown", "id": "34cbd7b9", "metadata": {}, "source": [ "### The below cells are for you to investigate your solutions and will not be graded\n", "## Ensure they are commented out prior to submitting to Gradescope to avoid errors" ] }, { "cell_type": "code", "execution_count": 8, "id": "bf9abefb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+\n", "|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|\n", "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+\n", "| 1| 2019-01-01 00:46:40| 2019-01-01 00:53:20| 1| 1.5| 1| N| 151| 239| 1| 7.0| 0.5| 0.5| 1.65| 0| 0.3| 9.95| NULL|\n", "| 1| 2019-01-01 00:59:47| 2019-01-01 01:18:59| 1| 2.6| 1| N| 239| 246| 1| 14.0| 0.5| 0.5| 1.0| 0| 0.3| 16.3| NULL|\n", "| 2| 2018-12-21 13:48:30| 2018-12-21 13:52:40| 3| 0.0| 1| N| 236| 236| 1| 4.5| 0.5| 0.5| 0.0| 0| 0.3| 5.8| NULL|\n", "| 2| 2018-11-28 15:52:25| 2018-11-28 15:55:45| 5| 0.0| 1| N| 193| 193| 2| 3.5| 0.5| 0.5| 0.0| 0| 0.3| 7.55| NULL|\n", "| 2| 2018-11-28 15:56:57| 2018-11-28 15:58:33| 5| 0.0| 2| N| 193| 193| 2| 52.0| 0| 0.5| 0.0| 0| 0.3| 55.55| NULL|\n", "| 2| 2018-11-28 16:25:49| 2018-11-28 16:28:26| 5| 0.0| 1| N| 193| 193| 2| 3.5| 0.5| 0.5| 0.0| 5.76| 0.3| 13.31| NULL|\n", "| 2| 2018-11-28 16:29:37| 2018-11-28 16:33:43| 5| 0.0| 2| N| 193| 193| 2| 52.0| 0| 0.5| 0.0| 0| 0.3| 55.55| NULL|\n", "| 1| 2019-01-01 00:21:28| 2019-01-01 00:28:37| 1| 1.3| 1| N| 163| 229| 1| 6.5| 0.5| 0.5| 1.25| 0| 0.3| 9.05| NULL|\n", "| 1| 2019-01-01 00:32:01| 2019-01-01 00:45:39| 1| 3.7| 1| N| 229| 7| 1| 13.5| 0.5| 0.5| 3.7| 0| 0.3| 18.5| NULL|\n", "| 1| 2019-01-01 00:57:32| 2019-01-01 01:09:32| 2| 2.1| 1| N| 141| 234| 1| 10.0| 0.5| 0.5| 1.7| 0| 0.3| 13.0| NULL|\n", "| 1| 2019-01-01 00:24:04| 2019-01-01 00:47:06| 2| 2.8| 1| N| 246| 162| 1| 15.0| 0.5| 0.5| 3.25| 0| 0.3| 19.55| NULL|\n", "| 1| 2019-01-01 00:21:59| 2019-01-01 00:28:24| 1| 0.7| 1| N| 238| 151| 1| 5.5| 0.5| 0.5| 1.7| 0| 0.3| 8.5| NULL|\n", "| 1| 2019-01-01 00:45:21| 2019-01-01 01:31:05| 1| 8.7| 1| N| 163| 25| 1| 34.5| 0.5| 0.5| 7.15| 0| 0.3| 42.95| NULL|\n", "| 1| 2019-01-01 00:43:19| 2019-01-01 01:07:42| 1| 6.3| 1| N| 224| 25| 1| 21.5| 0.5| 0.5| 5.7| 0| 0.3| 28.5| NULL|\n", "| 1| 2019-01-01 00:58:24| 2019-01-01 01:15:18| 1| 2.7| 1| N| 141| 234| 1| 13.0| 0.5| 0.5| 1.0| 0| 0.3| 15.3| NULL|\n", "| 2| 2019-01-01 00:23:14| 2019-01-01 00:25:40| 1| 0.38| 1| N| 170| 170| 2| 3.5| 0.5| 0.5| 0.0| 0| 0.3| 4.8| NULL|\n", "| 2| 2019-01-01 00:39:51| 2019-01-01 00:48:02| 1| 0.55| 1| N| 170| 170| 1| 6.5| 0.5| 0.5| 1.95| 0| 0.3| 9.75| NULL|\n", "| 2| 2019-01-01 00:46:00| 2019-01-01 00:49:07| 1| 0.3| 1| N| 107| 107| 1| 4.0| 0.5| 0.5| 1.06| 0| 0.3| 6.36| NULL|\n", "| 2| 2019-01-01 00:57:45| 2019-01-01 01:03:51| 1| 1.42| 1| N| 170| 141| 1| 6.5| 0.5| 0.5| 1.56| 0| 0.3| 9.36| NULL|\n", "| 2| 2019-01-01 00:16:16| 2019-01-01 00:25:57| 1| 1.72| 1| N| 41| 247| 2| 9.0| 0.5| 0.5| 0.0| 0| 0.3| 10.3| NULL|\n", "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+\n", "only showing top 20 rows\n", "\n" ] } ], "source": [ "# df = load_data()\n", "# df = clean_data(df)" ] }, { "cell_type": "code", "execution_count": 9, "id": "cfa96f41", "metadata": {}, "outputs": [], "source": [ "# common_pair(df).show()" ] }, { "cell_type": "code", "execution_count": 10, "id": "8e42b46a", "metadata": {}, "outputs": [], "source": [ "# distance_with_most_tip(df).show()" ] }, { "cell_type": "code", "execution_count": 11, "id": "4f558c64", "metadata": {}, "outputs": [], "source": [ "# time_with_most_traffic(df).show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 5 }