{ "cells": [ { "cell_type": "markdown", "id": "e5905a69", "metadata": {}, "source": [ "# CSE6242 - HW3 - Q1" ] }, { "cell_type": "markdown", "id": "09289981", "metadata": {}, "source": [ "Pyspark Imports" ] }, { "cell_type": "code", "execution_count": 1, "id": "139318cb", "metadata": {}, "outputs": [], "source": [ "### DO NOT MODIFY THIS CELL ###\n", "import pyspark\n", "from pyspark.sql import SQLContext\n", "from pyspark.sql.functions import hour, when, col, date_format, to_timestamp, round, coalesce\n", "from pyspark.sql.functions import *" ] }, { "cell_type": "markdown", "id": "3fd9e0f8", "metadata": {}, "source": [ "Initialize PySpark Context" ] }, { "cell_type": "code", "execution_count": 2, "id": "b0c18c6c", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Setting default log level to \"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", "23/10/18 14:58:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", "23/10/18 14:58:22 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n", "23/10/18 14:58:22 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n", "23/10/18 14:58:22 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.\n", "/usr/local/lib/python3.9/dist-packages/pyspark/sql/context.py:113: FutureWarning: Deprecated in 3.0.0. 
Use SparkSession.builder.getOrCreate() instead.\n", " warnings.warn(\n" ] } ], "source": [ "### DO NOT MODIFY THIS CELL ###\n", "sc = pyspark.SparkContext(appName=\"HW3-Q1\")\n", "sqlContext = SQLContext(sc)" ] }, { "cell_type": "markdown", "id": "d68ae314", "metadata": {}, "source": [ "Define function for loading data" ] }, { "cell_type": "code", "execution_count": 3, "id": "7e5bbdda", "metadata": {}, "outputs": [], "source": [ "### DO NOT MODIFY THIS CELL ###\n", "def load_data():\n", " df = sqlContext.read.option(\"header\",True) \\\n", " .csv(\"yellow_tripdata_2019-01_short.csv\")\n", " return df" ] }, { "cell_type": "markdown", "id": "0d52409d", "metadata": {}, "source": [ "### Q1.a" ] }, { "cell_type": "markdown", "id": "e43f6e00", "metadata": {}, "source": [ "Perform data casting to clean incoming dataset" ] }, { "cell_type": "code", "execution_count": 4, "id": "11f801b4", "metadata": {}, "outputs": [], "source": [ "def clean_data(df):\n", " '''\n", " input: df a dataframe\n", " output: df a dataframe with the all the original columns\n", " '''\n", " \n", " # START YOUR CODE HERE ---------\n", " from pyspark.sql.types import StructField, StructType, IntegerType, TimestampType, FloatType, StringType\n", "\n", " # Map each raw (string) column to the Spark SQL type it should carry.\n", " type_map = {\n", " 'passenger_count': IntegerType(),\n", " 'total_amount': FloatType(),\n", " 'tip_amount': FloatType(),\n", " 'trip_distance': FloatType(),\n", " 'fare_amount': FloatType(),\n", " 'tpep_pickup_datetime': TimestampType(),\n", " 'tpep_dropoff_datetime': TimestampType(),\n", " }\n", " # Cast one column at a time; every other column passes through unchanged.\n", " for column_name, spark_type in type_map.items():\n", " df = df.withColumn(column_name, df[column_name].cast(spark_type))\n", "\n", " # END YOUR CODE HERE -----------\n", " \n", " return df" ] }, { "cell_type": "markdown", "id": "d4f565d0", "metadata": {},
"source": [ "### Q1.b" ] }, { "cell_type": "markdown", "id": "72b4f712", "metadata": {}, "source": [ "Find rate per person based on how many passengers travel between pickup and dropoff locations. " ] }, { "cell_type": "code", "execution_count": 5, "id": "4e115152", "metadata": {}, "outputs": [], "source": [ "def common_pair(df):\n", " '''\n", " input: df a dataframe\n", " output: df a dataframe with following columns:\n", " - PULocationID\n", " - DOLocationID\n", " - passenger_count\n", " - per_person_rate\n", " \n", " per_person_rate is the total_amount per person for a given pair.\n", " \n", " '''\n", " \n", " # START YOUR CODE HERE ---------\n", " # Filter out any trips that have the same pick-up and drop-off location.\n", " df_pairs = df.filter(df.PULocationID != df.DOLocationID)\n", "\n", " # One row per (pickup, dropoff) pair: total passengers and total amount over all trips.\n", " # (Replaces the original window-sum + distinct() + redundant groupBy().count()/join,\n", " # which computed a 'count' column only to drop it again.)\n", " df_pairs = df_pairs.groupBy('PULocationID', 'DOLocationID').agg(\n", " sum('passenger_count').alias('passenger_count'),\n", " sum('total_amount').alias('total_amount_sum'))\n", "\n", " # Average amount per passenger over all trips between the pair.\n", " df_pairs = df_pairs.withColumn('per_person_rate', col('total_amount_sum') / col('passenger_count'))\n", " df_pairs = df_pairs.drop('total_amount_sum')\n", "\n", " # Top 10 pairs by total passengers; ties broken by per_person_rate, both descending.\n", " df_output = df_pairs.orderBy(['passenger_count', 'per_person_rate'], ascending=False).limit(10)\n", " # END YOUR CODE HERE -----------\n", " \n", " return df_output" ] }, { "cell_type": "markdown", "id": "127574ab", "metadata": {}, "source": [ "### Q1.c" ] }, { "cell_type": "markdown", "id": "36a8fd27", "metadata": {}, "source": [ "Find trips which trip distances generate the highest tip percentage."
] }, { "cell_type": "code", "execution_count": 6, "id": "376c981c", "metadata": {}, "outputs": [], "source": [ "def distance_with_most_tip(df):\n", " '''\n", " input: df a dataframe\n", " output: df a dataframe with following columns:\n", " - trip_distance\n", " - tip_percent\n", " \n", " tip_percent is the percent of tip out of fare_amount\n", " \n", " '''\n", " \n", " # START YOUR CODE HERE ---------\n", " # Keep trips where a tip percentage is well defined (positive fare, tip given).\n", " # NOTE(review): filters follow the docstring intent - confirm thresholds against the assignment spec.\n", " df_tips = df.filter((col('fare_amount') > 0) & (col('tip_amount') > 0))\n", "\n", " # Percent of the fare that was tipped, per trip.\n", " df_tips = df_tips.withColumn('tip_percent', col('tip_amount') * 100 / col('fare_amount'))\n", "\n", " # Bucket trips by rounded distance, then average the tip percentage per bucket.\n", " df_tips = df_tips.withColumn('trip_distance', round(col('trip_distance')))\n", " df = df_tips.groupBy('trip_distance') \\\n", " .agg(avg('tip_percent').alias('tip_percent')) \\\n", " .orderBy('tip_percent', ascending=False) \\\n", " .limit(15)\n", " # END YOUR CODE HERE -----------\n", " \n", " return df" ] }, { "cell_type": "markdown", "id": "f0172fe6", "metadata": {}, "source": [ "### Q1.d" ] }, { "cell_type": "markdown", "id": "4613c906", "metadata": {}, "source": [ "Determine the average speed at different times of day." ] }, { "cell_type": "code", "execution_count": 7, "id": "abff9e24", "metadata": {}, "outputs": [], "source": [ "def time_with_most_traffic(df):\n", " '''\n", " input: df a dataframe\n", " output: df a dataframe with following columns:\n", " - time_of_day\n", " - am_avg_speed\n", " - pm_avg_speed\n", " \n", " am_avg_speed and pm_avg_speed are the average trip speeds per hour of day.\n", " \n", " '''\n", " \n", " # START YOUR CODE HERE ---------\n", " # Trip duration in hours; speed is trip_distance per hour.\n", " df_speed = df.withColumn('duration_hours', (col('tpep_dropoff_datetime').cast('long') - col('tpep_pickup_datetime').cast('long')) / 3600)\n", " df_speed = df_speed.withColumn('speed', col('trip_distance') / col('duration_hours'))\n", "\n", " # Pickup hour on a 12-hour clock plus an AM/PM marker.\n", " df_speed = df_speed.withColumn('time_of_day', hour(col('tpep_pickup_datetime')) % 12)\n", " df_speed = df_speed.withColumn('am_pm', date_format(col('tpep_pickup_datetime'), 'a'))\n", "\n", " # Average speed per hour, split into AM and PM columns aligned on time_of_day.\n", " df_am = df_speed.filter(col('am_pm') == 'AM').groupBy('time_of_day').agg(avg('speed').alias('am_avg_speed'))\n", " df_pm = df_speed.filter(col('am_pm') == 'PM').groupBy('time_of_day').agg(avg('speed').alias('pm_avg_speed'))\n", " df = df_am.join(df_pm, 'time_of_day', 'outer').orderBy('time_of_day')\n", " # END YOUR CODE HERE -----------\n", " \n", " return df" ] }, { "cell_type": "markdown", "id": "34cbd7b9", "metadata": {}, "source": [ "### The below cells are for you to investigate your solutions and will not be graded\n", "## Ensure they are commented out prior to submitting to Gradescope to avoid errors" ] }, { "cell_type": "code", "execution_count": 8, "id": "bf9abefb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------------+------------+---------------+------------------+\n", "|PULocationID|DOLocationID|passenger_count| per_person_rate|\n", "+------------+------------+---------------+------------------+\n", "| 239| 238| 62| 4.26274198870505|\n", "| 237| 236| 60| 4.482500068346659|\n", "| 263| 141| 52|3.4190384974846473|\n", "| 161| 236| 42| 5.368571440378825|\n", "| 148| 79| 42| 
4.711904752822149|\n", "| 142| 238| 39| 5.05487182812813|\n", "| 141| 236| 37| 4.355675723101641|\n", "| 239| 143| 37| 4.252162224537617|\n", "| 239| 142| 35| 3.817714350564139|\n", "| 79| 170| 34| 6.394705884596881|\n", "+------------+------------+---------------+------------------+\n", "\n" ] } ], "source": [ "# df = load_data()\n", "# df = clean_data(df)\n", "# common_pair(df).show()\n", "# distance_with_most_tip(df).show()\n", "# time_with_most_traffic(df).show()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 5 }