{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e5905a69",
   "metadata": {},
   "source": [
    "# CSE6242 - HW3 - Q1"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09289981",
   "metadata": {},
   "source": [
    "Pyspark Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "139318cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "### DO NOT MODIFY THIS CELL ###\n",
    "import pyspark\n",
    "from pyspark.sql import SQLContext\n",
    "from pyspark.sql.functions import hour, when, col, date_format, to_timestamp, round, coalesce"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3fd9e0f8",
   "metadata": {},
   "source": [
    "Initialize PySpark Context"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b0c18c6c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
      "23/10/18 14:31:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
      "23/10/18 14:31:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n",
      "23/10/18 14:31:43 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n",
      "/usr/local/lib/python3.9/dist-packages/pyspark/sql/context.py:113: FutureWarning: Deprecated in 3.0.0. Use SparkSession.builder.getOrCreate() instead.\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "### DO NOT MODIFY THIS CELL ###\n",
    "sc = pyspark.SparkContext(appName=\"HW3-Q1\")\n",
    "sqlContext = SQLContext(sc)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d68ae314",
   "metadata": {},
   "source": [
    "Define function for loading data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "7e5bbdda",
   "metadata": {},
   "outputs": [],
   "source": [
    "### DO NOT MODIFY THIS CELL ###\n",
    "def load_data():\n",
    "    df = sqlContext.read.option(\"header\",True) \\\n",
    "     .csv(\"yellow_tripdata_2019-01_short.csv\")\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0d52409d",
   "metadata": {},
   "source": [
    "### Q1.a"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e43f6e00",
   "metadata": {},
   "source": [
    "Perform data casting to clean incoming dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "11f801b4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_data(df):\n",
    "    '''\n",
    "    input: df a dataframe\n",
    "    output: df a dataframe with the all the original columns\n",
    "    '''\n",
    "    \n",
    "    # START YOUR CODE HERE ---------\n",
    "    from pyspark.sql.types import StructField, StructType, IntegerType, TimestampType, FloatType, StringType\n",
    "\n",
    "    df = df.withColumn(\"passenger_count\", df[\"passenger_count\"].cast(IntegerType()))\n",
    "    df = df.withColumn(\"total_amount\", df[\"total_amount\"].cast(FloatType()))\n",
    "    df = df.withColumn(\"tip_amount\", df[\"tip_amount\"].cast(FloatType()))\n",
    "    df = df.withColumn(\"trip_distance\", df[\"trip_distance\"].cast(FloatType()))\n",
    "    df = df.withColumn(\"fare_amount\", df[\"fare_amount\"].cast(FloatType()))\n",
    "    df = df.withColumn(\"tpep_pickup_datetime\", df[\"tpep_pickup_datetime\"].cast(TimestampType()))\n",
    "    df = df.withColumn(\"tpep_dropoff_datetime\", df[\"tpep_dropoff_datetime\"].cast(TimestampType()))\n",
    "\n",
    "    # END YOUR CODE HERE -----------\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d4f565d0",
   "metadata": {},
   "source": [
    "### Q1.b"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "72b4f712",
   "metadata": {},
   "source": [
    "Find rate per person for based on how many passengers travel between pickup and dropoff locations. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "4e115152",
   "metadata": {},
   "outputs": [],
   "source": [
    "def common_pair(df):\n",
    "    '''\n",
    "    input: df a dataframe\n",
    "    output: df a dataframe with following columns:\n",
    "            - PULocationID\n",
    "            - DOLocationID\n",
    "            - passenger_count\n",
    "            - per_person_rate\n",
    "            \n",
    "    per_person_rate is the total_amount per person for a given pair.\n",
    "    \n",
    "    '''\n",
    "    \n",
    "    # START YOUR CODE HERE ---------\n",
    "    \n",
    "    # END YOUR CODE HERE -----------\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "127574ab",
   "metadata": {},
   "source": [
    "### Q1.c"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "36a8fd27",
   "metadata": {},
   "source": [
    "Find trips which trip distances generate the highest tip percentage."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "376c981c",
   "metadata": {},
   "outputs": [],
   "source": [
    "def distance_with_most_tip(df):\n",
    "    '''\n",
    "    input: df a dataframe\n",
    "    output: df a dataframe with following columns:\n",
    "            - trip_distance\n",
    "            - tip_percent\n",
    "            \n",
    "    trip_percent is the percent of tip out of fare_amount\n",
    "    \n",
    "    '''\n",
    "    \n",
    "    # START YOUR CODE HERE ---------\n",
    "    \n",
    "    # END YOUR CODE HERE -----------\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f0172fe6",
   "metadata": {},
   "source": [
    "### Q1.d"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4613c906",
   "metadata": {},
   "source": [
    "Determine the average speed at different times of day."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "abff9e24",
   "metadata": {},
   "outputs": [],
   "source": [
    "def time_with_most_traffic(df):\n",
    "    '''\n",
    "    input: df a dataframe\n",
    "    output: df a dataframe with following columns:\n",
    "            - time_of_day\n",
    "            - am_avg_speed\n",
    "            - pm_avg_speed\n",
    "            \n",
    "    trip_percent is the percent of tip out of fare_amount\n",
    "    \n",
    "    '''\n",
    "    \n",
    "    # START YOUR CODE HERE ---------\n",
    "\n",
    "    # END YOUR CODE HERE -----------\n",
    "    \n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "34cbd7b9",
   "metadata": {},
   "source": [
    "### The below cells are for you to investigate your solutions and will not be graded\n",
    "## Ensure they are commented out prior to submitting to Gradescope to avoid errors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "bf9abefb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+\n",
      "|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|\n",
      "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+\n",
      "|       1| 2019-01-01 00:46:40|  2019-01-01 00:53:20|              1|          1.5|         1|                 N|         151|         239|           1|        7.0|  0.5|    0.5|      1.65|           0|                  0.3|        9.95|                NULL|\n",
      "|       1| 2019-01-01 00:59:47|  2019-01-01 01:18:59|              1|          2.6|         1|                 N|         239|         246|           1|       14.0|  0.5|    0.5|       1.0|           0|                  0.3|        16.3|                NULL|\n",
      "|       2| 2018-12-21 13:48:30|  2018-12-21 13:52:40|              3|          0.0|         1|                 N|         236|         236|           1|        4.5|  0.5|    0.5|       0.0|           0|                  0.3|         5.8|                NULL|\n",
      "|       2| 2018-11-28 15:52:25|  2018-11-28 15:55:45|              5|          0.0|         1|                 N|         193|         193|           2|        3.5|  0.5|    0.5|       0.0|           0|                  0.3|        7.55|                NULL|\n",
      "|       2| 2018-11-28 15:56:57|  2018-11-28 15:58:33|              5|          0.0|         2|                 N|         193|         193|           2|       52.0|    0|    0.5|       0.0|           0|                  0.3|       55.55|                NULL|\n",
      "|       2| 2018-11-28 16:25:49|  2018-11-28 16:28:26|              5|          0.0|         1|                 N|         193|         193|           2|        3.5|  0.5|    0.5|       0.0|        5.76|                  0.3|       13.31|                NULL|\n",
      "|       2| 2018-11-28 16:29:37|  2018-11-28 16:33:43|              5|          0.0|         2|                 N|         193|         193|           2|       52.0|    0|    0.5|       0.0|           0|                  0.3|       55.55|                NULL|\n",
      "|       1| 2019-01-01 00:21:28|  2019-01-01 00:28:37|              1|          1.3|         1|                 N|         163|         229|           1|        6.5|  0.5|    0.5|      1.25|           0|                  0.3|        9.05|                NULL|\n",
      "|       1| 2019-01-01 00:32:01|  2019-01-01 00:45:39|              1|          3.7|         1|                 N|         229|           7|           1|       13.5|  0.5|    0.5|       3.7|           0|                  0.3|        18.5|                NULL|\n",
      "|       1| 2019-01-01 00:57:32|  2019-01-01 01:09:32|              2|          2.1|         1|                 N|         141|         234|           1|       10.0|  0.5|    0.5|       1.7|           0|                  0.3|        13.0|                NULL|\n",
      "|       1| 2019-01-01 00:24:04|  2019-01-01 00:47:06|              2|          2.8|         1|                 N|         246|         162|           1|       15.0|  0.5|    0.5|      3.25|           0|                  0.3|       19.55|                NULL|\n",
      "|       1| 2019-01-01 00:21:59|  2019-01-01 00:28:24|              1|          0.7|         1|                 N|         238|         151|           1|        5.5|  0.5|    0.5|       1.7|           0|                  0.3|         8.5|                NULL|\n",
      "|       1| 2019-01-01 00:45:21|  2019-01-01 01:31:05|              1|          8.7|         1|                 N|         163|          25|           1|       34.5|  0.5|    0.5|      7.15|           0|                  0.3|       42.95|                NULL|\n",
      "|       1| 2019-01-01 00:43:19|  2019-01-01 01:07:42|              1|          6.3|         1|                 N|         224|          25|           1|       21.5|  0.5|    0.5|       5.7|           0|                  0.3|        28.5|                NULL|\n",
      "|       1| 2019-01-01 00:58:24|  2019-01-01 01:15:18|              1|          2.7|         1|                 N|         141|         234|           1|       13.0|  0.5|    0.5|       1.0|           0|                  0.3|        15.3|                NULL|\n",
      "|       2| 2019-01-01 00:23:14|  2019-01-01 00:25:40|              1|         0.38|         1|                 N|         170|         170|           2|        3.5|  0.5|    0.5|       0.0|           0|                  0.3|         4.8|                NULL|\n",
      "|       2| 2019-01-01 00:39:51|  2019-01-01 00:48:02|              1|         0.55|         1|                 N|         170|         170|           1|        6.5|  0.5|    0.5|      1.95|           0|                  0.3|        9.75|                NULL|\n",
      "|       2| 2019-01-01 00:46:00|  2019-01-01 00:49:07|              1|          0.3|         1|                 N|         107|         107|           1|        4.0|  0.5|    0.5|      1.06|           0|                  0.3|        6.36|                NULL|\n",
      "|       2| 2019-01-01 00:57:45|  2019-01-01 01:03:51|              1|         1.42|         1|                 N|         170|         141|           1|        6.5|  0.5|    0.5|      1.56|           0|                  0.3|        9.36|                NULL|\n",
      "|       2| 2019-01-01 00:16:16|  2019-01-01 00:25:57|              1|         1.72|         1|                 N|          41|         247|           2|        9.0|  0.5|    0.5|       0.0|           0|                  0.3|        10.3|                NULL|\n",
      "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# df = load_data()\n",
    "# df = clean_data(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "cfa96f41",
   "metadata": {},
   "outputs": [],
   "source": [
    "# common_pair(df).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8e42b46a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# distance_with_most_tip(df).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4f558c64",
   "metadata": {},
   "outputs": [],
   "source": [
    "# time_with_most_traffic(df).show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}