tunmnlu/task_04/hw4-workspace/Q1/submit_01/Q1.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# DVA HW4 Q1\n",
    "## Pagerank Algorithm\n",
    "\n",
    "Do not distribute or publish this code  \n",
    "Do not change the `#export` statements or add and other code or comments above them. They are needed for grading."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "'''\n",
    "*** Imports ***\n",
    "    DO NOT EDIT or add anything to this section\n",
    "'''\n",
    "import numpy as np\n",
    "import time\n",
    "import argparse\n",
    "import sys"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Update your GT username and Id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "def author():                                                                                             \n",
    "        return \"tlou31\"      # replace gburdell3 with your Georgia Tech username.                                                                                             \n",
    "                                                                                              \n",
    "def gtid():                                                                                               \n",
    "    return 90366092            # replace with your GT ID number   "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "*** Utility function ***\n",
    "    DO NOT EDIT\n",
    "'''\n",
    "def dump_results(command, iterations, result):\n",
    "    print(\"Sorting...\", file=sys.stderr)\n",
    "    sorted_result = sorted(enumerate(result), key=lambda x: x[1], reverse=True)\n",
    "    output_result = \"node_id\\tpr_value\\n\"\n",
    "    for node_id, pr_value in sorted_result[:10]:\n",
    "        output_result += \"{0}\\t{1}\\n\".format(node_id, pr_value)\n",
    "    print(output_result)\n",
    "\n",
    "    with open(command+'_iter'+str(iterations)+\".txt\",\"w\") as output_file:\n",
    "        output_file.write(output_result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### PageRank Class\n",
    "Please add your code as indicated below  \n",
    "You do not need to add code outside of the indicated areas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#export\n",
    "class PageRank:\n",
    "    def __init__(self, edge_file):\n",
    "\n",
    "        self.node_degree = {}\n",
    "        self.max_node_id = 0\n",
    "        self.edge_file = edge_file\n",
    "\n",
    "    def read_edge_file(self, edge_file):\n",
    "        with open(edge_file) as f:\n",
    "            for line in f:\n",
    "                val = line.split('\\t')\n",
    "                yield int(val[0]), int(val[1])\n",
    "\n",
    "    \"\"\"\n",
    "    Step1: Calculate the out-degree of each node and maximum node_id of the graph.\n",
    "    Store the out-degree in class variable \"node_degree\" and maximum node id to \"max_node_id\".\n",
    "    \"\"\"\n",
    "    def calculate_node_degree(self):\n",
    "        for source,target in self.read_edge_file(self.edge_file):\n",
    "\n",
    "        ### STEP 1\n",
    "        ### Implement your code here\n",
    "        #############################################\n",
    "            if source in self.node_degree.keys():\n",
    "                self.node_degree[source]=self.node_degree[source]+1\n",
    "            else:\n",
    "                self.node_degree[source]=1\n",
    "\n",
    "            if self.max_node_id == 0:\n",
    "                self.max_node_id = target\n",
    "            elif target > self.max_node_id:\n",
    "                self.max_node_id = target\n",
    "            else:\n",
    "                continue\n",
    "        #############################################\n",
    "\n",
    "        print(\"Max node id: {}\".format(self.max_node_id))\n",
    "        print(\"Total source number: {}\".format(len(self.node_degree)))\n",
    "\n",
    "    def get_max_node_id(self):\n",
    "        return self.max_node_id\n",
    "\n",
    "    def run_pagerank(self, node_weights,  damping_factor=0.85, iterations=10):\n",
    "\n",
    "        pr_values = [1.0 / (self.max_node_id + 1)] * (self.max_node_id + 1)\n",
    "        start_time = time.time()\n",
    "        \"\"\" \n",
    "        Step2: Implement pagerank algorithm as mentioned in lecture slides and the question.\n",
    "\n",
    "        Incoming Parameters:\n",
    "            node_weights: Probability of each node to flyout during random walk\n",
    "            damping_factor: Probability of continuing on the random walk\n",
    "            iterations: Number of iterations to run the algorithm \n",
    "            check the __main__ function to understand node_weights and max_node_id\n",
    "        \n",
    "        Use the calculated out-degree to calculate the pagerank value of each node\n",
    "        \"\"\"\n",
    "        for it in range(iterations):\n",
    "            target_dict = {}\n",
    "            new_pr_values = [0.0] * (self.max_node_id + 1)\n",
    "            for source, target in self.read_edge_file(self.edge_file):\n",
    "\n",
    "        ### STEP 2\n",
    "        ### Implement your code here\n",
    "        #############################################\n",
    "                if target in target_dict:\n",
    "                    target_dict[target] += pr_values[source]/self.node_degree[source]\n",
    "                else:\n",
    "                    target_dict[target] = pr_values[source]/self.node_degree[source]\n",
    "\n",
    "            for i in target_dict:\n",
    "                new_pr_values[i] += damping_factor*target_dict[i]\n",
    "            pr_values= new_pr_values\n",
    "                    \n",
    "        #############################################\n",
    "\n",
    "        print (\"Completed {0}/{1} iterations. {2} seconds elapsed.\".format(it + 1, iterations, time.time() - start_time))\n",
    "\n",
    "        return pr_values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Simplified Pagerank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Max node id: 1157826\n",
      "Total source number: 374785\n",
      "Completed 10/10 iterations. 50.050206899642944 seconds elapsed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Sorting...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node_id\tpr_value\n",
      "1144073\t4.744002382655239e-07\n",
      "1141016\t2.2677975268325195e-07\n",
      "1145498\t1.726726382312939e-07\n",
      "1157525\t1.708365954499269e-07\n",
      "1146845\t1.5240844181090655e-07\n",
      "1141026\t1.5237737223394173e-07\n",
      "1141019\t1.359932555288619e-07\n",
      "1139212\t1.2123397693864033e-07\n",
      "1137241\t1.0178198237707305e-07\n",
      "1095898\t1.0165991209723209e-07\n",
      "\n",
      "Max node id: 1157826\n",
      "Total source number: 374785\n",
      "Completed 25/25 iterations. 123.08520412445068 seconds elapsed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Sorting...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node_id\tpr_value\n",
      "1144073\t8.900245896330209e-16\n",
      "1146845\t2.702531633324097e-16\n",
      "1141026\t2.7025306757591484e-16\n",
      "1141016\t1.9665881620081745e-16\n",
      "1151950\t1.3061480152617837e-16\n",
      "1157777\t1.2280859633505984e-16\n",
      "1157778\t1.2280859633505984e-16\n",
      "1141019\t1.2123347555220871e-16\n",
      "1153989\t1.1293257104264037e-16\n",
      "1144067\t1.1133253816963228e-16\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Options\n",
    "file = 'network.tsv'                # Input file of the edge list - DO NOT EDIT\n",
    "command = 'simplified_pagerank'     # Command to run - DO NOT EDIT\n",
    "damping_factor = 0.85               # Damping factor - submit results for damping_factor = 0.85\n",
    "iterations = [10,25]                # Number of iterations - sumbit results for iterations = [10,25] \n",
    "\n",
    "# Run Simplified PageRank\n",
    "# DO NOT EDIT\n",
    "for i in iterations:\n",
    "    pr = PageRank(file)\n",
    "    pr.calculate_node_degree()\n",
    "    max_node_id = pr.get_max_node_id()\n",
    "    node_weights = np.ones(max_node_id + 1) / (max_node_id + 1)\n",
    "    result = pr.run_pagerank(node_weights=node_weights, iterations=i, damping_factor=damping_factor)\n",
    "    dump_results(command, i, result )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Personalized Pagerank"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Max node id: 1157826\n",
      "Total source number: 374785\n",
      "Completed 10/10 iterations. 46.98482871055603 seconds elapsed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Sorting...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node_id\tpr_value\n",
      "1144073\t4.744002382655239e-07\n",
      "1141016\t2.2677975268325195e-07\n",
      "1145498\t1.726726382312939e-07\n",
      "1157525\t1.708365954499269e-07\n",
      "1146845\t1.5240844181090655e-07\n",
      "1141026\t1.5237737223394173e-07\n",
      "1141019\t1.359932555288619e-07\n",
      "1139212\t1.2123397693864033e-07\n",
      "1137241\t1.0178198237707305e-07\n",
      "1095898\t1.0165991209723209e-07\n",
      "\n",
      "Max node id: 1157826\n",
      "Total source number: 374785\n",
      "Completed 25/25 iterations. 120.86021399497986 seconds elapsed.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Sorting...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "node_id\tpr_value\n",
      "1144073\t8.900245896330209e-16\n",
      "1146845\t2.702531633324097e-16\n",
      "1141026\t2.7025306757591484e-16\n",
      "1141016\t1.9665881620081745e-16\n",
      "1151950\t1.3061480152617837e-16\n",
      "1157777\t1.2280859633505984e-16\n",
      "1157778\t1.2280859633505984e-16\n",
      "1141019\t1.2123347555220871e-16\n",
      "1153989\t1.1293257104264037e-16\n",
      "1144067\t1.1133253816963228e-16\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Options\n",
    "file = 'network.tsv'                # Input file of the edge list - DO NOT EDIT\n",
    "command = 'personalized_pagerank'   # Command to run - DO NOT EDIT\n",
    "damping_factor = 0.85               # Damping factor - submit results for damping_factor = 0.85\n",
    "iterations = [10,25]                # Number of iterations - sumbit results for iterations = [10,25] \n",
    "\n",
    "# Run Personalized PageRank\n",
    "# DO NOT EDIT\n",
    "for i in iterations:\n",
    "    pr = PageRank(file)\n",
    "    pr.calculate_node_degree()\n",
    "    max_node_id = pr.get_max_node_id()\n",
    "    np.random.seed(gtid())\n",
    "    node_weights = np.random.rand(max_node_id + 1)\n",
    "    node_weights = node_weights/node_weights.sum()\n",
    "    result = pr.run_pagerank(node_weights=node_weights, iterations=i, damping_factor=damping_factor)\n",
    "    dump_results(command, i, result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Submitting your results to gradescope\n",
    "Submit the following on Gradescope  \n",
    "* This file, Q1.ipynb\n",
    "* simplified_pagerank_iter10.txt\n",
    "* simplified_pagerank_iter25.txt\n",
    "* personalized_pagerank_iter10.txt\n",
    "* personalized_pagerank_iter25.txt\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}