395 lines
12 KiB
Plaintext
395 lines
12 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# DVA HW4 Q1\n",
|
|
"## Pagerank Algorithm\n",
|
|
"\n",
|
|
"Do not distribute or publish this code \n",
|
|
"Do not change the `#export` statements or add and other code or comments above them. They are needed for grading."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#export\n",
|
|
"'''\n",
|
|
"*** Imports ***\n",
|
|
" DO NOT EDIT or add anything to this section\n",
|
|
"'''\n",
|
|
"import numpy as np\n",
|
|
"import time\n",
|
|
"import argparse\n",
|
|
"import sys"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Update your GT username and Id"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#export\n",
|
|
"def author(): \n",
|
|
" return \"tlou31\" # replace gburdell3 with your Georgia Tech username. \n",
|
|
" \n",
|
|
"def gtid(): \n",
|
|
" return 90366092 # replace with your GT ID number "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"'''\n",
|
|
"*** Utility function ***\n",
|
|
" DO NOT EDIT\n",
|
|
"'''\n",
|
|
"def dump_results(command, iterations, result):\n",
|
|
" print(\"Sorting...\", file=sys.stderr)\n",
|
|
" sorted_result = sorted(enumerate(result), key=lambda x: x[1], reverse=True)\n",
|
|
" output_result = \"node_id\\tpr_value\\n\"\n",
|
|
" \n",
|
|
" for node_id, pr_value in sorted_result[:10]:\n",
|
|
" output_result += \"{0}\\t{1}\\n\".format(node_id, pr_value)\n",
|
|
" print(output_result)\n",
|
|
"\n",
|
|
" with open(command+'_iter'+str(iterations)+\".txt\",\"w\") as output_file:\n",
|
|
" output_file.write(output_result)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### PageRank Class\n",
|
|
"Please add your code as indicated below \n",
|
|
"You do not need to add code outside of the indicated areas"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"#export\n",
|
|
"class PageRank:\n",
|
|
" def __init__(self, edge_file):\n",
|
|
"\n",
|
|
" self.node_degree = {}\n",
|
|
" self.max_node_id = 0\n",
|
|
" self.edge_file = edge_file\n",
|
|
"\n",
|
|
" def read_edge_file(self, edge_file):\n",
|
|
" with open(edge_file) as f:\n",
|
|
" for line in f:\n",
|
|
" val = line.split('\\t')\n",
|
|
" yield int(val[0]), int(val[1])\n",
|
|
"\n",
|
|
" \"\"\"\n",
|
|
" Step1: Calculate the out-degree of each node and maximum node_id of the graph.\n",
|
|
" Store the out-degree in class variable \"node_degree\" and maximum node id to \"max_node_id\".\n",
|
|
" \"\"\"\n",
|
|
" def calculate_node_degree(self):\n",
|
|
" for source,target in self.read_edge_file(self.edge_file):\n",
|
|
"\n",
|
|
" ### STEP 1\n",
|
|
" ### Implement your code here\n",
|
|
" #############################################\n",
|
|
" self.node_degree[source] = self.node_degree.get(source,0) + 1\n",
|
|
"\n",
|
|
" if source > self.max_node_id or target > self.max_node_id:\n",
|
|
" self.max_node_id = max(source, target)\n",
|
|
" \n",
|
|
" # #? missing values are 0?\n",
|
|
" for i in range(self.max_node_id):\n",
|
|
" self.node_degree[i] = self.node_degree.get(i,0) #? default of 1 to prevent ZeroDivisionError\n",
|
|
" #############################################\n",
|
|
"\n",
|
|
" print(\"Max node id: {}\".format(self.max_node_id))\n",
|
|
" print(\"Total source number: {}\".format(len(self.node_degree)))\n",
|
|
"\n",
|
|
" def get_max_node_id(self):\n",
|
|
" return self.max_node_id\n",
|
|
"\n",
|
|
" def run_pagerank(self, node_weights, damping_factor=0.85, iterations=10):\n",
|
|
" pr_values = [1.0 / (self.max_node_id + 1)] * (self.max_node_id + 1)\n",
|
|
" start_time = time.time()\n",
|
|
" \"\"\" \n",
|
|
" Step2: Implement pagerank algorithm as mentioned in lecture slides and the question.\n",
|
|
"\n",
|
|
" Incoming Parameters:\n",
|
|
" node_weights: Probability of each node to flyout during random walk\n",
|
|
" damping_factor: Probability of continuing on the random walk\n",
|
|
" iterations: Number of iterations to run the algorithm \n",
|
|
" check the __main__ function to understand node_weights and max_node_id\n",
|
|
" \n",
|
|
" Use the calculated out-degree to calculate the pagerank value of each node\n",
|
|
" \"\"\"\n",
|
|
" for it in range(iterations):\n",
|
|
" target_dict = {}\n",
|
|
" new_pr_values = [0.0] * (self.max_node_id + 1)\n",
|
|
" for source, target in self.read_edge_file(self.edge_file):\n",
|
|
"\n",
|
|
" ### STEP 2\n",
|
|
" ### Implement your code here\n",
|
|
" #############################################\n",
|
|
" new_pr_values[target] += pr_values[source]/self.node_degree[source] #* doing the sumation\n",
|
|
" \n",
|
|
" for target in range(self.max_node_id + 1):\n",
|
|
" new_pr_values[target] = (1-damping_factor) * node_weights[target] + damping_factor * new_pr_values[target]\n",
|
|
" pr_values = new_pr_values\n",
|
|
" #############################################\n",
|
|
"\n",
|
|
" print (\"Completed {0}/{1} iterations. {2} seconds elapsed.\".format(it + 1, iterations, time.time() - start_time))\n",
|
|
"\n",
|
|
" return pr_values"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Simplified Pagerank"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Max node id: 1157826\n",
|
|
"Total source number: 1157826\n",
|
|
"Completed 10/10 iterations. 54.33897852897644 seconds elapsed.\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Sorting...\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"node_id\tpr_value\n",
|
|
"663930\t0.0001724404640087142\n",
|
|
"482708\t9.409193541687526e-05\n",
|
|
"1034017\t5.480094973243448e-05\n",
|
|
"663559\t4.87472647440821e-05\n",
|
|
"357530\t4.561562351389135e-05\n",
|
|
"1120567\t4.487010028218946e-05\n",
|
|
"663605\t4.3482910860663785e-05\n",
|
|
"663682\t3.659436752793217e-05\n",
|
|
"1108860\t3.5276046684522986e-05\n",
|
|
"664788\t3.503597066140547e-05\n",
|
|
"\n",
|
|
"Max node id: 1157826\n",
|
|
"Total source number: 1157826\n",
|
|
"Completed 25/25 iterations. 134.005136013031 seconds elapsed.\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Sorting...\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"node_id\tpr_value\n",
|
|
"663930\t0.000172438581121727\n",
|
|
"482708\t9.409158864049335e-05\n",
|
|
"1034017\t5.4789410050796e-05\n",
|
|
"663559\t4.8746989429600646e-05\n",
|
|
"357530\t4.561544279769614e-05\n",
|
|
"1120567\t4.484712837377306e-05\n",
|
|
"663605\t4.348263426865656e-05\n",
|
|
"663682\t3.659435702898406e-05\n",
|
|
"1108860\t3.5251801674277104e-05\n",
|
|
"664788\t3.503570416132506e-05\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Options\n",
|
|
"file = 'network.tsv' # Input file of the edge list - DO NOT EDIT\n",
|
|
"command = 'simplified_pagerank' # Command to run - DO NOT EDIT\n",
|
|
"damping_factor = 0.85 # Damping factor - submit results for damping_factor = 0.85\n",
|
|
"iterations = [10,25] # Number of iterations - sumbit results for iterations = [10,25] \n",
|
|
"\n",
|
|
"# Run Simplified PageRank\n",
|
|
"# DO NOT EDIT\n",
|
|
"for i in iterations:\n",
|
|
" pr = PageRank(file)\n",
|
|
" pr.calculate_node_degree()\n",
|
|
" max_node_id = pr.get_max_node_id()\n",
|
|
" node_weights = np.ones(max_node_id + 1) / (max_node_id + 1)\n",
|
|
" result = pr.run_pagerank(node_weights=node_weights, iterations=i, damping_factor=damping_factor)\n",
|
|
" dump_results(command, i, result )"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Personalized Pagerank"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Max node id: 1157826\n",
|
|
"Total source number: 1157826\n",
|
|
"Completed 10/10 iterations. 49.452614307403564 seconds elapsed.\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Sorting...\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"node_id\tpr_value\n",
|
|
"663930\t0.00017322880787908225\n",
|
|
"482708\t9.357576846748939e-05\n",
|
|
"1034017\t5.4259326852399675e-05\n",
|
|
"663559\t4.839221909138372e-05\n",
|
|
"1120567\t4.7232106571505145e-05\n",
|
|
"357530\t4.518349739514661e-05\n",
|
|
"663605\t4.3426854381007356e-05\n",
|
|
"663682\t3.6597247542446524e-05\n",
|
|
"1108860\t3.598649813718684e-05\n",
|
|
"664788\t3.492282035011704e-05\n",
|
|
"\n",
|
|
"Max node id: 1157826\n",
|
|
"Total source number: 1157826\n",
|
|
"Completed 25/25 iterations. 134.96649599075317 seconds elapsed.\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Sorting...\n"
|
|
]
|
|
},
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"node_id\tpr_value\n",
|
|
"663930\t0.0001732269286946148\n",
|
|
"482708\t9.357542233989271e-05\n",
|
|
"1034017\t5.4247794139781025e-05\n",
|
|
"663559\t4.839194448977728e-05\n",
|
|
"1120567\t4.7209135769309005e-05\n",
|
|
"357530\t4.518331724955377e-05\n",
|
|
"663605\t4.342657846074319e-05\n",
|
|
"663682\t3.65972370712859e-05\n",
|
|
"1108860\t3.5962241269431685e-05\n",
|
|
"664788\t3.49225540296597e-05\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Options\n",
|
|
"file = 'network.tsv' # Input file of the edge list - DO NOT EDIT\n",
|
|
"command = 'personalized_pagerank' # Command to run - DO NOT EDIT\n",
|
|
"damping_factor = 0.85 # Damping factor - submit results for damping_factor = 0.85\n",
|
|
"iterations = [10,25] # Number of iterations - sumbit results for iterations = [10,25] \n",
|
|
"\n",
|
|
"# Run Personalized PageRank\n",
|
|
"# DO NOT EDIT\n",
|
|
"for i in iterations:\n",
|
|
" pr = PageRank(file)\n",
|
|
" pr.calculate_node_degree()\n",
|
|
" max_node_id = pr.get_max_node_id()\n",
|
|
" np.random.seed(gtid())\n",
|
|
" node_weights = np.random.rand(max_node_id + 1)\n",
|
|
" node_weights = node_weights/node_weights.sum()\n",
|
|
" result = pr.run_pagerank(node_weights=node_weights, iterations=i, damping_factor=damping_factor)\n",
|
|
" dump_results(command, i, result)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Submitting your results to gradescope\n",
|
|
"Submit the following on Gradescope \n",
|
|
"* This file, Q1.ipynb\n",
|
|
"* simplified_pagerank_iter10.txt\n",
|
|
"* simplified_pagerank_iter25.txt\n",
|
|
"* personalized_pagerank_iter10.txt\n",
|
|
"* personalized_pagerank_iter25.txt\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.7.17"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|