import numpy as np
import time
import argparse
import sys
"""
Below is code for the PageRank algorithm (power iteration).
This code assumes that the node IDs start from 0 and are contiguous up to max_node_id.
You are required to implement the functionality in the space provided.
Because computing the adjacency matrix for large graph requires to load large graph dataset to
computer memory, thus, in order to calculate the PageRank value of each node, you need to iterate
over dataset multiple times and update the PageRank value based on equation mentioned in the question.
"""
def author():
    return "mpearl3"  # Georgia Tech username

def gtid():
    return 903365090  # GT ID number
class PageRank:
    def __init__(self, edge_file):
        self.node_degree = {}
        self.max_node_id = 0
        self.edge_file = edge_file
    def read_edge_file(self, edge_file):
        # Yield (source, target) integer pairs from the tab-separated edge list.
        with open(edge_file) as f:
            for line in f:
                val = line.split('\t')
                yield int(val[0]), int(val[1])
"""
Step1: Calculate the out-degree of each node and maximum node_id of the graph.
Store the out-degree in class variable "node_degree" and maximum node id to "max_node_id".
"""
    def calculate_node_degree(self):
        for source, target in self.read_edge_file(self.edge_file):
            ### Implement your code here
            #############################################
            # Count one outgoing edge for the source node.
            self.node_degree[source] = self.node_degree.get(source, 0) + 1
            # Track the largest node ID seen on either endpoint; checking only
            # the target would miss a source node with a higher ID.
            self.max_node_id = max(self.max_node_id, source, target)
            #############################################
        print("Max node id: {}".format(self.max_node_id))
    def get_max_node_id(self):
        return self.max_node_id
    def run_pagerank(self, node_weights, damping_factor=0.85, iterations=10):
        # Initialize every node's PageRank value uniformly.
        pr_values = [1.0 / (self.max_node_id + 1)] * (self.max_node_id + 1)
        start_time = time.time()
"""
Step2: Implement pagerank algorithm as mentioned in lecture slides and the question.
Incoming Parameters:
node_weights: Probability of each node to flyout during random walk
damping_factor: Probability of continuing on the random walk
iterations: Number of iterations to run the algorithm
check the __main__ function to understand node_weights and max_node_id
Use the calculated out-degree to calculate the pagerank value of each node
"""
        for it in range(iterations):
            target_dict = {}
            # Start each node at its flyout term. node_weights is a NumPy array,
            # so new_pr_values is one as well.
            new_pr_values = (1 - damping_factor) * node_weights
            for source, target in self.read_edge_file(self.edge_file):
                ### Implement your code here
                #############################################
                # Accumulate the rank mass each target receives over its in-edges.
                if target in target_dict:
                    target_dict[target] += pr_values[source] / self.node_degree[source]
                else:
                    target_dict[target] = pr_values[source] / self.node_degree[source]
            # Apply the damping factor to the accumulated mass.
            for i in target_dict:
                new_pr_values[i] += damping_factor * target_dict[i]
            pr_values = new_pr_values
            #############################################
            print("Completed {0}/{1} iterations. {2} seconds elapsed.".format(
                it + 1, iterations, time.time() - start_time))
        return pr_values
def dump_results(command, iterations, result):
    print("Sorting...", file=sys.stderr)
    # Sort nodes by PageRank value, highest first, and keep the top ten.
    sorted_result = sorted(enumerate(result), key=lambda x: x[1], reverse=True)
    output_result = "node_id\tpr_value\n"
    for node_id, pr_value in sorted_result[:10]:
        output_result += "{0}\t{1}\n".format(node_id, pr_value)
    print(output_result)
    with open(command + '_iter' + str(iterations) + ".txt", "w") as output_file:
        output_file.write(output_result)
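
# The file written above is named "<command>_iter<N>.txt", for example
# "simplified_pagerank_iter10.txt" when run with the default settings.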
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="sample command: python submission.py -i 5 -d 0.85 simplified_pagerank network.tsv")
    parser.add_argument("command", help="Sub-command to execute. Can be simplified_pagerank or personalized_pagerank.")
    parser.add_argument("filepath", help="path of the input graph file (network.tsv)")
    parser.add_argument("-i", "--iterations", dest="iterations",
                        help="specify the number of iterations of the algorithm. Default: 10",
                        default=10, type=int)
    parser.add_argument("-d", "--damping-factor", dest="damping_factor",
                        help="specify the damping factor for pagerank. Default: 0.85",
                        default=0.85, type=float)
    args = parser.parse_args()
    if args.command == "simplified_pagerank":
        pr = PageRank(args.filepath)
        pr.calculate_node_degree()
        max_node_id = pr.get_max_node_id()
        # Uniform flyout weights for simplified PageRank.
        node_weights = np.ones(max_node_id + 1) / (max_node_id + 1)
        result = pr.run_pagerank(node_weights=node_weights, iterations=args.iterations, damping_factor=args.damping_factor)
        dump_results(args.command, args.iterations, result)
    elif args.command == "personalized_pagerank":
        pr = PageRank(args.filepath)
        pr.calculate_node_degree()
        max_node_id = pr.get_max_node_id()
        # Seed with the GT ID so the personalized weights are reproducible,
        # then normalize the random weights into a probability distribution.
        np.random.seed(gtid())
        node_weights = np.random.rand(max_node_id + 1)
        node_weights = node_weights / node_weights.sum()
        result = pr.run_pagerank(node_weights=node_weights, iterations=args.iterations, damping_factor=args.damping_factor)
        dump_results(args.command, args.iterations, result)
    else:
        sys.exit("Incorrect command")
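
# Example usage (a sketch, assuming network.tsv is a tab-separated edge list
# with one "source<TAB>target" pair per line):
#
#     python submission.py simplified_pagerank network.tsv -i 10 -d 0.85
#     python submission.py personalized_pagerank network.tsv -i 10 -d 0.85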