import numpy as np
import time
import argparse
import sys
"""
Below is code for the PageRank algorithm (power iteration).
This code assumes that the node IDs start from 0 and are contiguous up to max_node_id.
You are required to implement the functionality in the space provided.
Because computing the adjacency matrix for large graph requires to load large graph dataset to
computer memory, thus, in order to calculate the PageRank value of each node, you need to iterate
over dataset multiple times and update the PageRank value based on equation mentioned in the question.
"""

def author():
    return "mvegesna3"  # replace with your Georgia Tech username


def gtid():
    return 903479040  # replace with your GT ID number

class PageRank:
    def __init__(self, edge_file):
        self.node_degree = {}
        self.max_node_id = 0
        self.edge_file = edge_file

    def read_edge_file(self, edge_file):
        with open(edge_file) as f:
            for line in f:
                val = line.split('\t')
                yield int(val[0]), int(val[1])
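
    # Expected edge-file format: one tab-separated "source<TAB>target" pair per
    # line. Illustrative example (not actual rows from network.tsv):
    #   0\t1
    #   0\t2
    #   1\t2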
"""
Step1: Calculate the out-degree of each node and maximum node_id of the graph.
Store the out-degree in class variable "node_degree" and maximum node id to "max_node_id".
"""
def calculate_node_degree(self):
for source,target in self.read_edge_file(self.edge_file):
### Implement your code here
#############################################
if source in self.node_degree:
self.node_degree[source] = self.node_degree[source] + 1
else:
self.node_degree[source] = 1
max_node = max(source, target, self.max_node_id)
self.max_node_id = max_node
#############################################
print("Max node id: {}".format(self.max_node_id))
def get_max_node_id(self):
return self.max_node_id
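
    # Worked example on a hypothetical 3-edge graph {0->1, 0->2, 1->2}:
    #   node_degree == {0: 2, 1: 1} and max_node_id == 2. Node 2 has no
    #   out-edges (dangling), so it never appears as a source and gets no
    #   node_degree entry; it only receives rank from other nodes.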

    def run_pagerank(self, node_weights, damping_factor=0.85, iterations=10):
        pr_values = [1.0 / (self.max_node_id + 1)] * (self.max_node_id + 1)
        start_time = time.time()
        """
        Step 2: Implement the PageRank algorithm as described in the lecture slides and the question.
        Incoming parameters:
            node_weights: probability of each node being the target of a flyout (teleport) during the random walk
            damping_factor: probability of continuing the random walk
            iterations: number of iterations to run the algorithm
        Check the __main__ function to understand node_weights and max_node_id.
        Use the calculated out-degrees to compute the PageRank value of each node.
        """
        for it in range(iterations):
            new_pr_values = [0.0] * (self.max_node_id + 1)
            for source, target in self.read_edge_file(self.edge_file):
                ### Implement your code here
                #############################################
                # Each edge passes a damped share of the source's current
                # PageRank mass to the target.
                new_pr_values[target] += damping_factor * pr_values[source] / self.node_degree[source]
                #############################################
            # Add the teleport term once per iteration, after all edge
            # contributions have been accumulated.
            pr_values = (1 - damping_factor) * node_weights + np.array(new_pr_values)
            print("Completed {0}/{1} iterations. {2} seconds elapsed.".format(it + 1, iterations, time.time() - start_time))
        return pr_values

def dump_results(command, iterations, result):
    print("Sorting...", file=sys.stderr)
    sorted_result = sorted(enumerate(result), key=lambda x: x[1], reverse=True)
    output_result = "node_id\tpr_value\n"
    for node_id, pr_value in sorted_result[:10]:
        output_result += "{0}\t{1}\n".format(node_id, pr_value)
    print(output_result)
    with open(command + '_iter' + str(iterations) + ".txt", "w") as output_file:
        output_file.write(output_result)
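
# Shape of the dumped output: a header line followed by the top-10 nodes by
# PageRank value, e.g. (node IDs and values are illustrative only):
#   node_id    pr_value
#   42         0.0017...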
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="sample command: python submission.py -i 5 -d 0.85 simplified_pagerank network.tsv")
parser.add_argument("command", help="Sub-command to execute. Can be simplified_pagerank or personalized_pagerank.")
parser.add_argument("filepath", help="path of the input graph file(network.tsv)")
parser.add_argument("-i", "--iterations", dest="iterations",
help="specify the number of iterations to the algorithm. Default: 10",
default=10, type=int)
parser.add_argument("-d", "--damping-factor", dest="damping_factor",
help="specify the damping factor for pagerank. Default: 0.85",
default=0.85, type=float)
args = parser.parse_args()
if args.command == "simplified_pagerank":
pr = PageRank(args.filepath)
pr.calculate_node_degree()
max_node_id = pr.get_max_node_id()
node_weights = np.ones(max_node_id + 1) / (max_node_id + 1)
result = pr.run_pagerank(node_weights=node_weights, iterations=args.iterations, damping_factor=args.damping_factor)
dump_results(args.command, args.iterations, result )
elif args.command == "personalized_pagerank":
pr = PageRank(args.filepath)
pr.calculate_node_degree()
max_node_id = pr.get_max_node_id()
np.random.seed(gtid())
node_weights = np.random.rand(max_node_id + 1)
node_weights = node_weights/node_weights.sum()
result = pr.run_pagerank(node_weights=node_weights, iterations=args.iterations, damping_factor=args.damping_factor)
dump_results(args.command, args.iterations, result)
else:
sys.exit("Incorrect command")