import numpy as np
import time
import argparse
import sys

"""
Below is code for the PageRank algorithm (power iteration). This code assumes
that node IDs start from 0 and are contiguous up to max_node_id. You are
required to implement the functionality in the space provided. Building the
adjacency matrix for a large graph would require loading the whole dataset
into memory, so to calculate the PageRank value of each node you instead
iterate over the dataset multiple times and update the PageRank values based
on the equation mentioned in the question.
"""


def author():
    return "mpearl3"  # replace gburdell3 with your Georgia Tech username.


def gtid():
    return 903365090  # replace with your GT ID number


class PageRank:
    def __init__(self, edge_file):
        self.node_degree = {}
        self.max_node_id = 0
        self.edge_file = edge_file

    def read_edge_file(self, edge_file):
        # Stream (source, target) pairs from a tab-separated edge file so the
        # whole graph never has to be held in memory.
        with open(edge_file) as f:
            for line in f:
                val = line.split('\t')
                yield int(val[0]), int(val[1])

    """
    Step 1: Calculate the out-degree of each node and the maximum node id of
    the graph. Store the out-degrees in the class variable "node_degree" and
    the maximum node id in "max_node_id".
    """
    def calculate_node_degree(self):
        for source, target in self.read_edge_file(self.edge_file):
            ### Implement your code here
            #############################################
            # Count one outgoing edge for the source node.
            self.node_degree[source] = self.node_degree.get(source, 0) + 1
            # Track the largest id seen on either endpoint of the edge, so a
            # source node with the highest id is not missed.
            self.max_node_id = max(self.max_node_id, source, target)
            #############################################
        print("Max node id: {}".format(self.max_node_id))

    def get_max_node_id(self):
        return self.max_node_id

    def run_pagerank(self, node_weights, damping_factor=0.85, iterations=10):
        pr_values = [1.0 / (self.max_node_id + 1)] * (self.max_node_id + 1)
        start_time = time.time()
        """
        Step 2: Implement the PageRank algorithm as described in the lecture
        slides and the question.

        Incoming parameters:
            node_weights:   probability of each node to fly out to during the
                            random walk
            damping_factor: probability of continuing on the random walk
            iterations:     number of iterations to run the algorithm

        Check the __main__ function to understand node_weights and
        max_node_id. Use the calculated out-degrees to compute the PageRank
        value of each node.
        """
        for it in range(iterations):
            target_dict = {}
            # Start every node at its flyout share, (1 - d) * w_i. Since
            # node_weights is a NumPy array, this creates a fresh array each
            # iteration.
            new_pr_values = (1 - damping_factor) * node_weights
            for source, target in self.read_edge_file(self.edge_file):
                ### Implement your code here
                #############################################
                # Each source spreads its current PageRank value evenly
                # across its outgoing edges.
                contribution = pr_values[source] / self.node_degree[source]
                target_dict[target] = target_dict.get(target, 0.0) + contribution
            # Add the damped incoming contributions to each target node.
            for node in target_dict:
                new_pr_values[node] += damping_factor * target_dict[node]
            pr_values = new_pr_values
            #############################################
            print("Completed {0}/{1} iterations. {2} seconds elapsed.".format(
                it + 1, iterations, time.time() - start_time))
        return pr_values
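
# ---------------------------------------------------------------------------
# Illustrative aside, not part of the required submission: when the edge list
# fits in memory as NumPy arrays, one power-iteration step above is
# equivalent to a single vectorized update. This is a minimal sketch; the
# arrays "sources", "targets" (one entry per edge) and "out_degree" (indexed
# by node id) are hypothetical inputs built from the same edge file.
# ---------------------------------------------------------------------------
def vectorized_pagerank_step(pr_values, sources, targets, out_degree,
                             node_weights, damping_factor=0.85):
    """One step of pr' = (1 - d) * w + d * (incoming contributions)."""
    # Each edge carries pr[source] / out_degree[source] to its target node.
    contributions = pr_values[sources] / out_degree[sources]
    new_pr = (1.0 - damping_factor) * node_weights
    # np.add.at accumulates repeated target indices, matching the
    # target_dict sums in run_pagerank above.
    np.add.at(new_pr, targets, damping_factor * contributions)
    return new_pr
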
def dump_results(command, iterations, result):
    print("Sorting...", file=sys.stderr)
    # Sort nodes by PageRank value, highest first, and report the top 10.
    sorted_result = sorted(enumerate(result), key=lambda x: x[1], reverse=True)
    output_result = "node_id\tpr_value\n"
    for node_id, pr_value in sorted_result[:10]:
        output_result += "{0}\t{1}\n".format(node_id, pr_value)
    print(output_result)
    with open(command + '_iter' + str(iterations) + ".txt", "w") as output_file:
        output_file.write(output_result)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="sample command: python submission.py -i 5 -d 0.85 simplified_pagerank network.tsv")
    parser.add_argument("command",
                        help="Sub-command to execute. Can be simplified_pagerank or personalized_pagerank.")
    parser.add_argument("filepath", help="path of the input graph file (network.tsv)")
    parser.add_argument("-i", "--iterations", dest="iterations",
                        help="specify the number of iterations to run the algorithm. Default: 10",
                        default=10, type=int)
    parser.add_argument("-d", "--damping-factor", dest="damping_factor",
                        help="specify the damping factor for pagerank. Default: 0.85",
                        default=0.85, type=float)
    args = parser.parse_args()

    if args.command == "simplified_pagerank":
        pr = PageRank(args.filepath)
        pr.calculate_node_degree()
        max_node_id = pr.get_max_node_id()
        # Uniform flyout distribution: every node is equally likely.
        node_weights = np.ones(max_node_id + 1) / (max_node_id + 1)
        result = pr.run_pagerank(node_weights=node_weights,
                                 iterations=args.iterations,
                                 damping_factor=args.damping_factor)
        dump_results(args.command, args.iterations, result)
    elif args.command == "personalized_pagerank":
        pr = PageRank(args.filepath)
        pr.calculate_node_degree()
        max_node_id = pr.get_max_node_id()
        # Personalized flyout distribution: random weights normalized to sum
        # to 1, seeded with the GT ID so results are reproducible.
        np.random.seed(gtid())
        node_weights = np.random.rand(max_node_id + 1)
        node_weights = node_weights / node_weights.sum()
        result = pr.run_pagerank(node_weights=node_weights,
                                 iterations=args.iterations,
                                 damping_factor=args.damping_factor)
        dump_results(args.command, args.iterations, result)
    else:
        sys.exit("Incorrect command")
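
# ---------------------------------------------------------------------------
# Optional sanity check, not wired into the CLI above: a minimal sketch that
# writes a hypothetical 3-node cycle graph to a temporary file and verifies
# that the returned values remain a probability distribution (they sum to ~1
# whenever the graph has no dangling nodes). Run it manually, e.g. from an
# interactive session, to spot-check the implementation.
# ---------------------------------------------------------------------------
def _sanity_check():
    import os
    import tempfile
    edges = [(0, 1), (1, 2), (2, 0)]
    with tempfile.NamedTemporaryFile("w", suffix=".tsv", delete=False) as f:
        for s, t in edges:
            f.write("{0}\t{1}\n".format(s, t))
        path = f.name
    try:
        toy = PageRank(path)
        toy.calculate_node_degree()
        n = toy.get_max_node_id() + 1
        weights = np.ones(n) / n
        pr = toy.run_pagerank(node_weights=weights, iterations=20)
        # On a symmetric 3-cycle, every node should converge to 1/3.
        assert abs(sum(pr) - 1.0) < 1e-6
    finally:
        os.remove(path)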