Files
louiscklaw 9035c1312b update,
2025-02-01 02:09:32 +08:00

343 lines
14 KiB
Python

import http.client
import json
import csv
#############################################################################################################################
#
# All instructions, code comments, etc. contained within this notebook are part of the assignment instructions.
# Portions of this file will be auto-graded in Gradescope using different sets of parameters / data to ensure that values are not
# hard-coded.
#
# Instructions: Implement all methods in this file that have a return
# value of 'NotImplemented'. See the documentation within each method for specific details, including
# the expected return value
#
# Helper Functions:
# You are permitted to write additional helper functions/methods or use additional instance variables within
# the `Graph` class or `TMDbAPIUtils` class so long as the originally included methods work as required.
#
# Use:
# The `Graph` class is used to represent and store the data for the TMDb co-actor network graph. This class must
# also provide some basic analytics, i.e., number of nodes, edges, and nodes with the highest degree.
#
# The `TMDbAPIUtils` class is used to retrieve Actor/Movie data using themoviedb.org API. We have provided a few necessary methods
# to test your code w/ the API, e.g.: get_move_detail(), get_movie_cast(), get_movie_credits_for_person(). You may add additional
# methods and instance variables as desired (see Helper Functions).
#
# The data that you retrieve from the TMDb API is used to build your graph using the Graph class. After you build your graph using the
# TMDb API data, use the Graph class write_edges_file & write_nodes_file methods to produce the separate nodes and edges
# .csv files for use with the Argo-Lite graph visualization tool.
#
# While building the co-actor graph, you will be required to write code to expand the graph by iterating
# through a portion of the graph nodes and finding similar artists using the TMDb API. We will not grade this code directly
# but will grade the resulting graph data in your Argo-Lite graph snapshot.
#
#############################################################################################################################
class Graph:
    """
    Undirected co-actor graph kept as two plain lists.

    self.nodes holds (id, name) tuples and self.edges holds
    (source_id, target_id) tuples. Edges are treated as undirected:
    ('a', 'b') and ('b', 'a') are the same edge, and self-loops are rejected
    by add_edge().
    """

    # Do not modify
    def __init__(self, with_nodes_file=None, with_edges_file=None):
        """
        option 1: init as an empty graph and add nodes
        option 2: init by specifying a path to nodes & edges files
        """
        self.nodes = []
        self.edges = []
        if with_nodes_file and with_edges_file:
            nodes_CSV = csv.reader(open(with_nodes_file))
            nodes_CSV = list(nodes_CSV)[1:]
            self.nodes = [(n[0],n[1]) for n in nodes_CSV]
            edges_CSV = csv.reader(open(with_edges_file))
            edges_CSV = list(edges_CSV)[1:]
            self.edges = [(e[0],e[1]) for e in edges_CSV]

    def add_node(self, id: str, name: str)->None:
        """
        add a tuple (id, name) representing a node to self.nodes if it does not already exist
        The graph should not contain any duplicate nodes

        :param id: node id (parameter name kept because callers pass it by keyword)
        :param name: display name stored alongside the id
        """
        node = (id, name)
        if node not in self.nodes:
            self.nodes.append(node)

    def add_edge(self, source: str, target: str)->None:
        """
        Add an edge between two nodes if it does not already exist.
        An edge is represented by a tuple containing two strings: e.g.: ('source', 'target').
        Where 'source' is the id of the source node and 'target' is the id of the target node
        e.g., for two nodes with ids 'a' and 'b' respectively, add the tuple ('a', 'b') to self.edges

        Self-loops (source == target) and edges already present in either
        direction are silently ignored.
        """
        # Cheap self-loop test first, then the two list scans.
        if source == target:
            return
        if (source, target) in self.edges or (target, source) in self.edges:
            return
        self.edges.append((source, target))

    def total_nodes(self)->int:
        """
        Returns an integer value for the total number of nodes in the graph
        """
        return len(self.nodes)

    def total_edges(self)->int:
        """
        Returns an integer value for the total number of edges in the graph
        """
        return len(self.edges)

    def max_degree_nodes(self)->dict:
        """
        Return the node(s) with the highest degree
        Return multiple nodes in the event of a tie
        Format is a dict where the key is the node_id and the value is an integer for the node degree
        e.g. {'a': 8}
        or {'a': 22, 'b': 22}

        Only ids present in self.nodes are counted. Returns an empty dict when
        no node has any edge (empty graph, or nodes without edges).
        """
        known_ids = {node_id for node_id, _ in self.nodes}
        degrees = {}
        # Single pass over the edges instead of scanning every edge per node.
        for edge in self.edges:
            # set() so that a self-loop loaded from a file counts once.
            for endpoint in set(edge):
                if endpoint in known_ids:
                    degrees[endpoint] = degrees.get(endpoint, 0) + 1
        if not degrees:
            # max() of an empty sequence would raise ValueError.
            return {}
        max_value = max(degrees.values())
        # Walk self.nodes so that ties come out in node insertion order.
        return {
            node_id: max_value
            for node_id, _ in self.nodes
            if degrees.get(node_id) == max_value
        }

    def print_nodes(self):
        """
        No further implementation required
        May be used for de-bugging if necessary
        """
        print(self.nodes)

    def print_edges(self):
        """
        No further implementation required
        May be used for de-bugging if necessary
        """
        print(self.edges)

    # Do not modify
    def write_edges_file(self, path="edges.csv")->None:
        """
        write all edges out as .csv
        :param path: string
        :return: None
        """
        edges_path = path
        edges_file = open(edges_path, 'w')
        edges_file.write("source" + "," + "target" + "\n")
        for e in self.edges:
            edges_file.write(e[0] + "," + e[1] + "\n")
        edges_file.close()
        print("finished writing edges to csv")

    # Do not modify
    def write_nodes_file(self, path="nodes.csv")->None:
        """
        write all nodes out as .csv
        :param path: string
        :return: None
        """
        nodes_path = path
        nodes_file = open(nodes_path, 'w')
        nodes_file.write("id,name" + "\n")
        for n in self.nodes:
            nodes_file.write(n[0] + "," + n[1] + "\n")
        nodes_file.close()
        print("finished writing nodes to csv")
class TMDBAPIUtils:
    """
    Small client for the two TMDb v3 endpoints used to build the co-actor
    graph: a movie's credits and a person's movie credits.
    """

    # Do not modify
    def __init__(self, api_key:str):
        self.api_key=api_key

    def _get_json(self, path: str) -> dict:
        """
        Issue a GET for `path` against api.themoviedb.org and decode the JSON body.

        The connection is always closed, even when the request or the decoding
        raises. A body that decodes to something other than a JSON object is
        returned as an empty dict so callers can use .get() unconditionally.
        """
        conn = http.client.HTTPSConnection("api.themoviedb.org")
        try:
            conn.request("GET", path)
            payload = json.loads(conn.getresponse().read())
        finally:
            conn.close()
        return payload if isinstance(payload, dict) else {}

    def get_movie_cast(self, movie_id:str, limit:int=None, exclude_ids:list=None) -> list:
        """
        Get the movie cast for a given movie id, with optional parameters to exclude an cast member
        from being returned and/or to limit the number of returned cast members
        documentation url: https://developers.themoviedb.org/3/movies/get-movie-credits
        :param integer movie_id: a movie_id
        :param integer limit: number of returned cast members by their 'order' attribute
        e.g., limit=5 will attempt to return the 5 cast members having 'order' attribute values between 0-4
        If there are fewer cast members than the specified limit or the limit not specified, return all cast members
        (a limit of None or 0 means "no limit")
        :param list exclude_ids: ids (not cast_ids) of cast members that should be excluded from the returned result
        e.g., if exclude_ids are [353, 455] then exclude these from any result.
        Ids are compared by their string form, so 353 and '353' both match.
        :rtype: list
        return a list with one dict per remaining cast member, exactly as found
        in the 'cast' array of the API response. An empty list is returned
        when the response has no 'cast' entry (e.g. an error payload).
        Important: the exclude_ids processing should occur prior to limiting output.
        """
        payload = self._get_json(
            "/3/movie/{0}/credits?api_key={1}&language=en-US".format(movie_id, self.api_key)
        )
        cast = payload.get('cast') or []
        # Normalise to strings: the API payload carries int ids, callers may hold str ids.
        excluded = {str(cast_id) for cast_id in exclude_ids} if exclude_ids else set()
        members = [member for member in cast if str(member['id']) not in excluded]
        if not limit:
            # No limit: keep everyone, whatever their 'order' value is.
            return members
        return [member for member in members if member['order'] < limit]

    def get_movie_credits_for_person(self, person_id:str, vote_avg_threshold:float=None)->list:
        """
        Using the TMDb API, get the movie credits for a person serving in a cast role
        documentation url: https://developers.themoviedb.org/3/people/get-person-movie-credits
        :param string person_id: the id of a person
        :param vote_avg_threshold: optional parameter to return the movie credit if it is >=
        the specified threshold.
        e.g., if the vote_avg_threshold is 5.0, then only return credits with a vote_avg >= 5.0
        :rtype: list
        return a list with one dict per movie credit, exactly as found in the
        'cast' array of the API response (the rating is read from each entry's
        'vote_average' key). An empty list is returned when the response has
        no 'cast' entry (e.g. an error payload).
        """
        payload = self._get_json(
            "/3/person/{0}/movie_credits?api_key={1}&language=en-US".format(person_id, self.api_key)
        )
        credits = payload.get('cast') or []
        if vote_avg_threshold is None:
            return list(credits)
        # A credit without a rating is treated as rated 0.0.
        return [
            credit for credit in credits
            if credit.get('vote_average', 0.0) >= vote_avg_threshold
        ]
#############################################################################################################################
#
# BUILDING YOUR GRAPH
#
# Working with the API: See use of http.request: https://docs.python.org/3/library/http.client.html#examples
#
# Using TMDb's API, build a co-actor network for the actor's/actress' highest rated movies
# In this graph, each node represents an actor
# An edge between any two nodes indicates that the two actors/actresses acted in a movie together
# i.e., they share a movie credit.
# e.g., An edge between Samuel L. Jackson and Robert Downey Jr. indicates that they have acted in one
# or more movies together.
#
# For this assignment, we are interested in a co-actor network of highly rated movies; specifically,
# we only want the top 3 co-actors in each movie credit of an actor having a vote average >= 8.0.
#
# You will need to add extra functions or code to accomplish this. We will not directly call or explicitly grade your
# algorithm. We will instead measure the correctness of your output by evaluating the data in your argo-lite graph
# snapshot.
#
# Build your co-actor graph on the actress 'Meryl Streep' w/ person_id 5064.
# Initialize a Graph object with a single node representing Meryl Streep
# Find all of Meryl Streep's movie credits that have a vote average >= 8.0
#
# 1. For each movie credit:
# get the movie cast members having an 'order' value between 0-2 (these are the co-actors)
# for each movie cast member:
# using graph.add_node(), add the movie cast member as a node (keep track of all new nodes added to the graph)
# using graph.add_edge(), add an edge between the Meryl Streep (actress) node
# and each new node (co-actor/co-actress)
#
#
# Using the nodes added in the first iteration (this excludes the original node of Meryl Streep!)
#
# 2. For each node (actor / actress) added in the previous iteration:
# get the movie credits for the actor that have a vote average >= 8.0
# for each movie credit:
# try to get the 3 movie cast members having an 'order' value between 0-2
# for each movie cast member:
# if the node doesn't already exist:
# add the node to the graph (track all new nodes added to the graph)
# if the edge does not exist:
# add an edge between the node (actor) and the new node (co-actor/co-actress)
#
#
# - Repeat the steps from # 2. until you have iterated 3 times to build an appropriately sized graph.
# - Your graph should not have any duplicate edges or nodes
# - Write out your finished graph as a nodes file and an edges file using
# graph.write_edges_file()
# graph.write_nodes_file()
#
# Exception handling and best practices
# - You should use the param 'language=en-US' in all API calls to avoid encoding issues when writing data to file.
# - If the actor name has a comma char ',' it should be removed to prevent extra columns from being inserted into the .csv file
# - Some movie_credits may actually be collections and do not return cast data. Handle this situation by skipping these instances.
# - While the TMDb API does not have a rate-limiting scheme in place, consider that making hundreds / thousands of calls
# can occasionally result in timeout errors. It may be necessary to insert periodic sleeps when you are building your graph.
def return_name()->str:
    """
    Give back the author's GT Username as a string
    (the login name such as gburdell3, never the 9 digit GTId).
    """
    gt_username = "psrinivasan48"
    return gt_username
def return_argo_lite_snapshot()->str:
    """
    Give back the shared URL under which the graph was published in Argo-Lite
    """
    snapshot_url = (
        "https://poloclub.github.io/argo-graph-lite/"
        "#3141c97d-3336-4966-bce8-c6b41d7faac0"
    )
    return snapshot_url
if __name__ == "__main__":
    # Build the co-actor graph around Meryl Streep (person_id 5064) in three
    # passes: the seed first, then twice more over the people discovered in
    # the previous pass. The result is written to edges.csv / nodes.csv.
    import os

    graph = Graph()
    graph.add_node(id='5064', name='Meryl Streep')
    # NOTE(security): an API key is committed in source. Prefer supplying it via
    # the TMDB_API_KEY environment variable; the literal is kept only as a
    # fallback so the script behaves as before when the variable is unset.
    tmdb_api_utils = TMDBAPIUtils(
        api_key=os.environ.get('TMDB_API_KEY', '08f2309ae878c65ce3df71cc04237e0b')
    )
    # Errors that mean "skip this person / movie" rather than "abort the crawl":
    # KeyError - response without cast data (e.g. a collection or error payload)
    # ValueError - body that is not valid JSON
    # OSError / HTTPException - network or protocol failure
    skippable_errors = (KeyError, ValueError, OSError, http.client.HTTPException)

    person_ids = ["5064"]
    # Everyone already queued at some point, so nobody is fetched twice.
    queued_persons = {"5064"}

    for _ in range(3):
        print("Persons for this iteration", len(person_ids))
        new_person_ids = []
        for q, person_id in enumerate(person_ids, start=1):
            print("Itr", q)
            try:
                movies = tmdb_api_utils.get_movie_credits_for_person(person_id, 8.0)
            except skippable_errors:
                continue
            for movie in movies:
                try:
                    # Cast ids in the API payload are ints, so exclude by int.
                    cast = tmdb_api_utils.get_movie_cast(
                        movie['id'], 3, exclude_ids=[int(person_id)]
                    )
                except skippable_errors:
                    continue
                for member in cast:
                    member_id = str(member['id'])
                    if member_id == person_id:
                        continue
                    # Drop non-ASCII characters and commas so the name cannot
                    # break the hand-written .csv output.
                    member_name = (
                        member['name']
                        .encode("ascii", "ignore")
                        .decode("ascii")
                        .replace(",", "")
                    )
                    graph.add_node(member_id, member_name)
                    graph.add_edge(person_id, member_id)
                    if member_id not in queued_persons:
                        queued_persons.add(member_id)
                        new_person_ids.append(member_id)
        print("Iteration done ", len(graph.nodes))
        person_ids = new_person_ids

    # call functions or place code here to build graph (graph building code not graded)
    graph.write_edges_file()
    graph.write_nodes_file()