########################### DO NOT MODIFY THIS SECTION ##########################
#################################################################################
import sqlite3
from sqlite3 import Error
import csv
#################################################################################

## Change to False to disable Sample
SHOW = True

############### SAMPLE CLASS AND SQL QUERY ###########################
######################################################################
class Sample():
    def sample(self):
        try:
            connection = sqlite3.connect("sample")
            connection.text_factory = str
        except Error as e:
            print("Error occurred: " + str(e))
        print('\033[32m' + "Sample: " + '\033[m')

        # Sample Drop table
        connection.execute("DROP TABLE IF EXISTS sample;")
        # Sample Create
        connection.execute("CREATE TABLE sample(id integer, name text);")
        # Sample Insert
        connection.execute("INSERT INTO sample VALUES (?,?)",("1","test_name"))
        connection.commit()
        # Sample Select
        cursor = connection.execute("SELECT * FROM sample;")
        print(cursor.fetchall())

######################################################################

class HW2_sql():
    ############### DO NOT MODIFY THIS SECTION ###########################
    ######################################################################
    def create_connection(self, path):
        connection = None
        try:
            connection = sqlite3.connect(path)
            connection.text_factory = str
        except Error as e:
            print("Error occurred: " + str(e))

        return connection

    def execute_query(self, connection, query):
        cursor = connection.cursor()
        try:
            if query == "":
                return "Query Blank"
            else:
                cursor.execute(query)
                connection.commit()
                return "Query executed successfully"
        except Error as e:
            return "Error occurred: " + str(e)
    ######################################################################
    ######################################################################

    # GTusername [0 points]
    def GTusername(self):
        """Return the GT username string identifying this submission."""
        gt_username = "icanlapan3"
        return gt_username

    @staticmethod
    def _read_csv_rows(path, column_count):
        """Read a comma-delimited file and return its rows as tuples.

        Args:
            path: Location of the CSV file. Every line is treated as data
                (no header row is skipped).
            column_count: Number of leading fields to keep from each row.

        Returns:
            A list of tuples, one per non-empty CSV row, each holding the
            first ``column_count`` fields as strings.
        """
        # newline="" is what the csv module requires so that quoted fields
        # containing line breaks are parsed correctly. The encoding is pinned
        # so the import does not depend on the platform default
        # (assumes the data files are UTF-8 -- TODO confirm).
        with open(path, newline="", encoding="utf-8") as fin:
            # csv.reader yields [] for blank lines; skip those instead of
            # failing on a missing field.
            return [tuple(row[:column_count]) for row in csv.reader(fin) if row]

    # Part a.i Create Tables [2 points]
    def part_ai_1(self, connection: sqlite3.Connection) -> str:
        """Create the ``movies`` table and return the execute_query status."""
        ############### EDIT SQL STATEMENT ###################################
        part_ai_1_sql = "create table movies(id integer, title text, score real);"
        ######################################################################

        return self.execute_query(connection, part_ai_1_sql)

    def part_ai_2(self, connection: sqlite3.Connection) -> str:
        """Create the ``movie_cast`` table and return the execute_query status."""
        ############### EDIT SQL STATEMENT ###################################
        # cast_name is text (it was declared integer, which gives the column
        # INTEGER affinity and would coerce numeric-looking names).
        part_ai_2_sql = (
            "create table movie_cast(movie_id integer, cast_id integer, "
            "cast_name text, birthday text, popularity real);"
        )
        ######################################################################

        return self.execute_query(connection, part_ai_2_sql)

    # Part a.ii Import Data [2 points]
    def part_aii_1(self, connection: sqlite3.Connection, path: str) -> int:
        """Load the movies CSV at ``path`` into ``movies``.

        Returns:
            The number of rows in ``movies`` with a non-NULL id after the load.
        """
        ############### CREATE IMPORT CODE BELOW ############################
        to_db = self._read_csv_rows(path, 3)
        connection.executemany(
            "INSERT INTO movies (id, title, score) VALUES (?, ?, ?)", to_db
        )
        # Persist the rows here rather than relying on a later statement's
        # commit.
        connection.commit()
        ######################################################################

        sql = "SELECT COUNT(id) FROM movies;"
        cursor = connection.execute(sql)
        return cursor.fetchall()[0][0]

    def part_aii_2(self, connection: sqlite3.Connection, path: str) -> int:
        """Load the movie-cast CSV at ``path`` into ``movie_cast``.

        Returns:
            The number of rows in ``movie_cast`` with a non-NULL cast_id
            after the load.
        """
        ############### CREATE IMPORT CODE BELOW ############################
        to_db = self._read_csv_rows(path, 5)
        connection.executemany(
            "INSERT INTO movie_cast (movie_id, cast_id, cast_name, birthday, popularity) "
            "VALUES (?, ?, ?, ?, ?);",
            to_db,
        )
        connection.commit()
        ######################################################################

        sql = "SELECT COUNT(cast_id) FROM movie_cast;"
        cursor = connection.execute(sql)
        return cursor.fetchall()[0][0]

    # Part a.iii Vertical Database Partitioning [5 points]
    def part_aiii(self, connection: sqlite3.Connection) -> int:
        """Create ``cast_bio`` and fill it with the distinct cast rows.

        The table holds one row per distinct
        (cast_id, cast_name, birthday, popularity) combination found in
        ``movie_cast``.

        Returns:
            The number of rows in ``cast_bio`` with a non-NULL cast_id.
        """
        ############### EDIT CREATE TABLE SQL STATEMENT ###################################
        # birthday is declared text to match the movie_cast column it is
        # copied from (a "date" declaration would give NUMERIC affinity).
        part_aiii_sql = (
            "create table cast_bio (cast_id integer, cast_name text, "
            "birthday text, popularity real);"
        )
        ######################################################################

        self.execute_query(connection, part_aiii_sql)

        ############### CREATE IMPORT CODE BELOW ############################
        part_aiii_insert_sql = """
            INSERT INTO cast_bio (cast_id, cast_name, birthday, popularity)
            select distinct cast_id, cast_name, birthday, popularity
            from movie_cast
            ;
        """
        ######################################################################

        self.execute_query(connection, part_aiii_insert_sql)

        sql = "SELECT COUNT(cast_id) FROM cast_bio;"
        cursor = connection.execute(sql)
        return cursor.fetchall()[0][0]

    # Part b Create Indexes [1 points]
    def part_b_1(self, connection: sqlite3.Connection) -> str:
        """Create ``movie_index`` on ``movies(id)``; return the status string."""
        ############### EDIT SQL STATEMENT ###################################
        part_b_1_sql = "CREATE INDEX movie_index on movies(id);"
        ######################################################################
        return self.execute_query(connection, part_b_1_sql)

    def part_b_2(self, connection: sqlite3.Connection) -> str:
        """Create ``cast_index`` on ``movie_cast(cast_id)``; return the status string."""
        ############### EDIT SQL STATEMENT ###################################
        part_b_2_sql = "CREATE INDEX cast_index on movie_cast(cast_id);"
        ######################################################################
        return self.execute_query(connection, part_b_2_sql)

    def part_b_3(self, connection: sqlite3.Connection) -> str:
        """Create ``cast_bio_index`` on ``cast_bio(cast_id)``; return the status string."""
        ############### EDIT SQL STATEMENT ###################################
        part_b_3_sql = "CREATE INDEX cast_bio_index on cast_bio(cast_id);"
        ######################################################################
        return self.execute_query(connection, part_b_3_sql)

    # Part c Calculate a Proportion [3 points]
    def part_c(self, connection: sqlite3.Connection) -> str:
        """Return the percentage of movies with score > 50 and 'war' in the title.

        The numerator counts distinct movie ids whose score exceeds 50 and
        whose title matches ``LIKE '%war%'``; the denominator counts all
        distinct movie ids.

        Returns:
            The percentage formatted to two decimals, as a string.
        """
        ############### EDIT SQL STATEMENT ###################################
        # The printf format is a single-quoted SQL string literal; double
        # quotes denote identifiers in SQL and only work as strings through a
        # legacy SQLite quirk.
        part_c_sql = """
            select printf('%.2f', (target / all_movies) * 100)
            from (
                select
                    cast(count(distinct case
                        when (score > 50 and title like '%war%') then id
                    end) as float) as target,
                    cast(count(distinct id) as float) as all_movies
                from movies
            ) x
            ;
        """
        ######################################################################
        cursor = connection.execute(part_c_sql)
        return cursor.fetchall()[0][0]

    # Part d Find the Most Prolific Actors [4 points]
    def part_d(self, connection: sqlite3.Connection) -> list:
        """Return the five cast names with the most distinct movies.

        Only ``movie_cast`` rows with popularity > 10 are counted. Rows are
        grouped by cast_name and ordered by movie count (descending), then
        by name.

        Returns:
            Up to five ``(cast_name, movie_count)`` tuples.
        """
        ############### EDIT SQL STATEMENT ###################################
        part_d_sql = """
            select cast_name, count(distinct movie_id) as movie_count
            from movie_cast
            where popularity > 10
            group by cast_name
            order by movie_count desc, cast_name
            limit 5
            ;
        """
        ######################################################################
        cursor = connection.execute(part_d_sql)
        return cursor.fetchall()

    # e. [4 points] Find the highest scoring movies with the smallest cast.
    # List the 5 highest-scoring movies that have the fewest cast members.
    # Sort the results by score in descending order, then by number of cast
    # members in ascending order, then by movie name in alphabetical order.
    # Format all decimals to two places using printf().
    # Output format and sample values (movie_title,movie_score,cast_count):
    #   Star Wars: Holiday Special,75.01,12
    #   War Games,58.49,33
    # 1. movies
    #      id (integer)  title (text)  score (real)
    # 2. movie_cast
    #      movie_id (integer)  cast_id (integer)  cast_name (text)
    #      birthday (text)  popularity (real)

    # Part e Find the Highest Scoring Movies With the Least Amount of Cast [4 points]
    def part_e(self, connection: sqlite3.Connection) -> list:
        """Return the five top-scoring movies, preferring smaller casts on ties.

        Movies without any ``movie_cast`` row are excluded by the inner join.

        Returns:
            Up to five ``(title, movie_score, cast_count)`` tuples, where
            ``movie_score`` is a two-decimal string.
        """
        ############### EDIT SQL STATEMENT ###################################
        # Ordering uses the raw numeric a.score, not the formatted string.
        part_e_sql = """
            select a.title,
                   printf('%.2f', a.score) as movie_score,
                   b.cast_count
            from movies a
            inner join (
                select movie_id, count(distinct cast_id) as cast_count
                from movie_cast
                group by movie_id
            ) b on a.id = b.movie_id
            order by a.score desc, b.cast_count asc, a.title asc
            limit 5
            ;
        """
        ######################################################################
        cursor = connection.execute(part_e_sql)
        return cursor.fetchall()

    # f. [4 points] Get high scoring actors. Find the top ten cast members who
    # have the highest average movie scores. Format all decimals to two places
    # using printf().
    #   - Sort the output by average score in descending order, then by
    #     cast_name in alphabetical order.
    #   - Do not include movies with score <25 in the average score calculation.
    #   - Exclude cast members who have appeared in two or fewer movies.
    # Output format and sample values (cast_id,cast_name,average_score):
    #   8822,Julia Roberts,53.00

    # Part f Get High Scoring Actors [4 points]
    def part_f(self, connection: sqlite3.Connection) -> list:
        """Return the ten cast members with the highest average movie score.

        Only movies with score >= 25 contribute, and a cast member needs
        more than two such distinct movies to be listed.

        Returns:
            Up to ten ``(cast_id, cast_name, average_score)`` tuples, where
            ``average_score`` is a two-decimal string.
        """
        ############### EDIT SQL STATEMENT ###################################
        # average_score is the printf-formatted text. Sorting on the bare
        # column would compare strings ('50.00' > '100.00'), so the ORDER BY
        # casts it back to a number.
        part_f_sql = """
            select cast_id, cast_name, average_score
            from (
                select a.cast_id as cast_id,
                       a.cast_name as cast_name,
                       count(distinct a.movie_id) as movie_count,
                       printf('%.2f', avg(b.score)) as average_score
                from movie_cast a
                inner join (select * from movies where score >= 25) b
                    on a.movie_id = b.id
                group by a.cast_id, a.cast_name
            ) x
            where movie_count > 2
            order by cast(average_score as real) desc, cast_name asc
            limit 10
            ;
        """
        ######################################################################
        cursor = connection.execute(part_f_sql)
        return cursor.fetchall()

    # Part g Creating Views [6 points]
    def part_g(self, connection: sqlite3.Connection) -> str:
        """Create the ``good_collaboration`` view; return the status string.

        Each view row is an ordered pair of different cast ids that share at
        least 3 distinct movies whose average score is >= 40, with columns
        (cast_member_id1, cast_member_id2, movie_count, average_movie_score).
        Both orientations of a pair, (a, b) and (b, a), appear in the view.
        """
        ############### EDIT SQL STATEMENT ###################################
        # The self-join produces every pair in both orientations with the
        # same aggregates, so one aggregate query is enough; joining it to a
        # mirrored copy of itself filtered nothing.
        # NOTE(review): confirm against the assignment text whether mirrored
        # pairs should be kept or reduced to cast_member_id1 < cast_member_id2.
        part_g_sql = """
            CREATE VIEW good_collaboration AS
            select id1.cast_id as cast_member_id1,
                   id2.cast_id as cast_member_id2,
                   count(distinct id1.movie_id) as movie_count,
                   avg(m.score) as average_movie_score
            from movie_cast id1
            inner join movie_cast id2 on id1.movie_id = id2.movie_id
            inner join movies m on id1.movie_id = m.id
            where id1.cast_id != id2.cast_id
            group by id1.cast_id, id2.cast_id
            having count(distinct id1.movie_id) >= 3
               and avg(m.score) >= 40
            ;
        """
        ######################################################################
        return self.execute_query(connection, part_g_sql)

    def part_gi(self, connection: sqlite3.Connection) -> list:
        """Return the five cast members with the best collaboration score.

        A member's collaboration score is the mean ``average_movie_score``
        over every ``good_collaboration`` row in which the member appears in
        either position.

        Returns:
            Up to five ``(cast_id, cast_name, collaboration_score)`` tuples,
            where ``collaboration_score`` is a two-decimal string.
        """
        ############### EDIT SQL STATEMENT ###################################
        # As in part_f, the formatted score is cast back to a number for
        # ordering so that it is not compared as text.
        part_g_i_sql = """
            select cast_id, cast_name, collaboration_score
            from (
                select a.cast_id as cast_id,
                       b.cast_name as cast_name,
                       printf('%.2f', avg(a.average_movie_score)) as collaboration_score
                from (
                    select cast_member_id1 as cast_id, average_movie_score
                    from good_collaboration
                    union all
                    select cast_member_id2 as cast_id, average_movie_score
                    from good_collaboration
                ) a
                inner join (
                    select distinct cast_id, cast_name from movie_cast
                ) b on a.cast_id = b.cast_id
                group by a.cast_id, b.cast_name
            )
            order by cast(collaboration_score as real) desc, cast_name asc
            limit 5
            ;
        """
        ######################################################################
        cursor = connection.execute(part_g_i_sql)
        return cursor.fetchall()

    # Part h FTS [4 points]
    def part_h(self, connection: sqlite3.Connection, path: str) -> int:
        """Create the FTS4 table ``movie_overview`` and load the CSV at ``path``.

        Returns:
            The number of rows in ``movie_overview`` with a non-NULL id.
        """
        ############### EDIT SQL STATEMENT ###################################
        part_h_sql = """
            CREATE VIRTUAL TABLE movie_overview
            USING fts4(id integer, overview text, tokenize = simple);
        """
        ######################################################################
        connection.execute(part_h_sql)
        ############### CREATE IMPORT CODE BELOW ############################
        to_db = self._read_csv_rows(path, 2)
        connection.executemany(
            "INSERT INTO movie_overview (id, overview) VALUES (?, ?);", to_db
        )
        # Nothing after this method commits, so without this the inserted
        # rows are rolled back when the connection is closed.
        connection.commit()
        ######################################################################
        sql = "SELECT COUNT(id) FROM movie_overview;"
        cursor = connection.execute(sql)
        return cursor.fetchall()[0][0]

    def part_hi(self, connection: sqlite3.Connection) -> int:
        """Return how many distinct overview ids full-text match ``fight``."""
        ############### EDIT SQL STATEMENT ###################################
        part_hi_sql = """
            select count(distinct id)
            from movie_overview
            where overview MATCH 'fight';
        """
        ######################################################################
        cursor = connection.execute(part_hi_sql)
        return cursor.fetchall()[0][0]

    def part_hii(self, connection: sqlite3.Connection) -> int:
        """Return how many distinct overview ids match ``space NEAR/5 program``."""
        ############### EDIT SQL STATEMENT ###################################
        part_hii_sql = """
            select count(distinct id)
            from movie_overview
            where overview MATCH 'space NEAR/5 program'
            ;
        """
        ######################################################################
        cursor = connection.execute(part_hii_sql)
        return cursor.fetchall()[0][0]


if __name__ == "__main__":

    ########################### DO NOT MODIFY THIS SECTION ##########################
    #################################################################################
    if SHOW == True:
        sample = Sample()
        sample.sample()

    print('\033[32m' + "Q2 Output: " + '\033[m')
    db = HW2_sql()
    try:
        conn = db.create_connection("Q2")
    except:
        print("Database Creation Error")

    try:
        conn.execute("DROP TABLE IF EXISTS movies;")
        conn.execute("DROP TABLE IF EXISTS movie_cast;")
        conn.execute("DROP TABLE IF EXISTS cast_bio;")
        conn.execute("DROP VIEW IF EXISTS good_collaboration;")
        conn.execute("DROP TABLE IF EXISTS movie_overview;")
    except:
        print("Error in Table Drops")

    try:
        print('\033[32m' + "part ai 1: " + '\033[m' + str(db.part_ai_1(conn)))
        print('\033[32m' + "part ai 2: " + '\033[m' + str(db.part_ai_2(conn)))
    except:
        print("Error in Part a.i")

    try:
        print('\033[32m' + "Row count for Movies Table: " + '\033[m' + str(db.part_aii_1(conn,"data/movies.csv")))
        print('\033[32m' + "Row count for Movie Cast Table: " + '\033[m' + str(db.part_aii_2(conn,"data/movie_cast.csv")))
    except:
        print("Error in part a.ii")

    try:
        print('\033[32m' + "Row count for Cast Bio Table: " + '\033[m' + str(db.part_aiii(conn)))
    except:
        print("Error in part a.iii")

    try:
        print('\033[32m' + "part b 1: " + '\033[m' + db.part_b_1(conn))
        print('\033[32m' + "part b 2: " + '\033[m' + db.part_b_2(conn))
        print('\033[32m' + "part b 3: " + '\033[m' + db.part_b_3(conn))
    except:
        print("Error in part b")

    try:
        print('\033[32m' + "part c: " + '\033[m' + str(db.part_c(conn)))
    except:
        print("Error in part c")

    try:
        print('\033[32m' + "part d: " + '\033[m')
        for line in db.part_d(conn):
            print(line[0],line[1])
    except:
        print("Error in part d")

    try:
        print('\033[32m' + "part e: " + '\033[m')
        for line in db.part_e(conn):
            print(line[0],line[1],line[2])
    except:
        print("Error in part e")

    try:
        print('\033[32m' + "part f: " + '\033[m')
        for line in db.part_f(conn):
            print(line[0],line[1],line[2])
    except:
        print("Error in part f")

    try:
        print('\033[32m' + "part g: " + '\033[m' + str(db.part_g(conn)))
        print('\033[32m' + "part g.i: " + '\033[m')
        for line in db.part_gi(conn):
            print(line[0],line[1],line[2])
    except:
        print("Error in part g")

    try:
        print('\033[32m' + "part h.i: " + '\033[m'+ str(db.part_h(conn,"data/movie_overview.csv")))
        print('\033[32m' + "Count h.ii: " + '\033[m' + str(db.part_hi(conn)))
        print('\033[32m' + "Count h.iii: " + '\033[m' + str(db.part_hii(conn)))
    except:
        print("Error in part h")

    conn.close()
    #################################################################################
    #################################################################################