415 lines
16 KiB
Python
415 lines
16 KiB
Python
########################### DO NOT MODIFY THIS SECTION ##########################
|
|
#################################################################################
|
|
import sqlite3
|
|
from sqlite3 import Error
|
|
import csv
|
|
#################################################################################
|
|
|
|
## Change to False to disable Sample
|
|
SHOW = True
|
|
|
|
############### SAMPLE CLASS AND SQL QUERY ###########################
|
|
######################################################################
|
|
class Sample():
    """Minimal end-to-end sqlite3 example: drop, create, insert and select."""

    def sample(self):
        """Rebuild the ``sample`` table in the ``sample`` database and print it.

        Opens (creating if necessary) the SQLite database file named
        ``sample`` in the current working directory, recreates the
        ``sample`` table with a single row and prints the result of
        ``SELECT *``. Returns ``None``.
        """
        try:
            connection = sqlite3.connect("sample")
            connection.text_factory = str
        except Error as e:
            print("Error occurred: " + str(e))
            # No connection was opened, so none of the statements below can
            # run; falling through would raise NameError on ``connection``.
            return

        try:
            print('\033[32m' + "Sample: " + '\033[m')

            # Sample Drop table
            connection.execute("DROP TABLE IF EXISTS sample;")
            # Sample Create
            connection.execute("CREATE TABLE sample(id integer, name text);")
            # Sample Insert
            connection.execute("INSERT INTO sample VALUES (?,?)", ("1", "test_name"))
            connection.commit()
            # Sample Select
            cursor = connection.execute("SELECT * FROM sample;")
            print(cursor.fetchall())
        finally:
            # Release the database file even if one of the statements failed.
            connection.close()
|
|
|
|
######################################################################
|
|
|
|
class HW2_sql():
    """SQLite exercises: schema creation, CSV import, indexes, queries and FTS.

    Every ``part_*`` method takes an already-open ``sqlite3`` connection
    (see ``create_connection``) and either returns the status string
    produced by ``execute_query`` or the fetched result of a query.
    """

    ############### DO NOT MODIFY THIS SECTION ###########################
    ######################################################################
    def create_connection(self, path):
        """Open the SQLite database at ``path``.

        Returns the connection, or ``None`` when ``sqlite3.connect`` raised
        ``sqlite3.Error`` (the error text is printed).
        """
        connection = None
        try:
            connection = sqlite3.connect(path)
            connection.text_factory = str
        except Error as e:
            print("Error occurred: " + str(e))

        return connection

    def execute_query(self, connection, query):
        """Execute ``query`` on ``connection``, commit, and return a status string.

        Returns ``"Query Blank"`` for an empty query,
        ``"Query executed successfully"`` on success, or
        ``"Error occurred: ..."`` when ``sqlite3.Error`` is raised.
        """
        cursor = connection.cursor()
        try:
            if query == "":
                return "Query Blank"
            else:
                cursor.execute(query)
                connection.commit()
                return "Query executed successfully"
        except Error as e:
            return "Error occurred: " + str(e)
    ######################################################################
    ######################################################################

    def _import_csv(self, connection, path, table, column_count):
        """Insert the first ``column_count`` fields of each CSV row into ``table``.

        ``table`` and ``column_count`` come only from the fixed call sites in
        this class, never from user input, so formatting them into the SQL
        text is safe; the row values themselves are bound as parameters.
        Blank lines in the CSV file are skipped.
        """
        placeholders = ",".join("?" * column_count)
        insert_sql = "INSERT INTO {} VALUES ({})".format(table, placeholders)
        # newline='' lets the csv module handle line endings itself, as the
        # csv documentation recommends for files passed to csv.reader.
        with open(path, encoding='utf-8', newline='') as csvfile:
            rows = (
                tuple(row[:column_count])
                for row in csv.reader(csvfile, delimiter=',')
                if row
            )
            connection.executemany(insert_sql, rows)
        connection.commit()

    # GTusername [0 points]
    def GTusername(self):
        """Return the GT username string."""
        gt_username = "helfayoumy3"
        return gt_username

    # Part a.i Create Tables [2 points]
    def part_ai_1(self, connection):
        """Create the ``movies`` table; returns the ``execute_query`` status."""
        ############### EDIT SQL STATEMENT ###################################
        part_ai_1_sql = """
        CREATE TABLE movies (
            id INTEGER,
            title TEXT,
            score REAL
        );
        """
        ######################################################################

        return self.execute_query(connection, part_ai_1_sql)

    def part_ai_2(self, connection):
        """Create the ``movie_cast`` table; returns the ``execute_query`` status."""
        ############### EDIT SQL STATEMENT ###################################
        part_ai_2_sql = """
        CREATE TABLE movie_cast
        (
            movie_id INTEGER,
            cast_id INTEGER,
            cast_name TEXT,
            birthday TEXT,
            popularity REAL
        );
        """
        ######################################################################

        return self.execute_query(connection, part_ai_2_sql)

    # Part a.ii Import Data [2 points]
    def part_aii_1(self, connection, path):
        """Load the CSV file at ``path`` into ``movies`` and return its row count."""
        ############### CREATE IMPORT CODE BELOW ############################
        self._import_csv(connection, path, "movies", 3)
        ######################################################################

        sql = "SELECT COUNT(id) FROM movies;"
        cursor = connection.execute(sql)
        return cursor.fetchall()[0][0]

    def part_aii_2(self, connection, path):
        """Load the CSV file at ``path`` into ``movie_cast`` and return its row count."""
        ############### CREATE IMPORT CODE BELOW ############################
        self._import_csv(connection, path, "movie_cast", 5)
        ######################################################################

        sql = "SELECT COUNT(cast_id) FROM movie_cast;"
        cursor = connection.execute(sql)
        return cursor.fetchall()[0][0]

    # Part a.iii Vertical Database Partitioning [5 points]
    def part_aiii(self, connection):
        """Create ``cast_bio`` from the distinct cast rows of ``movie_cast``.

        Returns the number of rows in ``cast_bio``. The status strings of the
        CREATE and INSERT statements are intentionally not inspected; a
        failure there surfaces through the final count query.
        """
        ############### EDIT CREATE TABLE SQL STATEMENT ###################################
        part_aiii_sql = """
        CREATE TABLE cast_bio
        (
            cast_id INTEGER,
            cast_name TEXT,
            birthday TEXT,
            popularity REAL
        );
        """
        ######################################################################

        self.execute_query(connection, part_aiii_sql)

        ############### CREATE IMPORT CODE BELOW ############################
        part_aiii_insert_sql = """
        INSERT INTO cast_bio
        SELECT DISTINCT cast_id,
                        cast_name,
                        birthday,
                        popularity
        FROM movie_cast;
        """
        ######################################################################

        self.execute_query(connection, part_aiii_insert_sql)

        sql = "SELECT COUNT(cast_id) FROM cast_bio;"
        cursor = connection.execute(sql)
        return cursor.fetchall()[0][0]

    # Part b Create Indexes [1 points]
    def part_b_1(self, connection):
        """Create ``movie_index`` on ``movies(id)``; returns the status string."""
        ############### EDIT SQL STATEMENT ###################################
        part_b_1_sql = "CREATE INDEX movie_index ON movies(id);"
        ######################################################################
        return self.execute_query(connection, part_b_1_sql)

    def part_b_2(self, connection):
        """Create ``cast_index`` on ``movie_cast(cast_id)``; returns the status string."""
        ############### EDIT SQL STATEMENT ###################################
        part_b_2_sql = "CREATE INDEX cast_index ON movie_cast(cast_id);"
        ######################################################################
        return self.execute_query(connection, part_b_2_sql)

    def part_b_3(self, connection):
        """Create unique ``cast_bio_index`` on ``cast_bio(cast_id)``; returns the status string."""
        ############### EDIT SQL STATEMENT ###################################
        part_b_3_sql = "CREATE UNIQUE INDEX cast_bio_index ON cast_bio(cast_id);"
        ######################################################################
        return self.execute_query(connection, part_b_3_sql)

    # Part c Calculate a Proportion [3 points]
    def part_c(self, connection):
        """Return the percentage of movies titled like '%war%' with score > 50.

        The value is formatted by SQLite's ``Printf('%.2f', ...)``, so the
        result is a string with two decimals.
        """
        ############### EDIT SQL STATEMENT ###################################
        part_c_sql = """
        SELECT Printf('%.2f', Cast(( Count(id) * 100 ) AS REAL)/(select count(*) from movies)) FROM movies
        WHERE Lower(title) LIKE '%war%' AND score > Cast(50 AS INT)
        """
        ######################################################################
        cursor = connection.execute(part_c_sql)
        return cursor.fetchall()[0][0]

    # Part d Find the Most Prolific Actors [4 points]
    def part_d(self, connection):
        """Return up to five ``(cast_name, appearance_count)`` rows.

        Only ``movie_cast`` rows with popularity > 10 are counted; rows are
        ordered by count descending, then name ascending.
        """
        ############### EDIT SQL STATEMENT ###################################
        part_d_sql = """
        SELECT cast_name,
               Count(movie_id) AS appearance_count
        FROM   movie_cast
        WHERE  popularity > 10
        GROUP  BY cast_name
        ORDER  BY appearance_count DESC,
                  cast_name ASC limit 5;
        """
        ######################################################################
        cursor = connection.execute(part_d_sql)
        return cursor.fetchall()

    # Part e Find the Highest Scoring Movies With the Least Amount of Cast [4 points]
    def part_e(self, connection):
        """Return up to five ``(title, score, cast_count)`` rows.

        Ordered by score descending, cast count ascending, title ascending;
        the score column is a two-decimal string produced by ``Printf``.
        """
        ############### EDIT SQL STATEMENT ###################################
        part_e_sql = """
        SELECT movies.title,
               Printf('%.2f', movies.score),
               Count(movie_cast.cast_id) AS cast_count
        FROM   movies
               INNER JOIN movie_cast
                       ON movies.id = movie_cast.movie_id
        GROUP  BY movies.id
        ORDER  BY movies.score DESC,
                  cast_count ASC,
                  movies.title ASC
        LIMIT  5;
        """
        ######################################################################
        cursor = connection.execute(part_e_sql)
        return cursor.fetchall()

    # Part f Get High Scoring Actors [4 points]
    def part_f(self, connection):
        """Return up to ten ``(cast_id, cast_name, average_score)`` rows.

        Considers movies with score >= 25 and cast members appearing in more
        than two of them; ordered by the numeric average descending, then
        name ascending. ``average_score`` is a two-decimal string.
        """
        ############### EDIT SQL STATEMENT ###################################
        part_f_sql = """
        SELECT movie_cast.cast_id,
               movie_cast.cast_name,
               Printf('%.2f', Avg(movies.score)) AS average_score
        FROM   movies
               INNER JOIN movie_cast
                       ON movies.id = movie_cast.movie_id
        WHERE  movies.score >= Cast(25 AS INT)
        GROUP  BY movie_cast.cast_id
        HAVING Count(movie_cast.movie_id) > 2
        ORDER  BY Avg(movies.score) DESC,
                  movie_cast.cast_name ASC
        LIMIT  10;
        """
        ######################################################################
        cursor = connection.execute(part_f_sql)
        return cursor.fetchall()

    # Part g Creating Views [6 points]
    def part_g(self, connection):
        """Create the ``good_collaboration`` view; returns the status string.

        The view pairs cast members (lower id first) who share more than two
        movies whose average score is at least 40.
        """
        ############### EDIT SQL STATEMENT ###################################
        part_g_sql = """
        CREATE VIEW good_collaboration (cast_member_id1, cast_member_id2, movie_count,
        average_movie_score)
        AS
        SELECT mc1.cast_id AS cast_member_id1, mc2.cast_id AS cast_member_id2, Count(mc1.movie_id) AS movie_count,
        Avg(m.score) AS average_movie_score
        FROM movies AS m
        INNER JOIN movie_cast AS mc2 ON m.id = mc2.movie_id
        INNER JOIN movie_cast AS mc1 ON mc1.movie_id = mc2.movie_id
        WHERE mc1.cast_id < mc2.cast_id
        GROUP BY mc1.cast_id, mc2.cast_id
        HAVING movie_count > 2 AND average_movie_score >= Cast(40 AS INT);
        """
        ######################################################################
        return self.execute_query(connection, part_g_sql)

    def part_gi(self, connection):
        """Return up to five ``(cast_id, cast_name, collaboration_score)`` rows.

        Averages ``average_movie_score`` per ``cast_member_id1`` of the
        ``good_collaboration`` view, joined to ``cast_bio`` for the name.
        """
        # NOTE(review): ``collaboration_score`` is the Printf text, so the
        # ORDER BY below compares strings ('99.00' sorts above '100.00').
        # Left unchanged because the expected tie-breaking is not visible
        # here -- confirm whether a numeric sort is wanted.
        ############### EDIT SQL STATEMENT ###################################
        part_g_i_sql = """
        SELECT cast_member_id1,
               cast_bio.cast_name,
               Printf('%.2f', Avg(average_movie_score)) AS collaboration_score
        FROM   good_collaboration
               INNER JOIN cast_bio
                       ON cast_bio.cast_id = cast_member_id1
        GROUP  BY cast_member_id1
        ORDER  BY collaboration_score DESC,
                  cast_bio.cast_name ASC
        LIMIT  5;
        """
        ######################################################################
        cursor = connection.execute(part_g_i_sql)
        return cursor.fetchall()

    # Part h FTS [4 points]
    def part_h(self, connection, path):
        """Create the FTS3 table ``movie_overview`` and load it from ``path``.

        Returns the number of rows in ``movie_overview`` after the import.
        """
        ############### EDIT SQL STATEMENT ###################################
        part_h_sql = """CREATE virtual TABLE movie_overview using fts3 (id integer, overview text);"""
        ######################################################################
        connection.execute(part_h_sql)
        ############### CREATE IMPORT CODE BELOW ############################
        self._import_csv(connection, path, "movie_overview", 2)
        ######################################################################
        sql = "SELECT COUNT(id) FROM movie_overview;"
        cursor = connection.execute(sql)
        return cursor.fetchall()[0][0]

    def part_hi(self, connection):
        """Return the number of overviews matching the FTS query ``FIGHT``."""
        ############### EDIT SQL STATEMENT ###################################
        part_hi_sql = """
        SELECT Count(overview)
        FROM   movie_overview
        WHERE  overview match 'FIGHT';
        """
        ######################################################################
        cursor = connection.execute(part_hi_sql)
        return cursor.fetchall()[0][0]

    def part_hii(self, connection):
        """Return the number of overviews matching ``space NEAR/5 program``."""
        ############### EDIT SQL STATEMENT ###################################
        part_hii_sql = """
        SELECT Count(overview)
        FROM   movie_overview
        WHERE  overview match 'space NEAR/5 program';
        """
        ######################################################################
        cursor = connection.execute(part_hii_sql)
        return cursor.fetchall()[0][0]
|
|
|
|
|
|
if __name__ == "__main__":
    # Script driver: optionally runs the Sample demo, then executes every
    # HW2_sql part in order against the "Q2" database and prints the results.
    # Each part is wrapped in its own try/except so one failing part only
    # prints an error line and the remaining parts still run.

    ########################### DO NOT MODIFY THIS SECTION ##########################
    #################################################################################
    if SHOW == True:
        sample = Sample()
        sample.sample()

    print('\033[32m' + "Q2 Output: " + '\033[m')
    db = HW2_sql()
    try:
        conn = db.create_connection("Q2")
    except:
        print("Database Creation Error")

    # Start from a clean slate so the CREATE statements below do not collide
    # with objects left over from a previous run.
    try:
        conn.execute("DROP TABLE IF EXISTS movies;")
        conn.execute("DROP TABLE IF EXISTS movie_cast;")
        conn.execute("DROP TABLE IF EXISTS cast_bio;")
        conn.execute("DROP VIEW IF EXISTS good_collaboration;")
        conn.execute("DROP TABLE IF EXISTS movie_overview;")
    except:
        print("Error in Table Drops")

    # Part a.i: create the movies and movie_cast tables.
    try:
        print('\033[32m' + "part ai 1: " + '\033[m' + str(db.part_ai_1(conn)))
        print('\033[32m' + "part ai 2: " + '\033[m' + str(db.part_ai_2(conn)))
    except:
        print("Error in Part a.i")

    # Part a.ii: import the CSV data and report the row counts.
    try:
        print('\033[32m' + "Row count for Movies Table: " + '\033[m' + str(db.part_aii_1(conn,"data/movies.csv")))
        print('\033[32m' + "Row count for Movie Cast Table: " + '\033[m' + str(db.part_aii_2(conn,"data/movie_cast.csv")))
    except:
        print("Error in part a.ii")

    # Part a.iii: build cast_bio from movie_cast.
    try:
        print('\033[32m' + "Row count for Cast Bio Table: " + '\033[m' + str(db.part_aiii(conn)))
    except:
        print("Error in part a.iii")

    # Part b: create the indexes (each call returns a status string).
    try:
        print('\033[32m' + "part b 1: " + '\033[m' + db.part_b_1(conn))
        print('\033[32m' + "part b 2: " + '\033[m' + db.part_b_2(conn))
        print('\033[32m' + "part b 3: " + '\033[m' + db.part_b_3(conn))
    except:
        print("Error in part b")

    # Part c: proportion query (single value).
    try:
        print('\033[32m' + "part c: " + '\033[m' + str(db.part_c(conn)))
    except:
        print("Error in part c")

    # Part d: two-column result rows.
    try:
        print('\033[32m' + "part d: " + '\033[m')
        for line in db.part_d(conn):
            print(line[0],line[1])
    except:
        print("Error in part d")

    # Part e: three-column result rows.
    try:
        print('\033[32m' + "part e: " + '\033[m')
        for line in db.part_e(conn):
            print(line[0],line[1],line[2])
    except:
        print("Error in part e")

    # Part f: three-column result rows.
    try:
        print('\033[32m' + "part f: " + '\033[m')
        for line in db.part_f(conn):
            print(line[0],line[1],line[2])
    except:
        print("Error in part f")

    # Part g: create the view, then query it.
    try:
        print('\033[32m' + "part g: " + '\033[m' + str(db.part_g(conn)))
        print('\033[32m' + "part g.i: " + '\033[m')
        for line in db.part_gi(conn):
            print(line[0],line[1],line[2])
    except:
        print("Error in part g")

    # Part h: build the full-text table, then run the two match counts.
    try:
        print('\033[32m' + "part h.i: " + '\033[m'+ str(db.part_h(conn,"data/movie_overview.csv")))
        print('\033[32m' + "Count h.ii: " + '\033[m' + str(db.part_hi(conn)))
        print('\033[32m' + "Count h.iii: " + '\033[m' + str(db.part_hii(conn)))
    except:
        print("Error in part h")

    conn.close()
|
|
#################################################################################
|
|
#################################################################################
|
|
|