# tunmnlu/task_2/others-answer/isye_6242-main/HW 1/Q2_SQL.py

########################### DO NOT MODIFY THIS SECTION ##########################
#################################################################################
import sqlite3
from sqlite3 import Error
import csv
#################################################################################

## Set to False to skip the Sample demo; the __main__ driver only runs
## Sample().sample() while this flag is True.
SHOW = True

############### SAMPLE CLASS AND SQL QUERY ###########################
######################################################################
class Sample():
    """Tiny end-to-end demo of the sqlite3 calls used by the assignment."""

    def sample(self):
        """Recreate a one-row ``sample`` table and print its contents.

        Opens (or creates) the SQLite database file ``sample`` in the current
        working directory, drops and recreates the ``sample`` table, inserts a
        single row, prints the result of ``SELECT *`` and closes the
        connection. Returns None.

        If the database cannot be opened the error is printed and the method
        returns early, because none of the following statements can run
        without a connection.
        """
        try:
            connection = sqlite3.connect("sample")
            connection.text_factory = str
        except Error as e:
            print("Error occurred: " + str(e))
            return
        print('\033[32m' + "Sample: " + '\033[m')

        try:
            # Sample Drop table
            connection.execute("DROP TABLE IF EXISTS sample;")
            # Sample Create
            connection.execute("CREATE TABLE sample(id integer, name text);")
            # Sample Insert
            connection.execute("INSERT INTO sample VALUES (?,?)",("1","test_name"))
            connection.commit()
            # Sample Select
            cursor = connection.execute("SELECT * FROM sample;")
            print(cursor.fetchall())
        finally:
            # Always release the database file handle, even if a statement fails
            connection.close()

######################################################################

class HW2_sql():
    ############### DO NOT MODIFY THIS SECTION ###########################
    ######################################################################
    def create_connection(self, path):
        """Open a SQLite connection to the database file at ``path``.

        Returns the connection with ``text_factory`` set to ``str``. If
        sqlite3 raises ``Error`` the message is printed and ``None`` is
        returned instead, so callers must be prepared for a ``None`` result.
        """
        connection = None
        try:
            connection = sqlite3.connect(path)
            connection.text_factory = str
        except Error as e:
            print("Error occurred: " + str(e))

        return connection

    def execute_query(self, connection, query):
        """Run one SQL statement on ``connection`` and report the outcome as text.

        Returns:
            "Query Blank" when ``query`` is the empty string (nothing is run),
            "Query executed successfully" after the statement ran and the
            transaction was committed, or "Error occurred: <message>" when
            sqlite3 raised ``Error``.

        The sqlite3 error is not re-raised, so a caller that needs to know
        whether the statement worked has to inspect the returned string.
        """
        cursor = connection.cursor()
        try:
            if query == "":
                return "Query Blank"
            else:
                cursor.execute(query)
                connection.commit()
                return "Query executed successfully"
        except Error as e:
            return "Error occurred: " + str(e)
    ######################################################################
    ######################################################################

    # GTusername [0 points]
    def GTusername(self):
        gt_username = "icanlapan3"
        return gt_username

    # Part a.i Create Tables [2 points]
    def part_ai_1(self,connection):
        ############### EDIT SQL STATEMENT ###################################
        part_ai_1_sql = "create table movies(id integer, title text, score real);"
        ######################################################################

        return self.execute_query(connection, part_ai_1_sql)

    def part_ai_2(self,connection):
        ############### EDIT SQL STATEMENT ###################################
        part_ai_2_sql = "create table movie_cast(movie_id integer, cast_id integer, cast_name integer, birthday text, popularity real);"
        ######################################################################

        return self.execute_query(connection, part_ai_2_sql)

    # Part a.ii Import Data [2 points]
    def part_aii_1(self,connection,path):
        ############### CREATE IMPORT CODE BELOW ############################
        # Import CSV file
        with open(path) as fin:  # `with` statement available in 2.5+
            # csv.DictReader uses first line in file for column headings by default
            dr = csv.reader(fin)  # comma is default delimiter
            to_db = [(i[0], i[1], i[2]) for i in dr]

        # Upload to the table in database
        connection.executemany("INSERT INTO movies (id, title, score) VALUES (?, ?, ?)", to_db)
       ######################################################################

        sql = "SELECT COUNT(id) FROM movies;"
        cursor = connection.execute(sql)
        return cursor.fetchall()[0][0]

    def part_aii_2(self,connection, path):
        ############### CREATE IMPORT CODE BELOW ############################
        # Import CSV file
        with open(path) as fin:  # `with` statement available in 2.5+
            # csv.DictReader uses first line in file for column headings by default
            dr = csv.reader(fin)  # comma is default delimiter
            to_db = [(i[0], i[1], i[2], i[3], i[4]) for i in dr]

        # Upload to the table in database
        connection.executemany("INSERT INTO movie_cast (movie_id, cast_id, cast_name, birthday, popularity) VALUES (?, ?, ?, ?, ?);", to_db)
        ######################################################################

        sql = "SELECT COUNT(cast_id) FROM movie_cast;"
        cursor = connection.execute(sql)
        return cursor.fetchall()[0][0]

    # Part a.iii Vertical Database Partitioning [5 points]
    def part_aiii(self,connection):
        ############### EDIT CREATE TABLE SQL STATEMENT ###################################
        part_aiii_sql = "create table cast_bio (cast_id integer, cast_name text, birthday date, popularity real);"
        ######################################################################

        self.execute_query(connection, part_aiii_sql)

        ############### CREATE IMPORT CODE BELOW ############################

        part_aiii_insert_sql = """
        INSERT INTO cast_bio
            (cast_id,
             cast_name,
             birthday,
             popularity)

        select  distinct cast_id,
                cast_name,
                birthday,
                popularity
        from movie_cast
        ;
        """

        ######################################################################

        self.execute_query(connection, part_aiii_insert_sql)

        sql = "SELECT COUNT(cast_id) FROM cast_bio;"
        cursor = connection.execute(sql)
        return cursor.fetchall()[0][0]

    # Part b Create Indexes [1 points]
    def part_b_1(self,connection):
        ############### EDIT SQL STATEMENT ###################################
        part_b_1_sql = "CREATE INDEX movie_index on movies(id);"
        ######################################################################
        return self.execute_query(connection, part_b_1_sql)

    def part_b_2(self,connection):
        ############### EDIT SQL STATEMENT ###################################
        part_b_2_sql = "CREATE INDEX cast_index on movie_cast(cast_id);"
        ######################################################################
        return self.execute_query(connection, part_b_2_sql)

    def part_b_3(self,connection):
        ############### EDIT SQL STATEMENT ###################################
        part_b_3_sql = "CREATE INDEX cast_bio_index on cast_bio(cast_id);"
        ######################################################################
        return self.execute_query(connection, part_b_3_sql)

    # Part c Calculate a Proportion [3 points]
    def part_c(self,connection):
        ############### EDIT SQL STATEMENT ###################################
        part_c_sql = """
        select printf("%.2f", (target/all_movies)*100)
        from (select    cast(count (distinct case when (score > 50 and title like '%war%') then id end) as float) as target,
                        cast(count (distinct id) as float) as all_movies
              from movies) x
        ;
        """

        ######################################################################
        cursor = connection.execute(part_c_sql)
        return cursor.fetchall()[0][0]

    # Part d Find the Most Prolific Actors [4 points]
    def part_d(self,connection):
        ############### EDIT SQL STATEMENT ###################################
        part_d_sql = """
        select cast_name, count (distinct movie_id) as movie_count
        from movie_cast
        where popularity > 10
        group by cast_name
        order by movie_count desc, cast_name
        limit 5
        ;
        """
        ######################################################################
        cursor = connection.execute(part_d_sql)
        return cursor.fetchall()

# e. [4 points] Find the highest scoring movies with the smallest cast. List the 5 highest-scoring movies that
# have the fewest cast members. Sort the results by score in descending order, then by number of cast
# members in ascending order, then by movie name in alphabetical order. Format all decimals to two
# places using printf().
# Output format and sample values (movie_title,movie_score,cast_count):
# Star Wars: Holiday Special,75.01,12
# War Games,58.49,33


# 1. movies id (integer) title (text) score (real)
# 2. movie_cast movie_id (integer) cast_id (integer) cast_name (text) birthday (text) popularity (real)

    # Part e Find the Highest Scoring Movies With the Least Amount of Cast [4 points]
    def part_e(self,connection):
        ############### EDIT SQL STATEMENT ###################################
        part_e_sql = """

        select  a.title,
                printf("%.2f", a.score) as movie_score,
                b.cast_count
        from movies a
        inner join (select  movie_id,
                            count(distinct cast_id) as cast_count
                    from movie_cast
                    group by movie_id) b on a.id = b.movie_id
        order by score desc, cast_count asc, a.title asc
        limit 5
        ;
        """
        ######################################################################
        cursor = connection.execute(part_e_sql)
        return cursor.fetchall()


# f. [4 points] Get high scoring actors. Find the top ten cast members who have the highest average movie
# scores. Format all decimals to two places using printf().
# ▪ Sort the output by average score in descending order, then by cast_name in alphabetical order.
# ▪ Do not include movies with score <25 in the average score calculation.
# ▪ Exclude cast members who have appeared in two or fewer movies.
# Output format and sample values (cast_id,cast_name,average_score):
# 8822,Julia Roberts,53.00

    # Part f Get High Scoring Actors [4 points]
    def part_f(self,connection):
        ############### EDIT SQL STATEMENT ###################################
        part_f_sql = """
        select  cast_id,
                cast_name,
                printf("%.2f", average_score) as average_score
        from (select    a.cast_id,
                        a.cast_name,
                        count(distinct a.movie_id) as movie_count,
                        avg(score) as average_score
                from movie_cast a
                inner join (select * from movies where score >= 25) b on a.movie_id = b.id
                group by a.cast_id, a.cast_name
            ) x
        where movie_count > 2
        order by average_score desc, cast_name asc
        limit 10
        ;
        """
        ######################################################################
        cursor = connection.execute(part_f_sql)
        return cursor.fetchall()

    # Part g Creating Views [6 points]
    def part_g(self,connection):
        ############### EDIT SQL STATEMENT ###################################
        part_g_sql = """
        CREATE VIEW good_collaboration AS
        select distinct t1.*
        from    (select  id1.cast_id as cast_member_id1,
                         id2.cast_id as cast_member_id2,
                         count(distinct id1.movie_id) as movie_count,
                         avg(score) as average_movie_score
                from movie_cast id1
                inner join movie_cast id2 on id1.movie_id = id2.movie_id
                inner join movies m on id1.movie_id = m.id
                where id1.cast_id != id2.cast_id
                group by id1.cast_id, id2.cast_id
                having count(distinct id1.movie_id) >= 3 and avg(score) >= 40
                ) t1
        inner join (select  id1.cast_id as cast_member_id1,
                            id2.cast_id as cast_member_id2,
                            count(distinct id1.movie_id) as movie_count,
                            avg(score) as average_movie_score
                    from movie_cast id1
                    inner join movie_cast id2 on id1.movie_id = id2.movie_id
                    inner join movies m on id1.movie_id = m.id
                    where id1.cast_id != id2.cast_id
                    group by id1.cast_id, id2.cast_id
                    having count(distinct id1.movie_id) >= 3 and avg(score) >= 40
                    ) t2 on t1.cast_member_id1 = t2.cast_member_id2
                         and t1.cast_member_id2 = t2.cast_member_id1
        ;
        """

        ######################################################################
        return self.execute_query(connection, part_g_sql)

    def part_gi(self,connection):
        ############### EDIT SQL STATEMENT ###################################
        part_g_i_sql = """
        select  cast_id,
                cast_name,
                printf("%.2f", collaboration_score) as collaboration_score
        from (
                select distinct a.cast_id,
                       cast_name,
                       avg(average_movie_score) as collaboration_score
                from (  select  cast_member_id1 as cast_id,
                                average_movie_score
                        from good_collaboration
                        union all
                        select  cast_member_id2 as cast_id,
                                average_movie_score
                        from good_collaboration
                    ) a
                inner join (select  distinct cast_id,
                                    cast_name
                            from movie_cast
                            ) b on a.cast_id = b.cast_id
                group by a.cast_id, cast_name
            )
        order by collaboration_score desc, cast_name asc
        limit 5
        ;
        """
        ######################################################################
        cursor = connection.execute(part_g_i_sql)
        return cursor.fetchall()

    # Part h FTS [4 points]
    def part_h(self,connection,path):
        ############### EDIT SQL STATEMENT ###################################
        part_h_sql = """
        CREATE VIRTUAL TABLE movie_overview USING fts4(id integer, overview text, tokenize = simple);
        """
        ######################################################################
        connection.execute(part_h_sql)
        ############### CREATE IMPORT CODE BELOW ############################

         # Import CSV file
        with open(path) as fin:  # `with` statement available in 2.5+
            # csv.DictReader uses first line in file for column headings by default
            dr = csv.reader(fin)  # comma is default delimiter
            to_db = [(i[0], i[1]) for i in dr]

        # Upload to the table in database
        connection.executemany("INSERT INTO movie_overview (id, overview) VALUES (?, ?);", to_db)

        ######################################################################
        sql = "SELECT COUNT(id) FROM movie_overview;"
        cursor = connection.execute(sql)
        return cursor.fetchall()[0][0]

    def part_hi(self,connection):
        ############### EDIT SQL STATEMENT ###################################
        part_hi_sql = """
        select count(distinct id)
        from movie_overview
        where overview MATCH 'fight';
        """
        ######################################################################
        cursor = connection.execute(part_hi_sql)
        return cursor.fetchall()[0][0]

    def part_hii(self,connection):
        ############### EDIT SQL STATEMENT ###################################
        part_hii_sql = """
        select count(distinct id)
        from movie_overview
        where overview MATCH 'space NEAR/5 program'
        ;
        """
        ######################################################################
        cursor = connection.execute(part_hii_sql)
        return cursor.fetchall()[0][0]


# Script driver: optionally runs the Sample demo, then executes every part of
# HW2_sql in order against the database file "Q2" and prints the results with
# green ANSI-coloured labels.
if __name__ == "__main__":


    ########################### DO NOT MODIFY THIS SECTION ##########################
    #################################################################################
    if SHOW == True:
        sample = Sample()
        sample.sample()

    print('\033[32m' + "Q2 Output: " + '\033[m')
    db = HW2_sql()
    # NOTE(review): create_connection catches sqlite3 errors itself and returns
    # None, so on failure `conn` is None and the steps below fail inside their
    # own try blocks; the final conn.close() is unguarded.
    try:
        conn = db.create_connection("Q2")
    except:
        print("Database Creation Error")

    # Start from a clean slate so the script can be re-run on the same file.
    try:
        conn.execute("DROP TABLE IF EXISTS movies;")
        conn.execute("DROP TABLE IF EXISTS movie_cast;")
        conn.execute("DROP TABLE IF EXISTS cast_bio;")
        conn.execute("DROP VIEW IF EXISTS good_collaboration;")
        conn.execute("DROP TABLE IF EXISTS movie_overview;")
    except:
        print("Error in Table Drops")

    # Each part runs in its own try block; the bare except prints a generic
    # message and hides the underlying exception.
    try:
        print('\033[32m' + "part ai 1: " + '\033[m' + str(db.part_ai_1(conn)))
        print('\033[32m' + "part ai 2: " + '\033[m' + str(db.part_ai_2(conn)))
    except:
         print("Error in Part a.i")

    # CSV paths are relative to the current working directory.
    try:
        print('\033[32m' + "Row count for Movies Table: " + '\033[m' + str(db.part_aii_1(conn,"data/movies.csv")))
        print('\033[32m' + "Row count for Movie Cast Table: " + '\033[m' + str(db.part_aii_2(conn,"data/movie_cast.csv")))
    except:
        print("Error in part a.ii")

    try:
        print('\033[32m' + "Row count for Cast Bio Table: " + '\033[m' + str(db.part_aiii(conn)))
    except:
        print("Error in part a.iii")

    try:
        print('\033[32m' + "part b 1: " + '\033[m' + db.part_b_1(conn))
        print('\033[32m' + "part b 2: " + '\033[m' + db.part_b_2(conn))
        print('\033[32m' + "part b 3: " + '\033[m' + db.part_b_3(conn))
    except:
        print("Error in part b")

    try:
        print('\033[32m' + "part c: " + '\033[m' + str(db.part_c(conn)))
    except:
        print("Error in part c")

    try:
        print('\033[32m' + "part d: " + '\033[m')
        for line in db.part_d(conn):
            print(line[0],line[1])
    except:
        print("Error in part d")

    try:
        print('\033[32m' + "part e: " + '\033[m')
        for line in db.part_e(conn):
            print(line[0],line[1],line[2])
    except:
        print("Error in part e")

    try:
        print('\033[32m' + "part f: " + '\033[m')
        for line in db.part_f(conn):
            print(line[0],line[1],line[2])
    except:
        print("Error in part f")

    # part_gi reads the view that part_g creates, so the order matters here.
    try:
        print('\033[32m' + "part g: " + '\033[m' + str(db.part_g(conn)))
        print('\033[32m' + "part g.i: " + '\033[m')
        for line in db.part_gi(conn):
            print(line[0],line[1],line[2])
    except:
        print("Error in part g")

    # The printed labels are offset from the method names: "part h.i" shows the
    # result of part_h, "Count h.ii" of part_hi and "Count h.iii" of part_hii.
    try:
        print('\033[32m' + "part h.i: " + '\033[m'+ str(db.part_h(conn,"data/movie_overview.csv")))
        print('\033[32m' + "Count h.ii: " + '\033[m' + str(db.part_hi(conn)))
        print('\033[32m' + "Count h.iii: " + '\033[m' + str(db.part_hii(conn)))
    except:
        print("Error in part h")

    conn.close()
    #################################################################################
    #################################################################################