# Merging movies_metadat with credits & keywords ON 'ID' 

import pandas as pd
import ast
import json 
import numpy as np
import matplotlib.pyplot as plt



def load_data(file_path):
    #Reading the CSV file into a DataFrame
    #Convert the 'release_date' column to datetime.date type
    # Evaluating each element in the columns that have json data and replacing any null values with np.nan and evaluating any non-null values as a Python literal expression and returning the resulting value.
    # Returning the final modified DataFrame
    df = pd.read_csv(file_path, dtype='unicode')
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce').apply(lambda x: x.date())
    
    json_columns = ['belongs_to_collection', 'genres', 'production_companies', 'production_countries', 'spoken_languages']
    for column in json_columns:
        # use ast because json data has single quotes in the csv, which is invalid for a json object;
        df[column] = df[column].apply(lambda x: np.nan if pd.isnull(x) else ast.literal_eval(x))
    return df


# load data
movies_meta = load_data(r"C:\Users\reeds\PycharmProjects\Capstone\Data Set\movies_metadata.csv")
credits_df = pd.read_csv(r"C:\Users\reeds\PycharmProjects\Capstone\Data Set\credits.csv")
keywords_df = pd.read_csv(r"C:\Users\reeds\PycharmProjects\Capstone\Data Set\keywords.csv")

# Make sure id column is the same data type in credits_df & movies_meta
movies_meta['id'] = pd.to_numeric(movies_meta['id'], errors='coerce')
credits_df['id'] = pd.to_numeric(credits_df['id'], errors='coerce')
keywords_df['id'] = pd.to_numeric(keywords_df['id'], errors='coerce')
# Merge on 'id'
movies_meta = movies_meta.merge(credits_df, on = 'id')
movies_meta = movies_meta.merge(keywords_df, on = 'id')


# Checking and correcting data types
# change budget, Revenue, id, popularity, Vote_average, Vote_Count to numeric
movies_meta['budget'] = pd.to_numeric(movies_meta['budget'], errors='coerce')
movies_meta['revenue'] = pd.to_numeric(movies_meta['revenue'], errors='coerce')
movies_meta["id"] = pd.to_numeric(movies_meta['id'], errors='coerce', downcast="integer")
movies_meta["popularity"] = pd.to_numeric(movies_meta['popularity'], errors='coerce')
movies_meta['release_date'] = pd.to_datetime(movies_meta['release_date'], errors='coerce')
movies_meta['vote_average'] = pd.to_numeric(movies_meta['vote_average'], errors='coerce')
movies_meta['vote_count'] = pd.to_numeric(movies_meta['vote_count'], errors='coerce')
movies_meta['runtime'] = pd.to_numeric(movies_meta['runtime'], errors = 'coerce')


# JSON variables : Genres, Belongs_to_collection, Production companies & countries and spoken languages
# Genres, Belongs_to_collection, Prod Companies, Prod Countries and Spoken Languages
def extract_names(x):
    # Check if the input is a list
    if isinstance(x, list):
        # If it's a list, extract the name value from each dictionary in the list
        return [d['name'] for d in x]
    # Check if the input is a dictionary
    elif isinstance(x, dict):
        # If it's a dictionary, return the name value from the dictionary
        return [x['name']]
    # If the input is neither a list nor a dictionary, return an empty list
    else:
        return []

cols = ['genres', 'belongs_to_collection', 'production_companies', 'production_countries', 'spoken_languages']

for col in cols:
    movies_meta[col] = movies_meta[col].apply(lambda x: extract_names(x))
    
    
# Cast Names
def extract_cast_names(cast_data):
    cast_names = []
    for i, cast_member in enumerate(cast_data):
        if i < 10:
            cast_names.append(cast_member['name'])
        else:
            break
    return cast_names
# Ensure the 'cast' column has list data
movies_meta['cast'] = movies_meta['cast'].apply(lambda x: [] if pd.isnull(x) else ast.literal_eval(x))

# Extract cast names from the 'cast' column
movies_meta['cast_names'] = movies_meta['cast'].apply(extract_cast_names)


# Director Names
def extract_director_name(crew_data):
    for crew_member in crew_data:
        if crew_member['job'] == 'Director':
            return crew_member['name']
    return np.nan

# Ensure the 'crew' column has list data
movies_meta['crew'] = movies_meta['crew'].apply(lambda x: [] if pd.isnull(x) else ast.literal_eval(x))

# Extract director names from the 'crew' column
movies_meta['director'] = movies_meta['crew'].apply(extract_director_name)



# Keywords
# Ensure the 'keywords' column has list data
movies_meta['keywords'] = movies_meta['keywords'].apply(lambda x: [] if pd.isnull(x) else ast.literal_eval(x))
# Extract keywords from the 'keywords' column
def extract_keywords(keywords_data):
    keywords = []
    for i, keyword in enumerate(keywords_data):
        if i < 10:  # Limit to the first five keywords
            keywords.append(keyword['name'])
        else:
            break
    return keywords

movies_meta['keywords_list'] = movies_meta['keywords'].apply(extract_keywords)


# drop columns from drop_df
drop_df = ['homepage', 'poster_path', 'video', 'tagline', 'imdb_id', 'original_title', 
           'cast','crew', 'keywords', 'adult']  # keywords_list is the variable we want
for col in drop_df:
    if col in movies_meta.columns:
        movies_meta = movies_meta.drop(col, axis=1)


# Let's look at the total number of missing values for each predictor
missing_values = movies_meta.isna().sum()
print(missing_values)

empty_genres = movies_meta['genres'].apply(lambda x: len(x) == 0).sum()
print(f"Number of empty genres: {empty_genres}")


zero_budget = movies_meta['budget'].apply(lambda x: x == 0).sum()
zero_revenue = movies_meta['revenue'].apply(lambda x: x == 0).sum()

print(f"Number of zero budget entries: {zero_budget}")
print(f"Number of zero revenue entries: {zero_revenue}")

belongs_to_collection      0
budget                     0
genres                     0
id                         0
original_language         11
overview                 995
popularity                 0
production_companies       0
production_countries       0
release_date              84
revenue                    0
runtime                  264
spoken_languages           0
status                    82
title                      0
vote_average               0
vote_count                 0
cast_names                 0
director                 917
keywords_list              0
dtype: int64
Number of empty genres: 2524
Number of zero budget entries: 37571
Number of zero revenue entries: 39087


display(movies_meta.head(5))

fig

fig


# Print the updated dataframe
movies_meta[['weighted_rating', 'title', 'director']].head(13)


top_comedy_movies[['weighted_rating', 'title', 'director']]


ranked_anderson_movies[['title', 'weighted_rating']]


movie_title = "Toy Story"
recommendations = content_based_recommendations(movie_title, tfidf_matrix)
print(f"Top 10 recommendations for '{movie_title}':")
recommendations


display(top_10_recommended_movies)


user_id = 1
recommended_movies = recommend_movies_for_user(user_id, predicted_ratings_df, ratings)
recommended_movies

	belongs_to_collection	budget	genres	id	original_language	overview	popularity	production_companies	production_countries	release_date	revenue	runtime	spoken_languages	status	title	vote_average	vote_count	cast_names	director	keywords_list
0	[]	160000000	[Action, Thriller, Science Fiction, Mystery, A...	27205	en	Cobb, a skilled thief who commits corporate es...	29.108149	[Legendary Pictures, Warner Bros., Syncopy]	[United Kingdom, United States of America]	2010-07-14	825532764	148.0	[English]	Released	Inception	8.1	14075	[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle...	Christopher Nolan	[loss of lover, dream, kidnapping, sleep, subc...
1	[The Dark Knight Collection]	185000000	[Drama, Action, Crime, Thriller]	155	en	Batman raises the stakes in his war on crime. ...	123.167259	[DC Comics, Legendary Pictures, Warner Bros., ...	[United Kingdom, United States of America]	2008-07-16	1004558444	152.0	[English, 普通话]	Released	The Dark Knight	8.3	12269	[Christian Bale, Michael Caine, Heath Ledger, ...	Christopher Nolan	[dc comics, crime fighter, secret identity, sc...
2	[Avatar Collection]	237000000	[Action, Adventure, Fantasy, Science Fiction]	19995	en	In the 22nd century, a paraplegic Marine is di...	185.070892	[Ingenious Film Partners, Twentieth Century Fo...	[United States of America, United Kingdom]	2009-12-10	2787965087	162.0	[English, Español]	Released	Avatar	7.2	12114	[Sam Worthington, Zoe Saldana, Sigourney Weave...	James Cameron	[culture clash, future, space war, space colon...
3	[The Avengers Collection]	220000000	[Science Fiction, Action, Adventure]	24428	en	When an unexpected enemy emerges and threatens...	89.887648	[Paramount Pictures, Marvel Studios]	[United States of America]	2012-04-25	1519557910	143.0	[English]	Released	The Avengers	7.4	12000	[Robert Downey Jr., Chris Evans, Mark Ruffalo,...	Joss Whedon	[new york, shield, marvel comic, superhero, ba...
4	[Deadpool Collection]	58000000	[Action, Adventure, Comedy]	293660	en	Deadpool tells the origin story of former Spec...	187.860492	[Twentieth Century Fox Film Corporation, Marve...	[United States of America]	2016-02-09	783112979	108.0	[English]	Released	Deadpool	7.4	11444	[Ryan Reynolds, Morena Baccarin, Ed Skrein, T....	Tim Miller	[anti hero, mercenary, marvel comic, superhero...

	weighted_rating	title	director
19	8.360240	The Shawshank Redemption	Frank Darabont
61	8.309658	The Godfather	Francis Ford Coppola
1	8.209996	The Dark Knight	Christopher Nolan
8	8.186913	Fight Club	David Fincher
17	8.174380	Pulp Fiction	Quentin Tarantino
21	8.071673	Forrest Gump	Robert Zemeckis
150	8.064963	Schindler's List	Steven Spielberg
153	8.062025	Whiplash	Damien Chazelle
181	8.039924	Spirited Away	Hayao Miyazaki
63	8.028738	The Empire Strikes Back	Irvin Kershner
0	8.027068	Inception	Christopher Nolan
209	8.019146	Life Is Beautiful	Roberto Benigni
88	8.011480	The Intouchables	Eric Toledano

	weighted_rating	title	director
21	8.071673	Forrest Gump	Robert Zemeckis
209	8.019146	Life Is Beautiful	Roberto Benigni
88	8.011480	The Intouchables	Eric Toledano
53	7.847687	Back to the Future	Robert Zemeckis
129	7.799760	The Grand Budapest Hotel	Wes Anderson
38	7.764797	The Wolf of Wall Street	Martin Scorsese
39	7.764212	Inside Out	Pete Docter
1646	7.734884	Dilwale Dulhania Le Jayenge	Aditya Chopra
122	7.711893	La La Land	Damien Chazelle
34	7.675550	Up	Pete Docter

	title	weighted_rating
129	The Grand Budapest Hotel	7.799760
626	Moonrise Kingdom	7.202533
939	Fantastic Mr. Fox	7.007961
859	The Royal Tenenbaums	6.963711
1582	Rushmore	6.778962
1288	The Darjeeling Limited	6.613823
1423	The Life Aquatic with Steve Zissou	6.575526
3959	Hotel Chevalier	6.094411
3167	Bottle Rocket	6.088710
10464	Come Together	5.786662

	title	similarity_score
0	Toy Story 2	0.511418
1	Toy Story 3	0.459096
2	Toy Story That Time Forgot	0.292212
3	Toy Story of Terror!	0.272110
4	Small Fry	0.264771
5	Hawaiian Vacation	0.248074
6	Toy Reanimator	0.217118
7	Small Soldiers	0.211142
8	Partysaurus Rex	0.207263
9	Silent Night, Deadly Night 5: The Toy Maker	0.205390

Analysis of IMDB Movie Ratings, with Emphasis on User Predictions and Recommendations

By: Reed Shay and Advisor Dr. Ying-Ju Chen

Table of Contents

Import & Merge Data

Clean Data for Analysis

Changing Datatypes

Handling JSON Variables

JSON columns cont.

Dropping Columns

Analyze NA and Zero Values

Cleaned Dataset After

Exploratory Data Analysis (EDA)

Recommendation Systems

Popularity Based Recommendation

Original Dataframe, Ordered by Weighted rating

Applications

Based off Director, Wes Anderson

Content Based Filtering

Cosine Similarity

Top 10 Recommendations for Toy Story

Problems/Improvements

User-based Collaborative Filtering for Movie Recommendations

Recommendations for User 2

Problems/further progression

Matrix Factorization with Neural Networks

Problems

Further Work

Personal Problems Faced

Questions?

About Me: Reed Shay

	title
0	A Nightmare on Elm Street
1	Romeo + Juliet
2	48 Hrs.
3	Once Were Warriors
4	Sissi
5	Terminator 3: Rise of the Machines
6	Monsoon Wedding

	userId	movieId	predicted_rating	title
0	1	3030	3.557553	End of the World
1	1	73290	3.488766	Urban Explorer
2	1	759	3.449224	Gentlemen Prefer Blondes
3	1	55069	3.446386	The Delivery
4	1	51277	3.420188	Now and Forever
5	1	116	3.417106	Match Point
6	1	50641	3.403370	The End of Poverty?
7	1	309	3.396181	The Celebration
8	1	1649	3.391616	Bill & Ted's Bogus Journey
9	1	149	3.378770	Akira