# Merging movies_metadata with credits & keywords on 'id'
import pandas as pd
import ast
import json
import numpy as np
import matplotlib.pyplot as plt
def load_data(file_path):
    """Load the movies metadata CSV and decode its stringified columns.

    Every column is read as text first. 'release_date' is then converted to
    ``datetime.date`` values (missing or unparseable dates are coerced to
    ``NaT``), and each JSON-like column is evaluated as a Python literal,
    with null cells replaced by ``np.nan``.

    Args:
        file_path: Location of the CSV, passed straight to ``pd.read_csv``.
            The file must contain 'release_date' and every column listed in
            ``json_columns`` below.

    Returns:
        The modified DataFrame.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            non-null cell in a JSON-like column is not a valid Python literal.
    """
    df = pd.read_csv(file_path, dtype='unicode')
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce').dt.date
    json_columns = ['belongs_to_collection', 'genres', 'production_companies', 'production_countries', 'spoken_languages']
    for column in json_columns:
        # use ast because json data has single quotes in the csv, which is invalid for a json object;
        df[column] = df[column].apply(lambda x: np.nan if pd.isnull(x) else ast.literal_eval(x))
    return df
# Read the three source tables; only the metadata goes through load_data
movies_meta = load_data(r"C:\Users\reeds\PycharmProjects\Capstone\Data Set\movies_metadata.csv")
credits_df = pd.read_csv(r"C:\Users\reeds\PycharmProjects\Capstone\Data Set\credits.csv")
keywords_df = pd.read_csv(r"C:\Users\reeds\PycharmProjects\Capstone\Data Set\keywords.csv")
# Give the join key the same numeric dtype in all three tables;
# values that cannot be parsed as numbers become NaN
for frame in (movies_meta, credits_df, keywords_df):
    frame['id'] = pd.to_numeric(frame['id'], errors='coerce')
# Join credits, then keywords, onto the metadata by 'id'
# NOTE(review): if 'id' repeats in credits or keywords, the merges will multiply rows - confirm against the data
movies_meta = movies_meta.merge(credits_df, on='id').merge(keywords_df, on='id')
# Correct the data types: these columns were read as text and should be numeric
for numeric_col in ('budget', 'revenue', 'popularity', 'vote_average', 'vote_count', 'runtime'):
    movies_meta[numeric_col] = pd.to_numeric(movies_meta[numeric_col], errors='coerce')
# 'id' is additionally downcast to the smallest integer dtype that fits
movies_meta['id'] = pd.to_numeric(movies_meta['id'], errors='coerce', downcast="integer")
# load_data stored plain dates; convert them back to pandas datetimes
movies_meta['release_date'] = pd.to_datetime(movies_meta['release_date'], errors='coerce')
# JSON-like columns: genres, belongs_to_collection, production_companies,
# production_countries and spoken_languages
def extract_names(x):
    """Return the 'name' values held in a decoded JSON-like cell.

    Args:
        x: A list of dicts, a single dict, or any other value (e.g. NaN).

    Returns:
        For a list, the 'name' of each dict in it, in order. For a dict, a
        one-element list holding its 'name'. For anything else, an empty
        list.

    Raises:
        KeyError: If a dict that is inspected has no 'name' key.
    """
    if isinstance(x, list):
        return [d['name'] for d in x]
    if isinstance(x, dict):
        # A single dict is wrapped so callers always receive a list
        return [x['name']]
    # Neither a list nor a dict
    return []
# Replace each decoded JSON-like column with its plain list of names
cols = ['genres', 'belongs_to_collection', 'production_companies', 'production_countries', 'spoken_languages']
for col in cols:
    movies_meta[col] = movies_meta[col].apply(extract_names)
# Cast Names
def extract_cast_names(cast_data, limit=10):
    """Return the names of the first ``limit`` cast members.

    Args:
        cast_data: Iterable of dicts, each with a 'name' key.
        limit: Maximum number of names to return (default 10).

    Returns:
        A list of at most ``limit`` names, in the order they appear in
        ``cast_data``.

    Raises:
        KeyError: If one of the inspected entries has no 'name' key.
    """
    # zip stops as soon as range(limit) is exhausted, so entries beyond the
    # limit are never visited
    return [cast_member['name'] for _, cast_member in zip(range(limit), cast_data)]
# Decode the stringified 'cast' column into Python lists; null cells become []
movies_meta['cast'] = movies_meta['cast'].map(
    lambda raw_cast: ast.literal_eval(raw_cast) if not pd.isnull(raw_cast) else []
)
# Build the 'cast_names' column from the decoded lists
movies_meta['cast_names'] = movies_meta['cast'].map(extract_cast_names)
# Director Names
def extract_director_name(crew_data):
    """Return the name of the first crew member whose job is 'Director'.

    Args:
        crew_data: Iterable of dicts, each with 'job' and 'name' keys.

    Returns:
        The 'name' of the first entry with ``job == 'Director'``, or
        ``np.nan`` when no entry matches (including empty input).

    Raises:
        KeyError: If an inspected entry lacks the 'job' key, or the matching
            entry lacks 'name'.
    """
    return next(
        (crew_member['name'] for crew_member in crew_data if crew_member['job'] == 'Director'),
        np.nan,
    )
# Decode the stringified 'crew' column into Python lists; null cells become []
movies_meta['crew'] = movies_meta['crew'].map(
    lambda raw_crew: ast.literal_eval(raw_crew) if not pd.isnull(raw_crew) else []
)
# Build the 'director' column from the decoded lists
movies_meta['director'] = movies_meta['crew'].map(extract_director_name)
# Keywords
# Decode the stringified 'keywords' column into Python lists; null cells become []
movies_meta['keywords'] = movies_meta['keywords'].map(
    lambda raw_keywords: ast.literal_eval(raw_keywords) if not pd.isnull(raw_keywords) else []
)
# Extract keywords from the 'keywords' column
def extract_keywords(keywords_data, limit=10):
    """Return the names of the first ``limit`` keywords.

    Args:
        keywords_data: Iterable of dicts, each with a 'name' key.
        limit: Maximum number of keyword names to return (default 10).

    Returns:
        A list of at most ``limit`` keyword names, in the order they appear
        in ``keywords_data``.

    Raises:
        KeyError: If one of the inspected entries has no 'name' key.
    """
    # Limit to the first `limit` keywords (ten by default); zip stops once
    # range(limit) is exhausted
    return [keyword['name'] for _, keyword in zip(range(limit), keywords_data)]
# Build the 'keywords_list' column from the decoded keyword dicts
movies_meta['keywords_list'] = movies_meta['keywords'].apply(extract_keywords)
# drop columns from drop_df
drop_df = ['homepage', 'poster_path', 'video', 'tagline', 'imdb_id', 'original_title',
           'cast', 'crew', 'keywords', 'adult']  # keywords_list is the variable we want
# errors='ignore' skips any listed column that is not present in the frame
movies_meta = movies_meta.drop(columns=drop_df, errors='ignore')
# Report the number of missing values in every column
missing_values = movies_meta.isna().sum()
print(missing_values)
# Count the rows whose genres list is empty
empty_genres = movies_meta['genres'].map(len).eq(0).sum()
print(f"Number of empty genres: {empty_genres}")
# Count the entries whose budget / revenue is exactly zero
zero_budget = movies_meta['budget'].eq(0).sum()
zero_revenue = movies_meta['revenue'].eq(0).sum()
print(f"Number of zero budget entries: {zero_budget}")
print(f"Number of zero revenue entries: {zero_revenue}")
belongs_to_collection 0 budget 0 genres 0 id 0 original_language 11 overview 995 popularity 0 production_companies 0 production_countries 0 release_date 84 revenue 0 runtime 264 spoken_languages 0 status 82 title 0 vote_average 0 vote_count 0 cast_names 0 director 917 keywords_list 0 dtype: int64 Number of empty genres: 2524 Number of zero budget entries: 37571 Number of zero revenue entries: 39087
# Show the first five rows of movies_meta.
# NOTE(review): `display` is not imported in the visible source - presumably the Jupyter/IPython builtin; confirm
display(movies_meta.head(5))
belongs_to_collection | budget | genres | id | original_language | overview | popularity | production_companies | production_countries | release_date | revenue | runtime | spoken_languages | status | title | vote_average | vote_count | cast_names | director | keywords_list | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | [] | 160000000 | [Action, Thriller, Science Fiction, Mystery, A... | 27205 | en | Cobb, a skilled thief who commits corporate es... | 29.108149 | [Legendary Pictures, Warner Bros., Syncopy] | [United Kingdom, United States of America] | 2010-07-14 | 825532764 | 148.0 | [English] | Released | Inception | 8.1 | 14075 | [Leonardo DiCaprio, Joseph Gordon-Levitt, Elle... | Christopher Nolan | [loss of lover, dream, kidnapping, sleep, subc... |
1 | [The Dark Knight Collection] | 185000000 | [Drama, Action, Crime, Thriller] | 155 | en | Batman raises the stakes in his war on crime. ... | 123.167259 | [DC Comics, Legendary Pictures, Warner Bros., ... | [United Kingdom, United States of America] | 2008-07-16 | 1004558444 | 152.0 | [English, 普通话] | Released | The Dark Knight | 8.3 | 12269 | [Christian Bale, Michael Caine, Heath Ledger, ... | Christopher Nolan | [dc comics, crime fighter, secret identity, sc... |
2 | [Avatar Collection] | 237000000 | [Action, Adventure, Fantasy, Science Fiction] | 19995 | en | In the 22nd century, a paraplegic Marine is di... | 185.070892 | [Ingenious Film Partners, Twentieth Century Fo... | [United States of America, United Kingdom] | 2009-12-10 | 2787965087 | 162.0 | [English, Español] | Released | Avatar | 7.2 | 12114 | [Sam Worthington, Zoe Saldana, Sigourney Weave... | James Cameron | [culture clash, future, space war, space colon... |
3 | [The Avengers Collection] | 220000000 | [Science Fiction, Action, Adventure] | 24428 | en | When an unexpected enemy emerges and threatens... | 89.887648 | [Paramount Pictures, Marvel Studios] | [United States of America] | 2012-04-25 | 1519557910 | 143.0 | [English] | Released | The Avengers | 7.4 | 12000 | [Robert Downey Jr., Chris Evans, Mark Ruffalo,... | Joss Whedon | [new york, shield, marvel comic, superhero, ba... |
4 | [Deadpool Collection] | 58000000 | [Action, Adventure, Comedy] | 293660 | en | Deadpool tells the origin story of former Spec... | 187.860492 | [Twentieth Century Fox Film Corporation, Marve... | [United States of America] | 2016-02-09 | 783112979 | 108.0 | [English] | Released | Deadpool | 7.4 | 11444 | [Ryan Reynolds, Morena Baccarin, Ed Skrein, T.... | Tim Miller | [anti hero, mercenary, marvel comic, superhero... |
# NOTE(review): bare `fig` expressions; `fig` is not defined in the visible source -
# presumably notebook cells rendering figures built in omitted code; confirm
fig
fig
# Print the updated dataframe: first 13 rows of the weighted_rating, title and director columns
# NOTE(review): 'weighted_rating' is not created in the visible source - presumably added in omitted code; confirm
movies_meta[['weighted_rating', 'title', 'director']].head(13)
weighted_rating | title | director | |
---|---|---|---|
19 | 8.360240 | The Shawshank Redemption | Frank Darabont |
61 | 8.309658 | The Godfather | Francis Ford Coppola |
1 | 8.209996 | The Dark Knight | Christopher Nolan |
8 | 8.186913 | Fight Club | David Fincher |
17 | 8.174380 | Pulp Fiction | Quentin Tarantino |
21 | 8.071673 | Forrest Gump | Robert Zemeckis |
150 | 8.064963 | Schindler's List | Steven Spielberg |
153 | 8.062025 | Whiplash | Damien Chazelle |
181 | 8.039924 | Spirited Away | Hayao Miyazaki |
63 | 8.028738 | The Empire Strikes Back | Irvin Kershner |
0 | 8.027068 | Inception | Christopher Nolan |
209 | 8.019146 | Life Is Beautiful | Roberto Benigni |
88 | 8.011480 | The Intouchables | Eric Toledano |
# Show the weighted_rating, title and director columns of top_comedy_movies
# NOTE(review): top_comedy_movies is not defined in the visible source; confirm where it is built
top_comedy_movies[['weighted_rating', 'title', 'director']]
weighted_rating | title | director | |
---|---|---|---|
21 | 8.071673 | Forrest Gump | Robert Zemeckis |
209 | 8.019146 | Life Is Beautiful | Roberto Benigni |
88 | 8.011480 | The Intouchables | Eric Toledano |
53 | 7.847687 | Back to the Future | Robert Zemeckis |
129 | 7.799760 | The Grand Budapest Hotel | Wes Anderson |
38 | 7.764797 | The Wolf of Wall Street | Martin Scorsese |
39 | 7.764212 | Inside Out | Pete Docter |
1646 | 7.734884 | Dilwale Dulhania Le Jayenge | Aditya Chopra |
122 | 7.711893 | La La Land | Damien Chazelle |
34 | 7.675550 | Up | Pete Docter |
# Show the title and weighted_rating columns of ranked_anderson_movies
# NOTE(review): ranked_anderson_movies is not defined in the visible source; confirm where it is built
ranked_anderson_movies[['title', 'weighted_rating']]
title | weighted_rating | |
---|---|---|
129 | The Grand Budapest Hotel | 7.799760 |
626 | Moonrise Kingdom | 7.202533 |
939 | Fantastic Mr. Fox | 7.007961 |
859 | The Royal Tenenbaums | 6.963711 |
1582 | Rushmore | 6.778962 |
1288 | The Darjeeling Limited | 6.613823 |
1423 | The Life Aquatic with Steve Zissou | 6.575526 |
3959 | Hotel Chevalier | 6.094411 |
3167 | Bottle Rocket | 6.088710 |
10464 | Come Together | 5.786662 |
# Request recommendations for a single title and show the result
# NOTE(review): content_based_recommendations and tfidf_matrix are not defined in the visible source; confirm
movie_title = "Toy Story"
recommendations = content_based_recommendations(movie_title, tfidf_matrix)
print(f"Top 10 recommendations for '{movie_title}':")
recommendations
title | similarity_score | |
---|---|---|
0 | Toy Story 2 | 0.511418 |
1 | Toy Story 3 | 0.459096 |
2 | Toy Story That Time Forgot | 0.292212 |
3 | Toy Story of Terror! | 0.272110 |
4 | Small Fry | 0.264771 |
5 | Hawaiian Vacation | 0.248074 |
6 | Toy Reanimator | 0.217118 |
7 | Small Soldiers | 0.211142 |
8 | Partysaurus Rex | 0.207263 |
9 | Silent Night, Deadly Night 5: The Toy Maker | 0.205390 |
# Show top_10_recommended_movies
# NOTE(review): neither top_10_recommended_movies nor `display` is defined in the visible source; confirm
display(top_10_recommended_movies)
title | |
---|---|
0 | A Nightmare on Elm Street |
1 | Romeo + Juliet |
2 | 48 Hrs. |
3 | Once Were Warriors |
4 | Sissi |
5 | Terminator 3: Rise of the Machines |
6 | Monsoon Wedding |
# Request recommendations for user 1 and show the result
# NOTE(review): recommend_movies_for_user, predicted_ratings_df and ratings are not defined in the visible source; confirm
user_id = 1
recommended_movies = recommend_movies_for_user(user_id, predicted_ratings_df, ratings)
recommended_movies
userId | movieId | predicted_rating | title | |
---|---|---|---|---|
0 | 1 | 3030 | 3.557553 | End of the World |
1 | 1 | 73290 | 3.488766 | Urban Explorer |
2 | 1 | 759 | 3.449224 | Gentlemen Prefer Blondes |
3 | 1 | 55069 | 3.446386 | The Delivery |
4 | 1 | 51277 | 3.420188 | Now and Forever |
5 | 1 | 116 | 3.417106 | Match Point |
6 | 1 | 50641 | 3.403370 | The End of Poverty? |
7 | 1 | 309 | 3.396181 | The Celebration |
8 | 1 | 1649 | 3.391616 | Bill & Ted's Bogus Journey |
9 | 1 | 149 | 3.378770 | Akira |