As someone who spends copious amounts of time scouring Netflix for a worthwhile watch, I wanted to marry up two personal interests: cinema and statistical analysis. This is my first attempt at using K-means clustering to identify films with similar characteristics, based on the 'Rotten Tomatoes movies and critic reviews' Kaggle dataset. My approach is deliberately simplified for the time being, as I pre-selected the attributes I would use if approaching this task manually. That said, I plan to develop the model to incorporate additional features and refreshed data, given the dataset's 2020 cut-off.
The program prompts the user for a film and recommends similar films located within the same cluster. Results within a cluster are sorted by Rotten Tomatoes score rather than Euclidean distance, so relatively high emphasis is placed on film scores.
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
%matplotlib inline
pd.set_option('display.max_columns', None)
# Load the dataset
df = pd.read_csv('rotten_tomatoes_movies.csv')
df.head(3)
| | rotten_tomatoes_link | movie_title | movie_info | critics_consensus | content_rating | genres | directors | authors | actors | original_release_date | streaming_release_date | runtime | production_company | tomatometer_status | tomatometer_rating | tomatometer_count | audience_status | audience_rating | audience_count | tomatometer_top_critics_count | tomatometer_fresh_critics_count | tomatometer_rotten_critics_count |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | m/0814255 | Percy Jackson & the Olympians: The Lightning T... | Always trouble-prone, the life of teenager Per... | Though it may seem like just another Harry Pot... | PG | Action & Adventure, Comedy, Drama, Science Fic... | Chris Columbus | Craig Titley, Chris Columbus, Rick Riordan | Logan Lerman, Brandon T. Jackson, Alexandra Da... | 2010-02-12 | 2015-11-25 | 119.0 | 20th Century Fox | Rotten | 49.0 | 149.0 | Spilled | 53.0 | 254421.0 | 43 | 73 | 76 |
| 1 | m/0878835 | Please Give | Kate (Catherine Keener) and her husband Alex (... | Nicole Holofcener's newest might seem slight i... | R | Comedy | Nicole Holofcener | Nicole Holofcener | Catherine Keener, Amanda Peet, Oliver Platt, R... | 2010-04-30 | 2012-09-04 | 90.0 | Sony Pictures Classics | Certified-Fresh | 87.0 | 142.0 | Upright | 64.0 | 11574.0 | 44 | 123 | 19 |
| 2 | m/10 | 10 | A successful, middle-aged Hollywood songwriter... | Blake Edwards' bawdy comedy may not score a pe... | R | Comedy, Romance | Blake Edwards | Blake Edwards | Dudley Moore, Bo Derek, Julie Andrews, Robert ... | 1979-10-05 | 2014-07-24 | 122.0 | Waner Bros. | Fresh | 67.0 | 24.0 | Spilled | 53.0 | 14684.0 | 2 | 16 | 8 |
The dataset provides many candidate attributes for modelling; however, one challenge is that several of the most useful ones are not in a directly usable format. For example, the 'genres' attribute is non-numeric and its values are not uniformly defined (e.g. Horror, Comedy and Comedy, Horror all appear as distinct entries, with further combinations on top of these). To handle this, the most common genre strings are identified and mapped to new indicator columns in the original dataframe, flagging instances of each common genre in a film's genre string.
# Set attributes in relevant format for k-means clustering
df['release_year'] = pd.to_datetime(df['original_release_date']).dt.year

def encodeAttributes(col, n):
    # Return the top n most frequent entries in the column
    top_values = df[col].value_counts().index[:n]
    return top_values

def appendAttributes(col, n):
    # Append an indicator column for each of the top n entries to the original dataframe
    for item in encodeAttributes(col, n):
        df[item] = df[col].fillna('').str.lower().apply(lambda x: 1 if item.lower() in x else 0)
    return df
# Select most relevant features - manually selected for time-being
df = appendAttributes('genres', 10)
df = appendAttributes('directors', 10)
selected_attributes = ['tomatometer_rating', 'release_year', 'audience_count'] + list(encodeAttributes('genres', 10)) + list(encodeAttributes('directors', 10))
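Note that this is multi-label indicator encoding rather than strict one-hot encoding: a film whose genre string contains several of the top genres gets a 1 in each matching column. A minimal standalone sketch of the substring matching used above, on toy data rather than the Kaggle file:

```python
import pandas as pd

# toy genre strings, including a multi-genre entry and a missing value
toy = pd.DataFrame({'genres': ['Comedy', 'Comedy, Horror', None]})
for genre in ['comedy', 'horror']:
    toy[genre] = toy['genres'].fillna('').str.lower().apply(
        lambda x, g=genre: 1 if g in x else 0)
print(toy[['comedy', 'horror']].values.tolist())  # [[1, 0], [1, 1], [0, 0]]
```

The second row is flagged under both genres, which is what lets combined entries like "Comedy, Horror" contribute to each of their component genre columns.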
# Impute missing values, then standardise feature values
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X = imputer.fit_transform(df[selected_attributes])
X = scaler.fit_transform(X)
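The usual convention is to impute before scaling, so fill values are computed on the raw feature scale. scikit-learn's Pipeline chains the two steps and keeps the order explicit; a minimal sketch with toy data (variable names here are illustrative):

```python
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

prep = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),  # fill NaNs with column means
    ('scale', StandardScaler()),                 # then centre and rescale to unit variance
])
toy = np.array([[1.0, 10.0], [3.0, np.nan], [5.0, 30.0]])
X_toy = prep.fit_transform(toy)
print(X_toy.mean(axis=0).round(6))  # both columns are centred at 0
```

Chaining the steps this way also makes it easy to apply the identical preprocessing to any new rows via `prep.transform`.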
# Plot heatmap showing correlation matrix of selected attributes
corr = df[selected_attributes].corr()
plt.figure(figsize=(8, 6))
heatmap = plt.imshow(corr, cmap='coolwarm', interpolation='nearest')
plt.title("Attribute Correlation Heatmap")
plt.xticks(np.arange(corr.shape[1]), labels=corr.columns, rotation=90)
plt.yticks(np.arange(corr.shape[1]), labels=corr.columns)
plt.colorbar(heatmap, fraction=0.046, pad=0.04)
plt.tight_layout()
plt.show()
The Elbow Method is used here to help determine the optimal number of clusters for K-means. It allows us to find the "elbow point" on a plot of the within-cluster sum of squares (WCSS) against the number of clusters. The WCSS is a measure of how spread out the data points in a cluster are. As the number of clusters increases, the WCSS tends to decrease because the data points are closer to the cluster centers. However, after a certain point, the reduction in WCSS becomes less significant, and that's where the "elbow" appears on the plot. The number of clusters at the elbow is often a good choice for K in K-means.
# Create an empty list to store the inertia (sum of squared distances to the closest cluster center)
inertia = []
# Define a range of values for k
k_range = range(1, 50)
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X)
    inertia.append(kmeans.inertia_)
# Plot the inertia values
plt.figure(figsize=(8, 6))
plt.plot(k_range, inertia, marker='o', linestyle='-', color='b')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.grid(True)
plt.show()
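The elbow is often ambiguous to read off by eye; the silhouette score offers a complementary criterion, with higher values indicating tighter, better-separated clusters. A hedged sketch on small synthetic blobs (rerunning over the full dataset and k range would be slow):

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# synthetic data with a known cluster structure
X_demo, _ = make_blobs(n_samples=300, centers=4, random_state=42)
scores = {}
for k in range(2, 7):  # silhouette requires at least 2 clusters
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X_demo)
    scores[k] = silhouette_score(X_demo, labels)
best_k = max(scores, key=scores.get)  # k with the highest average silhouette
print(best_k)
```

In practice the silhouette curve and the elbow plot can be read together: agreement between the two lends more confidence to the chosen k.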
Based on the Elbow Method, k=20 was chosen as the number of clusters for this analysis. Plotting the average Rotten Tomatoes score per cluster shows a range of values for this attribute. That said, some clusters share a similar average, which implies differentiation across other attributes. Exploring this further with a parallel coordinates plot reveals major outliers in the attribute data, with some clusters effectively self-identifying around these extreme points. This is not entirely surprising given attributes such as the named-director flags, each of which applies to only a handful of films. It would also explain why different clusters can share similar average scores: the score similarity may simply be an effect of clustering on other attributes rather than a cause.
# Use K-means clustering to identify similar movies based on selected attributes
kmeans = KMeans(n_clusters=20, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X)
df['Cluster'] = clusters
mean_by_category = df.groupby('Cluster')['tomatometer_rating'].mean()
# Display average RT score for each cluster
plt.scatter(mean_by_category.index, mean_by_category)
plt.xlabel('Cluster ID')
plt.ylabel('RT Score')
plt.title('Average Cluster Scores')
plt.grid(True)
plt.show()
scaled_attributes = pd.DataFrame(X, columns = selected_attributes)
scaled_attributes['Cluster'] = clusters
# Plot parallel coordinates chart to display cluster characteristics
plt.figure(figsize=(8, 6))
pd.plotting.parallel_coordinates(scaled_attributes, 'Cluster', colormap='viridis')
plt.title("Parallel Coordinates Plot for K-means Clustering")
plt.xlabel("Attributes")
plt.ylabel("Values")
plt.xticks(fontsize=8, rotation=90)
legend_labels = [str(i) for i in sorted(scaled_attributes['Cluster'].unique())]
custom_legend = plt.legend(title='Cluster', loc='upper right', labels=legend_labels)
plt.show()
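A quick check on the outlier effect described above is the cluster size distribution: on the fitted model this is simply `df['Cluster'].value_counts()`, and clusters anchored by a rare attribute tend to be tiny. A standalone sketch of the mechanism, using a toy feature matrix where one rare binary flag dominates:

```python
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

rng = np.random.default_rng(42)
# 200 points: one noisy feature, plus a flag that is large for only 5 points
X_demo = np.column_stack([
    rng.normal(size=200),
    np.r_[np.ones(5) * 10, np.zeros(195)],
])
labels = KMeans(n_clusters=3, random_state=42, n_init=10).fit_predict(X_demo)
sizes = pd.Series(labels).value_counts()
print(sizes.min())  # the rare-flag points form a much smaller cluster
```

This mirrors what the director indicator columns do at full scale: a flag held by only a few films pulls those films into their own small cluster regardless of the other attributes.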
def getMovieRec():
    # Prompt user for a film and return a dataframe of films located within the same cluster
    print('-' * 78)
    keyword = input("Enter a movie title for similar recommendations: ")
    search_results = df[df['movie_title'].str.contains(keyword, case=False, na=False)]
    if search_results.empty:
        print("No matching titles found.")
        return None
    print('-' * 78)
    print("Search Results:")
    print(search_results[['movie_title', 'release_year']])
    if len(search_results) > 1:
        try:
            search_id = int(input("Please select the key of the relevant movie: "))
            cluster_id = int(search_results['Cluster'].loc[search_id])
        except (ValueError, KeyError):
            print('Please enter a valid movie key.')
            return None
    else:
        cluster_id = int(search_results['Cluster'].iloc[0])
    filtered_df = df[df['Cluster'] == cluster_id].sort_values(by='tomatometer_rating', ascending=False)
    print('-' * 78)
    print('The following titles are recommended:')
    print(filtered_df[['movie_title', 'tomatometer_rating', 'genres']].head(3))
    print('-' * 78)
    return filtered_df
filtered_df = getMovieRec()
if filtered_df is not None:
    key_prompt = int(input('Please provide key of recommended film to obtain more movie info: '))
    print('-' * 78)
    print(filtered_df.head(3)['movie_info'].loc[key_prompt])
    print('-' * 78)
------------------------------------------------------------------------------
Enter a movie title for similar recommendations: Zombieland
------------------------------------------------------------------------------
Search Results:
movie_title release_year
17699 Zombieland 2009.0
17700 Zombieland: Double Tap 2019.0
Please select the key of the relevant movie: 17699
------------------------------------------------------------------------------
The following titles are recommended:
movie_title tomatometer_rating \
8726 John Mulaney: Kid Gorgeous at Radio City 100.0
7514 Harold's Going Stiff 100.0
14249 Tampopo 100.0
genres
8726 Comedy
7514 Comedy, Horror
14249 Art House & International, Comedy
------------------------------------------------------------------------------
Please provide key of recommended film to obtain more movie info: 7514
------------------------------------------------------------------------------
Harold suffers form a disease that slowly causes him to become a zombie.
------------------------------------------------------------------------------
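As noted in the introduction, recommendations are ordered by Rotten Tomatoes score rather than by proximity to the chosen film. An alternative would rank within-cluster candidates by Euclidean distance to the query film's scaled feature row, so the closest matches come first. A minimal sketch on toy vectors (the variable names are illustrative, not from the code above):

```python
import numpy as np

# toy scaled feature rows: the query film plus three same-cluster candidates
query = np.array([0.5, 1.0, 0.0])
candidates = np.array([[0.4, 1.1, 0.0],
                       [2.0, -1.0, 0.5],
                       [0.0, 0.0, 0.0]])
dists = np.linalg.norm(candidates - query, axis=1)  # Euclidean distance per row
order = np.argsort(dists)                           # nearest-first candidate indices
print(order.tolist())  # [0, 2, 1]
```

On the real data this would mean restricting to the query film's cluster, computing distances over the standardised attribute matrix, and sorting by distance instead of (or in combination with) the tomatometer rating.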
The model is functional in creating clusters of films based on the selected attributes. The granularity of these clusters still lacks depth, however, given the limitations of the input data and modelling. To put this into perspective, it is relatively easy to cluster films released in a certain period with similar overall scores, but that won't help identify commonalities based on more nuanced characteristics. If I am looking to satisfy a particular itch for a feature depicting a zombie apocalypse with romantic undertones, the current set-up unfortunately won't suffice. Further development will be required to use movie descriptions as an input to the model, along with additional feature analysis.
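One possible route for the description-based extension mentioned above is TF-IDF vectorisation of the movie_info text, which turns free-form descriptions into numeric vectors that can feed clustering or a cosine-similarity lookup. A hedged sketch on made-up toy descriptions:

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# toy stand-ins for movie_info entries
docs = [
    "zombie apocalypse with romantic undertones",   # the query description
    "a zombie apocalypse overruns the city",        # thematically close
    "a lighthearted romantic comedy in Paris",      # thematically distant
]
tfidf = TfidfVectorizer(stop_words='english')
vectors = tfidf.fit_transform(docs)
# similarity of the query description to the other two
sims = cosine_similarity(vectors[0], vectors[1:]).ravel()
print(sims[0] > sims[1])  # the zombie film scores closer than the rom-com
```

This would not replace the attribute-based clusters so much as complement them, letting the recommender pick up on plot-level similarities that genre and director flags cannot express.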
Title: Rotten Tomatoes Movies and Critic Reviews Dataset
Contributor: Stefano Leone
Publication Date: 2020-10-31
URL: https://www.kaggle.com/datasets/stefanoleone992/rotten-tomatoes-movies-and-critic-reviews-dataset