# fuzzy-logic-movies/main.py

"""
!pip install scikit-learn
!pip install pandas
!pip install fastapi
!pip install "uvicorn[standard]"
!uvicorn main:app --reload
"""
import multiprocessing
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd
import pandas.core.series
from fastapi import FastAPI
from scipy.spatial.distance import cosine
from sklearn.preprocessing import MultiLabelBinarizer

from engine import fuzzy_system

# ASGI application object; every route handler in this module is registered on it.
app = FastAPI()
# Empty placeholder for the catalogue; rebound by the startup handler to the
# contents of processed_data.csv.
data = pd.DataFrame()
# Genre encoder shared by the module: fitted by the startup handler and used
# again by /details to turn a stored genre vector back into labels.
mlb = MultiLabelBinarizer()


def _cosine_similarity(u, v):
    """Return the cosine similarity of *u* and *v*, or 0.0 if either is all zeros."""
    # The cosine distance divides by the product of the two vector norms, so an
    # all-zero vector (no genre flags, no emotion scores) does not give a usable
    # number. Treat such a pair as having nothing in common instead.
    if not np.any(u) or not np.any(v):
        return 0.0
    return 1 - cosine(u, v)


def inference(first: pd.Series,
              second_id: str,
              release_year_param='similar',
              runtime_param='similar',
              seasons_param='similar',
              genres_param='same',
              emotions_param='same',
              df=None):
    """Score production *second_id* as a recommendation for *first*.

    Args:
        first: Row of the reference production. The fields read are
            ``release_year``, ``runtime``, ``seasons``, ``genres`` and
            ``emotions``.
        second_id: Index label of the candidate production.
        release_year_param, runtime_param, seasons_param, genres_param,
        emotions_param: Forwarded unchanged to ``fuzzy_system``.
        df: Table to look *second_id* up in. When ``None`` the module-level
            ``data`` table is used.

    Returns:
        A ``(second_id, score)`` tuple, where ``score`` is the
        ``'RECOMMENDATION'`` output of the fuzzy system.

    Raises:
        KeyError: If *second_id* is not an index label of the chosen table.
    """
    table = data if df is None else df
    second = table.loc[second_id]

    FS = fuzzy_system(release_year_param=release_year_param,
                      runtime_param=runtime_param,
                      seasons_param=seasons_param,
                      genres_param=genres_param,
                      emotions_param=emotions_param)

    # Numeric inputs are signed integer differences (first minus second).
    FS.set_variable('RELEASE_YEAR',
                    int(first['release_year'] - second['release_year']))
    FS.set_variable('RUNTIME', int(first['runtime'] - second['runtime']))

    # A missing season count on either side is treated as "no difference".
    if np.isnan(first['seasons']) or np.isnan(second['seasons']):
        FS.set_variable('SEASONS', 0)
    else:
        FS.set_variable('SEASONS', int(first['seasons'] - second['seasons']))

    # Vector inputs are cosine similarities (1 - cosine distance).
    FS.set_variable('GENRES',
                    _cosine_similarity(first['genres'], second['genres']))
    FS.set_variable('EMOTIONS',
                    _cosine_similarity(first['emotions'], second['emotions']))

    return second_id, FS.inference(['RECOMMENDATION'])['RECOMMENDATION']


def process_dataframe(df,
                      production,
                      release_year_param,
                      runtime_param,
                      seasons_param,
                      genres_param,
                      emotions_param
                      ):
    """Score every row of *df* against *production*.

    Args:
        df: Table whose rows are the candidates; each index label is passed to
            ``inference`` as a string, together with *df* itself as the lookup
            table.
        production: Row of the reference production, passed to ``inference``
            as its first argument.
        release_year_param, runtime_param, seasons_param, genres_param,
        emotions_param: Forwarded unchanged to ``inference``.

    Returns:
        A list with one ``inference`` result per row of *df*, in row order.
    """
    fuzzy_params = (release_year_param,
                    runtime_param,
                    seasons_param,
                    genres_param,
                    emotions_param)
    # Only the index labels are needed, so walk the index directly instead of
    # materialising a Series for every row with iterrows().
    return [inference(production, str(row_id), *fuzzy_params, df)
            for row_id in df.index]


@app.on_event('startup')
async def startup_event():
    """Load the catalogue and precompute the vectors used for scoring.

    Reads ``processed_data.csv`` (indexed by its ``id`` column) into the
    module-level ``data`` table, fits the module-level ``mlb`` on every genre
    label found in the file, replaces each row's genre list with its encoded
    vector, and adds an ``emotions`` column holding the five emotion scores of
    the row as a list.
    """
    global data
    # SECURITY NOTE(review): pd.eval evaluates each ``genres`` cell as an
    # expression. That is acceptable only while the CSV is a trusted, locally
    # generated file; consider ast.literal_eval if the file can come from
    # anywhere else.
    data = pd.read_csv('processed_data.csv', index_col='id', converters={'genres': pd.eval})
    # explode() yields NaN for an empty genre list; drop it so NaN is not
    # fitted as if it were a genre label.
    all_genres = data.genres.explode().dropna().unique()
    mlb.fit([all_genres])
    data['genres'] = data['genres'].apply(lambda x: mlb.transform([x])[0])
    data['emotions'] = data[['Happy', 'Angry', 'Surprise', 'Sad', 'Fear']].values.tolist()


@app.get('/find/{title}')
def titles(title: str):
    """Find productions whose title contains *title*, ignoring case.

    Args:
        title: Substring to look for; it is matched literally, not as a
            regular expression.

    Returns:
        A list of ``{'id', 'title', 'year'}`` dicts, one per matching row, in
        table order. Rows without a title never match.
    """
    needle = title.lower()
    # Lower-case both sides and do a literal containment test, exactly like
    # ``needle in row_title.lower()``; na=False makes a missing title a
    # non-match instead of an AttributeError.
    mask = data['title'].str.lower().str.contains(needle, regex=False, na=False)
    matches = data.loc[mask, ['title', 'release_year']]
    return [
        {'id': production_id, 'title': production_title, 'year': year}
        for production_id, production_title, year in matches.itertuples(name=None)
    ]


@app.get('/details/{production_id}')
def details(production_id: str):
    """Return the descriptive fields of one production.

    Args:
        production_id: Index label of the production in ``data``.

    Returns:
        A dict with ``title``, ``type``, ``description``, ``year``,
        ``runtime`` and ``genres`` (the decoded genre labels), or
        ``{'error': ...}`` when *production_id* is not in the table.
    """
    try:
        production = data.loc[production_id]
    except KeyError:
        # Only an unknown id is reported this way; any other failure should
        # surface instead of being disguised as "not a valid id".
        return {'error': f'{production_id} is not a valid id'}
    # The stored genres are an encoded vector. inverse_transform works on a
    # 2-D array and returns one tuple of labels per row, hence the reshape
    # and the [0].
    genre_labels = mlb.inverse_transform(production['genres'].reshape(1, -1))[0]
    return {
        'title': production['title'],
        'type': production['type'],
        'description': production['description'],
        'year': int(production['release_year']),
        'runtime': int(production['runtime']),
        'genres': genre_labels,
    }


@app.get('/score/{first_id}/{second_id}')
def rec_score(first_id: str, second_id: str):
    """Score *second_id* as a recommendation for *first_id*.

    Both ids are validated against ``data`` in order (first, then second);
    the first unknown one is reported as ``{'error': ...}``. Otherwise the
    result of ``inference`` with its default parameters is returned.
    """
    rows = {}
    for candidate_id in (first_id, second_id):
        try:
            rows[candidate_id] = data.loc[candidate_id]
        except KeyError:
            return {'error': f'{candidate_id} is not a valid id'}

    # The second row was looked up only to validate its id; inference fetches
    # it again by label.
    return inference(rows[first_id], second_id)


@app.get('/recs/{production_id}')
async def recs(production_id: str,
               release_year_param: str | None = 'similar',
               runtime_param: str | None = 'similar',
               seasons_param: str | None = 'similar',
               genres_param: str | None = 'same',
               emotions_param: str | None = 'same',
               count: int | None = 5):
    """Return the ids of the best-scoring recommendations for a production.

    The catalogue is split into one chunk per CPU and every chunk is scored
    against the requested production in a worker process.

    Args:
        production_id: Index label of the reference production in ``data``.
        release_year_param, runtime_param, seasons_param, genres_param,
        emotions_param: Forwarded unchanged to ``process_dataframe``.
        count: Number of ids to return.

    Returns:
        ``{'id': [...]}`` with the highest-scoring ids, best first and never
        including *production_id* itself, or ``{'error': ...}`` when
        *production_id* is not in the table.
    """
    try:
        first = data.loc[production_id]
    except KeyError:
        return {'error': f'{production_id} is not a valid id'}

    time_start = time.time()
    cpus = multiprocessing.cpu_count()
    # Split row positions rather than the DataFrame itself, then slice with
    # iloc; the chunks are the same contiguous row ranges.
    chunks = np.array_split(np.arange(len(data)), cpus)

    # NOTE(review): collecting the results blocks this coroutine, and with it
    # the event loop, until every worker has finished.
    scores = []
    # The context manager tears the workers down on exit, so every result is
    # collected inside the block; previously the pool was never closed and
    # each request left its worker processes behind.
    with Pool(cpus) as pool:
        pending = [pool.apply_async(process_dataframe,
                                    [data.iloc[chunk],
                                     first,
                                     release_year_param,
                                     runtime_param,
                                     seasons_param,
                                     genres_param,
                                     emotions_param]) for chunk in chunks]
        for result in pending:
            # get() waits for the chunk and re-raises any worker exception.
            scores.extend(result.get())
    print(f'time elapsed = {time.time() - time_start}')

    # Drop the production itself before ranking. Removing it after slicing
    # raised ValueError whenever it was not among the top ``count + 1``.
    ranked = sorted((pair for pair in scores if pair[0] != production_id),
                    key=lambda pair: pair[1],
                    reverse=True)
    return {
        'id': [candidate_id for candidate_id, _ in ranked[:count]]
    }