"""
Recommendation API built on the inference system ``FS`` imported from ``engine``.

Setup and launch (notebook-style shell commands):

!pip install scikit-learn
!pip install pandas
!pip install fastapi
!pip install "uvicorn[standard]"
!uvicorn main:app --reload
"""
import multiprocessing
import time
from multiprocessing import Pool

import numpy as np
import pandas as pd
import pandas.core.series
from fastapi import FastAPI
from scipy.spatial.distance import cosine
from sklearn.preprocessing import MultiLabelBinarizer

from engine import FS

app = FastAPI()

# Catalogue of productions indexed by id; replaced with the real data at startup.
data = pd.DataFrame()


def inference(first: pandas.core.series.Series, second_id: str, df=None):
    """Score how strongly ``second_id`` is recommended given ``first``.

    Args:
        first: Row of the reference production. Must provide
            ``release_year``, ``runtime``, ``seasons``, ``genres`` and
            ``emotions``.
        second_id: Index label of the production to compare against.
        df: Frame in which ``second_id`` is looked up. Falls back to the
            module-level ``data`` when omitted; worker processes pass their
            own chunk explicitly.

    Returns:
        A ``(second_id, value)`` tuple where ``value`` is the
        ``'RECOMMENDATION'`` entry produced by ``FS.inference``.

    Raises:
        KeyError: If ``second_id`` is not present in the lookup frame.
    """
    source = data if df is None else df
    second = source.loc[second_id]

    FS.set_variable('RELEASE_YEAR', int(first['release_year'] - second['release_year']))
    FS.set_variable('RUNTIME', int(first['runtime'] - second['runtime']))

    # A missing season count on either side is treated as "no difference".
    if np.isnan(first['seasons']) or np.isnan(second['seasons']):
        FS.set_variable('SEASONS', 0)
    else:
        FS.set_variable('SEASONS', int(first['seasons'] - second['seasons']))

    # scipy's ``cosine`` is a distance, so ``1 - distance`` is the similarity.
    # NOTE(review): the result may be NaN when either vector is all zeros --
    # confirm how FS handles that.
    genre_similarity = 1 - cosine(first['genres'], second['genres'])
    FS.set_variable('GENRES', genre_similarity)
    emotion_similarity = 1 - cosine(first['emotions'], second['emotions'])
    FS.set_variable('EMOTIONS', emotion_similarity)

    return second_id, FS.inference(['RECOMMENDATION'])['RECOMMENDATION']


def process_dataframe(df, production):
    """Score every row of ``df`` against ``production``.

    Args:
        df: Chunk of the catalogue to score.
        production: Row of the reference production.

    Returns:
        A list of ``(id, value)`` tuples, one per row of ``df``, in index
        order. Ids are converted with ``str``.
    """
    # Only the labels are needed; ``iterrows`` would build an unused Series
    # for every row.
    return [inference(production, str(index), df) for index in df.index]


@app.on_event('startup')
async def startup_event():
    """Load ``processed_data.csv`` into ``data`` and build the feature vectors.

    ``genres`` is turned into a multi-hot vector over all genres found in the
    file, and ``emotions`` is assembled from the five emotion columns.
    """
    global data
    # NOTE(security): ``pd.eval`` evaluates the cell text as an expression.
    # Acceptable only while the CSV is trusted; do not point this at
    # user-supplied files.
    data = pd.read_csv('processed_data.csv', index_col='id', converters={'genres': pd.eval})

    # ``explode`` yields NaN for empty lists; dropping it keeps the label set
    # homogeneous so the binarizer can sort its classes.
    all_genres = data.genres.explode().dropna().unique()
    mlb = MultiLabelBinarizer()
    mlb.fit([all_genres])
    data['genres'] = data['genres'].apply(lambda x: mlb.transform([x])[0])
    data['emotions'] = data[['Happy', 'Angry', 'Surprise', 'Sad', 'Fear']].values.tolist()


@app.get('/score/{first_id}/{second_id}')
def rec_score(first_id: str, second_id: str):
    """Return the recommendation score of ``second_id`` relative to ``first_id``.

    Returns:
        The ``(second_id, value)`` tuple from ``inference``, or an
        ``{'error': ...}`` dict naming the first id that is not in ``data``.
    """
    try:
        first = data.loc[first_id]
    except KeyError:
        return {'error': f'{first_id} is not a valid id'}

    # Validate the second id up front so the caller gets an error payload
    # rather than an unhandled KeyError from ``inference``.
    try:
        data.loc[second_id]
    except KeyError:
        return {'error': f'{second_id} is not a valid id'}

    return inference(first, second_id)


@app.get('/recs/{production_id}')
async def recs(production_id: str, count: int | None = 5):
    """Return the ids of the productions most recommended for ``production_id``.

    The catalogue is split into one chunk per CPU and scored in a process
    pool.

    Args:
        production_id: Id of the reference production.
        count: Maximum number of ids to return. ``None`` means the default
            of 5; negative values are treated as 0.

    Returns:
        Up to ``count`` ids ordered by descending score, never including
        ``production_id`` itself, or an ``{'error': ...}`` dict when the id
        is unknown.
    """
    try:
        first = data.loc[production_id]
    except KeyError:
        return {'error': f'{production_id} is not a valid id'}

    limit = 5 if count is None else max(count, 0)

    time_start = time.time()
    cpus = multiprocessing.cpu_count()
    # Split row positions rather than the frame itself: ``np.array_split`` on
    # a DataFrame goes through the deprecated ``DataFrame.swapaxes``.
    chunks = [data.iloc[rows] for rows in np.array_split(np.arange(len(data)), cpus)]

    # NOTE(review): this blocks the event loop while the workers run.
    # The context manager shuts the workers down even if a task raises, so
    # requests no longer leak processes.
    with Pool(cpus) as pool:
        pending = [pool.apply_async(process_dataframe, (chunk, first)) for chunk in chunks]
        # ``get`` blocks until the task is done, so no separate ``wait`` pass.
        scores = [pair for job in pending for pair in job.get()]
    print(f'time elapsed = {time.time() - time_start}')

    ranked = sorted(scores, key=lambda pair: pair[1], reverse=True)
    # Drop the queried production before slicing: it is not guaranteed to be
    # among the top entries, so removing it afterwards could raise or return
    # the wrong number of results.
    return [pid for pid, _ in ranked if pid != production_id][:limit]