fuzzy-logic-movies/main.py

137 lines
4.0 KiB
Python
Raw Normal View History

2023-01-07 15:21:05 +01:00
"""
!pip install scikit-learn
!pip install pandas
!pip install fastapi
!pip install "uvicorn[standard]"
!uvicorn main:app --reload
"""
2023-01-09 19:50:09 +01:00
import multiprocessing
import time
from multiprocessing import Pool
2023-01-07 15:21:05 +01:00
import numpy as np
import pandas as pd
2023-01-09 19:50:09 +01:00
import pandas.core.series
2023-01-07 15:21:05 +01:00
from fastapi import FastAPI
from scipy.spatial.distance import cosine
from sklearn.preprocessing import MultiLabelBinarizer
2023-01-25 14:39:25 +01:00
from engine import fuzzy_system
2023-01-07 15:21:05 +01:00
app = FastAPI()
data = pd.DataFrame()
2023-01-13 15:20:42 +01:00
mlb = MultiLabelBinarizer()
2023-01-07 15:21:05 +01:00
2023-01-09 19:50:09 +01:00
def inference(first: pandas.core.series.Series, second_id: str, df=None):
    """Score how strongly the production ``second_id`` is recommended for ``first``.

    The differences in release year, runtime and seasons, plus the cosine
    similarities of the genre and emotion vectors, are fed into the fuzzy
    system built by ``fuzzy_system`` and its ``RECOMMENDATION`` output is
    returned.

    Args:
        first: Row of the reference production.
        second_id: Index label of the production to compare against.
        df: Optional DataFrame to look ``second_id`` up in. When ``None``
            the module-level ``data`` frame is used.

    Returns:
        A ``(second_id, recommendation)`` tuple.

    Raises:
        KeyError: If ``second_id`` is not present in the frame being searched.
    """
    source = df if df is not None else data
    second = source.loc[second_id]

    FS = fuzzy_system(
        release_year_param='similar',
        runtime_param='similar',
        seasons_param='similar',
        genres_param='same',
        emotions_param='same',
    )

    FS.set_variable('RELEASE_YEAR', int(first['release_year'] - second['release_year']))
    FS.set_variable('RUNTIME', int(first['runtime'] - second['runtime']))

    # A missing season count on either side is treated as "no difference".
    if np.isnan(first['seasons']) or np.isnan(second['seasons']):
        FS.set_variable('SEASONS', 0)
    else:
        FS.set_variable('SEASONS', int(first['seasons'] - second['seasons']))

    for variable, column in (('GENRES', 'genres'), ('EMOTIONS', 'emotions')):
        similarity = 1 - cosine(first[column], second[column])
        # Cosine distance is undefined when either vector has zero norm
        # (e.g. all emotion scores are 0) and can come back as NaN. A NaN
        # would poison the fuzzy output and any later sorting, so treat the
        # pair as having no similarity instead.
        if np.isnan(similarity):
            similarity = 0.0
        FS.set_variable(variable, similarity)

    return second_id, FS.inference(['RECOMMENDATION'])['RECOMMENDATION']
2023-01-07 22:35:00 +01:00
2023-01-09 19:50:09 +01:00
def process_dataframe(df, production):
    """Score every row of ``df`` against ``production``.

    Args:
        df: DataFrame chunk whose index labels identify the productions to
            score. It is passed through to ``inference`` so each lookup is
            done in this chunk rather than in the module-level ``data``.
        production: Row of the reference production.

    Returns:
        A list with one ``inference`` result per row of ``df``, in row order.
    """
    # Only the index labels are needed; iterating ``df.index`` avoids building
    # a throwaway Series for every row the way ``iterrows()`` does.
    return [inference(production, str(label), df) for label in df.index]
2023-01-07 22:35:00 +01:00
2023-01-07 15:21:05 +01:00
@app.on_event('startup')
async def startup_event():
    """Load the dataset and prepare the feature columns when the app starts.

    Reads ``processed_data.csv`` into the module-level ``data`` frame
    (indexed by ``id``), fits the module-level ``mlb`` binarizer on every
    genre found in the file, replaces the ``genres`` column with the
    binarized vectors and adds an ``emotions`` column holding the five
    emotion scores of each row as a list.
    """
    global data, mlb

    # NOTE(review): ``pd.eval`` evaluates the raw text of each ``genres``
    # cell. That is only acceptable while the CSV is a trusted, locally
    # produced file.
    data = pd.read_csv(
        'processed_data.csv',
        index_col='id',
        converters={'genres': pd.eval},
    )

    genre_vocabulary = data['genres'].explode().unique()
    mlb.fit([genre_vocabulary])

    def encode_genres(genre_list):
        # ``transform`` works on a batch, so wrap the single row and unwrap it.
        return mlb.transform([genre_list])[0]

    data['genres'] = data['genres'].apply(encode_genres)

    emotion_columns = ['Happy', 'Angry', 'Surprise', 'Sad', 'Fear']
    data['emotions'] = data[emotion_columns].values.tolist()
2023-01-13 15:20:42 +01:00
@app.get('/find/{title}')
def titles(title: str):
    """Find productions whose title contains ``title``, ignoring case.

    Args:
        title: Substring to look for in the ``title`` column of ``data``.

    Returns:
        A list of ``{'id', 'title', 'year'}`` dicts, one per matching row,
        in the order the rows appear in ``data``.
    """
    needle = title.lower()
    return [
        {'id': prod_id, 'title': record['title'], 'year': record['release_year']}
        for prod_id, record in data.iterrows()
        if needle in record['title'].lower()
    ]
@app.get('/details/{production_id}')
def details(production_id: str):
    """Return the display details of a single production.

    Args:
        production_id: Index label of the production in ``data``.

    Returns:
        A dict with ``title``, ``type``, ``description``, ``year``,
        ``runtime`` and ``genres``, or ``{'error': ...}`` when
        ``production_id`` is not in ``data``.
    """
    try:
        production = data.loc[production_id]
    except KeyError:
        # Only an unknown id is an expected failure here. A bare ``except``
        # would also hide genuine bugs and swallow KeyboardInterrupt.
        return {'error': f'{production_id} is not a valid id'}

    # The ``genres`` column holds the binarized vector; ``inverse_transform``
    # works on a 2-D batch, so wrap the row and take the single result.
    genres = production['genres']
    genres = mlb.inverse_transform(genres.reshape(1, -1))[0]

    return {
        'title': production['title'],
        'type': production['type'],
        'description': production['description'],
        'year': int(production['release_year']),
        'runtime': int(production['runtime']),
        'genres': genres,
    }
2023-01-07 15:21:05 +01:00
@app.get('/score/{first_id}/{second_id}')
def rec_score(first_id: str, second_id: str):
    """Return the recommendation score of ``second_id`` relative to ``first_id``.

    Args:
        first_id: Index label of the reference production in ``data``.
        second_id: Index label of the production being scored.

    Returns:
        The result of ``inference`` for the pair, or ``{'error': ...}``
        naming the first id that is not present in ``data``.
    """
    rows = {}
    # Validate both ids in order so the error always names the first bad one.
    for key in (first_id, second_id):
        try:
            rows[key] = data.loc[key]
        except KeyError:
            return {'error': f'{key} is not a valid id'}

    return inference(rows[first_id], second_id)
2023-01-07 15:21:05 +01:00
2023-01-07 22:35:00 +01:00
@app.get('/recs/{production_id}')
async def recs(production_id: str, count: int | None = 5):
    """Return the ids of the productions scored highest against ``production_id``.

    Every row of ``data`` is scored with ``process_dataframe`` in a pool of
    worker processes, one chunk of rows per CPU.

    Args:
        production_id: Index label of the reference production in ``data``.
        count: Maximum number of ids to return. ``None`` falls back to 5 and
            negative values are treated as 0.

    Returns:
        ``{'id': [...]}`` with at most ``count`` ids ordered by descending
        score and never containing ``production_id`` itself, or
        ``{'error': ...}`` when ``production_id`` is not in ``data``.
    """
    try:
        first = data.loc[production_id]
    except KeyError:
        return {'error': f'{production_id} is not a valid id'}

    limit = 5 if count is None else max(count, 0)

    time_start = time.time()
    cpus = multiprocessing.cpu_count()

    # Split by row position and slice with ``iloc``. Calling
    # ``np.array_split`` directly on a DataFrame goes through the deprecated
    # ``DataFrame.swapaxes``. The chunk boundaries are the same either way.
    row_chunks = np.array_split(np.arange(len(data)), cpus)
    df_list = [data.iloc[rows] for rows in row_chunks]

    # NOTE(review): ``get()`` blocks inside a coroutine, so the event loop is
    # stalled while the workers run. Consider a plain ``def`` endpoint.
    scores = []
    # The context manager tears the workers down when the block exits;
    # without it every request leaks ``cpus`` processes. All results are
    # collected before leaving the block.
    with Pool(cpus) as pool:
        pending = [pool.apply_async(process_dataframe, [df, first]) for df in df_list]
        for result in pending:
            scores += result.get()
    print(f'time elapsed = {time.time() - time_start}')

    ranked = sorted(scores, key=lambda pair: pair[1], reverse=True)
    # Drop the production itself before truncating. Removing it afterwards
    # raised ValueError whenever it was not among the top results.
    top_ids = [candidate for candidate, _ in ranked if candidate != production_id][:limit]
    return {
        'id': top_ids
    }