multiprocessing

2023-01-09 19:50:09 +01:00 · 2023-01-09 19:50:09 +01:00 · 4ad7aa62e9
commit 4ad7aa62e9
parent 38578e007f
2 changed files with 31 additions and 14 deletions
--- a/engine.py
+++ b/engine.py
@@ -1,6 +1,6 @@
 from simpful import *
-FS = FuzzySystem()
+FS = FuzzySystem(show_banner=False)
 # Define fuzzy sets for the variable
 # RELEASE_YEAR
--- a/main.py
+++ b/main.py
@@ -5,9 +5,13 @@
 !pip install "uvicorn[standard]"
 !uvicorn main:app --reload
 """
 import multiprocessing
 import time
 from multiprocessing import Pool
 import numpy as np
 import pandas as pd
 import pandas.core.series
 from fastapi import FastAPI
 from scipy.spatial.distance import cosine
 from sklearn.preprocessing import MultiLabelBinarizer
@@ -18,9 +22,11 @@ app = FastAPI()
 data = pd.DataFrame()
-def inference(first_id: str, second_id: str):
+def inference(first: pandas.core.series.Series, second_id: str, df=None):
-    first = data.loc[first_id]
+    if df is not None:
-    second = data.loc[second_id]
+        second = df.loc[second_id]
    else:
        second = data.loc[second_id]
    year_diff = int(first['release_year'] - second['release_year'])
    FS.set_variable('RELEASE_YEAR', year_diff)
@@ -40,9 +46,15 @@ def inference(first_id: str, second_id: str):
    emotion_diff = 1 - cosine(first['emotions'], second['emotions'])
    FS.set_variable('EMOTIONS', emotion_diff)
-    return FS.inference(['RECOMMENDATION'])
+    return second_id, FS.inference(['RECOMMENDATION'])['RECOMMENDATION']
 def process_dataframe(df, production):
    scores = []
    for index, row in df.iterrows():
        scores.append(inference(production, str(index), df))
    return scores
@app.on_event('startup')
 async def startup_event():
@@ -66,7 +78,7 @@ def rec_score(first_id: str, second_id: str):
    except KeyError:
        return {'error': f'{second_id} is not a valid id'}
-    return inference(first_id, second_id)
+    return inference(first, second_id)
@app.get('/recs/{production_id}')
@@ -77,12 +89,17 @@ async def recs(production_id: str, count: int | None):
        return {'error': f'{production_id} is not a valid id'}
    scores = []
    time_start = time.time()
    cpus = multiprocessing.cpu_count()
    df_list = np.array_split(data, cpus)
    pool = Pool(cpus)
    results = [pool.apply_async(process_dataframe, [df, first]) for df in df_list]
-    for index, row in data.iterrows():
+    for r in results:
-        if str(index) == production_id:
+        r.wait()
-            continue
+    for r in results:
-        scores.append((index, inference(production_id, str(index))['RECOMMENDATION']))
+        scores += r.get()
-
+    print(f'time elapsed = {time.time() - time_start}')
-    scores = [idx[0] for idx in sorted(scores, key=lambda x: x[1], reverse=True)[:count]]
+    scores = [idx[0] for idx in sorted(scores, key=lambda x: x[1], reverse=True)[:count+1]]
-
+    scores.remove(production_id)
-    return list(scores)
+    return scores