From 04c9ce7f6e32f71d5f201591dd084b8ec8c5def9 Mon Sep 17 00:00:00 2001 From: Kacper Date: Fri, 27 Jan 2023 18:43:32 +0100 Subject: [PATCH] move data to data dir --- games.csv => data/games.csv | 0 games_processed.csv => data/games_processed.csv | 0 .../games_processed_vectorized.csv | Bin main.py | 2 +- process_dataset.py | 8 ++++---- 5 files changed, 5 insertions(+), 5 deletions(-) rename games.csv => data/games.csv (100%) rename games_processed.csv => data/games_processed.csv (100%) rename games_processed_vectorized.csv => data/games_processed_vectorized.csv (100%) diff --git a/games.csv b/data/games.csv similarity index 100% rename from games.csv rename to data/games.csv diff --git a/games_processed.csv b/data/games_processed.csv similarity index 100% rename from games_processed.csv rename to data/games_processed.csv diff --git a/games_processed_vectorized.csv b/data/games_processed_vectorized.csv similarity index 100% rename from games_processed_vectorized.csv rename to data/games_processed_vectorized.csv diff --git a/main.py b/main.py index 847b2dc..4f3901f 100644 --- a/main.py +++ b/main.py @@ -40,7 +40,7 @@ def compare_games(title_1: str, title_2: str, df: pd.DataFrame, show_graph: bool if __name__ == '__main__': - df = pd.read_pickle('games_processed_vectorized.csv') + df = pd.read_pickle('data/games_processed_vectorized.csv') while True: title_1 = input("Enter title 1: ") diff --git a/process_dataset.py b/process_dataset.py index e3c31c9..f36426e 100644 --- a/process_dataset.py +++ b/process_dataset.py @@ -34,7 +34,7 @@ def replace_with_vector(row, w2v): if __name__ == '__main__': - df = pd.read_csv('games.csv') + df = pd.read_csv('data/games.csv') df['positive_percentage'] = df.apply( lambda row: calculate_positive_percentage(row.positive_ratings, row.negative_ratings), axis=1) @@ -46,16 +46,16 @@ if __name__ == '__main__': df['all_categorical'] = df['categories'] + ';' + df['genres'] + ';' + df['steamspy_tags'] df['all_categorical'] = df['all_categorical'].map(lambda row: row.strip().replace(' ', ';').lower()) df['all_categorical'] = df['all_categorical'].apply(lambda row: replace(row)) - df.to_csv('games_processed.csv', index=False, encoding='utf-8') + df.to_csv('data/games_processed.csv', index=False, encoding='utf-8') try: w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) - df2 = pd.read_csv('games_processed.csv') + df2 = pd.read_csv('data/games_processed.csv') df2['temp'] = df2['categories'] + ';' + df2['genres'] + ';' + df2['steamspy_tags'] df2['temp'] = df2['temp'].map(lambda row: row.strip().replace(' ', ';').lower()) df2['all_categorical_vector'] = df2['temp'].apply(lambda row: replace_with_vector(row, w2v)) df2.drop('temp', inplace=True, axis=1) - df2.to_pickle('games_processed_vectorized.csv') + df2.to_pickle('data/games_processed_vectorized.csv') except: print('A local copy of GoogleNews-vectors-negative300.bin was not found. The file can be downloaded from ' 'https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300. Finishing without vectorization')