move data to data dir

This commit is contained in:
Kacper 2023-01-27 18:43:32 +01:00
parent 6aed792d44
commit 04c9ce7f6e
5 changed files with 5 additions and 5 deletions

View File

Can't render this file because it is too large.

View File

Can't render this file because it is too large.

View File

Can't render this file because it is too large.

View File

@@ -40,7 +40,7 @@ def compare_games(title_1: str, title_2: str, df: pd.DataFrame, show_graph: bool
if __name__ == '__main__': if __name__ == '__main__':
df = pd.read_pickle('games_processed_vectorized.csv') df = pd.read_pickle('data/games_processed_vectorized.csv')
while True: while True:
title_1 = input("Enter title 1: ") title_1 = input("Enter title 1: ")

View File

@@ -34,7 +34,7 @@ def replace_with_vector(row, w2v):
if __name__ == '__main__': if __name__ == '__main__':
df = pd.read_csv('games.csv') df = pd.read_csv('data/games.csv')
df['positive_percentage'] = df.apply( df['positive_percentage'] = df.apply(
lambda row: calculate_positive_percentage(row.positive_ratings, row.negative_ratings), axis=1) lambda row: calculate_positive_percentage(row.positive_ratings, row.negative_ratings), axis=1)
@@ -46,16 +46,16 @@ if __name__ == '__main__':
df['all_categorical'] = df['categories'] + ';' + df['genres'] + ';' + df['steamspy_tags'] df['all_categorical'] = df['categories'] + ';' + df['genres'] + ';' + df['steamspy_tags']
df['all_categorical'] = df['all_categorical'].map(lambda row: row.strip().replace(' ', ';').lower()) df['all_categorical'] = df['all_categorical'].map(lambda row: row.strip().replace(' ', ';').lower())
df['all_categorical'] = df['all_categorical'].apply(lambda row: replace(row)) df['all_categorical'] = df['all_categorical'].apply(lambda row: replace(row))
df.to_csv('games_processed.csv', index=False, encoding='utf-8') df.to_csv('data/games_processed.csv', index=False, encoding='utf-8')
try: try:
w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', w2v = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',
binary=True) binary=True)
df2 = pd.read_csv('games_processed.csv') df2 = pd.read_csv('data/games_processed.csv')
df2['temp'] = df2['categories'] + ';' + df2['genres'] + ';' + df2['steamspy_tags'] df2['temp'] = df2['categories'] + ';' + df2['genres'] + ';' + df2['steamspy_tags']
df2['temp'] = df2['temp'].map(lambda row: row.strip().replace(' ', ';').lower()) df2['temp'] = df2['temp'].map(lambda row: row.strip().replace(' ', ';').lower())
df2['all_categorical_vector'] = df2['temp'].apply(lambda row: replace_with_vector(row, w2v)) df2['all_categorical_vector'] = df2['temp'].apply(lambda row: replace_with_vector(row, w2v))
df2.drop('temp', inplace=True, axis=1) df2.drop('temp', inplace=True, axis=1)
df2.to_pickle('games_processed_vectorized.csv') df2.to_pickle('data/games_processed_vectorized.csv')
except: except:
print('A local copy of GoogleNews-vectors-negative300.bin was not found. The file can be downloaded from ' print('A local copy of GoogleNews-vectors-negative300.bin was not found. The file can be downloaded from '
'https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300. Finishing without vectorization') 'https://www.kaggle.com/datasets/leadbest/googlenewsvectorsnegative300. Finishing without vectorization')