WSS-project/P0. Data preparation.ipynb
2021-03-20 20:13:28 +01:00

71 KiB
Raw Blame History

Building train and test sets

import pandas as pd
import numpy as np
import scipy.sparse as sparse
import time
import random
import evaluation_measures as ev
import matplotlib
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split

import helpers

# Make sure the dataset directory exists before anything is written into it.
os.makedirs('./Datasets/', exist_ok=True)

# Project helper; presumably fetches and unpacks ml-100k under ./Datasets/
# (the next line reads from there) - confirm in helpers.py.
helpers.download_movielens_100k_dataset()

# u.data is tab-separated and has no header row, so the column names are
# supplied explicitly while reading.
df = pd.read_csv(
    './Datasets/ml-100k/u.data',
    sep='\t',
    header=None,
    names=['user', 'item', 'rating', 'timestamp'],
)

# Hold out 20% of the interactions for testing; the fixed random_state keeps
# the split reproducible between runs.
train, test = train_test_split(df, random_state=30, test_size=0.2)

# Persist both splits as tab-separated files without a header row or index.
train.to_csv('./Datasets/ml-100k/train.csv', index=False, header=None, sep='\t')
test.to_csv('./Datasets/ml-100k/test.csv', index=False, header=None, sep='\t')

Interactions properties

What does the data look like?

# Preview the first five interaction rows.
df.head(5)
user item rating timestamp
0 196 242 3 881250949
1 186 302 3 891717742
2 22 377 1 878887116
3 244 51 2 880606923
4 166 346 1 886397596

Sample properties

# Basic size statistics: distinct users, distinct items, total rating count.
users, items, ratings = df['user'].nunique(), df['item'].nunique(), len(df)

print('We have {} users, {} items and {} ratings.\n'.format(users, items, ratings))

print('Average number of ratings per user is {}. \n'.format(round(ratings/users, 2)))
print('Average number of ratings per item is {}.\n'.format(round(ratings/items, 4)))

# Density is the share of (user, item) pairs that actually have a rating.
# Sparsity - the share of MISSING entries that the message reports - is its
# complement, so it must be computed as 1 - density (previously the density
# itself was printed under the "sparsity" label).
density = ratings / (users * items)
print('Data sparsity (% of missing entries) is {}%.'.format(round(100 * (1 - density), 4)))
We have 943 users, 1682 items and 100000 ratings.

Average number of ratings per user is 106.04. 

Average number of ratings per item is 59.453.

Data sparsity (% of missing entries) is 6.3047%.
# Number of ratings given by each user.
items_per_user = df.groupby('user')['rating'].count()

plt.figure(figsize=(16, 8))
plt.hist(items_per_user, bins=100)

# Reference lines: (statistic value, x-offset factor for the label,
# fraction of the y-axis top for the label, label template).
# The median label sits slightly lower so it does not collide with the
# quantile labels.
reference_lines = [
    (items_per_user.median(), 1.1, 0.9, 'Median: {:.0f}'),
    (items_per_user.quantile(0.25), 1.1, 0.95, '25% quantile: {:.0f}'),
    (items_per_user.quantile(0.75), 1.05, 0.95, '75% quantile: {:.0f}'),
]
for t, x_factor, y_factor, template in reference_lines:
    # Dashed vertical line at the statistic, with its label just to the right.
    plt.axvline(t, color='k', linestyle='dashed', linewidth=1)
    plt.text(t * x_factor, plt.ylim()[1] * y_factor, template.format(t))

plt.title('Number of ratings per user', fontsize=30)
plt.show()
# Number of ratings received by each item.
# NOTE(review): the variable keeps the name `items_per_user`, but here it is
# grouped by 'item', i.e. it holds ratings-per-item counts; consider renaming.
items_per_user = df.groupby('item')['rating'].count()

plt.figure(figsize=(16, 8))
plt.hist(items_per_user, bins=100)

# Reference lines: (statistic value, x-offset factor for the label,
# fraction of the y-axis top for the label, label template).
# The median label sits slightly lower so it does not collide with the
# quantile labels.
reference_lines = [
    (items_per_user.median(), 1.1, 0.9, 'Median: {:.0f}'),
    (items_per_user.quantile(0.25), 1.1, 0.95, '25% quantile: {:.0f}'),
    (items_per_user.quantile(0.75), 1.05, 0.95, '75% quantile: {:.0f}'),
]
for t, x_factor, y_factor, template in reference_lines:
    # Dashed vertical line at the statistic, with its label just to the right.
    plt.axvline(t, color='k', linestyle='dashed', linewidth=1)
    plt.text(t * x_factor, plt.ylim()[1] * y_factor, template.format(t))

plt.title('Number of ratings per item', fontsize=30)
plt.show()
# Share of all ratings that falls on each rating value.
df.groupby('rating')['user'].count() / len(df)
rating
1    0.06110
2    0.11370
3    0.27145
4    0.34174
5    0.21201
Name: user, dtype: float64

Item attributes

# u.genre is pipe-separated with no header row: column 0 is read as the genre
# name and column 1 as its numeric id.
genre_table = pd.read_csv(
    './Datasets/ml-100k/u.genre',
    sep='|',
    header=None,
    encoding='latin-1',
)

# Build the lookup id -> genre name.
genres = {
    genre_id: genre_name
    for genre_name, genre_id in zip(genre_table[0], genre_table[1])
}
genres
{0: 'unknown',
 1: 'Action',
 2: 'Adventure',
 3: 'Animation',
 4: "Children's",
 5: 'Comedy',
 6: 'Crime',
 7: 'Documentary',
 8: 'Drama',
 9: 'Fantasy',
 10: 'Film-Noir',
 11: 'Horror',
 12: 'Musical',
 13: 'Mystery',
 14: 'Romance',
 15: 'Sci-Fi',
 16: 'Thriller',
 17: 'War',
 18: 'Western'}
# u.item is pipe-separated with no header row, so the columns get integer
# labels starting at 0.
movies = pd.read_csv(
    './Datasets/ml-100k/u.item',
    sep='|',
    header=None,
    encoding='latin-1',
)
movies.head(3)
0 1 2 3 4 5 6 7 8 9 ... 14 15 16 17 18 19 20 21 22 23
0 1 Toy Story (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Toy%20Story%2... 0 0 0 1 1 ... 0 0 0 0 0 0 0 0 0 0
1 2 GoldenEye (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?GoldenEye%20(... 0 1 1 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 3 Four Rooms (1995) 01-Jan-1995 NaN http://us.imdb.com/M/title-exact?Four%20Rooms%... 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0

3 rows × 24 columns

# Columns 5..23 hold one 0/1 flag per genre id 0..18. Replace every set flag
# with the genre's name and every unset flag with an empty string.
for genre_id in range(19):
    flag_column = genre_id + 5
    movies[flag_column] = movies[flag_column].apply(
        lambda flag: genres[genre_id] if flag == 1 else ''
    )

# Join the non-empty genre names of each movie into one comma-separated string.
genre_cells = movies.iloc[:, 5:].to_numpy()
movies['genre'] = [
    ', '.join(name for name in row if name != '')
    for row in genre_cells
]

# Keep only id, title and the combined genre text, then save and preview.
movies = movies[[0, 1, 'genre']].rename(
    columns={0: 'id', 1: 'title', 'genre': 'genres'}
)
movies.to_csv('./Datasets/ml-100k/movies.csv', index=False)
movies.head(5)
id title genres
0 1 Toy Story (1995) Animation, Children's, Comedy
1 2 GoldenEye (1995) Action, Adventure, Thriller
2 3 Four Rooms (1995) Thriller
3 4 Get Shorty (1995) Action, Comedy, Drama
4 5 Copycat (1995) Crime, Drama, Thriller

Toy example

import os
# Create the output directory for the toy dataset (no error if it exists).
os.makedirs('./Datasets/toy-example/', exist_ok=True)

# Tiny hand-written interaction set. Rows presumably follow the same
# (user, item, rating, timestamp) layout as the ml-100k files, since they are
# written in the same header-less, tab-separated format - confirm with readers.
toy_train_rows = [
    (0, 0, 3, 0), (0, 10, 4, 0), (0, 40, 5, 0), (0, 70, 4, 0),
    (10, 10, 1, 0), (10, 20, 2, 0), (10, 30, 3, 0),
    (20, 30, 5, 0), (20, 50, 3, 0), (20, 60, 4, 0),
]
toy_test_rows = [
    (0, 60, 3, 0),
    (10, 40, 5, 0),
    (20, 0, 5, 0), (20, 20, 4, 0), (20, 70, 2, 0),
]

toy_train = pd.DataFrame(toy_train_rows)
toy_test = pd.DataFrame(toy_test_rows)

# Write both splits as tab-separated files without a header row or index.
toy_train.to_csv('./Datasets/toy-example/train.csv', index=False, header=None, sep='\t')
toy_test.to_csv('./Datasets/toy-example/test.csv', index=False, header=None, sep='\t')