przetwarzanie_jezyka_natura.../P1/Movielens/main.ipynb
2023-01-19 21:22:01 +01:00

10 KiB

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the movie catalogue. The file has no header row, so column names are
# supplied here; the multi-character '::' separator needs the python engine.
df_movie = pd.read_csv(
    'movies.dat',
    sep='::',
    names=['MovieID', 'MovieName', 'Category'],
    encoding='latin1',
    engine='python',
).dropna()  # discard any row that has a missing field

# Preview the first rows.
df_movie.head()
MovieID MovieName Category
0 1 Toy Story (1995) Animation|Children's|Comedy
1 2 Jumanji (1995) Adventure|Children's|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama
4 5 Father of the Bride Part II (1995) Comedy
# Load the ratings. The file has no header row, so column names are supplied
# here; the multi-character '::' separator needs the python engine.
df_rating = pd.read_csv(
    'ratings.dat',
    sep='::',
    names=['ID', 'MovieID', 'Ratings', 'TimeStamp'],
    encoding='latin1',
    engine='python',
).dropna()  # discard any row that has a missing field

# Preview the first rows.
df_rating.head()
ID MovieID Ratings TimeStamp
0 1 1193 5 978300760
1 1 661 3 978302109
2 1 914 3 978301968
3 1 3408 4 978300275
4 1 2355 5 978824291
# Attach the movie metadata to every rating: inner join on the shared
# 'MovieID' key, movie columns first.
df = df_movie.merge(df_rating, on='MovieID')

# Preview the first rows.
df.head()
MovieID MovieName Category ID Ratings TimeStamp
0 1 Toy Story (1995) Animation|Children's|Comedy 1 5 978824268
1 1 Toy Story (1995) Animation|Children's|Comedy 6 4 978237008
2 1 Toy Story (1995) Animation|Children's|Comedy 8 4 978233496
3 1 Toy Story (1995) Animation|Children's|Comedy 9 5 978225952
4 1 Toy Story (1995) Animation|Children's|Comedy 10 5 978226474
# Mean rating per movie, doubled (presumably to express the star ratings on a
# 10-point scale -- TODO confirm the intended scale).
# The result is a Series indexed by MovieID and named 'Ratings'.
#
# NOTE: the previous `movieRatingsMean.columns = ['MovieID', 'Mean']` line was
# removed. A Series has no columns, so that assignment only attached a stray
# attribute and renamed nothing. Use `.rename('Mean')` (or
# `.reset_index(name='Mean')` for a two-column DataFrame) if a different label
# is actually wanted.
groupByMovie = df.groupby('MovieID')
movieRatingsMean = groupByMovie['Ratings'].mean() * 2
movieRatingsMean
MovieID
1       8.293693
2       6.402282
3       6.033473
4       5.458824
5       6.013514
          ...   
3948    7.271462
3949    8.230263
3950    7.333333
3951    7.800000
3952    7.561856
Name: Ratings, Length: 3706, dtype: float64