przetwarzanie_jezyka_natura.../P1/Movielens/main.ipynb

376 lines
10 KiB
Plaintext
Raw Normal View History

2023-01-19 21:22:01 +01:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MovieID</th>\n",
" <th>MovieName</th>\n",
" <th>Category</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation|Children's|Comedy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>Jumanji (1995)</td>\n",
" <td>Adventure|Children's|Fantasy</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Grumpier Old Men (1995)</td>\n",
" <td>Comedy|Romance</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>Waiting to Exhale (1995)</td>\n",
" <td>Comedy|Drama</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>Father of the Bride Part II (1995)</td>\n",
" <td>Comedy</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MovieID MovieName Category\n",
"0 1 Toy Story (1995) Animation|Children's|Comedy\n",
"1 2 Jumanji (1995) Adventure|Children's|Fantasy\n",
"2 3 Grumpier Old Men (1995) Comedy|Romance\n",
"3 4 Waiting to Exhale (1995) Comedy|Drama\n",
"4 5 Father of the Bride Part II (1995) Comedy"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_movie = pd.read_csv('movies.dat',sep='::',encoding='latin1',engine='python',names=['MovieID','MovieName','Category'])\n",
"df_movie.dropna(inplace=True)\n",
"df_movie.head()"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>MovieID</th>\n",
" <th>Ratings</th>\n",
" <th>TimeStamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>1193</td>\n",
" <td>5</td>\n",
" <td>978300760</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>661</td>\n",
" <td>3</td>\n",
" <td>978302109</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>914</td>\n",
" <td>3</td>\n",
" <td>978301968</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>3408</td>\n",
" <td>4</td>\n",
" <td>978300275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>2355</td>\n",
" <td>5</td>\n",
" <td>978824291</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID MovieID Ratings TimeStamp\n",
"0 1 1193 5 978300760\n",
"1 1 661 3 978302109\n",
"2 1 914 3 978301968\n",
"3 1 3408 4 978300275\n",
"4 1 2355 5 978824291"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_rating = pd.read_csv('ratings.dat',sep='::',encoding='latin1',engine='python',names=['ID','MovieID','Ratings','TimeStamp'])\n",
"df_rating.dropna(inplace=True)\n",
"df_rating.head()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MovieID</th>\n",
" <th>MovieName</th>\n",
" <th>Category</th>\n",
" <th>ID</th>\n",
" <th>Ratings</th>\n",
" <th>TimeStamp</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation|Children's|Comedy</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>978824268</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation|Children's|Comedy</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>978237008</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation|Children's|Comedy</td>\n",
" <td>8</td>\n",
" <td>4</td>\n",
" <td>978233496</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation|Children's|Comedy</td>\n",
" <td>9</td>\n",
" <td>5</td>\n",
" <td>978225952</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>Toy Story (1995)</td>\n",
" <td>Animation|Children's|Comedy</td>\n",
" <td>10</td>\n",
" <td>5</td>\n",
" <td>978226474</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" MovieID MovieName Category ID Ratings \\\n",
"0 1 Toy Story (1995) Animation|Children's|Comedy 1 5 \n",
"1 1 Toy Story (1995) Animation|Children's|Comedy 6 4 \n",
"2 1 Toy Story (1995) Animation|Children's|Comedy 8 4 \n",
"3 1 Toy Story (1995) Animation|Children's|Comedy 9 5 \n",
"4 1 Toy Story (1995) Animation|Children's|Comedy 10 5 \n",
"\n",
" TimeStamp \n",
"0 978824268 \n",
"1 978237008 \n",
"2 978233496 \n",
"3 978225952 \n",
"4 978226474 "
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.merge(df_movie,df_rating,left_on='MovieID',right_on='MovieID')\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"MovieID\n",
"1 8.293693\n",
"2 6.402282\n",
"3 6.033473\n",
"4 5.458824\n",
"5 6.013514\n",
" ... \n",
"3948 7.271462\n",
"3949 8.230263\n",
"3950 7.333333\n",
"3951 7.800000\n",
"3952 7.561856\n",
"Name: Ratings, Length: 3706, dtype: float64"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"groupByMovie = df.groupby('MovieID')\n",
"movieRatingsMean = groupByMovie['Ratings'].mean()*2\n",
"movieRatingsMean.columns = ['MovieID','Mean']\n",
"movieRatingsMean"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10 (default, Nov 14 2022, 12:59:47) \n[GCC 9.4.0]"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}