dl_projekt/dataset analyze and clean.ipynb

753 lines
82 KiB
Plaintext
Raw Permalink Normal View History

2024-06-01 15:52:53 +02:00
{
"cells": [
2024-06-03 16:32:11 +02:00
{
"cell_type": "markdown",
"source": [
"# Analiza zbioru danych, czyszczenie i podział na train/valid/test"
],
"metadata": {
"collapsed": false
}
},
2024-06-01 15:52:53 +02:00
{
"cell_type": "code",
"execution_count": 123,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"data": {
"text/plain": " app_id app_name \\\n6417086 99910 Puzzle Pirates \n6417087 99910 Puzzle Pirates \n6417088 99910 Puzzle Pirates \n6417089 99910 Puzzle Pirates \n6417090 99910 Puzzle Pirates \n6417091 99910 Puzzle Pirates \n6417092 99910 Puzzle Pirates \n6417093 99910 Puzzle Pirates \n6417094 99910 Puzzle Pirates \n6417095 99910 Puzzle Pirates \n6417096 99910 Puzzle Pirates \n6417097 99910 Puzzle Pirates \n6417098 99910 Puzzle Pirates \n6417099 99910 Puzzle Pirates \n6417100 99910 Puzzle Pirates \n6417101 99910 Puzzle Pirates \n6417102 99910 Puzzle Pirates \n6417103 99910 Puzzle Pirates \n6417104 99910 Puzzle Pirates \n6417105 99910 Puzzle Pirates \n\n review_text review_score \\\n6417086 Reminds me of the games I played in elementary... -1 \n6417087 I dont like this game -1 \n6417088 The actual game play of Puzzle Pirates is grea... -1 \n6417089 Rating based on current state of play, as per ... -1 \n6417090 This is just appalling. -1 \n6417091 Set my age as less than 5 by mistake. Apparent... -1 \n6417092 It is terrible because I cant ge on because of... -1 \n6417093 Was fun for the first 30 minutes or so, got bo... -1 \n6417094 The game is very awefull and strange. I think ... -1 \n6417095 A very good game, got sick of it after a while... -1 \n6417096 Imagine Bejeweled with a heavy grind based eco... -1 \n6417097 This game has some serious problems. First of ... -1 \n6417098 This game is good but also horrible. Its fun t... -1 \n6417099 A very good game, got sick of it after a while... -1 \n6417100 This game is good but also horrible. Its fun t... -1 \n6417101 I really ove this game but it needs somethings... -1 \n6417102 Used to play Puzzel Pirates 'way back when', b... -1 \n6417103 This game was aright, though a bit annoying. W... -1 \n6417104 I had a nice review to recommend this game, bu... -1 \n6417105 The puzzles in this game are fun, but you have... 
-1 \n\n review_votes \n6417086 0 \n6417087 0 \n6417088 0 \n6417089 0 \n6417090 0 \n6417091 0 \n6417092 0 \n6417093 0 \n6417094 0 \n6417095 1 \n6417096 0 \n6417097 0 \n6417098 0 \n6417099 1 \n6417100 0 \n6417101 0 \n6417102 0 \n6417103 0 \n6417104 0 \n6417105 0 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>app_id</th>\n <th>app_name</th>\n <th>review_text</th>\n <th>review_score</th>\n <th>review_votes</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>6417086</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>Reminds me of the games I played in elementary...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417087</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>I dont like this game</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417088</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>The actual game play of Puzzle Pirates is grea...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417089</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>Rating based on current state of play, as per ...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417090</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>This is just appalling.</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417091</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>Set my age as less than 5 by mistake. Apparent...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417092</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>It is terrible because I cant ge on because of...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417093</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>Was fun for the first 30 minutes or so, got bo...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417094</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>The game is very awefull and strange. 
I think ...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417095</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>A very good game, got sick of it after a while...</td>\n <td>-1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>6417096</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>Imagine Bejeweled with a heavy grind based eco...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417097</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>This game has some serious problems. First of ...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417098</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>This game is good but also horrible. Its fun t...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417099</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>A very good game, got sick of it after a while...</td>\n <td>-1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>6417100</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>This game is good but also horrible. Its fun t...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417101</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>I really ove this game but it needs somethings...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417102</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>Used to play Puzzel Pirates 'way back when', b...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417103</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>This game was aright, though a bit annoying. W...</td>\n <td>-1</td>\n <td>0</t
},
"execution_count": 123,
2024-06-01 15:52:53 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"dataset = pd.read_csv(\"dataset.csv\")\n",
"dataset.tail(20)"
2024-06-03 16:32:11 +02:00
],
"metadata": {
"collapsed": false
}
2024-06-01 15:52:53 +02:00
},
{
"cell_type": "code",
"execution_count": 124,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 6417106 entries, 0 to 6417105\n",
"Data columns (total 5 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 app_id int64 \n",
" 1 app_name object\n",
" 2 review_text object\n",
" 3 review_score int64 \n",
" 4 review_votes int64 \n",
"dtypes: int64(3), object(2)\n",
"memory usage: 244.8+ MB\n"
]
}
],
"source": [
"dataset.info()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 125,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"data": {
"text/plain": " app_id review_score review_votes\ncount 6.417106e+06 6.417106e+06 6.417106e+06\nmean 2.274695e+05 6.394992e-01 1.472446e-01\nstd 1.260451e+05 7.687918e-01 3.543496e-01\nmin 1.000000e+01 -1.000000e+00 0.000000e+00\n25% 2.018100e+05 1.000000e+00 0.000000e+00\n50% 2.391600e+05 1.000000e+00 0.000000e+00\n75% 3.056200e+05 1.000000e+00 0.000000e+00\nmax 5.653400e+05 1.000000e+00 1.000000e+00",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>app_id</th>\n <th>review_score</th>\n <th>review_votes</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>6.417106e+06</td>\n <td>6.417106e+06</td>\n <td>6.417106e+06</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>2.274695e+05</td>\n <td>6.394992e-01</td>\n <td>1.472446e-01</td>\n </tr>\n <tr>\n <th>std</th>\n <td>1.260451e+05</td>\n <td>7.687918e-01</td>\n <td>3.543496e-01</td>\n </tr>\n <tr>\n <th>min</th>\n <td>1.000000e+01</td>\n <td>-1.000000e+00</td>\n <td>0.000000e+00</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>2.018100e+05</td>\n <td>1.000000e+00</td>\n <td>0.000000e+00</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>2.391600e+05</td>\n <td>1.000000e+00</td>\n <td>0.000000e+00</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>3.056200e+05</td>\n <td>1.000000e+00</td>\n <td>0.000000e+00</td>\n </tr>\n <tr>\n <th>max</th>\n <td>5.653400e+05</td>\n <td>1.000000e+00</td>\n <td>1.000000e+00</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 125,
2024-06-01 15:52:53 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.describe()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### ~5x więcej pozytywnych recenzji niż negatywnych"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Usuwanie pustych wartości"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 126,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"data": {
"text/plain": "review_score\n 1 5260420\n-1 1156686\nName: count, dtype: int64"
},
"execution_count": 126,
2024-06-01 15:52:53 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset[\"review_score\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 127,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:xlabel='review_score'>"
},
"execution_count": 127,
2024-06-01 15:52:53 +02:00
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAhYAAAG/CAYAAAAEvJ5oAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAa1ElEQVR4nO3df5BV9X3/8dcCZUmFhQCGsHUFfwz+BAS0lqCISk2pYbBOjLG0YSC2kxnMhGHstIytxqpZM43WJk0IwQhxRotN1CTVBjFOAEdjFBgVDSECErYlggl1+fGd3pjd+/3DcdutgFz8LMvK4zFzZrjnnnvP+65eeXrO2XvrqtVqNQAABfTq7gEAgPcPYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABTTbWGxevXqTJ8+PY2Njamrq8t3v/vdmp+jWq3mS1/6UkaNGpX6+vr83u/9Xm677bbywwIAh6RPd+143759GTt2bObMmZMrr7zysJ7jc5/7XFasWJEvfelLGT16dHbt2pVdu3YVnhQAOFR1R8OXkNXV1eXhhx/OFVdc0bGuUqnkhhtuyL/8y7/kjTfeyNlnn50vfvGLmTJlSpJkw4YNGTNmTF566aWcdtpp3TM4ANDJUXuNxXXXXZcf//jHWbZsWV588cVcddVV+aM/+qO88sorSZJ/+7d/y8knn5xHHnkkJ510UkaOHJlrr73WEQsA6EZHZVhs27YtS5Ysybe//e1ceOGFOeWUU3L99dfnggsuyJIlS5IkW7ZsyS9+8Yt8+9vfzr333pulS5dm7dq1+fjHP97N0wPAsavbrrE4mPXr16etrS2jRo3qtL5SqWTIkCFJkvb29lQqldx7770d233zm9/MhAkTsnHjRqdHAKAbHJVhsXfv3vTu3Ttr165N7969O93Xv3//JMnw4cPTp0+fTvFxxhlnJHnriIewAIAj76gMi3HjxqWtrS07d+7MhRdeuN9tJk2alN/+9rfZvHlzTjnllCTJz3/+8yTJiBEjjtisAMD/6LbfCtm7d282bdqU5K2QuPPOO3PxxRdn8ODBOfHEE/Nnf/Zneeqpp3LHHXdk3Lhxef311/PEE09kzJgxufzyy9Pe3p7zzjsv/fv3z1133ZX29vbMnTs3DQ0NWbFiRXe8JAA45nVbWKxcuTIXX3zxO9bPmjUrS5cuzZtvvplbb7019957b/7zP/8zQ4cOzR/8wR/k5ptvzujRo5Mk27dvz2c/+9msWLEixx13XKZNm5Y77rgjgwcPPtIvBwDIUfI5FgDA+8NR+eumAEDPdMQv3mxvb8/27dszYMCA1NXVHendAwCHoVqtZs+ePWlsbEyvXgc+LnHEw2L79u1pamo60rsFAApoaWnJCSeccMD7j3hYDBgwIMlbgzU0NBzp3QMAh2H37t1pamrq+Hv8QI54WLx9+qOhoUFYAEAP826XMbh4EwAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIrp090DHEtG/s2j3T0CR9DW2y/v7hEAjjhHLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICACimprD4/Oc/n7q6uk7L6aef3lWzAQA9TM1fm37WWWflhz/84f88QR/fvA4AvKXmKujTp08+/OEPH/L2lUollUql4/bu3btr3SUA0EPUfI3FK6+8ksbGxpx88smZOXNmtm3bdtDtm5ubM3DgwI6lqanpsIcFAI5uNYXF+eefn6VLl2b58uVZuHBhXn311Vx44YXZs2fPAR+zYMGCtLa2diwtLS3veWgA4OhU06mQadOmdfx5zJgxOf/88zNixIj867/+az796U/v9zH19fWpr69/b1MCAD3Ce/p100GDBmXUqFHZtGlTqXkAgB7sPY
XF3r17s3nz5gwfPrzUPABAD1ZTWFx//fVZtWpVtm7dmqeffjp/8id/kt69e+eaa67pqvkAgB6kpmss/uM//iPXXHNNfv3rX+f444/PBRdckGeeeSbHH398V80HAPQgNYXFsmXLumoOAOB9wHeFAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICACjmPYXF7bffnrq6usybN6/QOABAT3bYYfHcc89l0aJFGTNmTMl5AIAe7LDCYu/evZk5c2YWL16cD37wgwfdtlKpZPfu3Z0WAOD96bDCYu7cubn88sszderUd922ubk5AwcO7FiampoOZ5cAQA9Qc1gsW7Ys69atS3Nz8yFtv2DBgrS2tnYsLS0tNQ8JAPQMfWrZuKWlJZ/73Ofy+OOPp1+/fof0mPr6+tTX1x/WcABAz1JTWKxduzY7d+7M+PHjO9a1tbVl9erV+ed//udUKpX07t27+JAAQM9QU1hceumlWb9+fad1s2fPzumnn56//uu/FhUAcIyrKSwGDBiQs88+u9O64447LkOGDHnHegDg2OOTNwGAYmo6YrE/K1euLDAGAPB+4IgFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGJqCouFCxdmzJgxaWhoSENDQyZOnJgf/OAHXTUbANDD1BQWJ5xwQm6//fasXbs2a9asySWXXJIZM2bk5Zdf7qr5AIAepE8tG0+fPr3T7dtuuy0LFy7MM888k7POOmu/j6lUKqlUKh23d+/efRhjAgA9wWFfY9HW1pZly5Zl3759mThx4gG3a25uzsCBAzuWpqamw90lAHCUqzks1q9fn/79+6e+vj6f+cxn8vDDD+fMM8884PYLFixIa2trx9LS0vKeBgYAjl41nQpJktNOOy3PP/98Wltb853vfCezZs3KqlWrDhgX9fX1qa+vf8+DAgBHv5rDom/fvjn11FOTJBMmTMhzzz2Xf/qnf8qiRYuKDwcA9Czv+XMs2tvbO12cCQAcu2o6YrFgwYJMmzYtJ554Yvbs2ZP7778/K1euzGOPPdZV8wEAPUhNYbFz58586lOfyi9/+csMHDgwY8aMyWOPPZY//MM/7Kr5AIAepKaw+OY3v9lVcwAA7wO+KwQAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBABQjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIoRFgBAMcICAChGWAAAxQgLAKAYYQEAFCMsAIBihAUAUIywAACKERYAQDHCAgAoRlgAAMUICwCgGGEBAB
QjLACAYoQFAFCMsAAAihEWAEAxwgIAKEZYAADFCAsAoBhhAQAUIywAgGKEBQBQjLAAAIqpKSyam5tz3nnnZcCAAfn
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"dataset[\"review_score\"].value_counts().plot(kind=\"bar\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 128,
2024-06-01 15:52:53 +02:00
"outputs": [],
"source": [
"dataset_without_na = dataset.dropna()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 129,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"data": {
"text/plain": " app_id app_name \\\n0 10 Counter-Strike \n1 10 Counter-Strike \n2 10 Counter-Strike \n3 10 Counter-Strike \n4 10 Counter-Strike \n... ... ... \n6417101 99910 Puzzle Pirates \n6417102 99910 Puzzle Pirates \n6417103 99910 Puzzle Pirates \n6417104 99910 Puzzle Pirates \n6417105 99910 Puzzle Pirates \n\n review_text review_score \\\n0 Ruined my life. 1 \n1 This will be more of a ''my experience with th... 1 \n2 This game saved my virginity. 1 \n3 • Do you like original games? • Do you like ga... 1 \n4 Easy to learn, hard to master. 1 \n... ... ... \n6417101 I really ove this game but it needs somethings... -1 \n6417102 Used to play Puzzel Pirates 'way back when', b... -1 \n6417103 This game was aright, though a bit annoying. W... -1 \n6417104 I had a nice review to recommend this game, bu... -1 \n6417105 The puzzles in this game are fun, but you have... -1 \n\n review_votes \n0 0 \n1 1 \n2 0 \n3 0 \n4 1 \n... ... \n6417101 0 \n6417102 0 \n6417103 0 \n6417104 0 \n6417105 0 \n\n[6226728 rows x 5 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>app_id</th>\n <th>app_name</th>\n <th>review_text</th>\n <th>review_score</th>\n <th>review_votes</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>10</td>\n <td>Counter-Strike</td>\n <td>Ruined my life.</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>10</td>\n <td>Counter-Strike</td>\n <td>This will be more of a ''my experience with th...</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>10</td>\n <td>Counter-Strike</td>\n <td>This game saved my virginity.</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>10</td>\n <td>Counter-Strike</td>\n <td>• Do you like original games? • Do you like ga...</td>\n <td>1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>10</td>\n <td>Counter-Strike</td>\n <td>Easy to learn, hard to master.</td>\n <td>1</td>\n <td>1</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>6417101</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>I really ove this game but it needs somethings...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417102</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>Used to play Puzzel Pirates 'way back when', b...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417103</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>This game was aright, though a bit annoying. 
W...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417104</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>I had a nice review to recommend this game, bu...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6417105</th>\n <td>99910</td>\n <td>Puzzle Pirates</td>\n <td>The puzzles in this game are fun, but you have...</td>\n <td>-1</td>\n <td>0</td>\n </tr>\n </tbody>\n</table>\n<p>6226728 rows × 5 columns</p>\n</div>"
},
"execution_count": 129,
2024-06-01 15:52:53 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset_without_na"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 130,
2024-06-01 15:52:53 +02:00
"outputs": [],
"source": [
"dataset = dataset_without_na"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 131,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"data": {
"text/plain": "review_score\n 1 5126132\n-1 1100596\nName: count, dtype: int64"
},
"execution_count": 131,
2024-06-01 15:52:53 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset[\"review_score\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Gry z największą liczbą recenzji w zbiorze danych"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 132,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"data": {
"text/plain": "app_name\nDayZ 88850\nPAYDAY 2 88783\nTerraria 84702\nRust 77037\nDota 2 73433\nRocket League 54188\nUndertale 51878\nLeft 4 Dead 2 50863\nWarframe 48164\nGrand Theft Auto V 42323\nRobocraft 41596\nStarbound 41141\nPortal 2 38796\nSpace Engineers 37453\nFallout: New Vegas 32918\nArma 3 32262\nThe Witcher 3: Wild Hunt 31830\nHeroes & Generals 31303\nBioShock Infinite 31076\nThe Forest 29998\nName: count, dtype: int64"
},
"execution_count": 132,
2024-06-01 15:52:53 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset['app_name'].value_counts().nlargest(20)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Recenzje gier dostępnych we wczesnym dostępie wyświetlają się jako \"Early Access Review\", bez tekstu"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 133,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"data": {
"text/plain": "review_text\n Early Access Review 977399\n Early Access Review 10571\n10/10 6050\n. 4769\nGreat game 3662\ngreat game 3554\nGreat game! 2440\n:) 2093\nNice game 1793\nGreat Game 1659\n♥♥♥♥ 1645\nGreat game. 1633\ncool 1502\n... 1247\nits good 974\nGreat Game! 924\n9/10 889\n8/10 747\nGreat 746\ni love this game 720\nName: count, dtype: int64"
},
"execution_count": 133,
2024-06-01 15:52:53 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset['review_text'].value_counts().nlargest(20)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 134,
2024-06-01 15:52:53 +02:00
"outputs": [],
"source": [
"# Drop placeholder reviews: Early Access entries carry only the marker text.\n",
"# regex=False -> plain substring match; `~` negates the boolean mask (instead of `== False`).\n",
"dataset = dataset[~dataset['review_text'].str.contains(\"Early Access Review\", regex=False)]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 135,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"data": {
"text/plain": "review_text\n10/10 6050\n. 4769\nGreat game 3662\ngreat game 3554\nGreat game! 2440\nName: count, dtype: int64"
},
"execution_count": 135,
2024-06-01 15:52:53 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset['review_text'].value_counts().nlargest(5)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 136,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"data": {
"text/plain": "review_score\n 1 4341259\n-1 897431\nName: count, dtype: int64"
},
"execution_count": 136,
2024-06-01 15:52:53 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset[\"review_score\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Zbiór danych nadal jest dosyć duży, więc obetnę jego większość (zostawiam losowe 3%) w celu szybszego treningu"
2024-06-01 15:52:53 +02:00
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 137,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"data": {
"text/plain": "review_score\n 1 130102\n-1 27059\nName: count, dtype: int64"
2024-06-01 15:52:53 +02:00
},
"execution_count": 137,
2024-06-01 15:52:53 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep a random 3% of the rows; the fixed seed makes the subset (and every\n",
"# result derived from it) reproducible after Restart & Run All.\n",
"dataset = dataset.sample(frac=0.03, random_state=42)\n",
2024-06-01 15:52:53 +02:00
"dataset[\"review_score\"].value_counts()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Usunięcie niepotrzebnych kolumn"
2024-06-01 15:52:53 +02:00
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 138,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 157161 entries, 1260671 to 6268511\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 review_text 157161 non-null object\n",
" 1 review_score 157161 non-null int64 \n",
"dtypes: int64(1), object(1)\n",
"memory usage: 3.6+ MB\n"
]
2024-06-01 15:52:53 +02:00
}
],
"source": [
"dataset = dataset.drop(columns=[\"app_id\", \"review_votes\", \"app_name\"])\n",
"dataset.info()"
2024-06-01 15:52:53 +02:00
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
2024-06-01 15:52:53 +02:00
"source": [
"### Podział na zbiory train/test/valid (80/10/10)"
2024-06-01 15:52:53 +02:00
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 139,
"outputs": [],
2024-06-01 15:52:53 +02:00
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# 80/10/10 split. A fixed seed keeps the split reproducible; stratifying on the\n",
"# label keeps the (imbalanced) class ratio the same in train, test and valid.\n",
"train, test_and_valid = train_test_split(\n",
"    dataset, test_size=0.2, random_state=42, stratify=dataset[\"review_score\"]\n",
")\n",
"test, valid = train_test_split(\n",
"    test_and_valid, test_size=0.5, random_state=42, stratify=test_and_valid[\"review_score\"]\n",
")"
2024-06-01 15:52:53 +02:00
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Downsampling klasy pozytywnej dla zbioru treningowego"
],
2024-06-01 15:52:53 +02:00
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 140,
"outputs": [
{
"data": {
"text/plain": "review_score\n 1 104113\n-1 21615\nName: count, dtype: int64"
},
"execution_count": 140,
"metadata": {},
"output_type": "execute_result"
}
],
2024-06-01 15:52:53 +02:00
"source": [
"train[\"review_score\"].value_counts()"
2024-06-01 15:52:53 +02:00
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 141,
2024-06-01 15:52:53 +02:00
"outputs": [],
"source": [
"# Split the training set by label so the majority (positive) class can be downsampled.\n",
"dataset_positive_reviews = train[train[\"review_score\"] == 1]\n",
"dataset_negative_reviews = train[train[\"review_score\"] == -1]\n",
"\n",
"# Draw exactly as many positives as there are negatives instead of a hard-coded\n",
"# count, so the classes stay balanced whatever the upstream split produced.\n",
"dataset_positive_reviews = dataset_positive_reviews.sample(\n",
"    n=len(dataset_negative_reviews), random_state=42\n",
")\n",
"train = pd.concat([dataset_positive_reviews, dataset_negative_reviews])\n",
"train = train.sample(frac=1.0, random_state=42)  # shuffle the example order\n",
"train = train.reset_index(drop=True)"
2024-06-01 15:52:53 +02:00
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 142,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='count'>"
},
"execution_count": 142,
2024-06-01 15:52:53 +02:00
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZkAAAGFCAYAAAAvsY4uAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAgYklEQVR4nO3deXTV9Z3/8dfNSmICGEIWVllkEcOiWKW1iKgV3DpdsKBFOljsaPuzrR2lnV/dZtpxbGvrjLXHVutYFbWWutbWKghKUUAQBAQCyBK2JISQkH27d/64guwkN/d739/l+TiHE3LxeF5y4/f1/Xw+9/v5hCKRSEQAADggyToAAMC/KBkAgGMoGQCAYygZAIBjKBkAgGMoGQCAYygZAIBjKBkAgGMoGQCAYygZAIBjKBkAgGMoGQCAYygZAIBjKBkAgGMoGQCAYygZAIBjKBkAgGMoGQCAYygZAIBjKBkAgGMoGQCAYygZAIBjKBkAgGMoGQCAYygZAIBjKBkggV544QV94QtfUI8ePRQKhbRq1SrrSICjKBkggerq6nThhRfq/vvvt44CJESKdQAgSKZPny5J2rZtm20QIEEYyQAAHEPJAAAcQ8kADpkzZ46ysrIO/Vq0aJF1JCDhWJMBHHLNNdfo/PPPP/R97969DdMANigZwCHZ2dnKzs62jgGYomSABKqsrFRJSYl2794tSSouLpYkFRQUqKCgwDIa4AjWZIAEeuWVVzRmzBhdeeWVkqSpU6dqzJgxeuSRR4yTAc4IRSKRiHUIAIA/MZIBADiGkgEAOIaSAQA4hpIBADiGkgEAOIaSAQA4hpIBADiGkgEAOIaSAQA4hr3LgHaoqm9W2YEmldc0Hvpa09iq1rawWsMRtYUjag1HDn0fiUhJoZBSk0NKTgopJSmk5KQkpSaHlJqcpO6Zqcrv2kV52enRr13TlZnG/47wH36qEWjVDS3aU90QLY4DjSqviX49WCTlNU0qr2lSc2vY8SzZ6Snq2TX90+L55GvPw77v1T1DXVKTHc8CxAt7lyEw9tc1a82u6uivndGvu6oarGN1SEpSSIPzslTUu5uK+nRTUe9uGl7YleKBa1Ey8CU/FEp7HV48I/t009kUD1yEkoHnNba0afm2/fpwZ5XW7qrW6p3+LZT2Olg8Iz8Z7Yzpd7pG9OqqUChkHQ0BQ8nAkypqm/TW+nK9ub5M/9hUoYaWNutIrpffNV0Th+XrsrPy9NlBuYx0kBCUDDxjY1mN3lxXpvnry7RqR5XC/OTGLCM1WReemavLhudr4vA85WalW0eCT1EycK3WtrCWbavUvHXlmr+hTNv31VtH8qWkkDS6b3ddMjxfl52VryH52daR4COUDFzlQGOLFhbv1bx1ZVpYXK4Dja3WkQKnf49MXTIsX5cOz9NnBuQoJZlnthE7SgausLJkv55eUqK/rN6tpgQ8k4L2yctO19Tz+uq68/uroFsX6zjwIEoGZhpb2vTyql16ekmJ1uyqto6Dk0hJCumS4XmafsEZ+tzgHnxKDe1GySDhtlbU6an3tuvPH+xUdUOLdRx00MCep+n68/vrq+f2UbeMVOs4cDlKBgnRFo7ozXVlenrJdi3+uEL81HlfRmqyrhnVS9PH9dfZvbtZx4FLUTJwVHlNo55btkPPLivRnupG6zhwyOi+3TX9gv66alSh0lN4/gafomTgiOLSGj301ib9/aNStbTxIxYUOaelaep5fXXT+IHqnplmHQcuQMkgrnbur9cv39yol1bu4mHJAMvukqJ/uWiQZn5ugDLSGNkEGSWDuNhX26RfL9isOUtK1NzGR5ARlZedrlsvOVNTz+vL8zYBRcmgU+qaWvXooi16bNFW1Tbx4CSOb0DuabrtsiG6amQhH38OGEoGMWluDWvO0u16eMFmVdQ2W8eBRxT17qbbLx+q8UN6WkdBglAy6JBwOKKXP9ylX765UTsqg72dPmL32UE9NHvSMI3q2906Ch
xGyaDd3tpQpp+9XqwNpTXWUeATVxQV6F+/MFQDe2ZZR4FDKBmc0raKOv3ohTV6b8s+6yjwoZSkkL5+QX/dMWmoMtNSrOMgzigZnFAkEtH/Lt6mn/+9mEPB4Lh+OZn62VdH6oKBPayjII4oGRzXtoo63TF3tZZtq7SOggAJhaQbLuiv2ZOHMarxCUoGR2D0AjdgVOMflAwOYfQCN2FU4w+UDBi9wNUY1XgbJRNwjF7gBYxqvIuSCShGL/AiRjXeQ8kEUEVtk7495wMt3croBd4TCkmzPj9QsycNU3IS+6C5HSUTMGt3VeumJ5drNweIwePGD+mph6aN4Qhol6NkAuQvq3fr9j+tZnoMvjEw9zQ9OmOsBrEtjWtRMgEQiUT0wBsb9esFm62jAHGX3SVFD00bowlD86yj4DgoGZ+ra2rV9/+4Sm+sK7OOAjgmKST9cPIw3TR+kHUUHIWS8bEdlfX65h+Wq7iMXZMRDF8e01v3faVI6Skc+ewWlIxPvftxhb495wPtr2+xjgIk1Oi+3fW76ecqr2sX6ygQJeNLT763Tf/+6jq1hnlrEUz5XdP1u+ljORTNBSgZH2lpC+uulz/Ss8tKrKMA5tJTkvRfXynSl8b0sY4SaJSMT1Q3tGjWk8u1jAcsgSPcPGGQZk8aZh0jsCgZH6isa9b03y/VR7sPWEcBXOnrF/TTf3zxbIVC7BCQaJSMx+2tadL1jy3RxrJa6yiAq005t4/u/8pIJbEVTUJRMh5WWt2o6x5boi1766yjAJ7wxdG99MtrR7PnWQJRMh61c3+9rnt0qUoq662jAJ4y+ewC/c+0MUpNTrKOEgiUjAeV7KvXtEeXaFdVg3UUwJMuHZ6n31x/rtJSKBqn8TfsMbuqGigYoJPmrS/Xd575QK1tYesovkfJeEjZgUZdR8EAcfHGujJ994+r1MZDy46iZDxib02Tpj26RNv3sQYDxMtrq/fo9j99qDBF4xhKxgP21zXr648t5VNkgANeWLlL//+lNWJ52hmUjMtVN7Ro+uNL2UkZcNCzy3bo3lfXWcfwJUrGxVrbwrplzgqt3cWT/IDTnnh3m3779sfWMXyHknGxn7y2Xos377OOAQTG/a9v0ILicusYvkLJuNRzy0r0xLvbrGMAgRKOSLc+u1Kby9mmKV4oGRdavq1Sd738kXUMIJBqGls168nlqm7gwL94oGRcZndVg/7l6RVq5iExwMzWijp955kPeIYmDigZF2lobtOsJ5erorbZOgoQeIs2Veg//7reOobnUTIu8q9zP+RMGMBFfv+PrZq7Yqd1DE+jZFziofmb9NrqPdYxABzl315cow9K9lvH8CxKxgX+/lGpfjlvo3UMAMfR3BrWt55aodLqRusonkTJGCsurdFtf1wldrQA3GtvTZNuemq5GlvarKN4DiVjaH9ds2Y9uVx1zfzgAm63eme1Zv95tXUMz6FkDH3/+VWcbAl4yMurduvJ97ZZx/AUSsbI8+/v0MLivdYxAHTQf/1tg0o4cqPdKBkDe6ob9B+vseMr4EX1zW26fe6HHA3QTpSMgR/+eY1qGlutYwCI0dKtlXryve3WMTyBkkmw59/fobc3Mk0GeN39rzNt1h6UTAIxTQb4B9Nm7UPJJBDTZIC/MG12apRMgjBNBvgT02YnR8kkANNkgH8xbXZylEwCME0G+BvTZidGyTiMaTIgGJg2Oz5KxkFMkwHBwbTZ8VEyDrrnlY+YJgMCZOnWSj2/fId1DFehZBzyQcl+/f2jMusYABLsV29u4kiAw1AyDrn/bxusIwAwUHqgUX94d5t1DNegZBywoLhcS7dWWscAYOQ3Cz9WdUOLdQxXoGTiLBKJ6GevF1vHAGCouqFFj7z9sXUMV6Bk4uyVD3dr/Z4D1jEAGHti8TaVH2i0jmGOkomjlrawHnhjo3UMAC7Q0NKmB+dvso5hjpKJo2eWlnCcMoBDnn9/h7ZW1FnHMEXJxEl9c6seemuzdQwALtIajugXbw
R7jZaSiZPHFm1VRW2TdQwALvPXNXu0Zme1dQwzlEwcVNY169F3tljHAOBCkUh0X7OgomTi4OEFm1XTxPYxAI7vH5s
2024-06-01 15:52:53 +02:00
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"train[\"review_score\"].value_counts().plot(kind=\"pie\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 143,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='count'>"
},
"execution_count": 143,
2024-06-01 15:52:53 +02:00
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZkAAAGFCAYAAAAvsY4uAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAoaklEQVR4nO3deXxU1cH/8e9kD0lIQjaSIEsI+w5uqEWwVbGuVXFHqv3Zah93H6H2qVqX2hetbW2tSmsf6lr3Bfu4L4CAImvYQZYkJEASspM9mZnfHyiVPUzmzrn3zuf9evGSJLxmvpIw3zn3nHuOx+/3+wUAgAUiTAcAALgXJQMAsAwlAwCwDCUDALAMJQMAsAwlAwCwDCUDALAMJQMAsAwlAwCwDCUDALAMJQMAsAwlAwCwDCUDALAMJQMAsAwlAwCwDCUDALAMJQMAsAwlAwCwDCUDALAMJQMAsAwlAwCwDCUDALAMJQMAsAwlAwCwDCUDALAMJQMAsAwlAwCwDCUDALAMJQMAsAwlAwCwDCUDALAMJQMAsAwlAwCwDCUDALAMJQMAsAwlAwCwDCUDALAMJQMAsAwlAwCwDCUDALAMJQMAsAwlAwCwDCUDALAMJQMAsAwlAwCwTJTpAICdtXX4VNvcprqmdtU1t6v2m//Wt7Srqc2rlnavmtq8am73qqXNq3afX9ERHsVERez9FRmh6G/+GxMVodioCEVH/udrCbFRyuoeq+zkeGUkxSoywmP6fxkIKkoGYa+qoVXF1U3aXtWk4qomFVc1qrh67+8rG1pDliMywqOMxFj1TI5Tz+5x6pkcp+zkuH0fZyfHKzslTtGRXICAc3j8fr/fdAjAal6fXxt21WvtjjoVVjXuK5Tt1U1qaO0wHa/TYiIjlJ+ZqGE53ff+yk3WkOzuSozl/SLsiZKBK1U2tGpFcY1WltRqRXGN1uyoU1Ob13QsS3g8Ut+0BA3N6a6h2d+UT06yMpJiTUcDKBk4X4fXp/W76rVye61WbK/Riu01KqluNh3LuMykWI3rk6pT8tN1Wn66+qUnmI6EMETJwJG2VzXpo/Vl+nRDhVaW1Kil3Wc6ku3lJMfplPx0nZqfpgkDMpSWyEgH1qNk4Ah+v1+rSuv08foyfby+XF+XN5iO5GgRHmlErxRNGpShMwZnakRusjweVrYh+CgZ2FZrh1dfbK3Sx+vL9emGcpXXh26lV7jJSIrVpEEZunB0rsbnpSmCpdQIEkoGttLQ2rFvtDJ/0241unSy3s5ykuN04ZhcXTI2V/mZSabjwOEoGdjCksJqvbK0RO+v3eXaVWBONLJXsi4ek6sLRueqR0KM6ThwIEoGxlTUt+i15aV6fXmpCisbTcfBEURHenT6wAxdPLaXvj8kU7FRkaYjwSEoGYTcF1sq9dyXxfpkQ7k6fPz4OU1yfLTOG5mtaaf01cAsLqfhyCgZhER9S7veWF6qFxYXa+tuRi1u4PFIEwdm6MbT++ukvDTTcWBTlAwsVVHfolnzt+nlpduZa3GxMb1T9LMJeTpraE9WpmE/lAwsUV7foqfmbdVLS7artYMbJcNFXnqCbpiQp4vH5jJvA0mUDIKMcoG0976b607tq2tO7qPucdGm48AgSgZBUVbXoifnbdHLS0vURrngG4mxUbrqpN666fT+SmUJdFiiZNAlu+qa9eTcrXplGeWCw+seF6Vbvz9A007py3k4YYaSQUCqGlr12Ceb9crSErV5KRd0Tt+0bvrFOUM0eXhP01EQIpQMjonP59eLXxXr0Y++Vl1zu+k4cKiT83roV+cO1fDcZNNRYDFKBp22cnuN7p2zVmt31JuOAheI8EgXj+2l6WcPUmb3ONNxYBFKBkdV09immR9s1CvLSsRPC4KtW0ykfjahv346IU/xMSx7dhtKBofl8/n18tIS/f7Djapp4tIYrJWdHKdfnTtU547MNh0FQUTJ4JDWlNbpV3PWalVJrekoCDOTh/XUgxcNU2
YSl9DcgJLBfupb2jXz/Y16acl2sXclTEnpFq17zx2qS8b1Mh0FXUTJYJ9lRdW67eUC7ahtNh0FkCRNGpSh3148Uj2TGdU4FSUDeX1+/eXTzfrr3C3yMnyBzSTHR+uhi4brglE5pqMgAJRMmCutadLtLxdoWXGN6SjAEV0wKkcPXTRcyfHsheYklEwY+7/VO/XLN9eovqXDdBSgU3KS4/TolFE6JT/ddBR0EiUThhpbO3T/O+v0+vJS01GAY+bxSD+f2F93nTmIs2scgJIJM6tLa3XbywUqrOR0SjjbxEEZ+vMVY7h8ZnOUTJjw+/36++fb9OhHm9Tu5VsOd+iXnqC/Tx2nAVlJpqPgMCiZMNDc5tV/v7ZK767ZZToKEHSJsVF6dMoodna2KUrG5XbVNeuG55axqSVczeORbpmUrzvOHCiPh3kaO6FkXGzl9hr99Pnl2r2n1XQUICR+MCRTf7p8tJI48tk2KBmXmlOwQ9NfX61WTqtEmMnLSNDT1x6v/hmJpqNAlIwr/fWzzXr0o69NxwCMSYqN0mNXjNb3h2SZjhL2KBkX6fD6dO+ctXppSYnpKIBxkREe/f7Skbp4LJtsmkTJuERja4d+/uIKzf96t+kogG14PNKDFw7X1JP7mI4StigZF6hsaNW02Uu0bicryIBD+cU5g3Xj6f1NxwhLlIzDVTa06qqnF+vr8gbTUQBb+69J/XX32YNNxwg7lIyDUTDAsfnxKX11//lDuZcmhCgZh6pqaNWVFAxwzKaM66WZl4xkc80QoWQcqKqhVVc9/ZU2le8xHQVwpHNHZuuxy0crOjLCdBTXo2QcpqqhVVf/4yttLKNggK6YNChDT10zTnHRkaajuBo17iDVjW0UDBAkczft1k0vLFeHl10xrETJOER1Y5uuenoxBQME0dxNu3XPm2tMx3A1SsYBKBjAOq8tL9XvP9xoOoZrUTI219zm1Y//uYSCASz0xNyteu7LItMxXImSsTG/36/bX1mp1aV1pqMArvfrd9bpfQ72CzpKxsZmfrBJH64rNx0DCAs+v3TbKwX6aluV6SiuQsnY1KtLSzRr/lbTMYCw0tbh0w3PLdMmLk8HDSVjQ19urdL/vM2KF8CE+pYOTZu9RDtrm01HcQVKxma27W7QTS8uV7uXe2QBU8rqW3Tt7CWqbWozHcXxKBkbqW1q00+eXabapnbTUYCwt6WiQTf/a6V8Pt7wdQUlYxPtXp9ufGG5CisbTUcB8I2FWyr16EebTMdwNErGJn755hot3lZtOgaAAzw1f6s+WldmOoZjUTI28MyiQr22vNR0DACH4PdLd722iqsMAaJkDNuwq16PvM+WFoCd7Wnp0I3PL1dzm9d0FMehZA7w+eef6/zzz1dOTo48Ho/efvtty56rpd2r215eqbYOdoEF7G5T+R7d/85a0zEch5I5QGNjo0aNGqUnnnjC8uf6zbsbONkScJBXl5VqTsEO0zEcJcp0ALs555xzdM4551j+PJ+sL9fzi4stfx4AwfU/b63VyF4p6peeYDqKIzCSMaBiT4tmvLHadAwAAWho7dDN/1qh1g7mZzqDkgkxv9+vu15dpapG7iQGnGrdznrNfJ/7ZzqDkgmx/11YqAWbK03HANBFz3xRqOXFNaZj2B4lE0Lrdtbpdx/w7gdwA59f+sUbq1kdehSUTIg0t3l128sFavPyAwm4xeaKBj01jyM5joSSOUBDQ4MKCgpUUFAgSSosLFRBQYG2b9/epcf948ebtKWC5cqA2zwxd4u2VHD+zOF4/H4/W4x+x7x58zRp0qSDPj9t2jQ988wzAT3m+p31uuCvC9XBbq6AKx3fJ1Wv3TheHo/HdBTboWQs5vf7dfFTX2jl9lrTUQBY6KELh2nq+L6mY9gOl8ss9q8l2ykYIAz87oNNKqtrMR3DdigZC1U2tGomm18CYWFPa4d+9TZ7mx2IkrHQb9/bqPqWDtMxAITIJxvK9e7qXaZj2AolY5GV22v05krOiA
HCzf3vrFN9C0eof4uSsYDf79ev/71eLKkAwk9lQ6v+Np97Z75FyVjgjRU7tKqk1nQMAIb8c1GRdu9pNR3DFiiZIGt
2024-06-01 15:52:53 +02:00
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Share of each review_score class in the test split. The title distinguishes this chart from the\n",
"# same pie drawn for the other splits; the trailing ';' keeps the Axes repr out of the cell output.\n",
"test[\"review_score\"].value_counts().plot(kind=\"pie\", title=\"test: review_score\");"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 144,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:ylabel='count'>"
},
"execution_count": 144,
2024-06-01 15:52:53 +02:00
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZkAAAGFCAYAAAAvsY4uAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAo9UlEQVR4nO3dd5yU1aH/8e9s77ssW9hdepciIBBEEYGoiIqiqMF+YzRXvTeJ2AjmZzTFmJjojfFGTUjQqBgLNoyxB5Qi3aWDtGWXsgW295md+f2B4UqRMjvPnOd55vN+vXi5O8DsV1jmO+c85znHEwgEAgIAwAJRpgMAANyLkgEAWIaSAQBYhpIBAFiGkgEAWIaSAQBYhpIBAFiGkgEAWIaSAQBYhpIBAFiGkgEAWIaSAQBYhpIBAFiGkgEAWIaSAQBYhpIBAFiGkgEAWIaSAQBYhpIBAFiGkgEAWIaSAQBYhpIBAFiGkgEAWIaSAQBYhpIBAFiGkgEAWIaSAQBYhpIBAFiGkgEAWIaSAQBYhpIBAFiGkgEAWIaSAQBYhpIBAFiGkgEAWIaSAQBYhpIBAFiGkgEAWIaSAQBYhpIBAFiGkgEAWIaSAQBYhpIBAFiGkgEAWIaSAQBYJsZ0AMDOWn1+HWhoUW2TT7XNXtU0elXb7FVtk1cNrW1q9h780eRtU7PXr2ZvmyQpPiZa8bFRio+JUlxM1MHPYw5+Hh/7tY9jopWZHKe89AR1Sk9QbDTv++AulAwiXkVdi4orG7W7qlHFBxpVXHnwR0llo0prm+UPhCdHlEfqmBKv/PQE5aUnKi8jQfnpieqUnqD8jIOP5aYlKDrKE55AQAh4AoFAmP4JAeYEAgFtK6/XFyXV2lJap10HDpZISVWjGlvbTMc7adFRHvXMStaggnQNzE879N/UhFjT0YBjomTgSvvrW1RYXK3Ckmp9UVKltSU1qmvxmY5lCY9H6paZpIEF6RqUn65BBWkalJ+uDslxpqMBlAycr9nbpg17aw8WSnGVCkuqtbuqyXQs4woyEnV653Sd2bOjzu6dpd45KaYjIQJRMnCkvdVN+nhTmT7eVK6lOw6o1ec3Hcn28tITdHbvLI3pnaUxfbKUlRJvOhIiACUDRwgEAlqzu0affFUsm/bVmo7kaB6PdHpBuib0z9WE/jkaVJAmj4cFBQg9Sga21dTapkXb9uuTTWX61+Zylde1mI7kWrlp8RrfL0eTBudpTO8sVrAhZCgZ2Epjq0/vrSvVP9ft0+Lt+9XsZRos3HLT4jVlaIGmDu+svrmppuPA4SgZ2MKKokq9trJE/1xXqnqXrgJzosEF6Zp6RoEuHVqgTFarIQiUDIwpr23Wa6t2a+6q3dq5v8F0HBxHbLRH4/vlaOrwzprQP4edCXDSKBmEVSAQ0JLtB/Ti0l36aGOZfOG6nR4h0yEpVpcOydd3RnbVgPw003Fgc5QMwqKm0avXVpXopWXF2sGoxTXG9s3Wbef21Fm9skxHgU1RMrBUZUOrZi3coRc+38W1Fhcb0jldt53bSxMHdlIUK9PwNZQMLFFR16I/f7Zdc5YVO2pvMLRPz6xkfX9sT11xRmfFxXDdBpQMQqystllPL9iul1cUs/w4guWmxevms3vo2lFd2bwzwlEyCIk91U16esE2vbpyN1u84JDUhBhdf2Y33TKmhzqyjU1EomTQLsUHGvXUgm16ffVuedv4VsKxpcbH6L8m9NbNZ/dgGi3CUDIISk2TV49/uEVzlhWzDBknrWtmkmZO6q9Jg/NMR0GYUDI4JYFAQK+sKNFvP9iiAw2tpuPAoUb1yNQDlwzQoIJ001FgMUoGJ23t7mo98PYGrSmpNh0FLhDlkaae0Vn3XthPOakJpuPAIpQMTqiyoVWPvr9Zr64sCdt594gcyXHRun1cL91yTk8lxEabjoMQo2Twjdr8Ac1ZtkuPffilapq8puPA5QoyEjXzov665PR801EQQpQMjmnVrko98N
YGbeRwMITZ+QNy9fDlg5hCcwlKBodpbPXpF//YqJdXlIjvDJiSnhirBycP0BVndDYdBe1EyeCQNSXVuvOVQrbdh21M6J+jX10+WJ3SGdU4FSUD+f0BPbVgm37/8VbueYHtpCXE6OHLB2vyEK7VOBElE+H2VDdp+suFWl5UaToKcFxThubr51MGKY290ByFkolgbxfu0f97a73qmtmCH85QkJGox64eojN7djQdBSeJkolAdc1ePfDWer1VuNd0FOCURXmk287tpXsu6MfZNQ5AyUSYlUWVuvOVQu2uajIdBWiXsX2z9eS0YUpPYvrMziiZCPLMp9v12w+2qI2L+3CJ7h2T9OcbR6hvbqrpKPgGlEwEaPa2acbra/U202NwoeS4aD3+naGaOLCT6Sg4BkrG5UprmvX9F1Zq7e4a01EAy3g80g8m9NH08/rI4+E6jZ1QMi62aleVbntxlSrqWkxHAcLi/AG5+p/vDFVKfIzpKPgKJeNSbxfu0b1z13IUMiJOn5wU/fnGEeqRlWw6CkTJuNL//murHvvoS/YeQ8RKS4jRE9cM0/h+OaajRDxKxkV8bX7d/+Y6vbpyt+kogHHRUR79+orBumpEF9NRIhol4xJ1zV7d/uJqLdq233QUwDY8HumhyQN101ndTUeJWJSMC9Q0enX9X5dp3R5WkAHHct+F/XTHuN6mY0QkSsbhqhtbdd1flmnDXg4XA47nv8b30r0T+5uOEXEoGQerajhYMJxeCZyc757dXT+9ZAD30oQRJeNQlQ2tunbWUm0urTMdBXCUq0d01q+vOJ3NNcOEknGg/fUtum7WMm0po2CAYEwekq//uXqIYqKjTEdxPUrGYSrqWnTtrKXaWl5vOgrgaOedlqs/XjdM8THRpqO4GiXjIOV1zbp21jJto2CAkBjXL1t/uXEEIxoL8SfrEOW1zZr256UUDBBCC7ZUaMbr60zHcDVKxgGqGlo1bdZS7ahoMB0FcJ3XV+/Wo+9vNh3DtSgZm2vxten7L6ykYAALPbVgu57/vMh0DFeiZGwsEAjo3tfWakVRlekogOs9NG+D3lu3z3QM16FkbOzxj77UvDWcZgmEgz8g3flKoZbvrDQdxVUoGZuau2q3nvzXNtMxgIjS4vPrlr+t0JfcgxYylIwNLdm+XzPfWGs6BhCRapt9umn2cu2raTIdxRUoGZvZVl6v219cLW8bty8BpuyradZ/zF6hmiav6SiOR8nYyIH6Ft38HN/YgB1sKavTHXNWqc3PG772oGRsotnbplufX6niykbTUQB8ZfG2A3rswy2mYzgaJWMTP359rVYXV5uOAeAIT3+6XR9uKDUdw7EoGRt4dUWJ3ipkqTJgR4GAdPdra1S0nxuig0HJHOGzzz7T5MmTlZ+fL4/Ho7feesvSr7etvF4Pzttg6dcA0D51zT7d9uIqNbW2mY7iOJTMERoaGjRkyBD98Y9/tPxrtfja9MO/f6EmL9+4gN1tLq3TT99ebzqG48SYDmA3kyZN0qRJk8LytR7552aOTgYc5LVVuzWmT5YuG1pgOopjMJIx5JNNZXpuSZHpGABO0U/eXM/1mVNAyRhQVtuse+dyRz/gRPUtPv3331er1ec3HcURKJkw8/sDmv5KoSobWk1HARCk9Xtq9ch7m0zHcARKJsye/nS7lmw/YDoGgHZ6bkmRVhSxY/OJUDJhtLq4Sv/z0ZemYwAIgUBAmvnGOqbNToCSOUJ9fb0KCwtVWFgoSdq5c6cKCwtVXFzcrudt9rbprlcK5WMfJMA1tpXX64/zOZLjeDyBQIBXva9ZsGCBxo8ff9TjN910k5577rmgn/e3H2zWH+dvb0cyAHYUFx2ld384Rn1yU01HsSVKJgy2lNbpkicXsn0/4FIjunXQa7eNlsfjMR3Fdpgus1ggENDMN9ZSMICLrdxVpReXtW9K3a0oGYu9uKyY3ZWBCPDoe5tVWtNsOobtUDIW2l/fokff32w6BoAwqGvx6QH2NjsKJW
OhX7+3WXXNPtMxAITJRxvL9N66faZj2AolY5HVxVV6ffVu0zEAhNmD8zaotpkj1P+NkrGA3x/QQ/M2iHV7QOQpr2v
2024-06-01 15:52:53 +02:00
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Share of each review_score class in the validation split. The title distinguishes this chart from\n",
"# the same pie drawn for the other splits; the trailing ';' keeps the Axes repr out of the cell output.\n",
"valid[\"review_score\"].value_counts().plot(kind=\"pie\", title=\"valid: review_score\");"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 145,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 43230 entries, 0 to 43229\n",
2024-06-01 15:52:53 +02:00
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 review_text 43230 non-null object\n",
" 1 review_score 43230 non-null int64 \n",
2024-06-01 15:52:53 +02:00
"dtypes: int64(1), object(1)\n",
"memory usage: 675.6+ KB\n"
2024-06-01 15:52:53 +02:00
]
}
],
"source": [
"train.info()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 146,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 15716 entries, 1265039 to 5454569\n",
2024-06-01 15:52:53 +02:00
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 review_text 15716 non-null object\n",
" 1 review_score 15716 non-null int64 \n",
2024-06-01 15:52:53 +02:00
"dtypes: int64(1), object(1)\n",
"memory usage: 368.3+ KB\n"
2024-06-01 15:52:53 +02:00
]
}
],
"source": [
"test.info()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 147,
2024-06-01 15:52:53 +02:00
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 15717 entries, 5012153 to 4962820\n",
2024-06-01 15:52:53 +02:00
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 review_text 15717 non-null object\n",
" 1 review_score 15717 non-null int64 \n",
2024-06-01 15:52:53 +02:00
"dtypes: int64(1), object(1)\n",
"memory usage: 368.4+ KB\n"
2024-06-01 15:52:53 +02:00
]
}
],
"source": [
"valid.info()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Przykłady z każdego podzbioru"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 148,
"outputs": [
{
"data": {
"text/plain": " review_text review_score\n0 I'm the biggest fan you will ever meet of tile... -1\n1 Really an improvement on the old game (Which w... 1\n2 celebrating the four year birthday of payday w... -1\n3 Only fun when playing with friends. Can't join... -1\n4 While smashing planets together can be fun, th... -1",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>review_text</th>\n <th>review_score</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>I'm the biggest fan you will ever meet of tile...</td>\n <td>-1</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Really an improvement on the old game (Which w...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>celebrating the four year birthday of payday w...</td>\n <td>-1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Only fun when playing with friends. Can't join...</td>\n <td>-1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>While smashing planets together can be fun, th...</td>\n <td>-1</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 148,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 149,
"outputs": [
{
"data": {
"text/plain": " review_text review_score\n1265039 I love the Fact you can do what EVER you want ... 1\n3132003 Tony Hawk's without the Pro Skater. Finding ou... 1\n880195 It's pretty good. 1\n717128 This the best dungeon game I have played since... 1\n5221356 Totally awesome game alone or with a friend. I... 1",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>review_text</th>\n <th>review_score</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1265039</th>\n <td>I love the Fact you can do what EVER you want ...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3132003</th>\n <td>Tony Hawk's without the Pro Skater. Finding ou...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>880195</th>\n <td>It's pretty good.</td>\n <td>1</td>\n </tr>\n <tr>\n <th>717128</th>\n <td>This the best dungeon game I have played since...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>5221356</th>\n <td>Totally awesome game alone or with a friend. I...</td>\n <td>1</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 149,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 150,
"outputs": [
{
"data": {
"text/plain": " review_text review_score\n5012153 ..it's like nights into dreams and treasures o... 1\n5818758 As someone who mostly just likes making cool s... 1\n4582102 What can I say about this game the story is sh... 1\n5242842 A very unique and enjoyable puzzle solving str... 1\n5400923 A very adorable, charming game. 1",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>review_text</th>\n <th>review_score</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>5012153</th>\n <td>..it's like nights into dreams and treasures o...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>5818758</th>\n <td>As someone who mostly just likes making cool s...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4582102</th>\n <td>What can I say about this game the story is sh...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>5242842</th>\n <td>A very unique and enjoyable puzzle solving str...</td>\n <td>1</td>\n </tr>\n <tr>\n <th>5400923</th>\n <td>A very adorable, charming game.</td>\n <td>1</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 150,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"valid.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Zapis do csv"
],
"metadata": {
"collapsed": false
}
},
2024-06-01 15:52:53 +02:00
{
"cell_type": "code",
"execution_count": 151,
2024-06-01 15:52:53 +02:00
"outputs": [],
"source": [
"# Write every split to its own CSV file (relative paths, resolved against the working directory).\n",
"# to_csv is called with its defaults, so each frame's index is written as the first column.\n",
"splits_to_save = (\n",
"    (train, \"train.csv\"),\n",
"    (test, \"test.csv\"),\n",
"    (valid, \"valid.csv\"),\n",
")\n",
"for split_frame, csv_path in splits_to_save:\n",
"    split_frame.to_csv(csv_path)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}