232 lines
5.8 KiB
Plaintext
232 lines
5.8 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b726950a",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 1. Pobieramy wybrany zbiór"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "13106acf",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Install the Kaggle CLI and pandas into the environment of the running kernel.\n",
|
|
"%pip install --user kaggle\n",
|
|
"%pip install --user pandas\n",
|
|
"# Download the mterzolo/lego-sets dataset archive with the Kaggle CLI.\n",
|
|
"!kaggle datasets download -d mterzolo/lego-sets\n",
|
|
"# Unpack the archive; -o overwrites files left over from an earlier run.\n",
|
|
"!unzip -o lego-sets.zip"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "661a8c28",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 2. Dokonujemy inspekcji danych"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "8dc2c5fa",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Make sure pandas is installed in the environment of the running kernel.\n",
|
|
"%pip install pandas"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "90670da6",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!wc -l lego_sets.csv\n",
|
|
"!head -n 5 lego_sets.csv # duzo tekstu w niektorych kolumnach..."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e92afb9c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"lego = pd.read_csv('lego_sets.csv')\n",
|
|
"lego # wglad w strukture elementow i klasy, wielkosc itd."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "824ffb81",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"lego.describe(include='all') # srednia, odchylenie standardowe itd."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "290de05b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"lego[\"theme_name\"].value_counts() # rozklad czestosci dla przykladowej klasy (tematyka zestawu)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "151119d7",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 3. Preprocessing"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "7327e72b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!grep -P \"^$\" -n lego_sets.csv # puste linie - nie ma\n",
|
|
"!grep -P \",,\" -n lego_sets.csv # puste pola"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "9e0a4327",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# discard every example (row) that contains at least one empty field\n",
|
|
"lego_all = pd.read_csv('lego_sets.csv')\n",
|
|
"lego_all = lego_all.dropna()\n",
|
|
"# store the cleaned rows without the index column, then load them back from disk\n",
|
|
"lego_all.to_csv('lego_sets_clean.csv', header=True, index = None)\n",
|
|
"lego_clean = pd.read_csv('lego_sets_clean.csv')\n",
|
|
"lego_clean"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "89840c87",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 4. Normalizacja"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c1f33e04",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"%pip install --user numpy\n",
|
|
"import numpy as np\n",
|
|
"\n",
|
|
"def min_max_scale(series):\n",
|
|
"    \"\"\"Rescale a numeric Series linearly so its minimum becomes 0 and its maximum becomes 1.\"\"\"\n",
|
|
"    lowest = series.min()\n",
|
|
"    return (series - lowest) / (series.max() - lowest)\n",
|
|
"\n",
|
|
"# list_price needs at most two decimal places\n",
|
|
"lego_clean['list_price'] = lego_clean['list_price'].round(2)\n",
|
|
"\n",
|
|
"# num_reviews, piece_count and prod_id can be stored as integers: cast them to int64 with one\n",
|
|
"# vectorised astype call instead of converting value by value through apply\n",
|
|
"lego_clean = lego_clean.astype({'num_reviews': np.int64, 'piece_count': np.int64, 'prod_id': np.int64})\n",
|
|
"\n",
|
|
"# purely as an example, min-max scale the remaining float columns\n",
|
|
"# (not strictly necessary, they are all on the same scale already)\n",
|
|
"lego_clean['play_star_rating'] = min_max_scale(lego_clean['play_star_rating'])\n",
|
|
"lego_clean['star_rating'] = min_max_scale(lego_clean['star_rating'])\n",
|
|
"lego_clean['val_star_rating'] = min_max_scale(lego_clean['val_star_rating'])\n",
|
|
"\n",
|
|
"# save the normalised frame without the index column and load it back from disk\n",
|
|
"lego_clean.to_csv('lego_sets_clean_normalised.csv', index=False, header=True)\n",
|
|
"lego_clean_normalised = pd.read_csv('lego_sets_clean_normalised.csv')\n",
|
|
"lego_clean_normalised"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "739ea946",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 5. Podział na podzbiory"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1ed5b5bb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# the installable distribution is named scikit-learn; the 'sklearn' alias on PyPI is deprecated\n",
|
|
"%pip install --user scikit-learn\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"\n",
|
|
"# first split: carve out the training set (80% of the rows)\n",
|
|
"lego_train, lego_rem = train_test_split(lego_clean_normalised, train_size=0.8, random_state=1)\n",
|
|
"\n",
|
|
"# second split: halve the remainder into the validation and test sets\n",
|
|
"lego_valid, lego_test = train_test_split(lego_rem, test_size=0.5, random_state=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "9d0bdaf9",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"lego_train"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "dc151dc5",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"lego_valid"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4d6ba0fb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"lego_test"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.7"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|