{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b726950a",
   "metadata": {},
   "source": [
    "**1. Pobieramy wybrany zbiór**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13106acf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install every dependency once, into the environment of the running kernel (%pip),\n",
    "# then download and unpack the dataset.\n",
    "# Note: the PyPI distribution that provides the sklearn module is called scikit-learn.\n",
    "%pip install kaggle pandas numpy scikit-learn\n",
    "!kaggle datasets download -d mterzolo/lego-sets\n",
    "!unzip -o lego-sets.zip"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "661a8c28",
   "metadata": {},
   "source": [
    "**2. Dokonujemy inspekcji danych**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8dc2c5fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports used by the notebook; the packages are installed by the first code cell.\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90670da6",
   "metadata": {},
   "outputs": [],
   "source": [
    "!wc -l lego_sets.csv\n",
    "!head -n 5 lego_sets.csv # some columns hold a lot of text..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e92afb9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "lego = pd.read_csv('lego_sets.csv')\n",
    "lego # a look at the structure of the examples and classes, the size, etc."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "824ffb81",
   "metadata": {},
   "outputs": [],
   "source": [
    "lego.describe(include='all') # mean, standard deviation, etc."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "290de05b",
   "metadata": {},
   "outputs": [],
   "source": [
    "lego[\"theme_name\"].value_counts() # frequency distribution for an example class (set theme)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "151119d7",
   "metadata": {},
   "source": [
    "**3. Preprocessing**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7327e72b",
   "metadata": {},
   "outputs": [],
   "source": [
    "!grep -P \"^$\" -n lego_sets.csv # empty lines - there are none\n",
    "!grep -P \",,\" -n lego_sets.csv # empty fields"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e0a4327",
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop the examples that contain empty fields and persist the cleaned set\n",
    "lego_all = pd.read_csv('lego_sets.csv').dropna()\n",
    "lego_all.to_csv('lego_sets_clean.csv', index=False, header=True)\n",
    "lego_clean = pd.read_csv('lego_sets_clean.csv')\n",
    "lego_clean"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "89840c87",
   "metadata": {},
   "source": [
    "**4. Normalizacja**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1f33e04",
   "metadata": {},
   "outputs": [],
   "source": [
    "INT_COLUMNS = ['num_reviews', 'piece_count', 'prod_id']\n",
    "RATING_COLUMNS = ['play_star_rating', 'star_rating', 'val_star_rating']\n",
    "\n",
    "# num_reviews, piece_count and prod_id can be whole numbers;\n",
    "# a single astype call converts all of them without a per-element apply\n",
    "lego_clean = lego_clean.astype({column: np.int64 for column in INT_COLUMNS})\n",
    "\n",
    "# list_price needs at most two decimal places\n",
    "lego_clean['list_price'] = lego_clean['list_price'].round(2)\n",
    "\n",
    "# purely as an example, min-max normalise the remaining floats\n",
    "# (not strictly needed, they are all on the same scale already)\n",
    "for column in RATING_COLUMNS:\n",
    "    low, high = lego_clean[column].min(), lego_clean[column].max()\n",
    "    lego_clean[column] = (lego_clean[column] - low) / (high - low)\n",
    "\n",
    "lego_clean.to_csv('lego_sets_clean_normalised.csv', index=False, header=True)\n",
    "lego_clean_normalised = pd.read_csv('lego_sets_clean_normalised.csv')\n",
    "lego_clean_normalised"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "739ea946",
   "metadata": {},
   "source": [
    "**5. Podział na podzbiory**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ed5b5bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# first split: carve out the training set (80 %)\n",
    "lego_train, lego_rem = train_test_split(lego_clean_normalised, train_size=0.8, random_state=1)\n",
    "\n",
    "# second split: halve the remainder into the validation and test sets\n",
    "lego_valid, lego_test = train_test_split(lego_rem, test_size=0.5, random_state=1)\n",
    "\n",
    "# sanity check: every example must land in exactly one subset\n",
    "assert len(lego_train) + len(lego_valid) + len(lego_test) == len(lego_clean_normalised)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d0bdaf9",
   "metadata": {},
   "outputs": [],
   "source": [
    "lego_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc151dc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "lego_valid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d6ba0fb",
   "metadata": {},
   "outputs": [],
   "source": [
    "lego_test"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}