diff --git a/02_zadanie.ipynb b/02_zadanie.ipynb
deleted file mode 100644
index b2d4597..0000000
--- a/02_zadanie.ipynb
+++ /dev/null
@@ -1,231 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "b726950a",
-   "metadata": {},
-   "source": [
-    "**1. Pobieramy wybrany zbiór**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "13106acf",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# %pip installs into the running kernel's environment (plain !pip may target another Python).\n",
-    "# The PyPI distribution is named scikit-learn; the old sklearn alias is a deprecated placeholder.\n",
-    "%pip install -q kaggle pandas scikit-learn\n",
-    "!kaggle datasets download -d mterzolo/lego-sets\n",
-    "!unzip -o lego-sets.zip"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "661a8c28",
-   "metadata": {},
-   "source": [
-    "**2. Dokonujemy inspekcji danych**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8dc2c5fa",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# every import lives in this one cell so a fresh kernel can run the notebook top to bottom\n",
-    "import pandas as pd\n",
-    "from sklearn.model_selection import train_test_split"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "90670da6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!wc -l lego_sets.csv\n",
-    "!head -n 5 lego_sets.csv  # some columns contain a lot of text..."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e92afb9c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "lego = pd.read_csv('lego_sets.csv')\n",
-    "lego  # overview of the structure of the records and classes, size, etc."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "824ffb81",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "lego.describe(include='all')  # mean, standard deviation, etc."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "290de05b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "lego['theme_name'].value_counts()  # frequency distribution of an example class (set theme)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "151119d7",
-   "metadata": {},
-   "source": [
-    "**3. Preprocessing**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7327e72b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Count missing fields per column on the parsed frame instead of grepping the raw text:\n",
-    "# quoted text fields may contain ',,' or line breaks, and read_csv skips blank lines by default.\n",
-    "lego.isna().sum()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9e0a4327",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# drop every row that has at least one empty field and persist the cleaned set\n",
-    "lego_clean = lego.dropna().reset_index(drop=True)\n",
-    "lego_clean.to_csv('lego_sets_clean.csv', index=False)\n",
-    "lego_clean"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "89840c87",
-   "metadata": {},
-   "source": [
-    "**4. Normalizacja**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c1f33e04",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# list_price needs at most two decimal places; num_reviews, piece_count and\n",
-    "# prod_id are whole numbers. assign/astype return a new frame, so lego_clean\n",
-    "# is left untouched and this cell can be re-run safely.\n",
-    "lego_clean_normalised = lego_clean.assign(\n",
-    "    list_price=lego_clean['list_price'].round(2)\n",
-    ").astype({'num_reviews': 'int64', 'piece_count': 'int64', 'prod_id': 'int64'})\n",
-    "\n",
-    "# purely as an example, min-max scale the remaining floats to [0, 1]\n",
-    "# (not strictly needed, they already share the same scale)\n",
-    "for column in ['play_star_rating', 'star_rating', 'val_star_rating']:\n",
-    "    low = lego_clean_normalised[column].min()\n",
-    "    high = lego_clean_normalised[column].max()\n",
-    "    if high > low:\n",
-    "        lego_clean_normalised[column] = (lego_clean_normalised[column] - low) / (high - low)\n",
-    "    else:\n",
-    "        # constant column: zero range, avoid dividing by zero\n",
-    "        lego_clean_normalised[column] = 0.0\n",
-    "\n",
-    "lego_clean_normalised.to_csv('lego_sets_clean_normalised.csv', index=False)\n",
-    "lego_clean_normalised"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "739ea946",
-   "metadata": {},
-   "source": [
-    "**5. Podział na podzbiory**"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1ed5b5bb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# first split: carve out the training set (80% of the rows)\n",
-    "lego_train, lego_rem = train_test_split(lego_clean_normalised, train_size=0.8, random_state=1)\n",
-    "\n",
-    "# second split: halve the remainder into validation and test sets\n",
-    "lego_valid, lego_test = train_test_split(lego_rem, test_size=0.5, random_state=1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9d0bdaf9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "lego_train"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "dc151dc5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "lego_valid"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4d6ba0fb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "lego_test"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}