From 865d4a13af4cf0dd5134de831e1e6fc338c749eb Mon Sep 17 00:00:00 2001 From: Kacper Dudzic Date: Sun, 20 Mar 2022 17:18:31 +0100 Subject: [PATCH] Upload files to '' --- 02_zadanie.ipynb | 231 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 02_zadanie.ipynb diff --git a/02_zadanie.ipynb b/02_zadanie.ipynb new file mode 100644 index 0000000..3415a76 --- /dev/null +++ b/02_zadanie.ipynb @@ -0,0 +1,231 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b726950a", + "metadata": {}, + "source": [ + "**1. Pobieramy wybrany zbiór**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13106acf", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --user kaggle \n", + "!pip install --user pandas\n", + "!kaggle datasets download -d mterzolo/lego-sets\n", + "!unzip -o lego-sets.zip" + ] + }, + { + "cell_type": "markdown", + "id": "661a8c28", + "metadata": {}, + "source": [ + "**2. Dokonujemy inspekcji danych**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8dc2c5fa", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "90670da6", + "metadata": {}, + "outputs": [], + "source": [ + "!wc -l lego_sets.csv\n", + "!head -n 5 lego_sets.csv # duzo tekstu w niektorych kolumnach..." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e92afb9c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "lego = pd.read_csv('lego_sets.csv')\n", + "lego # wglad w strukture elementow i klasy, wielkosc itd." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "824ffb81", + "metadata": {}, + "outputs": [], + "source": [ + "lego.describe(include='all') # srednia, odchylenie standardowe itd." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "290de05b", + "metadata": {}, + "outputs": [], + "source": [ + "lego[\"theme_name\"].value_counts() # rozklad czestosci dla przykladowej klasy (tematyka zestawu)" + ] + }, + { + "cell_type": "markdown", + "id": "151119d7", + "metadata": {}, + "source": [ + "**3. Preprocessing**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7327e72b", + "metadata": {}, + "outputs": [], + "source": [ + "!grep -P \"^$\" -n lego_sets.csv # puste linie - nie ma\n", + "!grep -P \",,\" -n lego_sets.csv # puste pola" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e0a4327", + "metadata": {}, + "outputs": [], + "source": [ + "# usuwamy puste pola\n", + "lego_all = pd.read_csv('lego_sets.csv').dropna()\n", + "lego_all.to_csv('lego_sets_clean.csv', index = None, header=True)\n", + "lego_clean = pd.read_csv('lego_sets_clean.csv')\n", + "lego_clean" + ] + }, + { + "cell_type": "markdown", + "id": "89840c87", + "metadata": {}, + "source": [ + "**4. 
Normalizacja**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1f33e04", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --user numpy\n", + "import numpy as np\n", + "\n", + "# list_price moze byc do dwoch miejsc po przecinku\n", + "lego_clean['list_price'] = lego_clean['list_price'].round(2)\n", + "\n", + "# num_reviews, piece_count i prod_id moga byc wartosciami calkowitymi\n", + "lego_clean['num_reviews'] = lego_clean['num_reviews'].apply(np.int64)\n", + "lego_clean['piece_count'] = lego_clean['piece_count'].apply(np.int64)\n", + "lego_clean['prod_id'] = lego_clean['prod_id'].apply(np.int64)\n", + "\n", + "# czysto dla przykladu normalizujemy pozostale floaty (chociaz nie trzeba, wszystkie juz sa w tej samej skali)\n", + "lego_clean['play_star_rating'] = (lego_clean['play_star_rating'] - lego_clean['play_star_rating'].min() ) / (lego_clean['play_star_rating'].max() - lego_clean['play_star_rating'].min())\n", + "lego_clean['star_rating'] = (lego_clean['star_rating'] - lego_clean['star_rating'].min() ) / (lego_clean['star_rating'].max() - lego_clean['star_rating'].min())\n", + "lego_clean['val_star_rating'] = (lego_clean['val_star_rating'] - lego_clean['val_star_rating'].min() ) / (lego_clean['val_star_rating'].max() - lego_clean['val_star_rating'].min())\n", + "\n", + "lego_clean.to_csv('lego_sets_clean_normalised.csv', index = None, header=True)\n", + "lego_clean_normalised = pd.read_csv('lego_sets_clean_normalised.csv')\n", + "lego_clean_normalised" + ] + }, + { + "cell_type": "markdown", + "id": "739ea946", + "metadata": {}, + "source": [ + "**5. 
Podział na podzbiory**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ed5b5bb", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --user scikit-learn # PyPI distribution is scikit-learn; the sklearn alias is deprecated and no longer installs\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# pierwszy podzial, wydzielamy zbior treningowy\n", + "lego_train, lego_rem = train_test_split(lego_clean_normalised, train_size=0.8, random_state=1)\n", + "\n", + "# drugi podział, wydzielamy walidacyjny i testowy\n", + "lego_valid, lego_test = train_test_split(lego_rem, test_size=0.5, random_state=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d0bdaf9", + "metadata": {}, + "outputs": [], + "source": [ + "lego_train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc151dc5", + "metadata": {}, + "outputs": [], + "source": [ + "lego_valid" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d6ba0fb", + "metadata": {}, + "outputs": [], + "source": [ + "lego_test" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}