ium_426206/zadanie1.ipynb
2021-05-07 20:16:31 +02:00

68 KiB
Raw Blame History

#!pip install kaggle
#!pip install pandas
#!pip install matplotlib
#!pip install scikit-learn
# Download the Global Superstore dataset archive with the Kaggle command-line tool
!kaggle datasets download -d apoorvaappz/global-super-store-dataset
# Unpack the downloaded archive (it contains a CSV and an XLSX copy of the data)
!unzip global-super-store-dataset.zip
Archive:  global-super-store-dataset.zip
  inflating: Global_Superstore2.csv  
  inflating: Global_Superstore2.xlsx  
import pandas as pd
# Load the dataset. The file is decoded as latin1: decoding it as latin2 turned
# "São Paulo" into "Săo Paulo" (byte 0xE3 is "ã" in latin1 but "ă" in latin2).
gssd = pd.read_csv('Global_Superstore2.csv', encoding="latin1")
gssd
Row ID Order ID Order Date Ship Date Ship Mode Customer ID Customer Name Segment City State ... Product ID Category Sub-Category Product Name Sales Quantity Discount Profit Shipping Cost Order Priority
0 32298 CA-2012-124891 31-07-2012 31-07-2012 Same Day RH-19495 Rick Hansen Consumer New York City New York ... TEC-AC-10003033 Technology Accessories Plantronics CS510 - Over-the-Head monaural Wir... 2309.650 7 0.0 762.1845 933.57 Critical
1 26341 IN-2013-77878 05-02-2013 07-02-2013 Second Class JR-16210 Justin Ritter Corporate Wollongong New South Wales ... FUR-CH-10003950 Furniture Chairs Novimex Executive Leather Armchair, Black 3709.395 9 0.1 -288.7650 923.63 Critical
2 25330 IN-2013-71249 17-10-2013 18-10-2013 First Class CR-12730 Craig Reiter Consumer Brisbane Queensland ... TEC-PH-10004664 Technology Phones Nokia Smart Phone, with Caller ID 5175.171 9 0.1 919.9710 915.49 Medium
3 13524 ES-2013-1579342 28-01-2013 30-01-2013 First Class KM-16375 Katherine Murray Home Office Berlin Berlin ... TEC-PH-10004583 Technology Phones Motorola Smart Phone, Cordless 2892.510 5 0.1 -96.5400 910.16 Medium
4 47221 SG-2013-4320 05-11-2013 06-11-2013 Same Day RH-9495 Rick Hansen Consumer Dakar Dakar ... TEC-SHA-10000501 Technology Copiers Sharp Wireless Fax, High-Speed 2832.960 8 0.0 311.5200 903.04 Critical
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
51285 29002 IN-2014-62366 19-06-2014 19-06-2014 Same Day KE-16420 Katrina Edelman Corporate Kure Hiroshima ... OFF-FA-10000746 Office Supplies Fasteners Advantus Thumb Tacks, 12 Pack 65.100 5 0.0 4.5000 0.01 Medium
51286 35398 US-2014-102288 20-06-2014 24-06-2014 Standard Class ZC-21910 Zuschuss Carroll Consumer Houston Texas ... OFF-AP-10002906 Office Supplies Appliances Hoover Replacement Belt for Commercial Guardsm... 0.444 1 0.8 -1.1100 0.01 Medium
51287 40470 US-2013-155768 02-12-2013 02-12-2013 Same Day LB-16795 Laurel Beltran Home Office Oxnard California ... OFF-EN-10001219 Office Supplies Envelopes #10- 4 1/8" x 9 1/2" Security-Tint Envelopes 22.920 3 0.0 11.2308 0.01 High
51288 9596 MX-2012-140767 18-02-2012 22-02-2012 Standard Class RB-19795 Ross Baird Home Office Valinhos Săo Paulo ... OFF-BI-10000806 Office Supplies Binders Acco Index Tab, Economy 13.440 2 0.0 2.4000 0.00 Medium
51289 6147 MX-2012-134460 22-05-2012 26-05-2012 Second Class MC-18100 Mick Crebagga Consumer Tipitapa Managua ... OFF-PA-10004155 Office Supplies Paper Eaton Computer Printout Paper, 8.5 x 11 61.380 3 0.0 1.8000 0.00 High

51290 rows × 24 columns

import numpy as np
# Shuffle the rows reproducibly, then cut them 60% / 20% / 20% into train / dev / test.
# Positional slicing is used instead of np.split, which on a DataFrame relies on the
# deprecated DataFrame.swapaxes.
# NOTE(review): this split is taken before the NaN-column removal and the normalisation
# further down, so the three subsets keep the raw values - confirm that this is intended.
gssd_shuffled = gssd.sample(frac=1, random_state=42)
train_end = int(.6 * len(gssd))
dev_end = int(.8 * len(gssd))
gssd_train = gssd_shuffled.iloc[:train_end]
gssd_dev = gssd_shuffled.iloc[train_end:dev_end]
gssd_test = gssd_shuffled.iloc[dev_end:]
len(gssd)  # Number of rows in the whole dataset
51290
len(gssd_train)  # Number of rows in the training set
30774
len(gssd_dev)  # Number of rows in the validation set
10258
len(gssd_test)  # Number of rows in the test set
10258
# Summary statistics for every column; include='all' also reports the non-numeric
# columns (unique / top / freq) alongside the numeric ones (mean, std, quartiles).
gssd.describe(include='all')
Row ID Order ID Order Date Ship Date Ship Mode Customer ID Customer Name Segment City State ... Product ID Category Sub-Category Product Name Sales Quantity Discount Profit Shipping Cost Order Priority
count 51290.00000 51290 51290 51290 51290 51290 51290 51290 51290 51290 ... 51290 51290 51290 51290 51290.000000 51290.000000 51290.000000 51290.000000 51290.000000 51290
unique NaN 25035 1430 1464 4 1590 795 3 3636 1094 ... 10292 3 17 3788 NaN NaN NaN NaN NaN 4
top NaN CA-2014-100111 18-06-2014 22-11-2014 Standard Class PO-18850 Muhammed Yedwab Consumer New York City California ... OFF-AR-10003651 Office Supplies Binders Staples NaN NaN NaN NaN NaN Medium
freq NaN 14 135 130 30775 97 108 26518 915 2001 ... 35 31273 6152 227 NaN NaN NaN NaN NaN 29433
mean 25645.50000 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 246.490581 3.476545 0.142908 28.610982 26.375915 NaN
std 14806.29199 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 487.565361 2.278766 0.212280 174.340972 57.296804 NaN
min 1.00000 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 0.444000 1.000000 0.000000 -6599.978000 0.000000 NaN
25% 12823.25000 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 30.758625 2.000000 0.000000 0.000000 2.610000 NaN
50% 25645.50000 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 85.053000 3.000000 0.000000 9.240000 7.790000 NaN
75% 38467.75000 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 251.053200 5.000000 0.200000 36.810000 24.450000 NaN
max 51290.00000 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN 22638.480000 14.000000 0.850000 8399.976000 933.570000 NaN

11 rows × 24 columns

# Number of rows per product sub-category, most frequent first
gssd["Sub-Category"].value_counts()
Binders        6152
Storage        5059
Art            4883
Paper          3538
Chairs         3434
Phones         3357
Furnishings    3170
Accessories    3075
Labels         2606
Envelopes      2435
Supplies       2425
Fasteners      2420
Bookcases      2411
Copiers        2223
Appliances     1755
Machines       1486
Tables          861
Name: Sub-Category, dtype: int64
import matplotlib
# Bar chart of how many rows fall into each product sub-category
sub_category_counts = gssd["Sub-Category"].value_counts()
sub_category_counts.plot.bar()
<AxesSubplot:>
# Print the name of every column that contains at least one NaN value
nan_columns = gssd.columns[gssd.isnull().any()]
for column_name in nan_columns:
    print(column_name)
Postal Code
# Drop the Postal Code column: it carries little information for this data and
# contains many NaN values (it is the only column with NaNs, so dropping every
# NaN-bearing column removes exactly that one).
gssd = gssd.dropna(axis=1)
# Min-max normalisation of the float columns to the [0, 1] range
from sklearn import preprocessing

flcols = gssd.select_dtypes(include=['float64']).columns
x = gssd[flcols].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Carry gssd's own index on the scaled frame: column assignment aligns on index
# labels, so a default 0..n-1 index would only match while gssd is unfiltered
# and unshuffled.
normcols = pd.DataFrame(x_scaled, columns=flcols, index=gssd.index)
gssd[flcols] = normcols
gssd
Row ID Order ID Order Date Ship Date Ship Mode Customer ID Customer Name Segment City State ... Product ID Category Sub-Category Product Name Sales Quantity Discount Profit Shipping Cost Order Priority
0 32298 CA-2012-124891 31-07-2012 31-07-2012 Same Day RH-19495 Rick Hansen Consumer New York City New York ... TEC-AC-10003033 Technology Accessories Plantronics CS510 - Over-the-Head monaural Wir... 0.102006 7 0.000000 0.490812 1.000000 Critical
1 26341 IN-2013-77878 05-02-2013 07-02-2013 Second Class JR-16210 Justin Ritter Corporate Wollongong New South Wales ... FUR-CH-10003950 Furniture Chairs Novimex Executive Leather Armchair, Black 0.163837 9 0.117647 0.420749 0.989353 Critical
2 25330 IN-2013-71249 17-10-2013 18-10-2013 First Class CR-12730 Craig Reiter Consumer Brisbane Queensland ... TEC-PH-10004664 Technology Phones Nokia Smart Phone, with Caller ID 0.228586 9 0.117647 0.501331 0.980633 Medium
3 13524 ES-2013-1579342 28-01-2013 30-01-2013 First Class KM-16375 Katherine Murray Home Office Berlin Berlin ... TEC-PH-10004583 Technology Phones Motorola Smart Phone, Cordless 0.127753 5 0.117647 0.433564 0.974924 Medium
4 47221 SG-2013-4320 05-11-2013 06-11-2013 Same Day RH-9495 Rick Hansen Consumer Dakar Dakar ... TEC-SHA-10000501 Technology Copiers Sharp Wireless Fax, High-Speed 0.125122 8 0.000000 0.460768 0.967298 Critical
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
51285 29002 IN-2014-62366 19-06-2014 19-06-2014 Same Day KE-16420 Katrina Edelman Corporate Kure Hiroshima ... OFF-FA-10000746 Office Supplies Fasteners Advantus Thumb Tacks, 12 Pack 0.002856 5 0.000000 0.440300 0.000011 Medium
51286 35398 US-2014-102288 20-06-2014 24-06-2014 Standard Class ZC-21910 Zuschuss Carroll Consumer Houston Texas ... OFF-AP-10002906 Office Supplies Appliances Hoover Replacement Belt for Commercial Guardsm... 0.000000 1 0.941176 0.439926 0.000011 Medium
51287 40470 US-2013-155768 02-12-2013 02-12-2013 Same Day LB-16795 Laurel Beltran Home Office Oxnard California ... OFF-EN-10001219 Office Supplies Envelopes #10- 4 1/8" x 9 1/2" Security-Tint Envelopes 0.000993 3 0.000000 0.440749 0.000011 High
51288 9596 MX-2012-140767 18-02-2012 22-02-2012 Standard Class RB-19795 Ross Baird Home Office Valinhos Săo Paulo ... OFF-BI-10000806 Office Supplies Binders Acco Index Tab, Economy 0.000574 2 0.000000 0.440160 0.000000 Medium
51289 6147 MX-2012-134460 22-05-2012 26-05-2012 Second Class MC-18100 Mick Crebagga Consumer Tipitapa Managua ... OFF-PA-10004155 Office Supplies Paper Eaton Computer Printout Paper, 8.5 x 11 0.002692 3 0.000000 0.440120 0.000000 High

51290 rows × 23 columns