ium_s451499/lab_1/zadanie_lab_1.ipynb
2024-03-20 03:57:25 +01:00

288 KiB
Raw Blame History

Zadanie LAB 1

1. Pobranie danych ze zbioru

# Download the Kaggle dataset archive into ./data using the kaggle command-line tool.
!kaggle datasets download -d bunnyjr/bitcoin-historical-data-jan-2018-jan-2022-per-min -p ./data
bitcoin-historical-data-jan-2018-jan-2022-per-min.zip: Skipping, found more recently modified local copy (use --force to force download)
# Extract the downloaded archive into data/ using the standard library.
# A `!cd data/` line only changes directory inside a throwaway subshell, so a
# following `!unzip` would still run in the notebook's working directory, and
# the `unzip` program is not installed on every platform.
import zipfile

with zipfile.ZipFile('data/bitcoin-historical-data-jan-2018-jan-2022-per-min.zip') as archive:
    # Existing files are overwritten, matching the intent of `unzip -o`.
    archive.extractall('data')
'unzip' is not recognized as an internal or external command,
operable program or batch file.

2. Podział na podzbiory

import pandas as pd

# Load each yearly CSV export into its own DataFrame.
btc18, btc19, btc20, btc21 = map(
    pd.read_csv,
    ('data/BTC2018.csv', 'data/BTC2019.csv', 'data/BTC2020.csv', 'data/BTC2021.csv'),
)

# Preview the first rows of the 2018 data.
btc18.head()
unix date symbol open high low close Volume BTC Volume USD
0 1546300740 2018-12-31 23:59:00 BTC/USD 3689.26 3693.30 3689.26 3693.30 9.838855 36337.841473
1 1546300680 2018-12-31 23:58:00 BTC/USD 3689.26 3691.35 3689.26 3691.35 0.560000 2067.156000
2 1546300620 2018-12-31 23:57:00 BTC/USD 3688.83 3689.26 3688.83 3689.26 0.560833 2069.059602
3 1546300560 2018-12-31 23:56:00 BTC/USD 3687.87 3689.65 3686.92 3686.92 7.610240 28058.346208
4 1546300500 2018-12-31 23:55:00 BTC/USD 3688.28 3688.85 3685.00 3688.85 7.665703 28277.629729
# Preview the first rows of the 2019 data.
btc19.head()
unix date symbol open high low close Volume BTC Volume USD
0 1577836740 2019-12-31 23:59:00 BTC/USD 7167.30 7171.22 7167.30 7168.36 0.191448 1372.366393
1 1577836680 2019-12-31 23:58:00 BTC/USD 7166.89 7167.30 7161.99 7167.30 0.568868 4077.245538
2 1577836620 2019-12-31 23:57:00 BTC/USD 7164.22 7170.80 7161.65 7166.89 3.959440 28376.874525
3 1577836560 2019-12-31 23:56:00 BTC/USD 7182.49 7182.49 7170.20 7170.20 6.248892 44805.802550
4 1577836500 2019-12-31 23:55:00 BTC/USD 7175.69 7176.68 7175.69 7176.68 0.016877 121.122623
# Preview the first rows of the 2020 data.
btc20.head()
unix date symbol open high low close Volume BTC Volume USD
0 1609459140 2020-12-31 23:59:00 BTC/USD 28982.44 29024.59 28969.83 28992.79 5.897679 170990.182941
1 1609459080 2020-12-31 23:58:00 BTC/USD 29044.79 29048.62 29000.12 29000.12 20.362289 590508.837815
2 1609459020 2020-12-31 23:57:00 BTC/USD 29039.53 29048.63 29023.26 29044.79 14.503728 421257.721488
3 1609458960 2020-12-31 23:56:00 BTC/USD 29052.02 29059.42 29039.28 29039.53 2.707583 78626.925850
4 1609458900 2020-12-31 23:55:00 BTC/USD 29036.10 29052.02 29025.78 29052.02 7.631439 221708.705383
# Preview the first rows of the 2021 data; note that its date column uses a
# different text format (e.g. "1/1/2021 0:01") than the other years.
btc21.head()
unix date symbol open high low close Volume BTC Volume USD
0 1609459260 1/1/2021 0:01 BTC/USD 29007.31 29086.90 29007.31 29083.47 14.561951 423512.06040
1 1609459320 1/1/2021 0:02 BTC/USD 29069.80 29073.02 29028.14 29035.89 3.030301 87987.49928
2 1609459380 1/1/2021 0:03 BTC/USD 29037.68 29069.39 29019.00 29048.13 2.189621 63604.38239
3 1609459440 1/1/2021 0:04 BTC/USD 29048.13 29057.73 29035.61 29045.19 1.446538 42014.95943
4 1609459500 1/1/2021 0:05 BTC/USD 29021.86 29023.38 28982.33 28999.50 1.062360 30807.89925

Normalizacja daty i podział daty na osobne pola

# Normalise the btc21 dates: parse the "month/day/year hour:minute" strings and
# re-render them as "YYYY-MM-DD HH:MM:SS" text, the layout used by the other years.
btc21['date'] = (
    pd.to_datetime(btc21['date'], format='%m/%d/%Y %H:%M')
    .dt.strftime('%Y-%m-%d %H:%M:%S')
)
btc21.head()
unix date symbol open high low close Volume BTC Volume USD
0 1609459260 2021-01-01 00:01:00 BTC/USD 29007.31 29086.90 29007.31 29083.47 14.561951 423512.06040
1 1609459320 2021-01-01 00:02:00 BTC/USD 29069.80 29073.02 29028.14 29035.89 3.030301 87987.49928
2 1609459380 2021-01-01 00:03:00 BTC/USD 29037.68 29069.39 29019.00 29048.13 2.189621 63604.38239
3 1609459440 2021-01-01 00:04:00 BTC/USD 29048.13 29057.73 29035.61 29045.19 1.446538 42014.95943
4 1609459500 2021-01-01 00:05:00 BTC/USD 29021.86 29023.38 28982.33 28999.50 1.062360 30807.89925
# Stack the four yearly frames into one, renumbering the rows from 0.
btc_all = pd.concat(objs=[btc18, btc19, btc20, btc21], ignore_index=True)

# Split the "date time" text: the date part stays in 'date', the time part goes to 'hour'.
btc_all[['date', 'hour']] = btc_all['date'].str.split(pat=' ', expand=True)
# Break the date into its year / month / day components (kept as strings here).
btc_all[['year', 'month', 'day']] = btc_all['date'].str.split(pat='-', expand=True)
# Year-month key such as "2018-12".
btc_all['month of year'] = btc_all['year'].str.cat(btc_all['month'], sep='-')

# Display the combined frame.
btc_all
unix date symbol open high low close Volume BTC Volume USD hour year month day month of year
0 1546300740 2018-12-31 BTC/USD 3689.26 3693.30 3689.26 3693.30 9.838855 36337.841473 23:59:00 2018 12 31 2018-12
1 1546300680 2018-12-31 BTC/USD 3689.26 3691.35 3689.26 3691.35 0.560000 2067.156000 23:58:00 2018 12 31 2018-12
2 1546300620 2018-12-31 BTC/USD 3688.83 3689.26 3688.83 3689.26 0.560833 2069.059602 23:57:00 2018 12 31 2018-12
3 1546300560 2018-12-31 BTC/USD 3687.87 3689.65 3686.92 3686.92 7.610240 28058.346208 23:56:00 2018 12 31 2018-12
4 1546300500 2018-12-31 BTC/USD 3688.28 3688.85 3685.00 3688.85 7.665703 28277.629729 23:55:00 2018 12 31 2018-12
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2149697 1646105880 2022-03-01 BTC/USD 43077.82 43077.82 43049.46 43049.46 0.022210 956.143143 03:38:00 2022 03 01 2022-03
2149698 1646105940 2022-03-01 BTC/USD 43035.16 43035.16 42999.44 42999.44 0.820950 35300.390270 03:39:00 2022 03 01 2022-03
2149699 1646106000 2022-03-01 BTC/USD 43022.24 43022.24 43016.03 43016.03 0.009230 397.037957 03:40:00 2022 03 01 2022-03
2149700 1646106060 2022-03-01 BTC/USD 43018.23 43046.59 43018.23 43046.58 0.142977 6154.673021 03:41:00 2022 03 01 2022-03
2149701 1646106180 2022-03-01 BTC/USD 43046.58 43046.58 43046.58 43046.58 0.000000 0.000000 03:43:00 2022 03 01 2022-03

2149702 rows × 14 columns

Statystyki btc_all

# Summary statistics for every column, numeric and non-numeric alike.
btc_all.describe(include='all')
unix date symbol open high low close Volume BTC Volume USD hour year month day month of year
count 2.149702e+06 2149702 2149702 2.149702e+06 2.149702e+06 2.149702e+06 2.149702e+06 2.149702e+06 2.149702e+06 2149702 2149702 2149702 2149702 2149702
unique NaN 1495 1 NaN NaN NaN NaN NaN NaN 1440 5 12 31 51
top NaN 2018-12-31 BTC/USD NaN NaN NaN NaN NaN NaN 00:34:00 2018 01 22 2018-12
freq NaN 1440 2149702 NaN NaN NaN NaN NaN NaN 1494 525599 223196 72000 44640
mean 1.579945e+09 NaN NaN 1.920786e+04 1.921931e+04 1.919615e+04 1.920790e+04 5.443650e+00 7.797241e+04 NaN NaN NaN NaN NaN
std 3.808436e+07 NaN NaN 1.811668e+04 1.812675e+04 1.810635e+04 1.811686e+04 1.491300e+01 2.183400e+05 NaN NaN NaN NaN NaN
min 1.514765e+09 NaN NaN 3.124450e+03 3.131860e+03 3.122280e+03 3.124450e+03 0.000000e+00 0.000000e+00 NaN NaN NaN NaN NaN
25% 1.547010e+09 NaN NaN 7.135640e+03 7.139850e+03 7.131500e+03 7.135792e+03 2.380776e-01 2.900109e+03 NaN NaN NaN NaN NaN
50% 1.579256e+09 NaN NaN 9.528370e+03 9.533685e+03 9.523120e+03 9.528510e+03 1.304600e+00 1.711807e+04 NaN NaN NaN NaN NaN
75% 1.613861e+09 NaN NaN 3.454626e+04 3.457261e+04 3.451898e+04 3.454637e+04 4.802350e+00 6.629355e+04 NaN NaN NaN NaN NaN
max 1.646106e+09 NaN NaN 6.900000e+04 6.900000e+04 6.879999e+04 6.900000e+04 1.098349e+03 1.396792e+07 NaN NaN NaN NaN NaN
# Split into train / dev / test subsets and write each one to a file.
from sklearn.model_selection import train_test_split

# First hold out 20% of all rows for test, then take 25% of the remaining 80%
# as dev. The fixed random_state keeps both splits reproducible.
train_dev, btc_test = train_test_split(btc_all, test_size=0.2, random_state=42)
btc_train, btc_dev = train_test_split(train_dev, test_size=0.25, random_state=42)

# Persist the subsets without the DataFrame index column.
btc_test.to_csv('data/btc_test.csv', index=False)
btc_dev.to_csv('data/btc_dev.csv', index=False)
btc_train.to_csv('data/btc_train.csv', index=False)

# Reload the saved subsets from disk.
import pandas as pd

btc_train = pd.read_csv('data/btc_train.csv')
btc_dev = pd.read_csv('data/btc_dev.csv')
btc_test = pd.read_csv('data/btc_test.csv')

Dane o zbiorach

# Summary statistics for every column of the dev subset.
btc_dev.describe(include='all')
unix date symbol open high low close Volume BTC Volume USD hour year month day month of year
count 4.299410e+05 429941 429941 429941.000000 429941.000000 429941.000000 429941.000000 429941.000000 4.299410e+05 429941 429941.000000 429941.000000 429941.000000 429941
unique NaN 1495 1 NaN NaN NaN NaN NaN NaN 1440 NaN NaN NaN 51
top NaN 2021-09-25 BTC/USD NaN NaN NaN NaN NaN NaN 12:13:00 NaN NaN NaN 2021-10
freq NaN 338 429941 NaN NaN NaN NaN NaN NaN 352 NaN NaN NaN 9032
mean 1.579897e+09 NaN NaN 19187.806679 19199.224871 19176.132847 19187.853331 5.432500 7.764623e+04 NaN 2019.588653 6.229801 15.708069 NaN
std 3.806220e+07 NaN NaN 18110.655080 18120.678529 18100.353208 18110.821598 14.749560 2.193092e+05 NaN 1.206519 3.472108 8.788119 NaN
min 1.514765e+09 NaN NaN 3137.000000 3137.000000 3130.000000 3135.440000 0.000000 0.000000e+00 NaN 2018.000000 1.000000 1.000000 NaN
25% 1.547001e+09 NaN NaN 7138.650000 7142.470000 7134.340000 7138.440000 0.237175 2.898257e+03 NaN 2019.000000 3.000000 8.000000 NaN
50% 1.579157e+09 NaN NaN 9526.480000 9531.510000 9521.470000 9526.910000 1.304891 1.712750e+04 NaN 2020.000000 6.000000 16.000000 NaN
75% 1.613801e+09 NaN NaN 34472.350000 34500.000000 34450.000000 34476.160000 4.811006 6.621286e+04 NaN 2021.000000 9.000000 23.000000 NaN
max 1.646106e+09 NaN NaN 68794.750000 68894.100000 68704.410000 68733.680000 778.854848 1.378306e+07 NaN 2022.000000 12.000000 31.000000 NaN
# Summary statistics for every column of the train and test subsets.
btc_train.describe(include='all')
btc_test.describe(include='all')
# How often each distinct 'Volume BTC' value occurs in the dev subset.
btc_dev['Volume BTC'].value_counts()
Volume BTC
0.000000     10353
1.000000       239
0.010000       160
0.001000       153
2.000000       131
             ...  
5.190947         1
0.529028         1
10.358122        1
3.810011         1
0.062389         1
Name: count, Length: 410603, dtype: int64
# Bar chart of how many dev rows fall into each year.
btc_dev['year'].value_counts().plot.bar()
<Axes: xlabel='year'>
# Average 'high' price per year in the dev subset.
btc_dev.groupby("year")[["high"]].mean()
high
year
2018 7533.899647
2019 7355.831382
2020 10418.525729
2021 47402.209955
2022 40931.761356
# Bar chart of the average 'Volume USD' per year in the dev subset.
btc_dev.groupby("year")[["Volume USD"]].mean().plot.bar()
<Axes: xlabel='year'>
import seaborn as sns
# Switch plotting to seaborn's default theme.
sns.set_theme()
# Plot the open price, then the close price, against month, coloured by year.
sns.relplot(data=btc_dev, x="month", y="open", hue="year")
sns.relplot(data=btc_dev, x="month", y="close", hue="year")
<seaborn.axisgrid.FacetGrid at 0x2758607bed0>
# Drop the 2022 rows, then plot the close price against month, coloured by year.
# `year` is compared as an integer: once the subset has been reloaded from CSV
# the column is numeric, so comparing it with the string "2022" is true for
# every row and filters nothing out. The cast keeps the filter working even if
# `year` still holds strings.
btc_dev_v = btc_dev[btc_dev["year"].astype(int) != 2022]
sns.relplot(data=btc_dev_v, x="month", y="close", hue="year")
import seaborn as sns
# Switch plotting to seaborn's default theme.
sns.set_theme()
# Pairwise plots of the dev columns, coloured by year; the raw 'unix'
# timestamp column is left out.
sns.pairplot(data=btc_dev.drop(columns=["unix"]), hue="year")

Czyszczenie

Dane wydają się być czyste, nie widzę potrzeby ich czyszczenia