2021-05-07 16:47:38 +02:00
|
|
|
import sys
|
2021-04-10 20:54:29 +02:00
|
|
|
import kaggle
|
2021-04-10 16:11:18 +02:00
|
|
|
import pandas as pd
|
|
|
|
import numpy as np
|
|
|
|
from sklearn import preprocessing
|
2021-04-25 19:08:03 +02:00
|
|
|
from sklearn.linear_model import LinearRegression
|
2021-05-06 21:29:58 +02:00
|
|
|
from sklearn.metrics import mean_squared_error
|
2021-05-07 18:26:53 +02:00
|
|
|
|
2021-04-10 16:11:18 +02:00
|
|
|
|
2021-04-25 19:08:03 +02:00
|
|
|
# Device selector string; nothing in the visible part of this script reads it.
device = "cpu"
|
2021-04-10 16:11:18 +02:00
|
|
|
|
2021-04-10 20:54:29 +02:00
|
|
|
# Kaggle: authenticate the API client, then download the avocado prices
# dataset into the current directory with unzip enabled.
kaggle.api.authenticate()
kaggle.api.dataset_download_files(
    'timmate/avocado-prices-2020',
    path='.',
    unzip=True,
)
|
|
|
|
|
2021-04-25 19:08:03 +02:00
|
|
|
# Load the raw data.
avocado_with_year = pd.read_csv('avocado-updated-2020.csv')

# Drop the redundant 'year' column (only the columns listed below are kept)
# and encode 'type' as 0 or 1.
new = [
    'date',
    'average_price',
    'total_volume',
    '4046',
    '4225',
    '4770',
    'total_bags',
    'small_bags',
    'large_bags',
    'xlarge_bags',
    'type',
    'geography',
]

# The reduced table is written to avocado.csv (without the index) and then
# read back, so `avocado` ends up as a frame loaded from that file.
avocado = avocado_with_year[new]
avocado.to_csv("avocado.csv", index=False)
avocado = pd.read_csv('avocado.csv')

# organic -> 1, conventional -> 0
avocado['type'] = avocado['type'].map({'organic': 1, 'conventional': 0})
|
2021-04-10 16:11:18 +02:00
|
|
|
|
2021-04-25 19:08:03 +02:00
|
|
|
# Remove rows that contain empty values.
# The per-column null counts are evaluated as before; the result is not
# stored or printed.
avocado.isnull().sum()

# dropna() returns a new frame rather than modifying `avocado` in place, so
# the result has to be assigned back for any row to actually be removed.
# The index is rebuilt afterwards because the following steps assign columns
# between `avocado` and a freshly constructed frame, and pandas aligns such
# assignments by index label.
avocado = avocado.dropna().reset_index(drop=True)
|
2021-04-10 16:11:18 +02:00
|
|
|
|
2021-04-25 19:08:03 +02:00
|
|
|
# Preprocessing: rescale every float64 column with a default MinMaxScaler.
# The float64 selection is made once and reused for both the raw values and
# the column labels.
num_frame = avocado.select_dtypes(include='float64')
num_values = num_frame.values
num_columns = num_frame.columns

scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(num_values)

# Reuse avocado's index so every scaled row keeps the label of the row it was
# computed from; the column assignments below are aligned by index label.
avocado_normalized = pd.DataFrame(
    x_scaled,
    columns=num_columns,
    index=avocado.index,
)

# Write the scaled values back into the matching columns of `avocado`.
for col in avocado.columns:
    if col in num_columns:
        avocado[col] = avocado_normalized[col]

# Add the 'type' and 'geography' columns to the normalized frame.
avocado_normalized['type'] = avocado['type']
avocado_normalized['geography'] = avocado['geography']
|
|
|
|
|
2021-05-07 18:26:53 +02:00
|
|
|
# CUTOFF parameter: the first command-line argument, parsed as an integer,
# is passed to head() to truncate the normalized data.
cutoff_param = int(sys.argv[1])
avocado_normalized = avocado_normalized.head(n=cutoff_param)
|
|
|
|
|
2021-04-25 19:08:03 +02:00
|
|
|
|
|
|
|
# Split into train/dev/test: the rows are shuffled, then cut at 60% and 80%
# of the row count, giving three consecutive parts.
# Positional slicing is used instead of np.split, which relies on
# DataFrame.swapaxes (deprecated in pandas 2.1) when given a DataFrame.
# The resulting parts are the same: [:a], [a:b] and [b:].
_shuffled = avocado_normalized.sample(frac=1)
_train_end = int(.6 * len(avocado_normalized))
_validate_end = int(.8 * len(avocado_normalized))

avocado_train = _shuffled.iloc[:_train_end]
avocado_validate = _shuffled.iloc[_train_end:_validate_end]
avocado_test = _shuffled.iloc[_validate_end:]
|
|
|
|
|
|
|
|
# Report np.size (the number of elements) for the full set and each split,
# with the label padded to 20 characters.
for _label, _frame in (
    ("Avocado: ", avocado_normalized),
    ("Avocado (train) : ", avocado_train),
    ("Avocado (validate): ", avocado_validate),
    ("Avocado (test) ", avocado_test),
):
    print(_label.ljust(20), np.size(_frame))
|
|
|
|
|
2021-04-25 19:08:03 +02:00
|
|
|
# Data inspection: compute summary statistics for the full set and for each
# split. The results are neither stored nor printed.
for _frame in (
    avocado_normalized,
    avocado_train,
    avocado_validate,
    avocado_test,
):
    _frame.describe(include='all')
|
|
|
|
|
2021-04-25 19:08:03 +02:00
|
|
|
# Count rows per 'geography' value for the full set, the test split and the
# train split. The results are neither stored nor printed.
for _frame in (avocado_normalized, avocado_test, avocado_train):
    _frame['geography'].value_counts()
|
2021-04-25 19:08:03 +02:00
|
|
|
|
|
|
|
# Bar plot of the 'type' value counts for the full set, the train split and
# the test split.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated (pandas 2.1); the counts are the same.
for _frame in (avocado_normalized, avocado_train, avocado_test):
    _frame['type'].value_counts().plot.bar()
|
2021-04-25 19:08:03 +02:00
|
|
|
|
|
|
|
# Histogram of 'average_price' for the full set and for each split, drawn in
# the order: full, train, validate, test.
for _frame in (
    avocado_normalized,
    avocado_train,
    avocado_validate,
    avocado_test,
):
    _frame['average_price'].hist()
|
|
|
|
|
2021-05-07 18:26:53 +02:00
|
|
|
# Write each split to its own CSV file (to_csv defaults, so the index is
# written as well).
for _frame, _target in (
    (avocado_train, 'avocado_train.csv'),
    (avocado_validate, 'avocado_validate.csv'),
    (avocado_test, 'avocado_test.csv'),
):
    _frame.to_csv(_target)
|
|
|
|
|
2021-04-25 19:08:03 +02:00
|
|
|
|
|
|
|
# print(avocado_train[:10])
|
|
|
|
# print(avocado_test[:10])
|
2021-05-07 18:26:53 +02:00
|
|
|
#print(avocado_normalized)
|