"""Download the Kaggle avocado-prices dataset and prepare train/validate/test splits.

Usage: python <script> CUTOFF

Steps: download and load the raw CSV, drop the 'year' column, encode 'type'
as 0/1, remove rows with missing values, min-max scale the float64 columns,
keep the first CUTOFF rows, shuffle, split 60/20/20 and write the three
subsets to CSV files in the current directory.
"""
import sys

import kaggle
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

device = 'cpu'

# CUTOFF parameter: number of rows kept after preprocessing.
# Parsed first so a missing or malformed argument fails before the download.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} CUTOFF")
try:
    cutoff_param = int(sys.argv[1])
except ValueError:
    sys.exit(f"CUTOFF must be an integer, got {sys.argv[1]!r}")

# kaggle: authenticate and fetch the dataset into the working directory.
kaggle.api.authenticate()
kaggle.api.dataset_download_files('timmate/avocado-prices-2020', path='.', unzip=True)

# Load the data.
avocado_with_year = pd.read_csv('avocado-updated-2020.csv')

# Drop the redundant 'year' column and map 'type' values to 0 or 1.
new = ['date', 'average_price', 'total_volume', '4046', '4225', '4770',
       'total_bags', 'small_bags', 'large_bags', 'xlarge_bags', 'type',
       'geography']
avocado = avocado_with_year[new]
# The reduced table is written to disk and read back, which also yields an
# independent frame that can be modified without touching the raw one.
avocado.to_csv("avocado.csv", index=False)
avocado = pd.read_csv('avocado.csv')
avocado['type'] = avocado['type'].map(dict(organic=1, conventional=0))

# Remove rows with missing values. dropna() returns a new frame, so the
# result has to be assigned; the index is reset so the positional scaler
# output below lines up with the remaining rows.
avocado = avocado.dropna().reset_index(drop=True)
# map() yields floats when some value was unmapped (NaN); after dropping those
# rows restore an integer dtype so 'type' is not treated as a float feature.
avocado['type'] = avocado['type'].astype('int64')

# Preprocessing: min-max scale every float64 column.
numeric = avocado.select_dtypes(include='float64')
num_columns = numeric.columns
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(numeric.values)
avocado_normalized = pd.DataFrame(x_scaled, columns=num_columns, index=avocado.index)
# Write the scaled values back into the working frame as well.
avocado[num_columns] = avocado_normalized[num_columns]
avocado_normalized['type'] = avocado['type']
avocado_normalized['geography'] = avocado['geography']

# Apply the CUTOFF parameter.
avocado_normalized = avocado_normalized.head(cutoff_param)

# Split into train/dev/test (60% / 20% / 20%) after shuffling the rows.
# Plain positional slices replace np.split(), which relies on the deprecated
# DataFrame.swapaxes in recent pandas versions. The shuffle is unseeded, so
# the split differs between runs.
shuffled = avocado_normalized.sample(frac=1)
total_rows = len(avocado_normalized)
train_end = int(.6 * total_rows)
validate_end = int(.8 * total_rows)
avocado_train = shuffled.iloc[:train_end]
avocado_validate = shuffled.iloc[train_end:validate_end]
avocado_test = shuffled.iloc[validate_end:]

# NOTE: .size is the total number of cells (rows * columns), not the row count.
print("Avocado: ".ljust(20), avocado_normalized.size)
print("Avocado (train) : ".ljust(20), avocado_train.size)
print("Avocado (validate): ".ljust(20), avocado_validate.size)
print("Avocado (test) ".ljust(20), avocado_test.size)

# Data check.
# NOTE(review): the results below are neither printed nor saved, so when run
# as a plain script these calls have no visible output - they look like
# notebook leftovers; confirm whether they should be printed or removed.
avocado_normalized.describe(include='all')
avocado_train.describe(include='all')
avocado_validate.describe(include='all')
avocado_test.describe(include='all')
avocado_normalized.geography.value_counts()
avocado_test.geography.value_counts()
avocado_train.geography.value_counts()
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
avocado_normalized['type'].value_counts().plot.bar()
avocado_train['type'].value_counts().plot.bar()
avocado_test['type'].value_counts().plot.bar()
avocado_normalized['average_price'].hist()
avocado_train['average_price'].hist()
avocado_validate['average_price'].hist()
avocado_test['average_price'].hist()

# Save the splits to files.
avocado_train.to_csv('avocado_train.csv')
avocado_validate.to_csv('avocado_validate.csv')
avocado_test.to_csv('avocado_test.csv')