"""Train a linear-regression price model and write dev/test predictions.

Reads the column names from ``names``, the data sets from ``train/``,
``dev-0/`` and ``test-A/``, fits a ``LinearRegression`` on the training data
(after integer-encoding text columns and dropping low-price / low-mileage
rows), prints the RMSE on the dev set and writes one prediction per line to
``dev-0/out.tsv`` and ``test-A/out.tsv``.
"""

import numpy as np
import pandas as pd

# Rows at or below these thresholds are dropped from the training set.
MIN_PRICE = 1000
MIN_MILEAGE = 100
# Only used for the diagnostic count; prices above it are NOT filtered out.
MAX_PRICE = 1000000


def read_tsv(path, names=None):
    """Read a header-less, tab-separated file, skipping malformed lines.

    Args:
        path: Path of the file to read.
        names: Optional list of column names to assign.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    kwargs = {"header": None}
    if names is not None:
        kwargs["names"] = names
    try:
        return pd.read_table(path, on_bad_lines="skip", **kwargs)
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines``; fall back to the
        # older spelling of the same option.
        return pd.read_table(path, error_bad_lines=False, **kwargs)


def encode_categoricals(train, dev, test):
    """Replace text columns with integer category codes.

    The categories are taken from ``train`` and reused for ``dev`` and
    ``test`` so that a given value receives the same code in all three
    frames. Missing values, and values that never occur in ``train``, are
    encoded as ``-1``.

    Args:
        train: Training frame; its object-dtype columns define the categories.
        dev: Development frame.
        test: Test frame.

    Returns:
        A ``(train, dev, test)`` tuple of encoded copies; the inputs are
        left untouched.
    """
    train, dev, test = train.copy(), dev.copy(), test.copy()

    for col in train.select_dtypes(include=object).columns:
        categories = train[col].astype("category").cat.categories
        for frame in (train, dev, test):
            if col in frame.columns:
                frame[col] = pd.Categorical(
                    frame[col], categories=categories
                ).codes

    # Columns that are text only in dev/test have no training categories to
    # align with; encode them on their own so the frame is fully numeric.
    for frame in (dev, test):
        for col in frame.select_dtypes(include=object).columns:
            frame[col] = frame[col].astype("category").cat.codes

    return train, dev, test


def count_outliers(train):
    """Count the rows of ``train`` lying outside the plausible value ranges.

    Diagnostic helper; it is not called by ``main``.

    Args:
        train: Frame with ``price`` and ``mileage`` columns.

    Returns:
        A dict with the number of rows whose price is below ``MIN_PRICE``
        (``"price_min"``), whose price is above ``MAX_PRICE``
        (``"price_max"``) and whose mileage is below ``MIN_MILEAGE``
        (``"mileage_min"``).
    """
    return {
        "price_min": int((train["price"] < MIN_PRICE).sum()),
        "price_max": int((train["price"] > MAX_PRICE).sum()),
        "mileage_min": int((train["mileage"] < MIN_MILEAGE).sum()),
    }


def plot_outliers(train):
    """Show box plots of ``price`` and ``mileage`` to inspect outliers.

    Diagnostic helper; it is not called by ``main``.

    Args:
        train: Frame with ``price`` and ``mileage`` columns.
    """
    # Imported here because plotting is optional and nothing else needs it.
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(1, 2)
    fig.set_figheight(15)
    fig.set_figwidth(20)
    ax[0].boxplot(train["price"])
    ax[0].set_title("price")
    ax[1].boxplot(train["mileage"])
    ax[1].set_title("mileage")
    plt.show()


def drop_outliers(train):
    """Keep only rows with price above MIN_PRICE and mileage above MIN_MILEAGE.

    Args:
        train: Frame with ``price`` and ``mileage`` columns.

    Returns:
        The filtered frame.
    """
    keep = (train["price"] > MIN_PRICE) & (train["mileage"] > MIN_MILEAGE)
    return train.loc[keep]


def main():
    """Run the whole pipeline: load, prepare, fit, evaluate, save."""
    # scikit-learn is imported here so the data-preparation helpers above
    # can be imported without it.
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error

    # Read column names.
    with open("names", encoding="utf-8") as f:
        col_names = f.read().strip().split("\t")

    # Read data. The dev/test inputs are given every name except the first,
    # which is presumably the target column ("price") - verify against the
    # ``names`` file.
    feature_names = col_names[1:]
    dev = read_tsv("dev-0/in.tsv", names=feature_names)
    test = read_tsv("test-A/in.tsv", names=feature_names)
    train = read_tsv("train/train.tsv", names=col_names)
    dev_expected = read_tsv("dev-0/expected.tsv")

    # Encode text columns, then remove outlying training rows.
    train, dev, test = encode_categoricals(train, dev, test)
    train = drop_outliers(train)

    # Split the training set into features and target.
    X_train = train.loc[:, train.columns != "price"]
    Y_train = train["price"]

    # Fit the linear regression model and predict.
    clf = LinearRegression().fit(X_train, Y_train)
    dev_p = clf.predict(dev)
    test_p = clf.predict(test)

    # RMSE on the dev set. The square root is taken explicitly because the
    # ``squared=False`` argument is not available in every scikit-learn
    # release.
    score = np.sqrt(mean_squared_error(dev_expected, dev_p))
    print("RMSE: " + str(score))

    # Save predictions, one value per line.
    dev_p.tofile("./dev-0/out.tsv", sep="\n")
    test_p.tofile("./test-A/out.tsv", sep="\n")


if __name__ == "__main__":
    main()