"""Train a linear regression that predicts ``price`` and write predictions.

The script reads tab-separated, header-less files from ``train/``,
``dev-0/`` and ``test-A/``, fits a ``LinearRegression`` on the training
split, writes predictions for the dev and test splits to ``out.tsv`` in
their directories, and prints the RMSE on the dev split.
"""
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Column layout of train.tsv (target first), of the in.tsv files (no target),
# and the subset of feature columns actually fed to the model.
names = ['price', 'mileage', 'year', 'brand', 'engineType', 'engineCapacity']
x_names = ['mileage', 'year', 'brand', 'engineType', 'engineCapacity']
names_without_brand = ['mileage', 'year', 'engineType', 'engineCapacity']


def main():
    """Fit on the train split, predict dev and test, and print the dev RMSE."""
    train_x, train_y = get_train_data()

    model = LinearRegression()
    model.fit(train_x, train_y)

    # Dev/test are one-hot encoded on their own, so they are aligned to the
    # training columns; otherwise a missing or unseen engineType value would
    # give the model a different feature layout than it was fitted on.
    dev_x, dev_y = get_dev_data(columns=train_x.columns)
    predicted_dev_y = model.predict(dev_x)
    save_csv(predicted_dev_y, 'dev-0/out.tsv')

    test_x = get_test_data(columns=train_x.columns)
    predicted_test_y = model.predict(test_x)
    save_csv(predicted_test_y, 'test-A/out.tsv')

    print(RMSE(dev_y, predicted_dev_y))


def _encode_features(raw_data, columns=None):
    """Select the model features and one-hot encode ``engineType``.

    Args:
        raw_data: DataFrame containing at least ``names_without_brand``.
        columns: Optional column index/list to align the result to. Columns
            missing from the encoded frame are added filled with 0; columns
            not listed are dropped. ``None`` leaves the frame as encoded.

    Returns:
        The encoded feature DataFrame.
    """
    x = pd.get_dummies(raw_data[names_without_brand], columns=['engineType'])
    if columns is not None:
        x = x.reindex(columns=columns, fill_value=0)
    return x


def get_train_data():
    """Load ``train/train.tsv``.

    Returns:
        A ``(x, y)`` tuple: the encoded feature DataFrame and the ``price``
        Series.
    """
    raw_data = pd.read_csv('train/train.tsv', sep='\t', names=names)
    return _encode_features(raw_data), raw_data['price']


def get_dev_data(columns=None):
    """Load ``dev-0/in.tsv`` and the targets from ``dev-0/expected.tsv``.

    Args:
        columns: Optional training feature columns to align the encoded
            features to (see ``_encode_features``).

    Returns:
        A ``(x, y)`` tuple: the encoded feature DataFrame and a list of
        float targets, one per non-blank line of the expected file.
    """
    dev_raw_data = pd.read_csv('dev-0/in.tsv', sep='\t', names=x_names)
    x = _encode_features(dev_raw_data, columns)
    with open('dev-0/expected.tsv', 'r') as file:
        # Skip blank lines (e.g. a trailing empty line) instead of letting
        # float('') raise ValueError.
        y = [float(line) for line in file if line.strip()]
    return x, y


def get_test_data(columns=None):
    """Load ``test-A/in.tsv``.

    Args:
        columns: Optional training feature columns to align the encoded
            features to (see ``_encode_features``).

    Returns:
        The encoded feature DataFrame.
    """
    test_raw_data = pd.read_csv('test-A/in.tsv', sep='\t', names=x_names)
    return _encode_features(test_raw_data, columns)


def save_csv(data, path):
    """Write ``data`` to ``path`` as tab-separated text, no index or header."""
    df = pd.DataFrame(data)
    df.to_csv(path, sep='\t', index=False, header=False)


def RMSE(dev_y, predicted_dev_y):
    """Return the root mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(dev_y, predicted_dev_y))


if __name__ == '__main__':
    main()