"""Predict car prices with a min-max-scaled linear regression.

The model is fitted on ``train/train.tsv`` using ``year``, ``mileage`` and
``engineCapacity`` as features. Predictions, converted back to price units,
are written to ``dev-0/out.tsv`` and ``test-A/out.tsv``; the dev-0 RMSE is
written to ``dev0_rmse.txt`` and printed.
"""

import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

TARGET = 'price'
FEATURES = ['year', 'mileage', 'engineCapacity']
INPUT_COLUMNS = ['mileage', 'year', 'brand', 'engineType', 'engineCapacity']


def _read_tsv(path, columns):
    """Read a header-less TSV file, skipping (with a warning) malformed rows."""
    options = dict(header=None, sep="\t", names=columns)
    try:
        # ``on_bad_lines="warn"`` is the replacement for
        # ``error_bad_lines=False`` (with its default ``warn_bad_lines=True``).
        return pd.read_csv(path, on_bad_lines="warn", **options)
    except TypeError:
        # pandas < 1.3 does not know ``on_bad_lines``; use the legacy keyword.
        return pd.read_csv(path, error_bad_lines=False, **options)


def _write_column(path, values):
    """Write one value per line to *path*."""
    with open(path, "w") as file:
        file.writelines(str(value) + "\n" for value in values)


df = _read_tsv("train/train.tsv", [TARGET] + INPUT_COLUMNS)
dev0 = _read_tsv("dev-0/in.tsv", INPUT_COLUMNS)
testA = _read_tsv("test-A/in.tsv", INPUT_COLUMNS)
expected = _read_tsv("dev-0/expected.tsv", [TARGET])

df = df[[TARGET] + FEATURES]

# Per-column extrema of the training data. DataFrame.min()/max() are used
# instead of np.min/np.max, which forward axis=None and on recent pandas
# collapse the whole frame to a single scalar.
min_val = df.min()
max_val = df.max()
span = max_val - min_val

df = (df - min_val) / span

Y = df[[TARGET]]
X = df[FEATURES]
model = LinearRegression().fit(X, Y)


def _scale_features(frame):
    """Min-max scale the feature columns with the training-set extrema."""
    return (frame[FEATURES] - min_val[FEATURES]) / span[FEATURES]


def _predict_prices(scaled_features):
    """Predict and map the normalized output back to price units."""
    # The model was fitted on a 2-D target, so predict() returns shape (n, 1).
    normalized = model.predict(scaled_features).ravel()
    return normalized * span[TARGET] + min_val[TARGET]


dev0 = _scale_features(dev0)
testA = _scale_features(testA)

predicted_dev0 = _predict_prices(dev0)
# test-A predictions are denormalized as well, so both output files are
# expressed in the same (price) units.
predicted_testA = _predict_prices(testA)

_write_column("dev-0/out.tsv", predicted_dev0)
_write_column("test-A/out.tsv", predicted_testA)

predicted_denormalized = DataFrame(predicted_dev0, columns=['pred'])

# The output file is named "rmse", so report the root of the mean squared error.
error = np.sqrt(mean_squared_error(expected, predicted_denormalized))

for exp, pred in zip(expected.values, predicted_denormalized.values):
    print(exp, pred)

with open("dev0_rmse.txt", "w") as f:
    f.write(str(error))

print(error)