"""Fit a linear-regression car-price model and write dev/test predictions.

The script reads tab-separated training, dev and test files, engineers a
few numeric features (age, square roots, one-hot engine type), expands them
with degree-2 interaction terms, fits an ordinary least-squares model and
writes the predictions next to the input files.
"""
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

col_names = ["Price", "Mileage", "Year", "Brand", "EngineType", "EngineCapacity"]


def prepareData(df, engine_types=None, reference_year=2018):
    """Turn a raw feature frame into the design matrix used by the model.

    Args:
        df: DataFrame with at least the columns ``Mileage``, ``Year``,
            ``Brand``, ``EngineType`` and ``EngineCapacity``. It is not
            modified; the function works on a copy.
        engine_types: Optional iterable of engine-type names fixing which
            one-hot columns are produced and in which order. Categories
            missing from ``df`` become all-zero columns; categories present
            in ``df`` but not listed are dropped. When ``None`` the columns
            are whatever ``df`` itself contains (the original behaviour).
        reference_year: Year from which ``Year`` is subtracted to get the
            ``Age`` feature.

    Returns:
        The array returned by ``PolynomialFeatures(2, interaction_only=True)``
        applied to the engineered columns.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    df["Age"] = reference_year - df["Year"]
    df["SqrtAge"] = df.Age ** 0.5

    dummies = df["EngineType"].str.get_dummies()
    if engine_types is not None:
        # Pin the one-hot layout so every data split yields the same
        # columns, in the same order, as the split the model was fitted on.
        dummies = dummies.reindex(columns=list(engine_types), fill_value=0)

    df = pd.concat([df, dummies], axis=1)
    df = df.drop(["EngineType", "Brand"], axis=1)

    df["SqrtMileage"] = df.Mileage ** 0.5
    df["SqrtEngineCapacity"] = df.EngineCapacity ** 0.5

    poly = PolynomialFeatures(2, interaction_only=True)
    return poly.fit_transform(df)


def main():
    """Train on ``train/``, evaluate on ``dev-0/`` and predict for ``test-A/``.

    Prints the model's ``score`` and the RMSE on the dev set, and writes
    predictions to ``dev-0/out.tsv`` and ``test-A/out.tsv``.
    """
    # The input files carry every column except the target.
    feature_names = col_names[1:]

    df = pd.read_csv('train/train.tsv', sep="\t", names=col_names)
    y_dev = pd.read_csv('dev-0/expected.tsv', sep="\t", names=["Price"])
    x_dev = pd.read_csv('dev-0/in.tsv', sep="\t", names=feature_names)
    x_test = pd.read_csv('test-A/in.tsv', sep="\t", names=feature_names)

    y_train = df.Price
    x_train = df.drop('Price', axis=1)

    # Take the one-hot layout from the training data and reuse it for every
    # split; otherwise a category absent from dev/test would change the
    # number (or order) of features and break or misalign the predictions.
    engine_types = list(x_train["EngineType"].str.get_dummies().columns)

    x_train = prepareData(x_train, engine_types)
    linReg = LinearRegression()
    linReg.fit(x_train, y_train)

    x_dev = prepareData(x_dev, engine_types)
    x_test = prepareData(x_test, engine_types)

    # Model score on the dev set.
    score = linReg.score(x_dev, y_dev)
    print(score)

    # RMSE on the dev set.
    y_pred = linReg.predict(x_dev)
    y_pred = pd.DataFrame({'Price': y_pred})
    y_pred.to_csv(r'dev-0/out.tsv', sep='\t', index=False, header=False)
    # Take the square root explicitly: the ``squared=False`` argument of
    # mean_squared_error is not available in recent scikit-learn releases.
    rmse = mean_squared_error(y_dev, y_pred) ** 0.5
    print(rmse)

    # Predictions for test-A.
    y_pred_test = linReg.predict(x_test)
    y_pred_test = pd.DataFrame({'Price': y_pred_test})
    y_pred_test.to_csv(r'test-A/out.tsv', sep='\t', index=False, header=False)


if __name__ == "__main__":
    main()