2021-05-18 18:06:52 +02:00
|
|
|
import pandas as pd
|
|
|
|
from sklearn.linear_model import LinearRegression
|
|
|
|
from sklearn.metrics import mean_squared_error
|
|
|
|
from sklearn.preprocessing import PolynomialFeatures
|
|
|
|
|
|
|
|
# Column names applied to train/train.tsv when it is loaded: the target
# ("Price") comes first, followed by the raw feature columns.
col_names = [
    "Price",
    "Mileage",
    "Year",
    "Brand",
    "EngineType",
    "EngineCapacity",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def prepareData(df, reference_year=2018):
    """Turn raw car listings into the feature matrix used by the model.

    The input frame is copied first, so the caller's DataFrame is not
    modified.

    Args:
        df: DataFrame that has at least the columns "Mileage", "Year",
            "Brand", "EngineType" and "EngineCapacity". Any further
            columns are passed on to the polynomial expansion unchanged.
        reference_year: Year the car age is measured from. Defaults to
            2018, the value that used to be hard-coded here.

    Returns:
        The result of ``PolynomialFeatures(2, interaction_only=True)``
        applied to the engineered columns: a bias column, every
        engineered feature, and every pairwise product of two distinct
        features.

    Note:
        The engine-type indicator columns are derived from the values
        present in ``df``. Two frames with different sets of engine types
        therefore yield matrices with different column layouts.
    """
    # Copy up front: the column assignments below would otherwise add the
    # helper columns to the caller's DataFrame as a side effect.
    features = df.copy()

    features["Age"] = reference_year - features["Year"]
    features["SqrtAge"] = features["Age"] ** 0.5

    # One 0/1 indicator column per engine type found in this frame.
    engine_dummies = features["EngineType"].str.get_dummies()
    features = pd.concat([features, engine_dummies], axis=1)

    # The raw categorical columns cannot go into the numeric expansion.
    features = features.drop(["EngineType", "Brand"], axis=1)

    features["SqrtMileage"] = features["Mileage"] ** 0.5
    features["SqrtEngineCapacity"] = features["EngineCapacity"] ** 0.5

    poly = PolynomialFeatures(2, interaction_only=True)
    return poly.fit_transform(features)
|
|
|
|
|
|
|
|
def main():
    """Train a linear price model and write dev-0 / test-A predictions.

    Reads ``train/train.tsv``, ``dev-0/in.tsv``, ``dev-0/expected.tsv``
    and ``test-A/in.tsv`` as tab-separated files (column names are
    supplied here, so the files are read as having no header row), fits a
    ``LinearRegression`` on the engineered training features, prints the
    model score and the RMSE on dev-0, and writes the predicted prices to
    ``dev-0/out.tsv`` and ``test-A/out.tsv`` (one value per line, no
    header, no index).
    """
    # Input files carry every training column except the target.
    feature_names = [name for name in col_names if name != "Price"]

    train = pd.read_csv('train/train.tsv', sep="\t", names=col_names)
    y_dev = pd.read_csv('dev-0/expected.tsv', sep="\t", names=["Price"])
    x_dev_raw = pd.read_csv('dev-0/in.tsv', sep="\t", names=feature_names)
    x_test_raw = pd.read_csv('test-A/in.tsv', sep="\t", names=feature_names)

    y_train = train.Price
    x_train_raw = train.drop('Price', axis=1)

    # prepareData builds its engine-type indicator columns from the values
    # present in the frame it receives. Encoding the three splits one by
    # one can therefore give matrices whose columns differ in number or
    # meaning. Transform them together so they share one column layout,
    # then cut the result back apart by row count.
    combined = pd.concat(
        [x_train_raw, x_dev_raw, x_test_raw], ignore_index=True
    )
    features = prepareData(combined)
    n_train = len(x_train_raw)
    n_dev = len(x_dev_raw)
    x_train = features[:n_train]
    x_dev = features[n_train:n_train + n_dev]
    x_test = features[n_train + n_dev:]

    linReg = LinearRegression()
    linReg.fit(x_train, y_train)

    # Model score on the dev set.
    score = linReg.score(x_dev, y_dev)
    print(score)

    # RMSE on the dev set.
    dev_pred = pd.DataFrame({'Price': linReg.predict(x_dev)})
    dev_pred.to_csv(r'dev-0/out.tsv', sep='\t', index=False, header=False)

    # Take the root of the MSE explicitly: the ``squared=False`` keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_dev, dev_pred) ** 0.5
    print(rmse)

    # Predictions for test-A.
    test_pred = pd.DataFrame({'Price': linReg.predict(x_test)})
    test_pred.to_csv(r'test-A/out.tsv', sep='\t', index=False, header=False)
|
2021-05-18 18:06:52 +02:00
|
|
|
|
|
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|