"""Fit a linear regression on car-price data and write predictions.

Reads ``train/train.tsv``, fits a ``LinearRegression`` on the preprocessed
features, then writes predictions for the ``dev-0`` and ``test-A``
directories (``in.tsv`` -> ``out.tsv``).
"""
import gzip

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle


def preprocess(x):
    """Return the feature frame with ``engineType`` one-hot encoded.

    One boolean column is appended per distinct ``engineType`` value found
    in ``x``; the original ``engineType`` and ``brand`` columns are dropped.
    The input frame is not modified.
    """
    engine_dummies = x['engineType'].str.get_dummies().astype(bool)
    x = pd.concat([x, engine_dummies], axis=1)
    return x.drop(['engineType', 'brand'], axis=1)


def makePrediction(path):
    """Predict for ``<path>/in.tsv`` and write the result to ``<path>/out.tsv``.

    Relies on the module-level ``names``, ``x_train`` and ``model`` that the
    script defines before this function is first called.
    """
    # ``error_bad_lines=False`` was removed in pandas 2.0; ``on_bad_lines='warn'``
    # (pandas >= 1.3) keeps the old behaviour of skipping malformed lines
    # while emitting a warning for each.
    x_pred = pd.read_table(
        path + '/in.tsv',
        on_bad_lines='warn',
        header=None,
        names=names[1:],
    )
    x_pred = preprocess(x_pred)
    # The dummy columns depend on which engine types occur in this particular
    # file. Align to the training columns so the model always receives the
    # features it was fitted on: absent types become all-False columns and
    # types unseen during training are dropped.
    x_pred = x_pred.reindex(columns=x_train.columns, fill_value=False)
    y_pred = model.predict(x_pred)
    # One prediction per line, plain text.
    y_pred.tofile(path + '/out.tsv', sep='\n')


# Column layout of train.tsv; the in.tsv files have the same columns minus
# the leading 'price' target.
names = ['price', 'mileage', 'year', 'brand', 'engineType', 'engineCap']

train = pd.read_table(
    'train/train.tsv',
    on_bad_lines='warn',
    header=None,
    names=names,
)
y_train = train['price']
x_train = preprocess(train.iloc[:, 1:])

model = LinearRegression()
model.fit(x_train, y_train)

makePrediction('dev-0')
makePrediction('test-A')