"""Fit a linear regression on car listings and write price predictions.

The training data is read from ``./train/train.tsv``; predictions are written
to ``out.tsv`` next to each ``in.tsv`` of the data sets processed by ``main``.
"""
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

# Column names of the header-less TSV files; prediction inputs lack "price".
TRAIN_COLUMNS = ["price", "mileage", "year", "brand", "engine_type",
                 "engine_capacity"]
INPUT_COLUMNS = TRAIN_COLUMNS[1:]

REFERENCE_YEAR = 2017  # year that car age is measured against
TOP_BRANDS = 35        # number of most frequent brands kept as categories
OTHER_BRAND = "0"      # placeholder category for every remaining brand

# Set by get_model(): the brands kept as categories during training.
brands = None
# Set by get_model(): columns of the design matrix the model was fitted on.
feature_columns = None


def _build_features(frame, known_brands):
    """Return the one-hot encoded design matrix for *frame*.

    Adds ``age`` (square root of ``REFERENCE_YEAR - year``) and
    ``sqrt_mileage``, replaces brands outside *known_brands* with
    ``OTHER_BRAND`` and one-hot encodes the remaining text columns.
    The input frame is not modified.
    """
    features = frame.copy()
    features["age"] = np.sqrt(REFERENCE_YEAR - features["year"])
    features["sqrt_mileage"] = np.sqrt(features["mileage"])
    is_known = features["brand"].isin(known_brands)
    features["brand"] = features["brand"].where(is_known, OTHER_BRAND)
    return pd.get_dummies(features)


def get_model():
    """Train the regression model on ``./train/train.tsv``.

    Side effects: sets the module globals ``brands`` (the ``TOP_BRANDS`` most
    frequent brands in the training data) and ``feature_columns`` (the columns
    of the fitted design matrix), both of which ``predict_and_write`` relies
    on.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    global brands, feature_columns

    df = pd.read_csv('./train/train.tsv', sep='\t', names=TRAIN_COLUMNS)
    y = df['price']
    # drop() returns a new frame, so derived columns are never written into a
    # slice of df.
    raw_features = df.drop(columns='price')

    brands = raw_features['brand'].value_counts().index[:TOP_BRANDS].tolist()
    X = _build_features(raw_features, brands)
    feature_columns = X.columns.tolist()

    regr = LinearRegression()
    return regr.fit(X, y)


def predict_and_write(path, model):
    """Predict prices for ``{path}in.tsv`` and write them to ``{path}out.tsv``.

    Args:
        path: Directory prefix, concatenated directly with the file names
            (so it is expected to end with a path separator).
        model: Fitted estimator exposing ``predict``.

    Raises:
        RuntimeError: If ``get_model`` has not populated ``brands`` yet.
    """
    if brands is None:
        raise RuntimeError(
            "get_model() must be called before predict_and_write()")

    df_dev = pd.read_csv(f'{path}in.tsv', sep='\t', names=INPUT_COLUMNS)
    features = _build_features(df_dev, brands)

    # One-hot encoding depends on the values present in *this* file; align to
    # the training columns so missing categories become all-zero columns,
    # unseen ones are dropped, and the column order matches the fitted model.
    columns = feature_columns
    if columns is None:
        columns = getattr(model, "feature_names_in_", None)
    if columns is not None:
        features = features.reindex(columns=columns, fill_value=0)

    predictions = model.predict(features).astype(int)

    # Open the output only after prediction succeeded, so a failure above does
    # not leave a truncated out.tsv behind.
    with open(f'{path}out.tsv', 'w') as out:
        out.writelines(f"{prediction}\n" for prediction in predictions)


def main():
    """Train the model and write predictions for the dev and test sets."""
    model = get_model()
    predict_and_write('./dev-0/', model)
    predict_and_write('./test-A/', model)


if __name__ == '__main__':
    main()