import pandas as pd
from sklearn.linear_model import LinearRegression


def prepare_data(file, type):
    """Load a headerless TSV file and turn it into an all-numeric frame.

    Every object-dtype column is replaced by its integer category codes,
    then one column is one-hot encoded with ``pd.get_dummies``.

    Args:
        file: Path (or file-like object) of the tab-separated input.
        type: ``'train'`` one-hot encodes column 4; any other value
            one-hot encodes column 3.  (Presumably the training file has
            one extra leading column -- the target -- which shifts the
            feature columns by one; confirm against the data.)  The name
            shadows the builtin but is kept for caller compatibility.

    Returns:
        A ``pandas.DataFrame`` with the encoded columns; the dummy
        columns are appended after the untouched ones.
    """
    data = pd.read_csv(file, header=None, sep="\t")
    for column in data.select_dtypes(include=object).columns:
        data[column] = data[column].astype("category").cat.codes

    dummy_column = 4 if type == 'train' else 3
    # NOTE(review): codes and dummy columns are derived per file, so a file
    # with a different set of category values yields a different encoding
    # and possibly a different number of columns than the training data.
    return pd.get_dummies(data, columns=[dummy_column])


def _write_predictions(model, in_file, out_file, kind):
    """Predict for the rows of *in_file* and write one integer per line."""
    # Predict before opening the output so that a failure while reading or
    # predicting does not leave a truncated, empty result file behind.
    features = prepare_data(in_file, kind).to_numpy(dtype=float)
    predictions = model.predict(features)
    with open(out_file, 'w') as writer:
        writer.writelines(f"{int(value)}\n" for value in predictions)


data = prepare_data("./train/train.tsv", "train")
# Train only on rows whose target (column 0) is above 1000.
data = data.loc[data[0] > 1000]
price = data.iloc[:, 0]
training_data = data.iloc[:, 1:]
# Pass a plain float array: the frame mixes integer and string column
# labels (get_dummies names its columns "<col>_<value>"), which recent
# scikit-learn versions reject, and the train labels could never match the
# dev/test labels anyway.  Features are therefore matched by position.
clf = LinearRegression().fit(training_data.to_numpy(dtype=float), price)

_write_predictions(clf, 'dev-0/in.tsv', 'dev-0/out.tsv', "dev")
_write_predictions(clf, 'test-A/in.tsv', 'test-A/out.tsv', "test")