"""Fit a linear-regression price model and write dev-0 / test-A predictions.

The script reads the training table, encodes its features, fits
``LinearRegression`` on rows whose price exceeds ``MIN_PRICE`` and writes one
prediction per line to ``./dev-0/out.tsv`` and ``./test-A/out.tsv``.
"""
# basic imports
import pandas
from sklearn.linear_model import LinearRegression

# basic paths
openTrain = './train/train.tsv'
openDev = './dev-0/in.tsv'
openTest = './test-A/in.tsv'

# Column names applied to the header-less input files used for prediction.
FEATURE_NAMES = ['mileage', 'year', 'brand', 'engineType', 'engineCapacity']
# Columns expanded into one indicator column per value.
ONE_HOT_COLUMNS = ['engineType']
# Training rows at or below this price are dropped to avoid suspicious
# observations. For some reason this value gave the smallest RMSE according
# to geval, while smaller or bigger thresholds gave RMSE > 34k.
MIN_PRICE = 10000


def _encode_train(frame: pandas.DataFrame):
    """Encode the training frame and remember how it was encoded.

    Returns a tuple ``(encoded, categories)`` where ``encoded`` is ``frame``
    with ``ONE_HOT_COLUMNS`` expanded and every remaining object column
    replaced by integer category codes, and ``categories`` maps each such
    column name to the ordered categories its codes refer to.
    """
    encoded = pandas.get_dummies(frame, columns=ONE_HOT_COLUMNS)
    categories = {}
    for column in encoded.select_dtypes(include=object).columns.values:
        as_category = encoded[column].astype('category')
        categories[column] = as_category.cat.categories
        encoded[column] = as_category.cat.codes
    return encoded, categories


def _encode_features(frame: pandas.DataFrame, categories: dict,
                     feature_columns) -> pandas.DataFrame:
    """Encode a prediction frame exactly the way the training frame was.

    ``categories`` is the mapping returned by ``_encode_train``; labels that
    were not seen during training get the code -1. The result is reindexed
    to ``feature_columns`` so that it has the same columns, in the same
    order, as the matrix the model was fitted on: indicator columns absent
    from ``frame`` are filled with 0 and columns unknown to the model are
    dropped.
    """
    encoded = pandas.get_dummies(frame, columns=ONE_HOT_COLUMNS)
    for column, known in categories.items():
        if column in encoded.columns:
            # Reuse the training categories so a label maps to the same code.
            encoded[column] = pandas.Categorical(
                encoded[column], categories=known
            ).codes
    return encoded.reindex(columns=feature_columns, fill_value=0)


def _predict_to_file(model, in_path: str, out_path: str, categories: dict,
                     feature_columns) -> None:
    """Read features from ``in_path``, predict, and write to ``out_path``.

    Predictions are written as text, one value per line.
    """
    raw = pandas.read_table(in_path, sep='\t', names=FEATURE_NAMES)
    features = _encode_features(raw, categories, feature_columns)
    model.predict(features).tofile(out_path, sep='\n')


def main() -> None:
    """Run the whole pipeline: load, encode, fit, predict dev-0 and test-A."""
    # read from files
    with open('./names') as f_names:
        names = f_names.read().rstrip('\n').split('\t')
    train_raw = pandas.read_table(openTrain, sep='\t', names=names)

    # basic normalization & filtering
    train_set, categories = _encode_train(train_raw)
    train_set = train_set.loc[train_set['price'] > MIN_PRICE]

    # Model training
    X = train_set.loc[:, train_set.columns != 'price']
    solution = LinearRegression().fit(X, train_set['price'])

    _predict_to_file(solution, openDev, "./dev-0/out.tsv",
                     categories, X.columns)
    # The test predictions must come from the test input, not from dev-0.
    _predict_to_file(solution, openTest, "./test-A/out.tsv",
                     categories, X.columns)


if __name__ == "__main__":
    main()

# Outcome originally recorded: 33956 for prices >10000 (presumably the geval
# RMSE on dev-0). It predates the consistent feature encoding above, so it
# should be re-measured.