Zaktualizuj 'Main.py'

This commit is contained in:
Damian Bregier 2021-05-26 12:24:19 +02:00
parent 69b5c6854c
commit 2dfefc4943

View File

@ -1,45 +1,45 @@
#basic imports
import pandas
from sklearn.linear_model import LinearRegression

#basic paths
openTrain = './train/train.tsv'
openDev = './dev-0/in.tsv'
openTest = './test-A/in.tsv'

# Column layout used for the unlabeled input files (dev-0 and test-A).
inputNames = ['mileage', 'year', 'brand', 'engineType', 'engineCapacity']

# Training rows priced at or below this value are dropped as suspicious.
#for some reason this value gives the smallest RMSE according to geval, while smaller or bigger
#price gives RMSE >34k
minPrice = 10000


def _learn_categories(frame):
    """Return {column: category index} for every object-typed column of *frame*.

    The categories are the ones ``astype('category')`` infers, so encoding the
    same frame with them yields the same codes as ``.cat.codes`` would.
    """
    textColumns = frame.select_dtypes(include=object).columns
    return {c: frame[c].astype('category').cat.categories for c in textColumns}


def _apply_categories(frame, categories):
    """Replace text columns of *frame* with integer codes.

    Columns listed in *categories* are encoded with the given vocabulary, so a
    value maps to the same integer in every dataset; values missing from the
    vocabulary become -1. Any other object-typed column falls back to codes
    inferred from the frame itself.
    """
    frame = frame.copy()
    for c, known in categories.items():
        if c in frame.columns:
            frame[c] = pandas.Categorical(frame[c], categories=known).codes
    for c in frame.select_dtypes(include=object).columns:
        frame[c] = frame[c].astype('category').cat.codes
    return frame


def _prepare_features(raw, categories, featureColumns):
    """Turn a raw input frame into the feature matrix the model was fitted on.

    One-hot encodes ``engineType``, encodes the remaining text columns with the
    training vocabulary, then aligns the columns to *featureColumns*: dummy
    columns absent from *raw* are added as 0 and columns unseen in training are
    dropped, so the column order matches the training matrix.
    """
    encoded = pandas.get_dummies(raw, columns=['engineType'])
    encoded = _apply_categories(encoded, categories)
    return encoded.reindex(columns=featureColumns, fill_value=0)


#read from files
with open('./names') as f_names:
    names = f_names.read().rstrip('\n').split('\t')
read0 = pandas.read_table(openTrain, sep='\t', names=names)
read1 = pandas.read_table(openDev, sep='\t', names=inputNames)
read2 = pandas.read_table(openTest, sep='\t', names=inputNames)

#basic normalization & filtering
trainSet = pandas.get_dummies(read0, columns=['engineType'])
# Learn the vocabulary before filtering, as the original encoding did.
trainCategories = _learn_categories(trainSet)
trainSet = _apply_categories(trainSet, trainCategories)
trainSet = trainSet.loc[trainSet['price'] > minPrice]  #to avoid suspicious observations

#Model training
X = trainSet.loc[:, trainSet.columns != 'price']
solution = LinearRegression().fit(X, trainSet['price'])

#Predictions for dev-0
devSet = _prepare_features(read1, trainCategories, X.columns)
predict = solution.predict(devSet)
predict.tofile("./dev-0/out.tsv", sep='\n')

#Predictions for test-A (read from its own input file, not from the dev data)
testSet = _prepare_features(read2, trainCategories, X.columns)
predict = solution.predict(testSet)
predict.tofile("./test-A/out.tsv", sep='\n')

#Outcome reported for the original script: 33956 for prices >10000
#(re-measure with geval, since the feature encoding is now shared across datasets)