forked from kubapok/auta-public
Zaktualizuj 'Main.py'
This commit is contained in:
parent
69b5c6854c
commit
2dfefc4943
@ -1,45 +1,45 @@
|
|||||||
#basic imports
|
#basic imports
|
||||||
import pandas
|
import pandas
|
||||||
from sklearn.linear_model import LinearRegression
|
from sklearn.linear_model import LinearRegression
|
||||||
|
|
||||||
#basic paths
openTrain = './train/train.tsv'
openDev = './dev-0/in.tsv'
openTest = './test-A/in.tsv'

#column names of the dev-0 input file (no price column)
#NOTE(review): test-A/in.tsv is assumed to have the same layout as dev-0/in.tsv - confirm
inputNames = ['mileage', 'year', 'brand', 'engineType', 'engineCapacity']


def encode_train(frame):
    """One-hot encode ``engineType`` and integer-code the remaining text columns.

    Args:
        frame: training DataFrame containing an ``engineType`` column.

    Returns:
        A tuple ``(encoded, categories)``. ``encoded`` is the numeric frame;
        ``categories`` maps each integer-coded column name to the category
        index used for it, so other frames can be coded the same way.
    """
    encoded = pandas.get_dummies(frame, columns=['engineType'])
    categories = {}
    for c in encoded.select_dtypes(include=object).columns.values:
        asCategory = encoded[c].astype('category')
        categories[c] = asCategory.cat.categories
        encoded[c] = asCategory.cat.codes
    return encoded, categories


def encode_like_train(frame, categories, featureColumns):
    """Encode ``frame`` with the same codes and column layout as the training set.

    Args:
        frame: DataFrame with the input columns (no price).
        categories: mapping returned by ``encode_train``.
        featureColumns: ordered feature column names the model was fitted on.

    Returns:
        A DataFrame whose columns are exactly ``featureColumns``.
    """
    encoded = pandas.get_dummies(frame, columns=['engineType'])
    for c in encoded.select_dtypes(include=object).columns.values:
        if c in categories:
            #reuse the training categories so equal values get equal codes;
            #values never seen in training are coded as -1
            encoded[c] = pandas.Categorical(encoded[c], categories=categories[c]).codes
        else:
            #text column that was not integer-coded in training: code it locally
            encoded[c] = encoded[c].astype('category').cat.codes
    #dummy columns absent from this frame become 0, columns unknown to the model are dropped
    return encoded.reindex(columns=featureColumns, fill_value=0)


def main():
    """Train the price model and write predictions for dev-0 and test-A."""
    #read from files
    with open('./names') as f_names:
        names = f_names.read().rstrip('\n').split('\t')
    read0 = pandas.read_table(openTrain, sep='\t', names=names)
    read1 = pandas.read_table(openDev, sep='\t', names=inputNames)
    read2 = pandas.read_table(openTest, sep='\t', names=inputNames)

    #basic normalization & filtering
    trainSet, categories = encode_train(read0)
    trainSet = trainSet.loc[(trainSet['price'] > 10000)] #to avoid suspicious observations
    #for some reason this value gives the smallest RMSE according to geval, while smaller or bigger
    #price gives RMSE >34k

    #Model training
    X = trainSet.loc[:, trainSet.columns != 'price']
    solution = LinearRegression().fit(X, trainSet['price'])
    featureColumns = list(X.columns)

    devSet = encode_like_train(read1, categories, featureColumns)
    predict = solution.predict(devSet)
    predict.tofile("./dev-0/out.tsv", sep='\n')

    #test-A predictions are built from the test-A input, not from the dev-0 frame
    testSet = encode_like_train(read2, categories, featureColumns)
    predict = solution.predict(testSet)
    predict.tofile("./test-A/out.tsv", sep='\n')


if __name__ == '__main__':
    main()

#Outcome: 33956 for prices >10000
#(recorded before dev/test encoding was aligned with the training set - re-measure with geval)
|
Loading…
Reference in New Issue
Block a user