"""Train a linear-regression price model and write dev-0 / test-A predictions.

The script reads ``./train/train.tsv`` (column names come from ``./names``),
fits ``LinearRegression`` on the encoded training rows whose price exceeds
``minPrice``, and writes one prediction per line to ``./dev-0/out.tsv`` and
``./test-A/out.tsv``.
"""
#basic imports
import pandas
from pandas.api.types import is_numeric_dtype

#basic paths
openTrain = './train/train.tsv'
openDev = './dev-0/in.tsv'
openTest = './test-A/in.tsv'

#column layout of the header-less dev/test input files
inputColumns = ['mileage', 'year', 'brand', 'engineType', 'engineCapacity']

#training rows with price <= minPrice are dropped to avoid suspicious observations
#for some reason this value gives the smallest RMSE according to geval, while smaller or bigger
#price gives RMSE >34k
minPrice = 10000


def encode_features(frame, categories=None, feature_columns=None):
    """Turn a raw data frame into a purely numeric feature frame.

    ``engineType`` is one-hot encoded; every remaining non-numeric column is
    replaced by integer category codes.

    Args:
        frame: raw data as read from one of the TSV files.
        categories: mapping ``column -> known category values`` returned by a
            previous call on the training data. When ``None`` the mapping is
            learned from ``frame`` itself.
        feature_columns: column names (and order) the result must have.
            Missing columns are added filled with 0, unexpected ones are
            dropped. When ``None`` the columns are left as produced.

    Returns:
        ``(encoded_frame, categories)``; ``categories`` can be passed to later
        calls so that other data sets share the same integer codes.
    """
    encoded = pandas.get_dummies(frame, columns=['engineType'])

    if categories is None:
        # Learn the code table once (on the training data) so that the same
        # value maps to the same integer in every data set.
        categories = {
            name: pandas.Categorical(encoded[name]).categories
            for name in encoded.columns
            if not is_numeric_dtype(encoded[name])
        }

    for name, known in categories.items():
        if name in encoded.columns:
            # Values that were not seen when the table was learned get code -1.
            encoded[name] = pandas.Categorical(encoded[name], categories=known).codes

    if feature_columns is not None:
        # The model expects exactly the columns it was fitted on; an
        # engineType value absent from this file becomes an all-zero column.
        encoded = encoded.reindex(columns=list(feature_columns), fill_value=0)

    return encoded, categories


def main():
    """Fit the model on the training data and write dev and test predictions."""
    # Imported here so that encode_features stays importable without scikit-learn.
    from sklearn.linear_model import LinearRegression

    #read from files
    with open('./names') as f_names:
        names = f_names.read().rstrip('\n').split('\t')
    read0 = pandas.read_table(openTrain, sep='\t', names=names)

    #basic normalization & filtering
    trainSet, categories = encode_features(read0)
    trainSet = trainSet.loc[trainSet['price'] > minPrice]

    #Model training
    X = trainSet.loc[:, trainSet.columns != 'price']
    solution = LinearRegression().fit(X, trainSet['price'])

    # Each input file is read, encoded with the training code table and
    # predicted separately (previously the dev data was reused for test-A).
    outputs = ((openDev, "./dev-0/out.tsv"), (openTest, "./test-A/out.tsv"))
    for inPath, outPath in outputs:
        raw = pandas.read_table(inPath, sep='\t', names=inputColumns)
        features, _ = encode_features(raw, categories, X.columns)
        predict = solution.predict(features)
        predict.tofile(outPath, sep='\n')


if __name__ == "__main__":
    main()

#Outcome: 33956 for prices >10000
#NOTE(review): recorded before dev/test data shared the training encoding - re-measure.