auta-public/rozwiązanie.py
2021-05-17 21:27:15 +02:00

50 lines
1.7 KiB
Python

import pandas
from sklearn.linear_model import LinearRegression
# Locations of the training data and of the two feature files to predict for.
r_in = './train/train.tsv'
r_ind_ev = './dev-0/in.tsv'
r_ind_test_A = './test-A/in.tsv'

# Column names for the training file are read from ./names and split on tabs.
with open('./names') as f_names:
    header_text = f_names.read()
names = header_text.rstrip('\n').split('\t')

# Column names assigned to the dev-0 and test-A inputs when they are read.
feature_names = ['mileage', 'year', 'brand', 'engineType', 'engineCapacity']

# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# pandas 2.0 (successor: `on_bad_lines`); it is kept here so that behaviour on
# the pandas version this script was written for does not change.
tsv_read = pandas.read_table(
    r_in, names=names, sep='\t', error_bad_lines=False)
tsv_read_dev = pandas.read_table(
    r_ind_ev, names=feature_names, sep='\t', error_bad_lines=False)
tsv_read_test_A = pandas.read_table(
    r_ind_test_A, names=feature_names, sep='\t', error_bad_lines=False)
# One-hot encode the engine type; every column that is still object-typed
# afterwards is replaced by the integer codes of its pandas categorical.
train = pandas.get_dummies(tsv_read, columns=['engineType'])
categorical_cols = train.select_dtypes(include=object).columns.values
for col in categorical_cols:
    as_category = train[col].astype('category')
    train[col] = as_category.cat.codes

# Only rows priced above 1000 are used for fitting.
train = train[train['price'] > 1000]

# Every column except the target is a feature.
X = train.drop(columns=['price'])
clf = LinearRegression().fit(X, train['price'])
# Encode the dev-0 features the same way as the training frame: one-hot
# columns for the engine type, integer category codes for the remaining
# object-typed columns.
# NOTE(review): the codes are derived from this frame alone, so a given value
# is not guaranteed to get the same code it received in the training frame.
dev = pandas.get_dummies(tsv_read_dev, columns=['engineType'])
categorical_cols1 = dev.select_dtypes(include=object).columns.values
for col in categorical_cols1:
    dev_categories = dev[col].astype('category')
    dev[col] = dev_categories.cat.codes

# Predict and write the values separated by newlines.
predictions = clf.predict(dev)
predictions.tofile("./dev-0/out.tsv", sep='\n')
# Encode the test-A features: one-hot columns for the engine type, integer
# category codes for the remaining object-typed columns.
# NOTE(review): the codes and the set of one-hot columns are derived from this
# frame alone; confirm they line up with the columns the model was fitted on.
test = pandas.get_dummies(tsv_read_test_A, columns=['engineType'])
categorical_cols2 = test.select_dtypes(include=object).columns.values
for col in categorical_cols2:
    test[col] = test[col].astype('category').cat.codes

# Predict on the test-A frame. The previous version passed `dev` here, so
# ./test-A/out.tsv received the dev-0 predictions instead.
predictions = clf.predict(test)
predictions.tofile("./test-A/out.tsv", sep='\n')