# Forked from kubapok/auta-public (58 lines, 1.7 KiB, Python).
|
import pandas as pd
|
||
|
from pathlib import Path
|
||
|
from sklearn.linear_model import LinearRegression
|
||
|
|
||
|
def get_names():
    """Return the tab-separated fields stored in the ``names`` file.

    The file is looked up relative to the current working directory.
    Trailing newline characters are removed before the content is split
    on tab characters.

    Returns:
        A list of strings, one per tab-separated field.
    """
    names_path = Path('./') / 'names'
    raw_header = names_path.read_text()
    return raw_header.rstrip('\n').split('\t')
|
||
|
|
||
|
def get_data(names):
    """Load the train, dev and test tables from their TSV files.

    Args:
        names: Column names applied to ``train/train.tsv``.

    Returns:
        A ``(df, dev_data, test_data)`` tuple of DataFrames read from
        ``train/train.tsv``, ``dev-0/in.tsv`` and ``test-A/in.tsv``. The
        dev and test tables are labelled with the fixed columns
        ``mileage, year, brand, engineType, engineCapacity``.
    """
    feature_names = ['mileage', 'year', 'brand', 'engineType', 'engineCapacity']

    def _read_tsv(path, columns):
        # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0,
        # where passing it raises TypeError. ``on_bad_lines="warn"`` keeps the
        # old behaviour (skip malformed rows, emit a warning).
        try:
            return pd.read_csv(path, header=None, sep="\t", names=columns,
                               on_bad_lines="warn")
        except TypeError:
            # pandas < 1.3 does not know ``on_bad_lines``.
            return pd.read_csv(path, header=None, sep="\t", names=columns,
                               error_bad_lines=False)

    df = _read_tsv("train/train.tsv", names)
    dev_data = _read_tsv("dev-0/in.tsv", list(feature_names))
    test_data = _read_tsv("test-A/in.tsv", list(feature_names))
    return df, dev_data, test_data
|
||
|
|
||
|
def get_train_data(df):
    """Build the filtered, one-hot encoded training table.

    The ``brand`` column is dropped, ``engineType`` is expanded into
    indicator columns, and only rows with ``price`` above 1000 and
    ``mileage`` above 100 are kept.

    Args:
        df: DataFrame with at least ``brand``, ``engineType``, ``price``
            and ``mileage`` columns.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    without_brand = df.drop(columns=['brand'])
    encoded = pd.get_dummies(without_brand, columns=['engineType'])
    # Both thresholds are applied as a single row mask.
    keep_rows = (encoded['price'] > 1000) & (encoded['mileage'] > 100)
    return encoded.loc[keep_rows]
|
||
|
|
||
|
def get_x(train):
    """Return every column of ``train`` except ``price``.

    Args:
        train: DataFrame holding the features and, optionally, ``price``.

    Returns:
        A DataFrame with the ``price`` column removed; when ``train`` has
        no such column all columns are returned.
    """
    target_column = 'price'
    return train.drop(columns=target_column, errors='ignore')
|
||
|
|
||
|
def get_y(train):
    """Return the ``price`` column of ``train``.

    Args:
        train: DataFrame containing a ``price`` column.

    Returns:
        Whatever ``train['price']`` selects.

    Raises:
        KeyError: If ``train`` has no ``price`` column.
    """
    target_column = 'price'
    return train[target_column]
|
||
|
|
||
|
def get_linear_regression(x, y):
    """Fit a ``LinearRegression`` model on ``x`` and ``y``.

    Args:
        x: Feature matrix passed to ``fit``.
        y: Target values passed to ``fit``.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    model = LinearRegression()
    # ``fit`` returns the estimator itself, so this is the same object the
    # chained call would have returned.
    model.fit(x, y)
    return model
|
||
|
|
||
|
def process_data(df):
    """Prepare a feature table for prediction.

    Drops the ``brand`` column and expands ``engineType`` into indicator
    columns.

    Args:
        df: DataFrame with at least ``brand`` and ``engineType`` columns.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    without_brand = df.drop(columns=['brand'])
    encoded = pd.get_dummies(without_brand, columns=['engineType'])
    return encoded
|
||
|
|
||
|
def get_prediction(clf, data, type):
    """Predict with ``clf`` and write the result to the matching output file.

    Args:
        clf: Object whose ``predict(data)`` returns an array with a
            ``tofile`` method.
        data: Input passed to ``clf.predict``.
        type: ``'dev'`` writes to ``./dev-0/out.tsv`` and ``'test'`` writes
            to ``./test-A/out.tsv``; any other value writes nothing.

    Returns:
        None. Values are written as text, separated by newlines.
    """
    prediction = clf.predict(data)
    destinations = (
        ('dev', "./dev-0/out.tsv"),
        ('test', "./test-A/out.tsv"),
    )
    for label, out_path in destinations:
        if type == label:
            prediction.tofile(out_path, sep='\n')
            break
|
||
|
|
||
|
def main():
    """Fit a linear regression on the training table and write predictions.

    Reads the column names and the three TSV tables, trains on the filtered
    training rows, then writes predictions for the dev and test inputs via
    ``get_prediction``.
    """
    # prepare
    df, dev_data, test_data = get_data(get_names())
    train = get_train_data(df)
    x = get_x(train)
    y = get_y(train)

    # linear regression
    clf = get_linear_regression(x, y)

    # predictions
    # Each split is one-hot encoded on its own, so an engineType value that
    # is missing from (or only present in) dev/test would give a column set
    # different from the one the model was fitted on. Reindexing to the
    # training feature columns adds absent indicators as 0, drops unseen
    # ones and fixes the column order.
    dev = process_data(dev_data).reindex(columns=x.columns, fill_value=0)
    test = process_data(test_data).reindex(columns=x.columns, fill_value=0)
    get_prediction(clf, dev, 'dev')
    get_prediction(clf, test, 'test')
|
||
|
|
||
|
# Runs the whole pipeline at module level; there is no ``__main__`` guard,
# so importing this file also triggers it.
main()
|