new features

This commit is contained in:
s470611 2021-05-12 09:11:08 +02:00
parent 979991c0d8
commit 773c8517d0
3 changed files with 2007 additions and 2000 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -2,25 +2,32 @@ from sklearn.linear_model import LinearRegression
import pandas as pd import pandas as pd
import numpy as np import numpy as np
brands = None
def get_model(): def get_model():
global brands
df = pd.read_csv('./train/train.tsv', sep='\t', df = pd.read_csv('./train/train.tsv', sep='\t',
names=["price", "mileage", "year", "brand", "engine_type", "engine_capacity"]) names=["price", "mileage", "year", "brand", "engine_type", "engine_capacity"])
X = df.loc[:, df.columns != 'price'] X = df.loc[:, df.columns != 'price']
y = df['price'] y = df['price']
X["age"] = X.year.apply(lambda x: np.sqrt(2017-x))
X = X.drop(["brand"], axis=1) X["sqrt_mileage"] = X.mileage.apply(lambda x: np.sqrt(x))
X = pd.get_dummies(X, columns= ["engine_type"], drop_first=True) brands = X.brand.value_counts()[:35].index.tolist()
X.brand = X.brand.apply(lambda x: x if x in brands else "0")
X = pd.get_dummies(X)
regr = LinearRegression() regr = LinearRegression()
return regr.fit(X, y) return regr.fit(X, y)
def predict_and_write(path, model): def predict_and_write(path, model):
global brands
with open(f'{path}out.tsv', 'w') as out: with open(f'{path}out.tsv', 'w') as out:
df_dev = pd.read_csv(f'{path}in.tsv', sep='\t', df_dev = pd.read_csv(f'{path}in.tsv', sep='\t',
names=["mileage", "year", "brand", "engine_type", "engine_capacity"]) names=["mileage", "year", "brand", "engine_type", "engine_capacity"])
df_dev = df_dev.drop(["brand"], axis=1) df_dev.brand = df_dev.brand.apply(lambda x: x if x in brands else "0")
df_dev = pd.get_dummies(df_dev, columns= ["engine_type"], drop_first=True) df_dev["age"] = df_dev.year.apply(lambda x: np.sqrt(2017-x))
df_dev["sqrt_mileage"] = df_dev.mileage.apply(lambda x: np.sqrt(x))
df_dev = pd.get_dummies(df_dev)
predictions = model.predict(df_dev).astype(int) predictions = model.predict(df_dev).astype(int)
for prediction in predictions: for prediction in predictions:
out.write(f"{prediction}\n") out.write(f"{prediction}\n")

File diff suppressed because it is too large Load Diff