auta-public/script.py

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import pandas as pd
import numpy as np

# Brand labels that keep their own category during preprocessing.
# Filled in by process_data() on its first call (the 35 most frequent brands
# of that call's frame) and reused on later calls; every other brand is
# replaced by the placeholder "0".
brands = None

# Dummy-encoded column layout captured on the first process_data() call
# (the training frame); later frames are aligned to it.
_feature_columns = None


def process_data(df):
    """Turn a raw car-listing frame into the model's feature matrix.

    The first call "fits" the preprocessing: it records the 35 most frequent
    brands in the module-level ``brands`` list and the dummy-encoded column
    layout in ``_feature_columns``. Subsequent calls reuse both, so frames
    that miss a category (or contain an unseen one) still produce a matrix
    with the same columns, in the same order, as the first call.

    Args:
        df: DataFrame with at least the columns ``year``, ``mileage``,
            ``engine_capacity`` and ``brand``. It is not modified.

    Returns:
        The array produced by a degree-2, interaction-only
        ``PolynomialFeatures`` expansion of the encoded frame.
    """
    global brands, _feature_columns

    # Work on a copy so the caller's frame (possibly a slice of a larger
    # frame) is never mutated.
    df = df.copy()

    df["age"] = 2018 - df["year"]
    # NOTE: the "sqrt_" names are historical; the exponent actually used is 0.7.
    df["sqrt_age"] = df["age"] ** 0.7
    df["sqrt_mileage"] = df["mileage"] ** 0.7
    df["sqrt_engine_capacity"] = df["engine_capacity"] ** 0.7

    if brands is None:
        brands = df["brand"].value_counts()[:35].index.tolist()
    # Collapse every brand outside the retained list into the "0" bucket.
    df["brand"] = df["brand"].where(df["brand"].isin(brands), "0")

    df = pd.get_dummies(df)
    if _feature_columns is None:
        _feature_columns = df.columns.tolist()
    else:
        # Align to the layout of the first call: categories absent here become
        # all-zero columns, categories unseen in the first call are dropped.
        df = df.reindex(columns=_feature_columns, fill_value=0)

    poly = PolynomialFeatures(2, interaction_only=True)
    return poly.fit_transform(df)

def get_model():
    """Fit a linear regression on the training set and return it.

    Reads ``./train/train.tsv`` (tab-separated, no header row), uses the
    ``price`` column as the target and every other column, passed through
    ``process_data``, as the features.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    column_names = ["price", "mileage", "year", "brand", "engine_type", "engine_capacity"]
    train = pd.read_csv('./train/train.tsv', sep='\t', names=column_names)

    target = train['price']
    is_feature = train.columns != 'price'
    features = process_data(train.loc[:, is_feature])

    model = LinearRegression()
    model.fit(features, target)
    return model


def predict_and_write(path, model):
    """Predict prices for ``{path}in.tsv`` and write them to ``{path}out.tsv``.

    The input is read and all predictions are computed before the output
    file is opened, so a failure while reading or predicting does not leave
    a truncated ``out.tsv`` behind.

    Args:
        path: Prefix prepended verbatim to ``in.tsv`` / ``out.tsv``
            (callers in this file pass a directory with a trailing slash).
        model: Fitted estimator exposing ``predict``.
    """
    column_names = ["mileage", "year", "brand", "engine_type", "engine_capacity"]
    features = pd.read_csv(f'{path}in.tsv', sep='\t', names=column_names)
    # astype(int) truncates the predicted values toward zero.
    predictions = model.predict(process_data(features)).astype(int)

    with open(f'{path}out.tsv', 'w') as out:
        out.writelines(f"{prediction}\n" for prediction in predictions)

def main():
    """Train the model once, then write predictions for each evaluation split."""
    fitted = get_model()
    for split_dir in ('./dev-0/', './test-A/'):
        predict_and_write(split_dir, fitted)

# Run the training/prediction pipeline only when executed as a script,
# not when the module is imported.
if __name__ == '__main__':
    main()
linear regr 2021-05-09 21:26:15 +02:00			`from sklearn.linear_model import LinearRegression`
poly 2021-05-12 10:55:01 +02:00			`from sklearn.preprocessing import PolynomialFeatures`
linear regr 2021-05-09 21:26:15 +02:00			`import pandas as pd`
			`import numpy as np`

new features 2021-05-12 09:11:08 +02:00			`brands = None`
linear regr 2021-05-09 21:26:15 +02:00
poly 2021-05-12 10:55:01 +02:00			`def process_data(df):`
magic 2021-05-12 11:42:17 +02:00			`df["age"] = 2018 - df["year"]`
			`df["sqrt_age"] = df.age**0.7`
			`df["sqrt_mileage"] = df.mileage ** 0.7`
			`df["sqrt_engine_capacity"] = df.engine_capacity ** 0.7`
new features 2021-05-12 09:11:08 +02:00			`global brands`
poly 2021-05-12 10:55:01 +02:00			`if not brands:`
			`brands = df.brand.value_counts()[:35].index.tolist()`
			`df.brand = df.brand.apply(lambda x: x if x in brands else "0")`
			`df = pd.get_dummies(df)`
			`poly = PolynomialFeatures(2, interaction_only=True)`
			`df = poly.fit_transform(df)`
			`return df`

			`def get_model():`
linear regr 2021-05-09 21:26:15 +02:00			`df = pd.read_csv('./train/train.tsv', sep='\t',`
			`names=["price", "mileage", "year", "brand", "engine_type", "engine_capacity"])`
			`X = df.loc[:, df.columns != 'price']`
			`y = df['price']`
poly 2021-05-12 10:55:01 +02:00			`X = process_data(X)`
linear regr 2021-05-09 21:26:15 +02:00			`regr = LinearRegression()`
			`return regr.fit(X, y)`


			`def predict_and_write(path, model):`
			`with open(f'{path}out.tsv', 'w') as out:`
			`df_dev = pd.read_csv(f'{path}in.tsv', sep='\t',`
			`names=["mileage", "year", "brand", "engine_type", "engine_capacity"])`
poly 2021-05-12 10:55:01 +02:00			`df_dev = process_data(df_dev)`
linear regr 2021-05-09 21:26:15 +02:00			`predictions = model.predict(df_dev).astype(int)`
			`for prediction in predictions:`
			`out.write(f"{prediction}\n")`

			`def main():`
			`model = get_model()`
			`predict_and_write('./dev-0/', model)`
			`predict_and_write('./test-A/', model)`

			`if __name__ == '__main__':`
			`main()`