magic

poly
new features
2021-05-12 11:42:17 +02:00 · 2021-05-12 10:55:01 +02:00 · 2021-05-12 09:11:08 +02:00 · 2021-05-09 21:26:15 +02:00
4 changed files with 2047 additions and 0 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/BIN
+++ b/BIN
--- a/script.py
+++ b/script.py
@@ -0,0 +1,47 @@
+from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import PolynomialFeatures
+import pandas as pd
+import numpy as np
+
+# State learned on the first call to process_data() and reused on every later
+# call, so that all datasets are encoded with the same brands and columns.
+brands = None
+feature_columns = None
+
+def process_data(df):
+    """Build the numeric feature matrix for a frame of car listings.
+
+    Expects the columns ``year``, ``mileage``, ``engine_capacity`` and
+    ``brand``; any remaining non-numeric columns are one-hot encoded by
+    ``pd.get_dummies``.
+
+    The first call fixes two pieces of module-level state: the 35 most
+    frequent brands (``brands``) and the one-hot column layout
+    (``feature_columns``).  Later calls reuse both, so a frame that lacks
+    some category still yields a matrix with identical columns in identical
+    order.
+
+    Returns the array produced by a degree-2, interaction-only
+    ``PolynomialFeatures`` transform.  The input frame is not modified.
+    """
+    global brands, feature_columns
+
+    # Work on a copy: the caller may pass a slice of a larger frame.
+    df = df.copy()
+
+    df["age"] = 2018 - df["year"]
+    # Despite the "sqrt_" prefix the exponent is 0.7.  Negative bases are
+    # clipped to 0 because a fractional power of a negative number is NaN.
+    for column in ("age", "mileage", "engine_capacity"):
+        df[f"sqrt_{column}"] = df[column].clip(lower=0) ** 0.7
+
+    if brands is None:
+        brands = df["brand"].value_counts()[:35].index.tolist()
+    # Every brand outside the remembered list is collapsed into bucket "0".
+    df["brand"] = df["brand"].where(df["brand"].isin(brands), "0")
+
+    df = pd.get_dummies(df)
+    if feature_columns is None:
+        feature_columns = df.columns.tolist()
+    else:
+        # Align with the first call: add missing dummy columns as 0, drop
+        # unseen ones, and restore the original column order.
+        df = df.reindex(columns=feature_columns, fill_value=0)
+
+    poly = PolynomialFeatures(2, interaction_only=True)
+    return poly.fit_transform(df)
+
+def get_model():
+    """Fit a LinearRegression on ./train/train.tsv and return the fitted model.
+
+    The file is read as headerless TSV; ``price`` is the target and the
+    remaining columns are passed through process_data() to build features.
+    """
+    column_names = ["price", "mileage", "year", "brand", "engine_type", "engine_capacity"]
+    train = pd.read_csv('./train/train.tsv', sep='\t', names=column_names)
+    target = train['price']
+    features = process_data(train.loc[:, train.columns != 'price'])
+    model = LinearRegression()
+    model.fit(features, target)
+    return model
+
+
+def predict_and_write(path, model):
+    """Predict prices for ``{path}in.tsv`` and write them to ``{path}out.tsv``.
+
+    ``path`` is used as a plain string prefix, so it must end with a path
+    separator (e.g. ``'./dev-0/'``).  ``model`` must provide ``predict``.
+    Each prediction is cast to int and written on its own line.
+    """
+    df_dev = pd.read_csv(f'{path}in.tsv', sep='\t',
+                         names=["mileage", "year", "brand", "engine_type", "engine_capacity"])
+    features = process_data(df_dev)
+    predictions = model.predict(features).astype(int)
+
+    # Open the output only once predictions exist: opening in 'w' mode
+    # truncates the file, so a failure above must not wipe earlier results.
+    with open(f'{path}out.tsv', 'w') as out:
+        out.writelines(f"{prediction}\n" for prediction in predictions)
+
+def main():
+    """Train the model once, then write predictions for dev-0 and test-A."""
+    fitted = get_model()
+    for directory in ('./dev-0/', './test-A/'):
+        predict_and_write(directory, fitted)
+
+if __name__ == '__main__':
+    main()
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
Author	SHA1	Message	Date
s470611	e41231f1ea	magic	2021-05-12 11:42:17 +02:00
s470611	7ae168e1cb	poly	2021-05-12 10:55:01 +02:00
s470611	773c8517d0	new features	2021-05-12 09:11:08 +02:00
s470611	979991c0d8	linear regr	2021-05-09 21:26:15 +02:00