poly

2021-05-12 10:55:01 +02:00 · 2021-05-12 10:55:01 +02:00 · 7ae168e1cb
commit 7ae168e1cb
parent 773c8517d0
3 changed files with 2009 additions and 2004 deletions
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/script.py
+++ b/script.py
@@ -1,33 +1,38 @@
 from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import PolynomialFeatures
 import pandas as pd
 import numpy as np

 brands = None

-def get_model():
+def process_data(df):
+    df["age"] = df.year.apply(lambda x: np.sqrt(2017-x))
+    df["sqrt_mileage"] = df.mileage.apply(lambda x: np.sqrt(x))
+    df["sqrt_engine_capacity"] = df.engine_capacity.apply(lambda x: np.sqrt(x))
    global brands
+    if not brands:
+        brands = df.brand.value_counts()[:35].index.tolist()
+    df.brand = df.brand.apply(lambda x: x if x in brands else "0")
+    df = pd.get_dummies(df)
+    poly = PolynomialFeatures(2, interaction_only=True)
+    df  = poly.fit_transform(df)
+    return df
+
+def get_model():
    df = pd.read_csv('./train/train.tsv', sep='\t',
                 names=["price", "mileage", "year", "brand", "engine_type", "engine_capacity"])
    X = df.loc[:, df.columns != 'price']
    y = df['price']
-    X["age"] = X.year.apply(lambda x: np.sqrt(2017-x))
-    X["sqrt_mileage"] = X.mileage.apply(lambda x: np.sqrt(x))
-    brands = X.brand.value_counts()[:35].index.tolist()
-    X.brand = X.brand.apply(lambda x: x if x in brands else "0")
-    X = pd.get_dummies(X)
+    X  = process_data(X)
    regr = LinearRegression()
    return regr.fit(X, y)


 def predict_and_write(path, model):
-    global brands
    with open(f'{path}out.tsv', 'w') as out:
        df_dev = pd.read_csv(f'{path}in.tsv', sep='\t',
                            names=["mileage", "year", "brand", "engine_type", "engine_capacity"])
-        df_dev.brand = df_dev.brand.apply(lambda x: x if x in brands else "0")
-        df_dev["age"] = df_dev.year.apply(lambda x: np.sqrt(2017-x))
-        df_dev["sqrt_mileage"] = df_dev.mileage.apply(lambda x: np.sqrt(x))
-        df_dev = pd.get_dummies(df_dev)
+        df_dev = process_data(df_dev)
        predictions = model.predict(df_dev).astype(int)
        for prediction in predictions:
            out.write(f"{prediction}\n")
--- a/test-A/out.tsv
+++ b/test-A/out.tsv