import pandas as pd
import plotly.express as px
from pandas import DataFrame
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

col_names = ["price", "mileage", "year", "brand", "engine_type", "engine_cap"]
col_names_in = ["mileage", "year", "brand", "engine_type", "engine_cap"]

# Numeric columns that are passed through the RobustScaler.
_NUMERIC_COLUMNS = ["mileage", "year", "engine_cap"]

# Options shared by every TSV read in this script.
# NOTE(review): `error_bad_lines` is kept because the rest of the script
# targets older library versions; newer pandas releases replace it with
# `on_bad_lines="skip"` -- confirm against the installed pandas version.
_TSV_OPTIONS = {"error_bad_lines": False, "header": None, "sep": "\t"}

df_train = pd.read_csv("train/train.tsv", names=col_names, **_TSV_OPTIONS)
df = df_train
dev0 = pd.read_csv("dev-0/in.tsv", names=col_names_in, **_TSV_OPTIONS)
testA = pd.read_csv("test-A/in.tsv", names=col_names_in, **_TSV_OPTIONS)
test = pd.read_csv("dev-0/expected.tsv", **_TSV_OPTIONS)
Y_a = test[0]

# The 35 most frequent brands in the raw training data; every other brand
# is collapsed into the placeholder "0" during preprocessing.
brands = df.brand.value_counts()[:35].index.tolist()


def _encode_features(df: DataFrame, brands: list) -> DataFrame:
    """Return a transformed, one-hot encoded copy of *df* (input untouched)."""
    known_brands = set(brands)
    # Work on a copy so the caller's frame is never modified.
    encoded = df.copy()
    # Brand membership is checked BEFORE lower-casing, as in the original.
    encoded["brand"] = encoded["brand"].apply(
        lambda name: name if name in known_brands else "0"
    )
    encoded["year"] = encoded["year"] / 2000
    encoded["mileage"] = encoded["mileage"] ** 0.3
    encoded["engine_cap"] = encoded["engine_cap"] * 0.3
    encoded["brand"] = encoded["brand"].str.lower()
    return pd.get_dummies(encoded, columns=["brand", "engine_type"])


def preprocess_data(df: DataFrame, brands: list, scaler=None, feature_columns=None):
    """Prepare a dataset for linear regression.

    Steps: collapse rare brands to "0", rescale ``year``/``mileage``/
    ``engine_cap``, one-hot encode ``brand`` and ``engine_type``, apply a
    ``RobustScaler`` to the numeric columns and expand everything with
    degree-2 interaction-only polynomial features.

    Args:
        df: Frame with the columns ``mileage``, ``year``, ``brand``,
            ``engine_type`` and ``engine_cap``. It is not modified.
        brands: Brand names to keep; any other brand becomes "0".
        scaler: Optional already-fitted scaler exposing ``transform``. When
            omitted, a new ``RobustScaler`` is fitted on *df* itself, which
            is the original behaviour of this function.
        feature_columns: Optional column layout of the encoded training
            frame. When given, the encoded frame is reindexed to it: dummy
            columns missing from *df* are added filled with 0 and columns
            not in the layout are dropped, so every dataset ends up with
            the same features in the same order.

    Returns:
        The output of ``PolynomialFeatures.fit_transform`` on the encoded
        frame (not a DataFrame, despite the original annotation).
    """
    encoded = _encode_features(df, brands)
    if feature_columns is not None:
        encoded = encoded.reindex(columns=feature_columns, fill_value=0)

    # Original author's note near this step: "takes 1k rmse more ;("
    # (unclear whether it refers to the one-hot encoding or the scaling).
    if scaler is None:
        scaler = preprocessing.RobustScaler().fit(encoded[_NUMERIC_COLUMNS])
    encoded[_NUMERIC_COLUMNS] = scaler.transform(encoded[_NUMERIC_COLUMNS])

    poly = PolynomialFeatures(2, interaction_only=True)
    return poly.fit_transform(encoded)


# Drop training rows whose price lies strictly between 1 and 1000
# (earlier attempt: df_train[df_train.price not in range(2, 1000)]).
indexes = df_train[(df_train.price < 1000) & (df_train.price > 1)].index
df_train.drop(indexes, inplace=True)
# Drop training rows with a mileage above 900000.
index = df_train[(df_train.mileage > 900000)].index
df_train.drop(index, inplace=True)

Y_train = df_train["price"]
df_train.drop("price", axis=1, inplace=True)

# Fit the scaler and fix the dummy-column layout on the training data only,
# then apply the very same transformation to every dataset so that the
# model sees identically scaled and identically ordered features.
_train_encoded = _encode_features(df_train, brands)
_train_columns = _train_encoded.columns
_train_scaler = preprocessing.RobustScaler().fit(_train_encoded[_NUMERIC_COLUMNS])

df_train = preprocess_data(df_train, brands, _train_scaler, _train_columns)
dev0 = preprocess_data(dev0, brands, _train_scaler, _train_columns)
testA = preprocess_data(testA, brands, _train_scaler, _train_columns)
# Disabled exploration: correlation heat map of the training features.
# fig = px.imshow(df_train.corr())
# fig.show()

# Fit a plain linear regression on the preprocessed training features.
lm_model = LinearRegression()
# Disabled experiment: cross-validated ridge regression.
# clf = RidgeCV(alphas=[0.1, 0.01, 0.001, 0.00001, 1e-1], cv=10, fit_intercept=True, normalize=True)
# clf.fit(df_train, Y_train)
lm_model.fit(df_train, Y_train)

dev0_predicted = lm_model.predict(dev0)
testA_predicted = lm_model.predict(testA)
# dev0_predicted2 = clf.predict(dev0)

# Write one prediction per line, without index or header.
pd.Series(dev0_predicted).to_csv("dev-0/out.tsv", sep="\t", index=False, header=False)
pd.Series(testA_predicted).to_csv("test-A/out.tsv", sep="\t", index=False, header=False)

# RMSE on the dev set. The square root is taken explicitly instead of
# passing `squared=False`, so the line does not depend on that keyword
# being supported by the installed scikit-learn; for a single-output
# target both forms give the same value.
print(mean_squared_error(Y_a, dev0_predicted) ** 0.5)