"""Impute missing monthly station measurements and write test-A predictions.

The script loads (station, year, month) rows with their measured values from
``train`` and ``dev-0``, arranges them in a stations x years x months grid,
fills the cells that are still unknown (either with the cross-station mean of
the fully known stations or with per-month linear regressions), and finally
writes one value per row of ``test-A/in.tsv`` to ``test-A/out.tsv``.

NOTE(review): the column names are Polish; 'suma_opadów' presumably means
"precipitation total" -- confirm against the dataset description.
"""
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression

in_columns = ['id_stacji', 'nazwa_stacji', 'typ_zbioru', 'rok', 'miesiąc']

df = pd.read_csv('train/in.tsv', header=None, sep='\t')
df.columns = in_columns
measurements = pd.read_csv('train/expected.tsv', header=None, sep='\t')
measurements.columns = ['suma_opadów']

# Years are handled as the half-open range [start_year, end_year).
start_year = 1981
end_year = 2021
total_years = end_year - start_year
total_months = total_years * 12
# Number of leading years for which every station must be fully known
# (checked by an assertion further down).
known_years = 30

stations = [
    249180010, 249190560, 249200370, 249200490, 249220150, 249220180,
    250190160, 250190390, 250210130, 251170090, 251210040, 252150120,
    252160230, 252200150, 252210050, 252230120, 253170210, 253220070,
    253230020, 254200080, 254220090,
]
station_to_idx = {station: i for i, station in enumerate(stations)}

# -1 marks "no measurement yet". The dtype must be float: without it np.full
# infers an integer dtype from the fill value and every measurement, mean and
# regression prediction stored later would be silently truncated.
x = np.full((len(stations), total_months), fill_value=-1, dtype=float)


def _fill_measurements(grid, features, targets, *, require_empty):
    """Store every measurement of *targets* in *grid*, in place.

    Args:
        grid: 2-D array indexed by [station index, absolute month].
        features: frame with the 'id_stacji', 'rok' and 'miesiąc' columns.
        targets: frame with the 'suma_opadów' column, row-aligned with
            *features*.
        require_empty: when true, assert that each target cell still holds
            the -1 sentinel before it is overwritten.

    Raises:
        KeyError: if a station id is not listed in ``stations``.
        AssertionError: if the frames differ in length, a year or month is
            out of range, or (with *require_empty*) a cell is already filled.
    """
    # zip() would silently drop trailing rows if the two files were misaligned.
    assert len(features) == len(targets), (len(features), len(targets))
    rows = zip(
        features['id_stacji'],
        features['rok'],
        features['miesiąc'],
        targets['suma_opadów'],
    )
    for station_id, year, month_number, value in rows:
        station_idx = station_to_idx[station_id]
        month = month_number - 1  # input months are 1-based
        assert start_year <= year < end_year, year
        assert 0 <= month < 12
        absolute_month = (year - start_year) * 12 + month
        if require_empty:
            assert grid[station_idx, absolute_month] == -1
        # `value` is a plain scalar here, not a one-element Series.
        grid[station_idx, absolute_month] = value


_fill_measurements(x, df, measurements, require_empty=False)

test_in = pd.read_csv('dev-0/in.tsv', header=None, sep='\t')
test_in.columns = in_columns
test_exp = pd.read_csv('dev-0/expected.tsv', header=None, sep='\t')
test_exp.columns = ['suma_opadów']

# dev-0 rows must only land on cells that train left empty.
_fill_measurements(x, test_in, test_exp, require_empty=True)

# View of x as [station, year, month]; writes to z also modify x.
z = x.reshape((len(stations), total_years, 12))

fully_known: np.ndarray = z[:, :known_years]
assert (fully_known == -1).sum() == 0

# Per-station statistics over the fully known period.
all_time_std = fully_known.std((1, 2))
all_time_mean = fully_known.mean((1, 2))
std_per_month = fully_known.std(1)
mean_per_month = fully_known.mean(1)

# Stations that have at least one unknown cell.
missing_stations = np.unique(np.where(x == -1)[0])
# Such stations must be missing exactly every month after the known period,
# and nothing else may be missing anywhere.
missing_entries = len(missing_stations) * (total_years - known_years) * 12
assert (z[missing_stations, known_years:] == -1).sum() == missing_entries
assert (x == -1).sum() == missing_entries

# Debugging aid:
# plt.plot(fully_known.reshape(len(stations),-1).T)
# plt.show()

all_stations = np.arange(len(stations))
known_stations = np.delete(all_stations, missing_stations)

entries_of_fully_known_stations = z[known_stations]
assert (entries_of_fully_known_stations == -1).sum() == 0
known_entries_of_partially_known_stations = z[missing_stations, :known_years]

# One regression per calendar month: predict the partially known stations
# from the fully known stations, trained on the known period.
model_per_month = [LinearRegression() for _ in range(12)]
for month, model in enumerate(model_per_month):
    u = entries_of_fully_known_stations[:, :known_years, month].T
    v = known_entries_of_partially_known_stations[:, :, month].T
    model.fit(u, v)
    p = model.predict(u)
    # These are mean squared errors (no square root is taken): the
    # regression's training error versus the per-station monthly mean.
    mse_model = np.mean((p - v) ** 2)
    m = mean_per_month[missing_stations, month]
    mse_baseline = np.mean((m - v) ** 2)
    print(mse_model, "/", mse_baseline)

# Snapshot taken before imputation, used below to check that only cells
# that were originally unknown are written to the output.
z_prev = z.copy()

METHOD = "mean"

if METHOD == "linear_regression":
    for month, model in enumerate(model_per_month):
        u = entries_of_fully_known_stations[:, known_years:, month].T
        p = model.predict(u)
        p[p < 0] = 0  # clip negative predictions to zero
        assert np.all(z[missing_stations, known_years:, month] == -1)
        z[missing_stations, known_years:, month] = p.T
elif METHOD == "mean":
    # Fill each unknown cell with the mean over the fully known stations
    # for the same year and month.
    for year in range(known_years, total_years):
        for month in range(12):
            assert np.all(z[known_stations, year, month] != -1)
            assert np.all(z[missing_stations, year, month] == -1)
            z[missing_stations, year, month] = np.mean(
                z[known_stations, year, month]
            )
else:
    raise ValueError(f"unknown METHOD: {METHOD!r}")

assert np.all(z != -1)

df = pd.read_csv('test-A/in.tsv', header=None, sep='\t')
df.columns = in_columns

with open('test-A/out.tsv', 'w') as f:
    for station_id, year, month_number in zip(
        df['id_stacji'], df['rok'], df['miesiąc']
    ):
        station_idx = station_to_idx[station_id]
        month = month_number - 1
        assert start_year <= year < end_year, year
        assert 0 <= month < 12
        year = year - start_year
        # Every requested cell must have been unknown before imputation
        # and filled afterwards.
        assert z_prev[station_idx, year, month] == -1
        assert z[station_idx, year, month] != -1
        print(z[station_idx, year, month], file=f)