# precipitation-pl/run.py

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression

# Column names applied to the headerless `in.tsv` feature files. Further down
# the script reads 'id_stacji' as the station id, 'rok' as the year and
# 'miesiąc' as the 1-based month number.
in_columns = ['id_stacji', 'nazwa_stacji', 'typ_zbioru', 'rok', 'miesiąc']

# Both training files are tab-separated and carry no header row.
tsv_format = {'sep': '\t', 'header': None}

# Training features.
df = pd.read_csv('train/in.tsv', **tsv_format)
df.columns = in_columns

# Training targets; the script pairs them with `df` row by row, so the two
# files are expected to be line-aligned.
measurements = pd.read_csv('train/expected.tsv', **tsv_format)
measurements.columns = ['suma_opadów']

# Years handled by the script form the half-open range [start_year, end_year).
start_year = 1981
end_year = 2021
total_years = end_year - start_year
total_months = total_years * 12
# Number of leading years that the script later asserts to be completely
# observed for every station.
known_years = 30

# Station identifiers; a station's position in this list is its row in `x`.
stations = [
    249180010,
    249190560,
    249200370,
    249200490,
    249220150,
    249220180,
    250190160,
    250190390,
    250210130,
    251170090,
    251210040,
    252150120,
    252160230,
    252200150,
    252210050,
    252230120,
    253170210,
    253220070,
    253230020,
    254200080,
    254220090,
]
station_to_idx = {station: i for i, station in enumerate(stations)}

# Measurement matrix indexed by (station row, months elapsed since January of
# `start_year`); -1 marks a cell that has not been filled yet.
# The dtype must be given explicitly: with only an integer fill value np.full
# creates an integer array, and every fractional measurement or prediction
# written into it afterwards would be silently truncated.
x = np.full((len(stations), total_months), fill_value=-1.0, dtype=float)

def _store_measurements(matrix, features, targets, *, require_empty):
    """Write one measurement per feature row into `matrix`.

    Rows of `features` and `targets` are paired positionally; pairing stops at
    the shorter of the two, as `zip` does.

    Args:
        matrix: 2-D array indexed by (station row, absolute month); modified
            in place.
        features: frame with the 'id_stacji', 'rok' and 'miesiąc' columns.
        targets: frame with the 'suma_opadów' column.
        require_empty: when true, assert that the destination cell still holds
            the -1 "missing" marker before it is overwritten.

    Raises:
        KeyError: if a station id is not present in `station_to_idx`.
        AssertionError: if a year or month is out of range, or if
            `require_empty` is set and the cell was already filled.
    """
    # Iterating the columns yields plain scalars, so every cell is assigned a
    # scalar instead of a one-element row Series. Assigning such a size-1
    # array to a single element relies on an implicit array-to-scalar
    # conversion that NumPy has deprecated since 1.25.
    rows = zip(
        features['id_stacji'],
        features['rok'],
        features['miesiąc'],
        targets['suma_opadów'],
    )
    for station_id, year, month_number, value in rows:
        station_idx = station_to_idx[station_id]
        month = month_number - 1  # files use 1-based months
        assert start_year <= year < end_year, year
        assert 0 <= month < 12
        absolute_month = (year - start_year) * 12 + month
        if require_empty:
            assert matrix[station_idx, absolute_month] == -1
        matrix[station_idx, absolute_month] = value


# Training split: cells are written unconditionally.
_store_measurements(x, df, measurements, require_empty=False)

test_in = pd.read_csv('dev-0/in.tsv', header=None, sep='\t')
test_in.columns = in_columns

test_exp = pd.read_csv('dev-0/expected.tsv', header=None, sep='\t')
test_exp.columns = ['suma_opadów']

# dev-0 split: must not overlap with anything already stored.
_store_measurements(x, test_in, test_exp, require_empty=True)

# Split the flat month axis of `x` into (station, year, month-of-year).
z = x.reshape(len(stations), total_years, 12)

# The first `known_years` years must not contain the -1 "missing" marker for
# any station.
fully_known: np.ndarray = z[:, :known_years]
assert not (fully_known == -1).any()

# One value per station, aggregated over all known years and months.
all_time_std = np.std(fully_known, axis=(1, 2))
all_time_mean = np.mean(fully_known, axis=(1, 2))

# One value per (station, month-of-year), aggregated over the known years.
std_per_month = np.std(fully_known, axis=1)
mean_per_month = np.mean(fully_known, axis=1)

# Rows of `x` (stations) that still hold at least one -1 "missing" marker.
has_gap = (x == -1).any(axis=1)
missing_stations = np.flatnonzero(has_gap)

# Sanity check: each such station lacks every month after the first
# `known_years` years, and nothing else is missing anywhere.
missing_entries = 12 * (total_years - known_years) * len(missing_stations)
assert (z[missing_stations, known_years:] == -1).sum() == missing_entries
assert (x == -1).sum() == missing_entries
# Debug plot, left disabled:
# plt.plot(fully_known.reshape(len(stations),-1).T)
# plt.show()

# Stations without any gap, and their complete series.
all_stations = np.arange(len(stations))
known_stations = all_stations[~has_gap]
entries_of_fully_known_stations = z[known_stations]
assert not (entries_of_fully_known_stations == -1).any()

# Observed part (first `known_years` years) of the stations that have gaps.
known_entries_of_partially_known_stations = z[missing_stations, :known_years]

# One regression per month-of-year. Each model maps the values of the
# gap-free stations to the values of the stations with gaps, and is fitted on
# the first `known_years` years, where both groups are observed.
model_per_month = [LinearRegression() for _ in range(12)]
for month, model in enumerate(model_per_month):
    # Rows are years; columns are gap-free stations (u) or stations with
    # gaps (v).
    u = entries_of_fully_known_stations[:, :known_years, month].T
    v = known_entries_of_partially_known_stations[:, :, month].T
    model.fit(u, v)
    p = model.predict(u)
    # Root-mean-square error on the fitting data. The square root was missing
    # before, so the value named `rmse` was actually the mean squared error.
    rmse = np.sqrt(np.mean((p - v) ** 2))
    # Baseline: predict each station's mean for this month-of-year.
    m = mean_per_month[missing_stations, month]
    rmse2 = np.sqrt(np.mean((m - v) ** 2))
    print(rmse, "/", rmse2)

# Snapshot of the matrix before imputation; used later to confirm that the
# requested cells really were missing.
z_prev = z.copy()

for month, model in enumerate(model_per_month):
    # Values of the gap-free stations for the years after `known_years`,
    # arranged with one row per year.
    later_inputs = entries_of_fully_known_stations[:, known_years:, month].T
    predicted = model.predict(later_inputs)
    # Negative predictions are replaced by zero.
    predicted = np.where(predicted < 0, 0, predicted)
    # Every cell about to be written must still be marked as missing.
    assert (z[missing_stations, known_years:, month] == -1).all()
    z[missing_stations, known_years:, month] = predicted.T
# After imputation no "missing" marker may remain.
assert not np.any(z == -1)
# Features of the test-A split; note that this rebinds `df`.
df = pd.read_csv('test-A/in.tsv', sep='\t', header=None)
df.columns = in_columns
# Emit one value per input row, in input order.
with open('test-A/out.tsv', 'w+') as f:
    for station_id, year, month_number in zip(df['id_stacji'], df['rok'], df['miesiąc']):
        station_idx = station_to_idx[station_id]
        month = month_number - 1  # files use 1-based months
        assert start_year <= year < end_year, year
        assert 0 <= month < 12
        year_offset = year - start_year
        # The requested cell must have been missing before imputation ...
        assert z_prev[station_idx, year_offset, month] == -1
        # ... and must hold a value now.
        value = z[station_idx, year_offset, month]
        assert value != -1
        f.write(str(value) + "\n")