"""Fit a TF-IDF + linear-regression model on a TSV training set and write
predictions for each evaluation split."""

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Training file and the directories that each hold an `in.tsv` to predict on.
TRAIN_PATH = 'train/train.tsv'
EVAL_DIRS = ('dev-0', 'dev-1', 'test-A')


def load_data(path: str) -> pd.DataFrame:
    """Read a header-less, tab-separated file into a DataFrame.

    Columns are labelled with integer positions (0, 1, 2, ...) because the
    file is read with ``header=None``.
    """
    # NOTE(review): pandas applies its default quote handling here; if the
    # text column can contain unbalanced '"' characters, rows may be merged.
    # Consider `quoting=csv.QUOTE_NONE` after confirming against the data.
    return pd.read_csv(path, sep='\t', header=None)


def write_res(data, path: str) -> None:
    """Write every item of *data* to *path*, one item per line.

    Args:
        data: Iterable of values; each is formatted with ``str()`` semantics.
        path: Full path of the output file (it is overwritten).
    """
    with open(path, 'w', encoding='utf-8') as f:
        # One buffered call instead of a separate write per prediction.
        f.writelines(f'{line}\n' for line in data)
    # `path` is already the complete file path, so report it as-is (the
    # previous message appended a second "/out.tsv").
    print(f"Data written {path}")


def main() -> None:
    """Train on ``TRAIN_PATH`` and write ``out.tsv`` for each evaluation dir."""
    train = load_data(TRAIN_PATH)
    text = train[4]
    # Regression target: midpoint of columns 0 and 1, computed column-wise
    # rather than with a row-by-row `apply`.
    year = (train[0] + train[1]) / 2

    model = make_pipeline(TfidfVectorizer(), LinearRegression())
    model.fit(text, year)

    for split_dir in EVAL_DIRS:
        in_df = load_data(f'{split_dir}/in.tsv')
        # Column 0 of each input file is the text fed to the model.
        predictions = model.predict(in_df[0])
        write_res(predictions, f'{split_dir}/out.tsv')


if __name__ == '__main__':
    main()