retroc2/run.py
2022-05-25 17:59:38 +02:00

36 lines
1.2 KiB
Python

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
# Train a TF-IDF + linear-regression model on train.tsv and write one
# prediction per line for each evaluation split.

# Split prefixes: "<split>_in.tsv" is read, "<split>_out.tsv" is written.
SPLITS = ("dev0", "dev1", "testA")

# Training data has no header row. Columns 0 and 1 are averaged into the
# regression target and column 4 is the text that gets vectorized.
# NOTE(review): columns 0/1 look like the bounds of a year range - confirm
# against the dataset description.
df = pd.read_csv("train.tsv", sep="\t", header=None)
df["year_mean"] = (df[1] + df[0]) / 2

# Load every input split up front so a missing file fails before training.
# A multi-character `sep` is interpreted by pandas as a regular expression
# and handled by the Python parsing engine; requesting that engine explicitly
# avoids the ParserWarning about the fallback.
# NOTE(review): the CR+TAB separator presumably never occurs in the data, so
# each line ends up whole in column 0 - confirm this is the intent.
inputs = {
    split: pd.read_csv(
        f"{split}_in.tsv", sep="\r\t", header=None, engine="python"
    )
    for split in SPLITS
}

# Unigram + bigram TF-IDF features, capped at 2000 vocabulary entries.
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
X = vectorizer.fit_transform(df[4])
y = df["year_mean"]
model = LinearRegression().fit(X, y)

for split in SPLITS:
    # Reuse the vocabulary fitted on the training text; never refit here.
    predictions = model.predict(vectorizer.transform(inputs[split][0]))
    with open(f"{split}_out.tsv", "w", encoding="UTF-8") as file:
        # One prediction per line, in input order.
        file.writelines(str(value) + "\n" for value in predictions)