36 lines
1.2 KiB
Python
36 lines
1.2 KiB
Python
import pandas as pd
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.linear_model import LinearRegression
|
|
|
|
|
|
# End-to-end script: fit a TF-IDF + linear-regression model on train.tsv and
# write one prediction per line for each evaluation split.

# Evaluation splits; each one is read from "<name>_in.tsv" and its
# predictions are written to "<name>_out.tsv".
SPLITS = ("dev0", "dev1", "testA")

# --- Load data ---------------------------------------------------------------
# train.tsv is read as a headerless, tab-separated table. The code below uses
# columns 0 and 1 as numeric values and column 4 as the text to vectorize.
df = pd.read_csv("train.tsv", sep="\t", header=None)

# Regression target: the midpoint of columns 0 and 1.
df["year_mean"] = (df[1] + df[0]) / 2

# NOTE(review): "\r\t" is a multi-character separator, which pandas treats as
# a regular expression and which is only supported by the Python parsing
# engine. Presumably it was chosen because it should not occur inside a line,
# so each whole line ends up in column 0 -- confirm against the data files.
# The engine is named explicitly so pandas does not have to fall back from the
# C engine (and warn about it) on every read.
inputs = {
    name: pd.read_csv(
        f"{name}_in.tsv", sep="\r\t", header=None, engine="python"
    )
    for name in SPLITS
}

# --- Train -------------------------------------------------------------------
# Uni- and bi-gram TF-IDF features, capped at 2000 features, fitted on the
# training text column only.
vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
X = vectorizer.fit_transform(df[4])
y = df["year_mean"]

model = LinearRegression().fit(X, y)

# --- Predict -----------------------------------------------------------------
# Inputs are transformed with the already-fitted vectorizer (no re-fitting);
# only column 0 of each input table is used.
predictions = {
    name: model.predict(vectorizer.transform(frame[0]))
    for name, frame in inputs.items()
}

# --- Write results -----------------------------------------------------------
# One prediction per line, rendered with str(); all predictions are computed
# before the first output file is opened.
for name in SPLITS:
    with open(f"{name}_out.tsv", "w", encoding="UTF-8") as file:
        file.writelines(str(value) + "\n" for value in predictions[name])
|
|
|