"""Fit a TF-IDF + linear-regression model and score the evaluation splits.

The regression target is the mean of columns 0 and 1 of ``train.tsv``
(stored under the name ``year_mean`` in the original script); the features
are TF-IDF weights computed from column 4.  For every split in ``SPLITS``
the script reads ``<split>_in.tsv`` and writes one prediction per line to
``<split>_out.tsv``.
"""

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression

TRAIN_PATH = "train.tsv"
SPLITS = ("dev0", "dev1", "testA")
INPUT_TEMPLATE = "{}_in.tsv"
OUTPUT_TEMPLATE = "{}_out.tsv"


def _load_training_data(path):
    """Return ``(texts, targets)`` read from the headerless training TSV.

    ``texts`` is column 4; ``targets`` is the element-wise mean of
    columns 0 and 1.
    """
    # NOTE(review): default CSV quoting is kept as in the original; if the
    # text column can contain bare double quotes, confirm rows are not merged.
    frame = pd.read_csv(path, sep="\t", header=None)
    targets = (frame[1] + frame[0]) / 2
    return frame[4], targets


def _read_documents(path):
    """Return every line of *path* as one document, without line endings.

    The original script called ``pd.read_csv(..., sep='\\r\\t')``.  That
    two-character separator is interpreted as a regular expression which
    practically never matches, so each whole line ended up in column 0 --
    but blank lines were dropped and values such as ``NA`` became NaN.
    Reading the lines directly keeps the whole-line behaviour while
    guaranteeing exactly one document (and thus one prediction) per input
    line.
    """
    with open(path, encoding="utf-8") as handle:
        return [line.rstrip("\r\n") for line in handle]


def _write_predictions(path, predictions):
    """Write each prediction on its own line using its ``str()`` form."""
    with open(path, "w", encoding="UTF-8") as handle:
        handle.writelines(str(value) + "\n" for value in predictions)


def main():
    """Train on ``TRAIN_PATH`` and write predictions for every split."""
    texts, targets = _load_training_data(TRAIN_PATH)

    # Load every input before fitting so a missing file fails fast, before
    # the model is trained or any output file is written.
    documents_by_split = {
        split: _read_documents(INPUT_TEMPLATE.format(split)) for split in SPLITS
    }

    vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
    features = vectorizer.fit_transform(texts)
    model = LinearRegression().fit(features, targets)

    for split in SPLITS:
        documents = documents_by_split[split]
        # predict() rejects an empty sample matrix; an empty input file
        # simply yields an empty output file.
        if documents:
            predictions = model.predict(vectorizer.transform(documents))
        else:
            predictions = []
        _write_predictions(OUTPUT_TEMPLATE.format(split), predictions)


if __name__ == "__main__":
    main()