"""Fit a TF-IDF + linear-regression model on train/train.tsv and write
one prediction per input line for the dev-0 and test-A splits.
"""
import csv
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression


def _open_tsv_rows(handle):
    """Return a csv reader that splits *handle* strictly on tab characters.

    Quote handling is disabled: with the default ``quotechar='"'`` a stray
    double quote at the start of a field makes the reader swallow tabs and
    newlines until the next quote, silently merging rows.
    NOTE(review): this assumes the TSV files are plain (unquoted) text;
    confirm against the data set.
    """
    return csv.reader(handle, delimiter="\t", quoting=csv.QUOTE_NONE)


def _read_training(path):
    """Read the training file and return ``(targets, documents)``.

    The target of a row is the mean of its first two columns (parsed as
    floats); the document is the fifth column.  Malformed rows raise
    (``ValueError`` / ``IndexError``) rather than being skipped.
    """
    targets = []
    documents = []
    # newline="" lets the csv module do its own line splitting, as the
    # csv documentation recommends.
    with open(path, "r", encoding="utf-8", newline="") as handle:
        for row in _open_tsv_rows(handle):
            targets.append((float(row[0]) + float(row[1])) / 2)
            documents.append(row[4])
    return targets, documents


def _read_documents(path):
    """Return the first column of every line in the TSV file at *path*.

    A blank line produces an empty row; it is kept as an empty document
    so that the predictions stay aligned one-to-one with input lines.
    """
    with open(path, "r", encoding="utf-8", newline="") as handle:
        return [row[0] if row else "" for row in _open_tsv_rows(handle)]


def _write_predictions(path, predictions):
    """Write each value of *predictions* on its own line to *path*."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(str(value) + "\n" for value in predictions)


# --- Training ---------------------------------------------------------
date, text = _read_training("train/train.tsv")

lr = LinearRegression()
vectorizer = TfidfVectorizer()
# The vocabulary and IDF weights are learned from the training texts
# only; the other splits are projected with transform() below.
text = vectorizer.fit_transform(text)
print("Fitting lr")
lr.fit(text, date)

# --- dev-0 ------------------------------------------------------------
print("reading in.tsv")
textIn = vectorizer.transform(_read_documents("dev-0/in.tsv"))
devOut = lr.predict(textIn)
print("writing out.tsv")
_write_predictions("dev-0/out.tsv", devOut)

# --- test-A -----------------------------------------------------------
print("reading test in.tsv")
textIn = vectorizer.transform(_read_documents("test-A/in.tsv"))
testOut = lr.predict(textIn)
print("writing test out.tsv")
_write_predictions("test-A/out.tsv", testOut)