50 lines
1.3 KiB
Python
50 lines
1.3 KiB
Python
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||
|
from sklearn.linear_model import LinearRegression
|
||
|
import string
|
||
|
import csv
|
||
|
|
||
|
# Training: read (target, document) pairs from the training TSV and fit a
# TF-IDF + linear-regression model on them.
date = []
text = []
# newline="" is the mode the csv module asks for on its input files, and
# QUOTE_NONE stops a literal '"' inside a document from opening a "quoted
# field" that would swallow tabs and following lines, silently merging rows.
with open("train/train.tsv", 'r', encoding="utf-8", newline="") as train:
    for line in csv.reader(train, delimiter="\t", quoting=csv.QUOTE_NONE):
        # Regression target: midpoint of the numeric values in columns 0 and 1
        # (presumably the bounds of a date range -- confirm against the data).
        date.append((float(line[0]) + float(line[1])) / 2)
        # Column 4 holds the document text that gets vectorized.
        text.append(line[4])

lr = LinearRegression()
vectorizer = TfidfVectorizer()
# `text` is rebound from the list of raw strings to the fitted TF-IDF matrix.
text = vectorizer.fit_transform(text)
print("Fitting lr")
lr.fit(text, date)
|
||
|
|
||
|
# dev-0: vectorize the input documents with the already-fitted vectorizer,
# predict with the fitted regressor, and write one prediction per line.
print("reading in.tsv")
# QUOTE_NONE keeps a literal '"' in a document from merging several input
# lines into one row, which would make the number of predictions differ from
# the number of input lines. newline="" is the mode the csv module expects.
with open("dev-0/in.tsv", 'r', encoding="utf-8", newline="") as dev0:
    # Only the first tab-separated column is used. A blank line parses to an
    # empty row; treat it as an empty document so output stays line-aligned
    # with the input instead of raising IndexError.
    textIn = [
        row[0] if row else ""
        for row in csv.reader(dev0, delimiter="\t", quoting=csv.QUOTE_NONE)
    ]

# `textIn` is rebound from the raw strings to their TF-IDF representation.
textIn = vectorizer.transform(textIn)
devOut = lr.predict(textIn)

print("writing out.tsv")
with open("dev-0/out.tsv", 'w', encoding="utf-8") as dev0:
    # One prediction per line, in input order.
    dev0.writelines(str(value) + "\n" for value in devOut)
|
||
|
|
||
|
## Test A
# Same procedure as for dev-0, applied to the test-A split: vectorize with the
# already-fitted vectorizer, predict, and write one prediction per line.
print("reading test in.tsv")
# QUOTE_NONE keeps a literal '"' in a document from merging several input
# lines into one row, which would make the number of predictions differ from
# the number of input lines. newline="" is the mode the csv module expects.
with open("test-A/in.tsv", 'r', encoding="utf-8", newline="") as test:
    # Only the first tab-separated column is used. A blank line parses to an
    # empty row; treat it as an empty document so output stays line-aligned
    # with the input instead of raising IndexError.
    textIn = [
        row[0] if row else ""
        for row in csv.reader(test, delimiter="\t", quoting=csv.QUOTE_NONE)
    ]

# `textIn` is rebound from the raw strings to their TF-IDF representation.
textIn = vectorizer.transform(textIn)
testOut = lr.predict(textIn)

print("writing test out.tsv")
with open("test-A/out.tsv", 'w', encoding="utf-8") as test:
    # One prediction per line, in input order.
    test.writelines(str(value) + "\n" for value in testOut)
|
||
|
|