2022-05-17 00:49:33 +02:00
|
|
|
import pandas as pd
|
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
from sklearn.pipeline import make_pipeline
|
|
|
|
from sklearn.linear_model import LinearRegression
|
|
|
|
from sklearn.metrics import mean_squared_error
|
|
|
|
|
|
|
|
# Load the training set from a tab-separated file. Column names are supplied
# explicitly, so the first row of the file is read as data, not as a header.
with open('train/train.tsv', 'r', encoding='utf8') as file:
    train_data = pd.read_csv(
        file,
        sep='\t',
        names=['Begin', 'End', 'Title', 'Publisher', 'Text'],
    )
|
|
|
|
|
|
|
|
|
|
|
|
def readFile(filename):
    """Return the first tab-separated field of every line in *filename*.

    The file is read as UTF-8. For each line, everything from the first tab
    onwards is discarded and surrounding whitespace (including the trailing
    newline) is stripped from what remains. Blank lines yield empty strings,
    so the result has exactly one entry per input line.
    """
    with open(filename, 'r', encoding="utf-8") as handle:
        # partition() keeps only the text before the first tab (or the whole
        # line when there is no tab at all).
        return [row.partition("\t")[0].strip() for row in handle]
|
|
|
|
|
|
|
|
def write_pred(filename, predictions):
    """Write *predictions* to *filename*, one value per line.

    Each item is converted with ``str()`` and terminated by a newline. An
    existing file is overwritten; an empty iterable produces an empty file.

    The file is opened as UTF-8 explicitly so the output does not depend on
    the platform's default encoding and matches the encoding ``readFile``
    uses for the inputs.
    """
    with open(filename, "w", encoding="utf-8") as file:
        # Hand all lines to the buffered writer in a single call instead of
        # issuing one write() per prediction.
        file.writelines(str(pred) + "\n" for pred in predictions)
|
|
|
|
|
|
|
|
|
|
|
|
|
2022-05-17 12:56:52 +02:00
|
|
|
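# Optional: uncomment to keep only the first 10,000 training rows
# (presumably to speed up experiments; confirm before relying on it).
# train_data = train_data[:10000]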
|
2022-05-17 00:49:33 +02:00
|
|
|
|
|
|
|
# Model input is the raw 'Text' column; the regression target is 'Begin'.
X, Y = train_data['Text'], train_data['Begin']

# TF-IDF features feed a linear regression. fit() returns the fitted
# pipeline itself, so construction and training can be chained.
model = make_pipeline(TfidfVectorizer(), LinearRegression()).fit(X, Y)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the fitted model over every evaluation split and write one prediction
# per line next to the corresponding input file. The splits are processed in
# the order dev-0, dev-1, test-A; a single loop replaces three copy-pasted
# read -> predict -> write sequences.
for in_path, out_path in (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('dev-1/in.tsv', 'dev-1/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
):
    write_pred(out_path, model.predict(readFile(in_path)))
|
|
|
|
|
|
|
|
|