"""Fit a TF-IDF + linear-regression model and write predictions.

The model regresses the ``Begin`` column of ``train/train.tsv.xz`` on its
``Text`` column.  It then predicts one value per input line for ``dev-0``,
``dev-1`` and ``test-A`` and writes the values to ``out.tsv`` next to each
``in.tsv``.

Recovered from the patch "s444386 run file" (commit 12afc815), which adds
this script as ``run.py``.
"""

import csv
import lzma
import math

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

# NOTE: ``math`` and ``mean_squared_error`` are currently unused; they are
# kept from the original import list.

TRAIN_PATH = 'train/train.tsv.xz'
TRAIN_COLUMNS = ['Begin', 'End', 'Title', 'Publisher', 'Text']
# Only the first rows of the training file are used to fit the model.
MAX_TRAIN_ROWS = 50000
# Directories that hold an ``in.tsv`` to predict and receive an ``out.tsv``.
DATASET_DIRS = ('dev-0', 'dev-1', 'test-A')


def load_training_data(path=TRAIN_PATH, max_rows=MAX_TRAIN_ROWS):
    """Return the ``Text`` and ``Begin`` columns of the training file.

    Args:
        path: xz-compressed, tab-separated training file without a header.
        max_rows: maximum number of usable rows to keep.

    Returns:
        A DataFrame with the columns ``Text`` and ``Begin``.
    """
    with lzma.open(path, 'rt', encoding="utf-8") as f:
        # QUOTE_NONE keeps '"' as ordinary text.  With the default quoting a
        # stray quote can merge tabs and newlines into a single field, and
        # readFile() below splits its input on raw tabs as well.
        data = pd.read_csv(
            f,
            sep='\t',
            names=TRAIN_COLUMNS,
            quoting=csv.QUOTE_NONE,
        )

    # Rows with a missing text or target cannot be vectorized / fitted, so
    # drop them before taking the first ``max_rows`` rows.
    data = data[['Text', 'Begin']].dropna()
    return data[:max_rows]


def train_model(data):
    """Fit and return a TF-IDF -> LinearRegression pipeline.

    Args:
        data: DataFrame with a ``Text`` column (features) and a ``Begin``
            column (regression target).
    """
    model = make_pipeline(TfidfVectorizer(), LinearRegression())
    model.fit(data['Text'], data['Begin'])
    return model


def readFile(filename):
    """Return the stripped first tab-separated field of every line.

    Args:
        filename: path of a UTF-8, tab-separated text file.

    Returns:
        A list with one string per line of the file.
    """
    with open(filename, 'r', encoding="utf-8") as dev_in:
        return [line.split("\t")[0].strip() for line in dev_in]


def writePred(filename, predictions):
    """Write each prediction on its own line of ``filename``.

    Args:
        filename: path of the output file; it is overwritten.
        predictions: iterable of values, each written via ``str()``.
    """
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.writelines(str(pred) + "\n" for pred in predictions)


def predict_directory(model, directory):
    """Predict ``directory/in.tsv`` and write ``directory/out.tsv``."""
    texts = readFile(directory + '/in.tsv')
    # An empty input file produces an empty output file instead of handing
    # zero samples to the model.
    predictions = model.predict(texts) if texts else []
    writePred(directory + '/out.tsv', predictions)


def main():
    """Train on the training set and write predictions for every dataset."""
    model = train_model(load_training_data())
    for directory in DATASET_DIRS:
        predict_directory(model, directory)


if __name__ == "__main__":
    main()