"""Fit a TF-IDF + linear-regression baseline and write per-split predictions.

The script reads ``train/train.tsv``, fits a pipeline that maps the ``Text``
column to the ``Begin`` column, and writes one prediction per line to
``out.tsv`` in each of ``dev-0``, ``dev-1`` and ``test-A``.
"""

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

# Column names are supplied explicitly via ``names=``.
# NOTE(review): read_csv applies its default quote handling here, whereas
# readFile() below splits on raw tabs -- confirm the training text contains
# no stray quote characters that would make the two parsers disagree.
with open('train/train.tsv', 'r', encoding='utf8') as train_file:
    train_data = pd.read_csv(
        train_file,
        sep='\t',
        names=['Begin', 'End', 'Title', 'Publisher', 'Text'],
    )


def readFile(filename: str) -> list:
    """Return the first tab-separated field of every line in *filename*.

    Each field is stripped of surrounding whitespace. The file is decoded
    as UTF-8.

    Args:
        filename: Path of the TSV file to read.

    Returns:
        A list with one string per line of the file, in file order.
    """
    with open(filename, 'r', encoding="utf-8") as in_file:
        return [line.split("\t", 1)[0].strip() for line in in_file]


def write_pred(filename: str, predictions) -> None:
    """Write each item of *predictions* to *filename*, one per line.

    Items are converted with ``str()``. The file is overwritten if it exists.

    Args:
        filename: Path of the output file.
        predictions: Iterable of values to write.
    """
    # Explicit encoding keeps the output independent of the platform default,
    # matching the UTF-8 used when reading the inputs.
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{pred!s}\n" for pred in predictions)


def _predict_split(fitted_model, split_dir: str):
    """Predict for ``<split_dir>/in.tsv`` and write ``<split_dir>/out.tsv``.

    Args:
        fitted_model: Object exposing ``predict`` over a list of documents.
        split_dir: Directory that holds ``in.tsv`` and receives ``out.tsv``.

    Returns:
        A ``(documents, predictions)`` tuple for the split.
    """
    documents = readFile(f'{split_dir}/in.tsv')
    predictions = fitted_model.predict(documents)
    write_pred(f'{split_dir}/out.tsv', predictions)
    return documents, predictions


# NOTE: for quicker experiments the training frame can be truncated before
# fitting (e.g. to its first 10000 rows); the full frame is used here.
X = train_data['Text']
Y = train_data['Begin']

model = make_pipeline(TfidfVectorizer(), LinearRegression())
model.fit(X, Y)

# Splits are processed in the same order as before: dev-0, dev-1, test-A.
dev_0, predict_dev_0 = _predict_split(model, 'dev-0')
dev_1, predict_dev_1 = _predict_split(model, 'dev-1')
test_A, predict_test_A = _predict_split(model, 'test-A')