"""Fit a TF-IDF + linear-regression model and write predictions for each data split.

The model regresses the ``Begin`` column of ``train/train.tsv.xz`` on its
``Text`` column, then writes one prediction per input line to ``out.tsv`` in
each of the ``dev-0``, ``dev-1`` and ``test-A`` directories.
"""
import lzma
import math

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

# NOTE: ``math`` and ``mean_squared_error`` are not used below; the imports are
# kept so that nothing relying on them breaks.

TRAIN_PATH = 'train/train.tsv.xz'
TRAIN_COLUMNS = ['Begin', 'End', 'Title', 'Publisher', 'Text']
MAX_TRAIN_ROWS = 50000
DATASET_DIRS = ('dev-0', 'dev-1', 'test-A')


def train_model(path=TRAIN_PATH, max_rows=MAX_TRAIN_ROWS):
    """Train the text-to-``Begin`` regression pipeline.

    Args:
        path: xz-compressed, tab-separated, header-less training file whose
            columns are ``TRAIN_COLUMNS``.
        max_rows: number of leading rows to train on; ``None`` uses all rows.

    Returns:
        A fitted pipeline of ``TfidfVectorizer`` followed by
        ``LinearRegression``.
    """
    with lzma.open(path, 'rt', encoding="utf-8") as f:
        # nrows stops parsing after the rows we actually train on instead of
        # loading the whole file and slicing it afterwards.
        # NOTE(review): read_csv uses its default quote handling here; if the
        # texts can contain bare double quotes, confirm whether
        # quoting=csv.QUOTE_NONE is needed.
        data = pd.read_csv(f, sep='\t', names=TRAIN_COLUMNS, nrows=max_rows)
    data = data[['Text', 'Begin']]
    model = make_pipeline(TfidfVectorizer(), LinearRegression())
    model.fit(data['Text'], data['Begin'])
    return model


def readFile(filename):
    """Return the stripped first tab-separated field of every line in *filename*.

    Args:
        filename: path of a UTF-8 text file with one record per line.

    Returns:
        A list with one string per line, in file order. Blank lines yield
        empty strings.
    """
    with open(filename, 'r', encoding="utf-8") as dev_in:
        return [line.split("\t")[0].strip() for line in dev_in]


def writePred(filename, predictions):
    """Write each prediction to *filename* on its own line.

    Args:
        filename: path of the output file; it is overwritten.
        predictions: iterable of values, each written as ``str(value)``.
    """
    # Explicit encoding keeps the output independent of the platform locale,
    # matching how the input files are read.
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.writelines(str(pred) + "\n" for pred in predictions)


def main():
    """Train once, then predict and write results for every dataset directory."""
    model = train_model()
    for directory in DATASET_DIRS:
        texts = readFile(directory + '/in.tsv')
        writePred(directory + '/out.tsv', model.predict(texts))


if __name__ == "__main__":
    main()