"""Fit a TF-IDF + linear-regression pipeline and write predictions.

The training table is read from ``train/train.tsv.xz``. The regression
target is the midpoint of its ``Beginning`` and ``End`` columns and the
only feature is the text column ``X``. Predictions are written for the
``dev-0``, ``dev-1`` and ``test-A`` inputs.
"""
import csv
import lzma

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

TRAIN_PATH = 'train/train.tsv.xz'
TRAIN_COLUMNS = ['Beginning', 'End', 'Title', 'Source', 'X']
EVAL_SPLITS = ('dev-0', 'dev-1', 'test-A')


def readInput(dir):
    """Read a line-oriented text file into a list.

    Args:
        dir: Path of the file to read (a file path, despite the name).

    Returns:
        For a path ending in ``.xz``: one list of tab-separated fields per
        line of the decompressed file. For any other path: one string per
        line. The trailing newline is removed in both cases.

    Raises:
        UnicodeDecodeError: If an ``.xz`` file contains invalid UTF-8.
            Plain files are read with ``errors='ignore'`` and never raise
            this.
    """
    # Decide by suffix: a substring test would also match unrelated path
    # components such as a directory called "xz-data".
    if str(dir).endswith('.xz'):
        with lzma.open(dir) as f:
            # Strip the newline before splitting so the last field does not
            # carry it, matching what the plain-text branch returns.
            return [
                raw.decode('utf-8').rstrip('\n').split('\t') for raw in f
            ]
    with open(dir, encoding='utf8', errors='ignore') as f:
        return [line.replace('\n', '') for line in f]


def _as_row(value):
    """Return *value* as a CSV row, wrapping a scalar or string in a list."""
    if isinstance(value, (str, bytes)):
        return [value]
    try:
        iter(value)
    except TypeError:
        # Non-iterable scalar, e.g. one element of a 1-D prediction array.
        return [value]
    return value


def writeOutput(output, dir):
    """Write *output* to the file *dir* with ``csv.writer``, one item per row.

    Args:
        output: Iterable of rows. An item that is already a non-string
            iterable is written as-is; a scalar or a string is written as a
            single-field row.
        dir: Path of the file to create or overwrite.
    """
    with open(dir, 'w', newline='') as f:
        writer = csv.writer(f)
        # csv.writer rejects non-iterable rows, so the scalars produced by
        # ``model.predict`` have to be wrapped first.
        writer.writerows(_as_row(item) for item in output)


def main():
    """Train on the training table and write predictions for each split."""
    train = pd.DataFrame(readInput(TRAIN_PATH), columns=TRAIN_COLUMNS)
    # Column arithmetic instead of a row-wise ``apply`` with a Python lambda.
    train['Y'] = (
        train['Beginning'].astype(float) + train['End'].astype(float)
    ) / 2
    train = train.drop(columns=['Beginning', 'End', 'Title', 'Source'])

    estimators = [
        ('tfidf', TfidfVectorizer()),
        ('linearRegression', LinearRegression()),
    ]
    model = Pipeline(estimators)
    model.fit(train.X, train.Y)

    for split in EVAL_SPLITS:
        testX = readInput('{}/in.tsv'.format(split))
        writeOutput(model.predict(testX), '{}/out.tsv'.format(split))


if __name__ == '__main__':
    main()