"""Fit a TF-IDF + linear-regression pipeline and write predictions per split."""
import csv
import lzma
from itertools import islice

import pandas as pd

TRAIN_PATH = 'train/train.tsv.xz'
TRAIN_ROWS = 10000
TRAIN_COLUMNS = ['Beginning', 'End', 'Title', 'Source', 'X']
# Directories that each hold an `in.tsv` to predict and receive an `out.tsv`.
EVAL_SETS = ('dev-0', 'dev-1', 'test-A')


def readInput(dir, limit=None):
    """Read a text file into a list, one entry per line.

    Args:
        dir: Path (string) of the file to read.  The name shadows the
            builtin ``dir`` but is kept so existing callers keep working.
        limit: Optional maximum number of lines to read.  ``None`` (the
            default) reads the whole file.

    Returns:
        For paths ending in ``.xz``: a list of rows, each row being the
        list of tab-separated fields of one UTF-8 decoded line.
        For any other path: a list of strings, one per line.
        In both cases the trailing newline is removed.
    """
    # Test the suffix rather than a bare substring: a plain-text path such as
    # 'xz-data/in.tsv' must not be opened as an LZMA archive.
    if dir.endswith('.xz'):
        with lzma.open(dir) as f:
            # islice stops decompressing as soon as `limit` lines were read.
            return [
                line.decode('utf-8').rstrip('\n').split('\t')
                for line in islice(f, limit)
            ]
    with open(dir, encoding='utf8', errors='ignore') as f:
        return [line.replace('\n', '') for line in islice(f, limit)]


def writeOutput(output, dir):
    """Write every item of ``output`` as a single-column CSV row to ``dir``.

    Args:
        output: Iterable of values; each one becomes its own row.
        dir: Path of the file to (over)write.
    """
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires for files passed to csv.writer.
    with open(dir, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows([row] for row in output)


def _load_training_frame(path, rows):
    """Return a frame with text column ``X`` and target ``Y`` (mean of Beginning/End)."""
    train = pd.DataFrame(readInput(path, limit=rows), columns=TRAIN_COLUMNS)
    train['Y'] = (train['Beginning'].astype(float)
                  + train['End'].astype(float)) / 2
    return train.drop(columns=['Beginning', 'End', 'Title', 'Source'])


def _build_model():
    """Create the unfitted TF-IDF -> linear regression pipeline."""
    # Imported here so the I/O helpers above stay importable (and testable)
    # on machines without scikit-learn installed.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LinearRegression
    from sklearn.pipeline import Pipeline

    estimators = [('tfidf', TfidfVectorizer()),
                  ('linearRegression', LinearRegression())]
    return Pipeline(estimators)


def _predict_split(model, name):
    """Predict ``<name>/in.tsv`` with ``model`` and write ``<name>/out.tsv``."""
    testX = readInput(name + '/in.tsv')
    predicted = [str(x) for x in model.predict(testX).tolist()]
    writeOutput(predicted, name + '/out.tsv')


def main():
    """Train on the first ``TRAIN_ROWS`` rows and predict every split in ``EVAL_SETS``."""
    print('Reading input...')
    train = _load_training_frame(TRAIN_PATH, TRAIN_ROWS)

    print('Creating pipeline...')
    model = _build_model()

    print('Fitting model...')
    model.fit(train.X, train.Y)

    for name in EVAL_SETS:
        print(name + '...')
        _predict_split(model, name)


if __name__ == '__main__':
    main()