"""Fit a TF-IDF + linear-regression model and write predictions to TSV files.

The model is trained on ``./retroc2/train/train.tsv.xz``.  The regression
target is the mean of the first two columns of the training file (called
``dateMean`` below) and the features are the TF-IDF vector of column 4.
Predictions are then written, one per line, to ``out.tsv`` in each of the
evaluation directories listed at the bottom of the file.
"""

import lzma
import sys
from decimal import Decimal

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression

if sys.version_info[0] < 3:
    from StringIO import StringIO
else:
    from io import StringIO


def openXZ(path):
    """Return all lines of the xz-compressed text file at *path*.

    Lines keep their trailing newline, exactly as ``readlines()`` yields them.
    """
    # NOTE(review): no explicit encoding is passed, so the platform default
    # is used -- confirm the data files match it.
    with lzma.open(path, mode='rt') as handle:
        return handle.readlines()


def readFile(path):
    """Return all lines of the plain text file at *path* (newlines kept)."""
    with open(path) as source:
        return source.readlines()


def toArr(a):
    """Split every line in *a* on tab characters.

    Returns a list with one list of fields per input line.  The last field
    of a line still carries the line's trailing newline, if it had one.
    """
    return [line.split("\t") for line in a]


def getLinearRegresion(dataPath):
    """Predict a value for every row of ``<dataPath>/in.tsv``.

    The first tab-separated column of each input row is vectorised with the
    module-level ``vectorizer`` and scored by the module-level ``lg`` model;
    both must already be fitted.  One prediction per line, formatted with
    ``%f``, is written to ``<dataPath>/out.tsv`` (overwriting the file).
    """
    inPath = dataPath + "/in.tsv"
    outPath = dataPath + "/out.tsv"

    rows = toArr(readFile(inPath))
    inDf = pd.DataFrame(data=rows)

    features = vectorizer.transform(inDf[0])
    predictions = lg.predict(features)

    with open(outPath, 'w') as out:
        out.writelines("%f\n" % value for value in predictions)


vectorizer = TfidfVectorizer()
lg = LinearRegression()

tmp = toArr(openXZ("./retroc2/train/train.tsv.xz"))
train = pd.DataFrame(data=tmp)

# ``np.number`` is an abstract scalar type; NumPy has deprecated passing it
# as a dtype (it currently resolves to float64).  Name float64 explicitly so
# the conversion is unambiguous and warning-free.
train = train.astype({0: np.float64, 1: np.float64})

# Regression target: midpoint of the first two columns.
dateMean = (train[0] + train[1]) / 2

trainVec = vectorizer.fit_transform(train[4])
lg.fit(trainVec, dateMean)

getLinearRegresion("./retroc2/dev-0")
getLinearRegresion("./retroc2/dev-1")
getLinearRegresion("./retroc2/test-A")