"""Fit a TF-IDF + linear-regression model on ``train/train.tsv`` and predict.

Running this module fits the vectorizer and the regressor, pickles both to
the working directory, and writes predictions for the ``dev-0/`` split.
"""
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression

# Paths the fitted regressor and vectorizer are pickled to by train().
filename = 'regressor.sav'
vec_file = 'vectorizer.pickle'

# Fresh, unfitted estimators; train() fits them in place.
# To reuse a previous run instead of retraining, replace these with the
# objects unpickled from `filename` / `vec_file` (only unpickle files you
# trust: pickle can execute arbitrary code on load).
regressor = LinearRegression()
vectorizer = TfidfVectorizer()


def _parse_training_row(line):
    """Split one tab-separated training line into ``(text, target)``.

    Column 0 is read as the start date and column 1 as the end date; the
    target is their arithmetic mean. The text is taken from column 4.

    If the end-date field contains a space, only its first space-separated
    token is used as the end date and the second token is inserted into the
    row at index 4 before the text column is read.
    NOTE(review): this looks like a repair for rows with a malformed
    end-date field -- confirm against the training data.

    Raises:
        IndexError: if the row has no column 4 (after the optional insert).
        ValueError: if a date field cannot be converted to ``float``.
    """
    row = line.split('\t')
    start_date = row[0]
    end_parts = row[1].split(' ')
    if len(end_parts) > 1:
        row.insert(4, end_parts[1])
    # The text lookup stays ahead of the float conversions so a short row
    # fails with IndexError before any ValueError from the dates.
    text = row[4]
    target = (float(start_date) + float(end_parts[0])) / 2
    return text, target


def train() -> None:
    """Fit the module-level vectorizer and regressor and pickle both.

    Reads ``train/train.tsv``, fits ``vectorizer`` on the text column and
    ``regressor`` on the resulting TF-IDF matrix against the mid-point of
    each row's start and end dates. The fitted objects are written to
    ``vec_file`` and ``filename`` respectively.
    """
    with open('train/train.tsv') as f:
        parsed = [_parse_training_row(line.rstrip()) for line in f]

    docs_preprocessed = [text for text, _ in parsed]
    y = [target for _, target in parsed]

    print('Fitting vectorizer...')
    x = vectorizer.fit_transform(docs_preprocessed)
    # Context managers guarantee the pickles are flushed and closed.
    with open(vec_file, 'wb') as out:
        pickle.dump(vectorizer, out)
    print('DONE!')

    print('Fitting regressor...')
    regressor.fit(x, y)
    with open(filename, 'wb') as out:
        pickle.dump(regressor, out)
    print('DONE!')


def classify(path: str) -> None:
    """Predict a value for every line of ``path + 'in.tsv'``.

    Each input line (trailing whitespace stripped) is vectorized as one
    document with the fitted module-level ``vectorizer``. Predictions are
    written one per line, formatted with ``%f``, to ``path + 'out.tsv'``.

    Args:
        path: Prefix prepended verbatim to ``in.tsv`` / ``out.tsv``; include
            the trailing separator for a directory, e.g. ``'dev-0/'``.
    """
    print("Predicting for", path)
    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]

    test_x = vectorizer.transform(docs)
    predictions = regressor.predict(test_x)

    with open(path + 'out.tsv', 'w') as file:
        file.writelines(f"{prediction:f}\n" for prediction in predictions)


train()
classify('dev-0/')
# Other splits that can be scored the same way: 'dev-1/', 'test-A/'.