"""Fit a TF-IDF + linear-regression model on ``train/train.tsv`` and predict.

Executing this module trains the model, pickles the regressor to
``regressor.sav`` and writes one prediction per input line to
``dev-0/out.tsv``.
"""
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression

# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
# The encoding is given explicitly so loading does not depend on the platform
# default (assumes the data files are UTF-8 -- TODO confirm).
with open('stopwords.txt', encoding='utf-8') as f:
    stopwords = [line.rstrip() for line in f]

# Frozen copy used for membership tests: O(1) per word instead of scanning
# the whole list for every token.
_STOPWORD_SET = frozenset(stopwords)

filename = 'regressor.sav'
regressor = LinearRegression()
# NOTE: only the regressor is pickled by train(); the fitted vectorizer is
# not, so reloading the regressor alone is not enough to call classify().
# regressor = pickle.load(open(filename, 'rb'))
vectorizer = TfidfVectorizer()


def preprocess(doc):
    """Return *doc* lowercased with stopwords and empty tokens removed.

    The text is split on single space characters only (tabs and other
    whitespace stay inside tokens); the surviving tokens are re-joined with
    single spaces.
    """
    words = doc.lower().split(' ')
    return ' '.join(word for word in words if word and word not in _STOPWORD_SET)


def train(limit=1000):
    """Fit the module-level vectorizer and regressor, then pickle the regressor.

    Each row of ``train/train.tsv`` is tab-separated: column 0 and column 1
    are numeric targets and column 4 is the document text.  Every document is
    added twice, once with each target.

    Args:
        limit: Number of leading rows to use (default 1000, the previously
            hard-coded value).  ``None`` uses every row.

    Raises:
        IndexError: If a row has no text column at index 4.
        ValueError: If a target column is not parseable as a float.
    """
    with open('train/train.tsv', encoding='utf-8') as f:
        docs = [line.rstrip() for line in f]

    texts = []
    y = []
    for doc in docs[:limit]:
        row = doc.split('\t')
        start = row[0]
        end = row[1].split(' ')
        if len(end) > 1:
            # The second column holds extra space-separated content: its
            # second token is spliced in as the text column.
            row.insert(4, end[1])
        # Apply the same preprocessing classify() applies, so the model is
        # trained and evaluated on identically prepared text.
        text = preprocess(row[4])
        texts.extend((text, text))
        y.extend((float(start), float(end[0])))

    x = vectorizer.fit_transform(texts)
    regressor.fit(x, y)
    # Context manager guarantees the model file is flushed and closed.
    with open(filename, 'wb') as model_file:
        pickle.dump(regressor, model_file)


def classify(path):
    """Predict a value for every line of ``<path>in.tsv`` into ``<path>out.tsv``.

    Args:
        path: Prefix concatenated directly with the file names, so a
            directory must include its trailing separator (e.g. ``'dev-0/'``).

    ``train()`` must have been called first so that the module-level
    vectorizer and regressor are fitted.
    """
    with open(path + 'in.tsv', encoding='utf-8') as f:
        docs_preprocessed = [preprocess(line.rstrip()) for line in f]

    # Transform the preprocessed documents (not the raw lines) so the features
    # match what the model was trained on.
    test_x = vectorizer.transform(docs_preprocessed)
    predictions = regressor.predict(test_x)

    with open(path + 'out.tsv', 'w', encoding='utf-8') as out_file:
        out_file.writelines("%f\n" % prediction for prediction in predictions)


# Script behaviour: train on import/execution, then predict the dev set.
train()
classify('dev-0/')
# classify('test-A/')