# forked from kubapok/retroc2
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.linear_model import LinearRegression
|
|
import pickle
|
|
|
|
# Polish stopword list; populated from stopwords.txt just below.
stopwords = []

# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
with open('stopwords.txt') as f:
    stopwords = [line.rstrip() for line in f]

# Path where the trained model is pickled by train().
filename = 'regressor.sav'

regressor = LinearRegression()
# regressor = pickle.load(open(filename, 'rb'))

# Shared TF-IDF vectorizer: fit in train(), reused for transform in classify().
vectorizer = TfidfVectorizer()
|
|
|
|
|
|
def preprocess(doc, stop_list=None):
    """Lowercase *doc*, drop stopwords and empty tokens, rejoin with spaces.

    Args:
        doc: raw document text.
        stop_list: optional collection of stopwords to filter out; when
            None, falls back to the module-level ``stopwords`` list loaded
            from stopwords.txt (backward-compatible default).

    Returns:
        The cleaned document as a single space-joined string.
    """
    if stop_list is None:
        stop_list = stopwords
    tokens = doc.lower().split(' ')
    # Drop empty tokens (from repeated spaces) and stopwords in one pass.
    kept = [word for word in tokens if word != '' and word not in stop_list]
    return ' '.join(kept)
|
|
|
|
|
|
def train():
    """Fit the TF-IDF + linear-regression model on train/train.tsv.

    Each of the first 1000 rows contributes TWO training samples sharing
    the same document vector: one targeting the start year (column 0) and
    one targeting the end year (column 1).  The fitted regressor is
    pickled to ``filename``.
    """
    with open('train/train.tsv') as f:
        docs = [line.rstrip() for line in f]

    docs_preprocessed = []
    y = []
    # NOTE(review): capped at 1000 rows, presumably for speed — confirm.
    for doc in docs[:1000]:
        row = doc.split('\t')
        start = row[0]
        end = row[1]
        end = end.split(' ')
        if len(end) > 1:
            # The end field sometimes carries document text after a space;
            # push that text into the row so row[4] holds the document.
            row.insert(4, end[1])
            end = end[0]
        # Bug fix: the original appended the raw row[4] text without ever
        # calling preprocess(); apply the same cleaning used at predict time.
        preprocessed = preprocess(row[4])
        # One vector per target: start year and end year.
        docs_preprocessed.append(preprocessed)
        docs_preprocessed.append(preprocessed)
        y.append(start)
        y.append(end)

    y = [float(value) for value in y]
    x = vectorizer.fit_transform(docs_preprocessed)
    regressor.fit(x, y)
    # Bug fix: close the output file deterministically — the original
    # pickle.dump(regressor, open(filename, 'wb')) leaked the handle.
    with open(filename, 'wb') as out:
        pickle.dump(regressor, out)
|
|
|
|
|
|
def classify(path):
    """Predict a year for each line of path/in.tsv and write path/out.tsv.

    Args:
        path: directory prefix (e.g. 'dev-0/') containing in.tsv; one
            float prediction per input line is written to out.tsv.
    """
    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]

    # Bug fix: the original built docs_preprocessed and then transformed
    # the RAW docs, leaving the preprocessing as dead code.
    docs_preprocessed = [preprocess(doc) for doc in docs]
    test_x = vectorizer.transform(docs_preprocessed)
    predictions = regressor.predict(test_x)

    with open(path + 'out.tsv', 'w') as out:
        for prediction in predictions:
            out.write("%f\n" % prediction)
|
|
|
|
|
|
# Guard the entry point: importing this module should not retrain the
# model or overwrite out.tsv / regressor.sav as a side effect.
if __name__ == '__main__':
    train()
    classify('dev-0/')
    # classify('test-A/')
|
|
|