retroc2/solution.py

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
import pickle

# Polish stop words, one per line of 'stopwords.txt'.
# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
# Stored as a set: the only use is the per-word membership test in
# preprocess(), which is O(1) on a set instead of a linear scan of a list.
with open('stopwords.txt') as f:
    stopwords = {line.rstrip() for line in f}

# File that train() pickles the fitted regressor to.
filename = 'regressor.sav'
regressor = LinearRegression()
# To reuse a previously saved model instead of a fresh, unfitted one:
# regressor = pickle.load(open(filename, 'rb'))
# Fitted in train() and reused by classify(); it is not saved to disk, so a
# regressor loaded from `filename` still needs a vectorizer fitted in-process.
vectorizer = TfidfVectorizer()


def preprocess(doc):
    """Return *doc* lower-cased with stop words and empty tokens removed.

    The text is split on single space characters only. Tokens found in the
    module-level ``stopwords`` collection are dropped, as are the empty
    tokens produced by consecutive spaces. The surviving tokens are joined
    back together with single spaces.
    """
    tokens = doc.lower().split(' ')
    kept = [token for token in tokens if token != '' and token not in stopwords]
    return ' '.join(kept)


def train(limit=1000):
    """Fit the module-level vectorizer and regressor on ``train/train.tsv``.

    Each line of the file is a tab-separated row: column 0 is the start
    value, column 1 the end value, and the document text is read from
    column 4. Every document enters the training set twice, once with its
    start value and once with its end value as the regression target. After
    fitting, the regressor is pickled to ``filename``.

    Args:
        limit: Maximum number of leading lines of the file to train on.
            ``None`` trains on the whole file. Defaults to 1000.

    Raises:
        IndexError: If a row does not have enough columns.
        ValueError: If a start or end value cannot be parsed as a float.
    """
    texts = []
    y = []
    with open('train/train.tsv') as f:
        # Stream the file and stop at `limit` so the rest is never read.
        for index, line in enumerate(f):
            if limit is not None and index >= limit:
                break
            row = line.rstrip().split('\t')
            start = row[0]
            end_parts = row[1].split(' ')
            if len(end_parts) > 1:
                # Column 1 carries extra space-separated content after the
                # end value; place its first piece at the text position.
                # NOTE(review): presumably this repairs rows where a tab was
                # replaced by a space - confirm against the data.
                row.insert(4, end_parts[1])
            end = end_parts[0]
            text = row[4]
            # Same document, two targets: the start and the end value.
            texts.extend((text, text))
            y.extend((float(start), float(end)))
    x = vectorizer.fit_transform(texts)
    regressor.fit(x, y)
    # Use a context manager so the model file is flushed and closed.
    with open(filename, 'wb') as model_file:
        pickle.dump(regressor, model_file)


def classify(path):
    """Write one prediction per document of ``path + 'in.tsv'``.

    Every line of the input file is treated as one document. The lines are
    transformed with the module-level ``vectorizer`` (which must already be
    fitted, e.g. by ``train()``), scored by the module-level ``regressor``,
    and the predictions are written to ``path + 'out.tsv'``, one per line,
    formatted with ``%f``.

    Args:
        path: Prefix prepended verbatim to ``in.tsv`` and ``out.tsv``; a
            directory therefore needs its trailing separator
            (e.g. ``'dev-0/'``).
    """
    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]
    # Documents are vectorized as-is (no stop-word filtering) to match
    # train(), which fits the vectorizer on unfiltered text.
    test_x = vectorizer.transform(docs)
    predictions = regressor.predict(test_x)
    with open(path + 'out.tsv', 'w') as out_file:
        out_file.writelines("%f\n" % prediction for prediction in predictions)


# Script entry: fit the model, then write predictions for the dev set
# (reads 'dev-0/in.tsv', writes 'dev-0/out.tsv').
# NOTE(review): these calls run at import time as well, since there is no
# `if __name__ == "__main__":` guard.
train()
classify('dev-0/')
# Uncomment to also write predictions for the 'test-A/' directory.
# classify('test-A/')
Add solution for sample data 2021-04-27 20:34:48 +02:00			`from sklearn.feature_extraction.text import TfidfVectorizer`
			`from sklearn.linear_model import LinearRegression`
			`import pickle`

			`stopwords = []`
			`# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt`
			`with open('stopwords.txt') as f:`
			`stopwords = [line.rstrip() for line in f]`

			`filename = 'regressor.sav'`
			`regressor = LinearRegression()`
			`# regressor = pickle.load(open(filename, 'rb'))`
			`vectorizer = TfidfVectorizer()`


			`def preprocess(doc):`
			`doc = doc.lower().split(' ')`
			`doc = list(filter(lambda word: (word not in stopwords) and (word != ''), doc))`
			`doc = ' '.join(doc)`
			`return doc`


			`def train():`
			`with open('train/train.tsv') as f:`
			`docs = [line.rstrip() for line in f]`
			`docs_preprocessed = []`
			`y = []`
			`for doc in docs[:1000]:`
			`row = doc.split('\t')`
			`start = row[0]`
			`end = row[1]`
			`end = end.split(' ')`
			`if len(end) > 1:`
			`row.insert(4, end[1])`
			`end = end[0]`
			`rest = row[4:]`
			`preprocessed = rest[0]`
			`docs_preprocessed.append(preprocessed)`
			`docs_preprocessed.append(preprocessed)`
			`y.append(start)`
			`y.append(end)`
			`y = [float(value) for value in y]`
			`x = vectorizer.fit_transform(docs_preprocessed)`
			`regressor.fit(x, y)`
			`pickle.dump(regressor, open(filename, 'wb'))`


			`def classify(path):`
			`with open(path + 'in.tsv') as f:`
			`docs = [line.rstrip() for line in f]`
			`docs_preprocessed = []`
			`for doc in docs:`
			`docs_preprocessed.append(preprocess(doc))`
			`test_x = vectorizer.transform(docs)`
			`predictions = regressor.predict(test_x)`
			`with open(path + 'out.tsv', 'w') as file:`
			`for prediction in predictions:`
			`file.write("%f\n" % prediction)`


			`train()`
			`classify('dev-0/')`
			`# classify('test-A/')`