# forked from kubapok/retroc2
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.linear_model import LinearRegression
|
|
import pickle
|
|
|
|
# Polish stopword list; populated from stopwords.txt just below.
stopwords = []

# stopwords source - https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt
with open('stopwords.txt') as f:
    stopwords = [line.rstrip() for line in f]

# Path where the trained model is pickled by train().
filename = 'regressor.sav'

regressor = LinearRegression()
# regressor = pickle.load(open(filename, 'rb'))

# Shared TF-IDF vectorizer: fit in train(), reused for transform in classify().
vectorizer = TfidfVectorizer()
|
|
|
|
|
|
def preprocess(doc, stop_list=None):
    """Lowercase *doc*, drop stopwords and empty tokens, rejoin with spaces.

    Args:
        doc: raw document text.
        stop_list: optional collection of stopwords to filter out; when
            None, falls back to the module-level ``stopwords`` list loaded
            from stopwords.txt (backward-compatible default).

    Returns:
        The cleaned document as a single space-joined string.
    """
    if stop_list is None:
        stop_list = stopwords
    tokens = doc.lower().split(' ')
    # Drop empty tokens (from repeated spaces) and stopwords in one pass.
    kept = [word for word in tokens if word != '' and word not in stop_list]
    return ' '.join(kept)
|
|
|
|
|
|
def train():
    """Fit the TF-IDF + linear-regression model on train/train.tsv.

    Each of the first 1000 rows contributes TWO training samples sharing
    the same document vector: one targeting the start year (column 0) and
    one targeting the end year (column 1).  The fitted regressor is
    pickled to ``filename``.
    """
    with open('train/train.tsv') as f:
        docs = [line.rstrip() for line in f]

    docs_preprocessed = []
    y = []
    # NOTE(review): capped at 1000 rows, presumably for speed — confirm.
    for doc in docs[:1000]:
        row = doc.split('\t')
        start = row[0]
        end = row[1]
        end = end.split(' ')
        if len(end) > 1:
            # The end field sometimes carries document text after a space;
            # push that text into the row so row[4] holds the document.
            row.insert(4, end[1])
            end = end[0]
        # Bug fix: the original appended the raw row[4] text without ever
        # calling preprocess(); apply the same cleaning used at predict time.
        preprocessed = preprocess(row[4])
        # One vector per target: start year and end year.
        docs_preprocessed.append(preprocessed)
        docs_preprocessed.append(preprocessed)
        y.append(start)
        y.append(end)

    y = [float(value) for value in y]
    x = vectorizer.fit_transform(docs_preprocessed)
    regressor.fit(x, y)
    # Bug fix: close the output file deterministically — the original
    # pickle.dump(regressor, open(filename, 'wb')) leaked the handle.
    with open(filename, 'wb') as out:
        pickle.dump(regressor, out)
|
|
|
|
|
|
def classify(path):
    """Predict a year for each line of path/in.tsv and write path/out.tsv.

    Args:
        path: directory prefix (e.g. 'dev-0/') containing in.tsv; one
            float prediction per input line is written to out.tsv.
    """
    with open(path + 'in.tsv') as f:
        docs = [line.rstrip() for line in f]

    # Bug fix: the original built docs_preprocessed and then transformed
    # the RAW docs, leaving the preprocessing as dead code.
    docs_preprocessed = [preprocess(doc) for doc in docs]
    test_x = vectorizer.transform(docs_preprocessed)
    predictions = regressor.predict(test_x)

    with open(path + 'out.tsv', 'w') as out:
        for prediction in predictions:
            out.write("%f\n" % prediction)
|
|
|
|
|
|
# Guard the entry point: importing this module should not retrain the
# model or overwrite out.tsv / regressor.sav as a side effect.
if __name__ == '__main__':
    train()
    classify('dev-0/')
    # classify('test-A/')
|
|
|