This commit is contained in:
Michal Maciaszek 2020-12-15 16:40:10 +01:00
parent ecfafbf86c
commit 9f31d8cc24
9 changed files with 10562 additions and 0 deletions

46
create_dictionary.py Normal file
View File

@ -0,0 +1,46 @@
#!/usr/bin/python3
import sys
import pickle
from tokenizator import tokenize
def main():
    """Build the training vocabulary from labelled TSV on stdin and pickle it.

    Every input line is expected to look like ``label<TAB>document``. Lines
    with fewer than two tab-separated fields are skipped. Reading stops once
    more than 15000 documents have been accepted.

    Writes ``dictionary.pkl`` holding the tuple
    ``(dictionary, documents, document_terms, word_to_index_mapping, train_y)``.

    Raises:
        ValueError: if a label cannot be converted with ``int``.
    """
    dictionary = set()
    train_y = []
    documents = []
    document_terms = {}
    counter = 0
    for line in sys.stdin:
        if counter > 15000:
            break
        fields = line.strip().split('\t')
        if len(fields) < 2:
            continue
        label = fields[0].strip()
        document = fields[1]
        documents.append(document)
        train_y.append(int(label))
        terms = tokenize(document)
        # Keyed by the running document number so it lines up with `documents`.
        document_terms[counter] = terms
        dictionary.update(terms)
        counter += 1
    # Indices run 0 .. len(dictionary) - 1. train.py builds len(dictionary)
    # feature columns and appends the bias at index len(dictionary), so a
    # 1-based numbering would put the last word on top of the bias column and
    # leave column 0 unused. Sorting makes the assignment reproducible.
    word_to_index_mapping = {
        word: index for index, word in enumerate(sorted(dictionary))
    }
    model = (dictionary, documents, document_terms, word_to_index_mapping, train_y)
    with open('dictionary.pkl', 'wb') as dictionary_file:
        pickle.dump(model, dictionary_file)
# Build the dictionary only when this file is executed as a script.
if __name__ == '__main__':
    main()

5272
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
dictionary.pkl Normal file

Binary file not shown.

BIN
geval Executable file

Binary file not shown.

BIN
model.pkl Normal file

Binary file not shown.

31
predict.py Normal file
View File

@ -0,0 +1,31 @@
import pickle
import sys
import torch
from tokenizator import tokenize
def get_x(line, weights, mapping):
    """Turn one text line into a bag-of-words feature tensor.

    The vector has ``len(weights)`` entries. The last entry is set to 1 as
    the bias input, and every token of ``line`` that is present in
    ``mapping`` adds 1 to the entry at its mapped index. Tokens missing from
    ``mapping`` are ignored.

    Returns a 1-D tensor of dtype ``torch.float``.
    """
    size = len(weights)
    features = [0.] * size
    features[size - 1] = 1
    known_positions = (
        mapping[token] for token in tokenize(line) if token in mapping
    )
    for position in known_positions:
        features[position] += 1
    return torch.tensor(features, dtype=torch.float)
def main():
    """Score every stdin line with the pickled model and print the result.

    Loads ``(weights, word_to_index_mapping)`` from ``model.pkl``, converts
    each stripped input line to a feature vector with ``get_x`` and prints
    one sigmoid score per line, clipped to the range [0.15, 0.85].
    """
    # NOTE: unpickling can execute arbitrary code; model.pkl must come from a
    # trusted source.
    with open('model.pkl', 'rb') as model_file:
        w, word_to_index_mapping = pickle.load(model_file)
    # Inference only: skip autograd bookkeeping even when the stored weights
    # still carry requires_grad=True.
    with torch.no_grad():
        for line in sys.stdin:
            x = get_x(line.strip(), w, word_to_index_mapping)
            # Clip over-confident scores to the same bounds as before.
            y = torch.clamp(torch.sigmoid(x @ w), min=0.15, max=0.85)
            print(y.item())
# Script entry point: runs as soon as the module is loaded (no __name__ guard).
main()

5152
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

18
tokenizator.py Normal file
View File

@ -0,0 +1,18 @@
import nltk
#nltk.download()
from nltk.corpus import stopwords
# Punctuation tokens that are dropped from the tokenizer output.
_PUNCTUATION = ('.', '?', ';', ':', ',', '(', ')', '"', "'", '`', '``', "''", '/', '//', '!', '+', '-', '*',
                '{', '}', '[', ']', '>', '<', "&", '~')

# (old, new) substitutions applied to the raw text, in this exact order.
# NOTE(review): 'gt' is removed wherever it occurs, including inside words
# (presumably aimed at leftover "&gt;" entities - confirm before changing).
_REPLACEMENTS = (
    ('/', ' '),
    ('\'', ''),
    ('*', ''),
    ("\\n", ' '),
    ('gt', ''),
    ('.', ' '),
    ('_', ' '),
    ('-', ' '),
)

# Filled on first use so the stop-word corpus is read only once per process.
_EXCLUDED_TOKENS = None


def _excluded_tokens():
    """Return the cached set of punctuation and English stop words."""
    global _EXCLUDED_TOKENS
    if _EXCLUDED_TOKENS is None:
        _EXCLUDED_TOKENS = frozenset(_PUNCTUATION) | frozenset(stopwords.words('english'))
    return _EXCLUDED_TOKENS


def tokenize(d):
    """Split text ``d`` into lower-cased tokens without punctuation/stop words.

    The text is first cleaned with the fixed substitutions in
    ``_REPLACEMENTS``, then split with ``nltk.word_tokenize``. Tokens equal to
    a punctuation mark or an English stop word are dropped and the remaining
    tokens are lower-cased.

    Returns:
        list of str tokens, in their original order.
    """
    for old, new in _REPLACEMENTS:
        d = d.replace(old, new)
    excluded = _excluded_tokens()
    # NOTE(review): the exclusion test runs on the token as written, before
    # lower-casing, so capitalised stop words such as "The" are kept. Left
    # unchanged so the vocabulary of already-trained models stays valid.
    return [token.lower() for token in nltk.word_tokenize(d) if token not in excluded]

43
train.py Normal file
View File

@ -0,0 +1,43 @@
#!/usr/bin/python3 -W ignore::UserWarning
import pickle
import torch
def parse_dataset(documents, document_terms, mapping, voc_len):
    """Build the bag-of-words design matrix for the training documents.

    For every position ``i`` in ``documents`` a row of ``voc_len`` zeros plus
    a trailing 1.0 (the bias input) is created, and each term in
    ``document_terms[i]`` adds 1 to the column ``mapping[term]``.

    Prints the vocabulary length and returns the rows as a tensor of dtype
    ``torch.float``.
    """
    print('voc len = ', voc_len)

    def vectorize(terms):
        # Vocabulary columns first, bias column last.
        row = [0.] * voc_len + [1.0]
        for term in terms:
            row[mapping[term]] += 1
        return row

    rows = [vectorize(document_terms[doc_id]) for doc_id, _ in enumerate(documents)]
    return torch.tensor(rows, dtype=torch.float)
def main():
    """Fit logistic-regression weights by gradient descent and pickle them.

    Loads the tuple written to ``dictionary.pkl``, builds the feature matrix
    with ``parse_dataset``, runs 2500 full-batch gradient-descent steps on
    the binary cross-entropy cost and stores
    ``(weights, word_to_index_mapping)`` in ``model.pkl``. Progress (the step
    number) is printed every 10 steps and the final weights are printed.
    """
    # NOTE: unpickling can execute arbitrary code; dictionary.pkl must come
    # from a trusted source.
    with open('dictionary.pkl', 'rb') as dictionary_file:
        model = pickle.load(dictionary_file)
    vocabulary, documents, document_terms, word_to_index_mapping, train_y = model
    x = parse_dataset(documents, document_terms, word_to_index_mapping, len(vocabulary))
    y = torch.tensor(train_y, dtype=torch.float)
    # One weight per vocabulary column plus one for the bias column.
    w = torch.randn(len(vocabulary) + 1, requires_grad=True)
    learning_rate = torch.tensor(0.001)
    for counter in range(2500):
        if counter % 10 == 0:
            print(counter)
        y_predicted = torch.sigmoid(x @ w)
        # The 1e-10 terms keep log() away from zero.
        cost = (-1 / y.size()[0]) * torch.sum(
            y * torch.log(y_predicted + 1e-10)
            + (1 - y) * torch.log(1 - y_predicted + 1e-10)
        )
        cost.backward()
        with torch.no_grad():
            w -= learning_rate * w.grad
            # backward() adds into w.grad; without this reset every step
            # would apply the sum of all previous gradients instead of the
            # current one.
            # NOTE(review): earlier runs relied on that accumulation, so the
            # learning rate / step count may need retuning.
            w.grad.zero_()
    print(w)
    model = (w, word_to_index_mapping)
    with open('model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)
# Script entry point: training starts as soon as the module is loaded
# (no __name__ guard).
main()