solution

This commit is contained in:
  parent ecfafbf86c
  commit 9f31d8cc24
46 create_dictionary.py Normal file
@@ -0,0 +1,46 @@
#!/usr/bin/python3

import sys
import pickle

from tokenizator import tokenize


def main():
    dictionary = set()
    train_y = []
    documents = []
    document_terms = {}

    # Read "label<TAB>document" lines from stdin, capped at ~15000 documents.
    counter = 0
    for line in sys.stdin:
        if counter > 15000:
            break
        line = line.strip()
        fields = line.split('\t')
        if len(fields) < 2:
            continue
        label = fields[0].strip()
        document = fields[1]
        documents.append(document)
        train_y.append(int(label))
        terms = tokenize(document)
        document_terms[counter] = terms

        for t in terms:
            dictionary.add(t)
        counter += 1

    # Assign each vocabulary word a feature index. Indices start at 0 so that
    # slot len(dictionary) stays free for the bias feature appended in train.py
    # and predict.py (starting at 1 made the last word collide with the bias).
    word_to_index_mapping = {}
    index = 0
    for w in dictionary:
        word_to_index_mapping[w] = index
        index += 1

    model = (dictionary, documents, document_terms, word_to_index_mapping, train_y)
    with open('dictionary.pkl', 'wb') as f:
        pickle.dump(model, f)


if __name__ == '__main__':
    main()
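
A minimal usage sketch (the train.tsv name is an assumption; the script reads "label<TAB>document" lines on stdin and pickles the artifacts it builds):

# Sketch only; 'train.tsv' is a hypothetical input file in "label<TAB>document" format.
#   python3 create_dictionary.py < train.tsv
import pickle

with open('dictionary.pkl', 'rb') as f:
    dictionary, documents, document_terms, word_to_index_mapping, train_y = pickle.load(f)

print(len(dictionary))                          # vocabulary size
print(list(word_to_index_mapping.items())[:3])  # e.g. [('word', 0), ...]
print(train_y[:5])                              # integer labels, one per document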
5272 dev-0/out.tsv Normal file
File diff suppressed because it is too large
BIN dictionary.pkl Normal file
Binary file not shown.
31 predict.py Normal file
@@ -0,0 +1,31 @@
import pickle
import sys

import torch

from tokenizator import tokenize


def get_x(line, weights, mapping):
    # Bag-of-words feature vector: one count per vocabulary word,
    # plus a constant bias feature in the last slot.
    terms = tokenize(line)
    x = len(weights) * [0.]
    x[len(weights) - 1] = 1
    for word in terms:
        if word in mapping:
            x[mapping[word]] += 1
    return torch.tensor(x, dtype=torch.float)


def main():
    with open('model.pkl', 'rb') as f:
        w, word_to_index_mapping = pickle.load(f)
    for line in sys.stdin:
        line = line.strip()
        x = get_x(line, w, word_to_index_mapping)
        y = torch.sigmoid(x @ w)
        # Clip the predicted probability to [0.15, 0.85].
        if y > 0.85:
            y = torch.tensor([0.85])
        elif y < 0.15:
            y = torch.tensor([0.15])

        print(y.item())


if __name__ == '__main__':
    main()
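
The clipping above can also be written with torch.clamp; a small equivalent sketch (thresholds taken from predict.py):

import torch

y = torch.sigmoid(torch.tensor(2.0))    # ≈ 0.8808
y = torch.clamp(y, min=0.15, max=0.85)  # same effect as the if/elif clipping
print(y.item())                         # 0.85, the upper bound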
5152 test-A/out.tsv Normal file
File diff suppressed because it is too large
18 tokenizator.py Normal file
@@ -0,0 +1,18 @@
import nltk
# Run nltk.download('punkt') and nltk.download('stopwords') once if the data is missing.
from nltk.corpus import stopwords


def tokenize(d):
    # Punctuation tokens to drop after tokenization.
    chars = ['.', '?', ';', ':', ',', '(', ')', '"', "'", '`', '``', "''", '/', '//', '!', '+', '-', '*',
             '{', '}', '[', ']', '>', '<', '&', '~']
    # Strip leftover markup and separators before tokenizing; '&gt;' is the
    # escaped '>' entity (the original replaced the bare substring 'gt',
    # which also mangled ordinary words containing it).
    d = d.replace('/', ' ').replace('\'', '').replace('*', '').replace('\\n', ' ') \
         .replace('&gt;', ' ').replace('.', ' ').replace('_', ' ').replace('-', ' ')
    tokens = nltk.word_tokenize(d)

    # Filter stopwords and punctuation case-insensitively (the original compared
    # the raw token, so capitalized stopwords like 'The' slipped through).
    stops = stopwords.words('english')
    deletethis = set(chars) | set(stops)
    tokens = [x.lower() for x in tokens if x.lower() not in deletethis]

    return tokens
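
A quick check of tokenize() on a made-up sentence (requires the NLTK punkt and stopwords data):

from tokenizator import tokenize

# Stopwords ('the', 'over') and punctuation are dropped, tokens are lowercased.
print(tokenize("The quick brown fox jumps over the lazy dog!"))
# -> ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']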
43 train.py Normal file
@@ -0,0 +1,43 @@
#!/usr/bin/python3 -W ignore::UserWarning

import pickle

import torch


def parse_dataset(documents, document_terms, mapping, voc_len):
    print('voc len = ', voc_len)
    x = []
    for i in range(len(documents)):
        features = voc_len * [0.]
        # bias
        features.append(1.0)
        for word in document_terms[i]:
            features[mapping[word]] += 1
        x.append(features)
    return torch.tensor(x, dtype=torch.float)


def main():
    with open('dictionary.pkl', 'rb') as f:
        model = pickle.load(f)
    vocabulary, documents, document_terms, word_to_index_mapping, train_y = model
    x = parse_dataset(documents, document_terms, word_to_index_mapping, len(vocabulary))
    y = torch.tensor(train_y, dtype=torch.float)
    # One weight per vocabulary word plus one for the bias feature.
    w = torch.randn(len(vocabulary) + 1, requires_grad=True)
    learning_rate = torch.tensor(0.001)
    counter = 0
    for _ in range(2500):
        if counter % 10 == 0:
            print(counter)
        y_predicted = torch.sigmoid(x @ w)
        # Averaged binary cross-entropy; 1e-10 guards against log(0).
        cost = (-1 / y.size()[0]) * torch.sum(y * torch.log(y_predicted + 1e-10) + (1 - y) * torch.log(1 - y_predicted + 1e-10))
        cost.backward()
        with torch.no_grad():
            w -= learning_rate * w.grad
            # Reset the gradient; otherwise it accumulates across iterations.
            w.grad.zero_()
        counter += 1
    print(w)
    model = (w, word_to_index_mapping)
    with open('model.pkl', 'wb') as f:
        pickle.dump(model, f)


if __name__ == '__main__':
    main()
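
The hand-written cost above is the averaged binary cross-entropy; a sketch verifying it against PyTorch's built-in on toy values:

import torch
import torch.nn.functional as F

y = torch.tensor([1., 0., 1.])
y_predicted = torch.tensor([0.9, 0.2, 0.7])

manual = (-1 / y.size()[0]) * torch.sum(
    y * torch.log(y_predicted + 1e-10) + (1 - y) * torch.log(1 - y_predicted + 1e-10))
builtin = F.binary_cross_entropy(y_predicted, y)
print(manual.item(), builtin.item())  # both ≈ 0.2284 (up to the 1e-10 smoothing)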