44 lines
1.3 KiB
Python
44 lines
1.3 KiB
Python
|
#!/usr/bin/python3 -W ignore::UserWarning
|
||
|
|
||
|
import pickle
|
||
|
import torch
|
||
|
|
||
|
|
||
|
def parse_dataset(documents, document_terms, mapping, voc_len):
    """Build a bag-of-words feature matrix with a trailing bias column.

    Args:
        documents: Sized collection of documents; only its length is used,
            to decide how many rows to build.
        document_terms: Indexable by document position; ``document_terms[i]``
            is an iterable of the terms of document ``i``.
        mapping: Maps each term to its column index (expected to be in
            ``range(voc_len)``).
        voc_len: Number of vocabulary columns.

    Returns:
        A float tensor of shape ``(len(documents), voc_len + 1)``. Column
        ``j < voc_len`` holds the count of term ``j`` in the document and the
        last column is the constant bias feature ``1.0``.

    Raises:
        KeyError: If a term is missing from ``mapping`` (when it is a dict).
        IndexError: If a mapped index falls outside the feature row.
    """
    print('voc len = ', voc_len)
    rows = []
    for doc_index in range(len(documents)):
        # Term-count columns first, then the bias feature in the last slot.
        features = voc_len * [0.]
        features.append(1.0)
        for word in document_terms[doc_index]:
            features[mapping[word]] += 1
        rows.append(features)
    # The explicit reshape is a no-op for non-empty input, but keeps the
    # result two-dimensional when there are no documents: torch.tensor([])
    # alone would have shape (0,), which cannot be multiplied by the weights.
    return torch.tensor(rows, dtype=torch.float).reshape(len(rows), voc_len + 1)
|
||
|
|
||
|
|
||
|
def main():
    """Train logistic-regression weights and pickle them to ``model.pkl``.

    Loads ``dictionary.pkl``, which must unpickle to the 5-tuple
    ``(vocabulary, documents, document_terms, word_to_index_mapping,
    train_y)``, builds the feature matrix with ``parse_dataset``, runs 2500
    steps of full-batch gradient descent on the binary cross-entropy cost,
    and writes ``(w, word_to_index_mapping)`` to ``model.pkl``.
    """
    # NOTE: pickle.load can execute arbitrary code; only run this against a
    # dictionary.pkl produced by a trusted source.
    with open('dictionary.pkl', 'rb') as dictionary_file:
        model = pickle.load(dictionary_file)
    vocabulary, documents, document_terms, word_to_index_mapping, train_y = model

    x = parse_dataset(documents, document_terms, word_to_index_mapping, len(vocabulary))
    # Presumably train_y holds 0/1 labels, one per document - confirm
    # against the script that writes dictionary.pkl.
    y = torch.tensor(train_y, dtype=torch.float)
    # One weight per vocabulary term plus one for the bias column.
    w = torch.randn(len(vocabulary) + 1, requires_grad=True)
    learning_rate = torch.tensor(0.001)

    for iteration in range(2500):
        if iteration % 10 == 0:
            print(iteration)
        y_predicted = torch.sigmoid(x @ w)
        # Mean binary cross-entropy; the 1e-10 terms keep log() away from 0.
        cost = (-1 / y.size()[0]) * torch.sum(
            y * torch.log(y_predicted + 1e-10)
            + (1 - y) * torch.log(1 - y_predicted + 1e-10)
        )
        cost.backward()
        with torch.no_grad():
            w -= learning_rate * w.grad
            # backward() accumulates into w.grad, so clear it here; otherwise
            # every step would apply the sum of all previous gradients.
            w.grad.zero_()

    print(w)
    model = (w, word_to_index_mapping)
    with open('model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)
|
||
|
|
||
|
|
||
|
# Run training only when executed as a script, so that importing this module
# (e.g. to reuse parse_dataset) does not start a training run.
if __name__ == '__main__':
    main()
|