SzamanFL, 2020-06-14 17:44:28 +02:00
commit 939388fdca (parent e72f319c75)
3 changed files with 10600 additions and 10503 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large

train.py (255 changed lines)

@@ -1,6 +1,54 @@
-#!/usr/bin/python3
#!/usr/bin/env python3
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import re, sys, pickle, random
from nltk.corpus import stopwords
import torch
import ipdb as ip
from string import punctuation
from collections import Counter
import numpy as np

train_on_gpu = False

class ClassifyLSTM(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        # embedding -> stacked LSTM -> dropout -> linear -> sigmoid
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        batch_size = x.size(0)
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)
        sig_out = sig_out.view(batch_size, -1)
        # keep only the prediction for the last time step of each sequence
        sig_out = sig_out[:, -1]
        return sig_out, hidden

    def init_hidden(self, batch_size):
        # zeroed (h, c) state with the same dtype/device as the model weights
        weight = next(self.parameters()).data
        if train_on_gpu:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden

def clear_post(post):
    post = post.replace('\\n', ' ')
@@ -17,94 +65,143 @@ def clear_post(post):
    post_no_stop = [w for w in post if not w in stop_words]
    return post_no_stop

-# do the words have to be a set?
-def create_vocabulary_and_documents(in_file, expected_file):
-    vocabulary = set()
-    posts = {}
-    with open(in_file) as in_f, open(expected_file) as exp_f:
-        for line, exp in zip(in_f, exp_f):
-            text, timestap = line.rstrip('\n').split('\t')
-            post = clear_post(text)
-            posts[" ".join(post)] = int(exp)
-            for word in post:
-                vocabulary.add(word)
-    with open('data', 'wb') as f:
-        pickle.dump([vocabulary, posts], f)
-        print("data created")
-    return vocabulary, posts

-def create_mappings(vocabulary):
-    word_to_index_mapping = {}
-    index_to_word_mapping = {}
-    xi = 1
-    for word in vocabulary:
-        word_to_index_mapping[word] = xi
-        index_to_word_mapping[xi] = word
-        xi += 1
-    return word_to_index_mapping, index_to_word_mapping

def count_all_words(posts):
    # word-frequency statistics over all posts
    joint_posts = ' '.join(posts)
    words = joint_posts.split()
    count_words = Counter(words)
    total_words = len(words)
    sorted_words = count_words.most_common(total_words)
    return sorted_words, total_words, count_words

def pad_features(posts_int, seq_length):
    # left-pad short posts with zeros and truncate long ones to seq_length
    features = np.zeros((len(posts_int), seq_length), dtype=int)
    for i, post in enumerate(posts_int):
        post_len = len(post)
        if post_len <= seq_length:
            zeroes = list(np.zeros(seq_length - post_len))
            new = zeroes + post
        elif post_len > seq_length:
            new = post[0:seq_length]
        features[i, :] = np.array(new)
    return features

def main():
-    if len(sys.argv) != 4:
-        print("syntax ./train.py model expected_file in_file")
    if len(sys.argv) != 3:
        return
-    model = str(sys.argv[1])
-    expected_file = str(sys.argv[2])
-    in_file = str(sys.argv[3])
-    try:
-        with open("data", 'rb') as pos:
-            pickle_list = pickle.load(pos)
-            print("data loaded")
-        vocabulary = pickle_list[0]
-        posts = pickle_list[1]
-    except FileNotFoundError:
-        vocabulary, posts = create_vocabulary_and_documents(in_file, expected_file)
-    word_to_index_mapping, index_to_word_mapping = create_mappings(vocabulary)
-    weights = []
-    for xi in range(0, len(vocabulary) + 1):
-        weights.append(random.uniform(-0.01, 0.01))
-    learning_rate = 0.000000001
-    loss_sum = 0.0
-    loss_sum_counter = 0
-    lowest_loss_sum_weights = []
-    lowest_loss_sum = 10000.0
-    print(f"len of vocabulary {len(vocabulary)}")
-    # this could be set to a very, very large number
-    while loss_sum_counter != 10000:
-        try:
-            d, y = random.choice(list(posts.items()))
-            y_hat = weights[0]
-            tokens = d.split(' ')
-            for word in tokens:
-                # could also do something with the count so it works better
-                #print(f"{d.count(word)} : {word}")
-                y_hat += weights[word_to_index_mapping[word]] * tokens.count(word)
-                #print(f"{weights[word_to_index_mapping[word]]} : {word}")
-            loss = (y_hat - y)**2
-            loss_sum += loss
-            delta = (y_hat - y) * learning_rate
-            if loss_sum_counter % 100 == 0:
-                print(f"{loss_sum_counter} : {loss_sum /1000} : {y_hat} : {delta} : {lowest_loss_sum}")
-                #loss_sum_counter = 0
-                loss_sum = 0
-            weights[0] -= delta
-            for word in tokens:
-                weights[word_to_index_mapping[word]] -= tokens.count(word) * delta
-            if lowest_loss_sum > loss_sum and loss_sum != 0:
-                print(f"it happened, new lowest_sum {loss_sum}")
-                lowest_loss_sum = loss_sum
-                lowest_loss_sum_weights = weights
-            loss_sum_counter += 1
-        except KeyboardInterrupt:
-            break
-    #print(lowest_loss_sum_weights)
-    with open(model, 'wb') as f:
-        pickle.dump([weights, lowest_loss_sum_weights, word_to_index_mapping], f)

    in_file = sys.argv[1]
    expected_file = sys.argv[2]

    posts = []
    labels = []
    with open(in_file, 'r') as f:
        for line in f:
            post = line.split('\t')[0].rstrip().lower()
            post = ''.join([c for c in post if c not in punctuation])
            posts.append(post)
    with open(expected_file, 'r') as f:
        for line in f:
            labels.append(int(line))

    sorted_words, total_words, count_words = count_all_words(posts)
    # index words from 1; 0 is reserved for the padding value
    vocab_to_int = {w: i+1 for i, (w, c) in enumerate(sorted_words)}

    posts_int = []
    for post in posts:
        p = [vocab_to_int[w] for w in post.split()]
        posts_int.append(p)

    encoded_labels = np.array(labels)

    posts_len = [len(p) for p in posts_int]
    pd.Series(posts_len).hist()
    print(pd.Series(posts_len).describe())
    # drop outliers: keep only posts longer than two tokens
    posts_int = [posts_int[i] for i, l in enumerate(posts_len) if l > 2]
    encoded_labels = [encoded_labels[i] for i, l in enumerate(posts_len) if l > 2]
    seq_length = 63
    train_x = pad_features(posts_int, seq_length)
    train_y = np.array(encoded_labels)

    train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
    batch_size = 50
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
    # peek at one batch from the loader
    dataiter = iter(train_loader)
    sample_x, sample_y = next(dataiter)

    vocab_size = len(vocab_to_int) + 1
    output_size = 1
    embedding_dim = 400
    hidden_dim = 256
    n_layers = 2
    model = ClassifyLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

    lr = 0.001
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    epochs = 4
    counter = 0
    print_every = 100
    clip = 5
    if train_on_gpu:
        model.cuda()

    model.train()
    for e in range(epochs):
        h = model.init_hidden(batch_size)
        for inputs, labels in train_loader:
            counter += 1
            if train_on_gpu:
                inputs, labels = inputs.cuda(), labels.cuda()
            # detach the hidden state so gradients do not flow across batches
            h = tuple([each.data for each in h])
            model.zero_grad()
            inputs = inputs.type(torch.LongTensor)
            output, h = model(inputs, h)
            loss = criterion(output.squeeze(), labels.float())
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            if counter % print_every == 0:
                val_h = model.init_hidden(batch_size)
                val_losses = []
                model.eval()
                #for inputs, labels in valid_loader:
                #    val_h = tuple([each.data for each in val_h])
                #    if(train_on_gpu):
                #        inputs, labels = inputs.cuda(), labels.cuda()
                #    inputs = inputs.type(torch.LongTensor)
                #    output, val_h = model(inputs, val_h)
                #    val_loss = criterion(output.squeeze(), labels.float())
                #    val_losses.append(val_loss.item())
                model.train()  # switch back from eval mode even while the validation loop is disabled
                # val_losses stays empty while the valid_loader block above is commented out,
                # so "Val Loss" prints as nan
                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)))

    # test_losses = []
    # num_correct = 0
    # h = model.init_hidden(batch_size)
    # model.eval()
    # for inputs, labels in test_loader:
    #     h = tuple([each.data for each in h])
    #     if(train_on_gpu):
    #         inputs, labels = inputs.cuda(), labels.cuda()
    #
    #     inputs = inputs.type(torch.LongTensor)
    #     output, h = model(inputs, h)
    #     test_loss = criterion(output.squeeze(), labels.float())
    #     test_losses.append(test_loss.item())
    ip.set_trace()  # drop into the ipdb debugger before exiting

main()
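
The pad_features helper added in this commit is what shapes the encoded posts for the DataLoader: posts shorter than seq_length are left-padded with zeros (index 0 is the padding value) and longer posts are truncated. The snippet below is a small standalone check of that logic, not part of the commit; the toy encoded posts and seq_length=5 are made-up values for illustration only.

# Standalone sketch (not part of the commit) exercising the pad_features logic from train.py.
import numpy as np

def pad_features(posts_int, seq_length):
    # copied from train.py: left-pad short posts with zeros, truncate long ones
    features = np.zeros((len(posts_int), seq_length), dtype=int)
    for i, post in enumerate(posts_int):
        post_len = len(post)
        if post_len <= seq_length:
            new = list(np.zeros(seq_length - post_len)) + post
        elif post_len > seq_length:
            new = post[0:seq_length]
        features[i, :] = np.array(new)
    return features

print(pad_features([[3, 7], [1, 2, 3, 4, 5, 6, 7]], seq_length=5))
# expected output:
# [[0 0 0 3 7]
#  [1 2 3 4 5]]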