Commit 939388fdca (parent e72f319c75)
Author: SzamanFL
Date: 2020-06-14 17:44:28 +02:00

3 changed files with 10600 additions and 10503 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large

train.py (255 changed lines)

@@ -1,6 +1,54 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
+import torch.nn as nn
+from torch.utils.data import DataLoader, TensorDataset
+import pandas as pd
 import re, sys, pickle, random
 from nltk.corpus import stopwords
+import torch
+import ipdb as ip
+from string import punctuation
+from collections import Counter
+import numpy as np
+
+train_on_gpu = False
+
+
+class ClassifyLSTM(nn.Module):
+    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
+        super().__init__()
+        self.output_size = output_size
+        self.n_layers = n_layers
+        self.hidden_dim = hidden_dim
+        self.embedding = nn.Embedding(vocab_size, embedding_dim)
+        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
+        self.dropout = nn.Dropout(0.3)
+        self.fc = nn.Linear(hidden_dim, output_size)
+        self.sig = nn.Sigmoid()
+
+    def forward(self, x, hidden):
+        batch_size = x.size(0)
+        embeds = self.embedding(x)
+        lstm_out, hidden = self.lstm(embeds, hidden)
+        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
+        out = self.dropout(lstm_out)
+        out = self.fc(out)
+        sig_out = self.sig(out)
+        sig_out = sig_out.view(batch_size, -1)
+        sig_out = sig_out[:, -1]
+        return sig_out, hidden
+
+    def init_hidden(self, batch_size):
+        weight = next(self.parameters()).data
+        if (train_on_gpu):
+            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(), weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
+        else:
+            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(), weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
+        return hidden
+
+
 def clear_post(post):
     post = post.replace('\\n', ' ')
@@ -17,94 +65,143 @@ def clear_post(post):
     post_no_stop = [w for w in post if not w in stop_words]
     return post_no_stop
 # do the words have to be a set?
 def create_vocabulary_and_documents(in_file, expected_file):
     vocabulary = set()
     posts = {}
     with open(in_file) as in_f, open(expected_file) as exp_f:
         for line, exp in zip(in_f, exp_f):
             text, timestap = line.rstrip('\n').split('\t')
             post = clear_post(text)
             posts[" ".join(post)] = int(exp)
             for word in post:
                 vocabulary.add(word)
     with open('data', 'wb') as f:
         pickle.dump([vocabulary, posts], f)
     print("data created")
     return vocabulary, posts
+def count_all_words(posts):
+    joint_posts = ' '.join(posts)
+    words = joint_posts.split()
+    count_words = Counter(words)
+    total_words = len(words)
+    sorted_words = count_words.most_common(total_words)
+    return sorted_words, total_words, count_words
+
 def create_mappings(vocabulary):
     word_to_index_mapping = {}
     index_to_word_mapping = {}
     xi = 1
     for word in vocabulary:
         word_to_index_mapping[word] = xi
         index_to_word_mapping[xi] = word
         xi += 1
     return word_to_index_mapping, index_to_word_mapping

+def pad_features(posts_int, seq_length):
+    features = np.zeros((len(posts_int), seq_length), dtype = int)
+    for i, post in enumerate(posts_int):
+        post_len = len(post)
+        if post_len <= seq_length:
+            zeroes = list(np.zeros(seq_length-post_len))
+            new = zeroes+post
+        elif post_len > seq_length:
+            new = post[0:seq_length]
+        features[i,:] = np.array(new)
+    return features
 def main():
-    if len(sys.argv) != 4:
-        print("syntax ./train.py model expected_file in_file")
+    if len(sys.argv) != 3:
         return
-    model = str(sys.argv[1])
-    expected_file = str(sys.argv[2])
-    in_file = str(sys.argv[3])
-    try:
-        with open("data", 'rb') as pos:
-            pickle_list = pickle.load(pos)
-            print("data loaded")
-            vocabulary = pickle_list[0]
-            posts = pickle_list[1]
-    except FileNotFoundError:
-        vocabulary, posts = create_vocabulary_and_documents(in_file, expected_file)
-    word_to_index_mapping, index_to_word_mapping = create_mappings(vocabulary)
+    in_file = sys.argv[1]
+    expected_file = sys.argv[2]
-    weights = []
-    for xi in range(0, len(vocabulary) + 1):
-        weights.append(random.uniform(-0.01,0.01))
+    posts = []
+    labels = []
+    with open(in_file, 'r') as f:
+        for line in f:
+            post = line.split('\t')[0].rstrip().lower()
+            post = ''.join([c for c in post if c not in punctuation])
+            posts.append(post)
-    learning_rate = 0.000000001
-    loss_sum = 0.0
-    loss_sum_counter = 0
-    lowest_loss_sum_weights = []
-    lowest_loss_sum = 10000.0
+    with open(expected_file, 'r') as f:
+        for line in f:
+            labels.append(int(line))
-    print(f"len of vocabulary {len(vocabulary)}")
-    # can be set to a very, very large number
-    while loss_sum_counter != 10000:
-        try:
-            d, y = random.choice(list(posts.items()))
-            y_hat = weights[0]
-            tokens = d.split(' ')
-            for word in tokens:
-                # could also do something with count so it works better
-                #print(f"{d.count(word)} : {word}")
-                y_hat += weights[word_to_index_mapping[word]] * tokens.count(word)
-                #print(f"{weights[word_to_index_mapping[word]]} : {word}")
+    sorted_words, total_words, count_words = count_all_words(posts)
+    vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}
-            loss = (y_hat - y)**2
-            loss_sum += loss
-            delta = (y_hat - y) * learning_rate
-            if loss_sum_counter % 100 == 0:
-                print(f"{loss_sum_counter} : {loss_sum /1000} : {y_hat} : {delta} : {lowest_loss_sum}")
-                #loss_sum_counter = 0
-                loss_sum = 0
+    posts_int = []
+    for post in posts:
+        p = [vocab_to_int[w] for w in post.split()]
+        posts_int.append(p)
-            weights[0] -= delta
-            for word in tokens:
-                weights[word_to_index_mapping[word]] -= tokens.count(word) * delta
+    encoded_labels = np.array(labels)
-            if lowest_loss_sum > loss_sum and loss_sum != 0:
-                print(f"it happened, new lowest_sum {loss_sum}")
-                lowest_loss_sum = loss_sum
-                lowest_loss_sum_weights = weights
+    posts_len = [len(p) for p in posts_int]
+    pd.Series(posts_len).hist()
+    print(pd.Series(posts_len).describe())
+    # outliers
+    posts_int = [posts_int[i] for i, l in enumerate(posts_len) if l>2 ]
+    encoded_labels = [ encoded_labels[i] for i, l in enumerate(posts_len) if l> 2 ]
-            loss_sum_counter +=1
-        except KeyboardInterrupt:
-            break
-    #print(lowest_loss_sum_weights)
-    with open(model, 'wb') as f:
-        pickle.dump([weights, lowest_loss_sum_weights, word_to_index_mapping], f)
+    seq_length = 63
+    train_x = pad_features(posts_int, seq_length)
+    train_y = np.array(encoded_labels)
+    train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
+    batch_size = 50
+    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last = True)
+    dataiter = iter(train_loader)
+    sample_x, sample_y = dataiter.next()
+    vocab_size = len(vocab_to_int)+1
+    output_size = 1
+    embedding_dim = 400
+    hidden_dim = 256
+    n_layers = 2
+    model = ClassifyLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
+    lr=0.001
+    criterion = nn.BCELoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
+    epochs = 4
+    counter = 0
+    print_every = 100
+    clip=5
+    if(train_on_gpu):
+        model.cuda()
+    model.train()
+    for e in range(epochs):
+        h = model.init_hidden(batch_size)
+        for inputs, labels in train_loader:
+            counter += 1
+            if(train_on_gpu):
+                inputs, labels = inputs.cuda(), labels.cuda()
+            h = tuple([each.data for each in h])
+            model.zero_grad()
+            inputs = inputs.type(torch.LongTensor)
+            output, h = model(inputs, h)
+            loss = criterion(output.squeeze(), labels.float())
+            loss.backward()
+            nn.utils.clip_grad_norm_(model.parameters(), clip)
+            optimizer.step()
+            if counter % print_every == 0:
+                val_h = model.init_hidden(batch_size)
+                val_losses = []
+                model.eval()
+                #for inputs, labels in valid_loader:
+                #    val_h = tuple([each.data for each in val_h])
+                #    if(train_on_gpu):
+                #        inputs, labels = inputs.cuda(), labels.cuda()
+                #    inputs = inputs.type(torch.LongTensor)
+                #    output, val_h = model(inputs, val_h)
+                #    val_loss = criterion(output.squeeze(), labels.float())
+                #    val_losses.append(val_loss.item())
+                #model.train()
+                print("Epoch: {}/{}...".format(e+1, epochs),
+                      "Step: {}...".format(counter),
+                      "Loss: {:.6f}...".format(loss.item()),
+                      "Val Loss: {:.6f}".format(np.mean(val_losses)))
+    # test_losses = []
+    # num_correct = 0
+    # h = model.init_hidden(batch_size)
+    # model.eval()
+    # for inputs, labels in test_loader:
+    #     h = tuple([each.data for each in h])
+    #     if(train_on_gpu):
+    #         inputs, labels = inputs.cuda(), labels.cuda()
+    #
+    #     inputs = inputs.type(torch.LongTensor)
+    #     output, h = model(inputs, h)
+    #     test_loss = criterion(output.squeeze(), labels.float())
+    #     test_losses.append(test_loss.item())
+    ip.set_trace()

 main()
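
Notes (not part of the commit): two minimal, self-contained sketches of what the added code does, using made-up toy inputs.

First, the encoding/padding step: count_all_words() orders the vocabulary by frequency, vocab_to_int maps words to indices starting at 1 (0 is reserved for padding), and pad_features() left-pads or truncates every post to a fixed seq_length. The sketch below mirrors that logic on a tiny corpus; the posts and seq_length are invented for illustration.

# Toy illustration of the vocabulary/padding pipeline (mirrors train.py, toy data).
from collections import Counter
import numpy as np

posts = ["good day today", "bad", "very very good day"]

counts = Counter(" ".join(posts).split())
vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(counts.most_common())}
posts_int = [[vocab_to_int[w] for w in p.split()] for p in posts]

def pad_features(posts_int, seq_length):
    # same behaviour as pad_features() above: zero-pad on the left, truncate on the right
    features = np.zeros((len(posts_int), seq_length), dtype=int)
    for i, post in enumerate(posts_int):
        if len(post) <= seq_length:
            new = [0] * (seq_length - len(post)) + post
        else:
            new = post[:seq_length]
        features[i, :] = np.array(new)
    return features

print(pad_features(posts_int, seq_length=5))
# each row has length 5: zeros on the left, word indices on the right,
# e.g. a one-word post becomes [0 0 0 0 k]

Second, why the training loop re-wraps the hidden state with h = tuple([each.data for each in h]) before every batch: taking .data detaches the hidden state from the previous batch's computation graph, so backpropagation is truncated at the batch boundary instead of trying to reach into a graph that has already been freed. A small sketch with an arbitrary toy LSTM:

# Toy illustration of detaching the hidden state between batches (toy sizes, not from the commit).
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=4, hidden_size=3, num_layers=2, batch_first=True)
h = (torch.zeros(2, 1, 3), torch.zeros(2, 1, 3))
for step in range(2):                      # two consecutive "batches"
    h = tuple([each.data for each in h])   # detach: gradients stop at the batch boundary
    out, h = lstm(torch.randn(1, 5, 4), h)
    out.sum().backward()                   # without the detach this fails on the second pass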