use LSTM
This commit is contained in:
parent e72f319c75
commit 939388fdca
10544  dev-0/out.tsv   (file diff suppressed because it is too large)
10304  test-A/out.tsv  (file diff suppressed because it is too large)
255  train.py
@@ -1,6 +1,54 @@
-#!/usr/bin/python3
+#!/usr/bin/env python3
+import torch.nn as nn
+from torch.utils.data import DataLoader, TensorDataset
+import pandas as pd
 import re, sys, pickle, random
 from nltk.corpus import stopwords
+import torch
+import ipdb as ip
+from string import punctuation
+from collections import Counter
+import numpy as np
+
+train_on_gpu = False
+
+
+class ClassifyLSTM(nn.Module):
+    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
+        super().__init__()
+        self.output_size = output_size
+        self.n_layers = n_layers
+        self.hidden_dim = hidden_dim
+
+        self.embedding = nn.Embedding(vocab_size, embedding_dim)
+        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
+
+        self.dropout = nn.Dropout(0.3)
+
+        self.fc = nn.Linear(hidden_dim, output_size)
+        self.sig = nn.Sigmoid()
+
+    def forward(self, x, hidden):
+        batch_size = x.size(0)
+        embeds = self.embedding(x)
+        lstm_out, hidden = self.lstm(embeds, hidden)
+
+        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
+        out = self.dropout(lstm_out)
+        out = self.fc(out)
+        sig_out = self.sig(out)
+        sig_out = sig_out.view(batch_size, -1)
+        sig_out = sig_out[:, -1]
+        return sig_out, hidden
+
+    def init_hidden(self, batch_size):
+        weight = next(self.parameters()).data
+
+        if (train_on_gpu):
+            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(), weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
+        else:
+            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(), weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
+        return hidden
+
+
 def clear_post(post):
     post = post.replace('\\n', ' ')
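A quick shape sanity check for the class above, sketched here rather than taken from the commit: vocab_size and the dummy batch are assumed values, while the other hyperparameters mirror what main() uses further down. forward() returns one sigmoid score per sequence (the last timestep of the unrolled LSTM) together with the hidden state.

# Sketch only, not committed code: push a dummy batch through ClassifyLSTM.
import torch

vocab_size = 5000                        # assumed; in train.py it is len(vocab_to_int) + 1
batch_size, seq_length = 50, 63          # same values main() uses below
model = ClassifyLSTM(vocab_size, output_size=1, embedding_dim=400, hidden_dim=256, n_layers=2)

x = torch.randint(0, vocab_size, (batch_size, seq_length))  # LongTensor of token ids
h = model.init_hidden(batch_size)        # (h_0, c_0), each of shape (n_layers, batch_size, hidden_dim)
scores, h = model(x, h)
print(scores.shape)                      # torch.Size([50]) - one probability per post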
@@ -17,94 +65,143 @@ def clear_post(post):
     post_no_stop = [w for w in post if not w in stop_words]
     return post_no_stop

-# do the words have to be a set?
-def create_vocabulary_and_documents(in_file, expected_file):
-    vocabulary = set()
-    posts = {}
-    with open(in_file) as in_f, open(expected_file) as exp_f:
-        for line, exp in zip(in_f, exp_f):
-            text, timestap = line.rstrip('\n').split('\t')
-            post = clear_post(text)
-            posts[" ".join(post)] = int(exp)
-            for word in post:
-                vocabulary.add(word)
-    with open('data', 'wb') as f:
-        pickle.dump([vocabulary, posts], f)
-        print("data created")
-    return vocabulary, posts
+def count_all_words(posts):
+    joint_posts = ' '.join(posts)
+    words = joint_posts.split()
+    count_words = Counter(words)
+    total_words = len(words)
+    sorted_words = count_words.most_common(total_words)
+    return sorted_words, total_words, count_words

-def create_mappings(vocabulary):
-    word_to_index_mapping = {}
-    index_to_word_mapping = {}
-    xi = 1
-    for word in vocabulary:
-        word_to_index_mapping[word] = xi
-        index_to_word_mapping[xi] = word
-        xi += 1
-    return word_to_index_mapping, index_to_word_mapping
+def pad_features(posts_int, seq_length):
+    features = np.zeros((len(posts_int), seq_length), dtype = int)
+    for i, post in enumerate(posts_int):
+        post_len = len(post)
+        if post_len <= seq_length:
+            zeroes = list(np.zeros(seq_length-post_len))
+            new = zeroes+post
+        elif post_len > seq_length:
+            new = post[0:seq_length]
+
+        features[i,:] = np.array(new)
+
+    return features

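To make the helpers above concrete, here is a small worked example (toy data invented for illustration, not part of the commit) of the pipeline main() builds below: count_all_words ranks words by frequency, the vocab_to_int comprehension assigns them 1-based ids (0 is reserved for padding), and pad_features left-pads or truncates every encoded post to seq_length.

# Toy data, invented for illustration.
posts = ["good movie", "bad", "good good plot"]
sorted_words, total_words, count_words = count_all_words(posts)      # e.g. [('good', 3), ('movie', 1), ...]
vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}   # same mapping main() builds below
posts_int = [[vocab_to_int[w] for w in p.split()] for p in posts]    # [[1, 2], [3], [1, 1, 4]]
print(pad_features(posts_int, seq_length=4))
# [[0 0 1 2]
#  [0 0 0 3]
#  [0 1 1 4]]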
 def main():
-    if len(sys.argv) != 4:
-        print("syntax ./train.py model expected_file in_file")
+    if len(sys.argv) != 3:
         return
-    model = str(sys.argv[1])
-    expected_file = str(sys.argv[2])
-    in_file = str(sys.argv[3])
-    try:
-        with open("data", 'rb') as pos:
-            pickle_list = pickle.load(pos)
-            print("data loaded")
-            vocabulary = pickle_list[0]
-            posts = pickle_list[1]
-    except FileNotFoundError:
-        vocabulary, posts = create_vocabulary_and_documents(in_file, expected_file)

-    word_to_index_mapping, index_to_word_mapping = create_mappings(vocabulary)
+    in_file = sys.argv[1]
+    expected_file = sys.argv[2]

-    weights = []
-    for xi in range(0, len(vocabulary) + 1):
-        weights.append(random.uniform(-0.01,0.01))
+    posts = []
+    labels = []
+    with open(in_file, 'r') as f:
+        for line in f:
+            post = line.split('\t')[0].rstrip().lower()
+            post = ''.join([c for c in post if c not in punctuation])
+            posts.append(post)

-    learning_rate = 0.000000001
-    loss_sum = 0.0
-    loss_sum_counter = 0
-    lowest_loss_sum_weights = []
-    lowest_loss_sum = 10000.0
+    with open(expected_file, 'r') as f:
+        for line in f:
+            labels.append(int(line))

-    print(f"len of vocabulary {len(vocabulary)}")
-    # this could be set to a very, very large value
-    while loss_sum_counter != 10000:
-        try:
-            d, y = random.choice(list(posts.items()))
-            y_hat = weights[0]
-            tokens = d.split(' ')
-            for word in tokens:
-                # could also do something with count here so it works better
-                #print(f"{d.count(word)} : {word}")
-                y_hat += weights[word_to_index_mapping[word]] * tokens.count(word)
-                #print(f"{weights[word_to_index_mapping[word]]} : {word}")
+    sorted_words, total_words, count_words = count_all_words(posts)
+    vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}

-            loss = (y_hat - y)**2
-            loss_sum += loss
-            delta = (y_hat - y) * learning_rate
-            if loss_sum_counter % 100 == 0:
-                print(f"{loss_sum_counter} : {loss_sum /1000} : {y_hat} : {delta} : {lowest_loss_sum}")
-                #loss_sum_counter = 0
-                loss_sum = 0
+    posts_int = []
+    for post in posts:
+        p = [vocab_to_int[w] for w in post.split()]
+        posts_int.append(p)

-            weights[0] -= delta
-            for word in tokens:
-                weights[word_to_index_mapping[word]] -= tokens.count(word) * delta
+    encoded_labels = np.array(labels)

-            if lowest_loss_sum > loss_sum and loss_sum != 0:
-                print(f"it happened, new lowest_sum {loss_sum}")
-                lowest_loss_sum = loss_sum
-                lowest_loss_sum_weights = weights
+    posts_len = [len(p) for p in posts_int]
+    pd.Series(posts_len).hist()
+    print(pd.Series(posts_len).describe())
+    # outliers
+    posts_int = [posts_int[i] for i, l in enumerate(posts_len) if l>2 ]
+    encoded_labels = [ encoded_labels[i] for i, l in enumerate(posts_len) if l> 2 ]

-            loss_sum_counter +=1
-        except KeyboardInterrupt:
-            break
-    #print(lowest_loss_sum_weights)
-    with open(model, 'wb') as f:
-        pickle.dump([weights, lowest_loss_sum_weights, word_to_index_mapping], f)
+    seq_length = 63
+    train_x = pad_features(posts_int, seq_length)
+    train_y = np.array(encoded_labels)
+    train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
+    batch_size = 50

+    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last = True)

+    dataiter = iter(train_loader)
+    sample_x, sample_y = dataiter.next()

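The dataiter lines above only peek at one batch for inspection. Note that DataLoader iterators expose a .next() method only in older PyTorch releases; on current versions the equivalent (an assumption about the environment, not a change made by the commit) would be the built-in next():

# Portable way to grab one batch for inspection.
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)      # built-in next() instead of the removed .next() method
print(sample_x.shape, sample_y.shape)    # torch.Size([50, 63]) torch.Size([50])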
+    vocab_size = len(vocab_to_int)+1
+    output_size = 1
+    embedding_dim = 400
+    hidden_dim = 256
+    n_layers = 2

+    model = ClassifyLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

+    lr=0.001
+    criterion = nn.BCELoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
+    epochs = 4
+    counter = 0
+    print_every = 100
+    clip=5
+    if(train_on_gpu):
+        model.cuda()

+    model.train()
+    for e in range(epochs):
+        h = model.init_hidden(batch_size)
+        for inputs, labels in train_loader:
+            counter += 1
+            if(train_on_gpu):
+                inputs, labels = inputs.cuda(), labels.cuda()
+            h = tuple([each.data for each in h])
+            model.zero_grad()
+            inputs = inputs.type(torch.LongTensor)
+            output, h = model(inputs, h)
+            loss = criterion(output.squeeze(), labels.float())
+            loss.backward()
+            nn.utils.clip_grad_norm_(model.parameters(), clip)
+            optimizer.step()

+            if counter % print_every == 0:
+                val_h = model.init_hidden(batch_size)
+                val_losses = []
+                model.eval()
+                #for inputs, labels in valid_loader:
+                #    val_h = tuple([each.data for each in val_h])
+                #    if(train_on_gpu):
+                #        inputs, labels = inputs.cuda(), labels.cuda()
+                #    inputs = inputs.type(torch.LongTensor)
+                #    output, val_h = model(inputs, val_h)
+                #    val_loss = criterion(output.squeeze(), labels.float())
+                #    val_losses.append(val_loss.item())
+                #model.train()
+                print("Epoch: {}/{}...".format(e+1, epochs),
+                      "Step: {}...".format(counter),
+                      "Loss: {:.6f}...".format(loss.item()),
+                      "Val Loss: {:.6f}".format(np.mean(val_losses)))

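One caveat about the logging block just above: because the validation loop is commented out, val_losses stays empty, so np.mean(val_losses) evaluates to nan (with a RuntimeWarning) and the reported "Val Loss" carries no information. A possible guard, sketched here rather than taken from the commit:

# Sketch: only format a validation loss when there is one to report.
val_part = "Val Loss: {:.6f}".format(np.mean(val_losses)) if val_losses else "Val Loss: n/a"
print("Epoch: {}/{}...".format(e+1, epochs),
      "Step: {}...".format(counter),
      "Loss: {:.6f}...".format(loss.item()),
      val_part)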
+    # test_losses = []
+    # num_correct = 0
+    # h = model.init_hidden(batch_size)
+    # model.eval()
+    # for inputs, labels in test_loader:
+    #     h = tuple([each.data for each in h])
+    #     if(train_on_gpu):
+    #         inputs, labels = inputs.cuda(), labels.cuda()
+    #
+    #     inputs = inputs.type(torch.LongTensor)
+    #     output, h = model(inputs, h)
+    #     test_loss = criterion(output.squeeze(), labels.float())
+    #     test_losses.append(test_loss.item())

+    ip.set_trace()

 main()
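The commit also replaces dev-0/out.tsv and test-A/out.tsv (their diffs are suppressed above), yet train.py stops at ip.set_trace() without writing predictions, so those files were presumably produced from that debugger session. A rough sketch of such a prediction step, reusing the names defined in main(); eval_loader and the output path are invented here purely for illustration:

# Sketch only: score a dataset and dump 0/1 predictions, one per line.
model.eval()
h = model.init_hidden(batch_size)
preds = []
with torch.no_grad():
    for inputs, _ in eval_loader:                         # hypothetical DataLoader over dev-0/in.tsv
        h = tuple(each.data for each in h)
        output, h = model(inputs.type(torch.LongTensor), h)
        preds.extend(int(p) for p in torch.round(output)) # threshold the sigmoid scores at 0.5
with open('dev-0/out.tsv', 'w') as f:                     # illustrative output path
    f.writelines(f"{p}\n" for p in preds)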