#!/usr/bin/env python3
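"""Train an LSTM classifier for the paranormal-or-skeptic post-classification task.

Reads tab-separated posts from the file given as the first CLI argument and
integer labels (0/1) from the second, builds a word-to-index vocabulary, pads
each encoded post to a fixed length, and trains a binary LSTM classifier with
BCE loss.
"""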
import re
import sys
import pickle
import random
from collections import Counter
from string import punctuation

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from nltk.corpus import stopwords
import ipdb as ip

# Hardcoded to CPU; flip to True (e.g. via torch.cuda.is_available()) to train on a GPU.
train_on_gpu = False

class ClassifyLSTM(nn.Module):
    """Binary classifier: embedding -> LSTM -> dropout -> linear -> sigmoid."""

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        batch_size = x.size(0)
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)
        # Reshape to (batch_size, seq_length) and keep only the last time step.
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]
        return sig_out, hidden

    def init_hidden(self, batch_size):
        # Zero-initialise (h_0, c_0) with the same dtype/device as the weights.
        weight = next(self.parameters()).data
        if train_on_gpu:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden
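
# Shape walk-through with the hyperparameters used in main() (batch_size=50,
# seq_length=63, embedding_dim=400, hidden_dim=256, output_size=1):
#   x (50, 63) -> embeds (50, 63, 400) -> lstm_out (50, 63, 256)
#   -> flattened (50*63, 256) -> fc (50*63, 1) -> sigmoid -> view (50, 63)
#   -> [:, -1] keeps the prediction at the last time step: (50,).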

def clear_post(post):
    post = post.replace('\\n', ' ')
    post = post.lower()
    # Replace URLs with a single placeholder token.
    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    # Drop HTML entities and @-mentions.
    post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)', '', post)
    # Strip remaining punctuation and digits.
    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\\\!\=\^]+', '', post)
    post = re.sub(r'( \- |\-\-+)', ' ', post)
    post = re.sub(r' +', ' ', post)
    post = post.rstrip(' ')
    post = post.split(' ')
    # Filter out English stopwords.
    stop_words = set(stopwords.words('english'))
    post_no_stop = [w for w in post if w not in stop_words]
    return post_no_stop
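
# Example of the cleaning pipeline on a hypothetical post:
#   clear_post("Check out https://example.com, it's GREAT!!")
#   -> ['check', 'internetlink', 'great']
# (the URL collapses to the 'internetlink' token, punctuation and digits are
# stripped, and NLTK stopwords such as 'out' are filtered away)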

def count_all_words(posts):
    joint_posts = ' '.join(posts)
    words = joint_posts.split()
    count_words = Counter(words)
    total_words = len(words)
    sorted_words = count_words.most_common(total_words)
    return sorted_words, total_words, count_words
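
# Example with hypothetical posts (ties in most_common keep first-seen order):
#   count_all_words(['a b a', 'b c'])
#   -> ([('a', 2), ('b', 2), ('c', 1)], 5, Counter({'a': 2, 'b': 2, 'c': 1}))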

def pad_features(posts_int, seq_length):
    # Left-pad each encoded post with zeros, or truncate it, to exactly seq_length.
    features = np.zeros((len(posts_int), seq_length), dtype=int)
    for i, post in enumerate(posts_int):
        post_len = len(post)
        if post_len <= seq_length:
            zeroes = list(np.zeros(seq_length - post_len, dtype=int))
            new = zeroes + post
        else:
            new = post[0:seq_length]
        features[i, :] = np.array(new)
    return features
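
# Quick sanity check with hypothetical values: shorter posts are left-padded
# with zeros, longer ones truncated.
#   pad_features([[3, 7], [1, 2, 3, 4]], seq_length=3)
#   -> array([[0, 3, 7],
#             [1, 2, 3]])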
def main():
    if len(sys.argv) != 3:
        print('Usage: {} <in_file> <expected_file>'.format(sys.argv[0]), file=sys.stderr)
        return
    in_file = sys.argv[1]
    expected_file = sys.argv[2]

    # Read one post per line (first tab-separated field) and one label per line.
    posts = []
    labels = []
    with open(in_file, 'r') as f:
        for line in f:
            post = line.split('\t')[0].rstrip().lower()
            post = ''.join([c for c in post if c not in punctuation])
            posts.append(post)
    with open(expected_file, 'r') as f:
        for line in f:
            labels.append(int(line))

    # Build the vocabulary; index 0 is reserved for padding, so words start at 1.
    sorted_words, total_words, count_words = count_all_words(posts)
    vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}

    posts_int = []
    for post in posts:
        p = [vocab_to_int[w] for w in post.split()]
        posts_int.append(p)
    encoded_labels = np.array(labels)

    posts_len = [len(p) for p in posts_int]
    pd.Series(posts_len).hist()
    print(pd.Series(posts_len).describe())

    # Drop outliers: posts with fewer than three tokens.
    posts_int = [posts_int[i] for i, l in enumerate(posts_len) if l > 2]
    encoded_labels = [encoded_labels[i] for i, l in enumerate(posts_len) if l > 2]

    seq_length = 63
    train_x = pad_features(posts_int, seq_length)
    train_y = np.array(encoded_labels)

    train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
    batch_size = 50
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
    dataiter = iter(train_loader)
    sample_x, sample_y = next(dataiter)

    vocab_size = len(vocab_to_int) + 1  # +1 for the zero padding index
    output_size = 1
    embedding_dim = 400
    hidden_dim = 256
    n_layers = 2
    model = ClassifyLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

    lr = 0.001
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    epochs = 4
    counter = 0
    print_every = 100
    clip = 5  # gradient clipping threshold

    if train_on_gpu:
        model.cuda()

    model.train()
    for e in range(epochs):
        h = model.init_hidden(batch_size)
        for inputs, labels in train_loader:
            counter += 1
            inputs = inputs.long()
            if train_on_gpu:
                inputs, labels = inputs.cuda(), labels.cuda()
            # Detach the hidden state so gradients do not flow across batches.
            h = tuple([each.data for each in h])
            model.zero_grad()
            output, h = model(inputs, h)
            loss = criterion(output.squeeze(), labels.float())
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            if counter % print_every == 0:
                val_h = model.init_hidden(batch_size)
                val_losses = []
                model.eval()
                #for inputs, labels in valid_loader:
                #    val_h = tuple([each.data for each in val_h])
                #    if train_on_gpu:
                #        inputs, labels = inputs.cuda(), labels.cuda()
                #    inputs = inputs.type(torch.LongTensor)
                #    output, val_h = model(inputs, val_h)
                #    val_loss = criterion(output.squeeze(), labels.float())
                #    val_losses.append(val_loss.item())
                model.train()
                print("Epoch: {}/{}...".format(e + 1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)) if val_losses else "Val Loss: n/a")

    # test_losses = []
    # num_correct = 0
    # h = model.init_hidden(batch_size)
    # model.eval()
    # for inputs, labels in test_loader:
    #     h = tuple([each.data for each in h])
    #     if train_on_gpu:
    #         inputs, labels = inputs.cuda(), labels.cuda()
    #
    #     inputs = inputs.type(torch.LongTensor)
    #     output, h = model(inputs, h)
    #     test_loss = criterion(output.squeeze(), labels.float())
    #     test_losses.append(test_loss.item())

    # Drop into the debugger to inspect the trained model interactively.
    ip.set_trace()
if __name__ == '__main__':
    main()
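
# Hypothetical invocation (the in.tsv/expected.tsv names are an assumption,
# matching the usual layout of Gonito-style challenges such as this one):
#   ./train.py train/in.tsv train/expected.tsv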