paranormal-or-skeptic/train.py
#!/usr/bin/env python3
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import re, sys, pickle, random
from nltk.corpus import stopwords
import torch
import ipdb as ip
from string import punctuation
from collections import Counter
import numpy as np
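
# Set to True to move the model and data batches to CUDA (see the .cuda() calls below).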
train_on_gpu = False
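

# Binary post classifier: embedding -> stacked LSTM -> dropout -> linear -> sigmoid.
# The sigmoid output at the last time step is used as the score for the whole post.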
class ClassifyLSTM(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        batch_size = x.size(0)
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)
        # Reshape back to (batch_size, seq_length) and keep only the last time step.
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]
        return sig_out, hidden

    def init_hidden(self, batch_size):
        # Zero-initialised (h, c) tensors with the same dtype as the model weights.
        weight = next(self.parameters()).data
        if train_on_gpu:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        return hidden
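

# Normalise a raw post: lowercase it, replace URLs with an 'internetlink' token,
# strip punctuation and leftover markup, and drop English stopwords; returns a token list.
# (Defined here but not called by main(), which does its own lighter cleaning.)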
def clear_post(post):
    post = post.replace('\\n', ' ')
    post = post.lower()
    post = re.sub(r'(\(|)(http|https|www)[a-zA-Z0-9\.\:\/\_\=\&\;\?\+\-\%]+(\)|)', ' internetlink ', post)
    post = re.sub(r'[\.\,\/\~]+', ' ', post)
    post = re.sub(r'(&lt|&gt|\@[a-zA-Z0-9]+)', '', post)
    post = re.sub(r'[\'\(\)\?\*\"\`\;0-9\[\]\:\%\|\\\!\=\^]+', '', post)
    post = re.sub(r'( \- |\-\-+)', ' ', post)
    post = re.sub(r' +', ' ', post)
    post = post.rstrip(' ')
    post = post.split(' ')
    stop_words = set(stopwords.words('english'))
    post_no_stop = [w for w in post if w not in stop_words]
    return post_no_stop
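

# Count word frequencies over all posts; returns (words sorted by frequency,
# total token count, the Counter itself).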
def count_all_words(posts):
    joint_posts = ' '.join(posts)
    words = joint_posts.split()
    count_words = Counter(words)
    total_words = len(words)
    sorted_words = count_words.most_common(total_words)
    return sorted_words, total_words, count_words
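

# Left-pad each encoded post with zeros (the padding index) or truncate it so that
# every row of the returned matrix has exactly seq_length entries.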
def pad_features(posts_int, seq_length):
    features = np.zeros((len(posts_int), seq_length), dtype=int)
    for i, post in enumerate(posts_int):
        post_len = len(post)
        if post_len <= seq_length:
            zeroes = list(np.zeros(seq_length - post_len))
            new = zeroes + post
        elif post_len > seq_length:
            new = post[0:seq_length]
        features[i, :] = np.array(new)
    return features
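

# Training pipeline: read tab-separated posts and one-per-line integer labels,
# build a word-to-index vocabulary, drop very short posts, pad/truncate to a fixed
# length, and train the LSTM classifier with BCE loss for a few epochs.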
def main():
    if len(sys.argv) != 3:
        print('usage: {} IN_FILE EXPECTED_FILE'.format(sys.argv[0]), file=sys.stderr)
        return
    in_file = sys.argv[1]
    expected_file = sys.argv[2]
    posts = []
    labels = []
    with open(in_file, 'r') as f:
        for line in f:
            post = line.split('\t')[0].rstrip().lower()
            post = ''.join([c for c in post if c not in punctuation])
            posts.append(post)
    with open(expected_file, 'r') as f:
        for line in f:
            labels.append(int(line))
    sorted_words, total_words, count_words = count_all_words(posts)
    vocab_to_int = {w: i + 1 for i, (w, c) in enumerate(sorted_words)}
    posts_int = []
    for post in posts:
        p = [vocab_to_int[w] for w in post.split()]
        posts_int.append(p)
    encoded_labels = np.array(labels)
    posts_len = [len(p) for p in posts_int]
    pd.Series(posts_len).hist()
    print(pd.Series(posts_len).describe())
    # outliers: drop posts with two or fewer tokens
    posts_int = [posts_int[i] for i, l in enumerate(posts_len) if l > 2]
    encoded_labels = [encoded_labels[i] for i, l in enumerate(posts_len) if l > 2]
    seq_length = 63
    train_x = pad_features(posts_int, seq_length)
    train_y = np.array(encoded_labels)
    train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
    batch_size = 50
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last=True)
    dataiter = iter(train_loader)
    sample_x, sample_y = next(dataiter)
    vocab_size = len(vocab_to_int) + 1
    output_size = 1
    embedding_dim = 400
    hidden_dim = 256
    n_layers = 2
    model = ClassifyLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
    lr = 0.001
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    epochs = 4
    counter = 0
    print_every = 100
    clip = 5
    if train_on_gpu:
        model.cuda()
    model.train()
    for e in range(epochs):
        h = model.init_hidden(batch_size)
        for inputs, labels in train_loader:
            counter += 1
            # Cast indices to LongTensor before any device move so the GPU branch keeps working.
            inputs = inputs.type(torch.LongTensor)
            if train_on_gpu:
                inputs, labels = inputs.cuda(), labels.cuda()
            # Detach the hidden state so gradients do not propagate across batches.
            h = tuple([each.data for each in h])
            model.zero_grad()
            output, h = model(inputs, h)
            loss = criterion(output.squeeze(), labels.float())
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()
            if counter % print_every == 0:
                val_h = model.init_hidden(batch_size)
                val_losses = []
                model.eval()
                # for inputs, labels in valid_loader:
                #     val_h = tuple([each.data for each in val_h])
                #     if train_on_gpu:
                #         inputs, labels = inputs.cuda(), labels.cuda()
                #     inputs = inputs.type(torch.LongTensor)
                #     output, val_h = model(inputs, val_h)
                #     val_loss = criterion(output.squeeze(), labels.float())
                #     val_losses.append(val_loss.item())
                model.train()  # back to training mode after the (currently disabled) validation pass
                print("Epoch: {}/{}...".format(e + 1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.6f}...".format(loss.item()),
                      "Val Loss: {:.6f}".format(np.mean(val_losses)) if val_losses else "Val Loss: n/a")
    # test_losses = []
    # num_correct = 0
    # h = model.init_hidden(batch_size)
    # model.eval()
    # for inputs, labels in test_loader:
    #     h = tuple([each.data for each in h])
    #     if train_on_gpu:
    #         inputs, labels = inputs.cuda(), labels.cuda()
    #
    #     inputs = inputs.type(torch.LongTensor)
    #     output, h = model(inputs, h)
    #     test_loss = criterion(output.squeeze(), labels.float())
    #     test_losses.append(test_loss.item())
    ip.set_trace()  # drop into the ipdb debugger for manual inspection after training


if __name__ == '__main__':
    main()