# challenging-america-word-ga.../run.py
import itertools
import lzma

import regex as re
import torch
from nltk.tokenize import RegexpTokenizer
from torch import nn
from torch.utils.data import DataLoader, IterableDataset
from torchtext.vocab import build_vocab_from_iterator

tokenizer = RegexpTokenizer(r"\w+")


def read_file_6(file):
    # Yield the cleaned left-context column (text[6]) of each TSV line: drop escaped
    # and literal newlines, collapse repeated spaces, lower-case, and strip punctuation
    # except apostrophes.
    for line in file:
        text = line.split("\t")
        yield re.sub(r"[^\w\d'\s]+", '',
                     re.sub(' +', ' ', text[6].replace("\\n", " ").replace("\n", "").lower()))


def get_words_from_line(line):
    # Tokenize one line into lower-cased word and punctuation tokens,
    # wrapped in <s> ... </s> sentence-boundary markers.
    line = line.rstrip()
    yield '<s>'
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    yield '</s>'


def get_words_lines_from_file(file_path):
    # Stream the xz-compressed TSV and yield, for every line, a generator of tokens
    # built from the concatenated left and right context columns (text[6] and text[7]).
    with lzma.open(file_path, mode='rt') as file:
        for index, line in enumerate(file):
            text = line.split("\t")
            yield get_words_from_line(
                re.sub(r"[^\w\d'\s]+", '',
                       re.sub(' +', ' ',
                              ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower())))
            # if index == 1000:
            #     break


vocab_size = 30000

vocab = build_vocab_from_iterator(
    get_words_lines_from_file('train/in.tsv.xz'),
    max_tokens=vocab_size,
    specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])
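
# A quick illustration (not executed) of how this vocab behaves: tokens map to integer
# ids, out-of-vocabulary tokens fall back to the <unk> index set above, and ids map back
# to strings with lookup_tokens. The exact ids depend on the training data, e.g.:
#
#   vocab.forward(['the', 'presidentt'])   # -> e.g. [2, 0]  (the misspelling maps to <unk>)
#   vocab.lookup_tokens([0])               # -> ['<unk>']
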
embed_size = 100


class SimpleBigramNeuralLanguageModel(nn.Module):
    # Previous-word embedding followed by a linear projection back to the vocabulary,
    # normalised with a softmax so the output is P(next word | previous word).
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(dim=1)  # explicit dim avoids the implicit-dimension deprecation warning
        )

    def forward(self, x):
        return self.model(x)
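
# A minimal sketch (never called) of what a forward pass of the model above produces:
# one probability distribution over the whole vocabulary per input word id.
# The example words are only illustrative.
def _example_forward_pass():
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size)
    prev_words = torch.tensor(vocab.forward(['he', 'went', 'to']))
    probs = model(prev_words)             # shape: (3, vocab_size)
    print(probs.shape, probs.sum(dim=1))  # every row sums to ~1.0 thanks to the softmax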


def look_ahead_iterator(gen):
    # Turn a stream of tokens into a stream of (previous token, current token) pairs.
    prev = None
    for item in gen:
        if prev is not None:
            yield (prev, item)
        prev = item
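
# For example, look_ahead_iterator(iter(['<s>', 'he', 'went', '</s>'])) yields
# ('<s>', 'he'), ('he', 'went'), ('went', '</s>'); these (previous, current) pairs
# are exactly the bigram training examples consumed below.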


class Bigrams(IterableDataset):
    # Iterable dataset that streams (previous word id, current word id) pairs straight
    # from the compressed training file, so the corpus never has to fit in memory.
    def __init__(self, text_file, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_words_lines_from_file(text_file),
            max_tokens=vocabulary_size,
            specials=['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.text_file = text_file

    def __iter__(self):
        return look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_words_lines_from_file(self.text_file))))
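
# Each batch drawn from this dataset is a pair of LongTensors (previous word ids,
# current word ids). A rough sketch of reading one batch, assuming the same file
# layout as in train() below:
#
#   loader = DataLoader(Bigrams('train/in.tsv.xz', vocab_size), batch_size=4)
#   x, y = next(iter(loader))   # x and y both have shape (4,)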


def train():
    batch_size = 15000
    train_dataset = Bigrams('train/in.tsv.xz', vocab_size)
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.NLLLoss()
    model.train()
    step = 0
    for x, y in train_data_loader:
        # Transfer the batch to the GPU
        x = x.to(device)
        y = y.to(device)
        # Clear the gradients
        optimizer.zero_grad()
        # Forward pass
        ypredicted = model(x)
        # Compute the loss (the model outputs probabilities, so NLLLoss needs their log)
        loss = criterion(torch.log(ypredicted), y)
        if step % 100 == 0:
            print(step, loss.item())
        step += 1
        # Backpropagate
        loss.backward()
        # Update the weights
        optimizer.step()
    print(step)
    torch.save(model.state_dict(), 'model1.bin')
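
# Design note: ending the network in Softmax and taking torch.log of its output in the
# loss is mathematically equivalent to LogSoftmax + NLLLoss (or raw logits with
# CrossEntropyLoss), which would be more numerically stable; the explicit softmax has
# the convenience that predict() below gets probabilities directly out of forward().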


def predict(word):
    # Load the trained model and return the top next-word candidates for `word`,
    # formatted as "word:prob ... :rest" (word/probability pairs plus the leftover mass).
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    model.load_state_dict(torch.load('model1.bin'))
    model.eval()
    ixs = torch.tensor(vocab.forward([word])).to(device)
    out = model(ixs)
    top = torch.topk(out[0], 8)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    str_predictions = ""
    lht = 1.0  # probability mass not yet assigned to any listed word
    for pred_word in list(zip(top_words, top_indices, top_probs)):
        if lht - pred_word[2] >= 0:
            str_predictions += f"{pred_word[0]}:{pred_word[2]} "
            lht -= pred_word[2]
    if lht != 1.0:
        # Assign the leftover mass to the catch-all entry required by the output format.
        str_predictions += f":{lht}"
    return str_predictions
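
# A call such as predict('went') returns one line in the expected output format, e.g.
# "to:0.21 home:0.11 back:0.07 :0.61"; the trailing ":<rest>" entry is the probability
# left over for every word not listed. Actual words and values depend on the trained model.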


def similar():
    # Sanity check on the learned embeddings: print the 10 words whose embedding is
    # closest (by cosine similarity) to the embedding of 'went'.
    device = 'cuda'
    model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
    model.load_state_dict(torch.load('model1.bin'))
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    embeddings = model.model[0].weight
    vec = embeddings[vocab['went']]
    similarities = cos(vec, embeddings)
    top = torch.topk(similarities, 10)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    print(list(zip(top_words, top_indices, top_probs)))


def generate_outputs(input_file, output_file):
    # For each input line, predict the word that follows the last token of the left
    # context; fall back to a fixed default distribution when the context is very short.
    with open(output_file, 'w') as outputf:
        with lzma.open(input_file, mode='rt') as file:
            for index, text in enumerate(read_file_6(file)):
                tokens = tokenizer.tokenize(text)
                if len(tokens) < 4:
                    prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
                else:
                    prediction = predict(tokens[-1])
                outputf.write(prediction + '\n')


if __name__ == "__main__":
    # train()
    # predict()
    # generate_outputs("dev-0/in.tsv.xz", "dev-0/out.tsv")
    generate_outputs("test-A/in.tsv.xz", "test-A/out.tsv")
    # count_words = 0
    # for i in get_words_lines_from_file('train/in.tsv.xz'):
    #     for j in i:
    #         count_words += 1
    # print(count_words)