challenging-america-word-ga.../run.py
2022-05-30 21:08:27 +02:00

253 lines
7.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""run
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1vjpmLsNPjPLM1_5fBGbBYg-ZqdXQeGQH
"""
# Colab-only setup: mount Google Drive at /content/gdrive/. The data and
# output paths used later in this file are written relative to
# "gdrive/MyDrive/..." (presumably resolved against a /content working
# directory - TODO confirm).
from google.colab import drive
drive.mount('/content/gdrive/')
# imports
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
import torch
import pandas as pd
import regex as re
import csv
import itertools
from os.path import exists
# Hyperparameters used when building the datasets and both models below.
vocab_size = 15_000  # token limit given to Trigrams and vocabulary size given to Model
embed_size = lstm_size = 128  # embedding dimension and LSTM hidden size given to Model
# helper functions
def clean(text):
    """Normalise a raw text field before tokenisation.

    The value is converted to ``str``, stripped and lower-cased. Literal
    backslash-n markers become spaces, a fixed set of punctuation characters
    is deleted, a few English contractions are expanded, and any apostrophes
    that remain are dropped.

    Args:
        text: Any value. It is passed through ``str()`` first, so ``None``
            becomes ``"none"`` and a float NaN becomes ``"nan"``.

    Returns:
        The cleaned string.
    """
    text = str(text).strip().lower()
    # Replace the two-character "\n" marker first: the punctuation pass below
    # deletes backslashes, which would otherwise leave a stray "n" behind.
    text = text.replace("\\n", " ")
    # Raw-string character class, so each listed character (including the
    # backslash and the straight double quote) is matched on its own.
    text = re.sub(r'[><.\\"”\-,*:/]', "", text)
    contractions = (
        ("'t", " not"),
        ("'s", " is"),
        ("'ll", " will"),
        ("'m", " am"),
        ("'ve", " have"),
    )
    for suffix, expansion in contractions:
        text = text.replace(suffix, expansion)
    return text.replace("'", "")
def get_words_from_line(line, specials = True):
    """Lazily tokenise one line of text.

    Trailing whitespace is stripped, then every run of letters, digits and
    ``*`` characters, and every run of punctuation, is yielded in lower case.
    The pattern uses Unicode property classes for letters and punctuation.

    Args:
        line: The text to tokenise.
        specials: When true, ``'<s>'`` is yielded before the tokens and
            ``'</s>'`` after them.

    Yields:
        The tokens as strings, in the order they occur in ``line``.
    """
    stripped = line.rstrip()
    if specials:
        yield '<s>'
    yield from (
        match.group(0).lower()
        for match in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', stripped)
    )
    if specials:
        yield '</s>'
def get_word_lines_from_data(d):
    """Yield one token generator per line of ``d``.

    Args:
        d: An iterable of text lines.

    Yields:
        For each line, the generator returned by ``get_words_from_line(line)``
        (called with its default arguments, so the sentence markers are
        included).
    """
    yield from (get_words_from_line(text_line) for text_line in d)
class Model(torch.nn.Module):
    """Next-token model: embedding -> 3-layer LSTM -> linear layer over the vocabulary.

    Args:
        vocabulary_size: Number of rows of the embedding table and number of
            output logits per position.
        embedding_size: Dimension of the token embeddings (also the LSTM
            input size).
        lstm_size: Hidden size of every LSTM layer.
    """

    def __init__(self, vocabulary_size, embedding_size, lstm_size):
        super().__init__()
        self.lstm_size = lstm_size
        self.embedding_dim = embedding_size
        self.num_layers = 3
        # Size the layers from the constructor arguments rather than from
        # module-level globals, so the class works for any vocabulary size.
        self.embedding = torch.nn.Embedding(
            num_embeddings=vocabulary_size,
            embedding_dim=self.embedding_dim,
        )
        # The LSTM consumes the embeddings, so its input size must be the
        # embedding dimension (not the hidden size).
        self.lstm = torch.nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
        )
        self.fc = torch.nn.Linear(self.lstm_size, vocabulary_size)

    def forward(self, x, prev_state = None):
        """Run the network on a tensor of token ids.

        Args:
            x: Integer tensor of token ids. The LSTM is built without
                ``batch_first``, so for 2-D input the first axis is treated
                as the sequence axis.
            prev_state: Optional LSTM state to start from; ``None`` lets the
                LSTM use its default initial state.

        Returns:
            A ``(logits, state)`` tuple: the vocabulary logits for every input
            position and the LSTM state returned for the input.
        """
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state
class Trigrams(torch.utils.data.IterableDataset):
    """Iterable dataset of consecutive token-id pairs drawn from text lines.

    Despite the class name, iteration yields pairs ``(previous_id, next_id)``
    taken from the flattened token stream of all lines.
    """

    def __init__(self, data, vocabulary_size):
        """Store the lines and build a vocabulary over them.

        Args:
            data: Iterable of text lines. It is traversed once here to build
                the vocabulary and again on every iteration of the dataset.
            vocabulary_size: Passed to the vocabulary builder as
                ``max_tokens``.
        """
        self.data = data
        self.vocabulary_size = vocabulary_size
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_data(data),
            max_tokens = vocabulary_size,
            specials = ['<unk>'])
        # Tokens missing from the vocabulary map to the '<unk>' index.
        unknown_index = self.vocab['<unk>']
        self.vocab.set_default_index(unknown_index)

    @staticmethod
    def look_ahead_iterator(gen):
        """Yield ``(previous, current)`` for each pair of neighbouring items.

        An item that is ``None`` is never used as the ``previous`` element.
        """
        previous = None
        for current in gen:
            if previous is None:
                previous = current
                continue
            yield (previous, current)
            previous = current

    def __iter__(self):
        """Return an iterator over consecutive token-id pairs of the data."""
        token_stream = itertools.chain.from_iterable(
            get_word_lines_from_data(self.data))
        id_stream = (self.vocab[token] for token in token_stream)
        return self.look_ahead_iterator(id_stream)
# Load the training data.
# Columns 6 and 7 of in.tsv.xz are used as the text before and after the gap
# (predict_file reads them the same way); expected.tsv supplies the gap word.
# NOTE(review): both files are read with on_bad_lines="skip"; if a line is
# skipped in only one of them the rows would no longer line up - confirm.
train_in = pd.read_csv("gdrive/MyDrive/train/in.tsv.xz", sep='\t', header=None, encoding="UTF-8", on_bad_lines="skip", quoting=csv.QUOTE_NONE, nrows=20000)[[6, 7]]
train_expected = pd.read_csv("gdrive/MyDrive/train/expected.tsv", sep='\t', header=None, encoding="UTF-8", on_bad_lines="skip", quoting=csv.QUOTE_NONE, nrows=20000)
train_data = pd.concat([train_in, train_expected], axis=1)
# Join with explicit spaces so the last word of the left context, the gap
# word and the first word of the right context remain separate tokens.
train_data = train_data[6] + " " + train_data[0] + " " + train_data[7]
train_data = train_data.apply(clean)
train_dataset = Trigrams(train_data, vocab_size)
# NOTE(review): iloc[::-1] reverses the order of the rows only; the words
# inside each row keep their original order.
train_dataset_rev = Trigrams(train_data.iloc[::-1], vocab_size)
# Train the forward model, or load it when a saved checkpoint already exists.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Model(vocab_size, embed_size, lstm_size).to(device)
print(device)
if not exists('model1.bin'):
    data = DataLoader(train_dataset, batch_size=8000)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss()
    model.train()
    step = 0
    for i in range(1):
        print(f"EPOCH {i}=========================")
        for x, y in data:
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)
            y_pred, state_h = model(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            if step % 100 == 0:
                print(step, loss)
            step += 1
    torch.save(model.state_dict(), 'model1.bin')
else:
    print("Loading model1")
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
    # machine (and vice versa).
    model.load_state_dict(torch.load('model1.bin', map_location=device))
# The model is only used for prediction from here on; eval mode disables the
# LSTM dropout so predictions are deterministic.
model.eval()
vocab = train_dataset.vocab
# Train the backward model, or load it when a saved checkpoint already exists.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_b = Model(vocab_size, embed_size, lstm_size).to(device)
print(device)
if not exists('model1_b.bin'):
    data_b = DataLoader(train_dataset_rev, batch_size=8000)
    optimizer = torch.optim.Adam(model_b.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss()
    model_b.train()
    step = 0
    for i in range(1):
        print(f"EPOCH {i}=========================")
        # Iterate this model's own loader; using the forward loader here
        # would train on the wrong dataset and fail with a NameError when
        # the forward model was loaded from disk instead of trained.
        for x, y in data_b:
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)
            y_pred, state_h = model_b(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            if step % 100 == 0:
                print(step, loss)
            step += 1
    torch.save(model_b.state_dict(), 'model1_b.bin')
else:
    print("Loading model1_b")
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
    # machine (and vice versa).
    model_b.load_state_dict(torch.load('model1_b.bin', map_location=device))
# The model is only used for prediction from here on; eval mode disables the
# LSTM dropout so predictions are deterministic.
model_b.eval()
import numpy as np
def predict(tokens_left, tokens_right):
    """Build the output line of word probabilities for one gap.

    The forward model scores candidates from ``tokens_left`` and the backward
    model from ``tokens_right``. The top candidates of both models are merged
    (scores of a word proposed by both are added), ``<unk>`` is dropped, and
    the best eight are converted to probabilities with a softmax. The word
    probabilities are scaled so that, together with the trailing wildcard
    entry, the line sums to 1.

    Args:
        tokens_left: List of tokens preceding the gap (callers in this file
            pass a single token).
        tokens_right: List of tokens following the gap (callers in this file
            pass a single token).

    Returns:
        A string of space-separated ``word:probability`` entries followed by
        the wildcard entry ``:0.3``.
    """
    top_k = 8
    wildcard_mass = 0.3  # probability reserved for "any other word"

    # Inference only: disable dropout and skip autograd bookkeeping.
    model.eval()
    model_b.eval()
    with torch.no_grad():
        ixs = torch.tensor(vocab.forward(tokens_left)).to(device)
        ixs_r = torch.tensor(vocab.forward(tokens_right)).to(device)
        logits, _ = model(ixs)
        logits_b, _ = model_b(ixs_r)
        candidates = (torch.topk(logits, top_k), torch.topk(logits_b, top_k))

    # Merge both candidate lists; only the first row of each result is used.
    scores = {}
    for top in candidates:
        row_indices = top.indices.tolist()[0]
        row_scores = top.values.tolist()[0]
        for ind, score in zip(row_indices, row_scores):
            scores[ind] = scores.get(ind, 0) + score

    ranked = [(vocab.lookup_token(ind), score) for ind, score in scores.items()]
    ranked = [entry for entry in ranked if entry[0] != "<unk>"]
    ranked = sorted(ranked, key=lambda entry: -entry[1])[:top_k]

    words = [entry[0] for entry in ranked]
    raw_scores = np.array([entry[1] for entry in ranked])
    # Subtract the maximum before exponentiating so large scores cannot
    # overflow; the softmax result is unchanged by the shift.
    weights = np.exp(raw_scores - raw_scores.max())
    probs = weights / weights.sum() * (1 - wildcard_mass)

    entries = [f"{word}:{prob}" for word, prob in zip(words, probs)]
    entries.append(f":{wildcard_mass}")
    return " ".join(entries).rstrip()
from nltk import word_tokenize
def predict_file(result_path, data):
    """Write one prediction line per row of ``data`` to ``result_path``.

    For every row the last token of column 6 (text before the gap) and the
    first token of column 7 (text after the gap) are passed to ``predict``.
    When either side yields no token at all, a fixed fallback distribution is
    written instead, because ``predict`` needs one token on each side.
    Each line is also printed.

    Args:
        result_path: Path of the output file; it is created or overwritten.
        data: DataFrame whose columns 6 and 7 hold the left and right context.
    """
    # Fallback for rows without usable context; every word is listed once.
    fallback = "a:0.2 the:0.2 to:0.2 of:0.1 and:0.1 in:0.1 :0.1"
    with open(result_path, "w", encoding="UTF-8") as f:
        for _, row in data.iterrows():
            left_tokens = list(get_words_from_line(clean(str(row[6])), False))
            first_right = next(get_words_from_line(clean(str(row[7])), False), None)
            if not left_tokens or first_right is None:
                result = fallback
            else:
                result = predict(left_tokens[-1:], [first_right])
            result = result.strip()
            print(result)
            f.write(result + "\n")
def _read_context_frame(path):
    """Read a tab-separated input file and clean its context columns 6 and 7."""
    frame = pd.read_csv(path, sep='\t', header=None, quoting=csv.QUOTE_NONE)
    for column in (6, 7):
        frame[column] = frame[column].apply(clean)
    return frame


# Predictions for the dev-0 split.
dev_data = _read_context_frame("gdrive/MyDrive/dev-0/in.tsv.xz")
predict_file("gdrive/MyDrive/dev-0/out.tsv", dev_data)

# Predictions for the test-A split.
test_data = _read_context_frame("gdrive/MyDrive/test-A/in.tsv.xz")
predict_file("gdrive/MyDrive/test-A/out.tsv", test_data)
# One-time setup, run manually to fetch the evaluation tool:
#   wget https://gonito.net/get/bin/geval
#   chmod 777 geval
import shutil
import subprocess

# Replace the local dev-0 directory with a fresh copy from Drive and score
# the dev predictions. Plain Python is used instead of notebook "!" shell
# escapes so this file is valid when run as a script.
shutil.rmtree("dev-0", ignore_errors=True)
shutil.copytree("gdrive/MyDrive/dev-0", "dev-0")
# check=False: a failing evaluation is reported by geval's own output and
# does not raise here.
subprocess.run(["./geval", "-t", "dev-0", "--metric", "PerplexityHashed"], check=False)