changed encoding (GPT-2)

adnovac 2022-06-09 20:52:29 +02:00
parent 023903113d
commit a7bc73667e
3 changed files with 17975 additions and 18169 deletions

File diff suppressed because it is too large

run.py (278 changed lines)

@@ -1,253 +1,59 @@
# -*- coding: utf-8 -*-
"""run - gpt.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1YlyKQShvsB_4qBTfjdRm2ngeYAKpPtxt
"""
!pip install transformers
from google.colab import drive
drive.mount('/content/gdrive/')
# imports
import csv
import itertools
from os.path import exists

import numpy as np
import pandas as pd
import regex as re  # `regex` (not the stdlib `re`) supports the \p{L}/\p{P} classes used below
import torch
import transformers
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
from transformers import AutoModelForCausalLM, AutoTokenizer
# hyperparameters for the LSTM baseline
vocab_size = 15000
embed_size = 128
lstm_size = 128

# pretrained GPT-2 for the causal-LM predictions
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
# helper functions
def clean(text):
    text = str(text).strip().lower()
    # drop punctuation and markup characters
    text = re.sub(r'[><.\\|"”\-,*:/]', '', text)
    text = text.replace('\\n', " ").replace("'t", " not").replace("'s", " is").replace("'ll", " will").replace("'m", " am").replace("'ve", " have")
    text = text.replace("'", "")
    return text
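# Quick illustrative check (the input string is an assumption, not from the data):
# clean("Don't stop.") -> "don not stop"   # contractions are handled naively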
def predict(text):
    # condition GPT-2 on (at most) the last 50 whitespace-separated tokens of left context
    text = str.join(" ", text.split()[-50:])
    input_ids = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        logits = model(**input_ids).logits[:, -1, :]
    result = ""
    top = torch.topk(logits, 2)
    probs = torch.softmax(top.values[0], dim=0).tolist()
    print(probs)
    for i in range(2):
        predicted_word = tokenizer.decode(top.indices[0][i], skip_special_tokens=True).split()[-1]
        sentence_score = probs[i]
        result += f"{predicted_word}:{sentence_score} "
    # reserve leftover probability mass for all other words
    result = result + ":0.2"
    return result
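# Illustrative output (words and probabilities depend on the context and on
# GPT-2's top-2 logits): predict("the cat sat on the") -> "mat:0.6 floor:0.4 :0.2"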
def get_words_from_line(line, specials = True):
    line = line.rstrip()
    if specials:
        yield '<s>'
    # words/numbers or runs of punctuation, Unicode-aware (needs `regex`)
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    if specials:
        yield '</s>'
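# Illustrative tokenization:
# list(get_words_from_line("Hello, world!")) -> ['<s>', 'hello', ',', 'world', '!', '</s>']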
def predict_doc(input_path, output_path):
    data = pd.read_csv(input_path, sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)[6]
    data = data.replace('\\\\n', "", regex=True)
    data = data.apply(lambda x: re.sub('[^a-zA-Z0-9] ', '', x))
    cnt = len(data)
    with open(output_path, 'w') as file:
        for i, row in enumerate(data):
            try:
                result = predict(row)
            except Exception:
                # fall back to a flat two-word distribution on any failure
                result = "a:0.5 the:0.5"
            print(f"{i}/{cnt} {result}")
            file.write(result + '\n')
# predict_doc('gdrive/MyDrive/dev-0/in.tsv.xz', 'gdrive/MyDrive/dev-0/out.tsv')
def get_word_lines_from_data(d):
    for line in d:
        yield get_words_from_line(line)
class Model(torch.nn.Module):
    def __init__(self, vocabulary_size, embedding_size, lstm_size):
        super(Model, self).__init__()
        self.lstm_size = lstm_size
        self.embedding_dim = embedding_size
        self.num_layers = 3
        self.embedding = torch.nn.Embedding(
            num_embeddings=vocabulary_size,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = torch.nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
        )
        self.fc = torch.nn.Linear(self.lstm_size, vocabulary_size)

    def forward(self, x, prev_state = None):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state
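# Shape sketch (illustrative, not part of the original notebook): a 1-D batch of
# token ids flows ids (B,) -> embedding (B, 128) -> LSTM (B, 128) -> linear (B, vocab_size):
# m = Model(vocab_size, embed_size, lstm_size)
# logits, state = m(torch.randint(0, vocab_size, (4,)))   # logits: (4, vocab_size)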
class Trigrams(torch.utils.data.IterableDataset):
    def __init__(self, data, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_data(data),
            max_tokens = vocabulary_size,
            specials = ['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.data = data

    @staticmethod
    def look_ahead_iterator(gen):
        # despite the class name, this yields (current, next) bigram pairs
        w1 = None
        for item in gen:
            if w1 is not None:
                yield (w1, item)
            w1 = item

    def __iter__(self):
        return self.look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_data(self.data))))
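# Illustrative stream: for a line "the cat sat" the iterator yields the vocab-id
# pairs for (<s>, the), (the, cat), (cat, sat), (sat, </s>); once the dataset
# below is built, e.g.:
# pairs = list(itertools.islice(iter(train_dataset), 4))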
# loading the training data
train_in = pd.read_csv("gdrive/MyDrive/train/in.tsv.xz", sep='\t', header=None, encoding="UTF-8", on_bad_lines="skip", quoting=csv.QUOTE_NONE, nrows=20000)[[6, 7]]
train_expected = pd.read_csv("gdrive/MyDrive/train/expected.tsv", sep='\t', header=None, encoding="UTF-8", on_bad_lines="skip", quoting=csv.QUOTE_NONE, nrows=20000)
train_data = pd.concat([train_in, train_expected], axis=1)
# one training row = left context + expected gap word + right context
train_data = train_data[6] + " " + train_data[0] + " " + train_data[7]
train_data = train_data.apply(clean)
train_dataset = Trigrams(train_data, vocab_size)
train_dataset_rev = Trigrams(train_data.iloc[::-1], vocab_size)
# train or load the forward model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Model(vocab_size, embed_size, lstm_size).to(device)  # note: reuses the name of the GPT-2 `model` above
print(device)

if not exists('model1.bin'):
    data = DataLoader(train_dataset, batch_size=8000)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss()
    model.train()
    step = 0
    for i in range(1):
        print(f"EPOCH {i}=========================")
        for x, y in data:
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)
            y_pred, state_h = model(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            if step % 100 == 0:
                print(step, loss)
            step += 1
    torch.save(model.state_dict(), 'model1.bin')
else:
    print("Loading model1")
    model.load_state_dict(torch.load('model1.bin'))
vocab = train_dataset.vocab
# train or load the backward model (trained on the reversed text)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_b = Model(vocab_size, embed_size, lstm_size).to(device)
print(device)

if not exists('model1_b.bin'):
    data_b = DataLoader(train_dataset_rev, batch_size=8000)
    optimizer = torch.optim.Adam(model_b.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss()
    model_b.train()
    step = 0
    for i in range(1):
        print(f"EPOCH {i}=========================")
        for x, y in data_b:  # iterate over the reversed-data loader
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)
            y_pred, state_h = model_b(x)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()
            if step % 100 == 0:
                print(step, loss)
            step += 1
    torch.save(model_b.state_dict(), 'model1_b.bin')
else:
    print("Loading model1_b")
    model_b.load_state_dict(torch.load('model1_b.bin'))
def predict(tokens_left, tokens_right):
    # note: this redefines the GPT-2 `predict` above with a bidirectional
    # variant that combines the forward and backward LSTM models
    ixs = torch.tensor(vocab.forward(tokens_left)).to(device)
    ixs_r = torch.tensor(vocab.forward(tokens_right)).to(device)
    with torch.no_grad():
        out = model(ixs)
        out_b = model_b(ixs_r)
    top = torch.topk(out[0], 8)
    top_b = torch.topk(out_b[0], 8)
    top_indices = top.indices.tolist()[0]
    top_probs = top.values.tolist()[0]
    top_indices_b = top_b.indices.tolist()[0]
    top_probs_b = top_b.values.tolist()[0]
    raw_result = []
    # merge the two top-8 lists, summing scores for words both models propose
    for ind in set(top_indices + top_indices_b):
        prob = 0
        if ind in top_indices:
            prob += top_probs[top_indices.index(ind)]
        if ind in top_indices_b:
            prob += top_probs_b[top_indices_b.index(ind)]
        raw_result += [[vocab.lookup_token(ind), prob]]
    raw_result = list(filter(lambda x: x[0] != "<unk>", raw_result))
    raw_result = sorted(raw_result, key=lambda x: -x[1])[:8]
    words = [x[0] for x in raw_result]
    probs = [x[1] for x in raw_result]
    # normalize the merged scores with a softmax
    probs_x = np.exp(probs) / sum(np.exp(probs))
    result = ""
    for word, prob in list(zip(words, probs_x)):
        result += f"{word}:{prob} "
    result += ":0.3"
    result = result.rstrip()
    return result
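# Illustrative call (the context words are assumptions; scores depend on training):
# predict(["sat"], ["the"]) -> e.g. "on:0.4 in:0.2 at:0.1 ... :0.3"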
def predict_file(result_path, data):
    with open(result_path, "w+", encoding="UTF-8") as f:
        for index, row in data.iterrows():
            before = []
            after = []
            # first word of the right context
            for word in get_words_from_line(clean(str(row[7])), False):
                after = [word]
                break
            # last word of the left context
            for word in get_words_from_line(clean(str(row[6])), False):
                before = [word]
            if len(before) < 1 or len(after) < 1:
                # fallback distribution when either context is empty
                result = "a:0.2 the:0.2 to:0.2 of:0.2 and:0.1 :0.1"
            else:
                result = predict(before, after)
            result = result.strip()
            print(result)
            f.write(result + "\n")
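# Each output line is one gap's distribution in the "word:prob" format expected
# by the evaluation, e.g. (values illustrative): "the:0.35 a:0.2 his:0.15 :0.3"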
dev_data = pd.read_csv("gdrive/MyDrive/dev-0/in.tsv.xz", sep='\t', header=None, quoting=csv.QUOTE_NONE)
dev_data[6] = dev_data[6].apply(clean)
dev_data[7] = dev_data[7].apply(clean)
predict_file("gdrive/MyDrive/dev-0/out.tsv", dev_data)
test_data = pd.read_csv("gdrive/MyDrive/test-A/in.tsv.xz", sep='\t', header=None, quoting=csv.QUOTE_NONE)
test_data[6] = test_data[6].apply(clean)
test_data[7] = test_data[7].apply(clean)
predict_file("gdrive/MyDrive/test-A/out.tsv", test_data)
# !wget https://gonito.net/get/bin/geval
# !chmod 777 geval
!rm -r dev-0
!cp -r gdrive/MyDrive/dev-0 dev-0
!./geval -t dev-0 --metric PerplexityHashed
predict_doc('gdrive/MyDrive/test-A/in.tsv.xz', 'gdrive/MyDrive/test-A/out.tsv')

File diff suppressed because it is too large