Compare commits


5 Commits
master...gpt

Author SHA1 Message Date
Jakub Kaczmarek 40c36dce44 Final out fix 2023-06-09 02:33:57 +02:00
Jakub Kaczmarek 5bc8f3f6f7 Fix output 2023-06-09 02:26:52 +02:00
Jakub Kaczmarek c8247f077f Fix output 2023-06-09 02:24:43 +02:00
Jakub Kaczmarek ab56101d2c Remove results for dev-0 2023-06-09 02:07:56 +02:00
Jakub Kaczmarek f8892f9209 Add gpt2 solution 2023-06-09 01:55:15 +02:00
13 changed files with 7463 additions and 46535 deletions

.gitignore (vendored, 4 changes)

@@ -1,4 +1,4 @@
geval
*~
*.swp
*.bak
@@ -6,5 +6,3 @@ geval
*.o
.DS_Store
.token
*.pickle
*.xz

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

Configuration file (file name not shown)

@@ -5,9 +5,6 @@ tags:
 params:
   epochs: 1
   vocab-size: 20000
-  batch-size: 10000
-  embed-size:
-  - 100
-  - 500
-  - 1000
-  topk: 10
+  batch-size: 5000
+  embed-size: 100
+  topk: 150

gpt_predict.py (new file, 18 lines)

@@ -0,0 +1,18 @@
from transformers import pipeline
import lzma

generator = pipeline("text-generation", model="gpt2")

with open("test-A/in.tsv", "r") as input_file, open(
    "test-A/out.tsv", "w"
) as output_file:
    for line in input_file:
        line = line.rstrip()
        line = line.replace("\\n", " ")
        prompt = line.split("\t")[6]
        result = generator(prompt, max_new_tokens=1, num_return_sequences=1)[0][
            "generated_text"
        ]
        output_file.write(f"{result.split()[-1]}:1\n")
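
A note on the script above: the text-generation pipeline returns the prompt together with the newly generated text, so result.split()[-1] picks out the last whitespace-delimited token of that combined string. A minimal sketch of the same idea that asks the pipeline for the continuation only, via return_full_text=False; the gpt2 checkpoint, the one-token limit, and the word:1 output format come from the script above, while the helper name and the fallback word are illustrative assumptions:

from transformers import pipeline

generator = pipeline("text-generation", model="gpt2")

def predict_next_word(prompt: str) -> str:
    # return_full_text=False makes the pipeline return only the newly generated
    # tokens, so there is no need to strip the prompt off the front of the result.
    out = generator(prompt, max_new_tokens=1, return_full_text=False)[0]["generated_text"]
    words = out.split()
    return words[-1] if words else "the"  # assumed fallback when the new token is pure whitespace

print(f"{predict_next_word('I went to the')}:1")  # same word:1 line format as gpt_predict.py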

run.py (168 changes)

@@ -1,3 +1,5 @@
from itertools import islice
import sys
import lzma
import regex as re
from torchtext.vocab import build_vocab_from_iterator
@@ -14,12 +16,10 @@ from tqdm import tqdm
def get_words_from_line(line):
    line = line.rstrip()
    line = line.split("\t")
    text = line[-2] + " " + line[-1]
    text = re.sub(r"\\\\+n", " ", text)
    text = re.sub('[^A-Za-z ]+', '', text)
    for t in text.split():
        yield t
    yield "<s>"
    for m in re.finditer(r"[\p{L}0-9\*]+|\p{P}+", line):
        yield m.group(0).lower()
    yield "</s>"
def get_word_lines_from_file(file_name):
@@ -64,28 +64,25 @@ class TrigramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(TrigramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.hidden = nn.Linear(embedding_dim * 2, hidden_dim)
        self.output = nn.Linear(hidden_dim, vocab_size)
        self.linear1 = nn.Linear(embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.softmax = nn.Softmax()
    def forward(self, x, y):
        x = self.embeddings(x)
        y = self.embeddings(y)
        z = self.hidden(torch.cat([x, y], dim=1))
        z = self.output(z)
        z = self.linear1(x + y)
        z = self.linear2(z)
        z = self.softmax(z)
        return z
embed_size = 500
vocab_size = 20000
vocab_path = "vocabulary.pickle"
if exists(vocab_path):
    print("Loading vocabulary from file...")
    with open(vocab_path, "rb") as fh:
        vocab = pickle.load(fh)
else:
    print("Building vocabulary...")
    vocab = build_vocab_from_iterator(
        get_word_lines_from_file("train/in.tsv.xz"),
        max_tokens=vocab_size,
@@ -95,139 +92,28 @@ else:
    with open(vocab_path, "wb") as fh:
        pickle.dump(vocab, fh)
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)
dataset_path = 'train/dataset.pickle'
if exists(dataset_path):
print("Loading dataset from file...")
with open(dataset_path, "rb") as fh:
train_dataset = pickle.load(fh)
else:
print("Building dataset...")
train_dataset = Trigrams("train/in.tsv.xz", vocab_size)
with open(dataset_path, "wb") as fh:
pickle.dump(train_dataset, fh)
print("Building model...")
model = TrigramModel(vocab_size, embed_size, 64).to(device)
data = DataLoader(train_dataset, batch_size=10000)
device = "cpu"
train_dataset = Trigrams("train/in.tsv.xz", vocab_size)
model = TrigramModel(vocab_size, 100, 64).to(device)
data = DataLoader(train_dataset, batch_size=5000)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.NLLLoss()
print("Training model...")
model.train()
losses = []
step = 0
max_steps = 1000
for epoch in tqdm(range(10)):
    for x, y, z in tqdm(data):
        x = x.to(device)
        y = y.to(device)
        z = z.to(device)
for x, y, z in tqdm(data):
    x = x.to(device)
    y = y.to(device)
    z = z.to(device)
        optimizer.zero_grad()
        ypredicted = model(x, z)
        loss = criterion(torch.log(ypredicted), y)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        step += 1
        if step > max_steps:
            break
    optimizer.zero_grad()
    ypredicted = model(x, z)
    loss = criterion(torch.log(ypredicted), y)
    losses.append(loss)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch} loss:", loss.item())
plt.plot(losses)
plt.show()
torch.save(model.state_dict(), f"trigram_model-embed_{embed_size}.bin")
vocab_unique = set(train_dataset.vocab.get_stoi().keys())
output = []
print('Predicting dev...')
with lzma.open("dev-0/in.tsv.xz", encoding='utf8', mode="rt") as file:
for line in tqdm(file):
line = line.split("\t")
first_word = re.sub(r"\\\\+n", " ", line[-2]).split()[-1]
first_word = re.sub('[^A-Za-z]+', '', first_word)
next_word = re.sub(r"\\\\+n", " ", line[-1]).split()[0]
nenxt_word = re.sub('[^A-Za-z]+', '', next_word)
if first_word not in vocab_unique:
word = "<unk>"
if next_word not in vocab_unique:
word = "<unk>"
first_word = torch.tensor(train_dataset.vocab.forward([first_word])).to(device)
next_word = torch.tensor(train_dataset.vocab.forward([next_word])).to(device)
out = model(first_word, next_word)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
unk_bonus = 1 - sum(top_probs)
top_words = vocab.lookup_tokens(top_indices)
top_zipped = list(zip(top_words, top_probs))
res = ""
for w, p in top_zipped:
if w == "<unk>":
res += f":{(p + unk_bonus):.4f} "
else:
res += f"{w}:{p:.4f} "
res = res[:-1]
res += "\n"
output.append(res)
with open(f"dev-0/out-embed-{embed_size}.tsv", mode="w") as file:
file.writelines(output)
model.eval()
output = []
print('Predicting test...')
with lzma.open("test-A/in.tsv.xz", encoding='utf8', mode="rt") as file:
for line in tqdm(file):
line = line.split("\t")
first_word = re.sub(r"\\\\+n", " ", line[-2]).split()[-1]
first_word = re.sub('[^A-Za-z]+', '', first_word)
next_word = re.sub(r"\\\\+n", " ", line[-1]).split()[0]
next_word = re.sub('[^A-Za-z]+', '', next_word)
if first_word not in vocab_unique:
word = "<unk>"
if next_word not in vocab_unique:
word = "<unk>"
first_word = torch.tensor(train_dataset.vocab.forward([first_word])).to(device)
next_word = torch.tensor(train_dataset.vocab.forward([next_word])).to(device)
out = model(first_word, next_word)
top = torch.topk(out[0], 10)
top_indices = top.indices.tolist()
top_probs = top.values.tolist()
unk_bonus = 1 - sum(top_probs)
top_words = vocab.lookup_tokens(top_indices)
top_zipped = list(zip(top_words, top_probs))
res = ""
for w, p in top_zipped:
if w == "<unk>":
res += f":{(p + unk_bonus):.4f} "
else:
res += f"{w}:{p:.4f} "
res = res[:-1]
res += "\n"
output.append(res)
with open(f"test-A/out-embed-{embed_size}.tsv", mode="w") as file:
file.writelines(output)
torch.save(model.state_dict(), "model1.bin")

File diff suppressed because it is too large

File diff suppressed because it is too large

test-A/out.tsv (new file, 7414 lines)

File diff suppressed because it is too large

Binary file not shown.

Binary file not shown.

Binary file not shown.