Compare commits
5 Commits
Author | SHA1 | Date |
---|---|---|
Jakub Kaczmarek | 40c36dce44 | |
Jakub Kaczmarek | 5bc8f3f6f7 | |
Jakub Kaczmarek | c8247f077f | |
Jakub Kaczmarek | ab56101d2c | |
Jakub Kaczmarek | f8892f9209 |
|
@ -1,4 +1,4 @@
|
|||
geval
|
||||
|
||||
*~
|
||||
*.swp
|
||||
*.bak
|
||||
|
@ -6,5 +6,3 @@ geval
|
|||
*.o
|
||||
.DS_Store
|
||||
.token
|
||||
*.pickle
|
||||
*.xz
|
10519
dev-0/out-embed-100.tsv
10519
dev-0/out-embed-100.tsv
File diff suppressed because it is too large
Load Diff
10519
dev-0/out-embed-500.tsv
10519
dev-0/out-embed-500.tsv
File diff suppressed because it is too large
Load Diff
10519
dev-0/out.tsv
10519
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
|
@ -5,9 +5,6 @@ tags:
|
|||
params:
|
||||
epochs: 1
|
||||
vocab-size: 20000
|
||||
batch-size: 10000
|
||||
embed-size:
|
||||
- 100
|
||||
- 500
|
||||
- 1000
|
||||
topk: 10
|
||||
batch-size: 5000
|
||||
embed-size: 100
|
||||
topk: 150
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
from transformers import pipeline
|
||||
import lzma
|
||||
|
||||
generator = pipeline("text-generation", model="gpt2")
|
||||
|
||||
with open("test-A/in.tsv", "r") as input_file, open(
|
||||
"test-A/out.tsv", "w"
|
||||
) as output_file:
|
||||
for line in input_file:
|
||||
line = line.rstrip()
|
||||
line = line.replace("\\n", " ")
|
||||
|
||||
prompt = line.split("\t")[6]
|
||||
|
||||
result = generator(prompt, max_new_tokens=1, num_return_sequences=1)[0][
|
||||
"generated_text"
|
||||
]
|
||||
output_file.write(f"{result.split()[-1]}:1\n")
|
168
run.py
168
run.py
|
@ -1,3 +1,5 @@
|
|||
from itertools import islice
|
||||
import sys
|
||||
import lzma
|
||||
import regex as re
|
||||
from torchtext.vocab import build_vocab_from_iterator
|
||||
|
@ -14,12 +16,10 @@ from tqdm import tqdm
|
|||
|
||||
def get_words_from_line(line):
|
||||
line = line.rstrip()
|
||||
line = line.split("\t")
|
||||
text = line[-2] + " " + line[-1]
|
||||
text = re.sub(r"\\\\+n", " ", text)
|
||||
text = re.sub('[^A-Za-z ]+', '', text)
|
||||
for t in text.split():
|
||||
yield t
|
||||
yield "<s>"
|
||||
for m in re.finditer(r"[\p{L}0-9\*]+|\p{P}+", line):
|
||||
yield m.group(0).lower()
|
||||
yield "</s>"
|
||||
|
||||
|
||||
def get_word_lines_from_file(file_name):
|
||||
|
@ -64,28 +64,25 @@ class TrigramModel(nn.Module):
|
|||
def __init__(self, vocab_size, embedding_dim, hidden_dim):
|
||||
super(TrigramModel, self).__init__()
|
||||
self.embeddings = nn.Embedding(vocab_size, embedding_dim)
|
||||
self.hidden = nn.Linear(embedding_dim * 2, hidden_dim)
|
||||
self.output = nn.Linear(hidden_dim, vocab_size)
|
||||
self.linear1 = nn.Linear(embedding_dim, hidden_dim)
|
||||
self.linear2 = nn.Linear(hidden_dim, vocab_size)
|
||||
self.softmax = nn.Softmax()
|
||||
|
||||
def forward(self, x, y):
|
||||
x = self.embeddings(x)
|
||||
y = self.embeddings(y)
|
||||
z = self.hidden(torch.cat([x, y], dim=1))
|
||||
z = self.output(z)
|
||||
z = self.linear1(x + y)
|
||||
z = self.linear2(z)
|
||||
z = self.softmax(z)
|
||||
return z
|
||||
|
||||
|
||||
embed_size = 500
|
||||
vocab_size = 20000
|
||||
vocab_path = "vocabulary.pickle"
|
||||
if exists(vocab_path):
|
||||
print("Loading vocabulary from file...")
|
||||
with open(vocab_path, "rb") as fh:
|
||||
vocab = pickle.load(fh)
|
||||
else:
|
||||
print("Building vocabulary...")
|
||||
vocab = build_vocab_from_iterator(
|
||||
get_word_lines_from_file("train/in.tsv.xz"),
|
||||
max_tokens=vocab_size,
|
||||
|
@ -95,139 +92,28 @@ else:
|
|||
with open(vocab_path, "wb") as fh:
|
||||
pickle.dump(vocab, fh)
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
|
||||
print("Using device:", device)
|
||||
dataset_path = 'train/dataset.pickle'
|
||||
if exists(dataset_path):
|
||||
print("Loading dataset from file...")
|
||||
with open(dataset_path, "rb") as fh:
|
||||
train_dataset = pickle.load(fh)
|
||||
else:
|
||||
print("Building dataset...")
|
||||
train_dataset = Trigrams("train/in.tsv.xz", vocab_size)
|
||||
with open(dataset_path, "wb") as fh:
|
||||
pickle.dump(train_dataset, fh)
|
||||
|
||||
print("Building model...")
|
||||
model = TrigramModel(vocab_size, embed_size, 64).to(device)
|
||||
data = DataLoader(train_dataset, batch_size=10000)
|
||||
device = "cpu"
|
||||
train_dataset = Trigrams("train/in.tsv.xz", vocab_size)
|
||||
model = TrigramModel(vocab_size, 100, 64).to(device)
|
||||
data = DataLoader(train_dataset, batch_size=5000)
|
||||
optimizer = torch.optim.Adam(model.parameters())
|
||||
criterion = torch.nn.NLLLoss()
|
||||
|
||||
print("Training model...")
|
||||
model.train()
|
||||
losses = []
|
||||
step = 0
|
||||
max_steps = 1000
|
||||
for epoch in tqdm(range(10)):
|
||||
for x, y, z in tqdm(data):
|
||||
x = x.to(device)
|
||||
y = y.to(device)
|
||||
z = z.to(device)
|
||||
|
||||
for x, y, z in tqdm(data):
|
||||
x = x.to(device)
|
||||
y = y.to(device)
|
||||
z = z.to(device)
|
||||
|
||||
optimizer.zero_grad()
|
||||
ypredicted = model(x, z)
|
||||
loss = criterion(torch.log(ypredicted), y)
|
||||
losses.append(loss.item())
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
step += 1
|
||||
if step > max_steps:
|
||||
break
|
||||
optimizer.zero_grad()
|
||||
ypredicted = model(x, z)
|
||||
loss = criterion(torch.log(ypredicted), y)
|
||||
losses.append(loss)
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
print(f"Epoch {epoch} loss:", loss.item())
|
||||
|
||||
plt.plot(losses)
|
||||
plt.show()
|
||||
|
||||
torch.save(model.state_dict(), f"trigram_model-embed_{embed_size}.bin")
|
||||
|
||||
vocab_unique = set(train_dataset.vocab.get_stoi().keys())
|
||||
|
||||
output = []
|
||||
print('Predicting dev...')
|
||||
with lzma.open("dev-0/in.tsv.xz", encoding='utf8', mode="rt") as file:
|
||||
for line in tqdm(file):
|
||||
line = line.split("\t")
|
||||
|
||||
first_word = re.sub(r"\\\\+n", " ", line[-2]).split()[-1]
|
||||
first_word = re.sub('[^A-Za-z]+', '', first_word)
|
||||
|
||||
next_word = re.sub(r"\\\\+n", " ", line[-1]).split()[0]
|
||||
nenxt_word = re.sub('[^A-Za-z]+', '', next_word)
|
||||
|
||||
if first_word not in vocab_unique:
|
||||
word = "<unk>"
|
||||
if next_word not in vocab_unique:
|
||||
word = "<unk>"
|
||||
|
||||
first_word = torch.tensor(train_dataset.vocab.forward([first_word])).to(device)
|
||||
next_word = torch.tensor(train_dataset.vocab.forward([next_word])).to(device)
|
||||
|
||||
out = model(first_word, next_word)
|
||||
|
||||
top = torch.topk(out[0], 10)
|
||||
top_indices = top.indices.tolist()
|
||||
top_probs = top.values.tolist()
|
||||
unk_bonus = 1 - sum(top_probs)
|
||||
top_words = vocab.lookup_tokens(top_indices)
|
||||
top_zipped = list(zip(top_words, top_probs))
|
||||
|
||||
res = ""
|
||||
for w, p in top_zipped:
|
||||
if w == "<unk>":
|
||||
res += f":{(p + unk_bonus):.4f} "
|
||||
else:
|
||||
res += f"{w}:{p:.4f} "
|
||||
|
||||
res = res[:-1]
|
||||
res += "\n"
|
||||
output.append(res)
|
||||
|
||||
with open(f"dev-0/out-embed-{embed_size}.tsv", mode="w") as file:
|
||||
file.writelines(output)
|
||||
|
||||
|
||||
model.eval()
|
||||
|
||||
output = []
|
||||
print('Predicting test...')
|
||||
with lzma.open("test-A/in.tsv.xz", encoding='utf8', mode="rt") as file:
|
||||
for line in tqdm(file):
|
||||
line = line.split("\t")
|
||||
|
||||
first_word = re.sub(r"\\\\+n", " ", line[-2]).split()[-1]
|
||||
first_word = re.sub('[^A-Za-z]+', '', first_word)
|
||||
|
||||
next_word = re.sub(r"\\\\+n", " ", line[-1]).split()[0]
|
||||
next_word = re.sub('[^A-Za-z]+', '', next_word)
|
||||
|
||||
if first_word not in vocab_unique:
|
||||
word = "<unk>"
|
||||
if next_word not in vocab_unique:
|
||||
word = "<unk>"
|
||||
|
||||
first_word = torch.tensor(train_dataset.vocab.forward([first_word])).to(device)
|
||||
next_word = torch.tensor(train_dataset.vocab.forward([next_word])).to(device)
|
||||
|
||||
out = model(first_word, next_word)
|
||||
|
||||
top = torch.topk(out[0], 10)
|
||||
top_indices = top.indices.tolist()
|
||||
top_probs = top.values.tolist()
|
||||
unk_bonus = 1 - sum(top_probs)
|
||||
top_words = vocab.lookup_tokens(top_indices)
|
||||
top_zipped = list(zip(top_words, top_probs))
|
||||
|
||||
res = ""
|
||||
for w, p in top_zipped:
|
||||
if w == "<unk>":
|
||||
res += f":{(p + unk_bonus):.4f} "
|
||||
else:
|
||||
res += f"{w}:{p:.4f} "
|
||||
|
||||
res = res[:-1]
|
||||
res += "\n"
|
||||
output.append(res)
|
||||
|
||||
with open(f"test-A/out-embed-{embed_size}.tsv", mode="w") as file:
|
||||
file.writelines(output)
|
||||
torch.save(model.state_dict(), "model1.bin")
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue