add hyperparameter search

kpierzynski 2024-05-23 02:41:59 +02:00
parent b05c10444e
commit b2a20a87e4
3 changed files with 11785 additions and 10704 deletions

File diff suppressed because it is too large

1253  main.ipynb
File diff suppressed because one or more lines are too long

198  run.py

@@ -15,19 +15,12 @@ from torch import nn
 import torch
 from tqdm.notebook import tqdm
 
-embed_size = 300
-vocab_size = 30_000
-num_epochs = 1
 device = 'cuda'
-batch_size = 8192
-train_file_path = 'train/train.txt'
 
 # In[2]:
 
-# Function to extract words from a line of text
 def get_words_from_line(line):
     line = line.rstrip()
     yield '<s>'
@@ -35,13 +28,11 @@ def get_words_from_line(line):
         yield m.group(0).lower()
     yield '</s>'
 
-# Generator to read lines from a file
 def get_word_lines_from_file(file_name):
     with open(file_name, 'r', encoding='utf8') as fh:
         for line in fh:
             yield get_words_from_line(line)
 
-# Function to create 5-grams from a sequence
 def look_ahead_iterator(gen):
     prev2, prev1, next1, next2 = None, None, None, None
     for item in gen:
@@ -49,7 +40,10 @@ def look_ahead_iterator(gen):
             yield (prev2, prev1, next2, item, next1)
         prev2, prev1, next1, next2 = prev1, next1, next2, item
 
-# Dataset class for 5-grams
+
+# In[3]:
+
 class FiveGrams(IterableDataset):
     def __init__(self, text_file, vocabulary_size):
         self.vocab = build_vocab_from_iterator(
@@ -66,20 +60,6 @@ class FiveGrams(IterableDataset):
             (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file)))
         )
 
-# Instantiate the dataset
-train_dataset = FiveGrams(train_file_path, vocab_size)
-
-# In[3]:
-
-i = 0
-for x in train_dataset:
-    print(train_dataset.vocab.lookup_tokens(x))
-    if i >= 1:
-        break
-    i += 1
 
 # In[4]:
@@ -99,39 +79,44 @@ class SimpleFiveGramNeuralLanguageModel(nn.Module):
         out = self.linear2(out)
         return self.softmax(out)
 
-model = SimpleFiveGramNeuralLanguageModel(vocab_size, embed_size).to(device)
-
 # In[5]:
 
-data = DataLoader(train_dataset, batch_size=batch_size)
-optimizer = torch.optim.Adam(model.parameters())
-criterion = torch.nn.CrossEntropyLoss()
-
-model.train()
-
-step = 0
-for _ in range(num_epochs):
-    for x1, x2, x3, x4, y in tqdm(data, desc="Train loop"):
-        y = y.to(device)
-        x = torch.cat((x1.unsqueeze(1), x2.unsqueeze(1), x3.unsqueeze(1), x4.unsqueeze(1)), dim=1).to(device)
-        optimizer.zero_grad()
-        ypredicted = model(x)
-        loss = criterion(torch.log(ypredicted), y)
-        if step % 5000 == 0:
-            print(step, loss)
-        step += 1
-        loss.backward()
-        optimizer.step()
-    step = 0
-model.eval()
+def train(embed_size,vocab_size,num_epochs,batch_size,train_file_path):
+    train_dataset = FiveGrams(train_file_path, vocab_size)
+    model = SimpleFiveGramNeuralLanguageModel(vocab_size, embed_size).to(device)
+    data = DataLoader(train_dataset, batch_size=batch_size)
+    optimizer = torch.optim.Adam(model.parameters())
+    criterion = torch.nn.CrossEntropyLoss()
+
+    model.train()
+
+    step = 0
+    for _ in range(num_epochs):
+        for x1, x2, x3, x4, y in tqdm(data, desc="Train loop"):
+            y = y.to(device)
+            x = torch.cat((x1.unsqueeze(1), x2.unsqueeze(1), x3.unsqueeze(1), x4.unsqueeze(1)), dim=1).to(device)
+            optimizer.zero_grad()
+            ypredicted = model(x)
+            loss = criterion(torch.log(ypredicted), y)
+            if step % 5000 == 0:
+                print(step, loss)
+            step += 1
+            loss.backward()
+            optimizer.step()
+        step = 0
+        break
+    model.eval()
+    return model, train_dataset.vocab
 
-# In[8]:
+# In[6]:
 
-def get_gap_candidates(words, n=20, vocab=train_dataset.vocab):
+def get_gap_candidates(words, model, vocab, n=20):
     ixs = vocab(words)
     ixs = torch.tensor(ixs).unsqueeze(0).to(device)
@@ -151,11 +136,11 @@ def clean(text):
     text = text.strip()
     return text
 
-def predictor(prefix, suffix):
+def predictor(prefix, suffix, model, vocab):
     prefix = clean(prefix)
     suffix = clean(suffix)
     words = prefix.split(' ')[-2:] + suffix.split(' ')[:2]
-    candidates = get_gap_candidates(words)
+    candidates = get_gap_candidates(words, model, vocab)
 
     probs_sum = 0
     output = ''
@@ -169,10 +154,10 @@ def predictor(prefix, suffix):
 
     return output
 
-# In[9]:
+# In[7]:
 
-def generate_result(input_path, output_path='out.tsv'):
+def generate_result(input_path,model, vocab, output_path='out.tsv'):
     lines = []
     with open(input_path, encoding='utf-8') as f:
         for line in f:
@@ -183,8 +168,113 @@ def generate_result(input_path, output_path='out.tsv'):
     with open(output_path, 'w', encoding='utf-8') as output_file:
         for prefix, suffix in tqdm(lines):
-            result = predictor(prefix, suffix)
+            result = predictor(prefix, suffix, model, vocab)
             output_file.write(result + '\n')
 
-generate_result('dev-0/in.tsv', output_path='dev-0/out.tsv')
+
+# In[8]:
+
+import subprocess
+
+def evaluate():
+    cmd = 'wsl bash -c "cd /mnt/d/UAM/MODELOWANIE/5GRAM && ./geval -t dev-0"'
+    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+    return float(result.stdout)
+
+
+# In[9]:
+
+embed_sizes = [100,200,300]
+vocab_sizes = [10_000, 20_000, 30_000]
+num_epochss = [1]
+batch_sizes = [8192]
+train_file_paths = ['train/nano.txt', 'train/train.txt']
+
+results = []
+
+for embed_size in embed_sizes:
+    for vocab_size in vocab_sizes:
+        for num_epochs in num_epochss:
+            for batch_size in batch_sizes:
+                for train_file_path in train_file_paths:
+                    model, vocab = train(embed_size,vocab_size,num_epochs,batch_size,train_file_path)
+                    generate_result('dev-0/in.tsv', model, vocab, output_path='dev-0/out.tsv')
+                    result = evaluate()
+
+                    config = {"embed_size": embed_size, "vocab_size": vocab_size, "num_epochs": num_epochs, "batch_size": batch_size, "train_file_path": train_file_path, "perplexity": result }
+                    print(config)
+                    results.append( config )
+
+
+# In[10]:
+
+results
+
+
+# In[23]:
+
+import matplotlib.pyplot as plt
+import numpy as np
+from scipy.interpolate import griddata
+
+# Sample data
+data = results
+
+# Extracting data
+vocab_size = [item['vocab_size'] for item in data if 'nano' not in item['train_file_path'] ]
+embed_size = [item['embed_size'] for item in data if 'nano' not in item['train_file_path'] ]
+perplexity = [item['perplexity'] for item in data if 'nano' not in item['train_file_path'] ]
+
+# Plotting
+grid_x, grid_y = np.meshgrid(np.linspace(min(vocab_size), max(vocab_size), 100),
+                             np.linspace(min(embed_size), max(embed_size), 100))
+grid_z = griddata((vocab_size, embed_size), perplexity, (grid_x, grid_y), method='cubic')
+
+# Plotting
+plt.figure(figsize=(10, 6))
+contour = plt.contourf(grid_x, grid_y, grid_z, cmap='viridis')
+plt.colorbar(contour, label='Perplexity')
+plt.scatter(vocab_size, embed_size, c='red')  # Optional: plot actual data points
+plt.xlabel('Vocab Size')
+plt.ylabel('Embed Size')
+plt.title('Embed Size vs Vocab Size with Perplexity for whole training set')
+plt.show()
+
+
+# In[22]:
+
+# Extracting data
+vocab_size = [item['vocab_size'] for item in data if 'nano' in item['train_file_path'] ]
+embed_size = [item['embed_size'] for item in data if 'nano' in item['train_file_path'] ]
+perplexity = [item['perplexity'] for item in data if 'nano' in item['train_file_path'] ]
+
+# Plotting
+grid_x, grid_y = np.meshgrid(np.linspace(min(vocab_size), max(vocab_size), 100),
+                             np.linspace(min(embed_size), max(embed_size), 100))
+grid_z = griddata((vocab_size, embed_size), perplexity, (grid_x, grid_y), method='cubic')
+
+# Plotting
+plt.figure(figsize=(10, 6))
+contour = plt.contourf(grid_x, grid_y, grid_z, cmap='viridis')
+plt.colorbar(contour, label='Perplexity')
+plt.scatter(vocab_size, embed_size, c='red')  # Optional: plot actual data points
+plt.xlabel('Vocab Size')
+plt.ylabel('Embed Size')
+plt.title('Embed Size vs Vocab Size with Perplexity for nano training set')
+plt.show()
+
+
+# In[26]:
+
+from math import log
+
+best_model_parameters = min(results, key=lambda x: x['perplexity'])
+best_model_parameters['logPerplexity'] = log(best_model_parameters['perplexity'])
+
+best_model_parameters