add hyperparameter search

kpierzynski 2024-05-23 02:41:59 +02:00
parent b05c10444e
commit b2a20a87e4
3 changed files with 11785 additions and 10704 deletions

File diff suppressed because it is too large

1253 main.ipynb

File diff suppressed because one or more lines are too long

198 run.py

@@ -15,19 +15,12 @@ from torch import nn
import torch
from tqdm.notebook import tqdm
embed_size = 300
vocab_size = 30_000
num_epochs = 1
device = 'cuda'
batch_size = 8192
train_file_path = 'train/train.txt'
# In[2]:
# Function to extract words from a line of text
def get_words_from_line(line):
    line = line.rstrip()
    yield '<s>'
@@ -35,13 +28,11 @@ def get_words_from_line(line):
        yield m.group(0).lower()
    yield '</s>'
# Generator to read lines from a file
def get_word_lines_from_file(file_name):
    with open(file_name, 'r', encoding='utf8') as fh:
        for line in fh:
            yield get_words_from_line(line)
# Function to create 5-grams from a sequence
def look_ahead_iterator(gen):
    prev2, prev1, next1, next2 = None, None, None, None
    for item in gen:
@@ -49,7 +40,10 @@ def look_ahead_iterator(gen):
            yield (prev2, prev1, next2, item, next1)
        prev2, prev1, next1, next2 = prev1, next1, next2, item
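# Illustrative sketch, not part of this commit: the tuple yielded above holds the two
# words before the gap, then the two words after it, with the gap word itself last,
# which is why the training loop below unpacks batches as x1, x2, x3, x4, y. The guard
# condition here is an assumption, since the hunk hides that line.
def _demo_look_ahead(gen):
    prev2, prev1, next1, next2 = None, None, None, None
    for item in gen:
        if None not in (prev2, prev1, next1, next2):
            yield (prev2, prev1, next2, item, next1)
        prev2, prev1, next1, next2 = prev1, next1, next2, item

# list(_demo_look_ahead(iter('abcde'))) -> [('a', 'b', 'd', 'e', 'c')]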
# Dataset class for 5-grams
# In[3]:
class FiveGrams(IterableDataset):
    def __init__(self, text_file, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
@@ -66,20 +60,6 @@ class FiveGrams(IterableDataset):
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file)))
        )
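# Hedged sketch, not part of this commit: the vocabulary arguments are hidden by the
# hunk above, so max_tokens, the '<unk>' special and set_default_index below are
# assumptions about how such a torchtext vocabulary is typically built.
from torchtext.vocab import build_vocab_from_iterator

def _demo_build_vocab(text_file, vocabulary_size):
    vocab = build_vocab_from_iterator(
        get_word_lines_from_file(text_file),
        max_tokens=vocabulary_size,
        specials=['<unk>'],
    )
    vocab.set_default_index(vocab['<unk>'])
    return vocab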
# Instantiate the dataset
train_dataset = FiveGrams(train_file_path, vocab_size)
# In[3]:
i = 0
for x in train_dataset:
    print(train_dataset.vocab.lookup_tokens(x))
    if i >= 1:
        break
    i += 1
# In[4]:
@@ -99,39 +79,44 @@ class SimpleFiveGramNeuralLanguageModel(nn.Module):
        out = self.linear2(out)
        return self.softmax(out)
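# Hedged sketch, not part of this commit: most of SimpleFiveGramNeuralLanguageModel is
# hidden by the hunk, so the layer names other than linear2/softmax, the hidden size
# and the ReLU are assumptions about what such a gap-filling model typically looks
# like: embed the four context words, concatenate the embeddings, and map them to a
# probability distribution over the vocabulary.
class _DemoFiveGramModel(nn.Module):
    def __init__(self, vocabulary_size, embedding_size, hidden_size=512):
        super().__init__()
        self.embedding = nn.Embedding(vocabulary_size, embedding_size)
        self.linear1 = nn.Linear(4 * embedding_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, vocabulary_size)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):                # x: (batch, 4) context word ids
        out = self.embedding(x)          # (batch, 4, embedding_size)
        out = out.view(x.size(0), -1)    # concatenate the four embeddings
        out = torch.relu(self.linear1(out))
        out = self.linear2(out)          # (batch, vocabulary_size)
        return self.softmax(out)         # probabilities, fed to log() in the loss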
model = SimpleFiveGramNeuralLanguageModel(vocab_size, embed_size).to(device)
# In[5]:
data = DataLoader(train_dataset, batch_size=batch_size)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.CrossEntropyLoss()
model.train()
step = 0
for _ in range(num_epochs):
    for x1, x2, x3, x4, y in tqdm(data, desc="Train loop"):
        y = y.to(device)
        x = torch.cat((x1.unsqueeze(1), x2.unsqueeze(1), x3.unsqueeze(1), x4.unsqueeze(1)), dim=1).to(device)
        optimizer.zero_grad()
        ypredicted = model(x)
        loss = criterion(torch.log(ypredicted), y)
        if step % 5000 == 0:
            print(step, loss)
        step += 1
        loss.backward()
        optimizer.step()
def train(embed_size, vocab_size, num_epochs, batch_size, train_file_path):
    train_dataset = FiveGrams(train_file_path, vocab_size)
    model = SimpleFiveGramNeuralLanguageModel(vocab_size, embed_size).to(device)
    data = DataLoader(train_dataset, batch_size=batch_size)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.CrossEntropyLoss()
    model.train()
    step = 0
    model.eval()
    for _ in range(num_epochs):
        for x1, x2, x3, x4, y in tqdm(data, desc="Train loop"):
            y = y.to(device)
            x = torch.cat((x1.unsqueeze(1), x2.unsqueeze(1), x3.unsqueeze(1), x4.unsqueeze(1)), dim=1).to(device)
            optimizer.zero_grad()
            ypredicted = model(x)
            loss = criterion(torch.log(ypredicted), y)
            if step % 5000 == 0:
                print(step, loss)
            step += 1
            loss.backward()
            optimizer.step()
        step = 0
        break
    model.eval()
    return model, train_dataset.vocab
# In[8]:
# In[6]:
def get_gap_candidates(words, n=20, vocab=train_dataset.vocab):
def get_gap_candidates(words, model, vocab, n=20):
    ixs = vocab(words)
    ixs = torch.tensor(ixs).unsqueeze(0).to(device)
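# Hedged sketch, not part of this commit: the rest of get_gap_candidates is hidden by
# the next hunk, so the torch.topk / lookup_tokens steps below are an assumption about
# how the (token, probability) candidates are presumably produced.
def _demo_gap_candidates(words, model, vocab, n=20):
    ixs = torch.tensor(vocab(words)).unsqueeze(0).to(device)
    with torch.no_grad():
        probs = model(ixs)[0]            # distribution over the whole vocabulary
    top = torch.topk(probs, n)
    return list(zip(vocab.lookup_tokens(top.indices.tolist()), top.values.tolist()))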
@@ -151,11 +136,11 @@ def clean(text):
    text = text.strip()
    return text
def predictor(prefix, suffix):
def predictor(prefix, suffix, model, vocab):
    prefix = clean(prefix)
    suffix = clean(suffix)
    words = prefix.split(' ')[-2:] + suffix.split(' ')[:2]
    candidates = get_gap_candidates(words)
    candidates = get_gap_candidates(words, model, vocab)
    probs_sum = 0
    output = ''
@@ -169,10 +154,10 @@ def predictor(prefix, suffix):
    return output
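# Note, an assumption rather than something stated in the diff: probs_sum and the
# output string assembled in the hidden part of this hunk suggest the geval
# gap-prediction format, one line per gap such as "the:0.41 of:0.08 :0.51", where the
# trailing ":p" carries the leftover probability mass so the line sums to 1.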
# In[9]:
# In[7]:
def generate_result(input_path, output_path='out.tsv'):
def generate_result(input_path,model, vocab, output_path='out.tsv'):
    lines = []
    with open(input_path, encoding='utf-8') as f:
        for line in f:
@@ -183,8 +168,113 @@ def generate_result(input_path, output_path='out.tsv'):
    with open(output_path, 'w', encoding='utf-8') as output_file:
        for prefix, suffix in tqdm(lines):
            result = predictor(prefix, suffix)
            result = predictor(prefix, suffix, model, vocab)
            output_file.write(result + '\n')
generate_result('dev-0/in.tsv', output_path='dev-0/out.tsv')
# In[8]:
import subprocess
def evaluate():
    cmd = 'wsl bash -c "cd /mnt/d/UAM/MODELOWANIE/5GRAM && ./geval -t dev-0"'
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    return float(result.stdout)
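# Note: float(result.stdout) assumes geval prints exactly one number (the dev-0
# perplexity). A slightly more defensive variant, an assumption rather than part of
# the commit, would be:
#     return float(result.stdout.strip().split()[-1])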
# In[9]:
embed_sizes = [100,200,300]
vocab_sizes = [10_000, 20_000, 30_000]
num_epochss = [1]
batch_sizes = [8192]
train_file_paths = ['train/nano.txt', 'train/train.txt']
results = []
for embed_size in embed_sizes:
    for vocab_size in vocab_sizes:
        for num_epochs in num_epochss:
            for batch_size in batch_sizes:
                for train_file_path in train_file_paths:
                    model, vocab = train(embed_size, vocab_size, num_epochs, batch_size, train_file_path)
                    generate_result('dev-0/in.tsv', model, vocab, output_path='dev-0/out.tsv')
                    result = evaluate()
                    config = {"embed_size": embed_size, "vocab_size": vocab_size, "num_epochs": num_epochs, "batch_size": batch_size, "train_file_path": train_file_path, "perplexity": result}
                    print(config)
                    results.append(config)
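# Equivalent formulation, a sketch rather than part of this commit: itertools.product
# flattens the five nested loops above into a single loop, which makes adding or
# dropping hyperparameters easier.
import itertools

for embed_size, vocab_size, num_epochs, batch_size, train_file_path in itertools.product(
        embed_sizes, vocab_sizes, num_epochss, batch_sizes, train_file_paths):
    model, vocab = train(embed_size, vocab_size, num_epochs, batch_size, train_file_path)
    generate_result('dev-0/in.tsv', model, vocab, output_path='dev-0/out.tsv')
    config = {"embed_size": embed_size, "vocab_size": vocab_size, "num_epochs": num_epochs,
              "batch_size": batch_size, "train_file_path": train_file_path,
              "perplexity": evaluate()}
    print(config)
    results.append(config)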
# In[10]:
results
# In[23]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import griddata
# Grid-search results collected above
data = results
# Extracting data
vocab_size = [item['vocab_size'] for item in data if 'nano' not in item['train_file_path'] ]
embed_size = [item['embed_size'] for item in data if 'nano' not in item['train_file_path'] ]
perplexity = [item['perplexity'] for item in data if 'nano' not in item['train_file_path'] ]
# Plotting
grid_x, grid_y = np.meshgrid(np.linspace(min(vocab_size), max(vocab_size), 100),
                             np.linspace(min(embed_size), max(embed_size), 100))
grid_z = griddata((vocab_size, embed_size), perplexity, (grid_x, grid_y), method='cubic')
# Plotting
plt.figure(figsize=(10, 6))
contour = plt.contourf(grid_x, grid_y, grid_z, cmap='viridis')
plt.colorbar(contour, label='Perplexity')
plt.scatter(vocab_size, embed_size, c='red') # Optional: plot actual data points
plt.xlabel('Vocab Size')
plt.ylabel('Embed Size')
plt.title('Embed Size vs Vocab Size with Perplexity for whole training set')
plt.show()
# In[22]:
# Extracting data
vocab_size = [item['vocab_size'] for item in data if 'nano' in item['train_file_path'] ]
embed_size = [item['embed_size'] for item in data if 'nano' in item['train_file_path'] ]
perplexity = [item['perplexity'] for item in data if 'nano' in item['train_file_path'] ]
# Plotting
grid_x, grid_y = np.meshgrid(np.linspace(min(vocab_size), max(vocab_size), 100),
                             np.linspace(min(embed_size), max(embed_size), 100))
grid_z = griddata((vocab_size, embed_size), perplexity, (grid_x, grid_y), method='cubic')
# Plotting
plt.figure(figsize=(10, 6))
contour = plt.contourf(grid_x, grid_y, grid_z, cmap='viridis')
plt.colorbar(contour, label='Perplexity')
plt.scatter(vocab_size, embed_size, c='red') # Optional: plot actual data points
plt.xlabel('Vocab Size')
plt.ylabel('Embed Size')
plt.title('Embed Size vs Vocab Size with Perplexity for nano training set')
plt.show()
# In[26]:
from math import log
best_model_parameters = min(results, key=lambda x: x['perplexity'])
best_model_parameters['logPerplexity'] = log(best_model_parameters['perplexity'])
best_model_parameters