'''
modified code from classes

Trains a word-level GRU language model on train/train.tsv, prints train and
validation perplexity after every epoch, and then writes one sampled token per
input line to dev-0/out.tsv and test-A/out.tsv.
'''

from collections import Counter

import numpy as np
import torch
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Using {} device'.format(device))

lalka_path_train = 'train/train_train.tsv'
lalka_path_valid = 'train/train_test.tsv'

# Split the corpus line-wise into a training part and a held-out part.
with open("train/train.tsv", "r", encoding="utf-8") as lalka_path:
    lines = lalka_path.readlines()
train, test = train_test_split(lines, test_size=0.2)
with open(lalka_path_train, "w", encoding="utf-8") as out_train_file:
    out_train_file.writelines(train)
with open(lalka_path_valid, "w", encoding="utf-8") as out_test_file:
    out_test_file.writelines(test)

with open(lalka_path_train, "r", encoding="utf-8") as train_file:
    corpora_train = train_file.read()

corpora_train_tokenized = [token.lower() for token in word_tokenize(corpora_train)]

# Number of regular (non-special) vocabulary entries.
VOCAB_SIZE = 15001

# Each special token needs its own distinct string; otherwise they all share
# one dictionary key and therefore one id.
UNK = "<unk>"
BOS = "<bos>"
EOS = "<eos>"
PAD = "<pad>"
SPECIAL_TOKENS = [UNK, BOS, EOS, PAD]

token_counts = Counter(corpora_train_tokenized)
print(len(token_counts))

# Keep the most frequent types. Sorting alphabetically and truncating would
# drop every type past the cut-off regardless of how common it is.
vocab_itos = [token for token, _ in token_counts.most_common(VOCAB_SIZE)]

# Special tokens are appended after the regular ones, so every id at or above
# FIRST_SPECIAL_ID is a special token (also when the corpus is small).
FIRST_SPECIAL_ID = len(vocab_itos)
vocab_itos.extend(SPECIAL_TOKENS)

print(len(vocab_itos))

vocab_stoi = {token: i for i, token in enumerate(vocab_itos)}

NGRAMS = 5


def get_token_id(dataset):
    """Map an iterable of tokens to a list of vocabulary ids.

    The result starts with NGRAMS-1 padding ids and one begin-of-sequence id,
    and ends with one end-of-sequence id. Tokens missing from the vocabulary
    are mapped to the unknown-token id.
    """
    unk_id = vocab_stoi[UNK]
    token_ids = [vocab_stoi[PAD]] * (NGRAMS - 1) + [vocab_stoi[BOS]]
    token_ids.extend(vocab_stoi.get(token, unk_id) for token in dataset)
    token_ids.append(vocab_stoi[EOS])
    return token_ids


def get_samples(dataset):
    """Return every contiguous window of NGRAMS ids from *dataset*.

    A sequence of length n has n - NGRAMS + 1 windows; the "+ 1" keeps the
    final window, whose last element is the last id of the sequence.
    Sequences shorter than NGRAMS yield an empty list.
    """
    return [dataset[i:i + NGRAMS] for i in range(len(dataset) - NGRAMS + 1)]


train_ids = torch.tensor(
    get_samples(get_token_id(corpora_train_tokenized)),
    dtype=torch.long,
    device=device,
)

with open(lalka_path_valid, "r", encoding="utf-8") as valid_file:
    corpora_valid = valid_file.read()

corpora_valid_tokenized = [token.lower() for token in word_tokenize(corpora_valid)]

valid_ids = torch.tensor(
    get_samples(get_token_id(corpora_valid_tokenized)),
    dtype=torch.long,
    device=device,
)


class GRU(torch.nn.Module):
    """Embedding -> single-layer GRU -> linear layer over the vocabulary."""

    def __init__(self):
        super().__init__()
        self.emb = torch.nn.Embedding(len(vocab_itos), 100)
        self.rec = torch.nn.GRU(100, 256, 1, batch_first=True)
        self.fc1 = torch.nn.Linear(256, len(vocab_itos))

    def forward(self, x):
        """Return next-token logits computed from the GRU's final hidden state.

        *x* is a batch of token-id sequences (batch dimension first, as the
        GRU is built with batch_first=True).
        """
        emb = self.emb(x)
        _, h_n = self.rec(emb)
        # h_n is indexed by layer first; the last entry is the top layer.
        return self.fc1(h_n[-1])


lm = GRU().to(device)

criterion = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.Adam(lm.parameters(), lr=0.0001)

BATCH_SIZE = 128
EPOCHS = 15


def get_ppl(dataset_ids):
    """Return the perplexity of ``lm`` on *dataset_ids*.

    *dataset_ids* holds one NGRAMS-long window per row: the first NGRAMS-1
    columns are the context and the last column is the target. The loss is
    averaged per sample over the whole dataset, including a final partial
    batch. Puts ``lm`` into eval mode.

    Raises:
        ValueError: if *dataset_ids* contains no samples.
    """
    total = len(dataset_ids)
    if total == 0:
        raise ValueError("cannot compute perplexity of an empty dataset")

    lm.eval()
    loss_sum = 0.0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for i in range(0, total, BATCH_SIZE):
            batch = dataset_ids[i:i + BATCH_SIZE]
            X = batch[:, :NGRAMS - 1]
            Y = batch[:, NGRAMS - 1]
            # criterion returns the batch mean; re-weight by the batch size so
            # a shorter last batch does not count as much as a full one.
            loss_sum += criterion(lm(X), Y).item() * len(batch)

    return np.exp(loss_sum / total)


history_ppl_train = []
history_ppl_valid = []
for epoch in range(EPOCHS):

    lm.train()
    for i in tqdm(range(0, len(train_ids) - BATCH_SIZE + 1, BATCH_SIZE)):
        X = train_ids[i:i + BATCH_SIZE, :NGRAMS - 1]
        Y = train_ids[i:i + BATCH_SIZE, NGRAMS - 1]
        predictions = lm(X)
        loss = criterion(predictions, Y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    ppl_train = get_ppl(train_ids)
    ppl_valid = get_ppl(valid_ids)

    history_ppl_train.append(ppl_train)
    history_ppl_valid.append(ppl_valid)

    print('epoch: ', epoch)
    print('train ppl: ', ppl_train)
    print('valid ppl: ', ppl_valid)
    print()


def encode(text):
    """Tokenize and lower-case *text*; return its ids as a (1, n) long tensor.

    Tokens missing from the vocabulary are mapped to the unknown-token id.
    """
    unk_id = vocab_stoi[UNK]
    token_ids = [vocab_stoi.get(token.lower(), unk_id) for token in word_tokenize(text)]
    return torch.tensor([token_ids], dtype=torch.long, device=device)


lm.eval()

# Single most likely continuation of a fixed prompt.
ids = encode('Gości innych nie widział oprócz spółleśników')
with torch.no_grad():
    preds = lm(ids)
print(vocab_itos[torch.argmax(preds, 1).item()])

candidates_number = 10


def sample_next_token(context_ids):
    """Return the id of a token drawn uniformly from the top candidates.

    Special tokens are masked out before the top-k selection, so the result
    is always a regular vocabulary entry and no retry loop is needed.
    """
    with torch.no_grad():
        logits = lm(context_ids)
    logits[:, FIRST_SPECIAL_ID:] = float('-inf')
    # Never ask for more candidates than there are regular tokens.
    k = min(candidates_number, FIRST_SPECIAL_ID)
    # Ranking logits gives the same order as ranking their softmax.
    candidates = torch.topk(logits, k)[1][0].cpu().numpy()
    return int(candidates[np.random.randint(k)])


def generate(context_ids, count, out_file=None):
    """Sample *count* tokens, appending each one to the running context.

    Every sampled token is printed and, when *out_file* is given, also
    written to it on its own line. Returns the extended context tensor.
    """
    for _ in range(count):
        candidate = sample_next_token(context_ids)
        print(vocab_itos[candidate])
        next_id = torch.tensor([[candidate]], dtype=torch.long, device=device)
        context_ids = torch.cat((context_ids, next_id), 1)
        if out_file is not None:
            out_file.write(vocab_itos[candidate] + '\n')
    return context_ids


ids = generate(encode('Lalka'), 30)

with open("dev-0/in.tsv", "r", encoding="utf-8") as dev_path:
    nr_of_dev_lines = sum(1 for _ in dev_path)
with open("test-A/in.tsv", "r", encoding="utf-8") as test_a_path:
    nr_of_test_a_lines = sum(1 for _ in test_a_path)

with open("dev-0/out.tsv", "w", encoding="utf-8") as out_dev_file:
    ids = generate(ids, nr_of_dev_lines, out_dev_file)
# One output line per test-A input line (not per dev-0 input line).
with open("test-A/out.tsv", "w", encoding="utf-8") as out_test_file:
    ids = generate(ids, nr_of_test_a_lines, out_test_file)
+panna +na +myśl +, +ale +i +nawet +. +ale +jak +mnie +o +. +po +i +nie +. +i +nie +mógł +i +nawet +nie +jest +. +ale +do +panny +. +ale +nawet +, +ażeby +pan +? +po +mnie +do +pana +, +jak +, +jak +na +jej +i +ja +do +, +jak +mnie +na +jego +; +, +ażeby +i +ja +i +po +, +na +co +, +ale +o +czym +nie +, +co +. +a +pan +do +, +a +ja +już +od +pani +, +a +nie +i +pan +, +o +ile +i +jeszcze +. +po +mnie +na +niej +. +ale +nawet +mnie +? +na +mnie +: +co +o +i +nawet +i +o +czym +; +a +ja +jest +nie +mam +! +, +bo +na +co +? +nie +może +do +niej +! +, +a +jeżeli +, +nie +nie +może +na +niego +. +pan +ignacy +o +i +o +niej +! diff --git a/test-A/out.tsv b/test-A/out.tsv new file mode 100644 index 0000000..52f56f9 --- /dev/null +++ b/test-A/out.tsv @@ -0,0 +1,471 @@ +) +, +jak +na +mnie +. +i +o +o +. +ale +o +i +ja +o +mnie +? +) +o +, +a +ja +już +, +o +pani +i +jeszcze +o +o +o +, +bo +i +co +pan +ignacy +i +już +od +, +ażeby +nie +. +o +i +o +. +pan +jest +na +, +jak +nie +było +; +o +niej +na +myśl +o +i +, +ażeby +do +niego +nie +ma +. +ja +o +o +. +ja +o +jej +? +a +a +nie +. +o +nie +. +i +pan +jest +? +po +pokoju +; +o +niej +. +ja +na +mnie +. +i +już +na +jej +, +ale +, +co +nie +ma +nie +, +o +, +co +nie +będzie +na +jego +. +pan +jest +nie +może +o +i +nawet +do +domu +? +, +o +co +na +jej +; +i +nie +o +czym +o +pani +. +o +niej +! +. +i +jeszcze +do +niego +. +o +czym +na +niej +: +co +. +nie +jest +? +na +mnie +i +pan +do +niej +? +na +kilka +i +pan +nie +ma +nie +ma +i +nie +. +, +o +czym +. +nie +i +do +panny +, +bo +pan +. +nie +było +na +nią +do +, +nie +jest +na +niej +. +nie +o +. +a +ja +już +pan +. +ale +nawet +o +, +co +do +niego +i +po +chwili +na +nią +na +mnie +i +o +, +nie +ma +i +nie +ma +? +, +jak +ja +? +a +nie +. +nie +będzie +, +co +pan +, +na +lewo +do +pana +i +, +i +na +lewo +! +i +jeszcze +na +mnie +, +o +pani +i +ja +i +nawet +na +myśl +. +nie +może +o +, +o +co +o +czym +: +pan +nie +jest +i +o +pani +na +co +dzień +. 
+na +co +i +nie +ma +? +nie +może +do +domu +na +jej +? +) +! +na +niego +, +ale +nie +. +ale +i +pan +ignacy +o +. +a +co +, +ale +nawet +mnie +: +, +na +którym +? +i +ja +już +nie +było +. +ale +na +co +pan +, +o +pani +meliton +o +, +i +nie +na +jego +do +paryża +i +nie +, +a +co +i +już +nie +na +jej +o +czym +. +pan +. +i +co +na +, +a +może +pan +o +. +o +co +. +a +nawet +, +i +nawet +do +nas +; +ale +o +czym +nie +, +o +nie +i +jeszcze +do +mnie +o +na +mnie +; +nie +było +. +pan +na +jej +i +po +do +domu +! +po +kilka +o +niej +i +już +nic +; +o +mnie +i +ja +: +i +co +? +pan +jest +? +po +i +po +mnie +do +mnie +: +i +już +, +co +do +pani +nie +o +o +pierwszej +, +ale +ja +już +na +jej +i +jeszcze +. +nie +może +, +o +pani +meliton +na +, +na +i +, +i +jeszcze +jak +i +o +pani