Europarl/TAU_translator_from_scratch.ipynb
2020-01-28 18:51:11 +01:00

1 line
45 KiB
Plaintext

{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"TAU_translator_from_scratch.ipynb","provenance":[],"collapsed_sections":["RF1vdsADCAM1"],"toc_visible":true,"authorship_tag":"ABX9TyMRnEKpWjs7cheHuy93zBBW"},"kernelspec":{"name":"python3","display_name":"Python 3"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","metadata":{"id":"EPqzHV7BucP2","colab_type":"text"},"source":["Install\n","---\n","#### TO DO\n","* Load and prepare data for train, dev-0 and test-A sets\n","* Prepere basic Encoder adn Decoder\n","* Test training basic Encoder-Decoder + Adam optimzer + Cross Entropy loss fnc\n","* Report model training status (epoch, time, iterations, current loss)\n","* Model saving to drive, saving evaluation on test-A (and evaluation on dev-0?) on the end of epoch \n","* Add pretrained embeddings\n","* Add attention mechanism\n","* Reverse input sentence?\n","* BiRNN?"]},{"cell_type":"code","metadata":{"id":"Bx7TQD2DsDg7","colab_type":"code","colab":{}},"source":["from google.colab import drive\n","drive.mount('/content/drive')"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"c3NR3DOisZoQ","colab_type":"code","colab":{}},"source":["!pip install torch torchvision"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"BRcLW_KWxD2I","colab_type":"code","colab":{}},"source":["import re\n","import math\n","import time\n","import os.path\n","\n","import numpy as np\n","import torch\n","import torch.nn as nn\n","import torch.nn.functional as F\n","from torch.utils.data import Dataset, DataLoader\n","from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence\n","from torch import optim\n","from tqdm import tqdm"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"colab_type":"code","id":"a6r74YL2-8Jm","outputId":"346ccb2b-6b19-4628-caa1-34b8318a9ca1","executionInfo":{"status":"ok","timestamp":1580115861850,"user_tz":-60,"elapsed":2527,"user":{"displayName":"Stanisław Gołębiewski","photoUrl":"","userId":"02205040307954405899"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n","print(device)"],"execution_count":0,"outputs":[{"output_type":"stream","text":["cuda:0\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"hDsPNpPN-EeV","colab_type":"text"},"source":["### Load data"]},{"cell_type":"code","metadata":{"id":"ZujLKZUKsvS8","colab_type":"code","colab":{}},"source":["BLANK_TOKEN = 0\n","SOS_TOKEN = 1\n","EOS_TOKEN = 2\n","UNK_TOKEN = 3\n","\n","class Lang:\n"," # https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html\n","\n"," def __init__(self, name):\n"," self.name = name\n"," self.word2index = {\"<BLANK>\": 0, \"<SOS>\": 1, \"<EOS>\": 2, \"<UNK>\": 3}\n"," self.word2count = {}\n"," self.index2word = {0: \"<BLANK>\", 1: \"<SOS>\", 2: \"<EOS>\", 3: \"<UNK>\"}\n"," self.n_words = 4\n","\n"," def addSentence(self, sentence):\n"," for word in sentence.split(' '):\n"," self.addWord(word)\n","\n"," def addWord(self, word):\n"," if word not in self.word2index:\n"," self.word2index[word] = self.n_words\n"," self.word2count[word] = 1\n"," self.index2word[self.n_words] = word\n"," self.n_words += 1\n"," else:\n"," self.word2count[word] += 1\n","\n"," def vector2Sentence(self, vector):\n"," words = []\n"," for index in vector:\n"," if index not in self.index2word:\n"," words.append(\"<UNK>\")\n"," else:\n"," words.append(self.index2word[index])\n"," return \" \".join(words)\n","\n"," # def blankAllLowFreqWorlds(self):\n"," # new_word2index = {\"SOS\": 0, \"EOS\": 1, \"<UNK>\": 2}\n"," # new_index2word = {0: \"SOS\", 1: \"EOS\", 2: \"<UNK>\"}\n"," # new_n_words = 3\n","\n"," # for word in self.word2count:\n"," # if self.word2count[word] == 1:\n"," # new_word2index[word] = 2\n"," # else:\n"," # if word not in new_word2index:\n"," # new_word2index[word] = new_n_words\n"," # new_index2word[new_n_words] = word\n"," # new_n_words += 1\n","\n","class MyData(Dataset):\n"," def __init__(self, X, y):\n"," self.data = X\n"," self.target = y\n"," self.length = [ np.sum(1 - np.equal(x, 0)) for x in X]\n"," \n"," def __getitem__(self, index):\n"," x = self.data[index]\n"," y = self.target[index]\n"," x_len = self.length[index]\n"," return x,y,x_len\n"," \n"," def __len__(self):\n"," return len(self.data)\n","\n","\n","def normalize_line(line: str) -> str:\n"," out = line.strip().lower()\n"," out = re.sub(r\"([,:.!?\\)])\", r\" \\1\", out)\n"," out = re.sub(r\"([\\(])\", r\"\\1 \", out)\n"," return out\n","\n","\n","def sentence_to_indexes(lang: Lang, sentence: str) -> list:\n"," out = [lang.word2index[\"<SOS>\"]]\n"," for s in sentence.split(\" \"):\n"," if s not in lang.word2index:\n"," out.append(lang.word2index[\"<UNK>\"])\n"," else:\n"," out.append(lang.word2index[s])\n"," out.append(lang.word2index[\"<EOS>\"])\n"," return out\n","\n","\n","# def sentence_to_tensor(lang: Lang, sentence: str) -> list:\n","# indexes = sentence_to_indexes(lang, sentence)\n","# indexes.append(lang.word2index[\"<EOS>\"])\n","# out = torch.tensor(indexes, dtype=torch.long, device=device)\n","# out = out.view(-1, 1)\n","# return out\n","\n","\n","# def sentence_pair_to_tensors(lang_1: Lang, lang_2: Lang, pair):\n","# out_sentence_1 = sentence_to_tensor(lang_1, pair[0])\n","# out_sentence_2 = sentence_to_tensor(lang_2, pair[1])\n","# return((out_sentence_1, out_sentence_2))\n"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"fNHCJDfItgvz","colab_type":"code","outputId":"501c74b4-d447-4a8b-db82-bea47485005f","executionInfo":{"status":"ok","timestamp":1580115900019,"user_tz":-60,"elapsed":29490,"user":{"displayName":"Stanisław Gołębiewski","photoUrl":"","userId":"02205040307954405899"}},"colab":{"base_uri":"https://localhost:8080/","height":367}},"source":["TRAIN_DATA_PATH = \"/content/drive/My Drive/Colab Notebooks/euro-parl-data/train/train.tsv\"\n","TEST_DATA_DEV_0_IN_PATH = \"/content/drive/My Drive/Colab Notebooks/euro-parl-data/dev-0/in.tsv\"\n","TEST_DATA_DEV_0_EXPECTED_PATH = \"/content/drive/My Drive/Colab Notebooks/euro-parl-data/dev-0/expected.tsv\"\n","TEST_DATA_A_PATH = \"/content/drive/My Drive/Colab Notebooks/euro-parl-data/test-A/in.tsv\"\n","\n","print(f\"{TRAIN_DATA_PATH}: {os.path.isfile(TRAIN_DATA_PATH)}\")\n","print(f\"{TEST_DATA_DEV_0_IN_PATH}: {os.path.isfile(TEST_DATA_DEV_0_IN_PATH)}\")\n","print(f\"{TEST_DATA_DEV_0_EXPECTED_PATH}: {os.path.isfile(TEST_DATA_DEV_0_EXPECTED_PATH)}\")\n","print(f\"{TEST_DATA_A_PATH}: {os.path.isfile(TEST_DATA_A_PATH)}\")\n","\n","# load train data\n","# MAX_TOKENS_IN_SENTENCE = 20\n","# MAX_TOKENS_OUT_SENTENCE = 25\n","MAX_TOKENS_IN_SENTENCE = 30\n","MAX_TOKENS_OUT_SENTENCE = 30\n","SRC_lang = Lang(\"eng\")\n","TRG_lang = Lang(\"pol\")\n","# SRC_TRG_pairs = []\n","train_SRC_tensor = []\n","train_TRG_tensor = []\n","\n","test_dev_0_SRC_tensor = []\n","test_dev_0_TRG_tensor_oryginal = []\n","\n","test_A_SRC_tensor = []\n","\n","print(f\"\\n> Loading train data from '{TRAIN_DATA_PATH}'\")\n","with open(TRAIN_DATA_PATH) as file_in:\n"," for line in file_in:\n"," pl_line, en_line = line.strip().split(\"\\t\")\n"," pl_line = normalize_line(pl_line)\n"," en_line = normalize_line(en_line)\n"," if(len(pl_line.split(\" \")) > MAX_TOKENS_IN_SENTENCE):\n"," continue\n","\n"," if(len(en_line.split(\" \")) > MAX_TOKENS_OUT_SENTENCE):\n"," continue\n"," TRG_lang.addSentence(pl_line)\n"," SRC_lang.addSentence(en_line)\n"," # SRC_TRG_pairs.append((en_line, pl_line))\n"," train_SRC_tensor.append(sentence_to_indexes(SRC_lang, en_line))\n"," train_TRG_tensor.append(sentence_to_indexes(TRG_lang, pl_line))\n","\n","\n","\n","print(f\"---TEST---\")\n","print(f\"\\n> no. pairs: {len(train_SRC_tensor)}\")\n","print(f\"> {SRC_lang.name} - no. words: {SRC_lang.n_words}\")\n","print(f\"> {TRG_lang.name} - no. words: {TRG_lang.n_words}\")\n","\n","\n","print(f\"\\n> Loading test dev-0 data from '{TEST_DATA_DEV_0_IN_PATH}' and '{TEST_DATA_DEV_0_EXPECTED_PATH}'\")\n","with open(TEST_DATA_DEV_0_IN_PATH) as file_in:\n"," for line in file_in:\n"," en_line = normalize_line(line.strip())\n"," test_dev_0_SRC_tensor.append(sentence_to_indexes(SRC_lang, en_line))\n","\n","with open(TEST_DATA_DEV_0_EXPECTED_PATH) as file_in:\n"," for line in file_in:\n"," test_dev_0_TRG_tensor_oryginal.append(line.strip())\n","\n","\n","print(f\"---TEST DEV 0---\")\n","print(f\"> no. pairs: {len(test_dev_0_SRC_tensor)}, {len(test_dev_0_TRG_tensor_oryginal)}\")\n","\n","\n","print(f\"\\n> Loading test test-A data from '{TEST_DATA_A_PATH}'\")\n","with open(TEST_DATA_A_PATH) as file_in:\n"," for line in file_in:\n"," en_line = normalize_line(line.strip())\n"," test_A_SRC_tensor.append(sentence_to_indexes(SRC_lang, en_line))\n","\n","\n","print(f\"---TEST A---\")\n","print(f\"> no. pairs: {len(test_A_SRC_tensor)}\")\n","\n","\n"," "],"execution_count":0,"outputs":[{"output_type":"stream","text":["/content/drive/My Drive/Colab Notebooks/euro-parl-data/train/train.tsv: True\n","/content/drive/My Drive/Colab Notebooks/euro-parl-data/dev-0/in.tsv: True\n","/content/drive/My Drive/Colab Notebooks/euro-parl-data/dev-0/expected.tsv: True\n","/content/drive/My Drive/Colab Notebooks/euro-parl-data/test-A/in.tsv: True\n","\n","> Loading train data from '/content/drive/My Drive/Colab Notebooks/euro-parl-data/train/train.tsv'\n","---TEST---\n","\n","> no. pairs: 354115\n","> eng - no. words: 51106\n","> pol - no. words: 126051\n","\n","> Loading test dev-0 data from '/content/drive/My Drive/Colab Notebooks/euro-parl-data/dev-0/in.tsv' and '/content/drive/My Drive/Colab Notebooks/euro-parl-data/dev-0/expected.tsv'\n","---TEST DEV 0---\n","> no. pairs: 10000, 10000\n","\n","> Loading test test-A data from '/content/drive/My Drive/Colab Notebooks/euro-parl-data/test-A/in.tsv'\n","---TEST A---\n","> no. pairs: 5000\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"TvobNYhsEO5x","colab_type":"code","colab":{}},"source":["def max_length(tensor):\n"," lengths = [len(t) for t in tensor]\n"," return max(lengths), lengths.index(max(lengths))\n","\n","def pad_sequence(x, max_len):\n"," padded = np.zeros((max_len), dtype=np.int64)\n"," eos_index = SRC_lang.word2index[\"<EOS>\"]\n"," if len(x) > max_len:\n"," padded[:] = x[:max_len]\n"," if padded[-1] != eos_index:\n"," padded[-1] = eos_index\n"," else: \n"," padded[:len(x)] = x\n"," \n"," return padded"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"ycYy5gq641Uy","colab_type":"code","colab":{}},"source":["max_length_SRC, max_index_SRC = max_length(train_SRC_tensor)\n","max_length_TRG, max_index_TRG = max_length(train_TRG_tensor)\n","\n","train_SRC_tensor = [pad_sequence(s, max_length_SRC) for s in train_SRC_tensor]\n","train_TRG_tensor = [pad_sequence(s, max_length_TRG) for s in train_TRG_tensor]\n","\n","test_dev_0_SRC_tensor = [pad_sequence(s, max_length_SRC) for s in test_dev_0_SRC_tensor]\n","test_A_SRC_tensor = [pad_sequence(s, max_length_SRC) for s in test_A_SRC_tensor]"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"-RpGTnpG_cAG","colab_type":"code","colab":{}},"source":["print(train_SRC_tensor[1])\n","print(SRC_lang.vector2Sentence(train_SRC_tensor[1]))\n","print()\n","print(train_TRG_tensor[1])\n","print(TRG_lang.vector2Sentence(train_TRG_tensor[1]))\n","print(max_length_SRC, max_length_TRG)\n","print(SRC_lang.vector2Sentence(train_SRC_tensor[max_index_SRC]))\n","print(TRG_lang.vector2Sentence(train_TRG_tensor[max_index_SRC]))\n","print()\n","print(SRC_lang.vector2Sentence(train_SRC_tensor[max_index_TRG]))\n","print(TRG_lang.vector2Sentence(train_TRG_tensor[max_index_TRG]))\n","\n","print(\"\\n\\n---TEST DEV SET---\")\n","print(test_dev_0_SRC_tensor[346])\n","print(SRC_lang.vector2Sentence(test_dev_0_SRC_tensor[346]))\n","print(test_dev_0_TRG_tensor_oryginal[346])\n"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"61X0BYQuRVxn","colab_type":"text"},"source":["### Docoder / Encoder\n"]},{"cell_type":"code","metadata":{"id":"s3Be7lOZ5R-d","colab_type":"code","colab":{}},"source":["BUFFER_SIZE = len(train_SRC_tensor)\n","BATCH_SIZE = 50\n","N_BATCH = BUFFER_SIZE//BATCH_SIZE\n","embedding_dim = 256\n","hidden_units = 1024\n","vocab_src_size = SRC_lang.n_words\n","vocab_trg_size = TRG_lang.n_words\n","\n","train_dataset = DataLoader(MyData(train_SRC_tensor , train_TRG_tensor), batch_size = BATCH_SIZE, drop_last=True, shuffle=True)\n","test_dev_0_dataset = DataLoader(MyData(test_dev_0_SRC_tensor, np.zeros((len(test_dev_0_SRC_tensor)), dtype=np.int64)), batch_size = BATCH_SIZE, shuffle=False)\n","test_A_dataset = DataLoader(MyData(test_A_SRC_tensor, np.zeros((len(test_A_SRC_tensor)), dtype=np.int64)), batch_size = BATCH_SIZE, shuffle=False)\n"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"sev_V5IPkzUk","colab_type":"code","colab":{}},"source":["for (batch, (inp, targ, inp_len)) in enumerate(train_dataset):\n"," print(f\"{inp}\\n{targ}\\n{inp_len}\\n\\n\")\n"," break\n","\n","for (batch, (inp, targ, inp_len)) in enumerate(test_A_dataset):\n"," # print(f\"{inp}\\n{targ}\\n{inp_len}\\n\\n\")\n"," for vec in inp.numpy():\n"," print(SRC_lang.vector2Sentence(vec))\n"," break\n","\n","for (batch, (inp, targ, inp_len)) in enumerate(test_dev_0_dataset):\n"," print(f\"{inp}\\n{targ}\\n{inp_len}\\n\\n\")\n"," break\n","\n"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"9dL74mLxSz6-","colab_type":"code","colab":{}},"source":["class Encoder(nn.Module):\n"," def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):\n"," super(Encoder, self).__init__()\n"," self.batch_sz = batch_sz\n"," self.enc_units = enc_units\n"," self.vocab_size = vocab_size\n"," self.embedding_dim = embedding_dim\n"," self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)\n"," self.gru = nn.GRU(self.embedding_dim, self.enc_units)\n"," \n"," def forward(self, x, lens, device):\n"," # x: batch_size, max_length \n"," \n"," # x: batch_size, max_length, embedding_dim\n"," x = self.embedding(x) \n"," \n"," # x transformed = max_len X batch_size X embedding_dim\n"," # x = x.permute(1,0,2)\n"," x = pack_padded_sequence(x, lens) # unpad\n"," \n"," self.hidden = self.initialize_hidden_state(device)\n"," \n"," # output: max_length, batch_size, enc_units\n"," # self.hidden: 1, batch_size, enc_units\n"," output, self.hidden = self.gru(x, self.hidden) # gru returns hidden state of all timesteps as well as hidden state at last timestep\n"," \n"," # pad the sequence to the max length in the batch\n"," output, _ = pad_packed_sequence(output)\n"," \n"," return output, self.hidden\n","\n"," def initialize_hidden_state(self, device):\n"," return torch.zeros((1, self.batch_sz, self.enc_units)).to(device)\n","\n","### sort batch function to be able to use with pad_packed_sequence\n","def sort_batch(X, y, lengths):\n"," lengths, indx = lengths.sort(dim=0, descending=True)\n"," X = X[indx]\n"," y = y[indx]\n"," return X.transpose(0,1), y, lengths, indx # transpose (batch x seq) to (seq x batch)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"ghTfwuE6TyN_","colab_type":"code","colab":{}},"source":["class Decoder(nn.Module):\n"," def __init__(self, vocab_size, embedding_dim, dec_units, enc_units, batch_sz):\n"," super(Decoder, self).__init__()\n"," self.batch_sz = batch_sz\n"," self.dec_units = dec_units\n"," self.enc_units = enc_units\n"," self.vocab_size = vocab_size\n"," self.embedding_dim = embedding_dim\n"," self.embedding = nn.Embedding(self.vocab_size, self.embedding_dim)\n"," self.gru = nn.GRU(self.embedding_dim + self.enc_units, \n"," self.dec_units,\n"," batch_first=True)\n"," self.fc = nn.Linear(self.enc_units, self.vocab_size)\n"," \n"," # used for attention\n"," self.W1 = nn.Linear(self.enc_units, self.dec_units)\n"," self.W2 = nn.Linear(self.enc_units, self.dec_units)\n"," self.V = nn.Linear(self.enc_units, 1)\n"," \n"," def forward(self, x, hidden, enc_output):\n"," # enc_output original: (max_length, batch_size, enc_units)\n"," # enc_output converted == (batch_size, max_length, hidden_size)\n"," enc_output = enc_output.permute(1,0,2)\n"," # hidden shape == (batch_size, hidden size)\n"," # hidden_with_time_axis shape == (batch_size, 1, hidden size)\n"," # we are doing this to perform addition to calculate the score\n"," \n"," # hidden shape == (batch_size, hidden size)\n"," # hidden_with_time_axis shape == (batch_size, 1, hidden size)\n"," hidden_with_time_axis = hidden.permute(1, 0, 2)\n"," \n"," # score: (batch_size, max_length, hidden_size) # Bahdanaus's\n"," # we get 1 at the last axis because we are applying tanh(FC(EO) + FC(H)) to self.V\n"," # It doesn't matter which FC we pick for each of the inputs\n"," score = torch.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis))\n"," \n"," #score = torch.tanh(self.W2(hidden_with_time_axis) + self.W1(enc_output))\n"," \n"," # attention_weights shape == (batch_size, max_length, 1)\n"," # we get 1 at the last axis because we are applying score to self.V\n"," attention_weights = torch.softmax(self.V(score), dim=1)\n"," \n"," # context_vector shape after sum == (batch_size, hidden_size)\n"," context_vector = attention_weights * enc_output\n"," context_vector = torch.sum(context_vector, dim=1)\n"," \n"," # x shape after passing through embedding == (batch_size, 1, embedding_dim)\n"," # takes case of the right portion of the model above (illustrated in red)\n"," x = self.embedding(x)\n"," \n"," # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n"," #x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n"," # ? Looks like attention vector in diagram of source\n"," x = torch.cat((context_vector.unsqueeze(1), x), -1)\n"," \n"," # passing the concatenated vector to the GRU\n"," # output: (batch_size, 1, hidden_size)\n"," output, state = self.gru(x)\n"," \n"," \n"," # output shape == (batch_size * 1, hidden_size)\n"," output = output.view(-1, output.size(2))\n"," \n"," # output shape == (batch_size * 1, vocab)\n"," x = self.fc(output)\n"," \n"," return x, state, attention_weights\n"," \n"," def initialize_hidden_state(self):\n"," return torch.zeros((1, self.batch_sz, self.dec_units))"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"Ex3nJDN9S43C","colab_type":"code","colab":{}},"source":["# TEST ENCODER\n","encoder = Encoder(vocab_src_size, embedding_dim, hidden_units, BATCH_SIZE)\n","\n","encoder.to(device)\n","\n","it = iter(train_dataset)\n","x, y, x_len = next(it)\n","xs, ys, lens, _ = sort_batch(x, y, x_len)\n","\n","enc_output, enc_hidden = encoder(xs.to(device), lens, device)\n","\n","print(enc_output.size()) # max_length, batch_size, enc_units"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"2M_e_46dTzvl","colab_type":"code","colab":{}},"source":["# TEST DECODER\n","encoder = Encoder(vocab_src_size, embedding_dim, hidden_units, BATCH_SIZE)\n","encoder.to(device)\n","\n","it = iter(train_dataset)\n","x, y, x_len = next(it)\n","xs, ys, lens, _ = sort_batch(x, y, x_len)\n","\n","enc_output, enc_hidden = encoder(xs.to(device), lens, device)\n","\n","print(\"Input: \", x.shape)\n","print(\"Output: \", y.shape)\n","\n","# sort the batch first to be able to use with pac_pack_sequence\n","xs, ys, lens, _ = sort_batch(x, y, x_len)\n","\n","enc_output, enc_hidden = encoder(xs.to(device), lens, device)\n","print(\"Encoder Output: \", enc_output.shape) # batch_size X max_length X enc_units\n","print(\"Encoder Hidden: \", enc_hidden.shape) # batch_size X enc_units (corresponds to the last state)\n","\n","decoder = Decoder(vocab_trg_size, embedding_dim, hidden_units, hidden_units, BATCH_SIZE)\n","decoder = decoder.to(device)\n","\n","print(enc_hidden.squeeze(0).shape)\n","\n","dec_hidden = enc_hidden#.squeeze(0)\n","dec_input = torch.tensor([[TRG_lang.word2index['<SOS>']]] * BATCH_SIZE)\n","print(\"Decoder Input: \", dec_input.shape)\n","print(\"--------\")\n","\n","for t in range(1, y.size(1)):\n"," # enc_hidden: 1, batch_size, enc_units\n"," # output: max_length, batch_size, enc_units\n"," predictions, dec_hidden, _ = decoder(dec_input.to(device), \n"," dec_hidden.to(device), \n"," enc_output.to(device))\n"," \n"," print(\"Prediction: \", predictions.shape)\n"," topv, topi = predictions.data.topk(1)\n"," print(\"Topi: \", topi.shape)\n"," print(topi)\n"," print(\"Decoder Hidden: \", dec_hidden.shape)\n"," \n"," #loss += loss_function(y[:, t].to(device), predictions.to(device))\n"," \n"," dec_input = y[:, t].unsqueeze(1)\n"," print(dec_input)\n","\n"," print(dec_input.shape)\n"," break"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"2b4BwaHErmYJ","colab_type":"code","colab":{}},"source":["def predict(enc, dec, seq, dev):\n"," pass"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"lGwFgO3sraFX","colab_type":"code","colab":{}},"source":["# TEST PREDICTION FOR TEST dev 0"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"dcuKPr1GreNU","colab_type":"code","colab":{}},"source":["# TEST PREDICTION FOR TEST A"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"dZF4izmbXW9f","colab_type":"text"},"source":["### Train\n"]},{"cell_type":"code","metadata":{"id":"kOeDQa_nXa11","colab_type":"code","colab":{}},"source":["def loss_function(real, pred):\n"," \"\"\" Only consider non-zero inputs in the loss; mask needed \"\"\"\n"," #mask = 1 - np.equal(real, 0) # assign 0 to all above 0 and 1 to all 0s\n"," #print(mask)\n"," if torch.cuda.is_available():\n"," mask = real.ge(1).type(torch.cuda.FloatTensor)\n"," else:\n"," mask = real.ge(1).type(torch.FloatTensor)\n"," \n"," \n"," loss_ = criterion(pred, real) * mask \n"," return torch.mean(loss_)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"QclyWIop5dRG","colab_type":"code","outputId":"9affd9f9-905c-421f-e3d7-a2b57e50a200","executionInfo":{"status":"ok","timestamp":1580115917766,"user_tz":-60,"elapsed":17680,"user":{"displayName":"Stanisław Gołębiewski","photoUrl":"","userId":"02205040307954405899"}},"colab":{"base_uri":"https://localhost:8080/","height":156}},"source":["encoder = Encoder(vocab_src_size, embedding_dim, hidden_units, BATCH_SIZE)\n","decoder = Decoder(vocab_trg_size, embedding_dim, hidden_units, hidden_units, BATCH_SIZE)\n","\n","encoder.to(device)\n","decoder.to(device)\n","\n"],"execution_count":0,"outputs":[{"output_type":"execute_result","data":{"text/plain":["Decoder(\n"," (embedding): Embedding(126051, 256)\n"," (gru): GRU(1280, 1024, batch_first=True)\n"," (fc): Linear(in_features=1024, out_features=126051, bias=True)\n"," (W1): Linear(in_features=1024, out_features=1024, bias=True)\n"," (W2): Linear(in_features=1024, out_features=1024, bias=True)\n"," (V): Linear(in_features=1024, out_features=1, bias=True)\n",")"]},"metadata":{"tags":[]},"execution_count":12}]},{"cell_type":"code","metadata":{"id":"oq5W5bEnEBLY","colab_type":"code","colab":{}},"source":["criterion = nn.CrossEntropyLoss()\n","optimizer = optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), \n"," lr=0.001)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"yWpOFo3ayzov","colab_type":"code","colab":{}},"source":["def predict(seq, seq_l):\n"," out_seq = []\n"," enc_output, enc_hidden = encoder(seq.to(device), seq_l, device)\n"," dec_hidden = enc_hidden\n"," dec_input = torch.tensor([[TRG_lang.word2index['<SOS>']]] * BATCH_SIZE)\n"," for i in range(MAX_TOKENS_OUT_SENTENCE):\n"," predictions, dec_hidden, _ = decoder(dec_input.to(device), \n"," dec_hidden.to(device), \n"," enc_output.to(device))\n"," topv, topi = predictions.data.topk(1)\n"," dec_input = topi\n"," out_seq.append(topi.cpu().squeeze().detach().numpy())\n","\n"," out = []\n"," for vec in np.transpose(np.array(out_seq)):\n"," out.append(TRG_lang.vector2Sentence(vec))\n"," return np.array(out)\n","\n","\n","\n"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"hzqiC2Rn2FxH","colab_type":"code","colab":{}},"source":["### TEST PREDICT\n","EPOCHS = 1\n","print(f\"batches:{N_BATCH}\")\n","for (batch, (inp, targ, inp_len)) in enumerate(train_dataset):\n"," \n","\n"," \n"," encoder.train()\n"," decoder.train()\n"," \n"," xs, ys, lens, _ = sort_batch(inp, targ, inp_len)\n"," pred_x = xs.t()[3].numpy()\n"," pred_y = ys[3].numpy()\n"," print(SRC_lang.vector2Sentence(pred_x))\n"," print(TRG_lang.vector2Sentence(pred_y))\n","\n","\n"," for epoch in range(EPOCHS):\n"," \n"," start = time.time() \n"," total_loss = 0\n"," loss = 0\n","\n"," \n","\n"," # print(xs)\n"," enc_output, enc_hidden = encoder(xs.to(device), lens, device)\n"," dec_hidden = enc_hidden\n"," \n"," # use teacher forcing - feeding the target as the next input (via dec_input)\n"," dec_input = torch.tensor([[TRG_lang.word2index['<SOS>']]] * BATCH_SIZE)\n"," \n"," # run code below for every timestep in the ys batch\n"," for t in range(1, 5):\n"," predictions, dec_hidden, _ = decoder(topi, \n"," dec_hidden.to(device), \n"," enc_output.to(device))\n"," dec_input = ys[:, t].unsqueeze(1)\n"," # print(dec_input, dec_input.shape)\n"," topv, topi = predictions.data.topk(1)\n"," print(np.transpose(np.array([TRG_lang.index2word[index] for index in topi.cpu().squeeze().detach().numpy()])))\n"," loss += loss_function(ys[:, t].to(device), predictions.to(device))\n"," topv, topi = predictions.data.topk(1)\n","\n"," \n","\n"," # dec_input = ys[:, t].unsqueeze(1)\n"," \n"," \n"," batch_loss = (loss / int(ys.size(1)))\n"," total_loss += batch_loss\n"," \n"," # optimizer.zero_grad()\n"," \n"," # loss.backward()\n","\n"," # ### UPDATE MODEL PARAMETERS\n"," # optimizer.step()\n","\n"," \n"," encoder.eval()\n"," decoder.eval()\n"," print(predict(xs, lens))\n"," break\n"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"R5XxM3M3tjc9","colab_type":"code","colab":{}},"source":["TEST_DEV_PATH = \"/content/drive/My Drive/Colab Notebooks/euro-parl-data/dev-0/\"\n","TEST_A_PATH = \"/content/drive/My Drive/Colab Notebooks/euro-parl-data/test-A/\"\n","MODELS_PATH = \"/content/drive/My Drive/Colab Notebooks/euro-parl-data/models/\""],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"colab_type":"code","id":"wJuHAh8UBivi","outputId":"2687c128-0bd9-435f-8226-d37e586fc549","colab":{"base_uri":"https://localhost:8080/","height":104}},"source":["EPOCHS = 20\n","print(f\"batches:{N_BATCH}\")\n","\n","for epoch in range(10, EPOCHS):\n"," start = time.time()\n"," \n"," encoder.train()\n"," decoder.train()\n"," \n"," total_loss = 0\n"," \n"," for (batch, (inp, targ, inp_len)) in enumerate(train_dataset):\n"," loss = 0\n"," \n"," xs, ys, lens, _ = sort_batch(inp, targ, inp_len)\n"," enc_output, enc_hidden = encoder(xs.to(device), lens, device)\n"," dec_hidden = enc_hidden\n"," \n"," dec_input = torch.tensor([[TRG_lang.word2index['<SOS>']]] * BATCH_SIZE)\n"," \n"," for t in range(1, ys.size(1)):\n"," predictions, dec_hidden, _ = decoder(dec_input.to(device), \n"," dec_hidden.to(device), \n"," enc_output.to(device))\n"," loss += loss_function(ys[:, t].to(device), predictions.to(device))\n"," dec_input = ys[:, t].unsqueeze(1)\n"," \n"," \n"," batch_loss = (loss / int(ys.size(1)))\n"," total_loss += batch_loss\n"," \n"," optimizer.zero_grad()\n"," loss.backward()\n"," optimizer.step()\n","\n"," if batch % 1000 == 0:\n"," print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.detach().item():.4f} Time {time.time() - start}')\n"," \n","\n"," print('Epoch {} Loss {:.4f}'.format(epoch + 1,\n"," total_loss / N_BATCH))\n"," print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))\n","\n"," encoder.eval()\n"," decoder.eval()\n","\n"," file_name = f\"ep-{epoch+1}_embedding-dim-{embedding_dim}_hidden-units-{hidden_units}_max-len-in-{MAX_TOKENS_IN_SENTENCE}_max-len-out-{MAX_TOKENS_OUT_SENTENCE}\"\n"," dev_file_path = f\"{TEST_DEV_PATH}out-{file_name}.tsv\"\n"," a_file_path = f\"{TEST_A_PATH}out-{file_name}.tsv\"\n"," model_file_path = f\"{MODELS_PATH}model-{file_name}.pt\"\n","\n"," # dev-0 test\n"," preds = []\n"," for (batch, (inp, targ, inp_len)) in enumerate(test_dev_0_dataset):\n"," xs, ys, lens, indx = sort_batch(inp, targ, inp_len)\n"," tmp = [p for _, p in sorted([(indx[i].item(), pred) for i, pred in enumerate(predict(xs, lens))], key=lambda x: x[0])]\n"," preds.append(tmp)\n"," \n"," print(f\"> Saving dev-0 to 'out-{file_name}.tsv'\")\n"," np.savetxt(dev_file_path, np.array(preds).flatten(), delimiter='\\t', fmt='%s')\n","\n"," preds = []\n"," # test-A test\n"," for (batch, (inp, targ, inp_len)) in enumerate(test_A_dataset):\n"," xs, ys, lens, indx = sort_batch(inp, targ, inp_len)\n"," tmp = [p for _, p in sorted([(indx[i].item(), pred) for i, pred in enumerate(predict(xs, lens))], key=lambda x: x[0])]\n"," preds.append(tmp)\n"," print(f\"> Saving test A to 'out-{file_name}.tsv'\")\n"," np.savetxt(a_file_path, np.array(preds).flatten(), delimiter='\\t', fmt='%s')\n","\n"," if((epoch + 1) % 2 == 0):\n"," print(f\"> Saving model to '{model_file_path}'\")\n"," # save-model\n"," torch.save({'enc_state_dict': encoder.state_dict(), 'dec_state_dict': decoder.state_dict()}, model_file_path)\n"," "],"execution_count":0,"outputs":[{"output_type":"stream","text":["batches:7082\n","Epoch 11 Batch 0 Loss 6.2139 Time 1.2431468963623047\n","Epoch 11 Batch 1000 Loss 2.3112 Time 996.8302667140961\n","Epoch 11 Batch 2000 Loss 1.9345 Time 2001.6348514556885\n","Epoch 11 Batch 3000 Loss 1.6354 Time 3006.442977666855\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"DJSxzYT89LAE","colab_type":"text"},"source":["### Load model\n"]},{"cell_type":"code","metadata":{"id":"YSCUlEK29Y9B","colab_type":"code","outputId":"9caf15da-a2c3-4ca3-eeec-02d7983c9f6e","executionInfo":{"status":"ok","timestamp":1580111504374,"user_tz":-60,"elapsed":16986,"user":{"displayName":"Stanisław Gołębiewski","photoUrl":"","userId":"02205040307954405899"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["file_name = \"model-ep-10_embedding-dim-256_hidden-units-1024.pt\"\n","print(f\"> Loading model from '{file_name}'\")\n","model = torch.load(f\"{MODELS_PATH}{file_name}\")"],"execution_count":0,"outputs":[{"output_type":"stream","text":["> Loading model from 'model-ep-10_embedding-dim-256_hidden-units-1024.pt'\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"5vOr6Kul_Nzl","colab_type":"code","outputId":"875db2fe-6050-4dd2-98f8-005ba37906ce","executionInfo":{"status":"ok","timestamp":1580111586148,"user_tz":-60,"elapsed":492,"user":{"displayName":"Stanisław Gołębiewski","photoUrl":"","userId":"02205040307954405899"}},"colab":{"base_uri":"https://localhost:8080/","height":34}},"source":["print(model.keys())"],"execution_count":0,"outputs":[{"output_type":"stream","text":["dict_keys(['enc_state_dict', 'dec_state_dict'])\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"UpHh8ca_-_Lb","colab_type":"code","colab":{}},"source":["print(f\"> Init model\")\n","encoder = Encoder(vocab_src_size, embedding_dim, hidden_units, BATCH_SIZE)\n","decoder = Decoder(vocab_trg_size, embedding_dim, hidden_units, hidden_units, BATCH_SIZE)\n","\n","encoder.load_state_dict(model['enc_state_dict'])\n","decoder.load_state_dict(model['dec_state_dict'])\n","\n","encoder.to(device)\n","decoder.to(device)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"BblioWxf_ts7","colab_type":"code","outputId":"abdfe885-44cb-471e-a892-afc2b1e230ce","executionInfo":{"status":"ok","timestamp":1580115356366,"user_tz":-60,"elapsed":59325,"user":{"displayName":"Stanisław Gołębiewski","photoUrl":"","userId":"02205040307954405899"}},"colab":{"base_uri":"https://localhost:8080/","height":52}},"source":["file_name = f\"ep-10_embedding-dim-{embedding_dim}_hidden-units-{hidden_units}\"\n","dev_file_path = f\"{TEST_DEV_PATH}dev-eval-out-{file_name}.tsv\"\n","a_file_path = f\"{TEST_A_PATH}A-eval-out-{file_name}.tsv\"\n","\n","# dev-0 test\n","preds = []\n","for (batch, (inp, targ, inp_len)) in enumerate(test_dev_0_dataset):\n"," xs, ys, lens, indx = sort_batch(inp, targ, inp_len)\n"," tmp = [p for _, p in sorted([(indx[i].item(), pred) for i, pred in enumerate(predict(xs, lens))], key=lambda x: x[0])]\n"," preds.append(tmp)\n"," \n","print(f\"> Saving dev-0 to 'out-{file_name}.tsv'\")\n","np.savetxt(dev_file_path, np.array(preds).flatten(), delimiter='\\t', fmt='%s')\n","\n","preds = []\n","# test-A test\n","for (batch, (inp, targ, inp_len)) in enumerate(test_A_dataset):\n"," xs, ys, lens, indx = sort_batch(inp, targ, inp_len)\n"," tmp = [p for _, p in sorted([(indx[i].item(), pred) for i, pred in enumerate(predict(xs, lens))], key=lambda x: x[0])]\n"," preds.append(tmp)\n","print(f\"> Saving test A to 'out-{file_name}.tsv'\")\n","np.savetxt(a_file_path, np.array(preds).flatten(), delimiter='\\t', fmt='%s')"],"execution_count":0,"outputs":[{"output_type":"stream","text":["> Saving dev-0 to 'out-ep-10_embedding-dim-256_hidden-units-1024.tsv'\n","> Saving test A to 'out-ep-10_embedding-dim-256_hidden-units-1024.tsv'\n"],"name":"stdout"}]},{"cell_type":"code","metadata":{"id":"HV92vsaBJ3Hd","colab_type":"code","outputId":"e9963e59-8a49-4ec0-c5d6-6a4eebe212eb","executionInfo":{"status":"ok","timestamp":1580115160551,"user_tz":-60,"elapsed":673,"user":{"displayName":"Stanisław Gołębiewski","photoUrl":"","userId":"02205040307954405899"}},"colab":{"base_uri":"https://localhost:8080/","height":54}},"source":["# dev-0 test\n","preds = []\n","for (batch, (inp, targ, inp_len)) in enumerate(test_dev_0_dataset):\n"," xs, ys, lens, indx = sort_batch(inp, targ, inp_len)\n"," preds = predict(xs, lens)\n"," preds = [p for _, p in sorted([(indx[i].item(), pred) for i, pred in enumerate(preds)], key=lambda x: x[0])]\n"," preds.append()"],"execution_count":0,"outputs":[{"output_type":"stream","text":["['kiedy zostanie ustanowiony 4 tysiące ton rocznie to jedno z całą pewnością w wysokości dodatkowych kosztów . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'tendencja do tego , aby zaradzić w sprawie niektórych unijnych programów na unię europejską nie są szkodliwe . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'to nie dla nas dobrym sposobem na podstawie polityki rybołówstwa , czy w naszej własnej , czy jesteśmy w naszej wodach lub wody <EOS> <BLANK>', 'chciałbym także , aby zapewnić , aby zapewnić , aby pieniądze ue pogrążyła się państwa trzecie ue w krajach trzecich , że pieniądze ue pogrążyła', '( de ) panie przewodniczący , pani komisarz , panie i panowie ! cieszę się , że parlament wreszcie zagłosują w tej sprawie trzeciej kwestii', 'poprawka do rozporządzenia rady w sprawie środków finansowych w społeczności we wspólnocie , której musimy wdrożyć . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'dotyczy to w szczególności sektora rybołówstwa stosunków , czyli o badania i badania . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'systemy kontroli nad wspólną polityką rybołówstwa oraz ich wdrażanie . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'sprawozdawca przedstawił wysiłek w celu ochrony jego pochodzeniu wspomniał o konieczności znacznego ograniczenia emisji w wyniku regulujących działalność 75% do 75% okresu odpoczynku . <EOS>', 'nie udało nam się nam do jego zamiaru . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'istniejące przepisy okazały się w praktyce i praktyki , aby przewidziane w niektórych dziedzinach . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'państwa członkowskie już w możliwość wykorzystania możliwości zawieszenia i możliwościach nie ma żadnych potrzeb . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'ważne jest , aby zmiana treści rozporządzenia będzie zapewnianie sposobem na rzecz realistycznych ram prawnych . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'panie przewodniczący ! sprawozdanie pana posła zbyt wiele wzywamy ostrzeżenie do komisji europejskiej . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'zachód szkocji , że trudno jest , dlaczego komisja potrafi powiedzieć , że komisja może powiedzieć , że chce powiedzieć , że chce wprowadzić ograniczenie', 'komisja wolałaby przedłużenia obecnej środków technicznych , które powodują one , w tym z dnia , w tym zwłaszcza dla wielu innych , w tym', 'komisja nie komisja 18 miesięcy , tzw . tymczasowego środków , które nie udało się nie zasobów ryb . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'kolejne 18 miesięcy ryb , ryb w morzu . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'wzywam kolegów o poparcie poprawek do wszystkich poprawek i mam nadzieję , że te poprawki zostaną przyjęte , gdyby poprawka poprawki , a przyjęciem tych', 'panie przewodniczący ! chciałbym zgłosić poprawkę do środków technicznych , które próbują temu sprostać temu , by mówili o najlepszym dowodem na odrażające , by', 'w czasach wzmożonego przesiedleń zasobów rybnych , komisja wprowadziła wyjątkowe układy przejściowe dotyczące przejściowych okresu ważności różnych polityk miesiącach 12 miesięcy z 12 nowym okresem', 'teraz komisja usiłuje przestrzegać zasad w sprawie wywozu , a następnie nowe porozumienie w życie przy nowym kształtem . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'dzięki temu doprowadziło to oznaczać , że przejściowe regulacje na okres przejściowy wynoszący 12 do rozpoczęcia jednego roku <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'mam nadzieję , że parlament poprze jutro . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', '( pt ) panie przewodniczący , pani komisarz ! wszyscy chcę pogratulować czterech sprawozdawców , chciałbym skupić się tylko na dwóch nowych konferencji mojej agendy', 'w 2008 roku brak jakichkolwiek przepisów w sprawie projektu rozporządzenia dotyczącego zagwarantowania uproszczenie i wyjaśnienia ue regulacje dotyczące przepisów ue na temat wspólnotowego w zakresie', 'wniosek ustawodawczy obejmuje wniosek spowoduje rozpoczęcie jednego z kolejnego okresu - w innych kilku dziewięćdziesiąt nr 7 - w innych kilku dziewięćdziesiąt nr 7 -', 'utrzymanie sąsiedztwa ramy regulacyjne w okresie trwania , zgodnie z uwzględnieniem rybaków takich jak bez utraty rybaków . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'obecna tajemniczość nie jest rozszerzenie stosowania jednego z prawem dokładnie i zakazu stosowania ich życia w przypadku kryzysów a czasami metrów . <EOS> <BLANK> <BLANK>', 'obecne zakaz tego zakazu stosowania pestycydów , która spotkała się od dziesięcioleci o 30% miasta w rejonie zanieczyszczenia dla trwałego restrukturyzacji długu do zanieczyszczenia w', 'dlatego apeluję o poparcie przyjęcia w propozycję w propozycji , dlatego właśnie to propozycje pana posła pana posła pana posła pana posła pana posła pana', 'chciałbym również wspomnieć o pakiecie projektu rozporządzenia dotyczącego środków finansowych wspólnoty dotyczące wdrażania wspólnej polityki i ustawy o prawie przegląd ustawy o prawie przegląd ustawy', 'wraz z europejską rybołówstwa europejski fundusz zapewnienia wspierania finansów publicznych , zapewniając wsparcie dla finansowania projektów obszarach pod względem zasobów , zapewniając wsparcie dla finansowania', 'w tym kontekście chciałbym wyrazić moje pełne poparcie dla propozycji przedstawionych w tej izbie , sprawozdawca , sprawozdawca , sprawozdawca , sprawozdawca , sprawozdawca ,', 'wagi i rosnące znaczenie chronienia znaczenia zarządzania legalną wymianę substancji i szczegółowe badania w pełnej autonomii zasobów . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', '( es ) panie przewodniczący , pani komisarz ! chciałbym pogratulować czterem sprawozdawcom sprawozdań w ich wyjątkowo wykonanej . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'skoncentruję się teraz na dwóch sprawozdań , uwzględniając ich znaczenie . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'jedną z nich dotyczy porozumienia z unią . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'jest to umowa acta unii europejskiej a państwami akp - o czym się , że tzw . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'przedmiotowa umowa ustanawia możliwość uznania ich ważności ryb . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'powtarzam : ryby . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'w sprawozdaniu przewidziano również dla samej korzyści , aby sam sposób , by poprawić jego ograniczenia jego zdolności i jej potencjał . <EOS> <BLANK> <BLANK>', 'ustanawia on ramy partnerstwa kryteriów dotyczących rybołówstwa i odpowiedzialnej , za ochronę zasobów naturalnych . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'musimy uwzględnić to , że istnieje szereg zawodów , ponieważ codziennie , co roku , że powstał co roku przeprowadzana zgodnie z prawem międzynarodowym rokiem', 'ułatwia on badania naukowe , a także obserwatorzy ponad obserwatorów oraz za zachowania bioróżnorodności , jak i zachowania bioróżnorodności , jak i zachowania bioróżnorodności ,', 'protokół ta czerpie korzyści dzięki istnieniu dostępu do połowów , ponieważ europejska regulacja , stały obszar , rozwoju , rozwoju , rozwoju , rozwoju ,', 'jest jednak ma ono również bardzo ważna element : każda trzecią państw trzecich od państwa , jeśli pieniądze od zapewnienia ochrony konsumentów . <EOS> <BLANK>', 'w państwach trzecich zatrudnieni stanowią wpływy zasobów rybołówstwa . <EOS> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK> <BLANK>', 'dlatego też uważamy , że nie tylko nie tylko przyczynia się z punktu widzenia funduszu , ale również pomóc w sposób jej przetrwanie . <EOS>', 'należy zwrócić szczególną uwagę na krótkie ramy w sprawie rozporządzenia dotyczącego konkretnych środków technicznych , które w czasie po wypadku czas , które w czasie']\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"RF1vdsADCAM1","colab_type":"text"},"source":["### Process output files"]},{"cell_type":"code","metadata":{"id":"TaVrCe7uCIlP","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]}]}