{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import csv\n", "import os\n", "import re\n", "import random\n", "from collections import Counter, defaultdict\n", "import nltk\n", "import math\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "directory = \"train/in.tsv.xz\"\n", "directory_dev_0 = \"dev-0/in.tsv.xz\"\n", "directory_test_A = \"test-A/in.tsv.xz\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### MODEL N-GRAM" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "class Model():\n", " \n", " def __init__(self, vocab_size=30_000, UNK_token= '', n=3):\n", " if (n <= 1 or n % 2 == 0):\n", " raise \"change N value !!!\"\n", " self.n = n\n", " self.vocab_size = vocab_size\n", " self.UNK_token = UNK_token\n", " \n", " def train(self, corpus:list) -> None:\n", " if(self.n > 1):\n", " self.n_grams = list(nltk.ngrams(corpus, n=self.n))\n", " else:\n", " self.n_grams = corpus\n", " self.counter = Counter(self.n_grams)\n", " self.words_counter = Counter(corpus)\n", " self.all_quantities = Counter([gram[:math.floor(self.n/2)]+gram[math.ceil(self.n/2):] for gram in self.n_grams])\n", "\n", " self.all_grams = defaultdict(set)\n", "\n", " for gram in tqdm(self.n_grams):\n", " previous_words = tuple(gram[:math.floor(self.n/2)])\n", " next_words = tuple(gram[math.ceil(self.n/2):])\n", " word = gram[math.floor(self.n/2)]\n", " self.all_grams[(previous_words, next_words)].add(word)\n", "\n", " def get_conditional_prob_for_word(self, left_text: list, right_text: list, word: str) -> float:\n", " previous_words = tuple(left_text[-math.floor(self.n/2):])\n", " next_words = tuple(right_text[:math.floor(self.n/2)])\n", " quantity = self.counter[previous_words + tuple([word]) + next_words]\n", " all_quantity = self.all_quantities[previous_words + next_words]\n", " if (all_quantity <= 0):\n", " return 0\n", " return quantity/all_quantity\n", " \n", " def get_prob_for_text(self, text: list) -> float:\n", " prob = 1\n", " for gram in list(nltk.ngrams(text, self.n)):\n", " prob *= self.get_conditional_prob_for_word(gram[:math.floor(self.n/2)], gram[math.ceil(self.n/2):], gram[math.floor(self.n/2)])\n", " return prob\n", " \n", " def most_probable_words(self, left_text: list, right_text: list) -> str:\n", " previous_words = tuple(left_text[-math.floor(self.n/2):])\n", " next_words = tuple(right_text[:math.floor(self.n/2)])\n", " all_words = self.all_grams[(previous_words, next_words)]\n", " best_words = []\n", " for word in all_words:\n", " probability = self.get_conditional_prob_for_word(list(previous_words), list(next_words), word)\n", " best_words.append((word, probability))\n", " return sorted(best_words, key=(lambda l: l[1]), reverse=True)[:20]\n", " \n", " def generate_text(self, text_beggining:list, text_ending:list, greedy: bool) -> list:\n", " words = self.most_probable_words(text_beggining, text_ending)\n", " return words\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DATASET" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['came', 'fiom', 'the', 'last', 'place', 'to', 'this', 'place,', 'and', 'this', 'place', 'is', 'Where', 'We', 'Were,', 'this', 'is', 'the', 'first', 'road', 'I', 'ever', 'was', 'on', 'where', 'you', 'can', 'ride', 'elsewhere', 'from', 'anywhere', 'and', 'be', 'nowhere.', 'He', 'says,', 'while', 'this', 'train', 'stops', 'every-', 'where,', 'it', 'never', 'stops', 'anywhere', 'un-', 'less', 'its', 'somewhere.', 'Well,', 'I', 'says,', \"I'm\", 'glad', 'to', 'hear', 'that,', 'but,', 'accord-', 'ing', 'to', 'your', 'figures,', 'I', 'left', 'myself', 'where', '1', 'was,', 'which', 'is', 'five', 'miles', 'near-', 'er', 'to', 'myself', 'than', 'I', 'was', 'when', 'we', 'were', 'where', 'we', 'are', 'now.', 'We', 'have', 'now', 'reached', 'Slidell.', \"That's\", 'a', 'fine', 'place.', 'The', 'people', 'down']\n" ] } ], "source": [ "dataframeList = pd.read_csv(directory, sep='\\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE, chunksize=10000)\n", "\n", "expectedList = pd.read_csv(directory, sep='\\t', header=None, names=['Word'], quoting=csv.QUOTE_NONE, chunksize=10000)\n", "\n", "DATASET = \"\"\n", "\n", "for number, (dataframe, expected) in enumerate(zip(dataframeList, expectedList)):\n", " dataframe = dataframe.replace(r'\\\\r|\\\\n|\\n|\\\\t', ' ', regex=True)\n", "\n", " left_text = dataframe['LeftContext'].to_list()\n", " right_text = dataframe['RightContext'].to_list()\n", " word = expected['Word'].to_list()\n", "\n", " lines = zip(left_text, word, right_text)\n", " lines = list(map(lambda l: \" \".join(l), lines))\n", " DATASET = DATASET + \" \".join(lines)\n", "\n", "FINAL_DATASET = re.split(r\"\\s+\", DATASET)\n", "print(FINAL_DATASET[:100])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### TRAIN" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 199099663/199099663 [11:00<00:00, 301572.30it/s] \n" ] } ], "source": [ "model_3gram = Model(n = 3)\n", "model_3gram.train(FINAL_DATASET)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "model = model_3gram" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### PREDICTION" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def convert_predictions(line):\n", " sum_predictions = np.sum([pred[1] for pred in line])\n", " result = \"\"\n", " all_pred = 0\n", " for word, pred in line:\n", " new_pred = math.floor(pred / sum_predictions * 100) / 100\n", " if(new_pred == 1.0):\n", " new_pred = 0.99\n", " all_pred = all_pred + new_pred\n", " result = result + word + \":\" + str(new_pred) + \" \"\n", " if(round(all_pred, 2) < 1):\n", " result = result + \":\" + str(round(1 - all_pred, 2))\n", " else:\n", " result = result + \":\" + str(0.01)\n", " return result" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "10519it [00:51, 206.24it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[[], [('passage', 0.005959701068962256), ('growth', 0.005202913631633715), ('successors', 0.005108315201967647), ('place,', 0.004682622268470343), ('use,', 0.004115031690473938), ('head', 0.003452842682811465), ('own', 0.003452842682811465), ('own,', 0.003310945038312364), ('power', 0.00326364582347933), ('place', 0.0032163466086462963), ('functions,', 0.0031690473938132627), ('members', 0.0031690473938132627), ('work', 0.0030271497493141613), ('value', 0.00288525210481506), ('principles', 0.002743354460315959), ('strength', 0.002696055245482925), ('beauty', 0.002459559171317756), ('action,', 0.0023649607416516886), ('history', 0.0023176615268186546), ('value,', 0.002270362311985621)], [('a', 0.5714285714285714), ('the', 0.2857142857142857), ('lha', 0.14285714285714285)], [], [], [('a', 0.31221719457013575), ('him', 0.07239819004524888), ('two', 0.06334841628959276), ('means', 0.03167420814479638), ('just', 0.027149321266968326), ('only', 0.02262443438914027), ('money', 0.02262443438914027), ('good\\\\nand', 0.01809954751131222), ('all', 0.01809954751131222), ('force', 0.01809954751131222), ('the', 0.013574660633484163), ('capital', 0.013574660633484163), ('no', 0.013574660633484163), ('barely', 0.013574660633484163), ('capacity', 0.00904977375565611), ('bills', 0.00904977375565611), ('scarcely', 0.00904977375565611), ('boats', 0.00904977375565611), ('stabling', 0.00904977375565611), ('applicants', 0.00904977375565611)], [], [], [('arc', 1.0)], [('as', 0.7895791583166333), ('that', 0.11022044088176353), ('ns', 0.018036072144288578), ('sure,', 0.008016032064128256), ('sure', 0.006012024048096192), ('confident', 0.006012024048096192), ('defrauded', 0.004008016032064128), ('that,', 0.004008016032064128), ('r.s', 0.004008016032064128), ('us', 0.004008016032064128), ('but', 0.004008016032064128), ('tbat', 0.004008016032064128), ('thst', 0.004008016032064128), ('a>', 0.004008016032064128), ('its', 0.002004008016032064), ('ts', 0.002004008016032064), ('a3', 0.002004008016032064), ('alike;', 0.002004008016032064), ('\"as', 0.002004008016032064), ('bad.', 0.002004008016032064)], [], [('the', 0.4470046082949309), ('show', 0.25161290322580643), ('shew', 0.04470046082949309), ('this', 0.027188940092165898), ('tho', 0.02165898617511521), ('our', 0.01889400921658986), ('a', 0.013364055299539171), ('tbe', 0.00967741935483871), ('that', 0.009216589861751152), ('their', 0.009216589861751152), ('any', 0.005529953917050691), ('immediately', 0.004147465437788019), ('said', 0.004147465437788019), ('tlie', 0.003686635944700461), ('some', 0.0027649769585253456), ('his', 0.0027649769585253456), ('what', 0.0027649769585253456), ('find', 0.002304147465437788), ('thow', 0.002304147465437788), ('snow', 0.002304147465437788)], [], [], [('to', 0.71875), ('a', 0.109375), ('except\\\\nto', 0.03125), ('and', 0.03125), ('¦', 0.03125), ('world,', 0.015625), ('the', 0.015625), ('uud', 0.015625), ('for', 0.015625), ('efcept\\\\nta', 0.015625)], [('There', 0.5), ('To', 0.5)], [('to', 0.5416666666666666), ('Almighty', 0.20833333333333334), ('that', 0.16666666666666666), ('for', 0.041666666666666664), ('thai', 0.020833333333333332), ('the', 0.020833333333333332)], [('as', 0.29831387808041504), ('posted', 0.05188067444876784), ('informed', 0.04798962386511025), ('up', 0.04669260700389105), ('started', 0.03501945525291829), ('known', 0.03501945525291829), ('fed', 0.016861219195849545), ('down', 0.01556420233463035), ('Informed', 0.014267185473411154), ('represented', 0.01297016861219196), ('along', 0.011673151750972763), ('out', 0.011673151750972763), ('back', 0.010376134889753566), ('and', 0.010376134889753566), ('established', 0.009079118028534372), ('that', 0.007782101167315175), ('aa', 0.007782101167315175), ('satisfied', 0.00648508430609598), ('is', 0.005188067444876783), ('advanced', 0.005188067444876783)], [], [], [], [('will', 0.7142857142857143), ('to', 0.2857142857142857)], [('went', 0.031111497349439472), ('carried', 0.018510471886677673), ('came', 0.016424784913530895), ('find', 0.015642652298600852), ('set', 0.014773616059789694), ('with-', 0.014599808812027461), ('go', 0.01399148344485965), ('pointed', 0.013122447206048491), ('carry', 0.012861736334405145), ('come', 0.011645085600069523), ('started', 0.011036760232901711), ('put', 0.010341531241852785), ('brought', 0.009993916746328322), ('paid', 0.009559398626922743), ('get', 0.009298687755279395), ('sent', 0.009124880507517164), ('took', 0.008342747892587122), ('got', 0.00816894064482489), ('struck', 0.007995133397062657), ('laid', 0.006952289910489268)], [('city', 0.049490538573508006), ('City', 0.042212518195050945), ('Bay', 0.03202328966521106), ('and', 0.023289665211062592), ('avenue', 0.020378457059679767), ('city,', 0.01455604075691412), ('is', 0.013100436681222707), ('delegation', 0.013100436681222707), ('City,', 0.010189228529839884), ('Railroad', 0.010189228529839884), ('State', 0.010189228529839884), ('banks', 0.008733624454148471), ('letter', 0.008733624454148471), ('as', 0.008733624454148471), ('or', 0.00727802037845706), ('were', 0.00727802037845706), ('bound', 0.00727802037845706), ('seems', 0.00727802037845706), ('prior', 0.00727802037845706), ('brought', 0.00727802037845706)], [], [('that', 0.05250831014357985), ('in', 0.043840778342967776), ('for', 0.03457297312301788), ('at', 0.03137382807891687), ('to', 0.026793707191483752), ('of', 0.024961658836510504), ('on', 0.024194835566720564), ('all', 0.022643840084385258), ('when', 0.015985315854851804), ('with', 0.015294827933185751), ('by', 0.014850694998646783), ('if', 0.011190068077251373), ('as', 0.010940243301573203), ('from', 0.010412835441808176), ('then', 0.008761216091491384), ('after', 0.0069499864678246506), ('under', 0.006283787066016197), ('In', 0.00617622362093254), ('upon', 0.005971506096418484), ('during', 0.004802187354702604)], [], [], [], [], [], [], [], [], [('progressively,', 0.05555555555555555), ('file', 0.05555555555555555), ('streets', 0.05555555555555555), ('bland', 0.05555555555555555), ('her', 0.05555555555555555), ('management,', 0.05555555555555555), ('rapped,', 0.05555555555555555), ('gans', 0.027777777777777776), ('settlement', 0.027777777777777776), ('rived', 0.027777777777777776), ('th.\\\\nprepari*', 0.027777777777777776), ('Mack', 0.027777777777777776), ('Wichita', 0.027777777777777776), ('any\\\\norganization,', 0.027777777777777776), ('rulers,', 0.027777777777777776), ('tracted', 0.027777777777777776), ('lode,', 0.027777777777777776), ('child,-', 0.027777777777777776), ('iyiagdown,\\\\ndithcult', 0.027777777777777776), ('wuss', 0.027777777777777776)], [('for', 0.055714285714285716), ('about', 0.04428571428571428), ('twenty-', 0.03), ('eighty', 0.03), ('the', 0.02857142857142857), ('forty', 0.02857142857142857), ('thirty-', 0.027142857142857142), ('in', 0.025714285714285714), ('twenty', 0.024285714285714285), ('sixty', 0.022857142857142857), ('from', 0.02142857142857143), ('thirty', 0.02142857142857143), ('at', 0.02142857142857143), ('whithin', 0.02), ('some', 0.017142857142857144), ('seventy', 0.017142857142857144), ('ninety', 0.017142857142857144), ('fifty', 0.015714285714285715), ('that', 0.014285714285714285), ('forty-', 0.012857142857142857)], [], [], [('block', 1.0)], [('for\\\\nthe', 0.3333333333333333), ('toward', 0.3333333333333333), ('it,', 0.3333333333333333)], [], [], [('of', 0.9406392694063926), ('or', 0.0182648401826484), ('of.', 0.0045662100456621), ('c\\\\nof', 0.0045662100456621), ('ofecommon', 0.0045662100456621), ('of-', 0.0045662100456621), ('ot', 0.0045662100456621), ('of\\\\nrailway', 0.0045662100456621), ('of\\\\nthe', 0.0045662100456621), ('al\\\\nof', 0.0045662100456621), ('ol', 0.0045662100456621)], [], [], [], [], [('in', 0.1984948259642521), ('to', 0.1251175917215428), ('of', 0.08278457196613359), ('on', 0.04609595484477893), ('In', 0.03574788334901223), ('for', 0.034807149576669805), ('if', 0.028222013170272814), ('by', 0.022577610536218252), ('among', 0.022577610536218252), ('with', 0.020696142991533398), ('at', 0.01975540921919097), ('upon', 0.01881467544684854), ('from', 0.015051740357478834), ('when', 0.014111006585136407), ('interest', 0.014111006585136407), ('during', 0.01317027281279398), ('since', 0.01317027281279398), ('as', 0.012229539040451553), ('about', 0.010348071495766699), ('describe', 0.00940733772342427)], [], [('the', 0.6666666666666666), ('tlie', 0.3333333333333333)], [('trouble', 0.1), ('art', 0.1), ('gulf', 0.1), ('impetus', 0.1), ('sufferers.\\\\nThere', 0.05), ('change,', 0.05), ('ambition', 0.05), ('mission', 0.05), ('man', 0.05), ('object', 0.05), ('deal.\\\\nIt', 0.05), ('drawback', 0.05), ('mountain', 0.05), ('difficulty', 0.05), ('mass', 0.05), ('and\\\\nthere', 0.05)], [('throat,', 0.3333333333333333), ('yet', 0.3333333333333333), ('come', 0.3333333333333333)], [('ralniali', 0.6666666666666666), ('vocabulary', 0.3333333333333333)], [('the', 0.8435374149659864), ('one', 0.031292517006802724), ('tho', 0.01904761904761905), ('said', 0.013605442176870748), ('a', 0.009523809523809525), ('tbe', 0.006802721088435374), ('his', 0.005442176870748299), ('Twenty-', 0.004081632653061225), ('tbo', 0.0027210884353741495), ('its', 0.0027210884353741495), ('Uie', 0.0027210884353741495), ('two', 0.0027210884353741495), ('Broas', 0.0027210884353741495), ('Ihe', 0.0027210884353741495), (\"'he\", 0.0027210884353741495), (\"Yieuxtemps'\", 0.0027210884353741495), ('^he', 0.0027210884353741495), ('tin·', 0.0027210884353741495), ('this', 0.0027210884353741495), ('that', 0.0027210884353741495)], [('of', 0.8801742919389978), ('ol', 0.023965141612200435), ('in', 0.015250544662309368), ('ot', 0.013071895424836602), ('at', 0.008714596949891068), ('as', 0.006535947712418301), ('cf', 0.006535947712418301), ('on', 0.004357298474945534), ('by', 0.004357298474945534), ('<>l', 0.004357298474945534), ('ef', 0.004357298474945534), ('revives', 0.002178649237472767), ('meetings,', 0.002178649237472767), ('collection\\\\nof', 0.002178649237472767), ('undor', 0.002178649237472767), ('or', 0.002178649237472767), ('and', 0.002178649237472767), ('cross,', 0.002178649237472767), ('»>f', 0.002178649237472767), ('address.\\\\nAfter', 0.002178649237472767)], [('the', 1.0)], [], [], [], [('people', 0.007811723789066814), ('city', 0.007597883587209734), ('sum', 0.007528284653586438), ('purpose', 0.007202480949813621), ('office', 0.006874155545547206), ('use', 0.0064152060557124335), ('part', 0.005949700144594307), ('time', 0.005928013520349366), ('end', 0.005590105654207281), ('amount', 0.0055063851978198386), ('name', 0.005444855705776056), ('State', 0.005406525858273371), ('place', 0.0049142899219231095), ('hands', 0.004776100734873958), ('payment', 0.004654050430983831), ('City', 0.004477027056333276), ('rate', 0.004328751067309734), ('date', 0.004124997667427043), ('number', 0.004105832743675701), ('cost', 0.003972686957613745)], [('is', 0.12226176354290233), ('seems', 0.04404903123764334), ('was', 0.03846052458152102), ('necessary', 0.030815869250032952), ('ought', 0.024489257941215238), ('seemed', 0.0243574535389482), ('comes', 0.021484117569526822), ('came', 0.017556346381969158), ('impossible', 0.0125477790958218), ('possible', 0.01162514827995255), ('up', 0.010860682746803744), ('appears', 0.010491630420456043), ('has', 0.009226308158692501), ('occurred', 0.009068142875972058), ('began', 0.008567286147357321), ('appearing', 0.008198233821009622), ('difficult', 0.00785554237511533), ('over', 0.006880189798339265), ('had', 0.006695663635165414), ('Is', 0.006616580993805193)], [], [('the', 0.6598268072289156), ('a', 0.06701807228915663), ('his', 0.025978915662650603), ('tho', 0.02371987951807229), ('this', 0.017695783132530122), ('the\\\\ncourt', 0.015060240963855422), ('each', 0.011671686746987951), ('either', 0.010730421686746988), ('tbe', 0.00809487951807229), ('her', 0.007530120481927711), ('that', 0.007530120481927711), ('my', 0.006400602409638554), ('said', 0.005835843373493976), ('our', 0.004329819277108434), ('boarding', 0.003953313253012048), ('their', 0.0037650602409638554), ('every', 0.00338855421686747), ('your', 0.00338855421686747), ('any', 0.0028237951807228916), ('packing', 0.0026355421686746986)], [('be', 0.7551020408163265), ('bo', 0.061224489795918366), ('the', 0.04081632653061224), ('he', 0.04081632653061224), ('1', 0.02040816326530612), ('an\\\\noccasional', 0.02040816326530612), ('b', 0.02040816326530612), ('every', 0.02040816326530612), ('hurry\\\\nthe', 0.02040816326530612)], [('his', 0.28869047619047616), ('her', 0.125), ('one', 0.07142857142857142), ('a', 0.06547619047619048), ('little', 0.05654761904761905), ('their', 0.050595238095238096), ('wife,', 0.023809523809523808), ('bis', 0.01488095238095238), ('Hud-', 0.01488095238095238), ('the', 0.008928571428571428), ('loving', 0.008928571428571428), ('only', 0.008928571428571428), ('my', 0.005952380952380952), ('promising', 0.005952380952380952), ('learned', 0.005952380952380952), (\"Callahan's\", 0.005952380952380952), ('dutiful', 0.005952380952380952), ('John-', 0.005952380952380952), ('Infant', 0.005952380952380952), ('young', 0.005952380952380952)], [], [], [], [('little', 0.15447154471544716), ('small', 0.08943089430894309), ('summer', 0.08130081300813008), ('neat', 0.04878048780487805), ('pleasant', 0.032520325203252036), ('tiny', 0.024390243902439025), ('fine', 0.024390243902439025), ('new', 0.024390243902439025), ('nice', 0.024390243902439025), ('white', 0.016260162601626018), ('five-room', 0.016260162601626018), ('thatched', 0.016260162601626018), (\"ploughman's\", 0.016260162601626018), ('drab', 0.016260162601626018), ('low', 0.016260162601626018), ('eesy', 0.016260162601626018), ('vine-covered', 0.016260162601626018), ('snug', 0.016260162601626018), ('rest', 0.016260162601626018), ('clay', 0.016260162601626018)], [('way', 0.046931305507476664), ('duty', 0.03010526913145873), ('power', 0.017648894815991828), ('efforts', 0.013108986181653095), ('return', 0.012513123173396135), ('right', 0.012484748744431519), ('mind', 0.012115881167891496), ('attention', 0.011917260165139177), ('ability', 0.011293022727917601), ('wife', 0.01081065743551911), ('name', 0.010612036432766791), ('intention', 0.010555287574837556), ('wife,', 0.008171835541809721), ('time', 0.007916465681128167), ('hand', 0.007859716823198934), ('desire', 0.007575972533552762), ('friends', 0.007178730528048123), ('family', 0.007036858383225038), ('life', 0.0068098629515081005), ('letter', 0.006213999943251142)], [('husband', 0.020961168092444128), ('husband,', 0.01285439154387065), ('father', 0.01222734805392574), ('head', 0.011286782819008375), ('mother', 0.010032695839118555), ('hands', 0.008509875934966632), ('eyes', 0.006763111927262955), ('face', 0.0064495901822905), ('home', 0.0064495901822905), ('life', 0.006001701975186994), ('head,', 0.005464236126662785), ('father,', 0.00515071438169033), ('hand', 0.005105925560979979), ('up', 0.0050611367402696285), ('arms', 0.004926770278138577), ('life,', 0.004881981457428226), ('room', 0.004792403816007524), ('eyes,', 0.004792403816007524), ('face,', 0.004568459712455771), ('children', 0.004299726788193667)], [('amount', 0.015857973212029315), ('person', 0.015099823098306799), ('land', 0.008150113722517059), ('property', 0.007202426080363912), ('more', 0.005875663381349507), ('money', 0.005370229972201162), ('blood', 0.00511751326762699), ('ground', 0.004801617386909275), ('world', 0.004612079858478646), ('people', 0.004485721506191559), ('law', 0.0044225423300480165), ('time', 0.004359363153904473), ('lands', 0.004106646449330301), ('debt', 0.004043467273186757), ('country,', 0.0039171089208996715), ('work', 0.0038539297447561286), ('sum', 0.0037907505686125853), ('country', 0.0037907505686125853), ('bonds', 0.003664392216325499), ('party', 0.00347485468789487)], [], [('as', 0.08163265306122448), ('time', 0.08163265306122448), ('child,', 0.061224489795918366), ('lady,', 0.04081632653061224), ('ones,', 0.04081632653061224), ('is', 0.04081632653061224), ('girl,', 0.04081632653061224), ('relief', 0.04081632653061224), ('gossip', 0.04081632653061224), ('which', 0.04081632653061224), ('value;', 0.02040816326530612), ('cor]\\\\nral,\"', 0.02040816326530612), ('spoil,', 0.02040816326530612), ('harshly,', 0.02040816326530612), ('elevated,', 0.02040816326530612), ('cousequence;', 0.02040816326530612), ('and', 0.02040816326530612), ('thought,', 0.02040816326530612), ('ones', 0.02040816326530612), ('has', 0.02040816326530612)], [], [], [], [('their', 0.6666666666666666), ('tbe', 0.16666666666666666), ('the', 0.16666666666666666)], [('owner', 0.010640353298371515), ('person', 0.007728401876897599), ('city', 0.006541540160088325), ('same', 0.00612751863096881), ('same,', 0.005106265525807342), ('purchaser', 0.004747446867237096), ('State', 0.004402428926304168), ('whole', 0.004264421749930997), ('state', 0.004043610267733922), ('time', 0.003629588738614408), ('county', 0.0035881865857024567), ('party', 0.0033121722329561135), ('man', 0.003036157880209771), ('day', 0.002953353574385868), ('right', 0.0028843499861992824), ('military', 0.0028015456803753796), ('people', 0.002760143527463428), ('town', 0.0027325420921887936), ('point', 0.0025945349158156225), ('State,', 0.0022909191277946453)], [], [('people', 0.007811723789066814), ('city', 0.007597883587209734), ('sum', 0.007528284653586438), ('purpose', 0.007202480949813621), ('office', 0.006874155545547206), ('use', 0.0064152060557124335), ('part', 0.005949700144594307), ('time', 0.005928013520349366), ('end', 0.005590105654207281), ('amount', 0.0055063851978198386), ('name', 0.005444855705776056), ('State', 0.005406525858273371), ('place', 0.0049142899219231095), ('hands', 0.004776100734873958), ('payment', 0.004654050430983831), ('City', 0.004477027056333276), ('rate', 0.004328751067309734), ('date', 0.004124997667427043), ('number', 0.004105832743675701), ('cost', 0.003972686957613745)], [('the', 0.8945738090151677), ('tho', 0.028733176671651355), ('tbe', 0.014740440076906645), ('that', 0.007904293954283274), ('this', 0.00640888698995941), ('tne', 0.0035248878444776757), ('thc', 0.0020294808801538133), ('these', 0.0019226660969878231), ('tlie', 0.001815851313821833), ('tha', 0.001815851313821833), ('ihe', 0.0013885921811578722), ('tiie', 0.0013885921811578722), ('those', 0.0008545182653279214), ('the.', 0.0007477034821619312), ('tile', 0.0007477034821619312), ('Hie', 0.0007477034821619312), ('th', 0.0007477034821619312), ('tlio', 0.000640888698995941), ('thu', 0.000640888698995941), ('tue', 0.000640888698995941)], [], [], [('it.', 0.05698529411764706), ('this.', 0.04044117647058824), ('manner.', 0.029411764705882353), ('that.', 0.025735294117647058), ('this:', 0.02022058823529412), ('mad.', 0.014705882352941176), ('results.', 0.012867647058823529), ('satin.', 0.011029411764705883), ('metal.', 0.009191176470588236), ('him.', 0.009191176470588236), ('me.', 0.009191176470588236), ('children.', 0.009191176470588236), ('nature.', 0.009191176470588236), ('thunder.', 0.009191176470588236), ('it', 0.007352941176470588), ('silver.', 0.007352941176470588), ('circumstances.', 0.007352941176470588), ('them.', 0.007352941176470588), ('It.', 0.007352941176470588), ('Macbeth.', 0.0055147058823529415)], [('and', 1.0)], [], [], [], [], [], [('of', 0.8275862068965517), ('»f', 0.06896551724137931), ('af', 0.06896551724137931), ('01', 0.034482758620689655)], [('not', 0.10097719869706841), ('sufficient,', 0.04560260586319218), ('made', 0.04071661237785016), ('pleasant,', 0.035830618892508145), ('almost', 0.029315960912052116), ('left', 0.026058631921824105), ('never', 0.019543973941368076), ('still', 0.017915309446254073), ('entirely', 0.016286644951140065), ('taken', 0.014657980456026058), ('now', 0.014657980456026058), ('absolutely', 0.011400651465798045), ('better', 0.011400651465798045), ('dispersed', 0.011400651465798045), ('elected,', 0.009771986970684038), ('people', 0.009771986970684038), ('yet', 0.009771986970684038), ('also', 0.008143322475570033), ('wholly', 0.008143322475570033), ('taxed', 0.008143322475570033)], [('and', 0.29577464788732394), ('procured,', 0.03621730382293763), ('or', 0.03420523138832998), ('made', 0.028169014084507043), ('killed', 0.012072434607645875), ('received\\\\nand', 0.012072434607645875), ('divided', 0.008048289738430584), ('made,', 0.008048289738430584), ('tried', 0.008048289738430584), ('issued', 0.008048289738430584), ('established', 0.006036217303822937), ('published', 0.006036217303822937), ('and\\\\nstill', 0.006036217303822937), ('printed', 0.006036217303822937), ('formed,', 0.006036217303822937), ('preserved', 0.006036217303822937), ('found', 0.006036217303822937), ('made.\\\\nThere', 0.004024144869215292), ('exposed', 0.004024144869215292), ('elected', 0.004024144869215292)], [], [('the', 0.4024390243902439), ('a', 0.3780487804878049), ('it', 0.07317073170731707), ('them', 0.024390243902439025), ('him', 0.024390243902439025), ('me', 0.024390243902439025), ('sleeping', 0.012195121951219513), ('tho', 0.012195121951219513), ('i', 0.012195121951219513), ('al', 0.012195121951219513), ('her', 0.012195121951219513), ('necessary', 0.012195121951219513)], [], [('the', 0.6086956521739131), ('tbe', 0.08695652173913043), ('our', 0.08695652173913043), ('(ho', 0.043478260869565216), ('offthe', 0.043478260869565216), ('off\\\\nthe', 0.043478260869565216), ('for', 0.043478260869565216), ('off\\\\nour', 0.043478260869565216)], [], [('connected', 0.8095238095238095), ('associated', 0.047619047619047616), ('con\\xad\\\\nnected', 0.047619047619047616), ('connecte', 0.023809523809523808), ('which\\\\nmingle', 0.023809523809523808), ('met', 0.023809523809523808), ('connecte«!', 0.023809523809523808)]]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# PREDICTION FOR DEV-0\n", "\n", "dataframe = pd.read_csv(directory_dev_0, sep='\\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)\n", "dataframe = dataframe.replace(r'\\\\r|\\\\n|\\n|\\\\t', ' ', regex=True)\n", "\n", "left_text = dataframe['LeftContext'].apply(lambda l: re.split(r\"\\s+\", l)).to_list()\n", "right_text = dataframe['RightContext'].apply(lambda l: re.split(r\"\\s+\", l)).to_list()\n", "\n", "lines = zip(left_text, right_text)\n", "lines = list(map(lambda l: model.generate_text(l[0], l[1], False), tqdm(lines)))\n", "print(lines[:100])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 10519/10519 [00:00<00:00, 106542.75it/s]\n" ] } ], "source": [ "with open(\"dev-0/out.tsv\", \"w\", encoding=\"UTF-8\") as file:\n", " result = \"\\n\".join(list(map(lambda l: convert_predictions(l), tqdm(lines))))\n", " file.write(result)\n", " file.close()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "7414it [00:25, 290.39it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[[], [], [('the', 0.9090909090909091), ('tho', 0.09090909090909091)], [('man', 0.022790697674418603), ('plan', 0.011348837209302326), ('trial', 0.009674418604651163), ('living', 0.008744186046511627), ('statement', 0.008), ('law', 0.007720930232558139), ('class', 0.007162790697674419), ('time', 0.006232558139534884), ('year', 0.005767441860465117), ('vote', 0.005488372093023256), ('government', 0.005209302325581395), ('single', 0.005209302325581395), ('day', 0.005023255813953489), ('question', 0.004930232558139535), ('sensation', 0.0048372093023255815), ('bill', 0.004651162790697674), ('little', 0.004372093023255814), ('corporation,', 0.003813953488372093), ('way', 0.003813953488372093), ('means', 0.003813953488372093)], [], [('here', 0.17647058823529413), ('to', 0.17647058823529413), ('plaster,', 0.11764705882352941), ('arms,', 0.11764705882352941), ('youngsters,', 0.058823529411764705), ('mist', 0.058823529411764705), ('baby\\\\nlingers', 0.058823529411764705), ('close,', 0.058823529411764705), ('vines', 0.058823529411764705), ('material', 0.058823529411764705), ('fabrics', 0.058823529411764705)], [('tho', 1.0)], [], [('hanging', 0.06666666666666667), ('used', 0.06666666666666667), ('alarmed', 0.06666666666666667), ('seated', 0.06666666666666667), ('landed', 0.06666666666666667), ('received', 0.06666666666666667), ('outlined', 0.06666666666666667), ('drunk', 0.03333333333333333), ('employed', 0.03333333333333333), ('above', 0.03333333333333333), ('gathered', 0.03333333333333333), ('put', 0.03333333333333333), ('burned', 0.03333333333333333), ('early', 0.03333333333333333), ('conferred', 0.03333333333333333), ('apartments', 0.03333333333333333), ('living', 0.03333333333333333), ('poured', 0.03333333333333333), ('practical-\\\\nly', 0.03333333333333333), ('visible', 0.03333333333333333)], [], [('since', 0.1415119720204466), ('in', 0.03847188592951305), ('on', 0.02824858757062147), ('been', 0.026096314231907454), ('had', 0.024751143395211193), ('seen', 0.011299435028248588), ('to', 0.011030400860909336), ('saw', 0.010761366693570083), ('be', 0.009954264191552327), ('see', 0.008340059187516815), ('that', 0.008071025020177562), ('heard', 0.007532956685499058), ('reached', 0.007263922518159807), ('In', 0.007263922518159807), ('before', 0.006994888350820554), ('of', 0.006725854183481302), ('forget', 0.006187785848802798), ('at', 0.005918751681463546), ('occupied', 0.005918751681463546), ('have', 0.005649717514124294)], [], [('days', 0.1013157894736842), ('minutes', 0.06710526315789474), ('years', 0.05789473684210526), ('weeks', 0.04473684210526316), ('moments', 0.042105263157894736), ('months', 0.04078947368421053), ('cents', 0.02894736842105263), ('at', 0.02631578947368421), ('dollars', 0.019736842105263157), ('days,', 0.018421052631578946), ('years,', 0.018421052631578946), ('hours', 0.013157894736842105), ('day', 0.011842105263157895), ('months,', 0.010526315789473684), ('furnish', 0.010526315789473684), ('in', 0.010526315789473684), ('weeks,', 0.007894736842105263), ('with', 0.006578947368421052), ('that', 0.006578947368421052), ('hundreds', 0.005263157894736842)], [], [('there', 1.0)], [('from', 1.0)], [('which', 0.40717029449423814), ('what', 0.05505761843790013), ('whom', 0.03393085787451985), ('until', 0.029449423815621), ('as', 0.01792573623559539), ('them', 0.01088348271446863), ('till', 0.009603072983354673), ('when', 0.0076824583866837385), ('this', 0.007042253521126761), ('if', 0.007042253521126761), ('and', 0.007042253521126761), ('them,', 0.006402048655569782), ('hand', 0.006402048655569782), ('all', 0.005761843790012804), ('hand,', 0.005121638924455826), ('whether', 0.005121638924455826), ('Saturday', 0.005121638924455826), ('that', 0.005121638924455826), ('board,', 0.005121638924455826), ('where', 0.005121638924455826)], [('this', 0.4672489082969432), ('any', 0.09606986899563319), ('every', 0.09388646288209607), ('the', 0.07860262008733625), ('that', 0.07860262008733625), ('a', 0.024017467248908297), ('one', 0.021834061135371178), ('some', 0.019650655021834062), ('each', 0.013100436681222707), ('an\\\\nearlier', 0.008733624454148471), ('what', 0.008733624454148471), ('no', 0.008733624454148471), (\"th's\", 0.006550218340611353), ('anj-', 0.004366812227074236), ('nny', 0.004366812227074236), ('ever?', 0.004366812227074236), ('evsry', 0.004366812227074236), ('ono', 0.004366812227074236), ('a\\\\nyounger', 0.002183406113537118), ('each\\\\nsuccessive', 0.002183406113537118)], [('little', 0.19910514541387025), ('whole', 0.049217002237136466), ('beautiful', 0.03131991051454139), ('nearest', 0.024608501118568233), ('neighboring', 0.024608501118568233), ('said', 0.02237136465324385), ('Indian', 0.020134228187919462), ('entire', 0.017897091722595078), ('same', 0.015659955257270694), ('present', 0.013422818791946308), ('old', 0.013422818791946308), ('agricultural', 0.013422818791946308), ('next', 0.013422818791946308), ('small', 0.011185682326621925), ('town,', 0.011185682326621925), ('first', 0.011185682326621925), ('quiet', 0.011185682326621925), ('town\\\\nor', 0.008948545861297539), ('thriving', 0.008948545861297539), ('Maine', 0.008948545861297539)], [], [], [], [('inches', 0.2222222222222222), ('feet', 0.08262108262108261), ('chains', 0.07407407407407407), ('poles', 0.042735042735042736), ('Inches', 0.02849002849002849), ('.00', 0.022792022792022793), ('links', 0.022792022792022793), ('years', 0.019943019943019943), ('in.)', 0.017094017094017096), ('perches', 0.014245014245014245), ('.50', 0.014245014245014245), ('miles', 0.011396011396011397), (\"o'clock\", 0.011396011396011397), ('00', 0.011396011396011397), ('.25', 0.011396011396011397), ('.', 0.011396011396011397), ('chs,', 0.011396011396011397), ('inchee', 0.008547008547008548), ('iuches', 0.008547008547008548), ('rods', 0.008547008547008548)], [], [], [], [], [('the', 0.75), ('silent', 0.25)], [('it', 0.275), ('than', 0.15), ('this', 0.125), ('that', 0.125), ('there', 0.05), ('cheerfulness', 0.025), ('realized', 0.025), ('who', 0.025), ('she', 0.025), ('It', 0.025), ('t)an', 0.025), ('but', 0.025), ('all\\\\nthat', 0.025), ('1\\\\nthan', 0.025), ('what', 0.025), ('he', 0.025)], [('justice', 1.0)], [], [('going', 0.03180755506336902), ('placed', 0.028423772609819122), ('made', 0.026331979820351913), ('put', 0.023378860588162912), ('posted', 0.021164021164021163), ('found', 0.01925679832656577), ('held', 0.015934539190353144), ('laid', 0.015749969238341332), ('arrested', 0.01347360649686231), ('carried', 0.013350559862187769), ('lying', 0.013289036544850499), ('taken', 0.013289036544850499), ('not', 0.01273532668881506), ('based', 0.012427710102128706), ('standing', 0.01187400024609327), ('born', 0.011443337024732374), ('called', 0.0103359173126615), ('sitting', 0.010274393995324227), ('set', 0.009105450965916083), ('attached', 0.0068906115417743325)], [], [], [], [('Final', 0.8235294117647058), ('Jewish', 0.058823529411764705), ('Finir', 0.029411764705882353), ('Finsl', 0.029411764705882353), ('International', 0.029411764705882353), (\"People*'\", 0.029411764705882353)], [('said', 0.06564112842611615), ('seen', 0.06037981660570226), ('remembered', 0.05206193315628602), ('hoped', 0.03597735130530641), ('found', 0.028160545172120056), ('expected', 0.01914115347998196), ('sure', 0.018740291626998046), ('denied', 0.013579195269830134), ('told', 0.012977902490354262), ('admitted', 0.012577040637370347), ('supposed', 0.01237660971087839), ('understood', 0.011925640126271484), ('shown', 0.011925640126271484), ('stated', 0.011524778273287569), ('forgotten', 0.011023700957057674), ('assured', 0.010622839104073759), ('regretted', 0.01037230044595881), ('true', 0.009921330861351907), ('observed', 0.009470361276745002), ('noted', 0.009420253545122012)], [('arise\\\\ncomplications', 1.0)], [], [('be', 0.06187624750499002), ('sell', 0.033932135728542916), ('look', 0.031936127744510975), ('work', 0.021956087824351298), ('bo', 0.015968063872255488), ('appear', 0.015968063872255488), ('meet', 0.013972055888223553), ('get', 0.011976047904191617), ('redemption', 0.011976047904191617), ('stop', 0.011976047904191617), ('order', 0.00998003992015968), ('go', 0.00998003992015968), ('sit', 0.00998003992015968), ('return', 0.007984031936127744), ('Cuba', 0.007984031936127744), ('voto', 0.007984031936127744), ('him', 0.007984031936127744), ('It', 0.007984031936127744), ('me', 0.007984031936127744), ('remain', 0.007984031936127744)], [('must', 0.3333333333333333), ('shall', 0.3333333333333333), ('as\\\\nmay', 0.16666666666666666), ('may', 0.16666666666666666)], [('cross\\xad', 1.0)], [('of', 0.9375), ('ot', 0.0234375), ('or', 0.015625), ('f', 0.0078125), ('as', 0.0078125), ('oi', 0.0078125)], [], [('come,', 0.07317073170731707), ('will', 0.04878048780487805), ('tires;', 0.04878048780487805), ('die', 0.04878048780487805), ('wrecked', 0.04878048780487805), ('enough,', 0.04878048780487805), ('tempered,', 0.04878048780487805), ('die,', 0.04878048780487805), ('show,', 0.04878048780487805), ('say.', 0.04878048780487805), ('decay', 0.04878048780487805), ('learn', 0.024390243902439025), ('end,', 0.024390243902439025), ('decay,\\\\nand', 0.024390243902439025), ('evident', 0.024390243902439025), ('as\\\\npossible', 0.024390243902439025), ('as', 0.024390243902439025), ('after-especially', 0.024390243902439025), ('lost', 0.024390243902439025), ('will\\\\nairi7e,', 0.024390243902439025)], [('of', 0.900392762861374), ('ot', 0.010752688172043012), ('in', 0.007533320455862469), ('to', 0.006503122786684695), ('ol', 0.005859249243448586), ('and', 0.0051509883458888675), ('from', 0.0046358895112999805), ('on', 0.003670079196445818), ('or', 0.0034769171334749853), ('for', 0.0031549803618569315), ('at', 0.0030262056532097095), ('cf', 0.0021891700470027687), ('that', 0.0014809091494430494), ('oi', 0.0014165217951194386), ('o', 0.0012877470864722169), ('by', 0.0009658103148541626), ('into', 0.0009014229605305518), ('during', 0.0009014229605305518), ('In', 0.0009014229605305518), ('as', 0.000837035606206941)], [], [], [], [('half', 0.18426501035196688), ('for', 0.054865424430641824), ('in', 0.04796411318150449), ('such', 0.032091097308488616), ('like', 0.030020703933747412), ('with', 0.021739130434782608), ('on', 0.020703933747412008), ('twice', 0.01932367149758454), ('by', 0.018633540372670808), ('him', 0.01725327812284334), ('once', 0.014837819185645272), ('making', 0.012077294685990338), ('In', 0.01069703243616287), ('as', 0.009316770186335404), ('it', 0.00862663906142167), ('what', 0.005866114561766736), ('at', 0.005521048999309869), ('of', 0.005521048999309869), ('halt', 0.005521048999309869), ('through', 0.005521048999309869)], [], [], [('that', 0.5), ('ther', 0.25), ('the', 0.25)], [('to', 0.8762886597938144), ('find', 0.020618556701030927), ('lectures,', 0.020618556701030927), ('to?', 0.020618556701030927), ('them,\\\\nthough', 0.010309278350515464), ('school', 0.010309278350515464), ('and', 0.010309278350515464), ('as', 0.010309278350515464), ('ing', 0.010309278350515464), ('tothatafterawhilo;', 0.010309278350515464)], [], [('is', 0.09836065573770492), ('to', 0.06147540983606557), ('shows', 0.06147540983606557), ('was', 0.05327868852459016), ('showing', 0.040983606557377046), ('says', 0.03278688524590164), ('of', 0.03278688524590164), ('made', 0.028688524590163935), ('showed', 0.02459016393442623), ('saying', 0.020491803278688523), ('Is', 0.020491803278688523), ('said', 0.020491803278688523), ('by', 0.01639344262295082), ('that', 0.01639344262295082), ('published', 0.012295081967213115), ('declares', 0.012295081967213115), ('but', 0.012295081967213115), ('adds', 0.012295081967213115), ('as', 0.012295081967213115), ('and', 0.012295081967213115)], [], [], [], [('unerjoycd', 0.2), ('broken', 0.1), ('338,', 0.1), (\"'JO,\", 0.1), ('73.', 0.1), ('£Sl', 0.1), (\"17.'t,\", 0.1), ('of\\\\nglass', 0.1), (':iroc,', 0.1)], [], [], [], [], [('is', 0.6146341463414634), ('was', 0.17073170731707318), ('hates', 0.02926829268292683), ('all', 0.02926829268292683), ('Is', 0.01951219512195122), ('i', 0.00975609756097561), ('seems', 0.00975609756097561), ('ia', 0.00975609756097561), ('was\\\\nthought', 0.004878048780487805), ('id', 0.004878048780487805), ('wholly', 0.004878048780487805), ('w«g', 0.004878048780487805), ('bates', 0.004878048780487805), ('is\\\\nolso', 0.004878048780487805), ('iiates', 0.004878048780487805), ('hales', 0.004878048780487805), ('would\\\\nbe', 0.004878048780487805), ('entirely', 0.004878048780487805), ('for', 0.004878048780487805), ('13', 0.004878048780487805)], [('of', 0.8686131386861314), ('by', 0.043795620437956206), ('half', 0.010948905109489052), ('ol', 0.010948905109489052), ('only', 0.0072992700729927005), ('in', 0.0072992700729927005), ('resembled', 0.0072992700729927005), ('with', 0.0072992700729927005), ('hostess,', 0.0072992700729927005), (\">1'\", 0.0036496350364963502), ('of\\\\nabout', 0.0036496350364963502), ('upon', 0.0036496350364963502), ('cf', 0.0036496350364963502), ('on', 0.0036496350364963502), ('as', 0.0036496350364963502), ('ot', 0.0036496350364963502), ('oi\\\\nbutter', 0.0036496350364963502)], [('is', 0.15611521873074552), ('was', 0.11652803450400492), ('up', 0.025492914356130623), ('did', 0.01494146642020949), ('out', 0.014094269870609981), ('has', 0.013478126925446704), ('be', 0.013016019716574245), ('will', 0.012245841035120148), ('all', 0.01170671595810228), ('does', 0.00885705483672212), ('is,', 0.008240911891558842), ('were', 0.007932840418977202), ('and', 0.007470733210104745), ('comes', 0.006931608133086876), ('down', 0.006315465187923599), ('appears', 0.006238447319778189), ('away', 0.00593037584719655), ('Is', 0.00577634011090573), ('not', 0.004929143561306223), ('lies', 0.004929143561306223)], [('be', 0.5909090909090909), ('show', 0.18181818181818182), ('remain', 0.09090909090909091), ('continue', 0.045454545454545456), ('ho\\\\niotisidereri', 0.045454545454545456), ('tie', 0.045454545454545456)], [], [], [('in', 0.0751054852320675), ('of', 0.06160337552742616), ('throughout', 0.04388185654008439), ('restoring', 0.036286919831223625), ('on', 0.036286919831223625), ('to', 0.031223628691983123), ('at', 0.0270042194092827), ('by', 0.021940928270042195), ('that', 0.016877637130801686), ('from', 0.016033755274261603), ('for', 0.016033755274261603), ('and', 0.012658227848101266), ('with', 0.012658227848101266), ('speaking,', 0.010126582278481013), ('prefer', 0.009282700421940928), ('over', 0.009282700421940928), ('In', 0.009282700421940928), ('known,', 0.007594936708860759), ('believed', 0.007594936708860759), ('through', 0.007594936708860759)], [], [], [('and', 0.3333333333333333), ('»nd', 0.3333333333333333), ('nnd', 0.3333333333333333)], [('that', 0.18391152855627935), ('when', 0.10362066353582791), ('then', 0.05771852749583396), ('as', 0.05105287077715498), ('if', 0.04226632328435086), ('said', 0.027571580063626724), ('while', 0.01545220421148311), ('yet', 0.014846235418875928), ('there', 0.013482805635509772), ('now', 0.012573852446599), ('which', 0.01227086805029541), ('so', 0.01196788365399182), ('what', 0.010149977276170277), ('where', 0.009998485078018482), ('how', 0.009241024087259506), ('after', 0.008332070898348734), ('before', 0.007423117709437964), ('though', 0.007423117709437964), ('although', 0.006362672322375398), ('here', 0.005756703529768217)], [], [], [('he', 0.3530961791831357), ('I', 0.1712779973649539), ('she', 0.07114624505928854), ('they', 0.04479578392621871), ('we', 0.039525691699604744), ('is', 0.028985507246376812), ('was', 0.026350461133069828), ('1', 0.025032938076416336), ('ho', 0.023715415019762844), ('lie', 0.01844532279314888), ('be', 0.010540184453227932), ('you', 0.010540184453227932), ('were', 0.005270092226613966), ('time', 0.005270092226613966), ('have', 0.005270092226613966), ('all', 0.003952569169960474), ('evening', 0.003952569169960474), ('whenever\\\\nshe', 0.003952569169960474), ('Americans', 0.003952569169960474), ('one', 0.003952569169960474)], [], [('and', 1.0)], [], [], [], [('great', 0.5), ('gay', 0.25), ('fanatic,', 0.25)], [], [('would', 0.6), ('they', 0.2), ('mean?\\\\nI', 0.2)], [], [], [], [], [], [], [], [], [], [], [], [('world,', 0.005109339873288371), ('time', 0.004291845493562232), ('country', 0.004291845493562232), ('city', 0.003883098303699162), ('house', 0.003883098303699162), ('people,', 0.0032699775189045576), ('city,', 0.0030656039239730227), ('ground', 0.0030656039239730227), ('day', 0.002861230329041488), ('people', 0.002452483139178418), ('world', 0.002452483139178418), ('public,', 0.002452483139178418), ('water', 0.002248109544246883), ('ground,', 0.002248109544246883), ('house,', 0.002248109544246883), ('building', 0.002248109544246883), ('man', 0.0020437359493153486), ('other', 0.0020437359493153486), ('State', 0.0020437359493153486), ('blood', 0.0020437359493153486)], [('to', 0.15388454500083182), ('of', 0.15213774746298453), ('in', 0.07927133588421227), ('on', 0.05672932956246881), ('for', 0.04192314090833472), ('that', 0.03942771585426718), ('at', 0.03859590750291133), ('from', 0.035850939943437035), ('with', 0.02537015471635335), ('up', 0.014889369489269673), ('upon', 0.014307103643320579), ('over', 0.013724837797371485), ('into', 0.013059391116286808), ('and', 0.012893029446015638), ('by', 0.011811678589253036), ('through', 0.011395774413575113), ('In', 0.011395774413575113), ('all', 0.01064714689735485), ('out', 0.010563966062219265), ('down', 0.007985360173016137)], []]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# PREDICTION FOR TEST-A\n", "\n", "dataframe = pd.read_csv(directory_test_A, sep='\\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], quoting=csv.QUOTE_NONE)\n", "dataframe = dataframe.replace(r'\\\\r|\\\\n|\\n|\\\\t', ' ', regex=True)\n", "\n", "left_text = dataframe['LeftContext'].apply(lambda l: re.split(r\"\\s+\", l)).to_list()\n", "right_text = dataframe['RightContext'].apply(lambda l: re.split(r\"\\s+\", l)).to_list()\n", "\n", "lines = zip(left_text, right_text)\n", "lines = list(map(lambda l: model.generate_text(l[0], l[1], False), tqdm(lines)))\n", "print(lines[:100])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 7414/7414 [00:00<00:00, 114060.60it/s]\n" ] } ], "source": [ "with open(\"test-A/out.tsv\", \"w\", encoding=\"UTF-8\") as file:\n", " result = \"\\n\".join(list(map(lambda l: convert_predictions(l), tqdm(lines))))\n", " file.write(result)\n", " file.close()" ] } ], "metadata": { "kernelspec": { "display_name": "python11", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }