{ "cells": [ { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import csv\n", "import os\n", "import re\n", "import random\n", "from collections import Counter, defaultdict\n", "import nltk\n", "import math\n", "from tqdm import tqdm" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "directory = \"train/in.tsv.xz\"\n", "directory_dev_0 = \"dev-0/in.tsv.xz\"\n", "directory_test_A = \"test-A/in.tsv.xz\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### MODEL N-GRAM" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "class Model():\n", " \n", " def __init__(self, vocab_size=30_000, UNK_token= '', n=3):\n", " if (n <= 1 or n % 2 == 0):\n", " raise \"change N value !!!\"\n", " self.n = n\n", " self.vocab_size = vocab_size\n", " self.UNK_token = UNK_token\n", " \n", " def train(self, corpus:list) -> None:\n", " if(self.n > 1):\n", " self.n_grams = list(nltk.ngrams(corpus, n=self.n))\n", " else:\n", " self.n_grams = corpus\n", " self.counter = Counter(self.n_grams)\n", " self.words_counter = Counter(corpus)\n", " self.all_quantities = Counter([gram[:math.floor(self.n/2)]+gram[math.ceil(self.n/2):] for gram in self.n_grams])\n", "\n", " self.all_grams = defaultdict(set)\n", "\n", " for gram in tqdm(self.n_grams):\n", " previous_words = tuple(gram[:math.floor(self.n/2)])\n", " next_words = tuple(gram[math.ceil(self.n/2):])\n", " word = gram[math.floor(self.n/2)]\n", " self.all_grams[(previous_words, next_words)].add(word)\n", "\n", " def get_conditional_prob_for_word(self, left_text: list, right_text: list, word: str) -> float:\n", " previous_words = tuple(left_text[-math.floor(self.n/2):])\n", " next_words = tuple(right_text[:math.floor(self.n/2)])\n", " quantity = self.counter[previous_words + tuple([word]) + next_words]\n", " all_quantity = self.all_quantities[previous_words + next_words]\n", " if (all_quantity <= 0):\n", " return 0\n", " return quantity/all_quantity\n", " \n", " def get_prob_for_text(self, text: list) -> float:\n", " prob = 1\n", " for gram in list(nltk.ngrams(text, self.n)):\n", " prob *= self.get_conditional_prob_for_word(gram[:math.floor(self.n/2)], gram[math.ceil(self.n/2):], gram[math.floor(self.n/2)])\n", " return prob\n", " \n", " def most_probable_words(self, left_text: list, right_text: list) -> str:\n", " previous_words = tuple(left_text[-math.floor(self.n/2):])\n", " next_words = tuple(right_text[:math.floor(self.n/2)])\n", " all_words = self.all_grams[(previous_words, next_words)]\n", " best_words = []\n", " for word in all_words:\n", " probability = self.get_conditional_prob_for_word(list(previous_words), list(next_words), word)\n", " best_words.append((word, probability))\n", " return sorted(best_words, key=(lambda l: l[1]), reverse=True)[:20]\n", " \n", " def generate_text(self, text_beggining:list, text_ending:list, greedy: bool) -> list:\n", " words = self.most_probable_words(text_beggining, text_ending)\n", " return words\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DATASET" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['came', 'fiom', 'the', 'last', 'place', 'to', 'thisnplace,', 'and', 'this', 'place', 'is', 'Where', 'WenWere,', 'this', 'is', 'the', 'first', 'road', 'I', 'evernwas', 'on', 'where', 'you', 'can', 'ride', 'elsewherenfrom', 'anywhere', 'and', 'be', 'nowhere.nHe', 'says,', 'while', 'this', 'train', 'stops', 'every-nwhere,', 'it', 'never', 'stops', 'anywhere', 'un-nless', 'its', 'somewhere.', 'Well,', 'I', \"says,nI'm\", 'glad', 'to', 'hear', 'that,', 'but,', 'accord-ning', 'to', 'your', 'figures,', 'I', 'left', 'myselfnwhere', '1', 'was,', 'which', 'is', 'five', 'miles', 'near-ner', 'to', 'myself', 'than', 'I', 'was', 'when', 'wenwere', 'where', 'we', 'are', 'now.nWe', 'have', 'now', 'reached', \"Slidell.nThat's\", 'a', 'fine', 'place.', 'The', 'peoplendown', 'there', 'remind', 'me', 'of', 'bananas-nthey', 'come', 'and', 'go', 'in', 'bunches.', '811-ndell', 'used', 'to', 'be', 'noted']\n" ] } ], "source": [ "dataframeList = pd.read_csv(directory, sep='\\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], escapechar='\\\\', quoting=csv.QUOTE_NONE, chunksize=10000)\n", "\n", "expectedList = pd.read_csv(directory, sep='\\t', header=None, names=['Word'], escapechar='\\\\', quoting=csv.QUOTE_NONE, chunksize=10000)\n", "\n", "DATASET = \"\"\n", "\n", "for number, (dataframe, expected) in enumerate(zip(dataframeList, expectedList)):\n", " left_text = dataframe['LeftContext'].to_list()\n", " right_text = dataframe['RightContext'].to_list()\n", " word = expected['Word'].to_list()\n", "\n", " lines = zip(left_text, word, right_text)\n", " lines = list(map(lambda l: \" \".join(l), lines))\n", " DATASET = DATASET + \" \".join(lines)\n", "\n", "FINAL_DATASET = re.split(r\"\\s+\", DATASET)\n", "print(FINAL_DATASET[:100])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### TRAIN" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 180304236/180304236 [13:57<00:00, 215160.70it/s] \n" ] } ], "source": [ "model_3gram = Model(n = 3)\n", "model_3gram.train(FINAL_DATASET)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "model = model_3gram" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### PREDICTION" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "def convert_predictions(line):\n", " sum_predictions = np.sum([pred[1] for pred in line])\n", " result = \"\"\n", " all_pred = 0\n", " for word, pred in line:\n", " new_pred = math.floor(pred / sum_predictions * 100) / 100\n", " if(new_pred == 1.0):\n", " new_pred = 0.99\n", " all_pred = all_pred + new_pred\n", " result = result + word + \":\" + str(new_pred) + \" \"\n", " if(round(all_pred, 2) < 1):\n", " result = result + \":\" + str(round(1 - all_pred, 2))\n", " else:\n", " result = result + \":\" + str(0.01)\n", " return result" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "10519it [00:31, 330.85it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[[], [('passage', 0.005712530712530713), ('growth', 0.0049754299754299755), ('use,', 0.004545454545454545), ('functions,', 0.003931203931203931), ('successors', 0.0036855036855036856), ('place,', 0.0035626535626535625), ('own,', 0.0031941031941031942), ('own', 0.0031941031941031942), ('head', 0.00300982800982801), ('power', 0.0029484029484029483), ('action,', 0.002764127764127764), ('work', 0.0025798525798525797), ('members', 0.0025184275184275185), ('value,', 0.0025184275184275185), ('value', 0.002334152334152334), ('vicinity,', 0.002334152334152334), ('name', 0.002334152334152334), ('place', 0.0022727272727272726), ('beauty', 0.0022113022113022115), ('strength', 0.0022113022113022115)], [], [], [('undertook', 1.0)], [('a', 0.2926829268292683), ('two', 0.08536585365853659), ('goodnand', 0.07317073170731707), ('him', 0.054878048780487805), ('means', 0.036585365853658534), ('money', 0.03048780487804878), ('all', 0.024390243902439025), ('force', 0.024390243902439025), ('just', 0.018292682926829267), ('capacity', 0.012195121951219513), ('scarcely', 0.012195121951219513), ('stabling', 0.012195121951219513), ('guns', 0.012195121951219513), ('barely', 0.012195121951219513), ('boats', 0.012195121951219513), ('h', 0.012195121951219513), ('amply', 0.012195121951219513), ('decline', 0.012195121951219513), ('capital', 0.012195121951219513), ('u', 0.012195121951219513)], [], [], [], [('as', 0.7727272727272727), ('a3', 0.09090909090909091), ('that', 0.09090909090909091), ('its', 0.045454545454545456)], [], [('the', 0.4133906633906634), ('show', 0.21375921375921375), ('shew', 0.03194103194103194), ('this', 0.03194103194103194), ('tho', 0.0214987714987715), ('our', 0.016584766584766583), ('a', 0.012285012285012284), ('their', 0.009828009828009828), ('that', 0.009213759213759214), ('tbe', 0.009213759213759214), ('ascertainnthe', 0.005528255528255528), ('benthe', 0.004914004914004914), ('learnnthe', 0.004914004914004914), ('any', 0.0042997542997543), ('tlie', 0.0042997542997543), ('thenreal', 0.0036855036855036856), ('his', 0.0036855036855036856), ('what', 0.003071253071253071), ('said', 0.003071253071253071), ('immediately', 0.003071253071253071)], [], [('and', 0.0744047619047619), ('put', 0.03273809523809524), ('be', 0.026785714285714284), ('placed', 0.023809523809523808), ('again', 0.017857142857142856), ('held', 0.01488095238095238), ('engaged', 0.01488095238095238), ('pending', 0.01488095238095238), ('wrapped', 0.011904761904761904), ('started', 0.011904761904761904), ('went', 0.008928571428571428), ('got', 0.008928571428571428), ('?', 0.008928571428571428), ('roll', 0.005952380952380952), ('playing', 0.005952380952380952), ('13SJ4', 0.005952380952380952), ('specialised', 0.005952380952380952), ('anninfant', 0.005952380952380952), ('streamed.', 0.005952380952380952), ('flew', 0.005952380952380952)], [('to', 0.6538461538461539), ('a', 0.09615384615384616), ('exceptnto', 0.07692307692307693), ('¦', 0.038461538461538464), ('efceptnta', 0.038461538461538464), ('world,', 0.019230769230769232), ('.nto', 0.019230769230769232), ('the', 0.019230769230769232), ('and', 0.019230769230769232), ('anyway.n“Then', 0.019230769230769232)], [], [('to', 0.7), ('Almighty', 0.2), ('that', 0.1)], [('Knocked', 1.0)], [], [], [], [('will', 1.0)], [('went', 0.024908077333649626), ('carried', 0.016486774997034753), ('set', 0.014351796939864785), ('find', 0.014233187047799786), ('go', 0.01399596726366979), ('came', 0.013402917803344799), ('carry', 0.01197959909856482), ('pointed', 0.011505159530304827), ('come', 0.010793500177914838), ('put', 0.010556280393784841), ('get', 0.00937018147313486), ('paid', 0.009132961689004864), ('sent', 0.009014351796939865), ('started', 0.00865852212074487), ('brought', 0.007828252876289882), ('took', 0.007709642984224884), ('got', 0.007235203415964892), ('take', 0.006879373739769897), ('laid', 0.006523544063574902), ('worn', 0.006404934171509904)], [('City', 0.04151624548736462), ('city', 0.03790613718411552), ('Bay', 0.02707581227436823), ('and', 0.023465703971119134), ('avenue', 0.010830324909747292), ('andndesiring', 0.010830324909747292), ('Railroad', 0.009025270758122744), ('banks', 0.009025270758122744), ('letter', 0.009025270758122744), ('brought', 0.009025270758122744), ('as', 0.009025270758122744), ('city,', 0.009025270758122744), ('delegation', 0.007220216606498195), ('bound', 0.005415162454873646), ('went', 0.005415162454873646), ('than', 0.005415162454873646), ('is', 0.005415162454873646), ('special', 0.005415162454873646), ('refused', 0.005415162454873646), ('State', 0.005415162454873646)], [], [('that', 0.04383794072009741), ('in', 0.03887523791522526), ('for', 0.0292179677094643), ('at', 0.028466309445838847), ('to', 0.024184934306826025), ('of', 0.022264029855338752), ('on', 0.020962913796665452), ('all', 0.019811250258245164), ('by', 0.01305511721033684), ('with', 0.012905664690083826), ('when', 0.012540824714172056), ('if', 0.010294641248016457), ('as', 0.00950781768550794), ('from', 0.008567145940386028), ('then', 0.006677011125421434), ('after', 0.005661613120173013), ('In', 0.005336334105504688), ('under', 0.005301168806621626), ('upon', 0.00521325555941397), ('also', 0.003780269629929186)], [], [], [], [], [], [], [], [], [('management,', 0.06896551724137931), ('iyiagdown,ndithcult', 0.06896551724137931), ('anynorganization,', 0.06896551724137931), ('progressively,', 0.06896551724137931), ('file', 0.06896551724137931), ('th.nprepari*', 0.06896551724137931), ('bland', 0.06896551724137931), ('rapped,', 0.06896551724137931), ('streets', 0.06896551724137931), ('woodennshutters', 0.06896551724137931), ('her', 0.06896551724137931), ('throughnnegligence', 0.034482758620689655), ('less,', 0.034482758620689655), ('cane', 0.034482758620689655), ('saturday', 0.034482758620689655), ('lode,', 0.034482758620689655), ('wuss', 0.034482758620689655), ('printed', 0.034482758620689655)], [], [], [], [], [], [], [], [('of', 0.896774193548387), ('or', 0.025806451612903226), ('ofnthe', 0.012903225806451613), ('cnof', 0.012903225806451613), ('alnof', 0.012903225806451613), ('ofnrailway', 0.012903225806451613), ('onnthe', 0.0064516129032258064), ('onntho', 0.0064516129032258064), ('ofntbe', 0.0064516129032258064), ('of-', 0.0064516129032258064)], [], [], [], [], [('in', 0.17255434782608695), ('to', 0.12635869565217392), ('of', 0.07744565217391304), ('for', 0.035326086956521736), ('on', 0.035326086956521736), ('In', 0.03125), ('if', 0.025815217391304348), ('interest', 0.019021739130434784), ('from', 0.016304347826086956), ('with', 0.016304347826086956), ('upon', 0.016304347826086956), ('by', 0.014945652173913044), ('at', 0.014945652173913044), ('designatednin', 0.010869565217391304), ('describednIn', 0.009510869565217392), ('all', 0.009510869565217392), ('as', 0.009510869565217392), ('describednin', 0.009510869565217392), ('during', 0.008152173913043478), ('thosenof', 0.008152173913043478)], [], [('tlie', 1.0)], [('impetus', 0.1111111111111111), ('art', 0.1111111111111111), ('deal.nIt', 0.1111111111111111), ('sufferers.nThere', 0.1111111111111111), ('gulf', 0.1111111111111111), ('andnthere', 0.1111111111111111), ('thatnit', 0.05555555555555555), ('ambition', 0.05555555555555555), ('trouble', 0.05555555555555555), ('agitation,nit', 0.05555555555555555), ('object', 0.05555555555555555), ('drawback', 0.05555555555555555)], [], [('ralniali', 1.0)], [], [('of', 0.8458646616541353), ('ol', 0.03383458646616541), ('in', 0.02631578947368421), ('ot', 0.018796992481203006), ('at', 0.011278195488721804), ('cf', 0.011278195488721804), ('ef', 0.007518796992481203), ('address.nAfter', 0.007518796992481203), ('as', 0.007518796992481203), ('collectionnof', 0.007518796992481203), ('revives', 0.0037593984962406013), ('celebrationnof', 0.0037593984962406013), ('warmed', 0.0037593984962406013), ('on', 0.0037593984962406013), ('and', 0.0037593984962406013), ('marked', 0.0037593984962406013)], [], [], [], [], [('city', 0.007010434854267423), ('people', 0.006996914867864666), ('sum', 0.006725227522056892), ('purpose', 0.006255890851218344), ('office', 0.006244302291444553), ('use', 0.006098801485395839), ('time', 0.005581822957709482), ('part', 0.005461430697837317), ('end', 0.0053217241716755), ('name', 0.004863976060610743), ('State', 0.004797663746349603), ('place', 0.004708818121417204), ('amount', 0.004601945847947795), ('hands', 0.004257508099115664), ('rate', 0.004044407361053168), ('City', 0.0039729445757814555), ('payment', 0.003918220821294107), ('date', 0.003868003728941012), ('day', 0.003817786636587916), ('cost', 0.0036220687381861073)], [('is', 0.11022032187184512), ('seems', 0.035186174951006594), ('was', 0.03272165805570402), ('necessary', 0.02419977433339272), ('ought', 0.02057723142704436), ('seemed', 0.01826117940495279), ('comes', 0.016657758774274008), ('came', 0.015232495991448424), ('impossible', 0.010036225429063484), ('appears', 0.00944236593621949), ('possible', 0.009086050240513094), ('up', 0.008254646950531505), ('has', 0.008046796128036106), ('occurred', 0.007809252330898509), ('appearing', 0.007066927964843518), ('began', 0.006799691193063721), ('difficult', 0.00602767385236653), ('had', 0.005938594928439931), ('over', 0.005908901953797731), ('Is', 0.005879208979155532)], [], [('the', 0.5955223880597015), ('a', 0.062437810945273634), ('thencourt', 0.04900497512437811), ('his', 0.022885572139303482), ('tho', 0.018407960199004977), ('this', 0.014925373134328358), ('either', 0.010447761194029851), ('each', 0.009950248756218905), ('her', 0.007213930348258706), ('tbe', 0.006716417910447761), ('my', 0.006467661691542288), ('that', 0.005970149253731343), ('said', 0.005472636815920398), ('boarding', 0.00472636815920398), ('our', 0.004477611940298508), ('thenCourt', 0.003980099502487562), ('every', 0.003233830845771144), ('your', 0.002736318407960199), ('thecourt', 0.002736318407960199), ('packing', 0.002736318407960199)], [('be', 0.6774193548387096), ('he', 0.06451612903225806), ('annoccasional', 0.06451612903225806), ('hurrynthe', 0.06451612903225806), ('b', 0.03225806451612903), ('say,nwere', 0.03225806451612903), ('1', 0.03225806451612903), ('bo', 0.03225806451612903)], [('his', 0.30180180180180183), ('her', 0.13963963963963963), ('little', 0.06756756756756757), ('a', 0.06306306306306306), ('one', 0.05855855855855856), ('their', 0.05405405405405406), ('wife,', 0.036036036036036036), ('bis', 0.02252252252252252), ('loving', 0.013513513513513514), ('hisnblackmailing', 0.009009009009009009), ('gallant', 0.009009009009009009), ('only', 0.009009009009009009), ('hisnadopted', 0.009009009009009009), ('learned', 0.009009009009009009), ('bib', 0.009009009009009009), ('hla', 0.009009009009009009), ('Infant', 0.009009009009009009), ('bor', 0.009009009009009009), ('hitnillustrious', 0.009009009009009009), ('interesting', 0.009009009009009009)], [('null!', 1.0)], [], [], [('little', 0.11363636363636363), ('summer', 0.09090909090909091), ('small', 0.03409090909090909), ('neat', 0.022727272727272728), ('pleasant', 0.022727272727272728), ('white', 0.022727272727272728), ('pretty-nlittle', 0.022727272727272728), ('beautiful', 0.022727272727272728), ('prettynlittle', 0.022727272727272728), (\"ploughman's\", 0.022727272727272728), ('rest', 0.022727272727272728), (\"collier's\", 0.022727272727272728), ('drab', 0.022727272727272728), ('charming', 0.022727272727272728), ('five-room', 0.022727272727272728), ('thatched', 0.022727272727272728), ('eesy', 0.022727272727272728), ('clay', 0.022727272727272728), ('nice', 0.022727272727272728), ('modest', 0.022727272727272728)], [('way', 0.041432534319652894), ('duty', 0.028842734191621026), ('power', 0.015008179813642506), ('return', 0.011273917063802546), ('right', 0.01120278824951988), ('mind', 0.010882708585247884), ('efforts', 0.010882708585247884), ('name', 0.009958033999573228), ('attention', 0.009744647556725229), ('wife', 0.009282310263887901), ('ability', 0.009068923821039904), ('intention', 0.008997795006757237), ('time', 0.007290703463973256), ('hand', 0.007184010242549257), ('wife,', 0.006899494985418593), ('desire', 0.006686108542570596), ('life', 0.006330464471157266), ('family', 0.005974820399743936), ('friends', 0.005868127178319938), ('letter', 0.005512483106906608)], [('husband', 0.016717967288309495), ('husband,', 0.01062224636369123), ('head', 0.010380831673607338), ('father', 0.010079063311002475), ('mother', 0.008208099462852314), ('hands', 0.007182087029995775), ('eyes', 0.0063371356147021544), ('life', 0.0062767819421811815), ('home', 0.0056128915444504796), ('father,', 0.005552537871929507), ('up', 0.005431830526887561), ('face', 0.005431830526887561), ('life,', 0.0050093548192407505), ('arms', 0.004828293801677832), ('head,', 0.004828293801677832), ('hand', 0.004647232784114913), ('eyes,', 0.00458687911159394), ('heart', 0.004043696058905184), ('own,', 0.003983342386384212), ('room', 0.003983342386384212)], [('person', 0.013726525378987857), ('amount', 0.013500263971641904), ('land', 0.007089524096839882), ('property', 0.006259898936571385), ('blood', 0.005882796590994796), ('more', 0.005053171430726299), ('money', 0.003997284863111849), ('world', 0.0037710234557658947), ('time', 0.0037710234557658947), ('people', 0.003620182517535259), ('law', 0.003544762048419941), ('ground', 0.0034693415793046233), ('sum', 0.0033939211101893054), ('lands', 0.0033939211101893054), ('debt', 0.0033185006410739876), ('country,', 0.003092239233728034), ('work', 0.00286597782638208), ('country', 0.0026397164190361264), ('party', 0.0025642959499208085), ('other,', 0.0025642959499208085)], [], [('time', 0.08571428571428572), ('is', 0.05714285714285714), ('as', 0.05714285714285714), ('ones,', 0.05714285714285714), ('girl,', 0.05714285714285714), ('gossip', 0.05714285714285714), ('lady,', 0.05714285714285714), ('which', 0.05714285714285714), ('relief', 0.05714285714285714), ('cor]nral,\"', 0.05714285714285714), ('child,', 0.05714285714285714), ('im-nportance,', 0.02857142857142857), ('benefitnhas', 0.02857142857142857), ('city,', 0.02857142857142857), ('ones', 0.02857142857142857), ('daughter,', 0.02857142857142857), ('onee,', 0.02857142857142857), ('masonnwork', 0.02857142857142857), ('concern,', 0.02857142857142857), ('fellow,', 0.02857142857142857)], [], [], [('one', 0.034703683929524824), ('spoke', 0.03363587827015483), ('all', 0.022423918846769888), ('out', 0.018152696209289908), ('despaired', 0.013881473571809931), ('territory', 0.013347570742124934), ('President', 0.01014415376401495), ('Secretary', 0.01014415376401495), ('secretary', 0.009610250934329953), ('told', 0.008542445274959957), ('day', 0.00800854244527496), ('treasurer', 0.007474639615589963), ('president', 0.007474639615589963), ('instead', 0.0069407367859049655), ('think', 0.0069407367859049655), ('some', 0.0069407367859049655), ('thenwhole', 0.006406833956219968), ('owner', 0.006406833956219968), ('sheriff', 0.005339028296849973), ('half', 0.004271222637479978)], [], [('owner', 0.009700253430044568), ('person', 0.006973695709167177), ('city', 0.006239622476623263), ('same', 0.005855107926243118), ('same,', 0.005190946430131959), ('purchaser', 0.00473652014331906), ('State', 0.0040898365813160885), ('whole', 0.003932535174342393), ('state', 0.0036179323603950015), ('time', 0.0034606309534213056), ('party', 0.003180984007690291), ('day', 0.002796469457310146), ('man', 0.002761513589093769), ('county', 0.0027440356549855807), ('people', 0.0026216901162282617), ('right', 0.002464388709254566), ('town', 0.002254653499956305), ('point', 0.002254653499956305), ('court', 0.0020623962247662327), ('land', 0.0020099624224416673)], [('Btrlp', 1.0)], [('number', 0.008571428571428572), ('amount', 0.00816326530612245), ('names', 0.00816326530612245), ('cost', 0.008027210884353741), ('members', 0.007346938775510204), ('rest', 0.006802721088435374), ('people', 0.006394557823129252), ('work', 0.006394557823129252), ('other', 0.004761904761904762), ('destruction', 0.004625850340136055), ('protection', 0.004489795918367347), ('laws', 0.004081632653061225), ('sum', 0.004081632653061225), ('act', 0.0038095238095238095), ('costs', 0.003673469387755102), ('character', 0.003673469387755102), ('result', 0.0035374149659863946), ('President', 0.0035374149659863946), ('necessity', 0.0035374149659863946), ('power', 0.0035374149659863946)], [('the', 0.8815144083384426), ('tho', 0.027896995708154508), ('tbe', 0.015481299816063764), ('that', 0.006897608828939301), ('this', 0.005824647455548743), ('tne', 0.0030656039239730227), ('thc', 0.001992642550582465), ('tlie', 0.0018393623543838135), ('tiie', 0.0015328019619865114), ('ihe', 0.0015328019619865114), ('these', 0.0015328019619865114), ('muchnthe', 0.0013795217657878603), ('whichnthe', 0.001226241569589209), ('aboutnthe', 0.001226241569589209), ('preciselynthe', 0.001226241569589209), ('tile', 0.001072961373390558), ('justnthe', 0.0009196811771919068), ('nearlynthe', 0.0009196811771919068), ('Hie', 0.0009196811771919068), ('the.', 0.0009196811771919068)], [], [], [('it.', 0.04918032786885246), ('manner.', 0.030054644808743168), ('this.', 0.0273224043715847), ('satin.', 0.01639344262295082), ('that.', 0.01639344262295082), ('thunder.', 0.01366120218579235), ('children.', 0.01366120218579235), ('this:', 0.01366120218579235), ('results.', 0.01092896174863388), ('anneggshell.', 0.01092896174863388), ('nature.', 0.01092896174863388), ('it', 0.01092896174863388), ('mad.', 0.01092896174863388), ('It.', 0.00819672131147541), ('thenpresent.', 0.00819672131147541), ('taffy.', 0.00819672131147541), ('him.', 0.00819672131147541), ('information.', 0.00546448087431694), ('asthma.', 0.00546448087431694), ('thenrest.', 0.00546448087431694)], [('and', 1.0)], [], [], [('tmcle.', 0.25), ('at', 0.25), ('people', 0.25), (\"husband'Bndeath.\", 0.25)], [], [], [('of', 0.7333333333333333), ('af', 0.13333333333333333), ('»f', 0.13333333333333333)], [('yet', 0.2222222222222222), ('sufficient,', 0.2222222222222222), ('not', 0.2222222222222222), ('left', 0.2222222222222222), ('obtained', 0.1111111111111111)], [('and', 0.20798319327731093), ('procured,', 0.037815126050420166), ('receivednand', 0.025210084033613446), ('or', 0.02100840336134454), ('andnstill', 0.014705882352941176), ('made', 0.014705882352941176), ('killed', 0.01050420168067227), ('divided', 0.008403361344537815), ('tried', 0.008403361344537815), ('made.nThere', 0.008403361344537815), ('discoverednthat', 0.0063025210084033615), ('made,', 0.0063025210084033615), ('arrestednand', 0.0063025210084033615), ('doingnand', 0.004201680672268907), ('committed', 0.004201680672268907), ('andjiow', 0.004201680672268907), ('inform-ned,', 0.004201680672268907), (\"eaptured.nIn'nntrv\", 0.004201680672268907), ('barred.nWe', 0.004201680672268907), ('tempted.nThere', 0.004201680672268907)], [], [], [], [('the', 0.38461538461538464), ('tbe', 0.15384615384615385), ('offnthe', 0.15384615384615385), ('offnour', 0.15384615384615385), ('our', 0.07692307692307693), ('offthe', 0.07692307692307693)], [], [('connected', 0.65), ('con\\xadnnected', 0.2), ('whichnmingle', 0.1), ('somewhatninterfered', 0.05)]]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# PREDICTION FOR DEV-0\n", "\n", "dataframe = pd.read_csv(directory_dev_0, sep='\\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], escapechar='\\\\', quoting=csv.QUOTE_NONE)\n", "\n", "left_text = dataframe['LeftContext'].apply(lambda l: re.split(r\"\\s+\", l)).to_list()\n", "right_text = dataframe['RightContext'].apply(lambda l: re.split(r\"\\s+\", l)).to_list()\n", "\n", "lines = zip(left_text, right_text)\n", "lines = list(map(lambda l: model.generate_text(l[0], l[1], False), tqdm(lines)))\n", "print(lines[:100])" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 10519/10519 [00:00<00:00, 111905.55it/s]\n" ] } ], "source": [ "with open(\"dev-0/out.tsv\", \"w\", encoding=\"UTF-8\") as file:\n", " result = \"\\n\".join(list(map(lambda l: convert_predictions(l), tqdm(lines))))\n", " file.write(result)\n", " file.close()" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "7414it [00:17, 422.07it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[[], [('home', 0.08333333333333333), ('decline', 0.08333333333333333), ('or', 0.08333333333333333), ('spread', 0.08333333333333333), ('is', 0.08333333333333333), ('numerous', 0.08333333333333333), ('road', 0.08333333333333333), ('owned', 0.08333333333333333), ('resides', 0.08333333333333333), ('taxes', 0.08333333333333333), ('whitely', 0.08333333333333333), ('water', 0.08333333333333333)], [], [('man', 0.01770717393503997), ('plan', 0.009106546595163412), ('trial', 0.006779318020843873), ('living', 0.00647576646767176), ('statement', 0.005868663361327532), ('law', 0.005868663361327532), ('vote', 0.005261560254983305), ('class', 0.005059192552868562), ('year', 0.00485682485075382), ('sensation', 0.0043509055954669635), ('question', 0.004148537893352221), ('single', 0.004148537893352221), ('bill', 0.0040473540422948494), ('day', 0.0040473540422948494), ('government', 0.0034402509359506223), ('time', 0.0034402509359506223), ('paper', 0.003339067084893251), ('means', 0.002934331680663766), ('speech', 0.002934331680663766), ('way', 0.002934331680663766)], [], [], [], [], [], [], [('on', 0.23076923076923078), ('be', 0.15384615384615385), ('means', 0.15384615384615385), ('about', 0.15384615384615385), ('in', 0.15384615384615385), ('for', 0.07692307692307693), ('Influence', 0.07692307692307693)], [], [('days', 0.06546854942233633), ('minutes', 0.051347881899871634), ('years', 0.03979460847240052), ('moments', 0.03465982028241335), ('weeks', 0.029525032092426188), ('months', 0.026957637997432605), ('cents', 0.01797175866495507), ('at', 0.01668806161745828), ('years,', 0.01540436456996149), ('yearsnago', 0.014120667522464698), ('days,', 0.012836970474967908), ('day', 0.011553273427471117), ('dollars', 0.011553273427471117), ('furnish', 0.010269576379974325), ('months,', 0.010269576379974325), ('in', 0.008985879332477536), ('daysnago', 0.008985879332477536), ('hours', 0.008985879332477536), ('weeksnago', 0.007702182284980745), ('with', 0.006418485237483954)], [], [], [('from', 1.0)], [('which', 0.4), (\"Radway's,\", 0.13333333333333333), ('mules,', 0.13333333333333333), ('interest;', 0.13333333333333333), ('each,', 0.13333333333333333), ('Wednesday', 0.06666666666666667)], [], [('little', 0.16510903426791276), ('whole', 0.04361370716510903), ('beautiful', 0.04361370716510903), ('neighboring', 0.028037383177570093), ('townnor', 0.024922118380062305), ('nearest', 0.024922118380062305), ('agricultural', 0.018691588785046728), ('said', 0.012461059190031152), ('next', 0.012461059190031152), ('Maine', 0.012461059190031152), ('British', 0.012461059190031152), ('present', 0.009345794392523364), ('ancient', 0.009345794392523364), ('incorporated', 0.009345794392523364), ('thriving', 0.009345794392523364), ('small', 0.009345794392523364), ('obscure', 0.009345794392523364), ('city,', 0.009345794392523364), ('Japanese', 0.009345794392523364), ('States,nthe', 0.006230529595015576)], [], [('weakling', 1.0)], [], [('inches', 0.21926910299003322), ('feet', 0.07973421926910298), ('poles', 0.046511627906976744), ('chains', 0.04318936877076412), ('Inches', 0.029900332225913623), ('in.)', 0.019933554817275746), ('years', 0.019933554817275746), ('links', 0.016611295681063124), ('.00', 0.016611295681063124), ('miles', 0.013289036544850499), ('perches', 0.013289036544850499), ('.25', 0.013289036544850499), ('inch-nes', 0.013289036544850499), ('.50', 0.013289036544850499), ('chs,', 0.013289036544850499), ('.', 0.013289036544850499), ('00', 0.009966777408637873), ('feet,', 0.009966777408637873), ('inchee', 0.009966777408637873), ('10nper.', 0.006644518272425249)], [], [], [], [('puzzling', 0.1111111111111111), ('sufficient', 0.1111111111111111), ('strong', 0.1111111111111111), ('brought', 0.1111111111111111), ('-signed', 0.1111111111111111), ('preparatory', 0.1111111111111111), ('taken', 0.1111111111111111), ('obliged', 0.1111111111111111), ('enough', 0.1111111111111111)], [], [('it', 0.2631578947368421), ('than', 0.15789473684210525), ('that', 0.13157894736842105), ('this', 0.10526315789473684), ('1nthan', 0.05263157894736842), ('allnthat', 0.05263157894736842), ('It', 0.02631578947368421), ('there', 0.02631578947368421), ('t)an', 0.02631578947368421), ('finit', 0.02631578947368421), ('he', 0.02631578947368421), ('what', 0.02631578947368421), ('cheerfulness', 0.02631578947368421), ('theology.nIt', 0.02631578947368421), ('but', 0.02631578947368421)], [('justice', 1.0)], [], [('running', 1.0)], [], [], [], [], [('said', 0.058110627719080175), ('seen', 0.05382224984462399), ('remembered', 0.041516469857054074), ('hoped', 0.0326911124922312), ('found', 0.023306401491609695), ('expected', 0.01591050341827222), ('sure', 0.014418893722809198), ('denied', 0.013983840894965818), ('told', 0.011746426351771286), ('admitted', 0.010938471100062151), ('stated', 0.010441267868241143), ('shown', 0.010316967060285892), ('supposed', 0.010006215040397762), ('forgotten', 0.009944064636420136), ('assured', 0.00975761342448726), ('regretted', 0.00944686140459913), ('understood', 0.009073958980733375), ('observed', 0.00882535736482287), ('noted', 0.008763206960845246), ('true', 0.008203853325046613)], [('arisencomplications', 1.0)], [], [('be', 0.04782608695652174), ('sell', 0.030434782608695653), ('work', 0.01956521739130435), ('bo', 0.017391304347826087), ('look', 0.017391304347826087), ('meet', 0.013043478260869565), ('appear', 0.013043478260869565), ('redemption', 0.013043478260869565), ('get', 0.013043478260869565), ('stop', 0.013043478260869565), ('sit', 0.010869565217391304), ('order', 0.008695652173913044), ('return', 0.008695652173913044), ('Cuba', 0.008695652173913044), ('It', 0.008695652173913044), ('me', 0.008695652173913044), ('go', 0.006521739130434782), ('voto', 0.006521739130434782), ('come', 0.006521739130434782), ('keepnthem', 0.006521739130434782)], [], [('cross\\xadn', 1.0)], [('of', 1.0)], [], [], [('of', 0.8988101291569206), ('ot', 0.009254551001728872), ('in', 0.007932472287196176), ('to', 0.006813790297976203), ('ol', 0.005084918132818062), ('and', 0.004881521407505339), ('or', 0.003966236143598088), ('from', 0.003864537780941727), ('on', 0.0037628394182853656), ('for', 0.0031526492423471983), ('at', 0.002644157429065392), ('cf', 0.002339062341096308), ('that', 0.0015254754398454184), ('oi', 0.0014237770771890572), ('as', 0.0011186819892199736), ('during', 0.0008135869012508898), ('into', 0.0008135869012508898), ('upon', 0.0008135869012508898), ('by', 0.0007118885385945286), ('throughout', 0.0007118885385945286)], [], [], [('“pale-nfaces.”', 0.2), ('schools', 0.2), ('purchase^.', 0.2), ('surface.', 0.2), ('valley.', 0.2)], [('SGO', 0.2857142857142857), ('$Srt.O(iO', 0.2857142857142857), ('$2,600', 0.14285714285714285), ('twice', 0.14285714285714285), ('815,000', 0.14285714285714285)], [], [], [], [('to', 0.9333333333333333), ('them,nthough', 0.02666666666666667), ('to?', 0.02666666666666667), ('itnIs', 0.013333333333333334)], [], [('is', 0.08333333333333333), ('to', 0.052083333333333336), ('was', 0.046875), ('shows', 0.046875), ('of', 0.03125), ('showed', 0.03125), ('showing', 0.026041666666666668), ('says', 0.026041666666666668), ('that', 0.020833333333333332), ('made', 0.020833333333333332), ('Is', 0.020833333333333332), ('by', 0.015625), ('wasnmade', 0.015625), ('as', 0.015625), ('beingnmade', 0.010416666666666666), ('reads,', 0.010416666666666666), ('ofnreceipts', 0.010416666666666666), ('ehowa', 0.010416666666666666), ('off', 0.010416666666666666), ('said', 0.010416666666666666)], [], [('reaeel-ownera', 1.0)], [], [('ofnglass', 0.2857142857142857), ('unerjoycd', 0.2857142857142857), ('£Sl', 0.14285714285714285), ('73.', 0.14285714285714285), ('338,', 0.14285714285714285)], [], [], [], [], [('is', 0.609271523178808), ('was', 0.16556291390728478), ('all', 0.039735099337748346), ('wouldnbe', 0.019867549668874173), ('Is', 0.019867549668874173), ('isnolso', 0.013245033112582781), ('wasnthought', 0.013245033112582781), ('wasnconsidered', 0.013245033112582781), ('wasndeemed', 0.013245033112582781), ('seems', 0.013245033112582781), ('isnu', 0.013245033112582781), ('i', 0.006622516556291391), ('watndeemed', 0.006622516556291391), ('ia', 0.006622516556291391), ('certainlynseems', 0.006622516556291391), ('it', 0.006622516556291391), ('whol-nly', 0.006622516556291391), ('makent', 0.006622516556291391), ('creatednmore', 0.006622516556291391), ('lantherefore', 0.006622516556291391)], [], [('is', 0.13545816733067728), ('was', 0.08764940239043825), ('up', 0.04780876494023904), ('out', 0.04780876494023904), ('back', 0.027888446215139442), ('did', 0.027888446215139442), ('appears', 0.02390438247011952), ('off', 0.01593625498007968), ('has', 0.01593625498007968), ('stood', 0.01195219123505976), ('appeared', 0.01195219123505976), ('and', 0.01195219123505976), ('lies', 0.01195219123505976), ('were,', 0.01195219123505976), ('caught', 0.01195219123505976), ('stands', 0.01195219123505976), ('makes', 0.00796812749003984), ('now', 0.00796812749003984), ('accumulating', 0.00796812749003984), ('flew', 0.00796812749003984)], [('be', 0.5), ('show', 0.25), ('remain', 0.125), ('honiotisidereri', 0.125)], [], [], [], [], [], [('»nd', 0.3333333333333333), ('nnd', 0.3333333333333333), ('and', 0.3333333333333333)], [], [], [], [('he', 0.33043478260869563), ('I', 0.17565217391304347), ('she', 0.06260869565217392), ('they', 0.04), ('we', 0.034782608695652174), ('is', 0.029565217391304348), ('1', 0.02434782608695652), ('was', 0.022608695652173914), ('lie', 0.020869565217391306), ('ho', 0.019130434782608695), ('be', 0.01217391304347826), ('whenevernshe', 0.010434782608695653), ('bonnetnand', 0.006956521739130435), ('willnbe', 0.006956521739130435), ('you', 0.006956521739130435), ('time', 0.0052173913043478265), ('were', 0.0052173913043478265), ('henhas', 0.0052173913043478265), ('theynnever', 0.0034782608695652175), ('havennot', 0.0034782608695652175)], [], [('and', 1.0)], [], [('have', 1.0)], [], [('great', 0.6666666666666666), ('gay', 0.3333333333333333)], [], [('would', 0.5), ('mean?nI', 0.5)], [], [], [], [('and', 0.05303867403314917), ('placed', 0.04143646408839779), ('put', 0.03370165745856354), ('be', 0.03149171270718232), ('pending', 0.013259668508287293), ('only', 0.011049723756906077), ('engaged', 0.010497237569060774), ('stir', 0.009392265193370166), ('came', 0.008839779005524863), ('brought', 0.0077348066298342545), ('called', 0.0071823204419889505), ('su-nperior', 0.0066298342541436465), ('living', 0.0066298342541436465), ('made', 0.0060773480662983425), ('set', 0.0060773480662983425), ('went', 0.0060773480662983425), ('come', 0.0060773480662983425), ('residing', 0.0055248618784530384), ('go', 0.004972375690607734), ('followed', 0.004972375690607734)], [], [], [], [], [], [], [], [('time', 0.004418668876001105), ('world,', 0.004418668876001105), ('ground', 0.0038663352665009665), ('house', 0.0035901684617508974), ('country', 0.0035901684617508974), ('public,', 0.0033140016570008283), ('people,', 0.0030378348522507597), ('city', 0.0030378348522507597), ('blood', 0.0027616680475006906), ('sea', 0.0024855012427506215), ('present', 0.0022093344380005524), ('ground,', 0.0022093344380005524), ('church', 0.0022093344380005524), ('first', 0.0022093344380005524), ('same,', 0.0022093344380005524), ('highest', 0.0022093344380005524), ('business', 0.0019331676332504833), ('people', 0.0019331676332504833), ('other', 0.0019331676332504833), ('city,', 0.0019331676332504833)], [('of', 0.4166666666666667), ('on', 0.16666666666666666), ('at', 0.1111111111111111), ('in', 0.1111111111111111), ('out', 0.08333333333333333), ('among', 0.05555555555555555), ('for', 0.027777777777777776), ('towards', 0.027777777777777776)], []]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "# PREDICTION FOR TEST-A\n", "\n", "dataframe = pd.read_csv(directory_test_A, sep='\\t', header=None, names=['FileId', 'Year', 'LeftContext', 'RightContext'], escapechar='\\\\', quoting=csv.QUOTE_NONE)\n", "\n", "left_text = dataframe['LeftContext'].apply(lambda l: re.split(r\"\\s+\", l)).to_list()\n", "right_text = dataframe['RightContext'].apply(lambda l: re.split(r\"\\s+\", l)).to_list()\n", "\n", "lines = zip(left_text, right_text)\n", "lines = list(map(lambda l: model.generate_text(l[0], l[1], False), tqdm(lines)))\n", "print(lines[:100])" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 7414/7414 [00:00<00:00, 128642.81it/s]\n" ] } ], "source": [ "with open(\"test-A/out.tsv\", \"w\", encoding=\"UTF-8\") as file:\n", " result = \"\\n\".join(list(map(lambda l: convert_predictions(l), tqdm(lines))))\n", " file.write(result)\n", " file.close()" ] } ], "metadata": { "kernelspec": { "display_name": "python11", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }