challenging-america-word-ga.../run3.ipynb
2023-05-26 19:20:02 +02:00

2 lines
8.8 KiB
Plaintext

# challenging-america-word-gap/run3.ipynb — notebook code as a linear script.
#
# Tetragram language model with linear interpolation for the word-gap task:
# builds unigram..tetragram counts from train/in.tsv.xz, then for each row of
# dev-0 / test-A scores every vocabulary word as the missing word between the
# left and right contexts and writes the top-5 normalized guesses to out.tsv.

# %% Colab setup
from google.colab import drive
drive.mount('/content/drive')

# %% Configuration
DATA_DIR = '/content/drive/MyDrive/'

# %% Imports and model code
import lzma
import os
import pickle
from collections import Counter

TOTAL_LINES = 432022  # line count of train/in.tsv.xz; used only for the progress display
WORD_LIMIT = 3000     # vocabulary size; rarer words are pooled under 'UNK'


def clean_line(line: str) -> str:
    """Extract the text around the gap from one TSV row.

    Column 6 is the left context and column 7 the right context; literal
    '\\n' markers inside them are flattened to spaces.
    """
    columns = line.split('\t')
    prefix = columns[6].replace(r'\n', ' ')
    suffix = columns[7].replace(r'\n', ' ')
    return prefix + ' ' + suffix


def words(filename):
    """Yield every whitespace-separated token of the training corpus."""
    with lzma.open(filename, mode='rt', encoding='utf-8') as fid:
        print('Words')
        for index, line in enumerate(fid, start=1):
            # ':.2f' — the original ':2f' was a width spec and printed 6 decimals.
            print(f'\rProgress: {index / TOTAL_LINES * 100:.2f}%', end='')
            for word in clean_line(line).split():
                yield word
        print()


def ngrams(filename, V: dict, n: int, label: str):
    """Yield n-grams (tuples of ``n`` tokens) over the training corpus.

    Tokens absent from the vocabulary ``V`` are replaced with 'UNK'.  The
    sliding window resets at every line, and only full windows are yielded
    (no padding).  This matches the original trigram/tetragram generators;
    the original bigram generator also emitted a spurious ('', w) pair at
    the start of every line because it tested the wrong variable — that is
    fixed here by requiring a full window.
    """
    with lzma.open(filename, mode='rt', encoding='utf-8') as fid:
        print(label)
        for index, line in enumerate(fid, start=1):
            print(f'\rProgress: {index / TOTAL_LINES * 100:.2f}%', end='')
            window = []  # per-line window: n-grams never span line boundaries
            for word in clean_line(line).split():
                if V.get(word) is None:
                    word = 'UNK'
                window.append(word)
                if len(window) == n:
                    yield tuple(window)
                    window.pop(0)
        print()


def bigrams(filename, V: dict):
    """Backward-compatible wrapper: bigrams of the corpus (see ``ngrams``)."""
    return ngrams(filename, V, 2, 'Bigrams')


def trigrams(filename, V: dict):
    """Backward-compatible wrapper: trigrams of the corpus (see ``ngrams``)."""
    return ngrams(filename, V, 3, 'Trigrams')


def tetragrams(filename, V: dict):
    """Backward-compatible wrapper: tetragrams of the corpus (see ``ngrams``)."""
    return ngrams(filename, V, 4, 'Tetragrams')


def P(first_word, second_word=None, third_word=None, fourth_word=None):
    """Maximum-likelihood conditional probability of the last given word.

    With one argument: unigram probability.  With k arguments: probability of
    the k-th word given the preceding k-1, estimated from the count tables
    V_common_dict / V2_dict / V3_dict / V4_dict.  Unseen histories or events
    return 0 (the KeyError is the "count is zero" case).
    """
    try:
        if second_word is None:
            return V_common_dict[first_word] / total
        if third_word is None:
            return V2_dict[(first_word, second_word)] / V_common_dict[first_word]
        if fourth_word is None:
            return V3_dict[(first_word, second_word, third_word)] / V2_dict[(first_word, second_word)]
        return (V4_dict[(first_word, second_word, third_word, fourth_word)]
                / V3_dict[(first_word, second_word, third_word)])
    except KeyError:
        return 0


def smoothed(tetragram):
    """Linear interpolation of tetra/tri/bi/unigram estimates (weights sum to 1)."""
    first, second, third, fourth = tetragram
    return (0.45 * P(first, second, third, fourth)
            + 0.30 * P(second, third, fourth)
            + 0.15 * P(third, fourth)
            + 0.10 * P(fourth))


def candidates(left_context, right_context):
    """Score every vocabulary word as the gap filler; return the output line.

    Each candidate is scored by the product of the four smoothed tetragram
    probabilities of the windows it participates in.  The top 5 are
    normalized; the 'UNK' entry (or, failing that, the weakest entry) is
    renamed to the empty token so some mass covers "any other word".
    """
    first, second, third = left_context
    fifth, sixth, seventh = right_context
    scores = {}
    for word in V_common_dict:
        scores[word] = (smoothed((first, second, third, word))
                        * smoothed((second, third, word, fifth))
                        * smoothed((third, word, fifth, sixth))
                        * smoothed((word, fifth, sixth, seventh)))
    top = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:5]
    mass = sum(p for _, p in top)
    if mass == 0.0:
        # No candidate got any probability: fall back to a uniform guess so
        # the output stays well-formed (original raised ZeroDivisionError).
        norm = [(w, 1.0 / len(top)) for w, _ in top] if top else []
    else:
        norm = [(w, p / mass) for w, p in top]
    for i, (w, p) in enumerate(norm):
        if w == 'UNK':
            norm.pop(i)
            norm.append(('', p))
            break
    else:
        # (for/else: no 'UNK' among the top 5; original crashed here with an
        # unbound 'unk' when norm was empty)
        if norm:
            norm[-1] = ('', norm[-1][1])
    return ' '.join(f'{w}:{p}' for w, p in norm)


def create_outputs(folder_name):
    """Write one prediction line per row of ``folder_name``/in.tsv.xz to out.tsv."""
    print(f'Creating outputs in {folder_name}')
    with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid:
        with open(f'{folder_name}/out.tsv', 'w', encoding='utf-8') as f:
            for line in fid:
                columns = line.split('\t')
                prefix = columns[6].replace(r'\n', ' ').split()
                suffix = columns[7].replace(r'\n', ' ').split()
                # Contexts are the 3 words on each side of the gap, OOV → 'UNK'.
                left_context = [x if V_common_dict.get(x) else 'UNK' for x in prefix[-3:]]
                right_context = [x if V_common_dict.get(x) else 'UNK' for x in suffix[:3]]
                f.write(candidates(left_context, right_context) + '\n')


# %% Build or load model artifacts
def _load_or_build(path, build):
    """Load a pickled artifact if present, otherwise build and cache it.

    Replaces the original's manually (un)commented cells, which only worked
    with leftover kernel state: on a fresh run V2_dict and V4_dict were never
    defined and create_outputs crashed with NameError.
    """
    if os.path.exists(path):
        with open(path, 'rb') as handle:
            return pickle.load(handle)
    obj = build()
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return obj


def _build_vocab():
    """Count all words, keep the WORD_LIMIT most common, pool the rest as 'UNK'."""
    V = Counter(words(DATA_DIR + 'train/in.tsv.xz'))
    V_common = dict(V.most_common(WORD_LIMIT))
    V_common['UNK'] = sum(count for word, count in V.items() if word not in V_common)
    return V_common


V_common_dict = _load_or_build('V.pickle', _build_vocab)
total = sum(V_common_dict.values())

V2_dict = _load_or_build(
    'V2.pickle',
    lambda: dict(Counter(bigrams(DATA_DIR + 'train/in.tsv.xz', V_common_dict))))
V3_dict = _load_or_build(
    'V3.pickle',
    lambda: dict(Counter(trigrams(DATA_DIR + 'train/in.tsv.xz', V_common_dict))))
V4_dict = _load_or_build(
    'V4.pickle',
    lambda: dict(Counter(tetragrams(DATA_DIR + 'train/in.tsv.xz', V_common_dict))))

# %% Predict
create_outputs('dev-0')
create_outputs('test-A')