challenging-america-word-ga.../run3.ipynb

import lzma
import pickle
from collections import Counter

# Denominator for the progress display. Presumably the number of lines in the
# training archive this notebook was written for (TODO confirm); it never
# limits how many lines are actually read.
_DEFAULT_TOTAL_LINES = 432022

# Placeholder substituted for every token that is missing from the vocabulary.
_UNK = 'UNK'


def clean_line(line: str) -> str:
    """Merge the two text columns of one tab-separated row into one string.

    Columns 6 and 7 (zero-based) are taken as the prefix and the suffix --
    presumably the text on either side of the gap to predict. Every literal
    backslash-n sequence (two characters, not a real newline) inside them is
    replaced by a space, and the two parts are joined with a single space.

    Args:
        line: One raw row of the TSV file.

    Returns:
        The prefix and suffix joined by a space. A trailing real newline in
        ``line`` is kept; callers tokenise with ``str.split()``, which drops it.

    Raises:
        IndexError: If the row has fewer than eight tab-separated columns.
    """
    columns = line.split('\t')
    prefix = columns[6].replace(r'\n', ' ')
    suffix = columns[7].replace(r'\n', ' ')
    return f'{prefix} {suffix}'


def _cleaned_lines(filename, label, total_lines):
    """Yield the cleaned text of each row of an xz-compressed TSV file.

    Prints ``label`` once, a carriage-return progress percentage while reading,
    and a final newline once the file is exhausted.
    """
    with lzma.open(filename, mode='rt', encoding='utf-8') as fid:
        print(label)
        shown = None
        for index, line in enumerate(fid, start=1):
            if total_lines:  # skip the display instead of dividing by zero
                percent = f'{index / total_lines * 100:.2f}'
                # Redraw only when the visible value changes; printing once
                # per row floods the notebook output and slows the loop.
                if percent != shown:
                    print(f'\rProgress: {percent}%', end='')
                    shown = percent
            yield clean_line(line)
        print()


def _vocab_tokens(text, V):
    """Split ``text`` on whitespace and replace out-of-vocabulary tokens.

    A token counts as unknown when ``V.get(token)`` is ``None``, so a word
    stored with a falsy value such as ``0`` is still treated as known.
    """
    return [word if V.get(word) is not None else _UNK for word in text.split()]


def _ngrams(tokens, n):
    """Return an iterator over every window of ``n`` consecutive tokens."""
    return zip(*(tokens[offset:] for offset in range(n)))


def words(filename, total_lines=_DEFAULT_TOTAL_LINES):
    """Yield every whitespace-separated token of the file, row by row.

    Args:
        filename: Path of an xz-compressed, UTF-8, tab-separated file.
        total_lines: Expected number of rows, used only for the progress
            percentage. Pass ``0`` or ``None`` to disable the display.

    Yields:
        Each token as a ``str``, in file order.
    """
    for text in _cleaned_lines(filename, 'Words', total_lines):
        yield from text.split()


def bigrams(filename, V: dict, total_lines=_DEFAULT_TOTAL_LINES):
    """Yield ``(first, second)`` pairs of adjacent tokens within each row.

    Tokens missing from ``V`` are replaced by ``'UNK'``. Pairs never cross row
    boundaries, and a row only starts producing pairs at its second token, so
    no pair with an empty first element is emitted (matching how ``trigrams``
    waits for a full window).

    Args:
        filename: Path of an xz-compressed, UTF-8, tab-separated file.
        V: Vocabulary mapping; a token is known when ``V.get(token)`` is not
            ``None``.
        total_lines: Expected number of rows, used only for the progress
            percentage. Pass ``0`` or ``None`` to disable the display.

    Yields:
        2-tuples of ``str``.
    """
    for text in _cleaned_lines(filename, 'Bigrams', total_lines):
        yield from _ngrams(_vocab_tokens(text, V), 2)


def trigrams(filename, V: dict, total_lines=_DEFAULT_TOTAL_LINES):
    """Yield ``(first, second, third)`` triples of adjacent tokens per row.

    Tokens missing from ``V`` are replaced by ``'UNK'``. Triples never cross
    row boundaries; rows with fewer than three tokens produce nothing.

    Args:
        filename: Path of an xz-compressed, UTF-8, tab-separated file.
        V: Vocabulary mapping; a token is known when ``V.get(token)`` is not
            ``None``.
        total_lines: Expected number of rows, used only for the progress
            percentage. Pass ``0`` or ``None`` to disable the display.

    Yields:
        3-tuples of ``str``.
    """
    for text in _cleaned_lines(filename, 'Trigrams', total_lines):
        yield from _ngrams(_vocab_tokens(text, V), 3)
second_word\n","                second_word = third_word\n","                third_word = fourth_w