1 line
8.4 KiB
Plaintext
1 line
8.4 KiB
Plaintext
|
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"gpuType":"T4","authorship_tag":"ABX9TyOyO8TJ5Avbq0HurHWP0RHD"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU","gpuClass":"standard"},"cells":[
{"cell_type":"code","source":["from google.colab import drive\n","drive.mount('/content/drive')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"aCzyDBdV2C1y","executionInfo":{"status":"ok","timestamp":1685120042005,"user_tz":-120,"elapsed":2167,"user":{"displayName":"Mateusz Tylka","userId":"07807448241267011566"}},"outputId":"1087c938-f745-4f9f-c5dc-7b8992578a8c"},"execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]}]},
{"cell_type":"code","source":["DATA_DIR = '/content/drive/MyDrive/'"],"metadata":{"id":"oIr5BGuN4NEu","executionInfo":{"status":"ok","timestamp":1685120045295,"user_tz":-120,"elapsed":275,"user":{"displayName":"Mateusz Tylka","userId":"07807448241267011566"}}},"execution_count":2,"outputs":[]},
{"cell_type":"code","execution_count":3,"metadata":{"id":"ZlgSpkzWwpFg","executionInfo":{"status":"ok","timestamp":1685120046787,"user_tz":-120,"elapsed":230,"user":{"displayName":"Mateusz Tylka","userId":"07807448241267011566"}}},"outputs":[],"source":[
"import lzma\n",
"import pickle\n",
"from collections import Counter\n",
"\n",
"# Expected number of lines in the corpus file; it only scales the progress display.\n",
"TOTAL_LINES = 432022\n",
"\n",
"\n",
"def clean_line(line: str):\n",
"    \"\"\"Join columns 6 and 7 of a tab-separated row into one text string.\n",
"\n",
"    Literal backslash-n sequences inside either column are replaced by a\n",
"    space, and the two columns are joined with a single space.\n",
"    Raises IndexError when the row has fewer than eight columns.\n",
"    \"\"\"\n",
"    separated = line.split('\\t')\n",
"    prefix = separated[6].replace(r'\\n', ' ')\n",
"    suffix = separated[7].replace(r'\\n', ' ')\n",
"    return prefix + ' ' + suffix\n",
"\n",
"\n",
"def _print_progress(index, total_lines):\n",
"    \"\"\"Overwrite the current output line with the percentage of lines read.\"\"\"\n",
"    print(f'\\rProgress: {(index / total_lines * 100):.2f}%', end='')\n",
"\n",
"\n",
"def words(filename, total_lines=TOTAL_LINES):\n",
"    \"\"\"Yield every whitespace-separated token of an xz-compressed TSV file.\n",
"\n",
"    Each line is reduced to its text with clean_line() before splitting.\n",
"    A progress percentage relative to total_lines is printed per line.\n",
"    \"\"\"\n",
"    with lzma.open(filename, mode='rt', encoding='utf-8') as fid:\n",
"        print('Words')\n",
"        for index, line in enumerate(fid, start=1):\n",
"            _print_progress(index, total_lines)\n",
"            yield from clean_line(line).split()\n",
"        print()\n",
"\n",
"\n",
"def bigrams(filename, V: dict, total_lines=TOTAL_LINES):\n",
"    \"\"\"Yield (first_word, second_word) pairs of adjacent tokens, line by line.\n",
"\n",
"    Tokens for which V.get(token) is None are replaced by 'UNK'. Pairs never\n",
"    cross a line boundary, and no pair is produced for a token that has no\n",
"    predecessor on its line.\n",
"    \"\"\"\n",
"    with lzma.open(filename, mode='rt', encoding='utf-8') as fid:\n",
"        print('Bigrams')\n",
"        for index, line in enumerate(fid, start=1):\n",
"            _print_progress(index, total_lines)\n",
"            first_word = ''\n",
"            for second_word in clean_line(line).split():\n",
"                if V.get(second_word) is None:\n",
"                    second_word = 'UNK'\n",
"                # Emit only once a predecessor exists on this line; second_word is\n",
"                # never empty here, so it cannot serve as the guard.\n",
"                if first_word:\n",
"                    yield first_word, second_word\n",
"                first_word = second_word\n",
"        print()\n",
"\n",
"\n",
"def trigrams(filename, V: dict, total_lines=TOTAL_LINES):\n",
"    \"\"\"Yield (first_word, second_word, third_word) triples of adjacent tokens.\n",
"\n",
"    Tokens for which V.get(token) is None are replaced by 'UNK'. Triples never\n",
"    cross a line boundary; lines with fewer than three tokens yield nothing.\n",
"    \"\"\"\n",
"    with lzma.open(filename, mode='rt', encoding='utf-8') as fid:\n",
"        print('Trigrams')\n",
"        for index, line in enumerate(fid, start=1):\n",
"            _print_progress(index, total_lines)\n",
"            first_word = ''\n",
"            second_word = ''\n",
"            for third_word in clean_line(line).split():\n",
"                if V.get(third_word) is None:\n",
"                    third_word = 'UNK'\n",
"                # first_word becomes non-empty only once three tokens were read.\n",
"                if first_word:\n",
"                    yield first_word, second_word, third_word\n",
"                first_word = second_word\n",
"                second_word = third_word\n",
"        print()\n",
"\n",
"\n",
"def tetragrams(filename, V: dict):\n",
" with lzma.open(filename, mode='rt', encoding='utf-8') as fid:\n",
" index = 1\n",
" print('Tetragrams')\n",
" for line in fid:\n",
" print(f'\\rProgress: {(index / 432022 * 100):2f}%', end='')\n",
" text = clean_line(line)\n",
" first_word = ''\n",
" second_word = ''\n",
" third_word = ''\n",
" for fourth_word in text.split():\n",
" if V.get(fourth_word) is None:\n","
|