challenging-america-word-ga.../trigram_neural.ipynb

1 line
166 KiB
Plaintext
Raw Normal View History

2023-05-10 00:37:23 +02:00
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"machine_shape":"hm","gpuType":"V100","authorship_tag":"ABX9TyNxnyqaLopDiv3Mni0cDb0b"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU","gpuClass":"standard"},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"id":"8Ov1POzi2VY9"},"outputs":[],"source":[]},{"cell_type":"code","source":["from google.colab import drive\n","drive.mount('/content/drive')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"8SY0Ca7ueoZB","executionInfo":{"status":"ok","timestamp":1683659918042,"user_tz":-120,"elapsed":28484,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"040c3f2c-7da1-49f2-a8a0-75263b2539e8"},"execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}]},{"cell_type":"code","source":[],"metadata":{"id":"X5yAfFNG2ijo"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["cd drive/MyDrive"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"sB_SM-dbe45A","executionInfo":{"status":"ok","timestamp":1683660270671,"user_tz":-120,"elapsed":250,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"19dd8ff3-7788-473b-fbec-1edf2c048f2a"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive\n"]}]},{"cell_type":"code","source":["cd challenging-america-word-gap-prediction/"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"GTzGNc2c2mSy","executionInfo":{"status":"ok","timestamp":1683660271700,"user_tz":-120,"elapsed":4,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"f9d39452-f209-4b22-cd96-237a5a468c0e"},"execution_count":3,"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive/challenging-america-word-gap-prediction\n"]}]},{"cell_type":"code","source":["import itertools\n","import lzma\n","import numpy as np\n","import regex as re\n","import torch\n","import pandas as pd\n","from torch import nn\n","from torch.utils.data import IterableDataset, DataLoader\n","import csv\n","from itertools import islice, chain\n","from torchtext.vocab import build_vocab_from_iterator"],"metadata":{"id":"AQBLNJJH2pfP","executionInfo":{"status":"ok","timestamp":1683662202333,"user_tz":-120,"elapsed":249,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":20,"outputs":[]},{"cell_type":"code","source":["def clean_text(txt):\n"," txt = txt.lower().replace('-\\\\\\\\\\\\\\\\n', '').replace('\\\\\\\\\\\\\\\\n', ' ')\n"," txt = re.sub(r'\\p{P}', '', txt)\n"," txt = txt.replace(\"'t\", \" not\").replace(\"'s\", \" is\").replace(\"'ll\", \" will\").replace(\"'m\", \" am\").replace(\"'ve\", \" have\")\n"," txt = txt.replace(\"\", \"'\")\n"," txt = txt.replace(\" this\\\\nplace\", \"this place\")\n"," txt = txt.replace(\"'we\\\\nwere\", \"we were\")\n"," txt = txt.replace(\"'ever\\\\nwas\", \"ever was\")\n"," txt = txt.replace(\"'making\\\\nsuch\", \"making such\")\n"," txt = txt.replace(\"'boot\\\\nto\", \"boot to\")\n"," txt = txt.replace(\"'elsewhere\\\\nfrom\", \"elsewhere from\")\n"," txt=txt.replace(\"United\\\\nStates\",\"United States\")\n"," txt = txt.replace(\"Unit-\\\\ned\",\"United\" )\n"," txt = txt.replace(\"neigh-\\\\nbors\", \"neighbours\")\n"," txt = txt.replace(\"aver-\\\\nage\", \"average\")\n"," txt = txt.replace(\"people\\\\ndown\", \"people down\")\n"," txt =re.compile(r\"'s|[\\-]|\\-\\\\n|\\p{P}\").sub(\"\", txt)\n"," txt = re.compile(r\"[{}\\[\\]\\&%^$*#\\(\\)@\\t\\n0123456789]+\").sub(\" \", txt)\n","\n"," return txt"],"metadata":{"id":"mGsDsF8a6Amp","executionInfo":{"status":"ok","timestamp":1683660292442,"user_tz":-120,"elapsed":233,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}}},"execution_count":5,"outputs":[]},{"cell_type":"code","source":["device='cuda'"],"metadata"