challenging-america-word-ga.../Untitled0.ipynb

1 line
83 KiB
Plaintext
Raw Normal View History

2023-05-10 00:37:23 +02:00
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"authorship_tag":"ABX9TyOyuZJRE5oH0if2B60EHnNm"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU","gpuClass":"standard"},"cells":[{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"fey0MM6ujDTv","executionInfo":{"status":"ok","timestamp":1680689733502,"user_tz":-120,"elapsed":21136,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"8bf5004c-20a5-4949-f0d0-eee93e2f79d0"},"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}],"source":["from google.colab import drive\n","drive.mount('/content/drive')"]},{"cell_type":"code","source":["cd drive/MyDrive"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"cykvdVL5jbTZ","executionInfo":{"status":"ok","timestamp":1680689733503,"user_tz":-120,"elapsed":33,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"ed99eb60-f3a4-455a-fddc-7514b5a641ee"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive\n"]}]},{"cell_type":"code","source":["cd challenging-america-word-gap-prediction/"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"01lVy22fjeik","executionInfo":{"status":"ok","timestamp":1680689733504,"user_tz":-120,"elapsed":24,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"8a4fd9d8-c8d1-481a-82fb-18b582894836"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive/challenging-america-word-gap-prediction\n"]}]},{"cell_type":"code","source":["! pip install lmza"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yZ6TVjdIj2Qd","executionInfo":{"status":"ok","timestamp":1680689734773,"user_tz":-120,"elapsed":1286,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"fd6e0988-6430-4cec-ab4b-6ecbe4259d73"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n","\u001b[31mERROR: Could not find a version that satisfies the requirement lmza (from versions: none)\u001b[0m\u001b[31m\n","\u001b[0m\u001b[31mERROR: No matching distribution found for lmza\u001b[0m\u001b[31m\n","\u001b[0m"]}]},{"cell_type":"code","source":["from collections import Counter"],"metadata":{"id":"PY_GLjeIfA5i"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import lzma"],"metadata":{"id":"adTwEZuPjujM"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["import pickle"],"metadata":{"id":"K7TshO9We-UH"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["rowcount=0\n","for row in lzma.open(\"test-A/in.tsv.xz\"):\n"," rowcount+= 1\n"," #printing the result\n","print(\"Number of lines present:-\", rowcount)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"PhryEzN5juLo","executionInfo":{"status":"ok","timestamp":1680689735909,"user_tz":-120,"elapsed":1144,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"774c9a36-f7c4-4f1d-d4a3-502b87b1eb94"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Number of lines present:- 7414\n"]}]},{"cell_type":"code","source":["with lzma.open('dev-0/in.tsv.xz',mode='rt', encoding='utf-8' ) as f:\n"," with open('dev-0/out.tsv', 'w', newline='\\n') as out:\n"," for line in f.readlines():\n"," sep = line.split('\\t')\n"," print(sep)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"RDOsdvYzkNEg","executionInfo":{"status":"ok","timestamp":1680689742717,"user_tz":-120,"elapsed":6817,"user":{"displayName":"Martyna Drumińska","userId":"13361003509289187965"}},"outputId":"cdd32684-3297-4a0c-9716-1d9f65a87de0"},"execution_count":null,"outputs":[]},{"cell_type":"c