challenging-america-word-ga.../02.ipynb
2023-04-04 21:22:25 +02:00

444 lines
17 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from itertools import islice"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter\n",
"from collections import OrderedDict\n",
"\n",
"def freq_list(g, top=None):\n",
"    \"\"\"Count the items of an iterable and order them by frequency.\n",
"\n",
"    Args:\n",
"        g: Iterable of hashable items (e.g. a generator of words).\n",
"        top: If given, keep only the `top` most frequent items;\n",
"            ``None`` keeps every distinct item.\n",
"\n",
"    Returns:\n",
"        OrderedDict mapping item -> count, most frequent first. Items with\n",
"        equal counts keep the order in which they were first seen.\n",
"    \"\"\"\n",
"    # most_common() already returns (item, count) pairs sorted by descending\n",
"    # count and treats top=None as 'all items', so no extra sort is needed.\n",
"    return OrderedDict(Counter(g).most_common(top))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# import matplotlib.pyplot as plt\n",
"# from collections import OrderedDict\n",
"\n",
"# def rang_freq_with_labels(name, g, top=None):\n",
"# freq = freq_list(g, top)\n",
"\n",
"# plt.figure(figsize=(12, 3))\n",
"# plt.ylabel('liczba wystąpień')\n",
"\n",
"# plt.bar(freq.keys(), freq.values())\n",
"\n",
"# fname = f'02_Jezyki/{name}.png'\n",
"\n",
"# plt.savefig(fname)\n",
"\n",
"# return fname\n",
"\n",
"# rang_freq_with_labels('pt-chars', get_characters(pan_tadeusz))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# freq_list(get_characters(pan_tadeusz), top=8)\n",
"# list(islice(get_words(pan_tadeusz), 100, 130))"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[\"came fiom the last place to this\\\\nplace, and this place is Where We\\\\nWere, this is the first road I ever\\\\nwas on where you can ride elsewhere\\\\nfrom anywhere and be nowhere.\\\\nHe says, while this train stops every-\\\\nwhere, it never stops anywhere un-\\\\nless its somewhere. Well, I says,\\\\nI'm glad to hear that, but, accord-\\\\ning to your figures, I left myself\\\\nwhere 1 was, which is five miles near-\\\\ner to myself than I was when we\\\\nwere where we are now.\\\\nWe have now reached Slidell.\\\\nThat's a fine place. The people\\\\ndown there remind me of bananas-\\\\nthey come and go in bunches. 811-\\\\ndell used to be noted for her tough\\\\npeople. Now she is noted for be,\\\\ntough steaks. Well, I certainly got\\\\none there. When the waiter brought\\\\nit in it was so small I thought. It\\\\nwas a crack in the plate. I skid,\\\\nwaiter what else have you got? +He\\\\nbrought me in two codfish and one\\\\nsmelt. I said, waiter have you got\\\\npigs feet? He said no, rheumatism\\\\nmakes me walk that way. I sald,\\\\nhow is the pumpkin pie?\",\n",
" \"said\\\\nit's all squash. The best I could get\\\\nin that hotel was a soup sandwich.\\\\nAfter the table battle the waiter and\\\\nI signed an armistice. I then went\\\\nover to the hotel clerk and asked for\\\\na room. He said with or without a\\\\nbed? I said, with a bed. He said,\\\\nI don't think I 'have' a bed long\\\\nenough for you. I said, well, I'll\\\\naddtwo feettoitwhenIgetinit.\\\\nHe gave me a lovely room on the\\\\ntop floor. It was one of those rooms\\\\nthat stands on each side. If you\\\\nhappen to get up in the middle of\\\\nthe night you want to be sure and\\\\nget up in the middle of the room.\\\\nThat night I dreamt I was eating\\\\nflannel cakes. When I woke up half\\\\nof the blanket was gone. I must\\\\nhave got up on the wrong side of the\\\\nbed, for next morning I had an awful\\\\nheadache. I told the manager about\\\\nit. He said, you have rheumatic\\\\npains. I said, no, I think it is on,\\\\nof those attic room pains. I nad to\\\\ngetupat5a.m.inthemorningso\\\\nthey could use the sheet to set the\\\\nbreakfast table.\\n\"]"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import lzma\n",
"\n",
"# Development sample: fields 6 and 7 of the first 101 rows of the training\n",
"# file. The following preprocessing cell reads `korpus`, so this cell must run.\n",
"with lzma.open('train/in.tsv.xz', mode='rt', encoding='utf-8') as file:\n",
"    korpus = [line.split('\\t')[6:8] for line in islice(file, 101)]\n",
"korpus[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Preprocessing: replace newlines with spaces"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[\"All the country between Puget Sound and the l'acifia ocean ia an unknown, unexplored and nnsnnreyed wilderness; that ought to tempt brave and adventur- ous spirits with the pros poet of an un- broken wilderness, abounding in big »ame and trout. Fish stories have been told me by anglers who have dropped their yellow hackle flies in those hidden cresks, that would make Munchausen shake hla head, but one can not afford to duubt the word of any one who goes into the heart of a real wdderneea for his truut. It is known, though, and proved by a whole picnic party of witnesses that one skilled fly fishermen brought bank sixty sprookled brook trout after an ah Sencs of leas than three hours, lie erawled through the nnderbrnsh aud crept along Ohlmionm Creek, and brought all bis fish back to prove\",\n",
" \" talo. A fishing party went over to Whntooa Lake, on the east shore of the Sound a week ago. They were provided with all essentials fors three days' camp. They told wonderful storu s beforehand how the trout wore erowded in that lake. Inquiry since their return shows thst two days were epeat cruising among t tie eastern shore, and neither d*h nor fish stories prove that thsy did any angling. Trolling for aalmon is a good excuse for spending idle daya on the water, and sailing aad tacking over three enchant- ed waters with a string astern, is the happteet, lasieet kind of way to kill time, Lone fishermen go oat at midnight, and at the turn of the tide in the dark honra the torn cod bite fast anywhere from the end of the wharf to the end of the Sound, \"]"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Flatten the two text fields of every row: real newlines and the literal\n",
"# two-character sequence backslash-n are each replaced by a single space.\n",
"korpus = [\n",
"    [field.replace('\\n', ' ').replace('\\\\n', ' ') for field in (row[0], row[1])]\n",
"    for row in korpus\n",
"]\n",
"korpus[-1]  # preview the last cleaned row"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"from itertools import islice\n",
"import regex as re\n",
"import lzma\n",
"\n",
"# korpus = []\n",
"# with lzma.open('train/in.tsv.xz', mode='rt') as file:\n",
"# for idx, line in enumerate(file):\n",
"# line.split('\\t')[6:8]\n",
"\n",
"def get_words(path='train/in.tsv.xz'):\n",
"    \"\"\"Lazily yield every word token from fields 6 and 7 of each row.\n",
"\n",
"    Args:\n",
"        path: xz-compressed, tab-separated text file to read.\n",
"\n",
"    Yields:\n",
"        str: maximal runs of Unicode letters, ASCII digits and '*'.\n",
"    \"\"\"\n",
"    with lzma.open(path, mode='rt', encoding='utf-8') as file:\n",
"        for line in file:\n",
"            # Slice the current row; rows with fewer fields simply yield less.\n",
"            for field in line.split('\\t')[6:8]:\n",
"                # Real newlines and the literal backslash-n sequence both\n",
"                # become spaces so they act as token separators.\n",
"                text = field.replace('\\n', ' ').replace('\\\\n', ' ')\n",
"                # \\p{L} needs the `regex` package, imported as `re` in this cell.\n",
"                for m in re.finditer(r'[\\p{L}0-9\\*]+', text):\n",
"                    yield m.group(0)"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [],
"source": [
"# get_words() is a generator: `ver` is lazy and can be iterated only once.\n",
"ver = get_words()"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'this'"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# next(ver)"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"OrderedDict([('the', 8208418),\n",
" ('of', 4320220),\n",
" ('and', 3888198),\n",
" ('to', 2592132),\n",
" ('that', 2160110),\n",
" ('a', 2160110),\n",
" ('in', 1728088),\n",
" ('Sound', 1296066),\n",
" ('an', 1296066),\n",
" ('with', 1296066),\n",
" ('trout', 1296066),\n",
" ('one', 1296066),\n",
" ('for', 1296066),\n",
" ('is', 1296066),\n",
" ('three', 1296066),\n",
" ('wilderness', 864044),\n",
" ('stories', 864044),\n",
" ('have', 864044),\n",
" ('told', 864044),\n",
" ('by', 864044),\n",
" ('who', 864044),\n",
" ('their', 864044),\n",
" ('any', 864044),\n",
" ('party', 864044),\n",
" ('fishermen', 864044),\n",
" ('brought', 864044),\n",
" ('all', 864044),\n",
" ('fish', 864044),\n",
" ('prove', 864044),\n",
" ('over', 864044),\n",
" ('on', 864044),\n",
" ('shore', 864044),\n",
" ('They', 864044),\n",
" ('were', 864044),\n",
" ('days', 864044),\n",
" ('at', 864044),\n",
" ('end', 864044),\n",
" ('All', 432022),\n",
" ('country', 432022),\n",
" ('between', 432022),\n",
" ('Puget', 432022),\n",
" ('l', 432022),\n",
" ('acifia', 432022),\n",
" ('ocean', 432022),\n",
" ('ia', 432022),\n",
" ('unknown', 432022),\n",
" ('unexplored', 432022),\n",
" ('nnsnnreyed', 432022),\n",
" ('ought', 432022),\n",
" ('tempt', 432022),\n",
" ('brave', 432022),\n",
" ('adventur', 432022),\n",
" ('ous', 432022),\n",
" ('spirits', 432022),\n",
" ('pros', 432022),\n",
" ('poet', 432022),\n",
" ('un', 432022),\n",
" ('broken', 432022),\n",
" ('abounding', 432022),\n",
" ('big', 432022),\n",
" ('ame', 432022),\n",
" ('Fish', 432022),\n",
" ('been', 432022),\n",
" ('me', 432022),\n",
" ('anglers', 432022),\n",
" ('dropped', 432022),\n",
" ('yellow', 432022),\n",
" ('hackle', 432022),\n",
" ('flies', 432022),\n",
" ('those', 432022),\n",
" ('hidden', 432022),\n",
" ('cresks', 432022),\n",
" ('would', 432022),\n",
" ('make', 432022),\n",
" ('Munchausen', 432022),\n",
" ('shake', 432022),\n",
" ('hla', 432022),\n",
" ('head', 432022),\n",
" ('but', 432022),\n",
" ('can', 432022),\n",
" ('not', 432022),\n",
" ('afford', 432022),\n",
" ('duubt', 432022),\n",
" ('word', 432022),\n",
" ('goes', 432022),\n",
" ('into', 432022),\n",
" ('heart', 432022),\n",
" ('real', 432022),\n",
" ('wdderneea', 432022),\n",
" ('his', 432022),\n",
" ('truut', 432022),\n",
" ('It', 432022),\n",
" ('known', 432022),\n",
" ('though', 432022),\n",
" ('proved', 432022),\n",
" ('whole', 432022),\n",
" ('picnic', 432022),\n",
" ('witnesses', 432022),\n",
" ('skilled', 432022),\n",
" ('fly', 432022),\n",
" ('bank', 432022),\n",
" ('sixty', 432022),\n",
" ('sprookled', 432022),\n",
" ('brook', 432022),\n",
" ('after', 432022),\n",
" ('ah', 432022),\n",
" ('Sencs', 432022),\n",
" ('leas', 432022),\n",
" ('than', 432022),\n",
" ('hours', 432022),\n",
" ('lie', 432022),\n",
" ('erawled', 432022),\n",
" ('through', 432022),\n",
" ('nnderbrnsh', 432022),\n",
" ('aud', 432022),\n",
" ('crept', 432022),\n",
" ('along', 432022),\n",
" ('Ohlmionm', 432022),\n",
" ('Creek', 432022),\n",
" ('bis', 432022),\n",
" ('back', 432022),\n",
" ('talo', 432022),\n",
" ('A', 432022),\n",
" ('fishing', 432022),\n",
" ('went', 432022),\n",
" ('Whntooa', 432022),\n",
" ('Lake', 432022),\n",
" ('east', 432022),\n",
" ('week', 432022),\n",
" ('ago', 432022),\n",
" ('provided', 432022),\n",
" ('essentials', 432022),\n",
" ('fors', 432022),\n",
" ('camp', 432022),\n",
" ('wonderful', 432022),\n",
" ('storu', 432022),\n",
" ('s', 432022),\n",
" ('beforehand', 432022),\n",
" ('how', 432022),\n",
" ('wore', 432022),\n",
" ('erowded', 432022),\n",
" ('lake', 432022),\n",
" ('Inquiry', 432022),\n",
" ('since', 432022),\n",
" ('return', 432022),\n",
" ('shows', 432022),\n",
" ('thst', 432022),\n",
" ('two', 432022),\n",
" ('epeat', 432022),\n",
" ('cruising', 432022),\n",
" ('among', 432022),\n",
" ('t', 432022),\n",
" ('tie', 432022),\n",
" ('eastern', 432022),\n",
" ('neither', 432022),\n",
" ('d*h', 432022),\n",
" ('nor', 432022),\n",
" ('thsy', 432022),\n",
" ('did', 432022),\n",
" ('angling', 432022),\n",
" ('Trolling', 432022),\n",
" ('aalmon', 432022),\n",
" ('good', 432022),\n",
" ('excuse', 432022),\n",
" ('spending', 432022),\n",
" ('idle', 432022),\n",
" ('daya', 432022),\n",
" ('water', 432022),\n",
" ('sailing', 432022),\n",
" ('aad', 432022),\n",
" ('tacking', 432022),\n",
" ('enchant', 432022),\n",
" ('ed', 432022),\n",
" ('waters', 432022),\n",
" ('string', 432022),\n",
" ('astern', 432022),\n",
" ('happteet', 432022),\n",
" ('lasieet', 432022),\n",
" ('kind', 432022),\n",
" ('way', 432022),\n",
" ('kill', 432022),\n",
" ('time', 432022),\n",
" ('Lone', 432022),\n",
" ('go', 432022),\n",
" ('oat', 432022),\n",
" ('midnight', 432022),\n",
" ('turn', 432022),\n",
" ('tide', 432022),\n",
" ('dark', 432022),\n",
" ('honra', 432022),\n",
" ('torn', 432022),\n",
" ('cod', 432022),\n",
" ('bite', 432022),\n",
" ('fast', 432022),\n",
" ('anywhere', 432022),\n",
" ('from', 432022),\n",
" ('wharf', 432022)])"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build a fresh generator so the cell can be re-run (a generator is exhausted\n",
"# after one pass), and show only the most frequent words instead of dumping\n",
"# the whole vocabulary into the notebook output.\n",
"freq_list(get_words(), top=100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.9 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.9"
},
"org": null,
"vscode": {
"interpreter": {
"hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1"
}
}
},
"nbformat": 4,
"nbformat_minor": 0
}