challenging-america-word-gap-prediction/nb.ipynb at de7fe9fabdd10aa0b8d821f44b343c3b6a0fb5f4

s478855/challenging-america-word-gap-prediction

ulaniuk ec06c05e97 1st try

2023-04-04 23:16:57 +02:00

19 KiB

Raw Blame History

import re
from itertools import islice
from collections import Counter
import pandas as pd
from tqdm import tqdm

import lzma
from collections import Counter, OrderedDict
import matplotlib.pyplot as plt
from math import log
import re
import numpy as np

with open("train/in.tsv", encoding='utf8', mode="rt") as file:
    a = file.readlines()

a = [line.split("\t") for line in a]
text = " ".join([line[-2] + " " + line[-1] for line in a])
text = re.sub(r"\\\\+n", " ", text)

del a

len(text)

19560075

words = re.findall("\w+", text)
bigram_counter = Counter(zip(words, islice(words, 1, None)))
bigram_counter = dict(sorted(bigram_counter.items(), key=lambda item: item[1], reverse=True))

del words

bigram_counter_short = {}
for key, value in bigram_counter.items():
    if value > 5:
        bigram_counter_short[key] = value

bigram_counter = bigram_counter_short
del bigram_counter_short

unigram_counter = Counter(text.split(' '))
unigram_counter = unigram_counter.most_common(10_000)
# unigram_counter = dict(sorted(unigram_counter.items(), key=lambda item: item[1]), reverse=True)
unigram_counter_list = unigram_counter
unigram_counter = dict(unigram_counter)

# with open("dev-0/in.tsv", encoding='utf8', mode="rt") as file:
#     a = file.readlines()

# a = [line.split("\t") for line in a]
# text = " ".join([line[-2] + " " + line[-1] for line in a])

test_data = pd.read_csv('dev-0/in.tsv', sep='\t', error_bad_lines=False, header=None)

C:\Users\micha\AppData\Local\Temp\ipykernel_14716\2692353843.py:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.


  test_data = pd.read_csv('dev-0/in.tsv', sep='\t', error_bad_lines=False, header=None)
Skipping line 654: expected 8 fields, saw 9
Skipping line 2220: expected 8 fields, saw 9

test_data

	0	1	2	3	4	5	6	7
0	662ed514d56f7bc8743aa6f23794c731	LINCOLN TELEGRAPH	ChronAm	1838.834247	43.910755	-69.820862	rin 11K ui i rsognfd inlriliinnts i>r the town...	Northeasterly hv the head of said .^corn’s\nan...
1	0c3ac40edfe6a167ab692fdb9219a93c	THE WYANDOT PIONEER	ChronAm	1857.691781	40.827279	-83.281309	ton County feel an interest in. tn great is-\n...	and design,\nand hence, every election, be it ...
2	b298097f3afd2f8c06b61fa2308ec725	RICHMOND ENQUIRER	ChronAm	1847.012329	37.538509	-77.434280	But at our own doors we have evidence ten\ning...	Democrat\nenlisting lor the Mexican wvir. They...
3	1d50cf957a6a9cbbe0ee7773a72a76d4	RAFTSMAN'S JOURNAL	ChronAm	1867.541096	41.027280	-78.439188	The wonderful Flexibility and great comfort\na...	will preserve their perfect aud grace\nful sha...
4	5a7297b76de00c7d9e1fb159384238c0	RICHMOND ENQUIRER	ChronAm	1826.083562	37.538509	-77.434280	Illinois.—The Legislature met at Ya:.ualia\non...	to run the line between Arkansas and\nthe’Vhnc...
...	...	...	...	...	...	...	...	...
10397	02e9e019df1992daeafe82b041d94aac	WATERBURY EVENING DEMOCRAT	ChronAm	1888.949454	41.558153	-73.051497	the Fitzgeralds should perish like a common\nt...	Brian, but there was also a touch\nof self int...
10398	74fa28868cbc998d15c242baea4e1faa	RICHMOND ENQUIRER	ChronAm	1836.012295	37.538509	-77.434280	herd, so soon as he conveniently can, after th...	Court dotli lurlher adjudge, order, and decree...
10399	147be715e90bac01c55969d90254f29e	EVENING CAPITAL	ChronAm	1907.004110	38.978640	-76.492786	Drs. James J. Murphy, of Annapo-\nlis, and Tho...	in the matter\nor show any inclination to help...
10400	1357f703947d912523ac23540cb99a0f	RAFTSMAN'S JOURNAL	ChronAm	1868.077869	41.027280	-78.439188	the soles of the feet spikes or corks are fixe...	\nIn order to prevent "the giant" from\nfright...
10401	23346293dbc949ee2edc3380db29f33b	THE DEMOCRATIC WHIG	ChronAm	1843.760274	33.495674	-88.427263	tion which his opponent had taken, and whilst\...	come criterion, by which to judge\nof a nation...

10402 rows × 8 columns

results_string = []

with lzma.open("dev-0/in.tsv.xz", encoding='utf8', mode="rt") as file:
    for line in tqdm(file):
        line = line.split("\t")
        text_before = str(line[-2]).replace('\\\\n', ' ').replace('\n', ' ')
        text_after = str(line[-1]).replace('\\\\n', ' ').replace('\n', ' ')

        if text_before[-1] == ' ':
            text_before = text_before[:-1]
        if text_before[0] == ' ':
            text_before = text_before[1:]

        if text_after[-1] == ' ':
            text_after = text_after[:-1]
        if text_after[0] == ' ':
            text_after = text_after[1:]

        word_before = text_before.split(' ')[-1]
        word_after = text_after.split(' ')[0]

        best_words = {}

        for word_middle, _ in unigram_counter_list:
            current_score = 0
            if (word_before, word_middle) in bigram_counter.keys() and (word_middle, word_after) in bigram_counter.keys() and word_before in unigram_counter.keys() and word_after in unigram_counter.keys():
                current_score = (bigram_counter[(word_before, word_middle)] / unigram_counter[word_before]) * (bigram_counter[(word_middle, word_after)] / unigram_counter[word_middle])
            best_words[word_middle] = current_score

        best_words = sorted(best_words.items(), key=lambda item: item[1], reverse=True)
        leftover_probability = 0
        for _, value in best_words[:5]:
            if value == 0:
                break
            leftover_probability += value
        leftover_probability = max(1 - leftover_probability, 0.01)

        result = f'{best_words[0][0]}:{round(best_words[0][1], 7):.8f} {best_words[1][0]}:{round(best_words[1][1], 7):.8f} {best_words[2][0]}:{round(best_words[2][1], 7):.8f} {best_words[3][0]}:{round(best_words[3][1], 7):.8f} {best_words[4][0]}:{round(best_words[4][1], 7):.8f} :{round(leftover_probability, 3):.8f}'
        results_string.append(result)

10519it [02:47, 62.67it/s]

results_string[:20]

['the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'own:0.00076790 way:0.00069960 head:0.00058630 work:0.00051990 place:0.00045550 :0.99700000',
 'the:0.00001150 a:0.00001040 Madison:0.00000230 every:0.00000210 Missouri:0.00000120 :1.00000000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'a:0.00008660 him:0.00000250 no:0.00000170 all:0.00000150 them:0.00000120 :1.00000000',
 'trees:0.00092990 and:0.00057150 is:0.00030980 growers:0.00029090 growing:0.00014300 :0.99800000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'that:0.00131940 as:0.00047920 sure:0.00009500 and:0.00009330 better:0.00007450 :0.99800000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'be:0.00032910 the:0.00014090 show:0.00012860 this:0.00001380 a:0.00000790 :0.99900000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'country:0.00007340 world:0.00007170 people:0.00006310 city:0.00005530 time:0.00004280 :1.00000000',
 'to:0.00030170 I:0.00005940 and:0.00002870 a:0.00001570 t:0.00001340 :1.00000000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'that:0.00014960 for:0.00010580 God:0.00005010 to:0.00002530 ing:0.00001430 :1.00000000',
 'founded:0.00097130 known:0.00064890 posted:0.00052370 as:0.00032530 fed:0.00027720 :0.99700000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000']

with open(r'test-A/out.tsv', 'w') as fp:
    for item in results_string:
        fp.write("%s\n" % item)
    print('Done')

19 KiB Raw Blame History Unescape Escape

19 KiB

Raw Blame History