challenging-america-word-ga.../nb.ipynb
2023-04-04 23:16:57 +02:00

19 KiB
Raw Blame History

import re
from itertools import islice
from collections import Counter
import pandas as pd
from tqdm import tqdm
import lzma
from collections import Counter, OrderedDict
import matplotlib.pyplot as plt
from math import log
import re
import numpy as np
with open("train/in.tsv", encoding='utf8', mode="rt") as file:
    a = file.readlines()

a = [line.split("\t") for line in a]
text = " ".join([line[-2] + " " + line[-1] for line in a])
text = re.sub(r"\\\\+n", " ", text)
del a
len(text)
19560075
words = re.findall("\w+", text)
bigram_counter = Counter(zip(words, islice(words, 1, None)))
bigram_counter = dict(sorted(bigram_counter.items(), key=lambda item: item[1], reverse=True))

del words
bigram_counter_short = {}
for key, value in bigram_counter.items():
    if value > 5:
        bigram_counter_short[key] = value

bigram_counter = bigram_counter_short
del bigram_counter_short
unigram_counter = Counter(text.split(' '))
unigram_counter = unigram_counter.most_common(10_000)
# unigram_counter = dict(sorted(unigram_counter.items(), key=lambda item: item[1]), reverse=True)
unigram_counter_list = unigram_counter
unigram_counter = dict(unigram_counter) 
# with open("dev-0/in.tsv", encoding='utf8', mode="rt") as file:
#     a = file.readlines()

# a = [line.split("\t") for line in a]
# text = " ".join([line[-2] + " " + line[-1] for line in a])
test_data = pd.read_csv('dev-0/in.tsv', sep='\t', error_bad_lines=False, header=None)
C:\Users\micha\AppData\Local\Temp\ipykernel_14716\2692353843.py:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.


  test_data = pd.read_csv('dev-0/in.tsv', sep='\t', error_bad_lines=False, header=None)
Skipping line 654: expected 8 fields, saw 9
Skipping line 2220: expected 8 fields, saw 9

test_data
0 1 2 3 4 5 6 7
0 662ed514d56f7bc8743aa6f23794c731 LINCOLN TELEGRAPH ChronAm 1838.834247 43.910755 -69.820862 rin 11K ui i rsognfd inlriliinnts i>r the town... Northeasterly hv the head of said .^corns\nan...
1 0c3ac40edfe6a167ab692fdb9219a93c THE WYANDOT PIONEER ChronAm 1857.691781 40.827279 -83.281309 ton County feel an interest in. tn great is-\n... and design,\nand hence, every election, be it ...
2 b298097f3afd2f8c06b61fa2308ec725 RICHMOND ENQUIRER ChronAm 1847.012329 37.538509 -77.434280 But at our own doors we have evidence ten\ning... Democrat\nenlisting lor the Mexican wvir. They...
3 1d50cf957a6a9cbbe0ee7773a72a76d4 RAFTSMAN'S JOURNAL ChronAm 1867.541096 41.027280 -78.439188 The wonderful Flexibility and great comfort\na... will preserve their perfect aud grace\nful sha...
4 5a7297b76de00c7d9e1fb159384238c0 RICHMOND ENQUIRER ChronAm 1826.083562 37.538509 -77.434280 Illinois.—The Legislature met at Ya:.ualia\non... to run the line between Arkansas and\ntheVhnc...
... ... ... ... ... ... ... ... ...
10397 02e9e019df1992daeafe82b041d94aac WATERBURY EVENING DEMOCRAT ChronAm 1888.949454 41.558153 -73.051497 the Fitzgeralds should perish like a common\nt... Brian, but there was also a touch\nof self int...
10398 74fa28868cbc998d15c242baea4e1faa RICHMOND ENQUIRER ChronAm 1836.012295 37.538509 -77.434280 herd, so soon as he conveniently can, after th... Court dotli lurlher adjudge, order, and decree...
10399 147be715e90bac01c55969d90254f29e EVENING CAPITAL ChronAm 1907.004110 38.978640 -76.492786 Drs. James J. Murphy, of Annapo-\nlis, and Tho... in the matter\nor show any inclination to help...
10400 1357f703947d912523ac23540cb99a0f RAFTSMAN'S JOURNAL ChronAm 1868.077869 41.027280 -78.439188 the soles of the feet spikes or corks are fixe... \nIn order to prevent "the giant" from\nfright...
10401 23346293dbc949ee2edc3380db29f33b THE DEMOCRATIC WHIG ChronAm 1843.760274 33.495674 -88.427263 tion which his opponent had taken, and whilst\... come criterion, by which to judge\nof a nation...

10402 rows × 8 columns

results_string = []

with lzma.open("dev-0/in.tsv.xz", encoding='utf8', mode="rt") as file:
    for line in tqdm(file):
        line = line.split("\t")
        text_before = str(line[-2]).replace('\\\\n', ' ').replace('\n', ' ')
        text_after = str(line[-1]).replace('\\\\n', ' ').replace('\n', ' ')

        if text_before[-1] == ' ':
            text_before = text_before[:-1]
        if text_before[0] == ' ':
            text_before = text_before[1:]

        if text_after[-1] == ' ':
            text_after = text_after[:-1]
        if text_after[0] == ' ':
            text_after = text_after[1:]

        word_before = text_before.split(' ')[-1]
        word_after = text_after.split(' ')[0]

        best_words = {}

        for word_middle, _ in unigram_counter_list:
            current_score = 0
            if (word_before, word_middle) in bigram_counter.keys() and (word_middle, word_after) in bigram_counter.keys() and word_before in unigram_counter.keys() and word_after in unigram_counter.keys():
                current_score = (bigram_counter[(word_before, word_middle)] / unigram_counter[word_before]) * (bigram_counter[(word_middle, word_after)] / unigram_counter[word_middle])
            best_words[word_middle] = current_score

        best_words = sorted(best_words.items(), key=lambda item: item[1], reverse=True)
        leftover_probability = 0
        for _, value in best_words[:5]:
            if value == 0:
                break
            leftover_probability += value
        leftover_probability = max(1 - leftover_probability, 0.01)

        result = f'{best_words[0][0]}:{round(best_words[0][1], 7):.8f} {best_words[1][0]}:{round(best_words[1][1], 7):.8f} {best_words[2][0]}:{round(best_words[2][1], 7):.8f} {best_words[3][0]}:{round(best_words[3][1], 7):.8f} {best_words[4][0]}:{round(best_words[4][1], 7):.8f} :{round(leftover_probability, 3):.8f}'
        results_string.append(result)
10519it [02:47, 62.67it/s]
results_string[:20]
['the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'own:0.00076790 way:0.00069960 head:0.00058630 work:0.00051990 place:0.00045550 :0.99700000',
 'the:0.00001150 a:0.00001040 Madison:0.00000230 every:0.00000210 Missouri:0.00000120 :1.00000000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'a:0.00008660 him:0.00000250 no:0.00000170 all:0.00000150 them:0.00000120 :1.00000000',
 'trees:0.00092990 and:0.00057150 is:0.00030980 growers:0.00029090 growing:0.00014300 :0.99800000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'that:0.00131940 as:0.00047920 sure:0.00009500 and:0.00009330 better:0.00007450 :0.99800000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'be:0.00032910 the:0.00014090 show:0.00012860 this:0.00001380 a:0.00000790 :0.99900000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'country:0.00007340 world:0.00007170 people:0.00006310 city:0.00005530 time:0.00004280 :1.00000000',
 'to:0.00030170 I:0.00005940 and:0.00002870 a:0.00001570 t:0.00001340 :1.00000000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'that:0.00014960 for:0.00010580 God:0.00005010 to:0.00002530 ing:0.00001430 :1.00000000',
 'founded:0.00097130 known:0.00064890 posted:0.00052370 as:0.00032530 fed:0.00027720 :0.99700000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',
 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000']
with open(r'test-A/out.tsv', 'w') as fp:
    for item in results_string:
        fp.write("%s\n" % item)
    print('Done')