19 KiB
19 KiB
import re
from itertools import islice
from collections import Counter
import pandas as pd
from tqdm import tqdm
import lzma
from collections import Counter, OrderedDict
import matplotlib.pyplot as plt
from math import log
import re
import numpy as np
with open("train/in.tsv", encoding='utf8', mode="rt") as file:
a = file.readlines()
a = [line.split("\t") for line in a]
text = " ".join([line[-2] + " " + line[-1] for line in a])
text = re.sub(r"\\\\+n", " ", text)
del a
len(text)
19560075
words = re.findall("\w+", text)
bigram_counter = Counter(zip(words, islice(words, 1, None)))
bigram_counter = dict(sorted(bigram_counter.items(), key=lambda item: item[1], reverse=True))
del words
bigram_counter_short = {}
for key, value in bigram_counter.items():
if value > 5:
bigram_counter_short[key] = value
bigram_counter = bigram_counter_short
del bigram_counter_short
unigram_counter = Counter(text.split(' '))
unigram_counter = unigram_counter.most_common(10_000)
# unigram_counter = dict(sorted(unigram_counter.items(), key=lambda item: item[1]), reverse=True)
unigram_counter_list = unigram_counter
unigram_counter = dict(unigram_counter)
# with open("dev-0/in.tsv", encoding='utf8', mode="rt") as file:
# a = file.readlines()
# a = [line.split("\t") for line in a]
# text = " ".join([line[-2] + " " + line[-1] for line in a])
test_data = pd.read_csv('dev-0/in.tsv', sep='\t', error_bad_lines=False, header=None)
C:\Users\micha\AppData\Local\Temp\ipykernel_14716\2692353843.py:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future. test_data = pd.read_csv('dev-0/in.tsv', sep='\t', error_bad_lines=False, header=None) Skipping line 654: expected 8 fields, saw 9 Skipping line 2220: expected 8 fields, saw 9
test_data
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | |
---|---|---|---|---|---|---|---|---|
0 | 662ed514d56f7bc8743aa6f23794c731 | LINCOLN TELEGRAPH | ChronAm | 1838.834247 | 43.910755 | -69.820862 | rin 11K ui i rsognfd inlriliinnts i>r the town... | Northeasterly hv the head of said .^corn’s\nan... |
1 | 0c3ac40edfe6a167ab692fdb9219a93c | THE WYANDOT PIONEER | ChronAm | 1857.691781 | 40.827279 | -83.281309 | ton County feel an interest in. tn great is-\n... | and design,\nand hence, every election, be it ... |
2 | b298097f3afd2f8c06b61fa2308ec725 | RICHMOND ENQUIRER | ChronAm | 1847.012329 | 37.538509 | -77.434280 | But at our own doors we have evidence ten\ning... | Democrat\nenlisting lor the Mexican wvir. They... |
3 | 1d50cf957a6a9cbbe0ee7773a72a76d4 | RAFTSMAN'S JOURNAL | ChronAm | 1867.541096 | 41.027280 | -78.439188 | The wonderful Flexibility and great comfort\na... | will preserve their perfect aud grace\nful sha... |
4 | 5a7297b76de00c7d9e1fb159384238c0 | RICHMOND ENQUIRER | ChronAm | 1826.083562 | 37.538509 | -77.434280 | Illinois.—The Legislature met at Ya:.ualia\non... | to run the line between Arkansas and\nthe’Vhnc... |
... | ... | ... | ... | ... | ... | ... | ... | ... |
10397 | 02e9e019df1992daeafe82b041d94aac | WATERBURY EVENING DEMOCRAT | ChronAm | 1888.949454 | 41.558153 | -73.051497 | the Fitzgeralds should perish like a common\nt... | Brian, but there was also a touch\nof self int... |
10398 | 74fa28868cbc998d15c242baea4e1faa | RICHMOND ENQUIRER | ChronAm | 1836.012295 | 37.538509 | -77.434280 | herd, so soon as he conveniently can, after th... | Court dotli lurlher adjudge, order, and decree... |
10399 | 147be715e90bac01c55969d90254f29e | EVENING CAPITAL | ChronAm | 1907.004110 | 38.978640 | -76.492786 | Drs. James J. Murphy, of Annapo-\nlis, and Tho... | in the matter\nor show any inclination to help... |
10400 | 1357f703947d912523ac23540cb99a0f | RAFTSMAN'S JOURNAL | ChronAm | 1868.077869 | 41.027280 | -78.439188 | the soles of the feet spikes or corks are fixe... | \nIn order to prevent "the giant" from\nfright... |
10401 | 23346293dbc949ee2edc3380db29f33b | THE DEMOCRATIC WHIG | ChronAm | 1843.760274 | 33.495674 | -88.427263 | tion which his opponent had taken, and whilst\... | come criterion, by which to judge\nof a nation... |
10402 rows × 8 columns
results_string = []
with lzma.open("dev-0/in.tsv.xz", encoding='utf8', mode="rt") as file:
for line in tqdm(file):
line = line.split("\t")
text_before = str(line[-2]).replace('\\\\n', ' ').replace('\n', ' ')
text_after = str(line[-1]).replace('\\\\n', ' ').replace('\n', ' ')
if text_before[-1] == ' ':
text_before = text_before[:-1]
if text_before[0] == ' ':
text_before = text_before[1:]
if text_after[-1] == ' ':
text_after = text_after[:-1]
if text_after[0] == ' ':
text_after = text_after[1:]
word_before = text_before.split(' ')[-1]
word_after = text_after.split(' ')[0]
best_words = {}
for word_middle, _ in unigram_counter_list:
current_score = 0
if (word_before, word_middle) in bigram_counter.keys() and (word_middle, word_after) in bigram_counter.keys() and word_before in unigram_counter.keys() and word_after in unigram_counter.keys():
current_score = (bigram_counter[(word_before, word_middle)] / unigram_counter[word_before]) * (bigram_counter[(word_middle, word_after)] / unigram_counter[word_middle])
best_words[word_middle] = current_score
best_words = sorted(best_words.items(), key=lambda item: item[1], reverse=True)
leftover_probability = 0
for _, value in best_words[:5]:
if value == 0:
break
leftover_probability += value
leftover_probability = max(1 - leftover_probability, 0.01)
result = f'{best_words[0][0]}:{round(best_words[0][1], 7):.8f} {best_words[1][0]}:{round(best_words[1][1], 7):.8f} {best_words[2][0]}:{round(best_words[2][1], 7):.8f} {best_words[3][0]}:{round(best_words[3][1], 7):.8f} {best_words[4][0]}:{round(best_words[4][1], 7):.8f} :{round(leftover_probability, 3):.8f}'
results_string.append(result)
10519it [02:47, 62.67it/s]
results_string[:20]
['the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000', 'own:0.00076790 way:0.00069960 head:0.00058630 work:0.00051990 place:0.00045550 :0.99700000', 'the:0.00001150 a:0.00001040 Madison:0.00000230 every:0.00000210 Missouri:0.00000120 :1.00000000', 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000', 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000', 'a:0.00008660 him:0.00000250 no:0.00000170 all:0.00000150 them:0.00000120 :1.00000000', 'trees:0.00092990 and:0.00057150 is:0.00030980 growers:0.00029090 growing:0.00014300 :0.99800000', 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000', 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000', 'that:0.00131940 as:0.00047920 sure:0.00009500 and:0.00009330 better:0.00007450 :0.99800000', 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000', 'be:0.00032910 the:0.00014090 show:0.00012860 this:0.00001380 a:0.00000790 :0.99900000', 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000', 'country:0.00007340 world:0.00007170 people:0.00006310 city:0.00005530 time:0.00004280 :1.00000000', 'to:0.00030170 I:0.00005940 and:0.00002870 a:0.00001570 t:0.00001340 :1.00000000', 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000', 'that:0.00014960 for:0.00010580 God:0.00005010 to:0.00002530 ing:0.00001430 :1.00000000', 'founded:0.00097130 known:0.00064890 posted:0.00052370 as:0.00032530 fed:0.00027720 :0.99700000', 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000', 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000']
with open(r'test-A/out.tsv', 'w') as fp:
for item in results_string:
fp.write("%s\n" % item)
print('Done')