test 8 version

This commit is contained in:
pietrzakkuba 2022-04-03 13:53:56 +02:00
parent bf943014f3
commit c02c81943b
3 changed files with 15381 additions and 15384 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -1,9 +1,9 @@
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
from nltk import trigrams from nltk import trigrams
import string
from collections import defaultdict, Counter from collections import defaultdict, Counter
import pandas as pd import pandas as pd
import csv import csv
import regex as re
trigrams_list = [] trigrams_list = []
@@ -11,12 +11,9 @@ model = defaultdict(lambda: defaultdict(lambda: 0))
def preprocess(text): def preprocess(text):
_text = str(text) text = str(text).lower().replace("-\\n", "").replace("\\n", " ")
_text = _text.lower().replace("-\\n", "").replace('\\n', ' ').strip() text = re.sub(r'\p{P}', '', text)
for character in _text: words = word_tokenize(text)
if character not in string.ascii_lowercase + ' ':
_text = _text.replace(character, '')
words = word_tokenize(_text)
if len(words): if len(words):
return words return words
return [''] return ['']
@@ -30,7 +27,7 @@ def predict(word_before, word_after):
prob_sum += value prob_sum += value
predictions.append(f'{key}:{value}') predictions.append(f'{key}:{value}')
if prob_sum == 0.0: if prob_sum == 0.0:
return 'the:0:2 be:0.2 to:0.2 of:0.15 and:0.15 :0.1' return 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
elif prob_sum < 1.0: elif prob_sum < 1.0:
predictions.append(f':{max(1 - prob_sum, 0.01)}') predictions.append(f':{max(1 - prob_sum, 0.01)}')
return ' '.join(predictions) return ' '.join(predictions)
@@ -68,23 +65,23 @@ for index, words_1_3 in enumerate(model):
for word_2 in model[words_1_3]: for word_2 in model[words_1_3]:
model[words_1_3][word_2] /= float(count) model[words_1_3][word_2] /= float(count)
file_in = pd.read_csv('test-A/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
with open('test-A/out.tsv', 'w', encoding='utf-8') as file_out: def make_prediction(file):
print('zapisywanie test-A') file_in = pd.read_csv(f'{file}/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
for line_in in file_in.iterrows(): with open(f'{file}/out.tsv', 'w') as file_out:
before = line_in[1][6] print(f'zapisywanie {file}')
after = line_in[1][7] for line_in in file_in.iterrows():
word_before_in, word_after_in = preprocess(before)[-1], preprocess(after)[0] before = line_in[1][6]
file_out.write(predict(word_before_in, word_after_in) + '\n') after = line_in[1][7]
if len(before) < 3 or len(after) < 3:
prediction = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
else:
word_before_in, word_after_in = preprocess(before)[-1], preprocess(after)[0]
prediction = predict(word_before_in, word_after_in)
file_out.write(prediction + '\n')
file_in = pd.read_csv('dev-0/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE) make_prediction('test-A')
with open('dev-0/out.tsv', 'w', encoding='utf-8') as file_out: make_prediction('dev-0')
print('zapisywanie dev-0')
for line_in in file_in.iterrows():
before = line_in[1][6]
after = line_in[1][7]
word_before_in, word_after_in = preprocess(before)[-1], preprocess(after)[0]
file_out.write(predict(word_before_in, word_after_in) + '\n')
print('koniec') print('koniec')

File diff suppressed because it is too large Load Diff