v2 20000 rows top 12

This commit is contained in:
Łukasz Jędyk 2022-04-02 17:35:49 +02:00
parent 20f3c70aea
commit e15e94a20c
3 changed files with 17765 additions and 17758 deletions

File diff suppressed because it is too large Load Diff

33
run.py
View File

@ -1,8 +1,16 @@
import pandas as pd import pandas as pd
import csv import csv
import regex as re
from nltk import trigrams, word_tokenize from nltk import trigrams, word_tokenize
from collections import Counter, defaultdict from collections import Counter, defaultdict
def clean_text(text):
    """Return *text* lower-cased, with escaped line breaks and punctuation removed.

    Steps, in order:
      1. lower-case the whole string;
      2. delete every literal ``-\\n`` sequence (a hyphen followed by a
         backslash and the letter ``n``) -- presumably a word hyphenated
         across a line break in the corpus; verify against the data;
      3. turn every remaining literal ``\\n`` sequence into a single space;
      4. strip every character matching the Unicode property ``\\p{P}``.

    NOTE: ``re`` in this file is the third-party ``regex`` package
    (``import regex as re``); the stdlib ``re`` module does not accept
    ``\\p{P}``.
    """
    lowered = text.lower()
    # Order matters: the hyphenated form must be removed before the plain
    # escaped newline is replaced, otherwise a stray "-" would be left behind.
    without_hyphen_breaks = lowered.replace('-\\n', '')
    single_line = without_hyphen_breaks.replace('\\n', ' ')
    return re.sub(r'\p{P}', '', single_line)
train_data = pd.read_csv('train/in.tsv.xz', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE) train_data = pd.read_csv('train/in.tsv.xz', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
train_labels = pd.read_csv('train/expected.tsv', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE) train_labels = pd.read_csv('train/expected.tsv', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
@ -14,13 +22,12 @@ train_data['final'] = train_data[6] + train_data[0] + train_data[7]
model = defaultdict(lambda: defaultdict(lambda: 0)) model = defaultdict(lambda: defaultdict(lambda: 0))
for index, row in train_data.iterrows(): for index, row in train_data.iterrows():
text = str(row['final']).lower() text = clean_text(str(row['final']))
text = text.replace('-\\n', '')
text = text.replace('\\n', ' ')
words = word_tokenize(text) words = word_tokenize(text)
for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True): for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
model[(w2, w3)][w1] += 1 if w1 and w2 and w3:
if index > 10000: model[(w2, w3)][w1] += 1
if index > 20000:
break break
for w2_w3 in model: for w2_w3 in model:
@ -39,6 +46,9 @@ def predict_probs(word1, word2):
total_prob += prob total_prob += prob
str_prediction += f'{word}:{prob} ' str_prediction += f'{word}:{prob} '
if total_prob == 0.0:
return 'the:0.3 be:0.2 to:0.2 of:0.2 :0.1'
remaining_prob = 1 - total_prob remaining_prob = 1 - total_prob
if remaining_prob < 0.0001: if remaining_prob < 0.0001:
@ -48,29 +58,26 @@ def predict_probs(word1, word2):
return str_prediction return str_prediction
dev_data = pd.read_csv('dev-0/in.tsv.xz', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE) dev_data = pd.read_csv('dev-0/in.tsv.xz', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
test_data = pd.read_csv('test-A/in.tsv.xz', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE) test_data = pd.read_csv('test-A/in.tsv.xz', sep='\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
with open('dev-0/out.tsv', 'w') as file: with open('dev-0/out.tsv', 'w') as file:
for index, row in dev_data.iterrows(): for index, row in dev_data.iterrows():
text = str(row[7]).lower() text = clean_text(str(row[7]))
text = text.replace('-\\n', '')
text = text.replace('\\n', ' ')
words = word_tokenize(text) words = word_tokenize(text)
if len(words) < 4: if len(words) < 4:
prediction = 'and:0.01 :0.99' prediction = 'the:0.3 be:0.2 to:0.2 of:0.2 :0.1'
else: else:
prediction = predict_probs(words[0], words[1]) prediction = predict_probs(words[0], words[1])
file.write(prediction + '\n') file.write(prediction + '\n')
with open('test-A/out.tsv', 'w') as file: with open('test-A/out.tsv', 'w') as file:
for index, row in test_data.iterrows(): for index, row in test_data.iterrows():
text = str(row[7]).lower() text = clean_text(str(row[7]))
text = text.replace('-\\n', '')
text = text.replace('\\n', ' ')
words = word_tokenize(text) words = word_tokenize(text)
if len(words) < 4: if len(words) < 4:
prediction = 'and:0.01 :0.99' prediction = 'the:0.3 be:0.2 to:0.2 of:0.2 :0.1'
else: else:
prediction = predict_probs(words[0], words[1]) prediction = predict_probs(words[0], words[1])
file.write(prediction + '\n') file.write(prediction + '\n')

File diff suppressed because it is too large Load Diff