smoothing

Maciej(Linux) 2022-04-12 20:13:36 +02:00
parent 4e4ebdcadc
commit ac33d50cec
4 changed files with 18082 additions and 18001 deletions

File diff suppressed because it is too large

run.py

@@ -1,80 +1,80 @@
import pandas as pd
import csv
import regex as re
from collections import Counter, defaultdict
from nltk.tokenize import RegexpTokenizer
from nltk import trigrams
import lzma


class GapEssa:
    # Trigram gap-filling model: counts how often a word w1 precedes the
    # pair (w2, w3), then applies add-alpha (Lidstone) smoothing.

    def __init__(self):
        self.alpha = 0.0001
        self.vocab = set()
        self.model = defaultdict(lambda: defaultdict(lambda: 0))
        self.tokenizer = RegexpTokenizer(r"\w+")

    def read_file(self, f, mode=0):
        # mode 0 (training): join the left (col 6) and right (col 7) contexts;
        # mode 1 (inference): keep only the right context (col 7).
        for line in f:
            text = line.split("\t")
            if mode == 0:
                yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n", " ").replace("\n", "").lower()))
            else:
                yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', text[7].replace("\\n", " ").replace("\n", "").lower()))

    def train(self, f):
        with lzma.open(f, mode='rt') as file:
            for index, text in enumerate(self.read_file(file)):
                tokens = self.tokenizer.tokenize(text)
                for w1, w2, w3 in trigrams(tokens, pad_right=True, pad_left=True):
                    if w1 and w2 and w3:
                        self.model[(w2, w3)][w1] += 1
                        self.vocab.add(w1)
                        self.vocab.add(w2)
                        self.vocab.add(w3)
                if index == 40000:  # cap training at 40k lines
                    break
        # Add-alpha smoothing: every count is bumped by alpha and normalized
        # against alpha * |V|, so no word ends up with zero probability.
        for pair in self.model:
            num_n_grams = float(sum(self.model[pair].values()))
            for word in self.model[pair]:
                self.model[pair][word] = (self.model[pair][word] + self.alpha) / (num_n_grams + self.alpha * len(self.vocab))

    def out(self, input_f, output_f):
        with open(output_f, 'w') as out_f:
            with lzma.open(input_f, mode='rt') as in_f:
                for text in self.read_file(in_f, mode=1):
                    t = self.tokenizer.tokenize(text)
                    if len(t) < 4:
                        # Context too short: emit a fixed fallback distribution.
                        p = 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'
                    else:
                        p = self.pred(t[0], t[1])
                    out_f.write(p + '\n')

    def pred(self, w1, w2):
        total = 0.0
        line = ''
        p = dict(self.model[w1, w2])
        m = dict(Counter(p).most_common(6))
        for word, prob in m.items():
            total += prob
            line += f'{word}:{prob} '
        if total == 0.0:
            # Unseen context: same fixed fallback distribution.
            return 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'
        # Reserve the remaining mass (at least 0.01) for all other words.
        if 1 - total >= 0.01:
            line += f":{1 - total}"
        else:
            line += ":0.01"
        return line


wp = GapEssa()
wp.train('train/in.tsv.xz')
wp.out('dev-0/in.tsv.xz', 'dev-0/out.tsv')
wp.out('test-A/in.tsv.xz', 'test-A/out.tsv')
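For reference, the smoothing this commit introduces is standard add-alpha (Lidstone) estimation: a count c for word w1 in context (w2, w3) becomes (c + alpha) / (N + alpha * |V|), where N is the total count for that context and |V| is the vocabulary size, so even unseen words keep probability alpha / (N + alpha * |V|). A minimal standalone sketch of the same computation; the counts and vocabulary size below are illustrative, not taken from this repository:

from collections import Counter

# Toy counts for one context (w2, w3); numbers are illustrative only.
counts = Counter({'cat': 3, 'dog': 1})
alpha = 0.0001        # same alpha as GapEssa
vocab_size = 10       # assumed |V| for this sketch

n = sum(counts.values())
smoothed = {w: (c + alpha) / (n + alpha * vocab_size) for w, c in counts.items()}
unseen = alpha / (n + alpha * vocab_size)  # probability of a word never seen here

# Seen mass plus unseen mass over the rest of the vocabulary sums to 1.
assert abs(sum(smoothed.values()) + unseen * (vocab_size - len(counts)) - 1.0) < 1e-9

With alpha as small as 0.0001, the smoothed values stay close to the raw relative frequencies while never assigning exactly zero, which is what lets pred() back off gracefully instead of emitting empty predictions.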

run_old.py (new file)

@@ -0,0 +1,81 @@
from nltk import trigrams as tris
from nltk import word_tokenize
import pandas as pd
import csv
import regex as re
from collections import Counter, defaultdict

# Left (col 6) and right (col 7) contexts of each gap.
train = pd.read_csv(
    'train/in.tsv.xz',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=30000)

# Expected gap words for the training rows.
labels = pd.read_csv(
    'train/expected.tsv',
    sep='\t',
    on_bad_lines='skip',
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=30000)


def data_preprocessing(text):
    # Strip punctuation and normalize literal "\n" markers left in the TSV.
    return re.sub(r'\p{P}', '', text.lower().replace('-\\n', '').replace('\\n', ' '))


def predict(before, after):
    prediction = dict(Counter(dict(tri[before, after])).most_common(5))
    result = ''
    prob = 0.0
    for key, value in prediction.items():
        prob += value
        result += f'{key}:{value} '
    if prob == 0.0:
        # Unseen (before, after) pair: fixed fallback distribution.
        return 'to:0.015 be:0.015 the:0.015 not:0.01 and:0.02 a:0.02 :0.9'
    result += f':{max(1 - prob, 0.01)}'
    return result


def make_prediction(file):
    data = pd.read_csv(f'{file}/in.tsv.xz', sep='\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
    with open(f'{file}/out.tsv', 'w', encoding='utf-8') as file_out:
        for _, row in data.iterrows():
            before, after = word_tokenize(data_preprocessing(str(row[6]))), word_tokenize(data_preprocessing(str(row[7])))
            if len(before) < 3 or len(after) < 3:
                prediction = 'to:0.015 be:0.015 the:0.015 not:0.01 and:0.02 a:0.02 :0.9'
            else:
                prediction = predict(before[-1], after[0])
            file_out.write(prediction + '\n')


# Rebuild full training lines as: left context + gap word + right context.
train = train[[6, 7]]
train = pd.concat([train, labels], axis=1)
train['line'] = train[6] + train[0] + train[7]

# tri[(w1, w3)][w2] counts how often w2 fills the gap between w1 and w3.
tri = defaultdict(lambda: defaultdict(lambda: 0))
rows = train.iterrows()
rows_len = len(train)
for index, (_, row) in enumerate(rows):
    text = data_preprocessing(str(row['line']))
    words = word_tokenize(text)
    for word_1, word_2, word_3 in tris(words, pad_right=True, pad_left=True):
        if word_1 and word_2 and word_3:
            tri[(word_1, word_3)][word_2] += 1

# Ad-hoc add-0.25 smoothing; note the denominator adds len(word_2), the
# character length of the word, rather than a vocabulary size.
model_len = len(tri)
for index, words_1_3 in enumerate(tri):
    count = sum(tri[words_1_3].values())
    for word_2 in tri[words_1_3]:
        tri[words_1_3][word_2] += 0.25
        tri[words_1_3][word_2] /= float(count + 0.25 + len(word_2))

make_prediction('test-A')
make_prediction('dev-0')
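Both the old and the new script write one line per input row in the word:prob format used by predict() and pred() above, where a bare trailing ':p' entry reserves probability mass for every word not listed. A small illustrative parser for that format; the helper name and the sample line are hypothetical, not part of the repository:

def parse_gap_line(line):
    # 'the:0.4 a:0.3 :0.3' -> ({'the': 0.4, 'a': 0.3}, 0.3)
    probs, rest = {}, 0.0
    for part in line.split():
        word, _, p = part.rpartition(':')
        if word:
            probs[word] = float(p)
        else:
            rest = float(p)  # a bare ':p' entry is the mass for unlisted words
    return probs, rest

probs, rest = parse_gap_line('the:0.4 a:0.3 :0.3')
assert abs(sum(probs.values()) + rest - 1.0) < 1e-9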

File diff suppressed because it is too large