Maciej(Linux) 2022-04-24 00:33:08 +02:00
parent ac33d50cec
commit 96f3dafd86
6 changed files with 18418 additions and 18001 deletions

File diff suppressed because it is too large

BIN
kenlm_model.arpa Normal file

Binary file not shown.

271
run.ipynb Normal file

@@ -0,0 +1,271 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from nltk import trigrams, word_tokenize\n",
"import pandas as pd\n",
"import csv\n",
"import regex as re\n",
"from collections import Counter, defaultdict\n",
"import kenlm\n",
"from english_words import english_words_alpha_set\n",
"from math import log10"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"train_set = pd.read_csv(\n",
" 'train/in.tsv.xz',\n",
" sep='\\t',\n",
" header=None,\n",
" quoting=csv.QUOTE_NONE,\n",
" nrows=35000)\n",
"\n",
"train_labels = pd.read_csv(\n",
" 'train/expected.tsv',\n",
" sep='\\t',\n",
" header=None,\n",
" quoting=csv.QUOTE_NONE,\n",
" nrows=35000)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"data = pd.concat([train_set, train_labels], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"data = train_set[6] + train_set[0] + train_set[7]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def data_preprocessing(text):\n",
" return re.sub(r'\\p{P}', '', text.lower().replace('-\\\\n', '').replace('\\\\n', ' ').replace(\"'ll\", \" will\").replace(\"-\", \"\").replace(\"'ve\", \" have\").replace(\"'s\", \" is\"))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"data = data.apply(data_preprocessing)\n",
"prediction = 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"with open(\"train_file.txt\", \"w+\") as f:\n",
" for text in data:\n",
" f.write(text + \"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"KENLM_BUILD_PATH='../kenlm/build/bin/lmplz'"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== 1/5 Counting and sorting n-grams ===\n",
"Reading /home/maciej/challenging-america-word-gap-prediction/train_file.txt\n",
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
"****************************************************************************************************\n",
"Unigram tokens 11040226 types 580506\n",
"=== 2/5 Calculating and sorting adjusted counts ===\n",
"Chain sizes: 1:6966072 2:4100520192 3:7688475136 4:12301560832\n",
"Statistics:\n",
"1 580506 D1=0.841976 D2=0.938008 D3+=1.10537\n",
"2 3583875 D1=0.83057 D2=1.0296 D3+=1.2275\n",
"3 7705610 D1=0.899462 D2=1.16366 D3+=1.32181\n",
"4 9865473 D1=0.942374 D2=1.27613 D3+=1.35073\n",
"Memory estimate for binary LM:\n",
"type MB\n",
"probing 442 assuming -p 1.5\n",
"probing 508 assuming -r models -p 1.5\n",
"trie 216 without quantization\n",
"trie 126 assuming -q 8 -b 8 quantization \n",
"trie 195 assuming -a 22 array pointer compression\n",
"trie 104 assuming -a 22 -q 8 -b 8 array pointer compression and quantization\n",
"=== 3/5 Calculating and sorting initial probabilities ===\n",
"Chain sizes: 1:6966072 2:57342000 3:154112200 4:236771352\n",
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
"####################################################################################################\n",
"=== 4/5 Calculating and writing order-interpolated probabilities ===\n",
"Chain sizes: 1:6966072 2:57342000 3:154112200 4:236771352\n",
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
"####################################################################################################\n",
"=== 5/5 Writing ARPA model ===\n",
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
"****************************************************************************************************\n",
"Name:lmplz\tVmPeak:23697780 kB\tVmRSS:21496 kB\tRSSMax:4963084 kB\tuser:39.0693\tsys:17.6943\tCPU:56.7637\treal:43.821\n"
]
}
],
"source": [
"!$KENLM_BUILD_PATH -o 4 < train_file.txt > kenlm_model.arpa"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/maciej/challenging-america-word-gap-prediction\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Loading the LM will be faster if you build a binary file.\n",
"Reading /home/maciej/challenging-america-word-gap-prediction/kenlm_model.arpa\n",
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
"****************************************************************************************************\n"
]
}
],
"source": [
"import os\n",
"print(os.getcwd())\n",
"model = kenlm.Model('kenlm_model.arpa')\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"def predict(before, after):\n",
" result = ''\n",
" prob = 0.0\n",
" best = []\n",
" for word in english_words_alpha_set:\n",
" text = ' '.join([before, word, after])\n",
" text_score = model.score(text, bos=False, eos=False)\n",
" if len(best) < 12:\n",
" best.append((word, text_score))\n",
" else:\n",
" is_better = False\n",
" worst_score = None\n",
" for score in best:\n",
" if not worst_score:\n",
" worst_score = score\n",
" else:\n",
" if worst_score[1] > score[1]:\n",
" worst_score = score\n",
" if worst_score[1] < text_score:\n",
" best.remove(worst_score)\n",
" best.append((word, text_score))\n",
" probs = sorted(best, key=lambda tup: tup[1], reverse=True)\n",
" pred_str = ''\n",
" for word, prob in probs:\n",
" pred_str += f'{word}:{prob} '\n",
" pred_str += f':{log10(0.99)}'\n",
" return pred_str\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"def make_prediction(path, result_path):\n",
" data = pd.read_csv(path, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
" with open(result_path, 'w', encoding='utf-8') as file_out:\n",
" for _, row in data.iterrows():\n",
" before, after = word_tokenize(data_preprocessing(str(row[6]))), word_tokenize(data_preprocessing(str(row[7])))\n",
" if len(before) < 2 or len(after) < 2:\n",
" pred = prediction\n",
" else:\n",
" pred = predict(before[-1], after[0])\n",
" file_out.write(pred + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"make_prediction(\"dev-0/in.tsv.xz\", \"dev-0/out.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"make_prediction(\"test-A/in.tsv.xz\", \"test-A/out.tsv\")"
]
}
],
"metadata": {
"interpreter": {
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
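A note on the scores written by predict() above: kenlm's model.score() returns log10 probabilities, so the numbers emitted after each candidate word are negative log-scores rather than a normalized distribution. Below is a minimal sketch of how the collected (word, score) pairs could be renormalized before being written out; the helper name normalize_scores and the 0.01 leftover mass are assumptions, not part of the committed code.

def normalize_scores(best, rest_mass=0.01):
    # best: list of (word, log10_score) pairs as collected in predict()
    # move from log10 space to linear space and share (1 - rest_mass)
    # of the probability mass among the listed words
    weights = [(word, 10 ** score) for word, score in best]
    total = sum(w for _, w in weights)
    parts = [f'{word}:{(1 - rest_mass) * w / total:.6f}' for word, w in weights]
    parts.append(f':{rest_mass}')
    return ' '.join(parts)

# e.g. normalize_scores([('the', -1.2), ('of', -1.9), ('and', -2.4)])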

202
run.py Normal file → Executable file

@@ -1,80 +1,146 @@
[removed: the previous GapEssa trigram implementation (80 lines); the same code is added unchanged as run2.py below]
#!/usr/bin/env python
# coding: utf-8

# In[2]:
from nltk import trigrams, word_tokenize
import pandas as pd
import csv
import regex as re
from collections import Counter, defaultdict
import kenlm
from english_words import english_words_alpha_set
from math import log10

# In[3]:
train_set = pd.read_csv(
    'train/in.tsv.xz',
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=35000)

train_labels = pd.read_csv(
    'train/expected.tsv',
    sep='\t',
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=35000)

# In[4]:
data = pd.concat([train_set, train_labels], axis=1)

# In[5]:
data = train_set[6] + train_set[0] + train_set[7]

# In[6]:
def data_preprocessing(text):
    return re.sub(r'\p{P}', '', text.lower().replace('-\\n', '').replace('\\n', ' ').replace("'ll", " will").replace("-", "").replace("'ve", " have").replace("'s", " is"))

# In[8]:
data = data.apply(data_preprocessing)
prediction = 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'

# In[25]:
with open("train_file.txt", "w+") as f:
    for text in data:
        f.write(text + "\n")

# In[27]:
KENLM_BUILD_PATH='../kenlm/build/bin/lmplz'

# In[28]:
get_ipython().system('$KENLM_BUILD_PATH -o 4 < train_file.txt > kenlm_model.arpa')

# In[29]:
import os
print(os.getcwd())
model = kenlm.Model('kenlm_model.arpa')

# In[30]:
def predict(before, after):
    result = ''
    prob = 0.0
    best = []
    for word in english_words_alpha_set:
        text = ' '.join([before, word, after])
        text_score = model.score(text, bos=False, eos=False)
        if len(best) < 12:
            best.append((word, text_score))
        else:
            is_better = False
            worst_score = None
            for score in best:
                if not worst_score:
                    worst_score = score
                else:
                    if worst_score[1] > score[1]:
                        worst_score = score
            if worst_score[1] < text_score:
                best.remove(worst_score)
                best.append((word, text_score))
    probs = sorted(best, key=lambda tup: tup[1], reverse=True)
    pred_str = ''
    for word, prob in probs:
        pred_str += f'{word}:{prob} '
    pred_str += f':{log10(0.99)}'
    return pred_str

# In[31]:
def make_prediction(path, result_path):
    data = pd.read_csv(path, sep='\t', header=None, quoting=csv.QUOTE_NONE)
    with open(result_path, 'w', encoding='utf-8') as file_out:
        for _, row in data.iterrows():
            before, after = word_tokenize(data_preprocessing(str(row[6]))), word_tokenize(data_preprocessing(str(row[7])))
            if len(before) < 2 or len(after) < 2:
                pred = prediction
            else:
                pred = predict(before[-1], after[0])
            file_out.write(pred + '\n')

# In[32]:
make_prediction("dev-0/in.tsv.xz", "dev-0/out.tsv")

# In[33]:
make_prediction("test-A/in.tsv.xz", "test-A/out.tsv")
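The KenLM loader above prints "Loading the LM will be faster if you build a binary file." A sketch of that conversion follows, assuming the build_binary tool was compiled alongside lmplz in ../kenlm/build/bin (not verified in this repository):

import subprocess
import kenlm

# convert the ARPA file to KenLM's binary format once, then load the binary
subprocess.run(['../kenlm/build/bin/build_binary',
                'kenlm_model.arpa', 'kenlm_model.binary'], check=True)
model = kenlm.Model('kenlm_model.binary')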

80
run2.py Normal file

@@ -0,0 +1,80 @@
from re import T
import pandas as pd
import csv
from collections import Counter, defaultdict
from nltk.tokenize import RegexpTokenizer
from nltk import trigrams
import regex as re
import lzma


class GapEssa:

    def __init__(self):
        self.alpha = 0.0001
        self.vocab = set()
        self.model = defaultdict(lambda: defaultdict(lambda: 0))
        self.tokenizer = RegexpTokenizer(r"\w+")

    def read_file(self, f, mode=0):
        for line in f:
            text = line.split("\t")
            if(mode==0):
                yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', ' '.join([text[6], text[7]]).replace("\\n"," ").replace("\n","").lower()))
            else:
                yield re.sub(r"[^\w\d'\s]+", '', re.sub(' +', ' ', text[7].replace("\\n"," ").replace("\n","").lower()))

    def train(self, f):
        with lzma.open(f, mode='rt') as file:
            for index, text in enumerate(self.read_file(file)):
                tokens = self.tokenizer.tokenize(text)
                for w1, w2, w3 in trigrams(tokens, pad_right=True, pad_left=True):
                    if w1 and w2 and w3:
                        self.model[(w2, w3)][w1] += 1
                        self.vocab.add(w1)
                        self.vocab.add(w2)
                        self.vocab.add(w3)
                if index == 40000:
                    break
        for pair in self.model:
            num_n_grams = float(sum(self.model[pair].values()))
            for word in self.model[pair]:
                self.model[pair][word] = (self.model[pair][word] + self.alpha) / (num_n_grams + self.alpha*len(self.vocab))

    def out(self, input_f, output_f):
        with open(output_f, 'w') as out_f:
            with lzma.open(input_f, mode='rt') as in_f:
                for _, text in enumerate(self.read_file(in_f, mode=1)):
                    t = self.tokenizer.tokenize(text)
                    if len(t) < 4:
                        # p = 'the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1'
                        p = 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'
                    else:
                        p = self.pred(t[0], t[1])
                    out_f.write(p + '\n')

    def pred(self, w1, w2):
        total = 0.0
        line = ''
        p = dict(self.model[w1, w2])
        m = dict(Counter(p).most_common(6))
        for word, prob in m.items():
            total += prob
            line += f'{word}:{prob} '
        if total == 0.0:
            return 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'
        if 1 - total >= 0.01:
            line += f":{1-total}"
        else:
            line += f":0.01"
        return line


wp = GapEssa()
wp.train('train/in.tsv.xz')
wp.out('dev-0/in.tsv.xz', 'dev-0/out.tsv')
wp.out('test-A/in.tsv.xz', 'test-A/out.tsv')
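run2.py estimates P(w1 | w2, w3) for the word preceding a context pair with add-alpha smoothing: (count(w1, w2, w3) + alpha) / (count(w2, w3) + alpha * |V|). A tiny self-contained illustration of that formula follows; the counts are made up and the vocabulary size is only a stand-in (the lmplz run above reports 580506 unigram types):

from collections import Counter

alpha = 0.0001
vocab_size = 580506                       # stand-in vocabulary size
context_counts = Counter({'said': 7, 'stated': 2, 'wrote': 1})   # hypothetical counts for one (w2, w3) pair
total = sum(context_counts.values())

for word, count in context_counts.items():
    smoothed = (count + alpha) / (total + alpha * vocab_size)
    print(f'{word}: {smoothed:.4f}')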

File diff suppressed because it is too large