This commit is contained in:
fraszosia@gmail.com 2022-04-01 15:41:25 +02:00
parent 57fd77f584
commit 51220186a3
4 changed files with 15450 additions and 15452 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -71,7 +71,7 @@
"def generate_N_grams(text, ngram=1, no_punctuation=True):\n",
" text = re.sub(r'[\\-] ', '', text).lower()\n",
" if no_punctuation:\n",
" text = re.sub(r'[\\)\\(\\.\\,\\-]', ' ', text)\n",
" text = re.sub(r'[^\\w\\s]', ' ', text)\n",
" words=[word for word in text.split()]\n",
" temp=zip(*[words[i:] for i in range(0,ngram)])\n",
" ans=[' '.join(ngram) for ngram in temp]\n",
@@ -141,8 +141,7 @@
" if tmp_probs[i] == 1:\n",
" tmp_probs[i] = 0.1\n",
" else:\n",
" c = probs[word_2][min(probs[word_2].keys(), key=(lambda k: probs[word_2][k]))] / 10\n",
" tmp_probs[i] = probs[word_1][i] * c\n",
" tmp_probs[i] = probs[word_1][i] / 5\n",
" else:\n",
" tmp_probs = probs[word_1]\n",
" else:\n",
@@ -172,7 +171,7 @@
" t = i[0]\n",
" t = re.sub(r'[\\-] ', '', t).lower()\n",
" if True:\n",
" t = re.sub(r'[\\)\\(\\.\\,\\-]', ' ', t)\n",
" t = re.sub(r'[^\\w\\s]', ' ', t)\n",
" words=[word for word in t.split()]\n",
" found_words.append(find_word(words[-1], ' '.join(words[-2:])))\n",
" return found_words\n",

7
run.py
View File

@@ -39,7 +39,7 @@ def print_example(data, words, idx):
def generate_N_grams(text, ngram=1, no_punctuation=True):
text = re.sub(r'[\-] ', '', text).lower()
if no_punctuation:
text = re.sub(r'[\)\(\.\,\-]', ' ', text)
text = re.sub(r'[^\w\s]', ' ', text)
words=[word for word in text.split()]
temp=zip(*[words[i:] for i in range(0,ngram)])
ans=[' '.join(ngram) for ngram in temp]
@@ -86,8 +86,7 @@ def find_word(word_1, word_2):
if tmp_probs[i] == 1:
tmp_probs[i] = 0.1
else:
c = probs[word_2][min(probs[word_2].keys(), key=(lambda k: probs[word_2][k]))] / 10
tmp_probs[i] = probs[word_1][i] * c
tmp_probs[i] = probs[word_1][i] / 5
else:
tmp_probs = probs[word_1]
else:
@@ -110,7 +109,7 @@ def find_words(data):
t = i[0]
t = re.sub(r'[\-] ', '', t).lower()
if True:
t = re.sub(r'[\)\(\.\,\-]', ' ', t)
t = re.sub(r'[^\w\s]', ' ', t)
words=[word for word in t.split()]
found_words.append(find_word(words[-1], ' '.join(words[-2:])))
return found_words

File diff suppressed because it is too large Load Diff