This commit is contained in:
fraszosia@gmail.com 2022-04-01 15:41:25 +02:00
parent 57fd77f584
commit 51220186a3
4 changed files with 15450 additions and 15452 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -71,7 +71,7 @@
"def generate_N_grams(text, ngram=1, no_punctuation=True):\n", "def generate_N_grams(text, ngram=1, no_punctuation=True):\n",
" text = re.sub(r'[\\-] ', '', text).lower()\n", " text = re.sub(r'[\\-] ', '', text).lower()\n",
" if no_punctuation:\n", " if no_punctuation:\n",
" text = re.sub(r'[\\)\\(\\.\\,\\-]', ' ', text)\n", " text = re.sub(r'[^\\w\\s]', ' ', text)\n",
" words=[word for word in text.split()]\n", " words=[word for word in text.split()]\n",
" temp=zip(*[words[i:] for i in range(0,ngram)])\n", " temp=zip(*[words[i:] for i in range(0,ngram)])\n",
" ans=[' '.join(ngram) for ngram in temp]\n", " ans=[' '.join(ngram) for ngram in temp]\n",
@@ -141,8 +141,7 @@
" if tmp_probs[i] == 1:\n", " if tmp_probs[i] == 1:\n",
" tmp_probs[i] = 0.1\n", " tmp_probs[i] = 0.1\n",
" else:\n", " else:\n",
" c = probs[word_2][min(probs[word_2].keys(), key=(lambda k: probs[word_2][k]))] / 10\n", " tmp_probs[i] = probs[word_1][i] / 5\n",
" tmp_probs[i] = probs[word_1][i] * c\n",
" else:\n", " else:\n",
" tmp_probs = probs[word_1]\n", " tmp_probs = probs[word_1]\n",
" else:\n", " else:\n",
@@ -172,7 +171,7 @@
" t = i[0]\n", " t = i[0]\n",
" t = re.sub(r'[\\-] ', '', t).lower()\n", " t = re.sub(r'[\\-] ', '', t).lower()\n",
" if True:\n", " if True:\n",
" t = re.sub(r'[\\)\\(\\.\\,\\-]', ' ', t)\n", " t = re.sub(r'[^\\w\\s]', ' ', t)\n",
" words=[word for word in t.split()]\n", " words=[word for word in t.split()]\n",
" found_words.append(find_word(words[-1], ' '.join(words[-2:])))\n", " found_words.append(find_word(words[-1], ' '.join(words[-2:])))\n",
" return found_words\n", " return found_words\n",

7
run.py
View File

@@ -39,7 +39,7 @@ def print_example(data, words, idx):
def generate_N_grams(text, ngram=1, no_punctuation=True): def generate_N_grams(text, ngram=1, no_punctuation=True):
text = re.sub(r'[\-] ', '', text).lower() text = re.sub(r'[\-] ', '', text).lower()
if no_punctuation: if no_punctuation:
text = re.sub(r'[\)\(\.\,\-]', ' ', text) text = re.sub(r'[^\w\s]', ' ', text)
words=[word for word in text.split()] words=[word for word in text.split()]
temp=zip(*[words[i:] for i in range(0,ngram)]) temp=zip(*[words[i:] for i in range(0,ngram)])
ans=[' '.join(ngram) for ngram in temp] ans=[' '.join(ngram) for ngram in temp]
@@ -86,8 +86,7 @@ def find_word(word_1, word_2):
if tmp_probs[i] == 1: if tmp_probs[i] == 1:
tmp_probs[i] = 0.1 tmp_probs[i] = 0.1
else: else:
c = probs[word_2][min(probs[word_2].keys(), key=(lambda k: probs[word_2][k]))] / 10 tmp_probs[i] = probs[word_1][i] / 5
tmp_probs[i] = probs[word_1][i] * c
else: else:
tmp_probs = probs[word_1] tmp_probs = probs[word_1]
else: else:
@@ -110,7 +109,7 @@ def find_words(data):
t = i[0] t = i[0]
t = re.sub(r'[\-] ', '', t).lower() t = re.sub(r'[\-] ', '', t).lower()
if True: if True:
t = re.sub(r'[\)\(\.\,\-]', ' ', t) t = re.sub(r'[^\w\s]', ' ', t)
words=[word for word in t.split()] words=[word for word in t.split()]
found_words.append(find_word(words[-1], ' '.join(words[-2:]))) found_words.append(find_word(words[-1], ' '.join(words[-2:])))
return found_words return found_words

File diff suppressed because it is too large Load Diff