lab 12-14

This commit is contained in:
Bartosz Pietrzak 2024-05-28 21:18:57 +02:00
parent e02ff5ab39
commit 054d45b9ed
2 changed files with 601 additions and 25 deletions

File diff suppressed because one or more lines are too long

View File

@ -44,7 +44,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 1,
"id": "familiar-terrace",
"metadata": {
"scrolled": true
@ -120,13 +120,47 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 27,
"id": "economic-southeast",
"metadata": {},
"outputs": [],
"source": [
"def correct_text(text):\n",
" return []"
"def load_data(zip_file_path):\n",
" with ZipFile(zip_file_path) as zip_f:\n",
" with zip_f.open('hunspell_pl.txt') as f:\n",
" return set([line.strip().lower() for line in f.read().decode('utf-8').splitlines()])\n",
"\n",
"def correct_words(sentence, dictionary):\n",
" \"\"\"\n",
" 0 - incorrect\n",
" 1 - correct\n",
" \"\"\"\n",
" return [(word, 1) if word in dictionary\n",
" else (word, 0)\n",
" for word in sentence.lower().split()]\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "bba15ae8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('hakunamatata', 0)]\n",
"[('czy', 1), ('dobrze', 1), ('pisze', 0)]\n",
"[('ala', 1), ('ma', 1), ('kota', 1)]\n"
]
}
],
"source": [
"data = load_data('data/hunspell_pl.zip')\n",
"print(correct_words('Hakunamatata', data))\n",
"print(correct_words('Czy dobrze pisze', data))\n",
"print(correct_words('Ala ma kota', data))"
]
},
{
@ -168,13 +202,32 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 33,
"id": "built-sally",
"metadata": {},
"outputs": [],
"source": [
"alphabet = 'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'\n",
"\n",
"def L1(w):\n",
" return []"
" edits = set()\n",
" for i in range(len(w) + 1):\n",
" for c in alphabet:\n",
" if i < len(w):\n",
" edits.add(w[:i] + c + w[i+1:])\n",
" edits.add(w[:i] + c + w[i:])\n",
" if i < len(w):\n",
" edits.add(w[:i] + w[i+1:])\n",
" return edits\n",
"\n",
"def L2(l1):\n",
" edits = set()\n",
" for word in l1:\n",
" edits.update(L1(word))\n",
" return edits\n",
"\n",
"def S(edits, dictionary):\n",
" return set(dictionary).intersection(set(edits))"
]
},
{
@ -187,13 +240,37 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 39,
"id": "coordinated-cooperation",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['cześć'], ['mam'], ['na'], {'gajek', 'tajner', 'jasne', 'janne', 'jajem', 'janet', 'jacek', 'janez', 'banek', 'jarek', 'panek', 'ranek', 'majtek', 'danek', 'jadnak', 'jenek', 'najdek', 'jaje', 'jane', 'kajtek', 'jonek', 'jamnik', 'janik', 'pajek', 'janem', 'gajnik', 'jajnik', 'jasiek', 'garnek', 'majek', 'bajek', 'jasek', 'jasnej', 'janek', 'tajnik', 'ganek'}]\n"
]
}
],
"source": [
"def generate_suggestions(w):\n",
" return []"
"def apply_edits(sentence, dictionary):\n",
" suggestions = []\n",
" for word, is_misspelled in sentence:\n",
" if is_misspelled == 0:\n",
" words = generate_suggestions(word, dictionary)\n",
" suggestions.append(words)\n",
" else:\n",
" suggestions.append([word])\n",
" return suggestions\n",
"\n",
"def generate_suggestions(w, dictionary):\n",
" l1 = L1(w)\n",
" l2 = L2(l1)\n",
" s1 = S(l1, dictionary)\n",
" s2 = S(l2, dictionary)\n",
" return s1.union(s2)\n",
"\n",
"print(apply_edits(correct_words('Cześć mam na jajnek', data), data))"
]
}
],
@ -216,7 +293,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.0"
},
"subtitle": "13,14. Korekta pisowni",
"title": "Komputerowe wspomaganie tłumaczenia",