forked from bfijalkowski/KWT-2024
lab 12-14
This commit is contained in:
parent
e02ff5ab39
commit
054d45b9ed
527
lab/lab_12.ipynb
527
lab/lab_12.ipynb
File diff suppressed because one or more lines are too long
@ -44,7 +44,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 1,
|
||||
"id": "familiar-terrace",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
@ -120,13 +120,47 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 27,
|
||||
"id": "economic-southeast",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def correct_text(text):\n",
|
||||
" return []"
|
||||
"def load_data(zip_file_path):\n",
|
||||
" with ZipFile(zip_file_path) as zip_f:\n",
|
||||
" with zip_f.open('hunspell_pl.txt') as f:\n",
|
||||
" return set([line.strip().lower() for line in f.read().decode('utf-8').splitlines()])\n",
|
||||
"\n",
|
||||
"def correct_words(sentence, dictionary):\n",
|
||||
" \"\"\"\n",
|
||||
" 0 - incorrect\n",
|
||||
" 1 - correct\n",
|
||||
" \"\"\"\n",
|
||||
" return [(word, 1) if word in dictionary\n",
|
||||
" else (word, 0)\n",
|
||||
" for word in sentence.lower().split()]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "bba15ae8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[('hakunamatata', 0)]\n",
|
||||
"[('czy', 1), ('dobrze', 1), ('pisze', 0)]\n",
|
||||
"[('ala', 1), ('ma', 1), ('kota', 1)]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data = load_data('data/hunspell_pl.zip')\n",
|
||||
"print(correct_words('Hakunamatata', data))\n",
|
||||
"print(correct_words('Czy dobrze pisze', data))\n",
|
||||
"print(correct_words('Ala ma kota', data))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -168,13 +202,32 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 33,
|
||||
"id": "built-sally",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"alphabet = 'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'\n",
|
||||
"\n",
|
||||
"def L1(w):\n",
|
||||
" return []"
|
||||
" edits = set()\n",
|
||||
" for i in range(len(w) + 1):\n",
|
||||
" for c in alphabet:\n",
|
||||
" if i < len(w):\n",
|
||||
" edits.add(w[:i] + c + w[i+1:])\n",
|
||||
" edits.add(w[:i] + c + w[i:])\n",
|
||||
" if i < len(w):\n",
|
||||
" edits.add(w[:i] + w[i+1:])\n",
|
||||
" return edits\n",
|
||||
"\n",
|
||||
"def L2(l1):\n",
|
||||
" edits = set()\n",
|
||||
" for word in l1:\n",
|
||||
" edits.update(L1(word))\n",
|
||||
" return edits\n",
|
||||
"\n",
|
||||
"def S(edits, dictionary):\n",
|
||||
" return set(dictionary).intersection(set(edits))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -187,13 +240,37 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 39,
|
||||
"id": "coordinated-cooperation",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[['cześć'], ['mam'], ['na'], {'gajek', 'tajner', 'jasne', 'janne', 'jajem', 'janet', 'jacek', 'janez', 'banek', 'jarek', 'panek', 'ranek', 'majtek', 'danek', 'jadnak', 'jenek', 'najdek', 'jaje', 'jane', 'kajtek', 'jonek', 'jamnik', 'janik', 'pajek', 'janem', 'gajnik', 'jajnik', 'jasiek', 'garnek', 'majek', 'bajek', 'jasek', 'jasnej', 'janek', 'tajnik', 'ganek'}]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def generate_suggestions(w):\n",
|
||||
" return []"
|
||||
"def apply_edits(sentence, dictionary):\n",
|
||||
" suggestions = []\n",
|
||||
" for word, is_misspelled in sentence:\n",
|
||||
" if is_misspelled == 0:\n",
|
||||
" words = generate_suggestions(word, dictionary)\n",
|
||||
" suggestions.append(words)\n",
|
||||
" else:\n",
|
||||
" suggestions.append([word])\n",
|
||||
" return suggestions\n",
|
||||
"\n",
|
||||
"def generate_suggestions(w, dictionary):\n",
|
||||
" l1 = L1(w)\n",
|
||||
" l2 = L2(l1)\n",
|
||||
" s1 = S(l1, dictionary)\n",
|
||||
" s2 = S(l2, dictionary)\n",
|
||||
" return s1.union(s2)\n",
|
||||
"\n",
|
||||
"print(apply_edits(correct_words('Cześć mam na jajnek', data), data))"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -216,7 +293,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.11.0"
|
||||
},
|
||||
"subtitle": "13,14. Korekta pisowni",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
Loading…
Reference in New Issue
Block a user