forked from bfijalkowski/KWT-2024
lab 12-14
This commit is contained in:
parent
e02ff5ab39
commit
054d45b9ed
527
lab/lab_12.ipynb
527
lab/lab_12.ipynb
File diff suppressed because one or more lines are too long
@ -44,7 +44,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 1,
|
||||||
"id": "familiar-terrace",
|
"id": "familiar-terrace",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
@ -120,13 +120,47 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 27,
|
||||||
"id": "economic-southeast",
|
"id": "economic-southeast",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def correct_text(text):\n",
|
"def load_data(zip_file_path):\n",
|
||||||
" return []"
|
" with ZipFile(zip_file_path) as zip_f:\n",
|
||||||
|
" with zip_f.open('hunspell_pl.txt') as f:\n",
|
||||||
|
" return set([line.strip().lower() for line in f.read().decode('utf-8').splitlines()])\n",
|
||||||
|
"\n",
|
||||||
|
"def correct_words(sentence, dictionary):\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" 0 - incorrect\n",
|
||||||
|
" 1 - correct\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" return [(word, 1) if word in dictionary\n",
|
||||||
|
" else (word, 0)\n",
|
||||||
|
" for word in sentence.lower().split()]\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 28,
|
||||||
|
"id": "bba15ae8",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[('hakunamatata', 0)]\n",
|
||||||
|
"[('czy', 1), ('dobrze', 1), ('pisze', 0)]\n",
|
||||||
|
"[('ala', 1), ('ma', 1), ('kota', 1)]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"data = load_data('data/hunspell_pl.zip')\n",
|
||||||
|
"print(correct_words('Hakunamatata', data))\n",
|
||||||
|
"print(correct_words('Czy dobrze pisze', data))\n",
|
||||||
|
"print(correct_words('Ala ma kota', data))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -168,13 +202,32 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 33,
|
||||||
"id": "built-sally",
|
"id": "built-sally",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"alphabet = 'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'\n",
|
||||||
|
"\n",
|
||||||
"def L1(w):\n",
|
"def L1(w):\n",
|
||||||
" return []"
|
" edits = set()\n",
|
||||||
|
" for i in range(len(w) + 1):\n",
|
||||||
|
" for c in alphabet:\n",
|
||||||
|
" if i < len(w):\n",
|
||||||
|
" edits.add(w[:i] + c + w[i+1:])\n",
|
||||||
|
" edits.add(w[:i] + c + w[i:])\n",
|
||||||
|
" if i < len(w):\n",
|
||||||
|
" edits.add(w[:i] + w[i+1:])\n",
|
||||||
|
" return edits\n",
|
||||||
|
"\n",
|
||||||
|
"def L2(l1):\n",
|
||||||
|
" edits = set()\n",
|
||||||
|
" for word in l1:\n",
|
||||||
|
" edits.update(L1(word))\n",
|
||||||
|
" return edits\n",
|
||||||
|
"\n",
|
||||||
|
"def S(edits, dictionary):\n",
|
||||||
|
" return set(dictionary).intersection(set(edits))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -187,13 +240,37 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 39,
|
||||||
"id": "coordinated-cooperation",
|
"id": "coordinated-cooperation",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[['cześć'], ['mam'], ['na'], {'gajek', 'tajner', 'jasne', 'janne', 'jajem', 'janet', 'jacek', 'janez', 'banek', 'jarek', 'panek', 'ranek', 'majtek', 'danek', 'jadnak', 'jenek', 'najdek', 'jaje', 'jane', 'kajtek', 'jonek', 'jamnik', 'janik', 'pajek', 'janem', 'gajnik', 'jajnik', 'jasiek', 'garnek', 'majek', 'bajek', 'jasek', 'jasnej', 'janek', 'tajnik', 'ganek'}]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"def generate_suggestions(w):\n",
|
"def apply_edits(sentence, dictionary):\n",
|
||||||
" return []"
|
" suggestions = []\n",
|
||||||
|
" for word, is_misspelled in sentence:\n",
|
||||||
|
" if is_misspelled == 0:\n",
|
||||||
|
" words = generate_suggestions(word, dictionary)\n",
|
||||||
|
" suggestions.append(words)\n",
|
||||||
|
" else:\n",
|
||||||
|
" suggestions.append([word])\n",
|
||||||
|
" return suggestions\n",
|
||||||
|
"\n",
|
||||||
|
"def generate_suggestions(w, dictionary):\n",
|
||||||
|
" l1 = L1(w)\n",
|
||||||
|
" l2 = L2(l1)\n",
|
||||||
|
" s1 = S(l1, dictionary)\n",
|
||||||
|
" s2 = S(l2, dictionary)\n",
|
||||||
|
" return s1.union(s2)\n",
|
||||||
|
"\n",
|
||||||
|
"print(apply_edits(correct_words('Cześć mam na jajnek', data), data))"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -216,7 +293,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.10"
|
"version": "3.11.0"
|
||||||
},
|
},
|
||||||
"subtitle": "13,14. Korekta pisowni",
|
"subtitle": "13,14. Korekta pisowni",
|
||||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||||
|
Loading…
Reference in New Issue
Block a user