lab 12-14

This commit is contained in:
Bartosz Pietrzak 2024-05-28 21:18:57 +02:00
parent e02ff5ab39
commit 054d45b9ed
2 changed files with 601 additions and 25 deletions

File diff suppressed because one or more lines are too long

View File

@ -44,7 +44,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 1,
"id": "familiar-terrace", "id": "familiar-terrace",
"metadata": { "metadata": {
"scrolled": true "scrolled": true
@ -120,13 +120,47 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 27,
"id": "economic-southeast", "id": "economic-southeast",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def correct_text(text):\n", "def load_data(zip_file_path):\n",
" return []" " with ZipFile(zip_file_path) as zip_f:\n",
" with zip_f.open('hunspell_pl.txt') as f:\n",
" return set([line.strip().lower() for line in f.read().decode('utf-8').splitlines()])\n",
"\n",
"def correct_words(sentence, dictionary):\n",
" \"\"\"\n",
" 0 - incorrect\n",
" 1 - correct\n",
" \"\"\"\n",
" return [(word, 1) if word in dictionary\n",
" else (word, 0)\n",
" for word in sentence.lower().split()]\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "bba15ae8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('hakunamatata', 0)]\n",
"[('czy', 1), ('dobrze', 1), ('pisze', 0)]\n",
"[('ala', 1), ('ma', 1), ('kota', 1)]\n"
]
}
],
"source": [
"data = load_data('data/hunspell_pl.zip')\n",
"print(correct_words('Hakunamatata', data))\n",
"print(correct_words('Czy dobrze pisze', data))\n",
"print(correct_words('Ala ma kota', data))"
] ]
}, },
{ {
@ -168,13 +202,32 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 33,
"id": "built-sally", "id": "built-sally",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"alphabet = 'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'\n",
"\n",
"def L1(w):\n", "def L1(w):\n",
" return []" " edits = set()\n",
" for i in range(len(w) + 1):\n",
" for c in alphabet:\n",
" if i < len(w):\n",
" edits.add(w[:i] + c + w[i+1:])\n",
" edits.add(w[:i] + c + w[i:])\n",
" if i < len(w):\n",
" edits.add(w[:i] + w[i+1:])\n",
" return edits\n",
"\n",
"def L2(l1):\n",
" edits = set()\n",
" for word in l1:\n",
" edits.update(L1(word))\n",
" return edits\n",
"\n",
"def S(edits, dictionary):\n",
" return set(dictionary).intersection(set(edits))"
] ]
}, },
{ {
@ -187,13 +240,37 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 39,
"id": "coordinated-cooperation", "id": "coordinated-cooperation",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['cześć'], ['mam'], ['na'], {'gajek', 'tajner', 'jasne', 'janne', 'jajem', 'janet', 'jacek', 'janez', 'banek', 'jarek', 'panek', 'ranek', 'majtek', 'danek', 'jadnak', 'jenek', 'najdek', 'jaje', 'jane', 'kajtek', 'jonek', 'jamnik', 'janik', 'pajek', 'janem', 'gajnik', 'jajnik', 'jasiek', 'garnek', 'majek', 'bajek', 'jasek', 'jasnej', 'janek', 'tajnik', 'ganek'}]\n"
]
}
],
"source": [ "source": [
"def generate_suggestions(w):\n", "def apply_edits(sentence, dictionary):\n",
" return []" " suggestions = []\n",
" for word, is_misspelled in sentence:\n",
" if is_misspelled == 0:\n",
" words = generate_suggestions(word, dictionary)\n",
" suggestions.append(words)\n",
" else:\n",
" suggestions.append([word])\n",
" return suggestions\n",
"\n",
"def generate_suggestions(w, dictionary):\n",
" l1 = L1(w)\n",
" l2 = L2(l1)\n",
" s1 = S(l1, dictionary)\n",
" s2 = S(l2, dictionary)\n",
" return s1.union(s2)\n",
"\n",
"print(apply_edits(correct_words('Cześć mam na jajnek', data), data))"
] ]
} }
], ],
@ -216,7 +293,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.8.10" "version": "3.11.0"
}, },
"subtitle": "13,14. Korekta pisowni", "subtitle": "13,14. Korekta pisowni",
"title": "Komputerowe wspomaganie tłumaczenia", "title": "Komputerowe wspomaganie tłumaczenia",