lab 12-14

2024-05-28 21:18:57 +02:00 · 2024-05-28 21:18:57 +02:00 · 054d45b9ed
commit 054d45b9ed
parent e02ff5ab39
2 changed files with 601 additions and 25 deletions
--- a/lab/lab_12.ipynb
+++ b/lab/lab_12.ipynb
--- a/lab/lab_13-14.ipynb
+++ b/lab/lab_13-14.ipynb
@@ -44,7 +44,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
   "id": "familiar-terrace",
   "metadata": {
    "scrolled": true
@@ -120,13 +120,47 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 27,
   "id": "economic-southeast",
   "metadata": {},
   "outputs": [],
   "source": [
-    "def correct_text(text):\n",
-    "    return []"
+    "def load_data(zip_file_path):\n",
+    "    with ZipFile(zip_file_path) as zip_f:\n",
+    "        with zip_f.open('hunspell_pl.txt') as f:\n",
+    "            return set([line.strip().lower() for line in f.read().decode('utf-8').splitlines()])\n",
+    "\n",
+    "def correct_words(sentence, dictionary):\n",
+    "    \"\"\"\n",
+    "    0 - incorrect\n",
+    "    1 - correct\n",
+    "    \"\"\"\n",
+    "    return [(word, 1) if word in dictionary\n",
+    "            else (word, 0)\n",
+    "            for word in sentence.lower().split()]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "bba15ae8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[('hakunamatata', 0)]\n",
+      "[('czy', 1), ('dobrze', 1), ('pisze', 0)]\n",
+      "[('ala', 1), ('ma', 1), ('kota', 1)]\n"
+     ]
+    }
+   ],
+   "source": [
+    "data = load_data('data/hunspell_pl.zip')\n",
+    "print(correct_words('Hakunamatata', data))\n",
+    "print(correct_words('Czy dobrze pisze', data))\n",
+    "print(correct_words('Ala ma kota', data))"
   ]
  },
  {
@@ -168,13 +202,32 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 33,
   "id": "built-sally",
   "metadata": {},
   "outputs": [],
   "source": [
+    "alphabet = 'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'\n",
+    "\n",
    "def L1(w):\n",
-    "    return []"
+    "    edits = set()\n",
+    "    for i in range(len(w) + 1):\n",
+    "        for c in alphabet:\n",
+    "            if i < len(w):\n",
+    "                edits.add(w[:i] + c + w[i+1:])\n",
+    "            edits.add(w[:i] + c + w[i:])\n",
+    "        if i < len(w):\n",
+    "            edits.add(w[:i] + w[i+1:])\n",
+    "    return edits\n",
+    "\n",
+    "def L2(l1):\n",
+    "    edits = set()\n",
+    "    for word in l1:\n",
+    "        edits.update(L1(word))\n",
+    "    return edits\n",
+    "\n",
+    "def S(edits, dictionary):\n",
+    "    return set(dictionary).intersection(set(edits))"
   ]
  },
  {
@@ -187,13 +240,37 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 39,
   "id": "coordinated-cooperation",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[['cześć'], ['mam'], ['na'], {'gajek', 'tajner', 'jasne', 'janne', 'jajem', 'janet', 'jacek', 'janez', 'banek', 'jarek', 'panek', 'ranek', 'majtek', 'danek', 'jadnak', 'jenek', 'najdek', 'jaje', 'jane', 'kajtek', 'jonek', 'jamnik', 'janik', 'pajek', 'janem', 'gajnik', 'jajnik', 'jasiek', 'garnek', 'majek', 'bajek', 'jasek', 'jasnej', 'janek', 'tajnik', 'ganek'}]\n"
+     ]
+    }
+   ],
   "source": [
-    "def generate_suggestions(w):\n",
-    "    return []"
+    "def apply_edits(sentence, dictionary):\n",
+    "    suggestions = []\n",
+    "    for word, is_misspelled in sentence:\n",
+    "        if is_misspelled == 0:\n",
+    "            words = generate_suggestions(word, dictionary)\n",
+    "            suggestions.append(words)\n",
+    "        else:\n",
+    "            suggestions.append([word])\n",
+    "    return suggestions\n",
+    "\n",
+    "def generate_suggestions(w, dictionary):\n",
+    "    l1 = L1(w)\n",
+    "    l2 = L2(l1)\n",
+    "    s1 = S(l1, dictionary)\n",
+    "    s2 = S(l2, dictionary)\n",
+    "    return s1.union(s2)\n",
+    "\n",
+    "print(apply_edits(correct_words('Cześć mam na jajnek', data), data))"
   ]
  }
 ],
@@ -216,7 +293,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.11.0"
  },
  "subtitle": "13,14. Korekta pisowni",
  "title": "Komputerowe wspomaganie tłumaczenia",