2024-05-31 lab13-14

2024-05-31 20:52:27 +02:00 · 2024-05-31 20:52:27 +02:00 · d4038eb5ae
commit d4038eb5ae
parent 5bbba14a57
1 changed file with 131 additions and 8 deletions
--- a/lab/lab_13-14.ipynb
+++ b/lab/lab_13-14.ipynb
@@ -44,7 +44,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 30,
   "id": "familiar-terrace",
   "metadata": {
    "scrolled": true
@@ -120,13 +120,62 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 31,
+   "id": "d0970691",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pl_dict = set()\n",
+    "with ZipFile('data/hunspell_pl.zip') as zipped_dictionary:\n",
+    "    with zipped_dictionary.open('hunspell_pl.txt') as dictionary_file:\n",
+    "        for line_bytes in dictionary_file:\n",
+    "            line = line_bytes.decode('utf-8')\n",
+    "            pl_dict.add(line.rstrip())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
   "id": "economic-southeast",
   "metadata": {},
   "outputs": [],
   "source": [
    "def correct_text(text):\n",
-    "    return []"
+    "    words = text.split()\n",
+    "\n",
+    "    result = []\n",
+    "    for word in words:\n",
+    "        if word in pl_dict:\n",
+    "            result.append((word, \"correct\"))\n",
+    "        else:\n",
+    "            result.append((word, \"incorrect\"))\n",
+    "\n",
+    "    return result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "771a6c40",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('kalend', 'incorrect'),\n",
+       " ('kalendarz', 'correct'),\n",
+       " ('kaledoński', 'correct'),\n",
+       " ('kalejdoskopowy', 'correct'),\n",
+       " ('kalendarium', 'correct')]"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "correct_text(\"kalend kalendarz kaledoński kalejdoskopowy kalendarium\")"
   ]
  },
  {
@@ -168,13 +217,51 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 34,
   "id": "built-sally",
   "metadata": {},
   "outputs": [],
   "source": [
    "def L1(w):\n",
-    "    return []"
+    "    letters = 'abcdefghijklmnopqrstuvwxyząćęłńóśźż'\n",
+    "    splits = [(w[:i], w[i:]) for i in range(len(w) + 1)]\n",
+    "    \n",
+    "    deletes = [L + R[1:] for L, R in splits if R]\n",
+    "    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]\n",
+    "    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]\n",
+    "    inserts = [L + c + R for L, R in splits for c in letters]\n",
+    "    \n",
+    "    return set(deletes + transposes + replaces + inserts)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "dc3ffbfe",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['kaqendarz',\n",
+       " 'kalenydarz',\n",
+       " 'kalendadz',\n",
+       " 'kalenżarz',\n",
+       " 'kalendlrz',\n",
+       " 'kalendaóz',\n",
+       " 'kalvendarz',\n",
+       " 'kalendarzv',\n",
+       " 'katendarz',\n",
+       " 'kolendarz']"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(L1(\"kalendarz\"))[:10]"
   ]
  },
  {
@@ -187,13 +274,49 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 36,
   "id": "coordinated-cooperation",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_suggestions(w):\n",
-    "    return []"
+    "    # Generate L1(w)\n",
+    "    L1_set = L1(w)\n",
+    "    # Generate S1(w)\n",
+    "    S1 = L1_set.intersection(pl_dict)\n",
+    "\n",
+    "    # Generate L2(w)\n",
+    "    L2_set = set()\n",
+    "    for v in L1_set:\n",
+    "        L2_set.update(L1(v))\n",
+    "    \n",
+    "    # Generate S2(w)\n",
+    "    S2 = L2_set.intersection(pl_dict)\n",
+    "\n",
+    "    # Combine S1 and S2 and return as list\n",
+    "    suggestions = S1.union(S2)\n",
+    "    return list(suggestions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "id": "e0c572ce",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['kalendarz', 'kalandar', 'kalendarzyk', 'arendarz']"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "generate_suggestions(\"kalendarz\")"
   ]
  }
 ],
@@ -216,7 +339,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.14"
  },
  "subtitle": "13,14. Korekta pisowni",
  "title": "Komputerowe wspomaganie tłumaczenia",