"""Dictionary-based spell checking, taken from ``lab/lab_13-14.ipynb``.

Notebook metadata: title "Komputerowe wspomaganie tłumaczenia"
(computer-aided translation), subtitle "13,14. Korekta pisowni" (spelling
correction). The metadata records Python version 3.10.14 (previously 3.8.10).

The cells below load a Polish word list into ``pl_dict``, label the words of
a text as correct/incorrect, and propose corrections that lie within one or
two single-character edits of a word.
"""

# %% Load the dictionary
# Imported here so the cell is self-contained; harmless if an earlier
# notebook cell has already imported it.
from zipfile import ZipFile

pl_dict = set()
with ZipFile('data/hunspell_pl.zip') as zipped_dictionary, \
        zipped_dictionary.open('hunspell_pl.txt') as dictionary_file:
    # Members of a zip archive are read as bytes, so every line is decoded.
    for line_bytes in dictionary_file:
        entry = line_bytes.decode('utf-8').rstrip()
        # A blank line would otherwise put '' into the dictionary, and ''
        # would then be offered as a suggestion for very short words.
        if entry:
            pl_dict.add(entry)


# %% Label every word of a text
def correct_text(text):
    """Label each whitespace-separated token of *text*.

    Args:
        text: The text to check.

    Returns:
        A list of ``(token, label)`` tuples in input order, where ``label``
        is ``"correct"`` if the token is in ``pl_dict`` and ``"incorrect"``
        otherwise. An empty or whitespace-only text yields ``[]``.

    Note:
        Tokens are compared exactly as written: the lookup is case-sensitive
        and attached punctuation is part of the token.
    """
    return [
        (word, "correct" if word in pl_dict else "incorrect")
        for word in text.split()
    ]


# %% Example (output recorded in the notebook)
# [('kalend', 'incorrect'), ('kalendarz', 'correct'),
#  ('kaledoński', 'correct'), ('kalejdoskopowy', 'correct'),
#  ('kalendarium', 'correct')]
correct_text("kalend kalendarz kaledoński kalejdoskopowy kalendarium")


# %% Strings one edit away from a word
def L1(w):
    """Return every string produced from *w* by one single-character edit.

    The edits are: deleting one character, swapping two adjacent characters,
    replacing one character, and inserting one character. Replacements and
    insertions draw from the lowercase Latin letters plus the Polish
    diacritics.

    Args:
        w: The source word.

    Returns:
        A set of candidate strings. They are not checked against the
        dictionary. The set contains *w* itself whenever *w* contains one of
        the alphabet letters, because a letter may be "replaced" by itself.
    """
    letters = 'abcdefghijklmnopqrstuvwxyząćęłńóśźż'
    # Every way of cutting w into a (head, tail) pair, empty parts included.
    splits = [(w[:i], w[i:]) for i in range(len(w) + 1)]

    deletes = [head + tail[1:] for head, tail in splits if tail]
    transposes = [
        head + tail[1] + tail[0] + tail[2:]
        for head, tail in splits
        if len(tail) > 1
    ]
    replaces = [
        head + c + tail[1:] for head, tail in splits if tail for c in letters
    ]
    inserts = [head + c + tail for head, tail in splits for c in letters]

    return set(deletes + transposes + replaces + inserts)


# %% Example: ten arbitrary members of the one-edit neighbourhood
list(L1("kalendarz"))[:10]


# %% Suggestions within two edits
def generate_suggestions(w):
    """Return the dictionary words within two single-character edits of *w*.

    Args:
        w: The word to find suggestions for.

    Returns:
        A list without duplicates: first the dictionary words one edit away
        from *w*, then the remaining ones two edits away. Each group is
        sorted in code-point order so the result is the same on every run.
        If *w* is itself a dictionary word it appears in the result.
    """
    one_edit_away = L1(w)
    s1 = one_edit_away.intersection(pl_dict)

    # Filter each neighbourhood as soon as it is generated. Materialising the
    # full two-edit candidate set first costs hundreds of thousands of
    # strings for a word of ordinary length.
    s2 = set()
    for candidate in one_edit_away:
        s2.update(L1(candidate).intersection(pl_dict))
    s2 -= s1  # keep each word only in its closest group

    return sorted(s1) + sorted(s2)


# %% Example
# The notebook recorded the suggestions 'kalendarz', 'kalandar',
# 'kalendarzyk' and 'arendarz' for this call.
generate_suggestions("kalendarz")