2024-05-31 lab13-14
This commit is contained in:
parent
5bbba14a57
commit
d4038eb5ae
@ -44,7 +44,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 30,
|
||||||
"id": "familiar-terrace",
|
"id": "familiar-terrace",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
@ -120,13 +120,62 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 31,
|
||||||
|
"id": "d0970691",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"pl_dict = set()\n",
|
||||||
|
"with ZipFile('data/hunspell_pl.zip') as zipped_dictionary:\n",
|
||||||
|
" with zipped_dictionary.open('hunspell_pl.txt') as dictionary_file:\n",
|
||||||
|
" for line_bytes in dictionary_file:\n",
|
||||||
|
" line = line_bytes.decode('utf-8')\n",
|
||||||
|
" pl_dict.add(line.rstrip())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 32,
|
||||||
"id": "economic-southeast",
|
"id": "economic-southeast",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def correct_text(text):\n",
|
"def correct_text(text):\n",
|
||||||
" return []"
|
" words = text.split()\n",
|
||||||
|
"\n",
|
||||||
|
" result = []\n",
|
||||||
|
" for word in words:\n",
|
||||||
|
" if word in pl_dict:\n",
|
||||||
|
" result.append((word, \"correct\"))\n",
|
||||||
|
" else:\n",
|
||||||
|
" result.append((word, \"incorrect\"))\n",
|
||||||
|
"\n",
|
||||||
|
" return result"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 33,
|
||||||
|
"id": "771a6c40",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[('kalend', 'incorrect'),\n",
|
||||||
|
" ('kalendarz', 'correct'),\n",
|
||||||
|
" ('kaledoński', 'correct'),\n",
|
||||||
|
" ('kalejdoskopowy', 'correct'),\n",
|
||||||
|
" ('kalendarium', 'correct')]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 33,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"correct_text(\"kalend kalendarz kaledoński kalejdoskopowy kalendarium\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -168,13 +217,51 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 34,
|
||||||
"id": "built-sally",
|
"id": "built-sally",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def L1(w):\n",
|
"def L1(w):\n",
|
||||||
" return []"
|
" letters = 'abcdefghijklmnopqrstuvwxyząćęłńóśźż'\n",
|
||||||
|
" splits = [(w[:i], w[i:]) for i in range(len(w) + 1)]\n",
|
||||||
|
" \n",
|
||||||
|
" deletes = [L + R[1:] for L, R in splits if R]\n",
|
||||||
|
" transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]\n",
|
||||||
|
" replaces = [L + c + R[1:] for L, R in splits if R for c in letters]\n",
|
||||||
|
" inserts = [L + c + R for L, R in splits for c in letters]\n",
|
||||||
|
" \n",
|
||||||
|
" return set(deletes + transposes + replaces + inserts)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 35,
|
||||||
|
"id": "dc3ffbfe",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['kaqendarz',\n",
|
||||||
|
" 'kalenydarz',\n",
|
||||||
|
" 'kalendadz',\n",
|
||||||
|
" 'kalenżarz',\n",
|
||||||
|
" 'kalendlrz',\n",
|
||||||
|
" 'kalendaóz',\n",
|
||||||
|
" 'kalvendarz',\n",
|
||||||
|
" 'kalendarzv',\n",
|
||||||
|
" 'katendarz',\n",
|
||||||
|
" 'kolendarz']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 35,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"list(L1(\"kalendarz\"))[:10]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -187,13 +274,49 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 36,
|
||||||
"id": "coordinated-cooperation",
|
"id": "coordinated-cooperation",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def generate_suggestions(w):\n",
|
"def generate_suggestions(w):\n",
|
||||||
" return []"
|
" # Generate L1(w)\n",
|
||||||
|
" L1_set = L1(w)\n",
|
||||||
|
" # Generate S1(w)\n",
|
||||||
|
" S1 = L1_set.intersection(pl_dict)\n",
|
||||||
|
"\n",
|
||||||
|
" # Generate L2(w)\n",
|
||||||
|
" L2_set = set()\n",
|
||||||
|
" for v in L1_set:\n",
|
||||||
|
" L2_set.update(L1(v))\n",
|
||||||
|
" \n",
|
||||||
|
" # Generate S2(w)\n",
|
||||||
|
" S2 = L2_set.intersection(pl_dict)\n",
|
||||||
|
"\n",
|
||||||
|
" # Combine S1 and S2 and return as list\n",
|
||||||
|
" suggestions = S1.union(S2)\n",
|
||||||
|
" return list(suggestions)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 37,
|
||||||
|
"id": "e0c572ce",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['kalendarz', 'kalandar', 'kalendarzyk', 'arendarz']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 37,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"generate_suggestions(\"kalendarz\")"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -216,7 +339,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.10"
|
"version": "3.10.14"
|
||||||
},
|
},
|
||||||
"subtitle": "13,14. Korekta pisowni",
|
"subtitle": "13,14. Korekta pisowni",
|
||||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||||
|
Loading…
Reference in New Issue
Block a user