This commit is contained in:
Krystian Osiński 2024-04-23 19:49:47 +02:00
parent 9992c8d806
commit 623c3c308c
3 changed files with 808 additions and 42 deletions

View File

@ -68,9 +68,30 @@
"id": "environmental-thread", "id": "environmental-thread",
"metadata": {}, "metadata": {},
"source": [ "source": [
"### Ćwiczenie 2: Uruchom ekstraktor terminologii (wykrywacz rzeczowników) z poprzednich zajęć na każdym dokumencie z osobna. Jako wynik ekstraktora w każdym przypadku wypisz 5 najczęściej występujących rzeczowników. Wyniki działania komendy umieść w notatniku." "import nltk\n",
"nltk.download('punkt') # Pobierz tokenizator zdania, jeśli jeszcze go nie masz\n",
"\n",
"def podziel_na_zdania(nazwa_pliku):\n",
" with open(nazwa_pliku, 'r') as plik:\n",
" tekst = plik.read()\n",
" zdania = nltk.sent_tokenize(tekst)\n",
" return zdania\n",
"\n",
"nazwa_pliku = 'tekst.txt' # Zmień na nazwę swojego pliku\n",
"zdania = podziel_na_zdania(nazwa_pliku)\n",
"\n",
"for zdanie in zdania:\n",
" print(zdanie)\n",
"\n",
"### Ćwiczenie 2: Uruchom ekstraktor terminologii (wykrywacz rzeczowników) z poprzednich zajęć na każdym dokumencie z osobna. Jako wynik ekstraktora w każdym przypadku wypisz 5 najczęściej występujących rzeczowników. Wyniki działania komendy umieść w notatniku."
] ]
}, },
{
"cell_type": "code",
"execution_count": null,
"id": "79fd91c4-c22e-43a0-9842-83a33bd65fa9",
"metadata": {},
"outputs": [],
"source": []
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 17,
@ -560,7 +581,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.12.1" "version": "3.9.2"
}, },
"subtitle": "4,5. Klasyfikacja tematyczna (terminologii ciąg dalszy)", "subtitle": "4,5. Klasyfikacja tematyczna (terminologii ciąg dalszy)",
"title": "Komputerowe wspomaganie tłumaczenia", "title": "Komputerowe wspomaganie tłumaczenia",

View File

@ -87,7 +87,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 40, "execution_count": 19,
"id": "documented-hacker", "id": "documented-hacker",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -95,7 +95,7 @@
"import re\n", "import re\n",
"def find_tags(text):\n", "def find_tags(text):\n",
" pos = []\n", " pos = []\n",
" regexp = r'(<([a-zA-Z])>)|(</([a-zA-Z])>)'\n", " regexp = r'(<([_a-zA-Z]]*[^<]*)>)|(</([_a-zA-Z][^<?!]*)>)'\n",
" pattern = re.compile(regexp)\n", " pattern = re.compile(regexp)\n",
" tags = pattern.finditer(text)\n", " tags = pattern.finditer(text)\n",
" for tag in tags:\n", " for tag in tags:\n",
@ -105,23 +105,28 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 41, "execution_count": 25,
"id": "52a23469-3283-48df-ba49-4d23ba0d6088", "id": "52a23469-3283-48df-ba49-4d23ba0d6088",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"[(5, 8), (29, 33)]" "[(71, 83), (83, 91), (96, 105), (105, 115), (127, 138), (138, 151)]"
] ]
}, },
"execution_count": 41, "execution_count": 25,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"string = \"aaafo<r>m1aaaahform2uadaiudfo</r>m3oihdhdform2oahdoihwfform2oiadoafform11\"\n", "string = \"<2024.21.04| Documentation on how to connect to the database with XML: <Connection><server>Games</server><database>Call of Duty</database></Connection>\"\n",
"#<Connection>\n",
"# <server>Games</server>\n",
"# <database>Call of Duty</database>\n",
"#</Connection>\n",
"\n",
"find_tags(string)" "find_tags(string)"
] ]
}, },
@ -135,15 +140,169 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 362,
"id": "unauthorized-study", "id": "unauthorized-study",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def is_translatable(text):\n",
"    \"\"\"Tell whether a segment should be sent to translation.\n",
"\n",
"    Returns False when the whole segment is one of:\n",
"    - a version label: 'v 4.2.1', 'ver: 4.2.1', 'version: 4.2.1'\n",
"    - a list marker, i.e. one or two letters or digits closed by ')' or '.':\n",
"      'a)', 'BB.', '1.', '12)'\n",
"    - a student index number: six digits, optionally prefixed with 's'\n",
"    - a fenced Python code block (```python ... ```)\n",
"    Returns True for everything else.\n",
"    \"\"\"\n",
"    # Dots are escaped on purpose: a bare '.' matches any character, which used\n",
"    # to classify ordinary short words such as 'OK' or 'Tak' as list markers.\n",
"    version = r'v(?:er(?:sion)?)?[.:]? [0-9]+(?:\\.[0-9]+){1,2}'\n",
"    ol = r'(?:[a-zA-Z]{1,2}|[0-9]{1,2})[.)]'\n",
"    index = r's?\\d{6}'\n",
"    pyt = r\"\"\"^```python\n",
".*?\n",
"```$\"\"\"\n",
"\n",
"    regexp = fr'{version}|{ol}|{index}'\n",
"\n",
"    if re.fullmatch(regexp, text):\n",
"        return False\n",
"    # DOTALL lets '.' span the newlines inside the code block\n",
"    if re.fullmatch(pyt, text, re.DOTALL):\n",
"        return False\n",
"    return True\n"
]
},
{
"cell_type": "code",
"execution_count": 189,
"id": "30abd5b8-f6e3-4a59-8f53-5ff3b4899e7b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 189,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"is_translatable(\"Wiedza z tego przedmiotu może się dobrze przydać przy tworzeniu systemu dialogowego\")"
]
},
{
"cell_type": "code",
"execution_count": 190,
"id": "e3a39c8a-976b-4d66-834e-6d69c3833fe5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 190,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"is_translatable(\"Kliknij enter aby rozpocząć\")"
]
},
{
"cell_type": "code",
"execution_count": 185,
"id": "c879c8dd-9a1d-490b-a44f-4a06134f8521",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ver: 4.3.34 False\n",
"v 4.3.34 False\n"
]
}
],
"source": [
"x = is_translatable(\"ver: 4.3.34\")\n",
"y = is_translatable(\"v 4.3.34\")\n",
"\n",
"print(f\"ver: 4.3.34 {x}\\nv 4.3.34 {y}\")"
]
},
{
"cell_type": "code",
"execution_count": 184,
"id": "dcb91bed-63c3-4044-bb19-a2abe6a36c88",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"s444820 False\n",
"444820 False\n"
]
}
],
"source": [
"x = is_translatable(\"s444820\")\n",
"y = is_translatable(\"444820\")\n",
"print(f\"s444820 {x}\\n444820 {y}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 187,
"id": "59575516-df17-4e35-9b42-9ee69f9d8b0e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a) False\n",
"II. False\n",
"1. False\n",
"2. False\n"
]
}
],
"source": [
"a = is_translatable(\"a)\")\n",
"b = is_translatable(\"II.\")\n",
"c = is_translatable(\"1.\")\n",
"d = is_translatable(r\"2.\")\n",
"print(f\"a) {a}\\nII. {b}\\n1. {c}\\n2. {d}\")"
]
},
{
"cell_type": "code",
"execution_count": 188,
"id": "40d913e6-48aa-4ece-81b4-bbe74efb5533",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 188,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ex = r\"\"\"```python\n",
"def func():\n",
" print(HelloEarth)\n",
"func()\n",
"```\"\"\"\n",
"\n",
"is_translatable(ex)"
] ]
}, },
{ {
@ -156,18 +315,66 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 435,
"id": "beautiful-mathematics", "id": "bbb6d96e-231d-48fb-a6a5-f05cb8c84b87",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"data": {
"text/plain": [
"[[(0, 10), '21', '04', '2024'],\n",
" [(23, 33), '20', '22', '2021'],\n",
" [(54, 64), '01', '01', '1999']]"
]
},
"execution_count": 435,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"import re\n",
"\n",
"def find_dates(text):\n",
"    \"\"\"Locate numeric dates in a text.\n",
"\n",
"    Supported layouts: YYYY-MM-DD, DD-MM-YYYY, YYYY/MM/DD, DD/MM/YYYY,\n",
"    YYYY.MM.DD and DD.MM.YYYY.\n",
"\n",
"    Returns a list with one [(start, end), day, month, year] entry per date,\n",
"    in order of appearance. Day, month and year are the digit strings found\n",
"    in the text; they are not validated against the calendar.\n",
"    \"\"\"\n",
"    formats = [\n",
"        r'(\\d{4}-\\d{2}-\\d{2})',    # YYYY-MM-DD\n",
"        r'(\\d{2}-\\d{2}-\\d{4})',    # DD-MM-YYYY\n",
"\n",
"        r'(\\d{4}/\\d{2}/\\d{2})',    # YYYY/MM/DD\n",
"        r'(\\d{2}/\\d{2}/\\d{4})',    # DD/MM/YYYY\n",
"\n",
"        # The comma after each entry matters: without it Python concatenates\n",
"        # adjacent string literals and the two dotted layouts merge into one.\n",
"        r'(\\d{4}\\.\\d{2}\\.\\d{2})',  # YYYY.MM.DD\n",
"        r'(\\d{2}\\.\\d{2}\\.\\d{4})',  # DD.MM.YYYY\n",
"    ]\n",
"    pattern = re.compile('|'.join(formats))\n",
"\n",
"    answer = []\n",
"    for date in pattern.finditer(text):\n",
"        parts = re.split(r'[-./]', date.group())\n",
"\n",
"        # A four-digit first field means the year comes first\n",
"        if len(parts[0]) == 4:\n",
"            y, d = parts[0], parts[2]\n",
"        else:\n",
"            d, y = parts[0], parts[2]\n",
"        m = parts[1]\n",
"\n",
"        answer.append([date.span(), d, m, y])\n",
"\n",
"    return answer\n",
"\n",
"\n",
"find_dates(\"2024-04-21 awdad qwrwe 20/22/2021 negweg qwqwd %reset 1999/01/01\")"
] ]
}, },
{ {
@ -193,13 +400,174 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 369,
"id": "da509df1-75e4-4ae8-9b0d-5055d551b9dd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Collecting isoweek\n",
" Downloading isoweek-1.3.3-py2.py3-none-any.whl.metadata (4.4 kB)\n",
"Downloading isoweek-1.3.3-py2.py3-none-any.whl (7.1 kB)\n",
"Installing collected packages: isoweek\n",
"Successfully installed isoweek-1.3.3\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"pip install isoweek"
]
},
{
"cell_type": "code",
"execution_count": 426,
"id": "finished-essex", "id": "finished-essex",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from isoweek import Week\n",
"from datetime import datetime\n",
"\n",
"def isoWeekNumber(date_string):\n",
"    \"\"\"Return the ISO week number of a YYYY-MM-DD date as a two-digit string.\"\"\"\n",
"    date_object = datetime.strptime(date_string, \"%Y-%m-%d\")\n",
"\n",
"    # Zero-padded so that a WWYY code built from it always has four characters\n",
"    return f\"{date_object.isocalendar()[1]:02d}\"\n",
"\n",
"def change_substring(text, new, start, end):\n",
"    \"\"\"Return text with the slice text[start:end] replaced by new.\"\"\"\n",
"    head = text[:start]\n",
"    tail = text[end:]\n",
"    return head + new + tail\n",
"\n",
"def EuropeDate(target_segmnet,dates): #YYYY-MM-DD\n",
"    \"\"\"Rewrite every listed date of the segment as YYYY-MM-DD.\n",
"\n",
"    dates holds [(start, end), day, month, year] entries; each span is\n",
"    replaced in turn with year-month-day.\n",
"    \"\"\"\n",
"    rewritten = target_segmnet\n",
"    for entry in dates:\n",
"        span = entry[0]\n",
"        iso_date = entry[3]+\"-\"+entry[2]+\"-\"+entry[1]\n",
"        rewritten = change_substring(rewritten,iso_date,span[0],span[1])\n",
"    return rewritten\n",
"\n",
"def USDate(target_segmnet,dates): #MM-DD-YYYY\n",
"    \"\"\"Rewrite every listed date of the segment as MM-DD-YYYY.\n",
"\n",
"    dates holds [(start, end), day, month, year] entries; each span is\n",
"    replaced in turn with month-day-year.\n",
"    \"\"\"\n",
"    rewritten = target_segmnet\n",
"    for entry in dates:\n",
"        span = entry[0]\n",
"        us_date = entry[2]+\"-\"+entry[1]+\"-\"+entry[3]\n",
"        rewritten = change_substring(rewritten,us_date,span[0],span[1])\n",
"    return rewritten\n",
"\n",
"def tyreDOTDate(target_segmnet,dates): #WWYY (weekYear)\n",
"    \"\"\"Rewrite every listed date of the segment as a tyre DOT code (WWYY).\n",
"\n",
"    WW is the two-digit ISO week number and YY the last two digits of the year.\n",
"    dates holds [(start, end), day, month, year] entries whose spans refer to\n",
"    target_segmnet; they are assumed to be listed in order of appearance.\n",
"    \"\"\"\n",
"    # Work from the last date to the first: the code is shorter than the date\n",
"    # it replaces, so only the text after the current span shifts and the\n",
"    # spans still to be processed stay valid.\n",
"    for date in reversed(dates):\n",
"        d = date[3]+\"-\"+date[2]+\"-\"+date[1]\n",
"        # zfill keeps the week at two digits (week 5 -> '05')\n",
"        code = isoWeekNumber(d).zfill(2) + date[3][2:]\n",
"        target_segmnet = change_substring(target_segmnet,code,date[0][0],date[0][1])\n",
"    return target_segmnet\n",
"\n",
"\n",
"def correct_dates(source_segment, target_segment, date_format):\n",
"    \"\"\"Rewrite the dates of target_segment in the requested format.\n",
"\n",
"    Dates are located with find_dates in both segments and compared pairwise\n",
"    (by day) before anything is rewritten.\n",
"\n",
"    date_format is one of 'Europe' (YYYY-MM-DD), 'US' (MM-DD-YYYY) or\n",
"    'tyre-dot' (WWYY).\n",
"\n",
"    Returns the rewritten target segment, or an error message when the two\n",
"    segments hold a different number of dates, when a pair of dates differs\n",
"    in the day, or when date_format is not recognised.\n",
"    \"\"\"\n",
"    ss = find_dates(source_segment)\n",
"    ts = find_dates(target_segment)\n",
"    if len(ss) != len(ts):\n",
"        return 'Błąd! Niezgodna liczba dat!'\n",
"\n",
"    for source_date, target_date in zip(ss, ts):\n",
"        if source_date[1] != target_date[1]:\n",
"            # Returned (not printed) so that every error path yields a message\n",
"            return (\n",
"                \"Rozbieżny dzień w dacie!\\n\"\n",
"                f\"source_segment: {source_date}\\n\"\n",
"                f\"target_segment: {target_date}\"\n",
"            )\n",
"\n",
"    if date_format == \"Europe\":\n",
"        return EuropeDate(target_segment,ts)\n",
"    elif date_format == \"US\":\n",
"        return USDate(target_segment,ts)\n",
"    elif date_format == \"tyre-dot\":\n",
"        return tyreDOTDate(target_segment,ts)\n",
"    else:\n",
"        return \"nierozpoznawalny format żądanej daty\"\n",
"\n",
"source=\"Moje urodziny: 06/07/1999\\n moje najbliższe imieniny: 2024/12/04\"\n",
"target=\"My birthday: 06/07/1999\\nmy forthcoming name day: 2024/12/04.\""
]
},
{
"cell_type": "code",
"execution_count": 431,
"id": "05c351cb-c414-426a-9499-37886d943834",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Europe-format\n",
"---\n",
"My birthday: 1999-07-06\n",
"my forthcoming name day: 2024-12-04.\n"
]
}
],
"source": [
"print(\"Europe-format\\n---\")\n",
"print(correct_dates(source,target,\"Europe\"))"
]
},
{
"cell_type": "code",
"execution_count": 432,
"id": "b0c34609-cbd1-44ec-9c3a-191f0400d1fc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"US-format\n",
"---\n",
"My birthday: 07-06-1999\n",
"my forthcoming name day: 12-04-2024.\n"
]
}
],
"source": [
"print(\"US-format\\n---\")\n",
"print(correct_dates(source,target,\"US\"))"
]
},
{
"cell_type": "code",
"execution_count": 434,
"id": "c0808108-65f9-4025-b6e7-ad06fc06a4df",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tire-Dot-Format\n",
"---\n",
"My birthday: 2799\n",
"my forthcoming name day: 4924.\n"
]
}
],
"source": [
"print(\"Tire-Dot-Format\\n---\")\n",
"print(correct_dates(source,target,\"tyre-dot\"))"
]
},
{
"cell_type": "code",
"execution_count": 433,
"id": "9be67593-0ada-423b-97a9-9b4dcb3e9fa1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Błąd! Niezgodna liczba dat!\n"
]
}
],
"source": [
"# Niezgodna liczba dat\n",
"print(correct_dates(target,\"My birthday: \\nmy forthcoming name day: 2024/12/04\",\"Europe\")) "
] ]
}, },
{ {
@ -244,13 +612,135 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 673,
"id": "romance-judge", "id": "romance-judge",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"difference: 1.125\n"
]
},
{
"data": {
"text/plain": [
"\"Hi, I'm <b>Krystian Osiński </b>and I'm 24 <i>years </i>old.\""
]
},
"execution_count": 673,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"import math\n",
"\n",
"def transfer_tags(source_segment, target_segment):\n",
"    \"\"\"Copy the inline tags of source_segment into target_segment.\n",
"\n",
"    Both segments are handled as space-separated tokens. Each tag keeps the\n",
"    token position it has in the source; the position is scaled by\n",
"    len(target words) / len(source words), which leaves it unchanged when both\n",
"    sides have the same number of words. In the result an opening tag is\n",
"    glued to the word that follows it and a closing tag to the word before it.\n",
"\n",
"    Returns the target segment with the tags inserted.\n",
"    \"\"\"\n",
"    tag_re = r'<[^>]+>'\n",
"\n",
"    # Source tokens (words and tags), whitespace separators dropped\n",
"    tokens = [token for token in re.split(r'(\\s|<[^>]+>)', source_segment) if token.strip() != \"\"]\n",
"    tags = [[index, token] for index, token in enumerate(tokens) if re.search(tag_re, token)]\n",
"\n",
"    plain_s = re.sub(tag_re, '', source_segment).split(\" \")\n",
"    t_tokens = target_segment.split(\" \")\n",
"\n",
"    ratio = len(t_tokens) / len(plain_s)\n",
"    for position, tag in tags:\n",
"        t_tokens.insert(math.floor(position * ratio), tag)\n",
"\n",
"    pieces = []\n",
"    glue_next = True  # nothing precedes the first token, so no separator\n",
"    for token in t_tokens:\n",
"        # Only whole-token tags count; a word such as 'a/b' stays a word\n",
"        is_tag = re.fullmatch(tag_re, token) is not None\n",
"        is_closing = is_tag and token.startswith(\"</\")\n",
"        if not glue_next and not is_closing:\n",
"            pieces.append(\" \")\n",
"        pieces.append(token)\n",
"        glue_next = is_tag and not is_closing\n",
"\n",
"    return \"\".join(pieces).strip()\n",
"\n",
"PLXML = \"Cześć, jestem <b>Krystian Osiński</b> i mam <i>24</i> lata.\"\n",
"EN = \"Hi, I'm Krystian Osiński and I'm 24 years old.\"\n",
"transfer_tags(PLXML,EN)"
]
},
{
"cell_type": "code",
"execution_count": 674,
"id": "25c52d74-d893-4c64-a637-baede2f85059",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"difference: 1.1111111111111112\n"
]
},
{
"data": {
"text/plain": [
"\"Hi, I'm <b>Krystian <i>Osiński </i></b>and I'm almost <i>24 years </i>old.\""
]
},
"execution_count": 674,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PLXML = \"Cześć, jestem <b>Krystian <i>Osiński</i></b> i mam <i>prawie 24</i> lata.\"\n",
"EN = \"Hi, I'm Krystian Osiński and I'm almost 24 years old.\"\n",
"transfer_tags(PLXML,EN)"
]
},
{
"cell_type": "code",
"execution_count": 675,
"id": "3ffb97dd-e806-4b28-8b49-25c7e044758f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"difference: 1.125\n"
]
},
{
"data": {
"text/plain": [
"'I like <b>eating </b>donuts and drinking very hot <i>coffee </i>'"
]
},
"execution_count": 675,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PLXML = \"Lubię jeść <b>pączki</b> i pić bardzo gorącą <i>kawę</i>\"\n",
"EN = \"I like eating donuts and drinking very hot coffee\"\n",
"transfer_tags(PLXML,EN)"
] ]
} }
], ],

File diff suppressed because one or more lines are too long