add lab8

2024-04-23 19:49:47 +02:00 · 2024-04-23 19:49:47 +02:00 · 623c3c308c
commit 623c3c308c
parent 9992c8d806
3 changed files with 808 additions and 42 deletions
--- a/lab/lab_04-05.ipynb
+++ b/lab/lab_04-05.ipynb
@ -68,9 +68,30 @@
   "id": "environmental-thread",
   "metadata": {},
   "source": [
-    "### Ćwiczenie 2: Uruchom ekstraktor terminologii (wykrywacz rzeczowników) z poprzednich zajęć na każdym dokumencie z osobna. Jako wynik ekstraktora w każdym przypadku wypisz 5 najczęściej występujących rzeczowników. Wyniki działania komendy umieść w notatniku."
+    "import nltk\n",
+    "nltk.download('punkt')  # Pobierz tokenizator zdania, jeśli jeszcze go nie masz\n",
+    "\n",
+    "def podziel_na_zdania(nazwa_pliku):\n",
+    "    with open(nazwa_pliku, 'r') as plik:\n",
+    "        tekst = plik.read()\n",
+    "        zdania = nltk.sent_tokenize(tekst)\n",
+    "        return zdania\n",
+    "\n",
+    "nazwa_pliku = 'tekst.txt'  # Zmień na nazwę swojego pliku\n",
+    "zdania = podziel_na_zdania(nazwa_pliku)\n",
+    "\n",
+    "for zdanie in zdania:\n",
+    "    print(zdanie)\n",
+    "\n",
+    "### Ćwiczenie 2: Uruchom ekstraktor terminologii (wykrywacz rzeczowników) z poprzednich zajęć na każdym dokumencie z osobna. Jako wynik ekstraktora w każdym przypadku wypisz 5 najczęściej występujących rzeczowników. Wyniki działania komendy umieść w notatniku."
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79fd91c4-c22e-43a0-9842-83a33bd65fa9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
  {
   "cell_type": "code",
   "execution_count": 17,
@ -560,7 +581,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.12.1"
+   "version": "3.9.2"
  },
  "subtitle": "4,5. Klasyfikacja tematyczna (terminologii ciąg dalszy)",
  "title": "Komputerowe wspomaganie tłumaczenia",
--- a/lab/lab_06-07.ipynb
+++ b/lab/lab_06-07.ipynb
@ -87,7 +87,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 19,
   "id": "documented-hacker",
   "metadata": {},
   "outputs": [],
@ -95,7 +95,7 @@
    "import re\n",
    "def find_tags(text):\n",
    "    pos = []\n",
-    "    regexp = r'(<([a-zA-Z])>)|(</([a-zA-Z])>)'\n",
+    "    regexp = r'(<([_a-zA-Z]]*[^<]*)>)|(</([_a-zA-Z][^<?!]*)>)'\n",
    "    pattern = re.compile(regexp)\n",
    "    tags = pattern.finditer(text)\n",
    "    for tag in tags:\n",
@ -105,23 +105,28 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 25,
   "id": "52a23469-3283-48df-ba49-4d23ba0d6088",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "[(5, 8), (29, 33)]"
+       "[(71, 83), (83, 91), (96, 105), (105, 115), (127, 138), (138, 151)]"
      ]
     },
-     "execution_count": 41,
+     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "string = \"aaafo<r>m1aaaahform2uadaiudfo</r>m3oihdhdform2oahdoihwfform2oiadoafform11\"\n",
+    "string = \"<2024.21.04| Documentation on how to connect to the database with XML: <Connection><server>Games</server><database>Call of Duty</database></Connection>\"\n",
+    "#<Connection>\n",
+    "#    <server>Games</server>\n",
+    "#    <database>Call of Duty</database>\n",
+    "#</Connection>\n",
+    "\n",
    "find_tags(string)"
   ]
  },
@ -135,15 +140,169 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 362,
   "id": "unauthorized-study",
   "metadata": {},
   "outputs": [],
   "source": [
    "def is_translatable(text):\n",
-    "    # 4.2.1\n",
-    "    # \n",
-    "    return True"
+    "    # v 4.2.1 / version: 4.2.1\n",
+    "    # a) B) / aa) BB) / a. BB. / aa. BB. / 1. 2. / 2) 3)\n",
+    "    # przykłady kodu python\n",
+    "    # nr_indeksu (6 cyfr | z/lub bez poprzedzającego znaku 's')\n",
+    "    \n",
+    "    version = r'(^v [0-9]\\.[0-9]\\.?[0-9]?[0-9]?$)|(^ver(:*|.*) [0-9]\\.[0-9]\\.?[0-9]?[0-9]?$)'\n",
+    "    ol = r'(^[a-zA-Z][a-zA-Z]?(\\)|\\.)$)|(^[0-9][0-9]?(\\)|\\.)$)'\n",
+    "    index = r'(^s?\\d{6}$)'\n",
+    "    pyt = r\"\"\"^```python\n",
+    ".*?\n",
+    "```$\"\"\"\n",
+    "    \n",
+    "    regexp = fr'{version}|{ol}|{index}'\n",
+    "    \n",
+    "    if re.fullmatch(regexp,text):\n",
+    "        return False\n",
+    "    elif re.fullmatch(pyt,text,re.DOTALL):\n",
+    "        return False\n",
+    "    else:\n",
+    "        return True\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 189,
+   "id": "30abd5b8-f6e3-4a59-8f53-5ff3b4899e7b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 189,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "is_translatable(\"Wiedza z tego przedmiotu może się dobrze przydać przy tworzeniu systemu dialogowego\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 190,
+   "id": "e3a39c8a-976b-4d66-834e-6d69c3833fe5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 190,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "is_translatable(\"Kliknij enter aby rozpocząć\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 185,
+   "id": "c879c8dd-9a1d-490b-a44f-4a06134f8521",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ver: 4.3.34 False\n",
+      "v 4.3.34 False\n"
+     ]
+    }
+   ],
+   "source": [
+    "x = is_translatable(\"ver: 4.3.34\")\n",
+    "y = is_translatable(\"v 4.3.34\")\n",
+    "\n",
+    "print(f\"ver: 4.3.34 {x}\\nv 4.3.34 {y}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 184,
+   "id": "dcb91bed-63c3-4044-bb19-a2abe6a36c88",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "s444820 False\n",
+      "444820 False\n"
+     ]
+    }
+   ],
+   "source": [
+    "x = is_translatable(\"s444820\")\n",
+    "y = is_translatable(\"444820\")\n",
+    "print(f\"s444820 {x}\\n444820 {y}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 187,
+   "id": "59575516-df17-4e35-9b42-9ee69f9d8b0e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "a) False\n",
+      "II. False\n",
+      "1. False\n",
+      "2. False\n"
+     ]
+    }
+   ],
+   "source": [
+    "a = is_translatable(\"a)\")\n",
+    "b = is_translatable(\"II.\")\n",
+    "c = is_translatable(\"1.\")\n",
+    "d = is_translatable(r\"2.\")\n",
+    "print(f\"a) {a}\\nII. {b}\\n1. {c}\\n2. {d}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 188,
+   "id": "40d913e6-48aa-4ece-81b4-bbe74efb5533",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "False"
+      ]
+     },
+     "execution_count": 188,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ex = r\"\"\"```python\n",
+    "def func():\n",
+    "    print(HelloEarth)\n",
+    "func()\n",
+    "```\"\"\"\n",
+    "\n",
+    "is_translatable(ex)"
   ]
  },
  {
@ -156,18 +315,66 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
-   "id": "beautiful-mathematics",
+   "execution_count": 435,
+   "id": "bbb6d96e-231d-48fb-a6a5-f05cb8c84b87",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[[(0, 10), '21', '04', '2024'],\n",
+       " [(23, 33), '20', '22', '2021'],\n",
+       " [(54, 64), '01', '01', '1999']]"
+      ]
+     },
+     "execution_count": 435,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
+    "import re\n",
+    "\n",
    "def find_dates(text):\n",
-    "    #YYYY-MM-DD\n",
-    "    #DD-MM-YYYY\n",
-    "    #\n",
-    "    #\n",
-    "    #\n",
-    "    return []"
+    "    formats = [\n",
+    "        r'(\\d{4}-\\d{2}-\\d{2})',   # YYYY-MM-DD\n",
+    "        r'(\\d{2}-\\d{2}-\\d{4})',   # DD-MM-YYYY\n",
+    "        \n",
+    "        r'(\\d{4}/\\d{2}/\\d{2})',   # YYYY/MM/DD\n",
+    "        r'(\\d{2}/\\d{2}/\\d{4})',   # DD/MM/YYYY\n",
+    "        \n",
+    "        r'(\\d{4}\\.\\d{2}\\.\\d{2})', # YYYY.MM.DD\n",
+    "        r'(\\d{2}\\.\\d{2}\\.\\d{4})', # DD.MM.YYYY\n",
+    "    ]\n",
+    "\n",
+    "    regexp = ''\n",
+    "    for form in formats:\n",
+    "        if regexp:\n",
+    "            regexp += \"|\"+form\n",
+    "        else:\n",
+    "            regexp += form\n",
+    "\n",
+    "    answer = []       \n",
+    "    pattern = re.compile(regexp)\n",
+    "    dates = pattern.finditer(text)        \n",
+    "            \n",
+    "    for date in dates:\n",
+    "        parts = re.split(r'[-./]', date.group())\n",
+    "        \n",
+    "        if len(parts[0]) == 4:\n",
+    "            y = parts[0]\n",
+    "            d = parts[2]\n",
+    "        else:\n",
+    "            d = parts[0]\n",
+    "            y = parts[2]\n",
+    "        m = parts[1]\n",
+    "            \n",
+    "        answer.append([date.span(),d, m, y])\n",
+    "\n",
+    "    return answer\n",
+    "\n",
+    "\n",
+    "find_dates(\"2024-04-21 awdad qwrwe 20/22/2021 negweg qwqwd %reset 1999/01/01\")"
   ]
  },
  {
@ -193,13 +400,174 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 369,
+   "id": "da509df1-75e4-4ae8-9b0d-5055d551b9dd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Defaulting to user installation because normal site-packages is not writeable\n",
+      "Collecting isoweek\n",
+      "  Downloading isoweek-1.3.3-py2.py3-none-any.whl.metadata (4.4 kB)\n",
+      "Downloading isoweek-1.3.3-py2.py3-none-any.whl (7.1 kB)\n",
+      "Installing collected packages: isoweek\n",
+      "Successfully installed isoweek-1.3.3\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
+   "source": [
+    "pip install isoweek"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 426,
   "id": "finished-essex",
   "metadata": {},
   "outputs": [],
   "source": [
+    "from isoweek import Week\n",
+    "from datetime import datetime\n",
+    "\n",
+    "def isoWeekNumber(date_string):\n",
+    "    date_object = datetime.strptime(date_string, \"%Y-%m-%d\")\n",
+    "    \n",
+    "    return str(date_object.isocalendar()[1])\n",
+    "\n",
+    "def change_substring(text, new, start, end):\n",
+    "    return text[:start] + new + text[end:]\n",
+    "\n",
+    "def EuropeDate(target_segmnet,dates): #YYYY-MM-DD\n",
+    "    for date in dates:\n",
+    "        target_segmnet = change_substring(target_segmnet,date[3]+\"-\"+date[2]+\"-\"+date[1],date[0][0],date[0][1])\n",
+    "    return target_segmnet   \n",
+    "\n",
+    "def USDate(target_segmnet,dates): #MM-DD-YYYY\n",
+    "    for date in dates:\n",
+    "        target_segmnet = change_substring(target_segmnet,date[2]+\"-\"+date[1]+\"-\"+date[3],date[0][0],date[0][1])\n",
+    "    return target_segmnet\n",
+    "\n",
+    "def tyreDOTDate(target_segmnet,dates): #WWYY (weekYear) \n",
+    "    index = 0\n",
+    "    for date in dates:\n",
+    "        d = date[3]+\"-\"+date[2]+\"-\"+date[1]\n",
+    "        target_segmnet = change_substring(target_segmnet,isoWeekNumber(d)+date[3][2:],date[0][0]-index*6,date[0][1]-index*6)\n",
+    "        index += 1\n",
+    "    return target_segmnet\n",
+    "\n",
+    "\n",
    "def correct_dates(source_segment, target_segment, date_format):\n",
-    "    return ''"
+    "    ss = find_dates(source_segment)\n",
+    "    ts = find_dates(target_segment)\n",
+    "    if len(ss) != len(ts):\n",
+    "        return 'Błąd! Niezgodna liczba dat!'\n",
+    "    \n",
+    "    for index in range(len(ss)):\n",
+    "        if ss[index][1] != ts[index][1]:\n",
+    "            print(\"Rozbieżny dzień w dacie!\")\n",
+    "            print(f\"source_segment: {ss[index]}\")\n",
+    "            print(f\"target_segment: {ts[index]}\")\n",
+    "            return\n",
+    "        \n",
+    "    if date_format == \"Europe\":\n",
+    "        return EuropeDate(target_segment,ts)\n",
+    "    elif date_format == \"US\":\n",
+    "        return USDate(target_segment,ts)\n",
+    "    elif date_format == \"tyre-dot\":\n",
+    "        return tyreDOTDate(target_segment,ts)\n",
+    "    else:\n",
+    "        return \"nierozpoznawalny format żądanej daty\"\n",
+    "\n",
+    "source=\"Moje urodziny: 06/07/1999\\n moje najbliższe imieniny: 2024/12/04\"\n",
+    "target=\"My birthday: 06/07/1999\\nmy forthcoming name day: 2024/12/04.\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 431,
+   "id": "05c351cb-c414-426a-9499-37886d943834",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Europe-format\n",
+      "---\n",
+      "My birthday: 1999-07-06\n",
+      "my forthcoming name day: 2024-12-04.\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Europe-format\\n---\")\n",
+    "print(correct_dates(source,target,\"Europe\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 432,
+   "id": "b0c34609-cbd1-44ec-9c3a-191f0400d1fc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "US-format\n",
+      "---\n",
+      "My birthday: 07-06-1999\n",
+      "my forthcoming name day: 12-04-2024.\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"US-format\\n---\")\n",
+    "print(correct_dates(source,target,\"US\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 434,
+   "id": "c0808108-65f9-4025-b6e7-ad06fc06a4df",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Tire-Dot-Format\n",
+      "---\n",
+      "My birthday: 2799\n",
+      "my forthcoming name day: 4924.\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Tire-Dot-Format\\n---\")\n",
+    "print(correct_dates(source,target,\"tyre-dot\"))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 433,
+   "id": "9be67593-0ada-423b-97a9-9b4dcb3e9fa1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Błąd! Niezgodna liczba dat!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Niezgodna liczba dat\n",
+    "print(correct_dates(target,\"My birthday: \\nmy forthcoming name day: 2024/12/04\",\"Europe\")) "
   ]
  },
  {
@ -244,13 +612,135 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 673,
   "id": "romance-judge",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "difference: 1.125\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "\"Hi, I'm <b>Krystian Osiński </b>and I'm 24 <i>years </i>old.\""
+      ]
+     },
+     "execution_count": 673,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
+    "import math\n",
+    "\n",
    "def transfer_tags(source_segment, target_segment):\n",
-    "    return ''"
+    "    regexp = r'(\\s|<[^>]+>)'\n",
+    "    tags = []\n",
+    "    result = \"\"\n",
+    "    index = 0\n",
+    "\n",
+    "    tokens = [token for token in re.split(regexp, source_segment) if token.strip() != \"\"]\n",
+    "    \n",
+    "    for token in tokens:\n",
+    "        if re.search(r'(<[^>]+>)', token):\n",
+    "            tags.append([index,token])\n",
+    "        if token.strip() != \"\":\n",
+    "            index += 1\n",
+    "            \n",
+    "    plain_s = re.sub(r'(<[^>]+>)', '', source_segment).split(\" \")\n",
+    "    \n",
+    "    t_tokens = target_segment.split(\" \")\n",
+    "   \n",
+    "    if len(plain_s) == len(t_tokens):\n",
+    "        for tag in tags:\n",
+    "            t_tokens.insert(tag[0],tag[1])\n",
+    "    else:\n",
+    "        difference = len(t_tokens)/len(plain_s)\n",
+    "        for tag in tags:\n",
+    "            t_tokens.insert(math.floor(tag[0]*difference),tag[1])\n",
+    "        print(f\"difference: {difference}\")\n",
+    "        \n",
+    "    \n",
+    "    for token in t_tokens:\n",
+    "        if len(token) > 1:\n",
+    "            if token[0] == \"<\" and token[-1] == \">\" and token[1] != \"/\":\n",
+    "                result += token\n",
+    "            elif token[1] == \"/\":\n",
+    "                result += token\n",
+    "            else:\n",
+    "                result += token  + \" \"\n",
+    "        else:\n",
+    "             result += token  + \" \"\n",
+    "    \n",
+    "    return result.strip()\n",
+    "\n",
+    "PLXML = \"Cześć, jestem <b>Krystian Osiński</b> i mam <i>24</i> lata.\"\n",
+    "EN = \"Hi, I'm Krystian Osiński and I'm 24 years old.\"\n",
+    "transfer_tags(PLXML,EN)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 674,
+   "id": "25c52d74-d893-4c64-a637-baede2f85059",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "difference: 1.1111111111111112\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "\"Hi, I'm <b>Krystian <i>Osiński </i></b>and I'm almost <i>24 years </i>old.\""
+      ]
+     },
+     "execution_count": 674,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "PLXML = \"Cześć, jestem <b>Krystian <i>Osiński</i></b> i mam <i>prawie 24</i> lata.\"\n",
+    "EN = \"Hi, I'm Krystian Osiński and I'm almost 24 years old.\"\n",
+    "transfer_tags(PLXML,EN)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 675,
+   "id": "3ffb97dd-e806-4b28-8b49-25c7e044758f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "difference: 1.125\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'I like <b>eating </b>donuts and drinking very hot <i>coffee </i>'"
+      ]
+     },
+     "execution_count": 675,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "PLXML = \"Lubię jeść <b>pączki</b> i pić bardzo gorącą <i>kawę</i>\"\n",
+    "EN = \"I like eating donuts and drinking very hot coffee\"\n",
+    "transfer_tags(PLXML,EN)"
   ]
  }
 ],
--- a/lab/lab_08.ipynb
+++ b/lab/lab_08.ipynb