[2024-04-14] lab 6, 7

2024-04-14 19:33:04 +02:00 · 2024-04-14 19:33:04 +02:00 · 7a6ac33f6e
commit 7a6ac33f6e
parent 5de69211e1
1 changed files with 343 additions and 15 deletions
--- a/lab/lab_06-07.ipynb
+++ b/lab/lab_06-07.ipynb
@ -55,13 +55,52 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 7,
   "id": "documented-hacker",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[(10, 13), (17, 21)]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
+    "import re\n",
+    "\n",
    "def find_tags(text):\n",
-    "    return []"
+    "    tags = re.finditer(r'<[^>]+>', text)\n",
+    "    return [tag.span() for tag in tags]\n",
+    "\n",
+    "# Test the function\n",
+    "text = 'This is a <b>bold</b> text'\n",
+    "find_tags(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "1781331d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "('<b>', '</b>')"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "text[10:13], text[17:21]"
   ]
  },
  {
@ -74,13 +113,28 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 9,
   "id": "unauthorized-study",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(True, False, False)"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "def is_translatable(text):\n",
-    "    return True"
+    "    # Text is translatable only if it consists entirely of ASCII letters, spaces, and the punctuation marks . , ! ?\n",
+    "    return re.fullmatch(r'[a-zA-Z .,!?]+', text) is not None\n",
+    "\n",
+    "# Test the function\n",
+    "is_translatable('Hello, world!'), is_translatable('Hello, 123!'), is_translatable('你好，世界！')"
   ]
  },
  {
@ -93,13 +147,65 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 12,
   "id": "beautiful-mathematics",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[(12, 22), (28, 38), (42, 52), (56, 66), (70, 85)]"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "def find_dates(text):\n",
-    "    return []"
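+    "    # Find the (start, end) spans of all dates in 5 formats: yyyy-mm-dd, yyyy/mm/dd, dd-mm-yyyy, dd/mm/yyyy, dd month yyyy (spans are grouped by format in this order, not sorted by position in the text)\n",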
+    "    # yyyy-mm-dd\n",
+    "    dates = [date.span() for date in re.finditer(r'\\b\\d{4}-\\d{2}-\\d{2}\\b', text)]\n",
+    "    # yyyy/mm/dd\n",
+    "    dates = dates + [date.span() for date in re.finditer(r'\\b\\d{4}/\\d{2}/\\d{2}\\b', text)]\n",
+    "    # dd-mm-yyyy\n",
+    "    dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2}-\\d{2}-\\d{4}\\b', text)]\n",
+    "    # dd/mm/yyyy\n",
+    "    dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2}/\\d{2}/\\d{4}\\b', text)]\n",
+    "    # dd month yyyy\n",
+    "    dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2} [a-zA-Z]+ \\d{4}\\b', text)]\n",
+    "    return dates\n",
+    "\n",
+    "# Test the function\n",
+    "text = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020 or 01 January 2020'\n",
+    "find_dates(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "215a4cbd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2020-01-01\n",
+      "2020/01/01\n",
+      "01-01-2020\n",
+      "01/01/2020\n",
+      "01 January 2020\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(text[12:22])\n",
+    "print(text[28:38])\n",
+    "print(text[42:52])\n",
+    "print(text[56:66])\n",
+    "print(text[70:85])"
   ]
  },
  {
@ -125,13 +231,164 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
-   "id": "finished-essex",
+   "execution_count": 37,
+   "id": "e37a24ad",
   "metadata": {},
   "outputs": [],
+   "source": [
+    "text = 'The date is 2020-01-02, not 2020/01/02 or 02-01-2020 or 02/01/2020 or 02 January 2020'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "id": "4da1f53f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'The date is 01/02/2020, not 01/02/2020 or 02/01/2020 or 02/01/2020 or 01/02/2020'"
+      ]
+     },
+     "execution_count": 38,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from dateutil.parser import parse\n",
+    "\n",
+    "def change_data_to_US_format(text):\n",
+    "    dates = find_dates(text)\n",
+    "\n",
+    "    for start, end in dates:\n",
+    "        date = text[start:end]\n",
+    "        try:\n",
+    "            new_date = parse(date).strftime('%m/%d/%Y')\n",
+    "            text = text[:start] + new_date + text[end:]\n",
+    "        except:\n",
+    "            pass\n",
+    "    return text\n",
+    "\n",
+    "# Test the function\n",
+    "change_data_to_US_format(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "8a2bf3a3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'The date is 02/01/2020, not 02/01/2020 or 01/02/2020 or 01/02/2020 or 02/01/2020'"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from dateutil.parser import parse\n",
+    "\n",
+    "def change_data_to_EU_format(text):\n",
+    "    dates = find_dates(text)\n",
+    "\n",
+    "    for start, end in dates:\n",
+    "        date = text[start:end]\n",
+    "        try:\n",
+    "            new_date = parse(date).strftime('%d/%m/%Y')\n",
+    "            text = text[:start] + new_date + text[end:]\n",
+    "        except:\n",
+    "            pass\n",
+    "    return text\n",
+    "\n",
+    "# Test the function\n",
+    "change_data_to_EU_format(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "e1c63075",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'The date is 2020.01.02, not 2020.01.02 or 2020.02.01 or 2020.02.01 or 2020.01.02'"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from dateutil.parser import parse\n",
+    "\n",
+    "def change_data_to_digit_dot_format(text):\n",
+    "    dates = find_dates(text)\n",
+    "\n",
+    "    for start, end in dates:\n",
+    "        date = text[start:end]\n",
+    "        try:\n",
+    "            new_date = parse(date).strftime('%Y.%m.%d')\n",
+    "            text = text[:start] + new_date + text[end:]\n",
+    "        except:\n",
+    "            pass\n",
+    "    return text\n",
+    "\n",
+    "# Test the function\n",
+    "change_data_to_digit_dot_format(text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "finished-essex",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Data jest 01/01/2020, a nie 01/01/2020 lub 01/01/2020 lub 01/01/2020'"
+      ]
+     },
+     "execution_count": 45,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "def correct_dates(source_segment, target_segment, date_format):\n",
-    "    return ''"
+    "    # Check that the source and target segments contain the same number of dates\n",
+    "    assert len(find_dates(source_segment)) == len(find_dates(target_segment))\n",
+    "\n",
+    "    # Check that each pair of corresponding dates matches once both are normalized to the US format (i.e. ignoring the original format)\n",
+    "    source_dates = find_dates(source_segment)\n",
+    "    target_dates = find_dates(target_segment)\n",
+    "    for source_date, target_date in zip(source_dates, target_dates):\n",
+    "        assert change_data_to_US_format(source_segment[source_date[0]:source_date[1]]) == change_data_to_US_format(target_segment[target_date[0]:target_date[1]]), f\"Dates are different: {source_segment[source_date[0]:source_date[1]]} and {target_segment[target_date[0]:target_date[1]]}\"\n",
+    "\n",
+    "    # Change the format of dates in the target segment\n",
+    "    if date_format == 'US':\n",
+    "        target_segment = change_data_to_US_format(target_segment)\n",
+    "    elif date_format == 'EU':\n",
+    "        target_segment = change_data_to_EU_format(target_segment)\n",
+    "    elif date_format == 'digit.dot':\n",
+    "        target_segment = change_data_to_digit_dot_format(target_segment)\n",
+    "\n",
+    "    return target_segment\n",
+    "\n",
+    "# Test the function\n",
+    "source_segment = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020'\n",
+    "target_segment = \"Data jest 01/01/2020, a nie 2020-01-01 lub 01-01-2020 lub 01/01/2020\"\n",
+    "correct_dates(source_segment, target_segment, 'US')"
   ]
  },
  {
@ -176,13 +433,84 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 60,
   "id": "romance-judge",
   "metadata": {},
   "outputs": [],
   "source": [
+    "import math\n",
+    "\n",
    "def transfer_tags(source_segment, target_segment):\n",
-    "    return ''"
+    "    # Split the segments into tokens\n",
+    "    source_tokens = source_segment.split()\n",
+    "    target_tokens = target_segment.split()\n",
+    "\n",
+    "    # Calculate the ratio of the number of tokens in the target to the number of tokens in the source\n",
+    "    ratio = len(target_tokens) / len(source_tokens)\n",
+    "\n",
+    "    # Transfer tags: when a source token starts with a tag, attach its tags to the target token at the proportionally corresponding position (source index scaled by the ratio, rounded up)\n",
+    "    for i, source_token in enumerate(source_tokens):\n",
+    "        if re.match(r'<[^>]+>', source_token):\n",
+    "            target_index = math.ceil(i * ratio)\n",
+    "\n",
+    "            if target_index >= len(target_tokens):\n",
+    "                target_index = len(target_tokens) - 1\n",
+    "\n",
+    "            # Prepend the first tag found in the source token\n",
+    "            target_tokens[target_index] = re.findall(r'<[^>]+>', source_token)[0] + target_tokens[target_index]\n",
+    "\n",
+    "            # Append the first closing tag found in the source token\n",
+    "            target_tokens[target_index] = target_tokens[target_index] + re.findall(r'</[^>]+>', source_token)[0]\n",
+    "\n",
+    "    return ' '.join(target_tokens)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "id": "fd8858d8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'To jest <b>ważny</b> tekst'"
+      ]
+     },
+     "execution_count": 61,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Test the function (same number of tokens)\n",
+    "source_segment = 'This is <b>bold</b> text'\n",
+    "target_segment = 'To jest ważny tekst'\n",
+    "transfer_tags(source_segment, target_segment)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "id": "de9e6298",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'To jest bardzo <b>ważny</b> tekst'"
+      ]
+     },
+     "execution_count": 62,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Test the function (different number of tokens)\n",
+    "source_segment = 'This is <b>bold</b> text'\n",
+    "target_segment = 'To jest bardzo ważny tekst'\n",
+    "transfer_tags(source_segment, target_segment)"
   ]
  }
 ],
@ -205,7 +533,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.14"
  },
  "subtitle": "6,7. Preprocessing i postprocessing",
  "title": "Komputerowe wspomaganie tłumaczenia",