diff --git a/lab/lab_06-07.ipynb b/lab/lab_06-07.ipynb
index 3b0a71a..b2eeeaa 100644
--- a/lab/lab_06-07.ipynb
+++ b/lab/lab_06-07.ipynb
@@ -55,13 +55,52 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 7,
"id": "documented-hacker",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[(10, 13), (17, 21)]"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
+ "import re\n",
+ "\n",
"def find_tags(text):\n",
- " return []"
+ "    \"\"\"Return the (start, end) offsets of every <...> tag in text, in order of appearance.\"\"\"\n",
+ "    spans = []\n",
+ "    for match in re.finditer(r'<[^>]+>', text):\n",
+ "        spans.append(match.span())\n",
+ "    return spans\n",
+ "\n",
+ "# Test the function\n",
+ "text = 'This is a bold text'\n",
+ "find_tags(text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "1781331d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "('', '')"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Slice the sample at the two spans reported by find_tags to see what was matched.\n",
+ "tuple(text[start:end] for start, end in [(10, 13), (17, 21)])"
]
},
{
@@ -74,13 +113,28 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 9,
"id": "unauthorized-study",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(True, False, False)"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"def is_translatable(text):\n",
- " return True"
+ "    \"\"\"Tell whether text consists solely of ASCII letters, spaces and the marks . , ! ?\n",
+ "\n",
+ "    An empty string is not translatable, and neither is text containing\n",
+ "    digits or non-ASCII characters.\n",
+ "    \"\"\"\n",
+ "    whole_match = re.fullmatch(r'[a-zA-Z .,!?]+', text)\n",
+ "    if whole_match is None:\n",
+ "        return False\n",
+ "    return True\n",
+ "\n",
+ "# Test the function\n",
+ "is_translatable('Hello, world!'), is_translatable('Hello, 123!'), is_translatable('你好,世界!')"
]
},
{
@@ -93,13 +147,65 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 12,
"id": "beautiful-mathematics",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[(12, 22), (28, 38), (42, 52), (56, 66), (70, 85)]"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"def find_dates(text):\n",
- " return []"
+ "    \"\"\"Locate dates written in any of five supported layouts.\n",
+ "\n",
+ "    Supported layouts: yyyy-mm-dd, yyyy/mm/dd, dd-mm-yyyy, dd/mm/yyyy and\n",
+ "    'dd month yyyy' (month spelled with ASCII letters). Day and month must\n",
+ "    be written with two digits.\n",
+ "\n",
+ "    Returns a list of (start, end) offsets into text, sorted by position so\n",
+ "    that callers can walk or pair the dates in reading order.\n",
+ "    \"\"\"\n",
+ "    date_patterns = (\n",
+ "        r'\\b\\d{4}-\\d{2}-\\d{2}\\b',  # yyyy-mm-dd\n",
+ "        r'\\b\\d{4}/\\d{2}/\\d{2}\\b',  # yyyy/mm/dd\n",
+ "        r'\\b\\d{2}-\\d{2}-\\d{4}\\b',  # dd-mm-yyyy\n",
+ "        r'\\b\\d{2}/\\d{2}/\\d{4}\\b',  # dd/mm/yyyy\n",
+ "        r'\\b\\d{2} [a-zA-Z]+ \\d{4}\\b',  # dd month yyyy\n",
+ "    )\n",
+ "    spans = [\n",
+ "        match.span()\n",
+ "        for pattern in date_patterns\n",
+ "        for match in re.finditer(pattern, text)\n",
+ "    ]\n",
+ "    # Matches are collected layout by layout; sort to restore document order.\n",
+ "    return sorted(spans)\n",
+ "\n",
+ "# Test the function\n",
+ "text = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020 or 01 January 2020'\n",
+ "find_dates(text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "215a4cbd",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2020-01-01\n",
+ "2020/01/01\n",
+ "01-01-2020\n",
+ "01/01/2020\n",
+ "01 January 2020\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the substring behind each span reported by find_dates.\n",
+ "for start, end in [(12, 22), (28, 38), (42, 52), (56, 66), (70, 85)]:\n",
+ "    print(text[start:end])"
]
},
{
@@ -125,13 +231,164 @@
},
{
"cell_type": "code",
- "execution_count": 4,
- "id": "finished-essex",
+ "execution_count": 37,
+ "id": "e37a24ad",
"metadata": {},
"outputs": [],
+ "source": [
+ "# Sample sentence with one date per layout recognised by find_dates.\n",
+ "text = (\n",
+ "    'The date is 2020-01-02, not 2020/01/02 or 02-01-2020 '\n",
+ "    'or 02/01/2020 or 02 January 2020'\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "4da1f53f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'The date is 01/02/2020, not 01/02/2020 or 02/01/2020 or 02/01/2020 or 01/02/2020'"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from dateutil.parser import parse\n",
+ "\n",
+ "def change_data_to_US_format(text):\n",
+ "    \"\"\"Rewrite every date located by find_dates as mm/dd/yyyy.\n",
+ "\n",
+ "    Layouts that start with a four-digit year are read as year-month-day;\n",
+ "    all other layouts are read day first. Matches that cannot be parsed as a\n",
+ "    date are left as written.\n",
+ "    \"\"\"\n",
+ "    # Work from the last match backwards: a replacement may change the length\n",
+ "    # of the text, which would invalidate the offsets of any later match.\n",
+ "    for start, end in sorted(find_dates(text), reverse=True):\n",
+ "        date = text[start:end]\n",
+ "        # dateutil assumes month-first for ambiguous dates, so dd-mm-yyyy and\n",
+ "        # dd/mm/yyyy need dayfirst=True; year-first layouts must not get it.\n",
+ "        day_first = not date[:4].isdigit()\n",
+ "        try:\n",
+ "            new_date = parse(date, dayfirst=day_first).strftime('%m/%d/%Y')\n",
+ "        except (ValueError, OverflowError):\n",
+ "            # Shaped like a date but not a real one (e.g. 99/99/2020).\n",
+ "            continue\n",
+ "        text = text[:start] + new_date + text[end:]\n",
+ "    return text\n",
+ "\n",
+ "# Test the function\n",
+ "change_data_to_US_format(text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "id": "8a2bf3a3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'The date is 02/01/2020, not 02/01/2020 or 01/02/2020 or 01/02/2020 or 02/01/2020'"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from dateutil.parser import parse\n",
+ "\n",
+ "def change_data_to_EU_format(text):\n",
+ "    \"\"\"Rewrite every date located by find_dates as dd/mm/yyyy.\n",
+ "\n",
+ "    Layouts that start with a four-digit year are read as year-month-day;\n",
+ "    all other layouts are read day first. Matches that cannot be parsed as a\n",
+ "    date are left as written.\n",
+ "    \"\"\"\n",
+ "    # Work from the last match backwards: a replacement may change the length\n",
+ "    # of the text, which would invalidate the offsets of any later match.\n",
+ "    for start, end in sorted(find_dates(text), reverse=True):\n",
+ "        date = text[start:end]\n",
+ "        # dateutil assumes month-first for ambiguous dates, so dd-mm-yyyy and\n",
+ "        # dd/mm/yyyy need dayfirst=True; year-first layouts must not get it.\n",
+ "        day_first = not date[:4].isdigit()\n",
+ "        try:\n",
+ "            new_date = parse(date, dayfirst=day_first).strftime('%d/%m/%Y')\n",
+ "        except (ValueError, OverflowError):\n",
+ "            # Shaped like a date but not a real one (e.g. 99/99/2020).\n",
+ "            continue\n",
+ "        text = text[:start] + new_date + text[end:]\n",
+ "    return text\n",
+ "\n",
+ "# Test the function\n",
+ "change_data_to_EU_format(text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "e1c63075",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'The date is 2020.01.02, not 2020.01.02 or 2020.02.01 or 2020.02.01 or 2020.01.02'"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from dateutil.parser import parse\n",
+ "\n",
+ "def change_data_to_digit_dot_format(text):\n",
+ "    \"\"\"Rewrite every date located by find_dates as yyyy.mm.dd.\n",
+ "\n",
+ "    Layouts that start with a four-digit year are read as year-month-day;\n",
+ "    all other layouts are read day first. Matches that cannot be parsed as a\n",
+ "    date are left as written.\n",
+ "    \"\"\"\n",
+ "    # Work from the last match backwards: a replacement may change the length\n",
+ "    # of the text, which would invalidate the offsets of any later match.\n",
+ "    for start, end in sorted(find_dates(text), reverse=True):\n",
+ "        date = text[start:end]\n",
+ "        # dateutil assumes month-first for ambiguous dates, so dd-mm-yyyy and\n",
+ "        # dd/mm/yyyy need dayfirst=True; year-first layouts must not get it.\n",
+ "        day_first = not date[:4].isdigit()\n",
+ "        try:\n",
+ "            new_date = parse(date, dayfirst=day_first).strftime('%Y.%m.%d')\n",
+ "        except (ValueError, OverflowError):\n",
+ "            # Shaped like a date but not a real one (e.g. 99/99/2020).\n",
+ "            continue\n",
+ "        text = text[:start] + new_date + text[end:]\n",
+ "    return text\n",
+ "\n",
+ "# Test the function\n",
+ "change_data_to_digit_dot_format(text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "id": "finished-essex",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'Data jest 01/01/2020, a nie 01/01/2020 lub 01/01/2020 lub 01/01/2020'"
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"def correct_dates(source_segment, target_segment, date_format):\n",
- " return ''"
+ "    \"\"\"Check that target dates match the source, then reformat the target dates.\n",
+ "\n",
+ "    The n-th date of the source is compared with the n-th date of the target,\n",
+ "    ignoring how each one is written.\n",
+ "\n",
+ "    date_format selects the output layout: 'US' (mm/dd/yyyy), 'EU'\n",
+ "    (dd/mm/yyyy) or 'digit.dot' (yyyy.mm.dd). Any other value returns the\n",
+ "    target segment unchanged.\n",
+ "\n",
+ "    Raises AssertionError when the segments hold a different number of dates\n",
+ "    or when a pair of dates differs.\n",
+ "    \"\"\"\n",
+ "    # Sort by offset so pairing follows reading order, whatever order\n",
+ "    # find_dates reports its matches in.\n",
+ "    source_dates = sorted(find_dates(source_segment))\n",
+ "    target_dates = sorted(find_dates(target_segment))\n",
+ "\n",
+ "    # Raised explicitly rather than via assert statements, which are removed\n",
+ "    # when Python runs with -O.\n",
+ "    if len(source_dates) != len(target_dates):\n",
+ "        raise AssertionError(\n",
+ "            f'Different number of dates: {len(source_dates)} in source, '\n",
+ "            f'{len(target_dates)} in target'\n",
+ "        )\n",
+ "\n",
+ "    for (s_start, s_end), (t_start, t_end) in zip(source_dates, target_dates):\n",
+ "        source_date = source_segment[s_start:s_end]\n",
+ "        target_date = target_segment[t_start:t_end]\n",
+ "        # Bring both to one layout so only the calendar date is compared.\n",
+ "        if change_data_to_US_format(source_date) != change_data_to_US_format(target_date):\n",
+ "            raise AssertionError(f'Dates are different: {source_date} and {target_date}')\n",
+ "\n",
+ "    # Change the format of dates in the target segment\n",
+ "    if date_format == 'US':\n",
+ "        target_segment = change_data_to_US_format(target_segment)\n",
+ "    elif date_format == 'EU':\n",
+ "        target_segment = change_data_to_EU_format(target_segment)\n",
+ "    elif date_format == 'digit.dot':\n",
+ "        target_segment = change_data_to_digit_dot_format(target_segment)\n",
+ "\n",
+ "    return target_segment\n",
+ "\n",
+ "# Test the function\n",
+ "source_segment = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020'\n",
+ "target_segment = \"Data jest 01/01/2020, a nie 2020-01-01 lub 01-01-2020 lub 01/01/2020\"\n",
+ "correct_dates(source_segment, target_segment, 'US')"
]
},
{
@@ -176,13 +433,84 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 60,
"id": "romance-judge",
"metadata": {},
"outputs": [],
"source": [
+ "import math\n",
+ "\n",
"def transfer_tags(source_segment, target_segment):\n",
- " return ''"
+ "    \"\"\"Copy markup tags from the source segment onto the target segment.\n",
+ "\n",
+ "    Both segments are split on whitespace. A tagged source token at index i\n",
+ "    is mapped to target index ceil(i * len(target) / len(source)), clamped to\n",
+ "    the last target token. Tags at the start of the source token are put in\n",
+ "    front of the target token; any tags after them are put behind it.\n",
+ "\n",
+ "    Tags that contain whitespace (e.g. with attributes) are split across\n",
+ "    tokens and are not recognised. Returns the target tokens joined by single\n",
+ "    spaces.\n",
+ "    \"\"\"\n",
+ "    # Split the segments into tokens\n",
+ "    source_tokens = source_segment.split()\n",
+ "    target_tokens = target_segment.split()\n",
+ "\n",
+ "    # Nothing to map from or onto; this also avoids dividing by zero below.\n",
+ "    if not source_tokens or not target_tokens:\n",
+ "        return ' '.join(target_tokens)\n",
+ "\n",
+ "    # Number of target tokens per source token, used to scale positions.\n",
+ "    ratio = len(target_tokens) / len(source_tokens)\n",
+ "    last_index = len(target_tokens) - 1\n",
+ "\n",
+ "    for i, source_token in enumerate(source_tokens):\n",
+ "        # Tags before any text open the token; every tag after that closes it.\n",
+ "        leading = re.match(r'(?:<[^>]+>)*', source_token).group()\n",
+ "        trailing = ''.join(re.findall(r'<[^>]+>', source_token[len(leading):]))\n",
+ "        if not leading and not trailing:\n",
+ "            continue\n",
+ "\n",
+ "        target_index = min(math.ceil(i * ratio), last_index)\n",
+ "        target_tokens[target_index] = leading + target_tokens[target_index] + trailing\n",
+ "\n",
+ "    return ' '.join(target_tokens)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "fd8858d8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'To jest ważny tekst'"
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Test the function (same number of tokens)\n",
+ "source_segment = 'This is bold text'\n",
+ "target_segment = 'To jest ważny tekst'\n",
+ "transfer_tags(source_segment, target_segment)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "id": "de9e6298",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'To jest bardzo ważny tekst'"
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Test the function (different number of tokens)\n",
+ "source_segment = 'This is bold text'\n",
+ "target_segment = 'To jest bardzo ważny tekst'\n",
+ "transfer_tags(source_segment, target_segment)"
]
}
],
@@ -205,7 +533,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.10"
+ "version": "3.10.14"
},
"subtitle": "6,7. Preprocessing i postprocessing",
"title": "Komputerowe wspomaganie tłumaczenia",