[2024-04-14] lab 6, 7

This commit is contained in:
Patryk Bartkowiak 2024-04-14 19:33:04 +02:00
parent 5de69211e1
commit 7a6ac33f6e
1 changed files with 343 additions and 15 deletions

View File

@ -55,13 +55,52 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 7,
"id": "documented-hacker",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"[(10, 13), (17, 21)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"\n",
"def find_tags(text):\n",
"    \"\"\"Return the (start, end) span of every markup tag found in ``text``.\n",
"\n",
"    A tag is a run that starts with '<', ends with '>' and holds at least one\n",
"    character other than '>' in between. Spans are listed in order of appearance.\n",
"    \"\"\"\n",
"    spans = []\n",
"    for match in re.finditer(r'<[^>]+>', text):\n",
"        spans.append(match.span())\n",
"    return spans\n",
"\n",
"# Demo: the two tags '<b>' and '</b>' should be reported as (10, 13) and (17, 21)\n",
"text = 'This is a <b>bold</b> text'\n",
"find_tags(text)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1781331d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('<b>', '</b>')"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Slice the text with the spans reported by find_tags to confirm they cover exactly the tags\n",
"text[10:13], text[17:21]"
]
},
{
@ -74,13 +113,28 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 9,
"id": "unauthorized-study",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"(True, False, False)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def is_translatable(text):\n",
"    \"\"\"Tell whether ``text`` consists solely of translatable characters.\n",
"\n",
"    The whole string must be made of ASCII letters, spaces and the punctuation\n",
"    marks . , ! ? -- a digit, a non-ASCII character or an empty string yields False.\n",
"    \"\"\"\n",
"    allowed_only = re.fullmatch(r'[a-zA-Z .,!?]+', text)\n",
"    if allowed_only is None:\n",
"        return False\n",
"    return True\n",
"\n",
"# Demo: plain English text passes; text with digits or non-Latin characters is rejected\n",
"is_translatable('Hello, world!'), is_translatable('Hello, 123!'), is_translatable('你好,世界!')"
]
},
{
@ -93,13 +147,65 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 12,
"id": "beautiful-mathematics",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"[(12, 22), (28, 38), (42, 52), (56, 66), (70, 85)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def find_dates(text):\n",
"    \"\"\"Return the (start, end) spans of all dates in ``text``, in order of appearance.\n",
"\n",
"    Five notations are recognised: yyyy-mm-dd, yyyy/mm/dd, dd-mm-yyyy, dd/mm/yyyy\n",
"    and 'dd month yyyy' (month spelled with ASCII letters).\n",
"    \"\"\"\n",
"    date_patterns = (\n",
"        r'\\b\\d{4}-\\d{2}-\\d{2}\\b',        # yyyy-mm-dd\n",
"        r'\\b\\d{4}/\\d{2}/\\d{2}\\b',        # yyyy/mm/dd\n",
"        r'\\b\\d{2}-\\d{2}-\\d{4}\\b',        # dd-mm-yyyy\n",
"        r'\\b\\d{2}/\\d{2}/\\d{4}\\b',        # dd/mm/yyyy\n",
"        r'\\b\\d{2} [a-zA-Z]+ \\d{4}\\b',    # dd month yyyy\n",
"    )\n",
"    spans = [\n",
"        match.span()\n",
"        for pattern in date_patterns\n",
"        for match in re.finditer(pattern, text)\n",
"    ]\n",
"    # Matches are collected pattern by pattern; sort them back into text order so\n",
"    # that the n-th span is the n-th date a reader sees.\n",
"    return sorted(spans)\n",
"\n",
"# Demo: one date in each of the five supported notations\n",
"text = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020 or 01 January 2020'\n",
"find_dates(text)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "215a4cbd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2020-01-01\n",
"2020/01/01\n",
"01-01-2020\n",
"01/01/2020\n",
"01 January 2020\n"
]
}
],
"source": [
"# Print the piece of text covered by each span reported by find_dates\n",
"for start, end in [(12, 22), (28, 38), (42, 52), (56, 66), (70, 85)]:\n",
"    print(text[start:end])"
]
},
{
@ -125,13 +231,164 @@
},
{
"cell_type": "code",
"execution_count": 4,
"id": "finished-essex",
"execution_count": 37,
"id": "e37a24ad",
"metadata": {},
"outputs": [],
"source": [
"# Sample for the conversion demos: five notations, with day and month digits that differ (01 and 02)\n",
"text = 'The date is 2020-01-02, not 2020/01/02 or 02-01-2020 or 02/01/2020 or 02 January 2020'"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "4da1f53f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The date is 01/02/2020, not 01/02/2020 or 02/01/2020 or 02/01/2020 or 01/02/2020'"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from dateutil.parser import parse\n",
"\n",
"def change_data_to_US_format(text):\n",
"    \"\"\"Rewrite every date found in ``text`` as mm/dd/yyyy.\n",
"\n",
"    Dates are located with ``find_dates``; a date that cannot be parsed is left\n",
"    untouched.\n",
"    \"\"\"\n",
"    # Replace from the end of the text backwards: a replacement may be shorter than\n",
"    # the original date ('02 January 2020' -> '01/02/2020'), which would shift the\n",
"    # offsets of every span still waiting to be processed after it.\n",
"    for start, end in sorted(find_dates(text), reverse=True):\n",
"        date = text[start:end]\n",
"        # Only the yyyy-first notations begin with four digits; the other ones put\n",
"        # the day first, which dateutil would otherwise read as the month.\n",
"        day_first = not date[:4].isdigit()\n",
"        try:\n",
"            new_date = parse(date, dayfirst=day_first).strftime('%m/%d/%Y')\n",
"        except (ValueError, OverflowError):\n",
"            # Shaped like a date but not a valid one (e.g. month 13): keep it as is\n",
"            continue\n",
"        text = text[:start] + new_date + text[end:]\n",
"    return text\n",
"\n",
"# Demo on the `text` sample defined in an earlier cell\n",
"change_data_to_US_format(text)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "8a2bf3a3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The date is 02/01/2020, not 02/01/2020 or 01/02/2020 or 01/02/2020 or 02/01/2020'"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from dateutil.parser import parse\n",
"\n",
"def change_data_to_EU_format(text):\n",
"    \"\"\"Rewrite every date found in ``text`` as dd/mm/yyyy.\n",
"\n",
"    Dates are located with ``find_dates``; a date that cannot be parsed is left\n",
"    untouched.\n",
"    \"\"\"\n",
"    # Replace from the end of the text backwards: a replacement may be shorter than\n",
"    # the original date ('02 January 2020' -> '02/01/2020'), which would shift the\n",
"    # offsets of every span still waiting to be processed after it.\n",
"    for start, end in sorted(find_dates(text), reverse=True):\n",
"        date = text[start:end]\n",
"        # Only the yyyy-first notations begin with four digits; the other ones put\n",
"        # the day first, which dateutil would otherwise read as the month.\n",
"        day_first = not date[:4].isdigit()\n",
"        try:\n",
"            new_date = parse(date, dayfirst=day_first).strftime('%d/%m/%Y')\n",
"        except (ValueError, OverflowError):\n",
"            # Shaped like a date but not a valid one (e.g. month 13): keep it as is\n",
"            continue\n",
"        text = text[:start] + new_date + text[end:]\n",
"    return text\n",
"\n",
"# Demo on the `text` sample defined in an earlier cell\n",
"change_data_to_EU_format(text)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "e1c63075",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The date is 2020.01.02, not 2020.01.02 or 2020.02.01 or 2020.02.01 or 2020.01.02'"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from dateutil.parser import parse\n",
"\n",
"def change_data_to_digit_dot_format(text):\n",
"    \"\"\"Rewrite every date found in ``text`` as yyyy.mm.dd.\n",
"\n",
"    Dates are located with ``find_dates``; a date that cannot be parsed is left\n",
"    untouched.\n",
"    \"\"\"\n",
"    # Replace from the end of the text backwards: a replacement may be shorter than\n",
"    # the original date ('02 January 2020' -> '2020.01.02'), which would shift the\n",
"    # offsets of every span still waiting to be processed after it.\n",
"    for start, end in sorted(find_dates(text), reverse=True):\n",
"        date = text[start:end]\n",
"        # Only the yyyy-first notations begin with four digits; the other ones put\n",
"        # the day first, which dateutil would otherwise read as the month.\n",
"        day_first = not date[:4].isdigit()\n",
"        try:\n",
"            new_date = parse(date, dayfirst=day_first).strftime('%Y.%m.%d')\n",
"        except (ValueError, OverflowError):\n",
"            # Shaped like a date but not a valid one (e.g. month 13): keep it as is\n",
"            continue\n",
"        text = text[:start] + new_date + text[end:]\n",
"    return text\n",
"\n",
"# Demo on the `text` sample defined in an earlier cell\n",
"change_data_to_digit_dot_format(text)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "finished-essex",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Data jest 01/01/2020, a nie 01/01/2020 lub 01/01/2020 lub 01/01/2020'"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def correct_dates(source_segment, target_segment, date_format):\n",
"    \"\"\"Check that both segments carry the same dates and reformat the target's dates.\n",
"\n",
"    Args:\n",
"        source_segment: original text.\n",
"        target_segment: translated text whose dates are to be normalised.\n",
"        date_format: 'US', 'EU' or 'digit.dot', selecting the matching\n",
"            change_data_to_*_format function; any other value leaves the\n",
"            target segment unchanged.\n",
"\n",
"    Returns:\n",
"        The target segment with its dates rewritten in the requested format.\n",
"\n",
"    Raises:\n",
"        AssertionError: if the segments hold a different number of dates, or if\n",
"            the dates at the same position differ.\n",
"    \"\"\"\n",
"    # find_dates may list its spans grouped by notation; order them by position so\n",
"    # that the n-th date of the source is compared with the n-th date of the target.\n",
"    source_dates = sorted(find_dates(source_segment))\n",
"    target_dates = sorted(find_dates(target_segment))\n",
"\n",
"    assert len(source_dates) == len(target_dates), (\n",
"        f'Number of dates differs: {len(source_dates)} in source, {len(target_dates)} in target'\n",
"    )\n",
"\n",
"    for (source_start, source_end), (target_start, target_end) in zip(source_dates, target_dates):\n",
"        source_date = source_segment[source_start:source_end]\n",
"        target_date = target_segment[target_start:target_end]\n",
"        # Bring both to one common notation so only the value is compared, not the format\n",
"        normalised_source = change_data_to_US_format(source_date)\n",
"        normalised_target = change_data_to_US_format(target_date)\n",
"        assert normalised_source == normalised_target, f'Dates are different: {source_date} and {target_date}'\n",
"\n",
"    # Rewrite the dates of the target segment in the requested notation\n",
"    if date_format == 'US':\n",
"        target_segment = change_data_to_US_format(target_segment)\n",
"    elif date_format == 'EU':\n",
"        target_segment = change_data_to_EU_format(target_segment)\n",
"    elif date_format == 'digit.dot':\n",
"        target_segment = change_data_to_digit_dot_format(target_segment)\n",
"\n",
"    return target_segment\n",
"\n",
"# Demo: both segments hold four dates; the target uses a different mix of notations\n",
"source_segment = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020'\n",
"target_segment = \"Data jest 01/01/2020, a nie 2020-01-01 lub 01-01-2020 lub 01/01/2020\"\n",
"correct_dates(source_segment, target_segment, 'US')"
]
},
{
@ -176,13 +433,84 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 60,
"id": "romance-judge",
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"\n",
"def transfer_tags(source_segment, target_segment):\n",
"    \"\"\"Copy markup tags from the source segment onto the target segment.\n",
"\n",
"    Both segments are split on whitespace. The tags of the source token at index i\n",
"    go to the target token at index ceil(i * len(target) / len(source)), clamped to\n",
"    the last target token. Opening tags are put in front of that token and closing\n",
"    tags behind it, so a tag pair spanning several source tokens is carried over too.\n",
"\n",
"    Returns the target tokens joined with single spaces.\n",
"    \"\"\"\n",
"    source_tokens = source_segment.split()\n",
"    target_tokens = target_segment.split()\n",
"\n",
"    # Nothing to copy from, or nothing to attach to (this also avoids dividing by zero)\n",
"    if not source_tokens or not target_tokens:\n",
"        return ' '.join(target_tokens)\n",
"\n",
"    # Relative length of the target, used to project source positions onto it\n",
"    ratio = len(target_tokens) / len(source_tokens)\n",
"    last_target_index = len(target_tokens) - 1\n",
"\n",
"    for i, source_token in enumerate(source_tokens):\n",
"        opening_tags = re.findall(r'<[^/>][^>]*>', source_token)\n",
"        closing_tags = re.findall(r'</[^>]+>', source_token)\n",
"        if not opening_tags and not closing_tags:\n",
"            continue\n",
"\n",
"        target_index = min(math.ceil(i * ratio), last_target_index)\n",
"        target_tokens[target_index] = (\n",
"            ''.join(opening_tags) + target_tokens[target_index] + ''.join(closing_tags)\n",
"        )\n",
"\n",
"    return ' '.join(target_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "fd8858d8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'To jest <b>ważny</b> tekst'"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Demo (source and target have the same number of tokens): the tag pair lands on the 3rd target token\n",
"source_segment = 'This is <b>bold</b> text'\n",
"target_segment = 'To jest ważny tekst'\n",
"transfer_tags(source_segment, target_segment)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "de9e6298",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'To jest bardzo <b>ważny</b> tekst'"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Demo (target has one token more than the source): the tag pair moves to the 4th target token\n",
"source_segment = 'This is <b>bold</b> text'\n",
"target_segment = 'To jest bardzo ważny tekst'\n",
"transfer_tags(source_segment, target_segment)"
]
}
],
@ -205,7 +533,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.14"
},
"subtitle": "6,7. Preprocessing i postprocessing",
"title": "Komputerowe wspomaganie tłumaczenia",