forked from bfijalkowski/KWT-2024
[2024-04-14] lab 6, 7
This commit is contained in:
parent
5de69211e1
commit
7a6ac33f6e
@ -55,13 +55,52 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 7,
|
||||
"id": "documented-hacker",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[(10, 13), (17, 21)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"\n",
|
||||
"def find_tags(text):\n",
|
||||
" return []"
|
||||
" tags = re.finditer(r'<[^>]+>', text)\n",
|
||||
" return [tag.span() for tag in tags]\n",
|
||||
"\n",
|
||||
"# Test the function\n",
|
||||
"text = 'This is a <b>bold</b> text'\n",
|
||||
"find_tags(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "1781331d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"('<b>', '</b>')"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"text[10:13], text[17:21]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -74,13 +113,28 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 9,
|
||||
"id": "unauthorized-study",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(True, False, False)"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def is_translatable(text):\n",
|
||||
" return True"
|
||||
" # Text is translatable if it contains only letters, spaces, and punctuation\n",
|
||||
" return re.fullmatch(r'[a-zA-Z .,!?]+', text) is not None\n",
|
||||
"\n",
|
||||
"# Test the function\n",
|
||||
"is_translatable('Hello, world!'), is_translatable('Hello, 123!'), is_translatable('你好,世界!')"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -93,13 +147,65 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 12,
|
||||
"id": "beautiful-mathematics",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[(12, 22), (28, 38), (42, 52), (56, 66), (70, 85)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def find_dates(text):\n",
|
||||
" return []"
|
||||
" # Find all dates in 5 formats: yyyy-mm-dd, yyyy/mm/dd, dd-mm-yyyy, dd/mm/yyyy, dd month yyyy\n",
|
||||
" # yyyy-mm-dd\n",
|
||||
" dates = [date.span() for date in re.finditer(r'\\b\\d{4}-\\d{2}-\\d{2}\\b', text)]\n",
|
||||
" # yyyy/mm/dd\n",
|
||||
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{4}/\\d{2}/\\d{2}\\b', text)]\n",
|
||||
" # dd-mm-yyyy\n",
|
||||
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2}-\\d{2}-\\d{4}\\b', text)]\n",
|
||||
" # dd/mm/yyyy\n",
|
||||
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2}/\\d{2}/\\d{4}\\b', text)]\n",
|
||||
" # dd month yyyy\n",
|
||||
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2} [a-zA-Z]+ \\d{4}\\b', text)]\n",
|
||||
" return dates\n",
|
||||
"\n",
|
||||
"# Test the function\n",
|
||||
"text = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020 or 01 January 2020'\n",
|
||||
"find_dates(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "215a4cbd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2020-01-01\n",
|
||||
"2020/01/01\n",
|
||||
"01-01-2020\n",
|
||||
"01/01/2020\n",
|
||||
"01 January 2020\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(text[12:22])\n",
|
||||
"print(text[28:38])\n",
|
||||
"print(text[42:52])\n",
|
||||
"print(text[56:66])\n",
|
||||
"print(text[70:85])"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -125,13 +231,164 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "finished-essex",
|
||||
"execution_count": 37,
|
||||
"id": "e37a24ad",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = 'The date is 2020-01-02, not 2020/01/02 or 02-01-2020 or 02/01/2020 or 02 January 2020'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"id": "4da1f53f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'The date is 01/02/2020, not 01/02/2020 or 02/01/2020 or 02/01/2020 or 01/02/2020'"
|
||||
]
|
||||
},
|
||||
"execution_count": 38,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from dateutil.parser import parse\n",
|
||||
"\n",
|
||||
"def change_data_to_US_format(text):\n",
|
||||
" dates = find_dates(text)\n",
|
||||
"\n",
|
||||
" for start, end in dates:\n",
|
||||
" date = text[start:end]\n",
|
||||
" try:\n",
|
||||
" new_date = parse(date).strftime('%m/%d/%Y')\n",
|
||||
" text = text[:start] + new_date + text[end:]\n",
|
||||
" except:\n",
|
||||
" pass\n",
|
||||
" return text\n",
|
||||
"\n",
|
||||
"# Test the function\n",
|
||||
"change_data_to_US_format(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"id": "8a2bf3a3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'The date is 02/01/2020, not 02/01/2020 or 01/02/2020 or 01/02/2020 or 02/01/2020'"
|
||||
]
|
||||
},
|
||||
"execution_count": 39,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from dateutil.parser import parse\n",
|
||||
"\n",
|
||||
"def change_data_to_EU_format(text):\n",
|
||||
" dates = find_dates(text)\n",
|
||||
"\n",
|
||||
" for start, end in dates:\n",
|
||||
" date = text[start:end]\n",
|
||||
" try:\n",
|
||||
" new_date = parse(date).strftime('%d/%m/%Y')\n",
|
||||
" text = text[:start] + new_date + text[end:]\n",
|
||||
" except:\n",
|
||||
" pass\n",
|
||||
" return text\n",
|
||||
"\n",
|
||||
"# Test the function\n",
|
||||
"change_data_to_EU_format(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"id": "e1c63075",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'The date is 2020.01.02, not 2020.01.02 or 2020.02.01 or 2020.02.01 or 2020.01.02'"
|
||||
]
|
||||
},
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from dateutil.parser import parse\n",
|
||||
"\n",
|
||||
"def change_data_to_digit_dot_format(text):\n",
|
||||
" dates = find_dates(text)\n",
|
||||
"\n",
|
||||
" for start, end in dates:\n",
|
||||
" date = text[start:end]\n",
|
||||
" try:\n",
|
||||
" new_date = parse(date).strftime('%Y.%m.%d')\n",
|
||||
" text = text[:start] + new_date + text[end:]\n",
|
||||
" except:\n",
|
||||
" pass\n",
|
||||
" return text\n",
|
||||
"\n",
|
||||
"# Test the function\n",
|
||||
"change_data_to_digit_dot_format(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"id": "finished-essex",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Data jest 01/01/2020, a nie 01/01/2020 lub 01/01/2020 lub 01/01/2020'"
|
||||
]
|
||||
},
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def correct_dates(source_segment, target_segment, date_format):\n",
|
||||
" return ''"
|
||||
" # Check if number of dates in source and target segments are the same\n",
|
||||
" assert len(find_dates(source_segment)) == len(find_dates(target_segment))\n",
|
||||
"\n",
|
||||
" # Check if all dates are the same (ignore the format)\n",
|
||||
" source_dates = find_dates(source_segment)\n",
|
||||
" target_dates = find_dates(target_segment)\n",
|
||||
" for source_date, target_date in zip(source_dates, target_dates):\n",
|
||||
" assert change_data_to_US_format(source_segment[source_date[0]:source_date[1]]) == change_data_to_US_format(target_segment[target_date[0]:target_date[1]]), f\"Dates are different: {source_segment[source_date[0]:source_date[1]]} and {target_segment[target_date[0]:target_date[1]]}\"\n",
|
||||
"\n",
|
||||
" # Change the format of dates in the target segment\n",
|
||||
" if date_format == 'US':\n",
|
||||
" target_segment = change_data_to_US_format(target_segment)\n",
|
||||
" elif date_format == 'EU':\n",
|
||||
" target_segment = change_data_to_EU_format(target_segment)\n",
|
||||
" elif date_format == 'digit.dot':\n",
|
||||
" target_segment = change_data_to_digit_dot_format(target_segment)\n",
|
||||
"\n",
|
||||
" return target_segment\n",
|
||||
"\n",
|
||||
"# Test the function\n",
|
||||
"source_segment = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020'\n",
|
||||
"target_segment = \"Data jest 01/01/2020, a nie 2020-01-01 lub 01-01-2020 lub 01/01/2020\"\n",
|
||||
"correct_dates(source_segment, target_segment, 'US')"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -176,13 +433,84 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 60,
|
||||
"id": "romance-judge",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import math\n",
|
||||
"\n",
|
||||
"def transfer_tags(source_segment, target_segment):\n",
|
||||
" return ''"
|
||||
" # Split the segments into tokens\n",
|
||||
" source_tokens = source_segment.split()\n",
|
||||
" target_tokens = target_segment.split()\n",
|
||||
"\n",
|
||||
" # Calculate the ratio of the number of tokens in the target to the number of tokens in the source\n",
|
||||
" ratio = len(target_tokens) / len(source_tokens)\n",
|
||||
"\n",
|
||||
" # Assign tags to tokens in the target tokens - if the source token has a tag, assign it to the corresponding token in the target tokens\n",
|
||||
" for i, source_token in enumerate(source_tokens):\n",
|
||||
" if re.match(r'<[^>]+>', source_token):\n",
|
||||
" target_index = math.ceil(i * ratio)\n",
|
||||
"\n",
|
||||
" if target_index >= len(target_tokens):\n",
|
||||
" target_index = len(target_tokens) - 1\n",
|
||||
"\n",
|
||||
" # Assign start tag\n",
|
||||
" target_tokens[target_index] = re.findall(r'<[^>]+>', source_token)[0] + target_tokens[target_index]\n",
|
||||
"\n",
|
||||
" # Assign end tag\n",
|
||||
" target_tokens[target_index] = target_tokens[target_index] + re.findall(r'</[^>]+>', source_token)[0]\n",
|
||||
"\n",
|
||||
" return ' '.join(target_tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 61,
|
||||
"id": "fd8858d8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'To jest <b>ważny</b> tekst'"
|
||||
]
|
||||
},
|
||||
"execution_count": 61,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Test the function (same number of tokens)\n",
|
||||
"source_segment = 'This is <b>bold</b> text'\n",
|
||||
"target_segment = 'To jest ważny tekst'\n",
|
||||
"transfer_tags(source_segment, target_segment)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 62,
|
||||
"id": "de9e6298",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'To jest bardzo <b>ważny</b> tekst'"
|
||||
]
|
||||
},
|
||||
"execution_count": 62,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Test the function (different number of tokens)\n",
|
||||
"source_segment = 'This is <b>bold</b> text'\n",
|
||||
"target_segment = 'To jest bardzo ważny tekst'\n",
|
||||
"transfer_tags(source_segment, target_segment)"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -205,7 +533,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.10.14"
|
||||
},
|
||||
"subtitle": "6,7. Preprocessing i postprocessing",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
Loading…
Reference in New Issue
Block a user