import re

# A tag is "<", one or more characters other than ">", then ">".
_TAG_PATTERN = re.compile(r'<[^>]+>')
# One or more ASCII letters, spaces, or the punctuation marks . , ! ?
_TRANSLATABLE_PATTERN = re.compile(r'[a-zA-Z .,!?]+')


def find_tags(text):
    """Return the (start, end) span of every tag in *text*.

    A tag is any ``<...>`` sequence with at least one character between the
    angle brackets. Spans are half-open offsets into *text*, listed in the
    order the tags occur; the list is empty when *text* has no tags.
    """
    return [match.span() for match in _TAG_PATTERN.finditer(text)]


# Test the function
# The inline tags are restored in this literal: the recorded result
# [(10, 13), (17, 21)] needs a 3-character tag at offset 10 and a
# 4-character tag at offset 17, which the tag-less 19-character string could
# not produce (presumably <b>...</b> -- TODO confirm the exact tag name).
text = 'This is a <b>bold</b> text'
find_tags(text)

text[10:13], text[17:21]


def is_translatable(text):
    """Return True when *text* holds only letters, spaces and punctuation.

    The whole string must consist of ASCII letters, spaces and the marks
    ``. , ! ?``; an empty string is not translatable.

    NOTE(review): letters outside a-z/A-Z (for example Polish diacritics)
    make the result False -- confirm that this restriction is intended.
    """
    return _TRANSLATABLE_PATTERN.fullmatch(text) is not None


# Test the function
is_translatable('Hello, world!'), is_translatable('Hello, 123!'), is_translatable('你好,世界!')
"metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def find_dates(text):\n", - " return []" + " # Find all dates in 5 formats: yyyy-mm-dd, yyyy/mm/dd, dd-mm-yyyy, dd/mm/yyyy, dd month yyyy\n", + " # yyyy-mm-dd\n", + " dates = [date.span() for date in re.finditer(r'\\b\\d{4}-\\d{2}-\\d{2}\\b', text)]\n", + " # yyyy/mm/dd\n", + " dates = dates + [date.span() for date in re.finditer(r'\\b\\d{4}/\\d{2}/\\d{2}\\b', text)]\n", + " # dd-mm-yyyy\n", + " dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2}-\\d{2}-\\d{4}\\b', text)]\n", + " # dd/mm/yyyy\n", + " dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2}/\\d{2}/\\d{4}\\b', text)]\n", + " # dd month yyyy\n", + " dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2} [a-zA-Z]+ \\d{4}\\b', text)]\n", + " return dates\n", + "\n", + "# Test the function\n", + "text = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020 or 01 January 2020'\n", + "find_dates(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "215a4cbd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2020-01-01\n", + "2020/01/01\n", + "01-01-2020\n", + "01/01/2020\n", + "01 January 2020\n" + ] + } + ], + "source": [ + "print(text[12:22])\n", + "print(text[28:38])\n", + "print(text[42:52])\n", + "print(text[56:66])\n", + "print(text[70:85])" ] }, { @@ -125,13 +231,164 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "finished-essex", + "execution_count": 37, + "id": "e37a24ad", "metadata": {}, "outputs": [], + "source": [ + "text = 'The date is 2020-01-02, not 2020/01/02 or 02-01-2020 or 02/01/2020 or 02 January 2020'" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "4da1f53f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'The date is 01/02/2020, not 01/02/2020 or 02/01/2020 or 02/01/2020 or 01/02/2020'" + ] + }, + "execution_count": 38, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "from dateutil.parser import parse\n", + "\n", + "def change_data_to_US_format(text):\n", + " dates = find_dates(text)\n", + "\n", + " for start, end in dates:\n", + " date = text[start:end]\n", + " try:\n", + " new_date = parse(date).strftime('%m/%d/%Y')\n", + " text = text[:start] + new_date + text[end:]\n", + " except:\n", + " pass\n", + " return text\n", + "\n", + "# Test the function\n", + "change_data_to_US_format(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "8a2bf3a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'The date is 02/01/2020, not 02/01/2020 or 01/02/2020 or 01/02/2020 or 02/01/2020'" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from dateutil.parser import parse\n", + "\n", + "def change_data_to_EU_format(text):\n", + " dates = find_dates(text)\n", + "\n", + " for start, end in dates:\n", + " date = text[start:end]\n", + " try:\n", + " new_date = parse(date).strftime('%d/%m/%Y')\n", + " text = text[:start] + new_date + text[end:]\n", + " except:\n", + " pass\n", + " return text\n", + "\n", + "# Test the function\n", + "change_data_to_EU_format(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "e1c63075", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'The date is 2020.01.02, not 2020.01.02 or 2020.02.01 or 2020.02.01 or 2020.01.02'" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from dateutil.parser import parse\n", + "\n", + "def change_data_to_digit_dot_format(text):\n", + " dates = find_dates(text)\n", + "\n", + " for start, end in dates:\n", + " date = text[start:end]\n", + " try:\n", + " new_date = parse(date).strftime('%Y.%m.%d')\n", + " text = text[:start] + new_date + text[end:]\n", + " except:\n", + " pass\n", + " return text\n", + "\n", + "# Test 
# Test the function
change_data_to_digit_dot_format(text)


def correct_dates(source_segment, target_segment, date_format):
    """Check the dates of a translation and normalise their notation.

    The dates of *source_segment* and *target_segment* are compared pairwise
    in order of appearance, ignoring notation. When they agree, the target
    segment is returned with its dates rewritten in *date_format*.

    Args:
        source_segment: Text in the source language.
        target_segment: Its translation.
        date_format: ``'US'`` (mm/dd/yyyy), ``'EU'`` (dd/mm/yyyy) or
            ``'digit.dot'`` (yyyy.mm.dd). Any other value leaves the target
            segment unchanged.

    Returns:
        The target segment with its dates converted.

    Raises:
        AssertionError: If the segments contain a different number of dates
            or a pair of dates does not denote the same day.
    """
    # Sorted so that the n-th date of one segment is compared with the n-th
    # date of the other, whatever notation each of them uses.
    source_spans = sorted(find_dates(source_segment))
    target_spans = sorted(find_dates(target_segment))

    # Raised explicitly rather than with `assert`, which is stripped when
    # Python runs with -O. The exception type is kept for existing callers.
    if len(source_spans) != len(target_spans):
        raise AssertionError(
            f"Different number of dates: {len(source_spans)} in source, "
            f"{len(target_spans)} in target"
        )

    for (source_start, source_end), (target_start, target_end) in zip(source_spans, target_spans):
        source_date = source_segment[source_start:source_end]
        target_date = target_segment[target_start:target_end]
        # Both dates are brought to one notation so only the day is compared.
        if change_data_to_US_format(source_date) != change_data_to_US_format(target_date):
            raise AssertionError(f"Dates are different: {source_date} and {target_date}")

    # Change the format of dates in the target segment
    converters = {
        'US': change_data_to_US_format,
        'EU': change_data_to_EU_format,
        'digit.dot': change_data_to_digit_dot_format,
    }
    convert = converters.get(date_format)
    if convert is None:
        return target_segment
    return convert(target_segment)


# Test the function
source_segment = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020'
target_segment = "Data jest 01/01/2020, a nie 2020-01-01 lub 01-01-2020 lub 01/01/2020"
correct_dates(source_segment, target_segment, 'US')
"metadata": {}, "outputs": [], "source": [ + "import math\n", + "\n", "def transfer_tags(source_segment, target_segment):\n", - " return ''" + " # Split the segments into tokens\n", + " source_tokens = source_segment.split()\n", + " target_tokens = target_segment.split()\n", + "\n", + " # Calculate the ratio of the number of tokens in the target to the number of tokens in the source\n", + " ratio = len(target_tokens) / len(source_tokens)\n", + "\n", + " # Assign tags to tokens in the target tokens - if the source token has a tag, assign it to the corresponding token in the target tokens\n", + " for i, source_token in enumerate(source_tokens):\n", + " if re.match(r'<[^>]+>', source_token):\n", + " target_index = math.ceil(i * ratio)\n", + "\n", + " if target_index >= len(target_tokens):\n", + " target_index = len(target_tokens) - 1\n", + "\n", + " # Assign start tag\n", + " target_tokens[target_index] = re.findall(r'<[^>]+>', source_token)[0] + target_tokens[target_index]\n", + "\n", + " # Assign end tag\n", + " target_tokens[target_index] = target_tokens[target_index] + re.findall(r']+>', source_token)[0]\n", + "\n", + " return ' '.join(target_tokens)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "fd8858d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'To jest ważny tekst'" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Test the function (same number of tokens)\n", + "source_segment = 'This is bold text'\n", + "target_segment = 'To jest ważny tekst'\n", + "transfer_tags(source_segment, target_segment)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "de9e6298", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'To jest bardzo ważny tekst'" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Test the function (different number of tokens)\n", + 
"source_segment = 'This is bold text'\n", + "target_segment = 'To jest bardzo ważny tekst'\n", + "transfer_tags(source_segment, target_segment)" ] } ], @@ -205,7 +533,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.14" }, "subtitle": "6,7. Preprocessing i postprocessing", "title": "Komputerowe wspomaganie tłumaczenia",