forked from bfijalkowski/KWT-2024
lab 2
This commit is contained in:
parent
d32188878d
commit
e02ff5ab39
File diff suppressed because one or more lines are too long
@ -60,8 +60,35 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def find_tags(text):\n",
|
"import re\n",
|
||||||
" return []"
|
"\n",
|
||||||
|
"def find_tags(string):\n",
|
||||||
|
" pattern = r'<[^>]+>'\n",
|
||||||
|
" matches = re.finditer(pattern, string)\n",
|
||||||
|
" tag_indexes = [(match.start(), match.end()) for match in matches]\n",
|
||||||
|
" return tag_indexes"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "3dc08368",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['<tag1>', '</tag1>', '<tag2>', '</tag2>']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"string = \"<tag1>ADIOS</tag1><tag2>OLA</tag2>\"\n",
|
||||||
|
"[ string[out[0]:out[1]] for out in find_tags(string)]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -74,13 +101,73 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 20,
|
||||||
"id": "unauthorized-study",
|
"id": "unauthorized-study",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"False\n",
|
||||||
|
"False\n",
|
||||||
|
"False\n",
|
||||||
|
"True\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
|
"import re\n",
|
||||||
|
"import string\n",
|
||||||
|
"\n",
|
||||||
"def is_translatable(text):\n",
|
"def is_translatable(text):\n",
|
||||||
" return True"
|
" return bool(re.match(r'^[^0-9IVXLCDM\\s' + re.escape(string.punctuation) + ']+$', text))\n",
|
||||||
|
"\n",
|
||||||
|
"text1 = \"This is a sample text.\"\n",
|
||||||
|
"text2 = \"2024.\"\n",
|
||||||
|
"text3 = \"Это пример текста.\"\n",
|
||||||
|
"text4 = \"おはよう\"\n",
|
||||||
|
"\n",
|
||||||
|
"print(is_translatable(text1)) \n",
|
||||||
|
"print(is_translatable(text2))\n",
|
||||||
|
"print(is_translatable(text3))\n",
|
||||||
|
"print(is_translatable(text4))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 32,
|
||||||
|
"id": "ae92a18c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\"This is a sample text.\" is translatable?\n",
|
||||||
|
"Yes\n",
|
||||||
|
"\"2024.\" is translatable?\n",
|
||||||
|
"No\n",
|
||||||
|
"\"Это пример текста.\" is translatable?\n",
|
||||||
|
"Yes\n",
|
||||||
|
"\"おはよう\" is translatable?\n",
|
||||||
|
"Yes\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import re\n",
|
||||||
|
"import string\n",
|
||||||
|
"\n",
|
||||||
|
"def is_translatable(text):\n",
|
||||||
|
" return bool(re.match(r'^[^\\d]+$|^\\s+$', text))\n",
|
||||||
|
"\n",
|
||||||
|
"examples = [\"This is a sample text.\", \"2024.\", \n",
|
||||||
|
" \"Это пример текста.\", \"おはよう\"]\n",
|
||||||
|
"\n",
|
||||||
|
"for ex in examples:\n",
|
||||||
|
" response = 'Yes' if is_translatable(ex) else 'No'\n",
|
||||||
|
" print(f'\"{ex}\" is translatable?\\n{response}')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -93,13 +180,74 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 44,
|
||||||
"id": "beautiful-mathematics",
|
"id": "beautiful-mathematics",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"('03/25/2022', 3, 25, 2022)\tday:\t3\tmonth:\t25\tyear:\t2022\n",
|
||||||
|
"('25-12-2023', 25, 12, 2023)\tday:\t25\tmonth:\t12\tyear:\t2023\n",
|
||||||
|
"('09/30/2025', 9, 30, 2025)\tday:\t9\tmonth:\t30\tyear:\t2025\n",
|
||||||
|
"('03/25/2022', 25, 3, 2022)\tday:\t25\tmonth:\t3\tyear:\t2022\n",
|
||||||
|
"('09/30/2025', 30, 9, 2025)\tday:\t30\tmonth:\t9\tyear:\t2025\n",
|
||||||
|
"('12 March 2024', 12, 3, 2024)\tday:\t12\tmonth:\t3\tyear:\t2024\n",
|
||||||
|
"('25-12-2023', 25, 12, 2023)\tday:\t25\tmonth:\t12\tyear:\t2023\n",
|
||||||
|
"('12 March 2024', 12, 3, 2024)\tday:\t12\tmonth:\t3\tyear:\t2024\n",
|
||||||
|
"('15 September, 2026', 15, 9, 2026)\tday:\t15\tmonth:\t9\tyear:\t2026\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
|
"import re\n",
|
||||||
|
"\n",
|
||||||
"def find_dates(text):\n",
|
"def find_dates(text):\n",
|
||||||
" return []"
|
" date_formats = [\n",
|
||||||
|
" (r'(\\d{1,2})[-/](\\d{1,2})[-/](\\d{2,4})', 'day_first'),\n",
|
||||||
|
" (r'(\\d{1,2})[/](\\d{1,2})[/](\\d{2,4})', 'month_first'),\n",
|
||||||
|
" (r'(\\d{1,2}) (\\w{3,9}) (\\d{4})', 'day_first'),\n",
|
||||||
|
" (r'(\\d{1,2})-(\\d{1,2})-(\\d{2,4})', 'day_first'),\n",
|
||||||
|
" (r'(\\d{1,2}) (\\w{3,9}),? (\\d{4})', 'day_first')\n",
|
||||||
|
" ]\n",
|
||||||
|
"\n",
|
||||||
|
" months = {\n",
|
||||||
|
" 'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,\n",
|
||||||
|
" 'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" dates_found = []\n",
|
||||||
|
" for date_pattern, format_type in date_formats:\n",
|
||||||
|
" matches = re.finditer(date_pattern, text)\n",
|
||||||
|
" for match in matches:\n",
|
||||||
|
" groups = match.groups()\n",
|
||||||
|
" if len(groups) == 3:\n",
|
||||||
|
" if format_type == 'day_first':\n",
|
||||||
|
" day, month, year = groups\n",
|
||||||
|
" else:\n",
|
||||||
|
" month, day, year = groups\n",
|
||||||
|
" if month.isdigit():\n",
|
||||||
|
" month = int(month)\n",
|
||||||
|
" else:\n",
|
||||||
|
" month = months[month]\n",
|
||||||
|
" dates_found.append((match.group(), int(day), month, int(year)))\n",
|
||||||
|
" elif len(groups) == 4:\n",
|
||||||
|
" if format_type == 'day_first':\n",
|
||||||
|
" day, month, _, year = groups\n",
|
||||||
|
" else:\n",
|
||||||
|
" month, day, _, year = groups\n",
|
||||||
|
" if month.isdigit():\n",
|
||||||
|
" month = int(month)\n",
|
||||||
|
" else:\n",
|
||||||
|
" month = months[month]\n",
|
||||||
|
" dates_found.append((match.group(), int(day), month, int(year)))\n",
|
||||||
|
" return dates_found\n",
|
||||||
|
"\n",
|
||||||
|
"text = \"Here are some dates: 03/25/2022, 25-12-2023, 12 March 2024, 09/30/2025, 15 September, 2026\"\n",
|
||||||
|
"dates = find_dates(text)\n",
|
||||||
|
"for date in dates:\n",
|
||||||
|
" print(f\"{date}\\tday:\\t{date[1]}\\tmonth:\\t{date[2]}\\tyear:\\t{date[3]}\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -205,7 +353,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.10"
|
"version": "3.11.0"
|
||||||
},
|
},
|
||||||
"subtitle": "6,7. Preprocessing i postprocessing",
|
"subtitle": "6,7. Preprocessing i postprocessing",
|
||||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||||
|
18091
lab/lab_08.ipynb
18091
lab/lab_08.ipynb
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user