This commit is contained in:
potato 2024-04-23 23:52:29 +02:00
parent d32188878d
commit e02ff5ab39
3 changed files with 18512 additions and 45 deletions

File diff suppressed because one or more lines are too long

View File

@ -60,8 +60,35 @@
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def find_tags(string):\n",
"    \"\"\"Locate markup tags in ``string``.\n",
"\n",
"    A tag is any ``<...>`` span holding at least one character other than\n",
"    ``>`` between the angle brackets, so opening and closing tags both count.\n",
"\n",
"    Returns a list of ``(start, end)`` index pairs, in order of appearance,\n",
"    such that ``string[start:end]`` is the text of each tag.\n",
"    \"\"\"\n",
"    tag_pattern = r'<[^>]+>'\n",
"    return [match.span() for match in re.finditer(tag_pattern, string)]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3dc08368",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['<tag1>', '</tag1>', '<tag2>', '</tag2>']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"string = \"<tag1>ADIOS</tag1><tag2>OLA</tag2>\"\n",
"# Slice every (start, end) span back out of the input to show the tag text.\n",
"[string[start:end] for start, end in find_tags(string)]"
]
},
{
@ -74,13 +101,73 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 20,
"id": "unauthorized-study",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n",
"False\n",
"False\n",
"True\n"
]
}
],
"source": [
"import re\n",
"import string\n",
"\n",
"def is_translatable(text):\n",
"    \"\"\"Return True when ``text`` is one unbroken run of allowed characters.\n",
"\n",
"    ``text`` must be non-empty and may not contain any ASCII digit, any of\n",
"    the capital letters I, V, X, L, C, D, M (the Roman-numeral letters),\n",
"    any whitespace, or any character from ``string.punctuation``.\n",
"\n",
"    NOTE(review): because whitespace and punctuation are rejected, ordinary\n",
"    sentences such as 'This is a sample text.' return False; only a single\n",
"    unpunctuated token can pass.\n",
"    \"\"\"\n",
"    return bool(re.match(r'^[^0-9IVXLCDM\\s' + re.escape(string.punctuation) + ']+$', text))\n",
"\n",
"text1 = \"This is a sample text.\"\n",
"text2 = \"2024.\"\n",
"text3 = \"Это пример текста.\"\n",
"text4 = \"おはよう\"\n",
"\n",
"# Print one verdict per sample, in the order the samples are defined.\n",
"for sample in (text1, text2, text3, text4):\n",
"    print(is_translatable(sample))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "ae92a18c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\"This is a sample text.\" is translatable?\n",
"Yes\n",
"\"2024.\" is translatable?\n",
"No\n",
"\"Это пример текста.\" is translatable?\n",
"Yes\n",
"\"おはよう\" is translatable?\n",
"Yes\n"
]
}
],
"source": [
"import re\n",
"import string\n",
"\n",
"def is_translatable(text):\n",
"    \"\"\"Return True if ``text`` contains at least one letter.\n",
"\n",
"    Text made up solely of digits, punctuation and/or whitespace (for\n",
"    example '2024.', '...' or the empty string) has nothing to translate.\n",
"    Text that mixes words with numbers, such as 'Chapter 3', still does.\n",
"    Letters of any script count, because ``str.isalpha`` is Unicode-aware.\n",
"    \"\"\"\n",
"    return any(char.isalpha() for char in text)\n",
"\n",
"examples = [\n",
"    \"This is a sample text.\",\n",
"    \"2024.\",\n",
"    \"Это пример текста.\",\n",
"    \"おはよう\",\n",
"]\n",
"\n",
"# Echo each example as a question, with the verdict on the following line.\n",
"for ex in examples:\n",
"    if is_translatable(ex):\n",
"        response = 'Yes'\n",
"    else:\n",
"        response = 'No'\n",
"    print(f'\"{ex}\" is translatable?\\n{response}')"
]
},
{
@ -93,13 +180,74 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 44,
"id": "beautiful-mathematics",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('03/25/2022', 3, 25, 2022)\tday:\t3\tmonth:\t25\tyear:\t2022\n",
"('25-12-2023', 25, 12, 2023)\tday:\t25\tmonth:\t12\tyear:\t2023\n",
"('09/30/2025', 9, 30, 2025)\tday:\t9\tmonth:\t30\tyear:\t2025\n",
"('03/25/2022', 25, 3, 2022)\tday:\t25\tmonth:\t3\tyear:\t2022\n",
"('09/30/2025', 30, 9, 2025)\tday:\t30\tmonth:\t9\tyear:\t2025\n",
"('12 March 2024', 12, 3, 2024)\tday:\t12\tmonth:\t3\tyear:\t2024\n",
"('25-12-2023', 25, 12, 2023)\tday:\t25\tmonth:\t12\tyear:\t2023\n",
"('12 March 2024', 12, 3, 2024)\tday:\t12\tmonth:\t3\tyear:\t2024\n",
"('15 September, 2026', 15, 9, 2026)\tday:\t15\tmonth:\t9\tyear:\t2026\n"
]
}
],
"source": [
"import re\n",
"\n",
"def find_dates(text):\n",
"    \"\"\"Find calendar dates in ``text``.\n",
"\n",
"    Supported formats:\n",
"      * ``MM/DD/YYYY`` - slash-separated, month first\n",
"      * ``DD-MM-YYYY`` - dash-separated, day first\n",
"      * ``DD Month YYYY`` or ``DD Month, YYYY`` - full English month name\n",
"\n",
"    For the numeric formats, if the conventional reading yields an\n",
"    impossible date (month above 12) the two leading numbers are swapped;\n",
"    a candidate that is impossible both ways is skipped.\n",
"\n",
"    Returns a list of ``(matched_text, day, month, year)`` tuples ordered by\n",
"    position in ``text``; ``day``, ``month`` and ``year`` are ints. Each date\n",
"    is reported exactly once.\n",
"    \"\"\"\n",
"    months = {\n",
"        'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,\n",
"        'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12\n",
"    }\n",
"\n",
"    # A single combined pattern, so overlapping formats cannot report the\n",
"    # same date twice. The (?P=sep) backreference rejects mixed separators\n",
"    # such as 03/25-2022, the lookarounds stop a date from matching inside a\n",
"    # longer digit run, and only known month names are accepted, so the\n",
"    # lookup below can never raise KeyError.\n",
"    date_pattern = re.compile(\n",
"        r'(?<!\\d)(?:'\n",
"        r'(?P<first>\\d{1,2})(?P<sep>[-/])(?P<second>\\d{1,2})(?P=sep)(?P<num_year>\\d{2,4})'\n",
"        r'|(?P<day>\\d{1,2}) (?P<month>' + '|'.join(months) + r'),? (?P<text_year>\\d{4})'\n",
"        r')(?!\\d)'\n",
"    )\n",
"\n",
"    def plausible(day, month):\n",
"        # Coarse range check only; month lengths and leap years are not verified.\n",
"        return 1 <= day <= 31 and 1 <= month <= 12\n",
"\n",
"    dates_found = []\n",
"    for match in date_pattern.finditer(text):\n",
"        separator = match.group('sep')\n",
"        if separator is None:\n",
"            # Textual form: day, month name, year.\n",
"            day = int(match.group('day'))\n",
"            month = months[match.group('month')]\n",
"            year = int(match.group('text_year'))\n",
"        else:\n",
"            first = int(match.group('first'))\n",
"            second = int(match.group('second'))\n",
"            year = int(match.group('num_year'))\n",
"            if separator == '/':\n",
"                month, day = first, second\n",
"            else:\n",
"                day, month = first, second\n",
"            if not plausible(day, month):\n",
"                # Fall back to the other field order before giving up.\n",
"                day, month = month, day\n",
"        if plausible(day, month):\n",
"            dates_found.append((match.group(), day, month, year))\n",
"    return dates_found\n",
"\n",
"text = \"Here are some dates: 03/25/2022, 25-12-2023, 12 March 2024, 09/30/2025, 15 September, 2026\"\n",
"dates = find_dates(text)\n",
"# One tab-separated line per match: the raw tuple, then its day/month/year fields.\n",
"for date in dates:\n",
"    line = f\"{date}\\tday:\\t{date[1]}\\tmonth:\\t{date[2]}\\tyear:\\t{date[3]}\"\n",
"    print(line)"
]
},
{
@ -205,7 +353,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.0"
},
"subtitle": "6,7. Preprocessing i postprocessing",
"title": "Komputerowe wspomaganie tłumaczenia",

File diff suppressed because it is too large Load Diff