forked from bfijalkowski/KWT-2024
Compare commits
3 Commits
9b9e46df22
...
824f7d373d
Author | SHA1 | Date | |
---|---|---|---|
|
824f7d373d | ||
|
6a0efac373 | ||
78982a4f21 |
@ -103,7 +103,9 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "['Press the ENTER button']"
|
||||
"text/plain": [
|
||||
"['Press the ENTER button']"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
@ -168,7 +170,9 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "['Press the ENTER button', 'Press the ENTER key']"
|
||||
"text/plain": [
|
||||
"['Press the ENTER button', 'Press the ENTER key']"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
@ -208,7 +212,9 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "[]"
|
||||
"text/plain": [
|
||||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
@ -273,7 +279,9 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "[]"
|
||||
"text/plain": [
|
||||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
@ -335,7 +343,9 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "[]"
|
||||
"text/plain": [
|
||||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
@ -478,7 +488,9 @@
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": "[('przycisk', 'button'), ('drukarka', 'printer')]"
|
||||
"text/plain": [
|
||||
"[('przycisk', 'button'), ('drukarka', 'printer')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
@ -585,7 +597,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.11"
|
||||
"version": "3.9.2"
|
||||
},
|
||||
"subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
@ -40,9 +40,11 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "existing-approval",
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "961796fd-4463-4a17-ac15-afe712b3959e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Jedną z funkcji dostępnych we wszystkich większych programach do wspomagania tłumaczenia jest znajdowanie bardzo pewnych dopasowań w pamięci tłumaczeń. Są one zwane **ICE** (In-Context Exact match) lub 101% match. Są to takie dopasowania z pamięci tłumaczeń, dla których nie tylko zdanie źródłowe z TM jest identyczne z tłumaczonym, ale także poprzednie zdanie źródłowe z TM zgadza się z poprzednim zdaniem tłumaczonym oraz następne z TM z następnym tłumaczonym."
|
||||
]
|
||||
@ -258,21 +260,9 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4064ce50",
|
||||
"id": "355e4914-08da-4bd4-b8a2-67b055831c30",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'Levenshtein'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn [2], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mLevenshtein\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m distance \u001b[38;5;28;01mas\u001b[39;00m levenshtein_distance\n\u001b[1;32m 3\u001b[0m levenshtein_distance(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mkotek\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mkotki\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
|
||||
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'Levenshtein'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from Levenshtein import distance as levenshtein_distance\n",
|
||||
"\n",
|
||||
@ -349,22 +339,9 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "invisible-cambodia",
|
||||
"cell_type": "raw",
|
||||
"id": "4a47854f-df2e-451f-8e09-99f59210f86f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.631578947368421"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"levenshtein_similarity('Spróbuj wyłączyć i włączyć komputer', 'Nie próbuj wyłączać i włączać drukarki')"
|
||||
]
|
||||
|
132
lab/lab_03.ipynb
132
lab/lab_03.ipynb
@ -171,116 +171,34 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 1,
|
||||
"id": "tribal-attention",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2024-04-20T15:23:32.727687100Z",
|
||||
"start_time": "2024-04-20T15:23:24.826454500Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" \n",
|
||||
"for\n",
|
||||
"all\n",
|
||||
"Java\n",
|
||||
"programmer\n",
|
||||
":\n",
|
||||
"this\n",
|
||||
"section\n",
|
||||
"explain\n",
|
||||
"how\n",
|
||||
"to\n",
|
||||
"compile\n",
|
||||
"and\n",
|
||||
"run\n",
|
||||
"a\n",
|
||||
"swing\n",
|
||||
"application\n",
|
||||
"from\n",
|
||||
"the\n",
|
||||
"command\n",
|
||||
"line\n",
|
||||
".\n",
|
||||
"for\n",
|
||||
"information\n",
|
||||
"on\n",
|
||||
"compile\n",
|
||||
"and\n",
|
||||
"run\n",
|
||||
"a\n",
|
||||
"swing\n",
|
||||
"application\n",
|
||||
"use\n",
|
||||
"NetBeans\n",
|
||||
"IDE\n",
|
||||
",\n",
|
||||
"see\n",
|
||||
"Running\n",
|
||||
"Tutorial\n",
|
||||
"Examples\n",
|
||||
"in\n",
|
||||
"NetBeans\n",
|
||||
"IDE\n",
|
||||
".\n",
|
||||
"the\n",
|
||||
"compilation\n",
|
||||
"instruction\n",
|
||||
"work\n",
|
||||
"for\n",
|
||||
"all\n",
|
||||
"swing\n",
|
||||
"program\n",
|
||||
"—\n",
|
||||
"applet\n",
|
||||
",\n",
|
||||
"as\n",
|
||||
"well\n",
|
||||
"as\n",
|
||||
"application\n",
|
||||
".\n",
|
||||
"here\n",
|
||||
"be\n",
|
||||
"the\n",
|
||||
"step\n",
|
||||
"-PRON-\n",
|
||||
"need\n",
|
||||
"to\n",
|
||||
"follow\n",
|
||||
":\n",
|
||||
"install\n",
|
||||
"the\n",
|
||||
"late\n",
|
||||
"release\n",
|
||||
"of\n",
|
||||
"the\n",
|
||||
"Java\n",
|
||||
"SE\n",
|
||||
"platform\n",
|
||||
",\n",
|
||||
"if\n",
|
||||
"-PRON-\n",
|
||||
"have\n",
|
||||
"not\n",
|
||||
"already\n",
|
||||
"do\n",
|
||||
"so\n",
|
||||
".\n",
|
||||
"create\n",
|
||||
"a\n",
|
||||
"program\n",
|
||||
"that\n",
|
||||
"use\n",
|
||||
"Swing\n",
|
||||
"component\n",
|
||||
".\n",
|
||||
"compile\n",
|
||||
"the\n",
|
||||
"program\n",
|
||||
".\n",
|
||||
"run\n",
|
||||
"the\n",
|
||||
"program\n",
|
||||
".\n"
|
||||
"ename": "KeyboardInterrupt",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
|
||||
"\u001B[1;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
|
||||
"Cell \u001B[1;32mIn[1], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mspacy\u001B[39;00m\n\u001B[0;32m 2\u001B[0m nlp \u001B[38;5;241m=\u001B[39m spacy\u001B[38;5;241m.\u001B[39mload(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124men_core_web_sm\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m 4\u001B[0m doc \u001B[38;5;241m=\u001B[39m nlp(text)\n",
|
||||
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\__init__.py:13\u001B[0m\n\u001B[0;32m 10\u001B[0m \u001B[38;5;66;03m# These are imported as part of the API\u001B[39;00m\n\u001B[0;32m 11\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mthinc\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mapi\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Config, prefer_gpu, require_cpu, require_gpu \u001B[38;5;66;03m# noqa: F401\u001B[39;00m\n\u001B[1;32m---> 13\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m pipeline \u001B[38;5;66;03m# noqa: F401\u001B[39;00m\n\u001B[0;32m 14\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m util\n\u001B[0;32m 15\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mabout\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m __version__ \u001B[38;5;66;03m# noqa: F401\u001B[39;00m\n",
|
||||
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\pipeline\\__init__.py:1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mattributeruler\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m AttributeRuler\n\u001B[0;32m 2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdep_parser\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m DependencyParser\n\u001B[0;32m 3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01medit_tree_lemmatizer\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m EditTreeLemmatizer\n",
|
||||
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\pipeline\\attributeruler.py:8\u001B[0m\n\u001B[0;32m 6\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m util\n\u001B[0;32m 7\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01merrors\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Errors\n\u001B[1;32m----> 8\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mlanguage\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Language\n\u001B[0;32m 9\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mmatcher\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Matcher\n\u001B[0;32m 10\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mscorer\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Scorer\n",
|
||||
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\language.py:43\u001B[0m\n\u001B[0;32m 41\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mlang\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mtokenizer_exceptions\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m BASE_EXCEPTIONS, URL_MATCH\n\u001B[0;32m 42\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mlookups\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m load_lookups\n\u001B[1;32m---> 43\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mpipe_analysis\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m analyze_pipes, print_pipe_analysis, validate_attrs\n\u001B[0;32m 44\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mschemas\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m (\n\u001B[0;32m 45\u001B[0m ConfigSchema,\n\u001B[0;32m 46\u001B[0m ConfigSchemaInit,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 49\u001B[0m validate_init_settings,\n\u001B[0;32m 50\u001B[0m )\n\u001B[0;32m 51\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mscorer\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Scorer\n",
|
||||
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\pipe_analysis.py:6\u001B[0m\n\u001B[0;32m 3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mwasabi\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m msg\n\u001B[0;32m 5\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01merrors\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Errors\n\u001B[1;32m----> 6\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mtokens\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Doc, Span, Token\n\u001B[0;32m 7\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mutil\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m dot_to_dict\n\u001B[0;32m 9\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m TYPE_CHECKING:\n\u001B[0;32m 10\u001B[0m \u001B[38;5;66;03m# This lets us add type hints for mypy etc. without causing circular imports\u001B[39;00m\n",
|
||||
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\tokens\\__init__.py:1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m_serialize\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m DocBin\n\u001B[0;32m 2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdoc\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Doc\n\u001B[0;32m 3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mmorphanalysis\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m MorphAnalysis\n",
|
||||
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\tokens\\_serialize.py:14\u001B[0m\n\u001B[0;32m 12\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01merrors\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Errors\n\u001B[0;32m 13\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mutil\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m SimpleFrozenList, ensure_path\n\u001B[1;32m---> 14\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mvocab\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Vocab\n\u001B[0;32m 15\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m_dict_proxies\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m SpanGroups\n\u001B[0;32m 16\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdoc\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m DOCBIN_ALL_ATTRS \u001B[38;5;28;01mas\u001B[39;00m ALL_ATTRS\n",
|
||||
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\vocab.pyx:1\u001B[0m, in \u001B[0;36minit spacy.vocab\u001B[1;34m()\u001B[0m\n",
|
||||
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\tokens\\doc.pyx:1\u001B[0m, in \u001B[0;36minit spacy.tokens.doc\u001B[1;34m()\u001B[0m\n",
|
||||
"File \u001B[1;32m<frozen importlib._bootstrap>:404\u001B[0m, in \u001B[0;36mparent\u001B[1;34m(self)\u001B[0m\n",
|
||||
"\u001B[1;31mKeyboardInterrupt\u001B[0m: "
|
||||
]
|
||||
}
|
||||
],
|
||||
|
File diff suppressed because one or more lines are too long
@ -60,8 +60,14 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import regex\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def find_tags(text):\n",
|
||||
" return []"
|
||||
" result = []\n",
|
||||
" for occurance in regex.finditer(\"(\\</?\\w+\\>)\", text, regex.IGNORECASE):\n",
|
||||
" result.append(occurance.span())\n",
|
||||
" return result"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -79,8 +85,12 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import regex\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Assuming text is a single word\n",
|
||||
"def is_translatable(text):\n",
|
||||
" return True"
|
||||
" return regex.fullmatch(\"[A-Z\\-]+\", text, regex.IGNORECASE) is not None"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -98,8 +108,26 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import regex\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def find_dates(text):\n",
|
||||
" return []"
|
||||
" regex_format = regex.compile(\"(?P<day>[0-9]{1,2})[/.-](?P<month>[0-9]{1,2})[/.-](?P<year>[0-9]{4})\")\n",
|
||||
" matches = regex.match(regex_format, text)\n",
|
||||
" result = {\n",
|
||||
" 'day': int(matches.group('day')),\n",
|
||||
" 'month': int(matches.group('month')),\n",
|
||||
" 'year': int(matches.group('year')),\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" return result\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(find_dates(\"01/02/1970\"))\n",
|
||||
"print(find_dates(\"01.02.1970\"))\n",
|
||||
"print(find_dates(\"01-02-1970\"))\n",
|
||||
"print(find_dates(\"1/2/1970\"))\n",
|
||||
"print(find_dates(\"1.2.1970\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -130,8 +158,22 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"formats = {\n",
|
||||
" 'd/m/y': lambda date: f\"{date['day']}/{date['month']}/{date['year']}\",\n",
|
||||
" 'y-m-d': lambda date: f\"{date['year']}-{date['month']}-{date['day']}\",\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def correct_dates(source_segment, target_segment, date_format):\n",
|
||||
" return ''"
|
||||
" source_date = find_dates(source_segment)\n",
|
||||
" target_date = find_dates(target_segment)\n",
|
||||
" if target_date != source_date:\n",
|
||||
" print('Dates differ')\n",
|
||||
"\n",
|
||||
" return formats[date_format](source_date)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(correct_dates(\"1.2.1970\", \"1.2.1970\", 'y-m-d'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -190,7 +232,7 @@
|
||||
"author": "Rafał Jaworski",
|
||||
"email": "rjawor@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@ -205,7 +247,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.9.2"
|
||||
},
|
||||
"subtitle": "6,7. Preprocessing i postprocessing",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
@ -50,37 +50,36 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Nastolatek ukradł znajomemu 4500 złotych. Wcześniej pił z nim alkohol\n",
|
||||
"Czekają nas kolejne podwyżki rachunków. Tym razem za ogrzewanie i ciepłą wodę\n",
|
||||
"Nie żyje Piotr Ś. Czyściciel kamienic miał 47 lat\n",
|
||||
"Maciej Skorża nie zmienił zdania o systemie na mecz z Rakowem. Kolejorz ma szybką okazję do rehabilitacji\n",
|
||||
"Kto zabił Kazimierę Kurkowiak? Poznańskie Archiwum X wraca do sprawy sprzed 30 lat\n",
|
||||
"Mieszkańcy osiedla Kwiatowego zyskają nowy chodnik\n",
|
||||
"Poznańskie ZOO ponownie się otwiera i apeluje o kupowanie biletów online\n",
|
||||
"1700 zł mandatu dla motocyklisty: nie ma prawa jazdy, jechał za szybko\n",
|
||||
"Plac Wolności ma tętnić życiem. Jest koncepcja zagospodarowania\n",
|
||||
"Dzikie wysypisko w Wielkopolskim Parku Narodowym, a w nim paczka z telefonem odbiorcy\n",
|
||||
"Dobre wieści z Łazarza! \"Zielona Perła\" sprzedana!\n",
|
||||
"Sokoły wędrowne w gnieździe na kominie poznańskiej elektrociepłowni! Są 4 młode\n",
|
||||
"720 nowych zakażeń w Wielkopolsce\n",
|
||||
"Uderzył kobietę w sklepie: \"sprawca będzie rozliczony\"\n",
|
||||
"Zespół Szkół Geodezyjno- Drogowych. Przyszłość rysuje się w kolorowych barwach!\n",
|
||||
"Tajemniczy wypadek i pożar pod Kwilczem. Auto spłonęło, w środku nikogo nie było\n",
|
||||
"Nad Jeziorem Maltańskim powstanie duży hotel? \"Ma uzupełniać infrastrukturę sportową\"\n",
|
||||
"Śmiertelny wypadek na trasie S8: samochód potrącił rowerzystę\n",
|
||||
"Specjaliści o poszukiwaniu Natalii Lick: \"niestety trop psa prowadził na Wartostradę\"\n",
|
||||
"Korki przy skrzyżowaniu Grochowska / Grunwaldzka: ruszyły prace!\n",
|
||||
"Restauracja w Kaliszu przyjmuje klientów: sanepid i policja \"odwiedzili\" lokal\n",
|
||||
"Ile kosztuje wywóz odpadów?\n",
|
||||
"Dachowanie auta na trasie Konin - Turek\n",
|
||||
"Kierowca BMW pod wpływem narkotyków, pasażer w ich posiadaniu. Obaj zostali zatrzymani\n",
|
||||
"Leszno: mężczyzna uderzył klientkę sklepu. Poszło o maseczkę?\n",
|
||||
"Od poniedziałku zapłacimy za parkowanie na kolejnych ulicach\n",
|
||||
"Włamał się do obiektu handlowego. Grozi mu nawet 15 lat więzienia\n",
|
||||
"Rondo Śródka: kolizja z udziałem dwóch pojazdów\n",
|
||||
"Europoseł PSL: oświadczenie Episkopatu ma wpływ na proces szczepień. \"Bardzo dużo ludzi zrezygnowało\"\n",
|
||||
"Bezcenna wygrana Enea Energetyka. Poznanianki zagrają w fazie play-off\n",
|
||||
"No to w drogę! Po odmienionych trasach w Wielkopolsce\n"
|
||||
"W Poznaniu uroczyście odsłonięto monument upamiętniający cmentarz żydowski założony jeszcze w XIX wieku\n",
|
||||
"Przez ulice Poznania przejdzie Marsz dla Życia. Będą utrudnienia\n",
|
||||
"Sierść psa zatopiona w żywicy? Taką biżuterię pamiątkową zlecają właściciele czworonożnych pociech\n",
|
||||
"Nagrał film w jednej z poznańskich \"Biedronek\". Kilka spleśniałych cytryn w kartonie. \"Nikt się tym nie przejmuje\"\n",
|
||||
"Gniezno: poszkodowani po ulewie będą mogli ubiegać się o pomoc w ZUS i US. Powstała również specjalna infolinia\n",
|
||||
"Zostawiła jedzenie dla potrzebujących. Coraz więcej głodnych osób, którym nie wystarcza pieniędzy po opłaceniu rachunków\n",
|
||||
"Kolejne ostrzeżenie I stopnia od IMGW. Oprócz burz może wystąpić również grad\n",
|
||||
"Lech przegrał Koroną. Na trybunach marsz żałobny i 'mamy k**** dość'\n",
|
||||
"Warta Poznań po przegranej z Jagielonią Białystok spada do I ligi\n",
|
||||
"Mieszkańcy skarżą się na właściciela samochodu, w którym notorycznie włącza się alarm. \"Uprzykrza nam to życie!\"\n",
|
||||
"Leśne Placówki Montessori\n",
|
||||
"Na autostradzie samochód wpadł w poślizg i stanął w poprzek. Są spore utrudnienia\n",
|
||||
"Wróciła plaga kradzieży katalizatorów. Zmora dla kierowców, którzy nie mogą garażować auta\n",
|
||||
"Nowy basen w Kiekrzu? W tunelu wody przybyło po same kolana\n",
|
||||
"Pierożki Dim Sum z Para Bar Rataje ze specjalną zniżką!\n",
|
||||
"Wielka głowa Darii Zawiałow zablokowała przez chwilę przejście dla pieszych na jednej z poznańskich ulic\n",
|
||||
"Fałszywy pożar w centrum Poznania. Kłęby dymu w kamienicy?\n",
|
||||
"Jest kolejne ostrzeżenie pierwszego stopnia, tym razem hydrologiczne. Gwałtowny wzrost stanu wody\n",
|
||||
"Uwaga. Utrudnienia na drodze i ograniczenie prędkości. Potrwa to około 5 godzin\n",
|
||||
"Chcą pobić rekord w kręceniu lodów. Tona lodów w ciągu doby\n",
|
||||
"Jest ostrzeżenie IMGW dla Wielkopolski. Lepiej schować przedmioty, które mogą przemieścić się pod wypływem silnego wiatru\n",
|
||||
"Nowe Centrum Medyczne Bizpark już w sprzedaży. Znajdź idealny lokal pod swoją działalność medyczną\n",
|
||||
"Rondo Obornickie: zderzenie samochodu z motocyklem. Poszkodowany został odwieziony do szpitala. Chwilowe utrudnienia\n",
|
||||
"Policjanci publikują wizerunek i szukają tego mężczyzny\n",
|
||||
"Grupa Stonewall będzie miała program na antenie TVP3 Poznań. \"To będzie odtrutka na lata dezinformacji\"\n",
|
||||
"Ruszył remont ważnego mostu. Co z kłódkami zakochanych?\n",
|
||||
"Mieszkaniec spotkał wilka w Poznaniu?\n",
|
||||
"Włamanie do... lokomotywy\n",
|
||||
"W nadwarciański krajobraz wpisały się... żurawie. \"Jeden jest największy na świecie\"\n",
|
||||
"Robisz remont? Za to możesz słono zapłacić!\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -108,13 +107,51 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 10,
|
||||
"id": "moving-clothing",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_names(article_type):\n",
|
||||
" return []"
|
||||
"from bs4 import element\n",
|
||||
"\n",
|
||||
"def get_names(article_type, page_nr: int = 0):\n",
|
||||
" url = 'https://www.ceneo.pl/;szukaj-' + article_type + ';0020-30-0-0-' + str(page_nr) + '.htm'\n",
|
||||
" page = requests.get(url)\n",
|
||||
" if page_nr != 0 and url != page.url:\n",
|
||||
" return []\n",
|
||||
" soup = BeautifulSoup(page.content, 'html.parser')\n",
|
||||
" result = []\n",
|
||||
"\n",
|
||||
" def is_product_title_container(tag: element.Tag) -> bool:\n",
|
||||
" if not tag.has_attr('class'):\n",
|
||||
" return False\n",
|
||||
"\n",
|
||||
" classes = tag.attrs['class']\n",
|
||||
" if len(classes) != 1:\n",
|
||||
" return False\n",
|
||||
"\n",
|
||||
" return classes[0] == 'cat-prod-row__name'\n",
|
||||
"\n",
|
||||
" def is_product_title(tag: element.Tag) -> bool:\n",
|
||||
" if not tag.has_attr('class'):\n",
|
||||
" return True\n",
|
||||
"\n",
|
||||
" classes = tag.attrs['class']\n",
|
||||
" if len(classes) != 1:\n",
|
||||
" return False\n",
|
||||
"\n",
|
||||
" return classes[0] == 'font-bold'\n",
|
||||
"\n",
|
||||
" for tag in soup.find_all(is_product_title_container):\n",
|
||||
" href = tag.find('a')\n",
|
||||
" if type(href) is not element.Tag:\n",
|
||||
" continue\n",
|
||||
" spans = href.find_all('span')\n",
|
||||
" for span in spans:\n",
|
||||
" if is_product_title(span):\n",
|
||||
" result.append(span.text)\n",
|
||||
"\n",
|
||||
" return result"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -135,13 +172,21 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 9,
|
||||
"id": "german-dispute",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def scrape_names():\n",
|
||||
" return []"
|
||||
" result = []\n",
|
||||
" search = 'laptop'\n",
|
||||
" page = 0\n",
|
||||
" while True:\n",
|
||||
" local_result = get_names(search, page)\n",
|
||||
" if len(local_result) == 0:\n",
|
||||
" return result\n",
|
||||
" result = result + local_result\n",
|
||||
" page += 1"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -197,13 +242,39 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 8,
|
||||
"id": "regulation-sheriff",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def scrape_wmi():\n",
|
||||
" return []"
|
||||
" def get_text(soup_l: BeautifulSoup) -> str:\n",
|
||||
" for trash in soup_l(['script', 'style']):\n",
|
||||
" trash.extract()\n",
|
||||
"\n",
|
||||
" text = soup_l.get_text()\n",
|
||||
"\n",
|
||||
" return re.sub(r'\\s+', ' ', text)\n",
|
||||
"\n",
|
||||
" result = []\n",
|
||||
"\n",
|
||||
" base_url = 'https://wmi.amu.edu.pl/'\n",
|
||||
" page = requests.get(base_url)\n",
|
||||
" soup = BeautifulSoup(page.content, 'html.parser')\n",
|
||||
" result.append(get_text(soup))\n",
|
||||
" for href in soup.find_all('a'):\n",
|
||||
" if type(href) != element.Tag:\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" if not href.has_attr('href'):\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" if base_url in href.attrs['href']:\n",
|
||||
" sub_page = requests.get(href.attrs['href'])\n",
|
||||
" result.append(get_text(BeautifulSoup(sub_page.content, 'html.parser')))\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" return result"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -222,30 +293,97 @@
|
||||
"### Ćwiczenie 4: Pobierz jak najwięcej słów w języku albańskim z serwisu glosbe.com."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "706d6cba-c7a7-4d1b-9c2f-eb2119f859b5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Nie jest to rozwiązanie zbalansowane, ale pobierze najwięcej słów (Przy odpowiedniej rotacji adresów IP, z których korzystamy, ale założyłem, że kwestia infrastruktury i tego jak strona jest chroniona przed atakami DOS, jest poza zakresem tego zadania)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 7,
|
||||
"id": "surgical-ozone",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def scrape_shqip():\n",
|
||||
" return []"
|
||||
" import string\n",
|
||||
"\n",
|
||||
" result = []\n",
|
||||
" letters = list(string.ascii_lowercase)\n",
|
||||
" letters_count = len(letters)\n",
|
||||
" longest_sensible_english_word_len = 28\n",
|
||||
" base_url = 'https://glosbe.com/en/sq/'\n",
|
||||
"\n",
|
||||
" def get_words(word_l: str) -> list[str]:\n",
|
||||
" def is_translated_word(tag: element.Tag) -> bool:\n",
|
||||
" if not tag.has_attr('id') or not tag.has_attr('lang'):\n",
|
||||
" return False\n",
|
||||
"\n",
|
||||
" if not 'translation__' in tag.attrs['id'] or 'sq' != tag.attrs['lang']:\n",
|
||||
" return False\n",
|
||||
"\n",
|
||||
" return True\n",
|
||||
"\n",
|
||||
" result_l = []\n",
|
||||
" page = requests.get(base_url + word_l)\n",
|
||||
" soup = BeautifulSoup(page.content, 'html.parser')\n",
|
||||
" words_l = soup.find_all(is_translated_word)\n",
|
||||
" for word_l in words_l:\n",
|
||||
" text = word_l.text\n",
|
||||
" result_l.append(re.sub(r'\\s+', ' ', text))\n",
|
||||
"\n",
|
||||
" return result_l\n",
|
||||
"\n",
|
||||
" def trans(word_l: list[int]) -> str:\n",
|
||||
" result_l = ''\n",
|
||||
" for letter_l in word_l:\n",
|
||||
" result_l += letters[letter_l]\n",
|
||||
"\n",
|
||||
" return result_l\n",
|
||||
"\n",
|
||||
" def increment(word_l: list[int]) -> list[int]:\n",
|
||||
" done = False\n",
|
||||
" result_l = []\n",
|
||||
" for letter_l in word_l:\n",
|
||||
" if done:\n",
|
||||
" result_l.append(letter_l)\n",
|
||||
" continue\n",
|
||||
" next_letter_l = letter_l + 1\n",
|
||||
" if next_letter_l == letters_count:\n",
|
||||
" result_l.append(0)\n",
|
||||
" continue\n",
|
||||
"\n",
|
||||
" result_l.append(next_letter_l)\n",
|
||||
" done = True\n",
|
||||
"\n",
|
||||
" return result_l\n",
|
||||
"\n",
|
||||
" for length in range(longest_sensible_english_word_len - 1):\n",
|
||||
" length += 1\n",
|
||||
" combos = pow(length, letters_count)\n",
|
||||
" word = []\n",
|
||||
" for pos in range(length):\n",
|
||||
" word.append(0)\n",
|
||||
" for i in range(combos):\n",
|
||||
" result.append(get_words(trans(word)))\n",
|
||||
" word = increment(word)\n",
|
||||
"\n",
|
||||
" return result"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Rafał Jaworski",
|
||||
"email": "rjawor@amu.edu.pl",
|
||||
"lang": "pl",
|
||||
"subtitle": "9,10. Web scraping",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
"year": "2021",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
@ -256,8 +394,11 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
}
|
||||
"version": "3.10.4"
|
||||
},
|
||||
"subtitle": "9,10. Web scraping",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
|
Loading…
Reference in New Issue
Block a user