Compare commits

...

5 Commits
main ... main

Author SHA1 Message Date
Patryk Bartkowiak fd590b3a22 2024-05-12 lab 9, 10 2024-05-12 23:12:15 +02:00
Patryk Bartkowiak 9b75563e6a [2024-04-14] lab 8 2024-04-14 22:21:17 +02:00
Patryk Bartkowiak 7a6ac33f6e [2024-04-14] lab 6, 7 2024-04-14 19:33:04 +02:00
Patryk Bartkowiak 5de69211e1 [2024-04-14] lab 4, 5 2024-04-14 18:45:52 +02:00
Patryk Bartkowiak 870b673fac [2024-04-13] labs 1,2,3 2024-04-13 14:10:00 +02:00
11 changed files with 2420 additions and 270 deletions

2
lab/data/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
corpus/
NIPS Papers/

10
lab/data/lda_topics.txt Normal file
View File

@ -0,0 +1,10 @@
(0, '0.006*"learning" + 0.005*"model" + 0.005*"data" + 0.004*"function" + 0.004*"set" + 0.004*"using" + 0.004*"number" + 0.004*"neural" + 0.004*"one" + 0.003*"error"')
(1, '0.008*"learning" + 0.006*"data" + 0.005*"model" + 0.005*"set" + 0.004*"algorithm" + 0.004*"time" + 0.004*"one" + 0.004*"two" + 0.003*"used" + 0.003*"figure"')
(2, '0.007*"data" + 0.005*"model" + 0.005*"set" + 0.005*"learning" + 0.004*"one" + 0.004*"algorithm" + 0.004*"time" + 0.003*"using" + 0.003*"figure" + 0.003*"training"')
(3, '0.006*"data" + 0.005*"model" + 0.004*"learning" + 0.004*"two" + 0.004*"algorithm" + 0.004*"using" + 0.004*"function" + 0.004*"set" + 0.003*"number" + 0.003*"given"')
(4, '0.006*"learning" + 0.005*"data" + 0.005*"model" + 0.005*"set" + 0.004*"algorithm" + 0.004*"time" + 0.004*"using" + 0.004*"two" + 0.004*"function" + 0.003*"one"')
(5, '0.008*"learning" + 0.006*"data" + 0.005*"algorithm" + 0.004*"model" + 0.004*"two" + 0.004*"function" + 0.004*"number" + 0.003*"figure" + 0.003*"time" + 0.003*"set"')
(6, '0.007*"learning" + 0.006*"model" + 0.005*"data" + 0.005*"algorithm" + 0.004*"function" + 0.004*"set" + 0.003*"time" + 0.003*"one" + 0.003*"based" + 0.003*"number"')
(7, '0.007*"learning" + 0.005*"set" + 0.005*"data" + 0.005*"model" + 0.004*"algorithm" + 0.004*"function" + 0.004*"using" + 0.004*"number" + 0.004*"log" + 0.004*"figure"')
(8, '0.005*"learning" + 0.005*"set" + 0.005*"algorithm" + 0.004*"model" + 0.004*"function" + 0.004*"data" + 0.004*"one" + 0.004*"time" + 0.003*"using" + 0.003*"given"')
(9, '0.007*"data" + 0.006*"model" + 0.005*"learning" + 0.005*"algorithm" + 0.004*"two" + 0.003*"number" + 0.003*"time" + 0.003*"set" + 0.003*"function" + 0.003*"used"')

100
lab/data/top_nouns.txt Normal file
View File

@ -0,0 +1,100 @@
project victims support visit mediation
exhibition cooperation year meeting films
exhibition cooperation year meeting films
solution occupation settlement problem resolutions
residence citizens permit security citizen
residence citizens permit security citizen
support measures countries farmers member
data services infrastructure development project
data services infrastructure development project
photographs service scans materials films
photographs service scans materials films
insurance ZUS contributions benefits administration
project archaeology research conservation history
project archaeology research conservation history
cases % coronavirus countries disease
% year case cases coronavirus
ship tug speed accident course
ship tug speed accident course
work scientists research science telomerase
work scientists research science telomerase
film media part time efforts
film media part time efforts
insurance ZUS contributions benefits administration
use care stewardship resistance antibiotics
services administration state information e
services administration state information e
coronavirus research measures outbreak member
residence card foreigner work permit
security e threats policy gas
security e threats policy gas
paper 15th reader file date
paper 15th reader file date
costs implementation management tasks expenditures
food cooperation products market agri
costs implementation management tasks expenditures
costs implementation management tasks expenditures
artist work painting paintings time
artist work painting paintings time
Home » rights representatives discrimination
Home » rights representatives discrimination
command documentation alias files directory
water basis land status item
water basis land status item
% contract contracts . No
food cooperation products market agri
% contract contracts . No
market level services age companies
market level services age companies
projects innovation R&D development companies
projects innovation R&D development companies
contracts contract % item procedures
contracts contract % item procedures
room A office information B
room A office information B
advantage production country countries goods
measles vaccine disease person people
advantage production country countries goods
card residence permission business stamp
card residence permission business stamp
w % gospodarczego polityki publicznych
system banks stability risk sector
camps people concentration policy resistance
camps people concentration policy resistance
safety aviation management requirements entity
safety aviation management requirements entity
research call philosophy information project
vaccination pertussis cancer risk disease
research call philosophy information project
energy gas % oil countries
energy gas % oil countries
cooperation meeting talks forces defence
project education information coronavirus funding
food education project measures assistance
infection disease symptoms fever humans
energy audit costs use management
countries % development benefits funds
years minister year rector persons
water food fish times year
land water population data age
land water population data age
market labour crisis unemployment countries
market labour crisis unemployment countries
accelerator research - operation model
accelerator research - operation model
energy policy power development objectives
priest hand country wedding church
eggs breakfast food products meat
eggs breakfast food products meat
water fish times food year
honey production bread time taste
honey production bread time taste
data job portal vacancies Decision
data job portal vacancies Decision
food quality products apples farmers
food quality products apples farmers
visa activities child B-1 institution
visa activities child B-1 institution
- co preparations operation preparation
- co preparations operation preparation
project victims support visit mediation

View File

@ -0,0 +1,100 @@
approval total lawyers priorities judges
agriculture support guests offers author
agriculture support guests offers author
homeland invasion address prisoners sources
identity positions elaboration issues terms
identity positions elaboration issues terms
distancing lenders mechanism check part
IT Realization Services resolutions bases
IT Realization Services resolutions bases
occupation scans browser Service processes
occupation scans browser Service processes
am war month Insurance centralisation
conservation zu provisions basin record
conservation zu provisions basin record
culture city abscesses aeronautics disruptors
infection Recommendations man evening occurrence
course hull STATE classifier certificate
course hull STATE classifier certificate
cooling work culture part laboratory
cooling work culture part laboratory
culture reverse advisor documentary service
culture reverse advisor documentary service
am war month Insurance centralisation
pressure ability entry prescribers costs
economies management role disk stakeholders
economies management role disk stakeholders
traders fears carriers illness distancing
activity employment foreigners Visa graduate
defense forecast quarter factors opportunity
defense forecast quarter factors opportunity
case author screen announcement typefaces
case author screen announcement typefaces
revenue office premises o proposals
storage completion efforts Meeting crisis
office Types premises protection days
revenue office premises o proposals
pictures splashing dobrze viewer culture
pictures splashing dobrze viewer culture
creation origin discrimination interest institutions
creation origin discrimination interest institutions
names contexts calculator program descriptor
periods standards total name property
periods standards total name property
Art days liability authorities services
storage completion efforts Meeting crisis
Art days liability authorities services
skills provision country economies science
skills provision country economies science
Project possibilities cancer members therapies
Project possibilities cancer members therapies
price auction actions telecommunications appointment
price auction actions telecommunications appointment
records coffee authorisation line times
records coffee authorisation line times
example manner source essence identification
defences vaccines days spread body
example manner source essence identification
servants employees Possession insurance examinations
servants employees Possession insurance examinations
systemowe dopiero system latach popytem
efficiency problems uncertainty improvement Risk
uprising borders rights security campaign
uprising borders rights security campaign
part audits Responsibilities services authority
protection competence version occurrence requisition
Requirements members methodology data database
whoop substitute cause exposure course
Requirements members methodology data database
erent decisions SOURCES spectrum economies
erent decisions SOURCES spectrum economies
invitation effects help armament round
area teaching tax time travel
time Recommendation participants guarantees work
toxin mechanisms attacks Babies therapies
production replacement control SMEs audit
significance net ground participants levels
functioning consultation interest expert procedures
thing mercury eggs municipality lunch
agriculture R result development prices
agriculture R result development prices
reflection basis sources points results
reflection basis sources points results
leaders reach author features publications
leaders reach author features publications
consumption Improvement bodies level need
money delirium advice house couple
work thanks BEgINNINg range funds
work thanks BEgINNINg range funds
option eggs dinner wine quantities
seeds mead event maples approach
seeds mead event maples approach
case complaints consultation Employers actions
case complaints consultation Employers actions
activity fruit indications zation rice
activity fruit indications zation rice
building work premises Food child
building work premises Food child
virtue works culture sectors others
virtue works culture sectors others
approval total lawyers priorities judges

View File

@ -52,7 +52,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 191,
"id": "narrow-romantic",
"metadata": {},
"outputs": [],
@ -71,7 +71,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 192,
"id": "indonesian-electron",
"metadata": {},
"outputs": [],
@ -82,7 +82,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 193,
"id": "compact-trinidad",
"metadata": {},
"outputs": [
@ -92,7 +92,7 @@
"['Press the ENTER button']"
]
},
"execution_count": 3,
"execution_count": 193,
"metadata": {},
"output_type": "execute_result"
}
@ -119,7 +119,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 194,
"id": "exposed-daniel",
"metadata": {},
"outputs": [],
@ -139,7 +139,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 195,
"id": "serial-velvet",
"metadata": {},
"outputs": [
@ -149,7 +149,7 @@
"['Press the ENTER button', 'Press the ENTER key']"
]
},
"execution_count": 5,
"execution_count": 195,
"metadata": {},
"output_type": "execute_result"
}
@ -176,7 +176,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 196,
"id": "every-gibson",
"metadata": {},
"outputs": [
@ -186,7 +186,7 @@
"[]"
]
},
"execution_count": 6,
"execution_count": 196,
"metadata": {},
"output_type": "execute_result"
}
@ -213,13 +213,37 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 197,
"id": "protected-rings",
"metadata": {},
"outputs": [],
"source": [
"def preprocess(sentence):\n",
" return sentence.lower()\n",
"\n",
"def tm_lookup(sentence):\n",
" return ''"
" return [entry[1] for entry in translation_memory if preprocess(entry[0]) == preprocess(sentence)]"
]
},
{
"cell_type": "code",
"execution_count": 198,
"id": "7baee10b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Press the ENTER button', 'Press the ENTER key']"
]
},
"execution_count": 198,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tm_lookup('Wciśnij przycisk ENTER')"
]
},
{
@ -232,17 +256,17 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 199,
"id": "severe-alloy",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"''"
"[]"
]
},
"execution_count": 18,
"execution_count": 199,
"metadata": {},
"output_type": "execute_result"
}
@ -261,13 +285,40 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 200,
"id": "structural-diesel",
"metadata": {},
"outputs": [],
"source": [
"import string\n",
"\n",
"def preprocess(s):\n",
" translator = str.maketrans('', '', string.punctuation)\n",
" return s.translate(translator).lower()\n",
"\n",
"def tm_lookup(sentence):\n",
" return ''"
" return [entry[1] for entry in translation_memory if preprocess(entry[0]) == preprocess(sentence)]"
]
},
{
"cell_type": "code",
"execution_count": 201,
"id": "c03c6709",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Press the ENTER button', 'Press the ENTER key']"
]
},
"execution_count": 201,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tm_lookup('Wciśnij przycisk [ENTER]')"
]
},
{
@ -280,17 +331,17 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 202,
"id": "brief-senegal",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"''"
"[]"
]
},
"execution_count": 12,
"execution_count": 202,
"metadata": {},
"output_type": "execute_result"
}
@ -317,13 +368,43 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 203,
"id": "mathematical-customs",
"metadata": {},
"outputs": [],
"source": [
"def compare_sentences(l1, l2):\n",
" return sum([1 for i, j in zip(l1.split(), l2.split()) if i != j]) <= 1\n",
"\n",
"import string\n",
"\n",
"def preprocess(s):\n",
" translator = str.maketrans('', '', string.punctuation)\n",
" return s.translate(translator).lower()\n",
"\n",
"def tm_lookup(sentence):\n",
" return ''"
" return [entry[1] for entry in translation_memory if compare_sentences(preprocess(entry[0]), preprocess(sentence))]"
]
},
{
"cell_type": "code",
"execution_count": 204,
"id": "6264b722",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['System restart required']"
]
},
"execution_count": 204,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tm_lookup('Wymagane ponowne uruchomienie maszyny')"
]
},
{
@ -344,7 +425,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 205,
"id": "humanitarian-wrong",
"metadata": {},
"outputs": [],
@ -362,7 +443,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 206,
"id": "located-perception",
"metadata": {},
"outputs": [],
@ -374,7 +455,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 207,
"id": "advised-casting",
"metadata": {},
"outputs": [
@ -384,7 +465,7 @@
"[('przycisk', 'button'), ('drukarka', 'printer')]"
]
},
"execution_count": 17,
"execution_count": 207,
"metadata": {},
"output_type": "execute_result"
}
@ -406,7 +487,7 @@
"id": "defensive-fifteen",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Jeżeli implementacja wygląda tak jak powyżej, złożoność to `O(n*m)`, gdzie `n` to liczba słów w zdaniu, a `m` to liczba haseł w słowniku, ponieważ dla każdego słowa zdania przechodzimy iteracyjnie przez cały słownik w poszukiwaniu odpowiednika."
]
},
{
@ -419,13 +500,56 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 208,
"id": "aca5d340",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('przycisk', 'button')]"
]
},
"execution_count": 208,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"glossary_lookup('Każda Drukarka posiada przycisk wznowienia drukowania')"
]
},
{
"cell_type": "code",
"execution_count": 209,
"id": "original-tunisia",
"metadata": {},
"outputs": [],
"source": [
"def glossary_lookup(sentence):\n",
" return ''"
" sentence_words = [word.lower() for word in sentence.split()]\n",
" return [entry for entry in glossary if entry[0] in sentence_words]"
]
},
{
"cell_type": "code",
"execution_count": 210,
"id": "716bbbe9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('przycisk', 'button'), ('drukarka', 'printer')]"
]
},
"execution_count": 210,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')"
]
},
{
@ -438,13 +562,50 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 211,
"id": "32dec661",
"metadata": {},
"outputs": [],
"source": [
"glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]\n",
"glossary = {\n",
" 'komputer': 'computer',\n",
" 'przycisk': 'button',\n",
" 'drukarka': 'printer'\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 212,
"id": "adolescent-semiconductor",
"metadata": {},
"outputs": [],
"source": [
"def glossary_lookup(sentence):\n",
" return ''"
" sentence_words = [word.lower() for word in sentence.split() if word.lower() in glossary]\n",
" return [(word, glossary[word]) for word in sentence_words]"
]
},
{
"cell_type": "code",
"execution_count": 213,
"id": "d1e991c6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('drukarka', 'printer'), ('przycisk', 'button')]"
]
},
"execution_count": 213,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')"
]
}
],
@ -467,7 +628,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.14"
},
"subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
"title": "Komputerowe wspomaganie tłumaczenia",

View File

@ -57,7 +57,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 17,
"id": "confident-prison",
"metadata": {},
"outputs": [],
@ -80,13 +80,27 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 18,
"id": "continental-submission",
"metadata": {},
"outputs": [],
"source": [
"def ice_lookup(sentence, prev_sentence, next_sentence):\n",
" return []"
" # Wyniki dopasowania ICE\n",
" ice_matches = []\n",
"\n",
" # Iterujemy przez pamięć tłumaczeń, pomijając pierwszy i ostatni element dla bezpieczeństwa kontekstowego\n",
" for index in range(1, len(translation_memory) - 1):\n",
" # Pobieramy obecne, poprzednie i następne zdania z TM\n",
" prev_tm_sentence, _ = translation_memory[index - 1]\n",
" current_tm_sentence, current_tm_translation = translation_memory[index]\n",
" next_tm_sentence, _ = translation_memory[index + 1]\n",
"\n",
" # Sprawdzamy, czy wszystkie trzy zdania zgadzają się z odpowiednikami w TM\n",
" if (prev_tm_sentence == prev_sentence and current_tm_sentence == current_sentence and next_tm_sentence == next_sentence):\n",
" ice_matches.append(current_tm_translation)\n",
"\n",
" return ice_matches"
]
},
{
@ -119,7 +133,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 19,
"id": "fourth-pillow",
"metadata": {},
"outputs": [],
@ -141,7 +155,11 @@
"id": "graduate-theorem",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Nie, ponieważ w tej funkcji interesuje nas tylko długość zdania, tzn. drugi warunek nie będzie spełniony\n",
"\n",
"Przykład: `kot != bok`, a mimo to funkcja zwróci dla tej pary 0, bo oba napisy mają tę samą długość\n",
"\n",
"Spełnione warunki: 1, 3, 4"
]
},
{
@ -154,7 +172,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 20,
"id": "continued-christopher",
"metadata": {},
"outputs": [],
@ -179,7 +197,40 @@
"id": "metallic-leave",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Tak, spełnia wszystkie warunki\n",
"\n",
"Sprawdzenie dla warunku 4"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "349a3547",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"True\n",
"True\n",
"True\n"
]
}
],
"source": [
"# x == y i y == z\n",
"print(sentence_distance(\"kot\", \"kot\") + sentence_distance(\"kot\", \"kot\") >= sentence_distance(\"kot\", \"kot\"))\n",
"\n",
"# x == y i y != z\n",
"print(sentence_distance(\"kot\", \"kot\") + sentence_distance(\"kot\", \"pies\") >= sentence_distance(\"kot\", \"pies\"))\n",
"\n",
"# x != y i y == z\n",
"print(sentence_distance(\"kot\", \"pies\") + sentence_distance(\"pies\", \"pies\") >= sentence_distance(\"kot\", \"pies\"))\n",
"\n",
"# x != y i y != z\n",
"print(sentence_distance(\"kot\", \"pies\") + sentence_distance(\"pies\", \"kot\") >= sentence_distance(\"kot\", \"kot\"))"
]
},
{
@ -206,7 +257,11 @@
"id": "bibliographic-stopping",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź:\n",
"- Dystans Levenshteina jest zawsze nieujemny\n",
"- Dystans wynosi 0 wtedy i tylko wtedy, gdy ciągi są identyczne: dla identycznych ciągów nie potrzeba żadnych operacji, a dla różnych potrzebna jest co najmniej jedna\n",
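"- Dystans Levenshteina jest symetryczny, ponieważ każdą operację można odwrócić (wstawienie odpowiada usunięciu, a zamiana znaku zamianie odwrotnej), więc liczba operacji wymaganych do przekształcenia ciągu A w ciąg B jest taka sama jak liczba operacji potrzebnych do przekształcenia ciągu B w ciąg A\n",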
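"- Dystans Levenshteina spełnia nierówność trójkąta. Można to uzasadnić, rozważając przekształcenie ciągu X w Y przez ciąg pośredni Z (najpierw X w Z, a następnie Z w Y): jest to jeden z możliwych sposobów przekształcenia X w Y, więc minimalna liczba operacji potrzebnych do bezpośredniego przekształcenia X w Y nie przekracza łącznej liczby operacji na drodze przez Z, czyli d(X, Y) <= d(X, Z) + d(Z, Y)"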
]
},
{
@ -223,7 +278,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 21,
"id": "secondary-wrist",
"metadata": {},
"outputs": [
@ -233,7 +288,7 @@
"2"
]
},
"execution_count": 5,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@ -254,7 +309,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 22,
"id": "associate-tuner",
"metadata": {},
"outputs": [],
@ -273,7 +328,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 23,
"id": "focal-pathology",
"metadata": {},
"outputs": [
@ -283,7 +338,7 @@
"0.9166666666666666"
]
},
"execution_count": 7,
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
@ -294,7 +349,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 24,
"id": "roman-ceiling",
"metadata": {},
"outputs": [
@ -304,7 +359,7 @@
"0.9428571428571428"
]
},
"execution_count": 8,
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@ -315,7 +370,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 25,
"id": "invisible-cambodia",
"metadata": {},
"outputs": [
@ -325,7 +380,7 @@
"0.631578947368421"
]
},
"execution_count": 9,
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
@ -344,13 +399,22 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 26,
"id": "genetic-cradle",
"metadata": {},
"outputs": [],
"source": [
"# Write a fuzzy_lookup function that will search the translation memory for all sentences whose Levenshtein similarity to the searched sentence is greater than or equal to a set threshold.\n",
"def fuzzy_lookup(sentence, threshold):\n",
" return []"
" fuzzy_matches = []\n",
"\n",
" # Iterujemy przez pamięć tłumaczeń\n",
" for tm_sentence, tm_translation in translation_memory:\n",
" # Sprawdzamy, czy podobieństwo Levenshteina jest większe lub równe progowi\n",
" if levenshtein_similarity(sentence, tm_sentence) >= threshold:\n",
" fuzzy_matches.append(tm_translation)\n",
"\n",
" return fuzzy_matches"
]
}
],
@ -373,7 +437,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.14"
},
"subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
"title": "Komputerowe wspomaganie tłumaczenia",

View File

@ -63,7 +63,7 @@
"id": "diverse-sunglasses",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Wynik z Google Translate to `metal cabinet guides`"
]
},
{
@ -86,12 +86,12 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 11,
"id": "loving-prince",
"metadata": {},
"outputs": [],
"source": [
"text = \" For all Java programmers:\"\n",
"text = \" For all Java programmers:\"\n",
"text += \" This section explains how to compile and run a Swing application from the command line.\"\n",
"text += \" For information on compiling and running a Swing application using NetBeans IDE,\"\n",
"text += \" see Running Tutorial Examples in NetBeans IDE. The compilation instructions work for all Swing programs\"\n",
@ -110,7 +110,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 12,
"id": "bound-auction",
"metadata": {},
"outputs": [],
@ -128,13 +128,46 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 13,
"id": "cognitive-cedar",
"metadata": {},
"outputs": [],
"source": [
"def terminology_lookup():\n",
" return []"
" for term in dictionary:\n",
" start = 0\n",
" while True:\n",
" start = text.find(term, start)\n",
" if start == -1:\n",
" break\n",
" end = start + len(term)\n",
" print(f'{term}: ({start}, {end})')\n",
" start = end"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "0a4a26ba",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"program: (14, 21)\n",
"program: (291, 298)\n",
"program: (468, 475)\n",
"program: (516, 523)\n",
"program: (533, 540)\n",
"application: (80, 91)\n",
"application: (164, 175)\n",
"application: (322, 333)\n"
]
}
],
"source": [
"terminology_lookup()"
]
},
{
@ -161,7 +194,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 15,
"id": "tribal-attention",
"metadata": {},
"outputs": [
@ -169,108 +202,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
" \n",
"for\n",
"all\n",
"Java\n",
"programmer\n",
":\n",
"this\n",
"section\n",
"explain\n",
"how\n",
"to\n",
"compile\n",
"and\n",
"run\n",
"a\n",
"swing\n",
"application\n",
"from\n",
"the\n",
"command\n",
"line\n",
".\n",
"for\n",
"information\n",
"on\n",
"compile\n",
"and\n",
"run\n",
"a\n",
"swing\n",
"application\n",
"use\n",
"NetBeans\n",
"IDE\n",
",\n",
"see\n",
"Running\n",
"Tutorial\n",
"Examples\n",
"in\n",
"NetBeans\n",
"IDE\n",
".\n",
"the\n",
"compilation\n",
"instruction\n",
"work\n",
"for\n",
"all\n",
"swing\n",
"program\n",
"—\n",
"applet\n",
",\n",
"as\n",
"well\n",
"as\n",
"application\n",
".\n",
"here\n",
"be\n",
"the\n",
"step\n",
"-PRON-\n",
"need\n",
"to\n",
"follow\n",
":\n",
"install\n",
"the\n",
"late\n",
"release\n",
"of\n",
"the\n",
"Java\n",
"SE\n",
"platform\n",
",\n",
"if\n",
"-PRON-\n",
"have\n",
"not\n",
"already\n",
"do\n",
"so\n",
".\n",
"create\n",
"a\n",
"program\n",
"that\n",
"use\n",
"Swing\n",
"component\n",
".\n",
"compile\n",
"the\n",
"program\n",
".\n",
"run\n",
"the\n",
"program\n",
".\n"
" for all Java programmer : this section explain how to compile and run a swing application from the command line . for information on compile and run a swing application use NetBeans IDE , see run Tutorial Examples in NetBeans IDE . the compilation instruction work for all Swing program — applet , as well as application . here be the step you need to follow : install the late release of the Java SE platform , if you have not already do so . create a program that use swing component . compile the program . run the program . "
]
}
],
@ -281,7 +213,7 @@
"doc = nlp(text)\n",
"\n",
"for token in doc:\n",
" print(token.lemma_)"
" print(token.lemma_, end=' ')"
]
},
{
@ -302,13 +234,40 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 40,
"id": "surgical-demonstration",
"metadata": {},
"outputs": [],
"source": [
"def terminology_lookup():\n",
" return []"
" for term in dictionary:\n",
" for token in doc:\n",
" if token.lemma_ == term:\n",
" print(f'{token}: ({token.idx}, {token.idx + len(token)})')"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "74f600ea",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"programs: (291, 299)\n",
"program: (468, 475)\n",
"program: (516, 523)\n",
"program: (533, 540)\n",
"application: (80, 91)\n",
"application: (164, 175)\n",
"applications: (322, 334)\n"
]
}
],
"source": [
"terminology_lookup()"
]
},
{
@ -337,13 +296,56 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 22,
"id": "superb-butterfly",
"metadata": {},
"outputs": [],
"source": [
"def get_nouns(text):\n",
" return []"
" doc = nlp(text)\n",
" return [token.text for token in doc if token.pos_ == 'NOUN']"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "2bfedfa3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['programmers',\n",
" 'section',\n",
" 'Swing',\n",
" 'application',\n",
" 'command',\n",
" 'line',\n",
" 'information',\n",
" 'Swing',\n",
" 'application',\n",
" 'compilation',\n",
" 'instructions',\n",
" 'programs',\n",
" 'applets',\n",
" 'applications',\n",
" 'steps',\n",
" 'release',\n",
" 'platform',\n",
" 'program',\n",
" 'Swing',\n",
" 'components',\n",
" 'program',\n",
" 'program']"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_nouns(text)"
]
},
{
@ -356,7 +358,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 19,
"id": "acting-tolerance",
"metadata": {},
"outputs": [],
@ -374,13 +376,54 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 26,
"id": "eight-redhead",
"metadata": {},
"outputs": [],
"source": [
"def extract_terms(text):\n",
" return []"
" doc = nlp(text)\n",
" terms = {}\n",
" for token in doc:\n",
" if token.pos_ == 'NOUN':\n",
" term = token.lemma_\n",
" terms[term] = terms.get(term, 0) + 1\n",
" return terms"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "07c1122a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'programmer': 1,\n",
" 'section': 1,\n",
" 'swing': 3,\n",
" 'application': 3,\n",
" 'command': 1,\n",
" 'line': 1,\n",
" 'information': 1,\n",
" 'compilation': 1,\n",
" 'instruction': 1,\n",
" 'program': 4,\n",
" 'applet': 1,\n",
" 'step': 1,\n",
" 'release': 1,\n",
" 'platform': 1,\n",
" 'component': 1}"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_terms(text)"
]
},
{
@ -393,14 +436,82 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 32,
"id": "monetary-mambo",
"metadata": {},
"outputs": [],
"source": [
"# Extract and count nouns, verbs and adjectives\n",
"def extract_terms(text):\n",
" return []"
" doc = nlp(text)\n",
" terms = {\"nouns\": {}, \"verbs\": {}, \"adjectives\": {}}\n",
" for token in doc:\n",
" if token.pos_ == 'NOUN':\n",
" term = token.lemma_\n",
" terms[\"nouns\"][term] = terms[\"nouns\"].get(term, 0) + 1\n",
" elif token.pos_ == 'VERB':\n",
" term = token.lemma_\n",
" terms[\"verbs\"][term] = terms[\"verbs\"].get(term, 0) + 1\n",
" elif token.pos_ == 'ADJ':\n",
" term = token.lemma_\n",
" terms[\"adjectives\"][term] = terms[\"adjectives\"].get(term, 0) + 1\n",
"\n",
" return terms"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "1eb48136",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'adjectives': {'late': 1},\n",
" 'nouns': {'applet': 1,\n",
" 'application': 3,\n",
" 'command': 1,\n",
" 'compilation': 1,\n",
" 'component': 1,\n",
" 'information': 1,\n",
" 'instruction': 1,\n",
" 'line': 1,\n",
" 'platform': 1,\n",
" 'program': 4,\n",
" 'programmer': 1,\n",
" 'release': 1,\n",
" 'section': 1,\n",
" 'step': 1,\n",
" 'swing': 3},\n",
" 'verbs': {'compile': 3,\n",
" 'create': 1,\n",
" 'do': 1,\n",
" 'explain': 1,\n",
" 'follow': 1,\n",
" 'install': 1,\n",
" 'need': 1,\n",
" 'run': 4,\n",
" 'see': 1,\n",
" 'use': 2,\n",
" 'work': 1}}\n"
]
}
],
"source": [
"from pprint import pprint\n",
"\n",
"pprint(extract_terms(text))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "62aeea83",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@ -422,7 +533,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.14"
},
"subtitle": "3. Terminologia",
"title": "Komputerowe wspomaganie tłumaczenia",

File diff suppressed because one or more lines are too long

View File

@ -55,13 +55,52 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 7,
"id": "documented-hacker",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"[(10, 13), (17, 21)]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import re\n",
"\n",
"def find_tags(text):\n",
" return []"
" tags = re.finditer(r'<[^>]+>', text)\n",
" return [tag.span() for tag in tags]\n",
"\n",
"# Test the function\n",
"text = 'This is a <b>bold</b> text'\n",
"find_tags(text)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1781331d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('<b>', '</b>')"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text[10:13], text[17:21]"
]
},
{
@ -74,13 +113,28 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 9,
"id": "unauthorized-study",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"(True, False, False)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def is_translatable(text):\n",
" return True"
" # Text is translatable if it contains only letters, spaces, and punctuation\n",
" return re.fullmatch(r'[a-zA-Z .,!?]+', text) is not None\n",
"\n",
"# Test the function\n",
"is_translatable('Hello, world!'), is_translatable('Hello, 123!'), is_translatable('你好,世界!')"
]
},
{
@ -93,13 +147,65 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 12,
"id": "beautiful-mathematics",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"[(12, 22), (28, 38), (42, 52), (56, 66), (70, 85)]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def find_dates(text):\n",
" return []"
" # Find all dates in 5 formats: yyyy-mm-dd, yyyy/mm/dd, dd-mm-yyyy, dd/mm/yyyy, dd month yyyy\n",
" # yyyy-mm-dd\n",
" dates = [date.span() for date in re.finditer(r'\\b\\d{4}-\\d{2}-\\d{2}\\b', text)]\n",
" # yyyy/mm/dd\n",
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{4}/\\d{2}/\\d{2}\\b', text)]\n",
" # dd-mm-yyyy\n",
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2}-\\d{2}-\\d{4}\\b', text)]\n",
" # dd/mm/yyyy\n",
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2}/\\d{2}/\\d{4}\\b', text)]\n",
" # dd month yyyy\n",
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2} [a-zA-Z]+ \\d{4}\\b', text)]\n",
" return dates\n",
"\n",
"# Test the function\n",
"text = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020 or 01 January 2020'\n",
"find_dates(text)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "215a4cbd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2020-01-01\n",
"2020/01/01\n",
"01-01-2020\n",
"01/01/2020\n",
"01 January 2020\n"
]
}
],
"source": [
"print(text[12:22])\n",
"print(text[28:38])\n",
"print(text[42:52])\n",
"print(text[56:66])\n",
"print(text[70:85])"
]
},
{
@ -125,13 +231,164 @@
},
{
"cell_type": "code",
"execution_count": 4,
"id": "finished-essex",
"execution_count": 37,
"id": "e37a24ad",
"metadata": {},
"outputs": [],
"source": [
"text = 'The date is 2020-01-02, not 2020/01/02 or 02-01-2020 or 02/01/2020 or 02 January 2020'"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "4da1f53f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The date is 01/02/2020, not 01/02/2020 or 02/01/2020 or 02/01/2020 or 01/02/2020'"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from dateutil.parser import parse\n",
"\n",
"def change_data_to_US_format(text):\n",
" dates = find_dates(text)\n",
"\n",
" for start, end in dates:\n",
" date = text[start:end]\n",
" try:\n",
" new_date = parse(date).strftime('%m/%d/%Y')\n",
" text = text[:start] + new_date + text[end:]\n",
" except:\n",
" pass\n",
" return text\n",
"\n",
"# Test the function\n",
"change_data_to_US_format(text)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "8a2bf3a3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The date is 02/01/2020, not 02/01/2020 or 01/02/2020 or 01/02/2020 or 02/01/2020'"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from dateutil.parser import parse\n",
"\n",
"def change_data_to_EU_format(text):\n",
" dates = find_dates(text)\n",
"\n",
" for start, end in dates:\n",
" date = text[start:end]\n",
" try:\n",
" new_date = parse(date).strftime('%d/%m/%Y')\n",
" text = text[:start] + new_date + text[end:]\n",
" except:\n",
" pass\n",
" return text\n",
"\n",
"# Test the function\n",
"change_data_to_EU_format(text)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "e1c63075",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The date is 2020.01.02, not 2020.01.02 or 2020.02.01 or 2020.02.01 or 2020.01.02'"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from dateutil.parser import parse\n",
"\n",
"def change_data_to_digit_dot_format(text):\n",
" dates = find_dates(text)\n",
"\n",
" for start, end in dates:\n",
" date = text[start:end]\n",
" try:\n",
" new_date = parse(date).strftime('%Y.%m.%d')\n",
" text = text[:start] + new_date + text[end:]\n",
" except:\n",
" pass\n",
" return text\n",
"\n",
"# Test the function\n",
"change_data_to_digit_dot_format(text)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "finished-essex",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Data jest 01/01/2020, a nie 01/01/2020 lub 01/01/2020 lub 01/01/2020'"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def correct_dates(source_segment, target_segment, date_format):\n",
" return ''"
" # Check if number of dates in source and target segments are the same\n",
" assert len(find_dates(source_segment)) == len(find_dates(target_segment))\n",
"\n",
" # Check if all dates are the same (ignore the format)\n",
" source_dates = find_dates(source_segment)\n",
" target_dates = find_dates(target_segment)\n",
" for source_date, target_date in zip(source_dates, target_dates):\n",
" assert change_data_to_US_format(source_segment[source_date[0]:source_date[1]]) == change_data_to_US_format(target_segment[target_date[0]:target_date[1]]), f\"Dates are different: {source_segment[source_date[0]:source_date[1]]} and {target_segment[target_date[0]:target_date[1]]}\"\n",
"\n",
" # Change the format of dates in the target segment\n",
" if date_format == 'US':\n",
" target_segment = change_data_to_US_format(target_segment)\n",
" elif date_format == 'EU':\n",
" target_segment = change_data_to_EU_format(target_segment)\n",
" elif date_format == 'digit.dot':\n",
" target_segment = change_data_to_digit_dot_format(target_segment)\n",
"\n",
" return target_segment\n",
"\n",
"# Test the function\n",
"source_segment = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020'\n",
"target_segment = \"Data jest 01/01/2020, a nie 2020-01-01 lub 01-01-2020 lub 01/01/2020\"\n",
"correct_dates(source_segment, target_segment, 'US')"
]
},
{
@ -176,13 +433,84 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 60,
"id": "romance-judge",
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"\n",
"def transfer_tags(source_segment, target_segment):\n",
" return ''"
" # Split the segments into tokens\n",
" source_tokens = source_segment.split()\n",
" target_tokens = target_segment.split()\n",
"\n",
" # Calculate the ratio of the number of tokens in the target to the number of tokens in the source\n",
" ratio = len(target_tokens) / len(source_tokens)\n",
"\n",
" # Assign tags to tokens in the target tokens - if the source token has a tag, assign it to the corresponding token in the target tokens\n",
" for i, source_token in enumerate(source_tokens):\n",
" if re.match(r'<[^>]+>', source_token):\n",
" target_index = math.ceil(i * ratio)\n",
"\n",
" if target_index >= len(target_tokens):\n",
" target_index = len(target_tokens) - 1\n",
"\n",
" # Assign start tag\n",
" target_tokens[target_index] = re.findall(r'<[^>]+>', source_token)[0] + target_tokens[target_index]\n",
"\n",
" # Assign end tag\n",
" target_tokens[target_index] = target_tokens[target_index] + re.findall(r'</[^>]+>', source_token)[0]\n",
"\n",
" return ' '.join(target_tokens)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "fd8858d8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'To jest <b>ważny</b> tekst'"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Test the function (same number of tokens)\n",
"source_segment = 'This is <b>bold</b> text'\n",
"target_segment = 'To jest ważny tekst'\n",
"transfer_tags(source_segment, target_segment)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "de9e6298",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'To jest bardzo <b>ważny</b> tekst'"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Test the function (different number of tokens)\n",
"source_segment = 'This is <b>bold</b> text'\n",
"target_segment = 'To jest bardzo ważny tekst'\n",
"transfer_tags(source_segment, target_segment)"
]
}
],
@ -205,7 +533,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.14"
},
"subtitle": "6,7. Preprocessing i postprocessing",
"title": "Komputerowe wspomaganie tłumaczenia",

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long