Compare commits
5 Commits
Author | SHA1 | Date |
---|---|---|
Patryk Bartkowiak | fd590b3a22 | |
Patryk Bartkowiak | 9b75563e6a | |
Patryk Bartkowiak | 7a6ac33f6e | |
Patryk Bartkowiak | 5de69211e1 | |
Patryk Bartkowiak | 870b673fac |
|
@ -0,0 +1,2 @@
|
|||
corpus/
|
||||
NIPS Papers/
|
|
@ -0,0 +1,10 @@
|
|||
(0, '0.006*"learning" + 0.005*"model" + 0.005*"data" + 0.004*"function" + 0.004*"set" + 0.004*"using" + 0.004*"number" + 0.004*"neural" + 0.004*"one" + 0.003*"error"')
|
||||
(1, '0.008*"learning" + 0.006*"data" + 0.005*"model" + 0.005*"set" + 0.004*"algorithm" + 0.004*"time" + 0.004*"one" + 0.004*"two" + 0.003*"used" + 0.003*"figure"')
|
||||
(2, '0.007*"data" + 0.005*"model" + 0.005*"set" + 0.005*"learning" + 0.004*"one" + 0.004*"algorithm" + 0.004*"time" + 0.003*"using" + 0.003*"figure" + 0.003*"training"')
|
||||
(3, '0.006*"data" + 0.005*"model" + 0.004*"learning" + 0.004*"two" + 0.004*"algorithm" + 0.004*"using" + 0.004*"function" + 0.004*"set" + 0.003*"number" + 0.003*"given"')
|
||||
(4, '0.006*"learning" + 0.005*"data" + 0.005*"model" + 0.005*"set" + 0.004*"algorithm" + 0.004*"time" + 0.004*"using" + 0.004*"two" + 0.004*"function" + 0.003*"one"')
|
||||
(5, '0.008*"learning" + 0.006*"data" + 0.005*"algorithm" + 0.004*"model" + 0.004*"two" + 0.004*"function" + 0.004*"number" + 0.003*"figure" + 0.003*"time" + 0.003*"set"')
|
||||
(6, '0.007*"learning" + 0.006*"model" + 0.005*"data" + 0.005*"algorithm" + 0.004*"function" + 0.004*"set" + 0.003*"time" + 0.003*"one" + 0.003*"based" + 0.003*"number"')
|
||||
(7, '0.007*"learning" + 0.005*"set" + 0.005*"data" + 0.005*"model" + 0.004*"algorithm" + 0.004*"function" + 0.004*"using" + 0.004*"number" + 0.004*"log" + 0.004*"figure"')
|
||||
(8, '0.005*"learning" + 0.005*"set" + 0.005*"algorithm" + 0.004*"model" + 0.004*"function" + 0.004*"data" + 0.004*"one" + 0.004*"time" + 0.003*"using" + 0.003*"given"')
|
||||
(9, '0.007*"data" + 0.006*"model" + 0.005*"learning" + 0.005*"algorithm" + 0.004*"two" + 0.003*"number" + 0.003*"time" + 0.003*"set" + 0.003*"function" + 0.003*"used"')
|
|
@ -0,0 +1,100 @@
|
|||
project victims support visit mediation
|
||||
exhibition cooperation year meeting films
|
||||
exhibition cooperation year meeting films
|
||||
solution occupation settlement problem resolutions
|
||||
residence citizens permit security citizen
|
||||
residence citizens permit security citizen
|
||||
support measures countries farmers member
|
||||
data services infrastructure development project
|
||||
data services infrastructure development project
|
||||
photographs service scans materials films
|
||||
photographs service scans materials films
|
||||
insurance ZUS contributions benefits administration
|
||||
project archaeology research conservation history
|
||||
project archaeology research conservation history
|
||||
cases % coronavirus countries disease
|
||||
% year case cases coronavirus
|
||||
ship tug speed accident course
|
||||
ship tug speed accident course
|
||||
work scientists research science telomerase
|
||||
work scientists research science telomerase
|
||||
film media part time efforts
|
||||
film media part time efforts
|
||||
insurance ZUS contributions benefits administration
|
||||
use care stewardship resistance antibiotics
|
||||
services administration state information e
|
||||
services administration state information e
|
||||
coronavirus research measures outbreak member
|
||||
residence card foreigner work permit
|
||||
security e threats policy gas
|
||||
security e threats policy gas
|
||||
paper 15th reader file date
|
||||
paper 15th reader file date
|
||||
costs implementation management tasks expenditures
|
||||
food cooperation products market agri
|
||||
costs implementation management tasks expenditures
|
||||
costs implementation management tasks expenditures
|
||||
artist work painting paintings time
|
||||
artist work painting paintings time
|
||||
Home » rights representatives discrimination
|
||||
Home » rights representatives discrimination
|
||||
command documentation alias files directory
|
||||
water basis land status item
|
||||
water basis land status item
|
||||
% contract contracts . No
|
||||
food cooperation products market agri
|
||||
% contract contracts . No
|
||||
market level services age companies
|
||||
market level services age companies
|
||||
projects innovation R&D development companies
|
||||
projects innovation R&D development companies
|
||||
contracts contract % item procedures
|
||||
contracts contract % item procedures
|
||||
room A office information B
|
||||
room A office information B
|
||||
advantage production country countries goods
|
||||
measles vaccine disease person people
|
||||
advantage production country countries goods
|
||||
card residence permission business stamp
|
||||
card residence permission business stamp
|
||||
w % gospodarczego polityki publicznych
|
||||
system banks stability risk sector
|
||||
camps people concentration policy resistance
|
||||
camps people concentration policy resistance
|
||||
safety aviation management requirements entity
|
||||
safety aviation management requirements entity
|
||||
research call philosophy information project
|
||||
vaccination pertussis cancer risk disease
|
||||
research call philosophy information project
|
||||
energy gas % oil countries
|
||||
energy gas % oil countries
|
||||
cooperation meeting talks forces defence
|
||||
project education information coronavirus funding
|
||||
food education project measures assistance
|
||||
infection disease symptoms fever humans
|
||||
energy audit costs use management
|
||||
countries % development benefits funds
|
||||
years minister year rector persons
|
||||
water food fish times year
|
||||
land water population data age
|
||||
land water population data age
|
||||
market labour crisis unemployment countries
|
||||
market labour crisis unemployment countries
|
||||
accelerator research - operation model
|
||||
accelerator research - operation model
|
||||
energy policy power development objectives
|
||||
priest hand country wedding church
|
||||
eggs breakfast food products meat
|
||||
eggs breakfast food products meat
|
||||
water fish times food year
|
||||
honey production bread time taste
|
||||
honey production bread time taste
|
||||
data job portal vacancies Decision
|
||||
data job portal vacancies Decision
|
||||
food quality products apples farmers
|
||||
food quality products apples farmers
|
||||
visa activities child B-1 institution
|
||||
visa activities child B-1 institution
|
||||
- co preparations operation preparation
|
||||
- co preparations operation preparation
|
||||
project victims support visit mediation
|
|
@ -0,0 +1,100 @@
|
|||
approval total lawyers priorities judges
|
||||
agriculture support guests offers author
|
||||
agriculture support guests offers author
|
||||
homeland invasion address prisoners sources
|
||||
identity positions elaboration issues terms
|
||||
identity positions elaboration issues terms
|
||||
distancing lenders mechanism check part
|
||||
IT Realization Services resolutions bases
|
||||
IT Realization Services resolutions bases
|
||||
occupation scans browser Service processes
|
||||
occupation scans browser Service processes
|
||||
am war month Insurance centralisation
|
||||
conservation zu provisions basin record
|
||||
conservation zu provisions basin record
|
||||
culture city abscesses aeronautics disruptors
|
||||
infection Recommendations man evening occurrence
|
||||
course hull STATE classifier certificate
|
||||
course hull STATE classifier certificate
|
||||
cooling work culture part laboratory
|
||||
cooling work culture part laboratory
|
||||
culture reverse advisor documentary service
|
||||
culture reverse advisor documentary service
|
||||
am war month Insurance centralisation
|
||||
pressure ability entry prescribers costs
|
||||
economies management role disk stakeholders
|
||||
economies management role disk stakeholders
|
||||
traders fears carriers illness distancing
|
||||
activity employment foreigners Visa graduate
|
||||
defense forecast quarter factors opportunity
|
||||
defense forecast quarter factors opportunity
|
||||
case author screen announcement typefaces
|
||||
case author screen announcement typefaces
|
||||
revenue office premises o proposals
|
||||
storage completion efforts Meeting crisis
|
||||
office Types premises protection days
|
||||
revenue office premises o proposals
|
||||
pictures splashing dobrze viewer culture
|
||||
pictures splashing dobrze viewer culture
|
||||
creation origin discrimination interest institutions
|
||||
creation origin discrimination interest institutions
|
||||
names contexts calculator program descriptor
|
||||
periods standards total name property
|
||||
periods standards total name property
|
||||
Art days liability authorities services
|
||||
storage completion efforts Meeting crisis
|
||||
Art days liability authorities services
|
||||
skills provision country economies science
|
||||
skills provision country economies science
|
||||
Project possibilities cancer members therapies
|
||||
Project possibilities cancer members therapies
|
||||
price auction actions telecommunications appointment
|
||||
price auction actions telecommunications appointment
|
||||
records coffee authorisation line times
|
||||
records coffee authorisation line times
|
||||
example manner source essence identification
|
||||
defences vaccines days spread body
|
||||
example manner source essence identification
|
||||
servants employees Possession insurance examinations
|
||||
servants employees Possession insurance examinations
|
||||
systemowe dopiero system latach popytem
|
||||
efficiency problems uncertainty improvement Risk
|
||||
uprising borders rights security campaign
|
||||
uprising borders rights security campaign
|
||||
part audits Responsibilities services authority
|
||||
protection competence version occurrence requisition
|
||||
Requirements members methodology data database
|
||||
whoop substitute cause exposure course
|
||||
Requirements members methodology data database
|
||||
erent decisions SOURCES spectrum economies
|
||||
erent decisions SOURCES spectrum economies
|
||||
invitation effects help armament round
|
||||
area teaching tax time travel
|
||||
time Recommendation participants guarantees work
|
||||
toxin mechanisms attacks Babies therapies
|
||||
production replacement control SMEs audit
|
||||
significance net ground participants levels
|
||||
functioning consultation interest expert procedures
|
||||
thing mercury eggs municipality lunch
|
||||
agriculture R result development prices
|
||||
agriculture R result development prices
|
||||
reflection basis sources points results
|
||||
reflection basis sources points results
|
||||
leaders reach author features publications
|
||||
leaders reach author features publications
|
||||
consumption Improvement bodies level need
|
||||
money delirium advice house couple
|
||||
work thanks BEgINNINg range funds
|
||||
work thanks BEgINNINg range funds
|
||||
option eggs dinner wine quantities
|
||||
seeds mead event maples approach
|
||||
seeds mead event maples approach
|
||||
case complaints consultation Employers actions
|
||||
case complaints consultation Employers actions
|
||||
activity fruit indications zation rice
|
||||
activity fruit indications zation rice
|
||||
building work premises Food child
|
||||
building work premises Food child
|
||||
virtue works culture sectors others
|
||||
virtue works culture sectors others
|
||||
approval total lawyers priorities judges
|
223
lab/lab_01.ipynb
223
lab/lab_01.ipynb
|
@ -52,7 +52,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 191,
|
||||
"id": "narrow-romantic",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -71,7 +71,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 192,
|
||||
"id": "indonesian-electron",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -82,7 +82,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 193,
|
||||
"id": "compact-trinidad",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -92,7 +92,7 @@
|
|||
"['Press the ENTER button']"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"execution_count": 193,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -119,7 +119,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 194,
|
||||
"id": "exposed-daniel",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -139,7 +139,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 195,
|
||||
"id": "serial-velvet",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -149,7 +149,7 @@
|
|||
"['Press the ENTER button', 'Press the ENTER key']"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 195,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -176,7 +176,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 196,
|
||||
"id": "every-gibson",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -186,7 +186,7 @@
|
|||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"execution_count": 196,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -213,13 +213,37 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 197,
|
||||
"id": "protected-rings",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def preprocess(sentence):\n",
|
||||
" return sentence.lower()\n",
|
||||
"\n",
|
||||
"def tm_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" return [entry[1] for entry in translation_memory if preprocess(entry[0]) == preprocess(sentence)]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 198,
|
||||
"id": "7baee10b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Press the ENTER button', 'Press the ENTER key']"
|
||||
]
|
||||
},
|
||||
"execution_count": 198,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tm_lookup('Wciśnij przycisk ENTER')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -232,17 +256,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 199,
|
||||
"id": "severe-alloy",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"''"
|
||||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"execution_count": 199,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -261,13 +285,40 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 200,
|
||||
"id": "structural-diesel",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import string\n",
|
||||
"\n",
|
||||
"def preprocess(s):\n",
|
||||
" translator = str.maketrans('', '', string.punctuation)\n",
|
||||
" return s.translate(translator).lower()\n",
|
||||
"\n",
|
||||
"def tm_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" return [entry[1] for entry in translation_memory if preprocess(entry[0]) == preprocess(sentence)]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 201,
|
||||
"id": "c03c6709",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Press the ENTER button', 'Press the ENTER key']"
|
||||
]
|
||||
},
|
||||
"execution_count": 201,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tm_lookup('Wciśnij przycisk [ENTER]')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -280,17 +331,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 202,
|
||||
"id": "brief-senegal",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"''"
|
||||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"execution_count": 202,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -317,13 +368,43 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 203,
|
||||
"id": "mathematical-customs",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def compare_sentences(l1, l2):\n",
|
||||
" return sum([1 for i, j in zip(l1.split(), l2.split()) if i != j]) <= 1\n",
|
||||
"\n",
|
||||
"import string\n",
|
||||
"\n",
|
||||
"def preprocess(s):\n",
|
||||
" translator = str.maketrans('', '', string.punctuation)\n",
|
||||
" return s.translate(translator).lower()\n",
|
||||
"\n",
|
||||
"def tm_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" return [entry[1] for entry in translation_memory if compare_sentences(preprocess(entry[0]), preprocess(sentence))]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 204,
|
||||
"id": "6264b722",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['System restart required']"
|
||||
]
|
||||
},
|
||||
"execution_count": 204,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tm_lookup('Wymagane ponowne uruchomienie maszyny')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -344,7 +425,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 205,
|
||||
"id": "humanitarian-wrong",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -362,7 +443,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 206,
|
||||
"id": "located-perception",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -374,7 +455,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 207,
|
||||
"id": "advised-casting",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -384,7 +465,7 @@
|
|||
"[('przycisk', 'button'), ('drukarka', 'printer')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"execution_count": 207,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -406,7 +487,7 @@
|
|||
"id": "defensive-fifteen",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź: Jeżeli implementacja wygląda tak jak powyżej, złożoność to `O(n*m)`, ponieważ dla każdego słowa iteracyjnie przechodzimy przez cały nasz słownik i szukamy odpowiednika"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -419,13 +500,56 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 208,
|
||||
"id": "aca5d340",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('przycisk', 'button')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 208,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"glossary_lookup('Każda Drukarka posiada przycisk wznowienia drukowania')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 209,
|
||||
"id": "original-tunisia",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def glossary_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" sentence_words = [word.lower() for word in sentence.split()]\n",
|
||||
" return [entry for entry in glossary if entry[0] in sentence_words]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 210,
|
||||
"id": "716bbbe9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('przycisk', 'button'), ('drukarka', 'printer')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 210,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -438,13 +562,50 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 211,
|
||||
"id": "32dec661",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]\n",
|
||||
"glossary = {\n",
|
||||
" 'komputer': 'computer',\n",
|
||||
" 'przycisk': 'button',\n",
|
||||
" 'drukarka': 'printer'\n",
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 212,
|
||||
"id": "adolescent-semiconductor",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def glossary_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" sentence_words = [word.lower() for word in sentence.split() if word.lower() in glossary]\n",
|
||||
" return [(word, glossary[word]) for word in sentence_words]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 213,
|
||||
"id": "d1e991c6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('drukarka', 'printer'), ('przycisk', 'button')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 213,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -467,7 +628,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.10.14"
|
||||
},
|
||||
"subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
|
104
lab/lab_02.ipynb
104
lab/lab_02.ipynb
|
@ -57,7 +57,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 17,
|
||||
"id": "confident-prison",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -80,13 +80,27 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 18,
|
||||
"id": "continental-submission",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def ice_lookup(sentence, prev_sentence, next_sentence):\n",
|
||||
" return []"
|
||||
" # Wyniki dopasowania ICE\n",
|
||||
" ice_matches = []\n",
|
||||
"\n",
|
||||
" # Iterujemy przez pamięć tłumaczeń, pomijając pierwszy i ostatni element dla bezpieczeństwa kontekstowego\n",
|
||||
" for index in range(1, len(translation_memory) - 1):\n",
|
||||
" # Pobieramy obecne, poprzednie i następne zdania z TM\n",
|
||||
" prev_tm_sentence, _ = translation_memory[index - 1]\n",
|
||||
" current_tm_sentence, current_tm_translation = translation_memory[index]\n",
|
||||
" next_tm_sentence, _ = translation_memory[index + 1]\n",
|
||||
"\n",
|
||||
" # Sprawdzamy, czy wszystkie trzy zdania zgadzają się z odpowiednikami w TM\n",
|
||||
" if (prev_tm_sentence == prev_sentence and current_tm_sentence == current_sentence and next_tm_sentence == next_sentence):\n",
|
||||
" ice_matches.append(current_tm_translation)\n",
|
||||
"\n",
|
||||
" return ice_matches"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -119,7 +133,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 19,
|
||||
"id": "fourth-pillow",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -141,7 +155,11 @@
|
|||
"id": "graduate-theorem",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź: Nie, ponieważ w tej funkcji interesuje nas tylko długość zdania, tzn. drugi warunek nie będzie spełniony\n",
|
||||
"\n",
|
||||
"Przykład: `kot != bok`, a dla tej funkcji zwróci 0\n",
|
||||
"\n",
|
||||
"Spełnione warunki: 1, 3, 4"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -154,7 +172,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 20,
|
||||
"id": "continued-christopher",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -179,7 +197,40 @@
|
|||
"id": "metallic-leave",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź: Tak, spełnia wszystkie warunki\n",
|
||||
"\n",
|
||||
"Sprawdzenie dla warunku 4"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "349a3547",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"True\n",
|
||||
"True\n",
|
||||
"True\n",
|
||||
"True\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# x == y i y == z\n",
|
||||
"print(sentence_distance(\"kot\", \"kot\") + sentence_distance(\"kot\", \"kot\") >= sentence_distance(\"kot\", \"kot\"))\n",
|
||||
"\n",
|
||||
"# x == y i y != z\n",
|
||||
"print(sentence_distance(\"kot\", \"kot\") + sentence_distance(\"kot\", \"pies\") >= sentence_distance(\"kot\", \"pies\"))\n",
|
||||
"\n",
|
||||
"# x != y i y == z\n",
|
||||
"print(sentence_distance(\"kot\", \"pies\") + sentence_distance(\"pies\", \"pies\") >= sentence_distance(\"kot\", \"pies\"))\n",
|
||||
"\n",
|
||||
"# x != y i y != z\n",
|
||||
"print(sentence_distance(\"kot\", \"pies\") + sentence_distance(\"pies\", \"kot\") >= sentence_distance(\"kot\", \"kot\"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -206,7 +257,11 @@
|
|||
"id": "bibliographic-stopping",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź:\n",
|
||||
"- Dystans Levenshteina jest zawsze nieujemny\n",
|
||||
"- Jeśli dwa ciągi są identyczne, nie potrzeba żadnych operacji do przekształcenia jednego w drugi\n",
|
||||
"- Dystans Levenshteina jest symetryczny, ponieważ liczba operacji wymaganych do przekształcenia ciągu A w ciąg B jest taka sama jak liczba operacji potrzebnych do przekształcenia ciągu B w ciąg A\n",
|
||||
"- Dystans Levenshteina spełnia nierówność trójkąta. Można to uzasadnić rozważając, że przekształcenie ciągu X w Y przez ciąg pośredni Z (najpierw przekształcając X w Z, a następnie Z w Y) nie będzie wymagać więcej operacji niż bezpośrednie przekształcenie X w Y"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -223,7 +278,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 21,
|
||||
"id": "secondary-wrist",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -233,7 +288,7 @@
|
|||
"2"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -254,7 +309,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 22,
|
||||
"id": "associate-tuner",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -273,7 +328,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 23,
|
||||
"id": "focal-pathology",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -283,7 +338,7 @@
|
|||
"0.9166666666666666"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -294,7 +349,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 24,
|
||||
"id": "roman-ceiling",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -304,7 +359,7 @@
|
|||
"0.9428571428571428"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -315,7 +370,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 25,
|
||||
"id": "invisible-cambodia",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -325,7 +380,7 @@
|
|||
"0.631578947368421"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -344,13 +399,22 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 26,
|
||||
"id": "genetic-cradle",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Write a fuzzy_lookup function that will search the translation memory for all sentences whose Levenshtein similarity to the searched sentence is greater than or equal to a set threshold.\n",
|
||||
"def fuzzy_lookup(sentence, threshold):\n",
|
||||
" return []"
|
||||
" fuzzy_matches = []\n",
|
||||
"\n",
|
||||
" # Iterujemy przez pamięć tłumaczeń\n",
|
||||
" for tm_sentence, tm_translation in translation_memory:\n",
|
||||
" # Sprawdzamy, czy podobieństwo Levenshteina jest większe niż próg\n",
|
||||
" if levenshtein_similarity(sentence, tm_sentence) >= threshold:\n",
|
||||
" fuzzy_matches.append(tm_translation)\n",
|
||||
"\n",
|
||||
" return fuzzy_matches"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -373,7 +437,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.10.14"
|
||||
},
|
||||
"subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
|
351
lab/lab_03.ipynb
351
lab/lab_03.ipynb
|
@ -63,7 +63,7 @@
|
|||
"id": "diverse-sunglasses",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź: Wynik z Google Translate to `metal cabinet guides`"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -86,12 +86,12 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 11,
|
||||
"id": "loving-prince",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = \" For all Java programmers:\"\n",
|
||||
"text = \" For all Java programmers:\"\n",
|
||||
"text += \" This section explains how to compile and run a Swing application from the command line.\"\n",
|
||||
"text += \" For information on compiling and running a Swing application using NetBeans IDE,\"\n",
|
||||
"text += \" see Running Tutorial Examples in NetBeans IDE. The compilation instructions work for all Swing programs\"\n",
|
||||
|
@ -110,7 +110,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 12,
|
||||
"id": "bound-auction",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -128,13 +128,46 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 13,
|
||||
"id": "cognitive-cedar",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def terminology_lookup():\n",
|
||||
" return []"
|
||||
" for term in dictionary:\n",
|
||||
" start = 0\n",
|
||||
" while True:\n",
|
||||
" start = text.find(term, start)\n",
|
||||
" if start == -1:\n",
|
||||
" break\n",
|
||||
" end = start + len(term)\n",
|
||||
" print(f'{term}: ({start}, {end})')\n",
|
||||
" start = end"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "0a4a26ba",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"program: (14, 21)\n",
|
||||
"program: (291, 298)\n",
|
||||
"program: (468, 475)\n",
|
||||
"program: (516, 523)\n",
|
||||
"program: (533, 540)\n",
|
||||
"application: (80, 91)\n",
|
||||
"application: (164, 175)\n",
|
||||
"application: (322, 333)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"terminology_lookup()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -161,7 +194,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 15,
|
||||
"id": "tribal-attention",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -169,108 +202,7 @@
|
|||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" \n",
|
||||
"for\n",
|
||||
"all\n",
|
||||
"Java\n",
|
||||
"programmer\n",
|
||||
":\n",
|
||||
"this\n",
|
||||
"section\n",
|
||||
"explain\n",
|
||||
"how\n",
|
||||
"to\n",
|
||||
"compile\n",
|
||||
"and\n",
|
||||
"run\n",
|
||||
"a\n",
|
||||
"swing\n",
|
||||
"application\n",
|
||||
"from\n",
|
||||
"the\n",
|
||||
"command\n",
|
||||
"line\n",
|
||||
".\n",
|
||||
"for\n",
|
||||
"information\n",
|
||||
"on\n",
|
||||
"compile\n",
|
||||
"and\n",
|
||||
"run\n",
|
||||
"a\n",
|
||||
"swing\n",
|
||||
"application\n",
|
||||
"use\n",
|
||||
"NetBeans\n",
|
||||
"IDE\n",
|
||||
",\n",
|
||||
"see\n",
|
||||
"Running\n",
|
||||
"Tutorial\n",
|
||||
"Examples\n",
|
||||
"in\n",
|
||||
"NetBeans\n",
|
||||
"IDE\n",
|
||||
".\n",
|
||||
"the\n",
|
||||
"compilation\n",
|
||||
"instruction\n",
|
||||
"work\n",
|
||||
"for\n",
|
||||
"all\n",
|
||||
"swing\n",
|
||||
"program\n",
|
||||
"—\n",
|
||||
"applet\n",
|
||||
",\n",
|
||||
"as\n",
|
||||
"well\n",
|
||||
"as\n",
|
||||
"application\n",
|
||||
".\n",
|
||||
"here\n",
|
||||
"be\n",
|
||||
"the\n",
|
||||
"step\n",
|
||||
"-PRON-\n",
|
||||
"need\n",
|
||||
"to\n",
|
||||
"follow\n",
|
||||
":\n",
|
||||
"install\n",
|
||||
"the\n",
|
||||
"late\n",
|
||||
"release\n",
|
||||
"of\n",
|
||||
"the\n",
|
||||
"Java\n",
|
||||
"SE\n",
|
||||
"platform\n",
|
||||
",\n",
|
||||
"if\n",
|
||||
"-PRON-\n",
|
||||
"have\n",
|
||||
"not\n",
|
||||
"already\n",
|
||||
"do\n",
|
||||
"so\n",
|
||||
".\n",
|
||||
"create\n",
|
||||
"a\n",
|
||||
"program\n",
|
||||
"that\n",
|
||||
"use\n",
|
||||
"Swing\n",
|
||||
"component\n",
|
||||
".\n",
|
||||
"compile\n",
|
||||
"the\n",
|
||||
"program\n",
|
||||
".\n",
|
||||
"run\n",
|
||||
"the\n",
|
||||
"program\n",
|
||||
".\n"
|
||||
" for all Java programmer : this section explain how to compile and run a swing application from the command line . for information on compile and run a swing application use NetBeans IDE , see run Tutorial Examples in NetBeans IDE . the compilation instruction work for all Swing program — applet , as well as application . here be the step you need to follow : install the late release of the Java SE platform , if you have not already do so . create a program that use swing component . compile the program . run the program . "
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -281,7 +213,7 @@
|
|||
"doc = nlp(text)\n",
|
||||
"\n",
|
||||
"for token in doc:\n",
|
||||
" print(token.lemma_)"
|
||||
" print(token.lemma_, end=' ')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -302,13 +234,40 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 40,
|
||||
"id": "surgical-demonstration",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def terminology_lookup():\n",
|
||||
" return []"
|
||||
" for term in dictionary:\n",
|
||||
" for token in doc:\n",
|
||||
" if token.lemma_ == term:\n",
|
||||
" print(f'{token}: ({token.idx}, {token.idx + len(token)})')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"id": "74f600ea",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"programs: (291, 299)\n",
|
||||
"program: (468, 475)\n",
|
||||
"program: (516, 523)\n",
|
||||
"program: (533, 540)\n",
|
||||
"application: (80, 91)\n",
|
||||
"application: (164, 175)\n",
|
||||
"applications: (322, 334)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"terminology_lookup()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -337,13 +296,56 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 22,
|
||||
"id": "superb-butterfly",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_nouns(text):\n",
|
||||
" return []"
|
||||
" doc = nlp(text)\n",
|
||||
" return [token.text for token in doc if token.pos_ == 'NOUN']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "2bfedfa3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['programmers',\n",
|
||||
" 'section',\n",
|
||||
" 'Swing',\n",
|
||||
" 'application',\n",
|
||||
" 'command',\n",
|
||||
" 'line',\n",
|
||||
" 'information',\n",
|
||||
" 'Swing',\n",
|
||||
" 'application',\n",
|
||||
" 'compilation',\n",
|
||||
" 'instructions',\n",
|
||||
" 'programs',\n",
|
||||
" 'applets',\n",
|
||||
" 'applications',\n",
|
||||
" 'steps',\n",
|
||||
" 'release',\n",
|
||||
" 'platform',\n",
|
||||
" 'program',\n",
|
||||
" 'Swing',\n",
|
||||
" 'components',\n",
|
||||
" 'program',\n",
|
||||
" 'program']"
|
||||
]
|
||||
},
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"get_nouns(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -356,7 +358,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 19,
|
||||
"id": "acting-tolerance",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -374,13 +376,54 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 26,
|
||||
"id": "eight-redhead",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def extract_terms(text):\n",
|
||||
" return []"
|
||||
" doc = nlp(text)\n",
|
||||
" terms = {}\n",
|
||||
" for token in doc:\n",
|
||||
" if token.pos_ == 'NOUN':\n",
|
||||
" term = token.lemma_\n",
|
||||
" terms[term] = terms.get(term, 0) + 1\n",
|
||||
" return terms"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"id": "07c1122a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'programmer': 1,\n",
|
||||
" 'section': 1,\n",
|
||||
" 'swing': 3,\n",
|
||||
" 'application': 3,\n",
|
||||
" 'command': 1,\n",
|
||||
" 'line': 1,\n",
|
||||
" 'information': 1,\n",
|
||||
" 'compilation': 1,\n",
|
||||
" 'instruction': 1,\n",
|
||||
" 'program': 4,\n",
|
||||
" 'applet': 1,\n",
|
||||
" 'step': 1,\n",
|
||||
" 'release': 1,\n",
|
||||
" 'platform': 1,\n",
|
||||
" 'component': 1}"
|
||||
]
|
||||
},
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"extract_terms(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -393,14 +436,82 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 32,
|
||||
"id": "monetary-mambo",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Extract and count nouns, verbs and adjectives\n",
|
||||
"def extract_terms(text):\n",
|
||||
" return []"
|
||||
" doc = nlp(text)\n",
|
||||
" terms = {\"nouns\": {}, \"verbs\": {}, \"adjectives\": {}}\n",
|
||||
" for token in doc:\n",
|
||||
" if token.pos_ == 'NOUN':\n",
|
||||
" term = token.lemma_\n",
|
||||
" terms[\"nouns\"][term] = terms[\"nouns\"].get(term, 0) + 1\n",
|
||||
" elif token.pos_ == 'VERB':\n",
|
||||
" term = token.lemma_\n",
|
||||
" terms[\"verbs\"][term] = terms[\"verbs\"].get(term, 0) + 1\n",
|
||||
" elif token.pos_ == 'ADJ':\n",
|
||||
" term = token.lemma_\n",
|
||||
" terms[\"adjectives\"][term] = terms[\"adjectives\"].get(term, 0) + 1\n",
|
||||
"\n",
|
||||
" return terms"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"id": "1eb48136",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'adjectives': {'late': 1},\n",
|
||||
" 'nouns': {'applet': 1,\n",
|
||||
" 'application': 3,\n",
|
||||
" 'command': 1,\n",
|
||||
" 'compilation': 1,\n",
|
||||
" 'component': 1,\n",
|
||||
" 'information': 1,\n",
|
||||
" 'instruction': 1,\n",
|
||||
" 'line': 1,\n",
|
||||
" 'platform': 1,\n",
|
||||
" 'program': 4,\n",
|
||||
" 'programmer': 1,\n",
|
||||
" 'release': 1,\n",
|
||||
" 'section': 1,\n",
|
||||
" 'step': 1,\n",
|
||||
" 'swing': 3},\n",
|
||||
" 'verbs': {'compile': 3,\n",
|
||||
" 'create': 1,\n",
|
||||
" 'do': 1,\n",
|
||||
" 'explain': 1,\n",
|
||||
" 'follow': 1,\n",
|
||||
" 'install': 1,\n",
|
||||
" 'need': 1,\n",
|
||||
" 'run': 4,\n",
|
||||
" 'see': 1,\n",
|
||||
" 'use': 2,\n",
|
||||
" 'work': 1}}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from pprint import pprint\n",
|
||||
"\n",
|
||||
"pprint(extract_terms(text))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "62aeea83",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@ -422,7 +533,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.10.14"
|
||||
},
|
||||
"subtitle": "3. Terminologia",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -55,13 +55,52 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 7,
|
||||
"id": "documented-hacker",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[(10, 13), (17, 21)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"\n",
|
||||
"def find_tags(text):\n",
|
||||
" return []"
|
||||
" tags = re.finditer(r'<[^>]+>', text)\n",
|
||||
" return [tag.span() for tag in tags]\n",
|
||||
"\n",
|
||||
"# Test the function\n",
|
||||
"text = 'This is a <b>bold</b> text'\n",
|
||||
"find_tags(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "1781331d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"('<b>', '</b>')"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"text[10:13], text[17:21]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -74,13 +113,28 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 9,
|
||||
"id": "unauthorized-study",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(True, False, False)"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def is_translatable(text):\n",
|
||||
" return True"
|
||||
" # Text is translatable if it contains only letters, spaces, and punctuation\n",
|
||||
" return re.fullmatch(r'[a-zA-Z .,!?]+', text) is not None\n",
|
||||
"\n",
|
||||
"# Test the function\n",
|
||||
"is_translatable('Hello, world!'), is_translatable('Hello, 123!'), is_translatable('你好,世界!')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -93,13 +147,65 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 12,
|
||||
"id": "beautiful-mathematics",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[(12, 22), (28, 38), (42, 52), (56, 66), (70, 85)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def find_dates(text):\n",
|
||||
" return []"
|
||||
" # Find all dates in 5 formats: yyyy-mm-dd, yyyy/mm/dd, dd-mm-yyyy, dd/mm/yyyy, dd month yyyy\n",
|
||||
" # yyyy-mm-dd\n",
|
||||
" dates = [date.span() for date in re.finditer(r'\\b\\d{4}-\\d{2}-\\d{2}\\b', text)]\n",
|
||||
" # yyyy/mm/dd\n",
|
||||
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{4}/\\d{2}/\\d{2}\\b', text)]\n",
|
||||
" # dd-mm-yyyy\n",
|
||||
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2}-\\d{2}-\\d{4}\\b', text)]\n",
|
||||
" # dd/mm/yyyy\n",
|
||||
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2}/\\d{2}/\\d{4}\\b', text)]\n",
|
||||
" # dd month yyyy\n",
|
||||
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2} [a-zA-Z]+ \\d{4}\\b', text)]\n",
|
||||
" return dates\n",
|
||||
"\n",
|
||||
"# Test the function\n",
|
||||
"text = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020 or 01 January 2020'\n",
|
||||
"find_dates(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "215a4cbd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"2020-01-01\n",
|
||||
"2020/01/01\n",
|
||||
"01-01-2020\n",
|
||||
"01/01/2020\n",
|
||||
"01 January 2020\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(text[12:22])\n",
|
||||
"print(text[28:38])\n",
|
||||
"print(text[42:52])\n",
|
||||
"print(text[56:66])\n",
|
||||
"print(text[70:85])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -125,13 +231,164 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "finished-essex",
|
||||
"execution_count": 37,
|
||||
"id": "e37a24ad",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = 'The date is 2020-01-02, not 2020/01/02 or 02-01-2020 or 02/01/2020 or 02 January 2020'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 38,
|
||||
"id": "4da1f53f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'The date is 01/02/2020, not 01/02/2020 or 02/01/2020 or 02/01/2020 or 01/02/2020'"
|
||||
]
|
||||
},
|
||||
"execution_count": 38,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from dateutil.parser import parse\n",
|
||||
"\n",
|
||||
"def change_data_to_US_format(text):\n",
|
||||
" dates = find_dates(text)\n",
|
||||
"\n",
|
||||
" for start, end in dates:\n",
|
||||
" date = text[start:end]\n",
|
||||
" try:\n",
|
||||
" new_date = parse(date).strftime('%m/%d/%Y')\n",
|
||||
" text = text[:start] + new_date + text[end:]\n",
|
||||
" except:\n",
|
||||
" pass\n",
|
||||
" return text\n",
|
||||
"\n",
|
||||
"# Test the function\n",
|
||||
"change_data_to_US_format(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"id": "8a2bf3a3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'The date is 02/01/2020, not 02/01/2020 or 01/02/2020 or 01/02/2020 or 02/01/2020'"
|
||||
]
|
||||
},
|
||||
"execution_count": 39,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from dateutil.parser import parse\n",
|
||||
"\n",
|
||||
"def change_data_to_EU_format(text):\n",
|
||||
" dates = find_dates(text)\n",
|
||||
"\n",
|
||||
" for start, end in dates:\n",
|
||||
" date = text[start:end]\n",
|
||||
" try:\n",
|
||||
" new_date = parse(date).strftime('%d/%m/%Y')\n",
|
||||
" text = text[:start] + new_date + text[end:]\n",
|
||||
" except:\n",
|
||||
" pass\n",
|
||||
" return text\n",
|
||||
"\n",
|
||||
"# Test the function\n",
|
||||
"change_data_to_EU_format(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"id": "e1c63075",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'The date is 2020.01.02, not 2020.01.02 or 2020.02.01 or 2020.02.01 or 2020.01.02'"
|
||||
]
|
||||
},
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from dateutil.parser import parse\n",
|
||||
"\n",
|
||||
"def change_data_to_digit_dot_format(text):\n",
|
||||
" dates = find_dates(text)\n",
|
||||
"\n",
|
||||
" for start, end in dates:\n",
|
||||
" date = text[start:end]\n",
|
||||
" try:\n",
|
||||
" new_date = parse(date).strftime('%Y.%m.%d')\n",
|
||||
" text = text[:start] + new_date + text[end:]\n",
|
||||
" except:\n",
|
||||
" pass\n",
|
||||
" return text\n",
|
||||
"\n",
|
||||
"# Test the function\n",
|
||||
"change_data_to_digit_dot_format(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"id": "finished-essex",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'Data jest 01/01/2020, a nie 01/01/2020 lub 01/01/2020 lub 01/01/2020'"
|
||||
]
|
||||
},
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def correct_dates(source_segment, target_segment, date_format):\n",
|
||||
" return ''"
|
||||
" # Check if number of dates in source and target segments are the same\n",
|
||||
" assert len(find_dates(source_segment)) == len(find_dates(target_segment))\n",
|
||||
"\n",
|
||||
" # Check if all dates are the same (ignore the format)\n",
|
||||
" source_dates = find_dates(source_segment)\n",
|
||||
" target_dates = find_dates(target_segment)\n",
|
||||
" for source_date, target_date in zip(source_dates, target_dates):\n",
|
||||
" assert change_data_to_US_format(source_segment[source_date[0]:source_date[1]]) == change_data_to_US_format(target_segment[target_date[0]:target_date[1]]), f\"Dates are different: {source_segment[source_date[0]:source_date[1]]} and {target_segment[target_date[0]:target_date[1]]}\"\n",
|
||||
"\n",
|
||||
" # Change the format of dates in the target segment\n",
|
||||
" if date_format == 'US':\n",
|
||||
" target_segment = change_data_to_US_format(target_segment)\n",
|
||||
" elif date_format == 'EU':\n",
|
||||
" target_segment = change_data_to_EU_format(target_segment)\n",
|
||||
" elif date_format == 'digit.dot':\n",
|
||||
" target_segment = change_data_to_digit_dot_format(target_segment)\n",
|
||||
"\n",
|
||||
" return target_segment\n",
|
||||
"\n",
|
||||
"# Test the function\n",
|
||||
"source_segment = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020'\n",
|
||||
"target_segment = \"Data jest 01/01/2020, a nie 2020-01-01 lub 01-01-2020 lub 01/01/2020\"\n",
|
||||
"correct_dates(source_segment, target_segment, 'US')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -176,13 +433,84 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 60,
|
||||
"id": "romance-judge",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import math\n",
|
||||
"\n",
|
||||
"def transfer_tags(source_segment, target_segment):\n",
|
||||
" return ''"
|
||||
" # Split the segments into tokens\n",
|
||||
" source_tokens = source_segment.split()\n",
|
||||
" target_tokens = target_segment.split()\n",
|
||||
"\n",
|
||||
" # Calculate the ratio of the number of tokens in the target to the number of tokens in the source\n",
|
||||
" ratio = len(target_tokens) / len(source_tokens)\n",
|
||||
"\n",
|
||||
" # Assign tags to tokens in the target tokens - if the source token has a tag, assign it to the corresponding token in the target tokens\n",
|
||||
" for i, source_token in enumerate(source_tokens):\n",
|
||||
" if re.match(r'<[^>]+>', source_token):\n",
|
||||
" target_index = math.ceil(i * ratio)\n",
|
||||
"\n",
|
||||
" if target_index >= len(target_tokens):\n",
|
||||
" target_index = len(target_tokens) - 1\n",
|
||||
"\n",
|
||||
" # Assign start tag\n",
|
||||
" target_tokens[target_index] = re.findall(r'<[^>]+>', source_token)[0] + target_tokens[target_index]\n",
|
||||
"\n",
|
||||
" # Assign end tag\n",
|
||||
" target_tokens[target_index] = target_tokens[target_index] + re.findall(r'</[^>]+>', source_token)[0]\n",
|
||||
"\n",
|
||||
" return ' '.join(target_tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 61,
|
||||
"id": "fd8858d8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'To jest <b>ważny</b> tekst'"
|
||||
]
|
||||
},
|
||||
"execution_count": 61,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Test the function (same number of tokens)\n",
|
||||
"source_segment = 'This is <b>bold</b> text'\n",
|
||||
"target_segment = 'To jest ważny tekst'\n",
|
||||
"transfer_tags(source_segment, target_segment)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 62,
|
||||
"id": "de9e6298",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'To jest bardzo <b>ważny</b> tekst'"
|
||||
]
|
||||
},
|
||||
"execution_count": 62,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Test the function (different number of tokens)\n",
|
||||
"source_segment = 'This is <b>bold</b> text'\n",
|
||||
"target_segment = 'To jest bardzo ważny tekst'\n",
|
||||
"transfer_tags(source_segment, target_segment)"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -205,7 +533,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.10.14"
|
||||
},
|
||||
"subtitle": "6,7. Preprocessing i postprocessing",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
|
205
lab/lab_08.ipynb
205
lab/lab_08.ipynb
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue