Compare commits
8 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
9545d2d669 | ||
|
d4038eb5ae | ||
|
5bbba14a57 | ||
|
fd590b3a22 | ||
|
9b75563e6a | ||
|
7a6ac33f6e | ||
|
5de69211e1 | ||
|
870b673fac |
2
lab/data/.gitignore
vendored
Normal file
2
lab/data/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
corpus/
|
||||||
|
NIPS Papers/
|
10
lab/data/lda_topics.txt
Normal file
10
lab/data/lda_topics.txt
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
(0, '0.006*"learning" + 0.005*"model" + 0.005*"data" + 0.004*"function" + 0.004*"set" + 0.004*"using" + 0.004*"number" + 0.004*"neural" + 0.004*"one" + 0.003*"error"')
|
||||||
|
(1, '0.008*"learning" + 0.006*"data" + 0.005*"model" + 0.005*"set" + 0.004*"algorithm" + 0.004*"time" + 0.004*"one" + 0.004*"two" + 0.003*"used" + 0.003*"figure"')
|
||||||
|
(2, '0.007*"data" + 0.005*"model" + 0.005*"set" + 0.005*"learning" + 0.004*"one" + 0.004*"algorithm" + 0.004*"time" + 0.003*"using" + 0.003*"figure" + 0.003*"training"')
|
||||||
|
(3, '0.006*"data" + 0.005*"model" + 0.004*"learning" + 0.004*"two" + 0.004*"algorithm" + 0.004*"using" + 0.004*"function" + 0.004*"set" + 0.003*"number" + 0.003*"given"')
|
||||||
|
(4, '0.006*"learning" + 0.005*"data" + 0.005*"model" + 0.005*"set" + 0.004*"algorithm" + 0.004*"time" + 0.004*"using" + 0.004*"two" + 0.004*"function" + 0.003*"one"')
|
||||||
|
(5, '0.008*"learning" + 0.006*"data" + 0.005*"algorithm" + 0.004*"model" + 0.004*"two" + 0.004*"function" + 0.004*"number" + 0.003*"figure" + 0.003*"time" + 0.003*"set"')
|
||||||
|
(6, '0.007*"learning" + 0.006*"model" + 0.005*"data" + 0.005*"algorithm" + 0.004*"function" + 0.004*"set" + 0.003*"time" + 0.003*"one" + 0.003*"based" + 0.003*"number"')
|
||||||
|
(7, '0.007*"learning" + 0.005*"set" + 0.005*"data" + 0.005*"model" + 0.004*"algorithm" + 0.004*"function" + 0.004*"using" + 0.004*"number" + 0.004*"log" + 0.004*"figure"')
|
||||||
|
(8, '0.005*"learning" + 0.005*"set" + 0.005*"algorithm" + 0.004*"model" + 0.004*"function" + 0.004*"data" + 0.004*"one" + 0.004*"time" + 0.003*"using" + 0.003*"given"')
|
||||||
|
(9, '0.007*"data" + 0.006*"model" + 0.005*"learning" + 0.005*"algorithm" + 0.004*"two" + 0.003*"number" + 0.003*"time" + 0.003*"set" + 0.003*"function" + 0.003*"used"')
|
100
lab/data/top_nouns.txt
Normal file
100
lab/data/top_nouns.txt
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
project victims support visit mediation
|
||||||
|
exhibition cooperation year meeting films
|
||||||
|
exhibition cooperation year meeting films
|
||||||
|
solution occupation settlement problem resolutions
|
||||||
|
residence citizens permit security citizen
|
||||||
|
residence citizens permit security citizen
|
||||||
|
support measures countries farmers member
|
||||||
|
data services infrastructure development project
|
||||||
|
data services infrastructure development project
|
||||||
|
photographs service scans materials films
|
||||||
|
photographs service scans materials films
|
||||||
|
insurance ZUS contributions benefits administration
|
||||||
|
project archaeology research conservation history
|
||||||
|
project archaeology research conservation history
|
||||||
|
cases % coronavirus countries disease
|
||||||
|
% year case cases coronavirus
|
||||||
|
ship tug speed accident course
|
||||||
|
ship tug speed accident course
|
||||||
|
work scientists research science telomerase
|
||||||
|
work scientists research science telomerase
|
||||||
|
film media part time efforts
|
||||||
|
film media part time efforts
|
||||||
|
insurance ZUS contributions benefits administration
|
||||||
|
use care stewardship resistance antibiotics
|
||||||
|
services administration state information e
|
||||||
|
services administration state information e
|
||||||
|
coronavirus research measures outbreak member
|
||||||
|
residence card foreigner work permit
|
||||||
|
security e threats policy gas
|
||||||
|
security e threats policy gas
|
||||||
|
paper 15th reader file date
|
||||||
|
paper 15th reader file date
|
||||||
|
costs implementation management tasks expenditures
|
||||||
|
food cooperation products market agri
|
||||||
|
costs implementation management tasks expenditures
|
||||||
|
costs implementation management tasks expenditures
|
||||||
|
artist work painting paintings time
|
||||||
|
artist work painting paintings time
|
||||||
|
Home » rights representatives discrimination
|
||||||
|
Home » rights representatives discrimination
|
||||||
|
command documentation alias files directory
|
||||||
|
water basis land status item
|
||||||
|
water basis land status item
|
||||||
|
% contract contracts . No
|
||||||
|
food cooperation products market agri
|
||||||
|
% contract contracts . No
|
||||||
|
market level services age companies
|
||||||
|
market level services age companies
|
||||||
|
projects innovation R&D development companies
|
||||||
|
projects innovation R&D development companies
|
||||||
|
contracts contract % item procedures
|
||||||
|
contracts contract % item procedures
|
||||||
|
room A office information B
|
||||||
|
room A office information B
|
||||||
|
advantage production country countries goods
|
||||||
|
measles vaccine disease person people
|
||||||
|
advantage production country countries goods
|
||||||
|
card residence permission business stamp
|
||||||
|
card residence permission business stamp
|
||||||
|
w % gospodarczego polityki publicznych
|
||||||
|
system banks stability risk sector
|
||||||
|
camps people concentration policy resistance
|
||||||
|
camps people concentration policy resistance
|
||||||
|
safety aviation management requirements entity
|
||||||
|
safety aviation management requirements entity
|
||||||
|
research call philosophy information project
|
||||||
|
vaccination pertussis cancer risk disease
|
||||||
|
research call philosophy information project
|
||||||
|
energy gas % oil countries
|
||||||
|
energy gas % oil countries
|
||||||
|
cooperation meeting talks forces defence
|
||||||
|
project education information coronavirus funding
|
||||||
|
food education project measures assistance
|
||||||
|
infection disease symptoms fever humans
|
||||||
|
energy audit costs use management
|
||||||
|
countries % development benefits funds
|
||||||
|
years minister year rector persons
|
||||||
|
water food fish times year
|
||||||
|
land water population data age
|
||||||
|
land water population data age
|
||||||
|
market labour crisis unemployment countries
|
||||||
|
market labour crisis unemployment countries
|
||||||
|
accelerator research - operation model
|
||||||
|
accelerator research - operation model
|
||||||
|
energy policy power development objectives
|
||||||
|
priest hand country wedding church
|
||||||
|
eggs breakfast food products meat
|
||||||
|
eggs breakfast food products meat
|
||||||
|
water fish times food year
|
||||||
|
honey production bread time taste
|
||||||
|
honey production bread time taste
|
||||||
|
data job portal vacancies Decision
|
||||||
|
data job portal vacancies Decision
|
||||||
|
food quality products apples farmers
|
||||||
|
food quality products apples farmers
|
||||||
|
visa activities child B-1 institution
|
||||||
|
visa activities child B-1 institution
|
||||||
|
- co preparations operation preparation
|
||||||
|
- co preparations operation preparation
|
||||||
|
project victims support visit mediation
|
100
lab/data/top_nouns_tfidf.txt
Normal file
100
lab/data/top_nouns_tfidf.txt
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
approval total lawyers priorities judges
|
||||||
|
agriculture support guests offers author
|
||||||
|
agriculture support guests offers author
|
||||||
|
homeland invasion address prisoners sources
|
||||||
|
identity positions elaboration issues terms
|
||||||
|
identity positions elaboration issues terms
|
||||||
|
distancing lenders mechanism check part
|
||||||
|
IT Realization Services resolutions bases
|
||||||
|
IT Realization Services resolutions bases
|
||||||
|
occupation scans browser Service processes
|
||||||
|
occupation scans browser Service processes
|
||||||
|
am war month Insurance centralisation
|
||||||
|
conservation zu provisions basin record
|
||||||
|
conservation zu provisions basin record
|
||||||
|
culture city abscesses aeronautics disruptors
|
||||||
|
infection Recommendations man evening occurrence
|
||||||
|
course hull STATE classifier certificate
|
||||||
|
course hull STATE classifier certificate
|
||||||
|
cooling work culture part laboratory
|
||||||
|
cooling work culture part laboratory
|
||||||
|
culture reverse advisor documentary service
|
||||||
|
culture reverse advisor documentary service
|
||||||
|
am war month Insurance centralisation
|
||||||
|
pressure ability entry prescribers costs
|
||||||
|
economies management role disk stakeholders
|
||||||
|
economies management role disk stakeholders
|
||||||
|
traders fears carriers illness distancing
|
||||||
|
activity employment foreigners Visa graduate
|
||||||
|
defense forecast quarter factors opportunity
|
||||||
|
defense forecast quarter factors opportunity
|
||||||
|
case author screen announcement typefaces
|
||||||
|
case author screen announcement typefaces
|
||||||
|
revenue office premises o proposals
|
||||||
|
storage completion efforts Meeting crisis
|
||||||
|
office Types premises protection days
|
||||||
|
revenue office premises o proposals
|
||||||
|
pictures splashing dobrze viewer culture
|
||||||
|
pictures splashing dobrze viewer culture
|
||||||
|
creation origin discrimination interest institutions
|
||||||
|
creation origin discrimination interest institutions
|
||||||
|
names contexts calculator program descriptor
|
||||||
|
periods standards total name property
|
||||||
|
periods standards total name property
|
||||||
|
Art days liability authorities services
|
||||||
|
storage completion efforts Meeting crisis
|
||||||
|
Art days liability authorities services
|
||||||
|
skills provision country economies science
|
||||||
|
skills provision country economies science
|
||||||
|
Project possibilities cancer members therapies
|
||||||
|
Project possibilities cancer members therapies
|
||||||
|
price auction actions telecommunications appointment
|
||||||
|
price auction actions telecommunications appointment
|
||||||
|
records coffee authorisation line times
|
||||||
|
records coffee authorisation line times
|
||||||
|
example manner source essence identification
|
||||||
|
defences vaccines days spread body
|
||||||
|
example manner source essence identification
|
||||||
|
servants employees Possession insurance examinations
|
||||||
|
servants employees Possession insurance examinations
|
||||||
|
systemowe dopiero system latach popytem
|
||||||
|
efficiency problems uncertainty improvement Risk
|
||||||
|
uprising borders rights security campaign
|
||||||
|
uprising borders rights security campaign
|
||||||
|
part audits Responsibilities services authority
|
||||||
|
protection competence version occurrence requisition
|
||||||
|
Requirements members methodology data database
|
||||||
|
whoop substitute cause exposure course
|
||||||
|
Requirements members methodology data database
|
||||||
|
erent decisions SOURCES spectrum economies
|
||||||
|
erent decisions SOURCES spectrum economies
|
||||||
|
invitation effects help armament round
|
||||||
|
area teaching tax time travel
|
||||||
|
time Recommendation participants guarantees work
|
||||||
|
toxin mechanisms attacks Babies therapies
|
||||||
|
production replacement control SMEs audit
|
||||||
|
significance net ground participants levels
|
||||||
|
functioning consultation interest expert procedures
|
||||||
|
thing mercury eggs municipality lunch
|
||||||
|
agriculture R result development prices
|
||||||
|
agriculture R result development prices
|
||||||
|
reflection basis sources points results
|
||||||
|
reflection basis sources points results
|
||||||
|
leaders reach author features publications
|
||||||
|
leaders reach author features publications
|
||||||
|
consumption Improvement bodies level need
|
||||||
|
money delirium advice house couple
|
||||||
|
work thanks BEgINNINg range funds
|
||||||
|
work thanks BEgINNINg range funds
|
||||||
|
option eggs dinner wine quantities
|
||||||
|
seeds mead event maples approach
|
||||||
|
seeds mead event maples approach
|
||||||
|
case complaints consultation Employers actions
|
||||||
|
case complaints consultation Employers actions
|
||||||
|
activity fruit indications zation rice
|
||||||
|
activity fruit indications zation rice
|
||||||
|
building work premises Food child
|
||||||
|
building work premises Food child
|
||||||
|
virtue works culture sectors others
|
||||||
|
virtue works culture sectors others
|
||||||
|
approval total lawyers priorities judges
|
223
lab/lab_01.ipynb
223
lab/lab_01.ipynb
@ -52,7 +52,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 191,
|
||||||
"id": "narrow-romantic",
|
"id": "narrow-romantic",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -71,7 +71,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 192,
|
||||||
"id": "indonesian-electron",
|
"id": "indonesian-electron",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -82,7 +82,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 193,
|
||||||
"id": "compact-trinidad",
|
"id": "compact-trinidad",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -92,7 +92,7 @@
|
|||||||
"['Press the ENTER button']"
|
"['Press the ENTER button']"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 3,
|
"execution_count": 193,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -119,7 +119,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 194,
|
||||||
"id": "exposed-daniel",
|
"id": "exposed-daniel",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -139,7 +139,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 195,
|
||||||
"id": "serial-velvet",
|
"id": "serial-velvet",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -149,7 +149,7 @@
|
|||||||
"['Press the ENTER button', 'Press the ENTER key']"
|
"['Press the ENTER button', 'Press the ENTER key']"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 5,
|
"execution_count": 195,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -176,7 +176,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 196,
|
||||||
"id": "every-gibson",
|
"id": "every-gibson",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -186,7 +186,7 @@
|
|||||||
"[]"
|
"[]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 6,
|
"execution_count": 196,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -213,13 +213,37 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 197,
|
||||||
"id": "protected-rings",
|
"id": "protected-rings",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"def preprocess(sentence):\n",
|
||||||
|
" return sentence.lower()\n",
|
||||||
|
"\n",
|
||||||
"def tm_lookup(sentence):\n",
|
"def tm_lookup(sentence):\n",
|
||||||
" return ''"
|
" return [entry[1] for entry in translation_memory if preprocess(entry[0]) == preprocess(sentence)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 198,
|
||||||
|
"id": "7baee10b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['Press the ENTER button', 'Press the ENTER key']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 198,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"tm_lookup('Wciśnij przycisk ENTER')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -232,17 +256,17 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 18,
|
"execution_count": 199,
|
||||||
"id": "severe-alloy",
|
"id": "severe-alloy",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"''"
|
"[]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 18,
|
"execution_count": 199,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -261,13 +285,40 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 11,
|
"execution_count": 200,
|
||||||
"id": "structural-diesel",
|
"id": "structural-diesel",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"import string\n",
|
||||||
|
"\n",
|
||||||
|
"def preprocess(s):\n",
|
||||||
|
" translator = str.maketrans('', '', string.punctuation)\n",
|
||||||
|
" return s.translate(translator).lower()\n",
|
||||||
|
"\n",
|
||||||
"def tm_lookup(sentence):\n",
|
"def tm_lookup(sentence):\n",
|
||||||
" return ''"
|
" return [entry[1] for entry in translation_memory if preprocess(entry[0]) == preprocess(sentence)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 201,
|
||||||
|
"id": "c03c6709",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['Press the ENTER button', 'Press the ENTER key']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 201,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"tm_lookup('Wciśnij przycisk [ENTER]')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -280,17 +331,17 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 12,
|
"execution_count": 202,
|
||||||
"id": "brief-senegal",
|
"id": "brief-senegal",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"''"
|
"[]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 12,
|
"execution_count": 202,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -317,13 +368,43 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 14,
|
"execution_count": 203,
|
||||||
"id": "mathematical-customs",
|
"id": "mathematical-customs",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"def compare_sentences(l1, l2):\n",
|
||||||
|
" return sum([1 for i, j in zip(l1.split(), l2.split()) if i != j]) <= 1\n",
|
||||||
|
"\n",
|
||||||
|
"import string\n",
|
||||||
|
"\n",
|
||||||
|
"def preprocess(s):\n",
|
||||||
|
" translator = str.maketrans('', '', string.punctuation)\n",
|
||||||
|
" return s.translate(translator).lower()\n",
|
||||||
|
"\n",
|
||||||
"def tm_lookup(sentence):\n",
|
"def tm_lookup(sentence):\n",
|
||||||
" return ''"
|
" return [entry[1] for entry in translation_memory if compare_sentences(preprocess(entry[0]), preprocess(sentence))]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 204,
|
||||||
|
"id": "6264b722",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['System restart required']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 204,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"tm_lookup('Wymagane ponowne uruchomienie maszyny')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -344,7 +425,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 15,
|
"execution_count": 205,
|
||||||
"id": "humanitarian-wrong",
|
"id": "humanitarian-wrong",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -362,7 +443,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 16,
|
"execution_count": 206,
|
||||||
"id": "located-perception",
|
"id": "located-perception",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -374,7 +455,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 17,
|
"execution_count": 207,
|
||||||
"id": "advised-casting",
|
"id": "advised-casting",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -384,7 +465,7 @@
|
|||||||
"[('przycisk', 'button'), ('drukarka', 'printer')]"
|
"[('przycisk', 'button'), ('drukarka', 'printer')]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 17,
|
"execution_count": 207,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -406,7 +487,7 @@
|
|||||||
"id": "defensive-fifteen",
|
"id": "defensive-fifteen",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Odpowiedź:"
|
"Odpowiedź: Jeżeli implementacja wygląda tak jak powyżej, złożoność to `O(n*m)`, gdzie `n` to liczba słów w zdaniu, a `m` to liczba haseł w słowniku, ponieważ dla każdego słowa iteracyjnie przechodzimy przez cały nasz słownik i szukamy odpowiednika"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -419,13 +500,56 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 19,
|
"execution_count": 208,
|
||||||
|
"id": "aca5d340",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[('przycisk', 'button')]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 208,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"glossary_lookup('Każda Drukarka posiada przycisk wznowienia drukowania')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 209,
|
||||||
"id": "original-tunisia",
|
"id": "original-tunisia",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def glossary_lookup(sentence):\n",
|
"def glossary_lookup(sentence):\n",
|
||||||
" return ''"
|
" sentence_words = [word.lower() for word in sentence.split()]\n",
|
||||||
|
" return [entry for entry in glossary if entry[0] in sentence_words]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 210,
|
||||||
|
"id": "716bbbe9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[('przycisk', 'button'), ('drukarka', 'printer')]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 210,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -438,13 +562,50 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 20,
|
"execution_count": 211,
|
||||||
|
"id": "32dec661",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]\n",
|
||||||
|
"glossary = {\n",
|
||||||
|
" 'komputer': 'computer',\n",
|
||||||
|
" 'przycisk': 'button',\n",
|
||||||
|
" 'drukarka': 'printer'\n",
|
||||||
|
"}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 212,
|
||||||
"id": "adolescent-semiconductor",
|
"id": "adolescent-semiconductor",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def glossary_lookup(sentence):\n",
|
"def glossary_lookup(sentence):\n",
|
||||||
" return ''"
|
" sentence_words = [word.lower() for word in sentence.split() if word.lower() in glossary]\n",
|
||||||
|
" return [(word, glossary[word]) for word in sentence_words]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 213,
|
||||||
|
"id": "d1e991c6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[('drukarka', 'printer'), ('przycisk', 'button')]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 213,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -467,7 +628,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.10"
|
"version": "3.10.14"
|
||||||
},
|
},
|
||||||
"subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
|
"subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
|
||||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||||
|
104
lab/lab_02.ipynb
104
lab/lab_02.ipynb
@ -57,7 +57,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 17,
|
||||||
"id": "confident-prison",
|
"id": "confident-prison",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -80,13 +80,27 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 18,
|
||||||
"id": "continental-submission",
|
"id": "continental-submission",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def ice_lookup(sentence, prev_sentence, next_sentence):\n",
|
"def ice_lookup(sentence, prev_sentence, next_sentence):\n",
|
||||||
" return []"
|
" # Wyniki dopasowania ICE\n",
|
||||||
|
" ice_matches = []\n",
|
||||||
|
"\n",
|
||||||
|
" # Iterujemy przez pamięć tłumaczeń, pomijając pierwszy i ostatni element dla bezpieczeństwa kontekstowego\n",
|
||||||
|
" for index in range(1, len(translation_memory) - 1):\n",
|
||||||
|
" # Pobieramy obecne, poprzednie i następne zdania z TM\n",
|
||||||
|
" prev_tm_sentence, _ = translation_memory[index - 1]\n",
|
||||||
|
" current_tm_sentence, current_tm_translation = translation_memory[index]\n",
|
||||||
|
" next_tm_sentence, _ = translation_memory[index + 1]\n",
|
||||||
|
"\n",
|
||||||
|
" # Sprawdzamy, czy wszystkie trzy zdania zgadzają się z odpowiednikami w TM\n",
|
||||||
|
"        if (prev_tm_sentence == prev_sentence and current_tm_sentence == sentence and next_tm_sentence == next_sentence):\n",
|
||||||
|
" ice_matches.append(current_tm_translation)\n",
|
||||||
|
"\n",
|
||||||
|
" return ice_matches"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -119,7 +133,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 19,
|
||||||
"id": "fourth-pillow",
|
"id": "fourth-pillow",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -141,7 +155,11 @@
|
|||||||
"id": "graduate-theorem",
|
"id": "graduate-theorem",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Odpowiedź:"
|
"Odpowiedź: Nie, ponieważ w tej funkcji interesuje nas tylko długość zdania, tzn. drugi warunek nie będzie spełniony\n",
|
||||||
|
"\n",
|
||||||
|
"Przykład: `kot != bok`, a dla tej funkcji zwróci 0\n",
|
||||||
|
"\n",
|
||||||
|
"Spełnione warunki: 1, 3, 4"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -154,7 +172,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 20,
|
||||||
"id": "continued-christopher",
|
"id": "continued-christopher",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -179,7 +197,40 @@
|
|||||||
"id": "metallic-leave",
|
"id": "metallic-leave",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Odpowiedź:"
|
"Odpowiedź: Tak, spełnia wszystkie warunki\n",
|
||||||
|
"\n",
|
||||||
|
"Sprawdzenie dla warunku 4"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 29,
|
||||||
|
"id": "349a3547",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"True\n",
|
||||||
|
"True\n",
|
||||||
|
"True\n",
|
||||||
|
"True\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# x == y i y == z\n",
|
||||||
|
"print(sentence_distance(\"kot\", \"kot\") + sentence_distance(\"kot\", \"kot\") >= sentence_distance(\"kot\", \"kot\"))\n",
|
||||||
|
"\n",
|
||||||
|
"# x == y i y != z\n",
|
||||||
|
"print(sentence_distance(\"kot\", \"kot\") + sentence_distance(\"kot\", \"pies\") >= sentence_distance(\"kot\", \"pies\"))\n",
|
||||||
|
"\n",
|
||||||
|
"# x != y i y == z\n",
|
||||||
|
"print(sentence_distance(\"kot\", \"pies\") + sentence_distance(\"pies\", \"pies\") >= sentence_distance(\"kot\", \"pies\"))\n",
|
||||||
|
"\n",
|
||||||
|
"# x != y i y != z\n",
|
||||||
|
"print(sentence_distance(\"kot\", \"pies\") + sentence_distance(\"pies\", \"kot\") >= sentence_distance(\"kot\", \"kot\"))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -206,7 +257,11 @@
|
|||||||
"id": "bibliographic-stopping",
|
"id": "bibliographic-stopping",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Odpowiedź:"
|
"Odpowiedź:\n",
|
||||||
|
"- Dystans Levenshteina jest zawsze nieujemny\n",
|
||||||
|
"- Jeśli dwa ciągi są identyczne, nie potrzeba żadnych operacji do przekształcenia jednego w drugi\n",
|
||||||
|
"- Dystans Levenshteina jest symetryczny, ponieważ liczba operacji wymaganych do przekształcenia ciągu A w ciąg B jest taka sama jak liczba operacji potrzebnych do przekształcenia ciągu B w ciąg A\n",
|
||||||
|
"- Dystans Levenshteina spełnia nierówność trójkąta. Można to uzasadnić rozważając, że przekształcenie ciągu X w Y przez ciąg pośredni Z (najpierw przekształcając X w Z, a następnie Z w Y) jest jednym z możliwych sposobów przekształcenia X w Y, więc nie może wymagać mniej operacji niż minimalna liczba operacji bezpośredniego przekształcenia X w Y, czyli d(X, Y) <= d(X, Z) + d(Z, Y)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -223,7 +278,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 21,
|
||||||
"id": "secondary-wrist",
|
"id": "secondary-wrist",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -233,7 +288,7 @@
|
|||||||
"2"
|
"2"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 5,
|
"execution_count": 21,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -254,7 +309,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 22,
|
||||||
"id": "associate-tuner",
|
"id": "associate-tuner",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -273,7 +328,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 23,
|
||||||
"id": "focal-pathology",
|
"id": "focal-pathology",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -283,7 +338,7 @@
|
|||||||
"0.9166666666666666"
|
"0.9166666666666666"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 7,
|
"execution_count": 23,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -294,7 +349,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 24,
|
||||||
"id": "roman-ceiling",
|
"id": "roman-ceiling",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -304,7 +359,7 @@
|
|||||||
"0.9428571428571428"
|
"0.9428571428571428"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 8,
|
"execution_count": 24,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -315,7 +370,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": 25,
|
||||||
"id": "invisible-cambodia",
|
"id": "invisible-cambodia",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -325,7 +380,7 @@
|
|||||||
"0.631578947368421"
|
"0.631578947368421"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 9,
|
"execution_count": 25,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -344,13 +399,22 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 10,
|
"execution_count": 26,
|
||||||
"id": "genetic-cradle",
|
"id": "genetic-cradle",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"# Write a fuzzy_lookup function that will search the translation memory for all sentences whose Levenshtein similarity to the searched sentence is greater than or equal to a set threshold.\n",
|
||||||
"def fuzzy_lookup(sentence, threshold):\n",
|
"def fuzzy_lookup(sentence, threshold):\n",
|
||||||
" return []"
|
" fuzzy_matches = []\n",
|
||||||
|
"\n",
|
||||||
|
" # Iterujemy przez pamięć tłumaczeń\n",
|
||||||
|
" for tm_sentence, tm_translation in translation_memory:\n",
|
||||||
|
" # Sprawdzamy, czy podobieństwo Levenshteina jest większe lub równe progowi\n",
|
||||||
|
" if levenshtein_similarity(sentence, tm_sentence) >= threshold:\n",
|
||||||
|
" fuzzy_matches.append(tm_translation)\n",
|
||||||
|
"\n",
|
||||||
|
" return fuzzy_matches"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -373,7 +437,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.10"
|
"version": "3.10.14"
|
||||||
},
|
},
|
||||||
"subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
|
"subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
|
||||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||||
|
349
lab/lab_03.ipynb
349
lab/lab_03.ipynb
@ -63,7 +63,7 @@
|
|||||||
"id": "diverse-sunglasses",
|
"id": "diverse-sunglasses",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Odpowiedź:"
|
"Odpowiedź: Wynik z Google Translate to `metal cabinet guides`"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -86,7 +86,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 11,
|
||||||
"id": "loving-prince",
|
"id": "loving-prince",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -110,7 +110,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 12,
|
||||||
"id": "bound-auction",
|
"id": "bound-auction",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -128,13 +128,46 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 13,
|
||||||
"id": "cognitive-cedar",
|
"id": "cognitive-cedar",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def terminology_lookup():\n",
|
"def terminology_lookup():\n",
|
||||||
" return []"
|
" for term in dictionary:\n",
|
||||||
|
" start = 0\n",
|
||||||
|
" while True:\n",
|
||||||
|
" start = text.find(term, start)\n",
|
||||||
|
" if start == -1:\n",
|
||||||
|
" break\n",
|
||||||
|
" end = start + len(term)\n",
|
||||||
|
" print(f'{term}: ({start}, {end})')\n",
|
||||||
|
" start = end"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"id": "0a4a26ba",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"program: (14, 21)\n",
|
||||||
|
"program: (291, 298)\n",
|
||||||
|
"program: (468, 475)\n",
|
||||||
|
"program: (516, 523)\n",
|
||||||
|
"program: (533, 540)\n",
|
||||||
|
"application: (80, 91)\n",
|
||||||
|
"application: (164, 175)\n",
|
||||||
|
"application: (322, 333)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"terminology_lookup()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -161,7 +194,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 15,
|
||||||
"id": "tribal-attention",
|
"id": "tribal-attention",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
@ -169,108 +202,7 @@
|
|||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
" \n",
|
" for all Java programmer : this section explain how to compile and run a swing application from the command line . for information on compile and run a swing application use NetBeans IDE , see run Tutorial Examples in NetBeans IDE . the compilation instruction work for all Swing program — applet , as well as application . here be the step you need to follow : install the late release of the Java SE platform , if you have not already do so . create a program that use swing component . compile the program . run the program . "
|
||||||
"for\n",
|
|
||||||
"all\n",
|
|
||||||
"Java\n",
|
|
||||||
"programmer\n",
|
|
||||||
":\n",
|
|
||||||
"this\n",
|
|
||||||
"section\n",
|
|
||||||
"explain\n",
|
|
||||||
"how\n",
|
|
||||||
"to\n",
|
|
||||||
"compile\n",
|
|
||||||
"and\n",
|
|
||||||
"run\n",
|
|
||||||
"a\n",
|
|
||||||
"swing\n",
|
|
||||||
"application\n",
|
|
||||||
"from\n",
|
|
||||||
"the\n",
|
|
||||||
"command\n",
|
|
||||||
"line\n",
|
|
||||||
".\n",
|
|
||||||
"for\n",
|
|
||||||
"information\n",
|
|
||||||
"on\n",
|
|
||||||
"compile\n",
|
|
||||||
"and\n",
|
|
||||||
"run\n",
|
|
||||||
"a\n",
|
|
||||||
"swing\n",
|
|
||||||
"application\n",
|
|
||||||
"use\n",
|
|
||||||
"NetBeans\n",
|
|
||||||
"IDE\n",
|
|
||||||
",\n",
|
|
||||||
"see\n",
|
|
||||||
"Running\n",
|
|
||||||
"Tutorial\n",
|
|
||||||
"Examples\n",
|
|
||||||
"in\n",
|
|
||||||
"NetBeans\n",
|
|
||||||
"IDE\n",
|
|
||||||
".\n",
|
|
||||||
"the\n",
|
|
||||||
"compilation\n",
|
|
||||||
"instruction\n",
|
|
||||||
"work\n",
|
|
||||||
"for\n",
|
|
||||||
"all\n",
|
|
||||||
"swing\n",
|
|
||||||
"program\n",
|
|
||||||
"—\n",
|
|
||||||
"applet\n",
|
|
||||||
",\n",
|
|
||||||
"as\n",
|
|
||||||
"well\n",
|
|
||||||
"as\n",
|
|
||||||
"application\n",
|
|
||||||
".\n",
|
|
||||||
"here\n",
|
|
||||||
"be\n",
|
|
||||||
"the\n",
|
|
||||||
"step\n",
|
|
||||||
"-PRON-\n",
|
|
||||||
"need\n",
|
|
||||||
"to\n",
|
|
||||||
"follow\n",
|
|
||||||
":\n",
|
|
||||||
"install\n",
|
|
||||||
"the\n",
|
|
||||||
"late\n",
|
|
||||||
"release\n",
|
|
||||||
"of\n",
|
|
||||||
"the\n",
|
|
||||||
"Java\n",
|
|
||||||
"SE\n",
|
|
||||||
"platform\n",
|
|
||||||
",\n",
|
|
||||||
"if\n",
|
|
||||||
"-PRON-\n",
|
|
||||||
"have\n",
|
|
||||||
"not\n",
|
|
||||||
"already\n",
|
|
||||||
"do\n",
|
|
||||||
"so\n",
|
|
||||||
".\n",
|
|
||||||
"create\n",
|
|
||||||
"a\n",
|
|
||||||
"program\n",
|
|
||||||
"that\n",
|
|
||||||
"use\n",
|
|
||||||
"Swing\n",
|
|
||||||
"component\n",
|
|
||||||
".\n",
|
|
||||||
"compile\n",
|
|
||||||
"the\n",
|
|
||||||
"program\n",
|
|
||||||
".\n",
|
|
||||||
"run\n",
|
|
||||||
"the\n",
|
|
||||||
"program\n",
|
|
||||||
".\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -281,7 +213,7 @@
|
|||||||
"doc = nlp(text)\n",
|
"doc = nlp(text)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"for token in doc:\n",
|
"for token in doc:\n",
|
||||||
" print(token.lemma_)"
|
" print(token.lemma_, end=' ')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -302,13 +234,40 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 40,
|
||||||
"id": "surgical-demonstration",
|
"id": "surgical-demonstration",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def terminology_lookup():\n",
|
"def terminology_lookup():\n",
|
||||||
" return []"
|
" for term in dictionary:\n",
|
||||||
|
" for token in doc:\n",
|
||||||
|
" if token.lemma_ == term:\n",
|
||||||
|
" print(f'{token}: ({token.idx}, {token.idx + len(token)})')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 39,
|
||||||
|
"id": "74f600ea",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"programs: (291, 299)\n",
|
||||||
|
"program: (468, 475)\n",
|
||||||
|
"program: (516, 523)\n",
|
||||||
|
"program: (533, 540)\n",
|
||||||
|
"application: (80, 91)\n",
|
||||||
|
"application: (164, 175)\n",
|
||||||
|
"applications: (322, 334)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"terminology_lookup()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -337,13 +296,56 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 22,
|
||||||
"id": "superb-butterfly",
|
"id": "superb-butterfly",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def get_nouns(text):\n",
|
"def get_nouns(text):\n",
|
||||||
" return []"
|
" doc = nlp(text)\n",
|
||||||
|
" return [token.text for token in doc if token.pos_ == 'NOUN']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"id": "2bfedfa3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['programmers',\n",
|
||||||
|
" 'section',\n",
|
||||||
|
" 'Swing',\n",
|
||||||
|
" 'application',\n",
|
||||||
|
" 'command',\n",
|
||||||
|
" 'line',\n",
|
||||||
|
" 'information',\n",
|
||||||
|
" 'Swing',\n",
|
||||||
|
" 'application',\n",
|
||||||
|
" 'compilation',\n",
|
||||||
|
" 'instructions',\n",
|
||||||
|
" 'programs',\n",
|
||||||
|
" 'applets',\n",
|
||||||
|
" 'applications',\n",
|
||||||
|
" 'steps',\n",
|
||||||
|
" 'release',\n",
|
||||||
|
" 'platform',\n",
|
||||||
|
" 'program',\n",
|
||||||
|
" 'Swing',\n",
|
||||||
|
" 'components',\n",
|
||||||
|
" 'program',\n",
|
||||||
|
" 'program']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 23,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"get_nouns(text)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -356,7 +358,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 19,
|
||||||
"id": "acting-tolerance",
|
"id": "acting-tolerance",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -374,13 +376,54 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": 26,
|
||||||
"id": "eight-redhead",
|
"id": "eight-redhead",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def extract_terms(text):\n",
|
"def extract_terms(text):\n",
|
||||||
" return []"
|
" doc = nlp(text)\n",
|
||||||
|
" terms = {}\n",
|
||||||
|
" for token in doc:\n",
|
||||||
|
" if token.pos_ == 'NOUN':\n",
|
||||||
|
" term = token.lemma_\n",
|
||||||
|
" terms[term] = terms.get(term, 0) + 1\n",
|
||||||
|
" return terms"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 27,
|
||||||
|
"id": "07c1122a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'programmer': 1,\n",
|
||||||
|
" 'section': 1,\n",
|
||||||
|
" 'swing': 3,\n",
|
||||||
|
" 'application': 3,\n",
|
||||||
|
" 'command': 1,\n",
|
||||||
|
" 'line': 1,\n",
|
||||||
|
" 'information': 1,\n",
|
||||||
|
" 'compilation': 1,\n",
|
||||||
|
" 'instruction': 1,\n",
|
||||||
|
" 'program': 4,\n",
|
||||||
|
" 'applet': 1,\n",
|
||||||
|
" 'step': 1,\n",
|
||||||
|
" 'release': 1,\n",
|
||||||
|
" 'platform': 1,\n",
|
||||||
|
" 'component': 1}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 27,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"extract_terms(text)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -393,14 +436,82 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 9,
|
"execution_count": 32,
|
||||||
"id": "monetary-mambo",
|
"id": "monetary-mambo",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"# Extract and count nouns, verbs and adjectives\n",
|
||||||
"def extract_terms(text):\n",
|
"def extract_terms(text):\n",
|
||||||
" return []"
|
" doc = nlp(text)\n",
|
||||||
|
" terms = {\"nouns\": {}, \"verbs\": {}, \"adjectives\": {}}\n",
|
||||||
|
" for token in doc:\n",
|
||||||
|
" if token.pos_ == 'NOUN':\n",
|
||||||
|
" term = token.lemma_\n",
|
||||||
|
" terms[\"nouns\"][term] = terms[\"nouns\"].get(term, 0) + 1\n",
|
||||||
|
" elif token.pos_ == 'VERB':\n",
|
||||||
|
" term = token.lemma_\n",
|
||||||
|
" terms[\"verbs\"][term] = terms[\"verbs\"].get(term, 0) + 1\n",
|
||||||
|
" elif token.pos_ == 'ADJ':\n",
|
||||||
|
" term = token.lemma_\n",
|
||||||
|
" terms[\"adjectives\"][term] = terms[\"adjectives\"].get(term, 0) + 1\n",
|
||||||
|
"\n",
|
||||||
|
" return terms"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 35,
|
||||||
|
"id": "1eb48136",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'adjectives': {'late': 1},\n",
|
||||||
|
" 'nouns': {'applet': 1,\n",
|
||||||
|
" 'application': 3,\n",
|
||||||
|
" 'command': 1,\n",
|
||||||
|
" 'compilation': 1,\n",
|
||||||
|
" 'component': 1,\n",
|
||||||
|
" 'information': 1,\n",
|
||||||
|
" 'instruction': 1,\n",
|
||||||
|
" 'line': 1,\n",
|
||||||
|
" 'platform': 1,\n",
|
||||||
|
" 'program': 4,\n",
|
||||||
|
" 'programmer': 1,\n",
|
||||||
|
" 'release': 1,\n",
|
||||||
|
" 'section': 1,\n",
|
||||||
|
" 'step': 1,\n",
|
||||||
|
" 'swing': 3},\n",
|
||||||
|
" 'verbs': {'compile': 3,\n",
|
||||||
|
" 'create': 1,\n",
|
||||||
|
" 'do': 1,\n",
|
||||||
|
" 'explain': 1,\n",
|
||||||
|
" 'follow': 1,\n",
|
||||||
|
" 'install': 1,\n",
|
||||||
|
" 'need': 1,\n",
|
||||||
|
" 'run': 4,\n",
|
||||||
|
" 'see': 1,\n",
|
||||||
|
" 'use': 2,\n",
|
||||||
|
" 'work': 1}}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from pprint import pprint\n",
|
||||||
|
"\n",
|
||||||
|
"pprint(extract_terms(text))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "62aeea83",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
@ -422,7 +533,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.10"
|
"version": "3.10.14"
|
||||||
},
|
},
|
||||||
"subtitle": "3. Terminologia",
|
"subtitle": "3. Terminologia",
|
||||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||||
|
File diff suppressed because one or more lines are too long
@ -55,13 +55,52 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 7,
|
||||||
"id": "documented-hacker",
|
"id": "documented-hacker",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[(10, 13), (17, 21)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
|
"import re\n",
|
||||||
|
"\n",
|
||||||
"def find_tags(text):\n",
|
"def find_tags(text):\n",
|
||||||
" return []"
|
" tags = re.finditer(r'<[^>]+>', text)\n",
|
||||||
|
" return [tag.span() for tag in tags]\n",
|
||||||
|
"\n",
|
||||||
|
"# Test the function\n",
|
||||||
|
"text = 'This is a <b>bold</b> text'\n",
|
||||||
|
"find_tags(text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "1781331d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"('<b>', '</b>')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"text[10:13], text[17:21]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -74,13 +113,28 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 9,
|
||||||
"id": "unauthorized-study",
|
"id": "unauthorized-study",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"(True, False, False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"def is_translatable(text):\n",
|
"def is_translatable(text):\n",
|
||||||
" return True"
|
" # Text is translatable if it consists solely of ASCII letters, spaces, and the punctuation marks . , ! ?\n",
|
||||||
|
" return re.fullmatch(r'[a-zA-Z .,!?]+', text) is not None\n",
|
||||||
|
"\n",
|
||||||
|
"# Test the function\n",
|
||||||
|
"is_translatable('Hello, world!'), is_translatable('Hello, 123!'), is_translatable('你好,世界!')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -93,13 +147,65 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 12,
|
||||||
"id": "beautiful-mathematics",
|
"id": "beautiful-mathematics",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[(12, 22), (28, 38), (42, 52), (56, 66), (70, 85)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"def find_dates(text):\n",
|
"def find_dates(text):\n",
|
||||||
" return []"
|
" # Find all dates in 5 formats: yyyy-mm-dd, yyyy/mm/dd, dd-mm-yyyy, dd/mm/yyyy, dd month yyyy\n",
|
||||||
|
" # yyyy-mm-dd\n",
|
||||||
|
" dates = [date.span() for date in re.finditer(r'\\b\\d{4}-\\d{2}-\\d{2}\\b', text)]\n",
|
||||||
|
" # yyyy/mm/dd\n",
|
||||||
|
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{4}/\\d{2}/\\d{2}\\b', text)]\n",
|
||||||
|
" # dd-mm-yyyy\n",
|
||||||
|
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2}-\\d{2}-\\d{4}\\b', text)]\n",
|
||||||
|
" # dd/mm/yyyy\n",
|
||||||
|
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2}/\\d{2}/\\d{4}\\b', text)]\n",
|
||||||
|
" # dd month yyyy\n",
|
||||||
|
" dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2} [a-zA-Z]+ \\d{4}\\b', text)]\n",
|
||||||
|
" return dates\n",
|
||||||
|
"\n",
|
||||||
|
"# Test the function\n",
|
||||||
|
"text = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020 or 01 January 2020'\n",
|
||||||
|
"find_dates(text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "215a4cbd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"2020-01-01\n",
|
||||||
|
"2020/01/01\n",
|
||||||
|
"01-01-2020\n",
|
||||||
|
"01/01/2020\n",
|
||||||
|
"01 January 2020\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(text[12:22])\n",
|
||||||
|
"print(text[28:38])\n",
|
||||||
|
"print(text[42:52])\n",
|
||||||
|
"print(text[56:66])\n",
|
||||||
|
"print(text[70:85])"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -125,13 +231,164 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 37,
|
||||||
"id": "finished-essex",
|
"id": "e37a24ad",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"text = 'The date is 2020-01-02, not 2020/01/02 or 02-01-2020 or 02/01/2020 or 02 January 2020'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 38,
|
||||||
|
"id": "4da1f53f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'The date is 01/02/2020, not 01/02/2020 or 02/01/2020 or 02/01/2020 or 01/02/2020'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 38,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from dateutil.parser import parse\n",
|
||||||
|
"\n",
|
||||||
|
"def change_data_to_US_format(text):\n",
|
||||||
|
" dates = find_dates(text)\n",
|
||||||
|
"\n",
|
||||||
|
" for start, end in dates:\n",
|
||||||
|
" date = text[start:end]\n",
|
||||||
|
" try:\n",
|
||||||
|
" new_date = parse(date).strftime('%m/%d/%Y')\n",
|
||||||
|
" text = text[:start] + new_date + text[end:]\n",
|
||||||
|
" except:\n",
|
||||||
|
" pass\n",
|
||||||
|
" return text\n",
|
||||||
|
"\n",
|
||||||
|
"# Test the function\n",
|
||||||
|
"change_data_to_US_format(text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 39,
|
||||||
|
"id": "8a2bf3a3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'The date is 02/01/2020, not 02/01/2020 or 01/02/2020 or 01/02/2020 or 02/01/2020'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 39,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from dateutil.parser import parse\n",
|
||||||
|
"\n",
|
||||||
|
"def change_data_to_EU_format(text):\n",
|
||||||
|
" dates = find_dates(text)\n",
|
||||||
|
"\n",
|
||||||
|
" for start, end in dates:\n",
|
||||||
|
" date = text[start:end]\n",
|
||||||
|
" try:\n",
|
||||||
|
" new_date = parse(date).strftime('%d/%m/%Y')\n",
|
||||||
|
" text = text[:start] + new_date + text[end:]\n",
|
||||||
|
" except:\n",
|
||||||
|
" pass\n",
|
||||||
|
" return text\n",
|
||||||
|
"\n",
|
||||||
|
"# Test the function\n",
|
||||||
|
"change_data_to_EU_format(text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 41,
|
||||||
|
"id": "e1c63075",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'The date is 2020.01.02, not 2020.01.02 or 2020.02.01 or 2020.02.01 or 2020.01.02'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 41,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from dateutil.parser import parse\n",
|
||||||
|
"\n",
|
||||||
|
"def change_data_to_digit_dot_format(text):\n",
|
||||||
|
" dates = find_dates(text)\n",
|
||||||
|
"\n",
|
||||||
|
" for start, end in dates:\n",
|
||||||
|
" date = text[start:end]\n",
|
||||||
|
" try:\n",
|
||||||
|
" new_date = parse(date).strftime('%Y.%m.%d')\n",
|
||||||
|
" text = text[:start] + new_date + text[end:]\n",
|
||||||
|
" except:\n",
|
||||||
|
" pass\n",
|
||||||
|
" return text\n",
|
||||||
|
"\n",
|
||||||
|
"# Test the function\n",
|
||||||
|
"change_data_to_digit_dot_format(text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 45,
|
||||||
|
"id": "finished-essex",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'Data jest 01/01/2020, a nie 01/01/2020 lub 01/01/2020 lub 01/01/2020'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 45,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"def correct_dates(source_segment, target_segment, date_format):\n",
|
"def correct_dates(source_segment, target_segment, date_format):\n",
|
||||||
" return ''"
|
" # Check if number of dates in source and target segments are the same\n",
|
||||||
|
" assert len(find_dates(source_segment)) == len(find_dates(target_segment))\n",
|
||||||
|
"\n",
|
||||||
|
" # Check if all dates are the same (ignore the format)\n",
|
||||||
|
" source_dates = find_dates(source_segment)\n",
|
||||||
|
" target_dates = find_dates(target_segment)\n",
|
||||||
|
" for source_date, target_date in zip(source_dates, target_dates):\n",
|
||||||
|
" assert change_data_to_US_format(source_segment[source_date[0]:source_date[1]]) == change_data_to_US_format(target_segment[target_date[0]:target_date[1]]), f\"Dates are different: {source_segment[source_date[0]:source_date[1]]} and {target_segment[target_date[0]:target_date[1]]}\"\n",
|
||||||
|
"\n",
|
||||||
|
" # Change the format of dates in the target segment\n",
|
||||||
|
" if date_format == 'US':\n",
|
||||||
|
" target_segment = change_data_to_US_format(target_segment)\n",
|
||||||
|
" elif date_format == 'EU':\n",
|
||||||
|
" target_segment = change_data_to_EU_format(target_segment)\n",
|
||||||
|
" elif date_format == 'digit.dot':\n",
|
||||||
|
" target_segment = change_data_to_digit_dot_format(target_segment)\n",
|
||||||
|
"\n",
|
||||||
|
" return target_segment\n",
|
||||||
|
"\n",
|
||||||
|
"# Test the function\n",
|
||||||
|
"source_segment = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020'\n",
|
||||||
|
"target_segment = \"Data jest 01/01/2020, a nie 2020-01-01 lub 01-01-2020 lub 01/01/2020\"\n",
|
||||||
|
"correct_dates(source_segment, target_segment, 'US')"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -176,13 +433,84 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 60,
|
||||||
"id": "romance-judge",
|
"id": "romance-judge",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"import math\n",
|
||||||
|
"\n",
|
||||||
"def transfer_tags(source_segment, target_segment):\n",
|
"def transfer_tags(source_segment, target_segment):\n",
|
||||||
" return ''"
|
" # Split the segments into tokens\n",
|
||||||
|
" source_tokens = source_segment.split()\n",
|
||||||
|
" target_tokens = target_segment.split()\n",
|
||||||
|
"\n",
|
||||||
|
" # Calculate the ratio of the number of tokens in the target to the number of tokens in the source\n",
|
||||||
|
" ratio = len(target_tokens) / len(source_tokens)\n",
|
||||||
|
"\n",
|
||||||
|
" # Assign tags to tokens in the target tokens - if the source token has a tag, assign it to the corresponding token in the target tokens\n",
|
||||||
|
" for i, source_token in enumerate(source_tokens):\n",
|
||||||
|
" if re.match(r'<[^>]+>', source_token):\n",
|
||||||
|
" target_index = math.ceil(i * ratio)\n",
|
||||||
|
"\n",
|
||||||
|
" if target_index >= len(target_tokens):\n",
|
||||||
|
" target_index = len(target_tokens) - 1\n",
|
||||||
|
"\n",
|
||||||
|
" # Assign start tag\n",
|
||||||
|
" target_tokens[target_index] = re.findall(r'<[^>]+>', source_token)[0] + target_tokens[target_index]\n",
|
||||||
|
"\n",
|
||||||
|
" # Assign end tag\n",
|
||||||
|
" target_tokens[target_index] = target_tokens[target_index] + re.findall(r'</[^>]+>', source_token)[0]\n",
|
||||||
|
"\n",
|
||||||
|
" return ' '.join(target_tokens)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 61,
|
||||||
|
"id": "fd8858d8",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'To jest <b>ważny</b> tekst'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 61,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Test the function (same number of tokens)\n",
|
||||||
|
"source_segment = 'This is <b>bold</b> text'\n",
|
||||||
|
"target_segment = 'To jest ważny tekst'\n",
|
||||||
|
"transfer_tags(source_segment, target_segment)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 62,
|
||||||
|
"id": "de9e6298",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'To jest bardzo <b>ważny</b> tekst'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 62,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Test the function (different number of tokens)\n",
|
||||||
|
"source_segment = 'This is <b>bold</b> text'\n",
|
||||||
|
"target_segment = 'To jest bardzo ważny tekst'\n",
|
||||||
|
"transfer_tags(source_segment, target_segment)"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -205,7 +533,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.10"
|
"version": "3.10.14"
|
||||||
},
|
},
|
||||||
"subtitle": "6,7. Preprocessing i postprocessing",
|
"subtitle": "6,7. Preprocessing i postprocessing",
|
||||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||||
|
205
lab/lab_08.ipynb
205
lab/lab_08.ipynb
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
432
lab/lab_11.ipynb
432
lab/lab_11.ipynb
@ -57,8 +57,28 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"import regex\n",
|
||||||
|
"\n",
|
||||||
"def sentence_split(text):\n",
|
"def sentence_split(text):\n",
|
||||||
" return []"
|
" # Regular expression pattern to match sentence-ending punctuation followed by a space and an uppercase letter\n",
|
||||||
|
" pattern = regex.compile(r'(?<=[.!?])\\s+(?=\\p{Lu})', regex.UNICODE)\n",
|
||||||
|
" \n",
|
||||||
|
" # Split the text using the defined pattern\n",
|
||||||
|
" segments = regex.split(pattern, text)\n",
|
||||||
|
"\n",
|
||||||
|
" # Remove leading and trailing whitespace from each segment\n",
|
||||||
|
" segments = [segment.strip() for segment in segments]\n",
|
||||||
|
"\n",
|
||||||
|
" # Replace multiple newlines with a single newline\n",
|
||||||
|
" segments = [regex.sub(r'\\n+', '\\n', segment) for segment in segments]\n",
|
||||||
|
"\n",
|
||||||
|
" # Replace multiple spaces with a single space\n",
|
||||||
|
" segments = [regex.sub(r'\\s+', ' ', segment) for segment in segments]\n",
|
||||||
|
"\n",
|
||||||
|
" # Remove empty segments\n",
|
||||||
|
" segments = [segment for segment in segments if segment]\n",
|
||||||
|
" \n",
|
||||||
|
" return segments"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -71,13 +91,129 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 2,
|
||||||
"id": "guilty-morocco",
|
"id": "guilty-morocco",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Segment 1: Wydział Matematyki i Informatyki | Wydział Matematyki i Informatyki Brak obsługi JavaScript Do pełnej funkcjonalności strony potrzebujesz włączonej obsługi skryptów.\n",
|
||||||
|
"Segment 2: Instrukcje, które pozwolą Ci włączyć skrypty w Twojej przeglądarce znajdziesz tutaj Przejdź do TreśćPrzejdź do Menu głównePrzejdź do Mapa serwisuPrzejdź do Dostępność A A A en pl Wyszukaj Wyszukaj Nawigacja mobilna Wydział - Wydział Matematyki i Informatyki Wydział - Wydział Matematyki i Informatyki NO XML TR1A Wydział Pokaż menu szczegółowe Powrót do głównego menu O wydziale Władze wydziału Struktura wydziału Rada Naukowa Dyscyplin Rady programowe Pracownicy Projekty Historia Biblioteka wydziałowa Informator WMI w mediach Wybory 2024 Kontakt Życie naukowe Pokaż menu szczegółowe Powrót do głównego menu Awanse naukowe Wykłady i seminaria Cykle wykładów Towarzystwa i redakcje Konferencje Doktorzy honoris causa Profesorowie Członkowie Akademii Konkurs im.\n",
|
||||||
|
"Segment 3: Edyty Szymańskiej Dla Kandydata Pokaż menu szczegółowe Powrót do głównego menu Rekrutacja krok po kroku Studia I stopnia Studia II stopnia Studia doktoranckie Studia podyplomowe Akademia CISCO Samorząd studencki Koła i organizacje studenckie Uniwersytet Otwarty Dla Studenta Dla Pracownika Dla szkół Pokaż menu szczegółowe Powrót do głównego menu Edukacja matematyczno-informatyczna Współpraca ze szkołami Współpraca Pokaż menu szczegółowe Powrót do głównego menu Współpraca z biznesem Współpraca ze szkołami Targi pracy i staży branży IT Oferty pracy 30-LECIE Pokaż menu szczegółowe Powrót do głównego menu Harmonogram Wykłady naukowe z okazji 30-lecia WMI Wydarzenia KWUMI Galeria Zjazd Absolwentów powrót do góry Uniwersytet im.\n",
|
||||||
|
"Segment 4: Adama Mickiewicza w PoznaniuIntranet pracownikaIntranet studenta Stypendium dla olimpijczykówJesteś laureatem lub finalistą olimpiady przedmiotowej?\n",
|
||||||
|
"Segment 5: Sprawdź jak uzyskać stypendium!Czytaj więcej Jubileusz 30-leciaWydziału Matematyki i Informatyki UAMCzytaj więcej Z okazji 30-lecia wydziału22 czerwca 2024 r. serdecznie zapraszamy na zjazd absolwentówZAREJESTRUJ SIĘ!\n",
|
||||||
|
"Segment 6: Data on CampusZapraszamy na Data on Campus #1Czytaj więcejStypendium dla olimpijczykówSprawdź!Jubileusz 30-leciaWydziału Matematyki i InformatykiZjazd absolwentów22 czerwca 2024 r.Data on Campus #18 czerwca 2024 r.\n",
|
||||||
|
"Segment 7: Wiadomości Absolutorium 2024 28 maja 2024 IX edycja konkursu Study@research - laureaci z WMI 21 maja 2024 Pozytywna ocena PKA dla kierunku informatyka 14 maja 2024 Wyjazdowa Rada Pracodawców 13 maja 2024 Sportowe sukcesy WMI 09 maja 2024 Czytaj więcej Wydarzenia 5 czerwca 2024 Publiczna obrona rozprawy doktorskiej mgra Tomasza Ziętkiewicza 8 czerwca 2024 Data on Campus #1 10 czerwca 2024 Wykład 23: Grafowe modele sieci społecznościowych, czyli o światach dużych i małych 11 czerwca 2024 Wykład nr 24: O zbiorach rozmytych, czyli o tym, jak nauczyć komputer rozumieć oraz wykorzystywać informację nieprecyzyjną 13 czerwca 2024 Wykład nr 25: Jak z dwóch kryształów otrzymać jeden, czyli o dodawaniu i odejmowaniu wielościanów 15 czerwca 2024 Ultimate Hackathon Mission 3.0 Czytaj więcej O wydziale Jako jednostka uczelni badawczej, Wydział Matematyki i Informatyki UAM w Poznaniu kontynuuje ponad 100-letnią tradycję poznańskiej matematyki.\n",
|
||||||
|
"Segment 8: Jest też jednym z najlepszych ośrodków badawczo-dydaktycznych w zakresie informatyki w Polsce.\n",
|
||||||
|
"Segment 9: Obecnie Wydział prowadzi studia na czterech kierunkach: matematyce, informatyce, analizie i przetwarzaniu danych oraz na nauczaniu matematyki i informatyki.\n",
|
||||||
|
"Segment 10: Ostatni z wymienionych kierunków stanowi ofertę wyjątkową w skali całego kraju.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"def sentence_split_enhanced(text):\n",
|
"import requests\n",
|
||||||
" return []"
|
"from bs4 import BeautifulSoup\n",
|
||||||
|
"\n",
|
||||||
|
"def fetch_webpage_content(url):\n",
|
||||||
|
" response = requests.get(url)\n",
|
||||||
|
" response.raise_for_status() # Raise an exception for HTTP errors\n",
|
||||||
|
" soup = BeautifulSoup(response.content, 'html.parser')\n",
|
||||||
|
" return soup.get_text()\n",
|
||||||
|
"\n",
|
||||||
|
"url = \"https://wmi.amu.edu.pl/\"\n",
|
||||||
|
"webpage_content = fetch_webpage_content(url)\n",
|
||||||
|
"\n",
|
||||||
|
"import re\n",
|
||||||
|
"import unicodedata\n",
|
||||||
|
"\n",
|
||||||
|
"segments = sentence_split(webpage_content)\n",
|
||||||
|
"for i, segment in enumerate(segments[:10]):\n",
|
||||||
|
" print(f\"Segment {i+1}: {segment}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "3cd97d83",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Wyjątek 1: Skróty zakończone kropką\n",
|
||||||
|
"Skróty takie jak \"mgr.\", \"prof.\", \"dr.\" mogą powodować niepotrzebne podziały segmentów. Musimy upewnić się, że algorytm nie dzieli zdania po skrótach."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "fd509273",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Wyjątek 2: Daty i inne liczby zakończone kropką\n",
|
||||||
|
"Daty, takie jak \"22 czerwca 2024 r.\", mogą również powodować nieprawidłowe podziały. Musimy uwzględnić takie przypadki."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "20b69c09",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Segment 1: Wydział Matematyki i Informatyki | Wydział Matematyki i Informatyki Brak obsługi JavaScript Do pełnej funkcjonalności strony potrzebujesz włączonej obsługi skryptów.\n",
|
||||||
|
"Segment 2: Instrukcje, które pozwolą Ci włączyć skrypty w Twojej przeglądarce znajdziesz tutaj Przejdź do TreśćPrzejdź do Menu głównePrzejdź do Mapa serwisuPrzejdź do Dostępność A A A en pl Wyszukaj Wyszukaj Nawigacja mobilna Wydział - Wydział Matematyki i Informatyki Wydział - Wydział Matematyki i Informatyki NO XML TR1A Wydział Pokaż menu szczegółowe Powrót do głównego menu O wydziale Władze wydziału Struktura wydziału Rada Naukowa Dyscyplin Rady programowe Pracownicy Projekty Historia Biblioteka wydziałowa Informator WMI w mediach Wybory 2024 Kontakt Życie naukowe Pokaż menu szczegółowe Powrót do głównego menu Awanse naukowe Wykłady i seminaria Cykle wykładów Towarzystwa i redakcje Konferencje Doktorzy honoris causa Profesorowie Członkowie Akademii Konkurs im. Edyty Szymańskiej Dla Kandydata Pokaż menu szczegółowe Powrót do głównego menu Rekrutacja krok po kroku Studia I stopnia Studia II stopnia Studia doktoranckie Studia podyplomowe Akademia CISCO Samorząd studencki Koła i organizacje studenckie Uniwersytet Otwarty Dla Studenta Dla Pracownika Dla szkół Pokaż menu szczegółowe Powrót do głównego menu Edukacja matematyczno-informatyczna Współpraca ze szkołami Współpraca Pokaż menu szczegółowe Powrót do głównego menu Współpraca z biznesem Współpraca ze szkołami Targi pracy i staży branży IT Oferty pracy 30-LECIE Pokaż menu szczegółowe Powrót do głównego menu Harmonogram Wykłady naukowe z okazji 30-lecia WMI Wydarzenia KWUMI Galeria Zjazd Absolwentów powrót do góry Uniwersytet im. Adama Mickiewicza w PoznaniuIntranet pracownikaIntranet studenta Stypendium dla olimpijczykówJesteś laureatem lub finalistą olimpiady przedmiotowej?\n",
|
||||||
|
"Segment 3: Sprawdź jak uzyskać stypendium!Czytaj więcej Jubileusz 30-leciaWydziału Matematyki i Informatyki UAMCzytaj więcej Z okazji 30-lecia wydziału22 czerwca 2024 r. serdecznie zapraszamy na zjazd absolwentówZAREJESTRUJ SIĘ!\n",
|
||||||
|
"Segment 4: Data on CampusZapraszamy na Data on Campus #1Czytaj więcejStypendium dla olimpijczykówSprawdź!Jubileusz 30-leciaWydziału Matematyki i InformatykiZjazd absolwentów22 czerwca 2024 r.Data on Campus #18 czerwca 2024 r. Wiadomości Absolutorium 2024 28 maja 2024 IX edycja konkursu Study@research - laureaci z WMI 21 maja 2024 Pozytywna ocena PKA dla kierunku informatyka 14 maja 2024 Wyjazdowa Rada Pracodawców 13 maja 2024 Sportowe sukcesy WMI 09 maja 2024 Czytaj więcej Wydarzenia 5 czerwca 2024 Publiczna obrona rozprawy doktorskiej mgra Tomasza Ziętkiewicza 8 czerwca 2024 Data on Campus #1 10 czerwca 2024 Wykład 23: Grafowe modele sieci społecznościowych, czyli o światach dużych i małych 11 czerwca 2024 Wykład nr 24: O zbiorach rozmytych, czyli o tym, jak nauczyć komputer rozumieć oraz wykorzystywać informację nieprecyzyjną 13 czerwca 2024 Wykład nr 25: Jak z dwóch kryształów otrzymać jeden, czyli o dodawaniu i odejmowaniu wielościanów 15 czerwca 2024 Ultimate Hackathon Mission 3.0 Czytaj więcej O wydziale Jako jednostka uczelni badawczej, Wydział Matematyki i Informatyki UAM w Poznaniu kontynuuje ponad 100-letnią tradycję poznańskiej matematyki.\n",
|
||||||
|
"Segment 5: Jest też jednym z najlepszych ośrodków badawczo-dydaktycznych w zakresie informatyki w Polsce.\n",
|
||||||
|
"Segment 6: Obecnie Wydział prowadzi studia na czterech kierunkach: matematyce, informatyce, analizie i przetwarzaniu danych oraz na nauczaniu matematyki i informatyki.\n",
|
||||||
|
"Segment 7: Ostatni z wymienionych kierunków stanowi ofertę wyjątkową w skali całego kraju.\n",
|
||||||
|
"Segment 8: W ofercie Wydziału można także znaleźć studia podyplomowe. 4 kierunki studiów 1700+ studentów 6000+ absolwentów Studia I stopnia Matematyka Fascynuje Cię królowa nauk?\n",
|
||||||
|
"Segment 9: Jesteś umysłem ścisłym?\n",
|
||||||
|
"Segment 10: Chcesz studiować matematykę na wiodącej uczelni w Polsce?\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import regex\n",
|
||||||
|
"\n",
|
||||||
|
"def enhanced_sentence_split(text):\n",
|
||||||
|
" # Lista wyjątków, po których nie dzielimy nawet jeśli jest kropka\n",
|
||||||
|
" exceptions = ['r.', 'tzn.', 'np.', 'itp.', 'etc.', 'dr.', 'prof.', 'im.']\n",
|
||||||
|
"\n",
|
||||||
|
" # Regular expression pattern to match sentence-ending punctuation followed by a space and an uppercase letter\n",
|
||||||
|
" pattern = regex.compile(r'(?<=[.!?])\\s+(?=\\p{Lu})', regex.UNICODE)\n",
|
||||||
|
"\n",
|
||||||
|
" # Split the text using the defined pattern\n",
|
||||||
|
" segments = regex.split(pattern, text)\n",
|
||||||
|
"\n",
|
||||||
|
" # Remove leading and trailing whitespace from each segment\n",
|
||||||
|
" segments = [segment.strip() for segment in segments]\n",
|
||||||
|
"\n",
|
||||||
|
" # Rejoin segments that were incorrectly split due to exceptions\n",
|
||||||
|
" i = 0\n",
|
||||||
|
" while i < len(segments) - 1:\n",
|
||||||
|
" for exception in exceptions:\n",
|
||||||
|
" if segments[i].endswith(exception):\n",
|
||||||
|
" segments[i] += ' ' + segments.pop(i + 1)\n",
|
||||||
|
" break\n",
|
||||||
|
" else:\n",
|
||||||
|
" i += 1\n",
|
||||||
|
"\n",
|
||||||
|
" # Replace multiple newlines with a single newline\n",
|
||||||
|
" segments = [regex.sub(r'\\n+', '\\n', segment) for segment in segments]\n",
|
||||||
|
"\n",
|
||||||
|
" # Replace multiple spaces with a single space\n",
|
||||||
|
" segments = [regex.sub(r'\\s+', ' ', segment) for segment in segments]\n",
|
||||||
|
"\n",
|
||||||
|
" # Remove empty segments\n",
|
||||||
|
" segments = [segment for segment in segments if segment]\n",
|
||||||
|
"\n",
|
||||||
|
" return segments\n",
|
||||||
|
"\n",
|
||||||
|
"segments = enhanced_sentence_split(webpage_content)\n",
|
||||||
|
"for i, segment in enumerate(segments[:10]):\n",
|
||||||
|
" print(f\"Segment {i+1}: {segment}\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -122,46 +258,145 @@
|
|||||||
"id": "divided-chain",
|
"id": "divided-chain",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"XLIFF jest formatem do przechowywania pamięci tłumaczeń, który opiera się na XML-u. Przykładowy plik XLIFF wygląda następująco:"
|
"XLIFF jest formatem do przechowywania pamięci tłumaczeń, który opiera się na XML-u"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "raw",
|
"cell_type": "code",
|
||||||
"id": "appropriate-timber",
|
"execution_count": 4,
|
||||||
|
"id": "169d0134",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
|
"text_hu = fetch_webpage_content(\"https://hu.wikipedia.org/wiki/Sz%C3%A1m%C3%ADt%C3%A1studom%C3%A1ny\")\n",
|
||||||
"<xliff xmlns=\"urn:oasis:names:tc:xliff:document:1.2\" version=\"1.2\">\n",
|
"text_en = fetch_webpage_content(\"https://en.wikipedia.org/wiki/Computer_science\")"
|
||||||
" <file datatype=\"plaintext\" original=\"self\" source-language=\"en\" target-language=\"es\">\n",
|
]
|
||||||
" <header>\n",
|
},
|
||||||
" <sxmd:metadata xmlns:sxmd=\"urn:x-sap:mlt:xliff12:metadata:1.0\" xmlns=\"urn:x-sap:mlt:tsmetadata:1.0\">\n",
|
{
|
||||||
" <object-name>sample</object-name>\n",
|
"cell_type": "code",
|
||||||
" <collection>KWT</collection>\n",
|
"execution_count": 5,
|
||||||
" <domain>KWT</domain>\n",
|
"id": "f3549418",
|
||||||
" <developer>123</developer>\n",
|
"metadata": {},
|
||||||
" <description>sample XLIFF file</description>\n",
|
"outputs": [],
|
||||||
" </sxmd:metadata>\n",
|
"source": [
|
||||||
" </header>\n",
|
"hu_segments = enhanced_sentence_split(text_hu)\n",
|
||||||
" <body>\n",
|
"en_segments = enhanced_sentence_split(text_en)"
|
||||||
" <trans-unit>\n",
|
]
|
||||||
" <source>Hello world!</source>\n",
|
},
|
||||||
" <target>Hola mundo!</target>\n",
|
{
|
||||||
" </trans-unit>\n",
|
"cell_type": "code",
|
||||||
" <trans-unit>\n",
|
"execution_count": 6,
|
||||||
" <source>File</source>\n",
|
"id": "143730e4",
|
||||||
" <target>Archivo</target>\n",
|
"metadata": {},
|
||||||
" </trans-unit>\n",
|
"outputs": [
|
||||||
" <trans-unit>\n",
|
{
|
||||||
" <source>New</source>\n",
|
"data": {
|
||||||
" <target>Nuevo</target>\n",
|
"text/plain": [
|
||||||
" </trans-unit>\n",
|
"['Számítástudomány – Wikipédia Ugrás a tartalomhoz Főmenü Főmenü áthelyezés az oldalsávba elrejtés Navigáció KezdőlapTartalomKiemelt szócikkekFriss változtatásokLap találomraTudakozó Részvétel KezdőknekSegítségKözösségi portálKapcsolatfelvételAdományok Keresés Keresés Fiók létrehozása Bejelentkezés Személyes eszközök Fiók létrehozása Bejelentkezés Lapok kijelentkezett szerkesztőknek további információk KözreműködésekVitalap Tartalomjegyzék áthelyezés az oldalsávba elrejtés Bevezető 1Vizsgálati területei A(z) Vizsgálati területei alszakasz kinyitása/becsukása 1.1Számítástudomány 1.2Számítógép-tudomány 2Története és alágai 3Kapcsolódó szócikkek 4Jegyzetek 5További információk Tartalomjegyzék kinyitása/becsukása Számítástudomány 161 nyelv EnglishAfrikaansAlemannischአማርኛAragonésالعربيةمصرىঅসমীয়াAsturianuAzərbaycancaتۆرکجهБашҡортсаBoarischŽemaitėškaBikol CentralБеларускаяБеларуская (тарашкевіца)БългарскиभोजपुरीবাংলাBrezhonegBosanskiCatalàکوردیCorsuČeštinaKaszëbscziCymraegDanskDeutschZazakiΕλληνικάEmiliàn e rumagnòlEsperantoEspañolEestiEuskaraEstremeñuفارسیSuomiVõroFøroysktFrançaisNordfriiskFurlanFryskGaeilgeKriyòl gwiyannenGalegoGaelg客家語/Hak-kâ-ngîעבריתहिन्दीHrvatskiKreyòl ayisyenՀայերենInterlinguaBahasa IndonesiaInterlingueÍslenskaItalianoᐃᓄᒃᑎᑐᑦ / inuktitut日本語La .lojban.ქართულიQaraqalpaqshaTaqbaylitҚазақшаKalaallisutភាសាខ្មែរಕನ್ನಡ한국어KurdîLatinaLadinoLëtzebuergeschLigureLombardLietuviųLatgaļuLatviešuМокшеньMalagasyОлык марийMinangkabauМакедонскиമലയാളംМонголꯃꯤꯇꯩ ꯂꯣꯟBahasa MelayuMirandésမြန်မာဘာသာNapulitanoNedersaksiesनेपालीनेपाल भाषाNederlandsNorsk nynorskNorsk bokmålNovialNouormandOccitanOromooଓଡ଼ିଆPicardपालिPolskiPiemontèisپښتوPortuguêsRuna SimiRomânăArmãneashtiРусскийРусиньскыйसंस्कृतम्Саха тылаSarduSicilianuScotsسنڌيSrpskohrvatski / српскохрватскиTaclḥitၽႃႇသႃႇတႆး සිංහලSimple EnglishSlovenčinaSlovenščinaShqipСрпски / srpskiSeelterskSundaSvenskaKiswahiliதமிழ்తెలుగుTetunТоҷикӣไทยትግርኛTagalogTok PisinTürkçeТатарча / tatarçaئۇيغۇرچە / UyghurcheУкраїнськаاردوOʻzbekcha / 
ўзбекчаVènetoTiếng ViệtWalonWinarayWolof吴语მარგალურიYorùbáZeêuws中文閩南語 / Bân-lâm-gú粵語IsiZulu Hivatkozások szerkesztése SzócikkVitalap magyar OlvasásSzerkesztésLaptörténet Eszközök Eszközök áthelyezés az oldalsávba elrejtés Műveletek OlvasásSzerkesztésLaptörténet Általános Mi hivatkozik erre?Kapcsolódó változtatásokSpeciális lapokHivatkozás erre a változatraLapinformációkHogyan hivatkozz erre a lapra?Rövidített URL készítéseQR-kód letöltéseWikidata-adatlap Nyomtatás/\\u200bexportálás Könyv készítéseLetöltés PDF-kéntNyomtatható változat Társprojektek Wikimédia Commons A Wikipédiából, a szabad enciklopédiából Ez a közzétett változat, ellenőrizve: 2023. június 30.Pontosságellenőrzött Lásd még: Informatika A számítástudomány (computing science) és a számítógép-tudomány (computer science) egymáshoz nagyon közeli, egymást majdnem teljesen átfedő és szorosan összefüggő területeket ölel fel, ezért tárgyalásuk csak együttesen értelmezhető.',\n",
|
||||||
" <trans-unit>\n",
|
" 'Mindkét tudományág lényege, hogy az információkezelést és -feldolgozást állítja vizsgálata fókuszába elméleti és gyakorlati megközelítésben.',\n",
|
||||||
" <source>Exit</source>\n",
|
" 'Kialakulása az 1940-es években kezdődött, nemcsak időben egybeesve, de szoros kapcsolatban is az első elektronikus számítógépek tervezésével.',\n",
|
||||||
" <target>Salir</target>\n",
|
" 'A számítástudomány nem azonos sem az informatikával, sem a számítástechnikával (főleg ha a szilíciumcsipek gyártásának technikáját is ideértjük), sem pedig az információelmélettel, bár vannak kisebb-nagyobb átfedések.',\n",
|
||||||
" </trans-unit>\n",
|
" 'A számítástudománynak nem feladata konkrét szoftverek fejlesztése, bár foglalkozik azzal, miképp lehet a szoftverek hatékony tervezését segíteni, és ennek milyen elméleti alapjai vannak.',\n",
|
||||||
" </body>\n",
|
" 'Nem feladata konkrét információfeldolgozó gépek tervezése, bár szintén foglalkozik azzal, hogyan lehet ezek hatékonyságát elméleti szinten növelni; végképp nem feladata pedig ezek megépítése, bár a tudományág úttörői, mint Alan Turing vagy Neumann János, munkatársként részt vettek a számítógépek korai modelljeinek építésében, kialakításában is (elméleti munkásságukkal szoros kapcsolatban).',\n",
|
||||||
" </file>\n",
|
" 'Vizsgálati területei[szerkesztés] Számítástudomány[szerkesztés] A számítástudomány[1][2] a matematika egyik, igen fiatal tudományága, amely az információfeldolgozó gépek (például számítógépek) tervezésének és működtetésének elméleti, matematikai alapjaival foglalkozik.[3] Némileg elnagyoltan az algoritmusok általános elméletének is nevezhető.[4] „A számítógépek megjelenése, a mechanikus számítási eljárások megindították az algoritmus definíciójának és a programok írásmódjának formalizálását, az algoritmusok és programok szintaktikai (utasítások, vezérlési struktúra), szemantikai (helyesség, ekvivalencia), valamint kiszámíthatósági (a bemeneti értékekhez tartozó kiszámítási idő és memóriaszükséglet) tulajdonságainak mélyreható vizsgálatát.',\n",
|
||||||
"</xliff>"
|
" 'E kutatási területeket összefoglalóan matematikai számítástudománynak nevezzük.”[5][6] Az információkezelés és -feldolgozás matematikai alapjai köré csoportosul, és a számítások alapvető természetének megértésére irányul, mely számos alkalmazáshoz vezet a hatékony algoritmusok elemzésében és tervezésében, valamint a megbízható hardver- és szoftverrendszerek tervezésére és ellenőrzésére szolgáló formális módszerek fejlesztésében.',\n",
|
||||||
|
" 'Elméleti alapjai: az automataelmélet, a fordítóprogramok, az adatbázis-elmélet.',\n",
|
||||||
|
" 'Gyakorlati területei: a számítógépes irányítás és szabályozás, a nagy rendszerek analízise és szintézise, a mérnöki tervezés.',\n",
|
||||||
|
" 'Ezek alapjait a halmazok, ítéletek, relációk, függvények, a számelmélet, a különböző algebrai struktúrák, azon belül főként a Boole-algebra adja.',\n",
|
||||||
|
" 'Fontos része a kódelmélet, azon belül a zajmentes és zajos csatornák, az optimális és hibajavító kódolás alapelemei, az automaták és formális nyelvek elmélete, a párhuzamos és elosztott számítási rendszerek elmélete, valamint az algebra, a logika és a kategóriák a számítástudományban.',\n",
|
||||||
|
" 'Komplex vizsgálati területe a kiszámíthatóságelmélet, valamint annak kiterjesztése, a bonyolultságelmélet, mely azt vizsgálja, miképp lehet osztályozni az algoritmikusan megoldható problémákat, feladatokat a megoldásukhoz szükséges erőforrások mennyisége szerint.',\n",
|
||||||
|
" 'A számítógép-tudománnyal átfedésben levő átmeneti elemei: az adatstruktúrák, az algoritmusok, a programozási nyelvek, a szoftvertechnológia, a mesterséges intelligencia, az adatbázis-kezelés.',\n",
|
||||||
|
" 'Számítógép-tudomány[szerkesztés] A számítógép-tudomány[7][8][9] tárgya maga a számítógép mint eszköz; az információfeldolgozó gépek tervezésének és használatának elméleti kérdéseit kutatja.',\n",
|
||||||
|
" 'A matematika egyik igen fiatal tudományága, amely az információfeldolgozó gépek (például számítógépek) tervezésének és működtetésének elméleti, matematikai alapjaival foglalkozik.',\n",
|
||||||
|
" 'Némileg elnagyoltan az algoritmusok általános elméletének is nevezhető.[10] Eredményei és tárgya közé tartoznak a számításokat végző rendszerek és módszerek megértésével, tervezési módszerekkel, algoritmusokkal és eszközökkel, a fogalmak tesztelésének, valamint az analízisnek és verifikációnak módszereivel, a tudásreprezentációval és ennek implementációjával foglalkozó elméletek.',\n",
|
||||||
|
" 'Komplex vizsgálati területei a véges automaták, valamint a veremautomaták, mint a Turing-gép speciális esetei.',\n",
|
||||||
|
" 'Fő elemei: az algoritmusok és adatszerkezetek, a programozási módszertan és nyelvek, valamint a számítógépes elemek és architektúrák.',\n",
|
||||||
|
" 'Története és alágai[szerkesztés] A számítógép-tudomány a matematika egyik legkésőbb, mintegy fél évszázada önállósult ága.',\n",
|
||||||
|
" 'Keletkezését 1936-tól, Alan Turing angol matematikus automata- és algoritmuselméleti cikkeinek megjelenésétől, illetve Neumann János, Stephen Cole Kleene, Andrej Markov, George H.',\n",
|
||||||
|
" 'Mealy, Edward Forrest Moore, Emil Post, Kurt Gödel, John McCarthy és más kutatók hasonló jellegű munkáinak napvilágra kerülésétől kezdve számíthatjuk.',\n",
|
||||||
|
" 'A számítógép-tudomány fejlődése rendkívül gyors, a legtöbb alágnak azonban már van kialakult és közmegegyezéses jellegű elnevezése és feladatköre.',\n",
|
||||||
|
" 'Néhány alága, elméletcsoportja:[11] kiszámíthatóságelmélet, rekurzióelmélet: az algoritmusok futásának befejeződését, eredményes lefutásának lehetőségét és viszonyait vizsgálja,[12] más szavakkal: egyes függvényeknek, műveleteknek más függvényekkel való kiszámíthatóságával foglalkozik, tekinthető a számításelmélet egy olyan ágának vagy testvérterületének is, mely Turing-gépek és automaták helyett hagyományos matematikai fogalmakra (függvény, generált struktúra stb.) alapoz.',\n",
|
||||||
|
" 'E terület úttörője Stephen Cole Kleene volt (érdekesség, hogy a matematikai logika részének is tekinthető).[13] A bonyolultságelmélet a kiszámíthatóságelmélet kiterjesztése.',\n",
|
||||||
|
" 'Azt vizsgálja, hogyan lehet osztályozni az algoritmikusan megoldható problémákat, feladatokat a megoldásukhoz szükséges erőforrások mennyisége szerint.[14] automataelmélet,[8] számításelmélet, bonyolultságelmélet vagy komplexitáselmélet: formális nyelvek, formális nyelvtanok és automaták elmélete: ide sorolhatóak a generatív nyelvtanok, általánosabban a produkciós rendszerek, az automatatípusok által generált és elfogadott nyelvek vizsgálata, az egyes automatatípusok összehasonlítása.',\n",
|
||||||
|
" 'Ennek az alágnak rengeteg fontos kutatója volt mind nyugaton, mind a Szovjetunióban, ill.',\n",
|
||||||
|
" 'Oroszországban.',\n",
|
||||||
|
" 'Fontos terület a Turing-gépek és hasonló automaták elmélete, mégpedig az ezek által futtatott algoritmusok idő-és memóriaigényének vizsgálata.',\n",
|
||||||
|
" 'Központi problémája a hatékonysági vagy bonyolultsági osztályok (P, NP stb.) közti kapcsolatok megállapítása, illetve az indeterminisztikus algoritmusok vizsgálata és alkalmazása; absztrakt adatszerkezetek elmélete:[6] ide tartozik a gráfelméleti algoritmusok vizsgálata (keresési problémák és például a matroidok alkalmazása az ilyesfajta problémákra), az informatika bizonyos alapfogalmainak (adatszerkezetek) matematikai leírása; formális szemantika: ez a fordítóprogramok különböző formális nyelvtanokkal való leírásának matematikai elméletéből nőtte ki magát; fontos szerepet játszanak benne az attribútumnyelvtanok és rekurzív nyelvtanok elmélete (például), vagy például a logikai programozás elméleti leírása; logikai tervezés és optimalizálás:[8] ez a hagyományosan mérnöki tudomány a számítógép-tudomány absztrakt modelljeinek tanulmányozásával egy időben alakult ki, nagyrészt tőlük függetlenül, logikai áramköröket ugyanis nemcsak számítógépekben, hanem egyszerűbb automatákban is használnak.',\n",
|
||||||
|
" 'Az áramkörök tervezésével és optimalizálásával foglalkozik, logikainak azért nevezik, mert az áramkörmodelleket ún. logikai kapukból építi fel: egy logikai kapu olyan elektronikus szerkezet, amely a bemenő digitális (a gyakorlatban szinte mindig elektronikus) jelek valamilyen logikai függvényét képes előállítani.',\n",
|
||||||
|
" 'Optimális egy áramkör (általában), ha a kapuk számát sikerül minimalizálni. mesterségesintelligencia-kutatás[8] (pontosabban ennek matematikai alapjai): az az algoritmusok hatékonyságát azok önállóságának, önműködésének szempontjából vizsgálja; ez az elmélet a számítógép-tudomány, az informatika és a kognitív tudomány érdekes határterületeiből nőtt össze és ki; Számos terület (pl. a párhuzamos algoritmusok elmélete, az axiomatikus bonyolultságelmélet stb.) azonban még mindig inkább csak születőfélben lévő elmélet, mintsem önálló névvel rendelkező tudományág formájában létezik, és nehezebben sorolható a fenti alágak közé.',\n",
|
||||||
|
" 'Kapcsolódó szócikkek[szerkesztés] algoritmus Informatika Jegyzetek[szerkesztés] ↑ Katona Gyula – Recski András – Szabó Csaba: A számítástudomány alapjai.',\n",
|
||||||
|
" 'Typotex Kft., 2002.; ISBN 978-963-9664-19-7; ISBN 963-9664-19-7. ↑ A BME számítástudományi és információelméleti tanszékének honlapja.',\n",
|
||||||
|
" 'Hiv. beill.: 2011. 12. 19. ↑ Computer science Archiválva 2010. május 27-i dátummal a Wayback Machine-ben - Szótári bejegyzés az amerikai NITRD (A Hálózati és Információs Technológia Nemzeti Együttműködést Irányító Hivatala - National Coordination Office for Networking and Information Technology) honlapján. ↑ Dayton Codebreakers.com[halott link] ↑ Giorgio Ausiello: Algoritmusok és rekurzív függvények bonyolultságelmélete.',\n",
|
||||||
|
" 'Műszaki Könyvkiadó, Bp., 1984.',\n",
|
||||||
|
" 'ISBN 963-10-5159-5. 14. o. ↑ a b U.',\n",
|
||||||
|
" 'S.',\n",
|
||||||
|
" 'National Research Council Committee on the Fundamentals of Computer Science : Computer Science.',\n",
|
||||||
|
" 'Google elektronikus könyv (PDF), (erősen) korlátozott előnézet.',\n",
|
||||||
|
" 'Hiv. beill. 2010. július 12.; 11.-13. o.Hiv. beillesztése: 2011. 12. 19. ↑ A kombinatorika és a séta mestere (beszélgetés Szemerédi Endre matematikussal).',\n",
|
||||||
|
" 'Magyar Tudomány; 2008./06.; hiv. beill. 2010. augusztus 1.',\n",
|
||||||
|
" 'Vö.: „Az elméleti számítástechnika művelése ugyanis sokszor nagyon nehéz, bonyolult matematikai eszközöket és gondolatokat igényel: szóval, az elméleti számítástechnika szerintem a matematika egyik ága!',\n",
|
||||||
|
" 'Egyébként Magyarországon folyt vita arról, miképp nevezzék a gyereket, elméleti számítástechnika, számítógép-tudomány és ki tudja, mi még – egyik sem tűnik túl szerencsésnek.',\n",
|
||||||
|
" 'Talán az elméleti számítástechnika a legjobb magyar fordítás…” ↑ a b c d Ralston, Anthony: Programozás és számítógép-tudomány.',\n",
|
||||||
|
" 'Műszaki Könyvkiadó, Bp., 1974.',\n",
|
||||||
|
" 'ISBN 963-10-0616-6. ↑ Az ELTE számítógép-tudományi tanszékének honlapja Archiválva 2010. május 15-i dátummal a Wayback Machine-ben.',\n",
|
||||||
|
" 'Hiv. beill.: 2011. 12. 19. ↑ Tudomány és még sok minden. mindenkilapja.hu. [2016. augusztus 15-i dátummal az eredetiből archiválva]. (Hozzáférés: 2016. január 13.) ↑ Tasnádi Attila: Számítástudomány gazdaságinformatikusoknak | bookline. [2008. november 9-i dátummal az eredetiből archiválva]. (Hozzáférés: 2010. július 12.) ↑ Algoritmizálás alapjai. tankonyvtar.hu, 2011. (Hozzáférés: 2016. január 13.) ↑ (ld. angolul). [2004. április 16-i dátummal az eredetiből archiválva]. (Hozzáférés: 2004. október 3.) ↑ Ésik, Zoltán.',\n",
|
||||||
|
" 'A számítástudomány alapjai.',\n",
|
||||||
|
" 'TypotexKiadó, 5. o. (2011).',\n",
|
||||||
|
" 'Hozzáférés ideje: 2016. január 13.',\n",
|
||||||
|
" 'További információk[szerkesztés] Alice és Bob – 6. rész: Alice és Bob a kiszámíthatóság határán Alice és Bob – 7. rész: Alice és Bob egymillió dolláros kérdése Alice és Bob – 8. rész: Alice és Bob biztonsága Ralston, Anthony: Programozás és számítógép-tudomány.',\n",
|
||||||
|
" 'Műszaki Könyvkiadó, Bp., 1974.',\n",
|
||||||
|
" 'ISBN 963-10-0616-6. (er. mű: Introduction to Programming and Computer Science, McGraw-Hill Inc.; ford.',\n",
|
||||||
|
" 'Dr. Szabados József).',\n",
|
||||||
|
" 'Informatikai portál • összefoglaló, színes tartalomajánló lap Nemzetközi katalógusok LCCN: sh89003285 GND: 4026894-9 NKCS: ph124511 BNF: cb11932109b BNE: XX525961 A lap eredeti címe: „https://hu.wikipedia.org/w/index.php?title=Számítástudomány&oldid=26253398” Kategória: Számítógép-tudományRejtett kategóriák: Minden szócikk halott külső hivatkozásokkalSzócikkek halott külső hivatkozásokkal 2019 áprilisábólWikipédia-szócikkek LCCN azonosítóvalWikipédia-szócikkek GND azonosítóvalWikipédia-szócikkek BNF azonosítóval A lap utolsó módosítása: 2023. június 30., 11:20 A lap szövege Creative Commons Nevezd meg! – Így add tovább! 4.0 licenc alatt van; egyes esetekben más módon is felhasználható.',\n",
|
||||||
|
" 'Részletekért lásd a felhasználási feltételeket.',\n",
|
||||||
|
" 'Adatvédelmi irányelvek A Wikipédiáról Jogi nyilatkozat Code of Conduct Fejlesztők Statisztikák Sütinyilatkozat Mobil nézet Korlátozott tartalomszélesség ki/be']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"hu_segments"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "af282c08",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Save the Polish and English segments to separate files\n",
|
||||||
|
"with open('hu_segments.txt', 'w', encoding='utf-8') as file:\n",
|
||||||
|
" for segment in hu_segments:\n",
|
||||||
|
" file.write(segment + '\\n')\n",
|
||||||
|
"\n",
|
||||||
|
"with open('en_segments.txt', 'w', encoding='utf-8') as file:\n",
|
||||||
|
" for segment in en_segments:\n",
|
||||||
|
" file.write(segment + '\\n')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "4134e233",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Reading dictionary...\n",
|
||||||
|
"59 source language sentences read.\n",
|
||||||
|
"379 target language sentences read.\n",
|
||||||
|
"Sizes differing too much. Ignoring files to avoid a rare loop bug.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!hunalign/src/hunalign/hunalign hunalign/data/hu-en.stem.dic hu_segments.txt en_segments.txt -hand=hunalign/examples/demo.manual.ladder -text > align.txt"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -174,28 +409,122 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 9,
|
||||||
"id": "remarkable-pillow",
|
"id": "a30fb7bc",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def convert2xliff(hunalign_file_name):\n",
|
"import xml.etree.ElementTree as ET\n",
|
||||||
" return 0"
|
"\n",
|
||||||
|
"def hunalign_to_xliff(hunalign_content, source_lang, target_lang, xliff_file):\n",
|
||||||
|
" xliff = ET.Element('xliff', {\n",
|
||||||
|
" 'xmlns': 'urn:oasis:names:tc:xliff:document:1.2',\n",
|
||||||
|
" 'version': '1.2'\n",
|
||||||
|
" })\n",
|
||||||
|
" \n",
|
||||||
|
" file_elem = ET.SubElement(xliff, 'file', {\n",
|
||||||
|
" 'datatype': 'plaintext',\n",
|
||||||
|
" 'original': 'self',\n",
|
||||||
|
" 'source-language': source_lang,\n",
|
||||||
|
" 'target-language': target_lang\n",
|
||||||
|
" })\n",
|
||||||
|
" \n",
|
||||||
|
" header = ET.SubElement(file_elem, 'header')\n",
|
||||||
|
" metadata = ET.SubElement(header, 'sxmd:metadata', {\n",
|
||||||
|
" 'xmlns:sxmd': 'urn:x-sap:mlt:xliff12:metadata:1.0',\n",
|
||||||
|
" 'xmlns': 'urn:x-sap:mlt:tsmetadata:1.0'\n",
|
||||||
|
" })\n",
|
||||||
|
" ET.SubElement(metadata, 'object-name').text = 'sample'\n",
|
||||||
|
" ET.SubElement(metadata, 'collection').text = 'KWT'\n",
|
||||||
|
" ET.SubElement(metadata, 'domain').text = 'KWT'\n",
|
||||||
|
" ET.SubElement(metadata, 'developer').text = '123'\n",
|
||||||
|
" ET.SubElement(metadata, 'description').text = 'sample XLIFF file'\n",
|
||||||
|
" \n",
|
||||||
|
" body = ET.SubElement(file_elem, 'body')\n",
|
||||||
|
" \n",
|
||||||
|
" for i, line in enumerate(hunalign_content.strip().split('\\n')):\n",
|
||||||
|
" src_tgt = line.strip().split(' ||| ')\n",
|
||||||
|
" if len(src_tgt) == 2:\n",
|
||||||
|
" trans_unit = ET.SubElement(body, 'trans-unit', {'id': str(i + 1)})\n",
|
||||||
|
" ET.SubElement(trans_unit, 'source').text = src_tgt[0]\n",
|
||||||
|
" ET.SubElement(trans_unit, 'target').text = src_tgt[1]\n",
|
||||||
|
" \n",
|
||||||
|
" tree = ET.ElementTree(xliff)\n",
|
||||||
|
" ET.indent(tree, space=\" \", level=0) # Formatowanie z wcięciami\n",
|
||||||
|
" tree.write(xliff_file, encoding='utf-8', xml_declaration=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "6b68cbed",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"hunalign_content = \"\"\"\n",
|
||||||
|
"0-0 Hello world! ||| Witaj świecie!\n",
|
||||||
|
"1-1 This is a test. ||| To jest test.\n",
|
||||||
|
"2-2 How are you? ||| Jak się masz?\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"hunalign_to_xliff(hunalign_content, 'en', 'pl', 'output.xliff')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "d799237b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"<?xml version='1.0' encoding='utf-8'?>\n",
|
||||||
|
"<xliff xmlns=\"urn:oasis:names:tc:xliff:document:1.2\" version=\"1.2\">\n",
|
||||||
|
" <file datatype=\"plaintext\" original=\"self\" source-language=\"en\" target-language=\"pl\">\n",
|
||||||
|
" <header>\n",
|
||||||
|
" <sxmd:metadata xmlns:sxmd=\"urn:x-sap:mlt:xliff12:metadata:1.0\" xmlns=\"urn:x-sap:mlt:tsmetadata:1.0\">\n",
|
||||||
|
" <object-name>sample</object-name>\n",
|
||||||
|
" <collection>KWT</collection>\n",
|
||||||
|
" <domain>KWT</domain>\n",
|
||||||
|
" <developer>123</developer>\n",
|
||||||
|
" <description>sample XLIFF file</description>\n",
|
||||||
|
" </sxmd:metadata>\n",
|
||||||
|
" </header>\n",
|
||||||
|
" <body>\n",
|
||||||
|
" <trans-unit id=\"1\">\n",
|
||||||
|
" <source>0-0 Hello world!</source>\n",
|
||||||
|
" <target>Witaj świecie!</target>\n",
|
||||||
|
" </trans-unit>\n",
|
||||||
|
" <trans-unit id=\"2\">\n",
|
||||||
|
" <source>1-1 This is a test.</source>\n",
|
||||||
|
" <target>To jest test.</target>\n",
|
||||||
|
" </trans-unit>\n",
|
||||||
|
" <trans-unit id=\"3\">\n",
|
||||||
|
" <source>2-2 How are you?</source>\n",
|
||||||
|
" <target>Jak się masz?</target>\n",
|
||||||
|
" </trans-unit>\n",
|
||||||
|
" </body>\n",
|
||||||
|
" </file>\n",
|
||||||
|
"</xliff>\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"with open(\"output.xliff\", \"r\") as file:\n",
|
||||||
|
" print(file.read())"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"author": "Rafał Jaworski",
|
"author": "Rafał Jaworski",
|
||||||
"email": "rjawor@amu.edu.pl",
|
"email": "rjawor@amu.edu.pl",
|
||||||
"lang": "pl",
|
|
||||||
"subtitle": "11. Urównoleglanie",
|
|
||||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
|
||||||
"year": "2021",
|
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"display_name": "Python 3",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
|
"lang": "pl",
|
||||||
"language_info": {
|
"language_info": {
|
||||||
"codemirror_mode": {
|
"codemirror_mode": {
|
||||||
"name": "ipython",
|
"name": "ipython",
|
||||||
@ -206,8 +535,11 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.10"
|
"version": "3.10.14"
|
||||||
}
|
},
|
||||||
|
"subtitle": "11. Urównoleglanie",
|
||||||
|
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||||
|
"year": "2021"
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 5
|
"nbformat_minor": 5
|
||||||
|
146
lab/lab_12.ipynb
146
lab/lab_12.ipynb
@ -104,6 +104,33 @@
|
|||||||
"Celem powyższego ćwiczenia jest pozyskanie danych testowych. Dalsze analizy będziemy prowadzili już bez key loggera, starając się korzystać jedynie z danych zapisanych w pliku. Oczywiście, jeśli zajdzie taka konieczność, można w każdej chwili wygenerować sobie nowy plik."
|
"Celem powyższego ćwiczenia jest pozyskanie danych testowych. Dalsze analizy będziemy prowadzili już bez key loggera, starając się korzystać jedynie z danych zapisanych w pliku. Oczywiście, jeśli zajdzie taka konieczność, można w każdej chwili wygenerować sobie nowy plik."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "983ebbed",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import keyboard\n",
|
||||||
|
"from datetime import datetime\n",
|
||||||
|
"\n",
|
||||||
|
"# Ścieżka do pliku, w którym będą zapisywane dane\n",
|
||||||
|
"log_file = \"keylog.txt\"\n",
|
||||||
|
"\n",
|
||||||
|
"def report_key(event):\n",
|
||||||
|
" with open(log_file, \"a\") as f:\n",
|
||||||
|
" # Pobieramy aktualny czas\n",
|
||||||
|
" timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')\n",
|
||||||
|
" # Zapisujemy czas i wciśnięty klawisz do pliku\n",
|
||||||
|
" f.write(f\"{timestamp} - {event.name}\\n\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Ustawienie callbacka dla zdarzeń klawiatury\n",
|
||||||
|
"keyboard.on_release(callback=report_key)\n",
|
||||||
|
"\n",
|
||||||
|
"# Czekanie na zdarzenia klawiatury\n",
|
||||||
|
"keyboard.wait()\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "boxed-maple",
|
"id": "boxed-maple",
|
||||||
@ -114,13 +141,64 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": null,
|
||||||
"id": "possible-holder",
|
"id": "possible-holder",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"from datetime import timedelta\n",
|
||||||
|
"\n",
|
||||||
|
"key_events = []\n",
|
||||||
|
"\n",
|
||||||
|
"def report_key(event):\n",
|
||||||
|
" # Pobieramy aktualny czas\n",
|
||||||
|
" timestamp = datetime.now()\n",
|
||||||
|
" # Zapisujemy czas i wciśnięty klawisz do listy\n",
|
||||||
|
" key_events.append((timestamp, event.name))\n",
|
||||||
|
"\n",
|
||||||
|
"# Ustawienie callbacka dla zdarzeń klawiatury\n",
|
||||||
|
"keyboard.on_release(callback=report_key)\n",
|
||||||
|
"\n",
|
||||||
"def calculate_typing_speed():\n",
|
"def calculate_typing_speed():\n",
|
||||||
" return 0"
|
" if not key_events:\n",
|
||||||
|
" return \"No key events recorded.\"\n",
|
||||||
|
"\n",
|
||||||
|
" total_time = timedelta()\n",
|
||||||
|
" total_chars = 0\n",
|
||||||
|
" total_words = 0\n",
|
||||||
|
" prev_time = key_events[0][0]\n",
|
||||||
|
"\n",
|
||||||
|
" for i in range(1, len(key_events)):\n",
|
||||||
|
" current_time = key_events[i][0]\n",
|
||||||
|
" key = key_events[i][1]\n",
|
||||||
|
"\n",
|
||||||
|
" # Obliczamy czas między kolejnymi naciśnięciami klawiszy\n",
|
||||||
|
" time_diff = current_time - prev_time\n",
|
||||||
|
"\n",
|
||||||
|
" # Jeśli różnica czasu jest mniejsza niż 5 sekund, dodajemy do całkowitego czasu\n",
|
||||||
|
" if time_diff <= timedelta(seconds=5):\n",
|
||||||
|
" total_time += time_diff\n",
|
||||||
|
" total_chars += 1\n",
|
||||||
|
" if key == \"space\":\n",
|
||||||
|
" total_words += 1\n",
|
||||||
|
"\n",
|
||||||
|
" prev_time = current_time\n",
|
||||||
|
"\n",
|
||||||
|
" # Dodajemy ostatnie słowo (bo nie zawsze kończy się spacją)\n",
|
||||||
|
" total_words += 1\n",
|
||||||
|
"\n",
|
||||||
|
" # Obliczamy prędkość pisania\n",
|
||||||
|
" total_minutes = total_time.total_seconds() / 60\n",
|
||||||
|
" chars_per_minute = total_chars / total_minutes if total_minutes > 0 else 0\n",
|
||||||
|
" words_per_minute = total_words / total_minutes if total_minutes > 0 else 0\n",
|
||||||
|
"\n",
|
||||||
|
" return f\"Typing Speed: {chars_per_minute:.2f} chars/min, {words_per_minute:.2f} words/min\"\n",
|
||||||
|
"\n",
|
||||||
|
"# Uruchomienie keyloggera i czekanie na zdarzenia klawiatury\n",
|
||||||
|
"keyboard.wait()\n",
|
||||||
|
"\n",
|
||||||
|
"# Po zakończeniu pisania, wyliczamy prędkość pisania\n",
|
||||||
|
"print(calculate_typing_speed())"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -141,28 +219,73 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": null,
|
||||||
"id": "close-riverside",
|
"id": "close-riverside",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"key_events = []\n",
|
||||||
|
"\n",
|
||||||
|
"def report_key(event):\n",
|
||||||
|
" # Pobieramy aktualny czas\n",
|
||||||
|
" timestamp = datetime.now()\n",
|
||||||
|
" # Zapisujemy czas i wciśnięty klawisz do listy\n",
|
||||||
|
" key_events.append((timestamp, event.name))\n",
|
||||||
|
"\n",
|
||||||
|
"# Ustawienie callbacka dla zdarzeń klawiatury\n",
|
||||||
|
"keyboard.on_release(callback=report_key)\n",
|
||||||
|
"\n",
|
||||||
"def find_pauses():\n",
|
"def find_pauses():\n",
|
||||||
" return []"
|
" if not key_events:\n",
|
||||||
|
" return \"No key events recorded.\"\n",
|
||||||
|
"\n",
|
||||||
|
" pauses = []\n",
|
||||||
|
" prev_time = key_events[0][0]\n",
|
||||||
|
" full_text = ''.join([key[1] if key[1] != \"space\" else \" \" for key in key_events])\n",
|
||||||
|
"\n",
|
||||||
|
" for i in range(1, len(key_events)):\n",
|
||||||
|
" current_time = key_events[i][0]\n",
|
||||||
|
" key = key_events[i][1]\n",
|
||||||
|
"\n",
|
||||||
|
" # Obliczamy czas między kolejnymi naciśnięciami klawiszy\n",
|
||||||
|
" time_diff = current_time - prev_time\n",
|
||||||
|
"\n",
|
||||||
|
" # Jeśli różnica czasu jest większa niż 3 sekundy, zapisujemy przerwę\n",
|
||||||
|
" if time_diff > timedelta(seconds=3):\n",
|
||||||
|
" start_idx = max(0, i - 21)\n",
|
||||||
|
" end_idx = min(len(full_text), i + 20)\n",
|
||||||
|
" context = full_text[start_idx:end_idx]\n",
|
||||||
|
" pauses.append((time_diff.total_seconds(), context))\n",
|
||||||
|
"\n",
|
||||||
|
" prev_time = current_time\n",
|
||||||
|
"\n",
|
||||||
|
" # Sortowanie przerw malejąco po długości\n",
|
||||||
|
" pauses.sort(reverse=True, key=lambda x: x[0])\n",
|
||||||
|
"\n",
|
||||||
|
" return pauses\n",
|
||||||
|
"\n",
|
||||||
|
"# Uruchomienie keyloggera i czekanie na zdarzenia klawiatury\n",
|
||||||
|
"keyboard.wait()\n",
|
||||||
|
"\n",
|
||||||
|
"# Po zakończeniu pisania, wykrywamy przerwy\n",
|
||||||
|
"pauses = find_pauses()\n",
|
||||||
|
"\n",
|
||||||
|
"# Wyświetlanie przerw\n",
|
||||||
|
"for pause in pauses:\n",
|
||||||
|
" length, context = pause\n",
|
||||||
|
" print(f\"Pause length: {length:.2f} seconds, Context: '{context}'\")\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"author": "Rafał Jaworski",
|
"author": "Rafał Jaworski",
|
||||||
"email": "rjawor@amu.edu.pl",
|
"email": "rjawor@amu.edu.pl",
|
||||||
"lang": "pl",
|
|
||||||
"subtitle": "12. Key logging",
|
|
||||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
|
||||||
"year": "2021",
|
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"display_name": "Python 3",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
|
"lang": "pl",
|
||||||
"language_info": {
|
"language_info": {
|
||||||
"codemirror_mode": {
|
"codemirror_mode": {
|
||||||
"name": "ipython",
|
"name": "ipython",
|
||||||
@ -173,8 +296,11 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.10"
|
"version": "3.10.14"
|
||||||
}
|
},
|
||||||
|
"subtitle": "12. Key logging",
|
||||||
|
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||||
|
"year": "2021"
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
"nbformat_minor": 5
|
"nbformat_minor": 5
|
||||||
|
@ -44,7 +44,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 30,
|
||||||
"id": "familiar-terrace",
|
"id": "familiar-terrace",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
@ -120,13 +120,62 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 31,
|
||||||
|
"id": "d0970691",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"pl_dict = set()\n",
|
||||||
|
"with ZipFile('data/hunspell_pl.zip') as zipped_dictionary:\n",
|
||||||
|
" with zipped_dictionary.open('hunspell_pl.txt') as dictionary_file:\n",
|
||||||
|
" for line_bytes in dictionary_file:\n",
|
||||||
|
" line = line_bytes.decode('utf-8')\n",
|
||||||
|
" pl_dict.add(line.rstrip())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 32,
|
||||||
"id": "economic-southeast",
|
"id": "economic-southeast",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def correct_text(text):\n",
|
"def correct_text(text):\n",
|
||||||
" return []"
|
" words = text.split()\n",
|
||||||
|
"\n",
|
||||||
|
" result = []\n",
|
||||||
|
" for word in words:\n",
|
||||||
|
" if word in pl_dict:\n",
|
||||||
|
" result.append((word, \"correct\"))\n",
|
||||||
|
" else:\n",
|
||||||
|
" result.append((word, \"incorrect\"))\n",
|
||||||
|
"\n",
|
||||||
|
" return result"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 33,
|
||||||
|
"id": "771a6c40",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[('kalend', 'incorrect'),\n",
|
||||||
|
" ('kalendarz', 'correct'),\n",
|
||||||
|
" ('kaledoński', 'correct'),\n",
|
||||||
|
" ('kalejdoskopowy', 'correct'),\n",
|
||||||
|
" ('kalendarium', 'correct')]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 33,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"correct_text(\"kalend kalendarz kaledoński kalejdoskopowy kalendarium\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -168,13 +217,51 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 34,
|
||||||
"id": "built-sally",
|
"id": "built-sally",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def L1(w):\n",
|
"def L1(w):\n",
|
||||||
" return []"
|
" letters = 'abcdefghijklmnopqrstuvwxyząćęłńóśźż'\n",
|
||||||
|
" splits = [(w[:i], w[i:]) for i in range(len(w) + 1)]\n",
|
||||||
|
" \n",
|
||||||
|
" deletes = [L + R[1:] for L, R in splits if R]\n",
|
||||||
|
" transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]\n",
|
||||||
|
" replaces = [L + c + R[1:] for L, R in splits if R for c in letters]\n",
|
||||||
|
" inserts = [L + c + R for L, R in splits for c in letters]\n",
|
||||||
|
" \n",
|
||||||
|
" return set(deletes + transposes + replaces + inserts)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 35,
|
||||||
|
"id": "dc3ffbfe",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['kaqendarz',\n",
|
||||||
|
" 'kalenydarz',\n",
|
||||||
|
" 'kalendadz',\n",
|
||||||
|
" 'kalenżarz',\n",
|
||||||
|
" 'kalendlrz',\n",
|
||||||
|
" 'kalendaóz',\n",
|
||||||
|
" 'kalvendarz',\n",
|
||||||
|
" 'kalendarzv',\n",
|
||||||
|
" 'katendarz',\n",
|
||||||
|
" 'kolendarz']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 35,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"list(L1(\"kalendarz\"))[:10]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -187,13 +274,49 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 36,
|
||||||
"id": "coordinated-cooperation",
|
"id": "coordinated-cooperation",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def generate_suggestions(w):\n",
|
"def generate_suggestions(w):\n",
|
||||||
" return []"
|
" # Generate L1(w)\n",
|
||||||
|
" L1_set = L1(w)\n",
|
||||||
|
" # Generate S1(w)\n",
|
||||||
|
" S1 = L1_set.intersection(pl_dict)\n",
|
||||||
|
"\n",
|
||||||
|
" # Generate L2(w)\n",
|
||||||
|
" L2_set = set()\n",
|
||||||
|
" for v in L1_set:\n",
|
||||||
|
" L2_set.update(L1(v))\n",
|
||||||
|
" \n",
|
||||||
|
" # Generate S2(w)\n",
|
||||||
|
" S2 = L2_set.intersection(pl_dict)\n",
|
||||||
|
"\n",
|
||||||
|
" # Combine S1 and S2 and return as list\n",
|
||||||
|
" suggestions = S1.union(S2)\n",
|
||||||
|
" return list(suggestions)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 37,
|
||||||
|
"id": "e0c572ce",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['kalendarz', 'kalandar', 'kalendarzyk', 'arendarz']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 37,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"generate_suggestions(\"kalendarz\")"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -216,7 +339,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.10"
|
"version": "3.10.14"
|
||||||
},
|
},
|
||||||
"subtitle": "13,14. Korekta pisowni",
|
"subtitle": "13,14. Korekta pisowni",
|
||||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||||
|
1082
lab/lab_15.ipynb
1082
lab/lab_15.ipynb
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user