Ta linia i następne zostaną zignorowane--
M app/dist/files/README uaktualnione M app/doc/utt.texinfo dopiski M app/src/gue/Makefile statyczne biblioteki M app/src/cor/cmdline_cor.ggo usuniecie nie dzialajacych parametrow M app/src/cor/Makefile statyczne biblioteki M app/src/common/cmdline_common.ggo ? M app/src/kor/Makefile statyczne biblioteki M app/src/lem/Makefile statyczne biblioteki M lang/dist/tarball/Makefile pakowanie modulow jezykowych po jednym M lang/Makefile -"- git-svn-id: svn://atos.wmid.amu.edu.pl/utt@61 e293616e-ec6a-49c2-aa92-f4a8b91c5d16
This commit is contained in:
parent
261bf629fb
commit
e28a625259
34
app/dist/files/README
vendored
34
app/dist/files/README
vendored
@ -17,5 +17,35 @@ unrestricted text for any conceivable purpose.
|
|||||||
|
|
||||||
Installation
|
Installation
|
||||||
**************
|
**************
|
||||||
Run utt_make_config.pl to create configuration files.
|
|
||||||
Configuration files will be created in ~/.utt/
|
1) unpack the UTT tar archive
|
||||||
|
2) in the same directory, unpack the tar archives of all UTT dictionary modules you have
|
||||||
|
3) run
|
||||||
|
make install
|
||||||
|
in the root directory of the installation
|
||||||
|
4) add the bin directory to the PATH variable
|
||||||
|
|
||||||
|
|
||||||
|
Requirements
|
||||||
|
*************
|
||||||
|
|
||||||
|
* File::HomeDir
|
||||||
|
|
||||||
|
the Perl package File::HomeDir must be installed
|
||||||
|
(to install the package, run 'perl -MCPAN -e shell' and write
|
||||||
|
'install File::HomeDir' after the 'cpan>' prompt appears)
|
||||||
|
|
||||||
|
* flex
|
||||||
|
|
||||||
|
to run the ser component, flex must be installed in your system
|
||||||
|
|
||||||
|
* ruby
|
||||||
|
|
||||||
|
to run the tre component, ruby must be installed in your system
|
||||||
|
|
||||||
|
* locale pl_PL.iso-8859-2
|
||||||
|
|
||||||
|
the locales pl_PL.iso-8859-2 (pl_PL in short) must be installed
|
||||||
|
and set while using UTT with the Polish module. The text you
|
||||||
|
process with UTT must be encoded in iso-8859-2.
|
||||||
|
|
||||||
|
@ -366,7 +366,7 @@ covered by the second segment and no segment starts at position
|
|||||||
|
|
||||||
@section Flattened UTT file
|
@section Flattened UTT file
|
||||||
|
|
||||||
A UTT file format has two variants: regular and flattend. The regular
|
A UTT file format has two variants: regular and flattened. The regular
|
||||||
format was described above. In the flattened format some of the
|
format was described above. In the flattened format some of the
|
||||||
end-of-line characters are replaced with form-feed characters.
|
end-of-line characters are replaced with form-feed characters.
|
||||||
|
|
||||||
@ -1607,11 +1607,11 @@ compression tool (grp usually processes data faster than it is read from a
|
|||||||
disk, especially for slow laptop drives).
|
disk, especially for slow laptop drives).
|
||||||
|
|
||||||
@example
|
@example
|
||||||
cat corpus | tok | sen | lem | grp -a p | lzop -7 > corpus.grp.lzo
|
cat corpus | tok | sen | lem -1 | fla | lzop -7 > corpus.grp.lzo
|
||||||
@end example
|
@end example
|
||||||
|
|
||||||
@example
|
@example
|
||||||
lzop -cd corpus.grp.lzo | grp -a gP -e @var{EXPR} | ser -e @var{EXPR}
|
lzop -cd corpus.grp.lzo | grp -e @var{EXPR} | unfla | ser -e @var{EXPR}
|
||||||
@end example
|
@end example
|
||||||
|
|
||||||
|
|
||||||
@ -1626,11 +1626,15 @@ lzop -cd corpus.grp.lzo | grp -a gP -e @var{EXPR} | ser -e @var{EXPR}
|
|||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
@item @strong{Authors:} @tab Marcin Walas, Tomasz Obrêbski
|
@item @strong{Authors:} @tab Marcin Walas, Tomasz Obrêbski
|
||||||
@item @strong{Component category:} @tab filter
|
@item @strong{Input format:} @tab UTT flattened
|
||||||
|
@item @strong{Output format:} @tab UTT flattened
|
||||||
|
@item @strong{Required annotation:} @tab tok, sen, lem -1
|
||||||
@end multitable
|
@end multitable
|
||||||
|
|
||||||
[TODO]
|
[TODO]
|
||||||
|
|
||||||
|
(see mar's help 'mar -h' for some information)
|
||||||
|
|
||||||
@c ---------------------------------------------------------------------
|
@c ---------------------------------------------------------------------
|
||||||
@c KOT
|
@c KOT
|
||||||
@c ---------------------------------------------------------------------
|
@c ---------------------------------------------------------------------
|
||||||
@ -1870,16 +1874,32 @@ termination of the program.
|
|||||||
@c @end menu
|
@c @end menu
|
||||||
|
|
||||||
|
|
||||||
|
@c -------------------------------------------------------------------------------
|
||||||
|
@c FLA
|
||||||
|
@c -------------------------------------------------------------------------------
|
||||||
|
|
||||||
@page
|
@page
|
||||||
@node fla
|
@node fla
|
||||||
@section fla - the UTT file flattener
|
@section fla - the UTT file flattener
|
||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
@item @strong{Authors:} @tab Tomasz Obrêbski
|
@item @strong{Authors:} @tab Tomasz Obrêbski
|
||||||
@item @strong{Component category:} @tab filter
|
@item @strong{Input format:} @tab UTT regular
|
||||||
|
@item @strong{Output format:} @tab UTT flattened
|
||||||
|
@item @strong{Required annotation:} @tab sen
|
||||||
@end multitable
|
@end multitable
|
||||||
@c
|
@c
|
||||||
|
|
||||||
|
@menu
|
||||||
|
* fla description::
|
||||||
|
@c * fla command line options::
|
||||||
|
@c * fla usage example::
|
||||||
|
@end menu
|
||||||
|
|
||||||
|
|
||||||
|
@node fla description
|
||||||
|
@subsection Description
|
||||||
|
|
||||||
@command{fla} ``flattens'' a utt file by merging segments belonging
|
@command{fla} ``flattens'' a utt file by merging segments belonging
|
||||||
to one sentence in one line. Technically, end-of-line characters
|
to one sentence in one line. Technically, end-of-line characters
|
||||||
('\n', ASCII code 10) are replaced with form-feed characters ('\f',
|
('\n', ASCII code 10) are replaced with form-feed characters ('\f',
|
||||||
@ -1901,13 +1921,10 @@ The facultative argument is a regular expression describing segments
|
|||||||
which should be treated as sentence beginnings (the test is: the
|
which should be treated as sentence beginnings (the test is: the
|
||||||
segment contains a fragment matching the @code{<bosregex>}). By
|
segment contains a fragment matching the @code{<bosregex>}). By
|
||||||
default, segments containing a field @code{BOS} are sought.
|
default, segments containing a field @code{BOS} are sought.
|
||||||
@c @menu
|
|
||||||
@c * con command line options::
|
|
||||||
@c * con usage example::
|
|
||||||
@c * con hints::
|
|
||||||
@c @end menu
|
|
||||||
|
|
||||||
|
|
||||||
|
@c -------------------------------------------------------------------------------
|
||||||
|
@c UNFLA
|
||||||
|
@c -------------------------------------------------------------------------------
|
||||||
|
|
||||||
@page
|
@page
|
||||||
@node unfla
|
@node unfla
|
||||||
@ -1915,9 +1932,19 @@ default, segments containing a field @code{BOS} are seeked.
|
|||||||
|
|
||||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||||
@item @strong{Authors:} @tab Tomasz Obrêbski
|
@item @strong{Authors:} @tab Tomasz Obrêbski
|
||||||
@item @strong{Component category:} @tab filter
|
@item @strong{Input format:} @tab UTT flattened
|
||||||
|
@item @strong{Output format:} @tab UTT regular
|
||||||
|
@item @strong{Required annotation:} @tab -
|
||||||
@end multitable
|
@end multitable
|
||||||
|
|
||||||
|
@menu
|
||||||
|
* unfla description::
|
||||||
|
@c * fla command line options::
|
||||||
|
@c * fla usage example::
|
||||||
|
@end menu
|
||||||
|
|
||||||
|
@node unfla description
|
||||||
|
@subsection Description
|
||||||
@command{unfla} transforms a flattened UTT file, produced by
|
@command{unfla} transforms a flattened UTT file, produced by
|
||||||
@command{fla}, into the regular format by restoring end-of-line
|
@command{fla}, into the regular format by restoring end-of-line
|
||||||
characters.
|
characters.
|
||||||
@ -1970,7 +1997,7 @@ cat text | tok | lem | cor -p W -S lem | lem -I cor | gue -p W -S lem
|
|||||||
|
|
||||||
|
|
||||||
@example
|
@example
|
||||||
cat text | tok | lem --only-fail | cor -1 > output3
|
cat text | tok | egrep ' W ' | lem | egrep -v 'lem:' | cor -1
|
||||||
@end example
|
@end example
|
||||||
|
|
||||||
@item Expression extraction
|
@item Expression extraction
|
||||||
@ -2019,43 +2046,44 @@ required by @command{grp} first, and then use the preprocessed data.
|
|||||||
|
|
||||||
As @command{grp} (@command{grep}) processes data faster than it is
|
As @command{grp} (@command{grep}) processes data faster than it is
|
||||||
read from the disk drive, the search time may be still shortened by
|
read from the disk drive, the search time may be still shortened by
|
||||||
using file compression techniques. We suggest usin @command{lzop}.
|
using file compression techniques. We suggest using the
|
||||||
|
@command{lzop} compressor/decompressor.
|
||||||
|
|
||||||
@item the fastest way to search a large corpus
|
@item the fastest way to search a large corpus
|
||||||
|
|
||||||
step 1: preprocessing
|
step 1: corpus preprocessing
|
||||||
|
|
||||||
@example
|
@example
|
||||||
cat corpus | tok | sen | lem -1 \
|
cat corpus | tok | sen | lem -1 \
|
||||||
| grp -a p | lzop -7 > corpus.grp.lzo
|
| fla | lzop -7 > corpus.grp.lzo
|
||||||
@end example
|
@end example
|
||||||
|
|
||||||
step 2: search
|
step 2: search
|
||||||
|
|
||||||
@example
|
@example
|
||||||
lzop -cd corpus.grp.lzo | grp -a gP -e 'cat(<V>) space
|
lzop -cd corpus.grp.lzo | unfla | grp -e 'cat(<V>) space
|
||||||
lexeme(rozmowa)' | ser -e 'cat(<V>) space lexeme(rozmowa)' | con
|
lexeme(rozmowa)' | ser -e 'cat(<V>) space lexeme(rozmowa)' | con
|
||||||
@end example
|
@end example
|
||||||
|
|
||||||
@end enumerate
|
@end enumerate
|
||||||
|
|
||||||
@subsubheading More complicated configurations
|
@c @subsubheading More complicated configurations
|
||||||
|
|
||||||
|
|
||||||
@example
|
@c @example
|
||||||
mknod fifo1 p
|
@c mknod fifo1 p
|
||||||
mknod fifo2 p
|
@c mknod fifo2 p
|
||||||
mknod fifo3 p
|
@c mknod fifo3 p
|
||||||
mknod fifo4 p
|
@c mknod fifo4 p
|
||||||
mknod fifo5 p
|
@c mknod fifo5 p
|
||||||
|
|
||||||
tok | lem -p W -e fifo1 > fifo2 &
|
@c tok | lem -p W -e fifo1 > fifo2 &
|
||||||
cor -e fifo3 < fifo1 | lem > fifo4 &
|
@c cor -e fifo3 < fifo1 | lem > fifo4 &
|
||||||
gue < fifo3 > fifo5 &
|
@c gue < fifo3 > fifo5 &
|
||||||
sort -m fifo2 fifo4 fifo5
|
@c sort -m fifo2 fifo4 fifo5
|
||||||
|
|
||||||
rm fifo?
|
@c rm fifo?
|
||||||
@end example
|
@c @end example
|
||||||
|
|
||||||
|
|
||||||
@c ---------------------------------------------------------------------
|
@c ---------------------------------------------------------------------
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
#section "Common UTT options"
|
#section "Common UTT options"
|
||||||
|
|
||||||
|
|
||||||
option "input" f "Input file" string no hidden
|
option "input" f "Input file" string no
|
||||||
|
|
||||||
option "output" o "Output file" string no hidden
|
option "output" o "Output file for succesfully processed segments" string no
|
||||||
|
|
||||||
option "fail" e "Output file for unsuccesfully processed segments " string no hidden
|
option "fail" e "Output file for unsuccesfully processed segments " string no
|
||||||
|
|
||||||
option "only-fail" - "Print only segments the program failed to process" flag off hidden
|
option "only-fail" - "Print only segments the program failed to process" flag off hidden
|
||||||
|
|
||||||
option "no-fail" - "Print only segments the program processed" flag off hidden
|
option "no-fail" - "Print only segments the program processed" flag off hidden
|
||||||
|
|
||||||
option "copy" c "Copy succesfully processed segments to standard output" flag off hidden
|
option "copy" c "Copy succesfully processed segments to standard output" flag off
|
||||||
|
|
||||||
option "process" p "Process segments with this tag" string no multiple
|
option "process" p "Process segments with this tag" string no multiple
|
||||||
|
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
PAR=-Wno-deprecated -m32 -fpermissive
|
PAR=-Wno-deprecated -m32 -fpermissive -static
|
||||||
# -static
|
|
||||||
PAR2=-c -Wno-deprecated -m32 -fpermissive
|
PAR2=-c -Wno-deprecated -m32 -fpermissive
|
||||||
LIB_PATH=../lib
|
LIB_PATH=../lib
|
||||||
COMMON_PATH=../common
|
COMMON_PATH=../common
|
||||||
|
@ -4,5 +4,5 @@ version "0.1"
|
|||||||
option "dictionary-home" - "Dictionary home dir." string typestr="FILENAME" no hidden
|
option "dictionary-home" - "Dictionary home dir." string typestr="FILENAME" no hidden
|
||||||
option "dictionary" d "Dictionary" string typestr="FILENAME" default="cor.bin" no
|
option "dictionary" d "Dictionary" string typestr="FILENAME" default="cor.bin" no
|
||||||
option "distance" n "Maximal edit distance." int default="1" no
|
option "distance" n "Maximal edit distance." int default="1" no
|
||||||
option "replace" r "Replace original form with corrected form, place original form in the cor field. This option has no effect in single mode" flag off
|
option "replace" r "Replace original form with corrected form, place original form in the cor field. This option has no effect in single mode" flag off hidden
|
||||||
#option "single" - "Place all alternatives in the same line" flag off
|
#option "single" - "Place all alternatives in the same line" flag off
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
PAR=-Wno-deprecated -O3 -fpermissive -m32
|
PAR=-Wno-deprecated -O3 -fpermissive -m32 -static
|
||||||
#-static
|
|
||||||
PAR2=-c -Wno-deprecated -O3 -fpermissive -m32
|
PAR2=-c -Wno-deprecated -O3 -fpermissive -m32
|
||||||
LIB_PATH=../lib
|
LIB_PATH=../lib
|
||||||
COMMON_PATH=../common
|
COMMON_PATH=../common
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
PAR=-Wno-deprecated -m32 -fpermissive
|
PAR=-Wno-deprecated -m32 -fpermissive -static
|
||||||
# -static
|
|
||||||
PAR2=-c -Wno-deprecated -m32 -fpermissive
|
PAR2=-c -Wno-deprecated -m32 -fpermissive
|
||||||
LIB_PATH=../lib
|
LIB_PATH=../lib
|
||||||
COMMON_PATH=../common
|
COMMON_PATH=../common
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
PAR=-Wno-deprecated -m32 -O3 -fpermissive
|
PAR=-Wno-deprecated -m32 -O3 -fpermissive -static
|
||||||
#-static
|
PAR2=-c -Wno-deprecated -m32 -O3 -fpermissive -static
|
||||||
PAR2=-c -Wno-deprecated -m32 -O3 -fpermissive
|
|
||||||
LIB_PATH=../lib
|
LIB_PATH=../lib
|
||||||
COMMON_PATH=../common
|
COMMON_PATH=../common
|
||||||
CMDLINE_FILE='"../lem/cmdline.h"'
|
CMDLINE_FILE='"../lem/cmdline.h"'
|
||||||
|
@ -10,6 +10,7 @@ export UTT_DIC_BIN=$(CUR_DIR)/dic
|
|||||||
#directory where distribution will be placed
|
#directory where distribution will be placed
|
||||||
export UTT_DIC_OUTPUT=${CUR_DIR}
|
export UTT_DIC_OUTPUT=${CUR_DIR}
|
||||||
|
|
||||||
|
export LANG_MODULES=pl_PL.ISO-8852-2 pl_PL.UTF-8
|
||||||
|
|
||||||
# path to dictionary compiler
|
# path to dictionary compiler
|
||||||
DIC_COMPILER=../app/src/compiledic/compiledic
|
DIC_COMPILER=../app/src/compiledic/compiledic
|
||||||
@ -31,3 +32,9 @@ distribute: compile dist_tarball
|
|||||||
dist_tarball:
|
dist_tarball:
|
||||||
cd dist && make tarball; cd ${CUR_DIR};
|
cd dist && make tarball; cd ${CUR_DIR};
|
||||||
|
|
||||||
|
|
||||||
|
.PHONY: dist_tarball_pl_PL.ISO-8859-2
|
||||||
|
dist_tarball:
|
||||||
|
export DIC_LANG=pl_PL.ISO-8859-2 && \
|
||||||
|
cd dist && make tarball; cd ${CUR_DIR};
|
||||||
|
|
||||||
|
5
lang/dist/tarball/Makefile
vendored
5
lang/dist/tarball/Makefile
vendored
@ -12,7 +12,8 @@ _UTT_REL=$(shell cat ../../../app/dist/common/release.def)
|
|||||||
# Temp vars
|
# Temp vars
|
||||||
_TARBALL_ROOT=$(DIR)/utt-$(_UTT_VER).$(_UTT_REL)
|
_TARBALL_ROOT=$(DIR)/utt-$(_UTT_VER).$(_UTT_REL)
|
||||||
_UTT_DIC_HOME=share/utt
|
_UTT_DIC_HOME=share/utt
|
||||||
_TAR_FILE_NAME=utt.dic.$(_UTT_VER)_$(_UTT_REL)
|
_TAR_FILE_NAME=utt.$(_UTT_VER)_$(_UTT_REL)
|
||||||
|
|
||||||
|
|
||||||
#default task
|
#default task
|
||||||
.PHONY: default
|
.PHONY: default
|
||||||
@ -20,7 +21,7 @@ default:
|
|||||||
@echo Build directory: ${UTT_DIC_BIN}
|
@echo Build directory: ${UTT_DIC_BIN}
|
||||||
@echo Output directory for tarball: ${UTT_DIC_OUTPUT}
|
@echo Output directory for tarball: ${UTT_DIC_OUTPUT}
|
||||||
mkdir -p ${_TARBALL_ROOT}/${_UTT_DIC_HOME}
|
mkdir -p ${_TARBALL_ROOT}/${_UTT_DIC_HOME}
|
||||||
if test -n "${DIC_LANG}" -a -d ${UTT_DIC_BIN}/${DIC_LANG} ; \
|
if [[ -n "${DIC_LANG}" && -d ${UTT_DIC_BIN}/${DIC_LANG} ]]; \
|
||||||
then \
|
then \
|
||||||
echo "Tworze dystrybucje ${DIC_LANG}"; \
|
echo "Tworze dystrybucje ${DIC_LANG}"; \
|
||||||
mkdir -p ${_TARBALL_ROOT}/${_UTT_DIC_HOME}/${DIC_LANG}; \
|
mkdir -p ${_TARBALL_ROOT}/${_UTT_DIC_HOME}/${DIC_LANG}; \
|
||||||
|
Loading…
Reference in New Issue
Block a user