diff --git a/app/dist/files/README b/app/dist/files/README index bc4cfd5..60403f8 100644 --- a/app/dist/files/README +++ b/app/dist/files/README @@ -17,5 +17,35 @@ unrestricted text for any conceivable purpose. Installation ************** -Run utt_make_config.pl to create configuration files. -Configuration files will be created in ~/.utt/ + +1) unpack the UTT tar archive +2) in the same directory, unpack the tar archives of all UTT dictionary modules you have +3) run + make install + in the root directory of the installation +4) add the bin directory to the PATH variable + + +Requirements +************* + +* File::HomeDir + + the Perl package File::HomeDir must be installed + (to install the package, run 'perl -MCPAN -e shell' and write + 'install File::HomeDir' after the 'cpan>' prompt appears) + +* flex + + to run the ser component, flex must be installed in your system + +* ruby + + to run the tre component, ruby must be installed in your system + +* locale pl_PL.iso-8859-2 + + the locale pl_PL.iso-8859-2 (pl_PL in short) must be installed + and set while using UTT with the Polish module. The text you + process with UTT must be encoded in iso-8859-2. + diff --git a/app/doc/utt.texinfo b/app/doc/utt.texinfo index 0a26c93..00f7d13 100644 --- a/app/doc/utt.texinfo +++ b/app/doc/utt.texinfo @@ -366,7 +366,7 @@ covered by the second segment and no segment starts at position @section Flattened UTT file -A UTT file format has two variants: regular and flattend. The regular +A UTT file format has two variants: regular and flattened. The regular format was described above. In the flattened format some of the end-of-line characters are replaced with line-feed characters. @@ -1607,11 +1607,11 @@ compression tool (grp usually processes data faster than it is read from a disk, especially for slow laptop drives).
@example -cat corpus | tok | sen | lem | grp -a p | lzop -7 > corpus.grp.lzo +cat corpus | tok | sen | lem -1 | fla | lzop -7 > corpus.grp.lzo @end example @example -lzop -cd corpus.grp.lzo | grp -a gP -e @var{EXPR} | ser -e @var{EXPR} +lzop -cd corpus.grp.lzo | grp -e @var{EXPR} | unfla | ser -e @var{EXPR} @end example @@ -1626,11 +1626,15 @@ lzop -cd corpus.grp.lzo | grp -a gP -e @var{EXPR} | ser -e @var{EXPR} @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @item @strong{Authors:} @tab Marcin Walas, Tomasz Obrêbski -@item @strong{Component category:} @tab filter +@item @strong{Input format:} @tab UTT flattened +@item @strong{Output format:} @tab UTT flattened +@item @strong{Required annotation:} @tab tok, sen, lem -1 @end multitable [TODO] +(see mar's help 'mar -h' for some information) + @c --------------------------------------------------------------------- @c KOT @c --------------------------------------------------------------------- @@ -1870,16 +1874,32 @@ termination of the program. @c @end menu +@c ------------------------------------------------------------------------------- +@c FLA +@c ------------------------------------------------------------------------------- + @page @node fla @section fla - the UTT file flattener @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @item @strong{Authors:} @tab Tomasz Obrêbski -@item @strong{Component category:} @tab filter +@item @strong{Input format:} @tab UTT regular +@item @strong{Output format:} @tab UTT flattened +@item @strong{Required annotation:} @tab sen @end multitable @c +@menu +* fla description:: +@c * fla command line options:: +@c * fla usage example:: +@end menu + + +@node fla description +@subsection Description + @command{fla} ``flattens'' a utt file by merging segments belonging to one sentence in one line. 
Technically, end-of-line characters ('\n', ASCII code 10) are replaced with line-feed characters ('\f', @@ -1901,13 +1921,10 @@ The facultative argument is a regular expression describing segments which should be treated as sentence beginnings (the test is: the segment contains a fragment matching the @code{}). By default, segments containing a field @code{BOS} are seeked. -@c @menu -@c * con command line options:: -@c * con usage example:: -@c * con hints:: -@c @end menu - +@c ------------------------------------------------------------------------------- +@c UNFLA +@c ------------------------------------------------------------------------------- @page @node unfla @@ -1915,9 +1932,19 @@ default, segments containing a field @code{BOS} are seeked. @multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} @item @strong{Authors:} @tab Tomasz Obrêbski -@item @strong{Component category:} @tab filter +@item @strong{Input format:} @tab UTT flattened +@item @strong{Output format:} @tab UTT regular +@item @strong{Required annotation:} @tab - @end multitable +@menu +* unfla description:: +@c * fla command line options:: +@c * fla usage example:: +@end menu + +@node unfla description +@subsection Description @command{unfla} transforms a flattened UTT file, produced by @command{fla}, into the regular format by restoring end-of-line characters. @@ -1970,7 +1997,7 @@ cat text | tok | lem | cor -p W -S lem | lem -I cor | gue -p W -S lem @example -cat text | tok | lem --only-fail | cor -1 > output3 +cat text | tok | egrep ' W ' | lem | egrep -v 'lem:' | cor -1 @end example @item Expression extraction @@ -2019,43 +2046,44 @@ required by @command{grp} first, and then use the preprocessed data. As @command{grp} (@command{grep}) processes data faster then it is read from the disk drive, the search time may be still shortened by -using file compression techniques. We suggest usin @command{lzop}. +using file compression techniques. 
We suggest using the +@command{lzop} compressor/decompressor. @item the fastest way to search a large corpus -step 1: preprocessing +step 1: corpus preprocessing @example cat corpus | tok | sen | lem -1 \ -| grp -a p | lzop -7 > corpus.grp.lzo +| fla | lzop -7 > corpus.grp.lzo @end example step 2: search @example -lzop -cd corpus.grp.lzo | grp -a gP -e 'cat() space +lzop -cd corpus.grp.lzo | unfla | grp -e 'cat() space lexeme(rozmowa)' | ser -e 'cat() space lexeme(rozmowa)' | con @end example @end enumerate -@subsubheading More complicated configurations +@c @subsubheading More complicated configurations -@example -mknod fifo1 p -mknod fifo2 p -mknod fifo3 p -mknod fifo4 p -mknod fifo5 p +@c @example +@c mknod fifo1 p +@c mknod fifo2 p +@c mknod fifo3 p +@c mknod fifo4 p +@c mknod fifo5 p -tok | lem -p W -e fifo1 > fifo2 & -cor -e fifo3 < fifo1 | lem > fifo4 & -gue < fifo3 > fifo5 & -sort -m fifo2 fifo4 fifo5 +@c tok | lem -p W -e fifo1 > fifo2 & +@c cor -e fifo3 < fifo1 | lem > fifo4 & +@c gue < fifo3 > fifo5 & +@c sort -m fifo2 fifo4 fifo5 -rm fifo? -@end example +@c rm fifo? 
+@c @end example @c --------------------------------------------------------------------- diff --git a/app/src/common/cmdline_common.ggo b/app/src/common/cmdline_common.ggo index 5dfafda..45a385a 100644 --- a/app/src/common/cmdline_common.ggo +++ b/app/src/common/cmdline_common.ggo @@ -1,17 +1,17 @@ #section "Common UTT options" -option "input" f "Input file" string no hidden +option "input" f "Input file" string no -option "output" o "Output file" string no hidden +option "output" o "Output file for successfully processed segments" string no -option "fail" e "Output file for unsuccesfully processed segments " string no hidden +option "fail" e "Output file for unsuccessfully processed segments " string no option "only-fail" - "Print only segments the program failed to process" flag off hidden option "no-fail" - "Print only segments the program processed" flag off hidden -option "copy" c "Copy succesfully processed segments to standard output" flag off hidden +option "copy" c "Copy successfully processed segments to standard output" flag off option "process" p "Process segments with this tag" string no multiple diff --git a/app/src/cor/Makefile b/app/src/cor/Makefile index ecdfa12..08b9a70 100644 --- a/app/src/cor/Makefile +++ b/app/src/cor/Makefile @@ -1,5 +1,4 @@ -PAR=-Wno-deprecated -m32 -fpermissive -# -static +PAR=-Wno-deprecated -m32 -fpermissive -static PAR2=-c -Wno-deprecated -m32 -fpermissive LIB_PATH=../lib COMMON_PATH=../common diff --git a/app/src/cor/cmdline_cor.ggo b/app/src/cor/cmdline_cor.ggo index c2062e5..810d511 100644 --- a/app/src/cor/cmdline_cor.ggo +++ b/app/src/cor/cmdline_cor.ggo @@ -4,5 +4,5 @@ version "0.1" option "dictionary-home" - "Dictionary home dir." string typestr="FILENAME" no hidden option "dictionary" d "Dictionary" string typestr="FILENAME" default="cor.bin" no option "distance" n "Maximal edit distance." int default="1" no -option "replace" r "Replace original form with corrected form, place original form in the cor field.
This option has no effect in single mode" flag off +option "replace" r "Replace original form with corrected form, place original form in the cor field. This option has no effect in single mode" flag off hidden #option "single" - "Place all alternatives in the same line" flag off diff --git a/app/src/gue/Makefile b/app/src/gue/Makefile index 135d71f..abd1da5 100644 --- a/app/src/gue/Makefile +++ b/app/src/gue/Makefile @@ -1,5 +1,4 @@ -PAR=-Wno-deprecated -O3 -fpermissive -m32 -#-static +PAR=-Wno-deprecated -O3 -fpermissive -m32 -static PAR2=-c -Wno-deprecated -O3 -fpermissive -m32 LIB_PATH=../lib COMMON_PATH=../common diff --git a/app/src/kor/Makefile b/app/src/kor/Makefile index 62dbc94..68113a9 100755 --- a/app/src/kor/Makefile +++ b/app/src/kor/Makefile @@ -1,5 +1,4 @@ -PAR=-Wno-deprecated -m32 -fpermissive -# -static +PAR=-Wno-deprecated -m32 -fpermissive -static PAR2=-c -Wno-deprecated -m32 -fpermissive LIB_PATH=../lib COMMON_PATH=../common diff --git a/app/src/lem/Makefile b/app/src/lem/Makefile index bbfe110..a04fde9 100644 --- a/app/src/lem/Makefile +++ b/app/src/lem/Makefile @@ -1,6 +1,5 @@ -PAR=-Wno-deprecated -m32 -O3 -fpermissive -#-static -PAR2=-c -Wno-deprecated -m32 -O3 -fpermissive +PAR=-Wno-deprecated -m32 -O3 -fpermissive -static +PAR2=-c -Wno-deprecated -m32 -O3 -fpermissive -static LIB_PATH=../lib COMMON_PATH=../common CMDLINE_FILE='"../lem/cmdline.h"' diff --git a/lang/Makefile b/lang/Makefile index 0b0555a..2bb9125 100644 --- a/lang/Makefile +++ b/lang/Makefile @@ -10,6 +10,7 @@ export UTT_DIC_BIN=$(CUR_DIR)/dic #directory where distribution will be placed export UTT_DIC_OUTPUT=${CUR_DIR} +export LANG_MODULES=pl_PL.ISO-8859-2 pl_PL.UTF-8 # path to dictionary compiler DIC_COMPILER=../app/src/compiledic/compiledic @@ -31,3 +32,9 @@ distribute: compile dist_tarball dist_tarball: cd dist && make tarball; cd ${CUR_DIR}; + +.PHONY: dist_tarball_pl_PL.ISO-8859-2 +dist_tarball_pl_PL.ISO-8859-2: + export DIC_LANG=pl_PL.ISO-8859-2 && \ + cd dist && $(MAKE) tarball; cd
${CUR_DIR}; + diff --git a/lang/dist/tarball/Makefile b/lang/dist/tarball/Makefile index 2d43d72..4222ba6 100644 --- a/lang/dist/tarball/Makefile +++ b/lang/dist/tarball/Makefile @@ -12,7 +12,8 @@ _UTT_REL=$(shell cat ../../../app/dist/common/release.def) # Temp vars _TARBALL_ROOT=$(DIR)/utt-$(_UTT_VER).$(_UTT_REL) _UTT_DIC_HOME=share/utt -_TAR_FILE_NAME=utt.dic.$(_UTT_VER)_$(_UTT_REL) +_TAR_FILE_NAME=utt.$(_UTT_VER)_$(_UTT_REL) + #defualt task .PHONY: default @@ -20,7 +21,7 @@ default: @echo Build directory: ${UTT_DIC_BIN} @echo Output directory for tarball: ${UTT_DIC_OUTPUT} mkdir -p ${_TARBALL_ROOT}/${_UTT_DIC_HOME} - if test -n "${DIC_LANG}" -a -d ${UTT_DIC_BIN}/${DIC_LANG} ; \ + if [[ -n "${DIC_LANG}" && -d ${UTT_DIC_BIN}/${DIC_LANG} ]]; \ then \ echo "Tworze dystrybucje ${DIC_LANG}"; \ mkdir -p ${_TARBALL_ROOT}/${_UTT_DIC_HOME}/${DIC_LANG}; \