Ta linia i następne zostaną zignorowane--

M app/dist/files/README uaktualnione M app/doc/utt.texinfo dopiski M app/src/gue/Makefile statyczne biblioteki M app/src/cor/cmdline_cor.ggo usuniecie nie dzialajacych parametrow M app/src/cor/Makefile statyczne biblioteki M app/src/common/cmdline_common.ggo ? M app/src/kor/Makefile statyczne biblioteki M app/src/lem/Makefile statyczne biblioteki M lang/dist/tarball/Makefile pakowanie modulow jezykowych po jednym M lang/Makefile -"- git-svn-id: svn://atos.wmid.amu.edu.pl/utt@61 e293616e-ec6a-49c2-aa92-f4a8b91c5d16
2008-10-29 10:17:16 +00:00 · 2008-10-29 10:17:16 +00:00 · e28a625259
commit e28a625259
parent 261bf629fb
10 changed files with 110 additions and 48 deletions
--- a/app/dist/files/README
+++ b/app/dist/files/README
@ -17,5 +17,35 @@ unrestricted text for any conceivable purpose.

 Installation
 **************
-Run utt_make_config.pl to create configuration files.
-Configuration files will be created in ~/.utt/
+
+1) unpack the UTT tar archive
+2) in the same directory, unpack the tar archives of all UTT dictionary modules you have
+3) run
+	make install
+   in the root directory of the installation
+4) add the bin directory to the PATH variable
+
+
+Requirements
+*************
+
+* File::HomeDir
+
+  the Perl package File::HomeDir must be installed
+  (to install the package, run 'perl -MCPAN -e shell' and write
+   'install File::HomeDir' after the 'cpan>' prompt appears)
+   
+* flex
+
+  to run the ser component, flex must be installed in your system
+
+* ruby
+
+  to run the tre component, ruby must be installed in your system
+
+* locale pl_PL.iso-8859-2
+
+  the pl_PL.iso-8859-2 locale (pl_PL for short) must be installed
+  and active while using UTT with the Polish module. The text you
+  process with UTT must be encoded in iso-8859-2.
+  
--- a/app/doc/utt.texinfo
+++ b/app/doc/utt.texinfo
@ -366,7 +366,7 @@ covered by the second segment and no segment starts at position

@section Flattened UTT file

-A UTT file format has two variants: regular and flattend. The regular
+A UTT file format has two variants: regular and flattened. The regular
 format was described above.  In the flattened format some of the
 end-of-line characters are replaced with form-feed characters.

@ -1607,11 +1607,11 @@ compression tool (grp usually processes data faster than it is read from a
 disk, especially for slow laptop drives).

@example
-cat corpus | tok | sen | lem | grp -a p | lzop -7 > corpus.grp.lzo
+cat corpus | tok | sen | lem -1 | fla | lzop -7 > corpus.grp.lzo
@end example

@example
-lzop -cd corpus.grp.lzo | grp -a gP -e @var{EXPR} | ser -e @var{EXPR}
+lzop -cd corpus.grp.lzo | grp -e @var{EXPR} | unfla | ser -e @var{EXPR}
@end example


@ -1626,11 +1626,15 @@ lzop -cd corpus.grp.lzo | grp -a gP -e @var{EXPR} | ser -e @var{EXPR}

@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:}                 @tab Marcin Walas, Tomasz Obrębski
-@item @strong{Component category:}      @tab filter
+@item @strong{Input format:}            @tab UTT flattened
+@item @strong{Output format:}           @tab UTT flattened
+@item @strong{Required annotation:}     @tab tok, sen, lem -1
@end multitable

 [TODO]

+(see mar's help 'mar -h' for some information)
+
@c ---------------------------------------------------------------------
@c KOT
@c ---------------------------------------------------------------------
@ -1870,16 +1874,32 @@ termination of the program.
@c @end menu


+@c -------------------------------------------------------------------------------
+@c FLA
+@c -------------------------------------------------------------------------------
+
@page
@node fla
@section fla - the UTT file flattener

@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:}                 @tab Tomasz Obrębski
-@item @strong{Component category:}      @tab filter
+@item @strong{Input format:}            @tab UTT regular
+@item @strong{Output format:}           @tab UTT flattened
+@item @strong{Required annotation:}     @tab sen
@end multitable
@c

+@menu
+* fla description::
+@c * fla command line options::
+@c * fla usage example::
+@end menu
+
+
+@node fla description
+@subsection Description
+
@command{fla} ``flattens'' a utt file by merging segments belonging
 to one sentence in one line. Technically, end-of-line characters
 ('\n', ASCII code 10) are replaced with form-feed characters ('\f',
@ -1901,13 +1921,10 @@ The facultative argument is a regular expression describing segments
 which should be treated as sentence beginnings (the test is: the
 segment contains a fragment matching the @code{<bosregex>}). By
 default, segments containing a @code{BOS} field are sought.
-@c @menu
-@c * con command line options::
-@c * con usage example::
-@c * con hints::    
-@c @end menu
-

+@c -------------------------------------------------------------------------------
+@c UNFLA
+@c -------------------------------------------------------------------------------

@page
@node unfla
@ -1915,9 +1932,19 @@ default, segments containing a field @code{BOS} are seeked.

@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
@item @strong{Authors:}                 @tab Tomasz Obrębski
-@item @strong{Component category:}      @tab filter
+@item @strong{Input format:}            @tab UTT flattened
+@item @strong{Output format:}           @tab UTT regular
+@item @strong{Required annotation:}     @tab -
@end multitable

+@menu
+* unfla description::
+@c * unfla command line options::
+@c * unfla usage example::
+@end menu
+
+@node unfla description
+@subsection Description
@command{unfla} transforms a flattened UTT file, produced by
@command{fla}, into the regular format by restoring end-of-line
 characters.
@ -1970,7 +1997,7 @@ cat text | tok | lem | cor -p W -S lem | lem -I cor | gue -p W -S lem


@example
-cat text | tok | lem --only-fail | cor -1 > output3
+cat text | tok | egrep ' W ' | lem | egrep -v 'lem:' | cor -1
@end example

@item Expression extraction
@ -2019,43 +2046,44 @@ required by @command{grp} first, and then use the preprocessed data.

 As @command{grp} (@command{grep}) processes data faster than it is
 read from the disk drive, the search time may be still shortened by
-using file compression techniques.  We suggest usin @command{lzop}.
+using file compression techniques.  We suggest using the
+@command{lzop} compressor/decompressor.

@item the fastest way to search a large corpus

-step 1: preprocessing
+step 1: corpus preprocessing

@example
 cat corpus | tok | sen | lem -1 \
-| grp -a p | lzop -7 > corpus.grp.lzo
+| fla | lzop -7 > corpus.grp.lzo
@end example

 step 2: search

@example
-lzop -cd corpus.grp.lzo | grp -a gP -e 'cat(<V>) space
+lzop -cd corpus.grp.lzo | unfla | grp -e 'cat(<V>) space
 lexeme(rozmowa)' | ser -e 'cat(<V>) space lexeme(rozmowa)' | con
@end example

@end enumerate

-@subsubheading More complicated configurations
+@c @subsubheading More complicated configurations


-@example
-mknod fifo1 p
-mknod fifo2 p
-mknod fifo3 p
-mknod fifo4 p
-mknod fifo5 p
+@c @example
+@c mknod fifo1 p
+@c mknod fifo2 p
+@c mknod fifo3 p
+@c mknod fifo4 p
+@c mknod fifo5 p

-tok | lem -p W -e fifo1 > fifo2 &
-cor -e fifo3 < fifo1 | lem > fifo4 &
-gue < fifo3 > fifo5 &
-sort -m fifo2 fifo4 fifo5
+@c tok | lem -p W -e fifo1 > fifo2 &
+@c cor -e fifo3 < fifo1 | lem > fifo4 &
+@c gue < fifo3 > fifo5 &
+@c sort -m fifo2 fifo4 fifo5

-rm fifo?
-@end example
+@c rm fifo?
+@c @end example


@c ---------------------------------------------------------------------
--- a/app/src/common/cmdline_common.ggo
+++ b/app/src/common/cmdline_common.ggo
@ -1,17 +1,17 @@
 #section "Common UTT options"


-option  "input"		f	"Input file" string no hidden
+option  "input"		f	"Input file" string no

-option  "output"	o	"Output file" string no hidden
+option  "output"	o	"Output file for succesfully processed segments" string no

-option  "fail"		e	"Output file for unsuccesfully processed segments " string no hidden
+option  "fail"		e	"Output file for unsuccesfully processed segments " string no

 option 	"only-fail"	-	"Print only segments the program failed to process" flag off hidden

 option 	"no-fail"	-	"Print only segments the program processed" flag off hidden 

-option  "copy"		c       "Copy succesfully processed segments to standard output" flag off hidden
+option  "copy"		c       "Copy succesfully processed segments to standard output" flag off

 option  "process"	p	"Process segments with this tag" string no multiple

--- a/app/src/cor/Makefile
+++ b/app/src/cor/Makefile
@ -1,5 +1,4 @@
-PAR=-Wno-deprecated -m32 -fpermissive
-# -static
+PAR=-Wno-deprecated -m32 -fpermissive -static
 PAR2=-c -Wno-deprecated -m32 -fpermissive
 LIB_PATH=../lib
 COMMON_PATH=../common
--- a/app/src/cor/cmdline_cor.ggo
+++ b/app/src/cor/cmdline_cor.ggo
@ -4,5 +4,5 @@ version "0.1"
 option "dictionary-home"	-	"Dictionary home dir." string typestr="FILENAME" no hidden
 option "dictionary"		d	"Dictionary" string typestr="FILENAME" default="cor.bin" no
 option "distance"		n	"Maximal edit distance." int default="1" no
-option "replace"		r	"Replace original form with corrected form, place original form in the cor field. This option has no effect in single mode" flag off
+option "replace"		r	"Replace original form with corrected form, place original form in the cor field. This option has no effect in single mode" flag off hidden
 #option "single"			-	"Place all alternatives in the same line" flag off
--- a/app/src/gue/Makefile
+++ b/app/src/gue/Makefile
@ -1,5 +1,4 @@
-PAR=-Wno-deprecated -O3 -fpermissive -m32
-#-static
+PAR=-Wno-deprecated -O3 -fpermissive -m32 -static
 PAR2=-c -Wno-deprecated -O3 -fpermissive -m32
 LIB_PATH=../lib
 COMMON_PATH=../common
--- a/app/src/kor/Makefile
+++ b/app/src/kor/Makefile
@ -1,5 +1,4 @@
-PAR=-Wno-deprecated -m32 -fpermissive
-# -static
+PAR=-Wno-deprecated -m32 -fpermissive -static
 PAR2=-c -Wno-deprecated -m32 -fpermissive
 LIB_PATH=../lib
 COMMON_PATH=../common
--- a/app/src/lem/Makefile
+++ b/app/src/lem/Makefile
@ -1,6 +1,5 @@
-PAR=-Wno-deprecated -m32 -O3 -fpermissive
-#-static
-PAR2=-c -Wno-deprecated -m32 -O3 -fpermissive
+PAR=-Wno-deprecated -m32 -O3 -fpermissive -static
+PAR2=-c -Wno-deprecated -m32 -O3 -fpermissive -static
 LIB_PATH=../lib
 COMMON_PATH=../common
 CMDLINE_FILE='"../lem/cmdline.h"'
--- a/lang/Makefile
+++ b/lang/Makefile
@ -10,6 +10,7 @@ export UTT_DIC_BIN=$(CUR_DIR)/dic
 #directory where distribution will be placed
 export UTT_DIC_OUTPUT=${CUR_DIR}

+export LANG_MODULES=pl_PL.ISO-8859-2 pl_PL.UTF-8

 # path to dictionary compiler
 DIC_COMPILER=../app/src/compiledic/compiledic
@ -31,3 +32,9 @@ distribute: compile dist_tarball
 dist_tarball:
 	cd dist && make tarball; cd ${CUR_DIR};
 	
+# Build the distribution tarball for the pl_PL.ISO-8859-2 language module only.
+.PHONY: dist_tarball_pl_PL.ISO-8859-2
+dist_tarball_pl_PL.ISO-8859-2:
+	export DIC_LANG=pl_PL.ISO-8859-2 && \
+	cd dist && $(MAKE) tarball
+	
--- a/lang/dist/tarball/Makefile
+++ b/lang/dist/tarball/Makefile
@ -12,7 +12,8 @@ _UTT_REL=$(shell cat ../../../app/dist/common/release.def)
 # Temp vars
 _TARBALL_ROOT=$(DIR)/utt-$(_UTT_VER).$(_UTT_REL)
 _UTT_DIC_HOME=share/utt
-_TAR_FILE_NAME=utt.dic.$(_UTT_VER)_$(_UTT_REL)
+_TAR_FILE_NAME=utt.$(_UTT_VER)_$(_UTT_REL)
+

 #defualt task
 .PHONY: default
@ -20,7 +21,7 @@ default:
 	@echo Build directory: ${UTT_DIC_BIN}
 	@echo Output directory for tarball: ${UTT_DIC_OUTPUT}
 	mkdir -p ${_TARBALL_ROOT}/${_UTT_DIC_HOME}
-	if test -n "${DIC_LANG}" -a -d ${UTT_DIC_BIN}/${DIC_LANG} ; \
+	if [[ -n "${DIC_LANG}" && -d ${UTT_DIC_BIN}/${DIC_LANG} ]]; \
 	then \
 	    echo "Tworze dystrybucje ${DIC_LANG}"; \
 	    mkdir -p ${_TARBALL_ROOT}/${_UTT_DIC_HOME}/${DIC_LANG}; \