Ta linia i następne zostaną zignorowane--
M app/dist/files/README uaktualnione M app/doc/utt.texinfo dopiski M app/src/gue/Makefile statyczne biblioteki M app/src/cor/cmdline_cor.ggo usuniecie nie dzialajacych parametrow M app/src/cor/Makefile statyczne biblioteki M app/src/common/cmdline_common.ggo ? M app/src/kor/Makefile statyczne biblioteki M app/src/lem/Makefile statyczne biblioteki M lang/dist/tarball/Makefile pakowanie modulow jezykowych po jednym M lang/Makefile -"- git-svn-id: svn://atos.wmid.amu.edu.pl/utt@61 e293616e-ec6a-49c2-aa92-f4a8b91c5d16
This commit is contained in:
parent
261bf629fb
commit
e28a625259
34
app/dist/files/README
vendored
34
app/dist/files/README
vendored
@ -17,5 +17,35 @@ unrestricted text for any conceivable purpose.
|
||||
|
||||
Installation
|
||||
**************
|
||||
Run utt_make_config.pl to create configuration files.
|
||||
Configuration files will be created in ~/.utt/
|
||||
|
||||
1) unpack the UTT tar archive
|
||||
2) in the same directory, unpack the tar archives of all UTT dictionary modules you have
|
||||
3) run
|
||||
make install
|
||||
in the root directory of the installation
|
||||
4) add the bin directory to the PATH variable
|
||||
|
||||
|
||||
Requirements
|
||||
*************
|
||||
|
||||
* File::HomeDir
|
||||
|
||||
the Perl package File::HomeDir must be installed
|
||||
(to install the package, run 'perl -MCPAN -e shell' and write
|
||||
'install File::HomeDir' after the 'cpan>' prompt appears)
|
||||
|
||||
* flex
|
||||
|
||||
to run the ser component, flex must be installed in your system
|
||||
|
||||
* ruby
|
||||
|
||||
to run the tre component, ruby must be installed in your system
|
||||
|
||||
* locale pl_PL.iso-8859-2
|
||||
|
||||
the locale pl_PL.iso-8859-2 (pl_PL for short) must be installed
|
||||
and set while using UTT with the Polish module. The text you
|
||||
process with UTT must be encoded in iso-8859-2.
|
||||
|
||||
|
@ -366,7 +366,7 @@ covered by the second segment and no segment starts at position
|
||||
|
||||
@section Flattened UTT file
|
||||
|
||||
A UTT file format has two variants: regular and flattend. The regular
|
||||
A UTT file format has two variants: regular and flattened. The regular
|
||||
format was described above. In the flattened format some of the
|
||||
end-of-line characters are replaced with form-feed characters.
|
||||
|
||||
@ -1607,11 +1607,11 @@ compression tool (grp usually processes data faster than it is read from a
|
||||
disk, especially for slow laptop drives).
|
||||
|
||||
@example
|
||||
cat corpus | tok | sen | lem | grp -a p | lzop -7 > corpus.grp.lzo
|
||||
cat corpus | tok | sen | lem -1 | fla | lzop -7 > corpus.grp.lzo
|
||||
@end example
|
||||
|
||||
@example
|
||||
lzop -cd corpus.grp.lzo | grp -a gP -e @var{EXPR} | ser -e @var{EXPR}
|
||||
lzop -cd corpus.grp.lzo | grp -e @var{EXPR} | unfla | ser -e @var{EXPR}
|
||||
@end example
|
||||
|
||||
|
||||
@ -1626,11 +1626,15 @@ lzop -cd corpus.grp.lzo | grp -a gP -e @var{EXPR} | ser -e @var{EXPR}
|
||||
|
||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||
@item @strong{Authors:} @tab Marcin Walas, Tomasz Obrębski
|
||||
@item @strong{Component category:} @tab filter
|
||||
@item @strong{Input format:} @tab UTT flattened
|
||||
@item @strong{Output format:} @tab UTT flattened
|
||||
@item @strong{Required annotation:} @tab tok, sen, lem -1
|
||||
@end multitable
|
||||
|
||||
[TODO]
|
||||
|
||||
(see mar's help 'mar -h' for some information)
|
||||
|
||||
@c ---------------------------------------------------------------------
|
||||
@c KOT
|
||||
@c ---------------------------------------------------------------------
|
||||
@ -1870,16 +1874,32 @@ termination of the program.
|
||||
@c @end menu
|
||||
|
||||
|
||||
@c -------------------------------------------------------------------------------
|
||||
@c FLA
|
||||
@c -------------------------------------------------------------------------------
|
||||
|
||||
@page
|
||||
@node fla
|
||||
@section fla - the UTT file flattener
|
||||
|
||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||
@item @strong{Authors:} @tab Tomasz Obrębski
|
||||
@item @strong{Component category:} @tab filter
|
||||
@item @strong{Input format:} @tab UTT regular
|
||||
@item @strong{Output format:} @tab UTT flattened
|
||||
@item @strong{Required annotation:} @tab sen
|
||||
@end multitable
|
||||
@c
|
||||
|
||||
@menu
|
||||
* fla description::
|
||||
@c * fla command line options::
|
||||
@c * fla usage example::
|
||||
@end menu
|
||||
|
||||
|
||||
@node fla description
|
||||
@subsection Description
|
||||
|
||||
@command{fla} ``flattens'' a utt file by merging segments belonging
|
||||
to one sentence in one line. Technically, end-of-line characters
|
||||
('\n', ASCII code 10) are replaced with form-feed characters ('\f',
|
||||
@ -1901,13 +1921,10 @@ The facultative argument is a regular expression describing segments
|
||||
which should be treated as sentence beginnings (the test is: the
|
||||
segment contains a fragment matching the @code{<bosregex>}). By
|
||||
default, segments containing a field @code{BOS} are sought.
|
||||
@c @menu
|
||||
@c * con command line options::
|
||||
@c * con usage example::
|
||||
@c * con hints::
|
||||
@c @end menu
|
||||
|
||||
|
||||
@c -------------------------------------------------------------------------------
|
||||
@c UNFLA
|
||||
@c -------------------------------------------------------------------------------
|
||||
|
||||
@page
|
||||
@node unfla
|
||||
@ -1915,9 +1932,19 @@ default, segments containing a field @code{BOS} are seeked.
|
||||
|
||||
@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa}
|
||||
@item @strong{Authors:} @tab Tomasz Obrębski
|
||||
@item @strong{Component category:} @tab filter
|
||||
@item @strong{Input format:} @tab UTT flattened
|
||||
@item @strong{Output format:} @tab UTT regular
|
||||
@item @strong{Required annotation:} @tab -
|
||||
@end multitable
|
||||
|
||||
@menu
|
||||
* unfla description::
|
||||
@c * fla command line options::
|
||||
@c * fla usage example::
|
||||
@end menu
|
||||
|
||||
@node unfla description
|
||||
@subsection Description
|
||||
@command{unfla} transforms a flattened UTT file, produced by
|
||||
@command{fla}, into the regular format by restoring end-of-line
|
||||
characters.
|
||||
@ -1970,7 +1997,7 @@ cat text | tok | lem | cor -p W -S lem | lem -I cor | gue -p W -S lem
|
||||
|
||||
|
||||
@example
|
||||
cat text | tok | lem --only-fail | cor -1 > output3
|
||||
cat text | tok | egrep ' W ' | lem | egrep -v 'lem:' | cor -1
|
||||
@end example
|
||||
|
||||
@item Expression extraction
|
||||
@ -2019,43 +2046,44 @@ required by @command{grp} first, and then use the preprocessed data.
|
||||
|
||||
As @command{grp} (@command{grep}) processes data faster than it is
|
||||
read from the disk drive, the search time may be still shortened by
|
||||
using file compression techniques. We suggest usin @command{lzop}.
|
||||
using file compression techniques. We suggest using the
|
||||
@command{lzop} compressor/decompressor.
|
||||
|
||||
@item the fastest way to search a large corpus
|
||||
|
||||
step 1: preprocessing
|
||||
step 1: corpus preprocessing
|
||||
|
||||
@example
|
||||
cat corpus | tok | sen | lem -1 \
|
||||
| grp -a p | lzop -7 > corpus.grp.lzo
|
||||
| fla | lzop -7 > corpus.grp.lzo
|
||||
@end example
|
||||
|
||||
step 2: search
|
||||
|
||||
@example
|
||||
lzop -cd corpus.grp.lzo | grp -a gP -e 'cat(<V>) space
|
||||
lzop -cd corpus.grp.lzo | unfla | grp -e 'cat(<V>) space
|
||||
lexeme(rozmowa)' | ser -e 'cat(<V>) space lexeme(rozmowa)' | con
|
||||
@end example
|
||||
|
||||
@end enumerate
|
||||
|
||||
@subsubheading More complicated configurations
|
||||
@c @subsubheading More complicated configurations
|
||||
|
||||
|
||||
@example
|
||||
mknod fifo1 p
|
||||
mknod fifo2 p
|
||||
mknod fifo3 p
|
||||
mknod fifo4 p
|
||||
mknod fifo5 p
|
||||
@c @example
|
||||
@c mknod fifo1 p
|
||||
@c mknod fifo2 p
|
||||
@c mknod fifo3 p
|
||||
@c mknod fifo4 p
|
||||
@c mknod fifo5 p
|
||||
|
||||
tok | lem -p W -e fifo1 > fifo2 &
|
||||
cor -e fifo3 < fifo1 | lem > fifo4 &
|
||||
gue < fifo3 > fifo5 &
|
||||
sort -m fifo2 fifo4 fifo5
|
||||
@c tok | lem -p W -e fifo1 > fifo2 &
|
||||
@c cor -e fifo3 < fifo1 | lem > fifo4 &
|
||||
@c gue < fifo3 > fifo5 &
|
||||
@c sort -m fifo2 fifo4 fifo5
|
||||
|
||||
rm fifo?
|
||||
@end example
|
||||
@c rm fifo?
|
||||
@c @end example
|
||||
|
||||
|
||||
@c ---------------------------------------------------------------------
|
||||
|
@ -1,17 +1,17 @@
|
||||
#section "Common UTT options"
|
||||
|
||||
|
||||
option "input" f "Input file" string no hidden
|
||||
option "input" f "Input file" string no
|
||||
|
||||
option "output" o "Output file" string no hidden
|
||||
option "output" o "Output file for succesfully processed segments" string no
|
||||
|
||||
option "fail" e "Output file for unsuccesfully processed segments " string no hidden
|
||||
option "fail" e "Output file for unsuccesfully processed segments " string no
|
||||
|
||||
option "only-fail" - "Print only segments the program failed to process" flag off hidden
|
||||
|
||||
option "no-fail" - "Print only segments the program processed" flag off hidden
|
||||
|
||||
option "copy" c "Copy succesfully processed segments to standard output" flag off hidden
|
||||
option "copy" c "Copy succesfully processed segments to standard output" flag off
|
||||
|
||||
option "process" p "Process segments with this tag" string no multiple
|
||||
|
||||
|
@ -1,5 +1,4 @@
|
||||
PAR=-Wno-deprecated -m32 -fpermissive
|
||||
# -static
|
||||
PAR=-Wno-deprecated -m32 -fpermissive -static
|
||||
PAR2=-c -Wno-deprecated -m32 -fpermissive
|
||||
LIB_PATH=../lib
|
||||
COMMON_PATH=../common
|
||||
|
@ -4,5 +4,5 @@ version "0.1"
|
||||
option "dictionary-home" - "Dictionary home dir." string typestr="FILENAME" no hidden
|
||||
option "dictionary" d "Dictionary" string typestr="FILENAME" default="cor.bin" no
|
||||
option "distance" n "Maximal edit distance." int default="1" no
|
||||
option "replace" r "Replace original form with corrected form, place original form in the cor field. This option has no effect in single mode" flag off
|
||||
option "replace" r "Replace original form with corrected form, place original form in the cor field. This option has no effect in single mode" flag off hidden
|
||||
#option "single" - "Place all alternatives in the same line" flag off
|
||||
|
@ -1,5 +1,4 @@
|
||||
PAR=-Wno-deprecated -O3 -fpermissive -m32
|
||||
#-static
|
||||
PAR=-Wno-deprecated -O3 -fpermissive -m32 -static
|
||||
PAR2=-c -Wno-deprecated -O3 -fpermissive -m32
|
||||
LIB_PATH=../lib
|
||||
COMMON_PATH=../common
|
||||
|
@ -1,5 +1,4 @@
|
||||
PAR=-Wno-deprecated -m32 -fpermissive
|
||||
# -static
|
||||
PAR=-Wno-deprecated -m32 -fpermissive -static
|
||||
PAR2=-c -Wno-deprecated -m32 -fpermissive
|
||||
LIB_PATH=../lib
|
||||
COMMON_PATH=../common
|
||||
|
@ -1,6 +1,5 @@
|
||||
PAR=-Wno-deprecated -m32 -O3 -fpermissive
|
||||
#-static
|
||||
PAR2=-c -Wno-deprecated -m32 -O3 -fpermissive
|
||||
PAR=-Wno-deprecated -m32 -O3 -fpermissive -static
|
||||
PAR2=-c -Wno-deprecated -m32 -O3 -fpermissive -static
|
||||
LIB_PATH=../lib
|
||||
COMMON_PATH=../common
|
||||
CMDLINE_FILE='"../lem/cmdline.h"'
|
||||
|
@ -10,6 +10,7 @@ export UTT_DIC_BIN=$(CUR_DIR)/dic
|
||||
#directory where distribution will be placed
|
||||
export UTT_DIC_OUTPUT=${CUR_DIR}
|
||||
|
||||
export LANG_MODULES=pl_PL.ISO-8852-2 pl_PL.UTF-8
|
||||
|
||||
# path to dictionary compiler
|
||||
DIC_COMPILER=../app/src/compiledic/compiledic
|
||||
@ -31,3 +32,9 @@ distribute: compile dist_tarball
|
||||
dist_tarball:
|
||||
cd dist && make tarball; cd ${CUR_DIR};
|
||||
|
||||
|
||||
.PHONY: dist_tarball_pl_PL.ISO-8859-2
|
||||
dist_tarball:
|
||||
export DIC_LANG=pl_PL.ISO-8859-2 && \
|
||||
cd dist && make tarball; cd ${CUR_DIR};
|
||||
|
||||
|
5
lang/dist/tarball/Makefile
vendored
5
lang/dist/tarball/Makefile
vendored
@ -12,7 +12,8 @@ _UTT_REL=$(shell cat ../../../app/dist/common/release.def)
|
||||
# Temp vars
|
||||
_TARBALL_ROOT=$(DIR)/utt-$(_UTT_VER).$(_UTT_REL)
|
||||
_UTT_DIC_HOME=share/utt
|
||||
_TAR_FILE_NAME=utt.dic.$(_UTT_VER)_$(_UTT_REL)
|
||||
_TAR_FILE_NAME=utt.$(_UTT_VER)_$(_UTT_REL)
|
||||
|
||||
|
||||
#default task
|
||||
.PHONY: default
|
||||
@ -20,7 +21,7 @@ default:
|
||||
@echo Build directory: ${UTT_DIC_BIN}
|
||||
@echo Output directory for tarball: ${UTT_DIC_OUTPUT}
|
||||
mkdir -p ${_TARBALL_ROOT}/${_UTT_DIC_HOME}
|
||||
if test -n "${DIC_LANG}" -a -d ${UTT_DIC_BIN}/${DIC_LANG} ; \
|
||||
if [[ -n "${DIC_LANG}" && -d ${UTT_DIC_BIN}/${DIC_LANG} ]]; \
|
||||
then \
|
||||
echo "Tworze dystrybucje ${DIC_LANG}"; \
|
||||
mkdir -p ${_TARBALL_ROOT}/${_UTT_DIC_HOME}/${DIC_LANG}; \
|
||||
|
Loading…
Reference in New Issue
Block a user