diff --git a/app/Makefile b/app/Makefile new file mode 100644 index 0000000..ca29e63 --- /dev/null +++ b/app/Makefile @@ -0,0 +1,152 @@ +# main makefile + +BIN=bin +SRC=src +DIR=$(shell pwd) + +############################## + +UTT_DIST_NAME=utt-0.9 + +export UTT_DIR=${DIR}/${UTT_DIST_NAME} +# bin: executables, conf: configuration files, share: shared data, lang: language/encoding specific data, tags: tag format specific data, doc: documentation (no trailing comments on the lines below: make keeps the blanks before '#' as part of the value) +export UTT_BIN_DIR=${UTT_DIR}/bin +export UTT_CONF_DIR=${UTT_DIR}/conf +export UTT_SHARE_DIR=${UTT_DIR}/share +export UTT_LANG_DIR=${UTT_DIR}/lang +export UTT_TAGS_DIR=${UTT_DIR}/tags +#export UTT_LIB_DIR=${UTT_DIR}/lib # nothing +export UTT_DOC_DIR=${UTT_DIR}/doc + +UTT_DIST_FILE=utt + +# list of components to be included in the distribution + +COMPONENTS = lib gue tok.l cor lem kot sen-l sen-nl ser grp con fla unfla mar compiledic + +############################## + +all: dirs components conf doc lang tags files share + @echo "Make completed successfully!" + +.PHONY: dirs +dirs: + if [ -d ${UTT_DIR} ]; then rm -r ${UTT_DIR}; fi + mkdir -p ${UTT_DIR} + mkdir -p ${UTT_BIN_DIR} + mkdir -p ${UTT_CONF_DIR} + mkdir -p ${UTT_SHARE_DIR} + mkdir -p ${UTT_LANG_DIR} + mkdir -p ${UTT_TAGS_DIR} + mkdir -p ${UTT_DOC_DIR} +# sub-makes are invoked through $(MAKE) and chained with && so that a failing step stops the build +.PHONY: components +components: + @for cmp in $(COMPONENTS); do\ + $(MAKE) $$cmp || exit 1;\ + done + +${COMPONENTS}: + cd $(SRC)/$@ && $(MAKE) && $(MAKE) copy + +.PHONY: conf +conf: + cd $(DIR)/conf && $(MAKE) && $(MAKE) copy + +.PHONY: doc +doc: + cd $(DIR)/doc && $(MAKE) && $(MAKE) copy + +.PHONY: lang +lang: + cd $(DIR)/lang && $(MAKE) && $(MAKE) copy + +.PHONY: tags +tags: + cd $(DIR)/tags && $(MAKE) && $(MAKE) copy + +.PHONY: share +share: + cd $(DIR)/share && $(MAKE) && $(MAKE) copy + +.PHONY: files +files: + cd ${DIR}/files && $(MAKE) && $(MAKE) copy + + +clean: clean_components clean_doc clean_dist + @echo "All files cleaned successfully!"
+ +clean_components: + @for cmp in $(COMPONENTS); do \ + (cd $(SRC)/$$cmp && $(MAKE) clean) || echo "warning: clean failed in $$cmp" >&2; \ + done + +clean_lib: + cd $(SRC)/lib && $(MAKE) clean + +clean_doc: + cd $(DIR)/doc && $(MAKE) clean + + +# install/uninstall/reinstall delegate to the staged tree; a failure there is reported, not hidden by a trailing cd + +install: all + cd ${UTT_DIR} && $(MAKE) install + +uninstall: + cd ${UTT_DIR} && $(MAKE) uninstall + +reinstall: + cd ${UTT_DIR} && $(MAKE) reinstall + +# ifdef INSTALL_BIN_DIR +# if [ -d ${INSTALL_BIN_DIR} ]; then true; else mkdir -p ${INSTALL_BIN_DIR}; fi +# cp -r ${UTT_BIN_DIR}/* ${INSTALL_BIN_DIR}/ +# endif +# ifdef INSTALL_SHARE_DIR +# if [ -d ${INSTALL_SHARE_DIR} ]; then true; else mkdir -p ${INSTALL_SHARE_DIR}; fi +# cp -r ${UTT_SHARE_DIR}/* ${INSTALL_SHARE_DIR}/ +# endif +# ifdef INSTALL_DOC_DIR +# if [ -d ${INSTALL_DOC_DIR} ]; then true; else mkdir -p ${INSTALL_DOC_DIR}; fi +# cp -r ${UTT_DOC_DIR}/* ${INSTALL_DOC_DIR}/ +# endif +# ifdef INSTALL_LIB_DIR +# if [ -d ${INSTALL_LIB_DIR} ]; then true; else mkdir -p ${INSTALL_LIB_DIR}; fi +# cp -r ${UTT_LIB_DIR}/* ${INSTALL_LIB_DIR} +# endif + +# +# install: make_dirs install_components install_cnf install_dta install_doc +# @echo "Installation completed successfully!"
+ +# install_components: +# @for cmp in $(COMPONENTS); do \ +# cd $(SRC)/$$cmp && make install ; cd $(DIR); \ +# done + +# install_cnf: +# cp conf/*.conf $(UTT_ETC_DIR)/ + +# install_dta: +# cp -r data/* $(UTT_SHARE_DIR)/ + +# install_doc: +# cp doc/utt.{ps,pdf,html,info} $(UTT_DOC_DIR)/ + +# make_dirs: +# if [ -d $(UTT_BIN_DIR) ]; then true; else mkdir -p $(UTT_BIN_DIR); fi +# if [ -d $(UTT_ETC_DIR) ]; then true; else mkdir -p $(UTT_ETC_DIR); fi +# if [ -d $(UTT_SHARE_DIR) ]; then true; else mkdir -p $(UTT_SHARE_DIR); fi +# if [ -d $(UTT_DOC_DIR) ]; then true; else mkdir -p $(UTT_DOC_DIR); fi +# if [ -d $(UTT_LIB_DIR) ]; then true; else mkdir -p $(UTT_LIB_DIR); fi + +dist: all + tar -czvf $(UTT_DIST_NAME).tgz -C $(DIR) $(UTT_DIST_NAME) + +# clean_dist removes the staging directory and the archives; rm -f/-rf succeed when nothing is there +clean_dist: + rm -rf $(UTT_DIR) + rm -f $(UTT_DIST_NAME).tgz + rm -f $(UTT_DIST_FILE).tar.bz2 + diff --git a/app/README.developers b/app/README.developers new file mode 100644 index 0000000..6f87fe4 --- /dev/null +++ b/app/README.developers @@ -0,0 +1,16 @@ + +COMMANDS TO BE RUN IN THIS DIRECTORY: + +% make + +compiles all the components, moves all files destined for +distribution to the directory [the value of UTT_DIST_NAME variable in +Makefile] (currently utt-0.9) + +% make install + +installs the package in your system in the directory ~/.utt + +% make dist + +prepares distribution file named ${UTT_DIST_NAME}.tgz (currently utt-0.9.tgz) diff --git a/app/TODO b/app/TODO new file mode 100644 index 0000000..7271464 --- /dev/null +++ b/app/TODO @@ -0,0 +1,11 @@ +* wyprowadzic grp-pre i grp-post z grp do aux? +* zamienic kota na lepszego (Kubis) +* + +1. DONE. Makefile do gph (install). +2. (zrobione dla ser?) Nazwy pmdb2re -> pmdb.tag2re (grp, ser). +3. DONE. Usuniecie bibliotek (aplhabet, erro). +4. DONE (dla gue i lem) Poprawna obsluga opcji --one-line i oraz --one-field.
+--- +5. Zadania zwiazane z rozbudowa ser (src/ser/TODO). + diff --git a/app/dist/Makefile b/app/dist/Makefile new file mode 100644 index 0000000..090400d --- /dev/null +++ b/app/dist/Makefile @@ -0,0 +1,61 @@ +# compile task doesn't compile sources, but just copies some files +# this should be changed +# + +# I put here some variables + +# path, where binaries are placed +# (they will be processed for making distribution) +export _UTT_DIST_DIR=$(shell pwd)/bin +# path, where distribution file should be placed +export _UTT_DIST_OUTPUT=$(shell pwd) + + +# ----------------------------------------------------------- +# default task should display options +.PHONY: default +default: + @echo "Using: make compile|tarball|rpm|deb" + + +# ----------------------------------------------------------- +# ----------------------------------------------------------- +# this task should compile utt application +.PHONY: compile +compile: + if test -d ${_UTT_DIST_DIR}; then rm -fr ${_UTT_DIST_DIR}; fi + mkdir -p ${_UTT_DIST_DIR} + @# fake compilation + cp -r ../utt-0.9/* ${_UTT_DIST_DIR}/ + @# we add some extra file (required during installation) + cp common/create_utt_config.pl ${_UTT_DIST_DIR}/ + chmod 700 ${_UTT_DIST_DIR}/create_utt_config.pl + + +# ----------------------------------------------------------- +# this task should compile utt (if necessary) and create tar.gz version +.PHONY: tarball +tarball: compile + cd tarball && $(MAKE) + +# ----------------------------------------------------------- +# this task should compile utt (if necessary) and create rpm version +.PHONY: rpm +rpm: compile + @#we build rpm (see spec/README for details) + cd spec && $(MAKE) + +# ----------------------------------------------------------- +# this task should compile utt (if necessary) and create deb version +.PHONY: deb +deb: compile + @#we build deb (see deb/README for details) + cd deb && $(MAKE) + +# ----------------------------------------------------------- +# this task should remove compiled files
and directories +.PHONY: clean +clean: + # finally the line below should be uncomment + rm -fr ${_UTT_DIST_DIR} + diff --git a/app/dist/common/description.def b/app/dist/common/description.def new file mode 100644 index 0000000..4c8c701 --- /dev/null +++ b/app/dist/common/description.def @@ -0,0 +1 @@ +I put here some description. \ No newline at end of file diff --git a/app/dist/common/description.pl.def b/app/dist/common/description.pl.def new file mode 100644 index 0000000..07382bb --- /dev/null +++ b/app/dist/common/description.pl.def @@ -0,0 +1 @@ +Tu umieszczę opis po polsku. \ No newline at end of file diff --git a/app/dist/common/release.def b/app/dist/common/release.def new file mode 100644 index 0000000..56a6051 --- /dev/null +++ b/app/dist/common/release.def @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/app/dist/common/requirements.def b/app/dist/common/requirements.def new file mode 100644 index 0000000..e69de29 diff --git a/app/dist/common/utt_make_config.pl b/app/dist/common/utt_make_config.pl new file mode 100644 index 0000000..67fc241 --- /dev/null +++ b/app/dist/common/utt_make_config.pl @@ -0,0 +1,53 @@ +#!/usr/bin/perl + +use Cwd 'abs_path'; +use File::Basename; +use POSIX; + +my $cur_dir = dirname(abs_path($0)); + +# stop with a clear message when conf/utt.conf cannot be created, instead of printing to an unopened handle +open(FILE, ">$cur_dir/conf/utt.conf") or die "Cannot write $cur_dir/conf/utt.conf: $!\n"; + +# we put some description into utt.conf file +print FILE "# ************************************************************\n"; +print FILE "# * This file was created automatically during installation. *\n"; +print FILE "# * If you don't need do not change it.
*\n"; +print FILE "# * *\n"; +print FILE "# * UAM Text Tools *\n"; +print FILE "# * Adam Mickiewicz University, Poland *\n"; +print FILE "# * http://utt.amu.edu.pl *\n"; +print FILE "# ************************************************************\n"; +print FILE "\n\n"; + +# we need utt home directory +print FILE "# absolute path to utt directory\n"; +print FILE "UTT_HOME=$cur_dir\n\n"; + + +# we need user default locale +$best_locale = findLocale(); +print FILE "# user locale\n"; +print FILE "UTT_LOCALE=$best_locale\n"; +print FILE "\n"; + +close FILE; + + + +sub findLocale() { + $cur_locale = setlocale(LC_CTYPE); + + # we replace Latinx to ISO-8859-x + $cur_locale =~ s/(.+?)Latin(.+?)/$1ISO\-8859\-$2/g; + + if($cur_locale =~ /\w+_\w+\.\S+/) { + $best_locale = $cur_locale; + } + elsif($cur_locale =~ /\w+_\w+/) { + $best_locale = $cur_locale.".UTF-8"; + } + else { + $best_locale = lc($cur_locale).'_'.uc($cur_locale).'.UTF-8'; # language_TERRITORY order, e.g. pl -> pl_PL.UTF-8 + } + return $best_locale; +} diff --git a/app/dist/common/version.def b/app/dist/common/version.def new file mode 100644 index 0000000..9a7d84f --- /dev/null +++ b/app/dist/common/version.def @@ -0,0 +1 @@ +0.9 \ No newline at end of file diff --git a/app/dist/deb/Makefile b/app/dist/deb/Makefile new file mode 100644 index 0000000..f175ca4 --- /dev/null +++ b/app/dist/deb/Makefile @@ -0,0 +1,81 @@ +#default task + +# here are a few properties (_BUILD_DIR falls back to _UTT_DIST_DIR so it is never empty when called from ../Makefile) +_PRODUCT_NAME=utt +_BUILD_DIR=$(or $(_UTT_BIN_DIR),$(_UTT_DIST_DIR)) +_UTT_VER=$(shell cat ../common/version.def) +_UTT_REL=$(shell cat ../common/release.def) +_DEB_ROOT=$(shell pwd)/deb_root +_INSTALL_DIR=/usr/local/$(_PRODUCT_NAME)/$(_UTT_VER)-$(_UTT_REL) + +.PHONY: default +default: make_control make_postinst make_prerm + # the extra configuration files (control, postinst, prerm) are generated + # by the prerequisite targets above - they were previously listed here as + # recipe lines, which the shell cannot execute + + # first, we prepare some directory structure + mkdir -p $(_DEB_ROOT)/DEBIAN + mkdir -p $(_DEB_ROOT)$(_INSTALL_DIR) + mkdir -p $(_DEB_ROOT)/usr/share/man/man1 + mkdir -p $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME) + + find $(_DEB_ROOT) -type d | xargs chmod
755 # this is necessary on Debian Woody, don't ask me why + + # next, we copy necessary files (maintainer scripts must be executable for dpkg-deb) + mv ./control $(_DEB_ROOT)/DEBIAN/ + install -m 755 ./postinst $(_DEB_ROOT)/DEBIAN/ + install -m 755 ./prerm $(_DEB_ROOT)/DEBIAN/ +# cp -r $(_BUILD_DIR)/man/* $(_DEB_ROOT)/usr/share/man/ + cp $(_BUILD_DIR)/COPYRIGHT $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/copyright +# cp $(_BUILD_DIR)/changelog $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/ +# cp $(_BUILD_DIR)/changelog.Debian $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/ + + + # next we make man/doc archives +# gzip --best $(_DEB_ROOT)/usr/share/man/man1/$(_PRODUCT_NAME).1 +# gzip --best $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/changelog +# gzip --best $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/changelog.Debian +# tar -cvvf control.tar.gz ${_DEB_ROOT}/DEBIAN/ +# rm -fr ${_DEB_ROOT}/DEBIAN/ + + # and binaries + cp -rv $(_BUILD_DIR)/* $(_DEB_ROOT)$(_INSTALL_DIR)/ +# tar -cvvf data.tar.gz ${_DEB_ROOT}/ +# rm -fr ${_DEB_ROOT}/ + + + # finally, we build deb package + fakeroot dpkg-deb --build $(_DEB_ROOT) + mv $(_DEB_ROOT).deb $(_PRODUCT_NAME)_$(_UTT_VER)-$(_UTT_REL).all.deb + + +.PHONY: make_control +make_control: + echo "Package: $(_PRODUCT_NAME)" > control + echo "Version: $(_UTT_VER)" >> control + echo "Section: web" >> control + echo "Priority: optional" >> control + echo "Architecture: all" >> control + echo "Essential: no" >> control + + echo "Depends: " >> control +# here we read this information from file ../common/requirements.def + #libwww-perl, acme-base (>= 1.2) <= package requirements + + echo "Pre-Depends: perl" >> control + + echo "Maintainer: Adam Mickiewicz University" >> control + echo "Provides: $(_PRODUCT_NAME)" >> control + printf 'Description: ' >> control + cat ../common/description.def >> control + +.PHONY: make_postinst +make_postinst: + echo "#!/bin/sh" > postinst + echo "$(_INSTALL_DIR)/create_utt_config.pl" >> postinst + echo "rm -f $(_INSTALL_DIR)/create_utt_config.pl" >> postinst + +.PHONY: make_prerm +make_prerm:
+ echo "#!/bin/sh" > prerm diff --git a/app/dist/deb/README b/app/dist/deb/README new file mode 100644 index 0000000..771b11a --- /dev/null +++ b/app/dist/deb/README @@ -0,0 +1,3 @@ +This directory contains files necessary to create deb package. + +apt-get install dpkg-dev debhelper devscripts fakeroot linda diff --git a/app/dist/files/COPYRIGHT b/app/dist/files/COPYRIGHT new file mode 100644 index 0000000..e69de29 diff --git a/app/dist/files/LICENCE b/app/dist/files/LICENCE new file mode 100644 index 0000000..e69de29 diff --git a/app/dist/files/README b/app/dist/files/README new file mode 100644 index 0000000..0306a62 --- /dev/null +++ b/app/dist/files/README @@ -0,0 +1,14 @@ + +Installation: + +1) Run the command: + + make install + +in this directory. This will install UTT in the directory '~/.utt'. + +2) Add the path + + ~/.utt/bin + +to your PATH variable to make UTT programs visible to your system. diff --git a/app/dist/spec/Makefile b/app/dist/spec/Makefile new file mode 100644 index 0000000..0a4571e --- /dev/null +++ b/app/dist/spec/Makefile @@ -0,0 +1,15 @@ +# this makefile will build rpm + +DIR=$(shell pwd) + +ifndef _UTT_DIST_DIR + _UTT_DIST_DIR=${DIR} +endif + + +# default task (rpmbuild must only run after the cd succeeded: utt.spec resolves its paths relative to the current directory) +.PHONY: rpm +rpm: + cd ${_UTT_DIST_DIR} && rpmbuild -bb ${DIR}/utt.spec + + diff --git a/app/dist/spec/README b/app/dist/spec/README new file mode 100644 index 0000000..a88a0df --- /dev/null +++ b/app/dist/spec/README @@ -0,0 +1,16 @@ +This directory contains files necessary to produce rpm package. + +First, you must have variable _UTT_DIST_DIR defined properly. +This variable should be defined by main Makefile. + +To create rpm file, just write: +make + +The created package should appear in the default RPM directory. +(on my computer it is the /usr/src/redhat/RPMS/$arch/ directory) + +To determine the rpm output directory, execute: +rpm --showrc | grep _rpmdir + +You need write access to this directory to create the rpm.
+ diff --git a/app/dist/spec/utt.spec b/app/dist/spec/utt.spec new file mode 100644 index 0000000..71e69e5 --- /dev/null +++ b/app/dist/spec/utt.spec @@ -0,0 +1,106 @@ +# +# Default RPM header. +# +# START_RPM_STD_HEADER: + + +# +# RPM properties +# +%define _this_product UAM Text Tools +%define _this_summary Some tools for text processing +%define _this_name utt +%define _this_version %(cat ../common/version.def) +%define _this_release %(cat ../common/release.def) +%define _this_copyright Adam Mickiewicz University, Poland + +# +# We need some paths +# +# Directory with utt binaries (the directory rpmbuild is started from) +%define _UTT_DIST_DIR %(pwd) +# Root directory in which utt will be installed +%define _UTT_DIR /usr/local/%_this_name +# Directory used as the rpm build root (see BuildRoot below) +%define _RPM_ROOT %_UTT_DIST_DIR/../rpm_root + +# +# Default RPM header. +# +# END_RPM_STD_HEADER: +# -------------------------------------------------------------------- + +Summary: %_this_summary +Name: %_this_name +Version: %_this_version +Release: %_this_release +#Copyright: %_this_copyright +License: GPL +Group: Development/Tools +URL: http://utt.amu.edu.pl +Vendor: Adam Mickiewicz University +BuildRoot: %_RPM_ROOT +#BuildArch: i586 +# requirements for utt application +#AutoReq: no +#AutoReqProv: no + +#Requires: glibc >= 2.1.3 +#Requires: libgcc1 >= 3.0 +#Requires: libgcc >= 3.0 +#Requires: libstdc++6 >= 3.4.1 +#Requires: libstdc++ >= 3.4.1 + +%description +%(cat ../common/description.def) + +%description -l pl +%(cat ../common/description.pl.def) + + +# ------------------------------------------------------------- +# preparing sources for compilation +%prep + +# source compilation +%build + +# rpm building +%install +%__mkdir_p $RPM_BUILD_ROOT%_UTT_DIR +cp -fr %_UTT_DIST_DIR/* $RPM_BUILD_ROOT%_UTT_DIR/ + + +# cleaning after rpm build +%clean +rm -rf $RPM_BUILD_ROOT + +# ------------------------------------------------------------- +#before installation +%pre + + +#after installation +%post +# we need to create utt.conf file
+%_UTT_DIR/create_utt_config.pl +rm -f %_UTT_DIR/create_utt_config.pl +# we need to create links in /usr/local/bin +find %_UTT_DIR/bin/ -type f -exec ln -f {} /usr/local/bin \; + + +#before uninstallation +%preun +# we delete links from /usr/local/bin, but only on a real removal ($1 is 0), not on an upgrade +if [ "$1" = 0 ]; then for fn in `find %_UTT_DIR/bin/ -type f -exec basename {} \;`; do rm -f /usr/local/bin/$fn; done; fi + + +#after uninstallation +%postun +# we remove all extra files, but only on a real removal ($1 is 0), not on an upgrade +if [ "$1" = 0 ]; then rm -fr %_UTT_DIR; fi + +# ------------------------------------------------------------- +%files +%defattr(-,root,root) +/* diff --git a/app/dist/tarball/INSTALL b/app/dist/tarball/INSTALL new file mode 100644 index 0000000..2d46c52 --- /dev/null +++ b/app/dist/tarball/INSTALL @@ -0,0 +1,4 @@ +Here you can find some information about how to install utt. + +You should just unpack the archive and then +execute create_utt_config.pl and remove it. diff --git a/app/dist/tarball/Makefile b/app/dist/tarball/Makefile new file mode 100644 index 0000000..c033c59 --- /dev/null +++ b/app/dist/tarball/Makefile @@ -0,0 +1,38 @@ +# This makefile allows building a tarball distribution of utt.
+ +# +# Some variables +# + +# Directory with utt binaries +ifndef _UTT_DIST_DIR + _UTT_DIST_DIR=${DIR} +endif + +# Where to put the result +ifndef _UTT_DIST_OUTPUT + _UTT_DIST_OUTPUT=${DIR} +endif + +# Common info about version and release +_UTT_VER=$(shell cat ../common/version.def) +_UTT_REL=$(shell cat ../common/release.def) + +# Temp vars +DIR=$(shell pwd) +_TARBALL_ROOT=$(DIR)/utt_$(_UTT_VER)-$(_UTT_REL) +_TAR_FILE_NAME=utt.$(_UTT_VER)_$(_UTT_REL).tar.gz + +#default task +.PHONY: default +default: + @echo Build dir is ${_UTT_DIST_DIR} + @echo Change output for tarball as ${_UTT_DIST_OUTPUT} + rm -rf ${_TARBALL_ROOT} && mkdir -p ${_TARBALL_ROOT} + cp -fr ${_UTT_DIST_DIR}/* ${_TARBALL_ROOT} + @# we add some extra files + cp ./INSTALL ${_TARBALL_ROOT}/ + @# archive only the staging directory: a utt* glob would also pick up an archive left by an earlier run + tar -czf ${_UTT_DIST_OUTPUT}/${_TAR_FILE_NAME} $(notdir ${_TARBALL_ROOT}) + + rm -rf ${_TARBALL_ROOT} diff --git a/app/dist/tarball/README b/app/dist/tarball/README new file mode 100644 index 0000000..2f7f453 --- /dev/null +++ b/app/dist/tarball/README @@ -0,0 +1,6 @@ +This directory contains a Makefile which creates the tar.gz archive. + +To create the archive, just write: +make + +Warning: you need to define the variable _UTT_DIST_DIR.
diff --git a/app/doc/Makefile b/app/doc/Makefile new file mode 100644 index 0000000..7cca627 --- /dev/null +++ b/app/doc/Makefile @@ -0,0 +1,27 @@ +main: utt.info utt.pdf utt.html utt.ps + +utt.info: utt.texinfo + makeinfo utt.texinfo + +utt.pdf: utt.texinfo + texi2pdf utt.texinfo + rm -f utt.aux utt.cp utt.fn utt.ky utt.log utt.pg utt.toc utt.tp utt.vr + +utt.html: utt.texinfo + makeinfo --html --no-split utt.texinfo + +utt.dvi: utt.texinfo + texi2dvi utt.texinfo + +utt.ps: utt.dvi + dvips -o utt.ps utt.dvi + +# file lists are spelled out below: brace expansion is a bash feature and recipes run under /bin/sh + +copy: +ifdef UTT_DOC_DIR + cp utt.info utt.ps utt.pdf utt.html ${UTT_DOC_DIR} +endif + +clean: + rm -f utt.aux utt.cp utt.dvi utt.fn utt.fns utt.html utt.info utt.ky utt.log utt.pdf utt.pg utt.ps utt.toc utt.tp utt.vr + rm -f *~ diff --git a/app/doc/utt.texinfo b/app/doc/utt.texinfo new file mode 100644 index 0000000..0399ca7 --- /dev/null +++ b/app/doc/utt.texinfo @@ -0,0 +1,2687 @@ +\input texinfo @c -*-texinfo-*- +@documentencoding ISO-8859-2 +@c @documentlanguage pl + +@c %**start of header +@setfilename utt.info +@settitle UAM Text Tools v0.90 +@c %**end of header + +@copying +This manual is for UAM Text Tools (version 0.90, November, 2007) + +Copyright @copyright{} 2005, 2007 Tomasz Obr@ogonek{e}bski, Micha@l{} Stolarski, Justyna Walkowska, Pawe@l{} Konieczka. + +Permission is granted to copy, distribute and/or modify this document +under the terms of the GNU Free Documentation License, Version 1.2 +or any later version published by the Free Software Foundation; +with no Invariant Sections, no Front-Cover Texts, and no Back-Cover +Texts. A copy of the license is included in the section entitled GNU Free Documentation License,,GNU Free Documentation License. + +@c @quotation +@c Permission is granted to ... +@c No permission is granted until the document is completed.
+@c @end quotation +@end copying + + +@titlepage +@title UAM Text Tools 0.90 - User Manual +@subtitle edition 0.01, @today +@subtitle status: prescript +@author by Justyna Walkowska, Tomasz Obr@,{}ebski and Micha@l{} Stolarski +@page +@vskip 0pt plus 1filll +@insertcopying +@end titlepage + +@contents + +@c @paragraphindent none + +@iftex +@parskip = 0.5@normalbaselineskip plus 3pt minus 1pt +@end iftex + +@c @headings off +@c @everyheading LEM(1) @| @| LEM(1) +@everyfooting @today @c @| @thispage @| + +@ifnottex + +@node Top +@top UTT - UAM Text Tools + +@insertcopying + +@menu +* General information:: +* UTT file format:: +* Configuration files:: +* UTT components:: +* Auxiliary tools:: +* Usage examples:: +* PMDBF dictionary:: +@c * Examples:: +@c * Copyright:: +* GNU Free Documentation License:: +* Reporting bugs:: +* Author:: +@end menu +@end ifnottex + + +@c ---------------------------------------------------------------------- + +@node General information +@chapter General information + +UAM Text Tools (UTT) is a package of language processing tools +developed at Adam Mickiewicz University. Its functionality includes: + +@itemize @bullet + +@item +tokenization +@item +dictionary-based morphological analysis +@item +heuristic morphological analysis of unknown words +@item +spelling correction +@item +pattern search +@item +sentence splitting +@item +generation of concordance tables +@end itemize + +The toolkit is destined for processing of raw (not annotated) +unrestricted text for any conceivable purpose. + +The system is organized as a collection of command-line programs, each +performing one operation, e.g. tokenization, lemmatization, spelling +correction. The components are independent one from another, the +unifying element being the uniform i/o file format. + +The components may be combined in various ways to provide various text +processing services. 
Also new components supplied by the user may be +easily incorporated into the system provided that they respect the i/o +file format conventions. + +UTT component programs do not depend on any specific tagset or +morphological description format. + +UTT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or (at your option) any later version. + +The Polex/PMDBF dictionary is licensed under the Creative Commons by-nc-sa License which prohibits commercial use. + + +List of contributors: + +@itemize +@item Pawel Konieczka +@item Tomasz Obrebski +@item Michal Stolarski +@item Marcin Walas +@item Justyna Walkowska +@end itemize + +@c ---------------------------------------------------------------------- +@c --------------------------------------------------------------------- + +@node UTT file format +@chapter UTT file format + +A UTT file contains annotation of a text. It consists of a sequence of +segments. Each segment explicitly refers to a continuous piece of the +text and provides some information on it. + +@section Segment format + +A segment occupies one line of a UTT file and consists of +space-separated fields: + + +@quotation +@sp 1 +[@var{start} [@var{length}]] @var{type} @var{form} [@var{annotation1} [@var{annotation2} ...]] +@sp 1 +@end quotation + +@table @var + +@item @var{start} +Non-negative integer value indicating the position in the source text where the +segment starts. + +@item @var{length} +Non-negative integer value indicating the length of the segment. + +@item @var{type} +A sequence of non-blank characters that does not start with a digit (a leading digit could lead to @var{type} being misinterpreted as a @var{start} or @var{length} field). +@var{type} reflects the main classification of segments - +into words, numbers, punctuation marks, meta-text markers.
+@xref{tok output,,tok output}, for description of automatically recognized type markers. + +@item @var{form} +This field contains the textual form of the segment or the special +symbol @code{*} indicating that the form is not given (e.g. when the segment has been created artificially to mark something and is of length 0). + +The characters or character sequences that have special meaning in the +@var{form} field are enumerated below. + +Characters with special meaning: + +@itemize +@item @code{_} - space character +@item @code{*} - undefined contents +@end itemize + +Escape sequences: + +@itemize +@item @code{\n} - new line +@item @code{\t} - tabulation +@item @code{\r} - carriage return + +@item @code{\_} - the @code{_} character +@item @code{\*} - the @code{*} character +@item @code{\\} - the @code{\} character + +@c @item @code{\hh} - a character with hexadecimal code @code{hh} (used for non-printable characters) +@end itemize + +@item @var{annotation1} +@item @var{annotation2} +@item ... +Annotation fields have the following format: + +@var{longname} @code{:} @var{value} + +or + +@var{shortname} @var{value} + +where @var{longname} is a string of alphanumeric characters +(isalnum() test), @var{shortname} - a single non-alphanumeric character +(ispunct() test), and @var{value} is an arbitrary string of non-blank characters. + +@end table + + +Only two fields are mandatory: @var{type} and @var{form}. All other fields +may be absent. In the case when only one number precedes the +@var{type} field, it is interpreted as the @var{start} position. + +If the @var{length} field is omitted, the length of the segment is the +length of the @var{form} field, except when the value of the +@var{form} field is @code{*} -- in this case, the length is assumed to +be 0. + +If the @var{start} field is also absent, the segment is assumed to directly +follow the preceding one.
+ +@c Conventions: + +@c Annotation fields with predefined meaning: + +@c @itemize +@c @item @code{!} - UTT components are allowed to modify the contents of +@c the @var{form} field (e.g. spelling correction does this). If this happens the +@c original form of the segment have to be placed in the @code{!}-field. +@c @item @code{@@} - morphological description +@c @item @code{=} - node identifier assignment (used in graph encoding) +@c @item @code{<} - preceding/dominating node(s) (used in graph encoding) +@c @item @code{>} - succeeding/subordinate node(s) (used in graph encoding) +@c @end itemize + +Segments of length 0 may be used to mark file positions with some +information. See e.g. BOS and EOS (beginning/end of sentence) markers +in the example below. + +Example: + +sentence: @samp{Piszemy dobre progrumy.} + +@example +0000 00 BOS * +0000 07 W Piszemy lem:pisa,V +0007 01 S _ +0008 05 W dobre lem:dobry,ADJ +0013 01 S _ +0014 08 W progrumy cor:programy lem:program,N +0022 01 P . +0023 00 EOS * +0023 01 S _ +0024 00 BOS * +0024 11 W Warszawiacy lem:Warszawiak,N +0035 01 S _ +0036 03 W te +0039 01 P . +0040 00 EOS * + +@end example + +@example +0000 BOS * +0000 W Piszemy lem:pisa,V +0007 S _ +0008 W dobre lem:dobry,ADJ +0013 S _ +0014 W progrumy cor:programy lem:program,N +0022 P . +0023 EOS * +@end example + +Posion information may be provided only for some types of segments: + +@example +0000 BOS * +W Piszemy lem:pisa,V +S _ +W dobre lem:dobry,ADJ +S _ +W progrumy cor:programy lem:program,N +P . +EOS * +S _ +0024 BOS * +W Warszawiacy lem:Warszawiak,N +S _ +W te +P . +EOS * +@end example + +Position/length information may be provided only when necessary: + +@example +0000 04 N * +0000 N 12 +P . +N 5 +S _ +W km +@end example + +@section UTT File + +A UTT file consists of a sequence of segments. The same text position +may be covered by multiple segments. In cosequence, ambiguous text +segmentation and ambiguous annotation may be represented. 
+ +There are two structural requirements a valid UTT-formatted file +has to meet: + +@itemize @bullet + +@item +segments have to be sorted with respect to the @var{position} field, + +@item +for each +segment ending at position @var{n}, either there must be a segment starting at +position @var{n+1}, or position @var{n+1} is not covered by any segment; similarly +for each segment starting at position @var{n}, either there must be a segment +ending at position @var{n-1}, or the position @var{n-1} must not be covered +by any segment. + +@end itemize + +A valid annotation for the text fragment +@example +12.5 km +@end example + +may be + +@example +0000 02 N 12 +0000 04 N 12.5 +0002 01 P . +0003 01 N 5 +0004 01 S _ +0005 02 W km +@end example + +but not + +@example +0000 02 N 12 +0000 04 N 12.5 +0004 01 S _ +0005 02 W km +@end example + +because in the latter example the first segment (starting at position 0000, 2 characters long) ends at position @var{n}=0001, while position @var{n+1}=0002 is covered by the second segment and no segment starts there. + +@section Character encoding + +The UTT component programs accept only 1-byte character encoding, such +as ISO, ANSI, DOS, UTF-8 (probably: not tested yet). + + +@c @section Formats + +@c @unnumberedsubsubsec Basic format + +@c While processing large amounts of the overhead related with explicit +@c ... of the start position and segment length becomes ... . Therefore, +@c for efficiency reasons certain shortcuts are possible: + +@c @unnumberedsubsubsec Relative start position + +@c Start position may be given as relative distance from the last +@c absolut position. + +@c @unnumberedsubsubsec Absent length + +@c Segment length may by omitted. Normally it can be restored by counting +@c the length of the @emph{form field}. For segments with the special value +@c @code{*} in the @emph{form field} length 0 is assumed.
+ +@c @unnumberedsubsubsec Absent length and start position + +@c Both start position and segment length may be omitted. In this format +@c each segment is assumed to follow the previous one. This format is, +@c therefore, suitable only for unambiguously tagged text +@c (0-length markers can be still used.) + + +@c @table @code +@c @item AL +@c @code{1234 03 W kot} +@c @item RL +@c @code{+56 03 W kot} +@c @item A +@c @code{1234 W kot} +@c @item R +@c @code{+56 W kot} +@c @item 0 +@c @code{W kot} +@c @end table + + +@c [JAK UZYSKA POLSKIE CZCIONKI W DVI???] + +@macro parhelp +@item @b{@minus{}@minus{}help}, @b{@minus{}h} +Print help. +@end macro + + +@macro parversion +@item @b{@minus{}@minus{}version}, @b{@minus{}V} +Print version information. +@end macro + +@macro parinteractive +@item @b{@minus{}@minus{}interactive, @minus{}i} +This option toggles interactive mode, which is by default off. In the +interactive mode the program does not buffer the output. +@end macro + + +@c @macro parfile +@c @item @b{@minus{}@minus{}file=@var{filename}, @minus{}f @var{filename}} +@c Input file name. +@c If this option is absent or equal to '@minus{}', the program +@c reads from the standard input. +@c @end macro + + +@c @macro paroutput +@c @item @b{@minus{}@minus{}output=@var{filename}, @minus{}o @var{filename}} +@c Regular output file name. To regular output the program sends segments +@c which it successfully processed and copies those which were not +@c subject to processing. If this option is absent or equal to +@c '@minus{}', standard output is used. +@c @end macro + +@c @macro parfail +@c @item @b{@minus{}@minus{}fail=@var{filename}, @minus{}e @var{filename}} +@c Fail output file name. To fail output the program copies the segments +@c it failed to process. If this option is absent or equal to +@c '@minus{}', standard output is used. 
+@c @end macro + + +@c @macro parcopy +@c @item @b{@minus{}@minus{}copy, @minus{}c} +@c Copy succesfully processed segments to regular output also in their +@c original input form. +@c @end macro + + +@macro parinputfield +@item @b{@minus{}@minus{}input-field=@var{fieldname}, @minus{}I @var{fieldname}} +The field containing the input to the program. The default is the +@var{form} field. The fields @var{position}, @var{length}, @var{type}, +and @var{form} are referred to as @code{1}, @code{2}, @code{3}, +@code{4}, respectively. +@end macro + + +@macro paroutputfield +@item @b{@minus{}@minus{}output-field=@var{fieldname}, @minus{}O @var{fieldname}} +The name of the field added by the program. The default is the name of the program. +@end macro + + +@macro pardictionary +@item @b{@minus{}@minus{}dictionary=@var{filename}, @minus{}d @var{filename}} +Dictionary file name. +@end macro + + +@macro parprocess +@item @b{@minus{}@minus{}process=@var{type}, @minus{}p @var{type}} +Process segments with the specified value in the @var{type} field. +Multiple occurences of this option are allowed and are interpreted as +disjunction. If this option is absent, all segments are processed. +@end macro + + +@macro parselect +@item @b{@minus{}@minus{}select=@var{fieldname}, @minus{}s @var{fieldname}} +Select for processing only segments in which the field named +@var{fieldname} is present. Multiple occurences of this option are +allowed and are interpreted as conjunction of conditions. If this +option is absent, all segments are processed. +@end macro + + +@macro parunselect +@item @b{@minus{}@minus{}unselect=@var{fieldname}, @minus{}S @var{fieldname}} +Select for processing only segments in which the field @var{fieldname} +is absent. Multiple occurences of this option are allowed and are +interpreted as conjunction of conditions. If this option is absent, +all segments are processed. 
+@end macro + + +@macro paroneline +@item @b{@minus{}@minus{}one-line} +This option makes the program print ambiguous annotation in one output +line by generating multiple annotation fields. By default when +ambiguous annotation may be produced for a segment, the segment is +multiplicated and each of the annotations is added to separate copy of +the segment. +@end macro + + +@macro paronefield +@item @b{@minus{}@minus{}one-field, @minus{}1} +This option makes the program print ambiguous annotation in one +annotation field. By default when ambiguous annotation may be produced +for a segment, the segment is multiplicated and each of the +annotations is added to separate copy of the segment. + +This option is useful when working with @command{kot} or @command{con}. +@end macro + + +@c --------------------------------------------------------------------- +@c --------------------------------------------------------------------- + +@c @node Common command line options +@c @chapter Common command line options + +@c @table @code + +@c @parhelp + +@c @item @b{@minus{}@minus{}help}, @b{@minus{}h} +@c Print help. + +@c @item @b{@minus{}@minus{}version}, @b{@minus{}v} +@c Print version information. + +@c @item @b{@minus{}@minus{}file=@var{filename}, @minus{}f @var{filename}} +@c Input file name. +@c If this option is absent or equal to '@minus{}', the program +@c reads from the standard input. + +@c @item @b{@minus{}@minus{}output=@var{filename}, @minus{}o @var{filename}} +@c Regular output file name. To regular output the program sends segments +@c which it successfully processed and copies those which were not +@c subject to processing. If this option is absent or equal to +@c '@minus{}', standard output is used. + +@c @item @b{@minus{}@minus{}fail=@var{filename}, @minus{}e @var{filename}} +@c Fail output file name. To fail output the program copies the segments +@c it failed to process. If this option is absent or equal to +@c '@minus{}', standard output is used. 
+ +@c @item @b{@minus{}@minus{}only-fail} +@c Discard segments which would normally be sent to regular +@c output. Print only segments the program failed to process. + +@c @item @b{@minus{}@minus{}no-fail} +@c Discard segments the program failed to process. +@c (This and the previous option are functionally equivalent to, +@c respectively, @option{-o /dev/null} and @option{-e /dev/null}, but +@c make the programs run faster.) + +@c @item @b{@minus{}@minus{}input-field=@var{fieldname}, @minus{}I @var{fieldname}} +@c The field containing the input to the program. The default is usually +@c the @var{form} field (unless otherwise stated in the program +@c description). The fields @var{position}, @var{length}, @var{tag}, and +@c @var{form} are referred to as @code{1}, @code{2}, @code{3}, @code{4}, +@c respectively. + +@c @item @b{@minus{}@minus{}output-field=@var{fieldname}, @minus{}O @var{fieldname}} +@c The name of the field added by the program. The default is the name of +@c the program. + +@c @c @item @b{@minus{}@minus{}copy, @minus{}c} +@c @c Copy processed segments to regular output. + +@c @item @b{@minus{}@minus{}dictionary=@var{filename}, @minus{}d @var{filename}} +@c Dictionary file name. +@c (This option is used by programs which use dictionary data.) + +@c @item @b{@minus{}@minus{}process=@var{tag}, @minus{}p @var{tag}} +@c Process segments with the specified value in the @var{tag} field. +@c Multiple occurences of this option are allowed and are interpreted as +@c disjunction. If this option is absent, all segments are processed. + +@c @item @b{@minus{}@minus{}select=@var{fieldname}, @minus{}s @var{fieldname}} +@c Select for processing only segments in which the field named +@c @var{fieldname} is present. Multiple occurences of this option are +@c allowed and are interpreted as conjunction of conditions. If this +@c option is absent, all segments are processed. 
+ +@c @item @b{@minus{}@minus{}unselect=@var{fieldname}, @minus{}S @var{fieldname}} +@c Select for processing only segments in which the field @var{fieldname} +@c is absent. Multiple occurences of this option are allowed and are +@c interpreted as conjunction of conditions. If this option is absent, +@c all segments are processed. + +@c @item @b{@minus{}@minus{}interactive @minus{}i} +@c This option toggles interactive mode, which is by default off. In the +@c interactive mode the program does not buffer the output. + +@c @item @b{@minus{}@minus{}config=@var{filename}} +@c Read configuration from file @file{@var{filename}}. + +@c @item @b{@minus{}@minus{}one @minus{}1} +@c This option makes the program print ambiguous annotation in one output +@c segment. By default when +@c ambiguous new annotation is being produced for a segment, the segment +@c is multiplicated and each of the annotations is added to separate copy +@c of the segment. + +@c @end table + +@c --------------------------------------------------------------------- +@c CONFIGURATION FILES +@c --------------------------------------------------------------------- + +@node Configuration files +@chapter Configuration files + +Values for all command line options accepted by a component +may be set in configuration files. The default location of the +configuration files for a component named @command{@var{program}} are + +@example + @file{/etc/utt/conf/@var{program}.conf} +@end example + +for system-wide configuration file and + +@example + @file{~/.utt/conf/@var{program}.conf} +@end example + +for user configuration file. + +@c The configuration file to load may be also specified with the +@c @option{--config} option. Configuration file need not be provided. 
+ +For each option, the value is set according to the following priority: + +@itemize +@item command line +@c @item configuration file indicated with @option{--config} option +@item user configuration file (or configuration file indicated with the @option{--config} option) +@item system-wide configuration file +@end itemize + +Parameter values are specified in the following format: + +@var{parametername}=@var{value} + +where @var{parametername} is the short or long name of an option accepted by +the program, or + +@var{parametername} + +if the option does not need arguments. + +You can introduce comments to configuration files using the # sign. + +If a program accepts multiple occurrences of an option (e.g. @var{lem}'s select option) you can specify them on separate lines of the program's configuration file. + +@c The equal sign may be omitted. + + +@quotation Tip +If you have two (or more) frequently used sets of options for the same +program (e.g. lem with the PMDBF dictionary and lem with a user dictionary) +a good solution is to create two soft links to lem, called +e.g. lemg and lemu, and specify their configuration in files lemg.conf +and lemu.conf respectively. +@end quotation + +@c --------------------------------------------------------------------- +@c COMPONENTS +@c --------------------------------------------------------------------- + +@node UTT components +@chapter UTT components + +UTT components are of three types: + +@menu +Sources: programs which read non-UTT data (e.g.
raw text) and produce output +in UTT format +* tok:: a tokenizer + +Filters: programs which read and produce UTT-formatted data +@c * sen - the sentencizer:: +* lem:: a morphological analyzer +* gue:: a morphological guesser +* cor:: a spelling corrector +* sen:: a sentencizer +@c * gph - the graphizer:: +* ser:: a pattern search tool (marks matches) +* grp:: a pattern search tool (selects sentences containing a match) + +Sinks: programs which read UTT data and produce output in another format +* kot:: an untokenizer +* con:: a concordance table generator +@end menu + +@c --------------------------------------------------------------------- +@c TOK +@c --------------------------------------------------------------------- + +@page +@node tok +@section tok - a tokenizer + +@c ---------------------------------------- + +@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@item @strong{Authors:} @tab Tomasz Obrebski +@item @strong{Component category:} @tab source +@end multitable + + +@menu +* tok description:: +* tok input:: +* tok output:: +* tok command line options:: +* tok example:: +@end menu + +@node tok description +@subsection Description + +@code{tok} is a simple program which reads a text file and identifies +tokens on the basis of their orthographic form. The type of the token +is printed as the @var{type} field. + +@node tok input +@subsection Input + +Raw text. + +@node tok output +@subsection Output + +UTT-file with four fields: @var{start}, @var{length}, @var{type}, and @var{form}.
In the @var{type} field five types of tokens are distinguished: + +@itemize + +@item @code{W} +(word) +- continuous sequence of letters + +@item @code{N} +(number) +- continuous sequence of digits + +@item @code{S} +(space) +- continuous sequence of space characters + +@item @code{P} +(punctuation mark) +- single printable characters not belonging to any of the other classes + +@item @code{B} +(unprintable character) +- single unprintable character + +@end itemize + + + +@node tok command line options +@subsection Command line options + +@table @code + +@item @b{@minus{}@minus{}help}, @b{@minus{}h} +Print help. + +@item @b{@minus{}@minus{}version}, @b{@minus{}V} +Print version information. + +@item @b{@minus{}@minus{}interactive, @minus{}i} +This option toggles interactive mode, which is by default off. In the +interactive mode the program does not buffer the output. + +@end table + +@node tok example +@subsection Example + +Input: + +@example +Piszemy dobre programy. +@end example + +Output: + +@example +0000 07 W Piszemy +0007 01 S _ +0008 05 W dobre +0013 01 S _ +0014 08 W programy +0022 01 P . 
+0023 01 S \n +@end example + + +@c --------------------------------------------------------------------- +@c SEN +@c --------------------------------------------------------------------- + +@c @node sen - sentencizer +@c @chapter sen - sentencizer + +@c Authors: Tomasz Obrbski + +@c --------------------------------------------------------------------- +@c LEM +@c --------------------------------------------------------------------- + +@page +@node lem +@section lem - morphological analyzer + +@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@item @strong{Authors:} @tab Tomasz Obrbski, Micha Stolarski +@item @strong{Component category:} @tab filter +@end multitable + +@menu +* lem description:: +* lem command line options:: +* lem input:: +* lem output:: +* lem example:: +* lem dictionaries:: +* lem hints:: +@end menu + +@node lem description +@subsection Description + +@command{lem} performs morphological analysis of a simple orthographic +word, returning all its possible morphological annotations, +disregarding the context. + +@c ---------------------------------------- + +@node lem command line options +@subsection Command line options + +@table @code +@parhelp +@parversion +@parinteractive +@c @parfile +@c @paroutput +@c @parfail +@c @parcopy +@parinputfield +@paroutputfield +@pardictionary +@parprocess +@parselect +@parunselect +@paroneline +@paronefield +@end table + +@c ---------------------------------------- + +@node lem input +@subsection Input + +Lem reads a UTT file and processes the value of the @var{form} field +(the input field may be changed with @option{--input-field} option). + +@node lem output +@subsection Output + +@command{lem} adds a new annotation field, whose default name is @code{lem}. 
In +case of ambiguity either the segment is multiplicated (default), +multiple @code{lem} fields are added (@option{--one-line}) or ambiguous +annotation is produced as the value of single @code{lem} field (option +@option{--one-field,-1}): + +@itemize @bullet + +@item +unambiguous value format: + +@example + , +@end example + +@item +ambiguous value format (@option{--one-field} option) + + +@example + ,[,][;,[,]] +@end example + +(alternative descriptions for the same lemma are separated by commas, +alternative lemmata are separated by semicolons.) + +@end itemize + +@node lem example +@subsection Example + +Input: + +@example +0000 07 W Piszemy +0007 01 S _ +0008 05 W dobre +0013 01 S _ +0014 08 W programy +0022 01 P . +0023 01 B \n +@end example + +Output (default): + +@example +0000 07 W Piszemy lem:pisa,V/AiVpMdTrfNpP1 +0007 01 B _ +0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn +0008 05 W dobre lem:dobry,ADJ/DpNsCnavGn +0013 01 B _ +0014 08 W programy lem:program,N/GiNpCa +0014 08 W programy lem:program,N/GiNpCn +0014 08 W programy lem:program,N/GiNpCv +0022 01 P . +0023 01 B \n +@end example + +Output (@option{--one-line} option): + +@example +0000 07 W Piszemy lem:pisa,V/AiVpMdTrfNpP1 +0007 01 S _ +0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn lem:dobry,ADJ/DpNsCnavGn +0013 01 S _ +0014 08 W programy lem:program,N/GiNpCa lem:program,N/GiNpCn lem:program,N/GiNpCv +0022 01 P . +0023 01 S \n +@end example + +Output (@option{--one-field} option): + +@example +0000 07 W Piszemy lem:pisa,V/AiVpMdTrfNpP1 +0007 01 S _ +0008 05 W dobre lem:dobry,ADJ/DpNpCnavGaifn,ADJ/DpNsCnavGn +0013 01 S _ +0014 08 W programy lem:program,N/GiNpCa,N/GiNpCn,N/GiNpCv +0022 01 P . +0023 01 S \n +@end example + +@c ---------------------------------------- + +@node lem dictionaries +@subsection Dictionaries + +@command{lem} requires a dictionary. The dictionary may be provided in +one of two formats: in text (source) format or in binary (fsa) format. 
+ +@subsubheading Text format + +Dictionary entries have the following structure: + +@example +
;,[;,] +@end example + +@var{lemma} may be given explicitly or in the cut-add format: + +@example +@code{[-]} +@end example + +meaning: replace prefix of length @code{} with +string @code{}, replace suffix of length @code{} with string +@code{}. For example @code{3t} transforms @samp{kocie} into +@samp{kot}, @code{3-4ay} transforms @samp{najbielsi} into @samp{biay} + +Each dictionary entry must be written in one line and must not contain blank characters. + +Examples: +@example +kot;0,N/GaNsCn +kota;1,N/GaNsCg;1,N/GaNsCa +kotu;1,N/GaNsCd +kotem;2,N/GaNsCi +kocie;3t,N/GaNsCl;3t,N/GaNsCv +najbielsi;3-4ay,ADJ/DsNpCnGp +najbielsze;3-5ay,ADJ/DsNpCnGaifn +najlepsi;dobry,ADJ/DsNpCnGp +najlepsze;dobry,ADJ/DsNpCnGaifn +@end example + + +The mandatory file name extension for a text dictionary is @code{dic}. For large +dictionaries it is preferable, however, to compile them into binary +(fsa) format. + +@subsubheading Binary format + +The mandatory file name extension for a binary dictionary is @code{bin}. To +compile a text dictionary into binary format, write: + +@example +compiledic .dic +@end example + +@subsubheading Polex/PMDBF dictionary + +A large-coverage morphological dictionary for Polish language, Polex/PMDBF, is included in +the distribution as the default @emph{lem}'s dictionary. It's +located by default in: + +@file{$HOME/.utt/pl/lem.bin} + +@node lem hints +@subsection Hints + +@c @subsubheading Combining data from multiple dictionaries + +@c @itemize + +@c @item Apply , then apply to words which were not annotatated. + +@c @example +@c lem -d | lem -S lem -d +@c @end example + +@c @item Add annotations from two dictionaries and . 
+ +@c @example +@c lem -c -d | lem -S lem -d +@c @end example + +@c @end itemize + + +@c --------------------------------------------------------------------- +@c GUE +@c --------------------------------------------------------------------- + +@page +@node gue +@section gue - morphological guesser + +@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} + +@item @strong{Authors:} @tab Micha Stolarski, Tomasz Obrbski +@item @strong{Component category:} @tab filter + +@end multitable + +@command{gue} guesess morphological descriptions of the form contained +in the @var{form} field. + +@menu +* gue command line options:: +* gue example:: +* gue dictionaries:: +@end menu + +@node gue command line options +@subsection Command line options + +@table @code + +@parhelp +@parversion +@parinteractive +@c @parfile +@c @paroutput +@c @parfail +@c @parcopy +@parinputfield +@paroutputfield +@pardictionary +@parprocess +@parselect +@parunselect +@paroneline +@paronefield + +@item @b{@minus{}@minus{}delta=@var{n}} +Stop displaying answers after fall of weight, that is, when weight difference between 2 subsequent results is more than delta value (default=`0.2'). + + +@item @b{@minus{}@minus{}cut-off=@var{n}} +Do not display answers with less weight than cut-off value (default=`200'). + + +@item @b{@minus{}@minus{}guess_count=@var{n}, @minus{}n @var{n}} +Guess up to n descriptions (default=`0', which means 'display all results'). + + + +@end table + +@node gue example +@subsection Example + +@example +command: gue -n 2 + +input: +0000 07 W smerfny + +output: +0000 07 W smerfny gue:,ADJ/CaDpGiNs +0000 07 W smerfny gue:,ADJ/CnvDpGaipNs +@end example + + +@node gue dictionaries +@subsection Dictionaries + +@command{gue} requires a dictionary. For now, the dictionary must be provided in binary (fsa) format. +The fsa format is created by compiling text-format dictionaries. 
+ + + +@subsubheading Text format + +Dictionary entries have the following structure: + +@example +@var{prefix}@code{*}@var{suffix}@code{;}@var{lemma}@code{,}@var{description}@code{:}@var{weight} +@end example + +@var{lemma} must be given in the cut-add format: + +@example +@code{@var{cut1}@var{add1}[-@var{cut2}@var{add2}]} +@end example +(no spaces in between): replace prefix of length @var{cut1} with +string @var{add1}, replace suffix of length @var{cut2} with string +@var{add2}. + + +Example: @code{3-4a@l{}y} transforms @i{najbielsi} into @i{bia@l{}y} + + +@var{description} contains the part of speech and morphosyntactic information (@xref{PMDBF dictionary}.). + +@var{weight} is an integer value between 1 and 999 indicating the +likelihood of the guess. + +@example +*k;1a,N/GfNsCa +naj*elszy;3-4a@l{}y,ADJ/...:... +@end example + + +@c --------------------------------------------------------------------- +@c COR +@c --------------------------------------------------------------------- + +@page +@node cor +@section cor - spelling corrector + +@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@item @strong{Authors:} @tab Tomasz Obrebski, Michal Stolarski +@item @strong{Component category:} @tab filter +@end multitable + +The spelling corrector applies Kemal Oflazer's dynamic programming +algorithm @cite{oflazer96} to the FSA representation of the set of +word forms of the Polex/PMDBF dictionary. Given an incorrect +word form it returns all word forms present in the dictionary whose +edit distance is smaller than the threshold given as the parameter. + +By default @code{cor} replaces the contents of the @var{form} field +with the new corrected value, placing the old contents in the @code{cor} +field.
+ + +@menu +* cor command line options:: +* cor dictionaries:: +@end menu + + +@node cor command line options +@subsection Command line options + +@table @code + +@parhelp +@parversion +@parinteractive +@c @parfile +@c @paroutput +@c @parfail +@c @parcopy +@parinputfield +@paroutputfield +@pardictionary +@parprocess +@parselect +@parunselect +@paroneline +@paronefield + +@item @b{@minus{}@minus{}distance=@var{int}, @minus{}n @var{int}} +Maximum edit distance (default='1'). + + +@end table + +@node cor dictionaries +@subsection Dictionaries + +@command{cor} requires a dictionary. The dictionary has to be provided in binary (fsa) format. +The fsa format is created by compiling text-format dictionaries. + +@subsubheading Text format + +The @command{cor} dictionary is a list of words: +@example +odlot +odlotowy +odludek +@end example + +@page +@node sen +@section sen - a sentensizer + +@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} + +@item @strong{Authors:} @tab Tomasz Obrbski +@item @strong{Component category:} @tab filter + +@end multitable + +@command{sen} detects sentence boundaries in UTT-formatted texts and marks them with special zero-length segments, in which the @var{type} field may contain the BOS (beginning of sentence) or EOS (end of sentence) annotation. + +@menu +@c * sen input:: +@c * sen output:: +* sen example:: +@end menu + +@node sen example +@subsection Example + +@example +command: sen + +input: +0000 05 W Cze +0005 01 P ! +0006 01 S _ +0007 02 W To +0009 01 S _ +0010 02 W ja +0012 01 P . +0013 01 S \n + +output: +0000 00 BOS * +0000 05 W Cze +0005 01 P ! +0006 00 EOS * +0006 00 BOS * +0006 01 S _ +0007 02 W To +0009 01 S _ +0010 02 W ja +0012 01 P . 
+0013 01 S \n +0014 00 EOS * +@end example + + +@c --------------------------------------------------------------------- +@c GPH +@c --------------------------------------------------------------------- + +@c @node gph - graphizer +@c @chapter gph - graphizer + +@c Authors: Tomasz Obrbski + + + +@c SER +@c --------------------------------------------------------------------- +@c --------------------------------------------------------------------- + +@page +@node ser +@section ser - pattern search tool + +@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@item @strong{Authors:} @tab Tomasz Obrbski +@item @strong{Component category:} @tab filter +@end multitable + +@command{ser} looks for patterns in UTT-formatted texts. + +@menu +* ser command line options:: +* ser pattern:: +* ser how ser works:: +* ser customization:: +* ser limitations:: +* ser requirements:: +@end menu + + +@c --------------------------------------------------------------------- +@node ser command line options +@subsection Command line options + +@table @code + +@parhelp +@parversion +@c @parfile +@c @paroutput +@c @parinputfield +@c @paroutputfield +@parprocess +@parinteractive + +@item @b{@minus{}@minus{}pattern=@var{pattern}, @minus{}e @var{pattern}} +The search pattern. + +@item @b{@minus{}@minus{}morph=@var{field}} +The name of the annotation field containing the morphological +description (default @code{lem}). + +@item @b{@minus{}@minus{}flex} +Only print the generated flex source code. + +@item @b{@minus{}@minus{}macro=@var{filename}} +Read macrodefinitions from file @var{filename} rather than from +default location. This option allows to redefine the set of terms. + +@item @b{@minus{}@minus{}define=@var{filename}} +Append macrodefinitions from file @var{filename}. This option +allows to extend the set of terms. 
+ +@end table + + +@c --------------------------------------------------------------------- +@node ser pattern +@subsection Pattern + +The @command{ser} pattern is a regular expression over terms corresponding +to text segments or segment sequences. Predefined terms are: + +@table @code + +@item seg(@var{t},@var{f},@var{a}) +a segment of type @var{t}, containing form @var{f} and annotation +@var{a} + +@item form(@var{f}) +a segment containing form @var{f} + +@item field(@var{f}) +a segment containing annotation field @var{f} + +@item space(@var{f}) +a space segment of form @var{f} + +@item word(@var{f}) +a word segment of form @var{f} + +@item punct(@var{f}) +a punct segment of form @var{f} + +@item number(@var{f}) +a number segment of form @var{f} + +@item lexeme(@var{f}) +a word segment with lemma @var{f} + +@item cat(@var{c}) +a word segment of category @var{c} + +@end table + +All arguments are optional. If an argument is omitted, an arbitrary +string of non-blank characters is assumed as the argument value. Term +arguments may be arbitrary character-level regular expressions. 
The +following special symbols can be used: + +@multitable {aaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@item @code{[@dots{}]} @tab a character class +@item @code{[^@dots{}]} @tab a negated character class +@item @code{|} @tab alternative +@item @code{*} @tab repetition, including zero times +@item @code{+} @tab repetition, at least one time +@item @code{?} @tab optionality +@item @code{@{@var{m},@var{n}@}} @tab repetition from @var{m} to @var{n} times +@item @code{@{@var{m},@}} @tab repetition @var{m} or more times +@item @code{@{@var{m}@}} @tab repetition @var{m} times +@item @code{\@var{ddd}} @tab the character with octal value @var{ddd} +@item @code{\x@var{hh}} @tab the character with hexadecimal value @var{hh} +@item @code{( )} @tab parentheses, used to override precedence +@c @end multitable + +@c @multitable {aaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@item @code{.} @tab a non-blank character +@item @code{\w} @tab a letter +@item @code{\W} @tab a non-blank character other than a letter +@item @code{\d} @tab a digit +@item @code{\D} @tab a non-blank character other than a digit +@item @code{\s} @tab a space or tab character +@item @code{\S} @tab a non-blank character (the same as @code{.}) +@item @code{\l} @tab a lowercase letter +@item @code{\L} @tab an uppercase letter +@end multitable + + +@noindent The following characters: +@example +@verb{% [ ] ^ | * + ? { } , . < > \ %} +@end example +must be escaped with a backslash, i.e. written as: +@example +@verb{% \[ \] \^ \| \* \+ \? \{ \} \, \. \< \> \\ %} +@end example + +@quotation Note +The special symbols are ... borrowed from Perl with minor +modifications ... for convenience +The meaning of certain special characters/sequences slightly differs +from their common ???. This is motivated by convenience reasons. +The meaning of the @code{.} special character is modified due to +the special function of spaces in utt files (they are field +separators).
Use @code{\s} to explicitly +@end quotation + +In the argument of the @code{cat} term a special operator <...> may be +used. A category specification enclosed in angle brackets matches all +category descriptions which are consistent (non-contradictory) with the +specification. For example @code{} matches all noun descriptions, +@code{} matches all adjectives in accusative or nominal case. + + +@* +@noindent @b{Examples of one-segment patterns:} + +@multitable {aaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@item @code{seg} @tab any segment +@item @code{word} @tab any word-form +@item @code{word(pomocy)} @tab the word-form @samp{pomocy} +@item @code{word(naj.+)} @tab a word-form beginning with @samp{naj} +@item @code{word(\L\l+)} @tab a capitalized word-form +@item @code{punct} @tab a punctuation character +@item @code{space(.*\\n.*)} @tab a space segment containing a newline character +@item @code{lexeme(pomoc)} @tab any form of the lexeme 'pomoc' +@item @code{cat(N/.*)} @tab a word which category starts with @code{N/} +@item @code{cat()} @tab a word which category matches @code{N/Ca} +@end multitable + +@* +@noindent @b{Examples of multi-segment patterns:} + +@table @code + +@item (word(\L) punct(\.) space?)+ word(\L\l+) +a sequence of initials followed by a surname + +@item punct seg(W|S|N)* cat() seg(W|S|N)* punct +a text fragment between two punctuation characters, containing an +ocurrence of a relative pronoun + +@end table + + +@node ser how ser works +@subsection How ser works + +@node ser customization +@subsection Customization + +@c All predefined terms correspond to single segments, + +@example +define(`verbseq', `(cat(V) (space cat(V)))') +@end example + + +the term @code{cat()} may not be used as a ... of + +@c See @command{m4} manual for further details on macro definition format. + +@node ser limitations +@subsection Limitations + +more than 3 attributes in <>. 
+ +@node ser requirements +@subsection Requirements + +In order to run @command{ser}, the following programs must be +installed in the system: + +@itemize + +@item @command{m4} +@item @command{grep} +@item @command{flex} +@item @command{gcc} + +@end itemize + + +@c GRP +@c --------------------------------------------------------------------- +@c --------------------------------------------------------------------- + +@page +@node grp +@section grp - pattern search tool + +@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@item @strong{Authors:} @tab Tomasz Obrebski +@item @strong{Component category:} @tab filter +@end multitable + + +@code{grp} selects sentences containing an expression matching a +pattern. The pattern format is exactly the same as that accepted by +@code{ser}. + +@code{grp} is intended mainly for speeding up the corpus search process. +It is extremely fast (processing speed is usually higher than the speed +of reading the corpus file from disk). + + + +@c @menu +@c * ser command line options:: +@c * ser pattern:: +@c * ser how ser works:: +@c * ser customization:: +@c * ser limitations:: +@c * ser requirements:: +@c @end menu +@menu +* grp command line options:: +* grp pattern:: +* grp hints:: +@end menu + +@node grp command line options +@subsection Command line options + +@table @code + +@parhelp +@parversion +@c @parfile +@c @paroutput +@c @parinputfield +@c @paroutputfield +@parprocess +@parinteractive + +@item @b{@minus{}@minus{}pattern=@var{pattern}, @minus{}e @var{pattern}} +The search pattern. + +@item @b{@minus{}@minus{}morph=@var{field}} +The name of the annotation field containing the morphological +description (default @code{lem}). + +@item @b{@minus{}@minus{}command} +Only print the generated flex source code. + +@item @b{@minus{}@minus{}macro=@var{filename}} +Read macrodefinitions from file @var{filename} rather than from +the default location. This option allows redefining the set of terms.
+ +@item @b{@minus{}@minus{}define=@var{filename}} +Append macrodefinitions from file @var{filename}. This option +allows extending the set of terms. + +@end table + + +@node grp pattern +@subsection Pattern + +(see @code{ser}) + +@node grp hints +@subsection Hints + +The corpus search speed may be increased by combining @command{grp} with the @command{lzop} +compression tool (@command{grp} usually processes data faster than it is read from a +disk, especially for slow laptop drives). + +@example +cat corpus | tok | sen | lem | grp -a p | lzop -7 > corpus.grp.lzo +@end example + +@example +lzop -cd corpus.grp.lzo | grp -a gP -e @var{EXPR} | ser -e @var{EXPR} +@end example + + +@c --------------------------------------------------------------------- +@c kot +@c --------------------------------------------------------------------- +@c --------------------------------------------------------------------- + +@page +@node kot +@section kot - untokenizer + +Authors: Tomasz Obrebski + +@command{kot} is the opposite of @command{tok}. It changes UTT-formatted text into plain text.
+ +@menu +* kot command line options:: +* kot usage examples:: +@end menu + +@node kot command line options +@subsection Command line options + +@table @code + +@parhelp + +@c @item @b{@minus{}@minus{}version}, @b{@minus{}v} + +@c @item @b{@minus{}@minus{}file=@var{filename}, @minus{}f @var{filename}} + +@c @item @b{@minus{}@minus{}output=@var{filename}, @minus{}o @var{filename}} + +@c @item @b{@minus{}@minus{}interactive @minus{}i} + +@c @item @b{@minus{}@minus{}config=@var{filename}} + +@item + +@item @b{@minus{}@minus{}gap-fill=@var{string}, @minus{}g @var{string}} +print @var{string} between nonadjacent segments of the input file + +@item @b{@minus{}@minus{}spaces, @minus{}r} +retain the special characters @code{_}, @code{\t}, +@code{\n}, @code{\r}, @code{\f} unexpanded in the output + +@end table + +@node kot usage examples +@subsection Usage examples + +@example +cat legia.txt | tok | kot +@end example + +@example +cat legia.txt | tok | lem -1 | kot +@end example + +@c CON............................................................ +@c ............................................................... +@c ............................................................... + +@page +@node con +@section con - concordance table generator + +@command{con} generates a concordance table based on a pattern given to @command{ser}. 
+ +@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@item @strong{Authors:} @tab Justyna Walkowska +@item @strong{Component category:} @tab sink +@end multitable +@c + +@menu +* con command line options:: +* con usage example:: +* con hints:: +@end menu + +@node con command line options +@subsection Command line options + +@table @code + +@parhelp + +@c @item @b{@minus{}@minus{}help}, @b{@minus{}h} +@c @item @b{@minus{}@minus{}version}, @b{@minus{}v} +@c @item @b{@minus{}@minus{}file=@var{filename}, @minus{}f @var{filename}} +@c @item @b{@minus{}@minus{}output=@var{filename}, @minus{}o @var{filename}} +@c @item @b{@minus{}@minus{}fail=@var{filename}, @minus{}e @var{filename}} [???] +@c @item @b{@minus{}@minus{}copy, @minus{}c} [???] +@c @item @b{@minus{}@minus{}input-field=@var{fieldname}, @minus{}I @var{fieldname}} +@c @item @b{@minus{}@minus{}output-field=@var{fieldname}, @minus{}O @var{fieldname}} +@c @item @b{@minus{}@minus{}process=@var{class}, @minus{}p @var{class}} +@c @item @b{@minus{}@minus{}interactive @minus{}i} +@c @item @b{@minus{}@minus{}config=@var{filename}} +@c @item +@c @item @b{@minus{}@minus{}pattern=@var{pattern}, @minus{}e @var{pattern}} +@c search pattern +@c +@c @item @b{@minus{}@minus{}flex} +@c only print the generated flex source code +@c +@c @item @b{@minus{}@minus{}macro=@var{filename}} +@c read macrodefinitions from file @var{filename} rather than from +@c default location. This option allows to redefine the set of terms. +@c +@c @item @b{@minus{}@minus{}define=@var{filename}} +@c append macrodefinitions from file @var{filename}. This option +@c allows to extend the set of terms. + +@item @b{@minus{}@minus{}left @minus{}l} + Left context info (default='30c'). 
Example: +@example + -l=5c: left context is 5 characters + -l=5w: left context is 5 words + -l=5s: left context is 5 non-empty input lines + -l='\s*\S+\sr\S+BOS': left context starts with the given regex +@end example + +@item @b{@minus{}@minus{}right @minus{}r} + Right context info (default='30c'). +@item @b{@minus{}@minus{}trim @minus{}t} + Clear incomplete words from output. +@item @b{@minus{}@minus{}white @minus{}w} + DO NOT change all whitespace characters into spaces. +@item @b{@minus{}@minus{}column @minus{}c} + Left column minimal width in characters (default = 0). +@item @b{@minus{}@minus{}ignore @minus{}i} + Ignore segment inconsistency in the input. +@item @b{@minus{}@minus{}bon} + Beginning of selected segment (regex, default='[0-9]+ [0-9]+ BOM .*'). +@item @b{@minus{}@minus{}eob} + End of selected segment (regex, default='[0-9]+ [0-9]+ EOM .*'). +@item @b{@minus{}@minus{}bod} + Selected segment beginning display string (default='['). +@item @b{@minus{}@minus{}eod} + Selected segment end display string (default=']'). + + + +@end table + +@node con usage example +@subsection Usage example +@example +cat file.txt | tok | lem -1 | ser -e 'lexeme(dom)' | con +@end example + + +@node con hints +@subsection Hints + +@command{con} is a rather slow program. Do not pass large amounts of +redundant text through this program. @command{con} works fine in the following +sequence: + +@example +...
| grp -e EXPR | ser -e EXPR | con +@end example + + + +@c --------------------------------------------------------------------- +@c --------------------------------------------------------------------- + +@page +@node Auxiliary tools +@chapter Auxiliary tools + +@menu +* compiledic:: dictionary compiler +* fla:: UTT file flattener +* unfla:: UTT file unflattener +@end menu + + +@page +@node compiledic +@section compiledic - the dictionary compiler + +@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@item @strong{Authors:} @tab Michal Stolarski, Tomasz Obrebski +@item @strong{Component category:} @tab additional tool +@end multitable +@c + +@command{compiledic} compiles dictionaries in text format (@code{.dic} extension) into binary +(FSA) format (@code{.bin} extension). + +Automaton representation of a dictionary is built using the AT&T tools: +@itemize +@item AT&T FSM Library, +@item AT&T Lextools. +@end itemize + +In order for the @command{compiledic} program to work you have to install the +above mentioned packages into your system. They are freely available +for non-commercial use. + +Usage: +@example + compiledic @var{name}.dic +@end example + +The file @file{@var{name}.bin} will be generated. + +Note: The program produces a lot of temporary files which are +stored in the current directory. They are deleted after successful +termination of the program. + +@c @menu +@c * con command line options:: +@c * con usage example:: +@c * con hints:: +@c @end menu + + +@page +@node fla +@section fla - the UTT file flattener + +@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@item @strong{Authors:} @tab Tomasz Obrebski +@item @strong{Component category:} @tab filter +@end multitable +@c + +@command{fla} ``flattens'' a UTT file by merging segments belonging +to one sentence into one line. Technically, end-of-line characters +('\n', ASCII code 10) are replaced with form-feed characters ('\f', +ASCII code 12).
The flattening makes it possible to process UTT files +with such tools as @command{grep} or @command{sed} sentence by +sentence (used in @command{grp} and @command{mar}). + +Flattened files should have the suffix @code{.fla}, e.g. @file{thetext.utt.fla}. + +Flattened files are still human-readable. + +Usage: + +@example + fla [@var{regex}] +@end example + +The optional argument is a regular expression describing segments +which should be treated as sentence beginnings (the test is: the +segment contains a fragment matching @var{regex}). By +default, segments containing a field @code{BOS} are sought. +@c @menu +@c * con command line options:: +@c * con usage example:: +@c * con hints:: +@c @end menu + + + +@page +@node unfla +@section unfla - the UTT file unflattener + +@multitable {aaaaaaaaaaaaaaaaaaaaaaaaa} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@item @strong{Authors:} @tab Tomasz Obrebski +@item @strong{Component category:} @tab filter +@end multitable + +@command{unfla} transforms a flattened UTT file, produced by +@command{fla}, into the regular format by restoring end-of-line +characters.
+ + + + +@c --------------------------------------------------------------------- +@c USAGE EXAMPLES +@c --------------------------------------------------------------------- + +@node Usage examples +@chapter Usage examples + +@subsubheading Simple pipelines + +@enumerate + +@item tokenization + +cat text | tok > output1 + +@item morphological annotation (1) + +simple dictionary-based lemmatization + +cat text | tok | lem > output1 + +@item morphological annotation (2) + +1) perform dictionary-based lemmatization +2) guess descriptions for words which have no annotation + +@example +cat text | tok | lem | gue -S lem > output2 +@end example + +@item morphological annotation (3) + +1) perform dictionary-based lemmatization +2) try to correct words with no annotation +3) perform dictionary-based lemmatization of corrected words +4) guess descriptions for words which still have no annotation + +@example +cat text | tok | lem | cor -p W -S lem | lem -I cor | gue -p W -S lem +@end example +@item spelling correction + + + +@example +cat text | tok | lem --only-fail | cor -1 > output3 +@end example + +@item Expression extraction + +Extraction of all occurrences of a verb followed by a form of the noun 'rozmowa'. + +@example +cat text | tok | lem -1 | ser -e 'cat() space lexeme(rozmowa)' -m | kot > output4 +@end example + +@item A word in context + +Extraction of text fragments containing a form of the lexeme 'rozmowa' in +the context of 5 preceding and 5 succeeding corpus segments.
+ +@example +cat text | tok | lem -1 | ser -e 'seg@{5@} lexeme(rozmowa) seg@{5@}' -m | kot > output +@end example + +@item generation of concordance table (1) + +@example +cat text | tok | lem -1 | ser -e 'cat() space lexeme(rozmowa)' | con +@end example + +10" + +@item generation of concordance table (2) + +The same as above but much faster + +@example +cat text | tok | lem -1 | \ +grp -e 'cat() space lexeme(rozmowa)' | \ +ser -e 'cat() space lexeme(rozmowa)' | \ +con +@end example + +2" + +@item generation of concordance table (3) + +Usually, one searches the same corpus repeatedly. In +such a case it is advisable to transform the corpus data into the format +required by @command{grp} first, and then use the preprocessed data. + +As @command{grp} (@command{grep}) processes data faster than it is +read from the disk drive, the search time may be shortened further by +using file compression techniques. We suggest using @command{lzop}. + +@item the fastest way to search a large corpus + +step 1: preprocessing + +@example +cat corpus | tok | sen | lem -1 \ +| grp -a p | lzop -7 > corpus.grp.lzo +@end example + +step 2: search + +@example +lzop -cd corpus.grp.lzo | grp -a gP -e 'cat() space +lexeme(rozmowa)' | ser -e 'cat() space lexeme(rozmowa)' | con +@end example + +@end enumerate + +@subsubheading More complicated configurations + + +@example +mknod fifo1 p +mknod fifo2 p +mknod fifo3 p +mknod fifo4 p +mknod fifo5 p + +tok | lem -p W -e fifo1 > fifo2 & +cor -e fifo3 < fifo1 | lem > fifo4 & +gue < fifo3 > fifo5 & +sort -m fifo2 fifo4 fifo5 + +rm fifo?
+@end example + + +@c --------------------------------------------------------------------- +@c --------------------------------------------------------------------- + +@c --------------------------------------------------------------------- +@c PMDBF DICTIONARY +@c --------------------------------------------------------------------- + +@node PMDBF dictionary +@chapter PMDBF dictionary + +UTT components come with lexical data derived from Polish +Morphological Database (PMDB). + +@menu +* PMDBF files:: +* PMDBF tag structure:: +* PMDBF parts of speech:: +* PMDBF morphosyntactic attributes:: +@end menu + +@node PMDBF files +@section Files + +@node PMDBF tag structure +@section Tag structure + +pos = [[:upper:]]+ + +attr = [[:upper:]]+ + +val = [[:lower:][:digit:]?!*+-] | <[^>\n]+> + +descr = pos ( / ( attr val + ) + ) ? + +@node PMDBF parts of speech +@section Parts of speech + +@multitable {ADJPRP} { adjectival-passive-participle } +@item @code{N} @tab noun +@item @code{NPRO} @tab nominal-pronoun +@item @code{NV} @tab deverbal-noun +@item @code{V} @tab verb +@item @code{BYC} @tab byc +@item @code{VNI} @tab non-inflected-verb +@item @code{ADJ} @tab adjective +@item @code{ADJPAP} @tab adjectival-passive-participle +@item @code{ADJPRP} @tab adjectival-present-participle +@item @code{ADJPP} @tab adjectival-past-participle +@item @code{ADJPRO} @tab adjectival-pronoun +@item @code{ADJNUM} @tab adjectival-numeral +@item @code{ADV} @tab adverb +@item @code{ADVANP} @tab adverbial-anterior-participle +@item @code{ADVPRP} @tab adverbial-present-participle +@item @code{ADVPRO} @tab adverbial-pronoun +@item @code{ADVNUM} @tab adverbial-numeral +@item @code{P} @tab preposition +@item @code{PPRO} @tab prep-noun-pronoun +@item @code{CONJ} @tab conjunction +@item @code{EXCL} @tab exclamation +@item @code{APP} @tab call +@item @code{ONO} @tab onomatopoeia +@item @code{PART} @tab particle +@item @code{NUMCRD} @tab cardinal-numeral +@item @code{NUMCOL} @tab collective-numeral +@item 
@code{NUMPAR} @tab partitive-numeral +@item @code{NUMORD} @tab ordinal-numeral +@end multitable + +@node PMDBF morphosyntactic attributes +@section Morphosyntactic attributes + +@multitable {Attr} {Val} {aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa} +@c @headitem Attr @tab Val @tab Description +@item +@code{A} @tab @tab Aspect +@item +@tab @code{p} @tab perfect, +@item +@tab @code{i} @tab imperfect. +@item +@item +@code{V} @tab @tab Verb-Form +@item +@tab @code{b} @tab infinitive, +@item +@tab @code{p} @tab personal, +@item +@tab @code{i} @tab impersonal. +@item +@item +@code{M} @tab @tab Mood +@item +@tab @code{d} @tab declarative, +@item +@tab @code{c} @tab conditional, +@item +@tab @code{i} @tab imperative. +@item +@item +@code{T} @tab @tab Tense +@item +@tab @code{a} @tab past, +@item +@tab @code{r} @tab present, +@item +@tab @code{f} @tab future. +@item +@item +@code{P} @tab @tab Person +@item +@tab @code{1} @tab 1, +@item +@tab @code{2} @tab 2, +@item +@tab @code{3} @tab 3. +@item +@item +@code{D} @tab @tab Degree +@item +@tab @code{p} @tab positive, +@item +@tab @code{c} @tab comparative, +@item +@tab @code{s} @tab superlative. +@item +@item +@code{N} @tab @tab Number +@item +@tab @code{s} @tab singular, +@item +@tab @code{p} @tab plural. +@item +@item +@code{C} @tab @tab Case +@item +@tab @code{n} @tab nominative, +@item +@tab @code{g} @tab genitive, +@item +@tab @code{d} @tab dative, +@item +@tab @code{a} @tab accusative, +@item +@tab @code{i} @tab instrumental, +@item +@tab @code{l} @tab locative, +@item +@tab @code{v} @tab vocative. +@item +@item +@code{G} @tab @tab Gender +@item +@tab @code{p} @tab masculine-personal, +@item +@tab @code{a} @tab masculine-animal, +@item +@tab @code{i} @tab masculine-inanimate, +@item +@tab @code{f} @tab feminine, +@item +@tab @code{n} @tab neuter.
+@end multitable + + +@c --------------------------------------------------------------------- +@c --------------------------------------------------------------------- +@c +@c @node Examples +@c @chapter Examples + +@c ---------------------------------------------------------------------- +@c ---------------------------------------------------------------------- + +@node GNU Free Documentation License +@chapter GNU Free Documentation License + +@c The GNU Free Documentation License. +@center Version 1.2, November 2002 + +@c This file is intended to be included within another document, +@c hence no sectioning command or @node. + +@display +Copyright @copyright{} 2000,2001,2002 Free Software Foundation, Inc. +51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA + +Everyone is permitted to copy and distribute verbatim copies +of this license document, but changing it is not allowed. +@end display + +@enumerate 0 +@item +PREAMBLE + +The purpose of this License is to make a manual, textbook, or other +functional and useful document @dfn{free} in the sense of freedom: to +assure everyone the effective freedom to copy and redistribute it, +with or without modifying it, either commercially or noncommercially. +Secondarily, this License preserves for the author and publisher a way +to get credit for their work, while not being considered responsible +for modifications made by others. + +This License is a kind of ``copyleft'', which means that derivative +works of the document must themselves be free in the same sense. It +complements the GNU General Public License, which is a copyleft +license designed for free software. + +We have designed this License in order to use it for manuals for free +software, because free software needs free documentation: a free +program should come with manuals providing the same freedoms that the +software does. 
But this License is not limited to software manuals; +it can be used for any textual work, regardless of subject matter or +whether it is published as a printed book. We recommend this License +principally for works whose purpose is instruction or reference. + +@item +APPLICABILITY AND DEFINITIONS + +This License applies to any manual or other work, in any medium, that +contains a notice placed by the copyright holder saying it can be +distributed under the terms of this License. Such a notice grants a +world-wide, royalty-free license, unlimited in duration, to use that +work under the conditions stated herein. The ``Document'', below, +refers to any such manual or work. Any member of the public is a +licensee, and is addressed as ``you''. You accept the license if you +copy, modify or distribute the work in a way requiring permission +under copyright law. + +A ``Modified Version'' of the Document means any work containing the +Document or a portion of it, either copied verbatim, or with +modifications and/or translated into another language. + +A ``Secondary Section'' is a named appendix or a front-matter section +of the Document that deals exclusively with the relationship of the +publishers or authors of the Document to the Document's overall +subject (or to related matters) and contains nothing that could fall +directly within that overall subject. (Thus, if the Document is in +part a textbook of mathematics, a Secondary Section may not explain +any mathematics.) The relationship could be a matter of historical +connection with the subject or with related matters, or of legal, +commercial, philosophical, ethical or political position regarding +them. + +The ``Invariant Sections'' are certain Secondary Sections whose titles +are designated, as being those of Invariant Sections, in the notice +that says that the Document is released under this License. 
If a +section does not fit the above definition of Secondary then it is not +allowed to be designated as Invariant. The Document may contain zero +Invariant Sections. If the Document does not identify any Invariant +Sections then there are none. + +The ``Cover Texts'' are certain short passages of text that are listed, +as Front-Cover Texts or Back-Cover Texts, in the notice that says that +the Document is released under this License. A Front-Cover Text may +be at most 5 words, and a Back-Cover Text may be at most 25 words. + +A ``Transparent'' copy of the Document means a machine-readable copy, +represented in a format whose specification is available to the +general public, that is suitable for revising the document +straightforwardly with generic text editors or (for images composed of +pixels) generic paint programs or (for drawings) some widely available +drawing editor, and that is suitable for input to text formatters or +for automatic translation to a variety of formats suitable for input +to text formatters. A copy made in an otherwise Transparent file +format whose markup, or absence of markup, has been arranged to thwart +or discourage subsequent modification by readers is not Transparent. +An image format is not Transparent if used for any substantial amount +of text. A copy that is not ``Transparent'' is called ``Opaque''. + +Examples of suitable formats for Transparent copies include plain +@sc{ascii} without markup, Texinfo input format, La@TeX{} input +format, @acronym{SGML} or @acronym{XML} using a publicly available +@acronym{DTD}, and standard-conforming simple @acronym{HTML}, +PostScript or @acronym{PDF} designed for human modification. Examples +of transparent image formats include @acronym{PNG}, @acronym{XCF} and +@acronym{JPG}. 
Opaque formats include proprietary formats that can be +read and edited only by proprietary word processors, @acronym{SGML} or +@acronym{XML} for which the @acronym{DTD} and/or processing tools are +not generally available, and the machine-generated @acronym{HTML}, +PostScript or @acronym{PDF} produced by some word processors for +output purposes only. + +The ``Title Page'' means, for a printed book, the title page itself, +plus such following pages as are needed to hold, legibly, the material +this License requires to appear in the title page. For works in +formats which do not have any title page as such, ``Title Page'' means +the text near the most prominent appearance of the work's title, +preceding the beginning of the body of the text. + +A section ``Entitled XYZ'' means a named subunit of the Document whose +title either is precisely XYZ or contains XYZ in parentheses following +text that translates XYZ in another language. (Here XYZ stands for a +specific section name mentioned below, such as ``Acknowledgements'', +``Dedications'', ``Endorsements'', or ``History''.) To ``Preserve the Title'' +of such a section when you modify the Document means that it remains a +section ``Entitled XYZ'' according to this definition. + +The Document may include Warranty Disclaimers next to the notice which +states that this License applies to the Document. These Warranty +Disclaimers are considered to be included by reference in this +License, but only as regards disclaiming warranties: any other +implication that these Warranty Disclaimers may have is void and has +no effect on the meaning of this License. + +@item +VERBATIM COPYING + +You may copy and distribute the Document in any medium, either +commercially or noncommercially, provided that this License, the +copyright notices, and the license notice saying this License applies +to the Document are reproduced in all copies, and that you add no other +conditions whatsoever to those of this License. 
You may not use +technical measures to obstruct or control the reading or further +copying of the copies you make or distribute. However, you may accept +compensation in exchange for copies. If you distribute a large enough +number of copies you must also follow the conditions in section 3. + +You may also lend copies, under the same conditions stated above, and +you may publicly display copies. + +@item +COPYING IN QUANTITY + +If you publish printed copies (or copies in media that commonly have +printed covers) of the Document, numbering more than 100, and the +Document's license notice requires Cover Texts, you must enclose the +copies in covers that carry, clearly and legibly, all these Cover +Texts: Front-Cover Texts on the front cover, and Back-Cover Texts on +the back cover. Both covers must also clearly and legibly identify +you as the publisher of these copies. The front cover must present +the full title with all words of the title equally prominent and +visible. You may add other material on the covers in addition. +Copying with changes limited to the covers, as long as they preserve +the title of the Document and satisfy these conditions, can be treated +as verbatim copying in other respects. + +If the required texts for either cover are too voluminous to fit +legibly, you should put the first ones listed (as many as fit +reasonably) on the actual cover, and continue the rest onto adjacent +pages. + +If you publish or distribute Opaque copies of the Document numbering +more than 100, you must either include a machine-readable Transparent +copy along with each Opaque copy, or state in or with each Opaque copy +a computer-network location from which the general network-using +public has access to download using public-standard network protocols +a complete Transparent copy of the Document, free of added material. 
+If you use the latter option, you must take reasonably prudent steps, +when you begin distribution of Opaque copies in quantity, to ensure +that this Transparent copy will remain thus accessible at the stated +location until at least one year after the last time you distribute an +Opaque copy (directly or through your agents or retailers) of that +edition to the public. + +It is requested, but not required, that you contact the authors of the +Document well before redistributing any large number of copies, to give +them a chance to provide you with an updated version of the Document. + +@item +MODIFICATIONS + +You may copy and distribute a Modified Version of the Document under +the conditions of sections 2 and 3 above, provided that you release +the Modified Version under precisely this License, with the Modified +Version filling the role of the Document, thus licensing distribution +and modification of the Modified Version to whoever possesses a copy +of it. In addition, you must do these things in the Modified Version: + +@enumerate A +@item +Use in the Title Page (and on the covers, if any) a title distinct +from that of the Document, and from those of previous versions +(which should, if there were any, be listed in the History section +of the Document). You may use the same title as a previous version +if the original publisher of that version gives permission. + +@item +List on the Title Page, as authors, one or more persons or entities +responsible for authorship of the modifications in the Modified +Version, together with at least five of the principal authors of the +Document (all of its principal authors, if it has fewer than five), +unless they release you from this requirement. + +@item +State on the Title page the name of the publisher of the +Modified Version, as the publisher. + +@item +Preserve all the copyright notices of the Document. + +@item +Add an appropriate copyright notice for your modifications +adjacent to the other copyright notices. 
+ +@item +Include, immediately after the copyright notices, a license notice +giving the public permission to use the Modified Version under the +terms of this License, in the form shown in the Addendum below. + +@item +Preserve in that license notice the full lists of Invariant Sections +and required Cover Texts given in the Document's license notice. + +@item +Include an unaltered copy of this License. + +@item +Preserve the section Entitled ``History'', Preserve its Title, and add +to it an item stating at least the title, year, new authors, and +publisher of the Modified Version as given on the Title Page. If +there is no section Entitled ``History'' in the Document, create one +stating the title, year, authors, and publisher of the Document as +given on its Title Page, then add an item describing the Modified +Version as stated in the previous sentence. + +@item +Preserve the network location, if any, given in the Document for +public access to a Transparent copy of the Document, and likewise +the network locations given in the Document for previous versions +it was based on. These may be placed in the ``History'' section. +You may omit a network location for a work that was published at +least four years before the Document itself, or if the original +publisher of the version it refers to gives permission. + +@item +For any section Entitled ``Acknowledgements'' or ``Dedications'', Preserve +the Title of the section, and preserve in the section all the +substance and tone of each of the contributor acknowledgements and/or +dedications given therein. + +@item +Preserve all the Invariant Sections of the Document, +unaltered in their text and in their titles. Section numbers +or the equivalent are not considered part of the section titles. + +@item +Delete any section Entitled ``Endorsements''. Such a section +may not be included in the Modified Version. 
+ +@item +Do not retitle any existing section to be Entitled ``Endorsements'' or +to conflict in title with any Invariant Section. + +@item +Preserve any Warranty Disclaimers. +@end enumerate + +If the Modified Version includes new front-matter sections or +appendices that qualify as Secondary Sections and contain no material +copied from the Document, you may at your option designate some or all +of these sections as invariant. To do this, add their titles to the +list of Invariant Sections in the Modified Version's license notice. +These titles must be distinct from any other section titles. + +You may add a section Entitled ``Endorsements'', provided it contains +nothing but endorsements of your Modified Version by various +parties---for example, statements of peer review or that the text has +been approved by an organization as the authoritative definition of a +standard. + +You may add a passage of up to five words as a Front-Cover Text, and a +passage of up to 25 words as a Back-Cover Text, to the end of the list +of Cover Texts in the Modified Version. Only one passage of +Front-Cover Text and one of Back-Cover Text may be added by (or +through arrangements made by) any one entity. If the Document already +includes a cover text for the same cover, previously added by you or +by arrangement made by the same entity you are acting on behalf of, +you may not add another; but you may replace the old one, on explicit +permission from the previous publisher that added the old one. + +The author(s) and publisher(s) of the Document do not by this License +give permission to use their names for publicity for or to assert or +imply endorsement of any Modified Version. 
+ +@item +COMBINING DOCUMENTS + +You may combine the Document with other documents released under this +License, under the terms defined in section 4 above for modified +versions, provided that you include in the combination all of the +Invariant Sections of all of the original documents, unmodified, and +list them all as Invariant Sections of your combined work in its +license notice, and that you preserve all their Warranty Disclaimers. + +The combined work need only contain one copy of this License, and +multiple identical Invariant Sections may be replaced with a single +copy. If there are multiple Invariant Sections with the same name but +different contents, make the title of each such section unique by +adding at the end of it, in parentheses, the name of the original +author or publisher of that section if known, or else a unique number. +Make the same adjustment to the section titles in the list of +Invariant Sections in the license notice of the combined work. + +In the combination, you must combine any sections Entitled ``History'' +in the various original documents, forming one section Entitled +``History''; likewise combine any sections Entitled ``Acknowledgements'', +and any sections Entitled ``Dedications''. You must delete all +sections Entitled ``Endorsements.'' + +@item +COLLECTIONS OF DOCUMENTS + +You may make a collection consisting of the Document and other documents +released under this License, and replace the individual copies of this +License in the various documents with a single copy that is included in +the collection, provided that you follow the rules of this License for +verbatim copying of each of the documents in all other respects. + +You may extract a single document from such a collection, and distribute +it individually under this License, provided you insert a copy of this +License into the extracted document, and follow this License in all +other respects regarding verbatim copying of that document. 
+ +@item +AGGREGATION WITH INDEPENDENT WORKS + +A compilation of the Document or its derivatives with other separate +and independent documents or works, in or on a volume of a storage or +distribution medium, is called an ``aggregate'' if the copyright +resulting from the compilation is not used to limit the legal rights +of the compilation's users beyond what the individual works permit. +When the Document is included in an aggregate, this License does not +apply to the other works in the aggregate which are not themselves +derivative works of the Document. + +If the Cover Text requirement of section 3 is applicable to these +copies of the Document, then if the Document is less than one half of +the entire aggregate, the Document's Cover Texts may be placed on +covers that bracket the Document within the aggregate, or the +electronic equivalent of covers if the Document is in electronic form. +Otherwise they must appear on printed covers that bracket the whole +aggregate. + +@item +TRANSLATION + +Translation is considered a kind of modification, so you may +distribute translations of the Document under the terms of section 4. +Replacing Invariant Sections with translations requires special +permission from their copyright holders, but you may include +translations of some or all Invariant Sections in addition to the +original versions of these Invariant Sections. You may include a +translation of this License, and all the license notices in the +Document, and any Warranty Disclaimers, provided that you also include +the original English version of this License and the original versions +of those notices and disclaimers. In case of a disagreement between +the translation and the original version of this License or a notice +or disclaimer, the original version will prevail. 
+ +If a section in the Document is Entitled ``Acknowledgements'', +``Dedications'', or ``History'', the requirement (section 4) to Preserve +its Title (section 1) will typically require changing the actual +title. + +@item +TERMINATION + +You may not copy, modify, sublicense, or distribute the Document except +as expressly provided for under this License. Any other attempt to +copy, modify, sublicense or distribute the Document is void, and will +automatically terminate your rights under this License. However, +parties who have received copies, or rights, from you under this +License will not have their licenses terminated so long as such +parties remain in full compliance. + +@item +FUTURE REVISIONS OF THIS LICENSE + +The Free Software Foundation may publish new, revised versions +of the GNU Free Documentation License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. See +@uref{http://www.gnu.org/copyleft/}. + +Each version of the License is given a distinguishing version number. +If the Document specifies that a particular numbered version of this +License ``or any later version'' applies to it, you have the option of +following the terms and conditions either of that specified version or +of any later version that has been published (not as a draft) by the +Free Software Foundation. If the Document does not specify a version +number of this License, you may choose any version ever published (not +as a draft) by the Free Software Foundation. +@end enumerate + +@page +@heading ADDENDUM: How to use this License for your documents + +To use this License in a document you have written, include a copy of +the License in the document and put the following copyright and +license notices just after the title page: + +@smallexample +@group + Copyright (C) @var{year} @var{your name}. 
+ Permission is granted to copy, distribute and/or modify this document + under the terms of the GNU Free Documentation License, Version 1.2 + or any later version published by the Free Software Foundation; + with no Invariant Sections, no Front-Cover Texts, and no Back-Cover + Texts. A copy of the license is included in the section entitled ``GNU + Free Documentation License''. +@end group +@end smallexample + +If you have Invariant Sections, Front-Cover Texts and Back-Cover Texts, +replace the ``with@dots{}Texts.'' line with this: + +@smallexample +@group + with the Invariant Sections being @var{list their titles}, with + the Front-Cover Texts being @var{list}, and with the Back-Cover Texts + being @var{list}. +@end group +@end smallexample + +If you have Invariant Sections without Cover Texts, or some other +combination of the three, merge those two alternatives to suit the +situation. + +If your document contains nontrivial examples of program code, we +recommend releasing these examples in parallel under your choice of +free software license, such as the GNU General Public License, +to permit their use in free software. + +@c Local Variables: +@c ispell-local-pdict: "ispell-dict" +@c End: + + +@c --------------------------------------------------------------------- +@c --------------------------------------------------------------------- + +@node Reporting bugs +@chapter Reporting bugs + +Report bugs to . + +@c --------------------------------------------------------------------- +@c --------------------------------------------------------------------- + +@c @node Copyright +@c @chapter Copyright +@c +@c Copyright 2004 by Tomasz Obrebski +@c This software is free for research and educational use. 
+ +@c --------------------------------------------------------------------- +@c --------------------------------------------------------------------- + +@node Author +@chapter Author + + +@bye diff --git a/app/lib/ser.l.template b/app/lib/ser.l.template new file mode 100644 index 0000000..1c72081 --- /dev/null +++ b/app/lib/ser.l.template @@ -0,0 +1,30 @@ +%{ + #include + int n=0; +%} + +%% + +PATTERN { + int start, end, len; + char *lastseg, *tmp; + if(yytext[yyleng-1]!='\n') + {fprintf(stderr,"ser: pattern matches incomplete line\n"); exit(1);} + n++; + sscanf(yytext,"%d %d",&start,&len); + yytext[yyleng-1]='\0'; + if(tmp=strrchr(yytext,'\n')) + { + lastseg=tmp+1; + sscanf(lastseg,"%d %d", &end, &len); + } + else + end=start; + yytext[yyleng-1]='\n'; + printf("%04d 00 BOM * ser:%d\n",start,n); + ECHO; + printf("%04d 00 EOM * ser:%d\n",end+len,n); + } + + +.*\n DEFAULTACTION; diff --git a/app/lib/terms.m4 b/app/lib/terms.m4 new file mode 100644 index 0000000..d4ea143 --- /dev/null +++ b/app/lib/terms.m4 @@ -0,0 +1,52 @@ +divert(-1) +#-------------------------------------------------------------------------- + +# Macros defined here may be used in pattern specifications +# You can modify this file according to your needs. + +# ENDOFSEGMENT and MORFIELD are macros expanded to, respectively, +# end of segment marker (dependes on the format: flattened or not) +# and the name of the annotation field containing morphological +# information (standard value is 'lem'). These values are controlled +# by programs using this file to expand search patterns (ser, grp, ...). 
+ +# seg(type,form,annotation) + +define(`seg',`(\s*((\d+\s+)(\d+\s+)?)?dnl +ifelse($1, `',`(\S+)', `($1)')\s+dnl +ifelse($2, `',`(\S+)', `($2)')dnl +ifelse($3, `',`((\s+\S+)*)', `(\s+($3))')\s*ENDOFSEGMENT)') + +# form(f) - segment containing the form f + +define(`form', `seg(,$1)') + +# field(f) segment containing auxiliary field f + +define(`field', `seg(,,`(\S+\s+)*($1)(\s+\S+)*')') + +# word, space, punct, number segments (assuming W, S, P, N segment types) + +define(`space', `seg(`S',`$1')') +define(`word', `seg(`W',`$1')') +define(`punct', `seg(`P',`$1')') +define(`number', `seg(`N',`$1')') + +# macros specific to PMDB format + +define(`lexeme', `field(`MORFIELD:(\S+;)?$1,\S+')') +define(`cat', `field(`MORFIELD:\S+,$1([,;]\S+)?')') + + +# Place here your macro definitions. + + + + + + + + + +#-------------------------------------------------------------------------- +divert(0) \ No newline at end of file diff --git a/app/src/common/Makefile b/app/src/common/Makefile new file mode 100644 index 0000000..cd4bb61 --- /dev/null +++ b/app/src/common/Makefile @@ -0,0 +1,8 @@ +# main: cmdline.c main_template.cc +# g++ -o main cmdline.c common.cc main_template.cc + +# cmdline.c cmdline.h : cmdline.ggo +# gengetopt -i cmdline.ggo + +# cmdline.ggo: cmdline_common.ggo cmdline_program.ggo +# cat cmdline_common.ggo cmdline_program.ggo > cmdline.ggo diff --git a/app/src/common/README b/app/src/common/README new file mode 100644 index 0000000..46870a9 --- /dev/null +++ b/app/src/common/README @@ -0,0 +1,18 @@ +Propozycja ujednolicenia dzialania klocka na poziomie +funkcji main. Parametry meta - zdefiniowane dla +wszystkich, poza tok, programow, definiujace ich zachowanie +w systemie klockow. + +cmdline_common.ggo - deklaracje parametrow meta + +cmdline_program.ggo - przyklad deklaracji parametrow programu + nazwa docelowa np. 
cmdline_guess.ggo + +common.cc - zmienne globalne zawierajace informacje + przekazane przez parametry meta +common.h + +main_template.cc - szkielet funkcji main + +Makefile - sposob kompilacji + diff --git a/app/src/common/cmdline_common.ggo b/app/src/common/cmdline_common.ggo new file mode 100644 index 0000000..5dfafda --- /dev/null +++ b/app/src/common/cmdline_common.ggo @@ -0,0 +1,34 @@ +#section "Common UTT options" + + +option "input" f "Input file" string no hidden + +option "output" o "Output file" string no hidden + +option "fail" e "Output file for unsuccesfully processed segments " string no hidden + +option "only-fail" - "Print only segments the program failed to process" flag off hidden + +option "no-fail" - "Print only segments the program processed" flag off hidden + +option "copy" c "Copy succesfully processed segments to standard output" flag off hidden + +option "process" p "Process segments with this tag" string no multiple + +option "select" s "Select only segments with this field" string no multiple + +option "ignore" S "Select only segments without this field" string no multiple + +option "output-field" O "Output field name" string no + +option "input-field" I "Input field name" string no multiple + +option "interactive" i "Toggle interactive mode" flag off + +option "config" - "Configuration file" string typestr="FILENAME" no + +option "one-field" 1 "Print all results in one segments (creates ambiguous annotation)" flag off + +option "one-line" - "Print annotation alternatives as additional fields" flag off + +option "language" - "Language." string no diff --git a/app/src/common/cmdline_program.ggo b/app/src/common/cmdline_program.ggo new file mode 100644 index 0000000..e5e3058 --- /dev/null +++ b/app/src/common/cmdline_program.ggo @@ -0,0 +1,5 @@ +package "guess" +version "0.1" + +option "color" l "Show guessed descriptions in colour." 
flag off + diff --git a/app/src/common/common.cc b/app/src/common/common.cc new file mode 100644 index 0000000..93c2a4f --- /dev/null +++ b/app/src/common/common.cc @@ -0,0 +1,264 @@ +#include +#include +#include +#include +#include +#include "common.h" + +#include +#include + +FILE* inputf=stdin; +FILE* outputf=stdout; +FILE* failedf=stdout; +bool copy_processed=0; +bool one_field=false; +bool one_line=false; +char output_field_prefix[32]; +char input_field_prefix[32]; + +extern int argc; +extern char **argv; + + +// tilde (home dir) expansion in path +int expand_path(char* inpath, char* outpath) +{ + if(inpath[0]=='~') + sprintf(outpath,"%s%s",getenv("HOME"),inpath+1); + else + strcpy(outpath,inpath); + return 0; // no problem +} + + + + +/* +parameters: + -name - field name, long or short + +prefix - field name with ':' appended if long name +return value: + 1 if correct field name, 0 otherwise +examples: +name prefix r.v. +lem lem: 1 +@ @ 1 +:: 'undef' 0 +a,b 'undef' 0 +*/ +int fieldprefix(char *name, char *prefix) +{ + if (ispunct(name[0]) && name[1]=='\0') // correct short name + { + strcpy(prefix, name); return 1; + } + + int i=0; + while(name[i]!='\0' && isalnum(name[i])) ++i; + + if(name[i]=='\0' && i>0) // correct long name + { + sprintf(prefix,"%s:",name); return 1; + } + + // incorrect + return 0; +} + + + +void set_program_name(char program_name[], char* argv0) +{ + if (char* p_name = strrchr(argv0, '/')) + strcpy(program_name,p_name+1); + else + strcpy(program_name,argv0); +} + + + +extern void process_config_files(gengetopt_args_info* args, char* argv0) +{ + + char program_name[256]; + char config_file[256]; + char config_file_tmp[256]; + + set_program_name(program_name,argv0); + + // obsługa pliku konfiguracyjnego podanego w linii komend + if (args->config_given) { + if (file_accessible(args->config_arg) == 0) { + if (cmdline_parser_configfile(args->config_arg, + args, + 0, // 0 - nie nadpisuj wartości parametrów + 0, // 0 - nie inicjuj + 0) != 0) { 
+ fprintf(stderr, "Error in config file (%s)\n", args->config_arg); + exit(1); + } + } + } + + if(args->one_line_given && !one_line) one_line=true, one_field=false; + if(args->one_field_given && !one_field) one_line=false, one_field=true; + + // obsluga pliku konfiguracyjnego uzytkownika dla programu + sprintf(config_file_tmp, "%s/%s.conf", USER_CONFIG_DIR, program_name); + expand_path(config_file_tmp, config_file); + if (file_accessible(config_file) == 0) { + if (cmdline_parser_configfile(config_file, + args, + 0, // 0 - nie nadpisuj danych + 0, // 0 - nie inicjuj struktury + 0) != 0) { + fprintf(stderr, "Error in config file (%s)\n", config_file); + exit(1); + } + } + + if(args->one_line_given && !one_line) one_line=true, one_field=false; + if(args->one_field_given && !one_field) one_line=false, one_field=true; + + + // obsluga pliku konfiguracyjnego uzytkownika globalnego + sprintf(config_file_tmp, "%s/utt.conf", USER_CONFIG_DIR); + expand_path(config_file_tmp, config_file); + if (file_accessible(config_file) == 0) { + if (cmdline_parser_configfile(config_file, + args, + 0, // 0 - nie nadpisuj danych + 0, // 0 - nie inicjuj struktury + 0) != 0) { + fprintf(stderr, "Error in config file (%s)\n", config_file); + exit(1); + } + } + + if(args->one_line_given && !one_line) one_line=true, one_field=false; + if(args->one_field_given && !one_field) one_line=false, one_field=true; + + + + // obsluga systemowego pliku konfiguracyjnego dla programu + sprintf(config_file, "%s/%s.conf", SYSTEM_CONFIG_DIR, program_name); + if (file_accessible(config_file) == 0) { + if (cmdline_parser_configfile(config_file, + args, + 0, // 0 - nie zmieniaj danych wczesniejszych + 0, // 0 - nie inicjuj struktury + 0 // 0 - nie sprawdzaj wymaganych parametrow + ) != 0) { + fprintf(stderr, "Error in config file (%s)\n", config_file); + exit(1); + } + } + + if(args->one_line_given && !one_line) one_line=true, one_field=false; + if(args->one_field_given && !one_field) one_line=false, 
one_field=true; + + + // obsluga systemowego pliku konfiguracyjnego globalnego + sprintf(config_file, "%s/utt.conf", SYSTEM_CONFIG_DIR); + if (file_accessible(config_file) == 0) { + if (cmdline_parser_configfile(config_file, + args, + 0, // 0 - nie zmieniaj danych wczesniejszych + 0, // 0 - nie inicjuj struktury + 0 // 0 - nie sprawdzaj wymaganych parametrow + ) != 0) { + fprintf(stderr, "Error in config file (%s)\n", config_file); + exit(1); + } + } + + if(args->one_line_given && !one_line) one_line=true, one_field=false; + if(args->one_field_given && !one_field) one_line=false, one_field=true; + +} + + +void process_common_options(gengetopt_args_info* args, char* argv0) +{ + char program_name[256]; + + set_program_name(program_name,argv0); + + setlocale(LC_CTYPE,""); + setlocale(LC_COLLATE, ""); + + if(args->help_given) + cmdline_parser_print_help (); + + if(args->input_given) + if(!(inputf=fopen(args->input_arg,"r"))) + { + fprintf(stderr,"No such file: %s.\n", args->input_arg); + exit(1); + } + + if(args->output_given) + if(!(outputf=fopen(args->output_arg,"w"))) + { + fprintf(stderr,"Cannot open output file: %s.\n", args->output_arg); + exit(1); + } + + if(args->fail_given) + if(!(failedf=fopen(args->fail_arg,"w"))) + { + fprintf(stderr,"Cannot open output file: %s.\n", args->fail_arg); + exit(1); + } + + if(args->input_field_given) + fieldprefix(args->input_field_arg[0],input_field_prefix); + else + strcpy(input_field_prefix, "4"); + + if(args->output_field_given) + fieldprefix(args->output_field_arg,output_field_prefix); + else + sprintf(output_field_prefix, "%s%c", program_name, INFIELD_SEP); + + if ((args->copy_given)) + copy_processed=true; +} + +// sprawdza istnienie pliku +int file_accessible(const char* path) { + return access(path, R_OK); +} + +// sprawdza istnienie pliku konfiguracyjnego +int config_file_exists(const char* dir, const char* filename) { + struct stat dir_stat; + struct stat file_stat; + + char* path = (char*)malloc(strlen(dir) + 
strlen(filename) + 2); // + '\0' + '/' + + sprintf(path, "%s/%s", dir, filename); + + if (stat(dir, &dir_stat) != 0) + return -1; + + if (stat(path, &file_stat) != 0) + return -1; + + if (!S_ISDIR(dir_stat.st_mode)) + return -1; // katalog nie jest katalogiem + + if (!S_ISREG(file_stat.st_mode)) + return -1; // plik konfiguracyjny nie jest plikiem + + if (access(dir, X_OK) != 0) + return -1; // nie mamy prawa zmienic katalogu + + if (access(path, R_OK) != 0) + return -1; // nie mamy prawa odczytu pliku + + free(path); + + return 0; +} diff --git a/app/src/common/common.h b/app/src/common/common.h new file mode 100644 index 0000000..ae08847 --- /dev/null +++ b/app/src/common/common.h @@ -0,0 +1,416 @@ +#ifndef __COMMON_H +#define __COMMON_H + +#include +#include + +#include "../lib/const.h" + +#include _CMDLINE_FILE + + +/************************************************** + * Stale dotyczace wejscia/wyjscia + */ + +#define EMPTYFORM '*' +#define INFIELD_SEP ':' +#define MAXAUX 16 +#define FIELD_SEP " \t\n" + + +// katalogi z plikami konfiguracyjnymi +// nowe +// stare - do wyrzucenia +// #define CONFIG_DIR ".utt/conf" + +// nazwa zmiennej okreslajaca sciezke do danych + +// #define UTT_DIR_VAR "UTT_DIR" + +// sciezka do plikow z danymi (np UTT_DIR/pliki) wzgledem $HOME! 
+ +// #define UTT_DIR_DEFAULT ".utt/pl/" + +/**************************************************/ + + +extern FILE* inputf; +extern FILE* outputf; +extern FILE* failedf; + +extern char* input_filename; +extern char* output_filename; +extern char* failed_filename; +extern bool one_line; +extern bool one_field; + +extern char input_field_prefix[]; +extern char output_field_prefix[]; + +extern bool copy_processed; +extern bool append_output; +extern bool append_failed; + +//sciezka do katalogu z danymi +extern char utt_dir[]; + +extern void process_common_options(gengetopt_args_info* args, char* argv0); +extern void process_config_files(gengetopt_args_info* args, char* argv0); + +extern int expand_path(char* inpath, char* outpath); + +extern int fieldprefix(char *name, char *prefix); + + +/************************************************** + * problems with casing */ +// sprawdzenie wielkosci liter +// warto zwracana: +// 0 - wszystkie mae litery +// 1 - pierwsza wielka, reszta male +// 2 - wszystkie wielkie +// 3 - inne +inline int casing(char* s) +{ + int ret = isupper(*s) ? 
1 : 0; + while(*++s != '\0') + { + if(isupper(*s)) + { + if(ret==1) ret=2; + else if(ret==0) ret=3; + } + else + { + if(ret==2) ret=3; + } + } + return ret; +} + +// +inline void tolowers(char* s, char* d) +{ + *d=tolower(*s); + while(*s != '\0') * ++d = tolower(* ++s); +} + + +// przepisuje s do d +// nadajac wielko liter zgodnie z wartoci casing +// casing - warto zwracana przez casing() +// jeli casing==3 przepisuje bez zmian (za mao informacji) +inline void restorecasing(char *s, char *d, int casing) +{ + switch(casing) + { + case 0: + case 3: + *d=*s; + while(*s != '\0') * ++d = * ++s; + break; + case 1: + *d=toupper(*s); + while(*s != '\0') * ++d = * ++s; + break; + case 2: + *d=toupper(*s); + while(*s != '\0') * ++d = toupper(* ++s); + break; + } +} + +/**************************************************/ + +/* +parameters: + -seg - segment + -pref - field name or "1", "2", "3", "4" for the first four fields + +val - field contents +return value: + 1 if specified field exists, 0 otherwise +*/ + +inline int getfield(char* seg, const char* pref, char* val) +{ + + char* p=seg; + char* p0; + + while(isspace(*p)) ++p; + + // field "1" + p0=p; while(isdigit(*p)) ++p; + if(*pref=='1') if(p!=p0) { strncpy(val,p0,p-p0); val[p-p0]='\0'; return 1; } else return 0; + + while(isspace(*p)) ++p; + + // field "2" + p0=p; while(isdigit(*p)) ++p; + if(*pref=='2') if(p!=p0) { strncpy(val,p0,p-p0); val[p-p0]='\0'; return 1; } else return 0; + + while(isspace(*p)) ++p; + + // field "3" + p0=p; while(isgraph(*p)) ++p; + if(*pref=='3') if(p!=p0) { strncpy(val,p0,p-p0); val[p-p0]='\0'; return 1; } else return 0; + + while(isspace(*p)) ++p; + + // field "4" + p0=p; while(isgraph(*p)) ++p; + if(*pref=='4') if(p!=p0) { strncpy(val,p0,p-p0); val[p-p0]='\0'; return 1; } else return 0; + + while(isspace(*p)) ++p; + + // annotation fields + do p=strstr(p,pref); while(p!=NULL && *(p-1)!=' ' && *(p-1)!='\t'); + + if(p==NULL) return 0; + else + { + p+=strlen(pref); + int 
len=strcspn(p,FIELD_SEP "\n\r\f\0"); + strncpy(val,p,len); + val[len]='\0'; + return 1; + } +} + + +inline +bool process_seg(char* seg, gengetopt_args_info& args) +{ + char buf[256]; + bool ret = !args.process_given; + if(args.process_given) + { + getfield(seg,"3",buf); + for(int i=0; i= MAX_LINE) return 0; // bezpieczniej, ale wolniej + + int seglen=strlen(seg); + sprintf(seg+(seglen-1)," %s%s\n",pref,val); + return 1; +} + +/**************************************************/ + +struct Seg +{ + int filepos, len; + char* tag; + char* form; + char* aux[MAXAUX]; + int auxn; + + bool parse(char* line); + char* getfield(char* fieldname); + void print(char* line); + bool addfield(char* s); + bool clearfields(); +}; + +/**************************************************/ + +/* definicja struktury wejscia/wyjscia + */ +struct Segment +{ + int filepos, len; + char* tag; + char* form; + char* aux[MAXAUX]; + int auxn; + + bool parse(char* line); + char* getfield(char* fieldname); + void print(char* line); + bool addfield(char* s); + bool clearfields(); +}; + +/* + * Sprawdza czy nalezy przetwarzac dany segment. 
+ */ + +inline +bool process_seg(Segment& s, gengetopt_args_info& args) +{ + bool ret = !args.process_given; + + for(int i=0; i + +#include "common.h" + +main(int argc, char* argv[]) +{ + gengetopt_args_info args; + + if(cmdline_parser(argc,argv,&args) != 0) + exit(1); + + process_common_options(args); + + // + // TU KOD + // + + cmdline_parser_free(&args); + +} diff --git a/app/src/compiledic/Makefile b/app/src/compiledic/Makefile new file mode 100644 index 0000000..cc586bb --- /dev/null +++ b/app/src/compiledic/Makefile @@ -0,0 +1,12 @@ +all: compiledic aut2fsa + +compiledic: + +aut2fsa: aut2fsa.cc + g++ -Wno-deprecated -O3 -fpermissive -static -o aut2fsa aut2fsa.cc + + +copy: +ifdef UTT_BIN_DIR + cp compiledic fsm2aut aut2fsa ${UTT_BIN_DIR} +endif diff --git a/app/src/compiledic/TODO b/app/src/compiledic/TODO new file mode 100644 index 0000000..19e73e8 --- /dev/null +++ b/app/src/compiledic/TODO @@ -0,0 +1,5 @@ +* pliki tymczasowe: + - pliki symboli lab i scl + - pliki powstajace podczas kompilacji slownika + + gdzie maja byc tworzone? tak jak teraz nie moze byc! diff --git a/app/src/compiledic/aut2fsa b/app/src/compiledic/aut2fsa new file mode 100755 index 0000000..4b9f892 Binary files /dev/null and b/app/src/compiledic/aut2fsa differ diff --git a/app/src/compiledic/aut2fsa.cc b/app/src/compiledic/aut2fsa.cc new file mode 100644 index 0000000..3b0d982 --- /dev/null +++ b/app/src/compiledic/aut2fsa.cc @@ -0,0 +1,16 @@ + +#include +#include + +#include "../lib/tfti.h" + +#include + +int main() +{ + TFTiv a; + a.read(); + a.save(); + + return 0; +} diff --git a/app/src/compiledic/compiledic b/app/src/compiledic/compiledic new file mode 100755 index 0000000..a48196d --- /dev/null +++ b/app/src/compiledic/compiledic @@ -0,0 +1,190 @@ +#! /usr/bin/env perl + +$symfile='~/.utt/pl/pl_PL.iso-8859-2.sym'; +$symfilenoext = $symfile; +$symfilenoext =~ s/\.sym$//; +$labfile = $symfilenoext . '.lab'; +$sclfile = $symfilenoext . 
'.scl'; + +use locale; +#use strict; + +################################################## +$linesPerFile = 20000; + +if (@ARGV < 1) { + print "usage: prep_user_dict.pl dictionary_file\n"; + exit; +} + +my $file = shift; # @ARGV; +my $filenameprefix; + +if ($file =~ /(.*)\.dic/) +{ + $filenameprefix = $1; +} +else +{ + print "The input file must have .dic extension."; + exit(1); +} + +# Przygotowanie etykiet + +#`makeLabels.pl > labels.sym`; + +`lexmakelab $symfilenoext`; + +# Analiza pliku sownika + +print "preparing file..........................................."; + +`sed -r "s/([[:punct:]])/\[\\1\]/g" < $file > temp1`; + +`cp temp1 temp2`; + +print "OK\n"; + +#dzielimy plik na wiele czci, uruchamiamy lexcomplex dla kadej +#czci osobno, nastpnie czymy to za pomoc programu fsmunion + +#print "Dziel sownik na mniejsze czci..."; + +open(IN, "./temp2"); + +$lineCount = 0; +$fileCount = 0; + +`mkdir LemTEMP`; + +open(FILE, ">LemTEMP/slo_0"); + +while () { + + if (++$lineCount >= $linesPerFile) { + $fileCount++; + $lineCount = 0; + + close(FILE); +# print "Tworz nowy plik tymczasowy: slo_".$fileCount."\n"; + open(FILE, ">LemTEMP/slo_".$fileCount); + } + + print(FILE $_); +} + +#print "OK\n"; + +print "building partial automata"; + +#32 kropki, fileCount plikow +$filesPerDot = $fileCount/32; +$files=$filesPerDot; +$dots=0; + +for ($i=0; $i<=$fileCount; $i++) { + + if ($files >= $filesPerDot) { + $files = 0; + print "."; + $dots++; + } + $files++; + + $command = "lexcomplex -l $labfile -S $sclfile < LemTEMP/slo_".$i." > LemTEMP/slownik_".$i.".fsm"; + + `$command`; + +} +if ($dots < 32) { + for ($i=0; $i<32 - $dots; $i++) { + print "."; + } +} + +print "OK\n"; + +`rm LemTEMP/slo_*`; + +print "building final automaton"; + +#35 kropek... 
+$ndots=33; +$filesPerDot = $fileCount/$ndots; +$files=$filesPerDot; +$dots=0; + +`cp LemTEMP/slownik_0.fsm slownik1.fsm`; + +for ($i=1; $i<=$filecount; $i++) { + + if ($files >= $filesPerDot) { + $files = 0; + print "."; + $dots++; + } + $files++; + + $command = "fsmunion LemTEMP/slownik_".$i." slownik1.fsm > slownik2.fsm"; + + `$command`; + + `mv slownik2.fsm slownik1.fsm`; +} + +if ($dots < $ndots) { + for ($i=0; $i<$ndots - $dots; $i++) { + print "."; + } +} + +`fsmunion LemTEMP/* > slownik1.fsm`; + +print "OK\n"; + +print "removing epsilon-transitions............................."; + +`fsmrmepsilon slownik1.fsm > slownik2.fsm`; + +`rm slownik1.fsm`; + +print "OK\n"; + +print "determinizing automaton.................................."; + +`fsmdeterminize slownik2.fsm > slownik1.fsm`; + +`rm slownik2.fsm`; + +print "OK\n"; + +print "minimizing automaton....................................."; + +`fsmminimize slownik1.fsm > slownik.fsm`; + +#`rm slownik1.fsm`; + +print "OK\n"; + +print "converting fsm format to bin............................."; + +`fsmprint -i $labfile slownik.fsm > slownik.txt`; + +`fsm2aut slownik.txt > slownik.aut`; + +`aut2fsa < slownik.aut > $filenameprefix.bin`; + +print "OK\n"; + +print "removing temporary files................................."; + +`rm LemTEMP/*`; +`rmdir LemTEMP`; +`rm temp2`; +`rm slownik.fsm`; +`rm slownik.txt`; +`rm slownik.aut`; +`rm labels.*`; + +print "OK\n"; diff --git a/app/src/compiledic/fsm2aut b/app/src/compiledic/fsm2aut new file mode 100755 index 0000000..ee25876 --- /dev/null +++ b/app/src/compiledic/fsm2aut @@ -0,0 +1,44 @@ +#!/usr/bin/perl + +my $currstate=-1; +my @states; +my @final; +my $tn=0; + +while(<>) +{ + if(/^\s*([0-9]+)\s+([0-9]+)\s+(.)(\s*)?$/) + { + push @{$states[$1]}, ($3, $2); + $#states=$2 if $#states<$2; + $tn++; + } + elsif(/^\s*([0-9]+)\s*$/) + { + $final[$1]=1; + $#states=$1 if $#states<$1; + } + else + { + die("Input error."); + } +} + +print scalar(@states)," ",$tn," char void\n"; + 
+my $i=0; +my $width=int(log(@states+1)/log(10)); +foreach $stateref (@states) +{ + $f = ($final[$i]?"+":"-"); + printf "%${width}d %s",$i++,$f; + while(@$stateref) + { + $c=shift @$stateref; + $s=shift @$stateref; + print " $c $s"; + } + print "\n"; +} + + diff --git a/app/src/con/Makefile b/app/src/con/Makefile new file mode 100644 index 0000000..e0c9a77 --- /dev/null +++ b/app/src/con/Makefile @@ -0,0 +1,7 @@ + +con: + +copy: +ifdef UTT_BIN_DIR + cp con ${UTT_BIN_DIR} +endif diff --git a/app/src/con/con b/app/src/con/con new file mode 100755 index 0000000..4d28984 --- /dev/null +++ b/app/src/con/con @@ -0,0 +1,549 @@ +#!/usr/bin/perl -w +use strict; +use Getopt::Long; +use locale; + +Getopt::Long::Configure('no_ignore_case_always'); + +my $l='30c'; +my $r='30c'; +my $trim=0; +my $white=0; +my $bon='[0-9]+ [0-9]+ BOM .*'; +my $eon='[0-9]+ [0-9]+ EOM .*'; +my $bod='['; +my $eod=']'; +my $column=0; +my $ignore=0; +my $help=0; + +my $configfile1="../../conf/con.conf"; +my $configfile2="../conf/con.conf"; + +#read configuration files########################### +my $file; +foreach $file ($configfile1, $configfile2){ + if(open(CONFIG, $file)){ + while () { + chomp; + s/#.*//; + s/^\s+//; + s/\s+$//; + next unless length; + my ($name, $value) = split(/\s*=\s*/, $_, 2); + if(($name eq "left")or($name eq "l")){ + $l=$value; + } + elsif(($name eq "right")or($name eq "r")){ + $r=$value; + } + elsif(($name eq "trim")or($name eq "t")){ + $trim=1; + } + elsif(($name eq "white")or($name eq "w")){ + $white=1; + } + elsif($name eq "bom"){ + $bon=$value; + } + elsif($name eq "eom"){ + $eon=$value; + } + elsif($name eq "bod"){ + $bod=$value; + } + elsif($name eq "eod"){ + $eod=$value; + } + elsif(($name eq "column")or($name eq "c")){ + $column=$value; + } + elsif(($name eq "ignore")or($name eq "i")){ + $ignore=1; + } + elsif(($name eq "help")or($name eq "h")){ + $help=1; + } + + } + close CONFIG; + } +} +######################################################### + 
# ---------------------------------------------------------------------
# Command-line options.  They are parsed after the configuration files,
# so a value given here replaces the default / config-file value.
# ---------------------------------------------------------------------
GetOptions("left|l=s"   => \$l,
           "right|r=s"  => \$r,
           "trim|t"     => \$trim,
           "white|w"    => \$white,
           "bom=s"      => \$bon,
           "eom=s"      => \$eon,
           "bod=s"      => \$bod,
           "eod=s"      => \$eod,
           "column|c=s" => \$column,
           "ignore|i"   => \$ignore,
           "help|h"     => \$help);

# A non-numeric column width silently falls back to 0.
$column = 0 unless $column =~ /^[0-9]+$/;

if($help)
{
    # The option names listed here must match the GetOptions call above
    # (the markers are registered as --bom / --eom).
    print <<'END';
Options:
   --help   -h   Help.
   --left   -l   Left context info (default='30c')
                 Examples:
                 -l=5c: left context is 5 characters
                 -l=5w: left context is 5 words
                 -l=5s: left context is 5 non-empty input lines
                 -l='\s*\S+\sr\S+BOS': left context starts with the given regex
   --right  -r   Right context info (default='30c')
   --trim   -t   Clear incomplete words from output
   --white  -w   DO NOT change all white characters into spaces
   --column -c   Left column minimal width in characters (default = 0)
   --ignore -i   Ignore input inconsistency
   --bom         Beginning of selected segment
                 (regex, default='[0-9]+ [0-9]+ BOM .*')
   --eom         End of selected segment
                 (regex, default='[0-9]+ [0-9]+ EOM .*')
   --bod         Selected segment beginning display (default='[')
   --eod         Selected segment end display (default=']')

END
    exit 0;
}


# ---------------------------------------------------------------------
# Global state
# ---------------------------------------------------------------------

# Position and length of the previously read segment (consistency check).
my $seg_no=0;
my $seg_size=0;

# Context kind ('c', 'w', 's' or a regex) and its numeric size.
my $left_type;
my $left_size;
my $right_type;
my $right_size;

set_lr_types($l, $r, \$left_type,\$left_size,\$right_type,\$right_size, $trim);

my $inn=0;          # true while between a begin marker and an end marker
my $after_bos=0;    # regex-type left context: its start pattern was seen
my $before_eos=0;   # regex-type right context: waiting for its end pattern

# One entry per match that is waiting to be printed.
my @LEFT;           # array of scalars (joined left context)
my @CENTER;         # array of scalars (joined selected text)
my @RIGHT;          # array of array refs (right context collected so far)

my @current_center;
my @current_left;                # single characters for type 'c', otherwise whole fields
my @current_left_words;
my @current_right_words_number;  # per pending match: number of 'W' segments collected


# ---------------------------------------------------------------------
# Main loop: one input segment per line, fields separated by single spaces.
# ---------------------------------------------------------------------
while(<>){
    my $line = $_;
    chomp $line;
    my @line = split / /, $line;

    next unless line_format_ok(@line);

    if(!$white){
        white_into_spaces(\@line);
    }
    elsif($line[2] eq "S"){
        symbols_into_white(\$line[3]);
    }

    if(!input_consistent(\$seg_no,\$seg_size,$line[0],$line[1],$ignore)){
        # Gap in the input: flush what is pending and start afresh.
        eof_or_inconsistency(\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,$left_type,$right_type);
        @current_center=();
        @current_left=();
        @current_left_words=();
        @current_right_words_number=();
        $after_bos=0;
        $before_eos=0;
    }

    remember_current_left($left_type,$left_size,\@current_left,\@line, \@current_left_words, $line, \$after_bos, \$before_eos);
    remember_center($line,\@line,\$inn,\@current_center,$white,\@CENTER,\@current_left,\@LEFT, \$after_bos, \$before_eos, \@RIGHT, \@current_right_words_number);
    remember_right($right_type,$left_type,$right_size,\@line,\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,\@current_right_words_number, $line, \$before_eos);
}

eof_or_inconsistency(\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,$left_type,$right_type);
exit(0);

################# subroutines ###############################

# Returns 1 when the line has at least four fields and the first two
# contain a digit, 0 otherwise.
sub line_format_ok{
    my @line = @_;
    return 0 if @line < 4;
    return 0 if $line[0] !~ /[0-9]+/;
    return 0 if $line[1] !~ /[0-9]+/;
    return 1;
}

# Replaces the form (field 3) of an 'S' segment by a single space.
sub white_into_spaces{
    my $line_ref=shift;
    if($line_ref->[2] eq "S"){
        $line_ref->[3]=" ";
    }
}

# Decodes the symbolic notation in place: \n -> newline, \t -> tab, _ -> space.
sub symbols_into_white{
    my $string_ref=shift;
    ${$string_ref} =~ s/\\n/\n/g;
    ${$string_ref} =~ s/\\t/\t/g;
    ${$string_ref} =~ s/_/ /g;
}

# Inverse of symbols_into_white: newline -> \n, tab -> \t, space -> _.
sub white_into_symbols{
    my $string_ref=shift;
    ${$string_ref} =~ s/\n/\\n/g;
    ${$string_ref} =~ s/\t/\\t/g;
    ${$string_ref} =~ s/ /_/g;
}

# Checks that the current segment starts where the previous one ended
# (previous start == current start - previous length) and then records
# the current start/length in the two referenced scalars.
# The check is skipped for the first segment and when $ig is true.
# Returns 1 when consistent, 0 otherwise.
sub input_consistent{
    my ($seg_no_ref, $seg_size_ref, $line0, $line1, $ig) = @_;
    my $ok=1;

    if(${$seg_no_ref}!=0&&(!$ig)){
        my $distance = $line0-${$seg_size_ref};
        if($distance!=${$seg_no_ref}){$ok=0;}
    }
    ${$seg_no_ref}=$line0;
    ${$seg_size_ref}=$line1;
    return $ok;
}

# Helper: classifies one context specification.
# Returns (type, size): type is 'c', 'w' or 's' for "<digits>c/w/s"
# (size = the leading number, plus one for 'c' when trimming so that an
# incomplete word can be cut off); otherwise type is the specification
# itself (used as a regex) and size is undef.
sub _parse_context_spec{
    my ($spec, $do_trim) = @_;

    if($spec=~/[0-9]+c/){
        my $size = get_number($spec);
        $size++ if $do_trim;
        return ('c', $size);
    }
    return ('w', get_number($spec)) if $spec=~/[0-9]+w/;
    return ('s', get_number($spec)) if $spec=~/[0-9]+s/;
    return ($spec, undef);
}

# Fills the four referenced scalars with the type and size of the left
# and of the right context specification.
sub set_lr_types{
    my ($left, $right,
        $left_type_ref, $left_size_ref,
        $right_type_ref, $right_size_ref,
        $do_trim) = @_;

    (${$left_type_ref},  ${$left_size_ref})  = _parse_context_spec($left,  $do_trim);
    (${$right_type_ref}, ${$right_size_ref}) = _parse_context_spec($right, $do_trim);
}

# Returns the number formed by the leading decimal digits of $string,
# or 0 when the string does not start with a digit.
sub get_number{
    my $string = shift;
    return $string =~ /^([0-9]+)/ ? $1 + 0 : 0;
}

# Tracks the selected segment (the text between the begin and end markers).
# On a begin marker the current left context is frozen into @LEFT; on an
# end marker the collected text is pushed onto @CENTER together with an
# empty right-context slot and a zero word counter.
sub remember_center{
    my ($lin, $lin_ref, $inn_ref, $current_center_ref, $white_info,
        $CENTER_REF, $current_left_ref, $LEFT_REF,
        $after_bos_ref, $before_eos_ref,
        $RIGHT_REF, $current_words_right_number_ref) = @_;

    if((!${$inn_ref}) && $lin=~/$bon/){
        ${$inn_ref}=1;
        @{$current_center_ref}=();
        ${$after_bos_ref}=0;
        push(@{$LEFT_REF},join('',@{$current_left_ref}));
    }
    if(${$inn_ref} && $lin=~/$eon/){
        ${$inn_ref}=0;
        push(@{$CENTER_REF},join('',@{$current_center_ref}));
        ${$before_eos_ref}=1;
        my @new_table;
        push(@{$RIGHT_REF},\@new_table);
        push(@{$current_words_right_number_ref},0);
    }
    # Lines containing '*' are skipped; the form is encoded in place, so
    # later consumers of this line see the encoded value as well.
    if(${$inn_ref} && index($lin,'*')==-1){
        white_into_symbols(\$lin_ref->[3]);
        push(@{$current_center_ref},$lin_ref->[3]);
    }
}

# Maintains the sliding left context in @{$ref}:
#   'c'   - last $size characters
#   'w'   - fields starting from the oldest of the last $size 'W' segments
#   's'   - last $size fields
#   regex - everything since the last line matching the regex
# Segments whose form is '*' never contribute.
sub remember_current_left{
    my ($type, $size, $ref, $line_ref,
        $words_ref, $line, $after_bos_ref, $before_eos_ref) = @_;
    my $form = $line_ref->[3];

    if($type eq 'c'){
        if(!($form eq '*')){
            push(@{$ref},split('',$form));
            my $lsize = @{$ref};
            if($lsize>$size){splice(@{$ref},0,$lsize-$size);}
        }
    }
    elsif($type eq 'w'){
        if(!($form eq '*')){
            push(@{$ref},$form);
            if($line_ref->[2] eq 'W'){
                push(@{$words_ref},$form);
            }
            my $lsize = @{$words_ref};
            if($lsize>$size){
                # Drop the oldest word and every field in front of the
                # word that becomes the new first one.
                my $word = ${$words_ref}[1];
                splice(@{$words_ref},0,1);
                while(!(${$ref}[0] eq $word)){splice(@{$ref},0,1); }
            }
        }
    }
    elsif($type eq 's'){
        if(!($form eq '*')){
            push(@{$ref},$form);
            my $lsize = @{$ref};
            if($lsize>$size){splice(@{$ref},0,$lsize-$size);}
        }
    }
    else{ # regex-delimited context
        if($line=~/$type/){
            ${$after_bos_ref}=1;
            @{$ref}=();
        }
        if(${$after_bos_ref} && !($form eq '*')){
            push(@{$ref},$form);
        }
    }
}

# Feeds the current segment to the right context of every pending match
# and prints each match whose right context is complete.
sub remember_right{
    my ($type, $type_left, $size, $line_ref,
        $LEFT_REF, $CENTER_REF, $RIGHT_REF,
        $bod, $eod, $w, $c, $t,
        $words_number_ref, $line, $before_eos_ref) = @_;
    my $form = $line_ref->[3];

    if($type eq 'c'){
        if(!($form eq '*')){
            my $right_size = @{$RIGHT_REF};
            for(my $i=0; $i<$right_size; $i++){
                push(@{${$RIGHT_REF}[$i]}, split('',$form));
                my $lsize = @{${$RIGHT_REF}[$i]};
                if($lsize>=$size){
                    # Keeps the first $size-1 characters.
                    # NOTE(review): the left context keeps $size characters;
                    # confirm that this asymmetry is intended.
                    splice(@{${$RIGHT_REF}[$i]},$size-1);
                    print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
                    $right_size = @{$RIGHT_REF};
                    $i--;   # entry $i was removed: revisit the same index
                }
            }
        }
    }
    elsif($type eq 'w'){
        if(!($form eq '*')){
            my $right_size = @{$RIGHT_REF};
            for(my $i=0; $i<$right_size; $i++){
                push(@{${$RIGHT_REF}[$i]},$form);
                if($line_ref->[2] eq 'W'){
                    ${$words_number_ref}[$i]=${$words_number_ref}[$i]+1;
                    if(${$words_number_ref}[$i]==$size){
                        print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
                        # Remove the counter that belongs to the printed
                        # entry BEFORE stepping the index back, so that the
                        # counters stay aligned with @RIGHT.
                        splice(@{$words_number_ref},$i,1);
                        $right_size = @{$RIGHT_REF};
                        $i--;
                    }
                }
            }
        }
    }
    elsif($type eq 's'){
        if(!($form eq '*')){
            my $right_s = @{$RIGHT_REF};
            for(my $i=0; $i<$right_s; $i++){
                push(@{${$RIGHT_REF}[$i]},$form);
                my $rsize=@{${$RIGHT_REF}[$i]};
                if($rsize==$size){
                    print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
                    $right_s = @{$RIGHT_REF};
                    $i--;
                }
            }
        }
    }
    else{ # regex-delimited context
        if(${$before_eos_ref}){
            if(!($form eq '*')){
                # only the first pending entry is extended
                push(@{${$RIGHT_REF}[0]},$form);
            }
            if($line=~/$type/){
                ${$before_eos_ref}=0;
                print_and_remove(0,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
            }
        }
    }
}

# Prints pending match number $index as
#   <left><bdis><center><edis><right>\n
# and removes it from the three pending lists.
sub print_and_remove{
    my ($index, $LEFT_REF, $CENTER_REF, $RIGHT_REF,
        $bdis, $edis, $white, $column, $trim,
        $left_type, $right_type) = @_;

    my $left_string = "${$LEFT_REF}[$index]";
    my $right_string = join('',@{${$RIGHT_REF}[$index]});

    # Character contexts may start/end in the middle of a word.
    if($trim){
        if($left_type eq "c"){$left_string=trim_left($left_string);}
        if($right_type eq "c"){$right_string=trim_right($right_string);}
    }

    # Left-pad so that the selected text starts at column $column or later.
    if(length($left_string)<$column){
        $left_string=" "x($column-length($left_string)).$left_string;
    }

    if($white){
        white_into_symbols(\$left_string);
        white_into_symbols(\$right_string);
        white_into_symbols(\${$CENTER_REF}[$index]);
    }

    print $left_string;
    print $bdis;

    if(!$white){
        symbols_into_white(\${$CENTER_REF}[$index]);
    }

    print "${$CENTER_REF}[$index]";
    print $edis;
    print $right_string;
    print "\n";

    splice(@{$LEFT_REF},$index,1);
    splice(@{$CENTER_REF},$index,1);
    splice(@{$RIGHT_REF},$index,1);
}

# Removes a possibly incomplete first word: everything up to and including
# the first space, newline or tab.  A string without such a character is
# returned unchanged.
sub trim_left{
    my $string = shift;
    if(substr($string,0,1) eq " "){return substr($string,1);}

    my $position = index($string," ");
    foreach my $ws ("\n", "\t"){
        my $temp_position = index($string,$ws);
        # '!=' rather than '!$x==-1': the latter negates $x first and is
        # therefore never true.
        if($temp_position!=-1&&($position==-1||$temp_position<$position)){
            $position=$temp_position;
        }
    }
    return substr($string,$position+1);
}

# Removes a possibly incomplete last word: everything from the last space,
# newline or tab onwards.  Without such a character only the final
# character is dropped.
sub trim_right{
    my $string = shift;
    my $length = length($string);
    return $string if $length==0;   # nothing to trim
    if(substr($string,$length-1,1) eq " "){return substr($string,0,$length-1);}

    my $position = rindex($string," ");
    foreach my $ws ("\n", "\t"){
        my $temp_position = rindex($string,$ws);
        if($temp_position>$position){$position=$temp_position;}
    }
    return substr($string,0,$position);
}

# Prints every pending match with whatever right context it has collected
# so far (called at end of input and when the input is inconsistent).
sub eof_or_inconsistency{
    my ($LEFT_REF, $CENTER_REF, $RIGHT_REF,
        $bdis, $edis, $white, $column, $trim,
        $left_type, $right_type) = @_;

    while(@{$CENTER_REF}){
        print_and_remove(0,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bdis,$edis,$white,$column,$trim,$left_type,$right_type);
    }
}
diff --git a/app/src/cor/Makefile b/app/src/cor/Makefile new file mode 100644 index 0000000..67ff4ea --- /dev/null +++ b/app/src/cor/Makefile @@ -0,0 +1,42 @@
# Builds the `cor` executable and copies it into the distribution tree.
#
# PAR  - flags for the final compile+link step
# PAR2 - flags for compiling a single object file
PAR=-Wno-deprecated -m32 -fpermissive
# -static
PAR2=-c -Wno-deprecated -m32 -fpermissive
LIB_PATH=../lib
COMMON_PATH=../common
CMDLINE_FILE='"../cor/cmdline.h"'

# C++ compiler; stays g++ unless overridden from the command line/environment.
CXX ?= g++

# These targets are commands, not files: without this declaration a stray
# file named `copy` or `clean` would silently disable them.
.PHONY: copy clean clean.cmdline

cor: main.cc corr.o $(LIB_PATH)/word.o \
     $(LIB_PATH)/auttools.o cmdline.c common_cor.o common.o
	$(CXX) $(PAR) main.cc corr.o common.o \
	$(LIB_PATH)/word.o $(LIB_PATH)/auttools.o cmdline.c common_cor.o \
	-o cor

corr.o: corr.cc corr.hh
	$(CXX) $(PAR2) corr.cc

# common.cc is compiled with _CMDLINE_FILE pointing at the generated
# cmdline.h, so the header is listed as a prerequisite to make sure it
# exists before this object is built.
# NOTE(review): presumably common.h does `#include _CMDLINE_FILE` - confirm.
common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \
          $(COMMON_PATH)/common.h cmdline.h
	$(CXX) $(PAR2) -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc

common_cor.o: cmdline.h common_cor.cc common_cor.h
	$(CXX) $(PAR2) common_cor.cc

# Both files come from a single gengetopt run.  NOTE: make treats this as two
# independent rules, so a parallel build (-j) may run gengetopt twice.
cmdline.c cmdline.h: cmdline.ggo
	gengetopt -i cmdline.ggo --conf-parser

cmdline.ggo: cmdline_cor.ggo ../common/cmdline_common.ggo
	cat cmdline_cor.ggo ../common/cmdline_common.ggo > cmdline.ggo

# Copy the binary into the distribution; a no-op unless UTT_BIN_DIR is set.
copy:
ifdef UTT_BIN_DIR
	cp cor ${UTT_BIN_DIR}
endif

clean: clean.cmdline
	rm -f *.o
	rm -f cor

clean.cmdline:
	rm -f cmdline.*

diff --git a/app/src/cor/cmdline_cor.ggo b/app/src/cor/cmdline_cor.ggo new file mode 100644 index 0000000..c2062e5 --- /dev/null +++ b/app/src/cor/cmdline_cor.ggo @@ -0,0 +1,8 @@ +package "cor" +version "0.1" + +option "dictionary-home" - "Dictionary home dir." string typestr="FILENAME" no hidden +option "dictionary" d "Dictionary" string typestr="FILENAME" default="cor.bin" no +option "distance" n "Maximal edit distance." int default="1" no +option "replace" r "Replace original form with corrected form, place original form in the cor field. 
This option has no effect in single mode" flag off +#option "single" - "Place all alternatives in the same line" flag off diff --git a/app/src/cor/common_cor.cc b/app/src/cor/common_cor.cc new file mode 100644 index 0000000..c87a95b --- /dev/null +++ b/app/src/cor/common_cor.cc @@ -0,0 +1,19 @@ +#include +#include +#include "common_cor.h" + +char dictionary[256]; + +void process_cor_options(gengetopt_args_info* args) +{ + if(args->dictionary_given) + { + expand_path(args->dictionary_arg,dictionary); + } + else if (args->dictionary_home_given && args->language_given) + { + char buf[255]; + expand_path(args->dictionary_home_arg, buf); + sprintf(dictionary,"%s/%s/cor.bin",buf,args->language_arg); + } +} diff --git a/app/src/cor/common_cor.h b/app/src/cor/common_cor.h new file mode 100644 index 0000000..f815e4e --- /dev/null +++ b/app/src/cor/common_cor.h @@ -0,0 +1,19 @@ +#ifndef __COMMON_COR_H +#define __COMMON_COR_H + +#include + +#define _CMDLINE_FILE "../cor/cmdline.h" +#include "../common/common.h" + +#include "cmdline.h" + +#define DICT_FILE "cor.bin" + +extern int change_count; + +extern void process_cor_options(gengetopt_args_info* args); + +extern char dictionary[]; + +#endif diff --git a/app/src/cor/corr.cc b/app/src/cor/corr.cc new file mode 100644 index 0000000..1e0d83c --- /dev/null +++ b/app/src/cor/corr.cc @@ -0,0 +1,142 @@ +//--------------------------------------------------------------------------- + +#include "corr.hh" + +#define MAXPATH 256 + +#define min(x,y) ((xy)?(x):(y)) + + +int Corr::ed(int i,int j) +{ + if(i==-1) + return j+1; + if(j==-1) + return i+1; + if(i==-2 || j==-2) + return n+1; + + if(X[i]==Y[j]) + return H2[i-1][j-1]; + if(X[i-1]==Y[j] && X[i]==Y[j-1]) + return 1+min(H2[i-2][j-2],min(H2[i][j-1],H2[i-1][j])); + return 1+min(H2[i-1][j-1],min(H2[i][j-1],H2[i-1][j])); + +/* + if(X[i]==Y[j]) + return H[(i-1)+2][(j-1)+2]; + if(X[i-1]==Y[j] && X[i]==Y[j-1]) + return 1+min(H[(i-2)+2][(j-2)+2],min(H[(i)+2][(j-1)+2],H[(i-1)+2][(j)+2])); + 
return 1+min(H[(i-1)+2][(j-1)+2],min(H[(i)+2][(j-1)+2],H[(i-1)+2][(j)+2])); +*/ +} + +int Corr::cuted(int j) +{ + int l=max(0,j-t); + int u=min(m,j+t); + int ce=j+t; + for(int k=l;k<=u;k++) + { + if(H2[k][j]0) + j--; + else + more=0; + while(more && !continued(path[j])); + state=path[j]+1; + } + return count; +} + + +//--------------------------------------------------------------------------- + diff --git a/app/src/cor/corr.hh b/app/src/cor/corr.hh new file mode 100644 index 0000000..5c7438e --- /dev/null +++ b/app/src/cor/corr.hh @@ -0,0 +1,34 @@ +//--------------------------------------------------------------------------- +#ifndef _corr_hh +#define _corr_hh +//--------------------------------------------------------------------------- + +#include "../lib/tfti.h" +#include "../lib/word.h" + +class Corr : public TFTiv +{ +private: + int H[100][100]; + char X[100]; // misspelled string + char Y[100]; // (possibly partial) candidate string + int m; // length of X + int n; // maximal length of Y + + int ed(int,int); + int cuted(int); + void recomputeH(int); + +public: + int (*H2)[100]; + + int t; // threshold + + Corr() : H2((int(*)[100])&H[2][2]) {}; + Corr(const char* a) : TFTiv(a), H2((int(*)[100])&H[2][2]) { }; + + int correct(const char* w, Words& tab); +}; + +//--------------------------------------------------------------------------- +#endif diff --git a/app/src/cor/main.cc b/app/src/cor/main.cc new file mode 100644 index 0000000..23380a6 --- /dev/null +++ b/app/src/cor/main.cc @@ -0,0 +1,155 @@ +#include +#include +#include "../lib/iotools.h" +#define _CMDLINE_FILE "../cor/cmdline.h" +#include "../common/common.h" +#include "common_cor.h" +#include "corr.hh" +#include "cmdline.h" +#include + + +int main(int argc, char** argv) { + +// setlocale(LC_CTYPE,""); +// setlocale(LC_COLLATE,""); + + gengetopt_args_info args; + + if(cmdline_parser(argc, argv, &args) != 0) + exit(1); + + process_config_files(&args,argv[0]); + process_common_options(&args,argv[0]); + 
process_cor_options(&args); + + Corr cor; + + cor.load(dictionary); + cor.t=args.distance_arg; + + char line[MAX_LINE+1]; + long line_count = 0; + + Segment seg; + Words tab; + char form1[MAX_LINE]; + char* form; + int formcasing; + char corfield[MAX_LINE]=""; + + while (fgets(line, MAX_LINE, inputf)) + { +// strcpy(outline,line); + ++line_count; + +// if(!seg.parse(line)) +// { +// fprintf(stderr,"Input error in line %d.\n",line_count); +// exit(1); +// } + + char outline[128]; + //printf("Starting cor... searching for %d fields\n", args.input_field_given); + //for (int i=0; i +#include +#include + +char buf[5001]; + +main(int argc, char **argv) +{ + + char *pattern; + char eoln; + regex_t re; + + int firstline=1; + + if(argc < 2) +/* pattern="[ \t]*([0-9]+[ \t]+){2}EOS([ \t].*)?"; */ + pattern="[ \t]*BOS([ \t].*)?"; + else + pattern=argv[1]; + + if(argc < 3) + eoln='\f'; + else + eoln=atoi(argv[2]); + + if(regcomp(&re, pattern, REG_EXTENDED|REG_NOSUB) !=0) + { + fprintf(stderr,"Invalid pattern.\n"); + exit(1); + } + + while(fgets(buf,5000,stdin)) + { + buf[strlen(buf)-1]='\0'; + if(firstline) + firstline=0; + else + if(regexec(&re, buf, (size_t)0, NULL, 0) == 0) + putchar('\n'); + else + putchar(eoln); + fputs(buf,stdout); + } + putchar('\n'); +} diff --git a/app/src/gph/Makefile b/app/src/gph/Makefile new file mode 100644 index 0000000..129e815 --- /dev/null +++ b/app/src/gph/Makefile @@ -0,0 +1,7 @@ + +gph: + +copy: +ifdef UTT_BIN_DIR + cp gph ${UTT_BIN_DIR} +endif diff --git a/app/src/gph/gph b/app/src/gph/gph new file mode 100755 index 0000000..b602ac7 --- /dev/null +++ b/app/src/gph/gph @@ -0,0 +1,85 @@ +#!/usr/bin/perl + +use Getopt::Long; + +my @process; +my $help=0; +my $reset; +my $interactive=1; + +GetOptions("process|p=s" => \@process, + "help|h" => \$help, + "reset|r=s" => \$reset, + "interactive|i" => \$interactive); + +if($help) +{ + print <<'END' +Usage: gph [OPTIONS] + +Options: + -p tag Process segments with this tag as nodes. 
+ -r tag Start new graph at this tag. + -f filename Input file (NIE DZIALA). + -o filename Output file (NIE DZIALA). + -i Toggle interactive mode (default=on). +END +; + exit 0; +} + + +$|=1 if $interactive; + +my @prev; + +my $n=0; + +while(<>) +{ + chomp; + my $do=0; + + my @line = split /\s+/; + + if($line[2] eq $reset) + { + $n=0; + @prev = (); + } + + for my $p (@process) + { + $do=1 if $line[2] eq $p; + } + + if($do) + { + @preds = (); + shift @prev while @prev+0 && $prev[0]->[1] + $prev[0]->[2] < $line[0]; + for my $p (@prev) + { + push(@preds, $p->[0]) if $p->[1] + $p->[2] == $line[0]; + } + push @prev, [$n, $line[0], $line[1]]; + + $gph=' gph:'.$n.':'.join(',',@preds); + + $n++; + } + else + { + for my $p (@prev) + { + if($p->[1]+$p->[2] == $line[0]) + { + $p->[2] += $line[1]; + } + } + + $gph=''; + + } + + print $_.$gph."\n"; +} diff --git a/app/src/grp/Makefile b/app/src/grp/Makefile new file mode 100644 index 0000000..e0ca5c1 --- /dev/null +++ b/app/src/grp/Makefile @@ -0,0 +1,6 @@ +main: + +copy: +ifdef UTT_BIN_DIR + cp grp ${UTT_BIN_DIR} +endif diff --git a/app/src/grp/grp b/app/src/grp/grp new file mode 100755 index 0000000..97c136d --- /dev/null +++ b/app/src/grp/grp @@ -0,0 +1,154 @@ +#!/usr/bin/perl + +#package: UAM Text Tools +#component name: gre +#author: Tomasz Obrbski + +use strict; +use Getopt::Long; + +my $LIB_DIR="/usr/local/lib/utt"; # katalog zawierajacy terms.m4 + +my $systemconfigfile="/usr/local/etc/utt/grp.conf"; +my $userconfigfile="$ENV{'HOME'}/.utt/grp.conf"; + +Getopt::Long::Configure('no_ignore_case_always'); + +my $help=0; +my $pattern=0; +my $matches_only=0; +my $macrofile=0; +my $define=0; +my $show_command=0; +my $action="pgP"; +my $eos="seg(EOS)"; +my $morfield='lem'; + +#read configuration files########################### +my $file; +foreach $file ($systemconfigfile, $userconfigfile){ + if(open(CONFIG, $file)){ + while () { + chomp; + s/#.*//; + s/^\s+//; + s/\s+$//; + next unless length; + my ($name, $value) = 
split(/\s*=\s*/, $_, 2);
            # map each "name = value" entry of the configuration file onto
            # the corresponding option variable
            if(($name eq "pattern")or($name eq "e")){
                $pattern=$value;
            }
            elsif(($name eq "eos")or($name eq "E")){
                $eos=$value;
            }
            elsif($name eq "morph"){
                $morfield=$value;
            }
            elsif($name eq "macros"){
                $macrofile=$value;
            }
            elsif($name eq "define"){
                $define=$value;
            }
            elsif($name eq "command"){
                $show_command=1;
            }
            elsif($name eq "action"){
                # was the no-op statement "$action;", which silently dropped
                # the value given in the configuration file
                $action=$value;
            }
            elsif(($name eq "help")or($name eq "h")){
                $help=1;
            }

        }
        close CONFIG;
    }
}
#########################################################

# Command-line options override whatever the configuration files set.
# NOTE(review): --define is stored in $macrofile here, whereas the config
# file stores "define" in $define - confirm which one is intended.
GetOptions("pattern|e=s" => \$pattern,
           "eos|E=s" => \$eos,
           "morph=s" => \$morfield,
           "macros=s" => \$macrofile,
           "define=s" => \$macrofile,
           "command" => \$show_command,
           "action=s" => \$action,
           "help|h" => \$help);

if($help)
{
    print <<'END'
Usage: gre [OPTIONS] [file ..]

Options:
 --pattern -e PATTERN Pattern.
 --eos -E PATTERN Segment serving as sentence delimiter.
 --morph=STRING Field containing morphological information (default 'lem').
 --macros=FILE Read macrodefinitions from FILE.
 --define=FILE Add macrodefinitions from FILE.
 --action -a [u][p][g][P] Perform only indicated actions.
 u - uncompress with 'lzop -cd'
 p - preprocess
 g - grep
 P - postprocess
 (default pgP)
 --command Print the shell command to be executed and exit.
 --help -h Help.
END
;
    exit 0;
}

# a pattern is mandatory whenever the grep action ('g') is requested
die("$0: no pattern given.\n") unless $pattern || $action !~ /g/;

# fall back to the terms.m4 file in $LIB_DIR when no macro file was given
die("$0: macro file not found") unless
    $macrofile or
    -e "$LIB_DIR/terms.m4" and $macrofile="$LIB_DIR/terms.m4";

# pipeline stages, each enabled by one letter of $action
my $uncompress = ($action =~ /u/) ? ' lzop -cd | ' : '';
my $preproc = ($action =~ /p/) ? ' fla | ' : '';

my $postproc = ($action =~ /P/) ? ' | unfla ' : '';


# discarding spaces
$pattern =~ s/\s+/\\`'/g; #`
# quoting escaped commas
$pattern =~ s/\\,/\\`\\`\\,''/g;
# quoting commas in {m,n} r.e. 
operator +$pattern =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g; + +my $grepre = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`; + +die("Incorrect pattern (m4).") if $? >> 8; + + +chomp $grepre; + +# <> expansion + +$grepre =~ s/<([^>]+)>/`echo $1 | tag2re`/ge; + +$grepre =~ s/\./[^ [:cntrl:]]/g; + +$grepre =~ s/\\s/[ ]/g; +$grepre =~ s/\\S/[^ [:cntrl:]]/g; +$grepre =~ s/\\d/[0-9]/g; +$grepre =~ s/\\D/[^0-9 [:cntrl:]]/g; +$grepre =~ s/\\w/[a-z󶼿A-ZʣӦ0-9_]/g; +$grepre =~ s/\\W/[^a-z󶼿A-ZʣӦ0-9_ [:cntrl:]]/g; +# extensions +$grepre =~ s/\\l/[a-z󶼿]/g; #lowercase letter +$grepre =~ s/\\L/[A-ZʣӦ]/g; #upercase letter + +my $grep_command = ($action =~ /g/) ? "egrep '$grepre'" : " cat "; + +if($show_command) +{ + print $grep_command."\n"; + exit 0; +} + +#print $preproc.$grep_command.$postproc."\n"; + +exec $preproc.$grep_command.$postproc; diff --git a/app/src/gue/Makefile b/app/src/gue/Makefile new file mode 100644 index 0000000..7fb9380 --- /dev/null +++ b/app/src/gue/Makefile @@ -0,0 +1,42 @@ +PAR=-Wno-deprecated -O3 -fpermissive -static +PAR2=-c -Wno-deprecated -O3 -fpermissive +LIB_PATH=../lib +COMMON_PATH=../common +CMDLINE_FILE='"../gue/cmdline.h"' + + +gue: main.cc guess.o $(LIB_PATH)/auttools.o $(LIB_PATH)/word.o \ + cmdline.c common_guess.o common.o + g++ $(PAR) main.cc guess.o \ + $(LIB_PATH)/auttools.o $(LIB_PATH)/word.o cmdline.c common.o common_guess.o \ + -o gue + +guess.o: guess.h guess.cc + g++ $(PAR2) guess.cc + +common_guess.o: cmdline.h common_guess.cc common_guess.h + g++ $(PAR2) common_guess.cc + +common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \ + $(COMMON_PATH)/common.h + g++ $(PAR2) -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc + +cmdline.c cmdline.h: cmdline.ggo + gengetopt -i cmdline.ggo --conf-parser + +cmdline.ggo: cmdline_guess.ggo ../common/cmdline_common.ggo + cat cmdline_guess.ggo ../common/cmdline_common.ggo > cmdline.ggo + + +clean: clean.cmdline + 
rm *.o || true + rm gue || true + + +clean.cmdline: + rm cmdline.* || true + +copy: +ifdef UTT_BIN_DIR + cp gue ${UTT_BIN_DIR} +endif diff --git a/app/src/gue/cmdline_guess.ggo b/app/src/gue/cmdline_guess.ggo new file mode 100644 index 0000000..e94fec3 --- /dev/null +++ b/app/src/gue/cmdline_guess.ggo @@ -0,0 +1,12 @@ +package "guess" +version "0.1" + +option "guess_count" n "Guess up to n descriptions" int default="0" no +option "delta" - "Stop displaying answers after fall of weight" float default="0.2" no +option "cut-off" - "Do not display answers with less weight than cut-off" int default="200" no +option "dictionary-home" - "dh" hidden +option "dictionary" d "File with dictionary information" string typestr="filename" default="~/.utt/lang/pl_PL.ISO-8859-2/gue.bin" no +option "per-info" v "Display performance information" flag off +option "weights" w "Print weights" flag off hidden +option "no-uppercase" - "Do not process form containing uppercase letters" flag off + diff --git a/app/src/gue/common_guess.cc b/app/src/gue/common_guess.cc new file mode 100644 index 0000000..955792d --- /dev/null +++ b/app/src/gue/common_guess.cc @@ -0,0 +1,50 @@ +#include +#include +#include "common_guess.h" + +int guess_count=0; +double delta=0.1; +int cut_off=100; +char dictionary[255]; +bool per_info=false; +bool weights=true; + +void process_guess_options(gengetopt_args_info* args) +{ + + if(args->dictionary_given) + { + expand_path(args->dictionary_arg,dictionary); + } + else if (args->dictionary_home_given && args->language_given) + { + char buf[255]; + expand_path(args->dictionary_home_arg, buf); + sprintf(dictionary,"%s/%s/lem.bin",buf,args->language_arg); + } + + if(args->guess_count_given) + guess_count=args->guess_count_arg; + else + guess_count=0; + + if(guess_count==0) + guess_count=100; + + if(args->delta_given) + delta=args->delta_arg; + else + delta=0.1; + + if(args->cut_off_given) + cut_off=args->cut_off_arg; + else + cut_off=100; + + if(args->per_info_given) + 
per_info=args->per_info_flag; + + if(args->weights_given) + weights=false; + +} diff --git a/app/src/gue/common_guess.h b/app/src/gue/common_guess.h new file mode 100644 index 0000000..c7a44bc --- /dev/null +++ b/app/src/gue/common_guess.h @@ -0,0 +1,20 @@ +#ifndef __COMMON_GUESS_H +#define __COMMON_GUESS_H + +#include +#define _CMDLINE_FILE "../gue/cmdline.h" +#include "../common/common.h" +#include "cmdline.h" + +#define DIC_FILE "gue.bin" + +extern int guess_count; +extern double delta; +extern int cut_off; +extern char dictionary[]; +extern bool per_info; +extern bool weights; + +void process_guess_options(gengetopt_args_info* args); + +#endif diff --git a/app/src/gue/guess.cc b/app/src/gue/guess.cc new file mode 100644 index 0000000..bf502d5 --- /dev/null +++ b/app/src/gue/guess.cc @@ -0,0 +1,138 @@ + +#include "guess.h" + +#include +#include +#include +#include +#include + +#define DICT 1 +#define COR 2 +#define DICT_P 3 +#define COR_P 4 + +#define W_PRE 0.1 +#define W_SUF 0.9 + +#define PREF_SIGN '_' + +Guess::Guess(const char* suf_file) + : _suf(suf_file) { + /* _suf = NULL; + _pref = NULL; + + if (strlen(suf_file) > 0) + _suf = new TFTiv(suf_file); + if (strlen(pref_file) > 0) + _pref = new TFTiv(corp_file); + */ +} + + + char buf[MAX_LINE]; + char out[MAX_LINE]; + char* buf0_s = buf; + char* word_t = NULL; + long state_s = 0; + unsigned length_s = buf0_s - buf; + long len = 0; + int i=0; + +int Guess::ana(const char* word, Words& result) { + + assert(word && &result); + + /* Word zawiera wyraz, ktory mamy zbadac. 
+ * Nalezy przepisac go w odwrotnej kolejnosci do bufora, + * znalezc najdluzszy prefiks pasujacy do tego bufora + * separatorem jest '/' - za tym znakiem znajduje sie + * prawdopodobienstwo wystapienia danego opisu */ + + buf0_s = buf; + word_t = strdup(word); + + if (reverse(word, buf) != 0) + return -1; + + + + state_s = -1; + // printf("#buf0_s=%s, ", buf0_s); + state_s = _suf.pref(buf0_s, PREF_SIGN); + // printf("#word=%s, buf0_s=%s\t", word, buf0_s); + /* jezeli state_s != -1 to oznacza, ze w slowniku jest zawarta + * informacja o prefiksie tego slowa. + * nie jest ona odwrocona, wiec porownujemy do word a nie do buf + */ + // printf("state_s=%d\t", state_s); + if (state_s != -1) { + state_s = _suf.pref(word_t, '~', state_s); + // printf("state_s(wp)=%d, word_t=%s, word=%s\n", state_s, word_t, word); + } + if (state_s == -1) { + // if (_suf != NULL) + buf0_s = buf; + state_s = _suf.pref(buf0_s, '~'); + // printf("state_s=%d\n", state_s); + } + + length_s = buf0_s - buf; + + /* state jest stanem, od ktorego zaczyna sie sciezka opisujaca + * prawdopodobienstwo przeciwienstwa wystapienia opisu + * znajdujacego sie dalej na tej sciezce. 
+ * Im mniejsza wartosc liczby tym wieksze prawdopodobienstwo */ + + len = 0; + i=0; + + // if (_suf != NULL) + len = _suf.cont(state_s, out); + while (len > 0) { + i++; + add_word_prob(result, word, out, length_s, DICT); + len = _suf.cont(-1, out); + } + + return i; + +} + + +int Guess::add_word_prob(Words& tab, const char* word, const char* path, unsigned len, int source) { + + /* Dodaje do tablicy tab wyraz word wraz + * z prawdopodobienstwem i opisem zawartym + * w sciezce path */ + + // printf("add_word_prob("); + // fflush(stdout); + char p[MAX_LINE]; + + strcpy(p, path); + + int probLen = strcspn(p, ";"); + char prob[probLen+1]; + strncpy(prob, p, probLen); + prob[probLen] = '\0'; + + char* desc = p + probLen+1; // +2 bo pomijamy jeszcze znak ';' + + int i = tab.add(word, desc); + + if (source==DICT) { + tab[i].len_suf(len); + tab[i].w_suf(atof(prob)); // + W_PRE*tab[i].w_suf())); + // tab[i].w_suf((float)(W_SUF*(1000-atof(prob)) + W_PRE*tab[i].w_suf())); + } +// if (source==COR) { +// tab[i].len_pref(len); +// tab[i].w_pref(W_SUF*(1000-atof(prob)) + W_PRE*tab[i].w_pref()); +// } +// printf(")\n"); +// fflush(stdout); + + return i; + +} diff --git a/app/src/gue/guess.h b/app/src/gue/guess.h new file mode 100644 index 0000000..68b7584 --- /dev/null +++ b/app/src/gue/guess.h @@ -0,0 +1,56 @@ + +#include "../lib/tfti.h" +#include "../lib/word.h" + +#include + +/************************************************************** + * Zawiera definicje klasy Guess. * + * * + * Klasa ta pozwala na okreslenie opisu slowa nie * + * znajdujacego sie w slowniku wraz z prawdopodobienstwem * + * jego wystapienia. 
* + *************************************************************/ + +class Guess { + + public: + + // nazawa pliku slownika w parametrze + Guess(const char* suf_file); + + // zwraca tablice opisow slowa wraz z prawdopodobienstwem ich wystapienia + int ana(const char* word, Words& result); + + long time_overall; + + private: + + // sufiksy + TFTiv _suf; + + // prefiksy + TFTiv _pref; + + //odwraca ciag znakow + int reverse(const char* src, char* dest) { + + // assert((src != NULL) && (dest != NULL)); + + const char* c = src; + + int len = strlen(src); + + for (int i=1; i<=len; ++i) { + dest[i-1] = src[len-i]; + } + + dest[len] = '\0'; + + return 0; + } + + //dodaje nowy element do tablicy WordsProb + int add_word_prob(Words& tab, const char* word, const char* path, unsigned len, int source); + +}; diff --git a/app/src/gue/main.cc b/app/src/gue/main.cc new file mode 100644 index 0000000..a4809a1 --- /dev/null +++ b/app/src/gue/main.cc @@ -0,0 +1,192 @@ +#include +#include +#include "../lib/iotools.h" +#define _CMDLINE_FILE "../gue/cmdline.h" +#define CONFIGFILE1 "/home/ynka/utt/utt-0.9/conf/gue.conf" +#define CONFIGFILE2 "/home/ynka/utt/utt-0.9/conf/gue.conf" +#include "../common/common.h" +#include "common_guess.h" +#include "guess.h" +#include "cmdline.h" + +#define W_SUFF 0.6 +#define W_PREF 0.4 + +int main(int argc, char** argv) { + + int non_standard_config=0; + + gengetopt_args_info args; + + if(cmdline_parser(argc, argv, &args) != 0) + exit(1); + + process_config_files(&args,argv[0]); + + process_common_options(&args,argv[0]); + process_guess_options(&args); + + + // PONIŻEJ POPRZEDNI KOD (JUSTYNY) +// //preliminary command-line parsing - for configuration file info only +// gengetopt_args_info pre_args; + +// if (cmdline_parser(argc, argv, &pre_args) != 0) +// exit(1); +// if(pre_args.config_given){ +// printf("podano config: %s\n",pre_args.config_arg); +// non_standard_config=1; +// } + + +// //configuration file 1 parsing +// struct cmdline_parser_params 
*params; +// params = cmdline_parser_params_init(); +// params->initialize = 1; +// if(cmdline_parser_config_file(CONFIGFILE1,&args, params)!=0){ +// printf("System-wide configuration file parsing error!\n"); +// exit(1); +// } + +// //configuration file 2 parsing-overriding +// params->initialize=0; +// params->override=1; +// char* config2=(non_standard_config)?pre_args.config_arg:CONFIGFILE2; +// if(cmdline_parser_config_file(config2,&args, params)!=0){ +// printf("User configuration file parsing error!\n"); +// return 1; +// } + +// params->initialize=0; +// params->override=1; +// //params->check_required=1; + +// free(params); + +// //command-line options parsing-overriding +// if (cmdline_parser(argc, argv, &args) != 0) +// exit(1); + + + char line[MAX_LINE]; + char outline[MAX_LINE]; + char parms[MAX_LINE], desc[MAX_LINE], lemma[MAX_LINE]; + long line_count = 0; + // printf("d_f=%s\n", dict_file); + Guess guess(dictionary); + int words_count=0; + time_t start_time = time(NULL); + + Segment seg; + Words tab; + char* form; //[MAX_FORM]; + while (fgets(line, MAX_LINE, inputf)==line) { + line_count++; + int start, len; + + line[strlen(line)-1] = '\0'; + + if (!seg.parse(line)) { + fprintf(stderr, "B³±d w wej¶ciu (linia: %d)\n", line_count); + return -1; + } + + if (process_seg(seg, args)) { + words_count++; + tab.clear(); + if (args.input_field_given>0) { + form = getInput(args.input_field_arg, args.input_field_given, seg); + } else + form = seg.form; + + if (NULL == form) { + continue; + } + + guess.ana(form, tab); + + if ((tab.count()==0) && (!args.no_fail_flag)) { + // no guesses - analysis was unsuccessful + seg.print(outline); //this is necessary - seg.parse destroys line... + fputs(outline, failedf); + if (copy_processed) + fputs(line, stdout); + continue; + } + // we've got some guesses. Do we want to print it? 
+ if (args.only_fail_flag) + continue; + + float last_weight=0; + int i=0; + int count=0; + unsigned first=1; + char* parms_end = parms; + char last_lemma[MAX_LINE]; + + while ((i=tab.next()) != -1 && count++) { + chomp; + s/#.*//; + s/^\s+//; + s/\s+$//; + next unless length; + my ($name, $value) = split(/\s*=\s*/, $_, 2); + if(($name eq "gap-fill")or($name eq "g")){ + $gap_fill=$value; + } + elsif(($name eq "spaces")or($name eq "s")){ + $spaces=1; + } + elsif(($name eq "help")or($name eq "h")){ + $help=1; + } + + } + close CONFIG; + } +} +######################################################### + +GetOptions("gap-fill|g=s" => \$gap_fill, + "spaces|r" => \$spaces, + "help|h" => \$help); + +if($help) +{ + print <<'END' +Usage: ser [OPTIONS] [file ..] + +Options: + --gap-fill -g Help. + --spaces -r + --define=FILE Read macrodefinitions from FILE. + --flex-template=FILE Read flex code template from FILE. + --only-matching -m Print only fragments matching PATTERN. + --flex Print only the generated flex code and exit. +END +; + exit 0; +} + + +$gap_fill =~ s/\\t/\t/g; +$gap_fill =~ s/\\n/\n/g; +$gap_fill =~ s/\\r/\r/g; +$gap_fill =~ s/\\f/\f/g; + +my $prevend=-1; +my $count=0; + +while(<>) +{ + my ($start,$len,$type,$form) = /^\s*(\d+)\s+(\d+)\s+(\S+)\s+(\S+)/; + + if($start > $prevend) + { + print $gap_fill unless $count++ == 0; + } + + $prevend=$start+$len; + + next if $len==0;# || $form eq "*"; + + $form =~ s/\\\*/*/g; + + if($type eq 'S' && ! 
$spaces) + { + $form =~ s/_/ /g; + $form =~ s/\\t/\t/g; + $form =~ s/\\n/\n/g; + $form =~ s/\\r/\r/g; + $form =~ s/\\f/\f/g; + } + + print $form; +} + +#print $gap_fill; + +# print "\n"; diff --git a/app/src/lem/Makefile b/app/src/lem/Makefile new file mode 100644 index 0000000..49e201f --- /dev/null +++ b/app/src/lem/Makefile @@ -0,0 +1,56 @@
# Builds the `lem` executable and copies it into the distribution tree.
#
# PAR  - flags for the final compile+link step
# PAR2 - flags for compiling a single object file
PAR=-Wno-deprecated -m32 -O3 -fpermissive
#-static
PAR2=-c -Wno-deprecated -m32 -O3 -fpermissive
LIB_PATH=../lib
COMMON_PATH=../common
CMDLINE_FILE='"../lem/cmdline.h"'

# C++ compiler; stays g++ unless overridden from the command line/environment.
CXX ?= g++

# These targets are commands, not files: without this declaration a stray
# file named `copy` or `clean` would silently disable them.
.PHONY: copy clean clean.cmdline

lem: main.cc lem.o $(LIB_PATH)/auttools.o $(LIB_PATH)/word.o \
     cmdline.c common_lem.o common.o symtab.o
	$(CXX) $(PAR) main.cc lem.o $(LIB_PATH)/auttools.o \
	$(LIB_PATH)/word.o cmdline.c common.o common_lem.o \
	symtab.o -o lem

lem.o: lem.h lem.cc
	$(CXX) $(PAR2) lem.cc

# alphabet.o: $(LIB_PATH)/alphabet.h $(LIB_PATH)/alphabet.cc
#	g++ $(PAR2) $(LIB_PATH)/alphabet.cc

# auttools.o: $(LIB_PATH)/auttools.h $(LIB_PATH)/auttools.cc
#	g++ $(PAR2) $(LIB_PATH)/auttools.cc

# word.o: $(LIB_PATH)/word.h $(LIB_PATH)/word.cc
#	g++ $(PAR2) $(LIB_PATH)/word.cc

# erro.o: $(LIB_PATH)/erro.h $(LIB_PATH)/erro.cc
#	g++ $(PAR2) $(LIB_PATH)/erro.cc

# common.cc is compiled with _CMDLINE_FILE pointing at the generated
# cmdline.h, so the header is listed as a prerequisite to make sure it
# exists before this object is built.
# NOTE(review): presumably common.h does `#include _CMDLINE_FILE` - confirm.
common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \
          $(COMMON_PATH)/common.h cmdline.h
	$(CXX) $(PAR2) -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc

common_lem.o: cmdline.h common_lem.h common_lem.cc
	$(CXX) $(PAR2) common_lem.cc

# Both files come from a single gengetopt run.  NOTE: make treats this as two
# independent rules, so a parallel build (-j) may run gengetopt twice.
cmdline.c cmdline.h: cmdline.ggo
	gengetopt -i cmdline.ggo --conf-parser

cmdline.ggo: cmdline_lem.ggo ../common/cmdline_common.ggo
	cat cmdline_lem.ggo ../common/cmdline_common.ggo > cmdline.ggo

symtab.o: $(LIB_PATH)/symtab.h $(LIB_PATH)/symtab.cc
	$(CXX) $(PAR2) $(LIB_PATH)/symtab.cc

clean: clean.cmdline
	rm -f *.o
	rm -f lem

clean.cmdline:
	rm -f cmdline.*

# Copy the binary into the distribution; a no-op unless UTT_BIN_DIR is set.
copy:
ifdef UTT_BIN_DIR
	cp lem $(UTT_BIN_DIR)
endif

diff --git a/app/src/lem/cmdline_lem.ggo b/app/src/lem/cmdline_lem.ggo new file mode 100644 index 
0000000..21c93f0 --- /dev/null +++ b/app/src/lem/cmdline_lem.ggo @@ -0,0 +1,5 @@ +package "lem" +version "0.1" + +option "dictionary-home" - "D.h." string typestr="FILENAME" hidden no +option "dictionary" d "Dictionary" string typestr="FILENAME" default="lem.bin" no diff --git a/app/src/lem/common_lem.cc b/app/src/lem/common_lem.cc new file mode 100644 index 0000000..f81c07b --- /dev/null +++ b/app/src/lem/common_lem.cc @@ -0,0 +1,41 @@ +#include +#include +#include "common_lem.h" + +char dictionary[255]; + +void process_lem_options(gengetopt_args_info* args) +{ + + if(args->dictionary_given) + { + expand_path(args->dictionary_arg,dictionary); + } + else if (args->dictionary_home_given && args->language_given) + { + char buf[255]; + expand_path(args->dictionary_home_arg, buf); + sprintf(dictionary,"%s/%s/lem.bin",buf,args->language_arg); + } +} + + +// STARE +// if(args.dictionary_given) +// strcpy(dictionary, args.dictionary_arg); +// else { +// char path[256]; +// //sprintf(path, "/etc/utt/data/%s/%s", args.locale_arg, DICT_FILE); +// //if (file_accessible(path) == 0) +// // strcpy(dictionary, path); +// //else { +// sprintf(path, "%s/%s", utt_dir, DICT_FILE); +// if (file_accessible(path) == 0) +// strcpy(dictionary, path); +// else { +// fprintf(stderr, "Cannot find dictionary!\n"); +// exit(1); +// } +// //} +// } + diff --git a/app/src/lem/common_lem.h b/app/src/lem/common_lem.h new file mode 100644 index 0000000..fe640c9 --- /dev/null +++ b/app/src/lem/common_lem.h @@ -0,0 +1,16 @@ +#ifndef __COMMON_LEM__H +#define __COMMON_LEM__H + +#include +#define _CMDLINE_FILE "../lem/cmdline.h" +#include "../common/common.h" + +#include "cmdline.h" + +#define DICT_FILE "lem.bin" + +extern char dictionary[]; + +extern void process_lem_options(gengetopt_args_info* args); + +#endif diff --git a/app/src/lem/lem.cc b/app/src/lem/lem.cc new file mode 100644 index 0000000..3c4dde9 --- /dev/null +++ b/app/src/lem/lem.cc @@ -0,0 +1,152 @@ +#include "lem.h" + +#include + + +/* 
Znajduje opisy slownikowe dla wyrazu. + * Parametry: + * form - wyraz, + * tab - referencja do tablicy Words (miejsce na wyniki) + * Wartosc: + * liczba dodanych opisow + */ +int Lem::ana(const char* form, Words& tab) { + + // sprawdzamy czy parametry wywolania sa poprawne + assert(form && &tab); + int count0 = tab.count(); + long l; + if ((l=_dict.next(_dict.gtra(0, form, FT::ftMAXPATH), ';'))>=0) + add_to_table(tab, form, l); + return tab.count()-count0; +} + + +/* Szukamy opisu slownikowego nastepnego wyrazu w buforze. + * Parametry: + * buf - bufor + * tab - miejsce na wyniki + * Wartosc: + * ilosc dodanych opisow + */ +int Lem::pref(char* buf, Words& tab) { + + // sprawdzamy czy parametry wywolania sa poprawne + assert(buf && &tab); + + int count0 = tab.count(); + long l; + char* buf0 = buf; + + if((l=_dict.pref(buf, ';'))>=0) { + char form[MAX_FORM]; + int len=buf-buf0; + form[len]='\0'; + add_to_table(tab,form,l); + } + return tab.count() - count0; +} + +/* Dodaje kolejne opisy do tablicy wynikow. + * Parametry: + * tab - tablica wynikow, + * f - wyraz, + * s - stan, na ktorym zaczyna sie pierwszy opis + */ +void Lem::add_to_table(Words& tab, const char* f, long s) { + + // sprawdzenie parametrow + assert(&tab); + assert(f); + + char des[FT::ftMAXPATH]; + + while (_dict.cont(s, des)) { + char* des1; + if ((des1=strtok(des, ";")) != NULL) + do { + if (tab.count() >= MAX_ALT) break; + tab.add(f, des1); + des1=strtok(NULL, ";"); + } while (des1!=NULL); + s=-1; + } +} + +void Lem::prn_dict() +{ + + char des[FT::ftMAXPATH]; + + long s=0; + + while (_dict.cont(s, des)) + { + printf("%s\n",des); + s=-1; + } +} + + +AuxLem::AuxLem(const char* filename) + : Lem(), _dict(SIZE) +{ + FILE* f; + char buf[MAX_LINE+2]; + f=fopen(filename,"r"); + for(long i=0; i=MAX_LINE-1) continue; // BEZ isalpha! 
+ buf[l-1]='\0'; + char* sep=strchr(buf,';'); + if(sep==NULL) continue; + *sep='\0'; + long formind=_dict.add(buf); + if(formind>=0) + { + char* desc=strdup(sep+1); + info[formind]=desc; + } + else + fprintf(stderr,"AuxLem: Form not added: %s;%s.\n", buf,sep+1); + } + fclose(f); +}; + +//--------------------------------------------------------------------------- + +AuxLem::~AuxLem() +{ +// for(long i=0; i<_dict.count(); ++i) +// free(info[_dict.hashindex(i)]); + for(long i=0; i=0) + { + strcpy(des,info[ind]); + char* des1; + if((des1=strtok(des,";"))!=NULL) + do + { + if(tab.cnt>=MAXALT) break; + tab.add(form,des1); + des1=strtok(NULL,";"); + } while(des1!=NULL); + } + return tab.count()-count0; +} + +//--------------------------------------------------------------------------- + diff --git a/app/src/lem/lem.h b/app/src/lem/lem.h new file mode 100644 index 0000000..f6256f1 --- /dev/null +++ b/app/src/lem/lem.h @@ -0,0 +1,50 @@ +#include "../lib/tfti.h" +#include "../lib/word.h" +#include "../lib/symtab.h" +#include "../lib/const.h" + +class Lem { + + protected: + // Alphabet& _alpha; + + // slownik + TFTiv _dict; + + void add_to_table(Words& tab, const char* f, long s); + + public: + + Lem() {}; + Lem(const char* d) + : _dict(d) {}; + virtual int ana(const char* form, Words& tab); + int pref(char* form, Words& tab); + void prn_dict(); + +}; + + +class AuxLem : public Lem { +public: + + static const int SIZE=1500000; + // static const int MAXLINE=1000; + static const int MAXALT=256; + + AuxLem(const char* filename); + ~AuxLem(); + +// int ana(const char* form, Grams& tab); + int ana(const char* form, Words& tab); + +// operator bool() { return _dict && info; } + +private: + SymbolTable _dict; + char* info[SIZE]; + +}; + + + diff --git a/app/src/lem/main.cc b/app/src/lem/main.cc new file mode 100644 index 0000000..6c3a949 --- /dev/null +++ b/app/src/lem/main.cc @@ -0,0 +1,132 @@ +#include "../lib/iotools.h" +#define _CMDLINE_FILE "../lem/cmdline.h" +#include 
"../common/common.h" +#include "common_lem.h" +#include "lem.h" +#include "cmdline.h" +#include + +int main(int argc, char** argv) { + +// setlocale(LC_CTYPE,""); //PO CO TO? +// setlocale(LC_COLLATE,""); // + + gengetopt_args_info args; + + if(cmdline_parser(argc, argv, &args) != 0) + exit(1); + + process_config_files(&args,argv[0]); + process_common_options(&args,argv[0]); + process_lem_options(&args); + + char line[MAX_LINE+1]; + char outline[MAX_LINE+1]; + char parms[MAX_LINE+1], desc[MAX_LINE+1], lemma[MAX_LINE+1]; + long line_count = 0; + + Lem* lem; + + if(strcmp(dictionary+strlen(dictionary)-4,".bin")==0) + lem = new Lem(dictionary); + else if(strcmp(dictionary+strlen(dictionary)-4,".dic")==0) + lem = new AuxLem(dictionary); + else + fprintf(stderr,"lem: Invalid dictionary file extension.\n"); + + Words tab; +// Segment seg; + + while (fgets(line, MAX_LINE, inputf)) + { + // strcpy(outline,line); + ++line_count; + + int start, len; + + if (!process_seg(line, args)) // TO POWINNO BYC WCZESNIEJ ZABEZPIECZONE + fputs(line, outputf); + else + { + char form[MAX_FORM]; + + tab.clear(); + getfield(line,input_field_prefix,form); + if (form==NULL) continue; + + lem->ana(form, tab); + if(tab.count()==0) + { + char form1[MAX_FORM]; // tymczasowo tak, trzeba zmienic ana + char* p; + strcpy(form1,form); + for(p=form1;*p;++p) *p=tolower(*p); + p=form1; + lem->ana(p,tab); + } + + if (tab.count() == 0) + fputs(line, failedf); + else + { // mamy jakies opisy w slowniku + + if(one_line) + { + char* descp=desc; + for (int i=0; i< tab.count(); ++i) + { + descp += sprintf(descp," %s%s,%s", output_field_prefix, tab[i].lemma(), tab[i].descr()); + } + strcpy(outline,line); + outline[strlen(outline)-1]='\0'; + strcat(outline,desc); + strcat(outline,"\n"); + fputs(outline, outputf); + if (copy_processed) + fputs(line,outputf); + } + else if(one_field) + { + char* descp=desc; + for (int i=0; i< tab.count(); ++i) + if(i==0) + descp += sprintf(descp," %s%s,%s", output_field_prefix, 
tab[i].lemma(), tab[i].descr()); + else + { + if(strcmp(tab[i].lemma(),tab[i-1].lemma())==0) + descp += sprintf(descp,",%s",tab[i].descr()); + else + descp += sprintf(descp,";%s,%s",tab[i].lemma(),tab[i].descr()); + } + + strcpy(outline,line); + outline[strlen(outline)-1]='\0'; + strcat(outline,desc); + strcat(outline,"\n"); + fputs(outline, outputf); + if (copy_processed) + fputs(line,outputf); + } + else + { + for (int i=0; i< tab.count(); ++i) + { + // kolejne opisy - kolejne linie. + sprintf(desc, " %s%s,%s\n", output_field_prefix, tab[i].lemma(), tab[i].descr()); + strcpy(outline,line); + outline[strlen(outline)-1]='\0'; + strcat(outline,desc); + fputs(outline, outputf); + } + if (copy_processed) + fputs(line,outputf); + } + } + } + + if(args.interactive_flag) + fflush(outputf), fflush(failedf); + + } + cmdline_parser_free(&args); +} diff --git a/app/src/lib/Makefile b/app/src/lib/Makefile new file mode 100644 index 0000000..5dfc8c9 --- /dev/null +++ b/app/src/lib/Makefile @@ -0,0 +1,20 @@ +PAR=-Wno-deprecated -m32 -O3 +PAR2=-c -Wno-deprecated -m32 -O3 -static -fpermissive +LIB_PATH=../lib +COMMON_PATH=../common + +main: auttools.o word.o copy + +auttools.o: auttools.h auttools.cc + g++ $(PAR2) auttools.cc + +word.o: word.h word.cc + g++ $(PAR2) word.cc + +clean: + rm *.o + +copy: +ifdef UTT_LIB_DIR + cp -r perl $(UTT_LIB_DIR)/ +endif diff --git a/app/src/lib/auttools.cc b/app/src/lib/auttools.cc new file mode 100644 index 0000000..daf018d --- /dev/null +++ b/app/src/lib/auttools.cc @@ -0,0 +1,164 @@ +#include "auttools.h" +//#include "/src/cpp-comm/plx/Plx.h" + +void fullform(const char* b, const char* d, char* f) +{ + int i,j=0; + int n1, n2=0; + bool g=false; + char s1[200], s2[200], temps[200]; + while(d[j]>='0' && d[j]<='9')j++; + strncpy(temps,d,j); temps[j]='\0'; + n1=atoi(temps); + i=j; + while(!ispunct(d[j]) || d[j]=='*') j++; + strncpy(s1,d+i,j-i); + s1[j-i]='\0'; + if(d[j++]=='-') + { + i=j; + while(d[j]>='0' && d[j]<='9')j++; + 
strncpy(temps,d+i,j-i); temps[j]='\0'; + n2=atoi(temps); + i=j; + while(!ispunct(d[j]) || d[j]=='*') j++; + strncpy(s2,d+i,j-i); + s2[j-i]='\0'; + g=true; + } + + int blen=strlen(b); + if(g) + if(n1+n2<=blen) + { + strcpy(f,s1); + strcat(f,b+n1); + f[strlen(f)-n2]='\0'; + strcat(f,s2); + } + else + strcpy(f,""); + else + if(n1<=blen) + { + strcpy(f,b); + f[strlen(f)-n1]='\0'; + strcat(f,s1); + } + else + strcpy(f,""); +} + +void compose(char* stem, char* ending, char* form) +{ + bool suffix=true; + while(*stem) + if(*stem=='*') + { + strcpy(form,ending); + form+=strlen(ending); + suffix=false; + stem++; + } + else + *(form++)=*(stem++); + if(suffix) + { + strcpy(form,ending); + form+=strlen(ending); + } + *form='\0'; +} + +void autodescr(const char* f, const char* des, char* lemma, char* pos, char* attr) +{ + char lemd[MAXWORDLEN]; + int o,l=strcspn(des,","); + strncpy(lemd,des,l); + lemd[l]='\0'; + fullform(f,lemd,lemma); + o=l+1; + l=strcspn(des+o,"/:"); + strncpy(pos,des+o,l); + pos[l]='\0'; + o=o+l; + if(des[o]=='/') + { + o++; + strcpy(attr,des+o); + } + else + attr[0]='\0'; +} + + +int common_prefix(const char* s, const char* t) +{ + int n=0; + while(*s==*t && *s!='\0') + { s++,t++;n++; } + return n; +} + +int strdiff(const char* s, const char* t, + int& frontcut, char* prefix, int& endcut, char* suffix) +{ + int slen=strlen(s); + int tlen=strlen(t); + int ss, ss_max=0; /* ss - s shift */ + int ts, ts_max=0; /* ts - t shift */ + int common, common_max=0; + for(ss=0;sscommon_max + && (common>4 || (ss==0 && ts==0 && common>1)) ) + { + ss_max=ss; + ts_max=ts; + common_max=common; + } + // print "--", tsmax,"\n" + printf("--%d\n", ts_max); + frontcut=ss_max; + strncpy(prefix,t,ts_max); prefix[ts_max]='\0'; + endcut=slen-ss_max-common_max; + strcpy(suffix,t+ts_max+common_max); + return common_max; +} + +void fprndiff(FILE* f, const char* s, const char* t) +{ + int frontcut,endcut; + char pref[MAXWORDLEN],suff[MAXWORDLEN]; + strdiff(s,t,frontcut,pref,endcut,suff); 
+ if(frontcut!=0 || pref[0]!='\0') + fprintf(f,"%d%s-%d%s",frontcut,pref,endcut,suff); + else + fprintf(f,"%d%s",endcut,suff); +} + +void sprndiff(char* outstr, const char* s, const char* t) +{ + int frontcut,endcut; + char pref[MAXWORDLEN],suff[MAXWORDLEN]; + strdiff(s,t,frontcut,pref,endcut,suff); + if(frontcut!=0 || pref[0]!='\0') + sprintf(outstr,"%d%s-%d%s",frontcut,pref,endcut,suff); + else + sprintf(outstr,"%d%s",endcut,suff); +} + + +void despos(const char* des, char* pos) +{ + int di=0; + int pi=0; + while(des[di]!=',' && des[di]!='\0') ++di; + if(des[di]==',') + { + ++di; + while(isupper(des[di])) pos[pi++]=des[di++]; + } + pos[pi]='\0'; +} + diff --git a/app/src/lib/auttools.h b/app/src/lib/auttools.h new file mode 100644 index 0000000..d558222 --- /dev/null +++ b/app/src/lib/auttools.h @@ -0,0 +1,39 @@ + +#ifndef _Auttools_h +#define _Auttools_h + +#include +#include +#include +#include + +/* #define ISALPHAG(c) ((c>='A' && c<='Z') || (c>='a' && c<='z') || \ */ +/* c=='' || c=='' || c=='' || c=='' || \ */ +/* c=='' || c=='' || c=='' || c=='' || \ */ +/* c=='' || c=='' || c=='' || c=='' || \ */ +/* c=='' || c=='' || c=='' || c=='' || \ */ +/* c=='' || c=='' || c=='*') */ + +#define MAXWORDLEN 64 + +extern void fullform(const char* b, const char* d, // in + char* f); // out + +extern void compose(char* stem, char* ending, // in + char* form); // out + +extern void autodescr(const char* f, const char* des, // in + char* lemma, char* pos, char* attr); // out + +extern int strdiff(char* s, char* t, // in + int& frontcut, char* prefix, // out + int& endcut, char* suffix); // out + +extern void fprndiff(FILE* f, const char* s, const char* t);// in + +extern void sprndiff(char* outstr, const char* s, const char* t); // in + +extern void despos(const char* des, // in + char* pos); // out + +#endif diff --git a/app/src/lib/const.h b/app/src/lib/const.h new file mode 100644 index 0000000..52f9b63 --- /dev/null +++ b/app/src/lib/const.h @@ -0,0 +1,24 @@ + + + +// 
maksymalna dlugosc wyrazu +#define MAX_FORM 80 + +// maksymalna dlugosc opisu +#define MAX_DESC 80 + +// maksymalna dlogosc lini w pliku przejsciowym +#define MAX_LINE 1024 + +// separator pol w pliku posrednim +#define FIELD_SEP " \t\n" + +// maksymalna liczba alternatywnych opisow +#define MAX_ALT 256 + +// plik ze slownikiem dla guessa +#define GUESS_DICT_FILE "slownik.fsa" + +// katalogi z plikami konfiguracyjnymi +#define SYSTEM_CONFIG_DIR "/usr/local/etc/utt" +#define USER_CONFIG_DIR "~/.utt" diff --git a/app/src/lib/iotools.h b/app/src/lib/iotools.h new file mode 100644 index 0000000..c4d20a3 --- /dev/null +++ b/app/src/lib/iotools.h @@ -0,0 +1,53 @@ +#include "const.h" +#include +#include +#include +#include +#include + + +// napisy zostaj na miejscu (w line), tylko wskazniki sa ustawian +// i zara dopisywane zera s dopisywane + +inline +int parsetok(char* line, int* a, int* b, char** c, char** d, char** e, char** f) +{ + char* field; + if((field=strtok(line,FIELD_SEP))!=NULL) + *a=atoi(field); // nie sprawdzana poprawnosc + else + return 0; + if((field=strtok(NULL,FIELD_SEP))!=NULL) + *b=atoi(field); // nie sprawdzana poprawnosc + else return 1; + if((*c=strtok(NULL,FIELD_SEP))==NULL) return 2; + if((*d=strtok(NULL,FIELD_SEP))==NULL) return 3; + if((*e=strtok(NULL,FIELD_SEP))==NULL) return 4; + if((*f=strtok(NULL,FIELD_SEP))==NULL) return 6; + return 6; +} + +// napisy s kopiowane +inline +int scantok(const char* line, int* a, int* b, char* c, char* d, char* e=NULL, char* f=NULL) +{ + return sscanf(line," %d %d %s %s %s %s", a, b, c, d, e, f); +} + +inline +int printtok(char* line, int a, int b, char* c, char* d, char* e, char* f, char* parms) +{ + sprintf(line,"%04d %02d %s %s %s %s `%s\n", a, b, c, d, e, f, parms); +} + +inline +int printtok(char* line, int a, int b, char* c, char* d, char* e, char* f) +{ + sprintf(line,"%04d %02d %s %s %s %s\n", a, b, c, d, e, f); +} + +inline +int printtok(char* line, int a, int b, char* c, char* d) +{ + 
sprintf(line,"%04d %02d %s %s\n", a, b, c, d); +} diff --git a/app/src/lib/matchdescr.cc b/app/src/lib/matchdescr.cc new file mode 100644 index 0000000..ea54655 --- /dev/null +++ b/app/src/lib/matchdescr.cc @@ -0,0 +1,86 @@ +#include +#include + +inline +bool inline_matchattr(const char* a, const char* b) +{ + const char *p, *q; // pomocnicze wskazniki + while(*a && *b) + { + p=a; q=b; + while(isupper(*p) && isupper(*q)) // rowny prefiks + if(*p==*q) ++p, ++q; + else if(*p<*q) // a jest mniejszy + { + // przesywamy a do nastepnego atr + a=p; + while(isupper(*a)) ++a; while(islower(*a)) ++a; + goto end; + } + else + { + // przesuwamy b do nastepnego atr + b=q; + while(isupper(*b)) ++b; while(islower(*b)) ++b; + goto end; + } + + if(islower(*p) && islower(*q)) // rowne atrybuty + { + a=p; b=q; // przesuwamy wskaznik, sprawdzamy wartosci + while(*a != *b) + { + if(*a > *b && !islower(*++b)) return false; + if(*a < *b && !islower(*++a)) return false; + } + // znaleziono rowna wartosc, przesywamy a i b do nast atr + while(isupper(*a)) ++a; while(islower(*a)) ++a; + while(isupper(*b)) ++b; while(islower(*b)) ++b; + goto end; + } + + if(islower(*p)) // a jest krotszy, czyli mniejszy + { // przesuwamy a do nastepnego atrybutu + a=p; + while(islower(*a)) ++a; + goto end; + } + + if(islower(*q)) // b jest krotszy, czyli mniejszy + { // przesuwamy b do nastepnego atrybutu + b=q; + while(islower(*b)) ++b; + goto end; + } + end: ; + } + return true; +} + + +bool matchattr(const char* a, const char* b) +{ + return inline_matchattr(a,b); +} + +bool matchdescr(const char* a, const char* b) +{ + while(isupper(*a) && isupper(*b) && *a==*b) ++a, ++b; + if(*a=='\0') + if(*b=='\0' || *b=='/') return true; + else return false; + + if(*a=='/') + if(*b=='\0') return true; + else if(*b=='/') return inline_matchattr(++a, ++b); + + return false; +} + + +int main() +{ + char a[100], b[100]; + while(scanf("%s %s", a, b)==2) + printf("%s & %s = %d\n", a, b, matchdescr(a,b)); +} diff --git 
a/app/src/lib/matchdescr.h b/app/src/lib/matchdescr.h new file mode 100644 index 0000000..f9ee5d5 --- /dev/null +++ b/app/src/lib/matchdescr.h @@ -0,0 +1,10 @@ + +// obie funkcje wymagaja by deskrypcje byly w postaci kanonicznej +// obslugiwane sa tylko krotkie (jednoliterowe) atrybuty + +// test czy zgadzaja sie deskrypcje +bool matchdescr(const char* a, const char* b); + +// test czy zgadaja sie same atrybuty (czyli to, co po ukosniku) +bool matchattr(const char* a, const char* b); + diff --git a/app/src/lib/symtab.cc b/app/src/lib/symtab.cc new file mode 100644 index 0000000..2cf6421 --- /dev/null +++ b/app/src/lib/symtab.cc @@ -0,0 +1,171 @@ +#include "symtab.h" +#include +#include +#include +#include +//--------------------------------------------------------------------------- + +SymbolTable::SymbolTable(int n, int (*h)(const char*,int), const char* filename) + : _mx(n), _cnt(0), hash(h) +{ + _sz=first(n); + _key=new char*[_sz]; + _defind=new int[_sz]; + _hashind=new int[_sz]; + _def=new char*[_mx]; + for(int i=0; i<_sz; i++) _key[i]=NULL; + if(filename) + add_from_file(filename); +} + +//--------------------------------------------------------------------------- + +SymbolTable::SymbolTable(int n, const char* filename) + : _mx(n), _cnt(0), hash(hash1) +{ + _sz=first(n); + _key=new char*[_sz]; + _defind=new int[_sz]; + _hashind=new int[_sz]; + _def=new char*[_mx]; + for(int i=0; i<_sz; ++i) _key[i]=NULL; + if(filename) + add_from_file(filename); +} + +//--------------------------------------------------------------------------- + +SymbolTable::~SymbolTable() +{ + clear(); + delete[] _key; + delete[] _defind; + delete[] _hashind; + delete[] _def; +} + +//--------------------------------------------------------------------------- + +void SymbolTable::clear() +{ + for(int i=0; i<_sz; ++i) + if(_key[i]) + free(_key[i]); +} + +//--------------------------------------------------------------------------- + +bool SymbolTable::add_from_file(const char* filename) +{ + 
FILE* in=fopen(filename,"r"); + char buf[MAXKEYLEN+1]; + + if(in) + while(fscanf(in,"%s",buf)==1) + { + if(strlen(buf)==MAXKEYLEN || add(buf)<0) + return false; + } + return true; +} + +//--------------------------------------------------------------------------- + +int SymbolTable::add(const char* s) +{ + if(_cnt<_mx) + { + int ind=hash(s,_sz); + while(_key[ind]) + if(strcmp(_key[ind],s)) + ind=++ind%_sz; + else + return _defind[ind]; + _key[ind]=strdup(s); + _defind[ind]=_cnt; + _hashind[_cnt]=ind; + _def[_cnt]=_key[ind]; + _cnt++; + return _cnt-1; + } + else + return -1; +} + +//--------------------------------------------------------------------------- + +int SymbolTable::operator[](const char* s) +{ + int ind=hash(s,_sz); + while(_key[ind]) + if(strcmp(_key[ind],s)==0) + return _defind[ind]; + else + ind=++ind % _sz; + return -1; +} + +//--------------------------------------------------------------------------- + +int SymbolTable::first(unsigned int n) +{ + int fi=n; + int bound=(n/2 < MAXKEYLEN)? 
n/2 : MAXKEYLEN; + bool found; + do + { + found=true; + if(fi++ == MAXINT) return -1; + for(int i=2; i=4) + return abs((*((int*)(s+(l/2-2)))+(int)(*s * s[l-1])) % _sz); + else + { + int i=0; + strcpy((char*)&i,s); + return abs((i+(int)(*s * s[l-1])) % _sz); + } +} + +//--------------------------------------------------------------------------- + +int hash2(const char* s, int _sz) +{ + int l=strlen(s); + if(l>=6) + { + unsigned int i1,i2,i3; + strncpy((char*)&i1,s,sizeof(int)); + strncpy((char*)&i2,s+(l/2-2),sizeof(int)); + strncpy((char*)&i3,s+(l-4),sizeof(int)); + return abs((i1+i2+i3) % _sz); + } + else + { + int i=0; + strncpy((char*)&i,s,sizeof(int)); + return abs((i+(int)(*s * s[l-1])) % _sz); + } +} + +//--------------------------------------------------------------------------- + diff --git a/app/src/lib/symtab.h b/app/src/lib/symtab.h new file mode 100644 index 0000000..d4456a3 --- /dev/null +++ b/app/src/lib/symtab.h @@ -0,0 +1,52 @@ +#ifndef _HashTable_h +#define _HashTable_h +//--------------------------------------------------------------------------- +#include +#include +//--------------------------------------------------------------------------- +int hash1(const char* s, int sz); +int hash2(const char* s, int sz); +//--------------------------------------------------------------------------- + +class SymbolTable +{ + int _mx; + int _sz; + int _cnt; + char** _key; + char** _def; + int* _defind; + int* _hashind; // s tu redundancje + +public: + static const unsigned int MAXKEYLEN=2000; + + SymbolTable(int n, int (*h)(const char*,int), const char* filename=NULL); + SymbolTable(int n, const char* filename=NULL); + ~SymbolTable(); + + void clear(); + + int (*hash)(const char*, int); + + bool add_from_file(const char* filename); + + int add(const char* s); + int operator[](const char* s); + const char* operator[](int i){if(i<0||i>=_cnt)return NULL;else return _def[i];} + int index(const char* s) { return this->operator[](s); }; + int index(int i) { 
if(i<0||i>=_cnt) return -1; else return i; }; + int hash_index(int i) { return _hashind[i]; } + const char* symbol(int i) { if(i<0||i>=_cnt)return NULL; else return _def[i];} + + int capacity() { return _mx; } + int size() { return _sz; } + int count() { return _cnt; } + float search_rate(); + +private: + static int first(unsigned int n); +}; + +//--------------------------------------------------------------------------- +#endif diff --git a/app/src/lib/tft.h b/app/src/lib/tft.h new file mode 100755 index 0000000..1abda6c --- /dev/null +++ b/app/src/lib/tft.h @@ -0,0 +1,879 @@ +#ifndef _TFT_h +#define _TFT_h +//--------------------------------------------------------------------------- +#include +#include +#include +#include + +#include + +//#include "top.h" +#include "ttrans.h" +//--------------------------------------------------------------------------- + +/// Klasa bazowa przetwornika skoczonego. +/** + \remark Po co ta klasa? Co dotyczy samych przej, przenie do TTrans, + reszt wcieli do TFT. +*/ +class FT +{ +public: + FT() : copy_default(false), print_mode(OO), ttn(0) {}; + +//print mode + enum OUTPUT { II, ///< tylko symbole wejciowe + OO, ///< tylko symbole wyjciowe + IOIO, ///< symbol wyjciowy po wejciowym + OIOI, ///< symbol wyjciowy przed wejciowym + IIOO, ///< cae wejcie, potem cae wyjcie + OOII ///< cae wyjcie, potem cae wejcie + + }; + +/// maks dugo cieki + static const unsigned int ftMAXPATH=500; + +/// maks dugo opisu typu symbolu we/wy +/** + \remark Przenie do TTrans +*/ + static const unsigned int ftTYPELEN=32; + +/// specjalny symbol dla wartoci 'epsilon' +/** + \remark Przenie do TTrans +*/ + static const char ftEPSILON='~'; + +/// specialny symbol dla wartoci 'default' +/** + \remark Przenie do TTrans +*/ + static const char ftDEFAULT='@'; + +/// domylny symbol wyjciowy (true-'@', flase-'~') +/** + \remark Przenie do TTrans(???) 
+*/ + bool copy_default; + +/// tryb wyjcia + OUTPUT print_mode; + +/// false, jeli automat nie ma przej + operator bool() { return (bool)ttn; }; + + virtual const char* intype() { return itype; }; + virtual const char* outtype() { return otype; }; + +protected: + +/// liczba elementw tablicy tt + unsigned long ttn; + +/// liczba stanw + unsigned long states; + +/// liczba przej + unsigned long transitions; + +/// typ symboli wejciowych (napis) +/** + \remark Przenie do TTrans(???) +*/ + char itype[ftTYPELEN]; + +/// typ symboli wyjciowych (napis) +/** + \remark Przenie do TTrans(???) +*/ + char otype[ftTYPELEN]; +}; + +//--------------------------------------------------------------------------- + +/// Szablon przetwornika skoczonego +/** + \param I - typ symbolu wejciowego + \param Ipass - typ, jaki ma by uyty przy przekazywaniu symbolu we jako parametru + do funkcji (metody), rwny \a I lub \a I& + \param O - typ symbolu wyjciowego + \param Opass - typ, jaki ma by uyty przy przekazywaniu symbolu wy jako parametru + do funkcji (metody), rwny \a O lub \a O& + \param - typ przejcia, musi by podklas TTrans +*/ +template +class TFT : public FT +{ + + +public: + + TFT() : FT(), tt(NULL) { setiotypes(); }; + +/** +\name Metody poziomu 1 +Poziom przej. +*/ + +//@{ + +/// Test, czy przejcie \a t akceptuje symbol \a in. + bool accepts(long t, Ipass in) const; + +/// Test, czy lista przej dla aktualnego stanu jest kontynuowana po \a t. + bool continued(long t) const; + +/// Stan, do ktrego prowadzi przejcie \a t. +/** + \pre !empty(t) +*/ + long next(long t) const; + +/// Symbol wejciowy przejcia \a t. + Ipass input(long t) const; + +/// Symbol wyjciowy przejcia \a t. + Opass output(long t) const; + +/// Zwraca \c true, jeli symbolem we przejcia \a t jest epsilon. + bool epsi(long t) const; + +/// Zwraca \c true, jeli symbolem we przejcia \a t jest symbol domylny. + bool defi(long t) const; + +/// Zwraca \c true, jeli symbolem wy przejcia \a t jest epsilon. 
+ bool epso(long t) const; + +/// Zwraca \c true, jeli symbolem wy przejcia \a t jest symbol domylny. + bool defo(long t) const; + +/// Indeks przejcia przez \a in. + long tra(long t, Ipass in) const; + +/// Indeks przejcia przez \a in - non-deterministic. + long tra_nd(long t, Ipass in, long nth) const; + +//@} + +/** +\name Poziom 2 +Poziom stanw. Stan (indeks stanu) = indeks jego pierwszego przejcia +*/ +//@{ +/// Zwraca \c true jeli stan \a s jest pusty (nie ma z niego przej). + bool empty(long s) const { return tt[s].empty(); } + +/// Zwraca \c true jeli stan \a s jest stanem kocowym. + bool final(long s) const { return tt[s].final(); } + + long next(long t, Ipass in) const; + +//long trans(const I* si, I* so, long& olen) const; + + long gtra(long s, const I* w, long maxpath=ftMAXPATH) const; + +//@} + +/** +\name Poziom 3 +Poziom ... +*/ +//@{ + long cont(long s=-1, I* c=NULL) const; + + long match(const I* w=NULL, long* p=NULL) const; + + long match_nd(const I* w=NULL, long* p=NULL) const; + + long lgstmatch(const I* w, long* p, long& plen, long maxpath=ftMAXPATH) const; + + /*NOWE*/ + + long lgstpath(I*& buf, long*& path, long start=0) const; + + long pref(I*& buf, I sep, long start=0) const; + +//@} + +protected: + + TT* tt; // tablica przej + + long prn(const I* si, long* p, O* so) const; + + void prntt(ostream& os); + + void sort(); + + void setiotypes(); // NIE DZIAA (dlaczego???) 
+ +// friend ostream& operator<<(ostream&,const CDFA&); +// friend istream& operator>>(istream&,CDFA&); + +private: + long prn_oo(const I* si, long* p, O* so) const; + long prn_ioio(const I* si, long* p, O* so) const; + long prn_oioi(const I* si, long* p, O* so) const; + long prn_iioo(const I* si, long* p, O* so) const; + long prn_ooii(const I* si, long* p, O* so) const; +}; + + +//--------------------------------------------------------------------------- +//--------------------------------------------------------------------------- + +/** + stan = indeks pierwszego przejcia + + state(t) = stan, do ktrego naley t + + symbol zerowy = symbol s, dla ktrego (bool)s zwraca \c false, + w przypadku znakw - '\0' +*/ + +//--------------------------------------------------------------------------- +//--------------------------------------------------------------------------- + + +template +inline +bool TFT::accepts(long t, Ipass in) const +{ return tt[t].accepts(in); } + +/// Test whether the transition list continues after \a t. 
+template +inline +bool TFT::continued(long t) const +{ return tt[t].continued(); } + +/** + \pre !empty(t) +*/ +template +inline +long TFT::next(long t) const +{ return tt[t].next(); } + +template +inline +Ipass TFT::input(long t) const +{ return tt[t].in(); } + +template +inline +Opass TFT::output(long t) const +{ return tt[t].out(); } + +template +inline +bool TFT::epsi(long t) const +{ return tt[t].epsi(); } + +template +inline +bool TFT::defi(long t) const +{ return tt[t].defi(); } + +template +inline +bool TFT::epso(long t) const +{ return tt[t].epso(); } + +template +inline +bool TFT::defo(long t) const +{ return tt[t].defo(); } + +/** + \param +t - indeks przejcia + \param +in - symbol we + \return Indeks przjcia (>=\a t) dla biecego stanu, ktre + akceptuje symbol we \a in lub -1, jeli nie ma takiego przejcia +*/ +template +long TFT::tra(long t, Ipass in) const +{ + if(t<0 || t>=ttn) + return -1; + + if(empty(t)) return -1; + while(!accepts(t,in)) + if(continued(t)) + t++; + else + return -1; + return t; +} + +//--------------------------------------------------------------------------- +/// Indeks przejcia - wersja dla automatu niedeterministycznego. +/** + \param +t - indeks przejcia + \param +in - symbol we + \return Indeks przjcia (>=\a t) dla biecego stanu, ktre + akceptuje symbol we \a in lub -1, jeli nie ma takiego przejcia + Jeli nth==0, t1>=t, w przeciwnym razie t1>t. +*/ +template +long TFT::tra_nd(long t, Ipass in, long nth) const +{ + if(t<0 || t>=ttn) + return -1; + + if(nth) + if(continued(t)) + t++; + else + return -1; + else + { if(empty(t)) return -1; } + + while(!accepts(t,in)) + if(continued(t)) + t++; + else + return -1; + + return t; +} + +//} + +//--------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + + +/// Funkcja przejcia. 
+/** + \param t - stan + \param in - symbol we + \return Stan, do ktrego mona przej z \a t po wpywem symbolu \a in + lub -1, jeli nie ma przejcia przez \a in + +*/ +template +long TFT::next(long t, Ipass in) const +{ + if(t<0 || (unsigned long)t>=ttn) + return -1; + + if(empty(t)) return -1; + while(!accepts(t,in)) + if(continued(t)) + t++; + else { + return -1; + } + + return next(t); +} + +//--------------------------------------------------------------------------- + +//---------------------------------------------------------------------------- +/// Uoglniona funkcja przejscia. +/** + \param +s - stan + \param +w - wskanik pierwszego elementu cigu symboli we, zakoczonego symbolem zerowym + \param maxpath maksymalna dugo cieki, domylnie ftMAXPATH + \return stan osigalny z \a s pod wpywem \a w (na ciece mog si pojawi + epsilon-przejcia +*/ +template +long TFT::gtra(long s, const I* w, long maxpath) const +{ + if(s<0 || (unsigned long)s>=ttn) + return -1; + + long i=0; + while(*w) + { + if(i>maxpath || empty(s)) return -1; + while(!accepts(s,*w)) + if(continued(s)) + s++; + else + return -1; + if(!epsi(s)) w++; + s=next(s); + i++; + } + return s; +} + +//---------------------------------------------------------------------------- + +/// Kontynuacja. +/** +... +\param +s stan, jeli -1 - poszukiwane jest nastpne rozwizanie +\param -c cig symboli we ze cieki prowadzcej z \a s do + stanu kocowego +\return dugo cigu \a c (= dugo cieki) +\remark DZIAA TYLKO DLA ZNAKW!!! + EPSILON-PRZEJCIA NIEDOZWOLONE!!! 
+*/ +template +long TFT::cont(long s, I* c) const +{ + static unsigned long path[ftMAXPATH]={0}; + static unsigned long i=0; + static bool more=false; + + bool found=false; + + if(s!=-1) + { + if(s<0 || (unsigned long)s>=ttn) + more=false; + else + { + i=0; + c[0]=0; + path[0]=s; + more=true; + if(final(s)) + found=true; + } + } + + while(more && !found) + { + if(!empty(path[i]) && i0) + c[--i]=0; + else + more=false; + }while(more && !continued(path[i])); + path[i]=path[i]+1; + } + if(final(path[i])) + { + found=true; + c[i]=0; + } + } + return i; +} + +//---------------------------------------------------------------------------- +/// Dopasowannie. +/** + \remark Nie zaimplementowane. +*/ +template +long TFT::match(const I* w, long* p) const +{} + +//---------------------------------------------------------------------------- +/// Dopasowanie niedeterministyczne. +/** + \param +w - wskanik pierwszego elementu cigu symboli we, zakoczonego symbolem zerowym, + jeli NULL - poszukiwane jest nastpne rozwizanie + \param -p cig przej zakoczony -1 + \return dugo dopasowania (PO CO?) +*/ +template +long TFT::match_nd(const I* w, long* p) const +{ + static bool more=false; + static I *w0, *wc; + static long s=0, *p0, *pc, *pc_bound; + + bool found=false; + + if(w) + { + wc=w0=w; + pc=p0=p; + more=true; + pc_bound=pc+ftMAXPATH; + if(final(s=0)) + { + *pc=-1; return 0; + } + } + + while(more) + { + if(*wc && pc=0) + { if(!epsi(*pc)) wc++; s=next(*pc); pc++; } + else + while(true) + { + if(pc==p0) { more=false; return -1; } + if(!epsi(*(--pc))) wc--; + if((*pc=trand(*pc,*wc,1))>=0) + { if(!epsi(*pc)) wc++; s=next(*pc); pc++; break; } + } + if(final(s)) { *pc=-1; return wc-w0; } + } + return -1; +} + +//---------------------------------------------------------------------------- +/// Najdusze dopasowanie. 
+/** + \param +w wskanik pierwszego elementu cigu symboli wejciowych + \param -p cieka + \param -plen dugo cieki + \param +maxpath maks ddugo cieki, domylnie FT::ftMAXPATH + \return dugo skonsumowanego wejcia +*/ +template +long TFT + ::lgstmatch(const I* w, long* p, long& plen, long maxpath) const +{ + long s=0; + long t; + long i=0; + const char* w0=w; + long ilen=0; + while(*w && i=0) + { + if(!epsi(t)) w++; + s=next(t); + i++; + *(p++)=t; + if(final(s)) { plen=i; ilen=w-w0; } + } + *p=-1; + return ilen; +} + +//---------------------------------------------------------------------------- +/// Najdusza cieka. +/** + \param +buf wskanik pierwszego elementu cigu symboli wejciowych + \param -buf pozycja jeden za skonsumowanym prefiksem + \param +path wskanik pierwszego elementu wektora przej + \param -path wskanik jeden za ostatnim przejciem + \return dugo skonsumowanego prefiksu (PO CO? LEPIEJ D CIEKI) +*/ +template +long TFT + ::lgstpath(I*& buf, long*& path, long start) const +{ + long s=start; + long t; + const char* buf0=buf; + const long* pathlimit=path+FT::ftMAXPATH; + while(*buf && path=0) + { + if(!epsi(t)) buf++; + s=next(t); + *(path++)=t; + } + return buf-buf0; +} + +//---------------------------------------------------------------------------- +/// Najduszy prefiks. +/** + \param +buf wskanik pierwszego elementu cigu symboli wejciowych + \param -buf pozycja jeden za skonsumowanym prefiksem + \param +sep separator + \return stan po przejciu przez \a sep + \remark Dziaa tylko dla automatw deterministycznych, minimalnych, eps-wolnych, + gdzie d. cieki == d. dopasowania. 
+*/ +template +long TFT + ::pref(I*& buf, I sep, long start) const +{ + static long pathtab[ftMAXPATH]; + // static long* path=pathtab; + long* path=pathtab; + static bool more; + + long s; + if(*buf) // pierwsze wywoanie + { + if(!lgstpath(buf,path,start)) + return -1; + --path; + more=true; + } + else // kolejne wywoanie + --buf,--path; + while(more) + if(path>=pathtab) + if((s=next(next(*path),sep))>=0) { + return s; + } + else + --buf, --path; + else + { + more=false; + return -1; + } + return -1; +} + +//---------------------------------------------------------------------------- + +/* +template +long TFT::trans(const I* si, O* so, long& olen) const +{ + long p[ftMAXPATH]; + long ilen; + long plen; + if((ilen=lgstmatch(si,p,plen))>0) + olen=prn(si,p,so); + else + ilen=olen=0; + return ilen; +} +*/ +//---------------------------------------------------------------------------- + +template +long TFT::prn(const I* si, long* p, O* so) const +{ + switch(print_mode) + { + case OO: return prn_oo(si,p,so); + case IOIO: return prn_ioio(si,p,so); + case OIOI: return prn_oioi(si,p,so); + case IIOO: return prn_iioo(si,p,so); + case OOII: return prn_ooii(si,p,so); + } +} + +//---------------------------------------------------------------------------- + +template +long TFT::prn_oo(const I* si, long* p, O* so) const +{ + char* so0=so; + while(*p>=0) + { + long t=*p; + if(!epso(t)) + { + if(defo(t)) + *(so++)=*si; + else + *(so++)=output(t); + } + if(!epsi(t)) si++; + p++; + + } + return so-so0; +} + +//---------------------------------------------------------------------------- + + +template +long TFT::prn_ioio(const I* si, long* p, O* so) const +{ + char* so0=so; + while(*p>=0) + { + long t=*p; + if(!epsi(t)) + *(so++)=*si; + if(!epso(t)) + if(defo(t)) + *(so++)=*si; + else + *(so++)=output(t); + if(!epsi(t)) si++; + p++; + } + return so-so0; +} + + +//---------------------------------------------------------------------------- + +template +long TFT::prn_oioi(const I* si, 
long* p, O* so) const +{ + char* so0=so; + while(*p>=0) + { + long t=*p; + if(!epso(t)) + { + if(defo(t)) + *(so++)=*si; + else + *(so++)=output(t); + } + if(!epsi(t)) + *(so++)=*(si++); + p++; + } + return so-so0; +} + +//---------------------------------------------------------------------------- + +template +long TFT::prn_iioo(const I* si, long* p, O* so) const +{ + const char* si0=si; + long* p0=p; + char* so0=so; + while(*p>=0) + { + long t=*p; + if(!epsi(t)) + { + *(so++)=*si; + si++; + } + p++; + } + si=si0; + p=p0; + while(*p>=0) + { + long t=*p; + if(!epso(t)) + if(defo(t)) + *(so++)=*si; + else + *(so++)=output(t); + if(!epsi(t)) si++; + p++; + } + return so-so0; +} + +//---------------------------------------------------------------------------- + +template +long TFT::prn_ooii(const I* si, long* p, O* so) const +{ + + const char* si0=si; + long* p0=p; + char* so0=so; + while(*p>=0) + { + long t=*p; + if(!epso(t)) + { + if(defo(t)) + *(so++)=*si; + else + *(so++)=output(t); + } + if(!epsi(t)) si++; + p++; + } + si=si0; + p=p0; + while(*p>=0) + { + long t=*p; + if(!epsi(t)) + *(so++)=*(si++); + p++; + } + return so-so0; +} + +//--------------------------------------------------------------------------- + +template +void TFT::sort() +{ + long t=0; + while(t1) + { + long eps=-1; + long def=-1; + for(int i=0; i=0 && epseps) def--; + if(def>=0 && def=0) + { + memmove(tt+t0+def+1,tt+t0+def,tn-eps-2); + tt[t-2]=temp; + } + else + { + memmove(tt+t0+def+1,tt+t0+def,tn-eps-2); + tt[t-1]=temp; + } + } + while(t0 +void TFT::setiotypes() +{ + int i=0; + const char* it=typeid(I).name(); + while(*it) + if(*it==' ') + { it++; continue; } + else + itype[i++]=*(it++); + itype[i]='\0'; + + i=0; + const char* ot=typeid(O).name(); + while(*ot) + if(*ot==' ') + { ot++; continue; } + else + otype[i++]=*(ot++); + otype[i]='\0'; +}; + +//--------------------------------------------------------------------------- + +template +void TFT::prntt(ostream& os) +{ + for(long i=0; i 
+#include +#include +//#include + +#include "tft.h" +//--------------------------------------------------------------------------- + +template +class TFTi : public TFT > +{ +public: + TFTi() : TFT >() {}; + TFTi(const char* filename) + : TFT >() { load(filename); }; + + void read(const char* filename); + void read(istream& is=cin); + void write(const char* filename); + void write(ostream& os=cout); + void load(const char* filename); + void load(FILE* f=stdin); + void save(const char* filename); + void save(FILE* f=stdout); + void clear(); + using TFT >::ttn; + using TFT >::states; + using TFT >::transitions; + using TFT >::itype; + using TFT >::ftTYPELEN; + using TFT >::otype; + using TFT >::tt; + using TFT >::copy_default; + using TFT >::print_mode; + + +// friend istream& operator>>(istream&, TFTi&); +// friend ostream& operator<<(ostream&, const TFTi&); +}; + +//--------------------------------------------------------------------------- + +template +void TFTi::read(const char* filename) +{ + ifstream is(filename); + if(!is) { fprintf(stderr,"Failed to open input file."); exit(1); } + read(is); +} + +template +void TFTi::read(istream& is) +{ + long *si; // state-index relation + long ci=0; // current index + char ch; // character read; + int empty=0; // no of states with 0 trans? 
+ char intype[FT::ftTYPELEN]; + char outtype[FT::ftTYPELEN]; + + clear(); + + is >> states >> transitions >> intype >> outtype; + +// if(strcmp(intype,itype)!=0 || +// strcmp(outtype,otype)!=0 && strcmp(outtype,"void")!=0) +// { is.clear(ios::badbit); goto end; }; + + while(is.peek()==' ' || is.peek()=='\t') is.get(ch); + while(is.peek()!='\n') + { + char s[20]; + is >> s; + if(strcmp(s,"COPY")==0 && strcmp(intype,outtype)==0) copy_default=true; + else if(strcmp(s,"NOCOPY")==0) copy_default=false; + else if(strcmp(s,"II")==0) print_mode=FT::II; + else if(strcmp(s,"OO")==0) print_mode=FT::OO; + else if(strcmp(s,"IOIO")==0) print_mode=FT::IOIO; + else if(strcmp(s,"OIOI")==0) print_mode=FT::OIOI; + else if(strcmp(s,"IIOO")==0) print_mode=FT::IIOO; + else if(strcmp(s,"OIOI")==0) print_mode=FT::OIOI; + while(is.peek()==' ' || is.peek()=='\t') is.get(ch); + } + + ttn=transitions+2; // 1 state without trans., 1 additional + si=new long[states]; + tt=new TTrans_i[ttn]; + + for(long cs=0;cs> cscheck; + if(cs!=cscheck) goto end; + + while(is.peek()==' ' || is.peek()=='\t') is.get(ch); + + is.get(ch); + if(!is) goto end; + switch(ch) + { + case '-': tt[ci].final(false); break; + case '+': tt[ci].final(true); break; + default: goto end; + } + tc=0, tt[ci].continued(false); + + while(is.peek()==' ' || is.peek()=='\t') is.get(ch); + while(is && is.peek()!='\n') + { + switch(is.peek()) + { + case '~': tt[ci].epsi(true); tt[ci].defi(true); is.get(ch); + break; + case '@': tt[ci].epsi(false); tt[ci].defi(true); is.get(ch); + break; + default : tt[ci].geti(is); + } + if(!is) goto end; + if(is.peek()=='/') + { + is.get(ch); + switch(is.peek()) + { + case '~': tt[ci].epso(true); tt[ci].defo(true); is.get(ch); + break; + case '@': tt[ci].epso(false); tt[ci].defo(true); is.get(ch); + break; + default : tt[ci].geto(is); + } + } + else + { + tt[ci].defo(true); + if(copy_default) tt[ci].epso(false); else tt[ci].epso(true); + } + if(!is) goto end; + + unsigned long transition; + is >> 
transition; + tt[ci].next(transition); + + tt[ci].continued(false); + tt[ci].empty(false); + + if(tc>0) tt[ci-1].continued(true); + tc++,ci++; + } + if(tc==0) + { + if(++empty>2) { fprintf(stderr, "Nondeterministic automaton."); exit(1); } + tt[ci].empty(true); + ci++; + } + is.get(ch); + if(ch!='\n') { is.clear(ios::badbit); goto end; } + } + + ttn=transitions+empty; + if(ttn!=ci) { is.clear(ios::badbit); goto end; }; + for(long i=0;i +void TFTi::write(const char* filename) +{ + ofstream os(filename); + if(!os) err("Failed to open output file."); + write(os); +} + +template +void TFTi::write(ostream& os) +{ + os << states << ' ' << transitions << ' '; +// os << itype << ' ' << otype << ' '; + os << "char void"; +// os << (copy_default ? "COPY" : "NOCOPY") << ' '; +// switch(print_mode) +// { +// case FT::II : os << "II"; break; +// case FT::OO : os << "OO"; break; +// case FT::IOIO: os << "IOIO"; break; +// case FT::OIOI: os << "OIOI"; break; +// case FT::IIOO: os << "IIOO"; break; +// case FT::OOII: os << "OOII"; +// } + os << '\n'; + + long* si=new long[ttn]; + long cs=0; + for(long i=0;i +void TFTi::load(const char* filename) +{ + FILE* f; + if(*filename) + f=fopen(filename,"rb"); + else + f=stdin; + if(!f) { fprintf(stderr, "Cannot open automaton file."); return; } + load(f); +} + +template +void TFTi::load(FILE* f) +{ + + clear(); + + if(fread(&ttn,sizeof(ttn),1,f)!=1) { fprintf(stderr, "Binary input error."); return;} + if(fread(&states,sizeof(states),1,f)!=1) { fprintf(stderr, "Binary input error."); return;} + if(fread(&transitions,sizeof(transitions),1,f)!=1) { fprintf(stderr, "Binary input error."); return;} + if(fread(itype,sizeof(char),ftTYPELEN,f)!=ftTYPELEN) { fprintf(stderr, "Binary input error."); return;} + if(fread(otype,sizeof(char),ftTYPELEN,f)!=ftTYPELEN) { fprintf(stderr, "Binary input error."); return;} + if(fread(©_default,sizeof(copy_default),1,f)!=1) { fprintf(stderr, "Binary input error."); return;} + 
if(fread(&print_mode,sizeof(print_mode),1,f)!=1) { fprintf(stderr, "Binary input error."); return;} + if((tt=new TTrans_i[ttn])==NULL) { fprintf(stderr, "Cannot allocate memory for tt."); return;} + if(fread(tt,sizeof(TTrans_i),ttn,f)!=ttn) { fprintf(stderr, "Binary input error."); return; } + fclose(f); + + +} + +//--------------------------------------------------------------------------- + +template +void TFTi::save(const char* filename) +{ + FILE* f; + if(*filename) + f=fopen(filename,"wb"); + else + f=stdout; + if(!f) err("Cannot open file."); + save(f); +} + +template +void TFTi::save(FILE* f) +{ + if(fwrite(&ttn,sizeof(ttn),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); } + if(fwrite(&states,sizeof(states),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); } + if(fwrite(&transitions,sizeof(transitions),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); } + if(fwrite(itype,sizeof(char),ftTYPELEN,f)!=ftTYPELEN) { fprintf(stderr,"Binary output error."); exit(1); } + if(fwrite(otype,sizeof(char),ftTYPELEN,f)!=ftTYPELEN) { fprintf(stderr,"Binary output error."); exit(1); } + if(fwrite(©_default,sizeof(copy_default),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); } + if(fwrite(&print_mode,sizeof(print_mode),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); } + if(fwrite(tt,sizeof(TTrans_i),ttn,f)!=ttn) { fprintf(stderr,"Binary output error."); exit(1); } + fclose(f); +} + +//--------------------------------------------------------------------------- + +template +void TFTi::clear() +{ + if(tt) delete[] tt; + ttn=0; +} + +//--------------------------------------------------------------------------- +/* +template +istream& operator>>(istream& is, TFTi& ft) +{ + long *si; // state-index relation + long ci=0; // current index + char ch; // character read; + int empty=0; // no of states with 0 trans? 
+ char intype[FT::ftTYPELEN]; + char outtype[FT::ftTYPELEN]; + + ft.clear(); + + is >> ft.states >> ft.transitions >> intype >> outtype; + + if(strcmp(intype,ft.itype)!=0 || + strcmp(outtype,ft.otype)!=0 && strcmp(outtype,"void")!=0) + { is.clear(ios::badbit); return is; }; + + while(is.peek()==' ' || is.peek()=='\t') is.get(ch); + while(is.peek()!='\n') + { + char s[20]; + is >> s; + if(strcmp(s,"COPY")==0 && strcmp(intype,outtype)==0) ft.copy_default=true; + else if(strcmp(s,"NOCOPY")==0) ft.copy_default=false; + else if(strcmp(s,"II")==0) ft.print_mode=FT::II; + else if(strcmp(s,"OO")==0) ft.print_mode=FT::OO; + else if(strcmp(s,"IOIO")==0) ft.print_mode=FT::IOIO; + else if(strcmp(s,"OIOI")==0) ft.print_mode=FT::OIOI; + else if(strcmp(s,"IIOO")==0) ft.print_mode=FT::IIOO; + else if(strcmp(s,"OIOI")==0) ft.print_mode=FT::OIOI; + while(is.peek()==' ' || is.peek()=='\t') is.get(ch); + } + + ft.ttn=ft.transitions+2; // 1 state without trans., 1 additional + si=new long[ft.states]; + ft.tt=new TTrans_i[ft.ttn]; + + for(long cs=0;cs> ch; while(ch!='+' && ch!='-'); + switch(ch) + { + case '-': ft.tt[ci].final(false); break; + case '+': ft.tt[ci].final(true); break; + default: return is; + } + tc=0, ft.tt[ci].continued(false); + while((is.get(ch),ch==' ')) + { + if(!is) return is; + switch(is.peek()) + { + case '~': ft.tt[ci].epsi(true); ft.tt[ci].defi(true); is.get(ch); + break; + case '@': ft.tt[ci].epsi(false); ft.tt[ci].defi(true); is.get(ch); + break; + default : ft.tt[ci].geti(is); + } + if(!is) return is; + if(is.peek()=='/') + { + is.get(ch); + switch(is.peek()) + { + case '~': ft.tt[ci].epso(true); ft.tt[ci].defo(true); is.get(ch); + break; + case '@': ft.tt[ci].epso(false); ft.tt[ci].defo(true); is.get(ch); + break; + default : ft.tt[ci].geto(is); + } + } + else + { + ft.tt[ci].defo(true); + if(ft.copy_default) ft.tt[ci].epso(false); else ft.tt[ci].epso(true); + } + if(!is) return is; + + unsigned long transition; + is >> transition; + 
ft.tt[ci].next(transition); + + ft.tt[ci].continued(false); + + ft.tt[ci].empty(false); + if(tc>0) ft.tt[ci-1].continued(true); + tc++,ci++; + } + if(tc==0) + { + if(++empty>2) err("Nondeterministic automaton."); + ft.tt[ci].empty(true); + ci++; + } + if(ch!='\n') { is.clear(ios::badbit); return is; } + } + + ft.ttn=ft.transitions+empty; + if(ft.ttn!=ci) { is.clear(ios::badbit); return is; }; + for(long i=0;i +ostream& operator<<(ostream& os, const TFTi& ft) +{ + os << ft.states << ' ' << ft.transitions << ' ' + << ft.itype << ' ' << ft.otype << ' '; + os << (ft.copy_default ? "COPY" : "NOCOPY") << ' '; + switch(ft.print_mode) + { + case FT::II : os << "II"; break; + case FT::OO : os << "OO"; break; + case FT::IOIO: os << "IOIO"; break; + case FT::OIOI: os << "OIOI"; break; + case FT::IIOO: os << "IIOO"; break; + case FT::OOII: os << "OOII"; + } + os << ' ' << '\n'; + + long* si=new long[ft.ttn]; + long cs=0; + for(long i=0;i +class TFTiv : public TFTi +{ +public: + TFTiv() : TFTi() {}; + TFTiv(const char* filename) : TFTi(filename) {}; +}; + +//--------------------------------------------------------------------------- + +template +class TFTir : public TFTi +{ +public: + TFTir() : TFTi() {}; +}; + +//--------------------------------------------------------------------------- +#endif diff --git a/app/src/lib/ttrans.h b/app/src/lib/ttrans.h new file mode 100755 index 0000000..ce1ed97 --- /dev/null +++ b/app/src/lib/ttrans.h @@ -0,0 +1,204 @@ +#ifndef _TTransi_h +#define _TTransi_h +//--------------------------------------------------------------------------- +#include +//--------------------------------------------------------------------------- + +//! The template for a transition with input and output symbols stored internally. +/*! + A state is identified with the set of its outgoing transitions. + The state index is the index of the first transition for it. + A state with no outgoing transition is represented as an empty transition. 
+*/ +template +class TTrans_i +{ +public: +//private: +//! Input symbol + I i; +//! Output symbol + O o; + +public: + +//! state is final + static const unsigned char BITf=0x01; +//! transition list is continued + static const unsigned char BITc=0x02; +//! no transition + static const unsigned char BITe=0x04; +//! epsilon input + static const unsigned char BITepsi=0x08; +//! default input + static const unsigned char BITdefi=0x10; +//! epsilon output + static const unsigned char BITepso=0x20; +//! default output + static const unsigned char BITdefo=0x40; + +//! Flags + unsigned char flags; + +//! The index of the next state + long nxt; + +//! Input symbol. +//! \return The input symbol of the transition. + Ipass in() const { return i; } + +//! Output symbol. +//! \return The output symbol of the transition. + Opass out() const { return o; } + +//! Set the input symbol. +//! \param in input symbol + void in(Ipass in) { i=in; } + +//! Set the output symbol. +//! \param out output symbol + void out(Opass out) { o=out; } + +//! remark Is this needed? + I& iref() { return i; } + +//! remark Is this needed? + O& oref() { return o; } + +//! Test whether an input symbol is accepted. +//! \remark Simplified. Should rely on a test function provided by the user. + bool accepts(Ipass in) { return defi() || in==i; } + +//! Next state. +//! \return Destination state of the transition. + long next() const { return nxt; }; + +//! Set the next state. +//! \param t destination state of the transition + void next(long t) { nxt=t; }; + +//! Is the state final? +//! \return \c true if the state is final, false otherwise. + bool final() const { return flags&BITf; }; + +//! Set the \b final flag. +//! \param b \c true if the state is final, \c false otherwise. + void final(bool b) { if(b) flags|=BITf; else flags&=~BITf; }; + +//! Is the transition list continued? +//! \return \c true if the transition is not the last transition for the state, +//! \c false otherwise. 
+ bool continued() const { return flags&BITc; }; + +//! Set the \b continuation flag. +//! \param b \c true if the transition is not the last one for the state, \c false otherwise. + void continued(bool b) { if(b) flags|=BITc; else flags&=~BITc; }; + +//! Is the transition empty? +//! \return \c true if the transition is empty (represents a state with no outgoing transitions), +//! \c false otherwise. + bool empty() const { return flags&BITe; }; + +//! Set the \b empty flag. +//! \param b \c true if the transition is empty, \c false otherwise. + void empty(bool b) { if(b) flags|=BITe; else flags&=~BITe; }; + + bool epsi() const { return flags&BITepsi; }; + void epsi(bool b) { if(b) flags|=BITepsi; else flags&=~BITepsi; }; + + bool defi() const { return flags&BITdefi; }; + void defi(bool b) { if(b) flags|=BITdefi; else flags&=~BITdefi; }; + + bool epso() const { return flags&BITepso; }; + void epso(bool b) { if(b) flags|=BITepso; else flags&=~BITepso; }; + + bool defo() const { return flags&BITdefo; }; + void defo(bool b) { if(b) flags|=BITdefo; else flags&=~BITdefo; }; + + void geti(istream&); + void geto(istream&); + +// friend ostream& operator<<(ostream& os, const TTrans_i& t); + +}; + +//--------------------------------------------------------------------------- + +template +void getsym(istream& is, char& c) +{ + is >> c; + if(c=='\\') + { + is.get(c); + switch(c) + { + case 'n':c='\n';break; + case 't':c='\t';break; + } + } +} + +template +void getsym(istream& is, T& s) +{ is >> s; } + +//--------------------------------------------------------------------------- + +template +void TTrans_i::geti(istream& is) +{ getsym(is,iref()); }; + +template +void TTrans_i::geto(istream& is) +{ getsym(is,oref()); }; + +//--------------------------------------------------------------------------- +/* +template +ostream& operator<<(ostream& os, const TTrans_i& t) +{ + os << (t.final() ? '+' : '-'); + os << ' '; + + if(!t.empty()) + { + if(t.defi()) + os << (t.epsi() ? 
'~' : '@'); + else + switch(t.in()) + { + case ' ': os << "\\ "; break; + case '\n': os << "\\n"; break; + case '\t': os << "\\t"; break; + default: os << t.in(); + } + + os << '/'; + + if(t.defo()) + os << (t.epso() ? '~' : '@'); + else + switch(t.out()) + { + case ' ': os << "\\ "; break; + case '\n': os << "\\n"; break; + case '\t': os << "\\t"; break; + default: os << t.out(); + } + + os << ' ' << t.next(); + } + + os << '\n'; + + if(!t.continued()) + os << '\n'; + + return os; +} +*/ + +//--------------------------------------------------------------------------- +#endif + diff --git a/app/src/lib/word.cc b/app/src/lib/word.cc new file mode 100644 index 0000000..0616cd7 --- /dev/null +++ b/app/src/lib/word.cc @@ -0,0 +1,199 @@ +//--------------------------------------------------------------------------- +#include "word.h" +#include "auttools.h" +#include +//--------------------------------------------------------------------------- +//--------------------------------------------------------------------------- + +void Word::autodescr(const char* fo, const char* de) +{ + strcpy(f,fo); + // len=strlen(f); + + char lemd[MAXDESCRLEN]; + int i=strcspn(de,","); + strncpy(lemd,de,i); + lemd[i]='\0'; + if(isdigit(lemd[0])) + fullform(f,lemd,l); // jeli lemat zakodowany + else + strcpy(l,lemd); // jeli lemat w penej postaci + strcpy(d,de+i+1); +} + +//--------------------------------------------------------------------------- +int Word::cmp_w(Word a, Word b) { + return (a.w_suf() > b.w_suf()); +} +//--------------------------------------------------------------------------- + +istream& operator>>(istream& is, Word& w) +{ + char temp[Word::MAXLEN+1]; + char c; + + int i=0; + while(i'; + return os; +} + +//--------------------------------------------------------------------------- +Words::~Words() { + // for (int i=0; imax && !tab[i].returned) { + max = w; + result = i; + } + } + if (result != -1) + tab[result].returned = 1; + return result; +} + 
+//--------------------------------------------------------------------------- +void Words::sort() { + std::sort(tab.begin(), tab.end(), Word::cmp_w); +} + +//--------------------------------------------------------------------------- + +int Words::add(const char* fo) +{ + int i = find(fo); + if(i!=-1) { + return i; + } + + if (cnt>=tab.capacity()-1) + tab.resize(tab.size()*2); + + tab[cnt].form(fo); + tab[cnt].w_suf(0.0); + + // if(cntform(fo); + tab[cnt]->w_suf(0.0); + tab[cnt]->w_pref(0.0);*/ + return cnt++; + // } + //return -1; +} + +//--------------------------------------------------------------------------- + +int Words::add(const char* fo, const char* des) +{ + char d[Word::MAXDESCRLEN]; + int l=strcspn(des,","); + int ok=1; + if( *(des+l) == ',' ) + { + strcpy(d,des+l+1); + // printf("\t%s->%s,\n", des, d); + int i=find(fo, d); + if(i!=-1) + return i; + } + else + ok=0; + + if (cnt>=tab.capacity()-1) + tab.resize(tab.size()*2); + + tab[cnt].form(fo); + if(ok) + tab[cnt].autodescr(fo, des); + else + tab[cnt].autodescr(fo, "?,?"); + + tab[cnt].w_suf(0.0); + tab[cnt].returned = 0; + /* + // if(cntform(fo); + tab[cnt]->autodescr(fo,des); + tab[cnt]->w_suf(0.0); + tab[cnt]->w_pref(0.0); + // printf("ok!\n");*/ + return cnt++; + // } + // printf("hm\n"); + return -1; +} + +//--------------------------------------------------------------------------- +void Words::prn(ostream& os) +{ + for(int i=0; i"; +} + +//--------------------------------------------------------------------------- + +ostream& operator<<(ostream& os, Words& tab) +{ + /* for(int i=0; i + +#include +//--------------------------------------------------------------------------- + +using namespace std; + +class Word +{ +public: + static const int MAXLEN=64; // dac do global + static const int MAXDESCRLEN=80; // dac do global + +private: + /// word form + char f[MAX_FORM]; // w wolnej chwili nazwy mozna zamienic na dluzsze + + /// length + int _len_suf; // dlugosc dopasowania koncowki... 
+ // int _len_pref; // ... i prefiksu + + /// lemma + char l[MAX_FORM]; + + /// description + char d[MAX_DESC]; + + /// weight (probability) + float _w_suf; + // float _w_pref; +public: + static int cmp_w(Word a, Word b); + + Word() : _len_suf(-1) { *f='\0'; returned=0; }; + Word(const char* fo, const char* des) : _len_suf(-1) { autodescr(fo,des); _w_suf=1.0; returned=0; }; + + Word(const Word& w); + + char* form() { return f; } // przywrocic const + char* lemma() { return l; } // przywrocic const + char* descr() { return d; } + float w_suf() { return _w_suf; }; + int len_suf() { return _len_suf; } + + + void form(const char* s) { strcpy(f,s); } + void lemma(const char* s) { strcpy(l,s); } + void descr(const char* s) { strcpy(d,s); }; + void w_suf(float x) { _w_suf=x; }; + void len_suf(int n) { _len_suf=n; }; + + bool operator==(const Word& w); + bool operator!=(const Word& w); + int cmp(const Word&); + int cmpi(const Word&); + + char* operator!() { return f; }; + + operator bool() { return _len_suf>0; }; + + char* str() { return f; } + + void autodescr(const char* fo, const char* des); + + friend istream& operator>>(istream& is, Word& m); + friend ostream& operator<<(ostream& os, Word& m); + + bool returned; + +}; + +inline Word::Word(const Word& word) +{ strcpy(f,word.f); strcpy(l,word.l); strcpy(d,word.d); _len_suf=word._len_suf; _w_suf=word._w_suf; returned = 0; } + +//--------------------------------------------------------------------------- + +inline bool Word::operator==(const Word& w) +{return _len_suf==w._len_suf && + !strcmp(f,w.f) && !strcmp(l,w.l) && !strcmp(d,w.d); } + +//--------------------------------------------------------------------------- + +inline bool Word::operator!=(const Word& w) +{return _len_suf!=w._len_suf || + strcmp(f,w.f) || strcmp(l,w.l) || strcmp(d,w.d);} + +//--------------------------------------------------------------------------- + +inline int Word::cmp(const Word& w) { return strcmp(f,w.f); } + 
+//--------------------------------------------------------------------------- + +//inline int Word::cmpi(const Word& w) { return PL.cmpi(f,w.f); } + +//--------------------------------------------------------------------------- +//--------------------------------------------------------------------------- +//--------------------------------------------------------------------------- + +class Words +{ + private: + int find(const char* word); + int find(const char* word, const char* descr); + public: + + static const int MAX=1024; + + Words() : cnt(0) {tab.resize(MAX); }; + ~Words(); + Word& operator[](int i) { return tab[i]; } + int count() const { return cnt; } + void clear() { cnt=0; tab.clear(); } + int add(const char* fo); + int add(const char* fo, const char* des); + + /* zwraca index nastepnego wyniku, podczas pierwszego wywolania + * zwraca index wyniku o najwiekszej wadze, przy drugim wywolaniu + * wynik z druga najwyzsza waga, itd. + * Jezeli nie ma juz wynikow - zwraca -1. + */ + int next(); + + void sort(); + + void prn(ostream& os); + +// friend class Lem; +// friend class AuxLem; + friend ostream& operator<<(ostream& os, Words& tab); + vector tab; + int cnt; + +}; + +//--------------------------------------------------------------------------- + +#endif + diff --git a/app/src/mar/Makefile b/app/src/mar/Makefile new file mode 100644 index 0000000..63da335 --- /dev/null +++ b/app/src/mar/Makefile @@ -0,0 +1,6 @@ +main: + +copy: +ifdef UTT_BIN_DIR + cp mar ${UTT_BIN_DIR} +endif diff --git a/app/src/mar/mar b/app/src/mar/mar new file mode 100755 index 0000000..35318ad --- /dev/null +++ b/app/src/mar/mar @@ -0,0 +1,262 @@ +#!/usr/bin/perl + +#package: UAM Text Tools +#component name: mrk +#author: Marcin Walas + +#this program tags the tokenized file with given tags +#tags can be given in any order and configuration through the expression +#which is one of the parametres of the script +#contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com + +use 
strict; +use Getopt::Long; + +use attr; + +Getopt::Long::Configure('no_ignore_case_always'); + +my $help=0; +my $pattern=0; +my $macrofile=0; +my $define=0; +my $command=0; +my $action="pgP"; +my $eos="seg(EOS)"; +my $explicit_space=0; + +#this is our help function to cut the re to get another tag +#it takes only one argument which is our patern (after m4 processing) +#returns: the first root-level brace with content +sub cutRe +{ + my $i = 0; + my $level = 0; + my $text = $_[0]; + my $temp; + for( $i =0; $i < (length $text);$i++) + { + $temp = substr($text, $i,1); + if( $temp eq "(") + {#we have an opening + $level++; + } + elsif ( $temp eq ")") + {#we close + $level--; + } + if ( $level == 0) + { + $temp = substr($text,0,$i+1); + last; + } + } + $temp; +} + +#the same function as above althought it returns everything after the +#first root level brace +sub restRe +{ + my $i = 0; + my $level = 0; + my $text = $_[0]; + my $temp; + for( $i =0; $i < (length $text);$i++) + { + $temp = substr($text, $i,1); + if( $temp eq "(") + {#we have an opening + $level++; + } + elsif ( $temp eq ")") + {#we close + $level--; + } + if ( $level == 0) + { #we cut everything in the begining + $temp = substr($text,$i+1); + last; + } + } + $temp; +} + +GetOptions("pattern|e=s" => \$pattern, + "eos|E=s" => \$eos, + "macros=s" => \$macrofile, + "define=s" => \$macrofile, + "command" => \$command, + "action=s" => \$action, + "help|h" => \$help, + "space|s" => \$explicit_space + ); + +if($help) +{ + print <<'END' +Usage: mar [OPTIONS] [file ..] + +Options: + --pattern -e PATTERN Pattern. + --bos -E PATTERN Segment serving as sentence beginning marker. [TODO] + --macros=FILE Read macrodefinitions from FILE. [TODO] + --define=FILE Add macrodefinitions from FILE. [TODO] + --action -a [p][s][P] Perform only indicated actions. + p - preprocess + s - search + P - postprocess + (default pgP) + --command Print generated shell command and exit. + --help -h Print help. 
+ +In patern you can put any tag. Tags should begin with the @ character. +They don't have to be closed. +They can't contain white spaces! + +Note: If you don't define any custom tags, whole pattern will be taged with + default tags (begining of match and end of match) + +Tags examples: + +mar -e '@BEG cat() @END' + it will find any adjectives in the text and tag them with surrounding tags +mar -e 'cat() @MYTAG cat()' + this will find two neighbouring adjectives and parcel them with tag MYTAG + +Some example patterns: +'word(domu)' - form of the word domu +'lexeme(dom)' - any form of lexeme dom +'space' - space +'cat()' - adjective + +You can use * in patterns to make zero or more counts of word. + +END +; + exit 0; +} + +die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/; + +die("$0: macro file not found") unless -e "terms.m4" and $macrofile="terms.m4"; + +my $preproc = ($action =~ /p/) ? ' fla | ' : ''; + +my $postproc = ($action =~ /P/) ? ' | unfla ' : ''; + +#here we are preparing re for extended matching +my @tags; + +#we must find what our the tags +#some pattern adjustment +my $end = 0; +my $temp = " ".$pattern." "; +$temp =~ s/(\@[^ ]*) (\@[^ ]* )/\1 \2/g; +$pattern = $temp; + +while ($end != 1) +{ + #we seek for the first tag in pattern + if ($temp =~ /^.*?\@(.*?) /) + { + #we add this to tags array + push (@tags, $1); + #and cut the pattern + $temp =~ s/^.*?\@(.*?) / /; + #print $temp."\n"; + } + else + { + #if we dont find any tags we end + $end = 1; + } +} + +#here we have our patern with tags removed (we set sections of ()) between tags +my $patternmod = "( ".$pattern." )"; +$patternmod =~ s/\s@.*?\s/\)\(/g; + +#discarding spaces +$patternmod =~ s/\s+/\\`'/g; #` +# quoting escaped commas +$patternmod =~ s/\\,/\\`\\`\\,''/g; +# quoting commas in {m,n} r.e. 
operator +$patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g; +#print "After m4:".$re."\n"; +my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' $macrofile - 2>/dev/null`; + +die("Incorrect pattern (m4).") if $? >> 8; + + +chomp $re; + +# <> expansion + +$re =~ s/<([^>]+)>/`echo $1 | .\/terms\.tag2re`/ge; + +# Perl-like special sequences +$re =~ s/\./[^ [:cntrl:]]/g; +$re =~ s/\\s/[ ]/g; +$re =~ s/\\S/[^ [:cntrl:]]/g; +$re =~ s/\\d/[0-9]/g; +$re =~ s/\\D/[^0-9 [:cntrl:]]/g; +$re =~ s/\\w/[a-z󶼿A-ZʣӦ0-9_]/g; +$re =~ s/\\W/[^a-z󶼿A-ZʣӦ0-9_ [:cntrl:]]/g; +# extensions +$re =~ s/\\l/[a-z󶼿]/g; #lowercase letter +$re =~ s/\\L/[A-ZʣӦ]/g; #upercase letter + +my $sedcommand; +my $grepcommand; + +#now we must built a sed script from our re +#we do this by cuting our re each tag until we cut them all +#if an user dint input any tags we do our default +my $defBOM = "BOM"; +my $defEOM = "EOM"; +my $defTempTagBeg = "####TempTAGBEG####"; +my $defTempTagEnd = "####TempTAGEND####"; + +if (@tags == 0) +{ + $sedcommand = "sed -r 's/($re)/\\500 $defBOM *\\f\\1###EOM###/g; s/###EOM###([0-9]+)/\\1 00 $defEOM *\\f\\1/g'"; +} +else #we have custom tags +{ + #first tag is easy to tag :) + my $sedscript="sed -r 's/($re)/\\600 $defTempTagBeg *\\f\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $defTempTagEnd *\\f\\1/g;"; + #after first step we have temp tagged parts of input matching re + #now we need to insert our custom tags + #we will find temp tags and process our input + + my $i = 0; + #copy of re which will be cut + my $rec = $re; + my $restre = $re; + + for ($i = 0 ; $i < @tags ; $i++) + { + #re cutting + $rec = cutRe($restre); + $restre = restRe($restre); + if ($rec =~ / *\( *\) */) + { + $sedscript = $sedscript."s/([0-9]+) 00 $defTempTagBeg \\*\\f([0-9]+)/\\2 00 $tags[$i] *\\f\\2 00 $defTempTagBeg *\\f\\2/g;"; + } + else + { + $sedscript = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f($rec)/\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $tags[$i] *\\f\\1 00 $defTempTagBeg 
*\\f\\1/g;"; + } + + } + + $sedcommand = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f//g;s/[0-9]+ 00 $defTempTagEnd \\*\\f//g'"; +} + +if($command) +{ + print $sedcommand."\n"; + exit 0; +} +exec $preproc.$sedcommand.$postproc; diff --git a/app/src/rm12/Makefile b/app/src/rm12/Makefile new file mode 100644 index 0000000..98bf295 --- /dev/null +++ b/app/src/rm12/Makefile @@ -0,0 +1,6 @@ +main: + +copy: +ifdef UTT_BIN_DIR + cp rm12 ${UTT_BIN_DIR} +endif diff --git a/app/src/rm12/rm12 b/app/src/rm12/rm12 new file mode 100755 index 0000000..09c7d2b --- /dev/null +++ b/app/src/rm12/rm12 @@ -0,0 +1,3 @@ +#!/bin/bash + +sed -r '/[0-9]+[ \t]+[0-9]+[ \t]+BOS/! s/[0-9]+[ \t]+[0-9]+[ \t]//' diff --git a/app/src/rs12/Makefile b/app/src/rs12/Makefile new file mode 100644 index 0000000..9c45a39 --- /dev/null +++ b/app/src/rs12/Makefile @@ -0,0 +1,12 @@ +main: rs12 + +rs12: rs12.c + gcc -static -o rs12 rs12.c + +clean: + rm rs12 + +copy: +ifdef UTT_BIN_DIR + cp rs12 ${UTT_BIN_DIR} +endif diff --git a/app/src/rs12/rs12.c b/app/src/rs12/rs12.c new file mode 100644 index 0000000..6d051b7 --- /dev/null +++ b/app/src/rs12/rs12.c @@ -0,0 +1,48 @@ +#include +#include + +#define MAXLINE 1000 + +main() +{ + char buf[MAXLINE+1], outbuf[MAXLINE+1]; + char form[MAXLINE+1]; + int len; + int curpos,nextpos=0; + int a,b; + while(fgets(buf,MAXLINE,stdin)) + { + int n=sscanf(buf,"%d %d",&a,&b); + if(n==2) + { + nextpos=a+b; + fputs(buf,stdout); + } + else + { + if(n==1) + { + curpos=a; + sscanf(buf,"%*d %*s %s",form); + } + else + { + curpos=nextpos; + sscanf(buf,"%*s %s",form); + } + + if(*form == '*') + len=0; + else + { + char *f = form; + for(len=0; *f; ++f) if(*f != '\\') ++len; + } + + char *buf1=buf; while(!isalpha(*buf1)) ++buf1; + sprintf(outbuf,"%04i %02i %s", curpos, len, buf1); + fputs(outbuf,stdout); + nextpos = curpos+len; + } + } +} diff --git a/app/src/sen-l/Makefile b/app/src/sen-l/Makefile new file mode 100644 index 0000000..5a3601e --- /dev/null +++ b/app/src/sen-l/Makefile @@ 
-0,0 +1,15 @@ + + +sen: sen.l + flex -osen.c sen.l + cc -O3 -o sen sen.c -lfl + +copy: +ifdef UTT_BIN_DIR + cp sen ${UTT_BIN_DIR} +endif + +clean: + rm sen.c sen + +uninstall: diff --git a/app/src/sen-l/sen.l b/app/src/sen-l/sen.l new file mode 100644 index 0000000..4128e15 --- /dev/null +++ b/app/src/sen-l/sen.l @@ -0,0 +1,80 @@ +%{ + int pos=0,len=0; + + void set_position(); +%} + +ul [A-ZʣӦ] +ll [a-z󶿼] +l ul|ll +n [0-9]+ +s [ \t]+ + + +ab1 (mgr|in|prof|hab|doc|dyr|kier|zast) +ab2 (ul|pl|al) + +abrv (ab1|ab2) + +SEG .*\n +N {n}{s}{n}{s}N{s}.*\n +S {n}{s}{n}{s}S{s}.*\n +P {n}{s}{n}{s}P{s}.*\n +W {n}{s}{n}{s}W{s}.*\n +UL {n}{s}{n}{s}W{s}{ul}.*\n +Cap {n}{s}{n}{s}W{s}{ul}{ll}*.*\n +POINT {n}{s}{n}{s}P{s}\.({s}.*)?\n +QMARK {n}{s}{n}{s}P{s}\?({s}.*)?\n +EXCL {n}{s}{n}{s}P{s}\!({s}.*)?\n +DASH {n}{s}{n}{s}P{s}\-({s}.*)?\n +POINTS {POINT}+ + +ABRV {n}{s}{n}{s}W{s}{abrv}({s}.*)?\n + +EOS {POINT}|{POINTS}|{QMARK}|{EXCL} + + +%% + + +{N}({POINT}{N})+ ECHO; set_position(); +({UL}{POINT}{S}?)+{Cap} ECHO; set_position(); +{ABRV}{POINT} ECHO; set_position(); + + +{P}/{S}{DASH} ECHO; set_position(); print_EOS(); +{EOS}/{S}({Cap}|{P}|{N}) ECHO; set_position(); print_EOS(); + +.* ECHO; set_position(); + +<> printf("%04d 00 EOS *\n",pos+len); exit(1); + +%% + +int main() +{ + printf("0000 00 BOS *\n"); + yylex(); +} + +int yywrap() +{ + return 1; +} + +void set_position() +{ + char *lastseg, *tmp; + yytext[yyleng-1]='\0'; + if(tmp=strrchr(yytext,'\n')) + lastseg=tmp+1; + else + lastseg=yytext; + sscanf(lastseg,"%d %d", &pos, &len); + yytext[yyleng-1]='\n'; +} + +int print_EOS() +{ + printf("%04d 00 EOS *\n%04d 00 BOS *\n",pos+len,pos+len); +} diff --git a/app/src/sen-nl/Makefile b/app/src/sen-nl/Makefile new file mode 100644 index 0000000..4e0c129 --- /dev/null +++ b/app/src/sen-nl/Makefile @@ -0,0 +1,9 @@ + +sen-nl: + +copy: +ifdef UTT_BIN_DIR + cp sen-nl ${UTT_BIN_DIR} +endif + +clean: diff --git a/app/src/sen-nl/sen-nl b/app/src/sen-nl/sen-nl new file mode 100755 index 
0000000..cecd541 --- /dev/null +++ b/app/src/sen-nl/sen-nl @@ -0,0 +1,3 @@ +#!/bin/bash + +sed -r '1 s/^(([0-9]+)[ \t][0-9]+[ \t].*)$/\2 00 BOS \*\n\1/;t;$! s/(([0-9]+)[ \t][0-9]+[ \t][[:alpha:]]+[ \t]+[[:print:]]*\\n.*)$/\2 00 EOS *\n\2 00 BOS *\n\1/; $ s/^(([0-9]+) .*)$/\1\n\2 00 EOS */' diff --git a/app/src/ser/Makefile b/app/src/ser/Makefile new file mode 100644 index 0000000..a0ea0fe --- /dev/null +++ b/app/src/ser/Makefile @@ -0,0 +1,11 @@ + +ser: + +copy: +ifdef UTT_BIN_DIR + cp ser ${UTT_BIN_DIR} +endif + +clean: + +uninstall: diff --git a/app/src/ser/ser b/app/src/ser/ser new file mode 100755 index 0000000..a203bdf --- /dev/null +++ b/app/src/ser/ser @@ -0,0 +1,168 @@ +#!/usr/bin/perl + +#package: UAM Text Tools +#component: ser (pattern search tool) +#author: Tomasz Obrbski + +use strict; +use Getopt::Long; + +my $SHARE_DIR="/usr/share/utt"; +my $USER_DIR="$ENV{HOME}/.utt/share"; + +#use lib "$ENV{HOME}/.utt/lib/perl"; +#use attr; + +Getopt::Long::Configure('no_ignore_case_always'); + +my $help=0; +my $pattern=0; +my $only_matching=0; +my $no_markers=0; +my $macros=0; +my $flextemplate=0; +my $flex=0; +my $morfield='lem'; + +my $configfile1="../../conf/ser.conf"; +my $configfile2="../conf/ser.conf"; + +#read configuration files########################### +my $file; +foreach $file ($configfile1, $configfile2){ + if(open(CONFIG, $file)){ + while () { + chomp; + s/#.*//; + s/^\s+//; + s/\s+$//; + next unless length; + my ($name, $value) = split(/\s*=\s*/, $_, 2); + if(($name eq "pattern")or($name eq "e")){ + $pattern=$value; + } + elsif($name eq "morph"){ + $morfield=$value; + } + elsif(($name eq "only-matching")or($name eq "m")){ + $only_matching=1; + } + elsif(($name eq "no-markers")or($name eq "M")){ + $no_markers=1; + } + elsif($name eq "define"){ + $macros=$value; + } + elsif($name eq "flex-template"){ + $flextemplate=$value; + } + elsif($name eq "flex"){ + $flex=1; + } + elsif(($name eq "help")or($name eq "h")){ + $help=1; + } + + } + close CONFIG; + } 
+} +######################################################### + +GetOptions("pattern|e=s" => \$pattern, + "morph=s" => \$morfield, + "only-matching|m" => \$only_matching, + "no-markers|M" => \$no_markers, + "define=s" => \$macros, + "flex-template=s" => \$flextemplate, + "flex" => \$flex, + "help|h" => \$help); + +if($help) +{ + print <<'END' +Usage: ser [OPTIONS] [file ..] + +Options: + --help -h Help. + --pattern=PATTERN -e PATTERN Search pattern. + --morph=STRING Field containing morphological information (default 'lem'). + --define=FILE Read macrodefinitions from FILE. + --flex-template=FILE Read flex code template from FILE. + --only-matching -m Print only fragments matching PATTERN. + --no-markers -M Do not print BOM and EOM markers [TODO]. + --flex Print only the generated flex code and exit. +END +; + exit 0; +} + +die("$0: no pattern given.\n") unless $pattern; + +die("$0: flex template file not found") unless + $flextemplate or + -e "$USER_DIR/ser.l.template" and $flextemplate="$USER_DIR/ser.l.template" or + -e "$SHARE_DIR/ser.l.template" and $flextemplate="$SHARE_DIR/ser.l.template"; + +die("$0: macro file not found") unless + $macros or + -e "$USER_DIR/terms.m4" and $macros="$USER_DIR/terms.m4" or + -e "$SHARE_DIR/terms.m4" and $macros="$SHARE_DIR/terms.m4"; + + +#$pattern =~ s/cat\(([^)]+)\)/'cat('.pre($1).')'/ge; +# quoting escaped commas /DOES NOT WORK/ - NOTE(review): possibly because the backslash substitution below also rewrites the backslashes inserted here; confirm +$pattern =~ s/\\,/\\`\\`\\,''/g; + +# protecting backslash +$pattern =~ s/\\/\\\\\\/g; + +# discarding spaces +$pattern =~ s/\s+/\\`'/g; #` + +my $flexpattern = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT=\\\\n --define=MORFIELD=$morfield $macros - 2>/dev/null`; + +die("Incorrect pattern (m4).") if $? >> 8; + + +chomp $flexpattern; + +# <> expansion +$flexpattern =~ s/<([^>]+)>/`echo $1 | tag2re`/ge; + +# restricting the value of the . 
special symbol +$flexpattern =~ s/\./[^ \\t\\n\\r\\f]/g; + +# perl-like shortcuts for character classes +# perl exact +$flexpattern =~ s/\\s/[ \\t]/g; +$flexpattern =~ s/\\S/[^ \\t\\n\\r\\f]/g; +$flexpattern =~ s/\\d/[0-9]/g; +$flexpattern =~ s/\\D/[^0-9 \\t\\n\\r\\f]/g; +$flexpattern =~ s/\\w/[a-z󶼿A-ZʣӦ0-9_]/g; +$flexpattern =~ s/\\W/[^a-z󶼿A-ZʣӦ0-9_ \\t\\n\\r\\f]/g; +# extensions +$flexpattern =~ s/\\l/[a-z󶼿]/g; #lowercase letter +$flexpattern =~ s/\\L/[A-ZʣӦ]/g; #upercase letter + +# protecting slash +$flexpattern =~ s/\//\\\//g; + +my $defaultaction = ($only_matching) ? '' : 'ECHO'; + +# docelowo posrednie pliki powinny byc w jakims tempie !!! + +system "m4 \"--define=PATTERN=$flexpattern\" \"--define=DEFAULTACTION=$defaultaction\" $flextemplate > $USER_DIR/ser.l"; + +if($flex) +{ + system "cat $USER_DIR/ser.l"; + exit 0; +} + +system "flex -o$USER_DIR/ser.c $USER_DIR/ser.l"; +system "cc -O3 -o $USER_DIR/ser.executable $USER_DIR/ser.c -lfl"; +system "$USER_DIR/ser.executable"; + +system "rm -f $USER_DIR/ser.{l,c,executable}"; + diff --git a/app/src/tags/Makefile b/app/src/tags/Makefile new file mode 100644 index 0000000..1775f12 --- /dev/null +++ b/app/src/tags/Makefile @@ -0,0 +1,8 @@ +main: + +copy: +ifdef UTT_TAGS_DIR + cp *.tag2re ${UTT_TAGS_DIR} +endif + +clean: diff --git a/app/src/tags/README b/app/src/tags/README new file mode 100644 index 0000000..885f748 --- /dev/null +++ b/app/src/tags/README @@ -0,0 +1,5 @@ +In this directory files specific to different tag formats are stored. + +TAGSET.tag2re are command-line programs which translate a tag constraint +specification into a character-level regular expression matching +all tags in the TAGSET format meeting the specified constraint. 
diff --git a/app/src/tags/ipi.tag2re b/app/src/tags/ipi.tag2re new file mode 100755 index 0000000..f9bf1ab --- /dev/null +++ b/app/src/tags/ipi.tag2re @@ -0,0 +1 @@ +#TODO diff --git a/app/src/tags/uam.tag2re b/app/src/tags/uam.tag2re new file mode 100755 index 0000000..1e3e591 --- /dev/null +++ b/app/src/tags/uam.tag2re @@ -0,0 +1,83 @@ +#!/usr/bin/perl + +use locale; + +my $input = <>; +chomp $input; + +our $pos_re = qr/(?:[[:upper:]]+)/; +our $attr_re = qr/(?:[[:upper:]]+)/; +our $val_re = qr/(?:[[:lower:][:digit:]+?!*-]|<[^>\n]+>)/; +our $av_re = qr/(?:$attr_re$val_re+)/; +our $avlist_re = qr/(?:$av_re+)/; +our $cat_re = qr/(?:$pos_re(?:\/$avlist_re)?)/; + +print pre($input); + +sub parse ($) +{ + my ($dstr)=@_; + my $avs={}; + my ($cat,$attrlist) = split '/', $dstr; + ATTR: + while( $attrlist =~ /($attr_re)($val_re+)/g ) + { + my ($attrstr,$valstr)=($1,$2); + my %vals; + while($valstr =~ /$val_re/g) + { + my $val = $&; + next ATTR if $val eq '*'; + $val =~ s/^<([[:lower:]])>$/$1/; + $vals{$val}=1; + } + + $avs->{$attrstr} = \%vals; # why does this work? %vals is local - presumably because every loop iteration gets a fresh 'my %vals' and the stored reference keeps it alive + } + [$cat, $avs]; +} + +sub unparse (\@) +{ + my ($cat,$avs)= @{shift @_}; + my $dstr=$cat; + my @attrs = keys %$avs; + if(@attrs) + { + $dstr .= '/'; + for my $attr ( sort @attrs ) + { + $dstr .= $attr . 
(join '', sort keys %{$avs->{$attr}}); + } + } + $dstr; +} + +sub canonize ($) +{ + unparse @{parse shift} ; +} + +sub pre +{ + my $pos_res = '[[:upper:]]+'; + my $attr_res = '[[:upper:]]+'; + my $val_res = '[[:lower:][:digit:]+?!*-]|<[^>\n[:cntrl:]]+>'; + my $av_res = "$attr_res($val_res)+"; + my $avlist_res = "($av_res)+"; + + my $pat = canonize(shift); + my $ret; + my ($pos,$avlist) = split /\//, $pat; + $ret = $pos.'(\/'; + while ($avlist =~ /($attr_res)(${val_res}+)/g) + { + my $attr = $1; + my $vals = $2; + my $vals = "($val_res)*(".join('|',($vals =~ /$val_res/g)).")($val_res)*"; + $ret .= "($av_res)*$attr$vals"; + } + $ret .= "($av_res)*)?"; + return $ret; +} + diff --git a/app/src/tok.c/Makefile b/app/src/tok.c/Makefile new file mode 100644 index 0000000..6b0efba --- /dev/null +++ b/app/src/tok.c/Makefile @@ -0,0 +1,37 @@ +PAR=-Wno-deprecated -O3 +PAR2=-c -Wno-deprecated -O3 +LIB_PATH=../lib +CMDLINE_FILE='"../tok.c/cmdline.h"' + + +tok: tok.o cmdline.c common_tok.o common.o + g++ $(PAR) tok.c cmdline.c common.o common_tok.o -o tok + +tok.o: tok.c cmdline.h + g++ $(PAR2) tok.c + +common_tok.o: cmdline.h common_tok.cc common_tok.h + g++ $(PAR2) common_tok.cc + +common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc\ + $(COMMON_PATH)/common.h + g++ $(PAR2) -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc + +cmdline.ggo: cmdline_tok.ggo ../common/cmdline_common.ggo + cat cmdline_tok.ggo ../common/cmdline_common.ggo > cmdline.ggo + +cmdline.c cmdline.h: cmdline.ggo + gengetopt -i cmdline.ggo --conf-parser + + +copy: +ifdef UTT_BIN_DIR + cp tok ${UTT_BIN_DIR}/ +endif + + +clean: clean.cmdline + rm *.o + +clean.cmdline: + rm cmdline.* diff --git a/app/src/tok.c/cmdline_tok.ggo b/app/src/tok.c/cmdline_tok.ggo new file mode 100644 index 0000000..8316f5c --- /dev/null +++ b/app/src/tok.c/cmdline_tok.ggo @@ -0,0 +1,4 @@ +package "tok" +version "0.1" + + diff --git a/app/src/tok.c/common_tok.cc b/app/src/tok.c/common_tok.cc new file mode 100644 
index 0000000..eb4dca1 --- /dev/null +++ b/app/src/tok.c/common_tok.cc @@ -0,0 +1,27 @@ +#include +#include +#include "common_tok.h" + +char dictionary[255]; + +void process_tok_options(gengetopt_args_info args) +{ + if(args.dictionary_given) + strcpy(dictionary, args.dictionary_arg); + else { + char path[256]; + sprintf(path, "/etc/utt/%s", DICT_FILE); + if (file_accessible(path) == 0) + strcpy(dictionary, path); + else { + sprintf(path, "%s/.utt/%s", getenv("HOME"), DICT_FILE); + if (file_accessible(path) == 0) + strcpy(dictionary, path); + else { + fprintf(stderr, "Cannot find dictionary!\n"); + exit(1); + } + } + } + +} diff --git a/app/src/tok.c/common_tok.h b/app/src/tok.c/common_tok.h new file mode 100644 index 0000000..68e6395 --- /dev/null +++ b/app/src/tok.c/common_tok.h @@ -0,0 +1,16 @@ +#ifndef __COMMON_TOK__H +#define __COMMON_TOK__H + +#include +#define _CMDLINE_FILE "../tok.c/cmdline.h" +#include "../common/common.h" + +#include "cmdline.h" + +#define DICT_FILE "data/tok.bin" + +extern char dictionary[]; + +extern void process_tok_options(gengetopt_args_info args); + +#endif diff --git a/app/src/tok.c/tok.c b/app/src/tok.c/tok.c new file mode 100644 index 0000000..f15a225 --- /dev/null +++ b/app/src/tok.c/tok.c @@ -0,0 +1,83 @@ + +#include +#include +#include +#include + + +#include +//#include "../lib/iotools.h" + +#include "cmdline.h" + + +char buf[257]; +int filepos=0; +char* tokstart; +char* tokend; +char tmp; +char tag; + + +gengetopt_args_info args; + + +inline +void printtoken(char tag) +{ + tmp=*tokend; + *tokend='\0'; + printf("%04d %02d %c %s\n", filepos, tokend-tokstart, tag, tokstart); + *tokend=tmp; + filepos+=tokend-tokstart; + tokstart=tokend; + if(args.interactive_flag) fflush(stdout); +} + +main(int argc, char** argv) +{ + + if (cmdline_parser(argc, argv, &args) != 0) + exit(1); + + printf("inter:%d\n",args.interactive_flag); + + // process_common_options(&args, argv[0]); + // process_tok_options(args); + + setlocale(LC_CTYPE,""); + 
setlocale(LC_COLLATE,""); + + while(fgets(buf,256,stdin)) + { + + tokstart=tokend=buf; + while(*tokend) + { + char *prev=tokend; + ++tokend; + if(isalpha(*prev) && !isalpha(*tokend)) + printtoken('W'); + else if(isdigit(*prev) && !isdigit(*tokend)) + printtoken('N'); + else if(isspace(*prev)) + { + switch(*prev) + { + case ' ': *prev='_'; break; + case '\t':*prev='t'; break; + case '\r':*prev='r'; break; + case '\f':*prev='f'; break; + case '\n':*prev='n'; + } + if(!isspace(*tokend)) + printtoken('S'); + } + else if(ispunct(*prev)) + printtoken('P'); + } + } + + cmdline_parser_free(&args); +} + diff --git a/app/src/tok.l/Makefile b/app/src/tok.l/Makefile new file mode 100644 index 0000000..0cbb3fc --- /dev/null +++ b/app/src/tok.l/Makefile @@ -0,0 +1,20 @@ +PAR=-O3 -static + +tok: tok.c cmdline.c + cc $(PAR) tok.c cmdline.c -o tok + +tok.c: tok.l + flex -8 -f -otok.c tok.l + +cmdline.c cmdline.h: cmdline_tok.ggo + gengetopt -i cmdline_tok.ggo --conf-parser + +clean: + rm *.c + rm cmdline.* + rm tok + +copy: tok +ifdef UTT_BIN_DIR + cp tok ${UTT_BIN_DIR} +endif diff --git a/app/src/tok.l/cmdline_tok.ggo b/app/src/tok.l/cmdline_tok.ggo new file mode 100644 index 0000000..8b58931 --- /dev/null +++ b/app/src/tok.l/cmdline_tok.ggo @@ -0,0 +1,4 @@ +package "tok" +version "0.1" + +option "interactive" i "Interactive mode." 
flag off diff --git a/app/src/tok.l/tok.l b/app/src/tok.l/tok.l new file mode 100644 index 0000000..6e59383 --- /dev/null +++ b/app/src/tok.l/tok.l @@ -0,0 +1,70 @@ +%{ + #include + #include + #include "cmdline.h" + + int filepos=0; + + struct gengetopt_args_info args; + +%} + + +%% + +[a-zA-Z󶿼ʣӦ]+ { + printf("%04d %02d W %s\n", filepos, yyleng, yytext); + filepos+=yyleng; + if(args.interactive_flag) fflush(stdout); + } + +[[:digit:]]+ { + printf("%04d %02d N %s\n", filepos, yyleng, yytext); + filepos+=yyleng; + if(args.interactive_flag) fflush(stdout); + } + +[[:space:]\n]+ { + int i; + printf("%04d %02d S ", filepos, yyleng); + for(i=0; i) { + chomp; + s/#.*//; + s/^\s+//; + s/\s+$//; + next unless length; + my ($name, $value) = split(/\s*=\s*/, $_, 2); + if(($name eq "interactive")or($name eq "i")){ + $interactive=1; + } + elsif(($name eq "help")or($name eq "h")){ + $help=1; + } + } + close CONFIG; + } +} +#########################################################s + +GetOptions("interactive|i" => \$interactive, + "help|h" => \$help); + +if($help) +{ + print <<'END' +Usage: tok [OPTIONS] + +Options: + --interactive Interactive (no output buffering). + --help -h Help. +END +; + exit 0; +} + + +$| = $interactive; + +my $offset = 0; + +while(<>) +{ + 1 while + / [[:alpha:]]+ (?{seg('W',$&)}) + | \d+ (?{seg('N',$&)}) + | \s+ (?{seg('S',$&)}) + | [[:punct:]] (?{seg('P',$&)}) + | . 
(?{seg('B',$&)}) + /gxo; +} + +# | [^[:print:]] (?{seg("B",$&)}) + + +sub seg +{ + my ($tag,$match) = @_; + my $len=length $match; + printf "%04d %02d %s ", $offset, $len, $tag; + if($tag eq 'S') + { + for(my $i=0; $i<$len; ++$i) + { + my $c = substr $match, $i, 1; + print '_' if $c eq ' '; + print '\n' if $c eq "\n"; + print '\t' if $c eq "\t"; + print '\r' if $c eq "\r"; + print '\f' if $c eq "\f"; + } + } + elsif($tag eq 'B') + { + printf "\\x%02X", ord($match); + } + else + { + print $match; + } + print "\n"; + $offset += $len; +} + diff --git a/app/src/unfla/Makefile b/app/src/unfla/Makefile new file mode 100644 index 0000000..6c055d0 --- /dev/null +++ b/app/src/unfla/Makefile @@ -0,0 +1,6 @@ +unfla: + +copy: +ifdef UTT_BIN_DIR + cp unfla ${UTT_BIN_DIR} +endif diff --git a/app/src/unfla/unfla b/app/src/unfla/unfla new file mode 100755 index 0000000..dd73ca4 --- /dev/null +++ b/app/src/unfla/unfla @@ -0,0 +1 @@ +tr '\014' '\012' diff --git a/lang/README b/lang/README new file mode 100644 index 0000000..aba8ff8 --- /dev/null +++ b/lang/README @@ -0,0 +1 @@ +the directory contains \ No newline at end of file diff --git a/lang/conf/pl_PL.ISO-8859-2/pl_PL.ISO-8859-2.sym b/lang/conf/pl_PL.ISO-8859-2/pl_PL.ISO-8859-2.sym new file mode 100644 index 0000000..81e7b98 --- /dev/null +++ b/lang/conf/pl_PL.ISO-8859-2/pl_PL.ISO-8859-2.sym @@ -0,0 +1,8 @@ +lcase a b c d e f g h i j k l m n o +lcase p q r s t u v w x y z +ucase A B C D E F G H I J K L M N O +ucase P Q R S T U V W X Y Z +letter lcase ucase +digit 0 1 2 3 4 5 6 7 8 9 +punct , . @ / ' ~ ; _ - + ? 
\ +all letter digit signs sem diff --git a/lang/conf/pl_PL.UTF-8/pl_PL.utf-8.sym b/lang/conf/pl_PL.UTF-8/pl_PL.utf-8.sym new file mode 100644 index 0000000..43164f2 --- /dev/null +++ b/lang/conf/pl_PL.UTF-8/pl_PL.utf-8.sym @@ -0,0 +1,8 @@ +lcase a ą b c ć d e ę f g h i j k l ł m n ń o ó +lcase p q r s ś t u v w x y z ź ż é ö ü ä +ucase A Ą B C Ć D E Ę F G H I J K L Ł M N Ń O Ó +ucase P Q R S Ś T U V W X Y Z Ź Ż +letter lcase ucase +digit 0 1 2 3 4 5 6 7 8 9 +punct , . @ / ' ~ ; _ - + ? \ +all letter digit signs sem diff --git a/lang/dic/pl_PL.ISO-8859-2/cor.bin b/lang/dic/pl_PL.ISO-8859-2/cor.bin new file mode 100644 index 0000000..6b004d1 Binary files /dev/null and b/lang/dic/pl_PL.ISO-8859-2/cor.bin differ diff --git a/lang/dic/pl_PL.ISO-8859-2/gue.bin b/lang/dic/pl_PL.ISO-8859-2/gue.bin new file mode 100644 index 0000000..e3ddecb Binary files /dev/null and b/lang/dic/pl_PL.ISO-8859-2/gue.bin differ diff --git a/lang/dic/pl_PL.ISO-8859-2/lem.bin b/lang/dic/pl_PL.ISO-8859-2/lem.bin new file mode 100644 index 0000000..8779ef9 Binary files /dev/null and b/lang/dic/pl_PL.ISO-8859-2/lem.bin differ