git-svn-id: svn://atos.wmid.amu.edu.pl/utt@4 e293616e-ec6a-49c2-aa92-f4a8b91c5d16

This commit is contained in:
obrebski 2008-03-11 13:02:41 +00:00
parent f1563c0f02
commit 25ae32e4c2
117 changed files with 10440 additions and 0 deletions

152
app/Makefile Normal file
View File

@ -0,0 +1,152 @@
# main makefile

BIN=bin
SRC=src
# Directory make was started in; ':=' so that pwd is run only once.
DIR:=$(shell pwd)

##############################

UTT_DIST_NAME=utt-0.9

# Layout of the distribution tree, exported to every sub-make.
# The descriptions are kept on their own lines on purpose: a trailing
# "# comment" on an assignment line leaves the blanks in front of the '#'
# inside the value, so ${UTT_BIN_DIR}/prog would expand to ".../bin /prog".
export UTT_DIR=${DIR}/${UTT_DIST_NAME}
# executables
export UTT_BIN_DIR=${UTT_DIR}/bin
# configuration files
export UTT_CONF_DIR=${UTT_DIR}/conf
# stuff
export UTT_SHARE_DIR=${UTT_DIR}/share
# language/encoding specific stuff
export UTT_LANG_DIR=${UTT_DIR}/lang
# tag format specific stuff
export UTT_TAGS_DIR=${UTT_DIR}/tags
# nothing
#export UTT_LIB_DIR=${UTT_DIR}/lib
# documentation
export UTT_DOC_DIR=${UTT_DIR}/doc

UTT_DIST_FILE=utt

# list of components to be included in the distribution
COMPONENTS = lib gue tok.l cor lem kot sen-l sen-nl ser grp con fla unfla mar compiledic

##############################
# All of these are commands, not files. The component names must be phony as
# well: a directory called lib/ exists next to this makefile, so a plain
# "lib" target would be considered up to date and never rebuilt.
.PHONY: all dirs components $(COMPONENTS)

# Builds the complete distribution tree under ${UTT_DIR}.
all: dirs components conf doc lang tags files share
	@echo "Make completed successfully!"

# (Re)creates an empty distribution tree.
dirs:
	if [ -d ${UTT_DIR} ]; then rm -r ${UTT_DIR}; fi
	mkdir -p ${UTT_DIR}
	mkdir -p ${UTT_BIN_DIR}
	mkdir -p ${UTT_CONF_DIR}
	mkdir -p ${UTT_SHARE_DIR}
	mkdir -p ${UTT_LANG_DIR}
	mkdir -p ${UTT_TAGS_DIR}
	mkdir -p ${UTT_DOC_DIR}

# Builds the components one after another, in the order given in COMPONENTS,
# and stops at the first one that fails.
components:
	@for cmp in $(COMPONENTS); do \
		$(MAKE) $$cmp || exit 1; \
	done

# Builds one component in $(SRC)/<name> and copies its results into the
# distribution tree. Each recipe line runs in its own shell, so there is no
# need to cd back afterwards (doing so used to hide the exit status).
${COMPONENTS}:
	cd $(SRC)/$@ && $(MAKE) && $(MAKE) copy
# Sub-directories of $(DIR) that build themselves and then copy their results
# into the distribution tree with their own "copy" target.
# One rule serves all of them; $@ is the name of the directory being built.
# A failure in the sub-make now fails this target (the former trailing
# "; cd $(DIR)" replaced the exit status with that of the cd).
.PHONY: conf doc lang tags share files
conf doc lang tags share files:
	cd $(DIR)/$@ && $(MAKE) && $(MAKE) copy
.PHONY: clean clean_components clean_lib clean_doc install uninstall reinstall

# Removes everything produced by the build.
clean: clean_components clean_doc clean_dist
	@echo "All files cleaned successfully!"

# Cleans every component. All components are attempted even if one of them
# fails, but the failure is reported through the exit status.
clean_components:
	@status=0; \
	for cmp in $(COMPONENTS); do \
		(cd $(SRC)/$$cmp && $(MAKE) clean) || status=1; \
	done; \
	exit $$status

clean_lib:
	cd $(SRC)/lib && $(MAKE) clean

clean_doc:
	cd $(DIR)/doc && $(MAKE) clean

# install/uninstall/reinstall are delegated to the makefile inside the
# distribution tree.
install: all
	cd ${UTT_DIR} && $(MAKE) install

uninstall:
	cd ${UTT_DIR} && $(MAKE) uninstall

reinstall:
	cd ${UTT_DIR} && $(MAKE) reinstall
# ifdef INSTALL_BIN_DIR
# if [ -d ${INSTALL_BIN_DIR} ]; then true; else mkdir -p ${INSTALL_BIN_DIR}; fi
# cp -r ${UTT_BIN_DIR}/* ${INSTALL_BIN_DIR}/
# endif
# ifdef INSTALL_SHARE_DIR
# if [ -d ${INSTALL_SHARE_DIR} ]; then true; else mkdir -p ${INSTALL_SHARE_DIR}; fi
# cp -r ${UTT_SHARE_DIR}/* ${INSTALL_SHARE_DIR}/
# endif
# ifdef INSTALL_DOC_DIR
# if [ -d ${INSTALL_DOC_DIR} ]; then true; else mkdir -p ${INSTALL_DOC_DIR}; fi
# cp -r ${UTT_DOC_DIR}/* ${INSTALL_DOC_DIR}/
# endif
# ifdef INSTALL_LIB_DIR
# if [ -d ${INSTALL_LIB_DIR} ]; then true; else mkdir -p ${INSTALL_LIB_DIR}; fi
# cp -r ${UTT_LIB_DIR}/* ${INSTALL_LIB_DIR}
# endif
#
# install: make_dirs install_components install_cnf install_dta install_doc
# @echo "Installation completed successfully!"
# install_components:
# @for cmp in $(COMPONENTS); do \
# cd $(SRC)/$$cmp && make install ; cd $(DIR); \
# done
# install_cnf:
# cp conf/*.conf $(UTT_ETC_DIR)/
# install_dta:
# cp -r data/* $(UTT_SHARE_DIR)/
# install_doc:
# cp doc/utt.{ps,pdf,html,info} $(UTT_DOC_DIR)/
# make_dirs:
# if [ -d $(UTT_BIN_DIR) ]; then true; else mkdir -p $(UTT_BIN_DIR); fi
# if [ -d $(UTT_ETC_DIR) ]; then true; else mkdir -p $(UTT_ETC_DIR); fi
# if [ -d $(UTT_SHARE_DIR) ]; then true; else mkdir -p $(UTT_SHARE_DIR); fi
# if [ -d $(UTT_DOC_DIR) ]; then true; else mkdir -p $(UTT_DOC_DIR); fi
# if [ -d $(UTT_LIB_DIR) ]; then true; else mkdir -p $(UTT_LIB_DIR); fi
.PHONY: dist clean_dist

# Packs the distribution tree. The tree is named relative to the current
# directory so that the archive unpacks as $(UTT_DIST_NAME)/... instead of
# reproducing the absolute path of the build machine.
dist: all
	tar -czvf $(UTT_DIST_NAME).tgz $(UTT_DIST_NAME)

# Removes the distribution tree and the archives made from it.
# (The variables tested here before, UTT_DIST_DIR and UTT_DIST_PMDB_FILE, are
# not defined in this makefile; the empty expansion made "rm -r" fail.)
clean_dist:
	rm -rf $(UTT_DIR)
	rm -f $(UTT_DIST_NAME).tgz $(UTT_DIST_FILE).tar.bz2

16
app/README.developers Normal file
View File

@ -0,0 +1,16 @@
COMMANDS TO BE RUN IN THIS DIRECTORY:
% make
compiles all the components and moves all files intended for
distribution into the directory named by the UTT_DIST_NAME variable in
the Makefile (currently utt-0.9)
% make install
installs the package on your system in the directory ~/.utt
% make dist
prepares a distribution file named ${UTT_DIST_NAME}.tgz (currently utt-0.9.tgz)

11
app/TODO Normal file
View File

@ -0,0 +1,11 @@
* wyprowadzic grp-pre i grp-post z grp do aux?
* zamienic kota na lepszego (Kubis)
*
1. DONE. Makefile do gph (install).
2. (zrobione dla ser?) Nazwy pmdb2re -> pmdb.tag2re (grp, ser).
3. DONE. Usuniecie bibliotek (aplhabet, erro).
4. DONE (dla gue i lem) Poprawna obsluga opcji --one-line i oraz --one-field.
---
5. Zadania zwiazane z rozbudowa ser (src/ser/TODO).

61
app/dist/Makefile vendored Normal file
View File

@ -0,0 +1,61 @@
# compile task doesn't compile sources, but just copies some files
# this should be changed
#
# I put here some variables

# path, where binaries are placed
# (they will be processed for making distribution)
export _UTT_DIST_DIR:=$(shell pwd)/bin

# path, where distribution file should be placed
export _UTT_DIST_OUTPUT:=$(shell pwd)

# -----------------------------------------------------------
# default task should display options
# (the target used to be spelled "defaul", so the .PHONY declaration did not
# apply to it)
.PHONY: default
default:
	@echo "Using: make compile|tarball|rpm|deb"

# -----------------------------------------------------------
# this task should compile utt application
.PHONY: compile
compile:
	if test -d ${_UTT_DIST_DIR}; then rm -fr ${_UTT_DIST_DIR}; fi
	mkdir -p ${_UTT_DIST_DIR}
	@# fake compilation
	cp -r ../utt-0.9/* ${_UTT_DIST_DIR}/
	@# we add some extra file (required during installation)
	@# NOTE(review): the only Perl script visible under common/ in this
	@# change is utt_make_config.pl - confirm create_utt_config.pl exists.
	cp common/create_utt_config.pl ${_UTT_DIST_DIR}/
	chmod 700 ${_UTT_DIST_DIR}/create_utt_config.pl

# -----------------------------------------------------------
# this task should compile utt (if necessary) and create tar.gz version
.PHONY: tarball
tarball: compile
	cd tarball && $(MAKE)

# -----------------------------------------------------------
# this task should compile utt (if necessary) and create rpm version
.PHONY: rpm
rpm: compile
	@#we build rpm (see spec/README for details)
	cd spec && $(MAKE)

# -----------------------------------------------------------
# this task should compile utt (if necessary) and create deb version
.PHONY: deb
deb: compile
	@#we build deb (see deb/README for details)
	cd deb && $(MAKE)

# -----------------------------------------------------------
# this task should remove compiled files and directories
.PHONY: clean
clean:
	rm -fr ${_UTT_DIST_DIR}

1
app/dist/common/description.def vendored Normal file
View File

@ -0,0 +1 @@
I put here some description.

1
app/dist/common/description.pl.def vendored Normal file
View File

@ -0,0 +1 @@
Tu umieszczę opis po polsku.

1
app/dist/common/release.def vendored Normal file
View File

@ -0,0 +1 @@
1

0
app/dist/common/requirements.def vendored Normal file
View File

53
app/dist/common/utt_make_config.pl vendored Normal file
View File

@ -0,0 +1,53 @@
#!/usr/bin/perl
# Writes conf/utt.conf next to this script. The file records the absolute
# path of the directory containing the script (UTT_HOME) and a locale name
# derived from the current LC_CTYPE setting (UTT_LOCALE).

use strict;
use warnings;

use Cwd 'abs_path';
use File::Basename;
use POSIX qw(setlocale LC_CTYPE);

my $cur_dir   = dirname(abs_path($0));
my $conf_file = "$cur_dir/conf/utt.conf";

# Fail loudly instead of printing to an unopened handle.
open(my $out, '>', $conf_file)
    or die "Cannot create $conf_file: $!\n";

# we put some description into utt.conf file
print {$out} "# ************************************************************\n";
print {$out} "# * This file was created automatically during installation. *\n";
print {$out} "# * If you don't need do not change it. *\n";
print {$out} "# * *\n";
print {$out} "# * UAM Text Tools *\n";
print {$out} "# * Adam Mickiewicz University, Poland *\n";
print {$out} "# * http://utt.amu.edu.pl *\n";
print {$out} "# ************************************************************\n";
print {$out} "\n\n";

# we need utt home directory
print {$out} "# absolute path to utt directory\n";
print {$out} "UTT_HOME=$cur_dir\n\n";

# we need user default locale
my $best_locale = findLocale();
print {$out} "# user locale\n";
print {$out} "UTT_LOCALE=$best_locale\n";
print {$out} "\n";

close($out) or die "Cannot write $conf_file: $!\n";

# Returns a locale name of the form language_TERRITORY.encoding built from
# the current LC_CTYPE locale:
#   - a name that already has that form is returned as it is,
#   - language_TERRITORY gets ".UTF-8" appended,
#   - anything else is treated as a bare language code xx and becomes
#     xx_XX.UTF-8 (lower-case language, upper-case territory; the two parts
#     used to be produced the other way round).
# NOTE(review): for the "C"/"POSIX" locale the last rule yields "c_C.UTF-8";
# decide what the consumers of utt.conf expect in that case.
sub findLocale {
    my $cur_locale = setlocale(LC_CTYPE);
    $cur_locale = '' unless defined $cur_locale;

    # we replace Latinx to ISO-8859-x
    $cur_locale =~ s/(.+?)Latin(.+?)/${1}ISO-8859-${2}/g;

    if ($cur_locale =~ /\w+_\w+\.\S+/) {
        return $cur_locale;
    }
    if ($cur_locale =~ /\w+_\w+/) {
        return $cur_locale . ".UTF-8";
    }
    return lc($cur_locale) . '_' . uc($cur_locale) . '.UTF-8';
}

1
app/dist/common/version.def vendored Normal file
View File

@ -0,0 +1 @@
0.9

81
app/dist/deb/Makefile vendored Normal file
View File

@ -0,0 +1,81 @@
# Builds the deb package of utt from the files staged by the parent makefile.

# here there're few properties
_PRODUCT_NAME=utt
# Directory with the staged files. The parent dist/Makefile exports it as
# _UTT_DIST_DIR (the name used here before, _UTT_BIN_DIR, is not defined by
# it, which made the copy commands below operate on "/").
_BUILD_DIR=$(_UTT_DIST_DIR)
_UTT_VER:=$(shell cat ../common/version.def)
_UTT_REL:=$(shell cat ../common/release.def)
_DEB_ROOT:=$(shell pwd)/deb_root
_INSTALL_DIR=/usr/local/$(_PRODUCT_NAME)/$(_UTT_VER)-$(_UTT_REL)

#default task
# The control file and the maintainer scripts are generated first, as
# prerequisites (they used to be written as recipe lines, i.e. run as shell
# commands that do not exist).
.PHONY: default
default: make_control make_postinst make_prerm
	@test -n "$(_BUILD_DIR)" || { echo "_UTT_DIST_DIR is not set; run 'make deb' in the parent directory" >&2; exit 1; }
	@# start from an empty package root
	rm -rf $(_DEB_ROOT)
	@# first, we prepare some directory structure
	mkdir -p $(_DEB_ROOT)/DEBIAN
	mkdir -p $(_DEB_ROOT)$(_INSTALL_DIR)
	mkdir -p $(_DEB_ROOT)/usr/share/man/man1
	mkdir -p $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)
	find $(_DEB_ROOT) -type d | xargs chmod 755 # this is necessary on Debian Woody, don't ask me why
	@# next, we copy necessary files
	mv ./control $(_DEB_ROOT)/DEBIAN/
	cp ./postinst $(_DEB_ROOT)/DEBIAN/
	cp ./prerm $(_DEB_ROOT)/DEBIAN/
	@# maintainer scripts have to be executable
	chmod 755 $(_DEB_ROOT)/DEBIAN/postinst $(_DEB_ROOT)/DEBIAN/prerm
	cp $(_BUILD_DIR)/COPYRIGHT $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/copyright
	@# and binaries
	cp -rv $(_BUILD_DIR)/* $(_DEB_ROOT)$(_INSTALL_DIR)/
	@# finally, we build deb package
	fakeroot dpkg-deb --build $(_DEB_ROOT)
	mv $(_DEB_ROOT).deb $(_PRODUCT_NAME)_$(_UTT_VER)-$(_UTT_REL).all.deb

# Steps that are not enabled yet (man pages, changelogs, manual archives):
#	cp -r $(_BUILD_DIR)/man/* $(_DEB_ROOT)/usr/share/man/
#	cp $(_BUILD_DIR)/changelog $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/
#	cp $(_BUILD_DIR)/changelog.Debian $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/
#	gzip --best $(_DEB_ROOT)/usr/share/man/man1/$(_PRODUCT_NAME).1
#	gzip --best $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/changelog
#	gzip --best $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/changelog.Debian
#	tar -cvvf control.tar.gz ${_DEB_ROOT}/DEBIAN/
#	rm -fr ${_DEB_ROOT}/DEBIAN/
#	tar -cvvf data.tar.gz ${_DEB_ROOT}/
#	rm -fr ${_DEB_ROOT}/

# Writes the package control file into the current directory.
.PHONY: make_control
make_control:
	echo "Package: $(_PRODUCT_NAME)" > control
	echo "Version: $(_UTT_VER)" >> control
	echo "Section: web" >> control
	echo "Priority: optional" >> control
	echo "Architecture: all" >> control
	echo "Essential: no" >> control
	echo "Depends: " >> control
	@# the dependencies should be read from ../common/requirements.def,
	@# e.g.: libwww-perl, acme-base (>= 1.2)   <= package requirements
	echo "Pre-Depends: perl" >> control
	echo "Maintainer: Adam Mickiewicz University" >> control
	echo "Provides: $(_PRODUCT_NAME)" >> control
	@# printf instead of "echo -n", which is not portable across shells
	printf '%s' "Description: " >> control
	cat ../common/description.def >> control

# Writes the post-installation script: it runs the configuration generator
# once and then removes it.
.PHONY: make_postinst
make_postinst:
	echo "#!/bin/sh" > postinst
	echo "$(_INSTALL_DIR)/create_utt_config.pl" >> postinst
	echo "rm -f $(_INSTALL_DIR)/create_utt_config.pl" >> postinst

# Writes the (currently empty) pre-removal script.
.PHONY: make_prerm
make_prerm:
	echo "#!/bin/sh" > prerm

3
app/dist/deb/README vendored Normal file
View File

@ -0,0 +1,3 @@
This directory contains the files necessary to create the deb package.
The required build tools can be installed with:
apt-get install dpkg-dev debhelper devscripts fakeroot linda

0
app/dist/files/COPYRIGHT vendored Normal file
View File

0
app/dist/files/LICENCE vendored Normal file
View File

14
app/dist/files/README vendored Normal file
View File

@ -0,0 +1,14 @@
Installation:
1) Run the command:
make install
in this directory. This will install UTT in the directory '~/.utt'.
2) Add the path
~/.utt/bin
to your PATH variable to make UTT programs visible to your system.

15
app/dist/spec/Makefile vendored Normal file
View File

@ -0,0 +1,15 @@
# this makefile will build rpm
DIR:=$(shell pwd)

# Directory rpmbuild is started in; falls back to this directory when the
# calling makefile did not provide it.
ifndef _UTT_DIST_DIR
_UTT_DIST_DIR=${DIR}
endif

# default task
# '&&' instead of ';' so that rpmbuild is not started in the wrong directory
# when the cd fails.
.PHONY: rpm
rpm:
	cd ${_UTT_DIST_DIR} && rpmbuild -bb ${DIR}/utt.spec

16
app/dist/spec/README vendored Normal file
View File

@ -0,0 +1,16 @@
This directory contains files necessary to produce rpm package.
First, you must have variable _UTT_DIST_DIR defined properly.
This variable should be defined by main Makefile.
To create rpm file, just write:
make
The created package should appear in the default RPM directory
(on my computer it is the /usr/src/redhat/RPMS/$arch/ directory).
To determine the rpm output directory, execute:
rpm --showrc | grep _rpmdir
You need write access to this directory to create the rpm.

106
app/dist/spec/utt.spec vendored Normal file
View File

@ -0,0 +1,106 @@
#
# Default RPM header.
#
# START_RPM_STD_HEADER:
#
# RPM properties
#
%define _this_product UAM Text Tools
%define _this_summary Some tools for text processing
%define _this_name utt
%define _this_version %(cat ../common/version.def)
%define _this_release %(cat ../common/release.def)
%define _this_copyright Adam Mickiewicz University, Poland
#
# We need some paths
#
# Directory with utt binaries
%define _UTT_DIST_DIR %(pwd)
#Root directory in which utt will be installed
%define _UTT_DIR /usr/local/%_this_name
#Directory for rpm
%define _RPM_ROOT %_UTT_DIST_DIR/../rpm_root
#
# Default RPM header.
#
# END_RPM_STD_HEADER:
# --------------------------------------------------------------------
Summary: %_this_summary
Name: %_this_name
Version: %_this_version
Release: %_this_release
#Copyright: %_this_copyright
License: GPL
Group: Development/Tools
URL: http://utt.amu.edu.pl
Vendor: Adam Mickiewicz University
BuildRoot: %_RPM_ROOT
#BuildArch: i586
# requirements for utt application
#AutoReq: no
#AutoReqProv: no
#Requires: glibc >= 2.1.3
#Requires: libgcc1 >= 3.0
#Requires: libgcc >= 3.0
#Requires: libstdc++6 >= 3.4.1
#Requires: libstdc++ >= 3.4.1
%description
%(cat ../common/description.def)
%description -l pl
%(cat ../common/description.pl.def)
# -------------------------------------------------------------
# preparing sources for compilation
%prep
# source compilation
%build
# rpm building
# Creates the installation directory below the build root and copies
# everything from the directory named by the _UTT_DIST_DIR macro into it.
%install
%__mkdir_p $RPM_BUILD_ROOT%_UTT_DIR
cp -fr %_UTT_DIST_DIR/* $RPM_BUILD_ROOT%_UTT_DIR/
# cleaning after rpm build
# Removes the whole build root.
%clean
rm -rf $RPM_BUILD_ROOT
# -------------------------------------------------------------
#before installation
%pre

#after installation
%post
# we need to create utt.conf file
%_UTT_DIR/create_utt_config.pl
rm -f %_UTT_DIR/create_utt_config.pl
# we need to create links in /usr/local/bin
find %_UTT_DIR/bin/ -type f -exec ln -f {} /usr/local/bin \;

#before uninstallation
%preun
# The first scriptlet argument is the number of versions of the package that
# will remain installed: 0 on erase, 1 on upgrade. During an upgrade this
# scriptlet of the old package runs after the new package has been installed,
# so the links must only be removed on a real erase.
if [ "$1" -eq 0 ]; then
# we delete links from /usr/local/bin
for fn in `find %_UTT_DIR/bin/ -type f -exec basename {} \;`; do rm -f /usr/local/bin/$fn; done
fi

#after uninstallation
%postun
# Same guard as above: on upgrade the directory already holds the new version.
if [ "$1" -eq 0 ]; then
# we remove all extra files
rm -fr %_UTT_DIR
fi
# -------------------------------------------------------------
%files
%defattr(-,root,root)
/*

4
app/dist/tarball/INSTALL vendored Normal file
View File

@ -0,0 +1,4 @@
Here you can find some information about how to install utt.
You should just unpack the archive, then
execute create_utt_config.pl and remove it.

38
app/dist/tarball/Makefile vendored Normal file
View File

@ -0,0 +1,38 @@
# This makefile builds the tarball distribution of utt.
#
# Some variables
#

# Directory this makefile is run in
DIR:=$(shell pwd)

# Directory with utt binaries
ifndef _UTT_DIST_DIR
_UTT_DIST_DIR=${DIR}
endif

# Where put result
ifndef _UTT_DIST_OUTPUT
_UTT_DIST_OUTPUT=${DIR}
endif

# Common info about version and release
_UTT_VER:=$(shell cat ../common/version.def)
_UTT_REL:=$(shell cat ../common/release.def)

# Temp vars
# name of the top-level directory inside the archive
_TARBALL_NAME=utt_$(_UTT_VER)-$(_UTT_REL)
# staging directory (removed again at the end)
_TARBALL_ROOT=$(DIR)/$(_TARBALL_NAME)
_TAR_FILE_NAME=utt.$(_UTT_VER)_$(_UTT_REL).tar.gz

#default task
.PHONY: default
default:
	@echo Build dir is ${_UTT_DIST_DIR}
	@echo Change output for tarball as ${_UTT_DIST_OUTPUT}
	@# drop leftovers of an interrupted earlier run
	rm -rf ${_TARBALL_ROOT}
	mkdir -p ${_TARBALL_ROOT}
	cp -fr ${_UTT_DIST_DIR}/* ${_TARBALL_ROOT}
	@# we add some extra files
	cp ./INSTALL ${_TARBALL_ROOT}/
	@# only the staging directory is archived; the former "utt*" glob could
	@# also match an older tarball or the archive being written
	tar -czf ${_UTT_DIST_OUTPUT}/${_TAR_FILE_NAME} $(_TARBALL_NAME)
	rm -rf ${_TARBALL_ROOT}

6
app/dist/tarball/README vendored Normal file
View File

@ -0,0 +1,6 @@
This directory contains a Makefile that creates the tar.gz archive.
To create the archive, just write:
make
Warning: you need to define the variable _UTT_DIST_DIR.

27
app/doc/Makefile Normal file
View File

@ -0,0 +1,27 @@
# Builds the utt manual from utt.texinfo in info, pdf, html and ps format.

# The file lists are built with make functions. The former "utt.{a,b,...}"
# brace expansion is not understood by every /bin/sh, which is the shell that
# runs the recipes.
TEX_TEMP_FILES := $(addprefix utt.,aux cp fn ky log pg toc tp vr)
DOC_FILES      := $(addprefix utt.,info ps pdf html)
CLEAN_FILES    := $(addprefix utt.,aux cp dvi fn fns html info ky log pdf pg ps toc tp vr)

.PHONY: main copy clean

main: utt.info utt.pdf utt.html utt.ps

utt.info: utt.texinfo
	makeinfo utt.texinfo

utt.pdf: utt.texinfo
	texi2pdf utt.texinfo
	@# -f: not every temporary file is necessarily present
	rm -f $(TEX_TEMP_FILES)

utt.html: utt.texinfo
	makeinfo --html --no-split utt.texinfo

utt.dvi: utt.texinfo
	texi2dvi utt.texinfo

utt.ps: utt.dvi
	dvips -o utt.ps utt.dvi

# Copies the manuals into the distribution tree; does nothing when
# UTT_DOC_DIR is not set.
copy:
ifdef UTT_DOC_DIR
	cp $(DOC_FILES) ${UTT_DOC_DIR}
endif

clean:
	rm -f $(CLEAN_FILES)
	rm -f *~

2687
app/doc/utt.texinfo Normal file

File diff suppressed because it is too large Load Diff

30
app/lib/ser.l.template Normal file
View File

@ -0,0 +1,30 @@
%{
/* Scanner template: PATTERN and DEFAULTACTION below are presumably
   placeholders replaced before flex is run - TODO confirm. */
#include<string.h>
/* running number of the current match; printed after "ser:" */
int n=0;
%}
%%
PATTERN {
/* Runs for every match: prints a BOM line, the matched text unchanged,
   and an EOM line. */
int start, end, len;
char *lastseg, *tmp;
/* a match has to end with a complete line */
if(yytext[yyleng-1]!='\n')
{fprintf(stderr,"ser: pattern matches incomplete line\n"); exit(1);}
n++;
/* first two numbers of the first line of the match */
sscanf(yytext,"%d %d",&start,&len);
/* hide the final newline for a moment so that strrchr finds the newline
   in front of the last line of the match */
yytext[yyleng-1]='\0';
if(tmp=strrchr(yytext,'\n'))
{
lastseg=tmp+1;
/* first two numbers of the last line; len is overwritten */
sscanf(lastseg,"%d %d", &end, &len);
}
else
/* single-line match: first and last line are the same */
end=start;
/* restore the newline before echoing the text */
yytext[yyleng-1]='\n';
printf("%04d 00 BOM * ser:%d\n",start,n);
ECHO;
/* the EOM line carries end+len as its first number */
printf("%04d 00 EOM * ser:%d\n",end+len,n);
}
.*\n DEFAULTACTION;

52
app/lib/terms.m4 Normal file
View File

@ -0,0 +1,52 @@
divert(-1)
#--------------------------------------------------------------------------
# Macros defined here may be used in pattern specifications
# You can modify this file according to your needs.
# ENDOFSEGMENT and MORFIELD are macros expanded to, respectively,
# end of segment marker (depends on the format: flattened or not)
# and the name of the annotation field containing morphological
# information (standard value is 'lem'). These values are controlled
# by programs using this file to expand search patterns (ser, grp, ...).
# seg(type,form,annotation)
define(`seg',`(\s*((\d+\s+)(\d+\s+)?)?dnl
ifelse($1, `',`(\S+)', `($1)')\s+dnl
ifelse($2, `',`(\S+)', `($2)')dnl
ifelse($3, `',`((\s+\S+)*)', `(\s+($3))')\s*ENDOFSEGMENT)')
# form(f) - segment containing the form f
define(`form', `seg(,$1)')
# field(f) segment containing auxiliary field f
define(`field', `seg(,,`(\S+\s+)*($1)(\s+\S+)*')')
# word, space, punct, number segments (assuming W, S, P, N segment types)
define(`space', `seg(`S',`$1')')
define(`word', `seg(`W',`$1')')
define(`punct', `seg(`P',`$1')')
define(`number', `seg(`N',`$1')')
# macros specific to PMDB format
define(`lexeme', `field(`MORFIELD:(\S+;)?$1,\S+')')
define(`cat', `field(`MORFIELD:\S+,$1([,;]\S+)?')')
# Place here your macro definitions.
#--------------------------------------------------------------------------
divert(0)

8
app/src/common/Makefile Normal file
View File

@ -0,0 +1,8 @@
# main: cmdline.c main_template.cc
# g++ -o main cmdline.c common.cc main_template.cc
# cmdline.c cmdline.h : cmdline.ggo
# gengetopt -i cmdline.ggo
# cmdline.ggo: cmdline_common.ggo cmdline_program.ggo
# cat cmdline_common.ggo cmdline_program.ggo > cmdline.ggo

18
app/src/common/README Normal file
View File

@ -0,0 +1,18 @@
Propozycja ujednolicenia dzialania klocka na poziomie
funkcji main. Parametry meta - zdefiniowane dla
wszystkich, poza tok, programow, definiujace ich zachowanie
w systemie klockow.
cmdline_common.ggo - deklaracje parametrow meta
cmdline_program.ggo - przyklad deklaracji parametrow programu
nazwa docelowa np. cmdline_guess.ggo
common.cc - zmienne globalne zawierajace informacje
przekazane przez parametry meta
common.h
main_template.cc - szkielet funkcji main
Makefile - sposob kompilacji

View File

@ -0,0 +1,34 @@
#section "Common UTT options"
option "input" f "Input file" string no hidden
option "output" o "Output file" string no hidden
option "fail" e "Output file for unsuccesfully processed segments " string no hidden
option "only-fail" - "Print only segments the program failed to process" flag off hidden
option "no-fail" - "Print only segments the program processed" flag off hidden
option "copy" c "Copy succesfully processed segments to standard output" flag off hidden
option "process" p "Process segments with this tag" string no multiple
option "select" s "Select only segments with this field" string no multiple
option "ignore" S "Select only segments without this field" string no multiple
option "output-field" O "Output field name" string no
option "input-field" I "Input field name" string no multiple
option "interactive" i "Toggle interactive mode" flag off
option "config" - "Configuration file" string typestr="FILENAME" no
option "one-field" 1 "Print all results in one segments (creates ambiguous annotation)" flag off
option "one-line" - "Print annotation alternatives as additional fields" flag off
option "language" - "Language." string no

View File

@ -0,0 +1,5 @@
package "guess"
version "0.1"
option "color" l "Show guessed descriptions in colour." flag off

264
app/src/common/common.cc Normal file
View File

@ -0,0 +1,264 @@
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include "common.h"
#include <stdio.h>
#include <locale.h>
FILE* inputf=stdin;
FILE* outputf=stdout;
FILE* failedf=stdout;
bool copy_processed=0;
bool one_field=false;
bool one_line=false;
char output_field_prefix[32];
char input_field_prefix[32];
extern int argc;
extern char **argv;
// Tilde (home directory) expansion in a path.
//   inpath  - path to expand
//   outpath - receives the result; the caller must provide enough room
// A leading '~' is replaced by the value of the HOME environment variable;
// any other path is copied unchanged.
// Returns 0 on success, -1 when the path starts with '~' but HOME is not set
// (outpath then holds an unchanged copy of inpath).
int expand_path(char* inpath, char* outpath)
{
    if (inpath[0] != '~')
    {
        strcpy(outpath, inpath);
        return 0; // no problem
    }

    const char* home = getenv("HOME");
    if (home == NULL)
    {
        // passing NULL to "%s" is undefined behaviour
        strcpy(outpath, inpath);
        return -1;
    }

    sprintf(outpath, "%s%s", home, inpath + 1);
    return 0; // no problem
}
/*
  Builds the prefix under which a field appears in a segment.

  parameters:
    -name   - field name, long or short
    +prefix - field name with ':' appended if long name
              (left untouched when the name is incorrect)
  return value:
    1 if correct field name, 0 otherwise
  examples:
    name    prefix    r.v.
    lem     lem:      1
    @       @         1
    ::      'undef'   0
    a,b     'undef'   0
*/
int fieldprefix(char *name, char *prefix)
{
    // The ctype functions require a value representable as unsigned char;
    // a plain char may be negative for non-ASCII bytes.
    if (ispunct((unsigned char)name[0]) && name[1] == '\0') // correct short name
    {
        strcpy(prefix, name);
        return 1;
    }

    int i = 0;
    while (name[i] != '\0' && isalnum((unsigned char)name[i]))
        ++i;

    if (name[i] == '\0' && i > 0) // correct long name
    {
        sprintf(prefix, "%s:", name);
        return 1;
    }

    // incorrect
    return 0;
}
// Stores the base name of argv0 - the part after its last '/', or all of it
// when there is no '/' - in program_name.
void set_program_name(char program_name[], char* argv0)
{
    char* last_slash = strrchr(argv0, '/');
    const char* base_name = last_slash ? last_slash + 1 : argv0;
    strcpy(program_name, base_name);
}
extern void process_config_files(gengetopt_args_info* args, char* argv0)
{
char program_name[256];
char config_file[256];
char config_file_tmp[256];
set_program_name(program_name,argv0);
// obsługa pliku konfiguracyjnego podanego w linii komend
if (args->config_given) {
if (file_accessible(args->config_arg) == 0) {
if (cmdline_parser_configfile(args->config_arg,
args,
0, // 0 - nie nadpisuj wartości parametrów
0, // 0 - nie inicjuj
0) != 0) {
fprintf(stderr, "Error in config file (%s)\n", args->config_arg);
exit(1);
}
}
}
if(args->one_line_given && !one_line) one_line=true, one_field=false;
if(args->one_field_given && !one_field) one_line=false, one_field=true;
// obsluga pliku konfiguracyjnego uzytkownika dla programu
sprintf(config_file_tmp, "%s/%s.conf", USER_CONFIG_DIR, program_name);
expand_path(config_file_tmp, config_file);
if (file_accessible(config_file) == 0) {
if (cmdline_parser_configfile(config_file,
args,
0, // 0 - nie nadpisuj danych
0, // 0 - nie inicjuj struktury
0) != 0) {
fprintf(stderr, "Error in config file (%s)\n", config_file);
exit(1);
}
}
if(args->one_line_given && !one_line) one_line=true, one_field=false;
if(args->one_field_given && !one_field) one_line=false, one_field=true;
// obsluga pliku konfiguracyjnego uzytkownika globalnego
sprintf(config_file_tmp, "%s/utt.conf", USER_CONFIG_DIR);
expand_path(config_file_tmp, config_file);
if (file_accessible(config_file) == 0) {
if (cmdline_parser_configfile(config_file,
args,
0, // 0 - nie nadpisuj danych
0, // 0 - nie inicjuj struktury
0) != 0) {
fprintf(stderr, "Error in config file (%s)\n", config_file);
exit(1);
}
}
if(args->one_line_given && !one_line) one_line=true, one_field=false;
if(args->one_field_given && !one_field) one_line=false, one_field=true;
// obsluga systemowego pliku konfiguracyjnego dla programu
sprintf(config_file, "%s/%s.conf", SYSTEM_CONFIG_DIR, program_name);
if (file_accessible(config_file) == 0) {
if (cmdline_parser_configfile(config_file,
args,
0, // 0 - nie zmieniaj danych wczesniejszych
0, // 0 - nie inicjuj struktury
0 // 0 - nie sprawdzaj wymaganych parametrow
) != 0) {
fprintf(stderr, "Error in config file (%s)\n", config_file);
exit(1);
}
}
if(args->one_line_given && !one_line) one_line=true, one_field=false;
if(args->one_field_given && !one_field) one_line=false, one_field=true;
// obsluga systemowego pliku konfiguracyjnego globalnego
sprintf(config_file, "%s/utt.conf", SYSTEM_CONFIG_DIR);
if (file_accessible(config_file) == 0) {
if (cmdline_parser_configfile(config_file,
args,
0, // 0 - nie zmieniaj danych wczesniejszych
0, // 0 - nie inicjuj struktury
0 // 0 - nie sprawdzaj wymaganych parametrow
) != 0) {
fprintf(stderr, "Error in config file (%s)\n", config_file);
exit(1);
}
}
if(args->one_line_given && !one_line) one_line=true, one_field=false;
if(args->one_field_given && !one_field) one_line=false, one_field=true;
}
// Applies the options shared by the UTT programs: sets the locale, opens the
// input/output/fail streams requested on the command line and fills in the
// input and output field prefixes. Terminates the program when one of the
// files cannot be opened.
void process_common_options(gengetopt_args_info* args, char* argv0)
{
    char prog[256];
    set_program_name(prog, argv0);

    setlocale(LC_CTYPE,"");
    setlocale(LC_COLLATE, "");

    if (args->help_given)
    {
        cmdline_parser_print_help ();
    }

    if (args->input_given)
    {
        inputf = fopen(args->input_arg, "r");
        if (!inputf)
        {
            fprintf(stderr,"No such file: %s.\n", args->input_arg);
            exit(1);
        }
    }

    if (args->output_given)
    {
        outputf = fopen(args->output_arg, "w");
        if (!outputf)
        {
            fprintf(stderr,"Cannot open output file: %s.\n", args->output_arg);
            exit(1);
        }
    }

    if (args->fail_given)
    {
        failedf = fopen(args->fail_arg, "w");
        if (!failedf)
        {
            fprintf(stderr,"Cannot open output file: %s.\n", args->fail_arg);
            exit(1);
        }
    }

    // input field: the first name given, or positional field "4"
    if (!args->input_field_given)
        strcpy(input_field_prefix, "4");
    else
        fieldprefix(args->input_field_arg[0], input_field_prefix);

    // output field: the name given, or the program name followed by the separator
    if (!args->output_field_given)
        sprintf(output_field_prefix, "%s%c", prog, INFIELD_SEP);
    else
        fieldprefix(args->output_field_arg, output_field_prefix);

    if (args->copy_given)
        copy_processed = true;
}
// Checks whether the file can be read by the calling process.
// Returns the result of access(): 0 when it is readable, -1 otherwise.
int file_accessible(const char* path)
{
    const int status = access(path, R_OK);
    return status;
}
// Checks that a configuration file exists and can be used.
//   dir      - directory expected to contain the file
//   filename - name of the file inside dir
// Returns 0 when dir is a directory that may be entered and dir/filename is
// a regular file that may be read; -1 otherwise (also when memory for the
// path cannot be allocated).
int config_file_exists(const char* dir, const char* filename)
{
    struct stat dir_stat;
    struct stat file_stat;

    char* path = (char*)malloc(strlen(dir) + strlen(filename) + 2); // + '\0' + '/'
    if (path == NULL)
        return -1;
    sprintf(path, "%s/%s", dir, filename);

    // Single exit point so that path is released on every outcome
    // (the early returns used to leak it).
    int result = -1;
    if (stat(dir, &dir_stat) == 0
        && stat(path, &file_stat) == 0
        && S_ISDIR(dir_stat.st_mode)    // the directory is a directory
        && S_ISREG(file_stat.st_mode)   // the configuration file is a regular file
        && access(dir, X_OK) == 0       // we may enter the directory
        && access(path, R_OK) == 0)     // we may read the file
        result = 0;

    free(path);
    return result;
}

416
app/src/common/common.h Normal file
View File

@ -0,0 +1,416 @@
#ifndef __COMMON_H
#define __COMMON_H
#include <stdio.h>
#include <ctype.h>
#include "../lib/const.h"
#include _CMDLINE_FILE
/**************************************************
* Stale dotyczace wejscia/wyjscia
*/
#define EMPTYFORM '*'
#define INFIELD_SEP ':'
#define MAXAUX 16
#define FIELD_SEP " \t\n"
// katalogi z plikami konfiguracyjnymi
// nowe
// stare - do wyrzucenia
// #define CONFIG_DIR ".utt/conf"
// nazwa zmiennej okreslajaca sciezke do danych
// #define UTT_DIR_VAR "UTT_DIR"
// sciezka do plikow z danymi (np UTT_DIR/pliki) wzgledem $HOME!
// #define UTT_DIR_DEFAULT ".utt/pl/"
/**************************************************/
extern FILE* inputf;
extern FILE* outputf;
extern FILE* failedf;
extern char* input_filename;
extern char* output_filename;
extern char* failed_filename;
extern bool one_line;
extern bool one_field;
extern char input_field_prefix[];
extern char output_field_prefix[];
extern bool copy_processed;
extern bool append_output;
extern bool append_failed;
//sciezka do katalogu z danymi
extern char utt_dir[];
extern void process_common_options(gengetopt_args_info* args, char* argv0);
extern void process_config_files(gengetopt_args_info* args, char* argv0);
extern int expand_path(char* inpath, char* outpath);
extern int fieldprefix(char *name, char *prefix);
/**************************************************
 * problems with casing */
// Classifies the letter case of s.
// return value:
//   0 - no upper-case letters (also for the empty string)
//   1 - first character upper-case, no upper-case character after it
//   2 - all characters upper-case (at least two characters)
//   3 - anything else
// Every character that is not upper-case counts as lower-case here.
inline int casing(char* s)
{
    // The empty string has no second character to look at.
    if (*s == '\0')
        return 0;

    const bool first_upper = isupper((unsigned char)*s) != 0;
    bool rest_has_upper = false;
    bool rest_has_other = false;
    for (const char* p = s + 1; *p != '\0'; ++p)
    {
        if (isupper((unsigned char)*p))
            rest_has_upper = true;
        else
            rest_has_other = true;
    }

    if (!first_upper)
        return rest_has_upper ? 3 : 0;
    if (!rest_has_upper)
        return 1;
    // "AbC" is mixed, not all upper-case
    return rest_has_other ? 3 : 2;
}
// Copies s to d, passing every character - the terminating '\0' included -
// through tolower(). d needs room for strlen(s)+1 characters.
inline void tolowers(char* s, char* d)
{
    for (;; ++s, ++d)
    {
        *d = tolower(*s);
        if (*s == '\0')
            break;
    }
}
// Copies s to d (terminating '\0' included), applying the letter case
// described by casing, a value as returned by casing():
//   0, 3 - copy unchanged (3: not enough information to restore the case)
//   1    - first character through toupper(), the rest unchanged
//   2    - every character through toupper()
// For any other value nothing is written to d.
inline void restorecasing(char *s, char *d, int casing)
{
    if (casing < 0 || casing > 3)
        return;

    const bool upper_first = (casing == 1 || casing == 2);
    const bool upper_rest = (casing == 2);

    *d = upper_first ? toupper(*s) : *s;
    while (*s != '\0')
    {
        ++s;
        ++d;
        *d = upper_rest ? toupper(*s) : *s;
    }
}
/**************************************************/
/*
  Extracts one field of a segment.

  parameters:
    -seg  - segment
    -pref - field name or "1", "2", "3", "4" for the first four fields
            (only the first character of pref is compared with the digit)
    +val  - field contents; the caller provides the buffer
  return value:
    1 if specified field exists, 0 otherwise

  Fields 1 and 2 are runs of digits, fields 3 and 4 runs of printable
  non-blank characters. Every other prefix is searched for among the
  following fields and must be preceded by a space or a tab; the value
  extends up to the next separator character.
*/
inline int getfield(char* seg, const char* pref, char* val)
{
    char* p = seg;

    // positional fields "1" .. "4"
    for (int field = 1; field <= 4; ++field)
    {
        while (isspace((unsigned char)*p)) ++p;

        char* start = p;
        if (field <= 2)
            while (isdigit((unsigned char)*p)) ++p;
        else
            while (isgraph((unsigned char)*p)) ++p;

        if (*pref == '0' + field)
        {
            if (p == start) return 0;
            strncpy(val, start, p - start);
            val[p - start] = '\0';
            return 1;
        }
    }
    while (isspace((unsigned char)*p)) ++p;

    // annotation fields
    if (*pref == '\0') return 0; // an empty prefix names no field

    // Take the first occurrence of pref that starts a field. The search
    // resumes one character after a rejected occurrence; restarting at the
    // same position, as before, never terminated. "p > seg" keeps the look
    // at the preceding character inside the buffer.
    for (p = strstr(p, pref); p != NULL; p = strstr(p + 1, pref))
        if (p > seg && (p[-1] == ' ' || p[-1] == '\t'))
            break;
    if (p == NULL) return 0;

    p += strlen(pref);
    int len = strcspn(p, FIELD_SEP "\n\r\f\0");
    strncpy(val, p, len);
    val[len] = '\0';
    return 1;
}
// Decides whether a segment, given as a text line, should be processed.
// The segment is accepted when
//   - no process option was given, or its field 3 equals one of the
//     process arguments, and
//   - it contains every field named by a select option, and
//   - it contains no field named by an ignore option.
inline
bool process_seg(char* seg, gengetopt_args_info& args)
{
    char buf[256];
    // getfield() leaves the buffer untouched when field 3 is missing;
    // make sure the comparison below never reads uninitialised memory.
    buf[0] = '\0';

    bool ret = !args.process_given;
    if(args.process_given)
    {
        getfield(seg,"3",buf);
        for(int i=0; i<args.process_given; ++i)
            if(strcmp(args.process_arg[i],buf)==0)
            {
                ret=true;
                break;
            }
    }

    for(int i=0; i<args.select_given; ++i)
        if(! getfield(seg,args.select_arg[i],buf))
            ret=false;

    for(int i=0; i<args.ignore_given; ++i)
        if(getfield(seg,args.ignore_arg[i],buf))
            ret=false;

    return ret;
}
/*
  Appends the field pref+val to a segment, in place.

  parameters:
    -+seg - segment; expected to end with '\n'
    -pref - prefix of the new field
    -val  - contents of the new field
  return value:
    1 - success, 0 - fail (empty segment, or limit on segment length exceeded)
*/
inline
int addfield(char *seg, const char *pref, const char *val)
{
    const size_t seglen = strlen(seg);

    // Without a final character to overwrite, the write below would start
    // one byte in front of the buffer.
    if (seglen == 0) return 0;

    // The result is (seglen-1) + 1 + strlen(pref) + strlen(val) + 1
    // characters long, plus the terminating '\0'.
    // NOTE(review): assumes segment buffers hold MAX_LINE bytes - confirm.
    if (seglen + strlen(pref) + strlen(val) + 2 > (size_t)MAX_LINE) return 0;

    // overwrite the final '\n' with " <pref><val>\n"
    sprintf(seg + (seglen - 1), " %s%s\n", pref, val);
    return 1;
}
/**************************************************/
// Has the same members as struct Segment below. No definitions of its
// member functions are visible in this part of the file.
// NOTE(review): looks like a duplicate of Segment - confirm whether it is used.
struct Seg
{
int filepos, len;
char* tag;
char* form;
char* aux[MAXAUX];
int auxn;
bool parse(char* line);
char* getfield(char* fieldname);
void print(char* line);
bool addfield(char* s);
bool clearfields();
};
/**************************************************/
/* definition of the input/output structure
*/
struct Segment
{
// first two (numeric) fields of the line
int filepos, len;
// third field
char* tag;
// fourth field; NULL when it is missing or equal to the EMPTYFORM marker
char* form;
// remaining fields, at most MAXAUX of them
char* aux[MAXAUX];
// number of entries used in aux
int auxn;
// splits line in place and points the members into it
bool parse(char* line);
// returns the value of the named field, or NULL
char* getfield(char* fieldname);
// writes the segment as a text line into line
void print(char* line);
// appends s to aux
bool addfield(char* s);
// empties aux
bool clearfields();
};
/*
 * Checks whether the given (already parsed) segment should be processed:
 * its tag has to match one of the --process values (if any were given),
 * all --select fields must be present and no --ignore field may be present.
 */
inline
bool process_seg(Segment& s, gengetopt_args_info& args)
{
  bool accepted = (args.process_given == 0);

  int k = 0;
  while (k < args.process_given)
  {
    if (strcmp(args.process_arg[k], s.tag) == 0)
    {
      accepted = true;
      break;
    }
    ++k;
  }

  for (k = 0; k < args.select_given; ++k)
  {
    if (s.getfield(args.select_arg[k]) == NULL)
      accepted = false;
  }

  for (k = 0; k < args.ignore_given; ++k)
  {
    if (s.getfield(args.ignore_arg[k]) != NULL)
      accepted = false;
  }

  return accepted;
}
/*
* FUNKCJE OBSLUGUJACE WEJSCIE/WYJSCIE
*/
// napisy zostaj na miejscu (w line), tylko wskazniki sa ustawian
// i zara dopisywane zera s dopisywane
inline
bool Segment::parse(char* line)
{
auxn=0;
char* field;
if((field=strtok(line,FIELD_SEP))!=NULL)
filepos=atoi(field); // nie sprawdzana poprawnosc
else
return false;
if((field=strtok(NULL,FIELD_SEP))!=NULL)
len=atoi(field); // nie sprawdzana poprawnosc
else return false;
if((tag=strtok(NULL,FIELD_SEP))==NULL) return false;
if((form=strtok(NULL,FIELD_SEP))==NULL)
return true;
else
if(form[0] == EMPTYFORM && form[1] =='\0')
form=NULL;
while((aux[auxn]=strtok(NULL,FIELD_SEP))!=NULL) ++auxn;
return true;
}
// Looks up an annotation field.
// When f starts with an alphanumeric character it is treated as a field
// name: the first aux entry of the form "<f><INFIELD_SEP><value>" is found
// and a pointer to <value> is returned. Otherwise f is a one-character
// prefix: the first aux entry starting with that character is found and a
// pointer to the text after the prefix is returned.
// Returns NULL when there is no such field.
inline char* Segment::getfield(char* f)
{
  // <ctype.h> functions require a value representable as unsigned char
  if(isalnum((unsigned char)*f))
  {
    size_t flen=strlen(f);
    for(int i=0; i<auxn; ++i)
      if(strncmp(aux[i],f,flen)==0 && aux[i][flen]==INFIELD_SEP)
        return aux[i]+flen+1;
  }
  else
  {
    for(int i=0; i<auxn; ++i)
      if(*f==*(aux[i]))
        return aux[i]+1;
  }
  return NULL;
}
// Drops all annotation fields. The strings themselves are not freed (they
// are not owned by the segment). Always returns true.
inline bool Segment::clearfields() {
  while (auxn > 0)
    aux[--auxn] = NULL;
  auxn = 0;
  return true;
}
// Writes the segment as one text line into line:
//   "<filepos> <len> <tag>[ <form>| *][ <aux>...]\n"
// A missing form is written as "*" only when annotation fields follow.
// The caller has to supply a buffer large enough for the whole line, and
// line must not overlap the strings the segment points to.
inline
void Segment::print(char* line)
{
  // p always points at the terminating zero, so every piece is appended in
  // constant time instead of rescanning the line as strcat() does.
  char* p=line;
  p+=sprintf(p,"%04d %02d %s", filepos, len, tag);
  if(form)
    p+=sprintf(p," %s",form);
  else if(auxn)
    p+=sprintf(p," *");
  for(int i=0; i<auxn; ++i)
    p+=sprintf(p," %s",aux[i]);
  strcpy(p,"\n");
}
// Appends the annotation field s (the pointer is stored, the string is not
// copied). Returns false when all MAXAUX slots are already taken.
inline
bool Segment::addfield(char* s)
{
  if (auxn >= MAXAUX)
    return false;
  aux[auxn] = s;
  ++auxn;
  return true;
}
/**************************************************
* funkcje pomocne w operacjach na plikach *
* konfiguracyjnych *
**************************************************/
// sprawdza istnienie pliku
int file_accessible(const char* path);
// sprawdza istnienie pliku konfiguracyjnego
int config_file(const char* dir, const char* filename);
/**************************************************/
/* Fetches the input of a segment.
 * parameters:
 * - args - array of strings naming the input fields, in order of preference
 * - args_len - size of args
 * - seg - segment
 * value - pointer to the input: the segment's form as soon as a name
 *         starting with '4' is reached (this may be NULL), otherwise the
 *         value of the first named annotation field that is present;
 *         NULL when nothing matches
 */
inline char* getInput(char** args, int args_len, Segment seg) {
  int idx = 0;
  while (idx < args_len) {
    char* name = args[idx++];
    if (name[0] == '4')
      return seg.form;
    char* found = seg.getfield(name);
    if (found != NULL)
      return found;
  }
  return NULL;
}
#endif

View File

@ -0,0 +1,20 @@
#include <stdlib.h>
#include "common.h"
// Skeleton of a tool's main(): parse the command line, apply the common
// options, run the tool specific code, release the parser data.
int main(int argc, char* argv[])
{
  gengetopt_args_info args;
  if(cmdline_parser(argc,argv,&args) != 0)
    exit(1);
  process_common_options(args);
  //
  // YOUR CODE HERE
  //
  cmdline_parser_free(&args);
  return 0;
}

View File

@ -0,0 +1,12 @@
# all and copy are commands, not files.
.PHONY: all copy

all: compiledic aut2fsa

# compiledic is a ready-to-run script; there is nothing to build.
compiledic:

aut2fsa: aut2fsa.cc
	g++ -Wno-deprecated -O3 -fpermissive -static -o aut2fsa aut2fsa.cc

# Copies the tools into the distribution tree (only when the parent
# makefile exported UTT_BIN_DIR).
copy:
ifdef UTT_BIN_DIR
	cp compiledic fsm2aut aut2fsa ${UTT_BIN_DIR}
endif

5
app/src/compiledic/TODO Normal file
View File

@ -0,0 +1,5 @@
* pliki tymczasowe:
- pliki symboli lab i scl
- pliki powstajace podczas kompilacji slownika
gdzie maja byc tworzone? tak jak teraz nie moze byc!

BIN
app/src/compiledic/aut2fsa Executable file

Binary file not shown.

View File

@ -0,0 +1,16 @@
#include <iostream.h>
#include <stdlib.h>
#include "../lib/tfti.h"
#include <fstream.h>
// aut2fsa: loads an automaton with TFTiv::read() and writes it back with
// TFTiv::save(). The compiledic script runs it as
// "aut2fsa < slownik.aut > <name>.bin".
// NOTE(review): the exact input/output formats are defined in ../lib/tfti.h,
// which is not visible here - presumably text in, binary out; confirm.
int main()
{
TFTiv<char,char> a;
a.read();
a.save();
return 0;
}

190
app/src/compiledic/compiledic Executable file
View File

@ -0,0 +1,190 @@
#! /usr/bin/env perl
$symfile='~/.utt/pl/pl_PL.iso-8859-2.sym';
$symfilenoext = $symfile;
$symfilenoext =~ s/\.sym$//;
$labfile = $symfilenoext . '.lab';
$sclfile = $symfilenoext . '.scl';
use locale;
#use strict;
##################################################
$linesPerFile = 20000;
if (@ARGV < 1) {
print "usage: prep_user_dict.pl dictionary_file\n";
exit;
}
my $file = shift; # @ARGV;
my $filenameprefix;
if ($file =~ /(.*)\.dic/)
{
$filenameprefix = $1;
}
else
{
print "The input file must have .dic extension.";
exit(1);
}
# Przygotowanie etykiet
#`makeLabels.pl > labels.sym`;
`lexmakelab $symfilenoext`;
# Analiza pliku s³ownika
print "preparing file...........................................";
`sed -r "s/([[:punct:]])/\[\\1\]/g" < $file > temp1`;
`cp temp1 temp2`;
print "OK\n";
# We split the file into many parts, run lexcomplex for each part
# separately and then join the results with the fsmunion program.
# Each part LemTEMP/slo_<n> holds at most $linesPerFile lines; $fileCount
# ends up as the index of the last part.
open(IN, "./temp2") or die("Cannot open temp2: $!");
$lineCount = 0;
$fileCount = 0;
`mkdir LemTEMP`;
open(FILE, ">LemTEMP/slo_0") or die("Cannot create LemTEMP/slo_0: $!");
while (<IN>) {
    if (++$lineCount >= $linesPerFile) {
        $fileCount++;
        $lineCount = 0;
        close(FILE);
        open(FILE, ">LemTEMP/slo_".$fileCount)
            or die("Cannot create LemTEMP/slo_$fileCount: $!");
    }
    print(FILE $_);
}
# The last part must be closed (flushed) before lexcomplex reads it,
# otherwise the buffered tail of the dictionary can be lost.
close(FILE);
close(IN);
#print "OK\n";
print "building partial automata";
#32 kropki, fileCount plikow
$filesPerDot = $fileCount/32;
$files=$filesPerDot;
$dots=0;
for ($i=0; $i<=$fileCount; $i++) {
if ($files >= $filesPerDot) {
$files = 0;
print ".";
$dots++;
}
$files++;
$command = "lexcomplex -l $labfile -S $sclfile < LemTEMP/slo_".$i." > LemTEMP/slownik_".$i.".fsm";
`$command`;
}
if ($dots < 32) {
for ($i=0; $i<32 - $dots; $i++) {
print ".";
}
}
print "OK\n";
`rm LemTEMP/slo_*`;
print "building final automaton";
#35 kropek...
$ndots=33;
$filesPerDot = $fileCount/$ndots;
$files=$filesPerDot;
$dots=0;
`cp LemTEMP/slownik_0.fsm slownik1.fsm`;
for ($i=1; $i<=$filecount; $i++) {
if ($files >= $filesPerDot) {
$files = 0;
print ".";
$dots++;
}
$files++;
$command = "fsmunion LemTEMP/slownik_".$i." slownik1.fsm > slownik2.fsm";
`$command`;
`mv slownik2.fsm slownik1.fsm`;
}
if ($dots < $ndots) {
for ($i=0; $i<$ndots - $dots; $i++) {
print ".";
}
}
`fsmunion LemTEMP/* > slownik1.fsm`;
print "OK\n";
print "removing epsilon-transitions.............................";
`fsmrmepsilon slownik1.fsm > slownik2.fsm`;
`rm slownik1.fsm`;
print "OK\n";
print "determinizing automaton..................................";
`fsmdeterminize slownik2.fsm > slownik1.fsm`;
`rm slownik2.fsm`;
print "OK\n";
print "minimizing automaton.....................................";
`fsmminimize slownik1.fsm > slownik.fsm`;
#`rm slownik1.fsm`;
print "OK\n";
print "converting fsm format to bin.............................";
`fsmprint -i $labfile slownik.fsm > slownik.txt`;
`fsm2aut slownik.txt > slownik.aut`;
`aut2fsa < slownik.aut > $filenameprefix.bin`;
print "OK\n";
print "removing temporary files.................................";
`rm LemTEMP/*`;
`rmdir LemTEMP`;
`rm temp2`;
`rm slownik.fsm`;
`rm slownik.txt`;
`rm slownik.aut`;
`rm labels.*`;
print "OK\n";

44
app/src/compiledic/fsm2aut Executable file
View File

@ -0,0 +1,44 @@
#!/usr/bin/perl
# fsm2aut: converts a textual automaton listing into the "aut" format.
# Input lines are either "<source> <target> <char>" (a transition) or
# "<state>" (a final state); anything else aborts with "Input error.".
# Output: a header "<number of states> <number of transitions> char void",
# then one line per state: "<state> <+|-> <char> <target> ...", where "+"
# marks a final state.
my $currstate=-1;
# $states[n] is a flat list (char, target, char, target, ...) of the transitions leaving state n
my @states;
# $final[n] is 1 for final states
my @final;
# number of transitions read
my $tn=0;
while(<>)
{
if(/^\s*([0-9]+)\s+([0-9]+)\s+(.)(\s*)?$/)
{
push @{$states[$1]}, ($3, $2);
# make sure the target state has a slot even if it has no outgoing transitions
$#states=$2 if $#states<$2;
$tn++;
}
elsif(/^\s*([0-9]+)\s*$/)
{
$final[$1]=1;
$#states=$1 if $#states<$1;
}
else
{
die("Input error.");
}
}
print scalar(@states)," ",$tn," char void\n";
my $i=0;
# minimum field width used for the state number column
my $width=int(log(@states+1)/log(10));
foreach $stateref (@states)
{
$f = ($final[$i]?"+":"-");
printf "%${width}d %s",$i++,$f;
# consume the (char, target) pairs of this state
while(@$stateref)
{
$c=shift @$stateref;
$s=shift @$stateref;
print " $c $s";
}
print "\n";
}

7
app/src/con/Makefile Normal file
View File

@ -0,0 +1,7 @@
# copy is a command, not a file.
.PHONY: copy

# con is a ready-to-run script; there is nothing to build.
con:

# Copies the tool into the distribution tree (only when the parent
# makefile exported UTT_BIN_DIR).
copy:
ifdef UTT_BIN_DIR
	cp con ${UTT_BIN_DIR}
endif

549
app/src/con/con Executable file
View File

@ -0,0 +1,549 @@
#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use locale;
Getopt::Long::Configure('no_ignore_case_always');
my $l='30c';
my $r='30c';
my $trim=0;
my $white=0;
my $bon='[0-9]+ [0-9]+ BOM .*';
my $eon='[0-9]+ [0-9]+ EOM .*';
my $bod='[';
my $eod=']';
my $column=0;
my $ignore=0;
my $help=0;
my $configfile1="../../conf/con.conf";
my $configfile2="../conf/con.conf";
#read configuration files###########################
my $file;
foreach $file ($configfile1, $configfile2){
if(open(CONFIG, $file)){
while (<CONFIG>) {
chomp;
s/#.*//;
s/^\s+//;
s/\s+$//;
next unless length;
my ($name, $value) = split(/\s*=\s*/, $_, 2);
if(($name eq "left")or($name eq "l")){
$l=$value;
}
elsif(($name eq "right")or($name eq "r")){
$r=$value;
}
elsif(($name eq "trim")or($name eq "t")){
$trim=1;
}
elsif(($name eq "white")or($name eq "w")){
$white=1;
}
elsif($name eq "bom"){
$bon=$value;
}
elsif($name eq "eom"){
$eon=$value;
}
elsif($name eq "bod"){
$bod=$value;
}
elsif($name eq "eod"){
$eod=$value;
}
elsif(($name eq "column")or($name eq "c")){
$column=$value;
}
elsif(($name eq "ignore")or($name eq "i")){
$ignore=1;
}
elsif(($name eq "help")or($name eq "h")){
$help=1;
}
}
close CONFIG;
}
}
#########################################################
GetOptions("left|l=s" => \$l,
"right|r=s" => \$r,
"trim|t" => \$trim,
"white|w" => \$white,
"bom=s" => \$bon,
"eom=s" => \$eon,
"bod=s" => \$bod,
"eod=s" => \$eod,
"column|c=s" => \$column,
"ignore|i" => \$ignore,
"help|h" => \$help);
if(!($column=~/^[0-9]+$/)){$column=0;}
# Print the usage text and stop. The option names shown here must match the
# GetOptions specification (the segment delimiters are --bom / --eom).
if($help)
{
    print <<'END'
Options:
--help -h Help.
--left -l Left context info (default='30c')
Examples:
-l=5c: left context is 5 characters
-l=5w: left context is 5 words
-l=5s: left context is 5 non-empty input lines
-l='\s*\S+\sr\S+BOS': left context starts with the given regex
--right -r Right context info (default='30c')
--trim -t Clear incomplete words from output
--white -w DO NOT change all white characters into spaces
--column -c Left column minimal width in characters (default = 0)
--ignore -i Ignore input inconsistency
--bom Beginning of selected segment
(regex, default='[0-9]+ [0-9]+ BOM .*')
--eom End of selected segment
(regex, default='[0-9]+ [0-9]+ EOM .*')
--bod Selected segment beginning display (default='[')
--eod Selected segment end display (default=']')
END
    ;
    exit 0;
}
my $seg_no=0;
my $seg_size=0;
my $left_type;
my $left_size;
my $right_type;
my $right_size;
set_lr_types($l, $r, \$left_type,\$left_size,\$right_type,\$right_size, $trim);
my $inn=0;
my $after_bos=0;
my $before_eos=0;
my @LEFT; #tablica skalarów
my @CENTER; #tablica skalarów
my @RIGHT;
my @current_center;
my @current_left; #skalar dla c, w pp. tablica
my @current_left_words;
my @current_right_words_number;
while(<>){
my $line = $_;
chomp $line;
my @line = split / /, $line;
my $line_s=@line;
if(!line_format_ok(@line)){next;}
if(!$white){white_into_spaces(\@line);}
else{if($line[2] eq "S"){symbols_into_white(\$line[3]);}}
if(!input_consistent(\$seg_no,\$seg_size,$line[0],$line[1],$ignore)){
eof_or_inconsistency(\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,$left_type,$right_type);
@current_center=();
@current_left=();
@current_left_words=();
@current_right_words_number=();
$after_bos=0;
$before_eos=0;
}
remember_current_left($left_type,$left_size,\@current_left,\@line, \@current_left_words, $line, \$after_bos, \$before_eos);
remember_center($line,\@line,\$inn,\@current_center,$white,\@CENTER,\@current_left,\@LEFT, \$after_bos, \$before_eos, \@RIGHT, \@current_right_words_number);
remember_right($right_type,$left_type,$right_size,\@line,\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,\@current_right_words_number, $line, \$before_eos);
}
eof_or_inconsistency(\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,$left_type,$right_type);
exit(0);
#################procedury###############################
# Checks the basic shape of an input line that was split into fields:
# at least four fields, the first two (position and length) consisting of
# digits only. Returns 1 when the line is usable, 0 otherwise.
sub line_format_ok{
    my @fields = @_;
    return 0 if @fields < 4;
    # anchored: a field that merely contains a digit is not a number
    return 0 unless $fields[0] =~ /^[0-9]+\z/;
    return 0 unless $fields[1] =~ /^[0-9]+\z/;
    return 1;
}
# Replaces the form (field 3) of a segment tagged "S" with a single space.
# Takes a reference to the array of fields and modifies it in place.
sub white_into_spaces{
    my ($fields) = @_;
    $fields->[3] = " " if $fields->[2] eq "S";
}
# Turns the symbolic notation of white characters back into the characters
# themselves, in place: "\n" (backslash + n) -> newline, "\t" -> tab,
# "_" -> space. Takes a reference to the string.
sub symbols_into_white{
    my ($text_ref) = @_;
    for ($$text_ref) {
        s/\\n/\n/g;
        s/\\t/\t/g;
        tr/_/ /;
    }
}
# Replaces white characters with their symbolic notation, in place:
# newline -> "\n" (backslash + n), tab -> "\t", space -> "_".
# Takes a reference to the string.
sub white_into_symbols{
    my ($text_ref) = @_;
    for ($$text_ref) {
        s/\n/\\n/g;
        s/\t/\\t/g;
        tr/ /_/;
    }
}
# Checks that the current segment starts where the previous one ended.
# Arguments: references to the remembered position and length of the
# previous segment, the position and length of the current segment and the
# "ignore inconsistency" flag. The remembered values are always updated.
# Returns 1 when the input is consistent (or the check is skipped because
# the remembered position is 0 or the flag is set), 0 otherwise.
sub input_consistent{
    my ($prev_pos_ref, $prev_len_ref, $pos, $len, $ignore) = @_;
    my $consistent = 1;
    unless ($$prev_pos_ref == 0 || $ignore) {
        $consistent = 0 if $pos - $$prev_len_ref != $$prev_pos_ref;
    }
    $$prev_pos_ref = $pos;
    $$prev_len_ref = $len;
    return $consistent;
}
# Decodes the --left / --right context specifications.
# A specification containing "<digits>c", "<digits>w" or "<digits>s" (tried
# in this order) selects the context type 'c', 'w' or 's' and its size is
# taken with get_number(); for type 'c' the size is increased by one when
# trimming is requested. Any other specification is stored unchanged as the
# type (it is later used as a regex) and the size is left untouched.
# Results are written through the four type/size references.
sub set_lr_types{
    my ($left, $right, $left_type_ref, $left_size_ref,
        $right_type_ref, $right_size_ref, $do_trim) = @_;

    foreach my $side ([$left,  $left_type_ref,  $left_size_ref],
                      [$right, $right_type_ref, $right_size_ref]) {
        my ($spec, $type_ref, $size_ref) = @{$side};
        my ($unit) = grep { $spec =~ /[0-9]+$_/ } ('c', 'w', 's');
        if (defined $unit) {
            ${$type_ref} = $unit;
            ${$size_ref} = get_number($spec);
            ${$size_ref}++ if $do_trim && $unit eq 'c';
        }
        else {
            ${$type_ref} = $spec;
        }
    }
}
# Returns the number formed by the leading decimal digits of the string
# (e.g. 30 for "30c"), or 0 when the string does not start with a digit.
sub get_number{
    my $string = shift;
    my ($digits) = $string =~ /^([0-9]+)/;
    return defined $digits ? $digits + 0 : 0;
}
# Tracks the selected (center) segment.
# - On a line matching $bon while outside a selection: enters the selection,
#   clears the collected center and stores the current left context in LEFT.
# - On a line matching $eon while inside a selection: leaves the selection,
#   stores the collected center in CENTER and opens an empty right context
#   (plus its word counter) for it.
# - While inside a selection, the form of every line that contains no '*'
#   is appended to the center, with white characters written as symbols.
# $bon and $eon are the file-level begin/end patterns. The $white_info
# argument is accepted for compatibility; it does not influence the result.
sub remember_center{
    my ($lin, $lin_ref, $inn_ref, $current_center_ref, $white_info,
        $CENTER_REF, $current_left_ref, $LEFT_REF, $after_bos_ref,
        $before_eos_ref, $RIGHT_REF, $current_words_right_number_ref) = @_;

    if((!${$inn_ref}) && $lin=~/$bon/){
        ${$inn_ref}=1;
        @{$current_center_ref}=();
        ${$after_bos_ref}=0;
        push(@{$LEFT_REF},join('',@{$current_left_ref}));
    }
    if(${$inn_ref} && $lin=~/$eon/){
        ${$inn_ref}=0;
        push(@{$CENTER_REF},join('',@{$current_center_ref}));
        ${$before_eos_ref}=1;
        push(@{$RIGHT_REF},[]);
        push(@{$current_words_right_number_ref},0);
    }
    # Use the state passed in by the caller, not the file-level variable.
    if(${$inn_ref} && index($lin,'*')==-1){
        white_into_symbols(\${$lin_ref}[3]);
        push(@{$current_center_ref},${$lin_ref}[3]);
    }
}
# Maintains the rolling left context while the input is read.
# Arguments (positional): context type, context size, reference to the
# context buffer, reference to the fields of the current line, reference to
# the list of remembered words, the raw line, references to the after_bos
# and before_eos flags. Only the first four are always read; the rest is
# consumed by the branch that needs it.
# Lines whose form (field 3) is '*' never contribute to the context.
sub remember_current_left{
my $type=shift;
my $size=shift;
my $ref=shift;
my $line_ref=shift;
# 'c': the buffer holds single characters, limited to the newest $size
if($type eq 'c'){
if(!(${$line_ref}[3] eq '*')){
push(@{$ref},split('',${$line_ref}[3]));
my $lsize = @{$ref};
if($lsize>$size){splice(@{$ref},0,$lsize-$size);}
}
}
else{
# 'w': the buffer holds whole forms; only segments tagged W count as words
if($type eq 'w'){
my $words_ref = shift;
if(!(${$line_ref}[3] eq '*')){
push(@{$ref},${$line_ref}[3]);
if(${$line_ref}[2] eq 'W'){
push(@{$words_ref},${$line_ref}[3]);
}
my $lsize = @{$words_ref};
if($lsize>$size){
# drop the oldest word and everything in the buffer that precedes
# the next remembered word
my $word = ${$words_ref}[1];
splice(@{$words_ref},0,1);
while(!(${$ref}[0] eq $word)){splice(@{$ref},0,1); }
}
}
}
else{
# 's': the buffer holds whole forms, limited to the newest $size entries
if($type eq 's'){
if(!(${$line_ref}[3] eq '*')){
push(@{$ref},${$line_ref}[3]);
my $lsize = @{$ref};
if($lsize>$size){splice(@{$ref},0,$lsize-$size);}
}
}
else{#bos/eos
# the type is a regex: the context restarts at every line matching it
# skip the words list argument, which is not used in this mode
shift;
my $line = shift;
my $after_bos_ref = shift;
my $before_eos_ref = shift;
if($line=~/$type/){
${$after_bos_ref}=1;
@{$ref}=();
}
if(${$after_bos_ref} && !(${$line_ref}[3] eq '*')){
push(@{$ref},${$line_ref}[3]);
}
}
}
}
}
# Feeds the current line into all open right contexts and prints every
# concordance line whose right context became complete.
# Arguments (positional): right context type, left context type, right
# context size, reference to the fields of the current line, references to
# LEFT, CENTER and RIGHT, the display strings and flags handed on to
# print_and_remove (bod, eod, white, column, trim), reference to the list of
# per-context word counters, the raw line, reference to the before_eos flag.
# Lines whose form (field 3) is '*' never contribute to the context.
sub remember_right{
    my $type=shift;
    my $type_left=shift;
    my $size=shift;
    my $line_ref=shift;
    my $LEFT_REF=shift;
    my $CENTER_REF=shift;
    my $RIGHT_REF=shift;
    my $bod=shift;
    my $eod=shift;
    my $w=shift;
    my $c=shift;
    my $t=shift;
    if($type eq 'c'){
        # context measured in characters
        if(!(${$line_ref}[3] eq '*')){
            my $right_size = @{$RIGHT_REF};
            for(my $i=0; $i<$right_size; $i++){
                push(@{${$RIGHT_REF}[$i]}, split('',${$line_ref}[3]));
                my $lsize = @{${$RIGHT_REF}[$i]};
                if($lsize>=$size){
                    # NOTE(review): this keeps $size-1 characters, one less
                    # than the left context keeps - confirm this is intended.
                    splice(@{${$RIGHT_REF}[$i]},$size-1); # print and remove
                    print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
                    # entry $i is gone: revisit the same index
                    $right_size = @{$RIGHT_REF};
                    $i--;
                }
            }
        }
    }
    elsif($type eq 'w'){
        # context measured in words (segments tagged W)
        my $words_number_ref = shift;
        if(!(${$line_ref}[3] eq '*')){
            my $right_size = @{$RIGHT_REF};
            for(my $i=0; $i<$right_size; $i++){
                push(@{${$RIGHT_REF}[$i]},${$line_ref}[3]);
                if(${$line_ref}[2] eq 'W'){
                    ${$words_number_ref}[$i]=${$words_number_ref}[$i]+1;
                    if(${$words_number_ref}[$i]==$size){
                        print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
                        # Remove the counter of the printed entry itself; it
                        # has to happen before the index is stepped back,
                        # otherwise the counter of another entry is dropped.
                        splice(@{$words_number_ref},$i,1);
                        $right_size = @{$RIGHT_REF};
                        $i--;
                    }
                }
            }
        }
    }
    elsif($type eq 's'){
        # context measured in input lines
        if(!(${$line_ref}[3] eq '*')){
            my $right_s = @{$RIGHT_REF};
            for(my $i=0; $i<$right_s; $i++){
                push(@{${$RIGHT_REF}[$i]},${$line_ref}[3]);
                my $rsize=@{${$RIGHT_REF}[$i]};
                if($rsize==$size){
                    print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
                    $right_s = @{$RIGHT_REF};
                    $i--;
                }
            }
        }
    }
    else{#bos/eos
        # the type is a regex: the context ends at the first line matching it
        shift; # the word counters are not used in this mode
        my $line = shift;
        my $before_eos_ref = shift;
        if(${$before_eos_ref}){
            if(!(${$line_ref}[3] eq '*')){
                # only 1 position
                push(@{${$RIGHT_REF}[0]},${$line_ref}[3]);
            }
            if($line=~/$type/){
                ${$before_eos_ref}=0;
                print_and_remove(0,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
            }
        }
    }
}
# Prints one concordance line - left context, begin marker, center, end
# marker, right context - for the entry at $index and removes that entry
# from LEFT, CENTER and RIGHT.
# With $trim, contexts of type "c" are trimmed with trim_left/trim_right.
# The left context is padded with spaces up to $column characters.
# With $white all three parts are printed with white characters written as
# symbols; without it the symbols in the center are turned back into white
# characters.
sub print_and_remove{
    my ($index, $LEFT_REF, $CENTER_REF, $RIGHT_REF, $bdis, $edis,
        $white, $column, $trim, $left_type, $right_type) = @_;

    my $before = "$LEFT_REF->[$index]";
    my $after = join('', @{ $RIGHT_REF->[$index] });

    if($trim){
        $before = trim_left($before) if $left_type eq "c";
        $after = trim_right($after) if $right_type eq "c";
    }

    my $missing = $column - length($before);
    $before = (" " x $missing) . $before if $missing > 0;

    if($white){
        white_into_symbols(\$before);
        white_into_symbols(\$after);
        white_into_symbols(\$CENTER_REF->[$index]);
    }
    else{
        symbols_into_white(\$CENTER_REF->[$index]);
    }

    print $before;
    print $bdis;
    print "$CENTER_REF->[$index]";
    print $edis;
    print $after;
    print "\n";

    foreach my $list ($LEFT_REF, $CENTER_REF, $RIGHT_REF) {
        splice(@{$list}, $index, 1);
    }
}
# Removes an incomplete word from the beginning of the left context.
# A leading space is simply dropped; otherwise everything up to and
# including the first white character (space, newline or tab) is removed.
# A string without white characters is returned unchanged.
sub trim_left{
    my $string = shift;
    return substr($string,1) if substr($string,0,1) eq " ";
    my $position = -1;
    foreach my $white_char (" ", "\n", "\t") {
        my $found = index($string, $white_char);
        # keep the earliest hit (the old test "!$pos==-1" was always false,
        # so newlines and tabs were never taken into account)
        if($found != -1 && ($position == -1 || $found < $position)){
            $position = $found;
        }
    }
    return substr($string,$position+1);
}
# Removes an incomplete word from the end of the right context.
# A trailing space is simply dropped; otherwise the string is cut at its
# last white character (space, newline or tab), which is removed as well.
# A string without white characters loses its last character.
sub trim_right{
    my $string = shift;
    my $last = length($string) - 1;
    return substr($string, 0, $last) if substr($string, $last, 1) eq " ";
    my $cut = rindex($string, " ");
    foreach my $white_char ("\n", "\t") {
        my $found = rindex($string, $white_char);
        $cut = $found if $found > $cut;
    }
    return substr($string, 0, $cut);
}
# Called at the end of input or when an inconsistency is detected: prints
# (and removes) all entries that still have a center, oldest first, with
# whatever right context has been collected so far.
sub eof_or_inconsistency{
    my ($LEFT_REF, $CENTER_REF, $RIGHT_REF) = @_[0..2];
    # bdis, edis, white, column, trim, left_type, right_type
    my @display_args = @_[3..9];
    while (@{$CENTER_REF}) {
        print_and_remove(0, $LEFT_REF, $CENTER_REF, $RIGHT_REF, @display_args);
    }
}

42
app/src/cor/Makefile Normal file
View File

@ -0,0 +1,42 @@
# Flags for compiling and linking in one step
PAR=-Wno-deprecated -m32 -fpermissive
# -static
# Flags for compiling object files
PAR2=-c -Wno-deprecated -m32 -fpermissive
LIB_PATH=../lib
COMMON_PATH=../common
# Header the shared common.cc has to include when built for this tool
CMDLINE_FILE='"../cor/cmdline.h"'

# Commands, not files.
.PHONY: copy clean clean.cmdline

cor: main.cc corr.o $(LIB_PATH)/word.o \
	$(LIB_PATH)/auttools.o cmdline.c common_cor.o common.o
	g++ $(PAR) main.cc corr.o common.o \
	$(LIB_PATH)/word.o $(LIB_PATH)/auttools.o cmdline.c common_cor.o \
	-o cor

corr.o: corr.cc corr.hh
	g++ $(PAR2) corr.cc

common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \
	$(COMMON_PATH)/common.h
	g++ $(PAR2) -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc

common_cor.o: cmdline.h common_cor.cc common_cor.h
	g++ $(PAR2) common_cor.cc

# The command line parser is generated by gengetopt from the tool specific
# options followed by the options common to all tools.
cmdline.c cmdline.h: cmdline.ggo
	gengetopt -i cmdline.ggo --conf-parser

cmdline.ggo: cmdline_cor.ggo ../common/cmdline_common.ggo
	cat cmdline_cor.ggo ../common/cmdline_common.ggo > cmdline.ggo

# Copies the tool into the distribution tree (only when the parent
# makefile exported UTT_BIN_DIR).
copy:
ifdef UTT_BIN_DIR
	cp cor ${UTT_BIN_DIR}
endif

# rm -f: missing files are not an error
clean: clean.cmdline
	rm -f *.o
	rm -f cor

clean.cmdline:
	rm -f cmdline.*

View File

@ -0,0 +1,8 @@
package "cor"
version "0.1"
option "dictionary-home" - "Dictionary home dir." string typestr="FILENAME" no hidden
option "dictionary" d "Dictionary" string typestr="FILENAME" default="cor.bin" no
option "distance" n "Maximal edit distance." int default="1" no
option "replace" r "Replace original form with corrected form, place original form in the cor field. This option has no effect in single mode" flag off
#option "single" - "Place all alternatives in the same line" flag off

19
app/src/cor/common_cor.cc Normal file
View File

@ -0,0 +1,19 @@
#include <stdlib.h>
#include <string.h>
#include "common_cor.h"
// Path of the dictionary file used by cor; filled by process_cor_options().
char dictionary[256];

// Determines the dictionary path from the command line options:
// --dictionary wins; otherwise, when both --dictionary-home and the
// language are given, "<home>/<language>/cor.bin" is used. In any other
// case dictionary is left unchanged.
void process_cor_options(gengetopt_args_info* args)
{
  if(args->dictionary_given)
  {
    expand_path(args->dictionary_arg,dictionary);
  }
  else if (args->dictionary_home_given && args->language_given)
  {
    char buf[255];
    expand_path(args->dictionary_home_arg, buf);
    // bounded: a long home directory or language name must not overflow
    snprintf(dictionary,sizeof(dictionary),"%s/%s/cor.bin",buf,args->language_arg);
  }
}

19
app/src/cor/common_cor.h Normal file
View File

@ -0,0 +1,19 @@
#ifndef __COMMON_COR_H
#define __COMMON_COR_H
#include <stdio.h>
#define _CMDLINE_FILE "../cor/cmdline.h"
#include "../common/common.h"
#include "cmdline.h"
#define DICT_FILE "cor.bin"
extern int change_count;
extern void process_cor_options(gengetopt_args_info* args);
extern char dictionary[];
#endif

142
app/src/cor/corr.cc Normal file
View File

@ -0,0 +1,142 @@
//---------------------------------------------------------------------------
#include "corr.hh"
#define MAXPATH 256
#define min(x,y) ((x<y)?(x):(y))
#define max(x,y) ((x>y)?(x):(y))
// Computes the edit distance table entry for X[0..i] versus Y[0..j] from
// the neighbouring entries of H2 (insertion, deletion, substitution and
// transposition of adjacent characters each cost 1).
// Indices -1 and -2 denote the border rows/columns.
int Corr::ed(int i,int j)
{
  if(i==-1)
    return j+1;
  if(j==-1)
    return i+1;
  if(i==-2 || j==-2)
    return n+1;
  if(X[i]==Y[j])
    return H2[i-1][j-1];
  // A transposition needs two characters on both sides; without the
  // i>0 && j>0 guard X[-1] / Y[-1] would be read.
  if(i>0 && j>0 && X[i-1]==Y[j] && X[i]==Y[j-1])
    return 1+min(H2[i-2][j-2],min(H2[i][j-1],H2[i-1][j]));
  return 1+min(H2[i-1][j-1],min(H2[i][j-1],H2[i-1][j]));
}
// Cut-off distance for column j: the smallest H2[k][j] over the rows k in
// [max(0,j-t), min(m,j+t)], but never more than j+t (which is also the
// result when that range is empty).
int Corr::cuted(int j)
{
  const int first=max(0,j-t);
  const int last=min(m,j+t);
  int best=j+t;
  int k=first;
  while(k<=last)
  {
    best=(H2[k][j]<best) ? H2[k][j] : best;
    ++k;
  }
  return best;
}
/*
void Corr::recomputeH(int j)
{
for(int i=0;i<=m;i++)
H[(i)+2][(j)+2]=ed(i,j);
}
*/
// Recomputes column j of the distance table, restricted to the rows
// max(0,j-t-2) .. min(m,j+t+2), in ascending row order.
void Corr::recomputeH(int j)
{
  const int last=min(m,j+t+2);
  int i=max(0,j-t-2);
  while(i<=last)
  {
    H2[i][j]=ed(i,j);
    ++i;
  }
}
// Finds all dictionary words whose edit distance from w is at most t.
// The dictionary automaton is walked depth-first; Y holds the candidate
// prefix spelled by the current path and H the distance table between X (=w)
// and Y. A branch is abandoned as soon as cuted() shows that the prefix can
// no longer lead to a word within the threshold.
// Every word found is added to tab; the number of words found is returned.
// Words too long for the fixed-size work buffers yield 0.
int Corr::correct(const char* w, Words& tab)
{
  // X, Y and both dimensions of H have 100 elements. The table is indexed
  // up to strlen(w)+t+3, so anything longer would overflow the buffers.
  // The bound is deliberately conservative.
  const size_t wlen=strlen(w);
  if(t<0 || wlen+t+4>100)
    return 0;

  long int path[MAXPATH]={0};

  int i; // row index (X)
  int j; // column index (Y)

  long state=0;

  strcpy(X,w);
  m=strlen(X)-1;
  n=m+t;

  // border rows/columns of the table
  for(i=(-2);i<=m;i++)
    H[(i)+2][(-2)+2]=n;
  for(i=(-1);i<=m;i++)
    H[(i)+2][(-1)+2]=(i)+1;
  for(j=(-2);j<=n;j++)
    H[(-2)+2][(j)+2]=n;
  for(j=(-1);j<=n;j++)
    H[(-1)+2][(j)+2]=(j)+1;

  // everything else starts above the threshold
  for(j=0; j<=n; ++j)
    for(i=0; i<=m; ++i)
      H[i+2][j+2]=t+1;

  int more=1;

  strcpy(Y,"");

  j=0;
  state=0;
  int count=0;

  while(more)
  {
    if(!empty(state))
    {
      Y[j]=input(state);
      recomputeH(j);
      if(cuted(j)<=t)
      {
        if(final(next(state)) && H[(m)+2][(j)+2]<=t)
        {
          // NOTE(review): ownership of this buffer depends on what
          // Words::add does with the pointer (not visible here) - it is
          // leaked if add() makes its own copy.
          char* out=new char[j+2];
          strncpy(out,Y,j+1);
          out[j+1]='\0';
          tab.add(out);
          count++;
        }
        // descend
        path[j++]=state;
        state=next(state);
        continue;
      }
      else
        if(continued(state))
        {
          // try the next transition of the same state
          state++;
          continue;
        }
    }
    //backtracking
    do
      if(j>0)
        j--;
      else
        more=0;
    while(more && !continued(path[j]));
    state=path[j]+1;
  }
  return count;
}
//---------------------------------------------------------------------------

34
app/src/cor/corr.hh Normal file
View File

@ -0,0 +1,34 @@
//---------------------------------------------------------------------------
#ifndef _corr_hh
#define _corr_hh
//---------------------------------------------------------------------------
#include "../lib/tfti.h"
#include "../lib/word.h"
// Spelling corrector: a dictionary automaton (TFTiv<char,char>) extended
// with a search for all words within edit distance t of a given string.
class Corr : public TFTiv<char,char>
{
private:
// distance table; rows/columns 0 and 1 hold the borders for the indices -2
// and -1, so the entry for (i,j) is stored at H[i+2][j+2]
int H[100][100];
char X[100]; // misspelled string
char Y[100]; // (possibly partial) candidate string
int m; // length of X
int n; // maximal length of Y
// NOTE(review): correct() sets m=strlen(X)-1 and n=m+t, i.e. both hold the
// last index rather than the length
// distance table entry for (i,j)
int ed(int,int);
// cut-off distance for column j
int cuted(int);
// recomputes column j of the table
void recomputeH(int);
public:
// view of H shifted by two rows and two columns: H2[i][j] is H[i+2][j+2],
// which makes the indices -2 and -1 usable
int (*H2)[100];
int t; // threshold
Corr() : H2((int(*)[100])&H[2][2]) {};
Corr(const char* a) : TFTiv<char,char>(a), H2((int(*)[100])&H[2][2]) { };
// adds all dictionary words within distance t of w to tab, returns their number
int correct(const char* w, Words& tab);
};
//---------------------------------------------------------------------------
#endif

155
app/src/cor/main.cc Normal file
View File

@ -0,0 +1,155 @@
#include <stdlib.h>
#include <ctype.h>
#include "../lib/iotools.h"
#define _CMDLINE_FILE "../cor/cmdline.h"
#include "../common/common.h"
#include "common_cor.h"
#include "corr.hh"
#include "cmdline.h"
#include <locale.h>
int main(int argc, char** argv) {
// setlocale(LC_CTYPE,"");
// setlocale(LC_COLLATE,"");
gengetopt_args_info args;
if(cmdline_parser(argc, argv, &args) != 0)
exit(1);
process_config_files(&args,argv[0]);
process_common_options(&args,argv[0]);
process_cor_options(&args);
Corr cor;
cor.load(dictionary);
cor.t=args.distance_arg;
char line[MAX_LINE+1];
long line_count = 0;
Segment seg;
Words tab;
char form1[MAX_LINE];
char* form;
int formcasing;
char corfield[MAX_LINE]="";
while (fgets(line, MAX_LINE, inputf))
{
// strcpy(outline,line);
++line_count;
// if(!seg.parse(line))
// {
// fprintf(stderr,"Input error in line %d.\n",line_count);
// exit(1);
// }
char outline[128];
//printf("Starting cor... searching for %d fields\n", args.input_field_given);
//for (int i=0; i<args.input_field_given; ++i) {
// printf("\t%d. %s\n", i, args.input_field_arg[i]);
//}
if (!process_seg(line, args))
fputs(line, outputf);
else
{
char form[MAX_FORM];
tab.clear();
getfield(line,input_field_prefix,form);
if (form==NULL) continue;
formcasing=3;
cor.correct(form, tab);
if( tab.count() == 0 )
{
formcasing=casing(form);
if( formcasing == 1 || formcasing == 2)
tolowers(form, form1), cor.correct(form1, tab);
}
if ( tab.count() == 0)
fputs(line, failedf);
else
{
if(args.replace_flag)
{
char corfield[128];
strcpy(corfield, input_field_prefix);
strcat(corfield, form);
seg.aux[seg.auxn]=corfield;
++seg.auxn;
for(int i=0; i<tab.count(); ++i)
{
seg.form=tab[i].form();
restorecasing(seg.form,seg.form,formcasing);
seg.print(outline);
fputs(outline, outputf);
}
--seg.auxn;
}
else
{
if(one_line)
{
char* p=corfield;
for(int i=0; i<tab.count(); ++i)
{
restorecasing(tab[i].form(),tab[i].form(),formcasing);
p += sprintf(p," %s%s",output_field_prefix,tab[i].form());
}
sprintf(p,"\n");
strcpy(outline,line);
outline[strlen(outline)-1]='\0';
strcat(outline,corfield);
fputs(outline, outputf);
}
else if(one_field)
{
char* p=corfield;
p += sprintf(p," %s",output_field_prefix);
for(int i=0; i<tab.count(); ++i)
{
restorecasing(tab[i].form(),tab[i].form(),formcasing);
p += sprintf(p,(i==0)?"%s":";%s",tab[i].form());
}
sprintf(p,"\n");
strcpy(outline,line);
outline[strlen(outline)-1]='\0';
strcat(outline,corfield);
fputs(outline, outputf);
}
else
{
for(int i=0; i<tab.count(); ++i)
{
restorecasing(tab[i].form(),tab[i].form(),formcasing);
sprintf(corfield," %s%s\n",output_field_prefix,tab[i].form());
strcpy(outline,line);
outline[strlen(outline)-1]='\0';
strcat(outline,corfield);
fputs(outline, outputf);
}
}
}
}
}
if(args.interactive_flag)
{
fflush(outputf);
fflush(failedf);
}
}
cmdline_parser_free(&args);
}

13
app/src/fla/Makefile Normal file
View File

@ -0,0 +1,13 @@
# Commands, not files.
.PHONY: copy clean uninstall

fla: fla.c
	gcc -static -o fla fla.c

# Copies the tool into the distribution tree (only when the parent
# makefile exported UTT_BIN_DIR).
copy:
ifdef UTT_BIN_DIR
	cp fla ${UTT_BIN_DIR}
endif

# rm -f: a missing binary is not an error
clean:
	rm -f fla

uninstall:

46
app/src/fla/fla.c Normal file
View File

@ -0,0 +1,46 @@
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char buf[5001];
/*
 * fla: joins the lines of standard input into longer records.
 * A line matching the pattern (argv[1], default: a BOS segment) starts a
 * new output line; every other line is appended to the current output line
 * after the separator character (argv[2] as a character code, default
 * form feed). The output always ends with a newline.
 */
int main(int argc, char **argv)
{
  char *pattern;
  char eoln;
  regex_t re;
  int firstline=1;
  int complete=1; /* the previous chunk ended an input line */

  if(argc < 2)
    /* pattern="[ \t]*([0-9]+[ \t]+){2}EOS([ \t].*)?"; */
    pattern="[ \t]*BOS([ \t].*)?";
  else
    pattern=argv[1];

  if(argc < 3)
    eoln='\f';
  else
    eoln=atoi(argv[2]);

  if(regcomp(&re, pattern, REG_EXTENDED|REG_NOSUB) !=0)
  {
    fprintf(stderr,"Invalid pattern.\n");
    exit(1);
  }

  while(fgets(buf,5000,stdin))
  {
    size_t len=strlen(buf);
    /* Only a real line end is stripped: a chunk of an over-long line or a
       last line without '\n' keeps all of its characters. */
    int ends=(len>0 && buf[len-1]=='\n');
    if(ends)
      buf[len-1]='\0';

    /* A separator is written only in front of the beginning of an input
       line, never in the middle of an over-long one. */
    if(complete)
    {
      if(firstline)
        firstline=0;
      else
        if(regexec(&re, buf, (size_t)0, NULL, 0) == 0)
          putchar('\n');
        else
          putchar(eoln);
    }

    fputs(buf,stdout);
    complete=ends;
  }

  putchar('\n');
  regfree(&re);
  return 0;
}

7
app/src/gph/Makefile Normal file
View File

@ -0,0 +1,7 @@
# copy is a command, not a file.
.PHONY: copy

# gph is a ready-to-run script; there is nothing to build.
gph:

# Copies the tool into the distribution tree (only when the parent
# makefile exported UTT_BIN_DIR).
copy:
ifdef UTT_BIN_DIR
	cp gph ${UTT_BIN_DIR}
endif

85
app/src/gph/gph Executable file
View File

@ -0,0 +1,85 @@
#!/usr/bin/perl
use Getopt::Long;
my @process;
my $help=0;
my $reset;
my $interactive=1;
GetOptions("process|p=s" => \@process,
"help|h" => \$help,
"reset|r=s" => \$reset,
"interactive|i" => \$interactive);
if($help)
{
print <<'END'
Usage: gph [OPTIONS]
Options:
-p tag Process segments with this tag as nodes.
-r tag Start new graph at this tag.
-f filename Input file (NIE DZIALA).
-o filename Output file (NIE DZIALA).
-i Toggle interactive mode (default=on).
END
;
exit 0;
}
$|=1 if $interactive;
# Main loop: copies every input line to the output; lines whose tag
# (field 2) is one of the --process tags get a field "gph:<n>:<preds>"
# appended, where <n> is the node number and <preds> the comma separated
# numbers of the nodes that end exactly where this segment starts.
# Each entry of @prev is [node number, start position, length] of a node
# that may still become a predecessor.
my @prev;
# number of the next node
my $n=0;
while(<>)
{
chomp;
my $do=0;
# fields: 0 - position, 1 - length, 2 - tag, ...
my @line = split /\s+/;
# the reset tag starts a new graph
if($line[2] eq $reset)
{
$n=0;
@prev = ();
}
for my $p (@process)
{
$do=1 if $line[2] eq $p;
}
if($do)
{
@preds = ();
# forget the nodes that end before this segment starts
shift @prev while @prev+0 && $prev[0]->[1] + $prev[0]->[2] < $line[0];
for my $p (@prev)
{
push(@preds, $p->[0]) if $p->[1] + $p->[2] == $line[0];
}
push @prev, [$n, $line[0], $line[1]];
$gph=' gph:'.$n.':'.join(',',@preds);
$n++;
}
else
{
# a segment that is not a node extends every node ending right before it,
# so that the nodes on both sides of it become adjacent
for my $p (@prev)
{
if($p->[1]+$p->[2] == $line[0])
{
$p->[2] += $line[1];
}
}
$gph='';
}
print $_.$gph."\n";
}

6
app/src/grp/Makefile Normal file
View File

@ -0,0 +1,6 @@
# Commands, not files.
.PHONY: main copy

# grp is a ready-to-run script; there is nothing to build.
main:

# Copies the tool into the distribution tree (only when the parent
# makefile exported UTT_BIN_DIR).
copy:
ifdef UTT_BIN_DIR
	cp grp ${UTT_BIN_DIR}
endif

154
app/src/grp/grp Executable file
View File

@ -0,0 +1,154 @@
#!/usr/bin/perl
#package: UAM Text Tools
#component name: gre
#author: Tomasz Obrêbski
use strict;
use Getopt::Long;
my $LIB_DIR="/usr/local/lib/utt"; # katalog zawierajacy terms.m4
my $systemconfigfile="/usr/local/etc/utt/grp.conf";
my $userconfigfile="$ENV{'HOME'}/.utt/grp.conf";
Getopt::Long::Configure('no_ignore_case_always');
my $help=0;
my $pattern=0;
my $matches_only=0;
my $macrofile=0;
my $define=0;
my $show_command=0;
my $action="pgP";
my $eos="seg(EOS)";
my $morfield='lem';
# Read the configuration files: first the system-wide one, then the user's,
# so that user settings override system settings. Files that cannot be
# opened are skipped. Each line has the form "name = value"; '#' starts a
# comment. Command line options (processed afterwards) override both.
my $file;
foreach $file ($systemconfigfile, $userconfigfile){
    if(open(CONFIG, $file)){
        while (<CONFIG>) {
            chomp;
            s/#.*//;
            s/^\s+//;
            s/\s+$//;
            next unless length;
            my ($name, $value) = split(/\s*=\s*/, $_, 2);
            if(($name eq "pattern")or($name eq "e")){
                $pattern=$value;
            }
            elsif(($name eq "eos")or($name eq "E")){
                $eos=$value;
            }
            elsif($name eq "morph"){
                $morfield=$value;
            }
            elsif($name eq "macros"){
                $macrofile=$value;
            }
            elsif($name eq "define"){
                $define=$value;
            }
            elsif($name eq "command"){
                $show_command=1;
            }
            elsif($name eq "action"){
                # the value used to be silently dropped ("$action;")
                $action=$value;
            }
            elsif(($name eq "help")or($name eq "h")){
                $help=1;
            }
        }
        close CONFIG;
    }
}
#########################################################
# Command line options. -a is accepted as the short form of --action, as
# the help text promises.
# NOTE(review): --define stores its value in $macrofile (i.e. it behaves
# like --macros) and $define is not used further in this script, although
# the help text says "Add macrodefinitions" - confirm the intended meaning.
GetOptions("pattern|e=s" => \$pattern,
           "eos|E=s" => \$eos,
           "morph=s" => \$morfield,
           "macros=s" => \$macrofile,
           "define=s" => \$macrofile,
           "command" => \$show_command,
           "action|a=s" => \$action,
           "help|h" => \$help);
if($help)
{
print <<'END'
Usage: gre [OPTIONS] [file ..]
Options:
--pattern -e PATTERN Pattern.
--eos -E PATTERN Segment serving as sentence delimiter.
--morph=STRING Field containing morphological information (default 'lem').
--macros=FILE Read macrodefinitions from FILE.
--define=FILE Add macrodefinitions from FILE.
--action -a [u][p][g][P] Perform only indicated actions.
u - uncompress with 'lzop -cd'
p - preprocess
g - grep
P - postprocess
(default pgP)
--command Print the shell command to be executed and exit.
--help -h Help.
END
;
exit 0;
}
# A pattern is mandatory whenever the grep stage ('g') is among the actions.
die("$0: no pattern given.\n") unless $pattern || $action !~ /g/;
# Use the macro file given on the command line; otherwise fall back to
# $LIB_DIR/terms.m4 when that file exists. Abort if neither is available.
die("$0: macro file not found") unless
  $macrofile or
  -e "$LIB_DIR/terms.m4" and $macrofile="$LIB_DIR/terms.m4";

# Pipeline stages; each one is enabled by its letter in $action.
my $uncompress = ($action =~ /u/) ? ' lzop -cd | ' : '';
my $preproc = ($action =~ /p/) ? ' fla | ' : '';
my $postproc = ($action =~ /P/) ? ' | unfla ' : '';

# discarding spaces
$pattern =~ s/\s+/\\`'/g; #`
# quoting escaped commas
$pattern =~ s/\\,/\\`\\`\\,''/g;
# quoting commas in {m,n} r.e. operator
# ($1/$2 are used in the replacement; \1/\2 there is discouraged by perlre)
$pattern =~ s/(\{\d*),(\d*\})/$1\\`\\`,''$2/g;

# Expand the macros with m4; the result is the regular expression for egrep.
my $grepre = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`;
die("Incorrect pattern (m4).") if $? >> 8;
chomp $grepre;

# <> expansion
$grepre =~ s/<([^>]+)>/`echo $1 | tag2re`/ge;

# Rewrite Perl-style character classes into bracket expressions.
$grepre =~ s/\./[^ [:cntrl:]]/g;
$grepre =~ s/\\s/[ ]/g;
$grepre =~ s/\\S/[^ [:cntrl:]]/g;
$grepre =~ s/\\d/[0-9]/g;
$grepre =~ s/\\D/[^0-9 [:cntrl:]]/g;
$grepre =~ s/\\w/[a-z±æê³ñ󶼿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_]/g;
$grepre =~ s/\\W/[^a-z±æê³ñ󶼿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_ [:cntrl:]]/g;
# extensions
$grepre =~ s/\\l/[a-z±æê³ñ󶼿]/g; #lowercase letter
$grepre =~ s/\\L/[A-Z¡ÆÊ£ÑÓ¦¬¯]/g; #upercase letter

my $grep_command = ($action =~ /g/) ? "egrep '$grepre'" : " cat ";

# --command: show only the grep stage and stop.
if($show_command)
{
  print $grep_command."\n";
  exit 0;
}

# Replace this process with the assembled pipeline. The uncompress stage was
# computed above but previously left out here, so action 'u' had no effect.
exec $uncompress.$preproc.$grep_command.$postproc;

42
app/src/gue/Makefile Normal file
View File

@ -0,0 +1,42 @@
# Makefile for gue (the guesser).
# PAR  - flags for the final compile-and-link step (static binary)
# PAR2 - flags for compile-only steps
PAR=-Wno-deprecated -O3 -fpermissive -static
PAR2=-c -Wno-deprecated -O3 -fpermissive

LIB_PATH=../lib
COMMON_PATH=../common

# Passed to common.cc as _CMDLINE_FILE so that it includes the option header
# generated for this component.
CMDLINE_FILE='"../gue/cmdline.h"'

# Utility targets that do not produce a file of the same name.
.PHONY: clean clean.cmdline copy

gue: main.cc guess.o $(LIB_PATH)/auttools.o $(LIB_PATH)/word.o \
		cmdline.c common_guess.o common.o
	$(CXX) $(PAR) main.cc guess.o \
	$(LIB_PATH)/auttools.o $(LIB_PATH)/word.o cmdline.c common.o common_guess.o \
	-o gue

guess.o: guess.h guess.cc
	$(CXX) $(PAR2) guess.cc

common_guess.o: cmdline.h common_guess.cc common_guess.h
	$(CXX) $(PAR2) common_guess.cc

common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \
		$(COMMON_PATH)/common.h
	$(CXX) $(PAR2) -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc

# gengetopt writes both cmdline.c and cmdline.h in one run.
# NOTE(review): as two explicit targets this recipe may be started twice
# under "make -j"; kept as in the original.
cmdline.c cmdline.h: cmdline.ggo
	gengetopt -i cmdline.ggo --conf-parser

# Option definitions = guesser-specific options + options shared by all tools.
cmdline.ggo: cmdline_guess.ggo ../common/cmdline_common.ggo
	cat cmdline_guess.ggo ../common/cmdline_common.ggo > cmdline.ggo

clean: clean.cmdline
	$(RM) *.o
	$(RM) gue

clean.cmdline:
	$(RM) cmdline.*

# Install the binary into the distribution tree. UTT_BIN_DIR is exported by
# the top-level Makefile; it is left unquoted on purpose because the exported
# value may carry trailing whitespace from its "# comment" line there.
copy:
ifdef UTT_BIN_DIR
	cp gue ${UTT_BIN_DIR}
endif

View File

@ -0,0 +1,12 @@
# gengetopt option definitions specific to the guesser.
# The gue Makefile concatenates this file with ../common/cmdline_common.ggo
# into cmdline.ggo before running gengetopt.
package "guess"
version "0.1"
option "guess_count" n "Guess up to n descriptions" int default="0" no
# NOTE(review): process_guess_options() falls back to delta=0.1 and cut-off=100
# when these options are not given explicitly, so the defaults printed here
# (0.2 / 200) are not the values actually used - confirm which is intended.
option "delta" - "Stop displaying answers after fall of weight" float default="0.2" no
option "cut-off" - "Do not display answers with less weight than cut-off" int default="200" no
# Must take a string argument: process_guess_options() reads dictionary_home_arg.
# Declared the same way as the matching option in cmdline_lem.ggo.
option "dictionary-home" - "dh" string typestr="FILENAME" hidden no
option "dictionary" d "File with dictionary information" string typestr="filename" default="~/.utt/lang/pl_PL.ISO-8859-2/gue.bin" no
option "per-info" v "Display performance information" flag off
option "weights" w "Print weights" flag off hidden
option "no-uppercase" - "Do not process form containing uppercase letters" flag off

View File

@ -0,0 +1,50 @@
#include <stdlib.h>
#include <string.h>
#include "common_guess.h"
// Guesser settings shared with main.cc (declared extern in common_guess.h).
// The initial values below are overwritten by process_guess_options().
int guess_count=0;      // maximum number of guesses printed per word
double delta=0.1;       // main.cc stops printing when a weight falls below delta * previous weight
int cut_off=100;        // main.cc stops at a non-first guess whose weight is below this value
char dictionary[255];   // path of the guesser dictionary file
bool per_info=false;    // print performance information at the end of the run
bool weights=true;      // when true, main.cc prints descriptions WITHOUT weights; set to false by the "weights" option
/*
 * Copies the guesser-specific options from the parsed command line /
 * configuration (args) into the module-level settings above.
 *
 * Dictionary path:
 *   - an explicitly given "dictionary" option wins;
 *   - otherwise, if both "dictionary-home" and "language" were given, the path
 *     is <dictionary-home>/<language>/<DIC_FILE>;
 *   - otherwise `dictionary` is left untouched.
 *
 * guess_count: 0 (or not given) means "no explicit limit" and is mapped to 100.
 * delta / cut_off: fall back to 0.1 / 100 when not given.
 */
void process_guess_options(gengetopt_args_info* args)
{
  if(args->dictionary_given)
  {
    expand_path(args->dictionary_arg,dictionary);
  }
  else if (args->dictionary_home_given && args->language_given)
  {
    char buf[255];
    expand_path(args->dictionary_home_arg, buf);
    // Use the guesser's own dictionary name (DIC_FILE, "gue.bin"); the
    // previous hard-coded "lem.bin" pointed at the lemmatizer dictionary.
    // snprintf keeps the write inside `dictionary`.
    snprintf(dictionary, sizeof(dictionary), "%s/%s/%s",
             buf, args->language_arg, DIC_FILE);
  }

  guess_count = args->guess_count_given ? args->guess_count_arg : 0;
  if(guess_count==0)
    guess_count=100;

  delta = args->delta_given ? args->delta_arg : 0.1;

  cut_off = args->cut_off_given ? args->cut_off_arg : 100;

  if(args->per_info_given)
    per_info=args->per_info_flag;

  // `weights` is an inverted switch: main.cc prints weights when it is false.
  if(args->weights_given)
    weights=false;
}

View File

@ -0,0 +1,20 @@
// Declarations of the guesser-specific settings and of the function that
// fills them from the parsed options (definitions are in common_guess.cc).
#ifndef __COMMON_GUESS_H
#define __COMMON_GUESS_H
#include <stdio.h>
// Selects the generated option header that ../common/common.h should use.
#define _CMDLINE_FILE "../gue/cmdline.h"
#include "../common/common.h"
#include "cmdline.h"
// File name of the guesser dictionary.
#define DIC_FILE "gue.bin"
extern int guess_count;
extern double delta;
extern int cut_off;
extern char dictionary[];
extern bool per_info;
extern bool weights;
// Copies the guesser options from args into the variables declared above.
void process_guess_options(gengetopt_args_info* args);
#endif

138
app/src/gue/guess.cc Normal file
View File

@ -0,0 +1,138 @@
#include "guess.h"
#include <string.h>
#include <iostream.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
// Values for the `source` argument of Guess::add_word_prob().
// Only DICT is acted upon in the code visible in this file.
#define DICT 1
#define COR 2
#define DICT_P 3
#define COR_P 4
// Weighting factors; referenced only from commented-out code in this file.
#define W_PRE 0.1
#define W_SUF 0.9
// Separator character passed to the first TFTiv::pref() lookup in Guess::ana().
#define PREF_SIGN '_'
// Constructs the guesser; suf_file is handed to the _suf member's constructor.
// The body contains only commented-out code from an earlier,
// pointer-based version with separate suffix and prefix dictionaries.
Guess::Guess(const char* suf_file)
: _suf(suf_file) {
/* _suf = NULL;
_pref = NULL;
if (strlen(suf_file) > 0)
_suf = new TFTiv<char, char>(suf_file);
if (strlen(pref_file) > 0)
_pref = new TFTiv<char, char>(corp_file);
*/
}
// File-level scratch state used by Guess::ana(). Because it is shared,
// ana() is not reentrant.
char buf[MAX_LINE];              // the analysed word, reversed
char out[MAX_LINE];              // one continuation path read from the dictionary
char* buf0_s = buf;              // cursor into buf, passed to _suf.pref()
char* word_t = NULL;             // copy of the word being analysed (not reversed)
long state_s = 0;                // dictionary state returned by _suf.pref()
unsigned length_s = buf0_s - buf; // distance the cursor moved inside buf; 0 here
long len = 0;                    // result of the last _suf.cont() call
int i=0;                         // number of descriptions found by the current ana() call
int Guess::ana(const char* word, Words& result) {
assert(word && &result);
/* Word zawiera wyraz, ktory mamy zbadac.
* Nalezy przepisac go w odwrotnej kolejnosci do bufora,
* znalezc najdluzszy prefiks pasujacy do tego bufora
* separatorem jest '/' - za tym znakiem znajduje sie
* prawdopodobienstwo wystapienia danego opisu */
buf0_s = buf;
word_t = strdup(word);
if (reverse(word, buf) != 0)
return -1;
state_s = -1;
// printf("#buf0_s=%s, ", buf0_s);
state_s = _suf.pref(buf0_s, PREF_SIGN);
// printf("#word=%s, buf0_s=%s\t", word, buf0_s);
/* jezeli state_s != -1 to oznacza, ze w slowniku jest zawarta
* informacja o prefiksie tego slowa.
* nie jest ona odwrocona, wiec porownujemy do word a nie do buf
*/
// printf("state_s=%d\t", state_s);
if (state_s != -1) {
state_s = _suf.pref(word_t, '~', state_s);
// printf("state_s(wp)=%d, word_t=%s, word=%s\n", state_s, word_t, word);
}
if (state_s == -1) {
// if (_suf != NULL)
buf0_s = buf;
state_s = _suf.pref(buf0_s, '~');
// printf("state_s=%d\n", state_s);
}
length_s = buf0_s - buf;
/* state jest stanem, od ktorego zaczyna sie sciezka opisujaca
* prawdopodobienstwo przeciwienstwa wystapienia opisu
* znajdujacego sie dalej na tej sciezce.
* Im mniejsza wartosc liczby tym wieksze prawdopodobienstwo */
len = 0;
i=0;
// if (_suf != NULL)
len = _suf.cont(state_s, out);
while (len > 0) {
i++;
add_word_prob(result, word, out, length_s, DICT);
len = _suf.cont(-1, out);
}
return i;
}
/*
 * Adds `word` to `tab` together with the weight and the description encoded
 * in `path`, which has the form "<weight>;<description>".
 *
 *   len    - stored as the suffix length of the new entry
 *   source - only DICT causes the suffix length and weight to be recorded
 *
 * Returns the index returned by tab.add().
 */
int Guess::add_word_prob(Words& tab, const char* word, const char* path, unsigned len, int source) {
  // Work on a bounded local copy so it can be split in place.
  char p[MAX_LINE];
  strncpy(p, path, MAX_LINE-1);
  p[MAX_LINE-1] = '\0';

  // Split at the first ';'. When there is none, the description is empty;
  // the previous code then pointed one byte past the terminating NUL.
  char* desc = p + strcspn(p, ";");
  if (*desc == ';') {
    *desc = '\0';
    ++desc;
  }

  int i = tab.add(word, desc);

  if (source==DICT) {
    tab[i].len_suf(len);
    tab[i].w_suf(atof(p));   // p now holds only the weight part
  }

  return i;
}

56
app/src/gue/guess.h Normal file
View File

@ -0,0 +1,56 @@
#include "../lib/tfti.h"
#include "../lib/word.h"
#include <sys/timeb.h>
/**************************************************************
 * Contains the definition of the Guess class.                *
 *                                                            *
 * The class determines the description of a word that is not *
 * present in the dictionary, together with the probability   *
 * of its occurrence.                                         *
 *************************************************************/
// Guesser: proposes descriptions for words missing from the dictionary.
class Guess {
public:
// suf_file: name of the dictionary file
Guess(const char* suf_file);
// Fills `result` with the descriptions of `word` together with the
// probability of their occurrence; returns the number of descriptions found.
int ana(const char* word, Words& result);
long time_overall;
private:
// suffixes
TFTiv<char, char> _suf;
// prefixes (not initialised by the constructor's initialiser list)
TFTiv<char, char> _pref;
// Reverses the string src into dest (dest must hold strlen(src)+1 bytes).
// Always returns 0.
int reverse(const char* src, char* dest) {
// assert((src != NULL) && (dest != NULL));
const char* c = src;
int len = strlen(src);
for (int i=1; i<=len; ++i) {
dest[i-1] = src[len-i];
}
dest[len] = '\0';
return 0;
}
// adds a new element to the result table
int add_word_prob(Words& tab, const char* word, const char* path, unsigned len, int source);
};

192
app/src/gue/main.cc Normal file
View File

@ -0,0 +1,192 @@
#include <time.h>
#include <stdlib.h>
#include "../lib/iotools.h"
#define _CMDLINE_FILE "../gue/cmdline.h"
#define CONFIGFILE1 "/home/ynka/utt/utt-0.9/conf/gue.conf"
#define CONFIGFILE2 "/home/ynka/utt/utt-0.9/conf/gue.conf"
#include "../common/common.h"
#include "common_guess.h"
#include "guess.h"
#include "cmdline.h"
#define W_SUFF 0.6
#define W_PREF 0.4
/*
 * gue: reads segments from inputf, guesses descriptions for the segments
 * selected by process_seg() and writes the annotated segments to outputf.
 * Segments for which nothing could be guessed go to failedf
 * (unless the no-fail flag is set).
 *
 * Output layout of the guesses:
 *   default    - one output segment per guess;
 *   one-line   - one segment, every guess in a field of its own;
 *   one-field  - one segment with ONE field: different lemmas are separated
 *                with ';', successive descriptions of one lemma with ','.
 */
int main(int argc, char** argv) {
  gengetopt_args_info args;

  if(cmdline_parser(argc, argv, &args) != 0)
    exit(1);

  process_config_files(&args,argv[0]);
  process_common_options(&args,argv[0]);
  process_guess_options(&args);

  char line[MAX_LINE];
  char outline[MAX_LINE];
  char parms[MAX_LINE];
  long line_count = 0;

  Guess guess(dictionary);
  int words_count=0;
  time_t start_time = time(NULL);

  Segment seg;
  Words tab;
  char* form;

  while (fgets(line, MAX_LINE, inputf)==line) {
    line_count++;

    // Drop the trailing newline only when there is one (the last line of the
    // input may lack it; previously its last character was cut off).
    size_t line_len = strlen(line);
    if (line_len > 0 && line[line_len-1] == '\n')
      line[line_len-1] = '\0';

    if (!seg.parse(line)) {
      fprintf(stderr, "B³±d w wej¶ciu (linia: %ld)\n", line_count);
      return -1;
    }

    if (process_seg(seg, args)) {
      words_count++;
      tab.clear();

      if (args.input_field_given>0) {
        form = getInput(args.input_field_arg, args.input_field_given, seg);
      } else
        form = seg.form;

      if (NULL == form) {
        continue;
      }

      guess.ana(form, tab);

      if ((tab.count()==0) && (!args.no_fail_flag)) {
        // no guesses - analysis was unsuccessful
        seg.print(outline); //this is necessary - seg.parse destroys line...
        fputs(outline, failedf);
        if (copy_processed)
          fputs(line, stdout);
        continue;
      }

      // we've got some guesses. Do we want to print it?
      if (args.only_fail_flag)
        continue;

      float last_weight=0;
      int i=0;
      int count=0;
      unsigned first=1;

      // Start every segment with an empty field buffer and an empty
      // "previous lemma"; both used to be read uninitialised.
      parms[0] = '\0';
      char* parms_end = parms;
      char last_lemma[MAX_LINE];
      last_lemma[0] = '\0';

      // NOTE(review): parms is filled with unbounded sprintf calls; in
      // one-field mode many long guesses could exceed MAX_LINE.
      while ((i=tab.next()) != -1 && count++<guess_count) {
        // Stop at a non-first guess below cut_off, or at any guess whose
        // weight dropped below delta * previous weight.
        if (((!first) && (tab[i].w_suf() < cut_off)) || (tab[i].w_suf() < delta * last_weight)) {
          break;
        }

        if (first) {
          parms_end += sprintf(parms_end, "%s", field_prefix);
        } else if (!args.one_field_flag)
          parms_end += sprintf(parms_end, "%s", field_prefix);

        if (!args.one_field_flag || strcmp(last_lemma, tab[i].lemma()) != 0) {
          if (args.one_field_flag && !first)
            parms_end += sprintf(parms_end, ";");
          parms_end += sprintf(parms_end, "%s", tab[i].lemma());
          strcpy(last_lemma, tab[i].lemma());
        }

        first=0;
        last_weight = tab[i].w_suf();

        // `weights` is an inverted switch: false means "print the weights".
        if (!weights)
          parms_end += sprintf(parms_end, ",%s:%d", tab[i].descr(), (int)tab[i].w_suf());
        else
          parms_end += sprintf(parms_end, ",%s", tab[i].descr());

        if (!args.one_field_flag) {
          seg.addfield(parms);
          parms_end = parms;
        }

        if (!(args.one_field_flag || args.one_line_flag)) {
          seg.print(outline);
          fputs(outline, outputf);
          --seg.auxn;   // drop the field just printed before the next guess
        }
      } //while

      // Add the collected field only when something was written to it.
      if (args.one_field_flag && parms_end != parms)
        seg.addfield(parms);

      if (args.one_field_flag || args.one_line_flag){
        seg.print(outline);
        fputs(outline, outputf);
      }
    } else { // if (process_segment)
      // not a word - copy the token to the output unchanged
      seg.print(outline);
      fputs(outline, outputf);
      if (copy_processed)
        fputs(outline, stdout);
    }
  }

  time_t end_time = time(NULL);
  if (per_info) {
    printf("Liczba s³ów: %d\n", words_count);
    printf("Czas analizy: %ld sekund\n", (long)(end_time-start_time));
  }

  cmdline_parser_free(&args);
  return 0;
}

7
app/src/kot/Makefile Normal file
View File

@ -0,0 +1,7 @@
# kot is a ready-to-run Perl script, so there is nothing to build.

# "copy" is a command, not a file.
.PHONY: copy

kot:

# Install the script into the distribution tree. UTT_BIN_DIR is exported by
# the top-level Makefile; it is left unquoted on purpose because the exported
# value may carry trailing whitespace.
copy:
ifdef UTT_BIN_DIR
	cp kot ${UTT_BIN_DIR}
endif

99
app/src/kot/kot Executable file
View File

@ -0,0 +1,99 @@
#!/usr/bin/perl
use strict;
use Getopt::Long;
# Option defaults; they may be overridden first by the configuration files
# and then by the command line.
my $help=0;
my $gap_fill="\n-----\n";
my $spaces=0;

my $configfile1="../../conf/kot.conf";
my $configfile2="../conf/kot.conf";

# Read the configuration files in order; files that cannot be opened are
# skipped silently. Each line is "name = value"; '#' starts a comment.
my $file;
for $file ($configfile1, $configfile2)
{
    open(CONFIG, $file) or next;
    while (my $entry = <CONFIG>)
    {
        chomp $entry;
        $entry =~ s/#.*//;
        $entry =~ s/^\s+//;
        $entry =~ s/\s+$//;
        next if $entry eq '';

        my ($name, $value) = split(/\s*=\s*/, $entry, 2);
        if ($name eq "gap-fill" || $name eq "g")
        {
            $gap_fill=$value;
        }
        elsif ($name eq "spaces" || $name eq "s")
        {
            # presence of the key is enough; its value is ignored
            $spaces=1;
        }
        elsif ($name eq "help" || $name eq "h")
        {
            $help=1;
        }
    }
    close CONFIG;
}
#########################################################
# Command-line options override the values read from the configuration files.
GetOptions("gap-fill|g=s" => \$gap_fill,
           "spaces|r" => \$spaces,
           "help|h" => \$help);

# The usage text previously described a different tool ("ser") and options
# this script does not accept; it now lists exactly the options parsed above.
if($help)
{
    print <<'END'
Usage: kot [OPTIONS] [file ..]

Options:
   --gap-fill -g STRING    String printed between segments that are not
                           adjacent (default "\n-----\n"). The escape
                           sequences \t \n \r \f are recognized.
   --spaces -r             Print segments of type S as they are, without
                           converting '_' and escape sequences.
   --help -h               Help.
END
;
    exit 0;
}
# Turn the escape sequences written literally in the gap filler into the
# characters they stand for.
for ($gap_fill)
{
    s/\\t/\t/g;
    s/\\n/\n/g;
    s/\\r/\r/g;
    s/\\f/\f/g;
}

my $prevend=-1;   # end offset (start+len) of the previous segment
my $count=0;      # number of gaps seen so far

# Each input line is "<start> <len> <type> <form> ...".
while(my $segment = <>)
{
    my ($start,$len,$type,$form) = $segment =~ /^\s*(\d+)\s+(\d+)\s+(\S+)\s+(\S+)/;

    # A segment that starts after the end of the previous one marks a gap;
    # the filler is printed for every gap except the first one encountered.
    if($start > $prevend && $count++ != 0)
    {
        print $gap_fill;
    }
    $prevend=$start+$len;

    next if $len==0;# || $form eq "*";

    # unescape literal asterisks
    $form =~ s/\\\*/*/g;

    # Space segments: restore the real whitespace unless --spaces was given.
    unless($type ne 'S' || $spaces)
    {
        for ($form)
        {
            s/_/ /g;
            s/\\t/\t/g;
            s/\\n/\n/g;
            s/\\r/\r/g;
            s/\\f/\f/g;
        }
    }

    print $form;
}
#print $gap_fill;
# print "\n";

56
app/src/lem/Makefile Normal file
View File

@ -0,0 +1,56 @@
# Makefile for lem (the dictionary-based lemmatizer).
# PAR  - flags for the final compile-and-link step
# PAR2 - flags for compile-only steps
PAR=-Wno-deprecated -m32 -O3 -fpermissive
#-static
PAR2=-c -Wno-deprecated -m32 -O3 -fpermissive

LIB_PATH=../lib
COMMON_PATH=../common

# Passed to common.cc as _CMDLINE_FILE so that it includes the option header
# generated for this component.
CMDLINE_FILE='"../lem/cmdline.h"'

# Utility targets that do not produce a file of the same name.
.PHONY: clean clean.cmdline copy

# $(LIB_PATH)/auttools.o and $(LIB_PATH)/word.o have no rule here; they are
# built by the Makefile in ../lib.
lem: main.cc lem.o $(LIB_PATH)/auttools.o $(LIB_PATH)/word.o \
		cmdline.c common_lem.o common.o symtab.o
	$(CXX) $(PAR) main.cc lem.o $(LIB_PATH)/auttools.o \
	$(LIB_PATH)/word.o cmdline.c common.o common_lem.o \
	symtab.o -o lem

lem.o: lem.h lem.cc
	$(CXX) $(PAR2) lem.cc

common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \
		$(COMMON_PATH)/common.h
	$(CXX) $(PAR2) -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc

common_lem.o: cmdline.h common_lem.h common_lem.cc
	$(CXX) $(PAR2) common_lem.cc

# gengetopt writes both cmdline.c and cmdline.h in one run.
# NOTE(review): as two explicit targets this recipe may be started twice
# under "make -j"; kept as in the original.
cmdline.c cmdline.h: cmdline.ggo
	gengetopt -i cmdline.ggo --conf-parser

# Option definitions = lem-specific options + options shared by all tools.
cmdline.ggo: cmdline_lem.ggo ../common/cmdline_common.ggo
	cat cmdline_lem.ggo ../common/cmdline_common.ggo > cmdline.ggo

symtab.o: $(LIB_PATH)/symtab.h $(LIB_PATH)/symtab.cc
	$(CXX) $(PAR2) $(LIB_PATH)/symtab.cc

clean: clean.cmdline
	$(RM) *.o
	$(RM) lem

clean.cmdline:
	$(RM) cmdline.*

# Install the binary into the distribution tree. UTT_BIN_DIR is exported by
# the top-level Makefile; it is left unquoted on purpose because the exported
# value may carry trailing whitespace.
copy:
ifdef UTT_BIN_DIR
	cp lem $(UTT_BIN_DIR)
endif

View File

@ -0,0 +1,5 @@
# gengetopt option definitions specific to lem.
# The lem Makefile concatenates this file with ../common/cmdline_common.ggo
# into cmdline.ggo before running gengetopt.
package "lem"
version "0.1"
# Directory under which process_lem_options() looks for <language>/lem.bin.
option "dictionary-home" - "D.h." string typestr="FILENAME" hidden no
# Explicit dictionary file; takes precedence over dictionary-home.
option "dictionary" d "Dictionary" string typestr="FILENAME" default="lem.bin" no

41
app/src/lem/common_lem.cc Normal file
View File

@ -0,0 +1,41 @@
#include <stdlib.h>
#include <string.h>
#include "common_lem.h"
// Path of the dictionary file used by lem (declared extern in common_lem.h).
char dictionary[255];

/*
 * Determines the dictionary path from the parsed options:
 *   - an explicitly given "dictionary" option wins;
 *   - otherwise, if both "dictionary-home" and "language" were given, the path
 *     is <dictionary-home>/<language>/<DICT_FILE>;
 *   - otherwise `dictionary` is left untouched.
 */
void process_lem_options(gengetopt_args_info* args)
{
  if(args->dictionary_given)
  {
    expand_path(args->dictionary_arg,dictionary);
  }
  else if (args->dictionary_home_given && args->language_given)
  {
    char buf[255];
    expand_path(args->dictionary_home_arg, buf);
    // snprintf keeps the write inside `dictionary`; the previous sprintf
    // could overflow it for a long home directory or language name.
    snprintf(dictionary, sizeof(dictionary), "%s/%s/%s",
             buf, args->language_arg, DICT_FILE);
  }
}
// OLD: previous dictionary lookup, kept for reference only
// if(args.dictionary_given)
// strcpy(dictionary, args.dictionary_arg);
// else {
// char path[256];
// //sprintf(path, "/etc/utt/data/%s/%s", args.locale_arg, DICT_FILE);
// //if (file_accessible(path) == 0)
// // strcpy(dictionary, path);
// //else {
// sprintf(path, "%s/%s", utt_dir, DICT_FILE);
// if (file_accessible(path) == 0)
// strcpy(dictionary, path);
// else {
// fprintf(stderr, "Cannot find dictionary!\n");
// exit(1);
// }
// //}
// }

16
app/src/lem/common_lem.h Normal file
View File

@ -0,0 +1,16 @@
// Declarations of the lem-specific settings and of the function that fills
// them from the parsed options (definitions are in common_lem.cc).
#ifndef __COMMON_LEM__H
#define __COMMON_LEM__H
#include <stdio.h>
// Selects the generated option header that ../common/common.h should use.
#define _CMDLINE_FILE "../lem/cmdline.h"
#include "../common/common.h"
#include "cmdline.h"
// File name of the lem dictionary.
#define DICT_FILE "lem.bin"
// Path of the dictionary file, set by process_lem_options().
extern char dictionary[];
// Determines `dictionary` from the parsed options.
extern void process_lem_options(gengetopt_args_info* args);
#endif

152
app/src/lem/lem.cc Normal file
View File

@ -0,0 +1,152 @@
#include "lem.h"
#include <assert.h>
/* Looks up the dictionary descriptions of a word.
 * Parameters:
 *   form - the word,
 *   tab  - reference to the Words table that receives the results
 * Returns:
 *   the number of descriptions added
 */
int Lem::ana(const char* form, Words& tab) {
  // make sure the call parameters are valid
  assert(form && &tab);

  const int before = tab.count();

  // Follow the word through the automaton, then step over the ';' separator.
  const long state = _dict.next(_dict.gtra(0, form, FT::ftMAXPATH), ';');
  if (state >= 0)
    add_to_table(tab, form, state);

  return tab.count() - before;
}
/* Looks up the dictionary description of the next word in the buffer.
 * Parameters:
 *   buf - the buffer
 *   tab - receives the results
 * Returns:
 *   the number of descriptions added
 */
int Lem::pref(char* buf, Words& tab) {
  // make sure the call parameter is valid
  assert(buf);

  int count0 = tab.count();
  long l;
  char* buf0 = buf;

  if((l=_dict.pref(buf, ';'))>=0) {
    // The matched word is the text between buf0 and buf.
    // (assumes _dict.pref() advances buf past the match - TODO confirm
    // against tfti.h)
    int len=buf-buf0;

    // Previously `form` was only NUL-terminated at len and never filled,
    // and len was not checked against the buffer size.
    if (len >= 0 && len < MAX_FORM) {
      char form[MAX_FORM];
      strncpy(form, buf0, len);
      form[len]='\0';
      add_to_table(tab,form,l);
    }
  }

  return tab.count() - count0;
}
/* Appends successive descriptions to the result table.
 * Parameters:
 *   tab - the result table,
 *   f   - the word,
 *   s   - the state at which the first description starts
 */
void Lem::add_to_table(Words& tab, const char* f, long s) {
  // parameter check
  assert(&tab);
  assert(f);

  char des[FT::ftMAXPATH];

  // The first cont() call starts at state s, every later one passes -1.
  for (; _dict.cont(s, des); s=-1)
    // one path may hold several descriptions separated by ';'
    for (char* one=strtok(des, ";"); one!=NULL; one=strtok(NULL, ";")) {
      if (tab.count() >= MAX_ALT) break;
      tab.add(f, one);
    }
}
// Prints every path of the dictionary, one per line, to stdout.
void Lem::prn_dict()
{
  char des[FT::ftMAXPATH];

  // The first cont() call starts at state 0, every later one passes -1.
  for (long s=0; _dict.cont(s, des); s=-1)
    printf("%s\n",des);
}
/*
 * Loads a text dictionary. Each line has the form "<form>;<descriptions>".
 * The form is stored in the symbol table and the descriptions are kept in
 * info[] under the index returned for the form.
 *
 * If the file cannot be opened an error is printed and the dictionary stays
 * empty (previously the NULL stream was passed to fgets).
 */
AuxLem::AuxLem(const char* filename)
: Lem(), _dict(SIZE)
{
  char buf[MAX_LINE+2];

  for(long i=0; i<SIZE; ++i) info[i]=(char*)NULL;

  FILE* f=fopen(filename,"r");
  if(f==NULL)
  {
    fprintf(stderr,"AuxLem: Cannot open dictionary file: %s.\n", filename);
    return;
  }

  while(fgets(buf,MAX_LINE,f))
  {
    int l=strlen(buf);
    if(l>=MAX_LINE-1) continue; // line filled the whole buffer - skipped

    // strip the newline only when there is one
    if(l>0 && buf[l-1]=='\n') buf[l-1]='\0';

    char* sep=strchr(buf,';');
    if(sep==NULL) continue;
    *sep='\0';

    long formind=_dict.add(buf);
    if(formind>=0)
    {
      // SymbolTable::add returns the existing index for a repeated form;
      // release the earlier description instead of leaking it.
      free(info[formind]);
      info[formind]=strdup(sep+1);
    }
    else
      fprintf(stderr,"AuxLem: Form not added: %s;%s.\n", buf,sep+1);
  }
  fclose(f);
}
//---------------------------------------------------------------------------
// Releases every description string stored in info[].
AuxLem::~AuxLem()
{
  long i=SIZE;
  while(i-- > 0)
    if(info[i]!=NULL) free(info[i]);
}
//---------------------------------------------------------------------------
// Looks `form` up in the text dictionary and appends each of its
// ';'-separated descriptions to `tab`. Returns the number of entries added.
int AuxLem::ana(const char* form, Words& tab)
{
  if(form==NULL) return 0;

  const int before=tab.count();
  const long ind=_dict[form];

  if(ind>=0)
  {
    // strtok modifies its argument, so work on a copy of the stored text
    char des[MAX_LINE];
    strcpy(des,info[ind]);

    for(char* alt=strtok(des,";"); alt!=NULL; alt=strtok(NULL,";"))
    {
      if(tab.cnt>=MAXALT) break;
      tab.add(form,alt);
    }
  }

  return tab.count()-before;
}
//---------------------------------------------------------------------------

50
app/src/lem/lem.h Normal file
View File

@ -0,0 +1,50 @@
#include "../lib/tfti.h"
#include "../lib/word.h"
#include "../lib/symtab.h"
#include "../lib/const.h"
// Lemmatizer backed by a compiled dictionary automaton.
class Lem {
protected:
  // Alphabet& _alpha;

  // the dictionary
  TFTiv<char,char> _dict;

  // Appends the descriptions that start at state s to tab, for word f.
  void add_to_table(Words& tab, const char* f, long s);

public:
  Lem() {};
  // d: name of the dictionary file
  Lem(const char* d)
  : _dict(d) {};

  // The class is used polymorphically (ana() is virtual and AuxLem derives
  // from it), so destruction through a Lem* must reach the derived destructor.
  virtual ~Lem() {}

  // Adds the descriptions of `form` to tab; returns how many were added.
  virtual int ana(const char* form, Words& tab);
  // Adds the descriptions of the next word found in the buffer `form`.
  int pref(char* form, Words& tab);
  // Prints the dictionary contents to stdout.
  void prn_dict();
};
// Lemmatizer backed by a plain-text dictionary ("<form>;<descriptions>" lines)
// that is loaded into a symbol table by the constructor.
class AuxLem : public Lem {
public:
// capacity of the symbol table and of info[]
static const int SIZE=1500000;
// static const int MAXLINE=1000;
// maximum number of entries ana() lets the result table grow to
static const int MAXALT=256;
// filename: the text dictionary to load
AuxLem(const char* filename);
~AuxLem();
// int ana(const char* form, Grams& tab);
// Adds the descriptions of `form` to tab; returns how many were added.
int ana(const char* form, Words& tab);
// operator bool() { return _dict && info; }
private:
// forms; note that this member hides Lem::_dict
SymbolTable _dict;
// descriptions, indexed by the value the symbol table returns for a form;
// the array is part of the object, so every AuxLem holds SIZE pointers
char* info[SIZE];
};

132
app/src/lem/main.cc Normal file
View File

@ -0,0 +1,132 @@
#include "../lib/iotools.h"
#define _CMDLINE_FILE "../lem/cmdline.h"
#include "../common/common.h"
#include "common_lem.h"
#include "lem.h"
#include "cmdline.h"
#include <locale.h>
// Copies src to dst and drops the trailing newline if there is one.
static void copy_without_newline(char* dst, const char* src)
{
  strcpy(dst, src);
  size_t n = strlen(dst);
  if (n > 0 && dst[n-1] == '\n')
    dst[n-1] = '\0';
}

/*
 * lem: reads segments from inputf, looks every selected form up in the
 * dictionary and appends the lemma/description fields to the segment.
 * Segments without a dictionary entry are written to failedf.
 *
 * Output layout of the descriptions:
 *   one_line  - one output line, one field per description;
 *   one_field - one output line with a single field (lemmas separated with
 *               ';', descriptions of one lemma with ',');
 *   otherwise - one output line per description.
 */
int main(int argc, char** argv) {

  // setlocale(LC_CTYPE,""); // why was this here?
  // setlocale(LC_COLLATE,""); //

  gengetopt_args_info args;

  if(cmdline_parser(argc, argv, &args) != 0)
    exit(1);

  process_config_files(&args,argv[0]);
  process_common_options(&args,argv[0]);
  process_lem_options(&args);

  char line[MAX_LINE+1];
  // outline receives the input line plus the generated fields, so it must be
  // able to hold both buffers.
  char outline[2*(MAX_LINE+1)];
  // NOTE(review): desc is filled with unbounded sprintf calls; a very large
  // number of alternatives could still exceed it.
  char desc[MAX_LINE+1];
  long line_count = 0;

  // Choose the dictionary implementation by file extension. Names shorter
  // than the extension are rejected instead of being read out of bounds.
  Lem* lem = NULL;
  size_t dict_len = strlen(dictionary);
  const char* ext = (dict_len >= 4) ? dictionary+dict_len-4 : "";
  if(strcmp(ext,".bin")==0)
    lem = new Lem(dictionary);
  else if(strcmp(ext,".dic")==0)
    lem = new AuxLem(dictionary);
  else
  {
    // Previously execution continued with an uninitialised pointer.
    fprintf(stderr,"lem: Invalid dictionary file extension.\n");
    exit(1);
  }

  Words tab;

  while (fgets(line, MAX_LINE, inputf))
  {
    ++line_count;

    if (!process_seg(line, args)) // should have been guarded earlier
      fputs(line, outputf);
    else
    {
      char form[MAX_FORM];

      tab.clear();
      getfield(line,input_field_prefix,form);

      lem->ana(form, tab);

      if(tab.count()==0)
      {
        // Nothing found: retry with the form converted to lower case.
        char form1[MAX_FORM]; // temporary solution, ana should be changed
        char* p;
        strcpy(form1,form);
        for(p=form1;*p;++p) *p=tolower((unsigned char)*p);
        p=form1;
        lem->ana(p,tab);
      }

      if (tab.count() == 0)
        fputs(line, failedf);
      else
      { // the dictionary has some descriptions
        if(one_line)
        {
          char* descp=desc;
          for (int i=0; i< tab.count(); ++i)
          {
            descp += sprintf(descp," %s%s,%s", output_field_prefix, tab[i].lemma(), tab[i].descr());
          }
          copy_without_newline(outline,line);
          strcat(outline,desc);
          strcat(outline,"\n");
          fputs(outline, outputf);
          if (copy_processed)
            fputs(line,outputf);
        }
        else if(one_field)
        {
          char* descp=desc;
          for (int i=0; i< tab.count(); ++i)
            if(i==0)
              descp += sprintf(descp," %s%s,%s", output_field_prefix, tab[i].lemma(), tab[i].descr());
            else
            {
              if(strcmp(tab[i].lemma(),tab[i-1].lemma())==0)
                descp += sprintf(descp,",%s",tab[i].descr());
              else
                descp += sprintf(descp,";%s,%s",tab[i].lemma(),tab[i].descr());
            }

          copy_without_newline(outline,line);
          strcat(outline,desc);
          strcat(outline,"\n");
          fputs(outline, outputf);
          if (copy_processed)
            fputs(line,outputf);
        }
        else
        {
          for (int i=0; i< tab.count(); ++i)
          {
            // successive descriptions - successive lines
            sprintf(desc, " %s%s,%s\n", output_field_prefix, tab[i].lemma(), tab[i].descr());
            copy_without_newline(outline,line);
            strcat(outline,desc);
            fputs(outline, outputf);
          }
          if (copy_processed)
            fputs(line,outputf);
        }
      }
    }
    if(args.interactive_flag)
      fflush(outputf), fflush(failedf);
  }
  cmdline_parser_free(&args);
  return 0;
}

20
app/src/lib/Makefile Normal file
View File

@ -0,0 +1,20 @@
# Makefile for the shared library objects used by the other components.
PAR=-Wno-deprecated -m32 -O3
PAR2=-c -Wno-deprecated -m32 -O3 -static -fpermissive

LIB_PATH=../lib
COMMON_PATH=../common

# Targets that do not produce a file of the same name.
.PHONY: main clean copy

# Default target: build the objects, then install the Perl library (if asked).
main: auttools.o word.o copy

auttools.o: auttools.h auttools.cc
	$(CXX) $(PAR2) auttools.cc

word.o: word.h word.cc
	$(CXX) $(PAR2) word.cc

# $(RM) is "rm -f", so cleaning an already clean directory does not fail.
clean:
	$(RM) *.o

# Install the perl directory only when UTT_LIB_DIR is defined.
copy:
ifdef UTT_LIB_DIR
	cp -r perl $(UTT_LIB_DIR)/
endif

164
app/src/lib/auttools.cc Normal file
View File

@ -0,0 +1,164 @@
#include "auttools.h"
//#include "/src/cpp-comm/plx/Plx.h"
/*
 * Builds the form f from the base form b and the difference description d.
 *
 * d has one of two shapes (n1, n2 are decimal numbers, s1, s2 are strings):
 *   "<n1><s1>"           - drop the last n1 characters of b, append s1;
 *   "<n1><s1>-<n2><s2>"  - drop the first n1 characters of b and put s1 in
 *                          front, then drop the last n2 characters and
 *                          append s2.
 * s1 and s2 end at the first punctuation character other than '*', or at the
 * end of d. When b is too short for the requested cuts, or a part of d does
 * not fit the internal buffers, f is set to "<ERR>".
 */
void fullform(const char* b, const char* d, char* f)
{
  int i,j=0;
  int n1, n2=0;
  bool g=false;   // true when d has the two-part ("-") shape
  char s1[200], s2[200], temps[200];

  // n1
  while(d[j]>='0' && d[j]<='9')j++;
  if(j>=(int)sizeof(temps)) { strcpy(f,"<ERR>"); return; }
  strncpy(temps,d,j); temps[j]='\0';
  n1=atoi(temps);

  // s1 - the scan must also stop at the end of d
  i=j;
  while(d[j]!='\0' && (!ispunct((unsigned char)d[j]) || d[j]=='*')) j++;
  if(j-i>=(int)sizeof(s1)) { strcpy(f,"<ERR>"); return; }
  strncpy(s1,d+i,j-i);
  s1[j-i]='\0';

  if(d[j]=='-')
  {
    j++;

    // n2 - the terminator belongs at the end of the copied digits (j-i);
    // writing it at j left stale digits from n1 behind.
    i=j;
    while(d[j]>='0' && d[j]<='9')j++;
    if(j-i>=(int)sizeof(temps)) { strcpy(f,"<ERR>"); return; }
    strncpy(temps,d+i,j-i); temps[j-i]='\0';
    n2=atoi(temps);

    // s2
    i=j;
    while(d[j]!='\0' && (!ispunct((unsigned char)d[j]) || d[j]=='*')) j++;
    if(j-i>=(int)sizeof(s2)) { strcpy(f,"<ERR>"); return; }
    strncpy(s2,d+i,j-i);
    s2[j-i]='\0';
    g=true;
  }

  int blen=strlen(b);

  if(g)
  {
    if(n1+n2<=blen)
    {
      strcpy(f,s1);
      strcat(f,b+n1);
      f[strlen(f)-n2]='\0';
      strcat(f,s2);
    }
    else
      strcpy(f,"<ERR>");
  }
  else
  {
    if(n1<=blen)
    {
      strcpy(f,b);
      f[strlen(f)-n1]='\0';
      strcat(f,s1);
    }
    else
      strcpy(f,"<ERR>");
  }
}
/*
 * Writes into `form` the stem with every '*' replaced by `ending`.
 * When the stem contains no '*', the ending is appended at the end instead.
 */
void compose(char* stem, char* ending, char* form)
{
  bool placeholder_seen=false;

  for(; *stem!='\0'; ++stem)
  {
    if(*stem!='*')
    {
      *form++=*stem;
      continue;
    }
    // placeholder: insert the ending here
    strcpy(form,ending);
    form+=strlen(ending);
    placeholder_seen=true;
  }

  if(!placeholder_seen)
  {
    strcpy(form,ending);
    form+=strlen(ending);
  }

  *form='\0';
}
/*
 * Splits the dictionary description `des` of the form `f` into its parts.
 *
 * des has the shape "<diff>,<pos>[/<attr>]":
 *   lemma - fullform(f, <diff>)
 *   pos   - the text after the comma up to the first '/' or ':'
 *   attr  - the text after '/', or empty when no '/' follows pos
 * A description without a comma yields an empty pos and attr.
 */
void autodescr(const char* f, const char* des, char* lemma, char* pos, char* attr)
{
  char lemd[MAXWORDLEN];

  // <diff>: everything before the comma, truncated to the local buffer
  int comma=strcspn(des,",");
  int l=(comma<MAXWORDLEN) ? comma : MAXWORDLEN-1;
  strncpy(lemd,des,l);
  lemd[l]='\0';
  fullform(f,lemd,lemma);

  // Continue after the comma - or at the end of the string when there is no
  // comma (previously the scan started one byte past the terminating NUL).
  int o=(des[comma]==',') ? comma+1 : comma;

  l=strcspn(des+o,"/:");
  strncpy(pos,des+o,l);
  pos[l]='\0';
  o=o+l;

  if(des[o]=='/')
  {
    o++;
    strcpy(attr,des+o);
  }
  else
    attr[0]='\0';
}
// Returns the length of the longest common prefix of s and t.
int common_prefix(const char* s, const char* t)
{
  int n=0;
  for(; s[n]!='\0' && s[n]==t[n]; ++n)
    ;
  return n;
}
/*
 * Describes how to turn s into t around their longest common fragment.
 *
 * A fragment starting at s+ss and t+ts is accepted when it is longer than 4
 * characters, or when it starts at the beginning of both strings and is
 * longer than 1 character; the longest accepted one wins.
 *
 * Output:
 *   frontcut - number of characters of s in front of the common fragment
 *   prefix   - the part of t in front of the common fragment
 *   endcut   - number of characters of s after the common fragment
 *   suffix   - the part of t after the common fragment
 * Returns the length of the common fragment (0 when none was accepted).
 */
int strdiff(const char* s, const char* t,
            int& frontcut, char* prefix, int& endcut, char* suffix)
{
  int slen=strlen(s);
  int tlen=strlen(t);
  int ss, ss_max=0; /* ss - s shift */
  int ts, ts_max=0; /* ts - t shift */
  int common, common_max=0;

  for(ss=0;ss<slen;ss++)
    for(ts=0;ts<tlen;ts++)
      if( (common=common_prefix(s+ss,t+ts))>common_max
          && (common>4 || (ss==0 && ts==0 && common>1)) )
      {
        ss_max=ss;
        ts_max=ts;
        common_max=common;
      }

  // A leftover debugging printf("--%d\n", ts_max) was removed here: it wrote
  // to stdout on every call and so leaked into the output of every caller.

  frontcut=ss_max;
  strncpy(prefix,t,ts_max); prefix[ts_max]='\0';

  endcut=slen-ss_max-common_max;
  strcpy(suffix,t+ts_max+common_max);

  return common_max;
}
// Writes to f the difference description that turns s into t:
// "<endcut><suffix>" when nothing changes at the front,
// "<frontcut><prefix>-<endcut><suffix>" otherwise.
void fprndiff(FILE* f, const char* s, const char* t)
{
  int frontcut,endcut;
  char pref[MAXWORDLEN],suff[MAXWORDLEN];

  strdiff(s,t,frontcut,pref,endcut,suff);

  const bool front_unchanged = (frontcut==0 && pref[0]=='\0');
  if(front_unchanged)
    fprintf(f,"%d%s",endcut,suff);
  else
    fprintf(f,"%d%s-%d%s",frontcut,pref,endcut,suff);
}
// Writes into outstr the difference description that turns s into t:
// "<endcut><suffix>" when nothing changes at the front,
// "<frontcut><prefix>-<endcut><suffix>" otherwise.
void sprndiff(char* outstr, const char* s, const char* t)
{
  int frontcut,endcut;
  char pref[MAXWORDLEN],suff[MAXWORDLEN];

  strdiff(s,t,frontcut,pref,endcut,suff);

  const bool front_unchanged = (frontcut==0 && pref[0]=='\0');
  if(front_unchanged)
    sprintf(outstr,"%d%s",endcut,suff);
  else
    sprintf(outstr,"%d%s-%d%s",frontcut,pref,endcut,suff);
}
// Extracts into `pos` the run of upper-case letters that directly follows
// the first comma of `des`; `pos` is empty when des has no comma.
void despos(const char* des, char* pos)
{
  const char* p=des;
  char* out=pos;

  // find the first comma (or the end of the string)
  while(*p!=',' && *p!='\0') ++p;

  if(*p==',')
    for(++p; isupper(*p); ++p)
      *out++=*p;

  *out='\0';
}

39
app/src/lib/auttools.h Normal file
View File

@ -0,0 +1,39 @@
#ifndef _Auttools_h
#define _Auttools_h
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
/* #define ISALPHAG(c) ((c>='A' && c<='Z') || (c>='a' && c<='z') || \ */
/* c=='¡' || c=='±' || c=='Æ' || c=='æ' || \ */
/* c=='Ê' || c=='ê' || c=='£' || c=='³' || \ */
/* c=='Ñ' || c=='ñ' || c=='Ó' || c=='ó' || \ */
/* c=='¦' || c=='¶' || c=='¬' || c=='¼' || \ */
/* c=='¯' || c=='¿' || c=='*') */
// Size of the local word buffers used in auttools.cc.
#define MAXWORDLEN 64

// Builds the form f from the base form b and the difference description d.
extern void fullform(const char* b, const char* d, // in
                     char* f); // out

// Writes into form the stem with '*' replaced by ending (or with ending
// appended when the stem has no '*').
extern void compose(char* stem, char* ending, // in
                    char* form); // out

// Splits the description des of the form f into lemma, pos and attr.
extern void autodescr(const char* f, const char* des, // in
                      char* lemma, char* pos, char* attr); // out

// Computes the difference between s and t. The parameters are const char*,
// matching the definition in auttools.cc; the previous char* declaration
// named a different overload that is not defined there.
extern int strdiff(const char* s, const char* t, // in
                   int& frontcut, char* prefix, // out
                   int& endcut, char* suffix); // out

// Write the difference description that turns s into t to a file / a string.
extern void fprndiff(FILE* f, const char* s, const char* t);// in
extern void sprndiff(char* outstr, const char* s, const char* t); // in

// Extracts the upper-case run that follows the first comma of des.
extern void despos(const char* des, // in
                   char* pos); // out
#endif

24
app/src/lib/const.h Normal file
View File

@ -0,0 +1,24 @@
// maximum length of a word
#define MAX_FORM 80
// maximum length of a description
#define MAX_DESC 80
// maximum length of a line in the intermediate file
#define MAX_LINE 1024
// field separators in the intermediate file
#define FIELD_SEP " \t\n"
// maximum number of alternative descriptions
#define MAX_ALT 256
// dictionary file for the guesser
#define GUESS_DICT_FILE "slownik.fsa"
// directories with the configuration files
#define SYSTEM_CONFIG_DIR "/usr/local/etc/utt"
#define USER_CONFIG_DIR "~/.utt"

53
app/src/lib/iotools.h Normal file
View File

@ -0,0 +1,53 @@
#include "const.h"
#include <string.h>
#include <stdio.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Splits `line` into fields in place: the strings stay where they are (in
// line), only the pointers are set and terminating zeros are written.
// a, b receive the first two fields converted with atoi (not validated);
// c..f receive pointers to the following fields.
// The return value tells how far parsing got: 0 (no fields) up to 4 when the
// fifth field is missing.
// NOTE(review): a missing sixth field returns 6, the same value as complete
// success, while the preceding steps return 0,1,2,3,4 - 5 may have been
// intended. Check the callers before changing it.
// Uses strtok, so it is not reentrant.
inline
int parsetok(char* line, int* a, int* b, char** c, char** d, char** e, char** f)
{
char* field;
if((field=strtok(line,FIELD_SEP))!=NULL)
*a=atoi(field); // value is not validated
else
return 0;
if((field=strtok(NULL,FIELD_SEP))!=NULL)
*b=atoi(field); // value is not validated
else return 1;
if((*c=strtok(NULL,FIELD_SEP))==NULL) return 2;
if((*d=strtok(NULL,FIELD_SEP))==NULL) return 3;
if((*e=strtok(NULL,FIELD_SEP))==NULL) return 4;
if((*f=strtok(NULL,FIELD_SEP))==NULL) return 6;
return 6;
}
// Scans the fields of `line`; the strings are copied into the caller's
// buffers. e and f are optional: scanning stops before the first one that
// is NULL (previously a NULL e or f was handed to sscanf's %s, which writes
// through it whenever the line has that many fields).
// Returns the number of fields converted, as reported by sscanf.
// NOTE(review): %s is unbounded - the buffers must be large enough for the
// longest field of the line.
inline
int scantok(const char* line, int* a, int* b, char* c, char* d, char* e=NULL, char* f=NULL)
{
  if(e==NULL)
    return sscanf(line," %d %d %s %s", a, b, c, d);
  if(f==NULL)
    return sscanf(line," %d %d %s %s %s", a, b, c, d, e);
  return sscanf(line," %d %d %s %s %s %s", a, b, c, d, e, f);
}
// Formats a six-field segment followed by a back-quoted parameter string
// into `line`. Returns the number of characters written (the sprintf result);
// the function previously fell off its end without returning a value.
inline
int printtok(char* line, int a, int b, char* c, char* d, char* e, char* f, char* parms)
{
  return sprintf(line,"%04d %02d %s %s %s %s `%s\n", a, b, c, d, e, f, parms);
}
// Formats a six-field segment into `line`. Returns the number of characters
// written (the sprintf result); the function previously fell off its end
// without returning a value.
inline
int printtok(char* line, int a, int b, char* c, char* d, char* e, char* f)
{
  return sprintf(line,"%04d %02d %s %s %s %s\n", a, b, c, d, e, f);
}
// Formats a four-field segment into `line`. Returns the number of characters
// written (the sprintf result); the function previously fell off its end
// without returning a value.
inline
int printtok(char* line, int a, int b, char* c, char* d)
{
  return sprintf(line,"%04d %02d %s %s\n", a, b, c, d);
}

86
app/src/lib/matchdescr.cc Normal file
View File

@ -0,0 +1,86 @@
#include <ctype.h>
#include <stdio.h>
// Tests whether two attribute strings are compatible.
// As the code reads them, an attribute is a run of upper-case letters (its
// name) followed by a run of lower-case letters (its values). The strings are
// walked in parallel like two sorted lists: an attribute present in only one
// of them is skipped, and for an attribute present in both the value runs
// must share at least one letter. Returns false as soon as a shared attribute
// has no common value, true when the end of either string is reached.
// NOTE(review): when both strings continue with characters that are neither
// upper- nor lower-case letters no branch advances a or b, so the outer loop
// would not terminate - presumably excluded by the canonical-form
// requirement on the input; confirm.
inline
bool inline_matchattr(const char* a, const char* b)
{
const char *p, *q; // auxiliary pointers
while(*a && *b)
{
p=a; q=b;
while(isupper(*p) && isupper(*q)) // equal prefix
if(*p==*q) ++p, ++q;
else if(*p<*q) // a is smaller
{
// move a to the next attribute
a=p;
while(isupper(*a)) ++a; while(islower(*a)) ++a;
goto end;
}
else
{
// move b to the next attribute
b=q;
while(isupper(*b)) ++b; while(islower(*b)) ++b;
goto end;
}
if(islower(*p) && islower(*q)) // equal attributes
{
a=p; b=q; // advance the pointers, compare the values
while(*a != *b)
{
if(*a > *b && !islower(*++b)) return false;
if(*a < *b && !islower(*++a)) return false;
}
// an equal value was found, move a and b to the next attribute
while(isupper(*a)) ++a; while(islower(*a)) ++a;
while(isupper(*b)) ++b; while(islower(*b)) ++b;
goto end;
}
if(islower(*p)) // a is shorter, hence smaller
{ // move a to the next attribute
a=p;
while(islower(*a)) ++a;
goto end;
}
if(islower(*q)) // b is shorter, hence smaller
{ // move b to the next attribute
b=q;
while(islower(*b)) ++b;
goto end;
}
end: ;
}
return true;
}
// Non-inline entry point declared in matchdescr.h; forwards to
// inline_matchattr().
bool matchattr(const char* a, const char* b)
{
return inline_matchattr(a,b);
}
// Tests whether two descriptions "<CATEGORY>[/<attributes>]" agree.
// The leading upper-case runs are compared first; a description that ends
// right after the common run matches any attributes of the other one.
// When both continue with '/', the attributes are compared.
bool matchdescr(const char* a, const char* b)
{
  // skip the common upper-case prefix
  for(; isupper(*a) && isupper(*b) && *a==*b; ++a, ++b)
    ;

  switch(*a)
  {
    case '\0':
      return *b=='\0' || *b=='/';
    case '/':
      if(*b=='\0') return true;
      if(*b=='/') return inline_matchattr(a+1, b+1);
      return false;
    default:
      return false;
  }
}
// Small test driver: reads pairs of descriptions from stdin and prints the
// result of matchdescr() for each pair.
int main()
{
  char a[100], b[100];
  // %99s keeps each token inside its 100-byte buffer
  while(scanf("%99s %99s", a, b)==2)
    printf("%s & %s = %d\n", a, b, matchdescr(a,b));
  return 0;
}

10
app/src/lib/matchdescr.h Normal file
View File

@ -0,0 +1,10 @@
// both functions require the descriptions to be in canonical form
// only short (one-letter) attributes are supported
// tests whether the descriptions agree
bool matchdescr(const char* a, const char* b);
// tests whether the attributes alone agree (i.e. the part after the slash)
bool matchattr(const char* a, const char* b);

171
app/src/lib/symtab.cc Normal file
View File

@ -0,0 +1,171 @@
#include "symtab.h"
#include <values.h>
#include <stdio.h>
#include <alloc.h>
#include <stdlib.h>
//---------------------------------------------------------------------------
// Creates a table for at most n symbols that uses the hash function h.
// The hash array gets first(n) slots. When filename is given, the symbols
// are read from that file.
SymbolTable::SymbolTable(int n, int (*h)(const char*,int), const char* filename)
: _mx(n), _cnt(0), hash(h)
{
  _sz=first(n);

  _key=new char*[_sz];
  _defind=new int[_sz];
  _hashind=new int[_sz];
  _def=new char*[_mx];

  // mark every hash slot as free
  for(int slot=_sz-1; slot>=0; --slot)
    _key[slot]=NULL;

  if(filename!=NULL)
    add_from_file(filename);
}
//---------------------------------------------------------------------------
// Creates an empty table for at most n symbols using hash1 as the hash
// function. The number of hash slots is chosen by first(n). When a file
// name is given, its tokens are added right away (the result of
// add_from_file is not checked here).
SymbolTable::SymbolTable(int n, const char* filename)
  : _mx(n), _cnt(0), hash(hash1)
{
  _sz=first(n);
  _key=new char*[_sz];
  _defind=new int[_sz];
  _hashind=new int[_sz];
  _def=new char*[_mx];

  // Mark every slot as unused.
  for(int slot=0; slot!=_sz && slot<_sz; ++slot)
    _key[slot]=NULL;

  if(filename)
  {
    add_from_file(filename);
  }
}
//---------------------------------------------------------------------------
// Releases the stored key strings (via clear) and then the four index
// arrays allocated by the constructors.
SymbolTable::~SymbolTable()
{
  clear();
  delete[] _def;
  delete[] _hashind;
  delete[] _defind;
  delete[] _key;
}
//---------------------------------------------------------------------------
void SymbolTable::clear()
{
for(int i=0; i<_sz; ++i)
if(_key[i])
free(_key[i]);
}
//---------------------------------------------------------------------------
// Adds every whitespace-separated token of the given file to the table.
//
// Returns false as soon as a token is MAXKEYLEN characters or longer, or
// when add() refuses a token (table full); returns true otherwise.
// NOTE: a file that cannot be opened also yields true, as before, so
// callers relying on that are not affected.
bool SymbolTable::add_from_file(const char* filename)
{
  FILE* in=fopen(filename,"r");
  if(!in)
    return true;

  // Build "%<MAXKEYLEN>s" so that fscanf can never write past buf.
  // An over-long token is then read as exactly MAXKEYLEN characters and
  // rejected by the length test below.
  char fmt[32];
  sprintf(fmt,"%%%us",(unsigned int)MAXKEYLEN);

  char buf[MAXKEYLEN+1];
  bool ok=true;
  while(ok && fscanf(in,fmt,buf)==1)
    if(strlen(buf)==MAXKEYLEN || add(buf)<0)
      ok=false;

  fclose(in);
  return ok;
}
//---------------------------------------------------------------------------
// Adds symbol s to the table.
//
// Returns the number of the symbol: the existing number when s is already
// present, otherwise the newly assigned one (symbols are numbered in
// insertion order starting from 0). Returns -1 when the table already holds
// _mx symbols or the key cannot be duplicated.
int SymbolTable::add(const char* s)
{
  if(_cnt>=_mx)
    return -1;

  // Linear probing from the hash position until the key or a free slot
  // is found.
  int ind=hash(s,_sz);
  while(_key[ind])
  {
    if(strcmp(_key[ind],s)==0)
      return _defind[ind];
    ind=(ind+1)%_sz;   // was `ind=++ind%_sz`, which modifies ind twice
  }

  char* copy=strdup(s);
  if(!copy)
    return -1;

  _key[ind]=copy;
  _defind[ind]=_cnt;
  _hashind[_cnt]=ind;
  _def[_cnt]=copy;
  return _cnt++;
}
//---------------------------------------------------------------------------
// Looks up symbol s.
//
// Returns the number assigned to s by add(), or -1 when s is not in the
// table. Probing starts at the hash position and stops at the first
// unused slot.
int SymbolTable::operator[](const char* s)
{
  int ind=hash(s,_sz);
  while(_key[ind])
  {
    if(strcmp(_key[ind],s)==0)
      return _defind[ind];
    ind=(ind+1)%_sz;   // was `ind=++ind % _sz`, which modifies ind twice
  }
  return -1;
}
//---------------------------------------------------------------------------
// Picks the number of hash slots for a table of n symbols.
//
// Returns the smallest integer greater than n that has no divisor in the
// range [2, limit), where limit is the smaller of n/2 and MAXKEYLEN.
// Returns -1 if the search reaches MAXINT.
int SymbolTable::first(unsigned int n)
{
  int candidate=n;
  const int limit=(n/2 < MAXKEYLEN)? n/2 : MAXKEYLEN;
  for(;;)
  {
    if(candidate == MAXINT)
      return -1;
    ++candidate;

    bool divisible=false;
    for(int d=2; d<limit && !divisible; ++d)
      divisible=(candidate%d==0);

    if(!divisible)
      return candidate;
  }
}
// Average number of slots inspected to find a stored key.
//
// For every occupied slot the distance from the key's hash position to the
// slot (wrapping around the table) plus one is summed; the sum is divided
// by the number of symbols. Returns 0 for an empty table.
float SymbolTable::search_rate()
{
  long probes=0;
  int slot=0;
  while(slot<_sz)
  {
    if(_key[slot])
    {
      const int home=hash(_key[slot],_sz);
      probes+=(slot+_sz-home)%_sz+1;
    }
    ++slot;
  }
  if(!_cnt)
    return 0;
  return (float)probes/(float)_cnt;
}
//---------------------------------------------------------------------------
// Default hash function: maps string s to a slot in [0, _sz).
//
// For strings of four or more characters the hash combines the int-sized
// group of bytes starting at position l/2-2 with the product of the first
// and the last character; shorter strings use their own bytes (zero
// padded) instead of the middle group. The empty string hashes to 0.
// The value depends on the byte order of the machine.
int hash1(const char* s, int _sz)
{
  const int l=strlen(s);
  if(l==0)
    return 0;          // avoids reading s[-1]; the old formula also gave 0

  // memcpy instead of dereferencing a casted pointer: the bytes need not be
  // aligned for int access.
  int word=0;
  if(l>=4)
    memcpy(&word,s+(l/2-2),sizeof(int));
  else
    memcpy(&word,s,l);

  return abs((word+(int)(*s * s[l-1])) % _sz);
}
//---------------------------------------------------------------------------
// Alternative hash function: maps string s to a slot in [0, _sz).
//
// For strings of six or more characters three int-sized byte groups (start,
// middle, end) are summed as unsigned values; shorter strings combine their
// first bytes (zero padded) with the product of the first and the last
// character. The empty string hashes to 0.
// The value depends on the byte order of the machine.
int hash2(const char* s, int _sz)
{
  const int l=strlen(s);
  if(l==0)
    return 0;          // avoids reading s[-1]; the old formula also gave 0

  if(l>=6)
  {
    unsigned int i1,i2,i3;
    memcpy(&i1,s,sizeof(int));
    memcpy(&i2,s+(l/2-2),sizeof(int));
    memcpy(&i3,s+(l-4),sizeof(int));
    // The remainder of an unsigned division is already non-negative and
    // smaller than _sz; calling abs() on an unsigned value is ambiguous
    // in C++.
    return (int)((i1+i2+i3) % _sz);
  }

  int i=0;
  strncpy((char*)&i,s,sizeof(int));
  return abs((i+(int)(*s * s[l-1])) % _sz);
}
//---------------------------------------------------------------------------

52
app/src/lib/symtab.h Normal file
View File

@ -0,0 +1,52 @@
#ifndef _HashTable_h
#define _HashTable_h
//---------------------------------------------------------------------------
#include <stddef.h>
#include <string.h>
//---------------------------------------------------------------------------
int hash1(const char* s, int sz);
int hash2(const char* s, int sz);
//---------------------------------------------------------------------------
// Fixed-capacity table of distinct strings (symbols).
// Symbols are stored in an open-addressing hash table with linear probing
// and are numbered in insertion order starting from 0.
class SymbolTable
{
// maximum number of symbols the table accepts
int _mx;
// number of hash slots (chosen by first())
int _sz;
// number of symbols currently stored
int _cnt;
// _key[slot]: key string stored in the slot, NULL if the slot is unused
char** _key;
// _def[n]: string of symbol number n (points to the same string as _key)
char** _def;
// _defind[slot]: number of the symbol stored in the slot
int* _defind;
// _hashind[n]: slot in which symbol number n is stored
int* _hashind; // there is some redundancy here
public:
// Upper bound on key length accepted by add_from_file.
static const unsigned int MAXKEYLEN=2000;
// Creates a table for n symbols with hash function h, optionally filled from a file.
SymbolTable(int n, int (*h)(const char*,int), const char* filename=NULL);
// Creates a table for n symbols with hash function hash1, optionally filled from a file.
SymbolTable(int n, const char* filename=NULL);
~SymbolTable();
// Frees the stored key strings.
void clear();
// Hash function in use: maps (string, number of slots) to a slot index.
int (*hash)(const char*, int);
// Adds all whitespace-separated tokens of the file; false on an over-long token or a full table.
bool add_from_file(const char* filename);
// Adds s; returns its symbol number, or -1 if the table is full.
int add(const char* s);
// Symbol number of s, or -1 if s is not in the table.
int operator[](const char* s);
// String of symbol number i, or NULL if i is out of range.
const char* operator[](int i){if(i<0||i>=_cnt)return NULL;else return _def[i];}
// Same as operator[](const char*).
int index(const char* s) { return this->operator[](s); };
// Returns i if it is a valid symbol number, -1 otherwise.
int index(int i) { if(i<0||i>=_cnt) return -1; else return i; };
// Slot in which symbol number i is stored; i is not range-checked.
int hash_index(int i) { return _hashind[i]; }
// String of symbol number i, or NULL if i is out of range.
const char* symbol(int i) { if(i<0||i>=_cnt)return NULL; else return _def[i];}
// Maximum number of symbols.
int capacity() { return _mx; }
// Number of hash slots.
int size() { return _sz; }
// Number of symbols currently stored.
int count() { return _cnt; }
// Average number of slots inspected per stored key.
float search_rate();
private:
// Computes the number of hash slots for a table of n symbols.
static int first(unsigned int n);
};
//---------------------------------------------------------------------------
#endif

879
app/src/lib/tft.h Executable file
View File

@ -0,0 +1,879 @@
#ifndef _TFT_h
#define _TFT_h
//---------------------------------------------------------------------------
#include <stddef.h>
#include <iostream.h>
#include <typeinfo>
#include <string.h>
#include <stdio.h>
//#include "top.h"
#include "ttrans.h"
//---------------------------------------------------------------------------
/// Base class of a finite-state transducer.
/**
    \remark What is this class for? Whatever concerns transitions alone
    should be moved to TTrans, and the rest merged into TFT.
*/
class FT
{
public:
  /// Creates an empty transducer description: no transitions, no states,
  /// empty type names, output mode OO, copy_default off.
  FT()
    : copy_default(false), print_mode(OO), ttn(0), states(0), transitions(0)
  {
    // Keep the type names valid C strings even before they are filled in.
    itype[0]='\0';
    otype[0]='\0';
  }

  /// Print (output) mode.
  enum OUTPUT { II,    ///< input symbols only
                OO,    ///< output symbols only
                IOIO,  ///< output symbol after the input symbol
                OIOI,  ///< output symbol before the input symbol
                IIOO,  ///< whole input, then whole output
                OOII   ///< whole output, then whole input
  };

  /// Maximum path length.
  static const unsigned int ftMAXPATH=500;

  /// Maximum length of the description of the input/output symbol type.
  /**
      \remark Move to TTrans.
  */
  static const unsigned int ftTYPELEN=32;

  /// Special symbol standing for the value 'epsilon'.
  /**
      \remark Move to TTrans.
  */
  static const char ftEPSILON='~';

  /// Special symbol standing for the value 'default'.
  /**
      \remark Move to TTrans.
  */
  static const char ftDEFAULT='@';

  /// Default output symbol (true - '@', false - '~').
  /**
      \remark Move to TTrans(???)
  */
  bool copy_default;

  /// Output mode.
  OUTPUT print_mode;

  /// false if the automaton has no transitions.
  operator bool() { return (bool)ttn; };

  virtual const char* intype() { return itype; };
  virtual const char* outtype() { return otype; };

protected:
  /// Number of elements of the tt array.
  unsigned long ttn;

  /// Number of states.
  unsigned long states;

  /// Number of transitions.
  unsigned long transitions;

  /// Type of the input symbols (as a string).
  /**
      \remark Move to TTrans(???)
  */
  char itype[ftTYPELEN];

  /// Type of the output symbols (as a string).
  /**
      \remark Move to TTrans(???)
  */
  char otype[ftTYPELEN];
};
//---------------------------------------------------------------------------
/// Template of a finite-state transducer.
/**
\param I - type of the input symbol
\param Ipass - type to be used when passing an input symbol as a parameter
to a function (method), equal to \a I or \a I&
\param O - type of the output symbol
\param Opass - type to be used when passing an output symbol as a parameter
to a function (method), equal to \a O or \a O&
\param TT - type of a transition, must be a subclass of TTrans
*/
template<class I, class Ipass, class O, class Opass, class TT>
class TFT : public FT
{
public:
TFT() : FT(), tt(NULL) { setiotypes(); };
/**
\name Level 1 methods
Transition level.
*/
//@{
/// Test whether transition \a t accepts symbol \a in.
bool accepts(long t, Ipass in) const;
/// Test whether the transition list of the current state continues after \a t.
bool continued(long t) const;
/// State to which transition \a t leads.
/**
\pre !empty(t)
*/
long next(long t) const;
/// Input symbol of transition \a t.
Ipass input(long t) const;
/// Output symbol of transition \a t.
Opass output(long t) const;
/// Returns \c true if the input symbol of transition \a t is epsilon.
bool epsi(long t) const;
/// Returns \c true if the input symbol of transition \a t is the default symbol.
bool defi(long t) const;
/// Returns \c true if the output symbol of transition \a t is epsilon.
bool epso(long t) const;
/// Returns \c true if the output symbol of transition \a t is the default symbol.
bool defo(long t) const;
/// Index of the transition through \a in.
long tra(long t, Ipass in) const;
/// Index of the transition through \a in - non-deterministic.
long tra_nd(long t, Ipass in, long nth) const;
//@}
/**
\name Level 2
State level. A state (state index) = the index of its first transition.
*/
//@{
/// Returns \c true if state \a s is empty (has no outgoing transitions).
bool empty(long s) const { return tt[s].empty(); }
/// Returns \c true if state \a s is a final state.
bool final(long s) const { return tt[s].final(); }
/// State reached from \a t through symbol \a in, or -1.
long next(long t, Ipass in) const;
//long trans(const I* si, I* so, long& olen) const;
/// Generalized transition function over a sequence of input symbols.
long gtra(long s, const I* w, long maxpath=ftMAXPATH) const;
//@}
/**
\name Level 3
Level ...
*/
//@{
long cont(long s=-1, I* c=NULL) const;
long match(const I* w=NULL, long* p=NULL) const;
long match_nd(const I* w=NULL, long* p=NULL) const;
long lgstmatch(const I* w, long* p, long& plen, long maxpath=ftMAXPATH) const;
/* NEW */
long lgstpath(I*& buf, long*& path, long start=0) const;
long pref(I*& buf, I sep, long start=0) const;
//@}
protected:
TT* tt; // transition table
long prn(const I* si, long* p, O* so) const;
void prntt(ostream& os);
void sort();
void setiotypes(); // DOES NOT WORK (why???)
// friend ostream& operator<<(ostream&,const CDFA&);
// friend istream& operator>>(istream&,CDFA&);
private:
// One output routine per print mode; each is dispatched to from prn().
long prn_oo(const I* si, long* p, O* so) const;
long prn_ioio(const I* si, long* p, O* so) const;
long prn_oioi(const I* si, long* p, O* so) const;
long prn_iioo(const I* si, long* p, O* so) const;
long prn_ooii(const I* si, long* p, O* so) const;
};
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
/**
state = the index of its first transition
state(t) = the state to which transition t belongs
null symbol = a symbol s for which (bool)s yields \c false;
for characters this is '\0'
*/
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
// The nine accessors below are thin forwarders: each one returns the result
// of the same-purpose method of the transition object tt[t]. None of them
// checks that t is a valid index into tt.

/// Test whether transition \a t accepts input symbol \a in.
template <class I, class Ipass, class O, class Opass, class TT>
inline
bool TFT<I,Ipass,O,Opass,TT>::accepts(long t, Ipass in) const
{ return tt[t].accepts(in); }
/// Test whether the transition list continues after \a t.
template <class I, class Ipass, class O, class Opass, class TT>
inline
bool TFT<I,Ipass,O,Opass,TT>::continued(long t) const
{ return tt[t].continued(); }
/// State to which transition \a t leads.
/**
\pre !empty(t)
*/
template <class I, class Ipass, class O, class Opass, class TT>
inline
long TFT<I,Ipass,O,Opass,TT>::next(long t) const
{ return tt[t].next(); }
/// Input symbol of transition \a t.
template <class I, class Ipass, class O, class Opass, class TT>
inline
Ipass TFT<I,Ipass,O,Opass,TT>::input(long t) const
{ return tt[t].in(); }
/// Output symbol of transition \a t.
template <class I, class Ipass, class O, class Opass, class TT>
inline
Opass TFT<I,Ipass,O,Opass,TT>::output(long t) const
{ return tt[t].out(); }
/// \c true if the input symbol of transition \a t is epsilon.
template <class I, class Ipass, class O, class Opass, class TT>
inline
bool TFT<I,Ipass,O,Opass,TT>::epsi(long t) const
{ return tt[t].epsi(); }
/// \c true if the input symbol of transition \a t is the default symbol.
template <class I, class Ipass, class O, class Opass, class TT>
inline
bool TFT<I,Ipass,O,Opass,TT>::defi(long t) const
{ return tt[t].defi(); }
/// \c true if the output symbol of transition \a t is epsilon.
template <class I, class Ipass, class O, class Opass, class TT>
inline
bool TFT<I,Ipass,O,Opass,TT>::epso(long t) const
{ return tt[t].epso(); }
/// \c true if the output symbol of transition \a t is the default symbol.
template <class I, class Ipass, class O, class Opass, class TT>
inline
bool TFT<I,Ipass,O,Opass,TT>::defo(long t) const
{ return tt[t].defo(); }
/**
\param +t - transition index
\param +in - input symbol
\return Index of a transition (>=\a t) of the current state that accepts
input symbol \a in, or -1 if there is no such transition
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>::tra(long t, Ipass in) const
{
  if(t<0) return -1;
  if(t>=ttn) return -1;
  if(empty(t)) return -1;

  // Walk the transition list of the state until one accepts the symbol.
  for(;;)
  {
    if(accepts(t,in))
      return t;
    if(!continued(t))
      return -1;
    t++;
  }
}
//---------------------------------------------------------------------------
/// Transition index - version for a non-deterministic automaton.
/**
\param +t - transition index
\param +in - input symbol
\param +nth - zero to start the search at \a t itself, non-zero to start
at the transition following \a t
\return Index t1 of a transition of the current state that accepts input
symbol \a in, or -1 if there is no such transition.
If nth==0, t1>=t, otherwise t1>t.
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>::tra_nd(long t, Ipass in, long nth) const
{
  if(t<0 || t>=ttn)
    return -1;

  if(nth)
  {
    // Resume after t: only possible if the list goes on.
    if(!continued(t))
      return -1;
    t++;
  }
  else if(empty(t))
    return -1;

  for(;;)
  {
    if(accepts(t,in))
      return t;
    if(!continued(t))
      return -1;
    t++;
  }
}
//}
//---------------------------------------------------------------------------
//----------------------------------------------------------------------------
/// Transition function.
/**
\param t - state
\param in - input symbol
\return The state that can be reached from \a t under symbol \a in,
or -1 if there is no transition through \a in
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>::next(long t, Ipass in) const
{
  // tra() performs the range/emptiness checks and the scan of the
  // transition list; only the target state remains to be looked up.
  const long accepting=tra(t,in);
  if(accepting<0)
    return -1;
  return next(accepting);
}
//---------------------------------------------------------------------------
//----------------------------------------------------------------------------
/// Generalized transition function.
/**
\param +s - state
\param +w - pointer to the first element of a sequence of input symbols
terminated by the null symbol
\param maxpath maximum path length, ftMAXPATH by default
\return the state reachable from \a s under \a w (epsilon transitions may
appear on the path), or -1 if there is none
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>::gtra(long s, const I* w, long maxpath) const
{
  if(s<0 || (unsigned long)s>=ttn)
    return -1;

  for(long steps=0; *w; ++steps)
  {
    if(steps>maxpath || empty(s))
      return -1;

    // Find a transition of the current state that accepts *w.
    for(;;)
    {
      if(accepts(s,*w))
        break;
      if(!continued(s))
        return -1;
      s++;
    }

    // An epsilon transition does not consume the input symbol.
    if(!epsi(s))
      w++;
    s=next(s);
  }
  return s;
}
//----------------------------------------------------------------------------
/// Continuation.
/**
Enumerates, one per call, the paths that lead from a state to a final
state. The search position is kept in static variables between calls, so
the function is neither reentrant nor thread-safe.
\param +s state; if -1, the next solution of the previous search is sought
\param -c sequence of input symbols from the path leading from \a s to
a final state (zero-terminated); must have room for ftMAXPATH symbols
\return length of the sequence \a c (= length of the path)
\remark WORKS ONLY FOR CHARACTERS!!!
EPSILON TRANSITIONS ARE NOT ALLOWED!!!
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>::cont(long s, I* c) const
{
  static unsigned long path[ftMAXPATH]={0};
  static unsigned long i=0;
  static bool more=false;
  bool found=false;

  if(s!=-1)
  {
    if(s<0 || (unsigned long)s>=ttn)
      more=false;
    else
    {
      i=0;
      c[0]=0;
      path[0]=s;
      more=true;
      if(final(s))
        found=true;
    }
  }

  while(more && !found)
  {
    // Descend only while path[i+1] is still inside the path array
    // (the old test `i<ftMAXPATH` allowed a write to path[ftMAXPATH]).
    if(i+1<ftMAXPATH && !empty(path[i]))
    {
      path[i+1]=next(path[i]);
      c[i]=input(path[i]);
      i++;
    }
    else
    {
      // Backtrack to the closest level whose transition list continues.
      do
      {
        if(i>0)
          c[--i]=0;
        else
          more=false;
      }while(more && !continued(path[i]));

      // Search exhausted: do not step to (and inspect) a transition past
      // the start state's list, which may lie outside tt.
      if(!more)
        break;
      path[i]=path[i]+1;
    }
    if(final(path[i]))
    {
      found=true;
      c[i]=0;
    }
  }
  return i;
}
//----------------------------------------------------------------------------
/// Matching.
/**
\remark Not implemented. Always returns -1 (the value the other matching
functions use for "no match") instead of falling off the end of a
non-void function.
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>::match(const I* w, long* p) const
{
  (void)w;
  (void)p;
  return -1;
}
//----------------------------------------------------------------------------
/// Non-deterministic matching.
/**
Backtracking search for a path from state 0 to a final state. The search
position is kept in static variables between calls, so the function is
neither reentrant nor thread-safe.
\param +w - pointer to the first element of a sequence of input symbols
terminated by the null symbol; if NULL, the next solution of the previous
search is sought
\param -p sequence of transitions terminated by -1; must have room for
ftMAXPATH+1 elements
\return length of the match (number of input symbols consumed), or -1 if
there is no (further) match
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>::match_nd(const I* w, long* p) const
{
  static bool more=false;
  static const I *w0, *wc;          // start / current position in the input
  static long s=0, *p0, *pc, *pc_bound;

  if(w)
  {
    wc=w0=w;
    pc=p0=p;
    more=true;
    pc_bound=pc+ftMAXPATH;
    if(final(s=0))
    {
      *pc=-1; return 0;
    }
  }

  while(more)
  {
    // Try to extend the path with the first transition accepting *wc.
    if(*wc && pc<pc_bound && (*pc=tra_nd(s,*wc,0))>=0)
    { if(!epsi(*pc)) wc++; s=next(*pc); pc++; }
    else
      // Dead end: back up until some earlier step has an alternative.
      while(true)
      {
        if(pc==p0) { more=false; return -1; }
        if(!epsi(*(--pc))) wc--;
        if((*pc=tra_nd(*pc,*wc,1))>=0)
        { if(!epsi(*pc)) wc++; s=next(*pc); pc++; break; }
      }
    if(final(s)) { *pc=-1; return wc-w0; }
  }
  return -1;
}
//----------------------------------------------------------------------------
/// Longest match.
/**
Follows transitions from state 0 for as long as the input allows and
remembers the last point at which a final state was reached.
\param +w pointer to the first element of the sequence of input symbols
\param -p path (transition indices, terminated by -1 after the last
transition followed); must have room for maxpath+1 elements
\param -plen length of the path up to the last final state reached
(0 if no final state was reached)
\param +maxpath maximum path length, FT::ftMAXPATH by default
\return length of the consumed input
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>
::lgstmatch(const I* w, long* p, long& plen, long maxpath) const
{
  long s=0;
  long t;
  long i=0;
  const I* w0=w;     // was `const char*`, which only compiles for I==char
  long ilen=0;
  plen=0;            // defined result even when nothing matches

  while(*w && i<maxpath && (t=tra(s,*w))>=0)
  {
    if(!epsi(t)) w++;
    s=next(t);
    i++;
    *(p++)=t;
    if(final(s)) { plen=i; ilen=w-w0; }
  }
  *p=-1;
  return ilen;
}
//----------------------------------------------------------------------------
/// Longest path.
/**
\param +buf pointer to the first element of the sequence of input symbols
\param -buf position one past the consumed prefix
\param +path pointer to the first element of the transition vector
(room for FT::ftMAXPATH elements)
\param -path pointer one past the last transition
\param +start state to start from, 0 by default
\return length of the consumed prefix (WHY? THE PATH LENGTH WOULD BE BETTER)
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>
::lgstpath(I*& buf, long*& path, long start) const
{
  long s=start;
  long t;
  const I* buf0=buf;   // was `const char*`, which only compiles for I==char
  const long* pathlimit=path+FT::ftMAXPATH;

  while(*buf && path<pathlimit && (t=tra(s,*buf))>=0)
  {
    if(!epsi(t)) buf++;
    s=next(t);
    *(path++)=t;
  }
  return buf-buf0;
}
//----------------------------------------------------------------------------
/// Longest prefix.
/**
\param +buf pointer to the first element of the sequence of input symbols
\param -buf position one past the consumed prefix
\param +sep separator
\param +start state from which the path is followed, 0 by default
\return state after passing through \a sep, or -1
\remark Works only for deterministic, minimal, epsilon-free automata,
where path length == match length.
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>
::pref(I*& buf, I sep, long start) const
{
// Transitions of the longest path found by lgstpath; static, so the
// function is not reentrant.
static long pathtab[ftMAXPATH];
// static long* path=pathtab;
// NOTE(review): path is re-initialised to pathtab on every call, so on a
// follow-up call (*buf is the null symbol) the `--path` below yields
// pathtab-1 and the loop ends at once with -1. The commented-out static
// declaration above suggests the position was once meant to persist
// between calls - confirm the intended behaviour.
long* path=pathtab;
static bool more;
long s;
if(*buf) // first call
{
// Follow the longest path; give up if no input symbol was consumed.
if(!lgstpath(buf,path,start))
return -1;
// lgstpath leaves path one past the last transition; step back onto it.
--path;
more=true;
}
else // subsequent call
--buf,--path;
// Walk the path backwards until a position is found from which the
// separator can be read; shorten the prefix by one symbol at each step.
while(more)
if(path>=pathtab)
if((s=next(next(*path),sep))>=0) {
return s;
}
else
--buf, --path;
else
{
more=false;
return -1;
}
return -1;
}
//----------------------------------------------------------------------------
/*
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>::trans(const I* si, O* so, long& olen) const
{
long p[ftMAXPATH];
long ilen;
long plen;
if((ilen=lgstmatch(si,p,plen))>0)
olen=prn(si,p,so);
else
ilen=olen=0;
return ilen;
}
*/
//----------------------------------------------------------------------------
/// Writes the output for path \a p according to the current print mode.
/**
\param si input symbols consumed along the path
\param p path: transition indices terminated by a negative value
\param so buffer receiving the produced symbols
\return number of symbols written to \a so; 0 for print modes that have
no output routine (II and any unknown value)
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>::prn(const I* si, long* p, O* so) const
{
  switch(print_mode)
  {
    case OO: return prn_oo(si,p,so);
    case IOIO: return prn_ioio(si,p,so);
    case OIOI: return prn_oioi(si,p,so);
    case IIOO: return prn_iioo(si,p,so);
    case OOII: return prn_ooii(si,p,so);
    case II:   // no routine exists for this mode
    default:
      // Previously control fell off the end of the function here.
      return 0;
  }
}
//----------------------------------------------------------------------------
/// Print mode OO: writes only the output symbols of path \a p.
/**
For every transition whose output is not epsilon, the transition's output
symbol is written, or the current input symbol if the output is 'default'.
\param si input symbols consumed along the path
\param p path: transition indices terminated by a negative value
\param so buffer receiving the produced symbols (not terminated)
\return number of symbols written to \a so
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>::prn_oo(const I* si, long* p, O* so) const
{
  O* const so0=so;   // was `char*`, which only compiles for O==char
  for(; *p>=0; p++)
  {
    const long t=*p;
    if(!epso(t))
    {
      if(defo(t))
        *(so++)=*si;
      else
        *(so++)=output(t);
    }
    // Epsilon-input transitions do not consume an input symbol.
    if(!epsi(t)) si++;
  }
  return so-so0;
}
//----------------------------------------------------------------------------
/// Print mode IOIO: for each transition writes the input symbol, then the
/// output symbol.
/**
Input symbols are written for non-epsilon-input transitions; output symbols
for non-epsilon-output transitions ('default' output copies the current
input symbol).
\param si input symbols consumed along the path
\param p path: transition indices terminated by a negative value
\param so buffer receiving the produced symbols (not terminated)
\return number of symbols written to \a so
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>::prn_ioio(const I* si, long* p, O* so) const
{
  O* const so0=so;   // was `char*`, which only compiles for O==char
  for(; *p>=0; p++)
  {
    const long t=*p;
    if(!epsi(t))
      *(so++)=*si;
    if(!epso(t))
    {
      if(defo(t))
        *(so++)=*si;
      else
        *(so++)=output(t);
    }
    if(!epsi(t)) si++;
  }
  return so-so0;
}
//----------------------------------------------------------------------------
/// Print mode OIOI: for each transition writes the output symbol, then the
/// input symbol.
/**
Output symbols are written for non-epsilon-output transitions ('default'
output copies the current input symbol); input symbols for
non-epsilon-input transitions.
\param si input symbols consumed along the path
\param p path: transition indices terminated by a negative value
\param so buffer receiving the produced symbols (not terminated)
\return number of symbols written to \a so
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>::prn_oioi(const I* si, long* p, O* so) const
{
  O* const so0=so;   // was `char*`, which only compiles for O==char
  for(; *p>=0; p++)
  {
    const long t=*p;
    if(!epso(t))
    {
      if(defo(t))
        *(so++)=*si;
      else
        *(so++)=output(t);
    }
    if(!epsi(t))
      *(so++)=*(si++);
  }
  return so-so0;
}
//----------------------------------------------------------------------------
/// Print mode IIOO: writes the whole input first, then the whole output.
/**
The path is traversed twice: the first pass copies the consumed input
symbols, the second pass writes the output symbols ('default' output
copies the corresponding input symbol).
\param si input symbols consumed along the path
\param p path: transition indices terminated by a negative value
\param so buffer receiving the produced symbols (not terminated)
\return number of symbols written to \a so
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>::prn_iioo(const I* si, long* p, O* so) const
{
  // Were `const char*` / `char*`, which only compile for I==O==char.
  const I* const si0=si;
  long* const p0=p;
  O* const so0=so;

  // Pass 1: input symbols.
  for(; *p>=0; p++)
    if(!epsi(*p))
      *(so++)=*(si++);

  // Pass 2: output symbols.
  si=si0;
  for(p=p0; *p>=0; p++)
  {
    const long t=*p;
    if(!epso(t))
    {
      if(defo(t))
        *(so++)=*si;
      else
        *(so++)=output(t);
    }
    if(!epsi(t)) si++;
  }
  return so-so0;
}
//----------------------------------------------------------------------------
/// Print mode OOII: writes the whole output first, then the whole input.
/**
The path is traversed twice: the first pass writes the output symbols
('default' output copies the corresponding input symbol), the second pass
copies the consumed input symbols.
\param si input symbols consumed along the path
\param p path: transition indices terminated by a negative value
\param so buffer receiving the produced symbols (not terminated)
\return number of symbols written to \a so
*/
template <class I, class Ipass, class O, class Opass, class TT>
long TFT<I,Ipass,O,Opass,TT>::prn_ooii(const I* si, long* p, O* so) const
{
  // Were `const char*` / `char*`, which only compile for I==O==char.
  const I* const si0=si;
  long* const p0=p;
  O* const so0=so;

  // Pass 1: output symbols.
  for(; *p>=0; p++)
  {
    const long t=*p;
    if(!epso(t))
    {
      if(defo(t))
        *(so++)=*si;
      else
        *(so++)=output(t);
    }
    if(!epsi(t)) si++;
  }

  // Pass 2: input symbols.
  si=si0;
  for(p=p0; *p>=0; p++)
    if(!epsi(*p))
      *(so++)=*(si++);

  return so-so0;
}
//---------------------------------------------------------------------------
/// Reorders the transitions of every state so that special ones come last.
/**
Within the transition list of each state, the epsilon-input transition (if
any) is moved to the last position and the default-input transition (if
any) to the position just before it (or to the last position when there is
no epsilon transition). The relative order of the other transitions is
kept and the 'continued' flags are rebuilt afterwards.

Fixes over the previous version:
 - elements were shifted with memmove using an element count as the byte
   count and in the wrong direction; they are now shifted by assignment;
 - the index of the default transition was decremented even when the state
   had no epsilon transition;
 - the 'final' flag is read from the first slot of a state (see final()),
   so it is now kept on the first slot when the first transition moves.
   NOTE(review): assumes the flag is meaningful on the first slot only -
   confirm against TTrans.
*/
template<class I, class Ipass, class O, class Opass, class TT>
void TFT<I,Ipass,O,Opass,TT>::sort()
{
  long t=0;
  while((unsigned long)t<ttn)
  {
    const long t0=t;     // first slot of the current state
    long tn=1;           // number of slots of the current state
    while(continued(t++)) tn++;
    if(tn<=1)
      continue;

    const bool state_final=final(t0);

    // Locate the special transitions (position relative to t0).
    long eps=-1;
    long def=-1;
    for(long i=0; i<tn; i++)
      if(defi(t0+i))
      {
        if(epsi(t0+i)) eps=i; else def=i;
      }

    // Move the epsilon transition to the last slot.
    if(eps>=0 && eps<tn-1)
    {
      TT temp=tt[t0+eps];
      for(long i=eps; i<tn-1; i++)
        tt[t0+i]=tt[t0+i+1];
      tt[t0+tn-1]=temp;
      if(def>eps) def--;   // the default transition slid down by one
    }

    // Move the default transition right before the epsilon transition,
    // or to the last slot if there is none.
    const long deflast=(eps>=0) ? tn-2 : tn-1;
    if(def>=0 && def<deflast)
    {
      TT temp=tt[t0+def];
      for(long i=def; i<deflast; i++)
        tt[t0+i]=tt[t0+i+1];
      tt[t0+deflast]=temp;
    }

    // Rebuild the list structure of the state.
    for(long i=0; i<tn-1; i++)
      tt[t0+i].continued(true);
    tt[t0+tn-1].continued(false);
    tt[t0].final(state_final);
  }
}
//---------------------------------------------------------------------------
/// Fills itype and otype with the names of the symbol types.
/**
The names are taken from typeid(I).name() and typeid(O).name() with all
spaces removed, truncated to fit the ftTYPELEN-sized buffers (the previous
version copied without any bound).
NOTE(review): the format of typeid().name() is implementation-defined, so
the stored strings need not be readable type names such as "char".
*/
template <class I, class Ipass, class O, class Opass, class TT>
void TFT<I,Ipass,O,Opass,TT>::setiotypes()
{
  unsigned int i=0;
  for(const char* it=typeid(I).name(); *it && i+1<ftTYPELEN; ++it)
    if(*it!=' ')
      itype[i++]=*it;
  itype[i]='\0';

  i=0;
  for(const char* ot=typeid(O).name(); *ot && i+1<ftTYPELEN; ++ot)
    if(*ot!=' ')
      otype[i++]=*ot;
  otype[i]='\0';
}
//---------------------------------------------------------------------------
/// Dumps the transition table to \a os: for every slot its index, a colon
/// and the transition itself (as printed by the transition's operator<<).
template <class I, class Ipass, class O, class Opass, class TT>
void TFT<I,Ipass,O,Opass,TT>::prntt(ostream& os)
{
  long slot=0;
  while(slot<ttn)
  {
    os << slot << ':';
    os << tt[slot];
    ++slot;
  }
}
#endif

523
app/src/lib/tfti.h Executable file
View File

@ -0,0 +1,523 @@
#ifndef TFTiH
#define TFTiH
//---------------------------------------------------------------------------
#include <fstream.h>
#include <math.h>
#include <iomanip.h>
//#include <typeinfo.h>
#include "tft.h"
//---------------------------------------------------------------------------
// Finite-state transducer whose transitions are of type TTrans_i, with
// input/output routines: read/write use the text format (C++ streams),
// load/save use the binary format (C FILE streams).
template<class I, class Ipass, class O, class Opass>
class TFTi : public TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >
{
public:
// Creates an empty transducer.
TFTi() : TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >() {};
// Creates a transducer and loads it from a binary file.
TFTi(const char* filename)
: TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >() { load(filename); };
// Reads the transducer in text format from a file / stream.
void read(const char* filename);
void read(istream& is=cin);
// Writes the transducer in text format to a file / stream.
void write(const char* filename);
void write(ostream& os=cout);
// Loads the transducer in binary format; an empty file name means stdin.
void load(const char* filename);
void load(FILE* f=stdin);
// Saves the transducer in binary format; an empty file name means stdout.
void save(const char* filename);
void save(FILE* f=stdout);
// Releases the transition table.
void clear();
// Members of the dependent base class that are used unqualified in the
// member functions of this template.
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::ttn;
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::states;
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::transitions;
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::itype;
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::ftTYPELEN;
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::otype;
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::tt;
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::copy_default;
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::print_mode;
// friend istream& operator>>(istream&, TFTi<I,Ipass,O,Opass>&);
// friend ostream& operator<<(ostream&, const TFTi<I,Ipass,O,Opass>&);
};
//---------------------------------------------------------------------------
// Reads the transducer in text format from the named file.
// Terminates the program with exit status 1 if the file cannot be opened.
template<class I, class Ipass, class O, class Opass>
void TFTi<I,Ipass,O,Opass>::read(const char* filename)
{
  ifstream input(filename);
  if(input.fail())
  {
    fprintf(stderr,"Failed to open input file.");
    exit(1);
  }
  read(input);
}
// Reads the transducer in text format from a stream.
//
// Format: a header line "<states> <transitions> <intype> <outtype>
// [options...]" followed by one line per state: the state number, '+'
// (final) or '-' (not final) and the state's transitions. A transition is
// an input symbol ('~' epsilon, '@' default), optionally '/' and an output
// symbol, followed by the number of the target state.
//
// Any inconsistency in the input prints "Input error." and terminates the
// program with exit status 1. Previously several error paths returned
// silently and left a half-built automaton behind.
//
// Other fixes: the option "OOII" is recognised (the branch was a duplicate
// of "OIOI"); reads into character arrays are bounded; the state-index
// array is released on every path; the header loop stops at end of input;
// the transition table cannot be overrun.
template<class I, class Ipass, class O, class Opass>
void TFTi<I,Ipass,O,Opass>::read(istream& is)
{
  long *si=NULL;   // state-index relation: first tt slot of each state
  long ci=0;       // current index into tt
  char ch;         // character read
  int empty=0;     // number of states without transitions
  bool ok=false;   // becomes true only after a complete, consistent read
  char intype[FT::ftTYPELEN];
  char outtype[FT::ftTYPELEN];

  clear();
  is >> states >> transitions
     >> setw((int)FT::ftTYPELEN) >> intype
     >> setw((int)FT::ftTYPELEN) >> outtype;
  // if(strcmp(intype,itype)!=0 ||
  //    strcmp(outtype,otype)!=0 && strcmp(outtype,"void")!=0)
  //   { is.clear(ios::badbit); goto end; };

  // Optional keywords on the rest of the header line.
  while(is.peek()==' ' || is.peek()=='\t') is.get(ch);
  while(is && is.peek()!='\n')
  {
    char s[20];
    is >> setw((int)sizeof(s)) >> s;
    if(strcmp(s,"COPY")==0 && strcmp(intype,outtype)==0) copy_default=true;
    else if(strcmp(s,"NOCOPY")==0) copy_default=false;
    else if(strcmp(s,"II")==0) print_mode=FT::II;
    else if(strcmp(s,"OO")==0) print_mode=FT::OO;
    else if(strcmp(s,"IOIO")==0) print_mode=FT::IOIO;
    else if(strcmp(s,"OIOI")==0) print_mode=FT::OIOI;
    else if(strcmp(s,"IIOO")==0) print_mode=FT::IIOO;
    else if(strcmp(s,"OOII")==0) print_mode=FT::OOII;
    while(is.peek()==' ' || is.peek()=='\t') is.get(ch);
  }
  if(!is) goto end;

  ttn=transitions+2; // 1 state without trans., 1 additional
  si=new long[states];
  tt=new TTrans_i<I,Ipass,O,Opass>[ttn];

  for(long cs=0;(unsigned long)cs<states;cs++)
  {
    long tc; // transition counter
    long cscheck;

    if((unsigned long)ci>=ttn) goto end;   // more slots than announced
    si[cs]=ci;

    // State number and final/non-final marker.
    if(!is) goto end;
    while(is.peek()==' ' || is.peek()=='\t') is.get(ch);
    is >> cscheck;
    if(cs!=cscheck) goto end;
    while(is.peek()==' ' || is.peek()=='\t') is.get(ch);
    is.get(ch);
    if(!is) goto end;
    switch(ch)
    {
      case '-': tt[ci].final(false); break;
      case '+': tt[ci].final(true); break;
      default: goto end;
    }
    tc=0, tt[ci].continued(false);

    // Transitions of the state, up to the end of the line.
    while(is.peek()==' ' || is.peek()=='\t') is.get(ch);
    while(is && is.peek()!='\n')
    {
      if((unsigned long)ci>=ttn) goto end; // more slots than announced

      switch(is.peek())
      {
        case '~': tt[ci].epsi(true); tt[ci].defi(true); is.get(ch);
                  break;
        case '@': tt[ci].epsi(false); tt[ci].defi(true); is.get(ch);
                  break;
        default : tt[ci].geti(is);
      }
      if(!is) goto end;

      if(is.peek()=='/')
      {
        is.get(ch);
        switch(is.peek())
        {
          case '~': tt[ci].epso(true); tt[ci].defo(true); is.get(ch);
                    break;
          case '@': tt[ci].epso(false); tt[ci].defo(true); is.get(ch);
                    break;
          default : tt[ci].geto(is);
        }
      }
      else
      {
        // No explicit output: default output, which is either a copy of
        // the input (COPY) or epsilon (NOCOPY).
        tt[ci].defo(true);
        if(copy_default) tt[ci].epso(false); else tt[ci].epso(true);
      }
      if(!is) goto end;

      unsigned long transition;
      is >> transition;
      tt[ci].next(transition);
      tt[ci].continued(false);
      tt[ci].empty(false);
      if(tc>0) tt[ci-1].continued(true);
      tc++,ci++;
    }

    if(tc==0)
    {
      if(++empty>2) { fprintf(stderr, "Nondeterministic automaton."); exit(1); }
      tt[ci].empty(true);
      ci++;
    }

    is.get(ch);
    if(!is || ch!='\n') goto end;
  }

  ttn=transitions+empty;
  if(ttn!=(unsigned long)ci) goto end;

  // Replace target state numbers by the index of the state's first slot.
  for(long i=0;(unsigned long)i<ttn;i++)
    tt[i].next(si[tt[i].next()]);

  this->sort();
  ok=true;

end:
  delete[] si;
  if(!ok) { fprintf(stderr,"Input error."); exit(1); }
}
//---------------------------------------------------------------------------
// Writes the transducer in text format to the named file.
// A failure to open the file is reported through err(); writing is then
// attempted regardless, exactly as err() permits by returning.
template<class I, class Ipass, class O, class Opass>
void TFTi<I,Ipass,O,Opass>::write(const char* filename)
{
  ofstream output(filename);
  if(output.fail())
  {
    err("Failed to open output file.");
  }
  write(output);
}
// Writes the transducer in text format to a stream.
//
// The header line holds the number of states, the number of transitions
// and the fixed type names "char void"; the options (COPY/NOCOPY, print
// mode) are not written. Each following line describes one state: its
// number, '+' or '-' and its transitions (input symbol, optional '/' and
// output symbol, number of the target state).
//
// Fixes: the slot-to-state array is released; members inherited from the
// template base class are accessed through this-> (they are not visible
// to unqualified lookup in a template); log10 gets an explicit double.
template<class I, class Ipass, class O, class Opass>
void TFTi<I,Ipass,O,Opass>::write(ostream& os)
{
  os << states << ' ' << transitions << ' ';
  os << "char void";
  os << '\n';

  // si[i] = number of the state that slot i belongs to.
  long* si=new long[ttn];
  long cs=0;
  for(long i=0;(unsigned long)i<ttn;i++)
  {
    si[i]=cs;
    if(this->continued(i)==false) cs++;
  }

  int statefieldwidth=(int)log10((double)(cs+1));
  bool first=true;   // true when slot i starts a new state line
  for(long i=0;(unsigned long)i<ttn;i++)
  {
    if(first)
    {
      os << setw(statefieldwidth) << si[i] << " ";
      if(this->final(i)) os << '+'; else os << '-';
    }
    if(!this->empty(i))
    {
      os << ' ';
      // Input symbol.
      if(this->epsi(i))
        os << FT::ftEPSILON;
      else if(this->defi(i))
        os << FT::ftDEFAULT;
      else
        os << this->input(i);
      // Output symbol; omitted when it equals what read() would assume
      // for a missing output under the current copy_default setting.
      if(this->epso(i))
      { if(copy_default) os << '/' << FT::ftEPSILON; }
      else if(this->defo(i))
      { if(!copy_default) os << '/' << FT::ftDEFAULT; }
      else
      { os << '/' << this->output(i); }
      if(strcmp(itype,"char")!=0 || strcmp(otype,"char")!=0)
        os << ' ';
      os << si[this->next(i)];
    }
    if(this->continued(i))
      first=false;
    else
    { os << '\n'; first=true; }
  }

  delete[] si;
}
//---------------------------------------------------------------------------
// Loads the transducer in binary format from the named file; an empty
// file name selects standard input. Prints a message and returns if the
// file cannot be opened.
template<class I, class Ipass, class O, class Opass>
void TFTi<I,Ipass,O,Opass>::load(const char* filename)
{
  FILE* source = *filename ? fopen(filename,"rb") : stdin;
  if(source==NULL)
  {
    fprintf(stderr, "Cannot open automaton file.");
    return;
  }
  load(source);
}
// Loads the transducer in binary format from an open file and closes it.
//
// The layout is the one produced by save(): ttn, states, transitions,
// itype, otype, copy_default, print_mode and then the ttn transitions.
// On any error a message is printed, the transducer is left empty
// (no transition table, ttn==0) and the file is closed as well; previously
// the file stayed open and ttn could describe a table that did not exist.
template<class I, class Ipass, class O, class Opass>
void TFTi<I,Ipass,O,Opass>::load(FILE* f)
{
  clear();
  tt=NULL;

  const char* problem=NULL;
  if(fread(&ttn,sizeof(ttn),1,f)!=1
     || fread(&states,sizeof(states),1,f)!=1
     || fread(&transitions,sizeof(transitions),1,f)!=1
     || fread(itype,sizeof(char),ftTYPELEN,f)!=ftTYPELEN
     || fread(otype,sizeof(char),ftTYPELEN,f)!=ftTYPELEN
     || fread(&copy_default,sizeof(copy_default),1,f)!=1
     || fread(&print_mode,sizeof(print_mode),1,f)!=1)
    problem="Binary input error.";
  else if((tt=new TTrans_i<I,Ipass,O,Opass>[ttn])==NULL)
    problem="Cannot allocate memory for tt.";
  else if(fread(tt,sizeof(TTrans_i<I,Ipass,O,Opass>),ttn,f)!=ttn)
    problem="Binary input error.";

  if(problem)
  {
    fprintf(stderr, "%s", problem);
    delete[] tt;
    tt=NULL;
    ttn=0;
  }
  fclose(f);
}
//---------------------------------------------------------------------------
// Saves the transducer in binary format to the named file; an empty file
// name selects standard output. A failure to open the file is reported
// through err(); saving is then attempted regardless.
template<class I, class Ipass, class O, class Opass>
void TFTi<I,Ipass,O,Opass>::save(const char* filename)
{
  FILE* target = *filename ? fopen(filename,"wb") : stdout;
  if(target==NULL)
  {
    err("Cannot open file.");
  }
  save(target);
}
// Saves the transducer in binary format to an open file and closes it.
// Written in order: ttn, states, transitions, itype, otype, copy_default,
// print_mode and the ttn transitions. The first failed write prints a
// message and terminates the program with exit status 1.
template<class I, class Ipass, class O, class Opass>
void TFTi<I,Ipass,O,Opass>::save(FILE* f)
{
  // && stops at the first write that fails, like the original sequence.
  const bool written =
       fwrite(&ttn,sizeof(ttn),1,f)==1
    && fwrite(&states,sizeof(states),1,f)==1
    && fwrite(&transitions,sizeof(transitions),1,f)==1
    && fwrite(itype,sizeof(char),ftTYPELEN,f)==ftTYPELEN
    && fwrite(otype,sizeof(char),ftTYPELEN,f)==ftTYPELEN
    && fwrite(&copy_default,sizeof(copy_default),1,f)==1
    && fwrite(&print_mode,sizeof(print_mode),1,f)==1
    && fwrite(tt,sizeof(TTrans_i<I,Ipass,O,Opass>),ttn,f)==ttn;

  if(!written)
  {
    fprintf(stderr,"Binary output error.");
    exit(1);
  }
  fclose(f);
}
//---------------------------------------------------------------------------
// Releases the transition table and marks the transducer as empty.
// The pointer is reset so that a second clear() (every read/load starts
// with one) does not delete the same array again.
template<class I, class Ipass, class O, class Opass>
void TFTi<I,Ipass,O,Opass>::clear()
{
  delete[] tt;   // deleting a null pointer is a no-op
  tt=NULL;
  ttn=0;
}
//---------------------------------------------------------------------------
/*
template<class I, class Ipass, class O, class Opass>
istream& operator>>(istream& is, TFTi<I,Ipass,O,Opass>& ft)
{
long *si; // state-index relation
long ci=0; // current index
char ch; // character read;
int empty=0; // no of states with 0 trans?
char intype[FT::ftTYPELEN];
char outtype[FT::ftTYPELEN];
ft.clear();
is >> ft.states >> ft.transitions >> intype >> outtype;
if(strcmp(intype,ft.itype)!=0 ||
strcmp(outtype,ft.otype)!=0 && strcmp(outtype,"void")!=0)
{ is.clear(ios::badbit); return is; };
while(is.peek()==' ' || is.peek()=='\t') is.get(ch);
while(is.peek()!='\n')
{
char s[20];
is >> s;
if(strcmp(s,"COPY")==0 && strcmp(intype,outtype)==0) ft.copy_default=true;
else if(strcmp(s,"NOCOPY")==0) ft.copy_default=false;
else if(strcmp(s,"II")==0) ft.print_mode=FT::II;
else if(strcmp(s,"OO")==0) ft.print_mode=FT::OO;
else if(strcmp(s,"IOIO")==0) ft.print_mode=FT::IOIO;
else if(strcmp(s,"OIOI")==0) ft.print_mode=FT::OIOI;
else if(strcmp(s,"IIOO")==0) ft.print_mode=FT::IIOO;
else if(strcmp(s,"OIOI")==0) ft.print_mode=FT::OIOI;
while(is.peek()==' ' || is.peek()=='\t') is.get(ch);
}
ft.ttn=ft.transitions+2; // 1 state without trans., 1 additional
si=new long[ft.states];
ft.tt=new TTrans_i<I,Ipass,O,Opass>[ft.ttn];
for(long cs=0;cs<ft.states;cs++)
{
long tc; // transition counter
si[cs]=ci;
do is >> ch; while(ch!='+' && ch!='-');
switch(ch)
{
case '-': ft.tt[ci].final(false); break;
case '+': ft.tt[ci].final(true); break;
default: return is;
}
tc=0, ft.tt[ci].continued(false);
while((is.get(ch),ch==' '))
{
if(!is) return is;
switch(is.peek())
{
case '~': ft.tt[ci].epsi(true); ft.tt[ci].defi(true); is.get(ch);
break;
case '@': ft.tt[ci].epsi(false); ft.tt[ci].defi(true); is.get(ch);
break;
default : ft.tt[ci].geti(is);
}
if(!is) return is;
if(is.peek()=='/')
{
is.get(ch);
switch(is.peek())
{
case '~': ft.tt[ci].epso(true); ft.tt[ci].defo(true); is.get(ch);
break;
case '@': ft.tt[ci].epso(false); ft.tt[ci].defo(true); is.get(ch);
break;
default : ft.tt[ci].geto(is);
}
}
else
{
ft.tt[ci].defo(true);
if(ft.copy_default) ft.tt[ci].epso(false); else ft.tt[ci].epso(true);
}
if(!is) return is;
unsigned long transition;
is >> transition;
ft.tt[ci].next(transition);
ft.tt[ci].continued(false);
ft.tt[ci].empty(false);
if(tc>0) ft.tt[ci-1].continued(true);
tc++,ci++;
}
if(tc==0)
{
if(++empty>2) err("Nondeterministic automaton.");
ft.tt[ci].empty(true);
ci++;
}
if(ch!='\n') { is.clear(ios::badbit); return is; }
}
ft.ttn=ft.transitions+empty;
if(ft.ttn!=ci) { is.clear(ios::badbit); return is; };
for(long i=0;i<ft.ttn;i++)
ft.tt[i].next(si[ft.tt[i].next()]);
delete[] si;
ft.sort();
return is;
}
*/
//---------------------------------------------------------------------------
/*
template<class I, class Ipass, class O, class Opass>
ostream& operator<<(ostream& os, const TFTi<I,Ipass,O,Opass>& ft)
{
os << ft.states << ' ' << ft.transitions << ' '
<< ft.itype << ' ' << ft.otype << ' ';
os << (ft.copy_default ? "COPY" : "NOCOPY") << ' ';
switch(ft.print_mode)
{
case FT::II : os << "II"; break;
case FT::OO : os << "OO"; break;
case FT::IOIO: os << "IOIO"; break;
case FT::OIOI: os << "OIOI"; break;
case FT::IIOO: os << "IIOO"; break;
case FT::OOII: os << "OOII";
}
os << ' ' << '\n';
long* si=new long[ft.ttn];
long cs=0;
for(long i=0;i<ft.ttn;i++)
{
si[i]=cs;
if(ft.continued(i)==false) cs++;
}
bool first=true;
for(long i=0;i<ft.ttn;i++)
{
if(first)
if(ft.final(i)) os << '+'; else os << '-';
if(!ft.empty(i))
{
os << ' ';
if(ft.epsi(i))
os << FT::ftEPSILON;
else
if(ft.defi(i))
os << FT::ftDEFAULT;
else
os << ft.input(i);
if(ft.epso(i))
{ if(ft.copy_default) os << '/' << FT::ftEPSILON; }
else
if(ft.defo(i))
{ if(!ft.copy_default) os << '/' << FT::ftDEFAULT; }
else
{ os << '/' << ft.output(i); }
if(strcmp(ft.itype,"char")!=0 || strcmp(ft.otype,"char")!=0)
os << ' ';
os << si[ft.next(i)];
}
if(ft.continued(i))
first=false;
else
{ os << '\n'; first=true; }
}
return os;
}
*/
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
//! Transducer whose input and output symbols are passed by value
//! (TFTi with Ipass = I and Opass = O).
template<class I, class O>
class TFTiv : public TFTi<I,I,O,O>
{
  typedef TFTi<I,I,O,O> Base;
public:
  //! Default-constructs the underlying TFTi.
  TFTiv() : Base() {}
  //! Forwards \a filename to the TFTi constructor taking a file name.
  TFTiv(const char* filename) : Base(filename) {}
};
//---------------------------------------------------------------------------
//! Transducer whose input and output symbols are passed by reference
//! (TFTi with Ipass = I& and Opass = O&).
template<class I, class O>
class TFTir : public TFTi<I,I&,O,O&>
{
public:
  //! Default-constructs the base. The initializer must name the actual base
  //! class TFTi<I,I&,O,O&>; the previous TFTi<I,I,O,O> is not a base of this
  //! class and fails to compile as soon as the template is instantiated.
  TFTir() : TFTi<I,I&,O,O&>() {}
};
//---------------------------------------------------------------------------
#endif

204
app/src/lib/ttrans.h Executable file
View File

@ -0,0 +1,204 @@
#ifndef _TTransi_h
#define _TTransi_h
//---------------------------------------------------------------------------
#include <iostream.h>
//---------------------------------------------------------------------------
//! The template for a transition with input and output symbols stored internally.
/*!
A state is identified with the set of its outgoing transitions.
The state index is the index of the first transition for it.
A state with no outgoing transition is represented as an empty transition.
*/
template<class I, class Ipass, class O, class Opass>
class TTrans_i
{
public:
//private:
  //! Input symbol
  I i;
  //! Output symbol
  O o;

public:
  //! state is final
  static const unsigned char BITf=0x01;
  //! transition list is continued
  static const unsigned char BITc=0x02;
  //! no transition
  static const unsigned char BITe=0x04;
  //! epsilon input
  static const unsigned char BITepsi=0x08;
  //! default input
  static const unsigned char BITdefi=0x10;
  //! epsilon output
  static const unsigned char BITepso=0x20;
  //! default output
  static const unsigned char BITdefo=0x40;

  //! Flags (a combination of the BIT* masks above)
  unsigned char flags;
  //! The index of the next state
  long nxt;

  //! Input symbol.
  //! \return The input symbol of the transition.
  Ipass in() const { return i; }
  //! Output symbol.
  //! \return The output symbol of the transition.
  Opass out() const { return o; }
  //! Set the input symbol.
  //! \param in input symbol
  void in(Ipass in) { i=in; }
  //! Set the output symbol.
  //! \param out output symbol
  void out(Opass out) { o=out; }

  //! Mutable reference to the stored input symbol (used by geti()).
  I& iref() { return i; }
  //! Mutable reference to the stored output symbol (used by geto()).
  O& oref() { return o; }

  //! Test whether an input symbol is accepted.
  //! \return \c true if the default-input flag is set or \a in equals the stored input symbol.
  //! \remark Simplified. Should rely on a test function provided by the user.
  bool accepts(Ipass in) { return defi() || in==i; }

  //! Next state.
  //! \return Destination state of the transition.
  long next() const { return nxt; }
  //! Set the next state.
  //! \param t destination state of the transition
  void next(long t) { nxt=t; }

  //! Is the state final?
  //! \return \c true if the state is final, false otherwise.
  bool final() const { return (flags&BITf)!=0; }
  //! Set the \b final flag.
  //! \param b \c true if the state is final, \c false otherwise.
  void final(bool b) { flags = b ? (flags|BITf) : (flags&~BITf); }

  //! Is the transition list continued?
  //! \return \c true if the transition is not the last transition for the state,
  //! \c false otherwise.
  bool continued() const { return (flags&BITc)!=0; }
  //! Set the \b continuation flag.
  //! \param b \c true if the transition is not the last one for the state, \c false otherwise.
  void continued(bool b) { flags = b ? (flags|BITc) : (flags&~BITc); }

  //! Is the transition empty?
  //! \return \c true if the transition is empty (represents a state with no outgoing transitions),
  //! \c false otherwise.
  bool empty() const { return (flags&BITe)!=0; }
  //! Set the \b empty flag.
  //! \param b \c true if the transition is empty, \c false otherwise.
  void empty(bool b) { flags = b ? (flags|BITe) : (flags&~BITe); }

  //! Get / set the \b epsilon-input flag.
  bool epsi() const { return (flags&BITepsi)!=0; }
  void epsi(bool b) { flags = b ? (flags|BITepsi) : (flags&~BITepsi); }
  //! Get / set the \b default-input flag.
  bool defi() const { return (flags&BITdefi)!=0; }
  void defi(bool b) { flags = b ? (flags|BITdefi) : (flags&~BITdefi); }
  //! Get / set the \b epsilon-output flag.
  bool epso() const { return (flags&BITepso)!=0; }
  void epso(bool b) { flags = b ? (flags|BITepso) : (flags&~BITepso); }
  //! Get / set the \b default-output flag.
  bool defo() const { return (flags&BITdefo)!=0; }
  void defo(bool b) { flags = b ? (flags|BITdefo) : (flags&~BITdefo); }

  //! Read the input symbol from a stream.
  void geti(istream&);
  //! Read the output symbol from a stream.
  void geto(istream&);
  // friend ostream& operator<<(ostream& os, const TTrans_i<I,Ipass,O,Opass>& t);
};
//---------------------------------------------------------------------------
//! Generic symbol reader: extracts one symbol with the stream's operator>>.
template<class T>
void getsym(istream& is, T& s)
{ is >> s; }

//! Symbol reader for \c char symbols with escape handling.
/*!
    Reads one non-whitespace character. If it is a backslash, the character
    that follows is taken literally, except that \c \\n yields a newline and
    \c \\t yields a tab.

    This has to be an explicit specialization of the generic template. The
    previous header \c template<char> declared an unrelated template with a
    non-type parameter, which a call such as \c getsym<I>(...) with a type
    argument can never select, so the escape handling was dead code.
    It is \c inline because it is a complete function defined in a header.
*/
template<>
inline void getsym<char>(istream& is, char& c)
{
  is >> c;
  if(c=='\\')
  {
    is.get(c);
    switch(c)
    {
      case 'n': c='\n'; break;
      case 't': c='\t'; break;
    }
  }
}
//---------------------------------------------------------------------------
//! Read the input symbol of this transition from \a is (via getsym).
template<class I, class Ipass, class O, class Opass>
void TTrans_i<I,Ipass,O,Opass>::geti(istream& is)
{
  getsym<I>(is,i);
}
//! Read the output symbol of this transition from \a is (via getsym).
//! The symbol type is O; the previous getsym<I> only compiled when I and O
//! happened to be the same type.
template<class I, class Ipass, class O, class Opass>
void TTrans_i<I,Ipass,O,Opass>::geto(istream& is)
{
  getsym<O>(is,oref());
}
//---------------------------------------------------------------------------
/*
template<class I, class Ipass, class O, class Opass>
ostream& operator<<(ostream& os, const TTrans_i<I,Ipass,O,Opass>& t)
{
os << (t.final() ? '+' : '-');
os << ' ';
if(!t.empty())
{
if(t.defi())
os << (t.epsi() ? '~' : '@');
else
switch(t.in())
{
case ' ': os << "\\ "; break;
case '\n': os << "\\n"; break;
case '\t': os << "\\t"; break;
default: os << t.in();
}
os << '/';
if(t.defo())
os << (t.epso() ? '~' : '@');
else
switch(t.out())
{
case ' ': os << "\\ "; break;
case '\n': os << "\\n"; break;
case '\t': os << "\\t"; break;
default: os << t.out();
}
os << ' ' << t.next();
}
os << '\n';
if(!t.continued())
os << '\n';
return os;
}
*/
//---------------------------------------------------------------------------
#endif

199
app/src/lib/word.cc Normal file
View File

@ -0,0 +1,199 @@
//---------------------------------------------------------------------------
#include "word.h"
#include "auttools.h"
#include <istream.h>
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
/// Fill form, lemma and description from a form and a "LEMMA,DESCR" string.
/// \param fo word form, copied to f
/// \param de text of the shape "LEMMA,DESCR". The part before the first comma
///           is the lemma; if it starts with a digit it is treated as an
///           encoded lemma and expanded with fullform(). The part after the
///           comma becomes the description; with no comma the description is
///           left empty (previously the copy started one byte past the
///           terminating NUL).
/// All copies are truncated to the size of the destination buffer.
void Word::autodescr(const char* fo, const char* de)
{
  strncpy(f,fo,sizeof(f)-1);
  f[sizeof(f)-1]='\0';
  // len=strlen(f);

  char lemd[MAXDESCRLEN];
  const int i=strcspn(de,",");
  const int n=(i<(int)sizeof(lemd)-1) ? i : (int)sizeof(lemd)-1;
  strncpy(lemd,de,n);
  lemd[n]='\0';

  if(isdigit((unsigned char)lemd[0]))
    fullform(f,lemd,l);            // lemma is encoded
  else
  {
    strncpy(l,lemd,sizeof(l)-1);   // lemma is given in full
    l[sizeof(l)-1]='\0';
  }

  const char* rest = (de[i]==',') ? de+i+1 : de+i;
  strncpy(d,rest,sizeof(d)-1);
  d[sizeof(d)-1]='\0';
}
//---------------------------------------------------------------------------
/// Ordering predicate used by Words::sort(): non-zero when \a a has a strictly
/// larger suffix weight than \a b (i.e. sorts by descending weight).
int Word::cmp_w(Word a, Word b) {
  const float wa = a.w_suf();
  const float wb = b.w_suf();
  return wa > wb ? 1 : 0;
}
//---------------------------------------------------------------------------
/// Read a word (a run of alphabetic characters, at most Word::MAXLEN) into w.f.
/// The first non-alphabetic character is put back on the stream. If no
/// alphabetic character could be read, badbit is set on the stream.
istream& operator>>(istream& is, Word& w)
{
  char temp[Word::MAXLEN+1];
  char c;
  int i=0;
  bool pending=false;   // c holds a consumed character that is NOT part of the word

  while(i<Word::MAXLEN && is.get(c))
  {
    // cast: passing a negative char to isalpha() is undefined behaviour
    if(!isalpha((unsigned char)c)) { pending=true; break; }
    temp[i++]=c;
  }
  if(i==Word::MAXLEN) {
    fprintf(stderr, "To long word");
  }
  if(i==0) is.clear(ios::badbit);
  temp[i]='\0';

  // Put back only a character that was read but not stored. When the loop
  // stops on the length limit, c is the last stored letter and must not be
  // returned to the stream a second time.
  if(is && pending)
    is.putback(c);

  strncpy(w.f,temp,sizeof(w.f)-1);
  w.f[sizeof(w.f)-1]='\0';
  // w.len=i;
  return is;
}
//---------------------------------------------------------------------------
/// Print a word as "<W FORM;LEMMA,DESCR>"; a word with an empty form prints nothing.
ostream& operator<<(ostream& os, Word& w)
{
  if(w.f[0]=='\0')
    return os;
  os << "<W " << w.form();
  os << ";" << w.lemma();
  os << ',' << w.descr() << '>';
  return os;
}
//---------------------------------------------------------------------------
Words::~Words() {
// Nothing to release here: tab is declared as vector<Word> in word.h, so the
// elements are destroyed together with the vector. The loop below appears to
// be a leftover from a version that stored pointers.
// for (int i=0; i<tab.size(); ++i)
// delete(tab[i]);
}
//---------------------------------------------------------------------------
/// Linear search among the first cnt entries for one whose form equals \a word.
/// \return index of the first match, or -1 if there is none.
int Words::find(const char* word) {
  int i=0;
  while (i<cnt && strcmp(word, tab[i].form()) != 0)
    ++i;
  return (i<cnt) ? i : -1;
}
//---------------------------------------------------------------------------
/// Linear search among the first cnt entries for one whose form equals \a word
/// and whose description equals \a descr.
/// \return index of the first match, or -1 if there is none.
int Words::find(const char* word, const char* descr) {
  for (int i=0; i<cnt; ++i) {
    if (strcmp(word, tab[i].form()) != 0)
      continue;
    if (strcmp(descr, tab[i].descr()) != 0)
      continue;
    return i;
  }
  return -1;
}
//---------------------------------------------------------------------------
/* Returns the index of the next result: the first call returns the index of
 * the entry with the highest weight, the second call the entry with the
 * second highest weight, and so on. Returned entries are marked so that they
 * are not returned again.
 * Returns -1 when there are no more results.
 */
int Words::next() {
  int best = -1;
  float bestWeight = -1;
  for (int i=0; i<cnt; ++i) {
    if (tab[i].returned)
      continue;
    const float w = tab[i].w_suf();
    if (w > bestWeight) {
      bestWeight = w;
      best = i;
    }
  }
  if (best == -1)
    return -1;
  tab[best].returned = 1;
  return best;
}
//---------------------------------------------------------------------------
void Words::sort() {
std::sort(tab.begin(), tab.end(), Word::cmp_w);
}
//---------------------------------------------------------------------------
int Words::add(const char* fo)
{
int i = find(fo);
if(i!=-1) {
return i;
}
if (cnt>=tab.capacity()-1)
tab.resize(tab.size()*2);
tab[cnt].form(fo);
tab[cnt].w_suf(0.0);
// if(cnt<MAX-1) {
/* tab.push_back(new Word());
tab[cnt]->form(fo);
tab[cnt]->w_suf(0.0);
tab[cnt]->w_pref(0.0);*/
return cnt++;
// }
//return -1;
}
//---------------------------------------------------------------------------
int Words::add(const char* fo, const char* des)
{
char d[Word::MAXDESCRLEN];
int l=strcspn(des,",");
int ok=1;
if( *(des+l) == ',' )
{
strcpy(d,des+l+1);
// printf("\t%s->%s,\n", des, d);
int i=find(fo, d);
if(i!=-1)
return i;
}
else
ok=0;
if (cnt>=tab.capacity()-1)
tab.resize(tab.size()*2);
tab[cnt].form(fo);
if(ok)
tab[cnt].autodescr(fo, des);
else
tab[cnt].autodescr(fo, "?,?");
tab[cnt].w_suf(0.0);
tab[cnt].returned = 0;
/*
// if(cnt<MAX-1) {
tab.push_back(new Word());
tab[cnt]->form(fo);
tab[cnt]->autodescr(fo,des);
tab[cnt]->w_suf(0.0);
tab[cnt]->w_pref(0.0);
// printf("ok!\n");*/
return cnt++;
// }
// printf("hm\n");
return -1;
}
//---------------------------------------------------------------------------
void Words::prn(ostream& os)
{
for(int i=0; i<count(); ++i)
os << "<W " << tab[i].lemma() << ',' << tab[i].descr() << ">";
}
//---------------------------------------------------------------------------
/// Stream output for a whole Words table. Printing is currently disabled
/// (the loop is kept below for reference); the stream is returned unchanged.
/// The function used to end without a return statement, which is undefined
/// behaviour for a function returning a reference.
ostream& operator<<(ostream& os, Words& tab)
{
  /* for(int i=0; i<tab.count(); ++i)
       os << i << ". " << tab[i] << '\n'; */
  return os;
}
//---------------------------------------------------------------------------

145
app/src/lib/word.h Normal file
View File

@ -0,0 +1,145 @@
//---------------------------------------------------------------------------
#ifndef _Word_h
#define _Word_h
//---------------------------------------------------------------------------
//#include "alphabet.h"
//#include "erro.h"
#include "const.h"
#include <iostream.h>
#include <vector>
//---------------------------------------------------------------------------
using namespace std;
/// A word form together with its lemma, morphological description and weight.
class Word
{
public:
  static const int MAXLEN=64;       // TODO: move to a global constant
  static const int MAXDESCRLEN=80;  // TODO: move to a global constant

private:
  /// word form
  char f[MAX_FORM];   // the short member names could be replaced with longer ones some day
  /// length of the suffix match
  int _len_suf;
  // int _len_pref;   // ... and of the prefix match
  /// lemma
  char l[MAX_FORM];
  /// description
  char d[MAX_DESC];
  /// weight (probability)
  float _w_suf;
  // float _w_pref;

public:
  /// Comparison function for sorting by weight (defined in word.cc).
  static int cmp_w(Word a, Word b);

  /// Creates an empty word. Every member is initialised: the copy constructor
  /// strcpy()s f, l and d, so leaving l or d unset would make copying a
  /// default-constructed Word read uninitialised memory.
  Word() : _len_suf(-1), _w_suf(0.0) { *f='\0'; *l='\0'; *d='\0'; returned=0; };
  /// Creates a word from a form and a "LEMMA,DESCR" annotation (see autodescr()).
  Word(const char* fo, const char* des) : _len_suf(-1) { autodescr(fo,des); _w_suf=1.0; returned=0; };
  Word(const Word& w);

  char* form() { return f; }    // TODO: restore const
  char* lemma() { return l; }   // TODO: restore const
  char* descr() { return d; }
  float w_suf() { return _w_suf; };
  int len_suf() { return _len_suf; }

  void form(const char* s) { strcpy(f,s); }
  void lemma(const char* s) { strcpy(l,s); }
  void descr(const char* s) { strcpy(d,s); };
  void w_suf(float x) { _w_suf=x; };
  void len_suf(int n) { _len_suf=n; };

  bool operator==(const Word& w);
  bool operator!=(const Word& w);
  int cmp(const Word&);
  int cmpi(const Word&);

  /// Returns the word form.
  char* operator!() { return f; };
  /// True when the suffix match length is positive.
  operator bool() { return _len_suf>0; };
  char* str() { return f; }

  void autodescr(const char* fo, const char* des);

  friend istream& operator>>(istream& is, Word& m);
  friend ostream& operator<<(ostream& os, Word& m);

  /// Set by Words::next() once this entry has been handed out.
  bool returned;
};
/// Copy constructor: copies form, lemma, description, match length and weight.
/// The \c returned flag is not copied; it is reset to false in the copy.
inline Word::Word(const Word& word)
  : _len_suf(word._len_suf), _w_suf(word._w_suf), returned(0)
{
  strcpy(f,word.f);
  strcpy(l,word.l);
  strcpy(d,word.d);
}
//---------------------------------------------------------------------------
/// Two words are equal when match length, form, lemma and description all agree.
inline bool Word::operator==(const Word& w)
{
  if(_len_suf!=w._len_suf)
    return false;
  return strcmp(f,w.f)==0 && strcmp(l,w.l)==0 && strcmp(d,w.d)==0;
}
//---------------------------------------------------------------------------
/// Two words differ when match length, form, lemma or description differs.
inline bool Word::operator!=(const Word& w)
{
  if(_len_suf!=w._len_suf)
    return true;
  return strcmp(f,w.f)!=0 || strcmp(l,w.l)!=0 || strcmp(d,w.d)!=0;
}
//---------------------------------------------------------------------------
/// Compare the forms of two words with strcmp() and return its result.
inline int Word::cmp(const Word& w)
{
  const int order = strcmp(f,w.f);
  return order;
}
//---------------------------------------------------------------------------
//inline int Word::cmpi(const Word& w) { return PL.cmpi(f,w.f); }
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
//---------------------------------------------------------------------------
/// A growable table of Word entries; the first cnt slots of tab are in use.
class Words
{
private:
  int find(const char* word);
  int find(const char* word, const char* descr);

public:
  /// Number of slots allocated up front.
  static const int MAX=1024;

  Words() : cnt(0) { tab.resize(MAX); };
  ~Words();

  Word& operator[](int i) { return tab[i]; }
  int count() const { return cnt; }

  /// Forget all entries. The table is refilled with MAX fresh slots because
  /// add() writes to tab[cnt] directly; leaving the vector empty here made the
  /// next add() index past the end of the vector.
  void clear() { cnt=0; tab.clear(); tab.resize(MAX); }

  int add(const char* fo);
  int add(const char* fo, const char* des);

  /* Returns the index of the next result: the first call returns the index
   * of the entry with the highest weight, the second call the entry with the
   * second highest weight, and so on.
   * Returns -1 when there are no more results.
   */
  int next();
  void sort();
  void prn(ostream& os);

  // friend class Lem;
  // friend class AuxLem;
  friend ostream& operator<<(ostream& os, Words& tab);

  vector<Word> tab;
  int cnt;
};
//---------------------------------------------------------------------------
#endif

6
app/src/mar/Makefile Normal file
View File

@ -0,0 +1,6 @@
# mar is a script, so there is nothing to build; `copy` installs it into the
# distribution tree when UTT_BIN_DIR is set.
# Both targets are commands, not files: declare them phony so that a file
# named `main` or `copy` can never make them look up to date.
.PHONY: main copy

main:

copy:
ifdef UTT_BIN_DIR
	cp mar ${UTT_BIN_DIR}
endif

262
app/src/mar/mar Executable file
View File

@ -0,0 +1,262 @@
#!/usr/bin/perl
#package: UAM Text Tools
#component name: mrk
#author: Marcin Walas
#this program tags the tokenized file with given tags
#tags can be given in any order and configuration through the expression
#which is one of the parametres of the script
#contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com
use strict;
use Getopt::Long;
use attr;
# Option names are case-sensitive (e.g. -e and -E are different options).
Getopt::Long::Configure('no_ignore_case_always');
# Defaults for the command-line options parsed by GetOptions further down.
my $help=0;
my $pattern=0;
my $macrofile=0;
my $define=0;
my $command=0;
# Processing stages to run; the script tests this string for the letters p, g and P.
my $action="pgP";
my $eos="seg(EOS)";
my $explicit_space=0;
# Helper that cuts the regular expression to obtain the part belonging to one tag.
# Argument: the pattern (after m4 processing).
# Returns the prefix of the pattern up to and including the character at which
# the parenthesis nesting level first returns to zero, i.e. the first
# root-level (...) group with its content. If the level never returns to zero
# the last character of the pattern is returned (undef for an empty pattern).
sub cutRe
{
    my ($text) = @_;
    my $depth  = 0;
    my $pos    = 0;
    my $result;
    foreach my $ch (split //, $text)
    {
        $result = $ch;
        if    ($ch eq "(") { $depth++; }   # opening parenthesis
        elsif ($ch eq ")") { $depth--; }   # closing parenthesis
        return substr($text, 0, $pos + 1) if $depth == 0;
        $pos++;
    }
    return $result;
}
# Counterpart of cutRe: returns everything that follows the first root-level
# (...) group, i.e. the text after the character at which the parenthesis
# nesting level first returns to zero. If the level never returns to zero the
# last character of the pattern is returned (undef for an empty pattern).
sub restRe
{
    my ($text) = @_;
    my $depth  = 0;
    my $pos    = 0;
    my $result;
    foreach my $ch (split //, $text)
    {
        $result = $ch;
        if    ($ch eq "(") { $depth++; }   # opening parenthesis
        elsif ($ch eq ")") { $depth--; }   # closing parenthesis
        return substr($text, $pos + 1) if $depth == 0;   # drop the leading group
        $pos++;
    }
    return $result;
}
# Command-line options.
#  - --define stores into $define; it used to be bound to $macrofile, the same
#    variable as --macros (copy-paste slip), leaving $define permanently unused.
#  - -a is accepted as the short form of --action, as the help text documents.
GetOptions("pattern|e=s" => \$pattern,
           "eos|E=s"     => \$eos,
           "macros=s"    => \$macrofile,
           "define=s"    => \$define,
           "command"     => \$command,
           "action|a=s"  => \$action,
           "help|h"      => \$help,
           "space|s"     => \$explicit_space
          );
# --help: print the usage text and exit successfully.
# NOTE(review): the text below advertises --bos and the action letter 's', while
# the option table registers "eos|E" and the script tests $action for 'g'
# (default "pgP") - confirm which spelling is intended before changing either.
if($help)
{
print <<'END'
Usage: mar [OPTIONS] [file ..]
Options:
--pattern -e PATTERN Pattern.
--bos -E PATTERN Segment serving as sentence beginning marker. [TODO]
--macros=FILE Read macrodefinitions from FILE. [TODO]
--define=FILE Add macrodefinitions from FILE. [TODO]
--action -a [p][s][P] Perform only indicated actions.
p - preprocess
s - search
P - postprocess
(default pgP)
--command Print generated shell command and exit.
--help -h Print help.
In patern you can put any tag. Tags should begin with the @ character.
They don't have to be closed.
They can't contain white spaces!
Note: If you don't define any custom tags, whole pattern will be taged with
default tags (begining of match and end of match)
Tags examples:
mar -e '@BEG cat(<ADJ>) @END'
it will find any adjectives in the text and tag them with surrounding tags
mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
this will find two neighbouring adjectives and parcel them with tag MYTAG
Some example patterns:
'word(domu)' - form of the word domu
'lexeme(dom)' - any form of lexeme dom
'space' - space
'cat(<ADJ>)' - adjective
You can use * in patterns to make zero or more counts of word.
END
;
exit 0;
}
# A pattern is mandatory whenever the action string contains 'g'.
die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/;
# terms.m4 must exist in the current directory; when it does, $macrofile is set to it.
die("$0: macro file not found") unless -e "terms.m4" and $macrofile="terms.m4";
# 'p' in the action string prepends "fla |" to the command, 'P' appends "| unfla".
my $preproc = ($action =~ /p/) ? ' fla | ' : '';
my $postproc = ($action =~ /P/) ? ' | unfla ' : '';
# Prepare the regular expression for extended matching.
my @tags;
# Find out which tags (@NAME) the pattern contains.
# First pad the pattern with a space on both sides and adjust adjacent tags.
my $end = 0;
my $temp = " ".$pattern." ";
$temp =~ s/(\@[^ ]*) (\@[^ ]* )/\1 \2/g;
$pattern = $temp;
while ($end != 1)
{
# look for the first tag left in the working copy of the pattern
if ($temp =~ /^.*?\@(.*?) /)
{
# remember the tag name (the text between '@' and the next space)
push (@tags, $1);
# and cut everything up to and including that tag from the working copy
$temp =~ s/^.*?\@(.*?) / /;
#print $temp."\n";
}
else
{
# no tag left: stop
$end = 1;
}
}
# Build the pattern with the tags removed: the whole pattern is wrapped in
# parentheses and every " @TAG " is replaced by ")(" so that each section
# between tags becomes its own (...) group.
my $patternmod = "( ".$pattern." )";
$patternmod =~ s/\s@.*?\s/\)\(/g;
# every run of whitespace is replaced by the two characters `' (with a backslash in front)
$patternmod =~ s/\s+/\\`'/g; #`
# quoting escaped commas
$patternmod =~ s/\\,/\\`\\`\\,''/g;
# quoting commas in {m,n} r.e. operator
$patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g;
#print "After m4:".$re."\n";
# Expand the macros: the pattern is piped through m4 together with $macrofile.
# NOTE(review): $patternmod is interpolated into a shell command line unescaped.
my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' $macrofile - 2>/dev/null`;
die("Incorrect pattern (m4).") if $? >> 8;
chomp $re;
# <> expansion: each <...> is replaced by the output of ./terms.tag2re for its content
$re =~ s/<([^>]+)>/`echo $1 | .\/terms\.tag2re`/ge;
# Perl-like special sequences, rewritten as bracket expressions
$re =~ s/\./[^ [:cntrl:]]/g;
$re =~ s/\\s/[ ]/g;
$re =~ s/\\S/[^ [:cntrl:]]/g;
$re =~ s/\\d/[0-9]/g;
$re =~ s/\\D/[^0-9 [:cntrl:]]/g;
$re =~ s/\\w/[a-z±æê³ñ󶼿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_]/g;
$re =~ s/\\W/[^a-z±æê³ñ󶼿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_ [:cntrl:]]/g;
# extensions
$re =~ s/\\l/[a-z±æê³ñ󶼿]/g; #lowercase letter
$re =~ s/\\L/[A-Z¡ÆÊ£ÑÓ¦¬¯]/g; #uppercase letter
my $sedcommand;
my $grepcommand;
# Now build a sed script from the regular expression.
# With custom tags, the expression is cut tag by tag until all tags are used;
# if the user gave no tags, the default BOM/EOM markers are inserted instead.
my $defBOM = "BOM";
my $defEOM = "EOM";
my $defTempTagBeg = "####TempTAGBEG####";
my $defTempTagEnd = "####TempTAGEND####";
if (@tags == 0)
{
# default: a BOM segment before each match and an EOM segment after it
$sedcommand = "sed -r 's/($re)/\\500 $defBOM *\\f\\1###EOM###/g; s/###EOM###([0-9]+)/\\1 00 $defEOM *\\f\\1/g'";
}
else # custom tags were given
{
# first step: surround every match with temporary begin/end tags
my $sedscript="sed -r 's/($re)/\\600 $defTempTagBeg *\\f\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $defTempTagEnd *\\f\\1/g;";
# after the first step the parts of the input matching re carry temporary tags;
# the custom tags are now inserted by locating the temporary begin tag
my $i = 0;
# working copies of re: $rec is the group just cut off, $restre what remains
my $rec = $re;
my $restre = $re;
for ($i = 0 ; $i < @tags ; $i++)
{
# cut the next root-level group off the remaining expression
$rec = cutRe($restre);
$restre = restRe($restre);
if ($rec =~ / *\( *\) */)
{
# empty group: put the tag directly in front of the temporary begin tag
$sedscript = $sedscript."s/([0-9]+) 00 $defTempTagBeg \\*\\f([0-9]+)/\\2 00 $tags[$i] *\\f\\2 00 $defTempTagBeg *\\f\\2/g;";
}
else
{
# non-empty group: move the temporary begin tag past the text matching the group and emit the tag there
$sedscript = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f($rec)/\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $tags[$i] *\\f\\1 00 $defTempTagBeg *\\f\\1/g;";
}
}
# finally remove the temporary begin/end tags
$sedcommand = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f//g;s/[0-9]+ 00 $defTempTagEnd \\*\\f//g'";
}
# --command: only show the generated sed command
if($command)
{
print $sedcommand."\n";
exit 0;
}
# Replace this process with the pipeline: [fla |] sed ... [| unfla]
exec $preproc.$sedcommand.$postproc;

6
app/src/rm12/Makefile Normal file
View File

@ -0,0 +1,6 @@
# rm12 is a script, so there is nothing to build; `copy` installs it into the
# distribution tree when UTT_BIN_DIR is set.
# Both targets are commands, not files: declare them phony.
.PHONY: main copy

main:

copy:
ifdef UTT_BIN_DIR
	cp rm12 ${UTT_BIN_DIR}
endif

3
app/src/rm12/rm12 Executable file
View File

@ -0,0 +1,3 @@
#!/bin/bash
# Filter stdin -> stdout. On every line that does NOT contain a
# "<digits> <digits> BOS" marker, delete the first occurrence of
# "<digits><blanks><digits><one blank>" (the two leading numeric fields).
# Lines containing such a BOS marker pass through unchanged.
sed -r '/[0-9]+[ \t]+[0-9]+[ \t]+BOS/! s/[0-9]+[ \t]+[0-9]+[ \t]//'

12
app/src/rs12/Makefile Normal file
View File

@ -0,0 +1,12 @@
# main/clean/copy are commands, not files: declare them phony so that a stray
# file with one of these names cannot disable the target.
.PHONY: main clean copy

main: rs12

rs12: rs12.c
	gcc -static -o $@ $<

# -f: do not fail when rs12 has not been built yet
clean:
	rm -f rs12

copy:
ifdef UTT_BIN_DIR
	cp rs12 ${UTT_BIN_DIR}
endif

48
app/src/rs12/rs12.c Normal file
View File

@ -0,0 +1,48 @@
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#define MAXLINE 1000
/*
 * Filter stdin -> stdout that restores the two leading numeric fields
 * (start position and length) of segment lines.
 *
 *  - A line that starts with two integers is copied unchanged; the sum of the
 *    two integers becomes the expected start of the next segment.
 *  - Otherwise the start is the single leading integer if there is one, or the
 *    expected start carried over from the previous line. The length is the
 *    number of characters of the form field, not counting backslashes, or 0
 *    when the form starts with '*'. The line is re-emitted as
 *    "%04i %02i " followed by the original text from its first alphabetic
 *    character on.
 *  - A line with no alphabetic character at all is copied unchanged.
 */
int main(void)
{
  char buf[MAXLINE+1];
  char form[MAXLINE+1];
  int len;
  int curpos,nextpos=0;
  int a,b;

  while(fgets(buf,MAXLINE,stdin))
  {
    int n=sscanf(buf,"%d %d",&a,&b);
    if(n==2)
    {
      nextpos=a+b;
      fputs(buf,stdout);
    }
    else
    {
      const char *rest;
      const char *p;

      /* start from an empty form so a failed sscanf cannot leave stale data */
      form[0]='\0';
      if(n==1)
      {
        curpos=a;
        sscanf(buf,"%*d %*s %s",form);
      }
      else
      {
        curpos=nextpos;
        sscanf(buf,"%*s %s",form);
      }

      if(*form == '*')
        len=0;
      else
        for(len=0, p=form; *p; ++p) if(*p != '\\') ++len;

      /* skip to the first alphabetic character, but never past the terminator */
      rest=buf;
      while(*rest && !isalpha((unsigned char)*rest)) ++rest;
      if(*rest=='\0')
      {
        fputs(buf,stdout);
        continue;
      }

      /* print directly: no intermediate buffer that the prefix could overflow */
      printf("%04i %02i %s", curpos, len, rest);
      nextpos = curpos+len;
    }
  }
  return 0;
}

15
app/src/sen-l/Makefile Normal file
View File

@ -0,0 +1,15 @@
# copy/clean/uninstall are commands, not files: declare them phony.
.PHONY: copy clean uninstall

sen: sen.l
	flex -osen.c sen.l
	cc -O3 -o sen sen.c -lfl

copy:
ifdef UTT_BIN_DIR
	cp sen ${UTT_BIN_DIR}
endif

# -f: do not fail when the generated files are already gone
clean:
	rm -f sen.c sen

uninstall:

80
app/src/sen-l/sen.l Normal file
View File

@ -0,0 +1,80 @@
%{
/*
 * Sentence splitter. Input lines are segments of the form
 *   <start> <length> <type> <form> ...
 * The scanner echoes its input and inserts "EOS"/"BOS" marker lines at the
 * detected sentence boundaries.
 *
 * Name definitions must be referenced as {name}: the former "abrv (ab1|ab2)"
 * matched the literal strings "ab1"/"ab2", so the abbreviation rule never
 * fired; "l ul|ll" had the same problem.
 */
int pos=0,len=0;
void set_position();
int print_EOS();   /* called from the rules below, so it must be declared first */
%}
ul [A-Z¡ÆÊ£ÑÓ¦¯¬]
ll [a-z±æê³ñ󶿼]
l {ul}|{ll}
n [0-9]+
s [ \t]+
ab1 (mgr|in¿|prof|hab|doc|dyr|kier|zast)
ab2 (ul|pl|al)
abrv ({ab1}|{ab2})
SEG .*\n
N {n}{s}{n}{s}N{s}.*\n
S {n}{s}{n}{s}S{s}.*\n
P {n}{s}{n}{s}P{s}.*\n
W {n}{s}{n}{s}W{s}.*\n
UL {n}{s}{n}{s}W{s}{ul}.*\n
Cap {n}{s}{n}{s}W{s}{ul}{ll}*.*\n
POINT {n}{s}{n}{s}P{s}\.({s}.*)?\n
QMARK {n}{s}{n}{s}P{s}\?({s}.*)?\n
EXCL {n}{s}{n}{s}P{s}\!({s}.*)?\n
DASH {n}{s}{n}{s}P{s}\-({s}.*)?\n
POINTS {POINT}+
ABRV {n}{s}{n}{s}W{s}{abrv}({s}.*)?\n
EOS {POINT}|{POINTS}|{QMARK}|{EXCL}
%%
{N}({POINT}{N})+ ECHO; set_position();
({UL}{POINT}{S}?)+{Cap} ECHO; set_position();
{ABRV}{POINT} ECHO; set_position();
{P}/{S}{DASH} ECHO; set_position(); print_EOS();
{EOS}/{S}({Cap}|{P}|{N}) ECHO; set_position(); print_EOS();
.* ECHO; set_position();
<<EOF>> printf("%04d 00 EOS *\n",pos+len); exit(0);
%%
/* Entry point: print the initial BOS marker, then run the scanner over stdin. */
int main()
{
  printf("0000 00 BOS *\n");
  yylex();
  return 0;
}
/* Called by the scanner at end of input; returning 1 reports that there is no further input. */
int yywrap()
{
return 1;
}
/*
 * Update pos/len from the last segment line of the text just matched.
 * The last character of yytext is temporarily replaced by a terminator so
 * that strrchr() finds the newline in front of the last line, and is then
 * restored to its original value. (It used to be overwritten with '\n'
 * unconditionally, which corrupted matches that do not end in a newline.)
 */
void set_position()
{
  char *lastseg, *tmp;
  char saved;

  if(yyleng<1)
    return;
  saved=yytext[yyleng-1];
  yytext[yyleng-1]='\0';
  tmp=strrchr(yytext,'\n');
  lastseg = tmp ? tmp+1 : yytext;
  sscanf(lastseg,"%d %d", &pos, &len);
  yytext[yyleng-1]=saved;
}
/*
 * Emit an end-of-sentence marker followed by a begin-of-sentence marker, both
 * positioned right after the last segment seen (pos+len).
 * Always returns 0 (the function is declared int and used to return nothing).
 */
int print_EOS()
{
  printf("%04d 00 EOS *\n%04d 00 BOS *\n",pos+len,pos+len);
  return 0;
}

9
app/src/sen-nl/Makefile Normal file
View File

@ -0,0 +1,9 @@
# sen-nl is a script, so there is nothing to build.
# copy/clean are commands, not files: declare them phony.
.PHONY: copy clean

sen-nl:

copy:
ifdef UTT_BIN_DIR
	cp sen-nl ${UTT_BIN_DIR}
endif

clean:

3
app/src/sen-nl/sen-nl Executable file
View File

@ -0,0 +1,3 @@
#!/bin/bash
# Filter stdin -> stdout that inserts BOS/EOS marker lines, in three steps:
#  1. first line: if it starts with "<start> <len> ", a "<start> 00 BOS *" line is
#     put in front of it; `t` then skips the remaining commands for that line;
#  2. every line except the last: if it has the shape "<start> <len> <letters> ..."
#     and contains a literal backslash-n, "<start> 00 EOS *" and "<start> 00 BOS *"
#     lines are inserted before it;
#  3. last line: if it starts with "<start> ", a "<start> 00 EOS *" line is appended.
sed -r '1 s/^(([0-9]+)[ \t][0-9]+[ \t].*)$/\2 00 BOS \*\n\1/;t;$! s/(([0-9]+)[ \t][0-9]+[ \t][[:alpha:]]+[ \t]+[[:print:]]*\\n.*)$/\2 00 EOS *\n\2 00 BOS *\n\1/; $ s/^(([0-9]+) .*)$/\1\n\2 00 EOS */'

11
app/src/ser/Makefile Normal file
View File

@ -0,0 +1,11 @@
# ser is a script, so there is nothing to build.
# copy/clean/uninstall are commands, not files: declare them phony.
.PHONY: copy clean uninstall

ser:

copy:
ifdef UTT_BIN_DIR
	cp ser ${UTT_BIN_DIR}
endif

clean:

uninstall:

168
app/src/ser/ser Executable file
View File

@ -0,0 +1,168 @@
#!/usr/bin/perl
#package: UAM Text Tools
#component: ser (pattern search tool)
#author: Tomasz Obrêbski
use strict;
use Getopt::Long;
# Directories searched for ser.l.template and terms.m4; $USER_DIR also receives
# the generated scanner files.
my $SHARE_DIR="/usr/share/utt";
my $USER_DIR="$ENV{HOME}/.utt/share";
#use lib "$ENV{HOME}/.utt/lib/perl";
#use attr;
# Option names are case-sensitive (e.g. -m and -M are different options).
Getopt::Long::Configure('no_ignore_case_always');
# Option defaults; they can be overridden by the configuration files and then
# by the command line.
my $help=0;
my $pattern=0;
my $only_matching=0;
my $no_markers=0;
my $macros=0;
my $flextemplate=0;
my $flex=0;
my $morfield='lem';
# Configuration files, given relative to the current directory.
my $configfile1="../../conf/ser.conf";
my $configfile2="../conf/ser.conf";
#read configuration files###########################
# Each file that can be opened is read line by line. Comments (# to end of
# line) and surrounding whitespace are stripped, blank lines are skipped, and
# the rest is split into "name = value". Flag options are switched on by their
# mere presence, whatever the value.
my $file;
foreach $file ($configfile1, $configfile2){
    open(my $config, $file) or next;
    while (defined(my $line = <$config>)) {
        chomp $line;
        $line =~ s/#.*//;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next if $line eq '';
        my ($name, $value) = split(/\s*=\s*/, $line, 2);
        if    ($name eq "pattern" or $name eq "e")        { $pattern = $value; }
        elsif ($name eq "morph")                          { $morfield = $value; }
        elsif ($name eq "only-matching" or $name eq "m")  { $only_matching = 1; }
        elsif ($name eq "no-markers" or $name eq "M")     { $no_markers = 1; }
        elsif ($name eq "define")                         { $macros = $value; }
        elsif ($name eq "flex-template")                  { $flextemplate = $value; }
        elsif ($name eq "flex")                           { $flex = 1; }
        elsif ($name eq "help" or $name eq "h")           { $help = 1; }
    }
    close $config;
}
#########################################################
# Command-line options; they override the values read from the configuration files.
GetOptions("pattern|e=s" => \$pattern,
"morph=s" => \$morfield,
"only-matching|m" => \$only_matching,
"no-markers|M" => \$no_markers,
"define=s" => \$macros,
"flex-template=s" => \$flextemplate,
"flex" => \$flex,
"help|h" => \$help);
# --help: print the usage text and exit successfully.
if($help)
{
print <<'END'
Usage: ser [OPTIONS] [file ..]
Options:
--help -h Help.
--pattern=PATTERN -e PATTERN Search pattern.
--morph=STRING Field containing morphological information (default 'lem').
--define=FILE Read macrodefinitions from FILE.
--flex-template=FILE Read flex code template from FILE.
--only-matching -m Print only fragments matching PATTERN.
--no-markers -M Do not print BOM and EOM markers [TODO].
--flex Print only the generated flex code and exit.
END
;
exit 0;
}
die("$0: no pattern given.\n") unless $pattern;
# Use the template given explicitly, else the one in $USER_DIR, else the one in
# $SHARE_DIR; die if none of them is available.
die("$0: flex template file not found") unless
$flextemplate or
-e "$USER_DIR/ser.l.template" and $flextemplate="$USER_DIR/ser.l.template" or
-e "$SHARE_DIR/ser.l.template" and $flextemplate="$SHARE_DIR/ser.l.template";
# Same lookup order for the macro definitions file.
die("$0: macro file not found") unless
$macros or
-e "$USER_DIR/terms.m4" and $macros="$USER_DIR/terms.m4" or
-e "$SHARE_DIR/terms.m4" and $macros="$SHARE_DIR/terms.m4";
#$pattern =~ s/cat\(([^)]+)\)/'cat('.pre($1).')'/ge;
# quoting escaped commas /DOES NOT WORK/
$pattern =~ s/\\,/\\`\\`\\,''/g;
# protecting backslash
$pattern =~ s/\\/\\\\\\/g;
# every run of whitespace is replaced by the two characters `' (with a backslash in front)
$pattern =~ s/\s+/\\`'/g; #`
# Expand the macros: the pattern is piped through m4 together with $macros.
# NOTE(review): $pattern and $morfield are interpolated into a shell command line unescaped.
my $flexpattern = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT=\\\\n --define=MORFIELD=$morfield $macros - 2>/dev/null`;
die("Incorrect pattern (m4).") if $? >> 8;
chomp $flexpattern;
# <> expansion: each <...> is replaced by the output of the tag2re command for its content
$flexpattern =~ s/<([^>]+)>/`echo $1 | tag2re`/ge;
# restricting the value of the . special symbol
$flexpattern =~ s/\./[^ \\t\\n\\r\\f]/g;
# perl-like shortcuts for character classes
# perl exact
$flexpattern =~ s/\\s/[ \\t]/g;
$flexpattern =~ s/\\S/[^ \\t\\n\\r\\f]/g;
$flexpattern =~ s/\\d/[0-9]/g;
$flexpattern =~ s/\\D/[^0-9 \\t\\n\\r\\f]/g;
$flexpattern =~ s/\\w/[a-z±æê³ñ󶼿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_]/g;
$flexpattern =~ s/\\W/[^a-z±æê³ñ󶼿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_ \\t\\n\\r\\f]/g;
# extensions
$flexpattern =~ s/\\l/[a-z±æê³ñ󶼿]/g; #lowercase letter
$flexpattern =~ s/\\L/[A-Z¡ÆÊ£ÑÓ¦¬¯]/g; #uppercase letter
# protecting slash
$flexpattern =~ s/\//\\\//g;
# Text outside matches is echoed unless --only-matching was requested.
my $defaultaction = ($only_matching) ? '' : 'ECHO';

# Generate, compile and run a scanner for the pattern.
# TODO: the intermediate files should eventually be created in a temporary directory.
my @generated = ("$USER_DIR/ser.l", "$USER_DIR/ser.c", "$USER_DIR/ser.executable");

system("m4 \"--define=PATTERN=$flexpattern\" \"--define=DEFAULTACTION=$defaultaction\" $flextemplate > $USER_DIR/ser.l") == 0
    or do { unlink @generated; die("$0: cannot generate the flex source (m4).\n"); };

if($flex)
{
    system "cat $USER_DIR/ser.l";
    exit 0;
}

# Stop at the first failing build step instead of running a stale or missing executable.
system("flex -o$USER_DIR/ser.c $USER_DIR/ser.l") == 0
    or do { unlink @generated; die("$0: flex failed.\n"); };
system("cc -O3 -o $USER_DIR/ser.executable $USER_DIR/ser.c -lfl") == 0
    or do { unlink @generated; die("$0: cannot compile the generated scanner.\n"); };

system "$USER_DIR/ser.executable";

# Remove the intermediate files from Perl. The former
# "rm -f $USER_DIR/ser.{l,c,executable}" relied on brace expansion, which a
# POSIX /bin/sh (used by system) does not perform, so nothing was deleted there.
unlink @generated;

8
app/src/tags/Makefile Normal file
View File

@ -0,0 +1,8 @@
# The *.tag2re files are scripts, so there is nothing to build; `copy`
# installs them into the distribution tree when UTT_TAGS_DIR is set.
# main/copy/clean are commands, not files: declare them phony.
.PHONY: main copy clean

main:

copy:
ifdef UTT_TAGS_DIR
	cp *.tag2re ${UTT_TAGS_DIR}
endif

clean:

5
app/src/tags/README Normal file
View File

@ -0,0 +1,5 @@
This directory stores files that are specific to particular tag formats.
Each TAGSET.tag2re file is a command-line program that translates a tag
constraint specification into a character-level regular expression matching
all tags in the TAGSET format that meet the specified constraint.

1
app/src/tags/ipi.tag2re Executable file
View File

@ -0,0 +1 @@
#TODO

83
app/src/tags/uam.tag2re Executable file
View File

@ -0,0 +1,83 @@
#!/usr/bin/perl
# Reads one tag-constraint specification from the input and prints the
# regular expression that pre() builds for it.
use locale;

# Only the first input line is consumed; its trailing newline is dropped.
chomp(my $input = <>);

# Building blocks of the tag-description syntax.
# Part-of-speech name: one or more uppercase letters.
our $pos_re = qr/(?:[[:upper:]]+)/;
# Attribute name: one or more uppercase letters.
our $attr_re = qr/(?:[[:upper:]]+)/;
# A single value: one lowercase letter, digit, or one of + ? ! * -,
# or a non-empty <...> group containing neither '>' nor a newline.
our $val_re = qr/(?:[[:lower:][:digit:]+?!*-]|<[^>\n]+>)/;
# An attribute name followed by one or more values.
our $av_re = qr/(?:$attr_re$val_re+)/;
# One or more attribute/value groups.
our $avlist_re = qr/(?:$av_re+)/;
# A part of speech, optionally followed by '/' and an attribute/value list.
our $cat_re = qr/(?:$pos_re(?:\/$avlist_re)?)/;

print pre($input);
# Split a tag description of the form POS[/ATTRvalues...] into its parts.
# Returns an array reference [ $cat, \%attrs ], where %attrs maps each
# attribute name to a hash reference whose keys are that attribute's values.
# An attribute that lists '*' among its values is left out altogether.
sub parse ($)
{
    my ($spec) = @_;
    my ($cat, $attrlist) = split '/', $spec;
    my %by_attr;
    while ( $attrlist =~ /($attr_re)($val_re+)/g )
    {
        my ($name, $raw) = ($1, $2);
        my @found = $raw =~ /$val_re/g;
        # A '*' anywhere among the values drops the whole attribute.
        next if grep { $_ eq '*' } @found;
        # A single lowercase letter written as <x> is stored as plain x.
        s/^<([[:lower:]])>$/$1/ for @found;
        # %seen is a fresh lexical on every iteration, so the reference
        # stored below keeps that iteration's hash alive after the loop.
        my %seen;
        @seen{@found} = (1) x @found;
        $by_attr{$name} = \%seen;
    }
    return [$cat, \%by_attr];
}
# Inverse of parse: turn ( $cat, \%attrs ) back into a tag description string.
# Attributes are emitted in sorted order, each followed by its sorted values;
# the '/' separator appears only when there is at least one attribute.
# The (\@) prototype means the caller passes an array, received by reference.
sub unparse (\@)
{
    my ($pair) = @_;
    my ($category, $attr_values) = @$pair;
    my @names = sort keys %$attr_values;
    return $category unless @names;

    my @chunks;
    push @chunks, $_ . join('', sort keys %{ $attr_values->{$_} }) for @names;
    return join '', $category, '/', @chunks;
}
# Normalise a tag description by parsing it and serialising it again, which
# yields sorted attributes and sorted values.
sub canonize ($)
{
    my ($spec) = @_;
    # unparse has a (\@) prototype, so it must be handed a real array.
    my @parsed = @{ parse($spec) };
    return unparse(@parsed);
}
# Build the regular expression (returned as a string) that matches tags
# satisfying the given constraint.
#
# The constraint is canonised first. The result has the shape
#   POS(\/ (AV)* ATTR1 (V)* (v1|v2|...) (V)* (AV)* ATTR2 ... (AV)* )?
# i.e. each constrained attribute must carry at least one of the listed
# values, while other attribute/value groups may appear around it.
sub pre
{
    # String (not qr//) fragments, because they are pasted into the output.
    my $attr_res = '[[:upper:]]+';
    my $val_res  = '[[:lower:][:digit:]+?!*-]|<[^>\n[:cntrl:]]+>';
    my $av_res   = "$attr_res($val_res)+";

    my $pat = canonize(shift);
    my ($pos, $avlist) = split /\//, $pat;

    my $ret = $pos . '(\/';
    # $val_res is a bare alternation, so it must be grouped before applying
    # '+'. Without the group the quantifier bound only to the final '>' and
    # just the first value of each attribute was captured, silently dropping
    # the remaining alternatives.
    while ($avlist =~ /($attr_res)((?:$val_res)+)/g)
    {
        my ($attr, $valstr) = ($1, $2);
        my $alternatives = join '|', ($valstr =~ /$val_res/g);
        $ret .= "($av_res)*$attr($val_res)*($alternatives)($val_res)*";
    }
    $ret .= "($av_res)*)?";
    return $ret;
}

37
app/src/tok.c/Makefile Normal file
View File

@ -0,0 +1,37 @@
# Build rules for the `tok` executable.

# Compiler flags: PAR for compile-and-link, PAR2 for compile-only steps.
PAR=-Wno-deprecated -O3
PAR2=-c -Wno-deprecated -O3
LIB_PATH=../lib
# Directory with the sources shared between components. It was referenced
# below but never defined, which turned the common.o prerequisites into
# absolute paths such as /common.cc. The default mirrors the ../common path
# already used for cmdline_common.ggo -- NOTE(review): confirm the location.
# `?=` keeps any value supplied by the environment or the command line.
COMMON_PATH ?= ../common
# Passed to common.cc as _CMDLINE_FILE; the path is written relative to the
# shared sources directory.
CMDLINE_FILE='"../tok.c/cmdline.h"'

# These targets do not name files.
.PHONY: copy clean clean.cmdline

# Link the executable. tok.o is used directly instead of compiling tok.c a
# second time. $(CXX) defaults to g++ in GNU make.
tok: tok.o cmdline.c common_tok.o common.o
	$(CXX) $(PAR) tok.o cmdline.c common.o common_tok.o -o $@

tok.o: tok.c cmdline.h
	$(CXX) $(PAR2) tok.c

common_tok.o: cmdline.h common_tok.cc common_tok.h
	$(CXX) $(PAR2) common_tok.cc

# The shared code is compiled here so that it picks up this component's
# cmdline.h through _CMDLINE_FILE; the object lands in the current directory.
common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc\
$(COMMON_PATH)/common.h
	$(CXX) $(PAR2) -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc

# The gengetopt input is the tok-specific options followed by the shared ones.
cmdline.ggo: cmdline_tok.ggo $(COMMON_PATH)/cmdline_common.ggo
	cat $^ > $@

cmdline.c cmdline.h: cmdline.ggo
	gengetopt -i cmdline.ggo --conf-parser

# Install the executable into the distribution tree; skipped when
# UTT_BIN_DIR is not set. $(strip) guards against a value that carries
# trailing whitespace (the top-level definition appears to have some before
# its inline comment), which would otherwise split "<dir> /" into two words.
copy:
ifdef UTT_BIN_DIR
	cp tok $(strip ${UTT_BIN_DIR})/
endif

# $(RM) is `rm -f`, so cleaning an already clean tree no longer fails.
clean: clean.cmdline
	$(RM) *.o tok

# Removes the generated cmdline.c, cmdline.h and cmdline.ggo.
clean.cmdline:
	$(RM) cmdline.*

Some files were not shown because too many files have changed in this diff Show More