git-svn-id: svn://atos.wmid.amu.edu.pl/utt@4 e293616e-ec6a-49c2-aa92-f4a8b91c5d16
This commit is contained in:
parent
f1563c0f02
commit
25ae32e4c2
152
app/Makefile
Normal file
152
app/Makefile
Normal file
@ -0,0 +1,152 @@
|
||||
# main makefile
|
||||
|
||||
BIN=bin
|
||||
SRC=src
|
||||
DIR=$(shell pwd)
|
||||
|
||||
##############################
|
||||
|
||||
UTT_DIST_NAME=utt-0.9
|
||||
|
||||
# Layout of the staged distribution tree. All of these are exported so that
# the sub-makes run from this file can copy their products into place.
#
# Comments are deliberately kept on their own lines: make keeps the
# whitespace between a value and a trailing '#' comment as part of the value,
# so "X=${UTT_DIR}/bin # text" exports a path that ends in a space and breaks
# any "${X}/file" reference in a sub-make.

# root of the staged distribution
export UTT_DIR=${DIR}/${UTT_DIST_NAME}

# executables
export UTT_BIN_DIR=${UTT_DIR}/bin
# configuration files
export UTT_CONF_DIR=${UTT_DIR}/conf
# stuff
export UTT_SHARE_DIR=${UTT_DIR}/share
# language/encoding specific stuff
export UTT_LANG_DIR=${UTT_DIR}/lang
# tag format specific stuff
export UTT_TAGS_DIR=${UTT_DIR}/tags
# nothing (currently unused)
#export UTT_LIB_DIR=${UTT_DIR}/lib
# documentation
export UTT_DOC_DIR=${UTT_DIR}/doc
|
||||
|
||||
UTT_DIST_FILE=utt
|
||||
|
||||
# list of components to be included in the distribution
|
||||
|
||||
COMPONENTS = lib gue tok.l cor lem kot sen-l sen-nl ser grp con fla unfla mar compiledic
|
||||
|
||||
##############################
|
||||
|
||||
all: dirs components conf doc lang tags files share
|
||||
@echo "Make completed successfully!"
|
||||
|
||||
.PHONY: dirs
|
||||
dirs:
|
||||
if [ -d ${UTT_DIR} ]; then rm -r ${UTT_DIR}; fi
|
||||
mkdir -p ${UTT_DIR}
|
||||
mkdir -p ${UTT_BIN_DIR}
|
||||
mkdir -p ${UTT_CONF_DIR}
|
||||
mkdir -p ${UTT_SHARE_DIR}
|
||||
mkdir -p ${UTT_LANG_DIR}
|
||||
mkdir -p ${UTT_TAGS_DIR}
|
||||
mkdir -p ${UTT_DOC_DIR}
|
||||
|
||||
# Build every component listed in COMPONENTS, one after another and in the
# listed order, stopping at the first component that fails.
.PHONY: components
components:
	@for cmp in $(COMPONENTS); do \
		$(MAKE) $$cmp || exit 1; \
	done

# Component names are commands, not files. They must be phony, otherwise a
# file or directory with the same name in this directory (e.g. ./lib) makes
# the target look up to date and the component is silently skipped.
.PHONY: ${COMPONENTS}

# Build one component in $(SRC)/<name> and copy its products into the staged
# tree. Each recipe line runs in its own shell, so no "cd back" is needed;
# leaving it out lets a failing sub-make fail this target.
${COMPONENTS}:
	cd $(SRC)/$@ && $(MAKE) && $(MAKE) copy
|
||||
|
||||
.PHONY: conf
|
||||
conf:
|
||||
cd $(DIR)/conf && make && make copy ; cd $(DIR)
|
||||
|
||||
.PHONY: doc
|
||||
doc:
|
||||
cd $(DIR)/doc && make && make copy ; cd $(DIR)
|
||||
|
||||
.PHONY: lang
|
||||
lang:
|
||||
cd $(DIR)/lang && make && make copy ; cd $(DIR)
|
||||
|
||||
.PHONY: tags
|
||||
tags:
|
||||
cd $(DIR)/tags && make && make copy ; cd $(DIR)
|
||||
|
||||
.PHONY: share
|
||||
share:
|
||||
cd $(DIR)/share && make && make copy ; cd $(DIR)
|
||||
|
||||
.PHONY: files
|
||||
files:
|
||||
cd ${DIR}/files && make && make copy ; cd ${DIR}
|
||||
|
||||
|
||||
clean: clean_components clean_doc clean_dist
|
||||
@echo "All files cleaned successfully!"
|
||||
|
||||
clean_components:
|
||||
@for cmp in $(COMPONENTS); do \
|
||||
cd $(SRC)/$$cmp && make clean ; cd $(DIR); \
|
||||
done
|
||||
|
||||
clean_lib:
|
||||
cd $(SRC)/lib && make clean
|
||||
|
||||
clean_doc:
|
||||
cd $(DIR)/doc && make clean ; cd $(DIR)
|
||||
|
||||
|
||||
|
||||
install: all
|
||||
cd ${UTT_DIR} && make install; cd ${DIR}
|
||||
|
||||
uninstall:
|
||||
cd ${UTT_DIR} && make uninstall; cd ${DIR}
|
||||
|
||||
reinstall:
|
||||
cd ${UTT_DIR} && make reinstall; cd ${DIR}
|
||||
|
||||
# ifdef INSTALL_BIN_DIR
|
||||
# if [ -d ${INSTALL_BIN_DIR} ]; then true; else mkdir -p ${INSTALL_BIN_DIR}; fi
|
||||
# cp -r ${UTT_BIN_DIR}/* ${INSTALL_BIN_DIR}/
|
||||
# endif
|
||||
# ifdef INSTALL_SHARE_DIR
|
||||
# if [ -d ${INSTALL_SHARE_DIR} ]; then true; else mkdir -p ${INSTALL_SHARE_DIR}; fi
|
||||
# cp -r ${UTT_SHARE_DIR}/* ${INSTALL_SHARE_DIR}/
|
||||
# endif
|
||||
# ifdef INSTALL_DOC_DIR
|
||||
# if [ -d ${INSTALL_DOC_DIR} ]; then true; else mkdir -p ${INSTALL_DOC_DIR}; fi
|
||||
# cp -r ${UTT_DOC_DIR}/* ${INSTALL_DOC_DIR}/
|
||||
# endif
|
||||
# ifdef INSTALL_LIB_DIR
|
||||
# if [ -d ${INSTALL_LIB_DIR} ]; then true; else mkdir -p ${INSTALL_LIB_DIR}; fi
|
||||
# cp -r ${UTT_LIB_DIR}/* ${INSTALL_LIB_DIR}
|
||||
# endif
|
||||
|
||||
#
|
||||
# install: make_dirs install_components install_cnf install_dta install_doc
|
||||
# @echo "Installation completed successfully!"
|
||||
|
||||
# install_components:
|
||||
# @for cmp in $(COMPONENTS); do \
|
||||
# cd $(SRC)/$$cmp && make install ; cd $(DIR); \
|
||||
# done
|
||||
|
||||
# install_cnf:
|
||||
# cp conf/*.conf $(UTT_ETC_DIR)/
|
||||
|
||||
# install_dta:
|
||||
# cp -r data/* $(UTT_SHARE_DIR)/
|
||||
|
||||
# install_doc:
|
||||
# cp doc/utt.{ps,pdf,html,info} $(UTT_DOC_DIR)/
|
||||
|
||||
# make_dirs:
|
||||
# if [ -d $(UTT_BIN_DIR) ]; then true; else mkdir -p $(UTT_BIN_DIR); fi
|
||||
# if [ -d $(UTT_ETC_DIR) ]; then true; else mkdir -p $(UTT_ETC_DIR); fi
|
||||
# if [ -d $(UTT_SHARE_DIR) ]; then true; else mkdir -p $(UTT_SHARE_DIR); fi
|
||||
# if [ -d $(UTT_DOC_DIR) ]; then true; else mkdir -p $(UTT_DOC_DIR); fi
|
||||
# if [ -d $(UTT_LIB_DIR) ]; then true; else mkdir -p $(UTT_LIB_DIR); fi
|
||||
|
||||
dist: all
|
||||
tar -czvf $(UTT_DIST_NAME).tgz $(UTT_DIR)
|
||||
|
||||
|
||||
# Remove everything produced by 'all' and 'dist': the staged distribution
# tree and the distribution archives.
#
# UTT_DIST_DIR and UTT_DIST_PMDB_FILE are not defined in this makefile. They
# are honoured only when supplied from outside (environment / command line);
# expanding them unconditionally turns the test into "[ -d ]", which is true,
# and the following "rm -r" then fails for lack of an operand.
.PHONY: clean_dist
clean_dist:
	rm -rf $(UTT_DIR)
	rm -f $(UTT_DIST_NAME).tgz
	rm -f $(UTT_DIST_FILE).tar.bz2
	$(if $(strip $(UTT_DIST_DIR)),rm -rf $(UTT_DIST_DIR))
	$(if $(strip $(UTT_DIST_PMDB_FILE)),rm -f $(UTT_DIST_PMDB_FILE).tar.bz2)
|
||||
|
16
app/README.developers
Normal file
16
app/README.developers
Normal file
@ -0,0 +1,16 @@
|
||||
|
||||
COMMANDS TO BE RUN IN THIS DIRECTORY:
|
||||
|
||||
% make
|
||||
|
||||
compiles all the components and moves all files destined for
|
||||
distribution into the directory [the value of the UTT_DIST_NAME variable in
|
||||
Makefile] (currently utt-0.9)
|
||||
|
||||
% make install
|
||||
|
||||
installs the package on your system in the directory ~/.utt
|
||||
|
||||
% make dist
|
||||
|
||||
prepares a distribution file named ${UTT_DIST_NAME}.tgz (currently utt-0.9.tgz)
|
11
app/TODO
Normal file
11
app/TODO
Normal file
@ -0,0 +1,11 @@
|
||||
* wyprowadzic grp-pre i grp-post z grp do aux?
|
||||
* zamienic kota na lepszego (Kubis)
|
||||
*
|
||||
|
||||
1. DONE. Makefile do gph (install).
|
||||
2. (zrobione dla ser?) Nazwy pmdb2re -> pmdb.tag2re (grp, ser).
|
||||
3. DONE. Usuniecie bibliotek (aplhabet, erro).
|
||||
4. DONE (dla gue i lem) Poprawna obsluga opcji --one-line i oraz --one-field.
|
||||
---
|
||||
5. Zadania zwiazane z rozbudowa ser (src/ser/TODO).
|
||||
|
61
app/dist/Makefile
vendored
Normal file
61
app/dist/Makefile
vendored
Normal file
@ -0,0 +1,61 @@
|
||||
# compile task doesn't compile sources, but just copy some files
|
||||
# this should be changed
|
||||
#
|
||||
|
||||
# I put here some variables
|
||||
|
||||
# path, where binaries are placed
|
||||
# (they will be processed for making distribution)
|
||||
export _UTT_DIST_DIR=$(shell pwd)/bin
|
||||
# path, where distribution file should be placed
|
||||
export _UTT_DIST_OUTPUT=$(shell pwd)
|
||||
|
||||
|
||||
# -----------------------------------------------------------
|
||||
# default task should display options
|
||||
# Default goal: print the available tasks. The rule name must match the
# .PHONY declaration ("default", not "defaul").
.PHONY: default
default:
	@echo "Using: make compile|tarball|rpm|deb"
|
||||
|
||||
|
||||
# -----------------------------------------------------------
|
||||
# -----------------------------------------------------------
|
||||
# this task should compile utt application
|
||||
# Stage the files to be packaged in ${_UTT_DIST_DIR}. Nothing is compiled
# here: the tree built by the main makefile (../utt-0.9) is copied as is.
.PHONY: compile
compile:
	if test -d ${_UTT_DIST_DIR}; then rm -fr ${_UTT_DIST_DIR}; fi
	mkdir -p ${_UTT_DIST_DIR}
	@# fake compilation
	cp -r ../utt-0.9/* ${_UTT_DIST_DIR}/
	@# we add an extra file (required during installation);
	@# the config generator lives in common/ as utt_make_config.pl, while the
	@# package scripts run it under the name create_utt_config.pl
	@# NOTE(review): confirm no separate common/create_utt_config.pl exists
	cp common/utt_make_config.pl ${_UTT_DIST_DIR}/create_utt_config.pl
	chmod 700 ${_UTT_DIST_DIR}/create_utt_config.pl
|
||||
|
||||
|
||||
# -----------------------------------------------------------
|
||||
# this task should compile utt (if necessary) and create tar.gz version
|
||||
.PHONY: tarball
|
||||
tarball: compile
|
||||
cd tarball && make
|
||||
|
||||
# -----------------------------------------------------------
|
||||
# this task should compile utt (if necessary) and create rpm version
|
||||
.PHONY: rpm
|
||||
rpm: compile
|
||||
@#we build rpm (see spec/README for details)
|
||||
cd spec && make
|
||||
|
||||
# -----------------------------------------------------------
|
||||
# this task should compile utt (if necessary) and create deb version
|
||||
.PHONY: deb
|
||||
deb: compile
|
||||
@#we build deb (see deb/README for details)
|
||||
cd deb && make
|
||||
|
||||
# -----------------------------------------------------------
|
||||
# this task should remove compiled files and directories
|
||||
.PHONY: clean
|
||||
clean:
|
||||
# finally the line below should be uncomment
|
||||
rm -fr ${_UTT_DIST_DIR}
|
||||
|
1
app/dist/common/description.def
vendored
Normal file
1
app/dist/common/description.def
vendored
Normal file
@ -0,0 +1 @@
|
||||
I put here some description.
|
1
app/dist/common/description.pl.def
vendored
Normal file
1
app/dist/common/description.pl.def
vendored
Normal file
@ -0,0 +1 @@
|
||||
Tu umieszczę opis po polsku.
|
1
app/dist/common/release.def
vendored
Normal file
1
app/dist/common/release.def
vendored
Normal file
@ -0,0 +1 @@
|
||||
1
|
0
app/dist/common/requirements.def
vendored
Normal file
0
app/dist/common/requirements.def
vendored
Normal file
53
app/dist/common/utt_make_config.pl
vendored
Normal file
53
app/dist/common/utt_make_config.pl
vendored
Normal file
@ -0,0 +1,53 @@
|
||||
#!/usr/bin/perl
#
# Writes conf/utt.conf in the directory this script is installed in.
#
# The generated file records the absolute installation directory (UTT_HOME)
# and the locale UTT should use (UTT_LOCALE).

use strict;
use warnings;

use Cwd 'abs_path';
use File::Basename;
use POSIX;

# Run only when executed as a script, so that the subs below can be loaded
# (e.g. by a test) without writing anything to disk.
main() unless caller;

# Creates <script directory>/conf/utt.conf.
# Dies with the system error message if the file cannot be opened or closed.
sub main {
    my $cur_dir   = dirname(abs_path($0));
    my $conf_file = "$cur_dir/conf/utt.conf";

    # Three-argument open on a lexical handle: the path can never be
    # misread as an open mode, and a failure is reported instead of every
    # following print silently going nowhere.
    open(my $conf, '>', $conf_file)
        or die "Cannot open $conf_file for writing: $!\n";

    # we put some description into utt.conf file
    print {$conf} "# ************************************************************\n";
    print {$conf} "# * This file was created automatically during installation. *\n";
    print {$conf} "# * If you don't need do not change it. *\n";
    print {$conf} "# * *\n";
    print {$conf} "# * UAM Text Tools *\n";
    print {$conf} "# * Adam Mickiewicz University, Poland *\n";
    print {$conf} "# * http://utt.amu.edu.pl *\n";
    print {$conf} "# ************************************************************\n";
    print {$conf} "\n\n";

    # we need utt home directory
    print {$conf} "# absolute path to utt directory\n";
    print {$conf} "UTT_HOME=$cur_dir\n\n";

    # we need user default locale
    my $best_locale = findLocale();
    print {$conf} "# user locale\n";
    print {$conf} "UTT_LOCALE=$best_locale\n";
    print {$conf} "\n";

    # Buffered write errors only surface at close time.
    close($conf) or die "Cannot close $conf_file: $!\n";
    return;
}

# Returns the locale name to store as UTT_LOCALE, derived from the current
# LC_CTYPE locale of the process:
#   * a "Latin<n>" charset name is rewritten to "ISO-8859-<n>";
#   * a name of the form lang_TERRITORY.charset is returned as is;
#   * a name of the form lang_TERRITORY gets ".UTF-8" appended;
#   * a bare two- or three-letter language code such as "pl" becomes
#     "pl_PL.UTF-8" (language in lower case, territory in upper case);
#   * anything else (e.g. "C" or "POSIX") is returned unchanged, and "C" is
#     returned when the locale cannot be determined at all.
sub findLocale {
    my $cur_locale = setlocale(LC_CTYPE);
    return 'C' if !defined $cur_locale || $cur_locale eq '';

    # we replace Latinx to ISO-8859-x
    $cur_locale =~ s/(.+?)Latin(.+?)/$1ISO-8859-$2/g;

    return $cur_locale            if $cur_locale =~ /\w+_\w+\.\S+/;
    return $cur_locale . '.UTF-8' if $cur_locale =~ /\w+_\w+/;

    # lc/uc are used instead of POSIX::tolower/toupper, which were removed
    # from the POSIX module in Perl 5.26.
    if ($cur_locale =~ /\A[A-Za-z]{2,3}\z/) {
        return lc($cur_locale) . '_' . uc($cur_locale) . '.UTF-8';
    }

    return $cur_locale;
}
|
1
app/dist/common/version.def
vendored
Normal file
1
app/dist/common/version.def
vendored
Normal file
@ -0,0 +1 @@
|
||||
0.9
|
81
app/dist/deb/Makefile
vendored
Normal file
81
app/dist/deb/Makefile
vendored
Normal file
@ -0,0 +1,81 @@
|
||||
#default task
|
||||
|
||||
# here there're few properties
|
||||
# Settings for building the Debian package.
_PRODUCT_NAME=utt
# Staged tree to be packaged. The parent makefile (../Makefile) exports it as
# _UTT_DIST_DIR; nothing defines _UTT_BIN_DIR.
_BUILD_DIR=$(_UTT_DIST_DIR)
_UTT_VER=$(shell cat ../common/version.def)
_UTT_REL=$(shell cat ../common/release.def)
_DEB_ROOT=$(shell pwd)/deb_root
_INSTALL_DIR=/usr/local/$(_PRODUCT_NAME)/$(_UTT_VER)-$(_UTT_REL)

# With an empty _BUILD_DIR the recipes below would expand to "cp -rv /* ..."
# and try to package the whole file system, so refuse to run.
ifeq ($(strip $(_BUILD_DIR)),)
$(error _UTT_DIST_DIR is not set; run "make deb" from the parent directory)
endif
|
||||
|
||||
# Build the .deb package: generate the control files, lay out the package
# tree under $(_DEB_ROOT), copy the staged files in and run dpkg-deb.
#
# The generated control files are prerequisites. Written as recipe lines,
# "make_control" would be run as a shell command and fail.
.PHONY: default
default: make_control make_postinst make_prerm
# first, we prepare some directory structure
	mkdir -p $(_DEB_ROOT)/DEBIAN
	mkdir -p $(_DEB_ROOT)$(_INSTALL_DIR)
	mkdir -p $(_DEB_ROOT)/usr/share/man/man1
	mkdir -p $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)

	find $(_DEB_ROOT) -type d | xargs chmod 755 # this is necessary on Debian Woody, don't ask me why

# next, we copy necessary files
	mv ./control $(_DEB_ROOT)/DEBIAN/
	cp ./postinst $(_DEB_ROOT)/DEBIAN/
	cp ./prerm $(_DEB_ROOT)/DEBIAN/
# maintainer scripts must be executable, otherwise dpkg-deb rejects them
	chmod 755 $(_DEB_ROOT)/DEBIAN/postinst $(_DEB_ROOT)/DEBIAN/prerm
#	cp -r $(_BUILD_DIR)/man/* $(_DEB_ROOT)/usr/share/man/
	cp $(_BUILD_DIR)/COPYRIGHT $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/copyright
#	cp $(_BUILD_DIR)/changelog $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/
#	cp $(_BUILD_DIR)/changelog.Debian $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/

# next we make man/doc archives
#	gzip --best $(_DEB_ROOT)/usr/share/man/man1/$(_PRODUCT_NAME).1
#	gzip --best $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/changelog
#	gzip --best $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/changelog.Debian
#	tar -cvvf control.tar.gz ${_DEB_ROOT}/DEBIAN/
#	rm -fr ${_DEB_ROOT}/DEBIAN/

# and binaries
	cp -rv $(_BUILD_DIR)/* $(_DEB_ROOT)$(_INSTALL_DIR)/
#	tar -cvvf data.tar.gz ${_DEB_ROOT}/
#	rm -fr ${_DEB_ROOT}/

# finally, we build deb package
	fakeroot dpkg-deb --build $(_DEB_ROOT)
	mv $(_DEB_ROOT).deb $(_PRODUCT_NAME)_$(_UTT_VER)-$(_UTT_REL).all.deb
|
||||
|
||||
|
||||
.PHONY: make_control
|
||||
make_control:
|
||||
echo "Package: $(_PRODUCT_NAME)" > control
|
||||
echo "Version: $(_UTT_VER)" >> control
|
||||
echo "Section: web" >> control
|
||||
echo "Priority: optional" >> control
|
||||
echo "Architecture: all" >> control
|
||||
echo "Essential: no" >> control
|
||||
|
||||
echo "Depends: " >> control
|
||||
# here we read this information from file ../common/requirements.def
|
||||
#libwww-perl, acme-base (>= 1.2) <= wymagania pakietowe
|
||||
|
||||
echo "Pre-Depends: perl" >> control
|
||||
|
||||
echo "Maintainer: Adam Mickiewicz University" >> control
|
||||
echo "Provides: $(_PRODUCT_NAME)" >> control
|
||||
echo -n "Description: " >> control
|
||||
cat ../common/description.def >> control
|
||||
|
||||
.PHONY: make_postinst
|
||||
make_postinst:
|
||||
echo "#!/bin/sh" > postinst
|
||||
echo "$(_INSTALL_DIR)/create_utt_config.pl" >> postinst
|
||||
echo "rm -f $(_INSTALL_DIR)/create_utt_config.pl" >> postinst
|
||||
|
||||
.PHONY: make_prerm
|
||||
make_prerm:
|
||||
echo "#!/bin/sh" > prerm
|
3
app/dist/deb/README
vendored
Normal file
3
app/dist/deb/README
vendored
Normal file
@ -0,0 +1,3 @@
|
||||
This directory contains files necessary to create the deb package.
|
||||
|
||||
apt-get install dpkg-dev debhelper devscripts fakeroot linda
|
0
app/dist/files/COPYRIGHT
vendored
Normal file
0
app/dist/files/COPYRIGHT
vendored
Normal file
0
app/dist/files/LICENCE
vendored
Normal file
0
app/dist/files/LICENCE
vendored
Normal file
14
app/dist/files/README
vendored
Normal file
14
app/dist/files/README
vendored
Normal file
@ -0,0 +1,14 @@
|
||||
|
||||
Installation:
|
||||
|
||||
1) Run the command:
|
||||
|
||||
make install
|
||||
|
||||
in this directory. This will install UTT in the directory '~/.utt'.
|
||||
|
||||
2) Add the path
|
||||
|
||||
~/.utt/bin
|
||||
|
||||
to your PATH variable to make UTT programs visible to your system.
|
15
app/dist/spec/Makefile
vendored
Normal file
15
app/dist/spec/Makefile
vendored
Normal file
@ -0,0 +1,15 @@
|
||||
# this makefile will build rpm
|
||||
|
||||
DIR=$(shell pwd)
|
||||
|
||||
ifndef _UTT_DIST_DIR
|
||||
_UTT_DIST_DIR=${DIR}
|
||||
endif
|
||||
|
||||
|
||||
# default task
|
||||
.PHONY: rpm
|
||||
rpm:
|
||||
cd ${_UTT_DIST_DIR}; rpmbuild -bb ${DIR}/utt.spec
|
||||
|
||||
|
16
app/dist/spec/README
vendored
Normal file
16
app/dist/spec/README
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
This directory contains files necessary to produce rpm package.
|
||||
|
||||
First, you must have variable _UTT_DIST_DIR defined properly.
|
||||
This variable should be defined by main Makefile.
|
||||
|
||||
To create rpm file, just write:
|
||||
make
|
||||
|
||||
The created package should appear in the default RPM directory.
|
||||
(in my computer it is /usr/src/redhat/RPMS/$arch/ directory)
|
||||
|
||||
To determine the rpm output directory, execute:
|
||||
rpm --showrc | grep _rpmdir
|
||||
|
||||
You need write access to this directory to create the rpm.
|
||||
|
106
app/dist/spec/utt.spec
vendored
Normal file
106
app/dist/spec/utt.spec
vendored
Normal file
@ -0,0 +1,106 @@
|
||||
#
|
||||
# Default RPM header.
|
||||
#
|
||||
# START_RPM_STD_HEADER:
|
||||
|
||||
|
||||
#
|
||||
# RPM properties
|
||||
#
|
||||
%define _this_product UAM Text Tools
|
||||
%define _this_summary Some tools for text processing
|
||||
%define _this_name utt
|
||||
%define _this_version %(cat ../common/version.def)
|
||||
%define _this_release %(cat ../common/release.def)
|
||||
%define _this_copyright Adam Mickiewicz University, Poland
|
||||
|
||||
#
|
||||
# We need some paths
|
||||
#
|
||||
# Directory with utt binaries
|
||||
%define _UTT_DIST_DIR %(pwd)
|
||||
#Root directory in which utt will be installed
|
||||
%define _UTT_DIR /usr/local/%_this_name
|
||||
#Directory for rpm
|
||||
%define _RPM_ROOT %_UTT_DIST_DIR/../rpm_root
|
||||
|
||||
#
|
||||
# Default RPM header.
|
||||
#
|
||||
# END_RPM_STD_HEADER:
|
||||
# --------------------------------------------------------------------
|
||||
|
||||
Summary: %_this_summary
|
||||
Name: %_this_name
|
||||
Version: %_this_version
|
||||
Release: %_this_release
|
||||
#Copyright: %_this_copyright
|
||||
License: GPL
|
||||
Group: Development/Tools
|
||||
URL: http://utt.amu.edu.pl
|
||||
Vendor: Adam Mickiewicz University
|
||||
BuildRoot: %_RPM_ROOT
|
||||
#BuildArch: i586
|
||||
# requirements for utt application
|
||||
#AutoReq: no
|
||||
#AutoReqProv: no
|
||||
|
||||
#Requires: glibc >= 2.1.3
|
||||
#Requires: libgcc1 >= 3.0
|
||||
#Requires: libgcc >= 3.0
|
||||
#Requires: libstdc++6 >= 3.4.1
|
||||
#Requires: libstdc++ >= 3.4.1
|
||||
|
||||
%description
|
||||
%(cat ../common/description.def)
|
||||
|
||||
%description -l pl
|
||||
%(cat ../common/description.pl.def)
|
||||
|
||||
|
||||
# -------------------------------------------------------------
|
||||
# preparing sources for compilation
|
||||
%prep
|
||||
|
||||
# source compilation
|
||||
%build
|
||||
|
||||
# rpm building
|
||||
%install
|
||||
%__mkdir_p $RPM_BUILD_ROOT%_UTT_DIR
|
||||
cp -fr %_UTT_DIST_DIR/* $RPM_BUILD_ROOT%_UTT_DIR/
|
||||
|
||||
|
||||
# cleaning after rpm build
|
||||
%clean
|
||||
rm -rf $RPM_BUILD_ROOT
|
||||
|
||||
# -------------------------------------------------------------
|
||||
#before installation
|
||||
%pre
|
||||
|
||||
|
||||
#after installation
|
||||
%post
|
||||
# we need to create utt.conf file
|
||||
%_UTT_DIR/create_utt_config.pl
|
||||
rm -f %_UTT_DIR/create_utt_config.pl
|
||||
# we need to create links in /usr/local/bin
|
||||
find %_UTT_DIR/bin/ -type f -exec ln -f {} /usr/local/bin \;
|
||||
|
||||
|
||||
#before uninstallation
|
||||
%preun
|
||||
# we delete links from /usr/local/bin
|
||||
for fn in `find %_UTT_DIR/bin/ -type f -exec basename {} \;`; do rm -f /usr/local/bin/$fn; done
|
||||
|
||||
|
||||
#after uninstallation
|
||||
%postun
|
||||
# we remove all extra files
|
||||
rm -fr %_UTT_DIR
|
||||
|
||||
# -------------------------------------------------------------
|
||||
%files
|
||||
%defattr(-,root,root)
|
||||
/*
|
4
app/dist/tarball/INSTALL
vendored
Normal file
4
app/dist/tarball/INSTALL
vendored
Normal file
@ -0,0 +1,4 @@
|
||||
Here you can find some information about how to install utt.
|
||||
|
||||
You should just unpack archive and then
|
||||
execute create_utt_config.pl and remove it.
|
38
app/dist/tarball/Makefile
vendored
Normal file
38
app/dist/tarball/Makefile
vendored
Normal file
@ -0,0 +1,38 @@
|
||||
# This makefile allows build tarball distribution for utt.
|
||||
|
||||
#
|
||||
# Some variables
|
||||
#
|
||||
|
||||
# Directory with utt binaries
|
||||
ifndef _UTT_DIST_DIR
|
||||
_UTT_DIST_DIR=${DIR}
|
||||
endif
|
||||
|
||||
# Where put result
|
||||
ifndef _UTT_DIST_OUTPUT
|
||||
_UTT_DIST_OUTPUT=${DIR}
|
||||
endif
|
||||
|
||||
# Common info about version and release
|
||||
_UTT_VER=$(shell cat ../common/version.def)
|
||||
_UTT_REL=$(shell cat ../common/release.def)
|
||||
|
||||
# Temp vars
|
||||
DIR=$(shell pwd)
|
||||
_TARBALL_ROOT=$(DIR)/utt_$(_UTT_VER)-$(_UTT_REL)
|
||||
_TAR_FILE_NAME=utt.$(_UTT_VER)_$(_UTT_REL).tar.gz
|
||||
|
||||
# default task
|
||||
.PHONY: default
|
||||
default:
|
||||
@echo Build dir is ${_UTT_DIST_DIR}
|
||||
@echo Change output for tarball as ${_UTT_DIST_OUTPUT}
|
||||
mkdir -p ${_TARBALL_ROOT}
|
||||
cp -fr ${_UTT_DIST_DIR}/* ${_TARBALL_ROOT}
|
||||
@# we add some extra files
|
||||
cp ./INSTALL ${_TARBALL_ROOT}/
|
||||
|
||||
tar -czf ${_UTT_DIST_OUTPUT}/${_TAR_FILE_NAME} utt*
|
||||
|
||||
rm -rf ${_TARBALL_ROOT}
|
6
app/dist/tarball/README
vendored
Normal file
6
app/dist/tarball/README
vendored
Normal file
@ -0,0 +1,6 @@
|
||||
This directory contains Makefile, which allows to create tar.gz archive.
|
||||
|
||||
To create archive, just write:
|
||||
make
|
||||
|
||||
Warning: you need to define the variable _UTT_DIST_DIR.
|
27
app/doc/Makefile
Normal file
27
app/doc/Makefile
Normal file
@ -0,0 +1,27 @@
|
||||
# Builds the UTT manual from utt.texinfo in info, PDF, HTML and PostScript
# form.
#
# File lists are spelled out instead of using utt.{a,b,...}: brace expansion
# is a bash feature, and make runs recipes with /bin/sh, which on many
# systems does not support it.

# Intermediate files left behind by texi2pdf / texi2dvi.
TEX_TMP = utt.aux utt.cp utt.fn utt.ky utt.log utt.pg utt.toc utt.tp utt.vr

.PHONY: main copy clean

main: utt.info utt.pdf utt.html utt.ps

utt.info: utt.texinfo
	makeinfo utt.texinfo

utt.pdf: utt.texinfo
	texi2pdf utt.texinfo
	rm -f $(TEX_TMP)

utt.html: utt.texinfo
	makeinfo --html --no-split utt.texinfo

utt.dvi: utt.texinfo
	texi2dvi utt.texinfo

utt.ps: utt.dvi
	dvips -o utt.ps utt.dvi

# Copy the built manual into the staged distribution; does nothing unless the
# calling makefile has exported UTT_DOC_DIR.
copy:
ifdef UTT_DOC_DIR
	cp utt.info utt.ps utt.pdf utt.html ${UTT_DOC_DIR}
endif

clean:
	rm -f $(TEX_TMP) utt.dvi utt.fns utt.html utt.info utt.pdf utt.ps
	rm -f *~
|
2687
app/doc/utt.texinfo
Normal file
2687
app/doc/utt.texinfo
Normal file
File diff suppressed because it is too large
Load Diff
30
app/lib/ser.l.template
Normal file
30
app/lib/ser.l.template
Normal file
@ -0,0 +1,30 @@
|
||||
%{
|
||||
#include<string.h>
|
||||
int n=0;
|
||||
%}
|
||||
|
||||
%%
|
||||
|
||||
PATTERN {
|
||||
int start, end, len;
|
||||
char *lastseg, *tmp;
|
||||
if(yytext[yyleng-1]!='\n')
|
||||
{fprintf(stderr,"ser: pattern matches incomplete line\n"); exit(1);}
|
||||
n++;
|
||||
sscanf(yytext,"%d %d",&start,&len);
|
||||
yytext[yyleng-1]='\0';
|
||||
if(tmp=strrchr(yytext,'\n'))
|
||||
{
|
||||
lastseg=tmp+1;
|
||||
sscanf(lastseg,"%d %d", &end, &len);
|
||||
}
|
||||
else
|
||||
end=start;
|
||||
yytext[yyleng-1]='\n';
|
||||
printf("%04d 00 BOM * ser:%d\n",start,n);
|
||||
ECHO;
|
||||
printf("%04d 00 EOM * ser:%d\n",end+len,n);
|
||||
}
|
||||
|
||||
|
||||
.*\n DEFAULTACTION;
|
52
app/lib/terms.m4
Normal file
52
app/lib/terms.m4
Normal file
@ -0,0 +1,52 @@
|
||||
divert(-1)
|
||||
#--------------------------------------------------------------------------
|
||||
|
||||
# Macros defined here may be used in pattern specifications
|
||||
# You can modify this file according to your needs.
|
||||
|
||||
# ENDOFSEGMENT and MORFIELD are macros expanded to, respectively,
|
||||
# end of segment marker (depends on the format: flattened or not)
|
||||
# and the name of the annotation field containing morphological
|
||||
# information (standard value is 'lem'). These values are controlled
|
||||
# by programs using this file to expand search patterns (ser, grp, ...).
|
||||
|
||||
# seg(type,form,annotation)
|
||||
|
||||
define(`seg',`(\s*((\d+\s+)(\d+\s+)?)?dnl
|
||||
ifelse($1, `',`(\S+)', `($1)')\s+dnl
|
||||
ifelse($2, `',`(\S+)', `($2)')dnl
|
||||
ifelse($3, `',`((\s+\S+)*)', `(\s+($3))')\s*ENDOFSEGMENT)')
|
||||
|
||||
# form(f) - segment containing the form f
|
||||
|
||||
define(`form', `seg(,$1)')
|
||||
|
||||
# field(f) segment containing auxiliary field f
|
||||
|
||||
define(`field', `seg(,,`(\S+\s+)*($1)(\s+\S+)*')')
|
||||
|
||||
# word, space, punct, number segments (assuming W, S, P, N segment types)
|
||||
|
||||
define(`space', `seg(`S',`$1')')
|
||||
define(`word', `seg(`W',`$1')')
|
||||
define(`punct', `seg(`P',`$1')')
|
||||
define(`number', `seg(`N',`$1')')
|
||||
|
||||
# macros specific to PMDB format
|
||||
|
||||
define(`lexeme', `field(`MORFIELD:(\S+;)?$1,\S+')')
|
||||
define(`cat', `field(`MORFIELD:\S+,$1([,;]\S+)?')')
|
||||
|
||||
|
||||
# Place here your macro definitions.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#--------------------------------------------------------------------------
|
||||
divert(0)
|
8
app/src/common/Makefile
Normal file
8
app/src/common/Makefile
Normal file
@ -0,0 +1,8 @@
|
||||
# main: cmdline.c main_template.cc
|
||||
# g++ -o main cmdline.c common.cc main_template.cc
|
||||
|
||||
# cmdline.c cmdline.h : cmdline.ggo
|
||||
# gengetopt -i cmdline.ggo
|
||||
|
||||
# cmdline.ggo: cmdline_common.ggo cmdline_program.ggo
|
||||
# cat cmdline_common.ggo cmdline_program.ggo > cmdline.ggo
|
18
app/src/common/README
Normal file
18
app/src/common/README
Normal file
@ -0,0 +1,18 @@
|
||||
Propozycja ujednolicenia dzialania klocka na poziomie
|
||||
funkcji main. Parametry meta - zdefiniowane dla
|
||||
wszystkich, poza tok, programow, definiujace ich zachowanie
|
||||
w systemie klockow.
|
||||
|
||||
cmdline_common.ggo - deklaracje parametrow meta
|
||||
|
||||
cmdline_program.ggo - przyklad deklaracji parametrow programu
|
||||
nazwa docelowa np. cmdline_guess.ggo
|
||||
|
||||
common.cc - zmienne globalne zawierajace informacje
|
||||
przekazane przez parametry meta
|
||||
common.h
|
||||
|
||||
main_template.cc - szkielet funkcji main
|
||||
|
||||
Makefile - sposob kompilacji
|
||||
|
34
app/src/common/cmdline_common.ggo
Normal file
34
app/src/common/cmdline_common.ggo
Normal file
@ -0,0 +1,34 @@
|
||||
#section "Common UTT options"
|
||||
|
||||
|
||||
option "input" f "Input file" string no hidden
|
||||
|
||||
option "output" o "Output file" string no hidden
|
||||
|
||||
option "fail" e "Output file for unsuccesfully processed segments " string no hidden
|
||||
|
||||
option "only-fail" - "Print only segments the program failed to process" flag off hidden
|
||||
|
||||
option "no-fail" - "Print only segments the program processed" flag off hidden
|
||||
|
||||
option "copy" c "Copy succesfully processed segments to standard output" flag off hidden
|
||||
|
||||
option "process" p "Process segments with this tag" string no multiple
|
||||
|
||||
option "select" s "Select only segments with this field" string no multiple
|
||||
|
||||
option "ignore" S "Select only segments without this field" string no multiple
|
||||
|
||||
option "output-field" O "Output field name" string no
|
||||
|
||||
option "input-field" I "Input field name" string no multiple
|
||||
|
||||
option "interactive" i "Toggle interactive mode" flag off
|
||||
|
||||
option "config" - "Configuration file" string typestr="FILENAME" no
|
||||
|
||||
option "one-field" 1 "Print all results in one segments (creates ambiguous annotation)" flag off
|
||||
|
||||
option "one-line" - "Print annotation alternatives as additional fields" flag off
|
||||
|
||||
option "language" - "Language." string no
|
5
app/src/common/cmdline_program.ggo
Normal file
5
app/src/common/cmdline_program.ggo
Normal file
@ -0,0 +1,5 @@
|
||||
package "guess"
|
||||
version "0.1"
|
||||
|
||||
option "color" l "Show guessed descriptions in colour." flag off
|
||||
|
264
app/src/common/common.cc
Normal file
264
app/src/common/common.cc
Normal file
@ -0,0 +1,264 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include "common.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <locale.h>
|
||||
|
||||
FILE* inputf=stdin;
|
||||
FILE* outputf=stdout;
|
||||
FILE* failedf=stdout;
|
||||
bool copy_processed=0;
|
||||
bool one_field=false;
|
||||
bool one_line=false;
|
||||
char output_field_prefix[32];
|
||||
char input_field_prefix[32];
|
||||
|
||||
extern int argc;
|
||||
extern char **argv;
|
||||
|
||||
|
||||
// Tilde (home directory) expansion in a path.
//
// If inpath starts with '~', that character is replaced by the value of the
// HOME environment variable and the result is written to outpath; otherwise
// inpath is copied to outpath unchanged.
//
// outpath must be large enough for the result (strlen(HOME) + strlen(inpath)
// bytes); the size is not passed in, so it cannot be checked here.
// NOTE(review): "~user/..." is expanded like "~/user/..."; confirm whether
// other users' home directories ever need to be supported.
//
// Returns 0 on success, 1 if the path starts with '~' but HOME is not set
// (outpath then receives inpath unchanged).
int expand_path(char* inpath, char* outpath)
{
  if(inpath[0]!='~')
  {
    strcpy(outpath,inpath);
    return 0; // no problem
  }

  // getenv() returns NULL for an unset variable; passing NULL to "%s" is
  // undefined behaviour.
  const char* home=getenv("HOME");
  if(home==NULL)
  {
    strcpy(outpath,inpath);
    return 1;
  }

  sprintf(outpath,"%s%s",home,inpath+1);
  return 0; // no problem
}
|
||||
|
||||
|
||||
|
||||
|
||||
/*
  Converts a field name into the prefix that marks the field.

  parameters:
    name   - (in)  field name, long or short
    prefix - (out) a short name (a single punctuation character) is copied
                   as is; a long name (one or more alphanumeric characters)
                   is copied with ':' appended. Not written when the name is
                   incorrect. Must have room for strlen(name)+2 bytes.
  return value:
    1 if correct field name, 0 otherwise
  examples:
    name   prefix      r.v.
    lem    lem:        1
    @      @           1
    ::     (unchanged) 0
    a,b    (unchanged) 0
*/
int fieldprefix(char *name, char *prefix)
{
  // The <ctype.h> classifiers are only defined for values representable as
  // unsigned char (and EOF), so every char is converted before the call.
  if (ispunct((unsigned char)name[0]) && name[1]=='\0') // correct short name
  {
    strcpy(prefix, name);
    return 1;
  }

  int i=0;
  while(name[i]!='\0' && isalnum((unsigned char)name[i])) ++i;

  if(name[i]=='\0' && i>0) // correct long name
  {
    sprintf(prefix,"%s:",name);
    return 1;
  }

  // incorrect
  return 0;
}
|
||||
|
||||
|
||||
|
||||
// Copies the last path component of argv0 (the text after the final '/', or
// all of argv0 when it contains no '/') into program_name.
void set_program_name(char program_name[], char* argv0)
{
  char* last_slash = strrchr(argv0, '/');
  const char* base_name = last_slash ? last_slash+1 : argv0;
  strcpy(program_name, base_name);
}
|
||||
|
||||
|
||||
|
||||
extern void process_config_files(gengetopt_args_info* args, char* argv0)
|
||||
{
|
||||
|
||||
char program_name[256];
|
||||
char config_file[256];
|
||||
char config_file_tmp[256];
|
||||
|
||||
set_program_name(program_name,argv0);
|
||||
|
||||
// obsługa pliku konfiguracyjnego podanego w linii komend
|
||||
if (args->config_given) {
|
||||
if (file_accessible(args->config_arg) == 0) {
|
||||
if (cmdline_parser_configfile(args->config_arg,
|
||||
args,
|
||||
0, // 0 - nie nadpisuj wartości parametrów
|
||||
0, // 0 - nie inicjuj
|
||||
0) != 0) {
|
||||
fprintf(stderr, "Error in config file (%s)\n", args->config_arg);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(args->one_line_given && !one_line) one_line=true, one_field=false;
|
||||
if(args->one_field_given && !one_field) one_line=false, one_field=true;
|
||||
|
||||
// obsluga pliku konfiguracyjnego uzytkownika dla programu
|
||||
sprintf(config_file_tmp, "%s/%s.conf", USER_CONFIG_DIR, program_name);
|
||||
expand_path(config_file_tmp, config_file);
|
||||
if (file_accessible(config_file) == 0) {
|
||||
if (cmdline_parser_configfile(config_file,
|
||||
args,
|
||||
0, // 0 - nie nadpisuj danych
|
||||
0, // 0 - nie inicjuj struktury
|
||||
0) != 0) {
|
||||
fprintf(stderr, "Error in config file (%s)\n", config_file);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if(args->one_line_given && !one_line) one_line=true, one_field=false;
|
||||
if(args->one_field_given && !one_field) one_line=false, one_field=true;
|
||||
|
||||
|
||||
// obsluga pliku konfiguracyjnego uzytkownika globalnego
|
||||
sprintf(config_file_tmp, "%s/utt.conf", USER_CONFIG_DIR);
|
||||
expand_path(config_file_tmp, config_file);
|
||||
if (file_accessible(config_file) == 0) {
|
||||
if (cmdline_parser_configfile(config_file,
|
||||
args,
|
||||
0, // 0 - nie nadpisuj danych
|
||||
0, // 0 - nie inicjuj struktury
|
||||
0) != 0) {
|
||||
fprintf(stderr, "Error in config file (%s)\n", config_file);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if(args->one_line_given && !one_line) one_line=true, one_field=false;
|
||||
if(args->one_field_given && !one_field) one_line=false, one_field=true;
|
||||
|
||||
|
||||
|
||||
// obsluga systemowego pliku konfiguracyjnego dla programu
|
||||
sprintf(config_file, "%s/%s.conf", SYSTEM_CONFIG_DIR, program_name);
|
||||
if (file_accessible(config_file) == 0) {
|
||||
if (cmdline_parser_configfile(config_file,
|
||||
args,
|
||||
0, // 0 - nie zmieniaj danych wczesniejszych
|
||||
0, // 0 - nie inicjuj struktury
|
||||
0 // 0 - nie sprawdzaj wymaganych parametrow
|
||||
) != 0) {
|
||||
fprintf(stderr, "Error in config file (%s)\n", config_file);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if(args->one_line_given && !one_line) one_line=true, one_field=false;
|
||||
if(args->one_field_given && !one_field) one_line=false, one_field=true;
|
||||
|
||||
|
||||
// obsluga systemowego pliku konfiguracyjnego globalnego
|
||||
sprintf(config_file, "%s/utt.conf", SYSTEM_CONFIG_DIR);
|
||||
if (file_accessible(config_file) == 0) {
|
||||
if (cmdline_parser_configfile(config_file,
|
||||
args,
|
||||
0, // 0 - nie zmieniaj danych wczesniejszych
|
||||
0, // 0 - nie inicjuj struktury
|
||||
0 // 0 - nie sprawdzaj wymaganych parametrow
|
||||
) != 0) {
|
||||
fprintf(stderr, "Error in config file (%s)\n", config_file);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if(args->one_line_given && !one_line) one_line=true, one_field=false;
|
||||
if(args->one_field_given && !one_field) one_line=false, one_field=true;
|
||||
|
||||
}
|
||||
|
||||
|
||||
// Applies the command-line options shared by all tools: derives the program
// name from argv0, selects the locale, opens the input/output/failure streams
// that were requested and fills in the input and output field prefixes.
// Terminates the process with status 1 when a requested file cannot be opened.
void process_common_options(gengetopt_args_info* args, char* argv0)
{
    char program_name[256];
    set_program_name(program_name, argv0);

    setlocale(LC_CTYPE, "");
    setlocale(LC_COLLATE, "");

    if (args->help_given)
        cmdline_parser_print_help ();

    // A stream is (re)opened only when its option was given; otherwise the
    // corresponding global keeps its current value.
    if (args->input_given && !(inputf = fopen(args->input_arg, "r")))
    {
        fprintf(stderr,"No such file: %s.\n", args->input_arg);
        exit(1);
    }

    if (args->output_given && !(outputf = fopen(args->output_arg, "w")))
    {
        fprintf(stderr,"Cannot open output file: %s.\n", args->output_arg);
        exit(1);
    }

    if (args->fail_given && !(failedf = fopen(args->fail_arg, "w")))
    {
        fprintf(stderr,"Cannot open output file: %s.\n", args->fail_arg);
        exit(1);
    }

    // Input prefix: the first --input-field value, or field "4" by default.
    if (!args->input_field_given)
        strcpy(input_field_prefix, "4");
    else
        fieldprefix(args->input_field_arg[0], input_field_prefix);

    // Output prefix: the --output-field value, or "<program_name>:" by default.
    if (!args->output_field_given)
        sprintf(output_field_prefix, "%s%c", program_name, INFIELD_SEP);
    else
        fieldprefix(args->output_field_arg, output_field_prefix);

    if (args->copy_given)
        copy_processed = true;
}
|
||||
|
||||
// sprawdza istnienie pliku
|
||||
// Returns the result of access(path, R_OK): 0 when the calling process may
// read the file at `path`, -1 otherwise.
int file_accessible(const char* path) {
    const int status = access(path, R_OK);
    return status;
}
|
||||
|
||||
// sprawdza istnienie pliku konfiguracyjnego
|
||||
// Checks that `dir` is a searchable directory containing the readable regular
// file `filename`.
// Returns 0 when all checks pass and -1 otherwise (including when the
// temporary path buffer cannot be allocated).
int config_file_exists(const char* dir, const char* filename) {
    struct stat dir_stat;
    struct stat file_stat;
    int result = -1;

    // room for dir + '/' + filename + '\0'
    char* path = (char*)malloc(strlen(dir) + strlen(filename) + 2);
    if (path == NULL)
        return -1;

    sprintf(path, "%s/%s", dir, filename);

    // All checks are chained so that the single free() below runs on every
    // path; the previous early returns leaked `path`.
    if (stat(dir, &dir_stat) == 0
        && stat(path, &file_stat) == 0
        && S_ISDIR(dir_stat.st_mode)      // the directory really is a directory
        && S_ISREG(file_stat.st_mode)     // the config file is a regular file
        && access(dir, X_OK) == 0         // we may enter the directory
        && access(path, R_OK) == 0)       // we may read the file
        result = 0;

    free(path);

    return result;
}
|
416
app/src/common/common.h
Normal file
416
app/src/common/common.h
Normal file
@ -0,0 +1,416 @@
|
||||
#ifndef __COMMON_H
|
||||
#define __COMMON_H
|
||||
|
||||
#include <stdio.h>
|
||||
#include <ctype.h>
|
||||
|
||||
#include "../lib/const.h"
|
||||
|
||||
#include _CMDLINE_FILE
|
||||
|
||||
|
||||
/**************************************************
|
||||
* Stale dotyczace wejscia/wyjscia
|
||||
*/
|
||||
|
||||
#define EMPTYFORM '*'
|
||||
#define INFIELD_SEP ':'
|
||||
#define MAXAUX 16
|
||||
#define FIELD_SEP " \t\n"
|
||||
|
||||
|
||||
// katalogi z plikami konfiguracyjnymi
|
||||
// nowe
|
||||
// stare - do wyrzucenia
|
||||
// #define CONFIG_DIR ".utt/conf"
|
||||
|
||||
// nazwa zmiennej okreslajaca sciezke do danych
|
||||
|
||||
// #define UTT_DIR_VAR "UTT_DIR"
|
||||
|
||||
// sciezka do plikow z danymi (np UTT_DIR/pliki) wzgledem $HOME!
|
||||
|
||||
// #define UTT_DIR_DEFAULT ".utt/pl/"
|
||||
|
||||
/**************************************************/
|
||||
|
||||
|
||||
extern FILE* inputf;
|
||||
extern FILE* outputf;
|
||||
extern FILE* failedf;
|
||||
|
||||
extern char* input_filename;
|
||||
extern char* output_filename;
|
||||
extern char* failed_filename;
|
||||
extern bool one_line;
|
||||
extern bool one_field;
|
||||
|
||||
extern char input_field_prefix[];
|
||||
extern char output_field_prefix[];
|
||||
|
||||
extern bool copy_processed;
|
||||
extern bool append_output;
|
||||
extern bool append_failed;
|
||||
|
||||
//sciezka do katalogu z danymi
|
||||
extern char utt_dir[];
|
||||
|
||||
extern void process_common_options(gengetopt_args_info* args, char* argv0);
|
||||
extern void process_config_files(gengetopt_args_info* args, char* argv0);
|
||||
|
||||
extern int expand_path(char* inpath, char* outpath);
|
||||
|
||||
extern int fieldprefix(char *name, char *prefix);
|
||||
|
||||
|
||||
/**************************************************
|
||||
* problems with casing */
|
||||
// Classifies the letter casing of the NUL-terminated string s.
// Return value:
//   0 - no uppercase character at all (also returned for the empty string)
//   1 - first character uppercase, no uppercase character after it
//   2 - every character uppercase (two or more characters)
//   3 - anything else (mixed casing)
inline int casing(char* s)
{
    // Empty string: nothing to inspect. The previous loop stepped past the
    // terminator in this case.
    if (*s == '\0') return 0;

    // <ctype.h> functions require an unsigned char value; plain char may be
    // negative for non-ASCII letters.
    const bool first_upper = isupper((unsigned char)*s) != 0;
    bool rest_has_upper = false;
    bool rest_has_other = false;

    for (const char* p = s + 1; *p != '\0'; ++p)
    {
        if (isupper((unsigned char)*p))
            rest_has_upper = true;
        else
            rest_has_other = true;
    }

    if (!first_upper) return rest_has_upper ? 3 : 0;
    if (!rest_has_upper) return 1;
    // Upper first + upper later: all-caps only if nothing else intervened
    // ("AbC" used to be reported as 2).
    return rest_has_other ? 3 : 2;
}
|
||||
|
||||
//
|
||||
// Copies the NUL-terminated string s into d, converting every character to
// lower case. d must have room for strlen(s)+1 bytes; the terminator is
// copied too.
inline void tolowers(char* s, char* d)
{
    // Cast to unsigned char: tolower() is undefined for negative char values,
    // which plain char can hold for non-ASCII letters.
    do
    {
        *d++ = (char)tolower((unsigned char)*s);
    }
    while (*s++ != '\0');
}
|
||||
|
||||
|
||||
// Copies s into d, applying the letter casing described by `casing`
// (a value returned by casing()):
//   0, 3 - copy unchanged (3 carries too little information to restore)
//   1    - first character upper-cased, the rest copied unchanged
//   2    - every character upper-cased
// Any other value leaves d untouched. d must hold strlen(s)+1 bytes.
inline void restorecasing(char *s, char *d, int casing)
{
    if (casing < 0 || casing > 3) return;

    const bool upper_first = (casing == 1 || casing == 2);
    const bool upper_rest = (casing == 2);
    bool at_first = true;

    do
    {
        // toupper() needs an unsigned char value; plain char may be negative.
        const unsigned char c = (unsigned char)*s;
        const bool make_upper = at_first ? upper_first : upper_rest;
        *d++ = (char)(make_upper ? toupper(c) : c);
        at_first = false;
    }
    while (*s++ != '\0');   // the terminator itself is copied as well
}
|
||||
|
||||
/**************************************************/
|
||||
|
||||
/*
|
||||
parameters:
|
||||
-seg - segment
|
||||
-pref - field name or "1", "2", "3", "4" for the first four fields
|
||||
+val - field contents
|
||||
return value:
|
||||
1 if specified field exists, 0 otherwise
|
||||
*/
|
||||
|
||||
inline int getfield(char* seg, const char* pref, char* val)
|
||||
{
|
||||
|
||||
char* p=seg;
|
||||
char* p0;
|
||||
|
||||
while(isspace(*p)) ++p;
|
||||
|
||||
// field "1"
|
||||
p0=p; while(isdigit(*p)) ++p;
|
||||
if(*pref=='1') if(p!=p0) { strncpy(val,p0,p-p0); val[p-p0]='\0'; return 1; } else return 0;
|
||||
|
||||
while(isspace(*p)) ++p;
|
||||
|
||||
// field "2"
|
||||
p0=p; while(isdigit(*p)) ++p;
|
||||
if(*pref=='2') if(p!=p0) { strncpy(val,p0,p-p0); val[p-p0]='\0'; return 1; } else return 0;
|
||||
|
||||
while(isspace(*p)) ++p;
|
||||
|
||||
// field "3"
|
||||
p0=p; while(isgraph(*p)) ++p;
|
||||
if(*pref=='3') if(p!=p0) { strncpy(val,p0,p-p0); val[p-p0]='\0'; return 1; } else return 0;
|
||||
|
||||
while(isspace(*p)) ++p;
|
||||
|
||||
// field "4"
|
||||
p0=p; while(isgraph(*p)) ++p;
|
||||
if(*pref=='4') if(p!=p0) { strncpy(val,p0,p-p0); val[p-p0]='\0'; return 1; } else return 0;
|
||||
|
||||
while(isspace(*p)) ++p;
|
||||
|
||||
// annotation fields
|
||||
do p=strstr(p,pref); while(p!=NULL && *(p-1)!=' ' && *(p-1)!='\t');
|
||||
|
||||
if(p==NULL) return 0;
|
||||
else
|
||||
{
|
||||
p+=strlen(pref);
|
||||
int len=strcspn(p,FIELD_SEP "\n\r\f\0");
|
||||
strncpy(val,p,len);
|
||||
val[len]='\0';
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline
|
||||
bool process_seg(char* seg, gengetopt_args_info& args)
|
||||
{
|
||||
char buf[256];
|
||||
bool ret = !args.process_given;
|
||||
if(args.process_given)
|
||||
{
|
||||
getfield(seg,"3",buf);
|
||||
for(int i=0; i<args.process_given; ++i)
|
||||
if(strcmp(args.process_arg[i],buf)==0)
|
||||
{
|
||||
ret=true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for(int i=0; i<args.select_given; ++i)
|
||||
if(! getfield(seg,args.select_arg[i],buf))
|
||||
ret=false;
|
||||
for(int i=0; i<args.ignore_given; ++i)
|
||||
if(getfield(seg,args.ignore_arg[i],buf))
|
||||
ret=false;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
parameters:
|
||||
-+seg - segment
|
||||
-pref - prefix of the new field
|
||||
-val - contents of the new field
|
||||
return value:
|
||||
1 - success, 0 - fail (limit on segment length exceeded)
|
||||
*/
|
||||
inline
|
||||
int addfield(char *seg, const char *pref, const char *val)
|
||||
// zalozenie, ze seg konczy sie znakiem \n
|
||||
{
|
||||
if(strlen(seg)+strlen(pref)+strlen(val) >= MAX_LINE) return 0; // bezpieczniej, ale wolniej
|
||||
|
||||
int seglen=strlen(seg);
|
||||
sprintf(seg+(seglen-1)," %s%s\n",pref,val);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**************************************************/
|
||||
|
||||
struct Seg
|
||||
{
|
||||
int filepos, len;
|
||||
char* tag;
|
||||
char* form;
|
||||
char* aux[MAXAUX];
|
||||
int auxn;
|
||||
|
||||
bool parse(char* line);
|
||||
char* getfield(char* fieldname);
|
||||
void print(char* line);
|
||||
bool addfield(char* s);
|
||||
bool clearfields();
|
||||
};
|
||||
|
||||
/**************************************************/
|
||||
|
||||
/* definicja struktury wejscia/wyjscia
|
||||
*/
|
||||
struct Segment
|
||||
{
|
||||
int filepos, len;
|
||||
char* tag;
|
||||
char* form;
|
||||
char* aux[MAXAUX];
|
||||
int auxn;
|
||||
|
||||
bool parse(char* line);
|
||||
char* getfield(char* fieldname);
|
||||
void print(char* line);
|
||||
bool addfield(char* s);
|
||||
bool clearfields();
|
||||
};
|
||||
|
||||
/*
|
||||
* Sprawdza czy nalezy przetwarzac dany segment.
|
||||
*/
|
||||
|
||||
inline
|
||||
bool process_seg(Segment& s, gengetopt_args_info& args)
|
||||
{
|
||||
bool ret = !args.process_given;
|
||||
|
||||
for(int i=0; i<args.process_given; ++i)
|
||||
if(strcmp(args.process_arg[i],s.tag)==0)
|
||||
{
|
||||
ret=true;
|
||||
break;
|
||||
}
|
||||
|
||||
for(int i=0; i<args.select_given; ++i)
|
||||
if(! s.getfield(args.select_arg[i]))
|
||||
ret=false;
|
||||
|
||||
for(int i=0; i<args.ignore_given; ++i)
|
||||
if(s.getfield(args.ignore_arg[i]))
|
||||
ret=false;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* FUNKCJE OBSLUGUJACE WEJSCIE/WYJSCIE
|
||||
*/
|
||||
// The strings stay in place (inside line); only the pointers are set
// and terminating zeros are written in.
|
||||
|
||||
inline
|
||||
bool Segment::parse(char* line)
|
||||
{
|
||||
auxn=0;
|
||||
char* field;
|
||||
if((field=strtok(line,FIELD_SEP))!=NULL)
|
||||
filepos=atoi(field); // nie sprawdzana poprawnosc
|
||||
else
|
||||
return false;
|
||||
if((field=strtok(NULL,FIELD_SEP))!=NULL)
|
||||
len=atoi(field); // nie sprawdzana poprawnosc
|
||||
else return false;
|
||||
if((tag=strtok(NULL,FIELD_SEP))==NULL) return false;
|
||||
if((form=strtok(NULL,FIELD_SEP))==NULL)
|
||||
return true;
|
||||
else
|
||||
if(form[0] == EMPTYFORM && form[1] =='\0')
|
||||
form=NULL;
|
||||
|
||||
while((aux[auxn]=strtok(NULL,FIELD_SEP))!=NULL) ++auxn;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
inline char* Segment::getfield(char* f)
|
||||
{
|
||||
int flen=strlen(f);
|
||||
if(isalnum(*f))
|
||||
{
|
||||
for(int i=0; i<auxn; ++i)
|
||||
if(strncmp(aux[i],f,flen)==0 && aux[i][flen]==INFIELD_SEP)
|
||||
return aux[i]+flen+1;
|
||||
} else
|
||||
{
|
||||
for(int i=0; i<auxn; ++i)
|
||||
{
|
||||
if(*f==*(aux[i]))
|
||||
return aux[i]+1;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
inline bool Segment::clearfields() {
|
||||
for (int i=0; i<auxn; ++i) {
|
||||
// free(aux[i]);
|
||||
aux[i] = NULL;
|
||||
}
|
||||
auxn=0;
|
||||
return true;
|
||||
}
|
||||
|
||||
inline // NIEEFEKTYWNE
|
||||
void Segment::print(char* line)
|
||||
{
|
||||
sprintf(line,"%04d %02d %s", filepos, len, tag);
|
||||
if(form)
|
||||
{
|
||||
strcat(line," ");
|
||||
strcat(line,form);
|
||||
}
|
||||
else
|
||||
if(auxn)
|
||||
strcat(line," *");
|
||||
|
||||
for(int i=0; i<auxn; ++i)
|
||||
{
|
||||
strcat(line," ");
|
||||
strcat(line,aux[i]);
|
||||
}
|
||||
|
||||
strcat(line,"\n");
|
||||
}
|
||||
|
||||
|
||||
inline
|
||||
bool Segment::addfield(char* s)
|
||||
{
|
||||
if(auxn<MAXAUX)
|
||||
{
|
||||
aux[auxn++]=s;
|
||||
return true;
|
||||
}
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
/**************************************************
|
||||
* funkcje pomocne w operacjach na plikach *
|
||||
* konfiguracyjnych *
|
||||
**************************************************/
|
||||
|
||||
// sprawdza istnienie pliku
|
||||
int file_accessible(const char* path);
|
||||
|
||||
// sprawdza istnienie pliku konfiguracyjnego
|
||||
int config_file(const char* dir, const char* filename);
|
||||
|
||||
/**************************************************/
|
||||
|
||||
/* Pobiera wejscie
|
||||
* parametry:
|
||||
* - args - tablica stringow okresnajacych pola wejsciowe
|
||||
* - args_len - rozmiar args
|
||||
* - seg - segment
|
||||
* wartosc - wskaznik do wejscia
|
||||
*/
|
||||
inline char* getInput(char** args, int args_len, Segment seg) {
|
||||
char* formp = NULL;
|
||||
for (int i=0; i<args_len; ++i) {
|
||||
if ('4' == args[i][0])
|
||||
return seg.form;
|
||||
if ((formp = seg.getfield(args[i])) != NULL) {
|
||||
return formp;
|
||||
}
|
||||
}
|
||||
return formp;
|
||||
}
|
||||
|
||||
#endif
|
20
app/src/common/main_template.cc
Normal file
20
app/src/common/main_template.cc
Normal file
@ -0,0 +1,20 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
main(int argc, char* argv[])
|
||||
{
|
||||
gengetopt_args_info args;
|
||||
|
||||
if(cmdline_parser(argc,argv,&args) != 0)
|
||||
exit(1);
|
||||
|
||||
process_common_options(args);
|
||||
|
||||
//
|
||||
// TU KOD
|
||||
//
|
||||
|
||||
cmdline_parser_free(&args);
|
||||
|
||||
}
|
12
app/src/compiledic/Makefile
Normal file
12
app/src/compiledic/Makefile
Normal file
@ -0,0 +1,12 @@
|
||||
all: compiledic aut2fsa
|
||||
|
||||
compiledic:
|
||||
|
||||
aut2fsa: aut2fsa.cc
|
||||
g++ -Wno-deprecated -O3 -fpermissive -static -o aut2fsa aut2fsa.cc
|
||||
|
||||
|
||||
copy:
|
||||
ifdef UTT_BIN_DIR
|
||||
cp compiledic fsm2aut aut2fsa ${UTT_BIN_DIR}
|
||||
endif
|
5
app/src/compiledic/TODO
Normal file
5
app/src/compiledic/TODO
Normal file
@ -0,0 +1,5 @@
|
||||
* pliki tymczasowe:
|
||||
- pliki symboli lab i scl
|
||||
- pliki powstajace podczas kompilacji slownika
|
||||
|
||||
gdzie maja byc tworzone? tak jak teraz nie moze byc!
|
BIN
app/src/compiledic/aut2fsa
Executable file
BIN
app/src/compiledic/aut2fsa
Executable file
Binary file not shown.
16
app/src/compiledic/aut2fsa.cc
Normal file
16
app/src/compiledic/aut2fsa.cc
Normal file
@ -0,0 +1,16 @@
|
||||
|
||||
#include <iostream.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "../lib/tfti.h"
|
||||
|
||||
#include <fstream.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
TFTiv<char,char> a;
|
||||
a.read();
|
||||
a.save();
|
||||
|
||||
return 0;
|
||||
}
|
190
app/src/compiledic/compiledic
Executable file
190
app/src/compiledic/compiledic
Executable file
@ -0,0 +1,190 @@
|
||||
#! /usr/bin/env perl
|
||||
|
||||
$symfile='~/.utt/pl/pl_PL.iso-8859-2.sym';
|
||||
$symfilenoext = $symfile;
|
||||
$symfilenoext =~ s/\.sym$//;
|
||||
$labfile = $symfilenoext . '.lab';
|
||||
$sclfile = $symfilenoext . '.scl';
|
||||
|
||||
use locale;
|
||||
#use strict;
|
||||
|
||||
##################################################
|
||||
$linesPerFile = 20000;
|
||||
|
||||
# Exactly one argument is expected: the dictionary file to compile.
if (@ARGV < 1) {
    print "usage: compiledic dictionary_file\n";
    exit;
}

my $file = shift; # @ARGV;
my $filenameprefix;

# The name must end in ".dic"; the part before the extension names the
# resulting .bin file. (The pattern used to be unanchored, so a name such as
# "x.dic.bak" was accepted.)
if ($file =~ /\A(.*)\.dic\z/s)
{
    $filenameprefix = $1;
}
else
{
    print STDERR "The input file must have .dic extension.\n";
    exit(1);
}
|
||||
|
||||
# Przygotowanie etykiet
|
||||
|
||||
#`makeLabels.pl > labels.sym`;
|
||||
|
||||
`lexmakelab $symfilenoext`;
|
||||
|
||||
# Analiza pliku s³ownika
|
||||
|
||||
print "preparing file...........................................";
|
||||
|
||||
`sed -r "s/([[:punct:]])/\[\\1\]/g" < $file > temp1`;
|
||||
|
||||
`cp temp1 temp2`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
#dzielimy plik na wiele czê¶ci, uruchamiamy lexcomplex dla ka¿dej
|
||||
#czê¶ci osobno, nastêpnie ³±czymy to za pomoc± programu fsmunion
|
||||
|
||||
#print "Dzielê s³ownik na mniejsze czê¶ci...";
|
||||
|
||||
open(IN, "./temp2");
|
||||
|
||||
$lineCount = 0;
|
||||
$fileCount = 0;
|
||||
|
||||
`mkdir LemTEMP`;
|
||||
|
||||
open(FILE, ">LemTEMP/slo_0");
|
||||
|
||||
while (<IN>) {
|
||||
|
||||
if (++$lineCount >= $linesPerFile) {
|
||||
$fileCount++;
|
||||
$lineCount = 0;
|
||||
|
||||
close(FILE);
|
||||
# print "Tworzê nowy plik tymczasowy: slo_".$fileCount."\n";
|
||||
open(FILE, ">LemTEMP/slo_".$fileCount);
|
||||
}
|
||||
|
||||
print(FILE $_);
|
||||
}
|
||||
|
||||
#print "OK\n";
|
||||
|
||||
print "building partial automata";
|
||||
|
||||
#32 kropki, fileCount plikow
|
||||
$filesPerDot = $fileCount/32;
|
||||
$files=$filesPerDot;
|
||||
$dots=0;
|
||||
|
||||
for ($i=0; $i<=$fileCount; $i++) {
|
||||
|
||||
if ($files >= $filesPerDot) {
|
||||
$files = 0;
|
||||
print ".";
|
||||
$dots++;
|
||||
}
|
||||
$files++;
|
||||
|
||||
$command = "lexcomplex -l $labfile -S $sclfile < LemTEMP/slo_".$i." > LemTEMP/slownik_".$i.".fsm";
|
||||
|
||||
`$command`;
|
||||
|
||||
}
|
||||
if ($dots < 32) {
|
||||
for ($i=0; $i<32 - $dots; $i++) {
|
||||
print ".";
|
||||
}
|
||||
}
|
||||
|
||||
print "OK\n";
|
||||
|
||||
`rm LemTEMP/slo_*`;
|
||||
|
||||
print "building final automaton";

# Number of progress dots printed for this step.
$ndots=33;

# NOTE(review): this step used to contain a loop that was meant to union the
# partial automata one by one into slownik1.fsm. Its condition tested
# $filecount (a typo for $fileCount, always undefined), so it never ran, and
# it named the inputs without their ".fsm" suffix. The result has always come
# from the single fsmunion call below, which overwrites slownik1.fsm; the dead
# loop and the preparatory copy of slownik_0.fsm are therefore dropped.
# Confirm that incremental union is not needed for very large dictionaries.
print "." x $ndots;

`fsmunion LemTEMP/* > slownik1.fsm`;

print "OK\n";
|
||||
|
||||
print "removing epsilon-transitions.............................";
|
||||
|
||||
`fsmrmepsilon slownik1.fsm > slownik2.fsm`;
|
||||
|
||||
`rm slownik1.fsm`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "determinizing automaton..................................";
|
||||
|
||||
`fsmdeterminize slownik2.fsm > slownik1.fsm`;
|
||||
|
||||
`rm slownik2.fsm`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "minimizing automaton.....................................";
|
||||
|
||||
`fsmminimize slownik1.fsm > slownik.fsm`;
|
||||
|
||||
#`rm slownik1.fsm`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "converting fsm format to bin.............................";
|
||||
|
||||
`fsmprint -i $labfile slownik.fsm > slownik.txt`;
|
||||
|
||||
`fsm2aut slownik.txt > slownik.aut`;
|
||||
|
||||
`aut2fsa < slownik.aut > $filenameprefix.bin`;
|
||||
|
||||
print "OK\n";
|
||||
|
||||
print "removing temporary files.................................";
|
||||
|
||||
`rm LemTEMP/*`;
|
||||
`rmdir LemTEMP`;
|
||||
`rm temp2`;
|
||||
`rm slownik.fsm`;
|
||||
`rm slownik.txt`;
|
||||
`rm slownik.aut`;
|
||||
`rm labels.*`;
|
||||
|
||||
print "OK\n";
|
44
app/src/compiledic/fsm2aut
Executable file
44
app/src/compiledic/fsm2aut
Executable file
@ -0,0 +1,44 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
# Converts a textual automaton read from the input into the "aut" format.
# Input lines are either transitions "<from> <to> <char>" or final-state
# lines "<state>"; anything else aborts the run.
my $currstate=-1;
my @states;     # $states[$n]: flat list (char, target, char, target, ...)
my @final;      # $final[$n] is true for final states
my $tn=0;       # number of transitions read

while (my $row = <>)
{
    if ($row =~ /^\s*([0-9]+)\s+([0-9]+)\s+(.)(\s*)?$/)
    {
        my ($from, $to, $label) = ($1, $2, $3);
        push @{$states[$from]}, ($label, $to);
        # make sure the target state exists in the table as well
        $#states = $to if $#states < $to;
        $tn++;
    }
    elsif ($row =~ /^\s*([0-9]+)\s*$/)
    {
        my $state = $1;
        $final[$state] = 1;
        $#states = $state if $#states < $state;
    }
    else
    {
        die("Input error.");
    }
}

# header: number of states, number of transitions, label and value types
print scalar(@states)," ",$tn," char void\n";

my $width=int(log(@states+1)/log(10));

# one line per state: number, final marker (+/-), then its transitions
for my $number (0 .. $#states)
{
    my $marker = $final[$number] ? "+" : "-";
    printf "%${width}d %s",$number,$marker;

    my $transitions = $states[$number];
    my @pairs = $transitions ? @{$transitions} : ();
    while (@pairs)
    {
        my ($c, $s) = splice(@pairs, 0, 2);
        print " $c $s";
    }
    print "\n";
}
|
||||
|
||||
|
7
app/src/con/Makefile
Normal file
7
app/src/con/Makefile
Normal file
@ -0,0 +1,7 @@
|
||||
|
||||
con:
|
||||
|
||||
copy:
|
||||
ifdef UTT_BIN_DIR
|
||||
cp con ${UTT_BIN_DIR}
|
||||
endif
|
549
app/src/con/con
Executable file
549
app/src/con/con
Executable file
@ -0,0 +1,549 @@
|
||||
#!/usr/bin/perl -w
|
||||
use strict;
|
||||
use Getopt::Long;
|
||||
use locale;
|
||||
|
||||
Getopt::Long::Configure('no_ignore_case_always');
|
||||
|
||||
my $l='30c';
|
||||
my $r='30c';
|
||||
my $trim=0;
|
||||
my $white=0;
|
||||
my $bon='[0-9]+ [0-9]+ BOM .*';
|
||||
my $eon='[0-9]+ [0-9]+ EOM .*';
|
||||
my $bod='[';
|
||||
my $eod=']';
|
||||
my $column=0;
|
||||
my $ignore=0;
|
||||
my $help=0;
|
||||
|
||||
my $configfile1="../../conf/con.conf";
|
||||
my $configfile2="../conf/con.conf";
|
||||
|
||||
#read configuration files###########################
|
||||
# Read the configuration files (in order; later files override earlier ones,
# and command-line options processed afterwards override both).
# Each line has the form "name = value"; text after '#' is a comment.
my $file;
{
    # options that take the configured value
    my %value_option = (
        left   => \$l,      l => \$l,
        right  => \$r,      r => \$r,
        bom    => \$bon,
        eom    => \$eon,
        bod    => \$bod,
        eod    => \$eod,
        column => \$column, c => \$column,
    );
    # switches: their mere presence turns them on, the value is ignored
    my %flag_option = (
        trim   => \$trim,   t => \$trim,
        white  => \$white,  w => \$white,
        ignore => \$ignore, i => \$ignore,
        help   => \$help,   h => \$help,
    );

    foreach $file ($configfile1, $configfile2) {
        # a missing or unreadable file is silently skipped
        open(my $config, '<', $file) or next;
        while (my $entry = <$config>) {
            chomp $entry;
            $entry =~ s/#.*//;
            $entry =~ s/^\s+//;
            $entry =~ s/\s+$//;
            next unless length $entry;
            my ($name, $value) = split(/\s*=\s*/, $entry, 2);
            if (exists $value_option{$name}) {
                ${ $value_option{$name} } = $value;
            }
            elsif (exists $flag_option{$name}) {
                ${ $flag_option{$name} } = 1;
            }
        }
        close $config;
    }
}
|
||||
#########################################################
|
||||
|
||||
GetOptions("left|l=s" => \$l,
|
||||
"right|r=s" => \$r,
|
||||
"trim|t" => \$trim,
|
||||
"white|w" => \$white,
|
||||
"bom=s" => \$bon,
|
||||
"eom=s" => \$eon,
|
||||
"bod=s" => \$bod,
|
||||
"eod=s" => \$eod,
|
||||
"column|c=s" => \$column,
|
||||
"ignore|i" => \$ignore,
|
||||
"help|h" => \$help);
|
||||
|
||||
if(!($column=~/^[0-9]+$/)){$column=0;}
|
||||
|
||||
# Print the option summary and stop. The segment-boundary options are
# registered with GetOptions as "bom" and "eom"; the help text used to
# advertise them as --bon / --eon, which are not accepted.
if($help)
{
    print <<'END'
Options:
   --help -h    Help.
   --left -l    Left context info (default='30c')
                Examples:
                -l=5c: left context is 5 characters
                -l=5w: left context is 5 words
                -l=5s: left context is 5 non-empty input lines
                -l='\s*\S+\sr\S+BOS': left context starts with the given regex
   --right -r   Right context info (default='30c')
   --trim -t    Clear incomplete words from output
   --white -w   DO NOT change all white characters into spaces
   --column -c  Left column minimal width in characters (default = 0)
   --ignore -i  Ignore input inconsistency
   --bom        Beginning of selected segment
                (regex, default='[0-9]+ [0-9]+ BOM .*')
   --eom        End of selected segment
                (regex, default='[0-9]+ [0-9]+ EOM .*')
   --bod        Selected segment beginning display (default='[')
   --eod        Selected segment end display (default=']')

END
;
    exit 0;
}
|
||||
|
||||
|
||||
my $seg_no=0;
|
||||
my $seg_size=0;
|
||||
|
||||
my $left_type;
|
||||
my $left_size;
|
||||
my $right_type;
|
||||
my $right_size;
|
||||
|
||||
set_lr_types($l, $r, \$left_type,\$left_size,\$right_type,\$right_size, $trim);
|
||||
|
||||
|
||||
my $inn=0;
|
||||
my $after_bos=0;
|
||||
my $before_eos=0;
|
||||
|
||||
my @LEFT; #tablica skalarów
|
||||
my @CENTER; #tablica skalarów
|
||||
my @RIGHT;
|
||||
|
||||
my @current_center;
|
||||
my @current_left; #skalar dla c, w pp. tablica
|
||||
my @current_left_words;
|
||||
my @current_right_words_number;
|
||||
|
||||
|
||||
while(<>){
|
||||
my $line = $_;
|
||||
chomp $line;
|
||||
my @line = split / /, $line;
|
||||
my $line_s=@line;
|
||||
|
||||
if(!line_format_ok(@line)){next;}
|
||||
|
||||
if(!$white){white_into_spaces(\@line);}
|
||||
else{if($line[2] eq "S"){symbols_into_white(\$line[3]);}}
|
||||
|
||||
if(!input_consistent(\$seg_no,\$seg_size,$line[0],$line[1],$ignore)){
|
||||
eof_or_inconsistency(\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,$left_type,$right_type);
|
||||
@current_center=();
|
||||
@current_left=();
|
||||
@current_left_words=();
|
||||
@current_right_words_number=();
|
||||
$after_bos=0;
|
||||
$before_eos=0;
|
||||
}
|
||||
|
||||
remember_current_left($left_type,$left_size,\@current_left,\@line, \@current_left_words, $line, \$after_bos, \$before_eos);
|
||||
remember_center($line,\@line,\$inn,\@current_center,$white,\@CENTER,\@current_left,\@LEFT, \$after_bos, \$before_eos, \@RIGHT, \@current_right_words_number);
|
||||
remember_right($right_type,$left_type,$right_size,\@line,\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,\@current_right_words_number, $line, \$before_eos);
|
||||
}
|
||||
|
||||
eof_or_inconsistency(\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,$left_type,$right_type);
|
||||
exit(0);
|
||||
|
||||
#################procedury###############################
|
||||
|
||||
# Checks the shape of one input line that has been split on spaces.
# A usable line has at least four fields and its first two fields (segment
# position and segment length) consist of digits only.
# Returns 1 for a usable line, 0 otherwise.
sub line_format_ok{
    my @line = @_;
    return 0 if @line < 4;
    # Anchored on purpose: the fields are used in arithmetic later, so a value
    # that merely contains a digit (e.g. "x1") must not pass.
    return 0 unless $line[0] =~ /\A[0-9]+\z/;
    return 0 unless $line[1] =~ /\A[0-9]+\z/;
    return 1;
}
|
||||
|
||||
# Replaces the form (4th field) of a white-space segment - one whose tag
# (3rd field) is "S" - with a single space. Other lines are left alone.
#   $line_ref - reference to the array of fields of the line (modified in place)
sub white_into_spaces{
    my ($line_ref) = @_;
    # element access instead of the one-element array slices used before
    if ($line_ref->[2] eq "S") {
        $line_ref->[3] = " ";
    }
}
|
||||
|
||||
# Decodes the symbolic white-space notation in place: the two-character
# sequences \n and \t become a real newline and tab, and every underscore
# becomes a space.
#   $string_ref - reference to the string to decode
sub symbols_into_white{
    my ($string_ref) = @_;
    for my $text (${$string_ref}) {     # $text aliases the referenced string
        $text =~ s/\\n/\n/g;
        $text =~ s/\\t/\t/g;
        $text =~ s/_/ /g;
    }
}
|
||||
|
||||
# Encodes white space in place using the symbolic notation: a newline becomes
# the two characters \n, a tab becomes \t and a space becomes an underscore.
#   $string_ref - reference to the string to encode
sub white_into_symbols{
    my ($string_ref) = @_;
    for my $text (${$string_ref}) {     # $text aliases the referenced string
        $text =~ s/\n/\\n/g;
        $text =~ s/\t/\\t/g;
        $text =~ s/ /_/g;
    }
}
|
||||
|
||||
# Checks that the current segment starts where the previous one ended and
# remembers the current segment for the next call.
#   $seg_no_ref   - ref to the position of the previous segment (updated)
#   $seg_size_ref - ref to the length of the previous segment (updated)
#   $line0        - position of the current segment
#   $line1        - length of the current segment
#   $ig           - true: skip the check
# Returns 1 when the input is consistent (or unchecked), 0 otherwise.
# No check is made while the remembered position is 0.
sub input_consistent{
    my ($seg_no_ref, $seg_size_ref, $line0, $line1, $ig) = @_;

    my $previous_no   = ${$seg_no_ref};
    my $previous_size = ${$seg_size_ref};

    ${$seg_no_ref}   = $line0;
    ${$seg_size_ref} = $line1;

    return 1 if $previous_no == 0 || $ig;
    return ($line0 - $previous_size == $previous_no) ? 1 : 0;
}
|
||||
|
||||
# Interprets the left and right context specifications.
#   $left, $right                   - specifications as given by the user
#   $left_type_ref, $left_size_ref  - receive type and size of the left context
#   $right_type_ref, $right_size_ref - the same for the right context
#   $do_trim                        - true when --trim is in effect
# A specification containing "<digits>c", "<digits>w" or "<digits>s" (tested
# in that order) yields type 'c', 'w' or 's' and the size from get_number();
# for 'c' the size is enlarged by one when trimming. Any other specification
# is stored unchanged as the type (it is used as a regex elsewhere) and the
# size is left untouched.
sub set_lr_types{
    my ($left, $right, $left_type_ref, $left_size_ref,
        $right_type_ref, $right_size_ref, $do_trim) = @_;

    my @sides = (
        [ $left,  $left_type_ref,  $left_size_ref  ],
        [ $right, $right_type_ref, $right_size_ref ],
    );

    for my $side (@sides) {
        my ($spec, $type_ref, $size_ref) = @{$side};

        my $unit;
        if    ($spec =~ /[0-9]+c/) { $unit = 'c'; }
        elsif ($spec =~ /[0-9]+w/) { $unit = 'w'; }
        elsif ($spec =~ /[0-9]+s/) { $unit = 's'; }

        if (!defined $unit) {
            ${$type_ref} = $spec;
            next;
        }

        ${$type_ref} = $unit;
        ${$size_ref} = get_number($spec);
        ${$size_ref}++ if $unit eq 'c' && $do_trim;
    }
}
|
||||
|
||||
# Returns the number formed by the leading decimal digits of $string
# ("30c" -> 30), or 0 when the string does not start with a digit.
sub get_number{
    my ($string) = @_;
    # A regex capture replaces the manual digit walk, which indexed past the
    # end of the string (undef warning under -w) when it was all digits.
    return 0 unless defined $string && $string =~ /\A([0-9]+)/;
    return $1 + 0;
}
|
||||
|
||||
# Tracks the selected (center) segment while the input is read.
#   $lin                 - the current input line
#   $lin_ref             - ref to the fields of the current line
#   $inn_ref             - ref to the "inside a selected segment" flag
#   $current_center_ref  - ref to the forms collected for the open segment
#   $white_info          - value of --white (not needed here; kept for callers)
#   $CENTER_REF          - ref to the list of finished center strings
#   $current_left_ref    - ref to the current left context
#   $LEFT_REF            - ref to the list of left contexts, one per segment
#   $after_bos_ref, $before_eos_ref - refs to the context-boundary flags
#   $RIGHT_REF           - ref to the list of right contexts (array refs)
#   $current_words_right_number_ref - ref to per-segment right word counters
# Uses the file-level patterns $bon and $eon for the segment boundaries.
sub remember_center{
    my ($lin, $lin_ref, $inn_ref, $current_center_ref, $white_info,
        $CENTER_REF, $current_left_ref, $LEFT_REF, $after_bos_ref,
        $before_eos_ref, $RIGHT_REF, $current_words_right_number_ref) = @_;

    # Segment opens: start a fresh center and freeze the left context.
    if (!${$inn_ref} && $lin =~ /$bon/) {
        ${$inn_ref} = 1;
        @{$current_center_ref} = ();
        ${$after_bos_ref} = 0;
        push(@{$LEFT_REF}, join('', @{$current_left_ref}));
    }

    # Segment closes: store the center and open an empty right context.
    if (${$inn_ref} && $lin =~ /$eon/) {
        ${$inn_ref} = 0;
        push(@{$CENTER_REF}, join('', @{$current_center_ref}));
        ${$before_eos_ref} = 1;
        push(@{$RIGHT_REF}, []);
        push(@{$current_words_right_number_ref}, 0);
    }

    # Inside a segment: collect the form of every line that contains no '*'.
    # The flag is read through $inn_ref (it used to be read from the
    # file-level $inn directly); both former --white branches did the same.
    if (${$inn_ref} && index($lin, '*') == -1) {
        white_into_symbols(\${$lin_ref}[3]);
        push(@{$current_center_ref}, ${$lin_ref}[3]);
    }
}
|
||||
|
||||
# Maintains the rolling left context.
#   $type            - 'c' (characters), 'w' (words), 's' (input lines) or a
#                      regex marking the line at which the context starts
#   $size            - context size for the types 'c', 'w' and 's'
#   $ref             - ref to the context being collected
#   $line_ref        - ref to the fields of the current line
#   $words_ref       - ref to the list of words in the context ('w' only)
#   $line            - the current input line (regex type only)
#   $after_bos_ref   - ref to the "context start seen" flag (regex type only)
#   $before_eos_ref  - accepted for symmetry, not used here
# Lines whose form (4th field) is '*' contribute nothing.
sub remember_current_left{
    my ($type, $size, $ref, $line_ref,
        $words_ref, $line, $after_bos_ref, $before_eos_ref) = @_;

    my $form = ${$line_ref}[3];

    if ($type eq 'c') {
        # keep the last $size characters
        return if $form eq '*';
        push(@{$ref}, split('', $form));
        my $have = @{$ref};
        splice(@{$ref}, 0, $have - $size) if $have > $size;
    }
    elsif ($type eq 'w') {
        # keep everything from the oldest of the last $size words onwards
        return if $form eq '*';
        push(@{$ref}, $form);
        push(@{$words_ref}, $form) if ${$line_ref}[2] eq 'W';
        my $have = @{$words_ref};
        if ($have > $size) {
            my $oldest_kept = ${$words_ref}[1];
            shift @{$words_ref};
            shift @{$ref} while ${$ref}[0] ne $oldest_kept;
        }
    }
    elsif ($type eq 's') {
        # keep the last $size forms
        return if $form eq '*';
        push(@{$ref}, $form);
        my $have = @{$ref};
        splice(@{$ref}, 0, $have - $size) if $have > $size;
    }
    else {
        # regex type: restart the context at every matching line
        if ($line =~ /$type/) {
            ${$after_bos_ref} = 1;
            @{$ref} = ();
        }
        if (${$after_bos_ref} && $form ne '*') {
            push(@{$ref}, $form);
        }
    }
}
|
||||
|
||||
# Feeds the current segment into every pending "right context" buffer and
# prints (via print_and_remove) each concordance line whose right context
# is now complete.
#
# Arguments (positional): right-context type, left-context type, context
# size, reference to the fields of the current segment, references to the
# LEFT / CENTER / RIGHT tables, then the six display settings that are
# passed through unchanged to print_and_remove.
# Further arguments depend on the type:
#   'w'  - a reference to the per-entry word counters (parallel to RIGHT),
#   else - (bos/eos) one ignored value, the raw line and a reference to
#          the "before EOS" flag; the type is used as a pattern.
# Segments whose form (field 3) is '*' are never appended.
sub remember_right{
  my ($type, $type_left, $size, $line_ref,
      $LEFT_REF, $CENTER_REF, $RIGHT_REF,
      $bod, $eod, $w, $c, $t) = splice(@_, 0, 12);

  # everything print_and_remove needs besides the entry index
  my @print_args = ($LEFT_REF, $CENTER_REF, $RIGHT_REF,
                    $bod, $eod, $w, $c, $t, $type_left, $type);

  if ($type eq 'c') {
    if (${$line_ref}[3] ne '*') {
      for (my $i = 0; $i < @{$RIGHT_REF}; $i++) {
        push(@{${$RIGHT_REF}[$i]}, split('', ${$line_ref}[3]));
        my $lsize = @{${$RIGHT_REF}[$i]};
        if ($lsize >= $size) {
          # NOTE(review): this keeps $size-1 characters, while the 's'
          # branch prints exactly $size items - confirm whether the
          # "-1" is intended.
          splice(@{${$RIGHT_REF}[$i]}, $size - 1);
          # print and remove; the next entry slides into slot $i
          print_and_remove($i, @print_args);
          $i--;
        }
      }
    }
  }
  elsif ($type eq 'w') {
    my $words_number_ref = shift;
    if (${$line_ref}[3] ne '*') {
      for (my $i = 0; $i < @{$RIGHT_REF}; $i++) {
        push(@{${$RIGHT_REF}[$i]}, ${$line_ref}[3]);
        next unless ${$line_ref}[2] eq 'W';

        ${$words_number_ref}[$i]++;
        next unless ${$words_number_ref}[$i] == $size;

        print_and_remove($i, @print_args);
        # Drop the counter that belongs to the printed entry BEFORE
        # stepping the index back; doing it afterwards removed the
        # wrong counter (the last one when $i was 0) and left the
        # counters out of step with RIGHT.
        splice(@{$words_number_ref}, $i, 1);
        $i--;
      }
    }
  }
  elsif ($type eq 's') {
    if (${$line_ref}[3] ne '*') {
      for (my $i = 0; $i < @{$RIGHT_REF}; $i++) {
        push(@{${$RIGHT_REF}[$i]}, ${$line_ref}[3]);
        my $rsize = @{${$RIGHT_REF}[$i]};
        if ($rsize == $size) {
          print_and_remove($i, @print_args);
          $i--;
        }
      }
    }
  }
  else { # bos/eos
    shift;
    my $line = shift;
    my $before_eos_ref = shift;

    if (${$before_eos_ref}) {
      if (${$line_ref}[3] ne '*') {
        # only one position can be pending in this mode
        push(@{${$RIGHT_REF}[0]}, ${$line_ref}[3]);
      }
      if ($line =~ /$type/) {
        ${$before_eos_ref} = 0;
        print_and_remove(0, @print_args);
      }
    }
  }
}
|
||||
|
||||
# Prints one concordance line (left context, opening mark, centre, closing
# mark, right context, newline) for entry $index and then removes that
# entry from the LEFT, CENTER and RIGHT tables.
#
# Arguments (positional): entry index, references to the three tables,
# opening and closing display marks, "show whitespace as symbols" flag,
# minimum width of the left column, trim flag, left and right context types.
sub print_and_remove{
  my ($index, $LEFT_REF, $CENTER_REF, $RIGHT_REF,
      $bdis, $edis, $white, $column, $trim,
      $left_type, $right_type) = @_;

  my $left_string  = "$LEFT_REF->[$index]";
  my $right_string = join('', @{ $RIGHT_REF->[$index] });

  # character contexts may start/end in the middle of a word
  if ($trim) {
    $left_string  = trim_left($left_string)   if $left_type  eq "c";
    $right_string = trim_right($right_string) if $right_type eq "c";
  }

  # right-align the left context in a column of $column characters
  my $padding = $column - length($left_string);
  $left_string = " " x $padding . $left_string if $padding > 0;

  if ($white) {
    white_into_symbols(\$left_string);
    white_into_symbols(\$right_string);
    # line below added on 18 November
    white_into_symbols(\$CENTER_REF->[$index]);
  }

  print $left_string;
  print $bdis;

  # block below added on 18 November
  symbols_into_white(\$CENTER_REF->[$index]) unless $white;

  print "$CENTER_REF->[$index]";
  print $edis;
  print $right_string;
  print "\n";

  for my $table ($LEFT_REF, $CENTER_REF, $RIGHT_REF) {
    splice(@{$table}, $index, 1);
  }
}
|
||||
|
||||
# Removes the (possibly incomplete) first word of a left context:
# everything up to and including the first space, newline or tab.
# A string without any of these characters is returned unchanged.
#
# The original test "!$temp_position==-1" parsed as
# "(!$temp_position) == -1", which is never true, so newline and tab
# were silently ignored; all three separators are honoured now.
sub trim_left{
  my $string = shift;
  $string =~ s/\A[^ \n\t]*[ \n\t]//;
  return $string;
}
|
||||
|
||||
# Removes the (possibly incomplete) last word of a right context:
# everything from the last space, newline or tab to the end of the string.
#
# A string without any of these characters is returned unchanged, the same
# way trim_left treats it; the original fell through to substr($s,0,-1)
# and silently dropped the final character. The empty string is also
# handled without a substr-outside-of-string access.
sub trim_right{
  my $string = shift;
  $string =~ s/[ \n\t][^ \n\t]*\z//;
  return $string;
}
|
||||
|
||||
# Flushes every concordance entry that is still pending, oldest first.
# Each call to print_and_remove prints entry 0 and removes it from the
# three tables, so the loop ends when CENTER is empty.
#
# Arguments (positional): references to the LEFT / CENTER / RIGHT tables
# followed by the display settings handed on to print_and_remove.
sub eof_or_inconsistency{
  my ($LEFT_REF, $CENTER_REF, $RIGHT_REF,
      $bdis, $edis, $white, $column, $trim,
      $left_type, $right_type) = @_;

  while (@{$CENTER_REF} > 0) {
    print_and_remove(0, $LEFT_REF, $CENTER_REF, $RIGHT_REF,
                     $bdis, $edis, $white, $column, $trim,
                     $left_type, $right_type);
  }
}
|
42
app/src/cor/Makefile
Normal file
42
app/src/cor/Makefile
Normal file
@ -0,0 +1,42 @@
|
||||
PAR=-Wno-deprecated -m32 -fpermissive
|
||||
# -static
|
||||
PAR2=-c -Wno-deprecated -m32 -fpermissive
|
||||
LIB_PATH=../lib
|
||||
COMMON_PATH=../common
|
||||
CMDLINE_FILE='"../cor/cmdline.h"'
|
||||
|
||||
|
||||
cor: main.cc corr.o $(LIB_PATH)/word.o \
|
||||
$(LIB_PATH)/auttools.o cmdline.c common_cor.o common.o
|
||||
g++ $(PAR) main.cc corr.o common.o \
|
||||
$(LIB_PATH)/word.o $(LIB_PATH)/auttools.o cmdline.c common_cor.o \
|
||||
-o cor
|
||||
|
||||
corr.o: corr.cc corr.hh
|
||||
g++ $(PAR2) corr.cc
|
||||
|
||||
common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \
|
||||
$(COMMON_PATH)/common.h
|
||||
g++ $(PAR2) -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc
|
||||
|
||||
common_cor.o: cmdline.h common_cor.cc common_cor.h
|
||||
g++ $(PAR2) common_cor.cc
|
||||
|
||||
cmdline.c cmdline.h: cmdline.ggo
|
||||
gengetopt -i cmdline.ggo --conf-parser
|
||||
|
||||
cmdline.ggo: cmdline_cor.ggo ../common/cmdline_common.ggo
|
||||
cat cmdline_cor.ggo ../common/cmdline_common.ggo > cmdline.ggo
|
||||
|
||||
copy:
|
||||
ifdef UTT_BIN_DIR
|
||||
cp cor ${UTT_BIN_DIR}
|
||||
endif
|
||||
|
||||
clean: clean.cmdline
|
||||
rm *.o || true
|
||||
rm cor || true
|
||||
|
||||
clean.cmdline:
|
||||
rm cmdline.* || true
|
||||
|
8
app/src/cor/cmdline_cor.ggo
Normal file
8
app/src/cor/cmdline_cor.ggo
Normal file
@ -0,0 +1,8 @@
|
||||
package "cor"
|
||||
version "0.1"
|
||||
|
||||
option "dictionary-home" - "Dictionary home dir." string typestr="FILENAME" no hidden
|
||||
option "dictionary" d "Dictionary" string typestr="FILENAME" default="cor.bin" no
|
||||
option "distance" n "Maximal edit distance." int default="1" no
|
||||
option "replace" r "Replace original form with corrected form, place original form in the cor field. This option has no effect in single mode" flag off
|
||||
#option "single" - "Place all alternatives in the same line" flag off
|
19
app/src/cor/common_cor.cc
Normal file
19
app/src/cor/common_cor.cc
Normal file
@ -0,0 +1,19 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "common_cor.h"
|
||||
|
||||
char dictionary[256];
|
||||
|
||||
// Fills the global `dictionary` path from the command-line options:
//  - an explicitly given --dictionary is expanded and used as is,
//  - otherwise, when both a dictionary home and a language are given,
//    the path is <home>/<language>/<DICT_FILE>.
// In any other case `dictionary` is left untouched.
void process_cor_options(gengetopt_args_info* args)
{
  if(args->dictionary_given)
    {
      expand_path(args->dictionary_arg,dictionary);
    }
  else if (args->dictionary_home_given && args->language_given)
    {
      char buf[255];
      expand_path(args->dictionary_home_arg, buf);
      // bounded: home + language can be longer than the fixed-size buffer
      int needed=snprintf(dictionary,sizeof(dictionary),"%s/%s/%s",
                          buf,args->language_arg,DICT_FILE);
      if(needed<0 || (size_t)needed>=sizeof(dictionary))
        fprintf(stderr,"cor: dictionary path too long, truncated to '%s'\n",dictionary);
    }
}
|
19
app/src/cor/common_cor.h
Normal file
19
app/src/cor/common_cor.h
Normal file
@ -0,0 +1,19 @@
|
||||
#ifndef __COMMON_COR_H
|
||||
#define __COMMON_COR_H
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#define _CMDLINE_FILE "../cor/cmdline.h"
|
||||
#include "../common/common.h"
|
||||
|
||||
#include "cmdline.h"
|
||||
|
||||
#define DICT_FILE "cor.bin"
|
||||
|
||||
extern int change_count;
|
||||
|
||||
extern void process_cor_options(gengetopt_args_info* args);
|
||||
|
||||
extern char dictionary[];
|
||||
|
||||
#endif
|
142
app/src/cor/corr.cc
Normal file
142
app/src/cor/corr.cc
Normal file
@ -0,0 +1,142 @@
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
#include "corr.hh"
|
||||
|
||||
#define MAXPATH 256
|
||||
|
||||
#define min(x,y) ((x<y)?(x):(y))
|
||||
#define max(x,y) ((x>y)?(x):(y))
|
||||
|
||||
|
||||
int Corr::ed(int i,int j)
|
||||
{
|
||||
if(i==-1)
|
||||
return j+1;
|
||||
if(j==-1)
|
||||
return i+1;
|
||||
if(i==-2 || j==-2)
|
||||
return n+1;
|
||||
|
||||
if(X[i]==Y[j])
|
||||
return H2[i-1][j-1];
|
||||
if(X[i-1]==Y[j] && X[i]==Y[j-1])
|
||||
return 1+min(H2[i-2][j-2],min(H2[i][j-1],H2[i-1][j]));
|
||||
return 1+min(H2[i-1][j-1],min(H2[i][j-1],H2[i-1][j]));
|
||||
|
||||
/*
|
||||
if(X[i]==Y[j])
|
||||
return H[(i-1)+2][(j-1)+2];
|
||||
if(X[i-1]==Y[j] && X[i]==Y[j-1])
|
||||
return 1+min(H[(i-2)+2][(j-2)+2],min(H[(i)+2][(j-1)+2],H[(i-1)+2][(j)+2]));
|
||||
return 1+min(H[(i-1)+2][(j-1)+2],min(H[(i)+2][(j-1)+2],H[(i-1)+2][(j)+2]));
|
||||
*/
|
||||
}
|
||||
|
||||
int Corr::cuted(int j)
|
||||
{
|
||||
int l=max(0,j-t);
|
||||
int u=min(m,j+t);
|
||||
int ce=j+t;
|
||||
for(int k=l;k<=u;k++)
|
||||
{
|
||||
if(H2[k][j]<ce)//if(H[(k)+2][(j)+2]<ce)
|
||||
ce=H2[k][j];//ce=H[(k)+2][(j)+2];
|
||||
}
|
||||
return ce;
|
||||
}
|
||||
|
||||
/*
|
||||
void Corr::recomputeH(int j)
|
||||
{
|
||||
for(int i=0;i<=m;i++)
|
||||
H[(i)+2][(j)+2]=ed(i,j);
|
||||
}
|
||||
*/
|
||||
|
||||
void Corr::recomputeH(int j)
|
||||
{
|
||||
int lo=max(0,j-t-2);
|
||||
int hi=min(m,j+t+2);
|
||||
for(int i=lo;i<=hi;++i)
|
||||
H2[i][j]=ed(i,j);//H[(i)+2][(j)+2]=ed(i,j);
|
||||
}
|
||||
|
||||
|
||||
int Corr::correct(const char* w, Words& tab)
|
||||
{
|
||||
long int path[MAXPATH]={0};
|
||||
int i; // row index (X)
|
||||
int j; // column index (Y)
|
||||
long state=0;
|
||||
|
||||
strcpy(X,w);
|
||||
m=strlen(X)-1;
|
||||
n=m+t;
|
||||
|
||||
for(i=(-2);i<=m;i++)
|
||||
H[(i)+2][(-2)+2]=n;
|
||||
for(i=(-1);i<=m;i++)
|
||||
H[(i)+2][(-1)+2]=(i)+1;
|
||||
for(j=(-2);j<=n;j++)
|
||||
H[(-2)+2][(j)+2]=n;
|
||||
for(j=(-1);j<=n;j++)
|
||||
H[(-1)+2][(j)+2]=(j)+1;
|
||||
|
||||
for(j=0; j<=n; ++j)
|
||||
for(i=0; i<=m; ++i)
|
||||
H[i+2][j+2]=t+1;
|
||||
|
||||
int more=1;
|
||||
bool cont=false;
|
||||
|
||||
strcpy(Y,"");
|
||||
j=0;
|
||||
state=0;
|
||||
int count=0;
|
||||
while(more)
|
||||
{
|
||||
if(!empty(state))
|
||||
{
|
||||
Y[j]=input(state);
|
||||
recomputeH(j);
|
||||
if(cuted(j)<=t)
|
||||
{
|
||||
int edd;
|
||||
if(final(next(state)) && (edd=H[(m)+2][(j)+2])<=t)
|
||||
{
|
||||
char* out=new char[j+2];
|
||||
strncpy(out,Y,j+1);
|
||||
out[j+1]='\0';
|
||||
// if(cont) putchar(' ');
|
||||
cont=true;
|
||||
// printf("%i,%s", edd,out);
|
||||
// cout << out << "(" << edd << ")" << endl;
|
||||
tab.add(out);
|
||||
count++;
|
||||
}
|
||||
path[j++]=state;
|
||||
state=next(state);
|
||||
continue;
|
||||
}
|
||||
else
|
||||
if(continued(state))
|
||||
{
|
||||
state++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
//backtracking
|
||||
do
|
||||
if(j>0)
|
||||
j--;
|
||||
else
|
||||
more=0;
|
||||
while(more && !continued(path[j]));
|
||||
state=path[j]+1;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
34
app/src/cor/corr.hh
Normal file
34
app/src/cor/corr.hh
Normal file
@ -0,0 +1,34 @@
|
||||
//---------------------------------------------------------------------------
|
||||
#ifndef _corr_hh
|
||||
#define _corr_hh
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
#include "../lib/tfti.h"
|
||||
#include "../lib/word.h"
|
||||
|
||||
// Spelling corrector: a dictionary automaton searched with a
// threshold-limited edit distance.
//
// NOTE(review): H2 points into this object's own H, so a copied Corr would
// keep pointing at the original's table - objects of this class should not
// be copied.
class Corr : public TFTiv<char,char>
{
private:
  int H[100][100];  // distance table; rows/columns 0 and 1 are borders
  char X[100];      // misspelled string
  char Y[100];      // (possibly partial) candidate string
  int m;            // index of the last character of X (strlen(X)-1)
  int n;            // m+t: largest candidate index considered

  int ed(int,int);
  int cuted(int);
  void recomputeH(int);

public:
  int (*H2)[100];   // H shifted by two: H2[i][j] is H[i+2][j+2]

  int t;            // threshold (maximal edit distance)

  // Every scalar member now starts with a defined value; t defaults to 1
  // and callers may still overwrite it before calling correct().
  Corr() : m(-1), n(-1), H2((int(*)[100])&H[2][2]), t(1)
  { X[0]='\0'; Y[0]='\0'; }
  Corr(const char* a) : TFTiv<char,char>(a), m(-1), n(-1), H2((int(*)[100])&H[2][2]), t(1)
  { X[0]='\0'; Y[0]='\0'; }

  int correct(const char* w, Words& tab);
};
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
#endif
|
155
app/src/cor/main.cc
Normal file
155
app/src/cor/main.cc
Normal file
@ -0,0 +1,155 @@
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include "../lib/iotools.h"
|
||||
#define _CMDLINE_FILE "../cor/cmdline.h"
|
||||
#include "../common/common.h"
|
||||
#include "common_cor.h"
|
||||
#include "corr.hh"
|
||||
#include "cmdline.h"
|
||||
#include <locale.h>
|
||||
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
// setlocale(LC_CTYPE,"");
|
||||
// setlocale(LC_COLLATE,"");
|
||||
|
||||
gengetopt_args_info args;
|
||||
|
||||
if(cmdline_parser(argc, argv, &args) != 0)
|
||||
exit(1);
|
||||
|
||||
process_config_files(&args,argv[0]);
|
||||
process_common_options(&args,argv[0]);
|
||||
process_cor_options(&args);
|
||||
|
||||
Corr cor;
|
||||
|
||||
cor.load(dictionary);
|
||||
cor.t=args.distance_arg;
|
||||
|
||||
char line[MAX_LINE+1];
|
||||
long line_count = 0;
|
||||
|
||||
Segment seg;
|
||||
Words tab;
|
||||
char form1[MAX_LINE];
|
||||
char* form;
|
||||
int formcasing;
|
||||
char corfield[MAX_LINE]="";
|
||||
|
||||
while (fgets(line, MAX_LINE, inputf))
|
||||
{
|
||||
// strcpy(outline,line);
|
||||
++line_count;
|
||||
|
||||
// if(!seg.parse(line))
|
||||
// {
|
||||
// fprintf(stderr,"Input error in line %d.\n",line_count);
|
||||
// exit(1);
|
||||
// }
|
||||
|
||||
char outline[128];
|
||||
//printf("Starting cor... searching for %d fields\n", args.input_field_given);
|
||||
//for (int i=0; i<args.input_field_given; ++i) {
|
||||
// printf("\t%d. %s\n", i, args.input_field_arg[i]);
|
||||
//}
|
||||
|
||||
if (!process_seg(line, args))
|
||||
fputs(line, outputf);
|
||||
else
|
||||
{
|
||||
char form[MAX_FORM];
|
||||
|
||||
tab.clear();
|
||||
getfield(line,input_field_prefix,form);
|
||||
if (form==NULL) continue;
|
||||
|
||||
formcasing=3;
|
||||
cor.correct(form, tab);
|
||||
|
||||
if( tab.count() == 0 )
|
||||
{
|
||||
formcasing=casing(form);
|
||||
if( formcasing == 1 || formcasing == 2)
|
||||
tolowers(form, form1), cor.correct(form1, tab);
|
||||
}
|
||||
|
||||
if ( tab.count() == 0)
|
||||
fputs(line, failedf);
|
||||
else
|
||||
{
|
||||
if(args.replace_flag)
|
||||
{
|
||||
char corfield[128];
|
||||
strcpy(corfield, input_field_prefix);
|
||||
strcat(corfield, form);
|
||||
seg.aux[seg.auxn]=corfield;
|
||||
++seg.auxn;
|
||||
for(int i=0; i<tab.count(); ++i)
|
||||
{
|
||||
seg.form=tab[i].form();
|
||||
restorecasing(seg.form,seg.form,formcasing);
|
||||
seg.print(outline);
|
||||
fputs(outline, outputf);
|
||||
}
|
||||
--seg.auxn;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(one_line)
|
||||
{
|
||||
char* p=corfield;
|
||||
for(int i=0; i<tab.count(); ++i)
|
||||
{
|
||||
restorecasing(tab[i].form(),tab[i].form(),formcasing);
|
||||
p += sprintf(p," %s%s",output_field_prefix,tab[i].form());
|
||||
}
|
||||
sprintf(p,"\n");
|
||||
|
||||
strcpy(outline,line);
|
||||
outline[strlen(outline)-1]='\0';
|
||||
strcat(outline,corfield);
|
||||
fputs(outline, outputf);
|
||||
}
|
||||
else if(one_field)
|
||||
{
|
||||
char* p=corfield;
|
||||
p += sprintf(p," %s",output_field_prefix);
|
||||
for(int i=0; i<tab.count(); ++i)
|
||||
{
|
||||
restorecasing(tab[i].form(),tab[i].form(),formcasing);
|
||||
p += sprintf(p,(i==0)?"%s":";%s",tab[i].form());
|
||||
}
|
||||
|
||||
sprintf(p,"\n");
|
||||
|
||||
strcpy(outline,line);
|
||||
outline[strlen(outline)-1]='\0';
|
||||
strcat(outline,corfield);
|
||||
fputs(outline, outputf);
|
||||
}
|
||||
else
|
||||
{
|
||||
for(int i=0; i<tab.count(); ++i)
|
||||
{
|
||||
restorecasing(tab[i].form(),tab[i].form(),formcasing);
|
||||
sprintf(corfield," %s%s\n",output_field_prefix,tab[i].form());
|
||||
strcpy(outline,line);
|
||||
outline[strlen(outline)-1]='\0';
|
||||
strcat(outline,corfield);
|
||||
fputs(outline, outputf);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(args.interactive_flag)
|
||||
{
|
||||
fflush(outputf);
|
||||
fflush(failedf);
|
||||
}
|
||||
}
|
||||
cmdline_parser_free(&args);
|
||||
}
|
13
app/src/fla/Makefile
Normal file
13
app/src/fla/Makefile
Normal file
@ -0,0 +1,13 @@
|
||||
|
||||
fla: fla.c
|
||||
gcc -static -o fla fla.c
|
||||
|
||||
copy:
|
||||
ifdef UTT_BIN_DIR
|
||||
cp fla ${UTT_BIN_DIR}
|
||||
endif
|
||||
|
||||
clean:
|
||||
rm fla
|
||||
|
||||
uninstall:
|
46
app/src/fla/fla.c
Normal file
46
app/src/fla/fla.c
Normal file
@ -0,0 +1,46 @@
|
||||
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
|
||||
|
||||
char buf[5001];
|
||||
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
|
||||
char *pattern;
|
||||
char eoln;
|
||||
regex_t re;
|
||||
|
||||
int firstline=1;
|
||||
|
||||
if(argc < 2)
|
||||
/* pattern="[ \t]*([0-9]+[ \t]+){2}EOS([ \t].*)?"; */
|
||||
pattern="[ \t]*BOS([ \t].*)?";
|
||||
else
|
||||
pattern=argv[1];
|
||||
|
||||
if(argc < 3)
|
||||
eoln='\f';
|
||||
else
|
||||
eoln=atoi(argv[2]);
|
||||
|
||||
if(regcomp(&re, pattern, REG_EXTENDED|REG_NOSUB) !=0)
|
||||
{
|
||||
fprintf(stderr,"Invalid pattern.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
while(fgets(buf,5000,stdin))
|
||||
{
|
||||
buf[strlen(buf)-1]='\0';
|
||||
if(firstline)
|
||||
firstline=0;
|
||||
else
|
||||
if(regexec(&re, buf, (size_t)0, NULL, 0) == 0)
|
||||
putchar('\n');
|
||||
else
|
||||
putchar(eoln);
|
||||
fputs(buf,stdout);
|
||||
}
|
||||
putchar('\n');
|
||||
}
|
7
app/src/gph/Makefile
Normal file
7
app/src/gph/Makefile
Normal file
@ -0,0 +1,7 @@
|
||||
|
||||
gph:
|
||||
|
||||
copy:
|
||||
ifdef UTT_BIN_DIR
|
||||
cp gph ${UTT_BIN_DIR}
|
||||
endif
|
85
app/src/gph/gph
Executable file
85
app/src/gph/gph
Executable file
@ -0,0 +1,85 @@
|
||||
#!/usr/bin/perl

# gph: adds graph information to selected segments of a UTT stream.
# Every processed segment gets a field "gph:<node>:<predecessors>", where
# the predecessors are the earlier nodes that end exactly where this
# segment starts.

use strict;
use warnings;
use Getopt::Long;

my @process;
my $help=0;
my $reset;
my $interactive=1;

# "interactive" is negatable so that the default-on mode can actually be
# switched off (--nointeractive); plain -i keeps its old meaning.
GetOptions("process|p=s" => \@process,
           "help|h" => \$help,
           "reset|r=s" => \$reset,
           "interactive|i!" => \$interactive);

if($help)
{
    print <<'END'
Usage: gph [OPTIONS]

Options:
   -p tag			Process segments with this tag as nodes.
   -r tag			Start new graph at this tag.
   -f filename		Input file (NIE DZIALA).
   -o filename		Output file (NIE DZIALA).
   -i			Toggle interactive mode (default=on).
   --nointeractive		Turn interactive mode off.
END
;
    exit 0;
}

$|=1 if $interactive;

# nodes that may still get successors: [node number, start, length]
my @prev;

my $n=0;

while(my $segment = <>)
{
    chomp $segment;

    my @line = split /\s+/, $segment;

    # fields: start position, length, tag; missing ones default to 0 / ''
    my $start  = defined $line[0] ? $line[0] : 0;
    my $length = defined $line[1] ? $line[1] : 0;
    my $tag    = defined $line[2] ? $line[2] : '';

    # malformed (non-numeric) position fields count as 0, as they always did
    no warnings 'numeric';

    # Without --reset nothing restarts the graph; previously an undefined
    # $reset compared equal to a missing tag field.
    if(defined $reset && $tag eq $reset)
    {
        $n=0;
        @prev = ();
    }

    my $do = grep { $tag eq $_ } @process;

    my $gph='';

    if($do)
    {
        # nodes ending before this segment can no longer be predecessors
        shift @prev while @prev && $prev[0]->[1] + $prev[0]->[2] < $start;

        my @preds = map { $_->[0] }
                    grep { $_->[1] + $_->[2] == $start } @prev;

        push @prev, [$n, $start, $length];

        $gph=' gph:'.$n.':'.join(',',@preds);

        $n++;
    }
    else
    {
        # an unprocessed segment extends every node that ends where it starts
        for my $p (@prev)
        {
            if($p->[1]+$p->[2] == $start)
            {
                $p->[2] += $length;
            }
        }
    }

    print $segment.$gph."\n";
}
|
6
app/src/grp/Makefile
Normal file
6
app/src/grp/Makefile
Normal file
@ -0,0 +1,6 @@
|
||||
main:
|
||||
|
||||
copy:
|
||||
ifdef UTT_BIN_DIR
|
||||
cp grp ${UTT_BIN_DIR}
|
||||
endif
|
154
app/src/grp/grp
Executable file
154
app/src/grp/grp
Executable file
@ -0,0 +1,154 @@
|
||||
#!/usr/bin/perl

#package:	UAM Text Tools
#component name:	gre
#author:	Tomasz Obrêbski

# Translates a segment pattern into an extended regular expression (macro
# expansion with m4, tag expansion with tag2re) and runs it as the pipeline
#   [lzop -cd |] [fla |] egrep '<re>' [| unfla]

use strict;
use warnings;
use Getopt::Long;

my $LIB_DIR="/usr/local/lib/utt"; # directory containing terms.m4

my $systemconfigfile="/usr/local/etc/utt/grp.conf";
my $userconfigfile="$ENV{'HOME'}/.utt/grp.conf";

Getopt::Long::Configure('no_ignore_case_always');

my $help=0;
my $pattern=0;
my $matches_only=0;
my $macrofile=0;
my $define=0;
my $show_command=0;
my $action="pgP";
my $eos="seg(EOS)";
my $morfield='lem';

#read configuration files###########################
# "name = value" lines; '#' starts a comment. Later files and the command
# line override earlier settings.
foreach my $file ($systemconfigfile, $userconfigfile){
  if(open(my $config, '<', $file)){
    while (my $entry = <$config>) {
      chomp $entry;
      $entry =~ s/#.*//;
      $entry =~ s/^\s+//;
      $entry =~ s/\s+$//;
      next unless length $entry;
      my ($name, $value) = split(/\s*=\s*/, $entry, 2);
      if(($name eq "pattern")or($name eq "e")){
	$pattern=$value;
      }
      elsif(($name eq "eos")or($name eq "E")){
	$eos=$value;
      }
      elsif($name eq "morph"){
	$morfield=$value;
      }
      elsif($name eq "macros"){
	$macrofile=$value;
      }
      elsif($name eq "define"){
	$define=$value;
      }
      elsif($name eq "command"){
	$show_command=1;
      }
      elsif($name eq "action"){
	# was a bare "$action;" - the configured value was thrown away
	$action=$value;
      }
      elsif(($name eq "help")or($name eq "h")){
	$help=1;
      }
    }
    close $config;
  }
}
#########################################################

# NOTE(review): --define is stored in $macrofile, so it replaces the macro
# file instead of adding to it as the help text says, and $define is never
# used - confirm the intended behaviour before changing it.
GetOptions("pattern|e=s" => \$pattern,
	   "eos|E=s" => \$eos,
	   "morph=s" => \$morfield,
	   "macros=s" => \$macrofile,
	   "define=s" => \$macrofile,
	   "command" => \$show_command,
	   "action|a=s" => \$action,   # -a is documented in the help text
	   "help|h" => \$help);

if($help)
{
    print <<'END'
Usage: gre [OPTIONS] [file ..]

Options:
   --pattern -e PATTERN		Pattern.
   --eos -E PATTERN		Segment serving as sentence delimiter.
   --morph=STRING		Field containing morphological information (default 'lem').
   --macros=FILE		Read macrodefinitions from FILE.
   --define=FILE		Add macrodefinitions from FILE.
   --action -a [u][p][g][P]	Perform only indicated actions.
                                   u - uncompress with 'lzop -cd'
                                   p - preprocess
                                   g - grep
                                   P - postprocess
                                (default pgP)
   --command			Print the shell command to be executed and exit.
   --help -h			Help.
END
;
    exit 0;
}

die("$0: no pattern given.\n") unless $pattern || $action !~ /g/;

# fall back to the system-wide macro definitions
unless($macrofile)
{
    my $default_macros="$LIB_DIR/terms.m4";
    die("$0: macro file not found") unless -e $default_macros;
    $macrofile=$default_macros;
}

my $uncompress = ($action =~ /u/) ? ' lzop -cd | ' : '';
my $preproc    = ($action =~ /p/) ? ' fla | ' : '';

my $postproc   = ($action =~ /P/) ? ' | unfla ' : '';


# discarding spaces
$pattern =~ s/\s+/\\`'/g; #`
# quoting escaped commas
$pattern =~ s/\\,/\\`\\`\\,''/g;
# quoting commas in {m,n} r.e. operator
$pattern =~ s/(\{\d*),(\d*\})/$1\\`\\`,''$2/g;

# NOTE(review): the pattern is interpolated into a shell command line;
# quotes or '$' inside it are interpreted by the shell.
my $grepre = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`;

die("Incorrect pattern (m4).") if $? >> 8;


chomp $grepre;

# <> expansion

$grepre =~ s/<([^>]+)>/`echo $1 | tag2re`/ge;

# a segment is one "word" here: no class may cross a space or a separator
$grepre =~ s/\./[^ [:cntrl:]]/g;

$grepre =~ s/\\s/[ ]/g;
$grepre =~ s/\\S/[^ [:cntrl:]]/g;
$grepre =~ s/\\d/[0-9]/g;
$grepre =~ s/\\D/[^0-9 [:cntrl:]]/g;
$grepre =~ s/\\w/[a-z±æê³ñ󶼿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_]/g;
$grepre =~ s/\\W/[^a-z±æê³ñ󶼿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_ [:cntrl:]]/g;
# extensions
$grepre =~ s/\\l/[a-z±æê³ñ󶼿]/g; #lowercase letter
$grepre =~ s/\\L/[A-Z¡ÆÊ£ÑÓ¦¬¯]/g; #uppercase letter

my $grep_command = ($action =~ /g/) ? "egrep '$grepre'" : " cat ";

if($show_command)
{
    print $grep_command."\n";
    exit 0;
}

#print $preproc.$grep_command.$postproc."\n";

# $uncompress used to be computed but left out of the pipeline, so the
# documented 'u' action did nothing.
exec $uncompress.$preproc.$grep_command.$postproc
    or die "$0: cannot run the pipeline: $!\n";
|
42
app/src/gue/Makefile
Normal file
42
app/src/gue/Makefile
Normal file
@ -0,0 +1,42 @@
|
||||
PAR=-Wno-deprecated -O3 -fpermissive -static
|
||||
PAR2=-c -Wno-deprecated -O3 -fpermissive
|
||||
LIB_PATH=../lib
|
||||
COMMON_PATH=../common
|
||||
CMDLINE_FILE='"../gue/cmdline.h"'
|
||||
|
||||
|
||||
gue: main.cc guess.o $(LIB_PATH)/auttools.o $(LIB_PATH)/word.o \
|
||||
cmdline.c common_guess.o common.o
|
||||
g++ $(PAR) main.cc guess.o \
|
||||
$(LIB_PATH)/auttools.o $(LIB_PATH)/word.o cmdline.c common.o common_guess.o \
|
||||
-o gue
|
||||
|
||||
guess.o: guess.h guess.cc
|
||||
g++ $(PAR2) guess.cc
|
||||
|
||||
common_guess.o: cmdline.h common_guess.cc common_guess.h
|
||||
g++ $(PAR2) common_guess.cc
|
||||
|
||||
common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \
|
||||
$(COMMON_PATH)/common.h
|
||||
g++ $(PAR2) -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc
|
||||
|
||||
cmdline.c cmdline.h: cmdline.ggo
|
||||
gengetopt -i cmdline.ggo --conf-parser
|
||||
|
||||
cmdline.ggo: cmdline_guess.ggo ../common/cmdline_common.ggo
|
||||
cat cmdline_guess.ggo ../common/cmdline_common.ggo > cmdline.ggo
|
||||
|
||||
|
||||
clean: clean.cmdline
|
||||
rm *.o || true
|
||||
rm gue || true
|
||||
|
||||
|
||||
clean.cmdline:
|
||||
rm cmdline.* || true
|
||||
|
||||
copy:
|
||||
ifdef UTT_BIN_DIR
|
||||
cp gue ${UTT_BIN_DIR}
|
||||
endif
|
12
app/src/gue/cmdline_guess.ggo
Normal file
12
app/src/gue/cmdline_guess.ggo
Normal file
@ -0,0 +1,12 @@
|
||||
package "guess"
|
||||
version "0.1"
|
||||
|
||||
option "guess_count" n "Guess up to n descriptions" int default="0" no
|
||||
option "delta" - "Stop displaying answers after fall of weight" float default="0.2" no
|
||||
option "cut-off" - "Do not display answers with less weight than cut-off" int default="200" no
|
||||
option "dictionary-home" - "dh" hidden
|
||||
option "dictionary" d "File with dictionary information" string typestr="filename" default="~/.utt/lang/pl_PL.ISO-8859-2/gue.bin" no
|
||||
option "per-info" v "Display performance information" flag off
|
||||
option "weights" w "Print weights" flag off hidden
|
||||
option "no-uppercase" - "Do not process form containing uppercase letters" flag off
|
||||
|
50
app/src/gue/common_guess.cc
Normal file
50
app/src/gue/common_guess.cc
Normal file
@ -0,0 +1,50 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "common_guess.h"
|
||||
|
||||
int guess_count=0;
|
||||
double delta=0.1;
|
||||
int cut_off=100;
|
||||
char dictionary[255];
|
||||
bool per_info=false;
|
||||
bool weights=true;
|
||||
|
||||
// Copies the guesser settings from the parsed command line into the
// globals declared in common_guess.h.
//  - dictionary: an explicit --dictionary wins; otherwise, when both a
//    dictionary home and a language are given, the path is
//    <home>/<language>/<DIC_FILE>; in any other case it is left untouched,
//  - guess_count: the given value, where 0 (or no value) means 100,
//  - delta / cut_off: the given value or 0.1 / 100,
//  - per_info: taken from the flag when the option was given,
//  - weights: switched off when the option was given.
void process_guess_options(gengetopt_args_info* args)
{
  if(args->dictionary_given)
    {
      expand_path(args->dictionary_arg,dictionary);
    }
  else if (args->dictionary_home_given && args->language_given)
    {
      char buf[255];
      expand_path(args->dictionary_home_arg, buf);
      // The guesser dictionary is DIC_FILE ("gue.bin"); the path used to
      // be built with "lem.bin". Bounded, because home + language can be
      // longer than the fixed-size buffer.
      int needed=snprintf(dictionary,sizeof(dictionary),"%s/%s/%s",
                          buf,args->language_arg,DIC_FILE);
      if(needed<0 || (size_t)needed>=sizeof(dictionary))
        fprintf(stderr,"gue: dictionary path too long, truncated to '%s'\n",dictionary);
    }

  guess_count = args->guess_count_given ? args->guess_count_arg : 0;
  if(guess_count==0)
    guess_count=100;

  delta = args->delta_given ? args->delta_arg : 0.1;

  cut_off = args->cut_off_given ? args->cut_off_arg : 100;

  if(args->per_info_given)
    per_info=args->per_info_flag;

  // NOTE(review): the option is described as "Print weights" but giving
  // it turns `weights` off - confirm which one is intended.
  if(args->weights_given)
    weights=false;
}
|
20
app/src/gue/common_guess.h
Normal file
20
app/src/gue/common_guess.h
Normal file
@ -0,0 +1,20 @@
|
||||
#ifndef __COMMON_GUESS_H
|
||||
#define __COMMON_GUESS_H
|
||||
|
||||
#include <stdio.h>
|
||||
#define _CMDLINE_FILE "../gue/cmdline.h"
|
||||
#include "../common/common.h"
|
||||
#include "cmdline.h"
|
||||
|
||||
#define DIC_FILE "gue.bin"
|
||||
|
||||
extern int guess_count;
|
||||
extern double delta;
|
||||
extern int cut_off;
|
||||
extern char dictionary[];
|
||||
extern bool per_info;
|
||||
extern bool weights;
|
||||
|
||||
void process_guess_options(gengetopt_args_info* args);
|
||||
|
||||
#endif
|
138
app/src/gue/guess.cc
Normal file
138
app/src/gue/guess.cc
Normal file
@ -0,0 +1,138 @@
|
||||
|
||||
#include "guess.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <iostream.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <time.h>
|
||||
|
||||
#define DICT 1
|
||||
#define COR 2
|
||||
#define DICT_P 3
|
||||
#define COR_P 4
|
||||
|
||||
#define W_PRE 0.1
|
||||
#define W_SUF 0.9
|
||||
|
||||
#define PREF_SIGN '_'
|
||||
|
||||
// Builds the guesser; the suffix automaton member is constructed
// directly from the dictionary file name.
Guess::Guess(const char* suf_file)
  : _suf(suf_file)
{
}
|
||||
|
||||
|
||||
char buf[MAX_LINE];
|
||||
char out[MAX_LINE];
|
||||
char* buf0_s = buf;
|
||||
char* word_t = NULL;
|
||||
long state_s = 0;
|
||||
unsigned length_s = buf0_s - buf;
|
||||
long len = 0;
|
||||
int i=0;
|
||||
|
||||
int Guess::ana(const char* word, Words& result) {
|
||||
|
||||
assert(word && &result);
|
||||
|
||||
/* Word zawiera wyraz, ktory mamy zbadac.
|
||||
* Nalezy przepisac go w odwrotnej kolejnosci do bufora,
|
||||
* znalezc najdluzszy prefiks pasujacy do tego bufora
|
||||
* separatorem jest '/' - za tym znakiem znajduje sie
|
||||
* prawdopodobienstwo wystapienia danego opisu */
|
||||
|
||||
buf0_s = buf;
|
||||
word_t = strdup(word);
|
||||
|
||||
if (reverse(word, buf) != 0)
|
||||
return -1;
|
||||
|
||||
|
||||
|
||||
state_s = -1;
|
||||
// printf("#buf0_s=%s, ", buf0_s);
|
||||
state_s = _suf.pref(buf0_s, PREF_SIGN);
|
||||
// printf("#word=%s, buf0_s=%s\t", word, buf0_s);
|
||||
/* jezeli state_s != -1 to oznacza, ze w slowniku jest zawarta
|
||||
* informacja o prefiksie tego slowa.
|
||||
* nie jest ona odwrocona, wiec porownujemy do word a nie do buf
|
||||
*/
|
||||
// printf("state_s=%d\t", state_s);
|
||||
if (state_s != -1) {
|
||||
state_s = _suf.pref(word_t, '~', state_s);
|
||||
// printf("state_s(wp)=%d, word_t=%s, word=%s\n", state_s, word_t, word);
|
||||
}
|
||||
if (state_s == -1) {
|
||||
// if (_suf != NULL)
|
||||
buf0_s = buf;
|
||||
state_s = _suf.pref(buf0_s, '~');
|
||||
// printf("state_s=%d\n", state_s);
|
||||
}
|
||||
|
||||
length_s = buf0_s - buf;
|
||||
|
||||
/* state jest stanem, od ktorego zaczyna sie sciezka opisujaca
|
||||
* prawdopodobienstwo przeciwienstwa wystapienia opisu
|
||||
* znajdujacego sie dalej na tej sciezce.
|
||||
* Im mniejsza wartosc liczby tym wieksze prawdopodobienstwo */
|
||||
|
||||
len = 0;
|
||||
i=0;
|
||||
|
||||
// if (_suf != NULL)
|
||||
len = _suf.cont(state_s, out);
|
||||
while (len > 0) {
|
||||
i++;
|
||||
add_word_prob(result, word, out, length_s, DICT);
|
||||
len = _suf.cont(-1, out);
|
||||
}
|
||||
|
||||
return i;
|
||||
|
||||
}
|
||||
|
||||
|
||||
/* Adds `word` to `tab` together with the weight and description encoded in
 * `path`.
 *
 * `path` has the form "<weight>;<description>". The part before the first
 * ';' is parsed with atof() and stored as the suffix weight; the part after
 * it is stored as the description. If `path` contains no ';' the description
 * is empty.
 *
 * Parameters:
 *   tab    - table receiving the new entry,
 *   word   - the analysed word,
 *   path   - continuation path read from the automaton,
 *   len    - length of the matched suffix,
 *   source - only DICT entries get their suffix length/weight filled in.
 * Returns:
 *   the index returned by tab.add().
 */
int Guess::add_word_prob(Words& tab, const char* word, const char* path, unsigned len, int source) {

  // Bounded private copy of the path so that it can be split in place.
  char p[MAX_LINE];
  strncpy(p, path, MAX_LINE - 1);
  p[MAX_LINE - 1] = '\0';

  // Split at the first ';': p becomes the weight, desc the description.
  char* desc = p + strcspn(p, ";");
  if (*desc == ';') {
    *desc = '\0';
    ++desc;            // skip the separator itself
  }
  // else: no separator - desc points at the terminating NUL (empty string)
  // instead of one byte past it.

  int i = tab.add(word, desc);

  if (source == DICT) {
    tab[i].len_suf(len);
    tab[i].w_suf(atof(p));
  }

  return i;
}
|
56
app/src/gue/guess.h
Normal file
56
app/src/gue/guess.h
Normal file
@ -0,0 +1,56 @@
|
||||
|
||||
#include "../lib/tfti.h"
|
||||
#include "../lib/word.h"
|
||||
|
||||
#include <sys/timeb.h>
|
||||
|
||||
/**************************************************************
|
||||
* Zawiera definicje klasy Guess. *
|
||||
* *
|
||||
* Klasa ta pozwala na okreslenie opisu slowa nie *
|
||||
* znajdujacego sie w slowniku wraz z prawdopodobienstwem *
|
||||
* jego wystapienia. *
|
||||
*************************************************************/
|
||||
|
||||
class Guess {
|
||||
|
||||
public:
|
||||
|
||||
// nazawa pliku slownika w parametrze
|
||||
Guess(const char* suf_file);
|
||||
|
||||
// zwraca tablice opisow slowa wraz z prawdopodobienstwem ich wystapienia
|
||||
int ana(const char* word, Words& result);
|
||||
|
||||
long time_overall;
|
||||
|
||||
private:
|
||||
|
||||
// sufiksy
|
||||
TFTiv<char, char> _suf;
|
||||
|
||||
// prefiksy
|
||||
TFTiv<char, char> _pref;
|
||||
|
||||
//odwraca ciag znakow
|
||||
int reverse(const char* src, char* dest) {
|
||||
|
||||
// assert((src != NULL) && (dest != NULL));
|
||||
|
||||
const char* c = src;
|
||||
|
||||
int len = strlen(src);
|
||||
|
||||
for (int i=1; i<=len; ++i) {
|
||||
dest[i-1] = src[len-i];
|
||||
}
|
||||
|
||||
dest[len] = '\0';
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//dodaje nowy element do tablicy WordsProb
|
||||
int add_word_prob(Words& tab, const char* word, const char* path, unsigned len, int source);
|
||||
|
||||
};
|
192
app/src/gue/main.cc
Normal file
192
app/src/gue/main.cc
Normal file
@ -0,0 +1,192 @@
|
||||
#include <time.h>
|
||||
#include <stdlib.h>
|
||||
#include "../lib/iotools.h"
|
||||
#define _CMDLINE_FILE "../gue/cmdline.h"
|
||||
#define CONFIGFILE1 "/home/ynka/utt/utt-0.9/conf/gue.conf"
|
||||
#define CONFIGFILE2 "/home/ynka/utt/utt-0.9/conf/gue.conf"
|
||||
#include "../common/common.h"
|
||||
#include "common_guess.h"
|
||||
#include "guess.h"
|
||||
#include "cmdline.h"
|
||||
|
||||
#define W_SUFF 0.6
|
||||
#define W_PREF 0.4
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
int non_standard_config=0;
|
||||
|
||||
gengetopt_args_info args;
|
||||
|
||||
if(cmdline_parser(argc, argv, &args) != 0)
|
||||
exit(1);
|
||||
|
||||
process_config_files(&args,argv[0]);
|
||||
|
||||
process_common_options(&args,argv[0]);
|
||||
process_guess_options(&args);
|
||||
|
||||
|
||||
// PONIŻEJ POPRZEDNI KOD (JUSTYNY)
|
||||
// //preliminary command-line parsing - for configuration file info only
|
||||
// gengetopt_args_info pre_args;
|
||||
|
||||
// if (cmdline_parser(argc, argv, &pre_args) != 0)
|
||||
// exit(1);
|
||||
// if(pre_args.config_given){
|
||||
// printf("podano config: %s\n",pre_args.config_arg);
|
||||
// non_standard_config=1;
|
||||
// }
|
||||
|
||||
|
||||
// //configuration file 1 parsing
|
||||
// struct cmdline_parser_params *params;
|
||||
// params = cmdline_parser_params_init();
|
||||
// params->initialize = 1;
|
||||
// if(cmdline_parser_config_file(CONFIGFILE1,&args, params)!=0){
|
||||
// printf("System-wide configuration file parsing error!\n");
|
||||
// exit(1);
|
||||
// }
|
||||
|
||||
// //configuration file 2 parsing-overriding
|
||||
// params->initialize=0;
|
||||
// params->override=1;
|
||||
// char* config2=(non_standard_config)?pre_args.config_arg:CONFIGFILE2;
|
||||
// if(cmdline_parser_config_file(config2,&args, params)!=0){
|
||||
// printf("User configuration file parsing error!\n");
|
||||
// return 1;
|
||||
// }
|
||||
|
||||
// params->initialize=0;
|
||||
// params->override=1;
|
||||
// //params->check_required=1;
|
||||
|
||||
// free(params);
|
||||
|
||||
// //command-line options parsing-overriding
|
||||
// if (cmdline_parser(argc, argv, &args) != 0)
|
||||
// exit(1);
|
||||
|
||||
|
||||
char line[MAX_LINE];
|
||||
char outline[MAX_LINE];
|
||||
char parms[MAX_LINE], desc[MAX_LINE], lemma[MAX_LINE];
|
||||
long line_count = 0;
|
||||
// printf("d_f=%s\n", dict_file);
|
||||
Guess guess(dictionary);
|
||||
int words_count=0;
|
||||
time_t start_time = time(NULL);
|
||||
|
||||
Segment seg;
|
||||
Words tab;
|
||||
char* form; //[MAX_FORM];
|
||||
while (fgets(line, MAX_LINE, inputf)==line) {
|
||||
line_count++;
|
||||
int start, len;
|
||||
|
||||
line[strlen(line)-1] = '\0';
|
||||
|
||||
if (!seg.parse(line)) {
|
||||
fprintf(stderr, "B³±d w wej¶ciu (linia: %d)\n", line_count);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (process_seg(seg, args)) {
|
||||
words_count++;
|
||||
tab.clear();
|
||||
if (args.input_field_given>0) {
|
||||
form = getInput(args.input_field_arg, args.input_field_given, seg);
|
||||
} else
|
||||
form = seg.form;
|
||||
|
||||
if (NULL == form) {
|
||||
continue;
|
||||
}
|
||||
|
||||
guess.ana(form, tab);
|
||||
|
||||
if ((tab.count()==0) && (!args.no_fail_flag)) {
|
||||
// no guesses - analysis was unsuccessful
|
||||
seg.print(outline); //this is necessary - seg.parse destroys line...
|
||||
fputs(outline, failedf);
|
||||
if (copy_processed)
|
||||
fputs(line, stdout);
|
||||
continue;
|
||||
}
|
||||
// we've got some guesses. Do we want to print it?
|
||||
if (args.only_fail_flag)
|
||||
continue;
|
||||
|
||||
float last_weight=0;
|
||||
int i=0;
|
||||
int count=0;
|
||||
unsigned first=1;
|
||||
char* parms_end = parms;
|
||||
char last_lemma[MAX_LINE];
|
||||
|
||||
while ((i=tab.next()) != -1 && count++<guess_count) {
|
||||
/* if we have "one-line" flag then everything goes in one segment as many fields,
|
||||
* if we have "one-field" flag everything goes in one segment as ONE field:
|
||||
* - diferent lemmas are separated with ';', sequent descriptions to one lemma
|
||||
* are separated with ','
|
||||
*/
|
||||
if ((!first) && (tab[i].w_suf() < cut_off) || (tab[i].w_suf() < delta * last_weight)) {
|
||||
break;
|
||||
}
|
||||
if (first) {
|
||||
parms_end += sprintf(parms_end, "%s", field_prefix);
|
||||
} else if (!args.one_field_flag)
|
||||
parms_end += sprintf(parms_end, "%s", field_prefix);
|
||||
|
||||
if (!args.one_field_flag || strcmp(last_lemma, tab[i].lemma()) != 0) {
|
||||
if (args.one_field_flag && !first)
|
||||
parms_end += sprintf(parms_end, ";");
|
||||
parms_end += sprintf(parms_end, "%s", tab[i].lemma());
|
||||
strcpy(last_lemma, tab[i].lemma());
|
||||
}
|
||||
|
||||
first=0;
|
||||
|
||||
last_weight = tab[i].w_suf();
|
||||
if (!weights)
|
||||
parms_end += sprintf(parms_end, ",%s:%d", tab[i].descr(), (int)tab[i].w_suf());
|
||||
else
|
||||
parms_end += sprintf(parms_end, ",%s", tab[i].descr());
|
||||
|
||||
if (!args.one_field_flag) {
|
||||
seg.addfield(parms);
|
||||
parms_end = parms;
|
||||
}
|
||||
|
||||
if (!(args.one_field_flag || args.one_line_flag)) {
|
||||
seg.print(outline);
|
||||
fputs(outline, outputf);
|
||||
--seg.auxn;
|
||||
}
|
||||
//if (copy_processed)
|
||||
// fputs(outline, stdout);
|
||||
} //while
|
||||
|
||||
if (args.one_field_flag)
|
||||
seg.addfield(parms);
|
||||
|
||||
if (args.one_field_flag || args.one_line_flag){
|
||||
seg.print(outline);
|
||||
fputs(outline, outputf);
|
||||
}
|
||||
} else { // if (process_segment)
|
||||
// jak to nie jest wyraz - to przepisz token na wyjscie.
|
||||
// printtok(line, start, len, cat, form);
|
||||
seg.print(outline);
|
||||
fputs(outline, outputf);
|
||||
if (copy_processed)
|
||||
fputs(outline, stdout);
|
||||
}
|
||||
}
|
||||
time_t end_time = time(NULL);
|
||||
if (per_info) {
|
||||
printf("Liczba s³ów: %d\n", words_count);
|
||||
printf("Czas analizy: %d sekund\n", end_time-start_time);
|
||||
}
|
||||
cmdline_parser_free(&args);
|
||||
}
|
7
app/src/kot/Makefile
Normal file
7
app/src/kot/Makefile
Normal file
@ -0,0 +1,7 @@
|
||||
|
||||
# kot is a ready-to-run Perl script, so there is nothing to build.
# `copy` installs it into the distribution's bin directory when the
# top-level Makefile exports UTT_BIN_DIR.

.PHONY: copy

kot:

copy:
ifdef UTT_BIN_DIR
	cp kot ${UTT_BIN_DIR}
endif
|
99
app/src/kot/kot
Executable file
99
app/src/kot/kot
Executable file
@ -0,0 +1,99 @@
|
||||
#!/usr/bin/perl

# kot - turns a stream of UTT segments back into text.
#
# Every input line is expected to look like "START LENGTH TYPE FORM ...".
# The forms are printed one after another; whenever a segment starts after
# the end of the previous one, the gap-fill string is printed in between.

use strict;
use warnings;
use Getopt::Long;

my $help     = 0;
my $gap_fill = "\n-----\n";
my $spaces   = 0;

my $configfile1 = "../../conf/kot.conf";
my $configfile2 = "../conf/kot.conf";

# Read configuration files ##################################
# Both files are optional; the second one overrides the first, and the
# command line overrides both.
foreach my $file ($configfile1, $configfile2) {
    open(my $config, '<', $file) or next;
    while (my $line = <$config>) {
        chomp $line;
        $line =~ s/#.*//;       # strip comments
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next unless length $line;
        my ($name, $value) = split(/\s*=\s*/, $line, 2);
        if ($name eq "gap-fill" or $name eq "g") {
            $gap_fill = defined $value ? $value : '';
        }
        elsif ($name eq "spaces" or $name eq "s") {
            $spaces = 1;
        }
        elsif ($name eq "help" or $name eq "h") {
            $help = 1;
        }
    }
    close $config;
}
#############################################################

GetOptions("gap-fill|g=s" => \$gap_fill,
           "spaces|r"     => \$spaces,
           "help|h"       => \$help);

if ($help) {
    print <<'END';
Usage: kot [OPTIONS] [file ..]

Options:
   --gap-fill=STRING  -g STRING  String printed between segments that are
                                 not adjacent (default: "\n-----\n").
                                 \t, \n, \r and \f are expanded.
   --spaces           -r         Do not convert '_' and the escape sequences
                                 \t, \n, \r, \f in segments of type S.
   --help             -h         Print this help and exit.
END
    exit 0;
}

# Expand the escape sequences allowed in the gap-fill string.
$gap_fill =~ s/\\t/\t/g;
$gap_fill =~ s/\\n/\n/g;
$gap_fill =~ s/\\r/\r/g;
$gap_fill =~ s/\\f/\f/g;

my $prevend = -1;   # end position of the previous segment
my $count   = 0;    # number of gaps seen so far

while (my $line = <>) {
    # Lines that are not segments are skipped; previously they were handled
    # with undefined fields, which printed a gap and reset the position.
    my ($start, $len, $type, $form) =
        $line =~ /^\s*(\d+)\s+(\d+)\s+(\S+)\s+(\S+)/
        or next;

    if ($start > $prevend) {
        # No gap-fill before the very first segment.
        print $gap_fill unless $count++ == 0;
    }

    $prevend = $start + $len;

    next if $len == 0;    # || $form eq "*";

    $form =~ s/\\\*/*/g;  # unescape a literal asterisk

    if ($type eq 'S' && !$spaces) {
        $form =~ s/_/ /g;
        $form =~ s/\\t/\t/g;
        $form =~ s/\\n/\n/g;
        $form =~ s/\\r/\r/g;
        $form =~ s/\\f/\f/g;
    }

    print $form;
}

#print $gap_fill;

# print "\n";
|
56
app/src/lem/Makefile
Normal file
56
app/src/lem/Makefile
Normal file
@ -0,0 +1,56 @@
|
||||
# Build rules for the lem tool.
# PAR  - flags for the final compile-and-link step,
# PAR2 - flags for compiling a single object file.
PAR=-Wno-deprecated -m32 -O3 -fpermissive
#-static
PAR2=-c -Wno-deprecated -m32 -O3 -fpermissive
LIB_PATH=../lib
COMMON_PATH=../common
CMDLINE_FILE='"../lem/cmdline.h"'

.PHONY: clean clean.cmdline copy

lem: main.cc lem.o $(LIB_PATH)/auttools.o $(LIB_PATH)/word.o \
	cmdline.c common_lem.o common.o symtab.o
	g++ $(PAR) main.cc lem.o $(LIB_PATH)/auttools.o \
	$(LIB_PATH)/word.o cmdline.c common.o common_lem.o \
	symtab.o -o lem

lem.o: lem.h lem.cc
	g++ $(PAR2) lem.cc

# alphabet.o: $(LIB_PATH)/alphabet.h $(LIB_PATH)/alphabet.cc
#	g++ $(PAR2) $(LIB_PATH)/alphabet.cc

# auttools.o: $(LIB_PATH)/auttools.h $(LIB_PATH)/auttools.cc
#	g++ $(PAR2) $(LIB_PATH)/auttools.cc

# word.o: $(LIB_PATH)/word.h $(LIB_PATH)/word.cc
#	g++ $(PAR2) $(LIB_PATH)/word.cc

# erro.o: $(LIB_PATH)/erro.h $(LIB_PATH)/erro.cc
#	g++ $(PAR2) $(LIB_PATH)/erro.cc

common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \
	$(COMMON_PATH)/common.h
	g++ $(PAR2) -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc

common_lem.o: cmdline.h common_lem.h common_lem.cc
	g++ $(PAR2) common_lem.cc

# The command-line parser is generated by gengetopt from the tool-specific
# options concatenated with the options shared by all tools.
cmdline.c cmdline.h: cmdline.ggo
	gengetopt -i cmdline.ggo --conf-parser

cmdline.ggo: cmdline_lem.ggo ../common/cmdline_common.ggo
	cat cmdline_lem.ggo ../common/cmdline_common.ggo > cmdline.ggo

symtab.o: $(LIB_PATH)/symtab.h $(LIB_PATH)/symtab.cc
	g++ $(PAR2) $(LIB_PATH)/symtab.cc

# rm -f stays quiet and succeeds when there is nothing to remove.
clean: clean.cmdline
	rm -f *.o
	rm -f lem

clean.cmdline:
	rm -f cmdline.*

copy:
ifdef UTT_BIN_DIR
	cp lem $(UTT_BIN_DIR)
endif
|
5
app/src/lem/cmdline_lem.ggo
Normal file
5
app/src/lem/cmdline_lem.ggo
Normal file
@ -0,0 +1,5 @@
|
||||
# Options specific to lem. The Makefile concatenates this file with
# ../common/cmdline_common.ggo before running gengetopt.
package "lem"
version "0.1"

option "dictionary-home" - "Dictionary home directory" string typestr="FILENAME" hidden no
option "dictionary" d "Dictionary" string typestr="FILENAME" default="lem.bin" no
|
41
app/src/lem/common_lem.cc
Normal file
41
app/src/lem/common_lem.cc
Normal file
@ -0,0 +1,41 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "common_lem.h"
|
||||
|
||||
char dictionary[255];
|
||||
|
||||
/* Works out the dictionary file name and stores it in `dictionary`.
 *
 * Order of precedence:
 *   1. an explicitly given --dictionary,
 *   2. <dictionary-home>/<language>/lem.bin when both were given,
 *   3. the default value of --dictionary declared in cmdline_lem.ggo
 *      (previously `dictionary` was left empty in this case).
 *
 * Exits with status 1 when the composed path does not fit into `dictionary`.
 */
void process_lem_options(gengetopt_args_info* args)
{

  if(args->dictionary_given)
    {
      expand_path(args->dictionary_arg,dictionary);
    }
  else if (args->dictionary_home_given && args->language_given)
    {
      char buf[255];
      expand_path(args->dictionary_home_arg, buf);
      // Bounded formatting: sprintf could write past the end of dictionary.
      int n = snprintf(dictionary, sizeof(dictionary), "%s/%s/lem.bin", buf, args->language_arg);
      if (n < 0 || (size_t)n >= sizeof(dictionary))
	{
	  fprintf(stderr, "lem: Dictionary path too long.\n");
	  exit(1);
	}
    }
  else if (args->dictionary_arg)
    {
      // Nothing given explicitly: fall back to the declared default.
      expand_path(args->dictionary_arg,dictionary);
    }
}
|
||||
|
||||
|
||||
// STARE
|
||||
// if(args.dictionary_given)
|
||||
// strcpy(dictionary, args.dictionary_arg);
|
||||
// else {
|
||||
// char path[256];
|
||||
// //sprintf(path, "/etc/utt/data/%s/%s", args.locale_arg, DICT_FILE);
|
||||
// //if (file_accessible(path) == 0)
|
||||
// // strcpy(dictionary, path);
|
||||
// //else {
|
||||
// sprintf(path, "%s/%s", utt_dir, DICT_FILE);
|
||||
// if (file_accessible(path) == 0)
|
||||
// strcpy(dictionary, path);
|
||||
// else {
|
||||
// fprintf(stderr, "Cannot find dictionary!\n");
|
||||
// exit(1);
|
||||
// }
|
||||
// //}
|
||||
// }
|
||||
|
16
app/src/lem/common_lem.h
Normal file
16
app/src/lem/common_lem.h
Normal file
@ -0,0 +1,16 @@
|
||||
#ifndef __COMMON_LEM__H
|
||||
#define __COMMON_LEM__H
|
||||
|
||||
#include <stdio.h>
|
||||
#define _CMDLINE_FILE "../lem/cmdline.h"
|
||||
#include "../common/common.h"
|
||||
|
||||
#include "cmdline.h"
|
||||
|
||||
#define DICT_FILE "lem.bin"
|
||||
|
||||
extern char dictionary[];
|
||||
|
||||
extern void process_lem_options(gengetopt_args_info* args);
|
||||
|
||||
#endif
|
152
app/src/lem/lem.cc
Normal file
152
app/src/lem/lem.cc
Normal file
@ -0,0 +1,152 @@
|
||||
#include "lem.h"
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
|
||||
/* Znajduje opisy slownikowe dla wyrazu.
|
||||
* Parametry:
|
||||
* form - wyraz,
|
||||
* tab - referencja do tablicy Words (miejsce na wyniki)
|
||||
* Wartosc:
|
||||
* liczba dodanych opisow
|
||||
*/
|
||||
int Lem::ana(const char* form, Words& tab) {
|
||||
|
||||
// sprawdzamy czy parametry wywolania sa poprawne
|
||||
assert(form && &tab);
|
||||
int count0 = tab.count();
|
||||
long l;
|
||||
if ((l=_dict.next(_dict.gtra(0, form, FT::ftMAXPATH), ';'))>=0)
|
||||
add_to_table(tab, form, l);
|
||||
return tab.count()-count0;
|
||||
}
|
||||
|
||||
|
||||
/* Szukamy opisu slownikowego nastepnego wyrazu w buforze.
|
||||
* Parametry:
|
||||
* buf - bufor
|
||||
* tab - miejsce na wyniki
|
||||
* Wartosc:
|
||||
* ilosc dodanych opisow
|
||||
*/
|
||||
int Lem::pref(char* buf, Words& tab) {
|
||||
|
||||
// sprawdzamy czy parametry wywolania sa poprawne
|
||||
assert(buf && &tab);
|
||||
|
||||
int count0 = tab.count();
|
||||
long l;
|
||||
char* buf0 = buf;
|
||||
|
||||
if((l=_dict.pref(buf, ';'))>=0) {
|
||||
char form[MAX_FORM];
|
||||
int len=buf-buf0;
|
||||
form[len]='\0';
|
||||
add_to_table(tab,form,l);
|
||||
}
|
||||
return tab.count() - count0;
|
||||
}
|
||||
|
||||
/* Dodaje kolejne opisy do tablicy wynikow.
|
||||
* Parametry:
|
||||
* tab - tablica wynikow,
|
||||
* f - wyraz,
|
||||
* s - stan, na ktorym zaczyna sie pierwszy opis
|
||||
*/
|
||||
void Lem::add_to_table(Words& tab, const char* f, long s) {

  // parameter checks
  assert(&tab);
  assert(f);

  char des[FT::ftMAXPATH];

  // The first cont() call starts at state s; passing -1 afterwards asks
  // for the next continuation.
  for (long state = s; _dict.cont(state, des); state = -1) {
    // One continuation may hold several ';'-separated descriptions.
    for (char* item = strtok(des, ";"); item != NULL; item = strtok(NULL, ";")) {
      if (tab.count() >= MAX_ALT)
        break;                  // table is full
      tab.add(f, item);
    }
  }
}
|
||||
|
||||
void Lem::prn_dict()
|
||||
{
|
||||
|
||||
char des[FT::ftMAXPATH];
|
||||
|
||||
long s=0;
|
||||
|
||||
while (_dict.cont(s, des))
|
||||
{
|
||||
printf("%s\n",des);
|
||||
s=-1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Loads a textual dictionary.
 *
 * Every line of the file has the form "<form>;<descriptions>". The form is
 * added to the symbol table and the rest of the line is stored in `info`
 * under the index returned for the form. Lines without ';' and lines that
 * fill the whole read buffer are skipped.
 *
 * If the file cannot be opened an error is reported on stderr and the
 * dictionary stays empty.
 */
AuxLem::AuxLem(const char* filename)
  : Lem(), _dict(SIZE)
{
  for(long i=0; i<SIZE; ++i) info[i]=(char*)NULL;

  FILE* f=fopen(filename,"r");
  if(f==NULL)
    {
      fprintf(stderr,"AuxLem: Cannot open dictionary file: %s.\n", filename);
      return;
    }

  char buf[MAX_LINE+2];
  while(fgets(buf,MAX_LINE,f))
    {
      int l=strlen(buf);
      if(l>=MAX_LINE-1) continue; // WITHOUT isalpha!

      // Remove the newline only if it is there (the last line may lack it).
      if(l>0 && buf[l-1]=='\n') buf[l-1]='\0';

      char* sep=strchr(buf,';');
      if(sep==NULL) continue;
      *sep='\0';                  // buf now holds just the form

      long formind=_dict.add(buf);
      if(formind>=0)
	{
	  // Release a description stored earlier under the same index
	  // (free(NULL) is a no-op for a fresh slot).
	  free(info[formind]);
	  info[formind]=strdup(sep+1);
	}
      else
	fprintf(stderr,"AuxLem: Form not added: %s;%s.\n", buf,sep+1);
    }
  fclose(f);
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
// Releases every description string stored in `info`.
AuxLem::~AuxLem()
{
  //   for(long i=0; i<_dict.count(); ++i)
  //     free(info[_dict.hashindex(i)]);
  for (char** slot = info; slot != info + SIZE; ++slot) {
    if (*slot != NULL)
      free(*slot);
  }
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
int AuxLem::ana(const char* form, Words& tab)
|
||||
{
|
||||
if(!form) return 0;
|
||||
int count0=tab.count();
|
||||
char des[MAX_LINE];
|
||||
long ind=_dict[form];
|
||||
if(ind>=0)
|
||||
{
|
||||
strcpy(des,info[ind]);
|
||||
char* des1;
|
||||
if((des1=strtok(des,";"))!=NULL)
|
||||
do
|
||||
{
|
||||
if(tab.cnt>=MAXALT) break;
|
||||
tab.add(form,des1);
|
||||
des1=strtok(NULL,";");
|
||||
} while(des1!=NULL);
|
||||
}
|
||||
return tab.count()-count0;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
50
app/src/lem/lem.h
Normal file
50
app/src/lem/lem.h
Normal file
@ -0,0 +1,50 @@
|
||||
#include "../lib/tfti.h"
|
||||
#include "../lib/word.h"
|
||||
#include "../lib/symtab.h"
|
||||
#include "../lib/const.h"
|
||||
|
||||
class Lem {
|
||||
|
||||
protected:
|
||||
// Alphabet& _alpha;
|
||||
|
||||
// slownik
|
||||
TFTiv<char,char> _dict;
|
||||
|
||||
void add_to_table(Words& tab, const char* f, long s);
|
||||
|
||||
public:
|
||||
|
||||
Lem() {};
|
||||
Lem(const char* d)
|
||||
: _dict(d) {};
|
||||
virtual int ana(const char* form, Words& tab);
|
||||
int pref(char* form, Words& tab);
|
||||
void prn_dict();
|
||||
|
||||
};
|
||||
|
||||
|
||||
class AuxLem : public Lem {
|
||||
public:
|
||||
|
||||
static const int SIZE=1500000;
|
||||
// static const int MAXLINE=1000;
|
||||
static const int MAXALT=256;
|
||||
|
||||
AuxLem(const char* filename);
|
||||
~AuxLem();
|
||||
|
||||
// int ana(const char* form, Grams& tab);
|
||||
int ana(const char* form, Words& tab);
|
||||
|
||||
// operator bool() { return _dict && info; }
|
||||
|
||||
private:
|
||||
SymbolTable _dict;
|
||||
char* info[SIZE];
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
132
app/src/lem/main.cc
Normal file
132
app/src/lem/main.cc
Normal file
@ -0,0 +1,132 @@
|
||||
#include "../lib/iotools.h"
|
||||
#define _CMDLINE_FILE "../lem/cmdline.h"
|
||||
#include "../common/common.h"
|
||||
#include "common_lem.h"
|
||||
#include "lem.h"
|
||||
#include "cmdline.h"
|
||||
#include <locale.h>
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
// setlocale(LC_CTYPE,""); //PO CO TO?
|
||||
// setlocale(LC_COLLATE,""); //
|
||||
|
||||
gengetopt_args_info args;
|
||||
|
||||
if(cmdline_parser(argc, argv, &args) != 0)
|
||||
exit(1);
|
||||
|
||||
process_config_files(&args,argv[0]);
|
||||
process_common_options(&args,argv[0]);
|
||||
process_lem_options(&args);
|
||||
|
||||
char line[MAX_LINE+1];
|
||||
char outline[MAX_LINE+1];
|
||||
char parms[MAX_LINE+1], desc[MAX_LINE+1], lemma[MAX_LINE+1];
|
||||
long line_count = 0;
|
||||
|
||||
Lem* lem;
|
||||
|
||||
if(strcmp(dictionary+strlen(dictionary)-4,".bin")==0)
|
||||
lem = new Lem(dictionary);
|
||||
else if(strcmp(dictionary+strlen(dictionary)-4,".dic")==0)
|
||||
lem = new AuxLem(dictionary);
|
||||
else
|
||||
fprintf(stderr,"lem: Invalid dictionary file extension.\n");
|
||||
|
||||
Words tab;
|
||||
// Segment seg;
|
||||
|
||||
while (fgets(line, MAX_LINE, inputf))
|
||||
{
|
||||
// strcpy(outline,line);
|
||||
++line_count;
|
||||
|
||||
int start, len;
|
||||
|
||||
if (!process_seg(line, args)) // TO POWINNO BYC WCZESNIEJ ZABEZPIECZONE
|
||||
fputs(line, outputf);
|
||||
else
|
||||
{
|
||||
char form[MAX_FORM];
|
||||
|
||||
tab.clear();
|
||||
getfield(line,input_field_prefix,form);
|
||||
if (form==NULL) continue;
|
||||
|
||||
lem->ana(form, tab);
|
||||
if(tab.count()==0)
|
||||
{
|
||||
char form1[MAX_FORM]; // tymczasowo tak, trzeba zmienic ana
|
||||
char* p;
|
||||
strcpy(form1,form);
|
||||
for(p=form1;*p;++p) *p=tolower(*p);
|
||||
p=form1;
|
||||
lem->ana(p,tab);
|
||||
}
|
||||
|
||||
if (tab.count() == 0)
|
||||
fputs(line, failedf);
|
||||
else
|
||||
{ // mamy jakies opisy w slowniku
|
||||
|
||||
if(one_line)
|
||||
{
|
||||
char* descp=desc;
|
||||
for (int i=0; i< tab.count(); ++i)
|
||||
{
|
||||
descp += sprintf(descp," %s%s,%s", output_field_prefix, tab[i].lemma(), tab[i].descr());
|
||||
}
|
||||
strcpy(outline,line);
|
||||
outline[strlen(outline)-1]='\0';
|
||||
strcat(outline,desc);
|
||||
strcat(outline,"\n");
|
||||
fputs(outline, outputf);
|
||||
if (copy_processed)
|
||||
fputs(line,outputf);
|
||||
}
|
||||
else if(one_field)
|
||||
{
|
||||
char* descp=desc;
|
||||
for (int i=0; i< tab.count(); ++i)
|
||||
if(i==0)
|
||||
descp += sprintf(descp," %s%s,%s", output_field_prefix, tab[i].lemma(), tab[i].descr());
|
||||
else
|
||||
{
|
||||
if(strcmp(tab[i].lemma(),tab[i-1].lemma())==0)
|
||||
descp += sprintf(descp,",%s",tab[i].descr());
|
||||
else
|
||||
descp += sprintf(descp,";%s,%s",tab[i].lemma(),tab[i].descr());
|
||||
}
|
||||
|
||||
strcpy(outline,line);
|
||||
outline[strlen(outline)-1]='\0';
|
||||
strcat(outline,desc);
|
||||
strcat(outline,"\n");
|
||||
fputs(outline, outputf);
|
||||
if (copy_processed)
|
||||
fputs(line,outputf);
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i=0; i< tab.count(); ++i)
|
||||
{
|
||||
// kolejne opisy - kolejne linie.
|
||||
sprintf(desc, " %s%s,%s\n", output_field_prefix, tab[i].lemma(), tab[i].descr());
|
||||
strcpy(outline,line);
|
||||
outline[strlen(outline)-1]='\0';
|
||||
strcat(outline,desc);
|
||||
fputs(outline, outputf);
|
||||
}
|
||||
if (copy_processed)
|
||||
fputs(line,outputf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(args.interactive_flag)
|
||||
fflush(outputf), fflush(failedf);
|
||||
|
||||
}
|
||||
cmdline_parser_free(&args);
|
||||
}
|
20
app/src/lib/Makefile
Normal file
20
app/src/lib/Makefile
Normal file
@ -0,0 +1,20 @@
|
||||
# Build rules for the objects shared by the tools.
PAR=-Wno-deprecated -m32 -O3
PAR2=-c -Wno-deprecated -m32 -O3 -static -fpermissive
LIB_PATH=../lib
COMMON_PATH=../common

.PHONY: main clean copy

main: auttools.o word.o copy

auttools.o: auttools.h auttools.cc
	g++ $(PAR2) auttools.cc

word.o: word.h word.cc
	g++ $(PAR2) word.cc

# rm -f succeeds even when no object file has been built yet.
clean:
	rm -f *.o

copy:
ifdef UTT_LIB_DIR
	cp -r perl $(UTT_LIB_DIR)/
endif
|
164
app/src/lib/auttools.cc
Normal file
164
app/src/lib/auttools.cc
Normal file
@ -0,0 +1,164 @@
|
||||
#include "auttools.h"
|
||||
//#include "/src/cpp-comm/plx/Plx.h"
|
||||
|
||||
// Reads the decimal number starting at d[j], advances j past it and returns
// its value (0 when there are no digits).
static int fullform_number(const char* d, int& j)
{
  int n=0;
  while(d[j]>='0' && d[j]<='9')
    n=n*10+(d[j++]-'0');
  return n;
}

// Advances j to the end of a text part starting at d[j]: the part ends at
// the end of the string or at the first punctuation character other
// than '*'.
static void fullform_skip_text(const char* d, int& j)
{
  while(d[j]!='\0' && (!ispunct((unsigned char)d[j]) || d[j]=='*')) j++;
}

/* Builds a full form from a base string and a difference code.
 *
 * The code `d` has one of two shapes:
 *   "<n1><s1>"            - cut n1 characters from the end of b, append s1;
 *   "<n1><s1>-<n2><s2>"   - cut n1 characters from the front of b and put s1
 *                           there, cut n2 characters from the end, append s2.
 * The result is written to `f`. When b is too short for the requested cuts,
 * or a text part does not fit the internal buffers, f is set to "<ERR>".
 *
 * Parameters:
 *   b - base string (in),
 *   d - difference code (in),
 *   f - buffer for the result (out); must be large enough for it.
 */
void fullform(const char* b, const char* d, char* f)
{
  int i,j=0;
  int n1, n2=0;
  bool g=false;         // true when the code has the two-part shape
  char s1[200], s2[200];

  s2[0]='\0';

  n1=fullform_number(d,j);
  i=j;
  fullform_skip_text(d,j);
  if(j-i>=(int)sizeof(s1)) { strcpy(f,"<ERR>"); return; }
  strncpy(s1,d+i,j-i);
  s1[j-i]='\0';

  if(d[j]=='-')
    {
      j++;
      n2=fullform_number(d,j);
      i=j;
      fullform_skip_text(d,j);
      if(j-i>=(int)sizeof(s2)) { strcpy(f,"<ERR>"); return; }
      strncpy(s2,d+i,j-i);
      s2[j-i]='\0';
      g=true;
    }

  int blen=strlen(b);
  if(g)
    {
      if(n1+n2<=blen)
	{
	  strcpy(f,s1);
	  strcat(f,b+n1);
	  f[strlen(f)-n2]='\0';
	  strcat(f,s2);
	}
      else
	strcpy(f,"<ERR>");
    }
  else
    {
      if(n1<=blen)
	{
	  strcpy(f,b);
	  f[strlen(f)-n1]='\0';
	  strcat(f,s1);
	}
      else
	strcpy(f,"<ERR>");
    }
}
|
||||
|
||||
// Builds `form` from `stem` and `ending`: every '*' in the stem is replaced
// by the ending; if the stem contains no '*', the ending is appended at the
// end. `form` must be large enough to hold the result.
void compose(char* stem, char* ending, char* form)
{
  const size_t ending_len=strlen(ending);
  bool ending_placed=false;

  for(; *stem!='\0'; ++stem)
    {
      if(*stem!='*')
	{
	  *form++=*stem;
	  continue;
	}
      // a '*' marks the place of the ending
      strcpy(form,ending);
      form+=ending_len;
      ending_placed=true;
    }

  if(!ending_placed)
    {
      strcpy(form,ending);
      form+=ending_len;
    }

  *form='\0';
}
|
||||
|
||||
void autodescr(const char* f, const char* des, char* lemma, char* pos, char* attr)
|
||||
{
|
||||
char lemd[MAXWORDLEN];
|
||||
int o,l=strcspn(des,",");
|
||||
strncpy(lemd,des,l);
|
||||
lemd[l]='\0';
|
||||
fullform(f,lemd,lemma);
|
||||
o=l+1;
|
||||
l=strcspn(des+o,"/:");
|
||||
strncpy(pos,des+o,l);
|
||||
pos[l]='\0';
|
||||
o=o+l;
|
||||
if(des[o]=='/')
|
||||
{
|
||||
o++;
|
||||
strcpy(attr,des+o);
|
||||
}
|
||||
else
|
||||
attr[0]='\0';
|
||||
}
|
||||
|
||||
|
||||
// Returns the length of the longest common prefix of s and t.
int common_prefix(const char* s, const char* t)
{
  int n=0;
  for(; s[n]!='\0' && s[n]==t[n]; ++n)
    {
      // nothing to do - the loop header does the counting
    }
  return n;
}
|
||||
|
||||
int strdiff(const char* s, const char* t,
|
||||
int& frontcut, char* prefix, int& endcut, char* suffix)
|
||||
{
|
||||
int slen=strlen(s);
|
||||
int tlen=strlen(t);
|
||||
int ss, ss_max=0; /* ss - s shift */
|
||||
int ts, ts_max=0; /* ts - t shift */
|
||||
int common, common_max=0;
|
||||
for(ss=0;ss<slen;ss++)
|
||||
for(ts=0;ts<tlen;ts++)
|
||||
if( (common=common_prefix(s+ss,t+ts))>common_max
|
||||
&& (common>4 || (ss==0 && ts==0 && common>1)) )
|
||||
{
|
||||
ss_max=ss;
|
||||
ts_max=ts;
|
||||
common_max=common;
|
||||
}
|
||||
// print "--", tsmax,"\n"
|
||||
printf("--%d\n", ts_max);
|
||||
frontcut=ss_max;
|
||||
strncpy(prefix,t,ts_max); prefix[ts_max]='\0';
|
||||
endcut=slen-ss_max-common_max;
|
||||
strcpy(suffix,t+ts_max+common_max);
|
||||
return common_max;
|
||||
}
|
||||
|
||||
void fprndiff(FILE* f, const char* s, const char* t)
|
||||
{
|
||||
int frontcut,endcut;
|
||||
char pref[MAXWORDLEN],suff[MAXWORDLEN];
|
||||
strdiff(s,t,frontcut,pref,endcut,suff);
|
||||
if(frontcut!=0 || pref[0]!='\0')
|
||||
fprintf(f,"%d%s-%d%s",frontcut,pref,endcut,suff);
|
||||
else
|
||||
fprintf(f,"%d%s",endcut,suff);
|
||||
}
|
||||
|
||||
void sprndiff(char* outstr, const char* s, const char* t)
|
||||
{
|
||||
int frontcut,endcut;
|
||||
char pref[MAXWORDLEN],suff[MAXWORDLEN];
|
||||
strdiff(s,t,frontcut,pref,endcut,suff);
|
||||
if(frontcut!=0 || pref[0]!='\0')
|
||||
sprintf(outstr,"%d%s-%d%s",frontcut,pref,endcut,suff);
|
||||
else
|
||||
sprintf(outstr,"%d%s",endcut,suff);
|
||||
}
|
||||
|
||||
|
||||
// Copies into `pos` the run of upper-case letters that follows the first
// ',' of `des`. `pos` becomes an empty string when `des` has no ',' or no
// upper-case letter follows it.
void despos(const char* des, char* pos)
{
  const char* in=des;
  char* out=pos;

  // find the first ',' (or the end of the string)
  while(*in!=',' && *in!='\0') ++in;

  if(*in==',')
    for(++in; isupper(*in); ++in)
      *out++=*in;

  *out='\0';
}
|
||||
|
39
app/src/lib/auttools.h
Normal file
39
app/src/lib/auttools.h
Normal file
@ -0,0 +1,39 @@
|
||||
|
||||
#ifndef _Auttools_h
|
||||
#define _Auttools_h
|
||||
|
||||
#include <stdio.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/* #define ISALPHAG(c) ((c>='A' && c<='Z') || (c>='a' && c<='z') || \ */
|
||||
/* c=='¡' || c=='±' || c=='Æ' || c=='æ' || \ */
|
||||
/* c=='Ê' || c=='ê' || c=='£' || c=='³' || \ */
|
||||
/* c=='Ñ' || c=='ñ' || c=='Ó' || c=='ó' || \ */
|
||||
/* c=='¦' || c=='¶' || c=='¬' || c=='¼' || \ */
|
||||
/* c=='¯' || c=='¿' || c=='*') */
|
||||
|
||||
#define MAXWORDLEN 64
|
||||
|
||||
extern void fullform(const char* b, const char* d, // in
|
||||
char* f); // out
|
||||
|
||||
extern void compose(char* stem, char* ending, // in
|
||||
char* form); // out
|
||||
|
||||
extern void autodescr(const char* f, const char* des, // in
|
||||
char* lemma, char* pos, char* attr); // out
|
||||
|
||||
// The input strings are const, matching the definition in auttools.cc;
// with non-const parameters this declared an overload that is not defined.
extern int strdiff(const char* s, const char* t,              // in
		   int& frontcut, char* prefix,                // out
		   int& endcut, char* suffix);                 // out
|
||||
|
||||
extern void fprndiff(FILE* f, const char* s, const char* t);// in
|
||||
|
||||
extern void sprndiff(char* outstr, const char* s, const char* t); // in
|
||||
|
||||
extern void despos(const char* des, // in
|
||||
char* pos); // out
|
||||
|
||||
#endif
|
24
app/src/lib/const.h
Normal file
24
app/src/lib/const.h
Normal file
@ -0,0 +1,24 @@
|
||||
|
||||
|
||||
|
||||
// maksymalna dlugosc wyrazu
|
||||
#define MAX_FORM 80
|
||||
|
||||
// maksymalna dlugosc opisu
|
||||
#define MAX_DESC 80
|
||||
|
||||
// maksymalna dlogosc lini w pliku przejsciowym
|
||||
#define MAX_LINE 1024
|
||||
|
||||
// separator pol w pliku posrednim
|
||||
#define FIELD_SEP " \t\n"
|
||||
|
||||
// maksymalna liczba alternatywnych opisow
|
||||
#define MAX_ALT 256
|
||||
|
||||
// plik ze slownikiem dla guessa
|
||||
#define GUESS_DICT_FILE "slownik.fsa"
|
||||
|
||||
// katalogi z plikami konfiguracyjnymi
|
||||
#define SYSTEM_CONFIG_DIR "/usr/local/etc/utt"
|
||||
#define USER_CONFIG_DIR "~/.utt"
|
53
app/src/lib/iotools.h
Normal file
53
app/src/lib/iotools.h
Normal file
@ -0,0 +1,53 @@
|
||||
#include "const.h"
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
// napisy zostaj na miejscu (w line), tylko wskazniki sa ustawian
|
||||
// i zara dopisywane zera s dopisywane
|
||||
|
||||
inline
|
||||
int parsetok(char* line, int* a, int* b, char** c, char** d, char** e, char** f)
|
||||
{
|
||||
char* field;
|
||||
if((field=strtok(line,FIELD_SEP))!=NULL)
|
||||
*a=atoi(field); // nie sprawdzana poprawnosc
|
||||
else
|
||||
return 0;
|
||||
if((field=strtok(NULL,FIELD_SEP))!=NULL)
|
||||
*b=atoi(field); // nie sprawdzana poprawnosc
|
||||
else return 1;
|
||||
if((*c=strtok(NULL,FIELD_SEP))==NULL) return 2;
|
||||
if((*d=strtok(NULL,FIELD_SEP))==NULL) return 3;
|
||||
if((*e=strtok(NULL,FIELD_SEP))==NULL) return 4;
|
||||
if((*f=strtok(NULL,FIELD_SEP))==NULL) return 6;
|
||||
return 6;
|
||||
}
|
||||
|
||||
// Parse one intermediate-file line; the string fields are copied into the
// caller's buffers c, d and (optionally) e, f.
//
// Only as many string fields are scanned as there are non-NULL destination
// buffers: passing NULL to a %s conversion is undefined behaviour, which the
// previous single-format version triggered whenever the line had more fields
// than the caller supplied buffers for. `f` is used only when `e` is given.
//
// Returns sscanf()'s result: the number of fields assigned, or EOF.
// NOTE(review): %s is unbounded; the buffers must be large enough for the
// longest field (presumably MAX_FORM / MAX_DESC - confirm with callers).
inline
int scantok(const char* line, int* a, int* b, char* c, char* d, char* e=NULL, char* f=NULL)
{
  if(e==NULL)
    return sscanf(line," %d %d %s %s", a, b, c, d);
  if(f==NULL)
    return sscanf(line," %d %d %s %s %s", a, b, c, d, e);
  return sscanf(line," %d %d %s %s %s %s", a, b, c, d, e, f);
}
|
||||
|
||||
// Format a token line with six fields followed by a back-quoted parameter
// string into `line`.
// Returns the number of characters written (sprintf's result); previously the
// function fell off its end without returning a value, which is undefined
// behaviour for a function declared to return int.
// The caller must provide a buffer large enough for the formatted text.
inline
int printtok(char* line, int a, int b, char* c, char* d, char* e, char* f, char* parms)
{
  return sprintf(line,"%04d %02d %s %s %s %s `%s\n", a, b, c, d, e, f, parms);
}
|
||||
|
||||
// Format a token line with six fields into `line`.
// Returns the number of characters written (sprintf's result); previously the
// function fell off its end without returning a value, which is undefined
// behaviour for a function declared to return int.
// The caller must provide a buffer large enough for the formatted text.
inline
int printtok(char* line, int a, int b, char* c, char* d, char* e, char* f)
{
  return sprintf(line,"%04d %02d %s %s %s %s\n", a, b, c, d, e, f);
}
|
||||
|
||||
// Format a token line with four fields into `line`.
// Returns the number of characters written (sprintf's result); previously the
// function fell off its end without returning a value, which is undefined
// behaviour for a function declared to return int.
// The caller must provide a buffer large enough for the formatted text.
inline
int printtok(char* line, int a, int b, char* c, char* d)
{
  return sprintf(line,"%04d %02d %s %s\n", a, b, c, d);
}
|
86
app/src/lib/matchdescr.cc
Normal file
86
app/src/lib/matchdescr.cc
Normal file
@ -0,0 +1,86 @@
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// <ctype.h> classification is undefined for negative char values, so every
// call goes through unsigned char.
static inline bool attr_is_upper(char ch) { return isupper((unsigned char)ch)!=0; }
static inline bool attr_is_lower(char ch) { return islower((unsigned char)ch)!=0; }

// Skip one attribute: its (upper-case) name followed by its (lower-case) values.
static inline const char* attr_skip(const char* s)
{
  while(attr_is_upper(*s)) ++s;
  while(attr_is_lower(*s)) ++s;
  return s;
}

// Test whether two attribute lists are compatible.
//
// Each list is a sequence of attributes; an attribute is an upper-case name
// followed by one or more lower-case values. Both lists are walked in
// parallel in a merge-like fashion:
//   - an attribute present in only one list is skipped,
//   - for an attribute present in both lists the value sets must share at
//     least one value, otherwise the result is false.
// The walk relies on names and values being in ascending order (canonical
// form).
//
// Returns true when no shared attribute has disjoint value sets. Malformed
// input on which neither cursor can advance (e.g. an attribute name without
// any value) now yields false; previously it looped forever.
inline
bool inline_matchattr(const char* a, const char* b)
{
  while(*a && *b)
  {
    const char* p=a;   // cursor inside the current attribute of a
    const char* q=b;   // cursor inside the current attribute of b
    bool names_differ=false;

    // compare the attribute names character by character
    while(attr_is_upper(*p) && attr_is_upper(*q))
    {
      if(*p==*q) { ++p; ++q; continue; }
      if(*p<*q)
        a=attr_skip(p);   // a's attribute is smaller: move a to its next attribute
      else
        b=attr_skip(q);   // b's attribute is smaller: move b to its next attribute
      names_differ=true;
      break;
    }
    if(names_differ) continue;

    if(attr_is_lower(*p) && attr_is_lower(*q))   // same attribute name
    {
      a=p; b=q;
      // look for a common value in the two sorted value lists
      while(*a != *b)
      {
        if(*a > *b && !attr_is_lower(*++b)) return false;
        if(*a < *b && !attr_is_lower(*++a)) return false;
      }
      // common value found: move both cursors to their next attribute
      a=attr_skip(a);
      b=attr_skip(b);
      continue;
    }

    if(attr_is_lower(*p))   // a's name is shorter, hence smaller
    {
      a=p;
      while(attr_is_lower(*a)) ++a;
      continue;
    }

    if(attr_is_lower(*q))   // b's name is shorter, hence smaller
    {
      b=q;
      while(attr_is_lower(*b)) ++b;
      continue;
    }

    // Neither cursor can advance: the input is not in canonical form.
    return false;
  }
  return true;
}
|
||||
|
||||
|
||||
bool matchattr(const char* a, const char* b)
|
||||
{
|
||||
return inline_matchattr(a,b);
|
||||
}
|
||||
|
||||
bool matchdescr(const char* a, const char* b)
|
||||
{
|
||||
while(isupper(*a) && isupper(*b) && *a==*b) ++a, ++b;
|
||||
if(*a=='\0')
|
||||
if(*b=='\0' || *b=='/') return true;
|
||||
else return false;
|
||||
|
||||
if(*a=='/')
|
||||
if(*b=='\0') return true;
|
||||
else if(*b=='/') return inline_matchattr(++a, ++b);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
int main()
|
||||
{
|
||||
char a[100], b[100];
|
||||
while(scanf("%s %s", a, b)==2)
|
||||
printf("%s & %s = %d\n", a, b, matchdescr(a,b));
|
||||
}
|
10
app/src/lib/matchdescr.h
Normal file
10
app/src/lib/matchdescr.h
Normal file
@ -0,0 +1,10 @@
|
||||
|
||||
// obie funkcje wymagaja by deskrypcje byly w postaci kanonicznej
|
||||
// obslugiwane sa tylko krotkie (jednoliterowe) atrybuty
|
||||
|
||||
// test czy zgadzaja sie deskrypcje
|
||||
bool matchdescr(const char* a, const char* b);
|
||||
|
||||
// test czy zgadzaja sie same atrybuty (czyli to, co po ukosniku)
|
||||
bool matchattr(const char* a, const char* b);
|
||||
|
171
app/src/lib/symtab.cc
Normal file
171
app/src/lib/symtab.cc
Normal file
@ -0,0 +1,171 @@
|
||||
#include "symtab.h"
|
||||
#include <values.h>
|
||||
#include <stdio.h>
|
||||
#include <alloc.h>
|
||||
#include <stdlib.h>
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
// Build a table for at most n symbols using the hash function h.
// The number of hash slots is first(n); all slots start out empty. When a
// file name is given, its whitespace-separated words are added right away
// (the result of add_from_file() is ignored).
SymbolTable::SymbolTable(int n, int (*h)(const char*,int), const char* filename)
  : _mx(n), _cnt(0), hash(h)
{
  _sz=first(n);

  _key=new char*[_sz];
  _defind=new int[_sz];
  _hashind=new int[_sz];
  _def=new char*[_mx];

  // mark every hash slot as free
  int slot=0;
  while(slot<_sz)
  {
    _key[slot]=NULL;
    ++slot;
  }

  if(filename)
  {
    add_from_file(filename);
  }
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
// Build a table for at most n symbols using the default hash function hash1.
// The number of hash slots is first(n); all slots start out empty. When a
// file name is given, its whitespace-separated words are added right away
// (the result of add_from_file() is ignored).
SymbolTable::SymbolTable(int n, const char* filename)
  : _mx(n), _cnt(0), hash(hash1)
{
  _sz=first(n);

  _key=new char*[_sz];
  _defind=new int[_sz];
  _hashind=new int[_sz];
  _def=new char*[_mx];

  // mark every hash slot as free
  int slot=0;
  while(slot<_sz)
  {
    _key[slot]=NULL;
    ++slot;
  }

  if(filename)
  {
    add_from_file(filename);
  }
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
// Release the stored key strings first (clear()), then the four arrays
// allocated by the constructors.
SymbolTable::~SymbolTable()
{
  clear();

  delete[] _def;
  delete[] _hashind;
  delete[] _defind;
  delete[] _key;
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
void SymbolTable::clear()
|
||||
{
|
||||
for(int i=0; i<_sz; ++i)
|
||||
if(_key[i])
|
||||
free(_key[i]);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
bool SymbolTable::add_from_file(const char* filename)
|
||||
{
|
||||
FILE* in=fopen(filename,"r");
|
||||
char buf[MAXKEYLEN+1];
|
||||
|
||||
if(in)
|
||||
while(fscanf(in,"%s",buf)==1)
|
||||
{
|
||||
if(strlen(buf)==MAXKEYLEN || add(buf)<0)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
int SymbolTable::add(const char* s)
|
||||
{
|
||||
if(_cnt<_mx)
|
||||
{
|
||||
int ind=hash(s,_sz);
|
||||
while(_key[ind])
|
||||
if(strcmp(_key[ind],s))
|
||||
ind=++ind%_sz;
|
||||
else
|
||||
return _defind[ind];
|
||||
_key[ind]=strdup(s);
|
||||
_defind[ind]=_cnt;
|
||||
_hashind[_cnt]=ind;
|
||||
_def[_cnt]=_key[ind];
|
||||
_cnt++;
|
||||
return _cnt-1;
|
||||
}
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
int SymbolTable::operator[](const char* s)
|
||||
{
|
||||
int ind=hash(s,_sz);
|
||||
while(_key[ind])
|
||||
if(strcmp(_key[ind],s)==0)
|
||||
return _defind[ind];
|
||||
else
|
||||
ind=++ind % _sz;
|
||||
return -1;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
int SymbolTable::first(unsigned int n)
|
||||
{
|
||||
int fi=n;
|
||||
int bound=(n/2 < MAXKEYLEN)? n/2 : MAXKEYLEN;
|
||||
bool found;
|
||||
do
|
||||
{
|
||||
found=true;
|
||||
if(fi++ == MAXINT) return -1;
|
||||
for(int i=2; i<bound; i++)
|
||||
if(fi%i==0) { found=false; break; }
|
||||
} while(!found);
|
||||
return fi;
|
||||
}
|
||||
|
||||
float SymbolTable::search_rate()
|
||||
{
|
||||
long s=0;
|
||||
for(int i=0; i<_sz; i++)
|
||||
if(_key[i])
|
||||
s+=(i+_sz-hash(_key[i],_sz))%_sz+1;
|
||||
return _cnt ? (float)s/(float)_cnt : 0;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
// Hash function: sizeof(int) bytes taken from the middle of the key (or the
// whole key when it is shorter than 4 characters) plus the product of the
// first and last character, reduced modulo _sz. The result is in [0,_sz).
//
// Fixes, with unchanged results for non-empty keys on the usual platforms:
//  - an empty key no longer reads s[-1]; it hashes to 0 (the value the old
//    expression evaluated to as well, since the first character is 0),
//  - the middle bytes are copied with memcpy instead of being read through a
//    possibly misaligned int pointer,
//  - the addition is done in unsigned arithmetic so it cannot overflow.
// Assumes sizeof(int) >= 4, as the original code did.
int hash1(const char* s, int _sz)
{
  const int l=strlen(s);
  if(l==0)
    return 0;

  int word=0;
  if(l>=4)
    memcpy(&word, s+(l/2-2), sizeof(int));
  else
    memcpy(&word, s, l);   // remaining bytes stay zero

  const int ends=(int)(*s * s[l-1]);
  const int sum=(int)((unsigned int)word + (unsigned int)ends);
  return abs(sum % _sz);
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
// Hash function. For keys of 6 or more characters: the sum of three
// sizeof(int)-byte words taken from the beginning, the middle and the end of
// the key, modulo _sz. For shorter keys: the first (up to) sizeof(int) bytes
// plus the product of the first and last character, modulo _sz.
// The result is in [0,_sz).
//
// Fixes, with unchanged results for non-empty keys on the usual platforms:
//  - an empty key no longer reads s[-1]; it hashes to 0,
//  - abs() is no longer applied to an unsigned value (ambiguous overload in
//    standard C++); the unsigned remainder is already non-negative,
//  - the short-key addition is done in unsigned arithmetic so it cannot
//    overflow.
int hash2(const char* s, int _sz)
{
  const int l=strlen(s);
  if(l==0)
    return 0;

  if(l>=6)
  {
    unsigned int i1=0, i2=0, i3=0;
    memcpy(&i1, s, sizeof(int));
    memcpy(&i2, s+(l/2-2), sizeof(int));
    memcpy(&i3, s+(l-4), sizeof(int));
    return (int)((i1+i2+i3) % (unsigned int)_sz);
  }

  int word=0;
  strncpy((char*)&word, s, sizeof(int));   // pads with zeros after a short key
  const int ends=(int)(*s * s[l-1]);
  const int sum=(int)((unsigned int)word + (unsigned int)ends);
  return abs(sum % _sz);
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
52
app/src/lib/symtab.h
Normal file
52
app/src/lib/symtab.h
Normal file
@ -0,0 +1,52 @@
|
||||
#ifndef _HashTable_h
|
||||
#define _HashTable_h
|
||||
//---------------------------------------------------------------------------
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
//---------------------------------------------------------------------------
|
||||
int hash1(const char* s, int sz);
|
||||
int hash2(const char* s, int sz);
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
// Fixed-capacity symbol table: maps strings to consecutive integer indices
// (0, 1, 2, ... in insertion order) and back, using an open-addressing hash
// table with linear probing.
//
// NOTE(review): the class owns raw arrays but declares neither a copy
// constructor nor an assignment operator; copying an instance would lead to
// a double delete. Confirm that no caller copies SymbolTable objects.
class SymbolTable
{
  int _mx;        // maximum number of symbols (constructor argument n)
  int _sz;        // number of hash slots, first(n)
  int _cnt;       // number of symbols currently stored
  char** _key;    // hash slots: key string or NULL when the slot is free
  char** _def;    // symbol index -> key string
  int* _defind;   // hash slot -> symbol index
  int* _hashind;  // symbol index -> hash slot (there are redundancies here)

public:
  // Words of this length or longer are rejected by add_from_file().
  static const unsigned int MAXKEYLEN=2000;

  SymbolTable(int n, int (*h)(const char*,int), const char* filename=NULL);
  SymbolTable(int n, const char* filename=NULL);
  ~SymbolTable();

  // Remove all symbols.
  void clear();

  // Hash function in use: hash(key, number_of_slots).
  int (*hash)(const char*, int);

  // Add all whitespace-separated words of a file.
  bool add_from_file(const char* filename);

  // Add a symbol; returns its index or -1 on failure.
  int add(const char* s);

  // Index of symbol s, or -1 when it is not stored.
  int operator[](const char* s);

  // Symbol with index i, or NULL when i is out of range.
  const char* operator[](int i) { if(i<0||i>=_cnt) return NULL; else return _def[i]; }

  // Same as operator[](const char*).
  int index(const char* s) { return this->operator[](s); }

  // Returns i when it is a valid symbol index, -1 otherwise.
  int index(int i) { if(i<0||i>=_cnt) return -1; else return i; }

  // Hash slot occupied by symbol i, or -1 when i is out of range.
  // The range check was added for consistency with index(int) and symbol();
  // before, an invalid i read outside the _hashind array.
  int hash_index(int i) { if(i<0||i>=_cnt) return -1; else return _hashind[i]; }

  // Symbol with index i, or NULL when i is out of range.
  const char* symbol(int i) { if(i<0||i>=_cnt) return NULL; else return _def[i]; }

  int capacity() { return _mx; }   // maximum number of symbols
  int size() { return _sz; }       // number of hash slots
  int count() { return _cnt; }     // number of stored symbols

  // Average number of probes needed to find a stored symbol.
  float search_rate();

private:
  // Table size to use for capacity n.
  static int first(unsigned int n);
};
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
#endif
|
879
app/src/lib/tft.h
Executable file
879
app/src/lib/tft.h
Executable file
@ -0,0 +1,879 @@
|
||||
#ifndef _TFT_h
|
||||
#define _TFT_h
|
||||
//---------------------------------------------------------------------------
|
||||
#include <stddef.h>
|
||||
#include <iostream.h>
|
||||
#include <typeinfo>
|
||||
#include <string.h>
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
//#include "top.h"
|
||||
#include "ttrans.h"
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
/// Base class of a finite-state transducer.
/**
    \remark What is this class for? Whatever concerns the transitions
    themselves should move to TTrans, the rest should be merged into TFT.

    Changes: the constructor now initialises every member (states,
    transitions, itype and otype used to be left uninitialised), and the
    class has a virtual destructor because it is used as a polymorphic base.
*/
class FT
{
public:
  FT()
    : copy_default(false), print_mode(OO), ttn(0), states(0), transitions(0)
  {
    itype[0]='\0';
    otype[0]='\0';
  }

  virtual ~FT() {}

  /// Print mode.
  enum OUTPUT { II,    ///< input symbols only
                OO,    ///< output symbols only
                IOIO,  ///< output symbol after the input symbol
                OIOI,  ///< output symbol before the input symbol
                IIOO,  ///< whole input, then whole output
                OOII   ///< whole output, then whole input
              };

  /// Maximum path length.
  static const unsigned int ftMAXPATH=500;

  /// Maximum length of the description of the input/output symbol type.
  /**
      \remark Move to TTrans.
  */
  static const unsigned int ftTYPELEN=32;

  /// Special symbol for the 'epsilon' value.
  /**
      \remark Move to TTrans.
  */
  static const char ftEPSILON='~';

  /// Special symbol for the 'default' value.
  /**
      \remark Move to TTrans.
  */
  static const char ftDEFAULT='@';

  /// Default output symbol (true - '@', false - '~').
  /**
      \remark Move to TTrans (???)
  */
  bool copy_default;

  /// Output mode.
  OUTPUT print_mode;

  /// false if the automaton has no transitions.
  operator bool() { return (bool)ttn; }

  virtual const char* intype() { return itype; }
  virtual const char* outtype() { return otype; }

protected:

  /// Number of elements of the tt array.
  unsigned long ttn;

  /// Number of states.
  unsigned long states;

  /// Number of transitions.
  unsigned long transitions;

  /// Type of the input symbols (string).
  /**
      \remark Move to TTrans (???)
  */
  char itype[ftTYPELEN];

  /// Type of the output symbols (string).
  /**
      \remark Move to TTrans (???)
  */
  char otype[ftTYPELEN];
};
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
/// Szablon przetwornika skoñczonego
|
||||
/**
|
||||
\param I - typ symbolu wej¶ciowego
|
||||
\param Ipass - typ, jaki ma byæ u¿yty przy przekazywaniu symbolu we jako parametru
|
||||
do funkcji (metody), równy \a I lub \a I&
|
||||
\param O - typ symbolu wyj¶ciowego
|
||||
\param Opass - typ, jaki ma byæ u¿yty przy przekazywaniu symbolu wy jako parametru
|
||||
do funkcji (metody), równy \a O lub \a O&
|
||||
\param - typ przej¶cia, musi byæ podklas± TTrans
|
||||
*/
|
||||
template<class I, class Ipass, class O, class Opass, class TT>
|
||||
class TFT : public FT
|
||||
{
|
||||
|
||||
|
||||
public:
|
||||
|
||||
TFT() : FT(), tt(NULL) { setiotypes(); };
|
||||
|
||||
/**
|
||||
\name Metody poziomu 1
|
||||
Poziom przej¶æ.
|
||||
*/
|
||||
|
||||
//@{
|
||||
|
||||
/// Test, czy przej¶cie \a t akceptuje symbol \a in.
|
||||
bool accepts(long t, Ipass in) const;
|
||||
|
||||
/// Test, czy lista przej¶æ dla aktualnego stanu jest kontynuowana po \a t.
|
||||
bool continued(long t) const;
|
||||
|
||||
/// Stan, do którego prowadzi przej¶cie \a t.
|
||||
/**
|
||||
\pre !empty(t)
|
||||
*/
|
||||
long next(long t) const;
|
||||
|
||||
/// Symbol wej¶ciowy przej¶cia \a t.
|
||||
Ipass input(long t) const;
|
||||
|
||||
/// Symbol wyj¶ciowy przej¶cia \a t.
|
||||
Opass output(long t) const;
|
||||
|
||||
/// Zwraca \c true, je¶li symbolem we przej¶cia \a t jest epsilon.
|
||||
bool epsi(long t) const;
|
||||
|
||||
/// Zwraca \c true, je¶li symbolem we przej¶cia \a t jest symbol domy¶lny.
|
||||
bool defi(long t) const;
|
||||
|
||||
/// Zwraca \c true, je¶li symbolem wy przej¶cia \a t jest epsilon.
|
||||
bool epso(long t) const;
|
||||
|
||||
/// Zwraca \c true, je¶li symbolem wy przej¶cia \a t jest symbol domy¶lny.
|
||||
bool defo(long t) const;
|
||||
|
||||
/// Indeks przej¶cia przez \a in.
|
||||
long tra(long t, Ipass in) const;
|
||||
|
||||
/// Indeks przej¶cia przez \a in - non-deterministic.
|
||||
long tra_nd(long t, Ipass in, long nth) const;
|
||||
|
||||
//@}
|
||||
|
||||
/**
|
||||
\name Poziom 2
|
||||
Poziom stanów. Stan (indeks stanu) = indeks jego pierwszego przej¶cia
|
||||
*/
|
||||
//@{
|
||||
/// Zwraca \c true je¶li stan \a s jest pusty (nie ma z niego przej¶æ).
|
||||
bool empty(long s) const { return tt[s].empty(); }
|
||||
|
||||
/// Zwraca \c true je¶li stan \a s jest stanem koñcowym.
|
||||
bool final(long s) const { return tt[s].final(); }
|
||||
|
||||
long next(long t, Ipass in) const;
|
||||
|
||||
//long trans(const I* si, I* so, long& olen) const;
|
||||
|
||||
long gtra(long s, const I* w, long maxpath=ftMAXPATH) const;
|
||||
|
||||
//@}
|
||||
|
||||
/**
|
||||
\name Poziom 3
|
||||
Poziom ...
|
||||
*/
|
||||
//@{
|
||||
long cont(long s=-1, I* c=NULL) const;
|
||||
|
||||
long match(const I* w=NULL, long* p=NULL) const;
|
||||
|
||||
long match_nd(const I* w=NULL, long* p=NULL) const;
|
||||
|
||||
long lgstmatch(const I* w, long* p, long& plen, long maxpath=ftMAXPATH) const;
|
||||
|
||||
/*NOWE*/
|
||||
|
||||
long lgstpath(I*& buf, long*& path, long start=0) const;
|
||||
|
||||
long pref(I*& buf, I sep, long start=0) const;
|
||||
|
||||
//@}
|
||||
|
||||
protected:
|
||||
|
||||
TT* tt; // tablica przej¶æ
|
||||
|
||||
long prn(const I* si, long* p, O* so) const;
|
||||
|
||||
void prntt(ostream& os);
|
||||
|
||||
void sort();
|
||||
|
||||
void setiotypes(); // NIE DZIA£A (dlaczego???)
|
||||
|
||||
// friend ostream& operator<<(ostream&,const CDFA&);
|
||||
// friend istream& operator>>(istream&,CDFA&);
|
||||
|
||||
private:
|
||||
long prn_oo(const I* si, long* p, O* so) const;
|
||||
long prn_ioio(const I* si, long* p, O* so) const;
|
||||
long prn_oioi(const I* si, long* p, O* so) const;
|
||||
long prn_iioo(const I* si, long* p, O* so) const;
|
||||
long prn_ooii(const I* si, long* p, O* so) const;
|
||||
};
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
stan = indeks pierwszego przej¶cia
|
||||
|
||||
state(t) = stan, do którego nale¿y t
|
||||
|
||||
symbol zerowy = symbol s, dla którego (bool)s zwraca \c false,
|
||||
w przypadku znaków - '\0'
|
||||
*/
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
inline
|
||||
bool TFT<I,Ipass,O,Opass,TT>::accepts(long t, Ipass in) const
|
||||
{ return tt[t].accepts(in); }
|
||||
|
||||
/// Test whether the transition list continues after \a t.
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
inline
|
||||
bool TFT<I,Ipass,O,Opass,TT>::continued(long t) const
|
||||
{ return tt[t].continued(); }
|
||||
|
||||
/**
|
||||
\pre !empty(t)
|
||||
*/
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
inline
|
||||
long TFT<I,Ipass,O,Opass,TT>::next(long t) const
|
||||
{ return tt[t].next(); }
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
inline
|
||||
Ipass TFT<I,Ipass,O,Opass,TT>::input(long t) const
|
||||
{ return tt[t].in(); }
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
inline
|
||||
Opass TFT<I,Ipass,O,Opass,TT>::output(long t) const
|
||||
{ return tt[t].out(); }
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
inline
|
||||
bool TFT<I,Ipass,O,Opass,TT>::epsi(long t) const
|
||||
{ return tt[t].epsi(); }
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
inline
|
||||
bool TFT<I,Ipass,O,Opass,TT>::defi(long t) const
|
||||
{ return tt[t].defi(); }
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
inline
|
||||
bool TFT<I,Ipass,O,Opass,TT>::epso(long t) const
|
||||
{ return tt[t].epso(); }
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
inline
|
||||
bool TFT<I,Ipass,O,Opass,TT>::defo(long t) const
|
||||
{ return tt[t].defo(); }
|
||||
|
||||
/**
|
||||
\param +t - indeks przej¶cia
|
||||
\param +in - symbol we
|
||||
\return Indeks przj¶cia (>=\a t) dla bie¿±cego stanu, które
|
||||
akceptuje symbol we \a in lub -1, je¶li nie ma takiego przej¶cia
|
||||
*/
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>::tra(long t, Ipass in) const
|
||||
{
|
||||
if(t<0 || t>=ttn)
|
||||
return -1;
|
||||
|
||||
if(empty(t)) return -1;
|
||||
while(!accepts(t,in))
|
||||
if(continued(t))
|
||||
t++;
|
||||
else
|
||||
return -1;
|
||||
return t;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
/// Indeks przej¶cia - wersja dla automatu niedeterministycznego.
|
||||
/**
|
||||
\param +t - indeks przej¶cia
|
||||
\param +in - symbol we
|
||||
\return Indeks przj¶cia (>=\a t) dla bie¿±cego stanu, które
|
||||
akceptuje symbol we \a in lub -1, je¶li nie ma takiego przej¶cia
|
||||
Je¶li nth==0, t1>=t, w przeciwnym razie t1>t.
|
||||
*/
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>::tra_nd(long t, Ipass in, long nth) const
|
||||
{
|
||||
if(t<0 || t>=ttn)
|
||||
return -1;
|
||||
|
||||
if(nth)
|
||||
if(continued(t))
|
||||
t++;
|
||||
else
|
||||
return -1;
|
||||
else
|
||||
{ if(empty(t)) return -1; }
|
||||
|
||||
while(!accepts(t,in))
|
||||
if(continued(t))
|
||||
t++;
|
||||
else
|
||||
return -1;
|
||||
|
||||
return t;
|
||||
}
|
||||
|
||||
//}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
/// Funkcja przej¶cia.
|
||||
/**
|
||||
\param t - stan
|
||||
\param in - symbol we
|
||||
\return Stan, do którego mo¿na przej¶æ z \a t po wp³ywem symbolu \a in
|
||||
lub -1, je¶li nie ma przej¶cia przez \a in
|
||||
|
||||
*/
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>::next(long t, Ipass in) const
|
||||
{
|
||||
if(t<0 || (unsigned long)t>=ttn)
|
||||
return -1;
|
||||
|
||||
if(empty(t)) return -1;
|
||||
while(!accepts(t,in))
|
||||
if(continued(t))
|
||||
t++;
|
||||
else {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return next(t);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/// Uogólniona funkcja przejscia.
|
||||
/**
|
||||
\param +s - stan
|
||||
\param +w - wska¼nik pierwszego elementu ci±gu symboli we, zakoñczonego symbolem zerowym
|
||||
\param maxpath maksymalna d³ugo¶æ ¶cie¿ki, domy¶lnie ftMAXPATH
|
||||
\return stan osi±galny z \a s pod wp³ywem \a w (na ¶cie¿ce mog± siê pojawiæ
|
||||
epsilon-przej¶cia
|
||||
*/
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>::gtra(long s, const I* w, long maxpath) const
|
||||
{
|
||||
if(s<0 || (unsigned long)s>=ttn)
|
||||
return -1;
|
||||
|
||||
long i=0;
|
||||
while(*w)
|
||||
{
|
||||
if(i>maxpath || empty(s)) return -1;
|
||||
while(!accepts(s,*w))
|
||||
if(continued(s))
|
||||
s++;
|
||||
else
|
||||
return -1;
|
||||
if(!epsi(s)) w++;
|
||||
s=next(s);
|
||||
i++;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/// Kontynuacja.
|
||||
/**
|
||||
...
|
||||
\param +s stan, je¶li -1 - poszukiwane jest nastêpne rozwi±zanie
|
||||
\param -c ci±g symboli we ze ¶cie¿ki prowadz±cej z \a s do
|
||||
stanu koñcowego
|
||||
\return d³ugo¶æ ci±gu \a c (= d³ugo¶æ ¶cie¿ki)
|
||||
\remark DZIA£A TYLKO DLA ZNAKÓW!!!
|
||||
EPSILON-PRZEJ¦CIA NIEDOZWOLONE!!!
|
||||
*/
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>::cont(long s, I* c) const
|
||||
{
|
||||
static unsigned long path[ftMAXPATH]={0};
|
||||
static unsigned long i=0;
|
||||
static bool more=false;
|
||||
|
||||
bool found=false;
|
||||
|
||||
if(s!=-1)
|
||||
{
|
||||
if(s<0 || (unsigned long)s>=ttn)
|
||||
more=false;
|
||||
else
|
||||
{
|
||||
i=0;
|
||||
c[0]=0;
|
||||
path[0]=s;
|
||||
more=true;
|
||||
if(final(s))
|
||||
found=true;
|
||||
}
|
||||
}
|
||||
|
||||
while(more && !found)
|
||||
{
|
||||
if(!empty(path[i]) && i<ftMAXPATH)
|
||||
{
|
||||
path[i+1]=next(path[i]);
|
||||
c[i]=input(path[i]);
|
||||
i++;
|
||||
}
|
||||
else
|
||||
{
|
||||
do
|
||||
{
|
||||
if(i>0)
|
||||
c[--i]=0;
|
||||
else
|
||||
more=false;
|
||||
}while(more && !continued(path[i]));
|
||||
path[i]=path[i]+1;
|
||||
}
|
||||
if(final(path[i]))
|
||||
{
|
||||
found=true;
|
||||
c[i]=0;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/// Dopasowannie.
|
||||
/**
|
||||
\remark Nie zaimplementowane.
|
||||
*/
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>::match(const I* w, long* p) const
|
||||
{}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/// Dopasowanie niedeterministyczne.
|
||||
/**
|
||||
\param +w - wska¼nik pierwszego elementu ci±gu symboli we, zakoñczonego symbolem zerowym,
|
||||
je¶li NULL - poszukiwane jest nastêpne rozwi±zanie
|
||||
\param -p ci±g przej¶æ zakoñczony -1
|
||||
\return d³ugo¶æ dopasowania (PO CO?)
|
||||
*/
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>::match_nd(const I* w, long* p) const
|
||||
{
|
||||
static bool more=false;
|
||||
static I *w0, *wc;
|
||||
static long s=0, *p0, *pc, *pc_bound;
|
||||
|
||||
bool found=false;
|
||||
|
||||
if(w)
|
||||
{
|
||||
wc=w0=w;
|
||||
pc=p0=p;
|
||||
more=true;
|
||||
pc_bound=pc+ftMAXPATH;
|
||||
if(final(s=0))
|
||||
{
|
||||
*pc=-1; return 0;
|
||||
}
|
||||
}
|
||||
|
||||
while(more)
|
||||
{
|
||||
if(*wc && pc<pc_bound && (*pc=trand(s,*wc,0))>=0)
|
||||
{ if(!epsi(*pc)) wc++; s=next(*pc); pc++; }
|
||||
else
|
||||
while(true)
|
||||
{
|
||||
if(pc==p0) { more=false; return -1; }
|
||||
if(!epsi(*(--pc))) wc--;
|
||||
if((*pc=trand(*pc,*wc,1))>=0)
|
||||
{ if(!epsi(*pc)) wc++; s=next(*pc); pc++; break; }
|
||||
}
|
||||
if(final(s)) { *pc=-1; return wc-w0; }
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/// Najd³u¿sze dopasowanie.
|
||||
/**
|
||||
\param +w wska¼nik pierwszego elementu ci±gu symboli wej¶ciowych
|
||||
\param -p ¶cie¿ka
|
||||
\param -plen d³ugo¶æ ¶cie¿ki
|
||||
\param +maxpath maks dd³ugo¶æ ¶cie¿ki, domy¶lnie FT::ftMAXPATH
|
||||
\return d³ugo¶æ skonsumowanego wej¶cia
|
||||
*/
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>
|
||||
::lgstmatch(const I* w, long* p, long& plen, long maxpath) const
|
||||
{
|
||||
long s=0;
|
||||
long t;
|
||||
long i=0;
|
||||
const char* w0=w;
|
||||
long ilen=0;
|
||||
while(*w && i<maxpath && (t=tra(s,*w))>=0)
|
||||
{
|
||||
if(!epsi(t)) w++;
|
||||
s=next(t);
|
||||
i++;
|
||||
*(p++)=t;
|
||||
if(final(s)) { plen=i; ilen=w-w0; }
|
||||
}
|
||||
*p=-1;
|
||||
return ilen;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/// Najd³u¿sza ¶cie¿ka.
|
||||
/**
|
||||
\param +buf wska¼nik pierwszego elementu ci±gu symboli wej¶ciowych
|
||||
\param -buf pozycja jeden za skonsumowanym prefiksem
|
||||
\param +path wska¼nik pierwszego elementu wektora przej¶æ
|
||||
\param -path wska¼nik jeden za ostatnim przej¶ciem
|
||||
\return d³ugo¶æ skonsumowanego prefiksu (PO CO? LEPIEJ D£ ¦CIE¯KI)
|
||||
*/
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>
|
||||
::lgstpath(I*& buf, long*& path, long start) const
|
||||
{
|
||||
long s=start;
|
||||
long t;
|
||||
const char* buf0=buf;
|
||||
const long* pathlimit=path+FT::ftMAXPATH;
|
||||
while(*buf && path<pathlimit && (t=tra(s,*buf))>=0)
|
||||
{
|
||||
if(!epsi(t)) buf++;
|
||||
s=next(t);
|
||||
*(path++)=t;
|
||||
}
|
||||
return buf-buf0;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/// Najd³u¿szy prefiks.
|
||||
/**
|
||||
\param +buf wska¼nik pierwszego elementu ci±gu symboli wej¶ciowych
|
||||
\param -buf pozycja jeden za skonsumowanym prefiksem
|
||||
\param +sep separator
|
||||
\return stan po przej¶ciu przez \a sep
|
||||
\remark Dzia³a tylko dla automatów deterministycznych, minimalnych, eps-wolnych,
|
||||
gdzie d³. ¶cie¿ki == d³. dopasowania.
|
||||
*/
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>
|
||||
::pref(I*& buf, I sep, long start) const
|
||||
{
|
||||
static long pathtab[ftMAXPATH];
|
||||
// static long* path=pathtab;
|
||||
long* path=pathtab;
|
||||
static bool more;
|
||||
|
||||
long s;
|
||||
if(*buf) // pierwsze wywo³anie
|
||||
{
|
||||
if(!lgstpath(buf,path,start))
|
||||
return -1;
|
||||
--path;
|
||||
more=true;
|
||||
}
|
||||
else // kolejne wywo³anie
|
||||
--buf,--path;
|
||||
while(more)
|
||||
if(path>=pathtab)
|
||||
if((s=next(next(*path),sep))>=0) {
|
||||
return s;
|
||||
}
|
||||
else
|
||||
--buf, --path;
|
||||
else
|
||||
{
|
||||
more=false;
|
||||
return -1;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/*
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>::trans(const I* si, O* so, long& olen) const
|
||||
{
|
||||
long p[ftMAXPATH];
|
||||
long ilen;
|
||||
long plen;
|
||||
if((ilen=lgstmatch(si,p,plen))>0)
|
||||
olen=prn(si,p,so);
|
||||
else
|
||||
ilen=olen=0;
|
||||
return ilen;
|
||||
}
|
||||
*/
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>::prn(const I* si, long* p, O* so) const
|
||||
{
|
||||
switch(print_mode)
|
||||
{
|
||||
case OO: return prn_oo(si,p,so);
|
||||
case IOIO: return prn_ioio(si,p,so);
|
||||
case OIOI: return prn_oioi(si,p,so);
|
||||
case IIOO: return prn_iioo(si,p,so);
|
||||
case OOII: return prn_ooii(si,p,so);
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>::prn_oo(const I* si, long* p, O* so) const
|
||||
{
|
||||
char* so0=so;
|
||||
while(*p>=0)
|
||||
{
|
||||
long t=*p;
|
||||
if(!epso(t))
|
||||
{
|
||||
if(defo(t))
|
||||
*(so++)=*si;
|
||||
else
|
||||
*(so++)=output(t);
|
||||
}
|
||||
if(!epsi(t)) si++;
|
||||
p++;
|
||||
|
||||
}
|
||||
return so-so0;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>::prn_ioio(const I* si, long* p, O* so) const
|
||||
{
|
||||
char* so0=so;
|
||||
while(*p>=0)
|
||||
{
|
||||
long t=*p;
|
||||
if(!epsi(t))
|
||||
*(so++)=*si;
|
||||
if(!epso(t))
|
||||
if(defo(t))
|
||||
*(so++)=*si;
|
||||
else
|
||||
*(so++)=output(t);
|
||||
if(!epsi(t)) si++;
|
||||
p++;
|
||||
}
|
||||
return so-so0;
|
||||
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>::prn_oioi(const I* si, long* p, O* so) const
|
||||
{
|
||||
char* so0=so;
|
||||
while(*p>=0)
|
||||
{
|
||||
long t=*p;
|
||||
if(!epso(t))
|
||||
{
|
||||
if(defo(t))
|
||||
*(so++)=*si;
|
||||
else
|
||||
*(so++)=output(t);
|
||||
}
|
||||
if(!epsi(t))
|
||||
*(so++)=*(si++);
|
||||
p++;
|
||||
}
|
||||
return so-so0;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>::prn_iioo(const I* si, long* p, O* so) const
|
||||
{
|
||||
const char* si0=si;
|
||||
long* p0=p;
|
||||
char* so0=so;
|
||||
while(*p>=0)
|
||||
{
|
||||
long t=*p;
|
||||
if(!epsi(t))
|
||||
{
|
||||
*(so++)=*si;
|
||||
si++;
|
||||
}
|
||||
p++;
|
||||
}
|
||||
si=si0;
|
||||
p=p0;
|
||||
while(*p>=0)
|
||||
{
|
||||
long t=*p;
|
||||
if(!epso(t))
|
||||
if(defo(t))
|
||||
*(so++)=*si;
|
||||
else
|
||||
*(so++)=output(t);
|
||||
if(!epsi(t)) si++;
|
||||
p++;
|
||||
}
|
||||
return so-so0;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
long TFT<I,Ipass,O,Opass,TT>::prn_ooii(const I* si, long* p, O* so) const
|
||||
{
|
||||
|
||||
const char* si0=si;
|
||||
long* p0=p;
|
||||
char* so0=so;
|
||||
while(*p>=0)
|
||||
{
|
||||
long t=*p;
|
||||
if(!epso(t))
|
||||
{
|
||||
if(defo(t))
|
||||
*(so++)=*si;
|
||||
else
|
||||
*(so++)=output(t);
|
||||
}
|
||||
if(!epsi(t)) si++;
|
||||
p++;
|
||||
}
|
||||
si=si0;
|
||||
p=p0;
|
||||
while(*p>=0)
|
||||
{
|
||||
long t=*p;
|
||||
if(!epsi(t))
|
||||
*(so++)=*(si++);
|
||||
p++;
|
||||
}
|
||||
return so-so0;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
template<class I, class Ipass, class O, class Opass, class TT>
|
||||
void TFT<I,Ipass,O,Opass,TT>::sort()
|
||||
{
|
||||
long t=0;
|
||||
while(t<ttn)
|
||||
{
|
||||
long t0=t;
|
||||
long tn=1;
|
||||
while(continued(t++)) tn++;
|
||||
if(tn>1)
|
||||
{
|
||||
long eps=-1;
|
||||
long def=-1;
|
||||
for(int i=0; i<tn; i++)
|
||||
{
|
||||
if(defi(t0+i))
|
||||
if(epsi(t0+i)) eps=i; else def=i;
|
||||
}
|
||||
if(eps>=0 && eps<tn-1)
|
||||
{
|
||||
TT temp=tt[t0+eps];
|
||||
memmove(tt+t0+eps+1,tt+t0+eps,tn-eps-1);
|
||||
tt[t-1]=temp;
|
||||
}
|
||||
if(def>eps) def--;
|
||||
if(def>=0 && def<tn-1)
|
||||
{
|
||||
TT temp=tt[t0+def];
|
||||
if(eps>=0)
|
||||
{
|
||||
memmove(tt+t0+def+1,tt+t0+def,tn-eps-2);
|
||||
tt[t-2]=temp;
|
||||
}
|
||||
else
|
||||
{
|
||||
memmove(tt+t0+def+1,tt+t0+def,tn-eps-2);
|
||||
tt[t-1]=temp;
|
||||
}
|
||||
}
|
||||
while(t0<t-1)
|
||||
tt[t0++].continued(true);
|
||||
tt[t-1].continued(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
void TFT<I,Ipass,O,Opass,TT>::setiotypes()
|
||||
{
|
||||
int i=0;
|
||||
const char* it=typeid(I).name();
|
||||
while(*it)
|
||||
if(*it==' ')
|
||||
{ it++; continue; }
|
||||
else
|
||||
itype[i++]=*(it++);
|
||||
itype[i]='\0';
|
||||
|
||||
i=0;
|
||||
const char* ot=typeid(O).name();
|
||||
while(*ot)
|
||||
if(*ot==' ')
|
||||
{ ot++; continue; }
|
||||
else
|
||||
otype[i++]=*(ot++);
|
||||
otype[i]='\0';
|
||||
};
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
template <class I, class Ipass, class O, class Opass, class TT>
|
||||
void TFT<I,Ipass,O,Opass,TT>::prntt(ostream& os)
|
||||
{
|
||||
for(long i=0; i<ttn; ++i)
|
||||
{
|
||||
os << i << ':';
|
||||
os << tt[i];
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
523
app/src/lib/tfti.h
Executable file
523
app/src/lib/tfti.h
Executable file
@ -0,0 +1,523 @@
|
||||
#ifndef TFTiH
|
||||
#define TFTiH
|
||||
//---------------------------------------------------------------------------
|
||||
#include <fstream.h>
|
||||
#include <math.h>
|
||||
#include <iomanip.h>
|
||||
//#include <typeinfo.h>
|
||||
|
||||
#include "tft.h"
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
class TFTi : public TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >
|
||||
{
|
||||
public:
|
||||
TFTi() : TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >() {};
|
||||
TFTi(const char* filename)
|
||||
: TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >() { load(filename); };
|
||||
|
||||
void read(const char* filename);
|
||||
void read(istream& is=cin);
|
||||
void write(const char* filename);
|
||||
void write(ostream& os=cout);
|
||||
void load(const char* filename);
|
||||
void load(FILE* f=stdin);
|
||||
void save(const char* filename);
|
||||
void save(FILE* f=stdout);
|
||||
void clear();
|
||||
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::ttn;
|
||||
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::states;
|
||||
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::transitions;
|
||||
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::itype;
|
||||
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::ftTYPELEN;
|
||||
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::otype;
|
||||
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::tt;
|
||||
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::copy_default;
|
||||
using TFT<I,Ipass,O,Opass,TTrans_i<I,Ipass,O,Opass> >::print_mode;
|
||||
|
||||
|
||||
// friend istream& operator>>(istream&, TFTi<I,Ipass,O,Opass>&);
|
||||
// friend ostream& operator<<(ostream&, const TFTi<I,Ipass,O,Opass>&);
|
||||
};
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
void TFTi<I,Ipass,O,Opass>::read(const char* filename)
|
||||
{
|
||||
ifstream is(filename);
|
||||
if(!is) { fprintf(stderr,"Failed to open input file."); exit(1); }
|
||||
read(is);
|
||||
}
|
||||
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
void TFTi<I,Ipass,O,Opass>::read(istream& is)
|
||||
{
|
||||
long *si; // state-index relation
|
||||
long ci=0; // current index
|
||||
char ch; // character read;
|
||||
int empty=0; // no of states with 0 trans?
|
||||
char intype[FT::ftTYPELEN];
|
||||
char outtype[FT::ftTYPELEN];
|
||||
|
||||
clear();
|
||||
|
||||
is >> states >> transitions >> intype >> outtype;
|
||||
|
||||
// if(strcmp(intype,itype)!=0 ||
|
||||
// strcmp(outtype,otype)!=0 && strcmp(outtype,"void")!=0)
|
||||
// { is.clear(ios::badbit); goto end; };
|
||||
|
||||
while(is.peek()==' ' || is.peek()=='\t') is.get(ch);
|
||||
while(is.peek()!='\n')
|
||||
{
|
||||
char s[20];
|
||||
is >> s;
|
||||
if(strcmp(s,"COPY")==0 && strcmp(intype,outtype)==0) copy_default=true;
|
||||
else if(strcmp(s,"NOCOPY")==0) copy_default=false;
|
||||
else if(strcmp(s,"II")==0) print_mode=FT::II;
|
||||
else if(strcmp(s,"OO")==0) print_mode=FT::OO;
|
||||
else if(strcmp(s,"IOIO")==0) print_mode=FT::IOIO;
|
||||
else if(strcmp(s,"OIOI")==0) print_mode=FT::OIOI;
|
||||
else if(strcmp(s,"IIOO")==0) print_mode=FT::IIOO;
|
||||
else if(strcmp(s,"OIOI")==0) print_mode=FT::OIOI;
|
||||
while(is.peek()==' ' || is.peek()=='\t') is.get(ch);
|
||||
}
|
||||
|
||||
ttn=transitions+2; // 1 state without trans., 1 additional
|
||||
si=new long[states];
|
||||
tt=new TTrans_i<I,Ipass,O,Opass>[ttn];
|
||||
|
||||
for(long cs=0;cs<states;cs++)
|
||||
{
|
||||
long tc; // transition counter
|
||||
si[cs]=ci;
|
||||
long cscheck;
|
||||
|
||||
if(!is) goto end;
|
||||
while(is.peek()==' ' || is.peek()=='\t') is.get(ch);
|
||||
is >> cscheck;
|
||||
if(cs!=cscheck) goto end;
|
||||
|
||||
while(is.peek()==' ' || is.peek()=='\t') is.get(ch);
|
||||
|
||||
is.get(ch);
|
||||
if(!is) goto end;
|
||||
switch(ch)
|
||||
{
|
||||
case '-': tt[ci].final(false); break;
|
||||
case '+': tt[ci].final(true); break;
|
||||
default: goto end;
|
||||
}
|
||||
tc=0, tt[ci].continued(false);
|
||||
|
||||
while(is.peek()==' ' || is.peek()=='\t') is.get(ch);
|
||||
while(is && is.peek()!='\n')
|
||||
{
|
||||
switch(is.peek())
|
||||
{
|
||||
case '~': tt[ci].epsi(true); tt[ci].defi(true); is.get(ch);
|
||||
break;
|
||||
case '@': tt[ci].epsi(false); tt[ci].defi(true); is.get(ch);
|
||||
break;
|
||||
default : tt[ci].geti(is);
|
||||
}
|
||||
if(!is) goto end;
|
||||
if(is.peek()=='/')
|
||||
{
|
||||
is.get(ch);
|
||||
switch(is.peek())
|
||||
{
|
||||
case '~': tt[ci].epso(true); tt[ci].defo(true); is.get(ch);
|
||||
break;
|
||||
case '@': tt[ci].epso(false); tt[ci].defo(true); is.get(ch);
|
||||
break;
|
||||
default : tt[ci].geto(is);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
tt[ci].defo(true);
|
||||
if(copy_default) tt[ci].epso(false); else tt[ci].epso(true);
|
||||
}
|
||||
if(!is) goto end;
|
||||
|
||||
unsigned long transition;
|
||||
is >> transition;
|
||||
tt[ci].next(transition);
|
||||
|
||||
tt[ci].continued(false);
|
||||
tt[ci].empty(false);
|
||||
|
||||
if(tc>0) tt[ci-1].continued(true);
|
||||
tc++,ci++;
|
||||
}
|
||||
if(tc==0)
|
||||
{
|
||||
if(++empty>2) { fprintf(stderr, "Nondeterministic automaton."); exit(1); }
|
||||
tt[ci].empty(true);
|
||||
ci++;
|
||||
}
|
||||
is.get(ch);
|
||||
if(ch!='\n') { is.clear(ios::badbit); goto end; }
|
||||
}
|
||||
|
||||
ttn=transitions+empty;
|
||||
if(ttn!=ci) { is.clear(ios::badbit); goto end; };
|
||||
for(long i=0;i<ttn;i++)
|
||||
tt[i].next(si[tt[i].next()]);
|
||||
delete[] si;
|
||||
sort();
|
||||
|
||||
end:
|
||||
if(is.bad()) { fprintf(stderr,"Input error."); exit(1); }
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
void TFTi<I,Ipass,O,Opass>::write(const char* filename)
|
||||
{
|
||||
ofstream os(filename);
|
||||
if(!os) err("Failed to open output file.");
|
||||
write(os);
|
||||
}
|
||||
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
void TFTi<I,Ipass,O,Opass>::write(ostream& os)
|
||||
{
|
||||
os << states << ' ' << transitions << ' ';
|
||||
// os << itype << ' ' << otype << ' ';
|
||||
os << "char void";
|
||||
// os << (copy_default ? "COPY" : "NOCOPY") << ' ';
|
||||
// switch(print_mode)
|
||||
// {
|
||||
// case FT::II : os << "II"; break;
|
||||
// case FT::OO : os << "OO"; break;
|
||||
// case FT::IOIO: os << "IOIO"; break;
|
||||
// case FT::OIOI: os << "OIOI"; break;
|
||||
// case FT::IIOO: os << "IIOO"; break;
|
||||
// case FT::OOII: os << "OOII";
|
||||
// }
|
||||
os << '\n';
|
||||
|
||||
long* si=new long[ttn];
|
||||
long cs=0;
|
||||
for(long i=0;i<ttn;i++)
|
||||
{
|
||||
si[i]=cs;
|
||||
if(continued(i)==false) cs++;
|
||||
}
|
||||
|
||||
int statefieldwidth=log10(cs+1);
|
||||
|
||||
bool first=true;
|
||||
for(long i=0;i<ttn;i++)
|
||||
{
|
||||
if(first)
|
||||
{
|
||||
os << setw(statefieldwidth) << si[i] << " ";
|
||||
if(final(i)) os << '+'; else os << '-';
|
||||
}
|
||||
|
||||
|
||||
if(!empty(i))
|
||||
{
|
||||
os << ' ';
|
||||
if(epsi(i))
|
||||
os << FT::ftEPSILON;
|
||||
else
|
||||
if(defi(i))
|
||||
os << FT::ftDEFAULT;
|
||||
else
|
||||
os << input(i);
|
||||
|
||||
if(epso(i))
|
||||
{ if(copy_default) os << '/' << FT::ftEPSILON; }
|
||||
else
|
||||
if(defo(i))
|
||||
{ if(!copy_default) os << '/' << FT::ftDEFAULT; }
|
||||
else
|
||||
{ os << '/' << output(i); }
|
||||
|
||||
if(strcmp(itype,"char")!=0 || strcmp(otype,"char")!=0)
|
||||
os << ' ';
|
||||
os << si[next(i)];
|
||||
}
|
||||
if(continued(i))
|
||||
first=false;
|
||||
else
|
||||
{ os << '\n'; first=true; }
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
void TFTi<I,Ipass,O,Opass>::load(const char* filename)
|
||||
{
|
||||
FILE* f;
|
||||
if(*filename)
|
||||
f=fopen(filename,"rb");
|
||||
else
|
||||
f=stdin;
|
||||
if(!f) { fprintf(stderr, "Cannot open automaton file."); return; }
|
||||
load(f);
|
||||
}
|
||||
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
void TFTi<I,Ipass,O,Opass>::load(FILE* f)
|
||||
{
|
||||
|
||||
clear();
|
||||
|
||||
if(fread(&ttn,sizeof(ttn),1,f)!=1) { fprintf(stderr, "Binary input error."); return;}
|
||||
if(fread(&states,sizeof(states),1,f)!=1) { fprintf(stderr, "Binary input error."); return;}
|
||||
if(fread(&transitions,sizeof(transitions),1,f)!=1) { fprintf(stderr, "Binary input error."); return;}
|
||||
if(fread(itype,sizeof(char),ftTYPELEN,f)!=ftTYPELEN) { fprintf(stderr, "Binary input error."); return;}
|
||||
if(fread(otype,sizeof(char),ftTYPELEN,f)!=ftTYPELEN) { fprintf(stderr, "Binary input error."); return;}
|
||||
if(fread(©_default,sizeof(copy_default),1,f)!=1) { fprintf(stderr, "Binary input error."); return;}
|
||||
if(fread(&print_mode,sizeof(print_mode),1,f)!=1) { fprintf(stderr, "Binary input error."); return;}
|
||||
if((tt=new TTrans_i<I,Ipass,O,Opass>[ttn])==NULL) { fprintf(stderr, "Cannot allocate memory for tt."); return;}
|
||||
if(fread(tt,sizeof(TTrans_i<I,Ipass,O,Opass>),ttn,f)!=ttn) { fprintf(stderr, "Binary input error."); return; }
|
||||
fclose(f);
|
||||
|
||||
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
void TFTi<I,Ipass,O,Opass>::save(const char* filename)
|
||||
{
|
||||
FILE* f;
|
||||
if(*filename)
|
||||
f=fopen(filename,"wb");
|
||||
else
|
||||
f=stdout;
|
||||
if(!f) err("Cannot open file.");
|
||||
save(f);
|
||||
}
|
||||
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
void TFTi<I,Ipass,O,Opass>::save(FILE* f)
|
||||
{
|
||||
if(fwrite(&ttn,sizeof(ttn),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); }
|
||||
if(fwrite(&states,sizeof(states),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); }
|
||||
if(fwrite(&transitions,sizeof(transitions),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); }
|
||||
if(fwrite(itype,sizeof(char),ftTYPELEN,f)!=ftTYPELEN) { fprintf(stderr,"Binary output error."); exit(1); }
|
||||
if(fwrite(otype,sizeof(char),ftTYPELEN,f)!=ftTYPELEN) { fprintf(stderr,"Binary output error."); exit(1); }
|
||||
if(fwrite(©_default,sizeof(copy_default),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); }
|
||||
if(fwrite(&print_mode,sizeof(print_mode),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); }
|
||||
if(fwrite(tt,sizeof(TTrans_i<I,Ipass,O,Opass>),ttn,f)!=ttn) { fprintf(stderr,"Binary output error."); exit(1); }
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
void TFTi<I,Ipass,O,Opass>::clear()
|
||||
{
|
||||
if(tt) delete[] tt;
|
||||
ttn=0;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
/*
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
istream& operator>>(istream& is, TFTi<I,Ipass,O,Opass>& ft)
|
||||
{
|
||||
long *si; // state-index relation
|
||||
long ci=0; // current index
|
||||
char ch; // character read;
|
||||
int empty=0; // no of states with 0 trans?
|
||||
char intype[FT::ftTYPELEN];
|
||||
char outtype[FT::ftTYPELEN];
|
||||
|
||||
ft.clear();
|
||||
|
||||
is >> ft.states >> ft.transitions >> intype >> outtype;
|
||||
|
||||
if(strcmp(intype,ft.itype)!=0 ||
|
||||
strcmp(outtype,ft.otype)!=0 && strcmp(outtype,"void")!=0)
|
||||
{ is.clear(ios::badbit); return is; };
|
||||
|
||||
while(is.peek()==' ' || is.peek()=='\t') is.get(ch);
|
||||
while(is.peek()!='\n')
|
||||
{
|
||||
char s[20];
|
||||
is >> s;
|
||||
if(strcmp(s,"COPY")==0 && strcmp(intype,outtype)==0) ft.copy_default=true;
|
||||
else if(strcmp(s,"NOCOPY")==0) ft.copy_default=false;
|
||||
else if(strcmp(s,"II")==0) ft.print_mode=FT::II;
|
||||
else if(strcmp(s,"OO")==0) ft.print_mode=FT::OO;
|
||||
else if(strcmp(s,"IOIO")==0) ft.print_mode=FT::IOIO;
|
||||
else if(strcmp(s,"OIOI")==0) ft.print_mode=FT::OIOI;
|
||||
else if(strcmp(s,"IIOO")==0) ft.print_mode=FT::IIOO;
|
||||
else if(strcmp(s,"OIOI")==0) ft.print_mode=FT::OIOI;
|
||||
while(is.peek()==' ' || is.peek()=='\t') is.get(ch);
|
||||
}
|
||||
|
||||
ft.ttn=ft.transitions+2; // 1 state without trans., 1 additional
|
||||
si=new long[ft.states];
|
||||
ft.tt=new TTrans_i<I,Ipass,O,Opass>[ft.ttn];
|
||||
|
||||
for(long cs=0;cs<ft.states;cs++)
|
||||
{
|
||||
long tc; // transition counter
|
||||
si[cs]=ci;
|
||||
do is >> ch; while(ch!='+' && ch!='-');
|
||||
switch(ch)
|
||||
{
|
||||
case '-': ft.tt[ci].final(false); break;
|
||||
case '+': ft.tt[ci].final(true); break;
|
||||
default: return is;
|
||||
}
|
||||
tc=0, ft.tt[ci].continued(false);
|
||||
while((is.get(ch),ch==' '))
|
||||
{
|
||||
if(!is) return is;
|
||||
switch(is.peek())
|
||||
{
|
||||
case '~': ft.tt[ci].epsi(true); ft.tt[ci].defi(true); is.get(ch);
|
||||
break;
|
||||
case '@': ft.tt[ci].epsi(false); ft.tt[ci].defi(true); is.get(ch);
|
||||
break;
|
||||
default : ft.tt[ci].geti(is);
|
||||
}
|
||||
if(!is) return is;
|
||||
if(is.peek()=='/')
|
||||
{
|
||||
is.get(ch);
|
||||
switch(is.peek())
|
||||
{
|
||||
case '~': ft.tt[ci].epso(true); ft.tt[ci].defo(true); is.get(ch);
|
||||
break;
|
||||
case '@': ft.tt[ci].epso(false); ft.tt[ci].defo(true); is.get(ch);
|
||||
break;
|
||||
default : ft.tt[ci].geto(is);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ft.tt[ci].defo(true);
|
||||
if(ft.copy_default) ft.tt[ci].epso(false); else ft.tt[ci].epso(true);
|
||||
}
|
||||
if(!is) return is;
|
||||
|
||||
unsigned long transition;
|
||||
is >> transition;
|
||||
ft.tt[ci].next(transition);
|
||||
|
||||
ft.tt[ci].continued(false);
|
||||
|
||||
ft.tt[ci].empty(false);
|
||||
if(tc>0) ft.tt[ci-1].continued(true);
|
||||
tc++,ci++;
|
||||
}
|
||||
if(tc==0)
|
||||
{
|
||||
if(++empty>2) err("Nondeterministic automaton.");
|
||||
ft.tt[ci].empty(true);
|
||||
ci++;
|
||||
}
|
||||
if(ch!='\n') { is.clear(ios::badbit); return is; }
|
||||
}
|
||||
|
||||
ft.ttn=ft.transitions+empty;
|
||||
if(ft.ttn!=ci) { is.clear(ios::badbit); return is; };
|
||||
for(long i=0;i<ft.ttn;i++)
|
||||
ft.tt[i].next(si[ft.tt[i].next()]);
|
||||
delete[] si;
|
||||
ft.sort();
|
||||
return is;
|
||||
}
|
||||
*/
|
||||
//---------------------------------------------------------------------------
|
||||
/*
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
ostream& operator<<(ostream& os, const TFTi<I,Ipass,O,Opass>& ft)
|
||||
{
|
||||
os << ft.states << ' ' << ft.transitions << ' '
|
||||
<< ft.itype << ' ' << ft.otype << ' ';
|
||||
os << (ft.copy_default ? "COPY" : "NOCOPY") << ' ';
|
||||
switch(ft.print_mode)
|
||||
{
|
||||
case FT::II : os << "II"; break;
|
||||
case FT::OO : os << "OO"; break;
|
||||
case FT::IOIO: os << "IOIO"; break;
|
||||
case FT::OIOI: os << "OIOI"; break;
|
||||
case FT::IIOO: os << "IIOO"; break;
|
||||
case FT::OOII: os << "OOII";
|
||||
}
|
||||
os << ' ' << '\n';
|
||||
|
||||
long* si=new long[ft.ttn];
|
||||
long cs=0;
|
||||
for(long i=0;i<ft.ttn;i++)
|
||||
{
|
||||
si[i]=cs;
|
||||
if(ft.continued(i)==false) cs++;
|
||||
}
|
||||
|
||||
bool first=true;
|
||||
for(long i=0;i<ft.ttn;i++)
|
||||
{
|
||||
if(first)
|
||||
if(ft.final(i)) os << '+'; else os << '-';
|
||||
|
||||
if(!ft.empty(i))
|
||||
{
|
||||
os << ' ';
|
||||
if(ft.epsi(i))
|
||||
os << FT::ftEPSILON;
|
||||
else
|
||||
if(ft.defi(i))
|
||||
os << FT::ftDEFAULT;
|
||||
else
|
||||
os << ft.input(i);
|
||||
|
||||
if(ft.epso(i))
|
||||
{ if(ft.copy_default) os << '/' << FT::ftEPSILON; }
|
||||
else
|
||||
if(ft.defo(i))
|
||||
{ if(!ft.copy_default) os << '/' << FT::ftDEFAULT; }
|
||||
else
|
||||
{ os << '/' << ft.output(i); }
|
||||
|
||||
if(strcmp(ft.itype,"char")!=0 || strcmp(ft.otype,"char")!=0)
|
||||
|
||||
os << ' ';
|
||||
os << si[ft.next(i)];
|
||||
}
|
||||
if(ft.continued(i))
|
||||
first=false;
|
||||
else
|
||||
{ os << '\n'; first=true; }
|
||||
}
|
||||
return os;
|
||||
}
|
||||
*/
|
||||
//---------------------------------------------------------------------------
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
template<class I, class O>
|
||||
class TFTiv : public TFTi<I,I,O,O>
|
||||
{
|
||||
public:
|
||||
TFTiv() : TFTi<I,I,O,O>() {};
|
||||
TFTiv(const char* filename) : TFTi<I,I,O,O>(filename) {};
|
||||
};
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
template<class I, class O>
|
||||
class TFTir : public TFTi<I,I&,O,O&>
|
||||
{
|
||||
public:
|
||||
TFTir() : TFTi<I,I,O,O>() {};
|
||||
};
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
#endif
|
204
app/src/lib/ttrans.h
Executable file
204
app/src/lib/ttrans.h
Executable file
@ -0,0 +1,204 @@
|
||||
#ifndef _TTransi_h
|
||||
#define _TTransi_h
|
||||
//---------------------------------------------------------------------------
|
||||
#include <iostream.h>
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
//! The template for a transition with input and output symbols stored internally.
/*!
A state is identified with the set of its outgoing transitions.
The state index is the index of the first transition for it.
A state with no outgoing transition is represented as an empty transition.
*/
template<class I, class Ipass, class O, class Opass>
class TTrans_i
{
public:
//private:
  //! Input symbol
  I i;
  //! Output symbol
  O o;

public:

  //! state is final
  static const unsigned char BITf=0x01;
  //! transition list is continued
  static const unsigned char BITc=0x02;
  //! no transition
  static const unsigned char BITe=0x04;
  //! epsilon input
  static const unsigned char BITepsi=0x08;
  //! default input
  static const unsigned char BITdefi=0x10;
  //! epsilon output
  static const unsigned char BITepso=0x20;
  //! default output
  static const unsigned char BITdefo=0x40;

  //! Flags (a combination of the BIT* masks above)
  unsigned char flags;

  //! The index of the next state
  long nxt;

  //! Create a transition with all flags cleared and next state 0.
  //! Without this the flags were indeterminate until each one was set.
  TTrans_i() : i(), o(), flags(0), nxt(0) {}

  //! Input symbol.
  //! \return The input symbol of the transition.
  Ipass in() const { return i; }

  //! Output symbol.
  //! \return The output symbol of the transition.
  Opass out() const { return o; }

  //! Set the input symbol.
  //! \param in input symbol
  void in(Ipass in) { i=in; }

  //! Set the output symbol.
  //! \param out output symbol
  void out(Opass out) { o=out; }

  //! \remark Is this needed?
  I& iref() { return i; }

  //! \remark Is this needed?
  O& oref() { return o; }

  //! Test whether an input symbol is accepted: either the transition has
  //! the default-input flag set or \a in equals the stored input symbol.
  //! \remark Simplified. Should rely on a test function provided by the user.
  bool accepts(Ipass in) const { return defi() || in==i; }

  //! Next state.
  //! \return Destination state of the transition.
  long next() const { return nxt; }

  //! Set the next state.
  //! \param t destination state of the transition
  void next(long t) { nxt=t; }

  //! Is the state final?
  //! \return \c true if the state is final, false otherwise.
  bool final() const { return (flags&BITf)!=0; }

  //! Set the \b final flag.
  //! \param b \c true if the state is final, \c false otherwise.
  void final(bool b) { if(b) flags|=BITf; else flags&=~BITf; }

  //! Is the transition list continued?
  //! \return \c true if the transition is not the last transition for the state,
  //! \c false otherwise.
  bool continued() const { return (flags&BITc)!=0; }

  //! Set the \b continuation flag.
  //! \param b \c true if the transition is not the last one for the state, \c false otherwise.
  void continued(bool b) { if(b) flags|=BITc; else flags&=~BITc; }

  //! Is the transition empty?
  //! \return \c true if the transition is empty (represents a state with no outgoing transitions),
  //! \c false otherwise.
  bool empty() const { return (flags&BITe)!=0; }

  //! Set the \b empty flag.
  //! \param b \c true if the transition is empty, \c false otherwise.
  void empty(bool b) { if(b) flags|=BITe; else flags&=~BITe; }

  //! Get / set the \b epsilon-input flag.
  bool epsi() const { return (flags&BITepsi)!=0; }
  void epsi(bool b) { if(b) flags|=BITepsi; else flags&=~BITepsi; }

  //! Get / set the \b default-input flag.
  bool defi() const { return (flags&BITdefi)!=0; }
  void defi(bool b) { if(b) flags|=BITdefi; else flags&=~BITdefi; }

  //! Get / set the \b epsilon-output flag.
  bool epso() const { return (flags&BITepso)!=0; }
  void epso(bool b) { if(b) flags|=BITepso; else flags&=~BITepso; }

  //! Get / set the \b default-output flag.
  bool defo() const { return (flags&BITdefo)!=0; }
  void defo(bool b) { if(b) flags|=BITdefo; else flags&=~BITdefo; }

  //! Read the input symbol from a stream.
  void geti(istream&);
  //! Read the output symbol from a stream.
  void geto(istream&);

  // friend ostream& operator<<(ostream& os, const TTrans_i<I,Ipass,O,Opass>& t);

};
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
//! Read one symbol of type T from \a is with the type's own extraction operator.
template<class T>
void getsym(istream& is, T& s)
{ is >> s; }

//! Read one character symbol from \a is. A backslash introduces an escape:
//! "\n" yields a newline, "\t" a tab, and any other escaped character stands
//! for itself (so "\ " yields a blank and "\\" a backslash).
//
// This used to be declared as "template<char>", i.e. a separate template
// with a non-type parameter, which a call such as getsym<char>(is,c) can
// never select; it is now an explicit specialization of the template above.
template<>
inline void getsym<char>(istream& is, char& c)
{
  is >> c;
  if(is && c=='\\')
  {
    is.get(c);
    switch(c)
    {
      case 'n': c='\n'; break;
      case 't': c='\t'; break;
    }
  }
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
void TTrans_i<I,Ipass,O,Opass>::geti(istream& is)
|
||||
{ getsym<I>(is,iref()); };
|
||||
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
void TTrans_i<I,Ipass,O,Opass>::geto(istream& is)
|
||||
{ getsym<I>(is,oref()); };
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
/*
|
||||
template<class I, class Ipass, class O, class Opass>
|
||||
ostream& operator<<(ostream& os, const TTrans_i<I,Ipass,O,Opass>& t)
|
||||
{
|
||||
os << (t.final() ? '+' : '-');
|
||||
os << ' ';
|
||||
|
||||
if(!t.empty())
|
||||
{
|
||||
if(t.defi())
|
||||
os << (t.epsi() ? '~' : '@');
|
||||
else
|
||||
switch(t.in())
|
||||
{
|
||||
case ' ': os << "\\ "; break;
|
||||
case '\n': os << "\\n"; break;
|
||||
case '\t': os << "\\t"; break;
|
||||
default: os << t.in();
|
||||
}
|
||||
|
||||
os << '/';
|
||||
|
||||
if(t.defo())
|
||||
os << (t.epso() ? '~' : '@');
|
||||
else
|
||||
switch(t.out())
|
||||
{
|
||||
case ' ': os << "\\ "; break;
|
||||
case '\n': os << "\\n"; break;
|
||||
case '\t': os << "\\t"; break;
|
||||
default: os << t.out();
|
||||
}
|
||||
|
||||
os << ' ' << t.next();
|
||||
}
|
||||
|
||||
os << '\n';
|
||||
|
||||
if(!t.continued())
|
||||
os << '\n';
|
||||
|
||||
return os;
|
||||
}
|
||||
*/
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
#endif
|
||||
|
199
app/src/lib/word.cc
Normal file
199
app/src/lib/word.cc
Normal file
@ -0,0 +1,199 @@
|
||||
//---------------------------------------------------------------------------
|
||||
#include "word.h"
|
||||
#include "auttools.h"
|
||||
#include <istream.h>
|
||||
//---------------------------------------------------------------------------
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
void Word::autodescr(const char* fo, const char* de)
|
||||
{
|
||||
strcpy(f,fo);
|
||||
// len=strlen(f);
|
||||
|
||||
char lemd[MAXDESCRLEN];
|
||||
int i=strcspn(de,",");
|
||||
strncpy(lemd,de,i);
|
||||
lemd[i]='\0';
|
||||
if(isdigit(lemd[0]))
|
||||
fullform(f,lemd,l); // je¶li lemat zakodowany
|
||||
else
|
||||
strcpy(l,lemd); // je¶li lemat w pe³nej postaci
|
||||
strcpy(d,de+i+1);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
// Ordering predicate used for sorting: non-zero when a has a greater
// suffix weight than b.
int Word::cmp_w(Word a, Word b) {
  const float weight_a=a.w_suf();
  const float weight_b=b.w_suf();
  return weight_a>weight_b ? 1 : 0;
}
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
// Read a word form: the longest run of alphabetic characters (at most
// Word::MAXLEN of them). The character that ends the run is put back.
// The stream is marked bad when no alphabetic character was read.
istream& operator>>(istream& is, Word& w)
{
  char temp[Word::MAXLEN+1];
  char c=0;
  bool lookahead=false;   // c holds a consumed character that is not part of the word

  int i=0;
  while(i<Word::MAXLEN && is.get(c))
  {
    if(!isalpha((unsigned char)c)) { lookahead=true; break; }
    temp[i++]=c;
  }
  if(i==Word::MAXLEN) {
    fprintf(stderr, "To long word");
  }
  if(i==0) is.clear(ios::badbit);
  temp[i]='\0';
  // Only a character that was read but not stored may be put back; when the
  // loop stopped at MAXLEN, c is the last letter of the word itself.
  if(is && lookahead)
    is.putback(c);
  strncpy(w.f,temp,sizeof(w.f)-1);
  w.f[sizeof(w.f)-1]='\0';
  // w.len=i;
  return is;
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
// Print the word as "<W form;lemma,description>"; a word with an empty form
// prints nothing.
ostream& operator<<(ostream& os, Word& w)
{
  if(*(w.f)=='\0')
    return os;
  os << "<W " << w.form();
  os << ";" << w.lemma();
  os << ',' << w.descr() << '>';
  return os;
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
Words::~Words() {
  // Nothing to do here. The loop that deleted the elements one by one
  // is disabled:
  //   for (int i=0; i<tab.size(); ++i)
  //     delete(tab[i]);
}
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
int Words::find(const char* word) {
|
||||
for (int i=0; i<cnt; ++i) {
|
||||
if (strcmp(word, tab[i].form()) == 0) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
int Words::find(const char* word, const char* descr) {
|
||||
for (int i=0; i<cnt; ++i) {
|
||||
if ((strcmp(word, tab[i].form()) == 0) && (strcmp(descr, tab[i].descr()) == 0)) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
/* zwraca index nastepnego wyniku, podczas pierwszego wywolania
|
||||
* zwraca index wyniku o najwiekszej wadze, przy drugim wywolaniu
|
||||
* wynik z druga najwyzsza waga, itd.
|
||||
* Jezeli nie ma juz wynikow - zwraca -1.
|
||||
*/
|
||||
int Words::next() {
|
||||
float max = -1;
|
||||
int result = -1;
|
||||
for (int i=0; i<cnt; ++i) {
|
||||
float w = tab[i].w_suf();
|
||||
if (w>max && !tab[i].returned) {
|
||||
max = w;
|
||||
result = i;
|
||||
}
|
||||
}
|
||||
if (result != -1)
|
||||
tab[result].returned = 1;
|
||||
return result;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
// Sort the whole table with Word::cmp_w as the ordering predicate.
void Words::sort() {
  std::sort(tab.begin(), tab.end(), &Word::cmp_w);
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
int Words::add(const char* fo)
|
||||
{
|
||||
int i = find(fo);
|
||||
if(i!=-1) {
|
||||
return i;
|
||||
}
|
||||
|
||||
if (cnt>=tab.capacity()-1)
|
||||
tab.resize(tab.size()*2);
|
||||
|
||||
tab[cnt].form(fo);
|
||||
tab[cnt].w_suf(0.0);
|
||||
|
||||
// if(cnt<MAX-1) {
|
||||
/* tab.push_back(new Word());
|
||||
tab[cnt]->form(fo);
|
||||
tab[cnt]->w_suf(0.0);
|
||||
tab[cnt]->w_pref(0.0);*/
|
||||
return cnt++;
|
||||
// }
|
||||
//return -1;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
int Words::add(const char* fo, const char* des)
|
||||
{
|
||||
char d[Word::MAXDESCRLEN];
|
||||
int l=strcspn(des,",");
|
||||
int ok=1;
|
||||
if( *(des+l) == ',' )
|
||||
{
|
||||
strcpy(d,des+l+1);
|
||||
// printf("\t%s->%s,\n", des, d);
|
||||
int i=find(fo, d);
|
||||
if(i!=-1)
|
||||
return i;
|
||||
}
|
||||
else
|
||||
ok=0;
|
||||
|
||||
if (cnt>=tab.capacity()-1)
|
||||
tab.resize(tab.size()*2);
|
||||
|
||||
tab[cnt].form(fo);
|
||||
if(ok)
|
||||
tab[cnt].autodescr(fo, des);
|
||||
else
|
||||
tab[cnt].autodescr(fo, "?,?");
|
||||
|
||||
tab[cnt].w_suf(0.0);
|
||||
tab[cnt].returned = 0;
|
||||
/*
|
||||
// if(cnt<MAX-1) {
|
||||
tab.push_back(new Word());
|
||||
tab[cnt]->form(fo);
|
||||
tab[cnt]->autodescr(fo,des);
|
||||
tab[cnt]->w_suf(0.0);
|
||||
tab[cnt]->w_pref(0.0);
|
||||
// printf("ok!\n");*/
|
||||
return cnt++;
|
||||
// }
|
||||
// printf("hm\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
// Print every entry to os as "<W lemma,description>", without separators.
void Words::prn(ostream& os)
{
  int i=0;
  while(i<count())
  {
    os << "<W " << tab[i].lemma() << ',' << tab[i].descr() << ">";
    ++i;
  }
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
// Stream output of a whole table. The printing itself is disabled, so this
// writes nothing; it must still return the stream, because flowing off the
// end of a non-void function is undefined behaviour.
ostream& operator<<(ostream& os, Words& tab)
{
  /* for(int i=0; i<tab.count(); ++i)
       os << i << ". " << tab[i] << '\n'; */
  return os;
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
145
app/src/lib/word.h
Normal file
145
app/src/lib/word.h
Normal file
@ -0,0 +1,145 @@
|
||||
//---------------------------------------------------------------------------
|
||||
#ifndef _Word_h
|
||||
#define _Word_h
|
||||
//---------------------------------------------------------------------------
|
||||
//#include "alphabet.h"
|
||||
//#include "erro.h"
|
||||
#include "const.h"
|
||||
#include <iostream.h>
|
||||
|
||||
#include <vector>
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
using namespace std;
|
||||
|
||||
class Word
|
||||
{
|
||||
public:
|
||||
static const int MAXLEN=64; // dac do global
|
||||
static const int MAXDESCRLEN=80; // dac do global
|
||||
|
||||
private:
|
||||
/// word form
|
||||
char f[MAX_FORM]; // w wolnej chwili nazwy mozna zamienic na dluzsze
|
||||
|
||||
/// length
|
||||
int _len_suf; // dlugosc dopasowania koncowki...
|
||||
// int _len_pref; // ... i prefiksu
|
||||
|
||||
/// lemma
|
||||
char l[MAX_FORM];
|
||||
|
||||
/// description
|
||||
char d[MAX_DESC];
|
||||
|
||||
/// weight (probability)
|
||||
float _w_suf;
|
||||
// float _w_pref;
|
||||
public:
|
||||
static int cmp_w(Word a, Word b);
|
||||
|
||||
Word() : _len_suf(-1) { *f='\0'; returned=0; };
|
||||
Word(const char* fo, const char* des) : _len_suf(-1) { autodescr(fo,des); _w_suf=1.0; returned=0; };
|
||||
|
||||
Word(const Word& w);
|
||||
|
||||
char* form() { return f; } // przywrocic const
|
||||
char* lemma() { return l; } // przywrocic const
|
||||
char* descr() { return d; }
|
||||
float w_suf() { return _w_suf; };
|
||||
int len_suf() { return _len_suf; }
|
||||
|
||||
|
||||
void form(const char* s) { strcpy(f,s); }
|
||||
void lemma(const char* s) { strcpy(l,s); }
|
||||
void descr(const char* s) { strcpy(d,s); };
|
||||
void w_suf(float x) { _w_suf=x; };
|
||||
void len_suf(int n) { _len_suf=n; };
|
||||
|
||||
bool operator==(const Word& w);
|
||||
bool operator!=(const Word& w);
|
||||
int cmp(const Word&);
|
||||
int cmpi(const Word&);
|
||||
|
||||
char* operator!() { return f; };
|
||||
|
||||
operator bool() { return _len_suf>0; };
|
||||
|
||||
char* str() { return f; }
|
||||
|
||||
void autodescr(const char* fo, const char* des);
|
||||
|
||||
friend istream& operator>>(istream& is, Word& m);
|
||||
friend ostream& operator<<(ostream& os, Word& m);
|
||||
|
||||
bool returned;
|
||||
|
||||
};
|
||||
|
||||
// Copy constructor: duplicates the three strings, the suffix length and
// the weight.  The `returned` flag is NOT copied; it always starts at 0.
inline Word::Word(const Word& word)
  : _len_suf(word._len_suf), _w_suf(word._w_suf), returned(false)
{
  strcpy(f, word.f);
  strcpy(l, word.l);
  strcpy(d, word.d);
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
inline bool Word::operator==(const Word& w)
|
||||
{return _len_suf==w._len_suf &&
|
||||
!strcmp(f,w.f) && !strcmp(l,w.l) && !strcmp(d,w.d); }
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
inline bool Word::operator!=(const Word& w)
|
||||
{return _len_suf!=w._len_suf ||
|
||||
strcmp(f,w.f) || strcmp(l,w.l) || strcmp(d,w.d);}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
// Orders two words by their form strings only; returns the strcmp() result.
inline int Word::cmp(const Word& w)
{
  const int order = strcmp(f, w.f);
  return order;
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
//inline int Word::cmpi(const Word& w) { return PL.cmpi(f,w.f); }
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//---------------------------------------------------------------------------
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
class Words
|
||||
{
|
||||
private:
|
||||
int find(const char* word);
|
||||
int find(const char* word, const char* descr);
|
||||
public:
|
||||
|
||||
static const int MAX=1024;
|
||||
|
||||
Words() : cnt(0) {tab.resize(MAX); };
|
||||
~Words();
|
||||
Word& operator[](int i) { return tab[i]; }
|
||||
int count() const { return cnt; }
|
||||
void clear() { cnt=0; tab.clear(); }
|
||||
int add(const char* fo);
|
||||
int add(const char* fo, const char* des);
|
||||
|
||||
/* zwraca index nastepnego wyniku, podczas pierwszego wywolania
|
||||
* zwraca index wyniku o najwiekszej wadze, przy drugim wywolaniu
|
||||
* wynik z druga najwyzsza waga, itd.
|
||||
* Jezeli nie ma juz wynikow - zwraca -1.
|
||||
*/
|
||||
int next();
|
||||
|
||||
void sort();
|
||||
|
||||
void prn(ostream& os);
|
||||
|
||||
// friend class Lem;
|
||||
// friend class AuxLem;
|
||||
friend ostream& operator<<(ostream& os, Words& tab);
|
||||
vector<Word> tab;
|
||||
int cnt;
|
||||
|
||||
};
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
|
6
app/src/mar/Makefile
Normal file
6
app/src/mar/Makefile
Normal file
@ -0,0 +1,6 @@
|
||||
# mar is a script: there is nothing to build.
main:

# Copy the script into the distribution's bin directory; does nothing when
# UTT_BIN_DIR is not set.
copy:
ifdef UTT_BIN_DIR
	cp mar $(UTT_BIN_DIR)
endif
|
262
app/src/mar/mar
Executable file
262
app/src/mar/mar
Executable file
@ -0,0 +1,262 @@
|
||||
#!/usr/bin/perl

# package:        UAM Text Tools
# component name: mrk
# author:         Marcin Walas
#
# This program tags the tokenized file with the given tags.  Tags can be
# given in any order and configuration through the expression which is one
# of the parameters of the script.
#
# contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com

use strict;
use Getopt::Long;

use attr;

Getopt::Long::Configure('no_ignore_case_always');

# Option defaults; the flag-like options all start out as 0.
my ($help, $pattern, $macrofile, $define, $command) = (0, 0, 0, 0, 0);
my $action = "pgP";
my $eos = "seg(EOS)";
my $explicit_space = 0;
|
||||
|
||||
# Helper shared by cutRe() and restRe().
#
# Scans $text from the left, tracking parenthesis nesting, and returns the
# index of the first character after which the nesting depth is zero.  For
# text starting with "(" that is the index of the matching ")".  Returns -1
# when the depth never returns to zero (empty or unbalanced text).
#
# Inside a group a backslash escapes the following character, so "\(" and
# "\)" are literals and do not change the depth.
sub _first_group_end
{
    my ($text) = @_;
    my $depth = 0;
    my $size = length $text;
    my $pos = 0;

    while ($pos < $size)
    {
        my $char = substr($text, $pos, 1);

        if ($depth > 0 and $char eq "\\")
        {
            # skip the backslash together with the character it escapes
            $pos += 2;
            next;
        }

        if ($char eq "(")
        {
            $depth++;
        }
        elsif ($char eq ")")
        {
            $depth--;
        }

        return $pos if $depth == 0;
        $pos++;
    }

    return -1;
}

# Cuts the regular expression to obtain the part belonging to the next tag.
# Takes one argument: the pattern (after m4 processing).
# Returns the first root-level parenthesised group with its content.  For
# empty or unbalanced input the whole text is returned.
sub cutRe
{
    my $text = defined $_[0] ? $_[0] : '';
    my $end = _first_group_end($text);

    return $text if $end < 0;
    return substr($text, 0, $end + 1);
}

# Counterpart of cutRe(): returns everything after the first root-level
# parenthesised group.  For empty or unbalanced input nothing is left, so
# the empty string is returned.
sub restRe
{
    my $text = defined $_[0] ? $_[0] : '';
    my $end = _first_group_end($text);

    return '' if $end < 0;
    return substr($text, $end + 1);
}
|
||||
|
||||
# Command-line options.
# NOTE(review): --macros and --define both store into $macrofile, and that
# value is replaced later in the script; the help text marks both as TODO.
GetOptions(
    "pattern|e=s" => \$pattern,
    "eos|E=s"     => \$eos,
    "macros=s"    => \$macrofile,
    "define=s"    => \$macrofile,
    "command"     => \$command,
    "action=s"    => \$action,
    "help|h"      => \$help,
    "space|s"     => \$explicit_space,
);

# --help: print the usage text and stop.
if ($help)
{
    print <<'END';
Usage: mar [OPTIONS] [file ..]

Options:
--pattern -e PATTERN Pattern.
--bos -E PATTERN Segment serving as sentence beginning marker. [TODO]
--macros=FILE Read macrodefinitions from FILE. [TODO]
--define=FILE Add macrodefinitions from FILE. [TODO]
--action -a [p][s][P] Perform only indicated actions.
p - preprocess
s - search
P - postprocess
(default pgP)
--command Print generated shell command and exit.
--help -h Print help.

In patern you can put any tag. Tags should begin with the @ character.
They don't have to be closed.
They can't contain white spaces!

Note: If you don't define any custom tags, whole pattern will be taged with
default tags (begining of match and end of match)

Tags examples:

mar -e '@BEG cat(<ADJ>) @END'
it will find any adjectives in the text and tag them with surrounding tags
mar -e 'cat(<ADJ>) @MYTAG cat(<ADJ>)'
this will find two neighbouring adjectives and parcel them with tag MYTAG

Some example patterns:
'word(domu)' - form of the word domu
'lexeme(dom)' - any form of lexeme dom
'space' - space
'cat(<ADJ>)' - adjective

You can use * in patterns to make zero or more counts of word.

END
    exit 0;
}
|
||||
|
||||
# A pattern is mandatory whenever the action string contains "g".
die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/;

# The macro definitions are always taken from terms.m4 in the current
# directory.
die("$0: macro file not found") unless -e "terms.m4" and $macrofile="terms.m4";

# Optional pipeline stages placed around the sed command.
my $preproc = ($action =~ /p/) ? ' fla | ' : '';

my $postproc = ($action =~ /P/) ? ' | unfla ' : '';

# Here we prepare the regular expression for extended matching.
my @tags;

# Pad the pattern so that every tag is surrounded by spaces.
my $temp = " ".$pattern." ";

# Two adjacent tags ("@A @B") share a single separating space.  The
# substitution further down that turns " @TAG " into a group boundary
# consumes one space on each side, so the space is doubled here; without
# that the second of two adjacent tags would stay in the pattern as text.
$temp =~ s/(\@[^ ]*) (?=\@)/${1}  /g;
$pattern = $temp;

# Collect the tag names in order of appearance; every pass strips the
# pattern up to and including the first remaining tag.
while ($temp =~ s/^.*?\@(.*?) / /)
{
    push (@tags, $1);
}

# The pattern with the tags removed: every tag becomes the boundary ")("
# between two parenthesised sections.
my $patternmod = "( ".$pattern." )";
$patternmod =~ s/\s@.*?\s/\)\(/g;

# discarding spaces
$patternmod =~ s/\s+/\\`'/g; #`
# quoting escaped commas
$patternmod =~ s/\\,/\\`\\`\\,''/g;
# quoting commas in {m,n} r.e. operator
$patternmod =~ s/(\{\d*),(\d*\})/${1}\\`\\`,''${2}/g;

# Expand the macros with m4.
my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' $macrofile - 2>/dev/null`;

die("Incorrect pattern (m4).") if $? >> 8;


chomp $re;

# <> expansion: each <...> constraint is translated by ./terms.tag2re

$re =~ s/<([^>]+)>/`echo $1 | .\/terms\.tag2re`/ge;

# Perl-like special sequences
$re =~ s/\./[^ [:cntrl:]]/g;
$re =~ s/\\s/[ ]/g;
$re =~ s/\\S/[^ [:cntrl:]]/g;
$re =~ s/\\d/[0-9]/g;
$re =~ s/\\D/[^0-9 [:cntrl:]]/g;
$re =~ s/\\w/[a-z±æê³ñ󶼿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_]/g;
$re =~ s/\\W/[^a-z±æê³ñ󶼿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_ [:cntrl:]]/g;
# extensions
$re =~ s/\\l/[a-z±æê³ñ󶼿]/g; #lowercase letter
$re =~ s/\\L/[A-Z¡ÆÊ£ÑÓ¦¬¯]/g; #upercase letter
|
||||
|
||||
my $sedcommand;
my $grepcommand;

# Build the sed script from the regular expression.  With custom tags the
# expression is consumed one root-level group per tag until all tags are
# placed; without custom tags the whole match gets the default markers.
my $defBOM = "BOM";
my $defEOM = "EOM";
my $defTempTagBeg = "####TempTAGBEG####";
my $defTempTagEnd = "####TempTAGEND####";

if (@tags)
{
    # Step one: mark every match of the whole expression with temporary
    # begin/end tags.
    my $sedscript="sed -r 's/($re)/\\600 $defTempTagBeg *\\f\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $defTempTagEnd *\\f\\1/g;";

    # Step two: move the temporary begin tag forward group by group and
    # leave the matching custom tag behind at every stop.
    my $restre = $re;   # the part of the expression not yet consumed

    for my $i (0 .. $#tags)
    {
        my $rec = cutRe($restre);
        $restre = restRe($restre);

        # An empty group means the tag is inserted right where the
        # temporary tag stands; otherwise it goes after the group's match.
        $sedscript .= ($rec =~ / *\( *\) */)
            ? "s/([0-9]+) 00 $defTempTagBeg \\*\\f([0-9]+)/\\2 00 $tags[$i] *\\f\\2 00 $defTempTagBeg *\\f\\2/g;"
            : "s/[0-9]+ 00 $defTempTagBeg \\*\\f($rec)/\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $tags[$i] *\\f\\1 00 $defTempTagBeg *\\f\\1/g;";
    }

    # Step three: remove the temporary tags.
    $sedcommand = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f//g;s/[0-9]+ 00 $defTempTagEnd \\*\\f//g'";
}
else
{
    # No custom tags: surround each match with the default BOM/EOM tags.
    $sedcommand = "sed -r 's/($re)/\\500 $defBOM *\\f\\1###EOM###/g; s/###EOM###([0-9]+)/\\1 00 $defEOM *\\f\\1/g'";
}

# --command: show the generated sed command instead of running it.
if ($command)
{
    print $sedcommand."\n";
    exit 0;
}

exec $preproc.$sedcommand.$postproc;
|
6
app/src/rm12/Makefile
Normal file
6
app/src/rm12/Makefile
Normal file
@ -0,0 +1,6 @@
|
||||
# rm12 is a script: there is nothing to build.
main:

# Copy the script into the distribution's bin directory; does nothing when
# UTT_BIN_DIR is not set.
copy:
ifdef UTT_BIN_DIR
	cp rm12 $(UTT_BIN_DIR)
endif
|
3
app/src/rm12/rm12
Executable file
3
app/src/rm12/rm12
Executable file
@ -0,0 +1,3 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Filter (stdin to stdout): on every line that does not contain a
# "NUMBER NUMBER BOS" sequence, remove the first "NUMBER NUMBER" pair
# together with the single space or tab that follows it.
sed -r '/[0-9]+[ \t]+[0-9]+[ \t]+BOS/! s/[0-9]+[ \t]+[0-9]+[ \t]//'
|
12
app/src/rs12/Makefile
Normal file
12
app/src/rs12/Makefile
Normal file
@ -0,0 +1,12 @@
|
||||
# Default goal: build the rs12 filter.
main: rs12

rs12: rs12.c
	gcc -static -o rs12 rs12.c

# -f keeps "make clean" from failing when rs12 has not been built.
clean:
	rm -f rs12

# Copy the binary into the distribution's bin directory; does nothing when
# UTT_BIN_DIR is not set.
copy:
ifdef UTT_BIN_DIR
	cp rs12 ${UTT_BIN_DIR}
endif

# These targets name actions, not files.
.PHONY: main clean copy
|
48
app/src/rs12/rs12.c
Normal file
48
app/src/rs12/rs12.c
Normal file
@ -0,0 +1,48 @@
|
||||
#include <ctype.h>
#include <stdio.h>
#include <string.h>
|
||||
|
||||
#define MAXLINE 1000
|
||||
|
||||
main()
|
||||
{
|
||||
char buf[MAXLINE+1], outbuf[MAXLINE+1];
|
||||
char form[MAXLINE+1];
|
||||
int len;
|
||||
int curpos,nextpos=0;
|
||||
int a,b;
|
||||
while(fgets(buf,MAXLINE,stdin))
|
||||
{
|
||||
int n=sscanf(buf,"%d %d",&a,&b);
|
||||
if(n==2)
|
||||
{
|
||||
nextpos=a+b;
|
||||
fputs(buf,stdout);
|
||||
}
|
||||
else
|
||||
{
|
||||
if(n==1)
|
||||
{
|
||||
curpos=a;
|
||||
sscanf(buf,"%*d %*s %s",form);
|
||||
}
|
||||
else
|
||||
{
|
||||
curpos=nextpos;
|
||||
sscanf(buf,"%*s %s",form);
|
||||
}
|
||||
|
||||
if(*form == '*')
|
||||
len=0;
|
||||
else
|
||||
{
|
||||
char *f = form;
|
||||
for(len=0; *f; ++f) if(*f != '\\') ++len;
|
||||
}
|
||||
|
||||
char *buf1=buf; while(!isalpha(*buf1)) ++buf1;
|
||||
sprintf(outbuf,"%04i %02i %s", curpos, len, buf1);
|
||||
fputs(outbuf,stdout);
|
||||
nextpos = curpos+len;
|
||||
}
|
||||
}
|
||||
}
|
15
app/src/sen-l/Makefile
Normal file
15
app/src/sen-l/Makefile
Normal file
@ -0,0 +1,15 @@
|
||||
|
||||
|
||||
# Generate the scanner with flex and compile it.
sen: sen.l
	flex -osen.c sen.l
	cc -O3 -o sen sen.c -lfl

# Copy the binary into the distribution's bin directory; does nothing when
# UTT_BIN_DIR is not set.
copy:
ifdef UTT_BIN_DIR
	cp sen ${UTT_BIN_DIR}
endif

# -f keeps "make clean" from failing when nothing has been built yet.
clean:
	rm -f sen.c sen

uninstall:

# These targets name actions, not files.
.PHONY: copy clean uninstall
|
80
app/src/sen-l/sen.l
Normal file
80
app/src/sen-l/sen.l
Normal file
@ -0,0 +1,80 @@
|
||||
%{
#include <stdio.h>
#include <string.h>

/* position and length read from the most recently matched segment */
int pos=0,len=0;

void set_position();
int print_EOS();
%}

/* Character classes.  A name is only expanded when written in braces, so
   every definition built from other definitions must use {name}. */
ul [A-Z¡ÆÊ£ÑÓ¦¯¬]
ll [a-z±æê³ñ󶿼]
l {ul}|{ll}
n [0-9]+
s [ \t]+

/* Abbreviations after which a point does not end the sentence. */
ab1 (mgr|in¿|prof|hab|doc|dyr|kier|zast)
ab2 (ul|pl|al)

abrv ({ab1}|{ab2})

/* Whole input lines (segments): "position length TYPE ..." */
SEG .*\n
N {n}{s}{n}{s}N{s}.*\n
S {n}{s}{n}{s}S{s}.*\n
P {n}{s}{n}{s}P{s}.*\n
W {n}{s}{n}{s}W{s}.*\n
UL {n}{s}{n}{s}W{s}{ul}.*\n
Cap {n}{s}{n}{s}W{s}{ul}{ll}*.*\n
POINT {n}{s}{n}{s}P{s}\.({s}.*)?\n
QMARK {n}{s}{n}{s}P{s}\?({s}.*)?\n
EXCL {n}{s}{n}{s}P{s}\!({s}.*)?\n
DASH {n}{s}{n}{s}P{s}\-({s}.*)?\n
POINTS {POINT}+

ABRV {n}{s}{n}{s}W{s}{abrv}({s}.*)?\n

EOS {POINT}|{POINTS}|{QMARK}|{EXCL}
|
||||
|
||||
|
||||
%%
|
||||
|
||||
|
||||
{N}({POINT}{N})+ ECHO; set_position();
|
||||
({UL}{POINT}{S}?)+{Cap} ECHO; set_position();
|
||||
{ABRV}{POINT} ECHO; set_position();
|
||||
|
||||
|
||||
{P}/{S}{DASH} ECHO; set_position(); print_EOS();
|
||||
{EOS}/{S}({Cap}|{P}|{N}) ECHO; set_position(); print_EOS();
|
||||
|
||||
.* ECHO; set_position();
|
||||
|
||||
<<EOF>> printf("%04d 00 EOS *\n",pos+len); exit(1);
|
||||
|
||||
%%
|
||||
|
||||
/* Prints the opening BOS marker and then runs the scanner over stdin. */
int main()
{
  printf("0000 00 BOS *\n");
  yylex();
  return 0;  /* explicit status in case yylex() ever returns */
}
|
||||
|
||||
/* Called by the scanner at end of input; a non-zero result tells it that
   no further input follows. */
int yywrap()
{
  const int no_more_input = 1;
  return no_more_input;
}
|
||||
|
||||
void set_position()
|
||||
{
|
||||
char *lastseg, *tmp;
|
||||
yytext[yyleng-1]='\0';
|
||||
if(tmp=strrchr(yytext,'\n'))
|
||||
lastseg=tmp+1;
|
||||
else
|
||||
lastseg=yytext;
|
||||
sscanf(lastseg,"%d %d", &pos, &len);
|
||||
yytext[yyleng-1]='\n';
|
||||
}
|
||||
|
||||
int print_EOS()
|
||||
{
|
||||
printf("%04d 00 EOS *\n%04d 00 BOS *\n",pos+len,pos+len);
|
||||
}
|
9
app/src/sen-nl/Makefile
Normal file
9
app/src/sen-nl/Makefile
Normal file
@ -0,0 +1,9 @@
|
||||
|
||||
# sen-nl is a script: there is nothing to build.
sen-nl:

# Copy the script into the distribution's bin directory; does nothing when
# UTT_BIN_DIR is not set.
copy:
ifdef UTT_BIN_DIR
	cp sen-nl $(UTT_BIN_DIR)
endif

clean:
|
3
app/src/sen-nl/sen-nl
Executable file
3
app/src/sen-nl/sen-nl
Executable file
@ -0,0 +1,3 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Filter (stdin to stdout) made of three sed commands:
#  1. first line: put a "POSITION 00 BOS *" line in front of it, using the
#     line's first number; the bare "t" then skips the remaining commands
#     for that line;
#  2. every line but the last: when the line has two numbers, an alphabetic
#     field and text containing a literal backslash-n, put an EOS and a BOS
#     marker line (both at that line's first number) in front of it;
#  3. last line: append a "POSITION 00 EOS *" line using its first number.
sed -r '1 s/^(([0-9]+)[ \t][0-9]+[ \t].*)$/\2 00 BOS \*\n\1/;t;$! s/(([0-9]+)[ \t][0-9]+[ \t][[:alpha:]]+[ \t]+[[:print:]]*\\n.*)$/\2 00 EOS *\n\2 00 BOS *\n\1/; $ s/^(([0-9]+) .*)$/\1\n\2 00 EOS */'
|
11
app/src/ser/Makefile
Normal file
11
app/src/ser/Makefile
Normal file
@ -0,0 +1,11 @@
|
||||
|
||||
# ser is a script: there is nothing to build.
ser:

# Copy the script into the distribution's bin directory; does nothing when
# UTT_BIN_DIR is not set.
copy:
ifdef UTT_BIN_DIR
	cp ser $(UTT_BIN_DIR)
endif

clean:

uninstall:
|
168
app/src/ser/ser
Executable file
168
app/src/ser/ser
Executable file
@ -0,0 +1,168 @@
|
||||
#!/usr/bin/perl

#package: UAM Text Tools
#component: ser (pattern search tool)
#author: Tomasz Obrêbski

use strict;
use Getopt::Long;

my $SHARE_DIR="/usr/share/utt";
my $USER_DIR="$ENV{HOME}/.utt/share";

#use lib "$ENV{HOME}/.utt/lib/perl";
#use attr;

Getopt::Long::Configure('no_ignore_case_always');

# Option defaults.
my $help=0;
my $pattern=0;
my $only_matching=0;
my $no_markers=0;
my $macros=0;
my $flextemplate=0;
my $flex=0;
my $morfield='lem';

my $configfile1="../../conf/ser.conf";
my $configfile2="../conf/ser.conf";

# Read the configuration files.  Each one is optional; the files are read
# in order, so a setting in the second one replaces the same setting from
# the first.  Lines have the form "name = value", "#" starts a comment.
foreach my $file ($configfile1, $configfile2)
{
    # three-argument open with a lexical handle; a missing file is skipped
    open(my $config, '<', $file) or next;

    while (my $line = <$config>)
    {
        chomp $line;
        $line =~ s/#.*//;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        next unless length $line;

        my ($name, $value) = split(/\s*=\s*/, $line, 2);

        if(($name eq "pattern")or($name eq "e")){
            $pattern=$value;
        }
        elsif($name eq "morph"){
            $morfield=$value;
        }
        elsif(($name eq "only-matching")or($name eq "m")){
            $only_matching=1;
        }
        elsif(($name eq "no-markers")or($name eq "M")){
            $no_markers=1;
        }
        elsif($name eq "define"){
            $macros=$value;
        }
        elsif($name eq "flex-template"){
            $flextemplate=$value;
        }
        elsif($name eq "flex"){
            $flex=1;
        }
        elsif(($name eq "help")or($name eq "h")){
            $help=1;
        }
    }

    close $config;
}
|
||||
#########################################################
|
||||
|
||||
# Command-line options; they take precedence over the configuration files.
GetOptions(
    "pattern|e=s"     => \$pattern,
    "morph=s"         => \$morfield,
    "only-matching|m" => \$only_matching,
    "no-markers|M"    => \$no_markers,
    "define=s"        => \$macros,
    "flex-template=s" => \$flextemplate,
    "flex"            => \$flex,
    "help|h"          => \$help,
);

# --help: print the usage text and stop.
if ($help)
{
    print <<'END';
Usage: ser [OPTIONS] [file ..]

Options:
--help -h Help.
--pattern=PATTERN -e PATTERN Search pattern.
--morph=STRING Field containing morphological information (default 'lem').
--define=FILE Read macrodefinitions from FILE.
--flex-template=FILE Read flex code template from FILE.
--only-matching -m Print only fragments matching PATTERN.
--no-markers -M Do not print BOM and EOM markers [TODO].
--flex Print only the generated flex code and exit.
END
    exit 0;
}
|
||||
|
||||
die("$0: no pattern given.\n") unless $pattern;

# Locate the flex template: an explicit setting wins, then the user's
# directory, then the shared directory.
die("$0: flex template file not found") unless
    $flextemplate or
    -e "$USER_DIR/ser.l.template" and $flextemplate="$USER_DIR/ser.l.template" or
    -e "$SHARE_DIR/ser.l.template" and $flextemplate="$SHARE_DIR/ser.l.template";

# Locate the macro definitions the same way.
die("$0: macro file not found") unless
    $macros or
    -e "$USER_DIR/terms.m4" and $macros="$USER_DIR/terms.m4" or
    -e "$SHARE_DIR/terms.m4" and $macros="$SHARE_DIR/terms.m4";


#$pattern =~ s/cat\(([^)]+)\)/'cat('.pre($1).')'/ge;
# quoting escaped commas (marked as NOT WORKING by the author)
$pattern =~ s/\\,/\\`\\`\\,''/g;

# protecting backslash
$pattern =~ s/\\/\\\\\\/g;

# discarding spaces
$pattern =~ s/\s+/\\`'/g; #`

# Expand the macros with m4.
my $flexpattern = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT=\\\\n --define=MORFIELD=$morfield $macros - 2>/dev/null`;

die("Incorrect pattern (m4).") if $? >> 8;


chomp $flexpattern;

# <> expansion: each <...> constraint is translated by tag2re
$flexpattern =~ s/<([^>]+)>/`echo $1 | tag2re`/ge;

# restricting the value of the . special symbol
$flexpattern =~ s/\./[^ \\t\\n\\r\\f]/g;

# perl-like shortcuts for character classes
# perl exact
$flexpattern =~ s/\\s/[ \\t]/g;
$flexpattern =~ s/\\S/[^ \\t\\n\\r\\f]/g;
$flexpattern =~ s/\\d/[0-9]/g;
$flexpattern =~ s/\\D/[^0-9 \\t\\n\\r\\f]/g;
$flexpattern =~ s/\\w/[a-z±æê³ñ󶼿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_]/g;
$flexpattern =~ s/\\W/[^a-z±æê³ñ󶼿A-Z¡ÆÊ£ÑÓ¦¬¯0-9_ \\t\\n\\r\\f]/g;
# extensions
$flexpattern =~ s/\\l/[a-z±æê³ñ󶼿]/g; #lowercase letter
$flexpattern =~ s/\\L/[A-Z¡ÆÊ£ÑÓ¦¬¯]/g; #upercase letter

# protecting slash
$flexpattern =~ s/\//\\\//g;

my $defaultaction = ($only_matching) ? '' : 'ECHO';

# TODO: the intermediate files should eventually live in a temporary
# directory.
my @generated = map { "$USER_DIR/ser.$_" } qw(l c executable);

# Instantiate the flex template.
system("m4 \"--define=PATTERN=$flexpattern\" \"--define=DEFAULTACTION=$defaultaction\" $flextemplate > $USER_DIR/ser.l") == 0
    or do { unlink @generated; die("$0: cannot generate the flex source (m4).\n"); };

# --flex: show the generated scanner source instead of running it.
if($flex)
{
    system "cat $USER_DIR/ser.l";
    exit 0;
}

# Build the scanner; stop when a step fails rather than running a missing
# or stale executable.
system("flex -o$USER_DIR/ser.c $USER_DIR/ser.l") == 0
    or do { unlink @generated; die("$0: flex failed.\n"); };
system("cc -O3 -o $USER_DIR/ser.executable $USER_DIR/ser.c -lfl") == 0
    or do { unlink @generated; die("$0: cc failed.\n"); };

system "$USER_DIR/ser.executable";

# Remove the intermediate files.  unlink is used in place of
# "rm -f ...{l,c,executable}": system() runs the command through /bin/sh,
# which need not support brace expansion.
unlink @generated;
|
||||
|
8
app/src/tags/Makefile
Normal file
8
app/src/tags/Makefile
Normal file
@ -0,0 +1,8 @@
|
||||
# The tag2re programs are scripts: there is nothing to build.
main:

# Copy every *.tag2re program into the distribution's tags directory; does
# nothing when UTT_TAGS_DIR is not set.
copy:
ifdef UTT_TAGS_DIR
	cp *.tag2re $(UTT_TAGS_DIR)
endif

clean:
|
5
app/src/tags/README
Normal file
5
app/src/tags/README
Normal file
@ -0,0 +1,5 @@
|
||||
In this directory files specific to different tag formats are stored.
|
||||
|
||||
TAGSET.tag2re are command-line programs which translate a tag constraint
|
||||
specification into a character-level regular expression matching
|
||||
all tags in the TAGSET format meeting the specified constraint.
|
1
app/src/tags/ipi.tag2re
Executable file
1
app/src/tags/ipi.tag2re
Executable file
@ -0,0 +1 @@
|
||||
#TODO
|
83
app/src/tags/uam.tag2re
Executable file
83
app/src/tags/uam.tag2re
Executable file
@ -0,0 +1,83 @@
|
||||
#!/usr/bin/perl

use locale;

# Building blocks of a tag constraint, from the smallest unit up:
#   value     - one lowercase letter, digit or one of + ? ! * -, or <...>
#   attribute - a run of uppercase letters
#   category  - a part-of-speech name, optionally followed by "/" and a
#               list of attribute-values groups
our $pos_re = qr/(?:[[:upper:]]+)/;
our $attr_re = qr/(?:[[:upper:]]+)/;
our $val_re = qr/(?:[[:lower:][:digit:]+?!*-]|<[^>\n]+>)/;
our $av_re = qr/(?:$attr_re$val_re+)/;
our $avlist_re = qr/(?:$av_re+)/;
our $cat_re = qr/(?:$pos_re(?:\/$avlist_re)?)/;

# One constraint is read from the input (first line only) and its regular
# expression is printed without a trailing newline.
my $input = <>;
chomp $input;

print pre($input);
|
||||
|
||||
# Parses a tag constraint "CAT/ATTRvals..." into [ $cat, \%avs ], where
# %avs maps each attribute name to a hash whose keys are its values.
# An attribute that has '*' among its values is left out altogether.
# A value written as <x> with a single lowercase letter is stored as x.
sub parse ($)
{
    my $dstr = shift;
    my ($cat, $attrlist) = split '/', $dstr;
    my %avs;

  ATTR:
    while ($attrlist =~ /($attr_re)($val_re+)/g)
    {
        my $attrstr = $1;
        my $valstr = $2;

        # %vals is a new hash on every pass; the reference stored below
        # keeps it alive after the loop body ends.
        my %vals;
        foreach my $val ($valstr =~ /$val_re/g)
        {
            next ATTR if $val eq '*';
            $val =~ s/^<([[:lower:]])>$/$1/;
            $vals{$val} = 1;
        }

        $avs{$attrstr} = \%vals;
    }

    return [$cat, \%avs];
}
|
||||
|
||||
# Inverse of parse(): turns ( $cat, \%avs ) back into a string.
# Attributes are written in sorted order, each followed by its sorted
# values; the "/" separator appears only when there is an attribute.
sub unparse (\@)
{
    my ($cat, $avs) = @{ $_[0] };

    my @groups = map { $_ . join('', sort keys %{ $avs->{$_} }) }
                 sort keys %$avs;

    return @groups ? $cat . '/' . join('', @groups) : $cat;
}
|
||||
|
||||
# Brings a tag constraint to canonical form by parsing it and writing it
# back, which sorts the attributes and the values of each attribute.
sub canonize ($)
{
    my ($dstr) = @_;
    my $parsed = parse($dstr);
    return unparse @$parsed;
}
|
||||
|
||||
# Translates a tag constraint into a regular expression (returned as a
# string) that matches tags satisfying the constraint.
#
# The result is the category name followed by an optional "/..." part in
# which every constrained attribute must appear with at least one of the
# listed values; other attribute-value groups may stand before, between
# and after them.
sub pre
{
    my $attr_res = '[[:upper:]]+';
    my $val_res = '[[:lower:][:digit:]+?!*-]|<[^>\n[:cntrl:]]+>';
    my $av_res = "$attr_res($val_res)+";

    my $pat = canonize(shift);
    my ($pos,$avlist) = split /\//, $pat;
    $avlist = '' unless defined $avlist;   # constraint without attributes

    my $ret = $pos.'(\/';

    # $val_res is an alternation, so it has to be grouped before "+" is
    # applied; without the group the "+" binds to the last alternative
    # only and just one value per attribute would be captured.
    while ($avlist =~ /($attr_res)((?:$val_res)+)/g)
    {
        my $attr = $1;
        my $vals = $2;
        my $alts = "($val_res)*(".join('|',($vals =~ /$val_res/g)).")($val_res)*";
        $ret .= "($av_res)*$attr$alts";
    }

    $ret .= "($av_res)*)?";
    return $ret;
}
|
||||
|
37
app/src/tok.c/Makefile
Normal file
37
app/src/tok.c/Makefile
Normal file
@ -0,0 +1,37 @@
|
||||
PAR=-Wno-deprecated -O3
PAR2=-c -Wno-deprecated -O3
LIB_PATH=../lib
# COMMON_PATH is used by the common.o rule but was never defined here.
# NOTE(review): ../common is assumed because the cmdline.ggo rule below
# reads ../common/cmdline_common.ggo -- confirm the directory.
COMMON_PATH ?= ../common
CMDLINE_FILE='"../tok.c/cmdline.h"'


tok: tok.o cmdline.c common_tok.o common.o
	g++ $(PAR) tok.c cmdline.c common.o common_tok.o -o tok

tok.o: tok.c cmdline.h
	g++ $(PAR2) tok.c

common_tok.o: cmdline.h common_tok.cc common_tok.h
	g++ $(PAR2) common_tok.cc

common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc\
	$(COMMON_PATH)/common.h
	g++ $(PAR2) -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc

# The option description is the tok-specific part followed by the common one.
cmdline.ggo: cmdline_tok.ggo ../common/cmdline_common.ggo
	cat cmdline_tok.ggo ../common/cmdline_common.ggo > cmdline.ggo

cmdline.c cmdline.h: cmdline.ggo
	gengetopt -i cmdline.ggo --conf-parser


# No trailing "/" after the directory: if the exported UTT_BIN_DIR value
# ends in whitespace, "${UTT_BIN_DIR}/" expands to two words, the second
# being "/".
copy:
ifdef UTT_BIN_DIR
	cp tok ${UTT_BIN_DIR}
endif


# -f keeps the clean targets from failing when there is nothing to remove.
clean: clean.cmdline
	rm -f *.o

clean.cmdline:
	rm -f cmdline.*

# These targets name actions, not files.
.PHONY: copy clean clean.cmdline
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user