diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..a30b53f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,129 @@
+include config.mak
+
+CUR_DIR := $(CURDIR)
+SRC_DIR := $(CUR_DIR)/src
+
+TARGETS = components configuration share libraries
+
+ifeq ($(BUILD_DOC), yes)
+  TARGETS += documentation
+endif
+
+.PHONY: all
+all: $(TARGETS)
+
+# ------------------------------------------------------------------
+# main section: build src/lib, then every directory in $(COMPONENTS)
+# ------------------------------------------------------------------
+.PHONY: components
+components:
+	$(MAKE) -C $(SRC_DIR)/lib
+# stop at the first component whose sub-make fails
+	@for cmp in $(COMPONENTS); do \
+		$(MAKE) -C $(SRC_DIR)/$$cmp || exit 1; \
+	done
+
+.PHONY: documentation
+documentation:
+	$(MAKE) -C $(CUR_DIR)/doc
+
+.PHONY: configuration
+configuration: # nothing to build
+
+.PHONY: libraries
+libraries: # nothing to build
+
+.PHONY: share
+share: # nothing to build
+
+# ------------------------------------------------------------------
+# cleanup section
+# ------------------------------------------------------------------
+.PHONY: clean
+clean: clean_components clean_documentation
+	@echo "All files cleaned successfully!"
+
+.PHONY: clean_components
+clean_components:
+	@status=0; for dir in $(COMPONENTS) lib; do \
+		$(MAKE) -C $(SRC_DIR)/$$dir clean || status=1; \
+	done; exit $$status
+# every directory is attempted; the rule fails if any sub-make failed
+
+.PHONY: clean_documentation
+clean_documentation:
+	$(MAKE) -C $(CUR_DIR)/doc clean
+
+# ------------------------------------------------------------------
+# install section: each install_* rule needs the directories first
+# ------------------------------------------------------------------
+.PHONY: install
+install: all install_dirs install_components install_configuration install_libraries install_documentation install_share
+
+.PHONY: install_dirs
+install_dirs:
+	install -d $(PREFIX) \
+		$(BIN_DIR) \
+		$(CONF_DIR) \
+		$(LANG_DIR) \
+		$(LIB_DIR) \
+		$(DOC_DIR) \
+		$(SHARE_DIR)/info
+
+.PHONY: install_components
+install_components: components install_dirs
+	@for cmp in $(COMPONENTS); do \
+		$(MAKE) -C $(SRC_DIR)/$$cmp install || exit 1; \
+	done
+
+.PHONY: install_configuration
+install_configuration: configuration install_dirs
+	$(MAKE) -C $(CUR_DIR)/conf install
+
+.PHONY: install_libraries
+install_libraries: libraries install_dirs
+	$(MAKE) -C $(CUR_DIR)/lib install
+
+.PHONY: install_documentation
+install_documentation: documentation install_dirs
+	$(MAKE) -C $(CUR_DIR)/doc install
+
+.PHONY: install_share
+install_share: share install_dirs
+	$(MAKE) -C $(CUR_DIR)/share install
+
+# ------------------------------------------------------------------
+# uninstall section
+# ------------------------------------------------------------------
+
+.PHONY: uninstall
+uninstall: uninstall_share uninstall_documentation uninstall_libraries uninstall_configuration uninstall_components uninstall_dirs
+
+.PHONY: uninstall_components
+uninstall_components:
+	@status=0; for cmp in $(COMPONENTS); do \
+		$(MAKE) -C $(SRC_DIR)/$$cmp uninstall || status=1; \
+	done; exit $$status
+
+.PHONY: uninstall_configuration
+uninstall_configuration:
+	$(MAKE) -C $(CUR_DIR)/conf uninstall
+
+.PHONY: uninstall_libraries
+uninstall_libraries:
+	$(MAKE) -C $(CUR_DIR)/lib uninstall
+
+.PHONY: uninstall_documentation
+uninstall_documentation:
+	$(MAKE) -C $(CUR_DIR)/doc uninstall
+
+.PHONY: uninstall_share
+uninstall_share:
+	$(MAKE) -C $(CUR_DIR)/share uninstall
+
+.PHONY: uninstall_dirs
+uninstall_dirs: uninstall_configuration uninstall_documentation uninstall_share uninstall_libraries
+	rmdir $(CONF_DIR)
+	rmdir $(DOC_DIR)
+	rmdir $(LANG_DIR)
+	rmdir $(LIB_DIR)
diff --git a/conf/Makefile b/conf/Makefile
new file mode 100644
index 0000000..cd9f265
--- /dev/null
+++ b/conf/Makefile
@@ -0,0 +1,35 @@
+include ../config.mak
+
+.PHONY: install
+install: # does nothing when CONF_DIR is unset or empty
+ifdef CONF_DIR
+	install -m 0644 compiledic.conf $(CONF_DIR)
+	install -m 0644 cor.conf $(CONF_DIR)
+	install -m 0644 dgc.conf $(CONF_DIR)
+	install -m 0644 dgp.conf $(CONF_DIR)
+	install -m 0644 gph.conf $(CONF_DIR)
+	install -m 0644 grp.conf $(CONF_DIR)
+	install -m 0644 gue.conf $(CONF_DIR)
+	install -m 0644 kor.conf $(CONF_DIR)
+	install -m 0644 lem.conf $(CONF_DIR)
+	install -m 0644 mar.conf $(CONF_DIR)
+	install -m 0644 ser.conf $(CONF_DIR)
+	install -m 0644 utt.conf $(CONF_DIR)
+endif
+
+.PHONY: uninstall
+uninstall: # does nothing when CONF_DIR is unset or empty
+ifdef CONF_DIR
+	rm $(CONF_DIR)/compiledic.conf
+	rm $(CONF_DIR)/cor.conf
+	rm $(CONF_DIR)/dgc.conf
+	rm $(CONF_DIR)/dgp.conf
+	rm $(CONF_DIR)/gph.conf
+	rm $(CONF_DIR)/grp.conf
+	rm $(CONF_DIR)/gue.conf
+	rm $(CONF_DIR)/kor.conf
+	rm $(CONF_DIR)/lem.conf
+	rm $(CONF_DIR)/mar.conf
+	rm $(CONF_DIR)/ser.conf
+	rm $(CONF_DIR)/utt.conf
+endif
diff --git a/dist/Makefile b/dist/Makefile
new file mode 100644
index 0000000..a5093d6
--- /dev/null
+++ b/dist/Makefile
@@ -0,0 +1,46 @@
+
+# variables exported to the packaging sub-makes
+
+# path where all necessary files are placed
+# (they will be processed to make the distribution)
+export UTT_DIST_DIR=$(UTT_DIR)
+# path where the distribution package will be placed
+export UTT_DIST_OUTPUT=$(UTT_DIR)/..
+# temporary path used while making the distribution
+export UTT_DIST_TMP := $(CURDIR)/dist_tmp
+
+
+# -----------------------------------------------------------
+# default task: display the available packaging targets
+.PHONY: default
+default:
+	@echo "Using: make tarball|rpm|deb"
+
+
+# -----------------------------------------------------------
+# this task should compile utt (if necessary) and create tar.gz version
+.PHONY: tarball
+tarball:
+	$(MAKE) -C tarball
+
+# -----------------------------------------------------------
+# this task should compile utt (if necessary) and create rpm version
+.PHONY: rpm
+rpm:
+	@#we build rpm (see spec/README for details)
+	$(MAKE) -C spec
+
+# -----------------------------------------------------------
+# this task should compile utt (if necessary) and create deb version
+.PHONY: deb
+deb:
+	@#we build deb (see deb/README for details)
+	$(MAKE) -C deb
+
+# -----------------------------------------------------------
+# this task should remove compiled files and directories
+.PHONY: clean
+clean:
+	# finally the line below should be uncomment
+	rm -fr ${UTT_DIST_TMP}
+
diff --git a/dist/common/check_dependenties.pl b/dist/common/check_dependenties.pl
new file mode 100644
index 0000000..ef08c92
--- /dev/null
+++ b/dist/common/check_dependenties.pl
@@ -0,0 +1,15 @@
+# For each program name read (one per line), print where `which` finds it.
+while(<>) {
+	chomp;
+	$app = $_;
+	$path = `which $app 2>/dev/null`;
+	$err = $?;
+	print $app.": ";
+	if($err != 0) {
+		print "failed ($err)\n";
+	}
+	else {
+		print $path;
+	}
+}
+
diff --git a/dist/common/description.def b/dist/common/description.def
new file mode 100644
index 0000000..4c8c701
--- /dev/null
+++ b/dist/common/description.def
@@ -0,0 +1 @@
+I put here some description.
\ No newline at end of file
diff --git a/dist/common/description.pl.def b/dist/common/description.pl.def
new file mode 100644
index 0000000..07382bb
--- /dev/null
+++ b/dist/common/description.pl.def
@@ -0,0 +1 @@
+Tu umieszczę opis po polsku.
\ No newline at end of file
diff --git a/dist/common/find_perl_deps.pl b/dist/common/find_perl_deps.pl
new file mode 100644
index 0000000..deb79c8
--- /dev/null
+++ b/dist/common/find_perl_deps.pl
@@ -0,0 +1,29 @@
+# Reads "file: text" lines (grep -r output) and prints the name of every
+# program invoked inside a backtick command, one name per line.
+while(<>) {
+	chomp;
+	# first, strip the leading source file name
+	if($_ =~ /^[^:]*\:\s*(.*)$/) {
+		my $line = $1;
+#		print "TEXT: $line\n";
+		# now keep only the command between the backticks
+		if($line =~ /\`\s*(.+)\s*\`/) {
+			my $cmd = $1;
+#			print "CMD: $cmd\n";
+			# now split the pipeline into its stages
+			my @progs = split(/\s*\|\s*/, $cmd);
+			foreach (@progs) {
+#				print "$_\n";
+				# cut off the program's arguments and skip stages that
+				# are not programs (--replace, $tmpfile_x, /g, \\, etc.)
+				if($_ =~ /^([^\$\\\/\s\']+)(\s.*)?$/) {
+					my $app = $1;
+#					print "APP: $app\n";
+#					my $res = `which $app`;
+#					print "WYNIK: $res\n";
+					print "$app\n";
+				}
+			} # foreach
+		}
+	}
+}
diff --git a/dist/common/find_rpm_deps.sh b/dist/common/find_rpm_deps.sh
new file mode 100755
index 0000000..d870429
--- /dev/null
+++ b/dist/common/find_rpm_deps.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+# check that an argument was given (exit status must be 0-255, so not -1)
+if test $# -lt 1; then echo RPM file name expected!; exit 1; fi
+
+# check that it is a regular file, then that its name ends in "rpm"
+if test -f "$1"
+then
+	case "$1" in
+	*rpm)
+		rpm -q -R -p "$1" ;;
+	*)	echo "It's not a RPM file!" ;;
+	esac
+else echo RPM file not found!
+fi
+
diff --git a/dist/common/make_deps.sh b/dist/common/make_deps.sh
new file mode 100755
index 0000000..6bf55c7
--- /dev/null
+++ b/dist/common/make_deps.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+grep -r -e "\`" ../../src | grep -v -e "\.svn\/" | perl find_perl_deps.pl | sort | uniq > dep_list.txt
diff --git a/dist/common/prepare_conf.sh b/dist/common/prepare_conf.sh
new file mode 100755
index 0000000..7684a5b
--- /dev/null
+++ b/dist/common/prepare_conf.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+if test 3 -ne $#
+then
+	echo "Usage: " `basename "$0"` src_conf_dir dest_conf_dir replacement
+else
+	POLEC="s/PATH_PREFIX/$3/g"
+
+	for FN in `ls 
$1/*.conf`
+	do
+		BN=`basename "${FN}"`
+		sed "${POLEC}" "${FN}" > "$2/${BN}"
+	done
+fi
diff --git a/dist/common/release.def b/dist/common/release.def
new file mode 100644
index 0000000..d8263ee
--- /dev/null
+++ b/dist/common/release.def
@@ -0,0 +1 @@
+2
\ No newline at end of file
diff --git a/dist/common/requirements.def b/dist/common/requirements.def
new file mode 100644
index 0000000..e69de29
diff --git a/dist/common/version.def b/dist/common/version.def
new file mode 100644
index 0000000..9a7d84f
--- /dev/null
+++ b/dist/common/version.def
@@ -0,0 +1 @@
+0.9
\ No newline at end of file
diff --git a/dist/deb/Makefile b/dist/deb/Makefile
new file mode 100644
index 0000000..fd0b47e
--- /dev/null
+++ b/dist/deb/Makefile
@@ -0,0 +1,86 @@
+# builds the deb package; the first rule (default) is the entry point
+
+DIR := $(CURDIR)
+
+ifndef UTT_DIST_DIR
+  UTT_DIST_DIR=${DIR}
+endif
+
+ifndef UTT_DIST_OUTPUT
+  UTT_DIST_OUTPUT=${DIR}
+endif
+
+# package properties (version and release are read once, at parse time)
+_PRODUCT_NAME := utt
+_UTT_VER := $(shell cat ../common/version.def)
+_UTT_REL := $(shell cat ../common/release.def)
+_DEB_FROOT := $(DIR)/deb_root
+_UTT_DIR := ${_DEB_FROOT}/usr/local/$(_PRODUCT_NAME).$(_UTT_VER)-$(_UTT_REL)
+
+.PHONY: default
+default: make_control make_postinst make_prerm
+	# first, we prepare some directory structure
+	mkdir -p $(_DEB_FROOT)/DEBIAN
+	mkdir -p $(_UTT_DIR)
+
+	# next, we copy deb package files
+	mv ./control $(_DEB_FROOT)/DEBIAN/
+	mv ./postinst $(_DEB_FROOT)/DEBIAN/
+	mv ./prerm $(_DEB_FROOT)/DEBIAN/
+	# DEBIAN/ must stay in place: dpkg-deb --build reads it and writes
+	# control.tar.gz itself, and maintainer scripts have to be executable
+	chmod 755 $(_DEB_FROOT)/DEBIAN/postinst $(_DEB_FROOT)/DEBIAN/prerm
+
+	# we copy all necessary files (binaries)
+	cp -r ${UTT_DIST_DIR}/* ${_UTT_DIR}/
+	cp ./changelog ${_UTT_DIR}/share/doc/$(_PRODUCT_NAME)/
+#	gzip --best $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/changelog
+	cp ./changelog.Debian $(_UTT_DIR)/share/doc/$(_PRODUCT_NAME)/
+#	gzip --best $(_DEB_ROOT)/usr/share/doc/$(_PRODUCT_NAME)/changelog.Debian
+	cp ../files/* ${_UTT_DIR}/share/doc/${_PRODUCT_NAME}/
+	cp ../common/utt_make_config.pl 
${_UTT_DIR}/bin/
+	chmod 755 ${_UTT_DIR}/bin/utt_make_config.pl
+
+
+
+#	# next we make man/doc archives
+#	gzip --best $(_DEB_ROOT)/usr/share/man/man1/$(_PRODUCT_NAME).1
+
+	find $(_DEB_FROOT) -type d -exec chmod 755 {} + # this is necessary on Debian Woody, don't ask me why
+
+	# finally, we build deb package
+	fakeroot dpkg-deb --build $(_DEB_FROOT)
+	mv $(_DEB_FROOT).deb $(_PRODUCT_NAME)_$(_UTT_VER)-$(_UTT_REL).all.deb
+	rm -rf ${_DEB_FROOT}
+
+
+
+.PHONY: make_control
+make_control: # writes ./control, which the default rule moves into DEBIAN/
+	echo "Package: $(_PRODUCT_NAME)" > control
+	echo "Version: $(_UTT_VER)" >> control
+	echo "Section: web" >> control
+	echo "Priority: optional" >> control
+	echo "Architecture: all" >> control
+	echo "Essential: no" >> control
+
+	echo "Depends: " >> control
+#	here we read this information from file ../common/requirements.def
+	#libwww-perl, acme-base (>= 1.2) <= package requirements
+
+	echo "Pre-Depends: perl" >> control
+
+	echo "Maintainer: Adam Mickiewicz University" >> control
+	echo "Provides: $(_PRODUCT_NAME)" >> control
+# printf instead of the non-portable "echo -n"; it also ends the field with a newline
+	printf 'Description: %s\n' "$$(cat ../common/description.def)" >> control
+
+.PHONY: make_postinst
+make_postinst: # NOTE(review): _INSTALL_DIR is not defined in this Makefile, and the script copied above is utt_make_config.pl - confirm the intended path
+	echo "#!/bin/sh" > postinst
+	echo "$(_INSTALL_DIR)/create_utt_config.pl" >> postinst
+	echo "rm -f $(_INSTALL_DIR)/create_utt_config.pl" >> postinst
+
+.PHONY: make_prerm
+make_prerm: # writes an empty prerm script
+	echo "#!/bin/sh" > prerm
diff --git a/dist/deb/README b/dist/deb/README
new file mode 100644
index 0000000..771b11a
--- /dev/null
+++ b/dist/deb/README
@@ -0,0 +1,3 @@
+This directory contains files necessary to create deb package.
+ +apt-get install dpkg-dev debhelper devscripts fakeroot linda diff --git a/dist/files/COPYRIGHT b/dist/files/COPYRIGHT new file mode 100644 index 0000000..5e397a5 --- /dev/null +++ b/dist/files/COPYRIGHT @@ -0,0 +1,8 @@ +Copyright (C) 2005 - 2008 Tomasz Obrebski, Michal Stolarski, Justyna Walkowska, Pawel Konieczka + +Permission is granted to copy, distribute and/or modify this document +under the terms of the GNU Free Documentation License, Version 1.2 +or any later version published by the Free Software Foundation; +with no Invariant Sections, no Front-Cover Texts, and no Back-Cover +Texts. A copy of the license is included in the section entitled ‘‘GNU +Free Documentation License’’. diff --git a/dist/files/LICENCE b/dist/files/LICENCE new file mode 100644 index 0000000..6abc7a1 --- /dev/null +++ b/dist/files/LICENCE @@ -0,0 +1,264 @@ +GNU Free Documentation License +Version 1.2, November 2002 +Copyright (c) 2000,2001,2002 Free Software Foundation, Inc. +51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA + +Everyone is permitted to copy and distribute verbatim copies +of this license document, but changing it is not allowed. + + 0. PREAMBLE + The purpose of this License is to make a manual, textbook, or other functional and + useful document free in the sense of freedom: to assure everyone the effective freedom + to copy and redistribute it, with or without modifying it, either commercially or noncommercially. + Secondarily, this License preserves for the author and publisher a way + to get credit for their work, while not being considered responsible for modifications + made by others. + This License is a kind of “copyleft”, which means that derivative works of the document + must themselves be free in the same sense. It complements the GNU General Public + License, which is a copyleft license designed for free software. 
+ We have designed this License in order to use it for manuals for free software, because + free software needs free documentation: a free program should come with manuals + providing the same freedoms that the software does. But this License is not limited to + software manuals; it can be used for any textual work, regardless of subject matter or + whether it is published as a printed book. We recommend this License principally for + works whose purpose is instruction or reference. + 1. APPLICABILITY AND DEFINITIONS + This License applies to any manual or other work, in any medium, that contains a + notice placed by the copyright holder saying it can be distributed under the terms + of this License. Such a notice grants a world-wide, royalty-free license, unlimited in + duration, to use that work under the conditions stated herein. The “Document”, + below, refers to any such manual or work. Any member of the public is a licensee, and + is addressed as “you”. You accept the license if you copy, modify or distribute the work + in a way requiring permission under copyright law. + A “Modified Version” of the Document means any work containing the Document or + a portion of it, either copied verbatim, or with modifications and/or translated into + another language. + A “Secondary Section” is a named appendix or a front-matter section of the Document + that deals exclusively with the relationship of the publishers or authors of the Document + to the Document’s overall subject (or to related matters) and contains nothing that + could fall directly within that overall subject. (Thus, if the Document is in part a + textbook of mathematics, a Secondary Section may not explain any mathematics.) The + relationship could be a matter of historical connection with the subject or with related + matters, or of legal, commercial, philosophical, ethical or political position regarding + them. 
+ The “Invariant Sections” are certain Secondary Sections whose titles are designated, as + being those of Invariant Sections, in the notice that says that the Document is released + under this License. If a section does not fit the above definition of Secondary then it is + not allowed to be designated as Invariant. The Document may contain zero Invariant + Sections. If the Document does not identify any Invariant Sections then there are none. + The “Cover Texts” are certain short passages of text that are listed, as Front-Cover + Texts or Back-Cover Texts, in the notice that says that the Document is released under + this License. A Front-Cover Text may be at most 5 words, and a Back-Cover Text may + be at most 25 words. + A “Transparent” copy of the Document means a machine-readable copy, represented + in a format whose specification is available to the general public, that is suitable for + revising the document straightforwardly with generic text editors or (for images composed + of pixels) generic paint programs or (for drawings) some widely available drawing + editor, and that is suitable for input to text formatters or for automatic translation to + a variety of formats suitable for input to text formatters. A copy made in an otherwise + Transparent file format whose markup, or absence of markup, has been arranged to + thwart or discourage subsequent modification by readers is not Transparent. An image + format is not Transparent if used for any substantial amount of text. A copy that is + not “Transparent” is called “Opaque”. + Examples of suitable formats for Transparent copies include plain ascii without + markup, Texinfo input format, LaTEX input format, SGML or XML using a publicly + available DTD, and standard-conforming simple HTML, PostScript or PDF designed + for human modification. Examples of transparent image formats include PNG, XCF + and JPG. 
Opaque formats include proprietary formats that can be read and edited + only by proprietary word processors, SGML or XML for which the DTD and/or + processing tools are not generally available, and the machine-generated HTML, + PostScript or PDF produced by some word processors for output purposes only. + The “Title Page” means, for a printed book, the title page itself, plus such following + pages as are needed to hold, legibly, the material this License requires to appear in the + title page. For works in formats which do not have any title page as such, “Title Page” + means the text near the most prominent appearance of the work’s title, preceding the + beginning of the body of the text. + A section “Entitled XYZ” means a named subunit of the Document whose title either + is precisely XYZ or contains XYZ in parentheses following text that translates XYZ in + another language. (Here XYZ stands for a specific section name mentioned below, such + as “Acknowledgements”, “Dedications”, “Endorsements”, or “History”.) To “Preserve + the Title” of such a section when you modify the Document means that it remains a + section “Entitled XYZ” according to this definition. + The Document may include Warranty Disclaimers next to the notice which states that + this License applies to the Document. These Warranty Disclaimers are considered to + be included by reference in this License, but only as regards disclaiming warranties: + any other implication that these Warranty Disclaimers may have is void and has no + effect on the meaning of this License. + 2. VERBATIM COPYING + You may copy and distribute the Document in any medium, either commercially or + noncommercially, provided that this License, the copyright notices, and the license + notice saying this License applies to the Document are reproduced in all copies, and + that you add no other conditions whatsoever to those of this License. 
You may not use + technical measures to obstruct or control the reading or further copying of the copies + you make or distribute. However, you may accept compensation in exchange for copies. + If you distribute a large enough number of copies you must also follow the conditions + in section 3. + You may also lend copies, under the same conditions stated above, and you may publicly + display copies. + 3. COPYING IN QUANTITY + If you publish printed copies (or copies in media that commonly have printed covers) of + the Document, numbering more than 100, and the Document’s license notice requires + Cover Texts, you must enclose the copies in covers that carry, clearly and legibly, all + these Cover Texts: Front-Cover Texts on the front cover, and Back-Cover Texts on + the back cover. Both covers must also clearly and legibly identify you as the publisher + of these copies. The front cover must present the full title with all words of the title + equally prominent and visible. You may add other material on the covers in addition. + Copying with changes limited to the covers, as long as they preserve the title of the + Document and satisfy these conditions, can be treated as verbatim copying in other + respects. + If the required texts for either cover are too voluminous to fit legibly, you should put + the first ones listed (as many as fit reasonably) on the actual cover, and continue the + rest onto adjacent pages. + If you publish or distribute Opaque copies of the Document numbering more than 100, + you must either include a machine-readable Transparent copy along with each Opaque + copy, or state in or with each Opaque copy a computer-network location from which + the general network-using public has access to download using public-standard network + protocols a complete Transparent copy of the Document, free of added material. 
If + you use the latter option, you must take reasonably prudent steps, when you begin + distribution of Opaque copies in quantity, to ensure that this Transparent copy will + remain thus accessible at the stated location until at least one year after the last time + you distribute an Opaque copy (directly or through your agents or retailers) of that + edition to the public. + It is requested, but not required, that you contact the authors of the Document well + before redistributing any large number of copies, to give them a chance to provide you + with an updated version of the Document. + 4. MODIFICATIONS + You may copy and distribute a Modified Version of the Document under the conditions + of sections 2 and 3 above, provided that you release the Modified Version under precisely + this License, with the Modified Version filling the role of the Document, thus licensing + distribution and modification of the Modified Version to whoever possesses a copy of + it. In addition, you must do these things in the Modified Version: + A. Use in the Title Page (and on the covers, if any) a title distinct from that of the + Document, and from those of previous versions (which should, if there were any, + be listed in the History section of the Document). You may use the same title as + a previous version if the original publisher of that version gives permission. + B. List on the Title Page, as authors, one or more persons or entities responsible for + authorship of the modifications in the Modified Version, together with at least five + of the principal authors of the Document (all of its principal authors, if it has fewer + than five), unless they release you from this requirement. + C. State on the Title page the name of the publisher of the Modified Version, as the + publisher. + D. Preserve all the copyright notices of the Document. + E. Add an appropriate copyright notice for your modifications adjacent to the other + copyright notices. + F. 
Include, immediately after the copyright notices, a license notice giving the public + permission to use the Modified Version under the terms of this License, in the form + shown in the Addendum below. + G. Preserve in that license notice the full lists of Invariant Sections and required Cover + Texts given in the Document’s license notice. + H. Include an unaltered copy of this License. + I. Preserve the section Entitled “History”, Preserve its Title, and add to it an item + stating at least the title, year, new authors, and publisher of the Modified Version + as given on the Title Page. If there is no section Entitled “History” in the Document, + create one stating the title, year, authors, and publisher of the Document + as given on its Title Page, then add an item describing the Modified Version as + stated in the previous sentence. + J. Preserve the network location, if any, given in the Document for public access to + a Transparent copy of the Document, and likewise the network locations given in + the Document for previous versions it was based on. These may be placed in the + “History” section. You may omit a network location for a work that was published + at least four years before the Document itself, or if the original publisher of the + version it refers to gives permission. + K. For any section Entitled “Acknowledgements” or “Dedications”, Preserve the Title + of the section, and preserve in the section all the substance and tone of each of the + contributor acknowledgements and/or dedications given therein. + L. Preserve all the Invariant Sections of the Document, unaltered in their text and + in their titles. Section numbers or the equivalent are not considered part of the + section titles. + M. Delete any section Entitled “Endorsements”. Such a section may not be included + in the Modified Version. + N. Do not retitle any existing section to be Entitled “Endorsements” or to conflict in + title with any Invariant Section. + O. 
Preserve any Warranty Disclaimers. + If the Modified Version includes new front-matter sections or appendices that qualify + as Secondary Sections and contain no material copied from the Document, you may at + your option designate some or all of these sections as invariant. To do this, add their + titles to the list of Invariant Sections in the Modified Version’s license notice. These + titles must be distinct from any other section titles. + You may add a section Entitled “Endorsements”, provided it contains nothing but + endorsements of your Modified Version by various parties—for example, statements of + peer review or that the text has been approved by an organization as the authoritative + definition of a standard. + You may add a passage of up to five words as a Front-Cover Text, and a passage of up + to 25 words as a Back-Cover Text, to the end of the list of Cover Texts in the Modified + Version. Only one passage of Front-Cover Text and one of Back-Cover Text may be + added by (or through arrangements made by) any one entity. If the Document already + includes a cover text for the same cover, previously added by you or by arrangement + made by the same entity you are acting on behalf of, you may not add another; but + you may replace the old one, on explicit permission from the previous publisher that + added the old one. + The author(s) and publisher(s) of the Document do not by this License give permission + to use their names for publicity for or to assert or imply endorsement of any Modified + Version. + 5. COMBINING DOCUMENTS + You may combine the Document with other documents released under this License, + under the terms defined in section 4 above for modified versions, provided that you + include in the combination all of the Invariant Sections of all of the original documents, + unmodified, and list them all as Invariant Sections of your combined work in its license + notice, and that you preserve all their Warranty Disclaimers. 
+ The combined work need only contain one copy of this License, and multiple identical + Invariant Sections may be replaced with a single copy. If there are multiple Invariant + Sections with the same name but different contents, make the title of each such section + unique by adding at the end of it, in parentheses, the name of the original author or + publisher of that section if known, or else a unique number. Make the same adjustment + to the section titles in the list of Invariant Sections in the license notice of the combined + work. + In the combination, you must combine any sections Entitled “History” in the various + original documents, forming one section Entitled “History”; likewise combine any + sections Entitled “Acknowledgements”, and any sections Entitled “Dedications”. You + must delete all sections Entitled “Endorsements.” + 6. COLLECTIONS OF DOCUMENTS + You may make a collection consisting of the Document and other documents released + under this License, and replace the individual copies of this License in the various + documents with a single copy that is included in the collection, provided that you + follow the rules of this License for verbatim copying of each of the documents in all + other respects. + You may extract a single document from such a collection, and distribute it individually + under this License, provided you insert a copy of this License into the extracted + document, and follow this License in all other respects regarding verbatim copying of + that document. + 7. AGGREGATION WITH INDEPENDENT WORKS + A compilation of the Document or its derivatives with other separate and independent + documents or works, in or on a volume of a storage or distribution medium, is called + an “aggregate” if the copyright resulting from the compilation is not used to limit the + legal rights of the compilation’s users beyond what the individual works permit. 
When + the Document is included in an aggregate, this License does not apply to the other + works in the aggregate which are not themselves derivative works of the Document. + If the Cover Text requirement of section 3 is applicable to these copies of the Document, + then if the Document is less than one half of the entire aggregate, the Document’s Cover + Texts may be placed on covers that bracket the Document within the aggregate, or the + electronic equivalent of covers if the Document is in electronic form. Otherwise they + must appear on printed covers that bracket the whole aggregate. + 8. TRANSLATION + Translation is considered a kind of modification, so you may distribute translations + of the Document under the terms of section 4. Replacing Invariant Sections with + translations requires special permission from their copyright holders, but you may + include translations of some or all Invariant Sections in addition to the original versions + of these Invariant Sections. You may include a translation of this License, and all the + license notices in the Document, and any Warranty Disclaimers, provided that you + also include the original English version of this License and the original versions of + those notices and disclaimers. In case of a disagreement between the translation and + the original version of this License or a notice or disclaimer, the original version will + prevail. + If a section in the Document is Entitled “Acknowledgements”, “Dedications”, or “History”, + the requirement (section 4) to Preserve its Title (section 1) will typically require + changing the actual title. + 9. TERMINATION + You may not copy, modify, sublicense, or distribute the Document except as expressly + provided for under this License. Any other attempt to copy, modify, sublicense or + distribute the Document is void, and will automatically terminate your rights under + this License. 
However, parties who have received copies, or rights, from you under this + License will not have their licenses terminated so long as such parties remain in full + compliance. + 10. FUTURE REVISIONS OF THIS LICENSE + The Free Software Foundation may publish new, revised versions of the GNU Free + Documentation License from time to time. Such new versions will be similar in spirit + to the present version, but may differ in detail to address new problems or concerns. + See http://www.gnu.org/copyleft/. + Each version of the License is given a distinguishing version number. If the Document + specifies that a particular numbered version of this License “or any later version” + applies to it, you have the option of following the terms and conditions either of that + specified version or of any later version that has been published (not as a draft) by + the Free Software Foundation. If the Document does not specify a version number of + this License, you may choose any version ever published (not as a draft) by the Free + Software Foundation. diff --git a/dist/files/README b/dist/files/README new file mode 100644 index 0000000..60403f8 --- /dev/null +++ b/dist/files/README @@ -0,0 +1,51 @@ +General information +********************* + +UAM Text Tools (UTT) is a package of language processing tools +developed at Adam Mickiewicz University. Its functionality includes: +* tokenization +* dictionary-based morphological analysis +* heuristic morphological analysis of unknown words +* spelling correction +* pattern search +* sentence splitting +* generation of concordance tables + +The toolkit is destined for processing of raw (not annotated) +unrestricted text for any conceivable purpose. 
+ + +Installation +************** + +1) unpack the UTT tar archive +2) in the same directory, unpack the tar archives of all UTT dictionary modules you have +3) run + make install + in the root directory of the installation +4) add the bin directory to the PATH variable + + +Requirements +************* + +* File::HomeDir + + the Perl package File::HomeDir must be installed + (to install the package, run 'perl -MCPAN -e shell' and write + 'install File::HomeDir' after the 'cpan>' prompt appears) + +* flex + + to run the ser component, flex must be installed in your system + +* ruby + + to run the tre component, ruby must be installed in your system + +* locale pl_PL.iso-8859-2 + + the locales pl_PL.iso-8859-2 (pl_PL in short) must be installed + and set while using UTT with the Polish module. The text you + process with UTT must be encoded in iso-8859-2. + diff --git a/dist/spec/Makefile b/dist/spec/Makefile new file mode 100644 index 0000000..8a4640e --- /dev/null +++ b/dist/spec/Makefile @@ -0,0 +1,26 @@ +# this makefile will build rpm + +DIR=$(shell pwd) + +ifndef UTT_DIST_DIR + UTT_DIST_DIR=${DIR} +endif + +_RPM_FROOT=${DIR}/rmp_root +_UTT_ROOT=${_RPM_FROOT}/usr/local +# default task: drop any stale staging root, stage the distribution under it, run rpmbuild from inside it, then remove it +.PHONY: rpm +rpm: + if test -d ${_RPM_FROOT}; then rm -fr ${_RPM_FROOT}; fi + mkdir -p ${_UTT_ROOT} + cp -rf ${UTT_DIST_DIR}/* ${_UTT_ROOT}/ + mkdir -p ${_UTT_ROOT}/cnf + mv ${_UTT_ROOT}/etc/utt/*.conf ${_UTT_ROOT}/cnf/ + ${DIR}/../common/prepare_conf.sh ${_UTT_ROOT}/cnf ${_UTT_ROOT}/etc/utt \\\/usr\\\/local + rm -rf ${_UTT_ROOT}/cnf + + cp ../files/* ${_UTT_ROOT}/ + cd ${_RPM_FROOT} && rpmbuild -bb ${DIR}/utt.spec + + rm -rf ${_RPM_FROOT} + diff --git a/dist/spec/README b/dist/spec/README new file mode 100644 index 0000000..b231c20 --- /dev/null +++ b/dist/spec/README @@ -0,0 +1,16 @@ +This directory contains files necessary to produce rpm package. + +First, you must have variable UTT_DIST_DIR defined properly. +This variable should be defined by main Makefile. 
+ +To create the rpm file, just run: +make + +The created package should appear in the default RPM directory. +(on my computer it is the /usr/src/redhat/RPMS/$arch/ directory) + +To determine the rpm output directory, execute: +rpm --showrc | grep _rpmdir + +You need write access to this directory to create the rpm. + diff --git a/dist/spec/utt.spec b/dist/spec/utt.spec new file mode 100644 index 0000000..fe68039 --- /dev/null +++ b/dist/spec/utt.spec @@ -0,0 +1,81 @@ +# +# Default RPM header. +# +# START_RPM_STD_HEADER: + + +# +# RPM properties +# +%define _this_product UAM Text Tools +%define _this_summary Some tools for text processing +%define _this_name utt +%define _this_version %(cat ../../common/version.def) +%define _this_release %(cat ../../common/release.def) +%define _this_copyright Adam Mickiewicz University, Poland + +# +# Default RPM header. +# +# END_RPM_STD_HEADER: +# -------------------------------------------------------------------- + +Summary: %_this_summary +Name: %_this_name +Version: %_this_version +Release: %_this_release +#Copyright: %_this_copyright +License: GPL +Group: Development/Tools +URL: http://utt.amu.edu.pl +Vendor: Adam Mickiewicz University +BuildRoot: %(pwd) +#BuildArch: i586 +# requirements for utt application +#AutoReq: no +#AutoReqProv: no + +#Requires: glibc >= 2.1.3 +#Requires: libgcc1 >= 3.0 +#Requires: libgcc >= 3.0 +#Requires: libstdc++6 >= 3.4.1 +#Requires: libstdc++ >= 3.4.1 + +%description +%(cat ../../common/description.def) + +%description -l pl +%(cat ../../common/description.pl.def) + + +# ------------------------------------------------------------- +# preparing sources for compilation +%prep + +# source compilation +%build + +# rpm building +%install + +# cleaning after rpm build +%clean + +# ------------------------------------------------------------- +#before installation +%pre + +#after installation +%post +# we need to create utt.conf file + +#before uninstallation +%preun + +#after uninstallation +%postun + +# 
------------------------------------------------------------- +%files +%defattr(-,root,root) +/* diff --git a/dist/struktura.txt b/dist/struktura.txt new file mode 100644 index 0000000..83aba85 --- /dev/null +++ b/dist/struktura.txt @@ -0,0 +1,74 @@ +/usr/local/bin/aut2fsa +/usr/local/bin/canonize +/usr/local/bin/compiledic +/usr/local/bin/con +/usr/local/bin/cor +/usr/local/bin/dgc +/usr/local/bin/dgp +/usr/local/bin/fla +/usr/local/bin/fsm2aut +/usr/local/bin/go ? +/usr/local/bin/Makefile.go ? +/usr/local/bin/gph +/usr/local/bin/grp +/usr/local/bin/gue +/usr/local/bin/kot +/usr/local/bin/lem +/usr/local/bin/mar +/usr/local/bin/sen +/usr/local/bin/sen-nl +/usr/local/bin/ser +/usr/local/bin/tok +/usr/local/bin/tre.rb +/usr/local/bin/unfla +/usr/local/bin/ipi.tag2re +/usr/local/bin/uam.tag2re + + +# R.D. sugeruje /etc/utt +# lokalnie: ~/.utt lub (trendy) ~/.config/utt +/usr/local/etc/utt/con.conf +/usr/local/etc/utt/cor.conf +/usr/local/etc/utt/dgc.conf +/usr/local/etc/utt/fla.conf +/usr/local/etc/utt/grp.conf +/usr/local/etc/utt/gue.conf +/usr/local/etc/utt/kor.conf +/usr/local/etc/utt/kot.conf +/usr/local/etc/utt/lem.conf +/usr/local/etc/utt/mar.conf +/usr/local/etc/utt/sen.conf +/usr/local/etc/utt/ser.conf +/usr/local/etc/utt/tok.conf +/usr/local/etc/utt/unfla.conf +/usr/local/etc/utt/utt.conf + +/usr/local/share/utt/weights.kor +/usr/local/share/utt/cats.dgc +/usr/local/share/utt/gram.dgc +# lokalnie: wszystkie pliki beda umieszczone w ~/.local/share/utt/ + +/usr/local/share/utt/pl_PL.ISO-8859-2/pl_PL.ISO-8859-2.sym +/usr/local/share/utt/pl_PL.ISO-8859-2/cor.bin +/usr/local/share/utt/pl_PL.ISO-8859-2/gue.bin +/usr/local/share/utt/pl_PL.ISO-8859-2/lem.bin +/usr/local/share/utt/pl_PL.UTF-8/pl_PL.UTF-8.sym +/usr/local/share/utt/pl_PL.UTF-8/cor.bin +/usr/local/share/utt/pl_PL.UTF-8/gue.bin +/usr/local/share/utt/pl_PL.UTF-8/lem.bin +# lokalnie: wszystkie slowniki beda umieszczone w ~/.local/share/utt/ + +/usr/local/lib/utt/ser.l.template +/usr/local/lib/utt/terms.m4 
+/usr/local/lib/utt/seg.rb +/usr/local/lib/attr.pm +# lokalnie: wszystkie pliki beda umieszczone w ~/.local/lib/utt/ + +/usr/local/share/doc/utt/FAQ +/usr/local/share/doc/utt/COPYRIGHT +/usr/local/share/doc/utt/NEWS +/usr/local/share/doc/utt/README +/usr/local/share/info/utt.info.gz +/usr/local/share/man/man3/utt.gz + +~/.utt/*.conf (wszystko z /usr/local/etc/utt) diff --git a/dist/tarball/INSTALL b/dist/tarball/INSTALL new file mode 100644 index 0000000..c891845 --- /dev/null +++ b/dist/tarball/INSTALL @@ -0,0 +1,5 @@ +Here you can find some information about how to install utt. + +You should just unpack archive and then execute +make test +make install diff --git a/dist/tarball/Makefile b/dist/tarball/Makefile new file mode 100644 index 0000000..bb83369 --- /dev/null +++ b/dist/tarball/Makefile @@ -0,0 +1,48 @@ +# This makefile builds the tarball distribution of utt. + +# +# Some variables +# + +DIR=$(shell pwd) + +# Directory with utt binaries +ifndef UTT_DIST_DIR + UTT_DIST_DIR=${DIR} +endif + +# Where to put the result +ifndef UTT_DIST_OUTPUT + UTT_DIST_OUTPUT=${DIR} +endif + +# Common info about version and release +_UTT_VER=$(shell cat ../common/version.def) +_UTT_REL=$(shell cat ../common/release.def) + +# Temp vars +_TARBALL_ROOT=$(DIR)/utt-$(_UTT_VER).$(_UTT_REL) +_TAR_FILE_NAME=utt.$(_UTT_VER)_$(_UTT_REL).tar.gz + +# default task: stage the files, pack only the staged directory, then remove it +.PHONY: default +default: + @echo Build directory: ${UTT_DIST_DIR} + @echo Output directory for tarball: ${UTT_DIST_OUTPUT} + mkdir -p ${_TARBALL_ROOT} + cp -fr ${UTT_DIST_DIR}/* ${_TARBALL_ROOT} + @# we add some extra files + @# config files + mkdir -p ${_TARBALL_ROOT}/cnf + mv ${_TARBALL_ROOT}/etc/utt/*.conf ${_TARBALL_ROOT}/cnf/ + ${DIR}/../common/prepare_conf.sh ${_TARBALL_ROOT}/cnf ${_TARBALL_ROOT}/etc/utt \~\\\/.local + rm -rf ${_TARBALL_ROOT}/cnf + cp ./INSTALL ${_TARBALL_ROOT}/ + cp ./Makefile.tarball ${_TARBALL_ROOT}/Makefile + cp ../files/* ${_TARBALL_ROOT}/ + + + tar -czf ${UTT_DIST_OUTPUT}/${_TAR_FILE_NAME} $(notdir ${_TARBALL_ROOT}) + + rm 
-rf ${_TARBALL_ROOT} + diff --git a/dist/tarball/Makefile.tarball b/dist/tarball/Makefile.tarball new file mode 100644 index 0000000..c021ac9 --- /dev/null +++ b/dist/tarball/Makefile.tarball @@ -0,0 +1,18 @@ +UTT_LIB_DIR=${HOME}/.local/lib/utt +UTT_SHARE_DIR=${HOME}/.local/share/utt +UTT_CONF_DIR=${HOME}/.utt + +.PHONY: install +install: + mkdir -p ${UTT_LIB_DIR} + cp -r lib/utt/* ${UTT_LIB_DIR} + mkdir -p ${UTT_SHARE_DIR} + cp -r share/utt/* ${UTT_SHARE_DIR} + mkdir -p ${UTT_CONF_DIR} + cp -r etc/utt/* ${UTT_CONF_DIR} + +.PHONY: uninstall +uninstall: + rm -r ${UTT_LIB_DIR} + rm -r ${UTT_SHARE_DIR} + rm -r ${UTT_CONF_DIR} diff --git a/dist/tarball/README b/dist/tarball/README new file mode 100644 index 0000000..16e05e2 --- /dev/null +++ b/dist/tarball/README @@ -0,0 +1,6 @@ +This directory contains Makefile, which allows to create tar.gz archive. + +To create archive, just write: +make + +Warning: you need define variable UTT_DIST_DIR. diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..c6edf2b --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,45 @@ +include ../config.mak + +all: utt.info utt.dvi utt.html utt.pdf utt.ps + +utt.info: utt.texinfo + $(MAKEINFO) $< -o $@ + +utt.dvi: utt.texinfo + $(TEXI2DVI) --build=clean $< -o $@ + +utt.html: utt.texinfo + $(MAKEINFO) --html --no-split $< -o $@ + +utt.pdf: utt.texinfo + $(TEXI2PDF) --build=clean $< -o $@ + +utt.ps: utt.dvi + $(DVIPS) $< -o $@ + +.PHONY: install +install: +ifdef SHARE_DIR + install -m 0644 utt.info $(SHARE_DIR)/info +endif +ifdef DOC_DIR + install -m 0644 utt.dvi $(DOC_DIR) + install -m 0644 utt.html $(DOC_DIR) + install -m 0644 utt.pdf $(DOC_DIR) + install -m 0644 utt.ps $(DOC_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef SHARE_DIR + rm $(SHARE_DIR)/info/utt.info +endif +ifdef DOC_DIR + rm $(DOC_DIR)/utt.dvi + rm $(DOC_DIR)/utt.html + rm $(DOC_DIR)/utt.pdf + rm $(DOC_DIR)/utt.ps +endif + +clean: + rm utt.info utt.dvi utt.html utt.pdf utt.ps || true diff --git a/lib/Makefile 
b/lib/Makefile new file mode 100644 index 0000000..ded7abf --- /dev/null +++ b/lib/Makefile @@ -0,0 +1,19 @@ +include ../config.mak + +.PHONY: install +install: +ifdef LIB_DIR + install -m 0755 attr.pm $(LIB_DIR) + install -m 0755 seg.rb $(LIB_DIR) + install -m 0755 ser.l.template $(LIB_DIR) + install -m 0755 terms.m4 $(LIB_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef LIB_DIR + rm $(LIB_DIR)/attr.pm + rm $(LIB_DIR)/seg.rb + rm $(LIB_DIR)/ser.l.template + rm $(LIB_DIR)/terms.m4 +endif diff --git a/lib/attr.pm b/lib/attr.pm new file mode 100644 index 0000000..44bb220 --- /dev/null +++ b/lib/attr.pm @@ -0,0 +1,133 @@ +package attr; + +use locale; +use strict; + +use Data::Dumper; + +our $pos_re = qr/(?:[[:upper:]]+)/; +our $attr_re = qr/(?:[[:upper:]]+)/; +our $val_re = qr/(?:[[:lower:][:digit:]+?!*-]|<[^>\n]+>)/; +our $av_re = qr/(?:$attr_re$val_re+)/; +our $avlist_re = qr/(?:$av_re+)/; +our $cat_re = qr/(?:$pos_re(?:\/$avlist_re)?)/; + +sub match(\@\@) +{ + my ($cat1,$avs1)= @{shift @_}; + my ($cat2,$avs2)= @{shift @_}; + + if($cat1 ne $cat2 && $cat1 ne '*' && $cat2 ne '*') + { + return 0; + } + else + { + ATTR:for my $attr (keys %$avs1) + { + if(exists $avs2->{$attr}) + { + for my $val (keys %{$avs1->{$attr}}) + { + next ATTR if $avs2->{$attr}->{$val}; + } + return 0; + last ATTR; + } + } + } + + return 1; +} + +sub agree(\@\@$) +{ + my $val1 = $_[0]->[1]->{$_[2]}; + my $val2 = $_[1]->[1]->{$_[2]}; + + return 1 if !$val1 || !$val2; + + for my $v (keys %$val1) + { + return 1 if exists $val2->{$v}; + } + return 0; +} + +# funkcja parse +# arg: deskrypcja +# warto: referencja do tablicy [, ], +# gdzie jest referencja do hasza, zawierajacego pary +# atrybut=>hasz wartoci (pary warto=>1), czyli np. 
+ +# [ +# 'ADJ', +# { +# 'KOLEDZY' => { +# '' => 1, +# '' => 1, +# '' => 1 +# }, +# 'C' => { +# 'p' => 1, +# 'a' => 1, +# 'i' => 1 +# }, +# 'N' => { +# 'p' => 1 +# } +# } +# ]; + +sub parse ($) +{ + my ($dstr)=@_; + my $avs={}; + my ($cat,$attrlist) = split '/', $dstr; + ATTR: +# while( $attrlist =~ /([[:upper:]]+)((?:[[:lower:][:digit:]+?!*-]|<[^>\n]+>)+)/g ) + while( $attrlist =~ /($attr_re)($val_re+)/g ) + { + my ($attrstr,$valstr)=($1,$2); + my %vals; + while($valstr =~ /$val_re/g) + { + my $val = $&; + next ATTR if $val eq '*'; + $val =~ s/^<([[:lower:]])>$/$1/; + $vals{$val}=1; + } + + $avs->{$attrstr} = \%vals; # safe: each iteration's `my %vals` is a fresh hash kept alive by this reference + } + [$cat, $avs]; +} + +# function unparse +# arg: same structure as the value returned by parse +# returns: the description as a string + +sub unparse (\@) +{ + my ($cat,$avs)= @{shift @_}; + my $dstr=$cat; + my @attrs = keys %$avs; + if(@attrs) + { + $dstr .= '/'; + for my $attr ( sort @attrs ) + { + $dstr .= $attr . (join '', sort keys %{$avs->{$attr}}); + } + } + $dstr; +} + + +sub canonize ($) +{ + unparse @{parse $_[0]} ; +} + + +1; diff --git a/lib/seg.rb b/lib/seg.rb new file mode 100644 index 0000000..9a72221 --- /dev/null +++ b/lib/seg.rb @@ -0,0 +1,31 @@ + +class Seg + + def initialize(s="") + @line=s + self + end + + def to_s + @line.chomp + end + + def set(s) + @line=s + self + end + + def field(key) + if key.is_a?(Integer) + @line.split[key-1] + elsif key.class==String + @line =~ /\s#{key}:(\S+)/; $1 + end + end + alias [] field + + def fields + @line.split + end + +end diff --git a/lib/ser.l.template b/lib/ser.l.template new file mode 100644 index 0000000..1c72081 --- /dev/null +++ b/lib/ser.l.template @@ -0,0 +1,30 @@ +%{ + #include + int n=0; +%} + +%% + +PATTERN { + int start, end, len; + char *lastseg, *tmp; + if(yytext[yyleng-1]!='\n') + {fprintf(stderr,"ser: pattern matches incomplete line\n"); exit(1);} + n++; + sscanf(yytext,"%d %d",&start,&len); + yytext[yyleng-1]='\0'; + if(tmp=strrchr(yytext,'\n')) + { + lastseg=tmp+1; + 
sscanf(lastseg,"%d %d", &end, &len); + } + else + end=start; + yytext[yyleng-1]='\n'; + printf("%04d 00 BOM * ser:%d\n",start,n); + ECHO; + printf("%04d 00 EOM * ser:%d\n",end+len,n); + } + + +.*\n DEFAULTACTION; diff --git a/lib/terms.m4 b/lib/terms.m4 new file mode 100644 index 0000000..d4ea143 --- /dev/null +++ b/lib/terms.m4 @@ -0,0 +1,52 @@ +divert(-1) +#-------------------------------------------------------------------------- + +# Macros defined here may be used in pattern specifications +# You can modify this file according to your needs. + +# ENDOFSEGMENT and MORFIELD are macros expanded to, respectively, +# end of segment marker (dependes on the format: flattened or not) +# and the name of the annotation field containing morphological +# information (standard value is 'lem'). These values are controlled +# by programs using this file to expand search patterns (ser, grp, ...). + +# seg(type,form,annotation) + +define(`seg',`(\s*((\d+\s+)(\d+\s+)?)?dnl +ifelse($1, `',`(\S+)', `($1)')\s+dnl +ifelse($2, `',`(\S+)', `($2)')dnl +ifelse($3, `',`((\s+\S+)*)', `(\s+($3))')\s*ENDOFSEGMENT)') + +# form(f) - segment containing the form f + +define(`form', `seg(,$1)') + +# field(f) segment containing auxiliary field f + +define(`field', `seg(,,`(\S+\s+)*($1)(\s+\S+)*')') + +# word, space, punct, number segments (assuming W, S, P, N segment types) + +define(`space', `seg(`S',`$1')') +define(`word', `seg(`W',`$1')') +define(`punct', `seg(`P',`$1')') +define(`number', `seg(`N',`$1')') + +# macros specific to PMDB format + +define(`lexeme', `field(`MORFIELD:(\S+;)?$1,\S+')') +define(`cat', `field(`MORFIELD:\S+,$1([,;]\S+)?')') + + +# Place here your macro definitions. 
+ + + + + + + + + +#-------------------------------------------------------------------------- +divert(0) \ No newline at end of file diff --git a/share/Makefile b/share/Makefile new file mode 100644 index 0000000..580dbfd --- /dev/null +++ b/share/Makefile @@ -0,0 +1,31 @@ +include ../config.mak + +.PHONY: install +install: +ifdef LANG_DIR + install -d $(LANG_DIR)/pl_PL.ISO-8859-2 + install -d $(LANG_DIR)/pl_PL.UTF-8 + install -m 0644 pl_PL.ISO-8859-2/cor.bin $(LANG_DIR)/pl_PL.ISO-8859-2 + install -m 0644 pl_PL.ISO-8859-2/gue.bin $(LANG_DIR)/pl_PL.ISO-8859-2 + install -m 0644 pl_PL.ISO-8859-2/lem.bin $(LANG_DIR)/pl_PL.ISO-8859-2 + install -m 0644 pl_PL.ISO-8859-2/pl_PL.ISO-8859-2.sym $(LANG_DIR)/pl_PL.ISO-8859-2 + install -m 0644 pl_PL.UTF-8/lem.bin $(LANG_DIR)/pl_PL.UTF-8 + install -m 0644 cats.dgc $(LANG_DIR) + install -m 0644 gram.dgc $(LANG_DIR) + install -m 0644 weights.kor $(LANG_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef LANG_DIR + rm $(LANG_DIR)/weights.kor + rm $(LANG_DIR)/gram.dgc + rm $(LANG_DIR)/cats.dgc + rm $(LANG_DIR)/pl_PL.UTF-8/lem.bin + rm $(LANG_DIR)/pl_PL.ISO-8859-2/pl_PL.ISO-8859-2.sym + rm $(LANG_DIR)/pl_PL.ISO-8859-2/lem.bin + rm $(LANG_DIR)/pl_PL.ISO-8859-2/gue.bin + rm $(LANG_DIR)/pl_PL.ISO-8859-2/cor.bin + rmdir $(LANG_DIR)/pl_PL.ISO-8859-2 + rmdir $(LANG_DIR)/pl_PL.UTF-8 +endif diff --git a/share/cats.dgc b/share/cats.dgc new file mode 100644 index 0000000..fb800be --- /dev/null +++ b/share/cats.dgc @@ -0,0 +1,696 @@ +ADJ/DcNpCaGp +ADJ/DcNpCd +ADJ/DcNpCgl +ADJ/DcNpCi +ADJ/DcNpCnavGaifn +ADJ/DcNpCnvGp +ADJ/DcNsCaGi +ADJ/DcNsCaGpa +ADJ/DcNsCaiGf +ADJ/DcNsCavGf +ADJ/DcNsCdGpain +ADJ/DcNsCgdlGf +ADJ/DcNsCgGpain +ADJ/DcNsCilGpain +ADJ/DcNsCnavGn +ADJ/DcNsCnvGpai +ADJ/DpNpCaGp +ADJ/DpNpCd +ADJ/DpNpCgl +ADJ/DpNpCi +ADJ/DpNpCnavGaifn +ADJ/DpNpCnvGp +ADJ/DpNsCaGi +ADJ/DpNsCaGpa +ADJ/DpNsCaiGf +ADJ/DpNsCavGf +ADJ/DpNsCdGpain +ADJ/DpNsCgdlGf +ADJ/DpNsCgGpain +ADJ/DpNsCilGpain +ADJ/DpNsCnavGn +ADJ/DpNsCnvGpai +ADJ/DsNpCaGp +ADJ/DsNpCd 
+ADJ/DsNpCgl +ADJ/DsNpCi +ADJ/DsNpCnavGaifn +ADJ/DsNpCnvGp +ADJ/DsNsCaGi +ADJ/DsNsCaGpa +ADJ/DsNsCaiGf +ADJ/DsNsCavGf +ADJ/DsNsCdGpain +ADJ/DsNsCgdlGf +ADJ/DsNsCgGpain +ADJ/DsNsCilGpain +ADJ/DsNsCnavGn +ADJ/DsNsCnvGpai +ADJNUM/NpCaGp +ADJNUM/NpCd +ADJNUM/NpCgl +ADJNUM/NpCi +ADJNUM/NpCnavGaifn +ADJNUM/NpCnvGp +ADJNUM/NsCaGi +ADJNUM/NsCaGpa +ADJNUM/NsCaiGf +ADJNUM/NsCavGf +ADJNUM/NsCdGpain +ADJNUM/NsCgdlGf +ADJNUM/NsCgGpain +ADJNUM/NsCilGpain +ADJNUM/NsCnavGn +ADJNUM/NsCnvGpai +ADJPAP/NpCaGp +ADJPAP/NpCd +ADJPAP/NpCgl +ADJPAP/NpCi +ADJPAP/NpCnavGaifn +ADJPAP/NpCnvGp +ADJPAP/NsCaGi +ADJPAP/NsCaGpa +ADJPAP/NsCaiGf +ADJPAP/NsCavGf +ADJPAP/NsCdGpain +ADJPAP/NsCgdlGf +ADJPAP/NsCgGpain +ADJPAP/NsCilGpain +ADJPAP/NsCnavGn +ADJPAP/NsCnvGpai +ADJPP/NpCaGp +ADJPP/NpCd +ADJPP/NpCgl +ADJPP/NpCi +ADJPP/NpCnavGaifn +ADJPP/NpCnvGp +ADJPP/NsCaGi +ADJPP/NsCaGpa +ADJPP/NsCaiGf +ADJPP/NsCavGf +ADJPP/NsCdGpain +ADJPP/NsCgdlGf +ADJPP/NsCgGpain +ADJPP/NsCilGpain +ADJPP/NsCnavGn +ADJPP/NsCnvGpai +ADJPRO/NpCaGp +ADJPRO/NpCd +ADJPRO/NpCgl +ADJPRO/NpCi +ADJPRO/NpCnavGaifn +ADJPRO/NpCnvGp +ADJPRO/NsCaGi +ADJPRO/NsCaGpa +ADJPRO/NsCaiGf +ADJPRO/NsCavGf +ADJPRO/NsCdGpain +ADJPRO/NsCgdlGf +ADJPRO/NsCgGpain +ADJPRO/NsCilGpain +ADJPRO/NsCnavGn +ADJPRO/NsCnvGpai +ADJPRO/ZdNpCaGp +ADJPRO/ZdNpCd +ADJPRO/ZdNpCgl +ADJPRO/ZdNpCi +ADJPRO/ZdNpCnavGaifn +ADJPRO/ZdNpCnvGp +ADJPRO/ZdNsCaGi +ADJPRO/ZdNsCaGpa +ADJPRO/ZdNsCaiGf +ADJPRO/ZdNsCavGf +ADJPRO/ZdNsCdGpain +ADJPRO/ZdNsCgdlGf +ADJPRO/ZdNsCgGpain +ADJPRO/ZdNsCilGpain +ADJPRO/ZdNsCnavGn +ADJPRO/ZdNsCnvGpai +ADJPRO/ZgNpCaGp +ADJPRO/ZgNpCd +ADJPRO/ZgNpCgl +ADJPRO/ZgNpCi +ADJPRO/ZgNpCnavGaifn +ADJPRO/ZgNpCnvGp +ADJPRO/ZgNsCaGi +ADJPRO/ZgNsCaGpa +ADJPRO/ZgNsCaiGf +ADJPRO/ZgNsCavGf +ADJPRO/ZgNsCdGpain +ADJPRO/ZgNsCgdlGf +ADJPRO/ZgNsCgGpain +ADJPRO/ZgNsCilGpain +ADJPRO/ZgNsCnavGn +ADJPRO/ZgNsCnvGpai +ADJPRO/ZiNpCaGp +ADJPRO/ZiNpCd +ADJPRO/ZiNpCgl +ADJPRO/ZiNpCi +ADJPRO/ZiNpCnavGaifn +ADJPRO/ZiNpCnvGp +ADJPRO/ZiNsCaGi +ADJPRO/ZiNsCaGpa +ADJPRO/ZiNsCaiGf 
+ADJPRO/ZiNsCavGf +ADJPRO/ZiNsCdGpain +ADJPRO/ZiNsCgdlGf +ADJPRO/ZiNsCgGpain +ADJPRO/ZiNsCilGpain +ADJPRO/ZiNsCnavGn +ADJPRO/ZiNsCnvGpai +ADJPRO/ZnNpCaGp +ADJPRO/ZnNpCd +ADJPRO/ZnNpCgl +ADJPRO/ZnNpCi +ADJPRO/ZnNpCnavGaifn +ADJPRO/ZnNpCnvGp +ADJPRO/ZnNsCaGi +ADJPRO/ZnNsCaGpa +ADJPRO/ZnNsCaiGf +ADJPRO/ZnNsCavGf +ADJPRO/ZnNsCdGpain +ADJPRO/ZnNsCgdlGf +ADJPRO/ZnNsCgGpain +ADJPRO/ZnNsCilGpain +ADJPRO/ZnNsCnavGn +ADJPRO/ZnNsCnvGpai +ADJPRO/ZqNpCaGp +ADJPRO/ZqNpCd +ADJPRO/ZqNpCgl +ADJPRO/ZqNpCi +ADJPRO/ZqNpCnavGaifn +ADJPRO/ZqNpCnvGp +ADJPRO/ZqNsCaGi +ADJPRO/ZqNsCaGpa +ADJPRO/ZqNsCaiGf +ADJPRO/ZqNsCavGf +ADJPRO/ZqNsCdGpain +ADJPRO/ZqNsCgdlGf +ADJPRO/ZqNsCgGpain +ADJPRO/ZqNsCilGpain +ADJPRO/ZqNsCnavGn +ADJPRO/ZqNsCnvGpai +ADJPRO/ZqrNpCaGp +ADJPRO/ZqrNpCd +ADJPRO/ZqrNpCgl +ADJPRO/ZqrNpCi +ADJPRO/ZqrNpCnavGaifn +ADJPRO/ZqrNpCnvGp +ADJPRO/ZqrNsCaGi +ADJPRO/ZqrNsCaGpa +ADJPRO/ZqrNsCaiGf +ADJPRO/ZqrNsCavGf +ADJPRO/ZqrNsCdGpain +ADJPRO/ZqrNsCgdlGf +ADJPRO/ZqrNsCgGpain +ADJPRO/ZqrNsCilGpain +ADJPRO/ZqrNsCnavGn +ADJPRO/ZqrNsCnvGpai +ADJPRO/ZsNpCaGp +ADJPRO/ZsNpCd +ADJPRO/ZsNpCgl +ADJPRO/ZsNpCi +ADJPRO/ZsNpCnavGaifn +ADJPRO/ZsNpCnvGp +ADJPRO/ZsNsCaGi +ADJPRO/ZsNsCaGpa +ADJPRO/ZsNsCaiGf +ADJPRO/ZsNsCavGf +ADJPRO/ZsNsCdGpain +ADJPRO/ZsNsCgdlGf +ADJPRO/ZsNsCgGpain +ADJPRO/ZsNsCilGpain +ADJPRO/ZsNsCnavGn +ADJPRO/ZsNsCnvGpai +ADJPRP/NpCaGp +ADJPRP/NpCd +ADJPRP/NpCgl +ADJPRP/NpCi +ADJPRP/NpCnavGaifn +ADJPRP/NpCnvGp +ADJPRP/NsCaGi +ADJPRP/NsCaGpa +ADJPRP/NsCaiGf +ADJPRP/NsCavGf +ADJPRP/NsCdGpain +ADJPRP/NsCgdlGf +ADJPRP/NsCgGpain +ADJPRP/NsCilGpain +ADJPRP/NsCnavGn +ADJPRP/NsCnvGpai +ADVANP +ADV/Dc +ADV/Dp +ADV/Ds +ADVNUM +ADVPRO +ADVPRO/Zd +ADVPRO/Zi +ADVPRO/Zn +ADVPRO/Zq +ADVPRO/Zqr +ADVPRO/Zr +ADVPRP +APP +BYC/Vb +BYC/VpMcNpP1Gaifn +BYC/VpMcNpP1Gp +BYC/VpMcNpP2Gaifn +BYC/VpMcNpP2Gp +BYC/VpMcNpP3Gaifn +BYC/VpMcNpP3Gp +BYC/VpMcNsP1Gf +BYC/VpMcNsP1Gpai +BYC/VpMcNsP2Gf +BYC/VpMcNsP2Gpai +BYC/VpMcNsP3Gf +BYC/VpMcNsP3Gn +BYC/VpMcNsP3Gpai +BYC/VpMdTaNpP1Gaifn +BYC/VpMdTaNpP1Gp 
+BYC/VpMdTaNpP2Gaifn +BYC/VpMdTaNpP2Gp +BYC/VpMdTaNpP3Gaifn +BYC/VpMdTaNpP3Gp +BYC/VpMdTaNsP1Gf +BYC/VpMdTaNsP1Gpai +BYC/VpMdTaNsP2Gf +BYC/VpMdTaNsP2Gpai +BYC/VpMdTaNsP3Gf +BYC/VpMdTaNsP3Gn +BYC/VpMdTaNsP3Gpai +BYC/VpMdTrfNpP1 +BYC/VpMdTrfNpP2 +BYC/VpMdTrfNpP3 +BYC/VpMdTrfNsP1 +BYC/VpMdTrfNsP2 +BYC/VpMdTrfNsP3 +BYC/VpMiNpP1 +BYC/VpMiNpP2 +BYC/VpMiNsP2 +CONJ +EXCL +N/GaNpCa +N/GaNpCd +N/GaNpCg +N/GaNpCi +N/GaNpCl +N/GaNpCn +N/GaNpCv +N/GaNsCa +N/GaNsCd +N/GaNsCg +N/GaNsCi +N/GaNsCl +N/GaNsCn +N/GaNsCv +N/GfNpCa +N/GfNpCd +N/GfNpCg +N/GfNpCi +N/GfNpCl +N/GfNpCn +N/GfNpCv +N/GfNsCa +N/GfNsCd +N/GfNsCg +N/GfNsCi +N/GfNsCl +N/GfNsCn +N/GfNsCv +N/GiNpCa +N/GiNpCd +N/GiNpCg +N/GiNpCi +N/GiNpCl +N/GiNpCn +N/GiNpCv +N/GiNsCa +N/GiNsCd +N/GiNsCg +N/GiNsCi +N/GiNsCl +N/GiNsCn +N/GiNsCv +N/GnNpCa +N/GnNpCd +N/GnNpCg +N/GnNpCi +N/GnNpCl +N/GnNpCn +N/GnNpCv +N/GnNsCa +N/GnNsCd +N/GnNsCg +N/GnNsCi +N/GnNsCl +N/GnNsCn +N/GnNsCv +N/G?NpCa +N/G*NpCa +N/G?NpCd +N/G*NpCd +N/G?NpCg +N/G*NpCg +N/G?NpCi +N/G*NpCi +N/G?NpCl +N/G*NpCl +N/G?NpCn +N/G*NpCn +N/G?NpCv +N/G*NpCv +N/G?NsCa +N/G?NsCd +N/G?NsCg +N/G?NsCi +N/G?NsCl +N/G?NsCn +N/G?NsCv +N/GpNpCa +N/GpNpCd +N/GpNpCg +N/GpNpCi +N/GpNpCl +N/GpNpCn +N/GpNpCv +N/GpNsCa +N/GpNsCd +N/GpNsCg +N/GpNsCi +N/GpNsCl +N/GpNsCn +N/GpNsCv +NPRO/ZdGnNsCa +NPRO/ZdGnNsCd +NPRO/ZdGnNsCg +NPRO/ZdGnNsCi +NPRO/ZdGnNsCl +NPRO/ZdGnNsCn +NPRO/ZgGnNsCa +NPRO/ZgGnNsCd +NPRO/ZgGnNsCg +NPRO/ZgGnNsCi +NPRO/ZgGnNsCl +NPRO/ZgGnNsCn +NPRO/ZgGpNpCa +NPRO/ZgGpNpCd +NPRO/ZgGpNpCg +NPRO/ZgGpNpCi +NPRO/ZgGpNpCl +NPRO/ZgGpNpCn +NPRO/ZiGnNsCa +NPRO/ZiGnNsCd +NPRO/ZiGnNsCg +NPRO/ZiGnNsCi +NPRO/ZiGnNsCl +NPRO/ZiGnNsCn +NPRO/ZiGpNsCa +NPRO/ZiGpNsCd +NPRO/ZiGpNsCg +NPRO/ZiGpNsCi +NPRO/ZiGpNsCl +NPRO/ZiGpNsCn +NPRO/ZnGnNsCa +NPRO/ZnGnNsCd +NPRO/ZnGnNsCg +NPRO/ZnGnNsCi +NPRO/ZnGnNsCl +NPRO/ZnGnNsCn +NPRO/ZnGpNsCa +NPRO/ZnGpNsCd +NPRO/ZnGpNsCg +NPRO/ZnGpNsCi +NPRO/ZnGpNsCl +NPRO/ZnGpNsCn +NPRO/ZpGaifnNpCa +NPRO/ZpGaifnNpCd +NPRO/ZpGaifnNpCg +NPRO/ZpGaifnNpCi +NPRO/ZpGaifnNpCl 
+NPRO/ZpGaifnNpCn +NPRO/ZpGfNsCa +NPRO/ZpGfNsCd +NPRO/ZpGfNsCg +NPRO/ZpGfNsCi +NPRO/ZpGfNsCl +NPRO/ZpGfNsCn +NPRO/ZpGnNsCa +NPRO/ZpGnNsCd +NPRO/ZpGnNsCg +NPRO/ZpGnNsCi +NPRO/ZpGnNsCl +NPRO/ZpGnNsCn +NPRO/ZpG*NpCa +NPRO/ZpG*NpCd +NPRO/ZpG*NpCg +NPRO/ZpG*NpCi +NPRO/ZpG*NpCl +NPRO/ZpG*NpCn +NPRO/ZpG*NsCa +NPRO/ZpG*NsCd +NPRO/ZpG*NsCg +NPRO/ZpG*NsCi +NPRO/ZpG*NsCl +NPRO/ZpG*NsCn +NPRO/ZpGpaiNsCa +NPRO/ZpGpaiNsCd +NPRO/ZpGpaiNsCg +NPRO/ZpGpaiNsCi +NPRO/ZpGpaiNsCl +NPRO/ZpGpaiNsCn +NPRO/ZpGpNpCa +NPRO/ZpGpNpCd +NPRO/ZpGpNpCg +NPRO/ZpGpNpCi +NPRO/ZpGpNpCl +NPRO/ZpGpNpCn +NPRO/ZqGnNsCa +NPRO/ZqGnNsCd +NPRO/ZqGnNsCg +NPRO/ZqGnNsCi +NPRO/ZqGnNsCl +NPRO/ZqGnNsCn +NPRO/ZqGpNsCa +NPRO/ZqGpNsCd +NPRO/ZqGpNsCg +NPRO/ZqGpNsCi +NPRO/ZqGpNsCl +NPRO/ZqGpNsCn +NPRO/ZqrGnNsCa +NPRO/ZqrGnNsCd +NPRO/ZqrGnNsCg +NPRO/ZqrGnNsCi +NPRO/ZqrGnNsCl +NPRO/ZqrGnNsCn +NPRO/ZqrGpNsCa +NPRO/ZqrGpNsCd +NPRO/ZqrGpNsCg +NPRO/ZqrGpNsCi +NPRO/ZqrGpNsCl +NPRO/ZqrGpNsCn +NPRO/ZxG*N*Ca +NPRO/ZxG*N*Cd +NPRO/ZxG*N*Cg +NPRO/ZxG*N*Ci +NPRO/ZxG*N*Cl +NUMCOL/Ca +NUMCOL/Cd +NUMCOL/Cg +NUMCOL/Ci +NUMCOL/Cl +NUMCOL/Cn +NUMCRD/Ca +NUMCRD/CaGaifn +NUMCRD/CaGain +NUMCRD/CaGf +NUMCRD/CaGp +NUMCRD/Cd +NUMCRD/Cg +NUMCRD/Ci +NUMCRD/CiGf +NUMCRD/CiGpain +NUMCRD/Cl +NUMCRD/Cn +NUMCRD/CnGaifn +NUMCRD/CnGain +NUMCRD/CnGf +NUMCRD/CnGp +NUMCRD/ZiCaGaifn +NUMCRD/ZiCaGain +NUMCRD/ZiCaGf +NUMCRD/ZiCaGp +NUMCRD/ZiCd +NUMCRD/ZiCg +NUMCRD/ZiCi +NUMCRD/ZiCiGf +NUMCRD/ZiCiGpain +NUMCRD/ZiCl +NUMCRD/ZiCnGaifn +NUMCRD/ZiCnGain +NUMCRD/ZiCnGf +NUMCRD/ZiCnGp +NUMCRD/ZqiCaGaifn +NUMCRD/ZqiCaGp +NUMCRD/ZqiCd +NUMCRD/ZqiCg +NUMCRD/ZqiCi +NUMCRD/ZqiCl +NUMCRD/ZqiCnGaifn +NUMCRD/ZqiCnGp +NUMORD/NpCaGp +NUMORD/NpCd +NUMORD/NpCgl +NUMORD/NpCi +NUMORD/NpCnavGaifn +NUMORD/NpCnvGp +NUMORD/NsCaGi +NUMORD/NsCaGpa +NUMORD/NsCaiGf +NUMORD/NsCavGf +NUMORD/NsCdGpain +NUMORD/NsCgdlGf +NUMORD/NsCgGpain +NUMORD/NsCilGpain +NUMORD/NsCnavGn +NUMORD/NsCnvGpai +NUMPAR +NUMPAR/Dc +NUMPAR/Dp +NUMPAR/Ds +NUMPAR/Ns +NUMPAR/NsGf +NUMPAR/NsGpain +NV/Ca +NV/Cd +NV/Cg 
+NV/Ci +NV/Cl +NV/Cn +ONO +P +PART +P/Ca +P/Cai +P/Cal +P/Cd +P/Cg +P/Cga +P/Cgai +P/Cgd +P/Cgi +P/Ci +P/Cl +PPRO/Zp +V/AiVb +V/AiViTa +V/AiVpMcNpP1Gaifn +V/AiVpMcNpP1Gp +V/AiVpMcNpP2Gaifn +V/AiVpMcNpP2Gp +V/AiVpMcNpP3Gaifn +V/AiVpMcNpP3Gp +V/AiVpMcNsP1Gf +V/AiVpMcNsP1Gpai +V/AiVpMcNsP2Gf +V/AiVpMcNsP2Gpai +V/AiVpMcNsP3Gf +V/AiVpMcNsP3Gn +V/AiVpMcNsP3Gpai +V/AiVpMdTaNpP1Gaifn +V/AiVpMdTaNpP1Gp +V/AiVpMdTaNpP2Gaifn +V/AiVpMdTaNpP2Gp +V/AiVpMdTaNpP3Gaifn +V/AiVpMdTaNpP3Gp +V/AiVpMdTaNsP1Gf +V/AiVpMdTaNsP1Gpai +V/AiVpMdTaNsP2Gf +V/AiVpMdTaNsP2Gpai +V/AiVpMdTaNsP3Gf +V/AiVpMdTaNsP3Gn +V/AiVpMdTaNsP3Gpai +V/AiVpMdTrfNpP1 +V/AiVpMdTrfNpP2 +V/AiVpMdTrfNpP3 +V/AiVpMdTrfNsP1 +V/AiVpMdTrfNsP2 +V/AiVpMdTrfNsP3 +V/AiVpMiNpP1 +V/AiVpMiNpP2 +V/AiVpMiNsP2 +V/ApVb +V/ApViTa +V/ApVpMcNpP1Gaifn +V/ApVpMcNpP1Gp +V/ApVpMcNpP2Gaifn +V/ApVpMcNpP2Gp +V/ApVpMcNpP3Gaifn +V/ApVpMcNpP3Gp +V/ApVpMcNsP1Gf +V/ApVpMcNsP1Gpai +V/ApVpMcNsP2Gf +V/ApVpMcNsP2Gpai +V/ApVpMcNsP3Gf +V/ApVpMcNsP3Gn +V/ApVpMcNsP3Gpai +V/ApVpMdTaNpP1Gaifn +V/ApVpMdTaNpP1Gp +V/ApVpMdTaNpP2Gaifn +V/ApVpMdTaNpP2Gp +V/ApVpMdTaNpP3Gaifn +V/ApVpMdTaNpP3Gp +V/ApVpMdTaNsP1Gf +V/ApVpMdTaNsP1Gpai +V/ApVpMdTaNsP2Gf +V/ApVpMdTaNsP2Gpai +V/ApVpMdTaNsP3Gf +V/ApVpMdTaNsP3Gn +V/ApVpMdTaNsP3Gpai +V/ApVpMdTrfNpP1 +V/ApVpMdTrfNpP2 +V/ApVpMdTrfNpP3 +V/ApVpMdTrfNsP1 +V/ApVpMdTrfNsP2 +V/ApVpMdTrfNsP3 +V/ApVpMiNpP1 +V/ApVpMiNpP2 +V/ApVpMiNsP2 +V/GiVb +V/GiViTa +V/GiVpMcNpP1Gaifn +V/GiVpMcNpP1Gp +V/GiVpMcNpP2Gaifn +V/GiVpMcNpP2Gp +V/GiVpMcNpP3Gaifn +V/GiVpMcNpP3Gp +V/GiVpMcNsP1Gf +V/GiVpMcNsP1Gpai +V/GiVpMcNsP2Gf +V/GiVpMcNsP2Gpai +V/GiVpMcNsP3Gf +V/GiVpMcNsP3Gn +V/GiVpMcNsP3Gpai +V/GiVpMdTaNpP1Gaifn +V/GiVpMdTaNpP1Gp +V/GiVpMdTaNpP2Gaifn +V/GiVpMdTaNpP2Gp +V/GiVpMdTaNpP3Gaifn +V/GiVpMdTaNpP3Gp +V/GiVpMdTaNsP1Gf +V/GiVpMdTaNsP1Gpai +V/GiVpMdTaNsP2Gf +V/GiVpMdTaNsP2Gpai +V/GiVpMdTaNsP3Gf +V/GiVpMdTaNsP3Gn +V/GiVpMdTaNsP3Gpai +V/GiVpMdTrfNpP1 +V/GiVpMdTrfNpP2 +V/GiVpMdTrfNpP3 +V/GiVpMdTrfNsP1 +V/GiVpMdTrfNsP2 +V/GiVpMdTrfNsP3 +V/GiVpMiNpP1 
+V/GiVpMiNpP2 +V/GiVpMiNsP2 +VNI diff --git a/share/gram.dgc b/share/gram.dgc new file mode 100644 index 0000000..10134bc --- /dev/null +++ b/share/gram.dgc @@ -0,0 +1,124 @@ +#FLAG REL + +#UP REL + +#ORDER * pcmpl +#ORDER .. +#ORDER subj .. * .. cmpl +#ORDER refl .. * +#ORDER * refl + +#CONSTR cmpl_g => ~cmpl_a +#CONSTR cmpl_inf => ~(cmpl_g|cmpl_d|cmpl_a|cmpl_p|cmpl_ze|cmpl_s) +#CONSTR subj_pred => subj +#constr cmpl_pred => cmpl + + + +#subj +ROLE subj # deklaracja roli (typ zaleznosci) podmiot +AGR subj N # zgodnosc podrzednika z nadrzednikiem co do liczby +AGR subj G # zgodnosc podrzednika z nadrzednikiem co do rodzaju +GOV subj */Cn # wymaganie by podrzednik byl w mianowniku + +# pary kategorii, jakie mozna polaczyc zaleznoscia typu podmiot + +# nadrzednik podrzednik +LINK V/VpP3,BYC/VpP3 N,NPRO subj + +# (przecinek znaczy lub) + +ROLE cmpl_ga # dopelnienie w bierniku/dopelniaczu +ROLE cmpl_d # w celowniku +ROLE cmpl_i # w narzedniku +ROLE cmpl_inf # w bezokoliczniku +ROLE cmpl_s # bedace zdaniem +ROLE cmpl_ze # bedace zdaniem poprzedzonym 'ze' +ROLE aux # +ROLE mod # modyfikator (okolicznik/przydawka) (niewymagane określenie) (biały kot) +ROLE prep # modyfikator w postaci frazy przyimkowej +ROLE pcmpl # dopełnienie przyimka (wymagany rzeczownik) +ROLE ccmpl # dopełnienie spójnika (wymagany drugi człon konstrukcji spójnikowej) +ROLE poss # np. książka Marii, ojciec kolegi +ROLE restr # (bardzo <- duży) +ROLE part # partykuła +ROLE coord # koordynacja (powiązanie pierwszego członu konstrukcji współrzędnej + # ze spójnikiem współrzędnym centralnym ( Oto [pies -> i] kot. 
) + + +AGR aux N +AGR aux G + +AGR mod N +AGR mod C +AGR mod G + +AGR pcmpl C + +GOV cmpl_ga */Cga +GOV cmpl_d */Cd +GOV cmpl_i */Ci +GOV poss */Cg + +SGL subj +SGL cmpl_ga +SGL cmpl_d +SGL cmpl_i +SGL cmpl_inf +SGL aux +SGL pcmpl +SGL ccmpl +SGL poss +SGL restr + +REQ P pcmpl +REQ CONJ ccmpl + +RIGHT pcmpl +RIGHT ccmpl +RIGHT cmpl_ze +RIGHT poss + + +#cmpl_* +LINK V,ADVPRP,ADVANP,ADJPRP,ADJPAP,NV N,NPRO cmpl_ga +LINK V,ADVPRP,ADVANP,ADJPRP,ADJPAP,NV N,NPRO cmpl_d +LINK V,ADVPRP,ADVANP,ADJPRP,ADJPAP,NV N,NPRO cmpl_i +LINK V,ADVPRP,ADVANP,ADJPRP,ADJPAP,NV V/Vb cmpl_inf +LINK V,ADVPRP,ADVANP,ADJPRP,ADJPAP,NV CONJ cmpl_ze + +#aux +#czas przyszly analityczny +LINK BYC/VpMdTf V/AiVpP3,V/AiVb aux +#czas zaprzeszly(?) +LINK BYC/VpMc V/VpP3 aux +#BYC jako lacznik w (jest bialy, jest zaszlachtowany, jest pilotem) +LINK BYC ADJPAP/Cn,ADJ/Cn,N/Ci aux + +#mod + +LINK V ADV,ADVPRP,ADVANP,ADVPRO mod +LINK N,NV ADJ,ADJPAP,ADJPRP,ADJPRO mod + + +#prep +LINK N,V P prep + +#pcmpl +LINK P N,NV pcmpl + + +#poss +LINK N N,NV,NPRO poss + + +#ccmpl +LINK CONJ V/Vp ccmpl + + +#restr +LINK ADJ ADV restr + + +#part +LINK V PART part diff --git a/share/pl_PL.ISO-8859-2/cor.bin b/share/pl_PL.ISO-8859-2/cor.bin new file mode 100644 index 0000000..6b004d1 Binary files /dev/null and b/share/pl_PL.ISO-8859-2/cor.bin differ diff --git a/share/pl_PL.ISO-8859-2/gue.bin b/share/pl_PL.ISO-8859-2/gue.bin new file mode 100644 index 0000000..5e46485 Binary files /dev/null and b/share/pl_PL.ISO-8859-2/gue.bin differ diff --git a/share/pl_PL.ISO-8859-2/lem.bin b/share/pl_PL.ISO-8859-2/lem.bin new file mode 100644 index 0000000..ebfd176 Binary files /dev/null and b/share/pl_PL.ISO-8859-2/lem.bin differ diff --git a/share/pl_PL.ISO-8859-2/pl_PL.ISO-8859-2.sym b/share/pl_PL.ISO-8859-2/pl_PL.ISO-8859-2.sym new file mode 100644 index 0000000..81e7b98 --- /dev/null +++ b/share/pl_PL.ISO-8859-2/pl_PL.ISO-8859-2.sym @@ -0,0 +1,8 @@ +lcase a b c d e f g h i j k l m n o +lcase p q r s t u v w x y z +ucase A B C D E 
F G H I J K L M N O +ucase P Q R S T U V W X Y Z +letter lcase ucase +digit 0 1 2 3 4 5 6 7 8 9 +punct , . @ / ' ~ ; _ - + ? \ +all letter digit signs sem diff --git a/share/pl_PL.UTF-8/lem.bin b/share/pl_PL.UTF-8/lem.bin new file mode 100644 index 0000000..ce44152 Binary files /dev/null and b/share/pl_PL.UTF-8/lem.bin differ diff --git a/share/weights.kor b/share/weights.kor new file mode 100644 index 0000000..5e966fa --- /dev/null +++ b/share/weights.kor @@ -0,0 +1,21 @@ +%stdcor 1 +%xchg 1 + rz 0.5 +ch h 0.5 +u 0.5 +u o 0.75 +om 0.5 +om a 0.75 +en 0.5 +en 0.75 +a 0.25 +c 0.25 +e 0.25 +l 0.25 +n 0.25 +o 0.25 +s 0.25 +z 0.25 +z 0.25 +x 0.30 + diff --git a/src/common/Makefile b/src/common/Makefile new file mode 100644 index 0000000..cd4bb61 --- /dev/null +++ b/src/common/Makefile @@ -0,0 +1,8 @@ +# main: cmdline.c main_template.cc +# g++ -o main cmdline.c common.cc main_template.cc + +# cmdline.c cmdline.h : cmdline.ggo +# gengetopt -i cmdline.ggo + +# cmdline.ggo: cmdline_common.ggo cmdline_program.ggo +# cat cmdline_common.ggo cmdline_program.ggo > cmdline.ggo diff --git a/src/common/README b/src/common/README new file mode 100644 index 0000000..46870a9 --- /dev/null +++ b/src/common/README @@ -0,0 +1,18 @@ +Propozycja ujednolicenia dzialania klocka na poziomie +funkcji main. Parametry meta - zdefiniowane dla +wszystkich, poza tok, programow, definiujace ich zachowanie +w systemie klockow. + +cmdline_common.ggo - deklaracje parametrow meta + +cmdline_program.ggo - przyklad deklaracji parametrow programu + nazwa docelowa np. 
cmdline_guess.ggo + +common.cc - zmienne globalne zawierajace informacje + przekazane przez parametry meta +common.h + +main_template.cc - szkielet funkcji main + +Makefile - sposob kompilacji + diff --git a/src/common/cmdline.c b/src/common/cmdline.c new file mode 100644 index 0000000..6fa1d2c --- /dev/null +++ b/src/common/cmdline.c @@ -0,0 +1,1248 @@ +/* + File autogenerated by gengetopt version 2.22.4 + generated with the following command: + gengetopt -i cmdline.ggo + + The developers of gengetopt consider the fixed text that goes in all + gengetopt output files to be in the public domain: + we make no copyright claims on it. +*/ + +/* If we use autoconf. */ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include + +#ifndef FIX_UNUSED +#define FIX_UNUSED(X) (void) (X) /* avoid warnings for unused params */ +#endif + +#include + +#include "cmdline.h" + +const char *gengetopt_args_info_purpose = ""; + +const char *gengetopt_args_info_usage = "Usage: guess [OPTIONS]..."; + +const char *gengetopt_args_info_description = ""; + +const char *gengetopt_args_info_full_help[] = { + " -h, --help Print help and exit", + " --full-help Print help, including hidden options, and exit", + " -V, --version Print version and exit", + " -f, --input=STRING Input file", + " -o, --output=STRING Output file for succesfully processed segments", + " -e, --fail=STRING Output file for unsuccesfully processed segments ", + " --only-fail Print only segments the program failed to process \n (default=off)", + " --no-fail Print only segments the program processed \n (default=off)", + " -c, --copy Copy succesfully processed segments to standard \n output (default=off)", + " -p, --process=STRING Process segments with this tag", + " -s, --select=STRING Select only segments with this field", + " -S, --ignore=STRING Select only segments without this field", + " -O, --output-field=STRING Output field name", + " -I, --input-field=STRING Input field name", + " -i, 
--interactive Toggle interactive mode (default=off)", + " --config=FILENAME Configuration file", + " -1, --one-field Print all results in one segments (creates \n ambiguous annotation) (default=off)", + " --one-line Print annotation alternatives as additional fields \n (default=off)", + " --language=STRING Language.", + " -l, --color Show guessed descriptions in colour. \n (default=off)", + 0 +}; + +static void +init_help_array(void) +{ + gengetopt_args_info_help[0] = gengetopt_args_info_full_help[0]; + gengetopt_args_info_help[1] = gengetopt_args_info_full_help[1]; + gengetopt_args_info_help[2] = gengetopt_args_info_full_help[2]; + gengetopt_args_info_help[3] = gengetopt_args_info_full_help[3]; + gengetopt_args_info_help[4] = gengetopt_args_info_full_help[4]; + gengetopt_args_info_help[5] = gengetopt_args_info_full_help[5]; + gengetopt_args_info_help[6] = gengetopt_args_info_full_help[8]; + gengetopt_args_info_help[7] = gengetopt_args_info_full_help[9]; + gengetopt_args_info_help[8] = gengetopt_args_info_full_help[10]; + gengetopt_args_info_help[9] = gengetopt_args_info_full_help[11]; + gengetopt_args_info_help[10] = gengetopt_args_info_full_help[12]; + gengetopt_args_info_help[11] = gengetopt_args_info_full_help[13]; + gengetopt_args_info_help[12] = gengetopt_args_info_full_help[14]; + gengetopt_args_info_help[13] = gengetopt_args_info_full_help[15]; + gengetopt_args_info_help[14] = gengetopt_args_info_full_help[16]; + gengetopt_args_info_help[15] = gengetopt_args_info_full_help[17]; + gengetopt_args_info_help[16] = gengetopt_args_info_full_help[18]; + gengetopt_args_info_help[17] = gengetopt_args_info_full_help[19]; + gengetopt_args_info_help[18] = 0; + +} + +const char *gengetopt_args_info_help[19]; + +typedef enum {ARG_NO + , ARG_FLAG + , ARG_STRING +} cmdline_parser_arg_type; + +static +void clear_given (struct gengetopt_args_info *args_info); +static +void clear_args (struct gengetopt_args_info *args_info); + +static int +cmdline_parser_internal (int argc, 
char **argv, struct gengetopt_args_info *args_info, + struct cmdline_parser_params *params, const char *additional_error); + +static int +cmdline_parser_required2 (struct gengetopt_args_info *args_info, const char *prog_name, const char *additional_error); + +static char * +gengetopt_strdup (const char *s); + +static +void clear_given (struct gengetopt_args_info *args_info) +{ + args_info->help_given = 0 ; + args_info->full_help_given = 0 ; + args_info->version_given = 0 ; + args_info->input_given = 0 ; + args_info->output_given = 0 ; + args_info->fail_given = 0 ; + args_info->only_fail_given = 0 ; + args_info->no_fail_given = 0 ; + args_info->copy_given = 0 ; + args_info->process_given = 0 ; + args_info->select_given = 0 ; + args_info->ignore_given = 0 ; + args_info->output_field_given = 0 ; + args_info->input_field_given = 0 ; + args_info->interactive_given = 0 ; + args_info->config_given = 0 ; + args_info->one_field_given = 0 ; + args_info->one_line_given = 0 ; + args_info->language_given = 0 ; + args_info->color_given = 0 ; +} + +static +void clear_args (struct gengetopt_args_info *args_info) +{ + FIX_UNUSED (args_info); + args_info->input_arg = NULL; + args_info->input_orig = NULL; + args_info->output_arg = NULL; + args_info->output_orig = NULL; + args_info->fail_arg = NULL; + args_info->fail_orig = NULL; + args_info->only_fail_flag = 0; + args_info->no_fail_flag = 0; + args_info->copy_flag = 0; + args_info->process_arg = NULL; + args_info->process_orig = NULL; + args_info->select_arg = NULL; + args_info->select_orig = NULL; + args_info->ignore_arg = NULL; + args_info->ignore_orig = NULL; + args_info->output_field_arg = NULL; + args_info->output_field_orig = NULL; + args_info->input_field_arg = NULL; + args_info->input_field_orig = NULL; + args_info->interactive_flag = 0; + args_info->config_arg = NULL; + args_info->config_orig = NULL; + args_info->one_field_flag = 0; + args_info->one_line_flag = 0; + args_info->language_arg = NULL; + args_info->language_orig 
= NULL; + args_info->color_flag = 0; + +} + +static +void init_args_info(struct gengetopt_args_info *args_info) +{ + + init_help_array(); + args_info->help_help = gengetopt_args_info_full_help[0] ; + args_info->full_help_help = gengetopt_args_info_full_help[1] ; + args_info->version_help = gengetopt_args_info_full_help[2] ; + args_info->input_help = gengetopt_args_info_full_help[3] ; + args_info->output_help = gengetopt_args_info_full_help[4] ; + args_info->fail_help = gengetopt_args_info_full_help[5] ; + args_info->only_fail_help = gengetopt_args_info_full_help[6] ; + args_info->no_fail_help = gengetopt_args_info_full_help[7] ; + args_info->copy_help = gengetopt_args_info_full_help[8] ; + args_info->process_help = gengetopt_args_info_full_help[9] ; + args_info->process_min = 0; + args_info->process_max = 0; + args_info->select_help = gengetopt_args_info_full_help[10] ; + args_info->select_min = 0; + args_info->select_max = 0; + args_info->ignore_help = gengetopt_args_info_full_help[11] ; + args_info->ignore_min = 0; + args_info->ignore_max = 0; + args_info->output_field_help = gengetopt_args_info_full_help[12] ; + args_info->input_field_help = gengetopt_args_info_full_help[13] ; + args_info->input_field_min = 0; + args_info->input_field_max = 0; + args_info->interactive_help = gengetopt_args_info_full_help[14] ; + args_info->config_help = gengetopt_args_info_full_help[15] ; + args_info->one_field_help = gengetopt_args_info_full_help[16] ; + args_info->one_line_help = gengetopt_args_info_full_help[17] ; + args_info->language_help = gengetopt_args_info_full_help[18] ; + args_info->color_help = gengetopt_args_info_full_help[19] ; + +} + +void +cmdline_parser_print_version (void) +{ + printf ("%s %s\n", + (strlen(CMDLINE_PARSER_PACKAGE_NAME) ? 
CMDLINE_PARSER_PACKAGE_NAME : CMDLINE_PARSER_PACKAGE), + CMDLINE_PARSER_VERSION); +} + +static void print_help_common(void) { + cmdline_parser_print_version (); + + if (strlen(gengetopt_args_info_purpose) > 0) + printf("\n%s\n", gengetopt_args_info_purpose); + + if (strlen(gengetopt_args_info_usage) > 0) + printf("\n%s\n", gengetopt_args_info_usage); + + printf("\n"); + + if (strlen(gengetopt_args_info_description) > 0) + printf("%s\n\n", gengetopt_args_info_description); +} + +void +cmdline_parser_print_help (void) +{ + int i = 0; + print_help_common(); + while (gengetopt_args_info_help[i]) + printf("%s\n", gengetopt_args_info_help[i++]); +} + +void +cmdline_parser_print_full_help (void) +{ + int i = 0; + print_help_common(); + while (gengetopt_args_info_full_help[i]) + printf("%s\n", gengetopt_args_info_full_help[i++]); +} + +void +cmdline_parser_init (struct gengetopt_args_info *args_info) +{ + clear_given (args_info); + clear_args (args_info); + init_args_info (args_info); +} + +void +cmdline_parser_params_init(struct cmdline_parser_params *params) +{ + if (params) + { + params->override = 0; + params->initialize = 1; + params->check_required = 1; + params->check_ambiguity = 0; + params->print_errors = 1; + } +} + +struct cmdline_parser_params * +cmdline_parser_params_create(void) +{ + struct cmdline_parser_params *params = + (struct cmdline_parser_params *)malloc(sizeof(struct cmdline_parser_params)); + cmdline_parser_params_init(params); + return params; +} + +static void +free_string_field (char **s) +{ + if (*s) + { + free (*s); + *s = 0; + } +} + +/** @brief generic value variable */ +union generic_value { + char *string_arg; + const char *default_string_arg; +}; + +/** @brief holds temporary values for multiple options */ +struct generic_list +{ + union generic_value arg; + char *orig; + struct generic_list *next; +}; + +/** + * @brief add a node at the head of the list + */ +static void add_node(struct generic_list **list) { + struct generic_list 
*new_node = (struct generic_list *) malloc (sizeof (struct generic_list)); + new_node->next = *list; + *list = new_node; + new_node->arg.string_arg = 0; + new_node->orig = 0; +} + + +static void +free_multiple_string_field(unsigned int len, char ***arg, char ***orig) +{ + unsigned int i; + if (*arg) { + for (i = 0; i < len; ++i) + { + free_string_field(&((*arg)[i])); + free_string_field(&((*orig)[i])); + } + free_string_field(&((*arg)[0])); /* free default string */ + + free (*arg); + *arg = 0; + free (*orig); + *orig = 0; + } +} + +static void +cmdline_parser_release (struct gengetopt_args_info *args_info) +{ + + free_string_field (&(args_info->input_arg)); + free_string_field (&(args_info->input_orig)); + free_string_field (&(args_info->output_arg)); + free_string_field (&(args_info->output_orig)); + free_string_field (&(args_info->fail_arg)); + free_string_field (&(args_info->fail_orig)); + free_multiple_string_field (args_info->process_given, &(args_info->process_arg), &(args_info->process_orig)); + free_multiple_string_field (args_info->select_given, &(args_info->select_arg), &(args_info->select_orig)); + free_multiple_string_field (args_info->ignore_given, &(args_info->ignore_arg), &(args_info->ignore_orig)); + free_string_field (&(args_info->output_field_arg)); + free_string_field (&(args_info->output_field_orig)); + free_multiple_string_field (args_info->input_field_given, &(args_info->input_field_arg), &(args_info->input_field_orig)); + free_string_field (&(args_info->config_arg)); + free_string_field (&(args_info->config_orig)); + free_string_field (&(args_info->language_arg)); + free_string_field (&(args_info->language_orig)); + + + + clear_given (args_info); +} + + +static void +write_into_file(FILE *outfile, const char *opt, const char *arg, const char *values[]) +{ + FIX_UNUSED (values); + if (arg) { + fprintf(outfile, "%s=\"%s\"\n", opt, arg); + } else { + fprintf(outfile, "%s\n", opt); + } +} + +static void +write_multiple_into_file(FILE *outfile, 
int len, const char *opt, char **arg, const char *values[]) +{ + int i; + + for (i = 0; i < len; ++i) + write_into_file(outfile, opt, (arg ? arg[i] : 0), values); +} + +int +cmdline_parser_dump(FILE *outfile, struct gengetopt_args_info *args_info) +{ + int i = 0; + + if (!outfile) + { + fprintf (stderr, "%s: cannot dump options to stream\n", CMDLINE_PARSER_PACKAGE); + return EXIT_FAILURE; + } + + if (args_info->help_given) + write_into_file(outfile, "help", 0, 0 ); + if (args_info->full_help_given) + write_into_file(outfile, "full-help", 0, 0 ); + if (args_info->version_given) + write_into_file(outfile, "version", 0, 0 ); + if (args_info->input_given) + write_into_file(outfile, "input", args_info->input_orig, 0); + if (args_info->output_given) + write_into_file(outfile, "output", args_info->output_orig, 0); + if (args_info->fail_given) + write_into_file(outfile, "fail", args_info->fail_orig, 0); + if (args_info->only_fail_given) + write_into_file(outfile, "only-fail", 0, 0 ); + if (args_info->no_fail_given) + write_into_file(outfile, "no-fail", 0, 0 ); + if (args_info->copy_given) + write_into_file(outfile, "copy", 0, 0 ); + write_multiple_into_file(outfile, args_info->process_given, "process", args_info->process_orig, 0); + write_multiple_into_file(outfile, args_info->select_given, "select", args_info->select_orig, 0); + write_multiple_into_file(outfile, args_info->ignore_given, "ignore", args_info->ignore_orig, 0); + if (args_info->output_field_given) + write_into_file(outfile, "output-field", args_info->output_field_orig, 0); + write_multiple_into_file(outfile, args_info->input_field_given, "input-field", args_info->input_field_orig, 0); + if (args_info->interactive_given) + write_into_file(outfile, "interactive", 0, 0 ); + if (args_info->config_given) + write_into_file(outfile, "config", args_info->config_orig, 0); + if (args_info->one_field_given) + write_into_file(outfile, "one-field", 0, 0 ); + if (args_info->one_line_given) + write_into_file(outfile, 
"one-line", 0, 0 ); + if (args_info->language_given) + write_into_file(outfile, "language", args_info->language_orig, 0); + if (args_info->color_given) + write_into_file(outfile, "color", 0, 0 ); + + + i = EXIT_SUCCESS; + return i; +} + +int +cmdline_parser_file_save(const char *filename, struct gengetopt_args_info *args_info) +{ + FILE *outfile; + int i = 0; + + outfile = fopen(filename, "w"); + + if (!outfile) + { + fprintf (stderr, "%s: cannot open file for writing: %s\n", CMDLINE_PARSER_PACKAGE, filename); + return EXIT_FAILURE; + } + + i = cmdline_parser_dump(outfile, args_info); + fclose (outfile); + + return i; +} + +void +cmdline_parser_free (struct gengetopt_args_info *args_info) +{ + cmdline_parser_release (args_info); +} + +/** @brief replacement of strdup, which is not standard */ +char * +gengetopt_strdup (const char *s) +{ + char *result = 0; + if (!s) + return result; + + result = (char*)malloc(strlen(s) + 1); + if (result == (char*)0) + return (char*)0; + strcpy(result, s); + return result; +} + +static char * +get_multiple_arg_token(const char *arg) +{ + const char *tok; + char *ret; + size_t len, num_of_escape, i, j; + + if (!arg) + return 0; + + tok = strchr (arg, ','); + num_of_escape = 0; + + /* make sure it is not escaped */ + while (tok) + { + if (*(tok-1) == '\\') + { + /* find the next one */ + tok = strchr (tok+1, ','); + ++num_of_escape; + } + else + break; + } + + if (tok) + len = (size_t)(tok - arg + 1); + else + len = strlen (arg) + 1; + + len -= num_of_escape; + + ret = (char *) malloc (len); + + i = 0; + j = 0; + while (arg[i] && (j < len-1)) + { + if (arg[i] == '\\' && + arg[ i + 1 ] && + arg[ i + 1 ] == ',') + ++i; + + ret[j++] = arg[i++]; + } + + ret[len-1] = '\0'; + + return ret; +} + +static const char * +get_multiple_arg_token_next(const char *arg) +{ + const char *tok; + + if (!arg) + return 0; + + tok = strchr (arg, ','); + + /* make sure it is not escaped */ + while (tok) + { + if (*(tok-1) == '\\') + { + /* find the next 
one */ + tok = strchr (tok+1, ','); + } + else + break; + } + + if (! tok || strlen(tok) == 1) + return 0; + + return tok+1; +} + +static int +check_multiple_option_occurrences(const char *prog_name, unsigned int option_given, unsigned int min, unsigned int max, const char *option_desc); + +int +check_multiple_option_occurrences(const char *prog_name, unsigned int option_given, unsigned int min, unsigned int max, const char *option_desc) +{ + int error = 0; + + if (option_given && (min > 0 || max > 0)) + { + if (min > 0 && max > 0) + { + if (min == max) + { + /* specific occurrences */ + if (option_given != (unsigned int) min) + { + fprintf (stderr, "%s: %s option occurrences must be %d\n", + prog_name, option_desc, min); + error = 1; + } + } + else if (option_given < (unsigned int) min + || option_given > (unsigned int) max) + { + /* range occurrences */ + fprintf (stderr, "%s: %s option occurrences must be between %d and %d\n", + prog_name, option_desc, min, max); + error = 1; + } + } + else if (min > 0) + { + /* at least check */ + if (option_given < min) + { + fprintf (stderr, "%s: %s option occurrences must be at least %d\n", + prog_name, option_desc, min); + error = 1; + } + } + else if (max > 0) + { + /* at most check */ + if (option_given > max) + { + fprintf (stderr, "%s: %s option occurrences must be at most %d\n", + prog_name, option_desc, max); + error = 1; + } + } + } + + return error; +} +int +cmdline_parser (int argc, char **argv, struct gengetopt_args_info *args_info) +{ + return cmdline_parser2 (argc, argv, args_info, 0, 1, 1); +} + +int +cmdline_parser_ext (int argc, char **argv, struct gengetopt_args_info *args_info, + struct cmdline_parser_params *params) +{ + int result; + result = cmdline_parser_internal (argc, argv, args_info, params, 0); + + if (result == EXIT_FAILURE) + { + cmdline_parser_free (args_info); + exit (EXIT_FAILURE); + } + + return result; +} + +int +cmdline_parser2 (int argc, char **argv, struct gengetopt_args_info *args_info, 
int override, int initialize, int check_required) +{ + int result; + struct cmdline_parser_params params; + + params.override = override; + params.initialize = initialize; + params.check_required = check_required; + params.check_ambiguity = 0; + params.print_errors = 1; + + result = cmdline_parser_internal (argc, argv, args_info, ¶ms, 0); + + if (result == EXIT_FAILURE) + { + cmdline_parser_free (args_info); + exit (EXIT_FAILURE); + } + + return result; +} + +int +cmdline_parser_required (struct gengetopt_args_info *args_info, const char *prog_name) +{ + int result = EXIT_SUCCESS; + + if (cmdline_parser_required2(args_info, prog_name, 0) > 0) + result = EXIT_FAILURE; + + if (result == EXIT_FAILURE) + { + cmdline_parser_free (args_info); + exit (EXIT_FAILURE); + } + + return result; +} + +int +cmdline_parser_required2 (struct gengetopt_args_info *args_info, const char *prog_name, const char *additional_error) +{ + int error = 0; + FIX_UNUSED (additional_error); + + /* checks for required options */ + if (check_multiple_option_occurrences(prog_name, args_info->process_given, args_info->process_min, args_info->process_max, "'--process' ('-p')")) + error = 1; + + if (check_multiple_option_occurrences(prog_name, args_info->select_given, args_info->select_min, args_info->select_max, "'--select' ('-s')")) + error = 1; + + if (check_multiple_option_occurrences(prog_name, args_info->ignore_given, args_info->ignore_min, args_info->ignore_max, "'--ignore' ('-S')")) + error = 1; + + if (check_multiple_option_occurrences(prog_name, args_info->input_field_given, args_info->input_field_min, args_info->input_field_max, "'--input-field' ('-I')")) + error = 1; + + + /* checks for dependences among options */ + + return error; +} + + +static char *package_name = 0; + +/** + * @brief updates an option + * @param field the generic pointer to the field to update + * @param orig_field the pointer to the orig field + * @param field_given the pointer to the number of occurrence of this 
option + * @param prev_given the pointer to the number of occurrence already seen + * @param value the argument for this option (if null no arg was specified) + * @param possible_values the possible values for this option (if specified) + * @param default_value the default value (in case the option only accepts fixed values) + * @param arg_type the type of this option + * @param check_ambiguity @see cmdline_parser_params.check_ambiguity + * @param override @see cmdline_parser_params.override + * @param no_free whether to free a possible previous value + * @param multiple_option whether this is a multiple option + * @param long_opt the corresponding long option + * @param short_opt the corresponding short option (or '-' if none) + * @param additional_error possible further error specification + */ +static +int update_arg(void *field, char **orig_field, + unsigned int *field_given, unsigned int *prev_given, + char *value, const char *possible_values[], + const char *default_value, + cmdline_parser_arg_type arg_type, + int check_ambiguity, int override, + int no_free, int multiple_option, + const char *long_opt, char short_opt, + const char *additional_error) +{ + char *stop_char = 0; + const char *val = value; + int found; + char **string_field; + FIX_UNUSED (field); + + stop_char = 0; + found = 0; + + if (!multiple_option && prev_given && (*prev_given || (check_ambiguity && *field_given))) + { + if (short_opt != '-') + fprintf (stderr, "%s: `--%s' (`-%c') option given more than once%s\n", + package_name, long_opt, short_opt, + (additional_error ? additional_error : "")); + else + fprintf (stderr, "%s: `--%s' option given more than once%s\n", + package_name, long_opt, + (additional_error ? additional_error : "")); + return 1; /* failure */ + } + + FIX_UNUSED (default_value); + + if (field_given && *field_given && ! 
override) + return 0; + if (prev_given) + (*prev_given)++; + if (field_given) + (*field_given)++; + if (possible_values) + val = possible_values[found]; + + switch(arg_type) { + case ARG_FLAG: + *((int *)field) = !*((int *)field); + break; + case ARG_STRING: + if (val) { + string_field = (char **)field; + if (!no_free && *string_field) + free (*string_field); /* free previous string */ + *string_field = gengetopt_strdup (val); + } + break; + default: + break; + }; + + + /* store the original value */ + switch(arg_type) { + case ARG_NO: + case ARG_FLAG: + break; + default: + if (value && orig_field) { + if (no_free) { + *orig_field = value; + } else { + if (*orig_field) + free (*orig_field); /* free previous string */ + *orig_field = gengetopt_strdup (value); + } + } + }; + + return 0; /* OK */ +} + +/** + * @brief store information about a multiple option in a temporary list + * @param list where to (temporarily) store multiple options + */ +static +int update_multiple_arg_temp(struct generic_list **list, + unsigned int *prev_given, const char *val, + const char *possible_values[], const char *default_value, + cmdline_parser_arg_type arg_type, + const char *long_opt, char short_opt, + const char *additional_error) +{ + /* store single arguments */ + char *multi_token; + const char *multi_next; + + if (arg_type == ARG_NO) { + (*prev_given)++; + return 0; /* OK */ + } + + multi_token = get_multiple_arg_token(val); + multi_next = get_multiple_arg_token_next (val); + + while (1) + { + add_node (list); + if (update_arg((void *)&((*list)->arg), &((*list)->orig), 0, + prev_given, multi_token, possible_values, default_value, + arg_type, 0, 1, 1, 1, long_opt, short_opt, additional_error)) { + if (multi_token) free(multi_token); + return 1; /* failure */ + } + + if (multi_next) + { + multi_token = get_multiple_arg_token(multi_next); + multi_next = get_multiple_arg_token_next (multi_next); + } + else + break; + } + + return 0; /* OK */ +} + +/** + * @brief free the passed 
list (including possible string argument) + */ +static +void free_list(struct generic_list *list, short string_arg) +{ + if (list) { + struct generic_list *tmp; + while (list) + { + tmp = list; + if (string_arg && list->arg.string_arg) + free (list->arg.string_arg); + if (list->orig) + free (list->orig); + list = list->next; + free (tmp); + } + } +} + +/** + * @brief updates a multiple option starting from the passed list + */ +static +void update_multiple_arg(void *field, char ***orig_field, + unsigned int field_given, unsigned int prev_given, union generic_value *default_value, + cmdline_parser_arg_type arg_type, + struct generic_list *list) +{ + int i; + struct generic_list *tmp; + + if (prev_given && list) { + *orig_field = (char **) realloc (*orig_field, (field_given + prev_given) * sizeof (char *)); + + switch(arg_type) { + case ARG_STRING: + *((char ***)field) = (char **)realloc (*((char ***)field), (field_given + prev_given) * sizeof (char *)); break; + default: + break; + }; + + for (i = (prev_given - 1); i >= 0; --i) + { + tmp = list; + + switch(arg_type) { + case ARG_STRING: + (*((char ***)field))[i + field_given] = tmp->arg.string_arg; break; + default: + break; + } + (*orig_field) [i + field_given] = list->orig; + list = list->next; + free (tmp); + } + } else { /* set the default value */ + if (default_value && ! field_given) { + switch(arg_type) { + case ARG_STRING: + if (! *((char ***)field)) { + *((char ***)field) = (char **)malloc (sizeof (char *)); + (*((char ***)field))[0] = gengetopt_strdup(default_value->string_arg); + } + break; + default: break; + } + if (!(*orig_field)) { + *orig_field = (char **) malloc (sizeof (char *)); + (*orig_field)[0] = 0; + } + } + } +} + +int +cmdline_parser_internal ( + int argc, char **argv, struct gengetopt_args_info *args_info, + struct cmdline_parser_params *params, const char *additional_error) +{ + int c; /* Character of the parsed option. 
*/ + + struct generic_list * process_list = NULL; + struct generic_list * select_list = NULL; + struct generic_list * ignore_list = NULL; + struct generic_list * input_field_list = NULL; + int error = 0; + struct gengetopt_args_info local_args_info; + + int override; + int initialize; + int check_required; + int check_ambiguity; + + package_name = argv[0]; + + override = params->override; + initialize = params->initialize; + check_required = params->check_required; + check_ambiguity = params->check_ambiguity; + + if (initialize) + cmdline_parser_init (args_info); + + cmdline_parser_init (&local_args_info); + + optarg = 0; + optind = 0; + opterr = params->print_errors; + optopt = '?'; + + while (1) + { + int option_index = 0; + + static struct option long_options[] = { + { "help", 0, NULL, 'h' }, + { "full-help", 0, NULL, 0 }, + { "version", 0, NULL, 'V' }, + { "input", 1, NULL, 'f' }, + { "output", 1, NULL, 'o' }, + { "fail", 1, NULL, 'e' }, + { "only-fail", 0, NULL, 0 }, + { "no-fail", 0, NULL, 0 }, + { "copy", 0, NULL, 'c' }, + { "process", 1, NULL, 'p' }, + { "select", 1, NULL, 's' }, + { "ignore", 1, NULL, 'S' }, + { "output-field", 1, NULL, 'O' }, + { "input-field", 1, NULL, 'I' }, + { "interactive", 0, NULL, 'i' }, + { "config", 1, NULL, 0 }, + { "one-field", 0, NULL, '1' }, + { "one-line", 0, NULL, 0 }, + { "language", 1, NULL, 0 }, + { "color", 0, NULL, 'l' }, + { 0, 0, 0, 0 } + }; + + c = getopt_long (argc, argv, "hVf:o:e:cp:s:S:O:I:i1l", long_options, &option_index); + + if (c == -1) break; /* Exit from `while (1)' loop. */ + + switch (c) + { + case 'h': /* Print help and exit. */ + cmdline_parser_print_help (); + cmdline_parser_free (&local_args_info); + exit (EXIT_SUCCESS); + + case 'V': /* Print version and exit. */ + cmdline_parser_print_version (); + cmdline_parser_free (&local_args_info); + exit (EXIT_SUCCESS); + + case 'f': /* Input file. 
*/ + + + if (update_arg( (void *)&(args_info->input_arg), + &(args_info->input_orig), &(args_info->input_given), + &(local_args_info.input_given), optarg, 0, 0, ARG_STRING, + check_ambiguity, override, 0, 0, + "input", 'f', + additional_error)) + goto failure; + + break; + case 'o': /* Output file for succesfully processed segments. */ + + + if (update_arg( (void *)&(args_info->output_arg), + &(args_info->output_orig), &(args_info->output_given), + &(local_args_info.output_given), optarg, 0, 0, ARG_STRING, + check_ambiguity, override, 0, 0, + "output", 'o', + additional_error)) + goto failure; + + break; + case 'e': /* Output file for unsuccesfully processed segments . */ + + + if (update_arg( (void *)&(args_info->fail_arg), + &(args_info->fail_orig), &(args_info->fail_given), + &(local_args_info.fail_given), optarg, 0, 0, ARG_STRING, + check_ambiguity, override, 0, 0, + "fail", 'e', + additional_error)) + goto failure; + + break; + case 'c': /* Copy succesfully processed segments to standard output. */ + + + if (update_arg((void *)&(args_info->copy_flag), 0, &(args_info->copy_given), + &(local_args_info.copy_given), optarg, 0, 0, ARG_FLAG, + check_ambiguity, override, 1, 0, "copy", 'c', + additional_error)) + goto failure; + + break; + case 'p': /* Process segments with this tag. */ + + if (update_multiple_arg_temp(&process_list, + &(local_args_info.process_given), optarg, 0, 0, ARG_STRING, + "process", 'p', + additional_error)) + goto failure; + + break; + case 's': /* Select only segments with this field. */ + + if (update_multiple_arg_temp(&select_list, + &(local_args_info.select_given), optarg, 0, 0, ARG_STRING, + "select", 's', + additional_error)) + goto failure; + + break; + case 'S': /* Select only segments without this field. */ + + if (update_multiple_arg_temp(&ignore_list, + &(local_args_info.ignore_given), optarg, 0, 0, ARG_STRING, + "ignore", 'S', + additional_error)) + goto failure; + + break; + case 'O': /* Output field name. 
*/ + + + if (update_arg( (void *)&(args_info->output_field_arg), + &(args_info->output_field_orig), &(args_info->output_field_given), + &(local_args_info.output_field_given), optarg, 0, 0, ARG_STRING, + check_ambiguity, override, 0, 0, + "output-field", 'O', + additional_error)) + goto failure; + + break; + case 'I': /* Input field name. */ + + if (update_multiple_arg_temp(&input_field_list, + &(local_args_info.input_field_given), optarg, 0, 0, ARG_STRING, + "input-field", 'I', + additional_error)) + goto failure; + + break; + case 'i': /* Toggle interactive mode. */ + + + if (update_arg((void *)&(args_info->interactive_flag), 0, &(args_info->interactive_given), + &(local_args_info.interactive_given), optarg, 0, 0, ARG_FLAG, + check_ambiguity, override, 1, 0, "interactive", 'i', + additional_error)) + goto failure; + + break; + case '1': /* Print all results in one segments (creates ambiguous annotation). */ + + + if (update_arg((void *)&(args_info->one_field_flag), 0, &(args_info->one_field_given), + &(local_args_info.one_field_given), optarg, 0, 0, ARG_FLAG, + check_ambiguity, override, 1, 0, "one-field", '1', + additional_error)) + goto failure; + + break; + case 'l': /* Show guessed descriptions in colour.. */ + + + if (update_arg((void *)&(args_info->color_flag), 0, &(args_info->color_given), + &(local_args_info.color_given), optarg, 0, 0, ARG_FLAG, + check_ambiguity, override, 1, 0, "color", 'l', + additional_error)) + goto failure; + + break; + + case 0: /* Long option with no short option */ + if (strcmp (long_options[option_index].name, "full-help") == 0) { + cmdline_parser_print_full_help (); + cmdline_parser_free (&local_args_info); + exit (EXIT_SUCCESS); + } + + /* Print only segments the program failed to process. 
*/ + if (strcmp (long_options[option_index].name, "only-fail") == 0) + { + + + if (update_arg((void *)&(args_info->only_fail_flag), 0, &(args_info->only_fail_given), + &(local_args_info.only_fail_given), optarg, 0, 0, ARG_FLAG, + check_ambiguity, override, 1, 0, "only-fail", '-', + additional_error)) + goto failure; + + } + /* Print only segments the program processed. */ + else if (strcmp (long_options[option_index].name, "no-fail") == 0) + { + + + if (update_arg((void *)&(args_info->no_fail_flag), 0, &(args_info->no_fail_given), + &(local_args_info.no_fail_given), optarg, 0, 0, ARG_FLAG, + check_ambiguity, override, 1, 0, "no-fail", '-', + additional_error)) + goto failure; + + } + /* Configuration file. */ + else if (strcmp (long_options[option_index].name, "config") == 0) + { + + + if (update_arg( (void *)&(args_info->config_arg), + &(args_info->config_orig), &(args_info->config_given), + &(local_args_info.config_given), optarg, 0, 0, ARG_STRING, + check_ambiguity, override, 0, 0, + "config", '-', + additional_error)) + goto failure; + + } + /* Print annotation alternatives as additional fields. */ + else if (strcmp (long_options[option_index].name, "one-line") == 0) + { + + + if (update_arg((void *)&(args_info->one_line_flag), 0, &(args_info->one_line_given), + &(local_args_info.one_line_given), optarg, 0, 0, ARG_FLAG, + check_ambiguity, override, 1, 0, "one-line", '-', + additional_error)) + goto failure; + + } + /* Language.. */ + else if (strcmp (long_options[option_index].name, "language") == 0) + { + + + if (update_arg( (void *)&(args_info->language_arg), + &(args_info->language_orig), &(args_info->language_given), + &(local_args_info.language_given), optarg, 0, 0, ARG_STRING, + check_ambiguity, override, 0, 0, + "language", '-', + additional_error)) + goto failure; + + } + + break; + case '?': /* Invalid option. */ + /* `getopt_long' already printed an error message. */ + goto failure; + + default: /* bug: option not considered. 
*/ + fprintf (stderr, "%s: option unknown: %c%s\n", CMDLINE_PARSER_PACKAGE, c, (additional_error ? additional_error : "")); + abort (); + } /* switch */ + } /* while */ + + + update_multiple_arg((void *)&(args_info->process_arg), + &(args_info->process_orig), args_info->process_given, + local_args_info.process_given, 0, + ARG_STRING, process_list); + update_multiple_arg((void *)&(args_info->select_arg), + &(args_info->select_orig), args_info->select_given, + local_args_info.select_given, 0, + ARG_STRING, select_list); + update_multiple_arg((void *)&(args_info->ignore_arg), + &(args_info->ignore_orig), args_info->ignore_given, + local_args_info.ignore_given, 0, + ARG_STRING, ignore_list); + update_multiple_arg((void *)&(args_info->input_field_arg), + &(args_info->input_field_orig), args_info->input_field_given, + local_args_info.input_field_given, 0, + ARG_STRING, input_field_list); + + args_info->process_given += local_args_info.process_given; + local_args_info.process_given = 0; + args_info->select_given += local_args_info.select_given; + local_args_info.select_given = 0; + args_info->ignore_given += local_args_info.ignore_given; + local_args_info.ignore_given = 0; + args_info->input_field_given += local_args_info.input_field_given; + local_args_info.input_field_given = 0; + + if (check_required) + { + error += cmdline_parser_required2 (args_info, argv[0], additional_error); + } + + cmdline_parser_release (&local_args_info); + + if ( error ) + return (EXIT_FAILURE); + + return 0; + +failure: + free_list (process_list, 1 ); + free_list (select_list, 1 ); + free_list (ignore_list, 1 ); + free_list (input_field_list, 1 ); + + cmdline_parser_release (&local_args_info); + return (EXIT_FAILURE); +} diff --git a/src/common/cmdline.ggo b/src/common/cmdline.ggo new file mode 100644 index 0000000..a203b7f --- /dev/null +++ b/src/common/cmdline.ggo @@ -0,0 +1,39 @@ +#section "Common UTT options" + + +option "input" f "Input file" string no + +option "output" o "Output file 
for succesfully processed segments" string no + +option "fail" e "Output file for unsuccesfully processed segments " string no + +option "only-fail" - "Print only segments the program failed to process" flag off hidden + +option "no-fail" - "Print only segments the program processed" flag off hidden + +option "copy" c "Copy succesfully processed segments to standard output" flag off + +option "process" p "Process segments with this tag" string no multiple + +option "select" s "Select only segments with this field" string no multiple + +option "ignore" S "Select only segments without this field" string no multiple + +option "output-field" O "Output field name" string no + +option "input-field" I "Input field name" string no multiple + +option "interactive" i "Toggle interactive mode" flag off + +option "config" - "Configuration file" string typestr="FILENAME" no + +option "one-field" 1 "Print all results in one segments (creates ambiguous annotation)" flag off + +option "one-line" - "Print annotation alternatives as additional fields" flag off + +option "language" - "Language." string no +package "guess" +version "0.1" + +option "color" l "Show guessed descriptions in colour." flag off + diff --git a/src/common/cmdline.h b/src/common/cmdline.h new file mode 100644 index 0000000..61bad58 --- /dev/null +++ b/src/common/cmdline.h @@ -0,0 +1,248 @@ +/** @file cmdline.h + * @brief The header file for the command line option parser + * generated by GNU Gengetopt version 2.22.4 + * http://www.gnu.org/software/gengetopt. + * DO NOT modify this file, since it can be overwritten + * @author GNU Gengetopt by Lorenzo Bettini */ + +#ifndef CMDLINE_H +#define CMDLINE_H + +/* If we use autoconf. 
*/ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include /* for FILE */ + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +#ifndef CMDLINE_PARSER_PACKAGE +/** @brief the program name (used for printing errors) */ +#define CMDLINE_PARSER_PACKAGE "guess" +#endif + +#ifndef CMDLINE_PARSER_PACKAGE_NAME +/** @brief the complete program name (used for help and version) */ +#define CMDLINE_PARSER_PACKAGE_NAME "guess" +#endif + +#ifndef CMDLINE_PARSER_VERSION +/** @brief the program version */ +#define CMDLINE_PARSER_VERSION "0.1" +#endif + +/** @brief Where the command line options are stored */ +struct gengetopt_args_info +{ + const char *help_help; /**< @brief Print help and exit help description. */ + const char *full_help_help; /**< @brief Print help, including hidden options, and exit help description. */ + const char *version_help; /**< @brief Print version and exit help description. */ + char * input_arg; /**< @brief Input file. */ + char * input_orig; /**< @brief Input file original value given at command line. */ + const char *input_help; /**< @brief Input file help description. */ + char * output_arg; /**< @brief Output file for succesfully processed segments. */ + char * output_orig; /**< @brief Output file for succesfully processed segments original value given at command line. */ + const char *output_help; /**< @brief Output file for succesfully processed segments help description. */ + char * fail_arg; /**< @brief Output file for unsuccesfully processed segments . */ + char * fail_orig; /**< @brief Output file for unsuccesfully processed segments original value given at command line. */ + const char *fail_help; /**< @brief Output file for unsuccesfully processed segments help description. */ + int only_fail_flag; /**< @brief Print only segments the program failed to process (default=off). */ + const char *only_fail_help; /**< @brief Print only segments the program failed to process help description. 
*/ + int no_fail_flag; /**< @brief Print only segments the program processed (default=off). */ + const char *no_fail_help; /**< @brief Print only segments the program processed help description. */ + int copy_flag; /**< @brief Copy succesfully processed segments to standard output (default=off). */ + const char *copy_help; /**< @brief Copy succesfully processed segments to standard output help description. */ + char ** process_arg; /**< @brief Process segments with this tag. */ + char ** process_orig; /**< @brief Process segments with this tag original value given at command line. */ + unsigned int process_min; /**< @brief Process segments with this tag's minimum occurreces */ + unsigned int process_max; /**< @brief Process segments with this tag's maximum occurreces */ + const char *process_help; /**< @brief Process segments with this tag help description. */ + char ** select_arg; /**< @brief Select only segments with this field. */ + char ** select_orig; /**< @brief Select only segments with this field original value given at command line. */ + unsigned int select_min; /**< @brief Select only segments with this field's minimum occurreces */ + unsigned int select_max; /**< @brief Select only segments with this field's maximum occurreces */ + const char *select_help; /**< @brief Select only segments with this field help description. */ + char ** ignore_arg; /**< @brief Select only segments without this field. */ + char ** ignore_orig; /**< @brief Select only segments without this field original value given at command line. */ + unsigned int ignore_min; /**< @brief Select only segments without this field's minimum occurreces */ + unsigned int ignore_max; /**< @brief Select only segments without this field's maximum occurreces */ + const char *ignore_help; /**< @brief Select only segments without this field help description. */ + char * output_field_arg; /**< @brief Output field name. 
*/ + char * output_field_orig; /**< @brief Output field name original value given at command line. */ + const char *output_field_help; /**< @brief Output field name help description. */ + char ** input_field_arg; /**< @brief Input field name. */ + char ** input_field_orig; /**< @brief Input field name original value given at command line. */ + unsigned int input_field_min; /**< @brief Input field name's minimum occurreces */ + unsigned int input_field_max; /**< @brief Input field name's maximum occurreces */ + const char *input_field_help; /**< @brief Input field name help description. */ + int interactive_flag; /**< @brief Toggle interactive mode (default=off). */ + const char *interactive_help; /**< @brief Toggle interactive mode help description. */ + char * config_arg; /**< @brief Configuration file. */ + char * config_orig; /**< @brief Configuration file original value given at command line. */ + const char *config_help; /**< @brief Configuration file help description. */ + int one_field_flag; /**< @brief Print all results in one segments (creates ambiguous annotation) (default=off). */ + const char *one_field_help; /**< @brief Print all results in one segments (creates ambiguous annotation) help description. */ + int one_line_flag; /**< @brief Print annotation alternatives as additional fields (default=off). */ + const char *one_line_help; /**< @brief Print annotation alternatives as additional fields help description. */ + char * language_arg; /**< @brief Language.. */ + char * language_orig; /**< @brief Language. original value given at command line. */ + const char *language_help; /**< @brief Language. help description. */ + int color_flag; /**< @brief Show guessed descriptions in colour. (default=off). */ + const char *color_help; /**< @brief Show guessed descriptions in colour. help description. */ + + unsigned int help_given ; /**< @brief Whether help was given. */ + unsigned int full_help_given ; /**< @brief Whether full-help was given. 
*/ + unsigned int version_given ; /**< @brief Whether version was given. */ + unsigned int input_given ; /**< @brief Whether input was given. */ + unsigned int output_given ; /**< @brief Whether output was given. */ + unsigned int fail_given ; /**< @brief Whether fail was given. */ + unsigned int only_fail_given ; /**< @brief Whether only-fail was given. */ + unsigned int no_fail_given ; /**< @brief Whether no-fail was given. */ + unsigned int copy_given ; /**< @brief Whether copy was given. */ + unsigned int process_given ; /**< @brief Whether process was given. */ + unsigned int select_given ; /**< @brief Whether select was given. */ + unsigned int ignore_given ; /**< @brief Whether ignore was given. */ + unsigned int output_field_given ; /**< @brief Whether output-field was given. */ + unsigned int input_field_given ; /**< @brief Whether input-field was given. */ + unsigned int interactive_given ; /**< @brief Whether interactive was given. */ + unsigned int config_given ; /**< @brief Whether config was given. */ + unsigned int one_field_given ; /**< @brief Whether one-field was given. */ + unsigned int one_line_given ; /**< @brief Whether one-line was given. */ + unsigned int language_given ; /**< @brief Whether language was given. */ + unsigned int color_given ; /**< @brief Whether color was given. 
*/ + +} ; + +/** @brief The additional parameters to pass to parser functions */ +struct cmdline_parser_params +{ + int override; /**< @brief whether to override possibly already present options (default 0) */ + int initialize; /**< @brief whether to initialize the option structure gengetopt_args_info (default 1) */ + int check_required; /**< @brief whether to check that all required options were provided (default 1) */ + int check_ambiguity; /**< @brief whether to check for options already specified in the option structure gengetopt_args_info (default 0) */ + int print_errors; /**< @brief whether getopt_long should print an error message for a bad option (default 1) */ +} ; + +/** @brief the purpose string of the program */ +extern const char *gengetopt_args_info_purpose; +/** @brief the usage string of the program */ +extern const char *gengetopt_args_info_usage; +/** @brief all the lines making the help output */ +extern const char *gengetopt_args_info_help[]; +/** @brief all the lines making the full help output (including hidden options) */ +extern const char *gengetopt_args_info_full_help[]; + +/** + * The command line parser + * @param argc the number of command line options + * @param argv the command line options + * @param args_info the structure where option information will be stored + * @return 0 if everything went fine, NON 0 if an error took place + */ +int cmdline_parser (int argc, char **argv, + struct gengetopt_args_info *args_info); + +/** + * The command line parser (version with additional parameters - deprecated) + * @param argc the number of command line options + * @param argv the command line options + * @param args_info the structure where option information will be stored + * @param override whether to override possibly already present options + * @param initialize whether to initialize the option structure my_args_info + * @param check_required whether to check that all required options were provided + * @return 0 if everything went 
fine, NON 0 if an error took place + * @deprecated use cmdline_parser_ext() instead + */ +int cmdline_parser2 (int argc, char **argv, + struct gengetopt_args_info *args_info, + int override, int initialize, int check_required); + +/** + * The command line parser (version with additional parameters) + * @param argc the number of command line options + * @param argv the command line options + * @param args_info the structure where option information will be stored + * @param params additional parameters for the parser + * @return 0 if everything went fine, NON 0 if an error took place + */ +int cmdline_parser_ext (int argc, char **argv, + struct gengetopt_args_info *args_info, + struct cmdline_parser_params *params); + +/** + * Save the contents of the option struct into an already open FILE stream. + * @param outfile the stream where to dump options + * @param args_info the option struct to dump + * @return 0 if everything went fine, NON 0 if an error took place + */ +int cmdline_parser_dump(FILE *outfile, + struct gengetopt_args_info *args_info); + +/** + * Save the contents of the option struct into a (text) file. 
+ * This file can be read by the config file parser (if generated by gengetopt) + * @param filename the file where to save + * @param args_info the option struct to save + * @return 0 if everything went fine, NON 0 if an error took place + */ +int cmdline_parser_file_save(const char *filename, + struct gengetopt_args_info *args_info); + +/** + * Print the help + */ +void cmdline_parser_print_help(void); +/** + * Print the full help (including hidden options) + */ +void cmdline_parser_print_full_help(void); +/** + * Print the version + */ +void cmdline_parser_print_version(void); + +/** + * Initializes all the fields a cmdline_parser_params structure + * to their default values + * @param params the structure to initialize + */ +void cmdline_parser_params_init(struct cmdline_parser_params *params); + +/** + * Allocates dynamically a cmdline_parser_params structure and initializes + * all its fields to their default values + * @return the created and initialized cmdline_parser_params structure + */ +struct cmdline_parser_params *cmdline_parser_params_create(void); + +/** + * Initializes the passed gengetopt_args_info structure's fields + * (also set default values for options that have a default) + * @param args_info the structure to initialize + */ +void cmdline_parser_init (struct gengetopt_args_info *args_info); +/** + * Deallocates the string fields of the gengetopt_args_info structure + * (but does not deallocate the structure itself) + * @param args_info the structure to deallocate + */ +void cmdline_parser_free (struct gengetopt_args_info *args_info); + +/** + * Checks that all the required options were specified + * @param args_info the structure to check + * @param prog_name the name of the program that will be used to print + * possible errors + * @return + */ +int cmdline_parser_required (struct gengetopt_args_info *args_info, + const char *prog_name); + + +#ifdef __cplusplus +} +#endif /* __cplusplus */ +#endif /* CMDLINE_H */ diff --git 
a/src/common/cmdline_common.ggo b/src/common/cmdline_common.ggo new file mode 100644 index 0000000..45a385a --- /dev/null +++ b/src/common/cmdline_common.ggo @@ -0,0 +1,34 @@ +#section "Common UTT options" + + +option "input" f "Input file" string no + +option "output" o "Output file for succesfully processed segments" string no + +option "fail" e "Output file for unsuccesfully processed segments " string no + +option "only-fail" - "Print only segments the program failed to process" flag off hidden + +option "no-fail" - "Print only segments the program processed" flag off hidden + +option "copy" c "Copy succesfully processed segments to standard output" flag off + +option "process" p "Process segments with this tag" string no multiple + +option "select" s "Select only segments with this field" string no multiple + +option "ignore" S "Select only segments without this field" string no multiple + +option "output-field" O "Output field name" string no + +option "input-field" I "Input field name" string no multiple + +option "interactive" i "Toggle interactive mode" flag off + +option "config" - "Configuration file" string typestr="FILENAME" no + +option "one-field" 1 "Print all results in one segments (creates ambiguous annotation)" flag off + +option "one-line" - "Print annotation alternatives as additional fields" flag off + +option "language" - "Language." string no diff --git a/src/common/cmdline_program.ggo b/src/common/cmdline_program.ggo new file mode 100644 index 0000000..e5e3058 --- /dev/null +++ b/src/common/cmdline_program.ggo @@ -0,0 +1,5 @@ +package "guess" +version "0.1" + +option "color" l "Show guessed descriptions in colour." 
flag off + diff --git a/src/common/common.cc b/src/common/common.cc new file mode 100644 index 0000000..6803139 --- /dev/null +++ b/src/common/common.cc @@ -0,0 +1,229 @@ +#include +#include +#include +#include +#include +#include "common.h" +#include +#include + +FILE* inputf=stdin; +FILE* outputf=stdout; +FILE* failedf=stdout; +bool copy_processed=0; +bool one_field=false; +bool one_line=false; +char output_field_prefix[FIELD_PREFIX_MAXLEN]; +char input_field_prefix[FIELD_PREFIX_MAXLEN]; + +extern int argc; +extern char **argv; + + +// tilde (home dir) expansion in path +int expand_path(char* inpath, char* outpath) +{ + if(inpath[0]=='~') + sprintf(outpath,"%s%s",getenv("HOME"),inpath+1); + else + strcpy(outpath,inpath); + return 0; // no problem +} + + + + +void set_program_name(char program_name[], char* argv0) +{ + if (char* p_name = strrchr(argv0, '/')) + strcpy(program_name,p_name+1); + else + strcpy(program_name,argv0); +} + + + +extern void process_config_files(gengetopt_args_info* args, char* argv0) +{ + + char program_name[256]; + char config_file[256]; + char config_file_tmp[256]; + + set_program_name(program_name,argv0); + + // obsługa pliku konfiguracyjnego podanego w linii komend + if (args->config_given) { + if (file_accessible(args->config_arg) == 0) { + if (cmdline_parser_configfile(args->config_arg, + args, + 0, // 0 - nie nadpisuj wartości parametrów + 0, // 0 - nie inicjuj + 0) != 0) { + fprintf(stderr, "Error in config file (%s)\n", args->config_arg); + exit(1); + } + } + } + + if(args->one_line_given && !one_line) one_line=true, one_field=false; + if(args->one_field_given && !one_field) one_line=false, one_field=true; + + // obsluga pliku konfiguracyjnego uzytkownika dla programu + sprintf(config_file_tmp, "%s/%s.conf", USER_CONFIG_DIR, program_name); + expand_path(config_file_tmp, config_file); + if (file_accessible(config_file) == 0) { + if (cmdline_parser_configfile(config_file, + args, + 0, // 0 - nie nadpisuj danych + 0, // 0 - nie 
inicjuj struktury + 0) != 0) { + fprintf(stderr, "Error in config file (%s)\n", config_file); + exit(1); + } + } + + if(args->one_line_given && !one_line) one_line=true, one_field=false; + if(args->one_field_given && !one_field) one_line=false, one_field=true; + + + // handle the user's global configuration file + sprintf(config_file_tmp, "%s/utt.conf", USER_CONFIG_DIR); + expand_path(config_file_tmp, config_file); + if (file_accessible(config_file) == 0) { + if (cmdline_parser_configfile(config_file, + args, + 0, // 0 - do not override data + 0, // 0 - do not initialise the structure + 0) != 0) { + fprintf(stderr, "Error in config file (%s)\n", config_file); + exit(1); + } + } + + if(args->one_line_given && !one_line) one_line=true, one_field=false; + if(args->one_field_given && !one_field) one_line=false, one_field=true; + + + + // handle the system-wide per-program configuration file + sprintf(config_file, "%s/%s.conf", SYSTEM_CONFIG_DIR, program_name); + if (file_accessible(config_file) == 0) { + if (cmdline_parser_configfile(config_file, + args, + 0, // 0 - do not change values set earlier + 0, // 0 - do not initialise the structure + 0 // 0 - do not check required parameters + ) != 0) { + fprintf(stderr, "Error in config file (%s)\n", config_file); + exit(1); + } + } + + if(args->one_line_given && !one_line) one_line=true, one_field=false; + if(args->one_field_given && !one_field) one_line=false, one_field=true; + + + // handle the system-wide global configuration file + sprintf(config_file, "%s/utt.conf", SYSTEM_CONFIG_DIR); + if (file_accessible(config_file) == 0) { + if (cmdline_parser_configfile(config_file, + args, + 0, // 0 - do not change values set earlier + 0, // 0 - do not initialise the structure + 0 // 0 - do not check required parameters + ) != 0) { + fprintf(stderr, "Error in config file (%s)\n", config_file); + exit(1); + } + } + + if(args->one_line_given && !one_line) one_line=true, one_field=false; + if(args->one_field_given && !one_field)
one_line=false, one_field=true; + +} + + +// Applies the common command-line options: sets the locale, opens the input, +// output and fail streams, and fills in the input/output field prefixes. +// Exits with status 1 when a requested file cannot be opened. +void process_common_options(gengetopt_args_info* args, char* argv0) +{ + char program_name[256]; + + set_program_name(program_name,argv0); + + setlocale(LC_CTYPE,""); + setlocale(LC_COLLATE, ""); + + if(args->help_given) + cmdline_parser_print_help (); + + if(args->input_given) + if(!(inputf=fopen(args->input_arg,"r"))) + { + fprintf(stderr,"No such file: %s.\n", args->input_arg); + exit(1); + } + + if(args->output_given) + if(!(outputf=fopen(args->output_arg,"w"))) + { + fprintf(stderr,"Cannot open output file: %s.\n", args->output_arg); + exit(1); + } + + if(args->fail_given) + if(!(failedf=fopen(args->fail_arg,"w"))) + { + fprintf(stderr,"Cannot open output file: %s.\n", args->fail_arg); + exit(1); + } + + if(args->input_field_given) + fieldprefix(args->input_field_arg[0],input_field_prefix); + else + strcpy(input_field_prefix, "4"); + + if(args->output_field_given) + fieldprefix(args->output_field_arg,output_field_prefix); + else + // program_name may hold up to 255 bytes; never write past the prefix buffer + snprintf(output_field_prefix, FIELD_PREFIX_MAXLEN, "%s%c", program_name, INFIELD_SEP); + + if ((args->copy_given)) + copy_processed=true; +} + +// checks that the file can be read (returns the result of access(path, R_OK)) +int file_accessible(const char* path) { + return access(path, R_OK); +} + +// checks that a configuration file exists +// Returns 0 when dir is a searchable directory and dir/filename is a readable +// regular file, -1 otherwise (including when memory for the path cannot be allocated). +int config_file_exists(const char* dir, const char* filename) { + struct stat dir_stat; + struct stat file_stat; + int result = -1; + + char* path = (char*)malloc(strlen(dir) + strlen(filename) + 2); // + '\0' + '/' + if (path == NULL) + return -1; + + sprintf(path, "%s/%s", dir, filename); + + // the checks run in their original order; path is released on every exit path + if (stat(dir, &dir_stat) == 0 + && stat(path, &file_stat) == 0 + && S_ISDIR(dir_stat.st_mode) // dir must be a directory + && S_ISREG(file_stat.st_mode) // the configuration file must be a regular file + && access(dir, X_OK) == 0 // we must be allowed to enter the directory + && access(path, R_OK) == 0) // we must be allowed to read the file + result = 0; + + free(path); + + return result; +} diff --git a/src/common/common.d
b/src/common/common.d new file mode 100644 index 0000000..e69de29 diff --git a/src/common/common.h b/src/common/common.h new file mode 100644 index 0000000..0dfa040 --- /dev/null +++ b/src/common/common.h @@ -0,0 +1,516 @@ +#ifndef __COMMON_H +#define __COMMON_H + +#include +#include +#include +#include + +#include +#include + +#include "../lib/const.h" + +#ifndef _CMDLINE_FILE +#error _CMDLINE_FILE constant not defined! +#else +#include _CMDLINE_FILE +#endif + + +/************************************************** + * Constants related to input/output + */ + +#define EMPTYFORM '*' +#define INFIELD_SEP ':' +#define MAXAUX 16 +#define FIELD_SEP " \t\n" +#define FIELD_PREFIX_MAXLEN 32 + + +// directories with configuration files +// new +// old - to be removed +// #define CONFIG_DIR ".utt/conf" + +// name of the variable holding the path to the data + +// #define UTT_DIR_VAR "UTT_DIR" + +// path to the data files (e.g. UTT_DIR/files) relative to $HOME! + +// #define UTT_DIR_DEFAULT ".utt/pl/" + +/**************************************************/ + + +extern FILE* inputf; +extern FILE* outputf; +extern FILE* failedf; + +extern char* input_filename; +extern char* output_filename; +extern char* failed_filename; +extern bool one_line; +extern bool one_field; + +extern char input_field_prefix[]; +extern char output_field_prefix[]; + +extern bool copy_processed; +extern bool append_output; +extern bool append_failed; + +// path to the data directory +extern char utt_dir[]; + +extern void process_common_options(gengetopt_args_info* args, char* argv0); +extern void process_config_files(gengetopt_args_info* args, char* argv0); + +extern int expand_path(char* inpath, char* outpath); + +/************************************************** + * problems with casing */ +// checks the letter case of s +// return value: +// 0 - all lower-case letters +// 1 - first upper-case, the rest lower-case +// 2 - all upper-case +// 3 - other +inline int casing(char* s) +{ + int ret = isupper(*s) ?
1 : 0; + if(*s=='\0') return ret; // empty string: the loop below would step past the terminator + // NOTE(review): a string such as "AbC" ends with ret==2 here, not 3 - confirm whether that is intended + while(*++s != '\0') + { + if(isupper(*s)) + { + if(ret==1) ret=2; + else if(ret==0) ret=3; + } + else + { + if(ret==2) ret=3; + } + } + return ret; +} + +// copies s to d (terminator included), converting every character with tolower() +inline void tolowers(char* s, char* d) +{ + *d=tolower(*s); + while(*s != '\0') * ++d = tolower(* ++s); +} + + +// copies s to d +// applying letter case according to the value of casing +// casing - value returned by casing() +// if casing==3 the text is copied unchanged (not enough information) +inline void restorecasing(char *s, char *d, int casing) +{ + switch(casing) + { + case 0: + case 3: + *d=*s; + while(*s != '\0') * ++d = * ++s; + break; + case 1: + *d=toupper(*s); + while(*s != '\0') * ++d = * ++s; + break; + case 2: + *d=toupper(*s); + while(*s != '\0') * ++d = toupper(* ++s); + break; + } +} + +/**************************************************/ + +/* +parameters: + -seg - segment + -pref - field name or "1", "2", "3", "4" for the first four fields + +val - field contents +return value: + 1 if specified field exists, 0 otherwise +*/ + +inline int getfield(char* seg, const char* pref, char* val) +{ + + char* p=seg; + char* p0; + + while(isspace(*p)) ++p; + + // field "1" + p0=p; while(isdigit(*p)) ++p; + if(*pref=='1') if(p!=p0) { strncpy(val,p0,p-p0); val[p-p0]='\0'; return 1; } else return 0; + + while(isspace(*p)) ++p; + + // field "2" + p0=p; while(isdigit(*p)) ++p; + if(*pref=='2') if(p!=p0) { strncpy(val,p0,p-p0); val[p-p0]='\0'; return 1; } else return 0; + + while(isspace(*p)) ++p; + + // field "3" + p0=p; while(isgraph(*p)) ++p; + if(*pref=='3') if(p!=p0) { strncpy(val,p0,p-p0); val[p-p0]='\0'; return 1; } else return 0; + + while(isspace(*p)) ++p; + + // field "4" + p0=p; while(isgraph(*p)) ++p; + if(*pref=='4') if(p!=p0) { strncpy(val,p0,p-p0); val[p-p0]='\0'; return 1; } else return 0; + + while(isspace(*p)) ++p; + + // annotation fields: accept a match of pref at the start of seg or right after a space/tab + p=strstr(p,pref); + while(p!=NULL && p!=seg && *(p-1)!=' ' && *(p-1)!='\t') + p = (*p!='\0') ? strstr(p+1,pref) : NULL; // step past the rejected match, otherwise the search never advances + + if(p==NULL) return 0; + else + { + p+=strlen(pref); + int
len=strcspn(p,FIELD_SEP "\n\r\f\0"); + strncpy(val,p,len); + val[len]='\0'; + return 1; + } +} + +// Wide-character overload of getfield: +// seg - segment, pref - field name or L"1".."4" for the first four fields, +// val - receives the field contents; returns 1 if the field exists, 0 otherwise. +inline int getfield(wchar_t* seg, const wchar_t* pref, wchar_t* val) +{ + + wchar_t* p=seg; + wchar_t* p0; + while(iswspace(*p)) ++p; + + // field "1" + p0=p; while(iswdigit(*p)) ++p; + if(*pref==L'1') + if(p!=p0) + { + wcsncpy(val,p0,p-p0); + val[p-p0]=L'\0'; + return 1; + } + else + return 0; + while(iswspace(*p)) ++p; + +// field "2" + p0=p; while(iswdigit(*p)) ++p; + if(*pref==L'2') if(p!=p0) { wcsncpy(val,p0,p-p0); val[p-p0]=L'\0'; return 1; } else return 0; + while(iswspace(*p)) ++p; + // field "3" + p0=p; while(iswgraph(*p)) ++p; + if(*pref==L'3') if(p!=p0) { wcsncpy(val,p0,p-p0); val[p-p0]=L'\0'; return 1; } else return 0; + + while(iswspace(*p)) ++p; + // field "4" + p0=p; while(iswgraph(*p)) ++p; + + if(*pref==L'4') if(p!=p0) { wcsncpy(val,p0,p-p0); val[p-p0]=L'\0'; return 1; } else return 0; + while(iswspace(*p)) ++p; + + // annotation fields: accept a match of pref at the start of seg or right after a space/tab + p=wcsstr(p,pref); + while(p!=NULL && p!=seg && *(p-1)!=L' ' && *(p-1)!=L'\t') + p = (*p!=L'\0') ? wcsstr(p+1,pref) : NULL; // step past the rejected match, otherwise the search never advances + if(p==NULL) return 0; + else + { + p+=wcslen(pref); + int len=wcscspn(p,FIELD_SEP L"\n\r\f\0"); + wcsncpy(val,p,len); + val[len]=L'\0'; + return 1; + } +} + +/* +parameters: + -name - field name, long or short + +prefix - field name with ':' appended if long name +return value: + 1 if correct field name, 0 otherwise +examples: +name prefix r.v.
+lem lem: 1 +@ @ 1 +:: 'undef' 0 +a,b 'undef' 0 +note: a long name that would not fit, with ':' and the terminator, in + FIELD_PREFIX_MAXLEN bytes is rejected (returns 0, prefix untouched) +*/ +inline +int fieldprefix(char *name, char *prefix) +{ + if (ispunct(name[0]) && name[1]=='\0') // correct short name + { + strcpy(prefix, name); return 1; + } + + int i=0; + while(name[i]!='\0' && isalnum(name[i])) ++i; + + // the callers visible in this code base pass FIELD_PREFIX_MAXLEN-byte buffers, + // so name + ':' + '\0' must fit in that many bytes + if(name[i]=='\0' && i>0 && i<FIELD_PREFIX_MAXLEN-1) // correct long name + { + sprintf(prefix,"%s:",name); return 1; + } + + // incorrect + return 0; +} + +inline +bool process_seg(char* seg, gengetopt_args_info& args) +{ + char buf[256]; + char pref[FIELD_PREFIX_MAXLEN]; + bool ret = !args.process_given; + if(args.process_given) + { + getfield(seg,"3",buf); + for(int i=0; i= MAX_LINE) return 0; // safer, but slower + + int seglen=strlen(seg); + sprintf(seg+(seglen-1)," %s%s\n",pref,val); + return 1; +} + +/**************************************************/ + +struct Seg +{ + int filepos, len; + char* tag; + char* form; + char* aux[MAXAUX]; + int auxn; + + bool parse(char* line); + char* getfield(char* fieldname); + void print(char* line); + bool addfield(char* s); + bool clearfields(); +}; + +/**************************************************/ + +/* definition of the input/output structure + */ +struct Segment +{ + int filepos, len; + char* tag; + char* form; + char* aux[MAXAUX]; + int auxn; + + bool parse(char* line); + char* getfield(char* fieldname); + void print(char* line); + bool addfield(char* s); + bool clearfields(); +}; + +/* + * Checks whether the given segment should be processed.
+ */ + +inline +bool process_seg(Segment& s, gengetopt_args_info& args) +{ + bool ret = !args.process_given; + + for(int i=0; i + +#include "common.h" + +int main(int argc, char* argv[]) +{ + gengetopt_args_info args; + + if(cmdline_parser(argc,argv,&args) != 0) + exit(1); + + process_common_options(&args, argv[0]); + + // + // YOUR CODE HERE + // + + cmdline_parser_free(&args); + +} diff --git a/src/compiledic/Makefile b/src/compiledic/Makefile new file mode 100644 index 0000000..dc373e8 --- /dev/null +++ b/src/compiledic/Makefile @@ -0,0 +1,25 @@ +include ../../config.mak + +all: compiledic + +.PHONY: all compiledic +compiledic: + +.PHONY: install +install: +ifdef BIN_DIR + install -m 0755 compiledic $(BIN_DIR) + install -m 0755 text2fst.py $(BIN_DIR) + install -m 0755 symbols.py $(BIN_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm -f $(BIN_DIR)/compiledic + rm -f $(BIN_DIR)/text2fst.py + rm -f $(BIN_DIR)/symbols.py +endif + +.PHONY: clean +clean: diff --git a/src/compiledic/compiledic b/src/compiledic/compiledic new file mode 100755 index 0000000..860c50f --- /dev/null +++ b/src/compiledic/compiledic @@ -0,0 +1,214 @@ +#! /usr/bin/env perl + +#package: UAM Text Tools +#component: compiledic +#version: 1.3 +#author: Tomasz Obrebski +#author: Krzysztof Szarzyski (2012 migration to OpenFST format) + +use utf8; + +use strict; +use locale; +use File::HomeDir; +use File::Basename; +use File::Temp; +use File::Copy; +use Getopt::Long; + + + +my $linesPerFile = 20000; + + + +Getopt::Long::Configure('no_ignore_case_always'); +my $help=0; +GetOptions("help|h" => \$help); + +if($help) +{ + print <<'END' +Usage: compiledic dictionaryfile.dic + +The dictionary file must be UTF8 without Byte Order Mark (BOM). +To remove BOM see removeBom.sh + +Options: + --help -h Help.
+END
+;
+    exit 0;
+}
+
+##################################################
+
+@ARGV > 0 or die("Source dictionary not given.\n");
+
+my $file = shift;
+
+-f $file or die("Source dictionary not found.\n");
+
+$file =~ /(.*)\.dic/ or die("The input file must have .dic extension.\n");
+
+my $filenameprefix = $1;
+
+##################################################
+
+# Create a temporary directory that holds every intermediate file.
+my $tmp_root = File::Temp::tempdir( CLEANUP => 1 );
+print "Using temp dir: $tmp_root\n";
+
+
+##################################################
+# Generate the symbol table (system() so the exit status can be checked):
+print "Generating the symbols table\t\t";
+system("python ./symbols.py > $tmp_root/symbols") == 0 or die "Failed!\n";
+print "OK\n";
+
+
+##################################################
+# Split the dictionary into chunk files of at most $linesPerFile lines:
+print "Dividing the dictionary file\t\t";
+
+open(IN, '<', $file) or die "Cannot open $file: $!\n";
+
+my $lineCount = 0;
+my $fileCount = 0;
+
+open(FILE, '>', "$tmp_root/slo_$fileCount") or die "Cannot write chunk $fileCount: $!\n";
+
+while (<IN>) {
+    if (++$lineCount >= $linesPerFile) {
+        $fileCount++;
+        $lineCount = 0;
+        close(FILE);
+        open(FILE, '>', "$tmp_root/slo_".$fileCount) or die "Cannot write chunk $fileCount: $!\n";
+    }
+    print(FILE $_);
+}
+close(FILE); close(IN);  # flush the last chunk before text2fst.py reads it
+print "OK\n";
+##################################################
+# Build one small automaton per chunk file:
+print "Building partial automata";
+
+# progress bar: 32 dots spread over $fileCount files
+my $filesPerDot = $fileCount/32;
+my $files=$filesPerDot;
+my $dots=0;
+
+for (my $i=0; $i<=$fileCount; $i++) {
+
+    if ($files >= $filesPerDot) {
+        $files = 0;
+        print ".";
+        $dots++;
+    }
+    $files++;
+    system("python text2fst.py < $tmp_root/slo_$i > $tmp_root/slownik_$i.fst") == 0 or die "text2fst.py failed on slo_$i\n";
+    #`fstcompile --acceptor $tmp_root/slownik_$i.fst $tmp_root/slownikC_$i.fst`;
+    `fstcompile --acceptor --isymbols=$tmp_root/symbols $tmp_root/slownik_$i.fst $tmp_root/slownikC_$i.fst`;
+    move("$tmp_root/slownikC_$i.fst", "$tmp_root/slownik_$i.bin") or die "Cant create slownik_$i.bin\n";
+}
+if ($dots < 32) {
+    for (my $i=0; $i<32 - $dots; $i++) {
+        print ".";
+    }
+}
+
+print "OK\n";
+
+##################################################
+# Remove the chunk text files:
+print "Deleting $tmp_root/slo_ text files\t\t";
+unlink <$tmp_root/slo_*> or die "Failed\n";
+print "OK\n";
+
+
+##################################################
+# Build the final automaton:
+print "Building final automaton";
+
+# progress bar: $ndots dots
+my $ndots=33;
+$filesPerDot = $fileCount/$ndots;
+$files=$filesPerDot;
+$dots=0;
+
+
+my $out_fst = "slownik.bin";
+my $tmp_fst = "slownik_T.bin";
+
+
+######################################################################
+# Merge the partial automata into one; a failed fst* call leaves no
+######################################################################
+move("$tmp_root/slownik_0.bin", "$tmp_root/$out_fst") or die "Failed to move slownik_0.bin -> $out_fst\n";
+for (my $i=1; $i<=$fileCount; $i++) {
+
+    if ($files >= $filesPerDot) {
+        $files = 0;
+        print ".";
+        $dots++;
+    }
+    $files++;
+    `fstunion $tmp_root/$out_fst $tmp_root/slownik_$i.bin $tmp_root/$tmp_fst`;
+    move("$tmp_root/$tmp_fst", "$tmp_root/$out_fst") or die "Failed at union: slownik_$i\n";
+    `fstrmepsilon $tmp_root/$out_fst $tmp_root/$tmp_fst`;
+    move("$tmp_root/$tmp_fst", "$tmp_root/$out_fst") or die "Failed at rmepsilon: slownik_$i\n";
+    `fstdeterminize $tmp_root/$out_fst $tmp_root/$tmp_fst`;
+    move("$tmp_root/$tmp_fst", "$tmp_root/$out_fst") or die "Failed at determinization: slownik_$i\n";
+    `fstminimize $tmp_root/$out_fst $tmp_root/$tmp_fst`;
+
+
+    move("$tmp_root/$tmp_fst", "$tmp_root/$out_fst") || die "Unable to move $tmp_root/$tmp_fst -> $out_fst!\n";
+
+}
+
+if ($dots < $ndots) {
+    for (my $i=0; $i<$ndots - $dots; $i++) {
+        print ".";
+    }
+}
+
+print "OK\n";
+
+
+
+######################################################################
+# Minimize the automaton:
+######################################################################
+print "removing epsilon-transitions\t\t";
+`fstrmepsilon $tmp_root/$out_fst $tmp_root/$tmp_fst`;
+move("$tmp_root/$tmp_fst", "$tmp_root/$out_fst") or die "Failed\n";
+print
"OK\n"; + +print "determinizing automaton\t\t"; +`fstdeterminize $tmp_root/$out_fst $tmp_root/$tmp_fst`; +move("$tmp_root/$tmp_fst", "$tmp_root/$out_fst") or die "Failed\n"; +print "OK\n"; + +print "minimizing automaton\t\t"; +`fstminimize $tmp_root/$out_fst $tmp_root/$tmp_fst`; +move("$tmp_root/$tmp_fst", "$tmp_root/$out_fst") or die "Failed\n"; +print "OK\n"; + + + + +print "moving the FST to compiledic directory\t\t"; +use Cwd; +my $workdir = getcwd($0); +move("$tmp_root/$out_fst", "$workdir/dictionary.bin") or die "Failed\n"; +print "OK\n"; + +######################################################## +# Sprzatanie: +print "removing temporary files\t\t"; + +unlink <$tmp_root/*> or die "Failed\nCan't delete contents of $tmp_root \n"; +unlink ($tmp_root); +print "OK\n"; + +print "Finished!\n"; diff --git a/src/compiledic/removeBom.sh b/src/compiledic/removeBom.sh new file mode 100755 index 0000000..1364f98 --- /dev/null +++ b/src/compiledic/removeBom.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +# Remove bom from file. +awk '{if(NR==1)sub(/^\xef\xbb\xbf/,"");print}' $1 diff --git a/src/compiledic/symbols.py b/src/compiledic/symbols.py new file mode 100755 index 0000000..605cc68 --- /dev/null +++ b/src/compiledic/symbols.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import locale +encoding = locale.getdefaultlocale()[1] + + +sys.stdout.write(u"\t0\n".encode(encoding)) +for i in range(33,60000): + line = u"%s\t%s\n"%(unichr(i), i) + sys.stdout.write(line.encode(encoding)) diff --git a/src/compiledic/text2fst.py b/src/compiledic/text2fst.py new file mode 100755 index 0000000..d95be94 --- /dev/null +++ b/src/compiledic/text2fst.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""----------------------------------------------------------------------------- +# Name: text2FST +# Purpose: A tool for translating a dictionary file into a OpenFST format. 
+# +# Author: Krzysztof Szarzynski +# +# Created: 19/11/2012 +# Copyright: (c) UAM Text Tools 2012 +# Licence: Simplified BSD License +# Usage: +# cat dictionary.dic | ./text2fst > output.fst +# Warning: the dictionary.dic file must be UTF8 _without_ BOM +# +# TODO: Checking the BOM and removing it from the dictionary.file +-----------------------------------------------------------------------------""" + +import sys +import locale + +encoding = locale.getdefaultlocale()[1] + +def prn(str): + sys.stdout.write(str.encode(encoding)) + +begState = 0 +endState = 1 +eps = u"" +currentState = begState +for line in sys.stdin: + prn(u"%s %s %s\n"%(begState, currentState, eps)) + line = line.decode('UTF-8') + for letter in line: + prn(u"%s %s %s\n"%(currentState, currentState+1, letter)) + currentState+=1 + prn(u"%s %s %s\n"%(currentState, endState, eps)) +print endState +
diff --git a/src/cor/Makefile b/src/cor/Makefile
new file mode 100644
index 0000000..c9b22d1
--- /dev/null
+++ b/src/cor/Makefile
@@ -0,0 +1,65 @@
+include ../../config.mak
+
+ifeq ($(BUILD_STATIC), yes)
+  LDFLAGS += -static
+endif
+
+CXXFLAGS += -O2 -fpermissive
+
+LIB_PATH=../lib
+COMMON_PATH=../common
+CMDLINE_FILE='"../cor/cmdline.h"'
+
+.PHONY: all
+all: cor
+
+cor: main.cc corr.o $(LIB_PATH)/word.o \
+     $(LIB_PATH)/auttools.o cmdline.c common_cor.o common.o
+	$(CXX) $(CXXFLAGS) -D _CMDLINE_FILE=$(CMDLINE_FILE) main.cc corr.o common.o \
+	$(LIB_PATH)/word.o $(LIB_PATH)/auttools.o cmdline.c common_cor.o \
+	-o $@ $(LDFLAGS)
+
+corr.o: corr.cc corr.hh
+	$(CXX) $(CXXFLAGS) -c corr.cc
+
+# common.cc is compiled with _CMDLINE_FILE naming the generated cmdline.h, so it
+# presumably includes it (TODO confirm); list it so -j builds generate it first.
+common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \
+          $(COMMON_PATH)/common.h cmdline.h
+	$(CXX) $(CXXFLAGS) -c -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc
+
+common_cor.o: cmdline.h common_cor.cc common_cor.h
+	$(CXX) $(CXXFLAGS) -c -D _CMDLINE_FILE=$(CMDLINE_FILE) common_cor.cc
+
+# One gengetopt run writes both cmdline.c and cmdline.h. Only cmdline.c owns the
+# recipe, so parallel builds cannot run the generator twice at the same time.
+cmdline.c: cmdline.ggo
+	$(GENGETOPT) -i cmdline.ggo --conf-parser
+
+cmdline.h: cmdline.c
+	@test -f $@ || { $(RM) $<; $(MAKE) $<; }
+
+cmdline.ggo: cmdline_cor.ggo $(COMMON_PATH)/cmdline_common.ggo
+	cat $^ > $@
+
+.PHONY: install
+install:
+ifdef BIN_DIR
+	install -m 0755 cor $(BIN_DIR)
+endif
+
+.PHONY: uninstall
+uninstall:
+ifdef BIN_DIR
+	$(RM) $(BIN_DIR)/cor
+endif
+
+.PHONY: clean
+clean: clean.cmdline
+	$(RM) *.o
+	$(RM) cor
+
+.PHONY: clean.cmdline
+clean.cmdline:
+	$(RM) cmdline.*
+
diff --git a/src/cor/cmdline_cor.ggo b/src/cor/cmdline_cor.ggo new file mode 100644 index 0000000..810d511 --- /dev/null +++ b/src/cor/cmdline_cor.ggo @@ -0,0 +1,8 @@ +package "cor" +version "0.1" + +option "dictionary-home" - "Dictionary home dir." string typestr="FILENAME" no hidden +option "dictionary" d "Dictionary" string typestr="FILENAME" default="cor.bin" no +option "distance" n "Maximal edit distance." int default="1" no +option "replace" r "Replace original form with corrected form, place original form in the cor field. This option has no effect in single mode" flag off hidden +#option "single" - "Place all alternatives in the same line" flag off diff --git a/src/cor/common_cor.cc b/src/cor/common_cor.cc new file mode 100644 index 0000000..fc20905 --- /dev/null +++ b/src/cor/common_cor.cc @@ -0,0 +1,29 @@ +#include +#include +#include "common_cor.h" + +char dictionary[256]; + +void process_cor_options(gengetopt_args_info* args) +{ + if(args->dictionary_given) + { + expand_path(args->dictionary_arg,dictionary); + if(file_accessible(dictionary)!=0) + { + fprintf(stderr,"Cannot open the dictionary file: %s\nAborting.\n",dictionary); + exit(1); + } + } + else if (args->dictionary_home_given && args->language_given) + { + char buf[255]; + expand_path(args->dictionary_home_arg, buf); + sprintf(dictionary,"%s/%s/cor.bin",buf,args->language_arg); + if(file_accessible(dictionary)!=0) + { + fprintf(stderr,"Cannot open the dictionary file: %s\nAborting.\n",dictionary); + exit(1); + } + } +} diff --git a/src/cor/common_cor.h b/src/cor/common_cor.h new file mode 100644 index 0000000..83b2948 --- /dev/null +++
b/src/cor/common_cor.h @@ -0,0 +1,19 @@ +#ifndef __COMMON_COR_H +#define __COMMON_COR_H + +#include + +//do wyrzucenia - definicja w Makefile! #define _CMDLINE_FILE "../cor/cmdline.h" +#include "../common/common.h" + +#include "cmdline.h" + +#define DICT_FILE "cor.bin" + +extern int change_count; + +extern void process_cor_options(gengetopt_args_info* args); + +extern char dictionary[]; + +#endif diff --git a/src/cor/corr.cc b/src/cor/corr.cc new file mode 100644 index 0000000..1e0d83c --- /dev/null +++ b/src/cor/corr.cc @@ -0,0 +1,142 @@ +//--------------------------------------------------------------------------- + +#include "corr.hh" + +#define MAXPATH 256 + +#define min(x,y) ((xy)?(x):(y)) + + +int Corr::ed(int i,int j) +{ + if(i==-1) + return j+1; + if(j==-1) + return i+1; + if(i==-2 || j==-2) + return n+1; + + if(X[i]==Y[j]) + return H2[i-1][j-1]; + if(X[i-1]==Y[j] && X[i]==Y[j-1]) + return 1+min(H2[i-2][j-2],min(H2[i][j-1],H2[i-1][j])); + return 1+min(H2[i-1][j-1],min(H2[i][j-1],H2[i-1][j])); + +/* + if(X[i]==Y[j]) + return H[(i-1)+2][(j-1)+2]; + if(X[i-1]==Y[j] && X[i]==Y[j-1]) + return 1+min(H[(i-2)+2][(j-2)+2],min(H[(i)+2][(j-1)+2],H[(i-1)+2][(j)+2])); + return 1+min(H[(i-1)+2][(j-1)+2],min(H[(i)+2][(j-1)+2],H[(i-1)+2][(j)+2])); +*/ +} + +int Corr::cuted(int j) +{ + int l=max(0,j-t); + int u=min(m,j+t); + int ce=j+t; + for(int k=l;k<=u;k++) + { + if(H2[k][j]0) + j--; + else + more=0; + while(more && !continued(path[j])); + state=path[j]+1; + } + return count; +} + + +//--------------------------------------------------------------------------- + diff --git a/src/cor/corr.hh b/src/cor/corr.hh new file mode 100644 index 0000000..5c7438e --- /dev/null +++ b/src/cor/corr.hh @@ -0,0 +1,34 @@ +//--------------------------------------------------------------------------- +#ifndef _corr_hh +#define _corr_hh +//--------------------------------------------------------------------------- + +#include "../lib/tfti.h" +#include "../lib/word.h" + +class Corr : public 
TFTiv +{ +private: + int H[100][100]; + char X[100]; // misspelled string + char Y[100]; // (possibly partial) candidate string + int m; // length of X + int n; // maximal length of Y + + int ed(int,int); + int cuted(int); + void recomputeH(int); + +public: + int (*H2)[100]; + + int t; // threshold + + Corr() : H2((int(*)[100])&H[2][2]) {}; + Corr(const char* a) : TFTiv(a), H2((int(*)[100])&H[2][2]) { }; + + int correct(const char* w, Words& tab); +}; + +//--------------------------------------------------------------------------- +#endif diff --git a/src/cor/main.cc b/src/cor/main.cc new file mode 100644 index 0000000..a109c86 --- /dev/null +++ b/src/cor/main.cc @@ -0,0 +1,155 @@ +#include +#include +#include "../lib/iotools.h" +//do wyrzucenia - definicja w Makefile! #define _CMDLINE_FILE "../cor/cmdline.h" +#include "../common/common.h" +#include "common_cor.h" +#include "corr.hh" +#include "cmdline.h" +#include + + +int main(int argc, char** argv) { + +// setlocale(LC_CTYPE,""); +// setlocale(LC_COLLATE,""); + + gengetopt_args_info args; + + if(cmdline_parser(argc, argv, &args) != 0) + exit(1); + + process_config_files(&args,argv[0]); + process_common_options(&args,argv[0]); + process_cor_options(&args); + + Corr cor; + + cor.load(dictionary); + cor.t=args.distance_arg; + + char line[MAX_LINE+1]; + long line_count = 0; + + Segment seg; + Words tab; + char form1[MAX_LINE]; + char* form; + int formcasing; + char corfield[MAX_LINE]=""; + + while (fgets(line, MAX_LINE, inputf)) + { +// strcpy(outline,line); + ++line_count; + +// if(!seg.parse(line)) +// { +// fprintf(stderr,"Input error in line %d.\n",line_count); +// exit(1); +// } + + char outline[128]; + //printf("Starting cor... 
searching for %d fields\n", args.input_field_given); + //for (int i=0; i $@.$$$$; \ + sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \ + rm -f $@.$$$$ + +# stare: +# cmdline.cc cmdline.h : cmdline.ggo +# gengetopt --c-extension=cc -i cmdline.ggo +# nowe +cmdline.cc cmdline.h: cmdline.ggo + $(GENGETOPT) -i cmdline.ggo --c-extension=cc --conf-parser + +cmdline.ggo: cmdline_dgp.ggo ../common/cmdline_common.ggo + cat cmdline_dgp.ggo ../common/cmdline_common.ggo > cmdline.ggo +# endnowe + + +clean: + rm ${bin} ${objs} cmdline.cc cmdline.h + rm -rf *.d + +prof: dgp + gprof dgp ~/tmp/dgp-pl/gmon.out > dgp.prof + +.PHONY: install +install: +ifdef BIN_DIR + install -m 0755 dgp $(BIN_DIR) + install -m 0755 dgc $(BIN_DIR) + install -m 0755 canonize $(BIN_DIR) + install -m 0755 tre $(BIN_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm $(BIN_DIR)/dgp + rm $(BIN_DIR)/dgc + rm $(BIN_DIR)/canonize + rm $(BIN_DIR)/tre +endif diff --git a/src/dgp/Makefile.user b/src/dgp/Makefile.user new file mode 100644 index 0000000..64c00b7 --- /dev/null +++ b/src/dgp/Makefile.user @@ -0,0 +1,3 @@ + +gram.dgp: gram.dgc + dgc -c cats.dgc < gram.dgc > gram.dgp diff --git a/src/dgp/canonize b/src/dgp/canonize new file mode 100755 index 0000000..f9bd7b3 --- /dev/null +++ b/src/dgp/canonize @@ -0,0 +1,50 @@ +#!/usr/bin/perl + +#package: UAM TExt Tools +#component: canonize +#version: 1.0 +#author: Tomasz Obrebski + +use lib "/usr/local/lib/utt"; +use lib "$ENV{'HOME'}/.local/lib/utt"; + +use strict; +use Getopt::Long; +use attr; + + +my $help; + +GetOptions("help|h" => \$help); + +if($help) +{ + print <<'END' + +Transforms syntactic categories to their canonical form. + +Usage: canonize + +Options: + --help -h Help. + +END +; + exit 0; +} + +#$|=1; + +my %tra; + +while(<>) +{ + s/$attr::pos_re\/$attr::avlist_re/trans($&)/ge; + print; +} + +sub trans +{ + my $cat=shift; + exists($tra{$cat}) ? 
$tra{$cat} : ( $tra{$cat} = attr::canonize $cat ); +} diff --git a/src/dgp/cmdline.ggo b/src/dgp/cmdline.ggo new file mode 100644 index 0000000..ce3890b --- /dev/null +++ b/src/dgp/cmdline.ggo @@ -0,0 +1,52 @@ +package "dgp" +version "0.1" + +option "grammar" g "Grammar file" + string no typestr="filename" + +option "long" l "Long output" + flag off + +option "debug" d "Debug mode." + flag off + +option "info" - "Print info. +h - heads d - dependents +s - sets +c - constraints n - node/arc counts t - parse time +" +string no default="h" +#section "Common UTT options" + + +option "input" f "Input file" string no + +option "output" o "Output file for succesfully processed segments" string no + +option "fail" e "Output file for unsuccesfully processed segments " string no + +option "only-fail" - "Print only segments the program failed to process" flag off hidden + +option "no-fail" - "Print only segments the program processed" flag off hidden + +option "copy" c "Copy succesfully processed segments to standard output" flag off + +option "process" p "Process segments with this tag" string no multiple + +option "select" s "Select only segments with this field" string no multiple + +option "ignore" S "Select only segments without this field" string no multiple + +option "output-field" O "Output field name" string no + +option "input-field" I "Input field name" string no multiple + +option "interactive" i "Toggle interactive mode" flag off + +option "config" - "Configuration file" string typestr="FILENAME" no + +option "one-field" 1 "Print all results in one segments (creates ambiguous annotation)" flag off + +option "one-line" - "Print annotation alternatives as additional fields" flag off + +option "language" - "Language." 
string no diff --git a/src/dgp/cmdline_dgp.ggo b/src/dgp/cmdline_dgp.ggo new file mode 100644 index 0000000..c59786d --- /dev/null +++ b/src/dgp/cmdline_dgp.ggo @@ -0,0 +1,18 @@ +package "dgp" +version "0.1" + +option "grammar" g "Grammar file" + string no typestr="filename" + +option "long" l "Long output" + flag off + +option "debug" d "Debug mode." + flag off + +option "info" - "Print info. +h - heads d - dependents +s - sets +c - constraints n - node/arc counts t - parse time +" +string no default="h" diff --git a/src/dgp/const.hh b/src/dgp/const.hh new file mode 100644 index 0000000..7077f81 --- /dev/null +++ b/src/dgp/const.hh @@ -0,0 +1,13 @@ +#ifndef CONST_HH +#define CONST_HH + +#define MAXTYPES 32 +#define MAXFLAGS 64 +#define MAXNODES 1024 +#define MAXCONSTRS 32 +#define MAXLINE 256 +#define MAXFORMLEN 64 +#define MAXDESCRLEN 80 +#define FIELDSEP " \n\t" + +#endif diff --git a/src/dgp/dgc b/src/dgp/dgc new file mode 100755 index 0000000..8fd9d40 --- /dev/null +++ b/src/dgp/dgc @@ -0,0 +1,292 @@ +#!/usr/bin/perl + +#package: UAM Text Tools +#component: dgc (dg compiler) +#version: 1.0 +#author: Tomasz Obrebski + +# wymaga niejawnie programu canonize!!!! 
+use lib "/usr/local/lib/utt";
+use lib "$ENV{'HOME'}/.local/lib/utt";
+
+use strict;
+use Getopt::Long;
+use Data::Dumper;
+use attr;
+use File::HomeDir;
+
+my $systemconfigfile='/usr/local/etc/utt/dgc.conf';
+my $userconfigfile=home()."/.utt/dgc.conf";
+
+Getopt::Long::Configure('no_ignore_case_always');
+
+my $help=0;
+my $catfile=0;
+my $dicfile=0;
+my $gramfile=0;
+my $outputfile=0;
+
+#read configuration files###########################
+my $file;
+foreach $file ($systemconfigfile, $userconfigfile){
+    if(open(CONFIG, '<', $file)){   # a missing config file is simply skipped
+        while (<CONFIG>) {
+            chomp;
+            s/#.*//;                # strip comments
+            s/^\s+//;
+            s/\s+$//;
+            next unless length;
+            my ($name, $value) = split(/\s*=\s*/, $_, 2);
+            if(($name eq "categories")or($name eq "c")){
+                $catfile=$value;
+            }
+            elsif(($name eq "dictionary")or($name eq "d")){
+                $dicfile=$value;
+            }
+            elsif(($name eq "grammar")or($name eq "g")){
+                $gramfile=$value;
+            }
+            elsif(($name eq "outputfile")or($name eq "o")){
+                $outputfile=$value;
+            }
+            elsif(($name eq "help")or($name eq "h")){
+                $help=1;
+            }
+
+        }
+        close CONFIG;
+    }
+}
+#########################################################
+
+GetOptions("help|h" => \$help,
+           "categories|c=s" => \$catfile,
+           "dictionary|d=s" => \$dicfile,
+           "grammar|g=s" => \$gramfile,
+           "outputfile|o=s" => \$outputfile);
+
+my $homedir = $ENV{'HOME'};
+$catfile =~ s{^~(?=/|\z)}{$homedir};     # expand a leading "~" only, never one inside the path
+$dicfile =~ s{^~(?=/|\z)}{$homedir};
+$gramfile =~ s{^~(?=/|\z)}{$homedir};
+$outputfile =~ s{^~(?=/|\z)}{$homedir};
+
+
+if($help)
+{
+    print <<'END'
+Usage: dgc [OPTIONS]
+
+Options:
+ --categories -c filename List of syntactic categories.
+ --dictionary -d filename Dictionary.
+ --grammar -g filename List of grammar rules.
+ --outputfile -o filename Output file name.
+ --help -h Help.
+END
+;
+    exit 0;
+}
+
+die("At least one of --categories and --dictionary must be given.\n") if !$catfile && !$dicfile;
+
+my $ncat=0;
+my $nrole=0;
+my $nsgl=0;
+my $nleft=0;
+my $nright=0;
+my $nreq=0;
+my $nlink=0;
+my $nflag=0;
+
+my %cats;
+my %roles;
+my %agr;
+my %gov;
+
+if(!$outputfile) {
+    *OUTPUT = *STDOUT;
+}
+elsif($outputfile eq "-") {
+    *OUTPUT = *STDOUT;
+}
+else {
+    open(OUTPUT, '>', $outputfile) or die("Can't open output file: $outputfile!");
+}
+
+
+loadcats($catfile) if $catfile;
+extractcats($dicfile) if $dicfile;
+
+
+my $cats_re = qr/(?:$attr::cat_re\s*(?:,\s*$attr::cat_re)*)/;
+
+# class parse_class:
+# /$attr::cat_re/g;
+
+
+if(!$gramfile) {
+    *INPUT = *STDIN;
+}
+elsif($gramfile eq "-"){
+    *INPUT = *STDIN;
+}
+else {
+    open(INPUT, '<', $gramfile) or die("Unable to open: $gramfile!");
+}
+
+while(<INPUT>)
+{
+    s/#.*//;
+    s/^\s+//;
+    s/\s+$//;
+    if(/^AGR\s+(\S+)\s+(\S+)$/)
+    {
+        push @{$agr{$1}}, $2;
+    }
+    elsif(/^GOV\s+(\S+)\s+(\S+)$/)
+    {
+        push @{$gov{$1}}, attr::parse($2);
+    }
+    elsif(/^ROLE\s+\S+$/)
+    {
+        $roles{$_}=1;
+        print OUTPUT "$_\n";
+    }
+    elsif(/^SGL\s+\S+$/)
+    {
+        ++$nsgl;
+        print OUTPUT "$_\n";
+    }
+    elsif(/^REQ\s+(\S+)\s+(\S+)$/)
+    {
+        print OUTPUT "#$_\n";
+        my $cat = attr::parse $1;
+        for my $atomcat (keys %cats)
+        {
+            if(attr::match @$cat, @{$cats{$atomcat}})
+            {
+                print OUTPUT "REQ ".$atomcat."
$2\n"; + ++$nreq; + } + } + } + elsif(/^LEFT\s+\S+$/) + { + ++$nleft; + print OUTPUT "$_\n"; + } + elsif(/^RIGHT\s+\S+$/) + { + ++$nright; + print OUTPUT "$_\n"; + } + elsif(my ($hs,$ds,$r) = /^LINK\s+($cats_re)\s+($cats_re)\s+(\S+)$/) + { + print OUTPUT "#$_\n"; + for my $h ($hs =~ /$attr::cat_re/g) + { + for my $d ($ds =~ /$attr::cat_re/g) + { + addlinks($h,$d,$r); + } + } + } + elsif(/^FLAG\s+\S+$/) + { + ++$nflag; + print OUTPUT "$_\n" + } + elsif(/^$/) { + # pomijamy puste linie oraz komentarze + } + else + { + print STDERR "Illegal format: $_\n"; + } +} + + +sub addlinks +{ + my ($h,$d,$r) = @_; + + for my $a (@{$agr{$r}}) { print OUTPUT "#AGR $r $a\n"; } + for my $c (@{$gov{$r}}) { print OUTPUT "#GOV $r ".attr::unparse(@$c)."\n"; } + my $head = attr::parse $h; + my $dep = attr::parse $d; + + for my $atomhead (keys %cats) + { + if(attr::match @$head, @{$cats{$atomhead}}) + { + DEP: + for my $atomdep (keys %cats) + { + next DEP if ! attr::match @$dep, @{$cats{$atomdep}}; + + for my $a (@{$agr{$r}}) + { + next DEP if ! attr::agree(@{$cats{$atomhead}},@{$cats{$atomdep}},$a); + } + + for my $c (@{$gov{$r}}) + { + next DEP if ! attr::match(@$c,@{$cats{$atomdep}}); + } + + print OUTPUT "LINK "; + print OUTPUT $atomhead." "; + print OUTPUT $atomdep." 
$r\n"; + ++$nlink; + + } + } + } +} + + +printf STDERR "%6d CAT statements\n", 0+keys(%cats); +printf STDERR "%6d ROLE statements\n", 0+keys(%roles); +printf STDERR "%6d SGL statements\n", $nsgl; +printf STDERR "%6d REQ statements\n", $nreq; +printf STDERR "%6d LEFT statements\n", $nleft; +printf STDERR "%6d RIGHT statements\n", $nright; +printf STDERR "%6d LINK statements\n", $nlink; +printf STDERR "%6d FLAG statements\n", $nflag; + + +sub extractcats +{ + my $file = shift; + open DICFILE, "canonize $file |"; + while() + { + while(/,([^[:space:];]+)/g) + { + my $cat=$1; + next if !$cat || exists $cats{$cat}; + $ncat++; + print OUTPUT "CAT $1\n"; + $cats{$cat}=attr::parse($cat); + } + } + close DICFILE; +} + + +sub loadcats +{ + my $file = shift; + open CATFILE, "canonize $file |"; + while() + { + tr/ \t\n//d; + next if !$_ || exists $cats{$_}; + print OUTPUT "CAT $_\n"; + ++$ncat; + $cats{$_}=attr::parse($_); + } + close CATFILE; +} + diff --git a/src/dgp/dgp0.cc b/src/dgp/dgp0.cc new file mode 100644 index 0000000..24faeb7 --- /dev/null +++ b/src/dgp/dgp0.cc @@ -0,0 +1,217 @@ +#include "dgp0.hh" +#include "global.hh" + +extern Grammar grammar; +extern MGraph mgraph; +extern SGraph sgraph; + +SNode* snodes; + +extern bool debug; + +list nodelist; +list::iterator processed; + + +void set_initial_constraints(int node) +{ + snodes[node].prop.forbidden.reset(); + snodes[node].prop.required=grammar.obl[snodes[node].mnode->cat]; +} + + +bool changing_constraints(int head, Role role) +{ + return grammar.sgl[role] || snodes[head].prop.required[role]; +} + +void apply_constraints(int head, Role role) +{ + if(grammar.sgl[role]) snodes[head].prop.forbidden.set(role); + snodes[head].prop.required.reset(role); +} + +NodeProp compute_prop_left(NodeProp headprop, Role role) +{ + NodeProp ret=headprop; + if(grammar.sgl[role]) ret.forbidden.set(role); + ret.required.reset(role); + return ret; +} + +NodeProp compute_prop_right(NodeProp headprop, Role role) +{ + NodeProp 
ret=headprop; + + if(grammar.sgl[role]) ret.forbidden.set(role); + ret.required.reset(role); + return ret; +} + +int get_node(MNode& mnode, NodeProp p, bitset& newheadLH, bitset& newheadLV) +{ + for(vector::iterator ps=mnode.snodes.begin(); ps!=mnode.snodes.end(); ++ps) + if(snodes[*ps].prop==p && snodes[*ps].LH==newheadLH && snodes[*ps].LV==newheadLV) + return *ps; + return -1; +} + +void connect_left(list::iterator h, list::iterator d, Role r) +{ + NodeProp &oldheadprop = snodes[*h].prop; + NodeProp newheadprop; + bitset newheadLV; + bitset newheadLH; + bitset newheadLD; + + newheadprop=compute_prop_left(oldheadprop,r); + + int newheadind; + if(oldheadprop==newheadprop) + newheadind = *h; + else + { + newheadLH = snodes[*h].LH; + newheadLV = snodes[*d].LV; + newheadLD = snodes[*h].LD; + + newheadind = get_node(*(snodes[*h].mnode), newheadprop, newheadLH, newheadLV); + if( newheadind < 0 ) + { + newheadind = sgraph.clone(*h,newheadprop); + list::iterator nextit=h; ++nextit; + nodelist.insert(nextit,newheadind); + snodes[newheadind].LH=newheadLH; + snodes[newheadind].in_LH=true; + snodes[newheadind].LV.reset(); + snodes[newheadind].LD = newheadLD; + + if(debug) sgraph.print_node_debug(stderr," C ",newheadind); + } + else + snodes[newheadind].LD |= newheadLD; // TYLKO DLA LD + } + + snodes[newheadind].deps.push_back(Arc(*d,r,*h)); + + if(snodes[*d].saturated()) snodes[newheadind].LV |= snodes[*d].LV; + snodes[newheadind].LD.set(*d); + if(snodes[*d].saturated()) snodes[newheadind].LD |= snodes[*d].LD; + + if(debug) + sgraph.print_arc(stderr,*d,newheadind,r,0), sgraph.print_node_debug(stderr," U ",newheadind); +} + + +void connect_right(list::iterator h, list::iterator d, Role r) +{ + NodeProp &oldheadprop = snodes[*h].prop; + NodeProp newheadprop; + bitset newheadLV; + bitset newheadLH; + bitset newheadLD; + int newheadind; + + newheadprop = compute_prop_right(oldheadprop,r); + if(oldheadprop==newheadprop) + newheadind = *h; + else + { + newheadLH = snodes[*h].LH; + 
newheadLV = snodes[*h].LV; + newheadLD = snodes[*h].LD; + + newheadind = get_node(*(snodes[*h].mnode), newheadprop, newheadLH, newheadLV); + if( newheadind < 0 ) + { + newheadind = sgraph.clone(*h,newheadprop); + snodes[newheadind].LH=newheadLH; + snodes[newheadind].in_LH=false; + snodes[newheadind].LV=newheadLV; + snodes[newheadind].LD=newheadLD; + list::iterator nextit=h; ++nextit; + nodelist.insert(nextit,newheadind); + + if(debug) sgraph.print_node_debug(stderr," C ",newheadind); + } + else + snodes[newheadind].LD |= newheadLD; // TYLKO DLA LD + } + + snodes[*d].heads.push_back(Arc(newheadind,r,*h)); + + snodes[*d].LH.set(newheadind); + + if(snodes[newheadind].saturated()) snodes[*d].LH |= snodes[newheadind].LH; + + if(debug) + sgraph.print_arc(stderr,newheadind,*d,r,1), sgraph.print_node_debug(stderr," U ",*d); + +} + + +void try_connect_dependents(list::iterator j) +{ + for(list::iterator i(j); i!=nodelist.begin(); --i) + if(sgraph.visible(*i,*j) && sgraph.saturated(*i)) + { + Roles& ji_roles = grammar.connect[snodes[*j].mnode->cat][snodes[*i].mnode->cat]; + for(RolesIter r=ji_roles.begin(); r!=ji_roles.end();++r) + if(grammar.check_constr(snodes[*j].prop,snodes[*i].prop,0,*r)) + connect_left(j,i,*r); + } +} + + +void try_connect_heads(list::iterator j) +{ + for(list::iterator i(j); i!=nodelist.begin(); --i) + if(sgraph.visible(*i,*j)) + { + Roles& ij_roles = grammar.connect[snodes[*i].mnode->cat][snodes[*j].mnode->cat]; + for(RolesIter r=ij_roles.begin(); r!=ij_roles.end();++r) + if(grammar.check_constr(snodes[*i].prop,snodes[*j].prop,1,*r)) + connect_right(i,j,*r); + } +} + + +void reverse_links() +{ + list::iterator i = nodelist.begin(); + for(++i; i!=nodelist.end(); ++i) + { + for(vector::iterator da=sgraph.nodes[*i].deps.begin()--; da!=sgraph.nodes[*i].deps.end(); ++da) + sgraph.nodes[da->dst].heads.push_back(Arc(*i,da->role,da->anc)); + for(vector::iterator ha=sgraph.nodes[*i].heads.begin(); ha!=sgraph.nodes[*i].heads.end(); ++ha) + 
sgraph.nodes[ha->dst].deps.push_back(Arc(*i,ha->role,ha->anc)); + } +} + + +void dgp0() +{ + snodes=sgraph.nodes; + + nodelist.clear(); + nodelist.push_back(0); // BOS + processed=nodelist.begin(); + + for(int m=0; m::iterator cursor=processed; + while(++cursor != nodelist.end()) + { + try_connect_dependents(cursor); + try_connect_heads(cursor); + processed=cursor; + } + } + reverse_links(); +} diff --git a/src/dgp/dgp0.hh b/src/dgp/dgp0.hh new file mode 100644 index 0000000..1a135da --- /dev/null +++ b/src/dgp/dgp0.hh @@ -0,0 +1,12 @@ +#ifndef _DGP0_HH +#define _DGP0_HH + +#include "grammar.hh" +#include "sgraph.hh" +#include "mgraph.hh" + +// API + +void dgp0(); + +#endif diff --git a/src/dgp/global.cc b/src/dgp/global.cc new file mode 100644 index 0000000..80a32f5 --- /dev/null +++ b/src/dgp/global.cc @@ -0,0 +1,5 @@ + +#include "global.hh" + +bool debug = false; + diff --git a/src/dgp/global.hh b/src/dgp/global.hh new file mode 100644 index 0000000..4af6605 --- /dev/null +++ b/src/dgp/global.hh @@ -0,0 +1 @@ +extern bool debug; diff --git a/src/dgp/go b/src/dgp/go new file mode 100755 index 0000000..f9e4329 --- /dev/null +++ b/src/dgp/go @@ -0,0 +1,13 @@ +if test -f Makefile.go; +then + make -f Makefile.go gram.dgp; + tok |\ + lem -p W |\ + canonize |\ + sen |\ + gph -p W -p BOS -p EOS -r BOS |\ + dgp -i ds -p W -p BOS -p EOS -g gram.dgp +else + echo "Invalid configuration! Run utt_make_config.pl first." +fi + diff --git a/src/dgp/grammar.cc b/src/dgp/grammar.cc new file mode 100644 index 0000000..63e2882 --- /dev/null +++ b/src/dgp/grammar.cc @@ -0,0 +1,181 @@ + +#include + +#include "grammar.hh" + +bool (*constraint[MAXCONSTRS])(int head, int dep); + + +int chk_type(const char* s, int lineno) // SIDE EFECTS! +{ + if(Role::index(s)>0) return 1; + + fprintf(stderr,"%8d: Invalid type '%s'. Line ignored.\n",lineno,s); + return 0; +} + +int chk_cat(const char* s, int lineno) +{ + if(Cat::index(s)>0) return 1; + + fprintf(stderr,"%8d: Invalid category '%s'. 
Line ignored.\n",lineno,s); + return 0; +} + +void Grammar::add_category(const char* s) +{ + Cat::add(s); + if(Cat::count()>cats_sz) + { + cats_sz += 16; + connect.resize(cats_sz); + for(int i=0; itypes_sz) + { + types_sz += 16; + lt.resize(types_sz); + gt.resize(types_sz); + } +} + +void Grammar::add_flag(const char* s) +{ + Flag::add(s); + if(Flag::count()>flags_sz) + { + flags_sz += 16; + pass.resize(flags_sz); + } +} + + +void Grammar::set_lt(Role s, Role t) +{ + lt[s].set(t); + gt[t].set(s); + if(s==0||(int)t==0) + return; + else + { + for(int i=0; i=2) + { + add_category(arg1); + } + else if(strcmp(key,"ROLE")==0 && fields>=2) + { + add_type(arg1); + } + else if(strcmp(key,"SGL")==0 && fields>=2) + { + if(chk_type(arg1,lineno)) + set_sgl(arg1); + } + else if(strcmp(key,"LEFT")==0 && fields>=2) + { + if(chk_type(arg1,lineno)) + set_left(arg1); + } + else if(strcmp(key,"RIGHT")==0 && fields>=2) + { + if(chk_type(arg1,lineno)) + set_right(arg1); + } + else if(strcmp(key,"REQ")==0 && fields>=3) + { + if(chk_cat(arg1,lineno) + chk_type(arg2,lineno) == 2) + set_obl(arg1,arg2); + } + else if(strcmp(key,"LINK")==0 && fields>=4) + { + if(chk_cat(arg1,lineno) + chk_cat(arg2,lineno) + chk_type(arg3,lineno) == 3) + set_connect(arg1,arg2,arg3); + } + // FLAG DECLARATION + else if(strcmp(key,"FLAG")==0 && fields>=2) + { + add_flag(arg1); + } + + else fprintf(stderr,"Invalid line %d. 
Ignored.\n", lineno); + } + +// compute_gt(); + + return true; + +} + +void Grammar::write(FILE* f) +{ + for(Cat i=1; i +#include +#include +#include + +#include "const.hh" +#include "thesymbols.hh" +#include "sgraph.hh" + + +class Link +{ + Role role; + FlagSet hflags; + FlagSet dflags; +}; + + +class Grammar +{ + + public: + + // enum CONSTR { SGL, OBL, LEFT, RIGHT, INIT, NONINIT, FIN, NONFIN }; + + Grammar() : types_sz(0), cats_sz(0), flags_sz(0) {} ; + + int types_sz; + int cats_sz; + int flags_sz; + + vector< vector< Roles > > connect; + RoleSet sgl; + vector< RoleSet > obl; + RoleSet left; + RoleSet right; + vector< RoleSet > lt; + vector< RoleSet > gt; + + + // vector< vector< vector< + vector< FlagSet > set; + vector< FlagSet > pass; + + bool read(FILE* f); + void write(FILE* f); + + void add_category(const char* s); + void add_type(const char* s); + void add_flag(const char* s); + + void set_sgl(Role r) { sgl.set(r); } + void set_obl(Cat c, Role r) { obl[c].set(r); } + void set_left(Role r) { left.set(r); } + void set_right(Role r) { right.set(r); } + void set_order(Role r, Role s) { lt[s].set(r); } + void set_connect(Cat c, Cat d, Role r) { connect[c][d].insert(r); } + void set_lt(Role r, Role s); + void compute_gt(); + + + bool check_constr(NodeProp& hprop, NodeProp& dprop, int dir, Role role); + +}; + +inline bool Grammar::check_constr(NodeProp& hprop, NodeProp& dprop, int dir, Role role) +{ + return + !hprop.forbidden[role] && + ( !right[role] || dir==1 ) && + ( !left[role] || dir==0 ) + ; +} + + +#endif diff --git a/src/dgp/main.cc b/src/dgp/main.cc new file mode 100644 index 0000000..b29171e --- /dev/null +++ b/src/dgp/main.cc @@ -0,0 +1,121 @@ + +/** + * Package: UAM Text Tools + * Component: dgp (dg parser) + * Version: 1.0 + * Author: Tomasz Obrebski + */ + +#include "global.hh" +#include "mgraph.hh" +#include "sgraph.hh" +#include "grammar.hh" +#include "dgp0.hh" +#include "../common/common.h" +#include "cmdline.h" + +#define MAXSEGMENTS 500 + 
+char segment[MAXSEGMENTS][MAXLINE]; +int segcount=0; +char seg_mnode[MAXSEGMENTS]; +char grammarfile[255]; + + +Grammar grammar; +MGraph mgraph; +SGraph sgraph; + +FILE* grammarf; +FILE* debugf=stdout; +unsigned int info=0U; + +void output(); + +main(int argc, char* argv[]) +{ + gengetopt_args_info args; + + if(cmdline_parser(argc,argv,&args) != 0) + exit(1); + + process_config_files(&args,argv[0]); + process_common_options(&args,argv[0]); + + if(!args.grammar_given) + fprintf(stderr,"dgp: no grammar given\n"); + + expand_path(args.grammar_arg,grammarfile); + + if(!(grammarf=fopen(grammarfile,"r"))) + fprintf(stderr,"dgp: grammar file not found: %s.\n", grammarfile), exit(1); + + if(args.debug_given) debug=true; + + for(char* c=args.info_arg; *c!='\0' ; ++c) + switch(*c) + { + case 'h': info|=SGraph::HEADS; break; + case 'd': info|=SGraph::DEPS; break; + case 's': info|=SGraph::SETS; break; + case 'c': info|=SGraph::CONSTRAINTS; break; + } + + grammar.read(grammarf); + fclose(grammarf); + + mgraph.clear(); + sgraph.clear(); + + char line[1000]; + while (fgets(line, MAXLINE+1, inputf)) + { + line[strlen(line)-1] = '\0'; + strcpy(segment[segcount],line); + + char segtype[80]; + + seg_mnode[segcount] = process_seg(line, args) ? mgraph.add_node(line) : -1; + + segcount++; + + getfield(line,"3",segtype); + if(strcmp(segtype,"EOS")==0) + { + dgp0(); // parametry!!! 
MGraph, SGraph, Grammar + output(); + + mgraph.clear(); + sgraph.clear(); + segcount=0; + } + // if(args.interactive_flag) { fflush(outputf); fflush(failedf); } + } + + fclose(inputf); + fclose(outputf); + cmdline_parser_free(&args); + exit(0); +} + +void output() +{ + for(int si=0; si=0) + { + MNode& m=mgraph.nodes[seg_mnode[si]]; + for(vector::iterator s=m.snodes.begin(); s!=m.snodes.end(); ++s) + { + fputs(segment[si],outputf); + sgraph.print_node(outputf, *s, info); + fputc('\n',outputf); + } + } + else + { + fputs(segment[si],outputf); + fputc('\n',outputf); + } + } +} diff --git a/src/dgp/mgraph.cc b/src/dgp/mgraph.cc new file mode 100644 index 0000000..adc9d41 --- /dev/null +++ b/src/dgp/mgraph.cc @@ -0,0 +1,54 @@ + +#include "mgraph.hh" +#include "thesymbols.hh" +#include "const.hh" + +#include + +int MGraph::add_node(char* seg) +{ + nodes[n].clear(); + + char field1[80], field3[80], descr[256], gph[256]; + char* cat; + + getfield(seg,"1",field1); + nodes[n].pos=atoi(field1); + + getfield(seg,"3",field3); + if(!getfield(seg,"lem",descr)) strcpy(descr,"?,?"); + + cat=descr; + while(*cat!=',' && *cat ) ++cat; + if(*cat) ++cat; + +// Cat::add(cat); + if(Cat::index(cat)>0) + nodes[n].cat=cat; + else + nodes[n].cat="NULL"; + + nodes[n].pred.clear(); + + char* tok; + int previd; + + if(!getfield(seg,"gph",gph)) + { + fprintf(stderr,"No gph field. Aborting (sorry).\n"); + exit(1); + } + + char* ids=strtok(gph,":"); + if(n!=atoi(ids)){fprintf(stderr,"Invalid node id in line ?. 
Program aborted.\n"); exit(1); } + + char *preds; + while(preds=strtok(NULL,",")) + { + previd=atoi(preds); + nodes[n].pred.push_back(&nodes[previd]); + } + + return n++; +} + diff --git a/src/dgp/mgraph.hh b/src/dgp/mgraph.hh new file mode 100644 index 0000000..373eac2 --- /dev/null +++ b/src/dgp/mgraph.hh @@ -0,0 +1,34 @@ +#ifndef _MGRAPH_HH +#define _MGRAPH_HH + +#include + +#include "const.hh" +#include "thesymbols.hh" +#include "../common/common.h" + +class MNode +{ +public: + + char type[MAXFORMLEN]; + Cat cat; + int pos; + vector pred; + vector snodes; + + void clear() { snodes.clear(); }; +}; + +class MGraph +{ + public: + + MNode nodes[MAXNODES]; + int n; + + void clear() { n=0; }; + int add_node(char* seg); +}; + +#endif diff --git a/src/dgp/sgraph.cc b/src/dgp/sgraph.cc new file mode 100644 index 0000000..e8d50d5 --- /dev/null +++ b/src/dgp/sgraph.cc @@ -0,0 +1,165 @@ +#include "global.hh" +#include "sgraph.hh" +#include "mgraph.hh" +#include "grammar.hh" +#include "const.hh" +#include + + +int SGraph::add_base_snode(MNode* mn) +{ + int nodeind=n; + SNode &node=nodes[n]; + + node.clear(); + + node.mnode=mn; + + for(vector::iterator pm=node.mnode->pred.begin(); pm!=node.mnode->pred.end(); ++pm) + for(vector::iterator ps=(*pm)->snodes.begin(); ps!=(*pm)->snodes.end(); ++ps) + if(nodes[*ps].in_LH) + { + node.LV.set(*ps); + if(nodes[*ps].saturated()) node.LV |= nodes[*ps].LH; + } + + mn->snodes.push_back(nodeind); + ++n; + + node.in_LH=true; + + return nodeind; +} + + +void SGraph::update_left(int headind, int depind) +{ + SNode &head=nodes[headind], &dep=nodes[depind]; + + if(dep.saturated()) head.LV |= dep.LV, head.LD |= dep.LD; +} + + +void SGraph::update_right(int headind, int depind) +{ + SNode &head=nodes[headind], &dep=nodes[depind]; + + dep.LH.set(headind); + if(head.saturated()) + dep.LH |= head.LH; +} + + +int SGraph::clone(int ancind, NodeProp newprop) +{ + int newind = n++; + SNode &newnode=nodes[newind]; + SNode &ancnode = nodes[ancind]; + + 
newnode.clear(); + newnode.prop=newprop; + newnode.mnode=ancnode.mnode; + newnode.mnode->snodes.push_back(newind); + return newind; +} + + +//------------------------------------------------------------------------- +//------------------------------------------------------------------------- + + +int SGraph::print_node(FILE* f, int n, unsigned int info) +{ + char buf[1000]; + sprint_node(buf,n,info); + fputs(buf,f); +} + +int SGraph::sprint_node(char* buf, int nodeind, unsigned int info) +{ + char* buf0=buf; + char descr[256]; + char nodeinfo[16]; + + SNode &node=nodes[nodeind]; + + buf+=sprintf(buf," dgp:%d",nodeind); + buf+=sprintf(buf, saturated(nodeind) ? ";s" : ";u"); + + bool cont=false; + if (info&HEADS) + { + buf+=sprintf(buf,";"); + for(vector::iterator h=node.heads.begin(); h!=node.heads.end(); ++h) + { + if(cont) buf+=sprintf(buf,","); else cont=true; + buf+=sprintf(buf,"++%s-%d/%d",h->role.str(),h->dst,h->anc); + } + } + + if (info&DEPS) + { + buf+=sprintf(buf,";"); + for(vector::iterator d=node.deps.begin(); d!=node.deps.end(); ++d) + { + // if(! nodes[d->dst].saturated()) continue; // NIE DRUKUJ NIENASYCONYCH PODRZEDNIKOW + if(cont) buf+=sprintf(buf,","); else cont=true; + buf+=sprintf(buf,"--%s-%d/%d",d->role.str(),d->dst,d->anc); + } + } + + if (info&SETS) + { + int ord=0; + buf+=sprintf(buf,";{"); + for(vector::iterator pm=node.mnode->pred.begin(); pm!=node.mnode->pred.end(); ++pm) + for(vector::iterator ps=(*pm)->snodes.begin(); ps!=(*pm)->snodes.end(); ++ps) + buf+=sprintf(buf, ord++ ? ",%d" : "%d", *ps); + buf+=sprintf(buf,"};{"); + ord=0;for(int j=0; j<=n; ++j) if(node.LV[j]) buf+=sprintf(buf, ord++ ? ",%d" : "%d", j); + buf+=sprintf(buf,"};{"); + ord=0;for(int j=0; j<=n; ++j) if(node.LH[j]) buf+=sprintf(buf, ord++ ? ",%d" : "%d", j); + buf+=sprintf(buf,"};{"); + ord=0;for(int j=0; j<=n; ++j) if(node.LD[j]) buf+=sprintf(buf, ord++ ? 
",%d" : "%d", j); + buf+=sprintf(buf,"}"); + } + + if (info&CONSTRAINTS)// buf+=sprint_node_constraints(buf,n); + { + buf+=sprintf(buf,";"); + int cont=0; + for(Role i=1; i<=Role::count(); ++i) + if(node.prop.forbidden[i]) buf+=sprintf(buf,"%s!%s",(cont++)?",":"",i.str()); + for(Role i=1; i<=Role::count(); ++i) + if(node.prop.required[i]) buf+=sprintf(buf,"%s&%s",(cont++)?",":"",i.str()); + } + +// buf+=sprintf(buf,"\n"); + + return buf-buf0; +} + + +int SGraph::sprint_node_debug(char* buf, const char* pref, int n) +{ + char *buf0 = buf; + buf+=sprintf(buf,"#%s",pref); + buf+=sprint_node(buf,n,HEADS|DEPS|SETS|CONSTRAINTS); + buf+=sprintf(buf,"\n"); + return buf-buf0; +} + +int SGraph::print_node_debug(FILE* f, const char* pref, int n) +{ + char buf[1000]; + sprint_node_debug(buf,pref,n); + fputs(buf,f); +} + +void SGraph::print_arc(FILE* f, int left, int right, Role role, int dir) // 0 - left, 1 - right +{ + fprintf(f,"# %s:%s.%02d %s %s.%02d\n", + role.str(),nodes[left].mnode->type,left, + dir ? 
"-->" : "<--", + nodes[right].mnode->type,right); +} diff --git a/src/dgp/sgraph.hh b/src/dgp/sgraph.hh new file mode 100644 index 0000000..1c04e39 --- /dev/null +++ b/src/dgp/sgraph.hh @@ -0,0 +1,108 @@ +#ifndef _SGRAPH_HH +#define _SGRAPH_HH + +#include + +#include +#include +#include + +#include "const.hh" +#include "thesymbols.hh" + + +class MNode; + + +struct Arc +{ + int dst; + Role role; + int anc; + + Arc(int d, Role r, int a) : dst(d), role(r), anc(a) {}; + }; + + +struct NodeProp +{ + bitset required; + bitset forbidden; + + bool operator==(const NodeProp& p) + { return required==p.required && forbidden==p.forbidden; } + + void clear() + { required.reset(), forbidden.reset(); } + +}; + + +struct SNode +{ + + MNode* mnode; + + NodeProp prop; + + bitset LV; + bitset LH; + bitset LD; + bool in_LH; + + vector heads; + vector deps; + + void clear() { prop.clear(), LV.reset(), LD.reset(), LH.reset(), heads.clear(), deps.clear(); } + bool saturated() { return prop.required.none(); } +}; + + + +class SGraph +{ +public: + + SNode nodes[MAXNODES]; + int n; // number of nodes + + enum Output { HEADS=1, DEPS=2, SETS=4, CONSTRAINTS=8 }; + + SGraph() : n(0) {} + + void clear() { n=0; } + + int add_base_snode(MNode* mn); + int clone(int ancind, NodeProp newprop); + void update_left(int headind, int depind); + void update_right(int headind, int depind); + + bool visible(int left, int right); + bool saturated(int node); + + //-------------------------------------------------------------------- + + void read(FILE* f); + void write(FILE* f, list nodelist, unsigned int info); + + int sprint_node(char* buf, int n, unsigned int info); + int print_node(FILE* f, int n, unsigned int info); + int sprint_node_debug(char* buf, const char* pref, int n); + int print_node_debug(FILE* f, const char* pref, int n); + + void print_arc(FILE* f, int left, int right, Role role, int dir); // 0 - left, 1 - right + +}; + + +inline bool SGraph::visible(int left, int right) +{ + return 
nodes[right].LV[left]; +} + +inline bool SGraph::saturated(int node) +{ + return nodes[node].saturated(); +} + +#endif diff --git a/src/dgp/symbol.cc b/src/dgp/symbol.cc new file mode 100644 index 0000000..a10c241 --- /dev/null +++ b/src/dgp/symbol.cc @@ -0,0 +1,39 @@ +#include "symbol.hh" + +// CLASS symbols + +//int Symbols::_no_of_spaces=0; + +Symbols::~Symbols() +{ + while(!table.empty()) + { + free((void*)table.back()); + table.pop_back(); + } +} + +void Symbols::load(const char* filename) +{ + ifstream f(filename); + char s[100]; + while(f) + { + f >> s >> ws; + if(strlen(s)) add(s); + } +} + +void Symbols::add(const char* sym) +{ + if(hash.count(sym)==0) + { + char* symdup=strdup(sym); + hash[symdup]=table.size(); + table.push_back(symdup); + } +} + + +//template +//Symbols Symbol::defs; diff --git a/src/dgp/symbol.hh b/src/dgp/symbol.hh new file mode 100644 index 0000000..2a70bc2 --- /dev/null +++ b/src/dgp/symbol.hh @@ -0,0 +1,143 @@ +#ifndef _SYMBOL_HH +#define _SYMBOL_HH + +#include +//#include +#include +#include +#include +#include +#include + +using namespace std; + +using __gnu_cxx::hash_map; +using __gnu_cxx::hash; + + +// Key comparison for the cstr_hash hash table +struct eqstr +{ + bool operator()(const char * s, const char* t) const + { return strcmp(s,t)==0; } +}; + + +// Hash table for storing symbols + +typedef hash_map,eqstr> cstr_hash; + +// Symbol table. Provides access to symbols through their index or name. + +class Symbols +{ + public: + + Symbols() { add("NULL"); }; + ~Symbols(); + + void load(const char* filename); + + int operator[](const char* s) { return hash[s]; }; + + const char* operator[](int i) { return table[i]; }; + + void add(const char* c); + + int count() { return table.size(); }; + + private: + + std::vector table; + cstr_hash hash; + +}; + +////////////////////////////////////////////////////////////////////// + +/// Symbol class template. +/** The template argument determines the symbol space. 
+ Each space is created with symbol "NULL" with indexed 0 already in. +*/ + +template +class Symbol +{ + public: + + /// Load the contents of the symbol table from file. + static void define(const char *filename) + { defs.load(filename); } + + /// Add symbol s. + /** The string is duplicated. + */ + static Symbol add(const char* s) { defs.add(s); } + + /// Number of symbols. + static int count() { return defs.count(); }; + + /// First symbol. + static int first() { return 1; } + + /// Last symbol. + static int last() { return defs.count()+1; } + + /// Last symbol. + static int index(const char* s) { return defs[s]; } + + /// Just for tests. + static void print(); + + /// 0-argument constructor, default value is 0 ("NULL"). + Symbol() : val(0) {}; + + /// Constructing a symbol from its index. + /** No check is performed. + */ + + Symbol(int v) : val(v) {}; + + /// Constructing a symbol from its name (string to Symbol conversion). + /** If s is not a symbol name, the value of 0 ("NULL") is assigned. + */ + + Symbol(const char * s) : val(defs[s]) {}; + + /// Symbol to char* conversion. If symbol is invalid, NULL is returned. + const char* str() const { return (val>=0 && val s=1; s; s++ ) ... + s=0; while(++s) ... 
+ */ + (operator int)() const { return val; }; + + Symbol operator++() {val++; return *this;} + + // bool operator<(Symbol& s) { return val < s.val; } + + + private: + static Symbols defs; + int val; +}; + +template +void Symbol::print() +{ + for(Symbol i=0; i +Symbols Symbol::defs; + +template +bool operator<(const Symbol& s, const Symbol& t) +{ + return (int)s < (int)t; +} + +#endif diff --git a/src/dgp/thesymbols.hh b/src/dgp/thesymbols.hh new file mode 100644 index 0000000..b90f997 --- /dev/null +++ b/src/dgp/thesymbols.hh @@ -0,0 +1,29 @@ +#ifndef __THESYMBOLS__HH +#define __THESYMBOLS__HH + +#include "symbol.hh" +#include "const.hh" + +#include +#include +#include + +typedef Symbol<1> Cat; + +typedef Symbol<2> Role; +typedef list RoleList; +typedef list::iterator RoleListIter; +typedef bitset RoleSet; +typedef set Roles; +typedef Roles::iterator RolesIter; + +typedef Symbol<3> Constr; +typedef list ConstrList; +typedef list::iterator ConstrListIter; + +typedef Symbol<4> Rel; + +typedef Symbol<5> Flag; +typedef bitset FlagSet; + +#endif diff --git a/src/dgp/tre b/src/dgp/tre new file mode 100755 index 0000000..fc337ae --- /dev/null +++ b/src/dgp/tre @@ -0,0 +1,304 @@ +#!/usr/bin/ruby -I /usr/local/lib/utt -I $HOME/.local/lib/utt + +$: << "#{ENV['HOME']}/.local/lib/utt" +$: << "/usr/local/lib/utt" + +require 'getoptlong' +require 'seg.rb' + +opts = GetoptLong.new( +[ '--help', '-h', GetoptLong::NO_ARGUMENT ], +[ '--debug', '-d', GetoptLong::NO_ARGUMENT ], +[ '--format', '-F', GetoptLong::REQUIRED_ARGUMENT ], +[ '--info', '-I', GetoptLong::REQUIRED_ARGUMENT ], +[ '--only-trees','-t', GetoptLong::NO_ARGUMENT ]) + +$helptext= +"The program generates trees from the graph output by dgp. dgp must\n"+ +"must be run with '-i ds' option.\n\n"+ +"Command: tre [options]\n\n"+ +"Options:\n"+ +"--help -h Print help (this text) and exit.\n"+ +"--debug -d Verbose output. For developers only.\n"+ +"--format=s -F s Output format. 
Recognized values:\n"+ +" a root + list of arcs\n"+ +" p parenthesized notation\n"+ +" h human readable indented tree format\n"+ +" Multiple values are allowed. (default p)\n"+ +"--info=s -I s Information printed. Recognized values:\n"+ +" n node identifier\n"+ +" f surface form\n"+ +" m morphological information\n"+ +" l arc labels\n"+ +"--only-trees -t Do not copy input. Print trees only.\n" + +$DEBUG=false +$FORMAT='p' +$INFO='DEFAULT' +$ONLYTREES=false + +opts.each do |opt, arg| + case opt + when '--help' + print $helptext + exit 0 + when '--debug' + $DEBUG=true + when '--format' + $FORMAT=arg + when '--info' + $INFO=arg + when '--only-trees' + $ONLYTREES=true + else + print "Unknown option #{opt}. Ignored.\n" + end +end + +if $INFO=='DEFAULT' + case $FORMAT + when 'p','a' + $INFO='nl' + when 'h' + $INFO='fmnl' + end +end + +$dgpsep=';' + +def tre(input) + $gphid=[] + $form=[] + $lem=[] + nodes=[] + count=0 + seg=Seg.new + for line in input + print line unless $ONLYTREES + seg.set(line) + if dgp=seg['dgp'] + if nodes==[] && seg[3]!='BOS' + print "A sentence must start with BOS segment. Aborting.\n" + return + end + + id=dgp[/^\d+/].to_i + + if gph=seg['gph'] + $gphid[id]=gph[/^\d+/].to_i + else + print "No gph field. Aborting.\n" + return + end + + $form[$gphid[id]]=seg[4] + $lem[$gphid[id]]=seg['lem'] + + nodes[id] = [seg[1].to_i,dgp] + + if seg[3]=='EOS' + $pref = "#{seg[1]} #{seg[2]} SYN *" + parsegraph(nodes) + printgraph if $DEBUG + $thetrees=[] + gentrees2 + for t in $thetrees + count += 1 + t1=ground(t) + case $FORMAT + when /a/ + print "#{$pref} tre:#{count} arc:" + printarcs(t1[0],t1[1]) + print "\n" + when /p/ + print "#{$pref} tre:#{count} par:" + printpar(t1[0],t1[1]) + print "\n" + when /h/ + print "#\n# tree #{count}\n# ------\n" + printtree(t1[0],t1[1],0) + end + end + nodes=[] + end + end + end +end + + +def nodeinfo(id) + info="" + if $INFO =~ /n/ + info += id.to_s + info += '.' 
if $INFO =~ /[fm]/ + end + if $INFO =~ /f/ + info += $form[id] + info += ';' if $INFO =~ /m/ + end + if $INFO =~ /m/ + info += $lem[id] + end + info +end + + +def printarcs(root,arcs) + print nodeinfo(root) + for a in arcs + print ';' + print "#{a[2]}:" if $INFO =~ /l/ + print nodeinfo(a[0])+'-'+nodeinfo(a[1]) + end +end + +def printtree(root,arcs,o) + if o==0 + print "# %-16s" % "root: " + end + print nodeinfo(root),"\n" + for arc in arcs.select{ |a| a[0]==root }.sort{|a,b| a[1]<=>b[1] } + print '# '," "*(o+1) + print "%-16s" % (arc[2]+": ") + printtree(arc[1],arcs,o+1) + end +end + +def printpar(root,arcs) + print nodeinfo(root) + deps = arcs.select{ |a| a[0]==root }.sort{|a,b| a[1]<=>b[1] } + unless deps == [] + print '(' + cont=false + for arc in deps + if cont then print ',' else cont=true end + print arc[2],':' if $INFO =~ /l/ + printpar(arc[1],arcs) + end + print ')' + end +end + + +def parsegraph(nodes) + + $n =nodes.length + $sat =[]; + + $vis =[]; + $succ=[]; + $lhs =[]; + $arcs=[]; + $pos=[] + + for dgp in nodes + + parts = dgp[1].split($dgpsep,6) + + if parts[3]==nil || parts[4]==nil || parts[5]==nil + $stderr.print "ERR: tre requires dgp be called with '--info s' option. Aborting.\n" + exit + end + + i = parts[0].to_i + $pos[i] = dgp[0].to_i + $sat << i if parts[1]=="s" + $arcs |= parts[2].split(',').map{ |a| case a + when /\-\-(\w+)-(\d+)\/(\d+)/ + [i, $2.to_i, $1, $3.to_i] + when /\+\+(\d+)-(\w+)\/(\d+)/ + [$1.to_i, i, $2, $3.to_i] + end } + $succ |= parts[3][1..-2].split(',').map{|x| [x.to_i,i]} + $vis |= parts[4][1..-2].split(',').map{|x| [x.to_i,i]} + $lhs |= parts[5][1..-2].split(',').map{|x| [x.to_i,i]} + [[i,i]] + + end +end + + +def ground(t) + [ $gphid[t[0]] , t[1].map{|a| [$gphid[a[0]],$gphid[a[1]],a[2]]} ] +end + + +def gentrees2() + $thetrees=[]; + bos=0; eos=$n-1; + roots = (1...eos).select{|i| $vis.include? [i,eos]}.select{|i| $vis.include? 
[bos,i]} + if $DEBUG then print "ROOTS: #{roots.inspect}\n" end + for i in roots + $theroot=i + for r in buildR(i , eos, []) + (rmin,rmax,rtree) = r + buildR(bos, rmin, rtree) + end + end +end + + +def buildR(min, max, tree) + if $DEBUG then print "buildR--#{min}--#{max}--#{tree.inspect}\n" end + trees=[] + for a in $arcs.select{|a| a[0]==max && $vis.include?([min,a[1]]) } + if $DEBUG then print "ARC: #{a.inspect}\n" end + for r in buildR(a[1],a[3],tree+[a]) + (rmin,rmax,rarcs) = r + for l in buildR(min,rmin,rarcs) + (lmin,lmax,larcs) = l + trees << [lmin,rmax,larcs] + end + end + end + for i in (0...$n).select{|i| $succ.include?([i,max])}.select{|i| $lhs.include?([min,i])} + for l in buildL(min,i,tree) + (lmin,lmax,larcs) = l + trees << [lmin,lmax,larcs] + end + end + trees +end + + +def buildL(min,max,tree) + if $DEBUG then print "buildL--#{min}--#{max}--#{tree.inspect}\n" end + if $pos[min]==$pos[max] + if min==0 && max==0 + $thetrees.push [$theroot,tree] + if $DEBUG then print "adding tree: #{tree.inspect}\n" end + end + return [[max,max,tree]] + end + trees=[] + for arc in $arcs.select{|a| a[1]==max && $lhs.include?([min,a[0]]) } + if $DEBUG then print "ARC: #{arc.inspect}\n" end + for r in buildR(arc[3],max,tree+[arc]) + (rmin,rmax,rarcs) = r + for l in buildL(min,rmin,rarcs) + (lmin,lmax,larcs) = l + trees << [lmin,lmax,larcs] + end + end + end + trees +end + + +def printgraph() + + print "N: #{$n}\n" + print "SAT: #{set_to_s($sat)}\n" + print "SUCC: #{rel_to_s($succ)}\n" + print "VIS: #{rel_to_s($vis)}\n" + print "LHS: #{rel_to_s($lhs)}\n" + print "ARCS: #{arcs_to_s($arcs)}\n" +end + +def set_to_s(s) "{#{s.join(',')}}" end +def rel_to_s(r) "{#{r.map{|p| "(#{p[0]},#{p[1]})"}.join(',')}}" end +def arc_to_s(q) "-#{q[0]}-#{q[2]}-#{q[1]}/#{q[3]}" end +def arcs_to_s(a) "{#{a.map{|q| arc_to_s(q)}.join(',')}}" end + +###################################################################### + +tre($stdin) diff --git a/src/dgp/uttcommon.c b/src/dgp/uttcommon.c new file 
mode 100644 index 0000000..4f5773a --- /dev/null +++ b/src/dgp/uttcommon.c @@ -0,0 +1,2 @@ +#include "uttcommon.h" + diff --git a/src/dgp/uttcommon.h b/src/dgp/uttcommon.h new file mode 100644 index 0000000..490f964 --- /dev/null +++ b/src/dgp/uttcommon.h @@ -0,0 +1,146 @@ +#ifndef __COMMON_H +#define __COMMON_H + +#include + +/************************************************** + * Stale dotyczace wejscia/wyjscia + */ + +#define MAXLINE 1024 + +#define EMPTYFORM '*' +#define INFIELD_SEP ':' +#define MAXAUX 16 +#define FIELD_SEP " \t\n" + + +/***************************************************************/ +/* problems with casing */ +/* sprawdzenie wielkosci liter */ +/* warto zwracana: */ +/* 0 - wszystkie mae litery, 1 - pierwsza wielka, reszta male */ +/* 2 - wszystkie wielkie, 3 - inne */ +/***************************************************************/ +inline int casing(char* s) +{ + int ret = isupper(*s) ? 1 : 0; + while(*++s != '\0') + { + if(isupper(*s)) + { + if(ret==1) ret=2; + else if(ret==0) ret=3; + } + else + { + if(ret==2) ret=3; + } + } + return ret; +} + +// +inline void tolowers(char* s, char* d) +{ + *d=tolower(*s); + while(*s != '\0') * ++d = tolower(* ++s); +} + + +// przepisuje s do d +// nadajac wielko liter zgodnie z wartoci casing +// casing - warto zwracana przez casing() +// jeli casing==3 przepisuje bez zmian (za mao informacji) +inline void restorecasing(char *s, char *d, int casing) +{ + switch(casing) + { + case 0: + case 3: + *d=*s; + while(*s != '\0') * ++d = * ++s; + break; + case 1: + *d=toupper(*s); + while(*s != '\0') * ++d = * ++s; + break; + case 2: + *d=toupper(*s); + while(*s != '\0') * ++d = toupper(* ++s); + break; + } +} + + +/**************************************************/ +/* +parameters: + -seg - segment + -name - field name + +val - field contents +return value: + 1 if specified field exists, 0 otherwise +*/ + +inline int getfield(char* seg, const char* pref, char* val) +{ + char* p=seg; + + while(isspace(*p)) 
++p; + + pos: + if(isdigit(*p) or *p=='*') while(!isspace(*p)) ++p; + else goto type; + + while(isspace(*p)) ++p; + + len: + if(isdigit(*p) or *p=='*') while(!isspace(*p)) ++p; + else goto type; + + while(isspace(*p)) ++p; + + type: + while(isspace(*p)) ++p; while(!isspace(*p)) ++p; + + while(isspace(*p)) ++p; + + form: + while(isspace(*p)) ++p; while(!isspace(*p)) ++p; + + annotation: + do p=strstr(p,pref); while(p!=NULL && *(p-1)!=' ' && *(p-1)!='\t'); + + if(p==NULL) return 0; + else + { + p+=strlen(pref); + int len=strcspn(p,FIELD_SEP "\n\r\f\0"); + strncpy(val,p,len); + val[len]='\0'; + return 1; + } +} + + +/* +parameters: + +seg - segment + -pref - prefix of the new field + -val - contents of the new field +return value: + 1 - success, 0 - fail (limit on segment length exceeded) +*/ +inline int addfield(char *seg, const char *pref, const char *val) + // zalozenie, ze seg konczy sie znakiem \n +{ + if(strlen(seg)+strlen(pref)+strlen(val) >= MAXLINE) return 0; // bezpieczniej, ale wolniej + + int seglen=strlen(seg); + sprintf(seg+(seglen-1)," %s%s\n",pref,val); + return 1; +} + + +#endif diff --git a/src/fla/Makefile b/src/fla/Makefile new file mode 100644 index 0000000..6f6fcd4 --- /dev/null +++ b/src/fla/Makefile @@ -0,0 +1,25 @@ +include ../../config.mak + +ifeq ($(BUILD_STATIC), yes) + LDFLAGS += -static +endif + +CFLAGS +=-O2 + +fla: + $(CC) $(CFLAGS) fla.c -o fla $(LDFLAGS) + +.PHONY: install +install: +ifdef BIN_DIR + install -m 0755 fla $(BIN_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm $(BIN_DIR)/fla +endif + +clean: + rm fla || true diff --git a/src/fla/fla.c b/src/fla/fla.c new file mode 100644 index 0000000..7d16f4e --- /dev/null +++ b/src/fla/fla.c @@ -0,0 +1,68 @@ +#include +#include +#include +#include + +char buf[5000]; + +int main(int argc, char **argv) +{ + + char *pattern; + char eoln; + regex_t re; + + int firstline = 1; + + if( argc < 2 ) + { +/* pattern="[ \t]*([0-9]+[ \t]+){2}EOS([ \t].*)?"; */ + pattern = "[ 
\t]*BOS([ \t].*)?"; + } + else + { + pattern = argv[1]; + } + + if( argc < 3 ) + { + eoln = '\f'; + } + else + { + eoln = atoi(argv[2]); + } + + if( 0 != regcomp(&re, pattern, REG_EXTENDED|REG_NOSUB) ) + { + fprintf(stderr, "Invalid pattern.\n"); + return 1; + } + + while( fgets(buf, 5000, stdin) ) + { + buf[strlen(buf)-1] = '\0'; + if( firstline ) + { + firstline = 0; + } + else + { + if( 0 == regexec(&re, buf, (size_t)0, NULL, 0) ) + { + putchar('\n'); + } + else + { + putchar(eoln); + } + } + fputs(buf, stdout); + } + + putchar('\n'); + + return 0; + +} + diff --git a/src/gph/Makefile b/src/gph/Makefile new file mode 100644 index 0000000..cd72bb9 --- /dev/null +++ b/src/gph/Makefile @@ -0,0 +1,18 @@ +include ../../config.mak + +gph: + +.PHONY: install +install: +ifdef BIN_DIR + install -m 0755 gph $(BIN_DIR) +endif + + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm $(BIN_DIR)/gph +endif + +clean: diff --git a/src/gph/gph b/src/gph/gph new file mode 100755 index 0000000..3f739c8 --- /dev/null +++ b/src/gph/gph @@ -0,0 +1,162 @@ +#!/usr/bin/perl + +#package: UAM Text Tools +#component: gph +#version: 1.0 +#author: Tomasz Obrebski + +use strict; +use Getopt::Long; +use File::HomeDir; + + + + + +my $systemconfigfile='/usr/local/etc/utt/gph.conf'; +my $userconfigfile=home()."/.utt/gph.conf"; + +Getopt::Long::Configure('no_ignore_case_always'); + +my $help=0; +my $inputfile=0; +my $outputfile=0; +my @process=(); +my $reset; +my $interactive=0; + +#read configuration files########################### +my $file; +my @process_conf=(); +foreach $file ($systemconfigfile, $userconfigfile){ + if(open(CONFIG, $file)){ + while () { + chomp; + s/#.*//; + s/^\s+//; + s/\s+$//; + next unless length; + my ($name, $value) = split(/\s*=\s*/, $_, 2); + if(($name eq "inputfile")or($name eq "f")){ + $inputfile=$value; + } + elsif(($name eq "outputfile")or($name eq "o")){ + $outputfile=$value; + } + elsif(($name eq "process")or($name eq "p")){ + push @process_conf, $value; + } + 
elsif(($name eq "reset")or($name eq "r")){ + $reset=$value; + } + elsif(($name eq "interactive")or($name eq "i")){ + $interactive=1; + } + elsif(($name eq "help")or($name eq "h")){ + $help=1; + } + + } + close CONFIG; + } +} +######################################################### + + + +GetOptions("process|p=s" => \@process, + "inputfile|f=s" => \$inputfile, + "outputfile|o=s" => \$outputfile, + "help|h" => \$help, + "reset|r=s" => \$reset, + "interactive|i" => \$interactive); + +@process = @process_conf if @process<1; + +if($help) +{ + print <<'END' +Usage: gph [OPTIONS] + +Options: + --process=TYPE -p TYPE Process segments of type TYPE. + --reset=TYPE -r TYPE Start new graph at tags of type TYPE. + --inputfile=FILE -f FILE Input file. + --outputfile=FILE -o FILE Output file. + --interactive -i Toggle interactive mode (default=off). +END +; + exit 0; +} + + +$|=1 if $interactive; + + +if(!$inputfile or $inputfile eq "-") { + *INPUT = *STDIN; +} +else { + open(INPUT, "$inputfile") or die("Can't open input file: $inputfile!"); +} + +if(!$outputfile or $outputfile eq "-") { + *OUTPUT = *STDOUT; +} +else { + open(OUTPUT, "$outputfile") or die("Can't open output file: $outputfile!"); +} + +my @prev; +my $n=0; + +while() +{ + chomp; + my $do=0; + + my @line = split /\s+/; + + if($line[2] eq $reset) + { + $n=0; + @prev = (); + } + + for my $p (@process) + { + $do=1 if $line[2] eq $p; + } + + my $gph = ''; + if($do) + { + my @preds = (); + shift @prev while @prev+0 && $prev[0]->[1] + $prev[0]->[2] < $line[0]; + for my $p (@prev) + { + push(@preds, $p->[0]) if $p->[1] + $p->[2] == $line[0]; + } + push @prev, [$n, $line[0], $line[1]]; + + $gph=' gph:'.$n.':'.join(',',@preds); + + $n++; + } + else + { + for my $p (@prev) + { + if($p->[1]+$p->[2] == $line[0]) + { + $p->[2] += $line[1]; + } + } + + $gph=''; + + } + + print OUTPUT $_.$gph."\n"; +} + diff --git a/src/grp/Makefile b/src/grp/Makefile new file mode 100644 index 0000000..4193550 --- /dev/null +++ 
b/src/grp/Makefile
@@ -0,0 +1,23 @@
+include ../../config.mak
+
+# grp is a Perl script shipped as-is; there is nothing to build.
+grp:
+
+# Install the script into BIN_DIR (skipped when BIN_DIR is not set).
+.PHONY: install
+install:
+ifdef BIN_DIR
+	install -m 0755 grp $(BIN_DIR)
+endif
+
+# -f keeps uninstall from failing when the script was never installed.
+.PHONY: uninstall
+uninstall:
+ifdef BIN_DIR
+	rm -f $(BIN_DIR)/grp
+endif
+
+# Nothing is generated here, so clean is a no-op; it is phony so that a
+# stray file named "clean" cannot mask it.
+.PHONY: clean
+clean:
diff --git a/src/grp/grp b/src/grp/grp
new file mode 100755
index 0000000..1abbb4b
--- /dev/null
+++ b/src/grp/grp
@@ -0,0 +1,186 @@
+#!/usr/bin/perl
+
+#package: UAM Text Tools
+#component name: grp
+#version: 1.0
+#author: Tomasz Obrebski
+
+# Builds an egrep regular expression from a segment pattern (expanded with
+# m4 macro definitions) and runs it, optionally between the fla and unfla
+# filters.
+
+use strict;
+use Getopt::Long;
+use File::HomeDir;
+
+# directory containing terms.m4
+my $LIB_DIR="/usr/local/lib/utt";
+
+my $systemconfigfile="/usr/local/etc/utt/grp.conf";
+my $userconfigfile=home()."/.utt/grp.conf";
+
+Getopt::Long::Configure('no_ignore_case_always');
+
+# Option defaults; 0 means "not set".
+my $help=0;
+my $pattern=0;
+my $matches_only=0;
+my $macrofile=0;
+my $define=0;
+my $show_command=0;
+my $action="pgP";
+my $eos="seg(EOS)";
+my $morfield='lem';
+my $tags=0;
+
+#read configuration files###########################
+# The system-wide file is read first, then the per-user one, so user settings
+# win; command-line options (parsed below) override both.
+my $file;
+foreach $file ($systemconfigfile, $userconfigfile){
+    if(open(CONFIG, $file)){
+        while (<CONFIG>) {
+            chomp;
+            s/#.*//;        # strip comments
+            s/^\s+//;       # strip leading whitespace
+            s/\s+$//;       # strip trailing whitespace
+            next unless length;
+            my ($name, $value) = split(/\s*=\s*/, $_, 2);
+            if(($name eq "pattern")or($name eq "e")){
+                $pattern=$value;
+            }
+            elsif(($name eq "eos")or($name eq "E")){
+                $eos=$value;
+            }
+            elsif($name eq "morph"){
+                $morfield=$value;
+            }
+            elsif($name eq "macros"){
+                $macrofile=$value;
+            }
+            elsif($name eq "define"){
+                $define=$value;
+            }
+            elsif($name eq "command"){
+                $show_command=1;
+            }
+            elsif($name eq "action"){
+                # store the value; a bare "$action;" here would discard it
+                $action=$value;
+            }
+            elsif($name eq "tags"){
+                $tags=$value;
+            }
+            elsif(($name eq "help")or($name eq "h")){
+                $help=1;
+            }
+
+        }
+        close CONFIG;
+    }
+}
+#########################################################
+
+# "define" has its own variable: it adds macro definitions on top of the
+# macro file instead of replacing it.  "-a" is accepted because the help
+# text below documents it.
+GetOptions("pattern|e=s" => \$pattern,
+           "eos|E=s" => \$eos,
+           "morph=s" => \$morfield,
+           "macros=s" => \$macrofile,
+           "define=s" => \$define,
+           "command" => \$show_command,
+           "action|a=s" => \$action,
+           "tags=s" => \$tags,
+           "help|h" => \$help);
+
+if($help)
+{
+    print <<'END'
+Usage: grp [OPTIONS] [file ..]
+
+Options:
+ --pattern -e PATTERN Pattern.
+ --eos -E PATTERN Segment serving as sentence delimiter.
+ --morph=STRING Field containing morphological information (default 'lem').
+ --macros=FILE Read macrodefinitions from FILE.
+ --define=FILE Add macrodefinitions from FILE.
+ --action -a [u][p][g][P] Perform only indicated actions.
+ u - uncompress with 'lzop -cd'
+ p - preprocess
+ g - grep
+ P - postprocess
+ (default pgP)
+ --tags=STRING Morphosyntactic tag format.
+ --command Print the shell command to be executed and exit.
+ --help -h Help.
+END
+;
+    exit 0;
+}
+
+die("$0: no pattern given.\n") unless $pattern || $action !~ /g/;
+
+# Fall back to the stock terms.m4 when no macro file was given.
+die("$0: macro file not found") unless
+    $macrofile or
+    -e "$LIB_DIR/terms.m4" and $macrofile="$LIB_DIR/terms.m4";
+
+die("$0: undefined tagset format (tags option missing)") unless
+    $tags;
+
+die("$0: $tags.tag2re program not found") unless
+    1; #TODO: how should this condition be written???
+
+
+my $uncompress = ($action =~ /u/) ? ' lzop -cd | ' : '';
+my $preproc = ($action =~ /p/) ? ' fla | ' : '';
+
+my $postproc = ($action =~ /P/) ? ' | unfla ' : '';
+
+# Extra macro definitions (--define) are handed to m4 after the main file.
+my $extra_macros = $define ? $define : '';
+
+# discarding spaces
+$pattern =~ s/\s+/\\`'/g; #`
+# quoting escaped commas
+$pattern =~ s/\\,/\\`\\`\\,''/g;
+# quoting commas in {m,n} r.e. operator
+$pattern =~ s/(\{\d*),(\d*\})/${1}\\`\\`,''${2}/g;
+
+# NOTE(review): $pattern, $morfield and the file names are interpolated into a
+# shell command unquoted; this is only safe for trusted input.
+my $grepre = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile $extra_macros - 2>/dev/null`;
+
+die("Incorrect pattern (m4).") if $?
>> 8; + + +chomp $grepre; + +# <> expansion + +$grepre =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge; + +$grepre =~ s/\./[^ [:cntrl:]]/g; + +$grepre =~ s/\\s/[ ]/g; +$grepre =~ s/\\S/[^ [:cntrl:]]/g; +$grepre =~ s/\\d/[0-9]/g; +$grepre =~ s/\\D/[^0-9 [:cntrl:]]/g; +$grepre =~ s/\\w/[a-z󶼿A-ZʣӦ0-9_]/g; +$grepre =~ s/\\W/[^a-z󶼿A-ZʣӦ0-9_ [:cntrl:]]/g; +# extensions +$grepre =~ s/\\l/[a-z󶼿]/g; #lowercase letter +$grepre =~ s/\\L/[A-ZʣӦ]/g; #upercase letter + +my $grep_command = ($action =~ /g/) ? "egrep '$grepre'" : " cat "; + +if($show_command) +{ + print $grep_command."\n"; + exit 0; +} + +#print $preproc.$grep_command.$postproc."\n"; + +exec $preproc.$grep_command.$postproc; diff --git a/src/gue/Makefile b/src/gue/Makefile new file mode 100644 index 0000000..139015e --- /dev/null +++ b/src/gue/Makefile @@ -0,0 +1,55 @@ +include ../../config.mak + +ifeq ($(BUILD_STATIC), yes) + LDFLAGS += -static +endif + +LDFLAGS += +CXXFLAGS += -O2 -fpermissive + +LIB_PATH=../lib +COMMON_PATH=../common +CMDLINE_FILE='"../gue/cmdline.h"' + + +gue: main.cc guess.o $(LIB_PATH)/auttools.o $(LIB_PATH)/word.o \ + cmdline.c common_guess.o common.o + $(CXX) $(CXXFLAGS) main.cc guess.o \ + $(LIB_PATH)/auttools.o $(LIB_PATH)/word.o cmdline.c common.o common_guess.o \ + -o gue $(LDFLAGS) + +guess.o: guess.h guess.cc + $(CXX) $(CXXFLAGS) -c guess.cc + +common_guess.o: cmdline.h common_guess.cc common_guess.h + $(CXX) $(CXXFLAGS) -c common_guess.cc + +common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \ + $(COMMON_PATH)/common.h + $(CXX) $(CXXFLAGS) -c -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc + +cmdline.c cmdline.h: cmdline.ggo + $(GENGETOPT) -i cmdline.ggo --conf-parser + +cmdline.ggo: cmdline_guess.ggo $(COMMON_PATH)/cmdline_common.ggo + cat cmdline_guess.ggo $(COMMON_PATH)/cmdline_common.ggo > cmdline.ggo + + +clean: clean.cmdline + rm *.o || true + rm gue || true + +clean.cmdline: + rm cmdline.* || true + +.PHONY: install +install: +ifdef BIN_DIR + install -m 
0755 gue $(BIN_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm $(BIN_DIR)/gue +endif diff --git a/src/gue/cmdline_guess.ggo b/src/gue/cmdline_guess.ggo new file mode 100644 index 0000000..a568f44 --- /dev/null +++ b/src/gue/cmdline_guess.ggo @@ -0,0 +1,12 @@ +package "guess" +version "0.1" + +option "guess_count" n "Guess up to n descriptions" int default="0" no +option "delta" - "Stop displaying answers after fall of weight" float default="0.2" no +option "cut-off" - "Do not display answers with less weight than cut-off" int default="200" no +option "dictionary-home" - "dh" string typestr="FILENAME" no hidden +option "dictionary" d "File with dictionary information" string typestr="filename" default="gue.bin" no +option "per-info" v "Display performance information" flag off +option "weights" w "Print weights" flag off +option "no-uppercase" - "Do not process form containing uppercase letters" flag off + diff --git a/src/gue/common_guess.cc b/src/gue/common_guess.cc new file mode 100644 index 0000000..08a178b --- /dev/null +++ b/src/gue/common_guess.cc @@ -0,0 +1,60 @@ +#include +#include +#include "common_guess.h" + +int guess_count=0; +double delta=0.1; +int cut_off=100; +char dictionary[255]; +bool per_info=false; +bool weights=false; + +void process_guess_options(gengetopt_args_info* args) +{ + + if(args->dictionary_given) + { + expand_path(args->dictionary_arg,dictionary); + if(file_accessible(dictionary)!=0) + { + fprintf(stderr,"Cannot open the dictionary file: %s\nAborting.\n",dictionary); + exit(1); + } + } + else if (args->dictionary_home_given && args->language_given) + { + char buf[255]; + expand_path(args->dictionary_home_arg, buf); + sprintf(dictionary,"%s/%s/gue.bin",buf,args->language_arg); + if(file_accessible(dictionary)!=0) + { + fprintf(stderr,"Cannot open the dictionary file: %s\nAborting.\n",dictionary); + exit(1); + } + } + + if(args->guess_count_given) + guess_count=args->guess_count_arg; + else + guess_count=0; + + 
if(guess_count==0) + guess_count=100; + + if(args->delta_given) + delta=args->delta_arg; + else + delta=0.1; + + if(args->cut_off_given) + cut_off=args->cut_off_arg; + else + cut_off=100; + + if(args->per_info_given) + per_info=args->per_info_flag; + + if(args->weights_given) + weights=true; + +} diff --git a/src/gue/common_guess.h b/src/gue/common_guess.h new file mode 100644 index 0000000..c7a44bc --- /dev/null +++ b/src/gue/common_guess.h @@ -0,0 +1,20 @@ +#ifndef __COMMON_GUESS_H +#define __COMMON_GUESS_H + +#include +#define _CMDLINE_FILE "../gue/cmdline.h" +#include "../common/common.h" +#include "cmdline.h" + +#define DIC_FILE "gue.bin" + +extern int guess_count; +extern double delta; +extern int cut_off; +extern char dictionary[]; +extern bool per_info; +extern bool weights; + +void process_guess_options(gengetopt_args_info* args); + +#endif diff --git a/src/gue/guess.cc b/src/gue/guess.cc new file mode 100644 index 0000000..8537669 --- /dev/null +++ b/src/gue/guess.cc @@ -0,0 +1,138 @@ + +#include "guess.h" + +#include +#include +#include +#include +#include + +#define DICT 1 +#define COR 2 +#define DICT_P 3 +#define COR_P 4 + +#define W_PRE 0.1 +#define W_SUF 0.9 + +#define PREF_SIGN '_' + +Guess::Guess(const char* suf_file) + : _suf(suf_file) { + /* _suf = NULL; + _pref = NULL; + + if (strlen(suf_file) > 0) + _suf = new TFTiv(suf_file); + if (strlen(pref_file) > 0) + _pref = new TFTiv(corp_file); + */ +} + + + char buf[MAX_LINE]; + char out[MAX_LINE]; + char* buf0_s = buf; + char* word_t = NULL; + long state_s = 0; + unsigned length_s = buf0_s - buf; + long len = 0; + int i=0; + +int Guess::ana(const char* word, Words& result) { + + assert(word && &result); + + /* Word zawiera wyraz, ktory mamy zbadac. 
+ * Nalezy przepisac go w odwrotnej kolejnosci do bufora, + * znalezc najdluzszy prefiks pasujacy do tego bufora + * separatorem jest '/' - za tym znakiem znajduje sie + * prawdopodobienstwo wystapienia danego opisu */ + + buf0_s = buf; + word_t = strdup(word); + + if (reverse(word, buf) != 0) + return -1; + + + + state_s = -1; + // printf("#buf0_s=%s, ", buf0_s); + state_s = _suf.pref(buf0_s, PREF_SIGN); + // printf("#word=%s, buf0_s=%s\t", word, buf0_s); + /* jezeli state_s != -1 to oznacza, ze w slowniku jest zawarta + * informacja o prefiksie tego slowa. + * nie jest ona odwrocona, wiec porownujemy do word a nie do buf + */ + // printf("state_s=%d\t", state_s); + if (state_s != -1) { + state_s = _suf.pref(word_t, '~', state_s); + // printf("state_s(wp)=%d, word_t=%s, word=%s\n", state_s, word_t, word); + } + if (state_s == -1) { + // if (_suf != NULL) + buf0_s = buf; + state_s = _suf.pref(buf0_s, '~'); + // printf("state_s=%d\n", state_s); + } + + length_s = buf0_s - buf; + + /* state jest stanem, od ktorego zaczyna sie sciezka opisujaca + * prawdopodobienstwo przeciwienstwa wystapienia opisu + * znajdujacego sie dalej na tej sciezce. 
+ * Im mniejsza wartosc liczby tym wieksze prawdopodobienstwo */ + + len = 0; + i=0; + + // if (_suf != NULL) + len = _suf.cont(state_s, out); + while (len > 0) { + i++; + add_word_prob(result, word, out, length_s, DICT); + len = _suf.cont(-1, out); + } + + return i; + +} + + +int Guess::add_word_prob(Words& tab, const char* word, const char* path, unsigned len, int source) { + + /* Dodaje do tablicy tab wyraz word wraz + * z prawdopodobienstwem i opisem zawartym + * w sciezce path */ + + // printf("add_word_prob("); + // fflush(stdout); + char p[MAX_LINE]; + + strcpy(p, path); + + int probLen = strcspn(p, ";"); + char prob[probLen+1]; + strncpy(prob, p, probLen); + prob[probLen] = '\0'; + + char* desc = p + probLen+1; // +2 bo pomijamy jeszcze znak ';' + + int i = tab.add(word, desc); + + if (source==DICT) { + tab[i].len_suf(len); + tab[i].w_suf(atof(prob)); // + W_PRE*tab[i].w_suf())); + // tab[i].w_suf((float)(W_SUF*(1000-atof(prob)) + W_PRE*tab[i].w_suf())); + } +// if (source==COR) { +// tab[i].len_pref(len); +// tab[i].w_pref(W_SUF*(1000-atof(prob)) + W_PRE*tab[i].w_pref()); +// } +// printf(")\n"); +// fflush(stdout); + + return i; + +} diff --git a/src/gue/guess.h b/src/gue/guess.h new file mode 100644 index 0000000..68b7584 --- /dev/null +++ b/src/gue/guess.h @@ -0,0 +1,56 @@ + +#include "../lib/tfti.h" +#include "../lib/word.h" + +#include + +/************************************************************** + * Zawiera definicje klasy Guess. * + * * + * Klasa ta pozwala na okreslenie opisu slowa nie * + * znajdujacego sie w slowniku wraz z prawdopodobienstwem * + * jego wystapienia. 
* + *************************************************************/ + +class Guess { + + public: + + // nazawa pliku slownika w parametrze + Guess(const char* suf_file); + + // zwraca tablice opisow slowa wraz z prawdopodobienstwem ich wystapienia + int ana(const char* word, Words& result); + + long time_overall; + + private: + + // sufiksy + TFTiv _suf; + + // prefiksy + TFTiv _pref; + + //odwraca ciag znakow + int reverse(const char* src, char* dest) { + + // assert((src != NULL) && (dest != NULL)); + + const char* c = src; + + int len = strlen(src); + + for (int i=1; i<=len; ++i) { + dest[i-1] = src[len-i]; + } + + dest[len] = '\0'; + + return 0; + } + + //dodaje nowy element do tablicy WordsProb + int add_word_prob(Words& tab, const char* word, const char* path, unsigned len, int source); + +}; diff --git a/src/gue/main.cc b/src/gue/main.cc new file mode 100644 index 0000000..0f09a43 --- /dev/null +++ b/src/gue/main.cc @@ -0,0 +1,237 @@ +#include +#include +#include "../lib/iotools.h" +#define _CMDLINE_FILE "../gue/cmdline.h" +#define CONFIGFILE1 "/home/ynka/utt/utt-0.9/conf/gue.conf" +#define CONFIGFILE2 "/home/ynka/utt/utt-0.9/conf/gue.conf" +#include "../common/common.h" +#include "common_guess.h" +#include "guess.h" +#include "cmdline.h" + +#define W_SUFF 0.6 +#define W_PREF 0.4 + + +int main(int argc, char** argv) { + + // int non_standard_config=0; + + gengetopt_args_info args; + + if(cmdline_parser(argc, argv, &args) != 0) + exit(1); + + process_config_files(&args,argv[0]); + process_common_options(&args,argv[0]); + process_guess_options(&args); + + char line[MAX_LINE]; + char outline[MAX_LINE]; + char parms[MAX_LINE], desc[MAX_LINE], lemma[MAX_LINE]; + long line_count = 0; + + Guess guess(dictionary); + int words_count=0; + time_t start_time = time(NULL); + + // Segment seg; + Words tab; + while (fgets(line, MAX_LINE, inputf)) + { + line_count++; + int start, len; + + // line[strlen(line)-1] = '\0'; + + if (!process_seg(line, args)) + 
fputs(line,outputf); + else + { + char form[MAX_FORM]; + words_count++; + tab.clear(); + getfield(line,input_field_prefix,form); + if (form==NULL) continue;//BZDURA + + guess.ana(form, tab); + + if ((tab.count()==0) && (!args.no_fail_flag)) // no guesses - analysis was unsuccessful + fputs(line, failedf); + else + { + +// if (copy_processed) +// fputs(line, stdout); +// continue; +// } + // we've got some guesses. Do we want to print it? +// if (args.only_fail_flag) +// continue; + + float last_weight=0; + int i=0; + int count=0; + unsigned first=1; + char* parms_end = parms; + char last_lemma[MAX_LINE]; + + count = 1; + + tab.sort(); + + while (count < tab.count() && count <= guess_count) + if (first || tab[count].w_suf() >= cut_off && tab[count].w_suf() >= delta * last_weight) + { + first=0; + last_weight = tab[i].w_suf(); + count++; + } + else + break; + + // drukujemy count pierwszych z tab + + + if(one_line) + { + char* descp=desc; + for (int i=0; i< count; ++i) + { + descp += sprintf(descp," %s%s,%s", output_field_prefix, tab[i].lemma(), tab[i].descr()); + if(weights) descp += sprintf(descp,":%d",(int)tab[i].w_suf()); + } + strcpy(outline,line); + outline[strlen(outline)-1]='\0'; + strcat(outline,desc); + strcat(outline,"\n"); + fputs(outline, outputf); + if (copy_processed) + fputs(line,outputf); + } + else if(one_field) + { + char* descp=desc; + for (int i=0; i< count; ++i) + if(i==0) + { + descp += sprintf(descp," %s%s,%s", output_field_prefix, tab[i].lemma(), tab[i].descr()); + if(weights) descp += sprintf(descp,":%d",(int)tab[i].w_suf()); + } + else + { + if(strcmp(tab[i].lemma(),tab[i-1].lemma())==0) + descp += sprintf(descp,",%s",tab[i].descr()); + else + descp += sprintf(descp,";%s,%s",tab[i].lemma(),tab[i].descr()); + if(weights) descp += sprintf(descp,":%d",(int)tab[i].w_suf()); + } + + strcpy(outline,line); + outline[strlen(outline)-1]='\0'; + strcat(outline,desc); + strcat(outline,"\n"); + fputs(outline, outputf); + if (copy_processed) + 
fputs(line,outputf); + } + else + { + for (int i=0; i< count; ++i) + { + // kolejne opisy - kolejne linie. + char* descp=desc; + descp += sprintf(desc, " %s%s,%s", output_field_prefix, tab[i].lemma(), tab[i].descr()); + if(weights) descp += sprintf(descp,":%d",(int)tab[i].w_suf()); + descp += sprintf(descp,"\n"); + strcpy(outline,line); + outline[strlen(outline)-1]='\0'; + strcat(outline,desc); + fputs(outline, outputf); + } + if (copy_processed) + fputs(line,outputf); + } + } + } + if(args.interactive_flag) + fflush(outputf), fflush(failedf); + + } + cmdline_parser_free(&args); +} + + + + + + + + +// while ((i=tab.next()) != -1 && count++) { + chomp; + s/#.*//; + s/^\s+//; + s/\s+$//; + next unless length; + my ($name, $value) = split(/\s*=\s*/, $_, 2); + if(($name eq "left")or($name eq "l")){ + $l=$value; + } + elsif(($name eq "right")or($name eq "r")){ + $r=$value; + } + elsif(($name eq "trim")or($name eq "t")){ + $trim=1; + } + elsif(($name eq "white")or($name eq "w")){ + $white=1; + } + elsif($name eq "bom"){ + $bon=$value; + } + elsif($name eq "eom"){ + $eon=$value; + } + elsif($name eq "bod"){ + $bod=$value; + } + elsif($name eq "eod"){ + $eod=$value; + } + elsif(($name eq "column")or($name eq "c")){ + $column=$value; + } + elsif(($name eq "ignore")or($name eq "i")){ + $ignore=1; + } + elsif(($name eq "help")or($name eq "h")){ + $help=1; + } + + } + close CONFIG; + } +} +######################################################### + +GetOptions("left|l=s" => \$l, + "right|r=s" => \$r, + "trim|t" => \$trim, + "white|w" => \$white, + "bom=s" => \$bon, + "eom=s" => \$eon, + "bod=s" => \$bod, + "eod=s" => \$eod, + "column|c=s" => \$column, + "ignore|i" => \$ignore, + "help|h" => \$help); + +if(!($column=~/^[0-9]+$/)){$column=0;} + +if($help) +{ + print <<'END' +Options: + --help -h Help. 
+ --left -l Left context info (default='30c') + Examples: + -l=5c: left context is 5 characters + -l=5w: left context is 5 words + -l=5s: left context is 5 non-empty input lines + -l='\s*\S+\sr\S+BOS': left context starts with the given regex + --right -r Right context info (default='30c') + --trim -t Clear incomplete words from output + --white -w DO NOT change all white characters into spaces + --column -c Left column minimal width in characters (default = 0) + --ignore -i Ignore input inconsistency + --bon Beginning of selected segment + (regex, default='[0-9]+ [0-9]+ BOM .*') + --eon End of selected segment + (regex, default='[0-9]+ [0-9]+ EOM .*') + --bod Selected segment beginning display (default='[') + --eod Selected segment end display (default=']') + +END +; + exit 0; +} + + +my $seg_no=0; +my $seg_size=0; + +my $left_type; +my $left_size; +my $right_type; +my $right_size; + +set_lr_types($l, $r, \$left_type,\$left_size,\$right_type,\$right_size, $trim); + + +my $inn=0; +my $after_bos=0; +my $before_eos=0; + +my @LEFT; #tablica skalarw +my @CENTER; #tablica skalarw +my @RIGHT; + +my @current_center; +my @current_left; #skalar dla c, w pp. 
tablica +my @current_left_words; +my @current_right_words_number; + + +while(<>){ + my $line = $_; + chomp $line; + my @line = split / /, $line; + my $line_s=@line; + + if(!line_format_ok(@line)){next;} + + if(!$white){white_into_spaces(\@line);} + else{if($line[2] eq "S"){symbols_into_white(\$line[3]);}} + + if(!input_consistent(\$seg_no,\$seg_size,$line[0],$line[1],$ignore)){ + eof_or_inconsistency(\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,$left_type,$right_type); + @current_center=(); + @current_left=(); + @current_left_words=(); + @current_right_words_number=(); + $after_bos=0; + $before_eos=0; + } + + remember_current_left($left_type,$left_size,\@current_left,\@line, \@current_left_words, $line, \$after_bos, \$before_eos); + remember_center($line,\@line,\$inn,\@current_center,$white,\@CENTER,\@current_left,\@LEFT, \$after_bos, \$before_eos, \@RIGHT, \@current_right_words_number); + remember_right($right_type,$left_type,$right_size,\@line,\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,\@current_right_words_number, $line, \$before_eos); +} + +eof_or_inconsistency(\@LEFT,\@CENTER,\@RIGHT,$bod,$eod,$white,$column,$trim,$left_type,$right_type); +exit(0); + +#################procedury############################### + +sub line_format_ok{ + my @line = @_; + my $size = @line; + if($size<4){return 0;} + if($line[0]!~/[0-9]+/){return 0;} + if($line[1]!~/[0-9]+/){return 0;} + return 1; + } + +sub white_into_spaces{ + my $line_ref=shift; + if(@{$line_ref}[2] eq "S"){ + @{$line_ref}[3]=" "; + } + } + +sub symbols_into_white{ + my $string_ref=shift; + ${$string_ref} =~ s/\\n/\n/g; + ${$string_ref} =~ s/\\t/\t/g; + ${$string_ref} =~ s/_/ /g; + } + +sub white_into_symbols{ + my $string_ref=shift; + ${$string_ref} =~ s/\n/\\n/g; + ${$string_ref} =~ s/\t/\\t/g; + ${$string_ref} =~ s/ /_/g; + } + +sub input_consistent{ + my $seg_no_ref = shift; + my $seg_size_ref = shift; + my $line0 = shift; + my $line1 = shift; + my $ig = shift; + my $ok=1; + + 
if(${$seg_no_ref}!=0&&(!$ig)){ + my $distance = $line0-${$seg_size_ref}; + if($distance!=${$seg_no_ref}){$ok=0;} + } + ${$seg_no_ref}=$line0; + ${$seg_size_ref}=$line1; + return $ok; + } + +sub set_lr_types{ + my $left = shift; + my $right = shift; + my $left_type_ref =shift; + my $left_size_ref =shift; + my $right_type_ref =shift; + my $right_size_ref =shift; + my $do_trim=shift; + + if($left=~/[0-9]+c/){ + ${$left_type_ref}='c'; + ${$left_size_ref}=get_number($left); + if($do_trim){${$left_size_ref}++;} + } + else{ + if($left=~/[0-9]+w/){ + ${$left_type_ref}='w'; + ${$left_size_ref}=get_number($left); + } + else{ + if($left=~/[0-9]+s/){ + ${$left_type_ref}='s'; + ${$left_size_ref}=get_number($left); + } + else{ + ${$left_type_ref}=$left; + } + } + } + +if($right=~/[0-9]+c/){ + ${$right_type_ref}='c'; + ${$right_size_ref}=get_number($right); + if($do_trim){${$right_size_ref}++;} + } + else{ + if($right=~/[0-9]+w/){ + ${$right_type_ref}='w'; + ${$right_size_ref}=get_number($right); + } + else{ + if($right=~/[0-9]+s/){ + ${$right_type_ref}='s'; + ${$right_size_ref}=get_number($right); + } + else{ + ${$right_type_ref}=$right; + } + } + } + } + +sub get_number{ + my $string = shift; + my @letters = split(//,$string); + my $i=0; + while($letters[$i]=~/[0-9]/){$i++;} + my $j; + my $number=0; + my $ten=1; + for($j=$i-1;$j>=0;$j--){ + $number+=$letters[$j]*$ten; + $ten*=10; + } + return $number; + } + +sub remember_center{ + my $lin = shift; + my $lin_ref = shift; + my $inn_ref = shift; + my $current_center_ref = shift; + my $white_info = shift; + my $CENTER_REF = shift; + my $current_left_ref = shift; + my $LEFT_REF = shift; + my $after_bos_ref = shift; + my $before_eos_ref = shift; + my $RIGHT_REF = shift; + my $current_words_right_number_ref = shift; + + if((!${$inn_ref}) && $lin=~/$bon/){ + ${$inn_ref}=1; + @{$current_center_ref}=(); + ${$after_bos_ref}=0; + + push(@{$LEFT_REF},join('',@{$current_left_ref})); + + } + if(${$inn_ref} && $lin=~/$eon/){ + ${$inn_ref}=0; + 
push(@{$CENTER_REF},join('',@{$current_center_ref})); + ${$before_eos_ref}=1; + my @new_table; + push(@{$RIGHT_REF},\@new_table); + push(@{$current_words_right_number_ref},0); + } + if($inn && index($lin,'*')==-1){ + white_into_symbols(\${$lin_ref}[3]); + if($white_info){push(@{$current_center_ref},${$lin_ref}[3]);} + else{push(@{$current_center_ref},${$lin_ref}[3]);} + } + } + +sub remember_current_left{ +my $type=shift; +my $size=shift; +my $ref=shift; +my $line_ref=shift; + if($type eq 'c'){ + if(!(${$line_ref}[3] eq '*')){ + push(@{$ref},split('',${$line_ref}[3])); + my $lsize = @{$ref}; + if($lsize>$size){splice(@{$ref},0,$lsize-$size);} + } + } + else{ + if($type eq 'w'){ + my $words_ref = shift; + if(!(${$line_ref}[3] eq '*')){ + push(@{$ref},${$line_ref}[3]); + if(${$line_ref}[2] eq 'W'){ + push(@{$words_ref},${$line_ref}[3]); + } + my $lsize = @{$words_ref}; + if($lsize>$size){ + my $word = ${$words_ref}[1]; + splice(@{$words_ref},0,1); + while(!(${$ref}[0] eq $word)){splice(@{$ref},0,1); } + } + } + + } + else{ + if($type eq 's'){ + if(!(${$line_ref}[3] eq '*')){ + push(@{$ref},${$line_ref}[3]); + my $lsize = @{$ref}; + if($lsize>$size){splice(@{$ref},0,$lsize-$size);} + } + } + else{#bos/eos + shift; + my $line = shift; + my $after_bos_ref = shift; + my $before_eos_ref = shift; + if($line=~/$type/){ + ${$after_bos_ref}=1; + @{$ref}=(); + } + if(${$after_bos_ref} && !(${$line_ref}[3] eq '*')){ + push(@{$ref},${$line_ref}[3]); + } + } + } + } + } + +sub remember_right{ +my $type=shift; +my $type_left=shift; +my $size=shift; +my $line_ref=shift; +my $LEFT_REF=shift; +my $CENTER_REF=shift; +my $RIGHT_REF=shift; +my $bod=shift; +my $eod=shift; +my $w=shift; +my $c=shift; +my $t=shift; + + if($type eq 'c'){ + if(!(${$line_ref}[3] eq '*')){ + my $right_size = @{$RIGHT_REF}; + for(my $i=0; $i<$right_size; $i++){ + push(@{${$RIGHT_REF}[$i]}, split('',${$line_ref}[3])); + my $lsize = @{${$RIGHT_REF}[$i]}; + if($lsize>=$size){ + 
splice(@{${$RIGHT_REF}[$i]},$size-1); #print and remove
+          print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
+          $right_size = @{$RIGHT_REF};
+          $i--;
+        }
+      }
+    }
+  }
+  else{
+    if($type eq 'w'){
+      # one word counter per pending right context, parallel to @{$RIGHT_REF}
+      my $words_number_ref = shift;
+      if(!(${$line_ref}[3] eq '*')){
+        my $right_size = @{$RIGHT_REF};
+        for(my $i=0; $i<$right_size; $i++){
+          push(@{${$RIGHT_REF}[$i]},${$line_ref}[3]);
+          if(${$line_ref}[2] eq 'W'){
+            ${$words_number_ref}[$i]=${$words_number_ref}[$i]+1;
+            if(${$words_number_ref}[$i]==$size){
+              print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
+              # Remove the counter of the entry just printed while $i still
+              # points at it; splicing after $i-- would drop a neighbour's
+              # counter (or the last one when $i was 0).
+              splice(@{$words_number_ref},$i,1);
+              $right_size = @{$RIGHT_REF};
+              $i--;
+            }
+          }
+        }
+      }
+    }
+    else{
+      if($type eq 's'){
+        if(!(${$line_ref}[3] eq '*')){
+          my $right_s = @{$RIGHT_REF};
+          for(my $i=0; $i<$right_s; $i++){
+            push(@{${$RIGHT_REF}[$i]},${$line_ref}[3]);
+            my $rsize=@{${$RIGHT_REF}[$i]};
+            if($rsize==$size){
+              print_and_remove($i,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
+              $right_s = @{$RIGHT_REF};
+              $i--;   # the next entry has moved into slot $i
+            }
+          }
+        }
+      }
+      else{#bos/eos
+        shift;   # skip the word-counter argument, unused in this mode
+        my $line = shift;
+        my $before_eos_ref = shift;
+        if(${$before_eos_ref}){
+          if(!(${$line_ref}[3] eq '*')){
+            #only 1 entry
+            push(@{${$RIGHT_REF}[0]},${$line_ref}[3]);
+          }
+          if($line=~/$type/){
+            ${$before_eos_ref}=0;
+            print_and_remove(0,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bod,$eod,$w,$c,$t,$type_left,$type);
+          }
+        }
+      }
+    }
+  }
+}
+
+# Prints one output line for entry $index -- left context, $bdis, centre,
+# $edis, right context -- and removes that entry from the three arrays.
+sub print_and_remove{
+  my $index = shift;
+  my $LEFT_REF = shift;
+  my $CENTER_REF = shift;
+  my $RIGHT_REF = shift;
+  my $bdis = shift;
+  my $edis = shift;
+  my $white = shift;
+  my $column = shift;
+  my $trim = shift;
+  my $left_type = shift;
+  my $right_type = shift;
+
+  my $left_string = "${$LEFT_REF}[$index]";
+  my $right_string = join('',@{${$RIGHT_REF}[$index]});
+
+  # only character-sized contexts can end in the middle of a word
+  if($trim){
+    if($left_type eq "c"){$left_string=trim_left($left_string);}
+    if($right_type eq "c"){$right_string=trim_right($right_string);}
+  }
+
+
if(length($left_string)<$column){$left_string=" "x($column-length($left_string)).$left_string;}
+
+  if($white){
+    white_into_symbols(\$left_string);
+    white_into_symbols(\$right_string);
+    #the line below was added on 18 November
+    white_into_symbols(\${$CENTER_REF}[$index]);
+  }
+
+  print $left_string;
+  print $bdis;
+
+  #the 3 lines below (i.e. 1 block) were added on 18 November
+  if(!$white){
+    symbols_into_white(\${$CENTER_REF}[$index]);
+  }
+
+  print "${$CENTER_REF}[$index]";
+  print $edis;
+  print $right_string;
+  print "\n";
+
+  # the entry has been printed: drop it from all three parallel arrays
+  splice(@{$LEFT_REF},$index,1);
+  splice(@{$CENTER_REF},$index,1);
+  splice(@{$RIGHT_REF},$index,1);
+}
+
+# Drops the (possibly incomplete) first word of $string: everything up to and
+# including the first space, newline or tab.  A string that starts with a
+# space only loses that space; a string without whitespace is returned
+# unchanged.
+sub trim_left{
+  my $string = shift;
+  if(substr($string,0,1) eq " "){return substr($string,1);}
+  my $position = index($string," ");
+  my $temp_position = index($string,"\n");
+  # "!$x==-1" parses as "(!$x)==-1", which is never true, so the test has to
+  # be written with "!=" for newlines and tabs to be considered at all.
+  if($temp_position!=-1&&($position==-1||$temp_position<$position)){$position=$temp_position;}
+  $temp_position = index($string,"\t");
+  if($temp_position!=-1&&($position==-1||$temp_position<$position)){$position=$temp_position;}
+  return substr($string,$position+1);
+}
+
+# Drops the (possibly incomplete) last word of $string: everything from the
+# last space, newline or tab onwards.  A string that ends with a space only
+# loses that space.
+# NOTE(review): with no whitespace at all rindex yields -1 and
+# substr($string,0,-1) removes just the final character -- confirm intended.
+sub trim_right{
+  my $string = shift;
+  my $length = length($string);
+  if(substr($string,$length-1,1) eq " "){return substr($string,0,$length-1);}
+  my $position = rindex($string," ");
+  my $temp_position = rindex($string,"\n");
+  if($temp_position>$position){$position=$temp_position;}
+  $temp_position = rindex($string,"\t");
+  if($temp_position>$position){$position=$temp_position;}
+  return substr($string,0,$position);
+}
+
+# Flushes every pending entry: prints and removes entry 0 until the centre
+# array is empty.  Called at end of input and when the input is inconsistent.
+sub eof_or_inconsistency{
+  my $LEFT_REF = shift;
+  my $CENTER_REF = shift;
+  my $RIGHT_REF = shift;
+  my $bdis = shift;
+  my $edis = shift;
+  my $white = shift;
+  my $column = shift;
+  my $trim = shift;
+  my $left_type = shift;
+  my $right_type = shift;
+
+  # print_and_remove() shortens @{$CENTER_REF} by one on every call
+  while(@{$CENTER_REF}){
+    print_and_remove(0,$LEFT_REF,$CENTER_REF,$RIGHT_REF,$bdis,$edis,$white,$column,$trim,$left_type,$right_type);
+  }
+}
diff --git a/src/kor/Makefile b/src/kor/Makefile
new file mode 100644
index 0000000..90ef1ae
--- /dev/null
+++ b/src/kor/Makefile
@@ -0,0 +1,67 @@
+include ../../config.mak
+
+ifeq ($(BUILD_STATIC), yes)
+  LDFLAGS += -static
+endif
+
+CXXFLAGS += -fpermissive -O2
+
+LIB_PATH=../lib
+COMMON_PATH=../common
+# Handed to the compiler as -D _CMDLINE_FILE; presumably lets the shared
+# sources in COMMON_PATH include this tool's generated cmdline.h.
+CMDLINE_FILE='"../kor/cmdline.h"'
+
+
+kor: main.cc corr.o corlist.o cmdline.o $(LIB_PATH)/word.o \
+     $(LIB_PATH)/auttools.o cmdline.c common_cor.o common.o
+	$(CXX) $(CXXFLAGS) -D _CMDLINE_FILE=$(CMDLINE_FILE) main.cc corlist.o corr.o common.o \
+	  $(LIB_PATH)/word.o $(LIB_PATH)/auttools.o cmdline.c common_cor.o \
+	  -o kor $(LDFLAGS)
+
+corr.o: corr.cc corr.hh cmdline.h
+	$(CXX) -c $(CXXFLAGS) -D _CMDLINE_FILE=$(CMDLINE_FILE) corr.cc
+
+corlist.o: corlist.cc corlist.h cmdline.h
+	$(CXX) -c $(CXXFLAGS) -D _CMDLINE_FILE=$(CMDLINE_FILE) corlist.cc
+
+common.o: cmdline.h $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \
+          $(COMMON_PATH)/common.h
+	$(CXX) -c $(CXXFLAGS) -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc
+
+common_cor.o: cmdline.h common_cor.cc common_cor.h
+	$(CXX) -c $(CXXFLAGS) -D _CMDLINE_FILE=$(CMDLINE_FILE) common_cor.cc
+
+# One gengetopt run produces both cmdline.c and cmdline.h.  Only cmdline.c
+# carries the recipe and cmdline.h hangs off it, so a parallel build cannot
+# start the generator twice.
+cmdline.c: cmdline.ggo
+	$(GENGETOPT) -i cmdline.ggo --conf-parser
+
+cmdline.h: cmdline.c ;
+
+# The option description is this tool's options followed by the shared ones.
+cmdline.ggo: cmdline_cor.ggo $(COMMON_PATH)/cmdline_common.ggo
+	cat $^ > $@
+
+.PHONY: install
+install:
+ifdef BIN_DIR
+	install -m 0755 kor $(BIN_DIR)
+endif
+
+# -f: do not fail when kor was never installed.
+.PHONY: uninstall
+uninstall:
+ifdef BIN_DIR
+	rm -f $(BIN_DIR)/kor
+endif
+
+.PHONY: clean clean.cmdline
+clean: clean.cmdline
+	rm -f *.o
+	rm -f kor
+
+# Removes every cmdline.* file, including the concatenated cmdline.ggo.
+clean.cmdline:
+	rm -f cmdline.*
diff --git a/src/kor/cmdline_cor.ggo b/src/kor/cmdline_cor.ggo
new file mode 100644
index 0000000..3738d59
--- /dev/null
+++ b/src/kor/cmdline_cor.ggo
@@ -0,0 +1,13 @@
+package "kor"
+version "0.1"
+
+option "dictionary-home" - "Dictionary home dir."
string typestr="FILENAME" no hidden +option "dictionary" d "Dictionary" string typestr="FILENAME" default="cor.bin" no +option "distance" D "Maximal edit distance." int default="1" no +option "replace" r "Replace original form with corrected form, place original form in the cor field. This option has no effect in single mode" flag off +#option "single" - "Place all alternatives in the same line" flag off +option "weights" w "File with translation rules." string typestr="FILENAME" default="weight.cor" no +option "threshold" t "Edit distance threshold" float default="1" no +option "show-scores" - "Show scores" flag off +option "count" n "Print only count best results" int no + diff --git a/src/kor/common_cor.cc b/src/kor/common_cor.cc new file mode 100644 index 0000000..59e8b3c --- /dev/null +++ b/src/kor/common_cor.cc @@ -0,0 +1,48 @@ +#include +#include +#include "common_cor.h" + +#define MAX_PATH_LENGTH 255 + +char dictionary[MAX_PATH_LENGTH]; +char file_weights[MAX_PATH_LENGTH]; +float threshold; +bool show_scores = false; +int result_count; + +void process_cor_options(gengetopt_args_info* args) +{ + if(args->dictionary_given) + { + expand_path(args->dictionary_arg,dictionary); + if(file_accessible(dictionary)!=0) + { + fprintf(stderr,"Cannot open the dictionary file: %s\nAborting.\n",dictionary); + exit(1); + } + } + else if (args->dictionary_home_given && args->language_given) + { + char buf[MAX_PATH_LENGTH]; + expand_path(args->dictionary_home_arg, buf); + sprintf(dictionary,"%s/%s/cor.bin",buf,args->language_arg); + if(file_accessible(dictionary)!=0) + { + fprintf(stderr,"Cannot open the dictionary file: %s\nAborting.\n",dictionary); + exit(1); + } + } + + expand_path(args->weights_arg, file_weights); + + threshold = args->threshold_arg; + + show_scores = args->show_scores_flag; + + if(args->count_given) { + result_count = args->count_arg; + } + else { + result_count = 0; + } +} diff --git a/src/kor/common_cor.h b/src/kor/common_cor.h new file mode 100644 
index 0000000..5572cbd --- /dev/null +++ b/src/kor/common_cor.h @@ -0,0 +1,25 @@ +#ifndef __COMMON_COR_H +#define __COMMON_COR_H + +// SEKCJA STALYCH + +#define MAX_LEN 2 +#define PREC 1000 +#define Weight int + +// SEKCJA INCLUDOW +#include "../common/common.h" +#include "cmdline.h" + + +// SEKCJA GENGETOPT +extern int change_count; +extern void process_cor_options(gengetopt_args_info* args); +extern char dictionary[]; +extern char file_weights[]; +extern float threshold; +extern bool show_scores; +extern int result_count; + +#endif + diff --git a/src/kor/corlist.cc b/src/kor/corlist.cc new file mode 100644 index 0000000..d5ba81d --- /dev/null +++ b/src/kor/corlist.cc @@ -0,0 +1,70 @@ +#include +#include +#include "corlist.h" + +#define min(x,y) ((xMAX_LEN) { printf("ERROR in file %s: the string '%s' exceeds maximum length of %d characters.\n",Name,a,MAX_LEN); fclose(f); return -1; } + if (w.lb>MAX_LEN) { printf("ERROR in file %s: the string '%s' exceeds maximum length of %d characters.\n",Name,b,MAX_LEN); fclose(f); return -1; } + strcpy(w.a,a), strcpy(w.b,b); + total++; + List = (CorWeight*)realloc(List,total*sizeof(CorWeight)); + List[total-1]=w; + // printf("%s\t<->\t%s\t%1.2f\n",w.a,w.b,((float)w.w/PREC)); + } + } + fclose(f); + // printf("Total: %d\n\n",total); + return(total); +} diff --git a/src/kor/corlist.h b/src/kor/corlist.h new file mode 100644 index 0000000..5626d14 --- /dev/null +++ b/src/kor/corlist.h @@ -0,0 +1,20 @@ +#ifndef _CORLIST_H +#define _CORLIST_H + +//#include +#include "common_cor.h" + +typedef struct { char a[MAX_LEN+1],b[MAX_LEN+1]; Weight w; short la,lb; } CorWeight; + +class CorList +{ + private: + CorWeight *List; + int total; + public: + Weight cor_stdcor, cor_xchg; + int loadCWL(char *Name); + Weight GetValue(char X[100], char Y[100], Weight (*H2)[100], int i, int j); +}; + +#endif diff --git a/src/kor/corr.cc b/src/kor/corr.cc new file mode 100644 index 0000000..7a7afc2 --- /dev/null +++ b/src/kor/corr.cc @@ -0,0 +1,163 @@ 
+//--------------------------------------------------------------------------- +#include "common_cor.h" +#include "corr.hh" + +#define MAXPATH 256 + +#define min(x,y) ((xy)?(x):(y)) + + +Weight Corr::ed(int i,int j) +{ + if(i==-1) + return (j+1)*CL.cor_stdcor; // moje* Nie wiem czy tak bdzie dobrze, ale uznaem, e poza tablic powinny by wartosci przemnoone przez wag standardowej zmiany litery + if(j==-1) + return (i+1)*CL.cor_stdcor; // moje* + if(i==-2 || j==-2) + return (n+1)*CL.cor_stdcor; // moje* + + if(X[i]==Y[j]) + return min(H2[i-1][j-1], min(CL.cor_stdcor+min(H2[i][j-1],H2[i-1][j]),CL.GetValue(X,Y,H2,i,j))); + if(X[i-1]==Y[j] && X[i]==Y[j-1]) + return min(min(CL.cor_xchg+H2[i-2][j-2],CL.cor_stdcor+min(H2[i][j-1],H2[i-1][j])), CL.GetValue(X,Y,H2,i,j)); + return min(CL.cor_stdcor+min(H2[i-1][j-1],min(H2[i][j-1],H2[i-1][j])), CL.GetValue(X,Y,H2,i,j)); + +/* // wersja z wagami ale dla floatw + if(X[i]==Y[j])//zielone-> <- niebieskie -> <- rzowe -> + return min(H2[i-1][j-1], min(1+min(H2[i][j-1],H2[i-1][j]),CL.GetValue(X,Y,H2,i,j))); + if(X[i-1]==Y[j] && X[i]==Y[j-1]) + return min(1+min(H2[i-2][j-2],min(H2[i][j-1],H2[i-1][j])), CL.GetValue(X,Y,H2,i,j)); + return min(1+min(H2[i-1][j-1],min(H2[i][j-1],H2[i-1][j])), CL.GetValue(X,Y,H2,i,j)); +*/ + +/* // normalna wersja + if(X[i]==Y[j]) + return H2[i-1][j-1]; + if(X[i-1]==Y[j] && X[i]==Y[j-1]) + return 1+min(H2[i-2][j-2],min(H2[i][j-1],H2[i-1][j])); + return 1+min(H2[i-1][j-1],min(H2[i][j-1],H2[i-1][j])); +*/ + +/* + if(X[i]==Y[j]) + return H[(i-1)+2][(j-1)+2]; + if(X[i-1]==Y[j] && X[i]==Y[j-1]) + return 1+min(H[(i-2)+2][(j-2)+2],min(H[(i)+2][(j-1)+2],H[(i-1)+2][(j)+2])); + return 1+min(H[(i-1)+2][(j-1)+2],min(H[(i)+2][(j-1)+2],H[(i-1)+2][(j)+2])); +*/ +} + +int Corr::load2(char *Name) // moje +{ + return CL.loadCWL(Name); +} + +Weight Corr::cuted(int j) +{ + int l=max(0,j-t); + int u=min(m,j+t); + Weight ce=(j+t)*PREC; // moje* + for(int k=l;k<=u;k++) + { + if(H2[k][j]0) + j--; + else + more=0; + while(more && 
!continued(path[j])); + state=path[j]+1; + } + return count; +} + + +//--------------------------------------------------------------------------- + diff --git a/src/kor/corr.hh b/src/kor/corr.hh new file mode 100644 index 0000000..fcbf669 --- /dev/null +++ b/src/kor/corr.hh @@ -0,0 +1,39 @@ +//--------------------------------------------------------------------------- +#ifndef _corr_hh +#define _corr_hh +//--------------------------------------------------------------------------- + +#include "../lib/tfti.h" +#include "../lib/word.h" +#include "corlist.h" +#include "../common/common.h" + +class Corr : public TFTiv +{ +private: + Weight H[100][100]; + char X[100]; // misspelled string + char Y[100]; // (possibly partial) candidate string + int m; // length of X + int n; // maximal length of Y + + Weight ed(int,int); + Weight cuted(int); + void recomputeH(int); + + +public: + Weight (*H2)[100]; // moje: zmiana z int na Weight (float) + int t; // threshold + CorList CL; // moje + + Corr() : H2((Weight(*)[100])&H[2][2]) {}; // moje (int->float) + Corr(const char* a) : TFTiv(a), H2((Weight(*)[100])&H[2][2]) { }; + + int correct(const char* w, Words& tab); + + int load2(char *Name); // moje +}; + +//--------------------------------------------------------------------------- +#endif diff --git a/src/kor/main.cc b/src/kor/main.cc new file mode 100644 index 0000000..8095c7b --- /dev/null +++ b/src/kor/main.cc @@ -0,0 +1,174 @@ +#include +#include +#include "../lib/iotools.h" +#include "common_cor.h" +#include "corr.hh" +#include + + +int main(int argc, char** argv) { + +// setlocale(LC_CTYPE,""); +// setlocale(LC_COLLATE,""); + + gengetopt_args_info args; + + if(cmdline_parser(argc, argv, &args) != 0) + exit(1); + + process_config_files(&args,argv[0]); + process_common_options(&args,argv[0]); + process_cor_options(&args); + + Corr cor; + + //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+// strcpy(dictionary,"cor.bin"); + + cor.load(dictionary); + cor.t=args.distance_arg; + + //>>>>>>>>>>>>>>> + cor.CL.cor_stdcor=1*PREC; + cor.CL.cor_xchg=1*PREC; + if (cor.load2(file_weights)==-1) return -1; // moje + cor.t=1*PREC; // ODLEGLOSC EDYCYJNA + //<<<<<<<<<<<<<< + + char line[MAX_LINE+1]; + long line_count = 0; + + Segment seg; + Words tab; + char form1[MAX_LINE]; + char* form; + int formcasing; + char corfield[MAX_LINE]=""; + + while (fgets(line, MAX_LINE, inputf)) + { + ++line_count; + char outline[128]; + + if (!process_seg(line, args)) + fputs(line, outputf); + else + { + char form[MAX_FORM]; + + tab.clear(); + getfield(line,input_field_prefix,form); + if (form==NULL) continue; + + formcasing=3; + cor.correct(form, tab); + + if( tab.count() == 0 ) + { + formcasing=casing(form); + if( formcasing == 1 || formcasing == 2) + tolowers(form, form1), cor.correct(form1, tab); + } + + if ( tab.count() == 0) + fputs(line, failedf); + else + { + tab.sort(); + + int max_cnt = 0; + if(result_count < 1) { + max_cnt = tab.count(); + } + else { + max_cnt = (tab.count() < result_count) ? 
include ../../config.mak

# `kot` is a ready-to-run Perl script kept in this directory, so there is
# nothing to build; the empty rule only provides a default goal.
kot:

# Install the script into BIN_DIR (skipped when BIN_DIR is not configured).
.PHONY: install
install:
ifdef BIN_DIR
	install -m 0755 kot $(BIN_DIR)
endif

# Remove the installed script. $(RM) is `rm -f`, so uninstalling twice (or
# before installing) no longer fails.
.PHONY: uninstall
uninstall:
ifdef BIN_DIR
	$(RM) $(BIN_DIR)/kot
endif

# Nothing is generated here; declared phony so that a stray file named
# `clean` cannot shadow the target.
.PHONY: clean
clean:
# Build rules for the `lem` lemmatizer.

LDFLAGS  += -static
CXXFLAGS += -O2 -fpermissive

# Overridable on the command line, e.g. `make GENGETOPT=/opt/bin/gengetopt`.
GENGETOPT ?= gengetopt

LIB_PATH=../lib
COMMON_PATH=../common
CMDLINE_FILE='"../lem/cmdline.h"'
CMDLINE_DEF=-D _CMDLINE_FILE=$(CMDLINE_FILE)

.PHONY: clean clean.cmdline copy

lem: main.cc lem.o $(LIB_PATH)/auttools.o $(LIB_PATH)/word.o \
     cmdline.c common_lem.o common.o symtab.o
	$(CXX) $(CXXFLAGS) $(LDFLAGS) $(CMDLINE_DEF) \
	main.cc lem.o $(LIB_PATH)/auttools.o \
	$(LIB_PATH)/word.o cmdline.c common.o common_lem.o \
	symtab.o -o $@

lem.o: lem.h lem.cc
	$(CXX) $(CXXFLAGS) -c $(CMDLINE_DEF) lem.cc

# common.cc is compiled with _CMDLINE_FILE pointing at the generated
# cmdline.h, so the header must exist before this rule runs.
common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \
          $(COMMON_PATH)/common.h cmdline.h
	$(CXX) $(CXXFLAGS) -c $(CMDLINE_DEF) $(COMMON_PATH)/common.cc

common_lem.o: cmdline.h common_lem.h common_lem.cc
	$(CXX) $(CXXFLAGS) -c $(CMDLINE_DEF) common_lem.cc

# gengetopt writes both cmdline.c and cmdline.h in one run.
cmdline.c cmdline.h: cmdline.ggo
	$(GENGETOPT) -i cmdline.ggo --conf-parser

cmdline.ggo: cmdline_lem.ggo ../common/cmdline_common.ggo
	cat cmdline_lem.ggo ../common/cmdline_common.ggo > $@

symtab.o: $(LIB_PATH)/symtab.h $(LIB_PATH)/symtab.cc
	$(CXX) $(CXXFLAGS) -c $(LIB_PATH)/symtab.cc

# $(RM) is `rm -f`: missing files are not an error, so no `|| true` needed.
clean: clean.cmdline
	$(RM) *.o
	$(RM) lem

# Removes every generated cmdline.* file (cmdline.c, cmdline.h, cmdline.ggo).
clean.cmdline:
	$(RM) cmdline.*

copy:
ifdef UTT_BIN_DIR
	cp lem $(UTT_BIN_DIR)
endif
string typestr="FILENAME" hidden no +option "dictionary" d "Dictionary" string typestr="FILENAME" default="lem.bin" no diff --git a/src/lem/common_lem.cc b/src/lem/common_lem.cc new file mode 100644 index 0000000..610e309 --- /dev/null +++ b/src/lem/common_lem.cc @@ -0,0 +1,51 @@ +#include +#include +#include "common_lem.h" + +char dictionary[255]; + +void process_lem_options(gengetopt_args_info* args) +{ + + if(args->dictionary_given) + { + expand_path(args->dictionary_arg,dictionary); + if(file_accessible(dictionary)!=0) + { + fprintf(stderr,"Cannot open the dictionary file: %s\nAborting.\n",dictionary); + exit(1); + } + } + else if (args->dictionary_home_given && args->language_given) + { + char buf[255]; + expand_path(args->dictionary_home_arg, buf); + sprintf(dictionary,"%s/%s/lem.bin",buf,args->language_arg); + if(file_accessible(dictionary)!=0) + { + fprintf(stderr,"Cannot open the dictionary file: %s\nAborting.\n",dictionary); + exit(1); + } + } +} + + +// STARE +// if(args.dictionary_given) +// strcpy(dictionary, args.dictionary_arg); +// else { +// char path[256]; +// //sprintf(path, "/etc/utt/data/%s/%s", args.locale_arg, DICT_FILE); +// //if (file_accessible(path) == 0) +// // strcpy(dictionary, path); +// //else { +// sprintf(path, "%s/%s", utt_dir, DICT_FILE); +// if (file_accessible(path) == 0) +// strcpy(dictionary, path); +// else { +// fprintf(stderr, "Cannot find dictionary!\n"); +// exit(1); +// } +// //} +// } + diff --git a/src/lem/common_lem.h b/src/lem/common_lem.h new file mode 100644 index 0000000..86bc400 --- /dev/null +++ b/src/lem/common_lem.h @@ -0,0 +1,15 @@ +#ifndef __COMMON_LEM__H +#define __COMMON_LEM__H + +#include +#include "../common/common.h" + +#include "cmdline.h" + +#define DICT_FILE "lem.bin" + +extern char dictionary[]; + +extern void process_lem_options(gengetopt_args_info* args); + +#endif diff --git a/src/lem/lem.cc b/src/lem/lem.cc new file mode 100644 index 0000000..b83499e --- /dev/null +++ b/src/lem/lem.cc @@ -0,0 
+1,152 @@ +#include "lem.h" +#include +#include + + +/* Znajduje opisy slownikowe dla wyrazu. + * Parametry: + * form - wyraz, + * tab - referencja do tablicy Words (miejsce na wyniki) + * Wartosc: + * liczba dodanych opisow + */ +int Lem::ana(const char* form, Words& tab) { + + // sprawdzamy czy parametry wywolania sa poprawne + assert(form && &tab); + int count0 = tab.count(); + long l; + if ((l=_dict.next(_dict.gtra(0, form, FT::ftMAXPATH), ';'))>=0) + add_to_table(tab, form, l); + return tab.count()-count0; +} + + +/* Szukamy opisu slownikowego nastepnego wyrazu w buforze. + * Parametry: + * buf - bufor + * tab - miejsce na wyniki + * Wartosc: + * ilosc dodanych opisow + */ +int Lem::pref(char* buf, Words& tab) { + + // sprawdzamy czy parametry wywolania sa poprawne + assert(buf && &tab); + + int count0 = tab.count(); + long l; + char* buf0 = buf; + + if((l=_dict.pref(buf, ';'))>=0) { + char form[MAX_FORM]; + int len=buf-buf0; + form[len]='\0'; + add_to_table(tab,form,l); + } + return tab.count() - count0; +} + +/* Dodaje kolejne opisy do tablicy wynikow. + * Parametry: + * tab - tablica wynikow, + * f - wyraz, + * s - stan, na ktorym zaczyna sie pierwszy opis + */ +void Lem::add_to_table(Words& tab, const char* f, long s) { + + // sprawdzenie parametrow + assert(&tab); + assert(f); + + char des[FT::ftMAXPATH]; + + while (_dict.cont(s, des)) { + char* des1; + if ((des1=strtok(des, ";")) != NULL) + do { + if (tab.count() >= MAX_ALT) break; + tab.add(f, des1); + des1=strtok(NULL, ";"); + } while (des1!=NULL); + s=-1; + } +} + +void Lem::prn_dict() +{ + + char des[FT::ftMAXPATH]; + + long s=0; + + while (_dict.cont(s, des)) + { + printf("%s\n",des); + s=-1; + } +} + + +AuxLem::AuxLem(const char* filename) + : Lem(), _dict(SIZE) +{ + FILE* f; + char buf[MAX_LINE+2]; + f=fopen(filename,"r"); + for(long i=0; i=MAX_LINE-1) continue; // BEZ isalpha! 
+ buf[l-1]='\0'; + char* sep=strchr(buf,';'); + if(sep==NULL) continue; + *sep='\0'; + long formind=_dict.add(buf); + if(formind>=0) + { + char* desc=strdup(sep+1); + info[formind]=desc; + } + else + fprintf(stderr,"AuxLem: Form not added: %s;%s.\n", buf,sep+1); + } + fclose(f); +}; + +//--------------------------------------------------------------------------- + +AuxLem::~AuxLem() +{ +// for(long i=0; i<_dict.count(); ++i) +// free(info[_dict.hashindex(i)]); + for(long i=0; i=0) + { + strcpy(des,info[ind]); + char* des1; + if((des1=strtok(des,";"))!=NULL) + do + { + if(tab.cnt>=MAXALT) break; + tab.add(form,des1); + des1=strtok(NULL,";"); + } while(des1!=NULL); + } + return tab.count()-count0; +} + +//--------------------------------------------------------------------------- + diff --git a/src/lem/lem.h b/src/lem/lem.h new file mode 100644 index 0000000..767e82d --- /dev/null +++ b/src/lem/lem.h @@ -0,0 +1,50 @@ +#include "../lib/tfti.h" +#include "../lib/word.h" +#include "../lib/symtab.h" +#include "../lib/const.h" + +class Lem { + + protected: + // Alphabet& _alpha; + + // slownik + TFTiv _dict; + + void add_to_table(Words& tab, const char* f, long s); + + public: + + Lem() {}; + Lem(const char* d) + : _dict(d) {}; + virtual int ana(const char* form, Words& tab); + int pref(char* form, Words& tab); + void prn_dict(); + +}; + + +class AuxLem : public Lem { +public: + + static const int SIZE=1500000; + // static const int MAXLINE=1000; + static const int MAXALT=256; + + AuxLem(const char* filename); + ~AuxLem(); + +// int ana(const char* form, Grams& tab); + int ana(const char* form, Words& tab); + +// operator bool() { return _dict && info; } + +private: + UTTSymbolTable _dict; + char* info[SIZE]; + +}; + + + diff --git a/src/lem/main.cc b/src/lem/main.cc new file mode 100644 index 0000000..a963aff --- /dev/null +++ b/src/lem/main.cc @@ -0,0 +1,131 @@ +#include "../lib/iotools.h" +//do wyrzucenia - definicja w Makefile! 
#define _CMDLINE_FILE "../lem/cmdline.h" +#include "../common/common.h" +#include "common_lem.h" +#include "lem.h" +#include "cmdline.h" +#include + +int main(int argc, char** argv) { + +// setlocale(LC_CTYPE,""); //PO CO TO? +// setlocale(LC_COLLATE,""); // + + gengetopt_args_info args; + + if(cmdline_parser(argc, argv, &args) != 0) + exit(1); + + process_config_files(&args,argv[0]); + process_common_options(&args,argv[0]); + process_lem_options(&args); + + char line[MAX_LINE+1]; + char outline[MAX_LINE+1]; + char parms[MAX_LINE+1], desc[MAX_LINE+1], lemma[MAX_LINE+1]; + long line_count = 0; + + Lem* lem; + + if(strcmp(dictionary+strlen(dictionary)-4,".bin")==0) + lem = new Lem(dictionary); + else if(strcmp(dictionary+strlen(dictionary)-4,".dic")==0) + lem = new AuxLem(dictionary); + else + fprintf(stderr,"lem: Invalid dictionary file extension.\n"); + + Words tab; +// Segment seg; + + while (fgets(line, MAX_LINE, inputf)) + { + ++line_count; + int start, len; + + if (!process_seg(line, args)) // TO POWINNO BYC WCZESNIEJ ZABEZPIECZONE + fputs(line, outputf); + else + { + char form[MAX_FORM]; + + tab.clear(); + getfield(line,input_field_prefix,form); + + if (form==NULL) continue;//BZDURA + + lem->ana(form, tab); + if(tab.count()==0) + { + char form1[MAX_FORM]; // tymczasowo tak, trzeba zmienic ana + char* p; + strcpy(form1,form); + for(p=form1;*p;++p) *p=tolower(*p); + p=form1; + lem->ana(p,tab); + } + + if (tab.count() == 0) + fputs(line, failedf); + else + { // mamy jakies opisy w slowniku + + if(one_line) + { + char* descp=desc; + for (int i=0; i< tab.count(); ++i) + { + descp += sprintf(descp," %s%s,%s", output_field_prefix, tab[i].lemma(), tab[i].descr()); + } + strcpy(outline,line); + outline[strlen(outline)-1]='\0'; + strcat(outline,desc); + strcat(outline,"\n"); + fputs(outline, outputf); + if (copy_processed) + fputs(line,outputf); + } + else if(one_field) + { + char* descp=desc; + for (int i=0; i< tab.count(); ++i) + if(i==0) + descp += sprintf(descp," 
%s%s,%s", output_field_prefix, tab[i].lemma(), tab[i].descr()); + else + { + if(strcmp(tab[i].lemma(),tab[i-1].lemma())==0) + descp += sprintf(descp,",%s",tab[i].descr()); + else + descp += sprintf(descp,";%s,%s",tab[i].lemma(),tab[i].descr()); + } + + strcpy(outline,line); + outline[strlen(outline)-1]='\0'; + strcat(outline,desc); + strcat(outline,"\n"); + fputs(outline, outputf); + if (copy_processed) + fputs(line,outputf); + } + else + { + for (int i=0; i< tab.count(); ++i) + { + // kolejne opisy - kolejne linie. + sprintf(desc, " %s%s,%s\n", output_field_prefix, tab[i].lemma(), tab[i].descr()); + strcpy(outline,line); + outline[strlen(outline)-1]='\0'; + strcat(outline,desc); + fputs(outline, outputf); + } + if (copy_processed) + fputs(line,outputf); + } + } + } + + if(args.interactive_flag) + fflush(outputf), fflush(failedf); + + } + cmdline_parser_free(&args); +} diff --git a/src/lem/test.txt b/src/lem/test.txt new file mode 100644 index 0000000..3651eb3 --- /dev/null +++ b/src/lem/test.txt @@ -0,0 +1,6 @@ +0000 06 W abakus +0006 01 S _ +0007 07 W abdykuj +0014 01 S _ +0015 10 W abietynowi +0025 01 S \n diff --git a/src/lem_utf8/Makefile b/src/lem_utf8/Makefile new file mode 100644 index 0000000..7f06823 --- /dev/null +++ b/src/lem_utf8/Makefile @@ -0,0 +1,62 @@ +include ../../config.mak + +# because of OpenFST this application cannot be compiled statically, yet +#ifeq ($(BUILD_STATIC), yes) +# LDFLAGS += -static +#endif + +LDFLAGS += -ldl -lfst +CXXFLAGS += -g -fpermissive + +LIB_PATH=../lib +COMMON_PATH=../common +CMDLINE_FILE='"../lem_utf8/cmdline.h"' + + +lem: main.cc lem.o lemfst.o $(LIB_PATH)/auttools.o $(LIB_PATH)/word.o \ + cmdline.c common_lem.o common.o symtab.o + $(CXX) $(CXXFLAGS) -D _CMDLINE_FILE=$(CMDLINE_FILE) \ + main.cc lem.o lemfst.o $(LIB_PATH)/auttools.o \ + $(LIB_PATH)/word.o cmdline.c common.o common_lem.o \ + symtab.o -o lem $(LDFLAGS) + +lem.o: lem.h lem.cc + $(CXX) $(CXXFLAGS) -c -D _CMDLINE_FILE=$(CMDLINE_FILE) lem.cc + +common.o: 
common_lem.o: cmdline.h common_lem.h common_lem.cc
	$(CXX) $(CXXFLAGS) -c -D _CMDLINE_FILE=$(CMDLINE_FILE) common_lem.cc

# gengetopt writes both cmdline.c and cmdline.h in one run.
cmdline.c cmdline.h: cmdline.ggo
	$(GENGETOPT) -i cmdline.ggo --conf-parser

cmdline.ggo: cmdline_lem.ggo ../common/cmdline_common.ggo
	cat cmdline_lem.ggo ../common/cmdline_common.ggo > $@

symtab.o: $(LIB_PATH)/symtab.h $(LIB_PATH)/symtab.cc
	$(CXX) $(CXXFLAGS) -c $(LIB_PATH)/symtab.cc

# Prerequisites listed so that the object is rebuilt when its sources change;
# with none, an existing lemfst.o was never refreshed.
lemfst.o: lemfst.cpp lemfst.h
	$(CXX) -c lemfst.cpp $(CXXFLAGS) -o $@

# $(RM) is `rm -f`: missing files are not an error, so no `|| true` needed.
.PHONY: clean
clean: clean.cmdline
	$(RM) *.o
	$(RM) lem

# Removes every generated cmdline.* file (cmdline.c, cmdline.h, cmdline.ggo).
.PHONY: clean.cmdline
clean.cmdline:
	$(RM) cmdline.*

.PHONY: install
install:
ifdef BIN_DIR
	install -m 0755 lem $(BIN_DIR)
endif

.PHONY: uninstall
uninstall:
ifdef BIN_DIR
	$(RM) $(BIN_DIR)/lem
endif
string typestr="FILENAME" hidden no +option "dictionary" d "Dictionary" string typestr="FILENAME" default="lem.bin" no diff --git a/src/lem_utf8/common_lem.cc b/src/lem_utf8/common_lem.cc new file mode 100644 index 0000000..610e309 --- /dev/null +++ b/src/lem_utf8/common_lem.cc @@ -0,0 +1,51 @@ +#include +#include +#include "common_lem.h" + +char dictionary[255]; + +void process_lem_options(gengetopt_args_info* args) +{ + + if(args->dictionary_given) + { + expand_path(args->dictionary_arg,dictionary); + if(file_accessible(dictionary)!=0) + { + fprintf(stderr,"Cannot open the dictionary file: %s\nAborting.\n",dictionary); + exit(1); + } + } + else if (args->dictionary_home_given && args->language_given) + { + char buf[255]; + expand_path(args->dictionary_home_arg, buf); + sprintf(dictionary,"%s/%s/lem.bin",buf,args->language_arg); + if(file_accessible(dictionary)!=0) + { + fprintf(stderr,"Cannot open the dictionary file: %s\nAborting.\n",dictionary); + exit(1); + } + } +} + + +// STARE +// if(args.dictionary_given) +// strcpy(dictionary, args.dictionary_arg); +// else { +// char path[256]; +// //sprintf(path, "/etc/utt/data/%s/%s", args.locale_arg, DICT_FILE); +// //if (file_accessible(path) == 0) +// // strcpy(dictionary, path); +// //else { +// sprintf(path, "%s/%s", utt_dir, DICT_FILE); +// if (file_accessible(path) == 0) +// strcpy(dictionary, path); +// else { +// fprintf(stderr, "Cannot find dictionary!\n"); +// exit(1); +// } +// //} +// } + diff --git a/src/lem_utf8/common_lem.h b/src/lem_utf8/common_lem.h new file mode 100644 index 0000000..9eef856 --- /dev/null +++ b/src/lem_utf8/common_lem.h @@ -0,0 +1,15 @@ +#ifndef __COMMON_LEM__H +#define __COMMON_LEM__H + +#include +#include "../common/common.h" + +#include "cmdline.h" + +#define DICT_FILE "lem.bin" + +extern char dictionary[]; + +extern void process_lem_options(gengetopt_args_info* args); + +#endif diff --git a/src/lem_utf8/lem.cc b/src/lem_utf8/lem.cc new file mode 100644 index 0000000..3ff9de9 --- 
/dev/null +++ b/src/lem_utf8/lem.cc @@ -0,0 +1,133 @@ +#include "lem.h" +#include +#include + +/* Znajduje opisy slownikowe dla wyrazu. + * Parametry: + * form - wyraz, + * tab - referencja do tablicy Words (miejsce na wyniki) + * Wartosc: + * liczba dodanych opisow + */ +int Lem::ana(const wchar_t* form, Words& tab) { + assert(form && &tab); + int count0 = tab.count(); + LemFST::State l,d; + int i = 0; + LemFST::Word w = *LemFST::charToWord(form); + d = _dict.next(_dict.start(), w); + d = _dict.next(d, ';'); + + if (d>=0) + add_to_table(tab, form, d); + return tab.count()-count0; +} + + + +/* Dodaje kolejne opisy do tablicy wynikow. + * Parametry: + * tab - tablica wynikow, + * f - wyraz, + * s - stan, na ktorym zaczyna sie pierwszy opis + */ +void Lem::add_to_table(Words& tab, const wchar_t* f, LemFST::State s) { + assert(&tab); + assert(f); + + LemFST::Word *w = new LemFST::Word(); + int r; + while (r = _dict.cont(s, w, 200)) { + char fchar[200]; //Bufor do przetrzymywania f w postaci zwyklych char + wcstombs(fchar, f, 200); // Zamien f w zwykle chary i umiesc w fchar + const char* wChar = LemFST::wordToChar(w); // Aktualne slowo zamien w chary + w->clear(); // czyscimy bufor ze slowem. + tab.add(fchar, wChar); + } + _dict.cont(LemFST::noStateId, w, 200); // ubijamy generator + delete w; +} + +void Lem::prn_dict() +{ + LemFST::Word des; + LemFST::State s=_dict.start(); + + while (_dict.cont(s, &des, 200)) + { + wprintf(L"%ls\n",des.size()); // Ladne drukowanie! + s=-1; + // TODO: Ladne wyswiatlanie slownika; + } +} + + + + + + + + + + + +//====================================================================== +// AUX LEM +//====================================================================== +AuxLem::AuxLem(const char* filename) + : Lem(filename), _dict(SIZE) +{ + FILE* f; + char buf[MAX_LINE+2]; + f=fopen(filename,"r"); + for(long i=0; i=MAX_LINE-1) continue; // BEZ isalpha! 
+ buf[l-1]='\0'; + char* sep=strchr(buf,';'); + if(sep==NULL) continue; + *sep='\0'; + long formind=_dict.add(buf); + if(formind>=0) + { + char* desc=strdup(sep+1); + info[formind]=desc; + } + else + fprintf(stderr,"AuxLem: Form not added: %s;%s.\n", buf,sep+1); + } + fclose(f); +}; + + +AuxLem::~AuxLem() +{ + for(long i=0; i=0) + { + strcpy(des,info[ind]); + char* des1; + if((des1=strtok(des,";"))!=NULL) + do + { + if(tab.cnt>=MAXALT) break; + tab.add(form,des1); + des1=strtok(NULL,";"); + } while(des1!=NULL); + } + return tab.count()-count0; +} + + diff --git a/src/lem_utf8/lem.h b/src/lem_utf8/lem.h new file mode 100644 index 0000000..aa3ad4f --- /dev/null +++ b/src/lem_utf8/lem.h @@ -0,0 +1,50 @@ +// Do wyrzucenia. Korzystamy z LemFST.h teraz#include "../lib/tfti.h" +#include "lemfst.h" +#include "../lib/word.h" +#include "../lib/symtab.h" +#include "../lib/const.h" + +class Lem { + + protected: + // Alphabet& _alpha; + + // slownik + //TFTiv _dict; + LemFST _dict; + void add_to_table(Words& tab, const wchar_t* f, LemFST::State s); + + public: + + //Lem(// {} + Lem(const char* d) : _dict(d) {} // TODO: Dodaj konstruktor z sciezka do FST.bin + virtual int ana(const wchar_t* form, Words& tab); + //int pref(char* form, Words& tab); + void prn_dict(); + +}; + + +class AuxLem : public Lem { +public: + + static const int SIZE=1500000; + // static const int MAXLINE=1000; + static const int MAXALT=256; + + AuxLem(const char* filename); + ~AuxLem(); + +// int ana(const char* form, Grams& tab); + int ana(const char* form, Words& tab); + +// operator bool() { return _dict && info; } + +private: + UTT::SymbolTable _dict; + char* info[SIZE]; + +}; + + + diff --git a/src/lem_utf8/lemfst.cpp b/src/lem_utf8/lemfst.cpp new file mode 100644 index 0000000..8e9a423 --- /dev/null +++ b/src/lem_utf8/lemfst.cpp @@ -0,0 +1,147 @@ +#include "lemfst.h" + +LemFST::LemFST(const char* plik_fst) +{ + fst = StdFst::Read(plik_fst); +} + + +LemFST::~LemFST(void) +{ + delete fst; + 
path.clear(); +} + +/** Zwraca true jezeli s jest koncowym */ +bool LemFST::accept(State s) { + if (s == fst::kNoStateId) return false; + return fst->Final(s)!=StdFst::Weight::Zero(); +} + + +LemFST::State LemFST::start() { + return fst->Start(); +} + + +/** Przechodzi do nastepnego stanu po jednym znaku */ +LemFST::State LemFST::next(State s, Char c) { + if (s==fst::kNoStateId) return fst::kNoStateId; + ArcsIt aiter(*fst, s); + for (; !aiter.Done(); aiter.Next()) { + StdArc arc = aiter.Value(); + if (arc.ilabel==c) return arc.nextstate; + + } + return fst::kNoStateId; +} + +/** Konsumuje slowo W zaczynajac w stanie s. + * Zwraca: + * State (int64) - w jakim sie znajdzie + * kNoStateId (-1) - gdy nie moze dokonac takiego przejscia. + */ +LemFST::State LemFST::next(State s, Word w) { + if (s==fst::kNoStateId) return fst::kNoStateId; + LemFST::WordIt it; + State ns = s; + for(it=w.begin(); it!=w.end(); ++it) { + ns = next(ns, *it); + if (ns==fst::kNoStateId) { + return fst::kNoStateId; + } + } + return ns; + +} + +/** Funkcja zwraca kolejne sciezki z automatu w formie generatora. + * Ustawienie s na kNoStateId (lub -1) restartuje generator. + * Dopoki s!=-1 metoda zwraca przy kolejnych wywolaniach kolejne sciezki od stanu s + * do stanow konczacych. + * Sciezke mozna odzyskac poprzez metode getPath(); + */ +long LemFST::cont(State s, Word *result, int maxPath) { + // Restart generatora po podaniu -1 + // Lub przekroczeniu dlugosci sciezki + if (s==fst::kNoStateId || path.size() > maxPath) { + path.clear(); + return 0; + } + + + // Stos jest pusty. Mozemy dodac do niego wierzcholek poczatkowy. 
+ if (path.empty()) { + path.push_back(getStateInfo(s, fst::kNoStateId, 0)); + } + while(!path.empty()) { + + // Zdejmujemy ze stosu stan i jego iterator: + StateInfo *state = &path.back(); + ArcsIt *ait = state->it; + + // Jezeli stan jest koncowy to zwracamy sciezke do stanu "s" + // Dodatkowo sprawdzamy czy juz nie startowalismy z tego stanu, aby nie wyswietlac wielokrotnie tego samego + if (!state->checked && accept(state->id)) { + state->checked = true; + StateInfo tState; + PathRevIt pit=path.rbegin(); + State prevId = pit->id; + result->clear(); + for(; pit!=path.rend(); ++pit) { + tState = (*pit); + if (tState.id == prevId && tState.prev!=fst::kNoStateId && tState.id !=s) { + ArcsIt *tArcIt = tState.it; + result->push_front(tState.symbol); + } + if (tState.prev==fst::kNoStateId) break; + prevId = tState.prev; + } + ait->Next(); + return path.size(); + } + + + // Skonczylismy sprawdzac dany iterator (stan). + // Dlatego mozemy go usunac ze stosu. + if (ait->Done()) { + path.pop_back(); + // Jezeli jakies wierzcholki sa jeszcze na stosie to kontynuuj: + if (path.size() > 0) continue; + // w.p.p zakoncz. + else return 0; + } + + + // Odwiedzamy stan. 
+ // Dodajemy jego nastepnikow: + for(; !ait->Done(); ait->Next()) { + State next = ait->Value().nextstate; + Char isymbol = ait->Value().ilabel; + path.push_back(getStateInfo(next, state->id, isymbol)); + } + } // Koniec (while(!path.empty()) + + + + return 0; +} + + +/** Zwraca strukture StateInfo uzupelniajac ja o dane + * Parametry: + * s - ktory stan opisujemy + * prev - poprzednik stanu w automacie + * isymbol - symbol po jakim dostalismy sie z 'prev' do 's' + */ +inline const LemFST::StateInfo LemFST::getStateInfo(State s, State prev, Char isymbol) { + StateInfo sInfo; + sInfo.id = s; + sInfo.it = new ArcsIt(*fst, s); //Wyciagamy iterator z automatu + sInfo.symbol = isymbol; + sInfo.prev = prev; + sInfo.checked = false; // Czy stan byl juz sprawdzany jako koncowy + return sInfo; +} + + diff --git a/src/lem_utf8/lemfst.h b/src/lem_utf8/lemfst.h new file mode 100644 index 0000000..8a85c27 --- /dev/null +++ b/src/lem_utf8/lemfst.h @@ -0,0 +1,112 @@ +#ifndef __LemFST +#define __LemFST + +#include +#include +#include + +#define fstMAXPATH 100 +using namespace fst; +class LemFST +{ +public: + LemFST(const char*); + ~LemFST(void); + + +////////////////////////////////////////////////////////////////////////////// + typedef StdFst::Arc::Label Char; // Pojedynczy znak + typedef StdFst::StateId State; // Stan + + typedef std::list Word; // Lista znakow. + typedef std::list::iterator WordIt; // Iterator po lisice znakow + + + typedef ArcIterator ArcsIt; // Iterator po krawedziach + typedef struct { + ArcsIt* it; // Krawedzie wychodzace (Iterator) + State id; // Numer stanu + State prev; // Poprzednik stanu + Char symbol; // Symbol po jakim przeszlismy z prev do id + bool checked; // Czy stan byl juz brany pod uwage jako koncowy + } StateInfo; + + typedef std::list Path; // Sciezka stanow (wlasc. 
stos) + typedef Path::iterator PathIt; // Iterator po Sciezce + typedef Path::reverse_iterator PathRevIt; // Odwrotny iterator po Scieze + + + // Operacje na automacie: + StdFst *fst; // Automat LEMa + State start(); + bool accept(State); + State next(State, Char); // Go to next state by Char + State next(State, Word); // Go to end of the Word + + + + + + // Przeszukiwanie automatu: + Path path; + long cont(State, Word*, int); // Return a path from state to finish state; + + + + // Pomocnicze: + inline const StateInfo getStateInfo(State, State, Char); + + + static const State noStateId = fst::kNoStateId; + + /** Zwraca ciag wchar_t zbudowany ze slowa w */ + inline static char* wordToChar(Word* w) { + int len = w->size(); + WordIt it; + char* ret = new char[len+1]; + int i=0; + for(it=w->begin(); it!=w->end(); it++) { + ret[i++] = static_cast(*it); + } + ret[i]='\0'; + return ret; + } + + /** Zwraca ciag wchar_t zbudowany ze slowa w */ + inline static wchar_t* wordToWChar(Word* w) { + int len = w->size(); + WordIt it; + wchar_t* ret = new wchar_t[len+1]; + int i=0; + for(it=w->begin(); it!=w->end(); it++) { + ret[i++] = static_cast(*it); + } + ret[i]=L'\0'; + return ret; + } + + + + /** Zwraca slowo Word* na podstawie ciagu wchar_t */ + inline static Word* charToWord(const wchar_t* ch) { + int i =0; + Word *w = new Word(); + for(; ch[i]!=L'\0'; i++) { + w->push_back((int)ch[i]); + } + return w; + } + + /** Zwraca slowo Word* na podstawie ciagu wchar_t */ + inline static Word* charToWord(const char* ch) { + int i =0; + Word *w = new Word(); + for(; ch[i]!='\0'; i++) { + w->push_back((int)ch[i]); + } + return w; + } + +}; + +#endif diff --git a/src/lem_utf8/main.cc b/src/lem_utf8/main.cc new file mode 100644 index 0000000..6c1f7da --- /dev/null +++ b/src/lem_utf8/main.cc @@ -0,0 +1,140 @@ +#include "../lib/iotools.h" +//do wyrzucenia - definicja w Makefile! 
#define _CMDLINE_FILE "../lem/cmdline.h" +#include "../common/common.h" +#include "common_lem.h" +#include "lem.h" +#include "cmdline.h" +#include +#include +#include + +int main(int argc, char** argv) { + + setlocale(LC_CTYPE,""); + + gengetopt_args_info args; + + if(cmdline_parser(argc, argv, &args) != 0) + exit(1); + + process_config_files(&args,argv[0]); + process_common_options(&args,argv[0]); + process_lem_options(&args); + + wchar_t line[MAX_LINE+1]; + wchar_t outline[MAX_LINE+1]; + wchar_t parms[MAX_LINE+1], desc[MAX_LINE+1], lemma[MAX_LINE+1]; + long line_count = 0; + + Lem* lem; + + if(strcmp(dictionary+strlen(dictionary)-4,".bin")==0) + lem = new Lem(dictionary); + else if(strcmp(dictionary+strlen(dictionary)-4,".dic")==0) + lem = new AuxLem(dictionary); + else + fprintf(stderr,"lem: Invalid dictionary file extension.\n"); + + Words tab; +// Segment seg; + + while (fgetws(line, MAX_LINE, inputf)) + { + ++line_count; + int start, len; + + char linechar[MAX_LINE+1]; + wcstombs(linechar, line, MAX_LINE+1); + if (!process_seg(linechar, args)) // TO POWINNO BYC WCZESNIEJ ZABEZPIECZONE + fputws(line, outputf); + else + { + wchar_t form[MAX_FORM+1]; + + tab.clear(); + wchar_t winput_field_prefix[MAX_LINE+1]; + mbstowcs(winput_field_prefix, input_field_prefix, MAX_LINE+1); + getfield(line,winput_field_prefix,form); /// SEGMENTATION FAULT!!! + /* W trakcie uzupelniania pola "4" nastepuje cos dziwnego... 
*/ + if (form==NULL) continue;//BZDURA + lem->ana(form, tab); + if(tab.count()==0) + { + + wchar_t form1[MAX_FORM]; // tymczasowo tak, trzeba zmienic ana + wchar_t* p; + wcscpy(form1,form); + for(p=form1;*p;++p) *p=towlower(*p); + p=form1; + lem->ana(p,tab); + } + if (tab.count() == 0) { + fputws(line, failedf); + } + + else + { // mamy jakies opisy w slowniku + if(one_line) + { + wchar_t* descp=desc; + //wchar_t woutput_field_prefix[MAX_LINE+1]; + //mbstowcs(woutput_field_prefix, output_field_prefix, MAX_LINE+1); + wprintf(L"tab.count() in one-line: %d prefix: %s", tab.count(), output_field_prefix); + for (int i=0; i< tab.count(); ++i) + { + descp += swprintf(descp,MAX_LINE, L" %s%s,%s", output_field_prefix, tab[i].lemma(), tab[i].descr()); + wprintf(L"for %d lemma '%s' descr '%s'", i, tab[i].lemma(), tab[i].descr()); + } + wprintf(L"descp: %ls", descp); + wcscpy(outline,line); + outline[wcslen(outline)-1]='\0'; + wcscat(outline,descp); + wcscat(outline,L"\n"); + fputws(outline, outputf); + if (copy_processed) + fputws(line,outputf); + } + else if(one_field) + { + wchar_t* descp=desc; + for (int i=0; i< tab.count(); ++i) + if(i==0) + descp += swprintf(descp,MAX_LINE,L" %s%s,%s", output_field_prefix, tab[i].lemma(), tab[i].descr()); + else + { + if(strcmp(tab[i].lemma(),tab[i-1].lemma())==0) + descp += swprintf(descp,MAX_LINE,L",%s",tab[i].descr()); + else + descp += swprintf(descp,MAX_LINE,L";%s,%s",tab[i].lemma(),tab[i].descr()); + } + + wcscpy(outline,line); + wprintf(L"%ls \n", line); + outline[wcslen(outline)-1]=L'\0'; + wcscat(outline,desc); + wcscat(outline,L"\n"); + fputws(outline, outputf); + + if (copy_processed) fputws(line,outputf); + } + else + { + for (int i=0; i< tab.count(); ++i) + { + swprintf(desc,MAX_LINE,L" %s%s,%s \n", output_field_prefix, tab[i].lemma(), tab[i].descr()); + wcscpy(outline,line); + outline[wcslen(outline)-1]='\0'; + wcscat(outline,desc); + fputws(outline, outputf); + } + if (copy_processed) fputws(line,outputf); + } + } + } + + 
if (args.interactive_flag) + fflush(outputf), fflush(failedf); + + } + cmdline_parser_free(&args); +} diff --git a/src/lib/Makefile b/src/lib/Makefile new file mode 100644 index 0000000..a540cae --- /dev/null +++ b/src/lib/Makefile @@ -0,0 +1,23 @@ +include ../../config.mak + +CXXFLAGS += -O2 -fpermissive + +LIB_PATH=../lib +COMMON_PATH=../common + +main: auttools.o word.o + +auttools.o: auttools.h auttools.cc + $(CXX) $(CXXFLAGS) -c auttools.cc + +word.o: word.h word.cc + $(CXX) $(CXXFLAGS) -c word.cc + +clean: + rm *.o + +.PHONY: install +install: + +.PHONY: uninstall +uninstall: diff --git a/src/lib/auttools.cc b/src/lib/auttools.cc new file mode 100644 index 0000000..daf018d --- /dev/null +++ b/src/lib/auttools.cc @@ -0,0 +1,164 @@ +#include "auttools.h" +//#include "/src/cpp-comm/plx/Plx.h" + +void fullform(const char* b, const char* d, char* f) +{ + int i,j=0; + int n1, n2=0; + bool g=false; + char s1[200], s2[200], temps[200]; + while(d[j]>='0' && d[j]<='9')j++; + strncpy(temps,d,j); temps[j]='\0'; + n1=atoi(temps); + i=j; + while(!ispunct(d[j]) || d[j]=='*') j++; + strncpy(s1,d+i,j-i); + s1[j-i]='\0'; + if(d[j++]=='-') + { + i=j; + while(d[j]>='0' && d[j]<='9')j++; + strncpy(temps,d+i,j-i); temps[j]='\0'; + n2=atoi(temps); + i=j; + while(!ispunct(d[j]) || d[j]=='*') j++; + strncpy(s2,d+i,j-i); + s2[j-i]='\0'; + g=true; + } + + int blen=strlen(b); + if(g) + if(n1+n2<=blen) + { + strcpy(f,s1); + strcat(f,b+n1); + f[strlen(f)-n2]='\0'; + strcat(f,s2); + } + else + strcpy(f,""); + else + if(n1<=blen) + { + strcpy(f,b); + f[strlen(f)-n1]='\0'; + strcat(f,s1); + } + else + strcpy(f,""); +} + +void compose(char* stem, char* ending, char* form) +{ + bool suffix=true; + while(*stem) + if(*stem=='*') + { + strcpy(form,ending); + form+=strlen(ending); + suffix=false; + stem++; + } + else + *(form++)=*(stem++); + if(suffix) + { + strcpy(form,ending); + form+=strlen(ending); + } + *form='\0'; +} + +void autodescr(const char* f, const char* des, char* lemma, char* pos, 
char* attr) +{ + char lemd[MAXWORDLEN]; + int o,l=strcspn(des,","); + strncpy(lemd,des,l); + lemd[l]='\0'; + fullform(f,lemd,lemma); + o=l+1; + l=strcspn(des+o,"/:"); + strncpy(pos,des+o,l); + pos[l]='\0'; + o=o+l; + if(des[o]=='/') + { + o++; + strcpy(attr,des+o); + } + else + attr[0]='\0'; +} + + +int common_prefix(const char* s, const char* t) +{ + int n=0; + while(*s==*t && *s!='\0') + { s++,t++;n++; } + return n; +} + +int strdiff(const char* s, const char* t, + int& frontcut, char* prefix, int& endcut, char* suffix) +{ + int slen=strlen(s); + int tlen=strlen(t); + int ss, ss_max=0; /* ss - s shift */ + int ts, ts_max=0; /* ts - t shift */ + int common, common_max=0; + for(ss=0;sscommon_max + && (common>4 || (ss==0 && ts==0 && common>1)) ) + { + ss_max=ss; + ts_max=ts; + common_max=common; + } + // print "--", tsmax,"\n" + printf("--%d\n", ts_max); + frontcut=ss_max; + strncpy(prefix,t,ts_max); prefix[ts_max]='\0'; + endcut=slen-ss_max-common_max; + strcpy(suffix,t+ts_max+common_max); + return common_max; +} + +void fprndiff(FILE* f, const char* s, const char* t) +{ + int frontcut,endcut; + char pref[MAXWORDLEN],suff[MAXWORDLEN]; + strdiff(s,t,frontcut,pref,endcut,suff); + if(frontcut!=0 || pref[0]!='\0') + fprintf(f,"%d%s-%d%s",frontcut,pref,endcut,suff); + else + fprintf(f,"%d%s",endcut,suff); +} + +void sprndiff(char* outstr, const char* s, const char* t) +{ + int frontcut,endcut; + char pref[MAXWORDLEN],suff[MAXWORDLEN]; + strdiff(s,t,frontcut,pref,endcut,suff); + if(frontcut!=0 || pref[0]!='\0') + sprintf(outstr,"%d%s-%d%s",frontcut,pref,endcut,suff); + else + sprintf(outstr,"%d%s",endcut,suff); +} + + +void despos(const char* des, char* pos) +{ + int di=0; + int pi=0; + while(des[di]!=',' && des[di]!='\0') ++di; + if(des[di]==',') + { + ++di; + while(isupper(des[di])) pos[pi++]=des[di++]; + } + pos[pi]='\0'; +} + diff --git a/src/lib/auttools.h b/src/lib/auttools.h new file mode 100644 index 0000000..f91a7e2 --- /dev/null +++ b/src/lib/auttools.h @@ -0,0 
+1,39 @@ + +#ifndef _Auttools_h +#define _Auttools_h + +#include +#include +#include +#include + +/* #define ISALPHAG(c) ((c>='A' && c<='Z') || (c>='a' && c<='z') || \ */ +/* c=='' || c=='' || c=='' || c=='' || \ */ +/* c=='' || c=='' || c=='' || c=='' || \ */ +/* c=='' || c=='' || c=='' || c=='' || \ */ +/* c=='' || c=='' || c=='' || c=='' || \ */ +/* c=='' || c=='' || c=='*') */ + +#define MAXWORDLEN 64 + +extern void fullform(const char* b, const char* d, // in + char* f); // out + +extern void compose(char* stem, char* ending, // in + char* form); // out + +extern void autodescr(const char* f, const char* des, // in + char* lemma, char* pos, char* attr); // out + +extern int strdiff(char* s, char* t, // in + int& frontcut, char* prefix, // out + int& endcut, char* suffix); // out + +extern void fprndiff(FILE* f, const char* s, const char* t);// in + +extern void sprndiff(char* outstr, const char* s, const char* t); // in + +extern void despos(const char* des, // in + char* pos); // out + +#endif diff --git a/src/lib/const.h b/src/lib/const.h new file mode 100644 index 0000000..36d9399 --- /dev/null +++ b/src/lib/const.h @@ -0,0 +1,24 @@ +// maksymalna dlugosc wyrazu +#define MAX_FORM 80 + +// maksymalna dlugosc opisu +#define MAX_DESC 80 + +// maksymalna dlogosc lini w pliku przejsciowym +#define MAX_LINE 1024 + +// separator pol w pliku posrednim +#define FIELD_SEP " \t\n" + +// separator pol w pliku posrednim +#define WFIELD_SEP L" \t\n" + +// maksymalna liczba alternatywnych opisow +#define MAX_ALT 256 + +// plik ze slownikiem dla guessa +#define GUESS_DICT_FILE "slownik.fsa" + +// katalogi z plikami konfiguracyjnymi +#define SYSTEM_CONFIG_DIR "/usr/local/etc/utt" +#define USER_CONFIG_DIR "~/.utt" diff --git a/src/lib/iotools.h b/src/lib/iotools.h new file mode 100644 index 0000000..23298f9 --- /dev/null +++ b/src/lib/iotools.h @@ -0,0 +1,108 @@ +#include "const.h" +#include +#include +#include +#include +#include +#include +#include + +// napisy zostaj na 
miejscu (w line), tylko wskazniki sa ustawian +// i zara dopisywane zera s dopisywane + +inline +int parsetok(char* line, int* a, int* b, char** c, char** d, char** e, char** f) +{ + char* field; + if((field=strtok(line,FIELD_SEP))!=NULL) + *a=atoi(field); // nie sprawdzana poprawnosc + else + return 0; + if((field=strtok(NULL,FIELD_SEP))!=NULL) + *b=atoi(field); // nie sprawdzana poprawnosc + else return 1; + if((*c=strtok(NULL,FIELD_SEP))==NULL) return 2; + if((*d=strtok(NULL,FIELD_SEP))==NULL) return 3; + if((*e=strtok(NULL,FIELD_SEP))==NULL) return 4; + if((*f=strtok(NULL,FIELD_SEP))==NULL) return 6; + return 6; +} +// wchar_t version +int parsetok(wchar_t* line, int* a, int* b, wchar_t** c, wchar_t** d, wchar_t** e, wchar_t** f) +{ + wchar_t* field; + if((field=wcstok(line,WFIELD_SEP,NULL))!=NULL) + { + std::wistringstream s(field); + int i = 0; + s >> i; + *a=i; // nie sprawdzana poprawnosc + } + else + return 0; + if((field=wcstok(NULL,WFIELD_SEP,NULL))!=NULL) + { + std::wistringstream k(field); + int j = 0; + k >> j; + *b=j; // nie sprawdzana poprawnosc + } + else return 1; + if((*c=wcstok(NULL,WFIELD_SEP,NULL))==NULL) return 2; + if((*d=wcstok(NULL,WFIELD_SEP,NULL))==NULL) return 3; + if((*e=wcstok(NULL,WFIELD_SEP,NULL))==NULL) return 4; + if((*f=wcstok(NULL,WFIELD_SEP,NULL))==NULL) return 6; + return 6; +} +// napisy s kopiowane +inline +int scantok(const char* line, int* a, int* b, char* c, char* d, char* e=NULL, char* f=NULL) +{ + return sscanf(line," %d %d %s %s %s %s", a, b, c, d, e, f); +} + +// wchar_t version +inline +int scantok(const wchar_t* line, int* a, int* b, wchar_t* c, wchar_t* d, wchar_t* e=NULL, wchar_t* f=NULL) +{ + return swscanf(line,L" %d %d %ls %ls %ls %ls", a, b, c, d, e, f); +} + +inline +int printtok(char* line, int a, int b, char* c, char* d, char* e, char* f, char* parms) +{ + sprintf(line,"%04d %02d %s %s %s %s `%s\n", a, b, c, d, e, f, parms); +} + +// wchar_t version +inline +int printtok(wchar_t* line, int a, int b, 
wchar_t* c, wchar_t* d, wchar_t* e, wchar_t* f, wchar_t* parms) +{ + swprintf(line,MAX_LINE,L"%04d %02d %ls %ls %ls %ls `%ls\n", a, b, c, d, e, f, parms); +} + +inline +int printtok(char* line, int a, int b, char* c, char* d, char* e, char* f) +{ + sprintf(line,"%04d %02d %s %s %s %s\n", a, b, c, d, e, f); +} + +// wchar_t version +inline +int printtok(wchar_t* line, int a, int b, wchar_t* c, wchar_t* d, wchar_t* e, wchar_t* f) +{ + swprintf(line,MAX_LINE,L"%04d %02d %ls %ls %ls %ls\n", a, b, c, d, e, f); +} + +inline +int printtok(char* line, int a, int b, char* c, char* d) +{ + sprintf(line,"%04d %02d %s %s\n", a, b, c, d); +} + +// wchar_t version +inline +int printtok(wchar_t* line, int a, int b, wchar_t* c, wchar_t* d) +{ + swprintf(line,MAX_LINE,L"%04d %02d %ls %ls\n", a, b, c, d); +} diff --git a/src/lib/matchdescr.cc b/src/lib/matchdescr.cc new file mode 100644 index 0000000..ea54655 --- /dev/null +++ b/src/lib/matchdescr.cc @@ -0,0 +1,86 @@ +#include +#include + +inline +bool inline_matchattr(const char* a, const char* b) +{ + const char *p, *q; // pomocnicze wskazniki + while(*a && *b) + { + p=a; q=b; + while(isupper(*p) && isupper(*q)) // rowny prefiks + if(*p==*q) ++p, ++q; + else if(*p<*q) // a jest mniejszy + { + // przesywamy a do nastepnego atr + a=p; + while(isupper(*a)) ++a; while(islower(*a)) ++a; + goto end; + } + else + { + // przesuwamy b do nastepnego atr + b=q; + while(isupper(*b)) ++b; while(islower(*b)) ++b; + goto end; + } + + if(islower(*p) && islower(*q)) // rowne atrybuty + { + a=p; b=q; // przesuwamy wskaznik, sprawdzamy wartosci + while(*a != *b) + { + if(*a > *b && !islower(*++b)) return false; + if(*a < *b && !islower(*++a)) return false; + } + // znaleziono rowna wartosc, przesywamy a i b do nast atr + while(isupper(*a)) ++a; while(islower(*a)) ++a; + while(isupper(*b)) ++b; while(islower(*b)) ++b; + goto end; + } + + if(islower(*p)) // a jest krotszy, czyli mniejszy + { // przesuwamy a do nastepnego atrybutu + a=p; + 
while(islower(*a)) ++a; + goto end; + } + + if(islower(*q)) // b jest krotszy, czyli mniejszy + { // przesuwamy b do nastepnego atrybutu + b=q; + while(islower(*b)) ++b; + goto end; + } + end: ; + } + return true; +} + + +bool matchattr(const char* a, const char* b) +{ + return inline_matchattr(a,b); +} + +bool matchdescr(const char* a, const char* b) +{ + while(isupper(*a) && isupper(*b) && *a==*b) ++a, ++b; + if(*a=='\0') + if(*b=='\0' || *b=='/') return true; + else return false; + + if(*a=='/') + if(*b=='\0') return true; + else if(*b=='/') return inline_matchattr(++a, ++b); + + return false; +} + + +int main() +{ + char a[100], b[100]; + while(scanf("%s %s", a, b)==2) + printf("%s & %s = %d\n", a, b, matchdescr(a,b)); +} diff --git a/src/lib/matchdescr.h b/src/lib/matchdescr.h new file mode 100644 index 0000000..f9ee5d5 --- /dev/null +++ b/src/lib/matchdescr.h @@ -0,0 +1,10 @@ + +// obie funkcje wymagaja by deskrypcje byly w postaci kanonicznej +// obslugiwane sa tylko krotkie (jednoliterowe) atrybuty + +// test czy zgadzaja sie deskrypcje +bool matchdescr(const char* a, const char* b); + +// test czy zgadaja sie same atrybuty (czyli to, co po ukosniku) +bool matchattr(const char* a, const char* b); + diff --git a/src/lib/symtab.cc b/src/lib/symtab.cc new file mode 100644 index 0000000..1685ed5 --- /dev/null +++ b/src/lib/symtab.cc @@ -0,0 +1,170 @@ +#include "symtab.h" +#include // numeric_limits +#include +#include +//--------------------------------------------------------------------------- +using namespace UTT; +SymbolTable::SymbolTable(int n, int (*h)(const char*,int), const char* filename) + : _mx(n), _cnt(0), hash(h) +{ + _sz=first(n); + _key=new char*[_sz]; + _defind=new int[_sz]; + _hashind=new int[_sz]; + _def=new char*[_mx]; + for(int i=0; i<_sz; i++) _key[i]=NULL; + if(filename) + add_from_file(filename); +} + +//--------------------------------------------------------------------------- + +SymbolTable::SymbolTable(int n, const char* filename) + : 
_mx(n), _cnt(0), hash(hash1)
+{
+  _sz=first(n);
+  _key=new char*[_sz];
+  _defind=new int[_sz];
+  _hashind=new int[_sz];
+  _def=new char*[_mx];
+  for(int i=0; i<_sz; ++i) _key[i]=NULL;
+  if(filename)
+    add_from_file(filename);
+}
+
+//---------------------------------------------------------------------------
+
+// Frees all keys and the index arrays.
+SymbolTable::~SymbolTable()
+{
+  clear();
+  delete[] _key;
+  delete[] _defind;
+  delete[] _hashind;
+  delete[] _def;
+}
+
+//---------------------------------------------------------------------------
+
+// Removes every symbol. Slots are reset to NULL and the count to 0 so the
+// table can be reused and the destructor's own clear() cannot double-free.
+void SymbolTable::clear()
+{
+  for(int i=0; i<_sz; ++i)
+    if(_key[i])
+      {
+        free(_key[i]);
+        _key[i]=NULL;
+      }
+  _cnt=0;
+}
+
+//---------------------------------------------------------------------------
+
+// Adds every whitespace-separated token of the file as a symbol.
+// Returns false when a token reaches MAXKEYLEN characters or add() fails
+// (table full); returns true otherwise.
+// NOTE(review): a file that cannot be opened is silently treated as empty
+// and still yields true - kept for compatibility, confirm this is intended.
+bool SymbolTable::add_from_file(const char* filename)
+{
+  FILE* in=fopen(filename,"r");
+  if(!in)
+    return true;
+
+  // Limit %s to MAXKEYLEN characters so an over-long token cannot overrun buf.
+  char fmt[32];
+  snprintf(fmt,sizeof(fmt),"%%%us",MAXKEYLEN);
+
+  char buf[MAXKEYLEN+1];
+  bool ok=true;
+  while(ok && fscanf(in,fmt,buf)==1)
+    if(strlen(buf)==MAXKEYLEN || add(buf)<0)
+      ok=false;
+
+  fclose(in);   // the stream used to be leaked on every path
+  return ok;
+}
+
+//---------------------------------------------------------------------------
+
+// Inserts s (copied with strdup) and returns its index, or the existing index
+// if s is already present. Returns -1 when the table already holds _mx symbols.
+int SymbolTable::add(const char* s)
+{
+  if(_cnt<_mx)
+    {
+      int ind=hash(s,_sz);
+      // Linear probing; avoids "ind=++ind%_sz", which modifies ind twice.
+      while(_key[ind])
+        if(strcmp(_key[ind],s))
+          ind=(ind+1)%_sz;
+        else
+          return _defind[ind];
+      _key[ind]=strdup(s);
+      _defind[ind]=_cnt;
+      _hashind[_cnt]=ind;
+      _def[_cnt]=_key[ind];
+      _cnt++;
+      return _cnt-1;
+    }
+  else
+    return -1;
+}
+
+//---------------------------------------------------------------------------
+
+// Returns the index of s, or -1 if s is not in the table.
+int SymbolTable::operator[](const char* s)
+{
+  int ind=hash(s,_sz);
+  while(_key[ind])
+    if(strcmp(_key[ind],s)==0)
+      return _defind[ind];
+    else
+      ind=(ind+1)%_sz;
+  return -1;
+}
+
+//---------------------------------------------------------------------------
+
+int SymbolTable::first(unsigned int n)
+{
+  int fi=n;
+  int bound=(n/2 < MAXKEYLEN)?
n/2 : MAXKEYLEN; + bool found; + do + { + found=true; + if(fi++ == std::numeric_limits::max) return -1; + for(int i=2; i=4) + return abs((*((int*)(s+(l/2-2)))+(int)(*s * s[l-1])) % _sz); + else + { + int i=0; + strcpy((char*)&i,s); + return abs((i+(int)(*s * s[l-1])) % _sz); + } +} + +//--------------------------------------------------------------------------- + +int hash2(const char* s, int _sz) +{ + int l=strlen(s); + if(l>=6) + { + unsigned int i1,i2,i3; + strncpy((char*)&i1,s,sizeof(int)); + strncpy((char*)&i2,s+(l/2-2),sizeof(int)); + strncpy((char*)&i3,s+(l-4),sizeof(int)); + return abs((i1+i2+i3) % _sz); + } + else + { + int i=0; + strncpy((char*)&i,s,sizeof(int)); + return abs((i+(int)(*s * s[l-1])) % _sz); + } +} + +//--------------------------------------------------------------------------- + diff --git a/src/lib/symtab.h b/src/lib/symtab.h new file mode 100644 index 0000000..a5d11b2 --- /dev/null +++ b/src/lib/symtab.h @@ -0,0 +1,52 @@ +#ifndef _HashTable_h +#define _HashTable_h +//--------------------------------------------------------------------------- +#include +#include +//--------------------------------------------------------------------------- +int hash1(const char* s, int sz); +int hash2(const char* s, int sz); +//--------------------------------------------------------------------------- +namespace UTT { +class SymbolTable +{ + int _mx; + int _sz; + int _cnt; + char** _key; + char** _def; + int* _defind; + int* _hashind; // s tu redundancje + +public: + static const unsigned int MAXKEYLEN=2000; + + SymbolTable(int n, int (*h)(const char*,int), const char* filename=NULL); + SymbolTable(int n, const char* filename=NULL); + ~SymbolTable(); + + void clear(); + + int (*hash)(const char*, int); + + bool add_from_file(const char* filename); + + int add(const char* s); + int operator[](const char* s); + const char* operator[](int i){if(i<0||i>=_cnt)return NULL;else return _def[i];} + int index(const char* s) { return this->operator[](s); }; + int 
index(int i) { if(i<0||i>=_cnt) return -1; else return i; }; + int hash_index(int i) { return _hashind[i]; } + const char* symbol(int i) { if(i<0||i>=_cnt)return NULL; else return _def[i];} + + int capacity() { return _mx; } + int size() { return _sz; } + int count() { return _cnt; } + float search_rate(); + +private: + static int first(unsigned int n); +}; +} +//--------------------------------------------------------------------------- +#endif diff --git a/src/lib/tft.h b/src/lib/tft.h new file mode 100644 index 0000000..196ce5f --- /dev/null +++ b/src/lib/tft.h @@ -0,0 +1,878 @@ +#ifndef _TFT_h +#define _TFT_h +//--------------------------------------------------------------------------- +#include +#include +#include +#include +#include + +//#include "top.h" +#include "ttrans.h" +//--------------------------------------------------------------------------- + +/// Klasa bazowa przetwornika skoczonego. +/** + \remark Po co ta klasa? Co dotyczy samych przej, przenie do TTrans, + reszt wcieli do TFT. +*/ +class FT +{ +public: + FT() : copy_default(false), print_mode(OO), ttn(0) {}; + +//print mode + enum OUTPUT { II, ///< tylko symbole wejciowe + OO, ///< tylko symbole wyjciowe + IOIO, ///< symbol wyjciowy po wejciowym + OIOI, ///< symbol wyjciowy przed wejciowym + IIOO, ///< cae wejcie, potem cae wyjcie + OOII ///< cae wyjcie, potem cae wejcie + + }; + +/// maks dugo cieki + static const unsigned int ftMAXPATH=500; + +/// maks dugo opisu typu symbolu we/wy +/** + \remark Przenie do TTrans +*/ + static const unsigned int ftTYPELEN=32; + +/// specjalny symbol dla wartoci 'epsilon' +/** + \remark Przenie do TTrans +*/ + static const char ftEPSILON='~'; + +/// specialny symbol dla wartoci 'default' +/** + \remark Przenie do TTrans +*/ + static const char ftDEFAULT='@'; + +/// domylny symbol wyjciowy (true-'@', flase-'~') +/** + \remark Przenie do TTrans(???) 
+*/ + bool copy_default; + +/// tryb wyjcia + OUTPUT print_mode; + +/// false, jeli automat nie ma przej + operator bool() { return (bool)ttn; }; + + virtual const char* intype() { return itype; }; + virtual const char* outtype() { return otype; }; + +protected: + +/// liczba elementw tablicy tt + unsigned long ttn; + +/// liczba stanw + unsigned long states; + +/// liczba przej + unsigned long transitions; + +/// typ symboli wejciowych (napis) +/** + \remark Przenie do TTrans(???) +*/ + char itype[ftTYPELEN]; + +/// typ symboli wyjciowych (napis) +/** + \remark Przenie do TTrans(???) +*/ + char otype[ftTYPELEN]; +}; + +//--------------------------------------------------------------------------- + +/// Szablon przetwornika skoczonego +/** + \param I - typ symbolu wejciowego + \param Ipass - typ, jaki ma by uyty przy przekazywaniu symbolu we jako parametru + do funkcji (metody), rwny \a I lub \a I& + \param O - typ symbolu wyjciowego + \param Opass - typ, jaki ma by uyty przy przekazywaniu symbolu wy jako parametru + do funkcji (metody), rwny \a O lub \a O& + \param - typ przejcia, musi by podklas TTrans +*/ +template +class TFT : public FT +{ + + +public: + + TFT() : FT(), tt(NULL) { setiotypes(); }; + +/** +\name Metody poziomu 1 +Poziom przej. +*/ + +//@{ + +/// Test, czy przejcie \a t akceptuje symbol \a in. + bool accepts(long t, Ipass in) const; + +/// Test, czy lista przej dla aktualnego stanu jest kontynuowana po \a t. + bool continued(long t) const; + +/// Stan, do ktrego prowadzi przejcie \a t. +/** + \pre !empty(t) +*/ + long next(long t) const; + +/// Symbol wejciowy przejcia \a t. + Ipass input(long t) const; + +/// Symbol wyjciowy przejcia \a t. + Opass output(long t) const; + +/// Zwraca \c true, jeli symbolem we przejcia \a t jest epsilon. + bool epsi(long t) const; + +/// Zwraca \c true, jeli symbolem we przejcia \a t jest symbol domylny. + bool defi(long t) const; + +/// Zwraca \c true, jeli symbolem wy przejcia \a t jest epsilon. 
+ bool epso(long t) const; + +/// Zwraca \c true, jeli symbolem wy przejcia \a t jest symbol domylny. + bool defo(long t) const; + +/// Indeks przejcia przez \a in. + long tra(long t, Ipass in) const; + +/// Indeks przejcia przez \a in - non-deterministic. + long tra_nd(long t, Ipass in, long nth) const; + +//@} + +/** +\name Poziom 2 +Poziom stanw. Stan (indeks stanu) = indeks jego pierwszego przejcia +*/ +//@{ +/// Zwraca \c true jeli stan \a s jest pusty (nie ma z niego przej). + bool empty(long s) const { return tt[s].empty(); } + +/// Zwraca \c true jeli stan \a s jest stanem kocowym. + bool final(long s) const { return tt[s].final(); } + + long next(long t, Ipass in) const; + +//long trans(const I* si, I* so, long& olen) const; + + long gtra(long s, const I* w, long maxpath=ftMAXPATH) const; + +//@} + +/** +\name Poziom 3 +Poziom ... +*/ +//@{ + long cont(long s=-1, I* c=NULL) const; + + long match(const I* w=NULL, long* p=NULL) const; + + long match_nd(const I* w=NULL, long* p=NULL) const; + + long lgstmatch(const I* w, long* p, long& plen, long maxpath=ftMAXPATH) const; + + /*NOWE*/ + + long lgstpath(I*& buf, long*& path, long start=0) const; + + long pref(I*& buf, I sep, long start=0) const; + +//@} + +protected: + + TT* tt; // tablica przej + + long prn(const I* si, long* p, O* so) const; + + void prntt(ostream& os); + + void sort(); + + void setiotypes(); // NIE DZIAA (dlaczego???) 
+ +// friend ostream& operator<<(ostream&,const CDFA&); +// friend istream& operator>>(istream&,CDFA&); + +private: + long prn_oo(const I* si, long* p, O* so) const; + long prn_ioio(const I* si, long* p, O* so) const; + long prn_oioi(const I* si, long* p, O* so) const; + long prn_iioo(const I* si, long* p, O* so) const; + long prn_ooii(const I* si, long* p, O* so) const; +}; + + +//--------------------------------------------------------------------------- +//--------------------------------------------------------------------------- + +/** + stan = indeks pierwszego przejcia + + state(t) = stan, do ktrego naley t + + symbol zerowy = symbol s, dla ktrego (bool)s zwraca \c false, + w przypadku znakw - '\0' +*/ + +//--------------------------------------------------------------------------- +//--------------------------------------------------------------------------- + + +template +inline +bool TFT::accepts(long t, Ipass in) const +{ return tt[t].accepts(in); } + +/// Test whether the transition list continues after \a t. 
+template
+inline
+bool TFT::continued(long t) const
+{ return tt[t].continued(); }
+
+/**
+    \pre !empty(t)
+*/
+template
+inline
+long TFT::next(long t) const
+{ return tt[t].next(); }
+
+template
+inline
+Ipass TFT::input(long t) const
+{ return tt[t].in(); }
+
+template
+inline
+Opass TFT::output(long t) const
+{ return tt[t].out(); }
+
+template
+inline
+bool TFT::epsi(long t) const
+{ return tt[t].epsi(); }
+
+template
+inline
+bool TFT::defi(long t) const
+{ return tt[t].defi(); }
+
+template
+inline
+bool TFT::epso(long t) const
+{ return tt[t].epso(); }
+
+template
+inline
+bool TFT::defo(long t) const
+{ return tt[t].defo(); }
+
+/**
+    \param +t - transition index
+    \param +in - input symbol
+    \return Index of a transition (>=\a t) of the current state that
+            accepts input symbol \a in, or -1 if there is no such transition
+*/
+template
+long TFT::tra(long t, Ipass in) const
+{
+  if(t<0 || t>=ttn)
+    return -1;
+
+  if(empty(t)) return -1;
+  while(!accepts(t,in))
+    if(continued(t))
+      t++;
+    else
+      return -1;
+  return t;
+}
+
+//---------------------------------------------------------------------------
+/// Transition index - version for a non-deterministic automaton.
+/**
+    \param +t - transition index
+    \param +in - input symbol
+    \return Index of a transition (>=\a t) of the current state that
+            accepts input symbol \a in, or -1 if there is no such transition.
+            If nth==0 the returned index t1 satisfies t1>=t; otherwise the search starts after t, so t1>t.
+*/
+template
+long TFT::tra_nd(long t, Ipass in, long nth) const
+{
+  if(t<0 || t>=ttn)
+    return -1;
+
+  if(nth)
+    if(continued(t))
+      t++;
+    else
+      return -1;
+  else
+    { if(empty(t)) return -1; }
+
+  while(!accepts(t,in))
+    if(continued(t))
+      t++;
+    else
+      return -1;
+
+  return t;
+}
+
+//}
+
+//---------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+
+/// Transition function.
+/** + \param t - stan + \param in - symbol we + \return Stan, do ktrego mona przej z \a t po wpywem symbolu \a in + lub -1, jeli nie ma przejcia przez \a in + +*/ +template +long TFT::next(long t, Ipass in) const +{ + if(t<0 || (unsigned long)t>=ttn) + return -1; + + if(empty(t)) return -1; + while(!accepts(t,in)) + if(continued(t)) + t++; + else { + return -1; + } + + return next(t); +} + +//--------------------------------------------------------------------------- + +//---------------------------------------------------------------------------- +/// Uoglniona funkcja przejscia. +/** + \param +s - stan + \param +w - wskanik pierwszego elementu cigu symboli we, zakoczonego symbolem zerowym + \param maxpath maksymalna dugo cieki, domylnie ftMAXPATH + \return stan osigalny z \a s pod wpywem \a w (na ciece mog si pojawi + epsilon-przejcia +*/ +template +long TFT::gtra(long s, const I* w, long maxpath) const +{ + if(s<0 || (unsigned long)s>=ttn) + return -1; + + long i=0; + while(*w) + { + if(i>maxpath || empty(s)) return -1; + while(!accepts(s,*w)) + if(continued(s)) + s++; + else + return -1; + if(!epsi(s)) w++; + s=next(s); + i++; + } + return s; +} + +//---------------------------------------------------------------------------- + +/// Kontynuacja. +/** +... +\param +s stan, jeli -1 - poszukiwane jest nastpne rozwizanie +\param -c cig symboli we ze cieki prowadzcej z \a s do + stanu kocowego +\return dugo cigu \a c (= dugo cieki) +\remark DZIAA TYLKO DLA ZNAKW!!! + EPSILON-PRZEJCIA NIEDOZWOLONE!!! 
+*/ +template +long TFT::cont(long s, I* c) const +{ + static unsigned long path[ftMAXPATH]={0}; + static unsigned long i=0; + static bool more=false; + + bool found=false; + + if(s!=-1) + { + if(s<0 || (unsigned long)s>=ttn) + more=false; + else + { + i=0; + c[0]=0; + path[0]=s; + more=true; + if(final(s)) + found=true; + } + } + + while(more && !found) + { + if(!empty(path[i]) && i0) + c[--i]=0; + else + more=false; + }while(more && !continued(path[i])); + path[i]=path[i]+1; + } + if(final(path[i])) + { + found=true; + c[i]=0; + } + } + return i; +} + +//---------------------------------------------------------------------------- +/// Dopasowannie. +/** + \remark Nie zaimplementowane. +*/ +template +long TFT::match(const I* w, long* p) const +{} + +//---------------------------------------------------------------------------- +/// Dopasowanie niedeterministyczne. +/** + \param +w - wskanik pierwszego elementu cigu symboli we, zakoczonego symbolem zerowym, + jeli NULL - poszukiwane jest nastpne rozwizanie + \param -p cig przej zakoczony -1 + \return dugo dopasowania (PO CO?) +*/ +template +long TFT::match_nd(const I* w, long* p) const +{ + static bool more=false; + static I *w0, *wc; + static long s=0, *p0, *pc, *pc_bound; + + bool found=false; + + if(w) + { + wc=w0=w; + pc=p0=p; + more=true; + pc_bound=pc+ftMAXPATH; + if(final(s=0)) + { + *pc=-1; return 0; + } + } + + while(more) + { + if(*wc && pc=0) + { if(!epsi(*pc)) wc++; s=next(*pc); pc++; } + else + while(true) + { + if(pc==p0) { more=false; return -1; } + if(!epsi(*(--pc))) wc--; + if((*pc=trand(*pc,*wc,1))>=0) + { if(!epsi(*pc)) wc++; s=next(*pc); pc++; break; } + } + if(final(s)) { *pc=-1; return wc-w0; } + } + return -1; +} + +//---------------------------------------------------------------------------- +/// Najdusze dopasowanie. 
+/** + \param +w wskanik pierwszego elementu cigu symboli wejciowych + \param -p cieka + \param -plen dugo cieki + \param +maxpath maks ddugo cieki, domylnie FT::ftMAXPATH + \return dugo skonsumowanego wejcia +*/ +template +long TFT + ::lgstmatch(const I* w, long* p, long& plen, long maxpath) const +{ + long s=0; + long t; + long i=0; + const char* w0=w; + long ilen=0; + while(*w && i=0) + { + if(!epsi(t)) w++; + s=next(t); + i++; + *(p++)=t; + if(final(s)) { plen=i; ilen=w-w0; } + } + *p=-1; + return ilen; +} + +//---------------------------------------------------------------------------- +/// Najdusza cieka. +/** + \param +buf wskanik pierwszego elementu cigu symboli wejciowych + \param -buf pozycja jeden za skonsumowanym prefiksem + \param +path wskanik pierwszego elementu wektora przej + \param -path wskanik jeden za ostatnim przejciem + \return dugo skonsumowanego prefiksu (PO CO? LEPIEJ D CIEKI) +*/ +template +long TFT + ::lgstpath(I*& buf, long*& path, long start) const +{ + long s=start; + long t; + const char* buf0=buf; + const long* pathlimit=path+FT::ftMAXPATH; + while(*buf && path=0) + { + if(!epsi(t)) buf++; + s=next(t); + *(path++)=t; + } + return buf-buf0; +} + +//---------------------------------------------------------------------------- +/// Najduszy prefiks. +/** + \param +buf wskanik pierwszego elementu cigu symboli wejciowych + \param -buf pozycja jeden za skonsumowanym prefiksem + \param +sep separator + \return stan po przejciu przez \a sep + \remark Dziaa tylko dla automatw deterministycznych, minimalnych, eps-wolnych, + gdzie d. cieki == d. dopasowania. 
+*/ +template +long TFT + ::pref(I*& buf, I sep, long start) const +{ + static long pathtab[ftMAXPATH]; + // static long* path=pathtab; + long* path=pathtab; + static bool more; + + long s; + if(*buf) // pierwsze wywoanie + { + if(!lgstpath(buf,path,start)) + return -1; + --path; + more=true; + } + else // kolejne wywoanie + --buf,--path; + while(more) + if(path>=pathtab) + if((s=next(next(*path),sep))>=0) { + return s; + } + else + --buf, --path; + else + { + more=false; + return -1; + } + return -1; +} + +//---------------------------------------------------------------------------- + +/* +template +long TFT::trans(const I* si, O* so, long& olen) const +{ + long p[ftMAXPATH]; + long ilen; + long plen; + if((ilen=lgstmatch(si,p,plen))>0) + olen=prn(si,p,so); + else + ilen=olen=0; + return ilen; +} +*/ +//---------------------------------------------------------------------------- + +template +long TFT::prn(const I* si, long* p, O* so) const +{ + switch(print_mode) + { + case OO: return prn_oo(si,p,so); + case IOIO: return prn_ioio(si,p,so); + case OIOI: return prn_oioi(si,p,so); + case IIOO: return prn_iioo(si,p,so); + case OOII: return prn_ooii(si,p,so); + } +} + +//---------------------------------------------------------------------------- + +template +long TFT::prn_oo(const I* si, long* p, O* so) const +{ + char* so0=so; + while(*p>=0) + { + long t=*p; + if(!epso(t)) + { + if(defo(t)) + *(so++)=*si; + else + *(so++)=output(t); + } + if(!epsi(t)) si++; + p++; + + } + return so-so0; +} + +//---------------------------------------------------------------------------- + + +template +long TFT::prn_ioio(const I* si, long* p, O* so) const +{ + char* so0=so; + while(*p>=0) + { + long t=*p; + if(!epsi(t)) + *(so++)=*si; + if(!epso(t)) + if(defo(t)) + *(so++)=*si; + else + *(so++)=output(t); + if(!epsi(t)) si++; + p++; + } + return so-so0; +} + + +//---------------------------------------------------------------------------- + +template +long TFT::prn_oioi(const I* si, 
long* p, O* so) const +{ + char* so0=so; + while(*p>=0) + { + long t=*p; + if(!epso(t)) + { + if(defo(t)) + *(so++)=*si; + else + *(so++)=output(t); + } + if(!epsi(t)) + *(so++)=*(si++); + p++; + } + return so-so0; +} + +//---------------------------------------------------------------------------- + +template +long TFT::prn_iioo(const I* si, long* p, O* so) const +{ + const char* si0=si; + long* p0=p; + char* so0=so; + while(*p>=0) + { + long t=*p; + if(!epsi(t)) + { + *(so++)=*si; + si++; + } + p++; + } + si=si0; + p=p0; + while(*p>=0) + { + long t=*p; + if(!epso(t)) + if(defo(t)) + *(so++)=*si; + else + *(so++)=output(t); + if(!epsi(t)) si++; + p++; + } + return so-so0; +} + +//---------------------------------------------------------------------------- + +template +long TFT::prn_ooii(const I* si, long* p, O* so) const +{ + + const char* si0=si; + long* p0=p; + char* so0=so; + while(*p>=0) + { + long t=*p; + if(!epso(t)) + { + if(defo(t)) + *(so++)=*si; + else + *(so++)=output(t); + } + if(!epsi(t)) si++; + p++; + } + si=si0; + p=p0; + while(*p>=0) + { + long t=*p; + if(!epsi(t)) + *(so++)=*(si++); + p++; + } + return so-so0; +} + +//--------------------------------------------------------------------------- + +template +void TFT::sort() +{ + long t=0; + while(t1) + { + long eps=-1; + long def=-1; + for(int i=0; i=0 && epseps) def--; + if(def>=0 && def=0) + { + memmove(tt+t0+def+1,tt+t0+def,tn-eps-2); + tt[t-2]=temp; + } + else + { + memmove(tt+t0+def+1,tt+t0+def,tn-eps-2); + tt[t-1]=temp; + } + } + while(t0 +void TFT::setiotypes() +{ + int i=0; + const char* it=typeid(I).name(); + while(*it) + if(*it==' ') + { it++; continue; } + else + itype[i++]=*(it++); + itype[i]='\0'; + + i=0; + const char* ot=typeid(O).name(); + while(*ot) + if(*ot==' ') + { ot++; continue; } + else + otype[i++]=*(ot++); + otype[i]='\0'; +}; + +//--------------------------------------------------------------------------- + +template +void TFT::prntt(ostream& os) +{ + for(long i=0; i 
+#include +#include +//#include + +#include "tft.h" +//--------------------------------------------------------------------------- + +template +class TFTi : public TFT > +{ +public: + TFTi() : TFT >() {}; + TFTi(const char* filename) + : TFT >() { load(filename); }; + + void read(const char* filename); + void read(istream& is=cin); + void write(const char* filename); + void write(ostream& os=cout); + void load(const char* filename); + void load(FILE* f=stdin); + void save(const char* filename); + void save(FILE* f=stdout); + void clear(); + using TFT >::ttn; + using TFT >::states; + using TFT >::transitions; + using TFT >::itype; + using TFT >::ftTYPELEN; + using TFT >::otype; + using TFT >::tt; + using TFT >::copy_default; + using TFT >::print_mode; + + +// friend istream& operator>>(istream&, TFTi&); +// friend ostream& operator<<(ostream&, const TFTi&); +}; + +//--------------------------------------------------------------------------- + +template +void TFTi::read(const char* filename) +{ + ifstream is(filename); + if(!is) { fprintf(stderr,"Failed to open input file."); exit(1); } + read(is); +} + +template +void TFTi::read(istream& is) +{ + long *si; // state-index relation + long ci=0; // current index + char ch; // character read; + int empty=0; // no of states with 0 trans? 
+ char intype[FT::ftTYPELEN]; + char outtype[FT::ftTYPELEN]; + + clear(); + + is >> states >> transitions >> intype >> outtype; + +// if(strcmp(intype,itype)!=0 || +// strcmp(outtype,otype)!=0 && strcmp(outtype,"void")!=0) +// { is.clear(ios::badbit); goto end; }; + + while(is.peek()==' ' || is.peek()=='\t') is.get(ch); + while(is.peek()!='\n') + { + char s[20]; + is >> s; + if(strcmp(s,"COPY")==0 && strcmp(intype,outtype)==0) copy_default=true; + else if(strcmp(s,"NOCOPY")==0) copy_default=false; + else if(strcmp(s,"II")==0) print_mode=FT::II; + else if(strcmp(s,"OO")==0) print_mode=FT::OO; + else if(strcmp(s,"IOIO")==0) print_mode=FT::IOIO; + else if(strcmp(s,"OIOI")==0) print_mode=FT::OIOI; + else if(strcmp(s,"IIOO")==0) print_mode=FT::IIOO; + else if(strcmp(s,"OIOI")==0) print_mode=FT::OIOI; + while(is.peek()==' ' || is.peek()=='\t') is.get(ch); + } + + ttn=transitions+2; // 1 state without trans., 1 additional + si=new long[states]; + tt=new TTrans_i[ttn]; + + for(long cs=0;cs> cscheck; + if(cs!=cscheck) goto end; + + while(is.peek()==' ' || is.peek()=='\t') is.get(ch); + + is.get(ch); + if(!is) goto end; + switch(ch) + { + case '-': tt[ci].final(false); break; + case '+': tt[ci].final(true); break; + default: goto end; + } + tc=0, tt[ci].continued(false); + + while(is.peek()==' ' || is.peek()=='\t') is.get(ch); + while(is && is.peek()!='\n') + { + switch(is.peek()) + { + case '~': tt[ci].epsi(true); tt[ci].defi(true); is.get(ch); + break; + case '@': tt[ci].epsi(false); tt[ci].defi(true); is.get(ch); + break; + default : tt[ci].geti(is); + } + if(!is) goto end; + if(is.peek()=='/') + { + is.get(ch); + switch(is.peek()) + { + case '~': tt[ci].epso(true); tt[ci].defo(true); is.get(ch); + break; + case '@': tt[ci].epso(false); tt[ci].defo(true); is.get(ch); + break; + default : tt[ci].geto(is); + } + } + else + { + tt[ci].defo(true); + if(copy_default) tt[ci].epso(false); else tt[ci].epso(true); + } + if(!is) goto end; + + unsigned long transition; + is >> 
transition; + tt[ci].next(transition); + + tt[ci].continued(false); + tt[ci].empty(false); + + if(tc>0) tt[ci-1].continued(true); + tc++,ci++; + } + if(tc==0) + { + if(++empty>2) { fprintf(stderr, "Nondeterministic automaton."); exit(1); } + tt[ci].empty(true); + ci++; + } + is.get(ch); + if(ch!='\n') { is.clear(ios::badbit); goto end; } + } + + ttn=transitions+empty; + if(ttn!=ci) { is.clear(ios::badbit); goto end; }; + for(long i=0;i +void TFTi::write(const char* filename) +{ + ofstream os(filename); + if(!os) err("Failed to open output file."); + write(os); +} + +template +void TFTi::write(ostream& os) +{ + os << states << ' ' << transitions << ' '; +// os << itype << ' ' << otype << ' '; + os << "char void"; +// os << (copy_default ? "COPY" : "NOCOPY") << ' '; +// switch(print_mode) +// { +// case FT::II : os << "II"; break; +// case FT::OO : os << "OO"; break; +// case FT::IOIO: os << "IOIO"; break; +// case FT::OIOI: os << "OIOI"; break; +// case FT::IIOO: os << "IIOO"; break; +// case FT::OOII: os << "OOII"; +// } + os << '\n'; + + long* si=new long[ttn]; + long cs=0; + for(long i=0;i +void TFTi::load(const char* filename) +{ + FILE* f; + if(*filename) + f=fopen(filename,"rb"); + else + f=stdin; + if(!f) { fprintf(stderr, "Cannot open automaton file."); return; } + load(f); +} + +template +void TFTi::load(FILE* f) +{ + + clear(); + + if(fread(&ttn,sizeof(ttn),1,f)!=1) { fprintf(stderr, "Binary input error."); return;} + if(fread(&states,sizeof(states),1,f)!=1) { fprintf(stderr, "Binary input error."); return;} + if(fread(&transitions,sizeof(transitions),1,f)!=1) { fprintf(stderr, "Binary input error."); return;} + if(fread(itype,sizeof(char),ftTYPELEN,f)!=ftTYPELEN) { fprintf(stderr, "Binary input error."); return;} + if(fread(otype,sizeof(char),ftTYPELEN,f)!=ftTYPELEN) { fprintf(stderr, "Binary input error."); return;} + if(fread(©_default,sizeof(copy_default),1,f)!=1) { fprintf(stderr, "Binary input error."); return;} + 
if(fread(&print_mode,sizeof(print_mode),1,f)!=1) { fprintf(stderr, "Binary input error."); return;} + if((tt=new TTrans_i[ttn])==NULL) { fprintf(stderr, "Cannot allocate memory for tt."); return;} + if(fread(tt,sizeof(TTrans_i),ttn,f)!=ttn) { fprintf(stderr, "Binary input error."); return; } + fclose(f); + + +} + +//--------------------------------------------------------------------------- + +template +void TFTi::save(const char* filename) +{ + FILE* f; + if(*filename) + f=fopen(filename,"wb"); + else + f=stdout; + if(!f) err("Cannot open file."); + save(f); +} + +template +void TFTi::save(FILE* f) +{ + if(fwrite(&ttn,sizeof(ttn),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); } + if(fwrite(&states,sizeof(states),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); } + if(fwrite(&transitions,sizeof(transitions),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); } + if(fwrite(itype,sizeof(char),ftTYPELEN,f)!=ftTYPELEN) { fprintf(stderr,"Binary output error."); exit(1); } + if(fwrite(otype,sizeof(char),ftTYPELEN,f)!=ftTYPELEN) { fprintf(stderr,"Binary output error."); exit(1); } + if(fwrite(©_default,sizeof(copy_default),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); } + if(fwrite(&print_mode,sizeof(print_mode),1,f)!=1) { fprintf(stderr,"Binary output error."); exit(1); } + if(fwrite(tt,sizeof(TTrans_i),ttn,f)!=ttn) { fprintf(stderr,"Binary output error."); exit(1); } + fclose(f); +} + +//--------------------------------------------------------------------------- + +template +void TFTi::clear() +{ + if(tt) delete[] tt; + ttn=0; +} + +//--------------------------------------------------------------------------- +/* +template +istream& operator>>(istream& is, TFTi& ft) +{ + long *si; // state-index relation + long ci=0; // current index + char ch; // character read; + int empty=0; // no of states with 0 trans? 
+ char intype[FT::ftTYPELEN]; + char outtype[FT::ftTYPELEN]; + + ft.clear(); + + is >> ft.states >> ft.transitions >> intype >> outtype; + + if(strcmp(intype,ft.itype)!=0 || + strcmp(outtype,ft.otype)!=0 && strcmp(outtype,"void")!=0) + { is.clear(ios::badbit); return is; }; + + while(is.peek()==' ' || is.peek()=='\t') is.get(ch); + while(is.peek()!='\n') + { + char s[20]; + is >> s; + if(strcmp(s,"COPY")==0 && strcmp(intype,outtype)==0) ft.copy_default=true; + else if(strcmp(s,"NOCOPY")==0) ft.copy_default=false; + else if(strcmp(s,"II")==0) ft.print_mode=FT::II; + else if(strcmp(s,"OO")==0) ft.print_mode=FT::OO; + else if(strcmp(s,"IOIO")==0) ft.print_mode=FT::IOIO; + else if(strcmp(s,"OIOI")==0) ft.print_mode=FT::OIOI; + else if(strcmp(s,"IIOO")==0) ft.print_mode=FT::IIOO; + else if(strcmp(s,"OIOI")==0) ft.print_mode=FT::OIOI; + while(is.peek()==' ' || is.peek()=='\t') is.get(ch); + } + + ft.ttn=ft.transitions+2; // 1 state without trans., 1 additional + si=new long[ft.states]; + ft.tt=new TTrans_i[ft.ttn]; + + for(long cs=0;cs> ch; while(ch!='+' && ch!='-'); + switch(ch) + { + case '-': ft.tt[ci].final(false); break; + case '+': ft.tt[ci].final(true); break; + default: return is; + } + tc=0, ft.tt[ci].continued(false); + while((is.get(ch),ch==' ')) + { + if(!is) return is; + switch(is.peek()) + { + case '~': ft.tt[ci].epsi(true); ft.tt[ci].defi(true); is.get(ch); + break; + case '@': ft.tt[ci].epsi(false); ft.tt[ci].defi(true); is.get(ch); + break; + default : ft.tt[ci].geti(is); + } + if(!is) return is; + if(is.peek()=='/') + { + is.get(ch); + switch(is.peek()) + { + case '~': ft.tt[ci].epso(true); ft.tt[ci].defo(true); is.get(ch); + break; + case '@': ft.tt[ci].epso(false); ft.tt[ci].defo(true); is.get(ch); + break; + default : ft.tt[ci].geto(is); + } + } + else + { + ft.tt[ci].defo(true); + if(ft.copy_default) ft.tt[ci].epso(false); else ft.tt[ci].epso(true); + } + if(!is) return is; + + unsigned long transition; + is >> transition; + 
ft.tt[ci].next(transition); + + ft.tt[ci].continued(false); + + ft.tt[ci].empty(false); + if(tc>0) ft.tt[ci-1].continued(true); + tc++,ci++; + } + if(tc==0) + { + if(++empty>2) err("Nondeterministic automaton."); + ft.tt[ci].empty(true); + ci++; + } + if(ch!='\n') { is.clear(ios::badbit); return is; } + } + + ft.ttn=ft.transitions+empty; + if(ft.ttn!=ci) { is.clear(ios::badbit); return is; }; + for(long i=0;i +ostream& operator<<(ostream& os, const TFTi& ft) +{ + os << ft.states << ' ' << ft.transitions << ' ' + << ft.itype << ' ' << ft.otype << ' '; + os << (ft.copy_default ? "COPY" : "NOCOPY") << ' '; + switch(ft.print_mode) + { + case FT::II : os << "II"; break; + case FT::OO : os << "OO"; break; + case FT::IOIO: os << "IOIO"; break; + case FT::OIOI: os << "OIOI"; break; + case FT::IIOO: os << "IIOO"; break; + case FT::OOII: os << "OOII"; + } + os << ' ' << '\n'; + + long* si=new long[ft.ttn]; + long cs=0; + for(long i=0;i +class TFTiv : public TFTi +{ +public: + TFTiv() : TFTi() {}; + TFTiv(const char* filename) : TFTi(filename) {}; +}; + +//--------------------------------------------------------------------------- + +template +class TFTir : public TFTi +{ +public: + TFTir() : TFTi() {}; +}; + +//--------------------------------------------------------------------------- +#endif diff --git a/src/lib/ttrans.h b/src/lib/ttrans.h new file mode 100644 index 0000000..3f07084 --- /dev/null +++ b/src/lib/ttrans.h @@ -0,0 +1,206 @@ +#ifndef _TTransi_h +#define _TTransi_h +//--------------------------------------------------------------------------- +#include + +using namespace std; +//--------------------------------------------------------------------------- + +//! The template for a transition with input and output symbols stored internally. +/*! + A state is identified with the set of its outgoing transitions. + The state index is the index of the first transition for it. + A state with no outgoing transition is represented as an empty transition. 
+*/ +template +class TTrans_i +{ +public: +//private: +//! Input symbol + I i; +//! Output symbol + O o; + +public: + +//! state is final + static const unsigned char BITf=0x01; +//! transition list is continued + static const unsigned char BITc=0x02; +//! no transition + static const unsigned char BITe=0x04; +//! epsilon input + static const unsigned char BITepsi=0x08; +//! default input + static const unsigned char BITdefi=0x10; +//! epsilon output + static const unsigned char BITepso=0x20; +//! default output + static const unsigned char BITdefo=0x40; + +//! Flags + unsigned char flags; + +//! The index of the next state + long nxt; + +//! Input symbol. +//! \return The input symbol of the transition. + Ipass in() const { return i; } + +//! Output symbol. +//! \return The output symbol of the transition. + Opass out() const { return o; } + +//! Set the input symbol. +//! \param in input symbol + void in(Ipass in) { i=in; } + +//! Set the output symbol. +//! \param out output symbol + void out(Opass out) { o=out; } + +//! remark Is this needed? + I& iref() { return i; } + +//! remark Is this needed? + O& oref() { return o; } + +//! Test whether an input symbol is accepted. +//! \remark Simplified. Should rely on a test function provided by the user. + bool accepts(Ipass in) { return defi() || in==i; } + +//! Next state. +//! \return Destination state of the transition. + long next() const { return nxt; }; + +//! Set the next state. +//! \param t destination state of the transition + void next(long t) { nxt=t; }; + +//! Is the state final? +//! \return \c true if the state is final, false otherwise. + bool final() const { return flags&BITf; }; + +//! Set the \b final flag. +//! \param b \c true if the state is final, \c false otherwise. + void final(bool b) { if(b) flags|=BITf; else flags&=~BITf; }; + +//! Is the transition list continued? +//! \return \c true if the transition is not the last transition for the state, +//! \c false otherwise. 
+ bool continued() const { return flags&BITc; }; + +//! Set the \b continuation flag. +//! \param b \c true if the transition is not the last one for the state, \c false otherwise. + void continued(bool b) { if(b) flags|=BITc; else flags&=~BITc; }; + +//! Is the transition empty? +//! \return \c true if the transition is empty (represents a state with no outgoing transitions), +//! \c false otherwise. + bool empty() const { return flags&BITe; }; + +//! Set the \b empty flag. +//! \param b \c true if the transition is empty, \c false otherwise. + void empty(bool b) { if(b) flags|=BITe; else flags&=~BITe; }; + + bool epsi() const { return flags&BITepsi; }; + void epsi(bool b) { if(b) flags|=BITepsi; else flags&=~BITepsi; }; + + bool defi() const { return flags&BITdefi; }; + void defi(bool b) { if(b) flags|=BITdefi; else flags&=~BITdefi; }; + + bool epso() const { return flags&BITepso; }; + void epso(bool b) { if(b) flags|=BITepso; else flags&=~BITepso; }; + + bool defo() const { return flags&BITdefo; }; + void defo(bool b) { if(b) flags|=BITdefo; else flags&=~BITdefo; }; + + void geti(istream&); + void geto(istream&); + +// friend ostream& operator<<(ostream& os, const TTrans_i& t); + +}; + +//--------------------------------------------------------------------------- + +template +void getsym(istream& is, char& c) +{ + is >> c; + if(c=='\\') + { + is.get(c); + switch(c) + { + case 'n':c='\n';break; + case 't':c='\t';break; + } + } +} + +template +void getsym(istream& is, T& s) +{ is >> s; } + +//--------------------------------------------------------------------------- + +template +void TTrans_i::geti(istream& is) +{ getsym(is,iref()); }; + +template +void TTrans_i::geto(istream& is) +{ getsym(is,oref()); }; + +//--------------------------------------------------------------------------- +/* +template +ostream& operator<<(ostream& os, const TTrans_i& t) +{ + os << (t.final() ? '+' : '-'); + os << ' '; + + if(!t.empty()) + { + if(t.defi()) + os << (t.epsi() ? 
'~' : '@'); + else + switch(t.in()) + { + case ' ': os << "\\ "; break; + case '\n': os << "\\n"; break; + case '\t': os << "\\t"; break; + default: os << t.in(); + } + + os << '/'; + + if(t.defo()) + os << (t.epso() ? '~' : '@'); + else + switch(t.out()) + { + case ' ': os << "\\ "; break; + case '\n': os << "\\n"; break; + case '\t': os << "\\t"; break; + default: os << t.out(); + } + + os << ' ' << t.next(); + } + + os << '\n'; + + if(!t.continued()) + os << '\n'; + + return os; +} +*/ + +//--------------------------------------------------------------------------- +#endif + diff --git a/src/lib/word.cc b/src/lib/word.cc new file mode 100644 index 0000000..36277ee --- /dev/null +++ b/src/lib/word.cc @@ -0,0 +1,247 @@ +//--------------------------------------------------------------------------- +#include "word.h" +#include "auttools.h" +#include +#include +//--------------------------------------------------------------------------- +//--------------------------------------------------------------------------- + +void Word::autodescr(const char* fo, const char* de) +{ + strcpy(f,fo); + // len=strlen(f); + + char lemd[MAXDESCRLEN]; + int i=strcspn(de,","); + strncpy(lemd,de,i); + lemd[i]='\0'; + if(isdigit(lemd[0])) + fullform(f,lemd,l); // je¶li lemat zakodowany + else + strcpy(l,lemd); // je¶li lemat w pe³nej postaci + strcpy(d,de+i+1); +} + +//--------------------------------------------------------------------------- +bool Word::cmp_w(Word a, Word b) { + return (a.w_suf() > b.w_suf()); +} +//--------------------------------------------------------------------------- +bool Word::cmp_w_rev(Word a, Word b) { + return (a.w_suf() < b.w_suf()); +} +//--------------------------------------------------------------------------- +bool cmp_w_fun(Word a, Word b) { + return (a.w_suf() > b.w_suf()); +} +//--------------------------------------------------------------------------- +bool cmp_w_rev_fun(Word a, Word b) { + return (a.w_suf() < b.w_suf()); +} 
+//--------------------------------------------------------------------------- + +istream& operator>>(istream& is, Word& w) +{ + char temp[Word::MAXLEN+1]; + char c; + + int i=0; + while(i'; + return os; +} + +//--------------------------------------------------------------------------- +Words::~Words() { + // for (int i=0; imax && !tab[i].returned) { + max = w; + result = i; + } + } + if (result != -1) + tab[result].returned = 1; + return result; +} + +//--------------------------------------------------------------------------- +void Words::sort() { + std::sort(tab.begin(), tab.end(), Word::cmp_w); +} + +//--------------------------------------------------------------------------- +void Words::sort_rev() { + std::sort(tab.begin(), tab.end(), cmp_w_rev_fun); +} + +//--------------------------------------------------------------------------- + +int Words::add(const char* fo) +{ + int i = find(fo); +if(i!=-1) { + return i; + } + + if (cnt>=tab.capacity()-1) + tab.resize(tab.size()*2); + + + Word o; + o.form(fo); + o.w_suf(0.0); + tab.push_back(o); +// tab[cnt].form(fo); +// tab[cnt].w_suf(0.0); + + + // if(cntform(fo); + tab[cnt]->w_suf(0.0); + tab[cnt]->w_pref(0.0);*/ + return cnt++; + // } + //return -1; +} + +//--------------------------------------------------------------------------- + //TYMCZASOWO TAK(DLA CORA) +int Words::add(const char* fo, float weight) +{ + int i = find(fo); + if(i!=-1) { + return i; + } + + if (cnt>=tab.capacity()-1) + tab.resize(tab.size()*2); + + Word o; + o.form(fo); + o.w_suf(weight); + tab.push_back(o); +// tab[cnt].form(fo); +// tab[cnt].w_suf(weight); + + return cnt++; + // } + //return -1; +} + +//--------------------------------------------------------------------------- + +int Words::add(const char* fo, const char* des) +{ + char d[Word::MAXDESCRLEN]; + int l=strcspn(des,","); + int ok=1; + if( *(des+l) == ',' ) + { + strcpy(d,des+l+1); + // printf("\t%s->%s,\n", des, d); + int i=find(fo, d); + if(i!=-1) + return i; + } + else + 
ok=0; + + if (cnt>=tab.capacity()-1) + tab.resize(tab.size()*2); + + tab[cnt].form(fo); + if(ok) + tab[cnt].autodescr(fo, des); + else + tab[cnt].autodescr(fo, "?,?"); + + tab[cnt].w_suf(0.0); + tab[cnt].returned = 0; + /* + // if(cntform(fo); + tab[cnt]->autodescr(fo,des); + tab[cnt]->w_suf(0.0); + tab[cnt]->w_pref(0.0); + // printf("ok!\n");*/ + return cnt++; + // } + // printf("hm\n"); + return -1; +} + +//--------------------------------------------------------------------------- +void Words::prn(ostream& os) +{ + for(int i=0; i"; +} + +//--------------------------------------------------------------------------- + +ostream& operator<<(ostream& os, Words& tab) +{ + /* for(int i=0; i +#include +#include +//--------------------------------------------------------------------------- + +using namespace std; + + + +class Word +{ +public: + static const int MAXLEN=64; // dac do global + static const int MAXDESCRLEN=80; // dac do global + +private: + /// word form + char f[MAX_FORM]; // w wolnej chwili nazwy mozna zamienic na dluzsze + + /// length + int _len_suf; // dlugosc dopasowania koncowki... + // int _len_pref; // ... 
i prefiksu + + /// lemma + char l[MAX_FORM]; + + /// description + char d[MAX_DESC]; + + /// weight (probability) + float _w_suf; + // float _w_pref; +public: + static bool cmp_w(Word a, Word b); + static bool cmp_w_rev(Word a, Word b); + + Word() : _len_suf(-1) { *f='\0'; returned=0; }; + Word(const char* fo, const char* des) : _len_suf(-1) { autodescr(fo,des); _w_suf=1.0; returned=0; }; + + Word(const Word& w); + + char* form() { return f; } // przywrocic const + char* lemma() { return l; } // przywrocic const + char* descr() { return d; } + float w_suf() { return _w_suf; }; + int len_suf() { return _len_suf; } + + + void form(const char* s) { strcpy(f,s); } + void lemma(const char* s) { strcpy(l,s); } + void descr(const char* s) { strcpy(d,s); }; + void w_suf(float x) { _w_suf=x; }; + void len_suf(int n) { _len_suf=n; }; + + bool operator==(const Word& w); + bool operator!=(const Word& w); + int cmp(const Word&); + int cmpi(const Word&); + + char* operator!() { return f; }; + + operator bool() { return _len_suf>0; }; + + char* str() { return f; } + + void autodescr(const char* fo, const char* des); + + friend istream& operator>>(istream& is, Word& m); + friend ostream& operator<<(ostream& os, Word& m); + + bool returned; + +}; + + +inline Word::Word(const Word& word) +{ strcpy(f,word.f); strcpy(l,word.l); strcpy(d,word.d); _len_suf=word._len_suf; _w_suf=word._w_suf; returned = 0; } + +//--------------------------------------------------------------------------- + +inline bool Word::operator==(const Word& w) +{return _len_suf==w._len_suf && + !strcmp(f,w.f) && !strcmp(l,w.l) && !strcmp(d,w.d); } + +//--------------------------------------------------------------------------- + +inline bool Word::operator!=(const Word& w) +{return _len_suf!=w._len_suf || + strcmp(f,w.f) || strcmp(l,w.l) || strcmp(d,w.d);} + +//--------------------------------------------------------------------------- + +inline int Word::cmp(const Word& w) { return strcmp(f,w.f); } + 
+//--------------------------------------------------------------------------- + +//inline int Word::cmpi(const Word& w) { return PL.cmpi(f,w.f); } + +//--------------------------------------------------------------------------- + + + + +bool cmp_w_fun(Word a, Word b); +bool cmp_w_rev_fun(Word a, Word b); + + +//--------------------------------------------------------------------------- +//--------------------------------------------------------------------------- + +class Words +{ + private: + int find(const char* word); + int find(const char* word, const char* descr); + public: + + static const int MAX=1024; + + Words() : cnt(0) {tab.resize(MAX); }; + ~Words(); + Word& operator[](int i) { return tab[i]; } + int count() const { return cnt; } + void clear() { cnt=0; tab.clear(); } + int add(const char* fo); + int add(const char* fo, float weight); + int add(const char* fo, const char* des); + + /* zwraca index nastepnego wyniku, podczas pierwszego wywolania + * zwraca index wyniku o najwiekszej wadze, przy drugim wywolaniu + * wynik z druga najwyzsza waga, itd. + * Jezeli nie ma juz wynikow - zwraca -1. 
+ */ + int next(); + + void sort(); + void sort_rev(); + + void prn(ostream& os); + +// friend class Lem; +// friend class AuxLem; + friend ostream& operator<<(ostream& os, Words& tab); + vector tab; + int cnt; + +}; + +//--------------------------------------------------------------------------- + +#endif + diff --git a/src/mar/Makefile b/src/mar/Makefile new file mode 100644 index 0000000..b05cc77 --- /dev/null +++ b/src/mar/Makefile @@ -0,0 +1,16 @@ +include ../../config.mak +mar: + +.PHONY: install +install: +ifdef BIN_DIR + install -m 0755 mar $(BIN_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm $(BIN_DIR)/mar +endif + +clean: diff --git a/src/mar/mar b/src/mar/mar new file mode 100755 index 0000000..7b1077d --- /dev/null +++ b/src/mar/mar @@ -0,0 +1,336 @@ +#!/usr/bin/perl + +#package: UAM Text Tools +#component: mar +#version: 1.0 +#author: Marcin Walas + +#this program tags the tokenized file with given tags +#tags can be given in any order and configuration through the expression +#which is one of the parametres of the script +#contact: d287572@atos.wmid.amu.edu.pl, walasiek@gmail.com + +my $version = '1.0'; + +use lib "/usr/local/lib/utt"; +use lib "$ENV{'HOME'}/.local/lib/utt"; + +use strict; +use Getopt::Long; +use File::HomeDir; + +use attr; + + +my $LIB_DIR="/usr/local/lib/utt"; +my $systemconfigfile='/usr/local/etc/utt/mar.conf'; +my $userconfigfile=home()."/.utt/mar.conf"; + +Getopt::Long::Configure('no_ignore_case_always'); + +my $help=0; +my $pattern=0; +my $macrofile=0; +my $define=0; +my $command=0; +my $action="pgP"; +my $eos="seg(EOS)"; +my $explicit_space=0; +my $morfield='lem'; +my $tags=0; +my $show_version = 0; + +#read configuration files########################### +my $file; +foreach $file ($systemconfigfile, $userconfigfile){ + if(open(CONFIG, $file)){ + while () { + chomp; + s/#.*//; + s/^\s+//; + s/\s+$//; + next unless length; + my ($name, $value) = split(/\s*=\s*/, $_, 2); + if(($name eq "pattern")or($name eq 
"e")){ + $pattern=$value; + } + elsif($name eq "eos"){ + $eos=$value; + } + elsif($name eq "macros"){ + $macrofile=$value; + } + elsif($name eq "tags"){ + $tags=$value; + } + elsif($name eq "morph"){ + $morfield=$value; + } + elsif($name eq "command"){ + $command=1; + } + elsif($name eq "action"){ + $action=$value; + } + elsif($name eq "space"){ + $explicit_space=1; + } + elsif(($name eq "help")or($name eq "h")){ + $help=1; + } + + } + close CONFIG; + } +} +######################################################### + +GetOptions("pattern|e=s" => \$pattern, + "eos|E=s" => \$eos, + "macros=s" => \$macrofile, + "define=s" => \$macrofile, + "command" => \$command, + "action=s" => \$action, + "help|h" => \$help, + "space|s" => \$explicit_space, + "version|v" => \$show_version, + ); + + + +if($show_version){ + print "Version: $version\n"; + exit 0; +} + +if($help) +{ + print <<'END' +Usage: mar [OPTIONS] [file ..] + +Options: + --pattern -e PATTERN Pattern. + --eos -E PATTERN Segment serving as sentence beginning marker. [TODO] + --macros=FILE Read macrodefinitions from FILE. [TODO] + --define=FILE Add macrodefinitions from FILE. [TODO] + --action -a [p][s][P] Perform only indicated actions. + p - preprocess + s - search + P - postprocess + (default psP) + --command Print generated shell command and exit. + --help -h Print help. + --version -v Script version + +In patern you can put any tag. Tags should begin with the @ character. +They don't have to be closed. +They can't contain white spaces! 
+ +Note: If you don't define any custom tags, whole pattern will be taged with + default tags (begining of match and end of match) + +Tags examples: + +mar -e '@BEG cat() @END' + it will find any adjectives in the text and tag them with surrounding tags +mar -e 'cat() @MYTAG cat()' + this will find two neighbouring adjectives and parcel them with tag MYTAG + +Some example patterns: +'word(domu)' - form of the word domu +'lexeme(dom)' - any form of lexeme dom +'space' - space +'cat()' - adjective + +You can use * in patterns to make zero or more counts of word. + +END +; + exit 0; +} + +die("$0: no pattern given. Run with -h to get help.\n") unless $pattern || $action !~ /g/; + +die("$0: macro file not found") unless + $macrofile or + -e "$LIB_DIR/terms.m4" and $macrofile="$LIB_DIR/terms.m4"; + +my $preproc = ($action =~ /p/) ? ' fla | ' : ''; + +my $postproc = ($action =~ /P/) ? ' | unfla ' : ''; + + +#this is our help function to cut the re to get another tag +#it takes only one argument which is our patern (after m4 processing) +#returns: the first root-level brace with content +sub cutRe +{ + my $i = 0; + my $level = 0; + my $text = $_[0]; + my $temp; + for( $i =0; $i < (length $text);$i++) + { + $temp = substr($text, $i,1); + if( $temp eq "(") + {#we have an opening + $level++; + } + elsif ( $temp eq ")") + {#we close + $level--; + } + if ( $level == 0) + { + $temp = substr($text,0,$i+1); + last; + } + } + $temp; +} + +#the same function as above althought it returns everything after the +#first root level brace +sub restRe +{ + my $i = 0; + my $level = 0; + my $text = $_[0]; + my $temp; + for( $i =0; $i < (length $text);$i++) + { + $temp = substr($text, $i,1); + if( $temp eq "(") + {#we have an opening + $level++; + } + elsif ( $temp eq ")") + {#we close + $level--; + } + if ( $level == 0) + { #we cut everything in the begining + $temp = substr($text,$i+1); + last; + } + } + $temp; +} + + +#here we are preparing re for extended matching +my @tags; + +#we must 
find what our the tags +#some pattern adjustment +my $end = 0; +my $temp = " ".$pattern." "; +$temp =~ s/(\@[^ ]*) (\@[^ ]* )/\1 \2/g; +$pattern = $temp; + +while ($end != 1) +{ + #we seek for the first tag in pattern + if ($temp =~ /^.*?\@(.*?) /) + { + #we add this to tags array + push (@tags, $1); + #and cut the pattern + $temp =~ s/^.*?\@(.*?) / /; + #print $temp."\n"; + } + else + { + #if we dont find any tags we end + $end = 1; + } +} + +#here we have our patern with tags removed (we set sections of ()) between tags +my $patternmod = "( ".$pattern." )"; +$patternmod =~ s/\s@.*?\s/\)\(/g; + +#discarding spaces +$patternmod =~ s/\s+/\\`'/g; #` +# quoting escaped commas +$patternmod =~ s/\\,/\\`\\`\\,''/g; +# quoting commas in {m,n} r.e. operator +$patternmod =~ s/(\{\d*),(\d*\})/\1\\`\\`,''\2/g; +#print "After m4:".$re."\n"; + +my $re = `echo \"$patternmod\" | m4 --define=ENDOFSEGMENT='[[:cntrl:]]' --define=MORFIELD=$morfield $macrofile - 2>/dev/null`; + +die("Incorrect pattern (m4).") if $? 
>> 8; + + +chomp $re; + +# <> expansion + +$re =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge; + +# Perl-like special sequences +$re =~ s/\./[^ [:cntrl:]]/g; +$re =~ s/\\s/[ ]/g; +$re =~ s/\\S/[^ [:cntrl:]]/g; +$re =~ s/\\d/[0-9]/g; +$re =~ s/\\D/[^0-9 [:cntrl:]]/g; +$re =~ s/\\w/[a-z󶼿A-ZʣӦ0-9_]/g; +$re =~ s/\\W/[^a-z󶼿A-ZʣӦ0-9_ [:cntrl:]]/g; +# extensions +$re =~ s/\\l/[a-z󶼿]/g; #lowercase letter +$re =~ s/\\L/[A-ZʣӦ]/g; #upercase letter + +my $sedcommand; +my $grepcommand; + +#now we must built a sed script from our re +#we do this by cuting our re each tag until we cut them all +#if an user dint input any tags we do our default +my $defBOM = "BOM"; +my $defEOM = "EOM"; +my $defTempTagBeg = "####TempTAGBEG####"; +my $defTempTagEnd = "####TempTAGEND####"; + +if (@tags == 0) +{ + $sedcommand = "sed -r 's/($re)/\\500 $defBOM *\\f\\1###EOM###/g; s/###EOM###([0-9]+)/\\1 00 $defEOM *\\f\\1/g'"; +} +else #we have custom tags +{ + #first tag is easy to tag :) + my $sedscript="sed -r 's/($re)/\\600 $defTempTagBeg *\\f\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $defTempTagEnd *\\f\\1/g;"; + #after first step we have temp tagged parts of input matching re + #now we need to insert our custom tags + #we will find temp tags and process our input + + my $i = 0; + #copy of re which will be cut + my $rec = $re; + my $restre = $re; + + for ($i = 0 ; $i < @tags ; $i++) + { + #re cutting + $rec = cutRe($restre); + $restre = restRe($restre); + if ($rec =~ / *\( *\) */) + { + $sedscript = $sedscript."s/([0-9]+) 00 $defTempTagBeg \\*\\f([0-9]+)/\\2 00 $tags[$i] *\\f\\2 00 $defTempTagBeg *\\f\\2/g;"; + } + else + { + $sedscript = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f($rec)/\\1###EOM###/g;s/###EOM###([0-9]+)/\\1 00 $tags[$i] *\\f\\1 00 $defTempTagBeg *\\f\\1/g;"; + } + + } + + $sedcommand = $sedscript."s/[0-9]+ 00 $defTempTagBeg \\*\\f//g;s/[0-9]+ 00 $defTempTagEnd \\*\\f//g'"; +} + +if($command) +{ + print $sedcommand."\n"; + exit 0; +} +exec $preproc.$sedcommand.$postproc; diff --git 
a/src/rm12/Makefile b/src/rm12/Makefile new file mode 100644 index 0000000..d1e0119 --- /dev/null +++ b/src/rm12/Makefile @@ -0,0 +1,16 @@ +include ../../config.mak +rm12: + +.PHONY: install +install: +ifdef BIN_DIR + install -m 0755 rm12 $(BIN_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm $(BIN_DIR)/rm12 +endif + +clean: diff --git a/src/rm12/rm12 b/src/rm12/rm12 new file mode 100755 index 0000000..b1b4fec --- /dev/null +++ b/src/rm12/rm12 @@ -0,0 +1,10 @@ +#!/bin/sed -rf + +#package: UAM Text Tools +#component: rm12 +#version: 1.0 +#author: Tomasz Obrebski + + +/[0-9]+[ \t]+[0-9]+[ \t]+BOS/! +s/[0-9]+[ \t]+[0-9]+[ \t]// diff --git a/src/rs12/Makefile b/src/rs12/Makefile new file mode 100644 index 0000000..8a28263 --- /dev/null +++ b/src/rs12/Makefile @@ -0,0 +1,26 @@ +include ../../config.mak + +ifeq ($(BUILD_STATIC), yes) + LDFLAGS += -static +endif + +LDFLAGS += +CFLAGS += -O2 + +rs12: + $(CC) $(CFLAGS) rs12.c -o rs12 $(LDFLAGS) + +.PHONY: install +install: +ifdef BIN_DIR + install -m 0755 rs12 $(BIN_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm $(BIN_DIR)/rs12 +endif + +clean: + rm rs12 || true diff --git a/src/rs12/rs12.c b/src/rs12/rs12.c new file mode 100644 index 0000000..cbed10e --- /dev/null +++ b/src/rs12/rs12.c @@ -0,0 +1,66 @@ +#include +#include +#include + +#define MAXLINE 1000 + +int main() +{ + char buf[MAXLINE]; + char outbuf[MAXLINE]; + char form[MAXLINE]; + int len; + int curpos; + int nextpos=0; + int a; + int b; + + while( fgets(buf, MAXLINE, stdin) ) + { + int n = sscanf(buf, "%d %d", &a, &b); + if( 2 == n ) + { + nextpos = a + b; + fputs(buf, stdout); + } + else + { + if( 1 == n ) + { + curpos = a; + sscanf(buf, "%*d %*s %s", form); + } + else + { + curpos = nextpos; + sscanf(buf, "%*s %s", form); + } + + if( '*' == *form ) + { + len = 0; + } + else + { + char *f = form; + for( len = 0; *f; ++f ) + { + if( '\\' != *f) + { + ++len; + } + } + } + + char *buf1 = buf; + while( !isalpha(*buf1) ) + { + 
++buf1; + } + sprintf(outbuf, "%04i %02i %s", curpos, len, buf1); + fputs(outbuf, stdout); + nextpos = curpos + len; + } + } + return 0; +} diff --git a/src/sen-nl/Makefile b/src/sen-nl/Makefile new file mode 100644 index 0000000..d12d194 --- /dev/null +++ b/src/sen-nl/Makefile @@ -0,0 +1,17 @@ +include ../../config.mak + +sen-nl: + +.PHONY: install +install: +ifdef BIN_DIR + install -m 0755 sen-nl $(BIN_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm $(BIN_DIR)/sen-nl +endif + +clean: diff --git a/src/sen-nl/sen-nl b/src/sen-nl/sen-nl new file mode 100755 index 0000000..cecd541 --- /dev/null +++ b/src/sen-nl/sen-nl @@ -0,0 +1,3 @@ +#!/bin/bash + +sed -r '1 s/^(([0-9]+)[ \t][0-9]+[ \t].*)$/\2 00 BOS \*\n\1/;t;$! s/(([0-9]+)[ \t][0-9]+[ \t][[:alpha:]]+[ \t]+[[:print:]]*\\n.*)$/\2 00 EOS *\n\2 00 BOS *\n\1/; $ s/^(([0-9]+) .*)$/\1\n\2 00 EOS */' diff --git a/src/ser/Makefile b/src/ser/Makefile new file mode 100644 index 0000000..7fa4704 --- /dev/null +++ b/src/ser/Makefile @@ -0,0 +1,17 @@ +include ../../config.mak + +ser: + +.PHONY: install +install: +ifdef BIN_DIR + install -m 0755 ser $(BIN_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm $(BIN_DIR)/ser +endif + +clean: diff --git a/src/ser/ser b/src/ser/ser new file mode 100755 index 0000000..ab7bf0f --- /dev/null +++ b/src/ser/ser @@ -0,0 +1,198 @@ +#!/usr/bin/perl + +#package: UAM Text Tools +#component: ser (pattern search tool) +#version: 1.0 +#author: Tomasz Obrebski + +use strict; +use Getopt::Long; +use File::Temp; +use File::HomeDir; + +my $LIB_DIR="/usr/local/lib/utt"; +my $systemconfigfile='/usr/local/etc/utt/ser.conf'; +my $userconfigfile=home()."/.utt/ser.conf"; + +Getopt::Long::Configure('no_ignore_case_always'); + +my $help=0; +my $pattern=0; +my $only_matching=0; +my $no_markers=0; +my $macros=0; +my $flextemplate=0; +my $flex=0; +my $morfield='lem'; +my $tags=0; + +#read configuration files########################### +my $file; +foreach $file ($systemconfigfile, 
$userconfigfile){ + if(open(CONFIG, $file)){ + while () { + chomp; + s/#.*//; + s/^\s+//; + s/\s+$//; + next unless length; + my ($name, $value) = split(/\s*=\s*/, $_, 2); + if(($name eq "pattern")or($name eq "e")){ + $pattern=$value; + } + elsif($name eq "morph"){ + $morfield=$value; + } + elsif(($name eq "only-matching")or($name eq "m")){ + $only_matching=1; + } + elsif(($name eq "no-markers")or($name eq "M")){ + $no_markers=1; + } + elsif($name eq "macros"){ + $macros=$value; + } + elsif($name eq "flex-template"){ + $flextemplate=$value; + } + elsif($name eq "tags"){ + $tags=$value; + } + elsif($name eq "flex"){ + $flex=1; + } + elsif(($name eq "help")or($name eq "h")){ + $help=1; + } + + } + close CONFIG; + } +} +######################################################### + +GetOptions("pattern|e=s" => \$pattern, + "morph=s" => \$morfield, + "only-matching|m" => \$only_matching, + "no-markers|M" => \$no_markers, + "macros=s" => \$macros, + "flex-template=s" => \$flextemplate, + "tags=s" => \$tags, + "flex" => \$flex, + "help|h" => \$help); + +if($help) +{ + print <<'END' +Usage: ser [OPTIONS] [file ..] + +Options: + --help -h Help. + --pattern=PATTERN -e PATTERN Search pattern. + --morph=STRING Field containing morphological information (default 'lem'). + --macros=FILE Read macrodefinitions from FILE. + --flex-template=FILE Read flex code template from FILE. + --tags=STRING Morphosyntactic tag format. + --only-matching -m Print only fragments matching PATTERN. + --no-markers -M Do not print BOM and EOM markers [TODO]. + --flex Print only the generated flex code and exit. 
+END +; + exit 0; +} + + +die("$0: no pattern given.\n") unless $pattern; + +die("$0: flex template file not found") unless + $flextemplate or + -e "$LIB_DIR/ser.l.template" and $flextemplate="$LIB_DIR/ser.l.template"; + +die("$0: macro file not found") unless + $macros or + -e "$LIB_DIR/terms.m4" and $macros="$LIB_DIR/terms.m4"; + +die("$0: $tags.tag2re program not found") unless + 1; #JAK NAPISAC WARUNEK??? + +die("$0: undefined tagset format (tags option missing)") unless + $tags; + + +#$pattern =~ s/cat\(([^)]+)\)/'cat('.pre($1).')'/ge; +# quoting escaped commas /NIE DZIAA/ +$pattern =~ s/\\,/\\`\\`\\,''/g; + +# protecting backslash +$pattern =~ s/\\/\\\\\\/g; + +# discarding spaces +$pattern =~ s/\s+/\\`'/g; #` + + +my $flexpattern = `echo \"$pattern\" | m4 --define=ENDOFSEGMENT=\\\\n --define=MORFIELD=$morfield $macros - 2>/dev/null`; + +die("Incorrect pattern (m4).") if $? >> 8; + + +chomp $flexpattern; + +# <> expansion +$flexpattern =~ s/<([^>]+)>/`echo $1 | $tags.tag2re`/ge; + +# restricting the value of the . special symbol +$flexpattern =~ s/\./[^ \\t\\n\\r\\f]/g; + +# perl-like shortcuts for character classes +# perl exact +$flexpattern =~ s/\\s/[ \\t]/g; +$flexpattern =~ s/\\S/[^ \\t\\n\\r\\f]/g; +$flexpattern =~ s/\\d/[0-9]/g; +$flexpattern =~ s/\\D/[^0-9 \\t\\n\\r\\f]/g; +$flexpattern =~ s/\\w/[a-z󶼿A-ZʣӦ0-9_]/g; +$flexpattern =~ s/\\W/[^a-z󶼿A-ZʣӦ0-9_ \\t\\n\\r\\f]/g; +# extensions +$flexpattern =~ s/\\l/[a-z󶼿]/g; #lowercase letter +$flexpattern =~ s/\\L/[A-ZʣӦ]/g; #upercase letter + +# protecting slash +$flexpattern =~ s/\//\\\//g; + +my $defaultaction = ($only_matching) ? '' : 'ECHO'; + +# docelowo posrednie pliki powinny byc w jakims tempie !!! 
+ +(undef, my $tmpfile_l) = File::Temp::tempfile(SUFFIX=>'.l'); +(undef, my $tmpfile_c) = File::Temp::tempfile(SUFFIX=>'.c'); +(undef, my $tmpfile_x) = File::Temp::tempfile(); + +# w tych `` nie dziala +#`m4 "--define=PATTERN=$flexpattern" "--define=DEFAULTACTION=$defaultaction" $flextemplate > $tmpfile_l`; + +system "m4 \"--define=PATTERN=$flexpattern\" \"--define=DEFAULTACTION=$defaultaction\" $flextemplate > $tmpfile_l"; + +if($flex) +{ + # w tych `` nie dziala + system "cat $tmpfile_l"; +# if(open(FLEX, $tmpfile_l)) { +# while() { +# print @_; +# } +# close FLEX; +# } +# else { +# print "Unable to open file $tmpfile_l\n"; +# } + exit(0); +} + +`flex -o$tmpfile_c $tmpfile_l`; +`cc -O3 -o $tmpfile_x $tmpfile_c -lfl`; +#`$tmpfile_x`; + +system "$tmpfile_x"; + +unlink $tmpfile_l; +unlink $tmpfile_c; +unlink $tmpfile_x; diff --git a/src/tags/Makefile b/src/tags/Makefile new file mode 100644 index 0000000..7beb4bb --- /dev/null +++ b/src/tags/Makefile @@ -0,0 +1,19 @@ +include ../../config.mak + +all: + +.PHONY: install +install: +ifdef BIN_DIR + install -m 0755 ipi.tag2re $(BIN_DIR) + install -m 0755 uam.tag2re $(BIN_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm $(BIN_DIR)/ipi.tag2re + rm $(BIN_DIR)/uam.tag2re +endif + +clean: diff --git a/src/tags/README b/src/tags/README new file mode 100644 index 0000000..885f748 --- /dev/null +++ b/src/tags/README @@ -0,0 +1,5 @@ +In this directory files specific to different tag formats are stored. + +TAGSET.tag2re are command-line programs which translate a tag constraint +specification into a character-level regular expression matching +all tags in the TAGSET format meeting the specified constraint. 
diff --git a/src/tags/ipi.tag2re b/src/tags/ipi.tag2re new file mode 100755 index 0000000..f9bf1ab --- /dev/null +++ b/src/tags/ipi.tag2re @@ -0,0 +1 @@ +#TODO diff --git a/src/tags/uam.tag2re b/src/tags/uam.tag2re new file mode 100755 index 0000000..033a168 --- /dev/null +++ b/src/tags/uam.tag2re @@ -0,0 +1,89 @@ +#!/usr/bin/perl + +#package: UAM Text Tools +#component: tags for utt +#version: 1.0 +#author: Tomasz Obrebski + +use strict; +use locale; + +my $input = <>; +chomp $input; + +our $pos_re = qr/(?:[[:upper:]]+)/; +our $attr_re = qr/(?:[[:upper:]]+)/; +our $val_re = qr/(?:[[:lower:][:digit:]+?!*-]|<[^>\n]+>)/; +our $av_re = qr/(?:$attr_re$val_re+)/; +our $avlist_re = qr/(?:$av_re+)/; +our $cat_re = qr/(?:$pos_re(?:\/$avlist_re)?)/; + +print pre($input); + +sub parse ($) +{ + my ($dstr)=@_; + my $avs={}; + my ($cat,$attrlist) = split '/', $dstr; + ATTR: + while( $attrlist =~ /($attr_re)($val_re+)/g ) + { + my ($attrstr,$valstr)=($1,$2); + my %vals; + while($valstr =~ /$val_re/g) + { + my $val = $&; + next ATTR if $val eq '*'; + $val =~ s/^<([[:lower:]])>$/$1/; + $vals{$val}=1; + } + + $avs->{$attrstr} = \%vals; # dlaczego to dziala? %vals jest lokalne + } + [$cat, $avs]; +} + +sub unparse (\@) +{ + my ($cat,$avs)= @{shift @_}; + my $dstr=$cat; + my @attrs = keys %$avs; + if(@attrs) + { + $dstr .= '/'; + for my $attr ( sort @attrs ) + { + $dstr .= $attr . 
(join '', sort keys %{$avs->{$attr}}); + } + } + $dstr; +} + +sub canonize ($) +{ + unparse @{parse shift} ; +} + +sub pre +{ + my $pos_res = '[[:upper:]]+'; + my $attr_res = '[[:upper:]]+'; + my $val_res = '[[:lower:][:digit:]+?!*-]|<[^>\n[:cntrl:]]+>'; + my $av_res = "$attr_res($val_res)+"; + my $avlist_res = "($av_res)+"; + + my $pat = canonize(shift); + my $ret; + my ($pos,$avlist) = split /\//, $pat; + $ret = $pos.'(\/'; + while ($avlist =~ /($attr_res)(${val_res}+)/g) + { + my $attr = $1; + my $vals = $2; + my $vals = "($val_res)*(".join('|',($vals =~ /$val_res/g)).")($val_res)*"; + $ret .= "($av_res)*$attr$vals"; + } + $ret .= "($av_res)*)?"; + return $ret; +} + diff --git a/src/tok.c/Makefile b/src/tok.c/Makefile new file mode 100644 index 0000000..53c1673 --- /dev/null +++ b/src/tok.c/Makefile @@ -0,0 +1,51 @@ +include ../../config.mak + +ifeq ($(BUILD_STATIC), yes) + LDFLAGS += -static +endif + +LDFLAGS += +CXXFLAGS += -O2 + +LIB_PATH=../lib +CMDLINE_FILE='"../tok.c/cmdline.h"' +COMMON_PATH=../common + +tok_c: tok.o cmdline.c common_tok.o common.o + $(CXX) $(CXXFLAGS) -D _CMDLINE_FILE=$(CMDLINE_FILE) \ + tok.c cmdline.c common.o common_tok.o -o tok_c $(LDFLAGS) + +tok.o: tok.c cmdline.h + $(CXX) $(CXXFLAGS) -c -D _CMDLINE_FILE=$(CMDLINE_FILE) tok.c + +common_tok.o: cmdline.h common_tok.cc common_tok.h + $(CXX) $(CXXFLAGS) -c -D _CMDLINE_FILE=$(CMDLINE_FILE) common_tok.cc + +common.o: $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \ + $(COMMON_PATH)/common.h + $(CXX) $(CXXFLAGS) -c -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc + +cmdline.ggo: cmdline_tok.ggo ../common/cmdline_common.ggo + cat cmdline_tok.ggo ../common/cmdline_common.ggo > cmdline.ggo + +cmdline.c cmdline.h: cmdline.ggo + $(GENGETOPT) -i cmdline.ggo --conf-parser + +.PHONY: install +install: +ifdef BIN_DIR + install -m 0755 tok_c $(BIN_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm $(BIN_DIR)/tok_c +endif + +clean: clean.cmdline + rm *.o || true + 
rm tok_c || true + +clean.cmdline: + rm cmdline.* || true diff --git a/src/tok.c/cmdline_tok.ggo b/src/tok.c/cmdline_tok.ggo new file mode 100644 index 0000000..05929b1 --- /dev/null +++ b/src/tok.c/cmdline_tok.ggo @@ -0,0 +1,6 @@ +package "tok" +version "0.1" + +#section "UTT Tok" + +option "dictionary" d "Dictionary" string typestr="FILENAME" default="cor.bin" no diff --git a/src/tok.c/common_tok.cc b/src/tok.c/common_tok.cc new file mode 100644 index 0000000..68d2af1 --- /dev/null +++ b/src/tok.c/common_tok.cc @@ -0,0 +1,27 @@ +#include +#include +#include "common_tok.h" + +char dictionary[255]; + +void process_tok_options(gengetopt_args_info args) +{ + if(args.dictionary_given) + strcpy(dictionary, args.dictionary_arg); + else { + char path[256]; + sprintf(path, "/etc/utt/%s", DICT_FILE); + if (file_accessible(path) == 0) + strcpy(dictionary, path); + else { + sprintf(path, "%s/.utt/%s", getenv("HOME"), DICT_FILE); + if (file_accessible(path) == 0) + strcpy(dictionary, path); + else { + fprintf(stderr, "Cannot find dictionary!\n"); + exit(1); + } + } + } + +} diff --git a/src/tok.c/common_tok.h b/src/tok.c/common_tok.h new file mode 100644 index 0000000..327e6b0 --- /dev/null +++ b/src/tok.c/common_tok.h @@ -0,0 +1,15 @@ +#ifndef __COMMON_TOK__H +#define __COMMON_TOK__H + +#include +#include "../common/common.h" + +#include "cmdline.h" + +#define DICT_FILE "data/tok.bin" + +extern char dictionary[]; + +extern void process_tok_options(gengetopt_args_info args); + +#endif diff --git a/src/tok.c/tok.c b/src/tok.c/tok.c new file mode 100644 index 0000000..55b3e3d --- /dev/null +++ b/src/tok.c/tok.c @@ -0,0 +1,81 @@ + +#include +#include +#include +#include +#include +//#include "../lib/iotools.h" + +#include "cmdline.h" + + +char buf[257]; +int filepos=0; +char* tokstart; +char* tokend; +char tmp; +char tag; + + +gengetopt_args_info args; + + +inline +void printtoken(char tag) +{ + tmp=*tokend; + *tokend='\0'; + printf("%04d %02d %c %s\n", filepos, tokend-tokstart, 
tag, tokstart); + *tokend=tmp; + filepos+=tokend-tokstart; + tokstart=tokend; + if(args.interactive_flag) fflush(stdout); +} + +main(int argc, char** argv) +{ + + if (cmdline_parser(argc, argv, &args) != 0) + exit(1); + + printf("inter:%d\n",args.interactive_flag); + + // process_common_options(&args, argv[0]); + // process_tok_options(args); + + setlocale(LC_CTYPE,""); + setlocale(LC_COLLATE,""); + + while(fgets(buf,256,stdin)) + { + + tokstart=tokend=buf; + while(*tokend) + { + char *prev=tokend; + ++tokend; + if(isalpha(*prev) && !isalpha(*tokend)) + printtoken('W'); + else if(isdigit(*prev) && !isdigit(*tokend)) + printtoken('N'); + else if(isspace(*prev)) + { + switch(*prev) + { + case ' ': *prev='_'; break; + case '\t':*prev='t'; break; + case '\r':*prev='r'; break; + case '\f':*prev='f'; break; + case '\n':*prev='n'; + } + if(!isspace(*tokend)) + printtoken('S'); + } + else if(ispunct(*prev)) + printtoken('P'); + } + } + + cmdline_parser_free(&args); +} + diff --git a/src/tok.l/Makefile b/src/tok.l/Makefile new file mode 100644 index 0000000..698bbdc --- /dev/null +++ b/src/tok.l/Makefile @@ -0,0 +1,39 @@ +include ../../config.mak + +ifeq ($(BUILD_STATIC), yes) + LDFLAGS += -static +endif + +LDFLAGS += +CFLAGS += -O2 + +tok: lex.yy.c + $(CC) $(CFLAGS) -lfl -o tok lex.yy.c $(LDFLAGS) + +lex.yy.c: tok_cmdline.h tok_cmdline.c + $(FLEX) tok.l + +tok_cmdline.h tok_cmdline.c: + $(GENGETOPT) -i tok_cmdline.ggo --conf-parser --file=tok_cmdline + +.PHONY: install +install: +ifdef BIN_DIR + install -m 0755 tok $(BIN_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm $(BIN_DIR)/tok +endif + +clean: clean.cmdline clean.flex + rm tok || true + +clean.cmdline: + rm tok_cmdline.c || true + rm tok_cmdline.h || true + +clean.flex: + rm lex.yy.c || true diff --git a/src/tok.l b/src/tok.l/tok.l similarity index 97% rename from src/tok.l rename to src/tok.l/tok.l index 3dbbe5a..bc9606d 100644 --- a/src/tok.l +++ b/src/tok.l/tok.l @@ -55,14 +55,14 @@ %% -int 
main(int argc, char** argv) +/*int main(int argc, char** argv) { if (cmdline_parser(argc, argv, &args) != 0) return 1; setlocale(LC_CTYPE,""); setlocale(LC_COLLATE,""); yylex(); return 0; -} +}*/ int yywrap() { diff --git a/src/tok.l/tok_cmdline.ggo b/src/tok.l/tok_cmdline.ggo new file mode 100644 index 0000000..8b58931 --- /dev/null +++ b/src/tok.l/tok_cmdline.ggo @@ -0,0 +1,4 @@ +package "tok" +version "0.1" + +option "interactive" i "Interactive mode." flag off diff --git a/src/tok.pl/Makefile b/src/tok.pl/Makefile new file mode 100644 index 0000000..01cae27 --- /dev/null +++ b/src/tok.pl/Makefile @@ -0,0 +1,15 @@ +include ../../config.mak + +all: + +.PHONY: install +install: +ifdef BIN_DIR + install -m 0755 tok.pl $(BIN_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm $(BIN_DIR)/tok.pl +endif diff --git a/src/tok.pl/example.txt b/src/tok.pl/example.txt new file mode 100644 index 0000000..84b00c1 --- /dev/null +++ b/src/tok.pl/example.txt @@ -0,0 +1,19 @@ +0000 10 W dobierzcie +0010 01 S \n +0011 06 W afiszu +0017 01 S _ +0018 11 W ablacyjnego +0029 01 S _ +0030 02 N 23 +0032 01 S _ +0033 08 W aliantem +0041 01 S _ +0042 06 W aliant +0048 01 S _ +0049 03 W czu +0052 01 B \xC5 +0053 01 B \x82 +0054 05 W ostce +0059 01 S _ +0060 10 W dobieranym +0070 01 S \n diff --git a/src/tok.pl/in.txt b/src/tok.pl/in.txt new file mode 100644 index 0000000..055899d --- /dev/null +++ b/src/tok.pl/in.txt @@ -0,0 +1 @@ +dobierzcie afiszu ablacyjnego aliantem aliant czullostce dobieranym czullku abazzur diff --git a/src/tok.pl/tok.pl b/src/tok.pl/tok.pl new file mode 100755 index 0000000..991787c --- /dev/null +++ b/src/tok.pl/tok.pl @@ -0,0 +1,125 @@ +#!/usr/bin/perl + +#package: UAM Text Tools +#component: tok (tokenizer) +#version: 1.0 +#author: Tomasz Obrebski + +use strict; +use locale; +use Getopt::Long; +use File::HomeDir; + +my $max_form_length = 50; + +my $interactive=0; +my $help; + +my $systemconfigfile='/usr/local/etc/utt/tok.conf'; +#my 
$userconfigfile="$ENV{'HOME'}/.utt/tok.conf"; +my $userconfigfile=home()."/.utt/tok.conf"; + +#read configuration files########################### +my $file; +foreach $file ($systemconfigfile, $userconfigfile){ + if(open(CONFIG, $file)){ + while () { + chomp; + s/#.*//; + s/^\s+//; + s/\s+$//; + next unless length; + my ($name, $value) = split(/\s*=\s*/, $_, 2); + if(($name eq "interactive")or($name eq "i")){ + $interactive=1; + } + elsif(($name eq "help")or($name eq "h")){ + $help=1; + } + } + close CONFIG; + } +} +#########################################################s + +GetOptions("interactive|i" => \$interactive, + "help|h" => \$help); + +if($help) +{ + print <<'END' +Usage: tok [OPTIONS] + +Options: + --interactive Interactive (no output buffering). + --help -h Help. +END +; + exit 0; +} + + +$| = $interactive; + +my $offset = 0; + +while(<>) +{ + 1 while + / [[:alpha:]]+ (?{seg('W',$&)}) + | \d+ (?{seg('N',$&)}) + | \s+ (?{seg('S',$&)}) + | [[:punct:]] (?{seg('P',$&)}) + | . (?{seg('B',$&)}) + /gxo; +} + +# | [^[:print:]] (?{seg("B",$&)}) + +sub min { + my ($val1, $val2) = @_; + if($val1 < $val2) { + return $val1; + } + else { + return $val2; + } +} + + +sub seg +{ + my ($tag,$match) = @_; + my $length = length $match; + my $idx = 0; + while($idx < $length) { + my $l = min $max_form_length, $length - $idx; + my $m = substr $match, $idx, $l; + + printf "%04d %02d %s ", $offset + $idx, $l, $tag; + if($tag eq 'S') + { + for(my $i=0; $i<$l; ++$i) + { + my $c = substr $m, $i, 1; + print '_' if $c eq ' '; + print '\n' if $c eq "\n"; + print '\t' if $c eq "\t"; + print '\r' if $c eq "\r"; + print '\f' if $c eq "\f"; + } + } + elsif($tag eq 'B') + { + printf "\\x%02X", ord($m); + } + else + { + print $m; + } + print "\n"; + $idx += $l; + } # while($idx < $length) + $offset += $length; +} #sub seg + diff --git a/src/unfla/Makefile b/src/unfla/Makefile new file mode 100644 index 0000000..5ea3c2c --- /dev/null +++ b/src/unfla/Makefile @@ -0,0 +1,17 @@ +include 
../../config.mak + +all: + +.PHONY: install +install: +ifdef BIN_DIR + install -m 0755 unfla $(BIN_DIR) +endif + +.PHONY: uninstall +uninstall: +ifdef BIN_DIR + rm $(BIN_DIR)/unfla +endif + +clean: diff --git a/src/unfla/unfla b/src/unfla/unfla new file mode 100755 index 0000000..af65b9d --- /dev/null +++ b/src/unfla/unfla @@ -0,0 +1,8 @@ +#!/bin/sh + +#package: UAM Text Tools +#component: unfla +#version: 1.0 +#author: Tomasz Obrebski + +exec tr '\014' '\012'