Uaktualnilismy kora.
git-svn-id: svn://atos.wmid.amu.edu.pl/utt@23 e293616e-ec6a-49c2-aa92-f4a8b91c5d16
This commit is contained in:
parent
317d53bd6b
commit
ac7d97018e
@ -27,7 +27,7 @@ export UTT_SHARE_DIR=${UTT_DIR}/share
|
||||
##############################
|
||||
|
||||
# list of components to be included in the distribution
|
||||
COMPONENTS = compiledic cor fla gph grp kon kot lem mar rm12 rs12 sen-l sen-nl ser tags tok.l unfla
|
||||
COMPONENTS = compiledic cor fla gph grp kon kor kot lem mar rm12 rs12 sen-l sen-nl ser tags tok.l unfla
|
||||
# gue nie dziala!
|
||||
|
||||
##############################
|
||||
|
2
app/TODO
2
app/TODO
@ -1,8 +1,6 @@
|
||||
BARDZO WAZNE:
|
||||
|
||||
* lem nie obsluguje dlugich wyrazow [TO]
|
||||
* gue SIE NIE KOMPILUJE !!!
|
||||
* poprawic cora [PK]
|
||||
|
||||
WAZNE:
|
||||
* zamienic kota na lepszego (Kubis) [TO]
|
||||
|
14
app/dist/common/utt_make_config.pl
vendored
14
app/dist/common/utt_make_config.pl
vendored
@ -14,6 +14,7 @@ prepareUttUsrHome($usr_home);
|
||||
conf_utt(catfile($usr_home, 'utt.conf'), $sys_home);
|
||||
|
||||
conf_cor(catfile($usr_home, 'cor.conf'), $sys_home);
|
||||
conf_kor(catfile($usr_home, 'kor.conf'), $sys_home);
|
||||
conf_compiledic(catfile($usr_home, 'compiledic.conf'), $sys_home);
|
||||
conf_grp(catfile($usr_home, 'grp.conf'), $sys_home);
|
||||
conf_gue(catfile($usr_home, 'gue.conf'), $sys_home);
|
||||
@ -98,6 +99,19 @@ sub conf_cor() {
|
||||
close FILE;
|
||||
}
|
||||
|
||||
# Writes the per-user configuration file for the `kor` corrector.
#
# Arguments:
#   $kor_file - path of the configuration file to create (truncated if it exists)
#   $utthome  - UTT installation prefix; the dictionary directory and the
#               weights file are resolved relative to it
#
# Returns nothing. If the file cannot be opened or closed a warning is issued
# and the remaining configuration files are still generated by the caller.
#
# The empty "()" prototype was dropped: the sub takes two arguments, and the
# prototype would reject every call placed after this definition.
sub conf_kor {
    my ($kor_file, $utthome) = @_;

    # Three-argument open with a lexical handle: the file name can never be
    # parsed as a mode, and a failure is reported instead of silently ignored.
    my $fh;
    unless (open($fh, '>', $kor_file)) {
        warn "Cannot open $kor_file for writing: $!\n";
        return;
    }

    print {$fh} makeConfigHeader();
    print {$fh} "dictionary-home=", abs_path("$utthome/share/utt"), "\n";
    print {$fh} "weights=", abs_path("$utthome/lib/utt/weights.cor"), "\n";
    print {$fh} "threshold=1.0\n";

    # Buffered write errors only surface when the handle is closed.
    close($fh)
        or warn "Cannot close $kor_file: $!\n";

    return;
}
|
||||
|
||||
sub conf_grp() {
|
||||
my $grp_file = shift;
|
||||
my $utthome = shift;
|
||||
|
1
app/dist/struktura.txt
vendored
1
app/dist/struktura.txt
vendored
@ -43,6 +43,7 @@
|
||||
|
||||
/usr/local/lib/utt/ser.l.template
|
||||
/usr/local/lib/utt/terms.m4
|
||||
/usr/local/lib/utt/weights.cor
|
||||
|
||||
/usr/local/share/doc/utt/FAQ
|
||||
/usr/local/share/doc/utt/COPYRIGHT
|
||||
|
21
app/lib/weights.cor
Normal file
21
app/lib/weights.cor
Normal file
@ -0,0 +1,21 @@
|
||||
%stdcor 1
|
||||
%xchg 1
|
||||
¿ rz 0.5
|
||||
ch h 0.5
|
||||
u ó 0.5
|
||||
u o 0.75
|
||||
om ¹ 0.5
|
||||
om a 0.75
|
||||
en ê 0.5
|
||||
en e 0.75
|
||||
a ¹ 0.25
|
||||
c æ 0.25
|
||||
e ê 0.25
|
||||
l ³ 0.25
|
||||
n ñ 0.25
|
||||
o ó 0.25
|
||||
s ¶ 0.25
|
||||
z ¿ 0.25
|
||||
z ¼ 0.25
|
||||
x ¼ 0.30
|
||||
|
@ -3,6 +3,8 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "../lib/const.h"
|
||||
|
||||
|
47
app/src/kor/Makefile
Executable file
47
app/src/kor/Makefile
Executable file
@ -0,0 +1,47 @@
|
||||
# Build rules for the `kor` spelling corrector.
#
# PAR  - flags used when compiling and linking the final binary
# PAR2 - flags used for the individual object files
PAR=-Wno-deprecated -m32 -fpermissive
# -static
PAR2=-c -Wno-deprecated -m32 -fpermissive
LIB_PATH=../lib
COMMON_PATH=../common
CMDLINE_FILE='"../kor/cmdline.h"'

# These targets do not produce files of the same name.
.PHONY: copy clean clean.cmdline

# cmdline.c is compiled as part of the link command, so no separate cmdline.o
# prerequisite is needed (the implicit rule would build it without -m32).
kor: main.cc corr.o corlist.o $(LIB_PATH)/word.o \
	$(LIB_PATH)/auttools.o cmdline.c cmdline.h common_cor.o common.o
	g++ $(PAR) main.cc corlist.o corr.o common.o \
	$(LIB_PATH)/word.o $(LIB_PATH)/auttools.o cmdline.c common_cor.o \
	-o kor

corr.o: corr.cc corr.hh cmdline.h
	g++ $(PAR2) corr.cc

corlist.o: corlist.cc corlist.h cmdline.h
	g++ $(PAR2) corlist.cc

# The shared option-handling code is compiled against kor's own cmdline.h.
common.o: cmdline.h $(COMMON_PATH)/cmdline_common.ggo $(COMMON_PATH)/common.cc \
	$(COMMON_PATH)/common.h
	g++ $(PAR2) -D _CMDLINE_FILE=$(CMDLINE_FILE) $(COMMON_PATH)/common.cc

common_cor.o: cmdline.h common_cor.cc common_cor.h
	g++ $(PAR2) common_cor.cc

cmdline.c cmdline.h: cmdline.ggo
	gengetopt -i cmdline.ggo --conf-parser

# kor-specific options followed by the options shared by all components.
cmdline.ggo: cmdline_cor.ggo $(COMMON_PATH)/cmdline_common.ggo
	cat cmdline_cor.ggo $(COMMON_PATH)/cmdline_common.ggo > cmdline.ggo

# Installs the binary only when the distribution Makefile exported UTT_BIN_DIR.
copy:
ifdef UTT_BIN_DIR
	cp kor ${UTT_BIN_DIR}
endif

# rm -f stays silent and succeeds when there is nothing to remove.
clean: clean.cmdline
	rm -f *.o
	rm -f kor

clean.cmdline:
	rm -f cmdline.*
|
||||
|
10
app/src/kor/cmdline_cor.ggo
Executable file
10
app/src/kor/cmdline_cor.ggo
Executable file
@ -0,0 +1,10 @@
|
||||
# Options specific to kor. The Makefile concatenates this file with
# ../common/cmdline_common.ggo before running gengetopt.
package "kor"
version "0.1"

option "dictionary-home" - "Dictionary home dir." string typestr="FILENAME" no hidden
option "dictionary" d "Dictionary" string typestr="FILENAME" default="cor.bin" no
option "distance" n "Maximal edit distance." int default="1" no
option "replace" r "Replace original form with corrected form, place original form in the cor field. This option has no effect in single mode" flag off
#option "single" - "Place all alternatives in the same line" flag off
# The default matches the name of the weights file shipped in app/lib and
# referenced by the generated kor.conf (weights.cor).
option "weights" w "File with translation rules." string typestr="FILENAME" default="weights.cor" no
option "threshold" t "Edit distance threshold" float default="1" no
|
27
app/src/kor/common_cor.cc
Executable file
27
app/src/kor/common_cor.cc
Executable file
@ -0,0 +1,27 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "common_cor.h"
|
||||
|
||||
#define MAX_PATH_LENGTH 255
|
||||
|
||||
char dictionary[MAX_PATH_LENGTH];
|
||||
char file_weights[MAX_PATH_LENGTH];
|
||||
float threshold;
|
||||
|
||||
/*
 * Copies the kor-specific options from the parsed command line / config
 * files into the globals `dictionary`, `file_weights` and `threshold`.
 *
 * Dictionary path resolution, in order of precedence:
 *   1. an explicitly given --dictionary,
 *   2. <dictionary-home>/<language>/cor.bin when both are given,
 *   3. the declared default of the --dictionary option, if any.
 */
void process_cor_options(gengetopt_args_info* args)
{
  if(args->dictionary_given)
    {
      expand_path(args->dictionary_arg,dictionary);
    }
  else if (args->dictionary_home_given && args->language_given)
    {
      char buf[MAX_PATH_LENGTH];
      expand_path(args->dictionary_home_arg, buf);
      // Bounded formatting: a long home directory or language name must not
      // overrun the fixed-size global buffer.
      int written = snprintf(dictionary, MAX_PATH_LENGTH, "%s/%s/cor.bin", buf, args->language_arg);
      if (written < 0 || written >= MAX_PATH_LENGTH)
        fprintf(stderr, "Dictionary path is too long and was truncated: %s\n", dictionary);
    }
  else if (args->dictionary_arg)
    {
      // Neither source was given: fall back to the option's default value
      // instead of leaving the path empty.
      expand_path(args->dictionary_arg,dictionary);
    }

  expand_path(args->weights_arg, file_weights);

  threshold = args->threshold_arg;
}
|
26
app/src/kor/common_cor.h
Executable file
26
app/src/kor/common_cor.h
Executable file
@ -0,0 +1,26 @@
|
||||
#ifndef __COMMON_COR_H
#define __COMMON_COR_H

// CONSTANTS
#ifndef _CMDLINE_FILE
#define _CMDLINE_FILE "../kor/cmdline.h"
#endif

// Longest string accepted on either side of a rule in the weights file.
#define MAX_LEN 2
// Fixed-point scale: weights read as floats are multiplied by PREC and
// stored as integers of type Weight.
#define PREC 1000
// A real type alias instead of a macro, so that the name obeys scoping rules
// and cannot rewrite unrelated identifiers called "Weight".
typedef int Weight;

// INCLUDES
#include "../common/common.h"
#include _CMDLINE_FILE


// GENGETOPT
extern int change_count;
extern void process_cor_options(gengetopt_args_info* args);
extern char dictionary[];
extern char file_weights[];
extern float threshold;

#endif
|
||||
|
70
app/src/kor/corlist.cc
Executable file
70
app/src/kor/corlist.cc
Executable file
@ -0,0 +1,70 @@
|
||||
#include <stdio.h>
|
||||
#include <alloc.h>
|
||||
#include "corlist.h"
|
||||
|
||||
#define min(x,y) ((x<y)?(x):(y))
|
||||
|
||||
|
||||
/*
 * Returns the cheapest cost of reaching cell (i,j) by applying one of the
 * loaded replacement rules so that it ends at X[i] and Y[j].
 *
 * Every rule is tried in both directions (a in X / b in Y, and b in X / a in
 * Y). The cost of a matching rule is its weight plus the H2 cell located just
 * before the replaced substrings. When no rule matches, the initial sentinel
 * 9999*PREC is returned.
 */
Weight CorList::GetValue(char X[100], char Y[100], Weight (*H2)[100], int i, int j)
{
  Weight best = 9999*PREC; // acts as +infinity

  for (int k=0; k<total; ++k)
    {
      const CorWeight& rule = List[k];
      const int la = rule.la;
      const int lb = rule.lb;

      // Direction 1: rule.a ends at X[i], rule.b ends at Y[j].
      if (la<=i+1 && lb<=j+1
          && strncmp(rule.a,X+i+1-la,la)==0
          && strncmp(rule.b,Y+j+1-lb,lb)==0)
        {
          const Weight candidate = H2[i-la][j-lb]+rule.w;
          if (candidate < best) best = candidate;
        }

      // Direction 2: rule.b ends at X[i], rule.a ends at Y[j].
      if (la<=j+1 && lb<=i+1
          && strncmp(rule.b,X+i+1-lb,lb)==0
          && strncmp(rule.a,Y+j+1-la,la)==0)
        {
          const Weight candidate = H2[i-lb][j-la]+rule.w;
          if (candidate < best) best = candidate;
        }
    }

  return best;
}
|
||||
|
||||
|
||||
int CorList::loadCWL(char *Name)
|
||||
{
|
||||
FILE *f = fopen(Name,"r");
|
||||
int len=MAX_LEN*2+100;
|
||||
char a[100],b[100], buf[len+1];
|
||||
float wtmp;
|
||||
CorWeight w;
|
||||
|
||||
cor_stdcor = 1 * PREC;
|
||||
cor_xchg = 1 * PREC;
|
||||
|
||||
List = (CorWeight*)malloc(sizeof(CorWeight)); // 100 BO NIE DZIALA REALLOC
|
||||
total=0;
|
||||
|
||||
if (!f) { fprintf(stderr,"\nCan't open correction weight list file!\n"); return -1; }
|
||||
while (!feof(f) && fgets(buf,len,f))
|
||||
{
|
||||
if (buf[0]=='%')
|
||||
{
|
||||
sscanf(buf+1,"%s %f",&a,&wtmp);
|
||||
int ok=0;
|
||||
if (strcmp(a,"stdcor")==0) { ok=1; cor_stdcor=Weight(wtmp*PREC); /*printf("Standard letter correction set to: %1.2f\n",wtmp);*/ }
|
||||
if (strcmp(a,"xchg")==0) { ok=1; cor_xchg=Weight(wtmp*PREC); /*printf("Inverted letters correction set to: %1.2f\n",wtmp);*/ }
|
||||
if (!ok) { fprintf(stderr,"Error in file %s: Unknown keyword: '%s'.\n",Name,a); return -1; }
|
||||
}
|
||||
else
|
||||
{
|
||||
sscanf(buf,"%s %s %f",&a,&b,&wtmp);
|
||||
w.w=(Weight)(wtmp*PREC);
|
||||
w.la=strlen(a); w.lb=strlen(b);
|
||||
if (w.la>MAX_LEN) { printf("ERROR in file %s: the string '%s' exceeds maximum length of %d characters.\n",Name,a,MAX_LEN); fclose(f); return -1; }
|
||||
if (w.lb>MAX_LEN) { printf("ERROR in file %s: the string '%s' exceeds maximum length of %d characters.\n",Name,b,MAX_LEN); fclose(f); return -1; }
|
||||
strcpy(w.a,a), strcpy(w.b,b);
|
||||
total++;
|
||||
List = (CorWeight*)realloc(List,total*sizeof(CorWeight));
|
||||
List[total-1]=w;
|
||||
// printf("%s\t<->\t%s\t%1.2f\n",w.a,w.b,((float)w.w/PREC));
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
// printf("Total: %d\n\n",total);
|
||||
return(total);
|
||||
}
|
20
app/src/kor/corlist.h
Executable file
20
app/src/kor/corlist.h
Executable file
@ -0,0 +1,20 @@
|
||||
#ifndef _CORLIST_H
|
||||
#define _CORLIST_H
|
||||
|
||||
//#include <stdio.h>
|
||||
#include "common_cor.h"
|
||||
|
||||
typedef struct { char a[MAX_LEN+1],b[MAX_LEN+1]; Weight w; short la,lb; } CorWeight;
|
||||
|
||||
class CorList
|
||||
{
|
||||
private:
|
||||
CorWeight *List;
|
||||
int total;
|
||||
public:
|
||||
Weight cor_stdcor, cor_xchg;
|
||||
int loadCWL(char *Name);
|
||||
Weight GetValue(char X[100], char Y[100], Weight (*H2)[100], int i, int j);
|
||||
};
|
||||
|
||||
#endif
|
163
app/src/kor/corr.cc
Executable file
163
app/src/kor/corr.cc
Executable file
@ -0,0 +1,163 @@
|
||||
//---------------------------------------------------------------------------
|
||||
#include "common_cor.h"
|
||||
#include "corr.hh"
|
||||
|
||||
#define MAXPATH 256
|
||||
|
||||
#define min(x,y) ((x<y)?(x):(y))
|
||||
#define max(x,y) ((x>y)?(x):(y))
|
||||
|
||||
|
||||
/*
 * Computes the weighted edit distance for cell (i,j): the cost of turning
 * X[0..i] into Y[0..j], derived from the already computed neighbouring cells
 * of H2.
 *
 * The result is the minimum of
 *   - the diagonal move: free when X[i]==Y[j], CL.cor_xchg on top of
 *     H2[i-2][j-2] when the last two letters are swapped, otherwise
 *     CL.cor_stdcor on top of H2[i-1][j-1];
 *   - CL.cor_stdcor on top of the cheaper of H2[i][j-1] and H2[i-1][j];
 *   - the best matching replacement rule (CL.GetValue).
 */
Weight Corr::ed(int i,int j)
{
  // Border cells. The original author noted being unsure about this choice:
  // values outside the table are multiples of the standard letter-change weight.
  if(i==-1)
    return (j+1)*CL.cor_stdcor;
  if(j==-1)
    return (i+1)*CL.cor_stdcor;
  if(i==-2 || j==-2)
    return (n+1)*CL.cor_stdcor;

  // Evaluated once: the min() macro would otherwise re-evaluate the whole
  // rule scan every time it expands its argument.
  const Weight rule = CL.GetValue(X,Y,H2,i,j);
  const Weight gap = CL.cor_stdcor+min(H2[i][j-1],H2[i-1][j]);

  Weight diagonal;
  if(X[i]==Y[j])
    diagonal = H2[i-1][j-1];
  // A transposition needs a previous letter in both strings; without the
  // bounds test X[-1] / Y[-1] would be read for the first row or column.
  else if(i>0 && j>0 && X[i-1]==Y[j] && X[i]==Y[j-1])
    diagonal = CL.cor_xchg+H2[i-2][j-2];
  else
    diagonal = CL.cor_stdcor+H2[i-1][j-1];

  const Weight cheapest = min(gap,rule);
  return min(diagonal,cheapest);
}
|
||||
|
||||
int Corr::load2(char *Name) // moje
|
||||
{
|
||||
return CL.loadCWL(Name);
|
||||
}
|
||||
|
||||
/*
 * Cut-off edit distance for column j: the smallest value found in H2[k][j]
 * for rows k between max(0,j-t) and min(m,j+t); (j+t)*PREC when no cell in
 * that range is smaller.
 *
 * NOTE(review): t is used here as a number of rows, while main() assigns it
 * a PREC-scaled weight -- confirm which unit is intended.
 */
Weight Corr::cuted(int j)
{
  const int first = (0 > j-t) ? 0 : j-t;
  const int last = (m < j+t) ? m : j+t;

  Weight lowest = (j+t)*PREC;
  int row = first;
  while(row <= last)
    {
      const Weight cell = H2[row][j];
      if(cell < lowest)
        lowest = cell;
      ++row;
    }
  return lowest;
}
|
||||
|
||||
/*
|
||||
void Corr::recomputeH(int j)
|
||||
{
|
||||
for(int i=0;i<=m;i++)
|
||||
H[(i)+2][(j)+2]=ed(i,j);
|
||||
}
|
||||
*/
|
||||
|
||||
void Corr::recomputeH(int j)
|
||||
{
|
||||
int lo=max(0,j-t-2);
|
||||
int hi=min(m,j+t+2);
|
||||
for(int i=lo;i<=hi;++i)
|
||||
H2[i][j]=ed(i,j);//H[(i)+2][(j)+2]=ed(i,j);
|
||||
}
|
||||
|
||||
|
||||
int Corr::correct(const char* w, Words& tab)
|
||||
{
|
||||
long int path[MAXPATH]={0};
|
||||
int i; // row index (X)
|
||||
int j; // column index (Y)
|
||||
long state=0;
|
||||
|
||||
strcpy(X,w);
|
||||
m=strlen(X)-1;
|
||||
n=m+t;
|
||||
|
||||
for(i=(-2);i<=m;i++)
|
||||
H[(i)+2][(-2)+2]=n*PREC; // moje *PREC
|
||||
for(i=(-1);i<=m;i++)
|
||||
H[(i)+2][(-1)+2]=((i)+1)*PREC; // moje*
|
||||
for(j=(-2);j<=n;j++)
|
||||
H[(-2)+2][(j)+2]=n*1000; // moje*
|
||||
for(j=(-1);j<=n;j++)
|
||||
H[(-1)+2][(j)+2]=((j)+1)*PREC; // moje*
|
||||
|
||||
for(j=0; j<=n; ++j)
|
||||
for(i=0; i<=m; ++i)
|
||||
H[i+2][j+2]=(t+1)*PREC;
|
||||
|
||||
int more=1;
|
||||
bool cont=false;
|
||||
|
||||
strcpy(Y,"");
|
||||
j=0;
|
||||
state=0;
|
||||
int count=0;
|
||||
while(more)
|
||||
{
|
||||
if(!empty(state))
|
||||
{
|
||||
Y[j]=input(state);
|
||||
recomputeH(j);
|
||||
if(cuted(j)<=t)
|
||||
{
|
||||
Weight edd; // moje
|
||||
if(final(next(state)) && (edd=H[(m)+2][(j)+2])<=t)
|
||||
{
|
||||
char* out=new char[j+2];
|
||||
strncpy(out,Y,j+1);
|
||||
out[j+1]='\0';
|
||||
// if(cont) putchar(' ');
|
||||
cont=true;
|
||||
// printf("%1.2f %s\n", (float)edd/PREC,out); // moje
|
||||
// cout << out << "(" << edd << ")" << endl;
|
||||
tab.add(out,(float)edd/PREC);
|
||||
count++;
|
||||
}
|
||||
path[j++]=state;
|
||||
state=next(state);
|
||||
continue;
|
||||
}
|
||||
else
|
||||
if(continued(state))
|
||||
{
|
||||
state++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
//backtracking
|
||||
do
|
||||
if(j>0)
|
||||
j--;
|
||||
else
|
||||
more=0;
|
||||
while(more && !continued(path[j]));
|
||||
state=path[j]+1;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
39
app/src/kor/corr.hh
Executable file
39
app/src/kor/corr.hh
Executable file
@ -0,0 +1,39 @@
|
||||
//---------------------------------------------------------------------------
|
||||
#ifndef _corr_hh
|
||||
#define _corr_hh
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
#include "../lib/tfti.h"
|
||||
#include "../lib/word.h"
|
||||
#include "corlist.h"
|
||||
#include "../common/common.h"
|
||||
|
||||
// Spelling corrector: a dictionary automaton (TFTiv) combined with a weighted
// edit-distance table.
class Corr : public TFTiv<char,char>
{
private:
  Weight H[100][100];   // distance table, including two border rows and columns
  char X[100];          // misspelled string
  char Y[100];          // (possibly partial) candidate string
  int m;                // index of the last character of X
  int n;                // maximal index used in Y

  Weight ed(int,int);
  Weight cuted(int);
  void recomputeH(int);


public:
  Weight (*H2)[100];    // view of H shifted by (2,2), so H2[i][j] is H[i+2][j+2]
  int t;                // threshold
  CorList CL;           // replacement rules and special weights

  // m, n and t get defined values so that nothing reads indeterminate
  // members before correct() is called or t is configured.
  Corr() : m(-1), n(0), H2((Weight(*)[100])&H[2][2]), t(0) { X[0]='\0'; Y[0]='\0'; }
  Corr(const char* a) : TFTiv<char,char>(a), m(-1), n(0), H2((Weight(*)[100])&H[2][2]), t(0) { X[0]='\0'; Y[0]='\0'; }

  // Adds every dictionary form within distance t of w to tab; returns their number.
  int correct(const char* w, Words& tab);

  // Loads the weights file Name into CL; returns the rule count or -1.
  int load2(char *Name);
};
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
#endif
|
155
app/src/kor/main.cc
Executable file
155
app/src/kor/main.cc
Executable file
@ -0,0 +1,155 @@
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include "../lib/iotools.h"
|
||||
#include "common_cor.h"
|
||||
#include "corr.hh"
|
||||
#include <locale.h>
|
||||
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
// setlocale(LC_CTYPE,"");
|
||||
// setlocale(LC_COLLATE,"");
|
||||
|
||||
gengetopt_args_info args;
|
||||
|
||||
if(cmdline_parser(argc, argv, &args) != 0)
|
||||
exit(1);
|
||||
|
||||
process_config_files(&args,argv[0]);
|
||||
process_common_options(&args,argv[0]);
|
||||
process_cor_options(&args);
|
||||
|
||||
Corr cor;
|
||||
|
||||
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
// strcpy(dictionary,"cor.bin");
|
||||
|
||||
cor.load(dictionary);
|
||||
cor.t=args.distance_arg;
|
||||
|
||||
//>>>>>>>>>>>>>>>
|
||||
cor.CL.cor_stdcor=1*PREC;
|
||||
cor.CL.cor_xchg=1*PREC;
|
||||
if (cor.load2(file_weights)==-1) return -1; // moje
|
||||
cor.t=1*PREC; // ODLEGLOSC EDYCYJNA
|
||||
//<<<<<<<<<<<<<<
|
||||
|
||||
char line[MAX_LINE+1];
|
||||
long line_count = 0;
|
||||
|
||||
Segment seg;
|
||||
Words tab;
|
||||
char form1[MAX_LINE];
|
||||
char* form;
|
||||
int formcasing;
|
||||
char corfield[MAX_LINE]="";
|
||||
|
||||
while (fgets(line, MAX_LINE, inputf))
|
||||
{
|
||||
++line_count;
|
||||
char outline[128];
|
||||
|
||||
if (!process_seg(line, args))
|
||||
fputs(line, outputf);
|
||||
else
|
||||
{
|
||||
char form[MAX_FORM];
|
||||
|
||||
tab.clear();
|
||||
getfield(line,input_field_prefix,form);
|
||||
if (form==NULL) continue;
|
||||
|
||||
formcasing=3;
|
||||
cor.correct(form, tab);
|
||||
|
||||
if( tab.count() == 0 )
|
||||
{
|
||||
formcasing=casing(form);
|
||||
if( formcasing == 1 || formcasing == 2)
|
||||
tolowers(form, form1), cor.correct(form1, tab);
|
||||
}
|
||||
|
||||
if ( tab.count() == 0)
|
||||
fputs(line, failedf);
|
||||
else
|
||||
{
|
||||
tab.sort();
|
||||
|
||||
if(args.replace_flag)
|
||||
{
|
||||
char corfield[128];
|
||||
strcpy(corfield, input_field_prefix);
|
||||
strcat(corfield, form);
|
||||
seg.aux[seg.auxn]=corfield;
|
||||
++seg.auxn;
|
||||
for(int i=0; i<tab.count(); ++i)
|
||||
{
|
||||
seg.form=tab[i].form();
|
||||
restorecasing(seg.form,seg.form,formcasing);
|
||||
seg.print(outline);
|
||||
fputs(outline, outputf);
|
||||
}
|
||||
--seg.auxn;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(one_line)
|
||||
{
|
||||
char* p=corfield;
|
||||
for(int i=tab.count()-1; i >= 0; --i)
|
||||
{
|
||||
if(tab[i].w_suf() > threshold) continue;
|
||||
restorecasing(tab[i].form(),tab[i].form(),formcasing);
|
||||
p += sprintf(p," %s%s,%1.2f",output_field_prefix,tab[i].form(),tab[i].w_suf());
|
||||
}
|
||||
sprintf(p,"\n");
|
||||
|
||||
strcpy(outline,line);
|
||||
outline[strlen(outline)-1]='\0';
|
||||
strcat(outline,corfield);
|
||||
fputs(outline, outputf);
|
||||
}
|
||||
else if(one_field)
|
||||
{
|
||||
char* p=corfield;
|
||||
p += sprintf(p," %s",output_field_prefix);
|
||||
for(int i=tab.count()-1; i >= 0; --i)
|
||||
{
|
||||
if(tab[i].w_suf() > threshold) continue;
|
||||
restorecasing(tab[i].form(),tab[i].form(),formcasing);
|
||||
p += sprintf(p,(i==0)?"%s,%1.2f":";%s,%1.2f",tab[i].form(),tab[i].w_suf());
|
||||
}
|
||||
|
||||
sprintf(p,"\n");
|
||||
|
||||
strcpy(outline,line);
|
||||
outline[strlen(outline)-1]='\0';
|
||||
strcat(outline,corfield);
|
||||
fputs(outline, outputf);
|
||||
}
|
||||
else
|
||||
{
|
||||
for(int i=tab.count()-1; i >= 0; --i)
|
||||
{
|
||||
if(tab[i].w_suf() > threshold) continue;
|
||||
restorecasing(tab[i].form(),tab[i].form(),formcasing);
|
||||
sprintf(corfield," %s%s,%1.2f\n",output_field_prefix,tab[i].form(),tab[i].w_suf());
|
||||
strcpy(outline,line);
|
||||
outline[strlen(outline)-1]='\0';
|
||||
strcat(outline,corfield);
|
||||
fputs(outline, outputf);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(args.interactive_flag)
|
||||
{
|
||||
fflush(outputf);
|
||||
fflush(failedf);
|
||||
}
|
||||
}
|
||||
cmdline_parser_free(&args);
|
||||
}
|
@ -22,7 +22,7 @@ void Word::autodescr(const char* fo, const char* de)
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
// Comparison predicate used by Words::sort(): true when a's w_suf() value is
// strictly greater than b's.
bool Word::cmp_w(Word a, Word b) {
  const bool a_is_heavier = b.w_suf() < a.w_suf();
  return a_is_heavier;
}
|
||||
//---------------------------------------------------------------------------
|
||||
@ -107,7 +107,7 @@ int Words::next() {
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
void Words::sort() {
|
||||
void Words::sort() {
|
||||
std::sort(tab.begin(), tab.end(), Word::cmp_w);
|
||||
}
|
||||
|
||||
@ -123,9 +123,15 @@ int Words::add(const char* fo)
|
||||
if (cnt>=tab.capacity()-1)
|
||||
tab.resize(tab.size()*2);
|
||||
|
||||
tab[cnt].form(fo);
|
||||
tab[cnt].w_suf(0.0);
|
||||
|
||||
Word o;
|
||||
o.form(fo);
|
||||
o.w_suf(0.0);
|
||||
tab.push_back(o);
|
||||
// tab[cnt].form(fo);
|
||||
// tab[cnt].w_suf(0.0);
|
||||
|
||||
|
||||
// if(cnt<MAX-1) {
|
||||
/* tab.push_back(new Word());
|
||||
tab[cnt]->form(fo);
|
||||
@ -136,6 +142,30 @@ int Words::add(const char* fo)
|
||||
//return -1;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
//TYMCZASOWO TAK(DLA CORA)
|
||||
int Words::add(const char* fo, float weight)
|
||||
{
|
||||
int i = find(fo);
|
||||
if(i!=-1) {
|
||||
return i;
|
||||
}
|
||||
|
||||
if (cnt>=tab.capacity()-1)
|
||||
tab.resize(tab.size()*2);
|
||||
|
||||
Word o;
|
||||
o.form(fo);
|
||||
o.w_suf(weight);
|
||||
tab.push_back(o);
|
||||
// tab[cnt].form(fo);
|
||||
// tab[cnt].w_suf(weight);
|
||||
|
||||
return cnt++;
|
||||
// }
|
||||
//return -1;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
int Words::add(const char* fo, const char* des)
|
||||
|
@ -12,6 +12,8 @@
|
||||
|
||||
using namespace std;
|
||||
|
||||
|
||||
|
||||
class Word
|
||||
{
|
||||
public:
|
||||
@ -36,7 +38,7 @@ private:
|
||||
float _w_suf;
|
||||
// float _w_pref;
|
||||
public:
|
||||
static int cmp_w(Word a, Word b);
|
||||
static bool cmp_w(Word a, Word b);
|
||||
|
||||
Word() : _len_suf(-1) { *f='\0'; returned=0; };
|
||||
Word(const char* fo, const char* des) : _len_suf(-1) { autodescr(fo,des); _w_suf=1.0; returned=0; };
|
||||
@ -76,6 +78,7 @@ public:
|
||||
|
||||
};
|
||||
|
||||
|
||||
inline Word::Word(const Word& word)
|
||||
{ strcpy(f,word.f); strcpy(l,word.l); strcpy(d,word.d); _len_suf=word._len_suf; _w_suf=word._w_suf; returned = 0; }
|
||||
|
||||
@ -118,6 +121,7 @@ class Words
|
||||
int count() const { return cnt; }
|
||||
void clear() { cnt=0; tab.clear(); }
|
||||
int add(const char* fo);
|
||||
int add(const char* fo, float weight);
|
||||
int add(const char* fo, const char* des);
|
||||
|
||||
/* zwraca index nastepnego wyniku, podczas pierwszego wywolania
|
||||
|
@ -10,6 +10,8 @@ use locale;
|
||||
use Getopt::Long;
|
||||
use File::HomeDir;
|
||||
|
||||
my $max_form_length = 50;
|
||||
|
||||
my $interactive=0;
|
||||
my $help;
|
||||
|
||||
@ -20,7 +22,7 @@ my $userconfigfile=home()."/.utt/tok.conf";
|
||||
#read configuration files###########################
|
||||
my $file;
|
||||
foreach $file ($systemconfigfile, $userconfigfile){
|
||||
if(open(CONFIG, $configfile1)){
|
||||
if(open(CONFIG, $file)){
|
||||
while (<CONFIG>) {
|
||||
chomp;
|
||||
s/#.*//;
|
||||
@ -74,17 +76,32 @@ while(<>)
|
||||
|
||||
# | [^[:print:]] (?{seg("B",$&)})
|
||||
|
||||
# Returns the numerically smaller of its two arguments; when they compare
# equal the second one is returned.
sub min {
    my ($first, $second) = @_;
    return $first < $second ? $first : $second;
}
|
||||
|
||||
|
||||
sub seg
|
||||
{
|
||||
my ($tag,$match) = @_;
|
||||
my $len=length $match;
|
||||
printf "%04d %02d %s ", $offset, $len, $tag;
|
||||
my $length = length $match;
|
||||
my $idx = 0;
|
||||
while($idx < $length) {
|
||||
my $l = min $max_form_length, $length - $idx;
|
||||
my $m = substr $match, $idx, $l;
|
||||
|
||||
printf "%04d %02d %s ", $offset + $idx, $l, $tag;
|
||||
if($tag eq 'S')
|
||||
{
|
||||
for(my $i=0; $i<$len; ++$i)
|
||||
for(my $i=0; $i<$l; ++$i)
|
||||
{
|
||||
my $c = substr $match, $i, 1;
|
||||
my $c = substr $m, $i, 1;
|
||||
print '_' if $c eq ' ';
|
||||
print '\n' if $c eq "\n";
|
||||
print '\t' if $c eq "\t";
|
||||
@ -94,12 +111,15 @@ sub seg
|
||||
}
|
||||
elsif($tag eq 'B')
|
||||
{
|
||||
printf "\\x%02X", ord($match);
|
||||
printf "\\x%02X", ord($m);
|
||||
}
|
||||
else
|
||||
{
|
||||
print $match;
|
||||
print $m;
|
||||
}
|
||||
print "\n";
|
||||
$offset += $len;
|
||||
}
|
||||
$idx += $l;
|
||||
} # while($idx < $length)
|
||||
$offset += $length;
|
||||
} #sub seg
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user