354 lines
9.8 KiB
C++
354 lines
9.8 KiB
C++
/*
|
|
|
|
EGYPT Toolkit for Statistical Machine Translation
|
|
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
|
|
|
|
This program is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU General Public License
|
|
as published by the Free Software Foundation; either version 2
|
|
of the License, or (at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
|
|
USA.
|
|
|
|
*/
|
|
/* --------------------------------------------------------------------------*
|
|
* *
|
|
* Module : TTables *
|
|
* *
|
|
* Prototypes File: TTables.h *
|
|
* *
|
|
* Objective: Defines clases and methods for handling I/O for Probability & *
|
|
* Count tables and also alignment tables *
|
|
*****************************************************************************/
|
|
|
|
#ifndef _ttables_h
|
|
#define _ttables_h 1
|
|
|
|
|
|
#include "defs.h"
|
|
#include "vocab.h"
|
|
|
|
#include <cassert>
|
|
|
|
#include <iostream>
|
|
#include <algorithm>
|
|
#include <functional>
|
|
#include <map>
|
|
#include <set>
|
|
#include "Vector.h"
|
|
#include <utility>
|
|
#include "syncObj.h"
|
|
|
|
#if __GNUC__>2
|
|
#include <ext/hash_map>
|
|
using __gnu_cxx::hash_map;
|
|
#else
|
|
#include <hash_map>
|
|
#endif
|
|
|
|
#include <fstream>
|
|
|
|
#include "Globals.h"
|
|
|
|
|
|
/* The tables defined in the following classes are defined as hash tables. For
|
|
example. the t-table is a hash function of a word pair; an alignment is
|
|
a hash function of a vector of integer numbers (sentence positions) and so
|
|
on */
|
|
|
|
|
|
/*----------- Defnition of Hash Function for class tmodel ------- -----------*/
|
|
|
|
typedef pair<WordIndex, WordIndex> wordPairIds;
|
|
|
|
|
|
|
|
class hashpair : public unary_function< pair<WordIndex, WordIndex>, size_t >
|
|
{
|
|
public:
|
|
size_t operator() (const pair<WordIndex, WordIndex>& key) const {
|
|
return (size_t) MAX_W*key.first + key.second; /* hash function and it
|
|
is guarnteed to have
|
|
unique id for each
|
|
unique pair */
|
|
}
|
|
#ifdef WIN32
|
|
inline bool operator() (const pair<WordIndex, WordIndex>& key, const pair<WordIndex, WordIndex>& key2) {
|
|
return key.first==key2.first && key.second==key2.second;
|
|
}
|
|
enum {
|
|
// parameters for hash table
|
|
bucket_size = 1 // 0 < bucket_size
|
|
};
|
|
#endif
|
|
};
|
|
|
|
|
|
|
|
/* ------------------ Class Prototype Definitions ---------------------------*
|
|
Class Name: tmodel
|
|
Objective: This defines the underlying data structur for t Tables and t
|
|
Count Tables. They are defined as a hash table. Each entry in the hash table
|
|
is the probability (P(fj/ei) ) or count collected for ( C(fj/ei)). The
|
|
probability and the count are represented as log integer probability as
|
|
defined by the class LogProb .
|
|
|
|
This class is used to represents t Tables (probabiliity) and n (fertility
|
|
Tables and also their corresponding count tables .
|
|
|
|
*---------------------------------------------------------------------------*/
|
|
|
|
//typedef float COUNT ;
|
|
//typedef LogProb PROB ;
|
|
template <class COUNT, class PROB>
|
|
class LpPair
|
|
{
|
|
public:
|
|
COUNT count ;
|
|
PROB prob ;
|
|
public: // constructor
|
|
LpPair():count(0), prob(0) {} ;
|
|
LpPair(COUNT c, PROB p):count(c), prob(p) {};
|
|
} ;
|
|
|
|
template<class T>
|
|
T*mbinary_search(T*x,T*y,unsigned int val)
|
|
{
|
|
if( y-x==0 )
|
|
return 0;
|
|
if( x->first==val)
|
|
return x;
|
|
if( y-x<2 )
|
|
return 0;
|
|
T*mid=x+(y-x)/2;
|
|
if( val < mid->first )
|
|
return mbinary_search(x,mid,val);
|
|
else
|
|
return mbinary_search(mid,y,val);
|
|
|
|
}
|
|
|
|
template<class T>
|
|
const T*mbinary_search(const T*x,const T*y,unsigned int val)
|
|
{
|
|
if( y-x==0 )
|
|
return 0;
|
|
if( x->first==val)
|
|
return x;
|
|
if( y-x<2 )
|
|
return 0;
|
|
const T*mid=x+(y-x)/2;
|
|
if( val < mid->first )
|
|
return mbinary_search(x,mid,val);
|
|
else
|
|
return mbinary_search(mid,y,val);
|
|
|
|
}
|
|
|
|
template <class COUNT, class PROB>
|
|
class tmodel
|
|
{
|
|
typedef LpPair<COUNT, PROB> CPPair;
|
|
public:
|
|
bool recordDiff;
|
|
|
|
public:
|
|
int noEnglishWords; // total number of unique source words
|
|
int noFrenchWords; // total number of unique target words
|
|
//vector<pair<unsigned int,CPPair> > fs;
|
|
//vector<unsigned int> es;
|
|
vector< vector<pair<unsigned int,CPPair> >* > lexmat;
|
|
vector< Mutex* > mutex;
|
|
|
|
void erase(WordIndex e, WordIndex f) {
|
|
CPPair *p=find(e,f);
|
|
if(p)
|
|
*p=CPPair(0,0);
|
|
};
|
|
|
|
CPPair*find(int e,int f) {
|
|
//pair<unsigned int,CPPair> *be=&(fs[0])+es[e];
|
|
//pair<unsigned int,CPPair> *en=&(fs[0])+es[e+1];
|
|
if(e>=lexmat.size()||lexmat[e]==NULL) {
|
|
return NULL;
|
|
}
|
|
pair<unsigned int,CPPair> *be=&(*lexmat[e])[0];
|
|
pair<unsigned int,CPPair> *en=&(*lexmat[e])[0]+(*lexmat[e]).size();
|
|
pair<unsigned int,CPPair> *x= mbinary_search(be,en,f);
|
|
if( x==0 ) {
|
|
//cerr << "A:DID NOT FIND ENTRY: " << e << " " << f << '\n';
|
|
//abort();
|
|
return 0;
|
|
}
|
|
return &(x->second);
|
|
}
|
|
|
|
const CPPair*find(int e,int f)const {
|
|
if(e>=lexmat.size()||lexmat[e]==NULL) {
|
|
return NULL;
|
|
}
|
|
const pair<unsigned int,CPPair> *be=&(*lexmat[e])[0];
|
|
const pair<unsigned int,CPPair> *en=&(*lexmat[e])[0]+(*lexmat[e]).size();
|
|
//const pair<unsigned int,CPPair> *be=&(fs[0])+es[e];
|
|
//const pair<unsigned int,CPPair> *en=&(fs[0])+es[e+1];
|
|
const pair<unsigned int,CPPair> *x= mbinary_search(be,en,f);
|
|
if( x==0 ) {
|
|
//cerr << "B:DID NOT FIND ENTRY: " << e << " " << f << '\n';
|
|
//abort();
|
|
return 0;
|
|
}
|
|
|
|
return &(x->second);
|
|
}
|
|
public:
|
|
void insert(WordIndex e, WordIndex f, COUNT cval=0.0, PROB pval = 0.0) {
|
|
CPPair* found = find(e,f);
|
|
if(found)
|
|
*found=CPPair(cval,pval);
|
|
}
|
|
|
|
CPPair*getPtr(int e,int f) {
|
|
return find(e,f);
|
|
}
|
|
|
|
tmodel() {};
|
|
tmodel(const string&fn) {
|
|
recordDiff = false;
|
|
int count=0,count2=0;
|
|
ifstream infile2(fn.c_str());
|
|
cerr << "Inputfile in " << fn << endl;
|
|
int e,f,olde=-1,oldf=-1;
|
|
pair<unsigned int,CPPair> cp;
|
|
vector< pair<unsigned int,CPPair> > cps;
|
|
while(infile2>>e>>f) {
|
|
cp.first=f;
|
|
assert(e>=olde);
|
|
assert(e>olde ||f>oldf);
|
|
if( e!=olde&&olde>=0 ) {
|
|
int oldsize=lexmat.size();
|
|
lexmat.resize(olde+1);
|
|
for(unsigned int i=oldsize; i<lexmat.size(); ++i)
|
|
lexmat[i]=0;
|
|
lexmat[olde]=new vector< pair<unsigned int,CPPair> > (cps);
|
|
cps.clear();
|
|
if( !((*lexmat[olde]).size()==(*lexmat[olde]).capacity()) )
|
|
cerr << "eRROR: waste of memory: " << (*lexmat[olde]).size() << " " << (*lexmat[olde]).capacity() << endl;
|
|
count2+=lexmat[olde]->capacity();
|
|
}
|
|
cps.push_back(cp);
|
|
olde=e;
|
|
oldf=f;
|
|
count++;
|
|
}
|
|
lexmat.resize(olde+1);
|
|
lexmat[olde]=new vector< pair<unsigned int,CPPair> > (cps);
|
|
count2+=lexmat[olde]->capacity();
|
|
cout << "There are " << count << " " << count2 << " entries in table" << '\n';
|
|
mutex.resize(lexmat.size());
|
|
for(int _i = 0; _i< lexmat.size(); _i++) {
|
|
mutex[_i] = new Mutex();
|
|
}
|
|
/* Create mutex */
|
|
}
|
|
|
|
~tmodel() {
|
|
for(int _i = 0; _i< lexmat.size(); _i++) {
|
|
delete mutex[_i];
|
|
}
|
|
|
|
}
|
|
|
|
|
|
/* tmodel(const string&fn)
|
|
{
|
|
size_t count=0;
|
|
{
|
|
ifstream infile1(fn.c_str());
|
|
if( !infile1 )
|
|
{
|
|
cerr << "ERROR: can't read coocurrence file " << fn << '\n';
|
|
abort();
|
|
}
|
|
int e,f;
|
|
while(infile1>>e>>f)
|
|
count++;
|
|
}
|
|
cout << "There are " << count << " entries in table" << '\n';
|
|
ifstream infile2(fn.c_str());
|
|
fs.resize(count);
|
|
int e,f,olde=-1,oldf=-1;
|
|
pair<unsigned int,CPPair> cp;
|
|
count=0;
|
|
while(infile2>>e>>f)
|
|
{
|
|
assert(e>=olde);
|
|
assert(e>olde ||f>oldf);
|
|
if( e!=olde )
|
|
{
|
|
es.resize(e+1);
|
|
for(unsigned int i=olde+1;int(i)<=e;++i)
|
|
es[i]=count;
|
|
}
|
|
cp.first=f;
|
|
assert(count<fs.size());
|
|
fs[count]=cp;
|
|
//fs.push_back(cp);
|
|
olde=e;
|
|
oldf=f;
|
|
count++;
|
|
}
|
|
assert(count==fs.size());
|
|
es.push_back(fs.size());
|
|
cout << fs.size() << " " << count << " coocurrences read" << '\n';
|
|
}*/
|
|
|
|
void incCount(WordIndex e, WordIndex f, COUNT inc) {
|
|
if( inc ) {
|
|
CPPair *p=find(e,f);
|
|
if( p ) {
|
|
mutex[e]->lock();
|
|
p->count += inc ;
|
|
mutex[e]->unlock();
|
|
}
|
|
}
|
|
}
|
|
|
|
PROB getProb(WordIndex e, WordIndex f) const {
|
|
const CPPair *p=find(e,f);
|
|
if( p )
|
|
return max(p->prob, PROB_SMOOTH);
|
|
else
|
|
return PROB_SMOOTH;
|
|
}
|
|
|
|
COUNT getCount(WordIndex e, WordIndex f) const {
|
|
const CPPair *p=find(e,f);
|
|
if( p )
|
|
return p->count;
|
|
else
|
|
return 0.0;
|
|
}
|
|
|
|
void printProbTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
|
|
void printCountTable(const char* filename, const Vector<WordEntry>&, const Vector<WordEntry>&,bool actual) const;
|
|
void printProbTableInverse(const char *filename,
|
|
const Vector<WordEntry>& evlist,
|
|
const Vector<WordEntry>& fvlist,
|
|
const double eTotal,
|
|
const double fTotal,
|
|
const bool actual = false ) const;
|
|
void normalizeTable(const vcbList&engl, const vcbList&french, int iter=2);
|
|
bool readProbTable(const char *filename);
|
|
bool readSubSampledProbTable(const char* filename, std::set<WordIndex> &e, std::set<WordIndex> &f);
|
|
};
|
|
|
|
#endif
|