concordia-server/mgiza-aligner/mgiza/mgizapp/src/vocab.h
/*
EGYPT Toolkit for Statistical Machine Translation
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA.
*/
#ifndef _vocab_h
#define _vocab_h 1
#include "defs.h"
#include "Vector.h"
#include <fstream>
#include <strstream>
#include <map>
#include <set>
#include <string>
#include <iostream>
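/* A single vocabulary entry: the surface form of a token and its
   accumulated (possibly fractional) frequency in the corpus. */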
class WordEntry
{
public:
  string word;
  double freq;
  WordEntry() : word(""), freq(0) {};
  WordEntry(string w, double f) : word(w), freq(f) {};
};
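/* A vocabulary list: maps word ids to WordEntry records stored in an
   external Vector<WordEntry>, maps words back to ids via s2i, and keeps
   running frequency totals for the tokens actually seen in the corpus. */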
class vcbList
{
private:
  Vector<WordEntry>& list;          // id -> word entry (storage owned by the caller)
  map<string,int> s2i;              // word -> id
  double total;                     // sum of all token frequencies
  WordIndex noUniqueTokens;         // number of unique tokens in the vocabulary
  WordIndex noUniqueTokensInCorpus; // number of unique tokens with nonzero corpus frequency
  const char* fname;                // vocabulary file name
public:
  vcbList(Vector<WordEntry>& vcb, const char* f=0)
    : list(vcb), total(0), noUniqueTokens(0), noUniqueTokensInCorpus(0), fname(f) {};
  void setName(const char* f) {
    fname = f;
  }
  // Copying shares the underlying entry vector; the in-corpus count is reset.
  vcbList(const vcbList& a)
    : list(a.list), total(a.total), noUniqueTokens(a.noUniqueTokens), noUniqueTokensInCorpus(0), fname(a.fname) {};
  // Compact the vocabulary against a set of word ids (declared here, defined out of line).
  void compact(const std::set<WordIndex>& evoc);
  inline WordIndex size() const {
    return list.size();
  }
  inline WordIndex uniqTokens() const {
    return noUniqueTokens;
  }
  inline WordIndex uniqTokensInCorpus() const {
    return noUniqueTokensInCorpus;
  }
  inline double totalVocab() const {
    return total;
  }
  inline Vector<WordEntry>& getVocabList() {
    return list;
  }
  inline const Vector<WordEntry>& getVocabList() const {
    return list;
  }
  // Read the vocabulary list (declared here, defined out of line).
  void readVocabList();
  // Add f to the frequency of word id; also tracks how many distinct ids
  // have been seen with nonzero frequency.
  void incFreq(WordIndex id, double f) {
    if (id < list.size()) {
      if (list[id].freq == 0)
        noUniqueTokensInCorpus++;
      list[id].freq += f;
      total += f;
    }
  }
  // Reset all per-word frequencies and the running totals.
  void clearAllFreq() {
    for (WordIndex id = 0; id < list.size(); id++)
      list[id].freq = 0;
    total = 0;
    noUniqueTokensInCorpus = 0;
  }
  // True if the word has an entry in the string-to-id map.
  bool has_word(const string& x) const {
    map<string,int>::const_iterator i = s2i.find(x);
    return i != s2i.end();
  }
  // Look up the id of a word; unknown words map to id 0 with a warning.
  int operator()(const string& x) const {
    map<string,int>::const_iterator i = s2i.find(x);
    if (i != s2i.end())
      return i->second;
    else {
      cerr << "ERROR: no word index for '" << x << "'\n";
      return 0;
    }
  }
  // Look up the word for an id; out-of-range ids yield an empty string.
  const string operator()(WordIndex id) const { // Yaser - 2000-12-13
    if (id < list.size())
      return list[id].word;
    else
      return "";   // returning 0 here would construct a string from a null pointer
  }
  const string operator[](WordIndex id) const { // Yaser - 2000-12-13
    if (id < list.size())
      return list[id].word;
    else
      return "";
  }
  // Write "id word freq" for every entry with a nonempty word and a
  // nonzero frequency; index 0 is skipped.
  void printVocabList(ostream& of) {
    for (WordIndex i = 1; i < list.size(); i++) {
      if (list[i].word != "" && list[i].freq > 0)
        of << i << ' ' << list[i].word << ' ' << list[i].freq << '\n';
    }
  }
};
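/*
  A minimal usage sketch (not part of the original header). The file name
  "svcb.vcb" and the word "house" are illustrative assumptions, as is the
  assumption that readVocabList() reads from the file name given to the
  constructor. Callers own the Vector<WordEntry> storage and hand it to
  vcbList, then query ids and words through the operators above:

    Vector<WordEntry> entries;
    vcbList vocab(entries, "svcb.vcb");
    vocab.readVocabList();                 // fills entries and the word->id map
    if (vocab.has_word("house"))
      cout << "id of 'house': " << vocab("house") << '\n';
    cout << "word with id 3: " << vocab[3] << '\n';
*/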
#endif