/* EGYPT Toolkit for Statistical Machine Translation Written by Yaser Al-Onaizan,
   Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy,
   Franz Och, Noah Smith, and David Yarowsky. This program is free software; you
   can redistribute it and/or modify it under the terms of the GNU General Public
   License as published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version. This program is distributed in
   the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
   implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
   the GNU General Public License for more details. You should have received a
   copy of the GNU General Public License along with this program; if not, write
   to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
   02111-1307, USA. */
#ifndef _vocab_h
#define _vocab_h 1
#include "defs.h"
#include "Vector.h"
/* NOTE(review): the four directives below have no header name in this copy;
   the <...> targets appear to have been stripped -- restore from upstream. */
#include
#include
#include
#include

/* One vocabulary entry: a word together with its accumulated frequency. */
class WordEntry {
public:
  string word ;   /* surface form of the word */
  double freq ;   /* accumulated frequency (fractional increments are possible) */
  /* Default entry: frequency 0.  The literal "\0" starts with a NUL character,
     so -- assuming `string` is std::string -- `word` ends up empty. */
  WordEntry():word("\0"), freq(0) {};
  /* Entry for word `w` with an integer starting frequency `f`
     (stored in the double member `freq`). */
  WordEntry(string w, int f):word(w), freq(f) {};
};

/* View over an externally owned vocabulary vector that tracks frequency
   totals and a string-to-index map.

   The object holds a *reference* to the vector passed to its constructor and
   does not own it.

   NOTE(review): template arguments appear to have been stripped throughout
   this copy (`Vector&`, `map`, `std::set&`, `map::const_iterator`).  From the
   uses below, the vector elements expose `.word` and `.freq`, and `s2i` is
   looked up by `string` and yields a value returned as `int` -- confirm the
   exact types against upstream. */
class vcbList {
private:
  Vector& list ;                       /* referenced vocabulary entries (not owned) */
  map s2i;                             /* word -> index lookup used by has_word()/operator() */
  double total;                        /* sum of all amounts added via incFreq() since the last clear */
  WordIndex noUniqueTokens ;           /* set to 0 at construction; not modified by any code visible here */
  WordIndex noUniqueTokensInCorpus ;   /* number of entries whose freq left 0 via incFreq() */
  const char* fname ;                  /* name pointer as given; the pointed-to string is not copied */
public:
  /* Bind to vocabulary vector `vcb`, zero all counters, and remember the
     name pointer `f` (defaults to null). */
  vcbList(Vector& vcb,const char* f=0):list(vcb), total(0), noUniqueTokens(0), noUniqueTokensInCorpus(0), fname(f) {};
  /* Replace the stored name pointer. */
  void setName(const char*f) { fname=f; }
  /* Copy constructor: the new object refers to the SAME vector as `a` and
     copies `total`, `noUniqueTokens` and `fname`.  `noUniqueTokensInCorpus`
     is reset to 0, and `s2i` is not listed in the initializer list, so it is
     default-constructed rather than copied from `a`. */
  vcbList(const vcbList& a):list(a.list), total(a.total), noUniqueTokens(a.noUniqueTokens), noUniqueTokensInCorpus(0), fname(a.fname) {};
  /* Declared only; the definition is not in this header. */
  void compact(const std::set& evoc);
  /* Number of entries in the referenced vector. */
  inline WordIndex size()const { return (list.size()); };
  /* Current value of `noUniqueTokens`. */
  inline WordIndex uniqTokens()const { return noUniqueTokens; };
  /* Current value of `noUniqueTokensInCorpus`. */
  inline WordIndex uniqTokensInCorpus()const { return noUniqueTokensInCorpus; };
  /* Current value of `total`. */
  inline double totalVocab() const { return total; };
  /* Mutable access to the referenced vector. */
  inline Vector& getVocabList() { return(list); };
  /* Read-only access to the referenced vector. */
  inline const Vector& getVocabList()const { return(list); };
  /* Declared only; the definition is not in this header. */
  void readVocabList();
  /* Add `f` to the frequency of entry `id` and to `total`.  If the entry's
     frequency was exactly 0 beforehand, `noUniqueTokensInCorpus` is bumped
     first.  An `id` at or beyond list.size() is silently ignored. */
  void incFreq(WordIndex id , double f) {
    if(id < list.size()) {
      if (list[id].freq == 0) noUniqueTokensInCorpus++;
      list[id].freq += f ;
      total += f ;
    }
  };
  /* Reset every entry's frequency to 0 and zero `total` and
     `noUniqueTokensInCorpus`.  `noUniqueTokens` is left untouched. */
  void clearAllFreq() {
    for (WordIndex id = 0 ; id < list.size() ; id++) list[id].freq = 0 ;
    total = 0 ;
    noUniqueTokensInCorpus = 0 ;
  };
  /* True iff `x` is a key of `s2i`. */
  const bool has_word(const string& x) const {
    map::const_iterator i=s2i.find(x);
    return i!=s2i.end();
  }
  /* Look `x` up in `s2i` and return the mapped index when present.  When it
     is absent, an "ERROR: no word index for '" message is started on cerr. */
  int operator()(const string&x)const {
    map::const_iterator i=s2i.find(x);
    if( i!=s2i.end() ) return i->second;
    else {
      /* NOTE(review): the source text is truncated inside the next statement.
         Everything between `'"<` and ` 0) of << i` is missing from this copy,
         so the rest of this function and any members declared before the
         trailing print statement cannot be seen here.  What remains visible
         writes `i`, list[i].word and list[i].freq, space-separated and
         newline-terminated, to a stream named `of`.  Restore from upstream
         before relying on this file. */
      cerr << "ERROR: no word index for '"< 0) of << i << ' ' << list[i].word << ' ' << list[i].freq << '\n';
    }
  }
};
#endif