124 lines
3.3 KiB
C++
124 lines
3.3 KiB
C++
/*
|
|
|
|
EGYPT Toolkit for Statistical Machine Translation
|
|
Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
|
|
|
|
This program is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU General Public License
|
|
as published by the Free Software Foundation; either version 2
|
|
of the License, or (at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
|
|
USA.
|
|
|
|
*/
|
|
#ifndef _vocab_h
|
|
#define _vocab_h 1
|
|
|
|
#include "defs.h"
|
|
#include "Vector.h"
|
|
|
|
#include <fstream>
|
|
#include <strstream>
|
|
#include <map>
|
|
#include <set>
|
|
|
|
class WordEntry
|
|
{
|
|
public:
|
|
string word ;
|
|
double freq ;
|
|
WordEntry():word("\0"), freq(0) {};
|
|
WordEntry(string w, int f):word(w), freq(f) {};
|
|
};
|
|
|
|
class vcbList
|
|
{
|
|
private:
|
|
Vector<WordEntry>& list ;
|
|
map<string,int> s2i;
|
|
double total;
|
|
WordIndex noUniqueTokens ;
|
|
WordIndex noUniqueTokensInCorpus ;
|
|
const char* fname ;
|
|
public:
|
|
vcbList(Vector<WordEntry>& vcb,const char* f=0):list(vcb), total(0), noUniqueTokens(0), noUniqueTokensInCorpus(0), fname(f) {};
|
|
void setName(const char*f) {
|
|
fname=f;
|
|
}
|
|
vcbList(const vcbList& a):list(a.list), total(a.total), noUniqueTokens(a.noUniqueTokens), noUniqueTokensInCorpus(0), fname(a.fname) {};
|
|
void compact(const std::set<WordIndex>& evoc);
|
|
inline WordIndex size()const {
|
|
return (list.size());
|
|
};
|
|
inline WordIndex uniqTokens()const {
|
|
return noUniqueTokens;
|
|
};
|
|
inline WordIndex uniqTokensInCorpus()const {
|
|
return noUniqueTokensInCorpus;
|
|
};
|
|
inline double totalVocab() const {
|
|
return total;
|
|
};
|
|
inline Vector<WordEntry>& getVocabList() {
|
|
return(list);
|
|
};
|
|
inline const Vector<WordEntry>& getVocabList()const {
|
|
return(list);
|
|
};
|
|
void readVocabList();
|
|
void incFreq(WordIndex id , double f) {
|
|
if(id < list.size()) {
|
|
if (list[id].freq == 0)
|
|
noUniqueTokensInCorpus++;
|
|
list[id].freq += f ;
|
|
total += f ;
|
|
}
|
|
};
|
|
void clearAllFreq() {
|
|
for (WordIndex id = 0 ; id < list.size() ; id++)
|
|
list[id].freq = 0 ;
|
|
total = 0 ;
|
|
noUniqueTokensInCorpus = 0 ;
|
|
};
|
|
|
|
const bool has_word(const string& x) const {
|
|
map<string,int>::const_iterator i=s2i.find(x);
|
|
return i!=s2i.end();
|
|
}
|
|
int operator()(const string&x)const {
|
|
map<string,int>::const_iterator i=s2i.find(x);
|
|
if( i!=s2i.end() )
|
|
return i->second;
|
|
else {
|
|
cerr << "ERROR: no word index for '"<<x<<"'\n";
|
|
return 0;
|
|
}
|
|
}
|
|
const string operator()(WordIndex id) const { // Yaser - 2000-12-13
|
|
if (id < list.size())
|
|
return list[id].word ;
|
|
else return 0 ;
|
|
}
|
|
const string operator[](WordIndex id) const { // Yaser - 2000-12-13
|
|
if (id < list.size())
|
|
return list[id].word ;
|
|
else return 0 ;
|
|
}
|
|
void printVocabList(ostream& of) {
|
|
for (WordIndex i = 1 ; i < list.size() ; i++) {
|
|
if (list[i].word != "" && list[i].freq > 0)
|
|
of << i << ' ' << list[i].word << ' ' << list[i].freq << '\n';
|
|
}
|
|
}
|
|
|
|
};
|
|
#endif
|