#include <sstream>
#include <fstream>
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>

#include "concordia/concordia.hpp"
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"

// ===========================================

std::string _createLibraryVersion();

// ===========================================

std::string Concordia::_libraryVersion = _createLibraryVersion();

// ===========================================

Concordia::Concordia() {
}

Concordia::Concordia(const std::string & indexPath,
                     const std::string & configFilePath):
                                         _indexPath(indexPath) {
    _config = boost::shared_ptr<ConcordiaConfig>(
                        new ConcordiaConfig(configFilePath));
    _index = boost::shared_ptr<ConcordiaIndex>(
                        new ConcordiaIndex(_getHashedIndexFilePath(),
                                           _getMarkersFilePath()));
    _searcher = boost::shared_ptr<IndexSearcher>(new IndexSearcher());
    _initializeIndex();
}

Concordia::~Concordia() {
}

std::string & Concordia::getVersion() {
    return _libraryVersion;
}

std::string _createLibraryVersion() {
    std::stringstream version;
    version << CONCORDIA_VERSION_MAJOR << "." << CONCORDIA_VERSION_MINOR;
    return version.str();
}

TokenizedSentence Concordia::tokenize(const std::string & sentence,
                                      bool byWhitespace,
                                      bool generateCodes) {
    if (generateCodes) {
        TokenizedSentence result =
            _hashGenerator->generateHash(sentence, byWhitespace);
        _hashGenerator->serializeWordMap();
        return result;
    } else {
        return _hashGenerator->generateTokens(sentence, byWhitespace);
    }
}

std::vector<TokenizedSentence> Concordia::tokenizeAll(
                            const std::vector<std::string> & sentences,
                            bool byWhitespace,
                            bool generateCodes) {
    std::vector<TokenizedSentence> result;
    if (generateCodes) {
        BOOST_FOREACH(std::string sentence, sentences) {
            result.push_back(
                _hashGenerator->generateHash(sentence, byWhitespace));
        }
        _hashGenerator->serializeWordMap();
    } else {
        BOOST_FOREACH(std::string sentence, sentences) {
            result.push_back(
                _hashGenerator->generateTokens(sentence, byWhitespace));
        }
    }
    return result;
}

// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
TokenizedSentence Concordia::addExample(const Example & example) {
    return _index->addExample(_hashGenerator, _T, _markers, example);
}

// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
void Concordia::addTokenizedExample(
                    const TokenizedSentence & tokenizedSentence,
                    const SUFFIX_MARKER_TYPE id) {
    _index->addTokenizedExample(_hashGenerator, _T, _markers,
                                tokenizedSentence, id);
}

void Concordia::addAllTokenizedExamples(
                const std::vector<TokenizedSentence> & tokenizedSentences,
                const std::vector<SUFFIX_MARKER_TYPE> & ids) {
    _index->addAllTokenizedExamples(_hashGenerator, _T, _markers,
                                    tokenizedSentences, ids);
}

// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
std::vector<TokenizedSentence> Concordia::addAllExamples(
                    const std::vector<Example> & examples) {
    return _index->addAllExamples(_hashGenerator, _T, _markers, examples);
}

void Concordia::loadRAMIndexFromDisk() {
    if (boost::filesystem::exists(_getWordMapFilePath())
        && boost::filesystem::exists(_getHashedIndexFilePath())
        && boost::filesystem::exists(_getMarkersFilePath())) {
        try {
            // reading the hashed index from file
            _T->clear();
            std::ifstream hashedIndexFile;
            hashedIndexFile.open(_getHashedIndexFilePath().c_str(),
                std::ios::in | std::ios::ate | std::ios::binary);
            long hiFileSize = hashedIndexFile.tellg();
            if (hiFileSize > 0) {
                hashedIndexFile.seekg(0, std::ios::beg);
                while (!hashedIndexFile.eof()) {
                    INDEX_CHARACTER_TYPE character =
                        Utils::readIndexCharacter(hashedIndexFile);
                    Utils::appendCharToSaucharVector(_T, character);
                }
                hashedIndexFile.close();
            } else {
                hashedIndexFile.close();
                throw ConcordiaException(
                    "Index corrupt: empty hash index file");
            }

            // reading markers from file
            _markers->clear();
            std::ifstream markersFile;
            markersFile.open(_getMarkersFilePath().c_str(),
                std::ios::in | std::ios::ate | std::ios::binary);
            long maFileSize = markersFile.tellg();
            if (maFileSize > 0) {
                markersFile.seekg(0, std::ios::beg);
                while (!markersFile.eof()) {
                    SUFFIX_MARKER_TYPE marker = Utils::readMarker(markersFile);
                    _markers->push_back(marker);
                }
                markersFile.close();
            } else {
                markersFile.close();
                throw ConcordiaException("Index corrupt: empty markers file");
            }

            // generating the suffix array
            _index->generateSuffixArray(_T, _SA);
        } catch (const std::bad_alloc &) {
            throw ConcordiaException(
                "Error allocating memory, probably out of memory.");
        }
    } else {
        throw ConcordiaException("Index corrupt: missing files");
    }
}

void Concordia::refreshSAfromRAM() {
    _index->generateSuffixArray(_T, _SA);
}

void Concordia::_initializeIndex() {
    _hashGenerator = boost::shared_ptr<HashGenerator>(
                        new HashGenerator(_indexPath, _config));
    _T = boost::shared_ptr<std::vector<sauchar_t> >(
                        new std::vector<sauchar_t>);
    _SA = boost::shared_ptr<std::vector<saidx_t> >(
                        new std::vector<saidx_t>);
    _markers = boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> >(
                        new std::vector<SUFFIX_MARKER_TYPE>);
    if (boost::filesystem::exists(_getWordMapFilePath())
        && boost::filesystem::exists(_getHashedIndexFilePath())
        && boost::filesystem::exists(_getMarkersFilePath())) {
        loadRAMIndexFromDisk();
    }
}

SUFFIX_MARKER_TYPE Concordia::countOccurrences(const std::string & pattern) {
    if (_T->size() > 0) {
        return _searcher->countOccurrences(_hashGenerator, _T,
                                           _markers, _SA, pattern);
    } else {
        return 0;
    }
}

MatchedPatternFragment Concordia::simpleSearch(const std::string & pattern,
                                               bool byWhitespace) {
    if (_T->size() > 0 && pattern.size() > 0) {
        return _searcher->simpleSearch(_hashGenerator, _T, _markers,
                                       _SA, pattern, byWhitespace);
    } else {
        // If the index or the search pattern is empty, return an empty result.
        MatchedPatternFragment result(0, 0);
        return result;
    }
}

OccurrencesList Concordia::fullSearch(const std::string & pattern,
                                      int limit,
                                      int offset,
                                      bool byWhitespace) {
    if (_T->size() > 0 && pattern.size() > 0) {
        return _searcher->fullSearch(_hashGenerator, _T, _markers, _SA,
                                     pattern, limit, offset, byWhitespace);
    } else {
        // If the index or the search pattern is empty, return an empty result.
        OccurrencesList result(0);
        return result;
    }
}

MatchedPatternFragment Concordia::lexiconSearch(const std::string & pattern,
                                                bool byWhitespace) {
    if (_T->size() > 0 && pattern.size() > 0) {
        return _searcher->lexiconSearch(_hashGenerator, _T, _markers,
                                        _SA, pattern, byWhitespace);
    } else {
        // If the index or the search pattern is empty, return an empty result.
        // Performing a lexicon search with an empty pattern would be
        // especially harmful, as it would effectively search for a double
        // EOS, which is very frequent in the index.
        MatchedPatternFragment result(0, 0);
        return result;
    }
}

std::vector<AnubisSearchResult> Concordia::anubisSearch(
                                        const std::string & pattern) {
    if (_T->size() > 0) {
        return _searcher->anubisSearch(_config, _hashGenerator, _T,
                                       _markers, _SA, pattern);
    } else {
        std::vector<AnubisSearchResult> result;
        return result;
    }
}

boost::shared_ptr<ConcordiaSearchResult> Concordia::concordiaSearch(
                                        const std::string & pattern,
                                        bool byWhitespace) {
    if (_T->size() > 0) {
        return _searcher->concordiaSearch(_hashGenerator, _T, _markers,
                                          _SA, pattern, byWhitespace);
    } else {
        // If the index is empty, return a result built on an empty sentence.
        std::string empty;
        return boost::shared_ptr<ConcordiaSearchResult>(
                   new ConcordiaSearchResult(TokenizedSentence(empty)));
    }
}

void Concordia::clearIndex() {
    // Reset the word map, the RAM-held index structures
    // and the index files on disk.
    _hashGenerator->clearWordMap();
    _T = boost::shared_ptr<std::vector<sauchar_t> >(
                        new std::vector<sauchar_t>);
    _markers = boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> >(
                        new std::vector<SUFFIX_MARKER_TYPE>);
    _SA = boost::shared_ptr<std::vector<saidx_t> >(
                        new std::vector<saidx_t>);
    boost::filesystem::remove(_getHashedIndexFilePath());
    boost::filesystem::remove(_getMarkersFilePath());
}

std::string Concordia::_getWordMapFilePath() {
    return _indexPath + "/" + WORD_MAP_FILE_NAME;
}

std::string Concordia::_getHashedIndexFilePath() {
    return _indexPath + "/" + HASHED_INDEX_FILE_NAME;
}

std::string Concordia::_getMarkersFilePath() {
    return _indexPath + "/" + MARKERS_FILE_NAME;
}
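
// ---------------------------------------------------------------------------
// Usage sketch (illustration only, not part of the library build). It shows
// the call sequence implied by this file: construct a Concordia instance,
// add an example (written to disk and appended to T), rebuild the suffix
// array, then search. The Example constructor arguments, the header paths
// and the config/index paths are assumptions, not taken from this file.
//
//     #include "concordia/concordia.hpp"
//     #include "concordia/example.hpp"
//
//     int main() {
//         // index directory and config file are hypothetical paths
//         Concordia concordia("/tmp/concordia_index", "concordia.cfg");
//         concordia.addExample(Example("Alice has a cat", 14));
//         concordia.refreshSAfromRAM();   // regenerate SA from RAM-held T
//         MatchedPatternFragment fragment =
//             concordia.simpleSearch("has a cat", false);
//         return 0;
//     }
// ---------------------------------------------------------------------------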