separated tokenization and adding to index

This commit is contained in:
rjawor 2015-08-01 17:03:39 +02:00
parent 5a57406875
commit 28704c2f43
6 changed files with 127 additions and 8 deletions

View File

@ -1,5 +1,6 @@
---------------------------- Developer's private notes (language may vary, because it is sometimes more convenient that way) -----------------------------
- multiple indexes based on different hashes. One can use WordNet base forms, another POS tags, and so on. Develop a method of combining results.
IN PROGRESS - document the code (classes, cfg files) and update tutorial
- multiple translation memories: they can be stored in a single index, but a tm_id has to be added as sentence metadata (e.g. instead of the example length). During search, results should be filtered so that they come from the right translation memory (see the sketch after these notes).
- memory usage tests
- The example length in the markers will probably be needed only by anubis (which, as it turns out, is slow). Think about what the markers could be used for, because the idea behind them seems good.
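
A minimal sketch of the tm_id filtering idea from the note above; the record and function names are hypothetical and do not exist in Concordia:

#include <vector>

// Hypothetical occurrence record: the matched sentence id plus the tm_id
// stored as its metadata (e.g. in place of the example length).
struct OccurrenceWithTm {
    unsigned long sentenceId;
    unsigned long tmId;
};

// Keep only the occurrences that come from the requested translation memory.
std::vector<OccurrenceWithTm> filterByTm(
        const std::vector<OccurrenceWithTm> & occurrences,
        unsigned long requestedTmId) {
    std::vector<OccurrenceWithTm> result;
    for (std::vector<OccurrenceWithTm>::const_iterator it = occurrences.begin();
            it != occurrences.end(); ++it) {
        if (it->tmId == requestedTmId) {
            result.push_back(*it);
        }
    }
    return result;
}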

View File

@ -42,6 +42,17 @@ std::string _createLibraryVersion() {
return version.str();
}
boost::shared_ptr<TokenizedSentence>
Concordia::tokenize(const std::string & sentence)
throw(ConcordiaException) {
boost::shared_ptr<TokenizedSentence> result =
_hashGenerator->generateHash(sentence);
_hashGenerator->serializeWordMap();
return result;
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
boost::shared_ptr<TokenizedSentence> Concordia::addExample(
@ -50,6 +61,17 @@ boost::shared_ptr<TokenizedSentence> Concordia::addExample(
return _index->addExample(_hashGenerator, _T, _markers, example);
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
void Concordia::addTokenizedExample(
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id)
throw(ConcordiaException) {
_index->addTokenizedExample(_hashGenerator, _T,
_markers, tokenizedSentence, id);
}
// Sentences are written to disk and added to T.
// SA is generated on command by other methods.
std::vector<TokenizedSentence> Concordia::addAllExamples(

View File

@ -52,6 +52,15 @@ public:
*/
std::string & getVersion();
/*! Tokenizes the given sentence.
\param sentence sentence to be tokenized
\returns tokenized sentence object,
containing information about original word positions
\throws ConcordiaException
*/
boost::shared_ptr<TokenizedSentence> tokenize(const std::string & sentence)
throw(ConcordiaException);
/*! Adds an Example to the index.
\param example example to be added
\returns tokenized sentence object,
@ -61,6 +70,16 @@ public:
boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
throw(ConcordiaException);
/*! Adds a tokenized example to the index.
\param tokenizedSentence tokenized sentence to be added
\param id id of the sentence to be added
\throws ConcordiaException
*/
void addTokenizedExample(
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id)
throw(ConcordiaException);
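
For reference, a minimal usage sketch of the two-step API declared above, mirroring the tests later in this commit; the configuration path and the header path are placeholders, not part of this commit:

#include <boost/shared_ptr.hpp>
#include "concordia/concordia.hpp"  // header path assumed for illustration

int main() {
    // Placeholder configuration path - substitute a real concordia.cfg.
    Concordia concordia("path/to/concordia.cfg");
    // Step 1: tokenize the sentence and, if needed, inspect the tokens.
    boost::shared_ptr<TokenizedSentence> ts =
        concordia.tokenize("Alice has a cat");
    // Step 2: add the already tokenized sentence under a chosen id.
    concordia.addTokenizedExample(ts, 1);
    // Generate the suffix array from the RAM structures before searching.
    concordia.refreshSAfromRAM();
    return 0;
}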
/*! Adds multiple examples to the index.
\param examples vector of examples to be added
\returns vector of tokenized sentence objects,

View File

@ -82,16 +82,33 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
return hashedPattern;
}
void ConcordiaIndex::addTokenizedExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id) {
std::ofstream hashedIndexFile;
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
std::ofstream markersFile;
markersFile.open(_markersFilePath.c_str(), std::ios::out|
std::ios::app|std::ios::binary);
_addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, tokenizedSentence, id);
hashedIndexFile.close();
markersFile.close();
}
void ConcordiaIndex::_addSingleTokenizedExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id) {
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence->getCodes();
int offset = 0;
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
@ -102,7 +119,7 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
// append to markersFile
SUFFIX_MARKER_TYPE marker = Utils::createMarker(
id,
offset,
hash.size());
@ -120,7 +137,20 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
Utils::writeMarker(markersFile, sentenceBoundaryMA);
markers->push_back(sentenceBoundaryMA);
}
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example) {
boost::shared_ptr<TokenizedSentence> hashedPattern =
hashGenerator->generateHash(example.getSentence());
_addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
T, markers, hashedPattern, example.getId());
return hashedPattern;
}

View File

@ -59,6 +59,27 @@ public:
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
const Example & example);
/*! Adds a tokenized example to the index. Hashed index
and markers array are appended with the example.
At the same time, HDD versions of these
two data structures are also appended with the same example.
\param hashGenerator hash generator to be used to prepare the hash
of the example
\param T RAM-based hash index to be appended to
\param markers RAM-based markers array to be appended to
\param tokenizedSentence tokenized sentence to be added
\param id id of the sentence to be added
\throws ConcordiaException
*/
void addTokenizedExample(
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id);
/*! Adds multiple examples to the index. Examples are first hashed using
the hash generator passed to this method. Then, hashed index
and markers array (also passed to this method) are appended
@ -87,6 +108,15 @@ public:
boost::shared_ptr<std::vector<sauchar_t> > T);
private:
void _addSingleTokenizedExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,
boost::shared_ptr<HashGenerator> hashGenerator,
boost::shared_ptr<std::vector<sauchar_t> > T,
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
SUFFIX_MARKER_TYPE id);
boost::shared_ptr<TokenizedSentence> _addSingleExample(
std::ofstream & hashedIndexFile,
std::ofstream & markersFile,

View File

@ -287,7 +287,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
concordia.addExample(Example("Alice has a cat", 56)); concordia.addExample(Example("Alice has a cat", 56));
concordia.addExample(Example("Alice has a dog", 23)); concordia.addExample(Example("Alice has a dog", 23));
concordia.addExample(Example("New test product has a mistake", 321)); concordia.addExample(Example("New test product has a mistake", 321));
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14)); boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize("This is just testing and it has nothing to do with the above");
concordia.addTokenizedExample(ts, 14);
concordia.refreshSAfromRAM();
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
@ -324,4 +325,20 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
concordia.clearIndex();
}
BOOST_AUTO_TEST_CASE( Tokenize )
{
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize(" Ala posiada kota");
/*
0,3 type: 1 value: ala
4,11 type: 1 value: posiada
12,16 type: 1 value: kota
*/
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
}
BOOST_AUTO_TEST_SUITE_END()