separated tokenization and adding to index
parent 5a57406875
commit 28704c2f43

TODO.txt (3 changed lines)
@@ -1,5 +1,6 @@
 ---------------------------- Developer's private notes (language may vary, sometimes it is simply more convenient that way) -----------------------------
-- document the code (classes, cfg files) and update tutorial
 - multiple indexes based on different hashes. One can be word-net base forms, other - pos-tags and so on. Develop a method of combining results.
+IN PROGRESS - document the code (classes, cfg files) and update tutorial
 - multiple translation memories: they can be stored in a single index, but a tm_id has to be added as sentence metadata (e.g. instead of the example length). When searching, the results have to be filtered so that they come from the right translation memory.
 - memory usage tests
+- The example length stored in the markers will probably be needed only by anubis (which, as it turns out, is slow). Think about what the markers could be used for, because the idea behind them seems sound.
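The "multiple translation memories" note above amounts to tagging every indexed sentence with the translation memory it came from and filtering search hits by that tag. A minimal C++ sketch of the filtering step, assuming a hypothetical match record with a getTmId() accessor (nothing in this commit defines such a field):

#include <vector>

// Hypothetical match record; only the tm id matters for this sketch.
struct MatchedFragment {
    int tmId;        // translation memory the example came from
    int exampleId;   // id of the matched example
    int getTmId() const { return tmId; }
};

// Keep only the hits coming from the requested translation memory.
std::vector<MatchedFragment> filterByTm(const std::vector<MatchedFragment> & hits,
                                        int requestedTmId) {
    std::vector<MatchedFragment> result;
    for (std::vector<MatchedFragment>::const_iterator it = hits.begin();
            it != hits.end(); ++it) {
        if (it->getTmId() == requestedTmId) {
            result.push_back(*it);
        }
    }
    return result;
}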
@@ -42,6 +42,17 @@ std::string _createLibraryVersion() {
     return version.str();
 }
 
+boost::shared_ptr<TokenizedSentence>
+    Concordia::tokenize(const std::string & sentence)
+                                     throw(ConcordiaException) {
+    boost::shared_ptr<TokenizedSentence> result =
+                    _hashGenerator->generateHash(sentence);
+    _hashGenerator->serializeWordMap();
+    return result;
+
+}
+
+
 // Sentences are written to disk and added to T.
 // SA is generated on command by other methods.
 boost::shared_ptr<TokenizedSentence> Concordia::addExample(
@@ -50,6 +61,17 @@ boost::shared_ptr<TokenizedSentence> Concordia::addExample(
     return _index->addExample(_hashGenerator, _T, _markers, example);
 }
 
+// Sentences are written to disk and added to T.
+// SA is generated on command by other methods.
+void Concordia::addTokenizedExample(
+          boost::shared_ptr<TokenizedSentence> tokenizedSentence,
+          SUFFIX_MARKER_TYPE id)
+                                     throw(ConcordiaException) {
+    _index->addTokenizedExample(_hashGenerator, _T,
+                                _markers, tokenizedSentence, id);
+}
+
+
 // Sentences are written to disk and added to T.
 // SA is generated on command by other methods.
 std::vector<TokenizedSentence> Concordia::addAllExamples(
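Taken together, the two methods above split the old addExample() flow into an explicit tokenize-then-add sequence; the suffix array is still rebuilt separately. A minimal usage sketch mirroring the updated test case further down in this diff (the include paths and the configuration file path are assumptions):

// Include paths and the configuration file path are assumptions.
#include <boost/shared_ptr.hpp>
#include "concordia/concordia.hpp"

int main() {
    Concordia concordia("concordia.cfg");

    // Tokenize once, keeping the original word positions...
    boost::shared_ptr<TokenizedSentence> ts =
        concordia.tokenize("Alice has a cat");

    // ...then add the already tokenized sentence under id 56
    // and rebuild the suffix array from RAM, as the updated test does.
    concordia.addTokenizedExample(ts, 56);
    concordia.refreshSAfromRAM();

    return 0;
}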
@@ -52,6 +52,15 @@ public:
     */
     std::string & getVersion();
 
+    /*! Tokenizes the given sentence.
+      \param sentence sentence to be tokenized
+      \returns tokenized sentence object,
+               containing information about original word positions
+      \throws ConcordiaException
+    */
+    boost::shared_ptr<TokenizedSentence> tokenize(const std::string & sentence)
+                                                 throw(ConcordiaException);
+
     /*! Adds an Example to the index.
       \param example example to be added
       \returns tokenized sentence object,
@@ -61,6 +70,16 @@ public:
     boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
                                                  throw(ConcordiaException);
 
+    /*! Adds a tokenized example to the index.
+      \param tokenizedSentence tokenized sentence to be added
+      \param id of the sentence to be added
+      \throws ConcordiaException
+    */
+    void addTokenizedExample(
+                  boost::shared_ptr<TokenizedSentence> tokenizedSentence,
+                  SUFFIX_MARKER_TYPE id)
+                  throw(ConcordiaException);
+
     /*! Adds multiple examples to the index.
       \param examples vector of examples to be added
       \returns vector of tokenized sentence objects,
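Both new declarations keep the throw(ConcordiaException) specification used elsewhere in this header, so a caller can guard the whole tokenize-and-add sequence with a single handler. A short sketch, assuming ConcordiaException provides a printable what() message and that the include path below matches the local layout:

#include <iostream>
#include <string>
#include <boost/shared_ptr.hpp>
// Include path is an assumption.
#include "concordia/concordia.hpp"

// Tokenize a sentence and add it under the given id, reporting any failure.
void addSafely(Concordia & concordia,
               const std::string & sentence,
               SUFFIX_MARKER_TYPE id) {
    try {
        boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize(sentence);
        concordia.addTokenizedExample(ts, id);
    } catch (ConcordiaException & e) {
        // Assumption: ConcordiaException carries a printable message.
        std::cerr << "indexing failed: " << e.what() << std::endl;
    }
}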
@@ -82,16 +82,33 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
     return hashedPattern;
 }
 
-boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
+void ConcordiaIndex::addTokenizedExample(
+    boost::shared_ptr<HashGenerator> hashGenerator,
+    boost::shared_ptr<std::vector<sauchar_t> > T,
+    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
+    boost::shared_ptr<TokenizedSentence> tokenizedSentence,
+    SUFFIX_MARKER_TYPE id) {
+    std::ofstream hashedIndexFile;
+    hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
+                                 std::ios::app|std::ios::binary);
+    std::ofstream markersFile;
+    markersFile.open(_markersFilePath.c_str(), std::ios::out|
+                                 std::ios::app|std::ios::binary);
+    _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
+                               T, markers, tokenizedSentence, id);
+    hashedIndexFile.close();
+    markersFile.close();
+}
+
+void ConcordiaIndex::_addSingleTokenizedExample(
     std::ofstream & hashedIndexFile,
     std::ofstream & markersFile,
     boost::shared_ptr<HashGenerator> hashGenerator,
     boost::shared_ptr<std::vector<sauchar_t> > T,
     boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
-    const Example & example) {
-    boost::shared_ptr<TokenizedSentence> hashedPattern =
-        hashGenerator->generateHash(example.getSentence());
-    std::vector<INDEX_CHARACTER_TYPE> hash = hashedPattern->getCodes();
+    boost::shared_ptr<TokenizedSentence> tokenizedSentence,
+    SUFFIX_MARKER_TYPE id) {
+    std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence->getCodes();
 
     int offset = 0;
     for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
@@ -102,7 +119,7 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
 
     // append to markersFile
     SUFFIX_MARKER_TYPE marker = Utils::createMarker(
-                                    example.getId(),
+                                    id,
                                     offset,
                                     hash.size());
 
@@ -120,6 +137,19 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
     SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
     Utils::writeMarker(markersFile, sentenceBoundaryMA);
     markers->push_back(sentenceBoundaryMA);
+}
+
+boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
+    std::ofstream & hashedIndexFile,
+    std::ofstream & markersFile,
+    boost::shared_ptr<HashGenerator> hashGenerator,
+    boost::shared_ptr<std::vector<sauchar_t> > T,
+    boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
+    const Example & example) {
+    boost::shared_ptr<TokenizedSentence> hashedPattern =
+        hashGenerator->generateHash(example.getSentence());
+    _addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
+                               T, markers, hashedPattern, example.getId());
 
     return hashedPattern;
 }
@@ -59,6 +59,27 @@ public:
               boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
               const Example & example);
 
+    /*! Adds a tokenized example to the index. Hashed index
+      and markers array are appended with the example.
+      At the same time, HDD versions of these
+      two data structures are also appended with the same example.
+      The method returns a tokenized version of the example.
+      \param hashGenerator hash generator to be used to prepare the hash
+             of the example
+      \param T RAM-based hash index to be appended to
+      \param markers RAM-based markers array to be appended to
+      \param example example to be added to index
+      \param tokenizedSentence tokenized sentence to be added
+      \param id of the sentence to be added
+      \throws ConcordiaException
+    */
+    void addTokenizedExample(
+              boost::shared_ptr<HashGenerator> hashGenerator,
+              boost::shared_ptr<std::vector<sauchar_t> > T,
+              boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
+              boost::shared_ptr<TokenizedSentence> tokenizedSentence,
+              SUFFIX_MARKER_TYPE id);
+
     /*! Adds multiple examples to the index. Examples are first hashed using
       the hash generator passed to this method. Then, hashed index
       and markers array (also passed to this method) are appended
@@ -87,6 +108,15 @@ public:
               boost::shared_ptr<std::vector<sauchar_t> > T);
 
 private:
+    void _addSingleTokenizedExample(
+              std::ofstream & hashedIndexFile,
+              std::ofstream & markersFile,
+              boost::shared_ptr<HashGenerator> hashGenerator,
+              boost::shared_ptr<std::vector<sauchar_t> > T,
+              boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
+              boost::shared_ptr<TokenizedSentence> tokenizedSentence,
+              SUFFIX_MARKER_TYPE id);
+
     boost::shared_ptr<TokenizedSentence> _addSingleExample(
               std::ofstream & hashedIndexFile,
               std::ofstream & markersFile,
@@ -287,7 +287,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
     concordia.addExample(Example("Alice has a cat", 56));
     concordia.addExample(Example("Alice has a dog", 23));
     concordia.addExample(Example("New test product has a mistake", 321));
-    concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
+    boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize("This is just testing and it has nothing to do with the above");
+    concordia.addTokenizedExample(ts, 14);
     concordia.refreshSAfromRAM();
 
     boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
@@ -324,4 +325,20 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
 
     concordia.clearIndex();
 }
+
+BOOST_AUTO_TEST_CASE( Tokenize )
+{
+    Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
+    boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize(" Ala posiada kota");
+    /*
+    0,3 type: 1 value: ala
+    4,11 type: 1 value: posiada
+    12,16 type: 1 value: kota
+    */
+    BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
+    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9);
+    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16);
+    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
+    BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
+}
 BOOST_AUTO_TEST_SUITE_END()