separated tokenization and adding to index
This commit is contained in:
parent
5a57406875
commit
28704c2f43
3
TODO.txt
3
TODO.txt
@ -1,5 +1,6 @@
|
|||||||
---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
|
---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) -----------------------------
|
||||||
- document the code (classes, cfg files) and update tutorial
|
- multiple indexes based on different hashes. One can be word-net base forms, other - pos-tags and so on. Develop a method of combining results.
|
||||||
|
IN PROGRESS - document the code (classes, cfg files) and update tutorial
|
||||||
- wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.
|
- wiele pamięci tłumaczeń: można je przechowywać w jednym indeksie, ale trzeba dodać tm_id jako metadane zdania (np. zamiast example length). Przy wyszukiwaniu należy filtrować wyniki, aby pochodziły z odpowiedniej pamięci tłumaczeń.
|
||||||
- testy zużycia pamięci
|
- testy zużycia pamięci
|
||||||
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
|
- Prawdopodobnie długość example w markers będzie potrzebna tylko anubisowi (który, jak się okazuje, jest wolny). Pomyśleć, do czego można wykorzystać markery, bo ich idea wydaje się niezła.
|
||||||
|
@ -42,6 +42,17 @@ std::string _createLibraryVersion() {
|
|||||||
return version.str();
|
return version.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
boost::shared_ptr<TokenizedSentence>
|
||||||
|
Concordia::tokenize(const std::string & sentence)
|
||||||
|
throw(ConcordiaException) {
|
||||||
|
boost::shared_ptr<TokenizedSentence> result =
|
||||||
|
_hashGenerator->generateHash(sentence);
|
||||||
|
_hashGenerator->serializeWordMap();
|
||||||
|
return result;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// Sentences are written to disk and added to T.
|
// Sentences are written to disk and added to T.
|
||||||
// SA is generated on command by other methods.
|
// SA is generated on command by other methods.
|
||||||
boost::shared_ptr<TokenizedSentence> Concordia::addExample(
|
boost::shared_ptr<TokenizedSentence> Concordia::addExample(
|
||||||
@ -50,6 +61,17 @@ boost::shared_ptr<TokenizedSentence> Concordia::addExample(
|
|||||||
return _index->addExample(_hashGenerator, _T, _markers, example);
|
return _index->addExample(_hashGenerator, _T, _markers, example);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sentences are written to disk and added to T.
|
||||||
|
// SA is generated on command by other methods.
|
||||||
|
void Concordia::addTokenizedExample(
|
||||||
|
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
|
||||||
|
SUFFIX_MARKER_TYPE id)
|
||||||
|
throw(ConcordiaException) {
|
||||||
|
_index->addTokenizedExample(_hashGenerator, _T,
|
||||||
|
_markers, tokenizedSentence, id);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// Sentences are written to disk and added to T.
|
// Sentences are written to disk and added to T.
|
||||||
// SA is generated on command by other methods.
|
// SA is generated on command by other methods.
|
||||||
std::vector<TokenizedSentence> Concordia::addAllExamples(
|
std::vector<TokenizedSentence> Concordia::addAllExamples(
|
||||||
|
@ -52,6 +52,15 @@ public:
|
|||||||
*/
|
*/
|
||||||
std::string & getVersion();
|
std::string & getVersion();
|
||||||
|
|
||||||
|
/*! Tokenizes the given sentence.
|
||||||
|
\param sentence sentence to be tokenized
|
||||||
|
\returns tokenized sentence object,
|
||||||
|
containing information about original word positions
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
|
boost::shared_ptr<TokenizedSentence> tokenize(const std::string & sentence)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Adds an Example to the index.
|
/*! Adds an Example to the index.
|
||||||
\param example example to be added
|
\param example example to be added
|
||||||
\returns tokenized sentence object,
|
\returns tokenized sentence object,
|
||||||
@ -61,6 +70,16 @@ public:
|
|||||||
boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
|
boost::shared_ptr<TokenizedSentence> addExample(const Example & example)
|
||||||
throw(ConcordiaException);
|
throw(ConcordiaException);
|
||||||
|
|
||||||
|
/*! Adds a tokenized example to the index.
|
||||||
|
\param tokenizedSentence tokenized sentence to be added
|
||||||
|
\param id id of the sentence to be added
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
|
void addTokenizedExample(
|
||||||
|
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
|
||||||
|
SUFFIX_MARKER_TYPE id)
|
||||||
|
throw(ConcordiaException);
|
||||||
|
|
||||||
/*! Adds multiple examples to the index.
|
/*! Adds multiple examples to the index.
|
||||||
\param examples vector of examples to be added
|
\param examples vector of examples to be added
|
||||||
\returns vector of tokenized sentence objects,
|
\returns vector of tokenized sentence objects,
|
||||||
|
@ -82,16 +82,33 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::addExample(
|
|||||||
return hashedPattern;
|
return hashedPattern;
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
|
void ConcordiaIndex::addTokenizedExample(
|
||||||
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
|
||||||
|
SUFFIX_MARKER_TYPE id) {
|
||||||
|
std::ofstream hashedIndexFile;
|
||||||
|
hashedIndexFile.open(_hashedIndexFilePath.c_str(), std::ios::out|
|
||||||
|
std::ios::app|std::ios::binary);
|
||||||
|
std::ofstream markersFile;
|
||||||
|
markersFile.open(_markersFilePath.c_str(), std::ios::out|
|
||||||
|
std::ios::app|std::ios::binary);
|
||||||
|
_addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
|
||||||
|
T, markers, tokenizedSentence, id);
|
||||||
|
hashedIndexFile.close();
|
||||||
|
markersFile.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ConcordiaIndex::_addSingleTokenizedExample(
|
||||||
std::ofstream & hashedIndexFile,
|
std::ofstream & hashedIndexFile,
|
||||||
std::ofstream & markersFile,
|
std::ofstream & markersFile,
|
||||||
boost::shared_ptr<HashGenerator> hashGenerator,
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
boost::shared_ptr<std::vector<sauchar_t> > T,
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
const Example & example) {
|
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
|
||||||
boost::shared_ptr<TokenizedSentence> hashedPattern =
|
SUFFIX_MARKER_TYPE id) {
|
||||||
hashGenerator->generateHash(example.getSentence());
|
std::vector<INDEX_CHARACTER_TYPE> hash = tokenizedSentence->getCodes();
|
||||||
std::vector<INDEX_CHARACTER_TYPE> hash = hashedPattern->getCodes();
|
|
||||||
|
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
|
for (std::vector<INDEX_CHARACTER_TYPE>::iterator it = hash.begin();
|
||||||
@ -102,7 +119,7 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
|
|||||||
|
|
||||||
// append to markersFile
|
// append to markersFile
|
||||||
SUFFIX_MARKER_TYPE marker = Utils::createMarker(
|
SUFFIX_MARKER_TYPE marker = Utils::createMarker(
|
||||||
example.getId(),
|
id,
|
||||||
offset,
|
offset,
|
||||||
hash.size());
|
hash.size());
|
||||||
|
|
||||||
@ -120,7 +137,20 @@ boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
|
|||||||
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
SUFFIX_MARKER_TYPE sentenceBoundaryMA = SUFFIX_MARKER_TYPE_MAX_VALUE;
|
||||||
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
Utils::writeMarker(markersFile, sentenceBoundaryMA);
|
||||||
markers->push_back(sentenceBoundaryMA);
|
markers->push_back(sentenceBoundaryMA);
|
||||||
|
}
|
||||||
|
|
||||||
|
boost::shared_ptr<TokenizedSentence> ConcordiaIndex::_addSingleExample(
|
||||||
|
std::ofstream & hashedIndexFile,
|
||||||
|
std::ofstream & markersFile,
|
||||||
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
const Example & example) {
|
||||||
|
boost::shared_ptr<TokenizedSentence> hashedPattern =
|
||||||
|
hashGenerator->generateHash(example.getSentence());
|
||||||
|
_addSingleTokenizedExample(hashedIndexFile, markersFile, hashGenerator,
|
||||||
|
T, markers, hashedPattern, example.getId());
|
||||||
|
|
||||||
return hashedPattern;
|
return hashedPattern;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -59,6 +59,27 @@ public:
|
|||||||
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
const Example & example);
|
const Example & example);
|
||||||
|
|
||||||
|
/*! Adds a tokenized example to the index. Hashed index
|
||||||
|
and markers array are appended with the example.
|
||||||
|
At the same time, HDD versions of these
|
||||||
|
two data structures are also appended with the same example.
|
||||||
|
The method does not return a value.
|
||||||
|
\param hashGenerator hash generator to be used to prepare the hash
|
||||||
|
of the example
|
||||||
|
\param T RAM-based hash index to be appended to
|
||||||
|
\param markers RAM-based markers array to be appended to
|
||||||
|
\param tokenizedSentence tokenized sentence whose codes
|
||||||
|
are to be added to the index
|
||||||
|
\param id id of the sentence to be added
|
||||||
|
\throws ConcordiaException
|
||||||
|
*/
|
||||||
|
void addTokenizedExample(
|
||||||
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
|
||||||
|
SUFFIX_MARKER_TYPE id);
|
||||||
|
|
||||||
/*! Adds multiple examples to the index. Examples are first hashed using
|
/*! Adds multiple examples to the index. Examples are first hashed using
|
||||||
the hash generator passed to this method. Then, hashed index
|
the hash generator passed to this method. Then, hashed index
|
||||||
and markers array (also passed to this method) are appended
|
and markers array (also passed to this method) are appended
|
||||||
@ -87,6 +108,15 @@ public:
|
|||||||
boost::shared_ptr<std::vector<sauchar_t> > T);
|
boost::shared_ptr<std::vector<sauchar_t> > T);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
void _addSingleTokenizedExample(
|
||||||
|
std::ofstream & hashedIndexFile,
|
||||||
|
std::ofstream & markersFile,
|
||||||
|
boost::shared_ptr<HashGenerator> hashGenerator,
|
||||||
|
boost::shared_ptr<std::vector<sauchar_t> > T,
|
||||||
|
boost::shared_ptr<std::vector<SUFFIX_MARKER_TYPE> > markers,
|
||||||
|
boost::shared_ptr<TokenizedSentence> tokenizedSentence,
|
||||||
|
SUFFIX_MARKER_TYPE id);
|
||||||
|
|
||||||
boost::shared_ptr<TokenizedSentence> _addSingleExample(
|
boost::shared_ptr<TokenizedSentence> _addSingleExample(
|
||||||
std::ofstream & hashedIndexFile,
|
std::ofstream & hashedIndexFile,
|
||||||
std::ofstream & markersFile,
|
std::ofstream & markersFile,
|
||||||
|
@ -287,7 +287,8 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
|
|||||||
concordia.addExample(Example("Alice has a cat", 56));
|
concordia.addExample(Example("Alice has a cat", 56));
|
||||||
concordia.addExample(Example("Alice has a dog", 23));
|
concordia.addExample(Example("Alice has a dog", 23));
|
||||||
concordia.addExample(Example("New test product has a mistake", 321));
|
concordia.addExample(Example("New test product has a mistake", 321));
|
||||||
concordia.addExample(Example("This is just testing and it has nothing to do with the above", 14));
|
boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize("This is just testing and it has nothing to do with the above");
|
||||||
|
concordia.addTokenizedExample(ts, 14);
|
||||||
concordia.refreshSAfromRAM();
|
concordia.refreshSAfromRAM();
|
||||||
|
|
||||||
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
|
boost::shared_ptr<ConcordiaSearchResult> searchResult1 = concordia.concordiaSearch("Our new test product has nothing to do with computers");
|
||||||
@ -324,4 +325,20 @@ BOOST_AUTO_TEST_CASE( ConcordiaSearch2 )
|
|||||||
|
|
||||||
concordia.clearIndex();
|
concordia.clearIndex();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( Tokenize )
|
||||||
|
{
|
||||||
|
Concordia concordia = Concordia(TestResourcesManager::getTestConcordiaConfigFilePath("concordia.cfg"));
|
||||||
|
boost::shared_ptr<TokenizedSentence> ts = concordia.tokenize(" Ala posiada kota");
|
||||||
|
/*
|
||||||
|
0,3 type: 1 value: ala
|
||||||
|
4,11 type: 1 value: posiada
|
||||||
|
12,16 type: 1 value: kota
|
||||||
|
*/
|
||||||
|
BOOST_CHECK_EQUAL(ts->getTokens().size(), 3);
|
||||||
|
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getStart(), 9);
|
||||||
|
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getEnd(), 16);
|
||||||
|
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getType(), 1);
|
||||||
|
BOOST_CHECK_EQUAL(ts->getTokens().at(1).getValue(), "posiada");
|
||||||
|
}
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
Loading…
Reference in New Issue
Block a user