extended markers - length, bitwise operators
Former-commit-id: 948a7fc68bf0b2284ce631d877fc13fa3eaa4882
This commit is contained in:
parent
fec63e561d
commit
2533fd5b44
@ -9,17 +9,17 @@ set (CONCORDIA_VERSION_MINOR 1)
|
|||||||
# Type of the characters in SA
|
# Type of the characters in SA
|
||||||
|
|
||||||
set (INDEX_CHARACTER_TYPE "unsigned int")
|
set (INDEX_CHARACTER_TYPE "unsigned int")
|
||||||
set (INDEX_CHARACTER_TYPE_MAX_VALUE 4294967295)
|
set (INDEX_CHARACTER_TYPE_MAX_VALUE "ULONG_MAX")
|
||||||
# The above allows for (roughly) 2^32 = 4 294 967 296 words in corpus.
|
# The above allows for (roughly) 2^32 = 4 294 967 296 words in corpus.
|
||||||
|
|
||||||
# Suffix markers
|
# Suffix markers
|
||||||
set (SUFFIX_MARKER_TYPE "unsigned int")
|
set (SUFFIX_MARKER_TYPE "unsigned long")
|
||||||
set (SUFFIX_MARKER_TYPE_MAX_VALUE 4294967295)
|
set (SUFFIX_MARKER_TYPE_MAX_VALUE "ULLONG_MAX")
|
||||||
set (SUFFIX_MARKER_DIVISOR 256)
|
set (SUFFIX_MARKER_SENTENCE_BYTES 2)
|
||||||
# The above settings assign 3 bytes to sentence id and 1 byte for suffix offset.
|
# The above settings assign 4 bytes to sentence id and 2 bytes each for suffix offset and sentence length.
|
||||||
# This allows to store 2^24 = 16 777 216 sentences no longer than 256 words.
|
# This allows storing 2^32 = 4 294 967 296 sentences no longer than 65536 words.
|
||||||
|
|
||||||
# ============================== #
|
# =============================== #
|
||||||
# Production paths
|
# Production paths
|
||||||
# ============================== #
|
# ============================== #
|
||||||
|
|
||||||
|
8
TODO.txt
8
TODO.txt
@ -2,6 +2,8 @@ DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)
|
|||||||
DONE 2. anonimizacja zdań
|
DONE 2. anonimizacja zdań
|
||||||
DONE 3. Dzielenie zdań (max 255 tokenów)
|
DONE 3. Dzielenie zdań (max 255 tokenów)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
- concordia-server
|
- concordia-server
|
||||||
- zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek).
|
- zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek).
|
||||||
- wyłączyć stopWords
|
- wyłączyć stopWords
|
||||||
@ -13,4 +15,8 @@ zastanowić się nad optymalizacją:
|
|||||||
- unordered_map tmMatchesMap
|
- unordered_map tmMatchesMap
|
||||||
- LCP array
|
- LCP array
|
||||||
|
|
||||||
Anubis search się komplikuje! Przy tworzeniu obiektu tmMatches dla przykładu trzeba podać id przykładu, długość patternu i długość przykładu. Dwa pierwsze mamy, ale niestety nie ma skąd wziąć długości przykładu. Pamiętamy tylko offset sufiksu.
|
DONE Anubis search się komplikuje! Przy tworzeniu obiektu tmMatches dla przykładu trzeba podać id przykładu, długość patternu i długość przykładu. Dwa pierwsze mamy, ale niestety nie ma skąd wziąć długości przykładu. Pamiętamy tylko offset sufiksu.
|
||||||
|
|
||||||
|
DONE 1. Bitwise operators (i stałe!) przy rozmiarze index character oraz markerów
|
||||||
|
2. Wykonać anubis search na nowych markerach z długością zdania
|
||||||
|
3. Multi-threading?
|
||||||
|
@ -117,7 +117,7 @@ void AnubisSearcher::_collectResults(
|
|||||||
saidx_t resultPos = SA->at(left + i);
|
saidx_t resultPos = SA->at(left + i);
|
||||||
SUFFIX_MARKER_TYPE marker = markers->at(resultPos);
|
SUFFIX_MARKER_TYPE marker = markers->at(resultPos);
|
||||||
result.push_back(new SubstringOccurence(
|
result.push_back(new SubstringOccurence(
|
||||||
marker / SUFFIX_MARKER_DIVISOR,
|
Utils::getIdFromMarker(marker),
|
||||||
marker % SUFFIX_MARKER_DIVISOR));
|
Utils::getOffsetFromMarker(marker)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -23,8 +23,9 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
|
|||||||
#define SUFFIX_MARKER_TYPE_MAX_VALUE @SUFFIX_MARKER_TYPE_MAX_VALUE@
|
#define SUFFIX_MARKER_TYPE_MAX_VALUE @SUFFIX_MARKER_TYPE_MAX_VALUE@
|
||||||
|
|
||||||
|
|
||||||
#define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@
|
#define SUFFIX_MARKER_SENTENCE_BYTES @SUFFIX_MARKER_SENTENCE_BYTES@
|
||||||
//Max sentence size is determined by suffix marker divisor.
|
//Max sentence size is determined by the SUFFIX_MARKER_SENTENCE_BYTES property.
|
||||||
//The last bits in a sentence marker denote offset whose maximum value
|
//The sentence marker is built as follows: its first bytes store the
|
||||||
//is the sentence size minus 2.
|
// sentence id. Next, SUFFIX_MARKER_SENTENCE_BYTES bytes store the suffix offset
|
||||||
#define MAX_SENTENCE_SIZE @SUFFIX_MARKER_DIVISOR@
|
// and the last SUFFIX_MARKER_SENTENCE_BYTES bytes store the sentence length.
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#include "concordia/common/utils.hpp"
|
#include "concordia/common/utils.hpp"
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
Utils::Utils() {
|
Utils::Utils() {
|
||||||
}
|
}
|
||||||
@ -71,3 +72,43 @@ void Utils::_insertCharToSaucharArray(sauchar_t * array,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE Utils::getIdFromMarker(SUFFIX_MARKER_TYPE marker) {
|
||||||
|
// shift right to erase offset and length
|
||||||
|
return marker >> SUFFIX_MARKER_SENTENCE_BYTES * 16;
|
||||||
|
}
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE Utils::getOffsetFromMarker(SUFFIX_MARKER_TYPE marker) {
|
||||||
|
// shift left to erase id
|
||||||
|
SUFFIX_MARKER_TYPE result = marker << _idBytes * 8;
|
||||||
|
// shift back right and go further to erase length
|
||||||
|
result = result >> (_idBytes * 8 + SUFFIX_MARKER_SENTENCE_BYTES * 8);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE Utils::getLengthFromMarker(SUFFIX_MARKER_TYPE marker) {
|
||||||
|
// shift left to erase id and offset
|
||||||
|
SUFFIX_MARKER_TYPE result = marker <<
|
||||||
|
(_idBytes * 8 + SUFFIX_MARKER_SENTENCE_BYTES * 8);
|
||||||
|
// shift back
|
||||||
|
return result >> (_idBytes * 8 + SUFFIX_MARKER_SENTENCE_BYTES * 8);
|
||||||
|
}
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE Utils::createMarker(SUFFIX_MARKER_TYPE id,
|
||||||
|
SUFFIX_MARKER_TYPE offset,
|
||||||
|
SUFFIX_MARKER_TYPE length) {
|
||||||
|
// shift twice by SUFFIX_MARKER_SENTENCE_BYTES
|
||||||
|
SUFFIX_MARKER_TYPE result = id << SUFFIX_MARKER_SENTENCE_BYTES * 16;
|
||||||
|
// shift once by SUFFIX_MARKER_SENTENCE_BYTES
|
||||||
|
result += offset << SUFFIX_MARKER_SENTENCE_BYTES * 8;
|
||||||
|
// no shift at all
|
||||||
|
result += length;
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
SUFFIX_MARKER_TYPE Utils::maxSentenceSize =
|
||||||
|
pow(2, SUFFIX_MARKER_SENTENCE_BYTES*8);
|
||||||
|
|
||||||
|
int Utils::_idBytes = sizeof(SUFFIX_MARKER_TYPE) -
|
||||||
|
2 * SUFFIX_MARKER_SENTENCE_BYTES;
|
||||||
|
|
||||||
|
@ -45,9 +45,23 @@ public:
|
|||||||
template <typename T>
|
template <typename T>
|
||||||
static void printVector(boost::shared_ptr<std::vector<T> > vector);
|
static void printVector(boost::shared_ptr<std::vector<T> > vector);
|
||||||
|
|
||||||
|
static SUFFIX_MARKER_TYPE getIdFromMarker(SUFFIX_MARKER_TYPE marker);
|
||||||
|
|
||||||
|
static SUFFIX_MARKER_TYPE getOffsetFromMarker(SUFFIX_MARKER_TYPE marker);
|
||||||
|
|
||||||
|
static SUFFIX_MARKER_TYPE getLengthFromMarker(SUFFIX_MARKER_TYPE marker);
|
||||||
|
|
||||||
|
static SUFFIX_MARKER_TYPE createMarker(SUFFIX_MARKER_TYPE id,
|
||||||
|
SUFFIX_MARKER_TYPE offset,
|
||||||
|
SUFFIX_MARKER_TYPE length);
|
||||||
|
|
||||||
|
static SUFFIX_MARKER_TYPE maxSentenceSize;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static void _insertCharToSaucharArray(sauchar_t * array,
|
static void _insertCharToSaucharArray(sauchar_t * array,
|
||||||
INDEX_CHARACTER_TYPE character, int pos);
|
INDEX_CHARACTER_TYPE character, int pos);
|
||||||
|
|
||||||
|
static int _idBytes;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -5,6 +5,7 @@
|
|||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
#include <boost/foreach.hpp>
|
#include <boost/foreach.hpp>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <climits>
|
||||||
|
|
||||||
ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
|
ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
|
||||||
const string & markersFilePath)
|
const string & markersFilePath)
|
||||||
@ -91,8 +92,10 @@ void ConcordiaIndex::_addSingleExample(
|
|||||||
|
|
||||||
// append to markersFile
|
// append to markersFile
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE marker = offset;
|
SUFFIX_MARKER_TYPE marker = Utils::createMarker(
|
||||||
marker += example.getId() * SUFFIX_MARKER_DIVISOR;
|
example.getId(),
|
||||||
|
offset,
|
||||||
|
hash->size());
|
||||||
|
|
||||||
Utils::writeMarker(markersFile, marker);
|
Utils::writeMarker(markersFile, marker);
|
||||||
markers->push_back(marker);
|
markers->push_back(marker);
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
#include "concordia/example.hpp"
|
#include "concordia/example.hpp"
|
||||||
|
#include <climits>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
|
Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
|
||||||
throw(ConcordiaException):
|
throw(ConcordiaException):
|
||||||
@ -7,7 +8,7 @@ Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
|
|||||||
_id(id) {
|
_id(id) {
|
||||||
// check if the example id exceeds space
|
// check if the example id exceeds space
|
||||||
// reserved for it in the suffix marker
|
// reserved for it in the suffix marker
|
||||||
if (id >= (SUFFIX_MARKER_TYPE_MAX_VALUE+1) / SUFFIX_MARKER_DIVISOR) {
|
if (id >= SUFFIX_MARKER_TYPE_MAX_VALUE >> 8) {
|
||||||
throw ConcordiaException("Example id too large.");
|
throw ConcordiaException("Example id too large.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#include "concordia/hash_generator.hpp"
|
#include "concordia/hash_generator.hpp"
|
||||||
|
#include "concordia/common/utils.hpp"
|
||||||
|
|
||||||
#include <boost/filesystem.hpp>
|
#include <boost/filesystem.hpp>
|
||||||
#include <boost/archive/binary_oarchive.hpp>
|
#include <boost/archive/binary_oarchive.hpp>
|
||||||
@ -30,7 +31,7 @@ boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
|
|||||||
result(new vector<INDEX_CHARACTER_TYPE>());
|
result(new vector<INDEX_CHARACTER_TYPE>());
|
||||||
boost::shared_ptr<vector<string> > tokenTexts =
|
boost::shared_ptr<vector<string> > tokenTexts =
|
||||||
generateTokenVector(sentence);
|
generateTokenVector(sentence);
|
||||||
if (tokenTexts->size() > MAX_SENTENCE_SIZE) {
|
if (tokenTexts->size() > Utils::maxSentenceSize) {
|
||||||
throw ConcordiaException("Trying to add too long sentence.");
|
throw ConcordiaException("Trying to add too long sentence.");
|
||||||
}
|
}
|
||||||
for (vector<string>::iterator it = tokenTexts->begin();
|
for (vector<string>::iterator it = tokenTexts->begin();
|
||||||
|
@ -40,11 +40,9 @@ boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
|
|||||||
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
|
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
|
||||||
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
|
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
|
||||||
|
|
||||||
// TODO(rafalj): think about using bitwise operators
|
|
||||||
// in the below code
|
|
||||||
result.push_back(new SubstringOccurence(
|
result.push_back(new SubstringOccurence(
|
||||||
marker / SUFFIX_MARKER_DIVISOR,
|
Utils::getIdFromMarker(marker),
|
||||||
marker % SUFFIX_MARKER_DIVISOR));
|
Utils::getOffsetFromMarker(marker)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
#include "concordia/anubis_searcher.hpp"
|
#include "concordia/anubis_searcher.hpp"
|
||||||
#include "concordia/common/config.hpp"
|
#include "concordia/common/config.hpp"
|
||||||
|
#include "concordia/common/utils.hpp"
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
@ -27,9 +28,8 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
|
|||||||
T->push_back(3);
|
T->push_back(3);
|
||||||
T->push_back(2);
|
T->push_back(2);
|
||||||
|
|
||||||
SUFFIX_MARKER_TYPE marker = 34 * SUFFIX_MARKER_DIVISOR;
|
|
||||||
for(int i=0;i<6;i++) {
|
for(int i=0;i<6;i++) {
|
||||||
markers->push_back(marker++);
|
markers->push_back(Utils::createMarker(34,i,6));
|
||||||
}
|
}
|
||||||
|
|
||||||
pattern->push_back(2);
|
pattern->push_back(2);
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#include <boost/algorithm/string/predicate.hpp>
|
#include <boost/algorithm/string/predicate.hpp>
|
||||||
#include "tests/unit-tests/unit_tests_globals.hpp"
|
#include "tests/unit-tests/unit_tests_globals.hpp"
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <climits>
|
||||||
|
|
||||||
#include "concordia/example.hpp"
|
#include "concordia/example.hpp"
|
||||||
|
|
||||||
@ -10,19 +11,19 @@ BOOST_AUTO_TEST_SUITE(exampleTest)
|
|||||||
|
|
||||||
BOOST_AUTO_TEST_CASE( ExceedingId )
|
BOOST_AUTO_TEST_CASE( ExceedingId )
|
||||||
{
|
{
|
||||||
Example example1("Test", 16777215);
|
unsigned long maxId = (ULLONG_MAX >> 8) - 1;
|
||||||
|
Example example1("Test", maxId);
|
||||||
|
|
||||||
bool exceptionThrown = false;
|
bool exceptionThrown = false;
|
||||||
string message = "";
|
string message = "";
|
||||||
try {
|
try {
|
||||||
Example example2("Test", 16777216);
|
Example example2("Test", maxId+1);
|
||||||
} catch (ConcordiaException & e) {
|
} catch (ConcordiaException & e) {
|
||||||
exceptionThrown = true;
|
exceptionThrown = true;
|
||||||
message = e.what();
|
message = e.what();
|
||||||
}
|
}
|
||||||
BOOST_CHECK_EQUAL(exceptionThrown, true);
|
BOOST_CHECK_EQUAL(exceptionThrown, true);
|
||||||
BOOST_CHECK_EQUAL(boost::starts_with(message, "Example id too large"), true);
|
BOOST_CHECK_EQUAL(boost::starts_with(message, "Example id too large"), true);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest )
|
|||||||
HashGenerator hashGenerator = HashGenerator(config);
|
HashGenerator hashGenerator = HashGenerator(config);
|
||||||
|
|
||||||
stringstream ss;
|
stringstream ss;
|
||||||
for (int i=0;i<257;i++) {
|
for (int i=0;i<65537;i++) {
|
||||||
ss << "xx" << i << " ";
|
ss << "xx" << i << " ";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -85,6 +85,33 @@ BOOST_AUTO_TEST_CASE( IndexVectorToSaucharVector )
|
|||||||
BOOST_CHECK_EQUAL_COLLECTIONS(result->begin(), result->end(), expected->begin(), expected->end());
|
BOOST_CHECK_EQUAL_COLLECTIONS(result->begin(), result->end(), expected->begin(), expected->end());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( MaxSentenceSize )
|
||||||
|
{
|
||||||
|
BOOST_CHECK_EQUAL(Utils::maxSentenceSize, 65536);
|
||||||
|
}
|
||||||
|
|
||||||
|
//The below examples use the following marker:
|
||||||
|
//00000000|00000000|00000000|00000011|00000000|00000101|00000000|00000111
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( CreateMarker )
|
||||||
|
{
|
||||||
|
BOOST_CHECK_EQUAL(Utils::createMarker(3,5,7), 12885229575);
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( GetIdFromMarker )
|
||||||
|
{
|
||||||
|
BOOST_CHECK_EQUAL(Utils::getIdFromMarker(12885229575), 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( GetOffsetFromMarker )
|
||||||
|
{
|
||||||
|
BOOST_CHECK_EQUAL(Utils::getOffsetFromMarker(12885229575), 5);
|
||||||
|
}
|
||||||
|
|
||||||
|
BOOST_AUTO_TEST_CASE( GetLengthFromMarker )
|
||||||
|
{
|
||||||
|
BOOST_CHECK_EQUAL(Utils::getLengthFromMarker(12885229575), 7);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
BOOST_AUTO_TEST_SUITE_END()
|
BOOST_AUTO_TEST_SUITE_END()
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#include "concordia/word_map.hpp"
|
#include "concordia/word_map.hpp"
|
||||||
|
#include <climits>
|
||||||
|
|
||||||
|
|
||||||
WordMap::WordMap() throw(ConcordiaException) {
|
WordMap::WordMap() throw(ConcordiaException) {
|
||||||
|
Loading…
Reference in New Issue
Block a user