extended markers - length, bitwise operators

Former-commit-id: 948a7fc68bf0b2284ce631d877fc13fa3eaa4882
This commit is contained in:
rjawor 2015-04-09 22:17:19 +02:00
parent fec63e561d
commit 2533fd5b44
15 changed files with 124 additions and 30 deletions

View File

@ -9,17 +9,17 @@ set (CONCORDIA_VERSION_MINOR 1)
# Type of the characters in SA
set (INDEX_CHARACTER_TYPE "unsigned int")
set (INDEX_CHARACTER_TYPE_MAX_VALUE 4294967295)
set (INDEX_CHARACTER_TYPE_MAX_VALUE "ULONG_MAX")
# The above allows for (roughly) 2^32 = 4 294 967 296 words in the corpus.
# Suffix markers
set (SUFFIX_MARKER_TYPE "unsigned int")
set (SUFFIX_MARKER_TYPE_MAX_VALUE 4294967295)
set (SUFFIX_MARKER_DIVISOR 256)
# The above settings assign 3 bytes to sentence id and 1 byte for suffix offset.
# This allows to store 2^24 = 16 777 216 sentences no longer than 256 words.
set (SUFFIX_MARKER_TYPE "unsigned long")
set (SUFFIX_MARKER_TYPE_MAX_VALUE "ULLONG_MAX")
set (SUFFIX_MARKER_SENTENCE_BYTES 2)
# The above settings assign 4 bytes to the sentence id and 2 bytes each to the suffix offset and the sentence length.
# This allows storing 2^32 = 4 294 967 296 sentences, each no longer than 65536 words.
# ============================== #
# =============================== #
# Production paths
# ============================== #

View File

@ -2,6 +2,8 @@ DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy)
DONE 2. anonimizacja zdań
DONE 3. Dzielenie zdań (max 255 tokenów)
- concordia-server
- zastanowić się nad empty hash examples (rozwiązanie: w ogóle nie szukać fraz o pustym hashu, rzucać wyjątek).
- wyłączyć stopWords
@ -13,4 +15,8 @@ zastanowić się nad optymalizacją:
- unordered_map tmMatchesMap
- LCP array
Anubis search się komplikuje! Przy tworzeniu obiektu tmMatches dla przykładu trzeba podać id przykładu, długość patternu i długość przykładu. Dwa pierwsze mamy, ale niestety nie ma skąd wziąć długości przykładu. Pamiętamy tylko offset sufiksu.
DONE Anubis search się komplikuje! Przy tworzeniu obiektu tmMatches dla przykładu trzeba podać id przykładu, długość patternu i długość przykładu. Dwa pierwsze mamy, ale niestety nie ma skąd wziąć długości przykładu. Pamiętamy tylko offset sufiksu.
DONE 1. Bitwise operators (i stałe!) przy rozmiarze index character oraz markerów
2. Wykonać anubis search na nowych markerach z długością zdania
3. Multi-threading?

View File

@ -117,7 +117,7 @@ void AnubisSearcher::_collectResults(
saidx_t resultPos = SA->at(left + i);
SUFFIX_MARKER_TYPE marker = markers->at(resultPos);
result.push_back(new SubstringOccurence(
marker / SUFFIX_MARKER_DIVISOR,
marker % SUFFIX_MARKER_DIVISOR));
Utils::getIdFromMarker(marker),
Utils::getOffsetFromMarker(marker)));
}
}

View File

@ -23,8 +23,9 @@ typedef @SUFFIX_MARKER_TYPE@ SUFFIX_MARKER_TYPE;
#define SUFFIX_MARKER_TYPE_MAX_VALUE @SUFFIX_MARKER_TYPE_MAX_VALUE@
#define SUFFIX_MARKER_DIVISOR @SUFFIX_MARKER_DIVISOR@
//Max sentence size is determined by suffix marker divisor.
//The last bits in a sentence marker denote offset whose maximum value
//is the sentence size minus 2.
#define MAX_SENTENCE_SIZE @SUFFIX_MARKER_DIVISOR@
#define SUFFIX_MARKER_SENTENCE_BYTES @SUFFIX_MARKER_SENTENCE_BYTES@
//Max sentence size is determined by the SUFFIX_MARKER_SENTENCE_BYTES property.
//The sentence marker is built as follows: its first bytes store the
// sentence id. The next SUFFIX_MARKER_SENTENCE_BYTES bytes store the suffix
// offset and the last SUFFIX_MARKER_SENTENCE_BYTES bytes store the sentence length.

View File

@ -1,4 +1,5 @@
#include "concordia/common/utils.hpp"
#include <math.h>
Utils::Utils() {
}
@ -71,3 +72,43 @@ void Utils::_insertCharToSaucharArray(sauchar_t * array,
}
}
// Extracts the sentence id from a packed suffix marker.
// The id sits above the offset and length fields, so dropping those two
// trailing fields leaves the id in the low bits of the result.
SUFFIX_MARKER_TYPE Utils::getIdFromMarker(SUFFIX_MARKER_TYPE marker) {
    // offset + length together occupy 2 * SUFFIX_MARKER_SENTENCE_BYTES bytes,
    // i.e. SUFFIX_MARKER_SENTENCE_BYTES * 16 bits
    const int trailingBits = SUFFIX_MARKER_SENTENCE_BYTES * 16;
    return marker >> trailingBits;
}
// Extracts the suffix offset (the middle field) from a packed suffix marker.
SUFFIX_MARKER_TYPE Utils::getOffsetFromMarker(SUFFIX_MARKER_TYPE marker) {
    const int idBits = _idBytes * 8;
    const int fieldBits = SUFFIX_MARKER_SENTENCE_BYTES * 8;
    // push the id bits out through the top of the value
    const SUFFIX_MARKER_TYPE withoutId = marker << idBits;
    // undo that shift and move one field further to discard the length
    return withoutId >> (idBits + fieldBits);
}
// Extracts the sentence length (the last field) from a packed suffix marker.
SUFFIX_MARKER_TYPE Utils::getLengthFromMarker(SUFFIX_MARKER_TYPE marker) {
    // number of bits taken by the id and the offset, which precede the length
    const int leadingBits = (_idBytes + SUFFIX_MARKER_SENTENCE_BYTES) * 8;
    // move the length to the top, discarding id and offset...
    const SUFFIX_MARKER_TYPE lengthOnTop = marker << leadingBits;
    // ...and bring it back down to the low bits
    return lengthOnTop >> leadingBits;
}
// Packs a sentence id, a suffix offset and a sentence length into one marker.
// Layout, from the most significant end: id | offset | length, where offset
// and length each take SUFFIX_MARKER_SENTENCE_BYTES bytes.
// The parts are added without masking, so an offset or length that does not
// fit in its field carries into the neighbouring field.
SUFFIX_MARKER_TYPE Utils::createMarker(SUFFIX_MARKER_TYPE id,
                                       SUFFIX_MARKER_TYPE offset,
                                       SUFFIX_MARKER_TYPE length) {
    const int offsetShift = SUFFIX_MARKER_SENTENCE_BYTES * 8;
    const int idShift = SUFFIX_MARKER_SENTENCE_BYTES * 16;
    // the id goes above both trailing fields
    const SUFFIX_MARKER_TYPE idPart = id << idShift;
    // the offset goes above the length field only
    const SUFFIX_MARKER_TYPE offsetPart = offset << offsetShift;
    // the length stays in the lowest bits, unshifted
    return idPart + offsetPart + length;
}
// Sentence-size limit: 2^(8 * SUFFIX_MARKER_SENTENCE_BYTES).
// Computed with an integer shift instead of floating-point pow(), so the
// value is exact and is a constant initializer rather than a run-time
// double-to-integer conversion.
SUFFIX_MARKER_TYPE Utils::maxSentenceSize =
    static_cast<SUFFIX_MARKER_TYPE>(1) << (SUFFIX_MARKER_SENTENCE_BYTES * 8);

// Number of marker bytes left for the sentence id once the offset and the
// length fields (SUFFIX_MARKER_SENTENCE_BYTES each) have been reserved.
int Utils::_idBytes = static_cast<int>(sizeof(SUFFIX_MARKER_TYPE)) -
                      2 * SUFFIX_MARKER_SENTENCE_BYTES;

View File

@ -45,9 +45,23 @@ public:
template <typename T>
static void printVector(boost::shared_ptr<std::vector<T> > vector);
static SUFFIX_MARKER_TYPE getIdFromMarker(SUFFIX_MARKER_TYPE marker);
static SUFFIX_MARKER_TYPE getOffsetFromMarker(SUFFIX_MARKER_TYPE marker);
static SUFFIX_MARKER_TYPE getLengthFromMarker(SUFFIX_MARKER_TYPE marker);
static SUFFIX_MARKER_TYPE createMarker(SUFFIX_MARKER_TYPE id,
SUFFIX_MARKER_TYPE offset,
SUFFIX_MARKER_TYPE length);
static SUFFIX_MARKER_TYPE maxSentenceSize;
private:
static void _insertCharToSaucharArray(sauchar_t * array,
INDEX_CHARACTER_TYPE character, int pos);
static int _idBytes;
};
template <typename T>

View File

@ -5,6 +5,7 @@
#include <boost/filesystem.hpp>
#include <boost/foreach.hpp>
#include <iostream>
#include <climits>
ConcordiaIndex::ConcordiaIndex(const string & hashedIndexFilePath,
const string & markersFilePath)
@ -91,8 +92,10 @@ void ConcordiaIndex::_addSingleExample(
// append to markersFile
SUFFIX_MARKER_TYPE marker = offset;
marker += example.getId() * SUFFIX_MARKER_DIVISOR;
SUFFIX_MARKER_TYPE marker = Utils::createMarker(
example.getId(),
offset,
hash->size());
Utils::writeMarker(markersFile, marker);
markers->push_back(marker);

View File

@ -1,5 +1,6 @@
#include "concordia/example.hpp"
#include <climits>
#include <iostream>
Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
throw(ConcordiaException):
@ -7,7 +8,7 @@ Example::Example(const string & sentence, const SUFFIX_MARKER_TYPE & id)
_id(id) {
// check if the example id exceeds space
// reserved for it in the suffix marker
if (id >= (SUFFIX_MARKER_TYPE_MAX_VALUE+1) / SUFFIX_MARKER_DIVISOR) {
if (id >= SUFFIX_MARKER_TYPE_MAX_VALUE >> 8) {
throw ConcordiaException("Example id too large.");
}
}

View File

@ -1,4 +1,5 @@
#include "concordia/hash_generator.hpp"
#include "concordia/common/utils.hpp"
#include <boost/filesystem.hpp>
#include <boost/archive/binary_oarchive.hpp>
@ -30,7 +31,7 @@ boost::shared_ptr<vector<INDEX_CHARACTER_TYPE> > HashGenerator::generateHash(
result(new vector<INDEX_CHARACTER_TYPE>());
boost::shared_ptr<vector<string> > tokenTexts =
generateTokenVector(sentence);
if (tokenTexts->size() > MAX_SENTENCE_SIZE) {
if (tokenTexts->size() > Utils::maxSentenceSize) {
throw ConcordiaException("Trying to add too long sentence.");
}
for (vector<string>::iterator it = tokenTexts->begin();

View File

@ -40,11 +40,9 @@ boost::ptr_vector<SubstringOccurence> IndexSearcher::simpleSearch(
saidx_t actualResultPos = resultPos / sizeof(INDEX_CHARACTER_TYPE);
SUFFIX_MARKER_TYPE marker = markers->at(actualResultPos);
// TODO(rafalj): think about using bitwise operators
// in the below code
result.push_back(new SubstringOccurence(
marker / SUFFIX_MARKER_DIVISOR,
marker % SUFFIX_MARKER_DIVISOR));
Utils::getIdFromMarker(marker),
Utils::getOffsetFromMarker(marker)));
}
}

View File

@ -1,6 +1,7 @@
#include "tests/unit-tests/unit_tests_globals.hpp"
#include "concordia/anubis_searcher.hpp"
#include "concordia/common/config.hpp"
#include "concordia/common/utils.hpp"
using namespace std;
@ -27,9 +28,8 @@ BOOST_AUTO_TEST_CASE( LcpSearch1 )
T->push_back(3);
T->push_back(2);
SUFFIX_MARKER_TYPE marker = 34 * SUFFIX_MARKER_DIVISOR;
for(int i=0;i<6;i++) {
markers->push_back(marker++);
markers->push_back(Utils::createMarker(34,i,6));
}
pattern->push_back(2);

View File

@ -1,6 +1,7 @@
#include <boost/algorithm/string/predicate.hpp>
#include "tests/unit-tests/unit_tests_globals.hpp"
#include <string>
#include <climits>
#include "concordia/example.hpp"
@ -10,19 +11,19 @@ BOOST_AUTO_TEST_SUITE(exampleTest)
BOOST_AUTO_TEST_CASE( ExceedingId )
{
Example example1("Test", 16777215);
unsigned long maxId = (ULLONG_MAX >> 8) - 1;
Example example1("Test", maxId);
bool exceptionThrown = false;
string message = "";
try {
Example example2("Test", 16777216);
Example example2("Test", maxId+1);
} catch (ConcordiaException & e) {
exceptionThrown = true;
message = e.what();
}
BOOST_CHECK_EQUAL(exceptionThrown, true);
BOOST_CHECK_EQUAL(boost::starts_with(message, "Example id too large"), true);
}

View File

@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE( TooLongHashTest )
HashGenerator hashGenerator = HashGenerator(config);
stringstream ss;
for (int i=0;i<257;i++) {
for (int i=0;i<65537;i++) {
ss << "xx" << i << " ";
}

View File

@ -85,6 +85,33 @@ BOOST_AUTO_TEST_CASE( IndexVectorToSaucharVector )
BOOST_CHECK_EQUAL_COLLECTIONS(result->begin(), result->end(), expected->begin(), expected->end());
}
// Checks the sentence-size limit exposed by Utils against the value 65536.
BOOST_AUTO_TEST_CASE( MaxSentenceSize )
{
    const SUFFIX_MARKER_TYPE expectedLimit = 65536;
    BOOST_CHECK_EQUAL(Utils::maxSentenceSize, expectedLimit);
}
//The below examples use the following marker:
//00000000|00000000|00000000|00000011|00000000|00000101|00000000|00000111
// Packing id = 3, offset = 5, length = 7 must yield the reference marker.
BOOST_AUTO_TEST_CASE( CreateMarker )
{
    const SUFFIX_MARKER_TYPE expectedMarker = 12885229575;
    const SUFFIX_MARKER_TYPE actualMarker = Utils::createMarker(3, 5, 7);
    BOOST_CHECK_EQUAL(actualMarker, expectedMarker);
}
// The reference marker must decode to sentence id 3.
BOOST_AUTO_TEST_CASE( GetIdFromMarker )
{
    const SUFFIX_MARKER_TYPE marker = 12885229575;
    const SUFFIX_MARKER_TYPE expectedId = 3;
    BOOST_CHECK_EQUAL(Utils::getIdFromMarker(marker), expectedId);
}
// The reference marker must decode to suffix offset 5.
BOOST_AUTO_TEST_CASE( GetOffsetFromMarker )
{
    const SUFFIX_MARKER_TYPE marker = 12885229575;
    const SUFFIX_MARKER_TYPE expectedOffset = 5;
    BOOST_CHECK_EQUAL(Utils::getOffsetFromMarker(marker), expectedOffset);
}
// The reference marker must decode to sentence length 7.
BOOST_AUTO_TEST_CASE( GetLengthFromMarker )
{
    const SUFFIX_MARKER_TYPE marker = 12885229575;
    const SUFFIX_MARKER_TYPE expectedLength = 7;
    BOOST_CHECK_EQUAL(Utils::getLengthFromMarker(marker), expectedLength);
}
BOOST_AUTO_TEST_SUITE_END()

View File

@ -1,4 +1,5 @@
#include "concordia/word_map.hpp"
#include <climits>
WordMap::WordMap() throw(ConcordiaException) {