diff --git a/TODO.txt b/TODO.txt index aedde3a..0d5bd3e 100644 --- a/TODO.txt +++ b/TODO.txt @@ -1,5 +1,4 @@ ---------------------------- Developer's private notes (language may vary, bo tak czasem wygodniej) ----------------------------- -- concordia helper - aplikacja windowsowa, która wyszukuje w Concordii zdanie zaznaczone w dowolnej aplikacji: https://www.jayway.com/2013/02/06/how-to-get-selected-text-from-another-windows-program/ - document analysis - concordia score should be calculated for a document. Idea - for each fragment: (fragment length/document length) * log penalty at sentence level. - multiple indexes based on different hashes. One can be word-net base forms, other - pos-tags and so on. Develop a method of combining results. It may be a way to implement lemmatization. IN PROGRESS - document the code (classes, cfg files) and update tutorial @@ -10,6 +9,7 @@ IN PROGRESS - document the code (classes, cfg files) and update tutorial ---------------------------- Archive ----------------------------- +DONE - concordia helper - aplikacja windowsowa, która wyszukuje w Concordii zdanie zaznaczone w dowolnej aplikacji: https://www.jayway.com/2013/02/06/how-to-get-selected-text-from-another-windows-program/ DONE - rethink passing variables such as TokenizedSentence by smart pointers. Consider using references in getters. DONE - moving/extending concordia matches on demand - powered by concordia-server DONE - testy zużycia pamięci @@ -36,7 +36,7 @@ DONE - wyłączyć stopWords DONE - Przy concordia searCh dodatkowo obliczany ma być zestaw optymalnego pokrycia patternu. Może siłowo? (jeśli przyjąć max dł. zdania 500 tokenów, to nie powinno być źle) -DONE - wyszukiwanie zdania: wyszukanie najdłuższych pasujących fragmentów Anubisem, 1D (approximate) bin packing. Nazwijmy to concordia search. Wyszukiwane są wszystkie najdłuższe dopasowania patternu dzięki LCP search. Zwracany jest wynik w postaci listy najdłuższych dopasowanych fragmentów, posortowanych malejąco po długości, z maksymalnie 3 przedstawicielami każdej długości. +DONE - wyszukiwanie zdania: wyszukanie najdłuższych pasujących fragmentów Anubisem, 1D (approximate) bin packing. Nazwijmy to concordia search. Wyszukiwane są wszystkie najdłuższe dopasowania patternu dzięki LCP search. Zwracany jest wynik w postaci listy najdłuższych dopasowanych fragmentów, posortowanych malejąco po długości, z maksymalnie 3 przedstawicielami każdej długości. DONE 1. lokalizowane to_lower (wykorzystać utf8case, naprawić testy) DONE 2. anonimizacja zdań @@ -55,4 +55,3 @@ DONE - !important! rezygnacja z ptr_vector DONE - zwracanie wektorów DONE - powyrzucać using namespace std DONE - profiling - diff --git a/concordia/CMakeLists.txt b/concordia/CMakeLists.txt index 6f6e246..0b44572 100644 --- a/concordia/CMakeLists.txt +++ b/concordia/CMakeLists.txt @@ -5,62 +5,30 @@ foreach(dir ${ALL_DIRECTORIES}) add_subdirectory(${dir}) endforeach(dir) +file(GLOB main_sources "*.cpp") +file(GLOB common_sources "common/*.cpp") + + add_library(concordia SHARED - token_annotation.cpp - tokenized_sentence.cpp - concordia_search_result.cpp - matched_pattern_fragment.cpp - concordia_searcher.cpp - regex_rule.cpp - sentence_tokenizer.cpp - interval.cpp - tm_matches.cpp - anubis_search_result.cpp - substring_occurence.cpp - example.cpp - index_searcher.cpp - concordia_index.cpp - word_map.cpp - hash_generator.cpp - concordia.cpp - concordia_config.cpp - concordia_exception.cpp - common/logging.cpp - common/utils.cpp - common/text_utils.cpp + ${main_sources} + ${common_sources} ) add_subdirectory(t) # ===================================== install(TARGETS concordia DESTINATION lib/) -install(FILES - token_annotation.hpp - tokenized_sentence.hpp - concordia_search_result.hpp - matched_pattern_fragment.hpp - concordia_searcher.hpp - regex_rule.hpp - sentence_tokenizer.hpp - interval.hpp - tm_matches.hpp - anubis_search_result.hpp - substring_occurence.hpp - example.hpp - index_searcher.hpp - concordia_index.hpp - word_map.hpp - hash_generator.hpp - concordia.hpp - concordia_config.hpp - concordia_exception.hpp + + +file(GLOB main_headers "*.hpp") +file(GLOB common_headers "common/*.hpp") + +install(FILES + ${main_headers} DESTINATION include/concordia/) -install(FILES - common/config.hpp - common/logging.hpp - common/utils.hpp - common/text_utils.hpp +install(FILES + ${common_headers} DESTINATION include/concordia/common/) # ---------------------------------------------------- @@ -75,11 +43,7 @@ if(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE}) link_directories(${LIBCONFIG_LIB}) endif(EXISTS ${LIBCONFIG_LIB} AND EXISTS ${LIBCONFIG_INCLUDE}) -target_link_libraries(concordia config++) -target_link_libraries(concordia log4cpp) -target_link_libraries(concordia ${Boost_LIBRARIES}) -target_link_libraries(concordia divsufsort) -target_link_libraries(concordia utf8case) +target_link_libraries(concordia config++ log4cpp ${Boost_LIBRARIES} divsufsort utf8case ${Boost_LIBRARIES}) if (WITH_RE2) target_link_libraries(concordia re2) @@ -91,4 +55,3 @@ else(WITH_RE2) target_link_libraries(concordia pcrecpp) endif(WITH_PCRE) endif(WITH_RE2) - diff --git a/concordia/t/CMakeLists.txt b/concordia/t/CMakeLists.txt index 1088036..b5ff6a8 100644 --- a/concordia/t/CMakeLists.txt +++ b/concordia/t/CMakeLists.txt @@ -1,22 +1,7 @@ +file(GLOB test_sources "*.cpp") + add_library(concordia-tests - test_hash_generator.cpp - test_regex_rule.cpp - test_tokenized_sentence.cpp - test_concordia_searcher.cpp - test_sentence_tokenizer.cpp - test_text_utils.cpp - test_example.cpp - test_tm_matches.cpp - test_interval.cpp - test_logging.cpp - test_utils.cpp - test_word_map.cpp - test_concordia_index.cpp - test_concordia_config.cpp - test_concordia.cpp - range_based_case_converter_tests.cpp - simple_convert_tests.cpp - special_casing_converter_tests.cpp + ${test_sources} ) target_link_libraries(concordia-tests concordia ${LIBCONFIG_LIB} concordia-tests-common utf8case) diff --git a/scripts/concordia-runner.sh b/scripts/concordia-runner.sh index e7c9b0c..26d49aa 100755 --- a/scripts/concordia-runner.sh +++ b/scripts/concordia-runner.sh @@ -4,8 +4,8 @@ echo "CONCORDIA RUNNER: Running Concordia" rm ../prod/resources/temp/* echo "CONCORDIA RUNNER: reading from file" -../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -r ../prod/resources/text-files/medium.txt +concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -r ../prod/resources/text-files/medium.txt echo "CONCORDIA RUNNER: searching for pattern: \"drawn from his own\"" -../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -s "drawn from his own" +concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -s "drawn from his own" echo "CONCORDIA RUNNER: searching for pattern: \"it is\"" -../build/concordia-console/concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -s "it is" -n +concordia-console -i ../prod/resources/temp/ -c ../prod/resources/concordia-config/concordia.cfg -s "it is" -n