Merge branch 'master' into two-step-concordia-search

commit aa544051dc
Rafał Jaworski, 2019-01-04 14:15:00 +01:00
26 changed files with 142 additions and 23 deletions

View File

@@ -21,6 +21,7 @@ namespace LemmaGenSockets
lemmatizersDict.Add("pl", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Polish));
lemmatizersDict.Add("en", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English));
lemmatizersDict.Add("hr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Serbian));
+lemmatizersDict.Add("fr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.French));
}
public LemmatizerListener()
@@ -52,13 +53,37 @@ namespace LemmaGenSockets
private string lemmatizeWord(string languageCode, string word)
{
-if (word.StartsWith("ne_") || word == "i" || word == "o" || word == "do") // exceptions
+if (word.StartsWith("ne_"))
{
return word;
}
-string[] parts = word.Split(wordInnerSeparator);
+Dictionary<String, HashSet<String>> exceptions = new Dictionary<string, HashSet<string>>();
+HashSet<String> plExceptions = new HashSet<string>();
+plExceptions.Add("i");
+plExceptions.Add("o");
+plExceptions.Add("do");
+exceptions.Add("pl", plExceptions);
+HashSet<String> enExceptions = new HashSet<string>();
+enExceptions.Add("d");
+exceptions.Add("en", enExceptions);
+HashSet<String> languageExceptions;
+if (exceptions.TryGetValue(languageCode, out languageExceptions))
+{
+if(languageExceptions.Contains(word))
+{
+return word;
+}
+}
string result = "";
+string[] parts = word.Split(wordInnerSeparator);
if (parts.Length == 2)
{
string firstPart = parts[0];
@@ -74,7 +99,7 @@ namespace LemmaGenSockets
result = lemmatizersDict[languageCode].Lemmatize(word);
}
-if (result == "" || result.Contains(" "))
+if (result == "")
{
return word;
}
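Taken together, the rework of lemmatizeWord above replaces the hard-coded word exceptions with a per-language lookup ("i", "o", "do" for Polish, "d" for English) that short-circuits lemmatization, while still falling back to the original word when the lemmatizer returns an empty result. A condensed sketch of that logic, written in Python rather than the committed C# purely for illustration (the lemmatize callback stands in for the LemmaSharp call):

# Sketch of the per-language exception lookup introduced above (illustrative only).
EXCEPTIONS = {
    "pl": {"i", "o", "do"},
    "en": {"d"},
}

def lemmatize_word(language_code, word, lemmatize):
    # Named-entity placeholders and per-language exception words bypass lemmatization.
    if word.startswith("ne_") or word in EXCEPTIONS.get(language_code, set()):
        return word
    result = lemmatize(language_code, word)
    # Fall back to the original word when the lemmatizer returns nothing.
    return result if result else word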

View File

@@ -8,3 +8,14 @@ j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Deb
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe.config
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.pdb
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharp.dll
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuilt.dll
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuiltCompact.dll
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\Lzma#.dll
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csproj.CoreCompileInputs.cache
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe
+J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb

View File

@@ -1,7 +1,7 @@
dir@#@jrc_enes
concordia_host@#@concordia.vm.wmi.amu.edu.pl
concordia_port@#@8800
-tmid@#@6
+tmid@#@1
desc@#@Welcome to the interactive Concordia demo. The system finds the longest matches of the pattern sentence in its translation memory. This translation memory is 200 000 sentences taken from English-Spanish corpus of European Law. Please enter an English sentence in the field below and press Enter (or use the search button). This instance of Concordia works best with law sentences, but is very likely to output some results for any English sentence. You can also use predefined samples, simply use the link "show/hide samples" and apply one of the sample sentences. After the search, click on the highlighted fragments to see their context.
enjoy@#@Enjoy your work with the system!
prompt@#@Enter search pattern (English sentence):

View File

@@ -0,0 +1,10 @@
dir@#@pytania_odpowiedzi_logistyka
concordia_host@#@localhost
concordia_port@#@8800
tmid@#@12
desc@#@Wyszukiwarka pytań
enjoy@#@Wybierz przykładowe pytanie:
prompt@#@Wprowadź pytanie (po polsku):
suggestion@#@chciałbym zakupić samochód specjalistyczny
suggestion@#@czy są jakieś zlecenia od spedytorów z terminala?
suggestion@#@potrzebuję oprogramowania do zarządzania korporacją taksówkarską

View File

@@ -1 +0,0 @@
../versions_available/emea_plen.cfg

View File

@@ -1 +0,0 @@
../versions_available/europarl_sample.cfg

View File

@@ -1 +0,0 @@
../versions_available/icd_filtered_plen.cfg

View File

@@ -1 +0,0 @@
../versions_available/icd_plen.cfg

View File

@@ -1 +0,0 @@
../versions_available/logofag_enpl.cfg

View File

@@ -1 +0,0 @@
../versions_available/logofag_plen.cfg

View File

@@ -1 +0,0 @@
../versions_available/opus_medicine_plen.cfg

View File

@@ -1 +0,0 @@
../versions_available/tmrepository_enhr.cfg

View File

@@ -11,10 +11,12 @@ LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
std::string plCode = "pl";
std::string enCode = "en";
std::string hrCode = "hr";
+std::string frCode = "fr";
_lemmatizersMap.insert(plCode, socketLemmatizer1);
_lemmatizersMap.insert(enCode, socketLemmatizer1);
_lemmatizersMap.insert(hrCode, socketLemmatizer1);
+_lemmatizersMap.insert(frCode, socketLemmatizer1);
}
LemmatizerFacade::~LemmatizerFacade() {

View File

@@ -1,6 +1,9 @@
#include "socket_lemmatizer.hpp"
+#include <time.h>
#include "config.hpp"
#include <boost/lexical_cast.hpp>
SocketLemmatizer::SocketLemmatizer(int port) throw(ConcordiaException) :
@@ -79,20 +82,38 @@ bool SocketLemmatizer::_send_data(std::string data)
std::string SocketLemmatizer::_receive(int size=512)
{
char buffer[size];
-std::string reply;
+std::string reply = "";
//Receive a reply from the server
-if(recv(_sock , buffer , sizeof(buffer) , 0) < 0) {
-throw ConcordiaException("Receive failed");
+bool dataAvailable = true;
+while (dataAvailable) {
+int amountReceived = recv(_sock , buffer , sizeof(buffer) , 0);
+if (amountReceived < 0) {
+throw ConcordiaException("Lemmatizer: recv failed");
+} else if (amountReceived == 0) {
+dataAvailable = false;
+} else {
+buffer[amountReceived] = '\0';
+reply += buffer;
+}
+}
}
-reply = buffer;
return reply;
}
std::string SocketLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
+for (int i=0;i<5;i++) {
+try {
_connect();
_send_data(languageCode+sentence+LEMMATIZER_DELIMITER);
std::string reply = _receive(512);
_disconnect();
return reply.substr(0,reply.find(LEMMATIZER_DELIMITER));
+} catch (std::exception & e) {
+_logger.logString("Problem with lemmatization of the sentence", sentence);
+_logger.log("Waiting 2 seconds and retrying...");
+sleep(2);
+}
+}
+throw ConcordiaException("Can not lemmatize sentence: "+sentence);
}
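Both changes to the socket lemmatizer above follow one pattern: _receive now loops on recv until the lemmatizer closes the connection (recv returns 0), and lemmatizeSentence retries the whole connect/send/receive cycle up to five times, sleeping two seconds between failed attempts before giving up. A minimal Python sketch of that pattern, assuming a plain TCP lemmatizer service; the port and delimiter below are placeholders, the real values come from the server configuration and LEMMATIZER_DELIMITER:

import socket
import time

LEMMATIZER_PORT = 5555   # assumption: the actual port is taken from the server config
DELIMITER = "@#@"        # assumption: stands in for LEMMATIZER_DELIMITER

def receive_all(sock, chunk_size=512):
    # Keep reading until the server closes the connection (recv returns b"").
    reply = b""
    while True:
        chunk = sock.recv(chunk_size)
        if not chunk:
            return reply.decode("utf-8")
        reply += chunk

def lemmatize_sentence(language_code, sentence, retries=5):
    # Retry the whole connect/send/receive cycle, pausing between attempts.
    for _ in range(retries):
        try:
            with socket.create_connection(("localhost", LEMMATIZER_PORT)) as sock:
                sock.sendall((language_code + sentence + DELIMITER).encode("utf-8"))
                reply = receive_all(sock)
                return reply.split(DELIMITER, 1)[0]
        except OSError:
            time.sleep(2)
    raise RuntimeError("Can not lemmatize sentence: " + sentence)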

View File

@@ -9,6 +9,7 @@
#include <concordia/concordia_exception.hpp>
+#include "logger.hpp"
class SocketLemmatizer {
public:
@@ -34,6 +35,8 @@ private:
int _sock;
struct sockaddr_in _server;
+Logger _logger;
};
#endif

View File

@@ -1,7 +1,7 @@
#!/bin/sh
-CORPUS_NAME="europarl_sample"
+CORPUS_NAME="jrc_enes"
SRC_LANG_ID=2
-TRG_LANG_ID=1
+TRG_LANG_ID=4
./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt

tests/lemmatizer-test/.gitignore (new file)
View File

@@ -0,0 +1,2 @@
differences.log
corpora/

tests/lemmatizer-test/test.sh (new executable file)
View File

@@ -0,0 +1,10 @@
#!/bin/bash
./test_corpus.py corpora/A_en.txt en >> differences.log
./test_corpus.py corpora/B_en.txt en >> differences.log
./test_corpus.py corpora/C_en.txt en >> differences.log
./test_corpus.py corpora/D_en.txt en >> differences.log
./test_corpus.py corpora/A_fr.txt fr >> differences.log
./test_corpus.py corpora/B_fr.txt fr >> differences.log
./test_corpus.py corpora/C_fr.txt fr >> differences.log
./test_corpus.py corpora/D_fr.txt fr >> differences.log

View File

@@ -0,0 +1,36 @@
#!/usr/bin/python3
import unittest
import json
import requests
import sys
def lemmatizeSentence(lang, sentence):
data = {
'operation': 'lemmatize',
'languageCode':lang,
'sentence':sentence
}
address = 'http://localhost:8800'
response = requests.post(address, data=json.dumps(data))
return response.json()['lemmatizedSentence']
corpus_file_path = sys.argv[1]
lang = sys.argv[2]
line_count = 0
with open(corpus_file_path) as corpus_file:
for line in corpus_file:
line_count += 1
orig = line.rstrip()
lemmatized = lemmatizeSentence(lang,orig)
if len(orig.split()) != len(lemmatized.split()):
print("Different length in:")
print(orig)
print(lemmatized)
if line_count % 1000 == 0:
sys.stderr.write("Done %d lines\n" % line_count)
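The script above exercises the server's lemmatize operation over HTTP and logs sentences whose lemmatized form has a different token count than the original. For a one-off check from an interactive session, the same request can be issued directly (a sketch assuming, as in the script, that the server listens on localhost:8800):

import json
import requests

# One-off lemmatization request mirroring the payload used in the test script above.
payload = {
    'operation': 'lemmatize',
    'languageCode': 'en',
    'sentence': 'The cats were sitting on the mats',
}
response = requests.post('http://localhost:8800', data=json.dumps(payload))
print(response.json()['lemmatizedSentence'])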

View File

@@ -0,0 +1,7 @@
#!/bin/bash
for corpus_file in `ls /mnt/storage/rjawor_storage/copycat_corpus/cleaned/*txt`
do
a=`basename $corpus_file`
concordia-sentence-tokenizer -c /home/rjawor/concordia-server/concordia.cfg < $corpus_file > corpora/$a
done