Merge branch 'master' into two-step-concordia-search

This commit is contained in:
Rafał Jaworski 2019-01-04 14:15:00 +01:00
commit aa544051dc
26 changed files with 142 additions and 23 deletions

View File

@ -21,6 +21,7 @@ namespace LemmaGenSockets
lemmatizersDict.Add("pl", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Polish));
lemmatizersDict.Add("en", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English));
lemmatizersDict.Add("hr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Serbian));
lemmatizersDict.Add("fr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.French));
}
public LemmatizerListener()
@ -52,13 +53,37 @@ namespace LemmaGenSockets
private string lemmatizeWord(string languageCode, string word)
{
if (word.StartsWith("ne_") || word == "i" || word == "o" || word == "do")
// exceptions
if (word.StartsWith("ne_"))
{
return word;
}
string[] parts = word.Split(wordInnerSeparator);
Dictionary<String, HashSet<String>> exceptions = new Dictionary<string, HashSet<string>>();
HashSet<String> plExceptions = new HashSet<string>();
plExceptions.Add("i");
plExceptions.Add("o");
plExceptions.Add("do");
exceptions.Add("pl", plExceptions);
HashSet<String> enExceptions = new HashSet<string>();
enExceptions.Add("d");
exceptions.Add("en", enExceptions);
HashSet<String> languageExceptions;
if (exceptions.TryGetValue(languageCode, out languageExceptions))
{
if(languageExceptions.Contains(word))
{
return word;
}
}
string result = "";
string[] parts = word.Split(wordInnerSeparator);
if (parts.Length == 2)
{
string firstPart = parts[0];
@ -74,7 +99,7 @@ namespace LemmaGenSockets
result = lemmatizersDict[languageCode].Lemmatize(word);
}
if (result == "" || result.Contains(" "))
if (result == "")
{
return word;
}

View File

@ -8,3 +8,14 @@ j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Deb
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe.config
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.pdb
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharp.dll
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuilt.dll
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuiltCompact.dll
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\Lzma#.dll
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csproj.CoreCompileInputs.cache
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb

View File

@ -1,7 +1,7 @@
dir@#@jrc_enes
concordia_host@#@concordia.vm.wmi.amu.edu.pl
concordia_port@#@8800
tmid@#@6
tmid@#@1
desc@#@Welcome to the interactive Concordia demo. The system finds the longest matches of the pattern sentence in its translation memory. This translation memory is 200 000 sentences taken from English-Spanish corpus of European Law. Please enter an English sentence in the field below and press Enter (or use the search button). This instance of Concordia works best with law sentences, but is very likely to output some results for any English sentence. You can also use predefined samples, simply use the link "show/hide samples" and apply one of the sample sentences. After the search, click on the highlighted fragments to see their context.
enjoy@#@Enjoy your work with the system!
prompt@#@Enter search pattern (English sentence):

View File

@ -0,0 +1,10 @@
dir@#@pytania_odpowiedzi_logistyka
concordia_host@#@localhost
concordia_port@#@8800
tmid@#@12
desc@#@Wyszukiwarka pytań
enjoy@#@Wybierz przykładowe pytanie:
prompt@#@Wprowadź pytanie (po polsku):
suggestion@#@chciałbym zakupić samochód specjalistyczny
suggestion@#@czy są jakieś zlecenia od spedytorów z terminala?
suggestion@#@potrzebuję oprogramowania do zarządzania korporacją taksówkarską

View File

@ -1 +0,0 @@
../versions_available/emea_plen.cfg

View File

@ -1 +0,0 @@
../versions_available/europarl_sample.cfg

View File

@ -1 +0,0 @@
../versions_available/icd_filtered_plen.cfg

View File

@ -1 +0,0 @@
../versions_available/icd_plen.cfg

View File

@ -1 +0,0 @@
../versions_available/logofag_enpl.cfg

View File

@ -1 +0,0 @@
../versions_available/logofag_plen.cfg

View File

@ -1 +0,0 @@
../versions_available/opus_medicine_plen.cfg

View File

@ -1 +0,0 @@
../versions_available/tmrepository_enhr.cfg

View File

@ -11,10 +11,12 @@ LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
std::string plCode = "pl";
std::string enCode = "en";
std::string hrCode = "hr";
std::string frCode = "fr";
_lemmatizersMap.insert(plCode, socketLemmatizer1);
_lemmatizersMap.insert(enCode, socketLemmatizer1);
_lemmatizersMap.insert(hrCode, socketLemmatizer1);
_lemmatizersMap.insert(frCode, socketLemmatizer1);
}
LemmatizerFacade::~LemmatizerFacade() {

View File

@ -1,6 +1,9 @@
#include "socket_lemmatizer.hpp"
#include <time.h>
#include "config.hpp"
#include <boost/lexical_cast.hpp>
SocketLemmatizer::SocketLemmatizer(int port) throw(ConcordiaException) :
@ -79,20 +82,38 @@ bool SocketLemmatizer::_send_data(std::string data)
std::string SocketLemmatizer::_receive(int size=512)
{
char buffer[size];
std::string reply;
std::string reply = "";
//Receive a reply from the server
if(recv(_sock , buffer , sizeof(buffer) , 0) < 0) {
throw ConcordiaException("Receive failed");
bool dataAvailable = true;
while (dataAvailable) {
int amountReceived = recv(_sock , buffer , sizeof(buffer) , 0);
if (amountReceived < 0) {
throw ConcordiaException("Lemmatizer: recv failed");
} else if (amountReceived == 0) {
dataAvailable = false;
} else {
buffer[amountReceived] = '\0';
reply += buffer;
}
}
reply = buffer;
return reply;
}
// Lemmatizes a single sentence by sending it to the external lemmatizer
// service over the socket. On any failure the attempt is logged and
// retried (up to 5 attempts, 2 s apart); if every attempt fails a
// ConcordiaException is thrown.
std::string SocketLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
    const int maxAttempts = 5;
    int attempt = 0;
    while (attempt < maxAttempts) {
        try {
            _connect();
            // Wire format: language code, then the sentence, terminated by the delimiter.
            std::string request = languageCode + sentence + LEMMATIZER_DELIMITER;
            _send_data(request);
            std::string reply = _receive(512);
            _disconnect();
            // Keep only the payload before the delimiter (find() == npos keeps the whole reply).
            std::size_t delimiterPos = reply.find(LEMMATIZER_DELIMITER);
            return reply.substr(0, delimiterPos);
        } catch (std::exception & e) {
            _logger.logString("Problem with lemmatization of the sentence", sentence);
            _logger.log("Waiting 2 seconds and retrying...");
            sleep(2);
        }
        ++attempt;
    }
    throw ConcordiaException("Can not lemmatize sentence: "+sentence);
}

View File

@ -9,6 +9,7 @@
#include <concordia/concordia_exception.hpp>
#include "logger.hpp"
class SocketLemmatizer {
public:
@ -34,6 +35,8 @@ private:
int _sock;
struct sockaddr_in _server;
Logger _logger;
};
#endif

View File

@ -1,7 +1,7 @@
#!/bin/sh
CORPUS_NAME="europarl_sample"
CORPUS_NAME="jrc_enes"
SRC_LANG_ID=2
TRG_LANG_ID=1
TRG_LANG_ID=4
./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt

2
tests/lemmatizer-test/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
differences.log
corpora/

10
tests/lemmatizer-test/test.sh Executable file
View File

@ -0,0 +1,10 @@
#!/bin/bash
# Runs the lemmatizer regression check over every test corpus
# (A-D, English then French) and appends sentences with mismatched
# token counts to differences.log.
# Loops replace the eight copy-pasted invocations; iteration order
# matches the original (en A-D, then fr A-D).
for lang in en fr
do
    for corpus in A B C D
    do
        ./test_corpus.py "corpora/${corpus}_${lang}.txt" "$lang" >> differences.log
    done
done

View File

@ -0,0 +1,36 @@
#!/usr/bin/python3
import unittest
import json
import requests
import sys
def lemmatizeSentence(lang, sentence):
    """Send one sentence to the local lemmatizer HTTP service and
    return the lemmatized form reported in its JSON response."""
    payload = json.dumps({
        'operation': 'lemmatize',
        'languageCode': lang,
        'sentence': sentence,
    })
    server_url = 'http://localhost:8800'
    reply = requests.post(server_url, data=payload)
    return reply.json()['lemmatizedSentence']
# Driver: for every line of the corpus file, lemmatize the sentence and
# report any sentence whose lemmatized form has a different token count.
# Progress is written to stderr every 1000 lines.
corpus_path = sys.argv[1]
language = sys.argv[2]
with open(corpus_path) as handle:
    for line_number, raw_line in enumerate(handle, start=1):
        source_sentence = raw_line.rstrip()
        result_sentence = lemmatizeSentence(language, source_sentence)
        if len(source_sentence.split()) != len(result_sentence.split()):
            print("Different length in:")
            print(source_sentence)
            print(result_sentence)
        if line_number % 1000 == 0:
            sys.stderr.write("Done %d lines\n" % line_number)

View File

@ -0,0 +1,7 @@
#!/bin/bash
# Tokenizes every cleaned corpus file with the Concordia sentence
# tokenizer, writing the result into corpora/ under the same basename.
# Fix: iterate the glob directly instead of parsing `ls` output
# (which word-splits on whitespace), and quote all expansions so
# unusual file names are handled safely.
for corpus_file in /mnt/storage/rjawor_storage/copycat_corpus/cleaned/*txt
do
    a=$(basename "$corpus_file")
    concordia-sentence-tokenizer -c /home/rjawor/concordia-server/concordia.cfg < "$corpus_file" > "corpora/$a"
done