Merge branch 'master' into two-step-concordia-search
This commit is contained in:
commit
aa544051dc
@ -21,6 +21,7 @@ namespace LemmaGenSockets
|
||||
lemmatizersDict.Add("pl", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Polish));
|
||||
lemmatizersDict.Add("en", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.English));
|
||||
lemmatizersDict.Add("hr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.Serbian));
|
||||
lemmatizersDict.Add("fr", new LemmatizerPrebuiltCompact(LemmaSharp.LanguagePrebuilt.French));
|
||||
}
|
||||
|
||||
public LemmatizerListener()
|
||||
@ -52,13 +53,37 @@ namespace LemmaGenSockets
|
||||
|
||||
private string lemmatizeWord(string languageCode, string word)
|
||||
{
|
||||
if (word.StartsWith("ne_") || word == "i" || word == "o" || word == "do")
|
||||
// exceptions
|
||||
if (word.StartsWith("ne_"))
|
||||
{
|
||||
return word;
|
||||
}
|
||||
string[] parts = word.Split(wordInnerSeparator);
|
||||
|
||||
|
||||
Dictionary<String, HashSet<String>> exceptions = new Dictionary<string, HashSet<string>>();
|
||||
|
||||
HashSet<String> plExceptions = new HashSet<string>();
|
||||
plExceptions.Add("i");
|
||||
plExceptions.Add("o");
|
||||
plExceptions.Add("do");
|
||||
exceptions.Add("pl", plExceptions);
|
||||
|
||||
HashSet<String> enExceptions = new HashSet<string>();
|
||||
enExceptions.Add("d");
|
||||
exceptions.Add("en", enExceptions);
|
||||
|
||||
HashSet<String> languageExceptions;
|
||||
if (exceptions.TryGetValue(languageCode, out languageExceptions))
|
||||
{
|
||||
if(languageExceptions.Contains(word))
|
||||
{
|
||||
return word;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
string result = "";
|
||||
string[] parts = word.Split(wordInnerSeparator);
|
||||
if (parts.Length == 2)
|
||||
{
|
||||
string firstPart = parts[0];
|
||||
@ -74,7 +99,7 @@ namespace LemmaGenSockets
|
||||
result = lemmatizersDict[languageCode].Lemmatize(word);
|
||||
}
|
||||
|
||||
if (result == "" || result.Contains(" "))
|
||||
if (result == "")
|
||||
{
|
||||
return word;
|
||||
}
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -8,3 +8,14 @@ j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\bin\Deb
|
||||
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache
|
||||
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe
|
||||
j:\Documents\Visual Studio 2015\Projects\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb
|
||||
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe.config
|
||||
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.exe
|
||||
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaGenSockets.pdb
|
||||
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharp.dll
|
||||
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuilt.dll
|
||||
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\LemmaSharpPrebuiltCompact.dll
|
||||
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\bin\Debug\Lzma#.dll
|
||||
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csprojResolveAssemblyReference.cache
|
||||
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.csproj.CoreCompileInputs.cache
|
||||
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.exe
|
||||
J:\projects\concordia-server\LemmaGenSockets\LemmaGenSockets\obj\Debug\LemmaGenSockets.pdb
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,7 +1,7 @@
|
||||
dir@#@jrc_enes
|
||||
concordia_host@#@concordia.vm.wmi.amu.edu.pl
|
||||
concordia_port@#@8800
|
||||
tmid@#@6
|
||||
tmid@#@1
|
||||
desc@#@Welcome to the interactive Concordia demo. The system finds the longest matches of the pattern sentence in its translation memory. This translation memory consists of 200 000 sentences taken from an English-Spanish corpus of European Law. Please enter an English sentence in the field below and press Enter (or use the search button). This instance of Concordia works best with law sentences, but is very likely to output some results for any English sentence. You can also use predefined samples, simply use the link "show/hide samples" and apply one of the sample sentences. After the search, click on the highlighted fragments to see their context.
|
||||
enjoy@#@Enjoy your work with the system!
|
||||
prompt@#@Enter search pattern (English sentence):
|
||||
|
10
cat/versions_available/pytania_odpowiedzi_logistyka.cfg
Normal file
10
cat/versions_available/pytania_odpowiedzi_logistyka.cfg
Normal file
@ -0,0 +1,10 @@
|
||||
dir@#@pytania_odpowiedzi_logistyka
|
||||
concordia_host@#@localhost
|
||||
concordia_port@#@8800
|
||||
tmid@#@12
|
||||
desc@#@Wyszukiwarka pytań
|
||||
enjoy@#@Wybierz przykładowe pytanie:
|
||||
prompt@#@Wprowadź pytanie (po polsku):
|
||||
suggestion@#@chciałbym zakupić samochód specjalistyczny
|
||||
suggestion@#@czy są jakieś zlecenia od spedytorów z terminala?
|
||||
suggestion@#@potrzebuję oprogramowania do zarządzania korporacją taksówkarską
|
@ -1 +0,0 @@
|
||||
../versions_available/emea_plen.cfg
|
@ -1 +0,0 @@
|
||||
../versions_available/europarl_sample.cfg
|
@ -1 +0,0 @@
|
||||
../versions_available/icd_filtered_plen.cfg
|
@ -1 +0,0 @@
|
||||
../versions_available/icd_plen.cfg
|
@ -1 +0,0 @@
|
||||
../versions_available/logofag_enpl.cfg
|
@ -1 +0,0 @@
|
||||
../versions_available/logofag_plen.cfg
|
@ -1 +0,0 @@
|
||||
../versions_available/opus_medicine_plen.cfg
|
@ -1 +0,0 @@
|
||||
../versions_available/tmrepository_enhr.cfg
|
@ -11,10 +11,12 @@ LemmatizerFacade::LemmatizerFacade() throw(ConcordiaException) {
|
||||
std::string plCode = "pl";
|
||||
std::string enCode = "en";
|
||||
std::string hrCode = "hr";
|
||||
std::string frCode = "fr";
|
||||
|
||||
_lemmatizersMap.insert(plCode, socketLemmatizer1);
|
||||
_lemmatizersMap.insert(enCode, socketLemmatizer1);
|
||||
_lemmatizersMap.insert(hrCode, socketLemmatizer1);
|
||||
_lemmatizersMap.insert(frCode, socketLemmatizer1);
|
||||
}
|
||||
|
||||
LemmatizerFacade::~LemmatizerFacade() {
|
||||
|
@ -1,6 +1,9 @@
|
||||
#include "socket_lemmatizer.hpp"
|
||||
|
||||
#include <time.h>
|
||||
|
||||
#include "config.hpp"
|
||||
|
||||
#include <boost/lexical_cast.hpp>
|
||||
|
||||
SocketLemmatizer::SocketLemmatizer(int port) throw(ConcordiaException) :
|
||||
@ -79,20 +82,38 @@ bool SocketLemmatizer::_send_data(std::string data)
|
||||
std::string SocketLemmatizer::_receive(int size=512)
|
||||
{
|
||||
char buffer[size];
|
||||
std::string reply;
|
||||
std::string reply = "";
|
||||
|
||||
//Receive a reply from the server
|
||||
if(recv(_sock , buffer , sizeof(buffer) , 0) < 0) {
|
||||
throw ConcordiaException("Receive failed");
|
||||
bool dataAvailable = true;
|
||||
while (dataAvailable) {
|
||||
int amountReceived = recv(_sock , buffer , sizeof(buffer) , 0);
|
||||
if (amountReceived < 0) {
|
||||
throw ConcordiaException("Lemmatizer: recv failed");
|
||||
} else if (amountReceived == 0) {
|
||||
dataAvailable = false;
|
||||
} else {
|
||||
buffer[amountReceived] = '\0';
|
||||
reply += buffer;
|
||||
}
|
||||
}
|
||||
reply = buffer;
|
||||
return reply;
|
||||
}
|
||||
|
||||
std::string SocketLemmatizer::lemmatizeSentence(std::string languageCode, std::string sentence) {
|
||||
for (int i=0;i<5;i++) {
|
||||
try {
|
||||
_connect();
|
||||
_send_data(languageCode+sentence+LEMMATIZER_DELIMITER);
|
||||
std::string reply = _receive(512);
|
||||
_disconnect();
|
||||
return reply.substr(0,reply.find(LEMMATIZER_DELIMITER));
|
||||
} catch (std::exception & e) {
|
||||
_logger.logString("Problem with lemmatization of the sentence", sentence);
|
||||
_logger.log("Waiting 2 seconds and retrying...");
|
||||
sleep(2);
|
||||
}
|
||||
}
|
||||
|
||||
throw ConcordiaException("Can not lemmatize sentence: "+sentence);
|
||||
}
|
||||
|
@ -9,6 +9,7 @@
|
||||
|
||||
#include <concordia/concordia_exception.hpp>
|
||||
|
||||
#include "logger.hpp"
|
||||
|
||||
class SocketLemmatizer {
|
||||
public:
|
||||
@ -34,6 +35,8 @@ private:
|
||||
int _sock;
|
||||
|
||||
struct sockaddr_in _server;
|
||||
|
||||
Logger _logger;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -1,7 +1,7 @@
|
||||
#!/bin/sh
|
||||
|
||||
CORPUS_NAME="europarl_sample"
|
||||
CORPUS_NAME="jrc_enes"
|
||||
SRC_LANG_ID=2
|
||||
TRG_LANG_ID=1
|
||||
TRG_LANG_ID=4
|
||||
|
||||
./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt
|
||||
|
2
tests/lemmatizer-test/.gitignore
vendored
Normal file
2
tests/lemmatizer-test/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
differences.log
|
||||
corpora/
|
10
tests/lemmatizer-test/test.sh
Executable file
10
tests/lemmatizer-test/test.sh
Executable file
@ -0,0 +1,10 @@
|
||||
#!/bin/bash

# Run the corpus check over parts A-D of the English corpus and then of the
# French corpus, appending everything the checker prints to differences.log.
for lang in en fr
do
    for part in A B C D
    do
        ./test_corpus.py "corpora/${part}_${lang}.txt" "$lang" >> differences.log
    done
done
|
36
tests/lemmatizer-test/test_corpus.py
Executable file
36
tests/lemmatizer-test/test_corpus.py
Executable file
@ -0,0 +1,36 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import unittest
|
||||
import json
|
||||
import requests
|
||||
import sys
|
||||
|
||||
|
||||
|
||||
def lemmatizeSentence(lang, sentence, address='http://localhost:8800', timeout=60):
    """Ask the server at `address` to lemmatize `sentence`.

    Sends a JSON-encoded 'lemmatize' operation via HTTP POST and returns the
    'lemmatizedSentence' field of the JSON response.

    lang     -- language code passed as 'languageCode'
    sentence -- text passed as 'sentence'
    address  -- URL of the server (defaults to the local instance)
    timeout  -- seconds to wait for the server; without a timeout a stalled
                server would block the whole test run indefinitely
    """
    data = {
        'operation': 'lemmatize',
        'languageCode': lang,
        'sentence': sentence
    }

    response = requests.post(address, data=json.dumps(data), timeout=timeout)
    return response.json()['lemmatizedSentence']
|
||||
|
||||
# Usage: test_corpus.py <corpus_file> <language_code>
# Sends every line of the corpus file to the lemmatizer and reports the lines
# whose lemmatized form has a different number of whitespace-separated tokens.
if len(sys.argv) != 3:
    sys.exit("Usage: %s <corpus_file> <language_code>" % sys.argv[0])

corpus_file_path = sys.argv[1]
lang = sys.argv[2]

line_count = 0
# Decode explicitly instead of relying on the locale's default encoding.
# NOTE(review): assumes the corpora are UTF-8 - confirm.
with open(corpus_file_path, encoding='utf-8') as corpus_file:
    for line in corpus_file:
        line_count += 1
        orig = line.rstrip()
        lemmatized = lemmatizeSentence(lang, orig)
        if len(orig.split()) != len(lemmatized.split()):
            print("Different length in:")
            print(orig)
            print(lemmatized)
        # Progress goes to stderr so it does not end up in redirected output.
        if line_count % 1000 == 0:
            sys.stderr.write("Done %d lines\n" % line_count)
|
7
tests/lemmatizer-test/tokenize.sh
Executable file
7
tests/lemmatizer-test/tokenize.sh
Executable file
@ -0,0 +1,7 @@
|
||||
#!/bin/bash

# Tokenize every cleaned corpus file into corpora/, keeping its base name.

# The output redirection below fails when the target directory is missing.
mkdir -p corpora

# Iterate over the glob itself rather than over `ls` output, so that paths
# containing whitespace or glob characters are not split or re-expanded.
for corpus_file in /mnt/storage/rjawor_storage/copycat_corpus/cleaned/*txt
do
    # When nothing matches, the pattern is left unexpanded; skip it.
    [ -e "$corpus_file" ] || continue
    a=$(basename "$corpus_file")
    concordia-sentence-tokenizer -c /home/rjawor/concordia-server/concordia.cfg < "$corpus_file" > "corpora/$a"
done
|
Loading…
Reference in New Issue
Block a user