lemmatize all

Rafał Jaworski 2019-02-04 15:27:56 +01:00
parent 7178a7f5db
commit a8c7db6ee4
10 changed files with 200 additions and 1 deletion


@@ -224,7 +224,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
                 jsonWriter.Int(newId);
                 jsonWriter.EndObject();
-            } else if (operation == "lemmatize") {
+            } else if (operation == LEMMATIZE_OP) {
                 std::string sentence = _getStringParameter(d, "sentence");
                 std::string languageCode = _getStringParameter(d, "languageCode");
                 std::string lemmatizedSentence = _lemmatizerFacade->lemmatizeSentence(languageCode, sentence);
@@ -232,6 +232,22 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
                 jsonWriter.String("lemmatizedSentence");
                 jsonWriter.String(lemmatizedSentence.c_str());
                 jsonWriter.EndObject();
+            } else if (operation == LEMMATIZE_ALL_OP) {
+                std::vector<std::string> sentences;
+                std::string languageCode = _getStringParameter(d, "languageCode");
+                const rapidjson::Value & sentencesArray = d[SENTENCES_PARAM];
+                for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
+                    sentences.push_back(sentencesArray[i].GetString());
+                }
+                std::vector<std::string> lemmatizedSentences = _lemmatizerFacade->lemmatizeSentences(languageCode, sentences);
+                jsonWriter.StartObject();
+                jsonWriter.String("lemmatizedSentences");
+                jsonWriter.StartArray();
+                BOOST_FOREACH(std::string & lemmatizedSentence, lemmatizedSentences) {
+                    jsonWriter.String(lemmatizedSentence.c_str());
+                }
+                jsonWriter.EndArray();
+                jsonWriter.EndObject();
             } else if (operation == REFRESH_INDEX_OP) {
                 int tmId = _getIntParameter(d, TM_ID_PARAM);
                 _indexController->refreshIndexFromRAM(jsonWriter, tmId);
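For reference, the new lemmatizeAll operation reads a languageCode string and a sentences array from the request JSON and answers with a lemmatizedSentences array, as the handler above shows. A minimal request/response pair might look like this (the Polish sentence is a placeholder example; the returned lemmas depend on the lemmatizer configured for the language, so they are elided here):

{"operation": "lemmatizeAll", "languageCode": "pl", "sentences": ["ona poszła do sklepu"]}

{"lemmatizedSentences": ["..."]}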


@@ -31,6 +31,8 @@
 #define ADD_SENTENCE_OP "addSentence"
 #define ADD_SENTENCES_OP "addSentences"
 #define LEMMATIZE_OP "lemmatize"
+#define LEMMATIZE_ALL_OP "lemmatizeAll"
 #define ADD_ALIGNED_SENTENCES_OP "addAlignedSentences"
 #define ADD_ALIGNED_LEMMATIZED_SENTENCES_OP "addAlignedLemmatizedSentences"
 #define ADD_REQUEST_OP "addRequest"


@@ -33,6 +33,15 @@ std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::s
 }
+
+std::vector<std::string> LemmatizerFacade::lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences) {
+    std::vector<std::string> result;
+    BOOST_FOREACH(std::string & sentence, sentences) {
+        result.push_back(lemmatizeSentence(languageCode, sentence));
+    }
+    return result;
+}
+
 std::string LemmatizerFacade::lemmatizeIfNeeded(std::string pattern, int tmId) {
     std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
     if (tmInfo.first) {


@@ -20,6 +20,8 @@ public:
     std::string lemmatizeSentence(std::string languageCode, std::string sentence);
+    std::vector<std::string> lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences);
     std::string lemmatizeIfNeeded(std::string pattern, int tmId);
+    std::vector<std::string> lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId);

fast-aligner/.gitignore (new vendored file, 1 line added)

@@ -0,0 +1 @@
corpora/

fast-aligner/Makefile (new file, 61 lines added)

@@ -0,0 +1,61 @@
SRC_LANG=pl
TRG_LANG=en
CORPUS_NAME=opensubtitles_sample
SEPARATOR=@\#@
DICTIONARY_WEIGHT=3

all: corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem

clean-intermediate-files:
	rm -f corpora/$(CORPUS_NAME)/*.lem
	rm -f corpora/$(CORPUS_NAME)/*.tok
	rm -f corpora/$(CORPUS_NAME)/*.dict
	rm -f corpora/$(CORPUS_NAME)/*.classes
	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
	rm -f corpora/$(CORPUS_NAME)/*.vcb
	rm -f corpora/$(CORPUS_NAME)/*.snt
	rm -f corpora/$(CORPUS_NAME)/*.cooc
	rm -f corpora/$(CORPUS_NAME)/aligned*part*
	rm -f corpora/$(CORPUS_NAME)/aligned.txt
	rm -f corpora/$(CORPUS_NAME)/giza.cfg
	rm -f corpora/$(CORPUS_NAME)/aligned.gizacfg
	rm -f corpora/$(CORPUS_NAME)/pasted.txt
	rm -f corpora/$(CORPUS_NAME)/pasted_deduplicated.txt
	rm -f corpora/$(CORPUS_NAME)/src_clean.txt
	rm -f corpora/$(CORPUS_NAME)/trg_clean.txt

clean: clean-intermediate-files
	rm -f corpora/$(CORPUS_NAME)/src_final.txt
	rm -f corpora/$(CORPUS_NAME)/trg_final.txt
	rm -f corpora/$(CORPUS_NAME)/aligned_final.txt

corpora/$(CORPUS_NAME)/src.dict:
	./collect_dict.py $(SRC_LANG) $(TRG_LANG) $(DICTIONARY_WEIGHT) > $@

corpora/$(CORPUS_NAME)/trg.dict:
	./collect_dict.py $(TRG_LANG) $(SRC_LANG) $(DICTIONARY_WEIGHT) > $@

corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.txt
	/usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< | ./sentence_lemmatizer.py $(SRC_LANG) > $@

corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.txt
	/usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< | ./sentence_lemmatizer.py $(TRG_LANG) > $@

corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok
	./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok $(SEPARATOR)

corpora/$(CORPUS_NAME)/pasted.txt: corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt
	./paste.py corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt $(SEPARATOR) > $@

corpora/$(CORPUS_NAME)/pasted_deduplicated.txt: corpora/$(CORPUS_NAME)/pasted.txt
	sort -k 1.13 $< | uniq -s 12 | sort > $@

corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt: corpora/$(CORPUS_NAME)/pasted_deduplicated.txt corpora/$(CORPUS_NAME)/aligned.txt
	./extract.py $< corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt $(SEPARATOR)
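Two reading notes on the recipes above. The default all goal only produces the lemmatized files src.lem and trg.lem; the alignment targets at the bottom are built on demand. The deduplication recipe sort -k 1.13 $< | uniq -s 12 | sort relies on a convention that is assumed here, since paste.py is not part of this commit: every pasted line starts with a fixed-width 12-character, zero-padded line index, e.g.:

000000000042 ona poszła do sklepu @#@ she went to the shop

With that layout, sort -k 1.13 orders lines by the text that follows the index, uniq -s 12 drops duplicate sentence pairs while ignoring the index, and the trailing sort restores the original corpus order.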

fast-aligner/clean_corpus.py (new executable file, 25 lines added)

@@ -0,0 +1,25 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import sys

# Filtering thresholds: drop sentence pairs that are too long or whose
# source/target length ratio is too skewed for alignment.
max_tokens = 100
max_ratio = 4.0

# Note: the separator argument is accepted but not used below.
separator = sys.argv[9]

with open(sys.argv[1]) as src_file, open(sys.argv[2]) as trg_file, open(sys.argv[3]) as src_tok, open(sys.argv[4]) as trg_tok, open(sys.argv[5], 'w') as src_clean, open(sys.argv[6], 'w') as trg_clean, open(sys.argv[7], 'w') as src_clean_tok, open(sys.argv[8], 'w') as trg_clean_tok:
    for line in src_file:
        src_line_orig = line.strip()
        trg_line_orig = trg_file.readline().strip()
        src_line_tok = src_tok.readline().strip()
        trg_line_tok = trg_tok.readline().strip()
        src_token_count = len(src_line_tok.split())
        trg_token_count = len(trg_line_tok.split())
        if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
            # Longer side divided by the shorter side, so the ratio is always >= 1.
            ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count)
            if (ratio <= max_ratio):
                src_clean.write(src_line_orig+"\n")
                trg_clean.write(trg_line_orig+"\n")
                src_clean_tok.write(src_line_tok+"\n")
                trg_clean_tok.write(trg_line_tok+"\n")

fast-aligner/collect_dict.py (new executable file, 17 lines added)

@@ -0,0 +1,17 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import sys, os, bz2

src_lang = sys.argv[1]
trg_lang = sys.argv[2]
weight = int(sys.argv[3])

# Scan all dictionaries that cover both the source and the target language
# and print each source-side entry `weight` times.
for dname in os.listdir('dictionaries'):
    src_path = 'dictionaries/%s/%s.bz2' % (dname, src_lang)
    trg_path = 'dictionaries/%s/%s.bz2' % (dname, trg_lang)
    if os.path.isfile(src_path) and os.path.isfile(trg_path):
        with bz2.open(src_path, 'rt') as src_dict_file:
            for line in src_dict_file:
                for i in range(weight):
                    print(line.strip())
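The repetition loop is a common dictionary-weighting trick: each entry is printed DICTIONARY_WEIGHT times (3 in the Makefile above), presumably so that the word aligner gives proportionally more weight to dictionary translations when the entries are added to the training data. Note that the script only prints the source-side file of each dictionary; the target side is produced by the symmetric call with the languages swapped, as in the Makefile's two invocations: ./collect_dict.py pl en 3 > corpora/opensubtitles_sample/src.dict and ./collect_dict.py en pl 3 > corpora/opensubtitles_sample/trg.dict.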

fast-aligner/sentence_lemmatizer.py (new executable file, 37 lines added)

@@ -0,0 +1,37 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import json
import requests
import sys

# Number of sentences sent to the concordia server in one request.
BUFFER_SIZE = 500

def lemmatize_sentences(language_code, sentences):
    data = {
        'operation': 'lemmatizeAll',
        'languageCode': language_code,
        'sentences': sentences
    }
    address = 'http://localhost:8800'
    response = requests.post(address, data=json.dumps(data))
    response.encoding = 'utf-8'
    response_json = json.loads(response.text)
    return '\n'.join(response_json['lemmatizedSentences'])

language_code = sys.argv[1]

# Read sentences from stdin and lemmatize them in batches of BUFFER_SIZE.
sentences_buffer = []
for line in sys.stdin:
    sentences_buffer.append(line.rstrip())
    if len(sentences_buffer) == BUFFER_SIZE:
        print(lemmatize_sentences(language_code, sentences_buffer))
        sentences_buffer = []

# Flush the remaining, partially filled buffer.
if len(sentences_buffer) > 0:
    print(lemmatize_sentences(language_code, sentences_buffer))
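As wired up in the Makefile above, the script expects one tokenized sentence per line on standard input and writes one lemmatized sentence per line, for example:

/usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < corpora/opensubtitles_sample/src.txt | ./sentence_lemmatizer.py pl > corpora/opensubtitles_sample/src.lem

Note that the server address http://localhost:8800 is hard-coded, and a failed request (server down, or an error response without the lemmatizedSentences key) raises an uncaught exception, which aborts the pipeline.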

tests/lemmatizeSentences.py (new executable file, 29 lines added)

@@ -0,0 +1,29 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import time
import host

# Polish test sentences: "she went to the store", "I told that to Tomek".
data = {
    'operation': 'lemmatizeAll',
    'languageCode': sys.argv[1],
    'sentences': ["ona poszła do sklepu", "powiedziałem to Tomkowi"]
}

address = 'http://'+host.concordia_host
if len(host.concordia_port) > 0:
    address += ':'+host.concordia_port

start = time.time()
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
end = time.time()

print "Execution time: %.4f seconds." % (end-start)
print "Result: "
print response
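With a running concordia server and a lemmatizer configured for the requested language, the script prints the measured round-trip time followed by a dict of the form {u'lemmatizedSentences': [u'...', u'...']}; the exact lemmas depend on the lemmatizer, so they are not asserted. Despite the unittest import, this is a plain smoke test rather than a unittest test case.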