From a8c7db6ee4110ce421109a189d5537a93df2ed9f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Jaworski?=
Date: Mon, 4 Feb 2019 15:27:56 +0100
Subject: [PATCH] lemmatize all

---
 concordia-server/concordia_server.cpp  | 18 +++++++-
 concordia-server/config.hpp.in         |  2 +
 concordia-server/lemmatizer_facade.cpp |  9 ++++
 concordia-server/lemmatizer_facade.hpp |  2 +
 fast-aligner/.gitignore                |  1 +
 fast-aligner/Makefile                  | 61 ++++++++++++++++++++++++++
 fast-aligner/clean_corpus.py           | 25 +++++++++++
 fast-aligner/collect_dict.py           | 17 +++++++
 fast-aligner/sentence_lemmatizer.py    | 37 ++++++++++++++++
 tests/lemmatizeSentences.py            | 29 ++++++++++++
 10 files changed, 200 insertions(+), 1 deletion(-)
 create mode 100644 fast-aligner/.gitignore
 create mode 100644 fast-aligner/Makefile
 create mode 100755 fast-aligner/clean_corpus.py
 create mode 100755 fast-aligner/collect_dict.py
 create mode 100755 fast-aligner/sentence_lemmatizer.py
 create mode 100755 tests/lemmatizeSentences.py

diff --git a/concordia-server/concordia_server.cpp b/concordia-server/concordia_server.cpp
index baf67db..f399c1a 100644
--- a/concordia-server/concordia_server.cpp
+++ b/concordia-server/concordia_server.cpp
@@ -224,7 +224,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
             jsonWriter.Int(newId);
             jsonWriter.EndObject();
-        } else if (operation == "lemmatize") {
+        } else if (operation == LEMMATIZE_OP) {
             std::string sentence = _getStringParameter(d, "sentence");
             std::string languageCode = _getStringParameter(d, "languageCode");
             std::string lemmatizedSentence = _lemmatizerFacade->lemmatizeSentence(languageCode, sentence);
@@ -232,6 +232,22 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
             jsonWriter.String("lemmatizedSentence");
             jsonWriter.String(lemmatizedSentence.c_str());
             jsonWriter.EndObject();
+        } else if (operation == LEMMATIZE_ALL_OP) {
+            std::vector<std::string> sentences;
+            std::string languageCode = _getStringParameter(d, "languageCode");
+            const rapidjson::Value & sentencesArray = d[SENTENCES_PARAM];
+            for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
+                sentences.push_back(sentencesArray[i].GetString());
+            }
+            std::vector<std::string> lemmatizedSentences = _lemmatizerFacade->lemmatizeSentences(languageCode, sentences);
+            jsonWriter.StartObject();
+            jsonWriter.String("lemmatizedSentences");
+            jsonWriter.StartArray();
+            BOOST_FOREACH(std::string & lemmatizedSentence, lemmatizedSentences) {
+                jsonWriter.String(lemmatizedSentence.c_str());
+            }
+            jsonWriter.EndArray();
+            jsonWriter.EndObject();
         } else if (operation == REFRESH_INDEX_OP) {
             int tmId = _getIntParameter(d, TM_ID_PARAM);
             _indexController->refreshIndexFromRAM(jsonWriter, tmId);
diff --git a/concordia-server/config.hpp.in b/concordia-server/config.hpp.in
index 7d9a87c..26bb711 100644
--- a/concordia-server/config.hpp.in
+++ b/concordia-server/config.hpp.in
@@ -31,6 +31,8 @@
 #define ADD_SENTENCE_OP "addSentence"
 #define ADD_SENTENCES_OP "addSentences"
+#define LEMMATIZE_OP "lemmatize"
+#define LEMMATIZE_ALL_OP "lemmatizeAll"
 #define ADD_ALIGNED_SENTENCES_OP "addAlignedSentences"
 #define ADD_ALIGNED_LEMMATIZED_SENTENCES_OP "addAlignedLemmatizedSentences"
 #define ADD_REQUEST_OP "addRequest"
diff --git a/concordia-server/lemmatizer_facade.cpp b/concordia-server/lemmatizer_facade.cpp
index c853024..1c406e0 100644
--- a/concordia-server/lemmatizer_facade.cpp
+++ b/concordia-server/lemmatizer_facade.cpp
@@ -33,6 +33,15 @@ std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::s
 }
 
+std::vector<std::string> LemmatizerFacade::lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences) {
+    std::vector<std::string> result;
+    BOOST_FOREACH(std::string & sentence, sentences) {
+        result.push_back(lemmatizeSentence(languageCode, sentence));
+    }
+    return result;
+
+}
+
 
 std::string LemmatizerFacade::lemmatizeIfNeeded(std::string pattern, int tmId) {
     std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
     if (tmInfo.first) {
diff --git a/concordia-server/lemmatizer_facade.hpp b/concordia-server/lemmatizer_facade.hpp
index e9f5c3e..149f53f 100644
--- a/concordia-server/lemmatizer_facade.hpp
+++ b/concordia-server/lemmatizer_facade.hpp
@@ -20,6 +20,8 @@ public:
     std::string lemmatizeSentence(std::string languageCode, std::string sentence);
 
+    std::vector<std::string> lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences);
+
     std::string lemmatizeIfNeeded(std::string pattern, int tmId);
 
     std::vector<std::string> lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId);
 
diff --git a/fast-aligner/.gitignore b/fast-aligner/.gitignore
new file mode 100644
index 0000000..fa3afaf
--- /dev/null
+++ b/fast-aligner/.gitignore
@@ -0,0 +1 @@
+corpora/
diff --git a/fast-aligner/Makefile b/fast-aligner/Makefile
new file mode 100644
index 0000000..f080bf9
--- /dev/null
+++ b/fast-aligner/Makefile
@@ -0,0 +1,61 @@
+SRC_LANG=pl
+TRG_LANG=en
+CORPUS_NAME=opensubtitles_sample
+SEPARATOR=@\#@
+
+DICTIONARY_WEIGHT=3
+
+all: corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem
+
+
+clean-intermediate-files:
+	rm -f corpora/$(CORPUS_NAME)/*.lem
+	rm -f corpora/$(CORPUS_NAME)/*.tok
+	rm -f corpora/$(CORPUS_NAME)/*.dict
+	rm -f corpora/$(CORPUS_NAME)/*.classes
+	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
+	rm -f corpora/$(CORPUS_NAME)/*.vcb
+	rm -f corpora/$(CORPUS_NAME)/*.snt
+	rm -f corpora/$(CORPUS_NAME)/*.cooc
+	rm -f corpora/$(CORPUS_NAME)/aligned*part*
+	rm -f corpora/$(CORPUS_NAME)/aligned.txt
+	rm -f corpora/$(CORPUS_NAME)/giza.cfg
+	rm -f corpora/$(CORPUS_NAME)/aligned.gizacfg
+	rm -f corpora/$(CORPUS_NAME)/pasted.txt
+	rm -f corpora/$(CORPUS_NAME)/pasted_deduplicated.txt
+	rm -f corpora/$(CORPUS_NAME)/src_clean.txt
+	rm -f corpora/$(CORPUS_NAME)/trg_clean.txt
+
+clean: clean-intermediate-files
+	rm -f corpora/$(CORPUS_NAME)/src_final.txt
+	rm -f corpora/$(CORPUS_NAME)/trg_final.txt
+	rm -f corpora/$(CORPUS_NAME)/aligned_final.txt
+
+
+corpora/$(CORPUS_NAME)/src.dict:
+	./collect_dict.py $(SRC_LANG) $(TRG_LANG) $(DICTIONARY_WEIGHT) > $@
+
+corpora/$(CORPUS_NAME)/trg.dict:
+	./collect_dict.py $(TRG_LANG) $(SRC_LANG) $(DICTIONARY_WEIGHT) > $@
+
+
+corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.txt
+	/usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< | ./sentence_lemmatizer.py $(SRC_LANG) > $@
+
+corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.txt
+	/usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< | ./sentence_lemmatizer.py $(TRG_LANG) > $@
+
+
+
+
+corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok
+	./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok $(SEPARATOR)
+
+corpora/$(CORPUS_NAME)/pasted.txt: corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt
+	./paste.py corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt $(SEPARATOR) > $@
+
+corpora/$(CORPUS_NAME)/pasted_deduplicated.txt: corpora/$(CORPUS_NAME)/pasted.txt
+	sort -k 1.13 $< | uniq -s 12 | sort > $@
+
+corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt: corpora/$(CORPUS_NAME)/pasted_deduplicated.txt corpora/$(CORPUS_NAME)/aligned.txt
+	./extract.py $< corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt $(SEPARATOR)
diff --git a/fast-aligner/clean_corpus.py b/fast-aligner/clean_corpus.py
new file mode 100755
index 0000000..a545ff4
--- /dev/null
+++ b/fast-aligner/clean_corpus.py
@@ -0,0 +1,25 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+import sys
+
+max_tokens = 100
+max_ratio = 4.0
+
+separator = sys.argv[9]
+
+with open(sys.argv[1]) as src_file, open(sys.argv[2]) as trg_file, open(sys.argv[3]) as src_tok, open(sys.argv[4]) as trg_tok, open(sys.argv[5], 'w') as src_clean, open(sys.argv[6], 'w') as trg_clean, open(sys.argv[7], 'w') as src_clean_tok, open(sys.argv[8], 'w') as trg_clean_tok:
+    for line in src_file:
+        src_line_orig = line.strip()
+        trg_line_orig = trg_file.readline().strip()
+        src_line_tok = src_tok.readline().strip()
+        trg_line_tok = trg_tok.readline().strip()
+        src_token_count = len(src_line_tok.split())
+        trg_token_count = len(trg_line_tok.split())
+        if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
+            ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count)
+            if (ratio <= max_ratio):
+                src_clean.write(src_line_orig+"\n")
+                trg_clean.write(trg_line_orig+"\n")
+                src_clean_tok.write(src_line_tok+"\n")
+                trg_clean_tok.write(trg_line_tok+"\n")
diff --git a/fast-aligner/collect_dict.py b/fast-aligner/collect_dict.py
new file mode 100755
index 0000000..3aacaba
--- /dev/null
+++ b/fast-aligner/collect_dict.py
@@ -0,0 +1,17 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+import sys, os, bz2
+
+src_lang = sys.argv[1]
+trg_lang = sys.argv[2]
+weight = int(sys.argv[3])
+
+for dname in os.listdir('dictionaries'):
+    src_path = 'dictionaries/%s/%s.bz2' % (dname, src_lang)
+    trg_path = 'dictionaries/%s/%s.bz2' % (dname, trg_lang)
+    if os.path.isfile(src_path) and os.path.isfile(trg_path):
+        with bz2.open(src_path, 'rt') as src_dict_file:
+            for line in src_dict_file:
+                for i in range(weight):
+                    print(line.strip())
diff --git a/fast-aligner/sentence_lemmatizer.py b/fast-aligner/sentence_lemmatizer.py
new file mode 100755
index 0000000..005b55a
--- /dev/null
+++ b/fast-aligner/sentence_lemmatizer.py
@@ -0,0 +1,37 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+import json
+import requests
+import sys
+
+BUFFER_SIZE = 500
+
+def lemmatize_sentences(language_code, sentences):
+    data = {
+        'operation': 'lemmatizeAll',
+        'languageCode':language_code,
+        'sentences':sentences
+    }
+
+    address = 'http://localhost:8800'
+
+    response = requests.post(address, data = json.dumps(data))
+    response.encoding = 'utf-8'
+
+    response_json = json.loads(response.text)
+    return '\n'.join(response_json['lemmatizedSentences'])
+
+
+language_code = sys.argv[1]
+sentences_buffer = []
+for line in sys.stdin:
+    sentences_buffer.append(line.rstrip())
+    if len(sentences_buffer) == BUFFER_SIZE:
+        print(lemmatize_sentences(language_code,sentences_buffer))
+        sentences_buffer = []
+
+if len(sentences_buffer) > 0:
+    print(lemmatize_sentences(language_code,sentences_buffer))
+
+
diff --git a/tests/lemmatizeSentences.py b/tests/lemmatizeSentences.py
new file mode 100755
index 0000000..0ef29a6
--- /dev/null
+++ b/tests/lemmatizeSentences.py
@@ -0,0 +1,29 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import unittest
+import json
+import urllib2
+import sys
+import time
+import host
+
+data = {
+    'operation': 'lemmatizeAll',
+    'languageCode':sys.argv[1],
+    'sentences':["ona poszła do sklepu", "powiedziałem to Tomkowi"]
+}
+
+address = 'http://'+host.concordia_host
+if len(host.concordia_port) > 0:
+    address += ':'+host.concordia_port
+
+start = time.time()
+req = urllib2.Request(address)
+req.add_header('Content-Type', 'application/json')
+response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
+end = time.time()
+
+print "Execution time: %.4f seconds." % (end-start)
+print "Result: "
+print response
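
For quick reference, the new lemmatizeAll operation introduced by this patch can be exercised end to end over HTTP. The following is a minimal Python 3 sketch, not part of the patch itself: it assumes a concordia-server instance listening on http://localhost:8800 (the address hard-coded in fast-aligner/sentence_lemmatizer.py) and the third-party requests package.

    #!/usr/bin/python3
    # Minimal round trip against the lemmatizeAll operation (LEMMATIZE_ALL_OP).
    # Assumes concordia-server is listening on localhost:8800.
    import json
    import requests

    request = {
        'operation': 'lemmatizeAll',
        'languageCode': 'pl',
        # The handler reads this array (SENTENCES_PARAM) and lemmatizes each entry.
        'sentences': ['ona poszła do sklepu', 'powiedziałem to Tomkowi']
    }

    response = requests.post('http://localhost:8800', data=json.dumps(request))
    response.encoding = 'utf-8'

    # The server replies with {"lemmatizedSentences": [...]}, one string per input.
    for lemmatized in json.loads(response.text)['lemmatizedSentences']:
        print(lemmatized)

Note that LemmatizerFacade::lemmatizeSentences simply loops over lemmatizeSentence, so batching does not reduce the lemmatization work on the server side; its benefit is fewer HTTP round trips, which is exactly what the BUFFER_SIZE buffering in fast-aligner/sentence_lemmatizer.py exploits.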