lemmatize all

parent 7178a7f5db
commit a8c7db6ee4
@@ -224,7 +224,7 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
            jsonWriter.Int(newId);
            jsonWriter.EndObject();

-        } else if (operation == "lemmatize") {
+        } else if (operation == LEMMATIZE_OP) {
            std::string sentence = _getStringParameter(d, "sentence");
            std::string languageCode = _getStringParameter(d, "languageCode");
            std::string lemmatizedSentence = _lemmatizerFacade->lemmatizeSentence(languageCode, sentence);
@@ -232,6 +232,22 @@ std::string ConcordiaServer::handleRequest(std::string & requestString) {
            jsonWriter.String("lemmatizedSentence");
            jsonWriter.String(lemmatizedSentence.c_str());
            jsonWriter.EndObject();
+        } else if (operation == LEMMATIZE_ALL_OP) {
+            std::vector<std::string> sentences;
+            std::string languageCode = _getStringParameter(d, "languageCode");
+            const rapidjson::Value & sentencesArray = d[SENTENCES_PARAM];
+            for (rapidjson::SizeType i = 0; i < sentencesArray.Size(); i++) {
+                sentences.push_back(sentencesArray[i].GetString());
+            }
+            std::vector<std::string> lemmatizedSentences = _lemmatizerFacade->lemmatizeSentences(languageCode, sentences);
+            jsonWriter.StartObject();
+            jsonWriter.String("lemmatizedSentences");
+            jsonWriter.StartArray();
+            BOOST_FOREACH(std::string & lemmatizedSentence, lemmatizedSentences) {
+                jsonWriter.String(lemmatizedSentence.c_str());
+            }
+            jsonWriter.EndArray();
+            jsonWriter.EndObject();
        } else if (operation == REFRESH_INDEX_OP) {
            int tmId = _getIntParameter(d, TM_ID_PARAM);
            _indexController->refreshIndexFromRAM(jsonWriter, tmId);
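For reference, the new branch expects a JSON document with the operation name, a language code and an array of sentences, and answers with a lemmatizedSentences array. A minimal client sketch (it assumes a running concordia-server on localhost:8800, the address hard-coded in fast-aligner/sentence_lemmatizer.py below; field names follow the handler above):

    #!/usr/bin/python3
    # Minimal lemmatizeAll client (sketch, not part of this commit).
    import json
    import requests

    data = {
        'operation': 'lemmatizeAll',
        'languageCode': 'pl',
        'sentences': ['ona poszła do sklepu', 'powiedziałem to Tomkowi']
    }
    response = requests.post('http://localhost:8800', data=json.dumps(data))
    response.encoding = 'utf-8'
    # The server answers {"lemmatizedSentences": [...]}, one entry per input.
    for sentence in json.loads(response.text)['lemmatizedSentences']:
        print(sentence)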
@@ -31,6 +31,8 @@

 #define ADD_SENTENCE_OP "addSentence"
 #define ADD_SENTENCES_OP "addSentences"
+#define LEMMATIZE_OP "lemmatize"
+#define LEMMATIZE_ALL_OP "lemmatizeAll"
 #define ADD_ALIGNED_SENTENCES_OP "addAlignedSentences"
 #define ADD_ALIGNED_LEMMATIZED_SENTENCES_OP "addAlignedLemmatizedSentences"
 #define ADD_REQUEST_OP "addRequest"
@@ -33,6 +33,15 @@ std::string LemmatizerFacade::lemmatizeSentence(std::string languageCode, std::s

 }

+
+std::vector<std::string> LemmatizerFacade::lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences) {
+    std::vector<std::string> result;
+    BOOST_FOREACH(std::string & sentence, sentences) {
+        result.push_back(lemmatizeSentence(languageCode, sentence));
+    }
+    return result;
+
+}

 std::string LemmatizerFacade::lemmatizeIfNeeded(std::string pattern, int tmId) {
     std::pair<bool, std::string> tmInfo = _tmDAO.getTmInfo(tmId);
     if (tmInfo.first) {
@@ -20,6 +20,8 @@ public:

     std::string lemmatizeSentence(std::string languageCode, std::string sentence);

+    std::vector<std::string> lemmatizeSentences(std::string languageCode, std::vector<std::string> sentences);
+
     std::string lemmatizeIfNeeded(std::string pattern, int tmId);

     std::vector<std::string> lemmatizeSentencesIfNeeded(std::vector<std::string> patterns, int tmId);
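LemmatizerFacade::lemmatizeSentences is deliberately simple: it just maps lemmatizeSentence over the batch, so the gain of lemmatizeAll is in transport rather than computation. A caller can now lemmatize hundreds of sentences in a single HTTP request instead of one request per sentence, which is exactly what fast-aligner/sentence_lemmatizer.py below does with batches of 500.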
fast-aligner/.gitignore (new file, vendored, 1 line)

corpora/
fast-aligner/Makefile (new file, 61 lines)

SRC_LANG=pl
TRG_LANG=en
CORPUS_NAME=opensubtitles_sample
SEPARATOR=@\#@

DICTIONARY_WEIGHT=3

all: corpora/$(CORPUS_NAME)/src.lem corpora/$(CORPUS_NAME)/trg.lem


clean-intermediate-files:
	rm -f corpora/$(CORPUS_NAME)/*.lem
	rm -f corpora/$(CORPUS_NAME)/*.tok
	rm -f corpora/$(CORPUS_NAME)/*.dict
	rm -f corpora/$(CORPUS_NAME)/*.classes
	rm -f corpora/$(CORPUS_NAME)/*.classes.cats
	rm -f corpora/$(CORPUS_NAME)/*.vcb
	rm -f corpora/$(CORPUS_NAME)/*.snt
	rm -f corpora/$(CORPUS_NAME)/*.cooc
	rm -f corpora/$(CORPUS_NAME)/aligned*part*
	rm -f corpora/$(CORPUS_NAME)/aligned.txt
	rm -f corpora/$(CORPUS_NAME)/giza.cfg
	rm -f corpora/$(CORPUS_NAME)/aligned.gizacfg
	rm -f corpora/$(CORPUS_NAME)/pasted.txt
	rm -f corpora/$(CORPUS_NAME)/pasted_deduplicated.txt
	rm -f corpora/$(CORPUS_NAME)/src_clean.txt
	rm -f corpora/$(CORPUS_NAME)/trg_clean.txt

clean: clean-intermediate-files
	rm -f corpora/$(CORPUS_NAME)/src_final.txt
	rm -f corpora/$(CORPUS_NAME)/trg_final.txt
	rm -f corpora/$(CORPUS_NAME)/aligned_final.txt


corpora/$(CORPUS_NAME)/src.dict:
	./collect_dict.py $(SRC_LANG) $(TRG_LANG) $(DICTIONARY_WEIGHT) > $@

corpora/$(CORPUS_NAME)/trg.dict:
	./collect_dict.py $(TRG_LANG) $(SRC_LANG) $(DICTIONARY_WEIGHT) > $@


corpora/$(CORPUS_NAME)/src.lem: corpora/$(CORPUS_NAME)/src.txt
	/usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< | ./sentence_lemmatizer.py $(SRC_LANG) > $@

corpora/$(CORPUS_NAME)/trg.lem: corpora/$(CORPUS_NAME)/trg.txt
	/usr/local/bin/concordia-sentence-tokenizer -c ../concordia.cfg < $< | ./sentence_lemmatizer.py $(TRG_LANG) > $@


corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok
	./clean_corpus.py corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok $(SEPARATOR)

corpora/$(CORPUS_NAME)/pasted.txt: corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt
	./paste.py corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt $(SEPARATOR) > $@

corpora/$(CORPUS_NAME)/pasted_deduplicated.txt: corpora/$(CORPUS_NAME)/pasted.txt
	sort -k 1.13 $< | uniq -s 12 | sort > $@

corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt: corpora/$(CORPUS_NAME)/pasted_deduplicated.txt corpora/$(CORPUS_NAME)/aligned.txt
	./extract.py $< corpora/$(CORPUS_NAME)/aligned.txt corpora/$(CORPUS_NAME)/aligned_final.txt corpora/$(CORPUS_NAME)/src_final.txt corpora/$(CORPUS_NAME)/trg_final.txt $(SEPARATOR)
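Two details of this Makefile deserve a note. SEPARATOR=@\#@ escapes the # with a backslash so make does not treat the rest of the line as a comment. The pasted_deduplicated.txt rule relies on fixed-width prefixes: sort -k 1.13 orders lines by the text starting at character 13, uniq -s 12 then drops adjacent duplicates while ignoring the first 12 characters, and the final sort restores ID order. This presumes that paste.py (not included in this commit) prefixes every pasted pair with a fixed 12-character line ID. Under that assumption, the step is equivalent to this sketch:

    #!/usr/bin/python3
    # Sketch of the sort | uniq -s 12 | sort deduplication, assuming each
    # line starts with a zero-padded 12-character ID followed by the pair.
    import sys

    seen = set()
    kept = []
    for line in sys.stdin:
        pair = line[12:]          # compare the sentence pair, not the ID
        if pair not in seen:
            seen.add(pair)
            kept.append(line)
    kept.sort()                   # restore ID (i.e. corpus) order
    sys.stdout.write(''.join(kept))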
fast-aligner/clean_corpus.py (new executable file, 25 lines)

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import sys

max_tokens = 100
max_ratio = 4.0

separator = sys.argv[9]

with open(sys.argv[1]) as src_file, open(sys.argv[2]) as trg_file, open(sys.argv[3]) as src_tok, open(sys.argv[4]) as trg_tok, open(sys.argv[5], 'w') as src_clean, open(sys.argv[6], 'w') as trg_clean, open(sys.argv[7], 'w') as src_clean_tok, open(sys.argv[8], 'w') as trg_clean_tok:
    for line in src_file:
        src_line_orig = line.strip()
        trg_line_orig = trg_file.readline().strip()
        src_line_tok = src_tok.readline().strip()
        trg_line_tok = trg_tok.readline().strip()
        src_token_count = len(src_line_tok.split())
        trg_token_count = len(trg_line_tok.split())
        if (src_token_count > 0 and trg_token_count > 0 and src_token_count <= max_tokens and trg_token_count <= max_tokens):
            ratio = float(src_token_count/trg_token_count) if src_token_count > trg_token_count else float(trg_token_count/src_token_count)
            if (ratio <= max_ratio):
                src_clean.write(src_line_orig+"\n")
                trg_clean.write(trg_line_orig+"\n")
                src_clean_tok.write(src_line_tok+"\n")
                trg_clean_tok.write(trg_line_tok+"\n")
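clean_corpus.py keeps a pair only when both tokenized sides are non-empty, no longer than max_tokens, and the longer side has at most max_ratio times as many tokens as the shorter one (true division is guaranteed by the python3 shebang). A toy check of the same predicate, for illustration:

    # Same filtering rule as above, on toy token counts.
    max_tokens = 100
    max_ratio = 4.0

    def keep(src_token_count, trg_token_count):
        if not (0 < src_token_count <= max_tokens and 0 < trg_token_count <= max_tokens):
            return False
        ratio = max(src_token_count, trg_token_count) / min(src_token_count, trg_token_count)
        return ratio <= max_ratio

    print(keep(8, 2))    # True: ratio 4.0 is still within max_ratio
    print(keep(9, 2))    # False: ratio 4.5 exceeds max_ratio
    print(keep(8, 0))    # False: empty target side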
fast-aligner/collect_dict.py (new executable file, 17 lines)

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import sys, os, bz2

src_lang = sys.argv[1]
trg_lang = sys.argv[2]
weight = int(sys.argv[3])

for dname in os.listdir('dictionaries'):
    src_path = 'dictionaries/%s/%s.bz2' % (dname, src_lang)
    trg_path = 'dictionaries/%s/%s.bz2' % (dname, trg_lang)
    if os.path.isfile(src_path) and os.path.isfile(trg_path):
        with bz2.open(src_path, 'rt') as src_dict_file:
            for line in src_dict_file:
                for i in range(weight):
                    print(line.strip())
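Worth noting: collect_dict.py emits one language side per run, so the Makefile's src.dict and trg.dict come from two independent walks of dictionaries/. The two outputs stay line-aligned only if each dictionary directory ships parallel <lang>.bz2 files with matching line order and os.listdir() returns entries in the same order on both runs (true for an unchanged directory, but an implicit assumption). Repeating every entry DICTIONARY_WEIGHT times presumably serves to weight the dictionary pairs more heavily in the downstream alignment step.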
fast-aligner/sentence_lemmatizer.py (new executable file, 37 lines)

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import json
import requests
import sys

BUFFER_SIZE = 500

def lemmatize_sentences(language_code, sentences):
    data = {
        'operation': 'lemmatizeAll',
        'languageCode': language_code,
        'sentences': sentences
    }

    address = 'http://localhost:8800'

    response = requests.post(address, data=json.dumps(data))
    response.encoding = 'utf-8'

    response_json = json.loads(response.text)
    return '\n'.join(response_json['lemmatizedSentences'])


language_code = sys.argv[1]
sentences_buffer = []
for line in sys.stdin:
    sentences_buffer.append(line.rstrip())
    if len(sentences_buffer) == BUFFER_SIZE:
        print(lemmatize_sentences(language_code, sentences_buffer))
        sentences_buffer = []

if len(sentences_buffer) > 0:
    print(lemmatize_sentences(language_code, sentences_buffer))
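The script batches stdin into groups of BUFFER_SIZE = 500 sentences per lemmatizeAll request and flushes the final partial batch after the loop, so lemmatizing a corpus costs one HTTP round trip per 500 lines rather than one per line. The same buffering pattern as a reusable generator, for illustration only (not part of the commit):

    def batches(lines, size):
        # Yield fixed-size batches of stripped lines, then the remainder.
        buf = []
        for line in lines:
            buf.append(line.rstrip())
            if len(buf) == size:
                yield buf
                buf = []
        if buf:
            yield buf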
tests/lemmatizeSentences.py (new executable file, 29 lines)

#!/usr/bin/python
# -*- coding: utf-8 -*-

import unittest
import json
import urllib2
import sys
import time
import host

data = {
    'operation': 'lemmatizeAll',
    'languageCode': sys.argv[1],
    'sentences': ["ona poszła do sklepu", "powiedziałem to Tomkowi"]
}

address = 'http://'+host.concordia_host
if len(host.concordia_port) > 0:
    address += ':'+host.concordia_port

start = time.time()
req = urllib2.Request(address)
req.add_header('Content-Type', 'application/json')
response = json.loads(urllib2.urlopen(req, json.dumps(data)).read())
end = time.time()

print "Execution time: %.4f seconds." % (end-start)
print "Result: "
print response
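Like the other scripts under tests/, this one is Python 2 (urllib2, print statements), whereas the new fast-aligner/ scripts target python3. Should the test suite migrate, a rough Python 3 equivalent of the same request would look like this (a sketch; host.concordia_host and host.concordia_port as above):

    #!/usr/bin/python3
    # Python 3 version of the lemmatizeAll test request (sketch).
    import json
    import sys
    import time
    import urllib.request

    import host  # provides concordia_host / concordia_port, as above

    data = {
        'operation': 'lemmatizeAll',
        'languageCode': sys.argv[1],
        'sentences': ["ona poszła do sklepu", "powiedziałem to Tomkowi"]
    }

    address = 'http://' + host.concordia_host
    if len(host.concordia_port) > 0:
        address += ':' + host.concordia_port

    start = time.time()
    req = urllib.request.Request(address,
                                 data=json.dumps(data).encode('utf-8'),
                                 headers={'Content-Type': 'application/json'})
    response = json.loads(urllib.request.urlopen(req).read().decode('utf-8'))
    end = time.time()

    print("Execution time: %.4f seconds." % (end - start))
    print("Result: ")
    print(response)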