From 9f91ea15b09ca4c7b229d1399a8b7f6ff57a015d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Jaworski?= Date: Wed, 26 Jul 2017 13:29:22 +0200 Subject: [PATCH] minor changes --- cat/versions_available/tmrepository_enhr.cfg | 8 ++++++ cat/versions_enabled/stocznia_enpl.cfg | 1 - cat/versions_enabled/stocznia_plen.cfg | 1 - cat/versions_enabled/tmrepository_enhr.cfg | 1 + mgiza-aligner/corpus-compilator/Makefile | 2 +- mgiza-aligner/corpus-compilator/filter.sh | 12 ++++++++ mgiza-aligner/corpus-compilator/setup_solr.sh | 2 +- tests/addLemmatizedTM.sh | 6 ++-- tests/generateIndex.py | 28 +++++++++++++++++++ 9 files changed, 54 insertions(+), 7 deletions(-) create mode 100644 cat/versions_available/tmrepository_enhr.cfg delete mode 120000 cat/versions_enabled/stocznia_enpl.cfg delete mode 120000 cat/versions_enabled/stocznia_plen.cfg create mode 120000 cat/versions_enabled/tmrepository_enhr.cfg create mode 100755 mgiza-aligner/corpus-compilator/filter.sh create mode 100755 tests/generateIndex.py diff --git a/cat/versions_available/tmrepository_enhr.cfg b/cat/versions_available/tmrepository_enhr.cfg new file mode 100644 index 0000000..1af1e39 --- /dev/null +++ b/cat/versions_available/tmrepository_enhr.cfg @@ -0,0 +1,8 @@ +dir@#@tmrepository_enhr +concordia_host@#@concordia.vm.wmi.amu.edu.pl +concordia_port@#@8800 +tmid@#@1 +desc@#@Welcome to Concordia. The system finds the longest matches of the pattern sentence in its translation memory. This translation memory is over 1M sentences from the TMrepository system (http://concordia.vm.wmi.amu.edu.pl/tmrepository). Please enter an English sentence in the field below and press Enter (or use the search button). You can test the system on predefined samples, simply use the link "show/hide samples" and apply one of the sample sentences. After the search, click on the highlighted fragments to see their context. +enjoy@#@Enjoy your work with the system!
+prompt@#@Enter search pattern (English sentence): +suggestion@#@BiHs Komsic resigns from his party diff --git a/cat/versions_enabled/stocznia_enpl.cfg b/cat/versions_enabled/stocznia_enpl.cfg deleted file mode 120000 index 884dd56..0000000 --- a/cat/versions_enabled/stocznia_enpl.cfg +++ /dev/null @@ -1 +0,0 @@ -../versions_available/stocznia_enpl.cfg \ No newline at end of file diff --git a/cat/versions_enabled/stocznia_plen.cfg b/cat/versions_enabled/stocznia_plen.cfg deleted file mode 120000 index 0ba3868..0000000 --- a/cat/versions_enabled/stocznia_plen.cfg +++ /dev/null @@ -1 +0,0 @@ -../versions_available/stocznia_plen.cfg \ No newline at end of file diff --git a/cat/versions_enabled/tmrepository_enhr.cfg b/cat/versions_enabled/tmrepository_enhr.cfg new file mode 120000 index 0000000..de131be --- /dev/null +++ b/cat/versions_enabled/tmrepository_enhr.cfg @@ -0,0 +1 @@ +../versions_available/tmrepository_enhr.cfg \ No newline at end of file diff --git a/mgiza-aligner/corpus-compilator/Makefile b/mgiza-aligner/corpus-compilator/Makefile index 942e7f7..59b6c52 100644 --- a/mgiza-aligner/corpus-compilator/Makefile +++ b/mgiza-aligner/corpus-compilator/Makefile @@ -1,6 +1,6 @@ SRC_LANG=pl TRG_LANG=en -CORPUS_NAME=europarl_sample +CORPUS_NAME=opus DICTIONARY_NAME=classyf_popular_medicine SEPARATOR=@\#@ CORPUS_CHUNK_SIZE=100000 diff --git a/mgiza-aligner/corpus-compilator/filter.sh b/mgiza-aligner/corpus-compilator/filter.sh new file mode 100755 index 0000000..5640b13 --- /dev/null +++ b/mgiza-aligner/corpus-compilator/filter.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +DICTIONARY_NAME=classyf_popular_medicine +CORPUS_NAME=opus + +make clean-filtering +make dictionaries/$DICTIONARY_NAME.lem + +./get_corpus_lines.py dictionaries/$DICTIONARY_NAME.lem corpora/$CORPUS_NAME/report.txt > corpora/$CORPUS_NAME/corpus_lines.txt +./compile.py corpora/$CORPUS_NAME/src_clean.txt corpora/$CORPUS_NAME/trg_clean.txt corpora/$CORPUS_NAME/corpus_lines.txt corpora/$CORPUS_NAME/src_filtered.txt 
corpora/$CORPUS_NAME/trg_filtered.txt + + diff --git a/mgiza-aligner/corpus-compilator/setup_solr.sh b/mgiza-aligner/corpus-compilator/setup_solr.sh index c9a09ef..e599ab0 100755 --- a/mgiza-aligner/corpus-compilator/setup_solr.sh +++ b/mgiza-aligner/corpus-compilator/setup_solr.sh @@ -1,6 +1,6 @@ #!/bin/sh -SOLR_HOME=/home/rafalj/programs/solr-6.0.0 +SOLR_HOME=/home/rjawor/programs/solr-5.5.4 $SOLR_HOME/bin/solr restart $SOLR_HOME/bin/solr create -c corpus_compiler diff --git a/tests/addLemmatizedTM.sh b/tests/addLemmatizedTM.sh index 282f2c3..28dd2bb 100755 --- a/tests/addLemmatizedTM.sh +++ b/tests/addLemmatizedTM.sh @@ -1,7 +1,7 @@ #!/bin/sh -CORPUS_NAME="europarl_sample" -SRC_LANG_ID=1 -TRG_LANG_ID=2 +CORPUS_NAME="tmrepository_enhr" +SRC_LANG_ID=2 +TRG_LANG_ID=6 ./addAlignedLemmatizedTM.py $CORPUS_NAME ../mgiza-aligner/corpora/$CORPUS_NAME/src_final.txt $SRC_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/trg_final.txt $TRG_LANG_ID ../mgiza-aligner/corpora/$CORPUS_NAME/aligned_final.txt diff --git a/tests/generateIndex.py b/tests/generateIndex.py new file mode 100755 index 0000000..51e3f93 --- /dev/null +++ b/tests/generateIndex.py @@ -0,0 +1,28 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import unittest +import json +import urllib2 +import sys +import host +import time + +address = 'http://'+host.concordia_host +if len(host.concordia_port) > 0: + address += ':'+host.concordia_port + + + +print "Generating index..." +start = time.time() +data = { + 'operation': 'refreshIndex', + 'tmId' : 1 +} +req = urllib2.Request(address) +req.add_header('Content-Type', 'application/json') +urllib2.urlopen(req, json.dumps(data)).read() + +end = time.time() +print "Index regeneration complete. The operation took %.4f s" % (end - start)