corpus compilator
This commit is contained in:
parent
8cf2911c72
commit
966a6530be
2
.gitignore
vendored
2
.gitignore
vendored
@ -36,3 +36,5 @@ mgiza-aligner/mgiza/mgizapp/src/mkcls/Makefile
|
|||||||
mgiza-aligner/mgiza/mgizapp/src/mkcls/cmake_install.cmake
|
mgiza-aligner/mgiza/mgizapp/src/mkcls/cmake_install.cmake
|
||||||
__pycache__
|
__pycache__
|
||||||
import-requests/request_handler.log
|
import-requests/request_handler.log
|
||||||
|
mgiza-aligner/corpus-compilator/corpora
|
||||||
|
mgiza-aligner/corpus-compilator/dictionaries/*lem
|
||||||
|
55
mgiza-aligner/corpus-compilator/Makefile
Normal file
55
mgiza-aligner/corpus-compilator/Makefile
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
# Corpus compilator pipeline.
#
# Starting from corpora/<CORPUS_NAME>/{src,trg}.txt and
# dictionaries/<DICTIONARY_NAME>.txt this makefile tokenizes and cleans the
# corpus, lemmatizes the source side, uploads it to the local Solr core,
# looks up every dictionary phrase there and finally writes
# {src,trg}_filtered.txt with the sentence pairs that were hit.
#
# Override settings on the command line, e.g.
#   make CORPUS_NAME=my_corpus DICTIONARY_NAME=my_dictionary

SRC_LANG := pl
TRG_LANG := en
CORPUS_NAME := europarl_sample
DICTIONARY_NAME := classyf_popular_medicine
# "\#" keeps make from treating the hash as a comment start; the value is @#@.
SEPARATOR := @\#@
CORPUS_CHUNK_SIZE := 100000

# Derived paths and external tools.
CORPUS_DIR := corpora/$(CORPUS_NAME)
CSV_DIR := $(CORPUS_DIR)/csv
DICTIONARY := dictionaries/$(DICTIONARY_NAME)
LEMMATIZER := mono ../LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe
TOKENIZER := /usr/local/bin/concordia-sentence-tokenizer -c ../../concordia.cfg

# Stamp files record that the two steps without a single output file
# (splitting into chunks, uploading to Solr) are up to date, so that they are
# not repeated on every run.
SPLIT_STAMP := $(CORPUS_DIR)/.split-corpus.stamp
INDEX_STAMP := $(CORPUS_DIR)/.index-corpus.stamp

# Remove a target whose recipe failed, so a truncated "> $@" output is never
# mistaken for an up-to-date file.
.DELETE_ON_ERROR:

.PHONY: all clean split-corpus index-corpus

# Recipe for the secondary outputs of a command that writes several files.
# The command itself is attached to the first output only; every other output
# depends on that one, which keeps the command from running twice under -j.
# The touch makes the secondary output newer than the primary one so that it
# is not considered stale on the next run.
define confirm-sibling-output
@test -f $@ || { echo "$@ is missing; delete $< and run make again" >&2; exit 1; }
@touch $@
endef

all: $(CORPUS_DIR)/src_filtered.txt $(CORPUS_DIR)/trg_filtered.txt

# compile.py takes: src_clean trg_clean corpus_lines src_filtered trg_filtered
# ($^ expands to the first three in exactly that order).
$(CORPUS_DIR)/src_filtered.txt: $(CORPUS_DIR)/src_clean.txt $(CORPUS_DIR)/trg_clean.txt $(CORPUS_DIR)/corpus_lines.txt
	./compile.py $^ $@ $(CORPUS_DIR)/trg_filtered.txt

$(CORPUS_DIR)/trg_filtered.txt: $(CORPUS_DIR)/src_filtered.txt
	$(confirm-sibling-output)

# Also writes report.txt (second argument) as a side output.
$(CORPUS_DIR)/corpus_lines.txt: $(INDEX_STAMP) $(DICTIONARY).lem
	./get_corpus_lines.py $(DICTIONARY).lem $(CORPUS_DIR)/report.txt > $@

# Convenience aliases for the stamp-tracked steps. They only do work when the
# step is out of date; run "make clean" first to force a fresh upload.
index-corpus: $(INDEX_STAMP)

split-corpus: $(SPLIT_STAMP)

$(INDEX_STAMP): $(SPLIT_STAMP)
	./load_corpus.sh $(CSV_DIR)/
	touch $@

# The chunk directory is recreated from scratch so that chunks left over from
# a previous, larger split are not uploaded again.
$(SPLIT_STAMP): $(CORPUS_DIR)/src.csv
	rm -rf $(CSV_DIR)
	mkdir -p $(CSV_DIR)
	split -l $(CORPUS_CHUNK_SIZE) -d --additional-suffix=".csv" $< $(CSV_DIR)/src
	touch $@

$(CORPUS_DIR)/src.csv: $(CORPUS_DIR)/src_clean.lem
	./lem2csv.py $< > $@

$(DICTIONARY).lem: $(DICTIONARY).txt
	$(LEMMATIZER) $(SRC_LANG) < $< | sort -u > $@

$(CORPUS_DIR)/trg_clean.lem: $(CORPUS_DIR)/trg_clean.tok
	$(LEMMATIZER) $(TRG_LANG) < $< > $@

$(CORPUS_DIR)/src_clean.lem: $(CORPUS_DIR)/src_clean.tok
	$(LEMMATIZER) $(SRC_LANG) < $< > $@

# clean_corpus.py takes the four inputs ($^, in this order), then the four
# outputs, then the separator. The separator is quoted for the shell.
$(CORPUS_DIR)/src_clean.txt: $(CORPUS_DIR)/src.txt $(CORPUS_DIR)/trg.txt $(CORPUS_DIR)/src.tok $(CORPUS_DIR)/trg.tok
	../clean_corpus.py $^ $@ $(CORPUS_DIR)/trg_clean.txt $(CORPUS_DIR)/src_clean.tok $(CORPUS_DIR)/trg_clean.tok '$(SEPARATOR)'

$(CORPUS_DIR)/trg_clean.txt $(CORPUS_DIR)/src_clean.tok $(CORPUS_DIR)/trg_clean.tok: $(CORPUS_DIR)/src_clean.txt
	$(confirm-sibling-output)

$(CORPUS_DIR)/src.tok $(CORPUS_DIR)/trg.tok: %.tok: %.txt
	$(TOKENIZER) < $< > $@

# Removes the intermediate files and empties the Solr core. The final
# *_filtered.txt files are left in place, as before.
clean:
	rm -f $(CORPUS_DIR)/report.txt
	./clear_solr_index.sh
	rm -f $(SPLIT_STAMP) $(INDEX_STAMP)
	rm -rf $(CSV_DIR)
	rm -f $(CORPUS_DIR)/src.csv
	rm -f $(CORPUS_DIR)/corpus_lines.txt
	rm -f $(DICTIONARY).lem
	rm -f $(CORPUS_DIR)/*.lem
	rm -f $(CORPUS_DIR)/*.tok
	rm -f $(CORPUS_DIR)/src_clean.txt
	rm -f $(CORPUS_DIR)/trg_clean.txt
|
5
mgiza-aligner/corpus-compilator/clear_solr_index.sh
Executable file
5
mgiza-aligner/corpus-compilator/clear_solr_index.sh
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
# Delete every document from the local "corpus_compiler" Solr core and commit.
SOLR_UPDATE_URL="http://localhost:8983/solr/corpus_compiler/update"
DELETE_ALL_BODY="<delete><query>*:*</query></delete>"

printf '%s\n' "Clearing solr index"

curl "${SOLR_UPDATE_URL}?stream.body=${DELETE_ALL_BODY}&commit=true"
|
17
mgiza-aligner/corpus-compilator/compile.py
Executable file
17
mgiza-aligner/corpus-compilator/compile.py
Executable file
@ -0,0 +1,17 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def filter_parallel_corpus(src_clean_path, trg_clean_path, corpus_lines_path,
                           src_filtered_path, trg_filtered_path):
    """Copy the selected sentence pairs of a parallel corpus to new files.

    ``corpus_lines_path`` lists 1-based line numbers, one per line, in
    strictly increasing order. For each listed number the corresponding line
    of ``src_clean_path`` and of ``trg_clean_path`` is appended to
    ``src_filtered_path`` and ``trg_filtered_path`` respectively. Blank lines
    in the number list are ignored. All files are handled as UTF-8.

    Raises:
        ValueError: if an entry is not an integer, or if the numbers are not
            strictly increasing (the inputs are read in a single forward
            pass, so an earlier line can no longer be fetched).
    """
    with open(src_clean_path, encoding='utf-8') as src_clean, \
         open(trg_clean_path, encoding='utf-8') as trg_clean, \
         open(corpus_lines_path, encoding='utf-8') as corpus_lines, \
         open(src_filtered_path, 'w', encoding='utf-8') as src_filtered, \
         open(trg_filtered_path, 'w', encoding='utf-8') as trg_filtered:
        # Number of the line that the next readline() call will return.
        index = 1
        for corpus_line_raw in corpus_lines:
            if not corpus_line_raw.strip():
                continue
            corpus_line = int(corpus_line_raw)
            if corpus_line < index:
                raise ValueError(
                    "line numbers must be strictly increasing and start at 1, "
                    "got %d when expecting at least %d" % (corpus_line, index))
            # Skip both sides in lockstep up to the requested line.
            while index < corpus_line:
                src_clean.readline()
                trg_clean.readline()
                index += 1
            src_filtered.write(src_clean.readline())
            trg_filtered.write(trg_clean.readline())
            index += 1


if __name__ == '__main__':
    if len(sys.argv) != 6:
        sys.stderr.write("usage: %s SRC_CLEAN TRG_CLEAN CORPUS_LINES "
                         "SRC_FILTERED TRG_FILTERED\n" % sys.argv[0])
        sys.exit(2)
    filter_parallel_corpus(*sys.argv[1:6])
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
2
mgiza-aligner/corpus-compilator/dictionaries/test.txt
Normal file
2
mgiza-aligner/corpus-compilator/dictionaries/test.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
parlamentu europejskiego
|
||||||
|
protokół
|
39
mgiza-aligner/corpus-compilator/get_corpus_lines.py
Executable file
39
mgiza-aligner/corpus-compilator/get_corpus_lines.py
Executable file
@ -0,0 +1,39 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import sys, json, requests
|
||||||
|
|
||||||
|
SOLR_SELECT_URL = 'http://localhost:8983/solr/corpus_compiler/select'
# Value of the "rows" parameter: at most this many documents are returned
# per dictionary phrase.
MAX_ROWS = 100000


def _phrase_query(word):
    """Return ``word`` as a double-quoted Solr phrase query.

    Backslashes and double quotes inside the phrase are backslash-escaped so
    that they cannot terminate the quoted phrase early.
    """
    escaped = word.replace('\\', '\\\\').replace('"', '\\"')
    return '"' + escaped + '"'


def _find_lines(word):
    """Query Solr for ``word``; return ``(numFound, [line_number, ...])``.

    The parameters are handed to ``requests`` as a dict so that characters
    such as ``&``, ``#`` or ``+`` in the phrase are URL-encoded instead of
    altering the query string. Connection errors propagate to the caller.

    Raises:
        ValueError, KeyError, TypeError: if the body is not the expected
            JSON structure (for example an error response from Solr).
    """
    response = requests.get(SOLR_SELECT_URL,
                            params={'q': _phrase_query(word),
                                    'rows': MAX_ROWS,
                                    'wt': 'json'})
    result = json.loads(response.content.decode('utf-8'))['response']
    return result['numFound'], [doc['line_number'] for doc in result['docs']]


def main(dictionary_path, report_path):
    """Look up every dictionary phrase in Solr.

    Prints the sorted, de-duplicated line numbers of all matching documents
    to stdout and writes a tab-separated ``phrase<TAB>hit count`` report,
    most frequent phrase first, to ``report_path``. Progress and warnings go
    to stderr so that stdout stays a clean list of numbers.
    """
    dictionary_frequencies = []
    filtered_corpus_lines = set()

    with open(dictionary_path, encoding='utf-8') as dictionary:
        for index, line in enumerate(dictionary, start=1):
            word = line.rstrip()
            # A blank dictionary line would turn into an empty phrase query.
            if word:
                try:
                    num_found, line_numbers = _find_lines(word)
                except (ValueError, KeyError, TypeError) as error:
                    # Keep going, as before, but no longer silently.
                    sys.stderr.write("Skipping %r: unexpected Solr response (%s: %s)\n"
                                     % (word, type(error).__name__, error))
                else:
                    if num_found > 0:
                        dictionary_frequencies.append((word, num_found))
                        filtered_corpus_lines.update(line_numbers)
                    if num_found > len(line_numbers):
                        sys.stderr.write("Warning: %r matched %d lines but only %d were returned\n"
                                         % (word, num_found, len(line_numbers)))
            if index % 100 == 0:
                sys.stderr.write("Done %d dictionary words. Current corpus size: %d\n" % (index, len(filtered_corpus_lines)))

    for number in sorted(filtered_corpus_lines):
        print(str(number))

    with open(report_path, 'w', encoding='utf-8') as report:
        for entry in sorted(dictionary_frequencies, key=lambda x: -x[1]):
            report.write("%s\t%d\n" % entry)


if __name__ == '__main__':
    if len(sys.argv) != 3:
        sys.stderr.write("usage: %s DICTIONARY_LEM REPORT_PATH\n" % sys.argv[0])
        sys.exit(2)
    main(sys.argv[1], sys.argv[2])
|
10
mgiza-aligner/corpus-compilator/lem2csv.py
Executable file
10
mgiza-aligner/corpus-compilator/lem2csv.py
Executable file
@ -0,0 +1,10 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Prefix every line of the file named by the first argument with its 1-based
# line number and a tab, and print the result to stdout.
with open(sys.argv[1]) as corpus:
    for line_number, text in enumerate(corpus, start=1):
        print("%d\t%s" % (line_number, text.rstrip()))
|
15
mgiza-aligner/corpus-compilator/load_corpus.sh
Executable file
15
mgiza-aligner/corpus-compilator/load_corpus.sh
Executable file
@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
# Upload every file of a folder to the local "corpus_compiler" Solr core as
# tab-separated CSV with the columns line_number and content.
#
# Usage: load_corpus.sh <corpus-folder>
# The folder may be given with or without a trailing slash.

CORPUS_FOLDER=${1:?usage: load_corpus.sh <corpus-folder>}
CORPUS_FOLDER=${CORPUS_FOLDER%/}

# Collect the files with a glob instead of parsing `ls` output, so that
# unusual file names survive intact.
set -- "$CORPUS_FOLDER"/*
if [ ! -e "$1" ]; then
    # Nothing matched: the pattern was left unexpanded.
    set --
fi

CHUNKS_COUNT=$#

echo "Total file count" $CHUNKS_COUNT
INDEX=0
for FILE in "$@"
do
    INDEX=$((INDEX + 1))
    echo "Working on file" $INDEX "of" $CHUNKS_COUNT
    # --fail turns an HTTP error status into a non-zero exit code; stop at
    # the first failed upload instead of leaving a partly loaded index.
    curl --fail -X POST --data-binary "@$FILE" -H 'Content-type:application/csv' \
        'http://localhost:8983/solr/corpus_compiler/update?commit=true&optimize=true&separator=%09&fieldnames=line_number,content' \
        || { echo "Upload of $FILE failed" >&2; exit 1; }
done
|
12
mgiza-aligner/corpus-compilator/setup_solr.sh
Executable file
12
mgiza-aligner/corpus-compilator/setup_solr.sh
Executable file
@ -0,0 +1,12 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
|
||||||
|
# Restart Solr, create the "corpus_compiler" core and add the two fields the
# corpus upload uses (content, line_number).
#
# The Solr installation directory defaults to the original author's path;
# set SOLR_INSTALL_DIR in the environment to point at another installation.
SOLR_HOME=${SOLR_INSTALL_DIR:-/home/rafalj/programs/solr-6.0.0}

"$SOLR_HOME/bin/solr" restart
"$SOLR_HOME/bin/solr" create -c corpus_compiler

curl -X POST -H 'Content-type:application/json' http://localhost:8983/solr/corpus_compiler/schema -d '{
"add-field":{ "name":"content", "type":"text_general"},
"add-field":{ "name":"line_number", "type":"int"}
}'
|
Loading…
Reference in New Issue
Block a user