corpus compilator

This commit is contained in:
rjawor 2017-07-21 17:37:00 +02:00
parent 8cf2911c72
commit 966a6530be
11 changed files with 16721 additions and 0 deletions

2
.gitignore vendored
View File

@ -36,3 +36,5 @@ mgiza-aligner/mgiza/mgizapp/src/mkcls/Makefile
mgiza-aligner/mgiza/mgizapp/src/mkcls/cmake_install.cmake
__pycache__
import-requests/request_handler.log
mgiza-aligner/corpus-compilator/corpora
mgiza-aligner/corpus-compilator/dictionaries/*lem

View File

@ -0,0 +1,55 @@
# Build settings. Any of them can be overridden on the command line,
# e.g. `make CORPUS_NAME=my_corpus`.

# Language codes handed to the lemmatizer for the source and target side.
SRC_LANG=pl
TRG_LANG=en
# Sub-directory of corpora/ that holds the input files src.txt and trg.txt.
CORPUS_NAME=europarl_sample
# dictionaries/$(DICTIONARY_NAME).txt is the dictionary that gets lemmatized
# and used to select corpus lines.
DICTIONARY_NAME=classyf_popular_medicine
# Passed as the last argument to clean_corpus.py. The backslash keeps make
# from treating '#' as the start of a comment; the value is the text @#@.
SEPARATOR=@\#@
# Number of lines per CSV chunk produced by `split`.
CORPUS_CHUNK_SIZE=100000

# Many recipes create their target through `> $@`. Without this, a failing
# command leaves a truncated file behind that make considers up to date.
.DELETE_ON_ERROR:
# Default goal: build the filtered source/target corpus pair.
# `all` is a command name, not a file, so it must be phony.
.PHONY: all
all: corpora/$(CORPUS_NAME)/src_filtered.txt corpora/$(CORPUS_NAME)/trg_filtered.txt

# compile.py writes BOTH filtered files in a single run. Only the source-side
# file carries the recipe, so a parallel make (-j) cannot start compile.py
# twice on the same outputs.
corpora/$(CORPUS_NAME)/src_filtered.txt: corpora/$(CORPUS_NAME)/src_clean.txt corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/corpus_lines.txt
	./compile.py $^ $@ corpora/$(CORPUS_NAME)/trg_filtered.txt

# The target-side file is a by-product of the rule above; this recipe only
# verifies that it really exists so a missing file is reported loudly.
corpora/$(CORPUS_NAME)/trg_filtered.txt: corpora/$(CORPUS_NAME)/src_filtered.txt
	@test -f $@
# Produce the list of corpus line numbers selected by the lemmatized
# dictionary. get_corpus_lines.py prints the numbers on stdout (captured into
# the target) and writes a frequency report to report.txt next to the target.
# index-corpus must have run first because the script queries the Solr index.
corpora/$(CORPUS_NAME)/corpus_lines.txt: index-corpus dictionaries/$(DICTIONARY_NAME).lem
	./get_corpus_lines.py $(word 2,$^) $(@D)/report.txt > $@
# Upload every CSV chunk to Solr. This target never creates a file called
# `index-corpus`, so declare it phony; otherwise a stray file of that name
# would silently disable indexing.
.PHONY: index-corpus
index-corpus: split-corpus
	./load_corpus.sh corpora/$(CORPUS_NAME)/csv/
# Cut src.csv into chunks of $(CORPUS_CHUNK_SIZE) lines named
# csv/srcNN.csv. The target is a command, not a file, hence phony.
.PHONY: split-corpus
split-corpus: corpora/$(CORPUS_NAME)/src.csv
	# -p: the target re-runs on every build, plain mkdir would fail the second time.
	mkdir -p corpora/$(CORPUS_NAME)/csv
	# Drop chunks from a previous split so leftovers are not indexed again.
	rm -f corpora/$(CORPUS_NAME)/csv/src*.csv
	split -l $(CORPUS_CHUNK_SIZE) -d --additional-suffix=".csv" $< corpora/$(CORPUS_NAME)/csv/src
# Lemmatizer invocation shared by the rules below; it takes the language code
# as its argument, reads text on stdin and writes the result to stdout.
LEMMATIZER := mono ../LemmaGenSentenceLemmatizer/LemmaGenSentenceLemmatizer/bin/Debug/LemmaGenSentenceLemmatizer.exe

# Number the lemmatized source lines (tab-separated) for CSV import.
corpora/$(CORPUS_NAME)/src.csv: corpora/$(CORPUS_NAME)/src_clean.lem
	./lem2csv.py $< > $@

# Lemmatize the dictionary in the source language; sort and de-duplicate it.
dictionaries/$(DICTIONARY_NAME).lem: dictionaries/$(DICTIONARY_NAME).txt
	$(LEMMATIZER) $(SRC_LANG) < $< | sort -u > $@

# Lemmatize the cleaned, tokenized target side.
corpora/$(CORPUS_NAME)/trg_clean.lem: corpora/$(CORPUS_NAME)/trg_clean.tok
	$(LEMMATIZER) $(TRG_LANG) < $< > $@

# Lemmatize the cleaned, tokenized source side.
corpora/$(CORPUS_NAME)/src_clean.lem: corpora/$(CORPUS_NAME)/src_clean.tok
	$(LEMMATIZER) $(SRC_LANG) < $< > $@
# clean_corpus.py reads the raw and tokenized corpus pair and writes all four
# *_clean files in one run. Only src_clean.txt carries the recipe, so a
# parallel make (-j) cannot launch the script several times at once.
corpora/$(CORPUS_NAME)/src_clean.txt: corpora/$(CORPUS_NAME)/src.txt corpora/$(CORPUS_NAME)/trg.txt corpora/$(CORPUS_NAME)/src.tok corpora/$(CORPUS_NAME)/trg.tok
	../clean_corpus.py $^ $@ corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok '$(SEPARATOR)'

# The remaining outputs are by-products of the rule above; the recipe only
# checks that each one exists so that a missing file fails the build.
corpora/$(CORPUS_NAME)/trg_clean.txt corpora/$(CORPUS_NAME)/src_clean.tok corpora/$(CORPUS_NAME)/trg_clean.tok: corpora/$(CORPUS_NAME)/src_clean.txt
	@test -f $@
# Tokenize each side of the raw corpus: <side>.txt -> <side>.tok.
# A static pattern rule covers both sides with one recipe.
corpora/$(CORPUS_NAME)/trg.tok corpora/$(CORPUS_NAME)/src.tok: corpora/$(CORPUS_NAME)/%.tok: corpora/$(CORPUS_NAME)/%.txt
	/usr/local/bin/concordia-sentence-tokenizer -c ../../concordia.cfg < $< > $@
# Remove intermediate build products and empty the Solr index.
# The raw inputs (src.txt, trg.txt, the dictionary .txt) and the final
# *_filtered.txt files are left untouched.
# Phony: without this, a file named `clean` would make the target a no-op.
.PHONY: clean
clean:
	rm -f corpora/$(CORPUS_NAME)/report.txt
	./clear_solr_index.sh
	rm -rf corpora/$(CORPUS_NAME)/csv
	rm -f corpora/$(CORPUS_NAME)/src.csv
	rm -f corpora/$(CORPUS_NAME)/corpus_lines.txt
	rm -f dictionaries/$(DICTIONARY_NAME).lem
	rm -f corpora/$(CORPUS_NAME)/*.lem
	rm -f corpora/$(CORPUS_NAME)/*.tok
	rm -f corpora/$(CORPUS_NAME)/src_clean.txt
	rm -f corpora/$(CORPUS_NAME)/trg_clean.txt

View File

@ -0,0 +1,5 @@
#!/bin/sh
# Delete every document from the local `corpus_compiler` Solr core by sending
# a delete-all query through the update handler and committing immediately.
DELETE_ALL_URL="http://localhost:8983/solr/corpus_compiler/update?stream.body=<delete><query>*:*</query></delete>&commit=true"

echo "Clearing solr index"
curl "$DELETE_ALL_URL"

View File

@ -0,0 +1,17 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
# Copy the selected sentence pairs out of the cleaned corpus.
#
# Arguments (all file paths):
#   1: cleaned source corpus      2: cleaned target corpus
#   3: line numbers to keep, one integer per line (1-based)
#   4: output source file         5: output target file
#
# The corpus files are read strictly forward, so the line numbers are
# expected in ascending order.
with open(sys.argv[1]) as src_clean, \
        open(sys.argv[2]) as trg_clean, \
        open(sys.argv[3]) as corpus_lines, \
        open(sys.argv[4], 'w') as src_filtered, \
        open(sys.argv[5], 'w') as trg_filtered:
    # 1-based number of the line that the next readline() will return.
    next_line_no = 1
    for raw_number in corpus_lines:
        wanted = int(raw_number)
        # Discard the lines in front of the wanted one on both sides.
        for _ in range(next_line_no, wanted):
            src_clean.readline()
            trg_clean.readline()
        next_line_no = max(next_line_no, wanted)
        # Emit the wanted pair and step past it.
        src_filtered.write(src_clean.readline())
        trg_filtered.write(trg_clean.readline())
        next_line_no += 1

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,2 @@
parlamentu europejskiego
protokół

View File

@ -0,0 +1,39 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys, json, requests
# Query the local Solr index with every dictionary entry (as a phrase query)
# and collect the line numbers of all matching corpus documents.
#
# Arguments:
#   1: path to the dictionary, one entry per line
#   2: path of the report to write ("<entry>\t<hit count>", most frequent first)
# Output: the matching line numbers, sorted, one per line on stdout.
dictionary_path = sys.argv[1]
report_path = sys.argv[2]

SOLR_SELECT_URL = 'http://localhost:8983/solr/corpus_compiler/select'

dictionary_frequencies = []
filtered_corpus_lines = set()

with open(dictionary_path) as dictionary:
    for index, line in enumerate(dictionary, start=1):
        word = line.rstrip()
        # Let requests URL-encode the query: entries containing '&', '#',
        # '+' etc. would otherwise corrupt a hand-concatenated URL.
        # NOTE: at most 100000 documents are returned per entry, even when
        # numFound is larger.
        response = requests.get(
            SOLR_SELECT_URL,
            params={'q': '"' + word + '"', 'rows': 100000, 'wt': 'json'})
        try:
            json_response = json.loads(response.content.decode('utf-8'))
            num_found = json_response['response']['numFound']
            if num_found > 0:
                dictionary_frequencies.append((word, num_found))
                for doc in json_response['response']['docs']:
                    filtered_corpus_lines.add(doc['line_number'])
        except (ValueError, KeyError, TypeError) as error:
            # Malformed or error response: keep going (best effort), but say
            # so instead of silently dropping the entry.
            sys.stderr.write("Skipping entry %r: unusable Solr response (%r)\n" % (word, error))

        if index % 100 == 0:
            sys.stderr.write("Done %d dictionary words. Current corpus size: %d\n" % (index, len(filtered_corpus_lines)))

for number in sorted(filtered_corpus_lines):
    print(str(number))

with open(report_path, 'w') as report:
    for entry in sorted(dictionary_frequencies, key=lambda x:-x[1]):
        report.write("%s\t%d\n" % entry)

View File

@ -0,0 +1,10 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
# Prefix every line of the file given as the first argument with its 1-based
# line number and a tab; trailing whitespace of each line is removed.
with open(sys.argv[1]) as corpus:
    for index, line in enumerate(corpus, start=1):
        print("%d\t%s" % (index, line.rstrip()))

View File

@ -0,0 +1,15 @@
#!/bin/sh
# Upload every file found in the given folder to the local `corpus_compiler`
# Solr core as tab-separated CSV with the columns line_number and content.
#
# Usage: load_corpus.sh <folder>   (a trailing slash is optional)

# Fall back to the current directory when no folder is given, then drop a
# trailing slash so paths can be joined with exactly one '/'.
CORPUS_FOLDER=${1:-.}
CORPUS_FOLDER=${CORPUS_FOLDER%/}

# Count the chunks with a glob instead of parsing `ls`, which breaks on
# file names containing whitespace.
CHUNKS_COUNT=0
for FILE in "$CORPUS_FOLDER"/*
do
    [ -e "$FILE" ] && CHUNKS_COUNT=$((CHUNKS_COUNT + 1))
done
echo "Total file count" $CHUNKS_COUNT

INDEX=0
for FILE in "$CORPUS_FOLDER"/*
do
    # An empty folder leaves the pattern unexpanded; skip that pseudo entry.
    [ -e "$FILE" ] || continue
    INDEX=$((INDEX + 1))
    echo "Working on file" $INDEX "of" $CHUNKS_COUNT
    curl -X POST --data-binary "@$FILE" -H 'Content-type:application/csv' \
        'http://localhost:8983/solr/corpus_compiler/update?commit=true&optimize=true&separator=%09&fieldnames=line_number,content'
done

View File

@ -0,0 +1,12 @@
#!/bin/sh
# Restart the local Solr installation, create the `corpus_compiler` core and
# add the two schema fields used by the corpus compiler.
SOLR_HOME=/home/rafalj/programs/solr-6.0.0
SOLR_BIN=$SOLR_HOME/bin/solr
SCHEMA_URL=http://localhost:8983/solr/corpus_compiler/schema

$SOLR_BIN restart
$SOLR_BIN create -c corpus_compiler

# Both add-field commands are sent in a single schema request.
curl -X POST -H 'Content-type:application/json' $SCHEMA_URL -d '{
"add-field":{ "name":"content", "type":"text_general"},
"add-field":{ "name":"line_number", "type":"int"}
}'