lemmatizer fixed

This commit is contained in:
Rafał Jaworski 2018-12-31 11:13:16 +01:00
parent dea4308618
commit c800fa7b57
10 changed files with 73 additions and 2 deletions

View File

@ -54,6 +54,12 @@ namespace LemmaGenSockets
private string lemmatizeWord(string languageCode, string word)
{
// exceptions
if (word.StartsWith("ne_"))
{
return word;
}
Dictionary<String, HashSet<String>> exceptions = new Dictionary<string, HashSet<string>>();
HashSet<String> plExceptions = new HashSet<string>();
@ -76,6 +82,7 @@ namespace LemmaGenSockets
}
string result = "";
string[] parts = word.Split(wordInnerSeparator);
if (parts.Length == 2)
{
@ -85,11 +92,20 @@ namespace LemmaGenSockets
firstPart = lemmatizersDict[languageCode].Lemmatize(firstPart);
}
string secondPart = lemmatizersDict[languageCode].Lemmatize(parts[1]);
return firstPart + "-" + secondPart;
result = firstPart + "-" + secondPart;
}
else
{
return lemmatizersDict[languageCode].Lemmatize(word);
result = lemmatizersDict[languageCode].Lemmatize(word);
}
if (result == "")
{
return word;
}
else
{
return result;
}
}

2
tests/lemmatizer-test/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
differences.log
corpora/

10
tests/lemmatizer-test/test.sh Executable file
View File

@ -0,0 +1,10 @@
#!/bin/bash
# Regression-test the lemmatizer server over every corpus/language pair.
# For each corpus file corpora/<letter>_<lang>.txt, test_corpus.py prints any
# sentence whose lemmatized form has a different token count; all reports are
# appended to differences.log.
# Rewritten as loops to remove eight near-identical copy-pasted lines; the
# invocation order (en A-D, then fr A-D) is unchanged.
for lang in en fr; do
    for letter in A B C D; do
        ./test_corpus.py "corpora/${letter}_${lang}.txt" "$lang" >> differences.log
    done
done

View File

@ -0,0 +1,36 @@
#!/usr/bin/python3
import unittest
import json
import requests
import sys
def lemmatizeSentence(lang, sentence):
    """Lemmatize a sentence via the local lemmatizer service.

    Sends a JSON 'lemmatize' request to the server on localhost:8800 and
    returns the 'lemmatizedSentence' field of the JSON reply.
    NOTE(review): no HTTP/JSON error handling — assumes the service is up
    and always answers with the expected schema.
    """
    address = 'http://localhost:8800'
    payload = json.dumps({
        'operation': 'lemmatize',
        'languageCode': lang,
        'sentence': sentence,
    })
    reply = requests.post(address, data=payload)
    return reply.json()['lemmatizedSentence']
# CLI: test_corpus.py <corpus_file> <language_code>
# Streams the corpus line by line, lemmatizes each sentence via the local
# service, and prints (to stdout) every sentence whose lemmatized form has a
# different whitespace-token count than the original. Progress is reported to
# stderr every 1000 lines.
corpus_file_path = sys.argv[1]
lang = sys.argv[2]

with open(corpus_file_path) as corpus_file:
    for line_count, line in enumerate(corpus_file, start=1):
        original = line.rstrip()
        result = lemmatizeSentence(lang, original)
        # Token-count mismatch means the lemmatizer dropped or merged words.
        if len(original.split()) != len(result.split()):
            print("Different length in:")
            print(original)
            print(result)
        if line_count % 1000 == 0:
            sys.stderr.write("Done %d lines\n" % line_count)

View File

@ -0,0 +1,7 @@
#!/bin/bash
# Pre-tokenize every cleaned corpus file with concordia-sentence-tokenizer,
# writing the result to corpora/<basename>.
# Fixes: iterate the glob directly instead of parsing `ls` output (which
# breaks on whitespace/special characters in filenames — ShellCheck SC2045),
# use $(...) instead of backticks, and quote all expansions.
for corpus_file in /mnt/storage/rjawor_storage/copycat_corpus/cleaned/*txt
do
    a=$(basename "$corpus_file")
    concordia-sentence-tokenizer -c /home/rjawor/concordia-server/concordia.cfg < "$corpus_file" > "corpora/$a"
done