diff --git a/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs b/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs index aaebab9..28afaf9 100644 --- a/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs +++ b/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs @@ -54,6 +54,12 @@ namespace LemmaGenSockets private string lemmatizeWord(string languageCode, string word) { // exceptions + if (word.StartsWith("ne_")) + { + return word; + } + + Dictionary<string, HashSet<string>> exceptions = new Dictionary<string, HashSet<string>>(); HashSet<string> plExceptions = new HashSet<string>(); @@ -76,6 +82,7 @@ namespace LemmaGenSockets } + string result = ""; string[] parts = word.Split(wordInnerSeparator); if (parts.Length == 2) { @@ -85,11 +92,20 @@ namespace LemmaGenSockets firstPart = lemmatizersDict[languageCode].Lemmatize(firstPart); } string secondPart = lemmatizersDict[languageCode].Lemmatize(parts[1]); - return firstPart + "-" + secondPart; + result = firstPart + "-" + secondPart; } else { - return lemmatizersDict[languageCode].Lemmatize(word); + result = lemmatizersDict[languageCode].Lemmatize(word); + } + + if (result == "") + { + return word; + } + else + { + return result; } } diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe index a3ee5a9..dfb6538 100644 Binary files a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe and b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe differ diff --git a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb index c8247b9..e85154f 100644 Binary files a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb and b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb differ diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache index 
08e454d..58d7cf5 100644 Binary files a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache and b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache differ diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe index a3ee5a9..dfb6538 100644 Binary files a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe and b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe differ diff --git a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb index c8247b9..e85154f 100644 Binary files a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb and b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb differ diff --git a/tests/lemmatizer-test/.gitignore b/tests/lemmatizer-test/.gitignore new file mode 100644 index 0000000..b87b494 --- /dev/null +++ b/tests/lemmatizer-test/.gitignore @@ -0,0 +1,2 @@ +differences.log +corpora/ diff --git a/tests/lemmatizer-test/test.sh b/tests/lemmatizer-test/test.sh new file mode 100755 index 0000000..1e4aa0b --- /dev/null +++ b/tests/lemmatizer-test/test.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +./test_corpus.py corpora/A_en.txt en >> differences.log +./test_corpus.py corpora/B_en.txt en >> differences.log +./test_corpus.py corpora/C_en.txt en >> differences.log +./test_corpus.py corpora/D_en.txt en >> differences.log +./test_corpus.py corpora/A_fr.txt fr >> differences.log +./test_corpus.py corpora/B_fr.txt fr >> differences.log +./test_corpus.py corpora/C_fr.txt fr >> differences.log +./test_corpus.py corpora/D_fr.txt fr >> differences.log diff --git a/tests/lemmatizer-test/test_corpus.py b/tests/lemmatizer-test/test_corpus.py new file mode 100755 index 0000000..8c986bb --- /dev/null +++ b/tests/lemmatizer-test/test_corpus.py @@ -0,0 +1,36 @@ +#!/usr/bin/python3 + +import unittest
+import json +import requests +import sys + + + +def lemmatizeSentence(lang, sentence): + data = { + 'operation': 'lemmatize', + 'languageCode':lang, + 'sentence':sentence + } + + address = 'http://localhost:8800' + response = requests.post(address, data=json.dumps(data)) + return response.json()['lemmatizedSentence'] + +corpus_file_path = sys.argv[1] +lang = sys.argv[2] + + +line_count = 0 +with open(corpus_file_path) as corpus_file: + for line in corpus_file: + line_count += 1 + orig = line.rstrip() + lemmatized = lemmatizeSentence(lang,orig) + if len(orig.split()) != len(lemmatized.split()): + print("Different length in:") + print(orig) + print(lemmatized) + if line_count % 1000 == 0: + sys.stderr.write("Done %d lines\n" % line_count) diff --git a/tests/lemmatizer-test/tokenize.sh b/tests/lemmatizer-test/tokenize.sh new file mode 100755 index 0000000..442a2f8 --- /dev/null +++ b/tests/lemmatizer-test/tokenize.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +for corpus_file in `ls /mnt/storage/rjawor_storage/copycat_corpus/cleaned/*txt` +do + a=`basename $corpus_file` + concordia-sentence-tokenizer -c /home/rjawor/concordia-server/concordia.cfg < $corpus_file > corpora/$a +done