lemmatizer fixed

2018-12-31 11:13:16 +01:00 · 2018-12-31 11:13:16 +01:00 · c800fa7b57
commit c800fa7b57
parent dea4308618
10 changed files with 73 additions and 2 deletions
--- a/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs
+++ b/LemmaGenSockets/LemmaGenSockets/LemmatizerListener.cs
@ -54,6 +54,12 @@ namespace LemmaGenSockets
        private string lemmatizeWord(string languageCode, string word)
        {
            // exceptions
            if (word.StartsWith("ne_"))
            {
                return word;
            }
            Dictionary<String, HashSet<String>> exceptions = new Dictionary<string, HashSet<string>>();
            HashSet<String> plExceptions = new HashSet<string>();
@ -76,6 +82,7 @@ namespace LemmaGenSockets
            }
            string result = "";
            string[] parts = word.Split(wordInnerSeparator);
            if (parts.Length == 2)
            {
@ -85,11 +92,20 @@ namespace LemmaGenSockets
                    firstPart = lemmatizersDict[languageCode].Lemmatize(firstPart);
                }
                string secondPart = lemmatizersDict[languageCode].Lemmatize(parts[1]);
-                return firstPart + "-" + secondPart;
+                result = firstPart + "-" + secondPart;
            }
            else
            {
-                return lemmatizersDict[languageCode].Lemmatize(word);
+                result = lemmatizersDict[languageCode].Lemmatize(word);
            }
            if (result == "")
            {
                return word;
            }
            else
            {
                return result;
            }
        }
--- a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe
+++ b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.exe
--- a/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb
+++ b/LemmaGenSockets/LemmaGenSockets/bin/Debug/LemmaGenSockets.pdb
--- a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache
+++ b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.csprojResolveAssemblyReference.cache
--- a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe
+++ b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.exe
--- a/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb
+++ b/LemmaGenSockets/LemmaGenSockets/obj/Debug/LemmaGenSockets.pdb
--- a/tests/lemmatizer-test/.gitignore
+++ b/tests/lemmatizer-test/.gitignore
@ -0,0 +1,2 @@
 differences.log
 corpora/
--- a/tests/lemmatizer-test/test.sh
+++ b/tests/lemmatizer-test/test.sh
@ -0,0 +1,10 @@
 #!/bin/bash
 # Run test_corpus.py on corpora A-D, first for English and then for French,
 # appending everything each run prints on stdout to differences.log.
 for lang in en fr
 do
    for corpus in A B C D
    do
        ./test_corpus.py "corpora/${corpus}_${lang}.txt" "$lang" >> differences.log
    done
 done
--- a/tests/lemmatizer-test/test_corpus.py
+++ b/tests/lemmatizer-test/test_corpus.py
@ -0,0 +1,36 @@
 #!/usr/bin/python3
 import unittest
 import json
 import requests
 import sys
 def lemmatizeSentence(lang, sentence):
    """Send one sentence to the lemmatizer endpoint and return its answer.

    A JSON-encoded 'lemmatize' request carrying the language code and the
    sentence is POSTed to http://localhost:8800; the value stored under
    'lemmatizedSentence' in the JSON response is returned.
    """
    payload = json.dumps({
        'operation': 'lemmatize',
        'languageCode': lang,
        'sentence': sentence,
    })
    reply = requests.post('http://localhost:8800', data=payload)
    return reply.json()['lemmatizedSentence']
 # Command line: <corpus file> <language code>
 corpus_file_path, lang = sys.argv[1], sys.argv[2]
 with open(corpus_file_path) as corpus_file:
    # Count lines from 1 so the progress message reports lines processed so far.
    for line_count, raw_line in enumerate(corpus_file, start=1):
        orig = raw_line.rstrip()
        lemmatized = lemmatizeSentence(lang, orig)
        # Report every sentence whose whitespace-separated token count
        # differs between the input and the lemmatized output.
        token_count_differs = len(orig.split()) != len(lemmatized.split())
        if token_count_differs:
            for report_line in ("Different length in:", orig, lemmatized):
                print(report_line)
        if line_count % 1000 == 0:
            sys.stderr.write("Done %d lines\n" % line_count)
--- a/tests/lemmatizer-test/tokenize.sh
+++ b/tests/lemmatizer-test/tokenize.sh
@ -0,0 +1,7 @@
 #!/bin/bash
 # Run concordia-sentence-tokenizer over every *txt file of the cleaned corpus
 # directory and write each result to corpora/ under the same file name.
 #
 # The glob is expanded by the shell itself instead of parsing `ls` output, and
 # every expansion is quoted, so file names containing whitespace or glob
 # characters are passed through intact.
 for corpus_file in /mnt/storage/rjawor_storage/copycat_corpus/cleaned/*txt
 do
    # An unmatched glob stays as the literal pattern; skip it (and anything
    # else that is not a regular file) instead of feeding it to the tokenizer.
    [ -f "$corpus_file" ] || continue
    a=$(basename "$corpus_file")
    concordia-sentence-tokenizer -c /home/rjawor/concordia-server/concordia.cfg < "$corpus_file" > "corpora/$a"
 done