lemmatizer fixed

This commit is contained in:
Rafał Jaworski 2018-12-31 11:13:16 +01:00
parent dea4308618
commit c800fa7b57
10 changed files with 73 additions and 2 deletions

View File

@ -54,6 +54,12 @@ namespace LemmaGenSockets
private string lemmatizeWord(string languageCode, string word)
{
// exceptions
if (word.StartsWith("ne_"))
{
return word;
}
Dictionary<String, HashSet<String>> exceptions = new Dictionary<string, HashSet<string>>();
HashSet<String> plExceptions = new HashSet<string>();
@ -76,6 +82,7 @@ namespace LemmaGenSockets
}
string result = "";
string[] parts = word.Split(wordInnerSeparator);
if (parts.Length == 2)
{
@ -85,11 +92,20 @@ namespace LemmaGenSockets
firstPart = lemmatizersDict[languageCode].Lemmatize(firstPart);
}
string secondPart = lemmatizersDict[languageCode].Lemmatize(parts[1]);
return firstPart + "-" + secondPart;
result = firstPart + "-" + secondPart;
}
else
{
return lemmatizersDict[languageCode].Lemmatize(word);
result = lemmatizersDict[languageCode].Lemmatize(word);
}
if (result == "")
{
return word;
}
else
{
return result;
}
}

2
tests/lemmatizer-test/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
differences.log
corpora/

10
tests/lemmatizer-test/test.sh Executable file
View File

@ -0,0 +1,10 @@
#!/bin/bash
# Regression-test the lemmatizer server over every corpus/language pair.
# For each corpus file corpora/<letter>_<lang>.txt, test_corpus.py prints any
# sentence whose lemmatized form has a different token count; all reports are
# appended to differences.log.
# Rewritten as loops to remove eight near-identical copy-pasted lines; the
# invocation order (en A-D, then fr A-D) is unchanged.
for lang in en fr; do
    for letter in A B C D; do
        ./test_corpus.py "corpora/${letter}_${lang}.txt" "$lang" >> differences.log
    done
done

View File

@ -0,0 +1,36 @@
#!/usr/bin/python3
import unittest
import json
import requests
import sys
def lemmatizeSentence(lang, sentence):
    """Lemmatize a sentence via the local lemmatizer service.

    Sends a JSON 'lemmatize' request to the server on localhost:8800 and
    returns the 'lemmatizedSentence' field of the JSON reply.
    NOTE(review): no HTTP/JSON error handling — assumes the service is up
    and always answers with the expected schema.
    """
    address = 'http://localhost:8800'
    payload = json.dumps({
        'operation': 'lemmatize',
        'languageCode': lang,
        'sentence': sentence,
    })
    reply = requests.post(address, data=payload)
    return reply.json()['lemmatizedSentence']
# CLI: test_corpus.py <corpus_file> <language_code>
# Streams the corpus line by line, lemmatizes each sentence via the local
# service, and prints (to stdout) every sentence whose lemmatized form has a
# different whitespace-token count than the original. Progress is reported to
# stderr every 1000 lines.
corpus_file_path = sys.argv[1]
lang = sys.argv[2]

with open(corpus_file_path) as corpus_file:
    for line_count, line in enumerate(corpus_file, start=1):
        original = line.rstrip()
        result = lemmatizeSentence(lang, original)
        # Token-count mismatch means the lemmatizer dropped or merged words.
        if len(original.split()) != len(result.split()):
            print("Different length in:")
            print(original)
            print(result)
        if line_count % 1000 == 0:
            sys.stderr.write("Done %d lines\n" % line_count)

View File

@ -0,0 +1,7 @@
#!/bin/bash
# Pre-tokenize every cleaned corpus file with concordia-sentence-tokenizer,
# writing the result to corpora/<basename>.
# Fixes: iterate the glob directly instead of parsing `ls` output (which
# breaks on whitespace/special characters in filenames — ShellCheck SC2045),
# use $(...) instead of backticks, and quote all expansions.
for corpus_file in /mnt/storage/rjawor_storage/copycat_corpus/cleaned/*txt
do
    a=$(basename "$corpus_file")
    concordia-sentence-tokenizer -c /home/rjawor/concordia-server/concordia.cfg < "$corpus_file" > "corpora/$a"
done