Lemmatizer fixed: return words with the "ne_" prefix unchanged, and fall back to the original word when lemmatization produces an empty result
This commit is contained in:
parent
dea4308618
commit
c800fa7b57
@ -54,6 +54,12 @@ namespace LemmaGenSockets
|
|||||||
private string lemmatizeWord(string languageCode, string word)
|
private string lemmatizeWord(string languageCode, string word)
|
||||||
{
|
{
|
||||||
// exceptions
|
// exceptions
|
||||||
|
if (word.StartsWith("ne_"))
|
||||||
|
{
|
||||||
|
return word;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
Dictionary<String, HashSet<String>> exceptions = new Dictionary<string, HashSet<string>>();
|
Dictionary<String, HashSet<String>> exceptions = new Dictionary<string, HashSet<string>>();
|
||||||
|
|
||||||
HashSet<String> plExceptions = new HashSet<string>();
|
HashSet<String> plExceptions = new HashSet<string>();
|
||||||
@ -76,6 +82,7 @@ namespace LemmaGenSockets
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
string result = "";
|
||||||
string[] parts = word.Split(wordInnerSeparator);
|
string[] parts = word.Split(wordInnerSeparator);
|
||||||
if (parts.Length == 2)
|
if (parts.Length == 2)
|
||||||
{
|
{
|
||||||
@ -85,11 +92,20 @@ namespace LemmaGenSockets
|
|||||||
firstPart = lemmatizersDict[languageCode].Lemmatize(firstPart);
|
firstPart = lemmatizersDict[languageCode].Lemmatize(firstPart);
|
||||||
}
|
}
|
||||||
string secondPart = lemmatizersDict[languageCode].Lemmatize(parts[1]);
|
string secondPart = lemmatizersDict[languageCode].Lemmatize(parts[1]);
|
||||||
return firstPart + "-" + secondPart;
|
result = firstPart + "-" + secondPart;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
return lemmatizersDict[languageCode].Lemmatize(word);
|
result = lemmatizersDict[languageCode].Lemmatize(word);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (result == "")
|
||||||
|
{
|
||||||
|
return word;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
2
tests/lemmatizer-test/.gitignore
vendored
Normal file
2
tests/lemmatizer-test/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
differences.log
|
||||||
|
corpora/
|
10
tests/lemmatizer-test/test.sh
Executable file
10
tests/lemmatizer-test/test.sh
Executable file
@ -0,0 +1,10 @@
|
|||||||
|
#!/bin/bash
# Regression-test the lemmatizer service against every corpus/language pair,
# appending any reported differences to differences.log.
#
# Outer loop is the language, inner loop the corpus letter, so the log order
# matches the original script (all English corpora first, then French).
for lang in en fr
do
    for corpus in A B C D
    do
        ./test_corpus.py "corpora/${corpus}_${lang}.txt" "${lang}" >> differences.log
    done
done
|
36
tests/lemmatizer-test/test_corpus.py
Executable file
36
tests/lemmatizer-test/test_corpus.py
Executable file
@ -0,0 +1,36 @@
|
|||||||
|
#!/usr/bin/python3
"""Check a corpus against the lemmatizer service.

Usage: test_corpus.py CORPUS_FILE LANG

POSTs each line of CORPUS_FILE to the locally running lemmatizer server and
prints every sentence whose lemmatized form has a different token count than
the original (a per-token lemmatizer must preserve the token count).
"""

import json
import sys

import requests

# Address of the locally running lemmatizer socket server.
SERVER_ADDRESS = 'http://localhost:8800'


def lemmatizeSentence(lang, sentence):
    """Send one sentence to the lemmatizer service and return the result.

    :param lang: language code understood by the server (e.g. 'en', 'fr')
    :param sentence: whitespace-tokenized sentence to lemmatize
    :return: the 'lemmatizedSentence' string from the server's JSON reply
    """
    data = {
        'operation': 'lemmatize',
        'languageCode': lang,
        'sentence': sentence,
    }
    response = requests.post(SERVER_ADDRESS, data=json.dumps(data))
    return response.json()['lemmatizedSentence']


def main():
    """Stream the corpus file, reporting token-count mismatches."""
    corpus_file_path = sys.argv[1]
    lang = sys.argv[2]

    line_count = 0
    with open(corpus_file_path) as corpus_file:
        for line in corpus_file:
            line_count += 1
            orig = line.rstrip()
            lemmatized = lemmatizeSentence(lang, orig)
            if len(orig.split()) != len(lemmatized.split()):
                print("Different length in:")
                print(orig)
                print(lemmatized)
            # Progress goes to stderr so stdout stays clean for the
            # differences log the caller redirects to.
            if line_count % 1000 == 0:
                sys.stderr.write("Done %d lines\n" % line_count)


if __name__ == '__main__':
    main()
|
7
tests/lemmatizer-test/tokenize.sh
Executable file
7
tests/lemmatizer-test/tokenize.sh
Executable file
@ -0,0 +1,7 @@
|
|||||||
|
#!/bin/bash
# Tokenize every cleaned corpus file with the Concordia sentence tokenizer,
# writing the tokenized copy into the local corpora/ directory.

# Iterate the glob directly instead of parsing `ls` output, and quote all
# expansions so file names with spaces or glob characters are handled safely.
for corpus_file in /mnt/storage/rjawor_storage/copycat_corpus/cleaned/*txt
do
    a=$(basename "$corpus_file")
    concordia-sentence-tokenizer -c /home/rjawor/concordia-server/concordia.cfg < "$corpus_file" > "corpora/$a"
done
|
Loading…
Reference in New Issue
Block a user