lemmatizer fixed
This commit is contained in:
parent
dea4308618
commit
c800fa7b57
@ -54,6 +54,12 @@ namespace LemmaGenSockets
|
||||
private string lemmatizeWord(string languageCode, string word)
|
||||
{
|
||||
// exceptions
|
||||
if (word.StartsWith("ne_"))
|
||||
{
|
||||
return word;
|
||||
}
|
||||
|
||||
|
||||
Dictionary<String, HashSet<String>> exceptions = new Dictionary<string, HashSet<string>>();
|
||||
|
||||
HashSet<String> plExceptions = new HashSet<string>();
|
||||
@ -76,6 +82,7 @@ namespace LemmaGenSockets
|
||||
}
|
||||
|
||||
|
||||
string result = "";
|
||||
string[] parts = word.Split(wordInnerSeparator);
|
||||
if (parts.Length == 2)
|
||||
{
|
||||
@ -85,11 +92,20 @@ namespace LemmaGenSockets
|
||||
firstPart = lemmatizersDict[languageCode].Lemmatize(firstPart);
|
||||
}
|
||||
string secondPart = lemmatizersDict[languageCode].Lemmatize(parts[1]);
|
||||
return firstPart + "-" + secondPart;
|
||||
result = firstPart + "-" + secondPart;
|
||||
}
|
||||
else
|
||||
{
|
||||
return lemmatizersDict[languageCode].Lemmatize(word);
|
||||
result = lemmatizersDict[languageCode].Lemmatize(word);
|
||||
}
|
||||
|
||||
if (result == "")
|
||||
{
|
||||
return word;
|
||||
}
|
||||
else
|
||||
{
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
2
tests/lemmatizer-test/.gitignore
vendored
Normal file
2
tests/lemmatizer-test/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
differences.log
|
||||
corpora/
|
10
tests/lemmatizer-test/test.sh
Executable file
10
tests/lemmatizer-test/test.sh
Executable file
@ -0,0 +1,10 @@
|
||||
#!/bin/bash

# Run the lemmatizer length check over every tokenized corpus
# (parts A-D, English first, then French). Anything the checker prints
# on stdout is appended to differences.log in the current directory.
for lang in en fr
do
    for part in A B C D
    do
        ./test_corpus.py "corpora/${part}_${lang}.txt" "$lang" >> differences.log
    done
done
|
36
tests/lemmatizer-test/test_corpus.py
Executable file
36
tests/lemmatizer-test/test_corpus.py
Executable file
@ -0,0 +1,36 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
import unittest
|
||||
import json
|
||||
import requests
|
||||
import sys
|
||||
|
||||
|
||||
|
||||
def lemmatizeSentence(lang, sentence):
    """Ask the lemmatizer service to lemmatize a single sentence.

    Posts a JSON-encoded 'lemmatize' request to http://localhost:8800 and
    returns the 'lemmatizedSentence' field of the JSON reply.

    lang     -- value sent as 'languageCode'
    sentence -- text sent as 'sentence'
    """
    payload = json.dumps({
        'operation': 'lemmatize',
        'languageCode': lang,
        'sentence': sentence,
    })
    reply = requests.post('http://localhost:8800', data=payload)
    return reply.json()['lemmatizedSentence']
|
||||
|
||||
# Command line: <corpus file path> <language code>
corpus_file_path = sys.argv[1]
lang = sys.argv[2]


# Lemmatize the corpus one line at a time and report every sentence whose
# whitespace-separated token count differs after lemmatization.
line_count = 0
with open(corpus_file_path) as corpus_file:
    for line_count, raw_line in enumerate(corpus_file, start=1):
        original_text = raw_line.rstrip()
        lemmatized_text = lemmatizeSentence(lang, original_text)
        same_length = len(original_text.split()) == len(lemmatized_text.split())
        if not same_length:
            print("Different length in:", original_text, lemmatized_text, sep="\n")
        # Progress goes to stderr so it does not end up in the redirected report.
        if line_count % 1000 == 0:
            sys.stderr.write("Done %d lines\n" % line_count)
|
7
tests/lemmatizer-test/tokenize.sh
Executable file
7
tests/lemmatizer-test/tokenize.sh
Executable file
@ -0,0 +1,7 @@
|
||||
#!/bin/bash

# Sentence-tokenize every cleaned corpus file and write the result under
# corpora/ using the same base file name.
#
# The glob is expanded by the shell directly instead of parsing `ls` output,
# and every expansion is quoted, so a path containing whitespace or glob
# characters is treated as one file name rather than being split.
for corpus_file in /mnt/storage/rjawor_storage/copycat_corpus/cleaned/*txt
do
    # When nothing matches, the pattern is left unexpanded; skip it.
    [ -e "$corpus_file" ] || continue
    a=$(basename "$corpus_file")
    concordia-sentence-tokenizer -c /home/rjawor/concordia-server/concordia.cfg < "$corpus_file" > "corpora/$a"
done
|
Loading…
Reference in New Issue
Block a user