Split BM encoding, better tests.

This commit is contained in:
Antonin Delpeuch 2018-12-31 14:24:35 +01:00
parent 1c90129829
commit 75a0fac71f
3 changed files with 27 additions and 6 deletions

View File

@ -12,7 +12,18 @@ public class BeiderMorseKeyer extends Keyer {
@Override
public String key(String string, Object... params) {
try {
return encoder.encode(string);
/*
* Beider Morse encoding can return multiple phonetic
* encodings, separated by |.
* Ideally the Keyer interface should be changed to allow
* for multiple values to be returned (and the clustering code
* should be adapted accordingly).
*
* As a simple workaround we only return the first value.
* We could also return the entire list but it would make
* matching harder.
*/
return encoder.encode(string).split("\\|")[0];
} catch (EncoderException e) {
return string;
}

View File

@ -1,24 +1,28 @@
package org.openrefine.phonetic.keyers;
import com.google.refine.clustering.binning.Keyer;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
import org.testng.annotations.Test;
import com.google.refine.clustering.binning.Keyer;
public class BeiderMorseKeyerTest {
Keyer keyer = new BeiderMorseKeyer();
@Test
public void testKey() {
assertTrue(keyer.key("Alphonse").contains("alponzi"));
assertEquals(keyer.key("Alphonse"), "YlfYnzi");
}
@Test
public void testAccents() {
assertEquals(keyer.key("Éléonore"), "ilionor|ilionori");
assertEquals(keyer.key("Éléonore"), "ilionor");
}
@Test
public void testEmpty() {
assertEquals(keyer.key(""), "");
}
}

View File

@ -18,4 +18,10 @@ public class DaitchMokotoffKeyerTest {
public void testAccents() {
assertEquals(keyer.key("Éléonore"), "086900");
}
@Test
public void testEmpty() {
assertEquals(keyer.key(""), "000000");
}
}