diff --git a/.classpath b/.classpath
index 02068d41a..c2aafba47 100644
--- a/.classpath
+++ b/.classpath
@@ -1,59 +1,58 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/main/src/com/google/refine/clustering/binning/BinningClusterer.java b/main/src/com/google/refine/clustering/binning/BinningClusterer.java
index d0f65ff2f..4bdbf88c7 100644
--- a/main/src/com/google/refine/clustering/binning/BinningClusterer.java
+++ b/main/src/com/google/refine/clustering/binning/BinningClusterer.java
@@ -40,9 +40,9 @@ import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.Properties;
import java.util.TreeMap;
-import java.util.Map.Entry;
import org.json.JSONException;
import org.json.JSONObject;
@@ -73,6 +73,7 @@ public class BinningClusterer extends Clusterer {
_keyers.put("ngram-fingerprint", new NGramFingerprintKeyer());
_keyers.put("metaphone", new MetaphoneKeyer());
_keyers.put("double-metaphone", new DoubleMetaphoneKeyer());
+ _keyers.put("metaphone3", new Metaphone3Keyer());
_keyers.put("soundex", new SoundexKeyer());
}
diff --git a/main/src/com/google/refine/clustering/binning/Metaphone3.java b/main/src/com/google/refine/clustering/binning/Metaphone3.java
new file mode 100644
index 000000000..56f4cd6fc
--- /dev/null
+++ b/main/src/com/google/refine/clustering/binning/Metaphone3.java
@@ -0,0 +1,7455 @@
+/*
+
+Copyright 2010, Lawrence Philips
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/*
+ * A request from the author: Please comment and sign any changes you make to the Metaphone 3 reference implementation.
+ */
+
+/**
+ * Metaphone 3
+ * VERSION 2.1.3
+ *
+ * by Lawrence Philips
+ *
+ * Metaphone 3 is designed to return an *approximate* phonetic key (and an alternate
+ * approximate phonetic key when appropriate) that should be the same for English
+ * words, and most names familiar in the United States, that are pronounced *similarly*.
+ * The key value is *not* intended to be an *exact* phonetic, or even phonemic,
+ * representation of the word. This is because a certain degree of 'fuzziness' has
+ * proven to be useful in compensating for variations in pronunciation, as well as
+ * misheard pronunciations. For example, although americans are not usually aware of it,
+ * the letter 's' is normally pronounced 'z' at the end of words such as "sounds".
+ *
+ * The 'approximate' aspect of the encoding is implemented according to the following rules:
+ *
+ * (1) All vowels are encoded to the same value - 'A'. If the parameter encodeVowels
+ * is set to false, only *initial* vowels will be encoded at all. If encodeVowels is set
+ * to true, 'A' will be encoded at all places in the word that any vowels are normally
+ * pronounced. 'W' as well as 'Y' are treated as vowels. Although there are differences in
+ * the pronunciation of 'W' and 'Y' in different circumstances that lead to their being
+ * classified as vowels under some circumstances and as consonants in others, for the purposes
+ * of the 'fuzziness' component of the Soundex and Metaphone family of algorithms they will
+ * always be treated here as vowels.
+ *
+ * (2) Voiced and un-voiced consonant pairs are mapped to the same encoded value. This
+ * means that:
+ * 'D' and 'T' -> 'T'
+ * 'B' and 'P' -> 'P'
+ * 'G' and 'K' -> 'K'
+ * 'Z' and 'S' -> 'S'
+ * 'V' and 'F' -> 'F'
+ *
+ * - In addition to the above voiced/unvoiced rules, 'CH' and 'SH' -> 'X', where 'X'
+ * represents the "-SH-" and "-CH-" sounds in Metaphone 3 encoding.
+ *
+ * - Also, the sound that is spelled as "TH" in English is encoded to '0' (zero symbol). (Although
+ * Americans are not usually aware of it, "TH" is pronounced in a voiced (e.g. "that") as
+ * well as an unvoiced (e.g. "theater") form, which are naturally mapped to the same encoding.)
+ *
+ * The encodings in this version of Metaphone 3 are according to pronunciations common in the
+ * United States. This means that they will be inaccurate for consonant pronunciations that
+ * are different in the United Kingdom, for example "tube" -> "CHOOBE" -> XAP rather than american TAP.
+ *
+ * Metaphone 3 was preceded by Soundex, patented in 1919, and Metaphone and Double Metaphone,
+ * developed by Lawrence Philips. All of these algorithms resulted in a significant number of
+ * incorrect encodings. Metaphone3 was tested against a database of about 100 thousand English words,
+ * names common in the United States, and non-English words found in publications in the United States,
+ * with an emphasis on words that are commonly mispronounced, prepared by the Moby Words website,
+ * but with the Moby Words 'phonetic' encodings algorithmically mapped to Double Metaphone encodings.
+ * Metaphone3 increases the accuracy of encoding of english words, common names, and non-English
+ * words found in american publications from the 89% for Double Metaphone, to over 98%.
+ *
+ * DISCLAIMER:
+ * Anthropomorphic Software LLC claims only that Metaphone 3 will return correct encodings,
+ * within the 'fuzzy' definition of correct as above, for a very high percentage of correctly
+ * spelled English and commonly recognized non-English words. Anthropomorphic Software LLC
+ * warns the user that a number of words remain incorrectly encoded, that misspellings may not
+ * be encoded 'properly', and that people often have differing ideas about the pronunciation
+ * of a word. Therefore, Metaphone 3 is not guaranteed to return correct results every time, and
+ * so a desired target word may very well be missed. Creators of commercial products should
+ * keep in mind that systems like Metaphone 3 produce a 'best guess' result, and should
+ * condition the expectations of end users accordingly.
+ *
+ * METAPHONE3 IS PROVIDED "AS IS" WITHOUT
+ * WARRANTY OF ANY KIND. LAWRENCE PHILIPS AND ANTHROPOMORPHIC SOFTWARE LLC
+ * MAKE NO WARRANTIES, EXPRESS OR IMPLIED, THAT IT IS FREE OF ERROR,
+ * OR ARE CONSISTENT WITH ANY PARTICULAR STANDARD OF MERCHANTABILITY,
+ * OR THAT IT WILL MEET YOUR REQUIREMENTS FOR ANY PARTICULAR APPLICATION.
+ * LAWRENCE PHILIPS AND ANTHROPOMORPHIC SOFTWARE LLC DISCLAIM ALL LIABILITY
+ * FOR DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES RESULTING FROM USE
+ * OF THIS SOFTWARE.
+ *
+ * @author Lawrence Philips
+ *
+ * Metaphone 3 is designed to return an approximate phonetic key (and an alternate
+ * approximate phonetic key when appropriate) that should be the same for English
+ * words, and most names familiar in the United States, that are pronounced "similarly".
+ * The key value is not intended to be an exact phonetic, or even phonemic,
+ * representation of the word. This is because a certain degree of 'fuzziness' has
+ * proven to be useful in compensating for variations in pronunciation, as well as
+ * misheard pronunciations. For example, although americans are not usually aware of it,
+ * the letter 's' is normally pronounced 'z' at the end of words such as "sounds".
+ *
+ * The 'approximate' aspect of the encoding is implemented according to the following rules:
+ *
+ * (1) All vowels are encoded to the same value - 'A'. If the parameter encodeVowels
+ * is set to false, only *initial* vowels will be encoded at all. If encodeVowels is set
+ * to true, 'A' will be encoded at all places in the word that any vowels are normally
+ * pronounced. 'W' as well as 'Y' are treated as vowels. Although there are differences in
+ * the pronunciation of 'W' and 'Y' in different circumstances that lead to their being
+ * classified as vowels under some circumstances and as consonants in others, for the purposes
+ * of the 'fuzziness' component of the Soundex and Metaphone family of algorithms they will
+ * always be treated here as vowels.
+ *
+ * (2) Voiced and un-voiced consonant pairs are mapped to the same encoded value. This
+ * means that:
+ * 'D' and 'T' -> 'T'
+ * 'B' and 'P' -> 'P'
+ * 'G' and 'K' -> 'K'
+ * 'Z' and 'S' -> 'S'
+ * 'V' and 'F' -> 'F'
+ *
+ * - In addition to the above voiced/unvoiced rules, 'CH' and 'SH' -> 'X', where 'X'
+ * represents the "-SH-" and "-CH-" sounds in Metaphone 3 encoding.
+ *
+ * - Also, the sound that is spelled as "TH" in English is encoded to '0' (zero symbol). (Although
+ * americans are not usually aware of it, "TH" is pronounced in a voiced (e.g. "that") as
+ * well as an unvoiced (e.g. "theater") form, which are naturally mapped to the same encoding.)
+ *
+ * In the "Exact" encoding, voiced/unvoiced pairs are not mapped to the same encoding, except
+ * for the voiced and unvoiced versions of 'TH', sounds such as 'CH' and 'SH', and for 'S' and 'Z',
+ * so that the words whose metaph keys match will in fact be closer in pronunciation than with the
+ * more approximate setting. Keep in mind that encoding settings for search strings should always
+ * be exactly the same as the encoding settings of the stored metaph keys in your database!
+ * Because of the considerably increased accuracy of Metaphone3, it is now possible to use this
+ * setting and have a very good chance of getting a correct encoding.
+ *
+ * In the Encode Vowels encoding, all non-initial vowels and diphthongs will be encoded to
+ * 'A', and there will only be one such vowel encoding character between any two consonants.
+ * It turns out that there are some surprising wrinkles to encoding non-initial vowels in
+ * practice, pre-eminently in inversions between spelling and pronunciation such as e.g.
+ * "wrinkle" => 'RANKAL', where the last two sounds are inverted when spelled.
+ *
+ * The encodings in this version of Metaphone 3 are according to pronunciations common in the
+ * United States. This means that they will be inaccurate for consonant pronunciations that
+ * are different in the United Kingdom, for example "tube" -> "CHOOBE" -> XAP rather than american TAP.
+ *
+ *
+ */
+
+package com.google.refine.clustering.binning;
+
+public class Metaphone3 {
+
+ /** Length of the upper-cased word to be encoded, as
+ * recorded by SetWord(). */
+ int m_length;
+
+ /** Maximum length of each encoded key; Encode() truncates both keys to it. */
+ int m_metaphLength;
+
+ /** Flag whether or not to encode non-initial vowels. */
+ boolean m_encodeVowels;
+
+ /** Flag whether or not to encode consonants as exactly
+ * as possible. */
+ boolean m_encodeExact;
+
+ /** Upper-cased form of the word to be encoded, as stored
+ * by SetWord(). */
+ String m_inWord;
+
+ /** Running copy of primary key. */
+ StringBuffer m_primary;
+
+ /** Running copy of secondary key. */
+ StringBuffer m_secondary;
+
+ /** Index of character in m_inWord currently being
+ * encoded. */
+ int m_current;
+
+ /** Index of last character in m_inWord; computed at the start of Encode(). */
+ int m_last;
+
+ /** Flag that an AL inversion has already been done; cleared at the start of Encode(). */
+ boolean flag_AL_inversion;
+
+ /** Upper bound on the key length that SetKeyLength() will accept. */
+ int MAX_KEY_ALLOCATION = 32;
+
+ /** Key length a newly constructed encoder starts with. */
+ int DEFAULT_MAX_KEY_LENGTH = 8;
+
+ ////////////////////////////////////////////////////////////////////////////////
+ // Metaphone3 class definition
+ ////////////////////////////////////////////////////////////////////////////////
+
+ /**
+  * Creates an encoder with no word set, the default key length, and the
+  * vowel-encoding and exact-encoding options both switched off. Convenient
+  * when several words are encoded in turn: pass each one to SetWord(String)
+  * and then call Encode().
+  */
+ Metaphone3()
+ {
+     // options: approximate consonants, initial vowels only, default length
+     this.m_encodeExact = false;
+     this.m_encodeVowels = false;
+     this.m_metaphLength = this.DEFAULT_MAX_KEY_LENGTH;
+     this.m_secondary = new StringBuffer();
+     this.m_primary = new StringBuffer();
+ }
+
+ /**
+  * Creates an encoder with default settings that is already primed with
+  * one word, so Encode() can be called straight away. Convenient when a
+  * single word needs to be encoded.
+  *
+  * @param in word to be encoded; handed to SetWord(String), which
+  *           upper-cases it
+  *
+  */
+ Metaphone3(String in)
+ {
+     // default buffers and options come from the no-argument constructor
+     this();
+     this.SetWord(in);
+ }
+
+ /**
+  * Sets the word to be encoded. The word is upper-cased with Locale.ROOT so
+  * that the result does not depend on the JVM default locale (in a Turkish
+  * locale 'i' would become dotted 'İ' and miss every comparison with 'I').
+  *
+  * @param in word to be encoded; must not be null
+  */
+ void SetWord(String in)
+ {
+     m_inWord = in.toUpperCase(java.util.Locale.ROOT);
+     m_length = m_inWord.length();
+ }
+
+ /**
+ * Sets length allocated for output keys.
+ * If incoming number is greater than maximum allowable
+ * length returned by GetMaximumKeyLength(), set key length
+ * to maximum key length and return false; otherwise, set key
+ * length to parameter value and return true.
+ *
+ * @param inKeyLength new length of key.
+ * @return true if able to set key length to requested value.
+ *
+ */
+ boolean SetKeyLength(int inKeyLength)
+ {
+ if(inKeyLength < 1)
+ {
+ // can't have that -
+ // no room for terminating null
+ inKeyLength = 1;
+ }
+
+ if(inKeyLength > MAX_KEY_ALLOCATION)
+ {
+ m_metaphLength = MAX_KEY_ALLOCATION;
+ return false;
+ }
+
+ m_metaphLength = inKeyLength;
+ return true;
+ }
+
+ /**
+ * Adds an encoding character to the encoded key value string - one parameter version.
+ *
+ * @param main primary encoding character to be added to encoded key string.
+ */
+ void MetaphAdd(String in)
+ {
+ if(!(in.equals("A")
+ && (m_primary.length() > 0)
+ && (m_primary.charAt(m_primary.length() - 1) == 'A')))
+ {
+ m_primary.append(in);
+ }
+
+ if(!(in.equals("A")
+ && (m_secondary.length() > 0)
+ && (m_secondary.charAt(m_secondary.length() - 1) == 'A')))
+ {
+ m_secondary.append(in);
+ }
+ }
+
+ /**
+ * Adds an encoding character to the encoded key value string - two parameter version
+ *
+ * @param main primary encoding character to be added to encoded key string
+ * @param alt alternative encoding character to be added to encoded alternative key string
+ *
+ */
+ void MetaphAdd(String main, String alt)
+ {
+ if(!(main.equals("A")
+ && (m_primary.length() > 0)
+ && (m_primary.charAt(m_primary.length() - 1) == 'A')))
+ {
+ m_primary.append(main);
+ }
+
+ if(!(alt.equals("A")
+ && (m_secondary.length() > 0)
+ && (m_secondary.charAt(m_secondary.length() - 1) == 'A')))
+ {
+ if(!alt.isEmpty())
+ {
+ m_secondary.append(alt);
+ }
+ }
+ }
+
+ /**
+ * Adds an encoding character to the encoded key value string - Exact/Approx version
+ *
+ * @param mainExact primary encoding character to be added to encoded key string if
+ * m_encodeExact is set
+ *
+ * @param altExact alternative encoding character to be added to encoded alternative
+ * key string if m_encodeExact is set
+ *
+ * @param main primary encoding character to be added to encoded key string
+ *
+ * @param alt alternative encoding character to be added to encoded alternative key string
+ *
+ */
+ void MetaphAddExactApprox(String mainExact, String altExact, String main, String alt)
+ {
+ if(m_encodeExact)
+ {
+ MetaphAdd(mainExact, altExact);
+ }
+ else
+ {
+ MetaphAdd(main, alt);
+ }
+ }
+
+ /**
+ * Adds an encoding character to the encoded key value string - Exact/Approx version
+ *
+ * @param mainExact primary encoding character to be added to encoded key string if
+ * m_encodeExact is set
+ *
+ * @param main primary encoding character to be added to encoded key string
+ *
+ */
+ void MetaphAddExactApprox(String mainExact, String main)
+ {
+ if(m_encodeExact)
+ {
+ MetaphAdd(mainExact);
+ }
+ else
+ {
+ MetaphAdd(main);
+ }
+ }
+ /** Reports the maximum number of characters currently allowed in an encoded key.
+  *
+  * @return the key length limit set by the constructor or SetKeyLength().
+  */
+ int GetKeyLength() { return this.m_metaphLength; }
+
+ /** Reports the largest key length that SetKeyLength() will accept.
+  *
+  * @return the value of MAX_KEY_ALLOCATION.
+  */
+ int GetMaximumKeyLength() { return this.MAX_KEY_ALLOCATION; }
+
+ /** Switches encoding of non-initial vowels on or off. When it is on, a run
+  * of several vowel sounds (i.e. a vowel diphthong, etc.) still yields only
+  * one 'A' before the next consonant or the end of the word.
+  *
+  * @param inEncodeVowels true to encode non-initial vowels, false to encode
+  *                       initial vowels only.
+  */
+ void SetEncodeVowels(boolean inEncodeVowels) { this.m_encodeVowels = inEncodeVowels; }
+
+ /** Reports whether non-initial vowels will be encoded.
+  *
+  * @return true if this Metaphone3 object is set to encode non-initial vowels, false if not.
+  */
+ boolean GetEncodeVowels() { return this.m_encodeVowels; }
+
+ /** Switches "exact" consonant encoding on or off. Exact encoding
+  * does not include 'S' vs. 'Z', since americans will pronounce 'S' at the
+  * end of many words as 'Z', nor does it include "CH" vs. "SH". It does cause
+  * a distinction to be made between 'B' and 'P', 'D' and 'T', 'G' and 'K', and 'V'
+  * and 'F'.
+  *
+  * @param inEncodeExact consonants to be encoded "exactly" if true, not if false.
+  */
+ void SetEncodeExact(boolean inEncodeExact) { this.m_encodeExact = inEncodeExact; }
+
+ /** Reports whether consonants will be encoded "exactly".
+  *
+  * @return true if this Metaphone3 object is set to encode "exactly", false if not.
+  */
+ boolean GetEncodeExact() { return this.m_encodeExact; }
+
+ /** Returns the current contents of the primary key buffer, which Encode() fills.
+  *
+  * @return a new String holding the primary encoded key
+  */
+ String GetMetaph()
+ {
+     // copy out the buffer contents as an immutable String
+     return m_primary.toString();
+ }
+
+ /** Returns the current contents of the secondary key buffer, which Encode() fills.
+  *
+  * @return a new String with the alternate key; empty when Encode() left no distinct alternate
+  */
+ String GetAlternateMetaph()
+ {
+     // copy out the buffer contents as an immutable String
+     return m_secondary.toString();
+ }
+
+ /**
+ * Test for close front vowels
+ *
+ * @return true if close front vowel
+ */
+ boolean Front_Vowel(int at)
+ {
+ if(((CharAt(at) == 'E') || (CharAt(at) == 'I') || (CharAt(at) == 'Y')))
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Detect names or words that begin with spellings
+ * typical of german or slavic words, for the purpose
+ * of choosing alternate pronunciations correctly
+ *
+ */
+ boolean SlavoGermanic()
+ {
+ if(StringAt(0, 3, "SCH", "")
+ || StringAt(0, 2, "SW", "")
+ || (CharAt(0) == 'J')
+ || (CharAt(0) == 'W'))
+ {
+ return true;
+ }
+
+ return false;
+ }
+ /**
+  * Tests if a character is a vowel: A, E, I, O, U or Y, or one of the accented
+  * and special capital letters listed below. Lower-case letters never match.
+  * @param inChar character to be tested
+  * @return true if character is a vowel, false if not
+  */
+ boolean IsVowel(char inChar)
+ {
+     switch(inChar)
+     {
+         case 'A':
+         case 'E':
+         case 'I':
+         case 'O':
+         case 'U':
+         case 'Y':
+         case 'À':
+         case 'Á':
+         case 'Â':
+         case 'Ã':
+         case 'Ä':
+         case 'Å':
+         case 'Æ':
+         case 'È':
+         case 'É':
+         case 'Ê':
+         case 'Ë':
+         case 'Ì':
+         case 'Í':
+         case 'Î':
+         case 'Ï':
+         case 'Ò':
+         case 'Ó':
+         case 'Ô':
+         case 'Õ':
+         case 'Ö':
+         case 'Œ':
+         case 'Ø':
+         case 'Ù':
+         case 'Ú':
+         case 'Û':
+         case 'Ü':
+         case 'Ý':
+         case 'Ÿ':
+             return true;
+         default:
+             return false;
+     }
+ }
+
+ /**
+ * Tests if character in the input string is a vowel
+ *
+ * @param at position of character to be tested in string to be encoded
+ * @return true if character is a vowel, false if not
+ *
+ */
+ boolean IsVowel(int at)
+ {
+ if((at < 0) || (at >= m_length))
+ {
+ return false;
+ }
+
+ char it = CharAt(at);
+
+ if(IsVowel(it))
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Skips over a run of vowels and 'W's in the word being encoded. An 'H' that directly follows
+ * a 'W' is skipped too, except before the fragments listed in the loop ("HOP", "HIDE", ...).
+ *
+ * @param at position, in string to be encoded, of character to start skipping from
+ *
+ * @return 0 if at is negative, m_length if at is past the end, else the position where skipping stopped
+ */
+ int SkipVowels(int at)
+ {
+ if(at < 0)
+ {
+ return 0;
+ }
+
+ if(at >= m_length)
+ {
+ return m_length;
+ }
+
+ char it = CharAt(at);
+
+ while(IsVowel(it) || (it == 'W'))
+ {
+ if(StringAt(at, 4, "WICZ", "WITZ", "WIAK", "")
+ || StringAt((at - 1), 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY", "")
+ || (StringAt(at, 5, "WICKI", "WACKI", "") && ((at + 4) == m_last)))
+ {
+ break; // stop on the 'W' of these endings instead of skipping it
+ }
+
+ at++;
+ if(((CharAt(at - 1) == 'W') && (CharAt(at) == 'H'))
+ && !(StringAt(at, 3, "HOP", "")
+ || StringAt(at, 4, "HIDE", "HARD", "HEAD", "HAWK", "HERD", "HOOK", "HAND", "HOLE", "")
+ || StringAt(at, 5, "HEART", "HOUSE", "HOUND", "")
+ || StringAt(at, 6, "HAMMER", "")))
+ {
+ at++; // step over the 'H' that follows the 'W' as well
+ }
+
+ if(at > (m_length - 1))
+ {
+ break; // reached the end of the word
+ }
+ it = CharAt(at);
+ }
+
+ return at;
+ }
+
+ /**
+ * Advanced counter m_current so that it indexes the next character to be encoded
+ *
+ * @param ifNotEncodeVowels number of characters to advance if not encoding internal vowels
+ * @param ifEncodeVowels number of characters to advance if encoding internal vowels
+ *
+ */
+ void AdvanceCounter(int ifNotEncodeVowels, int ifEncodeVowels)
+ {
+ if(!m_encodeVowels)
+ {
+ m_current += ifNotEncodeVowels;
+ }
+ else
+ {
+ m_current += ifEncodeVowels;
+ }
+ }
+
+
+ /**
+ * Subscript safe .charAt()
+ *
+ * @param at index of character to access
+ * @return null if index out of bounds, .charAt() otherwise
+ */
+ char CharAt(int at)
+ {
+ // check substring bounds
+ if((at < 0)
+ || (at > (m_length - 1)))
+ {
+ return '\0';
+ }
+
+ return m_inWord.charAt(at);
+ }
+
+ /**
+ * Tests whether the word is the root or a regular english inflection
+ * of it, e.g. "ache", "achy", "aches", "ached", "aching", "achingly"
+ * This is for cases where we want to match only the root and corresponding
+ * inflected forms, and not completely different words which may have the
+ * same substring in them.
+ */
+ boolean RootOrInflections(String inWord, String root)
+ {
+ int len = root.length();
+ String test;
+
+ test = root + "S";
+ if((inWord.equals(root))
+ || (inWord.equals(test)))
+ {
+ return true;
+ }
+
+ if(root.charAt(len - 1) != 'E')
+ {
+ test = root + "ES";
+ }
+
+ if(inWord.equals(test))
+ {
+ return true;
+ }
+
+ if(root.charAt(len - 1) != 'E')
+ {
+ test = root + "ED";
+ }
+ else
+ {
+ test = root + "D";
+ }
+
+ if(inWord.equals(test))
+ {
+ return true;
+ }
+
+ if(root.charAt(len - 1) == 'E')
+ {
+ root = root.substring(0, len - 1);
+ }
+
+ test = root + "ING";
+ if(inWord.equals(test))
+ {
+ return true;
+ }
+
+ test = root + "INGLY";
+ if(inWord.equals(test))
+ {
+ return true;
+ }
+
+ test = root + "Y";
+ if(inWord.equals(test))
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Determines if one of the substrings sent in is the same as
+ * what is at the specified position in the string being encoded.
+ *
+ * @param start
+ * @param length
+ * @param compareStrings
+ * @return
+ */
+ boolean StringAt(int start, int length, String... compareStrings)
+ {
+ // check substring bounds
+ if((start < 0)
+ || (start > (m_length - 1))
+ || ((start + length - 1) > (m_length - 1)))
+ {
+ return false;
+ }
+
+ String target = m_inWord.substring(start, (start + length));
+
+ for(String strFragment : compareStrings)
+ {
+ if(target.equals(strFragment))
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Encodes the word stored by SetWord() to one or two key values according to Metaphone 3 rules.
+ * The keys are left in m_primary / m_secondary; the secondary is emptied if it equals the primary.
+ */
+ void Encode()
+ {
+ flag_AL_inversion = false;
+
+ m_current = 0;
+
+ m_primary.setLength(0); // discard the keys of any previous call
+ m_secondary.setLength(0);
+
+ if(m_length < 1)
+ {
+ return; // nothing to encode; both keys stay empty
+ }
+
+ // zero based index of the final character
+ m_last = m_length - 1;
+
+ // main loop: runs until either key grows past m_metaphLength or the word is used up
+ while(!(m_primary.length() > m_metaphLength) && !(m_secondary.length() > m_metaphLength))
+ {
+ if(m_current >= m_length)
+ {
+ break;
+ }
+
+ switch(CharAt(m_current))
+ {
+ case 'B':
+
+ Encode_B();
+ break;
+
+ case 'ß': // NOTE(review): SetWord() upper-cases the word and Java maps 'ß' to "SS", so presumably unreachable - confirm
+ case 'Ç':
+
+ MetaphAdd("S");
+ m_current++;
+ break;
+
+ case 'C':
+
+ Encode_C();
+ break;
+
+ case 'D':
+
+ Encode_D();
+ break;
+
+ case 'F':
+
+ Encode_F();
+ break;
+
+ case 'G':
+
+ Encode_G();
+ break;
+
+ case 'H':
+
+ Encode_H();
+ break;
+
+ case 'J':
+
+ Encode_J();
+ break;
+
+ case 'K':
+
+ Encode_K();
+ break;
+
+ case 'L':
+
+ Encode_L();
+ break;
+
+ case 'M':
+
+ Encode_M();
+ break;
+
+ case 'N':
+
+ Encode_N();
+ break;
+
+ case 'Ñ':
+
+ MetaphAdd("N");
+ m_current++;
+ break;
+
+ case 'P':
+
+ Encode_P();
+ break;
+
+ case 'Q':
+
+ Encode_Q();
+ break;
+
+ case 'R':
+
+ Encode_R();
+ break;
+
+ case 'S':
+
+ Encode_S();
+ break;
+
+ case 'T':
+
+ Encode_T();
+ break;
+
+ case 'Ð': // eth
+ case 'Þ': // thorn
+
+ MetaphAdd("0"); // '0' (zero) is the key symbol for the "TH" sound
+ m_current++;
+ break;
+
+ case 'V':
+
+ Encode_V();
+ break;
+
+ case 'W':
+
+ Encode_W();
+ break;
+
+ case 'X':
+
+ Encode_X();
+ break;
+
+ case 'Š':
+
+ MetaphAdd("X");
+ m_current++;
+ break;
+
+ case 'Ž':
+
+ MetaphAdd("S");
+ m_current++;
+ break;
+
+ case 'Z':
+
+ Encode_Z();
+ break;
+
+ default:
+
+ if(IsVowel(CharAt(m_current)))
+ {
+ Encode_Vowels();
+ break;
+ }
+
+ m_current++; // any other character is skipped without adding to the keys
+
+ }
+ }
+
+ //only give back m_metaphLength number of chars in each key
+ if(m_primary.length() > m_metaphLength)
+ {
+ m_primary.setLength(m_metaphLength);
+ }
+
+ if(m_secondary.length() > m_metaphLength)
+ {
+ m_secondary.setLength(m_metaphLength);
+ }
+
+ // it is possible for the two metaphs to be the same
+ // after truncation. lose the second one if so
+ if((m_primary.toString()).equals(m_secondary.toString()))
+ {
+ m_secondary.setLength(0);
+ }
+ }
+
+ /**
+ * Encodes an initial vowel to A.
+ *
+ * Encodes a non-initial vowel to A if m_encodeVowels is true, unless a helper reports it silent.
+ *
+ * Afterwards m_current is moved past the vowel run (by one character only in the "LEWA"-type case).
+ */
+ void Encode_Vowels()
+ {
+ if(m_current == 0)
+ {
+ // all init vowels map to 'A'
+ // as of Double Metaphone
+ MetaphAdd("A");
+ }
+ else if(m_encodeVowels)
+ {
+ if(CharAt(m_current) != 'E')
+ {
+ if(Skip_Silent_UE())
+ {
+ return; // presumably Skip_Silent_UE() has already advanced m_current - it is defined outside this view
+ }
+
+ if (O_Silent())
+ {
+ m_current++; // silent 'O': step past it without adding to the keys
+ return;
+ }
+
+ // encode all vowels and
+ // diphthongs to the same value
+ MetaphAdd("A");
+ }
+ else
+ {
+ Encode_E_Pronounced();
+ }
+ }
+
+ // skip the whole vowel run unless a consonant precedes "LEWA"/"LEWO"/"LEWI" starting one character back
+ if(!(!IsVowel(m_current - 2) && StringAt((m_current - 1), 4, "LEWA", "LEWO", "LEWI", "")))
+ {
+ m_current = SkipVowels(m_current);
+ }
+ else
+ {
+ m_current++;
+ }
+ }
+
+ /**
+ * Encodes cases where non-initial 'e' is pronounced, taking
+ * care to detect unusual cases from the greek.
+ *
+ * Only executed if non initial vowel encoding is turned on
+ * (called from Encode_Vowels()). Except in the "BLESSED"/"LEARNED" case,
+ * m_current is left unchanged here; flag_AL_inversion is cleared when the general rule is reached.
+ */
+ void Encode_E_Pronounced()
+ {
+ // special cases with two pronunciations
+ // 'agape' 'lame' 'resume'
+ if((StringAt(0, 4, "LAME", "SAKE", "PATE", "") && (m_length == 4))
+ || (StringAt(0, 5, "AGAPE", "") && (m_length == 5))
+ || ((m_current == 5) && StringAt(0, 6, "RESUME", "")))
+ {
+ MetaphAdd("", "A"); // 'e' silent in the primary key, pronounced in the secondary
+ return;
+ }
+
+ // special case "inge" => 'INGA', 'INJ'
+ if(StringAt(0, 4, "INGE", "")
+ && (m_length == 4))
+ {
+ MetaphAdd("A", ""); // 'e' pronounced in the primary key only
+ return;
+ }
+
+ // special cases with two pronunciations
+ // special handling due to the difference in
+ // the pronunciation of the '-D'
+ if((m_current == 5) && StringAt(0, 7, "BLESSED", "LEARNED", ""))
+ {
+ MetaphAddExactApprox("D", "AD", "T", "AT");
+ m_current += 2; // the final 'D' has been encoded here as well
+ return;
+ }
+
+ // encode all vowels and diphthongs to the same value
+ if((!E_Silent()
+ && !flag_AL_inversion
+ && !Silent_Internal_E())
+ || E_Pronounced_Exceptions())
+ {
+ MetaphAdd("A");
+ }
+
+ // now that we've visited the vowel in question
+ flag_AL_inversion = false;
+ }
+
+ /**
+ * Tests for cases where non-initial 'o' is not pronounced
+ * Only executed if non initial vowel encoding is turned on
+ *
+ * @return true if encoded as silent - no addition to m_metaph key
+ *
+ */
+ boolean O_Silent()
+ {
+ // if "iron" at beginning or end of word and not "irony"
+ if ((CharAt(m_current) == 'O')
+ && StringAt((m_current - 2), 4, "IRON", ""))
+ {
+ if ((StringAt(0, 4, "IRON", "")
+ || (StringAt((m_current - 2), 4, "IRON", "")
+ && (m_last == (m_current + 1))))
+ && !StringAt((m_current - 2), 6, "IRONIC", ""))
+ {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ /**
+ * Tests for cases where the non-initial 'e' at m_current is not pronounced: a final 'e', an 'e'
+ * before a final 'S' or 'D', or an 'e' before a final "NESS", "LESS" or "LY" (with exceptions).
+ * This method only tests; it adds nothing to the keys and does not move m_current.
+ * @return true if the 'e' is silent, false if it is pronounced
+ *
+ */
+ boolean E_Silent()
+ {
+ if(E_Pronounced_At_End())
+ {
+ return false;
+ }
+
+ // 'e' silent when last letter (the pronounced exceptions were ruled out above)
+ if((m_current == m_last)
+ // also silent if before plural 's'
+ // or past tense or participle 'd', e.g.
+ // 'grapes' and 'banished' => PNXT
+ || ((StringAt(m_last, 1, "S", "D", "")
+ && (m_current > 1)
+ && ((m_current + 1) == m_last)
+ // and not e.g. "nested", "rises", or "pieces" => RASAS
+ && !(StringAt((m_current - 1), 3, "TED", "SES", "CES", "")
+ || StringAt(0, 9, "ANTIPODES", "ANOPHELES", "")
+ || StringAt(0, 8, "MOHAMMED", "MUHAMMED", "MOUHAMED", "")
+ || StringAt(0, 7, "MOHAMED", "")
+ || StringAt(0, 6, "NORRED", "MEDVED", "MERCED", "ALLRED", "KHALED", "RASHED", "MASJED", "")
+ || StringAt(0, 5, "JARED", "AHMED", "HAMED", "JAVED", "")
+ || StringAt(0, 4, "ABED", "IMED", ""))))
+ // e.g. 'wholeness', 'boneless', 'barely'
+ || (StringAt((m_current + 1), 4, "NESS", "LESS", "") && ((m_current + 4) == m_last))
+ || (StringAt((m_current + 1), 2, "LY", "") && ((m_current + 2) == m_last)
+ && !StringAt(0, 6, "CICELY", "")))
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Tests for words where an 'E' at the end of the word
+ * is pronounced. Always false unless m_current is the last character.
+ *
+ * special cases, mostly from the greek, spanish, japanese,
+ * italian, and french words normally having an acute accent.
+ * also, pronouns and articles (presumably what the two- and three-letter word tests cover)
+ *
+ * Many Thanks to ali, QuentinCompson, JeffCO, ToonScribe, Xan,
+ * Trafalz, and VictorLaszlo, all of them atriots from the Eschaton,
+ * for all their fine contributions!
+ *
+ * @return true if 'E' at end is pronounced
+ *
+ */
+ boolean E_Pronounced_At_End()
+ {
+ if((m_current == m_last)
+ && (StringAt((m_current - 6), 7, "STROPHE", "")
+ // if a vowel is before the 'E', vowel eater will have eaten it.
+ //otherwise, consonant + 'E' will need 'E' pronounced
+ || (m_length == 2)
+ || ((m_length == 3) && !IsVowel(0))
+ // these german name endings can be relied on to have the 'e' pronounced
+ || (StringAt((m_last - 2), 3, "BKE", "DKE", "FKE", "KKE", "LKE",
+ "NKE", "MKE", "PKE", "TKE", "VKE", "ZKE", "")
+ && !StringAt(0, 5, "FINKE", "FUNKE", "")
+ && !StringAt(0, 6, "FRANKE", ""))
+ || StringAt((m_last - 4), 5, "SCHKE", "")
+ // whole-word lists: the length test makes each entry match the entire word only
+ || (StringAt(0, 4, "ACME", "NIKE", "CAFE", "RENE", "LUPE", "JOSE", "ESME", "") && (m_length == 4))
+ || (StringAt(0, 5, "LETHE", "CADRE", "TILDE", "SIGNE", "POSSE", "LATTE", "ANIME", "DOLCE", "CROCE",
+ "ADOBE", "OUTRE", "JESSE", "JAIME", "JAFFE", "BENGE", "RUNGE",
+ "CHILE", "DESME", "CONDE", "URIBE", "LIBRE", "ANDRE", "") && (m_length == 5))
+ || (StringAt(0, 6, "HECATE", "PSYCHE", "DAPHNE", "PENSKE", "CLICHE", "RECIPE",
+ "TAMALE", "SESAME", "SIMILE", "FINALE", "KARATE", "RENATE", "SHANTE",
+ "OBERLE", "COYOTE", "KRESGE", "STONGE", "STANGE", "SWAYZE", "FUENTE",
+ "SALOME", "URRIBE", "") && (m_length == 6))
+ || (StringAt(0, 7, "ECHIDNE", "ARIADNE", "MEINEKE", "PORSCHE", "ANEMONE", "EPITOME",
+ "SYNCOPE", "SOUFFLE", "ATTACHE", "MACHETE", "KARAOKE", "BUKKAKE",
+ "VICENTE", "ELLERBE", "VERSACE", "") && (m_length == 7))
+ || (StringAt(0, 8, "PENELOPE", "CALLIOPE", "CHIPOTLE", "ANTIGONE", "KAMIKAZE", "EURIDICE",
+ "YOSEMITE", "FERRANTE", "") && (m_length == 8))
+ || (StringAt(0, 9, "HYPERBOLE", "GUACAMOLE", "XANTHIPPE", "") && (m_length == 9))
+ || (StringAt(0, 10, "SYNECDOCHE", "") && (m_length == 10))))
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Detects a silent 'E' in the interior of a word, where the 'E'
+ * closes a known root that is followed by a suffix,
+ * e.g. "roseman", "firestone"
+ *
+ * Each root list is anchored at the start of the word and is only
+ * accepted when E_Silent_Suffix() agrees for the root length and
+ * E_Pronouncing_Suffix() does not object.
+ *
+ * @return true if the 'E' is judged to be silent
+ *
+ */
+ boolean Silent_Internal_E()
+ {
+     // 'olesen' but not 'olen'
+     if(StringAt(0, 3, "OLE", "")
+         && E_Silent_Suffix(3) && !E_Pronouncing_Suffix(3))
+     {
+         return true;
+     }
+
+     // four letter roots, e.g. RAKE
+     if(StringAt(0, 4, "BARE", "FIRE", "FORE", "GATE", "HAGE", "HAVE",
+                 "HAZE", "HOLE", "CAPE", "HUSE", "LACE", "LINE",
+                 "LIVE", "LOVE", "MORE", "MOSE", "MORE", "NICE",
+                 "RAKE", "ROBE", "ROSE", "SISE", "SIZE", "WARE",
+                 "WAKE", "WISE", "WINE", "")
+         && E_Silent_Suffix(4) && !E_Pronouncing_Suffix(4))
+     {
+         return true;
+     }
+
+     // five letter roots, e.g. BLAKE
+     if(StringAt(0, 5, "BLAKE", "BRAKE", "BRINE", "CARLE", "CLEVE", "DUNNE",
+                 "HEDGE", "HOUSE", "JEFFE", "LUNCE", "STOKE", "STONE",
+                 "THORE", "WEDGE", "WHITE", "")
+         && E_Silent_Suffix(5) && !E_Pronouncing_Suffix(5))
+     {
+         return true;
+     }
+
+     // six letter roots
+     if(StringAt(0, 6, "BRIDGE", "CHEESE", "")
+         && E_Silent_Suffix(6) && !E_Pronouncing_Suffix(6))
+     {
+         return true;
+     }
+
+     // "charles", located relative to the current position
+     return StringAt((m_current - 5), 7, "CHARLES", "");
+ }
+
+ /**
+ * Checks the conditions required for the 'E' that ends a root
+ * not to be pronounced
+ *
+ * @param at index of the first character after the root,
+ * i.e. where the suffix begins
+ * @return true if the current position is the last letter of the
+ * root (at - 1), more than one character follows the root, and the
+ * suffix either has a vowel in its second position or starts with
+ * "ST" / "SL" with at least one more character after that
+ *
+ */
+ boolean E_Silent_Suffix(int at)
+ {
+     // only applies while standing on the root's final letter
+     if(m_current != (at - 1))
+     {
+         return false;
+     }
+
+     // the suffix has to be at least two characters long
+     if(m_length <= (at + 1))
+     {
+         return false;
+     }
+
+     if(IsVowel(at + 1))
+     {
+         return true;
+     }
+
+     return StringAt(at, 2, "ST", "SL", "") && (m_length > (at + 2));
+ }
+
+ /**
+ * Detects word endings that cause the 'e' closing a root
+ * to be pronounced
+ *
+ * The suffix must run exactly from 'at' to the end of the word,
+ * so the candidate list is selected by the suffix length.
+ *
+ * @param at index of the first character after the root
+ * @return true if the rest of the word is one of the listed suffixes
+ *
+ */
+ boolean E_Pronouncing_Suffix(int at)
+ {
+     switch(m_length - at)
+     {
+         case 5:
+             // other vowels will get eaten up, so one is needed here
+             return StringAt(at, 5, "WATER", "WORTH", "");
+
+         case 4:
+             // e.g. 'bridgewood'
+             return StringAt(at, 4, "WOOD", "");
+
+         case 3:
+             // e.g. 'bridgette'
+             return StringAt(at, 3, "TTE", "LIA", "NOW", "ROS", "RAS", "");
+
+         case 2:
+             // e.g. 'olena'
+             return StringAt(at, 2, "TA", "TT", "NA", "NO", "NE",
+                             "RS", "RE", "LA", "AU", "RO", "RA", "");
+
+         case 1:
+             // e.g. 'bridget'
+             return StringAt(at, 1, "T", "R", "");
+
+         default:
+             return false;
+     }
+ }
+
+ /**
+ * Exceptions where 'E' is pronounced where it
+ * usually wouldn't be, and also some cases
+ * where 'LE' transposition rules don't apply
+ * and the vowel needs to be encoded here
+ *
+ * The condition has two parts:
+ * - checks that only apply when the current position is the
+ *   second to last letter ((m_current + 1) == m_last): the endings
+ *   "OCLES", "ACLES", "AKLES" around the current position, or a
+ *   word that begins with one of the listed names (lists are
+ *   grouped by length and matched from index 0)
+ * - checks that apply at any position: "FRED", "DGES", "DRED",
+ *   "GNES", "PROBLEM", "RESPLEN", "REPLEN" and "SPLE", each matched
+ *   at a fixed offset from the current position
+ *
+ * This routine only reads state; it does not add to the encoding
+ * and does not move m_current.
+ *
+ * @return true if 'E' pronounced
+ *
+ */
+ boolean E_Pronounced_Exceptions()
+ {
+ // greek names e.g. "herakles" or hispanic names e.g. "robles", where 'e' is pronounced, other exceptions
+ if((((m_current + 1) == m_last)
+ && (StringAt((m_current - 3), 5, "OCLES", "ACLES", "AKLES", "")
+ || StringAt(0, 4, "INES", "")
+ || StringAt(0, 5, "LOPES", "ESTES", "GOMES", "NUNES", "ALVES", "ICKES",
+ "INNES", "PERES", "WAGES", "NEVES", "BENES", "DONES", "")
+ || StringAt(0, 6, "CORTES", "CHAVES", "VALDES", "ROBLES", "TORRES", "FLORES", "BORGES",
+ "NIEVES", "MONTES", "SOARES", "VALLES", "GEDDES", "ANDRES", "VIAJES",
+ "CALLES", "FONTES", "HERMES", "ACEVES", "BATRES", "MATHES", "")
+ || StringAt(0, 7, "DELORES", "MORALES", "DOLORES", "ANGELES", "ROSALES", "MIRELES", "LINARES",
+ "PERALES", "PAREDES", "BRIONES", "SANCHES", "CAZARES", "REVELES", "ESTEVES",
+ "ALVARES", "MATTHES", "SOLARES", "CASARES", "CACERES", "STURGES", "RAMIRES",
+ "FUNCHES", "BENITES", "FUENTES", "PUENTES", "TABARES", "HENTGES", "VALORES", "")
+ || StringAt(0, 8, "GONZALES", "MERCEDES", "FAGUNDES", "JOHANNES", "GONSALES", "BERMUDES",
+ "CESPEDES", "BETANCES", "TERRONES", "DIOGENES", "CORRALES", "CABRALES",
+ "MARTINES", "GRAJALES", "")
+ || StringAt(0, 9, "CERVANTES", "FERNANDES", "GONCALVES", "BENEVIDES", "CIFUENTES", "SIFUENTES",
+ "SERVANTES", "HERNANDES", "BENAVIDES", "")
+ || StringAt(0, 10, "ARCHIMEDES", "CARRIZALES", "MAGALLANES", "")))
+ || StringAt(m_current - 2, 4, "FRED", "DGES", "DRED", "GNES", "")
+ || StringAt((m_current - 5), 7, "PROBLEM", "RESPLEN", "")
+ || StringAt((m_current - 4), 6, "REPLEN", "")
+ || StringAt((m_current - 3), 4, "SPLE", ""))
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Skips over a silent "-UE" following 'Q' or 'G',
+ * e.g. at the end of a word or in "jacques"
+ *
+ * When the "-UE" is judged silent, m_current is replaced by the
+ * result of SkipVowels(m_current) and nothing is added to the
+ * encoding.
+ *
+ * @return true if encoding handled in this routine, false if not
+ */
+ boolean Skip_Silent_UE()
+ {
+     // only "-QUE" and "-GUE" are candidates
+     if(!StringAt((m_current - 1), 3, "QUE", "GUE", ""))
+     {
+         return false;
+     }
+
+     // words in which the ending is pronounced;
+     // the '-que' cases are usually french but missing the acute accent
+     if(StringAt(0, 8, "BARBEQUE", "PALENQUE", "APPLIQUE", "")
+         || StringAt(0, 6, "RISQUE", "")
+         || StringAt((m_current - 3), 5, "ARGUE", "SEGUE", "")
+         || StringAt(0, 7, "PIROGUE", "ENRIQUE", "")
+         || StringAt(0, 10, "COMMUNIQUE", ""))
+     {
+         return false;
+     }
+
+     if(m_current <= 1)
+     {
+         return false;
+     }
+
+     // the 'E' has to end the word, except in "jacques"
+     if(((m_current + 1) != m_last) && !StringAt(0, 7, "JACQUES", ""))
+     {
+         return false;
+     }
+
+     m_current = SkipVowels(m_current);
+     return true;
+ }
+
+ /**
+ * Encodes 'B'
+ *
+ * Silent 'B' cases are delegated to Encode_Silent_B(). Otherwise
+ * 'B' is added as "B" (exact) or "P" (approximate), and a directly
+ * following 'B', or a 'P' that is not followed by 'H', is skipped
+ * together with the current letter.
+ *
+ */
+ void Encode_B()
+ {
+     if(Encode_Silent_B())
+     {
+         return;
+     }
+
+     // "-mb", e.g", "dumb", already skipped over under
+     // 'M', altho it should really be handled here...
+     MetaphAddExactApprox("B", "P");
+
+     int step = 1;
+     if(CharAt(m_current + 1) == 'B')
+     {
+         // doubled 'B'
+         step = 2;
+     }
+     else if((CharAt(m_current + 1) == 'P')
+             && (m_current + 1 < m_last)
+             && (CharAt(m_current + 2) != 'H'))
+     {
+         // "BP" not followed by 'H': the 'P' is redundant
+         step = 2;
+     }
+
+     m_current += step;
+ }
+
+ /**
+ * Encodes silent 'B' for cases not covered under "-mb-":
+ * the 'B' is dropped, the following 'T' is encoded and both
+ * letters are consumed
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Silent_B()
+ {
+     //'debt', 'doubt', 'subtle'
+     boolean silentBeforeT = StringAt((m_current - 2), 4, "DEBT", "")
+                             || StringAt((m_current - 2), 5, "SUBTL", "")
+                             || StringAt((m_current - 2), 6, "SUBTIL", "")
+                             || StringAt((m_current - 3), 5, "DOUBT", "");
+
+     if(!silentBeforeT)
+     {
+         return false;
+     }
+
+     MetaphAdd("T");
+     m_current += 2;
+     return true;
+ }
+
+ /**
+ * Encodes 'C'
+ *
+ * The specialised routines are tried in a fixed order and the first
+ * one that reports success ends the encoding of this letter.
+ * Otherwise 'C' becomes "K" (unless the previous letter is one of
+ * C, K, G, Q) and redundant following letters are skipped.
+ *
+ */
+ void Encode_C()
+ {
+     boolean handled = Encode_Silent_C_At_Beginning()
+                       || Encode_CA_To_S()
+                       || Encode_CO_To_S()
+                       || Encode_CH()
+                       || Encode_CCIA()
+                       || Encode_CC()
+                       || Encode_CK_CG_CQ()
+                       || Encode_C_Front_Vowel()
+                       || Encode_Silent_C()
+                       || Encode_CZ()
+                       || Encode_CS();
+     if(handled)
+     {
+         return;
+     }
+
+     // default encoding, suppressed right after another 'K' sounding letter
+     if(!StringAt((m_current - 1), 1, "C", "K", "G", "Q", ""))
+     {
+         MetaphAdd("K");
+     }
+
+     //name sent in 'mac caffrey', 'mac gregor
+     if(StringAt((m_current + 1), 2, " C", " Q", " G", ""))
+     {
+         m_current += 2;
+         return;
+     }
+
+     boolean redundantFollower = StringAt((m_current + 1), 1, "C", "K", "Q", "")
+                                 && !StringAt((m_current + 1), 2, "CE", "CI", "");
+     if(!redundantFollower)
+     {
+         m_current++;
+         return;
+     }
+
+     m_current += 2;
+
+     // account for combinations such as Ro-ckc-liffe
+     // NOTE(review): the first test looks at the new m_current while the
+     // "CE"/"CI" test looks one position further on; behaviour is kept as
+     // is - confirm whether the same offset was intended for both
+     if(StringAt((m_current), 1, "C", "K", "Q", "")
+         && !StringAt((m_current + 1), 2, "CE", "CI", ""))
+     {
+         m_current++;
+     }
+ }
+
+ /**
+ * Encodes cases where 'C' is silent at beginning of word:
+ * an initial "CT" or "CN". The 'C' is skipped without adding
+ * anything to the encoding.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Silent_C_At_Beginning()
+ {
+     // only relevant for the very first letter of the word
+     if((m_current != 0) || !StringAt(m_current, 2, "CT", "CN", ""))
+     {
+         return false;
+     }
+
+     m_current += 1;
+     return true;
+ }
+
+
+ /**
+ * Encodes exceptions where "-CA-" should encode to S
+ * instead of K including cases where the cedilla has not been used
+ *
+ * On a match "S" is added and the position is moved on with
+ * AdvanceCounter(2, 1).
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_CA_To_S()
+ {
+     // Special case: 'caesar'.
+     boolean caesarLike = (m_current == 0)
+                          && StringAt(m_current, 4, "CAES", "CAEC", "CAEM", "");
+
+     // Also, where cedilla not used, as in "linguica" => LNKS
+     boolean softC = caesarLike
+                     || StringAt(0, 8, "FRANCAIS", "FRANCAIX", "LINGUICA", "")
+                     || StringAt(0, 6, "FACADE", "")
+                     || StringAt(0, 9, "GONCALVES", "PROVENCAL", "");
+
+     if(!softC)
+     {
+         return false;
+     }
+
+     MetaphAdd("S");
+     AdvanceCounter(2, 1);
+     return true;
+ }
+
+ /**
+ * Encodes exceptions where "-CO-" encodes to S instead of K
+ * including cases where the cedilla has not been used
+ *
+ * On a match "S" is added and the position is moved on with
+ * AdvanceCounter(3, 1).
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_CO_To_S()
+ {
+     // e.g. 'coelecanth' => SLKN0
+     boolean coel = StringAt(m_current, 4, "COEL", "")
+                    && (IsVowel(m_current + 4) || ((m_current + 3) == m_last));
+
+     boolean softC = coel
+                     || StringAt(m_current, 5, "COENA", "COENO", "")
+                     || StringAt(0, 8, "FRANCOIS", "MELANCON", "")
+                     || StringAt(0, 6, "GARCON", "");
+
+     if(!softC)
+     {
+         return false;
+     }
+
+     MetaphAdd("S");
+     AdvanceCounter(3, 1);
+     return true;
+ }
+
+ /**
+ * Encode "-CH-"
+ *
+ * The specialised "CH" routines are tried first, in a fixed order.
+ * When none of them applies, "CH" falls back to "X" at the start of
+ * the word, "K" directly after an initial "MC", and "X" with
+ * alternate "K" everywhere else.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_CH()
+ {
+     if(!StringAt(m_current, 2, "CH", ""))
+     {
+         return false;
+     }
+
+     // Encode_CH_To_X() should be called before the germanic
+     // and greek encoding functions
+     boolean handled = Encode_CHAE()
+                       || Encode_CH_To_H()
+                       || Encode_Silent_CH()
+                       || Encode_ARCH()
+                       || Encode_CH_To_X()
+                       || Encode_English_CH_To_K()
+                       || Encode_Germanic_CH_To_K()
+                       || Encode_Greek_CH_Initial()
+                       || Encode_Greek_CH_Non_Initial();
+     if(handled)
+     {
+         return true;
+     }
+
+     if(m_current <= 0)
+     {
+         MetaphAdd("X");
+     }
+     else if(StringAt(0, 2, "MC", "") && (m_current == 1))
+     {
+         //e.g., "McHugh"
+         MetaphAdd("K");
+     }
+     else
+     {
+         MetaphAdd("X", "K");
+     }
+
+     m_current += 2;
+     return true;
+ }
+
+ /**
+ * Encodes "-CHAE-" when it is not at the start of the word
+ *
+ * "RACHAEL" gets "X"; any other word gets "K" unless the letter
+ * before the "CH" is one of C, K, G, Q. The position is then moved
+ * on with AdvanceCounter(4, 2).
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_CHAE()
+ {
+     // e.g. 'michael'
+     if((m_current <= 0) || !StringAt((m_current + 2), 2, "AE", ""))
+     {
+         return false;
+     }
+
+     if(StringAt(0, 7, "RACHAEL", ""))
+     {
+         MetaphAdd("X");
+     }
+     else if(!StringAt((m_current - 1), 1, "C", "K", "G", "Q", ""))
+     {
+         MetaphAdd("K");
+     }
+
+     AdvanceCounter(4, 2);
+     return true;
+ }
+
+ /**
+ * Encodes transliterations from the hebrew where the
+ * sound 'kh' is represented as "-CH-". The normal pronunciation
+ * of this in english is either 'h' or 'kh', and alternate
+ * spellings most often use "-H-"
+ *
+ * The hebrew lists are only consulted for a word-initial "CH" and
+ * are matched against the letters that follow it. On a match "H" is
+ * added and the position is moved on with AdvanceCounter(3, 2).
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_CH_To_H()
+ {
+     // hebrew => 'H', e.g. 'channukah', 'chasid'
+     if(((m_current == 0)
+         && (StringAt((m_current + 2), 3, "AIM", "ETH", "ELM", "")
+             || StringAt((m_current + 2), 4, "ASID", "AZAN", "")
+             || StringAt((m_current + 2), 5, "UPPAH", "UTZPA", "ALLAH", "ALUTZ", "AMETZ", "")
+             // "ALLOTH" covers 'challoth'; the triple-L "ALLLOTH" in the
+             // seven letter list below appears to be a misspelling of it and
+             // is only kept so that existing matches do not change
+             || StringAt((m_current + 2), 6, "ESHVAN", "ADARIM", "ANUKAH", "ALLOTH", "")
+             || StringAt((m_current + 2), 7, "ALLLOTH", "ANNUKAH", "AROSETH", "")))
+         // and an irish name with the same encoding
+         || StringAt((m_current - 3), 7, "CLACHAN", ""))
+     {
+         MetaphAdd("H");
+         AdvanceCounter(3, 2);
+         return true;
+     }
+
+     return false;
+ }
+
+ /**
+ * Encodes cases where "-CH-" is not pronounced: the two letters
+ * are skipped and nothing is added to the encoding
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Silent_CH()
+ {
+     // "drachm" has a silent '-ch-', "drachma" does not
+     boolean silent = StringAt((m_current - 2), 7, "FUCHSIA", "")
+                      || StringAt((m_current - 2), 5, "YACHT", "")
+                      || StringAt(0, 8, "STRACHAN", "")
+                      || StringAt(0, 8, "CRICHTON", "")
+                      || (StringAt((m_current - 3), 6, "DRACHM", "")
+                          && !StringAt((m_current - 3), 7, "DRACHMA", ""));
+
+     if(!silent)
+     {
+         return false;
+     }
+
+     m_current += 2;
+     return true;
+ }
+
+ /**
+ * Encodes "-CH-" to X
+ * English language patterns
+ *
+ * Every pattern is matched at a fixed offset from the current
+ * position, except "MACHAU", which is matched at the start of the
+ * word. Several patterns also require the "CH" to sit at a
+ * particular distance from the end of the word (comparisons
+ * against m_last).
+ * On a match "X" is added and m_current moves past the two letters.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_CH_To_X()
+ {
+ // e.g. 'approach', 'beach'
+ if((StringAt((m_current - 2), 4, "OACH", "EACH", "EECH", "OUCH", "OOCH", "MUCH", "SUCH", "")
+ && !StringAt((m_current - 3), 5, "JOACH", ""))
+ // e.g. 'dacha', 'macho'
+ || (((m_current + 2) == m_last ) && StringAt((m_current - 1), 4, "ACHA", "ACHO", ""))
+ || (StringAt(m_current, 4, "CHOT", "CHOD", "CHAT", "") && ((m_current + 3) == m_last))
+ || ((StringAt((m_current - 1), 4, "OCHE", "") && ((m_current + 2) == m_last))
+ && !StringAt((m_current - 2), 5, "DOCHE", ""))
+ || StringAt((m_current - 4), 6, "ATTACH", "DETACH", "KOVACH", "")
+ || StringAt((m_current - 5), 7, "SPINACH", "")
+ || StringAt(0, 6, "MACHAU", "")
+ || StringAt((m_current - 4), 8, "PARACHUT", "")
+ || StringAt((m_current - 5), 8, "MASSACHU", "")
+ || (StringAt((m_current - 3), 5, "THACH", "") && !StringAt((m_current - 1), 4, "ACHE", ""))
+ || StringAt((m_current - 2), 6, "VACHON", "") )
+ {
+ MetaphAdd("X");
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encodes "-CH-" to K in contexts of
+ * initial "A" or "E" followed by "CH", plus a few
+ * individually listed words
+ *
+ * On a match "K" is added with alternate "X" and m_current moves
+ * past the two letters.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_English_CH_To_K()
+ {
+     // 'ache' and its inflections as a word of its own
+     boolean chIsK = (m_current == 1) && RootOrInflections(m_inWord, "ACHE");
+
+     // '-ache' at the end of a compound such as 'earache', 'headache'
+     if(!chIsK
+         && (m_current > 3)
+         && RootOrInflections(m_inWord.substring(m_current - 1), "ACHE"))
+     {
+         chIsK = StringAt(0, 3, "EAR", "")
+                 || StringAt(0, 4, "HEAD", "BACK", "")
+                 || StringAt(0, 5, "HEART", "BELLY", "TOOTH", "");
+     }
+
+     // 'echo', alternate spelling of 'michael', and other listed words
+     if(!chIsK)
+     {
+         chIsK = StringAt((m_current - 1), 4, "ECHO", "")
+                 || StringAt((m_current - 2), 7, "MICHEAL", "")
+                 || StringAt((m_current - 4), 7, "JERICHO", "")
+                 || StringAt((m_current - 5), 7, "LEPRECH", "");
+     }
+
+     if(!chIsK)
+     {
+         return false;
+     }
+
+     MetaphAdd("K", "X");
+     m_current += 2;
+     return true;
+ }
+
+ /**
+ * Encodes "-CH-" to K in mostly germanic context
+ * of internal "-ACH-", with exceptions
+ *
+ * The condition is one long disjunction of independent patterns:
+ * "-ACH-" after a non-vowel (with listed exclusions and a
+ * restriction on a following 'I' or 'E'), "CH" followed by 'T' or
+ * 'S', words starting with "SCHR", and a number of listed letter
+ * groups, several of which must be at the end of the word
+ * ((m_current + 1) == m_last).
+ *
+ * On a match plain "K" is added when the letter after "CH" is 'R'
+ * or 'L' or when SlavoGermanic() returns true; otherwise "K" is
+ * added with alternate "X". In both cases m_current moves past the
+ * two letters.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Germanic_CH_To_K()
+ {
+ // various germanic
+ // "CH-"implies a german word where 'ch' => K
+ if(((m_current > 1)
+ && !IsVowel(m_current - 2)
+ && StringAt((m_current - 1), 3, "ACH", "")
+ && !StringAt((m_current - 2), 7, "MACHADO", "MACHUCA", "LACHANC", "LACHAPE", "KACHATU", "")
+ && !StringAt((m_current - 3), 7, "KHACHAT", "")
+ && ((CharAt(m_current + 2) != 'I')
+ && ((CharAt(m_current + 2) != 'E')
+ || StringAt((m_current - 2), 6, "BACHER", "MACHER", "MACHEN", "LACHER", "")) )
+ // e.g. 'brecht', 'fuchs'
+ || (StringAt((m_current + 2), 1, "T", "S", "")
+ && !(StringAt(0, 11, "WHICHSOEVER", "") || StringAt(0, 9, "LUNCHTIME", "") ))
+ // words starting with "SCHR", and e.g. 'andromache'
+ || StringAt(0, 4, "SCHR", "")
+ || ((m_current > 2) && StringAt((m_current - 2), 5, "MACHE", ""))
+ || ((m_current == 2) && StringAt((m_current - 2), 4, "ZACH", ""))
+ || StringAt((m_current - 4), 6, "SCHACH", "")
+ || StringAt((m_current - 1), 5, "ACHEN", "")
+ || StringAt((m_current - 3), 5, "SPICH", "ZURCH", "BUECH", "")
+ || (StringAt((m_current - 3), 5, "KIRCH", "JOACH", "BLECH", "MALCH", "")
+ // "kirch" and "blech" both get 'X'
+ && !(StringAt((m_current - 3), 8, "KIRCHNER", "") || ((m_current + 1) == m_last)))
+ || (((m_current + 1) == m_last) && StringAt((m_current - 2), 4, "NICH", "LICH", "BACH", ""))
+ || (((m_current + 1) == m_last)
+ && StringAt((m_current - 3), 5, "URICH", "BRICH", "ERICH", "DRICH", "NRICH", "")
+ && !StringAt((m_current - 5), 7, "ALDRICH", "")
+ && !StringAt((m_current - 6), 8, "GOODRICH", "")
+ && !StringAt((m_current - 7), 9, "GINGERICH", "")))
+ || (((m_current + 1) == m_last) && StringAt((m_current - 4), 6, "ULRICH", "LFRICH", "LLRICH",
+ "EMRICH", "ZURICH", "EYRICH", ""))
+ // e.g., 'wachtler', 'wechsler', but not 'tichner'
+ || ((StringAt((m_current - 1), 1, "A", "O", "U", "E", "") || (m_current == 0))
+ && StringAt((m_current + 2), 1, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ", "")))
+ {
+ // "CHR/L-" e.g. 'chris' do not get
+ // alt pronunciation of 'X'
+ if(StringAt((m_current + 2), 1, "R", "L", "")
+ || SlavoGermanic())
+ {
+ MetaphAdd("K");
+ }
+ else
+ {
+ MetaphAdd("K", "X");
+ }
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-ARCH-". Some occurrences are from greek roots and therefore encode
+ * to 'K', others are from english words and therefore encode to 'X'
+ *
+ * Once "ARCH" is found with its "CH" at the current position this
+ * routine always handles it: it adds either "K" with alternate "X"
+ * (greek combining forms) or plain "X" (everything else), moves
+ * m_current past the two letters and returns true.
+ *
+ * The greek branch needs one of the combining-form patterns to match
+ * and none of the english exclusions (inflections of "ARCH", "SEARCH",
+ * "POARCH", the listed names, and the LARCH / MARCH / PARCH / STARCH
+ * group with its own counter-exceptions).
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_ARCH()
+ {
+ if(StringAt((m_current - 2), 4, "ARCH", ""))
+ {
+ // "-ARCH-" has many combining forms where "-CH-" => K because of its
+ // derivation from the greek
+ if(((IsVowel(m_current + 2) && StringAt((m_current - 2), 5, "ARCHA", "ARCHI", "ARCHO", "ARCHU", "ARCHY", ""))
+ || StringAt((m_current - 2), 6, "ARCHEA", "ARCHEG", "ARCHEO", "ARCHET", "ARCHEL", "ARCHES", "ARCHEP",
+ "ARCHEM", "ARCHEN", "")
+ || (StringAt((m_current - 2), 4, "ARCH", "") && (((m_current + 1) == m_last)))
+ || StringAt(0, 7, "MENARCH", ""))
+ && (!RootOrInflections(m_inWord, "ARCH")
+ && !StringAt((m_current - 4), 6, "SEARCH", "POARCH", "")
+ && !StringAt(0, 9, "ARCHENEMY", "ARCHIBALD", "ARCHULETA", "ARCHAMBAU", "")
+ && !StringAt(0, 6, "ARCHER", "ARCHIE", "")
+ && !((((StringAt((m_current - 3), 5, "LARCH", "MARCH", "PARCH", "")
+ || StringAt((m_current - 4), 6, "STARCH", ""))
+ && !(StringAt(0, 6, "EPARCH", "")
+ || StringAt(0, 7, "NOMARCH", "")
+ || StringAt(0, 8, "EXILARCH", "HIPPARCH", "MARCHESE", "")
+ || StringAt(0, 9, "ARISTARCH", "")
+ || StringAt(0, 9, "MARCHETTI", "")) )
+ || RootOrInflections(m_inWord, "STARCH"))
+ && (!StringAt((m_current - 2), 5, "ARCHU", "ARCHY", "")
+ || StringAt(0, 7, "STARCHY", "")))))
+ {
+ MetaphAdd("K", "X");
+ }
+ else
+ {
+ MetaphAdd("X");
+ }
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-CH-" to K when from greek roots
+ *
+ * Covers roots that begin with the "CH" at the current position.
+ * The condition is a group of inclusion patterns (each with its own
+ * local exclusions) followed by a final group of exclusions for
+ * words where "-CH-" stays 'X'.
+ *
+ * On a match plain "K" is added when the letter after "CH" is 'R'
+ * or 'L'; otherwise "K" is added with alternate "X". In both cases
+ * m_current moves past the two letters.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Greek_CH_Initial()
+ {
+ // greek roots e.g. 'chemistry', 'chorus', ch at beginning of root
+ if((StringAt(m_current, 6, "CHAMOM", "CHARAC", "CHARIS", "CHARTO", "CHARTU", "CHARYB", "CHRIST", "CHEMIC", "CHILIA", "")
+ || (StringAt(m_current, 5, "CHEMI", "CHEMO", "CHEMU", "CHEMY", "CHOND", "CHONA", "CHONI", "CHOIR", "CHASM",
+ "CHARO", "CHROM", "CHROI", "CHAMA", "CHALC", "CHALD", "CHAET","CHIRO", "CHILO", "CHELA", "CHOUS",
+ "CHEIL", "CHEIR", "CHEIM", "CHITI", "CHEOP", "")
+ && !(StringAt(m_current, 6, "CHEMIN", "") || StringAt((m_current - 2), 8, "ANCHONDO", "")))
+ || (StringAt(m_current, 5, "CHISM", "CHELI", "")
+ // exclude spanish "machismo"
+ && !(StringAt(0, 8, "MACHISMO", "")
+ // exclude some french words
+ || StringAt(0, 10, "REVANCHISM", "")
+ || StringAt(0, 9, "RICHELIEU", "")
+ || (StringAt(0, 5, "CHISM", "") && (m_length == 5))
+ || StringAt(0, 6, "MICHEL", "")))
+ // include e.g. "chorus", "chyme", "chaos"
+ || (StringAt(m_current, 4, "CHOR", "CHOL", "CHYM", "CHYL", "CHLO", "CHOS", "CHUS", "CHOE", "")
+ && !StringAt(0, 6, "CHOLLO", "CHOLLA", "CHORIZ", ""))
+ // "chaos" => K but not "chao"
+ || (StringAt(m_current, 4, "CHAO", "") && ((m_current + 3) != m_last))
+ // e.g. "abranchiate"
+ || (StringAt(m_current, 4, "CHIA", "") && !(StringAt(0, 10, "APPALACHIA", "") || StringAt(0, 7, "CHIAPAS", "")))
+ // e.g. "chimera"
+ || StringAt(m_current, 7, "CHIMERA", "CHIMAER", "CHIMERI", "")
+ // e.g. "chameleon"
+ || ((m_current == 0) && StringAt(m_current, 5, "CHAME", "CHELO", "CHITO", "") )
+ // e.g. "spirochete"
+ || ((((m_current + 4) == m_last) || ((m_current + 5) == m_last)) && StringAt((m_current - 1), 6, "OCHETE", "")))
+ // more exceptions where "-CH-" => X e.g. "chortle", "crocheter"
+ && !((StringAt(0, 5, "CHORE", "CHOLO", "CHOLA", "") && (m_length == 5))
+ || StringAt(m_current, 5, "CHORT", "CHOSE", "")
+ || StringAt((m_current - 3), 7, "CROCHET", "")
+ || StringAt(0, 7, "CHEMISE", "CHARISE", "CHARISS", "CHAROLE", "")) )
+ {
+ // "CHR/L-" e.g. 'christ', 'chlorine' do not get
+ // alt pronunciation of 'X'
+ if(StringAt((m_current + 2), 1, "R", "L", ""))
+ {
+ MetaphAdd("K");
+ }
+ else
+ {
+ MetaphAdd("K", "X");
+ }
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode a variety of greek and some german roots where "-CH-" => K
+ *
+ * Covers roots where the "CH" is in the middle or at the end, so
+ * most patterns are matched at a negative offset from the current
+ * position. A few patterns also require a particular value of
+ * m_current, i.e. a fixed position of "CH" within the word.
+ * The last pattern accepts any word-final "CH" preceded by
+ * A, O, U or E, apart from the listed exclusions.
+ *
+ * On a match "K" is added with alternate "X" and m_current moves
+ * past the two letters.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Greek_CH_Non_Initial()
+ {
+ //greek & other roots e.g. 'tachometer', 'orchid', ch in middle or end of root
+ if(StringAt((m_current - 2), 6, "ORCHID", "NICHOL", "MECHAN", "LICHEN", "MACHIC", "PACHEL", "RACHIF", "RACHID",
+ "RACHIS", "RACHIC", "MICHAL", "")
+ || StringAt((m_current - 3), 5, "MELCH", "GLOCH", "TRACH", "TROCH", "BRACH", "SYNCH", "PSYCH",
+ "STICH", "PULCH", "EPOCH", "")
+ || (StringAt((m_current - 3), 5, "TRICH", "") && !StringAt((m_current - 5), 7, "OSTRICH", ""))
+ || (StringAt((m_current - 2), 4, "TYCH", "TOCH", "BUCH", "MOCH", "CICH", "DICH", "NUCH", "EICH", "LOCH",
+ "DOCH", "ZECH", "WYCH", "")
+ && !(StringAt((m_current - 4), 9, "INDOCHINA", "") || StringAt((m_current - 2), 6, "BUCHON", "")))
+ || StringAt((m_current - 2), 5, "LYCHN", "TACHO", "ORCHO", "ORCHI", "LICHO", "")
+ || (StringAt((m_current - 1), 5, "OCHER", "ECHIN", "ECHID", "") && ((m_current == 1) || (m_current == 2)))
+ || StringAt((m_current - 4), 6, "BRONCH", "STOICH", "STRYCH", "TELECH", "PLANCH", "CATECH", "MANICH", "MALACH",
+ "BIANCH", "DIDACH", "")
+ || (StringAt((m_current - 1), 4, "ICHA", "ICHN","") && (m_current == 1))
+ || StringAt((m_current - 2), 8, "ORCHESTR", "")
+ || StringAt((m_current - 4), 8, "BRANCHIO", "BRANCHIF", "")
+ || (StringAt((m_current - 1), 5, "ACHAB", "ACHAD", "ACHAN", "ACHAZ", "")
+ && !StringAt((m_current - 2), 7, "MACHADO", "LACHANC", ""))
+ || StringAt((m_current - 1), 6, "ACHISH", "ACHILL", "ACHAIA", "ACHENE", "")
+ || StringAt((m_current - 1), 7, "ACHAIAN", "ACHATES", "ACHIRAL", "ACHERON", "")
+ || StringAt((m_current - 1), 8, "ACHILLEA", "ACHIMAAS", "ACHILARY", "ACHELOUS", "ACHENIAL", "ACHERNAR", "")
+ || StringAt((m_current - 1), 9, "ACHALASIA", "ACHILLEAN", "ACHIMENES", "")
+ || StringAt((m_current - 1), 10, "ACHIMELECH", "ACHITOPHEL", "")
+ // e.g. 'inchoate'
+ || (((m_current - 2) == 0) && (StringAt((m_current - 2), 6, "INCHOA", "")
+ // e.g. 'ischemia'
+ || StringAt(0, 4, "ISCH", "")) )
+ // e.g. 'ablimelech', 'antioch', 'pentateuch'
+ || (((m_current + 1) == m_last) && StringAt((m_current - 1), 1, "A", "O", "U", "E", "")
+ && !(StringAt(0, 7, "DEBAUCH", "")
+ || StringAt((m_current - 2), 4, "MUCH", "SUCH", "KOCH", "")
+ || StringAt((m_current - 5), 7, "OODRICH", "ALDRICH", ""))))
+ {
+ MetaphAdd("K", "X");
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encodes reliably italian "-CCIA-": "X" with alternate "S";
+ * m_current is moved on by two
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_CCIA()
+ {
+     //e.g., 'focaccia'
+     if(!StringAt((m_current + 1), 3, "CIA", ""))
+     {
+         return false;
+     }
+
+     MetaphAdd("X", "S");
+     m_current += 2;
+     return true;
+ }
+
+ /**
+ * Encode "-CC-"
+ *
+ * Not applied to the "CC" of an initial "MCC", e.g. 'McClellan'.
+ * Every other "CC" is handled here, in this order:
+ * - "FLACCID" => "S"
+ * - italian endings ("-CCI" at the end of the word, "-CCIO",
+ *   "-CCINO" / "-CCINI" at the end of the word) => "X"
+ * - "CC" before 'I', 'E' or 'Y', except in "SOCCER" => "KS"
+ * - anything else => "K"
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_CC()
+ {
+     //double 'C', but not if e.g. 'McClellan'
+     if(!StringAt(m_current, 2, "CC", "")
+         || ((m_current == 1) && (CharAt(0) == 'M')))
+     {
+         return false;
+     }
+
+     // exception
+     if(StringAt((m_current - 3), 7, "FLACCID", ""))
+     {
+         MetaphAdd("S");
+         AdvanceCounter(3, 2);
+         return true;
+     }
+
+     //'bacci', 'bertucci', other italian
+     if((((m_current + 2) == m_last) && StringAt((m_current + 2), 1, "I", ""))
+         || StringAt((m_current + 2), 2, "IO", "")
+         || (((m_current + 4) == m_last) && StringAt((m_current + 2), 3, "INO", "INI", "")))
+     {
+         MetaphAdd("X");
+         AdvanceCounter(3, 2);
+         return true;
+     }
+
+     //'accident', 'accede' 'succeed', except 'soccer' which gets K.
+     // A "CC" followed by 'H' ('bellocchio', 'bacchus') fails the
+     // I/E/Y test, so it reaches the "K" branch without a separate
+     // check; the former test for 'H' here could never be true.
+     if(StringAt((m_current + 2), 1, "I", "E", "Y", "")
+         && !StringAt((m_current - 2), 6, "SOCCER", ""))
+     {
+         MetaphAdd("KS");
+         AdvanceCounter(3, 2);
+         return true;
+     }
+
+     //Pierce's rule
+     MetaphAdd("K");
+     m_current += 2;
+     return true;
+ }
+
+ /**
+ * Encode cases where the consonant following "C" is redundant:
+ * "CK", "CG" and "CQ" are encoded as a single "K"
+ *
+ * A word-final "-CKI" / "-CKY" in a word longer than six letters
+ * additionally gets the alternate encoding "SK". One further
+ * 'K', 'G' or 'Q' directly after the pair is skipped as well.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_CK_CG_CQ()
+ {
+     if(!StringAt(m_current, 2, "CK", "CG", "CQ", ""))
+     {
+         return false;
+     }
+
+     // eastern european spelling e.g. 'gorecki' == 'goresky'
+     boolean easternEuropean = StringAt(m_current, 3, "CKI", "CKY", "")
+                               && ((m_current + 2) == m_last)
+                               && (m_length > 6);
+     if(easternEuropean)
+     {
+         MetaphAdd("K", "SK");
+     }
+     else
+     {
+         MetaphAdd("K");
+     }
+
+     m_current += 2;
+     if(StringAt(m_current, 1, "K", "G", "Q", ""))
+     {
+         m_current++;
+     }
+
+     return true;
+ }
+
+ /**
+ * Encode cases where "C" precedes a front vowel such as "E", "I", or "Y".
+ * These cases most likely => S or X
+ *
+ * The specialised routines add their own encoding when they apply;
+ * when none of them does, plain "S" is added. Either way the
+ * position is then moved on with AdvanceCounter(2, 1).
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_C_Front_Vowel()
+ {
+     if(!StringAt(m_current, 2, "CI", "CE", "CY", ""))
+     {
+         return false;
+     }
+
+     boolean specialCase = Encode_British_Silent_CE()
+                           || Encode_CE()
+                           || Encode_CI()
+                           || Encode_Latinate_Suffixes();
+     if(!specialCase)
+     {
+         MetaphAdd("S");
+     }
+
+     AdvanceCounter(2, 1);
+     return true;
+ }
+
+ /**
+ * Detects the silent "-CE-" of english place names such as
+ * 'gloucester', pronounced glo-ster. Nothing is added to the
+ * encoding and m_current is not changed here.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_British_Silent_CE()
+ {
+     // "-cester" at the very end of the word
+     if(StringAt((m_current + 1), 5, "ESTER", "") && ((m_current + 5) == m_last))
+     {
+         return true;
+     }
+
+     // "-cestershire" anywhere
+     return StringAt((m_current + 1), 10, "ESTERSHIRE", "");
+ }
+
+ /**
+ * Encodes "-CE-" contexts where the 'C' is pronounced 'sh' or 'ch':
+ * "X" is added with alternate "S". m_current is not changed here.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_CE()
+ {
+     // e.g. 'ocean'
+     boolean softened = (StringAt((m_current + 1), 3, "EAN", "") && IsVowel(m_current - 1))
+                        // e.g. 'rosacea'
+                        || (StringAt((m_current - 1), 4, "ACEA", "")
+                            && ((m_current + 2) == m_last)
+                            && !StringAt(0, 7, "PANACEA", ""))
+                        // e.g. 'botticelli', 'concerto'
+                        || StringAt((m_current + 1), 4, "ELLI", "ERTO", "EORL", "")
+                        // some italian names familiar to americans
+                        || (StringAt((m_current - 3), 5, "CROCE", "") && ((m_current + 1) == m_last))
+                        || StringAt((m_current - 3), 5, "DOLCE", "")
+                        // e.g. 'cello'
+                        || (StringAt((m_current + 1), 4, "ELLO", "")
+                            && ((m_current + 4) == m_last));
+
+     if(!softened)
+     {
+         return false;
+     }
+
+     MetaphAdd("X", "S");
+     return true;
+ }
+
+ /**
+ * Encodes "-CI-" contexts where the 'C' may be pronounced 'sh' or 'ch'
+ *
+ * Three cases are handled, in this order:
+ * - italian and latinate patterns such as "-CINI", "-ICI", "-RCIAL"
+ *   => "X" with alternate "S"
+ * - "CIO" / "CIE" / "CIA" after a vowel, or "CIAO" (but not in
+ *   "COERCION") => "X" with alternate "S" for the listed endings
+ *   unless one of the listed exceptions applies, otherwise "S" with
+ *   alternate "X"
+ * - "COERCION" => "J"
+ *
+ * Only the encoding is added here; m_current is not changed.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_CI()
+ {
+     // with consonant before C
+     // e.g. 'fettucini', but exception for the americanized pronunciation of 'mancini'
+     if(((StringAt((m_current + 1), 3, "INI", "") && !StringAt(0, 7, "MANCINI", "")) && ((m_current + 3) == m_last))
+         // e.g. 'medici'
+         || (StringAt((m_current - 1), 3, "ICI", "") && ((m_current + 1) == m_last))
+         // e.g. "commercial', 'provincial', 'cistercian'
+         || StringAt((m_current - 1), 5, "RCIAL", "NCIAL", "RCIAN", "UCIUS", "")
+         // special cases
+         || StringAt((m_current - 3), 6, "MARCIA", "")
+         || StringAt((m_current - 2), 7, "ANCIENT", ""))
+     {
+         MetaphAdd("X", "S");
+         return true;
+     }
+
+     // with vowel before C (or at beginning?)
+     if(((StringAt(m_current, 3, "CIO", "CIE", "CIA", "")
+             && IsVowel(m_current - 1))
+         // e.g. "ciao"
+         || StringAt((m_current + 1), 3, "IAO", ""))
+         && !StringAt((m_current - 4), 8, "COERCION", ""))
+     {
+         if((StringAt(m_current, 4, "CIAN", "CIAL", "CIAO", "CIES", "CIOL", "CION", "")
+             // exception - "glacier" => 'X' but "spacier" = > 'S'
+             || StringAt((m_current - 3), 7, "GLACIER", "")
+             || StringAt(m_current, 5, "CIENT", "CIENC", "CIOUS", "CIATE", "CIATI", "CIATO", "CIABL", "CIARY", "")
+             || (((m_current + 2) == m_last) && StringAt(m_current, 3, "CIA", "CIO", ""))
+             // word-final "-CIAS" / "-CIOS": the window has to be four letters
+             // wide to be able to match these four letter endings
+             || (((m_current + 3) == m_last) && StringAt(m_current, 4, "CIAS", "CIOS", "")))
+             // exceptions
+             && !(StringAt((m_current - 4), 11, "ASSOCIATION", "")
+                 || StringAt(0, 4, "OCIE", "")
+                 // exceptions mostly because these names are usually from
+                 // the spanish rather than the italian in america
+                 || StringAt((m_current - 2), 5, "LUCIO", "")
+                 || StringAt((m_current - 2), 6, "MACIAS", "")
+                 || StringAt((m_current - 3), 6, "GRACIE", "GRACIA", "")
+                 || StringAt((m_current - 2), 7, "LUCIANO", "")
+                 || StringAt((m_current - 3), 8, "MARCIANO", "")
+                 || StringAt((m_current - 4), 7, "PALACIO", "")
+                 || StringAt((m_current - 4), 9, "FELICIANO", "")
+                 || StringAt((m_current - 5), 8, "MAURICIO", "")
+                 || StringAt((m_current - 7), 11, "ENCARNACION", "")
+                 || StringAt((m_current - 4), 8, "POLICIES", "")
+                 || StringAt((m_current - 2), 8, "HACIENDA", "")
+                 || StringAt((m_current - 6), 9, "ANDALUCIA", "")
+                 || StringAt((m_current - 2), 5, "SOCIO", "SOCIE", "")))
+         {
+             MetaphAdd("X", "S");
+         }
+         else
+         {
+             MetaphAdd("S", "X");
+         }
+
+         return true;
+     }
+
+     // exception
+     if(StringAt((m_current - 4), 8, "COERCION", ""))
+     {
+         MetaphAdd("J");
+         return true;
+     }
+
+     return false;
+ }
+
+ /**
+ * Encodes 'C' before the latinate suffixes "-EOUS" and "-IOUS":
+ * "X" is added with alternate "S". m_current is not changed here.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Latinate_Suffixes()
+ {
+     if(!StringAt((m_current + 1), 4, "EOUS", "IOUS", ""))
+     {
+         return false;
+     }
+
+     MetaphAdd("X", "S");
+     return true;
+ }
+
+ /**
+ * Encodes some exceptions where "C" is silent before 'T' or 'S':
+ * the 'C' is skipped and nothing is added to the encoding
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Silent_C()
+ {
+     // only a 'C' directly before 'T' or 'S' is a candidate
+     if(!StringAt((m_current + 1), 1, "T", "S", ""))
+     {
+         return false;
+     }
+
+     boolean knownWord = StringAt(0, 11, "CONNECTICUT", "")
+                         || StringAt(0, 6, "INDICT", "TUCSON", "");
+     if(!knownWord)
+     {
+         return false;
+     }
+
+     m_current++;
+     return true;
+ }
+
+ /**
+ * Encodes slavic spellings or transliterations
+ * written as "-CZ-"
+ *
+ * "CZAR" gets "S", every other "CZ" gets "X"; "ECZEMA" is left
+ * for other routines. m_current is moved on by two.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_CZ()
+ {
+     if(!StringAt((m_current + 1), 1, "Z", "")
+         || StringAt((m_current - 1), 6, "ECZEMA", ""))
+     {
+         return false;
+     }
+
+     // anything other than "czar" is most likely a czech word...
+     String code = "X";
+     if(StringAt(m_current, 4, "CZAR", ""))
+     {
+         code = "S";
+     }
+
+     MetaphAdd(code);
+     m_current += 2;
+     return true;
+ }
+
+ /**
+ * "-CS" special cases
+ *
+ * "KOVACS" gets "KS" with alternate "X"; any other word-final
+ * "-ACS" except "ISAACS" gets "X". m_current is moved on by two.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_CS()
+ {
+     // give an 'etymological' 2nd
+     // encoding for "kovacs" so
+     // that it matches "kovach"
+     if(StringAt(0, 6, "KOVACS", ""))
+     {
+         MetaphAdd("KS", "X");
+         m_current += 2;
+         return true;
+     }
+
+     boolean finalAcs = StringAt((m_current - 1), 3, "ACS", "")
+                        && ((m_current + 1) == m_last)
+                        && !StringAt((m_current - 4), 6, "ISAACS", "");
+     if(!finalAcs)
+     {
+         return false;
+     }
+
+     MetaphAdd("X");
+     m_current += 2;
+     return true;
+ }
+
+ /**
+ * Encode "-D-"
+ *
+ * The specialised 'D' routines are tried first, in a fixed order.
+ * Otherwise 'D' is added as "T" when not encoding exactly; when
+ * encoding exactly it is added as "D", except for a word-final
+ * "-SSED" which is devoiced to "T". m_current is moved on by one.
+ *
+ */
+ void Encode_D()
+ {
+     boolean handled = Encode_DG()
+                       || Encode_DJ()
+                       || Encode_DT_DD()
+                       || Encode_D_To_J()
+                       || Encode_DOUS()
+                       || Encode_Silent_D();
+     if(handled)
+     {
+         return;
+     }
+
+     String code;
+     if(!m_encodeExact)
+     {
+         code = "T";
+     }
+     else if((m_current == m_last)
+             && StringAt((m_current - 3), 4, "SSED", ""))
+     {
+         // "final de-voicing" in this case
+         // e.g. 'missed' == 'mist'
+         code = "T";
+     }
+     else
+     {
+         code = "D";
+     }
+
+     MetaphAdd(code);
+     m_current++;
+ }
+
+ /**
+ * Encode "-DG-"
+ *
+ * Where the 'G' is pronounced separately, "DG" (exact) or "TK"
+ * (approximate) is added; otherwise "J". m_current is moved on
+ * by two.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_DG()
+ {
+     if(!StringAt(m_current, 2, "DG", ""))
+     {
+         return false;
+     }
+
+     // exceptions e.g. 'edgar', or cases where 'g' is first
+     // letter of combining form e.g. 'handgun', 'waldglas'
+     boolean separateG = StringAt((m_current + 2), 1, "A", "O", "")
+                         // e.g. "midgut"
+                         || StringAt((m_current + 1), 3, "GUN", "GUT", "")
+                         // e.g. "handgrip"
+                         || StringAt((m_current + 1), 4, "GEAR", "GLAS", "GRIP", "GREN", "GILL", "GRAF", "")
+                         // e.g. "mudgard"
+                         || StringAt((m_current + 1), 5, "GUARD", "GUILT", "GRAVE", "GRASS", "")
+                         // e.g. "woodgrouse"
+                         || StringAt((m_current + 1), 6, "GROUSE", "");
+
+     if(separateG)
+     {
+         MetaphAddExactApprox("DG", "TK");
+     }
+     else
+     {
+         //e.g. "edge", "abridgment"
+         MetaphAdd("J");
+     }
+
+     m_current += 2;
+     return true;
+ }
+
+ /**
+ * Encode "-DJ-" as a single "J"; m_current is moved on by two
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_DJ()
+ {
+     // e.g. "adjacent"
+     if(!StringAt(m_current, 2, "DJ", ""))
+     {
+         return false;
+     }
+
+     MetaphAdd("J");
+     m_current += 2;
+     return true;
+ }
+
+ /**
+ * Encode "-DD-" and "-DT-"
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_DT_DD()
+ {
+ // eat redundant 'T' or 'D'
+ if(StringAt(m_current, 2, "DT", "DD", ""))
+ {
+ if(StringAt(m_current, 3, "DTH", ""))
+ {
+ MetaphAddExactApprox("D0", "T0");
+ m_current += 3;
+ }
+ else
+ {
+ if(m_encodeExact)
+ {
+ // devoice it
+ if(StringAt(m_current, 2, "DT", ""))
+ {
+ MetaphAdd("T");
+ }
+ else
+ {
+ MetaphAdd("D");
+ }
+ }
+ else
+ {
+ MetaphAdd("T");
+ }
+ m_current += 2;
+ }
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode cases where "-DU-" "-DI-", and "-DI-" => J
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_D_To_J()
+ {
+ // e.g. "module", "adulate"
+ if((StringAt(m_current, 3, "DUL", "")
+ && (IsVowel(m_current - 1) && IsVowel(m_current + 3)))
+ // e.g. "soldier", "grandeur", "procedure"
+ || (((m_current + 3) == m_last)
+ && StringAt((m_current - 1) , 5, "LDIER", "NDEUR", "EDURE", "RDURE", ""))
+ || StringAt((m_current - 3), 7, "CORDIAL", "")
+ // e.g. "pendulum", "education"
+ || StringAt((m_current - 1), 5, "NDULA", "NDULU", "EDUCA", "")
+ // e.g. "individual", "individual", "residuum"
+ || StringAt((m_current - 1), 4, "ADUA", "IDUA", "IDUU", ""))
+ {
+ MetaphAddExactApprox("J", "D", "J", "T");
+ AdvanceCounter(2, 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode latinate suffix "-DOUS" where 'D' is pronounced as J
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_DOUS()
+ {
+ // e.g. "assiduous", "arduous"
+ if(StringAt((m_current + 1), 4, "UOUS", ""))
+ {
+ MetaphAddExactApprox("J", "D", "J", "T");
+ AdvanceCounter(4, 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode silent "-D-"
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Silent_D()
+ {
+ // silent 'D' e.g. 'wednesday', 'handsome'
+ if(StringAt((m_current - 2), 9, "WEDNESDAY", "")
+ || StringAt((m_current - 3), 7, "HANDKER", "HANDSOM", "WINDSOR", "")
+ // french silent D at end in words or names familiar to americans
+ || StringAt((m_current - 5), 6, "PERNOD", "ARTAUD", "RENAUD", "")
+ || StringAt((m_current - 6), 7, "RIMBAUD", "MICHAUD", "BICHAUD", ""))
+ {
+ m_current++;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-F-"
+ *
+ */
+ void Encode_F()
+ {
+ // Encode cases where "-FT-" => "T" is usually silent
+ // e.g. 'often', 'soften'
+ // This should really be covered under "T"!
+ if(StringAt((m_current - 1), 5, "OFTEN", ""))
+ {
+ MetaphAdd("F", "FT");
+ m_current += 2;
+ return;
+ }
+
+ // eat redundant 'F'
+ if(CharAt(m_current + 1) == 'F')
+ {
+ m_current += 2;
+ }
+ else
+ {
+ m_current++;
+ }
+
+ MetaphAdd("F");
+
+ }
+
+ /**
+ * Encode "-G-"
+ *
+ */
+ void Encode_G()
+ {
+ if(Encode_Silent_G_At_Beginning()
+ || Encode_GG()
+ || Encode_GK()
+ || Encode_GH()
+ || Encode_Silent_G()
+ || Encode_GN()
+ || Encode_GL()
+ || Encode_Initial_G_Front_Vowel()
+ || Encode_NGER()
+ || Encode_GER()
+ || Encode_GEL()
+ || Encode_Non_Initial_G_Front_Vowel()
+ || Encode_GA_To_J())
+ {
+ return;
+ }
+
+ if(!StringAt((m_current - 1), 1, "C", "K", "G", "Q", ""))
+ {
+ MetaphAddExactApprox("G", "K");
+ }
+
+ m_current++;
+ }
+
+ /**
+ * Encode cases where 'G' is silent at beginning of word
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Silent_G_At_Beginning()
+ {
+ //skip these when at start of word
+ if((m_current == 0)
+ && StringAt(m_current, 2, "GN", ""))
+ {
+ m_current += 1;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-GG-"
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_GG()
+ {
+ if(CharAt(m_current + 1) == 'G')
+ {
+ // italian e.g, 'loggia', 'caraveggio', also 'suggest' and 'exaggerate'
+ if(StringAt((m_current - 1), 5, "AGGIA", "OGGIA", "AGGIO", "EGGIO", "EGGIA", "IGGIO", "")
+ // 'ruggiero' but not 'snuggies'
+ || (StringAt((m_current - 1), 5, "UGGIE", "") && !(((m_current + 3) == m_last) || ((m_current + 4) == m_last)))
+ || (((m_current + 2) == m_last) && StringAt((m_current - 1), 4, "AGGI", "OGGI", ""))
+ || StringAt((m_current - 2), 6, "SUGGES", "XAGGER", "REGGIE", ""))
+ {
+ // expection where "-GG-" => KJ
+ if (StringAt((m_current - 2), 7, "SUGGEST", ""))
+ {
+ MetaphAddExactApprox("G", "K");
+ }
+
+ MetaphAdd("J");
+ AdvanceCounter(3, 2);
+ }
+ else
+ {
+ MetaphAddExactApprox("G", "K");
+ m_current += 2;
+ }
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-GK-"
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_GK()
+ {
+ // 'gingko'
+ if(CharAt(m_current + 1) == 'K')
+ {
+ MetaphAdd("K");
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-GH-"
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_GH()
+ {
+ if(CharAt(m_current + 1) == 'H')
+ {
+ if(Encode_GH_After_Consonant()
+ || Encode_Initial_GH()
+ || Encode_GH_To_J()
+ || Encode_GH_To_H()
+ || Encode_UGHT()
+ || Encode_GH_H_Part_Of_Other_Word()
+ || Encode_Silent_GH()
+ || Encode_GH_To_F())
+ {
+ return true;
+ }
+
+ MetaphAddExactApprox("G", "K");
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Encode "-GH-" when the preceding letter is not a vowel:
+  * the pair is encoded as a hard 'G'.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_GH_After_Consonant()
+ {
+     // needs a preceding letter, and it must not be a vowel
+     // e.g. 'burgher', 'bingham'
+     if((m_current <= 0) || IsVowel(m_current - 1))
+     {
+         return false;
+     }
+
+     // not e.g. 'greenhalgh' (word-final "-HALGH")
+     if(StringAt((m_current - 3), 5, "HALGH", "")
+         && ((m_current + 1) == m_last))
+     {
+         return false;
+     }
+
+     MetaphAddExactApprox("G", "K");
+     m_current += 2;
+     return true;
+ }
+
+ /**
+  * Encode "GH-" at the very start of the word: 'J' when followed by
+  * 'I', otherwise a hard 'G'.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_Initial_GH()
+ {
+     // only index 0 is ever handled here
+     if(m_current != 0)
+     {
+         return false;
+     }
+
+     // e.g. "ghislane", "ghiradelli"
+     if(CharAt(m_current + 2) == 'I')
+     {
+         MetaphAdd("J");
+     }
+     else
+     {
+         MetaphAddExactApprox("G", "K");
+     }
+
+     m_current += 2;
+     return true;
+ }
+
+
+ /**
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_GH_To_J()
+ {
+ // e.g., 'greenhalgh', 'dunkenhalgh', english names
+ if(StringAt((m_current - 2), 4, "ALGH", "") && ((m_current + 1) == m_last))
+ {
+ MetaphAdd("J", "");
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_GH_To_H()
+ {
+ // special cases
+ // e.g., 'donoghue', 'donaghy'
+ if((StringAt((m_current - 4), 4, "DONO", "DONA", "") && IsVowel(m_current + 2))
+ || StringAt((m_current - 5), 9, "CALLAGHAN", ""))
+ {
+ MetaphAdd("H");
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_UGHT()
+ {
+ //e.g. "ought", "aught", "daughter", "slaughter"
+ if(StringAt((m_current - 1), 4, "UGHT", ""))
+ {
+ if ((StringAt((m_current - 3), 5, "LAUGH", "")
+ && !(StringAt((m_current - 4), 7, "SLAUGHT", "")
+ || StringAt((m_current - 3), 7, "LAUGHTO", "")))
+ || StringAt((m_current - 4), 6, "DRAUGH", ""))
+ {
+ MetaphAdd("FT");
+ }
+ else
+ {
+ MetaphAdd("T");
+ }
+ m_current += 3;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_GH_H_Part_Of_Other_Word()
+ {
+ // if the 'H' is the beginning of another word or syllable
+ if (StringAt((m_current + 1), 4, "HOUS", "HEAD", "HOLE", "HORN", "HARN", ""))
+ {
+ MetaphAddExactApprox("G", "K");
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode cases where "-GH-" is silent.
+ *
+ * The test is the conjunction of three groups:
+ * (1) one of the listed contexts before the "GH" matches,
+ * (2) one of the listed contexts after the "GH" matches
+ * (including "GH" being the last two letters), and
+ * (3) the word is not one of the exceptions where the 'g' is pronounced.
+ * When all three hold, nothing is added to the encoding and both
+ * letters are skipped.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Silent_GH()
+ {
+ // group (1): contexts preceding the "GH"
+ //Parker's rule (with some further refinements) - e.g., 'hugh'
+ if(((((m_current > 1) && StringAt((m_current - 2), 1, "B", "H", "D", "G", "L", "") )
+ //e.g., 'bough'
+ || ((m_current > 2)
+ && StringAt((m_current - 3), 1, "B", "H", "D", "K", "W", "N", "P", "V", "")
+ && !StringAt(0, 6, "ENOUGH", ""))
+ //e.g., 'broughton'
+ || ((m_current > 3) && StringAt((m_current - 4), 1, "B", "H", "") )
+ //'plough', 'slaugh'
+ || ((m_current > 3) && StringAt((m_current - 4), 2, "PL", "SL", "") )
+ || ((m_current > 0)
+ // 'sigh', 'light'
+ && ((CharAt(m_current - 1) == 'I')
+ || StringAt(0, 4, "PUGH", "")
+ // e.g. 'MCDONAGH', 'MURTAGH', 'CREAGH'
+ || (StringAt((m_current - 1), 3, "AGH", "")
+ && ((m_current + 1) == m_last))
+ || StringAt((m_current - 4), 6, "GERAGH", "DRAUGH", "")
+ || (StringAt((m_current - 3), 5, "GAUGH", "GEOGH", "MAUGH", "")
+ && !StringAt(0, 9, "MCGAUGHEY", ""))
+ // exceptions to 'tough', 'rough', 'lough'
+ || (StringAt((m_current - 2), 4, "OUGH", "")
+ && (m_current > 3)
+ && !StringAt((m_current - 4), 6, "CCOUGH", "ENOUGH", "TROUGH", "CLOUGH", "")))))
+ // group (2): contexts following the "GH"
+ // suffixes starting w/ vowel where "-GH-" is usually silent
+ && (StringAt((m_current - 3), 5, "VAUGH", "FEIGH", "LEIGH", "")
+ || StringAt((m_current - 2), 4, "HIGH", "TIGH", "")
+ || ((m_current + 1) == m_last)
+ || (StringAt((m_current + 2), 2, "IE", "EY", "ES", "ER", "ED", "TY", "")
+ && ((m_current + 3) == m_last)
+ && !StringAt((m_current - 5), 9, "GALLAGHER", ""))
+ || (StringAt((m_current + 2), 1, "Y", "") && ((m_current + 2) == m_last))
+ || (StringAt((m_current + 2), 3, "ING", "OUT", "") && ((m_current + 4) == m_last))
+ || (StringAt((m_current + 2), 4, "ERTY", "") && ((m_current + 5) == m_last))
+ || (!IsVowel(m_current + 2)
+ || StringAt((m_current - 3), 5, "GAUGH", "GEOGH", "MAUGH", "")
+ || StringAt((m_current - 4), 8, "BROUGHAM", ""))))
+ // group (3): exceptions where '-g-' pronounced
+ && !(StringAt(0, 6, "BALOGH", "SABAGH", "")
+ || StringAt((m_current - 2), 7, "BAGHDAD", "")
+ || StringAt((m_current - 3), 5, "WHIGH", "")
+ || StringAt((m_current - 5), 7, "SABBAGH", "AKHLAGH", "")))
+ {
+ // silent - add nothing, just step over both letters
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Encode a handful of individually listed "-GH-" words and names.
+  * The first matching case decides the code that is added; in every
+  * handled case both letters are consumed.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_GH_Special_Cases()
+ {
+     if(StringAt((m_current - 6), 8, "HICCOUGH", ""))
+     {
+         // special case: 'hiccough' == 'hiccup'
+         MetaphAdd("P");
+     }
+     else if(StringAt(0, 5, "LOUGH", ""))
+     {
+         // special case: 'lough' alt spelling for scots 'loch'
+         MetaphAdd("K");
+     }
+     else if(StringAt(0, 6, "BALOGH", ""))
+     {
+         // hungarian
+         MetaphAddExactApprox("G", "", "K", "");
+     }
+     else if(StringAt((m_current - 3), 8, "LAUGHLIN", "COUGHLAN", "LOUGHLIN", ""))
+     {
+         // "maclaughlin"
+         MetaphAdd("K", "F");
+     }
+     else if(StringAt((m_current - 3), 5, "GOUGH", "")
+         || StringAt((m_current - 7), 9, "COLCLOUGH", ""))
+     {
+         MetaphAdd("", "F");
+     }
+     else
+     {
+         // none of the special cases applies
+         return false;
+     }
+
+     m_current += 2;
+     return true;
+ }
+
+ /**
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_GH_To_F()
+ {
+ // the cases covered here would fall under
+ // the GH_To_F rule below otherwise
+ if(Encode_GH_Special_Cases())
+ {
+ return true;
+ }
+ else
+ {
+ //e.g., 'laugh', 'cough', 'rough', 'tough'
+ if((m_current > 2)
+ && (CharAt(m_current - 1) == 'U')
+ && IsVowel(m_current - 2)
+ && StringAt((m_current - 3), 1, "C", "G", "L", "R", "T", "N", "S", "")
+ && !StringAt((m_current - 4), 8, "BREUGHEL", "FLAUGHER", ""))
+ {
+ MetaphAdd("F");
+ m_current += 2;
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ /**
+  * Encode some contexts where "g" is silent
+  *
+  * Handled contexts: word-final "-EGM", "-IGM", "-AGM" and "-GT", the
+  * five-letter word "HUGES", and the 'G' of an initial "NG-" when the
+  * word is longer than those two letters. In each case nothing is added
+  * to the encoding and the 'G' is stepped over.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_Silent_G()
+ {
+     // e.g. "phlegm", "apothegm", "voigt"
+     if((((m_current + 1) == m_last)
+             && (StringAt((m_current - 1), 3, "EGM", "IGM", "AGM", "")
+                 || StringAt(m_current, 2, "GT", "")))
+         || (StringAt(0, 5, "HUGES", "") && (m_length == 5)))
+     {
+         m_current++;
+         return true;
+     }
+
+     // vietnamese names e.g. "Nguyen" but not "Ng"
+     // The position check restricts this to the 'G' of the initial "NG"
+     // itself; without it every later non-final 'G' in an input that
+     // merely starts with "NG" (e.g. the 'G' of "GIAP" in "NGUYEN GIAP")
+     // would be silenced as well.
+     if((m_current == 1)
+         && StringAt(0, 2, "NG", "")
+         && (m_current != m_last))
+     {
+         m_current++;
+         return true;
+     }
+
+     return false;
+ }
+
+ /**
+ * Encode "-GN-"
+ *
+ * When the 'G' is followed by 'N', one of two encodings is chosen:
+ * if the listed "silent g" contexts apply (and none of the listed
+ * exceptions does), "N" is passed first with "GN"/"KN" second;
+ * otherwise "GN"/"KN" is used. Both letters are consumed either way.
+ * NOTE(review): the exact role of each MetaphAddExactApprox argument
+ * (exact vs. approximate, main vs. alternate) is defined outside this
+ * routine - confirm there.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_GN()
+ {
+ if(CharAt(m_current + 1) == 'N')
+ {
+ // 'align' 'sign', 'resign' but not 'resignation'
+ // also 'impugn', 'impugnable', but not 'repugnant'
+ if(((m_current > 1)
+ && ((StringAt((m_current - 1), 1, "I", "U", "E", "")
+ || StringAt((m_current - 3), 9, "LORGNETTE", "")
+ || StringAt((m_current - 2), 9, "LAGNIAPPE", "")
+ || StringAt((m_current - 2), 6, "COGNAC", "")
+ || StringAt((m_current - 3), 7, "CHAGNON", "")
+ || StringAt((m_current - 5), 9, "COMPAGNIE", "")
+ || StringAt((m_current - 4), 6, "BOLOGN", ""))
+ // Exceptions: following are cases where 'G' is pronounced
+ // in "assign" 'g' is silent, but not in "assignation"
+ && !(StringAt((m_current + 2), 5, "ATION", "")
+ || StringAt((m_current + 2), 4, "ATOR", "")
+ || StringAt((m_current + 2), 3, "ATE", "ITY", "")
+ // exception to exceptions, not pronounced:
+ || (StringAt((m_current + 2), 2, "AN", "AC", "IA", "UM", "")
+ && !(StringAt((m_current - 3), 8, "POIGNANT", "")
+ || StringAt((m_current - 2), 6, "COGNAC", "")))
+ || StringAt(0, 7, "SPIGNER", "STEGNER", "")
+ || (StringAt(0, 5, "SIGNE", "") && (m_length == 5))
+ || StringAt((m_current - 2), 5, "LIGNI", "LIGNO", "REGNA", "DIGNI", "WEGNE",
+ "TIGNE", "RIGNE", "REGNE", "TIGNO", "")
+ || StringAt((m_current - 2), 6, "SIGNAL", "SIGNIF", "SIGNAT", "")
+ || StringAt((m_current - 1), 5, "IGNIT", ""))
+ && !StringAt((m_current - 2), 6, "SIGNET", "LIGNEO", "") ))
+ // word-final "-GNE" / "-GNA"
+ //not e.g. 'cagney', 'magna'
+ || (((m_current + 2) == m_last)
+ && StringAt(m_current, 3, "GNE", "GNA", "")
+ && !StringAt((m_current - 2), 5, "SIGNA", "MAGNA", "SIGNE", "")))
+ {
+ MetaphAddExactApprox("N", "GN", "N", "KN");
+ }
+ else
+ {
+ MetaphAddExactApprox("GN", "KN");
+ }
+ // "GN" is consumed as a pair in both branches
+ m_current += 2;
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Encode "-GL-"
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_GL()
+ {
+ //'tagliaro', 'puglia' BUT add K in alternative
+ // since americans sometimes do this
+ if(StringAt((m_current + 1), 3, "LIA", "LIO", "LIE", "")
+ && IsVowel(m_current - 1))
+ {
+ MetaphAddExactApprox("L", "GL", "L", "KL");
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Detect a 'soft' (J-like) 'G' from the letters that follow the
+ * current position. This routine only tests; it adds nothing to the
+ * encoding and does not move m_current.
+ *
+ * Two alternatives are accepted: a general list of following letter
+ * pairs/triples minus a list of exceptions, or a second list of
+ * specific following fragments that requires the next letter to be
+ * a vowel.
+ *
+ * @return true if one of the listed 'soft g' contexts matches, false if not
+ *
+ */
+ boolean Initial_G_Soft()
+ {
+ // general soft contexts ...
+ if(((StringAt((m_current + 1), 2, "EL", "EM", "EN", "EO", "ER", "ES", "IA", "IN", "IO", "IP", "IU", "YM", "YN", "YP", "YR", "EE", "")
+ || StringAt((m_current + 1), 3, "IRA", "IRO", ""))
+ // except for smaller set of cases where => K, e.g. "gerber"
+ && !(StringAt((m_current + 1), 3, "ELD", "ELT", "ERT", "INZ", "ERH", "ITE", "ERD", "ERL", "ERN",
+ "INT", "EES", "EEK", "ELB", "EER", "")
+ || StringAt((m_current + 1), 4, "ERSH", "ERST", "INSB", "INGR", "EROW", "ERKE", "EREN", "")
+ || StringAt((m_current + 1), 5, "ELLER", "ERDIE", "ERBER", "ESUND", "ESNER", "INGKO", "INKGO",
+ "IPPER", "ESELL", "IPSON", "EEZER", "ERSON", "ELMAN", "")
+ || StringAt((m_current + 1), 6, "ESTALT", "ESTAPO", "INGHAM", "ERRITY", "ERRISH", "ESSNER", "ENGLER", "")
+ || StringAt((m_current + 1), 7, "YNAECOL", "YNECOLO", "ENTHNER", "ERAGHTY", "")
+ || StringAt((m_current + 1), 8, "INGERICH", "EOGHEGAN", "")))
+ // ... or one of these specific fragments after the 'G'
+ ||(IsVowel(m_current + 1)
+ && (StringAt((m_current + 1), 3, "EE ", "EEW", "")
+ || (StringAt((m_current + 1), 3, "IGI", "IRA", "IBE", "AOL", "IDE", "IGL", "")
+ && !StringAt((m_current + 1), 5, "IDEON", "") )
+ || StringAt((m_current + 1), 4, "ILES", "INGI", "ISEL", "")
+ || (StringAt((m_current + 1), 5, "INGER", "") && !StringAt((m_current + 1), 8, "INGERICH", ""))
+ || StringAt((m_current + 1), 5, "IBBER", "IBBET", "IBLET", "IBRAN", "IGOLO", "IRARD", "IGANT", "")
+ || StringAt((m_current + 1), 6, "IRAFFE", "EEWHIZ","")
+ || StringAt((m_current + 1), 7, "ILLETTE", "IBRALTA", ""))))
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode cases where 'G' is at start of word followed
+ * by a "front" vowel e.g. 'E', 'I', 'Y'
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Initial_G_Front_Vowel()
+ {
+ // 'g' followed by vowel at beginning
+ if((m_current == 0) && Front_Vowel(m_current + 1))
+ {
+ // special case "gila" as in "gila monster"
+ if(StringAt((m_current + 1), 3, "ILA", "")
+ && (m_length == 4))
+ {
+ MetaphAdd("H");
+ }
+ else if(Initial_G_Soft())
+ {
+ MetaphAddExactApprox("J", "G", "J", "K");
+ }
+ else
+ {
+ // only code alternate 'J' if front vowel
+ if((m_inWord.charAt(m_current + 1) == 'E') || (m_inWord.charAt(m_current + 1) == 'I'))
+ {
+ MetaphAddExactApprox("G", "J", "K", "J");
+ }
+ else
+ {
+ MetaphAddExactApprox("G", "K");
+ }
+ }
+
+ AdvanceCounter(2, 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-NGER-"
+ *
+ * Applies when the current 'G' (not within the first two letters) sits
+ * inside "NGER". The 'J' reading is passed first by default; for the
+ * listed roots, prefixes and names the 'G'/'K' reading is passed first
+ * instead.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_NGER()
+ {
+ if((m_current > 1)
+ && StringAt((m_current - 1), 4, "NGER", ""))
+ {
+ // default 'G' => J such as 'ranger', 'stranger', 'manger', 'messenger', 'orangery', 'granger'
+ // 'boulanger', 'challenger', 'danger', 'changer', 'harbinger', 'lounger', 'ginger', 'passenger'
+ // except for the following
+ // (the whole list below is negated: none of it may match for the default to apply)
+ if(!(RootOrInflections(m_inWord, "ANGER")
+ || RootOrInflections(m_inWord, "LINGER")
+ || RootOrInflections(m_inWord, "MALINGER")
+ || RootOrInflections(m_inWord, "FINGER")
+ || (StringAt((m_current - 3), 4, "HUNG", "FING", "BUNG", "WING", "RING", "DING", "ZENG",
+ "ZING", "JUNG", "LONG", "PING", "CONG", "MONG", "BANG",
+ "GANG", "HANG", "LANG", "SANG", "SING", "WANG", "ZANG", "")
+ // exceptions to above where 'G' => J
+ && !(StringAt((m_current - 6), 7, "BOULANG", "SLESING", "KISSING", "DERRING", "")
+ || StringAt((m_current - 8), 9, "SCHLESING", "")
+ || StringAt((m_current - 5), 6, "SALING", "BELANG", "")
+ || StringAt((m_current - 6), 7, "BARRING", "")
+ || StringAt((m_current - 6), 9, "PHALANGER", "")
+ || StringAt((m_current - 4), 5, "CHANG", "")))
+ || StringAt((m_current - 4), 5, "STING", "YOUNG", "")
+ || StringAt((m_current - 5), 6, "STRONG", "")
+ || StringAt(0, 3, "UNG", "ENG", "ING", "")
+ || StringAt(m_current, 6, "GERICH", "")
+ || StringAt(0, 6, "SENGER", "")
+ || StringAt((m_current - 3), 6, "WENGER", "MUNGER", "SONGER", "KINGER", "")
+ || StringAt((m_current - 4), 7, "FLINGER", "SLINGER", "STANGER", "STENGER", "KLINGER", "CLINGER", "")
+ || StringAt((m_current - 5), 8, "SPRINGER", "SPRENGER", "")
+ || StringAt((m_current - 3), 7, "LINGERF", "")
+ || StringAt((m_current - 2), 7, "ANGERLY", "ANGERBO", "INGERSO", "") ))
+ {
+ MetaphAddExactApprox("J", "G", "J", "K");
+ }
+ else
+ {
+ MetaphAddExactApprox("G", "J", "K", "J");
+ }
+
+ AdvanceCounter(2, 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Encode "-GER-"
+  *
+  * Applies to a non-initial 'G' followed by "ER". The 'J' reading is
+  * passed first by default. For the listed words and names the 'G'/'K'
+  * reading comes first instead, and SlavoGermanic() then decides whether
+  * 'J' is still passed as the second reading.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_GER()
+ {
+     if((m_current > 0)
+         && StringAt((m_current + 1), 2, "ER", ""))
+     {
+         // Exceptions to 'GE' where 'G' => K
+         // e.g. "JAGER", "TIGER", "LIGER", "LAGER", "LUGER", "AUGER", "EAGER", "HAGER", "SAGER"
+         if((((m_current == 2) && IsVowel(m_current - 1) && !IsVowel(m_current - 2)
+                     && !(StringAt((m_current - 2), 5, "PAGER", "WAGER", "NIGER", "ROGER", "LEGER", "CAGER", ""))
+                 || StringAt((m_current - 2), 5, "AUGER", "EAGER", "INGER", "YAGER", ""))
+             || StringAt((m_current - 3), 6, "SEEGER", "JAEGER", "GEIGER", "KRUGER", "SAUGER", "BURGER",
+                     "MEAGER", "MARGER", "RIEGER", "YAEGER", "STEGER", "PRAGER", "SWIGER",
+                     "YERGER", "TORGER", "FERGER", "HILGER", "ZEIGER", "YARGER",
+                     "COWGER", "CREGER", "KROGER", "KREGER", "GRAGER", "STIGER", "BERGER", "")
+             // 'berger' but not 'bergerac'
+             || (StringAt((m_current - 3), 6, "BERGER", "") && ((m_current + 2) == m_last))
+             || StringAt((m_current - 4), 7, "KREIGER", "KRUEGER", "METZGER", "KRIEGER", "KROEGER", "STEIGER",
+                     "DRAEGER", "BUERGER", "BOERGER", "FIBIGER", "")
+             // e.g. 'harshbarger', 'winebarger'
+             || (StringAt((m_current - 3), 6, "BARGER", "") && (m_current > 4))
+             // e.g. 'weisgerber'
+             || (StringAt(m_current, 6, "GERBER", "") && (m_current > 0))
+             || StringAt((m_current - 5), 8, "SCHWAGER", "LYBARGER", "SPRENGER", "GALLAGER", "WILLIGER", "")
+             // the length has to cover the whole literal: with the former
+             // length of 4 a six-letter "HARGER" could never be matched
+             || StringAt(0, 6, "HARGER", "")
+             || (StringAt(0, 4, "AGER", "EGER", "") && (m_length == 4))
+             || StringAt((m_current - 1), 6, "YGERNE", "")
+             || StringAt((m_current - 6), 9, "SCHWEIGER", ""))
+             // words that match above but do not belong to the hard-'G' group
+             && !(StringAt((m_current - 5), 10, "BELLIGEREN", "")
+                 || StringAt(0, 7, "MARGERY", "")
+                 || StringAt((m_current - 3), 8, "BERGERAC", "")))
+         {
+             if(SlavoGermanic())
+             {
+                 MetaphAddExactApprox("G", "K");
+             }
+             else
+             {
+                 MetaphAddExactApprox("G", "J", "K", "J");
+             }
+         }
+         else
+         {
+             MetaphAddExactApprox("J", "G", "J", "K");
+         }
+
+         AdvanceCounter(2, 1);
+         return true;
+     }
+
+     return false;
+ }
+
+ /**
+ * Encode "-GEL-"
+ *
+ * Applies to a non-initial 'G' followed by "EL". The 'J' reading is
+ * passed first by default; for the listed words and combining forms the
+ * 'G'/'K' reading comes first, and SlavoGermanic() then decides whether
+ * 'J' is still passed as the second reading.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_GEL()
+ {
+ // more likely to be "-GEL-" => JL
+ if(StringAt((m_current + 1), 2, "EL", "")
+ && (m_current > 0))
+ {
+ // except for
+ // "BAGEL", "HEGEL", "HUGEL", "KUGEL", "NAGEL", "VOGEL", "FOGEL", "PAGEL"
+ // i.e. five-letter words of the shape consonant-vowel-"GEL"
+ if(((m_length == 5)
+ && IsVowel(m_current - 1)
+ && !IsVowel(m_current - 2)
+ && !StringAt((m_current - 2), 5, "NIGEL", "RIGEL", ""))
+ // or the following as combining forms
+ || StringAt((m_current - 2), 5, "ENGEL", "HEGEL", "NAGEL", "VOGEL", "")
+ // NOTE(review): "HAUGEN" does not contain "GEL", so it looks unreachable
+ // under the enclosing "EL" test - confirm against StringAt
+ || StringAt((m_current - 3), 6, "MANGEL", "WEIGEL", "FLUGEL", "RANGEL", "HAUGEN", "RIEGEL", "VOEGEL", "")
+ || StringAt((m_current - 4), 7, "SPEIGEL", "STEIGEL", "WRANGEL", "SPIEGEL", "")
+ || StringAt((m_current - 4), 8, "DANEGELD", ""))
+ {
+ if(SlavoGermanic())
+ {
+ MetaphAddExactApprox("G", "K");
+ }
+ else
+ {
+ MetaphAddExactApprox("G", "J", "K", "J");
+ }
+ }
+ else
+ {
+ MetaphAddExactApprox("J", "G", "J", "K");
+ }
+
+ AdvanceCounter(2, 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-G-" followed by a vowel when non-initial leter.
+ * Default for this is a 'J' sound, so check exceptions where
+ * it is pronounced 'G'
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Non_Initial_G_Front_Vowel()
+ {
+ // -gy-, gi-, ge-
+ if(StringAt((m_current + 1), 1, "E", "I", "Y", ""))
+ {
+ // '-ge' at end
+ // almost always 'j 'sound
+ if(StringAt(m_current, 2, "GE", "") && (m_current == (m_last - 1)))
+ {
+ if(Hard_GE_At_End())
+ {
+ if(SlavoGermanic())
+ {
+ MetaphAddExactApprox("G", "K");
+ }
+ else
+ {
+ MetaphAddExactApprox("G", "J", "K", "J");
+ }
+ }
+ else
+ {
+ MetaphAdd("J");
+ }
+ }
+ else
+ {
+ if(Internal_Hard_G())
+ {
+ // don't encode KG or KK if e.g. "mcgill"
+ if(!((m_current == 2) && StringAt(0, 2, "MC", ""))
+ || ((m_current == 3) && StringAt(0, 3, "MAC", "")))
+ {
+ if(SlavoGermanic())
+ {
+ MetaphAddExactApprox("G", "K");
+ }
+ else
+ {
+ MetaphAddExactApprox("G", "J", "K", "J");
+ }
+ }
+ }
+ else
+ {
+ MetaphAddExactApprox("J", "G", "J", "K");
+ }
+ }
+
+ AdvanceCounter(2, 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Detect german names and other words that have
+  * a 'hard' 'g' in the context of "-ge" at end
+  *
+  * Every entry is tested at the start of the word. This routine only
+  * tests; it adds nothing to the encoding.
+  *
+  * @return true if the word starts with one of the listed forms, false if not
+  */
+ boolean Hard_GE_At_End()
+ {
+     return StringAt(0, 6, "RENEGE", "STONGE", "STANGE", "PRANGE", "KRESGE", "")
+         || StringAt(0, 5, "BYRGE", "BIRGE", "BERGE", "HAUGE", "")
+         || StringAt(0, 4, "HAGE", "")
+         || StringAt(0, 5, "LANGE", "SYNGE", "BENGE", "RUNGE", "HELGE", "")
+         || StringAt(0, 4, "INGE", "LAGE", "");
+ }
+
+ /**
+ * Exceptions to default encoding to 'J':
+ * encode "-G-" to 'G' in "-g-" words
+ * where we are not at "-GE" at the end of the word
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Internal_Hard_G()
+ {
+ // if not "-GE" at end
+ if(!(((m_current + 1) == m_last) && (CharAt(m_current + 1) == 'E') )
+ && (Internal_Hard_NG()
+ || Internal_Hard_GEN_GIN_GET_GIT()
+ || Internal_Hard_G_Open_Syllable()
+ || Internal_Hard_G_Other()))
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Detect words where "-ge-" or "-gi-" get a 'hard' 'g'
+ * even though this is usually a 'soft' 'g' context
+ *
+ * This routine only tests; it adds nothing to the encoding and does
+ * not move m_current. Some entries are anchored at the current
+ * position, others at the start of the word (index 0).
+ *
+ * @return true if 'hard' 'g' detected
+ *
+ */
+ boolean Internal_Hard_G_Other()
+ {
+ // fragments starting at the current 'G', but not inside "GINGIV"
+ if((StringAt(m_current, 4, "GETH", "GEAR", "GEIS", "GIRL", "GIVI", "GIVE", "GIFT",
+ "GIRD", "GIRT", "GILV", "GILD", "GELD", "")
+ && !StringAt((m_current - 3), 6, "GINGIV", "") )
+ // "gish" but not "largish"
+ || (StringAt((m_current + 1), 3, "ISH", "") && (m_current > 0) && !StringAt(0, 4, "LARG", ""))
+ // "MAGED" / "MEGID", unless the fragment ends the word
+ || (StringAt((m_current - 2), 5, "MAGED", "MEGID", "") && !((m_current + 2) == m_last))
+ || StringAt(m_current, 3, "GEZ", "")
+ || StringAt(0, 4, "WEGE", "HAGE", "")
+ // word-final "-ONGEST" / "-UNGEST", but not "CONGEST"
+ || (StringAt((m_current - 2), 6, "ONGEST", "UNGEST", "")
+ && ((m_current + 3) == m_last)
+ && !StringAt((m_current - 3), 7, "CONGEST", ""))
+ || StringAt(0, 5, "VOEGE", "BERGE", "HELGE", "")
+ // exactly the four-letter words "ENGE" and "BOGY"
+ || (StringAt(0, 4, "ENGE", "BOGY", "") && (m_length == 4))
+ || StringAt(m_current, 6, "GIBBON", "")
+ || StringAt(0, 10, "CORREGIDOR", "")
+ || StringAt(0, 8, "INGEBORG", "")
+ // "GILL" ending the word or followed by one more letter, but not "STURGILL"
+ || (StringAt(m_current, 4, "GILL", "")
+ && (((m_current + 3) == m_last) || ((m_current + 4) == m_last))
+ && !StringAt(0, 8, "STURGILL", "")))
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Detect words where "-gy-", "-gie-", "-gee-",
+  * or "-gio-" get a 'hard' 'g' even though this is
+  * usually a 'soft' 'g' context
+  *
+  * This routine only tests; it adds nothing to the encoding.
+  *
+  * @return true if 'hard' 'g' detected
+  *
+  */
+ boolean Internal_Hard_G_Open_Syllable()
+ {
+     return StringAt((m_current + 1), 3, "EYE", "")
+         || StringAt((m_current - 2), 4, "FOGY", "POGY", "YOGI", "")
+         || StringAt((m_current - 2), 5, "MAGEE", "MCGEE", "HAGIO", "")
+         || StringAt((m_current - 1), 4, "RGEY", "OGEY", "")
+         || StringAt((m_current - 3), 5, "HOAGY", "STOGY", "PORGY", "")
+         || StringAt((m_current - 5), 8, "CARNEGIE", "")
+         // word-final "-OGEY" / "-OGIE"
+         || (StringAt((m_current - 1), 4, "OGEY", "OGIE", "") && ((m_current + 2) == m_last));
+ }
+
+ /**
+ * Detect a number of contexts, mostly german names, that
+ * take a 'hard' 'g'.
+ *
+ * This routine only tests; it adds nothing to the encoding and does
+ * not move m_current. Some entries are anchored around the current
+ * position, others at the start of the word (index 0).
+ *
+ * @return true if 'hard' 'g' detected, false if not
+ *
+ */
+ boolean Internal_Hard_GEN_GIN_GET_GIT()
+ {
+ // six-letter forms around the 'G', unless "GENETIC"/"GENESIS" follows
+ // or the word part is "PLANGENT"
+ if((StringAt((m_current - 3), 6, "FORGET", "TARGET", "MARGIT", "MARGET", "TURGEN",
+ "BERGEN", "MORGEN", "JORGEN", "HAUGEN", "JERGEN",
+ "JURGEN", "LINGEN", "BORGEN", "LANGEN", "KLAGEN", "STIGER", "BERGER", "")
+ && !StringAt(m_current, 7, "GENETIC", "GENESIS", "")
+ && !StringAt((m_current - 4), 8, "PLANGENT", ""))
+ // these only count when they end the word
+ || (StringAt((m_current - 3), 6, "BERGIN", "FEAGIN", "DURGIN", "") && ((m_current + 2) == m_last))
+ // "ENGEN", unless followed by "DER", "ETI" or "ESI"
+ || (StringAt((m_current - 2), 5, "ENGEN", "") && !StringAt((m_current + 3), 3, "DER", "ETI", "ESI", ""))
+ || StringAt((m_current - 4), 7, "JUERGEN", "")
+ || StringAt(0, 5, "NAGIN", "MAGIN", "HAGIN", "")
+ // exactly these five-letter words
+ || (StringAt(0, 5, "ENGIN", "DEGEN", "LAGEN", "MAGEN", "NAGIN", "") && (m_length == 5))
+ // five-letter forms around the 'G', but not inside "OSPHAGEN"
+ || (StringAt((m_current - 2), 5, "BEGET", "BEGIN", "HAGEN", "FAGIN",
+ "BOGEN", "WIGIN", "NTGEN", "EIGEN",
+ "WEGEN", "WAGEN", "")
+ && !StringAt((m_current - 5), 8, "OSPHAGEN", "")))
+ {
+ return true;
+ }
+
+ return false;
+ }
+ /**
+ * Detect a number of contexts of '-ng-' that will
+ * take a 'hard' 'g' despite being followed by a
+ * front vowel.
+ *
+ * This routine only tests; it adds nothing to the encoding and does
+ * not move m_current.
+ *
+ * @return true if 'hard' 'g' detected, false if not
+ *
+ */
+ boolean Internal_Hard_NG()
+ {
+ // "-DANG-", "-FANG-", "-SING-" ending at the current 'G'
+ if((StringAt((m_current - 3), 4, "DANG", "FANG", "SING", "")
+ // exception to exception
+ && !StringAt((m_current - 5), 8, "DISINGEN", "") )
+ || StringAt(0, 5, "INGEB", "ENGEB", "")
+ // "-RING-", "-WING-", "-HANG-", "-LONG-" ending at the current 'G',
+ // minus the longer forms listed below
+ || (StringAt((m_current - 3), 4, "RING", "WING", "HANG", "LONG", "")
+ && !(StringAt((m_current - 4), 5, "CRING", "FRING", "ORANG", "TWING", "CHANG", "PHANG", "")
+ || StringAt((m_current - 5), 6, "SYRING", "")
+ || StringAt((m_current - 3), 7, "RINGENC", "RINGENT", "LONGITU", "LONGEVI", "")
+ // e.g. 'longino', 'mastrangelo'
+ || (StringAt(m_current, 4, "GELO", "GINO", "") && ((m_current + 3) == m_last))))
+ // "-NGY"
+ || (StringAt((m_current - 1), 3, "NGY", "")
+ // exceptions to exception
+ && !(StringAt((m_current - 3), 5, "RANGY", "MANGY", "MINGY", "")
+ || StringAt((m_current - 4), 6, "SPONGY", "STINGY", ""))))
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode special case where "-GA-" => J
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_GA_To_J()
+ {
+ // 'margary', 'margarine'
+ if((StringAt((m_current - 3), 7, "MARGARY", "MARGARI", "")
+ // but not in spanish forms such as "margatita"
+ && !StringAt((m_current - 3), 8, "MARGARIT", ""))
+ || StringAt(0, 4, "GAOL", "")
+ || StringAt((m_current - 2), 5, "ALGAE", ""))
+ {
+ MetaphAddExactApprox("J", "G", "J", "K");
+ AdvanceCounter(2, 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode 'H'
+ *
+ *
+ */
+ void Encode_H()
+ {
+ if(Encode_Initial_Silent_H()
+ || Encode_Initial_HS()
+ || Encode_Initial_HU_HW()
+ || Encode_Non_Initial_Silent_H())
+ {
+ return;
+ }
+
+ //only keep if first & before vowel or btw. 2 vowels
+ if(!Encode_H_Pronounced())
+ {
+ //also takes care of 'HH'
+ m_current++;
+ }
+ }
+
+ /**
+ * Encode cases where initial 'H' is not pronounced (in American)
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Initial_Silent_H()
+ {
+ //'hour', 'herb', 'heir', 'honor'
+ if(StringAt((m_current + 1), 3, "OUR", "ERB", "EIR", "")
+ || StringAt((m_current + 1), 4, "ONOR", "")
+ || StringAt((m_current + 1), 5, "ONOUR", "ONEST", ""))
+ {
+ // british pronounce H in this word
+ // americans give it 'H' for the name,
+ // no 'H' for the plant
+ if((m_current == 0) && StringAt(m_current, 4, "HERB", ""))
+ {
+ if(m_encodeVowels)
+ {
+ MetaphAdd("HA", "A");
+ }
+ else
+ {
+ MetaphAdd("H", "A");
+ }
+ }
+ else if((m_current == 0) || m_encodeVowels)
+ {
+ MetaphAdd("A");
+ }
+
+ m_current++;
+ // don't encode vowels twice
+ m_current = SkipVowels(m_current);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "HS-"
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Initial_HS()
+ {
+ // old chinese pinyin transliteration
+ // e.g., 'HSIAO'
+ if ((m_current == 0) && StringAt(0, 2, "HS", ""))
+ {
+ MetaphAdd("X");
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Encode cases where "HU-" is pronounced as part of a vowel dipthong
+  *
+  * The word has to start with "HUA", "HUE" or "HWA" (tested at index
+  * 0), and "HUEY" must not start at the current position. The group is
+  * encoded as 'A'.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_Initial_HU_HW()
+ {
+     // spanish spellings and chinese pinyin transliteration
+     if(!StringAt(0, 3, "HUA", "HUE", "HWA", "")
+         || StringAt(m_current, 4, "HUEY", ""))
+     {
+         return false;
+     }
+
+     MetaphAdd("A");
+
+     if(m_encodeVowels)
+     {
+         // don't encode vowels twice: step over the 'H', then over
+         // every vowel or 'W' that follows it
+         m_current++;
+         while(IsVowel(m_current) || (CharAt(m_current) == 'W'))
+         {
+             m_current++;
+         }
+     }
+     else
+     {
+         m_current += 3;
+     }
+
+     return true;
+ }
+
+ /**
+ * Encode cases where 'H' is silent between vowels
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Non_Initial_Silent_H()
+ {
+ //exceptions - 'h' not pronounced
+ // "PROHIB" BUT NOT "PROHIBIT"
+ if(StringAt((m_current - 2), 5, "NIHIL", "VEHEM", "LOHEN", "NEHEM",
+ "MAHON", "MAHAN", "COHEN", "GAHAN", "")
+ || StringAt((m_current - 3), 6, "GRAHAM", "PROHIB", "FRAHER",
+ "TOOHEY", "TOUHEY", "")
+ || StringAt((m_current - 3), 5, "TOUHY", "")
+ || StringAt(0, 9, "CHIHUAHUA", ""))
+ {
+ if(!m_encodeVowels)
+ {
+ m_current += 2;
+ }
+ else
+ {
+ m_current++;
+ // don't encode vowels twice
+ m_current = SkipVowels(m_current);
+ }
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Encode cases where 'H' is pronounced
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_H_Pronounced()
+ {
+     // 'H' at the start, after a vowel, or after 'W', and followed by a vowel;
+     // or a doubled 'H' followed by a vowel, e.g. 'alWahhab'
+     boolean pronounced =
+         (((m_current == 0)
+                 || IsVowel(m_current - 1)
+                 || ((m_current > 0) && (CharAt(m_current - 1) == 'W')))
+             && IsVowel(m_current + 1))
+         || ((CharAt(m_current + 1) == 'H') && IsVowel(m_current + 2));
+
+     if (!pronounced)
+     {
+         return false;
+     }
+
+     MetaphAdd("H");
+     AdvanceCounter(2, 1);
+     return true;
+ }
+
+ /**
+  * Encode 'J'
+  *
+  * Tries the spanish routines first and falls back to
+  * Encode_Other_J() when neither of them handles the letter.
+  *
+  */
+ void Encode_J()
+ {
+     boolean handledAsSpanish = Encode_Spanish_J() || Encode_Spanish_OJ_UJ();
+
+     if (!handledAsSpanish)
+     {
+         Encode_Other_J();
+     }
+ }
+
+ /**
+ * Encode cases where initial or medial "j" is in a spanish word or name
+ *
+ * Two groups of contexts are recognised. In the first, MetaphAdd("H") is
+ * called, except where "JUAN" or "JOAQ" starts at the current position:
+ * there only an "A" is added, and only at the start of the word. In the
+ * second group ("-ORGE", "-ULIO", "-ESUS" following the 'J', unless the
+ * word starts with "JORGEN") MetaphAdd is called with "J" and "H"; a
+ * word-final "JORGE" is encoded in a single step.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Spanish_J()
+ {
+ //obvious spanish, e.g. "jose", "san jacinto"
+ if((StringAt((m_current + 1), 3, "UAN", "ACI", "ALI", "EFE", "ICA", "IME", "OAQ", "UAR", "")
+ && !StringAt(m_current, 8, "JIMERSON", "JIMERSEN", ""))
+ // "-JOSE" only when it ends the word
+ || (StringAt((m_current + 1), 3, "OSE", "") && ((m_current + 3) == m_last))
+ || StringAt((m_current + 1), 4, "EREZ", "UNTA", "AIME", "AVIE", "AVIA", "")
+ || StringAt((m_current + 1), 6, "IMINEZ", "ARAMIL", "")
+ // word-final "MEJIA"
+ || (((m_current + 2) == m_last) && StringAt((m_current - 2), 5, "MEJIA", ""))
+ || StringAt((m_current - 2), 5, "TEJED", "TEJAD", "LUJAN", "FAJAR", "BEJAR", "BOJOR", "CAJIG",
+ "DEJAS", "DUJAR", "DUJAN", "MIJAR", "MEJOR", "NAJAR",
+ "NOJOS", "RAJED", "RIJAL", "REJON", "TEJAN", "UIJAN", "")
+ || StringAt((m_current - 3), 8, "ALEJANDR", "GUAJARDO", "TRUJILLO", "")
+ // "RAJAS" only when it does not start the word
+ || (StringAt((m_current - 2), 5, "RAJAS", "") && (m_current > 2))
+ // NOTE(review): this also accepts the word-final "MEJIA" already tested
+ // above, so that earlier alternative looks redundant - confirm
+ || (StringAt((m_current - 2), 5, "MEJIA", "") && !StringAt((m_current - 2), 6, "MEJIAN", ""))
+ || StringAt((m_current - 1), 5, "OJEDA", "")
+ || StringAt((m_current - 3), 5, "LEIJA", "MINJA", "")
+ || StringAt((m_current - 3), 6, "VIAJES", "GRAJAL", "")
+ || StringAt(m_current, 8, "JAUREGUI", "")
+ || StringAt((m_current - 4), 8, "HINOJOSA", "")
+ // any 'J' in a word starting with "SAN " (note the blank)
+ || StringAt(0, 4, "SAN ", "")
+ // word-final "-JO"
+ || (((m_current + 1) == m_last)
+ && (CharAt(m_current + 1) == 'O')
+ // exceptions
+ && !(StringAt(0, 4, "TOJO", "")
+ || StringAt(0, 5, "BANJO", "")
+ || StringAt(0, 6, "MARYJO", ""))))
+ {
+ // americans pronounce "juan" as 'wan'
+ // and "marijuana" and "tijuana" also
+ // do not get the 'H' as in spanish, so
+ // just treat it like a vowel in these cases
+ if(!(StringAt(m_current, 4, "JUAN", "") || StringAt(m_current, 4, "JOAQ", "")))
+ {
+ MetaphAdd("H");
+ }
+ else
+ {
+ // treated as a vowel: only encoded at the start of the word
+ if(m_current == 0)
+ {
+ MetaphAdd("A");
+ }
+ }
+ AdvanceCounter(2, 1);
+ return true;
+ }
+
+ // "jorge", "julio" and "jesus" get both a 'J' and an 'H' encoding,
+ // unless the word starts with "JORGEN"
+ if(StringAt((m_current + 1), 4, "ORGE", "ULIO", "ESUS", "")
+ && !StringAt(0, 6, "JORGEN", ""))
+ {
+ // get both consonants for "jorge": when "JORGE" ends the word,
+ // the whole name is encoded here in one step
+ if(((m_current + 4) == m_last) && StringAt((m_current + 1), 4, "ORGE", ""))
+ {
+ if(m_encodeVowels)
+ {
+ MetaphAdd("JARJ", "HARHA");
+ }
+ else
+ {
+ MetaphAdd("JRJ", "HRH");
+ }
+ AdvanceCounter(5, 5);
+ return true;
+ }
+
+ MetaphAdd("J", "H");
+ AdvanceCounter(2, 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Encode cases where 'J' is clearly in a german word or name
+  * that americans pronounce in the german fashion
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_German_J()
+ {
+     // "JAH-", word-final "JOHANN", "JUNG-" (but not "JUNGL-"), "JUGO-"
+     boolean germanContext =
+         StringAt((m_current + 1), 2, "AH", "")
+         || (StringAt((m_current + 1), 5, "OHANN", "") && ((m_current + 5) == m_last))
+         || (StringAt((m_current + 1), 3, "UNG", "") && !StringAt((m_current + 1), 4, "UNGL", ""))
+         || StringAt((m_current + 1), 3, "UGO", "");
+
+     if (!germanContext)
+     {
+         return false;
+     }
+
+     MetaphAdd("A");
+     AdvanceCounter(2, 1);
+     return true;
+ }
+
+ /**
+  * Encode "-JOJ-" and "-JUJ-" as spanish words
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_Spanish_OJ_UJ()
+ {
+     // NOTE(review): "UJUY " carries a trailing blank to fill the
+     // five-character comparison - presumably "jujuy" is then only
+     // recognised when a blank follows it; confirm this is intended
+     if (!StringAt((m_current + 1), 5, "OJOBA", "UJUY ", ""))
+     {
+         return false;
+     }
+
+     // both 'J's are encoded here in one step
+     MetaphAdd(m_encodeVowels ? "HAH" : "HH");
+
+     AdvanceCounter(4, 3);
+     return true;
+ }
+
+ /**
+  * Encode 'J' => J
+  *
+  * When the 'J' is followed by a vowel, that vowel and any further
+  * vowels are skipped, and "JA" is added instead of "J" when vowels
+  * are being encoded. A word-initial 'J' in a name accepted by
+  * Names_Beginning_With_J_That_Get_Alt_Y() additionally passes "A"
+  * as the second argument to MetaphAdd.
+  *
+  * @return true if encoding handled in this routine, false if not;
+  *         every path through this routine encodes the 'J', so the
+  *         result is always true
+  *
+  */
+ boolean Encode_J_To_J()
+ {
+     if(!IsVowel(m_current + 1))
+     {
+         MetaphAdd("J");
+         m_current++;
+         return true;
+     }
+
+     String encoding = m_encodeVowels ? "JA" : "J";
+
+     if((m_current == 0)
+         && Names_Beginning_With_J_That_Get_Alt_Y())
+     {
+         // 'Y' is a vowel so encode
+         // it as 'A'
+         MetaphAdd(encoding, "A");
+     }
+     else
+     {
+         MetaphAdd(encoding);
+     }
+
+     m_current++;
+     // don't encode vowels twice
+     m_current = SkipVowels(m_current);
+
+     // the 'J' has been encoded and m_current advanced, so report it as
+     // handled (this branch used to return false, contradicting the
+     // documented contract)
+     return true;
+ }
+
+ /**
+  * Encode 'J' toward end in spanish words
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_Spanish_J_2()
+ {
+     // spanish forms e.g. "brujo", "badajoz"
+     boolean spanishForm =
+         // 'J' is the third letter of the word
+         ((m_current == 2)
+             && StringAt(0, 4, "BOJA", "BAJA", "BEJA", "BOJO", "MOJA", "MOJI", "MEJI", ""))
+         // 'J' is the fourth letter of the word
+         || ((m_current == 3)
+             && StringAt(0, 5, "FRIJO", "BRUJO", "BRUJA", "GRAJE", "GRIJA", "LEIJA", "QUIJA", ""))
+         // word endings
+         || (((m_current + 3) == m_last)
+             && StringAt((m_current - 1), 5, "AJARA", ""))
+         || (((m_current + 2) == m_last)
+             && StringAt((m_current - 1), 4, "AJOS", "EJOS", "OJAS", "OJOS", "UJON", "AJOZ", "AJAL", "UJAR", "EJON", "EJAN", ""))
+         || (((m_current + 1) == m_last)
+             && StringAt((m_current - 1), 3, "OJA", "EJA", "")
+             && !StringAt(0, 4, "DEJA", ""));
+
+     if (!spanishForm)
+     {
+         return false;
+     }
+
+     MetaphAdd("H");
+     AdvanceCounter(2, 1);
+     return true;
+ }
+
+ /**
+  * Encode 'J' as vowel in some exception cases
+  *
+  * This routine does not advance m_current itself.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_J_As_Vowel()
+ {
+     if (StringAt(m_current, 5, "JEWSK", ""))
+     {
+         MetaphAdd("J", "");
+         return true;
+     }
+
+     // in all of the remaining contexts nothing is added for the 'J'
+
+     // e.g. "stijl", "sejm" - dutch, scandinavian, and eastern european spellings
+     return (StringAt((m_current + 1), 1, "L", "T", "K", "S", "N", "M", "")
+                 // except words from hindi and arabic
+                 && !StringAt((m_current + 2), 1, "A", ""))
+         || StringAt(0, 9, "HALLELUJA", "LJUBLJANA", "")
+         || StringAt(0, 4, "LJUB", "BJOR", "")
+         || StringAt(0, 5, "HAJEK", "")
+         || StringAt(0, 3, "WOJ", "")
+         // e.g. 'fjord'
+         || StringAt(0, 2, "FJ", "")
+         // e.g. 'rekjavik', 'blagojevic'
+         || StringAt(m_current, 5, "JAVIK", "JEVIC", "")
+         || (((m_current + 1) == m_last) && StringAt(0, 5, "SONJA", "TANJA", "TONJA", ""));
+ }
+
+ /**
+  * Call routines to encode 'J', in proper order
+  *
+  */
+ void Encode_Other_J()
+ {
+     // word-initial 'J': german cases first, then the general rule;
+     // both routines advance m_current themselves
+     if (m_current == 0)
+     {
+         if (!Encode_German_J())
+         {
+             Encode_J_To_J();
+         }
+         return;
+     }
+
+     if (Encode_Spanish_J_2())
+     {
+         return;
+     }
+
+     if (!Encode_J_As_Vowel())
+     {
+         MetaphAdd("J");
+     }
+
+     // it could happen! e.g. "hajj"
+     // eat redundant 'J'
+     m_current += (CharAt(m_current + 1) == 'J') ? 2 : 1;
+ }
+
+ /**
+  * Encode 'K'
+  *
+  */
+ void Encode_K()
+ {
+     if (Encode_Silent_K())
+     {
+         return;
+     }
+
+     MetaphAdd("K");
+
+     // eat redundant 'K's and 'Q's
+     boolean nextIsRedundant = (CharAt(m_current + 1) == 'K')
+                               || (CharAt(m_current + 1) == 'Q');
+     m_current += nextIsRedundant ? 2 : 1;
+ }
+
+ /**
+  * Encode cases where 'K' is not pronounced
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_Silent_K()
+ {
+     // word-initial "KN-": skip the 'K' except for special cases
+     // ("KNESSET", "KNIEVEL", "KNISH-")
+     if ((m_current == 0)
+         && StringAt(m_current, 2, "KN", "")
+         && !StringAt((m_current + 2), 5, "ESSET", "IEVEL", "")
+         && !StringAt((m_current + 2), 3, "ISH", ""))
+     {
+         m_current += 1;
+         return true;
+     }
+
+     // e.g. "know", "knit", "knob"
+     boolean silentBeforeN =
+         (StringAt((m_current + 1), 3, "NOW", "NIT", "NOT", "NOB", "")
+             // exception, "slipknot" => SLPNT but "banknote" => PNKNT
+             && !StringAt(0, 8, "BANKNOTE", ""))
+         || StringAt((m_current + 1), 4, "NOCK", "NUCK", "NIFE", "NACK", "")
+         || StringAt((m_current + 1), 5, "NIGHT", "");
+
+     if (!silentBeforeN)
+     {
+         return false;
+     }
+
+     // N already encoded before
+     // e.g. "penknife" - then skip the following 'N' too
+     boolean precededByN = (m_current > 0) && CharAt(m_current - 1) == 'N';
+     m_current += precededByN ? 2 : 1;
+
+     return true;
+ }
+
+ /**
+  * Encode 'L'
+  *
+  * Includes special vowel transposition
+  * encoding, where 'LE' => AL
+  *
+  */
+ void Encode_L()
+ {
+     // the "-LE-" logic at the end needs the index of the 'L'
+     // itself, after 'm_current' has been changed by the routines
+     // called in between
+     int indexOfL = m_current;
+
+     Interpolate_Vowel_When_Cons_L_At_End();
+
+     // the routines are tried strictly in this order; the first
+     // one that handles the 'L' ends the encoding
+     boolean handled = Encode_LELY_To_L()
+         || Encode_COLONEL()
+         || Encode_French_AULT()
+         || Encode_French_EUIL()
+         || Encode_French_OULX()
+         || Encode_Silent_L_In_LM()
+         || Encode_Silent_L_In_LK_LV()
+         || Encode_Silent_L_In_OULD()
+         || Encode_LL_As_Vowel_Cases();
+
+     if (!handled)
+     {
+         Encode_LE_Cases(indexOfL);
+     }
+ }
+
+ /**
+  * Cases where an L follows D, G, or T at the
+  * end have a schwa pronounced before the L
+  *
+  * Only has an effect when vowels are being encoded.
+  *
+  */
+ void Interpolate_Vowel_When_Cons_L_At_End()
+ {
+     if (!m_encodeVowels)
+     {
+         return;
+     }
+
+     // e.g. "ertl", "vogl"
+     boolean finalConsonantL = (m_current == m_last)
+         && StringAt((m_current - 1), 1, "D", "G", "T", "");
+     if (finalConsonantL)
+     {
+         MetaphAdd("A");
+     }
+ }
+
+ /**
+  * Catch cases where 'L' spelled twice but pronounced
+  * once, e.g., 'DOCILELY' => TSL
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_LELY_To_L()
+ {
+     // e.g. "agilely", "docilely" - "-ILELY" must end the word
+     boolean endsInILELY = StringAt((m_current - 1), 5, "ILELY", "")
+         && ((m_current + 3) == m_last);
+     if (!endsInILELY)
+     {
+         return false;
+     }
+
+     MetaphAdd("L");
+     m_current += 3;
+     return true;
+ }
+
+ /**
+  * Encode special case "colonel" => KRNL. Can somebody tell
+  * me how this pronunciation came to be?
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_COLONEL()
+ {
+     if (!StringAt((m_current - 2), 7, "COLONEL", ""))
+     {
+         return false;
+     }
+
+     // the first 'L' is encoded as 'R'
+     MetaphAdd("R");
+     m_current += 2;
+     return true;
+ }
+
+ /**
+  * Encode "-AULT-", found in french names
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_French_AULT()
+ {
+     // e.g. "renault" and "foucault", well known to americans, but not "fault"
+     boolean frenchAult =
+         (m_current > 3)
+         && (StringAt((m_current - 3), 5, "RAULT", "NAULT", "BAULT", "SAULT", "GAULT", "CAULT", "")
+             || StringAt((m_current - 4), 6, "REAULT", "RIAULT", "NEAULT", "BEAULT", ""))
+         // exceptions
+         && !(RootOrInflections(m_inWord, "ASSAULT")
+             || StringAt((m_current - 8), 10, "SOMERSAULT","")
+             || StringAt((m_current - 9), 11, "SUMMERSAULT", ""));
+
+     if (!frenchAult)
+     {
+         return false;
+     }
+
+     // nothing is added: just step over the "LT"
+     m_current += 2;
+     return true;
+ }
+
+ /**
+  * Encode "-EUIL", always found in a french word
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_French_EUIL()
+ {
+     // e.g. "auteuil" - the 'L' must be the last letter
+     boolean endsInEUIL = StringAt((m_current - 3), 4, "EUIL", "")
+         && (m_current == m_last);
+     if (!endsInEUIL)
+     {
+         return false;
+     }
+
+     // nothing is added for the 'L'
+     m_current++;
+     return true;
+ }
+
+ /**
+  * Encode "-OULX", always found in a french word
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_French_OULX()
+ {
+     // e.g. "proulx" - the 'X' must be the last letter
+     boolean endsInOULX = StringAt((m_current - 2), 4, "OULX", "")
+         && ((m_current + 1) == m_last);
+     if (!endsInOULX)
+     {
+         return false;
+     }
+
+     // nothing is added: step over both the 'L' and the 'X'
+     m_current += 2;
+     return true;
+ }
+
+ /**
+  * Encodes contexts where 'L' is not pronounced in "-LM-" or "-LN-"
+  *
+  * Any 'L' followed by 'M' or 'N' is handled here: it is either
+  * skipped as silent or encoded as a plain 'L'.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_Silent_L_In_LM()
+ {
+     if (!StringAt(m_current, 2, "LM", "LN", ""))
+     {
+         return false;
+     }
+
+     // e.g. "lincoln", "holmes", "psalm", "salmon"
+     boolean silentL =
+         (StringAt((m_current - 2), 4, "COLN", "CALM", "BALM", "MALM", "PALM", "")
+             || (StringAt((m_current - 1), 3, "OLM", "") && ((m_current + 1) == m_last))
+             || StringAt((m_current - 3), 5, "PSALM", "QUALM", "")
+             || StringAt((m_current - 2), 6, "SALMON", "HOLMES", "")
+             || StringAt((m_current - 1), 6, "ALMOND", "")
+             || ((m_current == 1) && StringAt((m_current - 1), 4, "ALMS", "")))
+         // exceptions where the 'L' is pronounced after all
+         && (!StringAt((m_current + 2), 1, "A", "")
+             && !StringAt((m_current - 2), 5, "BALMO", "")
+             && !StringAt((m_current - 2), 6, "PALMER", "PALMOR", "BALMER", "")
+             && !StringAt((m_current - 3), 5, "THALM", ""));
+
+     if (!silentL)
+     {
+         MetaphAdd("L");
+     }
+
+     m_current++;
+     return true;
+ }
+
+ /**
+ * Encodes contexts where '-L-' is silent in 'LK', 'LV'; the word
+ * lists also cover "-LF" ("HALF", "CALF") and "SOLDER"
+ *
+ * When a silent context is found the 'L' is skipped without adding
+ * anything to the encoding.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Silent_L_In_LK_LV()
+ {
+ // the outer parenthesized group lists the silent-L contexts; the three
+ // exclusions after it apply to the group as a whole
+ if((StringAt((m_current - 2), 4, "WALK", "YOLK", "FOLK", "HALF", "TALK", "CALF", "BALK", "CALK", "")
+ // NOTE(review): "WALKO" is tested at the same offset that has just
+ // matched "POLK", so that exclusion looks unreachable - confirm
+ || (StringAt((m_current - 2), 4, "POLK", "")
+ && !StringAt((m_current - 2), 5, "POLKA", "WALKO", ""))
+ || (StringAt((m_current - 2), 4, "HALV", "")
+ && !StringAt((m_current - 2), 5, "HALVA", "HALVO", ""))
+ || (StringAt((m_current - 3), 5, "CAULK", "CHALK", "BAULK", "FAULK", "")
+ && !StringAt((m_current - 4), 6, "SCHALK", ""))
+ // '&&' binds tighter than '||', so the "SALVER"/"CALVER" exclusion
+ // below only qualifies this "SALVE"/"CALVE"/"SOLDER" alternative
+ || (StringAt((m_current - 2), 5, "SALVE", "CALVE", "")
+ || StringAt((m_current - 2), 6, "SOLDER", ""))
+ // exceptions to above cases where 'L' is usually pronounced
+ && !StringAt((m_current - 2), 6, "SALVER", "CALVER", ""))
+ && !StringAt((m_current - 5), 9, "GONSALVES", "GONCALVES", "")
+ && !StringAt((m_current - 2), 6, "BALKAN", "TALKAL", "")
+ && !StringAt((m_current - 3), 5, "PAULK", "CHALF", ""))
+ {
+ // silent: step over the 'L' only
+ m_current++;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Encode 'L' in contexts of "-OULD-" where it is silent
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_Silent_L_In_OULD()
+ {
+     // 'would', 'could', and 'should' but not 'shoulder'
+     boolean silentOuld =
+         StringAt((m_current - 3), 5, "WOULD", "COULD", "")
+         || (StringAt((m_current - 4), 6, "SHOULD", "")
+             && !StringAt((m_current - 4), 8, "SHOULDER", ""));
+
+     if (!silentOuld)
+     {
+         return false;
+     }
+
+     // the 'L' is dropped and the following 'D' is encoded here
+     MetaphAddExactApprox("D", "T");
+     m_current += 2;
+     return true;
+ }
+
+ /**
+  * Encode "-ILLA-" and "-ILLE-" in spanish and french
+  * contexts where americans know to pronounce it as a 'Y'
+  *
+  * In these contexts nothing is added to the encoding; both 'L's
+  * are simply skipped.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_LL_As_Vowel_Special_Cases()
+ {
+     if(StringAt((m_current - 5), 8, "TORTILLA", "")
+         || StringAt((m_current - 8), 11, "RATATOUILLE", "")
+         // e.g. 'guillermo', "veillard"
+         || (StringAt(0, 5, "GUILL", "VEILL", "GAILL", "")
+             // 'guillotine' usually has '-ll-' pronounced as 'L' in english
+             && !(StringAt((m_current - 3), 7, "GUILLOT", "GUILLOR", "GUILLEN", "")
+                 || (StringAt(0, 5, "GUILL", "") && (m_length == 5))))
+         // e.g. "brouillard", "gremillion"
+         || StringAt(0, 7, "BROUILL", "GREMILL", "")
+         // e.g. "robillard" - this six-letter prefix used to be listed in the
+         // seven-character comparison above, where it presumably could never
+         // match, so it gets its own comparison of the right length
+         || StringAt(0, 6, "ROBILL", "")
+         // e.g. 'mireille'
+         || (StringAt((m_current - 2), 5, "EILLE", "")
+             && ((m_current + 2) == m_last)
+             // exception "reveille" usually pronounced as 're-vil-lee'
+             && !StringAt((m_current - 5), 8, "REVEILLE", "")))
+     {
+         m_current += 2;
+         return true;
+     }
+
+     return false;
+ }
+
+ /**
+  * Encode other spanish cases where "-LL-" is pronounced as 'Y'
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_LL_As_Vowel()
+ {
+     //spanish e.g. "cabrillo", "gallegos" but also "gorilla", "ballerina" -
+     // give both pronunciations since an american might pronounce "cabrillo"
+     // in the spanish or the american fashion.
+     boolean spanishLL =
+         // word-final "-ILLO", "-ILLA", "-ALLE"
+         (((m_current + 3) == m_length)
+             && StringAt((m_current - 1), 4, "ILLO", "ILLA", "ALLE", ""))
+         // "-ALL-" / "-ILL-" (but not "-ALLA-") in words ending in A, O, AS or OS
+         // NOTE(review): the two-character test starting at m_last reaches
+         // past the last letter - presumably it can never match; confirm
+         || (((StringAt((m_last - 1), 2, "AS", "OS", "")
+                     || StringAt(m_last, 2, "AS", "OS", "")
+                     || StringAt(m_last, 1, "A", "O", ""))
+                 && StringAt((m_current - 1), 2, "AL", "IL", ""))
+             && !StringAt((m_current - 1), 4, "ALLA", ""))
+         || StringAt(0, 5, "VILLE", "VILLA", "")
+         || StringAt(0, 8, "GALLARDO", "VALLADAR", "MAGALLAN", "CAVALLAR", "BALLASTE", "")
+         || StringAt(0, 3, "LLA", "");
+
+     if (!spanishLL)
+     {
+         return false;
+     }
+
+     MetaphAdd("L", "");
+     m_current += 2;
+     return true;
+ }
+
+ /**
+  * Call routines to encode "-LL-", in proper order
+  *
+  * When neither routine handles the letter, m_current is still
+  * advanced past the 'L' (or past both 'L's) before returning false,
+  * without adding anything to the encoding.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_LL_As_Vowel_Cases()
+ {
+     // single 'L': just step over it
+     if (CharAt(m_current + 1) != 'L')
+     {
+         m_current++;
+         return false;
+     }
+
+     if (Encode_LL_As_Vowel_Special_Cases() || Encode_LL_As_Vowel())
+     {
+         return true;
+     }
+
+     // ordinary "-LL-": step over both letters
+     m_current += 2;
+     return false;
+ }
+
+ /**
+ * Encode vowel-encoding cases where "-LE-" is pronounced "-EL-"
+ *
+ * Only applies when vowels are being encoded. On a match "AL" is added
+ * and flag_AL_inversion is set.
+ *
+ * @param save_current index of the 'L' being encoded; Encode_L() saves it
+ * before the routines it calls change m_current
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Vowel_LE_Transposition(int save_current)
+ {
+ // transposition of vowel sound and L occurs in many words,
+ // e.g. "bristle", "dazzle", "goggle" => KAKAL
+ // basic pattern: a consonant other than 'L' or 'R', then "LE", then
+ // a non-vowel, with the 'L' at index 2 or later
+ if(m_encodeVowels
+ && (save_current > 1)
+ && !IsVowel(save_current - 1)
+ && (CharAt(save_current + 1) == 'E')
+ && (CharAt(save_current - 1) != 'L')
+ && (CharAt(save_current - 1) != 'R')
+ // lots of exceptions to this:
+ && !IsVowel(save_current + 2)
+ && !StringAt(0, 7, "ECCLESI", "COMPLEC", "COMPLEJ", "ROBLEDO", "")
+ && !StringAt(0, 5, "MCCLE", "MCLEL", "")
+ && !StringAt(0, 6, "EMBLEM", "KADLEC", "")
+ && !(((save_current + 2) == m_last) && StringAt(save_current, 3, "LET", ""))
+ && !StringAt(save_current, 7, "LETTING", "")
+ && !StringAt(save_current, 6, "LETELY", "LETTER", "LETION", "LETIAN", "LETING", "LETORY", "")
+ && !StringAt(save_current, 5, "LETUS", "LETIV", "")
+ && !StringAt(save_current, 4, "LESS", "LESQ", "LECT", "LEDG", "LETE", "LETH", "LETS", "LETT", "")
+ && !StringAt(save_current, 3, "LEG", "LER", "LEX", "")
+ // e.g. "complement" !=> KAMPALMENT
+ // NOTE(review): the two nested tests below are offset from m_current,
+ // not save_current, and m_current has already been moved by
+ // Encode_LL_As_Vowel_Cases() when called via Encode_L() - the offsets
+ // appear to rely on that; confirm
+ && !(StringAt(save_current, 6, "LEMENT", "")
+ && !(StringAt((m_current - 5), 6, "BATTLE", "TANGLE", "PUZZLE", "RABBLE", "BABBLE", "")
+ || StringAt((m_current - 4), 5, "TABLE", "")))
+ && !(((save_current + 2) == m_last) && StringAt((save_current - 2), 5, "OCLES", "ACLES", "AKLES", ""))
+ && !StringAt((save_current - 3), 5, "LISLE", "AISLE", "")
+ && !StringAt(0, 4, "ISLE", "")
+ && !StringAt(0, 6, "ROBLES", "")
+ && !StringAt((save_current - 4), 7, "PROBLEM", "RESPLEN", "")
+ && !StringAt((save_current - 3), 6, "REPLEN", "")
+ && !StringAt((save_current - 2), 4, "SPLE", "")
+ && (CharAt(save_current - 1) != 'H')
+ && (CharAt(save_current - 1) != 'W'))
+ {
+ MetaphAdd("AL");
+ flag_AL_inversion = true;
+
+ // eat redundant 'L'
+ // (an 'L' right after the "LE": continue after that second 'L')
+ if(CharAt(save_current + 2) == 'L')
+ {
+ m_current = save_current + 3;
+ }
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Encode special vowel-encoding cases where 'E' is not
+  * silent at the end of a word as is the usual case
+  *
+  * @param save_current index of the 'L' being encoded
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_Vowel_Preserve_Vowel_After_L(int save_current)
+ {
+     // an example of where the vowel would NOT need to be preserved
+     // would be, say, "hustled", where there is no vowel pronounced
+     // between the 'l' and the 'd'
+     boolean keepVowel =
+         m_encodeVowels
+         && !IsVowel(save_current - 1)
+         && (CharAt(save_current + 1) == 'E')
+         && (save_current > 1)
+         // the 'E' must not be the last letter
+         && ((save_current + 1) != m_last)
+         // and must not be part of a word-final "-ES" or "-ED"
+         && !(StringAt((save_current + 1), 2, "ES", "ED", "")
+             && ((save_current + 2) == m_last))
+         && !StringAt((save_current - 1), 5, "RLEST", "");
+
+     if (!keepVowel)
+     {
+         return false;
+     }
+
+     MetaphAdd("LA");
+     m_current = SkipVowels(m_current);
+     return true;
+ }
+
+ /**
+  * Call routines to encode "-LE-", in proper order
+  *
+  * Falls back to a plain 'L' when neither routine applies.
+  *
+  * @param save_current index of actual current letter
+  *
+  */
+ void Encode_LE_Cases(int save_current)
+ {
+     boolean handled = Encode_Vowel_LE_Transposition(save_current)
+         || Encode_Vowel_Preserve_Vowel_After_L(save_current);
+
+     if (!handled)
+     {
+         MetaphAdd("L");
+     }
+ }
+
+ /**
+  * Encode "-M-"
+  *
+  */
+ void Encode_M()
+ {
+     boolean handled = Encode_Silent_M_At_Beginning()
+         || Encode_MR_And_MRS()
+         || Encode_MAC()
+         || Encode_MPT();
+
+     if (handled)
+     {
+         return;
+     }
+
+     // Silent 'B' should really be handled
+     // under 'B', not here under 'M'!
+     // Encode_MB() only moves m_current; the 'M' itself is added afterwards
+     Encode_MB();
+
+     MetaphAdd("M");
+ }
+
+ /**
+  * Encode cases where 'M' is silent at beginning of word
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_Silent_M_At_Beginning()
+ {
+     // skip the 'M' of "MN-" when at start of word
+     boolean initialMN = (m_current == 0) && StringAt(m_current, 2, "MN", "");
+     if (!initialMN)
+     {
+         return false;
+     }
+
+     m_current += 1;
+     return true;
+ }
+
+ /**
+  * Encode special cases "Mr." and "Mrs."
+  *
+  * Only applies when the whole word is exactly "MR" or "MRS".
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_MR_And_MRS()
+ {
+     if ((m_current != 0) || !StringAt(m_current, 2, "MR", ""))
+     {
+         return false;
+     }
+
+     // exceptions for "mr." and "mrs."
+     if (m_length == 2)
+     {
+         // the word is exactly "MR"
+         MetaphAdd(m_encodeVowels ? "MASTAR" : "MSTR");
+         m_current += 2;
+         return true;
+     }
+
+     if ((m_length == 3) && StringAt(m_current, 3, "MRS", ""))
+     {
+         MetaphAdd(m_encodeVowels ? "MASAS" : "MSS");
+         m_current += 3;
+         return true;
+     }
+
+     return false;
+ }
+
+ /**
+  * Encode "Mac-" and "Mc-"
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_MAC()
+ {
+     if (m_current != 0)
+     {
+         return false;
+     }
+
+     // should only find irish and
+     // scottish names e.g. 'macintosh'
+     boolean macName = StringAt(0, 7, "MACIVER", "MACEWEN", "")
+         || StringAt(0, 8, "MACELROY", "MACILROY", "")
+         || StringAt(0, 9, "MACINTOSH", "");
+     boolean mcName = StringAt(0, 2, "MC", "");
+
+     if (!macName && !mcName)
+     {
+         return false;
+     }
+
+     MetaphAdd(m_encodeVowels ? "MAK" : "MK");
+
+     if (mcName)
+     {
+         // a 'K', 'G' or 'Q' directly after "MC" is skipped as well;
+         // watch out for e.g. "McGeorge"
+         boolean skipNextLetter = StringAt((m_current + 2), 1, "K", "G", "Q", "")
+             && !StringAt((m_current + 2), 4, "GEOR", "");
+         m_current += skipNextLetter ? 3 : 2;
+     }
+     else
+     {
+         // step over "MAC"
+         m_current += 3;
+     }
+
+     return true;
+ }
+
+ /**
+  * Encode "-MPT-" in "comptroller" and "accompt", where 'N' is
+  * added in place of the "MP"
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_MPT()
+ {
+     boolean mptContext = StringAt((m_current - 2), 8, "COMPTROL", "")
+         || StringAt((m_current - 4), 7, "ACCOMPT", "");
+     if (!mptContext)
+     {
+         return false;
+     }
+
+     MetaphAdd("N");
+     // step over the "MP"
+     m_current += 2;
+     return true;
+ }
+
+ /**
+  * Test if 'B' is silent in these contexts
+  *
+  * The word lists are anchored at the start of the word; note that
+  * "DAMN" is included as well.
+  *
+  * @return true if 'B' is silent in this context
+  *
+  */
+ boolean Test_Silent_MB_1()
+ {
+     // e.g. "LAMB", "COMB", "LIMB", "DUMB", "BOMB"
+     // Handle combining roots first
+     boolean thumbRoot = (m_current == 3)
+         && StringAt((m_current - 3), 5, "THUMB", "");
+
+     return thumbRoot
+         || ((m_current == 2)
+             && StringAt((m_current - 2), 4, "DUMB", "BOMB", "DAMN", "LAMB", "NUMB", "TOMB", ""));
+ }
+
+ /**
+  * Test if 'B' is pronounced in this context
+  *
+  * @return true if 'B' is pronounced in this context
+  *
+  */
+ boolean Test_Pronounced_MB()
+ {
+     // "number", "-MBA-" (except "dumbass"), "-MBO-", and a few
+     // words built on "lamb" and "tomb"
+     return StringAt((m_current - 2), 6, "NUMBER", "")
+         || (StringAt((m_current + 2), 1, "A", "")
+             && !StringAt((m_current - 2), 7, "DUMBASS", ""))
+         || StringAt((m_current + 2), 1, "O", "")
+         || StringAt((m_current - 2), 6, "LAMBEN", "LAMBER", "LAMBET", "TOMBIG", "LAMBRE", "");
+ }
+
+ /**
+ * Test whether "-B-" is silent in these contexts
+ *
+ * Requires the 'M' to be at index 2 or later and to be followed by
+ * 'B'; the "-MB" must then end the word or be followed by one of the
+ * suffixes tested below.
+ *
+ * @return true if 'B' is silent in this context
+ *
+ */
+ boolean Test_Silent_MB_2()
+ {
+ // 'M' is the current letter
+ if ((CharAt(m_current + 1) == 'B') && (m_current > 1)
+ // "-MB" at the very end of the word
+ && (((m_current + 1) == m_last)
+ // other situations where "-MB-" is at end of root
+ // but not at end of word. The tests are for standard
+ // noun suffixes.
+ // e.g. "climbing" => KLMNK
+ || StringAt((m_current + 2), 3, "ING", "ABL", "")
+ || StringAt((m_current + 2), 4, "LIKE", "")
+ // word-final "-MBS"
+ || ((CharAt(m_current + 2) == 'S') && ((m_current + 2) == m_last))
+ || StringAt((m_current - 5), 7, "BUNCOMB", "")
+ // e.g. "bomber",
+ // word-final "-MBED" / "-MBER"
+ || (StringAt((m_current + 2), 2, "ED", "ER", "")
+ && ((m_current + 3) == m_last)
+ // words starting with "CLIMB" / "PLUMB" always qualify; otherwise
+ // "-IMBER", "-AMBER", "-EMBER" and "-UMBER" are excluded
+ && (StringAt(0, 5, "CLIMB", "PLUMB", "")
+ // e.g. "beachcomber"
+ || !StringAt((m_current - 1), 5, "IMBER", "AMBER", "EMBER", "UMBER", ""))
+ // exceptions
+ && !StringAt((m_current - 2), 6, "CUMBER", "SOMBER", "") ) ) )
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Test if 'B' is pronounced in these "-MB-" contexts
+  *
+  * @return true if "-B-" is pronounced in these contexts
+  *
+  */
+ boolean Test_Pronounced_MB_2()
+ {
+     // e.g. "bombastic", "umbrage", "flamboyant"
+     return StringAt((m_current - 1), 5, "OMBAS", "OMBAD", "UMBRA", "")
+         || StringAt((m_current - 3), 4, "FLAM", "");
+ }
+
+ /**
+  * Tests for contexts where "-N-" is silent when after "-M-"
+  *
+  * @return true if "-N-" is silent in these contexts
+  *
+  */
+ boolean Test_MN()
+ {
+     if (CharAt(m_current + 1) != 'N')
+     {
+         return false;
+     }
+
+     // "-MN" at the end of a word
+     return ((m_current + 1) == m_last)
+         // or at the end of a word but followed by suffixes
+         || (StringAt((m_current + 2), 3, "ING", "EST", "") && ((m_current + 4) == m_last))
+         || ((CharAt(m_current + 2) == 'S') && ((m_current + 2) == m_last))
+         || (StringAt((m_current + 2), 2, "LY", "ER", "ED", "")
+             && ((m_current + 3) == m_last))
+         || StringAt((m_current - 2), 9, "DAMNEDEST", "")
+         || StringAt((m_current - 5), 9, "GODDAMNIT", "");
+ }
+
+ /**
+  * Call routines to encode "-MB-", in proper order
+  *
+  * Nothing is added to the encoding here; this routine only decides
+  * whether m_current moves past the 'M' alone or past the 'M' and
+  * the letter after it.
+  *
+  */
+ void Encode_MB()
+ {
+     boolean skipFollowingLetter;
+
+     if (Test_Silent_MB_1())
+     {
+         // silent 'B' unless an exception says it is pronounced
+         skipFollowingLetter = !Test_Pronounced_MB();
+     }
+     else if (Test_Silent_MB_2())
+     {
+         skipFollowingLetter = !Test_Pronounced_MB_2();
+     }
+     else
+     {
+         // silent 'N' after 'M', or eat redundant 'M'
+         skipFollowingLetter = Test_MN() || (CharAt(m_current + 1) == 'M');
+     }
+
+     m_current += skipFollowingLetter ? 2 : 1;
+ }
+
+ /**
+  * Encode "-N-"
+  *
+  */
+ void Encode_N()
+ {
+     if (Encode_NCE())
+     {
+         return;
+     }
+
+     // eat redundant 'N'
+     m_current += (CharAt(m_current + 1) == 'N') ? 2 : 1;
+
+     // m_current has already been advanced, so these offsets are
+     // relative to the new position
+     boolean suppressN = StringAt((m_current - 3), 8, "MONSIEUR", "")
+         // e.g. "aloneness",
+         || StringAt((m_current - 3), 6, "NENESS", "");
+
+     if (!suppressN)
+     {
+         MetaphAdd("N");
+     }
+ }
+
+ /**
+  * Encode "-NCE-" and "-NSE-"
+  * "entrance" is pronounced exactly the same as "entrants"
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_NCE()
+ {
+     //'acceptance', 'accountancy'
+     boolean ncePattern = StringAt((m_current + 1), 1, "C", "S", "")
+         && StringAt((m_current + 2), 1, "E", "Y", "I", "");
+     if (!ncePattern)
+     {
+         return false;
+     }
+
+     // the pattern must end the word, optionally followed by a final 'S'
+     boolean atWordEnd = ((m_current + 2) == m_last)
+         || (((m_current + 3) == m_last) && (CharAt(m_current + 3) == 'S'));
+     if (!atWordEnd)
+     {
+         return false;
+     }
+
+     MetaphAdd("NTS");
+     m_current += 2;
+     return true;
+ }
+
+ /**
+  * Encode "-P-"
+  *
+  */
+ void Encode_P()
+ {
+     // special cases are tried strictly in this order
+     boolean handled = Encode_Silent_P_At_Beginning()
+         || Encode_PT()
+         || Encode_PH()
+         || Encode_PPH()
+         || Encode_RPS()
+         || Encode_COUP()
+         || Encode_PNEUM()
+         || Encode_PSYCH()
+         || Encode_PSALM();
+
+     if (handled)
+     {
+         return;
+     }
+
+     // Encode_PB() only moves m_current; the 'P' itself is added afterwards
+     Encode_PB();
+
+     MetaphAdd("P");
+ }
+
+ /**
+  * Encode cases where "-P-" is silent at the start of a word
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_Silent_P_At_Beginning()
+ {
+     // skip the 'P' of these pairs when at start of word
+     boolean silentInitialP = (m_current == 0)
+         && StringAt(m_current, 2, "PN", "PF", "PS", "PT", "");
+     if (!silentInitialP)
+     {
+         return false;
+     }
+
+     m_current += 1;
+     return true;
+ }
+
+ /**
+  * Encode cases where "-P-" is silent before "-T-"
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_PT()
+ {
+     if (CharAt(m_current + 1) != 'T')
+     {
+         return false;
+     }
+
+     // 'pterodactyl', 'receipt', 'asymptote'
+     boolean silentP = ((m_current == 0) && StringAt(m_current, 5, "PTERO", ""))
+         || StringAt((m_current - 5), 7, "RECEIPT", "")
+         || StringAt((m_current - 4), 8, "ASYMPTOT", "");
+     if (!silentP)
+     {
+         return false;
+     }
+
+     // the 'T' is encoded here and both letters are consumed
+     MetaphAdd("T");
+     m_current += 2;
+     return true;
+ }
+
+ /**
+  * Encode "-PH-", usually as F, with exceptions for
+  * cases where it is silent, or where the 'P' and 'H'
+  * are pronounced separately because they belong to
+  * two different words in a combining form
+  *
+  * Every 'P' that is followed by 'H' is handled here.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_PH()
+ {
+     if (CharAt(m_current + 1) != 'H')
+     {
+         return false;
+     }
+
+     // 'PH' silent in these contexts: the whole "PHTH" is
+     // consumed and encoded as "0"
+     boolean silentPH = StringAt(m_current, 9, "PHTHALEIN", "")
+         || ((m_current == 0) && StringAt(m_current, 4, "PHTH", ""))
+         || StringAt((m_current - 3), 10, "APOPHTHEGM", "");
+
+     if (silentPH)
+     {
+         MetaphAdd("0");
+         m_current += 4;
+         return true;
+     }
+
+     // combining forms
+     //'sheepherd', 'upheaval', 'cupholder'
+     boolean combiningForm = (m_current > 0)
+         && (StringAt((m_current + 2), 3, "EAD", "OLE", "ELD", "ILL", "OLD", "EAP", "ERD",
+                      "ARD", "ANG", "ORN", "EAV", "ART", "")
+             || StringAt((m_current + 2), 4, "OUSE", "")
+             || (StringAt((m_current + 2), 2, "AM", "") && !StringAt((m_current -1), 5, "LPHAM", ""))
+             || StringAt((m_current + 2), 5, "AMMER", "AZARD", "UGGER", "")
+             || StringAt((m_current + 2), 6, "OLSTER", ""))
+         && !StringAt((m_current - 3), 5, "LYMPH", "NYMPH", "");
+
+     if (combiningForm)
+     {
+         MetaphAdd("P");
+         AdvanceCounter(3, 2);
+     }
+     else
+     {
+         MetaphAdd("F");
+         m_current += 2;
+     }
+     return true;
+ }
+
+ /**
+  * Encode "-PPH-". I don't know why the greek poet's
+  * name is transliterated this way...
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_PPH()
+ {
+     // 'sappho'
+     boolean isPPH = (CharAt(m_current + 1) == 'P')
+         && ((m_current + 2) < m_length)
+         && (CharAt(m_current + 2) == 'H');
+     if (!isPPH)
+     {
+         return false;
+     }
+
+     MetaphAdd("F");
+     m_current += 3;
+     return true;
+ }
+
+ /**
+  * Encode "-CORPS-" where "-PS-" not pronounced
+  * since the cognate is here from the french
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_RPS()
+ {
+     //'-corps-', 'corpsman', but not 'corpse'
+     boolean silentPS = StringAt((m_current - 3), 5, "CORPS", "")
+         && !StringAt((m_current - 3), 6, "CORPSE", "");
+     if (!silentPS)
+     {
+         return false;
+     }
+
+     // nothing is added: step over the "PS"
+     m_current += 2;
+     return true;
+ }
+
+ /**
+  * Encode "-COUP" where "-P-" is not pronounced
+  * since the word is from the french
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_COUP()
+ {
+     //'coup' at the end of a word, but not 'recoup'
+     boolean silentFinalP = (m_current == m_last)
+         && StringAt((m_current - 3), 4, "COUP", "")
+         && !StringAt((m_current - 5), 6, "RECOUP", "");
+     if (!silentFinalP)
+     {
+         return false;
+     }
+
+     // nothing is added for the 'P'
+     m_current++;
+     return true;
+ }
+
+ /**
+  * Encode 'P' in non-initial contexts of "-PNEUM-"
+  * where it is also silent
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_PNEUM()
+ {
+     //'-pneum-'
+     if (!StringAt((m_current + 1), 4, "NEUM", ""))
+     {
+         return false;
+     }
+
+     // the 'N' is encoded here and both letters are consumed
+     MetaphAdd("N");
+     m_current += 2;
+     return true;
+ }
+
+ /**
+  * Encode special case "-PSYCH-" where two encodings need to be
+  * accounted for in one syllable, one for the 'PS' and one for
+  * the 'CH'
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_PSYCH()
+ {
+     //'-psych-'
+     if (!StringAt((m_current + 1), 4, "SYCH", ""))
+     {
+         return false;
+     }
+
+     MetaphAdd(m_encodeVowels ? "SAK" : "SK");
+
+     // all five letters of "PSYCH" are consumed
+     m_current += 5;
+     return true;
+ }
+
+ /**
+  * Encode 'P' in context of "-PSALM-", where it has
+  * become silent
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_PSALM()
+ {
+     //'-psalm-'
+     if (!StringAt((m_current + 1), 4, "SALM", ""))
+     {
+         return false;
+     }
+
+     // go ahead and encode entire word
+     MetaphAdd(m_encodeVowels ? "SAM" : "SM");
+
+     // all five letters of "PSALM" are consumed
+     m_current += 5;
+     return true;
+ }
+
+ /**
+  * Eat redundant 'B' or 'P'
+  *
+  * Nothing is added to the encoding here; only m_current is moved.
+  *
+  */
+ void Encode_PB()
+ {
+     // e.g. "campbell", "raspberry"
+     // eat redundant 'P' or 'B'
+     boolean redundantNext = StringAt((m_current + 1), 1, "P", "B", "");
+     m_current += redundantNext ? 2 : 1;
+ }
+
+ /**
+  * Encode "-Q-"
+  *
+  */
+ void Encode_Q()
+ {
+     // current pinyin
+     if (StringAt(m_current, 3, "QIN", ""))
+     {
+         MetaphAdd("X");
+         m_current++;
+         return;
+     }
+
+     // eat redundant 'Q'
+     m_current += (CharAt(m_current + 1) == 'Q') ? 2 : 1;
+
+     MetaphAdd("K");
+ }
+
+ /**
+  * Encode "-R-"
+  *
+  */
+ void Encode_R()
+ {
+     if (Encode_RZ())
+     {
+         return;
+     }
+
+     // a plain 'R' is added only when the 'R' is not silent and the
+     // vowel transposition routine did not handle it
+     if (!Test_Silent_R() && !Encode_Vowel_RE_Transposition())
+     {
+         MetaphAdd("R");
+     }
+
+     // eat redundant 'R'; also skip 'S' as well as 'R' in "poitiers"
+     boolean skipNextLetter = (CharAt(m_current + 1) == 'R')
+         || StringAt((m_current - 6), 8, "POITIERS", "");
+     m_current += skipNextLetter ? 2 : 1;
+ }
+
+ /**
+ * Encode "-RZ-" according
+ * to american and polish pronunciations.
+ *
+ * The checks run in this order, and the first match wins:
+ * 1. contexts that are explicitly not handled here ("GARZ", "KURZ",
+ *    "MARZ", "MERZ", "HERZ", "PERZ", "WARZ", "RZANO", "RZOLA", "ARZA",
+ *    "ARZN"): returns false without touching the encoding or m_current;
+ * 2. "YASTRZEMSKI": adds "R" with alternate "X";
+ * 3. "BRZEZINSKI": adds "RS" with alternate "RJ" and advances four
+ *    positions so that the second 'Z' is skipped too;
+ * 4. "TRZ", "PRZ", "KRZ", or "RZ" that is preceded by a vowel or starts
+ *    the word: adds "RS" with alternate "X";
+ * 5. "BRZ", "DRZ", "GRZ": adds "RS" with alternate "J".
+ * Cases 2, 4 and 5 advance m_current by two (past the 'R' and the 'Z').
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_RZ()
+ {
+ if(StringAt((m_current - 2), 4, "GARZ", "KURZ", "MARZ", "MERZ", "HERZ", "PERZ", "WARZ", "")
+ || StringAt(m_current, 5, "RZANO", "RZOLA", "")
+ || StringAt((m_current - 1), 4, "ARZA", "ARZN", ""))
+ {
+ return false;
+ }
+
+ // 'yastrzemski' usually has 'z' silent in
+ // united states, but should get 'X' in poland
+ if(StringAt((m_current - 4), 11, "YASTRZEMSKI", ""))
+ {
+ MetaphAdd("R", "X");
+ m_current += 2;
+ return true;
+ }
+ // 'BRZEZINSKI' gets two pronunciations
+ // in the united states, neither of which
+ // are authentically polish
+ if(StringAt((m_current - 1), 10, "BRZEZINSKI", ""))
+ {
+ MetaphAdd("RS", "RJ");
+ // skip over 2nd 'Z'
+ m_current += 4;
+ return true;
+ }
+ // 'z' in "rz" after voiceless consonant (T, P, K), after a vowel, or at the
+ // beginning of the word gets 'X' in alternate polish style pronunciation
+ else if(StringAt((m_current - 1), 3, "TRZ", "PRZ", "KRZ", "")
+ || (StringAt(m_current, 2, "RZ", "")
+ && (IsVowel(m_current - 1) || (m_current == 0))))
+ {
+ MetaphAdd("RS", "X");
+ m_current += 2;
+ return true;
+ }
+ // 'z' in "rz" after voiced consonant (B, D, G)
+ // gets 'J' in alternate polish style pronunciation
+ else if(StringAt((m_current - 1), 3, "BRZ", "DRZ", "GRZ", ""))
+ {
+ MetaphAdd("RS", "J");
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Test whether 'R' is silent in this context.
+ *
+ * This routine only tests; it neither adds to the encoding nor moves
+ * m_current. Two groups of contexts count as silent:
+ * - a word-final 'R' (m_current == m_last) in an "-IER" ending whose
+ *   stem is one of the listed french stems; each list is matched at the
+ *   offset that places the stem directly before "IER" (three-letter
+ *   stems at m_current - 5, four-letter stems at m_current - 6, and so
+ *   on up to the eight-letter stem "CHARPENT" at m_current - 10);
+ * - the specific words "SURBURB-", "WORSTED", "WORCESTER", "MONSIEUR"
+ *   and "POITIERS", which are checked regardless of whether the 'R' is
+ *   word-final.
+ *
+ * @return true if 'R' is silent in this context
+ *
+ */
+ boolean Test_Silent_R()
+ {
+ // test cases where 'R' is silent, either because the
+ // word is from the french or because it is no longer pronounced.
+ // e.g. "rogier", "monsieur", "surburban"
+ if(((m_current == m_last)
+ // reliably french word ending
+ && StringAt((m_current - 2), 3, "IER", "")
+ // e.g. "metier"
+ && (StringAt((m_current - 5), 3, "MET", "VIV", "LUC", "")
+ // e.g. "cartier", "bustier"
+ || StringAt((m_current - 6), 4, "CART", "DOSS", "FOUR", "OLIV", "BUST", "DAUM", "ATEL",
+ "SONN", "CORM", "MERC", "PELT", "POIR", "BERN", "FORT", "GREN",
+ "SAUC", "GAGN", "GAUT", "GRAN", "FORC", "MESS", "LUSS", "MEUN",
+ "POTH", "HOLL", "CHEN", "")
+ // e.g. "croupier"
+ || StringAt((m_current - 7), 5, "CROUP", "TORCH", "CLOUT", "FOURN", "GAUTH", "TROTT",
+ "DEROS", "CHART", "")
+ // e.g. "chevalier"
+ || StringAt((m_current - 8), 6, "CHEVAL", "LAVOIS", "PELLET", "SOMMEL", "TREPAN", "LETELL", "COLOMB", "")
+ || StringAt((m_current - 9), 7, "CHARCUT", "")
+ || StringAt((m_current - 10), 8, "CHARPENT", "")))
+ || StringAt((m_current - 2), 7, "SURBURB", "WORSTED", "")
+ || StringAt((m_current - 2), 9, "WORCESTER", "")
+ || StringAt((m_current - 7), 8, "MONSIEUR", "")
+ || StringAt((m_current - 6), 8, "POITIERS", "") )
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-re-" as 'AR' in contexts
+ * where this is the correct pronunciation.
+ *
+ * "AR" is added only when ALL of the following hold:
+ * - vowels are being encoded (m_encodeVowels);
+ * - the letter after the 'R' is 'E';
+ * - the word is longer than three letters;
+ * - the word does not begin with "OUTRE", "LIBRE" or "ANDRE";
+ * - the word is not a four-letter word beginning with "FRED" or "TRES";
+ * - the letters around the 'R' are not one of "LDRED", "LFRED", "NDRED",
+ *   "NFRED", "NDRES", "IFRED";
+ * - the letter before the 'R' is not a vowel;
+ * - the 'E' is the last letter of the word, or it is followed by a
+ *   word-final 'D' or 'S'.
+ * This routine does not move m_current; the caller advances it.
+ *
+ * NOTE(review): "TRES" appears in the exclusion list that is compared
+ * with a length of 5. If StringAt compares the whole five-character
+ * substring, a four-character entry can never match there - confirm
+ * which five-letter context was intended.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Vowel_RE_Transposition()
+ {
+ // -re inversion is just like
+ // -le inversion
+ // e.g. "fibre" => FABAR or "centre" => SANTAR
+ if((m_encodeVowels)
+ && (CharAt(m_current + 1) == 'E')
+ && (m_length > 3)
+ && !StringAt(0, 5, "OUTRE", "LIBRE", "ANDRE", "")
+ && !(StringAt(0, 4, "FRED", "TRES", "") && (m_length == 4))
+ && !StringAt((m_current - 2), 5, "LDRED", "LFRED", "NDRED", "NFRED", "NDRES", "TRES", "IFRED", "")
+ && !IsVowel(m_current - 1)
+ && (((m_current + 1) == m_last)
+ || (((m_current + 2) == m_last)
+ && StringAt((m_current + 2), 1, "D", "S", ""))))
+ {
+ MetaphAdd("AR");
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Encode "-S-".
+  *
+  * The special-case routines are tried one after another, in the
+  * order listed; the first one that reports it handled the letter
+  * ends the processing. When none does, a plain "S" is added and a
+  * directly following 'S' or 'Z' is skipped along with the current
+  * letter, unless that following 'S' begins an "SH".
+  *
+  */
+ void Encode_S()
+ {
+     boolean handledBySpecialCase =
+             Encode_SKJ()
+             || Encode_Special_SW()
+             || Encode_SJ()
+             || Encode_Silent_French_S_Final()
+             || Encode_Silent_French_S_Internal()
+             || Encode_ISL()
+             || Encode_STL()
+             || Encode_Christmas()
+             || Encode_STHM()
+             || Encode_ISTEN()
+             || Encode_Sugar()
+             || Encode_SH()
+             || Encode_SCH()
+             || Encode_SUR()
+             || Encode_SU()
+             || Encode_SSIO()
+             || Encode_SS()
+             || Encode_SIA()
+             || Encode_SIO()
+             || Encode_Anglicisations()
+             || Encode_SC()
+             || Encode_SEA_SUI_SIER()
+             || Encode_SEA();
+
+     if(handledBySpecialCase)
+     {
+         return;
+     }
+
+     MetaphAdd("S");
+
+     // swallow a following 'S' or 'Z', but leave an "SH" for the next pass
+     boolean swallowFollower = StringAt((m_current + 1), 1, "S", "Z", "")
+                                   && !StringAt((m_current + 1), 2, "SH", "");
+     m_current += swallowFollower ? 2 : 1;
+ }
+
+ /**
+  * Encode a couple of word-initial contexts where scandinavian,
+  * slavic or german names should get an alternate, native
+  * pronunciation of 'SV' or 'XV' next to the plain 'S'.
+  *
+  * @return true if handled
+  *
+  */
+ boolean Encode_Special_SW()
+ {
+     // both name lists only apply at the very start of the word
+     if(m_current != 0)
+     {
+         return false;
+     }
+
+     // names whose alternate encoding is "SV"
+     if(Names_Beginning_With_SW_That_Get_Alt_SV())
+     {
+         MetaphAdd("S", "SV");
+         m_current += 2;
+         return true;
+     }
+
+     // names whose alternate encoding is "XV"
+     if(Names_Beginning_With_SW_That_Get_Alt_XV())
+     {
+         MetaphAdd("S", "XV");
+         m_current += 2;
+         return true;
+     }
+
+     return false;
+ }
+
+ /**
+  * Encode "-SKJ-" as X ("sh"), since americans pronounce
+  * the name Dag Hammerskjold as "hammer-shold". The 'S', 'K' and
+  * 'J' are all consumed.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_SKJ()
+ {
+     // scandinavian; only "SKJO" / "SKJU" with a vowel in fourth place
+     if(!(StringAt(m_current, 4, "SKJO", "SKJU", "")
+             && IsVowel(m_current + 3)))
+     {
+         return false;
+     }
+
+     MetaphAdd("X");
+     m_current += 3;
+     return true;
+ }
+
+ /**
+  * Encode initial swedish "SJ-" as X ("sh"), consuming both letters.
+  *
+  * The rule is anchored to the 'S' at the start of the word. Testing
+  * only whether the word begins with "SJ" would make the rule fire
+  * again for every later 'S' of such a word (e.g. the second 'S' of
+  * "sjostrom"), adding a spurious X and skipping the letter that
+  * follows that 'S'.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_SJ()
+ {
+     // only the word-initial 'S' may trigger the "SJ-" rule
+     if((m_current == 0) && StringAt(0, 2, "SJ", ""))
+     {
+         MetaphAdd("X");
+         m_current += 2;
+         return true;
+     }
+
+     return false;
+ }
+
+ /**
+ * Encode final 'S' in words from the french, where they
+ * are not pronounced.
+ *
+ * Two outcomes are possible when the routine handles the letter:
+ * - a word beginning with "LOUIS" whose final letter is this 'S' gets
+ *   "S" in the main encoding and nothing in the alternate one;
+ * - every other listed case adds nothing at all: the 'S' is simply
+ *   stepped over.
+ * All cases require the 'S' to be the last letter of the word
+ * (m_current == m_last). Some lists are matched against the end of the
+ * word (offsets relative to m_current), others against its beginning
+ * (offset 0). The last alternative treats any final 'S' after "AI",
+ * "OI" or "UI" as silent, except in words beginning with "LOIS" or
+ * "LUIS".
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Silent_French_S_Final()
+ {
+ // "louis" is an exception because it gets two pronunciations
+ if(StringAt(0, 5, "LOUIS", "") && (m_current == m_last))
+ {
+ MetaphAdd("S", "");
+ m_current++;
+ return true;
+ }
+
+ // french words familiar to americans where final s is silent
+ if((m_current == m_last)
+ && (StringAt(0, 4, "YVES", "")
+ || (StringAt(0, 4, "HORS", "") && (m_current == 3))
+ || StringAt((m_current - 4), 5, "CAMUS", "YPRES", "")
+ || StringAt((m_current - 5), 6, "MESNES", "DEBRIS", "BLANCS", "INGRES", "CANNES", "")
+ || StringAt((m_current - 6), 7, "CHABLIS", "APROPOS", "JACQUES", "ELYSEES", "OEUVRES",
+ "GEORGES", "DESPRES", "")
+ || StringAt(0, 8, "ARKANSAS", "FRANCAIS", "CRUDITES", "BRUYERES", "")
+ || StringAt(0, 9, "DESCARTES", "DESCHUTES", "DESCHAMPS", "DESROCHES", "DESCHENES", "")
+ || StringAt(0, 10, "RENDEZVOUS", "")
+ || StringAt(0, 11, "CONTRETEMPS", "DESLAURIERS", ""))
+ || ((m_current == m_last)
+ && StringAt((m_current - 2), 2, "AI", "OI", "UI", "")
+ && !StringAt(0, 4, "LOIS", "LUIS", "")))
+ {
+ m_current++;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode non-final 'S' in words from the french where they
+ * are not pronounced.
+ *
+ * Every pattern is matched at an offset that places the current 'S'
+ * on the silent 'S' of the listed word (e.g. "DUQUESNE" is tested at
+ * m_current - 5, "GROSVENOR" at m_current - 3). On a match nothing is
+ * added to the encoding; the 'S' is stepped over.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Silent_French_S_Internal()
+ {
+ // french words familiar to americans where internal s is silent
+ if(StringAt((m_current - 2), 9, "DESCARTES", "")
+ || StringAt((m_current - 2), 7, "DESCHAM", "DESPRES", "DESROCH", "DESROSI", "DESJARD", "DESMARA",
+ "DESCHEN", "DESHOTE", "DESLAUR", "")
+ || StringAt((m_current - 2), 6, "MESNES", "")
+ || StringAt((m_current - 5), 8, "DUQUESNE", "DUCHESNE", "")
+ || StringAt((m_current - 7), 10, "BEAUCHESNE", "")
+ || StringAt((m_current - 3), 7, "FRESNEL", "")
+ || StringAt((m_current - 3), 9, "GROSVENOR", "")
+ || StringAt((m_current - 4), 10, "LOUISVILLE", "")
+ || StringAt((m_current - 7), 10, "ILLINOISAN", ""))
+ {
+ m_current++;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode silent 'S' in context of "-ISL-".
+ *
+ * The 'S' is treated as silent (nothing added, 'S' stepped over) when:
+ * - it sits inside "LISL", "LYSL" or "AISL", unless the wider context is
+ *   "PAISLEY", "BAISLEY", "ALISLAM", "ALISLAH" or "ALISLAA"; or
+ * - it is the second letter of the word and the word begins with
+ *   "ISLE" or "ISLAN", unless it begins with "ISLEY" or "ISLER".
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_ISL()
+ {
+ //special cases 'island', 'isle', 'carlisle', 'carlysle'
+ if((StringAt((m_current - 2), 4, "LISL", "LYSL", "AISL", "")
+ && !StringAt((m_current - 3), 7, "PAISLEY", "BAISLEY", "ALISLAM", "ALISLAH", "ALISLAA", ""))
+ || ((m_current == 1)
+ && ((StringAt((m_current - 1), 4, "ISLE", "")
+ || StringAt((m_current - 1), 5, "ISLAN", ""))
+ && !StringAt((m_current - 1), 5, "ISLEY", "ISLER", ""))))
+ {
+ m_current++;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-STL-" in contexts where the 'T' is silent. Also
+ * encode "-USCLE-" in contexts where the 'C' is silent.
+ *
+ * Trigger: "STLE" / "STLI" (unless followed by "LESS", "LIKE" or
+ * "LINE"), or "THISTLY" / "BRISTLY" / "GRISTLY", or "USCLE".
+ * Once triggered there are three outcomes:
+ * - listed names and "-LING" nouns keep the 'T': "ST" is added and two
+ *   letters are consumed;
+ * - when vowels are encoded and the letter three positions on is an 'E'
+ *   that is not followed by 'R' and does not begin "ETTE", "ETTA" or
+ *   "EY": "SAL" is added and flag_AL_inversion is set;
+ * - otherwise "SL" is added.
+ * The last two outcomes consume three letters.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_STL()
+ {
+ //'hustle', 'bustle', 'whistle'
+ if((StringAt(m_current, 4, "STLE", "STLI", "")
+ && !StringAt((m_current + 2), 4, "LESS", "LIKE", "LINE", ""))
+ || StringAt((m_current - 3), 7, "THISTLY", "BRISTLY", "GRISTLY", "")
+ // e.g. "corpuscle"
+ || StringAt((m_current - 1), 5, "USCLE", ""))
+ {
+ // KRISTEN, KRYSTLE, CRYSTLE, KRISTLE all pronounce the 't'
+ // also, exceptions where "-LING" is a nominalizing suffix
+ if(StringAt(0, 7, "KRISTEN", "KRYSTLE", "CRYSTLE", "KRISTLE", "")
+ || StringAt(0, 11, "CHRISTENSEN", "CHRISTENSON", "")
+ || StringAt((m_current - 3), 9, "FIRSTLING", "")
+ || StringAt((m_current - 2), 8, "NESTLING", "WESTLING", ""))
+ {
+ MetaphAdd("ST");
+ m_current += 2;
+ }
+ else
+ {
+ if(m_encodeVowels
+ && (CharAt(m_current + 3) == 'E')
+ && (CharAt(m_current + 4) != 'R')
+ && !StringAt((m_current + 3), 4, "ETTE", "ETTA", "")
+ && !StringAt((m_current + 3), 2, "EY", ""))
+ {
+ MetaphAdd("SAL");
+ flag_AL_inversion = true;
+ }
+ else
+ {
+ MetaphAdd("SL");
+ }
+ m_current += 3;
+ }
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Encode "christmas". Americans always pronounce this as "krissmuss",
+  * so the "STM" is encoded as "SM" and all three letters are consumed.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_Christmas()
+ {
+     // the current 'S' must be the one inside "CHRISTMA-"
+     if(!StringAt((m_current - 4), 8, "CHRISTMA", ""))
+     {
+         return false;
+     }
+
+     MetaphAdd("SM");
+     m_current += 3;
+     return true;
+ }
+
+ /**
+  * Encode "-STHM-" in contexts where the 'TH' is silent:
+  * "SM" is added and all four letters are consumed.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_STHM()
+ {
+     // e.g. 'asthma', 'isthmus'
+     if(!StringAt(m_current, 4, "STHM", ""))
+     {
+         return false;
+     }
+
+     MetaphAdd("SM");
+     m_current += 4;
+     return true;
+ }
+
+ /**
+  * Encode "-ISTEN-" and "-STNT-" in contexts
+  * where the 'T' is silent.
+  *
+  * Words beginning with "CHRISTEN" are special: the verb (and
+  * "christendom") gets "S" with alternate "ST", while names built on
+  * it keep the 'T' and get "ST". That rule is anchored to the 'S' of
+  * "CHRISTEN" itself (index 4). Testing only the word prefix would
+  * apply it again to any later 'S' of the same word (e.g. the second
+  * 'S' of "christensen" or the final 'S' of "christens"), adding a
+  * spurious 'T' and skipping the letter after that 'S'.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_ISTEN()
+ {
+     // 't' is silent in verb, pronounced in name;
+     // only the 'S' at index 4 is the 'S' of "CHRISTEN"
+     if((m_current == 4) && StringAt(0, 8, "CHRISTEN", ""))
+     {
+         // the word itself
+         if(RootOrInflections(m_inWord, "CHRISTEN")
+             || StringAt(0, 11, "CHRISTENDOM", ""))
+         {
+             MetaphAdd("S", "ST");
+         }
+         else
+         {
+             // e.g. 'christenson', 'christene'
+             MetaphAdd("ST");
+         }
+         m_current += 2;
+         return true;
+     }
+
+     //e.g. 'glisten', 'listen'
+     if(StringAt((m_current - 2), 6, "LISTEN", "RISTEN", "HASTEN", "FASTEN", "MUSTNT", "")
+         || StringAt((m_current - 3), 7, "MOISTEN", ""))
+     {
+         // the 'T' is dropped: only "S" is added, 'S' and 'T' are consumed
+         MetaphAdd("S");
+         m_current += 2;
+         return true;
+     }
+
+     return false;
+ }
+
+ /**
+  * Encode special case "sugar": the 'S' is encoded as X ("sh")
+  * and only that one letter is consumed.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_Sugar()
+ {
+     // special case 'sugar-'
+     if(!StringAt(m_current, 5, "SUGAR", ""))
+     {
+         return false;
+     }
+
+     MetaphAdd("X");
+     m_current++;
+     return true;
+ }
+
+ /**
+ * Encode "-SH-" as X ("sh"), except in cases
+ * where the 'S' and 'H' belong to different combining
+ * roots and are therefore pronounced separately.
+ *
+ * Outcomes once "SH" is found at the current position (two letters are
+ * consumed in every case):
+ * - "CASHMERE": "J" is added;
+ * - non-initial "SH" followed by one of the listed second roots
+ *   ("-HEIM", "-HOUSE", "-HONOR", ...): "S" is added, unless the letter
+ *   before the current 'S' is itself an 'S', in which case nothing is
+ *   added;
+ * - anything else: "X" is added.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_SH()
+ {
+ if(StringAt(m_current, 2, "SH", ""))
+ {
+ // exception
+ if(StringAt((m_current - 2), 8, "CASHMERE", ""))
+ {
+ MetaphAdd("J");
+ m_current += 2;
+ return true;
+ }
+
+ //combining forms, e.g. 'clotheshorse', 'woodshole'
+ if((m_current > 0)
+ // e.g. "mishap"
+ && ((StringAt((m_current + 1), 3, "HAP", "") && ((m_current + 3) == m_last))
+ // e.g. "hartsheim", "clotheshorse"
+ || StringAt((m_current + 1), 4, "HEIM", "HOEK", "HOLM", "HOLZ", "HOOD", "HEAD", "HEID",
+ "HAAR", "HORS", "HOLE", "HUND", "HELM", "HAWK", "HILL", "")
+ // e.g. "dishonor"
+ || StringAt((m_current + 1), 5, "HEART", "HATCH", "HOUSE", "HOUND", "HONOR", "")
+ // e.g. "mishear"
+ || (StringAt((m_current + 2), 3, "EAR", "") && ((m_current + 4) == m_last))
+ // e.g. "hartshorn"
+ || (StringAt((m_current + 2), 3, "ORN", "") && !StringAt((m_current - 2), 7, "UNSHORN", ""))
+ // e.g. "newshour" but not "bashour", "manshour"
+ || (StringAt((m_current + 1), 4, "HOUR", "")
+ && !(StringAt(0, 7, "BASHOUR", "") || StringAt(0, 8, "MANSHOUR", "") || StringAt(0, 6, "ASHOUR", "") ))
+ // e.g. "dishonest", "grasshopper"
+ || StringAt((m_current + 2), 5, "ARMON", "ONEST", "ALLOW", "OLDER", "OPPER", "EIMER", "ANDLE", "ONOUR", "")
+ // e.g. "dishabille", "transhumance"
+ || StringAt((m_current + 2), 6, "ABILLE", "UMANCE", "ABITUA", "")))
+ {
+ if (!StringAt((m_current - 1), 1, "S", ""))
+ MetaphAdd("S");
+ }
+ else
+ {
+ MetaphAdd("X");
+ }
+
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-SCH-" in cases where the 'S' is pronounced
+ * separately from the "CH", in words from the dutch, italian,
+ * and greek where it can be pronounced SK, and german words
+ * where it is pronounced X ("sh").
+ *
+ * Outcomes once "CH" follows the current 'S':
+ * - non-initial 'S' before "-CHIEF", "-CHEAT", "-CHANCE", "-CHARGE", or a
+ *   word beginning with "ESCHEW": only "S" is added and only the 'S' is
+ *   consumed, leaving the "CH" for a later pass;
+ * - the "SK" contexts of the second test: "SK" is added, or "X" with
+ *   alternate "SK" when "ER", "EN" or "IS" follows the "SCH" and either
+ *   ends the word or is part of "ENK", "ENB" or "IST";
+ * - everything else: "X" is added.
+ * The last two outcomes consume the three letters "SCH".
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_SCH()
+ {
+ // the first test inside covers words that were combining forms many centuries ago
+ if(StringAt((m_current + 1), 2, "CH", ""))
+ {
+ if((m_current > 0)
+ // e.g. "mischief", "escheat"
+ && (StringAt((m_current + 3), 3, "IEF", "EAT", "")
+ // e.g. "mischance"
+ || StringAt((m_current + 3), 4, "ANCE", "ARGE", "")
+ // e.g. "eschew"
+ || StringAt(0, 6, "ESCHEW", "")))
+ {
+ MetaphAdd("S");
+ m_current++;
+ return true;
+ }
+
+ //Schlesinger's rule
+ //dutch, danish, italian, greek origin, e.g. "school", "schooner", "schiavone", "schiz-"
+ if((StringAt((m_current + 3), 2, "OO", "ER", "EN", "UY", "ED", "EM", "IA", "IZ", "IS", "OL", "")
+ && !StringAt(m_current, 6, "SCHOLT", "SCHISL", "SCHERR", ""))
+ || StringAt((m_current + 3), 3, "ISZ", "")
+ || (StringAt((m_current - 1), 6, "ESCHAT", "ASCHIN", "ASCHAL", "ISCHAE", "ISCHIA", "")
+ && !StringAt((m_current - 2), 8, "FASCHING", ""))
+ || (StringAt((m_current - 1), 5, "ESCHI", "") && ((m_current + 3) == m_last))
+ || (CharAt(m_current + 3) == 'Y'))
+ {
+ // e.g. "schermerhorn", "schenker", "schistose"
+ if(StringAt((m_current + 3), 2, "ER", "EN", "IS", "")
+ && (((m_current + 4) == m_last)
+ || StringAt((m_current + 3), 3, "ENK", "ENB", "IST", "")))
+ {
+ MetaphAdd("X", "SK");
+ }
+ else
+ {
+ MetaphAdd("SK");
+ }
+ m_current += 3;
+ return true;
+ }
+ else
+ {
+ MetaphAdd("X");
+ m_current += 3;
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ /**
+  * Encode the 'S' of "-SURE", "-SURA", "-SURY" as J, unless it
+  * is at the beginning of the word or preceded by 'N', 'K', or
+  * "NO", in which case it is encoded as X ("sh").
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_SUR()
+ {
+     if(!StringAt((m_current + 1), 3, "URE", "URA", "URY", ""))
+     {
+         return false;
+     }
+
+     // 'sure', 'ensure' => X; 'erasure', 'usury' => J
+     boolean pronouncedSh = (m_current == 0)
+                                || StringAt((m_current - 1), 1, "N", "K", "")
+                                || StringAt((m_current - 2), 2, "NO", "");
+     MetaphAdd(pronouncedSh ? "X" : "J");
+
+     AdvanceCounter(2, 1);
+     return true;
+ }
+
+ /**
+  * Encode non-initial "-SUO-" / "-SUA-". The 'S' becomes X ("sh")
+  * by default, J when it is preceded by a vowel (both with S as
+  * the alternate), and plain S in "-RSUA-".
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_SU()
+ {
+     // 'sensuous', 'consensual' - never at the start of the word
+     if(!(StringAt((m_current + 1), 2, "UO", "UA", "") && (m_current != 0)))
+     {
+         return false;
+     }
+
+     if(StringAt((m_current - 1), 4, "RSUA", ""))
+     {
+         // exceptions e.g. "persuade"
+         MetaphAdd("S");
+     }
+     else if(IsVowel(m_current - 1))
+     {
+         // exceptions e.g. "casual"
+         MetaphAdd("J", "S");
+     }
+     else
+     {
+         MetaphAdd("X", "S");
+     }
+
+     AdvanceCounter(3, 1);
+     return true;
+ }
+
+ /**
+  * Encodes "-SSION-" in contexts where it is pronounced
+  * either J or X ("sh"). J is used after "CI", X after any
+  * other vowel; after a consonant nothing is added, but the
+  * letters are still consumed and the routine still reports
+  * the context as handled.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_SSIO()
+ {
+     if(!StringAt((m_current + 1), 4, "SION", ""))
+     {
+         return false;
+     }
+
+     if(StringAt((m_current - 2), 2, "CI", ""))
+     {
+         // "abscission"
+         MetaphAdd("J");
+     }
+     else if(IsVowel(m_current - 1))
+     {
+         // 'mission'
+         MetaphAdd("X");
+     }
+
+     AdvanceCounter(4, 2);
+     return true;
+ }
+
+ /**
+  * Encode "-SS-" in the listed contexts where it is
+  * pronounced X ("sh").
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_SS()
+ {
+     // five-letter contexts, e.g. "russian", "pressure"
+     boolean shortContext =
+             StringAt((m_current - 1), 5, "USSIA", "ESSUR", "ISSUR", "ISSUE", "");
+
+     // six-letter contexts, e.g. "hessian", "assurance";
+     // only looked at when no five-letter context matched
+     if(!(shortContext
+             || StringAt((m_current - 1), 6, "ESSIAN", "ASSURE", "ASSURA", "ISSUAB", "ISSUAN", "ASSIUS", "")))
+     {
+         return false;
+     }
+
+     MetaphAdd("X");
+     AdvanceCounter(3, 2);
+     return true;
+ }
+
+ /**
+ * Encodes "-SIA-" in contexts where it is pronounced
+ * as X ("sh"), J, or S.
+ *
+ * Three groups are tested in order:
+ * 1. "CHSIA" and "RSIAL": "X" is added;
+ * 2. the listed names ("ALESIA", "ALYSIA", "ALISIA", "STASIA" with the
+ *    'S' at index 3, "DIONYSIAN", "THERESIA"): "X" with alternate "S";
+ * 3. word-final "SIA" or "SIAN", or "AMBROSIAL": "J" when the 'S' follows
+ *    a vowel or 'R' and the word is not one of the listed name-based,
+ *    french or greek compounds; otherwise "S".
+ * Groups 1 and 2 advance with AdvanceCounter(3, 1), group 3 with
+ * AdvanceCounter(2, 1).
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_SIA()
+ {
+ // e.g. "controversial", also "fuchsia", "ch" is silent
+ if(StringAt((m_current - 2), 5, "CHSIA", "")
+ || StringAt((m_current - 1), 5, "RSIAL", ""))
+ {
+ MetaphAdd("X");
+ AdvanceCounter(3, 1);
+ return true;
+ }
+
+ // names generally get 'X' where terms, e.g. "aphasia" get 'J'
+ if((StringAt(0, 6, "ALESIA", "ALYSIA", "ALISIA", "STASIA", "")
+ && (m_current == 3)
+ && !StringAt(0, 9, "ANASTASIA", ""))
+ || StringAt((m_current - 5), 9, "DIONYSIAN", "")
+ || StringAt((m_current - 5), 8, "THERESIA", ""))
+ {
+ MetaphAdd("X", "S");
+ AdvanceCounter(3, 1);
+ return true;
+ }
+
+ if((StringAt(m_current, 3, "SIA", "") && ((m_current + 2) == m_last))
+ || (StringAt(m_current, 4, "SIAN", "") && ((m_current + 3) == m_last))
+ || StringAt((m_current - 5), 9, "AMBROSIAL", ""))
+ {
+ if ((IsVowel(m_current - 1) || StringAt((m_current - 1), 1, "R", ""))
+ // exclude compounds based on names, or french or greek words
+ && !(StringAt(0, 5, "JAMES", "NICOS", "PEGAS", "PEPYS", "")
+ || StringAt(0, 6, "HOBBES", "HOLMES", "JAQUES", "KEYNES", "")
+ || StringAt(0, 7, "MALTHUS", "HOMOOUS", "")
+ || StringAt(0, 8, "MAGLEMOS", "HOMOIOUS", "")
+ || StringAt(0, 9, "LEVALLOIS", "TARDENOIS", "")
+ || StringAt((m_current - 4), 5, "ALGES", "") ))
+ {
+ MetaphAdd("J");
+ }
+ else
+ {
+ MetaphAdd("S");
+ }
+
+ AdvanceCounter(2, 1);
+ return true;
+ }
+ return false;
+ }
+
+ /**
+  * Encodes "-SIO-" in contexts where it is pronounced
+  * as J or X ("sh").
+  *
+  * The irish name "Siobhan" gets X for its initial 'S'. That rule is
+  * anchored to the start of the word: testing only the word prefix
+  * would also encode any later 'S' of a word beginning with
+  * "SIOBHAN" (e.g. a trailing plural or possessive 'S') as X.
+  *
+  * For "-SION", J is used after a vowel or after "ER" / "UR", and X
+  * otherwise.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_SIO()
+ {
+     // special case, irish name; only its word-initial 'S' qualifies
+     if((m_current == 0) && StringAt(0, 7, "SIOBHAN", ""))
+     {
+         MetaphAdd("X");
+         AdvanceCounter(3, 1);
+         return true;
+     }
+
+     if(StringAt((m_current + 1), 3, "ION", ""))
+     {
+         // e.g. "vision", "version"
+         if (IsVowel(m_current - 1) || StringAt((m_current - 2), 2, "ER", "UR", ""))
+         {
+             MetaphAdd("J");
+         }
+         else // e.g. "declension"
+         {
+             MetaphAdd("X");
+         }
+
+         AdvanceCounter(3, 1);
+         return true;
+     }
+
+     return false;
+ }
+
+ /**
+  * Encode cases where "-S-" might well be from a german name
+  * and add the german pronunciation as the alternate encoding,
+  * so that it can be found in a genealogical search.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_Anglicisations()
+ {
+     // german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider'
+     boolean initialCluster = (m_current == 0)
+                                  && StringAt((m_current + 1), 1, "M", "N", "L", "");
+     // also, -sz- in slavic language altho in hungarian it is pronounced 's'
+     boolean followedByZ = StringAt((m_current + 1), 1, "Z", "");
+
+     if(!initialCluster && !followedByZ)
+     {
+         return false;
+     }
+
+     MetaphAdd("S", "X");
+
+     // eat redundant 'Z' together with the 'S'
+     m_current += followedByZ ? 2 : 1;
+     return true;
+ }
+
+ /**
+ * Encode "-SC-" in contexts where it is silent,
+ * or pronounced as X ("sh"), S, or SK.
+ *
+ * Outcomes once "SC" is found at the current position:
+ * - "VISCOUNT": nothing is added and only the 'S' is stepped over,
+ *   leaving the 'C' for a later pass;
+ * - "SC" before 'I', 'E' or 'Y': "X" in the first list of contexts, "SK"
+ *   in the second list, "S" otherwise;
+ * - "SC" before anything else: "SK".
+ * Apart from "VISCOUNT", both letters are consumed.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_SC()
+ {
+ if(StringAt(m_current, 2, "SC", ""))
+ {
+ // exception 'viscount'
+ if(StringAt((m_current - 2), 8, "VISCOUNT", ""))
+ {
+ m_current += 1;
+ return true;
+ }
+
+ // encode "-SC-"
+ if(StringAt((m_current + 2), 1, "I", "E", "Y", ""))
+ {
+ // e.g. "conscious"
+ if(StringAt((m_current + 2), 4, "IOUS", "")
+ // e.g. "prosciutto"
+ || StringAt((m_current + 2), 3, "IUT", "")
+ || StringAt((m_current - 4), 9, "OMNISCIEN", "")
+ // e.g. "conscience", "crescendo"
+ || StringAt((m_current - 3), 8, "CONSCIEN", "CRESCEND", "CONSCION", "")
+ || StringAt((m_current - 2), 6, "FASCIS", ""))
+ {
+ MetaphAdd("X");
+ }
+ else if(StringAt(m_current, 7, "SCEPTIC", "SCEPSIS", "")
+ || StringAt(m_current, 5, "SCIVV", "SCIRO", "")
+ // commonly pronounced this way in u.s.
+ || StringAt(m_current, 6, "SCIPIO", "")
+ || StringAt((m_current - 2), 10, "PISCITELLI", ""))
+ {
+ MetaphAdd("SK");
+ }
+ else
+ {
+ MetaphAdd("S");
+ }
+ m_current += 2;
+ return true;
+ }
+
+ MetaphAdd("SK");
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-S-" in contexts where it is pronounced
+ * as J.
+ *
+ * Handled contexts: word-final "NAUSEA", "CASUI", and "-OSIER" /
+ * "-ASIER" unless the word begins with "EASIER" or "OSIER" or the context
+ * is "ROSIER" / "MOSIER". In each case "J" is added with alternate "X",
+ * and m_current is advanced with AdvanceCounter(3, 1).
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_SEA_SUI_SIER()
+ {
+ // "nausea" by itself has => NJ as a more likely encoding. Other forms
+ // using "nause-" (see Encode_SEA()) have X or S as more familiar pronunciations
+ if((StringAt((m_current - 3), 6, "NAUSEA", "") && ((m_current + 2) == m_last))
+ // e.g. "casuistry", "frasier", "hoosier"
+ || StringAt((m_current - 2), 5, "CASUI", "")
+ || (StringAt((m_current - 1), 5, "OSIER", "ASIER", "")
+ && !(StringAt(0, 6, "EASIER","")
+ || StringAt(0, 5, "OSIER","")
+ || StringAt((m_current - 2), 6, "ROSIER", "MOSIER", ""))))
+ {
+ MetaphAdd("J", "X");
+ AdvanceCounter(3, 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Encode cases where "-SE-" is pronounced as X ("sh"): a word
+  * beginning with "SEAN" whose last letter lies three positions
+  * after the current 'S', and the "NAUSEO-" forms.
+  *
+  * NOTE(review): a context that matches "NAUSEO" cannot also spell
+  * "NAUSEAT", so the "NAUSEAT" exclusion looks redundant - confirm
+  * against StringAt before removing it.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_SEA()
+ {
+     if(!((StringAt(0, 4, "SEAN", "") && ((m_current + 3) == m_last))
+             || (StringAt((m_current - 3), 6, "NAUSEO", "")
+                 && !StringAt((m_current - 3), 7, "NAUSEAT", ""))))
+     {
+         return false;
+     }
+
+     MetaphAdd("X");
+     AdvanceCounter(3, 1);
+     return true;
+ }
+
+ /**
+  * Encode "-T-".
+  *
+  * The special-case routines are tried one after another, in the
+  * order listed; the first one that reports it handled the letter
+  * ends the processing. When none does, a directly following 'T'
+  * or 'D' is skipped together with the current letter and a plain
+  * "T" is added.
+  *
+  */
+ void Encode_T()
+ {
+     boolean handledBySpecialCase =
+             Encode_T_Initial()
+             || Encode_TCH()
+             || Encode_Silent_French_T()
+             || Encode_TUN_TUL_TUA_TUO()
+             || Encode_TUE_TEU_TEOU_TUL_TIE()
+             || Encode_TUR_TIU_Suffixes()
+             || Encode_TI()
+             || Encode_TIENT()
+             || Encode_TSCH()
+             || Encode_TZSCH()
+             || Encode_TH_Pronounced_Separately()
+             || Encode_TTH()
+             || Encode_TH();
+
+     if(handledBySpecialCase)
+     {
+         return;
+     }
+
+     // eat redundant 'T' or 'D'
+     int step = StringAt((m_current + 1), 1, "T", "D", "") ? 2 : 1;
+     m_current += step;
+
+     MetaphAdd("T");
+ }
+
+ /**
+  * Encode some exceptions for initial 'T'. None of the rules
+  * apply unless the 'T' is the first letter of the word.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_T_Initial()
+ {
+     if(m_current != 0)
+     {
+         return false;
+     }
+
+     // americans usually pronounce "tzar" as "zar":
+     // nothing is added for the 'T', it is just stepped over
+     if(StringAt((m_current + 1), 3, "SAR", "ZAR", ""))
+     {
+         m_current++;
+         return true;
+     }
+
+     // old 'Ecole francaise d'Extreme-Orient' chinese pinyin where 'ts-' => 'X';
+     // each form is tied to an exact word length
+     if(((m_length == 3) && StringAt((m_current + 1), 2, "SO", "SA", "SU", ""))
+         || ((m_length == 4) && StringAt((m_current + 1), 3, "SAO", "SAI", ""))
+         || ((m_length == 5) && StringAt((m_current + 1), 4, "SING", "SANG", "")))
+     {
+         MetaphAdd("X");
+         AdvanceCounter(3, 2);
+         return true;
+     }
+
+     // "TS-" at start can be pronounced both with and without 'T'
+     if(StringAt((m_current + 1), 1, "S", "") && IsVowel(m_current + 2))
+     {
+         MetaphAdd("TS", "S");
+         AdvanceCounter(3, 2);
+         return true;
+     }
+
+     // e.g. "Tjaarda"
+     if(StringAt((m_current + 1), 1, "J", ""))
+     {
+         MetaphAdd("X");
+         AdvanceCounter(3, 2);
+         return true;
+     }
+
+     // cases where initial "TH-" is pronounced as T and not 0 ("th")
+     boolean hardTh = (StringAt((m_current + 1), 2, "HU", "") && (m_length == 3))
+                          || StringAt((m_current + 1), 3, "HAI", "HUY", "HAO", "")
+                          || StringAt((m_current + 1), 4, "HYME", "HYMY", "HANH", "")
+                          || StringAt((m_current + 1), 5, "HERES", "");
+     if(hardTh)
+     {
+         MetaphAdd("T");
+         AdvanceCounter(3, 2);
+         return true;
+     }
+
+     return false;
+ }
+
+ /**
+  * Encode "-TCH-", reliably X ("sh", or in this case, "ch").
+  * All three letters are consumed.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_TCH()
+ {
+     if(!StringAt((m_current + 1), 2, "CH", ""))
+     {
+         return false;
+     }
+
+     MetaphAdd("X");
+     m_current += 3;
+     return true;
+ }
+
+ /**
+ * Encode the many cases where americans are aware that a certain word is
+ * french and know to not pronounce the 'T', e.g. "TOUCHET", "CHABOT",
+ * "BENOIT".
+ *
+ * The 'T' is treated as silent (nothing added, 'T' stepped over) when:
+ * - it is the last letter of the word and ends "MONET", "GENET" or
+ *   "CHAUT"; or
+ * - it is the silent 'T' inside "POTPOURRI", "BOATSWAIN" or "MORTGAGE"; or
+ * - it ends one of the listed french words (matched at the offset that
+ *   puts the current 'T' on the word's last letter) and is not followed
+ *   by "AN", "RY", "IC", "OM" or "IN".
+ *
+ * @return true if encoding handled in this routine, false if not
+ */
+ boolean Encode_Silent_French_T()
+ {
+ // french silent T familiar to americans
+ if(((m_current == m_last) && StringAt((m_current - 4), 5, "MONET", "GENET", "CHAUT", ""))
+ || StringAt((m_current - 2), 9, "POTPOURRI", "")
+ || StringAt((m_current - 3), 9, "BOATSWAIN", "")
+ || StringAt((m_current - 3), 8, "MORTGAGE", "")
+ || (StringAt((m_current - 4), 5, "BERET", "BIDET", "FILET", "DEBUT", "DEPOT", "PINOT", "TAROT", "")
+ || StringAt((m_current - 5), 6, "BALLET", "BUFFET", "CACHET", "CHALET", "ESPRIT", "RAGOUT", "GOULET",
+ "CHABOT", "BENOIT", "")
+ || StringAt((m_current - 6), 7, "GOURMET", "BOUQUET", "CROCHET", "CROQUET", "PARFAIT", "PINCHOT",
+ "CABARET", "PARQUET", "RAPPORT", "TOUCHET", "COURBET", "DIDEROT", "")
+ || StringAt((m_current - 7), 8, "ENTREPOT", "CABERNET", "DUBONNET", "MASSENET", "MUSCADET", "RICOCHET", "ESCARGOT", "")
+ || StringAt((m_current - 8), 9, "SOBRIQUET", "CABRIOLET", "CASSOULET", "OUBRIQUET", "CAMEMBERT", ""))
+ && !StringAt((m_current + 1), 2, "AN", "RY", "IC", "OM", "IN", ""))
+ {
+ m_current++;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-TU-" in cases where it is pronounced
+ * X ("sh", or in this case, "ch").
+ *
+ * Handled contexts: "FORTUN", "TUL" between two vowels, "BITUA" /
+ * "BITUE", and "TUA" / "TUO" when the 'T' is at index 2 or later. In
+ * each case "X" is added with alternate "T", and only the 'T' itself is
+ * consumed.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_TUN_TUL_TUA_TUO()
+ {
+ // e.g. "fortune", "fortunate"
+ if(StringAt((m_current - 3), 6, "FORTUN", "")
+ // e.g. "capitulate"
+ || (StringAt(m_current, 3, "TUL", "")
+ && (IsVowel(m_current - 1) && IsVowel(m_current + 3)))
+ // e.g. "obituary", "barbituate"
+ || StringAt((m_current - 2), 5, "BITUA", "BITUE", "")
+ // e.g. "actual"
+ || ((m_current > 1) && StringAt(m_current, 3, "TUA", "TUO", "")))
+ {
+ MetaphAdd("X", "T");
+ m_current++;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-T-" forms where 'T' is pronounced as X
+ * ("sh", or in this case "ch").
+ *
+ * Every listed context adds "X" with alternate "T" and advances with
+ * AdvanceCounter(2, 1). The word-specific patterns ("RIGHTEOUS",
+ * "STATUTE", "AMATEUR", "STATUTOR") are matched at the offset that puts
+ * the current 'T' on the affected letter; "TUE" and "TIENCE" only count
+ * at the very end of the word.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_TUE_TEU_TEOU_TUL_TIE()
+ {
+ // e.g. 'constituent'
+ if(StringAt((m_current + 1), 4, "UENT", "")
+ || StringAt((m_current - 4), 9, "RIGHTEOUS", "")
+ || StringAt((m_current - 3), 7, "STATUTE", "")
+ || StringAt((m_current - 3), 7, "AMATEUR", "")
+ // e.g. "blastula", "pasteur"
+ || (StringAt((m_current - 1), 5, "NTULE", "NTULA", "STULE", "STULA", "STEUR", ""))
+ // e.g. "statue"
+ || (((m_current + 2) == m_last) && StringAt(m_current, 3, "TUE", ""))
+ // e.g. "constituency"
+ || StringAt(m_current, 5, "TUENC", "")
+ // e.g. "statutory"
+ || StringAt((m_current - 3), 8, "STATUTOR", "")
+ // e.g. "patience"
+ || (((m_current + 5) == m_last) && StringAt(m_current, 6, "TIENCE", "")))
+ {
+ MetaphAdd("X", "T");
+ AdvanceCounter(2, 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-TU-" forms in suffixes where it is usually
+ * pronounced as X ("sh").
+ *
+ * Applies to a non-initial 'T' followed by "URE", "URA", "URI", "URY",
+ * "URO" or "IUS". Plain "T" is added when either
+ * - "URA" / "URO" ends the word and the context is not "VENTURA", or
+ * - "URIA" follows;
+ * otherwise "X" is added with alternate "T". Because '&&' binds tighter
+ * than '||', the "URIA" test is an independent alternative and is not
+ * subject to the "VENTURA" exclusion.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_TUR_TIU_Suffixes()
+ {
+ // 'adventure', 'musculature'
+ if((m_current > 0) && StringAt((m_current + 1), 3, "URE", "URA", "URI", "URY", "URO", "IUS", ""))
+ {
+ // exceptions e.g. 'tessitura', mostly from romance languages
+ if ((StringAt((m_current + 1), 3, "URA", "URO", "")
+ //&& !StringAt((m_current + 1), 4, "URIA", "")
+ && ((m_current + 3) == m_last))
+ && !StringAt((m_current - 3), 7, "VENTURA", "")
+ // e.g. "kachaturian", "hematuria"
+ || StringAt((m_current + 1), 4, "URIA", ""))
+ {
+ MetaphAdd("T");
+ }
+ else
+ {
+ MetaphAdd("X", "T");
+ }
+
+ AdvanceCounter(2, 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-TI-" as X ("sh"), except
+ * in cases where it is part of a combining form,
+ * or as J.
+ *
+ * The routine applies when ANY of these holds:
+ * a. "IO" follows, unless the context is "ETIOL";
+ * b. "IAL" follows;
+ * c. the context is "RTIUM" or "ATIUM";
+ * d. "IAN" follows a non-initial 'T', and the context is none of
+ *    "FAUSTIAN", "PROUSTIAN", "TATIANA", "KANTIAN", "GENTIAN",
+ *    "ROOSEVELTIAN";
+ * e. "TIA" ends the word, except "HESTIA", "MASTIA", "OSTIA",
+ *    "IZVESTIA", or a word that begins with "TIA";
+ * f. "IATE", "IATI", "IABL", "IATO" or "IARY" follows;
+ * g. the context is "CHRISTIAN".
+ * Alternatives e, f and g are written inside the parenthesis opened for
+ * d, but since '&&' binds tighter than '||' they are independent of d's
+ * conditions.
+ *
+ * What is added: "T" for a word beginning with "ANTI" (with the 'T' at
+ * index 2), "PATIO", "PITIA" or "DUTIA"; "J" for "EQUATION"; "X" for
+ * "TION"; "T" with alternate "X" for a word beginning with "KATIA" or
+ * "LATIA"; otherwise "X" with alternate "T". m_current is advanced with
+ * AdvanceCounter(3, 1).
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_TI()
+ {
+ // '-tio-', '-tia-', '-tiu-'
+ // except combining forms where T already pronounced e.g 'rooseveltian'
+ if((StringAt((m_current + 1), 2, "IO", "") && !StringAt((m_current - 1), 5, "ETIOL", ""))
+ || StringAt((m_current + 1), 3, "IAL", "")
+ || StringAt((m_current - 1), 5, "RTIUM", "ATIUM", "")
+ || ((StringAt((m_current + 1), 3, "IAN", "") && (m_current > 0))
+ && !(StringAt((m_current - 4), 8, "FAUSTIAN", "")
+ || StringAt((m_current - 5), 9, "PROUSTIAN", "")
+ || StringAt((m_current - 2), 7, "TATIANA", "")
+ ||(StringAt((m_current - 3), 7, "KANTIAN", "GENTIAN", "")
+ || StringAt((m_current - 8), 12, "ROOSEVELTIAN", "")))
+ || (((m_current + 2) == m_last)
+ && StringAt(m_current, 3, "TIA", "")
+ // exceptions to the word-final "TIA" rule; these are not handled in this routine
+ && !(StringAt((m_current - 3), 6, "HESTIA", "MASTIA", "")
+ || StringAt((m_current - 2), 5, "OSTIA", "")
+ || StringAt(0, 3, "TIA", "")
+ || StringAt((m_current - 5), 8, "IZVESTIA", "")))
+ || StringAt((m_current + 1), 4, "IATE", "IATI", "IABL", "IATO", "IARY", "")
+ || StringAt((m_current - 5), 9, "CHRISTIAN", "")))
+ {
+ if(((m_current == 2) && StringAt(0, 4, "ANTI", ""))
+ || StringAt(0, 5, "PATIO", "PITIA", "DUTIA", ""))
+ {
+ MetaphAdd("T");
+ }
+ else if(StringAt((m_current - 4), 8, "EQUATION", ""))
+ {
+ MetaphAdd("J");
+ }
+ else
+ {
+ if(StringAt(m_current, 4, "TION", ""))
+ {
+ MetaphAdd("X");
+ }
+ else if(StringAt(0, 5, "KATIA", "LATIA", ""))
+ {
+ MetaphAdd("T", "X");
+ }
+ else
+ {
+ MetaphAdd("X", "T");
+ }
+ }
+
+ AdvanceCounter(3, 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+  * Encode "-TIENT-" where "TI" is pronounced X ("sh");
+  * T is kept as the alternate encoding.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_TIENT()
+ {
+     // e.g. 'patient'
+     if(!StringAt((m_current + 1), 4, "IENT", ""))
+     {
+         return false;
+     }
+
+     MetaphAdd("X", "T");
+     AdvanceCounter(3, 1);
+     return true;
+ }
+
+ /**
+  * Encode "-TSCH-" as X ("ch"), consuming all four letters,
+  * except in german combining forms where the 'T' belongs to
+  * the preceding root and is pronounced separately.
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_TSCH()
+ {
+     // 'deutsch', but not after "WELT", "KLAT" or "FEST"
+     if(!StringAt(m_current, 4, "TSCH", "")
+         || StringAt((m_current - 3), 4, "WELT", "KLAT", "FEST", ""))
+     {
+         return false;
+     }
+
+     // pronounced the same as "ch" in "chit" => X
+     MetaphAdd("X");
+     m_current += 4;
+     return true;
+ }
+
+ /**
+  * Encode "-TZSCH-" as X ("ch"), consuming all five letters.
+  *
+  * "Nietzsche is peachy"
+  *
+  * @return true if encoding handled in this routine, false if not
+  *
+  */
+ boolean Encode_TZSCH()
+ {
+     // 'nietzsche'
+     if(!StringAt(m_current, 5, "TZSCH", ""))
+     {
+         return false;
+     }
+
+     MetaphAdd("X");
+     m_current += 5;
+     return true;
+ }
+
+ /**
+ * Encodes cases where the 'H' in "-TH-" is the beginning of
+ * another word in a combining form, special cases where it is
+ * usually pronounced as 'T', and a special case where it has
+ * become pronounced as X ("sh", in this case "ch").
+ *
+ * The routine applies when ANY of these holds:
+ * - a non-initial 'T' is followed by one of the listed four-letter
+ *   second roots ("HOOD", "HEAD", ...), unless the 'T' ends "SOUTH" or
+ *   "NORTH";
+ * - "HOUSE", "HEART", "HASTE", "HYPNO" or "HEQUE" follows;
+ * - word-final "HALL" follows, unless the 'T' ends "SOUTH" or "NORTH";
+ * - word-final "HAM" follows, unless the word begins with one of the
+ *   listed place or family names;
+ * - "HATCH" follows a non-initial 'T' and the context is not "UNTHATCH";
+ * - the context is "WARTHOG", "ESTHER", "GOETHE" or "NATHALIE".
+ * "T" is added, except for "POSTHUM-", which gets "X". Both the 'T' and
+ * the 'H' are consumed.
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_TH_Pronounced_Separately()
+ {
+ //'adulthood', 'bithead', 'apartheid'
+ if(((m_current > 0)
+ && StringAt((m_current + 1), 4, "HOOD", "HEAD", "HEID", "HAND", "HILL", "HOLD",
+ "HAWK", "HEAP", "HERD", "HOLE", "HOOK", "HUNT",
+ "HUMO", "HAUS", "HOFF", "HARD", "")
+ && !StringAt((m_current - 3), 5, "SOUTH", "NORTH", ""))
+ || StringAt((m_current + 1), 5, "HOUSE", "HEART", "HASTE", "HYPNO", "HEQUE", "")
+ // watch out for greek root "-thallic"
+ || (StringAt((m_current + 1), 4, "HALL", "")
+ && ((m_current + 4) == m_last)
+ && !StringAt((m_current - 3), 5, "SOUTH", "NORTH", ""))
+ || (StringAt((m_current + 1), 3, "HAM", "")
+ && ((m_current + 3) == m_last)
+ && !(StringAt(0, 6, "GOTHAM", "WITHAM", "LATHAM", "")
+ || StringAt(0, 7, "BENTHAM", "WALTHAM", "WORTHAM", "")
+ || StringAt(0, 8, "GRANTHAM", "")))
+ || (StringAt((m_current + 1), 5, "HATCH", "")
+ && !((m_current == 0) || StringAt((m_current - 2), 8, "UNTHATCH", "")))
+ || StringAt((m_current - 3), 7, "WARTHOG", "")
+ // and some special cases where "-TH-" is usually pronounced 'T'
+ || StringAt((m_current - 2), 6, "ESTHER", "")
+ || StringAt((m_current - 3), 6, "GOETHE", "")
+ || StringAt((m_current - 2), 8, "NATHALIE", ""))
+ {
+ // special case
+ if (StringAt((m_current - 3), 7, "POSTHUM", ""))
+ {
+ MetaphAdd("X");
+ }
+ else
+ {
+ MetaphAdd("T");
+ }
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode the "-TTH-" in "matthew", eating the redundant 'T'
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_TTH()
+ {
+ // 'matthew' vs. 'outthink'
+ if(StringAt(m_current, 3, "TTH", ""))
+ {
+ if (StringAt((m_current - 2), 5, "MATTH", ""))
+ {
+ MetaphAdd("0");
+ }
+ else
+ {
+ MetaphAdd("T0");
+ }
+ m_current += 3;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-TH-". 0 (zero) is used in Metaphone to encode the sound
+ * of the "th" digraph, either voiced or unvoiced; special cases get T,
+ * and the "TH" of "-clothes-" adds nothing to the encoding
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_TH()
+ {
+ if(StringAt(m_current, 2, "TH", "") )
+ {
+ //'-clothes-'
+ if(StringAt((m_current - 3), 7, "CLOTHES", ""))
+ {
+ // vowel already encoded so skip right to S
+ m_current += 3;
+ return true;
+ }
+
+ //special case "thomas", "thames", "beethoven" or germanic words
+ if(StringAt((m_current + 2), 4, "OMAS", "OMPS", "OMPK", "OMSO", "OMSE",
+ "AMES", "OVEN", "OFEN", "ILDA", "ILDE", "")
+ || (StringAt(0, 4, "THOM", "") && (m_length == 4))
+ || (StringAt(0, 5, "THOMS", "") && (m_length == 5))
+ || StringAt(0, 4, "VAN ", "VON ", "")
+ || StringAt(0, 3, "SCH", ""))
+ {
+ MetaphAdd("T");
+
+ }
+ else
+ {
+ // give an 'etymological' 2nd
+ // encoding for "smith"
+ if(StringAt(0, 2, "SM", ""))
+ {
+ MetaphAdd("0", "T");
+ }
+ else
+ {
+ MetaphAdd("0");
+ }
+ }
+
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-V-"
+ *
+ */
+ void Encode_V()
+ {
+ // eat redundant 'V'
+ if(CharAt(m_current + 1) == 'V')
+ {
+ m_current += 2;
+ }
+ else
+ {
+ m_current++;
+ }
+
+ MetaphAddExactApprox("V", "F");
+ }
+
+ /**
+ * Encode "-W-"
+ *
+ */
+ void Encode_W()
+ {
+ if(Encode_Silent_W_At_Beginning()
+ || Encode_WITZ_WICZ()
+ || Encode_WR()
+ || Encode_Initial_W_Vowel()
+ || Encode_WH()
+ || Encode_Eastern_European_W())
+ {
+ return;
+ }
+
+ // e.g. 'zimbabwe'
+ if(m_encodeVowels
+ && StringAt(m_current, 2, "WE", "")
+ && ((m_current + 1) == m_last))
+ {
+ MetaphAdd("A");
+ }
+
+ //else skip it
+ m_current++;
+
+ }
+
+ /**
+ * Encode cases where 'W' is silent at beginning of word
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Silent_W_At_Beginning()
+ {
+ //skip these when at start of word
+ if((m_current == 0)
+ && StringAt(m_current, 2, "WR", ""))
+ {
+ m_current += 1;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode polish patronymic suffix, mapping
+ * alternate spellings to the same encoding,
+ * and adding the eastern european pronunciation
+ * to the american so that both forms can
+ * be found in a genealogy search
+ * (with vowels encoded, no extra 'A' is added when m_primary already ends in 'A')
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_WITZ_WICZ()
+ {
+ //polish e.g. 'filipowicz'
+ if(((m_current + 3) == m_last) && StringAt(m_current, 4, "WICZ", "WITZ", ""))
+ {
+ if(m_encodeVowels)
+ {
+ if((m_primary.length() > 0)
+ && m_primary.charAt(m_primary.length() - 1) == 'A')
+ {
+ MetaphAdd("TS", "FAX");
+ }
+ else
+ {
+ MetaphAdd("ATS", "FAX");
+ }
+ }
+ else
+ {
+ MetaphAdd("TS", "FX");
+ }
+ m_current += 4;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-WR-" as R ('W' always effectively silent)
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_WR()
+ {
+ //can also be in middle of word
+ if(StringAt(m_current, 2, "WR", ""))
+ {
+ MetaphAdd("R");
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "W-" before a vowel at the start of a word, adding central
+ * and eastern european pronunciations so that both forms can be found
+ * in a genealogy search; the vowels that follow are skipped
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Initial_W_Vowel()
+ {
+ if((m_current == 0) && IsVowel(m_current + 1))
+ {
+ //Witter should match Vitter
+ if(Germanic_Or_Slavic_Name_Beginning_With_W())
+ {
+ if(m_encodeVowels)
+ {
+ MetaphAddExactApprox("A", "VA", "A", "FA");
+ }
+ else
+ {
+ MetaphAddExactApprox("A", "V", "A", "F");
+ }
+ }
+ else
+ {
+ MetaphAdd("A");
+ }
+
+ m_current++;
+ // don't encode vowels twice
+ m_current = SkipVowels(m_current);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-WH-" either as H, or close enough to 'U' to be
+ * considered a vowel
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_WH()
+ {
+ if(StringAt(m_current, 2, "WH", ""))
+ {
+ // cases where it is pronounced as H
+ // e.g. 'who', 'whole'
+ if((CharAt(m_current + 2) == 'O')
+ // exclude cases where it is pronounced like a vowel
+ && !(StringAt((m_current + 2), 4, "OOSH", "")
+ || StringAt((m_current + 2), 3, "OOP", "OMP", "ORL", "ORT", "")
+ || StringAt((m_current + 2), 2, "OA", "OP", "")))
+ {
+ MetaphAdd("H");
+ AdvanceCounter(3, 2);
+ return true;
+ }
+ else
+ {
+ // combining forms, e.g. 'hollowhearted', 'rawhide'
+ if(StringAt((m_current + 2), 3, "IDE", "ARD", "EAD", "AWK", "ERD",
+ "OOK", "AND", "OLE", "OOD", "")
+ || StringAt((m_current + 2), 4, "EART", "OUSE", "OUND", "")
+ || StringAt((m_current + 2), 5, "AMMER", ""))
+ {
+ MetaphAdd("H");
+ m_current += 2;
+ return true;
+ }
+ else if(m_current == 0)
+ {
+ MetaphAdd("A");
+ m_current += 2;
+ // don't encode vowels twice
+ m_current = SkipVowels(m_current);
+ return true;
+ }
+ }
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-W-" when in eastern european names, adding
+ * the eastern european pronunciation to the american so
+ * that both forms can be found in a genealogy search
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Eastern_European_W()
+ {
+ //Arnow should match Arnoff
+ if(((m_current == m_last) && IsVowel(m_current - 1))
+ || StringAt((m_current - 1), 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY", "")
+ || (StringAt(m_current, 5, "WICKI", "WACKI", "") && ((m_current + 4) == m_last))
+ || (StringAt(m_current, 4, "WIAK", "") && ((m_current + 3) == m_last))
+ || StringAt(0, 3, "SCH", ""))
+ {
+ MetaphAddExactApprox("", "V", "", "F");
+ m_current++;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-X-", trying each special case in turn; Encode_French_X_Final
+ * adds the default code itself but always returns false, so advancing happens here
+ */
+ void Encode_X()
+ {
+ if(Encode_Initial_X()
+ || Encode_Greek_X()
+ || Encode_X_Special_Cases()
+ || Encode_X_To_H()
+ || Encode_X_Vowel()
+ || Encode_French_X_Final())
+ {
+ return;
+ }
+
+ // eat redundant 'X' or other redundant cases
+ if(StringAt((m_current + 1), 1, "X", "Z", "S", "")
+ // e.g. "excite", "exceed"
+ || StringAt((m_current + 1), 2, "CI", "CE", ""))
+ {
+ m_current += 2;
+ }
+ else
+ {
+ m_current++;
+ }
+ }
+
+ /**
+ * Encode initial X as S, or as X ("sh") in words with a pinyin-style opening
+ * NOTE(review): the pinyin test looks at the word start, not at m_current - confirm intended
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Initial_X()
+ {
+ // current chinese pinyin spelling
+ if(StringAt(0, 3, "XIA", "XIO", "XIE", "")
+ || StringAt(0, 2, "XU", ""))
+ {
+ MetaphAdd("X");
+ m_current++;
+ return true;
+ }
+
+ // any other X at the start of the word
+ if((m_current == 0))
+ {
+ MetaphAdd("S");
+ m_current++;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode X when from greek roots where it is usually pronounced as S
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_Greek_X()
+ {
+ // 'xylophone', xylem', 'xanthoma', 'xeno-'
+ if(StringAt((m_current + 1), 3, "YLO", "YLE", "ENO", "")
+ || StringAt((m_current + 1), 4, "ANTH", ""))
+ {
+ MetaphAdd("S");
+ m_current++;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode special cases, "LUXUR-", "Texeira"
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_X_Special_Cases()
+ {
+ // 'luxury'
+ if(StringAt((m_current - 2), 5, "LUXUR", ""))
+ {
+ MetaphAddExactApprox("GJ", "KJ");
+ m_current++;
+ return true;
+ }
+
+ // 'texeira' portuguese/galician name
+ if(StringAt(0, 7, "TEXEIRA", "")
+ || StringAt(0, 8, "TEIXEIRA", ""))
+ {
+ MetaphAdd("X");
+ m_current++;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode special cases where americans know the proper
+ * pronunciation of the name ("oaxaca", "quixote"): X => H
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_X_To_H()
+ {
+ // TODO: look for other mexican indian words
+ // where 'X' is usually pronounced this way
+ if(StringAt((m_current - 2), 6, "OAXACA", "")
+ || StringAt((m_current - 3), 7, "QUIXOTE", ""))
+ {
+ MetaphAdd("H");
+ m_current++;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-X-" in vowel contexts where it is usually
+ * pronounced KX ("ksh")
+ * account also for BBC pronunciation => KS (passed as the second code)
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_X_Vowel()
+ {
+ // e.g. "sexual", "connexion" (british), "noxious"
+ if(StringAt((m_current + 1), 3, "UAL", "ION", "IOU", ""))
+ {
+ MetaphAdd("KX", "KS");
+ AdvanceCounter(3, 1);
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-X" as KS, except as silent when it is the last letter
+ * of a french word where it is not pronounced
+ *
+ * @return always false, so that Encode_X goes on to advance past the 'X'
+ *
+ */
+ boolean Encode_French_X_Final()
+ {
+ //french e.g. "breaux", "paix"
+ if(!((m_current == m_last)
+ && (StringAt((m_current - 3), 3, "IAU", "EAU", "IEU", "")
+ || StringAt((m_current - 2), 2, "AI", "AU", "OU", "OI", "EU", ""))) )
+ {
+ MetaphAdd("KS");
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-Z-"
+ *
+ */
+ void Encode_Z()
+ {
+ if(Encode_ZZ()
+ || Encode_ZU_ZIER_ZS()
+ || Encode_French_EZ()
+ || Encode_German_Z())
+ {
+ return;
+ }
+
+ if(Encode_ZH())
+ {
+ return;
+ }
+ else
+ {
+ MetaphAdd("S");
+ }
+
+ // eat redundant 'Z'
+ if(CharAt(m_current + 1) == 'Z')
+ {
+ m_current += 2;
+ }
+ else
+ {
+ m_current++;
+ }
+ }
+
+ /**
+ * Encode cases of "-ZZ-" where it is obviously part
+ * of an italian word where "-ZZ-" is pronounced as TS
+ * NOTE(review): "PUZZONLAN" below looks misspelled ("PUZZOLANA"?) - confirm
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_ZZ()
+ {
+ // "abruzzi", 'pizza'
+ if((CharAt(m_current + 1) == 'Z')
+ && ((StringAt((m_current + 2), 1, "I", "O", "A", "")
+ && ((m_current + 2) == m_last))
+ || StringAt((m_current - 2), 9, "MOZZARELL", "PIZZICATO", "PUZZONLAN", "")))
+ {
+ MetaphAdd("TS", "S");
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode special cases where "-Z-" is pronounced as J
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_ZU_ZIER_ZS()
+ {
+ if(((m_current == 1) && StringAt((m_current - 1), 4, "AZUR", ""))
+ || (StringAt(m_current, 4, "ZIER", "")
+ && !StringAt((m_current - 2), 6, "VIZIER", ""))
+ || StringAt(m_current, 3, "ZSA", ""))
+ {
+ MetaphAdd("J", "S");
+
+ if(StringAt(m_current, 3, "ZSA", ""))
+ {
+ m_current += 2;
+ }
+ else
+ {
+ m_current++;
+ }
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode cases where americans recognize "-EZ" as part
+ * of a french word where Z not pronounced
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_French_EZ()
+ {
+ if(((m_current == 3) && StringAt((m_current - 3), 4, "CHEZ", ""))
+ || StringAt((m_current - 5), 6, "RENDEZ", ""))
+ {
+ m_current++;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode cases where "-Z-" is in a german word
+ * where Z => TS in german (only S is added directly after a 'T')
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_German_Z()
+ {
+ if(((m_current == 2) && ((m_current + 1) == m_last) && StringAt((m_current - 2), 4, "NAZI", ""))
+ || StringAt((m_current - 2), 6, "NAZIFY", "MOZART", "")
+ || StringAt((m_current - 3), 4, "HOLZ", "HERZ", "MERZ", "FITZ", "")
+ || (StringAt((m_current - 3), 4, "GANZ", "") && !IsVowel(m_current + 1))
+ || StringAt((m_current - 4), 5, "STOLZ", "PRINZ", "")
+ || StringAt((m_current - 4), 7, "VENEZIA", "")
+ || StringAt((m_current - 3), 6, "HERZOG", "")
+ // words containing "sch" (anywhere, not only at the start) but not schlimazel, schmooze
+ || (m_inWord.contains("SCH") && !(StringAt((m_last - 2), 3, "IZE", "OZE", "ZEL", "")))
+ || ((m_current > 0) && StringAt(m_current, 4, "ZEIT", ""))
+ || StringAt((m_current - 3), 4, "WEIZ", ""))
+ {
+ if((m_current > 0) && m_inWord.charAt(m_current - 1) == 'T')
+ {
+ MetaphAdd("S");
+ }
+ else
+ {
+ MetaphAdd("TS");
+ }
+ m_current++;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Encode "-ZH-" as J
+ *
+ * @return true if encoding handled in this routine, false if not
+ *
+ */
+ boolean Encode_ZH()
+ {
+ //chinese pinyin e.g. 'zhao', also english "phonetic spelling"
+ if(CharAt(m_current + 1) == 'H')
+ {
+ MetaphAdd("J");
+ m_current += 2;
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Test for names derived from the swedish,
+ * dutch, or slavic that should get an alternate
+ * pronunciation of 'SV' to match the native
+ * version
+ *
+ * @return true if swedish, dutch, or slavic derived name
+ */
+ boolean Names_Beginning_With_SW_That_Get_Alt_SV()
+ {
+ if(StringAt(0, 7, "SWANSON", "SWENSON", "SWINSON", "SWENSEN",
+ "SWOBODA", "")
+ || StringAt(0, 9, "SWIDERSKI", "SWARTHOUT", "")
+ || StringAt(0, 10, "SWEARENGIN", ""))
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Test for names derived from the german
+ * that should get an alternate pronunciation
+ * of 'XV' to match the german version spelled
+ * "schw-"
+ *
+ * @return true if german derived name
+ */
+ boolean Names_Beginning_With_SW_That_Get_Alt_XV()
+ {
+ if(StringAt(0, 5, "SWART", "")
+ || StringAt(0, 6, "SWARTZ", "SWARTS", "SWIGER", "")
+ || StringAt(0, 7, "SWITZER", "SWANGER", "SWIGERT",
+ "SWIGART", "SWIHART", "")
+ || StringAt(0, 8, "SWEITZER", "SWATZELL", "SWINDLER", "")
+ || StringAt(0, 9, "SWINEHART", "")
+ || StringAt(0, 10, "SWEARINGEN", ""))
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Test whether the word in question
+ * is a name of germanic or slavic origin, for
+ * the purpose of determining whether to add an
+ * alternate encoding of 'V'
+ *
+ * @return true if germanic or slavic name
+ */
+ boolean Germanic_Or_Slavic_Name_Beginning_With_W()
+ {
+ if(StringAt(0, 3, "WEE", "WIX", "WAX", "")
+ || StringAt(0, 4, "WOLF", "WEIS", "WAHL", "WALZ", "WEIL", "WERT",
+ "WINE", "WILK", "WALT", "WOLL", "WADA", "WULF",
+ "WEHR", "WURM", "WYSE", "WENZ", "WIRT", "WOLK",
+ "WEIN", "WYSS", "WASS", "WANN", "WINT", "WINK",
+ "WILE", "WIKE", "WIER", "WELK", "WISE", "")
+ || StringAt(0, 5, "WIRTH", "WIESE", "WITTE", "WENTZ", "WOLFF", "WENDT",
+ "WERTZ", "WILKE", "WALTZ", "WEISE", "WOOLF", "WERTH",
+ "WEESE", "WURTH", "WINES", "WARGO", "WIMER", "WISER",
+ "WAGER", "WILLE", "WILDS", "WAGAR", "WERTS", "WITTY",
+ "WIENS", "WIEBE", "WIRTZ", "WYMER", "WULFF", "WIBLE",
+ "WINER", "WIEST", "WALKO", "WALLA", "WEBRE", "WEYER",
+ "WYBLE", "WOMAC", "WILTZ", "WURST", "WOLAK", "WELKE",
+ "WEDEL", "WEIST", "WYGAN", "WUEST", "WEISZ", "WALCK",
+ "WEITZ", "WYDRA", "WANDA", "WILMA", "WEBER", "")
+ || StringAt(0, 6, "WETZEL", "WEINER", "WENZEL", "WESTER", "WALLEN", "WENGER",
+ "WALLIN", "WEILER", "WIMMER", "WEIMER", "WYRICK", "WEGNER",
+ "WINNER", "WESSEL", "WILKIE", "WEIGEL", "WOJCIK", "WENDEL",
+ "WITTER", "WIENER", "WEISER", "WEXLER", "WACKER", "WISNER",
+ "WITMER", "WINKLE", "WELTER", "WIDMER", "WITTEN", "WINDLE",
+ "WASHER", "WOLTER", "WILKEY", "WIDNER", "WARMAN", "WEYANT",
+ "WEIBEL", "WANNER", "WILKEN", "WILTSE", "WARNKE", "WALSER",
+ "WEIKEL", "WESNER", "WITZEL", "WROBEL", "WAGNON", "WINANS",
+ "WENNER", "WOLKEN", "WILNER", "WYSONG", "WYCOFF", "WUNDER",
+ "WINKEL", "WIDMAN", "WELSCH", "WEHNER", "WEIGLE", "WETTER",
+ "WUNSCH", "WHITTY", "WAXMAN", "WILKER", "WILHAM", "WITTIG",
+ "WITMAN", "WESTRA", "WEHRLE", "WASSER", "WILLER", "WEGMAN",
+ "WARFEL", "WYNTER", "WERNER", "WAGNER", "WISSER", "")
+ || StringAt(0, 7, "WISEMAN", "WINKLER", "WILHELM", "WELLMAN", "WAMPLER", "WACHTER",
+ "WALTHER", "WYCKOFF", "WEIDNER", "WOZNIAK", "WEILAND", "WILFONG",
+ "WIEGAND", "WILCHER", "WIELAND", "WILDMAN", "WALDMAN", "WORTMAN",
+ "WYSOCKI", "WEIDMAN", "WITTMAN", "WIDENER", "WOLFSON", "WENDELL",
+ "WEITZEL", "WILLMAN", "WALDRUP", "WALTMAN", "WALCZAK", "WEIGAND",
+ "WESSELS", "WIDEMAN", "WOLTERS", "WIREMAN", "WILHOIT", "WEGENER",
+ "WOTRING", "WINGERT", "WIESNER", "WAYMIRE", "WHETZEL", "WENTZEL",
+ "WINEGAR", "WESTMAN", "WYNKOOP", "WALLICK", "WURSTER", "WINBUSH",
+ "WILBERT", "WALLACH", "WYNKOOP", "WALLICK", "WURSTER", "WINBUSH",
+ "WILBERT", "WALLACH", "WEISSER", "WEISNER", "WINDERS", "WILLMON",
+ "WILLEMS", "WIERSMA", "WACHTEL", "WARNICK", "WEIDLER", "WALTRIP",
+ "WHETSEL", "WHELESS", "WELCHER", "WALBORN", "WILLSEY", "WEINMAN",
+ "WAGAMAN", "WOMMACK", "WINGLER", "WINKLES", "WIEDMAN", "WHITNER",
+ "WOLFRAM", "WARLICK", "WEEDMAN", "WHISMAN", "WINLAND", "WEESNER",
+ "WARTHEN", "WETZLER", "WENDLER", "WALLNER", "WOLBERT", "WITTMER",
+ "WISHART", "WILLIAM", "")
+ || StringAt(0, 8, "WESTPHAL", "WICKLUND", "WEISSMAN", "WESTLUND", "WOLFGANG", "WILLHITE",
+ "WEISBERG", "WALRAVEN", "WOLFGRAM", "WILHOITE", "WECHSLER", "WENDLING",
+ "WESTBERG", "WENDLAND", "WININGER", "WHISNANT", "WESTRICK", "WESTLING",
+ "WESTBURY", "WEITZMAN", "WEHMEYER", "WEINMANN", "WISNESKI", "WHELCHEL",
+ "WEISHAAR", "WAGGENER", "WALDROUP", "WESTHOFF", "WIEDEMAN", "WASINGER",
+ "WINBORNE", "")
+ || StringAt(0, 9, "WHISENANT", "WEINSTEIN", "WESTERMAN", "WASSERMAN", "WITKOWSKI", "WEINTRAUB",
+ "WINKELMAN", "WINKFIELD", "WANAMAKER", "WIECZOREK", "WIECHMANN", "WOJTOWICZ",
+ "WALKOWIAK", "WEINSTOCK", "WILLEFORD", "WARKENTIN", "WEISINGER", "WINKLEMAN",
+ "WILHEMINA", "")
+ || StringAt(0, 10, "WISNIEWSKI", "WUNDERLICH", "WHISENHUNT", "WEINBERGER", "WROBLEWSKI",
+ "WAGUESPACK", "WEISGERBER", "WESTERVELT", "WESTERLUND", "WASILEWSKI",
+ "WILDERMUTH", "WESTENDORF", "WESOLOWSKI", "WEINGARTEN", "WINEBARGER",
+ "WESTERBERG", "WANNAMAKER", "WEISSINGER", "")
+ || StringAt(0, 11, "WALDSCHMIDT", "WEINGARTNER", "WINEBRENNER", "")
+ || StringAt(0, 12, "WOLFENBARGER", "")
+ || StringAt(0, 13, "WOJCIECHOWSKI", ""))
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Test whether the word in question
+ * is a name starting with 'J' that should
+ * match names starting with a 'Y' sound.
+ * All forms of 'John', 'Jane', etc, get
+ * an alt to match e.g. 'Ian', 'Yana'. Joelle
+ * should match 'Yael', 'Joseph' should match
+ * 'Yusef'. German and slavic last names are
+ * also included. Keep every entry as long as the length given to its StringAt call.
+ *
+ * @return true if name starting with 'J' that
+ * should get an alternate encoding as a vowel
+ */
+ boolean Names_Beginning_With_J_That_Get_Alt_Y()
+ {
+ if(StringAt(0, 3, "JAN", "JON", "JAN", "JIN", "JEN", "")
+ || StringAt(0, 4, "JUHL", "JULY", "JOEL", "JOHN", "JOSH",
+ "JUDE", "JUNE", "JONI", "JULI", "JENA",
+ "JUNG", "JINA", "JANA", "JENI", "JOEL",
+ "JANN", "JONA", "JENE", "JULE", "JANI",
+ "JONG", "JOHN", "JEAN", "JUNG", "JONE",
+ "JARA", "JUST", "JOST", "JAHN", "JACO",
+ "JANG", "JUDE", "JONE", "")
+ || StringAt(0, 5, "JOANN", "JANEY", "JANAE", "JOANA", "JUTTA",
+ "JULEE", "JANAY", "JANEE", "JETTA", "JOHNA",
+ "JOANE", "JAYNA", "JANES", "JONAS", "JONIE",
+ "JUSTA", "JUNIE", "JUNKO", "JENAE", "JULIO",
+ "JINNY", "JOHNS", "JACOB", "JETER", "JAFFE",
+ "JESKE", "JANKE", "JAGER", "JANIK", "JANDA",
+ "JOSHI", "JULES", "JANTZ", "JEANS", "JUDAH",
+ "JANUS", "JENNY", "JENEE", "JONAH", "JONAS",
+ "JACOB", "JOSUE", "JOSEF", "JULES", "JULIE",
+ "JULIA", "JANIE", "JANIS", "JENNA", "JANNA",
+ "JEANA", "JENNI", "JEANE", "JONNA", "JAKOB", "")
+ || StringAt(0, 6, "JORDAN", "JORDON", "JOSEPH", "JOSHUA", "JOSIAH",
+ "JOSPEH", "JUDSON", "JULIAN", "JULIUS", "JUNIOR",
+ "JUDITH", "JOESPH", "JOHNIE", "JOANNE", "JEANNE",
+ "JOANNA", "JOSEFA", "JULIET", "JANNIE", "JANELL",
+ "JASMIN", "JANINE", "JOHNNY", "JEANIE", "JEANNA",
+ "JOHNNA", "JOELLE", "JOVITA", "JOSEPH", "JONNIE",
+ "JANEEN", "JANINA", "JOANIE", "JAZMIN", "JOHNIE",
+ "JANENE", "JOHNNY", "JONELL", "JENELL", "JANETT",
+ "JANETH", "JENINE", "JOELLA", "JOEANN", "JULIAN",
+ "JOHANA", "JENICE", "JANNET", "JANISE", "JULENE",
+ "JOSHUA", "JANEAN", "JAIMEE", "JOETTE", "JANYCE",
+ "JENEVA", "JORDAN", "JACOBS", "JENSEN", "JOSEPH",
+ "JANSEN", "JORDON", "JULIAN", "JAEGER", "JACOBY",
+ "JENSON", "JARMAN", "JOSLIN", "JESSEN", "JAHNKE",
+ "JACOBO", "JULIEN", "JOSHUA", "JEPSON", "JULIUS",
+ "JANSON", "JACOBI", "JUDSON", "JARBOE", "JOHSON",
+ "JANZEN", "JETTON", "JUNKER", "JONSON", "JAROSZ",
+ "JENNER", "JAGGER", "JASMIN", "JEPSEN", "JORDEN",
+ "JANNEY", "JUHASZ", "JERGEN", "")
+ || StringAt(0, 7, "JOHNSON", "JOHNNIE", "JASMINE", "JEANNIE", "JOHANNA",
+ "JANELLE", "JANETTE", "JULIANA", "JUSTINA", "JOSETTE",
+ "JOELLEN", "JENELLE", "JULIETA", "JULIANN", "JULISSA",
+ "JENETTE", "JANETTA", "JOSELYN", "JONELLE", "JESENIA",
+ "JANESSA", "JAZMINE", "JEANENE", "JOANNIE", "JADWIGA",
+ "JOLANDA", "JULIANE", "JANUARY", "JEANICE", "JANELLA",
+ "JEANETT", "JENNINE", "JOHANNE", "JOHNSIE", "JANIECE",
+ "JOHNSON", "JENNELL", "JAMISON", "JANSSEN", "JOHNSEN",
+ "JARDINE", "JAGGERS", "JURGENS", "JOURDAN", "JULIANO",
+ "JOSEPHS", "JHONSON", "JOZWIAK", "JANICKI", "JELINEK",
+ "JANSSON", "JOACHIM", "JANELLE", "JACOBUS", "JENNING",
+ "JANTZEN", "JOHNNIE", "")
+ || StringAt(0, 8, "JOSEFINA", "JEANNINE", "JULIANNE", "JULIANNA", "JONATHAN",
+ "JONATHON", "JEANETTE", "JANNETTE", "JEANETTA", "JOHNETTA",
+ "JENNEFER", "JULIENNE", "JOSPHINE", "JEANELLE", "JOHNETTE",
+ "JULIEANN", "JOSEFINE", "JULIETTA", "JOHNSTON", "JACOBSON",
+ "JACOBSEN", "JOHANSEN", "JOHANSON", "JAWORSKI", "JENNETTE",
+ "JELLISON", "JOHANNES", "JASINSKI", "JUERGENS", "JARNAGIN",
+ "JEREMIAH", "JEPPESEN", "JARNIGAN", "JANOUSEK", "")
+ || StringAt(0, 9, "JOHNATHAN", "JOHNATHON", "JORGENSEN", "JEANMARIE", "JOSEPHINA",
+ "JEANNETTE", "JOSEPHINE", "JEANNETTA", "JORGENSON", "JANKOWSKI",
+ "JOHNSTONE", "JABLONSKI", "JOSEPHSON", "JOHANNSEN", "JURGENSEN",
+ "JIMMERSON", "JOHANSSON", "")
+ || StringAt(0, 10, "JAKUBOWSKI", ""))
+ {
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * Prints the main and alternate encodings of a few sample words; args is not read.
+ */
+ public static void main(String[] args)
+ {
+ // example code
+
+ Metaphone3 m3 = new Metaphone3();
+
+ //m3.SetEncodeVowels(true);
+ //m3.SetEncodeExact(true);
+
+ m3.SetWord("iron");
+
+ m3.Encode();
+
+ System.out.println("iron : " + m3.GetMetaph());
+ System.out.println("iron : (alt) " + m3.GetAlternateMetaph());
+
+ m3.SetWord("witz");
+
+ m3.Encode();
+
+ System.out.println("witz : " + m3.GetMetaph());
+ System.out.println("witz : (alt) " + m3.GetAlternateMetaph());
+
+ m3.SetWord("");
+
+ m3.Encode();
+
+ System.out.println("BLANK : " + m3.GetMetaph());
+ System.out.println("BLANK : (alt) " + m3.GetAlternateMetaph());
+
+ // these settings default to false
+ m3.SetEncodeExact(true);
+ m3.SetEncodeVowels(true);
+
+ String test = "Guillermo";
+ m3.SetWord(test);
+ m3.Encode();
+ System.out.println(test + " : " + m3.GetMetaph());
+ System.out.println(test + " : (alt) " + m3.GetAlternateMetaph());
+
+ test = "VILLASENOR";
+ m3.SetWord(test);
+ m3.Encode();
+ System.out.println(test + " : " + m3.GetMetaph());
+ System.out.println(test + " : (alt) " + m3.GetAlternateMetaph());
+
+ test = "GUILLERMINA";
+ m3.SetWord(test);
+ m3.Encode();
+ System.out.println(test + " : " + m3.GetMetaph());
+ System.out.println(test + " : (alt) " + m3.GetAlternateMetaph());
+
+ test = "PADILLA";
+ m3.SetWord(test);
+ m3.Encode();
+ System.out.println(test + " : " + m3.GetMetaph());
+ System.out.println(test + " : (alt) " + m3.GetAlternateMetaph());
+
+ test = "BJORK";
+ m3.SetWord(test);
+ m3.Encode();
+ System.out.println(test + " : " + m3.GetMetaph());
+ System.out.println(test + " : (alt) " + m3.GetAlternateMetaph());
+
+ test = "belle";
+ m3.SetWord(test);
+ m3.Encode();
+ System.out.println(test + " : " + m3.GetMetaph());
+ System.out.println(test + " : (alt) " + m3.GetAlternateMetaph());
+
+ test = "ERICH";
+ m3.SetWord(test);
+ m3.Encode();
+ System.out.println(test + " : " + m3.GetMetaph());
+ System.out.println(test + " : (alt) " + m3.GetAlternateMetaph());
+
+ test = "CROCE";
+ m3.SetWord(test);
+ m3.Encode();
+ System.out.println(test + " : " + m3.GetMetaph());
+ System.out.println(test + " : (alt) " + m3.GetAlternateMetaph());
+
+ test = "GLOWACKI";
+ m3.SetWord(test);
+ m3.Encode();
+ System.out.println(test + " : " + m3.GetMetaph());
+ System.out.println(test + " : (alt) " + m3.GetAlternateMetaph());
+
+ test = "qing";
+ m3.SetWord(test);
+ m3.Encode();
+ System.out.println(test + " : " + m3.GetMetaph());
+ System.out.println(test + " : (alt) " + m3.GetAlternateMetaph());
+
+ test = "tsing";
+ m3.SetWord(test);
+ m3.Encode();
+ System.out.println(test + " : " + m3.GetMetaph());
+ System.out.println(test + " : (alt) " + m3.GetAlternateMetaph());
+
+ }
+}
+
+
diff --git a/main/src/com/google/refine/clustering/binning/Metaphone3Keyer.java b/main/src/com/google/refine/clustering/binning/Metaphone3Keyer.java
new file mode 100644
index 000000000..6688cf42e
--- /dev/null
+++ b/main/src/com/google/refine/clustering/binning/Metaphone3Keyer.java
@@ -0,0 +1,50 @@
+/*
+
+Copyright 2010, Google Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+ * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+package com.google.refine.clustering.binning;
+
+public class Metaphone3Keyer extends Keyer {
+ // Keyer whose key is the code that Metaphone3.GetMetaph() returns for the input string.
+ public Metaphone3Keyer() {
+ }
+ // Builds a new Metaphone3 on every call; the extra arguments in o are not read.
+ public String key(String s, Object... o) {
+ Metaphone3 _metaphone3 = new Metaphone3();
+ //_metaphone3.SetEncodeVowels(true);
+ //_metaphone3.SetEncodeExact(true);
+ _metaphone3.SetWord(s);
+ _metaphone3.Encode();
+ return _metaphone3.GetMetaph();
+ }
+
+}
diff --git a/main/src/com/google/refine/expr/functions/strings/Phonetic.java b/main/src/com/google/refine/expr/functions/strings/Phonetic.java
index 38748a67b..44fb68713 100644
--- a/main/src/com/google/refine/expr/functions/strings/Phonetic.java
+++ b/main/src/com/google/refine/expr/functions/strings/Phonetic.java
@@ -39,6 +39,7 @@ import org.json.JSONException;
import org.json.JSONWriter;
import com.google.refine.clustering.binning.DoubleMetaphoneKeyer;
+import com.google.refine.clustering.binning.Metaphone3Keyer;
import com.google.refine.clustering.binning.MetaphoneKeyer;
import com.google.refine.clustering.binning.SoundexKeyer;
import com.google.refine.expr.EvalError;
@@ -47,6 +48,7 @@ import com.google.refine.grel.Function;
public class Phonetic implements Function {
+ static private Metaphone3Keyer metaphone3 = new Metaphone3Keyer();
static private DoubleMetaphoneKeyer metaphone2 = new DoubleMetaphoneKeyer();
static private MetaphoneKeyer metaphone = new MetaphoneKeyer();
static private SoundexKeyer soundex = new SoundexKeyer();
@@ -58,11 +60,14 @@ public class Phonetic implements Function {
if (o1 != null && o2 != null && o2 instanceof String) {
String str = (o1 instanceof String) ? (String) o1 : o1.toString();
String encoding = ((String) o2).toLowerCase();
- if ("doublemetaphone".equals(encoding)) {
+ if (encoding == null) encoding = "metaphone3";
+ if ("doublemetaphone".equalsIgnoreCase(encoding)) {
return metaphone2.key(str);
- } else if ("metaphone".equals(encoding)) {
+ } else if ("metaphone3".equalsIgnoreCase(encoding)) {
+ return metaphone3.key(str);
+ } else if ("metaphone".equalsIgnoreCase(encoding)) {
return metaphone.key(str);
- } else if ("soundex".equals(encoding)) {
+ } else if ("soundex".equalsIgnoreCase(encoding)) {
return soundex.key(str);
} else {
return new EvalError(ControlFunctionRegistry.getFunctionName(this) + " doesn't know how to handle the '" + encoding + "' encoding.");
@@ -77,7 +82,7 @@ public class Phonetic implements Function {
writer.object();
writer.key("description"); writer.value("Returns the a phonetic encoding of s (optionally indicating which encoding to use')");
- writer.key("params"); writer.value("string s, string encoding (optional, defaults to 'DoubleMetaphone')");
+ writer.key("params"); writer.value("string s, string encoding (optional, defaults to 'metaphone3')");
writer.key("returns"); writer.value("string");
writer.endObject();
}
diff --git a/main/webapp/modules/core/scripts/dialogs/clustering-dialog.html b/main/webapp/modules/core/scripts/dialogs/clustering-dialog.html
index 0a1276584..342a0a134 100644
--- a/main/webapp/modules/core/scripts/dialogs/clustering-dialog.html
+++ b/main/webapp/modules/core/scripts/dialogs/clustering-dialog.html
@@ -23,7 +23,7 @@