PCQRSCANER/venv/Lib/site-packages/nltk/stem/lancaster.py

# Natural Language Toolkit: Stemmers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Steven Tomcavage <stomcava@law.upenn.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.
Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.
"""
from __future__ import unicode_literals
import re

from nltk.stem.api import StemmerI
from nltk.compat import python_2_unicode_compatible


@python_2_unicode_compatible
class LancasterStemmer(StemmerI):
    """
    Lancaster Stemmer

        >>> from nltk.stem.lancaster import LancasterStemmer
        >>> st = LancasterStemmer()
        >>> st.stem('maximum')     # Remove "-um" when word is intact
        'maxim'
        >>> st.stem('presumably')  # Don't remove "-um" when word is not intact
        'presum'
        >>> st.stem('multiply')    # No action taken if word ends with "-ply"
        'multiply'
        >>> st.stem('provision')   # Replace "-sion" with "-j" to trigger "j" set of rules
        'provid'
        >>> st.stem('owed')        # Word starting with vowel must contain at least 2 letters
        'ow'
        >>> st.stem('ear')         # ditto
        'ear'
        >>> st.stem('saying')      # Words starting with consonant must contain at least 3
        'say'
        >>> st.stem('crying')      #     letters and one of those letters must be a vowel
        'cry'
        >>> st.stem('string')      # ditto
        'string'
        >>> st.stem('meant')       # ditto
        'meant'
        >>> st.stem('cement')      # ditto
        'cem'
        >>> st_pre = LancasterStemmer(strip_prefix_flag=True)
        >>> st_pre.stem('kilometer') # Test Prefix
        'met'
        >>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))
        >>> st_custom.stem("ness") # Change s to t
        'nest'
    """

    # The rule list is static since it doesn't change between instances
    default_rule_tuple = (
        "ai*2.",  # -ia > -   if intact
        "a*1.",  # -a > -    if intact
        "bb1.",  # -bb > -b
        "city3s.",  # -ytic > -ys
        "ci2>",  # -ic > -
        "cn1t>",  # -nc > -nt
        "dd1.",  # -dd > -d
        "dei3y>",  # -ied > -y
        "deec2ss.",  # -ceed >", -cess
        "dee1.",  # -eed > -ee
        "de2>",  # -ed > -
        "dooh4>",  # -hood > -
        "e1>",  # -e > -
        "feil1v.",  # -lief > -liev
        "fi2>",  # -if > -
        "gni3>",  # -ing > -
        "gai3y.",  # -iag > -y
        "ga2>",  # -ag > -
        "gg1.",  # -gg > -g
        "ht*2.",  # -th > -   if intact
        "hsiug5ct.",  # -guish > -ct
        "hsi3>",  # -ish > -
        "i*1.",  # -i > -    if intact
        "i1y>",  # -i > -y
        "ji1d.",  # -ij > -id   --  see nois4j> & vis3j>
        "juf1s.",  # -fuj > -fus
        "ju1d.",  # -uj > -ud
        "jo1d.",  # -oj > -od
        "jeh1r.",  # -hej > -her
        "jrev1t.",  # -verj > -vert
        "jsim2t.",  # -misj > -mit
        "jn1d.",  # -nj > -nd
        "j1s.",  # -j > -s
        "lbaifi6.",  # -ifiabl > -
        "lbai4y.",  # -iabl > -y
        "lba3>",  # -abl > -
        "lbi3.",  # -ibl > -
        "lib2l>",  # -bil > -bl
        "lc1.",  # -cl > c
        "lufi4y.",  # -iful > -y
        "luf3>",  # -ful > -
        "lu2.",  # -ul > -
        "lai3>",  # -ial > -
        "lau3>",  # -ual > -
        "la2>",  # -al > -
        "ll1.",  # -ll > -l
        "mui3.",  # -ium > -
        "mu*2.",  # -um > -   if intact
        "msi3>",  # -ism > -
        "mm1.",  # -mm > -m
        "nois4j>",  # -sion > -j
        "noix4ct.",  # -xion > -ct
        "noi3>",  # -ion > -
        "nai3>",  # -ian > -
        "na2>",  # -an > -
        "nee0.",  # protect  -een
        "ne2>",  # -en > -
        "nn1.",  # -nn > -n
        "pihs4>",  # -ship > -
        "pp1.",  # -pp > -p
        "re2>",  # -er > -
        "rae0.",  # protect  -ear
        "ra2.",  # -ar > -
        "ro2>",  # -or > -
        "ru2>",  # -ur > -
        "rr1.",  # -rr > -r
        "rt1>",  # -tr > -t
        "rei3y>",  # -ier > -y
        "sei3y>",  # -ies > -y
        "sis2.",  # -sis > -s
        "si2>",  # -is > -
        "ssen4>",  # -ness > -
        "ss0.",  # protect  -ss
        "suo3>",  # -ous > -
        "su*2.",  # -us > -   if intact
        "s*1>",  # -s > -    if intact
        "s0.",  # -s > -s
        "tacilp4y.",  # -plicat > -ply
        "ta2>",  # -at > -
        "tnem4>",  # -ment > -
        "tne3>",  # -ent > -
        "tna3>",  # -ant > -
        "tpir2b.",  # -ript > -rib
        "tpro2b.",  # -orpt > -orb
        "tcud1.",  # -duct > -duc
        "tpmus2.",  # -sumpt > -sum
        "tpec2iv.",  # -cept > -ceiv
        "tulo2v.",  # -olut > -olv
        "tsis0.",  # protect  -sist
        "tsi3>",  # -ist > -
        "tt1.",  # -tt > -t
        "uqi3.",  # -iqu > -
        "ugo1.",  # -ogu > -og
        "vis3j>",  # -siv > -j
        "vie0.",  # protect  -eiv
        "vi2>",  # -iv > -
        "ylb1>",  # -bly > -bl
        "yli3y>",  # -ily > -y
        "ylp0.",  # protect  -ply
        "yl2>",  # -ly > -
        "ygo1.",  # -ogy > -og
        "yhp1.",  # -phy > -ph
        "ymo1.",  # -omy > -om
        "ypo1.",  # -opy > -op
        "yti3>",  # -ity > -
        "yte3>",  # -ety > -
        "ytl2.",  # -lty > -l
        "yrtsi5.",  # -istry > -
        "yra3>",  # -ary > -
        "yro3>",  # -ory > -
        "yfi3.",  # -ify > -
        "ycn2t>",  # -ncy > -nt
        "yca3>",  # -acy > -
        "zi2>",  # -iz > -
        "zy1s.",  # -yz > -ys
    )

    def __init__(self, rule_tuple=None, strip_prefix_flag=False):
        """Create an instance of the Lancaster stemmer.
        """
        # Setup an empty rule dictionary - this will be filled in later
        self.rule_dictionary = {}
        # Check if a user wants to strip prefix
        self._strip_prefix = strip_prefix_flag
        # Check if a user wants to use his/her own rule tuples.
        self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple

    def parseRules(self, rule_tuple=None):
        """Validate the set of rules used in this stemmer.

        If this function is called as an individual method, without using stem
        method, rule_tuple argument will be compiled into self.rule_dictionary.
        If this function is called within stem, self._rule_tuple will be used.

        """
        # If there is no argument for the function, use class' own rule tuple.
        rule_tuple = rule_tuple if rule_tuple else self._rule_tuple
        valid_rule = re.compile("^[a-z]+\*?\d[a-z]*[>\.]?$")
        # Empty any old rules from the rule set before adding new ones
        self.rule_dictionary = {}

        for rule in rule_tuple:
            if not valid_rule.match(rule):
                raise ValueError("The rule {0} is invalid".format(rule))
            first_letter = rule[0:1]
            if first_letter in self.rule_dictionary:
                self.rule_dictionary[first_letter].append(rule)
            else:
                self.rule_dictionary[first_letter] = [rule]

    def stem(self, word):
        """Stem a word using the Lancaster stemmer.
        """
        # Lower-case the word, since all the rules are lower-cased
        word = word.lower()
        word = self.__stripPrefix(word) if self._strip_prefix else word

        # Save a copy of the original word
        intact_word = word

        # If rule dictionary is empty, parse rule tuple.
        if not self.rule_dictionary:
            self.parseRules()

        return self.__doStemming(word, intact_word)

    def __doStemming(self, word, intact_word):
        """Perform the actual word stemming
        """

        valid_rule = re.compile("^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$")

        proceed = True

        while proceed:

            # Find the position of the last letter of the word to be stemmed
            last_letter_position = self.__getLastLetter(word)

            # Only stem the word if it has a last letter and a rule matching that last letter
            if (
                last_letter_position < 0
                or word[last_letter_position] not in self.rule_dictionary
            ):
                proceed = False

            else:
                rule_was_applied = False

                # Go through each rule that matches the word's final letter
                for rule in self.rule_dictionary[word[last_letter_position]]:
                    rule_match = valid_rule.match(rule)
                    if rule_match:
                        (
                            ending_string,
                            intact_flag,
                            remove_total,
                            append_string,
                            cont_flag,
                        ) = rule_match.groups()

                        # Convert the number of chars to remove when stemming
                        # from a string to an integer
                        remove_total = int(remove_total)

                        # Proceed if word's ending matches rule's word ending
                        if word.endswith(ending_string[::-1]):
                            if intact_flag:
                                if word == intact_word and self.__isAcceptable(
                                    word, remove_total
                                ):
                                    word = self.__applyRule(
                                        word, remove_total, append_string
                                    )
                                    rule_was_applied = True
                                    if cont_flag == '.':
                                        proceed = False
                                    break
                            elif self.__isAcceptable(word, remove_total):
                                word = self.__applyRule(
                                    word, remove_total, append_string
                                )
                                rule_was_applied = True
                                if cont_flag == '.':
                                    proceed = False
                                break
                # If no rules apply, the word doesn't need any more stemming
                if rule_was_applied == False:
                    proceed = False
        return word

    def __getLastLetter(self, word):
        """Get the zero-based index of the last alphabetic character in this string
        """
        last_letter = -1
        for position in range(len(word)):
            if word[position].isalpha():
                last_letter = position
            else:
                break
        return last_letter

    def __isAcceptable(self, word, remove_total):
        """Determine if the word is acceptable for stemming.
        """
        word_is_acceptable = False
        # If the word starts with a vowel, it must be at least 2
        # characters long to be stemmed
        if word[0] in "aeiouy":
            if len(word) - remove_total >= 2:
                word_is_acceptable = True
        # If the word starts with a consonant, it must be at least 3
        # characters long (including one vowel) to be stemmed
        elif len(word) - remove_total >= 3:
            if word[1] in "aeiouy":
                word_is_acceptable = True
            elif word[2] in "aeiouy":
                word_is_acceptable = True
        return word_is_acceptable

    def __applyRule(self, word, remove_total, append_string):
        """Apply the stemming rule to the word
        """
        # Remove letters from the end of the word
        new_word_length = len(word) - remove_total
        word = word[0:new_word_length]

        # And add new letters to the end of the truncated word
        if append_string:
            word += append_string
        return word

    def __stripPrefix(self, word):
        """Remove prefix from a word.

        This function originally taken from Whoosh.

        """
        for prefix in (
            "kilo",
            "micro",
            "milli",
            "intra",
            "ultra",
            "mega",
            "nano",
            "pico",
            "pseudo",
        ):
            if word.startswith(prefix):
                return word[len(prefix) :]
        return word

    def __repr__(self):
        return '<LancasterStemmer>'
3 2019-12-22 21:51:47 +01:00			`# Natural Language Toolkit: Stemmers`
			`#`
			`# Copyright (C) 2001-2019 NLTK Project`
			`# Author: Steven Tomcavage <stomcava@law.upenn.edu>`
			`# URL: <http://nltk.org/>`
			`# For license information, see LICENSE.TXT`

			`"""`
			`A word stemmer based on the Lancaster (Paice/Husk) stemming algorithm.`
			`Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61.`
			`"""`
			`from __future__ import unicode_literals`
			`import re`

			`from nltk.stem.api import StemmerI`
			`from nltk.compat import python_2_unicode_compatible`


			`@python_2_unicode_compatible`
			`class LancasterStemmer(StemmerI):`
			`"""`
			`Lancaster Stemmer`

			`>>> from nltk.stem.lancaster import LancasterStemmer`
			`>>> st = LancasterStemmer()`
			`>>> st.stem('maximum') # Remove "-um" when word is intact`
			`'maxim'`
			`>>> st.stem('presumably') # Don't remove "-um" when word is not intact`
			`'presum'`
			`>>> st.stem('multiply') # No action taken if word ends with "-ply"`
			`'multiply'`
			`>>> st.stem('provision') # Replace "-sion" with "-j" to trigger "j" set of rules`
			`'provid'`
			`>>> st.stem('owed') # Word starting with vowel must contain at least 2 letters`
			`'ow'`
			`>>> st.stem('ear') # ditto`
			`'ear'`
			`>>> st.stem('saying') # Words starting with consonant must contain at least 3`
			`'say'`
			`>>> st.stem('crying') # letters and one of those letters must be a vowel`
			`'cry'`
			`>>> st.stem('string') # ditto`
			`'string'`
			`>>> st.stem('meant') # ditto`
			`'meant'`
			`>>> st.stem('cement') # ditto`
			`'cem'`
			`>>> st_pre = LancasterStemmer(strip_prefix_flag=True)`
			`>>> st_pre.stem('kilometer') # Test Prefix`
			`'met'`
			`>>> st_custom = LancasterStemmer(rule_tuple=("ssen4>", "s1t."))`
			`>>> st_custom.stem("ness") # Change s to t`
			`'nest'`
			`"""`

			`# The rule list is static since it doesn't change between instances`
			`default_rule_tuple = (`
			`"ai*2.", # -ia > - if intact`
			`"a*1.", # -a > - if intact`
			`"bb1.", # -bb > -b`
			`"city3s.", # -ytic > -ys`
			`"ci2>", # -ic > -`
			`"cn1t>", # -nc > -nt`
			`"dd1.", # -dd > -d`
			`"dei3y>", # -ied > -y`
			`"deec2ss.", # -ceed >", -cess`
			`"dee1.", # -eed > -ee`
			`"de2>", # -ed > -`
			`"dooh4>", # -hood > -`
			`"e1>", # -e > -`
			`"feil1v.", # -lief > -liev`
			`"fi2>", # -if > -`
			`"gni3>", # -ing > -`
			`"gai3y.", # -iag > -y`
			`"ga2>", # -ag > -`
			`"gg1.", # -gg > -g`
			`"ht*2.", # -th > - if intact`
			`"hsiug5ct.", # -guish > -ct`
			`"hsi3>", # -ish > -`
			`"i*1.", # -i > - if intact`
			`"i1y>", # -i > -y`
			`"ji1d.", # -ij > -id -- see nois4j> & vis3j>`
			`"juf1s.", # -fuj > -fus`
			`"ju1d.", # -uj > -ud`
			`"jo1d.", # -oj > -od`
			`"jeh1r.", # -hej > -her`
			`"jrev1t.", # -verj > -vert`
			`"jsim2t.", # -misj > -mit`
			`"jn1d.", # -nj > -nd`
			`"j1s.", # -j > -s`
			`"lbaifi6.", # -ifiabl > -`
			`"lbai4y.", # -iabl > -y`
			`"lba3>", # -abl > -`
			`"lbi3.", # -ibl > -`
			`"lib2l>", # -bil > -bl`
			`"lc1.", # -cl > c`
			`"lufi4y.", # -iful > -y`
			`"luf3>", # -ful > -`
			`"lu2.", # -ul > -`
			`"lai3>", # -ial > -`
			`"lau3>", # -ual > -`
			`"la2>", # -al > -`
			`"ll1.", # -ll > -l`
			`"mui3.", # -ium > -`
			`"mu*2.", # -um > - if intact`
			`"msi3>", # -ism > -`
			`"mm1.", # -mm > -m`
			`"nois4j>", # -sion > -j`
			`"noix4ct.", # -xion > -ct`
			`"noi3>", # -ion > -`
			`"nai3>", # -ian > -`
			`"na2>", # -an > -`
			`"nee0.", # protect -een`
			`"ne2>", # -en > -`
			`"nn1.", # -nn > -n`
			`"pihs4>", # -ship > -`
			`"pp1.", # -pp > -p`
			`"re2>", # -er > -`
			`"rae0.", # protect -ear`
			`"ra2.", # -ar > -`
			`"ro2>", # -or > -`
			`"ru2>", # -ur > -`
			`"rr1.", # -rr > -r`
			`"rt1>", # -tr > -t`
			`"rei3y>", # -ier > -y`
			`"sei3y>", # -ies > -y`
			`"sis2.", # -sis > -s`
			`"si2>", # -is > -`
			`"ssen4>", # -ness > -`
			`"ss0.", # protect -ss`
			`"suo3>", # -ous > -`
			`"su*2.", # -us > - if intact`
			`"s*1>", # -s > - if intact`
			`"s0.", # -s > -s`
			`"tacilp4y.", # -plicat > -ply`
			`"ta2>", # -at > -`
			`"tnem4>", # -ment > -`
			`"tne3>", # -ent > -`
			`"tna3>", # -ant > -`
			`"tpir2b.", # -ript > -rib`
			`"tpro2b.", # -orpt > -orb`
			`"tcud1.", # -duct > -duc`
			`"tpmus2.", # -sumpt > -sum`
			`"tpec2iv.", # -cept > -ceiv`
			`"tulo2v.", # -olut > -olv`
			`"tsis0.", # protect -sist`
			`"tsi3>", # -ist > -`
			`"tt1.", # -tt > -t`
			`"uqi3.", # -iqu > -`
			`"ugo1.", # -ogu > -og`
			`"vis3j>", # -siv > -j`
			`"vie0.", # protect -eiv`
			`"vi2>", # -iv > -`
			`"ylb1>", # -bly > -bl`
			`"yli3y>", # -ily > -y`
			`"ylp0.", # protect -ply`
			`"yl2>", # -ly > -`
			`"ygo1.", # -ogy > -og`
			`"yhp1.", # -phy > -ph`
			`"ymo1.", # -omy > -om`
			`"ypo1.", # -opy > -op`
			`"yti3>", # -ity > -`
			`"yte3>", # -ety > -`
			`"ytl2.", # -lty > -l`
			`"yrtsi5.", # -istry > -`
			`"yra3>", # -ary > -`
			`"yro3>", # -ory > -`
			`"yfi3.", # -ify > -`
			`"ycn2t>", # -ncy > -nt`
			`"yca3>", # -acy > -`
			`"zi2>", # -iz > -`
			`"zy1s.", # -yz > -ys`
			`)`

			`def __init__(self, rule_tuple=None, strip_prefix_flag=False):`
			`"""Create an instance of the Lancaster stemmer.`
			`"""`
			`# Setup an empty rule dictionary - this will be filled in later`
			`self.rule_dictionary = {}`
			`# Check if a user wants to strip prefix`
			`self._strip_prefix = strip_prefix_flag`
			`# Check if a user wants to use his/her own rule tuples.`
			`self._rule_tuple = rule_tuple if rule_tuple else self.default_rule_tuple`

			`def parseRules(self, rule_tuple=None):`
			`"""Validate the set of rules used in this stemmer.`

			`If this function is called as an individual method, without using stem`
			`method, rule_tuple argument will be compiled into self.rule_dictionary.`
			`If this function is called within stem, self._rule_tuple will be used.`

			`"""`
			`# If there is no argument for the function, use class' own rule tuple.`
			`rule_tuple = rule_tuple if rule_tuple else self._rule_tuple`
			`valid_rule = re.compile("^[a-z]+\?\d[a-z][>\.]?$")`
			`# Empty any old rules from the rule set before adding new ones`
			`self.rule_dictionary = {}`

			`for rule in rule_tuple:`
			`if not valid_rule.match(rule):`
			`raise ValueError("The rule {0} is invalid".format(rule))`
			`first_letter = rule[0:1]`
			`if first_letter in self.rule_dictionary:`
			`self.rule_dictionary[first_letter].append(rule)`
			`else:`
			`self.rule_dictionary[first_letter] = [rule]`

			`def stem(self, word):`
			`"""Stem a word using the Lancaster stemmer.`
			`"""`
			`# Lower-case the word, since all the rules are lower-cased`
			`word = word.lower()`
			`word = self.__stripPrefix(word) if self._strip_prefix else word`

			`# Save a copy of the original word`
			`intact_word = word`

			`# If rule dictionary is empty, parse rule tuple.`
			`if not self.rule_dictionary:`
			`self.parseRules()`

			`return self.__doStemming(word, intact_word)`

			`def __doStemming(self, word, intact_word):`
			`"""Perform the actual word stemming`
			`"""`

			`valid_rule = re.compile("^([a-z]+)(\?)(\d)([a-z])([>\.]?)$")`

			`proceed = True`

			`while proceed:`

			`# Find the position of the last letter of the word to be stemmed`
			`last_letter_position = self.__getLastLetter(word)`

			`# Only stem the word if it has a last letter and a rule matching that last letter`
			`if (`
			`last_letter_position < 0`
			`or word[last_letter_position] not in self.rule_dictionary`
			`):`
			`proceed = False`

			`else:`
			`rule_was_applied = False`

			`# Go through each rule that matches the word's final letter`
			`for rule in self.rule_dictionary[word[last_letter_position]]:`
			`rule_match = valid_rule.match(rule)`
			`if rule_match:`
			`(`
			`ending_string,`
			`intact_flag,`
			`remove_total,`
			`append_string,`
			`cont_flag,`
			`) = rule_match.groups()`

			`# Convert the number of chars to remove when stemming`
			`# from a string to an integer`
			`remove_total = int(remove_total)`

			`# Proceed if word's ending matches rule's word ending`
			`if word.endswith(ending_string[::-1]):`
			`if intact_flag:`
			`if word == intact_word and self.__isAcceptable(`
			`word, remove_total`
			`):`
			`word = self.__applyRule(`
			`word, remove_total, append_string`
			`)`
			`rule_was_applied = True`
			`if cont_flag == '.':`
			`proceed = False`
			`break`
			`elif self.__isAcceptable(word, remove_total):`
			`word = self.__applyRule(`
			`word, remove_total, append_string`
			`)`
			`rule_was_applied = True`
			`if cont_flag == '.':`
			`proceed = False`
			`break`
			`# If no rules apply, the word doesn't need any more stemming`
			`if rule_was_applied == False:`
			`proceed = False`
			`return word`

			`def __getLastLetter(self, word):`
			`"""Get the zero-based index of the last alphabetic character in this string`
			`"""`
			`last_letter = -1`
			`for position in range(len(word)):`
			`if word[position].isalpha():`
			`last_letter = position`
			`else:`
			`break`
			`return last_letter`

			`def __isAcceptable(self, word, remove_total):`
			`"""Determine if the word is acceptable for stemming.`
			`"""`
			`word_is_acceptable = False`
			`# If the word starts with a vowel, it must be at least 2`
			`# characters long to be stemmed`
			`if word[0] in "aeiouy":`
			`if len(word) - remove_total >= 2:`
			`word_is_acceptable = True`
			`# If the word starts with a consonant, it must be at least 3`
			`# characters long (including one vowel) to be stemmed`
			`elif len(word) - remove_total >= 3:`
			`if word[1] in "aeiouy":`
			`word_is_acceptable = True`
			`elif word[2] in "aeiouy":`
			`word_is_acceptable = True`
			`return word_is_acceptable`

			`def __applyRule(self, word, remove_total, append_string):`
			`"""Apply the stemming rule to the word`
			`"""`
			`# Remove letters from the end of the word`
			`new_word_length = len(word) - remove_total`
			`word = word[0:new_word_length]`

			`# And add new letters to the end of the truncated word`
			`if append_string:`
			`word += append_string`
			`return word`

			`def __stripPrefix(self, word):`
			`"""Remove prefix from a word.`

			`This function originally taken from Whoosh.`

			`"""`
			`for prefix in (`
			`"kilo",`
			`"micro",`
			`"milli",`
			`"intra",`
			`"ultra",`
			`"mega",`
			`"nano",`
			`"pico",`
			`"pseudo",`
			`):`
			`if word.startswith(prefix):`
			`return word[len(prefix) :]`
			`return word`

			`def __repr__(self):`
			`return '<LancasterStemmer>'`