PCQRSCANER/venv/Lib/site-packages/nltk/tag/senna.py

# encoding: utf-8
# Natural Language Toolkit: Senna POS Tagger
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Senna POS tagger, NER Tagger, Chunk Tagger

The input is:
- path to the directory that contains the SENNA executables. If the path is incorrect,
   SennaTagger will automatically search for the executable file specified in the SENNA environment variable
- (optionally) the encoding of the input data (default:utf-8)

Note: Unit tests for this module can be found in test/unit/test_senna.py

    >>> from nltk.tag import SennaTagger
    >>> tagger = SennaTagger('/usr/share/senna-v3.0')
    >>> tagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
    [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'),
    ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.')]

    >>> from nltk.tag import SennaChunkTagger
    >>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0')
    >>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP
    [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'),
    ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'),
    ('?', 'O')]

    >>> from nltk.tag import SennaNERTagger
    >>> nertagger = SennaNERTagger('/usr/share/senna-v3.0')
    >>> nertagger.tag('Shakespeare theatre was in London .'.split()) # doctest: +SKIP
    [('Shakespeare', 'B-PER'), ('theatre', 'O'), ('was', 'O'), ('in', 'O'),
    ('London', 'B-LOC'), ('.', 'O')]
    >>> nertagger.tag('UN headquarters are in NY , USA .'.split()) # doctest: +SKIP
    [('UN', 'B-ORG'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'),
    ('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')]
"""

from nltk.compat import python_2_unicode_compatible
from nltk.classify import Senna


@python_2_unicode_compatible
class SennaTagger(Senna):
    """Part-of-speech tagger that delegates to the SENNA wrapper class."""

    def __init__(self, path, encoding='utf-8'):
        # Only the 'pos' operation is requested from the underlying Senna wrapper.
        super(SennaTagger, self).__init__(path, ['pos'], encoding)

    def tag_sents(self, sentences):
        """
        Tag a list of sentences.

        The annotation mappings produced by the parent class are replaced,
        in place, by ``(word, tag)`` tuples, where ``tag`` is the value stored
        under the ``'pos'`` key.

        :param sentences: the sentences to tag, forwarded unchanged to the
            parent ``tag_sents``.
        :return: for each sentence, a list of ``(word, tag)`` tuples.
        """
        annotated = super(SennaTagger, self).tag_sents(sentences)
        for sentence in annotated:
            for position, token in enumerate(sentence):
                # Collapse the annotation mapping into a (word, POS tag) pair.
                sentence[position] = (token['word'], token['pos'])
        return annotated


@python_2_unicode_compatible
class SennaChunkTagger(Senna):
    """Chunk tagger that delegates to the SENNA wrapper class."""

    def __init__(self, path, encoding='utf-8'):
        # Only the 'chk' operation is requested from the underlying Senna wrapper.
        super(SennaChunkTagger, self).__init__(path, ['chk'], encoding)

    def tag_sents(self, sentences):
        """
        Applies the tag method over a list of sentences. This method will return
        for each sentence a list of tuples of (word, tag).

        The annotation mappings returned by the parent class are replaced,
        in place, by ``(word, tag)`` tuples, where ``tag`` is the value stored
        under the ``'chk'`` key.
        """
        tagged_sents = super(SennaChunkTagger, self).tag_sents(sentences)
        for tagged_sent in tagged_sents:
            for j, annotations in enumerate(tagged_sent):
                tagged_sent[j] = (annotations['word'], annotations['chk'])
        return tagged_sents

    def bio_to_chunks(self, tagged_sent, chunk_type):
        """
        Extracts the chunks in a BIO chunk-tagged sentence.

        A tag belongs to ``chunk_type`` when the text after its first ``-``
        equals ``chunk_type`` exactly (so ``'PP'`` never matches ``'B-PRT'``).
        A ``B-`` tag always opens a new chunk, so two adjacent chunks of the
        same type are reported separately instead of being merged.

        >>> from nltk.tag import SennaChunkTagger
        >>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0')
        >>> sent = 'What is the airspeed of an unladen swallow ?'.split()
        >>> tagged_sent = chktagger.tag(sent) # doctest: +SKIP
        >>> tagged_sent # doctest: +SKIP
        [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'),
        ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'),
        ('?', 'O')]
        >>> list(chktagger.bio_to_chunks(tagged_sent, chunk_type='NP')) # doctest: +SKIP
        [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')]
        >>> adjacent = [('Alice', 'B-NP'), ('Bob', 'B-NP'), ('.', 'O')]
        >>> list(chktagger.bio_to_chunks(adjacent, chunk_type='NP')) # doctest: +SKIP
        [('Alice', '0'), ('Bob', '1')]

        :param tagged_sent: A list of tuples of word and BIO chunk tag.
        :type tagged_sent: list(tuple)
        :param chunk_type: The chunk tag that users want to extract, e.g. 'NP' or 'VP'
        :type chunk_type: str

        :return: An iterable of tuples of chunks that users want to extract
          and their corresponding indices.
        :rtype: iter(tuple(str))
        """
        current_chunk = []
        current_chunk_position = []
        for idx, (word, pos) in enumerate(tagged_sent):
            # Split e.g. 'B-NP' into ('B', '-', 'NP'); 'O' has no separator.
            prefix, sep, tag_type = pos.partition('-')
            in_chunk = bool(sep) and tag_type == chunk_type
            # Flush when we leave the requested chunk type, or when a 'B-'
            # tag signals that a new chunk of the same type begins here.
            if current_chunk and (not in_chunk or prefix == 'B'):
                _chunk_str = ' '.join(current_chunk)
                _chunk_pos_str = '-'.join(map(str, current_chunk_position))
                yield _chunk_str, _chunk_pos_str
                current_chunk = []
                current_chunk_position = []
            if in_chunk:  # Append the word to the current_chunk.
                current_chunk.append(word)
                current_chunk_position.append(idx)
        if current_chunk:  # Flush the last chunk.
            yield ' '.join(current_chunk), '-'.join(map(str, current_chunk_position))


@python_2_unicode_compatible
class SennaNERTagger(Senna):
    """Named-entity tagger that delegates to the SENNA wrapper class."""

    def __init__(self, path, encoding='utf-8'):
        # Only the 'ner' operation is requested from the underlying Senna wrapper.
        super(SennaNERTagger, self).__init__(path, ['ner'], encoding)

    def tag_sents(self, sentences):
        """
        Run the tagger over several sentences at once.

        Each annotation mapping returned by the parent class is overwritten,
        in place, with a ``(word, tag)`` tuple whose tag is the value stored
        under the ``'ner'`` key.

        :param sentences: the sentences to tag, forwarded unchanged to the
            parent ``tag_sents``.
        :return: for each sentence, a list of ``(word, tag)`` tuples.
        """
        results = super(SennaNERTagger, self).tag_sents(sentences)
        for row, sentence in enumerate(results):
            for col, entry in enumerate(sentence):
                # Keep only the surface word and its named-entity label.
                results[row][col] = (entry['word'], entry['ner'])
        return results


# skip doctests if Senna is not installed
def setup_module(module):
    """
    Module-level test fixture: skip this module's doctests when the SENNA
    executable cannot be located.

    :param module: the module object handed in by the test runner (unused).
    :raises SkipTest: if constructing ``Senna`` raises ``OSError``.
    """
    # Use the standard-library skip exception so that no third-party test
    # runner has to be installed merely to signal a skip.
    from unittest import SkipTest

    try:
        # Constructing the wrapper is the availability probe; the instance
        # itself is not needed afterwards.
        Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])
    except OSError:
        raise SkipTest("Senna executable not found")
3 2019-12-22 21:51:47 +01:00			`# encoding: utf-8`
			`# Natural Language Toolkit: Senna POS Tagger`
			`#`
			`# Copyright (C) 2001-2019 NLTK Project`
			`# Author: Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>`
			`# URL: <http://nltk.org/>`
			`# For license information, see LICENSE.TXT`

			`"""`
			`Senna POS tagger, NER Tagger, Chunk Tagger`

			`The input is:`
			`- path to the directory that contains SENNA executables. If the path is incorrect,`
			`SennaTagger will automatically search for executable file specified in SENNA environment variable`
			`- (optionally) the encoding of the input data (default:utf-8)`

			`Note: Unit tests for this module can be found in test/unit/test_senna.py`

			`>>> from nltk.tag import SennaTagger`
			`>>> tagger = SennaTagger('/usr/share/senna-v3.0')`
			`>>> tagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP`
			`[('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'),`
			`('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.')]`

			`>>> from nltk.tag import SennaChunkTagger`
			`>>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0')`
			`>>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP`
			`[('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'),`
			`('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'),`
			`('?', 'O')]`

			`>>> from nltk.tag import SennaNERTagger`
			`>>> nertagger = SennaNERTagger('/usr/share/senna-v3.0')`
			`>>> nertagger.tag('Shakespeare theatre was in London .'.split()) # doctest: +SKIP`
			`[('Shakespeare', 'B-PER'), ('theatre', 'O'), ('was', 'O'), ('in', 'O'),`
			`('London', 'B-LOC'), ('.', 'O')]`
			`>>> nertagger.tag('UN headquarters are in NY , USA .'.split()) # doctest: +SKIP`
			`[('UN', 'B-ORG'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'),`
			`('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')]`
			`"""`

			`from nltk.compat import python_2_unicode_compatible`
			`from nltk.classify import Senna`


			`@python_2_unicode_compatible`
			`class SennaTagger(Senna):`
			`def __init__(self, path, encoding='utf-8'):`
			`super(SennaTagger, self).__init__(path, ['pos'], encoding)`

			`def tag_sents(self, sentences):`
			`"""`
			`Applies the tag method over a list of sentences. This method will return`
			`for each sentence a list of tuples of (word, tag).`
			`"""`
			`tagged_sents = super(SennaTagger, self).tag_sents(sentences)`
			`for i in range(len(tagged_sents)):`
			`for j in range(len(tagged_sents[i])):`
			`annotations = tagged_sents[i][j]`
			`tagged_sents[i][j] = (annotations['word'], annotations['pos'])`
			`return tagged_sents`


			`@python_2_unicode_compatible`
			`class SennaChunkTagger(Senna):`
			`def __init__(self, path, encoding='utf-8'):`
			`super(SennaChunkTagger, self).__init__(path, ['chk'], encoding)`

			`def tag_sents(self, sentences):`
			`"""`
			`Applies the tag method over a list of sentences. This method will return`
			`for each sentence a list of tuples of (word, tag).`
			`"""`
			`tagged_sents = super(SennaChunkTagger, self).tag_sents(sentences)`
			`for i in range(len(tagged_sents)):`
			`for j in range(len(tagged_sents[i])):`
			`annotations = tagged_sents[i][j]`
			`tagged_sents[i][j] = (annotations['word'], annotations['chk'])`
			`return tagged_sents`

			`def bio_to_chunks(self, tagged_sent, chunk_type):`
			`"""`
			`Extracts the chunks in a BIO chunk-tagged sentence.`

			`>>> from nltk.tag import SennaChunkTagger`
			`>>> chktagger = SennaChunkTagger('/usr/share/senna-v3.0')`
			`>>> sent = 'What is the airspeed of an unladen swallow ?'.split()`
			`>>> tagged_sent = chktagger.tag(sent) # doctest: +SKIP`
			`>>> tagged_sent # doctest: +SKIP`
			`[('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'),`
			`('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'),`
			`('?', 'O')]`
			`>>> list(chktagger.bio_to_chunks(tagged_sent, chunk_type='NP')) # doctest: +SKIP`
			`[('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')]`

			`:param tagged_sent: A list of tuples of word and BIO chunk tag.`
			`:type tagged_sent: list(tuple)`
			`:param tagged_sent: The chunk tag that users want to extract, e.g. 'NP' or 'VP'`
			`:type tagged_sent: str`

			`:return: An iterable of tuples of chunks that users want to extract`
			`and their corresponding indices.`
			`:rtype: iter(tuple(str))`
			`"""`
			`current_chunk = []`
			`current_chunk_position = []`
			`for idx, word_pos in enumerate(tagged_sent):`
			`word, pos = word_pos`
			`if '-' + chunk_type in pos: # Append the word to the current_chunk.`
			`current_chunk.append((word))`
			`current_chunk_position.append((idx))`
			`else:`
			`if current_chunk: # Flush the full chunk when out of an NP.`
			`_chunk_str = ' '.join(current_chunk)`
			`_chunk_pos_str = '-'.join(map(str, current_chunk_position))`
			`yield _chunk_str, _chunk_pos_str`
			`current_chunk = []`
			`current_chunk_position = []`
			`if current_chunk: # Flush the last chunk.`
			`yield ' '.join(current_chunk), '-'.join(map(str, current_chunk_position))`


			`@python_2_unicode_compatible`
			`class SennaNERTagger(Senna):`
			`def __init__(self, path, encoding='utf-8'):`
			`super(SennaNERTagger, self).__init__(path, ['ner'], encoding)`

			`def tag_sents(self, sentences):`
			`"""`
			`Applies the tag method over a list of sentences. This method will return`
			`for each sentence a list of tuples of (word, tag).`
			`"""`
			`tagged_sents = super(SennaNERTagger, self).tag_sents(sentences)`
			`for i in range(len(tagged_sents)):`
			`for j in range(len(tagged_sents[i])):`
			`annotations = tagged_sents[i][j]`
			`tagged_sents[i][j] = (annotations['word'], annotations['ner'])`
			`return tagged_sents`


			`# skip doctests if Senna is not installed`
			`def setup_module(module):`
			`from nose import SkipTest`

			`try:`
			`tagger = Senna('/usr/share/senna-v3.0', ['pos', 'chk', 'ner'])`
			`except OSError:`
			`raise SkipTest("Senna executable not found")`