PCQRSCANER/venv/Lib/site-packages/nltk/tag/mapping.py

# Natural Language Toolkit: Tagset Mapping
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Nathan Schneider <nathan@cmu.edu>
#         Steven Bird <stevenbird1@gmail.com>
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
Interface for converting POS tags from various treebanks
to the universal tagset of Petrov, Das, & McDonald.

The tagset consists of the following 12 coarse tags:

VERB - verbs (all tenses and modes)
NOUN - nouns (common and proper)
PRON - pronouns
ADJ - adjectives
ADV - adverbs
ADP - adpositions (prepositions and postpositions)
CONJ - conjunctions
DET - determiners
NUM - cardinal numbers
PRT - particles or other function words
X - other: foreign words, typos, abbreviations
. - punctuation

@see: http://arxiv.org/abs/1104.2086 and http://code.google.com/p/universal-pos-tags/

"""

from __future__ import print_function, unicode_literals, division
from collections import defaultdict
from os.path import join

from nltk.data import load

_UNIVERSAL_DATA = "taggers/universal_tagset"
_UNIVERSAL_TAGS = (
    'VERB',
    'NOUN',
    'PRON',
    'ADJ',
    'ADV',
    'ADP',
    'CONJ',
    'DET',
    'NUM',
    'PRT',
    'X',
    '.',
)

# _MAPPINGS = defaultdict(lambda: defaultdict(dict))
# the mapping between tagset T1 and T2 returns UNK if appied to an unrecognized tag
_MAPPINGS = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 'UNK')))


def _load_universal_map(fileid):
    contents = load(join(_UNIVERSAL_DATA, fileid + '.map'), format="text")

    # When mapping to the Universal Tagset,
    # map unknown inputs to 'X' not 'UNK'
    _MAPPINGS[fileid]['universal'].default_factory = lambda: 'X'

    for line in contents.splitlines():
        line = line.strip()
        if line == '':
            continue
        fine, coarse = line.split('\t')

        assert coarse in _UNIVERSAL_TAGS, 'Unexpected coarse tag: {}'.format(coarse)
        assert (
            fine not in _MAPPINGS[fileid]['universal']
        ), 'Multiple entries for original tag: {}'.format(fine)

        _MAPPINGS[fileid]['universal'][fine] = coarse


def tagset_mapping(source, target):
    """
    Retrieve the mapping dictionary between tagsets.

    >>> tagset_mapping('ru-rnc', 'universal') == {'!': '.', 'A': 'ADJ', 'C': 'CONJ', 'AD': 'ADV',\
            'NN': 'NOUN', 'VG': 'VERB', 'COMP': 'CONJ', 'NC': 'NUM', 'VP': 'VERB', 'P': 'ADP',\
            'IJ': 'X', 'V': 'VERB', 'Z': 'X', 'VI': 'VERB', 'YES_NO_SENT': 'X', 'PTCL': 'PRT'}
    True
    """

    if source not in _MAPPINGS or target not in _MAPPINGS[source]:
        if target == 'universal':
            _load_universal_map(source)
            # Added the new Russian National Corpus mappings because the
            # Russian model for nltk.pos_tag() uses it.
            _MAPPINGS['ru-rnc-new']['universal'] = {
                'A': 'ADJ',
                'A-PRO': 'PRON',
                'ADV': 'ADV',
                'ADV-PRO': 'PRON',
                'ANUM': 'ADJ',
                'CONJ': 'CONJ',
                'INTJ': 'X',
                'NONLEX': '.',
                'NUM': 'NUM',
                'PARENTH': 'PRT',
                'PART': 'PRT',
                'PR': 'ADP',
                'PRAEDIC': 'PRT',
                'PRAEDIC-PRO': 'PRON',
                'S': 'NOUN',
                'S-PRO': 'PRON',
                'V': 'VERB',
            }

    return _MAPPINGS[source][target]


def map_tag(source, target, source_tag):
    """
    Maps the tag from the source tagset to the target tagset.

    >>> map_tag('en-ptb', 'universal', 'VBZ')
    'VERB'
    >>> map_tag('en-ptb', 'universal', 'VBP')
    'VERB'
    >>> map_tag('en-ptb', 'universal', '``')
    '.'
    """

    # we need a systematic approach to naming
    if target == 'universal':
        if source == 'wsj':
            source = 'en-ptb'
        if source == 'brown':
            source = 'en-brown'

    return tagset_mapping(source, target)[source_tag]
3 2019-12-22 21:51:47 +01:00			`# Natural Language Toolkit: Tagset Mapping`
			`#`
			`# Copyright (C) 2001-2019 NLTK Project`
			`# Author: Nathan Schneider <nathan@cmu.edu>`
			`# Steven Bird <stevenbird1@gmail.com>`
			`# URL: <http://nltk.org/>`
			`# For license information, see LICENSE.TXT`

			`"""`
			`Interface for converting POS tags from various treebanks`
			`to the universal tagset of Petrov, Das, & McDonald.`

			`The tagset consists of the following 12 coarse tags:`

			`VERB - verbs (all tenses and modes)`
			`NOUN - nouns (common and proper)`
			`PRON - pronouns`
			`ADJ - adjectives`
			`ADV - adverbs`
			`ADP - adpositions (prepositions and postpositions)`
			`CONJ - conjunctions`
			`DET - determiners`
			`NUM - cardinal numbers`
			`PRT - particles or other function words`
			`X - other: foreign words, typos, abbreviations`
			`. - punctuation`

			`@see: http://arxiv.org/abs/1104.2086 and http://code.google.com/p/universal-pos-tags/`

			`"""`

			`from __future__ import print_function, unicode_literals, division`
			`from collections import defaultdict`
			`from os.path import join`

			`from nltk.data import load`

			`_UNIVERSAL_DATA = "taggers/universal_tagset"`
			`_UNIVERSAL_TAGS = (`
			`'VERB',`
			`'NOUN',`
			`'PRON',`
			`'ADJ',`
			`'ADV',`
			`'ADP',`
			`'CONJ',`
			`'DET',`
			`'NUM',`
			`'PRT',`
			`'X',`
			`'.',`
			`)`

			`# _MAPPINGS = defaultdict(lambda: defaultdict(dict))`
			`# the mapping between tagset T1 and T2 returns UNK if appied to an unrecognized tag`
			`_MAPPINGS = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 'UNK')))`


			`def _load_universal_map(fileid):`
			`contents = load(join(_UNIVERSAL_DATA, fileid + '.map'), format="text")`

			`# When mapping to the Universal Tagset,`
			`# map unknown inputs to 'X' not 'UNK'`
			`_MAPPINGS[fileid]['universal'].default_factory = lambda: 'X'`

			`for line in contents.splitlines():`
			`line = line.strip()`
			`if line == '':`
			`continue`
			`fine, coarse = line.split('\t')`

			`assert coarse in _UNIVERSAL_TAGS, 'Unexpected coarse tag: {}'.format(coarse)`
			`assert (`
			`fine not in _MAPPINGS[fileid]['universal']`
			`), 'Multiple entries for original tag: {}'.format(fine)`

			`_MAPPINGS[fileid]['universal'][fine] = coarse`


			`def tagset_mapping(source, target):`
			`"""`
			`Retrieve the mapping dictionary between tagsets.`

			`>>> tagset_mapping('ru-rnc', 'universal') == {'!': '.', 'A': 'ADJ', 'C': 'CONJ', 'AD': 'ADV',\`
			`'NN': 'NOUN', 'VG': 'VERB', 'COMP': 'CONJ', 'NC': 'NUM', 'VP': 'VERB', 'P': 'ADP',\`
			`'IJ': 'X', 'V': 'VERB', 'Z': 'X', 'VI': 'VERB', 'YES_NO_SENT': 'X', 'PTCL': 'PRT'}`
			`True`
			`"""`

			`if source not in _MAPPINGS or target not in _MAPPINGS[source]:`
			`if target == 'universal':`
			`_load_universal_map(source)`
			`# Added the new Russian National Corpus mappings because the`
			`# Russian model for nltk.pos_tag() uses it.`
			`_MAPPINGS['ru-rnc-new']['universal'] = {`
			`'A': 'ADJ',`
			`'A-PRO': 'PRON',`
			`'ADV': 'ADV',`
			`'ADV-PRO': 'PRON',`
			`'ANUM': 'ADJ',`
			`'CONJ': 'CONJ',`
			`'INTJ': 'X',`
			`'NONLEX': '.',`
			`'NUM': 'NUM',`
			`'PARENTH': 'PRT',`
			`'PART': 'PRT',`
			`'PR': 'ADP',`
			`'PRAEDIC': 'PRT',`
			`'PRAEDIC-PRO': 'PRON',`
			`'S': 'NOUN',`
			`'S-PRO': 'PRON',`
			`'V': 'VERB',`
			`}`

			`return _MAPPINGS[source][target]`


			`def map_tag(source, target, source_tag):`
			`"""`
			`Maps the tag from the source tagset to the target tagset.`

			`>>> map_tag('en-ptb', 'universal', 'VBZ')`
			`'VERB'`
			`>>> map_tag('en-ptb', 'universal', 'VBP')`
			`'VERB'`
			>>> map_tag('en-ptb', 'universal', '``')
			`'.'`
			`"""`

			`# we need a systematic approach to naming`
			`if target == 'universal':`
			`if source == 'wsj':`
			`source = 'en-ptb'`
			`if source == 'brown':`
			`source = 'en-brown'`

			`return tagset_mapping(source, target)[source_tag]`