138 lines
3.9 KiB
Python
138 lines
3.9 KiB
Python
|
# Natural Language Toolkit: Tagset Mapping
|
||
|
#
|
||
|
# Copyright (C) 2001-2019 NLTK Project
|
||
|
# Author: Nathan Schneider <nathan@cmu.edu>
|
||
|
# Steven Bird <stevenbird1@gmail.com>
|
||
|
# URL: <http://nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
"""
|
||
|
Interface for converting POS tags from various treebanks
|
||
|
to the universal tagset of Petrov, Das, & McDonald.
|
||
|
|
||
|
The tagset consists of the following 12 coarse tags:
|
||
|
|
||
|
VERB - verbs (all tenses and modes)
|
||
|
NOUN - nouns (common and proper)
|
||
|
PRON - pronouns
|
||
|
ADJ - adjectives
|
||
|
ADV - adverbs
|
||
|
ADP - adpositions (prepositions and postpositions)
|
||
|
CONJ - conjunctions
|
||
|
DET - determiners
|
||
|
NUM - cardinal numbers
|
||
|
PRT - particles or other function words
|
||
|
X - other: foreign words, typos, abbreviations
|
||
|
. - punctuation
|
||
|
|
||
|
@see: http://arxiv.org/abs/1104.2086 and http://code.google.com/p/universal-pos-tags/
|
||
|
|
||
|
"""
|
||
|
|
||
|
from __future__ import print_function, unicode_literals, division
|
||
|
from collections import defaultdict
|
||
|
from os.path import join
|
||
|
|
||
|
from nltk.data import load
|
||
|
|
||
|
# Resource path prefix; _load_universal_map() joins it with '<fileid>.map'
# and hands the result to nltk.data.load().
_UNIVERSAL_DATA = "taggers/universal_tagset"

# Coarse tags accepted on the right-hand side of a '.map' entry.
_UNIVERSAL_TAGS = (
    'VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP',
    'CONJ', 'DET', 'NUM', 'PRT', 'X', '.',
)

# Cache of mappings, indexed as _MAPPINGS[source][target][tag].
# Every level is created on first access; by default the mapping between
# tagset T1 and T2 returns 'UNK' if applied to an unrecognized tag.
_MAPPINGS = defaultdict(
    lambda: defaultdict(
        lambda: defaultdict(lambda: 'UNK')
    )
)
|
||
|
|
||
|
|
||
|
def _load_universal_map(fileid):
    """
    Load the ``<fileid>.map`` resource and cache it as the mapping from the
    tagset ``fileid`` to the universal tagset.

    Every non-blank line of the resource must consist of a fine-grained tag
    and a universal tag separated by a single tab.  The parsed table is
    stored in ``_MAPPINGS[fileid]['universal']`` only after the whole
    resource has been validated, so a malformed file never leaves a partial
    table in the cache.

    :param fileid: name of the source tagset, e.g. ``'en-ptb'``
    :type fileid: str
    :raise ValueError: if a line is not of the form ``fine<TAB>coarse``
    :raise AssertionError: if a coarse tag is not one of ``_UNIVERSAL_TAGS``
        or a fine-grained tag is listed more than once
    """
    contents = load(join(_UNIVERSAL_DATA, fileid + '.map'), format="text")

    # When mapping to the Universal Tagset,
    # map unknown inputs to 'X' not 'UNK'
    mapping = defaultdict(lambda: 'X')

    for line in contents.splitlines():
        line = line.strip()
        if not line:
            continue

        fields = line.split('\t')
        if len(fields) != 2:
            raise ValueError(
                'Malformed entry in {}.map: {!r}'.format(fileid, line)
            )
        fine, coarse = fields

        # Raised explicitly instead of via ``assert`` so the checks still
        # run under ``python -O``; the exception type is unchanged.
        if coarse not in _UNIVERSAL_TAGS:
            raise AssertionError('Unexpected coarse tag: {}'.format(coarse))
        if fine in mapping:
            raise AssertionError(
                'Multiple entries for original tag: {}'.format(fine)
            )

        mapping[fine] = coarse

    # Publish the table only once it is complete.
    _MAPPINGS[fileid]['universal'] = mapping
|
||
|
|
||
|
|
||
|
# Mapping for the new Russian National Corpus tagset.  It is kept in code
# because the Russian model for nltk.pos_tag() uses it.
_RU_RNC_NEW_TO_UNIVERSAL = {
    'A': 'ADJ',
    'A-PRO': 'PRON',
    'ADV': 'ADV',
    'ADV-PRO': 'PRON',
    'ANUM': 'ADJ',
    'CONJ': 'CONJ',
    'INTJ': 'X',
    'NONLEX': '.',
    'NUM': 'NUM',
    'PARENTH': 'PRT',
    'PART': 'PRT',
    'PR': 'ADP',
    'PRAEDIC': 'PRT',
    'PRAEDIC-PRO': 'PRON',
    'S': 'NOUN',
    'S-PRO': 'PRON',
    'V': 'VERB',
}


def tagset_mapping(source, target):
    """
    Retrieve the mapping dictionary between tagsets.

    >>> tagset_mapping('ru-rnc', 'universal') == {'!': '.', 'A': 'ADJ', 'C': 'CONJ', 'AD': 'ADV',
    ...     'NN': 'NOUN', 'VG': 'VERB', 'COMP': 'CONJ', 'NC': 'NUM', 'VP': 'VERB', 'P': 'ADP',
    ...     'IJ': 'X', 'V': 'VERB', 'Z': 'X', 'VI': 'VERB', 'YES_NO_SENT': 'X', 'PTCL': 'PRT'}
    True

    Mappings are built on first request and cached in ``_MAPPINGS``.  For
    ``target == 'universal'`` the table comes from ``_load_universal_map``,
    except for ``'ru-rnc-new'``, which is served from the built-in table
    without reading any resource.  For any other target nothing is loaded
    and the (initially empty) cached mapping is returned.

    :param source: name of the tagset to map from
    :param target: name of the tagset to map to
    :return: the cached mapping for the ``source``/``target`` pair
    """
    if source not in _MAPPINGS or target not in _MAPPINGS[source]:
        if target == 'universal':
            if source == 'ru-rnc-new':
                # Unknown tags map to 'X', the same default that
                # file-loaded universal mappings use.
                mapping = defaultdict(lambda: 'X')
                mapping.update(_RU_RNC_NEW_TO_UNIVERSAL)
                _MAPPINGS[source][target] = mapping
            else:
                _load_universal_map(source)

    return _MAPPINGS[source][target]
|
||
|
|
||
|
|
||
|
def map_tag(source, target, source_tag):
    """
    Translate a single tag from the ``source`` tagset into ``target``.

    When ``target`` is ``'universal'``, the corpus names ``'wsj'`` and
    ``'brown'`` are accepted as aliases for ``'en-ptb'`` and ``'en-brown'``.

    >>> map_tag('en-ptb', 'universal', 'VBZ')
    'VERB'
    >>> map_tag('en-ptb', 'universal', 'VBP')
    'VERB'
    >>> map_tag('en-ptb', 'universal', '``')
    '.'

    :param source: name of the tagset ``source_tag`` belongs to
    :param target: name of the tagset to map into
    :param source_tag: the tag to convert
    :return: the entry for ``source_tag`` in ``tagset_mapping(source, target)``
    """
    # we need a systematic approach to naming
    if target == 'universal':
        corpus_aliases = {'wsj': 'en-ptb', 'brown': 'en-brown'}
        source = corpus_aliases.get(source, source)

    mapping = tagset_mapping(source, target)
    return mapping[source_tag]
|