250 lines
8.3 KiB
Python
250 lines
8.3 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
# Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers
|
||
|
#
|
||
|
# Copyright (C) 2001-2019 NLTK Project
|
||
|
# Author: Nitin Madnani <nmadnani@ets.org>
|
||
|
# Rami Al-Rfou' <ralrfou@cs.stonybrook.edu>
|
||
|
# URL: <http://nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
"""
|
||
|
A module for interfacing with the Stanford taggers.
|
||
|
|
||
|
Tagger models need to be downloaded from https://nlp.stanford.edu/software
|
||
|
and the STANFORD_MODELS environment variable set (a colon-separated
|
||
|
list of paths).
|
||
|
|
||
|
For more details see the documentation for StanfordPOSTagger and StanfordNERTagger.
|
||
|
"""
|
||
|
|
||
|
from abc import abstractmethod
|
||
|
import os
|
||
|
import tempfile
|
||
|
from subprocess import PIPE
|
||
|
import warnings
|
||
|
|
||
|
from six import text_type
|
||
|
|
||
|
from nltk.internals import find_file, find_jar, config_java, java, _java_options
|
||
|
from nltk.tag.api import TaggerI
|
||
|
|
||
|
_stanford_url = 'https://nlp.stanford.edu/software'
|
||
|
|
||
|
|
||
|
class StanfordTagger(TaggerI):
    """
    An interface to Stanford taggers. Subclasses must define:

    - ``_cmd`` property: A property that returns the command that will be
      executed.
    - ``_SEPARATOR``: Class constant that represents that character that
      is used to separate the tokens from their tags.
    - ``_JAR`` file: Class constant that represents the jar file name.
    """

    _SEPARATOR = ''
    _JAR = ''

    def __init__(
        self,
        model_filename,
        path_to_jar=None,
        encoding='utf8',
        verbose=False,
        java_options='-mx1000m',
    ):
        # Raise deprecation warning.
        warnings.warn(
            "\nThe StanfordTokenizer will "
            "be deprecated in version 3.2.6.\n"
            "Please use \033[91mnltk.parse.corenlp.CoreNLPParser\033[0m instead.",
            DeprecationWarning,
            stacklevel=2,
        )

        # Instantiating the base class directly cannot work: _JAR is empty,
        # so find_jar below would have nothing to look for.
        if not self._JAR:
            warnings.warn(
                'The StanfordTagger class is not meant to be '
                'instantiated directly. Did you mean '
                'StanfordPOSTagger or StanfordNERTagger?'
            )
        self._stanford_jar = find_jar(
            self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose
        )

        # Model files are located via the STANFORD_MODELS environment variable.
        self._stanford_model = find_file(
            model_filename, env_vars=('STANFORD_MODELS',), verbose=verbose
        )

        self._encoding = encoding
        self.java_options = java_options

    @property
    @abstractmethod
    def _cmd(self):
        """
        A property that returns the command that will be executed.
        """

    def tag(self, tokens):
        """Tag a single tokenized sentence; return a list of (token, tag) tuples."""
        # tag_sents returns a list of sentences; flatten the single result.
        return sum(self.tag_sents([tokens]), [])

    def tag_sents(self, sentences):
        """
        Tag multiple tokenized sentences by running the Stanford tagger as an
        external Java process over a temporary input file.

        :param sentences: iterable of tokenized sentences (lists of strings)
        :return: list of tagged sentences, each a list of (token, tag) tuples
        """
        encoding = self._encoding
        default_options = ' '.join(_java_options)
        config_java(options=self.java_options, verbose=False)

        # Create a temporary input file; _cmd implementations reference
        # self._input_file_path, so it must be set before cmd is built.
        _input_fh, self._input_file_path = tempfile.mkstemp(text=True)

        try:
            cmd = list(self._cmd)
            cmd.extend(['-encoding', encoding])

            # Write the actual sentences to the temporary input file,
            # one space-joined sentence per line.
            _input_fh = os.fdopen(_input_fh, 'wb')
            _input = '\n'.join(' '.join(x) for x in sentences)
            if isinstance(_input, text_type) and encoding:
                _input = _input.encode(encoding)
            _input_fh.write(_input)
            _input_fh.close()

            # Run the tagger and get the output
            stanpos_output, _stderr = java(
                cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE
            )
            stanpos_output = stanpos_output.decode(encoding)
        finally:
            # Clean up even if the Java call or decoding fails: delete the
            # temporary file and return java configurations to their defaults.
            os.unlink(self._input_file_path)
            config_java(options=default_options, verbose=False)

        return self.parse_output(stanpos_output, sentences)

    def parse_output(self, text, sentences=None):
        """
        Parse the tagger's raw text output into tagged sentences.

        Each output line is one sentence; each token is split on _SEPARATOR,
        with the last field taken as the tag and the rest rejoined as the word.
        """
        tagged_sentences = []
        for tagged_sentence in text.strip().split("\n"):
            sentence = []
            for tagged_word in tagged_sentence.strip().split():
                word_tags = tagged_word.strip().split(self._SEPARATOR)
                sentence.append((''.join(word_tags[:-1]), word_tags[-1]))
            tagged_sentences.append(sentence)
        return tagged_sentences
|
||
|
|
||
|
|
||
|
class StanfordPOSTagger(StanfordTagger):
    """
    A class for pos tagging with Stanford Tagger. The input is the paths to:
    - a model trained on training data
    - (optionally) the path to the stanford tagger jar file. If not specified
      here, then this jar file must be specified in the CLASSPATH environment
      variable.
    - (optionally) the encoding of the training data (default: UTF-8)

    Example:

    >>> from nltk.tag import StanfordPOSTagger
    >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    >>> st.tag('What is the airspeed of an unladen swallow ?'.split())
    [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
    """

    _SEPARATOR = '_'
    _JAR = 'stanford-postagger.jar'

    def __init__(self, *args, **kwargs):
        super(StanfordPOSTagger, self).__init__(*args, **kwargs)

    @property
    def _cmd(self):
        # Invoke the maxent POS tagger on the pre-tokenized temporary input
        # file; tokenization is disabled since tag_sents already tokenized.
        model_path = self._stanford_model
        text_path = self._input_file_path
        return [
            'edu.stanford.nlp.tagger.maxent.MaxentTagger',
            '-model', model_path,
            '-textFile', text_path,
            '-tokenize', 'false',
            '-outputFormatOptions', 'keepEmptySentences',
        ]
|
||
|
|
||
|
|
||
|
class StanfordNERTagger(StanfordTagger):
    """
    A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to:

    - a model trained on training data
    - (optionally) the path to the stanford tagger jar file. If not specified
      here, then this jar file must be specified in the CLASSPATH environment
      variable.
    - (optionally) the encoding of the training data (default: UTF-8)

    Example:

    >>> from nltk.tag import StanfordNERTagger
    >>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP
    >>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP
    [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'),
    ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'),
    ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'LOCATION')]
    """

    _SEPARATOR = '/'
    _JAR = 'stanford-ner.jar'
    _FORMAT = 'slashTags'

    def __init__(self, *args, **kwargs):
        super(StanfordNERTagger, self).__init__(*args, **kwargs)

    @property
    def _cmd(self):
        # -tokenizerFactory WhitespaceTokenizer with tokenizeNLs=false keeps
        # the CRF classifier from re-tokenizing our pre-tokenized input.
        return [
            'edu.stanford.nlp.ie.crf.CRFClassifier',
            '-loadClassifier', self._stanford_model,
            '-textFile', self._input_file_path,
            '-outputFormat', self._FORMAT,
            '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer',
            '-tokenizerOptions', '"tokenizeNLs=false"',
        ]

    def parse_output(self, text, sentences):
        """Parse slashTags output, regrouping tokens by the input sentences."""
        if self._FORMAT == 'slashTags':
            # Flatten the tagger output into one long list of (word, tag)
            # pairs, since sentence boundaries in the output are unreliable.
            flat_tokens = []
            for output_line in text.strip().split("\n"):
                for slash_token in output_line.strip().split():
                    pieces = slash_token.strip().split(self._SEPARATOR)
                    flat_tokens.append((''.join(pieces[:-1]), pieces[-1]))

            # Re-split the flat list so each output sentence has exactly the
            # length of the corresponding input sentence.
            result = []
            offset = 0
            for sent in sentences:
                result.append(flat_tokens[offset : offset + len(sent)])
                offset += len(sent)
            return result

        raise NotImplementedError
|
||
|
|
||
|
|
||
|
def setup_module(module):
    """
    Skip this module's doctests when the Stanford jars/models cannot be found.

    Note: the original message used a backslash line-continuation *inside* the
    string literal, which embedded a newline and a run of indentation
    whitespace into the skip message; implicit concatenation fixes that.
    """
    from nose import SkipTest

    try:
        StanfordPOSTagger('english-bidirectional-distsim.tagger')
    except LookupError:
        raise SkipTest(
            'Doctests from nltk.tag.stanford are skipped because one '
            'of the stanford jars cannot be found.'
        )
|