152 lines
5.1 KiB
Python
152 lines
5.1 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
# Natural Language Toolkit: Interface to the HunPos POS-tagger
|
||
|
#
|
||
|
# Copyright (C) 2001-2019 NLTK Project
|
||
|
# Author: Peter Ljunglöf <peter.ljunglof@heatherleaf.se>
|
||
|
# Dávid Márk Nemeskey <nemeskeyd@gmail.com> (modifications)
|
||
|
# Attila Zséder <zseder@gmail.com> (modifications)
|
||
|
# URL: <http://nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
"""
|
||
|
A module for interfacing with the HunPos open-source POS-tagger.
|
||
|
"""
|
||
|
|
||
|
import os
|
||
|
from subprocess import Popen, PIPE
|
||
|
|
||
|
from six import text_type
|
||
|
|
||
|
from nltk.internals import find_binary, find_file
|
||
|
from nltk.tag.api import TaggerI
|
||
|
|
||
|
_hunpos_url = 'http://code.google.com/p/hunpos/'
|
||
|
|
||
|
_hunpos_charset = 'ISO-8859-1'
|
||
|
"""The default encoding used by hunpos: ISO-8859-1."""
|
||
|
|
||
|
|
||
|
class HunposTagger(TaggerI):
    """
    A class for pos tagging with HunPos. The input is the paths to:
     - a model trained on training data
     - (optionally) the path to the hunpos-tag binary
     - (optionally) the encoding of the training data (default: ISO-8859-1)

    Example:

        >>> from nltk.tag import HunposTagger
        >>> ht = HunposTagger('en_wsj.model')
        >>> ht.tag('What is the airspeed of an unladen swallow ?'.split())
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')]
        >>> ht.close()

    This class communicates with the hunpos-tag binary via pipes. When the
    tagger object is no longer needed, the close() method should be called to
    free system resources. The class supports the context manager interface; if
    used in a with statement, the close() method is invoked automatically:

        >>> with HunposTagger('en_wsj.model') as ht:
        ...     ht.tag('What is the airspeed of an unladen swallow ?'.split())
        ...
        [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')]
    """

    def __init__(
        self, path_to_model, path_to_bin=None, encoding=_hunpos_charset, verbose=False
    ):
        """
        Starts the hunpos-tag executable and establishes a connection with it.

        :param path_to_model: The model file.
        :param path_to_bin: The hunpos-tag binary.
        :param encoding: The encoding used by the model. Unicode tokens
            passed to the tag() and tag_sents() methods are converted to
            this charset when they are sent to hunpos-tag.
            The default is ISO-8859-1 (Latin-1).

            This parameter is ignored for byte-string tokens (``str`` on
            Python 2, ``bytes`` on Python 3), which are sent as-is.
            The caller must ensure that tokens are encoded in the right charset.
        :param verbose: Forwarded to find_binary() and find_file() when the
            executable and the model are located.
        """
        # Mark the object as closed first, so that __del__ -> close() is a
        # harmless no-op if anything below raises before the process exists.
        self._closed = True
        hunpos_paths = [
            '.',
            '/usr/bin',
            '/usr/local/bin',
            '/opt/local/bin',
            '/Applications/bin',
            '~/bin',
            '~/Applications/bin',
        ]
        hunpos_paths = list(map(os.path.expanduser, hunpos_paths))

        self._hunpos_bin = find_binary(
            'hunpos-tag',
            path_to_bin,
            env_vars=('HUNPOS_TAGGER',),
            searchpath=hunpos_paths,
            url=_hunpos_url,
            verbose=verbose,
        )

        self._hunpos_model = find_file(
            path_to_model, env_vars=('HUNPOS_TAGGER',), verbose=verbose
        )
        self._encoding = encoding
        # All three standard streams are pipes, so every read and write on
        # them deals in byte strings.
        self._hunpos = Popen(
            [self._hunpos_bin, self._hunpos_model],
            shell=False,
            stdin=PIPE,
            stdout=PIPE,
            stderr=PIPE,
        )
        self._closed = False

    def __del__(self):
        self.close()

    def close(self):
        """Closes the pipe to the hunpos executable.

        Calling this more than once is harmless: later calls do nothing.
        """
        if self._closed:
            return
        try:
            self._hunpos.communicate()
        finally:
            # Flag the tagger as closed even when communicate() fails, so
            # that __del__ does not retry the same failing shutdown.
            self._closed = True

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def tag(self, tokens):
        """Tags a single sentence: a list of words.
        The tokens should not contain any newline characters.

        :param tokens: The words of the sentence. Unicode tokens are encoded
            with the encoding given to the constructor; byte-string tokens
            are sent unchanged.
        :return: A list of ``(token, tag)`` pairs, one per input token and
            in input order. ``token`` is the object that was passed in;
            ``tag`` is the second tab-separated field of the matching output
            line, as a byte string read from the pipe, or None when the line
            has no such field.
        """
        # Materialise the input: it is traversed twice (once to send, once to
        # pair the answers), which a one-shot iterator would not survive.
        tokens = list(tokens)

        # Validate and encode everything BEFORE touching the pipe, so that a
        # bad token cannot leave a half-written sentence behind and
        # desynchronise every later call.
        encoded_tokens = []
        for token in tokens:
            if isinstance(token, text_type):
                assert u"\n" not in token, "Tokens should not contain newlines"
                encoded_tokens.append(token.encode(self._encoding))
            else:
                # Byte strings need a byte-string needle: on Python 3,
                # ``"\n" in b"..."`` raises TypeError.
                assert b"\n" not in token, "Tokens should not contain newlines"
                encoded_tokens.append(token)

        to_hunpos = self._hunpos.stdin
        for encoded in encoded_tokens:
            to_hunpos.write(encoded + b"\n")
        # We write a final empty line to tell hunpos that the sentence is finished:
        to_hunpos.write(b"\n")
        to_hunpos.flush()

        from_hunpos = self._hunpos.stdout
        tagged_tokens = []
        for token in tokens:
            fields = from_hunpos.readline().strip().split(b"\t")
            tag = fields[1] if len(fields) > 1 else None
            tagged_tokens.append((token, tag))
        # We have to read (and dismiss) the final empty line:
        from_hunpos.readline()

        return tagged_tokens
|
||
|
|
||
|
|
||
|
# skip doctests if Hunpos tagger is not installed
|
||
|
def setup_module(module):
    """Skip this module's doctests when the HunPos tagger cannot be set up.

    A tagger is constructed purely as an availability probe. If the
    constructor raises LookupError, nose's SkipTest is raised instead so the
    doctests are skipped rather than reported as failures.

    :param module: The module object supplied by the test runner (unused).
    """
    from nose import SkipTest

    try:
        probe = HunposTagger('en_wsj.model')
    except LookupError:
        raise SkipTest("HunposTagger is not available")
    # The probe started a real hunpos-tag process; shut it down right away
    # instead of waiting for the garbage collector to run __del__.
    probe.close()
|