# Natural Language Toolkit: Maximum Entropy Classifiers
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Edward Loper <edloper@gmail.com>
#         Dmitry Chichkov <dchichkov@gmail.com> (TypedMaxentFeatureEncoding)
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

"""
A classifier model based on the maximum entropy modeling framework.  This
framework considers all of the probability distributions that are
empirically consistent with the training data; and chooses the
distribution with the highest entropy.  A probability distribution is
"empirically consistent" with a set of training data if its estimated
frequency with which a class and a feature vector value co-occur is
equal to the actual frequency in the data.

Terminology: 'feature'
======================
The term *feature* is usually used to refer to some property of an
unlabeled token.  For example, when performing word sense
disambiguation, we might define a ``'prevword'`` feature whose value is
the word preceding the target word.  However, in the context of
maxent modeling, the term *feature* is typically used to refer to a
property of a "labeled" token.  In order to prevent confusion, we
will introduce two distinct terms to disambiguate these two different
concepts:

  - An "input-feature" is a property of an unlabeled token.
  - A "joint-feature" is a property of a labeled token.

In the rest of the ``nltk.classify`` module, the term "features" is
used to refer to what we will call "input-features" in this module.

In literature that describes and discusses maximum entropy models,
input-features are typically called "contexts", and joint-features
are simply referred to as "features".

Converting Input-Features to Joint-Features
-------------------------------------------
In maximum entropy models, joint-features are required to have numeric
values.  Typically, each input-feature ``input_feat`` is mapped to a
set of joint-features of the form:

|   joint_feat(token, label) = { 1 if input_feat(token) == feat_val
|                              {      and label == some_label
|                              {
|                              { 0 otherwise

for all values of ``feat_val`` and ``some_label``.  This mapping is
performed by classes that implement the ``MaxentFeatureEncodingI``
interface.
"""

from __future__ import print_function, unicode_literals

try:
    import numpy
except ImportError:
    pass

import tempfile
import os
from collections import defaultdict

from six import integer_types

from nltk import compat
from nltk.data import gzip_open_unicode
from nltk.util import OrderedDict
from nltk.probability import DictionaryProbDist

from nltk.classify.api import ClassifierI
from nltk.classify.util import CutoffChecker, accuracy, log_likelihood
from nltk.classify.megam import call_megam, write_megam_file, parse_megam_weights
from nltk.classify.tadm import call_tadm, write_tadm_file, parse_tadm_weights

__docformat__ = 'epytext en'

######################################################################
# { Classifier Model
######################################################################

@compat.python_2_unicode_compatible
class MaxentClassifier(ClassifierI):
    """
    A maximum entropy classifier (also known as a "conditional
    exponential classifier").  This classifier is parameterized by a
    set of "weights", which are used to combine the joint-features
    that are generated from a featureset by an "encoding".  In
    particular, the encoding maps each ``(featureset, label)`` pair to
    a vector.  The probability of each label is then computed using
    the following equation::

                                  dotprod(weights, encode(fs,label))
      prob(label|fs) = ---------------------------------------------------
                       sum(dotprod(weights, encode(fs,l)) for l in labels)

    Where ``dotprod`` is the dot product::

      dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
    """

    def __init__(self, encoding, weights, logarithmic=True):
        """
        Construct a new maxent classifier model.  Typically, new
        classifier models are created using the ``train()`` method.

        :type encoding: MaxentFeatureEncodingI
        :param encoding: An encoding that is used to convert the
            featuresets that are given to the ``classify`` method into
            joint-feature vectors, which are used by the maxent
            classifier model.

        :type weights: list of float
        :param weights: The feature weight vector for this classifier.

        :type logarithmic: bool
        :param logarithmic: If false, then use non-logarithmic weights.
        """
        self._encoding = encoding
        self._weights = weights
        self._logarithmic = logarithmic
        # self._logarithmic = False
        assert encoding.length() == len(weights)

    def labels(self):
        return self._encoding.labels()

    def set_weights(self, new_weights):
        """
        Set the feature weight vector for this classifier.
        :param new_weights: The new feature weight vector.
        :type new_weights: list of float
        """
        self._weights = new_weights
        assert self._encoding.length() == len(new_weights)

    def weights(self):
        """
        :return: The feature weight vector for this classifier.
        :rtype: list of float
        """
        return self._weights

    def classify(self, featureset):
        return self.prob_classify(featureset).max()

    def prob_classify(self, featureset):
        prob_dict = {}
        for label in self._encoding.labels():
            feature_vector = self._encoding.encode(featureset, label)

            if self._logarithmic:
                total = 0.0
                for (f_id, f_val) in feature_vector:
                    total += self._weights[f_id] * f_val
                prob_dict[label] = total

            else:
                prod = 1.0
                for (f_id, f_val) in feature_vector:
                    prod *= self._weights[f_id] ** f_val
                prob_dict[label] = prod

        # Normalize the dictionary to give a probability distribution
        return DictionaryProbDist(prob_dict, log=self._logarithmic, normalize=True)

    def explain(self, featureset, columns=4):
        """
        Print a table showing the effect of each of the features in
        the given feature set, and how they combine to determine the
        probabilities of each label for that featureset.
        """
        descr_width = 50
        TEMPLATE = '  %-' + str(descr_width - 2) + 's%s%8.3f'

        pdist = self.prob_classify(featureset)
        labels = sorted(pdist.samples(), key=pdist.prob, reverse=True)
        labels = labels[:columns]
        print(
            '  Feature'.ljust(descr_width)
            + ''.join('%8s' % (("%s" % l)[:7]) for l in labels)
        )
        print('  ' + '-' * (descr_width - 2 + 8 * len(labels)))
        sums = defaultdict(int)
        for i, label in enumerate(labels):
            feature_vector = self._encoding.encode(featureset, label)
            feature_vector.sort(
                key=lambda fid__: abs(self._weights[fid__[0]]), reverse=True
            )
            for (f_id, f_val) in feature_vector:
                if self._logarithmic:
                    score = self._weights[f_id] * f_val
                else:
                    score = self._weights[f_id] ** f_val
                descr = self._encoding.describe(f_id)
                descr = descr.split(' and label is ')[0]  # hack
                descr += ' (%s)' % f_val  # hack
                if len(descr) > 47:
                    descr = descr[:44] + '...'
                print(TEMPLATE % (descr, i * 8 * ' ', score))
                sums[label] += score
        print('  ' + '-' * (descr_width - 1 + 8 * len(labels)))
        print(
            '  TOTAL:'.ljust(descr_width) + ''.join('%8.3f' % sums[l] for l in labels)
        )
        print(
            '  PROBS:'.ljust(descr_width)
            + ''.join('%8.3f' % pdist.prob(l) for l in labels)
        )

    def most_informative_features(self, n=10):
        """
        Generates the ranked list of informative features from most to least.
        """
        if hasattr(self, '_most_informative_features'):
            return self._most_informative_features[:n]
        else:
            self._most_informative_features = sorted(
                list(range(len(self._weights))),
                key=lambda fid: abs(self._weights[fid]),
                reverse=True,
            )
            return self._most_informative_features[:n]

    def show_most_informative_features(self, n=10, show='all'):
        """
        :param show: all, neg, or pos (for negative-only or positive-only)
        :type show: str
        :param n: The number of top features to show.
        :type n: int
        """
        # Use None to get the full list of ranked features.
        fids = self.most_informative_features(None)
        if show == 'pos':
            fids = [fid for fid in fids if self._weights[fid] > 0]
        elif show == 'neg':
            fids = [fid for fid in fids if self._weights[fid] < 0]
        for fid in fids[:n]:
            print('%8.3f %s' % (self._weights[fid], self._encoding.describe(fid)))

    def __repr__(self):
        return '<ConditionalExponentialClassifier: %d labels, %d features>' % (
            len(self._encoding.labels()),
            self._encoding.length(),
        )

    #: A list of the algorithm names that are accepted for the
    #: ``train()`` method's ``algorithm`` parameter.
    ALGORITHMS = ['GIS', 'IIS', 'MEGAM', 'TADM']
    @classmethod
    def train(
        cls,
        train_toks,
        algorithm=None,
        trace=3,
        encoding=None,
        labels=None,
        gaussian_prior_sigma=0,
        **cutoffs
    ):
        """
        Train a new maxent classifier based on the given corpus of
        training samples.  This classifier will have its weights
        chosen to maximize entropy while remaining empirically
        consistent with the training corpus.

        :rtype: MaxentClassifier
        :return: The new maxent classifier

        :type train_toks: list
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a featureset,
            and the second of which is a classification label.

        :type algorithm: str
        :param algorithm: A case-insensitive string, specifying which
            algorithm should be used to train the classifier.  The
            following algorithms are currently available.

            - Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``),
              Improved Iterative Scaling (``'IIS'``)
            - External Libraries (requiring megam):
              LM-BFGS algorithm, with training performed by Megam (``'megam'``)
            - External Libraries (requiring tadm):
              training performed by the TADM toolkit (``'tadm'``)

            The default algorithm is ``'IIS'``.

        :type trace: int
        :param trace: The level of diagnostic tracing output to produce.
            Higher values produce more verbose output.
        :type encoding: MaxentFeatureEncodingI
        :param encoding: A feature encoding, used to convert featuresets
            into feature vectors.  If none is specified, then a
            ``BinaryMaxentFeatureEncoding`` will be built based on the
            features that are attested in the training corpus.
        :type labels: list(str)
        :param labels: The set of possible labels.  If none is given, then
            the set of all labels attested in the training data will be
            used instead.
        :param gaussian_prior_sigma: The sigma value for a gaussian
            prior on model weights.  Currently, this is supported by
            ``megam``.  For other algorithms, its value is ignored.
        :param cutoffs: Arguments specifying various conditions under
            which the training should be halted.  (Some of the cutoff
            conditions are not supported by some algorithms.)

            - ``max_iter=v``: Terminate after ``v`` iterations.
            - ``min_ll=v``: Terminate after the negative average
              log-likelihood drops under ``v``.
            - ``min_lldelta=v``: Terminate if a single iteration improves
              log likelihood by less than ``v``.
        """
        if algorithm is None:
            algorithm = 'iis'
        for key in cutoffs:
            if key not in (
                'max_iter',
                'min_ll',
                'min_lldelta',
                'max_acc',
                'min_accdelta',
                'count_cutoff',
                'norm',
                'explicit',
                'bernoulli',
            ):
                raise TypeError('Unexpected keyword arg %r' % key)
        algorithm = algorithm.lower()
        if algorithm == 'iis':
            return train_maxent_classifier_with_iis(
                train_toks, trace, encoding, labels, **cutoffs
            )
        elif algorithm == 'gis':
            return train_maxent_classifier_with_gis(
                train_toks, trace, encoding, labels, **cutoffs
            )
        elif algorithm == 'megam':
            return train_maxent_classifier_with_megam(
                train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs
            )
        elif algorithm == 'tadm':
            kwargs = cutoffs
            kwargs['trace'] = trace
            kwargs['encoding'] = encoding
            kwargs['labels'] = labels
            kwargs['gaussian_prior_sigma'] = gaussian_prior_sigma
            return TadmMaxentClassifier.train(train_toks, **kwargs)
        else:
            raise ValueError('Unknown algorithm %s' % algorithm)


#: Alias for MaxentClassifier.
ConditionalExponentialClassifier = MaxentClassifier
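

# A minimal usage sketch (added for illustration; not part of the original NLTK
# API).  The toy featuresets and labels are hypothetical; ``algorithm='gis'``
# and ``max_iter`` are just one reasonable choice among the options documented
# in ``MaxentClassifier.train`` above.
def _maxent_classifier_sketch():
    train = [
        ({'last_letter': 'a'}, 'female'),
        ({'last_letter': 'k'}, 'male'),
        ({'last_letter': 'e'}, 'female'),
        ({'last_letter': 'o'}, 'male'),
    ]
    classifier = MaxentClassifier.train(train, algorithm='gis', trace=0, max_iter=10)
    # prob_classify() returns a probability distribution over the known labels;
    # classify() simply picks the most probable one.
    pdist = classifier.prob_classify({'last_letter': 'a'})
    return classifier.classify({'last_letter': 'a'}), pdist.prob('female')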


######################################################################
# { Feature Encodings
######################################################################


class MaxentFeatureEncodingI(object):
    """
    A mapping that converts a set of input-feature values to a vector
    of joint-feature values, given a label.  This conversion is
    necessary to translate featuresets into a format that can be used
    by maximum entropy models.

    The set of joint-features used by a given encoding is fixed, and
    each index in the generated joint-feature vectors corresponds to a
    single joint-feature.  The length of the generated joint-feature
    vectors is therefore constant (for a given encoding).

    Because the joint-feature vectors generated by
    ``MaxentFeatureEncodingI`` are typically very sparse, they are
    represented as a list of ``(index, value)`` tuples, specifying the
    value of each non-zero joint-feature.

    Feature encodings are generally created using the ``train()``
    method, which generates an appropriate encoding based on the
    input-feature values and labels that are present in a given
    corpus.
    """

    def encode(self, featureset, label):
        """
        Given a (featureset, label) pair, return the corresponding
        vector of joint-feature values.  This vector is represented as
        a list of ``(index, value)`` tuples, specifying the value of
        each non-zero joint-feature.

        :type featureset: dict
        :rtype: list(tuple(int, int))
        """
        raise NotImplementedError()

    def length(self):
        """
        :return: The size of the fixed-length joint-feature vectors
            that are generated by this encoding.
        :rtype: int
        """
        raise NotImplementedError()

    def labels(self):
        """
        :return: A list of the \"known labels\" -- i.e., all labels
            ``l`` such that ``self.encode(fs,l)`` can be a nonzero
            joint-feature vector for some value of ``fs``.
        :rtype: list
        """
        raise NotImplementedError()

    def describe(self, fid):
        """
        :return: A string describing the value of the joint-feature
            whose index in the generated feature vectors is ``fid``.
        :rtype: str
        """
        raise NotImplementedError()

    def train(cls, train_toks):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.
        """
        raise NotImplementedError()


class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that calls a user-supplied function to map a
    given featureset/label pair to a sparse joint-feature vector.
    """

    def __init__(self, func, length, labels):
        """
        Construct a new feature encoding based on the given function.

        :type func: (callable)
        :param func: A function that takes two arguments, a featureset
            and a label, and returns the sparse joint feature vector
            that encodes them::

                func(featureset, label) -> feature_vector

            This sparse joint feature vector (``feature_vector``) is a
            list of ``(index,value)`` tuples.

        :type length: int
        :param length: The size of the fixed-length joint-feature
            vectors that are generated by this encoding.

        :type labels: list
        :param labels: A list of the \"known labels\" for this
            encoding -- i.e., all labels ``l`` such that
            ``self.encode(fs,l)`` can be a nonzero joint-feature vector
            for some value of ``fs``.
        """
        self._length = length
        self._func = func
        self._labels = labels

    def encode(self, featureset, label):
        return self._func(featureset, label)

    def length(self):
        return self._length

    def labels(self):
        return self._labels

    def describe(self, fid):
        return 'no description available'


class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that generates vectors containing binary
    joint-features of the form:

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.  This method will create one
    feature for each combination of ``fname``, ``fval``, and ``label``
    that occurs at least once in the training corpus.

    The ``unseen_features`` parameter can be used to add "unseen-value
    features", which are used whenever an input feature has a value
    that was not encountered in the training corpus.  These features
    have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form:

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    """
    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        """
        :param labels: A list of the \"known labels\" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode({..., fname:fval, ...}, label)[id]`` is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
            features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
            features in the generated joint-feature vectors.
        """
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError(
                'Mapping values must be exactly the '
                'set of integers from 0...len(mapping)'
            )

        self._labels = list(labels)
        """A list of attested labels."""

        self._mapping = mapping
        """dict mapping from (fname,fval,label) -> fid"""

        self._length = len(mapping)
        """The length of generated joint feature vectors."""

        self._alwayson = None
        """dict mapping from label -> fid"""

        self._unseen = None
        """dict mapping from fname -> fid"""

        if alwayson_features:
            self._alwayson = dict(
                (label, i + self._length) for (i, label) in enumerate(labels)
            )
            self._length += len(self._alwayson)

        if unseen_features:
            fnames = set(fname for (fname, fval, label) in mapping)
            self._unseen = dict(
                (fname, i + self._length) for (i, fname) in enumerate(fnames)
            )
            self._length += len(fnames)
    def encode(self, featureset, label):
        # Inherit docs.
        encoding = []

        # Convert input-features to joint-features:
        for fname, fval in featureset.items():
            # Known feature name & value:
            if (fname, fval, label) in self._mapping:
                encoding.append((self._mapping[fname, fval, label], 1))

            # Otherwise, we might want to fire an "unseen-value feature".
            elif self._unseen:
                # Have we seen this fname/fval combination with any label?
                for label2 in self._labels:
                    if (fname, fval, label2) in self._mapping:
                        break  # we've seen this fname/fval combo
                # We haven't -- fire the unseen-value feature
                else:
                    if fname in self._unseen:
                        encoding.append((self._unseen[fname], 1))

        # Add always-on features:
        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding
    def describe(self, f_id):
        # Inherit docs.
        if not isinstance(f_id, integer_types):
            raise TypeError('describe() expected an int')
        try:
            self._inv_mapping
        except AttributeError:
            self._inv_mapping = [-1] * len(self._mapping)
            for (info, i) in self._mapping.items():
                self._inv_mapping[i] = info

        if f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return '%s==%r and label is %r' % (fname, fval, label)
        elif self._alwayson and f_id in self._alwayson.values():
            for (label, f_id2) in self._alwayson.items():
                if f_id == f_id2:
                    return 'label is %r' % label
        elif self._unseen and f_id in self._unseen.values():
            for (fname, f_id2) in self._unseen.items():
                if f_id == f_id2:
                    return '%s is unseen' % fname
        else:
            raise ValueError('Bad feature id')

    def labels(self):
        # Inherit docs.
        return self._labels

    def length(self):
        # Inherit docs.
        return self._length
    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``BinaryMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature occurs (with value
            1) fewer than ``count_cutoff`` times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.
        """
        mapping = {}  # maps (fname, fval, label) -> fid
        seen_labels = set()  # The set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for (tok, label) in train_toks:
            if labels and label not in labels:
                raise ValueError('Unexpected label %s' % label)
            seen_labels.add(label)

            # Record each of the features.
            for (fname, fval) in tok.items():

                # If a count cutoff is given, then only add a joint
                # feature once the corresponding (fname, fval, label)
                # tuple exceeds that cutoff.
                count[fname, fval] += 1
                if count[fname, fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None:
            labels = seen_labels
        return cls(labels, mapping, **options)
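

# A minimal usage sketch (added for illustration; not part of the original NLTK
# API).  It shows how ``BinaryMaxentFeatureEncoding.train`` builds a sparse
# encoding from (featureset, label) pairs and how ``encode``/``describe`` report
# the resulting joint-features.  The feature names and labels are hypothetical.
def _binary_encoding_sketch():
    train = [
        ({'prevword': 'red', 'length': 'short'}, 'NOUN'),
        ({'prevword': 'the', 'length': 'short'}, 'NOUN'),
        ({'prevword': 'red', 'length': 'long'}, 'VERB'),
    ]
    encoding = BinaryMaxentFeatureEncoding.train(train, alwayson_features=True)
    # Each (fname, fval, label) combination seen above gets its own index;
    # encode() returns only the indices whose joint-feature fires.
    vector = encoding.encode({'prevword': 'red', 'length': 'short'}, 'NOUN')
    return [(fid, val, encoding.describe(fid)) for (fid, val) in vector]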


class GISEncoding(BinaryMaxentFeatureEncoding):
    """
    A binary feature encoding which adds one new joint-feature to the
    joint-features defined by ``BinaryMaxentFeatureEncoding``: a
    correction feature, whose value is chosen to ensure that the
    sparse vector always sums to a constant non-negative number.  This
    new feature is used to ensure two preconditions for the GIS
    training algorithm:

      - At least one feature vector index must be nonzero for every
        token.
      - The feature vector must sum to a constant non-negative number
        for every token.
    """

    def __init__(
        self, labels, mapping, unseen_features=False, alwayson_features=False, C=None
    ):
        """
        :param C: The correction constant.  The value of the correction
            feature is based on this value.  In particular, its value is
            ``C - sum([v for (f,v) in encoding])``.
        :seealso: ``BinaryMaxentFeatureEncoding.__init__``
        """
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, mapping, unseen_features, alwayson_features
        )
        if C is None:
            C = len(set(fname for (fname, fval, label) in mapping)) + 1
        self._C = C

    @property
    def C(self):
        """The non-negative constant that all encoded feature vectors
        will sum to."""
        return self._C

    def encode(self, featureset, label):
        # Get the basic encoding.
        encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label)
        base_length = BinaryMaxentFeatureEncoding.length(self)

        # Add a correction feature.
        total = sum(v for (f, v) in encoding)
        if total >= self._C:
            raise ValueError('Correction feature is not high enough!')
        encoding.append((base_length, self._C - total))

        # Return the result
        return encoding

    def length(self):
        return BinaryMaxentFeatureEncoding.length(self) + 1

    def describe(self, f_id):
        if f_id == BinaryMaxentFeatureEncoding.length(self):
            return 'Correction feature (%s)' % self._C
        else:
            return BinaryMaxentFeatureEncoding.describe(self, f_id)
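

# Illustrative sketch (added; not part of the original module): the correction
# feature appended by ``GISEncoding.encode`` pads every vector so that its
# values sum to the constant ``C``, as required by GIS.  The data here is
# hypothetical.
def _gis_correction_sketch():
    train = [({'a': 1, 'b': 2}, 'x'), ({'a': 2, 'b': 2}, 'y')]
    enc = GISEncoding.train(train)
    vector = enc.encode({'a': 1, 'b': 2}, 'x')
    # The values (including the trailing correction feature) always sum to enc.C.
    assert sum(val for (fid, val) in vector) == enc.C
    return vector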


class TadmEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding):
    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        self._mapping = OrderedDict(mapping)
        self._label_mapping = OrderedDict()
        BinaryMaxentFeatureEncoding.__init__(
            self, labels, self._mapping, unseen_features, alwayson_features
        )

    def encode(self, featureset, label):
        encoding = []
        for feature, value in featureset.items():
            if (feature, label) not in self._mapping:
                self._mapping[(feature, label)] = len(self._mapping)
            if value not in self._label_mapping:
                if not isinstance(value, int):
                    self._label_mapping[value] = len(self._label_mapping)
                else:
                    self._label_mapping[value] = value
            encoding.append(
                (self._mapping[(feature, label)], self._label_mapping[value])
            )
        return encoding

    def labels(self):
        return self._labels

    def describe(self, fid):
        for (feature, label) in self._mapping:
            if self._mapping[(feature, label)] == fid:
                return (feature, label)

    def length(self):
        return len(self._mapping)

    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        mapping = OrderedDict()
        if not labels:
            labels = []

        # This gets read twice, so compute the values in case it's lazy.
        train_toks = list(train_toks)

        for (featureset, label) in train_toks:
            if label not in labels:
                labels.append(label)

        for (featureset, label) in train_toks:
            for label in labels:
                for feature in featureset:
                    if (feature, label) not in mapping:
                        mapping[(feature, label)] = len(mapping)

        return cls(labels, mapping, **options)


class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
    """
    A feature encoding that generates vectors containing integer,
    float and binary joint-features of the form:

    Binary (for string and boolean features):

    |  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
    |                      {
    |                      { 0 otherwise

    Value (for integer and float features):

    |  joint_feat(fs, l) = { fval if     (fs[fname] == type(fval))
    |                      {         and (l == label)
    |                      {
    |                      { not encoded otherwise

    Where ``fname`` is the name of an input-feature, ``fval`` is a value
    for that input-feature, and ``label`` is a label.

    Typically, these features are constructed based on a training
    corpus, using the ``train()`` method.

    For string and boolean features [type(fval) not in (int, float)]
    this method will create one feature for each combination of
    ``fname``, ``fval``, and ``label`` that occurs at least once in the
    training corpus.

    For integer and float features [type(fval) in (int, float)] this
    method will create one feature for each combination of ``fname``
    and ``label`` that occurs at least once in the training corpus.

    For binary features the ``unseen_features`` parameter can be used
    to add "unseen-value features", which are used whenever an input
    feature has a value that was not encountered in the training
    corpus.  These features have the form:

    |  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
    |                      {      and l == label
    |                      {
    |                      { 0 otherwise

    Where ``is_unseen(fname, fval)`` is true if the encoding does not
    contain any joint features that are true when ``fs[fname]==fval``.

    The ``alwayson_features`` parameter can be used to add "always-on
    features", which have the form:

    |  joint_feat(fs, l) = { 1 if (l == label)
    |                      {
    |                      { 0 otherwise

    These always-on features allow the maxent model to directly model
    the prior probabilities of each label.
    """
    def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False):
        """
        :param labels: A list of the \"known labels\" for this encoding.

        :param mapping: A dictionary mapping from ``(fname,fval,label)``
            tuples to corresponding joint-feature indexes.  These
            indexes must be the set of integers from 0...len(mapping).
            If ``mapping[fname,fval,label]=id``, then
            ``self.encode({..., fname:fval, ...}, label)[id]`` is 1;
            otherwise, it is 0.

        :param unseen_features: If true, then include unseen value
            features in the generated joint-feature vectors.

        :param alwayson_features: If true, then include always-on
            features in the generated joint-feature vectors.
        """
        if set(mapping.values()) != set(range(len(mapping))):
            raise ValueError(
                'Mapping values must be exactly the '
                'set of integers from 0...len(mapping)'
            )

        self._labels = list(labels)
        """A list of attested labels."""

        self._mapping = mapping
        """dict mapping from (fname,fval,label) -> fid"""

        self._length = len(mapping)
        """The length of generated joint feature vectors."""

        self._alwayson = None
        """dict mapping from label -> fid"""

        self._unseen = None
        """dict mapping from fname -> fid"""

        if alwayson_features:
            self._alwayson = dict(
                (label, i + self._length) for (i, label) in enumerate(labels)
            )
            self._length += len(self._alwayson)

        if unseen_features:
            fnames = set(fname for (fname, fval, label) in mapping)
            self._unseen = dict(
                (fname, i + self._length) for (i, fname) in enumerate(fnames)
            )
            self._length += len(fnames)
    def encode(self, featureset, label):
        # Inherit docs.
        encoding = []

        # Convert input-features to joint-features:
        for fname, fval in featureset.items():
            if isinstance(fval, (integer_types, float)):
                # Known feature name & value:
                if (fname, type(fval), label) in self._mapping:
                    encoding.append((self._mapping[fname, type(fval), label], fval))
            else:
                # Known feature name & value:
                if (fname, fval, label) in self._mapping:
                    encoding.append((self._mapping[fname, fval, label], 1))

                # Otherwise, we might want to fire an "unseen-value feature".
                elif self._unseen:
                    # Have we seen this fname/fval combination with any label?
                    for label2 in self._labels:
                        if (fname, fval, label2) in self._mapping:
                            break  # we've seen this fname/fval combo
                    # We haven't -- fire the unseen-value feature
                    else:
                        if fname in self._unseen:
                            encoding.append((self._unseen[fname], 1))

        # Add always-on features:
        if self._alwayson and label in self._alwayson:
            encoding.append((self._alwayson[label], 1))

        return encoding
    def describe(self, f_id):
        # Inherit docs.
        if not isinstance(f_id, integer_types):
            raise TypeError('describe() expected an int')
        try:
            self._inv_mapping
        except AttributeError:
            self._inv_mapping = [-1] * len(self._mapping)
            for (info, i) in self._mapping.items():
                self._inv_mapping[i] = info

        if f_id < len(self._mapping):
            (fname, fval, label) = self._inv_mapping[f_id]
            return '%s==%r and label is %r' % (fname, fval, label)
        elif self._alwayson and f_id in self._alwayson.values():
            for (label, f_id2) in self._alwayson.items():
                if f_id == f_id2:
                    return 'label is %r' % label
        elif self._unseen and f_id in self._unseen.values():
            for (fname, f_id2) in self._unseen.items():
                if f_id == f_id2:
                    return '%s is unseen' % fname
        else:
            raise ValueError('Bad feature id')

    def labels(self):
        # Inherit docs.
        return self._labels

    def length(self):
        # Inherit docs.
        return self._length
    @classmethod
    def train(cls, train_toks, count_cutoff=0, labels=None, **options):
        """
        Construct and return new feature encoding, based on a given
        training corpus ``train_toks``.  See the class description
        ``TypedMaxentFeatureEncoding`` for a description of the
        joint-features that will be included in this encoding.

        Note: recognized feature value types are (int, float); other
        types are interpreted as regular binary features.

        :type train_toks: list(tuple(dict, str))
        :param train_toks: Training data, represented as a list of
            pairs, the first member of which is a feature dictionary,
            and the second of which is a classification label.

        :type count_cutoff: int
        :param count_cutoff: A cutoff value that is used to discard
            rare joint-features.  If a joint-feature occurs (with value
            1) fewer than ``count_cutoff`` times in the training corpus,
            then that joint-feature is not included in the generated
            encoding.

        :type labels: list
        :param labels: A list of labels that should be used by the
            classifier.  If not specified, then the set of labels
            attested in ``train_toks`` will be used.

        :param options: Extra parameters for the constructor, such as
            ``unseen_features`` and ``alwayson_features``.
        """
        mapping = {}  # maps (fname, fval, label) -> fid
        seen_labels = set()  # The set of labels we've encountered
        count = defaultdict(int)  # maps (fname, fval) -> count

        for (tok, label) in train_toks:
            if labels and label not in labels:
                raise ValueError('Unexpected label %s' % label)
            seen_labels.add(label)

            # Record each of the features.
            for (fname, fval) in tok.items():
                if type(fval) in (int, float):
                    fval = type(fval)
                # If a count cutoff is given, then only add a joint
                # feature once the corresponding (fname, fval, label)
                # tuple exceeds that cutoff.
                count[fname, fval] += 1
                if count[fname, fval] >= count_cutoff:
                    if (fname, fval, label) not in mapping:
                        mapping[fname, fval, label] = len(mapping)

        if labels is None:
            labels = seen_labels
        return cls(labels, mapping, **options)
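

# A minimal sketch (added for illustration; the feature names and data are
# hypothetical): numeric input-features are encoded by type, so their raw value
# becomes the joint-feature value, while string features stay binary.
def _typed_encoding_sketch():
    train = [
        ({'word_len': 4, 'suffix': 'ing'}, 'VERB'),
        ({'word_len': 7, 'suffix': 'ion'}, 'NOUN'),
    ]
    enc = TypedMaxentFeatureEncoding.train(train)
    # 'word_len' fires one (fname, int, label) feature carrying the value 4;
    # 'suffix' fires a plain binary (fname, fval, label) feature with value 1.
    return enc.encode({'word_len': 4, 'suffix': 'ing'}, 'VERB')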


######################################################################
# { Classifier Trainer: Generalized Iterative Scaling
######################################################################


def train_maxent_classifier_with_gis(
    train_toks, trace=3, encoding=None, labels=None, **cutoffs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Generalized Iterative Scaling
    algorithm.  This ``ConditionalExponentialClassifier`` will encode
    the model that maximizes entropy from all the models that are
    empirically consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    """
    cutoffs.setdefault('max_iter', 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = GISEncoding.train(train_toks, labels=labels)

    if not hasattr(encoding, 'C'):
        raise TypeError(
            'The GIS algorithm requires an encoding that '
            'defines C (e.g., GISEncoding).'
        )

    # Cinv is the inverse of the sum of each joint feature vector.
    # This controls the learning rate: higher Cinv (or lower C) gives
    # faster learning.
    Cinv = 1.0 / encoding.C

    # Count how many times each feature occurs in the training data.
    empirical_fcount = calculate_empirical_fcount(train_toks, encoding)

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_fcount == 0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_fcount), 'd')
    for fid in unattested:
        weights[fid] = numpy.NINF
    classifier = ConditionalExponentialClassifier(encoding, weights)

    # Take the log of the empirical fcount.
    log_empirical_fcount = numpy.log2(empirical_fcount)
    del empirical_fcount

    if trace > 0:
        print('  ==> Training (%d iterations)' % cutoffs['max_iter'])
    if trace > 2:
        print()
        print('      Iteration    Log Likelihood    Accuracy')
        print('      ---------------------------------------')

    # Train the classifier.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                iternum = cutoffchecker.iter
                print('     %9d    %14.5f    %9.3f' % (iternum, ll, acc))

            # Use the model to estimate the number of times each
            # feature should occur in the training data.
            estimated_fcount = calculate_estimated_fcount(
                classifier, train_toks, encoding
            )

            # Take the log of estimated fcount (avoid taking log(0).)
            for fid in unattested:
                estimated_fcount[fid] += 1
            log_estimated_fcount = numpy.log2(estimated_fcount)
            del estimated_fcount

            # Update the classifier weights
            weights = classifier.weights()
            weights += (log_empirical_fcount - log_estimated_fcount) * Cinv
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print('      Training stopped: keyboard interrupt')
    except:
        raise

    if trace > 2:
        ll = log_likelihood(classifier, train_toks)
        acc = accuracy(classifier, train_toks)
        print('         Final    %14.5f    %9.3f' % (ll, acc))

    # Return the classifier.
    return classifier
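

# A small numeric illustration (added; the numbers are hypothetical): one GIS
# update for a single feature.  The weight moves toward log2(empirical /
# estimated) scaled by 1/C, so features the model under-predicts have their
# weight increased.
def _gis_update_sketch():
    import math
    empirical_count = 8.0   # how often the feature fires in the training data
    estimated_count = 2.0   # how often the current model expects it to fire
    C = 4.0                 # constant vector sum enforced by GISEncoding
    weight = 0.0
    weight += (math.log(empirical_count, 2) - math.log(estimated_count, 2)) / C
    return weight  # 0.5: the model under-predicted this feature, so its weight grows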


def calculate_empirical_fcount(train_toks, encoding):
    fcount = numpy.zeros(encoding.length(), 'd')

    for tok, label in train_toks:
        for (index, val) in encoding.encode(tok, label):
            fcount[index] += val

    return fcount


def calculate_estimated_fcount(classifier, train_toks, encoding):
    fcount = numpy.zeros(encoding.length(), 'd')

    for tok, label in train_toks:
        pdist = classifier.prob_classify(tok)
        for label in pdist.samples():
            prob = pdist.prob(label)
            for (fid, fval) in encoding.encode(tok, label):
                fcount[fid] += prob * fval

    return fcount


######################################################################
# { Classifier Trainer: Improved Iterative Scaling
######################################################################


def train_maxent_classifier_with_iis(
    train_toks, trace=3, encoding=None, labels=None, **cutoffs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the Improved Iterative Scaling algorithm.
    This ``ConditionalExponentialClassifier`` will encode the model
    that maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    """
    cutoffs.setdefault('max_iter', 100)
    cutoffchecker = CutoffChecker(cutoffs)

    # Construct an encoding from the training data.
    if encoding is None:
        encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels)

    # Count how many times each feature occurs in the training data.
    empirical_ffreq = calculate_empirical_fcount(train_toks, encoding) / len(train_toks)

    # Find the nf map, and related variables nfarray and nfident.
    # nf is the sum of the features for a given labeled text.
    # nfmap compresses this sparse set of values to a dense list.
    # nfarray performs the reverse operation.  nfident is
    # nfarray multiplied by an identity matrix.
    nfmap = calculate_nfmap(train_toks, encoding)
    nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd')
    nftranspose = numpy.reshape(nfarray, (len(nfarray), 1))

    # Check for any features that are not attested in train_toks.
    unattested = set(numpy.nonzero(empirical_ffreq == 0)[0])

    # Build the classifier.  Start with weight=0 for each attested
    # feature, and weight=-infinity for each unattested feature.
    weights = numpy.zeros(len(empirical_ffreq), 'd')
    for fid in unattested:
        weights[fid] = numpy.NINF
    classifier = ConditionalExponentialClassifier(encoding, weights)

    if trace > 0:
        print('  ==> Training (%d iterations)' % cutoffs['max_iter'])
    if trace > 2:
        print()
        print('      Iteration    Log Likelihood    Accuracy')
        print('      ---------------------------------------')

    # Train the classifier.
    try:
        while True:
            if trace > 2:
                ll = cutoffchecker.ll or log_likelihood(classifier, train_toks)
                acc = cutoffchecker.acc or accuracy(classifier, train_toks)
                iternum = cutoffchecker.iter
                print('     %9d    %14.5f    %9.3f' % (iternum, ll, acc))

            # Calculate the deltas for this iteration, using Newton's method.
            deltas = calculate_deltas(
                train_toks,
                classifier,
                unattested,
                empirical_ffreq,
                nfmap,
                nfarray,
                nftranspose,
                encoding,
            )

            # Use the deltas to update our weights.
            weights = classifier.weights()
            weights += deltas
            classifier.set_weights(weights)

            # Check the log-likelihood & accuracy cutoffs.
            if cutoffchecker.check(classifier, train_toks):
                break

    except KeyboardInterrupt:
        print('      Training stopped: keyboard interrupt')
    except:
        raise

    if trace > 2:
        ll = log_likelihood(classifier, train_toks)
        acc = accuracy(classifier, train_toks)
        print('         Final    %14.5f    %9.3f' % (ll, acc))

    # Return the classifier.
    return classifier


def calculate_nfmap(train_toks, encoding):
    """
    Construct a map that can be used to compress ``nf`` (which is
    typically sparse).

    *nf(feature_vector)* is the sum of the feature values for
    *feature_vector*.

    This represents the number of features that are active for a
    given labeled text.  This method finds all values of *nf(t)*
    that are attested for at least one token in the given list of
    training tokens; and constructs a dictionary mapping these
    attested values to a continuous range *0...N*.  For example,
    if the only values of *nf()* that were attested were 3, 5, and
    7, then ``_nfmap`` might return the dictionary ``{3:0, 5:1, 7:2}``.

    :return: A map that can be used to compress ``nf`` to a dense
        vector.
    :rtype: dict(int -> int)
    """
    # Map from nf to indices.  This allows us to use smaller arrays.
    nfset = set()
    for tok, _ in train_toks:
        for label in encoding.labels():
            nfset.add(sum(val for (id, val) in encoding.encode(tok, label)))
    return dict((nf, i) for (i, nf) in enumerate(nfset))


def calculate_deltas(
    train_toks,
    classifier,
    unattested,
    ffreq_empirical,
    nfmap,
    nfarray,
    nftranspose,
    encoding,
):
    """
    Calculate the update values for the classifier weights for
    this iteration of IIS.  These update weights are the value of
    ``delta`` that solves the equation::

      ffreq_empirical[i]
             =
      SUM[fs,l] (classifier.prob_classify(fs).prob(l) *
                 feature_vector(fs,l)[i] *
                 exp(delta[i] * nf(feature_vector(fs,l))))

    Where:
        - *(fs,l)* is a (featureset, label) tuple from ``train_toks``
        - *feature_vector(fs,l)* = ``encoding.encode(fs,l)``
        - *nf(vector)* = ``sum([val for (id,val) in vector])``

    This method uses Newton's method to solve this equation for
    *delta[i]*.  In particular, it starts with a guess of
    ``delta[i]`` = 1; and iteratively updates ``delta`` with:

    |   delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i])

    until convergence, where *sum1* and *sum2* are defined as:

    |   sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta)
    |   sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l)))
    |   f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) .
    |                       feature_vector(fs,l)[i] .
    |                       exp(delta[i] . nf(feature_vector(fs,l))))

    Note that *sum1* and *sum2* depend on ``delta``; so they need
    to be re-computed each iteration.

    The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are
    used to generate a dense encoding for *nf(ltext)*.  This
    allows ``_deltas`` to calculate *sum1* and *sum2* using
    matrices, which yields a significant performance improvement.

    :param train_toks: The set of training tokens.
    :type train_toks: list(tuple(dict, str))
    :param classifier: The current classifier.
    :type classifier: ClassifierI
    :param ffreq_empirical: An array containing the empirical
        frequency for each feature.  The *i*\ th element of this
        array is the empirical frequency for feature *i*.
    :type ffreq_empirical: sequence of float
    :param unattested: An array that is 1 for features that are
        not attested in the training data; and 0 for features that
        are attested.  In other words, ``unattested[i]==1`` iff
        ``ffreq_empirical[i]==0``.
    :type unattested: sequence of int
    :param nfmap: A map that can be used to compress ``nf`` to a dense
        vector.
    :type nfmap: dict(int -> int)
    :param nfarray: An array that can be used to uncompress ``nf``
        from a dense vector.
    :type nfarray: array(float)
    :param nftranspose: The transpose of ``nfarray``
    :type nftranspose: array(float)
    """
    # These parameters control when we decide that we've
    # converged.  It probably should be possible to set these
    # manually, via keyword arguments to train.
    NEWTON_CONVERGE = 1e-12
    MAX_NEWTON = 300

    deltas = numpy.ones(encoding.length(), 'd')

    # Precompute the A matrix:
    # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) )
    # over all label,fs s.t. num_features[label,fs]=nf
    A = numpy.zeros((len(nfmap), encoding.length()), 'd')

    for tok, label in train_toks:
        dist = classifier.prob_classify(tok)

        for label in encoding.labels():
            # Generate the feature vector
            feature_vector = encoding.encode(tok, label)
            # Find the number of active features
            nf = sum(val for (id, val) in feature_vector)
            # Update the A matrix
            for (id, val) in feature_vector:
                A[nfmap[nf], id] += dist.prob(label) * val
    A /= len(train_toks)

    # Iteratively solve for delta.  Use the following variables:
    #   - nf_delta[x][y] = nfarray[x] * delta[y]
    #   - exp_nf_delta[x][y] = exp(nf[x] * delta[y])
    #   - nf_exp_nf_delta[x][y] = nf[x] * exp(nf[x] * delta[y])
    #   - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs)
    #                       exp(delta[i]nf)
    #   - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs)
    #                       nf exp(delta[i]nf)
    for rangenum in range(MAX_NEWTON):
        nf_delta = numpy.outer(nfarray, deltas)
        exp_nf_delta = 2 ** nf_delta
        nf_exp_nf_delta = nftranspose * exp_nf_delta
        sum1 = numpy.sum(exp_nf_delta * A, axis=0)
        sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0)

        # Avoid division by zero.
        for fid in unattested:
            sum2[fid] += 1

        # Update the deltas.
        deltas -= (ffreq_empirical - sum1) / -sum2

        # We can stop once we converge.
        n_error = numpy.sum(abs(ffreq_empirical - sum1)) / numpy.sum(abs(deltas))
        if n_error < NEWTON_CONVERGE:
            return deltas

    return deltas


######################################################################
# { Classifier Trainer: megam
######################################################################

# [xx] possible extension: add support for using implicit file format;
# this would need to put requirements on what encoding is used.  But
# we may need this for other maxent classifier trainers that require
# implicit formats anyway.
def train_maxent_classifier_with_megam(
    train_toks, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **kwargs
):
    """
    Train a new ``ConditionalExponentialClassifier``, using the given
    training samples, using the external ``megam`` library.  This
    ``ConditionalExponentialClassifier`` will encode the model that
    maximizes entropy from all the models that are empirically
    consistent with ``train_toks``.

    :see: ``train_maxent_classifier()`` for parameter descriptions.
    :see: ``nltk.classify.megam``
    """

    explicit = True
    bernoulli = True
    if 'explicit' in kwargs:
        explicit = kwargs['explicit']
    if 'bernoulli' in kwargs:
        bernoulli = kwargs['bernoulli']

    # Construct an encoding from the training data.
    if encoding is None:
        # Count cutoff can also be controlled by megam with the -minfc
        # option.  Not sure where the best place for it is.
        count_cutoff = kwargs.get('count_cutoff', 0)
        encoding = BinaryMaxentFeatureEncoding.train(
            train_toks, count_cutoff, labels=labels, alwayson_features=True
        )
    elif labels is not None:
        raise ValueError('Specify encoding or labels, not both')

    # Write a training file for megam.
    try:
        fd, trainfile_name = tempfile.mkstemp(prefix='nltk-')
        with open(trainfile_name, 'w') as trainfile:
            write_megam_file(
                train_toks, encoding, trainfile, explicit=explicit, bernoulli=bernoulli
            )
        os.close(fd)
    except (OSError, IOError, ValueError) as e:
        raise ValueError('Error while creating megam training file: %s' % e)

    # Run megam on the training file.
    options = []
    options += ['-nobias', '-repeat', '10']
    if explicit:
        options += ['-explicit']
    if not bernoulli:
        options += ['-fvals']
    if gaussian_prior_sigma:
        # Lambda is just the precision of the Gaussian prior, i.e. it's the
        # inverse variance, so the parameter conversion is 1.0/sigma**2.
        # See http://www.umiacs.umd.edu/~hal/docs/daume04cg-bfgs.pdf.
        inv_variance = 1.0 / gaussian_prior_sigma ** 2
    else:
        inv_variance = 0
    options += ['-lambda', '%.2f' % inv_variance, '-tune']
    if trace < 3:
        options += ['-quiet']
    if 'max_iter' in kwargs:
        options += ['-maxi', '%s' % kwargs['max_iter']]
    if 'll_delta' in kwargs:
        # [xx] this is actually a perplexity delta, not a log
        # likelihood delta
        options += ['-dpp', '%s' % abs(kwargs['ll_delta'])]
    if hasattr(encoding, 'cost'):
        options += ['-multilabel']  # each possible la
    options += ['multiclass', trainfile_name]
    stdout = call_megam(options)
    # print './megam_i686.opt ', ' '.join(options)

    # Delete the training file
    try:
        os.remove(trainfile_name)
    except (OSError, IOError) as e:
        print('Warning: unable to delete %s: %s' % (trainfile_name, e))

    # Parse the generated weight vector.
    weights = parse_megam_weights(stdout, encoding.length(), explicit)

    # Convert from base-e to base-2 weights.
    weights *= numpy.log2(numpy.e)

    # Build the classifier
    return MaxentClassifier(encoding, weights)


######################################################################
# { Classifier Trainer: tadm
######################################################################


class TadmMaxentClassifier(MaxentClassifier):
    @classmethod
    def train(cls, train_toks, **kwargs):
        algorithm = kwargs.get('algorithm', 'tao_lmvm')
        trace = kwargs.get('trace', 3)
        encoding = kwargs.get('encoding', None)
        labels = kwargs.get('labels', None)
        sigma = kwargs.get('gaussian_prior_sigma', 0)
        count_cutoff = kwargs.get('count_cutoff', 0)
        max_iter = kwargs.get('max_iter')
        ll_delta = kwargs.get('min_lldelta')

        # Construct an encoding from the training data.
        if not encoding:
            encoding = TadmEventMaxentFeatureEncoding.train(
                train_toks, count_cutoff, labels=labels
            )

        trainfile_fd, trainfile_name = tempfile.mkstemp(
            prefix='nltk-tadm-events-', suffix='.gz'
        )
        weightfile_fd, weightfile_name = tempfile.mkstemp(prefix='nltk-tadm-weights-')

        trainfile = gzip_open_unicode(trainfile_name, 'w')
        write_tadm_file(train_toks, encoding, trainfile)
        trainfile.close()

        options = []
        options.extend(['-monitor'])
        options.extend(['-method', algorithm])
        if sigma:
            options.extend(['-l2', '%.6f' % sigma ** 2])
        if max_iter:
            options.extend(['-max_it', '%d' % max_iter])
        if ll_delta:
            options.extend(['-fatol', '%.6f' % abs(ll_delta)])
        options.extend(['-events_in', trainfile_name])
        options.extend(['-params_out', weightfile_name])
        if trace < 3:
            options.extend(['2>&1'])
        else:
            options.extend(['-summary'])

        call_tadm(options)

        with open(weightfile_name, 'r') as weightfile:
            weights = parse_tadm_weights(weightfile)

        os.remove(trainfile_name)
        os.remove(weightfile_name)

        # Convert from base-e to base-2 weights.
        weights *= numpy.log2(numpy.e)

        # Build the classifier
        return cls(encoding, weights)


######################################################################
# { Demo
######################################################################
def demo():
    from nltk.classify.util import names_demo

    classifier = names_demo(MaxentClassifier.train)


if __name__ == '__main__':
    demo()