2179 lines
77 KiB
Python
2179 lines
77 KiB
Python
# -*- coding: utf-8 -*-
|
||
# Natural Language Toolkit: WordNet
|
||
#
|
||
# Copyright (C) 2001-2019 NLTK Project
|
||
# Author: Steven Bethard <Steven.Bethard@colorado.edu>
|
||
# Steven Bird <stevenbird1@gmail.com>
|
||
# Edward Loper <edloper@gmail.com>
|
||
# Nitin Madnani <nmadnani@ets.org>
|
||
# Nasruddin A’aidil Shari
|
||
# Sim Wei Ying Geraldine
|
||
# Soe Lynn
|
||
# Francis Bond <bond@ieee.org>
|
||
# URL: <http://nltk.org/>
|
||
# For license information, see LICENSE.TXT
|
||
|
||
"""
|
||
An NLTK interface for WordNet
|
||
|
||
WordNet is a lexical database of English.
|
||
It uses synsets to help find conceptual relationships between words
|
||
such as hypernyms, hyponyms, synonyms, antonyms etc.
|
||
|
||
For details about WordNet see:
|
||
http://wordnet.princeton.edu/
|
||
|
||
This module also allows you to find lemmas in languages
|
||
other than English from the Open Multilingual Wordnet
|
||
http://compling.hss.ntu.edu.sg/omw/
|
||
|
||
"""
|
||
|
||
from __future__ import print_function, unicode_literals
|
||
|
||
import math
|
||
import re
|
||
from itertools import islice, chain
|
||
from functools import total_ordering
|
||
from operator import itemgetter
|
||
from collections import defaultdict, deque
|
||
|
||
from six import iteritems
|
||
from six.moves import range
|
||
|
||
from nltk.corpus.reader import CorpusReader
|
||
from nltk.util import binary_search_file as _binary_search_file
|
||
from nltk.probability import FreqDist
|
||
from nltk.compat import python_2_unicode_compatible
|
||
from nltk.internals import deprecated
|
||
|
||
######################################################################
|
||
# Table of Contents
|
||
######################################################################
|
||
# - Constants
|
||
# - Data Classes
|
||
# - WordNetError
|
||
# - Lemma
|
||
# - Synset
|
||
# - WordNet Corpus Reader
|
||
# - WordNet Information Content Corpus Reader
|
||
# - Similarity Metrics
|
||
# - Demo
|
||
|
||
######################################################################
|
||
# Constants
|
||
######################################################################
|
||
|
||
#: Stand-in for positive infinity (a very large finite float) returned by
#: the similarity functions.
_INF = 1e300

# { Part-of-speech constants
ADJ = 'a'
ADJ_SAT = 's'
ADV = 'r'
NOUN = 'n'
VERB = 'v'
# }

# The four primary parts of speech (ADJ_SAT is deliberately not listed).
POS_LIST = [NOUN, VERB, ADJ, ADV]

# A table of strings that are used to express verb frames.  Each template
# has one ``%s`` slot; index 0 is a ``None`` placeholder so that the first
# real frame sits at index 1.
VERB_FRAME_STRINGS = (
    None,
    "Something %s",
    "Somebody %s",
    "It is %sing",
    "Something is %sing PP",
    "Something %s something Adjective/Noun",
    "Something %s Adjective/Noun",
    "Somebody %s Adjective",
    "Somebody %s something",
    "Somebody %s somebody",
    "Something %s somebody",
    "Something %s something",
    "Something %s to somebody",
    "Somebody %s on something",
    "Somebody %s somebody something",
    "Somebody %s something to somebody",
    "Somebody %s something from somebody",
    "Somebody %s somebody with something",
    "Somebody %s somebody of something",
    "Somebody %s something on somebody",
    "Somebody %s somebody PP",
    "Somebody %s something PP",
    "Somebody %s PP",
    "Somebody's (body part) %s",
    "Somebody %s somebody to INFINITIVE",
    "Somebody %s somebody INFINITIVE",
    "Somebody %s that CLAUSE",
    "Somebody %s to somebody",
    "Somebody %s to INFINITIVE",
    "Somebody %s whether INFINITIVE",
    "Somebody %s somebody into V-ing something",
    "Somebody %s something with something",
    "Somebody %s INFINITIVE",
    "Somebody %s VERB-ing",
    "It %s that CLAUSE",
    "Something %s INFINITIVE",
)

# Matches a dot-delimited run of digits, e.g. the ".01." in "dog.n.01.dog".
SENSENUM_RE = re.compile(r'\.[\d]+\.')
|
||
|
||
|
||
######################################################################
|
||
# Data Classes
|
||
######################################################################
|
||
|
||
|
||
class WordNetError(Exception):
    """Raised to report a WordNet-related error."""
|
||
|
||
|
||
@total_ordering
class _WordNetObject(object):
    """A common base class for lemmas and synsets.

    Every relation method is a thin wrapper that hands one pointer symbol
    to ``self._related()``, which subclasses are expected to provide.
    Hashing, equality and ordering are all based on ``self._name``.
    """

    def hypernyms(self):
        """Return the objects related through the '@' pointer."""
        return self._related('@')

    def _hypernyms(self):
        """Private twin of ``hypernyms`` (same '@' lookup)."""
        return self._related('@')

    def instance_hypernyms(self):
        """Return the objects related through the '@i' pointer."""
        return self._related('@i')

    def _instance_hypernyms(self):
        """Private twin of ``instance_hypernyms`` (same '@i' lookup)."""
        return self._related('@i')

    def hyponyms(self):
        """Return the objects related through the '~' pointer."""
        return self._related('~')

    def instance_hyponyms(self):
        """Return the objects related through the '~i' pointer."""
        return self._related('~i')

    def member_holonyms(self):
        """Return the objects related through the '#m' pointer."""
        return self._related('#m')

    def substance_holonyms(self):
        """Return the objects related through the '#s' pointer."""
        return self._related('#s')

    def part_holonyms(self):
        """Return the objects related through the '#p' pointer."""
        return self._related('#p')

    def member_meronyms(self):
        """Return the objects related through the '%m' pointer."""
        return self._related('%m')

    def substance_meronyms(self):
        """Return the objects related through the '%s' pointer."""
        return self._related('%s')

    def part_meronyms(self):
        """Return the objects related through the '%p' pointer."""
        return self._related('%p')

    def topic_domains(self):
        """Return the objects related through the ';c' pointer."""
        return self._related(';c')

    def in_topic_domains(self):
        """Return the objects related through the '-c' pointer."""
        return self._related('-c')

    def region_domains(self):
        """Return the objects related through the ';r' pointer."""
        return self._related(';r')

    def in_region_domains(self):
        """Return the objects related through the '-r' pointer."""
        return self._related('-r')

    def usage_domains(self):
        """Return the objects related through the ';u' pointer."""
        return self._related(';u')

    def in_usage_domains(self):
        """Return the objects related through the '-u' pointer."""
        return self._related('-u')

    def attributes(self):
        """Return the objects related through the '=' pointer."""
        return self._related('=')

    def entailments(self):
        """Return the objects related through the '*' pointer."""
        return self._related('*')

    def causes(self):
        """Return the objects related through the '>' pointer."""
        return self._related('>')

    def also_sees(self):
        """Return the objects related through the '^' pointer."""
        return self._related('^')

    def verb_groups(self):
        """Return the objects related through the '$' pointer."""
        return self._related('$')

    def similar_tos(self):
        """Return the objects related through the '&' pointer."""
        return self._related('&')

    def __hash__(self):
        return hash(self._name)

    def __eq__(self, other):
        # Objects without a ``_name`` (None, strings, numbers, ...) are not
        # comparable to us; returning NotImplemented lets Python fall back
        # to its default (unequal) answer instead of raising AttributeError.
        try:
            return self._name == other._name
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        # Defined explicitly because Python 2 does not derive it from
        # ``__eq__``; mirrors the foreign-object handling of ``__eq__``.
        try:
            return self._name != other._name
        except AttributeError:
            return NotImplemented

    def __lt__(self, other):
        return self._name < other._name
|
||
|
||
|
||
@python_2_unicode_compatible
class Lemma(_WordNetObject):
    """
    The lexical entry for a single morphological form of a
    sense-disambiguated word.

    A Lemma is identified by a "<word>.<pos>.<number>.<lemma>" string where:
    <word> is the morphological stem identifying the synset
    <pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
    <number> is the sense number
    <lemma> is the morphological form of interest

    <word> and <lemma> need not be the same: the Synset 'salt.n.03', for
    instance, has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and
    'salt.n.03.salinity'.

    Lemma attributes, accessible via methods with the same name:

    - name: The canonical name of this lemma.
    - synset: The synset that this lemma belongs to.
    - syntactic_marker: For adjectives, the WordNet string identifying the
      syntactic position relative to the modified noun. See:
      https://wordnet.princeton.edu/documentation/wninput5wn
      For all other parts of speech, this attribute is None.
    - count: The frequency of this lemma in wordnet.

    Lemma methods:

    The following methods retrieve related Lemmas; their names follow the
    pointer symbols defined here:
    https://wordnet.princeton.edu/documentation/wninput5wn
    All of them return lists of Lemmas:

    - antonyms
    - hypernyms, instance_hypernyms
    - hyponyms, instance_hyponyms
    - member_holonyms, substance_holonyms, part_holonyms
    - member_meronyms, substance_meronyms, part_meronyms
    - topic_domains, region_domains, usage_domains
    - attributes
    - derivationally_related_forms
    - entailments
    - causes
    - also_sees
    - verb_groups
    - similar_tos
    - pertainyms
    """

    __slots__ = [
        '_wordnet_corpus_reader',
        '_name',
        '_syntactic_marker',
        '_synset',
        '_frame_strings',
        '_frame_ids',
        '_lexname_index',
        '_lex_id',
        '_lang',
        '_key',
    ]

    def __init__(
        self,
        wordnet_corpus_reader,
        synset,
        name,
        lexname_index,
        lex_id,
        syntactic_marker,
    ):
        # Values supplied by the caller.
        self._wordnet_corpus_reader = wordnet_corpus_reader
        self._synset = synset
        self._name = name
        self._lexname_index = lexname_index
        self._lex_id = lex_id
        self._syntactic_marker = syntactic_marker

        # Defaults; frames start empty and the language defaults to English.
        self._frame_strings = []
        self._frame_ids = []
        self._lang = 'eng'
        self._key = None  # gets set later.

    def name(self):
        """Return the name of this lemma."""
        return self._name

    def syntactic_marker(self):
        """Return the stored syntactic marker (may be None)."""
        return self._syntactic_marker

    def synset(self):
        """Return the synset this lemma was created with."""
        return self._synset

    def frame_strings(self):
        """Return the list of frame strings stored on this lemma."""
        return self._frame_strings

    def frame_ids(self):
        """Return the list of frame ids stored on this lemma."""
        return self._frame_ids

    def lang(self):
        """Return the language code of this lemma ('eng' by default)."""
        return self._lang

    def key(self):
        """Return the stored key, or None if it has not been set."""
        return self._key

    def __repr__(self):
        return "%s('%s.%s')" % (
            type(self).__name__,
            self._synset._name,
            self._name,
        )

    def _related(self, relation_symbol):
        """Return the lemmas this lemma points to via *relation_symbol*.

        The pointers are read from the owning synset's ``_lemma_pointers``
        table, keyed by ``(lemma name, relation symbol)``; each entry is a
        ``(pos, offset, lemma_index)`` triple resolved through the corpus
        reader.
        """
        resolve = self._wordnet_corpus_reader.synset_from_pos_and_offset
        pointer_table = self._synset._lemma_pointers
        pointer_key = (self._name, relation_symbol)
        # Membership test (rather than indexing) so that a missing key is
        # never inserted into the table.
        if pointer_key not in pointer_table:
            return []
        related = []
        for pos, offset, lemma_index in pointer_table[pointer_key]:
            related.append(resolve(pos, offset)._lemmas[lemma_index])
        return related

    def count(self):
        """Return the frequency count for this Lemma"""
        reader = self._wordnet_corpus_reader
        return reader.lemma_count(self)

    def antonyms(self):
        """Return the lemmas related through the '!' pointer."""
        return self._related('!')

    def derivationally_related_forms(self):
        """Return the lemmas related through the '+' pointer."""
        return self._related('+')

    def pertainyms(self):
        """Return the lemmas related through the backslash pointer."""
        return self._related('\\')
|
||
|
||
|
||
@python_2_unicode_compatible
|
||
class Synset(_WordNetObject):
|
||
"""Create a Synset from a "<lemma>.<pos>.<number>" string where:
|
||
<lemma> is the word's morphological stem
|
||
<pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
|
||
<number> is the sense number, counting from 0.
|
||
|
||
Synset attributes, accessible via methods with the same name:
|
||
|
||
- name: The canonical name of this synset, formed using the first lemma
|
||
of this synset. Note that this may be different from the name
|
||
passed to the constructor if that string used a different lemma to
|
||
identify the synset.
|
||
- pos: The synset's part of speech, matching one of the module level
|
||
attributes ADJ, ADJ_SAT, ADV, NOUN or VERB.
|
||
- lemmas: A list of the Lemma objects for this synset.
|
||
- definition: The definition for this synset.
|
||
- examples: A list of example strings for this synset.
|
||
- offset: The offset in the WordNet dict file of this synset.
|
||
- lexname: The name of the lexicographer file containing this synset.
|
||
|
||
Synset methods:
|
||
|
||
Synsets have the following methods for retrieving related Synsets.
|
||
They correspond to the names for the pointer symbols defined here:
|
||
https://wordnet.princeton.edu/documentation/wninput5wn
|
||
These methods all return lists of Synsets.
|
||
|
||
- hypernyms, instance_hypernyms
|
||
- hyponyms, instance_hyponyms
|
||
- member_holonyms, substance_holonyms, part_holonyms
|
||
- member_meronyms, substance_meronyms, part_meronyms
|
||
- attributes
|
||
- entailments
|
||
- causes
|
||
- also_sees
|
||
- verb_groups
|
||
- similar_tos
|
||
|
||
Additionally, Synsets support the following methods specific to the
|
||
hypernym relation:
|
||
|
||
- root_hypernyms
|
||
- common_hypernyms
|
||
- lowest_common_hypernyms
|
||
|
||
Note that Synsets do not support the following relations because
|
||
these are defined by WordNet as lexical relations:
|
||
|
||
- antonyms
|
||
- derivationally_related_forms
|
||
- pertainyms
|
||
"""
|
||
|
||
__slots__ = [
|
||
'_pos',
|
||
'_offset',
|
||
'_name',
|
||
'_frame_ids',
|
||
'_lemmas',
|
||
'_lemma_names',
|
||
'_definition',
|
||
'_examples',
|
||
'_lexname',
|
||
'_pointers',
|
||
'_lemma_pointers',
|
||
'_max_depth',
|
||
'_min_depth',
|
||
]
|
||
|
||
def __init__(self, wordnet_corpus_reader):
    """Create an empty synset attached to *wordnet_corpus_reader*.

    Only placeholders are stored here; the real values are filled in by
    WordNetCorpusReader._synset_from_pos_and_line().
    """
    self._wordnet_corpus_reader = wordnet_corpus_reader

    # Scalar fields start out unset.
    self._pos = None
    self._offset = None
    self._name = None
    self._definition = None
    self._lexname = None  # lexicographer name
    self._all_hypernyms = None

    # Each list field gets its own fresh list.
    self._frame_ids = []
    self._lemmas = []
    self._lemma_names = []
    self._examples = []

    # Pointer tables default to an empty set / empty list per key.
    self._pointers = defaultdict(set)
    self._lemma_pointers = defaultdict(list)
|
||
|
||
def pos(self):
    """Return this synset's part of speech (``self._pos``)."""
    return self._pos
|
||
|
||
def offset(self):
    """Return this synset's offset (``self._offset``)."""
    return self._offset
|
||
|
||
def name(self):
    """Return this synset's name (``self._name``)."""
    return self._name
|
||
|
||
def frame_ids(self):
    """Return the list of frame ids stored on this synset."""
    return self._frame_ids
|
||
|
||
def definition(self):
    """Return this synset's definition (``self._definition``)."""
    return self._definition
|
||
|
||
def examples(self):
    """Return the list of example strings stored on this synset."""
    return self._examples
|
||
|
||
def lexname(self):
    """Return the lexicographer file name stored on this synset."""
    return self._lexname
|
||
|
||
def _needs_root(self):
    """Tell whether a simulated root is needed for this part of speech.

    :return: True for verbs; for nouns, True only when the corpus reader
        reports WordNet version '1.6'; None for any other part of speech.
    """
    if self._pos == VERB:
        return True
    if self._pos == NOUN:
        return self._wordnet_corpus_reader.get_version() == '1.6'
    # Neither noun nor verb: callers only rely on this being falsy.
    return None
|
||
|
||
def lemma_names(self, lang='eng'):
    """Return all the lemma names associated with the synset.

    :param lang: language code; 'eng' returns the names stored on the
        synset, anything else is looked up in the reader's language data.
    :return: A list of lemma names, empty if the reader has no entry for
        this synset in *lang*.
    """
    if lang == 'eng':
        return self._lemma_names

    reader = self._wordnet_corpus_reader
    reader._load_lang_data(lang)

    synset_key = reader.ss2of(self, lang)
    names_by_synset = reader._lang_data[lang][0]
    # Membership test first so a missing key is never created.
    return names_by_synset[synset_key] if synset_key in names_by_synset else []
|
||
|
||
def lemmas(self, lang='eng'):
    """Return all the lemma objects associated with the synset.

    :param lang: language code; 'eng' returns the stored Lemma objects,
        any other language builds new Lemma objects from
        ``self.lemma_names(lang)`` and tags them with *lang*.
    :return: A list of Lemma objects.
    """
    if lang == 'eng':
        return self._lemmas

    reader = self._wordnet_corpus_reader
    reader._load_lang_data(lang)
    translated = []
    for lemma_name in self.lemma_names(lang):
        lemma = Lemma(
            reader,
            self,
            lemma_name,
            reader._lexnames.index(self.lexname()),
            0,
            None,
        )
        lemma._lang = lang
        translated.append(lemma)
    return translated
|
||
|
||
def root_hypernyms(self):
    """Get the topmost hypernyms of this synset in WordNet.

    Walks upward through both ``hypernyms()`` and ``instance_hypernyms()``
    with an explicit stack, remembering visited synsets so that each one is
    expanded only once.

    :return: A list of the synsets reached that have no hypernyms.
    """
    roots = []
    visited = set()
    stack = [self]
    while stack:
        current = stack.pop()
        if current in visited:
            continue
        visited.add(current)
        parents = current.hypernyms() + current.instance_hypernyms()
        if parents:
            stack.extend(parents)
        else:
            roots.append(current)
    return roots
|
||
|
||
# Simpler implementation which makes incorrect assumption that
|
||
# hypernym hierarchy is acyclic:
|
||
#
|
||
# if not self.hypernyms():
|
||
# return [self]
|
||
# else:
|
||
# return list(set(root for h in self.hypernyms()
|
||
# for root in h.root_hypernyms()))
|
||
def max_depth(self):
    """
    :return: The length of the longest hypernym path from this
        synset to the root.
    """
    # ``_max_depth`` is declared in ``__slots__``, so once assigned it lives
    # in the slot and never shows up in ``self.__dict__``.  Reading the
    # attribute and catching AttributeError detects "not computed yet" for
    # both slot and ``__dict__`` storage, so the cached value is reused.
    try:
        return self._max_depth
    except AttributeError:
        pass

    hypernyms = self.hypernyms() + self.instance_hypernyms()
    if hypernyms:
        depth = 1 + max(h.max_depth() for h in hypernyms)
    else:
        depth = 0  # a synset without hypernyms is a root
    self._max_depth = depth
    return depth
|
||
|
||
def min_depth(self):
    """
    :return: The length of the shortest hypernym path from this
        synset to the root.
    """
    # ``_min_depth`` is declared in ``__slots__``, so once assigned it lives
    # in the slot and never shows up in ``self.__dict__``.  Reading the
    # attribute and catching AttributeError detects "not computed yet" for
    # both slot and ``__dict__`` storage, so the cached value is reused.
    try:
        return self._min_depth
    except AttributeError:
        pass

    hypernyms = self.hypernyms() + self.instance_hypernyms()
    if hypernyms:
        depth = 1 + min(h.min_depth() for h in hypernyms)
    else:
        depth = 0  # a synset without hypernyms is a root
    self._min_depth = depth
    return depth
|
||
|
||
def closure(self, rel, depth=-1):
    """Return the transitive closure of source under the rel
    relationship, breadth-first

        >>> from nltk.corpus import wordnet as wn
        >>> dog = wn.synset('dog.n.01')
        >>> hyp = lambda s:s.hypernyms()
        >>> list(dog.closure(hyp))
        [Synset('canine.n.02'), Synset('domestic_animal.n.01'),
        Synset('carnivore.n.01'), Synset('animal.n.01'),
        Synset('placental.n.01'), Synset('organism.n.01'),
        Synset('mammal.n.01'), Synset('living_thing.n.01'),
        Synset('vertebrate.n.01'), Synset('whole.n.02'),
        Synset('chordate.n.01'), Synset('object.n.01'),
        Synset('physical_entity.n.01'), Synset('entity.n.01')]

    This is a generator.  The starting synset itself is never yielded,
    and a synset is yielded only the first time its offset is seen.

    :param rel: callable mapping a synset to its related synsets.
    :param depth: depth limit handed through to ``breadth_first``
        (-1 by default).
    """
    from nltk.util import breadth_first

    # A set gives constant-time "already yielded?" checks; offsets are
    # assumed to be hashable (integers) -- TODO confirm against the reader.
    seen_offsets = set()
    for synset in breadth_first(self, rel, depth):
        offset = synset._offset
        if offset != self._offset and offset not in seen_offsets:
            seen_offsets.add(offset)
            yield synset
|
||
|
||
def hypernym_paths(self):
    """
    Get the path(s) from this synset to the root, where each path is a
    list of the synset nodes traversed on the way to the root.

    :return: A list of lists, where each list gives the node sequence
       connecting the initial ``Synset`` node and a root node.  Each list
       starts at the root and ends with this synset.
    """
    parents = self.hypernyms() + self.instance_hypernyms()
    if not parents:
        # A synset without hypernyms is itself a root.
        return [[self]]

    paths = []
    for parent in parents:
        for path in parent.hypernym_paths():
            # Extend each of the parent's root-first paths down to us.
            path.append(self)
            paths.append(path)
    return paths
|
||
|
||
def common_hypernyms(self, other):
    """
    Find all synsets that are hypernyms of this synset and the
    other synset.

    The full hypernym set of each synset is cached on that synset in
    ``_all_hypernyms`` the first time it is needed.

    :type other: Synset
    :param other: other input synset.
    :return: The synsets that are hypernyms of both synsets.
    """
    for synset in (self, other):
        if not synset._all_hypernyms:
            # Flatten the per-level hypernym lists into a single set.
            synset._all_hypernyms = set(
                chain.from_iterable(synset._iter_hypernym_lists())
            )
    shared = self._all_hypernyms.intersection(other._all_hypernyms)
    return list(shared)
|
||
|
||
def lowest_common_hypernyms(
    self, other, simulate_root=False, use_min_depth=False
):
    """
    Get a list of lowest synset(s) that both synsets have as a hypernym.

    With `use_min_depth == False` the common hypernym(s) with the greatest
    ``max_depth()`` are returned; if several share that depth they are all
    returned.

    With `use_min_depth == True` the common hypernym(s) with the greatest
    ``min_depth()`` are returned instead.

    Setting use_min_depth to True preserves the behavior of NLTK2.  This
    was changed in NLTK3 to give more accurate results in a small set of
    cases, generally with synsets concerning people. (eg: 'chef.n.01',
    'fireman.n.01', etc.)

    This method is an implementation of Ted Pedersen's "Lowest Common
    Subsumer" method from the Perl Wordnet module. It can return either
    "self" or "other" if they are a hypernym of the other.

    :type other: Synset
    :param other: other input synset
    :type simulate_root: bool
    :param simulate_root: The various verb taxonomies do not
        share a single root which disallows this metric from working for
        synsets that are not connected. This flag (False by default)
        creates a fake root that connects all the taxonomies. Set it
        to True to enable this behavior. For the noun taxonomy,
        there is usually a default root except for WordNet version 1.6.
        If you are using wordnet 1.6, a fake root will need to be added
        for nouns as well.
    :type use_min_depth: bool
    :param use_min_depth: This setting mimics older (v2) behavior of NLTK
        wordnet. If True, will use the min_depth function to calculate the
        lowest common hypernyms. This is known to give strange results for
        some synset pairs (eg: 'chef.n.01', 'fireman.n.01') but is retained
        for backwards compatibility
    :return: The synsets that are the lowest common hypernyms of both
        synsets, sorted; an empty list if there are none.
    """
    candidates = self.common_hypernyms(other)
    if simulate_root:
        # The fake root has no hypernyms of its own.
        root = Synset(None)
        root._name = '*ROOT*'
        root.hypernyms = lambda: []
        root.instance_hypernyms = lambda: []
        candidates.append(root)

    if use_min_depth:
        depth_of = lambda synset: synset.min_depth()
    else:
        depth_of = lambda synset: synset.max_depth()

    try:
        # max() raises ValueError when there are no candidates at all.
        deepest = max(depth_of(s) for s in candidates)
        return sorted(s for s in candidates if depth_of(s) == deepest)
    except ValueError:
        return []
|
||
|
||
def hypernym_distances(self, distance=0, simulate_root=False):
    """
    Get the path(s) from this synset to the root, counting the distance
    of each node from the initial node on the way. A set of
    (synset, distance) tuples is returned.

    :type distance: int
    :param distance: the distance (number of edges) from this hypernym to
        the original hypernym ``Synset`` on which this method was called.
    :type simulate_root: bool
    :param simulate_root: if True, a fake '*ROOT*' synset is added one
        step beyond the largest distance found.
    :return: A set of ``(Synset, int)`` tuples where each ``Synset`` is
       a hypernym of the first ``Synset``.
    """
    distances = {(self, distance)}
    for parent in self._hypernyms() + self._instance_hypernyms():
        # The fake root is only ever added by the outermost call.
        distances.update(
            parent.hypernym_distances(distance + 1, simulate_root=False)
        )
    if simulate_root:
        root = Synset(None)
        root._name = '*ROOT*'
        farthest = max(dist for _, dist in distances)
        distances.add((root, farthest + 1))
    return distances
|
||
|
||
def _shortest_hypernym_paths(self, simulate_root):
    """Map every synset reachable via hypernym links to its distance.

    A breadth-first walk over ``_hypernyms()`` and
    ``_instance_hypernyms()`` records, for each synset reached, the number
    of edges on the first (hence shortest) route from this synset.

    :param simulate_root: if true, a fake '*ROOT*' synset is added one
        step beyond the largest distance found.
    :return: A dict mapping synsets to distances; this synset maps to 0.
    """
    if self._name == '*ROOT*':
        return {self: 0}

    distances = {}
    pending = deque([(self, 0)])
    while pending:
        synset, dist = pending.popleft()
        if synset not in distances:
            # First visit wins: breadth-first order makes it the shortest.
            distances[synset] = dist
            for parent in synset._hypernyms():
                pending.append((parent, dist + 1))
            for parent in synset._instance_hypernyms():
                pending.append((parent, dist + 1))

    if simulate_root:
        root = Synset(None)
        root._name = '*ROOT*'
        distances[root] = max(distances.values()) + 1

    return distances
|
||
|
||
def shortest_path_distance(self, other, simulate_root=False):
    """
    Returns the distance of the shortest path linking the two synsets (if
    one exists). For each synset, all the ancestor nodes and their
    distances are recorded and compared. The ancestor node common to both
    synsets that can be reached with the minimum number of traversals is
    used. If no ancestor nodes are common, None is returned. If a node is
    compared with itself 0 is returned.

    :type other: Synset
    :param other: The Synset to which the shortest path will be found.
    :type simulate_root: bool
    :param simulate_root: passed through to ``_shortest_hypernym_paths``
        for both synsets.
    :return: The number of edges in the shortest path connecting the two
        nodes, or None if no path exists.
    """
    if self == other:
        return 0

    own_distances = self._shortest_hypernym_paths(simulate_root)
    other_distances = other._shortest_hypernym_paths(simulate_root)

    # Every ancestor shared by both synsets gives one candidate path whose
    # length is the sum of the two distances to that ancestor.
    candidates = [
        dist + other_distances[synset]
        for synset, dist in iteritems(own_distances)
        if synset in other_distances
    ]
    return min(candidates) if candidates else None
|
||
|
||
def tree(self, rel, depth=-1, cut_mark=None):
    """
    Build a nested-list tree of the synsets reachable through *rel*.

    >>> from nltk.corpus import wordnet as wn
    >>> dog = wn.synset('dog.n.01')
    >>> hyp = lambda s:s.hypernyms()
    >>> from pprint import pprint
    >>> pprint(dog.tree(hyp))
    [Synset('dog.n.01'),
     [Synset('canine.n.02'),
      [Synset('carnivore.n.01'),
       [Synset('placental.n.01'),
        [Synset('mammal.n.01'),
         [Synset('vertebrate.n.01'),
          [Synset('chordate.n.01'),
           [Synset('animal.n.01'),
            [Synset('organism.n.01'),
             [Synset('living_thing.n.01'),
              [Synset('whole.n.02'),
               [Synset('object.n.01'),
                [Synset('physical_entity.n.01'),
                 [Synset('entity.n.01')]]]]]]]]]]]]],
     [Synset('domestic_animal.n.01'),
      [Synset('animal.n.01'),
       [Synset('organism.n.01'),
        [Synset('living_thing.n.01'),
         [Synset('whole.n.02'),
          [Synset('object.n.01'),
           [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]]

    :param rel: callable mapping a synset to its related synsets.
    :param depth: number of levels to expand; a negative value (the
        default) never reaches 0, so expansion is not cut off by depth.
    :param cut_mark: if truthy, appended to a node whose expansion was
        cut off because *depth* reached 0.
    :return: A list whose first item is this synset, followed by one
        sub-tree per related synset.
    """
    if depth != 0:
        return [self] + [
            child.tree(rel, depth - 1, cut_mark) for child in rel(self)
        ]
    if cut_mark:
        return [self, cut_mark]
    return [self]
|
||
|
||
# interface to similarity methods
|
||
def path_similarity(self, other, verbose=False, simulate_root=True):
    """
    Path Distance Similarity:
    Return a score denoting how similar two word senses are, based on the
    shortest path that connects the senses in the is-a (hypernym/hyponym)
    taxonomy. The score is in the range 0 to 1, except in those cases where
    a path cannot be found (will only be true for verbs as there are many
    distinct verb taxonomies), in which case None is returned. A score of
    1 represents identity i.e. comparing a sense with itself will return 1.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :param verbose: accepted but not used by this method.
    :type simulate_root: bool
    :param simulate_root: The various verb taxonomies do not
        share a single root which disallows this metric from working for
        synsets that are not connected. This flag (True by default)
        creates a fake root that connects all the taxonomies. Set it
        to false to disable this behavior. For the noun taxonomy,
        there is usually a default root except for WordNet version 1.6.
        If you are using wordnet 1.6, a fake root will be added for nouns
        as well.
    :return: A score denoting the similarity of the two ``Synset`` objects,
        normally between 0 and 1. None is returned if no connecting path
        could be found. 1 is returned if a ``Synset`` is compared with
        itself.
    """
    # A fake root is used only when requested AND needed for this POS.
    use_root = simulate_root and self._needs_root()
    distance = self.shortest_path_distance(other, simulate_root=use_root)
    if distance is not None and distance >= 0:
        return 1.0 / (distance + 1)
    return None
|
||
|
||
def lch_similarity(self, other, verbose=False, simulate_root=True):
    """
    Leacock Chodorow Similarity:
    Return a score denoting how similar two word senses are, based on the
    shortest path that connects the senses (as above) and the maximum depth
    of the taxonomy in which the senses occur. The relationship is given as
    -log(p/2d) where p is the shortest path length and d is the taxonomy
    depth.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :param verbose: accepted but not used by this method.
    :type simulate_root: bool
    :param simulate_root: The various verb taxonomies do not
        share a single root which disallows this metric from working for
        synsets that are not connected. This flag (True by default)
        creates a fake root that connects all the taxonomies. Set it
        to false to disable this behavior. For the noun taxonomy,
        there is usually a default root except for WordNet version 1.6.
        If you are using wordnet 1.6, a fake root will be added for nouns
        as well.
    :return: A score denoting the similarity of the two ``Synset`` objects,
        normally greater than 0. None is returned if no connecting path
        could be found. If a ``Synset`` is compared with itself, the
        maximum score is returned, which varies depending on the taxonomy
        depth.
    :raises WordNetError: if the two synsets differ in part of speech.
    """
    pos = self._pos
    if pos != other._pos:
        raise WordNetError(
            'Computing the lch similarity requires '
            '%s and %s to have the same part of speech.' % (self, other)
        )

    need_root = self._needs_root()

    # The taxonomy depth is cached per part of speech on the reader.
    reader = self._wordnet_corpus_reader
    if pos not in reader._max_depth:
        reader._compute_max_depth(pos, need_root)
    depth = reader._max_depth[pos]

    distance = self.shortest_path_distance(
        other, simulate_root=simulate_root and need_root
    )

    if distance is None or distance < 0 or depth == 0:
        return None
    # "+ 1" so that a distance of 0 still gives a finite logarithm.
    return -math.log((distance + 1) / (2.0 * depth))
|
||
|
||
def wup_similarity(self, other, verbose=False, simulate_root=True):
    """
    Wu-Palmer Similarity:
    Return a score denoting how similar two word senses are, based on the
    depth of the two senses in the taxonomy and that of their Least Common
    Subsumer (most specific ancestor node). Previously, the scores computed
    by this implementation did _not_ always agree with those given by
    Pedersen's Perl implementation of WordNet Similarity. However, with
    the addition of the simulate_root flag (see below), the score for
    verbs now almost always agree but not always for nouns.

    The LCS does not necessarily feature in the shortest path connecting
    the two senses, as it is by definition the common ancestor deepest in
    the taxonomy, not closest to the two senses. Typically, however, it
    will so feature. When several LCS candidates are returned, this synset
    is used if it is among them, otherwise the first candidate. Where the
    LCS has multiple paths to the root, the longer path is used for the
    purposes of the calculation.

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :param verbose: accepted but not used by this method.
    :type simulate_root: bool
    :param simulate_root: The various verb taxonomies do not
        share a single root which disallows this metric from working for
        synsets that are not connected. This flag (True by default)
        creates a fake root that connects all the taxonomies. Set it
        to false to disable this behavior. For the noun taxonomy,
        there is usually a default root except for WordNet version 1.6.
        If you are using wordnet 1.6, a fake root will be added for nouns
        as well.
    :return: A float score denoting the similarity of the two ``Synset``
        objects, normally greater than zero. If no connecting path between
        the two senses can be found, None is returned.
    """
    use_root = simulate_root and self._needs_root()

    # Note that to preserve behavior from NLTK2 we set use_min_depth=True
    # It is possible that more accurate results could be obtained by
    # removing this setting and it should be tested later on
    subsumers = self.lowest_common_hypernyms(
        other, simulate_root=use_root, use_min_depth=True
    )

    # Without any LCS there is nothing to measure against.
    if not subsumers:
        return None

    if self in subsumers:
        subsumer = self
    else:
        subsumer = subsumers[0]

    # Longest path from the LCS to the root; the extra one is because the
    # calculation counts both the start and the end node.  No further
    # add-one correction is made for non-nouns: an imaginary root is
    # already accounted for by simulate_root.
    depth = subsumer.max_depth() + 1

    # Shortest path from each synset to the LCS.  Adding the LCS depth to
    # it gives that synset's path length to the root.
    len1 = self.shortest_path_distance(subsumer, simulate_root=use_root)
    len2 = other.shortest_path_distance(subsumer, simulate_root=use_root)
    if len1 is None or len2 is None:
        return None
    return (2.0 * depth) / ((len1 + depth) + (len2 + depth))
|
||
|
||
def res_similarity(self, other, ic, verbose=False):
    """
    Resnik Similarity:
    Score how similar two word senses are using the Information Content
    (IC) of their Least Common Subsumer (most specific ancestor node).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :param verbose: accepted for signature compatibility; not used here.
    :return: A float score denoting the similarity of the two ``Synset``
        objects. Synsets whose LCS is the root node of the taxonomy will
        have a score of 0 (e.g. N['dog'][0] and N['table'][0]).
    """
    # Only the subsumer's IC is the Resnik score; the two per-synset
    # values returned alongside it are not needed.
    _, _, subsumer_ic = _lcs_ic(self, other, ic)
    return subsumer_ic
|
||
|
||
def jcn_similarity(self, other, ic, verbose=False):
    """
    Jiang-Conrath Similarity:
    Score how similar two word senses are from the Information Content
    (IC) of the Least Common Subsumer (most specific ancestor node) and
    of the two input Synsets, using
    1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :param verbose: accepted for signature compatibility; not used here.
    :return: A float score denoting the similarity of the two ``Synset``
        objects.
    """
    # A synset compared with itself short-circuits to the "infinite" score.
    if self == other:
        return _INF

    self_ic, other_ic, subsumer_ic = _lcs_ic(self, other, ic)

    # An IC of zero means the synset is the root or was never observed
    # (sparse data problem); the score is then defined as 0.
    if 0 in (self_ic, other_ic):
        return 0

    distance = self_ic + other_ic - 2 * subsumer_ic

    # Guard the division: zero distance is treated like identity.
    return _INF if distance == 0 else 1 / distance
|
||
|
||
def lin_similarity(self, other, ic, verbose=False):
    """
    Lin Similarity:
    Score how similar two word senses are from the Information Content
    (IC) of the Least Common Subsumer (most specific ancestor node) and
    of the two input Synsets, using 2 * IC(lcs) / (IC(s1) + IC(s2)).

    :type other: Synset
    :param other: The ``Synset`` that this ``Synset`` is being compared to.
    :type ic: dict
    :param ic: an information content object (as returned by
        ``nltk.corpus.wordnet_ic.ic()``).
    :param verbose: accepted for signature compatibility; not used here.
    :return: A float score denoting the similarity of the two ``Synset``
        objects, in the range 0 to 1.
    """
    self_ic, other_ic, subsumer_ic = _lcs_ic(self, other, ic)
    shared_information = 2.0 * subsumer_ic
    total_information = self_ic + other_ic
    return shared_information / total_information
|
||
|
||
def _iter_hypernym_lists(self):
    """
    :return: An iterator over lists of ``Synset`` objects, one list per
        level: first ``[self]``, then the hypernyms and instance hypernyms
        of the previous level that were not yielded at an earlier level.
        A synset reachable from several members of one level appears
        once per path inside the next level's list.
    """
    level = [self]
    visited = set()
    while level:
        # Everything in the level about to be yielded counts as seen.
        visited.update(level)
        yield level
        parents = []
        for node in level:
            for parent in node.hypernyms() + node.instance_hypernyms():
                if parent not in visited:
                    parents.append(parent)
        level = parents
|
||
|
||
def __repr__(self):
    """Return ``ClassName('name')`` built from the runtime class and ``_name``."""
    class_name = type(self).__name__
    return "%s('%s')" % (class_name, self._name)
|
||
|
||
def _related(self, relation_symbol, sort=True):
    """
    Load the synsets this synset points to through ``relation_symbol``.

    :param relation_symbol: key into ``self._pointers``.
    :param sort: when true, the loaded synsets are returned sorted.
    :return: a list of synsets; empty when the symbol has no pointers.
    """
    load = self._wordnet_corpus_reader.synset_from_pos_and_offset
    if relation_symbol not in self._pointers:
        return []
    targets = []
    for pos, offset in self._pointers[relation_symbol]:
        targets.append(load(pos, offset))
    return sorted(targets) if sort else targets
|
||
|
||
|
||
######################################################################
|
||
# WordNet Corpus Reader
|
||
######################################################################
|
||
|
||
|
||
class WordNetCorpusReader(CorpusReader):
|
||
"""
|
||
A corpus reader used to access wordnet or its variants.
|
||
"""
|
||
|
||
# Encoding passed to ``CorpusReader`` for every WordNet file.
_ENCODING = 'utf8'

# { Part-of-speech constants
ADJ = 'a'
ADJ_SAT = 's'
ADV = 'r'
NOUN = 'n'
VERB = 'v'
# }

# { Filename constants
# Maps a part of speech to the suffix/prefix used in the data file names.
# ADJ_SAT has no entry of its own here.
_FILEMAP = {
    ADJ: 'adj',
    ADV: 'adv',
    NOUN: 'noun',
    VERB: 'verb',
}
# }

# { Part of speech constants
# Numeric part-of-speech codes and the reverse lookup.
_pos_numbers = {
    NOUN: 1,
    VERB: 2,
    ADJ: 3,
    ADV: 4,
    ADJ_SAT: 5,
}
_pos_names = {number: tag for tag, number in _pos_numbers.items()}
# }

#: A list of file identifiers for all the fileids used by this
#: corpus reader.
_FILES = (
    'cntlist.rev',
    'lexnames',
    'index.sense',
    'index.adj', 'index.adv', 'index.noun', 'index.verb',
    'data.adj', 'data.adv', 'data.noun', 'data.verb',
    'adj.exc', 'adv.exc', 'noun.exc', 'verb.exc',
)
|
||
|
||
def __init__(self, root, omw_reader):
    """
    Construct a new wordnet corpus reader, with the given root
    directory.

    :param root: root directory, forwarded to ``CorpusReader``.
    :param omw_reader: corpus reader containing the Open Multilingual
        Wordnet data; it is only stored here.
    """
    super(WordNetCorpusReader, self).__init__(
        root, self._FILES, encoding=self._ENCODING
    )

    # An index that provides the file offset
    # Map from lemma -> pos -> synset_index -> offset
    self._lemma_pos_offset_map = defaultdict(dict)

    # A cache so we don't have to reconstruct synsets
    # Map from pos -> offset -> synset
    self._synset_offset_cache = defaultdict(dict)

    # A lookup for the maximum depth of each part of speech.  Useful for
    # the lch similarity metric.
    self._max_depth = defaultdict(dict)

    # Corpus reader containing omw data.
    self._omw_reader = omw_reader

    # A cache to store the wordnet data of multiple languages
    self._lang_data = defaultdict(list)

    # Lazily opened file handles and tables filled by the loaders below.
    self._data_file_map = {}
    self._exception_map = {}
    self._lexnames = []
    self._key_count_file = None
    self._key_synset_file = None

    # Load the lexnames.  The handle is only needed for this loop, so it
    # is closed afterwards even if a line turns out to be malformed.
    lexnames_file = self.open('lexnames')
    try:
        for i, line in enumerate(lexnames_file):
            index, lexname, _ = line.split()
            assert int(index) == i
            self._lexnames.append(lexname)
    finally:
        lexnames_file.close()

    # Load the indices for lemmas and synset offsets
    self._load_lemma_pos_offset_map()

    # load the exception file data into memory
    self._load_exception_map()
|
||
|
||
# Open Multilingual WordNet functions, contributed by
|
||
# Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn
|
||
|
||
def of2ss(self, of):
    """
    Take an id and return the synset.

    The first eight characters of ``of`` are read as the integer offset
    and the last character as the part of speech.
    """
    pos_tag = of[-1]
    offset = int(of[:8])
    return self.synset_from_pos_and_offset(pos_tag, offset)
|
||
|
||
def ss2of(self, ss, lang=None):
    """
    Return the ID of the synset: zero-padded 8-digit offset, a dash and
    the part of speech.

    Satellite adjectives ('s') are reported as 'a' unless ``lang`` is one
    of the three wordnets that retain the satellite tag.
    """
    pos = ss.pos()
    # Only these 3 WordNets retain the satellite pos tag
    keeps_satellite_tag = lang in ("nld", "lit", "slk")
    if pos == 's' and not keeps_satellite_tag:
        pos = 'a'
    return "{:08d}-{}".format(ss.offset(), pos)
|
||
|
||
def _load_lang_data(self, lang):
    """
    Load the wordnet data of the requested language from its tab file
    into the cache, ``_lang_data``.

    Does nothing when the language is already cached.

    :param lang: language code; the file read is
        ``<lang>/wn-data-<lang>.tab`` from the OMW reader.
    :raises WordNetError: if ``lang`` is not listed by ``self.langs()``.
    """
    if lang in self._lang_data:
        return

    if lang not in self.langs():
        raise WordNetError("Language is not supported.")

    tab_file = self._omw_reader.open('{0:}/wn-data-{0:}.tab'.format(lang))
    try:
        self.custom_lemmas(tab_file, lang)
    finally:
        # Release the handle even when parsing the tab file fails.
        tab_file.close()
|
||
|
||
def langs(self):
    """
    Return a list of languages supported by Multilingual Wordnet.

    The list always starts with 'eng'; every ``.tab`` fileid of the OMW
    reader then contributes the part of its name after the last '-'.
    """
    import os

    supported = ['eng']
    for fileid in self._omw_reader.fileids():
        stem, extension = os.path.splitext(fileid)
        if extension != '.tab':
            continue
        supported.append(stem.rsplit('-', 1)[-1])
    return supported
|
||
|
||
def _load_lemma_pos_offset_map(self):
    """
    Fill ``self._lemma_pos_offset_map`` from the ``index.*`` files.

    Each non-comment line is read as: lemma, part of speech, synset
    count, pointer count followed by that many pointer symbols (ignored),
    sense count (must equal the synset count), one more count (ignored)
    and finally one integer synset offset per synset.  Offsets found
    under ADJ are also registered under ADJ_SAT.

    :raises WordNetError: if a line is truncated or fails validation;
        the message names the file and the 1-based line number.
    """
    for suffix in self._FILEMAP.values():
        fileid = 'index.%s' % suffix
        index_file = self.open(fileid)
        try:
            # parse each line of the file (ignoring comment lines, which
            # are the ones starting with a space)
            for i, line in enumerate(index_file):
                if line.startswith(' '):
                    continue

                tokens = iter(line.split())
                try:
                    # get the lemma and part-of-speech
                    lemma = next(tokens)
                    pos = next(tokens)

                    # get the number of synsets for this lemma
                    n_synsets = int(next(tokens))
                    assert n_synsets > 0

                    # get and ignore the pointer symbols for all synsets
                    # of this lemma
                    n_pointers = int(next(tokens))
                    for _ in range(n_pointers):
                        next(tokens)

                    # same as number of synsets
                    n_senses = int(next(tokens))
                    assert n_synsets == n_senses

                    # get and ignore number of senses ranked according to
                    # frequency
                    next(tokens)

                    # get synset offsets
                    synset_offsets = [int(next(tokens)) for _ in range(n_synsets)]

                # raise more informative error with file name and line
                # number; a line that ends early surfaces as StopIteration
                except StopIteration:
                    tup = fileid, (i + 1), 'unexpected end of line'
                    raise WordNetError('file %s, line %i: %s' % tup)
                except (AssertionError, ValueError) as e:
                    tup = fileid, (i + 1), e
                    raise WordNetError('file %s, line %i: %s' % tup)

                # map lemmas and parts of speech to synsets
                self._lemma_pos_offset_map[lemma][pos] = synset_offsets
                if pos == ADJ:
                    self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets
        finally:
            # The index file is only needed while it is being parsed.
            index_file.close()
|
||
|
||
def _load_exception_map(self):
    """
    Load the ``*.exc`` exception files into ``self._exception_map``.

    For every part of speech in ``_FILEMAP`` the first token of each line
    becomes a key and the remaining tokens its list of values.  ADJ_SAT
    shares the ADJ table.
    """
    for pos, suffix in self._FILEMAP.items():
        exceptions = {}
        exc_file = self.open('%s.exc' % suffix)
        try:
            for line in exc_file:
                terms = line.split()
                # A blank line has no key; skip it instead of failing.
                if not terms:
                    continue
                exceptions[terms[0]] = terms[1:]
        finally:
            # The file is fully read into memory, so release the handle.
            exc_file.close()
        self._exception_map[pos] = exceptions
    self._exception_map[ADJ_SAT] = self._exception_map[ADJ]
|
||
|
||
def _compute_max_depth(self, pos, simulate_root):
    """
    Compute the max depth for the given part of speech and store it in
    ``self._max_depth[pos]``.  This is used by the lch similarity metric.

    :param pos: part of speech whose synsets are scanned.
    :param simulate_root: when true, one extra level is added to the
        deepest value found.
    """
    deepest = 0
    for candidate in self.all_synsets(pos):
        try:
            candidate_depth = candidate.max_depth()
        except RuntimeError:
            # The offending synset is printed and otherwise ignored.
            print(candidate)
            continue
        if candidate_depth > deepest:
            deepest = candidate_depth
    self._max_depth[pos] = deepest + 1 if simulate_root else deepest
|
||
|
||
def get_version(self):
    """
    Return the version string found in the adjective data file.

    The file is scanned for ``WordNet <digits>.<digits> Copyright``; on
    the first hit the handle is rewound to the start and the captured
    version is returned.  Falls through (returning ``None``) when no line
    matches.
    """
    version_pattern = re.compile(r'WordNet (\d+\.\d+) Copyright')
    adjective_data = self._data_file(ADJ)
    for line in adjective_data:
        found = version_pattern.search(line)
        if found is None:
            continue
        adjective_data.seek(0)
        return found.group(1)
|
||
|
||
#############################################################
|
||
# Loading Lemmas
|
||
#############################################################
|
||
|
||
def lemma(self, name, lang='eng'):
    """
    Return lemma object that matches the name.

    :param name: synset name followed by '.' and the lemma name.
    :param lang: language passed to ``Synset.lemmas``.
    :raises WordNetError: if the synset has no lemma with that name.
    """
    # cannot simply split on first '.',
    # e.g.: '.45_caliber.a.01..45_caliber'
    boundary = SENSENUM_RE.search(name).end()
    synset_name = name[: boundary - 1]
    lemma_name = name[boundary:]

    for candidate in self.synset(synset_name).lemmas(lang):
        if candidate._name == lemma_name:
            return candidate
    raise WordNetError('no lemma %r in %r' % (lemma_name, synset_name))
|
||
|
||
def lemma_from_key(self, key):
    """
    Return the lemma whose sense key equals ``key`` (compared in lower
    case).

    :raises WordNetError: if the key is not found in ``index.sense`` or
        no lemma of the located synset carries it.
    """
    # Keys are case sensitive and always lower-case
    key = key.lower()

    # The unpacking also enforces the shape of the key: exactly one '%'
    # and exactly five ':'-separated fields after it.
    _lemma_name, lex_sense = key.split('%')
    pos_number, _lexfile, _lex_id, _head, _head_id = lex_sense.split(':')
    pos = self._pos_names[int(pos_number)]

    # open the key -> synset file if necessary
    if self._key_synset_file is None:
        self._key_synset_file = self.open('index.sense')

    # Find the synset for the lemma.
    index_line = _binary_search_file(self._key_synset_file, key)
    if not index_line:
        raise WordNetError("No synset found for key %r" % key)
    offset = int(index_line.split()[1])
    owner = self.synset_from_pos_and_offset(pos, offset)

    # return the corresponding lemma
    for candidate in owner._lemmas:
        if candidate._key == key:
            return candidate
    raise WordNetError("No lemma found for for key %r" % key)
|
||
|
||
#############################################################
|
||
# Loading Synsets
|
||
#############################################################
|
||
def synset(self, name):
    """
    Return the synset called ``name`` (``<lemma>.<pos>.<sense number>``).

    The name is lower-cased and split on its last two dots.  Sense
    numbers are 1-based.

    :raises WordNetError: if the lemma/pos pair is unknown, the sense
        number is below 1 or above the number of senses, or an adjective
        satellite was requested but a plain adjective was found.
    """
    # split name into lemma, part of speech and synset number
    lemma, pos, synset_index_str = name.lower().rsplit('.', 2)
    synset_index = int(synset_index_str) - 1

    # get the offsets for this lemma and part of speech
    try:
        offsets = self._lemma_pos_offset_map[lemma][pos]
    except KeyError:
        message = 'no lemma %r with part of speech %r'
        raise WordNetError(message % (lemma, pos))

    # A sense number of 0 (or less) would become a negative list index
    # and silently select a sense counted from the end.
    if synset_index < 0:
        message = 'lemma %r with part of speech %r: sense numbers start at 1, got %s'
        raise WordNetError(message % (lemma, pos, synset_index_str))

    try:
        offset = offsets[synset_index]
    except IndexError:
        n_senses = len(offsets)
        message = "lemma %r with part of speech %r has only %i %s"
        if n_senses == 1:
            tup = lemma, pos, n_senses, "sense"
        else:
            tup = lemma, pos, n_senses, "senses"
        raise WordNetError(message % tup)

    # load synset information from the appropriate file
    synset = self.synset_from_pos_and_offset(pos, offset)

    # some basic sanity checks on loaded attributes
    if pos == 's' and synset._pos == 'a':
        message = (
            'adjective satellite requested but only plain '
            'adjective found for lemma %r'
        )
        raise WordNetError(message % lemma)
    assert synset._pos == pos or (pos == 'a' and synset._pos == 's')

    # Return the synset object.
    return synset
|
||
|
||
def _data_file(self, pos):
    """
    Return an open file pointer for the data file for the given
    part of speech.

    ADJ_SAT is served from the ADJ file.  Handles are opened on first
    use and then kept in ``self._data_file_map``.
    """
    file_pos = ADJ if pos == ADJ_SAT else pos
    handle = self._data_file_map.get(file_pos)
    if handle is None:
        handle = self.open('data.%s' % self._FILEMAP[file_pos])
        self._data_file_map[file_pos] = handle
    return handle
|
||
|
||
def synset_from_pos_and_offset(self, pos, offset):
    """
    Return the synset stored at ``offset`` in the data file for ``pos``.

    Results are memoised in ``self._synset_offset_cache[pos]``; on a miss
    the data file is positioned at ``offset`` and one line is parsed.
    """
    per_pos_cache = self._synset_offset_cache[pos]
    # Check to see if the synset is in the cache
    if offset in per_pos_cache:
        return per_pos_cache[offset]

    stream = self._data_file(pos)
    stream.seek(offset)
    parsed = self._synset_from_pos_and_line(pos, stream.readline())
    # The parsed line must describe the offset that was asked for.
    assert parsed._offset == offset
    self._synset_offset_cache[pos][offset] = parsed
    return parsed
|
||
|
||
@deprecated('Use public method synset_from_pos_and_offset() instead')
def _synset_from_pos_and_offset(self, *args, **kwargs):
    """
    Backwards-compatible alias kept for people like the readers of
    http://stackoverflow.com/a/27145655/1709587
    who were using this function before it was officially a public method.
    All arguments are forwarded unchanged.
    """
    public_method = self.synset_from_pos_and_offset
    return public_method(*args, **kwargs)
|
||
|
||
def _synset_from_pos_and_line(self, pos, data_file_line):
    """
    Parse one line of a WordNet data file into a ``Synset``.

    The part before the first '|' holds whitespace-separated fields:
    offset, lexicographer file number, part of speech, a hexadecimal
    lemma count with (lemma, hex lex_id) pairs, a pointer count with
    (symbol, offset, pos, source/target) groups and, when present, verb
    frames.  The part after the '|' is the gloss, from which quoted
    examples and the definition are extracted.

    :param pos: part of speech the caller asked for; the synset's own
        part of speech is taken from the line.
    :param data_file_line: the raw line from the data file.
    :raises WordNetError: if a field cannot be parsed (``ValueError``).
    """
    # Construct a new (empty) synset.
    synset = Synset(self)

    # parse the entry for this synset
    try:

        # parse out the definitions and examples from the gloss.  Split
        # on the first '|' only, so a gloss that itself contains a '|'
        # does not break the parse.
        columns_str, gloss = data_file_line.strip().split('|', 1)
        definition = re.sub(r"[\"].*?[\"]", "", gloss).strip()
        examples = re.findall(r'"([^"]*)"', gloss)
        for example in examples:
            synset._examples.append(example)

        synset._definition = definition.strip('; ')

        # split the other info into fields
        _iter = iter(columns_str.split())

        def _next_token():
            return next(_iter)

        # get the offset
        synset._offset = int(_next_token())

        # determine the lexicographer file name
        lexname_index = int(_next_token())
        synset._lexname = self._lexnames[lexname_index]

        # get the part of speech
        synset._pos = _next_token()

        # create Lemma objects for each lemma
        n_lemmas = int(_next_token(), 16)
        for _ in range(n_lemmas):
            # get the lemma name
            lemma_name = _next_token()
            # get the lex_id (used for sense_keys)
            lex_id = int(_next_token(), 16)
            # If the lemma has a syntactic marker, extract it.
            m = re.match(r'(.*?)(\(.*\))?$', lemma_name)
            lemma_name, syn_mark = m.groups()
            # create the lemma object
            lemma = Lemma(self, synset, lemma_name, lexname_index, lex_id, syn_mark)
            synset._lemmas.append(lemma)
            synset._lemma_names.append(lemma._name)

        # collect the pointer tuples
        n_pointers = int(_next_token())
        for _ in range(n_pointers):
            symbol = _next_token()
            offset = int(_next_token())
            # named target_pos so the ``pos`` parameter is not shadowed
            target_pos = _next_token()
            lemma_ids_str = _next_token()
            if lemma_ids_str == '0000':
                # '0000' marks a synset-to-synset pointer
                synset._pointers[symbol].add((target_pos, offset))
            else:
                # otherwise two hex digits each give the 1-based source
                # and target lemma positions
                source_index = int(lemma_ids_str[:2], 16) - 1
                target_index = int(lemma_ids_str[2:], 16) - 1
                source_lemma_name = synset._lemmas[source_index]._name
                lemma_pointers = synset._lemma_pointers
                tups = lemma_pointers[source_lemma_name, symbol]
                tups.append((target_pos, offset, target_index))

        # read the verb frames
        try:
            frame_count = int(_next_token())
        except StopIteration:
            # no more tokens: this line carries no verb frames
            pass
        else:
            for _ in range(frame_count):
                # read the plus sign
                plus = _next_token()
                assert plus == '+'
                # read the frame and lemma number
                frame_number = int(_next_token())
                frame_string_fmt = VERB_FRAME_STRINGS[frame_number]
                lemma_number = int(_next_token(), 16)
                # lemma number of 00 means all words in the synset
                if lemma_number == 0:
                    synset._frame_ids.append(frame_number)
                    for lemma in synset._lemmas:
                        lemma._frame_ids.append(frame_number)
                        lemma._frame_strings.append(frame_string_fmt % lemma._name)
                # only a specific word in the synset
                else:
                    lemma = synset._lemmas[lemma_number - 1]
                    lemma._frame_ids.append(frame_number)
                    lemma._frame_strings.append(frame_string_fmt % lemma._name)

    # raise a more informative error with line text
    except ValueError as e:
        raise WordNetError('line %r: %s' % (data_file_line, e))

    # set sense keys for Lemma objects - note that this has to be
    # done afterwards so that the relations are available
    for lemma in synset._lemmas:
        if synset._pos == ADJ_SAT:
            head_lemma = synset.similar_tos()[0]._lemmas[0]
            head_name = head_lemma._name
            head_id = '%02d' % head_lemma._lex_id
        else:
            head_name = head_id = ''
        tup = (
            lemma._name,
            WordNetCorpusReader._pos_numbers[synset._pos],
            lemma._lexname_index,
            lemma._lex_id,
            head_name,
            head_id,
        )
        lemma._key = ('%s%%%d:%02d:%02d:%s:%s' % tup).lower()

    # the canonical name is based on the first lemma
    lemma_name = synset._lemmas[0]._name.lower()
    offsets = self._lemma_pos_offset_map[lemma_name][synset._pos]
    sense_index = offsets.index(synset._offset)
    tup = lemma_name, synset._pos, sense_index + 1
    synset._name = '%s.%s.%02i' % tup

    return synset
|
||
|
||
def synset_from_sense_key(self, sense_key):
    """
    Retrieves synset based on a given sense_key. Sense keys can be
    obtained from lemma.key()

    From https://wordnet.princeton.edu/documentation/senseidx5wn:
    A sense_key is represented as:
        lemma % lex_sense (e.g. 'dog%1:18:01::')
    where lex_sense is encoded as:
        ss_type:lex_filenum:lex_id:head_word:head_id

    lemma:       ASCII text of word/collocation, in lower case
    ss_type:     synset type for the sense (1 digit int)
                 The synset type is encoded as follows:
                 1    NOUN
                 2    VERB
                 3    ADJECTIVE
                 4    ADVERB
                 5    ADJECTIVE SATELLITE
    lex_filenum: name of lexicographer file containing the synset for the sense (2 digit int)
    lex_id:      when paired with lemma, uniquely identifies a sense in the lexicographer file (2 digit int)
    head_word:   lemma of the first word in satellite's head synset
                 Only used if sense is in an adjective satellite synset
    head_id:     uniquely identifies sense in a lexicographer file when paired with head_word
                 Only used if head_word is present (2 digit int)

    The key is validated here and then resolved through
    ``lemma_from_key``.  ``lex_id`` is an identifier within a
    lexicographer file, not the sense number in a synset name, so it
    cannot be used to build a ``lemma.pos.NN`` name.

    :raises WordNetError: if the key is malformed, a field is invalid,
        or the key cannot be resolved.
    """
    match = re.match(r"(.*)\%(.*):(.*):(.*):(.*):(.*)", sense_key)
    if match is None:
        raise WordNetError(
            "sense key {!r} is not of the form "
            "lemma%ss_type:lex_filenum:lex_id:head_word:head_id".format(sense_key)
        )
    lemma, ss_type, _, lex_id, _, _ = match.groups()

    def _as_int(text):
        # Non-numeric fields are reported as invalid instead of crashing.
        try:
            return int(text)
        except ValueError:
            return None

    # check that information extracted from sense_key is valid
    error = None
    if not lemma:
        error = "lemma"
    elif _as_int(ss_type) not in self._pos_names:
        error = "ss_type"
    else:
        lex_id_number = _as_int(lex_id)
        if lex_id_number is None or not 0 <= lex_id_number <= 99:
            error = "lex_id"
    if error:
        raise WordNetError(
            "valid {} could not be extracted from the sense key".format(error)
        )

    return self.lemma_from_key(sense_key).synset()
|
||
|
||
#############################################################
|
||
# Retrieve synsets and lemmas.
|
||
#############################################################
|
||
|
||
def synsets(self, lemma, pos=None, lang='eng', check_exceptions=True):
    """Load all synsets with a given lemma and part of speech tag.
    If no pos is specified, all synsets for all parts of speech
    will be loaded.
    If lang is specified, all the synsets associated with the lemma name
    of that language will be returned.

    The lemma is lower-cased first.  For English the base forms come
    from ``_morphy`` (``check_exceptions`` is forwarded to it); for other
    languages the cached language data is consulted directly.
    """
    lemma = lemma.lower()

    if lang != 'eng':
        self._load_lang_data(lang)
        ids_by_lemma = self._lang_data[lang][1]
        if lemma not in ids_by_lemma:
            return []
        # the last character of each id is its part of speech
        return [
            self.of2ss(synset_id)
            for synset_id in ids_by_lemma[lemma]
            if pos is None or synset_id[-1] == pos
        ]

    load = self.synset_from_pos_and_offset
    index = self._lemma_pos_offset_map
    tags = POS_LIST if pos is None else pos
    found = []
    for tag in tags:
        for form in self._morphy(lemma, tag, check_exceptions):
            for offset in index[form].get(tag, []):
                found.append(load(tag, offset))
    return found
|
||
|
||
def lemmas(self, lemma, pos=None, lang='eng'):
    """Return all Lemma objects with a name matching the specified lemma
    name and part of speech tag. Matches any part of speech tag if none is
    specified.

    Names are compared in lower case."""
    lemma = lemma.lower()

    if lang == 'eng':
        matches = []
        for synset in self.synsets(lemma, pos):
            for candidate in synset.lemmas():
                if candidate.name().lower() == lemma:
                    matches.append(candidate)
        return matches

    self._load_lang_data(lang)
    # For other languages the part of speech is filtered on the synset.
    return [
        candidate
        for synset in self.synsets(lemma, lang=lang)
        if pos is None or synset.pos() == pos
        for candidate in synset.lemmas(lang=lang)
        if candidate.name().lower() == lemma
    ]
|
||
|
||
def all_lemma_names(self, pos=None, lang='eng'):
    """Return all lemma names for all synsets for the given
    part of speech tag and language or languages. If pos is
    not specified, all synsets for all parts of speech will
    be used.

    The result is an iterator: over the English index for 'eng', or over
    the de-duplicated names from the cached language data otherwise."""
    if lang != 'eng':
        self._load_lang_data(lang)
        names_by_id = self._lang_data[lang][0]
        # the last character of each id is its part of speech
        collected = [
            name
            for synset_id in names_by_id
            if pos is None or synset_id[-1] == pos
            for name in names_by_id[synset_id]
        ]
        return iter(set(collected))

    index = self._lemma_pos_offset_map
    if pos is None:
        return iter(index)
    return (name for name in index if pos in index[name])
|
||
|
||
def all_synsets(self, pos=None):
    """Iterate over all synsets with a given part of speech tag.
    If no pos is specified, all synsets for all parts of speech
    will be loaded.

    Adjective satellites live in the adjective data file: asking for
    ADJ_SAT yields only the satellites from that file, while asking for
    ADJ yields every synset in it (satellites included).
    """
    if pos is None:
        pos_tags = self._FILEMAP.keys()
    else:
        pos_tags = [pos]

    cache = self._synset_offset_cache
    from_pos_and_line = self._synset_from_pos_and_line

    # generate all synsets for each part of speech
    for pos_tag in pos_tags:
        # Open the file for reading.  Note that we can not re-use
        # the file pointers from self._data_file_map here, because
        # we're defining an iterator, and those file pointers might
        # be moved while we're not looking.
        satellites_only = pos_tag == ADJ_SAT
        file_pos = ADJ if satellites_only else pos_tag
        fileid = 'data.%s' % self._FILEMAP[file_pos]
        data_file = self.open(fileid)

        try:
            # generate synsets for each line in the POS file
            offset = data_file.tell()
            line = data_file.readline()
            while line:
                # lines starting with whitespace are not synset entries
                if not line[0].isspace():
                    if offset in cache[file_pos]:
                        # See if the synset is cached
                        synset = cache[file_pos][offset]
                    else:
                        # Otherwise, parse the line
                        synset = from_pos_and_line(file_pos, line)
                        cache[file_pos][offset] = synset

                    # when satellites were requested, skip the plain
                    # adjectives that share the file; for all other POS
                    # tags, yield all synsets
                    if not satellites_only or synset._pos == ADJ_SAT:
                        yield synset
                offset = data_file.tell()
                line = data_file.readline()

        # close the extra file handle we opened, whether the iteration
        # finished, failed, or the generator was closed early
        finally:
            data_file.close()
|
||
|
||
def words(self, lang='eng'):
    """Return the lemma names of the given language, exactly as produced
    by ``all_lemma_names(lang=lang)``."""
    names = self.all_lemma_names(lang=lang)
    return names
|
||
|
||
def license(self, lang='eng'):
    """Return the contents of LICENSE (for omw)
    use lang=lang to get the license for an individual language

    :raises WordNetError: for a language loaded from a user-provided tab
        file, or for an unknown language.
    """
    if lang == 'eng':
        stream = self.open("LICENSE")
    elif lang in self.langs():
        stream = self._omw_reader.open("{}/LICENSE".format(lang))
    elif lang == 'omw':
        # under the assumption you don't mean Omwunra-Toqura
        stream = self._omw_reader.open("LICENSE")
    elif lang in self._lang_data:
        raise WordNetError("Cannot determine license for user-provided tab file")
    else:
        raise WordNetError("Language is not supported.")

    # Read everything, then release the handle instead of leaking it.
    try:
        return stream.read()
    finally:
        stream.close()
|
||
|
||
def readme(self, lang='omw'):
    """Return the contents of README (for omw)
    use lang=lang to get the readme for an individual language

    :raises WordNetError: for a language loaded from a user-provided tab
        file, or for an unknown language.
    """
    if lang == 'eng':
        stream = self.open("README")
    elif lang in self.langs():
        stream = self._omw_reader.open("{}/README".format(lang))
    elif lang == 'omw':
        # under the assumption you don't mean Omwunra-Toqura
        stream = self._omw_reader.open("README")
    elif lang in self._lang_data:
        raise WordNetError("No README for user-provided tab file")
    else:
        raise WordNetError("Language is not supported.")

    # Read everything, then release the handle instead of leaking it.
    try:
        return stream.read()
    finally:
        stream.close()
|
||
|
||
def citation(self, lang='omw'):
    """Return the contents of citation.bib file (for omw)
    use lang=lang to get the citation for an individual language

    :raises WordNetError: for a language loaded from a user-provided tab
        file, or for an unknown language.
    """
    if lang == 'eng':
        stream = self.open("citation.bib")
    elif lang in self.langs():
        stream = self._omw_reader.open("{}/citation.bib".format(lang))
    elif lang == 'omw':
        # under the assumption you don't mean Omwunra-Toqura
        stream = self._omw_reader.open("citation.bib")
    elif lang in self._lang_data:
        raise WordNetError("citation not known for user-provided tab file")
    else:
        raise WordNetError("Language is not supported.")

    # Read everything, then release the handle instead of leaking it.
    try:
        return stream.read()
    finally:
        stream.close()
|
||
|
||
#############################################################
|
||
# Misc
|
||
#############################################################
|
||
def lemma_count(self, lemma):
    """Return the frequency count for this Lemma.

    Non-English lemmas, and keys missing from the count file, give 0."""
    # Currently, counts are only available for English
    if lemma._lang != 'eng':
        return 0
    # open the count file if we haven't already
    if self._key_count_file is None:
        self._key_count_file = self.open('cntlist.rev')
    # find the key in the counts file and return the count, which is
    # the text after the last space on the matching line
    entry = _binary_search_file(self._key_count_file, lemma._key)
    if not entry:
        return 0
    return int(entry.rsplit(' ', 1)[-1])
|
||
|
||
def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
    # Reader-level convenience: the computation lives on the synset.
    compare = synset1.path_similarity
    return compare(synset2, verbose, simulate_root)

# Share the documentation of the Synset method.
path_similarity.__doc__ = Synset.path_similarity.__doc__
|
||
|
||
def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
    # Reader-level convenience: the computation lives on the synset.
    compare = synset1.lch_similarity
    return compare(synset2, verbose, simulate_root)

# Share the documentation of the Synset method.
lch_similarity.__doc__ = Synset.lch_similarity.__doc__
|
||
|
||
def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True):
    # Reader-level convenience: the computation lives on the synset.
    compare = synset1.wup_similarity
    return compare(synset2, verbose, simulate_root)

# Share the documentation of the Synset method.
wup_similarity.__doc__ = Synset.wup_similarity.__doc__
|
||
|
||
def res_similarity(self, synset1, synset2, ic, verbose=False):
    # Reader-level convenience: the computation lives on the synset.
    compare = synset1.res_similarity
    return compare(synset2, ic, verbose)

# Share the documentation of the Synset method.
res_similarity.__doc__ = Synset.res_similarity.__doc__
|
||
|
||
def jcn_similarity(self, synset1, synset2, ic, verbose=False):
    # Reader-level convenience: the computation lives on the synset.
    compare = synset1.jcn_similarity
    return compare(synset2, ic, verbose)

# Share the documentation of the Synset method.
jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__
|
||
|
||
def lin_similarity(self, synset1, synset2, ic, verbose=False):
    # Reader-level convenience: the computation lives on the synset.
    compare = synset1.lin_similarity
    return compare(synset2, ic, verbose)

# Share the documentation of the Synset method.
lin_similarity.__doc__ = Synset.lin_similarity.__doc__
|
||
|
||
#############################################################
|
||
# Morphy
|
||
#############################################################
|
||
# Morphy, adapted from Oliver Steele's pywordnet
|
||
def morphy(self, form, pos=None, check_exceptions=True):
    """
    Find a possible base form for the given form, with the given
    part of speech, by checking WordNet's list of exceptional
    forms, and by recursively stripping affixes for this part of
    speech until a form in WordNet is found.

    >>> from nltk.corpus import wordnet as wn
    >>> print(wn.morphy('dogs'))
    dog
    >>> print(wn.morphy('churches'))
    church
    >>> print(wn.morphy('aardwolves'))
    aardwolf
    >>> print(wn.morphy('abaci'))
    abacus
    >>> wn.morphy('hardrock', wn.ADV)
    >>> print(wn.morphy('book', wn.NOUN))
    book
    >>> wn.morphy('book', wn.ADJ)

    :param form: the word form to analyse.
    :param pos: part of speech to try; when ``None`` every tag in
        ``POS_LIST`` is tried in order.
    :param check_exceptions: forwarded to ``_morphy`` in both cases, so
        the exception lists can also be skipped when ``pos`` is ``None``.
    :return: the first base form found, or ``None``.
    """
    if pos is None:
        # Lazy: later parts of speech are only analysed if the earlier
        # ones produce nothing.
        analyses = (
            analysis
            for tag in POS_LIST
            for analysis in self._morphy(form, tag, check_exceptions)
        )
    else:
        analyses = self._morphy(form, pos, check_exceptions)

    # get the first one we find
    return next(iter(analyses), None)
|
||
|
||
# Suffix rewrite rules per part of speech.  Each rule is an
# (old suffix, replacement) pair; ``_morphy`` applies every rule whose
# old suffix ends the form.
MORPHOLOGICAL_SUBSTITUTIONS = {
    NOUN: [
        ('s', ''), ('ses', 's'), ('ves', 'f'),
        ('xes', 'x'), ('zes', 'z'),
        ('ches', 'ch'), ('shes', 'sh'),
        ('men', 'man'), ('ies', 'y'),
    ],
    VERB: [
        ('s', ''), ('ies', 'y'),
        ('es', 'e'), ('es', ''),
        ('ed', 'e'), ('ed', ''),
        ('ing', 'e'), ('ing', ''),
    ],
    ADJ: [
        ('er', ''),
        ('est', ''),
        ('er', 'e'),
        ('est', 'e'),
    ],
    # adverbs have no suffix rules
    ADV: [],
}

# Adjective satellites share the adjective rule list (same list object).
MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ]
|
||
|
||
def _morphy(self, form, pos, check_exceptions=True):
    """
    Return the base forms of ``form`` that are indexed for ``pos``.

    from jordanbg:
    Given an original string x
    1. Apply rules once to the input to get y1, y2, y3, etc.
    2. Return all that are in the database
    3. If there are no matches, keep applying rules until you either
       find a match or you can't go any further

    :param check_exceptions: when true and ``form`` is in the exception
        table for ``pos``, only the form and its listed exceptions are
        considered.
    :return: a list of matching forms without duplicates, possibly empty.
    """
    exceptions = self._exception_map[pos]
    substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos]
    index = self._lemma_pos_offset_map

    def strip_suffixes(words):
        # One rewritten word per (word, matching rule) pair.
        stripped = []
        for word in words:
            for suffix, replacement in substitutions:
                if word.endswith(suffix):
                    stripped.append(word[: -len(suffix)] + replacement)
        return stripped

    def known_forms(words):
        # Keep, in order and once each, the words indexed for ``pos``.
        kept = []
        already_kept = set()
        for word in words:
            if word in already_kept:
                continue
            if word in index and pos in index[word]:
                kept.append(word)
                already_kept.add(word)
        return kept

    # 0. Check the exception lists
    if check_exceptions and form in exceptions:
        return known_forms([form] + exceptions[form])

    # 1. Apply rules once to the input to get y1, y2, y3, etc.
    candidates = strip_suffixes([form])

    # 2. Return all that are in the database (and check the original too)
    matches = known_forms([form] + candidates)

    # 3. If there are no matches, keep applying rules until we find a
    #    match or run out of candidates (then the result is empty)
    while not matches and candidates:
        candidates = strip_suffixes(candidates)
        matches = known_forms(candidates)
    return matches
|
||
|
||
#############################################################
|
||
# Create information content from corpus
|
||
#############################################################
|
||
def ic(self, corpus, weight_senses_equally=False, smoothing=1.0):
    """
    Build an information content lookup dictionary from a corpus.

    :type corpus: CorpusReader
    :param corpus: The corpus whose word counts are turned into an
        information content dictionary.
    :type weight_senses_equally: bool
    :param weight_senses_equally: When False, a word's count is divided
        by its number of possible senses before being credited to each
        sense; when True, every sense is credited with the full count.
        (For a word with 3 senses, each sense gets 0.3333 per appearance
        when this is False, 1.0 when it is True.)
    :type smoothing: float
    :param smoothing: Initial count given to every synset; applied only
        when greater than zero (default is 1.0)
    :return: An information content dictionary
    """
    word_freqs = FreqDist()
    for token in corpus.words():
        word_freqs[token] += 1

    table = {tag: defaultdict(float) for tag in POS_LIST}

    def bucket_for(synset):
        # Satellite adjectives are counted together with adjectives.
        tag = synset._pos
        return ADJ if tag == ADJ_SAT else tag

    # Seed every synset with the smoothing value.
    if smoothing > 0.0:
        for synset in self.all_synsets():
            table[bucket_for(synset)][synset._offset] = smoothing

    for token in word_freqs:
        senses = self.synsets(token)
        if not senses:
            continue

        # Share of this word's count credited to each of its senses.
        share = float(word_freqs[token])
        if not weight_senses_equally:
            share /= float(len(senses))

        for sense in senses:
            per_pos = table[bucket_for(sense)]
            # Credit every synset yielded by the hypernym-list iterator.
            for ancestor in chain.from_iterable(sense._iter_hypernym_lists()):
                per_pos[ancestor._offset] += share
            # Key 0 holds the running total for the (virtual) root.
            per_pos[0] += share
    return table
|
||
|
||
def custom_lemmas(self, tab_file, lang):
    """
    Reads a custom tab file containing mappings of lemmas in the given
    language to Princeton WordNet 3.0 synset offsets, allowing NLTK's
    WordNet functions to then be used with that language.

    See the "Tab files" section at http://compling.hss.ntu.edu.sg/omw/ for
    documentation on the Multilingual WordNet tab file format.

    Lines starting with ``#`` and blank lines are ignored. Every other
    line must consist of exactly three tab-separated fields; only the
    first (offset and part of speech) and the third (lemma) are used.

    :param tab_file: Tab file as a file or file-like object
    :type lang: str
    :param lang: ISO 639-3 code of the language of the tab file
    :raises ValueError: if ``lang`` is not three characters long, or if a
        data line does not split into exactly three tab-separated fields
    """
    if len(lang) != 3:
        raise ValueError('lang should be a (3 character) ISO 639-3 code')
    offset_to_lemmas = defaultdict(list)
    lemma_to_offsets = defaultdict(list)
    self._lang_data[lang] = [offset_to_lemmas, lemma_to_offsets]
    for line in tab_file.readlines():
        if isinstance(line, bytes):
            # Support byte-stream files (e.g. as returned by Python 2's
            # open() function) as well as text-stream ones
            line = line.decode('utf-8')
        if line.startswith('#'):
            continue
        record = line.strip()
        if not record:
            # Blank lines (typically a trailing newline at the end of the
            # file) carry no data and would otherwise fail to unpack.
            continue
        offset_pos, lemma_type, lemma = record.split('\t')
        lemma = lemma.strip().replace(' ', '_')
        offset_to_lemmas[offset_pos].append(lemma)
        lemma_to_offsets[lemma.lower()].append(offset_pos)
    # Make sure no more entries are accidentally added subsequently
    offset_to_lemmas.default_factory = None
    lemma_to_offsets.default_factory = None
|
||
|
||
|
||
######################################################################
|
||
# WordNet Information Content Corpus Reader
|
||
######################################################################
|
||
|
||
|
||
class WordNetICCorpusReader(CorpusReader):
    """
    A corpus reader for the WordNet information content corpus.
    """

    def __init__(self, root, fileids):
        CorpusReader.__init__(self, root, fileids, encoding='utf8')

    # this load function would be more efficient if the data was pickled
    # Note that we can't use NLTK's frequency distributions because
    # synsets are overlapping (each instance of a synset also counts
    # as an instance of its hypernyms)
    def ic(self, icfile):
        """
        Load an information content file from the wordnet_ic corpus
        and return a dictionary.  This dictionary has just two keys,
        NOUN and VERB, whose values are dictionaries that map from
        integer synset offsets to the values read from the file.  The
        values of records marked ``ROOT`` are additionally summed under
        key ``0`` of the matching part of speech.

        The first line of the file is treated as a header and skipped,
        as are blank lines.

        :type icfile: str
        :param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat")
        :return: An information content dictionary
        """
        ic = {NOUN: defaultdict(float), VERB: defaultdict(float)}
        stream = self.open(icfile)
        try:
            for num, line in enumerate(stream):
                if num == 0:  # skip the header
                    continue
                fields = line.split()
                if not fields:
                    # Blank line: nothing to record.
                    continue
                # The identifier is the offset followed by a one-letter
                # part-of-speech suffix.
                offset = int(fields[0][:-1])
                value = float(fields[1])
                pos = _get_pos(fields[0])
                if len(fields) == 3 and fields[2] == "ROOT":
                    # Store root count.
                    ic[pos][0] += value
                if value != 0:
                    ic[pos][offset] = value
        finally:
            # Release the underlying file even if a record fails to parse.
            stream.close()
        return ic
|
||
|
||
|
||
######################################################################
|
||
# Similarity metrics
|
||
######################################################################
|
||
|
||
# TODO: Add in the option to manually add a new root node; this will be
|
||
# useful for verb similarity as there exist multiple verb taxonomies.
|
||
|
||
# More information about the metrics is available at
|
||
# http://marimba.d.umn.edu/similarity/measures.html
|
||
|
||
|
||
def path_similarity(synset1, synset2, verbose=False, simulate_root=True):
    # Module-level convenience form: delegates to the method of the same
    # name on ``synset1``, forwarding the remaining arguments positionally.
    measure = synset1.path_similarity
    return measure(synset2, verbose, simulate_root)
|
||
|
||
|
||
def lch_similarity(synset1, synset2, verbose=False, simulate_root=True):
    # Module-level convenience form: delegates to the method of the same
    # name on ``synset1``, forwarding the remaining arguments positionally.
    measure = synset1.lch_similarity
    return measure(synset2, verbose, simulate_root)
|
||
|
||
|
||
def wup_similarity(synset1, synset2, verbose=False, simulate_root=True):
    # Module-level convenience form: delegates to the method of the same
    # name on ``synset1``, forwarding the remaining arguments positionally.
    measure = synset1.wup_similarity
    return measure(synset2, verbose, simulate_root)
|
||
|
||
|
||
def res_similarity(synset1, synset2, ic, verbose=False):
    # Module-level convenience form of the ``res_similarity`` method on
    # ``synset1``.  ``ic`` is the information content dictionary and must
    # be forwarded: previously it was dropped and ``verbose`` was passed
    # in its position.
    return synset1.res_similarity(synset2, ic, verbose)
|
||
|
||
|
||
def jcn_similarity(synset1, synset2, ic, verbose=False):
    # Module-level convenience form of the ``jcn_similarity`` method on
    # ``synset1``.  ``ic`` is the information content dictionary and must
    # be forwarded: previously it was dropped and ``verbose`` was passed
    # in its position.
    return synset1.jcn_similarity(synset2, ic, verbose)
|
||
|
||
|
||
def lin_similarity(synset1, synset2, ic, verbose=False):
    # Module-level convenience form of the ``lin_similarity`` method on
    # ``synset1``.  ``ic`` is the information content dictionary and must
    # be forwarded: previously it was dropped and ``verbose`` was passed
    # in its position.
    return synset1.lin_similarity(synset2, ic, verbose)
|
||
|
||
|
||
# Give each module-level similarity function the docstring of the Synset
# method that carries the same name.
for _similarity_fn in (
    path_similarity,
    lch_similarity,
    wup_similarity,
    res_similarity,
    jcn_similarity,
    lin_similarity,
):
    _similarity_fn.__doc__ = getattr(Synset, _similarity_fn.__name__).__doc__
# Do not leave the loop variable behind in the module namespace.
del _similarity_fn
|
||
|
||
|
||
def _lcs_ic(synset1, synset2, ic, verbose=False):
    """
    Compute the information content of two synsets and of their most
    informative common subsumer, i.e. the common hypernym with the
    highest information content value.  When the two synsets share no
    explicit common hypernym, the subsumer's information content is
    taken to be 0, as if both hung off an artificial root node that is
    the hypernym of all explicit roots.

    :type synset1: Synset
    :param synset1: First input synset.
    :type synset2: Synset
    :param synset2: Second input synset.  Must be the same part of
        speech as the first synset.
    :type ic: dict
    :param ic: an information content dictionary, keyed by part of speech.
    :type verbose: bool
    :param verbose: if true, print the subsumer's information content.
    :return: A 3-tuple: the information content of ``synset1``, of
        ``synset2``, and of their most informative subsumer
    :raises WordNetError: if the synsets differ in part of speech
    """
    if synset1._pos != synset2._pos:
        raise WordNetError(
            'Computing the least common subsumer requires '
            '%s and %s to have the same part of speech.' % (synset1, synset2)
        )

    first_ic = information_content(synset1, ic)
    second_ic = information_content(synset2, ic)

    shared = synset1.common_hypernyms(synset2)
    # No shared hypernym: fall back to the artificial root's value of 0.
    best_shared_ic = (
        max(information_content(hypernym, ic) for hypernym in shared)
        if shared
        else 0
    )

    if verbose:
        print("> LCS Subsumer by content:", best_shared_ic)

    return first_ic, second_ic, best_shared_ic
|
||
|
||
|
||
# Utility functions
|
||
|
||
|
||
def information_content(synset, ic):
    """
    Return the information content of ``synset`` according to ``ic``.

    The value is ``-log(count / root_count)``, where ``count`` is the
    entry stored under the synset's offset and ``root_count`` is the
    entry stored under key ``0`` for the same part of speech.

    :type synset: Synset
    :param synset: The synset to look up (its ``_pos`` and ``_offset``
        attributes are used).
    :type ic: dict
    :param ic: an information content dictionary, keyed by part of speech.
    :return: The information content, or ``_INF`` when the synset's count
        is zero.
    :raises WordNetError: if ``ic`` has no entries for the synset's part
        of speech
    """
    pos = synset._pos
    # The corpus reader's ``ic()`` method files satellite-adjective counts
    # under ADJ, so fall back to ADJ when ``ic`` has no ADJ_SAT entry of
    # its own instead of reporting the part of speech as missing.
    if pos == ADJ_SAT and pos not in ic:
        pos = ADJ
    try:
        icpos = ic[pos]
    except KeyError:
        msg = 'Information content file has no entries for part-of-speech: %s'
        raise WordNetError(msg % synset._pos)

    counts = icpos[synset._offset]
    if counts == 0:
        return _INF
    else:
        return -math.log(counts / icpos[0])
|
||
|
||
|
||
# get the part of speech (NOUN or VERB) from the information content record
|
||
# (each identifier has a 'n' or 'v' suffix)
|
||
|
||
|
||
def _get_pos(field):
    """
    Map an information content record identifier to its part of speech,
    based on the identifier's final character.

    :param field: The record identifier, ending in ``'n'`` or ``'v'``.
    :return: NOUN for an ``'n'`` suffix, VERB for a ``'v'`` suffix.
    :raises ValueError: if the final character is neither ``'n'`` nor ``'v'``
    """
    suffix = field[-1]
    if suffix == 'n':
        return NOUN
    if suffix == 'v':
        return VERB
    raise ValueError(
        "Unidentified part of speech in WordNet Information Content file "
        "for field %s" % field
    )
|
||
|
||
|
||
# unload corpus after tests
|
||
def teardown_module(module=None):
    """
    Unload the ``wordnet`` corpus object once this module's tests are done.

    :param module: Unused; accepted so the function can be called with
        or without a module argument.
    """
    # Imported here rather than at module level, as in the original code.
    import nltk.corpus

    nltk.corpus.wordnet._unload()
|