185 lines
6.4 KiB
Python
185 lines
6.4 KiB
Python
# Natural Language Toolkit: Lin's Thesaurus
|
|
#
|
|
# Copyright (C) 2001-2019 NLTK Project
|
|
# Author: Dan Blanchard <dblanchard@ets.org>
|
|
# URL: <http://nltk.org/>
|
|
# For license information, see LICENSE.txt
|
|
from __future__ import print_function
|
|
|
|
import re
|
|
from collections import defaultdict
|
|
from functools import reduce
|
|
|
|
from nltk.corpus.reader import CorpusReader
|
|
|
|
|
|
class LinThesaurusCorpusReader(CorpusReader):
|
|
""" Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin. """
|
|
|
|
# Compiled regular expression for extracting the key from the first line of each
|
|
# thesaurus entry
|
|
_key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+')
|
|
|
|
@staticmethod
|
|
def __defaultdict_factory():
|
|
''' Factory for creating defaultdict of defaultdict(dict)s '''
|
|
return defaultdict(dict)
|
|
|
|
def __init__(self, root, badscore=0.0):
|
|
'''
|
|
Initialize the thesaurus.
|
|
|
|
:param root: root directory containing thesaurus LISP files
|
|
:type root: C{string}
|
|
:param badscore: the score to give to words which do not appear in each other's sets of synonyms
|
|
:type badscore: C{float}
|
|
'''
|
|
|
|
super(LinThesaurusCorpusReader, self).__init__(root, r'sim[A-Z]\.lsp')
|
|
self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory)
|
|
self._badscore = badscore
|
|
for path, encoding, fileid in self.abspaths(
|
|
include_encoding=True, include_fileid=True
|
|
):
|
|
with open(path) as lin_file:
|
|
first = True
|
|
for line in lin_file:
|
|
line = line.strip()
|
|
# Start of entry
|
|
if first:
|
|
key = LinThesaurusCorpusReader._key_re.sub(r'\1', line)
|
|
first = False
|
|
# End of entry
|
|
elif line == '))':
|
|
first = True
|
|
# Lines with pairs of ngrams and scores
|
|
else:
|
|
split_line = line.split('\t')
|
|
if len(split_line) == 2:
|
|
ngram, score = split_line
|
|
self._thesaurus[fileid][key][ngram.strip('"')] = float(
|
|
score
|
|
)
|
|
|
|
def similarity(self, ngram1, ngram2, fileid=None):
|
|
'''
|
|
Returns the similarity score for two ngrams.
|
|
|
|
:param ngram1: first ngram to compare
|
|
:type ngram1: C{string}
|
|
:param ngram2: second ngram to compare
|
|
:type ngram2: C{string}
|
|
:param fileid: thesaurus fileid to search in. If None, search all fileids.
|
|
:type fileid: C{string}
|
|
:return: If fileid is specified, just the score for the two ngrams; otherwise,
|
|
list of tuples of fileids and scores.
|
|
'''
|
|
# Entries don't contain themselves, so make sure similarity between item and itself is 1.0
|
|
if ngram1 == ngram2:
|
|
if fileid:
|
|
return 1.0
|
|
else:
|
|
return [(fid, 1.0) for fid in self._fileids]
|
|
else:
|
|
if fileid:
|
|
return (
|
|
self._thesaurus[fileid][ngram1][ngram2]
|
|
if ngram2 in self._thesaurus[fileid][ngram1]
|
|
else self._badscore
|
|
)
|
|
else:
|
|
return [
|
|
(
|
|
fid,
|
|
(
|
|
self._thesaurus[fid][ngram1][ngram2]
|
|
if ngram2 in self._thesaurus[fid][ngram1]
|
|
else self._badscore
|
|
),
|
|
)
|
|
for fid in self._fileids
|
|
]
|
|
|
|
def scored_synonyms(self, ngram, fileid=None):
|
|
'''
|
|
Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram
|
|
|
|
:param ngram: ngram to lookup
|
|
:type ngram: C{string}
|
|
:param fileid: thesaurus fileid to search in. If None, search all fileids.
|
|
:type fileid: C{string}
|
|
:return: If fileid is specified, list of tuples of scores and synonyms; otherwise,
|
|
list of tuples of fileids and lists, where inner lists consist of tuples of
|
|
scores and synonyms.
|
|
'''
|
|
if fileid:
|
|
return self._thesaurus[fileid][ngram].items()
|
|
else:
|
|
return [
|
|
(fileid, self._thesaurus[fileid][ngram].items())
|
|
for fileid in self._fileids
|
|
]
|
|
|
|
def synonyms(self, ngram, fileid=None):
|
|
'''
|
|
Returns a list of synonyms for the current ngram.
|
|
|
|
:param ngram: ngram to lookup
|
|
:type ngram: C{string}
|
|
:param fileid: thesaurus fileid to search in. If None, search all fileids.
|
|
:type fileid: C{string}
|
|
:return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and
|
|
lists, where inner lists contain synonyms.
|
|
'''
|
|
if fileid:
|
|
return self._thesaurus[fileid][ngram].keys()
|
|
else:
|
|
return [
|
|
(fileid, self._thesaurus[fileid][ngram].keys())
|
|
for fileid in self._fileids
|
|
]
|
|
|
|
def __contains__(self, ngram):
|
|
'''
|
|
Determines whether or not the given ngram is in the thesaurus.
|
|
|
|
:param ngram: ngram to lookup
|
|
:type ngram: C{string}
|
|
:return: whether the given ngram is in the thesaurus.
|
|
'''
|
|
return reduce(
|
|
lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]),
|
|
self._fileids,
|
|
False,
|
|
)
|
|
|
|
|
|
######################################################################
|
|
# Demo
|
|
######################################################################
|
|
|
|
|
|
def demo():
|
|
from nltk.corpus import lin_thesaurus as thes
|
|
|
|
word1 = "business"
|
|
word2 = "enterprise"
|
|
print("Getting synonyms for " + word1)
|
|
print(thes.synonyms(word1))
|
|
|
|
print("Getting scored synonyms for " + word1)
|
|
print(thes.scored_synonyms(word1))
|
|
|
|
print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
|
|
print(thes.synonyms(word1, fileid="simN.lsp"))
|
|
|
|
print("Getting synonyms from simN.lsp (noun subsection) for " + word1)
|
|
print(thes.synonyms(word1, fileid="simN.lsp"))
|
|
|
|
print("Similarity score for %s and %s:" % (word1, word2))
|
|
print(thes.similarity(word1, word2))
|
|
|
|
|
|
if __name__ == '__main__':
|
|
demo()
|