549 lines
15 KiB
Python
549 lines
15 KiB
Python
|
# Natural Language Toolkit: Relation Extraction
|
||
|
#
|
||
|
# Copyright (C) 2001-2019 NLTK Project
|
||
|
# Author: Ewan Klein <ewan@inf.ed.ac.uk>
|
||
|
# URL: <http://nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
"""
|
||
|
Code for extracting relational triples from the ieer and conll2002 corpora.
|
||
|
|
||
|
Relations are stored internally as dictionaries ('reldicts').
|
||
|
|
||
|
The two serialization outputs are "rtuple" and "clause".
|
||
|
|
||
|
- An rtuple is a tuple of the form ``(subj, filler, obj)``,
|
||
|
where ``subj`` and ``obj`` are pairs of Named Entity mentions, and ``filler`` is the string of words
|
||
|
occurring between ``subj`` and ``obj`` (with no intervening NEs). Strings are printed via ``repr()`` to
|
||
|
circumvent locale variations in rendering utf-8 encoded strings.
|
||
|
- A clause is an atom of the form ``relsym(subjsym, objsym)``,
|
||
|
where the relation, subject and object have been canonicalized to single strings.
|
||
|
"""
|
||
|
from __future__ import print_function
|
||
|
|
||
|
# todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs?
|
||
|
|
||
|
from collections import defaultdict
|
||
|
import re
|
||
|
|
||
|
from six.moves import html_entities
|
||
|
|
||
|
# Dictionary that associates corpora with NE classes
|
||
|
# Named-entity class labels recognised for each supported corpus.
NE_CLASSES = dict(
    ieer=[
        'LOCATION',
        'ORGANIZATION',
        'PERSON',
        'DURATION',
        'DATE',
        'CARDINAL',
        'PERCENT',
        'MONEY',
        'MEASURE',
    ],
    conll2002=['LOC', 'PER', 'ORG'],
    ace=[
        'LOCATION',
        'ORGANIZATION',
        'PERSON',
        'DURATION',
        'DATE',
        'CARDINAL',
        'PERCENT',
        'MONEY',
        'MEASURE',
        'FACILITY',
        'GPE',
    ],
)

# Two-way mapping between abbreviated and full class labels.
short2long = {'LOC': 'LOCATION', 'ORG': 'ORGANIZATION', 'PER': 'PERSON'}
long2short = {'LOCATION': 'LOC', 'ORGANIZATION': 'ORG', 'PERSON': 'PER'}
|
||
|
|
||
|
|
||
|
def _expand(type):
    """
    Map an abbreviated NE class label onto its long form.

    Labels that have no entry in ``short2long`` are handed back unchanged.

    :type type: str
    :rtype: str
    """
    return short2long.get(type, type)
|
||
|
|
||
|
|
||
|
def class_abbrev(type):
    """
    Map a full NE class label onto its abbreviation.

    Labels that have no entry in ``long2short`` are handed back unchanged.

    :type type: str
    :rtype: str
    """
    if type in long2short:
        return long2short[type]
    return type
|
||
|
|
||
|
|
||
|
def _join(lst, sep=' ', untag=False):
|
||
|
"""
|
||
|
Join a list into a string, turning tags tuples into tag strings or just words.
|
||
|
:param untag: if ``True``, omit the tag from tagged input strings.
|
||
|
:type lst: list
|
||
|
:rtype: str
|
||
|
"""
|
||
|
try:
|
||
|
return sep.join(lst)
|
||
|
except TypeError:
|
||
|
if untag:
|
||
|
return sep.join(tup[0] for tup in lst)
|
||
|
from nltk.tag import tuple2str
|
||
|
|
||
|
return sep.join(tuple2str(tup) for tup in lst)
|
||
|
|
||
|
|
||
|
def descape_entity(m, defs=html_entities.entitydefs):
    """
    Look up the replacement text for one matched entity reference.

    Intended as the replacement callback for ``re.sub`` with a pattern
    whose first group captures the entity name, for example::

        pattern = re.compile(r"&(\w+?);")
        s = pattern.sub(descape_entity, 'mcglashan_&amp;_sarrail')

    Inspired by example from effbot.org

    :param m: a match object; group 1 is the entity name
    :param defs: mapping from entity names to their replacement text
    :return: ``defs[name]`` when the name is known, otherwise the whole
        matched text unchanged
    """
    try:
        replacement = defs[m.group(1)]
    except KeyError:
        # Unknown entity name: leave the reference exactly as it was.
        replacement = m.group(0)
    return replacement
|
||
|
|
||
|
|
||
|
# Matches an entity reference such as ``&amp;``; group 1 is the entity name.
# A raw string is required so that ``\w`` reaches the regex engine without
# triggering Python's invalid-escape-sequence warning.
_ENTITY_RE = re.compile(r"&(\w+?);")


def list2sym(lst):
    """
    Convert a list of strings into a canonical symbol.

    The items are joined with underscores (tags dropped for tagged
    tuples), lower-cased, entity references are replaced through
    ``descape_entity``, and full stops are removed.

    :type lst: list
    :return: a Unicode string without whitespace
    :rtype: unicode
    """
    sym = _join(lst, '_', untag=True).lower()
    sym = _ENTITY_RE.sub(descape_entity, sym)
    return sym.replace('.', '')
|
||
|
|
||
|
|
||
|
def tree2semi_rel(tree):
    """
    Split the daughters of a chunk tree into 'semi-relations'.

    Walking left to right, every ``Tree`` daughter closes one pair whose
    first member is the list of non-``Tree`` daughters seen since the
    previous ``Tree`` (possibly empty) and whose second member is that
    ``Tree``.  Non-``Tree`` daughters after the last ``Tree`` are not
    part of any pair.

    :param tree: a chunk tree
    :return: a list of pairs (list(str), ``Tree``)
    :rtype: list of tuple
    """
    from nltk.tree import Tree

    collected = []
    pending = []
    for child in tree:
        if isinstance(child, Tree):
            # A chunk ends the current run of plain tokens.
            collected.append([pending, child])
            pending = []
        else:
            pending.append(child)
    return collected
|
||
|
|
||
|
|
||
|
def semi_rel2reldict(pairs, window=5, trace=False):
    """
    Converts the pairs generated by ``tree2semi_rel`` into a 'reldict': a dictionary which
    stores information about the subject and object NEs plus the filler between them.
    Additionally, a left and right context of length =< window are captured (within
    a given input sentence).

    :param pairs: a list of (list(str), ``Tree``) pairs, as generated by ``tree2semi_rel``
    :param window: a threshold for the number of items to include in the left and right context
    :type window: int
    :param trace: if true, print the untagged filler and both NE classes of every
        reldict as it is built
    :return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext',
        'subjsym', 'filler', 'untagged_filler', 'objclass', 'objtext', 'objsym' and 'rcon'
    :rtype: list(defaultdict)
    """
    result = []
    # Slide a window of three consecutive pairs over the input instead of
    # repeatedly re-slicing ``pairs``: the first pair supplies the subject
    # and left context, the second the filler and object, the third the
    # right context.
    # NOTE(review): because a third pair is required, the final two NEs of
    # the input never form a relation -- confirm whether that is intended.
    for subj_pair, obj_pair, next_pair in zip(pairs, pairs[1:], pairs[2:]):
        reldict = defaultdict(str)
        reldict['lcon'] = _join(subj_pair[0][-window:])
        reldict['subjclass'] = subj_pair[1].label()
        reldict['subjtext'] = _join(subj_pair[1].leaves())
        reldict['subjsym'] = list2sym(subj_pair[1].leaves())
        reldict['filler'] = _join(obj_pair[0])
        reldict['untagged_filler'] = _join(obj_pair[0], untag=True)
        reldict['objclass'] = obj_pair[1].label()
        reldict['objtext'] = _join(obj_pair[1].leaves())
        reldict['objsym'] = list2sym(obj_pair[1].leaves())
        reldict['rcon'] = _join(next_pair[0][:window])
        if trace:
            print(
                "(%s(%s, %s)"
                % (
                    reldict['untagged_filler'],
                    reldict['subjclass'],
                    reldict['objclass'],
                )
            )
        result.append(reldict)
    return result
|
||
|
|
||
|
|
||
|
def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10):
    """
    Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.

    The parameters ``subjclass`` and ``objclass`` can be used to restrict the
    Named Entities to particular types (any of the labels listed for the
    chosen corpus in ``NE_CLASSES``; the abbreviations in ``short2long``
    are expanded when only the long form is listed).

    :param subjclass: the class of the subject Named Entity.
    :type subjclass: str
    :param objclass: the class of the object Named Entity.
    :type objclass: str
    :param doc: input document
    :type doc: ieer document or a list of chunk trees
    :param corpus: name of the corpus to take as input; possible values are
        'ace', 'ieer' and 'conll2002'
    :type corpus: str
    :param pattern: a regular expression for filtering the fillers of
        retrieved triples; it is applied with ``match``.  If ``None``, no
        filler pattern is applied.
    :type pattern: SRE_Pattern
    :param window: filters out fillers which exceed this threshold
    :type window: int
    :return: see ``semi_rel2reldict``
    :rtype: list(defaultdict)
    :raises ValueError: if a class label is not recognized for ``corpus``, or
        if ``corpus`` is not one of the supported names
    :raises KeyError: if a class label is given and ``corpus`` is not a key
        of ``NE_CLASSES``
    """

    if subjclass and subjclass not in NE_CLASSES[corpus]:
        if _expand(subjclass) in NE_CLASSES[corpus]:
            subjclass = _expand(subjclass)
        else:
            raise ValueError(
                "your value for the subject type has not been recognized: %s"
                % subjclass
            )
    if objclass and objclass not in NE_CLASSES[corpus]:
        if _expand(objclass) in NE_CLASSES[corpus]:
            objclass = _expand(objclass)
        else:
            raise ValueError(
                "your value for the object type has not been recognized: %s" % objclass
            )

    if corpus == 'ace' or corpus == 'conll2002':
        pairs = tree2semi_rel(doc)
    elif corpus == 'ieer':
        # IEER documents carry two chunk trees: the body text and the headline.
        pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
    else:
        raise ValueError("corpus type not recognized")

    reldicts = semi_rel2reldict(pairs)

    # ``pattern`` defaults to None, so only consult it when one was supplied;
    # calling ``pattern.match`` unconditionally raised AttributeError.
    return [
        rel
        for rel in reldicts
        if rel['subjclass'] == subjclass
        and len(rel['filler'].split()) <= window
        and (pattern is None or pattern.match(rel['filler']))
        and rel['objclass'] == objclass
    ]
|
||
|
|
||
|
|
||
|
def rtuple(reldict, lcon=False, rcon=False):
    """
    Render a reldict as an rtuple string.

    The core of the output is ``[SUBJCLASS: subjtext] filler [OBJCLASS: objtext]``
    with class labels abbreviated through ``class_abbrev`` and the texts
    shown via ``repr()``.

    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param lcon: if true, prefix the output with the left context
    :param rcon: if true, append the right context to the output
    :rtype: str
    """
    fields = [
        class_abbrev(reldict['subjclass']),
        reldict['subjtext'],
        reldict['filler'],
        class_abbrev(reldict['objclass']),
        reldict['objtext'],
    ]
    template = '[%s: %r] %r [%s: %r]'
    if lcon:
        fields.insert(0, reldict['lcon'])
        template = '...%r)' + template
    if rcon:
        fields.append(reldict['rcon'])
        template += '(%r...'
    return template % tuple(fields)
|
||
|
|
||
|
|
||
|
def clause(reldict, relsym):
    """
    Render the relation in clausal form, ``relsym(subjsym, objsym)``.

    The two symbols are read from the 'subjsym' and 'objsym' entries of
    ``reldict`` and shown via ``repr()``.

    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param relsym: a label for the relation
    :type relsym: str
    :rtype: str
    """
    subject_symbol = reldict['subjsym']
    object_symbol = reldict['objsym']
    return "%s(%r, %r)" % (relsym, subject_symbol, object_symbol)
|
||
|
|
||
|
|
||
|
#######################################################
|
||
|
# Demos of relation extraction with regular expressions
|
||
|
#######################################################
|
||
|
|
||
|
############################################
|
||
|
# Example of in(ORG, LOC)
|
||
|
############################################
|
||
|
def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with an
    intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded into
    an in-memory database, and subsequently pulled out using an SQL "SELECT"
    query.

    :param trace: if true, print each document number before its clauses
    :param sql: if true (and ``sqlite3`` can be imported), also store the
        pairs in an in-memory SQLite table and query it at the end
    """
    from nltk.corpus import ieer

    connection = None
    cur = None
    if sql:
        try:
            import sqlite3
        except ImportError:
            import warnings

            warnings.warn("Cannot import sqlite; sql flag will be ignored.")
            # Record the failure explicitly instead of relying on NameError
            # being raised (and swallowed) wherever ``cur`` is used later.
            sql = False
        else:
            connection = sqlite3.connect(":memory:")
            # ``sqlite3.OptimizedUnicode`` no longer exists on recent Python
            # versions; fall back to ``str`` where it is missing.
            connection.text_factory = getattr(sqlite3, 'OptimizedUnicode', str)
            cur = connection.cursor()
            cur.execute(
                """create table Locations
                (OrgName text, LocationName text, DocID text)"""
            )

    IN = re.compile(r'.*\bin\b(?!\b.+ing)')

    print()
    print("IEER: in(ORG, LOC) -- just the clauses:")
    print("=" * 45)

    try:
        for fileid in ieer.fileids():
            for doc in ieer.parsed_docs(fileid):
                if trace:
                    print(doc.docno)
                    print("=" * 15)
                for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
                    print(clause(rel, relsym='IN'))
                    if sql:
                        # Named ``row`` so the module-level ``rtuple``
                        # function is not shadowed.
                        row = (rel['subjtext'], rel['objtext'], doc.docno)
                        cur.execute(
                            """insert into Locations
                            values (?, ?, ?)""",
                            row,
                        )

        if sql:
            connection.commit()
            cur.execute(
                """select OrgName from Locations
                where LocationName = 'Atlanta'"""
            )
            print()
            print("Extract data from SQL table: ORGs in Atlanta")
            print("-" * 15)
            for row in cur:
                print(row)
    finally:
        # Release the in-memory database even if extraction fails part-way.
        if connection is not None:
            connection.close()
|
||
|
|
||
|
|
||
|
############################################
|
||
|
# Example of has_role(PER, ORG)
|
||
|
############################################
|
||
|
|
||
|
|
||
|
def roles_demo(trace=0):
    """
    Print rtuples for PER/ORG pairs in the IEER corpus whose filler matches
    a pattern of role words (e.g. "director", "spokesman").

    :param trace: if true, print each document number and include the left
        and right context in every rtuple
    """
    from nltk.corpus import ieer

    # Raw string: the pattern contains ``\s``, which is not a valid escape in
    # an ordinary string literal.  The text is compiled with re.VERBOSE, so
    # layout whitespace and ``#`` comments inside it are ignored.
    # NOTE(review): the outer group closes after "librarian", so the
    # alternatives from "manager" onwards are not wrapped in ``.* ... .*``
    # -- confirm whether that is intended before changing the pattern.
    roles = r"""
    (.*(                   # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s*        # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)

    for fileid in ieer.fileids():
        for doc in ieer.parsed_docs(fileid):
            lcon = rcon = False
            if trace:
                print(doc.docno)
                print("=" * 15)
                lcon = rcon = True
            for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES):
                print(rtuple(rel, lcon=lcon, rcon=rcon))
|
||
|
|
||
|
|
||
|
##############################################
|
||
|
### Show what's in the IEER Headlines
|
||
|
##############################################
|
||
|
|
||
|
|
||
|
def ieer_headlines():
    """
    Print the document number and headline tree of the first 20 documents
    in the IEER corpus.
    """
    from itertools import islice

    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    # Generate (docno, headline) pairs lazily so that only as many documents
    # as are actually displayed get parsed.
    headlines = (
        (doc.docno, doc.headline)
        for fileid in ieer.fileids()
        for doc in ieer.parsed_docs(fileid)
    )
    for entry in islice(headlines, 20):
        print()
        print("%s:\n%s" % entry)
|
||
|
|
||
|
|
||
|
#############################################
|
||
|
## Dutch CONLL2002: take_on_role(PER, ORG)
|
||
|
#############################################
|
||
|
|
||
|
|
||
|
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.

    :param trace: if true, include the left and right context in every rtuple
    """

    from nltk.corpus import conll2002

    vnv = """
    (
    is/V|    # 3rd sing present and
    was/V|   # past forms of the verb zijn ('be')
    werd/V|  # and also present
    wordt/V  # past of worden ('become)
    )
    .*       # followed by anything
    van/Prep # followed by van ('of')
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print()
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    # The context switches depend only on ``trace``, so settle them once.
    show_context = True if trace else False

    for sentence in conll2002.chunked_sents('ned.train'):
        relations = extract_rels(
            'PER', 'ORG', sentence, corpus='conll2002', pattern=VAN, window=10
        )
        for relation in relations:
            print(rtuple(relation, lcon=show_context, rcon=show_context))
|
||
|
|
||
|
|
||
|
#############################################
|
||
|
## Spanish CONLL2002: de(ORG, LOC)
|
||
|
#############################################
|
||
|
|
||
|
|
||
|
def conllesp():
    """
    Print the first 10 ``DE(ORG, LOC)`` clauses found in the Spanish
    training corpus from CoNLL 2002, i.e. ORG/LOC pairs whose filler
    matches 'de' or 'del' tagged as SP.
    """
    from itertools import islice

    from nltk.corpus import conll2002

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)
    # Extract lazily: only ten relations are shown, so stop reading the
    # corpus as soon as ten have been found.
    rels = (
        rel
        for doc in conll2002.chunked_sents('esp.train')
        for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=DE)
    )
    for r in islice(rels, 10):
        print(clause(r, relsym='DE'))
    print()
|
||
|
|
||
|
|
||
|
def ne_chunked():
    """
    Run the NLTK NE chunker over the first 1500 tagged Penn Treebank
    sentences and print every PER/ORG rtuple whose filler mentions one of
    a handful of role words, prefixed with the sentence index.
    """
    # Imported here so the function also works when this module is imported;
    # the name ``nltk`` is otherwise only bound when the file runs as a script.
    import nltk

    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(
        r'.*(chairman|president|trader|scientist|economist|analyst|partner).*'
    )
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))
|
||
|
|
||
|
|
||
|
if __name__ == '__main__':
    # Bind ``nltk`` at module level for demos that reference it as a global.
    import nltk
    from nltk.sem import relextract

    # The two IEER demos run with tracing switched off ...
    in_demo(trace=0)
    roles_demo(trace=0)
    # ... and the remaining demos run with their default arguments.
    for demo in (conllned, conllesp, ieer_headlines, ne_chunked):
        demo()
|