# Natural Language Toolkit: Dependency Corpus Reader
#
# Copyright (C) 2001-2019 NLTK Project
# Author: Kepa Sarasola <kepa.sarasola@ehu.es>
#         Iker Manterola <returntothehangar@hotmail.com>
#
# URL: <http://nltk.org/>
# For license information, see LICENSE.TXT

import codecs
|
||
|
|
||
|
from nltk.parse import DependencyGraph
|
||
|
from nltk.tokenize import *
|
||
|
|
||
|
from nltk.corpus.reader.util import *
|
||
|
from nltk.corpus.reader.api import *
|
||
|
|
||
|
|
||
|
class DependencyCorpusReader(SyntaxCorpusReader):
|
||
|
def __init__(
|
||
|
self,
|
||
|
root,
|
||
|
fileids,
|
||
|
encoding='utf8',
|
||
|
word_tokenizer=TabTokenizer(),
|
||
|
sent_tokenizer=RegexpTokenizer('\n', gaps=True),
|
||
|
para_block_reader=read_blankline_block,
|
||
|
):
|
||
|
# FIXME: Why is it inheritting from SyntaxCorpusReader but initializing
|
||
|
# from CorpusReader?
|
||
|
CorpusReader.__init__(self, root, fileids, encoding)
|
||
|
|
||
|
#########################################################
|
||
|
|
||
|
def raw(self, fileids=None):
|
||
|
"""
|
||
|
:return: the given file(s) as a single string.
|
||
|
:rtype: str
|
||
|
"""
|
||
|
result = []
|
||
|
for fileid, encoding in self.abspaths(fileids, include_encoding=True):
|
||
|
if isinstance(fileid, PathPointer):
|
||
|
result.append(fileid.open(encoding=encoding).read())
|
||
|
else:
|
||
|
with codecs.open(fileid, "r", encoding) as fp:
|
||
|
result.append(fp.read())
|
||
|
return concat(result)
|
||
|
|
||
|
def words(self, fileids=None):
|
||
|
return concat(
|
||
|
[
|
||
|
DependencyCorpusView(fileid, False, False, False, encoding=enc)
|
||
|
for fileid, enc in self.abspaths(fileids, include_encoding=True)
|
||
|
]
|
||
|
)
|
||
|
|
||
|
def tagged_words(self, fileids=None):
|
||
|
return concat(
|
||
|
[
|
||
|
DependencyCorpusView(fileid, True, False, False, encoding=enc)
|
||
|
for fileid, enc in self.abspaths(fileids, include_encoding=True)
|
||
|
]
|
||
|
)
|
||
|
|
||
|
def sents(self, fileids=None):
|
||
|
return concat(
|
||
|
[
|
||
|
DependencyCorpusView(fileid, False, True, False, encoding=enc)
|
||
|
for fileid, enc in self.abspaths(fileids, include_encoding=True)
|
||
|
]
|
||
|
)
|
||
|
|
||
|
def tagged_sents(self, fileids=None):
|
||
|
return concat(
|
||
|
[
|
||
|
DependencyCorpusView(fileid, True, True, False, encoding=enc)
|
||
|
for fileid, enc in self.abspaths(fileids, include_encoding=True)
|
||
|
]
|
||
|
)
|
||
|
|
||
|
def parsed_sents(self, fileids=None):
|
||
|
sents = concat(
|
||
|
[
|
||
|
DependencyCorpusView(fileid, False, True, True, encoding=enc)
|
||
|
for fileid, enc in self.abspaths(fileids, include_encoding=True)
|
||
|
]
|
||
|
)
|
||
|
return [DependencyGraph(sent) for sent in sents]
|
||
|
|
||
|
|
||
|
class DependencyCorpusView(StreamBackedCorpusView):
|
||
|
_DOCSTART = '-DOCSTART- -DOCSTART- O\n' # dokumentu hasiera definitzen da
|
||
|
|
||
|
def __init__(
|
||
|
self,
|
||
|
corpus_file,
|
||
|
tagged,
|
||
|
group_by_sent,
|
||
|
dependencies,
|
||
|
chunk_types=None,
|
||
|
encoding='utf8',
|
||
|
):
|
||
|
self._tagged = tagged
|
||
|
self._dependencies = dependencies
|
||
|
self._group_by_sent = group_by_sent
|
||
|
self._chunk_types = chunk_types
|
||
|
StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
|
||
|
|
||
|
def read_block(self, stream):
|
||
|
# Read the next sentence.
|
||
|
sent = read_blankline_block(stream)[0].strip()
|
||
|
# Strip off the docstart marker, if present.
|
||
|
if sent.startswith(self._DOCSTART):
|
||
|
sent = sent[len(self._DOCSTART) :].lstrip()
|
||
|
|
||
|
# extract word and tag from any of the formats
|
||
|
if not self._dependencies:
|
||
|
lines = [line.split('\t') for line in sent.split('\n')]
|
||
|
if len(lines[0]) == 3 or len(lines[0]) == 4:
|
||
|
sent = [(line[0], line[1]) for line in lines]
|
||
|
elif len(lines[0]) == 10:
|
||
|
sent = [(line[1], line[4]) for line in lines]
|
||
|
else:
|
||
|
raise ValueError('Unexpected number of fields in dependency tree file')
|
||
|
|
||
|
# discard tags if they weren't requested
|
||
|
if not self._tagged:
|
||
|
sent = [word for (word, tag) in sent]
|
||
|
|
||
|
# Return the result.
|
||
|
if self._group_by_sent:
|
||
|
return [sent]
|
||
|
else:
|
||
|
return list(sent)
|