169 lines
5.1 KiB
Python
169 lines
5.1 KiB
Python
|
# Natural Language Toolkit: Aligned Corpus Reader
|
||
|
#
|
||
|
# Copyright (C) 2001-2019 NLTK Project
|
||
|
# URL: <http://nltk.org/>
|
||
|
# Author: Steven Bird <stevenbird1@gmail.com>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
from six import string_types
|
||
|
|
||
|
from nltk.tokenize import WhitespaceTokenizer, RegexpTokenizer
|
||
|
from nltk.translate import AlignedSent, Alignment
|
||
|
|
||
|
from nltk.corpus.reader.api import CorpusReader
|
||
|
from nltk.corpus.reader.util import (
|
||
|
StreamBackedCorpusView,
|
||
|
concat,
|
||
|
read_alignedsent_block,
|
||
|
)
|
||
|
|
||
|
|
||
|
class AlignedCorpusReader(CorpusReader):
    """
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    """

    def __init__(
        self,
        root,
        fileids,
        sep='/',
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=RegexpTokenizer('\n', gaps=True),
        alignedsent_block_reader=read_alignedsent_block,
        encoding='latin1',
    ):
        """
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param sep: Separator string; stored on the reader as ``_sep``
            (it is not otherwise used by the methods of this class).
        :param word_tokenizer: Tokenizer used to split a sentence line
            into words.
        :param sent_tokenizer: Tokenizer used to split one aligned-sentence
            block into its individual lines.
        :param alignedsent_block_reader: Callable that reads the next
            aligned-sentence block from a stream and returns it as a list
            of strings.
        :param encoding: Encoding passed on to ``CorpusReader.__init__``.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader

    def raw(self, fileids=None):
        """
        :return: the given file(s) as a single string.
        :rtype: str
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, string_types):
            fileids = [fileids]
        contents = []
        for fileid in fileids:
            # Close each stream as soon as it has been read rather than
            # leaving open handles for the garbage collector to clean up.
            with self.open(fileid) as stream:
                contents.append(stream.read())
        return concat(contents)

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return self._views(fileids, aligned=False, group_by_sent=False)

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        return self._views(fileids, aligned=False, group_by_sent=True)

    def aligned_sents(self, fileids=None):
        """
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        """
        return self._views(fileids, aligned=True, group_by_sent=True)

    def _views(self, fileids, aligned, group_by_sent):
        """
        Build one ``AlignedSentCorpusView`` per requested file and
        concatenate them.

        :param fileids: The file(s) to read, as accepted by ``abspaths``.
        :param aligned: Passed to the view as its ``aligned`` flag.
        :param group_by_sent: Passed to the view as its ``group_by_sent``
            flag.
        :return: the concatenation of the per-file corpus views.
        """
        return concat(
            [
                AlignedSentCorpusView(
                    path,
                    enc,
                    aligned,
                    group_by_sent,
                    self._word_tokenizer,
                    self._sent_tokenizer,
                    self._alignedsent_block_reader,
                )
                for (path, enc) in self.abspaths(fileids, True)
            ]
        )
|
||
|
|
||
|
|
||
|
class AlignedSentCorpusView(StreamBackedCorpusView):
    """
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    """

    def __init__(
        self,
        corpus_file,
        encoding,
        aligned,
        group_by_sent,
        word_tokenizer,
        sent_tokenizer,
        alignedsent_block_reader,
    ):
        """
        :param corpus_file: The corpus file, passed on to
            ``StreamBackedCorpusView.__init__``.
        :param encoding: The encoding, passed on to
            ``StreamBackedCorpusView.__init__``.
        :param aligned: If true, each block is returned as a single
            ``AlignedSent`` object.
        :param group_by_sent: If true (and ``aligned`` is false), each
            block is returned as a one-element list holding the word list
            of its first sentence; otherwise that word list is returned
            flat.
        :param word_tokenizer: Tokenizer used to split a sentence line
            into words.
        :param sent_tokenizer: Tokenizer used to split an aligned-sentence
            block into its individual lines.
        :param alignedsent_block_reader: Callable that reads the next
            aligned-sentence block from a stream and returns it as a list
            of strings.
        """
        self._aligned = aligned
        self._group_by_sent = group_by_sent
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._alignedsent_block_reader = alignedsent_block_reader
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """
        Read the next aligned-sentence block from ``stream``.

        :param stream: The stream handed to the block reader.
        :return: ``[AlignedSent]`` when the view is aligned; otherwise the
            first sentence of the block, either wrapped in a list
            (``group_by_sent``) or as a flat list of words.  An empty list
            is returned when the block reader yields no sentences.
        :raise IndexError: if the view is aligned and the block has fewer
            than three lines (so there is no alignment line to parse).
        """
        # Split the block into its raw lines first; word-tokenize only
        # the lines that are actually needed below.
        sent_strs = [
            sent_str
            for alignedsent_str in self._alignedsent_block_reader(stream)
            for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)
        ]
        if not sent_strs:
            # Nothing was read (presumably possible at the end of the
            # input); report "no tokens" instead of failing on an index.
            return []

        tokenize = self._word_tokenizer.tokenize
        if self._aligned:
            # The third line holds the alignment.  Parse it directly from
            # the untokenized string rather than word-tokenizing it and
            # joining the pieces back together.
            alignment = Alignment.fromstring(sent_strs[2])
            fields = [
                alignment if index == 2 else tokenize(sent_str)
                for index, sent_str in enumerate(sent_strs)
            ]
            return [AlignedSent(*fields)]

        # Non-aligned views expose only the first sentence of the block.
        first_sent = tokenize(sent_strs[0])
        if self._group_by_sent:
            return [first_sent]
        return first_sent
|