634 lines
25 KiB
Python
634 lines
25 KiB
Python
|
# CHILDES XML Corpus Reader
|
||
|
|
||
|
# Copyright (C) 2001-2019 NLTK Project
|
||
|
# Author: Tomonori Nagano <tnagano@gc.cuny.edu>
|
||
|
# Alexis Dimitriadis <A.Dimitriadis@uu.nl>
|
||
|
# URL: <http://nltk.org/>
|
||
|
# For license information, see LICENSE.TXT
|
||
|
|
||
|
"""
|
||
|
Corpus reader for the XML version of the CHILDES corpus.
|
||
|
"""
|
||
|
from __future__ import print_function, division
|
||
|
|
||
|
__docformat__ = 'epytext en'
|
||
|
|
||
|
import re
|
||
|
from collections import defaultdict
|
||
|
from six import string_types
|
||
|
|
||
|
from nltk.util import flatten, LazyMap, LazyConcatenation
|
||
|
|
||
|
from nltk.corpus.reader.util import concat
|
||
|
from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree
|
||
|
|
||
|
# to resolve the namespace issue
|
||
|
NS = 'http://www.talkbank.org/ns/talkbank'
|
||
|
|
||
|
|
||
|
class CHILDESCorpusReader(XMLCorpusReader):
|
||
|
"""
|
||
|
Corpus reader for the XML version of the CHILDES corpus.
|
||
|
The CHILDES corpus is available at ``https://childes.talkbank.org/``. The XML
|
||
|
version of CHILDES is located at ``https://childes.talkbank.org/data-xml/``.
|
||
|
Copy the needed parts of the CHILDES XML corpus into the NLTK data directory
|
||
|
(``nltk_data/corpora/childes/``).
|
||
|
|
||
|
For access to the file text use the usual nltk functions,
|
||
|
``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, root, fileids, lazy=True):
    """
    Create a CHILDES reader.

    :param root: corpus root, passed unchanged to ``XMLCorpusReader``.
    :param fileids: file identifiers, passed unchanged to
        ``XMLCorpusReader``.
    :param lazy: stored as ``self._lazy``.  The accessor methods of this
        class return lazy sequences when it is true and plain lists
        otherwise.
    """
    XMLCorpusReader.__init__(self, root, fileids)
    self._lazy = lazy
|
||
|
|
||
|
def words(
    self,
    fileids=None,
    speaker='ALL',
    stem=False,
    relation=False,
    strip_space=True,
    replace=False,
):
    """
    Read the word tokens of the given file(s).

    :return: the given file(s) as a list of words.  A lazy reader returns
        a single concatenated sequence; a non-lazy reader returns a list
        with one token sequence per file.
    :rtype: list(str)

    :param fileids: handed to ``abspaths()`` to select the files to read.
    :param speaker: If specified, select specific speaker(s) defined
        in the corpus. Default is 'ALL' (all participants). Common choices
        are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
        researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of (stem, index,
        dependent_index)
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced (intended) word instead
        of the original word (e.g., 'wat' will be replaced with 'watch')
    """

    def read_file(path):
        # sent=None, pos=False: a flat stream of untagged tokens.
        return self._get_words(
            path, speaker, None, stem, relation, False, strip_space, replace
        )

    if self._lazy:
        return LazyConcatenation(LazyMap(read_file, self.abspaths(fileids)))
    return [read_file(path) for path in self.abspaths(fileids)]
|
||
|
|
||
|
def tagged_words(
    self,
    fileids=None,
    speaker='ALL',
    stem=False,
    relation=False,
    strip_space=True,
    replace=False,
):
    """
    Read the tagged word tokens of the given file(s).

    :return: the given file(s) as a list of tagged words and punctuation
        symbols, encoded as tuples ``(word,tag)``.  A lazy reader returns
        a single concatenated sequence; a non-lazy reader returns a list
        with one token sequence per file.
    :rtype: list(tuple(str,str))

    :param fileids: handed to ``abspaths()`` to select the files to read.
    :param speaker: If specified, select specific speaker(s) defined
        in the corpus. Default is 'ALL' (all participants). Common choices
        are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
        researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of (stem, index,
        dependent_index)
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced (intended) word instead
        of the original word (e.g., 'wat' will be replaced with 'watch')
    """

    def read_file(path):
        # sent=None, pos=True: a flat stream of (word, tag) tokens.
        return self._get_words(
            path, speaker, None, stem, relation, True, strip_space, replace
        )

    if self._lazy:
        return LazyConcatenation(LazyMap(read_file, self.abspaths(fileids)))
    return [read_file(path) for path in self.abspaths(fileids)]
|
||
|
|
||
|
def sents(
    self,
    fileids=None,
    speaker='ALL',
    stem=False,
    relation=None,
    strip_space=True,
    replace=False,
):
    """
    Read the utterances of the given file(s).

    :return: the given file(s) as a list of sentences or utterances, each
        encoded as a list of word strings.  A lazy reader returns a single
        concatenated sequence; a non-lazy reader returns a list with one
        utterance sequence per file.
    :rtype: list(list(str))

    :param fileids: handed to ``abspaths()`` to select the files to read.
    :param speaker: If specified, select specific speaker(s) defined
        in the corpus. Default is 'ALL' (all participants). Common choices
        are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
        researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
        If there is manually-annotated relation info, it will return
        tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced (intended) word instead
        of the original word (e.g., 'wat' will be replaced with 'watch')
    """

    def read_file(path):
        # sent=True, pos=False: untagged tokens grouped per utterance.
        return self._get_words(
            path, speaker, True, stem, relation, False, strip_space, replace
        )

    if self._lazy:
        return LazyConcatenation(LazyMap(read_file, self.abspaths(fileids)))
    return [read_file(path) for path in self.abspaths(fileids)]
|
||
|
|
||
|
def tagged_sents(
    self,
    fileids=None,
    speaker='ALL',
    stem=False,
    relation=None,
    strip_space=True,
    replace=False,
):
    """
    Read the tagged utterances of the given file(s).

    :return: the given file(s) as a list of sentences, each encoded as a
        list of ``(word,tag)`` tuples.  A lazy reader returns a single
        concatenated sequence; a non-lazy reader returns a list with one
        utterance sequence per file.
    :rtype: list(list(tuple(str,str)))

    :param fileids: handed to ``abspaths()`` to select the files to read.
    :param speaker: If specified, select specific speaker(s) defined
        in the corpus. Default is 'ALL' (all participants). Common choices
        are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
        researchers)
    :param stem: If true, then use word stems instead of word strings.
    :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
        If there is manually-annotated relation info, it will return
        tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
    :param strip_space: If true, then strip trailing spaces from word
        tokens. Otherwise, leave the spaces on the tokens.
    :param replace: If true, then use the replaced (intended) word instead
        of the original word (e.g., 'wat' will be replaced with 'watch')
    """

    def read_file(path):
        # sent=True, pos=True: (word, tag) tokens grouped per utterance.
        return self._get_words(
            path, speaker, True, stem, relation, True, strip_space, replace
        )

    if self._lazy:
        return LazyConcatenation(LazyMap(read_file, self.abspaths(fileids)))
    return [read_file(path) for path in self.abspaths(fileids)]
|
||
|
|
||
|
def corpus(self, fileids=None):
    """
    Read the corpus-level properties of the given file(s).

    :param fileids: handed to ``abspaths()`` to select the files to read.
    :return: one dict of ``(corpus_property_key, value)`` per file
    :rtype: list(dict)
    """
    if self._lazy:
        return LazyMap(self._get_corpus, self.abspaths(fileids))
    return list(map(self._get_corpus, self.abspaths(fileids)))
|
||
|
|
||
|
def _get_corpus(self, fileid):
    """
    Return the attributes of the XML root element of ``fileid`` as a dict.

    :param fileid: the XML file to parse (handed to ``ElementTree.parse``).
    :rtype: dict
    """
    root = ElementTree.parse(fileid).getroot()
    return dict(root.items())
|
||
|
|
||
|
def participants(self, fileids=None):
    """
    Read the participant descriptions of the given file(s).

    :param fileids: handed to ``abspaths()`` to select the files to read.
    :return: one dict of ``(participant_property_key, value)`` per file
    :rtype: list(dict)
    """
    if self._lazy:
        return LazyMap(self._get_participants, self.abspaths(fileids))
    return list(map(self._get_participants, self.abspaths(fileids)))
|
||
|
|
||
|
def _get_participants(self, fileid):
    """
    Collect the attributes of every ``participant`` element of one file.

    :param fileid: the XML file to parse (handed to ``ElementTree.parse``).
    :return: a nested ``defaultdict`` keyed first by the participant's
        ``id`` attribute and then by attribute name.
    """

    def nested_dict():
        # A dict whose missing keys are created on access as further
        # nested dicts of the same kind.
        return defaultdict(nested_dict)

    root = ElementTree.parse(fileid).getroot()
    participant_path = './/{%s}Participants/{%s}participant' % (NS, NS)

    by_id = nested_dict()
    for node in root.findall(participant_path):
        speaker_id = node.get('id')
        for attr_name, attr_value in node.items():
            by_id[speaker_id][attr_name] = attr_value
    return by_id
|
||
|
|
||
|
def age(self, fileids=None, speaker='CHI', month=False):
    """
    Read the age of a participant in the given file(s).

    :return: the given file(s) as string or int
    :rtype: list or int

    :param fileids: handed to ``abspaths()`` to select the files to read.
    :param speaker: ``id`` of the participant whose age is wanted.
    :param month: If true, return months instead of year-month-date
    """

    def read_age(path):
        return self._get_age(path, speaker, month)

    if self._lazy:
        return LazyMap(read_age, self.abspaths(fileids))
    return [read_age(path) for path in self.abspaths(fileids)]
|
||
|
|
||
|
def _get_age(self, fileid, speaker, month):
    """
    Look up the recorded age of one participant in one file.

    :param fileid: the XML file to parse (handed to ``ElementTree.parse``).
    :param speaker: ``id`` attribute of the participant to look for.
    :param month: If true, convert the age with ``convert_age()``.
    :return: the participant's ``age`` attribute, or its value in months
        when ``month`` is true; ``None`` when the participant is not
        listed or the conversion fails.
    """
    root = ElementTree.parse(fileid).getroot()
    participant_path = './/{%s}Participants/{%s}participant' % (NS, NS)
    for participant in root.findall(participant_path):
        if participant.get('id') != speaker:
            continue
        recorded = participant.get('age')
        if not month:
            return recorded
        try:
            return self.convert_age(recorded)
        except (TypeError, AttributeError):
            # some files don't have age data
            return None
    return None
|
||
|
|
||
|
def convert_age(self, age_year):
    """
    Calculate age in months from a string in CHILDES format.

    :param age_year: an age such as ``'P2Y6M14D'`` (years, months and,
        optionally, days).
    :return: the age in whole months; a day count above 15 rounds the
        result up by one month.
    :rtype: int
    :raise AttributeError: if ``age_year`` does not match the expected
        format.
    :raise TypeError: if ``age_year`` is not a string (e.g. ``None``).
    """
    # Raw string: ``\d`` in a plain literal is an invalid escape sequence.
    m = re.match(r"P(\d+)Y(\d+)M?(\d?\d?)D?", age_year)
    # ``m`` is None for a malformed age; the resulting AttributeError is
    # what ``_get_age`` catches, so it is deliberately not converted here.
    age_month = int(m.group(1)) * 12 + int(m.group(2))
    # The day group is empty when the age carries no day component.
    days = m.group(3)
    if days and int(days) > 15:
        age_month += 1
    return age_month
|
||
|
|
||
|
def MLU(self, fileids=None, speaker='CHI'):
    """
    Compute the mean length of utterance for the given file(s).

    :param fileids: handed to ``abspaths()`` to select the files to read.
    :param speaker: participant whose utterances are measured.
    :return: the given file(s) as a floating number
    :rtype: list(float)
    """

    def read_mlu(path):
        return self._getMLU(path, speaker=speaker)

    if self._lazy:
        return LazyMap(read_mlu, self.abspaths(fileids))
    return [read_mlu(path) for path in self.abspaths(fileids)]
|
||
|
|
||
|
def _getMLU(self, fileid, speaker):
    """
    Compute the mean length of utterance (in morphemes) for one file.

    Utterances are read as stemmed, tagged, replaced tokens.  An utterance
    is ignored when it is empty, when any of its tags is ``'unk'``, or
    when it equals the previously counted utterance.  Tokens tagged
    ``'co'`` or ``None`` are treated as fillers: they are subtracted from
    the morpheme count, and every utterance containing one is subtracted
    from the utterance count.

    :param fileid: the file to read (handed to ``_get_words``).
    :param speaker: participant whose utterances are measured.
    :return: morphemes per utterance, or ``0.0`` when no utterance counts.
    :rtype: float
    """
    sents = self._get_words(
        fileid,
        speaker=speaker,
        sent=True,
        stem=True,
        relation=False,
        pos=True,
        strip_space=True,
        replace=True,
    )
    kept = []  # word lists of the utterances that are counted
    last_sent = []
    num_fillers = 0
    sent_discount = 0
    for sent in sents:
        tags = [tag for (_, tag) in sent]
        # Skip utterances that are unintelligible in part, empty, or a
        # repetition of the last counted utterance.
        if 'unk' in tags or sent == [] or sent == last_sent:
            continue
        kept.append([word for (word, _) in sent])
        # count number of fillers
        fillers = tags.count('co') + tags.count(None)
        if fillers:
            num_fillers += fillers
            sent_discount += 1
        last_sent = sent

    # count number of morphemes
    # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes)
    num_morphemes = (
        sum(len(word.split('-')) for words in kept for word in words)
        - num_fillers
    )
    num_sents = len(kept) - sent_discount
    if num_sents == 0:
        # Nothing to average over; keep the documented float return type.
        return 0.0
    return num_morphemes / num_sents
|
||
|
|
||
|
def _get_words(
    self, fileid, speaker, sent, stem, relation, pos, strip_space, replace
):
    """
    Extract the tokens of one CHILDES XML file.

    :param fileid: the XML file to parse (handed to ``ElementTree.parse``).
    :param speaker: 'ALL', a single participant code, or a collection of
        participant codes; only utterances whose ``who`` attribute is
        selected are read.
    :param sent: If true, group the tokens per utterance.
    :param stem: If true, use the ``stem`` element (followed by
        ``-inflection`` and ``~suffix`` when present) instead of the
        surface form.
    :param relation: If true, tokens become tuples whose last item is an
        ``index|head|relation`` string taken from the ``gra`` elements
        (a six-item tuple when a ``gra`` of type ``grt`` is present), and
        tokens are grouped per utterance.
    :param pos: If true, tokens become ``(word, tag)`` tuples.
    :param strip_space: If true, strip surrounding whitespace from the
        surface form.
    :param replace: If true, substitute replacement words (see the
        review note in the body on how the replacement is located).
    :return: a ``LazyMap`` over the collected tokens or utterances.
    """
    if isinstance(speaker, string_types) and speaker != 'ALL':
        # ensure we have a list of speakers
        speaker = [speaker]
    xmldoc = ElementTree.parse(fileid).getroot()

    # Namespace-qualified search paths, built once per file.
    word_path = './/{%s}w' % NS
    replacement_path = './/{%s}w/{%s}replacement' % (NS, NS)
    replacement_word_path = './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS)
    wk_path = './/{%s}w/{%s}wk' % (NS, NS)
    stem_path = './/{%s}stem' % NS
    infl_path = './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS)
    suffix_stem_path = './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem' % (
        NS,
        NS,
        NS,
        NS,
    )
    pos_c_path = ".//{%s}c" % NS
    pos_s_path = ".//{%s}s" % NS
    suffix_c_path = './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c' % (
        NS,
        NS,
        NS,
        NS,
        NS,
    )
    suffix_s_path = './/{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s' % (
        NS,
        NS,
        NS,
        NS,
        NS,
    )
    gra_path = './/{%s}mor/{%s}gra' % (NS, NS)

    # processing each xml doc
    results = []
    for xmlsent in xmldoc.findall('.//{%s}u' % NS):
        sents = []
        # select speakers
        if speaker == 'ALL' or xmlsent.get('who') in speaker:
            # getting replaced words
            # NOTE(review): the replacement is searched on the whole
            # utterance, not on the individual word, so with
            # ``replace=True`` every word of an utterance that contains a
            # replacement is read from the first replacement word.  That
            # is the long-standing behaviour and is kept here; it looks
            # unintended and should be confirmed against real data.
            # The lookup does not depend on the word, so it is done once
            # per utterance.  ``len(...)`` spells out the "element has
            # children" test that used to rely on Element truthiness.
            override = None
            if replace:
                replacement = xmlsent.find(replacement_path)
                if replacement is not None and len(replacement):
                    override = xmlsent.find(replacement_word_path)
                else:
                    wk = xmlsent.find(wk_path)
                    if wk is not None and len(wk):
                        override = wk

            for xmlword in xmlsent.findall(word_path):
                suffixStem = None
                suffixTag = None
                if override is not None:
                    xmlword = override
                # get text
                word = xmlword.text if xmlword.text else ''
                # strip tailing space
                if strip_space:
                    word = word.strip()
                # stem
                if relation or stem:
                    xmlstem = xmlword.find(stem_path)
                    if xmlstem is not None:
                        word = xmlstem.text
                    # if there is an inflection
                    try:
                        xmlinfl = xmlword.find(infl_path)
                        word += '-' + xmlinfl.text
                    except (AttributeError, TypeError):
                        # no inflection element, or no text to join
                        pass
                    # if there is a suffix
                    xmlsuffix = xmlword.find(suffix_stem_path)
                    suffixStem = xmlsuffix.text if xmlsuffix is not None else ""
                    if suffixStem:
                        word += "~" + suffixStem
                # pos
                if relation or pos:
                    try:
                        xmlpos = xmlword.findall(pos_c_path)
                        xmlpos2 = xmlword.findall(pos_s_path)
                        if xmlpos2 != []:
                            tag = xmlpos[0].text + ":" + xmlpos2[0].text
                        else:
                            tag = xmlpos[0].text
                    except (AttributeError, IndexError):
                        tag = ""
                    try:
                        xmlsuffixpos = xmlword.findall(suffix_c_path)
                        xmlsuffixpos2 = xmlword.findall(suffix_s_path)
                        if xmlsuffixpos2:
                            suffixTag = (
                                xmlsuffixpos[0].text + ":" + xmlsuffixpos2[0].text
                            )
                        else:
                            suffixTag = xmlsuffixpos[0].text
                    except (IndexError, TypeError):
                        # no suffix tag element, or no text to join
                        pass
                    if suffixTag:
                        tag += "~" + suffixTag
                    word = (word, tag)
                # relational
                # the gold standard is stored in
                # <mor></mor><mor type="trn"><gra type="grt">
                if relation:
                    for xmlstem_rel in xmlword.findall(gra_path):
                        if not xmlstem_rel.get('type') == 'grt':
                            word = (
                                word[0],
                                word[1],
                                xmlstem_rel.get('index')
                                + "|"
                                + xmlstem_rel.get('head')
                                + "|"
                                + xmlstem_rel.get('relation'),
                            )
                        else:
                            word = (
                                word[0],
                                word[1],
                                word[2],
                                word[0],
                                word[1],
                                xmlstem_rel.get('index')
                                + "|"
                                + xmlstem_rel.get('head')
                                + "|"
                                + xmlstem_rel.get('relation'),
                            )
                    # NOTE(review): relations under ``mor-post/gra`` are
                    # not emitted.  The previous code computed a value
                    # for them inside a catch-all ``try`` but never used
                    # it, so dropping that computation changes no output.
                sents.append(word)
            if sent or relation:
                results.append(sents)
            else:
                results.extend(sents)
    return LazyMap(lambda x: x, results)
|
||
|
|
||
|
# Ready-to-use browser opener

# The base URL for viewing files on the childes website. This
# shouldn't need to be changed, unless CHILDES changes the configuration
# of their server or unless the user sets up their own corpus webserver.
childes_url_base = r'https://childes.talkbank.org/browser/index.php?url='
|
||
|
|
||
|
def webview_file(self, fileid, urlbase=None):
    """Map a corpus file to its web version on the CHILDES website,
    and open it in a web browser.

    The complete URL to be used is:
        childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha')

    If no urlbase is passed, we try to calculate it.  This
    requires that the childes corpus was set up to mirror the
    folder hierarchy under childes.talkbank.org/data-xml/, e.g.:
    nltk_data/corpora/childes/Eng-USA/Cornell/??? or
    nltk_data/corpora/childes/Romance/Spanish/Aguirre/???

    The function first looks if "childes", possibly followed by
    "data-xml", appears on the path consisting of <corpus root>+fileid;
    then (as a special case) if "Eng-USA" does.  If neither one is
    found, we use the unmodified fileid and hope for the best.
    If this is not right, specify urlbase explicitly, e.g., if the
    corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'.

    :param fileid: corpus file to show.
    :param urlbase: optional path prefix of ``fileid`` on the website.
    """

    import webbrowser

    if urlbase:
        path = urlbase + "/" + fileid
    else:
        full = self.root + "/" + fileid
        full = re.sub(r'\\', '/', full)
        lowered = full.lower()
        prefix = ''
        match = None
        # The inline flag must open the pattern: a global flag in any
        # other position is rejected by recent versions of ``re``.
        if '/childes/' in lowered:
            # Discard /data-xml/ if present
            match = re.search(r'(?i)/childes(?:/data-xml)?/(.*)\.xml', full)
        elif 'eng-usa' in lowered:
            prefix = 'Eng-USA/'
            match = re.search(r'(?i)/Eng-USA/(.*)\.xml', full)
        # Fall back to the bare fileid when the path cannot be mapped
        # (no known folder on it, or no ".xml" after that folder).
        path = prefix + match.group(1) if match else fileid

    # Strip ".xml" and add ".cha", as necessary:
    if path.endswith('.xml'):
        path = path[:-4]

    if not path.endswith('.cha'):
        path = path + '.cha'

    url = self.childes_url_base + path

    webbrowser.open_new_tab(url)
    print("Opening in browser:", url)
    # Pausing is a good idea, but it's up to the user...
    # raw_input("Hit Return to continue")
|
||
|
|
||
|
|
||
|
def demo(corpus_root=None):
    """
    Print a short tour of the reader's API for the first five corpus files.

    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``

    :param corpus_root: path to a portion of the CHILDES XML corpus.  When
        omitted, ``corpora/childes/data-xml/Eng-USA/`` is looked up in the
        NLTK data directories.
    """
    try:
        if not corpus_root:
            from nltk.data import find

            # Looked up inside the ``try`` so that a missing corpus is
            # answered with the download instructions below.
            corpus_root = find('corpora/childes/data-xml/Eng-USA/')

        childes = CHILDESCorpusReader(corpus_root, r'.*\.xml')
        # describe all corpus
        for fileid in childes.fileids()[:5]:
            info = childes.corpus(fileid)[0]
            corpus = info.get('Corpus', '')
            corpus_id = info.get('Id', '')
            print('Reading', corpus, corpus_id, ' .....')
            print("words:", childes.words(fileid)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(fileid, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(fileid)[:7], " ...")
            print("words (only MOT):", childes.words(fileid, speaker='MOT')[:7], "...")
            print("words (only CHI):", childes.words(fileid, speaker='CHI')[:7], "...")
            print("stemmed words:", childes.words(fileid, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(fileid, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(fileid)[:2], " ...")
            for (participant, values) in childes.participants(fileid)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(fileid)))
            print("num of morphemes:", len(childes.words(fileid, stem=True)))
            print("age:", childes.age(fileid))
            print("age in month:", childes.age(fileid, month=True))
            print("MLU:", childes.MLU(fileid))
            print()

    except LookupError:
        print(
            "The CHILDES corpus, or the parts you need, should be manually\n"
            "downloaded from https://childes.talkbank.org/data-xml/ and saved at\n"
            "[NLTK_Data_Dir]/corpora/childes/\n"
            "Alternately, you can call the demo with the path to a portion "
            "of the CHILDES corpus, e.g.:\n"
            "demo('/path/to/childes/data-xml/Eng-USA/')\n"
        )
        # NOTE: building the reader directly from a remote zip archive
        # (e.g. https://childes.talkbank.org/data-xml/Eng-USA/Bates.zip,
        # opened in memory and passed with its name list as fileids) was
        # tried and fails.
|
||
|
|
||
|
|
||
|
# Run the demo when this module is executed as a script.
if __name__ == "__main__":
    demo()
|