78 lines
2.6 KiB
Python
78 lines
2.6 KiB
Python
# -*- coding: utf-8 -*-
|
|
"""
|
|
UDHR corpus reader. It mostly deals with encodings.
|
|
"""
|
|
from __future__ import absolute_import, unicode_literals
|
|
|
|
from nltk.corpus.reader.util import find_corpus_fileids
|
|
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
|
|
|
|
|
|
class UdhrCorpusReader(PlaintextCorpusReader):
    """
    Plaintext reader for the UDHR corpus.

    The reader's job is mostly bookkeeping about encodings: ``ENCODINGS``
    maps file ids to the codec used to decode them, and ``SKIP`` lists
    the file ids that are left out of the corpus altogether.
    """

    # (file id regex, codec name) pairs, passed to the base reader as its
    # ``encoding`` argument.
    # NOTE(review): NLTK's CorpusReader is documented to use the first
    # pattern that matches a file id, so the one-off overrides (e.g.
    # 'Czech_Cesky-UTF8') presumably must stay ahead of the generic
    # suffix rules they would otherwise fall under -- confirm before
    # reordering entries.
    ENCODINGS = [
        ('.*-Latin1$', 'latin-1'),
        ('.*-Hebrew$', 'hebrew'),
        ('.*-Arabic$', 'cp1256'),
        ('Czech_Cesky-UTF8', 'cp1250'),  # yeah
        ('.*-Cyrillic$', 'cyrillic'),
        ('.*-SJIS$', 'SJIS'),
        ('.*-GB2312$', 'GB2312'),
        ('.*-Latin2$', 'ISO-8859-2'),
        ('.*-Greek$', 'greek'),
        ('.*-UTF8$', 'utf-8'),
        ('Hungarian_Magyar-Unicode', 'utf-16-le'),
        ('Amahuaca', 'latin1'),
        ('Turkish_Turkce-Turkish', 'latin5'),
        ('Lithuanian_Lietuviskai-Baltic', 'latin4'),
        ('Japanese_Nihongo-EUC', 'EUC-JP'),
        ('Japanese_Nihongo-JIS', 'iso2022_jp'),
        ('Chinese_Mandarin-HZ', 'hz'),
        # Raw string: ``\-`` and ``\+`` are regex escapes, not string
        # escapes; without the ``r`` prefix newer Pythons warn about
        # invalid escape sequences. The runtime value is unchanged.
        (r'Abkhaz\-Cyrillic\+Abkh', 'cp1251'),
    ]

    # File ids excluded from the corpus. These are compared literally
    # (set membership in ``__init__``), not as regular expressions.
    SKIP = {
        # The following files are not fully decodable because they
        # were truncated at wrong bytes:
        'Burmese_Myanmar-UTF8',
        'Japanese_Nihongo-JIS',
        'Chinese_Mandarin-HZ',
        'Chinese_Mandarin-UTF8',
        'Gujarati-UTF8',
        'Hungarian_Magyar-Unicode',
        'Lao-UTF8',
        'Magahi-UTF8',
        'Marathi-UTF8',
        'Tamil-UTF8',
        # Unfortunately, encodings required for reading
        # the following files are not supported by Python:
        'Vietnamese-VPS',
        'Vietnamese-VIQR',
        'Vietnamese-TCVN',
        'Magahi-Agra',
        'Bhojpuri-Agra',
        'Esperanto-T61',  # latin3 raises an exception
        # The following files are encoded for specific fonts:
        'Burmese_Myanmar-WinResearcher',
        'Armenian-DallakHelv',
        'Tigrinya_Tigrigna-VG2Main',
        'Amharic-Afenegus6..60375',  # ?
        'Navaho_Dine-Navajo-Navaho-font',
        # What are these?
        'Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117',
        'Azeri_Azerbaijani_Latin-Az.Times.Lat0117',
        # The following files are unintended:
        'Czech-Latin2-err',
        'Russian_Russky-UTF8~',
    }

    def __init__(self, root='udhr'):
        """
        Build the reader over every usable file found under ``root``.

        :param root: corpus root, handed both to ``find_corpus_fileids``
            and to the base class constructor. Defaults to ``'udhr'``.
        """
        # The negative lookahead rejects ids that start with 'README' or
        # with a dot.
        # NOTE(review): how find_corpus_fileids anchors the pattern is
        # not visible here -- confirm against nltk.corpus.reader.util.
        fileids = find_corpus_fileids(root, r'(?!README|\.).*')
        # Drop the known-bad files before the base class sees them.
        usable = [fileid for fileid in fileids if fileid not in self.SKIP]
        super(UdhrCorpusReader, self).__init__(
            root,
            usable,
            encoding=self.ENCODINGS,
        )
|