TAU2019-028/preprocess-train/scripts/remove-diacritics.py

21 lines
689 B
Python
Executable File

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Barry Haddow
# Distributed under MIT license
#
# Remove Romanian diacritics. Assumes s-comma and t-comma are normalised
import io
import sys
# Translation table mapping each Romanian diacritic handled by this script to
# its plain ASCII letter. Built once so every line is converted in one pass.
_ROMANIAN_DIACRITICS = str.maketrans({
    "\u0218": "S", "\u0219": "s",  # s-comma
    "\u021a": "T", "\u021b": "t",  # t-comma
    "\u0102": "A", "\u0103": "a",  # a-breve
    "\u00C2": "A", "\u00E2": "a",  # a-circumflex
    "\u00CE": "I", "\u00EE": "i",  # i-circumflex
})


def remove_diacritics(line):
    """Return *line* with Romanian diacritics replaced by ASCII letters.

    Only the precomposed code points listed in ``_ROMANIAN_DIACRITICS`` are
    replaced; every other character (including the line ending) is returned
    unchanged.
    """
    return line.translate(_ROMANIAN_DIACRITICS)


def main():
    """Copy stdin to stdout as UTF-8, stripping Romanian diacritics per line."""
    # Wrap the binary streams so decoding/encoding is always UTF-8,
    # independent of the locale the script is started under.
    istream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    ostream = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    for line in istream:
        ostream.write(remove_diacritics(line))
    # Flush explicitly rather than relying on interpreter shutdown to drain
    # the wrapper's buffer.
    ostream.flush()


if __name__ == "__main__":
    main()