Europarl/train_set_prune.py
2020-01-28 18:51:11 +01:00

26 lines
611 B
Python

from polyglot.detect import Detector
from polyglot.detect.base import UnknownLanguage
import sys
import warnings
# Silence every warning for the whole run (presumably to hide noise emitted
# by polyglot and its dependencies -- TODO confirm this is still wanted).
warnings.filterwarnings("ignore")

# Prune a tab-separated training set read from stdin.
#
# Each input row is expected to have exactly two tab-separated columns; only
# the first column is inspected (it is expected to hold the Polish text).
#   * detection not reliable       -> row is dropped, not counted
#   * reliably detected as Polish  -> row is appended verbatim to the output
#   * reliably detected as other   -> row is dropped and counted
# A summary "<dropped count> <language names> <number of names>" is printed
# to stdout at the end.

counter = 0               # rows reliably detected as a non-Polish language
languages = ['Polish']    # distinct language names seen; seeded with Polish
malformed = 0             # rows that do not have exactly two columns

# Open the output once for the whole run instead of re-opening it for every
# accepted row. Append mode is kept, so existing content is preserved.
with open("./train/clear-train.tsv", 'a') as out:
    for line in sys.stdin:
        columns = line.split('\t')
        if len(columns) != 2:
            # A blank or otherwise malformed row used to abort the whole run
            # with ValueError (and lose the final summary); skip it instead.
            malformed += 1
            continue

        pl = columns[0]
        d = Detector(pl, quiet=True)
        if not d.reliable:
            continue

        if d.language.code == 'pl':
            out.write(line)
            continue

        # Reliably non-Polish: remember the language name and count the row.
        name = d.language.name
        if name not in languages:
            languages.append(name)
        counter += 1

print(counter, languages, len(languages))
if malformed:
    # Reported on stderr so the stdout summary keeps its original format.
    sys.stderr.write("skipped %d malformed line(s)\n" % malformed)