"""Keep only the reliably-Polish lines of a tab-separated corpus.

Reads lines from standard input, runs polyglot's language detector on the
text that precedes the first tab of each line, and:

* skips lines whose detection is not reliable;
* counts lines detected as a language other than Polish and remembers the
  distinct language names seen;
* appends lines detected as Polish, unmodified, to ``OUTPUT_PATH``.

When stdin is exhausted it prints the number of non-Polish lines, the list
of language names (which starts with ``'Polish'``) and that list's length.
"""
from contextlib import ExitStack
import sys
import warnings

from polyglot.detect import Detector
# Kept from the original script although nothing below references it;
# presumably quiet=True stops Detector from raising it -- TODO confirm.
from polyglot.detect.base import UnknownLanguage  # noqa: F401

warnings.filterwarnings("ignore")

# Destination for the lines that pass the filter (opened in append mode).
OUTPUT_PATH = "./train/clear-train.tsv"

counter = 0  # number of reliably-detected non-Polish lines
languages = ['Polish']  # distinct language names, in first-seen order

with ExitStack() as stack:
    # Opened lazily on the first kept line so that, as before, no file is
    # created when nothing passes the filter -- but only opened once instead
    # of once per kept line.
    out = None

    for line in sys.stdin:
        # Only the first column is classified. Taking everything before the
        # first tab tolerates lines with extra columns or no tab at all,
        # which previously aborted the run with a ValueError.
        text = line.split('\t', 1)[0]
        if not text.strip():
            # Nothing to classify (blank line or empty first column).
            continue

        detector = Detector(text, quiet=True)
        if not detector.reliable:
            continue

        language = detector.language
        if language.code != 'pl':
            if language.name not in languages:
                languages.append(language.name)
            counter += 1
            continue

        if out is None:
            out = stack.enter_context(open(OUTPUT_PATH, 'a'))
        out.write(line)

print(counter, languages, len(languages))