# Listing metadata (as reported for the original file): Python, 26 lines, 611 B
"""Keep only the Polish lines of a tab-separated corpus read from stdin.

Each input line is split on tabs and its first field is passed to polyglot's
``Detector``.  Lines the detector does not mark as reliable are skipped.
Lines detected as Polish (code ``'pl'``) are appended unchanged to
``./train/clear-train.tsv``; every other reliably-detected line is counted and
its language name is recorded.  A summary is printed at the end:
the count, the list of language names, and the length of that list.
"""
import sys
import warnings

from polyglot.detect import Detector
from polyglot.detect.base import UnknownLanguage  # kept from the original; not referenced below

warnings.filterwarnings("ignore")

# Number of reliably-detected lines whose language was not Polish.
counter = 0
# Language names encountered, in order of first appearance.  Seeded with
# 'Polish', so the printed length includes it even if no Polish line is seen.
languages = ['Polish']

# Open the output once for the whole run instead of once per kept line.
# Append mode is preserved, so existing content is never truncated; the file
# is now created up front even if no Polish line turns up.
with open("./train/clear-train.tsv", 'a') as f:
    for line in sys.stdin:
        # Only the first tab-separated field is language-checked.  partition()
        # tolerates lines with no tab or with more than two fields, where the
        # original two-name unpacking raised ValueError.
        pl = line.partition('\t')[0]
        d = Detector(pl, quiet=True)
        if not d.reliable:
            continue

        if d.language.code != 'pl':
            name = d.language.name
            if name not in languages:
                languages.append(name)
            # NOTE(review): the original indentation was lost; this assumes the
            # counter tallies every non-Polish line, not only newly seen
            # languages -- confirm against the original file.
            counter += 1
        else:
            # Write the whole original line (all fields), not just the first.
            f.write(line)

print(counter, languages, len(languages))