mass-scraper/tsv2fasttext.py
siulkilulki 7dd903b3b5 First version of ml hour classificator.
Add last_access field to annotator_console user stats.

Add split-data script.
Add tsv2fasttext.py
Add todos.org.
2018-05-28 15:10:31 +02:00

26 lines
640 B
Python
Executable File

#!/usr/bin/env python3
import csv
import sys
import re
def preprocess(prefix, hour, suffix):
sentence = prefix + hour + suffix
sentence = re.sub(r'\\n', r' \\n ', sentence)
sentence = re.sub(' +', ' ', sentence)
return sentence
def main():
# csv.reader(sys.stdin, delimiter='\t')
next(sys.stdin)
for line in sys.stdin:
prefix, hour, suffix, is_mass, yes_count, no_count, url, button_text, depth, filepath, line_no = line.rstrip(
'\n').rsplit('\t')
sentence = preprocess(prefix, hour, suffix)
print(f'__label__{is_mass} {sentence}')
if __name__ == '__main__':
main()