"""Load speech-recognition XML transcripts into MongoDB.

For every ``*.xml`` file in the input directory, the text of all ``Word``
elements is joined with single spaces and written to the ``moviesMeta``
document whose ``_id`` equals the file name up to its first dot.
"""
import argparse
import logging
import xml.etree.ElementTree as ET
from datetime import datetime
from glob import glob
from os import path

from bson.objectid import ObjectId

from mongo.helpers import get_mongo_collection

# NOTE(review): credentials are hard-coded in source control; consider
# supplying the URI through configuration or the environment instead.
MONGO_URI = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
DB_NAME = "archSpeechReco"
COLLECTION_NAME = "moviesMeta"


def _resolve_log_level(name):
    """Map a level name (any case, possibly ``None``) to a numeric level.

    Unknown or missing names fall back to ``logging.DEBUG`` (10), the same
    fallback value the script has always used for unrecognised names.
    """
    level = getattr(logging, (name or "").upper(), logging.DEBUG)
    # Guard against upper-case attributes that are not levels
    # (e.g. ``logging.BASIC_FORMAT`` is a string).
    return level if isinstance(level, int) else logging.DEBUG


def _read_transcript(xml_path):
    """Return the space-joined text of every ``Word`` element in *xml_path*.

    Elements without text (``word.text is None``) are skipped so that the
    join cannot fail on an empty ``<Word/>``.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        OSError: if the file cannot be read.
    """
    root = ET.parse(xml_path).getroot()
    return " ".join(
        word.text for word in root.iter('Word') if word.text is not None
    )


def main(args):
    """Store the transcript of each XML file in its MongoDB document.

    Args:
        args: parsed command-line namespace providing ``input`` (directory
            containing the ``*.xml`` files) and ``loglevel`` (logging level
            name; may be ``None``).

    Failures are handled per file: a file that cannot be parsed or whose
    update fails is logged and skipped, and processing continues.
    """
    logging.basicConfig(format='%(asctime)s [%(levelname)s] - %(message)s',
                        level=_resolve_log_level(args.loglevel))

    input_path = args.input
    if not input_path:
        # Without this guard the glob below would search the literal
        # directory "None".
        logging.error("no input path given (--input)")
        return

    col = get_mongo_collection(COLLECTION_NAME, DB_NAME, MONGO_URI)

    logging.info("let's start")

    for xml_file in glob(f'{input_path}/*.xml'):
        # The document id is the file name up to the first dot.
        doc_id = path.basename(xml_file).split('.')[0]

        try:
            transcript = _read_transcript(xml_file)
        except (ET.ParseError, OSError) as e:
            # One bad file must not abort the whole batch.
            logging.error("cannot read %s: %s", xml_file, e)
            continue

        try:
            now = datetime.now()
            # ObjectId() is built inside the try so that a file name which
            # is not a valid ObjectId is logged like any other failure.
            col.update_one(
                {"_id": ObjectId(doc_id)},
                {"$set": {
                    "pcssTextRecoArm2.transcript": transcript,
                    "pcssTextRecoArm2.transcripted":
                        now.strftime("%Y-%m-%d %H:%M:%S"),
                }},
            )
        except Exception as e:  # per-file boundary: log and keep going
            logging.error("mongo update failed for %s: %s", doc_id, e)
        else:
            logging.info(f'mongo update OK {doc_id}')
        logging.debug(f'{doc_id}: {transcript}')

    logging.info("program finished")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PCSS reco')
    parser.add_argument("--input", help="input path for XML files")
    parser.add_argument("--loglevel", help="log level: INFO WARNING ERROR")

    args = parser.parse_args()
    main(args)