# archSpeechReco/src/pcss_reco.py
#
# Listing header from the original export: 49 lines, 1.6 KiB, Python.

from mongo.helpers import get_mongo_collection
import argparse
import logging
from glob import glob
from os import path
import xml.etree.ElementTree as ET
from bson.objectid import ObjectId
from datetime import datetime
def _resolve_log_level(loglevel):
    """Translate a level name such as ``"info"`` into a numeric logging level."""
    if not loglevel:
        # No level given on the command line: use the same fallback as for
        # unknown names.
        return logging.DEBUG
    level = getattr(logging, str(loglevel).upper(), logging.DEBUG)
    # The logging module also exposes upper-case non-level attributes
    # (e.g. BASIC_FORMAT); only integers are valid levels for basicConfig().
    return level if isinstance(level, int) else logging.DEBUG


def _read_transcript(xml_path):
    """Return the space-joined text of every ``Word`` element in *xml_path*."""
    root = ET.parse(xml_path).getroot()
    # Elements without text have ``text is None``, which str.join() rejects.
    return " ".join(
        word.text for word in root.iter('Word') if word.text is not None
    )


def main(args):
    """Store the transcripts found in a directory of XML files in MongoDB.

    For every ``*.xml`` file directly inside ``args.input`` the text of all
    ``Word`` elements is joined with single spaces and written, together with
    the current local time, to the ``pcssTextRecoArm2.transcript`` and
    ``pcssTextRecoArm2.transcripted`` fields of the ``moviesMeta`` document
    whose ``_id`` is the file name up to its first dot.

    Failures are handled per file: an unreadable or malformed XML file, an
    invalid ObjectId or a failed update is logged and the loop continues
    with the next file.

    Args:
        args: Parsed command-line namespace providing ``input`` (directory
            containing the XML files) and ``loglevel`` (level name; a missing
            or unknown name falls back to DEBUG).
    """
    input_path = args.input
    logging.basicConfig(
        format='%(asctime)s [%(levelname)s] - %(message)s',
        level=_resolve_log_level(args.loglevel),
    )

    if input_path is None:
        # Without this guard the glob below would silently scan "None/*.xml".
        logging.error("no input path given")
        return

    # NOTE(review): credentials are hard-coded in the connection URI; consider
    # moving them to configuration or the environment.
    uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
    db_name = "archSpeechReco"
    col_name = "moviesMeta"
    col = get_mongo_collection(col_name, db_name, uri)

    logging.info("let's start")
    for xml_file in glob(f'{input_path}/*.xml'):
        # The file name up to the first dot is used as the document id.
        doc_id = path.basename(xml_file).split('.')[0]

        try:
            transcript = _read_transcript(xml_file)
        except (ET.ParseError, OSError) as e:
            # One broken file must not abort the whole batch.
            logging.error(f'cannot read {xml_file}: {e}')
            continue

        try:
            now = datetime.now()
            result = col.update_one(
                {"_id": ObjectId(doc_id)},
                {"$set": {
                    "pcssTextRecoArm2.transcript": transcript,
                    "pcssTextRecoArm2.transcripted": now.strftime("%Y-%m-%d %H:%M:%S"),
                }},
            )
        except Exception as e:
            logging.error(f'mongo update failed {doc_id}: {e}')
        else:
            # matched_count is only readable for acknowledged writes; results
            # without that information are reported as OK, as before.
            if getattr(result, "acknowledged", False) and result.matched_count == 0:
                logging.warning(f'mongo update matched no document {doc_id}')
            else:
                logging.info(f'mongo update OK {doc_id}')
            logging.debug(f'{doc_id}: {transcript}')
    logging.info("program finished")
if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them to main().
    parser = argparse.ArgumentParser(description='PCSS reco')
    # Without a directory there is nothing to process, so the option is
    # mandatory instead of silently defaulting to None.
    parser.add_argument("--input", required=True,
                        help="input path for XML files")
    # A missing --loglevel would otherwise reach main() as None; default to
    # DEBUG, the level main() falls back to for unknown names.
    parser.add_argument("--loglevel", default="DEBUG",
                        help="log level: DEBUG INFO WARNING ERROR (default: DEBUG)")
    args = parser.parse_args()
    main(args)