archSpeechReco/src/fix_transcript.py

25 lines
747 B
Python

from mongo.helpers import get_mongo_collection
# Rebuild a plain-text transcript for every document that carries recognised
# words and store it alongside them as ``gcTextReco.transcript_fix``.

# NOTE(review): the connection string embeds credentials in source control --
# consider loading it from the environment or a config file instead.
mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
dbName = "archSpeechReco"
colName = "moviesMeta"
# presumably returns a pymongo Collection (aggregate/update_one are called on
# it below) -- confirm against mongo.helpers.
col = get_mongo_collection(colName, dbName, mongoUri)

# Select only documents that have ``gcTextReco.words`` and fetch just that
# field; ``_id`` is still returned because $project keeps it by default.
pipeline = [
    {'$match': {'gcTextReco.words': {'$exists': True}}},
    {'$project': {'gcTextReco.words': 1}},
]

# Run the aggregation exactly once. The original script issued the same query
# twice and threw the first cursor away.
words = col.aggregate(pipeline)

# Map each document ``_id`` to its transcript: the ``word`` value of every
# entry in ``gcTextReco.words``, joined with single spaces. The cursor is
# drained completely here, before any document is written back.
words_dict = {
    doc['_id']: " ".join(entry['word'] for entry in doc['gcTextReco']['words'])
    for doc in words
}

# Write the transcripts back one document at a time. A failed update is
# printed and the loop carries on with the next document.
for key, value in words_dict.items():
    try:
        col.update_one({"_id": key}, {"$set": {"gcTextReco.transcript_fix": value}})
    except Exception as e:
        print(e)
    else:
        print(f"mongo update OK {key}")