This commit is contained in:
Wojciech Smolak 2020-08-20 22:25:37 +02:00
parent 2e6bd33357
commit fc5486561a
2 changed files with 5 additions and 6 deletions

View File

@@ -5,7 +5,6 @@ from pymongo import MongoClient
import argparse
from google.protobuf.json_format import MessageToDict
from mongo.helpers import get_mongo_collection
from bson.objectid import ObjectId
import datetime
import time
import concurrent.futures
@@ -36,7 +35,7 @@ def run_reco(uri):
if len(recoDict) != 0:
words = recoDict["results"][-1]["alternatives"][0]["words"]
transcript = "".join([trans["alternatives"][0]["transcript"] for trans in recoDict["results"][:-1]])
transcript = " ".join([trans["alternatives"][0]["transcript"] for trans in recoDict["results"][:-1]])
elif len(recoDict) == 0:
words = {}
transcript = "film niemy"

View File

@@ -6,7 +6,6 @@ def get_mongo_collection(colName,dbName,uri):
client = MongoClient(uri,maxPoolSize=512)
db = client[dbName]
col = db[colName]
return col
col = get_mongo_collection(colName,dbName,mongoUri)
@@ -15,9 +14,11 @@ col.aggregate(pipeline)
pipeline = [{'$match': {'gcTextReco': {'$exists': True}}}, {'$project': {'gcTextReco.words': 1}}, {'$limit': 10}]
pipeline = [{'$match': {'$and': [{'source':'kronikiprl'},{'gcTextReco': {'$exists': True}}]}}, {'$project': {'gcTextReco.words': 1}}]
words = col.aggregate(pipeline)
words_dict={}
for w in words:
ww = w['gcTextReco']['words']
words_dict[w['_id']] = " ".join([ e['word'] for e in ww ])
@@ -25,8 +26,7 @@ for w in words:
for key, value in words_dict.items():
try:
col.update_one(
col.update_one({"_id": key},{"$set":{"gcTextReco.transcript_fix":value}})
col.update_one({"_id": key},{"$set":{"gcTextReco.transcript_fix":value}})
except Exception as e: print(e)
else: