mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco" dbName = "archSpeechReco" colName = "moviesMeta" def get_mongo_collection(colName,dbName,uri): client = MongoClient(uri,maxPoolSize=512) db = client[dbName] col = db[colName] return col col = get_mongo_collection(colName,dbName,mongoUri) col.aggregate(pipeline) pipeline = [{'$match': {'gcTextReco': {'$exists': True}}}, {'$project': {'gcTextReco.words': 1}}, {'$limit': 10}] words = col.aggregate(pipeline) for w in words: ww = w['gcTextReco']['words'] words_dict[w['_id']] = " ".join([ e['word'] for e in ww ]) for key, value in words_dict.items(): try: col.update_one( col.update_one({"_id": key},{"$set":{"gcTextReco.transcript_fix":value}}) except Exception as e: print(e) else: print(f"mongo update OK {uri.split('/')[4].split('.')[0]}") var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid": {"$toString": parseInt({"$substr": [ {"$toString": "$_id"}, 20, -1]},16) }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript_fix"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$out: "export"} ] var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid":{ "$concat": ["Filmoteka_", {"$toString": parseInt({"$substr": [ {"$toString": "$_id"}, 20, -1]},16) } ] }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$limit: 2} ] var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid":{ "$concat": ["Filmoteka_", {"$toString": "$_id"} ] }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript_fix"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$out: "export3"} ] db.moviesMeta.aggregate(pipeline) var pipeline = [ {$match: {$and: [ {"hash": /[abcd]0$/}, {"gcsWav.location": {"$exists": 1}}, {"gcTextReco.transcript_fix": {"$not": /^$/}} ] }}, {$project: {"_id":0, "hash":1, "plik":{ "$substr": ["$gcsWav.location", 5, -1]}, "opis": "$description.desc", "transkrypcja": "$gcTextReco.transcript_fix"}}, {$out: "sample100"} ] 5df3e63d4c0402698d7844e3