archSpeechReco/src/temp

import os

# Connection settings for the archSpeechReco MongoDB instance.
# The URI embeds credentials, so it can be supplied through the
# ARCHSPEECHRECO_MONGO_URI environment variable; the literal below is only
# the fallback used when that variable is not set.
# NOTE(review): the fallback still carries a plaintext password - rotate it
# and drop the default once the environment variable is set everywhere.
mongoUri = os.environ.get(
    "ARCHSPEECHRECO_MONGO_URI",
    "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco",
)
dbName = "archSpeechReco"
colName = "moviesMeta"

def get_mongo_collection(colName,dbName,uri):
    """Return the collection ``colName`` of database ``dbName`` reached via ``uri``.

    A new ``MongoClient`` is built on every call with ``maxPoolSize=512``.
    Only the collection handle is returned; the client itself is not handed
    back to the caller.

    NOTE(review): ``MongoClient`` is not imported in the visible part of the
    file - presumably ``from pymongo import MongoClient``; confirm.
    """
    return MongoClient(uri, maxPoolSize=512)[dbName][colName]

col = get_mongo_collection(colName, dbName, mongoUri)

# Select documents that have a gcTextReco field, keep only the word list of
# each one, and cap the result at 10 documents.
# The pipeline has to exist before the first aggregate() call, so it is
# defined here rather than after an early call that referenced it.
pipeline = [
    {'$match': {'gcTextReco': {'$exists': True}}},
    {'$project': {'gcTextReco.words': 1}},
    {'$limit': 10},
]

words = col.aggregate(pipeline)

# Map each document _id to a transcript rebuilt by joining the 'word' value
# of every entry in gcTextReco.words with single spaces.
words_dict = {
    doc['_id']: " ".join(entry['word'] for entry in doc['gcTextReco']['words'])
    for doc in words
}


# Write every rebuilt transcript back to its document under
# gcTextReco.transcript_fix. Each update is attempted on its own so that one
# failing document is reported without stopping the remaining updates.
for key, value in words_dict.items():
    try:
        col.update_one(
            {"_id": key},
            {"$set": {"gcTextReco.transcript_fix": value}},
        )
    except Exception as e:
        # Best-effort: report the failure and move on to the next document.
        print(e)
    else:
        # Identify the updated document by its _id; no `uri` name exists at
        # this scope.
        print(f"mongo update OK {key}")


// Pipeline variant that writes to the "export" collection: selects documents
// that have a gcTextReco field and reshapes each into a flat export record.
// NOTE(review): parseInt(...) below is evaluated by the shell while this
// literal is being built, not by the server. Its first argument is an object
// rather than a hex string, so it yields NaN - "wid" presumably ends up as
// "NaN" for every document. Confirm before relying on this variant.
var pipeline = [
    {"$match": {"gcTextReco": {"$exists": true}}},
    {"$project": {
        "_id": 0,
        // Start/end of the day given by description.date.
        "durationStart": {"$concat": ["$description.date", "T00:00:00Z"]},
        "wid": {
            "$toString": parseInt(
                {"$substr": [{"$toString": "$_id"}, 20, -1]},
                16
            )
        },
        // Literal string "null" is used when Produkcja is null or missing.
        "creator": {"$ifNull": ["$description.details.Produkcja", "null"]},
        "originalDate": "$description.date",
        "contents": ["$gcTextReco.transcript_fix"],
        "url": "$url",
        "title": "$title",
        "durationEnd": {"$concat": ["$description.date", "T23:59:59Z"]}
    }},
    {"$out": "export"}
];


// Preview variant: same record shape, but "wid" is prefixed with "Filmoteka_",
// "contents" reads gcTextReco.transcript, and the result is limited to two
// documents instead of being written to a collection.
// NOTE(review): parseInt(...) is evaluated by the shell on an object argument
// while the literal is built, so it yields NaN; "wid" presumably becomes
// "Filmoteka_NaN" for every document - confirm.
var pipeline = [
    {"$match": {"gcTextReco": {"$exists": true}}},
    {"$project": {
        "_id": 0,
        // Start/end of the day given by description.date.
        "durationStart": {"$concat": ["$description.date", "T00:00:00Z"]},
        "wid": {
            "$concat": [
                "Filmoteka_",
                {
                    "$toString": parseInt(
                        {"$substr": [{"$toString": "$_id"}, 20, -1]},
                        16
                    )
                }
            ]
        },
        // Literal string "null" is used when Produkcja is null or missing.
        "creator": {"$ifNull": ["$description.details.Produkcja", "null"]},
        "originalDate": "$description.date",
        "contents": ["$gcTextReco.transcript"],
        "url": "$url",
        "title": "$title",
        "durationEnd": {"$concat": ["$description.date", "T23:59:59Z"]}
    }},
    {"$limit": 2}
];


// Export variant that is actually executed: "wid" is "Filmoteka_" followed by
// the string form of _id, "contents" reads gcTextReco.transcript_fix, and the
// result is written to the "export3" collection.
var pipeline = [
    {"$match": {"gcTextReco": {"$exists": true}}},
    {"$project": {
        "_id": 0,
        // Start/end of the day given by description.date.
        "durationStart": {"$concat": ["$description.date", "T00:00:00Z"]},
        "wid": {"$concat": ["Filmoteka_", {"$toString": "$_id"}]},
        // Literal string "null" is used when Produkcja is null or missing.
        "creator": {"$ifNull": ["$description.details.Produkcja", "null"]},
        "originalDate": "$description.date",
        "contents": ["$gcTextReco.transcript_fix"],
        "url": "$url",
        "title": "$title",
        "durationEnd": {"$concat": ["$description.date", "T23:59:59Z"]}
    }},
    {"$out": "export3"}
];
db.moviesMeta.aggregate(pipeline)

// Sample pipeline writing to the "sample100" collection.
// Match: hash ends with one of a/b/c/d followed by "0", gcsWav.location is
// present, and gcTextReco.transcript_fix does not match the empty-string
// pattern.
// Project: hash, url, plik (gcsWav.location from character index 5 onward -
// presumably dropping a "gs://" prefix; confirm), opis (description.desc) and
// transkrypcja (gcTextReco.transcript_fix).
// No aggregate() call for this pipeline appears in the visible source.
var pipeline = [
    {"$match": {
        "$and": [
            {"hash": /[abcd]0$/},
            {"gcsWav.location": {"$exists": 1}},
            {"gcTextReco.transcript_fix": {"$not": /^$/}}
        ]
    }},
    {"$project": {
        "_id": 0,
        "hash": 1,
        "plik": {"$substr": ["$gcsWav.location", 5, -1]},
        "url": 1,
        "opis": "$description.desc",
        "transkrypcja": "$gcTextReco.transcript_fix"
    }},
    {"$out": "sample100"}
];
# NOTE(review): orphaned fragment - this is only an --exec option with no
# command in front of it, identical to the --exec argument of the youtube-dl
# invocation that follows; it is not runnable by itself.
--exec 'gsutil cp {} gs://archspeechreco/wave && rm {} && gsutil cp `echo {} | cut -d "." -f1`.mp4 gs://archspeechreco/mp4 && rm `echo {} | cut -d "." -f1`.mp4'

# Download every video of the user's channel as mp4 named by video id, extract
# a wav track (-x --audio-format wav) and keep the mp4 (-k). After each item,
# upload the wav and the mp4 to their GCS prefixes and delete the local copies.
# The "./" prefix keeps ids that begin with "-" from being parsed as options
# by gsutil/rm; ${f%.*} strips the extension to derive the mp4 name.
# Assumes the path substituted for {} is relative to the current directory
# (no -o template is given).
youtube-dl -f mp4 -i --id  -x --audio-format wav --add-metadata --write-description \
--proxy socks5://localhost:9999/ -k  \
--exec 'f={}; b="${f%.*}"; gsutil cp "./$f" gs://archspeechreco/wave && rm "./$f" && gsutil cp "./$b.mp4" gs://archspeechreco/mp4 && rm "./$b.mp4"' \
'https://www.youtube.com/user/renirable/videos'

# Print the titles of the playlist entries without downloading anything.
# The URL is quoted because "?" is a shell glob character.
youtube-dl --proxy socks5://localhost:9999/ --get-title 'https://www.youtube.com/playlist?list=PLE6CBDC963E1806AD'

# Download the playlist as mp4 named by video id, extract a wav track and keep
# the mp4 (-k). After each item, upload the wav and the mp4 to their GCS
# prefixes and delete the local copies.
# The extension is stripped with ${f%.*} before "./" is prepended: splitting
# "./<id>.wav" on "." with cut gives an empty first field, which reduced the
# mp4 path to ".mp4". The "./" prefix keeps ids that begin with "-" from being
# parsed as options by gsutil/rm.
# Assumes the path substituted for {} is relative to the current directory
# (no -o template is given). The URL is quoted because "?" is a glob character.
youtube-dl -f mp4 -i --id  -x --audio-format wav --add-metadata --write-description \
--proxy socks5://localhost:9999/ -k  \
--exec 'f={}; b="${f%.*}"; gsutil cp "./$f" gs://archspeechreco/wave && rm "./$f" && gsutil cp "./$b.mp4" gs://archspeechreco/mp4 && rm "./$b.mp4"' \
'https://www.youtube.com/playlist?list=PLE6CBDC963E1806AD'

# Download every video of the channel as mp4 named by video id, extract a wav
# track (-x --audio-format wav) and keep the mp4 (-k). After each item, upload
# the wav and the mp4 to their GCS prefixes and delete the local copies.
# The "./" prefix keeps ids that begin with "-" from being parsed as options
# by gsutil/rm; ${f%.*} strips the extension to derive the mp4 name.
# Assumes the path substituted for {} is relative to the current directory
# (no -o template is given).
youtube-dl -f mp4 -i --id  -x --audio-format wav --add-metadata --write-description \
--proxy socks5://localhost:9999/ -k  \
--exec 'f={}; b="${f%.*}"; gsutil cp "./$f" gs://archspeechreco/wave && rm "./$f" && gsutil cp "./$b.mp4" gs://archspeechreco/mp4 && rm "./$b.mp4"' \
'https://www.youtube.com/channel/UCy91ke1yYCZiFdnZ3vTdY_Q/videos'