"""Upload archive videos (mp4 only) to Google Cloud Storage.

For every document in the MongoDB collection that has no ``gcs`` field, the
script downloads the first entry of the document's ``mp4`` array from
http://repozytorium.fn.org.pl/, uploads it to the bucket as
``mp4/<_id>.mp4``, records the blob location and upload time on the document
and finally deletes the local copy.

Provenance: src/storageUpload.py as added by commit
40b9efeaab3817870a7b5f0d949c07ed0d2e045b ("gcs file uploader, mp4 only",
Wojtek, Sun, 15 Dec 2019 19:56:51 +0000).
"""
from google.cloud import storage
import sys
# ``import urllib`` alone does not guarantee that the ``request`` submodule is
# loaded; import it explicitly because getVid() calls urllib.request.
import urllib.request
from pymongo import MongoClient
from bson.objectid import ObjectId
import os
import datetime


def main():
    """Download, upload and clean up every movie not yet stored in GCS."""
    # NOTE(review): credentials are hard-coded in the connection string;
    # consider loading them from the environment or a secrets store.
    uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
    dbName = "archSpeechReco"
    colName = "moviesMeta"
    bucket = 'archspeechreco'

    col = getMongoCollection(colName, dbName, uri)

    for doc in getUploadList(col):
        fileName = ObjectId(doc['_id'])
        try:
            # Only attempt the upload when the download actually succeeded.
            if getVid(doc.get('url'), fileName):
                upload_blob(bucket, fileName,
                            "mp4/{}.mp4".format(fileName), col)
        finally:
            # Always clean up, including partially downloaded files.
            _removeLocalCopy("{}.mp4".format(fileName))


def _removeLocalCopy(path):
    """Delete the local file *path*, reporting the outcome on stdout."""
    try:
        os.remove(path)
    except FileNotFoundError:
        # Nothing was downloaded, so there is nothing to clean up.
        pass
    except OSError:
        print("{} has NOT been removed".format(path))
    else:
        print("{} has been removed".format(path))


def upload_blob(bucket_name, source_file_name, destination_blob_name, col):
    """Upload ``<source_file_name>.mp4`` to the bucket and record it in Mongo.

    Args:
        bucket_name: name of the GCS bucket.
        source_file_name: local file name without the ``.mp4`` extension; it
            is also used as the ``_id`` of the document to update.
        destination_blob_name: object name inside the bucket.
        col: MongoDB collection whose document receives the ``gcs`` field.

    Failures are reported on stdout and never raised, so one bad file does
    not stop the whole batch. The document is only updated after a
    successful upload.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    try:
        blob.upload_from_filename("{}.mp4".format(source_file_name))
    except Exception as err:
        # Best effort: report the cause but let KeyboardInterrupt and
        # SystemExit propagate (a bare ``except`` would swallow them).
        print("gcs upload failed: {}".format(err))
        return

    print('File {}.mp4 uploaded to {}.'.format(
        source_file_name,
        destination_blob_name))

    now = datetime.datetime.now()
    try:
        result = col.update_one(
            {"_id": ObjectId(source_file_name)},
            {"$set": {
                "gcs": {
                    "location": destination_blob_name,
                    "uploadDate": now.strftime("%Y-%m-%d %H:%M:%S"),
                },
            }},
        )
    except Exception as err:
        print("mongo update failed: {}".format(err))
        return

    if result.matched_count == 0:
        # update_one succeeds even when the filter matches nothing.
        print("mongo update failed: no document with _id {}".format(
            source_file_name))
    else:
        print("mongo update OK")


def getMongoCollection(colName, dbName, uri):
    """Return collection *colName* of database *dbName* on the server *uri*."""
    client = MongoClient(uri)
    db = client[dbName]
    col = db[colName]

    return col


def getUploadList(col):
    """Return a cursor over the movies that still have to be uploaded.

    Each result holds the document ``_id`` and a ``url`` built from the
    repository base address and the first element of the ``mp4`` array.
    """
    pipeline = []
    # $match phase: filter documents without a gcs field, i.e. movies that
    # have not been uploaded to gcs yet
    pipeline.append({"$match": {
        "gcs": {"$exists": False}
    }})
    # $project phase: show only the url and _id keys
    pipeline.append({"$project": {
        "url": {"$concat": ["http://repozytorium.fn.org.pl/",
                            {"$arrayElemAt": ["$mp4", 0]}]}
    }})
    # skip first N documents
    # pipeline.append({"$skip":362})
    # fetch only N documents
    # pipeline.append({"$limit":20})

    return col.aggregate(pipeline)


def getVid(url, out):
    """Download *url* to ``<out>.mp4`` in the current directory.

    Returns:
        True when the file was downloaded, False otherwise. Errors are
        reported on stdout and not raised.
    """
    if not url:
        # No usable URL was produced for this document.
        print("wrong URL, can't download")
        return False

    try:
        urllib.request.urlretrieve(url, "{}.mp4".format(out))
    except Exception as err:
        print("wrong URL, can't download: {}".format(err))
        return False

    return True


if __name__ == '__main__':
    main()