gcs file uploader, mp4 only

This commit is contained in:
Wojtek 2019-12-15 19:56:51 +00:00
parent 35ab9b8013
commit 40b9efeaab
1 changed files with 101 additions and 0 deletions

101
src/storageUpload.py Normal file
View File

@@ -0,0 +1,101 @@
from google.cloud import storage
import sys
import urllib
from pymongo import MongoClient
from bson.objectid import ObjectId
import os
import datetime
def main():
    """Download every movie not yet stored in GCS, upload it, and clean up.

    For each document returned by ``getUploadList`` the movie is downloaded
    to ``<_id>.mp4`` in the current working directory, uploaded to the
    bucket under ``mp4/<_id>.mp4`` and the local copy is removed afterwards.
    Documents whose download produced no local file are skipped, so no
    upload is attempted for a file that does not exist.
    """
    # NOTE(review): the connection string embeds a user name and password in
    # source control; consider loading it from the environment or a secrets
    # store instead.
    uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
    dbName = "archSpeechReco"
    colName = "moviesMeta"
    bucket = 'archspeechreco'

    col = getMongoCollection(colName, dbName, uri)
    for doc in getUploadList(col):
        fileName = ObjectId(doc['_id'])
        localFile = "{}.mp4".format(fileName)

        getVid(doc['url'], fileName)
        if not os.path.isfile(localFile):
            # Nothing was downloaded: uploading would only fail, so move on
            # to the next document.
            print("{} has NOT been downloaded, upload skipped".format(localFile))
            continue

        upload_blob(bucket, fileName, "mp4/{}.mp4".format(fileName), col)

        try:
            os.remove(localFile)
        except OSError:
            # Only filesystem errors are expected here; anything else
            # (including Ctrl-C) must not be swallowed.
            print("{}.mp4 has NOT been removed".format(fileName))
        else:
            print("{}.mp4 has been removed".format(fileName))
def upload_blob(bucket_name, source_file_name, destination_blob_name, col):
    """Upload ``<source_file_name>.mp4`` to a GCS bucket and record it in MongoDB.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Base name of the local file (``.mp4`` is appended);
            also used, via ``ObjectId``, as the ``_id`` of the document to
            update.
        destination_blob_name: Object name to create inside the bucket.
        col: Collection whose document gets the ``gcs`` sub-document.

    Returns:
        True when both the upload and the metadata update succeeded,
        False otherwise. Failures are reported on stdout, not raised.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    try:
        blob.upload_from_filename("{}.mp4".format(source_file_name))
    except Exception as err:
        # Deliberately best-effort: report and let the caller continue with
        # the next file. ``Exception`` (not a bare except) keeps
        # KeyboardInterrupt/SystemExit propagating.
        print("gcs upload failed: {}".format(err))
        # Do not mark the document as uploaded when the upload failed.
        return False

    print('File {}.mp4 uploaded to {}.'.format(
        source_file_name,
        destination_blob_name))

    # Timestamp is the local, timezone-naive wall clock time.
    now = datetime.datetime.now()
    try:
        col.update_one(
            {"_id": ObjectId(source_file_name)},
            {"$set": {
                "gcs": {
                    "location": destination_blob_name,
                    "uploadDate": now.strftime("%Y-%m-%d %H:%M:%S"),
                },
            }},
        )
    except Exception as err:
        print("mongo update failed: {}".format(err))
        return False

    print("mongo update OK")
    return True
def getMongoCollection(colName, dbName, uri):
    """Return collection ``colName`` of database ``dbName`` reached via ``uri``.

    A new ``MongoClient`` is created for ``uri`` on every call.
    """
    return MongoClient(uri)[dbName][colName]
def getUploadList(col, skip=None, limit=None):
    """Return the movies that have not been uploaded to GCS yet.

    Runs an aggregation on ``col`` that keeps documents without a ``gcs``
    field and projects, next to ``_id``, a ``url`` built from the fixed
    repository prefix and the first element of the ``mp4`` array.

    Args:
        col: Collection to aggregate over.
        skip: Optional number of matching documents to skip; omitted from
            the pipeline when ``None`` or 0.
        limit: Optional maximum number of documents to return; omitted from
            the pipeline when ``None``.

    Returns:
        Whatever ``col.aggregate`` returns for the assembled pipeline.
    """
    pipeline = [
        # $match stage: keep documents without a gcs field, i.e. movies not
        # uploaded to GCS.
        {"$match": {
            "gcs": {"$exists": False},
        }},
        # $project stage: expose only the url (and the implicit _id).
        {"$project": {
            "url": {"$concat": ["http://repozytorium.fn.org.pl/",
                                {"$arrayElemAt": ["$mp4", 0]}]},
        }},
    ]
    if skip:
        pipeline.append({"$skip": skip})
    if limit is not None:
        pipeline.append({"$limit": limit})
    return col.aggregate(pipeline)
def getVid(url, out):
    """Download ``url`` into the local file ``<out>.mp4``.

    Args:
        url: Address of the file to fetch. ``None`` or an empty string is
            reported as a failed download.
        out: Base name of the target file; ``.mp4`` is appended.

    Returns:
        True when the download completed, False otherwise. Failures are
        reported on stdout, not raised.
    """
    # ``import urllib`` alone does not load the ``request`` submodule, so
    # import it explicitly instead of relying on another module having
    # done so.
    import http.client
    import urllib.request

    if not url:
        print("wrong URL, can't download")
        return False

    target = "{}.mp4".format(out)
    try:
        urllib.request.urlretrieve(url, target)
    except (OSError, ValueError, http.client.HTTPException) as err:
        # OSError covers URLError/HTTPError and socket errors, ValueError
        # covers malformed URLs, HTTPException covers protocol-level errors.
        print("wrong URL, can't download: {}".format(err))
        try:
            # Drop a partially written file so it cannot be mistaken for a
            # complete download.
            os.remove(target)
        except OSError:
            # Best-effort cleanup: usually there is simply nothing to remove.
            pass
        return False
    return True
# Run the upload job only when the file is executed as a script, not when
# it is imported.
if "__main__" == __name__:
    main()