102 lines
2.8 KiB
Python
102 lines
2.8 KiB
Python
|
from google.cloud import storage
|
||
|
import sys
|
||
|
import urllib
|
||
|
from pymongo import MongoClient
|
||
|
from bson.objectid import ObjectId
|
||
|
import os
|
||
|
import datetime
|
||
|
|
||
|
|
||
|
def main():
    """Download every movie not yet stored in GCS, upload it, then delete the local copy.

    For each document returned by ``getUploadList`` the video is fetched to
    ``<_id>.mp4`` in the current working directory, uploaded to the bucket as
    ``mp4/<_id>.mp4`` and finally removed from local disk. Documents whose
    download did not produce a local file are skipped.
    """
    # NOTE(review): credentials are embedded in the connection string; consider
    # loading the URI from configuration or the environment instead.
    uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
    dbName = "archSpeechReco"
    colName = "moviesMeta"
    bucket = 'archspeechreco'

    col = getMongoCollection(colName, dbName, uri)

    # Materialise the (small: _id + url) result set up front so no server-side
    # cursor has to stay alive while large files are downloaded and uploaded;
    # idle cursors can be timed out by the server in the middle of a long run.
    toUpload = list(getUploadList(col))

    for doc in toUpload:
        fileName = ObjectId(doc['_id'])
        localPath = "{}.mp4".format(fileName)

        getVid(doc['url'], fileName)
        if not os.path.isfile(localPath):
            # The download failed, so there is nothing to upload or remove.
            print("{}.mp4 has NOT been downloaded, skipping upload".format(fileName))
            continue

        upload_blob(bucket, fileName, "mp4/{}.mp4".format(fileName), col)

        try:
            os.remove(localPath)
        except OSError:
            print("{}.mp4 has NOT been removed".format(fileName))
        else:
            print("{}.mp4 has been removed".format(fileName))
|
||
|
|
||
|
|
||
|
def upload_blob(bucket_name, source_file_name, destination_blob_name, col):
    """Upload ``<source_file_name>.mp4`` to the bucket and record it in Mongo.

    Args:
        bucket_name: Name of the Google Cloud Storage bucket.
        source_file_name: Base name (without ``.mp4``) of the local file; it is
            also converted with ``ObjectId`` to select the document to update.
        destination_blob_name: Object name to create inside the bucket.
        col: Collection whose matching document receives the ``gcs`` field.

    The ``gcs`` field (``location`` and ``uploadDate``) is written only after
    the upload succeeded, so a failed upload never marks a document as stored.
    Upload and update failures are reported on stdout and not re-raised.
    """
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    local_path = "{}.mp4".format(source_file_name)

    try:
        blob.upload_from_filename(local_path)
    except Exception as err:
        print("gcs upload failed")
        print(err)
        # Do not touch Mongo: the document must stay eligible for a retry.
        return

    print('File {}.mp4 uploaded to {}.'.format(
        source_file_name,
        destination_blob_name))

    now = datetime.datetime.now()
    try:
        col.update_one(
            {"_id": ObjectId(source_file_name)},
            {"$set": {
                "gcs": {
                    "location": destination_blob_name,
                    "uploadDate": now.strftime("%Y-%m-%d %H:%M:%S"),
                }
            }},
        )
    except Exception as err:
        print("mongo update failed")
        print(err)
    else:
        print("mongo update OK")
|
||
|
|
||
|
|
||
|
def getMongoCollection(colName, dbName, uri):
    """Connect to ``uri`` and return collection ``colName`` of database ``dbName``."""
    return MongoClient(uri)[dbName][colName]
|
||
|
|
||
|
|
||
|
def getUploadList(col):
    """Run the aggregation that lists movies still missing from GCS.

    Returns whatever ``col.aggregate`` yields for a two-stage pipeline; each
    resulting document carries ``_id`` and a ``url`` built from the first
    entry of its ``mp4`` array.
    """
    # Stage 1: keep only documents without a "gcs" field (not uploaded yet).
    not_uploaded = {"$match": {"gcs": {"$exists": False}}}

    # Stage 2: expose only the download URL next to the _id.
    first_mp4 = {"$arrayElemAt": ["$mp4", 0]}
    url_only = {
        "$project": {
            "url": {"$concat": ["http://repozytorium.fn.org.pl/", first_mp4]}
        }
    }

    # For partial runs, {"$skip": N} and/or {"$limit": N} stages can be
    # appended to the list below.
    return col.aggregate([not_uploaded, url_only])
|
||
|
|
||
|
|
||
|
def getVid(url, out):
    """Download ``url`` into the local file ``<out>.mp4``.

    Args:
        url: Address of the video to fetch.
        out: Base name of the target file; ``.mp4`` is appended.

    Returns:
        True when the download completed, False when it failed (a message is
        printed and the error is not re-raised).
    """
    # A plain ``import urllib`` does not load the ``request`` submodule, so it
    # is imported explicitly here; otherwise the attribute lookup itself can
    # fail and be misreported as a bad URL.
    import urllib.request

    try:
        urllib.request.urlretrieve(url, "{}.mp4".format(out))
    except Exception:
        print("wrong URL, can't download")
        return False
    return True
|
||
|
|
||
|
|
||
|
# Run the upload job only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||
|
|