prawilny refaktor początek

This commit is contained in:
Wojciech Smolak 2020-06-06 00:56:55 +02:00
parent 849e2cf21c
commit f996eb58ed
7 changed files with 153 additions and 138 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
.idea
__pycache__

0
src/__init__.py Normal file
View File

0
src/mongo/__init__.py Normal file
View File

9
src/mongo/helpers.py Normal file
View File

@ -0,0 +1,9 @@
from pymongo import MongoClient
def get_mongo_collection(col_name, db_name, uri):
client = MongoClient(uri)
db = client[db_name]
col = db[col_name]
return col

View File

@ -1,17 +1,14 @@
#from google.cloud import speech_v1
from google.cloud import speech_v1p1beta1
from google.cloud.speech_v1p1beta1 import enums
from google.cloud.speech_v1p1beta1 import types
from pymongo import MongoClient
import json
import argparse
from google.protobuf.json_format import MessageToJson,MessageToDict
from storageUpload import getMongoCollection
from google.protobuf.json_format import MessageToDict
from src.mongo.helpers import get_mongo_collection
from bson.objectid import ObjectId
import datetime
import time
import concurrent.futures
import re
def main(args):
@ -19,7 +16,7 @@ def main(args):
dbName = "archSpeechReco"
colName = "moviesMeta"
global col
col = getMongoCollection(colName,dbName,mongoUri)
col = get_mongo_collection(colName, dbName, mongoUri)
batch_size = int(args.batch_size)
waves = getWavList(col, batch_size)
uris = [w['gcsWawLocation'] for w in waves]
@ -51,7 +48,8 @@ def run_reco(uri):
"gcTextReco.words": words,
"gcTextReco.transcripted": now.strftime("%Y-%m-%d %H:%M:%S")}}
)
except Exception as e: print(e)
except Exception as e:
print(e)
else:
print(f"mongo update OK {uri.split('/')[4].split('.')[0]}")

View File

@ -1,12 +1,14 @@
from google.cloud import storage
import sys
import urllib
from pymongo import MongoClient
from src.mongo.helpers import get_mongo_collection
from bson.objectid import ObjectId
import os
import datetime
from subprocess import run,DEVNULL
from subprocess import run, DEVNULL, CalledProcessError
import argparse
from urllib.error import URLError, HTTPError, ContentTooShortError
import logging
def main(args):
uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
@ -14,46 +16,47 @@ def main(args):
colName = "moviesMeta"
bucket = 'archspeechreco'
col = getMongoCollection(colName,dbName,uri)
col = get_mongo_collection(colName, dbName, uri)
fileFormat = args.format
if (fileFormat == 'mp4'):
uploadMp4(col,bucket)
upload_mp4(col, bucket)
elif (fileFormat == 'wav'):
uploadWave(col,bucket)
upload_wave(col, bucket)
def uploadMp4(col,bucket):
toUpload = getUploadList(col)
for i in toUpload:
fileName = ObjectId(i['_id'])
getVid( i['url'], ObjectId( i['_id'] ) )
upload_blob(bucket, "{}.mp4".format(fileName), "mp4/{}.mp4".format(fileName),col,"Mp4")
def upload_mp4(col, bucket):
to_upload = get_upload_list(col)
for i in to_upload:
file_name = ObjectId(i['_id'])
get_vid(i['url'], ObjectId(i['_id']))
upload_blob(bucket, "{}.mp4".format(file_name), "mp4/{}.mp4".format(file_name), col, "Mp4")
try:
os.remove("{}.mp4".format(fileName))
os.remove("{}.mp4".format(file_name))
except:
print("{}.mp4 has NOT been removed".format(fileName))
print("{}.mp4 has NOT been removed".format(file_name))
else:
print("{}.mp4 has been removed".format(fileName))
print("{}.mp4 has been removed".format(file_name))
def uploadWave(col,bucket):
toUpload = getWavUploadList(col)
def upload_wave(col, bucket):
to_upload = get_wav_upload_list(col)
for i in toUpload:
fileName = ObjectId(i['_id'])
getVid( i['url'], ObjectId( i['_id'] ) )
getWave("{}.mp4".format(fileName))
upload_blob(bucket, "{}.wav".format(fileName), "wave/{}.wav".format(fileName),col,"Wav")
for i in to_upload:
file_name = ObjectId(i['_id'])
get_vid(i['url'], ObjectId(i['_id']))
get_wave("{}.mp4".format(file_name))
upload_blob(bucket, "{}.wav".format(file_name), "wave/{}.wav".format(file_name), col, "Wav")
try:
os.remove("{}.wav".format(fileName))
os.remove("{}.wav".format(file_name))
except:
print("{}.wav has NOT been removed".format(fileName))
print("{}.wav has NOT been removed".format(file_name))
else:
print("{}.wav has been removed".format(fileName))
print("{}.wav has been removed".format(file_name))
def upload_blob(bucket_name, source_file_name, destination_blob_name,col,fileFormat):
def upload_blob(bucket_name, source_file_name, destination_blob_name, col, file_format):
"""Uploads a file to the bucket."""
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)
@ -66,14 +69,14 @@ def upload_blob(bucket_name, source_file_name, destination_blob_name,col,fileFor
else:
print('File {}.{} uploaded to {}.'.format(
source_file_name,
fileFormat,
file_format,
destination_blob_name))
now = datetime.datetime.now()
try:
col.update_one(
{"_id": ObjectId(source_file_name.split('.')[0])},
{"$set": {
"gcs{}".format(fileFormat):{
"gcs{}".format(file_format): {
"location": destination_blob_name,
"uploadDate": now.strftime("%Y-%m-%d %H:%M:%S")
}
@ -86,26 +89,16 @@ def upload_blob(bucket_name, source_file_name, destination_blob_name,col,fileFor
print("mongo update OK")
def getMongoCollection(colName,dbName,uri):
client = MongoClient(uri)
db = client[dbName]
col = db[colName]
return col
def getUploadList(col):
pipeline = []
#$match phase, filetr documents withour gcs field - movies not uploaded to gcs
pipeline.append({"$match": {
def get_upload_list(col):
pipeline = [{"$match": {
"gcsMp4": {"$exists": False}
}
})
#project phase, show only url and _id keys
pipeline.append({"$project": {
}, {"$project": {
"url": {"$concat": ["http://repozytorium.fn.org.pl/", {"$arrayElemAt": ["$mp4", 0]}]}
}
})
}]
# $match phase, filetr documents withour gcs field - movies not uploaded to gcs
# project phase, show only url and _id keys
# skip first N documents
# pipeline.append({"$skip":362})
# fetch only N documents
@ -113,18 +106,17 @@ def getUploadList(col):
return col.aggregate(pipeline)
def getWavUploadList(col):
pipeline = []
#$match phase, filetr documents withour gcs field - movies not uploaded to gcs
pipeline.append({"$match": {
def get_wav_upload_list(col):
pipeline = [{"$match": {
"gcsWav": {"$exists": False}
}
})
#project phase, show only url and _id keys
pipeline.append({"$project": {
}, {"$project": {
"url": {"$concat": ["http://repozytorium.fn.org.pl/", {"$arrayElemAt": ["$mp4", 0]}]}
}
})
}]
# $match phase, filter documents without gcs field - movies not uploaded to gcs
# project phase, show only url and _id keys
# skip first N documents
# pipeline.append({"$skip":362})
# fetch only N documents
@ -133,23 +125,31 @@ def getWavUploadList(col):
return col.aggregate(pipeline)
def getVid(url,out):
def get_vid(url, out):
try:
urllib.request.urlretrieve(url, "{}.mp4".format(out))
except:
print("wrong URL, can't download")
def getWave(filename):
try:
run(['ffmpeg','-i', filename, '-vn', '-acodec', 'pcm_s16le', '-ar', '44100', '-ac', '1', filename.replace("mp4","wav")],stdout=DEVNULL)
except:
print("problem with ffmpeg")
except URLError as e:
print("can't download, {}".format(e.reason))
except HTTPError as e:
print("reason:{}, Http code: {}", format(e.reason, e.code))
except ContentTooShortError:
print("content too short error")
else:
print("file {}.mp4 has been downloaded from {}".format(out, url))
def get_wave(filename):
try:
run(['ffmpeg', '-i', filename, '-vn', '-acodec', 'pcm_s16le', '-ar', '44100', '-ac', '1',
filename.replace("mp4", "wav")], stdout=DEVNULL, check=True)
except CalledProcessError as e:
print("problem with ffmpeg, {} exited with {} code".format(e.cmd, e.returncode))
else:
print("file {} has been decoded to waw format".format(filename))
try:
os.remove(filename)
except:
print("{} has NOT been removed".format(filename))
except OSError as e:
print("{} has NOT been removed, {}".format(filename, e.strerror))
else:
print("{} has been removed".format(filename))
@ -159,4 +159,3 @@ if __name__ == '__main__':
parser.add_argument("--format", default='mp4', help="format to fetch and upload, [mp4, wav]")
args = parser.parse_args()
main(args)

View File

@ -2,14 +2,14 @@ mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
dbName = "archSpeechReco"
colName = "moviesMeta"
def getMongoCollection(colName,dbName,uri):
def get_mongo_collection(colName,dbName,uri):
client = MongoClient(uri,maxPoolSize=512)
db = client[dbName]
col = db[colName]
return col
col = getMongoCollection(colName,dbName,mongoUri)
col = get_mongo_collection(colName,dbName,mongoUri)
col.aggregate(pipeline)
@ -42,3 +42,11 @@ var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"
var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid":{ "$concat": ["Filmoteka_", {"$toString": "$_id"} ] }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript_fix"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$out: "export3"} ]
db.moviesMeta.aggregate(pipeline)
var pipeline = [ {$match: {$and: [ {"hash": /[abcd]0$/}, {"gcsWav.location": {"$exists": 1}}, {"gcTextReco.transcript_fix": {"$not": /^$/}} ] }},
{$project: {"_id":0, "hash":1, "plik":{ "$substr": ["$gcsWav.location", 5, -1]}, "opis": "$description.desc", "transkrypcja": "$gcTextReco.transcript_fix"}},
{$out: "sample100"}
]
5df3e63d4c0402698d7844e3