prawilny refaktor początek

This commit is contained in:
Wojciech Smolak 2020-06-06 00:56:55 +02:00
parent 849e2cf21c
commit f996eb58ed
7 changed files with 153 additions and 138 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
.idea
__pycache__ __pycache__

0
src/__init__.py Normal file
View File

0
src/mongo/__init__.py Normal file
View File

9
src/mongo/helpers.py Normal file
View File

@ -0,0 +1,9 @@
from pymongo import MongoClient
def get_mongo_collection(col_name, db_name, uri):
    """Return the collection ``col_name`` from database ``db_name``.

    A new ``MongoClient`` is created for ``uri`` on every call; the database
    and the collection are then looked up by name on that client.
    """
    database = MongoClient(uri)[db_name]
    return database[col_name]

View File

@ -1,17 +1,14 @@
#from google.cloud import speech_v1
from google.cloud import speech_v1p1beta1 from google.cloud import speech_v1p1beta1
from google.cloud.speech_v1p1beta1 import enums from google.cloud.speech_v1p1beta1 import enums
from google.cloud.speech_v1p1beta1 import types from google.cloud.speech_v1p1beta1 import types
from pymongo import MongoClient from pymongo import MongoClient
import json
import argparse import argparse
from google.protobuf.json_format import MessageToJson,MessageToDict from google.protobuf.json_format import MessageToDict
from storageUpload import getMongoCollection from src.mongo.helpers import get_mongo_collection
from bson.objectid import ObjectId from bson.objectid import ObjectId
import datetime import datetime
import time import time
import concurrent.futures import concurrent.futures
import re
def main(args): def main(args):
@ -19,7 +16,7 @@ def main(args):
dbName = "archSpeechReco" dbName = "archSpeechReco"
colName = "moviesMeta" colName = "moviesMeta"
global col global col
col = getMongoCollection(colName,dbName,mongoUri) col = get_mongo_collection(colName, dbName, mongoUri)
batch_size = int(args.batch_size) batch_size = int(args.batch_size)
waves = getWavList(col, batch_size) waves = getWavList(col, batch_size)
uris = [w['gcsWawLocation'] for w in waves] uris = [w['gcsWawLocation'] for w in waves]
@ -51,7 +48,8 @@ def run_reco(uri):
"gcTextReco.words": words, "gcTextReco.words": words,
"gcTextReco.transcripted": now.strftime("%Y-%m-%d %H:%M:%S")}} "gcTextReco.transcripted": now.strftime("%Y-%m-%d %H:%M:%S")}}
) )
except Exception as e: print(e) except Exception as e:
print(e)
else: else:
print(f"mongo update OK {uri.split('/')[4].split('.')[0]}") print(f"mongo update OK {uri.split('/')[4].split('.')[0]}")

View File

@ -1,12 +1,14 @@
from google.cloud import storage from google.cloud import storage
import sys
import urllib import urllib
from pymongo import MongoClient from src.mongo.helpers import get_mongo_collection
from bson.objectid import ObjectId from bson.objectid import ObjectId
import os import os
import datetime import datetime
from subprocess import run,DEVNULL from subprocess import run, DEVNULL, CalledProcessError
import argparse import argparse
from urllib.error import URLError, HTTPError, ContentTooShortError
import logging
def main(args): def main(args):
uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco" uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
@ -14,46 +16,47 @@ def main(args):
colName = "moviesMeta" colName = "moviesMeta"
bucket = 'archspeechreco' bucket = 'archspeechreco'
col = getMongoCollection(colName,dbName,uri) col = get_mongo_collection(colName, dbName, uri)
fileFormat = args.format fileFormat = args.format
if (fileFormat == 'mp4'): if (fileFormat == 'mp4'):
uploadMp4(col,bucket) upload_mp4(col, bucket)
elif (fileFormat == 'wav'): elif (fileFormat == 'wav'):
uploadWave(col,bucket) upload_wave(col, bucket)
def upload_mp4(col, bucket):
    """Download every pending movie, upload it as mp4 and delete the local copy.

    The documents to process come from ``get_upload_list(col)``; each one must
    provide ``_id`` and ``url``. For every document the video is fetched with
    ``get_vid``, pushed to ``bucket`` under ``mp4/<id>.mp4`` through
    ``upload_blob`` and the local ``<id>.mp4`` file is removed afterwards.
    """
    for doc in get_upload_list(col):
        file_name = ObjectId(doc['_id'])
        local_name = "{}.mp4".format(file_name)
        get_vid(doc['url'], file_name)
        upload_blob(bucket, local_name, "mp4/{}".format(local_name), col, "Mp4")
        try:
            os.remove(local_name)
        except OSError:
            # Only filesystem errors are expected here; a bare ``except`` would
            # also swallow KeyboardInterrupt / SystemExit.
            print("{}.mp4 has NOT been removed".format(file_name))
        else:
            print("{}.mp4 has been removed".format(file_name))
def upload_wave(col, bucket):
    """Download every pending movie, convert it to wav, upload and clean up.

    The documents to process come from ``get_wav_upload_list(col)``; each one
    must provide ``_id`` and ``url``. For every document the video is fetched
    with ``get_vid``, converted by ``get_wave``, pushed to ``bucket`` under
    ``wave/<id>.wav`` through ``upload_blob`` and the local ``<id>.wav`` file
    is removed afterwards.
    """
    for doc in get_wav_upload_list(col):
        file_name = ObjectId(doc['_id'])
        wav_name = "{}.wav".format(file_name)
        get_vid(doc['url'], file_name)
        get_wave("{}.mp4".format(file_name))
        upload_blob(bucket, wav_name, "wave/{}".format(wav_name), col, "Wav")
        try:
            os.remove(wav_name)
        except OSError:
            # Only filesystem errors are expected here; a bare ``except`` would
            # also swallow KeyboardInterrupt / SystemExit.
            print("{}.wav has NOT been removed".format(file_name))
        else:
            print("{}.wav has been removed".format(file_name))
def upload_blob(bucket_name, source_file_name, destination_blob_name,col,fileFormat): def upload_blob(bucket_name, source_file_name, destination_blob_name, col, file_format):
"""Uploads a file to the bucket.""" """Uploads a file to the bucket."""
storage_client = storage.Client() storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name) bucket = storage_client.get_bucket(bucket_name)
@ -66,14 +69,14 @@ def upload_blob(bucket_name, source_file_name, destination_blob_name,col,fileFor
else: else:
print('File {}.{} uploaded to {}.'.format( print('File {}.{} uploaded to {}.'.format(
source_file_name, source_file_name,
fileFormat, file_format,
destination_blob_name)) destination_blob_name))
now = datetime.datetime.now() now = datetime.datetime.now()
try: try:
col.update_one( col.update_one(
{"_id": ObjectId(source_file_name.split('.')[0])}, {"_id": ObjectId(source_file_name.split('.')[0])},
{"$set": { {"$set": {
"gcs{}".format(fileFormat):{ "gcs{}".format(file_format): {
"location": destination_blob_name, "location": destination_blob_name,
"uploadDate": now.strftime("%Y-%m-%d %H:%M:%S") "uploadDate": now.strftime("%Y-%m-%d %H:%M:%S")
} }
@ -86,26 +89,16 @@ def upload_blob(bucket_name, source_file_name, destination_blob_name,col,fileFor
print("mongo update OK") print("mongo update OK")
def getMongoCollection(colName,dbName,uri): def get_upload_list(col):
client = MongoClient(uri) pipeline = [{"$match": {
db = client[dbName]
col = db[colName]
return col
def getUploadList(col):
pipeline = []
#$match phase, filetr documents withour gcs field - movies not uploaded to gcs
pipeline.append({"$match": {
"gcsMp4": {"$exists": False} "gcsMp4": {"$exists": False}
} }
}) }, {"$project": {
#project phase, show only url and _id keys
pipeline.append({"$project": {
"url": {"$concat": ["http://repozytorium.fn.org.pl/", {"$arrayElemAt": ["$mp4", 0]}]} "url": {"$concat": ["http://repozytorium.fn.org.pl/", {"$arrayElemAt": ["$mp4", 0]}]}
} }
}) }]
# $match phase, filter documents without gcs field - movies not uploaded to gcs
# project phase, show only url and _id keys
# skip first N documents # skip first N documents
# pipeline.append({"$skip":362}) # pipeline.append({"$skip":362})
# fetch only N documents # fetch only N documents
@ -113,18 +106,17 @@ def getUploadList(col):
return col.aggregate(pipeline) return col.aggregate(pipeline)
def getWavUploadList(col):
pipeline = [] def get_wav_upload_list(col):
#$match phase, filetr documents withour gcs field - movies not uploaded to gcs pipeline = [{"$match": {
pipeline.append({"$match": {
"gcsWav": {"$exists": False} "gcsWav": {"$exists": False}
} }
}) }, {"$project": {
#project phase, show only url and _id keys
pipeline.append({"$project": {
"url": {"$concat": ["http://repozytorium.fn.org.pl/", {"$arrayElemAt": ["$mp4", 0]}]} "url": {"$concat": ["http://repozytorium.fn.org.pl/", {"$arrayElemAt": ["$mp4", 0]}]}
} }
}) }]
# $match phase, filter documents without gcs field - movies not uploaded to gcs
# project phase, show only url and _id keys
# skip first N documents # skip first N documents
# pipeline.append({"$skip":362}) # pipeline.append({"$skip":362})
# fetch only N documents # fetch only N documents
@ -133,23 +125,31 @@ def getWavUploadList(col):
return col.aggregate(pipeline) return col.aggregate(pipeline)
def get_vid(url, out):
    """Download ``url`` and store it as ``<out>.mp4``.

    Download problems are reported on stdout instead of being raised, so the
    caller continues with the next item.

    Args:
        url: address of the file to fetch.
        out: target name without extension; ``.mp4`` is appended.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request
    from urllib.error import ContentTooShortError, HTTPError, URLError

    try:
        urllib.request.urlretrieve(url, "{}.mp4".format(out))
    # HTTPError and ContentTooShortError both derive from URLError, so they
    # have to be handled before the generic URLError clause to be reachable.
    except HTTPError as e:
        print("reason:{}, Http code: {}".format(e.reason, e.code))
    except ContentTooShortError:
        print("content too short error")
    except URLError as e:
        print("can't download, {}".format(e.reason))
    else:
        print("file {}.mp4 has been downloaded from {}".format(out, url))
def get_wave(filename):
    """Extract the audio track of ``filename`` into a wav file using ffmpeg.

    ffmpeg is asked for a mono, 44100 Hz, 16-bit PCM file written next to the
    source with the extension replaced by ``.wav``. When the conversion
    succeeds the source file is deleted; when it fails the source is kept.
    Failures are reported on stdout instead of being raised.

    Args:
        filename: path of the video file to convert.
    """
    # Swap only the extension; str.replace("mp4", "wav") would also rewrite
    # any other "mp4" occurring in the path (e.g. a directory name).
    wav_name = os.path.splitext(filename)[0] + ".wav"
    try:
        run(['ffmpeg', '-i', filename, '-vn', '-acodec', 'pcm_s16le', '-ar', '44100', '-ac', '1',
             wav_name], stdout=DEVNULL, check=True)
    except CalledProcessError as e:
        print("problem with ffmpeg, {} exited with {} code".format(e.cmd, e.returncode))
    except OSError as e:
        # Raised when the ffmpeg executable cannot be started at all
        # (for example it is not installed).
        print("problem with ffmpeg, could not run it: {}".format(e.strerror))
    else:
        print("file {} has been decoded to waw format".format(filename))
        try:
            os.remove(filename)
        except OSError as e:
            print("{} has NOT been removed, {}".format(filename, e.strerror))
        else:
            print("{} has been removed".format(filename))
@ -159,4 +159,3 @@ if __name__ == '__main__':
parser.add_argument("--format", default='mp4', help="format to fetch and upload, [mp4, wav]") parser.add_argument("--format", default='mp4', help="format to fetch and upload, [mp4, wav]")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)

View File

@ -2,14 +2,14 @@ mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
dbName = "archSpeechReco" dbName = "archSpeechReco"
colName = "moviesMeta" colName = "moviesMeta"
def get_mongo_collection(colName,dbName,uri):
    """Return collection ``colName`` of database ``dbName``.

    A new ``MongoClient`` is created for ``uri`` with ``maxPoolSize=512`` on
    every call.
    """
    database = MongoClient(uri, maxPoolSize=512)[dbName]
    return database[colName]
col = getMongoCollection(colName,dbName,mongoUri) col = get_mongo_collection(colName,dbName,mongoUri)
col.aggregate(pipeline) col.aggregate(pipeline)
@ -42,3 +42,11 @@ var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"
var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid":{ "$concat": ["Filmoteka_", {"$toString": "$_id"} ] }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript_fix"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$out: "export3"} ] var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid":{ "$concat": ["Filmoteka_", {"$toString": "$_id"} ] }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript_fix"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$out: "export3"} ]
db.moviesMeta.aggregate(pipeline) db.moviesMeta.aggregate(pipeline)
var pipeline = [ {$match: {$and: [ {"hash": /[abcd]0$/}, {"gcsWav.location": {"$exists": 1}}, {"gcTextReco.transcript_fix": {"$not": /^$/}} ] }},
{$project: {"_id":0, "hash":1, "plik":{ "$substr": ["$gcsWav.location", 5, -1]}, "opis": "$description.desc", "transkrypcja": "$gcTextReco.transcript_fix"}},
{$out: "sample100"}
]
5df3e63d4c0402698d7844e3