prawilny refaktor początek

This commit is contained in:
Wojciech Smolak 2020-06-06 00:56:55 +02:00
parent 849e2cf21c
commit f996eb58ed
7 changed files with 153 additions and 138 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
.idea
__pycache__

0
src/__init__.py Normal file
View File

0
src/mongo/__init__.py Normal file
View File

9
src/mongo/helpers.py Normal file
View File

@ -0,0 +1,9 @@
from pymongo import MongoClient
def get_mongo_collection(col_name, db_name, uri):
client = MongoClient(uri)
db = client[db_name]
col = db[col_name]
return col

View File

@ -1,17 +1,14 @@
#from google.cloud import speech_v1
from google.cloud import speech_v1p1beta1
from google.cloud.speech_v1p1beta1 import enums
from google.cloud.speech_v1p1beta1 import types
from pymongo import MongoClient
import json
import argparse
from google.protobuf.json_format import MessageToJson,MessageToDict
from storageUpload import getMongoCollection
from google.protobuf.json_format import MessageToDict
from src.mongo.helpers import get_mongo_collection
from bson.objectid import ObjectId
import datetime
import time
import concurrent.futures
import re
def main(args):
@ -19,17 +16,17 @@ def main(args):
dbName = "archSpeechReco"
colName = "moviesMeta"
global col
col = getMongoCollection(colName,dbName,mongoUri)
col = get_mongo_collection(colName, dbName, mongoUri)
batch_size = int(args.batch_size)
waves = getWavList(col,batch_size)
uris = [ w['gcsWawLocation'] for w in waves ]
waves = getWavList(col, batch_size)
uris = [w['gcsWawLocation'] for w in waves]
start = time.perf_counter()
with concurrent.futures.ThreadPoolExecutor(max_workers=64) as executor:
executor.map(run_reco, uris)
stop = time.perf_counter()
print(f'Finished in {round(stop-start, 2)} seconds')
print(f'Finished in {round(stop - start, 2)} seconds')
def run_reco(uri):
@ -38,7 +35,7 @@ def run_reco(uri):
if (len(recoDict) != 0):
words = recoDict["results"][-1]["alternatives"][0]["words"]
transcript = "".join( [ trans["alternatives"][0]["transcript"] for trans in recoDict["results"][:-1] ] )
transcript = "".join([trans["alternatives"][0]["transcript"] for trans in recoDict["results"][:-1]])
elif (len(recoDict) == 0):
words = {}
transcript = "film niemy"
@ -46,12 +43,13 @@ def run_reco(uri):
now = datetime.datetime.now()
try:
col.update_one(
{"_id": ObjectId(uri.split('/')[4].split('.')[0])},
{"$set":{"gcTextReco.transcript":transcript,
"gcTextReco.words":words,
"gcTextReco.transcripted":now.strftime("%Y-%m-%d %H:%M:%S")}}
{"_id": ObjectId(uri.split('/')[4].split('.')[0])},
{"$set": {"gcTextReco.transcript": transcript,
"gcTextReco.words": words,
"gcTextReco.transcripted": now.strftime("%Y-%m-%d %H:%M:%S")}}
)
except Exception as e: print(e)
except Exception as e:
print(e)
else:
print(f"mongo update OK {uri.split('/')[4].split('.')[0]}")
@ -65,7 +63,7 @@ def recognize(storage_uri):
storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
"""
#client = speech_v1.SpeechClient()
# client = speech_v1.SpeechClient()
client = speech_v1p1beta1.SpeechClient()
# storage_uri = 'gs://cloud-samples-data/speech/brooklyn_bridge.raw'
@ -79,21 +77,21 @@ def recognize(storage_uri):
# This field is optional for FLAC and WAV audio formats.
encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16
enable_speaker_diarization = True
#config = {
#"sample_rate_hertz": sample_rate_hertz,
# config = {
# "sample_rate_hertz": sample_rate_hertz,
# "language_code": language_code,
# "encoding": encoding,
# "enableSpeakerDiarization": enable_speaker_diarization
#
d_config = types.SpeakerDiarizationConfig(
enable_speaker_diarization=True
)
enable_speaker_diarization=True
)
config = types.RecognitionConfig(
encoding = enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz = 44100,
language_code = "pl-PL",
diarization_config=d_config
)
encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
sample_rate_hertz=44100,
language_code="pl-PL",
diarization_config=d_config
)
audio = {"uri": storage_uri}
@ -105,31 +103,31 @@ def recognize(storage_uri):
return response
def getMongoCollection(colName,dbName,uri):
client = MongoClient(uri,maxPoolSize=512)
def getMongoCollection(colName, dbName, uri):
client = MongoClient(uri, maxPoolSize=512)
db = client[dbName]
col = db[colName]
return col
def getWavList(col,limit=32):
def getWavList(col, limit=32):
pipeline = []
#match phase, filetr documents withour gcTextReco field - voice not recognized
pipeline.append({"$match": {"$and":[
{"gcTextReco": {"$exists": False}},
{"gcsWav": {"$exists": True}},
{"description.details.Format dźwięku": {"$ne": "brak"}}
]}
}
)
#project phase, show only bucket name: gcsWav.location
# match phase, filetr documents withour gcTextReco field - voice not recognized
pipeline.append({"$match": {"$and": [
{"gcTextReco": {"$exists": False}},
{"gcsWav": {"$exists": True}},
{"description.details.Format dźwięku": {"$ne": "brak"}}
]}
}
)
# project phase, show only bucket name: gcsWav.location
pipeline.append({"$project": {
"gcsWawLocation": { "$concat": [ "gs://archspeechreco/","$gcsWav.location" ] }
}
})
#fetch only N documents
pipeline.append({"$limit":limit})
"gcsWawLocation": {"$concat": ["gs://archspeechreco/", "$gcsWav.location"]}
}
})
# fetch only N documents
pipeline.append({"$limit": limit})
return col.aggregate(pipeline)

View File

@ -1,12 +1,14 @@
from google.cloud import storage
import sys
import urllib
from pymongo import MongoClient
from src.mongo.helpers import get_mongo_collection
from bson.objectid import ObjectId
import os
import datetime
from subprocess import run,DEVNULL
from subprocess import run, DEVNULL, CalledProcessError
import argparse
from urllib.error import URLError, HTTPError, ContentTooShortError
import logging
def main(args):
uri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
@ -14,46 +16,47 @@ def main(args):
colName = "moviesMeta"
bucket = 'archspeechreco'
col = getMongoCollection(colName,dbName,uri)
col = get_mongo_collection(colName, dbName, uri)
fileFormat = args.format
if (fileFormat == 'mp4'):
uploadMp4(col,bucket)
upload_mp4(col, bucket)
elif (fileFormat == 'wav'):
uploadWave(col,bucket)
upload_wave(col, bucket)
def uploadMp4(col,bucket):
toUpload = getUploadList(col)
for i in toUpload:
fileName = ObjectId(i['_id'])
getVid( i['url'], ObjectId( i['_id'] ) )
upload_blob(bucket, "{}.mp4".format(fileName), "mp4/{}.mp4".format(fileName),col,"Mp4")
def upload_mp4(col, bucket):
to_upload = get_upload_list(col)
for i in to_upload:
file_name = ObjectId(i['_id'])
get_vid(i['url'], ObjectId(i['_id']))
upload_blob(bucket, "{}.mp4".format(file_name), "mp4/{}.mp4".format(file_name), col, "Mp4")
try:
os.remove("{}.mp4".format(fileName))
os.remove("{}.mp4".format(file_name))
except:
print("{}.mp4 has NOT been removed".format(fileName))
print("{}.mp4 has NOT been removed".format(file_name))
else:
print("{}.mp4 has been removed".format(fileName))
print("{}.mp4 has been removed".format(file_name))
def uploadWave(col,bucket):
toUpload = getWavUploadList(col)
def upload_wave(col, bucket):
to_upload = get_wav_upload_list(col)
for i in toUpload:
fileName = ObjectId(i['_id'])
getVid( i['url'], ObjectId( i['_id'] ) )
getWave("{}.mp4".format(fileName))
upload_blob(bucket, "{}.wav".format(fileName), "wave/{}.wav".format(fileName),col,"Wav")
for i in to_upload:
file_name = ObjectId(i['_id'])
get_vid(i['url'], ObjectId(i['_id']))
get_wave("{}.mp4".format(file_name))
upload_blob(bucket, "{}.wav".format(file_name), "wave/{}.wav".format(file_name), col, "Wav")
try:
os.remove("{}.wav".format(fileName))
os.remove("{}.wav".format(file_name))
except:
print("{}.wav has NOT been removed".format(fileName))
print("{}.wav has NOT been removed".format(file_name))
else:
print("{}.wav has been removed".format(fileName))
print("{}.wav has been removed".format(file_name))
def upload_blob(bucket_name, source_file_name, destination_blob_name,col,fileFormat):
def upload_blob(bucket_name, source_file_name, destination_blob_name, col, file_format):
"""Uploads a file to the bucket."""
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)
@ -66,90 +69,87 @@ def upload_blob(bucket_name, source_file_name, destination_blob_name,col,fileFor
else:
print('File {}.{} uploaded to {}.'.format(
source_file_name,
fileFormat,
file_format,
destination_blob_name))
now = datetime.datetime.now()
try:
col.update_one(
{"_id": ObjectId(source_file_name.split('.')[0])},
{"$set":{
"gcs{}".format(fileFormat):{
"location":destination_blob_name,
"uploadDate":now.strftime("%Y-%m-%d %H:%M:%S")
}
}
}
)
{"_id": ObjectId(source_file_name.split('.')[0])},
{"$set": {
"gcs{}".format(file_format): {
"location": destination_blob_name,
"uploadDate": now.strftime("%Y-%m-%d %H:%M:%S")
}
}
}
)
except:
print("mongo update failed")
else:
print("mongo update OK")
def getMongoCollection(colName,dbName,uri):
client = MongoClient(uri)
db = client[dbName]
col = db[colName]
return col
def getUploadList(col):
pipeline = []
#$match phase, filetr documents withour gcs field - movies not uploaded to gcs
pipeline.append({"$match": {
"gcsMp4": {"$exists": False}
}
})
#project phase, show only url and _id keys
pipeline.append({"$project": {
"url": { "$concat": [ "http://repozytorium.fn.org.pl/",{"$arrayElemAt": [ "$mp4",0 ]}] }
}
})
#skip first N documents
#pipeline.append({"$skip":362})
#fetch only N documents
#pipeline.append({"$limit":20})
def get_upload_list(col):
pipeline = [{"$match": {
"gcsMp4": {"$exists": False}
}
}, {"$project": {
"url": {"$concat": ["http://repozytorium.fn.org.pl/", {"$arrayElemAt": ["$mp4", 0]}]}
}
}]
# $match phase, filetr documents withour gcs field - movies not uploaded to gcs
# project phase, show only url and _id keys
# skip first N documents
# pipeline.append({"$skip":362})
# fetch only N documents
# pipeline.append({"$limit":20})
return col.aggregate(pipeline)
def getWavUploadList(col):
pipeline = []
#$match phase, filetr documents withour gcs field - movies not uploaded to gcs
pipeline.append({"$match": {
"gcsWav": {"$exists": False}
}
})
#project phase, show only url and _id keys
pipeline.append({"$project": {
"url": { "$concat": [ "http://repozytorium.fn.org.pl/",{"$arrayElemAt": [ "$mp4",0 ]}] }
}
})
#skip first N documents
#pipeline.append({"$skip":362})
#fetch only N documents
#pipeline.append({"$limit":500})
return col.aggregate(pipeline)
def get_wav_upload_list(col):
pipeline = [{"$match": {
"gcsWav": {"$exists": False}
}
}, {"$project": {
"url": {"$concat": ["http://repozytorium.fn.org.pl/", {"$arrayElemAt": ["$mp4", 0]}]}
}
}]
# $match phase, filter documents without gcs field - movies not uploaded to gcs
# project phase, show only url and _id keys
# skip first N documents
# pipeline.append({"$skip":362})
# fetch only N documents
# pipeline.append({"$limit":500})
return col.aggregate(pipeline)
def getVid(url,out):
def get_vid(url, out):
try:
urllib.request.urlretrieve(url, "{}.mp4".format(out))
except:
print("wrong URL, can't download")
def getWave(filename):
try:
run(['ffmpeg','-i', filename, '-vn', '-acodec', 'pcm_s16le', '-ar', '44100', '-ac', '1', filename.replace("mp4","wav")],stdout=DEVNULL)
except:
print("problem with ffmpeg")
except URLError as e:
print("can't download, {}".format(e.reason))
except HTTPError as e:
print("reason:{}, Http code: {}", format(e.reason, e.code))
except ContentTooShortError:
print("content too short error")
else:
print("file {}.mp4 has been downloaded from {}".format(out, url))
def get_wave(filename):
try:
run(['ffmpeg', '-i', filename, '-vn', '-acodec', 'pcm_s16le', '-ar', '44100', '-ac', '1',
filename.replace("mp4", "wav")], stdout=DEVNULL, check=True)
except CalledProcessError as e:
print("problem with ffmpeg, {} exited with {} code".format(e.cmd, e.returncode))
else:
print("file {} has been decoded to waw format".format(filename))
try:
os.remove(filename)
except:
print("{} has NOT been removed".format(filename))
except OSError as e:
print("{} has NOT been removed, {}".format(filename, e.strerror))
else:
print("{} has been removed".format(filename))
@ -159,4 +159,3 @@ if __name__ == '__main__':
parser.add_argument("--format", default='mp4', help="format to fetch and upload, [mp4, wav]")
args = parser.parse_args()
main(args)

View File

@ -2,14 +2,14 @@ mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
dbName = "archSpeechReco"
colName = "moviesMeta"
def getMongoCollection(colName,dbName,uri):
def get_mongo_collection(colName,dbName,uri):
client = MongoClient(uri,maxPoolSize=512)
db = client[dbName]
col = db[colName]
return col
col = getMongoCollection(colName,dbName,mongoUri)
col = get_mongo_collection(colName,dbName,mongoUri)
col.aggregate(pipeline)
@ -42,3 +42,11 @@ var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"
var pipeline = [{"$match":{"gcTextReco": {"$exists": true}}}, {"$project": {"_id":0, "durationStart": { "$concat": ["$description.date", "T00:00:00Z"] }, "wid":{ "$concat": ["Filmoteka_", {"$toString": "$_id"} ] }, "creator": { $ifNull: [ "$description.details.Produkcja", "null" ]}, "originalDate": "$description.date", "contents": ["$gcTextReco.transcript_fix"], "url":"$url", "title":"$title", "durationEnd": { "$concat": ["$description.date", "T23:59:59Z"] }}}, {$out: "export3"} ]
db.moviesMeta.aggregate(pipeline)
var pipeline = [ {$match: {$and: [ {"hash": /[abcd]0$/}, {"gcsWav.location": {"$exists": 1}}, {"gcTextReco.transcript_fix": {"$not": /^$/}} ] }},
{$project: {"_id":0, "hash":1, "plik":{ "$substr": ["$gcsWav.location", 5, -1]}, "opis": "$description.desc", "transkrypcja": "$gcTextReco.transcript_fix"}},
{$out: "sample100"}
]
5df3e63d4c0402698d7844e3