"""Transcribe an audio file with Google Cloud Speech-to-Text and store the
result in MongoDB.

The script sends a Cloud Storage audio file to the long-running recognition
API (v1p1beta1 client, speaker diarization enabled), converts the response to
a plain dict and writes the transcript and the word list into the matching
MongoDB document.
"""
import argparse
import datetime
import json

from bson.objectid import ObjectId
from google.cloud import speech_v1p1beta1
from google.cloud.speech_v1p1beta1 import enums
from google.cloud.speech_v1p1beta1 import types
from google.protobuf.json_format import MessageToJson, MessageToDict
from pymongo import MongoClient

from storageUpload import getMongoCollection

# Audio file to transcribe and the MongoDB document that receives the result.
_AUDIO_URI = "gs://archspeechreco/wave/5df3e63d4c0402698d7837f3.wav"
_DOCUMENT_ID = "5df3e63d4c0402698d7837f3"

# NOTE(review): the connection string embeds credentials in source control;
# consider loading it from the environment or a secrets store instead.
_MONGO_URI = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
_DB_NAME = "archSpeechReco"
_COLLECTION_NAME = "moviesMeta"


def _split_recognition(reco_dict):
    """Return ``(transcript, words)`` extracted from a recognition dict.

    ``reco_dict`` is the output of ``MessageToDict`` for a recognition
    response.  The transcript is the concatenation of the first alternative
    of every result except the last one; the word list is taken from the
    first alternative of the last result.  This mirrors how the caller has
    always read the response (presumably because, with diarization enabled,
    the final result aggregates all words -- confirm against the API docs).

    ``MessageToDict`` leaves out empty fields, so every level is read
    defensively: a missing or empty piece contributes ``""`` / ``[]``
    instead of raising ``KeyError`` or ``IndexError``.
    """
    results = reco_dict.get("results") or []
    if not results:
        return "", []

    pieces = []
    for result in results[:-1]:
        alternatives = result.get("alternatives") or []
        if alternatives:
            pieces.append(alternatives[0].get("transcript", ""))

    last_alternatives = results[-1].get("alternatives") or []
    words = last_alternatives[0].get("words", []) if last_alternatives else []
    return "".join(pieces), words


def main(args):
    """Recognize the configured audio file and save the result to MongoDB.

    Args:
        args: parsed command-line arguments.  They are accepted for interface
            compatibility; the function does not currently read them.

    The update sets ``gcTextReco.transcript``, ``gcTextReco.words`` and
    ``gcTextReco.transcripted`` (local time, ``%Y-%m-%d %H:%M:%S``) on the
    target document.  Errors raised by the update are printed, not re-raised.
    """
    response = recognize(_AUDIO_URI)
    transcript, words = _split_recognition(MessageToDict(response))

    if not transcript and not words:
        # Nothing was recognised; do not overwrite stored data with blanks.
        print("no recognition results for " + _AUDIO_URI)
        return

    col = getMongoCollection(_COLLECTION_NAME, _DB_NAME, _MONGO_URI)
    now = datetime.datetime.now()
    try:
        col.update_one(
            {"_id": ObjectId(_DOCUMENT_ID)},
            {"$set": {"gcTextReco.transcript": transcript,
                      "gcTextReco.words": words,
                      "gcTextReco.transcripted": now.strftime("%Y-%m-%d %H:%M:%S")}}
        )
    except Exception as e:
        # Best-effort write at the script's top level: report and carry on.
        print(e)
    else:
        print("mongo update OK")


def recognize(storage_uri, sample_rate_hertz=44100, language_code="pl-PL"):
    """
    Transcribe long audio file from Cloud Storage using asynchronous speech
    recognition

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]
      sample_rate_hertz Sample rate in Hertz of the audio data sent
      language_code The language of the supplied audio

    Returns:
      The response object produced by the long-running recognize operation.
    """
    client = speech_v1p1beta1.SpeechClient()

    d_config = types.SpeakerDiarizationConfig(
        enable_speaker_diarization=True
    )
    config = types.RecognitionConfig(
        # Encoding of audio data sent, set explicitly here.
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate_hertz,
        language_code=language_code,
        diarization_config=d_config
    )
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config=config, audio=audio)

    print(u"Waiting for operation to complete...")
    # Blocks until the asynchronous operation has finished.
    return operation.result()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Google Cloud speech2text API client')
    parser.add_argument("--format", default='mp4', help="format to fetch and upload, [mp4, wav]")
    args = parser.parse_args()
    main(args)