# from google.cloud import speech_v1
from google.cloud import speech_v1p1beta1
from google.cloud.speech_v1p1beta1 import enums
from google.cloud.speech_v1p1beta1 import types
from pymongo import MongoClient
import json
import argparse
from google.protobuf.json_format import MessageToJson, MessageToDict
from storageUpload import getMongoCollection
from bson.objectid import ObjectId
import datetime


def _first_alternative(result):
    """Return the first alternative of one result dict, or {} if it has none."""
    alternatives = result.get("alternatives") or [{}]
    return alternatives[0]


def _extract_transcript_and_words(results):
    """Split a non-empty list of result dicts into (transcript, words).

    The transcript is the concatenation of the first alternative of every
    result except the last one; the word list is taken from the first
    alternative of the last result.
    NOTE(review): presumably this relies on the diarization-enabled response
    putting the complete word list in the final result -- confirm against the
    Speech-to-Text documentation.
    """
    transcript = "".join(
        _first_alternative(result).get("transcript", "")
        for result in results[:-1]
    )
    words = _first_alternative(results[-1]).get("words", [])
    return transcript, words


def main(args):
    """Transcribe one hard-coded recording and store the result in MongoDB.

    Args:
      args parsed command-line arguments; currently not used by this function.
    """
    # The same id names both the audio object in the bucket and the Mongo
    # document, so it is defined once and reused.
    movie_id = "5df3e63d4c0402698d7837f3"
    uri = "gs://archspeechreco/wave/{}.wav".format(movie_id)

    reco = recognize(uri)
    recoDict = MessageToDict(reco)

    # MessageToDict leaves out empty fields, so a response without any
    # recognised speech has no "results" key at all.
    results = recoDict.get("results", [])
    if not results:
        print("no recognition results for {}, mongo not updated".format(uri))
        return

    transcript, words = _extract_transcript_and_words(results)

    # NOTE(review): credentials are hard-coded in the connection URI; consider
    # moving them to configuration or the environment.
    mongoUri = "mongodb://speechRecoUser:speech!reco@localhost/archSpeechReco"
    dbName = "archSpeechReco"
    colName = "moviesMeta"
    col = getMongoCollection(colName, dbName, mongoUri)

    now = datetime.datetime.now()
    try:
        col.update_one(
            {"_id": ObjectId(movie_id)},
            {"$set": {
                "gcTextReco.transcript": transcript,
                "gcTextReco.words": words,
                "gcTextReco.transcripted": now.strftime("%Y-%m-%d %H:%M:%S"),
            }},
        )
    except Exception as e:
        # Script-level boundary: report the failure instead of a traceback.
        print(e)
    else:
        print("mongo update OK")


def recognize(storage_uri):
    """
    Transcribe long audio file from Cloud Storage using asynchronous speech
    recognition

    Args:
      storage_uri URI for audio file in Cloud Storage, e.g. gs://[BUCKET]/[FILE]

    Returns:
      The result of the finished long-running recognize operation.
    """
    # client = speech_v1.SpeechClient()
    client = speech_v1p1beta1.SpeechClient()

    d_config = types.SpeakerDiarizationConfig(
        enable_speaker_diarization=True
    )

    config = types.RecognitionConfig(
        # Encoding of audio data sent. This sample sets this explicitly.
        # This field is optional for FLAC and WAV audio formats.
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        # Sample rate in Hertz of the audio data sent
        sample_rate_hertz=44100,
        # The language of the supplied audio
        language_code="pl-PL",
        diarization_config=d_config,
    )
    audio = {"uri": storage_uri}

    operation = client.long_running_recognize(config, audio)

    print(u"Waiting for operation to complete...")
    # Blocks until the long-running operation has finished.
    response = operation.result()

    return response


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Google Cloud speech2text API client')
    parser.add_argument("--format", default='mp4', help="format to fetch and upload, [mp4, wav]")
    args = parser.parse_args()
    main(args)