implementing live voice commands?

daltonb

Wow good stuff, thanks @JonB and @cvp!! I will keep an eye on this thread and do some of my own tinkering.

cvp

2nd part and really enough for today

AVAudioEngine = ObjCClass('AVAudioEngine').alloc().init()
	AVAudioSession = ObjCClass('AVAudioSession')
	AVAudioRecorder = ObjCClass('AVAudioRecorder')
	
	shared_session = AVAudioSession.sharedInstance()
	category_set = shared_session.setCategory_mode_options_error_(ns('AVAudioSessionCategoryRecord'), ns('AVAudioSessionModeMeasurement'),ns('AVAudioSession.CategoryOptionsDuckOthers'),None)

	setActiveOptions = 0	# notifyOthersOnDeactivation
	shared_session.setActive_withOptions_error_(True,setActiveOptions,None)

	inputNode = AVAudioEngine.inputNode()
	
	# Configure the microphone input.
	recordingFormat = inputNode.outputFormatForBus_(0)
	
	def handler(_cmd,obj1_ptr,obj2_ptr):
		# param1 = AVAudioPCMBuffer
		# 				 	The buffer parameter is a buffer of audio captured 
		#						from the output of an AVAudioNode.
		# param2 = AVAudioTime
		#						The when parameter is the time the buffer was captured					
		if obj1_ptr:
			obj1 = ObjCInstance(obj1_ptr)
			#self.recognitionRequest?.append(buffer)
	
	handler_block = ObjCBlock(handler, restype=None, argtypes=[c_void_p, c_void_p, c_void_p])
		
	inputNode.installTapOnBus_bufferSize_format_block_(0,1024,recordingFormat, handler_block)

	AVAudioEngine.prepare()
	err_ptr = c_void_p()
	AVAudioEngine.startAndReturnError_(byref(err_ptr))
	if err_ptr:
		err = ObjCInstance(err)
		print(err)
		
	# Create and configure the speech recognition request.
	recognitionRequest = ObjCClass('SFSpeechAudioBufferRecognitionRequest').alloc()
	print(dir(recognitionRequest))
	recognitionRequest.setShouldReportPartialResults_(True)

And

Fatal Python error: Bus error

Thread 0x000000016fb67000 (most recent call first):

No error if I comment the line

AVAudioEngine.startAndReturnError_(byref(err_ptr))

JonB

This post is deleted!

JonB

you had some errors on one of your constants (the audiosession options should have been 0x2 for the duckothers option -- this is a mask, not a string)

here is a minor mod -- i verified the handler gets called, but i dont have speech recogognize to test against
https://gist.github.com/ad17f52c8944993092f537d963ce1963

cvp

@JonB Thanks, I'll try to continue today...

cvp

@JonB Really need help now:

segmentation fault if no underscore before appendAudioPCMBuffer_(obj1)
segmentation fault in last line not commented

from objc_util import *

AVAudioEngine = ObjCClass('AVAudioEngine').alloc().init()
AVAudioSession = ObjCClass('AVAudioSession')
AVAudioRecorder = ObjCClass('AVAudioRecorder')

shared_session = AVAudioSession.sharedInstance()
category_set= shared_session.setCategory_withOptions_error_(
	ns('AVAudioSessionCategoryRecord'), 
	0x2, #duckothers
	None)
shared_session.setMode_error_(ns('AVAudioSessionModeMeasurement'),None)

setActiveOptions = 0# notifyOthersOnDeactivation
shared_session.setActive_withOptions_error_(True,setActiveOptions,None)

inputNode = AVAudioEngine.inputNode()

# Configure the microphone input.
recordingFormat = inputNode.outputFormatForBus_(0)

# Create and configure the speech recognition request.
recognitionRequest = ObjCClass('SFSpeechAudioBufferRecognitionRequest').alloc()
print(dir(recognitionRequest))
recognitionRequest.setShouldReportPartialResults_(True)
retain_global(recognitionRequest)

@on_main_thread
def handler_buffer(_cmd,obj1_ptr,obj2_ptr):
	print('handler_buffer')
	# param1 = AVAudioPCMBuffer
	#   The buffer parameter is a buffer of audio captured 
	#   from the output of an AVAudioNode.
	# param2 = AVAudioTime
	#   The when parameter is the time the buffer was captured  
	if obj1_ptr:
		obj1 = ObjCInstance(obj1_ptr)
		#print(str(obj1._get_objc_classname()))	# AVAudioPCMBuffer
		#print(str(obj1.frameLength()))					# 4410
		# segmentation in next line if no "_" before appendAudioPCMBuffer
		recognitionRequest._appendAudioPCMBuffer_(obj1)

handler_block_buffer = ObjCBlock(handler_buffer, restype=None, argtypes=[c_void_p, c_void_p, c_void_p])

inputNode.installTapOnBus_bufferSize_format_block_(0,1024,recordingFormat, handler_block_buffer)

AVAudioEngine.prepare()
err_ptr = c_void_p()
AVAudioEngine.startAndReturnError_(byref(err_ptr))
if err_ptr:
	err = ObjCInstance(err)
	print(err)

@on_main_thread
def handler_recognize(_cmd,obj1_ptr,obj2_ptr):
	print('handler_recognize')
	# param1 = result
	# 					The object containing the partial or final transcriptions
	#						of the audio content.		 	
	# param2 = error
	#						An error object if a problem occurred. 
	#						This parameter is nil if speech recognition was successful.
	if obj1_ptr:
		obj1 = ObjCInstance(obj1_ptr)
		#print(str(obj1))
		
handler_block_recognize = ObjCBlock(handler_recognize, restype=None, argtypes=[c_void_p, c_void_p, c_void_p])
				
SFSpeechRecognizer = ObjCClass('SFSpeechRecognizer').alloc().init()
recognitionTask = SFSpeechRecognizer.recognitionTaskWithRequest_resultHandler_(recognitionRequest, handler_block_recognize)

JonB

recognitionRequest = ObjCClass('SFSpeechAudioBufferRecognitionRequest').alloc()

Missing .init()?

By the way, you will want AVAudioEngine.stop() handy.
For instance you might want to create a ui.View with a will_close, so that when you are experimenting, you can just close the view to kill the engine. Anyway you will eventually need to show the recognized words.

cvp

@JonB I know for the stop, ui.view, print recognized etc... but I go forward step by step...
I'll try the .init().
Thanks to follow this project, hoping I don't annoy you too much with my problems...
Perhaps, it could be better that I don't (try to) help other people with topics where I'm not a big specialist 😢

cvp

@JonB ok with the init(),Thanks(almost for @daltonb 😔)

And 🍾 with this imperfect script but a good start for a future app...
See all attributes of SFSpeechRecognitionResult.

from objc_util import *
import ui
import datetime

AVAudioEngine = ObjCClass('AVAudioEngine').alloc().init()
AVAudioSession = ObjCClass('AVAudioSession')
AVAudioRecorder = ObjCClass('AVAudioRecorder')

shared_session = AVAudioSession.sharedInstance()
category_set= shared_session.setCategory_withOptions_error_(
    ns('AVAudioSessionCategoryRecord'), 
    0x2, #duckothers
    None)
shared_session.setMode_error_(ns('AVAudioSessionModeMeasurement'),None)

setActiveOptions = 0# notifyOthersOnDeactivation
shared_session.setActive_withOptions_error_(True,setActiveOptions,None)

inputNode = AVAudioEngine.inputNode()

# Configure the microphone input.
recordingFormat = inputNode.outputFormatForBus_(0)

# Create and configure the speech recognition request.
recognitionRequest = ObjCClass('SFSpeechAudioBufferRecognitionRequest').alloc().init()
recognitionRequest.setShouldReportPartialResults_(True)
retain_global(recognitionRequest)

@on_main_thread
def handler_buffer(_cmd,obj1_ptr,obj2_ptr):
    #print('handler_buffer',datetime.datetime.now())
    # param1 = AVAudioPCMBuffer
    #   The buffer parameter is a buffer of audio captured 
    #   from the output of an AVAudioNode.
    # param2 = AVAudioTime
    #   The when parameter is the time the buffer was captured  
    if obj1_ptr:
        obj1 = ObjCInstance(obj1_ptr)
        #print(str(obj1._get_objc_classname())) # AVAudioPCMBuffer
        #print(str(obj1.frameLength()))                 # 4410
        # segmentation in next line if no "_" before appendAudioPCMBuffer
        recognitionRequest._appendAudioPCMBuffer_(obj1)

handler_block_buffer = ObjCBlock(handler_buffer, restype=None, argtypes=[c_void_p, c_void_p, c_void_p])

inputNode.installTapOnBus_bufferSize_format_block_(0,1024,recordingFormat, handler_block_buffer)

AVAudioEngine.prepare()
err_ptr = c_void_p()
AVAudioEngine.startAndReturnError_(byref(err_ptr))
if err_ptr:
    err = ObjCInstance(err)
    print(err)

#@on_main_thread
def handler_recognize(_cmd,obj1_ptr,obj2_ptr):
    #print('handler_recognize')
    # param1 = result
    #                   The object containing the partial/final transcriptions
    #                   of the audio content.           
    # param2 = error
    #                       An error object if a problem occurred. 
    #                       This parameter is nil if speech recognition was successful.
    if obj1_ptr:
        obj1 = ObjCInstance(obj1_ptr)
        # obj1 is a SFSpeechRecognitionResult
        print(obj1.bestTranscription().formattedString())
        
handler_block_recognize = ObjCBlock(handler_recognize, restype=None, argtypes=[c_void_p, c_void_p, c_void_p])
                
SFSpeechRecognizer = ObjCClass('SFSpeechRecognizer').alloc().init()
recognitionTask = SFSpeechRecognizer.recognitionTaskWithRequest_resultHandler_(recognitionRequest, handler_block_recognize)

mv = ui.View()
b = ui.ButtonItem()
b.title = 'stop'
def b_stop(sender):
	AVAudioEngine.stop()	
        recognitionRequest.endAudio()
b.action = b_stop
mv.right_button_items = (b,)
mv.present('sheet')

cvp

<SFTranscription: 0x281a70960>, formattedString=Okay, segments=(
    "<SFTranscriptionSegment: 0x283fb3900>, substringRange={0, 4}, timestamp=0, duration=2.94, confidence=0, substring=Okay, alternativeSubstrings=(\n), phoneSequence=, ipaPhoneSequence="
)

JonB

Cool!

So, does it give you a transcript object for each word? Or one for a whole phrase, etc?

cvp

@JonB Output when I say, with my French accent, "hello Jon B how are you"

Hello
Hello John
Hello John B
Hello John B
Hello John be
Hello John be Hall
Hello John B Hall
Hello John B Hall how
Hello John B Hall how are you
Hello John big hole how are you
Hello John big hole how are you

Edit: Oups, I just read this post, I swear I said "hello Jon b how are you" but I agree that my accent has to be very frenchy, sorry

cvp

Sure that I have to stop other objects than Engine because if I rerun, The script crashes with a bus error.

Added at stop button action

recognitionRequest.endAudio()

mikael

@cvp, muhaha. You and your cheeky French accent! Now I have to check what happens with a Finnish accent. :-)

daltonb

Awesome! @cvp your code-in-progress was really helpful in getting my pattern recognition engine going.. starting to get the hang of this objc_util thing. Looks like we finished around the same time :) Here's my version which is a pretty item-for-item transcription of https://developer.apple.com/documentation/speech/recognizing_speech_in_live_audio?language=objc. Instantiate Recognizer() and then you can call start() and stop() for live updates. It's only safe to instantiate once currently. The line print(bestTranscription.formattedString()) is where event handling of the recognition updates should occur.

from objc_util import *

NSLocale = ObjCClass('NSLocale')
SFSpeechRecognizer = ObjCClass('SFSpeechRecognizer')
AVAudioEngine = ObjCClass('AVAudioEngine')
AVAudioSession = ObjCClass('AVAudioSession')
SFSpeechAudioBufferRecognitionRequest = ObjCClass('SFSpeechAudioBufferRecognitionRequest')

class Recognizer:
    def __init__(self):
        locale = NSLocale.alloc().initWithLocaleIdentifier(ns("en-US"))
        self.speech_recognizer = SFSpeechRecognizer.alloc().initWithLocale(locale)

        self.audio_engine = AVAudioEngine.new()
        self.input_node = self.audio_engine.inputNode()

        self.recognition_request = None
        self.recognition_task = None

        def recognitionTaskWithRequest_resultHandler(block_ptr, result_ptr, error_ptr):
            # TODO: investigate https://forum.omz-software.com/topic/5232/traceback-using-gestures-module/21
            is_final = False
            if not result_ptr is None:
                result = ObjCInstance(result_ptr)
                bestTranscription = result.bestTranscription()
                print(bestTranscription.formattedString())
                is_final = result.isFinal()
            if not error_ptr is None or is_final:
                if is_final:
                    print("Speech recognition complete.")
                if error_ptr is not None:
                    error = ObjCInstance(error_ptr)
                    print("Error in recognition task:", error)
                self.audio_engine.stop()
                self.input_node.removeTapOnBus_(0)
                self.recognition_request = None
                self.recognition_task = None
        
        self.recognitionTaskWithRequest_resultHandler = recognitionTaskWithRequest_resultHandler

        def installTapOnBus_tapBlock(block_ptr, buffer_ptr, when_ptr):
            buffer = ObjCInstance(buffer_ptr)
            when = ObjCInstance(when_ptr)
            if not self.recognition_request is None:
                self.recognition_request.appendAudioPCMBuffer_(buffer)
        
        self.installTapOnBus_tapBlock = installTapOnBus_tapBlock

        # https://forum.omz-software.com/topic/5380/initialize-search-field-of-a-uidocumentpickerviewcontroller/11
        # must stay in scope when in use
        self.result_handler = ObjCBlock(self.recognitionTaskWithRequest_resultHandler, restype=None, argtypes=[c_void_p, c_void_p, c_void_p])
        self.tap_block = ObjCBlock(self.installTapOnBus_tapBlock, restype=None, argtypes=[c_void_p, c_void_p, c_void_p])


    def start(self):
        if self.recognition_task is not None:
            print("Speech recognition already active.")
            return
        
        print("Starting speech recognition.")

        audio_session = AVAudioSession.sharedInstance()
        audio_session.setCategory_mode_options_error_(ns('AVAudioSessionCategoryRecord'), ns('AVAudioSessionModeMeasurement'), ns('AVAudioSessionCategoryOptionDuckOthers'), None)
        audio_session.setActive_withOptions_error_(True, ns('AVAudioSessionSetActiveOptionNotifyOthersOnDeactivation'), None)

        self.recognition_request = SFSpeechAudioBufferRecognitionRequest.new()
        if self.recognition_request is None:
            print("Error: could not create recognition request!")
            return
        self.recognition_request.shouldReportPartialResults = True
        
        self.recognition_task = self.speech_recognizer.recognitionTaskWithRequest_resultHandler_(self.recognition_request, self.result_handler)

        recording_format = self.input_node.outputFormatForBus_(0)
        self.input_node.installTapOnBus_bufferSize_format_block_(0, 1024, recording_format, self.tap_block)
        
        self.audio_engine.prepare()
        err_ptr = c_void_p() # https://forum.omz-software.com/topic/3618/querying-returned-nserror/2
        self.audio_engine.startAndReturnError_(byref(err_ptr))
        if err_ptr:
            err = ObjCInstance(err_ptr)
            print("Error in audio engine:", err)
    
    def stop(self):
        if self.audio_engine.isRunning():
            self.audio_engine.stop()
            if self.recognition_request is not None:
                self.recognition_request.endAudio()

daltonb

Many thanks @cvp and @JonB, this has been super helpful!

mikael

Boring!

Hello
Hello
Hello
Hello John B. Nielsen
Hello John B. Nielsen
Hello John B. Nielsen
Hello John B. Nielsen