omz:forum


    Welcome!

    This is the community forum for my apps Pythonista and Editorial.

    For individual support questions, you can also send an email. If you have a very short question or just want to say hello, I'm @olemoritz on Twitter.


    implementing live voice commands?

    Pythonista
    speech sound voice objcutil live
    • daltonb
      daltonb last edited by

      @cvp that would be awesome... even if it doesn't work out, I'd love to see a partial result

      • JonB
        JonB last edited by

        https://github.com/yao23/iOS_Playground/blob/master/SpeechRecognitionPractice/SpeechRecognitionPractice/ViewController.m
        is an objc implementation.

        The tricky bit, obviously, is getting those blocks implemented in objc_util.
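For anyone following along: objc_util's `ObjCBlock` wraps a plain Python callable so that Objective-C code can invoke it, given a return type and argument types. The bridging mechanics can be sketched in portable ctypes (runnable off-device; `ObjCBlock`, `restype` and `argtypes` are real objc_util names, everything else here is illustrative):

```python
import ctypes

# An Objective-C block handler like the AVAudioEngine tap callback has the
# C signature void(void *block, void *buffer, void *when). objc_util's
# ObjCBlock(handler, restype=None, argtypes=[...]) wraps a Python function
# in exactly such a callable. The same shape in portable ctypes:
HANDLER = ctypes.CFUNCTYPE(None, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p)

calls = []

def handler(_cmd, buf_ptr, when_ptr):
    # On iOS, buf_ptr would point to an AVAudioPCMBuffer and when_ptr to
    # an AVAudioTime; here we only record that the callback fired.
    calls.append((buf_ptr, when_ptr))

c_handler = HANDLER(handler)
c_handler(None, 1, 2)  # simulate one invocation from native code
print(calls)  # [(1, 2)]
```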

        • cvp
          cvp @daltonb last edited by

          @daltonb I'm really sorry, but I think it would be too complex for me; even @JonB says it is difficult.

          • mikael
            mikael @daltonb last edited by

            @daltonb, I am tempted to give it a try, but not this week.

            • cvp
              cvp @mikael last edited by cvp

              @mikael and @daltonb First part, see here

              • mikael
                mikael @cvp last edited by

                @cvp, the man is fast! :-D

                • cvp
                  cvp @mikael last edited by

                  @mikael Heh, that's not my code 😂, I just found it there; I'm just starting to try to modify it...

                  • cvp
                    cvp last edited by

                    First part (enough for today)

                        from objc_util import *

                        AVAudioEngine = ObjCClass('AVAudioEngine').alloc().init()
                        AVAudioSession = ObjCClass('AVAudioSession')
                        AVAudioRecorder = ObjCClass('AVAudioRecorder')

                        shared_session = AVAudioSession.sharedInstance()
                        category_set = shared_session.setCategory_mode_options_error_(
                            ns('AVAudioSessionCategoryRecord'),
                            ns('AVAudioSessionModeMeasurement'),
                            ns('AVAudioSession.CategoryOptionsDuckOthers'),
                            None)

                        setActiveOptions = 0  # notifyOthersOnDeactivation
                        shared_session.setActive_withOptions_error_(True, setActiveOptions, None)

                        inputNode = AVAudioEngine.inputNode()
                    • daltonb
                      daltonb last edited by

                      Wow good stuff, thanks @JonB and @cvp!! I will keep an eye on this thread and do some of my own tinkering.

                      • cvp
                        cvp last edited by cvp

                        2nd part and really enough for today

                            from objc_util import *

                            AVAudioEngine = ObjCClass('AVAudioEngine').alloc().init()
                            AVAudioSession = ObjCClass('AVAudioSession')
                            AVAudioRecorder = ObjCClass('AVAudioRecorder')

                            shared_session = AVAudioSession.sharedInstance()
                            category_set = shared_session.setCategory_mode_options_error_(
                                ns('AVAudioSessionCategoryRecord'),
                                ns('AVAudioSessionModeMeasurement'),
                                ns('AVAudioSession.CategoryOptionsDuckOthers'),
                                None)

                            setActiveOptions = 0  # notifyOthersOnDeactivation
                            shared_session.setActive_withOptions_error_(True, setActiveOptions, None)

                            inputNode = AVAudioEngine.inputNode()

                            # Configure the microphone input.
                            recordingFormat = inputNode.outputFormatForBus_(0)

                            def handler(_cmd, obj1_ptr, obj2_ptr):
                                # param1 = AVAudioPCMBuffer: a buffer of audio captured
                                #   from the output of an AVAudioNode.
                                # param2 = AVAudioTime: the time the buffer was captured.
                                if obj1_ptr:
                                    obj1 = ObjCInstance(obj1_ptr)
                                    #self.recognitionRequest?.append(buffer)

                            handler_block = ObjCBlock(handler, restype=None, argtypes=[c_void_p, c_void_p, c_void_p])

                            inputNode.installTapOnBus_bufferSize_format_block_(0, 1024, recordingFormat, handler_block)

                            AVAudioEngine.prepare()
                            err_ptr = c_void_p()
                            AVAudioEngine.startAndReturnError_(byref(err_ptr))
                            if err_ptr:
                                err = ObjCInstance(err_ptr)
                                print(err)

                            # Create and configure the speech recognition request.
                            recognitionRequest = ObjCClass('SFSpeechAudioBufferRecognitionRequest').alloc()
                            print(dir(recognitionRequest))
                            recognitionRequest.setShouldReportPartialResults_(True)

                        And then I get:

                        Fatal Python error: Bus error

                        Thread 0x000000016fb67000 (most recent call first):

                        No error if I comment out the line

                        AVAudioEngine.startAndReturnError_(byref(err_ptr))
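A side note on the error handling in the snippet above: `startAndReturnError:` takes an `NSError **` out-parameter. In objc_util you allocate a `c_void_p`, pass `byref(err_ptr)`, and a non-NULL `err_ptr.value` after the call means an error object was written, which you then wrap with `ObjCInstance(err_ptr)`. The mechanics, sketched in portable ctypes with a purely illustrative `fake_call` standing in for the Objective-C method:

```python
import ctypes

# Simulate an Objective-C call writing into an NSError ** out-parameter.
# fake_call is illustrative only; on iOS its role is played by e.g.
# AVAudioEngine.startAndReturnError_(byref(err_ptr)).
def fake_call(out):
    # write a (fake) error object's address through the pointer
    ctypes.cast(out, ctypes.POINTER(ctypes.c_void_p))[0] = ctypes.c_void_p(0xDEAD)

err_ptr = ctypes.c_void_p()      # starts as NULL
fake_call(ctypes.byref(err_ptr))
if err_ptr:                      # truthy once a non-NULL address was written
    print(hex(err_ptr.value))    # on iOS: wrap err_ptr with ObjCInstance here
```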
                        
                          • JonB
                            JonB last edited by

                            You had some errors in one of your constants (the AVAudioSession options should have been 0x2 for the DuckOthers option -- this is a bit mask, not a string).

                            Here is a minor mod -- I verified the handler gets called, but I don't have speech recognition to test against:
                            https://gist.github.com/ad17f52c8944993092f537d963ce1963
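For reference, `AVAudioSessionCategoryOptions` is a bit field; the sketch below lists the option bits as given in Apple's AVAudioSession headers (worth double-checking against the SDK you target):

```python
from enum import IntFlag

# AVAudioSessionCategoryOptions is a bit mask, so it must be passed as an
# integer, not wrapped in ns('...') as a string. Bit values per Apple's
# AVAudioSession.h (verify against the current SDK headers):
class CategoryOptions(IntFlag):
    MixWithOthers = 1 << 0     # 0x1
    DuckOthers = 1 << 1        # 0x2  <- the value JonB used
    AllowBluetooth = 1 << 2    # 0x4
    DefaultToSpeaker = 1 << 3  # 0x8

# e.g. shared_session.setCategory_withOptions_error_(
#          ns('AVAudioSessionCategoryRecord'), int(CategoryOptions.DuckOthers), None)
print(int(CategoryOptions.DuckOthers))  # 2
```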

                            • cvp
                              cvp @JonB last edited by

                              @JonB Thanks, I'll try to continue today...

                              • cvp
                                cvp @JonB last edited by cvp

                                @JonB Really need help now:

                                • segmentation fault if there is no underscore before appendAudioPCMBuffer_(obj1)
                                • segmentation fault on the last line when it is not commented out
                                from objc_util import *
                                
                                AVAudioEngine = ObjCClass('AVAudioEngine').alloc().init()
                                AVAudioSession = ObjCClass('AVAudioSession')
                                AVAudioRecorder = ObjCClass('AVAudioRecorder')
                                
                                shared_session = AVAudioSession.sharedInstance()
                                category_set = shared_session.setCategory_withOptions_error_(
                                	ns('AVAudioSessionCategoryRecord'), 
                                	0x2, #duckothers
                                	None)
                                shared_session.setMode_error_(ns('AVAudioSessionModeMeasurement'),None)
                                
                                setActiveOptions = 0  # notifyOthersOnDeactivation
                                shared_session.setActive_withOptions_error_(True,setActiveOptions,None)
                                
                                inputNode = AVAudioEngine.inputNode()
                                
                                # Configure the microphone input.
                                recordingFormat = inputNode.outputFormatForBus_(0)
                                
                                # Create and configure the speech recognition request.
                                recognitionRequest = ObjCClass('SFSpeechAudioBufferRecognitionRequest').alloc()
                                print(dir(recognitionRequest))
                                recognitionRequest.setShouldReportPartialResults_(True)
                                retain_global(recognitionRequest)
                                
                                @on_main_thread
                                def handler_buffer(_cmd,obj1_ptr,obj2_ptr):
                                	print('handler_buffer')
                                	# param1 = AVAudioPCMBuffer
                                	#   The buffer parameter is a buffer of audio captured 
                                	#   from the output of an AVAudioNode.
                                	# param2 = AVAudioTime
                                	#   The when parameter is the time the buffer was captured  
                                	if obj1_ptr:
                                		obj1 = ObjCInstance(obj1_ptr)
                                		#print(str(obj1._get_objc_classname()))	# AVAudioPCMBuffer
                                		#print(str(obj1.frameLength()))					# 4410
                                		# segmentation in next line if no "_" before appendAudioPCMBuffer
                                		recognitionRequest._appendAudioPCMBuffer_(obj1)
                                
                                handler_block_buffer = ObjCBlock(handler_buffer, restype=None, argtypes=[c_void_p, c_void_p, c_void_p])
                                
                                inputNode.installTapOnBus_bufferSize_format_block_(0,1024,recordingFormat, handler_block_buffer)
                                
                                AVAudioEngine.prepare()
                                err_ptr = c_void_p()
                                AVAudioEngine.startAndReturnError_(byref(err_ptr))
                                if err_ptr:
                                	err = ObjCInstance(err_ptr)
                                	print(err)
                                
                                @on_main_thread
                                def handler_recognize(_cmd,obj1_ptr,obj2_ptr):
                                	print('handler_recognize')
                                	# param1 = result
                                	# 					The object containing the partial or final transcriptions
                                	#						of the audio content.		 	
                                	# param2 = error
                                	#						An error object if a problem occurred. 
                                	#						This parameter is nil if speech recognition was successful.
                                	if obj1_ptr:
                                		obj1 = ObjCInstance(obj1_ptr)
                                		#print(str(obj1))
                                		
                                handler_block_recognize = ObjCBlock(handler_recognize, restype=None, argtypes=[c_void_p, c_void_p, c_void_p])
                                				
                                SFSpeechRecognizer = ObjCClass('SFSpeechRecognizer').alloc().init()
                                recognitionTask = SFSpeechRecognizer.recognitionTaskWithRequest_resultHandler_(recognitionRequest, handler_block_recognize)
                                
                                • JonB
                                  JonB last edited by

                                  recognitionRequest = ObjCClass('SFSpeechAudioBufferRecognitionRequest').alloc()

                                  Missing .init()?

                                  By the way, you will want AVAudioEngine.stop() handy.
                                  For instance you might want to create a ui.View with a will_close, so that when you are experimenting, you can just close the view to kill the engine. Anyway you will eventually need to show the recognized words.

                                  • cvp
                                    cvp @JonB last edited by

                                    @JonB I know about the stop, the ui.View, printing the recognized text, etc., but I'm going forward step by step...
                                    I'll try the .init().
                                    Thanks for following this project; I hope I don't annoy you too much with my problems...
                                    Perhaps it would be better if I didn't (try to) help other people with topics where I'm not a big specialist 😢

                                    • cvp
                                      cvp @JonB last edited by cvp

                                      @JonB OK with the init(), thanks (mostly for @daltonb 😔)

                                      And 🍾 with this imperfect script, but a good start for a future app...
                                      See all attributes of SFSpeechRecognitionResult.

                                      from objc_util import *
                                      import ui
                                      import datetime
                                      
                                      AVAudioEngine = ObjCClass('AVAudioEngine').alloc().init()
                                      AVAudioSession = ObjCClass('AVAudioSession')
                                      AVAudioRecorder = ObjCClass('AVAudioRecorder')
                                      
                                      shared_session = AVAudioSession.sharedInstance()
                                      category_set = shared_session.setCategory_withOptions_error_(
                                          ns('AVAudioSessionCategoryRecord'), 
                                          0x2, #duckothers
                                          None)
                                      shared_session.setMode_error_(ns('AVAudioSessionModeMeasurement'),None)
                                      
                                      setActiveOptions = 0  # notifyOthersOnDeactivation
                                      shared_session.setActive_withOptions_error_(True,setActiveOptions,None)
                                      
                                      inputNode = AVAudioEngine.inputNode()
                                      
                                      # Configure the microphone input.
                                      recordingFormat = inputNode.outputFormatForBus_(0)
                                      
                                      # Create and configure the speech recognition request.
                                      recognitionRequest = ObjCClass('SFSpeechAudioBufferRecognitionRequest').alloc().init()
                                      recognitionRequest.setShouldReportPartialResults_(True)
                                      retain_global(recognitionRequest)
                                      
                                      @on_main_thread
                                      def handler_buffer(_cmd,obj1_ptr,obj2_ptr):
                                          #print('handler_buffer',datetime.datetime.now())
                                          # param1 = AVAudioPCMBuffer
                                          #   The buffer parameter is a buffer of audio captured 
                                          #   from the output of an AVAudioNode.
                                          # param2 = AVAudioTime
                                          #   The when parameter is the time the buffer was captured  
                                          if obj1_ptr:
                                              obj1 = ObjCInstance(obj1_ptr)
                                              #print(str(obj1._get_objc_classname())) # AVAudioPCMBuffer
                                              #print(str(obj1.frameLength()))                 # 4410
                                              # segmentation in next line if no "_" before appendAudioPCMBuffer
                                              recognitionRequest._appendAudioPCMBuffer_(obj1)
                                      
                                      handler_block_buffer = ObjCBlock(handler_buffer, restype=None, argtypes=[c_void_p, c_void_p, c_void_p])
                                      
                                      inputNode.installTapOnBus_bufferSize_format_block_(0,1024,recordingFormat, handler_block_buffer)
                                      
                                      AVAudioEngine.prepare()
                                      err_ptr = c_void_p()
                                      AVAudioEngine.startAndReturnError_(byref(err_ptr))
                                      if err_ptr:
                                          err = ObjCInstance(err_ptr)
                                          print(err)
                                      
                                      #@on_main_thread
                                      def handler_recognize(_cmd,obj1_ptr,obj2_ptr):
                                          #print('handler_recognize')
                                          # param1 = result
                                          #                   The object containing the partial/final transcriptions
                                          #                   of the audio content.           
                                          # param2 = error
                                          #                       An error object if a problem occurred. 
                                          #                       This parameter is nil if speech recognition was successful.
                                          if obj1_ptr:
                                              obj1 = ObjCInstance(obj1_ptr)
                                              # obj1 is a SFSpeechRecognitionResult
                                              print(obj1.bestTranscription().formattedString())
                                              
                                      handler_block_recognize = ObjCBlock(handler_recognize, restype=None, argtypes=[c_void_p, c_void_p, c_void_p])
                                                      
                                      SFSpeechRecognizer = ObjCClass('SFSpeechRecognizer').alloc().init()
                                      recognitionTask = SFSpeechRecognizer.recognitionTaskWithRequest_resultHandler_(recognitionRequest, handler_block_recognize)
                                      
                                      mv = ui.View()
                                      b = ui.ButtonItem()
                                      b.title = 'stop'
                                      def b_stop(sender):
                                          AVAudioEngine.stop()
                                          recognitionRequest.endAudio()
                                      b.action = b_stop
                                      mv.right_button_items = (b,)
                                      mv.present('sheet')
                                      
                                      • cvp
                                        cvp last edited by

                                        <SFTranscription: 0x281a70960>, formattedString=Okay, segments=(
                                            "<SFTranscriptionSegment: 0x283fb3900>, substringRange={0, 4}, timestamp=0, duration=2.94, confidence=0, substring=Okay, alternativeSubstrings=(\n), phoneSequence=, ipaPhoneSequence="
                                        )
                                        
                                        • JonB
                                          JonB last edited by

                                          Cool!

                                          So, does it give you a transcript object for each word? Or one for a whole phrase, etc?

                                          • cvp
                                            cvp @JonB last edited by cvp

                                            @JonB Output when I say, with my French accent, "hello Jon B how are you"

                                            Hello
                                            Hello John
                                            Hello John B
                                            Hello John B
                                            Hello John be
                                            Hello John be Hall
                                            Hello John B Hall
                                            Hello John B Hall how
                                            Hello John B Hall how are you
                                            Hello John big hole how are you
                                            Hello John big hole how are you
                                            

                                             Edit: Oops, I just read this post again; I swear I said "hello Jon B how are you", but I agree that my accent must be very French, sorry.
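The repeats above are expected with setShouldReportPartialResults_(True): each callback delivers the whole transcription so far, so consecutive results are often identical. A small, purely illustrative filter (plain Python, no objc_util required) collapses them:

```python
def dedupe_partials(results):
    """Keep only partial transcriptions that differ from the previous one."""
    out = []
    for text in results:
        if not out or text != out[-1]:
            out.append(text)
    return out

# e.g. feeding it part of the transcript above collapses the repeats:
print(dedupe_partials(['Hello John B', 'Hello John B', 'Hello John be']))
# ['Hello John B', 'Hello John be']
```

In the real script, the same check would live inside `handler_recognize`, comparing each `bestTranscription().formattedString()` against the previously printed text.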
