# Source code for dronebuddylib.atoms.speechrecognition.vosk_speech_2_text_conversion_impl

import pkg_resources
from vosk import Model, KaldiRecognizer

from dronebuddylib.atoms.speechrecognition.i_speech_to_text_conversion import ISpeechToTextConversion
from dronebuddylib.models.engine_configurations import EngineConfigurations
from dronebuddylib.models.enums import Configurations
from dronebuddylib.models.recognized_speech import RecognizedSpeechResult
from dronebuddylib.utils.logging_config import get_logger
from dronebuddylib.utils.utils import config_validity_check

# Module-level logger obtained from the project's logging configuration helper.
logger = get_logger()
# Module-level list, initially empty.
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
queue = []

'''
:param language: The language of the model. Defaults to 'en-US', which is currently the only supported language.
:return: The Vosk model.

The model must be initialized before the speech-to-text engine can be used.
'''


class VoskSpeechToTextConversionImpl(ISpeechToTextConversion):
    """
    Implementation of the ISpeechToTextConversion interface backed by the Vosk API.

    Attributes:
        speech_conversion_engine (KaldiRecognizer): The Vosk KaldiRecognizer object
            used for speech recognition.
    """

    def get_class_name(self) -> str:
        """
        Gets the class name.

        Returns:
            str: The class name.
        """
        # NOTE(review): the identifier reads "TEXT_TO_SPEECH" although this class does
        # speech-to-text. It is left unchanged because __init__ uses it as the key for
        # looking up this engine's configuration.
        return 'TEXT_TO_SPEECH_VOSK'

    def get_algorithm_name(self) -> str:
        """
        Gets the algorithm name.

        Returns:
            str: The algorithm name.
        """
        return 'Vosk Text to Speech'

    def get_required_params(self) -> list:
        """
        Gets the list of required parameters.

        Returns:
            list: The list of required parameters (none for this engine).
        """
        return []

    def get_optional_params(self) -> list:
        """
        Gets the list of optional parameters.

        Returns:
            list: The list of optional parameters.
        """
        return [Configurations.SPEECH_RECOGNITION_VOSK_LANGUAGE_MODEL_PATH]

    def __init__(self, engine_configurations: EngineConfigurations):
        """
        Initializes a speech-to-text engine using a Vosk model.

        The model is loaded from the path configured under
        SPEECH_RECOGNITION_VOSK_LANGUAGE_MODEL_PATH; when that key is absent, the
        packaged 'vosk-model-small-en-us-0.15' resource path is used instead.

        Args:
            engine_configurations (EngineConfigurations): The engine configurations
                containing necessary parameters.
        """
        # Fetch this engine's configuration once and reuse it for both the
        # validity check and the model-path lookup.
        configs = engine_configurations.get_configurations_for_engine(self.get_class_name())
        config_validity_check(self.get_required_params(), configs, self.get_algorithm_name())

        # Fallback model location, resolved relative to this module's package resources.
        default_model_path = pkg_resources.resource_filename(
            __name__, "resources/speechrecognition/vosk-model-small-en-us-0.15")
        language_model_path = configs.get(
            Configurations.SPEECH_RECOGNITION_VOSK_LANGUAGE_MODEL_PATH, default_model_path)

        model = Model(language_model_path)
        # The second argument is the sample rate given to the recognizer.
        # NOTE(review): audio passed to recognize_speech presumably has to be
        # recorded at this same rate (44100 Hz) — confirm against callers.
        vosk_kaldi_model = KaldiRecognizer(model, 44100)
        logger.info('Speech Recognition : Initialized speechrecognition recognition model')
        self.speech_conversion_engine = vosk_kaldi_model

    def recognize_speech(self, audio_steam):
        """
        Recognizes text from a chunk of audio using the Vosk API.

        Args:
            audio_steam (bytes): The audio stream content to be recognized.

        Returns:
            RecognizedSpeechResult: Wraps the recognizer's result when the recognizer
            accepts the waveform; the second constructor argument is always None here.
            None: When the recognizer does not accept the waveform.
        """
        if not self.speech_conversion_engine.AcceptWaveform(audio_steam):
            return None

        result = self.speech_conversion_engine.Result()
        # Use a %s placeholder: passing an extra argument without one makes the
        # logging call fail to format and the recognized text is never logged.
        logger.debug('Speech Recognition : Recognized utterance : %s', result)
        return RecognizedSpeechResult(result, None)