Source code for dronebuddylib.atoms.speechrecognition.vosk_speech_2_text_conversion_impl

import pkg_resources
from vosk import Model, KaldiRecognizer

from dronebuddylib.atoms.speechrecognition.i_speech_to_text_conversion import ISpeechToTextConversion
from dronebuddylib.models.engine_configurations import EngineConfigurations
from dronebuddylib.models.enums import Configurations
from dronebuddylib.models.recognized_speech import RecognizedSpeechResult
from dronebuddylib.utils.logging_config import get_logger
from dronebuddylib.utils.utils import config_validity_check

# Get an instance of a logger
logger = get_logger()
queue = []

'''
:param language: The language of the model. The default is 'en-US'. (currently only supports this language)
:return: The vosk model.

need to initialize the model before using the speechrecognition to text engine.
'''



[docs]
class VoskSpeechToTextConversionImpl(ISpeechToTextConversion):
    """
    This class is an implementation of the ISpeechToTextConversion interface for Vosk API.

    Attributes:
        speech_conversion_engine (KaldiRecognizer): The Vosk KaldiRecognizer object for speech recognition.
    """


[docs]
    def get_class_name(self) -> str:
        """
        Gets the class name.

        Returns:
            str: The class name.
        """
        return 'TEXT_TO_SPEECH_VOSK'



[docs]
    def get_algorithm_name(self) -> str:
        """
        Gets the algorithm name.

        Returns:
            str: The algorithm name.
        """
        return 'Vosk Text to Speech'



[docs]
    def get_required_params(self) -> list:
        """
        Gets the list of required parameters.

        Returns:
            list: The list of required parameters.
        """
        return []



[docs]
    def get_optional_params(self) -> list:
        """
        Gets the list of optional parameters.

        Returns:
            list: The list of optional parameters.
        """
        return [Configurations.SPEECH_RECOGNITION_VOSK_LANGUAGE_MODEL_PATH]


    def __init__(self, engine_configurations: EngineConfigurations):
        """
        Initializes a speech-to-text engine using the Vosk model for a given language.

        Args:
            engine_configurations (EngineConfigurations): The engine configurations containing necessary parameters.
        """
        config_validity_check(self.get_required_params(),
                              engine_configurations.get_configurations_for_engine(self.get_class_name()),
                              self.get_algorithm_name())
        configs = engine_configurations.get_configurations_for_engine(self.get_class_name())

        model_path = pkg_resources.resource_filename(__name__, "resources/speechrecognition/vosk-model-small-en-us-0.15")
        language_model_path = configs.get(Configurations.SPEECH_RECOGNITION_VOSK_LANGUAGE_MODEL_PATH, model_path)

        model = Model(language_model_path)
        vosk_kaldi_model = KaldiRecognizer(model, 44100)
        logger.info('Speech Recognition : Initialized speechrecognition recognition model')

        self.speech_conversion_engine = vosk_kaldi_model


[docs]
    def recognize_speech(self, audio_steam):
        """
        Recognizes text from an audio stream using the Vosk API.

        Args:
            audio_steam (bytes): The audio stream content to be recognized.

        Returns:
            RecognizedSpeechResult: The result containing recognized text and total billed time.
        """
        if self.speech_conversion_engine.AcceptWaveform(audio_steam):
            r = self.speech_conversion_engine.Result()
            logger.debug('Speech Recognition : Recognized utterance : ', r)
            return RecognizedSpeechResult(r, None)
        return None