Source code for dronebuddylib.atoms.speechrecognition.google_speech_2_text_conversion_impl

from google.cloud import speech

from dronebuddylib.atoms.speechrecognition.i_speech_to_text_conversion import ISpeechToTextConversion
from dronebuddylib.models.engine_configurations import EngineConfigurations
from dronebuddylib.models.enums import Configurations
from dronebuddylib.models.recognized_speech import RecognizedSpeechResult
from dronebuddylib.utils.utils import config_validity_check



[docs]
class GoogleSpeechToTextConversionImpl(ISpeechToTextConversion):
    """
    Implementation of the ISpeechToTextConversion interface backed by the
    Google Cloud Speech-to-Text API.

    Attributes:
        sample_rate (int): The sample rate of the audio stream in hertz.
        language (str): The language code of the speech in the audio stream.
        encoding (speech.RecognitionConfig.AudioEncoding): The encoding type of the audio stream.
        speech_conversion_engine (speech.SpeechClient): The Google Cloud Speech-to-Text client.
    """

    def __init__(self, engine_configurations: EngineConfigurations):
        """
        Initializes the engine from the provided engine configurations.

        The configurations registered under this engine's class name are
        validated against the required parameters and then used to populate
        the recognition settings. Missing optional settings fall back to
        16000 Hz, 'en-US' and LINEAR16 respectively.

        Args:
            engine_configurations (EngineConfigurations): The engine configurations containing necessary parameters.
        """
        # Look the configuration up once and reuse it for both validation and reads.
        configs = engine_configurations.get_configurations_for_engine(self.get_class_name())
        config_validity_check(self.get_required_params(),
                              configs,
                              self.get_algorithm_name())

        self.sample_rate = configs.get(Configurations.SPEECH_RECOGNITION_GOOGLE_SAMPLE_RATE_HERTZ, 16000)
        self.language = configs.get(Configurations.SPEECH_RECOGNITION_GOOGLE_LANGUAGE_CODE, 'en-US')
        self.encoding = configs.get(Configurations.SPEECH_RECOGNITION_GOOGLE_ENCODING,
                                    speech.RecognitionConfig.AudioEncoding.LINEAR16)

        # The client is built with no arguments.
        # NOTE(review): credentials are presumably resolved by the client
        # library from the environment - confirm for your deployment.
        self.speech_conversion_engine = speech.SpeechClient()

    def get_class_name(self) -> str:
        """
        Gets the class name.

        Returns:
            str: The class name.
        """
        return 'SPEECH_TO_TEXT_GOOGLE'

    def get_algorithm_name(self) -> str:
        """
        Gets the algorithm name.

        Returns:
            str: The algorithm name.
        """
        return 'Google Speech to Text'

    def get_optional_params(self) -> list:
        """
        Gets the list of optional parameters.

        Returns:
            list: The list of optional parameters.
        """
        return [Configurations.SPEECH_RECOGNITION_GOOGLE_SAMPLE_RATE_HERTZ,
                Configurations.SPEECH_RECOGNITION_GOOGLE_LANGUAGE_CODE,
                Configurations.SPEECH_RECOGNITION_GOOGLE_ENCODING]

    def get_required_params(self) -> list:
        """
        Gets the list of required parameters.

        Returns:
            list: The list of required parameters (none for this engine).
        """
        return []

    def recognize_speech(self, audio_steam) -> RecognizedSpeechResult:
        """
        Recognizes speech from an audio stream using the Google Cloud Speech-to-Text API.

        The transcript of the first alternative of every returned result is
        also printed to stdout.

        Args:
            audio_steam (bytes): The audio stream content to be recognized.

        Returns:
            RecognizedSpeechResult: Built from the response's results and its total billed time.

        Raises:
            Any exception raised by the client's ``recognize`` call is
            propagated unchanged; nothing is caught here.
        """
        # RecognitionAudio is a message type exported by the ``speech`` module;
        # it is not an attribute of the SpeechClient instance.
        audio = speech.RecognitionAudio(content=audio_steam)

        config = speech.RecognitionConfig(
            encoding=self.encoding,
            sample_rate_hertz=self.sample_rate,
            language_code=self.language,
        )

        # Detects speech in the audio content
        response = self.speech_conversion_engine.recognize(config=config, audio=audio)

        for result in response.results:
            # Skip results that carry no alternatives instead of failing on [0].
            if result.alternatives:
                print(f"Transcript: {result.alternatives[0].transcript}")

        return RecognizedSpeechResult(response.results, response.total_billed_time)