frostming
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
Lines changed: 4 additions & 0 deletions
diff --git a/.gitignore b/.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/src/tetos/azure.py b/src/tetos/azure.py
Lines changed: 35 additions & 14 deletions
diff --git a/src/tetos/base.py b/src/tetos/base.py
Lines changed: 25 additions & 5 deletions
@@ -9,6 +9,8 @@ jobs:
   release-pypi:
     name: release-pypi
     runs-on: ubuntu-latest
+    permissions:
+      contents: write
 
     steps:
       - uses: actions/checkout@v4
@@ -35,6 +37,7 @@ jobs:
           tetos azure -o azure.mp3 "Hello world"
           tetos edge -o edge.mp3 "Hello world"
           tetos volc -o volc.mp3 "Hello world"
+          tetos google -o google.mp3 "Hello world"
         env:
           OPENAI_API_BASE: ${{ secrets.OPENAI_API_BASE }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
@@ -43,6 +46,7 @@ jobs:
           VOLC_ACCESS_KEY: ${{ secrets.VOLC_ACCESS_KEY }}
           VOLC_SECRET_KEY: ${{ secrets.VOLC_SECRET_KEY }}
           VOLC_APP_KEY: ${{ secrets.VOLC_APP_KEY }}
+          GOOGLE_CREDENTIALS_JSON: ${{ secrets.GOOGLE_CREDENTIALS }}
 
       - name: Upload audio files
         uses: actions/upload-artifact@v2
 
@@ -160,3 +160,4 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+*.mp3
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import os
 from pathlib import Path
 
@@ -15,19 +17,15 @@ class AzureSpeaker(Speaker):
     Args:
         speech_key (str): The Azure Speech key.
         speech_region (str): The Azure Speech region.
-        voice (str): The voice to use.
+        voice (str, optional): The voice to use.
     """
 
     def __init__(
-        self, speech_key: str, speech_region: str, *, voice: str = "en-US-AriaNeural"
+        self, speech_key: str, speech_region: str, *, voice: str | None = None
     ) -> None:
-        self.speech_config = speechsdk.SpeechConfig(
-            subscription=speech_key, region=speech_region
-        )
-        self.speech_config.speech_synthesis_voice_name = voice
-        self.speech_config.set_speech_synthesis_output_format(
-            speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
-        )
+        self.voice = voice
+        self.speech_key = speech_key
+        self.speech_region = speech_region
         self._set_proxy()
 
     def _set_proxy(self) -> None:
@@ -47,10 +45,29 @@ def _set_proxy(self) -> None:
             )
             break
 
-    async def synthesize(self, text: str, out_file: Path) -> float:
+    def get_speech_config(self, lang: str) -> speechsdk.SpeechConfig:
+        config = speechsdk.SpeechConfig(
+            subscription=self.speech_key, region=self.speech_region
+        )
+        config.set_speech_synthesis_output_format(
+            speechsdk.SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
+        )
+        if self.voice:
+            voice = self.voice
+        else:
+            voice = next(
+                (v for v in self.list_voices() if v.startswith(lang)),
+                "en-US-AriaNeural",
+            )
+        config.speech_synthesis_voice_name = voice
+        return config
+
+    async def synthesize(
+        self, text: str, out_file: str | Path, lang: str = "en-US"
+    ) -> float:
         audio_config = speechsdk.audio.AudioOutputConfig(filename=str(out_file))
         speech_synthesizer = speechsdk.SpeechSynthesizer(
-            speech_config=self.speech_config, audio_config=audio_config
+            speech_config=self.get_speech_config(lang), audio_config=audio_config
         )
         result = await anyio.to_thread.run_sync(speech_synthesizer.speak_text, text)
 
@@ -84,12 +101,16 @@ def get_command(cls) -> click.Command:
             required=True,
             help="The Azure Speech region.",
         )
-        @click.option("--voice", default="en-US-AriaNeural", help="The voice to use.")
         @common_options(cls)
         def azure(
-            speech_key: str, speech_region: str, voice: str, text: str, output: str
+            speech_key: str,
+            speech_region: str,
+            voice: str | None,
+            text: str,
+            lang: str,
+            output: str,
         ) -> None:
             speaker = cls(speech_key, speech_region, voice=voice)
-            speaker.say(text, Path(output))
+            speaker.say(text, output, lang=lang)
 
         return azure
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import abc
 from pathlib import Path
 from typing import Any, Callable, TypeVar
@@ -13,12 +15,15 @@ class SynthesizeError(RuntimeError):
 
 class Speaker(metaclass=abc.ABCMeta):
     @abc.abstractmethod
-    async def synthesize(self, text: str, out_file: Path) -> float:
+    async def synthesize(
+        self, text: str, out_file: str | Path, lang: str = "en-US"
+    ) -> float:
         """Generate speech from text and save it to a file.
 
         Args:
             text (str): The text to synthesize.
             out_file (Path): The file to save the speech to.
+            lang (str): The language code of the text. e.g. "en-US", "fr-FR".
 
         Returns:
             float: The duration of the speech in seconds.
@@ -45,17 +50,23 @@ def get_command(cls) -> click.Command:
         """
         raise NotImplementedError
 
-    def say(self, text: str, out_file: Path | None = None) -> float:
-        """A synchronous version of synthesize() that takes an optional
-        playback argument to play the audio.
+    def say(
+        self, text: str, out_file: str | Path | None = None, lang: str = "en-US"
+    ) -> float:
+        """A synchronous version of synthesize().
+
+        Args:
+            text (str): The text to synthesize.
+            out_file (str | Path | None): The file to save the speech to. Defaults to "tts-output.mp3".
+            lang (str): The language code of the text. e.g. "en-US", "fr-FR".
         """
         import anyio
         import click
 
         if out_file is None:
             out_file = Path("tts-output.mp3")
 
-        result = anyio.run(self.synthesize, text, out_file)
+        result = anyio.run(self.synthesize, text, out_file, lang)
         click.echo(f"Speech is generated successfully at {out_file}")
         return result
 
@@ -76,6 +87,15 @@ def decorator(func: F) -> F:
             default="tts-output.mp3",
             help="The output file.",
         )(func)
+        func = click.option(
+            "--voice",
+            help="The voice to use. See supported voices with `--list-voices`",
+        )(func)
+        func = click.option(
+            "--lang",
+            default="en-US",
+            help="The language code of the text. e.g. 'en-US', 'fr-FR'.",
+        )(func)
         func = click.option(
             "--list-voices",
             "-l",