# extended_generate.py
from typing import Dict, Optional, Union

from bark import text_to_semantic, semantic_to_waveform
from bark.generation import SEMANTIC_RATE_HZ
  4. def custom_generate_audio(
  5. text: str,
  6. burn_in_prompt: Optional[str] = None,
  7. history_prompt: Optional[Union[Dict, str]] = None,
  8. history_prompt_semantic: Optional[Union[Dict, str]] = None,
  9. text_temp: float = 0.7,
  10. waveform_temp: float = 0.7,
  11. silent: bool = False,
  12. output_full: bool = False,
  13. max_length=None,
  14. **kwargs,
  15. ):
  16. """Generate audio array from input text.
  17. Args:
  18. text: text to be turned into audio
  19. history_prompt: history choice for audio cloning
  20. text_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
  21. waveform_temp: generation temperature (1.0 more diverse, 0.0 more conservative)
  22. silent: disable progress bar
  23. output_full: return full generation to be used as a history prompt
  24. Returns:
  25. numpy audio array at sample frequency 24khz
  26. """
  27. history_prompt_semantic = history_prompt_semantic or history_prompt
  28. if burn_in_prompt is not None and len(burn_in_prompt) > 0:
  29. burn_in_prompt_semantic = text_to_semantic(
  30. burn_in_prompt,
  31. history_prompt=history_prompt_semantic,
  32. temp=text_temp,
  33. silent=silent,
  34. )
  35. history_prompt_semantic = {
  36. "coarse_prompt": None,
  37. "fine_prompt": None,
  38. "semantic_prompt": burn_in_prompt_semantic,
  39. }
  40. semantic_tokens = text_to_semantic(
  41. text,
  42. history_prompt=history_prompt_semantic,
  43. temp=text_temp,
  44. silent=silent,
  45. )
  46. out = semantic_to_waveform(
  47. semantic_tokens,
  48. history_prompt=history_prompt,
  49. temp=waveform_temp,
  50. silent=silent,
  51. output_full=output_full,
  52. max_gen_duration_s=max_length,
  53. )
  54. if output_full:
  55. full_generation, audio_arr = out
  56. if max_length is not None:
  57. semantic_tokens = semantic_tokens[: int(max_length * SEMANTIC_RATE_HZ)]
  58. full_generation["semantic_prompt"] = semantic_tokens
  59. return full_generation, audio_arr
  60. else:
  61. audio_arr = out
  62. return audio_arr