123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081 |
- import sys
- import asyncio
- from io import BytesIO
- from fairseq import checkpoint_utils
- import torch
- import edge_tts
- import librosa
- # https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/main/config.py#L43-L55 # noqa
def has_mps() -> bool:
    """Report whether a usable Apple MPS (Metal) torch device is present.

    Returns False immediately off macOS or when this torch build lacks
    MPS support; otherwise probes the backend by moving a tiny tensor
    onto the ``mps`` device.
    """
    if sys.platform != "darwin":
        return False
    # Older torch builds do not expose ``has_mps`` at all.
    if not getattr(torch, "has_mps", False):
        return False
    try:
        # Smoke-test the device: allocation can still fail even when the
        # attribute is set (e.g. unsupported macOS version).
        torch.zeros(1).to(torch.device("mps"))
    except Exception:
        return False
    return True
def is_half(device: str) -> bool:
    """Decide whether half precision (fp16) should be used on *device*.

    Non-CUDA devices never use fp16.  For CUDA devices, a denylist of
    GPU families with poor fp16 support (16-series except V100, P40,
    10x0 cards) forces fp32.
    """
    if not device.startswith('cuda'):
        return False
    ordinal = int(device.split(':')[-1])
    name = torch.cuda.get_device_name(ordinal).upper()
    # ...regex?
    fp16_unsupported = (
        ('16' in name and 'V100' not in name)
        or any(tag in name for tag in ('P40', '1060', '1070', '1080'))
    )
    return not fp16_unsupported
def load_hubert_model(device: str, model_path: str = 'hubert_base.pt'):
    """Load the HuBERT checkpoint at *model_path* onto *device*.

    The model is cast to half precision when :func:`is_half` approves of
    the device, and to full (fp32) precision otherwise.
    """
    ensemble = checkpoint_utils.load_model_ensemble_and_task([model_path])
    # First element of the result is the model list; take the first model.
    model = ensemble[0][0].to(device)
    return model.half() if is_half(device) else model.float()
async def call_edge_tts(speaker_name: str, text: str):
    """Synthesize *text* with Microsoft Edge TTS using voice *speaker_name*.

    Streams the mp3 audio from edge-tts, pipes it through an external
    ``ffmpeg`` process to convert it to wav, and returns the result of
    ``librosa.load`` on the wav bytes (an ``(audio, sample_rate)`` tuple).

    Requires the ``ffmpeg`` binary to be available on PATH.
    """
    tts_com = edge_tts.Communicate(text, speaker_name)
    # Collect chunks in a list and join once at the end: repeated
    # ``bytes +=`` is quadratic in the number of streamed chunks.
    chunks = []
    async for chunk in tts_com.stream():
        if chunk['type'] == 'audio':
            chunks.append(chunk['data'])
    tts_raw = b''.join(chunks)
    # Convert the mp3 stream to wav: ffmpeg reads stdin, writes stdout.
    ffmpeg_proc = await asyncio.create_subprocess_exec(
        'ffmpeg',
        '-f', 'mp3',
        '-i', '-',
        '-f', 'wav',
        '-',
        stdin=asyncio.subprocess.PIPE,
        stdout=asyncio.subprocess.PIPE
    )
    (tts_wav, _) = await ffmpeg_proc.communicate(tts_raw)
    return librosa.load(BytesIO(tts_wav))
|