# app_multi.py

from typing import Union
from argparse import ArgumentParser
from pathlib import Path
import subprocess
import librosa
import os
import time
import random
import numpy as np
from PIL import Image, ImageDraw, ImageFont
from moviepy.editor import ImageSequenceClip, AudioFileClip
from moviepy.video.io.VideoFileClip import VideoFileClip
import asyncio
import json
import hashlib
from os import path, getenv
from pydub import AudioSegment
import gradio as gr
import torch
import edge_tts
from datetime import datetime
from scipy.io.wavfile import write
import config
import util
from infer_pack.models import (
    SynthesizerTrnMs768NSFsid,
    SynthesizerTrnMs768NSFsid_nono
)
from vc_infer_pipeline import VC
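# config, util, infer_pack and vc_infer_pipeline appear to be local modules
# shipped alongside this script rather than PyPI packages.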
# Reference: https://huggingface.co/spaces/zomehwh/rvc-models/blob/main/app.py#L21  # noqa
in_hf_space = getenv('SYSTEM') == 'spaces'
high_quality = True
# Argument parsing
arg_parser = ArgumentParser()
arg_parser.add_argument(
    '--hubert',
    default=getenv('RVC_HUBERT', 'hubert_base.pt'),
    help='path to hubert base model (default: hubert_base.pt)'
)
arg_parser.add_argument(
    '--config',
    default=getenv('RVC_MULTI_CFG', 'multi_config.json'),
    help='path to config file (default: multi_config.json)'
)
arg_parser.add_argument(
    '--api',
    action='store_true',
    help='enable API endpoint'
)
arg_parser.add_argument(
    '--cache-examples',
    action='store_true',
    help='enable example caching; remember to delete the gradio_cached_examples folder after modifying the example config'  # noqa
)
args = arg_parser.parse_args()
app_css = '''
#model_info img {
    max-width: 100px;
    max-height: 100px;
    float: right;
}
#model_info p {
    margin: unset;
}
'''
app = gr.Blocks(
    theme=gr.themes.Soft(primary_hue="orange", secondary_hue="slate"),
    css=app_css,
    analytics_enabled=False
)
# Load the HuBERT model
hubert_model = util.load_hubert_model(config.device, args.hubert)
hubert_model.eval()
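# The HuBERT model extracts the content features that VC.pipeline() consumes below.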
# Load models
multi_cfg = json.load(open(args.config, 'r'))
loaded_models = []
for model_name in multi_cfg.get('models'):
    print(f'Loading model: {model_name}')
    # Load model info
    model_info = json.load(
        open(path.join('model', model_name, 'config.json'), 'r')
    )
    # Load RVC checkpoint
    cpt = torch.load(
        path.join('model', model_name, model_info['model']),
        map_location='cpu'
    )
    tgt_sr = cpt['config'][-1]
    cpt['config'][-3] = cpt['weight']['emb_g.weight'].shape[0]  # n_spk
    if_f0 = cpt.get('f0', 1)
    net_g: Union[SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono]
    if if_f0 == 1:
        net_g = SynthesizerTrnMs768NSFsid(
            *cpt['config'],
            is_half=util.is_half(config.device)
        )
    else:
        net_g = SynthesizerTrnMs768NSFsid_nono(*cpt['config'])
    del net_g.enc_q
    # According to the original code, this seems necessary.
    print(net_g.load_state_dict(cpt['weight'], strict=False))
    net_g.eval().to(config.device)
    net_g = net_g.half() if util.is_half(config.device) else net_g.float()
    vc = VC(tgt_sr, config)
    loaded_models.append(dict(
        name=model_name,
        metadata=model_info,
        vc=vc,
        net_g=net_g,
        if_f0=if_f0,
        target_sr=tgt_sr
    ))
print(f'Models loaded: {len(loaded_models)}')
# Edge TTS speakers
tts_speakers_list = asyncio.get_event_loop().run_until_complete(edge_tts.list_voices())  # noqa
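# Each entry returned by edge_tts.list_voices() is a dict; the UI below reads
# its 'ShortName', 'FriendlyName' and 'Gender' fields.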
# Make MV
def make_bars_image(height_values, index, new_height):
    # Define the size of the image
    width = 512
    height = new_height
    # Create a new image with a transparent background
    image = Image.new('RGBA', (width, height), color=(0, 0, 0, 0))
    # Get the image drawing context
    draw = ImageDraw.Draw(image)
    # Define the rectangle width and spacing
    rect_width = 2
    spacing = 2
    # Example height values: [20, 40, 60, 80, 100, 80, 60, 40]
    num_bars = len(height_values)
    # Calculate the total width of the rectangles and the spacing
    total_width = num_bars * rect_width + (num_bars - 1) * spacing
    # Calculate the starting position for the first rectangle
    start_x = int((width - total_width) / 2)
    # Define the buffer size
    buffer_size = 80
    # Draw the rectangles from left to right
    x = start_x
    for i, bar_height in enumerate(height_values):
        # Define the rectangle coordinates
        y0 = buffer_size
        y1 = bar_height + buffer_size
        x0 = x
        x1 = x + rect_width
        # Draw the rectangle
        draw.rectangle([x0, y0, x1, y1], fill='white')
        # Move to the next rectangle position
        if i < num_bars - 1:
            x += rect_width + spacing
    # Rotate the image by 180 degrees
    image = image.rotate(180)
    # Mirror the image
    image = image.transpose(Image.FLIP_LEFT_RIGHT)
    # Save the image
    out_name = 'audio_bars_' + str(index) + '.png'
    image.save(out_name)
    return out_name
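# Shim for ImageDraw.textsize, which was removed in Pillow 10;
# textbbox() provides the same measurement.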
def textsize(text, font):
    im = Image.new(mode="P", size=(0, 0))
    draw = ImageDraw.Draw(im)
    _, _, width, height = draw.textbbox((0, 0), text=text, font=font)
    return width, height
def db_to_height(db_value):
    # Scale the dB value (roughly -80..0) to a range between 0 and 1
    scaled_value = (db_value + 80) / 80
    # Convert the scaled value to a height between 0 and 50 pixels
    height = scaled_value * 50
    return height
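# infer() builds the music video: load the audio, take an STFT, map each
# frame's dB values to bar heights, composite the bars over the background
# image, then assemble the frames plus the audio into an MP4.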
def infer(title, audio_in, image_in):
    # Load the audio file
    audio_path = audio_in
    audio_data, sr = librosa.load(audio_path)
    # Get the duration in seconds
    duration = librosa.get_duration(y=audio_data, sr=sr)
    # Extract the audio data for the desired time range
    start_time = 0  # start time in seconds
    end_time = duration  # end time in seconds
    start_index = int(start_time * sr)
    end_index = int(end_time * sr)
    audio_data = audio_data[start_index:end_index]
    # Compute the short-time Fourier transform
    hop_length = 512
    stft = librosa.stft(audio_data, hop_length=hop_length)
    spectrogram = librosa.amplitude_to_db(np.abs(stft), ref=np.max)
    # Get the frequency values; the STFT has n_fft // 2 + 1 rows,
    # so recover n_fft as (rows - 1) * 2
    freqs = librosa.fft_frequencies(sr=sr, n_fft=(stft.shape[0] - 1) * 2)
    # Select the indices of the frequency values that correspond to the desired frequencies
    n_freqs = 114
    freq_indices = np.linspace(0, len(freqs) - 1, n_freqs, dtype=int)
    # Extract the dB values for the desired frequencies
    db_values = []
    for i in range(spectrogram.shape[1]):
        db_values.append(list(zip(freqs[freq_indices], spectrogram[freq_indices, i])))
    # Print the dB values for the first time frame
    print(db_values[0])
    proportional_values = []
    for frame in db_values:
        proportional_frame = [db_to_height(db) for _freq, db in frame]
        proportional_values.append(proportional_frame)
    print(proportional_values[0])
    print("AUDIO CHUNK: " + str(len(proportional_values)))
    # Open the background image
    background_image = Image.open(image_in)
    # Resize the image while keeping its aspect ratio
    bg_width, bg_height = background_image.size
    aspect_ratio = bg_width / bg_height
    new_width = 512
    new_height = int(new_width / aspect_ratio)
    resized_bg = background_image.resize((new_width, new_height))
    # Paste a black band (black_cache.png) at the bottom for better visibility
    # of the white text
    bg_cache = Image.open('black_cache.png')
    resized_bg.paste(bg_cache, (0, resized_bg.height - bg_cache.height), mask=bg_cache)
    # Create a new ImageDraw object
    draw = ImageDraw.Draw(resized_bg)
    # Define the text to be added
    text = title
    font = ImageFont.truetype("Lato-Regular.ttf", 16)
    text_color = (255, 255, 255)  # white
    # Calculate the position of the text
    text_width, text_height = textsize(text, font=font)
    x = 30
    y = new_height - 70
    # Draw the text on the image
    draw.text((x, y), text, fill=text_color, font=font)
    # Save the resized image
    resized_bg.save('resized_background.jpg')
    generated_frames = []
    for i, frame in enumerate(proportional_values):
        bars_img = make_bars_image(frame, i, new_height)
        bars_img = Image.open(bars_img)
        # Paste the audio bars image on top of the background image
        fresh_bg = Image.open('resized_background.jpg')
        fresh_bg.paste(bars_img, (0, 0), mask=bars_img)
        # Save the composited frame
        fresh_bg.save('audio_bars_with_bg' + str(i) + '.jpg')
        generated_frames.append('audio_bars_with_bg' + str(i) + '.jpg')
    print(generated_frames)
    # Create a video clip from the images
    clip = ImageSequenceClip(generated_frames, fps=len(generated_frames) / (end_time - start_time))
    audio_clip = AudioFileClip(audio_in)
    clip = clip.set_audio(audio_clip)
    # Set the output codecs
    codec = 'libx264'
    audio_codec = 'aac'
    # Save the video to a file
    clip.write_videofile("my_video.mp4", codec=codec, audio_codec=audio_codec)
    retimed_clip = VideoFileClip("my_video.mp4")
    # Set the desired frame rate
    new_fps = 25
    # Create a new clip with the new frame rate
    new_clip = retimed_clip.set_fps(new_fps)
    # Save the new clip as a new video file
    new_clip.write_videofile("my_video_retimed.mp4", codec=codec, audio_codec=audio_codec)
    return "my_video_retimed.mp4"
# Mix the vocal and non-vocal tracks
def mix(audio1, audio2):
    sound1 = AudioSegment.from_file(audio1)
    sound2 = AudioSegment.from_file(audio2)
    # overlay() keeps sound1's duration
    mixed = sound1.overlay(sound2)
    mixed.export("song.wav", format="wav")
    return "song.wav"
# Download a clip's audio from Bilibili via yt-dlp
def youtube_downloader(
    video_identifier,
    start_time,
    end_time,
    output_filename="track.wav",
    num_attempts=5,
    url_base="",
    quiet=False,
    force=True,
):
    output_path = Path(output_filename)
    if output_path.exists():
        if not force:
            return output_path
        else:
            output_path.unlink()
    quiet_flags = "--quiet --no-warnings" if quiet else ""
    command = (
        f'yt-dlp {quiet_flags} -x --audio-format wav -f bestaudio '
        f'-o "{output_filename}" '
        f'--download-sections "*{start_time}-{end_time}" '
        f'"{url_base}{video_identifier}"'
    )
    attempts = 0
    while True:
        try:
            subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            attempts += 1
            if attempts == num_attempts:
                return None
        else:
            break
    if output_path.exists():
        return output_path
    return None
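# Hypothetical usage (values for illustration only):
#   youtube_downloader("https://www.bilibili.com/video/BV...", 0, 15)
# downloads the first 15 seconds of audio to track.wav, retrying up to
# num_attempts times before giving up.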
def audio_separated(audio_input, progress=gr.Progress()):
    # Start progress
    progress(progress=0, desc="Starting...")
    time.sleep(0.1)
    # Check the file input
    if audio_input is None:
        # Show progress
        for i in progress.tqdm(range(100), desc="Please wait..."):
            time.sleep(0.01)
        return (None, None, 'Please input audio.')
    # Create a unique filename
    filename = str(random.randint(10000, 99999)) + datetime.now().strftime("%d%m%Y%H%M%S")
    progress(progress=0.10, desc="Please wait...")
    # Make the output dir
    os.makedirs("output", exist_ok=True)
    progress(progress=0.20, desc="Please wait...")
    # Write the input audio to disk
    if high_quality:
        write(filename + ".wav", audio_input[0], audio_input[1])
    else:
        write(filename + ".mp3", audio_input[0], audio_input[1])
    progress(progress=0.50, desc="Please wait...")
    # Split vocals from accompaniment with demucs
    if high_quality:
        command_demucs = "python3 -m demucs --two-stems=vocals -d cpu " + filename + ".wav -o output"
    else:
        command_demucs = "python3 -m demucs --two-stems=vocals --mp3 --mp3-bitrate 128 -d cpu " + filename + ".mp3 -o output"
    os.system(command_demucs)
    progress(progress=0.70, desc="Please wait...")
    # Remove the temporary input file
    if high_quality:
        os.remove(filename + ".wav")
    else:
        os.remove(filename + ".mp3")
    progress(progress=0.80, desc="Please wait...")
    for i in progress.tqdm(range(80, 100), desc="Please wait..."):
        time.sleep(0.1)
    # demucs writes its stems under output/htdemucs/<filename>/
    ext = "wav" if high_quality else "mp3"
    vocals = f"./output/htdemucs/{filename}/vocals.{ext}"
    no_vocals = f"./output/htdemucs/{filename}/no_vocals.{ext}"
    print(vocals, no_vocals, "Successfully...")
    return vocals, no_vocals, "Successfully..."
# https://github.com/fumiama/Retrieval-based-Voice-Conversion-WebUI/blob/main/infer-web.py#L118  # noqa
def vc_func(
    input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
    filter_radius, rms_mix_rate, resample_option
):
    if input_audio is None:
        return (None, 'Please provide input audio.')
    if model_index is None:
        return (None, 'Please select a model.')
    model = loaded_models[model_index]
    # Reference: so-vits
    (audio_samp, audio_npy) = input_audio
    # https://huggingface.co/spaces/zomehwh/rvc-models/blob/main/app.py#L49
    # This limit may change later; we will see
    if (audio_npy.shape[0] / audio_samp) > 600 and in_hf_space:
        return (None, 'Input audio is longer than 600 secs.')
    # Convert integer PCM to float32
    # See https://stackoverflow.com/questions/26921836/
    if audio_npy.dtype != np.float32:
        audio_npy = (
            audio_npy / np.iinfo(audio_npy.dtype).max
        ).astype(np.float32)
    # Downmix to mono
    if len(audio_npy.shape) > 1:
        audio_npy = librosa.to_mono(audio_npy.transpose(1, 0))
    # Resample to the 16 kHz expected by HuBERT
    if audio_samp != 16000:
        audio_npy = librosa.resample(
            audio_npy,
            orig_sr=audio_samp,
            target_sr=16000
        )
    pitch_int = int(pitch_adjust)
    resample = (
        0 if resample_option == 'Disable resampling'
        else int(resample_option)
    )
    times = [0, 0, 0]
    checksum = hashlib.sha512()
    checksum.update(audio_npy.tobytes())
    output_audio = model['vc'].pipeline(
        hubert_model,
        model['net_g'],
        model['metadata'].get('speaker_id', 0),
        audio_npy,
        checksum.hexdigest(),
        times,
        pitch_int,
        f0_method,
        path.join('model', model['name'], model['metadata']['feat_index']),
        feat_ratio,
        model['if_f0'],
        filter_radius,
        model['target_sr'],
        resample,
        rms_mix_rate,
        'v2'
    )
    out_sr = (
        resample if resample >= 16000 and model['target_sr'] != resample
        else model['target_sr']
    )
    print(f'npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s')
    return ((out_sr, output_audio), 'Success')
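# Synthesize speech with Edge TTS, then feed the result through vc_func() above.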
async def edge_tts_vc_func(
    input_text, model_index, tts_speaker, pitch_adjust, f0_method, feat_ratio,
    filter_radius, rms_mix_rate, resample_option
):
    if input_text is None:
        return (None, 'Please provide TTS text.')
    if tts_speaker is None:
        return (None, 'Please select a TTS speaker.')
    if model_index is None:
        return (None, 'Please select a model.')
    speaker = tts_speakers_list[tts_speaker]['ShortName']
    (tts_np, tts_sr) = await util.call_edge_tts(speaker, input_text)
    return vc_func(
        (tts_sr, tts_np),
        model_index,
        pitch_adjust,
        f0_method,
        feat_ratio,
        filter_radius,
        rms_mix_rate,
        resample_option
    )
def update_model_info(model_index):
    if model_index is None:
        return str(
            '### Model info\n'
            'Please select a model from the dropdown above.'
        )
    model = loaded_models[model_index]
    model_icon = model['metadata'].get('icon', '')
    return str(
        '### Model info\n'
        '![model icon]({icon})'
        '**{name}**\n\n'
        'Author: {author}\n\n'
        'Source: {source}\n\n'
        '{note}'
    ).format(
        name=model['metadata'].get('name'),
        author=model['metadata'].get('author', 'Anonymous'),
        source=model['metadata'].get('source', 'Unknown'),
        note=model['metadata'].get('note', ''),
        icon=(
            model_icon
            if model_icon.startswith(('http://', 'https://'))
            else '/file/model/%s/%s' % (model['name'], model_icon)
        )
    )
def _example_vc(
    input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
    filter_radius, rms_mix_rate, resample_option
):
    (audio, message) = vc_func(
        input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
        filter_radius, rms_mix_rate, resample_option
    )
    return (
        audio,
        message,
        update_model_info(model_index)
    )


async def _example_edge_tts(
    input_text, model_index, tts_speaker, pitch_adjust, f0_method, feat_ratio,
    filter_radius, rms_mix_rate, resample_option
):
    (audio, message) = await edge_tts_vc_func(
        input_text, model_index, tts_speaker, pitch_adjust, f0_method,
        feat_ratio, filter_radius, rms_mix_rate, resample_option
    )
    return (
        audio,
        message,
        update_model_info(model_index)
    )
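# Build the Gradio UI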
with app:
    gr.HTML("<center>"
            "<h1>🥳🎶🎡 - AI Singer: RVC Singing Voice Conversion + AI Voice Changer</h1>"
            "</center>")
    gr.Markdown("### <center>🦄 - Automatically extracts the audio from a video and removes the background music; Powered by [RVC-Project](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)</center>")
    gr.Markdown("### <center>For more exciting apps, follow [滔滔AI](http://www.talktalkai.com); 滔滔AI, powered by love! 💕</center>")
    with gr.Tab("🤗 - Extract audio from a Bilibili video"):
        with gr.Row():
            with gr.Column():
                ydl_url_input = gr.Textbox(label="Bilibili video URL (the BV id alone also works)", value="https://www.bilibili.com/video/BV...")
                start = gr.Number(value=0, label="Start time (seconds)")
                end = gr.Number(value=15, label="End time (seconds)")
                ydl_url_submit = gr.Button("Extract the audio", variant="primary")
                as_audio_submit = gr.Button("Remove the background music", variant="primary")
            with gr.Column():
                ydl_audio_output = gr.Audio(label="Audio from Bilibili")
                as_audio_input = ydl_audio_output
                as_audio_vocals = gr.Audio(label="Vocals only")
                as_audio_no_vocals = gr.Audio(label="Music only", type="filepath", visible=False)
                as_audio_message = gr.Textbox(label="Message", visible=False)
        ydl_url_submit.click(fn=youtube_downloader, inputs=[ydl_url_input, start, end], outputs=[ydl_audio_output])
        as_audio_submit.click(fn=audio_separated, inputs=[as_audio_input], outputs=[as_audio_vocals, as_audio_no_vocals, as_audio_message], show_progress=True, queue=True)
    with gr.Row():
        with gr.Column():
            with gr.Tab('🎶 - Singing voice conversion'):
                input_audio = as_audio_vocals
                vc_convert_btn = gr.Button('Convert the singing voice!', variant='primary')
                full_song = gr.Button("Add the accompaniment!", variant="primary")
                new_song = gr.Audio(label="AI singer + accompaniment", type="filepath")
            with gr.Tab('🎙️ - Text-to-speech'):
                tts_input = gr.Textbox(
                    label='Enter the text you want to convert (Chinese or English)',
                    lines=3
                )
                tts_speaker = gr.Dropdown(
                    [
                        '%s (%s)' % (
                            s['FriendlyName'],
                            s['Gender']
                        )
                        for s in tts_speakers_list
                    ],
                    label='Select a speaker for the corresponding language',
                    type='index'
                )
                tts_convert_btn = gr.Button('Run AI voice conversion', variant='primary')
            with gr.Tab("📺 - Music video"):
                with gr.Row():
                    with gr.Column():
                        inp1 = gr.Textbox(label="Add a caption to your video (optional; English)")
                        inp2 = new_song
                        inp3 = gr.Image(source='upload', type='filepath', label="Upload a background image")
                        btn = gr.Button("Generate your own music video", variant="primary")
                    with gr.Column():
                        out1 = gr.Video(label='Your music video')
                btn.click(fn=infer, inputs=[inp1, inp2, inp3], outputs=[out1])
            pitch_adjust = gr.Slider(
                label='Pitch',
                minimum=-24,
                maximum=24,
                step=1,
                value=0
            )
            f0_method = gr.Radio(
                label='f0 method',
                choices=['pm', 'rmvpe'],
                value='rmvpe',
                interactive=True
            )
            with gr.Accordion('More settings', open=False):
                feat_ratio = gr.Slider(
                    label='Feature ratio',
                    minimum=0,
                    maximum=1,
                    step=0.1,
                    value=0.6
                )
                filter_radius = gr.Slider(
                    label='Filter radius',
                    minimum=0,
                    maximum=7,
                    step=1,
                    value=3
                )
                rms_mix_rate = gr.Slider(
                    label='Volume envelope mix rate',
                    minimum=0,
                    maximum=1,
                    step=0.1,
                    value=1
                )
                resample_rate = gr.Dropdown(
                    [
                        'Disable resampling',
                        '16000',
                        '22050',
                        '44100',
                        '48000'
                    ],
                    label='Resample rate',
                    value='Disable resampling'
                )
        with gr.Column():
            # Model select
            model_index = gr.Dropdown(
                [
                    '%s - %s' % (
                        m['metadata'].get('source', 'Unknown'),
                        m['metadata'].get('name')
                    )
                    for m in loaded_models
                ],
                label='Select your AI singer (required)',
                type='index'
            )
            # Model info
            with gr.Box():
                model_info = gr.Markdown(
                    '### AI singer info\n'
                    'Please select a model from the dropdown above.',
                    elem_id='model_info'
                )
            output_audio = gr.Audio(label='AI singer (no accompaniment)', type="filepath")
            output_msg = gr.Textbox(label='Output message')
    multi_examples = multi_cfg.get('examples')
    if (
        multi_examples and
        multi_examples.get('vc') and multi_examples.get('tts_vc')
    ):
        with gr.Accordion('Sweet sweet examples', open=False):
            with gr.Row():
                # VC example
                if multi_examples.get('vc'):
                    gr.Examples(
                        label='Audio conversion examples',
                        examples=multi_examples.get('vc'),
                        inputs=[
                            input_audio, model_index, pitch_adjust, f0_method,
                            feat_ratio
                        ],
                        outputs=[output_audio, output_msg, model_info],
                        fn=_example_vc,
                        cache_examples=args.cache_examples,
                        run_on_click=args.cache_examples
                    )
                # Edge TTS example
                if multi_examples.get('tts_vc'):
                    gr.Examples(
                        label='TTS conversion examples',
                        examples=multi_examples.get('tts_vc'),
                        inputs=[
                            tts_input, model_index, tts_speaker, pitch_adjust,
                            f0_method, feat_ratio
                        ],
                        outputs=[output_audio, output_msg, model_info],
                        fn=_example_edge_tts,
                        cache_examples=args.cache_examples,
                        run_on_click=args.cache_examples
                    )
    vc_convert_btn.click(
        vc_func,
        [
            input_audio, model_index, pitch_adjust, f0_method, feat_ratio,
            filter_radius, rms_mix_rate, resample_rate
        ],
        [output_audio, output_msg],
        api_name='audio_conversion',
    )
    tts_convert_btn.click(
        edge_tts_vc_func,
        [
            tts_input, model_index, tts_speaker, pitch_adjust, f0_method,
            feat_ratio, filter_radius, rms_mix_rate, resample_rate
        ],
        [output_audio, output_msg],
        api_name='tts_conversion',
    )
    full_song.click(fn=mix, inputs=[output_audio, as_audio_no_vocals], outputs=[new_song])
    model_index.change(
        update_model_info,
        inputs=[model_index],
        outputs=[model_info],
        show_progress=False,
    )
    gr.Markdown("### <center>Note❗: Please do not generate content that could harm individuals or organizations; this program is intended for research, learning, and personal entertainment only.</center>")
    gr.Markdown("### <center>🧸 - How to use: enter the video URL and the start/end times, then click the four buttons 'Extract the audio', 'Remove the background music', 'Convert the singing voice!' and 'Add the accompaniment!' in order.</center>")
    gr.HTML('''
        <div class="footer">
            <p>🌊🏞️🎶 - "The river rushes east, its surging voice without end." (Gu Lin, Ming dynasty)
            </p>
        </div>
    ''')
app.queue(
    concurrency_count=1,
    max_size=20,
    api_open=args.api
).launch(server_name="0.0.0.0", show_error=True, debug=True)