app.py

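"""Gradio demo for DreamTalk: generates an emotion-controlled, lip-synced
talking-head video from a portrait image, a driving audio clip, and a 3DMM
emotional style clip, then re-encodes the result for browser playback."""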
import gradio as gr
import subprocess
from moviepy.editor import VideoFileClip
import datetime


def convert_to_mp4_with_aac(input_path, output_path):
    # Load the video
    video = VideoFileClip(input_path)
    # Re-encode as MP4 with H.264 video and AAC audio for browser compatibility
    video.write_videofile(output_path, codec="libx264", audio_codec="aac")
    return output_path


# Check whether the audio file path exists in the list
def check_file_exists(file_path, audio_list):
    return file_path in audio_list


def load_audio(audio_listed):
    # Map a dropdown selection to its path on disk
    if audio_listed is None:
        return None
    return f"data/audio/{audio_listed}"


def execute_command(command: list) -> None:
    subprocess.run(command, check=True)


def infer(audio_input, image_path, emotional_style):
    # Timestamp the output name so repeated runs do not overwrite each other
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_name = f"lipsynced_result_{timestamp}"

    command = [
        "python",
        "inference_for_demo_video.py",
        f"--wav_path={audio_input}",
        f"--style_clip_path=data/style_clip/3DMM/{emotional_style}",
        "--pose_path=data/pose/RichardShelby_front_neutral_level1_001.mat",
        f"--image_path={image_path}",
        "--cfg_scale=1.0",
        "--max_gen_len=30",
        f"--output_name={output_name}",
    ]
    execute_command(command)

    # Convert the generated video to browser-compatible codecs
    input_file = f"output_video/{output_name}.mp4"
    output_file = f"{output_name}.mp4"
    result = convert_to_mp4_with_aac(input_file, output_file)
    return result
css = """
#col-container {
    margin: 0 auto;
    max-width: 940px;
}
#project-links {
    margin: 0 0 12px !important;
    column-gap: 8px;
    display: flex;
    justify-content: center;
    flex-wrap: nowrap;
    flex-direction: row;
    align-items: center;
}
#run-btn {
    border: var(--button-border-width) solid var(--button-primary-border-color);
    background: var(--button-primary-background-fill);
    color: var(--button-primary-text-color);
}
#run-btn:hover {
    border-color: var(--button-primary-border-color-hover);
    background: var(--button-primary-background-fill-hover);
    color: var(--button-primary-text-color-hover);
}
"""
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
            <h2 style="text-align: center;">DreamTalk</h2>
            <p style="text-align: center;">When Expressive Talking Head Generation Meets Diffusion Probabilistic Models</p>
            <p style="margin:12px auto;display: flex;justify-content: center;">
                <a href="https://huggingface.co/spaces/fffiloni/dreamtalk?duplicate=true"><img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-lg.svg" alt="Duplicate this Space"></a>
            </p>
        """)
        with gr.Row():
            with gr.Column():
                image_path = gr.Image(label="Image", type="filepath", sources=["upload"])
                audio_input = gr.Audio(label="Audio input", type="filepath", sources=["upload"], value="data/audio/acknowledgement_english.m4a")
                with gr.Row():
                    audio_list = gr.Dropdown(
                        label="Choose an audio (optional)",
                        choices=[
                            "German1.wav", "German2.wav", "German3.wav", "German4.wav",
                            "acknowledgement_chinese.m4a", "acknowledgement_english.m4a",
                            "chinese1_haierlizhi.wav", "chinese2_guanyu.wav",
                            "french1.wav", "french2.wav", "french3.wav",
                            "italian1.wav", "italian2.wav", "italian3.wav",
                            "japan1.wav", "japan2.wav", "japan3.wav",
                            "korean1.wav", "korean2.wav", "korean3.wav",
                            "noisy_audio_cafeter_snr_0.wav", "noisy_audio_meeting_snr_0.wav",
                            "noisy_audio_meeting_snr_10.wav", "noisy_audio_meeting_snr_20.wav",
                            "noisy_audio_narrative.wav", "noisy_audio_office_snr_0.wav",
                            "out_of_domain_narrative.wav",
                            "spanish1.wav", "spanish2.wav", "spanish3.wav"
                        ],
                        value="acknowledgement_english.m4a"
                    )
                    audio_list.change(
                        fn=load_audio,
                        inputs=[audio_list],
                        outputs=[audio_input]
                    )
                    emotional_style = gr.Dropdown(
                        label="emotional style",
                        choices=[
                            "M030_front_angry_level3_001.mat",
                            "M030_front_contempt_level3_001.mat",
                            "M030_front_disgusted_level3_001.mat",
                            "M030_front_fear_level3_001.mat",
                            "M030_front_happy_level3_001.mat",
                            "M030_front_neutral_level1_001.mat",
                            "M030_front_sad_level3_001.mat",
                            "M030_front_surprised_level3_001.mat",
                            "W009_front_angry_level3_001.mat",
                            "W009_front_contempt_level3_001.mat",
                            "W009_front_disgusted_level3_001.mat",
                            "W009_front_fear_level3_001.mat",
                            "W009_front_happy_level3_001.mat",
                            "W009_front_neutral_level1_001.mat",
                            "W009_front_sad_level3_001.mat",
                            "W009_front_surprised_level3_001.mat",
                            "W011_front_angry_level3_001.mat",
                            "W011_front_contempt_level3_001.mat",
                            "W011_front_disgusted_level3_001.mat",
                            "W011_front_fear_level3_001.mat",
                            "W011_front_happy_level3_001.mat",
                            "W011_front_neutral_level1_001.mat",
                            "W011_front_sad_level3_001.mat",
                            "W011_front_surprised_level3_001.mat"
                        ],
                        value="M030_front_neutral_level1_001.mat"
                    )
                gr.Examples(
                    examples=[
                        "data/src_img/uncropped/face3.png",
                        "data/src_img/uncropped/male_face.png",
                        "data/src_img/uncropped/uncut_src_img.jpg",
                        "data/src_img/cropped/chpa5.png",
                        "data/src_img/cropped/cut_img.png",
                        "data/src_img/cropped/f30.png",
                        "data/src_img/cropped/menglu2.png",
                        "data/src_img/cropped/nscu2.png",
                        "data/src_img/cropped/zp1.png",
                        "data/src_img/cropped/zt12.png"
                    ],
                    inputs=[image_path],
                    examples_per_page=5
                )
                with gr.Row():
                    gr.ClearButton([audio_input, image_path, audio_list])
                    run_btn = gr.Button("Run", elem_id="run-btn")
            with gr.Column():
                output_video = gr.Video(format="mp4")
                gr.HTML("""
                    <p id="project-links" align="center">
                        <a href='https://dreamtalk-project.github.io/'><img src='https://img.shields.io/badge/Project-Page-Green'></a>
                        <a href='https://arxiv.org/abs/2312.09767'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
                        <a href='https://youtu.be/VF4vlE6ZqWQ'><img src='https://badges.aleen42.com/src/youtube.svg'></a>
                    </p>
                    <img src="https://github.com/ali-vilab/dreamtalk/raw/main/media/teaser.gif" style="margin: 0 auto;border-radius: 10px;" />
                """)
    run_btn.click(
        fn=infer,
        inputs=[audio_input, image_path, emotional_style],
        outputs=[output_video]
    )

demo.queue(max_size=20).launch(server_name="0.0.0.0", debug=True)