diffusion_cond.json 4.1 KB

{
  "model_type": "diffusion_cond",
  "sample_size": 2097152,
  "sample_rate": 44100,
  "audio_channels": 2,
  "model": {
    "pretransform": {
      "type": "autoencoder",
      "iterate_batch": true,
      "config": {
        "encoder": {
          "type": "oobleck",
          "requires_grad": false,
          "config": {
            "in_channels": 2,
            "channels": 128,
            "c_mults": [1, 2, 4, 8, 16],
            "strides": [2, 4, 4, 8, 8],
            "latent_dim": 128,
            "use_snake": true
          }
        },
        "decoder": {
          "type": "oobleck",
          "config": {
            "out_channels": 2,
            "channels": 128,
            "c_mults": [1, 2, 4, 8, 16],
            "strides": [2, 4, 4, 8, 8],
            "latent_dim": 64,
            "use_snake": true,
            "final_tanh": false
          }
        },
        "bottleneck": {
          "type": "vae"
        },
        "latent_dim": 64,
        "downsampling_ratio": 2048,
        "io_channels": 2
      }
    },
    "conditioning": {
      "configs": [
        {
          "id": "prompt",
          "type": "t5",
          "config": {
            "t5_model_name": "t5-base",
            "max_length": 128
          }
        },
        {
          "id": "seconds_start",
          "type": "number",
          "config": {
            "min_val": 0,
            "max_val": 512
          }
        },
        {
          "id": "seconds_total",
          "type": "number",
          "config": {
            "min_val": 0,
            "max_val": 512
          }
        }
      ],
      "cond_dim": 768
    },
    "diffusion": {
      "cross_attention_cond_ids": ["prompt", "seconds_start", "seconds_total"],
      "global_cond_ids": ["seconds_start", "seconds_total"],
      "type": "dit",
      "config": {
        "io_channels": 64,
        "embed_dim": 1536,
        "depth": 24,
        "num_heads": 24,
        "cond_token_dim": 768,
        "global_cond_dim": 1536,
        "project_cond_tokens": false,
        "transformer_type": "continuous_transformer"
      }
    },
    "io_channels": 64
  },
  "training": {
    "use_ema": true,
    "log_loss_info": false,
    "optimizer_configs": {
      "diffusion": {
        "optimizer": {
          "type": "AdamW",
          "config": {
            "lr": 5e-5,
            "betas": [0.9, 0.999],
            "weight_decay": 1e-3
          }
        },
        "scheduler": {
          "type": "InverseLR",
          "config": {
            "inv_gamma": 1000000,
            "power": 0.5,
            "warmup": 0.99
          }
        }
      }
    },
    "demo": {
      "demo_every": 2000,
      "demo_steps": 250,
      "num_demos": 4,
      "demo_cond": [
        {"prompt": "Amen break 174 BPM", "seconds_start": 0, "seconds_total": 12},
        {"prompt": "A beautiful orchestral symphony, classical music", "seconds_start": 0, "seconds_total": 160},
        {"prompt": "Chill hip-hop beat, chillhop", "seconds_start": 0, "seconds_total": 190},
        {"prompt": "A pop song about love and loss", "seconds_start": 0, "seconds_total": 180}
      ],
      "demo_cfg_scales": [3, 6, 9]
    }
  }
}