import logging

import torch
import torch.nn as nn

# Silence numba's verbose INFO/DEBUG logging.
logging.getLogger("numba").setLevel(logging.WARNING)

from transformers import (
    Wav2Vec2FeatureExtractor,
    HubertModel,
)

import utils

# Path (or Hugging Face repo id) of the chinese-hubert-base checkpoint.
# Callers must set this before instantiating CNHubert / calling get_model().
cnhubert_base_path = None

class CNHubert(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = HubertModel.from_pretrained(cnhubert_base_path)
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            cnhubert_base_path
        )

    def forward(self, x):
        # x: mono waveform tensor sampled at 16 kHz.
        input_values = self.feature_extractor(
            x, return_tensors="pt", sampling_rate=16000
        ).input_values.to(x.device)
        # last_hidden_state: [batch, n_frames, hidden_size]
        feats = self.model(input_values)["last_hidden_state"]
        return feats
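
# Note (informational): HuBERT-base's convolutional front end downsamples
# 16 kHz audio by a factor of 320, so the encoder emits roughly 50 feature
# frames per second, each of hidden size 768. A forward pass on a 1-D
# waveform tensor therefore yields feats of shape [1, n_frames, 768].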

# Alternative feature extractors, kept for reference:
#
# class CNHubertLarge(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.model = HubertModel.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
#         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-hubert-large")
#
#     def forward(self, x):
#         input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
#         feats = self.model(input_values)["last_hidden_state"]
#         return feats
#
#
# class CVec(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.model = HubertModel.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
#         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/vc-webui-big/hubert_base")
#
#     def forward(self, x):
#         input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
#         feats = self.model(input_values)["last_hidden_state"]
#         return feats
#
#
# class cnw2v2base(nn.Module):  # requires: from transformers import Wav2Vec2Model
#     def __init__(self):
#         super().__init__()
#         self.model = Wav2Vec2Model.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
#         self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("/data/docker/liujing04/gpt-vits/chinese-wav2vec2-base")
#
#     def forward(self, x):
#         input_values = self.feature_extractor(x, return_tensors="pt", sampling_rate=16000).input_values.to(x.device)
#         feats = self.model(input_values)["last_hidden_state"]
#         return feats

def get_model():
    model = CNHubert()
    model.eval()
    return model
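
# Usage sketch (illustrative; the checkpoint path below is a placeholder,
# not shipped with this file). From another module, set
# cnhubert.cnhubert_base_path instead of the bare global:
#
#   cnhubert_base_path = "pretrained_models/chinese-hubert-base"
#   model = get_model()
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model = model.to(device)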

# def get_large_model():
#     model = CNHubertLarge()
#     model.eval()
#     return model
#
# def get_model_cvec():
#     model = CVec()
#     model.eval()
#     return model
#
# def get_model_cnw2v2base():
#     model = cnw2v2base()
#     model.eval()
#     return model

def get_content(hmodel, wav_16k_tensor):
    with torch.no_grad():
        feats = hmodel(wav_16k_tensor)
    # Transpose from [batch, n_frames, hidden_size] to
    # [batch, hidden_size, n_frames] for downstream consumers.
    return feats.transpose(1, 2)
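
# Example (sketch): extract features straight from a wav file using librosa
# instead of the project's utils helper. Assumes cnhubert_base_path has
# already been set; "input.wav" is a placeholder filename:
#
#   import librosa
#   wav16k, _ = librosa.load("input.wav", sr=16000)  # mono float32 ndarray
#   wav16k = torch.from_numpy(wav16k)
#   feats = get_content(get_model(), wav16k)  # [1, 768, n_frames]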

if __name__ == "__main__":
    model = get_model()
    src_path = "/Users/Shared/原音频2.wav"
    wav_16k_tensor = utils.load_wav_to_torch_and_resample(src_path, 16000)
    feats = get_content(model, wav_16k_tensor)
    print(feats.shape)