"""CosyVoice zero-shot inference demo script.

Builds a ``CosyVoice`` instance from a local checkpoint directory, prints the
result of ``list_avaliable_spks()``, loads a reference clip with ``load_wav``,
runs ``inference_zero_shot`` and saves the ``'tts_speech'`` entry of every
yielded item to ``zero_shot_<index>.wav`` in the current working directory.
"""
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
import torchaudio

# Machine-specific checkpoint location; edit to match the local download.
MODEL_DIR = r'D:\AI\download\CosyVoice-300M'

# Reference clip handed to inference_zero_shot as the prompt speech.
PROMPT_WAV = 'vocal_3.mp3_10.wav_0006151680_0006360320.wav'

# Second argument of load_wav; the variable name below indicates 16 kHz.
PROMPT_SAMPLE_RATE = 16000

# Sample rate passed to torchaudio.save for the generated audio.
# NOTE(review): presumably the model's native output rate -- confirm against
# the installed CosyVoice version before changing it.
OUTPUT_SAMPLE_RATE = 22050

# First and second positional arguments of inference_zero_shot.
# NOTE(review): per the upstream README these are the text to synthesize and
# the transcript of the prompt clip, respectively -- confirm.
TTS_TEXT = '收到好友从远方寄来的生日礼物,那份意外的惊喜'
PROMPT_TEXT = '可以动动你的小手点个关注,感谢各位好哥哥,如果之后有新消息,我还会在更新呢。'

cosyvoice = CosyVoice(MODEL_DIR, load_jit=True, load_onnx=False, fp16=True)

# The spelling "avaliable" is the method name exactly as this script calls it;
# do not "correct" it without checking the installed CosyVoice version.
print(cosyvoice.list_avaliable_spks())

prompt_speech_16k = load_wav(PROMPT_WAV, PROMPT_SAMPLE_RATE)

# inference_zero_shot is iterated, so each yielded item is written to its own
# numbered file rather than being concatenated.
segments = cosyvoice.inference_zero_shot(TTS_TEXT, PROMPT_TEXT, prompt_speech_16k, stream=False)
for index, segment in enumerate(segments):
    torchaudio.save(f'zero_shot_{index}.wav', segment['tts_speech'], OUTPUT_SAMPLE_RATE)

# change stream=True for chunk stream inference
# out = cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)
# torchaudio.save('sft_0.wav', out['tts_speech'], 22050)

# for i, j in enumerate(cosyvoice.inference_sft('你好,我是通义生成式语音大模型,请问有什么可以帮您的吗?', '中文女', stream=False)):
#     torchaudio.save('sft_{}.wav'.format(i), j['tts_speech'], 22050)

# cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-25Hz') # or change to pretrained_models/CosyVoice-300M for 50Hz inference
# # zero_shot usage, <|zh|><|en|><|jp|><|yue|><|ko|> for Chinese/English/Japanese/Cantonese/Korean
# prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
# for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=False)):
#     torchaudio.save('zero_shot_{}.wav'.format(i), j['tts_speech'], 22050)
# # cross_lingual usage
# prompt_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
# for i, j in enumerate(cosyvoice.inference_cross_lingual('<|en|>And then later on, fully acquiring that company. So keeping management in line, interest in line with the asset that\'s coming into the family is a reason why sometimes we don\'t buy the whole thing.', prompt_speech_16k, stream=False)):
#     torchaudio.save('cross_lingual_{}.wav'.format(i), j['tts_speech'], 22050)
# # vc usage
# prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
# source_speech_16k = load_wav('cross_lingual_prompt.wav', 16000)
# for i, j in enumerate(cosyvoice.inference_vc(source_speech_16k, prompt_speech_16k, stream=False)):
#     torchaudio.save('vc_{}.wav'.format(i), j['tts_speech'], 22050)

# cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
# # instruct usage, support <laughter></laughter><strong></strong>[laughter][breath]
# for i, j in enumerate(cosyvoice.inference_instruct('在面对挑战时,他展现了非凡的<strong>勇气</strong>与<strong>智慧</strong>。', '中文男', 'Theo \'Crimson\', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.', stream=False)):
#     torchaudio.save('instruct_{}.wav'.format(i), j['tts_speech'], 22050)