@@ -184,9 +184,10 @@ def load_examples_from_jsonl():
184184
185185# Model configuration
186186SYSTEM_PROMPT = "You are a speech synthesizer that generates natural, realistic, and human-like conversational audio from dialogue text."
187- MODEL_PATH = "fnlp/MOSS-TTSD-v0.5"
188- SPT_CONFIG_PATH = "XY_Tokenizer/config/xy_tokenizer_32k_config.yaml"
189- SPT_CHECKPOINT_PATH = "XY_Tokenizer/weights/xy_tokenizer.ckpt"
187+ MODEL_PATH = "fnlp/MOSS-TTSD-v0.7"
188+ # Align SPT config/weights with CLI inference
189+ SPT_CONFIG_PATH = "XY_Tokenizer/config/MOSS_TTSD_tokenizer.yaml"
190+ SPT_CHECKPOINT_PATH = "XY_Tokenizer/weights/MOSS_TTSD_tokenizer"
190191MAX_CHANNELS = 8
191192
192193# Global variables for caching loaded models
@@ -251,29 +252,29 @@ def process_single_audio_generation(
251252
252253 # Handle different audio input modes (mutually exclusive)
253254 if audio_mode == "Single" :
254- # Use single audio mode
255+ # Strict single-audio requirement
256+ if not prompt_audio_single :
257+ return (
258+ None ,
259+ "Error: In Single mode, please provide one prompt_audio and its text." ,
260+ )
255261 item ["prompt_audio" ] = prompt_audio_single
256262 item ["prompt_text" ] = prompt_text_single
257- elif audio_mode == "Role" and prompt_audio_1 and prompt_audio_2 :
258- # Use role audio mode (requires both audio files)
263+ elif audio_mode == "Role" :
264+ # Strict role-audio requirement: both speakers must be provided
265+ if not (prompt_audio_1 and prompt_audio_2 ):
266+ return (
267+ None ,
268+ "Error: In Role mode, please provide both Role1 and Role2 reference audios." ,
269+ )
259270 item ["prompt_audio_speaker1" ] = prompt_audio_1
260271 item ["prompt_text_speaker1" ] = prompt_text_1 if prompt_text_1 else ""
261272 item ["prompt_audio_speaker2" ] = prompt_audio_2
262273 item ["prompt_text_speaker2" ] = prompt_text_2 if prompt_text_2 else ""
263- elif audio_mode == "Role" and prompt_audio_1 :
264- # Only Role 1 audio provided, treat as single audio
265- print ("Only Role 1 audio provided, treating as single audio." )
266- item ["prompt_audio" ] = prompt_audio_1
267- item ["prompt_text" ] = prompt_text_1 if prompt_text_1 else ""
268- elif audio_mode == "Role" and prompt_audio_2 :
269- # Only Role 2 audio provided, treat as single audio
270- print ("Only Role 2 audio provided, treating as single audio." )
271- item ["prompt_audio" ] = prompt_audio_2
272- item ["prompt_text" ] = prompt_text_2 if prompt_text_2 else ""
273274 else :
274275 return (
275276 None ,
276- "Error: Please select a mode and provide corresponding audio files \n - Single Audio Mode: Provide one audio file and corresponding text \n - Role Mode: Provide audio files for Role1 and Role2 " ,
277+ "Error: Please select an audio input mode ( Single or Role). " ,
277278 )
278279
279280 # Set random seed to ensure reproducible results
@@ -290,6 +291,7 @@ def process_single_audio_generation(
290291 system_prompt = SYSTEM_PROMPT ,
291292 start_idx = 0 ,
292293 use_normalize = use_normalize ,
294+ silence_duration = 0.1 ,
293295 )
294296
295297 # Check results
0 commit comments