Skip to content

Commit a3eef50

Browse files
authored
Merge pull request #80 from chrisju/fix-crash-when-set-big-silence
修复了设置较大min_silence_duration_ms时崩溃的问题
2 parents a7819d0 + c3255e4 commit a3eef50

File tree

1 file changed

+31
-40
lines changed

1 file changed

+31
-40
lines changed

sense-voice/csrc/main.cc

+31 -40
Original file line number | Diff line number | Diff line change
@@ -578,20 +578,8 @@ int main(int argc, char ** argv) {
578578

579579

580580
{
581-
float speech_prob = 0;
582-
silero_vad_encode_internal(*ctx, *ctx->state, chunk, params.n_threads, speech_prob);
583-
if (speech_prob >= params.threshold && temp_end) {
584-
temp_end = 0;
585-
if(next_start < prev_end) next_start = CHUNK_SIZE * i;
586-
}
587-
588-
if (speech_prob >= params.threshold && ! triggered){
589-
triggered = true;
590-
current_speech_start = i;
591-
continue;
592-
}
593581
if (triggered && i - current_speech_start > max_speech_samples) {
594-
if (prev_end){
582+
if (prev_end) {
595583
current_speech_end = prev_end;
596584

597585
// find an endpoint in speech
@@ -605,42 +593,42 @@ int main(int argc, char ** argv) {
605593
current_speech_end = current_speech_start = 0;
606594
if (next_start < prev_end) {
607595
triggered = false;
608-
}else{
596+
} else {
609597
current_speech_start = next_start;
610598
}
611-
// find an endpoint in speech
612-
speech_segment.clear();
613-
speech_segment.assign(pcmf32.begin() + current_speech_start, pcmf32.begin() + current_speech_end);
614-
if (sense_voice_full_parallel(ctx, wparams, speech_segment, speech_segment.size(), params.n_processors) != 0) {
615-
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
616-
return 10;
617-
}
618-
sense_voice_print_output(ctx, true, params.use_itn, false);
619-
current_speech_end = current_speech_start = 0;
620-
prev_end = next_start = temp_end = 0;
599+
prev_end = 0;
600+
}
601+
}
621602

622-
} else {
623-
current_speech_end = i;
624-
prev_end = next_start = temp_end = 0;
625-
triggered = false;
626-
continue;
603+
float speech_prob = 0;
604+
silero_vad_encode_internal(*ctx, *ctx->state, chunk, params.n_threads, speech_prob);
605+
if (speech_prob >= params.threshold) {
606+
if (temp_end) temp_end = 0;
607+
if (next_start < prev_end) next_start = i;
608+
}
627609

628-
}
610+
if (speech_prob >= params.threshold && !triggered) {
611+
triggered = true;
612+
current_speech_start = i;
613+
continue;
629614
}
630615

631-
if (speech_prob < params.neg_threshold && triggered){
632-
if (temp_end == 0){
616+
if (speech_prob < params.neg_threshold && triggered) {
617+
if (temp_end == 0) {
633618
temp_end = i;
634619
}
635620

636621
if (i - temp_end > min_silence_samples_at_max_speech) {
637622
prev_end = temp_end;
623+
} else {
624+
continue;
638625
}
639626

640-
if (i - temp_end < min_silence_samples) {
627+
// TODO min_silence_samples -> max_silence_samples
628+
if (i - prev_end < min_silence_samples) {
641629
continue;
642-
}else{
643-
current_speech_end = temp_end;
630+
} else {
631+
current_speech_end = prev_end;
644632
if (current_speech_end - current_speech_start > min_speech_samples) {
645633
// find an endpoint in speech
646634
speech_segment.clear();
@@ -653,19 +641,22 @@ int main(int argc, char ** argv) {
653641
sense_voice_print_output(ctx, true, params.use_itn, false);
654642
current_speech_end = current_speech_start = 0;
655643
}
656-
prev_end = next_start = temp_end = 0;
644+
prev_end = next_start = 0;
657645
triggered = false;
658646
continue;
659647
}
660648
}
661-
662649
}
663-
664650
}
665651
// last segment speech
666-
if (current_speech_start != 0 && current_speech_end != 0 && pcmf32.size() - current_speech_start > min_speech_samples){
652+
if (triggered && pcmf32.size() - 1 - current_speech_start > min_speech_samples) {
653+
if (temp_end) {
654+
current_speech_end = temp_end;
655+
} else {
656+
current_speech_end = pcmf32.size() - 1;
657+
}
667658
speech_segment.clear();
668-
speech_segment.assign(pcmf32.begin() + current_speech_start, pcmf32.begin() + pcmf32.size());
659+
speech_segment.assign(pcmf32.begin() + current_speech_start, pcmf32.begin() + current_speech_end);
669660
printf("[%.2f-%.2f] ", current_speech_start / (sample_rate * 1.0), current_speech_end / (sample_rate * 1.0));
670661
if (sense_voice_full_parallel(ctx, wparams, speech_segment, speech_segment.size(), params.n_processors) != 0) {
671662
fprintf(stderr, "%s: failed to process audio\n", argv[0]);

0 commit comments

Comments
 (0)