@@ -578,20 +578,8 @@ int main(int argc, char ** argv) {
578
578
579
579
580
580
{
581
- float speech_prob = 0 ;
582
- silero_vad_encode_internal (*ctx, *ctx->state , chunk, params.n_threads , speech_prob);
583
- if (speech_prob >= params.threshold && temp_end) {
584
- temp_end = 0 ;
585
- if (next_start < prev_end) next_start = CHUNK_SIZE * i;
586
- }
587
-
588
- if (speech_prob >= params.threshold && ! triggered){
589
- triggered = true ;
590
- current_speech_start = i;
591
- continue ;
592
- }
593
581
if (triggered && i - current_speech_start > max_speech_samples) {
594
- if (prev_end){
582
+ if (prev_end) {
595
583
current_speech_end = prev_end;
596
584
597
585
// find an endpoint in speech
@@ -605,42 +593,42 @@ int main(int argc, char ** argv) {
605
593
current_speech_end = current_speech_start = 0 ;
606
594
if (next_start < prev_end) {
607
595
triggered = false ;
608
- }else {
596
+ } else {
609
597
current_speech_start = next_start;
610
598
}
611
- // find an endpoint in speech
612
- speech_segment.clear ();
613
- speech_segment.assign (pcmf32.begin () + current_speech_start, pcmf32.begin () + current_speech_end);
614
- if (sense_voice_full_parallel (ctx, wparams, speech_segment, speech_segment.size (), params.n_processors ) != 0 ) {
615
- fprintf (stderr, " %s: failed to process audio\n " , argv[0 ]);
616
- return 10 ;
617
- }
618
- sense_voice_print_output (ctx, true , params.use_itn , false );
619
- current_speech_end = current_speech_start = 0 ;
620
- prev_end = next_start = temp_end = 0 ;
599
+ prev_end = 0 ;
600
+ }
601
+ }
621
602
622
- } else {
623
- current_speech_end = i;
624
- prev_end = next_start = temp_end = 0 ;
625
- triggered = false ;
626
- continue ;
603
+ float speech_prob = 0 ;
604
+ silero_vad_encode_internal (*ctx, *ctx->state , chunk, params.n_threads , speech_prob);
605
+ if (speech_prob >= params.threshold ) {
606
+ if (temp_end) temp_end = 0 ;
607
+ if (next_start < prev_end) next_start = i;
608
+ }
627
609
628
- }
610
+ if (speech_prob >= params.threshold && !triggered) {
611
+ triggered = true ;
612
+ current_speech_start = i;
613
+ continue ;
629
614
}
630
615
631
- if (speech_prob < params.neg_threshold && triggered){
632
- if (temp_end == 0 ){
616
+ if (speech_prob < params.neg_threshold && triggered) {
617
+ if (temp_end == 0 ) {
633
618
temp_end = i;
634
619
}
635
620
636
621
if (i - temp_end > min_silence_samples_at_max_speech) {
637
622
prev_end = temp_end;
623
+ } else {
624
+ continue ;
638
625
}
639
626
640
- if (i - temp_end < min_silence_samples) {
627
+ // TODO min_silence_samples -> max_silence_samples
628
+ if (i - prev_end < min_silence_samples) {
641
629
continue ;
642
- }else {
643
- current_speech_end = temp_end ;
630
+ } else {
631
+ current_speech_end = prev_end ;
644
632
if (current_speech_end - current_speech_start > min_speech_samples) {
645
633
// find an endpoint in speech
646
634
speech_segment.clear ();
@@ -653,19 +641,22 @@ int main(int argc, char ** argv) {
653
641
sense_voice_print_output (ctx, true , params.use_itn , false );
654
642
current_speech_end = current_speech_start = 0 ;
655
643
}
656
- prev_end = next_start = temp_end = 0 ;
644
+ prev_end = next_start = 0 ;
657
645
triggered = false ;
658
646
continue ;
659
647
}
660
648
}
661
-
662
649
}
663
-
664
650
}
665
651
// last segment speech
666
- if (current_speech_start != 0 && current_speech_end != 0 && pcmf32.size () - current_speech_start > min_speech_samples){
652
+ if (triggered && pcmf32.size () - 1 - current_speech_start > min_speech_samples) {
653
+ if (temp_end) {
654
+ current_speech_end = temp_end;
655
+ } else {
656
+ current_speech_end = pcmf32.size () - 1 ;
657
+ }
667
658
speech_segment.clear ();
668
- speech_segment.assign (pcmf32.begin () + current_speech_start, pcmf32.begin () + pcmf32. size () );
659
+ speech_segment.assign (pcmf32.begin () + current_speech_start, pcmf32.begin () + current_speech_end );
669
660
printf (" [%.2f-%.2f] " , current_speech_start / (sample_rate * 1.0 ), current_speech_end / (sample_rate * 1.0 ));
670
661
if (sense_voice_full_parallel (ctx, wparams, speech_segment, speech_segment.size (), params.n_processors ) != 0 ) {
671
662
fprintf (stderr, " %s: failed to process audio\n " , argv[0 ]);
0 commit comments