Skip to content

Commit 4e7de67

Browse files
ulatekh, ggerganov
authored and committed
main : add command-style grammar (ggml-org#1998)
* Implemented command-style grammar in the main example. Mostly just copied the relevant parts from the command example.
* main : code style
---------
Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 8ab0ae5 commit 4e7de67

File tree

1 file changed

+54
-4
lines changed

1 file changed

+54
-4
lines changed

examples/main/main.cpp

+54-4
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
#include "common.h"
22

33
#include "whisper.h"
4+
#include "grammar-parser.h"
45

56
#include <cmath>
67
#include <fstream>
@@ -38,9 +39,10 @@ struct whisper_params {
3839
int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
3940
int32_t audio_ctx = 0;
4041

41-
float word_thold = 0.01f;
42-
float entropy_thold = 2.40f;
43-
float logprob_thold = -1.00f;
42+
float word_thold = 0.01f;
43+
float entropy_thold = 2.40f;
44+
float logprob_thold = -1.00f;
45+
float grammar_penalty = 100.0f;
4446

4547
bool speed_up = false;
4648
bool debug_mode = false;
@@ -70,6 +72,8 @@ struct whisper_params {
7072
std::string prompt;
7173
std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
7274
std::string model = "models/ggml-base.en.bin";
75+
std::string grammar;
76+
std::string grammar_rule;
7377

7478
// [TDRZ] speaker turn string
7579
std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
@@ -80,6 +84,8 @@ struct whisper_params {
8084

8185
std::vector<std::string> fname_inp = {};
8286
std::vector<std::string> fname_out = {};
87+
88+
grammar_parser::parse_state grammar_parsed;
8389
};
8490

8591
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -154,6 +160,9 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
154160
else if (arg == "-dtw" || arg == "--dtw") { params.dtw = argv[++i]; }
155161
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
156162
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
163+
else if ( arg == "--grammar") { params.grammar = argv[++i]; }
164+
else if ( arg == "--grammar-rule") { params.grammar_rule = argv[++i]; }
165+
else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(argv[++i]); }
157166
else {
158167
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
159168
whisper_print_usage(argc, argv, params);
@@ -214,6 +223,9 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
214223
fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str());
215224
fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
216225
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
226+
fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str());
227+
fprintf(stderr, " --grammar-rule RULE [%-7s] top-level GBNF grammar rule name\n", params.grammar_rule.c_str());
228+
fprintf(stderr, " --grammar-penalty N [%-7.1f] scales down logits of nongrammar tokens\n", params.grammar_penalty);
217229
fprintf(stderr, "\n");
218230
}
219231

@@ -926,6 +938,29 @@ int main(int argc, char ** argv) {
926938
// initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
927939
whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
928940

941+
if (!params.grammar.empty()) {
942+
auto & grammar = params.grammar_parsed;
943+
if (is_file_exist(params.grammar.c_str())) {
944+
// read grammar from file
945+
std::ifstream ifs(params.grammar.c_str());
946+
const std::string txt = std::string((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
947+
grammar = grammar_parser::parse(txt.c_str());
948+
} else {
949+
// read grammar from string
950+
grammar = grammar_parser::parse(params.grammar.c_str());
951+
}
952+
953+
// will be empty (default) if there are parse errors
954+
if (grammar.rules.empty()) {
955+
fprintf(stderr, "error: failed to parse grammar \"%s\"\n", params.grammar.c_str());
956+
return 4;
957+
} else {
958+
fprintf(stderr, "%s: grammar:\n", __func__);
959+
grammar_parser::print_grammar(stderr, grammar);
960+
fprintf(stderr, "\n");
961+
}
962+
}
963+
929964
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
930965
const auto fname_inp = params.fname_inp[f];
931966
const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
@@ -972,7 +1007,8 @@ int main(int argc, char ** argv) {
9721007
{
9731008
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
9741009

975-
wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
1010+
const bool use_grammar = (!params.grammar_parsed.rules.empty() && !params.grammar_rule.empty());
1011+
wparams.strategy = (params.beam_size > 1 || use_grammar) ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
9761012

9771013
wparams.print_realtime = false;
9781014
wparams.print_progress = params.print_progress;
@@ -1010,6 +1046,20 @@ int main(int argc, char ** argv) {
10101046

10111047
whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
10121048

1049+
const auto & grammar_parsed = params.grammar_parsed;
1050+
auto grammar_rules = grammar_parsed.c_rules();
1051+
1052+
if (use_grammar) {
1053+
if (grammar_parsed.symbol_ids.find(params.grammar_rule) == grammar_parsed.symbol_ids.end()) {
1054+
fprintf(stderr, "%s: warning: grammar rule '%s' not found - skipping grammar sampling\n", __func__, params.grammar_rule.c_str());
1055+
} else {
1056+
wparams.grammar_rules = grammar_rules.data();
1057+
wparams.n_grammar_rules = grammar_rules.size();
1058+
wparams.i_start_rule = grammar_parsed.symbol_ids.at(params.grammar_rule);
1059+
wparams.grammar_penalty = params.grammar_penalty;
1060+
}
1061+
}
1062+
10131063
// this callback is called on each new segment
10141064
if (!wparams.print_realtime) {
10151065
wparams.new_segment_callback = whisper_print_segment_callback;

0 commit comments

Comments
 (0)