PaddlePaddle
diff --git a/‎CONTRIBUTING.md
+2-2 b/‎CONTRIBUTING.md
+2-2
diff --git a/‎csrc/gpu/moe/fused_moe/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_template.h
+1-1 b/‎csrc/gpu/moe/fused_moe/cutlass_kernels/moe_gemm/fused_moe_gemm_kernels_template.h
+1-1
diff --git a/‎paddlenlp/__init__.py
+1-1 b/‎paddlenlp/__init__.py
+1-1
diff --git a/‎paddlenlp/dataaug/char.py
+1-1 b/‎paddlenlp/dataaug/char.py
+1-1
diff --git a/‎paddlenlp/dataaug/sentence.py
+7-7 b/‎paddlenlp/dataaug/sentence.py
+7-7
diff --git a/‎paddlenlp/dataaug/word.py
+2-2 b/‎paddlenlp/dataaug/word.py
+2-2
diff --git a/‎paddlenlp/experimental/autonlp/README_en.md
+2-2 b/‎paddlenlp/experimental/autonlp/README_en.md
+2-2
diff --git a/‎paddlenlp/experimental/autonlp/auto_trainer_base.py
+1-1 b/‎paddlenlp/experimental/autonlp/auto_trainer_base.py
+1-1
diff --git a/‎paddlenlp/experimental/autonlp/text_classification.py
+3-3 b/‎paddlenlp/experimental/autonlp/text_classification.py
+3-3
diff --git a/‎paddlenlp/experimental/faster_tokenizer.py
+3-3 b/‎paddlenlp/experimental/faster_tokenizer.py
+3-3
diff --git a/‎paddlenlp/experimental/transformers/bloom/modeling.py
+1-1 b/‎paddlenlp/experimental/transformers/bloom/modeling.py
+1-1
diff --git a/‎paddlenlp/experimental/transformers/chatglm/modeling.py
+1-1 b/‎paddlenlp/experimental/transformers/chatglm/modeling.py
+1-1
diff --git a/‎paddlenlp/experimental/transformers/deepseek_v2/modeling.py
+1-1 b/‎paddlenlp/experimental/transformers/deepseek_v2/modeling.py
+1-1
diff --git a/‎paddlenlp/experimental/transformers/generation_utils.py
+1-1 b/‎paddlenlp/experimental/transformers/generation_utils.py
+1-1
diff --git a/‎paddlenlp/experimental/transformers/gpt/modeling.py
+1-1 b/‎paddlenlp/experimental/transformers/gpt/modeling.py
+1-1
diff --git a/‎paddlenlp/generation/configuration_utils.py
+1-1 b/‎paddlenlp/generation/configuration_utils.py
+1-1
diff --git a/‎paddlenlp/generation/streamers.py
+1-1 b/‎paddlenlp/generation/streamers.py
+1-1
diff --git a/‎paddlenlp/generation/utils.py
+1-1 b/‎paddlenlp/generation/utils.py
+1-1
diff --git a/‎paddlenlp/metrics/dureader.py
+1-1 b/‎paddlenlp/metrics/dureader.py
+1-1
diff --git a/‎paddlenlp/ops/optimizer/adamwdl.py
+1-1 b/‎paddlenlp/ops/optimizer/adamwdl.py
+1-1
diff --git a/‎paddlenlp/ops/triton_ops/fused_moe.py
+1-1 b/‎paddlenlp/ops/triton_ops/fused_moe.py
+1-1
diff --git a/‎paddlenlp/ops/triton_ops/triton_utils.py
+1-1 b/‎paddlenlp/ops/triton_ops/triton_utils.py
+1-1
diff --git a/‎paddlenlp/peft/lokr/lokr_layers.py
+1-1 b/‎paddlenlp/peft/lokr/lokr_layers.py
+1-1
diff --git a/‎paddlenlp/peft/lora/auto_lora_model.py
+3-3 b/‎paddlenlp/peft/lora/auto_lora_model.py
+3-3
diff --git a/‎paddlenlp/peft/lora/lora_model.py
+1-1 b/‎paddlenlp/peft/lora/lora_model.py
+1-1
diff --git a/‎paddlenlp/peft/vera/vera_model.py
+1-1 b/‎paddlenlp/peft/vera/vera_model.py
+1-1
diff --git a/‎paddlenlp/prompt/template.py
+1-1 b/‎paddlenlp/prompt/template.py
+1-1
diff --git a/‎paddlenlp/rl/trainer/ppo_trainer.py
+1-1 b/‎paddlenlp/rl/trainer/ppo_trainer.py
+1-1
@@ -72,7 +72,7 @@ PaddleNLP 使用 [Git 分支模型](http://nvie.com/posts/a-successful-git-branc
    类似于以下内容：
 
    ```text
-    ➜  (my-virtual-env) git commit -m "commiting my cool feature"
+    ➜  (my-virtual-env) git commit -m "committing my cool feature"
     black....................................................................Passed
     isort....................................................................Passed
     flake8...................................................................Passed
@@ -91,7 +91,7 @@ PaddleNLP 使用 [Git 分支模型](http://nvie.com/posts/a-successful-git-branc
    但大多数时候事情并没有那么顺利。当您的代码或文档不符合标准时，`pre-commit` 检查将失败。
 
    ```text
-    ➜  (my-virtual-env) git commit -m "commiting my cool feature"
+    ➜  (my-virtual-env) git commit -m "committing my cool feature"
     black....................................................................Passed
     isort....................................................................Failed
     - hook id: isort
 
@@ -725,7 +725,7 @@ void MoeGemmRunner<T, WeightType>::run_gemm<EpilogueTag>(
       gemmConfigManager.addBestConfig(gemmId, profile_total_rows, best_config);
       chosen_config = best_config;
     } else {
-      PADDLE_FATAL("[MoE Configure Search] find no one avaliable config.");
+      PADDLE_FATAL("[MoE Configure Search] find no one available config.");
     }
   }
   dispatch_to_arch<EpilogueTag>(A,
 
@@ -35,7 +35,7 @@
 
     logger.warning(
         "Detected that datasets module was imported before paddlenlp. "
-        "This may cause PaddleNLP datasets to be unavalible in intranet. "
+        "This may cause PaddleNLP datasets to be unavailable in intranet. "
         "Please import paddlenlp before datasets module to avoid download issues"
     )
 import paddle
 
@@ -436,7 +436,7 @@ def _augment_single(self, seq_tokens, aug_indexes):
         return sentences
 
     def _generate_sequence(self, output_seq_tokens, aug_tokens, p):
-        """Genearte the sequences according to the mapping list"""
+        """Generate the sequences according to the mapping list"""
         for aug_token in aug_tokens:
             idx, token = aug_token
             if p == 0:
 
@@ -33,9 +33,9 @@
 class SentenceGenerate:
     """
     SentenceGenerate is a sentence-level data augmentation strategy
-    that generates simialr sentences according to the input sequence.
-    The strattegy first generates several sentences, and then chooses
-    the top n simialr sentences by the model.
+    that generates similar sentences according to the input sequence.
+    The strategy first generates several sentences, and then chooses
+    the top n similar sentences by the model.
 
     Args:
         model_name (str):
@@ -82,7 +82,7 @@ def augment(self, sequences):
 
     @paddle.no_grad()
     def _generate_similar_sentence(self, sequence, model, tokenizer):
-        """Generates generate_n similar sentences from the provided sequence, and chooose the best create_n similar sentences."""
+        """Generates generate_n similar sentences from the provided sequence, and choose the best create_n similar sentences."""
 
         # Generate generate_n similar sentences
         generated_sequences = [sequence]
@@ -134,7 +134,7 @@ class SentenceSummarize:
             token probabilities in the "sampling" strategy. Default to 1.0,
             which means no effect.
         use_fp16_decoding: (bool): Whether to use fp16 for decoding.
-            Only works when faster entry is avalible. Default to False.
+            Only works when faster entry is available. Default to False.
         kwargs (dict): Additional keyword arguments refer to ..taskflow.text_summarization.TextSummarization
     """
 
@@ -337,8 +337,8 @@ def _translate(self, model, tokenizer, sequences, lang):
 class SentenceBackTranslateAPI:
     """
     SentenceBackTranslateAPI is a sentence-level data augmentation strategy
-    that translates the input sequence into one langugage, and backtranslate
-    back into the sourche language by baidu translate api.
+    that translates the input sequence into one language, and back-translates
+    it into the source language by the Baidu translate API.
 
     Args:
         src_lang (str):
 
@@ -170,7 +170,7 @@ def _load_substitute_dict(self, source_type):
         return substitute_dict
 
     def _generate_sequence(self, output_seq_tokens, aug_tokens):
-        """Genearte the sequences according to the mapping list"""
+        """Generate the sequences according to the mapping list"""
         for aug_token in aug_tokens:
             idx, token = aug_token
             output_seq_tokens[int(idx)] = token
@@ -503,7 +503,7 @@ def _augment_single(self, seq_tokens, aug_indexes):
         return sentences
 
     def _generate_sequence(self, output_seq_tokens, aug_tokens, p):
-        """Genearte the sequences according to the mapping list"""
+        """Generate the sequences according to the mapping list"""
         for aug_token in aug_tokens:
             idx, token = aug_token
             if p == 0:
 
@@ -50,8 +50,8 @@ Args:
 - text_column (string, required): Name of the column that contains the input text.
 - label_column (string, required): Name of the column that contains the target variable to predict.
 - language (string, required): language of the text
-- metric_for_best_model (string, optional): the name of the metrc for selecting the best model.
-- greater_is_better (bool, optional): Whether better models should have a greater metric or not. Use in conjuction with `metric_for_best_model`.
+- metric_for_best_model (string, optional): the name of the metric for selecting the best model.
+- greater_is_better (bool, optional): Whether better models should have a greater metric or not. Use in conjunction with `metric_for_best_model`.
 - problem_type (str, optional): Select among ["multi_class", "multi_label"] based on the nature of your problem
 - output_dir (str, optional): Output directory for the experiments, defaults to "autpnlp_results"
 - verbosity: (int, optional): controls the verbosity of the run. Defaults to 1, which lets the workers log to the driver. To reduce the amount of logs, use verbosity > 0 to stop the workers from logging to the driver.
 
@@ -45,7 +45,7 @@ class AutoTrainerBase(metaclass=ABCMeta):
         eval_dataset (Dataset, required): Evaluation dataset, must contain the 'text_column' and 'label_column' specified below
         language (string, required): language of the text
         metric_for_best_model (string, optional): the name of the metric for selecting the best model.
-        greater_is_better (bool, required): Whether better models should have a greater metric or not. Use in conjuction with `metric_for_best_model`.
+        greater_is_better (bool, required): Whether better models should have a greater metric or not. Use in conjunction with `metric_for_best_model`.
         output_dir (str, optional): Output directory for the experiments, defaults to "autpnlp_results"
         verbosity: (int, optional): controls the verbosity of the run. Defaults to 1, which lets the workers log to the driver. To reduce the amount of logs,
                 use verbosity > 0 to stop the workers from logging to the driver.
 
@@ -59,8 +59,8 @@ class AutoTrainerForTextClassification(AutoTrainerBase):
         eval_dataset (Dataset, required): Evaluation dataset, must contain the 'text_column' and 'label_column' specified below
         text_column (string, required): Name of the column that contains the input text.
         label_column (string, required): Name of the column that contains the target variable to predict.
-        metric_for_best_model (string, optional): the name of the metrc for selecting the best model. Default to 'eval_accuracy'.
-        greater_is_better (bool, optional): Whether better models should have a greater metric or not. Use in conjuction with `metric_for_best_model`.
+        metric_for_best_model (string, optional): the name of the metric for selecting the best model. Default to 'eval_accuracy'.
+        greater_is_better (bool, optional): Whether better models should have a greater metric or not. Use in conjunction with `metric_for_best_model`.
         problem_type (str, optional): Select among ["multi_class", "multi_label"] based on the nature of your problem
         kwargs (dict, optional): Additional keyword arguments passed along to the specific task.
             language (string, required): language of the text.
@@ -639,7 +639,7 @@ def export(self, export_path: str, trial_id: Optional[str] = None, compress: boo
             f"Taskflow config saved to {export_path}. You can use the Taskflow config to create a Taskflow instance for inference"
         )
 
-        logger.info(f"Exported trial_id: {trial_id} to export_path: {export_path} sucessfully!")
+        logger.info(f"Exported trial_id: {trial_id} to export_path: {export_path} successfully!")
 
         if os.path.exists(self.training_path):
             logger.info("Removing training checkpoints to conserve disk space")
 
@@ -29,10 +29,10 @@
 def to_tensor(string_values, name="text"):
     """
     Create the tensor that the value holds the list of string.
-    NOTICE: The value will be holded in the cpu place.
+    NOTICE: The value will be held in the cpu place.
 
     Args:
-        string_values(list[string]): The value will be setted to the tensor.
+        string_values(list[string]): The value will be set to the tensor.
         name(string): The name of the tensor.
     """
     tensor = paddle.Tensor(core.VarDesc.VarType.STRING, [], name, core.VarDesc.VarType.STRINGS, False)
@@ -43,7 +43,7 @@ def to_tensor(string_values, name="text"):
 def to_vocab_buffer(vocab_dict, name):
     """
     Create the tensor that the value holds the map, the type of key is the string.
-    NOTICE: The value will be holded in the cpu place.
+    NOTICE: The value will be held in the cpu place.
 
     Args:
         vocab_dict(dict): The value will be set to the tensor.
 
@@ -392,7 +392,7 @@ def set_state_dict(self, state_dict, use_structured_name=True):
                 elif k.endswith("mlp.dense_4h_to_h.bias"):
                     self.transformer_block.ffn2_biases[idx].set_value(paddle.to_tensor(v))
                 else:
-                    raise ValueError("Unknow weight {}".format(k))
+                    raise ValueError("Unknown weight {}".format(k))
 
 
 class BloomLMHead(nn.Layer):
 
@@ -481,7 +481,7 @@ def set_state_dict(self, state_dict, use_structured_name=True):
             elif k.endswith("mlp.dense_4h_to_h.bias"):
                 self.transformer_block.ffn2_biases[idx].set_value(v.astype(dtype))
             else:
-                print("Unknow weight {}".format(k))
+                print("Unknown weight {}".format(k))
 
 
 @register_base_model
 
@@ -145,7 +145,7 @@ def forward(
 
 class DeepseekScalingRotaryEmbeddingXPU(nn.Layer):
     """
-    RotaryEmbedding XPU Implemention. In XPU, cos and sin must be computed in cpu.
+    RotaryEmbedding XPU Implementation. In XPU, cos and sin must be computed in cpu.
     """
 
     def __init__(
 
@@ -102,7 +102,7 @@ def to_static(self, output_path: str, config: dict):
             config.get("logits_processors", None),
             precache_input_spec,
         ]
-        # use "==" to distingusih between chatglm and chatglm_v2.
+        # use "==" to distinguish between chatglm and chatglm_v2.
         if self.config["model_type"] and "chatglm" == self.config.model_type.lower():
             input_spec[2] = paddle.static.InputSpec(
                 shape=[None, None, None], dtype="int64", name="position_ids"
 
@@ -432,7 +432,7 @@ def set_state_dict(self, state_dict):
                 elif k.endswith("linear2.bias"):
                     self.transformer_block.ffn2_biases[idx].set_value(v.astype(dtype))
                 else:
-                    raise ValueError("Unknow weight {}".format(k))
+                    raise ValueError("Unknown weight {}".format(k))
 
 
 class GPTForCausalLMInferenceModel(GenerationInferenceModel, GPTPretrainedModel):
 
@@ -121,7 +121,7 @@ class GenerationConfig:
             use_fast: (bool, optional): Whether to use fast entry of model
                 for FastGeneration. Default to False.
             use_fp16_decoding: (bool, optional): Whether to use fp16 for decoding.
-                Only works when fast entry is avalible. Default to False.
+                Only works when fast entry is available. Default to False.
             trunc_input: (bool, optional): Whether to truncate the inputs from
                 output sequences. Default to True.
             model_kwargs (dict): It can be used to specify additional kwargs
 
@@ -149,7 +149,7 @@ def _is_chinese_char(self, cp):
 class TextIteratorStreamer(TextStreamer):
     """
     Streamer that stores print-ready text in a queue, to be used by a downstream application as an iterator. This is
-    useful for applications that benefit from acessing the generated text in a non-blocking way (e.g. in an interactive
+    useful for applications that benefit from accessing the generated text in a non-blocking way (e.g. in an interactive
     Gradio demo).
 
     Parameters:
 
@@ -1223,7 +1223,7 @@ def sample(
             next_tokens = paddle.multinomial(probs)
 
             if self.config.tensor_parallel_degree > 1:
-                # Maybe no need to broadcast if seed is set correclty.
+                # Maybe no need to broadcast if seed is set correctly.
                 from paddle.distributed import fleet
 
                 try:
 
@@ -127,7 +127,7 @@ def compute_predictions(
 
             nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit))
 
-        # if we didn't inlude the empty option in the n-best, inlcude it
+        # if we didn't include the empty option in the n-best, include it
 
         # In very rare edge cases we could have no valid predictions. So we
         # just create a nonce prediction in this case to avoid failure.
 
@@ -188,7 +188,7 @@ def _append_optimize_op(self, block, param_and_grad):
         ratio = self.set_param_lr_fun(param_and_grad[0])
         param_and_grad[0].optimize_attr["learning_rate"] *= ratio
 
-        # excute Adam op
+        # execute Adam op
         res = super(AdamWDL, self)._append_optimize_op(block, param_and_grad)
         param_and_grad[0].optimize_attr["learning_rate"] = prev_lr
         return res
 
@@ -934,7 +934,7 @@ def get_default_config(
                     "num_stages": 4,
                 }
         else:
-            # Block-wise quant: BLOCK_SIZE_K must be divisable by block_shape[1]
+            # Block-wise quant: BLOCK_SIZE_K must be divisible by block_shape[1]
             config = {
                 "BLOCK_SIZE_M": 64,
                 "BLOCK_SIZE_N": block_shape[0],
 
@@ -98,7 +98,7 @@ def extract_triton_kernel(kernel, file_name):
     elif type(kernel) == triton.runtime.autotuner.Autotuner:
         fn = kernel.fn.fn
     else:
-        AssertionError("error occures")
+        AssertionError("error occurs")
     py_script = textwrap.dedent(inspect.getsource(fn))
 
     # @triton.jit must only appear once
 
@@ -196,7 +196,7 @@ def factorization(dimension: int, factor: int = -1) -> Tuple[int, int]:
     In LoRA with Kronecker Product, first value is a value for weight scale.
     second value is a value for weight.
 
-    Becuase of non-commutative property, A⊗B ≠ B⊗A. Meaning of two matrices is slightly different.
+    Because of non-commutative property, A⊗B ≠ B⊗A. Meaning of two matrices is slightly different.
 
     examples)
     factor
 
@@ -318,7 +318,7 @@ def save_pretrained(self, save_directory: str, merge_tensor_parallel: bool = Fal
         if self.is_pipelinemodel and merge_tensor_parallel and self.lora_config.tensor_parallel_degree > 1:
             merge_tensor_parallel = False
             logger.warning(
-                "Pipeline parallism does not support merge_tensor_parallel. Set merge_tensor_parallel to False."
+                "Pipeline parallelism does not support merge_tensor_parallel. Set merge_tensor_parallel to False."
             )
 
         variant = kwargs.get("variant", None)
@@ -580,7 +580,7 @@ def merge_auto_dist_configs(self, configs):
                     for k, v in config["mp_config"]["parallelize_plan"].items():
                         assert (
                             k not in final_config["mp_config"]["parallelize_plan"].keys()
-                        ), f"sublayer mp_config shuld be a subset of model but got sublayer config {config['mp_config']} and model config {final_config['mp_config']}."
+                        ), f"sublayer mp_config should be a subset of model but got sublayer config {config['mp_config']} and model config {final_config['mp_config']}."
                         final_config["mp_config"]["parallelize_plan"][k] = v
             if "sp_config" in config and config["sp_config"] is not None:
                 if final_config["sp_config"] is None:
@@ -589,7 +589,7 @@ def merge_auto_dist_configs(self, configs):
                     for k, v in config["sp_config"]["parallelize_plan"].items():
                         assert (
                             k not in final_config["sp_config"]["parallelize_plan"].keys()
-                        ), f"sublayer sp_config shuld be a subset of model but got sublayer config {config['sp_config']} and model config {final_config['sp_config']}."
+                        ), f"sublayer sp_config should be a subset of model but got sublayer config {config['sp_config']} and model config {final_config['sp_config']}."
                         final_config["sp_config"]["parallelize_plan"][k] = v
             if "pp_config" in config and config["pp_config"] is not None:
 
 
@@ -420,7 +420,7 @@ def save_pretrained(self, save_directory: str, merge_tensor_parallel: bool = Fal
         if self.is_pipelinemodel and merge_tensor_parallel and self.lora_config.tensor_parallel_degree > 1:
             merge_tensor_parallel = False
             logger.warning(
-                "Pipeline parallism does not support merge_tensor_parallel. Set merge_tensor_parallel to False."
+                "Pipeline parallelism does not support merge_tensor_parallel. Set merge_tensor_parallel to False."
             )
 
         variant = kwargs.get("variant", None)
 
@@ -111,7 +111,7 @@ def save_pretrained(self, save_directory: str, merge_tensor_parallel: bool = Fal
         if self.is_pipelinemodel and merge_tensor_parallel and self.vera_config.tensor_parallel_degree > 1:
             merge_tensor_parallel = False
             logger.warning(
-                "Pipeline parallism does not support merge_tensor_parallel. Set merge_tensor_parallel to False."
+                "Pipeline parallelism does not support merge_tensor_parallel. Set merge_tensor_parallel to False."
             )
 
         variant = kwargs.get("variant", None)
 
@@ -503,7 +503,7 @@ def parse_soft_prompt(self):
         continuous token id sequence for each part in template.
 
         Returns:
-            `List[Dict[str, str]]`: Template with continuous prompt formated as {"soft": "xxx"}.
+            `List[Dict[str, str]]`: Template with continuous prompt formatted as {"soft": "xxx"}.
             `Tuple[Dict[int, int], List[List[int]], int]`:
                 - Mapping from continuous ids to word ids for initialization.
                 - Continuous ids for each part. Id 0 denotes non-continuous part.
 
@@ -1208,7 +1208,7 @@ def _balance_batch(self, micro_batches):
         if dp_degree * sharding_degree == 1:
             return micro_batches
 
-        # otherwise, need to balance batch accross DP and Sharding groups
+        # otherwise, need to balance batch across DP and Sharding groups
         try:
             hcg = fleet.get_hybrid_communicate_group()
             sharding_parallel_group = hcg.get_sharding_parallel_group()
Original file line number	Diff line number	Diff line change
`@@ -725,7 +725,7 @@ void MoeGemmRunner<T, WeightType>::run_gemm<EpilogueTag>(`
`725`	`725`	`gemmConfigManager.addBestConfig(gemmId, profile_total_rows, best_config);`
`726`	`726`	`chosen_config = best_config;`
`727`	`727`	`} else {`
`728`		`- PADDLE_FATAL("[MoE Configure Search] find no one avaliable config.");`
	`728`	`+ PADDLE_FATAL("[MoE Configure Search] find no one available config.");`
`729`	`729`	`}`
`730`	`730`	`}`
`731`	`731`	`dispatch_to_arch<EpilogueTag>(A,`
Original file line number	Diff line number	Diff line change
`@@ -35,7 +35,7 @@`
`35`	`35`
`36`	`36`	`logger.warning(`
`37`	`37`	`"Detected that datasets module was imported before paddlenlp. "`
`38`		`- "This may cause PaddleNLP datasets to be unavalible in intranet. "`
	`38`	`+ "This may cause PaddleNLP datasets to be unavailable in intranet. "`
`39`	`39`	`"Please import paddlenlp before datasets module to avoid download issues"`
`40`	`40`	`)`
`41`	`41`	`import paddle`
Original file line number	Diff line number	Diff line change
`@@ -102,7 +102,7 @@ def to_static(self, output_path: str, config: dict):`
`102`	`102`	`config.get("logits_processors", None),`
`103`	`103`	`precache_input_spec,`
`104`	`104`	`]`
`105`		`- # use "==" to distingusih between chatglm and chatglm_v2.`
	`105`	`+ # use "==" to distinguish between chatglm and chatglm_v2.`
`106`	`106`	`if self.config["model_type"] and "chatglm" == self.config.model_type.lower():`
`107`	`107`	`input_spec[2] = paddle.static.InputSpec(`
`108`	`108`	`shape=[None, None, None], dtype="int64", name="position_ids"`
Original file line number	Diff line number	Diff line change
`@@ -934,7 +934,7 @@ def get_default_config(`
`934`	`934`	`"num_stages": 4,`
`935`	`935`	`}`
`936`	`936`	`else:`
`937`		`- # Block-wise quant: BLOCK_SIZE_K must be divisable by block_shape[1]`
	`937`	`+ # Block-wise quant: BLOCK_SIZE_K must be divisible by block_shape[1]`
`938`	`938`	`config = {`
`939`	`939`	`"BLOCK_SIZE_M": 64,`
`940`	`940`	`"BLOCK_SIZE_N": block_shape[0],`
Original file line number	Diff line number	Diff line change
`@@ -420,7 +420,7 @@ def save_pretrained(self, save_directory: str, merge_tensor_parallel: bool = Fal`
`420`	`420`	`if self.is_pipelinemodel and merge_tensor_parallel and self.lora_config.tensor_parallel_degree > 1:`
`421`	`421`	`merge_tensor_parallel = False`
`422`	`422`	`logger.warning(`
`423`		`- "Pipeline parallism does not support merge_tensor_parallel. Set merge_tensor_parallel to False."`
	`423`	`+ "Pipeline parallelism does not support merge_tensor_parallel. Set merge_tensor_parallel to False."`
`424`	`424`	`)`
`425`	`425`
`426`	`426`	`variant = kwargs.get("variant", None)`
Original file line number	Diff line number	Diff line change
`@@ -111,7 +111,7 @@ def save_pretrained(self, save_directory: str, merge_tensor_parallel: bool = Fal`
`111`	`111`	`if self.is_pipelinemodel and merge_tensor_parallel and self.vera_config.tensor_parallel_degree > 1:`
`112`	`112`	`merge_tensor_parallel = False`
`113`	`113`	`logger.warning(`
`114`		`- "Pipeline parallism does not support merge_tensor_parallel. Set merge_tensor_parallel to False."`
	`114`	`+ "Pipeline parallelism does not support merge_tensor_parallel. Set merge_tensor_parallel to False."`
`115`	`115`	`)`
`116`	`116`
`117`	`117`	`variant = kwargs.get("variant", None)`