diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml
index cd345d3d3..54a42e3cb 100644
--- a/.github/workflows/doc-build.yml
+++ b/.github/workflows/doc-build.yml
@@ -12,6 +12,7 @@ jobs:
     uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
     with:
       commit_sha: ${{ github.sha }}
+      languages: en zh
       package: lighteval
     secrets:
       token: ${{ secrets.HUGGINGFACE_PUSH }}
diff --git a/.github/workflows/doc-pr-build.yml b/.github/workflows/doc-pr-build.yml
index f96e20583..a8e5e9b2c 100644
--- a/.github/workflows/doc-pr-build.yml
+++ b/.github/workflows/doc-pr-build.yml
@@ -14,3 +14,5 @@ jobs:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: lighteval
+      languages: en zh
+
diff --git a/.github/workflows/doc-pr-upload.yml b/.github/workflows/doc-pr-upload.yml
index ab6f32d7a..2b450ea1b 100644
--- a/.github/workflows/doc-pr-upload.yml
+++ b/.github/workflows/doc-pr-upload.yml
@@ -11,6 +11,7 @@ jobs:
     uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
     with:
       package_name: lighteval
+      languages: en zh
     secrets:
       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
       comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
diff --git a/docs/source/_toctree.yml b/docs/source/en/_toctree.yml
similarity index 100%
rename from docs/source/_toctree.yml
rename to docs/source/en/_toctree.yml
diff --git a/docs/source/adding-a-custom-task.mdx b/docs/source/en/adding-a-custom-task.mdx
similarity index 100%
rename from docs/source/adding-a-custom-task.mdx
rename to docs/source/en/adding-a-custom-task.mdx
diff --git a/docs/source/adding-a-new-metric.mdx b/docs/source/en/adding-a-new-metric.mdx
similarity index 100%
rename from docs/source/adding-a-new-metric.mdx
rename to docs/source/en/adding-a-new-metric.mdx
diff --git a/docs/source/available-tasks.mdx b/docs/source/en/available-tasks.mdx
similarity index 100%
rename from docs/source/available-tasks.mdx
rename to docs/source/en/available-tasks.mdx
diff --git a/docs/source/contributing-to-multilingual-evaluations.mdx b/docs/source/en/contributing-to-multilingual-evaluations.mdx
similarity index 100%
rename from docs/source/contributing-to-multilingual-evaluations.mdx
rename to docs/source/en/contributing-to-multilingual-evaluations.mdx
diff --git a/docs/source/evaluating-a-custom-model.mdx b/docs/source/en/evaluating-a-custom-model.mdx
similarity index 100%
rename from docs/source/evaluating-a-custom-model.mdx
rename to docs/source/en/evaluating-a-custom-model.mdx
diff --git a/docs/source/index.mdx b/docs/source/en/index.mdx
similarity index 100%
rename from docs/source/index.mdx
rename to docs/source/en/index.mdx
diff --git a/docs/source/installation.mdx b/docs/source/en/installation.mdx
similarity index 100%
rename from docs/source/installation.mdx
rename to docs/source/en/installation.mdx
diff --git a/docs/source/metric-list.mdx b/docs/source/en/metric-list.mdx
similarity index 100%
rename from docs/source/metric-list.mdx
rename to docs/source/en/metric-list.mdx
diff --git a/docs/source/package_reference/evaluation_tracker.mdx b/docs/source/en/package_reference/evaluation_tracker.mdx
similarity index 100%
rename from docs/source/package_reference/evaluation_tracker.mdx
rename to docs/source/en/package_reference/evaluation_tracker.mdx
diff --git a/docs/source/package_reference/logging.mdx b/docs/source/en/package_reference/logging.mdx
similarity index 100%
rename from docs/source/package_reference/logging.mdx
rename to docs/source/en/package_reference/logging.mdx
diff --git a/docs/source/package_reference/metrics.mdx b/docs/source/en/package_reference/metrics.mdx
similarity index 100%
rename from docs/source/package_reference/metrics.mdx
rename to docs/source/en/package_reference/metrics.mdx
diff --git a/docs/source/package_reference/models.mdx b/docs/source/en/package_reference/models.mdx
similarity index 100%
rename from docs/source/package_reference/models.mdx
rename to docs/source/en/package_reference/models.mdx
diff --git a/docs/source/package_reference/pipeline.mdx b/docs/source/en/package_reference/pipeline.mdx
similarity index 100%
rename from docs/source/package_reference/pipeline.mdx
rename to docs/source/en/package_reference/pipeline.mdx
diff --git a/docs/source/package_reference/tasks.mdx b/docs/source/en/package_reference/tasks.mdx
similarity index 100%
rename from docs/source/package_reference/tasks.mdx
rename to docs/source/en/package_reference/tasks.mdx
diff --git a/docs/source/quicktour.mdx b/docs/source/en/quicktour.mdx
similarity index 100%
rename from docs/source/quicktour.mdx
rename to docs/source/en/quicktour.mdx
diff --git a/docs/source/saving-and-reading-results.mdx b/docs/source/en/saving-and-reading-results.mdx
similarity index 100%
rename from docs/source/saving-and-reading-results.mdx
rename to docs/source/en/saving-and-reading-results.mdx
diff --git a/docs/source/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx b/docs/source/en/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx
similarity index 100%
rename from docs/source/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx
rename to docs/source/en/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx
diff --git a/docs/source/use-inference-providers-as-backend.mdx b/docs/source/en/use-inference-providers-as-backend.mdx
similarity index 100%
rename from docs/source/use-inference-providers-as-backend.mdx
rename to docs/source/en/use-inference-providers-as-backend.mdx
diff --git a/docs/source/use-litellm-as-backend.mdx b/docs/source/en/use-litellm-as-backend.mdx
similarity index 100%
rename from docs/source/use-litellm-as-backend.mdx
rename to docs/source/en/use-litellm-as-backend.mdx
diff --git a/docs/source/use-sglang-as-backend.mdx b/docs/source/en/use-sglang-as-backend.mdx
similarity index 100%
rename from docs/source/use-sglang-as-backend.mdx
rename to docs/source/en/use-sglang-as-backend.mdx
diff --git a/docs/source/use-vllm-as-backend.mdx b/docs/source/en/use-vllm-as-backend.mdx
similarity index 100%
rename from docs/source/use-vllm-as-backend.mdx
rename to docs/source/en/use-vllm-as-backend.mdx
diff --git a/docs/source/using-the-python-api.mdx b/docs/source/en/using-the-python-api.mdx
similarity index 100%
rename from docs/source/using-the-python-api.mdx
rename to docs/source/en/using-the-python-api.mdx
diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml
new file mode 100644
index 000000000..74b6d31de
--- /dev/null
+++ b/docs/source/zh/_toctree.yml
@@ -0,0 +1,54 @@
+- sections:
+  - local: index
+    title: 🤗 Lighteval
+  - local: installation
+    title: 安装
+  - local: quicktour
+    title: 快速上手
+  title: 入门指南
+- sections:
+  - local: saving-and-reading-results
+    title: 保存和读取结果
+  - local: using-the-python-api
+    title: 使用Python API
+  - local: adding-a-custom-task
+    title: 添加自定义任务
+  - local: adding-a-new-metric
+    title: 添加自定义指标
+  - local: evaluating-a-custom-model
+    title: 评估自定义模型
+  - local: use-inference-providers-as-backend
+    title: 使用HF的推理提供商作为后端
+  - local: use-litellm-as-backend
+    title: 使用litellm作为后端
+  - local: use-vllm-as-backend
+    title: 使用vllm作为后端
+  - local: use-sglang-as-backend
+    title: 使用SGLang作为后端
+  - local: use-huggingface-inference-endpoints-or-tgi-as-backend
+    title: 使用Hugging Face推理端点或TGI作为后端
+  - local: contributing-to-multilingual-evaluations
+    title: 贡献多语言评估
+  title: 指南
+- sections:
+  - local: metric-list
+    title: 可用指标
+  - local: available-tasks
+    title: 可用任务
+  title: API
+- sections:
+  - sections:
+    - local: package_reference/evaluation_tracker
+      title: EvaluationTracker
+    - local: package_reference/models
+      title: 模型和模型配置
+    - local: package_reference/pipeline
+      title: 流水线
+    title: 主要类
+  - local: package_reference/metrics
+    title: 指标
+  - local: package_reference/tasks
+    title: 任务
+  - local: package_reference/logging
+    title: 日志
+  title: 参考
\ No newline at end of file
diff --git a/docs/source/zh/adding-a-custom-task.mdx b/docs/source/zh/adding-a-custom-task.mdx
new file mode 100644
index 000000000..a983ca119
--- /dev/null
+++ b/docs/source/zh/adding-a-custom-task.mdx
@@ -0,0 +1,143 @@
+# 添加自定义任务
+
+要添加新任务，首先打开一个issue，确定它是否将被集成到lighteval的核心评估中、扩展任务中还是社区任务中，并在hub上添加其数据集。
+
+- 核心评估是在其指标和处理中只需要标准逻辑的评估，我们会将其添加到我们的测试套件中，以确保随着时间的推移不会出现回归。它们在社区中已经有很高的使用率。
+- 扩展评估是在其指标中需要自定义逻辑的评估（复杂的规范化、LLM作为评判等），我们添加它们是为了方便用户。它们在社区中已经有很高的使用率。
+- 社区评估是社区提交的新任务。
+
+随着时间的推移，一个受欢迎的社区评估可以发展成为扩展评估或核心评估。
+
+> [!TIP]
+> 您可以在<a href="https://github.com/huggingface/lighteval/tree/main/community_tasks">community_tasks</a>目录中找到自定义任务的示例。
+
+## 逐步创建自定义任务
+
+> [!WARNING]
+> 要将您的自定义任务贡献给lighteval仓库，您首先需要通过运行`pip install -e .[dev]`安装所需的开发依赖项，然后运行`pre-commit install`安装pre-commit钩子。
+
+首先，在`community_tasks`目录下创建一个Python文件。
+
+您需要定义一个提示函数，该函数将把来自数据集的一行转换为用于评估的文档。
+
+```python
+# 根据您不同的任务需求定义尽可能多的函数
+def prompt_fn(line, task_name: str = None):
+    """定义如何从数据集行到doc对象。
+    参考src/lighteval/tasks/default_prompts.py中的例子，
+    或者在README中获取关于此函数应该做什么的更多信息。
+    """
+    return Doc(
+        task_name=task_name,
+        query=line["question"],
+        choices=[f" {c}" for c in line["choices"]],
+        gold_index=line["gold"],
+        instruction="",
+    )
+```
+
+然后，您需要选择一个指标：您可以使用现有的指标（在[`lighteval.metrics.metrics.Metrics`]中定义）或[创建自定义指标](adding-a-new-metric)。
+[//]: # (TODO: 一旦添加了自动文档，将lighteval.metrics.metrics.Metrics替换为~metrics.metrics.Metrics)
+
+```python
+custom_metric = SampleLevelMetric(
+    metric_name="my_custom_metric_name",
+    higher_is_better=True,
+    category=MetricCategory.IGNORED,
+    use_case=MetricUseCase.NONE,
+    sample_level_fn=lambda x: x,  # 如何计算一个样本的分数
+    corpus_level_fn=np.mean,  # 如何聚合样本指标
+)
+```
+
+然后，您需要使用[`~tasks.lighteval_task.LightevalTaskConfig`]定义您的任务。
+您可以定义有或没有子集的任务。
+要定义没有子集的任务：
+
+```python
+# 以下展示如何创建一个简单的任务（如hellaswag）：它只包含一个子集，并且只对应一种评估。
+task = LightevalTaskConfig(
+    name="myothertask",
+    prompt_function=prompt_fn,  # 必须在文件中定义或从src/lighteval/tasks/default_prompts.py导入
+    suite=["community"],
+    hf_repo="",
+    hf_subset="default",
+    hf_avail_splits=[],
+    evaluation_splits=[],
+    few_shots_split=None,
+    few_shots_select=None,
+    metric=[],  # 在Metrics中选择您的指标
+)
+```
+
+如果您想创建具有多个子集的任务，请将它们添加到`SAMPLE_SUBSETS`列表中，并为每个子集创建一个任务。
+
+```python
+SAMPLE_SUBSETS = []  # 用于此评估的所有子集列表
+
+
+class CustomSubsetTask(LightevalTaskConfig):
+    def __init__(
+        self,
+        name,
+        hf_subset,
+    ):
+        super().__init__(
+            name=name,
+            hf_subset=hf_subset,
+            prompt_function=prompt_fn,  # 必须在文件中定义或从src/lighteval/tasks/default_prompts.py导入
+            hf_repo="",
+            metric=[custom_metric],  # 在Metrics中选择您的指标或使用您的custom_metric
+            hf_avail_splits=[],
+            evaluation_splits=[],
+            few_shots_split=None,
+            few_shots_select=None,
+            suite=["community"],
+            generation_size=-1,
+            stop_sequence=None,
+        )
+SUBSET_TASKS = [CustomSubsetTask(name=f"mytask:{subset}", hf_subset=subset) for subset in SAMPLE_SUBSETS]
+```
+
+以下是参数及其含义的列表：
+
+- `name` (str)，您的评估名称
+- `suite` (list)，您的评估应该属于的套件。此字段允许我们比较不同的任务实现，并用作任务选择以区分要启动的版本。目前，您会找到关键词["helm", "bigbench", "original", "lighteval", "community", "custom"]；对于核心评估，请选择`lighteval`。
+- `prompt_function` (Callable)，您在上面步骤中定义的提示函数
+- `hf_repo` (str)，hub上您的评估数据集的路径
+- `hf_subset` (str)，您想用于评估的特定子集（注意：当数据集没有子集时，请用`"default"`填充此字段，而不是用`None`或`""`）
+- `hf_avail_splits` (list)，您的数据集可用的所有分割（训练、验证、测试、其他...）
+- `evaluation_splits` (list)，您想用于评估的分割
+- `few_shots_split` (str，可以为`null`)，您想从中选择少样本示例的特定分割。它应该与`evaluation_splits`中包含的分割不同
+- `few_shots_select` (str，可以为`null`)，您用来为少样本示例选择条目的方法。可以为`null`，或以下之一：
+    - `balanced` 从`few_shots_split`中选择标签均衡的示例，以避免少样本示例（进而导致模型生成）偏向某个特定标签
+    - `random` 从`few_shots_split`中随机选择示例
+    - `random_sampling` 为每个新项目从`few_shots_split`中随机选择新示例，但如果采样项等于当前项，则从可用样本中删除
+    - `random_sampling_from_train` 为每个新项目从`few_shots_split`中随机选择新示例，但如果采样项等于当前项，则保留！仅在您知道自己在做什么时使用此选项。
+    - `sequential` 选择`few_shots_split`的前`n`个示例
+- `generation_size` (int)，生成评估允许的最大令牌数。如果您的评估是对数似然评估（多选），此值应为-1
+- `stop_sequence` (list)，作为生成的句子结束标记的字符串列表
+- `metric` (list)，您想用于评估的指标（有关详细说明，请参见下一节）
+- `trust_dataset` (bool)，如果您信任数据集，则设置为True
+
+
+然后，您需要将您的任务添加到`TASKS_TABLE`列表中。
+
+```python
+# 存储您的评估
+
+# 带有子集的任务：
+TASKS_TABLE = SUBSET_TASKS
+
+# 不带子集的任务：
+# TASKS_TABLE = [task]
+```
+
+创建文件后，您可以使用以下命令运行评估：
+
+```bash
+lighteval accelerate \
+    "model_name=HuggingFaceH4/zephyr-7b-beta" \
+    "community|{custom_task}|{fewshots}|{truncate_few_shot}" \
+    --custom-tasks {path_to_your_custom_task_file}
+``` 
\ No newline at end of file
diff --git a/docs/source/zh/adding-a-new-metric.mdx b/docs/source/zh/adding-a-new-metric.mdx
new file mode 100644
index 000000000..e3dc5a11c
--- /dev/null
+++ b/docs/source/zh/adding-a-new-metric.mdx
@@ -0,0 +1,86 @@
+# 添加新指标
+
+首先，检查是否可以使用[语料库指标](package_reference/metrics#corpus-metrics)或[样本指标](package_reference/metrics#sample-metrics)中的参数化函数。
+
+如果不能，您可以使用`custom_task`系统注册您的新指标：
+
+> [!TIP]
+> 要查看与自定义任务一起添加的自定义指标示例，请查看<a href="https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks/extended/ifeval">IFEval自定义任务</a>。
+
+
+> [!WARNING]
+> 要将您的自定义指标贡献给lighteval仓库，您首先需要通过运行`pip install -e .[dev]`安装所需的开发依赖项，然后运行`pre-commit install`安装pre-commit钩子。
+
+
+- 创建一个包含指标完整逻辑的新Python文件。
+- 该文件还需要以这些导入开始
+
+```python
+from aenum import extend_enum
+from lighteval.metrics import Metrics
+```
+
+您需要定义一个样本级指标：
+
+```python
+def custom_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> bool:
+    response = predictions[0]
+    return response == formatted_doc.choices[formatted_doc.gold_index]
+```
+
+这里的样本级指标只返回一个指标。如果您想为每个样本返回多个指标，则需要返回一个字典，以指标名称为键、对应的指标值为值。
+
+```python
+def custom_metric(predictions: list[str], formatted_doc: Doc, **kwargs) -> dict:
+    response = predictions[0]
+    return {"accuracy": response == formatted_doc.choices[formatted_doc.gold_index], "other_metric": 0.5}
+```
+
+然后，如果需要，您可以定义一个聚合函数，常见的聚合函数是`np.mean`。
+
+```python
+def agg_function(items):
+    flat_items = [item for sublist in items for item in sublist]
+    score = sum(flat_items) / len(flat_items)
+    return score
+```
+
+最后，您可以定义您的指标。如果是样本级指标，您可以使用以下代码和[`~metrics.utils.metric_utils.SampleLevelMetric`]：
+
+```python
+my_custom_metric = SampleLevelMetric(
+    metric_name={custom_metric_name},
+    higher_is_better={True或False},
+    category={MetricCategory},
+    use_case={MetricUseCase},
+    sample_level_fn=custom_metric,
+    corpus_level_fn=agg_function,
+)
+```
+
+如果您的指标为每个样本定义多个指标，您可以使用以下代码和[`~metrics.utils.metric_utils.SampleLevelMetricGrouping`]：
+
+```python
+custom_metric = SampleLevelMetricGrouping(
+    metric_name={submetric_names},
+    higher_is_better={n: {True或False} for n in submetric_names},
+    category={MetricCategory},
+    use_case={MetricUseCase},
+    sample_level_fn=custom_metric,
+    corpus_level_fn={
+        "accuracy": np.mean,
+        "other_metric": agg_function,
+    },
+)
+```
+
+最后，添加以下内容，以便在作为模块加载时将您的指标添加到我们的指标列表中。
+
+```python
+# 将指标添加到指标列表！
+extend_enum(Metrics, "metric_name", metric_function)
+if __name__ == "__main__":
+    print("Imported metric")
+```
+
+您可以通过在启动lighteval时使用`--custom-tasks path_to_your_file`来提供您的自定义指标。 
\ No newline at end of file
diff --git a/docs/source/zh/available-tasks.mdx b/docs/source/zh/available-tasks.mdx
new file mode 100644
index 000000000..6fc867958
--- /dev/null
+++ b/docs/source/zh/available-tasks.mdx
@@ -0,0 +1,252 @@
+# 可用任务
+
+您可以通过运行以下命令获取所有可用任务的列表：
+
+```bash
+lighteval tasks list
+```
+
+您还可以通过运行以下命令来检查特定任务：
+
+```bash
+lighteval tasks inspect <task_name>
+```
+
+## 任务列表
+
+- bigbench:
+  - bigbench|abstract_narrative_understanding
+  - bigbench|anachronisms
+  - bigbench|analogical_similarity
+  - bigbench|analytic_entailment
+  - bigbench|arithmetic_bb
+  - bigbench|ascii_word_recognition
+  - bigbench|authorship_verification
+  - bigbench|auto_categorization
+  - bigbench|auto_debugging
+  - bigbench|bbq_lite_json
+  - bigbench|bridging_anaphora_resolution_barqa
+  - bigbench|causal_judgment
+  - bigbench|cause_and_effect
+  - bigbench|checkmate_in_one
+  - bigbench|chess_state_tracking
+  - bigbench|chinese_remainder_theorem
+  - bigbench|cifar10_classification
+  - bigbench|code_line_description
+  - bigbench|codenames
+  - bigbench|color
+  - bigbench|common_morpheme
+  - bigbench|conceptual_combinations
+  - bigbench|conlang_translation
+  - bigbench|contextual_parametric_knowledge_conflicts
+  - bigbench|coqa_bb
+  - bigbench|crash_blossom
+  - bigbench|crass_ai
+  - bigbench|cryobiology_spanish
+  - bigbench|cryptonite
+  - bigbench|cs_algorithms
+  - bigbench|dark_humor_detection
+  - bigbench|date_understanding
+  - bigbench|disambiguation_qa
+  - bigbench|discourse_marker_prediction
+  - bigbench|disfl_qa
+  - bigbench|dyck_languages
+  - bigbench|elementary_math_qa
+  - bigbench|emoji_movie
+  - bigbench|emojis_emotion_prediction
+  - bigbench|empirical_judgments
+  - bigbench|english_proverbs
+  - bigbench|english_russian_proverbs
+  - bigbench|entailed_polarity
+  - bigbench|entailed_polarity_hindi
+  - bigbench|epistemic_reasoning
+  - bigbench|evaluating_information_essentiality
+  - bigbench|fact_checker
+  - bigbench|fantasy_reasoning
+  - bigbench|few_shot_nlg
+  - bigbench|figure_of_speech_detection
+  - bigbench|formal_fallacies_syllogisms_negation
+  - bigbench|gem
+  - bigbench|gender_inclusive_sentences_german
+  - bigbench|general_knowledge
+  - bigbench|geometric_shapes
+  - bigbench|goal_step_wikihow
+  - bigbench|gre_reading_comprehension
+  - bigbench|hhh_alignment
+  - bigbench|hindi_question_answering
+  - bigbench|hindu_knowledge
+  - bigbench|hinglish_toxicity
+  - bigbench|human_organs_senses
+  - bigbench|hyperbaton
+  - bigbench|identify_math_theorems
+  - bigbench|identify_odd_metaphor
+  - bigbench|implicatures
+  - bigbench|implicit_relations
+  - bigbench|intent_recognition
+  - bigbench|international_phonetic_alphabet_nli
+  - bigbench|international_phonetic_alphabet_transliterate
+  - bigbench|intersect_geometry
+  - bigbench|irony_identification
+  - bigbench|kanji_ascii
+  - bigbench|kannada
+  - bigbench|key_value_maps
+  - bigbench|known_unknowns
+  - bigbench|language_games
+  - bigbench|language_identification
+  - bigbench|linguistic_mappings
+  - bigbench|linguistics_puzzles
+  - bigbench|logic_grid_puzzle
+  - bigbench|logical_args
+  - bigbench|logical_deduction
+  - bigbench|logical_fallacy_detection
+  - bigbench|logical_sequence
+  - bigbench|mathematical_induction
+  - bigbench|matrixshapes
+  - bigbench|metaphor_boolean
+  - bigbench|metaphor_understanding
+  - bigbench|minute_mysteries_qa
+  - bigbench|misconceptions
+  - bigbench|misconceptions_russian
+  - bigbench|mnist_ascii
+  - bigbench|modified_arithmetic
+  - bigbench|moral_permissibility
+  - bigbench|movie_dialog_same_or_different
+  - bigbench|movie_recommendation
+  - bigbench|mult_data_wrangling
+  - bigbench|multiemo
+  - bigbench|natural_instructions
+  - bigbench|navigate
+  - bigbench|nonsense_words_grammar
+  - bigbench|novel_concepts
+  - bigbench|object_counting
+  - bigbench|odd_one_out
+  - bigbench|operators
+  - bigbench|paragraph_segmentation
+  - bigbench|parsinlu_qa
+  - bigbench|parsinlu_reading_comprehension
+  - bigbench|penguins_in_a_table
+  - bigbench|periodic_elements
+  - bigbench|persian_idioms
+  - bigbench|phrase_relatedness
+  - bigbench|physical_intuition
+  - bigbench|physics
+  - bigbench|physics_questions
+  - bigbench|play_dialog_same_or_different
+  - bigbench|polish_sequence_labeling
+  - bigbench|presuppositions_as_nli
+  - bigbench|qa_wikidata
+  - bigbench|question_selection
+  - bigbench|real_or_fake_text
+  - bigbench|reasoning_about_colored_objects
+  - bigbench|repeat_copy_logic
+  - bigbench|rephrase
+  - bigbench|rhyming
+  - bigbench|riddle_sense
+  - bigbench|ruin_names
+  - bigbench|salient_translation_error_detection
+  - bigbench|scientific_press_release
+  - bigbench|semantic_parsing_in_context_sparc
+  - bigbench|semantic_parsing_spider
+  - bigbench|sentence_ambiguity
+  - bigbench|similarities_abstraction
+  - bigbench|simp_turing_concept
+  - bigbench|simple_arithmetic_json
+  - bigbench|simple_arithmetic_json_multiple_choice
+  - bigbench|simple_arithmetic_json_subtasks
+  - bigbench|simple_arithmetic_multiple_targets_json
+  - bigbench|simple_ethical_questions
+  - bigbench|simple_text_editing
+  - bigbench|snarks
+  - bigbench|social_iqa
+  - bigbench|social_support
+  - bigbench|sports_understanding
+  - bigbench|strange_stories
+  - bigbench|strategyqa
+  - bigbench|sufficient_information
+  - bigbench|suicide_risk
+  - bigbench|swahili_english_proverbs
+  - bigbench|swedish_to_german_proverbs
+  - bigbench|symbol_interpretation
+  - bigbench|tellmewhy
+  - bigbench|temporal_sequences
+  - bigbench|tense
+  - bigbench|timedial
+  - bigbench|topical_chat
+  - bigbench|tracking_shuffled_objects
+  - bigbench|understanding_fables
+  - bigbench|undo_permutation
+  - bigbench|unit_conversion
+  - bigbench|unit_interpretation
+  - bigbench|unnatural_in_context_learning
+  - bigbench|vitaminc_fact_verification
+  - bigbench|what_is_the_tao
+  - bigbench|which_wiki_edit
+  - bigbench|wino_x_german
+  - bigbench|winowhy
+  - bigbench|word_sorting
+  - bigbench|word_unscrambling
+
+- harness:
+  - harness|bbh:boolean_expressions
+  - harness|bbh:causal_judgment
+  - harness|bbh:date_understanding
+  - harness|bbh:disambiguation_qa
+  - harness|bbh:dyck_languages
+  - harness|bbh:formal_fallacies
+  - harness|bbh:geometric_shapes
+  - harness|bbh:hyperbaton
+  - harness|bbh:logical_deduction_five_objects
+  - harness|bbh:logical_deduction_seven_objects
+  - harness|bbh:logical_deduction_three_objects
+  - harness|bbh:movie_recommendation
+  - harness|bbh:multistep_arithmetic_two
+  - harness|bbh:navigate
+  - harness|bbh:object_counting
+  - harness|bbh:penguins_in_a_table
+  - harness|bbh:reasoning_about_colored_objects
+  - harness|bbh:ruin_names
+  - harness|bbh:salient_translation_error_detection
+  - harness|bbh:snarks
+  - harness|bbh:sports_understanding
+  - harness|bbh:temporal_sequences
+  - harness|bbh:tracking_shuffled_objects_five_objects
+  - harness|bbh:tracking_shuffled_objects_seven_objects
+  - harness|bbh:tracking_shuffled_objects_three_objects
+  - harness|bbh:web_of_lies
+  - harness|bbh:word_sorting
+  - harness|bigbench:causal_judgment
+  - harness|bigbench:date_understanding
+  - harness|bigbench:disambiguation_qa
+  - harness|bigbench:geometric_shapes
+  - harness|bigbench:logical_deduction_five_objects
+  - harness|bigbench:logical_deduction_seven_objects
+  - harness|bigbench:logical_deduction_three_objects
+  - harness|bigbench:movie_recommendation
+  - harness|bigbench:navigate
+  - harness|bigbench:reasoning_about_colored_objects
+  - harness|bigbench:ruin_names
+  - harness|bigbench:salient_translation_error_detection
+  - harness|bigbench:snarks
+  - harness|bigbench:sports_understanding
+  - harness|bigbench:temporal_sequences
+  - harness|bigbench:tracking_shuffled_objects_five_objects
+  - harness|bigbench:tracking_shuffled_objects_seven_objects
+  - harness|bigbench:tracking_shuffled_objects_three_objects
+  - harness|wikitext:103:document_level
+
+- helm:
+  - helm|babi_qa
+  - helm|bbq
+  - helm|bbq:Age
+  - helm|bbq:Disability_status
+  - helm|bbq:Gender_identity
+  - helm|bbq:Physical_appearance
+  - helm|bbq:Race_ethnicity
+  - helm|bbq:Race_x_SES
+  - helm|bbq:Race_x_gender
+  - helm|bbq:Religion
+  - helm|bbq:SES
+  - helm|bbq:Sexual_orientation
+  - helm|bbq=Nationality
+
+以上仅列出了部分任务；要查看完整的任务列表，请运行`lighteval tasks list`。
\ No newline at end of file
diff --git a/docs/source/zh/contributing-to-multilingual-evaluations.mdx b/docs/source/zh/contributing-to-multilingual-evaluations.mdx
new file mode 100644
index 000000000..9ab47bc20
--- /dev/null
+++ b/docs/source/zh/contributing-to-multilingual-evaluations.mdx
@@ -0,0 +1,104 @@
+# 贡献多语言评估
+
+## 贡献小型翻译
+
+我们定义了19个`literals`，这些是在自动创建评估提示时使用的基本关键词或标点符号，例如`yes`、`no`、`because`等。
+
+我们欢迎您提供您语言的翻译！
+
+要贡献，您需要：
+1. 打开[translation_literals](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/templates/utils/translation_literals.py)文件
+2. 编辑文件，为您感兴趣的语言添加或扩展字面量。
+
+```python
+    Language.ENGLISH: TranslationLiterals(
+        language=Language.ENGLISH,
+        question_word="question", # 用法："Question: How are you?"
+        answer="answer", # 用法："Answer: I am fine"
+        confirmation_word="right", # 用法："He is smart, right?"
+        yes="yes", # 用法："Yes, he is"
+        no="no", # 用法："No, he is not"
+        also="also", # 用法："Also, she is smart."
+        cause_word="because", # 用法："She is smart, because she is tall"
+        effect_word="therefore", # 用法："He is tall therefore he is smart"
+        or_word="or", # 用法："He is tall or small"
+        true="true", # 用法："He is smart, true, false or neither?"
+        false="false", # 用法："He is smart, true, false or neither?"
+        neither="neither", # 用法："He is smart, true, false or neither?"
+        # 标点和间距：仅在您的语言使用与英语不同的内容时调整
+        full_stop=".",
+        comma=",",
+        question_mark="?",
+        exclamation_mark="!",
+        word_space=" ",
+        sentence_space=" ",
+        colon=":",
+        # 用于枚举的字母表的第一个字符，如果与英语不同
+        indices=["A", "B", "C", ...]
+    )
+```
+
+3. 提交带有您修改的PR！就是这样！
+
+## 贡献新的多语言任务
+
+您应该首先阅读我们关于[添加自定义任务](adding-a-custom-task)的指南，以更好地理解我们使用的不同参数。
+
+然后，您应该查看当前的[多语言任务](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/multilingual/tasks.py)文件，以了解它们是如何定义的。对于多语言评估，`prompt_function`应该通过适配具体语言的模板来实现。模板会负责正确的格式化，并正确且一致地使用针对该语言调整过的提示锚点（例如"问题/回答"）和标点符号。
+
+浏览[此处](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks/templates)的所有模板列表，以查看哪些最适合您自己的任务。
+
+然后，准备好后，要定义您自己的任务，您应该：
+1. 按照上述指南创建一个Python文件
+2. 导入与您的任务类型相关的模板（XNLI、Copa、多项选择、问答等）
+3. 使用我们可参数化的[`~tasks.lighteval_task.LightevalTaskConfig`]类为每个相关语言和评估表述（对于多项选择）定义一个或一组任务
+
+```python
+your_tasks = [
+    LightevalTaskConfig(
+        # 您的评估名称
+        name=f"evalname_{language.value}_{formulation.name.lower()}",
+        # 此评估由社区贡献
+        suite=["community"],
+        # 这将自动获取您所选表述的正确指标
+        metric=get_metrics_for_formulation(
+            formulation,
+            [
+                loglikelihood_acc_metric(normalization=None),
+                loglikelihood_acc_metric(normalization=LogProbTokenNorm()),
+                loglikelihood_acc_metric(normalization=LogProbCharNorm()),
+            ],
+        ),
+        # 在此函数中，您选择要遵循的模板以及使用哪种语言和表述
+        prompt_function=get_template_prompt_function(
+            language=language,
+            # 然后使用适配器定义模板键（左）和数据集键（右）之间的映射
+            # 要了解需要哪些模板键以及可用哪些模板键，
+            # 请查阅相应的适配器类型和文档字符串。
+            adapter=lambda line: {
+                "key": line["relevant_key"],
+                ...
+            },
+            formulation=formulation,
+        ),
+        # 您还可以添加特定过滤器以删除不相关的样本
+        hf_filter=lambda line: line["label"] in <condition>,
+        # 然后选择您的huggingface数据集以及可用于评估的分割
+        hf_repo=<dataset>,
+        hf_subset=<subset>,
+        evaluation_splits=["train"],
+        hf_avail_splits=["train"],
+    )
+    for language in [
+        Language.YOUR_LANGUAGE, ...
+    ]
+    for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
+]
+```
+4. 然后，您可以回到指南，测试您的任务是否正确实现！
+
+> [!TIP]
+> 所有[`~tasks.lighteval_task.LightevalTaskConfig`]参数都是强类型的，包括模板函数的输入。确保利用您的IDE的功能，使正确填写这些参数变得更容易。
+
+
+一旦一切就绪，提交PR，我们将很乐意审查它！ 
\ No newline at end of file
diff --git a/docs/source/zh/evaluating-a-custom-model.mdx b/docs/source/zh/evaluating-a-custom-model.mdx
new file mode 100644
index 000000000..1eea6a13f
--- /dev/null
+++ b/docs/source/zh/evaluating-a-custom-model.mdx
@@ -0,0 +1,127 @@
+# 评估自定义模型
+
+Lighteval允许您通过创建继承自`LightevalModel`的自定义模型类来评估自定义模型实现。当您想评估标准后端（transformers、vllm等）不直接支持的模型时，这非常有用。
+
+## 创建自定义模型
+
+1. 创建包含您的自定义模型实现的Python文件。该模型必须继承自`LightevalModel`并实现所有必需的方法。
+
+以下是一个基本示例：
+
+```python
+from lighteval.models.abstract_model import LightevalModel
+
+class MyCustomModel(LightevalModel):
+    def __init__(self, config):
+        super().__init__(config)
+        # 在这里初始化您的模型...
+
+    def greedy_until(self, requests, max_tokens=None, stop_sequences=None):
+        # 实现生成逻辑
+        pass
+
+    def loglikelihood(self, requests, log=True):
+        # 实现对数似然计算
+        pass
+
+    def loglikelihood_rolling(self, requests):
+        # 实现滚动对数似然计算
+        pass
+
+    def loglikelihood_single_token(self, requests):
+        # 实现单个令牌对数似然计算
+        pass
+```
+
+2. 自定义模型文件应该只包含一个继承自`LightevalModel`的类。在加载模型时，这个类将被自动检测并实例化。
+
+> [!TIP]
+> 您可以在`examples/custom_models/google_translate_model.py`中找到一个完整的自定义模型实现示例。
+
+## 运行评估
+
+您可以使用命令行界面或Python API评估您的自定义模型。
+
+### 使用命令行
+
+```bash
+lighteval custom \
+    "google-translate" \
+    "examples/custom_models/google_translate_model.py" \
+    "lighteval|wmt20:fr-de|0|0" \
+    --max-samples 10
+```
+
+该命令需要三个必要参数：
+- 模型名称（用于在结果/日志中跟踪）
+- 您的模型实现文件的路径
+- 要评估的任务（格式与其他后端相同）
+
+### 使用Python API
+
+```python
+from lighteval.logging.evaluation_tracker import EvaluationTracker
+from lighteval.models.custom.custom_model import CustomModelConfig
+from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
+
+# 设置评估跟踪
+evaluation_tracker = EvaluationTracker(
+    output_dir="results",
+    save_details=True
+)
+
+# 配置流水线
+pipeline_params = PipelineParameters(
+    launcher_type=ParallelismManager.CUSTOM,
+)
+
+# 配置您的自定义模型
+model_config = CustomModelConfig(
+    model="my-custom-model",
+    model_definition_file_path="path/to/my_model.py"
+)
+
+# 创建并运行流水线
+pipeline = Pipeline(
+    tasks="leaderboard|truthfulqa:mc|0|0",
+    pipeline_parameters=pipeline_params,
+    evaluation_tracker=evaluation_tracker,
+    model_config=model_config
+)
+
+pipeline.evaluate()
+pipeline.save_and_push_results()
+```
+
+## 必需的方法
+
+您的自定义模型必须实现这些核心方法：
+
+- `greedy_until`：用于生成文本，直到达到停止序列或最大令牌数
+- `loglikelihood`：用于计算特定续写的对数概率
+- `loglikelihood_rolling`：用于计算序列的滚动对数概率
+- `loglikelihood_single_token`：用于计算单个令牌的对数概率
+
+有关详细的方法签名和要求，请参阅`LightevalModel`基类文档。
+
+## 最佳实践
+
+1. **错误处理**：在您的模型方法中实现健壮的错误处理，以优雅地处理边缘情况。
+
+2. **批处理**：考虑在您的模型方法中实现高效的批处理，以提高性能。
+
+3. **资源管理**：在您的模型的`__init__`和`__del__`方法中正确管理任何资源（例如，API连接、模型权重）。
+
+4. **文档**：为您的模型类和方法添加清晰的文档字符串，解释任何特定的要求或限制。
+
+## 示例用例
+
+自定义模型特别适用于：
+
+- 评估通过自定义API访问的模型
+- 包装具有专门预处理/后处理的模型
+- 测试新型模型架构
+- 评估集成模型
+- 与外部服务或工具集成
+
+有关包装Google Translate API的自定义模型的完整示例，请参阅`examples/custom_models/google_translate_model.py`。 
\ No newline at end of file
diff --git a/docs/source/zh/index.mdx b/docs/source/zh/index.mdx
new file mode 100644
index 000000000..31dcf91aa
--- /dev/null
+++ b/docs/source/zh/index.mdx
@@ -0,0 +1,34 @@
+<!--
+Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Lighteval
+
+🤗 Lighteval 是一款多功能的大语言模型评估工具箱，支持多种后端平台 — 无论您使用的是
+[transformers](https://github.com/huggingface/transformers)、
+[tgi](https://github.com/huggingface/text-generation-inference)、
+[inference providers](https://huggingface.co/docs/huggingface_hub/en/guides/inference)、
+[vllm](https://github.com/vllm-project/vllm) 还是
+[nanotron](https://github.com/huggingface/nanotron)，
+都能轻松实现评估。通过保存和分析详细的样本级结果，您可以深入了解模型性能，进行调试并比较不同模型的表现。
+
+Lighteval 提供了灵活的自定义功能：您可以根据需求轻松创建[新任务](adding-a-custom-task)和[新指标](adding-a-new-metric)，
+也可以直接使用我们已有的丰富任务和指标库。
+
+您可以无缝地进行实验、基准测试，并将结果存储在 Hugging Face Hub、S3 或本地环境中。 
\ No newline at end of file
diff --git a/docs/source/zh/installation.mdx b/docs/source/zh/installation.mdx
new file mode 100644
index 000000000..0f3f2e1aa
--- /dev/null
+++ b/docs/source/zh/installation.mdx
@@ -0,0 +1,66 @@
+<!--
+Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 安装
+
+Lighteval可以通过PyPi或源代码两种方式安装。
+
+## 从PyPi安装
+
+```bash
+pip install lighteval
+```
+
+## 从源代码安装
+此方式主要适用于希望在`lighteval`上进行开发的用户：
+
+```bash
+git clone https://github.com/huggingface/lighteval.git
+cd lighteval
+pip install -e .
+```
+
+## 额外依赖
+
+Lighteval提供了多个可选依赖包，可以通过指定额外组来安装：
+`pip install lighteval[<group>]`或`pip install -e .[<group>]`
+
+如需使用`sglang`作为Lighteval后端，请参考[sglang安装文档](https://docs.sglang.ai/start/install.html)。
+
+| 额外名称     | 描述                                              |
+|--------------|---------------------------------------------------|
+| tgi          | 使用Text Generation Inference API评估模型         |
+| nanotron     | 支持评估nanotron模型                              |
+| quantization | 支持评估量化模型                                  |
+| adapters     | 支持评估适配器模型（delta和peft）                 |
+| tensorboardX | 允许将结果上传到tensorboard                       |
+| vllm         | 使用vllm作为推理后端                              |
+| sglang       | 使用sglang作为推理后端                            |
+| s3           | 支持将结果上传到s3存储                            |
+
+
+## Hugging Face登录
+
+如果您希望将评估结果推送到Hugging Face Hub或评估私有模型，
+需要将访问令牌添加到环境变量`HF_TOKEN`中，可通过以下命令完成：
+
+```bash
+huggingface-cli login
+``` 
\ No newline at end of file
diff --git a/docs/source/zh/metric-list.mdx b/docs/source/zh/metric-list.mdx
new file mode 100644
index 000000000..da89d692f
--- /dev/null
+++ b/docs/source/zh/metric-list.mdx
@@ -0,0 +1,76 @@
+# 指标列表
+
+## 多项选择任务的自动指标
+
+这些指标基于不同可能答案选项的对数似然值计算。
+- `loglikelihood_acc`：选择对数概率最高选项且选择正确的样本比例 - 对于选项仅包含单个token的任务，有一个更高效的版本 (`loglikelihood_acc_single_token`)。
+- `loglikelihood_acc_norm`：选择按序列长度归一化后对数概率最高选项且选择正确的样本比例 - 对于选项仅包含单个token的任务，有一个更高效的版本 (`loglikelihood_acc_norm_single_token`)。
+- `loglikelihood_acc_norm_nospace`：选择按序列长度归一化后对数概率最高选项且选择正确的样本比例，忽略首个空格。
+- `loglikelihood_f1`：多项选择的语料库级别F1分数 - 对于选项仅包含单个token的任务，有一个更高效的版本 (`loglikelihood_f1_single_token`)。
+- `mcc`：马修斯相关系数（衡量统计分布之间一致性的指标）。
+- `recall_at_1`：选择对数概率最高选项且选择正确的样本比例 - 对于每个选项仅包含单个token的任务，有一个更高效的版本 (`recall_at_1_single_token`)。
+- `recall_at_2`：选择对数概率第二高或更高选项且选择正确的样本比例 - 对于每个选项仅包含单个token的任务，有一个更高效的版本 (`recall_at_2_single_token`)。
+- `mrr`：平均倒数排名，评估按正确性/相关性排序的选项质量 - 对于选项仅包含单个token的任务，有一个更高效的版本 (`mrr_single_token`)。
+- `target_perplexity`：不同选项的困惑度。
+- `acc_golds_likelihood`：检查单个目标的平均对数概率是否高于或低于0.5。
+- `multi_f1_numeric`：多个正确答案选项的对数似然F1分数。
+
+上述所有指标都有"单token"版本（`loglikelihood_acc_single_token`、`loglikelihood_acc_norm_single_token`、`loglikelihood_f1_single_token`、`mcc_single_token`、`recall_at_2_single_token` 和 `mrr_single_token`）。当多项选择选项仅比较单个token（例如："A"vs"B"vs"C"vs"D"，或"是"vs"否"）时，使用这些单token版本可以将评估时间缩短至原来的1/N（N为选项数量）。单token评估还包括：
+- `multi_f1_numeric`：计算所有可能选项的F1分数并取平均值。
+
+## 困惑度和语言建模的自动指标
+这些指标基于输入文本的对数似然值计算。
+- `word_perplexity`：按序列词数加权的困惑度（输入的对数概率）。
+- `byte_perplexity`：按序列字节数加权的困惑度（输入的对数概率）。
+- `bits_per_byte`：根据模型概率计算的每字节平均位数。
+- `log_prob`：预测输出的平均对数概率（语言建模的输入对数概率）。
+
+## 生成任务的自动指标
+这些指标需要模型生成输出文本，因此评估速度相对较慢。
+- 基本指标：
+    - `perfect_exact_match`：预测与参考答案完全匹配的样本比例。
+    - `exact_match`：预测在去除首尾空白后与参考答案匹配的样本比例（即对两者应用`strip`后比较）。
+    - `quasi_exact_match`：归一化预测与归一化参考答案匹配的样本比例（归一化处理包括空白、冠词、大小写等）。还有其他变体使用不同归一化方法，例如`quasi_exact_match_triviaqa`，仅在对所有文本应用`strip`后比较。
+    - `prefix_exact_match`：预测的开头部分与参考答案匹配的样本比例（去除首尾空白后比较）。
+    - `prefix_quasi_exact_match`：预测的归一化开头部分与归一化参考答案匹配的样本比例。
+    - `exact_match_indicator`：移除特定指示符前的上下文后的精确匹配。
+    - `f1_score_quasi`：模型输出和参考答案之间词语重叠的平均F1分数，两者都先进行归一化。
+    - `f1_score`：模型输出和参考答案之间词语重叠的平均F1分数，不进行归一化。
+    - `f1_score_macro`：语料库级别的宏观F1分数。
+    - `f1_score_micro`：语料库级别的微观F1分数。
+    - `maj_at_5` 和 `maj_at_8`：模型多数投票。从模型中获取n（5或8）个生成结果，并将出现频率最高的结果作为最终预测。
+- 摘要指标：
+    - `rouge`：平均ROUGE分数 [(Lin, 2004)](https://aclanthology.org/W04-1013/)。
+    - `rouge1`：基于1-gram重叠的平均ROUGE分数 [(Lin, 2004)](https://aclanthology.org/W04-1013/)。
+    - `rouge2`：基于2-gram重叠的平均ROUGE分数 [(Lin, 2004)](https://aclanthology.org/W04-1013/)。
+    - `rougeL`：基于最长公共子序列重叠的平均ROUGE分数 [(Lin, 2004)](https://aclanthology.org/W04-1013/)。
+    - `rougeLsum`：基于最长公共子序列重叠的平均ROUGE分数 [(Lin, 2004)](https://aclanthology.org/W04-1013/)。
+    - `rouge_t5` (BigBench)：所有可用ROUGE指标的语料库级别ROUGE分数。
+    - `faithfulness`：基于SummaC方法 [(Laban et al., 2022)](https://aclanthology.org/2022.tacl-1.10/) 的忠实度评分。
+    - `extractiveness`：基于 [(Grusky et al., 2018)](https://aclanthology.org/N18-1065/) 的内容评估：
+        - `summarization_coverage`：模型生成摘要中来自源文档的抽取片段比例，
+        - `summarization_density`：模型生成摘要对源文档的抽取程度，
+        - `summarization_compression`：模型生成摘要相对源文档的压缩比率。
+    - `bert_score`：模型生成摘要和参考摘要之间的平均BERTScore精确率、召回率和F1分数 [(Zhang et al., 2020)](https://openreview.net/pdf?id=SkeHuCVFDr)。
+- 翻译指标：
+    - `bleu`：语料库级别BLEU分数 [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) - 使用sacrebleu实现。
+    - `bleu_1`：基于1-gram重叠的平均样本BLEU分数 [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) - 使用nltk实现。
+    - `bleu_4`：基于4-gram重叠的平均样本BLEU分数 [(Papineni et al., 2002)](https://aclanthology.org/P02-1040/) - 使用nltk实现。
+    - `chrf`：字符n-gram匹配的F值评分。
+    - `ter`：翻译编辑/错误率。
+- 版权相关指标：
+    - `copyright`：评估以下方面：
+        - `longest_common_prefix_length`：模型生成和参考文本之间最长公共前缀的平均长度，
+        - `edit_distance`：模型生成和参考文本之间的平均Levenshtein编辑距离，
+        - `edit_similarity`：模型生成和参考文本之间的平均Levenshtein编辑相似度（按较长序列长度归一化）。
+- 数学评估指标：
+    - `quasi_exact_match_math`：归一化预测与归一化参考答案匹配的样本比例（针对数学问题特别优化，删除了LaTeX符号、单位等）。
+    - `maj_at_4_math`：多数投票评估，对预测和参考答案使用数学专用归一化。
+    - `quasi_exact_match_gsm8k`：归一化预测与归一化参考答案匹配的样本比例（针对gsm8k问题特别优化，删除了LaTeX符号、单位等）。
+    - `maj_at_8_gsm8k`：多数投票评估，对预测和参考答案使用gsm8k专用归一化。
+
+## LLM作为评判
+- `llm_judge_gpt3p5`：适用于任何生成任务，使用OpenAI API的GPT-3.5模型评分。
+- `llm_judge_llama_3_405b`：适用于任何生成任务，使用HuggingFace API的Llama 3 405B模型评分。
+- `llm_judge_multi_turn_gpt3p5`：适用于多轮对话任务（如mt-bench），使用OpenAI API的GPT-3.5模型评分。
+- `llm_judge_multi_turn_llama_3_405b`：适用于多轮对话任务（如mt-bench），使用HuggingFace API的Llama 3 405B模型评分。 
\ No newline at end of file
diff --git a/docs/source/zh/package_reference/evaluation_tracker.mdx b/docs/source/zh/package_reference/evaluation_tracker.mdx
new file mode 100644
index 000000000..06297ebbf
--- /dev/null
+++ b/docs/source/zh/package_reference/evaluation_tracker.mdx
@@ -0,0 +1,3 @@
+# 评估跟踪器
+
+[[autodoc]] logging.evaluation_tracker.EvaluationTracker 
\ No newline at end of file
diff --git a/docs/source/zh/package_reference/logging.mdx b/docs/source/zh/package_reference/logging.mdx
new file mode 100644
index 000000000..99fd42a16
--- /dev/null
+++ b/docs/source/zh/package_reference/logging.mdx
@@ -0,0 +1,15 @@
+# 日志记录
+
+## EvaluationTracker（评估跟踪器）
+[[autodoc]] logging.evaluation_tracker.EvaluationTracker
+
+## GeneralConfigLogger（通用配置记录器）
+[[autodoc]] logging.info_loggers.GeneralConfigLogger
+## DetailsLogger（详情记录器）
+[[autodoc]] logging.info_loggers.DetailsLogger
+## MetricsLogger（指标记录器）
+[[autodoc]] logging.info_loggers.MetricsLogger
+## VersionsLogger（版本记录器）
+[[autodoc]] logging.info_loggers.VersionsLogger
+## TaskConfigLogger（任务配置记录器）
+[[autodoc]] logging.info_loggers.TaskConfigLogger 
\ No newline at end of file
diff --git a/docs/source/zh/package_reference/metrics.mdx b/docs/source/zh/package_reference/metrics.mdx
new file mode 100644
index 000000000..ffc71a9f7
--- /dev/null
+++ b/docs/source/zh/package_reference/metrics.mdx
@@ -0,0 +1,70 @@
+# 指标
+
+## 指标
+[//]: # (TODO: aenum.Enum raises error when generating docs: not supported by inspect.signature. See: https://github.com/ethanfurman/aenum/issues/44)
+[//]: # (### Metrics)
+[//]: # ([[autodoc]] metrics.metrics.Metrics)
+### Metric（指标）
+[[autodoc]] metrics.utils.metric_utils.Metric
+### CorpusLevelMetric（语料级指标）
+[[autodoc]] metrics.utils.metric_utils.CorpusLevelMetric
+### SampleLevelMetric（样本级指标）
+[[autodoc]] metrics.utils.metric_utils.SampleLevelMetric
+### MetricGrouping（指标分组）
+[[autodoc]] metrics.utils.metric_utils.MetricGrouping
+### CorpusLevelMetricGrouping（语料级指标分组）
+[[autodoc]] metrics.utils.metric_utils.CorpusLevelMetricGrouping
+### SampleLevelMetricGrouping（样本级指标分组）
+[[autodoc]] metrics.utils.metric_utils.SampleLevelMetricGrouping
+
+## Corpus Metrics（语料级指标）
+### CorpusLevelF1Score
+[[autodoc]] metrics.metrics_corpus.CorpusLevelF1Score
+### CorpusLevelPerplexityMetric
+[[autodoc]] metrics.metrics_corpus.CorpusLevelPerplexityMetric
+### CorpusLevelTranslationMetric
+[[autodoc]] metrics.metrics_corpus.CorpusLevelTranslationMetric
+### matthews_corrcoef
+[[autodoc]] metrics.metrics_corpus.matthews_corrcoef
+
+## Sample Metrics（样本级指标）
+### ExactMatches（精确匹配）
+[[autodoc]] metrics.metrics_sample.ExactMatches
+### F1_score（F1分数）
+[[autodoc]] metrics.metrics_sample.F1_score
+### LoglikelihoodAcc
+[[autodoc]] metrics.metrics_sample.LoglikelihoodAcc
+### NormalizedMultiChoiceProbability（归一化多选概率）
+[[autodoc]] metrics.metrics_sample.NormalizedMultiChoiceProbability
+### Probability（概率）
+[[autodoc]] metrics.metrics_sample.Probability
+### Recall（召回率）
+[[autodoc]] metrics.metrics_sample.Recall
+### MRR
+[[autodoc]] metrics.metrics_sample.MRR
+### ROUGE
+[[autodoc]] metrics.metrics_sample.ROUGE
+### BertScore
+[[autodoc]] metrics.metrics_sample.BertScore
+### Extractiveness（抽取性）
+[[autodoc]] metrics.metrics_sample.Extractiveness
+### Faithfulness（忠实度）
+[[autodoc]] metrics.metrics_sample.Faithfulness
+### BLEURT
+[[autodoc]] metrics.metrics_sample.BLEURT
+### BLEU
+[[autodoc]] metrics.metrics_sample.BLEU
+### StringDistance（字符串距离）
+[[autodoc]] metrics.metrics_sample.StringDistance
+### JudgeLLM
+[[autodoc]] metrics.metrics_sample.JudgeLLM
+### JudgeLLMMTBench
+[[autodoc]] metrics.metrics_sample.JudgeLLMMTBench
+### JudgeLLMMixEval
+[[autodoc]] metrics.metrics_sample.JudgeLLMMixEval
+### MajAtK
+[[autodoc]] metrics.metrics_sample.MajAtK
+
+## LLM-as-a-Judge（LLM作为评判）
+### JudgeLM
+[[autodoc]] metrics.llm_as_judge.JudgeLM 
\ No newline at end of file
diff --git a/docs/source/zh/package_reference/models.mdx b/docs/source/zh/package_reference/models.mdx
new file mode 100644
index 000000000..8c2d7957a
--- /dev/null
+++ b/docs/source/zh/package_reference/models.mdx
@@ -0,0 +1,40 @@
+# 模型
+
+## 模型
+### LightevalModel
+[[autodoc]] models.abstract_model.LightevalModel
+
+
+## Accelerate和Transformers模型
+### TransformersModel
+[[autodoc]] models.transformers.transformers_model.TransformersModelConfig
+[[autodoc]] models.transformers.transformers_model.TransformersModel
+
+### AdapterModel
+[[autodoc]] models.transformers.adapter_model.AdapterModelConfig
+[[autodoc]] models.transformers.adapter_model.AdapterModel
+
+### DeltaModel
+[[autodoc]] models.transformers.delta_model.DeltaModelConfig
+[[autodoc]] models.transformers.delta_model.DeltaModel
+
+## 基于端点的模型
+### InferenceEndpointModel
+[[autodoc]] models.endpoints.endpoint_model.InferenceEndpointModelConfig
+[[autodoc]] models.endpoints.endpoint_model.ServerlessEndpointModelConfig
+[[autodoc]] models.endpoints.endpoint_model.InferenceEndpointModel
+
+### TGI ModelClient
+[[autodoc]] models.endpoints.tgi_model.TGIModelConfig
+[[autodoc]] models.endpoints.tgi_model.ModelClient
+
+### 自定义模型
+[[autodoc]] models.custom.custom_model.CustomModelConfig
+
+### Open AI模型
+[[autodoc]] models.endpoints.openai_model.OpenAIClient
+
+## VLLM模型
+### VLLMModel
+[[autodoc]] models.vllm.vllm_model.VLLMModelConfig
+[[autodoc]] models.vllm.vllm_model.VLLMModel 
\ No newline at end of file
diff --git a/docs/source/zh/package_reference/pipeline.mdx b/docs/source/zh/package_reference/pipeline.mdx
new file mode 100644
index 000000000..68f712ee6
--- /dev/null
+++ b/docs/source/zh/package_reference/pipeline.mdx
@@ -0,0 +1,13 @@
+# 流水线
+
+## Pipeline（流水线）
+
+[[autodoc]] pipeline.Pipeline
+
+## PipelineParameters（流水线参数）
+
+[[autodoc]] pipeline.PipelineParameters
+
+## ParallelismManager（并行管理器）
+
+[[autodoc]] pipeline.ParallelismManager 
\ No newline at end of file
diff --git a/docs/source/zh/package_reference/tasks.mdx b/docs/source/zh/package_reference/tasks.mdx
new file mode 100644
index 000000000..91b6fb4c8
--- /dev/null
+++ b/docs/source/zh/package_reference/tasks.mdx
@@ -0,0 +1,38 @@
+# 任务
+
+## LightevalTask
+### LightevalTaskConfig
+[[autodoc]] tasks.lighteval_task.LightevalTaskConfig
+### LightevalTask
+[[autodoc]] tasks.lighteval_task.LightevalTask
+
+## PromptManager（提示管理器）
+
+[[autodoc]] tasks.prompt_manager.PromptManager
+
+## Registry（注册表）
+
+[[autodoc]] tasks.registry.Registry
+
+## Requests（请求）
+
+[[autodoc]] tasks.requests.Request
+
+[[autodoc]] tasks.requests.LoglikelihoodRequest
+
+[[autodoc]] tasks.requests.LoglikelihoodSingleTokenRequest
+
+[[autodoc]] tasks.requests.LoglikelihoodRollingRequest
+
+[[autodoc]] tasks.requests.GreedyUntilRequest
+
+[[autodoc]] tasks.requests.GreedyUntilMultiTurnRequest
+
+## Datasets（数据集）
+
+[[autodoc]] data.DynamicBatchDataset
+[[autodoc]] data.LoglikelihoodDataset
+[[autodoc]] data.LoglikelihoodSingleTokenDataset
+[[autodoc]] data.GenerativeTaskDataset
+[[autodoc]] data.GenerativeTaskDatasetNanotron
+[[autodoc]] data.GenDistributedSampler 
\ No newline at end of file
diff --git a/docs/source/zh/quicktour.mdx b/docs/source/zh/quicktour.mdx
new file mode 100644
index 000000000..bfb35e6a9
--- /dev/null
+++ b/docs/source/zh/quicktour.mdx
@@ -0,0 +1,179 @@
+<!--
+Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 快速上手
+
+
+> [!TIP]
+> 建议使用`--help`参数了解每个命令的可用选项。
+> `lighteval --help`
+
+Lighteval 支持多种命令方式：
+
+- `lighteval accelerate`: 使用[🤗 Accelerate](https://github.com/huggingface/accelerate)在CPU或多GPU环境评估模型
+- `lighteval nanotron`: 通过[⚡️ Nanotron](https://github.com/huggingface/nanotron)在分布式环境中评估模型
+- `lighteval vllm`: 基于[🚀 VLLM](https://github.com/vllm-project/vllm)在单个或多个GPU上评估模型
+- `lighteval endpoint`
+    - `inference-endpoint`: 使用[🔗 Inference Endpoint](https://huggingface.co/inference-endpoints/dedicated)评估模型
+    - `tgi`: 通过[🔗 Text Generation Inference](https://huggingface.co/docs/text-generation-inference/en/index)评估模型
+    - `openai`: 基于[🔗 OpenAI API](https://platform.openai.com/)评估模型
+
+## 基本用法
+
+要使用[🤗 Accelerate](https://github.com/huggingface/accelerate)在Truthful QA基准上评估`GPT-2`模型，运行：
+
+```bash
+lighteval accelerate \
+     "model_name=openai-community/gpt2" \
+     "leaderboard|truthfulqa:mc|0|0"
+```
+
+在这里，首先选择后端（`accelerate`、`nanotron`或`vllm`），然后指定要评估的模型和任务。
+
+模型参数采用`key1=value1,key2=value2`这样的语法格式。
+有效的键值对取决于所选后端，详细说明请参见下文的[后端配置](#后端配置)。
+
+任务规范的语法格式如下：
+
+```txt
+{套件}|{任务}|{少样本数量}|{0表示严格使用指定的少样本数量，1表示允许在上下文过长时自动截断}
+```
+
+当第四个值设为1时，lighteval会检查整个提示（包括少样本示例）是否超出任务或模型的上下文长度限制。
+如果超出限制，系统会自动减少少样本示例的数量。
+
+所有官方支持的任务可在[任务列表](available-tasks)和
+[extended文件夹](https://github.com/huggingface/lighteval/tree/main/src/lighteval/tasks/extended)中找到。
+社区贡献的任务则位于
+[community](https://github.com/huggingface/lighteval/tree/main/community_tasks)文件夹。
+关于任务实现的更多细节，如提示构建方式或使用的评估指标，请查阅
+[源文件](https://github.com/huggingface/lighteval/blob/main/src/lighteval/tasks/default_tasks.py)。
+
+Lighteval支持同时运行多个任务，可通过逗号分隔列表或指定配置文件路径来实现。
+配置文件应按照[examples/tasks/recommended_set.txt](https://github.com/huggingface/lighteval/blob/main/examples/tasks/recommended_set.txt)的结构编写。
+指定文件路径时应以`./`开头。
+
+```bash
+lighteval accelerate \
+     "model_name=openai-community/gpt2" \
+     ./path/to/lighteval/examples/tasks/recommended_set.txt
+# 或者使用逗号分隔的任务列表，例如："leaderboard|truthfulqa:mc|0|0,leaderboard|gsm8k|3|1"
+```
+
+## 在多GPU环境中评估模型
+
+#### 数据并行
+
+要在多GPU环境中评估模型，首先需要创建多GPU配置：
+
+```bash
+accelerate config
+```
+
+然后，可以使用8个GPU的数据并行方式来评估模型：
+
+```bash
+accelerate launch --multi_gpu --num_processes=8 -m \
+    lighteval accelerate \
+    "model_name=openai-community/gpt2" \
+    "leaderboard|truthfulqa:mc|0|0"
+```
+
+此外，可以通过可选参数`--override-batch-size`指定每个设备的批处理大小，实际总批处理大小为`override_batch_size * num_gpus`。
+
+#### 流水线并行
+
+要使用2个或更多GPU的流水线并行方式评估模型，运行：
+
+```bash
+lighteval accelerate \
+    "model_name=openai-community/gpt2,model_parallel=True" \
+    "leaderboard|truthfulqa:mc|0|0"
+```
+
+这会自动使用accelerate将模型分布在多个GPU上。
+
+> [!TIP]
+> 数据并行和流水线并行可以结合使用，只需设置`model_parallel=True`并使用accelerate进行数据分布。
+
+## 后端配置
+
+`model-args`参数接受一个模型参数列表字符串。可用参数取决于所选后端（vllm或accelerate）。
+
+### Accelerate
+
+- **pretrained** (str):
+    HuggingFace Hub模型ID或预训练模型路径，相当于HuggingFace `transformers` API中`from_pretrained`的`pretrained_model_name_or_path`参数。
+- **tokenizer** (Optional[str]): 用于分词的HuggingFace Hub分词器ID。
+- **multichoice_continuations_start_space** (Optional[bool]): 在多选项生成中是否在每个选项开头添加空格。
+    例如，对于问题"法国的首都是什么？"和选项"巴黎"、"伦敦"，
+    会被分词为"法国的首都是什么？ 巴黎"和"法国的首都是什么？ 伦敦"（注意每个选项前的空格）。
+    True表示添加空格，False表示去除空格，None表示不做处理。
+- **subfolder** (Optional[str]): 模型仓库中的子文件夹。
+- **revision** (str): 模型的版本。
+- **max_gen_toks** (Optional[int]): 生成的最大token数量。
+- **max_length** (Optional[int]): 生成输出的最大长度。
+- **add_special_tokens** (bool, optional, defaults to True): 是否向输入序列添加特殊token。
+   如果为`None`，对于seq2seq模型（如T5）默认值为`True`，对于因果模型默认为`False`。
+- **model_parallel** (bool, optional, defaults to None):
+    True/False: 强制使用或不使用`accelerate`库在多设备间分布大型模型。
+    默认为None，会比较进程数与GPU数：若进程数小于GPU数则启用模型并行，否则不启用。
+- **dtype** (Union[str, torch.dtype], optional, defaults to None):
+    如指定，则将模型权重转换为该数据类型。字符串会被转换为`torch.dtype`对象（如`float16` -> `torch.float16`）。
+    使用`dtype="auto"`可从模型权重自动推导类型。
+- **device** (Union[int, str]): 用于模型训练的设备。
+- **quantization_config** (Optional[BitsAndBytesConfig]): 模型量化配置，用于以量化精度加载原本为浮点的模型。4位和8位精度需要此配置。
+- **trust_remote_code** (bool): 加载模型时是否信任远程代码。
+
+### VLLM
+
+- **pretrained** (str): HuggingFace Hub模型ID或预训练模型路径。
+- **gpu_memory_utilization** (float): GPU内存使用比例。
+- **batch_size** (int): 模型训练的批处理大小。
+- **revision** (str): 模型版本。
+- **dtype** (str, None): 模型使用的数据类型。
+- **tensor_parallel_size** (int): 使用的张量并行单元数量。
+- **data_parallel_size** (int): 使用的数据并行单元数量。
+- **max_model_length** (int): 模型的最大长度。
+- **swap_space** (int): 每个GPU的CPU交换空间大小（GiB）。
+- **seed** (int): 模型使用的随机种子。
+- **trust_remote_code** (bool): 加载模型时是否信任远程代码。
+- **use_chat_template** (bool): 是否使用聊天模板。
+- **add_special_tokens** (bool): 是否向输入序列添加特殊token。
+- **multichoice_continuations_start_space** (bool): 在多选项生成中是否在每个选项开头添加空格。
+- **subfolder** (Optional[str]): 模型仓库中的子文件夹。
+
+## Nanotron
+
+要评估使用nanotron训练的模型：
+
+> [!WARNING]
+> Nanotron模型必须使用torchrun进行评估。
+
+
+```bash
+ torchrun --standalone --nnodes=1 --nproc-per-node=1  \
+ src/lighteval/__main__.py nanotron \
+ --checkpoint-config-path ../nanotron/checkpoints/10/config.yaml \
+ --lighteval-config-path examples/nanotron/lighteval_config_override_template.yaml
+ ```
+
+`nproc-per-node`参数应与`lighteval_config_override_template.yaml`文件中配置的并行设置匹配，
+即：`nproc-per-node = data_parallelism * tensor_parallelism * pipeline_parallelism`。 
\ No newline at end of file
diff --git a/docs/source/zh/saving-and-reading-results.mdx b/docs/source/zh/saving-and-reading-results.mdx
new file mode 100644
index 000000000..41eebd025
--- /dev/null
+++ b/docs/source/zh/saving-and-reading-results.mdx
@@ -0,0 +1,203 @@
+<!--
+Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 保存和读取结果
+
+## 本地保存结果
+
+Lighteval将自动在使用`--output-dir`选项设置的目录中保存结果和评估详情。结果将保存在`{output_dir}/results/{model_name}/results_{timestamp}.json`中。[这里有一个结果文件的示例](#结果文件示例)。输出路径可以是任何符合[fsspec](https://filesystem-spec.readthedocs.io/en/latest/index.html)的路径（本地、s3、hf hub、gdrive、ftp等）。
+
+要保存评估的详细信息，您可以使用`--save-details`选项。详细信息将保存在parquet文件`{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet`中。
+
+## 将结果推送到HuggingFace hub
+
+您可以将结果和评估详情推送到HuggingFace hub。要这样做，您需要设置`--push-to-hub`以及`--results-org`选项。结果将保存在名为`{results_org}/{model_org}/{model_name}`的数据集中。要推送详细信息，您需要设置`--save-details`选项。
+默认情况下，创建的数据集将是私有的，您可以通过设置`--public-run`选项使其公开。
+
+
+## 将结果推送到Tensorboard
+
+您可以通过设置`--push-to-tensorboard`将结果推送到Tensorboard。这将在使用`--results-org`选项设置的HF组织中创建一个Tensorboard仪表板。
+
+
+## 将结果推送到WandB
+
+您可以通过设置`--wandb`将结果推送到WandB。这将初始化一个WandB运行并记录结果。
+
+Wandb参数需要在您的环境变量中设置。
+
+```
+export WANDB_PROJECT="lighteval"
+```
+
+您可以在[wandb文档](https://docs.wandb.ai/guides/track/environment-variables/)中找到变量列表。
+
+
+## 如何加载和研究详细信息
+
+### 从本地详细信息文件加载
+
+```python
+from datasets import load_dataset
+import glob
+
+output_dir = "evals_doc"
+model_name = "HuggingFaceH4/zephyr-7b-beta"
+timestamp = "latest"
+task = "lighteval|gsm8k|0"
+
+if timestamp == "latest":
+    path = f"{output_dir}/details/{model_name}/*/"
+    timestamps = glob.glob(path)
+    timestamp = sorted(timestamps)[-1].split("/")[-2]
+    print(f"Latest timestamp: {timestamp}")
+
+details_path = f"{output_dir}/details/{model_name}/{timestamp}/details_{task}_{timestamp}.parquet"
+
+# 加载详细信息
+details = load_dataset("parquet", data_files=details_path, split="train")
+
+for detail in details:
+    print(detail)
+```
+
+### 从HuggingFace hub加载
+
+```python
+from datasets import load_dataset
+
+results_org = "SaylorTwift"
+model_name = "HuggingFaceH4/zephyr-7b-beta"
+sanitized_model_name = model_name.replace("/", "__")
+task = "lighteval|gsm8k|0"
+public_run = False
+
+dataset_path = f"{results_org}/details_{sanitized_model_name}{'_private' if not public_run else ''}"
+details = load_dataset(dataset_path, task.replace("|", "_"), split="latest")
+
+for detail in details:
+    print(detail)
+```
+
+
+详细信息文件包含以下列：
+- `choices`：在多选任务的情况下，向模型呈现的选项。
+- `gold`：黄金答案。
+- `gold_index`：黄金答案在选项列表中的索引。
+- `cont_tokens`：续写的令牌。
+- `example`：文本形式的输入。
+- `full_prompt`：完整提示，将输入到模型中。
+- `input_tokens`：完整提示的令牌。
+- `instruction`：给模型的指令。
+- `metrics`：为示例计算的指标。
+- `num_asked_few_shots`：为模型请求的少样本示例数量。
+- `num_effective_few_shots`：实际生效的少样本示例数量。
+- `padded`：输入是否被填充。
+- `pred_logits`：模型的logits。
+- `predictions`：模型的预测。
+- `specifics`：任务的具体细节。
+- `truncated`：输入是否被截断。
+
+
+## 结果文件示例
+
+```json
+{
+  "config_general": {
+    "lighteval_sha": "203045a8431bc9b77245c9998e05fc54509ea07f",
+    "num_fewshot_seeds": 1,
+    "override_batch_size": 1,
+    "max_samples": 1,
+    "job_id": "",
+    "start_time": 620979.879320166,
+    "end_time": 621004.632108041,
+    "total_evaluation_time_secondes": "24.752787875011563",
+    "model_name": "gpt2",
+    "model_sha": "607a30d783dfa663caf39e06633721c8d4cfcd7e",
+    "model_dtype": null,
+    "model_size": "476.2 MB"
+  },
+  "results": {
+    "lighteval|gsm8k|0": {
+      "qem": 0.0,
+      "qem_stderr": 0.0,
+      "maj@8": 0.0,
+      "maj@8_stderr": 0.0
+    },
+    "all": {
+      "qem": 0.0,
+      "qem_stderr": 0.0,
+      "maj@8": 0.0,
+      "maj@8_stderr": 0.0
+    }
+  },
+  "versions": {
+    "lighteval|gsm8k|0": 0
+  },
+  "config_tasks": {
+    "lighteval|gsm8k": {
+      "name": "gsm8k",
+      "prompt_function": "gsm8k",
+      "hf_repo": "gsm8k",
+      "hf_subset": "main",
+      "metric": [
+        {
+          "metric_name": "qem",
+          "higher_is_better": true,
+          "category": "3",
+          "use_case": "5",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": "mean"
+        },
+        {
+          "metric_name": "maj@8",
+          "higher_is_better": true,
+          "category": "5",
+          "use_case": "5",
+          "sample_level_fn": "compute",
+          "corpus_level_fn": "mean"
+        }
+      ],
+      "hf_avail_splits": [
+        "train",
+        "test"
+      ],
+      "evaluation_splits": [
+        "test"
+      ],
+      "few_shots_split": null,
+      "few_shots_select": "random_sampling_from_train",
+      "generation_size": 256,
+      "generation_grammar": null,
+      "stop_sequence": [
+        "Question="
+      ],
+      "num_samples": null,
+      "suite": [
+        "lighteval"
+      ],
+      "original_num_docs": 1319,
+      "effective_num_docs": 1,
+      "trust_dataset": true,
+      "must_remove_duplicate_docs": null,
+      "version": 0
+    }
+  }
+} 
\ No newline at end of file
diff --git a/docs/source/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx b/docs/source/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx
new file mode 100644
index 000000000..3045f171e
--- /dev/null
+++ b/docs/source/zh/use-huggingface-inference-endpoints-or-tgi-as-backend.mdx
@@ -0,0 +1,50 @@
+# 在服务器或容器上评估模型
+
+除了本地运行评估外，您还可以在兼容TGI的服务器或容器上部署模型，然后通过向服务器发送请求来执行评估。命令格式与前面相同，只需额外指定yaml配置文件的路径：
+
+```bash
+lighteval endpoint {tgi,inference-endpoint} \
+    "/path/to/config/file"\
+    <task parameters>
+```
+
+在服务器上运行评估时，支持以下两种类型的配置文件：
+
+### Hugging Face Inference Endpoints
+
+要使用HuggingFace的Inference Endpoints部署模型，您需要提供`endpoint_model.yaml`配置文件。Lighteval会自动部署端点，运行评估，并在完成后删除端点（除非您指定使用已启动的端点，这种情况下评估结束后不会删除端点）。
+
+__配置文件示例：__
+
+```yaml
+model_parameters:
+    reuse_existing: false # 如果设为true，将忽略实例中的所有参数，且评估后不删除端点
+# endpoint_name: "llama-2-7B-lighteval" # 名称必须使用小写字母，不含特殊字符
+    model_name: "meta-llama/Llama-2-7b-hf"
+    revision: "main"  # 默认为"main"
+    dtype: "float16" # 可选值包括"awq"、"eetq"、"gptq"、"4bit"或"8bit"（使用bitsandbytes）、"bfloat16"或"float16"
+    accelerator: "gpu"
+    region: "eu-west-1"
+    vendor: "aws"
+    instance_type: "nvidia-a10g"
+    instance_size: "x1"
+    framework: "pytorch"
+    endpoint_type: "protected"
+    namespace: null # 端点部署的命名空间，默认为当前用户的命名空间
+    image_url: null # （可选）指定部署端点时使用的docker镜像，例如使用支持更新模型的最新TGI容器
+    env_vars:
+        null # （可选）启动端点时设置的环境变量，例如：`MAX_INPUT_LENGTH: 2048`
+```
+
+### Text Generation Inference (TGI)
+
+如需使用已部署在TGI服务器上的模型（例如HuggingFace的无服务器推理服务）：
+
+__配置文件示例：__
+
+```yaml
+model_parameters:
+    inference_server_address: ""
+    inference_server_auth: null
+    model_id: null # 可选，仅当TGI容器以指向本地目录的model_id启动时需要
+``` 
\ No newline at end of file
diff --git a/docs/source/zh/use-inference-providers-as-backend.mdx b/docs/source/zh/use-inference-providers-as-backend.mdx
new file mode 100644
index 000000000..dbe6228a3
--- /dev/null
+++ b/docs/source/zh/use-inference-providers-as-backend.mdx
@@ -0,0 +1,41 @@
+# 使用Inference Providers作为后端
+
+Lighteval支持通过Hugging Face的Inference Providers在多种服务提供商上评估大语言模型，包括Black Forest Labs、Cerebras、Fireworks AI、Nebius、Together AI等。
+
+## 快速使用
+
+> [!WARNING]
+> 请务必设置您的HuggingFace API密钥。
+> 您可以通过`HF_TOKEN`环境变量或使用`huggingface-cli`命令来设置密钥。
+
+
+```bash
+lighteval endpoint inference-providers \
+    "model_name=deepseek-ai/DeepSeek-R1,provider=hf-inference" \
+    "lighteval|gsm8k|0|0"
+```
+
+## 使用配置文件
+
+您可以通过配置文件来定义要使用的模型和服务提供商。
+
+```bash
+lighteval endpoint inference-providers \
+    examples/model_configs/inference_providers.yaml \
+    "lighteval|gsm8k|0|0"
+```
+
+配置文件示例：
+
+```yaml
+model_parameters:
+  model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
+  provider: "novita"
+  timeout: null
+  proxies: null
+  parallel_calls_count: 10
+  generation_parameters:
+    temperature: 0.8
+    top_k: 10
+    max_new_tokens: 10000
+``` 
\ No newline at end of file
diff --git a/docs/source/zh/use-litellm-as-backend.mdx b/docs/source/zh/use-litellm-as-backend.mdx
new file mode 100644
index 000000000..22252acc9
--- /dev/null
+++ b/docs/source/zh/use-litellm-as-backend.mdx
@@ -0,0 +1,38 @@
+# 使用Litellm作为后端
+
+Lighteval支持使用litellm作为后端，这是一个统一接口工具，允许您以OpenAI格式调用各种LLM API（包括Bedrock、Huggingface、VertexAI、TogetherAI、Azure、OpenAI、Groq等）。
+
+关于可用API和兼容端点的详细文档可在[此处](https://docs.litellm.ai/docs/)查阅。
+
+## 快速使用
+
+```bash
+lighteval endpoint litellm \
+    "provider=openai,model_name=gpt-3.5-turbo" \
+    "lighteval|gsm8k|0|0" \
+    --use-chat-template
+```
+
+> [!WARNING]
+> 使用litellm时必须添加`--use-chat-template`参数才能正常工作。
+
+## 使用配置文件
+
+Litellm能够连接任何与OpenAI兼容的端点进行文本生成，例如，您可以评估在本地vllm服务器上运行的模型。
+
+要实现这一点，您需要使用类似以下的配置文件：
+
+```yaml
+model_parameters:
+    model_name: "openai/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
+    base_url: "您要使用的端点的URL"
+    api_key: "" # 根据需要删除或保留为空
+    generation_parameters:
+      temperature: 0.5
+      max_new_tokens: 256
+      stop_tokens: [""]
+      top_p: 0.9
+      seed: 0
+      repetition_penalty: 1.0
+      frequency_penalty: 0.0
+``` 
\ No newline at end of file
diff --git a/docs/source/zh/use-sglang-as-backend.mdx b/docs/source/zh/use-sglang-as-backend.mdx
new file mode 100644
index 000000000..e59fd0b63
--- /dev/null
+++ b/docs/source/zh/use-sglang-as-backend.mdx
@@ -0,0 +1,77 @@
+# 使用SGLang作为后端
+
+Lighteval支持使用`sglang`作为后端，这能显著提升评估速度。
+要启用此功能，只需在`model_args`中指定您希望传递给sglang的相关参数即可。
+
+```bash
+lighteval sglang \
+    "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \
+    "leaderboard|truthfulqa:mc|0|0"
+```
+
+`sglang`能够通过数据并行和张量并行方式在多GPU环境中分布式部署模型。
+您可以在`model_args`中设置相应参数来选择合适的并行策略。
+
+例如，如果您有4个GPU，可以使用`tp_size`参数实现张量并行：
+
+```bash
+lighteval sglang \
+    "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tp_size=4" \
+    "leaderboard|truthfulqa:mc|0|0"
+```
+
+或者，如果您的模型能够适合单个GPU，可以利用`dp_size`参数实现数据并行来加速评估过程：
+
+```bash
+lighteval sglang \
+    "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,dp_size=4" \
+    "leaderboard|truthfulqa:mc|0|0"
+```
+
+## 使用配置文件
+
+对于更高级的配置需求，您可以使用配置文件来定义模型参数。
+以下是一个示例配置文件，完整版本可在`examples/model_configs/sglang_model_config.yaml`中找到：
+
+```bash
+lighteval sglang \
+    "examples/model_configs/sglang_model_config.yaml" \
+    "leaderboard|truthfulqa:mc|0|0"
+```
+
+> [!TIP]
+> sglang的详细配置参数文档可在[此处](https://docs.sglang.ai/backend/server_arguments.html)查阅
+
+```yaml
+model_parameters:
+    model_name: "HuggingFaceTB/SmolLM-1.7B-Instruct"
+    dtype: "auto"
+    tp_size: 1
+    dp_size: 1
+    context_length: null
+    random_seed: 1
+    trust_remote_code: False
+    use_chat_template: False
+    device: "cuda"
+    skip_tokenizer_init: False
+    kv_cache_dtype: "auto"
+    add_special_tokens: True
+    pairwise_tokenization: False
+    sampling_backend: null
+    attention_backend: null
+    mem_fraction_static: 0.8
+    chunked_prefill_size: 4096
+    generation_parameters:
+      max_new_tokens: 1024
+      min_new_tokens: 0
+      temperature: 1.0
+      top_k: 50
+      min_p: 0.0
+      top_p: 1.0
+      presence_penalty: 0.0
+      repetition_penalty: 1.0
+      frequency_penalty: 0.0
+```
+
+> [!WARNING]
+> 如果遇到内存溢出(OOM)问题，您可能需要减小模型的上下文窗口大小，并降低`mem_fraction_static`和`chunked_prefill_size`参数值。 
\ No newline at end of file
diff --git a/docs/source/zh/use-vllm-as-backend.mdx b/docs/source/zh/use-vllm-as-backend.mdx
new file mode 100644
index 000000000..d41be7aa6
--- /dev/null
+++ b/docs/source/zh/use-vllm-as-backend.mdx
@@ -0,0 +1,124 @@
+# 使用VLLM作为后端
+
+Lighteval支持使用`vllm`作为后端，这能显著提升评估速度。
+要启用此功能，只需在`model_args`中指定您希望传递给vllm的相关参数即可。
+
+
+> [!TIP]
+> vllm引擎的详细参数文档可在[此处](https://docs.vllm.ai/en/latest/serving/engine_args.html)查阅
+
+```bash
+lighteval vllm \
+    "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16" \
+    "leaderboard|truthfulqa:mc|0|0"
+```
+
+`vllm`能够通过数据并行、流水线并行或张量并行方式在多GPU环境中分布式部署模型。
+您可以在`model_args`中设置相应参数来选择合适的并行策略。
+
+例如，如果您有4个GPU，可以通过`tensor_parallel_size`参数使用张量并行将模型拆分到多个GPU上：
+
+```bash
+export VLLM_WORKER_MULTIPROC_METHOD=spawn && lighteval vllm \
+    "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,tensor_parallel_size=4" \
+    "leaderboard|truthfulqa:mc|0|0"
+```
+
+或者，如果单个GPU就能容纳您的模型，可以通过`data_parallel_size`参数使用数据并行来加速评估过程：
+
+```bash
+lighteval vllm \
+    "model_name=HuggingFaceH4/zephyr-7b-beta,dtype=float16,data_parallel_size=4" \
+    "leaderboard|truthfulqa:mc|0|0"
+```
+
+## 使用配置文件
+
+对于更高级的配置需求，您可以使用配置文件来定义模型参数。
+以下是一个示例配置文件，完整版本可在`examples/model_configs/vllm_model_config.yaml`中找到：
+
+```bash
+lighteval vllm \
+    "examples/model_configs/vllm_model_config.yaml" \
+    "leaderboard|truthfulqa:mc|0|0"
+```
+
+```yaml
+model_parameters:
+    model_name: "HuggingFaceTB/SmolLM-1.7B-Instruct"
+    revision: "main"
+    dtype: "bfloat16"
+    tensor_parallel_size: 1
+    data_parallel_size: 1
+    pipeline_parallel_size: 1
+    gpu_memory_utilization: 0.9
+    max_model_length: 2048
+    swap_space: 4
+    seed: 1
+    trust_remote_code: True
+    use_chat_template: True
+    add_special_tokens: True
+    multichoice_continuations_start_space: True
+    pairwise_tokenization: True
+    subfolder: null
+    generation_parameters:
+      presence_penalty: 0.0
+      repetition_penalty: 1.0
+      frequency_penalty: 0.0
+      temperature: 1.0
+      top_k: 50
+      min_p: 0.0
+      top_p: 1.0
+      seed: 42
+      stop_tokens: null
+      max_new_tokens: 1024
+      min_new_tokens: 0
+```
+
+> [!WARNING]
+> 如果遇到内存溢出(OOM)问题，您可能需要减小模型的上下文窗口大小，并降低`gpu_memory_utilization`参数值。
+
+
+## 动态调整指标配置
+
+对于特殊类型的指标，如`Pass@K`或LiveCodeBench的`codegen`指标，有时需要传递特定参数值，例如生成样本数量。这可以在`yaml`配置文件中通过以下方式实现：
+
+```yaml
+model_parameters:
+    model_name: "HuggingFaceTB/SmolLM-1.7B-Instruct"
+    revision: "main"
+    dtype: "bfloat16"
+    tensor_parallel_size: 1
+    data_parallel_size: 1
+    pipeline_parallel_size: 1
+    gpu_memory_utilization: 0.9
+    max_model_length: 2048
+    swap_space: 4
+    seed: 1
+    trust_remote_code: True
+    use_chat_template: True
+    add_special_tokens: True
+    multichoice_continuations_start_space: True
+    pairwise_tokenization: True
+    subfolder: null
+    generation_parameters:
+      presence_penalty: 0.0
+      repetition_penalty: 1.0
+      frequency_penalty: 0.0
+      temperature: 1.0
+      top_k: 50
+      min_p: 0.0
+      top_p: 1.0
+      seed: 42
+      stop_tokens: null
+      max_new_tokens: 1024
+      min_new_tokens: 0
+metric_options: # 可选的指标参数
+    codegen_pass@1:16:
+        num_samples: 16
+```
+
+您可以通过在yaml文件中添加可选的`metric_options`键来自定义指标参数，
+使用的指标名称应与`Metric.metric_name`中定义的一致。
+在上例中，我们为任务中定义的`codegen_pass@1:16`指标将`num_samples`值设置为16，
+这会覆盖该指标原本的默认值。
\ No newline at end of file
diff --git a/docs/source/zh/using-the-python-api.mdx b/docs/source/zh/using-the-python-api.mdx
new file mode 100644
index 000000000..c74fca156
--- /dev/null
+++ b/docs/source/zh/using-the-python-api.mdx
@@ -0,0 +1,83 @@
+<!--
+Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 使用Python API
+
+您可以在自定义Python脚本中使用Lighteval。要评估模型，您需要设置
+[`~logging.evaluation_tracker.EvaluationTracker`]、[`~pipeline.PipelineParameters`]、
+[`model`](package_reference/models)或[`model_config`](package_reference/model_config)
+以及[`~pipeline.Pipeline`]。
+
+之后，只需运行流水线并保存结果。
+
+
+```python
+import lighteval
+from lighteval.logging.evaluation_tracker import EvaluationTracker
+from lighteval.models.vllm.vllm_model import VLLMModelConfig
+from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
+from lighteval.utils.utils import EnvConfig
+from lighteval.utils.imports import is_accelerate_available
+
+if is_accelerate_available():
+    from datetime import timedelta
+    from accelerate import Accelerator, InitProcessGroupKwargs
+    accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))])
+else:
+    accelerator = None
+
+def main():
+    evaluation_tracker = EvaluationTracker(
+        output_dir="./results",
+        save_details=True,
+        push_to_hub=True,
+        hub_results_org="您的用户名",
+    )
+
+    pipeline_params = PipelineParameters(
+        launcher_type=ParallelismManager.ACCELERATE,
+        env_config=EnvConfig(cache_dir="tmp/"),
+        custom_task_directory=None, # 如果使用自定义任务
+        # 一旦您的配置经过测试，删除以下2个参数
+        override_batch_size=1,
+        max_samples=10
+    )
+
+    model_config = VLLMModelConfig(
+            model_name="HuggingFaceH4/zephyr-7b-beta",
+            dtype="float16",
+            use_chat_template=True,
+    )
+
+    task = "helm|mmlu|5|1"
+
+    pipeline = Pipeline(
+        tasks=task,
+        pipeline_parameters=pipeline_params,
+        evaluation_tracker=evaluation_tracker,
+        model_config=model_config,
+    )
+
+    pipeline.evaluate()
+    pipeline.save_and_push_results()
+    pipeline.show_results()
+
+if __name__ == "__main__":
+    main() 
\ No newline at end of file