
Commit f25c7ce

LinPoly and kaiyux authored
doc: refactor trtllm-serve examples and doc (NVIDIA#3187)
Signed-off-by: Pengyun Lin <[email protected]>
Signed-off-by: Kaiyu Xie <[email protected]>
Co-authored-by: Kaiyu Xie <[email protected]>
1 parent bb6c338 commit f25c7ce

18 files changed: +228 −213 lines changed

docs/source/commands/trtllm-serve.rst (+27 −20)

@@ -34,30 +34,37 @@ For the full syntax and argument descriptions, refer to :ref:`syntax`.
 Inference Endpoints
 -------------------
 
-After you start the server, you can send inference requests as shown in the following examples:
+After you start the server, you can send inference requests through the Completions API and the Chat API, which are compatible with the corresponding OpenAI APIs.
 
-.. code-block:: bash
+Chat API
+~~~~~~~~
 
-    curl http://localhost:8000/v1/completions \
-        -H "Content-Type: application/json" \
-        -d '{
-            "model": <model>,
-            "prompt": "Where is New York?",
-            "max_tokens": 16,
-            "temperature": 0
-        }'
+You can query the Chat API with any HTTP client; a typical example uses the OpenAI Python client:
 
-.. code-block:: bash
+.. literalinclude:: ../../../examples/serve/openai_chat_client.py
+    :language: python
+    :linenos:
+
+Another example uses ``curl``:
+
+.. literalinclude:: ../../../examples/serve/curl_chat_client.sh
+    :language: bash
+    :linenos:
+
+Completions API
+~~~~~~~~~~~~~~~
+
+You can query the Completions API with any HTTP client; a typical example uses the OpenAI Python client:
+
+.. literalinclude:: ../../../examples/serve/openai_completion_client.py
+    :language: python
+    :linenos:
+
+Another example uses ``curl``:
 
-    curl http://localhost:8000/v1/chat/completions \
-        -H "Content-Type: application/json" \
-        -d '{
-            "model": <model>,
-            "messages":[{"role": "system", "content": "You are a helpful assistant."},
-                        {"role": "user", "content": "Where is New York?"}],
-            "max_tokens": 16,
-            "temperature": 0
-        }'
+.. literalinclude:: ../../../examples/serve/curl_completion_client.sh
+    :language: bash
+    :linenos:
 
 Metrics Endpoint
 ----------------
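Since the inline snippets are now pulled in from standalone example files, here is a minimal sketch of what such a Chat API query looks like with the OpenAI Python client. It assumes a server started with `trtllm-serve TinyLlama-1.1B-Chat-v1.0` on the default port 8000; the actual openai_chat_client.py referenced above may differ in detail.

```python
# Sketch of an OpenAI-compatible chat query against trtllm-serve.
# Assumptions: server at localhost:8000, model TinyLlama-1.1B-Chat-v1.0,
# and that the server ignores the API key (placeholder below).
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="not-used",  # placeholder; assumed to be ignored by the server
)

response = client.chat.completions.create(
    model="TinyLlama-1.1B-Chat-v1.0",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Where is New York?"},
    ],
    max_tokens=16,
    temperature=0,
)
print(response.choices[0].message.content)
```

The curl scripts referenced above send the same request bodies directly over HTTP.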

docs/source/llm-api-examples/customization.md renamed to docs/source/examples/customization.md (+1 −1)

@@ -1,4 +1,4 @@
-# Common Customizations
+# LLM Common Customizations
 
 ## Quantization

docs/source/llm-api-examples/llm_examples_index.template.rst_ renamed to docs/source/examples/llm_examples_index.template.rst_ (+1 −1)

@@ -1,4 +1,4 @@
-Examples
+%EXAMPLE_NAME%
 =================================
 
 .. toctree::

docs/source/helper.py (+75 −49)

@@ -1,5 +1,6 @@
 import logging
 import math
+from itertools import chain
 from pathlib import Path
 
 
@@ -20,69 +21,94 @@ def generate_title(filename: str) -> str:
 
 def generate_examples():
     root_dir = Path(__file__).parent.parent.parent.resolve()
+    ignore_list = {'__init__.py', 'quickstart_example.py'}
+    doc_dir = root_dir / "docs/source/examples"
 
-    # Source paths
-    script_dir = root_dir / "examples/llm-api"
-    # Look for both Python files and shell scripts
-    py_script_paths = sorted(
-        script_dir.glob("*.py"),
+    # Source paths for LLMAPI examples
+    llmapi_script_dir = root_dir / "examples/llm-api"
+    llmapi_script_paths = sorted(
+        llmapi_script_dir.glob("*.py"),
         # The autoPP example should be at the end since it is a preview example
         key=lambda x: math.inf if 'llm_auto_parallel' in x.stem else 0)
-
-    sh_script_paths = sorted(script_dir.glob("*.sh"))
-
-    # Combine both file types
-    script_paths = py_script_paths + sh_script_paths
-
-    ignore_list = {'__init__.py', 'quickstart_example.py'}
-    script_paths = [i for i in script_paths if i.name not in ignore_list]
-    # Destination paths
-    doc_dir = root_dir / "docs/source/llm-api-examples"
-    doc_paths = [doc_dir / f"{path.stem}.rst" for path in script_paths]
+    llmapi_script_paths += sorted(llmapi_script_dir.glob("*.sh"))
+
+    llmapi_script_paths = [
+        i for i in llmapi_script_paths if i.name not in ignore_list
+    ]
+    # Destination paths for LLMAPI examples
+    llmapi_doc_paths = [
+        doc_dir / f"{path.stem}.rst" for path in llmapi_script_paths
+    ]
+    llmapi_script_base_url = "https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api"
+
+    # Path for trtllm-serve examples
+    serve_script_dir = root_dir / "examples/serve"
+    serve_script_paths = sorted(
+        chain(serve_script_dir.glob("*.py"), serve_script_dir.glob("*.sh")))
+    serve_script_paths = [
+        i for i in serve_script_paths if i.name not in ignore_list
+    ]
+    serve_doc_paths = [
+        doc_dir / f"{path.stem}.rst" for path in serve_script_paths
+    ]
+    serve_script_base_url = "https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/serve"
 
     # Generate the example docs for each example script
-    for script_path, doc_path in zip(script_paths, doc_paths):
-        if script_path.name in ignore_list:
-            logging.warning(f"Ignoring file: {script_path.name}")
-            continue
-        script_url = f"https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/{script_path.name}"
-
-        # Determine language based on file extension
-        language = "python" if script_path.suffix == ".py" else "bash"
-
-        # Make script_path relative to doc_path and call it include_path
-        include_path = '../../..' / script_path.relative_to(root_dir)
-
-        # For Python files, use generate_title to extract title from comments
-        # For shell scripts, use filename as title
-        if script_path.suffix == ".py":
-            title = generate_title(script_path)
-        else:
-            # Create a title from the filename (remove extension and replace underscores with spaces)
-            title_text = script_path.stem.replace('_', ' ').title()
-            title = underline(title_text)
-
-        content = (f"{title}\n\n"
-                   f"Source {script_url}.\n\n"
-                   f".. literalinclude:: {include_path}\n"
-                   f"    :language: {language}\n"
-                   "    :linenos:\n")
-        with open(doc_path, "w+") as f:
-            f.write(content)
-
-    # Generate the toctree for the example scripts
+    def write_script(base_url, script_paths, doc_paths, extra_content=""):
+        for script_path, doc_path in zip(script_paths, doc_paths):
+            if script_path.name in ignore_list:
+                logging.warning(f"Ignoring file: {script_path.name}")
+                continue
+            script_url = f"{base_url}/{script_path.name}"
+
+            # Determine language based on file extension
+            language = "python" if script_path.suffix == ".py" else "bash"
+
+            # Make script_path relative to doc_path and call it include_path
+            include_path = '../../..' / script_path.relative_to(root_dir)
+
+            # For Python files, use generate_title to extract title from comments
+            # For shell scripts, use filename as title
+            if script_path.suffix == ".py":
+                title = generate_title(script_path)
+            else:
+                # Create a title from the filename (remove extension and replace underscores with spaces)
+                title_text = script_path.stem.replace('_', ' ').title()
+                title = underline(title_text)
+
+            content = (f"{title}\n\n"
+                       f"{extra_content}"
+                       f"Source {script_url}.\n\n"
+                       f".. literalinclude:: {include_path}\n"
+                       f"    :language: {language}\n"
+                       "    :linenos:\n")
+            with open(doc_path, "w+") as f:
+                f.write(content)
+
+    # Generate the toctree for LLMAPI example scripts
+    write_script(llmapi_script_base_url, llmapi_script_paths, llmapi_doc_paths)
     with open(doc_dir / "llm_examples_index.template.rst_") as f:
         examples_index = f.read()
 
     with open(doc_dir / "llm_api_examples.rst", "w+") as f:
-        example_docs = "\n    ".join(path.stem for path in script_paths)
-        f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs))
+        example_docs = "\n    ".join(path.stem for path in llmapi_script_paths)
+        f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs)\
+            .replace(r"%EXAMPLE_NAME%", "LLM Examples"))
+
+    # Generate the toctree for trtllm-serve example scripts
+    trtllm_serve_content = "Refer to the `trtllm-serve documentation <https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html>`_ for starting a server.\n\n"
+    write_script(serve_script_base_url, serve_script_paths, serve_doc_paths,
+                 trtllm_serve_content)
+    with open(doc_dir / "trtllm_serve_examples.rst", "w+") as f:
+        example_docs = "\n    ".join(path.stem for path in serve_script_paths)
+        f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs)\
+            .replace(r"%EXAMPLE_NAME%", "Online Serving Examples"))
 
     with open(doc_dir / "index.rst") as f:
         examples_index = f.read()
 
     with open(doc_dir / "index.rst", "w+") as f:
-        example_docs = "\n    ".join(path.stem for path in script_paths)
+        example_docs = "\n    ".join(path.stem for path in llmapi_script_paths)
         f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs))
 
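To make the new `write_script` flow concrete, here is a hedged sketch of the RST it would emit for the new `curl_chat_client.sh` example, given the `trtllm_serve_content` preamble. The exact output depends on the repository's `underline` helper; this is an illustration, not verified output.

```python
# Illustration (not verified output): the RST write_script would produce for
# examples/serve/curl_chat_client.sh when passed the trtllm-serve preamble.
expected_rst = (
    "Curl Chat Client\n"
    "================\n\n"  # assumes underline() pads '=' to the title length
    "Refer to the `trtllm-serve documentation "
    "<https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html>`_ "
    "for starting a server.\n\n"
    "Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/serve/curl_chat_client.sh.\n\n"
    ".. literalinclude:: ../../../examples/serve/curl_chat_client.sh\n"
    "    :language: bash\n"
    "    :linenos:\n"
)
print(expected_rst)
```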

docs/source/index.rst (+5 −4)

@@ -41,12 +41,13 @@ Welcome to TensorRT-LLM's Documentation!
 
 .. toctree::
     :maxdepth: 2
-    :caption: LLM API Examples
+    :caption: Examples
     :hidden:
 
-    llm-api-examples/index.rst
-    llm-api-examples/customization.md
-    llm-api-examples/llm_api_examples
+    examples/index.rst
+    examples/customization.md
+    examples/llm_api_examples
+    examples/trtllm_serve_examples
 
 
 .. toctree::

examples/apps/README.md (−39)

@@ -1,43 +1,4 @@
 # Apps examples with GenerationExecutor / LLM API
-## OpenAI API
-The `trtllm-serve` command launches an OpenAI compatible server which supports `v1/version`, `v1/completions` and `v1/chat/completions`. [openai_client.py](./openai_client.py) is a simple example using OpenAI client to query your model. To start the server, you can run
-```
-trtllm-serve <model>
-```
-Then you can query the APIs by running our example client or by `curl`.
-### v1/completions
-Query by `curl`:
-```
-curl http://localhost:8000/v1/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-        "model": <model_name>,
-        "prompt": "Where is New York?",
-        "max_tokens": 16,
-        "temperature": 0
-    }'
-```
-Query by our example:
-```
-python3 ./openai_client.py --prompt "Where is New York?" --api completions
-```
-### v1/chat/completions
-Query by `curl`:
-```
-curl http://localhost:8000/v1/chat/completions \
-    -H "Content-Type: application/json" \
-    -d '{
-        "model": <model_name>,
-        "messages":[{"role": "system", "content": "You are a helpful assistant."},
-                    {"role": "user", "content": "Where is New York?"}],
-        "max_tokens": 16,
-        "temperature": 0
-    }'
-```
-Query by our example:
-```
-python3 ./openai_client.py --prompt "Where is New York?" --api chat
-```
 ## Python chat
 
 [chat.py](./chat.py) provides a small examples to play around with your model. Before running, install additional requirements with ` pip install -r ./requirements.txt`. Then you can run it with

examples/apps/openai_client.py (−88)

This file was deleted.

examples/llm-api/README.md (+1 −1)

@@ -1,3 +1,3 @@
 # LLM API Examples
 
-Please refer to the [official documentation](https://nvidia.github.io/TensorRT-LLM/llm-api/) and [examples](https://nvidia.github.io/TensorRT-LLM/llm-api-examples/) for detailed information and usage guidelines regarding the LLM API.
+Please refer to the [official documentation](https://nvidia.github.io/TensorRT-LLM/llm-api/), [examples](https://nvidia.github.io/TensorRT-LLM/examples/llm_api_examples.html) and [customization](https://nvidia.github.io/TensorRT-LLM/examples/customization.html) for detailed information and usage guidelines regarding the LLM API.

examples/serve/README.md (+3)

@@ -0,0 +1,3 @@
+# Online Serving Examples with `trtllm-serve`
+
+We provide a CLI command, `trtllm-serve`, to launch a FastAPI server compatible with OpenAI APIs. Here are some client examples for querying the server; you can check the source code here, or refer to the [command documentation](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html) and [examples](https://nvidia.github.io/TensorRT-LLM/examples/trtllm_serve_examples.html) for detailed information and usage guidelines.
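As a counterpart to the chat sketch earlier, a Completions API query through the OpenAI Python client might look like the following. Again, this is a sketch assuming a local server and the TinyLlama model from the curl example below; the shipped openai_completion_client.py may differ.

```python
# Sketch of an OpenAI-compatible completions query against trtllm-serve.
# Assumptions: server at localhost:8000, model TinyLlama-1.1B-Chat-v1.0.
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="not-used",  # placeholder; assumed to be ignored by the server
)

response = client.completions.create(
    model="TinyLlama-1.1B-Chat-v1.0",
    prompt="Where is New York?",
    max_tokens=16,
    temperature=0,
)
print(response.choices[0].text)
```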

examples/serve/curl_chat_client.sh (+11)

@@ -0,0 +1,11 @@
+#! /usr/bin/env bash
+
+curl http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "TinyLlama-1.1B-Chat-v1.0",
+        "messages":[{"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": "Where is New York?"}],
+        "max_tokens": 16,
+        "temperature": 0
+    }'
