ROCm
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
Lines changed: 7 additions & 0 deletions
diff --git a/benchmarks/test_accuracy.py b/benchmarks/test_accuracy.py
Lines changed: 75 additions & 38 deletions
diff --git a/vllm/__init__.py b/vllm/__init__.py
Lines changed: 0 additions & 2 deletions
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
Lines changed: 53 additions & 2 deletions
@@ -109,11 +109,18 @@ ARG COMMON_WORKDIR
 COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
 COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
 
+RUN git clone --recursive https://github.com/ROCm/aiter.git
+RUN cd /app/aiter && pip install -r requirements.txt && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py develop && pip show aiter
+
+
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 ENV TOKENIZERS_PARALLELISM=false
 
 # Performance environment variable.
 ENV HIP_FORCE_DEV_KERNARG=1
 
+# Uncomment to enable AITER. Make sure this setting exists only on the aiter branch.
+# ENV VLLM_USE_AITER=1
+
 CMD ["/bin/bash"]
 
@@ -1,45 +1,82 @@
-import time
+import argparse
+import dataclasses
 
+# from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
 
 
-def main():
-    llm = LLM(
-        '/data/AI-ModelScope/Mixtral-8x7B-Instruct-v0___1/',
-        tensor_parallel_size=1,
-        #quantization="serenity",
-        dtype='float16',
-        #swap_space=16,
-        #enforce_eager=True,
-        #kv_cache_dtype="fp8",
-        #quantization="fp8",
-        #quantized_weights_path="/quantized/quark/llama.safetensors",
-        #worker_use_ray=True,
-        #trust_remote_code=True,
-        #distributed_executor_backend="mp",
-    )
-    batch_size = 5
-    max_tokens = 256
-    prompt = """The sun is a"""
-    sampling_params = SamplingParams(temperature=0,
-                                     top_p=0.95,
-                                     max_tokens=max_tokens)
-
-    start_time = time.perf_counter()
-    outs = llm.generate([prompt] * batch_size, sampling_params=sampling_params)
-    end_time = time.perf_counter()
-    elapsed_time = end_time - start_time
-
-    out_lengths = [len(x.token_ids) for out in outs for x in out.outputs]
-    num_tokens = sum(out_lengths)
-
-    print(
-        f"{num_tokens} tokens. {num_tokens / batch_size} on average. {num_tokens / elapsed_time:.2f} tokens/s. {elapsed_time} seconds"  # noqa: E501
+def main(args: argparse.Namespace):
+    print(args)
+
+    engine_args = EngineArgs.from_cli_args(args)
+
+    # NOTE(woosuk): If the request cannot be processed in a single batch,
+    # the engine will automatically process the request in multiple batches.
+    llm = LLM(**dataclasses.asdict(engine_args))
+
+    sampling_params = SamplingParams(
+        n=args.n,
+        temperature=1.0,
+        top_p=1.0,
+        ignore_eos=True,
+        max_tokens=args.output_len,
     )
-    for out in outs:
-        print("===========")
-        print(out.outputs[0].text)
+    print(sampling_params)
+
+    # tokenizer = AutoTokenizer.from_pretrained(engine_args.model)
+    # inputs = tokenizer('Hello, world!', return_tensors='pt').input_ids
+    inputs = [
+        'Where is the capital of China?',
+        'The capital of Russia is ',
+        'The CEO of DeepSeek is ',
+        'The future of AI is',
+    ] * 32
+    outputs = llm.generate(inputs, sampling_params)
+    for i, output in enumerate(outputs):
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt {i}: {prompt!r}, Generated text: {generated_text!r}")
+    # print(tokenizer.decode(outputs[0]))
+
 
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser(
+        description='Benchmark the latency of processing a single batch of '
+        'requests till completion.')
+    parser.add_argument('--input-len', type=int, default=32)
+    parser.add_argument('--output-len', type=int, default=128)
+    parser.add_argument('--batch-size', type=int, default=8)
+    parser.add_argument('--n',
+                        type=int,
+                        default=1,
+                        help='Number of generated sequences per prompt.')
+    parser.add_argument('--use-beam-search', action='store_true')
+    parser.add_argument('--num-iters-warmup',
+                        type=int,
+                        default=10,
+                        help='Number of iterations to run for warmup.')
+    parser.add_argument('--num-iters',
+                        type=int,
+                        default=30,
+                        help='Number of iterations to run.')
+    parser.add_argument(
+        '--profile',
+        action='store_true',
+        help='profile the generation process of a single batch')
+    parser.add_argument(
+        '--profile-result-dir',
+        type=str,
+        default=None,
+        help=('path to save the pytorch profiler output. Can be visualized '
+              'with ui.perfetto.dev or Tensorboard.'))
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the latency results in JSON format.')
 
-if __name__ == "__main__":
-    main()
+    parser = EngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    main(args)
@@ -6,7 +6,6 @@
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
-from vllm.entrypoints.fast_sync_llm import FastSyncLLM
 from vllm.entrypoints.llm import LLM
 from vllm.executor.ray_utils import initialize_ray_cluster
 from vllm.inputs import PromptType, TextPrompt, TokensPrompt
@@ -38,7 +37,6 @@
     "__version__",
     "__version_tuple__",
     "LLM",
-    "FastSyncLLM",
     "ModelRegistry",
     "PromptType",
     "TextPrompt",
 
@@ -11,8 +11,14 @@
                                               AttentionMetadata, AttentionType)
 from vllm.attention.backends.utils import (CommonAttentionState,
                                            CommonMetadataBuilder)
-from vllm.attention.ops.paged_attn import (PagedAttention,
-                                           PagedAttentionMetadata)
+
+if envs.VLLM_USE_AITER_PAGED_ATTN:
+    from vllm.attention.ops.paged_attn_ater import (PagedAttention,
+                                                    PagedAttentionMetadata)
+else:
+    from vllm.attention.ops.paged_attn import (PagedAttention,
+                                               PagedAttentionMetadata)
+
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 
@@ -460,6 +466,9 @@ def __init__(
         logits_soft_cap: Optional[float] = None,
         attn_type: str = AttentionType.DECODER,
     ) -> None:
+        self.k_scale = torch.tensor([1.0], dtype=torch.float32)
+        self.v_scale = torch.tensor([1.0], dtype=torch.float32)
+        self.init_kv_scales = False
         if blocksparse_params is not None:
             raise ValueError(
                 "ROCmFlashAttention does not support blocksparse attention.")
@@ -609,6 +618,25 @@ def forward(
         else:
             assert value is None
 
+        if (envs.VLLM_USE_AITER_PAGED_ATTN and kv_cache.dtype.itemsize == 1
+                and self.init_kv_scales is False
+                and kv_cache.shape != torch.Size([0])):
+            num_blocks = kv_cache.shape[1]
+            block_size = kv_cache.shape[2] // (self.num_kv_heads *
+                                               self.head_size)
+            self.k_scale = torch.ones(
+                (self.num_kv_heads, num_blocks * block_size),
+                dtype=torch.float32,
+                device=kv_cache.device)
+            self.v_scale = torch.ones(
+                (self.num_kv_heads, num_blocks * block_size),
+                dtype=torch.float32,
+                device=kv_cache.device)
+            self.init_kv_scales = True
+            # Expose the new scales on the layer for the decode call.
+            layer._k_scale = self.k_scale
+            layer._v_scale = self.v_scale
+
         if self.attn_type != AttentionType.ENCODER and kv_cache.numel() > 0:
             key_cache, value_cache = PagedAttention.split_kv_cache(
                 kv_cache, self.num_kv_heads, self.head_size)
@@ -780,6 +808,29 @@ def forward(
             use_custom = _use_rocm_custom_paged_attention(
                 decode_query.dtype, head_size, block_size, gqa_ratio,
                 decode_meta.max_decode_seq_len)
+            if envs.VLLM_USE_AITER_PAGED_ATTN:
+                out = output[num_prefill_tokens:]
+                PagedAttention.forward_decode(
+                    decode_query,
+                    key_cache,
+                    value_cache,
+                    decode_meta.block_tables
+                    if self.attn_type != AttentionType.ENCODER_DECODER else
+                    decode_meta.cross_block_tables,
+                    decode_meta.seq_lens_tensor
+                    if self.attn_type != AttentionType.ENCODER_DECODER else
+                    decode_meta.encoder_seq_lens_tensor,
+                    decode_meta.max_decode_seq_len
+                    if self.attn_type != AttentionType.ENCODER_DECODER else
+                    decode_meta.max_encoder_seq_len,
+                    self.kv_cache_dtype,
+                    self.num_kv_heads,
+                    self.scale,
+                    self.alibi_slopes,
+                    layer._k_scale,
+                    layer._v_scale,
+                    out=out)
+                return output.view(-1, self.num_heads * self.head_size)
             if use_custom:
                 max_seq_len = (decode_meta.max_decode_seq_len if
                                self.attn_type != AttentionType.ENCODER_DECODER