
Commit 9c15abf

Refactor fastapi-serving and add one card serving (#11581)

* init fastapi-serving one card
* mv api code to source
* update worker
* update for style-check
* add worker
* update bash
* update
* update worker name and add readme
* rename update
* rename to fastapi
1 parent 373ccbb commit 9c15abf

File tree

19 files changed, +583 −367 lines


docker/llm/inference/xpu/docker/Dockerfile (+1 −1)
```diff
@@ -61,7 +61,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
     cp -r ./ipex-llm/python/llm/example/GPU/vLLM-Serving/ ./vLLM-Serving && \
     # Download pp_serving
     mkdir -p /llm/pp_serving && \
-    cp ./ipex-llm/python/llm/example/GPU/Pipeline-Parallel-FastAPI/*.py /llm/pp_serving/ && \
+    cp ./ipex-llm/python/llm/example/GPU/Pipeline-Parallel-Serving/*.py /llm/pp_serving/ && \
     # Install related library of benchmarking
     pip install pandas omegaconf && \
     chmod +x /llm/benchmark.sh && \
```
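The hunk above only swaps the example directory name in the copy step. A minimal local sketch of that step, run against a mock checkout in a scratch directory (`serving.py` is a hypothetical placeholder file, not a name from the repository):

```shell
# Sketch of the updated Dockerfile copy step against a mock checkout.
# `serving.py` is a hypothetical placeholder; the real contents come
# from the ipex-llm repository.
cd "$(mktemp -d)"
mkdir -p ipex-llm/python/llm/example/GPU/Pipeline-Parallel-Serving
touch ipex-llm/python/llm/example/GPU/Pipeline-Parallel-Serving/serving.py
mkdir -p llm/pp_serving
cp ./ipex-llm/python/llm/example/GPU/Pipeline-Parallel-Serving/*.py llm/pp_serving/
ls llm/pp_serving
```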

python/llm/example/GPU/Pipeline-Parallel-FastAPI/pipeline_serving.py (−346)

This file was deleted.

python/llm/example/GPU/Pipeline-Parallel-FastAPI/README.md renamed to python/llm/example/GPU/Pipeline-Parallel-Serving/README.md (+10 −3)
````diff
@@ -50,7 +50,14 @@ pip install transformers==4.40.0
 pip install trl==0.8.1
 ```
 
-### 2. Run pipeline parallel serving on multiple GPUs
+### 2-1. Run ipex-llm serving on one GPU card
+
+```bash
+# Need to set NUM_GPUS=1 and MODEL_PATH in run.sh first
+bash run.sh
+```
+
+### 2-2. Run pipeline parallel serving on multiple GPUs
 
 ```bash
 # Need to set MODEL_PATH in run.sh first
````
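The one-card path reuses the same `run.sh` entry point with `NUM_GPUS=1`. A hedged sketch of editing those variables before launching — the variable names come from the README comments, but the mock `run.sh` contents, the model path, and the `sed` approach are illustrative assumptions:

```shell
# Mock run.sh holding the two variables the README says must be set first.
# File contents and the model path below are assumptions for illustration.
cd "$(mktemp -d)"
printf 'NUM_GPUS=2\nMODEL_PATH=/path/to/model\n' > run.sh
sed -i 's|^NUM_GPUS=.*|NUM_GPUS=1|' run.sh                    # one GPU card
sed -i 's|^MODEL_PATH=.*|MODEL_PATH=/llm/models/my-model|' run.sh
cat run.sh
```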
```diff
@@ -76,7 +83,7 @@ export http_proxy=
 export https_proxy=
 
 curl -X 'POST' \
-  'http://127.0.0.1:8000/generate/' \
+  'http://127.0.0.1:8000/generate' \
   -H 'accept: application/json' \
   -H 'Content-Type: application/json' \
   -d '{
```
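The hunk cuts off inside the `-d '{ … }'` payload. A hedged sketch of a complete request to the corrected endpoint (no trailing slash): the `"prompt"` and `"n_predict"` field names are assumptions not confirmed by this diff, and the sketch prints the `curl` invocation instead of sending it, so it runs without a live server:

```shell
# Build the request body, validate it as JSON, and print the curl command.
# Field names in BODY are assumptions, not taken from this commit.
URL='http://127.0.0.1:8000/generate'   # note: no trailing slash after this commit
BODY='{"prompt": "What is AI?", "n_predict": 32}'
echo "$BODY" | python3 -m json.tool > /dev/null && echo "payload ok"
printf "curl -X POST '%s' -H 'Content-Type: application/json' -d '%s'\n" "$URL" "$BODY"
```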
````diff
@@ -99,7 +106,7 @@ Please change the test url accordingly.
 
 ```bash
 # set t/c to the number of concurrencies to test full throughput.
-wrk -t1 -c1 -d5m -s ./wrk_script_1024.lua http://127.0.0.1:8000/generate/ --timeout 1m
+wrk -t1 -c1 -d5m -s ./wrk_script_1024.lua http://127.0.0.1:8000/generate --timeout 1m
 ```
 
 ## 5. Using the `benchmark.py` Script
````
