Skip to content

Commit 2de600d

Browse files
wallashssmgoin
authored andcommitted
[Hardware][Apple] Native support for macOS Apple Silicon (vllm-project#11696)
Signed-off-by: Wallas Santos <[email protected]> Co-authored-by: Michael Goin <[email protected]>
1 parent 188bd61 commit 2de600d

File tree

11 files changed

+209
-29
lines changed

11 files changed

+209
-29
lines changed

cmake/cpu_extension.cmake

Lines changed: 46 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
44
set(CMAKE_CXX_EXTENSIONS ON)
55
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
66

7+
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
8+
set(MACOSX_FOUND TRUE)
9+
endif()
10+
11+
712
#
813
# Define environment variables for special configurations
914
#
@@ -13,6 +18,9 @@ endif()
1318

1419
include_directories("${CMAKE_SOURCE_DIR}/csrc")
1520

21+
22+
set (ENABLE_NUMA TRUE)
23+
1624
#
1725
# Check the compile flags
1826
#
@@ -22,18 +30,28 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
2230
"-mf16c"
2331
)
2432
endif()
25-
list(APPEND CXX_COMPILE_FLAGS
26-
"-fopenmp"
27-
"-DVLLM_CPU_EXTENSION")
2833

29-
execute_process(COMMAND cat /proc/cpuinfo
30-
RESULT_VARIABLE CPUINFO_RET
31-
OUTPUT_VARIABLE CPUINFO)
34+
if(MACOSX_FOUND)
35+
list(APPEND CXX_COMPILE_FLAGS
36+
"-Xpreprocessor"
37+
"-fopenmp"
38+
"-DVLLM_CPU_EXTENSION")
39+
else()
40+
list(APPEND CXX_COMPILE_FLAGS
41+
"-fopenmp"
42+
"-DVLLM_CPU_EXTENSION")
43+
endif()
3244

33-
if (NOT CPUINFO_RET EQUAL 0)
34-
message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
45+
if (NOT MACOSX_FOUND)
46+
execute_process(COMMAND cat /proc/cpuinfo
47+
RESULT_VARIABLE CPUINFO_RET
48+
OUTPUT_VARIABLE CPUINFO)
49+
if (NOT CPUINFO_RET EQUAL 0)
50+
message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
51+
endif()
3552
endif()
3653

54+
3755
function (find_isa CPUINFO TARGET OUT)
3856
string(FIND ${CPUINFO} ${TARGET} ISA_FOUND)
3957
if(NOT ISA_FOUND EQUAL -1)
@@ -54,12 +72,17 @@ endfunction()
5472

5573
is_avx512_disabled(AVX512_DISABLED)
5674

57-
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
58-
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
59-
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
60-
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
61-
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
62-
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
75+
if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
76+
set(APPLE_SILICON_FOUND TRUE)
77+
else()
78+
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
79+
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
80+
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
81+
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
82+
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
83+
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
84+
endif()
85+
6386

6487
if (AVX512_FOUND AND NOT AVX512_DISABLED)
6588
list(APPEND CXX_COMPILE_FLAGS
@@ -103,6 +126,9 @@ elseif (ASIMD_FOUND)
103126
set(MARCH_FLAGS "-march=armv8.2-a+dotprod+fp16")
104127
endif()
105128
list(APPEND CXX_COMPILE_FLAGS ${MARCH_FLAGS})
129+
elseif(APPLE_SILICON_FOUND)
130+
message(STATUS "Apple Silicon Detected")
131+
set(ENABLE_NUMA OFF)
106132
else()
107133
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
108134
endif()
@@ -139,7 +165,12 @@ endif()
139165

140166
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
141167

142-
list(APPEND LIBS numa)
168+
if(ENABLE_NUMA)
169+
list(APPEND LIBS numa)
170+
else()
171+
message(STATUS "NUMA is disabled")
172+
add_compile_definitions(-DVLLM_NUMA_DISABLED)
173+
endif()
143174

144175
#
145176
# _C extension

csrc/cpu/cpu_types_arm.hpp

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,11 +91,68 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
9191
vst1q_f16(reinterpret_cast<__fp16*>(ptr) + 8, reg.val[1]);
9292
}
9393
}
94+
95+
// Note: below is the unrolled version of the following code:
96+
//
97+
// for (int i = 0; i < remainder; ++i) {
98+
// reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] =
99+
// vgetq_lane_f16(temp, i);
100+
// }
101+
//
102+
// For macOS build (Clang), the arm/neon intrinsics function
103+
// `vgetq_lane_f16` needs the parameter `i` to be constant at compile
104+
// time.
94105

95106
if (remainder > 0) {
96107
float16x8_t temp = reg.val[full_blocks];
97-
for (int i = 0; i < remainder; ++i) {
98-
reinterpret_cast<__fp16*>(ptr)[full_blocks * 8 + i] = vgetq_lane_f16(temp, i);
108+
__fp16* fp16_ptr = reinterpret_cast<__fp16*>(ptr);
109+
switch (remainder)
110+
{
111+
case 1:
112+
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
113+
break;
114+
case 2:
115+
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
116+
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
117+
break;
118+
case 3:
119+
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
120+
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
121+
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
122+
break;
123+
case 4:
124+
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
125+
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
126+
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
127+
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
128+
break;
129+
case 5:
130+
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
131+
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
132+
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
133+
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
134+
fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
135+
break;
136+
case 6:
137+
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
138+
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
139+
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
140+
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
141+
fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
142+
fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
143+
break;
144+
case 7:
145+
fp16_ptr[full_blocks * 8 + 0] = vgetq_lane_f16(temp, 0);
146+
fp16_ptr[full_blocks * 8 + 1] = vgetq_lane_f16(temp, 1);
147+
fp16_ptr[full_blocks * 8 + 2] = vgetq_lane_f16(temp, 2);
148+
fp16_ptr[full_blocks * 8 + 3] = vgetq_lane_f16(temp, 3);
149+
fp16_ptr[full_blocks * 8 + 4] = vgetq_lane_f16(temp, 4);
150+
fp16_ptr[full_blocks * 8 + 5] = vgetq_lane_f16(temp, 5);
151+
fp16_ptr[full_blocks * 8 + 6] = vgetq_lane_f16(temp, 6);
152+
break;
153+
154+
default:
155+
break;
99156
}
100157
}
101158
}

csrc/cpu/utils.cpp

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,22 @@
1-
#include <numa.h>
2-
#include <unistd.h>
3-
#include <string>
4-
#include <sched.h>
1+
#ifndef VLLM_NUMA_DISABLED
2+
#include <numa.h>
3+
#include <unistd.h>
4+
#include <string>
5+
#include <sched.h>
6+
#endif
57

68
#include "cpu_types.hpp"
79

10+
#ifdef VLLM_NUMA_DISABLED
11+
std::string init_cpu_threads_env(const std::string& cpu_ids) {
12+
return std::string(
13+
"Warning: NUMA is not enabled in this build. `init_cpu_threads_env` has "
14+
"no effect to setup thread affinity.");
15+
}
16+
17+
#endif
18+
19+
#ifndef VLLM_NUMA_DISABLED
820
std::string init_cpu_threads_env(const std::string& cpu_ids) {
921
bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
1022
TORCH_CHECK(omp_cpu_mask->size > 0);
@@ -57,7 +69,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
5769
omp_lock_t writelock;
5870
omp_init_lock(&writelock);
5971

60-
#pragma omp parallel for schedule(static, 1)
72+
#pragma omp parallel for schedule(static, 1)
6173
for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
6274
cpu_set_t mask;
6375
CPU_ZERO(&mask);
@@ -88,3 +100,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
88100

89101
return ss.str();
90102
}
103+
#endif
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
(installation-apple)=
2+
3+
# Installation for macOS
4+
5+
vLLM has experimental support for macOS with Apple Silicon. For now, users shall build from the source vLLM to natively run on macOS. For more details, like running on vLLM in a docker container, see [ARM CPU Documentation](installation-arm)
6+
7+
Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
8+
9+
## Requirements
10+
11+
- **Operating System**: `macOS Sonoma` or later
12+
- **SDK** `XCode 15.4` or later with Command Line Tools
13+
- **Compilers**: `Apple Clang >= 15.0.0`
14+
15+
<!-- (arm-backend-quick-start-dockerfile)= -->
16+
17+
## Build and installation
18+
19+
After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source.
20+
21+
```
22+
$ git clone https://github.com/vllm-project/vllm.git
23+
$ cd vllm
24+
$ pip install -r requirements-cpu.txt
25+
$ pip install -e .
26+
```
27+
28+
```{note}
29+
On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
30+
```
31+
32+
33+
34+
## Troubleshooting
35+
36+
If the build has error like the following snippet where standard C++ headers cannot be found, try to remove and reinstall your
37+
[Command Line Tools for Xcode](https://developer.apple.com/download/all/).
38+
39+
```
40+
[...] fatal error: 'map' file not found
41+
1 | #include <map>
42+
| ^~~~~
43+
1 error generated.
44+
[2/8] Building CXX object CMakeFiles/_C.dir/csrc/cpu/pos_encoding.cpp.o
45+
46+
[...] fatal error: 'cstddef' file not found
47+
10 | #include <cstddef>
48+
| ^~~~~~~~~
49+
1 error generated.
50+
```
51+

docs/source/getting_started/installation/cpu-arm.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# Installation for ARM CPUs
44

5-
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
5+
vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM (which also apply to Apple Silicon, see [Installation for macOS](#installation-apple) for more). For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering:
66

77
- CPU backend inference capabilities
88
- Relevant runtime environment variables
@@ -20,7 +20,7 @@ Contents:
2020
## Requirements
2121

2222
- **Operating System**: Linux or macOS
23-
- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended)
23+
- **Compilers**: `gcc/g++ >= 12.3.0` (optional, but recommended) or `Apple Clang >= 15.0.0` for macOS
2424
- **Instruction Set Architecture (ISA)**: NEON support is required
2525

2626
(arm-backend-quick-start-dockerfile)=

docs/source/getting_started/installation/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ gpu-cuda
1111
gpu-rocm
1212
cpu-x86
1313
cpu-arm
14+
cpu-apple
1415
hpu-gaudi
1516
tpu
1617
xpu

requirements-cpu.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
-r requirements-common.txt
33

44
# Dependencies for CPUs
5-
torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64"
6-
torch==2.5.1; platform_machine == "aarch64"
5+
torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin"
6+
torch==2.5.1; platform_machine == "aarch64" or platform_system == "Darwin"
77
torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch
8-
datasets # for benchmark scripts
8+
datasets # for benchmark scripts

setup.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,14 @@ def load_module_from_path(module_name, path):
3434

3535
VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
3636

37-
if not sys.platform.startswith("linux"):
37+
if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
3838
logger.warning(
39-
"vLLM only supports Linux platform (including WSL). "
39+
"VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
40+
VLLM_TARGET_DEVICE = "cpu"
41+
elif not (sys.platform.startswith("linux")
42+
or sys.platform.startswith("darwin")):
43+
logger.warning(
44+
"vLLM only supports Linux platform (including WSL) and MacOS."
4045
"Building on %s, "
4146
"so vLLM may not be able to run correctly", sys.platform)
4247
VLLM_TARGET_DEVICE = "empty"

vllm/config.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import hashlib
55
import json
66
import os
7+
import sys
78
import warnings
89
from contextlib import contextmanager
910
from dataclasses import dataclass, field, replace
@@ -2259,6 +2260,17 @@ def _get_and_verify_dtype(
22592260
"supported for POWERPC.")
22602261
torch_dtype = torch.bfloat16
22612262

2263+
# TODO: change this condition to check if the platform support bf16
2264+
# instead of checking the OS. For instance M2 shall supports bf16
2265+
# already. But we need to modify `cpu_extension.cmake` to activate
2266+
# the feature in the build.
2267+
if (current_platform.is_cpu() and sys.platform.startswith("darwin")
2268+
and current_platform.get_cpu_architecture()
2269+
== CpuArchEnum.ARM and config_dtype == torch.bfloat16):
2270+
logger.info("For macOS with Apple Silicon, currently bfloat16 "
2271+
"is not supported. Setting dtype to float16.")
2272+
torch_dtype = torch.float16
2273+
22622274
if current_platform.is_hpu() and config_dtype == torch.float16:
22632275
logger.info(
22642276
"For HPU, we cast models to bfloat16 instead of"

vllm/entrypoints/openai/api_server.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import re
88
import signal
99
import socket
10+
import sys
1011
import tempfile
1112
import uuid
1213
from argparse import Namespace
@@ -805,6 +806,8 @@ def signal_handler(*_) -> None:
805806
ssl_certfile=args.ssl_certfile,
806807
ssl_ca_certs=args.ssl_ca_certs,
807808
ssl_cert_reqs=args.ssl_cert_reqs,
809+
# Workaround to work on macOS
810+
fd=sock.fileno() if sys.platform.startswith("darwin") else None,
808811
**uvicorn_kwargs,
809812
)
810813

vllm/utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,13 @@ def get_open_port() -> int:
524524

525525

526526
def find_process_using_port(port: int) -> Optional[psutil.Process]:
527+
# TODO: We can not check for running processes with network
528+
# port on macOS. Therefore, we can not have a full graceful shutdown
529+
# of vLLM. For now, let's not look for processes in this case.
530+
# Ref: https://www.florianreinhard.de/accessdenied-in-psutil/
531+
if sys.platform.startswith("darwin"):
532+
return None
533+
527534
for conn in psutil.net_connections():
528535
if conn.laddr.port == port:
529536
try:

0 commit comments

Comments
 (0)