Commit 890b3bc

RoPE in float32 precision
1 parent 791d79d commit 890b3bc

2 files changed: +18 -23 lines changed

csrc/pos_encoding_kernels.cu (+15 -15)
@@ -8,7 +8,7 @@ __global__ void rotary_embedding_neox_kernel(
   const int64_t* __restrict__ positions,        // [num_tokens]
   scalar_t* __restrict__ query,                 // [num_tokens, num_heads, head_size]
   scalar_t* __restrict__ key,                   // [num_tokens, num_kv_heads, head_size]
-  const scalar_t* __restrict__ cos_sin_cache,   // [max_position, 2, rot_dim // 2]
+  const float* __restrict__ cos_sin_cache,      // [max_position, 2, rot_dim // 2]
   const int rot_dim,
   const int query_stride,
   const int key_stride,
@@ -18,7 +18,7 @@ __global__ void rotary_embedding_neox_kernel(
   // Each thread block is responsible for one token.
   const int token_idx = blockIdx.x;
   int64_t pos = positions[token_idx];
-  const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
+  const float* cache_ptr = cos_sin_cache + pos * rot_dim;

   const int embed_dim = rot_dim / 2;
   const int nq = num_heads * embed_dim;
@@ -33,13 +33,13 @@ __global__ void rotary_embedding_neox_kernel(
     const int out_x = token_idx * query_stride + head_idx * head_size + x_index;
     const int out_y = token_idx * query_stride + head_idx * head_size + y_index;

-    const scalar_t cos = __ldg(cache_ptr + x_index);
-    const scalar_t sin = __ldg(cache_ptr + y_index);
+    const float cos = __ldg(cache_ptr + x_index);
+    const float sin = __ldg(cache_ptr + y_index);

-    const scalar_t q_x = query[token_head + x_index];
-    const scalar_t q_y = query[token_head + y_index];
-    query[out_x] = q_x * cos - q_y * sin;
-    query[out_y] = q_y * cos + q_x * sin;
+    const float q_x = static_cast<float>(query[token_head + x_index]);
+    const float q_y = static_cast<float>(query[token_head + y_index]);
+    query[out_x] = static_cast<scalar_t>(q_x * cos - q_y * sin);
+    query[out_y] = static_cast<scalar_t>(q_y * cos + q_x * sin);
   }

   const int nk = num_kv_heads * embed_dim;
@@ -54,13 +54,13 @@ __global__ void rotary_embedding_neox_kernel(
     const int out_x = token_idx * key_stride + head_idx * head_size + x_index;
     const int out_y = token_idx * key_stride + head_idx * head_size + y_index;

-    const scalar_t cos = __ldg(cache_ptr + x_index);
-    const scalar_t sin = __ldg(cache_ptr + y_index);
+    const float cos = __ldg(cache_ptr + x_index);
+    const float sin = __ldg(cache_ptr + y_index);

-    const scalar_t k_x = key[token_head + x_index];
-    const scalar_t k_y = key[token_head + y_index];
-    key[out_x] = k_x * cos - k_y * sin;
-    key[out_y] = k_y * cos + k_x * sin;
+    const float k_x = static_cast<float>(key[token_head + x_index]);
+    const float k_y = static_cast<float>(key[token_head + y_index]);
+    key[out_x] = static_cast<scalar_t>(k_x * cos - k_y * sin);
+    key[out_y] = static_cast<scalar_t>(k_y * cos + k_x * sin);
   }
 }

@@ -93,7 +93,7 @@ void rotary_embedding_neox(
       positions.data_ptr<int64_t>(),
       query.data_ptr<scalar_t>(),
       key.data_ptr<scalar_t>(),
-      cos_sin_cache.data_ptr<scalar_t>(),
+      cos_sin_cache.data_ptr<float>(),
       rot_dim,
       query_stride,
       key_stride,

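Why a float32 cache matters here: float16 carries only about three significant decimal digits, so computing the cos/sin table and the rotation in the model dtype rounds both the angles and the rotated values. The kernel above instead reads the cache in float32, upcasts q and k, rotates in float32, and casts only the final result back to scalar_t. A minimal PyTorch sketch of the same per-pair math (a sketch only, not vLLM code; rotate_pair_fp32 and the tensor shapes are illustrative):

import torch

# Mirror of the kernel's new per-pair math: cos/sin stay float32, the q
# values are upcast, rotated in float32, and cast back to the storage dtype.
def rotate_pair_fp32(q_x, q_y, cos, sin):
    x32, y32 = q_x.float(), q_y.float()
    out_x = (x32 * cos - y32 * sin).to(q_x.dtype)
    out_y = (y32 * cos + x32 * sin).to(q_x.dtype)
    return out_x, out_y

torch.manual_seed(0)
q_x = torch.randn(1024, dtype=torch.float16)
q_y = torch.randn(1024, dtype=torch.float16)
theta = torch.rand(1024) * 2 * torch.pi   # float32 angles
cos, sin = theta.cos(), theta.sin()

# float64 reference vs. an all-float16 rotation vs. float32 accumulation.
ref = q_x.double() * cos.double() - q_y.double() * sin.double()
all_fp16 = (q_x * cos.half() - q_y * sin.half()).double()
fp32_acc = rotate_pair_fp32(q_x, q_y, cos, sin)[0].double()
print((all_fp16 - ref).abs().max(), (fp32_acc - ref).abs().max())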
vllm/model_executor/layers/attention.py (+3 -8)
@@ -259,18 +259,13 @@ def __init__(
         super().__init__(num_heads, head_size, scale, num_kv_heads)

         # Create the cos and sin cache.
-        inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2) / rotary_dim))
-        t = torch.arange(max_position).float()
-        freqs = torch.einsum("i,j -> ij", t, inv_freq.float())
+        inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim))
+        t = torch.arange(max_position, dtype=torch.float32)
+        freqs = torch.einsum("i,j -> ij", t, inv_freq)
         cos = freqs.cos()
         sin = freqs.sin()
         cache = torch.cat((cos, sin), dim=-1)

-        # FIXME(woosuk): This assumes that we configure the default dtype when
-        # initializing the model.
-        # TODO(woosuk): Make it more robust.
-        torch_dtype = torch.get_default_dtype()
-        cache = cache.to(torch_dtype)
         # Embedding size: [max_position, rotary_dim]
         self.register_buffer("cos_sin_cache", cache, persistent=False)

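After this change the cache is built and kept in float32 regardless of the model's default dtype, matching the data_ptr<float>() the kernel now expects. A standalone sketch of the resulting construction (the hyperparameter values below are illustrative, not taken from the commit):

import torch

rotary_dim, max_position, base = 128, 2048, 10000  # illustrative values

# Same construction as the diff above; every intermediate stays float32.
inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2, dtype=torch.float32) / rotary_dim))
t = torch.arange(max_position, dtype=torch.float32)
freqs = torch.einsum("i,j -> ij", t, inv_freq)
cache = torch.cat((freqs.cos(), freqs.sin()), dim=-1)

assert cache.shape == (max_position, rotary_dim)
assert cache.dtype == torch.float32  # no downcast to torch.get_default_dtype() anymore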