|
| 1 | +import triton # type: ignore[import] |
| 2 | +import triton.language as tl # type: ignore[import] |
| 3 | + |
| 4 | +from flashinfer.triton.kernels.quant import scale_and_clamp |
| 5 | + |
| 6 | + |
| 7 | +@triton.jit |
| 8 | +def rms_norm_kernel( |
| 9 | + n, |
| 10 | + b, |
| 11 | + x_ptr, |
| 12 | + x_stride, |
| 13 | + x_scale_ptr, |
| 14 | + r_ptr, |
| 15 | + r_stride, |
| 16 | + w_ptr, |
| 17 | + o_ptr, |
| 18 | + o_stride, |
| 19 | + o_scale_ptr, |
| 20 | + EPS: tl.constexpr, |
| 21 | + BLOCK_SIZE: tl.constexpr, |
| 22 | + HAS_IN_SCALE: tl.constexpr, |
| 23 | + HAS_OUT_SCALE: tl.constexpr, |
| 24 | + HAS_OUTPUT: tl.constexpr, |
| 25 | + HAS_RESIDUAL: tl.constexpr, |
| 26 | +) -> None: |
| 27 | + i = tl.program_id(axis=0).to(tl.int64) |
| 28 | + |
| 29 | + # If r_ptr is present, the input to norm is x + r. |
| 30 | + x_row = x_ptr + i * x_stride |
| 31 | + o_row = o_ptr + i * o_stride if HAS_OUTPUT else x_row |
| 32 | + r_row = r_ptr + i * r_stride if HAS_RESIDUAL else None |
| 33 | + |
| 34 | + x_scale = tl.load(x_scale_ptr) if HAS_IN_SCALE else None |
| 35 | + o_scale = tl.load(o_scale_ptr) if HAS_OUT_SCALE else None |
| 36 | + |
| 37 | + # Find the root mean square for the given row. |
| 38 | + square_sum = tl.zeros([BLOCK_SIZE], dtype=tl.float32) |
| 39 | + for off in range(0, n, BLOCK_SIZE): |
| 40 | + offsets = off + tl.arange(0, BLOCK_SIZE) |
| 41 | + mask = offsets < n |
| 42 | + |
| 43 | + x = tl.load(x_row + offsets, mask=mask, other=0.0).to(tl.float32) |
| 44 | + if HAS_IN_SCALE: |
| 45 | + x *= x_scale |
| 46 | + |
| 47 | + if HAS_RESIDUAL: |
| 48 | + r = tl.load(r_row + offsets, mask=mask, other=0.0).to(tl.float32) |
| 49 | + x += r |
| 50 | + tl.store(r_row + offsets, x, mask=mask) |
| 51 | + |
| 52 | + square_sum += x * x |
| 53 | + |
| 54 | + # Compute the norm. |
| 55 | + rms = tl.rsqrt(tl.sum(square_sum) / n + EPS) |
| 56 | + |
| 57 | + # x[i] = r[i] + x[i] / rms * weight[i] |
| 58 | + output_dtype = o_row.dtype.element_ty |
| 59 | + for off in range(0, n, BLOCK_SIZE): |
| 60 | + offsets = off + tl.arange(0, BLOCK_SIZE) |
| 61 | + mask = offsets < n |
| 62 | + |
| 63 | + if HAS_RESIDUAL: |
| 64 | + x = tl.load(r_row + offsets, mask=mask).to(tl.float32) |
| 65 | + else: |
| 66 | + x = tl.load(x_row + offsets, mask=mask).to(tl.float32) |
| 67 | + if HAS_IN_SCALE: |
| 68 | + x *= x_scale |
| 69 | + |
| 70 | + w = tl.load(w_ptr + offsets, mask=mask).to(tl.float32) |
| 71 | + |
| 72 | + # Multiply x with RMS on float32, but cast to the narrower type before |
| 73 | + # multiplying with the weights to replicate the HF behaviour precisely. |
| 74 | + result = w * (x * rms) |
| 75 | + if HAS_OUT_SCALE: |
| 76 | + result = scale_and_clamp(result, o_scale, output_dtype) |
| 77 | + tl.store(o_row + offsets, result, mask=mask) |
0 commit comments