From bf0ec124c62a4e23b5035ed9fde065a87a839065 Mon Sep 17 00:00:00 2001
From: ckl117
Date: Thu, 6 Nov 2025 21:08:34 +0800
Subject: [PATCH 1/2] add gelu_tanh

---
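Note for reviewers: the kernel added below implements the *gated* form of
tanh-approximated GELU. The input is laid out as [num_tokens, 2 * d]; the
first half of each row goes through GELU(tanh) and is multiplied elementwise
by the second half. A minimal pure-Paddle sketch of the same computation,
mirroring native_gelu_tanh in the new test (the helper name here is ours, for
illustration only):

import math

import paddle


def gated_gelu_tanh_reference(x: paddle.Tensor) -> paddle.Tensor:
    # x: [num_tokens, 2 * d]; split into the activation half and the gate half.
    d = x.shape[-1] // 2
    x1, x2 = x[:, :d], x[:, d:]
    # 0.7978845608028654f in the CUDA kernel is sqrt(2 / pi).
    act = 0.5 * x1 * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (x1 + 0.044715 * x1 * x1 * x1)))
    return act * x2
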
 custom_ops/gpu_ops/cpp_extensions.cc |  4 ++
 custom_ops/gpu_ops/gelu_tanh.cu      | 91 ++++++++++++++++++++++++++++
 custom_ops/setup_ops.py              |  1 +
 tests/operators/test_gelu_tanh.py    | 44 ++++++++++++++
 4 files changed, 140 insertions(+)
 create mode 100644 custom_ops/gpu_ops/gelu_tanh.cu
 create mode 100644 tests/operators/test_gelu_tanh.py

diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc
index 979a94eb7c6..00b516e140f 100644
--- a/custom_ops/gpu_ops/cpp_extensions.cc
+++ b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -1059,6 +1059,8 @@ std::vector<paddle::Tensor> UpdateAttnMaskOffsets(
     const paddle::Tensor& decode_states,
     const paddle::Tensor& mask_rollback);
 
+void gelu_tanh(paddle::Tensor& output, paddle::Tensor& input, bool enable_pdl);
+
 PYBIND11_MODULE(fastdeploy_ops, m) {
   m.def("get_expert_token_num",
         &GetExpertTokenNum,
@@ -1648,4 +1650,6 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
   m.def("update_attn_mask_offsets",
         &UpdateAttnMaskOffsets,
         "update attention mask");
+
+  m.def("gelu_tanh", &gelu_tanh, "gelu_tanh function");
 }
diff --git a/custom_ops/gpu_ops/gelu_tanh.cu b/custom_ops/gpu_ops/gelu_tanh.cu
new file mode 100644
index 00000000000..5240f61c5f4
--- /dev/null
+++ b/custom_ops/gpu_ops/gelu_tanh.cu
@@ -0,0 +1,91 @@
+#include "helper.h"
+
+__forceinline__ __device__ float tanh_ptx(float x) {
+  float y;
+  asm volatile("tanh.approx.f32 %0, %1;" : "=f"(y) : "f"(x));
+  return y;
+}
+
+__device__ __forceinline__ float gelu_tanh_func(const float& val) {
+  const float cdf =
+      0.5f * (1.0f + tanh_ptx((0.7978845608028654f *
+                               (val + 0.044715f * val * val * val))));
+  return val * cdf;
+}
+
+template <typename T>
+__global__ void gelu_tanh_kernel(T* __restrict__ out,
+                                 const T* __restrict__ input,
+                                 const int d) {
+  constexpr uint32_t kVecSize = 16 / sizeof(T);
+  const int64_t token_idx = blockIdx.x;
+  const int64_t thread_idx = threadIdx.x;
+  const int64_t stride = blockDim.x;
+  const int64_t offset = token_idx * 2 * d;
+  using vec_t = AlignedVector<T, kVecSize>;
+#if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && \
+     (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.wait;");
+#endif
+
+#pragma unroll 1
+  for (uint32_t idx = thread_idx; idx < d / kVecSize; idx += stride) {
+    vec_t x_vec, y_vec, out_vec;
+    Load(input + offset + idx * kVecSize, &x_vec);
+    Load(input + offset + d + idx * kVecSize, &y_vec);
+#pragma unroll
+    for (uint32_t i = 0; i < kVecSize; ++i) {
+      out_vec[i] = static_cast<T>(gelu_tanh_func(static_cast<float>(x_vec[i])) *
+                                  static_cast<float>(y_vec[i]));
+    }
+    Store(out_vec, out + token_idx * d + idx * kVecSize);
+  }
+
+  const int64_t remaining_offset = d - d % (stride * kVecSize);
+  // process the remaining elements
+#pragma unroll 1
+  for (int64_t idx = thread_idx; idx < d % (stride * kVecSize); idx += stride) {
+    float x = input[offset + remaining_offset + idx],
+          y = input[offset + remaining_offset + d + idx];
+    out[token_idx * d + remaining_offset + idx] =
+        static_cast<T>(gelu_tanh_func(x) * y);
+  }
+
+#if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && \
+     (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.launch_dependents;");
+#endif
+}
+
+void gelu_tanh(paddle::Tensor& output, paddle::Tensor& input, bool enable_pdl) {
+  int d = input.dims()[1] / 2;
+  int64_t num_tokens = input.dims()[0];
+  cudaStream_t stream = input.stream();
+
+  DISPATCH_FLOAT_FP6_DTYPE(input.dtype(), scalar_t, {
+    uint32_t vec_size = 16 / sizeof(scalar_t);
+    cudaLaunchConfig_t config;
+    config.gridDim = num_tokens;
+    config.blockDim = std::min(d / vec_size, 1024U);
+    config.dynamicSmemBytes = 0;
+    config.stream = stream;
+    cudaLaunchAttribute attrs[1];
+    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+    attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
+    config.numAttrs = 1;
+    config.attrs = attrs;
+
+    cudaLaunchKernelEx(&config,
+                       gelu_tanh_kernel<scalar_t>,
+                       output.data<scalar_t>(),
+                       input.data<scalar_t>(),
+                       d);
+  });
+}
+
+PD_BUILD_STATIC_OP(gelu_tanh)
+    .Inputs({"output", "input"})
+    .Attrs({"enable_pdl:bool"})
+    .Outputs({"out"})
+    .SetInplaceMap({{"output", "out"}})
+    .SetKernelFn(PD_KERNEL(gelu_tanh));
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index 4903f3dd575..72ed0a2b11c 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -306,6 +306,7 @@ def find_end_files(directory, end_str):
         "gpu_ops/limit_thinking_content_length_v1.cu",
         "gpu_ops/limit_thinking_content_length_v2.cu",
         "gpu_ops/update_attn_mask_offsets.cu",
+        "gpu_ops/gelu_tanh.cu",
     ]
 
     # pd_disaggregation
diff --git a/tests/operators/test_gelu_tanh.py b/tests/operators/test_gelu_tanh.py
new file mode 100644
index 00000000000..341dfe52dc7
--- /dev/null
+++ b/tests/operators/test_gelu_tanh.py
@@ -0,0 +1,44 @@
+import math
+import unittest
+from itertools import product
+
+import numpy as np
+import paddle
+
+from fastdeploy.model_executor.ops.gpu import gelu_tanh
+
+
+class TestMoeRouting(unittest.TestCase):
+    def setUp(self):
+        paddle.seed(2024)
+        print(paddle.device.cuda.get_device_properties())
+        print(paddle.__git_commit__)
+
+    def native_gelu_tanh(
+        self,
+        input: paddle.Tensor,
+    ):
+        orig_dtype = input.dtype
+        d = input.shape[-1] // 2
+        x_fp32 = input.cast("float32")
+        x1 = x_fp32[:, :d]
+        act_out = 0.5 * x1 * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (x1 + 0.044715 * paddle.pow(x1, 3.0))))
+        out = act_out * x_fp32[:, d:]
+        return out.cast(orig_dtype)
+
+    def test_gelu_tanh(self):
+        bszs = [1, 32, 64, 128, 1024]
+        hidden_sizes = [4096, 7168]
+        test_cases = product(bszs, hidden_sizes)
+        for bsz, hidden_size in test_cases:
+            shape = [bsz, hidden_size]
+            input = paddle.randn(shape, dtype="float16")
+            out_ref = self.native_gelu_tanh(input)
+
+            out = paddle.empty([bsz, hidden_size // 2], dtype=input.dtype)
+            gelu_tanh(out, input, False)
+            np.testing.assert_allclose(out_ref.numpy(), out.numpy(), rtol=1e-03, atol=1e-03)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 5cef4583e84bc05aac708e3d38946dc0cf21041c Mon Sep 17 00:00:00 2001
From: ckl117
Date: Thu, 6 Nov 2025 21:35:48 +0800
Subject: [PATCH 2/2] make gelu_tanh elementwise and add a Python wrapper

---
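This commit drops the gating half: gelu_tanh now applies the activation
elementwise over the full [num_tokens, d] input. A Python wrapper in
activation.py allocates the output when none is passed and turns PDL on
automatically when the device reports compute capability 9.0 or newer.
A usage sketch of the wrapper as defined below (shapes and dtype are
illustrative):

import paddle

from fastdeploy.model_executor.layers.activation import gelu_tanh

x = paddle.randn([8, 4096], dtype="float16")
y = gelu_tanh(x)  # allocates the output; PDL on when the arch supports it

out = paddle.empty_like(x)
gelu_tanh(x, out=out, enable_pdl=False)  # preallocated output, PDL forced off
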
 custom_ops/gpu_ops/gelu_tanh.cu           | 17 +++++++----------
 .../model_executor/layers/activation.py   | 18 ++++++++++++++++++
 fastdeploy/model_executor/layers/utils.py |  7 +++++++
 tests/operators/test_gelu_tanh.py         | 19 +++++++++----------
 4 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/custom_ops/gpu_ops/gelu_tanh.cu b/custom_ops/gpu_ops/gelu_tanh.cu
index 5240f61c5f4..ce57149b378 100644
--- a/custom_ops/gpu_ops/gelu_tanh.cu
+++ b/custom_ops/gpu_ops/gelu_tanh.cu
@@ -21,7 +21,7 @@ __global__ void gelu_tanh_kernel(T* __restrict__ out,
   const int64_t token_idx = blockIdx.x;
   const int64_t thread_idx = threadIdx.x;
   const int64_t stride = blockDim.x;
-  const int64_t offset = token_idx * 2 * d;
+  const int64_t offset = token_idx * d;
   using vec_t = AlignedVector<T, kVecSize>;
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && \
      (__CUDA_ARCH__ >= 900))
   asm volatile("griddepcontrol.wait;");
 #endif
@@ -30,25 +30,22 @@ __global__ void gelu_tanh_kernel(T* __restrict__ out,
 
 #pragma unroll 1
   for (uint32_t idx = thread_idx; idx < d / kVecSize; idx += stride) {
-    vec_t x_vec, y_vec, out_vec;
+    vec_t x_vec;
     Load(input + offset + idx * kVecSize, &x_vec);
-    Load(input + offset + d + idx * kVecSize, &y_vec);
 #pragma unroll
     for (uint32_t i = 0; i < kVecSize; ++i) {
-      out_vec[i] = static_cast<T>(gelu_tanh_func(static_cast<float>(x_vec[i])) *
-                                  static_cast<float>(y_vec[i]));
+      x_vec[i] = static_cast<T>(gelu_tanh_func(static_cast<float>(x_vec[i])));
     }
-    Store(out_vec, out + token_idx * d + idx * kVecSize);
+    Store(x_vec, out + token_idx * d + idx * kVecSize);
   }
 
   const int64_t remaining_offset = d - d % (stride * kVecSize);
   // process the remaining elements
 #pragma unroll 1
   for (int64_t idx = thread_idx; idx < d % (stride * kVecSize); idx += stride) {
-    float x = input[offset + remaining_offset + idx],
-          y = input[offset + remaining_offset + d + idx];
+    float x = input[offset + remaining_offset + idx];
     out[token_idx * d + remaining_offset + idx] =
-        static_cast<T>(gelu_tanh_func(x) * y);
+        static_cast<T>(gelu_tanh_func(x));
   }
 
 #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && \
@@ -58,7 +55,7 @@ __global__ void gelu_tanh_kernel(T* __restrict__ out,
 }
 
 void gelu_tanh(paddle::Tensor& output, paddle::Tensor& input, bool enable_pdl) {
-  int d = input.dims()[1] / 2;
+  int d = input.dims()[1];
   int64_t num_tokens = input.dims()[0];
   cudaStream_t stream = input.stream();
 
diff --git a/fastdeploy/model_executor/layers/activation.py b/fastdeploy/model_executor/layers/activation.py
index 79fd3b24f64..7b8a1197918 100644
--- a/fastdeploy/model_executor/layers/activation.py
+++ b/fastdeploy/model_executor/layers/activation.py
@@ -23,6 +23,24 @@
 from fastdeploy.config import FDConfig
 from fastdeploy.platforms import current_platform
 
+from .utils import is_arch_support_pdl
+
+
+def gelu_tanh(
+    input: paddle.Tensor,
+    out: Optional[paddle.Tensor] = None,
+    enable_pdl: Optional[bool] = None,
+):
+    """GeLU Tanh operation."""
+    from fastdeploy.model_executor.ops.gpu import gelu_tanh
+
+    if out is None:
+        out = paddle.empty_like(input)
+    if enable_pdl is None:
+        enable_pdl = is_arch_support_pdl()
+    gelu_tanh(out, input, enable_pdl)
+    return out
+
 
 class SiluAndMul(nn.Layer):
     """
diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py
index c0644896e8e..d1a83faeff1 100644
--- a/fastdeploy/model_executor/layers/utils.py
+++ b/fastdeploy/model_executor/layers/utils.py
@@ -402,3 +402,10 @@ def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size: int, ran
 def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_size: int, offset: int = 0):
     per_partition_vocab_size = divide(global_vocab_size, world_size)
     return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, offset=offset)
+
+
+@functools.lru_cache(maxsize=1)
+def is_arch_support_pdl() -> bool:
+    # Hopper arch's compute capability == 9.0
+    prop = paddle.device.cuda.get_device_properties()
+    return prop.major >= 9
diff --git a/tests/operators/test_gelu_tanh.py b/tests/operators/test_gelu_tanh.py
index 341dfe52dc7..450d83d5897 100644
--- a/tests/operators/test_gelu_tanh.py
+++ b/tests/operators/test_gelu_tanh.py
@@ -5,10 +5,10 @@
 import numpy as np
 import paddle
 
-from fastdeploy.model_executor.ops.gpu import gelu_tanh
+from fastdeploy.model_executor.layers.activation import gelu_tanh
 
 
-class TestMoeRouting(unittest.TestCase):
+class TestGeluTanh(unittest.TestCase):
     def setUp(self):
         paddle.seed(2024)
         print(paddle.device.cuda.get_device_properties())
@@ -18,13 +18,13 @@ def native_gelu_tanh(
         self,
         input: paddle.Tensor,
     ):
-        orig_dtype = input.dtype
-        d = input.shape[-1] // 2
         x_fp32 = input.cast("float32")
-        x1 = x_fp32[:, :d]
-        act_out = 0.5 * x1 * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (x1 + 0.044715 * paddle.pow(x1, 3.0))))
-        out = act_out * x_fp32[:, d:]
-        return out.cast(orig_dtype)
+        out = (
+            0.5
+            * x_fp32
+            * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (x_fp32 + 0.044715 * paddle.pow(x_fp32, 3.0))))
+        )
+        return out.cast(input.dtype)
 
     def test_gelu_tanh(self):
         bszs = [1, 32, 64, 128, 1024]
@@ -35,8 +35,7 @@ def test_gelu_tanh(self):
             input = paddle.randn(shape, dtype="float16")
             out_ref = self.native_gelu_tanh(input)
 
-            out = paddle.empty([bsz, hidden_size // 2], dtype=input.dtype)
-            gelu_tanh(out, input, False)
+            out = gelu_tanh(input)
             np.testing.assert_allclose(out_ref.numpy(), out.numpy(), rtol=1e-03, atol=1e-03)
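
For quick verification without a GPU build, a NumPy reference of the final
elementwise op, using the same constants as gelu_tanh_func in gelu_tanh.cu
(the helper name is ours, for illustration only):

import math

import numpy as np


def gelu_tanh_ref(x: np.ndarray) -> np.ndarray:
    # Tanh-approximated GELU; 0.7978845608028654 in the kernel is sqrt(2 / pi).
    x32 = x.astype(np.float32)
    y = 0.5 * x32 * (1.0 + np.tanh(math.sqrt(2.0 / math.pi) * (x32 + 0.044715 * x32**3)))
    return y.astype(x.dtype)


# Sanity check: GELU(1.0) under the tanh approximation is ~0.8412.
print(gelu_tanh_ref(np.array([1.0], dtype=np.float32)))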