From 3a8a27bb20b7d65dfd092a3d46372827f56a2713 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Fri, 3 Apr 2026 14:39:45 +0900 Subject: [PATCH 01/21] Improve parallel performance of element force derivative computation Extract assembled stiffness matrices into a separate contiguous buffer (m_assembledStiffnessMatrices) to replace getReadAccessor calls on Data> inside parallel forEachRange lambdas. The read accessor acquires a shared lock on the Data object, causing contention across threads and effectively serializing the parallel work during CG iterations. Using a direct const reference to a plain vector eliminates this synchronization bottleneck (~3x speedup in parallel mode). As a secondary benefit, the contiguous buffer only stores the assembled 24x24 matrices (~4.6 KB each) rather than the full FactorizedElementStiffness structs (~14 KB each), improving cache utilization. --- .../fem/elastic/BaseElementLinearFEMForceField.h | 12 ++++++++++++ .../fem/elastic/BaseElementLinearFEMForceField.inl | 7 +++++++ .../fem/elastic/ElementCorotationalFEMForceField.inl | 8 ++++---- .../ElementLinearSmallStrainFEMForceField.inl | 8 ++++---- 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.h b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.h index 1a980373a5e..26b4ffadd93 100644 --- a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.h +++ b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.h @@ -55,6 +55,10 @@ class BaseElementLinearFEMForceField : public sofa::component::solidmechanics::f using StrainDisplacement = typename trait::StrainDisplacement; using Real = typename trait::Real; +public: + using AssembledStiffnessMatrix = 
sofa::type::Mat< + trait::NumberOfDofsInElement, trait::NumberOfDofsInElement, Real>; + protected: BaseElementLinearFEMForceField(); @@ -70,6 +74,14 @@ class BaseElementLinearFEMForceField : public sofa::component::solidmechanics::f * List of precomputed element stiffness matrices */ sofa::Data > d_elementStiffness; + + /** + * Contiguous buffer of assembled stiffness matrices (one per element). + * Extracted from d_elementStiffness for cache-friendly access in the hot path. + * This avoids loading the full FactorizedElementStiffness struct (~15 KB per element) + * when only the assembled matrix (~4.6 KB) is needed. + */ + sofa::type::vector m_assembledStiffnessMatrices; }; #if !defined(ELASTICITY_COMPONENT_BASE_ELEMENT_LINEAR_FEM_FORCEFIELD_CPP) diff --git a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.inl b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.inl index de380c5e4cc..e7827cfc55e 100644 --- a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.inl +++ b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.inl @@ -100,6 +100,13 @@ void BaseElementLinearFEMForceField::precomputeElementSt const std::array, trait::NumberOfNodesInElement> nodesCoordinates = extractNodesVectorFromGlobalVector(element, restPositionAccessor.ref()); elementStiffness[elementId] = integrate(nodesCoordinates, elasticityTensor); }); + + // Extract assembled matrices into a contiguous buffer for cache-friendly access + m_assembledStiffnessMatrices.resize(elements.size()); + for (std::size_t i = 0; i < elements.size(); ++i) + { + m_assembledStiffnessMatrices[i] = elementStiffness[i].getAssembledMatrix(); + } } } diff --git 
a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementCorotationalFEMForceField.inl b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementCorotationalFEMForceField.inl index a3a74a36a81..0298f2e4247 100644 --- a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementCorotationalFEMForceField.inl +++ b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementCorotationalFEMForceField.inl @@ -86,7 +86,7 @@ void ElementCorotationalFEMForceField::computeElementsFo { const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); auto restPositionAccessor = this->mstate->readRestPositions(); - auto elementStiffness = sofa::helper::getReadAccessor(this->d_elementStiffness); + const auto& assembledMatrices = this->m_assembledStiffnessMatrices; for (std::size_t elementId = range.start; elementId < range.end; ++elementId) { @@ -112,7 +112,7 @@ void ElementCorotationalFEMForceField::computeElementsFo transformedDisplacement = elementRotation.multTranspose(elementNodesCoordinates[j] - t) - (restElementNodesCoordinates[j] - t0); } - const auto& stiffnessMatrix = elementStiffness[elementId]; + const auto& stiffnessMatrix = assembledMatrices[elementId]; auto& elementForce = elementForces[elementId]; elementForce = stiffnessMatrix * displacement; @@ -134,7 +134,7 @@ void ElementCorotationalFEMForceField::computeElementsFo const sofa::VecDeriv_t& nodeDx) { const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); - auto elementStiffness = sofa::helper::getReadAccessor(this->d_elementStiffness); + const auto& assembledMatrices = this->m_assembledStiffnessMatrices; for (std::size_t elementId = range.start; elementId < range.end; ++elementId) { @@ -150,7 +150,7 @@ void ElementCorotationalFEMForceField::computeElementsFo rotated_dx = 
elementRotation.multTranspose(nodeDx[element[n]]); } - const auto& stiffnessMatrix = elementStiffness[elementId]; + const auto& stiffnessMatrix = assembledMatrices[elementId]; auto& df = elementForcesDeriv[elementId]; df = stiffnessMatrix * element_dx; diff --git a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementLinearSmallStrainFEMForceField.inl b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementLinearSmallStrainFEMForceField.inl index fc58db7abe4..191de07c12c 100644 --- a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementLinearSmallStrainFEMForceField.inl +++ b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementLinearSmallStrainFEMForceField.inl @@ -49,12 +49,12 @@ void ElementLinearSmallStrainFEMForceField::computeEleme { const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); auto restPositionAccessor = this->mstate->readRestPositions(); - auto elementStiffness = sofa::helper::getReadAccessor(this->d_elementStiffness); + const auto& assembledMatrices = this->m_assembledStiffnessMatrices; for (std::size_t elementId = range.start; elementId < range.end; ++elementId) { const auto& element = elements[elementId]; - const auto& stiffnessMatrix = elementStiffness[elementId]; + const auto& stiffnessMatrix = assembledMatrices[elementId]; typename trait::ElementDisplacement displacement{ sofa::type::NOINIT }; @@ -79,12 +79,12 @@ void ElementLinearSmallStrainFEMForceField::computeEleme const sofa::VecDeriv_t& nodeDx) { const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); - auto elementStiffness = sofa::helper::getReadAccessor(this->d_elementStiffness); + const auto& assembledMatrices = this->m_assembledStiffnessMatrices; for (std::size_t elementId = range.start; elementId < range.end; ++elementId) { const auto& element = 
elements[elementId]; - const auto& stiffnessMatrix = elementStiffness[elementId]; + const auto& stiffnessMatrix = assembledMatrices[elementId]; const std::array, trait::NumberOfNodesInElement> elementNodesDx = extractNodesVectorFromGlobalVector(element, nodeDx); From 9e2c5b96f21a7b82eb272add49d0c3f9c87bffd7 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Mon, 6 Apr 2026 11:05:16 +0900 Subject: [PATCH 02/21] wip --- .../plugins/SofaCUDA/Component/CMakeLists.txt | 4 + .../Component/src/SofaCUDA/component/init.cpp | 2 + .../CudaElementCorotationalFEMForceField.cpp | 58 ++++++ .../CudaElementCorotationalFEMForceField.cu | 160 +++++++++++++++++ .../CudaElementCorotationalFEMForceField.h | 110 ++++++++++++ .../CudaElementCorotationalFEMForceField.inl | 168 ++++++++++++++++++ .../CantileverBeam_ElementFEMForceField.xml | 2 +- 7 files changed, 503 insertions(+), 1 deletion(-) create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl diff --git a/applications/plugins/SofaCUDA/Component/CMakeLists.txt b/applications/plugins/SofaCUDA/Component/CMakeLists.txt index 77ce7a8b2be..fbd83faf0e5 100644 --- a/applications/plugins/SofaCUDA/Component/CMakeLists.txt +++ b/applications/plugins/SofaCUDA/Component/CMakeLists.txt @@ -39,6 +39,8 @@ set(HEADER_FILES ### solidmechanics + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h + 
${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.h ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.inl ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/hyperelastic/CudaStandardTetrahedralFEMForceField.h @@ -111,6 +113,7 @@ set(SOURCE_FILES ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/mass/CudaUniformMass.cpp ### Solidmechanics + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.cpp ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/hyperelastic/CudaStandardTetrahedralFEMForceField.cpp ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/tensormass/CudaTetrahedralTensorMassForceField.cpp @@ -181,6 +184,7 @@ set(CUDA_SOURCES ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/mass/CudaUniformMass.cu ### solidmechanics + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.cu ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/hyperelastic/CudaStandardTetrahedralFEMForceField.cu ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/tensormass/CudaTetrahedralTensorMassForceField.cu diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp index f9c124998fb..0d890c64139 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp +++ 
b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp @@ -90,6 +90,7 @@ extern void registerPlaneForceField(sofa::core::ObjectFactory* factory); extern void registerSphereForceField(sofa::core::ObjectFactory* factory); // component::solidmechanics::fem::elastic +extern void registerElementCorotationalFEMForceField(sofa::core::ObjectFactory* factory); extern void registerHexahedronFEMForceField(sofa::core::ObjectFactory* factory); extern void registerTetrahedronFEMForceField(sofa::core::ObjectFactory* factory); extern void registerTriangularFEMForceFieldOptim(sofa::core::ObjectFactory* factory); @@ -224,6 +225,7 @@ void registerObjects(sofa::core::ObjectFactory* factory) registerLinearForceField(factory); registerPlaneForceField(factory); registerSphereForceField(factory); + registerElementCorotationalFEMForceField(factory); registerHexahedronFEMForceField(factory); registerTetrahedronFEMForceField(factory); registerTriangularFEMForceFieldOptim(factory); diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp new file mode 100644 index 00000000000..cb0251b4925 --- /dev/null +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp @@ -0,0 +1,58 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. 
* +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . * +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#include + +#include +#include +#include + +namespace sofa::component::solidmechanics::fem::elastic +{ + +using namespace sofa::gpu::cuda; + +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; + +} // namespace sofa::component::solidmechanics::fem::elastic + +namespace sofa::gpu::cuda +{ + +void registerElementCorotationalFEMForceField(sofa::core::ObjectFactory* factory) +{ + using namespace sofa::component::solidmechanics::fem::elastic; + + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for the ElementCorotationalFEMForceField") + .add< CudaElementCorotationalFEMForceField >() + .add< CudaElementCorotationalFEMForceField >() + .add< CudaElementCorotationalFEMForceField >() + .add< CudaElementCorotationalFEMForceField >() + .add< CudaElementCorotationalFEMForceField >() + ); +} + +} // namespace sofa::gpu::cuda diff --git 
a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu new file mode 100644 index 00000000000..7728de8b2c2 --- /dev/null +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -0,0 +1,160 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . 
* +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#include +#include +#include + +#if defined(__cplusplus) +namespace sofa +{ +namespace gpu +{ +namespace cuda +{ +#endif + +/// Maximum number of DOFs per element (8 nodes * 3 dimensions for hexahedra) +#define MAX_ELEM_DOFS 24 +/// Maximum spatial dimensions +#define MAX_DIM 3 +/// Maximum nodes per element +#define MAX_NODES 8 + +/** + * CUDA kernel for addDForce of corotational FEM. + * + * Generic over element type: works with any number of nodes per element and spatial dimensions. + * One thread per element. For each element: + * 1. Gather dx from nodes + * 2. Rotate dx into reference frame: rdx = R^T * dx + * 3. Multiply by stiffness: edf = K * rdx + * 4. Rotate back: df_world = R * edf + * 5. 
Scatter to nodes via atomicAdd: df[node] -= kFactor * df_world + */ +__global__ void ElementCorotationalFEMForceFieldCuda3f_addDForce_kernel( + int nbElem, + int nbNodesPerElem, + int nbDofsPerElem, + int dim, + const int* __restrict__ elements, + const float* __restrict__ rotations, + const float* __restrict__ stiffness, + const float* __restrict__ dx, + float* df, + float kFactor) +{ + const int elemId = blockIdx.x * blockDim.x + threadIdx.x; + if (elemId >= nbElem) return; + + // Load element node indices + const int* elemNodes = elements + elemId * nbNodesPerElem; + + // Load rotation matrix R (dim x dim, row-major) + const float* Rptr = rotations + elemId * dim * dim; + float R[MAX_DIM * MAX_DIM]; + for (int i = 0; i < dim * dim; ++i) + R[i] = Rptr[i]; + + // Gather dx and rotate into reference frame: rdx = R^T * dx_node + float rdx[MAX_ELEM_DOFS]; + for (int n = 0; n < nbNodesPerElem; ++n) + { + const int nodeId = elemNodes[n]; + const float* node_dx = dx + nodeId * dim; + + for (int i = 0; i < dim; ++i) + { + float val = 0.0f; + for (int j = 0; j < dim; ++j) + val += R[j * dim + i] * node_dx[j]; // R^T[i][j] = R[j][i] + rdx[n * dim + i] = val; + } + } + + // K * rdx -> edf (nbDofsPerElem x nbDofsPerElem matrix-vector product) + const float* K = stiffness + elemId * nbDofsPerElem * nbDofsPerElem; + float edf[MAX_ELEM_DOFS]; + for (int i = 0; i < nbDofsPerElem; ++i) + { + float sum = 0.0f; + const float* Ki = K + i * nbDofsPerElem; + for (int j = 0; j < nbDofsPerElem; ++j) + sum += Ki[j] * rdx[j]; + edf[i] = sum; + } + + // Rotate back and scatter: df[node] -= kFactor * R * edf_node + for (int n = 0; n < nbNodesPerElem; ++n) + { + const int nodeId = elemNodes[n]; + const float* node_edf = edf + n * dim; + + for (int i = 0; i < dim; ++i) + { + float val = 0.0f; + for (int j = 0; j < dim; ++j) + val += R[i * dim + j] * node_edf[j]; // R * edf_node + atomicAdd(&df[nodeId * dim + i], -kFactor * val); + } + } +} + +extern "C" +{ + +void 
ElementCorotationalFEMForceFieldCuda3f_addDForce( + unsigned int nbElem, + unsigned int nbNodesPerElem, + unsigned int nbDofsPerElem, + unsigned int spatialDim, + const void* elements, + const void* rotations, + const void* stiffness, + const void* dx, + void* df, + float kFactor) +{ + const int threadsPerBlock = 64; + const int numBlocks = (nbElem + threadsPerBlock - 1) / threadsPerBlock; + + ElementCorotationalFEMForceFieldCuda3f_addDForce_kernel<<>>( + nbElem, + nbNodesPerElem, + nbDofsPerElem, + spatialDim, + (const int*)elements, + (const float*)rotations, + (const float*)stiffness, + (const float*)dx, + (float*)df, + kFactor); + + mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_addDForce_kernel"); +} + +} // extern "C" + +#if defined(__cplusplus) +} // namespace cuda +} // namespace gpu +} // namespace sofa +#endif diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h new file mode 100644 index 00000000000..e124047fd58 --- /dev/null +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h @@ -0,0 +1,110 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . * +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#pragma once + +#include +#include + +namespace sofa::gpu::cuda +{ + +extern "C" +{ + void ElementCorotationalFEMForceFieldCuda3f_addDForce( + unsigned int nbElem, + unsigned int nbNodesPerElem, + unsigned int nbDofsPerElem, + unsigned int spatialDim, + const void* elements, + const void* rotations, + const void* stiffness, + const void* dx, + void* df, + float kFactor); +} + +} // namespace sofa::gpu::cuda + +namespace sofa::component::solidmechanics::fem::elastic +{ + +/** + * CUDA-accelerated version of ElementCorotationalFEMForceField. + * + * Works with any element type (Edge, Triangle, Quad, Tetrahedron, Hexahedron). + * The addDForce method (the CG hot path, called ~250 times per timestep) runs entirely on GPU. + * The addForce method delegates to the CPU parent and uploads rotations to GPU afterwards. 
+ */ +template +class CudaElementCorotationalFEMForceField + : public ElementCorotationalFEMForceField +{ +public: + SOFA_CLASS( + SOFA_TEMPLATE2(CudaElementCorotationalFEMForceField, DataTypes, ElementType), + SOFA_TEMPLATE2(ElementCorotationalFEMForceField, DataTypes, ElementType)); + + using Real = sofa::Real_t; + using Coord = sofa::Coord_t; + using Deriv = sofa::Deriv_t; + using VecCoord = sofa::VecCoord_t; + using VecDeriv = sofa::VecDeriv_t; + + static const std::string GetCustomClassName() + { + return ElementCorotationalFEMForceField::GetCustomClassName(); + } + + static const std::string GetCustomTemplateName() + { + return DataTypes::Name(); + } + + void init() override; + + void addForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& f, + const sofa::DataVecCoord_t& x, + const sofa::DataVecDeriv_t& v) override; + + void addDForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& df, + const sofa::DataVecDeriv_t& dx) override; + +protected: + + CudaElementCorotationalFEMForceField() = default; + + void uploadStiffnessAndConnectivity(); + void uploadRotations(); + + gpu::cuda::CudaVector m_gpuStiffness; ///< Flat NxN stiffness matrices per element (N = nbDofsPerElement) + gpu::cuda::CudaVector m_gpuRotations; ///< Flat DxD rotation matrices per element (D = spatial_dimensions) + gpu::cuda::CudaVector m_gpuElements; ///< Node indices per element + + bool m_gpuDataUploaded = false; + bool m_gpuRotationsUploaded = false; +}; + +} // namespace sofa::component::solidmechanics::fem::elastic diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl new file mode 100644 index 00000000000..ab2cc511cb7 --- /dev/null +++ 
b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -0,0 +1,168 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . * +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#pragma once +#include +#include +#include + +namespace sofa::component::solidmechanics::fem::elastic +{ + +template +void CudaElementCorotationalFEMForceField::init() +{ + ElementCorotationalFEMForceField::init(); + + if (!this->isComponentStateInvalid()) + { + uploadStiffnessAndConnectivity(); + } +} + +template +void CudaElementCorotationalFEMForceField::uploadStiffnessAndConnectivity() +{ + using trait = sofa::component::solidmechanics::fem::elastic::trait; + + if (!this->l_topology) return; + + const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); + const auto& assembledMatrices = this->m_assembledStiffnessMatrices; + + const auto nbElem = elements.size(); + constexpr auto 
nDofs = trait::NumberOfDofsInElement; + constexpr auto nNodes = trait::NumberOfNodesInElement; + + // Upload stiffness matrices (flat row-major NxN per element) + m_gpuStiffness.resize(nbElem * nDofs * nDofs); + { + auto* dst = m_gpuStiffness.hostWrite(); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& K = assembledMatrices[e]; + for (unsigned int i = 0; i < nDofs; ++i) + for (unsigned int j = 0; j < nDofs; ++j) + dst[e * nDofs * nDofs + i * nDofs + j] = static_cast(K[i][j]); + } + } + + // Upload element connectivity (nNodes node indices per element) + m_gpuElements.resize(nbElem * nNodes); + { + auto* dst = m_gpuElements.hostWrite(); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& element = elements[e]; + for (unsigned int n = 0; n < nNodes; ++n) + dst[e * nNodes + n] = static_cast(element[n]); + } + } + + m_gpuDataUploaded = true; + m_gpuRotationsUploaded = false; +} + +template +void CudaElementCorotationalFEMForceField::uploadRotations() +{ + using trait = sofa::component::solidmechanics::fem::elastic::trait; + constexpr auto dim = trait::spatial_dimensions; + + const auto& rotations = this->m_rotations; + const auto nbElem = rotations.size(); + + m_gpuRotations.resize(nbElem * dim * dim); + { + auto* dst = m_gpuRotations.hostWrite(); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& R = rotations[e]; + for (unsigned int i = 0; i < dim; ++i) + for (unsigned int j = 0; j < dim; ++j) + dst[e * dim * dim + i * dim + j] = static_cast(R[i][j]); + } + } + + m_gpuRotationsUploaded = true; +} + +template +void CudaElementCorotationalFEMForceField::addForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& f, + const sofa::DataVecCoord_t& x, + const sofa::DataVecDeriv_t& v) +{ + // Run on CPU: computes rotations and forces + ElementCorotationalFEMForceField::addForce(mparams, f, x, v); + + // Upload the freshly-computed rotations to GPU for subsequent addDForce calls + uploadRotations(); +} + +template +void 
CudaElementCorotationalFEMForceField::addDForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& d_df, + const sofa::DataVecDeriv_t& d_dx) +{ + if (this->isComponentStateInvalid()) + return; + + if (!m_gpuDataUploaded || !m_gpuRotationsUploaded) + { + // Fallback to CPU if GPU data not ready + ElementCorotationalFEMForceField::addDForce(mparams, d_df, d_dx); + return; + } + + using trait = sofa::component::solidmechanics::fem::elastic::trait; + + VecDeriv& df = *d_df.beginEdit(); + const VecDeriv& dx = d_dx.getValue(); + + if (df.size() < dx.size()) + df.resize(dx.size()); + + const auto kFactor = static_cast( + sofa::core::mechanicalparams::kFactorIncludingRayleighDamping( + mparams, this->rayleighStiffness.getValue())); + + const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); + const auto nbElem = static_cast(elements.size()); + + gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addDForce( + nbElem, + trait::NumberOfNodesInElement, + trait::NumberOfDofsInElement, + trait::spatial_dimensions, + m_gpuElements.deviceRead(), + m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), + dx.deviceRead(), + df.deviceWrite(), + kFactor); + + d_df.endEdit(); +} + +} // namespace sofa::component::solidmechanics::fem::elastic diff --git a/examples/Validation/cantilever_beam/CantileverBeam_ElementFEMForceField.xml b/examples/Validation/cantilever_beam/CantileverBeam_ElementFEMForceField.xml index 4e1fab99d9e..2133c327c4a 100644 --- a/examples/Validation/cantilever_beam/CantileverBeam_ElementFEMForceField.xml +++ b/examples/Validation/cantilever_beam/CantileverBeam_ElementFEMForceField.xml @@ -27,7 +27,7 @@ - + From b746722ce6982996da99c4f57ed6bdec4e2380b9 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Mon, 6 Apr 2026 13:22:22 +0900 Subject: [PATCH 03/21] add example --- .../CudaElementCorotationalFEMForceField.cpp | 14 +++++++- .../CudaElementCorotationalFEMForceField.scn | 35 +++++++++++++++++++ 2 files changed, 
48 insertions(+), 1 deletion(-) create mode 100644 applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp index cb0251b4925..c77a51c13c2 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp @@ -46,11 +46,23 @@ void registerElementCorotationalFEMForceField(sofa::core::ObjectFactory* factory using namespace sofa::component::solidmechanics::fem::elastic; factory->registerObjects(sofa::core::ObjectRegistrationData( - "Supports GPU-side computations using CUDA for the ElementCorotationalFEMForceField") + "Supports GPU-side computations using CUDA for EdgeCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for TriangleCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for QuadCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for TetrahedronCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for HexahedronCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() ); } diff --git 
a/applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn b/applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn new file mode 100644 index 00000000000..a75e2058ff5 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 865f7f46ff8eebf8f5c463a85b5d80f7d5b72a0c Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Mon, 6 Apr 2026 14:24:02 +0900 Subject: [PATCH 04/21] new version --- .../CudaElementCorotationalFEMForceField.cu | 218 ++++++++++++------ .../CudaElementCorotationalFEMForceField.h | 21 +- .../CudaElementCorotationalFEMForceField.inl | 83 ++++++- 3 files changed, 232 insertions(+), 90 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index 7728de8b2c2..4e1091f0e66 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -32,89 +32,164 @@ namespace cuda { #endif -/// Maximum number of DOFs per element (8 nodes * 3 dimensions for hexahedra) -#define MAX_ELEM_DOFS 24 -/// Maximum spatial dimensions -#define MAX_DIM 3 -/// Maximum nodes per element -#define MAX_NODES 8 - /** - * CUDA kernel for addDForce of corotational FEM. + * Kernel 1: Compute per-element dForce (1 thread per element). + * + * Templated on NNodes (compile-time) for full loop unrolling. + * Hardcoded Dim=3 (CudaVec3f only). * - * Generic over element type: works with any number of nodes per element and spatial dimensions. - * One thread per element. 
For each element: - * 1. Gather dx from nodes - * 2. Rotate dx into reference frame: rdx = R^T * dx - * 3. Multiply by stiffness: edf = K * rdx - * 4. Rotate back: df_world = R * edf - * 5. Scatter to nodes via atomicAdd: df[node] -= kFactor * df_world + * Connectivity is SoA: elements[nodeIdx * nbElem + elemId]. + * Stiffness is in block format: K[(ni * NNodes + nj) * 9 + di * 3 + dj]. */ -__global__ void ElementCorotationalFEMForceFieldCuda3f_addDForce_kernel( +template +__global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( int nbElem, - int nbNodesPerElem, - int nbDofsPerElem, - int dim, const int* __restrict__ elements, const float* __restrict__ rotations, const float* __restrict__ stiffness, const float* __restrict__ dx, - float* df, + float* __restrict__ eforce, float kFactor) { + constexpr int NDofs = NNodes * 3; + const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - // Load element node indices - const int* elemNodes = elements + elemId * nbNodesPerElem; - - // Load rotation matrix R (dim x dim, row-major) - const float* Rptr = rotations + elemId * dim * dim; - float R[MAX_DIM * MAX_DIM]; - for (int i = 0; i < dim * dim; ++i) + // Load rotation matrix R (3x3, row-major) + const float* Rptr = rotations + elemId * 9; + float R[9]; + #pragma unroll + for (int i = 0; i < 9; ++i) R[i] = Rptr[i]; - // Gather dx and rotate into reference frame: rdx = R^T * dx_node - float rdx[MAX_ELEM_DOFS]; - for (int n = 0; n < nbNodesPerElem; ++n) + // Gather dx and rotate into reference frame: rdx[n] = R^T * dx[node[n]] + float rdx[NDofs]; + #pragma unroll + for (int n = 0; n < NNodes; ++n) { - const int nodeId = elemNodes[n]; - const float* node_dx = dx + nodeId * dim; + const int nodeId = elements[n * nbElem + elemId]; + const float dx_x = dx[nodeId * 3 + 0]; + const float dx_y = dx[nodeId * 3 + 1]; + const float dx_z = dx[nodeId * 3 + 2]; + + rdx[n * 3 + 0] = R[0] * dx_x + R[3] * dx_y + R[6] * dx_z; + rdx[n * 3 + 
1] = R[1] * dx_x + R[4] * dx_y + R[7] * dx_z; + rdx[n * 3 + 2] = R[2] * dx_x + R[5] * dx_y + R[8] * dx_z; + } - for (int i = 0; i < dim; ++i) + // Block-matrix multiply: edf = K * rdx + const float* K = stiffness + elemId * NNodes * NNodes * 9; + float edf[NDofs]; + #pragma unroll + for (int ni = 0; ni < NNodes; ++ni) + { + float fi0 = 0.0f, fi1 = 0.0f, fi2 = 0.0f; + #pragma unroll + for (int nj = 0; nj < NNodes; ++nj) { - float val = 0.0f; - for (int j = 0; j < dim; ++j) - val += R[j * dim + i] * node_dx[j]; // R^T[i][j] = R[j][i] - rdx[n * dim + i] = val; + const float* Kij = K + (ni * NNodes + nj) * 9; + const float rj0 = rdx[nj * 3 + 0]; + const float rj1 = rdx[nj * 3 + 1]; + const float rj2 = rdx[nj * 3 + 2]; + fi0 += Kij[0] * rj0 + Kij[1] * rj1 + Kij[2] * rj2; + fi1 += Kij[3] * rj0 + Kij[4] * rj1 + Kij[5] * rj2; + fi2 += Kij[6] * rj0 + Kij[7] * rj1 + Kij[8] * rj2; } + edf[ni * 3 + 0] = fi0; + edf[ni * 3 + 1] = fi1; + edf[ni * 3 + 2] = fi2; } - // K * rdx -> edf (nbDofsPerElem x nbDofsPerElem matrix-vector product) - const float* K = stiffness + elemId * nbDofsPerElem * nbDofsPerElem; - float edf[MAX_ELEM_DOFS]; - for (int i = 0; i < nbDofsPerElem; ++i) + // Rotate back and write: eforce = -kFactor * R * edf + float* out = eforce + elemId * NNodes * 3; + #pragma unroll + for (int n = 0; n < NNodes; ++n) { - float sum = 0.0f; - const float* Ki = K + i * nbDofsPerElem; - for (int j = 0; j < nbDofsPerElem; ++j) - sum += Ki[j] * rdx[j]; - edf[i] = sum; + const float e0 = edf[n * 3 + 0]; + const float e1 = edf[n * 3 + 1]; + const float e2 = edf[n * 3 + 2]; + out[n * 3 + 0] = -kFactor * (R[0] * e0 + R[1] * e1 + R[2] * e2); + out[n * 3 + 1] = -kFactor * (R[3] * e0 + R[4] * e1 + R[5] * e2); + out[n * 3 + 2] = -kFactor * (R[6] * e0 + R[7] * e1 + R[8] * e2); } +} - // Rotate back and scatter: df[node] -= kFactor * R * edf_node - for (int n = 0; n < nbNodesPerElem; ++n) +/** + * Kernel 2: Gather per-vertex forces (1 thread per vertex). 
+ * + * No atomics: each vertex handled by exactly one thread. + * velems is SoA: velems[s * nbVertex + vertexId], 0-terminated. + * Each entry is (elemId * NNodes + localNode + 1), with 0 as sentinel. + */ +__global__ void ElementCorotationalFEMForceFieldCuda3f_gatherDForce_kernel( + int nbVertex, + int maxElemPerVertex, + const int* __restrict__ velems, + const float* __restrict__ eforce, + float* df) +{ + const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; + if (vertexId >= nbVertex) return; + + float fx = 0.0f, fy = 0.0f, fz = 0.0f; + + for (int s = 0; s < maxElemPerVertex; ++s) { - const int nodeId = elemNodes[n]; - const float* node_edf = edf + n * dim; + const int idx = velems[s * nbVertex + vertexId]; + if (idx == 0) break; + const int base = (idx - 1) * 3; + fx += eforce[base + 0]; + fy += eforce[base + 1]; + fz += eforce[base + 2]; + } - for (int i = 0; i < dim; ++i) - { - float val = 0.0f; - for (int j = 0; j < dim; ++j) - val += R[i * dim + j] * node_edf[j]; // R * edf_node - atomicAdd(&df[nodeId * dim + i], -kFactor * val); - } + df[vertexId * 3 + 0] += fx; + df[vertexId * 3 + 1] += fy; + df[vertexId * 3 + 2] += fz; +} + +template +static void launchAddDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + float kFactor) +{ + const int computeThreads = 64; + const int gatherThreads = 256; + + { + const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel + <<>>( + nbElem, + (const int*)elements, + (const float*)rotations, + (const float*)stiffness, + (const float*)dx, + (float*)eforce, + kFactor); + mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel"); + } + + { + const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + 
ElementCorotationalFEMForceFieldCuda3f_gatherDForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const float*)eforce, + (float*)df); + mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_gatherDForce_kernel"); } } @@ -123,32 +198,25 @@ extern "C" void ElementCorotationalFEMForceFieldCuda3f_addDForce( unsigned int nbElem, + unsigned int nbVertex, unsigned int nbNodesPerElem, - unsigned int nbDofsPerElem, - unsigned int spatialDim, + unsigned int maxElemPerVertex, const void* elements, const void* rotations, const void* stiffness, const void* dx, void* df, + void* eforce, + const void* velems, float kFactor) { - const int threadsPerBlock = 64; - const int numBlocks = (nbElem + threadsPerBlock - 1) / threadsPerBlock; - - ElementCorotationalFEMForceFieldCuda3f_addDForce_kernel<<>>( - nbElem, - nbNodesPerElem, - nbDofsPerElem, - spatialDim, - (const int*)elements, - (const float*)rotations, - (const float*)stiffness, - (const float*)dx, - (float*)df, - kFactor); - - mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_addDForce_kernel"); + switch (nbNodesPerElem) + { + case 2: launchAddDForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 3: launchAddDForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 4: launchAddDForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 8: launchAddDForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + } } } // extern "C" diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h index e124047fd58..c5220a2f2be 100644 
--- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h @@ -31,14 +31,16 @@ extern "C" { void ElementCorotationalFEMForceFieldCuda3f_addDForce( unsigned int nbElem, + unsigned int nbVertex, unsigned int nbNodesPerElem, - unsigned int nbDofsPerElem, - unsigned int spatialDim, + unsigned int maxElemPerVertex, const void* elements, const void* rotations, const void* stiffness, const void* dx, void* df, + void* eforce, + const void* velems, float kFactor); } @@ -53,6 +55,10 @@ namespace sofa::component::solidmechanics::fem::elastic * Works with any element type (Edge, Triangle, Quad, Tetrahedron, Hexahedron). * The addDForce method (the CG hot path, called ~250 times per timestep) runs entirely on GPU. * The addForce method delegates to the CPU parent and uploads rotations to GPU afterwards. 
+ * + * Uses a two-kernel approach for addDForce: + * Kernel 1: compute per-element forces (1 thread/element, fully unrolled) + * Kernel 2: gather per-vertex (1 thread/vertex, no atomics) */ template class CudaElementCorotationalFEMForceField @@ -99,9 +105,14 @@ class CudaElementCorotationalFEMForceField void uploadStiffnessAndConnectivity(); void uploadRotations(); - gpu::cuda::CudaVector m_gpuStiffness; ///< Flat NxN stiffness matrices per element (N = nbDofsPerElement) - gpu::cuda::CudaVector m_gpuRotations; ///< Flat DxD rotation matrices per element (D = spatial_dimensions) - gpu::cuda::CudaVector m_gpuElements; ///< Node indices per element + gpu::cuda::CudaVector m_gpuStiffness; ///< Block-format stiffness: K[(ni*N+nj)*9 + di*3+dj] per element + gpu::cuda::CudaVector m_gpuRotations; ///< Flat 3x3 rotation matrices per element + gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] + gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer + gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated + + unsigned int m_maxElemPerVertex = 0; + unsigned int m_nbVertices = 0; bool m_gpuDataUploaded = false; bool m_gpuRotationsUploaded = false; diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index ab2cc511cb7..a3813867726 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -23,6 +23,7 @@ #include #include #include +#include namespace sofa::component::solidmechanics::fem::elastic { @@ -51,32 +52,91 @@ void 
CudaElementCorotationalFEMForceField::uploadStiffne const auto nbElem = elements.size(); constexpr auto nDofs = trait::NumberOfDofsInElement; constexpr auto nNodes = trait::NumberOfNodesInElement; + constexpr auto dim = trait::spatial_dimensions; + + // Find number of vertices + unsigned int maxNodeId = 0; + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& element = elements[e]; + for (unsigned int n = 0; n < nNodes; ++n) + { + if (static_cast(element[n]) > maxNodeId) + maxNodeId = static_cast(element[n]); + } + } + m_nbVertices = maxNodeId + 1; - // Upload stiffness matrices (flat row-major NxN per element) - m_gpuStiffness.resize(nbElem * nDofs * nDofs); + // Upload stiffness matrices in block format: + // K[(ni * nNodes + nj) * dim * dim + di * dim + dj] per element + // This groups each 3x3 sub-block contiguously for better cache behavior. + m_gpuStiffness.resize(nbElem * nNodes * nNodes * dim * dim); { auto* dst = m_gpuStiffness.hostWrite(); for (std::size_t e = 0; e < nbElem; ++e) { const auto& K = assembledMatrices[e]; - for (unsigned int i = 0; i < nDofs; ++i) - for (unsigned int j = 0; j < nDofs; ++j) - dst[e * nDofs * nDofs + i * nDofs + j] = static_cast(K[i][j]); + for (unsigned int ni = 0; ni < nNodes; ++ni) + for (unsigned int nj = 0; nj < nNodes; ++nj) + for (unsigned int di = 0; di < dim; ++di) + for (unsigned int dj = 0; dj < dim; ++dj) + dst[e * nNodes * nNodes * dim * dim + + (ni * nNodes + nj) * dim * dim + + di * dim + dj] + = static_cast(K[ni * dim + di][nj * dim + dj]); } } - // Upload element connectivity (nNodes node indices per element) - m_gpuElements.resize(nbElem * nNodes); + // Upload element connectivity in SoA layout: + // elements[nodeIdx * nbElem + elemId] = global node index + // Adjacent threads access adjacent memory for coalesced reads. 
+ m_gpuElements.resize(nNodes * nbElem); { auto* dst = m_gpuElements.hostWrite(); for (std::size_t e = 0; e < nbElem; ++e) { const auto& element = elements[e]; for (unsigned int n = 0; n < nNodes; ++n) - dst[e * nNodes + n] = static_cast(element[n]); + dst[n * nbElem + e] = static_cast(element[n]); + } + } + + // Build vertex-to-element mapping (velems) + // For each vertex, stores the list of (elemId * nNodes + localNode + 1). + // 0 is used as sentinel. SoA layout: velems[slot * nbVertex + vertexId]. + std::vector> vertexElems(m_nbVertices); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& element = elements[e]; + for (unsigned int n = 0; n < nNodes; ++n) + { + const int nodeId = static_cast(element[n]); + vertexElems[nodeId].push_back( + static_cast(e * nNodes + n + 1)); + } + } + + m_maxElemPerVertex = 0; + for (const auto& ve : vertexElems) + { + if (ve.size() > m_maxElemPerVertex) + m_maxElemPerVertex = static_cast(ve.size()); + } + + m_gpuVelems.resize(m_maxElemPerVertex * m_nbVertices); + { + auto* dst = m_gpuVelems.hostWrite(); + std::memset(dst, 0, m_maxElemPerVertex * m_nbVertices * sizeof(int)); + for (std::size_t v = 0; v < m_nbVertices; ++v) + { + for (std::size_t s = 0; s < vertexElems[v].size(); ++s) + dst[s * m_nbVertices + v] = vertexElems[v][s]; } } + // Allocate intermediate per-element force buffer + m_gpuElementForce.resize(nbElem * nNodes * dim); + m_gpuDataUploaded = true; m_gpuRotationsUploaded = false; } @@ -149,17 +209,20 @@ void CudaElementCorotationalFEMForceField::addDForce( const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); const auto nbElem = static_cast(elements.size()); + const auto nbVertex = static_cast(dx.size()); gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addDForce( nbElem, + nbVertex, trait::NumberOfNodesInElement, - trait::NumberOfDofsInElement, - trait::spatial_dimensions, + m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), 
m_gpuStiffness.deviceRead(), dx.deviceRead(), df.deviceWrite(), + m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead(), kFactor); d_df.endEdit(); From 90e1204a19c6c51ad7dc17aa11ac49214625ed8f Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Mon, 6 Apr 2026 14:56:32 +0900 Subject: [PATCH 05/21] improvement of new version --- .../CudaElementCorotationalFEMForceField.cu | 73 ++++++++++++++----- .../CudaElementCorotationalFEMForceField.inl | 22 ++++-- 2 files changed, 70 insertions(+), 25 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index 4e1091f0e66..52ec3af12af 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -39,7 +39,12 @@ namespace cuda * Hardcoded Dim=3 (CudaVec3f only). * * Connectivity is SoA: elements[nodeIdx * nbElem + elemId]. - * Stiffness is in block format: K[(ni * NNodes + nj) * 9 + di * 3 + dj]. + * Stiffness uses symmetric upper-triangle block storage: + * Only blocks (ni, nj) with nj >= ni are stored. + * NSymBlocks = NNodes*(NNodes+1)/2 blocks of 9 floats each. + * Each off-diagonal block is read once and used for both + * forward (edf[ni] += Kij * rdx[nj]) and symmetric + * (edf[nj] += Kij^T * rdx[ni]) contributions. 
*/ template __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( @@ -51,7 +56,7 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( float* __restrict__ eforce, float kFactor) { - constexpr int NDofs = NNodes * 3; + constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; @@ -64,7 +69,7 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( R[i] = Rptr[i]; // Gather dx and rotate into reference frame: rdx[n] = R^T * dx[node[n]] - float rdx[NDofs]; + float rdx[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -78,27 +83,59 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( rdx[n * 3 + 2] = R[2] * dx_x + R[5] * dx_y + R[8] * dx_z; } - // Block-matrix multiply: edf = K * rdx - const float* K = stiffness + elemId * NNodes * NNodes * 9; - float edf[NDofs]; + // Symmetric block-matrix multiply: edf = K * rdx + // K stored as upper triangle: blocks (ni, nj) for nj >= ni + const float* K = stiffness + elemId * NSymBlocks * 9; + float edf[NNodes * 3]; + + #pragma unroll + for (int i = 0; i < NNodes * 3; ++i) + edf[i] = 0.0f; + #pragma unroll for (int ni = 0; ni < NNodes; ++ni) { - float fi0 = 0.0f, fi1 = 0.0f, fi2 = 0.0f; + // symIdx for (ni, ni) = ni*NNodes - ni*(ni-1)/2 + const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; + + // Diagonal block (ni, ni): Kii * rdx[ni] + { + const float* Kii = K + diagIdx * 9; + const float ri0 = rdx[ni * 3 + 0]; + const float ri1 = rdx[ni * 3 + 1]; + const float ri2 = rdx[ni * 3 + 2]; + edf[ni * 3 + 0] += Kii[0] * ri0 + Kii[1] * ri1 + Kii[2] * ri2; + edf[ni * 3 + 1] += Kii[3] * ri0 + Kii[4] * ri1 + Kii[5] * ri2; + edf[ni * 3 + 2] += Kii[6] * ri0 + Kii[7] * ri1 + Kii[8] * ri2; + } + + // Off-diagonal blocks (ni, nj) for nj > ni #pragma unroll - for (int nj = 0; nj < NNodes; ++nj) + for (int nj = ni + 1; nj < NNodes; ++nj) { - const 
float* Kij = K + (ni * NNodes + nj) * 9; - const float rj0 = rdx[nj * 3 + 0]; - const float rj1 = rdx[nj * 3 + 1]; - const float rj2 = rdx[nj * 3 + 2]; - fi0 += Kij[0] * rj0 + Kij[1] * rj1 + Kij[2] * rj2; - fi1 += Kij[3] * rj0 + Kij[4] * rj1 + Kij[5] * rj2; - fi2 += Kij[6] * rj0 + Kij[7] * rj1 + Kij[8] * rj2; + const int symIdx = diagIdx + (nj - ni); + const float* Kij = K + symIdx * 9; + + // Forward: edf[ni] += Kij * rdx[nj] + { + const float rj0 = rdx[nj * 3 + 0]; + const float rj1 = rdx[nj * 3 + 1]; + const float rj2 = rdx[nj * 3 + 2]; + edf[ni * 3 + 0] += Kij[0] * rj0 + Kij[1] * rj1 + Kij[2] * rj2; + edf[ni * 3 + 1] += Kij[3] * rj0 + Kij[4] * rj1 + Kij[5] * rj2; + edf[ni * 3 + 2] += Kij[6] * rj0 + Kij[7] * rj1 + Kij[8] * rj2; + } + + // Symmetric: edf[nj] += Kij^T * rdx[ni] + { + const float ri0 = rdx[ni * 3 + 0]; + const float ri1 = rdx[ni * 3 + 1]; + const float ri2 = rdx[ni * 3 + 2]; + edf[nj * 3 + 0] += Kij[0] * ri0 + Kij[3] * ri1 + Kij[6] * ri2; + edf[nj * 3 + 1] += Kij[1] * ri0 + Kij[4] * ri1 + Kij[7] * ri2; + edf[nj * 3 + 2] += Kij[2] * ri0 + Kij[5] * ri1 + Kij[8] * ri2; + } } - edf[ni * 3 + 0] = fi0; - edf[ni * 3 + 1] = fi1; - edf[ni * 3 + 2] = fi2; } // Rotate back and write: eforce = -kFactor * R * edf diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index a3813867726..1e1093758a9 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -67,23 +67,31 @@ void CudaElementCorotationalFEMForceField::uploadStiffne } m_nbVertices = maxNodeId + 1; - // Upload stiffness matrices in block format: - // K[(ni * 
nNodes + nj) * dim * dim + di * dim + dj] per element - // This groups each 3x3 sub-block contiguously for better cache behavior. - m_gpuStiffness.resize(nbElem * nNodes * nNodes * dim * dim); + // Upload stiffness matrices in symmetric upper-triangle block format: + // Only blocks (ni, nj) with nj >= ni are stored. + // symIdx = ni * nNodes - ni*(ni-1)/2 + (nj - ni) + // K[symIdx * dim * dim + di * dim + dj] per element + constexpr auto nSymBlocks = nNodes * (nNodes + 1) / 2; + m_gpuStiffness.resize(nbElem * nSymBlocks * dim * dim); { auto* dst = m_gpuStiffness.hostWrite(); for (std::size_t e = 0; e < nbElem; ++e) { const auto& K = assembledMatrices[e]; for (unsigned int ni = 0; ni < nNodes; ++ni) - for (unsigned int nj = 0; nj < nNodes; ++nj) + { + const unsigned int diagIdx = ni * nNodes - ni * (ni - 1) / 2; + for (unsigned int nj = ni; nj < nNodes; ++nj) + { + const unsigned int symIdx = diagIdx + (nj - ni); for (unsigned int di = 0; di < dim; ++di) for (unsigned int dj = 0; dj < dim; ++dj) - dst[e * nNodes * nNodes * dim * dim - + (ni * nNodes + nj) * dim * dim + dst[e * nSymBlocks * dim * dim + + symIdx * dim * dim + di * dim + dj] = static_cast(K[ni * dim + di][nj * dim + dj]); + } + } } } From 5a2d2b983ab7ab49a65e7ef4e68d4835a9d97542 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Mon, 6 Apr 2026 15:18:55 +0900 Subject: [PATCH 06/21] add cuda version of ElementLinearSmallStrainFEMForceField --- .../plugins/SofaCUDA/Component/CMakeLists.txt | 4 + .../Component/src/SofaCUDA/component/init.cpp | 2 + ...aElementLinearSmallStrainFEMForceField.cpp | 70 +++++ ...daElementLinearSmallStrainFEMForceField.cu | 242 ++++++++++++++++++ ...udaElementLinearSmallStrainFEMForceField.h | 113 ++++++++ ...aElementLinearSmallStrainFEMForceField.inl | 195 ++++++++++++++ ...aElementLinearSmallStrainFEMForceField.scn | 35 +++ 7 files changed, 661 insertions(+) create mode 100644 
applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl create mode 100644 applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn diff --git a/applications/plugins/SofaCUDA/Component/CMakeLists.txt b/applications/plugins/SofaCUDA/Component/CMakeLists.txt index fbd83faf0e5..5ac492c4834 100644 --- a/applications/plugins/SofaCUDA/Component/CMakeLists.txt +++ b/applications/plugins/SofaCUDA/Component/CMakeLists.txt @@ -41,6 +41,8 @@ set(HEADER_FILES ### solidmechanics ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.h ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.inl ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/hyperelastic/CudaStandardTetrahedralFEMForceField.h @@ -114,6 +116,7 @@ set(SOURCE_FILES ### Solidmechanics 
${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.cpp ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/hyperelastic/CudaStandardTetrahedralFEMForceField.cpp ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/tensormass/CudaTetrahedralTensorMassForceField.cpp @@ -185,6 +188,7 @@ set(CUDA_SOURCES ### solidmechanics ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.cu ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/hyperelastic/CudaStandardTetrahedralFEMForceField.cu ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/tensormass/CudaTetrahedralTensorMassForceField.cu diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp index 0d890c64139..c6b0ee6b438 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp @@ -91,6 +91,7 @@ extern void registerSphereForceField(sofa::core::ObjectFactory* factory); // component::solidmechanics::fem::elastic extern void registerElementCorotationalFEMForceField(sofa::core::ObjectFactory* factory); +extern void registerElementLinearSmallStrainFEMForceField(sofa::core::ObjectFactory* factory); extern void registerHexahedronFEMForceField(sofa::core::ObjectFactory* 
factory); extern void registerTetrahedronFEMForceField(sofa::core::ObjectFactory* factory); extern void registerTriangularFEMForceFieldOptim(sofa::core::ObjectFactory* factory); @@ -226,6 +227,7 @@ void registerObjects(sofa::core::ObjectFactory* factory) registerPlaneForceField(factory); registerSphereForceField(factory); registerElementCorotationalFEMForceField(factory); + registerElementLinearSmallStrainFEMForceField(factory); registerHexahedronFEMForceField(factory); registerTetrahedronFEMForceField(factory); registerTriangularFEMForceFieldOptim(factory); diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp new file mode 100644 index 00000000000..af802d29e95 --- /dev/null +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp @@ -0,0 +1,70 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . 
* +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#include + +#include +#include +#include + +namespace sofa::component::solidmechanics::fem::elastic +{ + +using namespace sofa::gpu::cuda; + +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; + +} // namespace sofa::component::solidmechanics::fem::elastic + +namespace sofa::gpu::cuda +{ + +void registerElementLinearSmallStrainFEMForceField(sofa::core::ObjectFactory* factory) +{ + using namespace sofa::component::solidmechanics::fem::elastic; + + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for EdgeLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for TriangleLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for QuadLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for TetrahedronLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + 
factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for HexahedronLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); +} + +} // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu new file mode 100644 index 00000000000..4c474fe2d09 --- /dev/null +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu @@ -0,0 +1,242 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . 
* +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#include +#include +#include + +#if defined(__cplusplus) +namespace sofa +{ +namespace gpu +{ +namespace cuda +{ +#endif + +/** + * Kernel 1: Compute per-element dForce (1 thread per element). + * + * Templated on NNodes (compile-time) for full loop unrolling. + * Hardcoded Dim=3 (CudaVec3f only). + * + * No rotation matrices needed (linear small strain). + * Stiffness uses symmetric upper-triangle block storage: + * Only blocks (ni, nj) with nj >= ni are stored. + * NSymBlocks = NNodes*(NNodes+1)/2 blocks of 9 floats each. + */ +template +__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel( + int nbElem, + const int* __restrict__ elements, + const float* __restrict__ stiffness, + const float* __restrict__ dx, + float* __restrict__ eforce, + float kFactor) +{ + constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; + + const int elemId = blockIdx.x * blockDim.x + threadIdx.x; + if (elemId >= nbElem) return; + + // Gather dx for this element's nodes + float edx[NNodes * 3]; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const int nodeId = elements[n * nbElem + elemId]; + edx[n * 3 + 0] = dx[nodeId * 3 + 0]; + edx[n * 3 + 1] = dx[nodeId * 3 + 1]; + edx[n * 3 + 2] = dx[nodeId * 3 + 2]; + } + + // Symmetric block-matrix multiply: edf = K * edx + const float* K = stiffness + elemId * NSymBlocks * 9; + float edf[NNodes * 3]; + + #pragma unroll + for (int i = 0; i < NNodes * 3; ++i) + edf[i] = 0.0f; + + #pragma unroll + for (int ni = 0; ni < NNodes; ++ni) + { + const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; + + // Diagonal block (ni, ni): Kii * edx[ni] + { + const float* Kii = K + diagIdx * 9; + const float di0 = edx[ni * 3 + 0]; + const 
float di1 = edx[ni * 3 + 1]; + const float di2 = edx[ni * 3 + 2]; + edf[ni * 3 + 0] += Kii[0] * di0 + Kii[1] * di1 + Kii[2] * di2; + edf[ni * 3 + 1] += Kii[3] * di0 + Kii[4] * di1 + Kii[5] * di2; + edf[ni * 3 + 2] += Kii[6] * di0 + Kii[7] * di1 + Kii[8] * di2; + } + + // Off-diagonal blocks (ni, nj) for nj > ni + #pragma unroll + for (int nj = ni + 1; nj < NNodes; ++nj) + { + const int symIdx = diagIdx + (nj - ni); + const float* Kij = K + symIdx * 9; + + // Forward: edf[ni] += Kij * edx[nj] + { + const float dj0 = edx[nj * 3 + 0]; + const float dj1 = edx[nj * 3 + 1]; + const float dj2 = edx[nj * 3 + 2]; + edf[ni * 3 + 0] += Kij[0] * dj0 + Kij[1] * dj1 + Kij[2] * dj2; + edf[ni * 3 + 1] += Kij[3] * dj0 + Kij[4] * dj1 + Kij[5] * dj2; + edf[ni * 3 + 2] += Kij[6] * dj0 + Kij[7] * dj1 + Kij[8] * dj2; + } + + // Symmetric: edf[nj] += Kij^T * edx[ni] + { + const float di0 = edx[ni * 3 + 0]; + const float di1 = edx[ni * 3 + 1]; + const float di2 = edx[ni * 3 + 2]; + edf[nj * 3 + 0] += Kij[0] * di0 + Kij[3] * di1 + Kij[6] * di2; + edf[nj * 3 + 1] += Kij[1] * di0 + Kij[4] * di1 + Kij[7] * di2; + edf[nj * 3 + 2] += Kij[2] * di0 + Kij[5] * di1 + Kij[8] * di2; + } + } + } + + // Write: eforce = -kFactor * edf + float* out = eforce + elemId * NNodes * 3; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + out[n * 3 + 0] = -kFactor * edf[n * 3 + 0]; + out[n * 3 + 1] = -kFactor * edf[n * 3 + 1]; + out[n * 3 + 2] = -kFactor * edf[n * 3 + 2]; + } +} + +/** + * Kernel 2: Gather per-vertex forces (1 thread per vertex). + * + * No atomics: each vertex handled by exactly one thread. + * velems is SoA: velems[s * nbVertex + vertexId], 0-terminated. + * Each entry is (elemId * NNodes + localNode + 1), with 0 as sentinel. 
+ */ +__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_gatherDForce_kernel( + int nbVertex, + int maxElemPerVertex, + const int* __restrict__ velems, + const float* __restrict__ eforce, + float* df) +{ + const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; + if (vertexId >= nbVertex) return; + + float fx = 0.0f, fy = 0.0f, fz = 0.0f; + + for (int s = 0; s < maxElemPerVertex; ++s) + { + const int idx = velems[s * nbVertex + vertexId]; + if (idx == 0) break; + const int base = (idx - 1) * 3; + fx += eforce[base + 0]; + fy += eforce[base + 1]; + fz += eforce[base + 2]; + } + + df[vertexId * 3 + 0] += fx; + df[vertexId * 3 + 1] += fy; + df[vertexId * 3 + 2] += fz; +} + +template +static void launchAddDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + float kFactor) +{ + const int computeThreads = 64; + const int gatherThreads = 256; + + { + const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel + <<>>( + nbElem, + (const int*)elements, + (const float*)stiffness, + (const float*)dx, + (float*)eforce, + kFactor); + mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel"); + } + + { + const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementLinearSmallStrainFEMForceFieldCuda3f_gatherDForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const float*)eforce, + (float*)df); + mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_gatherDForce_kernel"); + } +} + +extern "C" +{ + +void ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + 
const void* velems, + float kFactor) +{ + switch (nbNodesPerElem) + { + case 2: launchAddDForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 3: launchAddDForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 4: launchAddDForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 8: launchAddDForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + } +} + +} // extern "C" + +#if defined(__cplusplus) +} // namespace cuda +} // namespace gpu +} // namespace sofa +#endif diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h new file mode 100644 index 00000000000..67ae48abb48 --- /dev/null +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h @@ -0,0 +1,113 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. 
* +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . * +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#pragma once + +#include +#include + +namespace sofa::gpu::cuda +{ + +extern "C" +{ + void ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + float kFactor); +} + +} // namespace sofa::gpu::cuda + +namespace sofa::component::solidmechanics::fem::elastic +{ + +/** + * CUDA-accelerated version of ElementLinearSmallStrainFEMForceField. + * + * Works with any element type (Edge, Triangle, Quad, Tetrahedron, Hexahedron). + * The addDForce method (the CG hot path, called ~250 times per timestep) runs entirely on GPU. + * The addForce method delegates to the CPU parent. + * + * Uses a two-kernel approach for addDForce: + * Kernel 1: compute per-element forces (1 thread/element, fully unrolled) + * Kernel 2: gather per-vertex (1 thread/vertex, no atomics) + * + * Compared to the corotational version, no rotation matrices are needed. 
+ */ +template +class CudaElementLinearSmallStrainFEMForceField + : public ElementLinearSmallStrainFEMForceField +{ +public: + SOFA_CLASS( + SOFA_TEMPLATE2(CudaElementLinearSmallStrainFEMForceField, DataTypes, ElementType), + SOFA_TEMPLATE2(ElementLinearSmallStrainFEMForceField, DataTypes, ElementType)); + + using Real = sofa::Real_t; + using Coord = sofa::Coord_t; + using Deriv = sofa::Deriv_t; + using VecCoord = sofa::VecCoord_t; + using VecDeriv = sofa::VecDeriv_t; + + static const std::string GetCustomClassName() + { + return ElementLinearSmallStrainFEMForceField::GetCustomClassName(); + } + + static const std::string GetCustomTemplateName() + { + return DataTypes::Name(); + } + + void init() override; + + void addDForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& df, + const sofa::DataVecDeriv_t& dx) override; + +protected: + + CudaElementLinearSmallStrainFEMForceField() = default; + + void uploadStiffnessAndConnectivity(); + + gpu::cuda::CudaVector m_gpuStiffness; ///< Symmetric block-format stiffness per element + gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] + gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer + gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated + + unsigned int m_maxElemPerVertex = 0; + unsigned int m_nbVertices = 0; + + bool m_gpuDataUploaded = false; +}; + +} // namespace sofa::component::solidmechanics::fem::elastic diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl new file mode 100644 index 00000000000..95bc4519ed9 --- /dev/null +++ 
b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl @@ -0,0 +1,195 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . * +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#pragma once +#include +#include +#include +#include + +namespace sofa::component::solidmechanics::fem::elastic +{ + +template +void CudaElementLinearSmallStrainFEMForceField::init() +{ + ElementLinearSmallStrainFEMForceField::init(); + + if (!this->isComponentStateInvalid()) + { + uploadStiffnessAndConnectivity(); + } +} + +template +void CudaElementLinearSmallStrainFEMForceField::uploadStiffnessAndConnectivity() +{ + using trait = sofa::component::solidmechanics::fem::elastic::trait; + + if (!this->l_topology) return; + + const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); + const auto& assembledMatrices = this->m_assembledStiffnessMatrices; + + const auto nbElem = 
elements.size(); + constexpr auto nNodes = trait::NumberOfNodesInElement; + constexpr auto dim = trait::spatial_dimensions; + + // Find number of vertices + unsigned int maxNodeId = 0; + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& element = elements[e]; + for (unsigned int n = 0; n < nNodes; ++n) + { + if (static_cast(element[n]) > maxNodeId) + maxNodeId = static_cast(element[n]); + } + } + m_nbVertices = maxNodeId + 1; + + // Upload stiffness matrices in symmetric upper-triangle block format: + // Only blocks (ni, nj) with nj >= ni are stored. + // symIdx = ni * nNodes - ni*(ni-1)/2 + (nj - ni) + // K[symIdx * dim * dim + di * dim + dj] per element + constexpr auto nSymBlocks = nNodes * (nNodes + 1) / 2; + m_gpuStiffness.resize(nbElem * nSymBlocks * dim * dim); + { + auto* dst = m_gpuStiffness.hostWrite(); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& K = assembledMatrices[e]; + for (unsigned int ni = 0; ni < nNodes; ++ni) + { + const unsigned int diagIdx = ni * nNodes - ni * (ni - 1) / 2; + for (unsigned int nj = ni; nj < nNodes; ++nj) + { + const unsigned int symIdx = diagIdx + (nj - ni); + for (unsigned int di = 0; di < dim; ++di) + for (unsigned int dj = 0; dj < dim; ++dj) + dst[e * nSymBlocks * dim * dim + + symIdx * dim * dim + + di * dim + dj] + = static_cast(K[ni * dim + di][nj * dim + dj]); + } + } + } + } + + // Upload element connectivity in SoA layout: + // elements[nodeIdx * nbElem + elemId] = global node index + m_gpuElements.resize(nNodes * nbElem); + { + auto* dst = m_gpuElements.hostWrite(); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& element = elements[e]; + for (unsigned int n = 0; n < nNodes; ++n) + dst[n * nbElem + e] = static_cast(element[n]); + } + } + + // Build vertex-to-element mapping (velems) + std::vector> vertexElems(m_nbVertices); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& element = elements[e]; + for (unsigned int n = 0; n < nNodes; ++n) + { + const int nodeId = 
static_cast(element[n]); + vertexElems[nodeId].push_back( + static_cast(e * nNodes + n + 1)); + } + } + + m_maxElemPerVertex = 0; + for (const auto& ve : vertexElems) + { + if (ve.size() > m_maxElemPerVertex) + m_maxElemPerVertex = static_cast(ve.size()); + } + + m_gpuVelems.resize(m_maxElemPerVertex * m_nbVertices); + { + auto* dst = m_gpuVelems.hostWrite(); + std::memset(dst, 0, m_maxElemPerVertex * m_nbVertices * sizeof(int)); + for (std::size_t v = 0; v < m_nbVertices; ++v) + { + for (std::size_t s = 0; s < vertexElems[v].size(); ++s) + dst[s * m_nbVertices + v] = vertexElems[v][s]; + } + } + + // Allocate intermediate per-element force buffer + m_gpuElementForce.resize(nbElem * nNodes * dim); + + m_gpuDataUploaded = true; +} + +template +void CudaElementLinearSmallStrainFEMForceField::addDForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& d_df, + const sofa::DataVecDeriv_t& d_dx) +{ + if (this->isComponentStateInvalid()) + return; + + if (!m_gpuDataUploaded) + { + // Fallback to CPU if GPU data not ready + ElementLinearSmallStrainFEMForceField::addDForce(mparams, d_df, d_dx); + return; + } + + using trait = sofa::component::solidmechanics::fem::elastic::trait; + + VecDeriv& df = *d_df.beginEdit(); + const VecDeriv& dx = d_dx.getValue(); + + if (df.size() < dx.size()) + df.resize(dx.size()); + + const auto kFactor = static_cast( + sofa::core::mechanicalparams::kFactorIncludingRayleighDamping( + mparams, this->rayleighStiffness.getValue())); + + const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); + const auto nbElem = static_cast(elements.size()); + const auto nbVertex = static_cast(dx.size()); + + gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( + nbElem, + nbVertex, + trait::NumberOfNodesInElement, + m_maxElemPerVertex, + m_gpuElements.deviceRead(), + m_gpuStiffness.deviceRead(), + dx.deviceRead(), + df.deviceWrite(), + m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead(), + 
kFactor); + + d_df.endEdit(); +} + +} // namespace sofa::component::solidmechanics::fem::elastic diff --git a/applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn b/applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn new file mode 100644 index 00000000000..c59fb6a6c2a --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 566389d0823eebb2359fee5b9f3710339899e08f Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Tue, 7 Apr 2026 08:43:32 +0900 Subject: [PATCH 07/21] add cuda version of addforce for ElementLinearSmallStrainFEMForceField --- ...daElementLinearSmallStrainFEMForceField.cu | 210 +++++++++++++++--- ...udaElementLinearSmallStrainFEMForceField.h | 22 +- ...aElementLinearSmallStrainFEMForceField.inl | 47 ++++ 3 files changed, 245 insertions(+), 34 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu index 4c474fe2d09..39c67a27db4 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu @@ -33,15 +33,105 @@ namespace cuda #endif /** - * Kernel 1: Compute per-element dForce (1 thread per element). + * Kernel for addForce: Compute per-element force from displacement (1 thread per element). * + * f = -K * (x - x0) * Templated on NNodes (compile-time) for full loop unrolling. * Hardcoded Dim=3 (CudaVec3f only). 
+ */ +template +__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( + int nbElem, + const int* __restrict__ elements, + const float* __restrict__ stiffness, + const float* __restrict__ x, + const float* __restrict__ x0, + float* __restrict__ eforce) +{ + constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; + + const int elemId = blockIdx.x * blockDim.x + threadIdx.x; + if (elemId >= nbElem) return; + + // Gather displacement = x - x0 for this element's nodes + float disp[NNodes * 3]; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const int nodeId = elements[n * nbElem + elemId]; + disp[n * 3 + 0] = x[nodeId * 3 + 0] - x0[nodeId * 3 + 0]; + disp[n * 3 + 1] = x[nodeId * 3 + 1] - x0[nodeId * 3 + 1]; + disp[n * 3 + 2] = x[nodeId * 3 + 2] - x0[nodeId * 3 + 2]; + } + + // Symmetric block-matrix multiply: edf = K * disp + const float* K = stiffness + elemId * NSymBlocks * 9; + float edf[NNodes * 3]; + + #pragma unroll + for (int i = 0; i < NNodes * 3; ++i) + edf[i] = 0.0f; + + #pragma unroll + for (int ni = 0; ni < NNodes; ++ni) + { + const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; + + // Diagonal block + { + const float* Kii = K + diagIdx * 9; + const float di0 = disp[ni * 3 + 0]; + const float di1 = disp[ni * 3 + 1]; + const float di2 = disp[ni * 3 + 2]; + edf[ni * 3 + 0] += Kii[0] * di0 + Kii[1] * di1 + Kii[2] * di2; + edf[ni * 3 + 1] += Kii[3] * di0 + Kii[4] * di1 + Kii[5] * di2; + edf[ni * 3 + 2] += Kii[6] * di0 + Kii[7] * di1 + Kii[8] * di2; + } + + // Off-diagonal blocks + #pragma unroll + for (int nj = ni + 1; nj < NNodes; ++nj) + { + const int symIdx = diagIdx + (nj - ni); + const float* Kij = K + symIdx * 9; + + { + const float dj0 = disp[nj * 3 + 0]; + const float dj1 = disp[nj * 3 + 1]; + const float dj2 = disp[nj * 3 + 2]; + edf[ni * 3 + 0] += Kij[0] * dj0 + Kij[1] * dj1 + Kij[2] * dj2; + edf[ni * 3 + 1] += Kij[3] * dj0 + Kij[4] * dj1 + Kij[5] * dj2; + edf[ni * 3 + 2] += Kij[6] * dj0 + Kij[7] * dj1 + Kij[8] * dj2; 
+ } + + { + const float di0 = disp[ni * 3 + 0]; + const float di1 = disp[ni * 3 + 1]; + const float di2 = disp[ni * 3 + 2]; + edf[nj * 3 + 0] += Kij[0] * di0 + Kij[3] * di1 + Kij[6] * di2; + edf[nj * 3 + 1] += Kij[1] * di0 + Kij[4] * di1 + Kij[7] * di2; + edf[nj * 3 + 2] += Kij[2] * di0 + Kij[5] * di1 + Kij[8] * di2; + } + } + } + + // Write: eforce = -edf (minus sign from f -= K * displacement) + float* out = eforce + elemId * NNodes * 3; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + out[n * 3 + 0] = -edf[n * 3 + 0]; + out[n * 3 + 1] = -edf[n * 3 + 1]; + out[n * 3 + 2] = -edf[n * 3 + 2]; + } +} + +/** + * Kernel for addDForce: Compute per-element dForce (1 thread per element). * - * No rotation matrices needed (linear small strain). - * Stiffness uses symmetric upper-triangle block storage: - * Only blocks (ni, nj) with nj >= ni are stored. - * NSymBlocks = NNodes*(NNodes+1)/2 blocks of 9 floats each. + * df = -kFactor * K * dx + * Templated on NNodes (compile-time) for full loop unrolling. + * Hardcoded Dim=3 (CudaVec3f only). */ template __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel( @@ -133,13 +223,14 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel } /** - * Kernel 2: Gather per-vertex forces (1 thread per vertex). + * Gather per-vertex forces (1 thread per vertex). * + * Shared by both addForce and addDForce. * No atomics: each vertex handled by exactly one thread. * velems is SoA: velems[s * nbVertex + vertexId], 0-terminated. * Each entry is (elemId * NNodes + localNode + 1), with 0 as sentinel. 
*/ -__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_gatherDForce_kernel( +__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_gatherForce_kernel( int nbVertex, int maxElemPerVertex, const int* __restrict__ velems, @@ -166,6 +257,53 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_gatherDForce_kernel( df[vertexId * 3 + 2] += fz; } +static void launchGather( + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* velems, + const void* eforce, + void* f) +{ + const int gatherThreads = 256; + const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementLinearSmallStrainFEMForceFieldCuda3f_gatherForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const float*)eforce, + (float*)f); + mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_gatherForce_kernel"); +} + +template +static void launchAddForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems) +{ + const int computeThreads = 64; + const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel + <<>>( + nbElem, + (const int*)elements, + (const float*)stiffness, + (const float*)x, + (const float*)x0, + (float*)eforce); + mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel"); + + launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); +} + template static void launchAddDForce( unsigned int nbElem, @@ -180,37 +318,45 @@ static void launchAddDForce( float kFactor) { const int computeThreads = 64; - const int gatherThreads = 256; - - { - const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel - <<>>( - nbElem, - (const int*)elements, - (const float*)stiffness, - (const 
float*)dx, - (float*)eforce, - kFactor); - mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel"); - } + const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel + <<>>( + nbElem, + (const int*)elements, + (const float*)stiffness, + (const float*)dx, + (float*)eforce, + kFactor); + mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel"); - { - const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementLinearSmallStrainFEMForceFieldCuda3f_gatherDForce_kernel - <<>>( - nbVertex, - maxElemPerVertex, - (const int*)velems, - (const float*)eforce, - (float*)df); - mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_gatherDForce_kernel"); - } + launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); } extern "C" { +void ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems) +{ + switch (nbNodesPerElem) + { + case 2: launchAddForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 3: launchAddForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 4: launchAddForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 8: launchAddForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + } +} + void ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( unsigned int nbElem, unsigned int nbVertex, diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h 
b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h index 67ae48abb48..53cfaf663c5 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h @@ -29,6 +29,19 @@ namespace sofa::gpu::cuda extern "C" { + void ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems); + void ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( unsigned int nbElem, unsigned int nbVertex, @@ -52,8 +65,7 @@ namespace sofa::component::solidmechanics::fem::elastic * CUDA-accelerated version of ElementLinearSmallStrainFEMForceField. * * Works with any element type (Edge, Triangle, Quad, Tetrahedron, Hexahedron). - * The addDForce method (the CG hot path, called ~250 times per timestep) runs entirely on GPU. - * The addForce method delegates to the CPU parent. + * Both addForce and addDForce run entirely on GPU. 
* * Uses a two-kernel approach for addDForce: * Kernel 1: compute per-element forces (1 thread/element, fully unrolled) @@ -88,6 +100,12 @@ class CudaElementLinearSmallStrainFEMForceField void init() override; + void addForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& f, + const sofa::DataVecCoord_t& x, + const sofa::DataVecDeriv_t& v) override; + void addDForce( const sofa::core::MechanicalParams* mparams, sofa::DataVecDeriv_t& df, diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl index 95bc4519ed9..1ab9dfb33f5 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl @@ -144,6 +144,53 @@ void CudaElementLinearSmallStrainFEMForceField::uploadSt m_gpuDataUploaded = true; } +template +void CudaElementLinearSmallStrainFEMForceField::addForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& d_f, + const sofa::DataVecCoord_t& d_x, + const sofa::DataVecDeriv_t& d_v) +{ + if (this->isComponentStateInvalid()) + return; + + if (!m_gpuDataUploaded) + { + ElementLinearSmallStrainFEMForceField::addForce(mparams, d_f, d_x, d_v); + return; + } + + using trait = sofa::component::solidmechanics::fem::elastic::trait; + + VecDeriv& f = *d_f.beginEdit(); + const VecCoord& x = d_x.getValue(); + + if (f.size() < x.size()) + f.resize(x.size()); + + auto restPositionAccessor = this->mstate->readRestPositions(); + const VecCoord& x0 = restPositionAccessor.ref(); + + const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); + const auto nbElem = 
static_cast(elements.size()); + const auto nbVertex = static_cast(x.size()); + + gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( + nbElem, + nbVertex, + trait::NumberOfNodesInElement, + m_maxElemPerVertex, + m_gpuElements.deviceRead(), + m_gpuStiffness.deviceRead(), + x.deviceRead(), + x0.deviceRead(), + f.deviceWrite(), + m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + + d_f.endEdit(); +} + template void CudaElementLinearSmallStrainFEMForceField::addDForce( const sofa::core::MechanicalParams* mparams, From 27c0bc75a87cbe148a4bb9c9f96d85baf2337e71 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Tue, 7 Apr 2026 09:01:32 +0900 Subject: [PATCH 08/21] add cuda version of addforce for ElementCorotationalFEMForceField --- .../CudaElementCorotationalFEMForceField.cu | 332 +++++++++++++----- .../CudaElementCorotationalFEMForceField.h | 14 + .../CudaElementCorotationalFEMForceField.inl | 53 ++- 3 files changed, 304 insertions(+), 95 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index 52ec3af12af..4a2ff9028b9 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -33,18 +33,163 @@ namespace cuda #endif /** - * Kernel 1: Compute per-element dForce (1 thread per element). + * Symmetric block-matrix multiply: out = K * in + * K stored as upper triangle: NSymBlocks = NNodes*(NNodes+1)/2 blocks of 9 floats. + * Inline device function shared by both addForce and addDForce kernels. 
+ */ +template +__device__ void symBlockMatMul(const float* K, const float* in, float* out) +{ + constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; + + #pragma unroll + for (int i = 0; i < NNodes * 3; ++i) + out[i] = 0.0f; + + #pragma unroll + for (int ni = 0; ni < NNodes; ++ni) + { + const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; + + // Diagonal block + { + const float* Kii = K + diagIdx * 9; + const float i0 = in[ni * 3 + 0]; + const float i1 = in[ni * 3 + 1]; + const float i2 = in[ni * 3 + 2]; + out[ni * 3 + 0] += Kii[0] * i0 + Kii[1] * i1 + Kii[2] * i2; + out[ni * 3 + 1] += Kii[3] * i0 + Kii[4] * i1 + Kii[5] * i2; + out[ni * 3 + 2] += Kii[6] * i0 + Kii[7] * i1 + Kii[8] * i2; + } + + // Off-diagonal blocks + #pragma unroll + for (int nj = ni + 1; nj < NNodes; ++nj) + { + const int symIdx = diagIdx + (nj - ni); + const float* Kij = K + symIdx * 9; + + // Forward: out[ni] += Kij * in[nj] + { + const float j0 = in[nj * 3 + 0]; + const float j1 = in[nj * 3 + 1]; + const float j2 = in[nj * 3 + 2]; + out[ni * 3 + 0] += Kij[0] * j0 + Kij[1] * j1 + Kij[2] * j2; + out[ni * 3 + 1] += Kij[3] * j0 + Kij[4] * j1 + Kij[5] * j2; + out[ni * 3 + 2] += Kij[6] * j0 + Kij[7] * j1 + Kij[8] * j2; + } + + // Symmetric: out[nj] += Kij^T * in[ni] + { + const float i0 = in[ni * 3 + 0]; + const float i1 = in[ni * 3 + 1]; + const float i2 = in[ni * 3 + 2]; + out[nj * 3 + 0] += Kij[0] * i0 + Kij[3] * i1 + Kij[6] * i2; + out[nj * 3 + 1] += Kij[1] * i0 + Kij[4] * i1 + Kij[7] * i2; + out[nj * 3 + 2] += Kij[2] * i0 + Kij[5] * i1 + Kij[8] * i2; + } + } + } +} + +/** + * Kernel for addForce: Compute per-element force (1 thread per element). * - * Templated on NNodes (compile-time) for full loop unrolling. - * Hardcoded Dim=3 (CudaVec3f only). 
+ * displacement[j] = R^T * (x[j] - centroid_x) - (x0[j] - centroid_x0) + * elementForce = K * displacement + * out[j] = -R * elementForce[j] + */ +template +__global__ void ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel( + int nbElem, + const int* __restrict__ elements, + const float* __restrict__ rotations, + const float* __restrict__ stiffness, + const float* __restrict__ x, + const float* __restrict__ x0, + float* __restrict__ eforce) +{ + constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; + constexpr float invN = 1.0f / NNodes; + + const int elemId = blockIdx.x * blockDim.x + threadIdx.x; + if (elemId >= nbElem) return; + + // Load rotation matrix R (3x3, row-major) + const float* Rptr = rotations + elemId * 9; + float R[9]; + #pragma unroll + for (int i = 0; i < 9; ++i) + R[i] = Rptr[i]; + + // Gather node positions and rest positions + float ex[NNodes * 3], ex0[NNodes * 3]; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const int nodeId = elements[n * nbElem + elemId]; + ex[n * 3 + 0] = x[nodeId * 3 + 0]; + ex[n * 3 + 1] = x[nodeId * 3 + 1]; + ex[n * 3 + 2] = x[nodeId * 3 + 2]; + ex0[n * 3 + 0] = x0[nodeId * 3 + 0]; + ex0[n * 3 + 1] = x0[nodeId * 3 + 1]; + ex0[n * 3 + 2] = x0[nodeId * 3 + 2]; + } + + // Compute centroids + float cx = 0.0f, cy = 0.0f, cz = 0.0f; + float cx0 = 0.0f, cy0 = 0.0f, cz0 = 0.0f; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + cx += ex[n * 3 + 0]; cy += ex[n * 3 + 1]; cz += ex[n * 3 + 2]; + cx0 += ex0[n * 3 + 0]; cy0 += ex0[n * 3 + 1]; cz0 += ex0[n * 3 + 2]; + } + cx *= invN; cy *= invN; cz *= invN; + cx0 *= invN; cy0 *= invN; cz0 *= invN; + + // Compute displacement: disp[j] = R^T * (x[j] - centroid) - (x0[j] - centroid0) + float disp[NNodes * 3]; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const float dx = ex[n * 3 + 0] - cx; + const float dy = ex[n * 3 + 1] - cy; + const float dz = ex[n * 3 + 2] - cz; + + // R^T * (x - centroid) + const float rx = R[0] * dx + R[3] * dy + R[6] * dz; 
+ const float ry = R[1] * dx + R[4] * dy + R[7] * dz; + const float rz = R[2] * dx + R[5] * dy + R[8] * dz; + + disp[n * 3 + 0] = rx - (ex0[n * 3 + 0] - cx0); + disp[n * 3 + 1] = ry - (ex0[n * 3 + 1] - cy0); + disp[n * 3 + 2] = rz - (ex0[n * 3 + 2] - cz0); + } + + // edf = K * disp + float edf[NNodes * 3]; + const float* K = stiffness + elemId * NSymBlocks * 9; + symBlockMatMul(K, disp, edf); + + // Rotate back and write: out = -R * edf + float* out = eforce + elemId * NNodes * 3; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const float e0 = edf[n * 3 + 0]; + const float e1 = edf[n * 3 + 1]; + const float e2 = edf[n * 3 + 2]; + out[n * 3 + 0] = -(R[0] * e0 + R[1] * e1 + R[2] * e2); + out[n * 3 + 1] = -(R[3] * e0 + R[4] * e1 + R[5] * e2); + out[n * 3 + 2] = -(R[6] * e0 + R[7] * e1 + R[8] * e2); + } +} + +/** + * Kernel for addDForce: Compute per-element dForce (1 thread per element). * - * Connectivity is SoA: elements[nodeIdx * nbElem + elemId]. - * Stiffness uses symmetric upper-triangle block storage: - * Only blocks (ni, nj) with nj >= ni are stored. - * NSymBlocks = NNodes*(NNodes+1)/2 blocks of 9 floats each. - * Each off-diagonal block is read once and used for both - * forward (edf[ni] += Kij * rdx[nj]) and symmetric - * (edf[nj] += Kij^T * rdx[ni]) contributions. 
+ * rdx = R^T * dx, edf = K * rdx, out = -kFactor * R * edf */ template __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( @@ -84,59 +229,9 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( } // Symmetric block-matrix multiply: edf = K * rdx - // K stored as upper triangle: blocks (ni, nj) for nj >= ni const float* K = stiffness + elemId * NSymBlocks * 9; float edf[NNodes * 3]; - - #pragma unroll - for (int i = 0; i < NNodes * 3; ++i) - edf[i] = 0.0f; - - #pragma unroll - for (int ni = 0; ni < NNodes; ++ni) - { - // symIdx for (ni, ni) = ni*NNodes - ni*(ni-1)/2 - const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; - - // Diagonal block (ni, ni): Kii * rdx[ni] - { - const float* Kii = K + diagIdx * 9; - const float ri0 = rdx[ni * 3 + 0]; - const float ri1 = rdx[ni * 3 + 1]; - const float ri2 = rdx[ni * 3 + 2]; - edf[ni * 3 + 0] += Kii[0] * ri0 + Kii[1] * ri1 + Kii[2] * ri2; - edf[ni * 3 + 1] += Kii[3] * ri0 + Kii[4] * ri1 + Kii[5] * ri2; - edf[ni * 3 + 2] += Kii[6] * ri0 + Kii[7] * ri1 + Kii[8] * ri2; - } - - // Off-diagonal blocks (ni, nj) for nj > ni - #pragma unroll - for (int nj = ni + 1; nj < NNodes; ++nj) - { - const int symIdx = diagIdx + (nj - ni); - const float* Kij = K + symIdx * 9; - - // Forward: edf[ni] += Kij * rdx[nj] - { - const float rj0 = rdx[nj * 3 + 0]; - const float rj1 = rdx[nj * 3 + 1]; - const float rj2 = rdx[nj * 3 + 2]; - edf[ni * 3 + 0] += Kij[0] * rj0 + Kij[1] * rj1 + Kij[2] * rj2; - edf[ni * 3 + 1] += Kij[3] * rj0 + Kij[4] * rj1 + Kij[5] * rj2; - edf[ni * 3 + 2] += Kij[6] * rj0 + Kij[7] * rj1 + Kij[8] * rj2; - } - - // Symmetric: edf[nj] += Kij^T * rdx[ni] - { - const float ri0 = rdx[ni * 3 + 0]; - const float ri1 = rdx[ni * 3 + 1]; - const float ri2 = rdx[ni * 3 + 2]; - edf[nj * 3 + 0] += Kij[0] * ri0 + Kij[3] * ri1 + Kij[6] * ri2; - edf[nj * 3 + 1] += Kij[1] * ri0 + Kij[4] * ri1 + Kij[7] * ri2; - edf[nj * 3 + 2] += Kij[2] * ri0 + Kij[5] * ri1 + Kij[8] * ri2; - } - } - } + 
symBlockMatMul(K, rdx, edf); // Rotate back and write: eforce = -kFactor * R * edf float* out = eforce + elemId * NNodes * 3; @@ -153,13 +248,14 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( } /** - * Kernel 2: Gather per-vertex forces (1 thread per vertex). + * Gather per-vertex forces (1 thread per vertex). * + * Shared by addForce and addDForce. * No atomics: each vertex handled by exactly one thread. * velems is SoA: velems[s * nbVertex + vertexId], 0-terminated. * Each entry is (elemId * NNodes + localNode + 1), with 0 as sentinel. */ -__global__ void ElementCorotationalFEMForceFieldCuda3f_gatherDForce_kernel( +__global__ void ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel( int nbVertex, int maxElemPerVertex, const int* __restrict__ velems, @@ -186,6 +282,55 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_gatherDForce_kernel( df[vertexId * 3 + 2] += fz; } +static void launchGather( + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* velems, + const void* eforce, + void* f) +{ + const int gatherThreads = 256; + const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const float*)eforce, + (float*)f); + mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel"); +} + +template +static void launchAddForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems) +{ + const int computeThreads = 64; + const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel + <<>>( + nbElem, + (const int*)elements, + (const float*)rotations, + (const float*)stiffness, + (const float*)x, + (const 
float*)x0, + (float*)eforce); + mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel"); + + launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); +} + template static void launchAddDForce( unsigned int nbElem, @@ -201,38 +346,47 @@ static void launchAddDForce( float kFactor) { const int computeThreads = 64; - const int gatherThreads = 256; + const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel + <<>>( + nbElem, + (const int*)elements, + (const float*)rotations, + (const float*)stiffness, + (const float*)dx, + (float*)eforce, + kFactor); + mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel"); - { - const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel - <<>>( - nbElem, - (const int*)elements, - (const float*)rotations, - (const float*)stiffness, - (const float*)dx, - (float*)eforce, - kFactor); - mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel"); - } - - { - const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementCorotationalFEMForceFieldCuda3f_gatherDForce_kernel - <<>>( - nbVertex, - maxElemPerVertex, - (const int*)velems, - (const float*)eforce, - (float*)df); - mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_gatherDForce_kernel"); - } + launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); } extern "C" { +void ElementCorotationalFEMForceFieldCuda3f_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems) +{ + switch (nbNodesPerElem) + { + case 2: launchAddForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; 
+ case 3: launchAddForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 4: launchAddForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 8: launchAddForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + } +} + void ElementCorotationalFEMForceFieldCuda3f_addDForce( unsigned int nbElem, unsigned int nbVertex, diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h index c5220a2f2be..74a9adc1a5b 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h @@ -29,6 +29,20 @@ namespace sofa::gpu::cuda extern "C" { + void ElementCorotationalFEMForceFieldCuda3f_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems); + void ElementCorotationalFEMForceFieldCuda3f_addDForce( unsigned int nbElem, unsigned int nbVertex, diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index 1e1093758a9..6cd70af4c7c 100644 --- 
a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -176,15 +176,56 @@ void CudaElementCorotationalFEMForceField::uploadRotatio template void CudaElementCorotationalFEMForceField::addForce( const sofa::core::MechanicalParams* mparams, - sofa::DataVecDeriv_t& f, - const sofa::DataVecCoord_t& x, - const sofa::DataVecDeriv_t& v) + sofa::DataVecDeriv_t& d_f, + const sofa::DataVecCoord_t& d_x, + const sofa::DataVecDeriv_t& d_v) { - // Run on CPU: computes rotations and forces - ElementCorotationalFEMForceField::addForce(mparams, f, x, v); + if (this->isComponentStateInvalid()) + return; + + if (!m_gpuDataUploaded) + { + ElementCorotationalFEMForceField::addForce(mparams, d_f, d_x, d_v); + uploadRotations(); + return; + } - // Upload the freshly-computed rotations to GPU for subsequent addDForce calls + using trait = sofa::component::solidmechanics::fem::elastic::trait; + + const VecCoord& x = d_x.getValue(); + auto restPositionAccessor = this->mstate->readRestPositions(); + const VecCoord& x0 = restPositionAccessor.ref(); + + // Compute rotations on CPU (polar decomposition cannot run on GPU) + this->computeRotations(this->m_rotations, x, x0); + + // Upload rotations to GPU uploadRotations(); + + // Run force computation on GPU + VecDeriv& f = *d_f.beginEdit(); + if (f.size() < x.size()) + f.resize(x.size()); + + const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); + const auto nbElem = static_cast(elements.size()); + const auto nbVertex = static_cast(x.size()); + + gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForce( + nbElem, + nbVertex, + trait::NumberOfNodesInElement, + m_maxElemPerVertex, + m_gpuElements.deviceRead(), + m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), + x.deviceRead(), + 
x0.deviceRead(), + f.deviceWrite(), + m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + + d_f.endEdit(); } template From d6eca76bed2b138af810c56cd4e50a6493002c9b Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Tue, 7 Apr 2026 09:13:11 +0900 Subject: [PATCH 09/21] update addforce to compute everything on GPU --- .../CudaElementCorotationalFEMForceField.cu | 278 +++++++++++++++++- .../CudaElementCorotationalFEMForceField.h | 28 +- .../CudaElementCorotationalFEMForceField.inl | 95 ++++-- 3 files changed, 371 insertions(+), 30 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index 4a2ff9028b9..fd31eeab487 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -32,16 +32,128 @@ namespace cuda { #endif +/** + * Device helper: 3x3 matrix multiply C = A * B (row-major) + */ +__device__ void mat3Mul(const float* A, const float* B, float* C) +{ + #pragma unroll + for (int i = 0; i < 3; ++i) + { + #pragma unroll + for (int j = 0; j < 3; ++j) + { + C[i * 3 + j] = A[i * 3 + 0] * B[0 * 3 + j] + + A[i * 3 + 1] * B[1 * 3 + j] + + A[i * 3 + 2] * B[2 * 3 + j]; + } + } +} + +/** + * Device helper: C = A * B^T (row-major) + */ +__device__ void mat3MulTranspose(const float* A, const float* BT, float* C) +{ + #pragma unroll + for (int i = 0; i < 3; ++i) + { + #pragma unroll + for (int j = 0; j < 3; ++j) + { + C[i * 3 + j] = A[i * 3 + 0] * BT[j * 3 + 0] + + A[i * 3 + 1] * BT[j * 3 + 1] + + A[i * 3 + 2] * BT[j * 3 + 2]; + } + } +} + +/** + * Device helper: compute rotation frame from first 3 nodes 
(TriangleRotation). + * Used for Triangle (NNodes=3) and Tetrahedron (NNodes=4) elements. + * ex is [NNodes*3] array of gathered node positions. + */ +__device__ void computeTriangleFrame(const float* ex, float* frame) +{ + // xAxis = normalize(p1 - p0) + float ax = ex[3] - ex[0], ay = ex[4] - ex[1], az = ex[5] - ex[2]; + float invLen = rsqrtf(ax * ax + ay * ay + az * az); + ax *= invLen; ay *= invLen; az *= invLen; + + // tmp yAxis = p2 - p0 + float bx = ex[6] - ex[0], by = ex[7] - ex[1], bz = ex[8] - ex[2]; + + // zAxis = normalize(cross(xAxis, tmpY)) + float cx = ay * bz - az * by; + float cy = az * bx - ax * bz; + float cz = ax * by - ay * bx; + invLen = rsqrtf(cx * cx + cy * cy + cz * cz); + cx *= invLen; cy *= invLen; cz *= invLen; + + // yAxis = cross(zAxis, xAxis) + bx = cy * az - cz * ay; + by = cz * ax - cx * az; + bz = cx * ay - cy * ax; + + // frame rows: [xAxis; yAxis; zAxis] + frame[0] = ax; frame[1] = ay; frame[2] = az; + frame[3] = bx; frame[4] = by; frame[5] = bz; + frame[6] = cx; frame[7] = cy; frame[8] = cz; +} + +/** + * Device helper: compute rotation frame from 8 hexahedron nodes (HexahedronRotation). + * ex is [8*3] array of gathered node positions. 
+ */ +__device__ void computeHexahedronFrame(const float* ex, float* frame) +{ + // Average edge vectors + // xAxis_avg = ((n1-n0) + (n2-n3) + (n5-n4) + (n6-n7)) * 0.25 + float ax = ((ex[1*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[3*3+0]) + + (ex[5*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[7*3+0])) * 0.25f; + float ay = ((ex[1*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[3*3+1]) + + (ex[5*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[7*3+1])) * 0.25f; + float az = ((ex[1*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[3*3+2]) + + (ex[5*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[7*3+2])) * 0.25f; + + // yAxis_avg = ((n3-n0) + (n2-n1) + (n7-n4) + (n6-n5)) * 0.25 + float bx = ((ex[3*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[1*3+0]) + + (ex[7*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[5*3+0])) * 0.25f; + float by = ((ex[3*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[1*3+1]) + + (ex[7*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[5*3+1])) * 0.25f; + float bz = ((ex[3*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[1*3+2]) + + (ex[7*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[5*3+2])) * 0.25f; + + // Normalize xAxis + float invLen = rsqrtf(ax * ax + ay * ay + az * az); + ax *= invLen; ay *= invLen; az *= invLen; + + // zAxis = normalize(cross(xAxis, yAxis_avg)) + float cx = ay * bz - az * by; + float cy = az * bx - ax * bz; + float cz = ax * by - ay * bx; + invLen = rsqrtf(cx * cx + cy * cy + cz * cz); + cx *= invLen; cy *= invLen; cz *= invLen; + + // yAxis = cross(zAxis, xAxis) + bx = cy * az - cz * ay; + by = cz * ax - cx * az; + bz = cx * ay - cy * ax; + + // frame rows: [xAxis; yAxis; zAxis] + frame[0] = ax; frame[1] = ay; frame[2] = az; + frame[3] = bx; frame[4] = by; frame[5] = bz; + frame[6] = cx; frame[7] = cy; frame[8] = cz; +} + /** * Symmetric block-matrix multiply: out = K * in * K stored as upper triangle: NSymBlocks = NNodes*(NNodes+1)/2 blocks of 9 floats. - * Inline device function shared by both addForce and addDForce kernels. + * Inline device function shared by addForce, addDForce, and combined kernels. 
*/ template __device__ void symBlockMatMul(const float* K, const float* in, float* out) { - constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; - #pragma unroll for (int i = 0; i < NNodes * 3; ++i) out[i] = 0.0f; @@ -92,6 +204,111 @@ __device__ void symBlockMatMul(const float* K, const float* in, float* out) } } +/** + * Combined kernel: compute rotations AND per-element forces in one pass. + * + * Uses TriangleRotation for NNodes=3,4 and HexahedronRotation for NNodes=8. + * Computes: frame from node positions → R = frame * initRotTransposed + * Then: displacement = R^T*(x-centroid) - (x0-centroid0) → K*disp → -R*result + * Also writes R to rotations buffer for subsequent addDForce calls. + */ +template +__global__ void ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_kernel( + int nbElem, + const int* __restrict__ elements, + const float* __restrict__ initRotTransposed, + const float* __restrict__ stiffness, + const float* __restrict__ x, + const float* __restrict__ x0, + float* __restrict__ rotationsOut, + float* __restrict__ eforce) +{ + constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; + constexpr float invN = 1.0f / NNodes; + + const int elemId = blockIdx.x * blockDim.x + threadIdx.x; + if (elemId >= nbElem) return; + + // Gather node positions and rest positions + float ex[NNodes * 3], ex0[NNodes * 3]; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const int nodeId = elements[n * nbElem + elemId]; + ex[n * 3 + 0] = x[nodeId * 3 + 0]; + ex[n * 3 + 1] = x[nodeId * 3 + 1]; + ex[n * 3 + 2] = x[nodeId * 3 + 2]; + ex0[n * 3 + 0] = x0[nodeId * 3 + 0]; + ex0[n * 3 + 1] = x0[nodeId * 3 + 1]; + ex0[n * 3 + 2] = x0[nodeId * 3 + 2]; + } + + // Compute rotation frame from current positions + float frame[9]; + if constexpr (NNodes == 8) + computeHexahedronFrame(ex, frame); + else + computeTriangleFrame(ex, frame); + + // R = frame * initRotTransposed^T (i.e. 
frame.multTranspose(initRotTransposed)) + // Since initRotTransposed is already the transpose, R = frame * initRotTransposed^T + const float* irt = initRotTransposed + elemId * 9; + float R[9]; + mat3MulTranspose(frame, irt, R); + + // Write R to rotations buffer for addDForce + float* Rout = rotationsOut + elemId * 9; + #pragma unroll + for (int i = 0; i < 9; ++i) + Rout[i] = R[i]; + + // Compute centroids + float cx = 0.0f, cy = 0.0f, cz = 0.0f; + float cx0 = 0.0f, cy0 = 0.0f, cz0 = 0.0f; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + cx += ex[n * 3 + 0]; cy += ex[n * 3 + 1]; cz += ex[n * 3 + 2]; + cx0 += ex0[n * 3 + 0]; cy0 += ex0[n * 3 + 1]; cz0 += ex0[n * 3 + 2]; + } + cx *= invN; cy *= invN; cz *= invN; + cx0 *= invN; cy0 *= invN; cz0 *= invN; + + // Compute displacement: disp[j] = R^T * (x[j] - centroid) - (x0[j] - centroid0) + float disp[NNodes * 3]; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const float dx = ex[n * 3 + 0] - cx; + const float dy = ex[n * 3 + 1] - cy; + const float dz = ex[n * 3 + 2] - cz; + const float rx = R[0] * dx + R[3] * dy + R[6] * dz; + const float ry = R[1] * dx + R[4] * dy + R[7] * dz; + const float rz = R[2] * dx + R[5] * dy + R[8] * dz; + disp[n * 3 + 0] = rx - (ex0[n * 3 + 0] - cx0); + disp[n * 3 + 1] = ry - (ex0[n * 3 + 1] - cy0); + disp[n * 3 + 2] = rz - (ex0[n * 3 + 2] - cz0); + } + + // edf = K * disp + float edf[NNodes * 3]; + const float* K = stiffness + elemId * NSymBlocks * 9; + symBlockMatMul(K, disp, edf); + + // Rotate back and write: out = -R * edf + float* out = eforce + elemId * NNodes * 3; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const float e0 = edf[n * 3 + 0]; + const float e1 = edf[n * 3 + 1]; + const float e2 = edf[n * 3 + 2]; + out[n * 3 + 0] = -(R[0] * e0 + R[1] * e1 + R[2] * e2); + out[n * 3 + 1] = -(R[3] * e0 + R[4] * e1 + R[5] * e2); + out[n * 3 + 2] = -(R[6] * e0 + R[7] * e1 + R[8] * e2); + } +} + /** * Kernel for addForce: Compute per-element force (1 thread 
per element). * @@ -301,6 +518,38 @@ static void launchGather( mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel"); } +template +static void launchAddForceWithRotations( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* initRotTransposed, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + void* rotationsOut, + const void* velems) +{ + const int computeThreads = 64; + const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_kernel + <<>>( + nbElem, + (const int*)elements, + (const float*)initRotTransposed, + (const float*)stiffness, + (const float*)x, + (const float*)x0, + (float*)rotationsOut, + (float*)eforce); + mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_kernel"); + + launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); +} + template static void launchAddForce( unsigned int nbElem, @@ -364,6 +613,29 @@ static void launchAddDForce( extern "C" { +void ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* initRotTransposed, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + void* rotationsOut, + const void* velems) +{ + switch (nbNodesPerElem) + { + case 3: launchAddForceWithRotations<3>(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; + case 4: launchAddForceWithRotations<4>(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; + case 8: launchAddForceWithRotations<8>(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, 
velems); break; + } +} + void ElementCorotationalFEMForceFieldCuda3f_addForce( unsigned int nbElem, unsigned int nbVertex, diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h index 74a9adc1a5b..e25f9a7b485 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h @@ -29,6 +29,21 @@ namespace sofa::gpu::cuda extern "C" { + void ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* initRotTransposed, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + void* rotationsOut, + const void* velems); + void ElementCorotationalFEMForceFieldCuda3f_addForce( unsigned int nbElem, unsigned int nbVertex, @@ -118,18 +133,21 @@ class CudaElementCorotationalFEMForceField void uploadStiffnessAndConnectivity(); void uploadRotations(); + void uploadInitialRotationsTransposed(); - gpu::cuda::CudaVector m_gpuStiffness; ///< Block-format stiffness: K[(ni*N+nj)*9 + di*3+dj] per element - gpu::cuda::CudaVector m_gpuRotations; ///< Flat 3x3 rotation matrices per element - gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] - gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer - gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated + gpu::cuda::CudaVector m_gpuStiffness; ///< Symmetric block-format stiffness per element + gpu::cuda::CudaVector 
m_gpuRotations; ///< Flat 3x3 rotation matrices per element + gpu::cuda::CudaVector m_gpuInitialRotationsTransposed; ///< Flat 3x3 initial rotation transposed per element + gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] + gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer + gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated unsigned int m_maxElemPerVertex = 0; unsigned int m_nbVertices = 0; bool m_gpuDataUploaded = false; bool m_gpuRotationsUploaded = false; + bool m_gpuRotationMethodSupported = false; }; } // namespace sofa::component::solidmechanics::fem::elastic diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index 6cd70af4c7c..e6742c60b2c 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -36,6 +36,7 @@ void CudaElementCorotationalFEMForceField::init() if (!this->isComponentStateInvalid()) { uploadStiffnessAndConnectivity(); + uploadInitialRotationsTransposed(); } } @@ -173,6 +174,36 @@ void CudaElementCorotationalFEMForceField::uploadRotatio m_gpuRotationsUploaded = true; } +template +void CudaElementCorotationalFEMForceField::uploadInitialRotationsTransposed() +{ + using trait = sofa::component::solidmechanics::fem::elastic::trait; + constexpr auto dim = trait::spatial_dimensions; + constexpr auto nNodes = trait::NumberOfNodesInElement; + + const auto& initRotT = this->m_initialRotationsTransposed; + const auto nbElem = initRotT.size(); + if (nbElem == 0) return; + + 
m_gpuInitialRotationsTransposed.resize(nbElem * dim * dim); + m_gpuRotations.resize(nbElem * dim * dim); + { + auto* dst = m_gpuInitialRotationsTransposed.hostWrite(); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& R = initRotT[e]; + for (unsigned int i = 0; i < dim; ++i) + for (unsigned int j = 0; j < dim; ++j) + dst[e * dim * dim + i * dim + j] = static_cast(R[i][j]); + } + } + + // Check if the rotation method is GPU-compatible + const auto rotationMethodKey = this->m_rotationMethods.d_rotationMethod.getValue().key(); + m_gpuRotationMethodSupported = (nNodes >= 3) + && (rotationMethodKey == "triangle" || rotationMethodKey == "hexahedron"); +} + template void CudaElementCorotationalFEMForceField::addForce( const sofa::core::MechanicalParams* mparams, @@ -196,34 +227,54 @@ void CudaElementCorotationalFEMForceField::addForce( auto restPositionAccessor = this->mstate->readRestPositions(); const VecCoord& x0 = restPositionAccessor.ref(); - // Compute rotations on CPU (polar decomposition cannot run on GPU) - this->computeRotations(this->m_rotations, x, x0); - - // Upload rotations to GPU - uploadRotations(); + const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); + const auto nbElem = static_cast(elements.size()); + const auto nbVertex = static_cast(x.size()); - // Run force computation on GPU VecDeriv& f = *d_f.beginEdit(); if (f.size() < x.size()) f.resize(x.size()); - const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); - const auto nbElem = static_cast(elements.size()); - const auto nbVertex = static_cast(x.size()); + if (m_gpuRotationMethodSupported) + { + // Fully GPU path: compute rotations + forces in one kernel + gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( + nbElem, + nbVertex, + trait::NumberOfNodesInElement, + m_maxElemPerVertex, + m_gpuElements.deviceRead(), + m_gpuInitialRotationsTransposed.deviceRead(), + m_gpuStiffness.deviceRead(), + x.deviceRead(), 
+ x0.deviceRead(), + f.deviceWrite(), + m_gpuElementForce.deviceWrite(), + m_gpuRotations.deviceWrite(), + m_gpuVelems.deviceRead()); + + m_gpuRotationsUploaded = true; + } + else + { + // CPU rotations + GPU forces + this->computeRotations(this->m_rotations, x, x0); + uploadRotations(); - gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForce( - nbElem, - nbVertex, - trait::NumberOfNodesInElement, - m_maxElemPerVertex, - m_gpuElements.deviceRead(), - m_gpuRotations.deviceRead(), - m_gpuStiffness.deviceRead(), - x.deviceRead(), - x0.deviceRead(), - f.deviceWrite(), - m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead()); + gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForce( + nbElem, + nbVertex, + trait::NumberOfNodesInElement, + m_maxElemPerVertex, + m_gpuElements.deviceRead(), + m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), + x.deviceRead(), + x0.deviceRead(), + f.deviceWrite(), + m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + } d_f.endEdit(); } From 589d59193a2e5c2b5f5f461b91cd4c903a60969e Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Tue, 7 Apr 2026 15:11:25 +0900 Subject: [PATCH 10/21] try to fix with direct solver --- .../CudaElementCorotationalFEMForceField.h | 3 ++ .../CudaElementCorotationalFEMForceField.inl | 31 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h index e25f9a7b485..67c619768b5 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h @@ -127,6 +127,8 @@ class CudaElementCorotationalFEMForceField 
sofa::DataVecDeriv_t& df, const sofa::DataVecDeriv_t& dx) override; + void buildStiffnessMatrix(sofa::core::behavior::StiffnessMatrix* matrix) override; + protected: CudaElementCorotationalFEMForceField() = default; @@ -134,6 +136,7 @@ class CudaElementCorotationalFEMForceField void uploadStiffnessAndConnectivity(); void uploadRotations(); void uploadInitialRotationsTransposed(); + void downloadRotations(); gpu::cuda::CudaVector m_gpuStiffness; ///< Symmetric block-format stiffness per element gpu::cuda::CudaVector m_gpuRotations; ///< Flat 3x3 rotation matrices per element diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index e6742c60b2c..7d7f32964c3 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -204,6 +204,37 @@ void CudaElementCorotationalFEMForceField::uploadInitial && (rotationMethodKey == "triangle" || rotationMethodKey == "hexahedron"); } +template +void CudaElementCorotationalFEMForceField::downloadRotations() +{ + using trait = sofa::component::solidmechanics::fem::elastic::trait; + constexpr auto dim = trait::spatial_dimensions; + + if (!m_gpuRotationsUploaded) return; + + const auto nbElem = m_gpuRotations.size() / (dim * dim); + this->m_rotations.resize(nbElem); + + const auto* src = m_gpuRotations.hostRead(); + for (std::size_t e = 0; e < nbElem; ++e) + { + auto& R = this->m_rotations[e]; + for (unsigned int i = 0; i < dim; ++i) + for (unsigned int j = 0; j < dim; ++j) + R[i][j] = static_cast(src[e * dim * dim + i * dim + j]); + } +} + +template +void 
CudaElementCorotationalFEMForceField::buildStiffnessMatrix( + sofa::core::behavior::StiffnessMatrix* matrix) +{ + if (m_gpuRotationMethodSupported && m_gpuRotationsUploaded) + downloadRotations(); + + ElementCorotationalFEMForceField::buildStiffnessMatrix(matrix); +} + template void CudaElementCorotationalFEMForceField::addForce( const sofa::core::MechanicalParams* mparams, From cf4ec939ebaa2d264b13548eeecae5f490c0782d Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Tue, 7 Apr 2026 15:31:39 +0900 Subject: [PATCH 11/21] fix corot --- .../CudaElementCorotationalFEMForceField.cu | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index fd31eeab487..6de99fe5794 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -68,6 +68,25 @@ __device__ void mat3MulTranspose(const float* A, const float* BT, float* C) } } +/** + * Device helper: C = A^T * B (row-major) + * Matches SOFA's Mat::multTranspose(B) which computes this^T * B. + */ +__device__ void mat3TransposeMul(const float* A, const float* B, float* C) +{ + #pragma unroll + for (int i = 0; i < 3; ++i) + { + #pragma unroll + for (int j = 0; j < 3; ++j) + { + C[i * 3 + j] = A[0 * 3 + i] * B[0 * 3 + j] + + A[1 * 3 + i] * B[1 * 3 + j] + + A[2 * 3 + i] * B[2 * 3 + j]; + } + } +} + /** * Device helper: compute rotation frame from first 3 nodes (TriangleRotation). * Used for Triangle (NNodes=3) and Tetrahedron (NNodes=4) elements. 
@@ -250,11 +269,11 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_ else computeTriangleFrame(ex, frame); - // R = frame * initRotTransposed^T (i.e. frame.multTranspose(initRotTransposed)) - // Since initRotTransposed is already the transpose, R = frame * initRotTransposed^T + // R = frame^T * initRot (matching SOFA's Mat::multTranspose which computes A^T * B) + // m_initialRotationsTransposed stores frame_rest (despite its name, it's transposed during init) const float* irt = initRotTransposed + elemId * 9; float R[9]; - mat3MulTranspose(frame, irt, R); + mat3TransposeMul(frame, irt, R); // Write R to rotations buffer for addDForce float* Rout = rotationsOut + elemId * 9; From aa95ad3b3fec60fda44ff4978faeaa047a955215 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Wed, 8 Apr 2026 08:12:39 +0900 Subject: [PATCH 12/21] update examples --- .../CudaElementCorotationalFEMForceField.scn | 73 ++++++++++++++++++- ...aElementLinearSmallStrainFEMForceField.scn | 68 ++++++++++++++++- 2 files changed, 137 insertions(+), 4 deletions(-) diff --git a/applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn b/applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn index a75e2058ff5..7ecf5da0d41 100644 --- a/applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn +++ b/applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn @@ -12,13 +12,14 @@ - + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn b/applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn index c59fb6a6c2a..39a5017048d 100644 --- a/applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn +++ b/applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn @@ -14,7 +14,7 @@ - + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + From a9c0a0c39a2774e26bffef71cac815e9e22e9767 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Wed, 8 Apr 2026 11:02:14 +0900 Subject: [PATCH 13/21] organize examples and add cpu-gpu comparison --- ...aElementCorotationalFEMForceField_hexa.scn | 37 ++++++++ ...aElementCorotationalFEMForceField_quad.scn | 36 +++++++ ...ElementCorotationalFEMForceField_tetra.scn | 47 +++++++++ ...mentCorotationalFEMForceField_triangle.scn | 35 +++++++ ...entLinearSmallStrainFEMForceField_hexa.scn | 36 +++++++ ...entLinearSmallStrainFEMForceField_quad.scn | 35 +++++++ ...ntLinearSmallStrainFEMForceField_tetra.scn | 46 +++++++++ ...inearSmallStrainFEMForceField_triangle.scn | 35 +++++++ ...orotationalFEMForceField_tetra_cpu_gpu.scn | 95 +++++++++++++++++++ ...tionalFEMForceField_tetra_cpu_gpu.scn.view | 17 ++++ 10 files changed, 419 insertions(+) create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_hexa.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_quad.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_tetra.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_triangle.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_hexa.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_quad.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_tetra.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_triangle.scn create mode 100644 
applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn.view diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_hexa.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_hexa.scn new file mode 100644 index 00000000000..a08c65e0024 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_hexa.scn @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_quad.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_quad.scn new file mode 100644 index 00000000000..047a96624f9 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_quad.scn @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_tetra.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_tetra.scn new file mode 100644 index 00000000000..57a39e63286 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_tetra.scn @@ -0,0 +1,47 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_triangle.scn 
b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_triangle.scn new file mode 100644 index 00000000000..0a47b7e393d --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_triangle.scn @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_hexa.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_hexa.scn new file mode 100644 index 00000000000..228e4a08943 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_hexa.scn @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_quad.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_quad.scn new file mode 100644 index 00000000000..14babc5d207 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_quad.scn @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_tetra.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_tetra.scn new file mode 100644 index 00000000000..eac98ff93ac --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_tetra.scn @@ -0,0 +1,46 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_triangle.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_triangle.scn new file mode 100644 index 00000000000..03c6955697c --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_triangle.scn @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn new file mode 100644 index 00000000000..6875f9a8849 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn @@ -0,0 +1,95 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn.view b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn.view new file mode 100644 index 00000000000..3bf10e74929 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn.view @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + From 63e201424bd8c22ab3fb81e69c94c0905709934b Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Wed, 8 Apr 2026 11:15:46 +0900 Subject: [PATCH 14/21] add double version (templates) --- .../CudaElementCorotationalFEMForceField.cpp | 27 + 
.../CudaElementCorotationalFEMForceField.cu | 490 +++++++++--------- .../CudaElementCorotationalFEMForceField.h | 55 +- .../CudaElementCorotationalFEMForceField.inl | 102 ++-- ...aElementLinearSmallStrainFEMForceField.cpp | 27 + ...daElementLinearSmallStrainFEMForceField.cu | 211 ++++---- ...udaElementLinearSmallStrainFEMForceField.h | 34 +- ...aElementLinearSmallStrainFEMForceField.inl | 64 ++- 8 files changed, 611 insertions(+), 399 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp index c77a51c13c2..55a46c00669 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp @@ -36,6 +36,12 @@ template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; + } // namespace sofa::component::solidmechanics::fem::elastic namespace sofa::gpu::cuda @@ -65,6 +71,27 @@ void registerElementCorotationalFEMForceField(sofa::core::ObjectFactory* factory "Supports GPU-side computations using CUDA for HexahedronCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() ); + + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports 
GPU-side computations using CUDA (double) for EdgeCorotationalFEMForceField") + .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for TriangleCorotationalFEMForceField") + .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for QuadCorotationalFEMForceField") + .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for TetrahedronCorotationalFEMForceField") + .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for HexahedronCorotationalFEMForceField") + .add< CudaElementCorotationalFEMForceField >() + ); } } // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index 6de99fe5794..afbbdb89532 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -32,10 +32,16 @@ namespace cuda { #endif +template +__device__ T myRsqrt(T x); +template<> __device__ float myRsqrt(float x) { return rsqrtf(x); } +template<> __device__ double myRsqrt(double x) { return rsqrt(x); } + /** * Device helper: 3x3 matrix multiply C = A * B (row-major) */ -__device__ void mat3Mul(const float* A, const float* B, float* C) +template +__device__ void mat3Mul(const T* A, 
const T* B, T* C) { #pragma unroll for (int i = 0; i < 3; ++i) @@ -53,7 +59,8 @@ __device__ void mat3Mul(const float* A, const float* B, float* C) /** * Device helper: C = A * B^T (row-major) */ -__device__ void mat3MulTranspose(const float* A, const float* BT, float* C) +template +__device__ void mat3MulTranspose(const T* A, const T* BT, T* C) { #pragma unroll for (int i = 0; i < 3; ++i) @@ -72,7 +79,8 @@ __device__ void mat3MulTranspose(const float* A, const float* BT, float* C) * Device helper: C = A^T * B (row-major) * Matches SOFA's Mat::multTranspose(B) which computes this^T * B. */ -__device__ void mat3TransposeMul(const float* A, const float* B, float* C) +template +__device__ void mat3TransposeMul(const T* A, const T* B, T* C) { #pragma unroll for (int i = 0; i < 3; ++i) @@ -89,32 +97,26 @@ __device__ void mat3TransposeMul(const float* A, const float* B, float* C) /** * Device helper: compute rotation frame from first 3 nodes (TriangleRotation). - * Used for Triangle (NNodes=3) and Tetrahedron (NNodes=4) elements. - * ex is [NNodes*3] array of gathered node positions. 
*/ -__device__ void computeTriangleFrame(const float* ex, float* frame) +template +__device__ void computeTriangleFrame(const T* ex, T* frame) { - // xAxis = normalize(p1 - p0) - float ax = ex[3] - ex[0], ay = ex[4] - ex[1], az = ex[5] - ex[2]; - float invLen = rsqrtf(ax * ax + ay * ay + az * az); + T ax = ex[3] - ex[0], ay = ex[4] - ex[1], az = ex[5] - ex[2]; + T invLen = myRsqrt(ax * ax + ay * ay + az * az); ax *= invLen; ay *= invLen; az *= invLen; - // tmp yAxis = p2 - p0 - float bx = ex[6] - ex[0], by = ex[7] - ex[1], bz = ex[8] - ex[2]; + T bx = ex[6] - ex[0], by = ex[7] - ex[1], bz = ex[8] - ex[2]; - // zAxis = normalize(cross(xAxis, tmpY)) - float cx = ay * bz - az * by; - float cy = az * bx - ax * bz; - float cz = ax * by - ay * bx; - invLen = rsqrtf(cx * cx + cy * cy + cz * cz); + T cx = ay * bz - az * by; + T cy = az * bx - ax * bz; + T cz = ax * by - ay * bx; + invLen = myRsqrt(cx * cx + cy * cy + cz * cz); cx *= invLen; cy *= invLen; cz *= invLen; - // yAxis = cross(zAxis, xAxis) bx = cy * az - cz * ay; by = cz * ax - cx * az; bz = cx * ay - cy * ax; - // frame rows: [xAxis; yAxis; zAxis] frame[0] = ax; frame[1] = ay; frame[2] = az; frame[3] = bx; frame[4] = by; frame[5] = bz; frame[6] = cx; frame[7] = cy; frame[8] = cz; @@ -122,44 +124,39 @@ __device__ void computeTriangleFrame(const float* ex, float* frame) /** * Device helper: compute rotation frame from 8 hexahedron nodes (HexahedronRotation). - * ex is [8*3] array of gathered node positions. 
*/ -__device__ void computeHexahedronFrame(const float* ex, float* frame) +template +__device__ void computeHexahedronFrame(const T* ex, T* frame) { - // Average edge vectors - // xAxis_avg = ((n1-n0) + (n2-n3) + (n5-n4) + (n6-n7)) * 0.25 - float ax = ((ex[1*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[3*3+0]) - + (ex[5*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[7*3+0])) * 0.25f; - float ay = ((ex[1*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[3*3+1]) - + (ex[5*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[7*3+1])) * 0.25f; - float az = ((ex[1*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[3*3+2]) - + (ex[5*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[7*3+2])) * 0.25f; - - // yAxis_avg = ((n3-n0) + (n2-n1) + (n7-n4) + (n6-n5)) * 0.25 - float bx = ((ex[3*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[1*3+0]) - + (ex[7*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[5*3+0])) * 0.25f; - float by = ((ex[3*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[1*3+1]) - + (ex[7*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[5*3+1])) * 0.25f; - float bz = ((ex[3*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[1*3+2]) - + (ex[7*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[5*3+2])) * 0.25f; - - // Normalize xAxis - float invLen = rsqrtf(ax * ax + ay * ay + az * az); + const T quarter = T(0.25); + + T ax = ((ex[1*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[3*3+0]) + + (ex[5*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[7*3+0])) * quarter; + T ay = ((ex[1*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[3*3+1]) + + (ex[5*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[7*3+1])) * quarter; + T az = ((ex[1*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[3*3+2]) + + (ex[5*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[7*3+2])) * quarter; + + T bx = ((ex[3*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[1*3+0]) + + (ex[7*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[5*3+0])) * quarter; + T by = ((ex[3*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[1*3+1]) + + (ex[7*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[5*3+1])) * quarter; + T bz = ((ex[3*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[1*3+2]) + + (ex[7*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[5*3+2])) * quarter; + + T invLen = myRsqrt(ax * ax + ay * ay + az * az); ax *= 
invLen; ay *= invLen; az *= invLen; - // zAxis = normalize(cross(xAxis, yAxis_avg)) - float cx = ay * bz - az * by; - float cy = az * bx - ax * bz; - float cz = ax * by - ay * bx; - invLen = rsqrtf(cx * cx + cy * cy + cz * cz); + T cx = ay * bz - az * by; + T cy = az * bx - ax * bz; + T cz = ax * by - ay * bx; + invLen = myRsqrt(cx * cx + cy * cy + cz * cz); cx *= invLen; cy *= invLen; cz *= invLen; - // yAxis = cross(zAxis, xAxis) bx = cy * az - cz * ay; by = cz * ax - cx * az; bz = cx * ay - cy * ax; - // frame rows: [xAxis; yAxis; zAxis] frame[0] = ax; frame[1] = ay; frame[2] = az; frame[3] = bx; frame[4] = by; frame[5] = bz; frame[6] = cx; frame[7] = cy; frame[8] = cz; @@ -167,54 +164,48 @@ __device__ void computeHexahedronFrame(const float* ex, float* frame) /** * Symmetric block-matrix multiply: out = K * in - * K stored as upper triangle: NSymBlocks = NNodes*(NNodes+1)/2 blocks of 9 floats. - * Inline device function shared by addForce, addDForce, and combined kernels. */ -template -__device__ void symBlockMatMul(const float* K, const float* in, float* out) +template +__device__ void symBlockMatMul(const T* K, const T* in, T* out) { #pragma unroll for (int i = 0; i < NNodes * 3; ++i) - out[i] = 0.0f; + out[i] = T(0); #pragma unroll for (int ni = 0; ni < NNodes; ++ni) { const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; - // Diagonal block { - const float* Kii = K + diagIdx * 9; - const float i0 = in[ni * 3 + 0]; - const float i1 = in[ni * 3 + 1]; - const float i2 = in[ni * 3 + 2]; + const T* Kii = K + diagIdx * 9; + const T i0 = in[ni * 3 + 0]; + const T i1 = in[ni * 3 + 1]; + const T i2 = in[ni * 3 + 2]; out[ni * 3 + 0] += Kii[0] * i0 + Kii[1] * i1 + Kii[2] * i2; out[ni * 3 + 1] += Kii[3] * i0 + Kii[4] * i1 + Kii[5] * i2; out[ni * 3 + 2] += Kii[6] * i0 + Kii[7] * i1 + Kii[8] * i2; } - // Off-diagonal blocks #pragma unroll for (int nj = ni + 1; nj < NNodes; ++nj) { const int symIdx = diagIdx + (nj - ni); - const float* Kij = K + symIdx * 9; + const T* Kij 
= K + symIdx * 9; - // Forward: out[ni] += Kij * in[nj] { - const float j0 = in[nj * 3 + 0]; - const float j1 = in[nj * 3 + 1]; - const float j2 = in[nj * 3 + 2]; + const T j0 = in[nj * 3 + 0]; + const T j1 = in[nj * 3 + 1]; + const T j2 = in[nj * 3 + 2]; out[ni * 3 + 0] += Kij[0] * j0 + Kij[1] * j1 + Kij[2] * j2; out[ni * 3 + 1] += Kij[3] * j0 + Kij[4] * j1 + Kij[5] * j2; out[ni * 3 + 2] += Kij[6] * j0 + Kij[7] * j1 + Kij[8] * j2; } - // Symmetric: out[nj] += Kij^T * in[ni] { - const float i0 = in[ni * 3 + 0]; - const float i1 = in[ni * 3 + 1]; - const float i2 = in[ni * 3 + 2]; + const T i0 = in[ni * 3 + 0]; + const T i1 = in[ni * 3 + 1]; + const T i2 = in[ni * 3 + 2]; out[nj * 3 + 0] += Kij[0] * i0 + Kij[3] * i1 + Kij[6] * i2; out[nj * 3 + 1] += Kij[1] * i0 + Kij[4] * i1 + Kij[7] * i2; out[nj * 3 + 2] += Kij[2] * i0 + Kij[5] * i1 + Kij[8] * i2; @@ -225,31 +216,25 @@ __device__ void symBlockMatMul(const float* K, const float* in, float* out) /** * Combined kernel: compute rotations AND per-element forces in one pass. - * - * Uses TriangleRotation for NNodes=3,4 and HexahedronRotation for NNodes=8. - * Computes: frame from node positions → R = frame * initRotTransposed - * Then: displacement = R^T*(x-centroid) - (x0-centroid0) → K*disp → -R*result - * Also writes R to rotations buffer for subsequent addDForce calls. 
*/ -template -__global__ void ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_kernel( +template +__global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel( int nbElem, const int* __restrict__ elements, - const float* __restrict__ initRotTransposed, - const float* __restrict__ stiffness, - const float* __restrict__ x, - const float* __restrict__ x0, - float* __restrict__ rotationsOut, - float* __restrict__ eforce) + const T* __restrict__ initRotTransposed, + const T* __restrict__ stiffness, + const T* __restrict__ x, + const T* __restrict__ x0, + T* __restrict__ rotationsOut, + T* __restrict__ eforce) { constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; - constexpr float invN = 1.0f / NNodes; + const T invN = T(1) / T(NNodes); const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - // Gather node positions and rest positions - float ex[NNodes * 3], ex0[NNodes * 3]; + T ex[NNodes * 3], ex0[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -262,28 +247,24 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_ ex0[n * 3 + 2] = x0[nodeId * 3 + 2]; } - // Compute rotation frame from current positions - float frame[9]; + T frame[9]; if constexpr (NNodes == 8) computeHexahedronFrame(ex, frame); else computeTriangleFrame(ex, frame); - // R = frame^T * initRot (matching SOFA's Mat::multTranspose which computes A^T * B) - // m_initialRotationsTransposed stores frame_rest (despite its name, it's transposed during init) - const float* irt = initRotTransposed + elemId * 9; - float R[9]; + // R = frame^T * initRot + const T* irt = initRotTransposed + elemId * 9; + T R[9]; mat3TransposeMul(frame, irt, R); - // Write R to rotations buffer for addDForce - float* Rout = rotationsOut + elemId * 9; + T* Rout = rotationsOut + elemId * 9; #pragma unroll for (int i = 0; i < 9; ++i) Rout[i] = R[i]; - // Compute centroids - float cx = 0.0f, cy = 0.0f, cz = 0.0f; - float 
cx0 = 0.0f, cy0 = 0.0f, cz0 = 0.0f; + T cx = T(0), cy = T(0), cz = T(0); + T cx0 = T(0), cy0 = T(0), cz0 = T(0); #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -293,35 +274,32 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_ cx *= invN; cy *= invN; cz *= invN; cx0 *= invN; cy0 *= invN; cz0 *= invN; - // Compute displacement: disp[j] = R^T * (x[j] - centroid) - (x0[j] - centroid0) - float disp[NNodes * 3]; + T disp[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const float dx = ex[n * 3 + 0] - cx; - const float dy = ex[n * 3 + 1] - cy; - const float dz = ex[n * 3 + 2] - cz; - const float rx = R[0] * dx + R[3] * dy + R[6] * dz; - const float ry = R[1] * dx + R[4] * dy + R[7] * dz; - const float rz = R[2] * dx + R[5] * dy + R[8] * dz; + const T dx = ex[n * 3 + 0] - cx; + const T dy = ex[n * 3 + 1] - cy; + const T dz = ex[n * 3 + 2] - cz; + const T rx = R[0] * dx + R[3] * dy + R[6] * dz; + const T ry = R[1] * dx + R[4] * dy + R[7] * dz; + const T rz = R[2] * dx + R[5] * dy + R[8] * dz; disp[n * 3 + 0] = rx - (ex0[n * 3 + 0] - cx0); disp[n * 3 + 1] = ry - (ex0[n * 3 + 1] - cy0); disp[n * 3 + 2] = rz - (ex0[n * 3 + 2] - cz0); } - // edf = K * disp - float edf[NNodes * 3]; - const float* K = stiffness + elemId * NSymBlocks * 9; - symBlockMatMul(K, disp, edf); + T edf[NNodes * 3]; + const T* K = stiffness + elemId * NSymBlocks * 9; + symBlockMatMul(K, disp, edf); - // Rotate back and write: out = -R * edf - float* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * 3; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const float e0 = edf[n * 3 + 0]; - const float e1 = edf[n * 3 + 1]; - const float e2 = edf[n * 3 + 2]; + const T e0 = edf[n * 3 + 0]; + const T e1 = edf[n * 3 + 1]; + const T e2 = edf[n * 3 + 2]; out[n * 3 + 0] = -(R[0] * e0 + R[1] * e1 + R[2] * e2); out[n * 3 + 1] = -(R[3] * e0 + R[4] * e1 + R[5] * e2); out[n * 3 + 2] = -(R[6] * e0 + R[7] * e1 + R[8] * e2); @@ -330,36 +308,30 
@@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_ /** * Kernel for addForce: Compute per-element force (1 thread per element). - * - * displacement[j] = R^T * (x[j] - centroid_x) - (x0[j] - centroid_x0) - * elementForce = K * displacement - * out[j] = -R * elementForce[j] */ -template -__global__ void ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel( +template +__global__ void ElementCorotationalFEMForceField_computeForce_kernel( int nbElem, const int* __restrict__ elements, - const float* __restrict__ rotations, - const float* __restrict__ stiffness, - const float* __restrict__ x, - const float* __restrict__ x0, - float* __restrict__ eforce) + const T* __restrict__ rotations, + const T* __restrict__ stiffness, + const T* __restrict__ x, + const T* __restrict__ x0, + T* __restrict__ eforce) { constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; - constexpr float invN = 1.0f / NNodes; + const T invN = T(1) / T(NNodes); const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - // Load rotation matrix R (3x3, row-major) - const float* Rptr = rotations + elemId * 9; - float R[9]; + const T* Rptr = rotations + elemId * 9; + T R[9]; #pragma unroll for (int i = 0; i < 9; ++i) R[i] = Rptr[i]; - // Gather node positions and rest positions - float ex[NNodes * 3], ex0[NNodes * 3]; + T ex[NNodes * 3], ex0[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -372,9 +344,8 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel( ex0[n * 3 + 2] = x0[nodeId * 3 + 2]; } - // Compute centroids - float cx = 0.0f, cy = 0.0f, cz = 0.0f; - float cx0 = 0.0f, cy0 = 0.0f, cz0 = 0.0f; + T cx = T(0), cy = T(0), cz = T(0); + T cx0 = T(0), cy0 = T(0), cz0 = T(0); #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -384,38 +355,32 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel( cx *= invN; cy *= invN; cz *= invN; cx0 *= invN; cy0 *= invN; cz0 *= invN; - 
// Compute displacement: disp[j] = R^T * (x[j] - centroid) - (x0[j] - centroid0) - float disp[NNodes * 3]; + T disp[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const float dx = ex[n * 3 + 0] - cx; - const float dy = ex[n * 3 + 1] - cy; - const float dz = ex[n * 3 + 2] - cz; - - // R^T * (x - centroid) - const float rx = R[0] * dx + R[3] * dy + R[6] * dz; - const float ry = R[1] * dx + R[4] * dy + R[7] * dz; - const float rz = R[2] * dx + R[5] * dy + R[8] * dz; - + const T dx = ex[n * 3 + 0] - cx; + const T dy = ex[n * 3 + 1] - cy; + const T dz = ex[n * 3 + 2] - cz; + const T rx = R[0] * dx + R[3] * dy + R[6] * dz; + const T ry = R[1] * dx + R[4] * dy + R[7] * dz; + const T rz = R[2] * dx + R[5] * dy + R[8] * dz; disp[n * 3 + 0] = rx - (ex0[n * 3 + 0] - cx0); disp[n * 3 + 1] = ry - (ex0[n * 3 + 1] - cy0); disp[n * 3 + 2] = rz - (ex0[n * 3 + 2] - cz0); } - // edf = K * disp - float edf[NNodes * 3]; - const float* K = stiffness + elemId * NSymBlocks * 9; - symBlockMatMul(K, disp, edf); + T edf[NNodes * 3]; + const T* K = stiffness + elemId * NSymBlocks * 9; + symBlockMatMul(K, disp, edf); - // Rotate back and write: out = -R * edf - float* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * 3; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const float e0 = edf[n * 3 + 0]; - const float e1 = edf[n * 3 + 1]; - const float e2 = edf[n * 3 + 2]; + const T e0 = edf[n * 3 + 0]; + const T e1 = edf[n * 3 + 1]; + const T e2 = edf[n * 3 + 2]; out[n * 3 + 0] = -(R[0] * e0 + R[1] * e1 + R[2] * e2); out[n * 3 + 1] = -(R[3] * e0 + R[4] * e1 + R[5] * e2); out[n * 3 + 2] = -(R[6] * e0 + R[7] * e1 + R[8] * e2); @@ -424,59 +389,52 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel( /** * Kernel for addDForce: Compute per-element dForce (1 thread per element). 
- * - * rdx = R^T * dx, edf = K * rdx, out = -kFactor * R * edf */ -template -__global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( +template +__global__ void ElementCorotationalFEMForceField_computeDForce_kernel( int nbElem, const int* __restrict__ elements, - const float* __restrict__ rotations, - const float* __restrict__ stiffness, - const float* __restrict__ dx, - float* __restrict__ eforce, - float kFactor) + const T* __restrict__ rotations, + const T* __restrict__ stiffness, + const T* __restrict__ dx, + T* __restrict__ eforce, + T kFactor) { constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - // Load rotation matrix R (3x3, row-major) - const float* Rptr = rotations + elemId * 9; - float R[9]; + const T* Rptr = rotations + elemId * 9; + T R[9]; #pragma unroll for (int i = 0; i < 9; ++i) R[i] = Rptr[i]; - // Gather dx and rotate into reference frame: rdx[n] = R^T * dx[node[n]] - float rdx[NNodes * 3]; + T rdx[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { const int nodeId = elements[n * nbElem + elemId]; - const float dx_x = dx[nodeId * 3 + 0]; - const float dx_y = dx[nodeId * 3 + 1]; - const float dx_z = dx[nodeId * 3 + 2]; - + const T dx_x = dx[nodeId * 3 + 0]; + const T dx_y = dx[nodeId * 3 + 1]; + const T dx_z = dx[nodeId * 3 + 2]; rdx[n * 3 + 0] = R[0] * dx_x + R[3] * dx_y + R[6] * dx_z; rdx[n * 3 + 1] = R[1] * dx_x + R[4] * dx_y + R[7] * dx_z; rdx[n * 3 + 2] = R[2] * dx_x + R[5] * dx_y + R[8] * dx_z; } - // Symmetric block-matrix multiply: edf = K * rdx - const float* K = stiffness + elemId * NSymBlocks * 9; - float edf[NNodes * 3]; - symBlockMatMul(K, rdx, edf); + const T* K = stiffness + elemId * NSymBlocks * 9; + T edf[NNodes * 3]; + symBlockMatMul(K, rdx, edf); - // Rotate back and write: eforce = -kFactor * R * edf - float* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * 3; #pragma unroll for 
(int n = 0; n < NNodes; ++n) { - const float e0 = edf[n * 3 + 0]; - const float e1 = edf[n * 3 + 1]; - const float e2 = edf[n * 3 + 2]; + const T e0 = edf[n * 3 + 0]; + const T e1 = edf[n * 3 + 1]; + const T e2 = edf[n * 3 + 2]; out[n * 3 + 0] = -kFactor * (R[0] * e0 + R[1] * e1 + R[2] * e2); out[n * 3 + 1] = -kFactor * (R[3] * e0 + R[4] * e1 + R[5] * e2); out[n * 3 + 2] = -kFactor * (R[6] * e0 + R[7] * e1 + R[8] * e2); @@ -485,23 +443,19 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( /** * Gather per-vertex forces (1 thread per vertex). - * - * Shared by addForce and addDForce. - * No atomics: each vertex handled by exactly one thread. - * velems is SoA: velems[s * nbVertex + vertexId], 0-terminated. - * Each entry is (elemId * NNodes + localNode + 1), with 0 as sentinel. */ -__global__ void ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel( +template +__global__ void ElementCorotationalFEMForceField_gatherForce_kernel( int nbVertex, int maxElemPerVertex, const int* __restrict__ velems, - const float* __restrict__ eforce, - float* df) + const T* __restrict__ eforce, + T* df) { const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; if (vertexId >= nbVertex) return; - float fx = 0.0f, fy = 0.0f, fz = 0.0f; + T fx = T(0), fy = T(0), fz = T(0); for (int s = 0; s < maxElemPerVertex; ++s) { @@ -518,6 +472,7 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel( df[vertexId * 3 + 2] += fz; } +template static void launchGather( unsigned int nbVertex, unsigned int maxElemPerVertex, @@ -527,17 +482,17 @@ static void launchGather( { const int gatherThreads = 256; const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel + ElementCorotationalFEMForceField_gatherForce_kernel <<>>( nbVertex, maxElemPerVertex, (const int*)velems, - (const float*)eforce, - (float*)f); - 
mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel"); + (const T*)eforce, + (T*)f); + mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); } -template +template static void launchAddForceWithRotations( unsigned int nbElem, unsigned int nbVertex, @@ -554,22 +509,22 @@ static void launchAddForceWithRotations( { const int computeThreads = 64; const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_kernel + ElementCorotationalFEMForceField_computeRotationsAndForce_kernel <<>>( nbElem, (const int*)elements, - (const float*)initRotTransposed, - (const float*)stiffness, - (const float*)x, - (const float*)x0, - (float*)rotationsOut, - (float*)eforce); - mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_kernel"); - - launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); + (const T*)initRotTransposed, + (const T*)stiffness, + (const T*)x, + (const T*)x0, + (T*)rotationsOut, + (T*)eforce); + mycudaDebugError("ElementCorotationalFEMForceField_computeRotationsAndForce_kernel"); + + launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); } -template +template static void launchAddForce( unsigned int nbElem, unsigned int nbVertex, @@ -585,21 +540,21 @@ static void launchAddForce( { const int computeThreads = 64; const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel + ElementCorotationalFEMForceField_computeForce_kernel <<>>( nbElem, (const int*)elements, - (const float*)rotations, - (const float*)stiffness, - (const float*)x, - (const float*)x0, - (float*)eforce); - mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel"); - - launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); + (const T*)rotations, + (const T*)stiffness, + (const T*)x, + (const T*)x0, + (T*)eforce); + 
mycudaDebugError("ElementCorotationalFEMForceField_computeForce_kernel"); + + launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); } -template +template static void launchAddDForce( unsigned int nbElem, unsigned int nbVertex, @@ -611,27 +566,29 @@ static void launchAddDForce( void* df, void* eforce, const void* velems, - float kFactor) + T kFactor) { const int computeThreads = 64; const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel + ElementCorotationalFEMForceField_computeDForce_kernel <<>>( nbElem, (const int*)elements, - (const float*)rotations, - (const float*)stiffness, - (const float*)dx, - (float*)eforce, + (const T*)rotations, + (const T*)stiffness, + (const T*)dx, + (T*)eforce, kFactor); - mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel"); + mycudaDebugError("ElementCorotationalFEMForceField_computeDForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); + launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); } extern "C" { +// ==================== float versions ==================== + void ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( unsigned int nbElem, unsigned int nbVertex, @@ -649,9 +606,9 @@ void ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( { switch (nbNodesPerElem) { - case 3: launchAddForceWithRotations<3>(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - case 4: launchAddForceWithRotations<4>(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - case 8: launchAddForceWithRotations<8>(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; + case 3: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, 
x0, f, eforce, rotationsOut, velems); break; + case 4: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; + case 8: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; } } @@ -671,10 +628,10 @@ void ElementCorotationalFEMForceFieldCuda3f_addForce( { switch (nbNodesPerElem) { - case 2: launchAddForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 3: launchAddForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 4: launchAddForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 8: launchAddForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 4: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; } } @@ -694,10 +651,81 @@ void ElementCorotationalFEMForceFieldCuda3f_addDForce( { switch (nbNodesPerElem) { - case 2: launchAddDForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 3: launchAddDForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 4: launchAddDForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, 
eforce, velems, kFactor); break; - case 8: launchAddDForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + } +} + +// ==================== double versions ==================== + +void ElementCorotationalFEMForceFieldCuda3d_addForceWithRotations( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* initRotTransposed, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + void* rotationsOut, + const void* velems) +{ + switch (nbNodesPerElem) + { + case 3: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; + case 4: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; + case 8: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; + } +} + +void ElementCorotationalFEMForceFieldCuda3d_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* 
velems) +{ + switch (nbNodesPerElem) + { + case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 4: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + } +} + +void ElementCorotationalFEMForceFieldCuda3d_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + double kFactor) +{ + switch (nbNodesPerElem) + { + case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; } } diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h index 67c619768b5..7ec167dcae3 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h +++ 
b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h @@ -71,6 +71,49 @@ extern "C" void* eforce, const void* velems, float kFactor); + + void ElementCorotationalFEMForceFieldCuda3d_addForceWithRotations( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* initRotTransposed, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + void* rotationsOut, + const void* velems); + + void ElementCorotationalFEMForceFieldCuda3d_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems); + + void ElementCorotationalFEMForceFieldCuda3d_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + double kFactor); } } // namespace sofa::gpu::cuda @@ -138,12 +181,12 @@ class CudaElementCorotationalFEMForceField void uploadInitialRotationsTransposed(); void downloadRotations(); - gpu::cuda::CudaVector m_gpuStiffness; ///< Symmetric block-format stiffness per element - gpu::cuda::CudaVector m_gpuRotations; ///< Flat 3x3 rotation matrices per element - gpu::cuda::CudaVector m_gpuInitialRotationsTransposed; ///< Flat 3x3 initial rotation transposed per element - gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] - gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer - gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated + gpu::cuda::CudaVector 
m_gpuStiffness; ///< Symmetric block-format stiffness per element + gpu::cuda::CudaVector m_gpuRotations; ///< Flat 3x3 rotation matrices per element + gpu::cuda::CudaVector m_gpuInitialRotationsTransposed; ///< Flat 3x3 initial rotation transposed per element + gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] + gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer + gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated unsigned int m_maxElemPerVertex = 0; unsigned int m_nbVertices = 0; diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index 7d7f32964c3..effb420ab61 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -90,7 +90,7 @@ void CudaElementCorotationalFEMForceField::uploadStiffne dst[e * nSymBlocks * dim * dim + symIdx * dim * dim + di * dim + dj] - = static_cast(K[ni * dim + di][nj * dim + dj]); + = static_cast(K[ni * dim + di][nj * dim + dj]); } } } @@ -167,7 +167,7 @@ void CudaElementCorotationalFEMForceField::uploadRotatio const auto& R = rotations[e]; for (unsigned int i = 0; i < dim; ++i) for (unsigned int j = 0; j < dim; ++j) - dst[e * dim * dim + i * dim + j] = static_cast(R[i][j]); + dst[e * dim * dim + i * dim + j] = static_cast(R[i][j]); } } @@ -194,7 +194,7 @@ void CudaElementCorotationalFEMForceField::uploadInitial const auto& R = initRotT[e]; for (unsigned int i = 0; i < dim; ++i) for (unsigned int j = 0; j < dim; ++j) - dst[e * dim * dim + i * dim + j] = static_cast(R[i][j]); + 
dst[e * dim * dim + i * dim + j] = static_cast(R[i][j]); } } @@ -269,20 +269,24 @@ void CudaElementCorotationalFEMForceField::addForce( if (m_gpuRotationMethodSupported) { // Fully GPU path: compute rotations + forces in one kernel - gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( - nbElem, - nbVertex, - trait::NumberOfNodesInElement, - m_maxElemPerVertex, - m_gpuElements.deviceRead(), - m_gpuInitialRotationsTransposed.deviceRead(), - m_gpuStiffness.deviceRead(), - x.deviceRead(), - x0.deviceRead(), - f.deviceWrite(), - m_gpuElementForce.deviceWrite(), - m_gpuRotations.deviceWrite(), - m_gpuVelems.deviceRead()); + if constexpr (std::is_same_v) + { + gpu::cuda::ElementCorotationalFEMForceFieldCuda3d_addForceWithRotations( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuInitialRotationsTransposed.deviceRead(), + m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuRotations.deviceWrite(), m_gpuVelems.deviceRead()); + } + else + { + gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuInitialRotationsTransposed.deviceRead(), + m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuRotations.deviceWrite(), m_gpuVelems.deviceRead()); + } m_gpuRotationsUploaded = true; } @@ -292,19 +296,24 @@ void CudaElementCorotationalFEMForceField::addForce( this->computeRotations(this->m_rotations, x, x0); uploadRotations(); - gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForce( - nbElem, - nbVertex, - trait::NumberOfNodesInElement, - m_maxElemPerVertex, - m_gpuElements.deviceRead(), - m_gpuRotations.deviceRead(), - m_gpuStiffness.deviceRead(), - x.deviceRead(), - x0.deviceRead(), - f.deviceWrite(), - m_gpuElementForce.deviceWrite(), - 
m_gpuVelems.deviceRead()); + if constexpr (std::is_same_v) + { + gpu::cuda::ElementCorotationalFEMForceFieldCuda3d_addForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + } + else + { + gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + } } d_f.endEdit(); @@ -334,7 +343,7 @@ void CudaElementCorotationalFEMForceField::addDForce( if (df.size() < dx.size()) df.resize(dx.size()); - const auto kFactor = static_cast( + const auto kFactor = static_cast( sofa::core::mechanicalparams::kFactorIncludingRayleighDamping( mparams, this->rayleighStiffness.getValue())); @@ -342,19 +351,24 @@ void CudaElementCorotationalFEMForceField::addDForce( const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(dx.size()); - gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addDForce( - nbElem, - nbVertex, - trait::NumberOfNodesInElement, - m_maxElemPerVertex, - m_gpuElements.deviceRead(), - m_gpuRotations.deviceRead(), - m_gpuStiffness.deviceRead(), - dx.deviceRead(), - df.deviceWrite(), - m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead(), - kFactor); + if constexpr (std::is_same_v) + { + gpu::cuda::ElementCorotationalFEMForceFieldCuda3d_addDForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), dx.deviceRead(), + df.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead(), kFactor); + } + else + { + 
gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addDForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), dx.deviceRead(), + df.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead(), kFactor); + } d_df.endEdit(); } diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp index af802d29e95..b46f90d06d4 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp @@ -36,6 +36,12 @@ template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField< template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; + } // namespace sofa::component::solidmechanics::fem::elastic namespace sofa::gpu::cuda @@ -65,6 +71,27 @@ void registerElementLinearSmallStrainFEMForceField(sofa::core::ObjectFactory* fa "Supports GPU-side computations using CUDA for HexahedronLinearSmallStrainFEMForceField") .add< CudaElementLinearSmallStrainFEMForceField >() ); + + 
factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for EdgeLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for TriangleLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for QuadLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for TetrahedronLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for HexahedronLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); } } // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu index 39c67a27db4..6752bd29af1 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu @@ -37,16 +37,16 @@ namespace cuda * * f = -K * (x - x0) * Templated on NNodes (compile-time) for full loop unrolling. - * Hardcoded Dim=3 (CudaVec3f only). + * Templated on T for float/double support. 
*/ -template -__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( +template +__global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( int nbElem, const int* __restrict__ elements, - const float* __restrict__ stiffness, - const float* __restrict__ x, - const float* __restrict__ x0, - float* __restrict__ eforce) + const T* __restrict__ stiffness, + const T* __restrict__ x, + const T* __restrict__ x0, + T* __restrict__ eforce) { constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; @@ -54,7 +54,7 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( if (elemId >= nbElem) return; // Gather displacement = x - x0 for this element's nodes - float disp[NNodes * 3]; + T disp[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -65,12 +65,12 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( } // Symmetric block-matrix multiply: edf = K * disp - const float* K = stiffness + elemId * NSymBlocks * 9; - float edf[NNodes * 3]; + const T* K = stiffness + elemId * NSymBlocks * 9; + T edf[NNodes * 3]; #pragma unroll for (int i = 0; i < NNodes * 3; ++i) - edf[i] = 0.0f; + edf[i] = T(0); #pragma unroll for (int ni = 0; ni < NNodes; ++ni) @@ -79,10 +79,10 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( // Diagonal block { - const float* Kii = K + diagIdx * 9; - const float di0 = disp[ni * 3 + 0]; - const float di1 = disp[ni * 3 + 1]; - const float di2 = disp[ni * 3 + 2]; + const T* Kii = K + diagIdx * 9; + const T di0 = disp[ni * 3 + 0]; + const T di1 = disp[ni * 3 + 1]; + const T di2 = disp[ni * 3 + 2]; edf[ni * 3 + 0] += Kii[0] * di0 + Kii[1] * di1 + Kii[2] * di2; edf[ni * 3 + 1] += Kii[3] * di0 + Kii[4] * di1 + Kii[5] * di2; edf[ni * 3 + 2] += Kii[6] * di0 + Kii[7] * di1 + Kii[8] * di2; @@ -93,21 +93,21 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( for (int nj = ni + 1; nj < NNodes; ++nj) 
{ const int symIdx = diagIdx + (nj - ni); - const float* Kij = K + symIdx * 9; + const T* Kij = K + symIdx * 9; { - const float dj0 = disp[nj * 3 + 0]; - const float dj1 = disp[nj * 3 + 1]; - const float dj2 = disp[nj * 3 + 2]; + const T dj0 = disp[nj * 3 + 0]; + const T dj1 = disp[nj * 3 + 1]; + const T dj2 = disp[nj * 3 + 2]; edf[ni * 3 + 0] += Kij[0] * dj0 + Kij[1] * dj1 + Kij[2] * dj2; edf[ni * 3 + 1] += Kij[3] * dj0 + Kij[4] * dj1 + Kij[5] * dj2; edf[ni * 3 + 2] += Kij[6] * dj0 + Kij[7] * dj1 + Kij[8] * dj2; } { - const float di0 = disp[ni * 3 + 0]; - const float di1 = disp[ni * 3 + 1]; - const float di2 = disp[ni * 3 + 2]; + const T di0 = disp[ni * 3 + 0]; + const T di1 = disp[ni * 3 + 1]; + const T di2 = disp[ni * 3 + 2]; edf[nj * 3 + 0] += Kij[0] * di0 + Kij[3] * di1 + Kij[6] * di2; edf[nj * 3 + 1] += Kij[1] * di0 + Kij[4] * di1 + Kij[7] * di2; edf[nj * 3 + 2] += Kij[2] * di0 + Kij[5] * di1 + Kij[8] * di2; @@ -116,7 +116,7 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( } // Write: eforce = -edf (minus sign from f -= K * displacement) - float* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * 3; #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -130,17 +130,15 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( * Kernel for addDForce: Compute per-element dForce (1 thread per element). * * df = -kFactor * K * dx - * Templated on NNodes (compile-time) for full loop unrolling. - * Hardcoded Dim=3 (CudaVec3f only). 
*/ -template -__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel( +template +__global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( int nbElem, const int* __restrict__ elements, - const float* __restrict__ stiffness, - const float* __restrict__ dx, - float* __restrict__ eforce, - float kFactor) + const T* __restrict__ stiffness, + const T* __restrict__ dx, + T* __restrict__ eforce, + T kFactor) { constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; @@ -148,7 +146,7 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel if (elemId >= nbElem) return; // Gather dx for this element's nodes - float edx[NNodes * 3]; + T edx[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -159,51 +157,47 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel } // Symmetric block-matrix multiply: edf = K * edx - const float* K = stiffness + elemId * NSymBlocks * 9; - float edf[NNodes * 3]; + const T* K = stiffness + elemId * NSymBlocks * 9; + T edf[NNodes * 3]; #pragma unroll for (int i = 0; i < NNodes * 3; ++i) - edf[i] = 0.0f; + edf[i] = T(0); #pragma unroll for (int ni = 0; ni < NNodes; ++ni) { const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; - // Diagonal block (ni, ni): Kii * edx[ni] { - const float* Kii = K + diagIdx * 9; - const float di0 = edx[ni * 3 + 0]; - const float di1 = edx[ni * 3 + 1]; - const float di2 = edx[ni * 3 + 2]; + const T* Kii = K + diagIdx * 9; + const T di0 = edx[ni * 3 + 0]; + const T di1 = edx[ni * 3 + 1]; + const T di2 = edx[ni * 3 + 2]; edf[ni * 3 + 0] += Kii[0] * di0 + Kii[1] * di1 + Kii[2] * di2; edf[ni * 3 + 1] += Kii[3] * di0 + Kii[4] * di1 + Kii[5] * di2; edf[ni * 3 + 2] += Kii[6] * di0 + Kii[7] * di1 + Kii[8] * di2; } - // Off-diagonal blocks (ni, nj) for nj > ni #pragma unroll for (int nj = ni + 1; nj < NNodes; ++nj) { const int symIdx = diagIdx + (nj - ni); - const float* Kij = K + symIdx * 9; + const T* Kij = K + symIdx 
* 9; - // Forward: edf[ni] += Kij * edx[nj] { - const float dj0 = edx[nj * 3 + 0]; - const float dj1 = edx[nj * 3 + 1]; - const float dj2 = edx[nj * 3 + 2]; + const T dj0 = edx[nj * 3 + 0]; + const T dj1 = edx[nj * 3 + 1]; + const T dj2 = edx[nj * 3 + 2]; edf[ni * 3 + 0] += Kij[0] * dj0 + Kij[1] * dj1 + Kij[2] * dj2; edf[ni * 3 + 1] += Kij[3] * dj0 + Kij[4] * dj1 + Kij[5] * dj2; edf[ni * 3 + 2] += Kij[6] * dj0 + Kij[7] * dj1 + Kij[8] * dj2; } - // Symmetric: edf[nj] += Kij^T * edx[ni] { - const float di0 = edx[ni * 3 + 0]; - const float di1 = edx[ni * 3 + 1]; - const float di2 = edx[ni * 3 + 2]; + const T di0 = edx[ni * 3 + 0]; + const T di1 = edx[ni * 3 + 1]; + const T di2 = edx[ni * 3 + 2]; edf[nj * 3 + 0] += Kij[0] * di0 + Kij[3] * di1 + Kij[6] * di2; edf[nj * 3 + 1] += Kij[1] * di0 + Kij[4] * di1 + Kij[7] * di2; edf[nj * 3 + 2] += Kij[2] * di0 + Kij[5] * di1 + Kij[8] * di2; @@ -212,7 +206,7 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel } // Write: eforce = -kFactor * edf - float* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * 3; #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -224,23 +218,19 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel /** * Gather per-vertex forces (1 thread per vertex). - * - * Shared by both addForce and addDForce. - * No atomics: each vertex handled by exactly one thread. - * velems is SoA: velems[s * nbVertex + vertexId], 0-terminated. - * Each entry is (elemId * NNodes + localNode + 1), with 0 as sentinel. 
*/ -__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_gatherForce_kernel( +template +__global__ void ElementLinearSmallStrainFEMForceField_gatherForce_kernel( int nbVertex, int maxElemPerVertex, const int* __restrict__ velems, - const float* __restrict__ eforce, - float* df) + const T* __restrict__ eforce, + T* df) { const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; if (vertexId >= nbVertex) return; - float fx = 0.0f, fy = 0.0f, fz = 0.0f; + T fx = T(0), fy = T(0), fz = T(0); for (int s = 0; s < maxElemPerVertex; ++s) { @@ -257,6 +247,7 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_gatherForce_kernel( df[vertexId * 3 + 2] += fz; } +template static void launchGather( unsigned int nbVertex, unsigned int maxElemPerVertex, @@ -266,17 +257,17 @@ static void launchGather( { const int gatherThreads = 256; const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementLinearSmallStrainFEMForceFieldCuda3f_gatherForce_kernel + ElementLinearSmallStrainFEMForceField_gatherForce_kernel <<>>( nbVertex, maxElemPerVertex, (const int*)velems, - (const float*)eforce, - (float*)f); - mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_gatherForce_kernel"); + (const T*)eforce, + (T*)f); + mycudaDebugError("ElementLinearSmallStrainFEMForceField_gatherForce_kernel"); } -template +template static void launchAddForce( unsigned int nbElem, unsigned int nbVertex, @@ -291,20 +282,20 @@ static void launchAddForce( { const int computeThreads = 64; const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel + ElementLinearSmallStrainFEMForceField_computeForce_kernel <<>>( nbElem, (const int*)elements, - (const float*)stiffness, - (const float*)x, - (const float*)x0, - (float*)eforce); - mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel"); + (const T*)stiffness, + (const T*)x, + (const T*)x0, + (T*)eforce); + 
mycudaDebugError("ElementLinearSmallStrainFEMForceField_computeForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); + launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); } -template +template static void launchAddDForce( unsigned int nbElem, unsigned int nbVertex, @@ -315,21 +306,21 @@ static void launchAddDForce( void* df, void* eforce, const void* velems, - float kFactor) + T kFactor) { const int computeThreads = 64; const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel + ElementLinearSmallStrainFEMForceField_computeDForce_kernel <<>>( nbElem, (const int*)elements, - (const float*)stiffness, - (const float*)dx, - (float*)eforce, + (const T*)stiffness, + (const T*)dx, + (T*)eforce, kFactor); - mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel"); + mycudaDebugError("ElementLinearSmallStrainFEMForceField_computeDForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); + launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); } extern "C" @@ -350,10 +341,10 @@ void ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( { switch (nbNodesPerElem) { - case 2: launchAddForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 3: launchAddForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 4: launchAddForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 8: launchAddForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 4: launchAddForce(nbElem, nbVertex, 
maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; } } @@ -372,10 +363,54 @@ void ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( { switch (nbNodesPerElem) { - case 2: launchAddDForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 3: launchAddDForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 4: launchAddDForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 8: launchAddDForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + } +} + +void ElementLinearSmallStrainFEMForceFieldCuda3d_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems) +{ + switch (nbNodesPerElem) + { + case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 4: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); 
break; + case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + } +} + +void ElementLinearSmallStrainFEMForceFieldCuda3d_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + double kFactor) +{ + switch (nbNodesPerElem) + { + case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; } } diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h index 53cfaf663c5..777d3301ee2 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h @@ -54,6 +54,32 @@ extern "C" void* eforce, const void* velems, float kFactor); + + void ElementLinearSmallStrainFEMForceFieldCuda3d_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems); + + void 
ElementLinearSmallStrainFEMForceFieldCuda3d_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + double kFactor); } } // namespace sofa::gpu::cuda @@ -117,10 +143,10 @@ class CudaElementLinearSmallStrainFEMForceField void uploadStiffnessAndConnectivity(); - gpu::cuda::CudaVector m_gpuStiffness; ///< Symmetric block-format stiffness per element - gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] - gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer - gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated + gpu::cuda::CudaVector m_gpuStiffness; ///< Symmetric block-format stiffness per element + gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] + gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer + gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated unsigned int m_maxElemPerVertex = 0; unsigned int m_nbVertices = 0; diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl index 1ab9dfb33f5..863511e951d 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl @@ -88,7 +88,7 @@ void CudaElementLinearSmallStrainFEMForceField::uploadSt dst[e * nSymBlocks * dim * dim + symIdx * dim * dim + di * dim + 
dj] - = static_cast(K[ni * dim + di][nj * dim + dj]); + = static_cast(K[ni * dim + di][nj * dim + dj]); } } } @@ -175,18 +175,24 @@ void CudaElementLinearSmallStrainFEMForceField::addForce const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(x.size()); - gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( - nbElem, - nbVertex, - trait::NumberOfNodesInElement, - m_maxElemPerVertex, - m_gpuElements.deviceRead(), - m_gpuStiffness.deviceRead(), - x.deviceRead(), - x0.deviceRead(), - f.deviceWrite(), - m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead()); + if constexpr (std::is_same_v) + { + gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3d_addForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), + x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + } + else + { + gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), + x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + } d_f.endEdit(); } @@ -215,7 +221,7 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc if (df.size() < dx.size()) df.resize(dx.size()); - const auto kFactor = static_cast( + const auto kFactor = static_cast( sofa::core::mechanicalparams::kFactorIncludingRayleighDamping( mparams, this->rayleighStiffness.getValue())); @@ -223,18 +229,24 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(dx.size()); - gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( - nbElem, - nbVertex, - trait::NumberOfNodesInElement, - m_maxElemPerVertex, - m_gpuElements.deviceRead(), - m_gpuStiffness.deviceRead(), - 
dx.deviceRead(), - df.deviceWrite(), - m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead(), - kFactor); + if constexpr (std::is_same_v) + { + gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3d_addDForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), + dx.deviceRead(), df.deviceWrite(), + m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), + kFactor); + } + else + { + gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), + dx.deviceRead(), df.deviceWrite(), + m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), + kFactor); + } d_df.endEdit(); } From 07bd243ff2a658b73fe51ff911a7ab0b4db581c1 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Wed, 8 Apr 2026 13:22:28 +0900 Subject: [PATCH 15/21] add benchmarks --- .../benchmarks/Hexahedron_corotational.py | 93 ++++++++++++++ .../Hexahedron_corotational.py.view | 17 +++ .../benchmarks/Tetrahedron_corotational.py | 94 +++++++++++++++ .../Tetrahedron_corotational.py.view | 17 +++ .../benchmarks/utilities.py | 114 ++++++++++++++++++ 5 files changed, 335 insertions(+) create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py.view create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py.view create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/utilities.py diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py 
b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py new file mode 100644 index 00000000000..aec2063fbe6 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py @@ -0,0 +1,93 @@ +import Sofa + +import os +import numpy as np +from utilities import generate_regular_grid + +g_grid_min_corner=(0, 6, -2) +g_grid_max_corner=(16, 10, 2) + +g_fem_version = os.environ.get('FEM_VERSION', 'new') #either 'new' or 'legacy' +g_fem_template = os.environ.get('FEM_TEMPLATE', 'Vec3d') + +# default is (76, 16, 16) +g_grid_nx = int(os.environ.get('NX', '76')) +g_grid_ny = int(os.environ.get('NY', '16')) +g_grid_nz = int(os.environ.get('NZ', '16')) + +g_nb_steps = int(os.environ.get('NBSTEPS', '1000')) + +def createScene(root_node): + root_node.name = "root" + root_node.gravity = (0, -9, 0) + root_node.dt = 0.01 + + plugin_node = root_node.addChild('Plugins') + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Engine.Select") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.LinearSolver.Iterative") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.ODESolver.Backward") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.StateContainer") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Topology.Container.Dynamic") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Topology.Container.Grid") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Visual") + plugin_node.addObject('RequiredPlugin', pluginName='Sofa.Component.Constraint.Projective') # Needed to use components [FixedProjectiveConstraint] + plugin_node.addObject('RequiredPlugin', pluginName='Sofa.Component.Mass') # Needed to use components [DiagonalMass] + plugin_node.addObject('RequiredPlugin', pluginName='Sofa.Component.SolidMechanics.FEM.Elastic') # Needed to use components 
[HexahedronCorotationalFEMForceField] + plugin_node.addObject('RequiredPlugin', pluginName='SofaCUDA.Component') + plugin_node.addObject('VisualStyle', displayFlags="showBehaviorModels showForceFields") + + root_node.addObject('DefaultAnimationLoop') + root_node.addObject('VisualStyle', displayFlags="showBehaviorModels showForceFields") + + grid_nodes, grid_hexa = generate_regular_grid(nx=g_grid_nx, ny=g_grid_ny, nz=g_grid_nz, min_corner=g_grid_min_corner, max_corner=g_grid_max_corner) + + hexahedron_node = root_node.addChild('Hexahedron') + hexahedron_node.addObject('EulerImplicitSolver', rayleighStiffness="0.1", rayleighMass="0.1") + hexahedron_node.addObject('CGLinearSolver', iterations="250", name="linear_solver", tolerance="1.0e-12", threshold="1.0e-12") + hexahedron_node.addObject('MechanicalObject', name="ms", template=g_fem_template, position=grid_nodes) + hexahedron_node.addObject('HexahedronSetTopologyContainer', hexahedra=grid_hexa) + hexahedron_node.addObject('DiagonalMass', totalMass="50.0") + hexahedron_node.addObject('BoxROI', name="boxroi1", box="-0.1 5 -3 0.1 11 3", drawBoxes="1") + hexahedron_node.addObject('FixedProjectiveConstraint', indices="@boxroi1.indices") + if g_fem_version == "legacy": + hexahedron_node.addObject('HexahedronFEMForceField', name="LegacyFEM", template=g_fem_template, youngModulus="4000", poissonRatio="0.3", method="large") + if g_fem_version == "new": + hexahedron_node.addObject('HexahedronCorotationalFEMForceField', name="NewFEM", template=g_fem_template, youngModulus="4000", poissonRatio="0.3") + +def main(): + + enable_gui = False + + try: + import Sofa.Gui + import SofaImGui + except: + enable_gui = False + + root = Sofa.Core.Node("root") + createScene(root) + + Sofa.Simulation.initRoot(root) + + if enable_gui: + Sofa.Gui.GUIManager.Init("myscene","imgui") + Sofa.Gui.GUIManager.createGUI(root, __file__) + Sofa.Gui.GUIManager.MainLoop(root) + Sofa.Gui.GUIManager.closeGUI() + else: + import time + + print(f"Running on 
{g_nb_steps} steps...") + start_timer = time.time() + + for iteration in range(g_nb_steps): + Sofa.Simulation.animate(root, root.dt.value) + + stop_timer = time.time() + print(f"... Done.") + print(f"{g_nb_steps} steps done in {stop_timer - start_timer:.3}s ({g_nb_steps/(stop_timer - start_timer):.5} fps).") + + +# Function used only if this script is called from a python environment +if __name__ == '__main__': + main() diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py.view b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py.view new file mode 100644 index 00000000000..1e9c7f6670c --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py.view @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py new file mode 100644 index 00000000000..d9480c34dfa --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py @@ -0,0 +1,94 @@ +import Sofa + +import os +import numpy as np +from utilities import generate_regular_grid, hexa_to_tetra + +g_grid_min_corner=(0, 6, -2) +g_grid_max_corner=(16, 10, 2) + +g_fem_version = os.environ.get('FEM_VERSION', 'new') #either 'new' or 'legacy' +g_fem_template = os.environ.get('FEM_TEMPLATE', 'Vec3d') + +# default is (76, 16, 16) +g_grid_nx = int(os.environ.get('NX', '76')) +g_grid_ny = int(os.environ.get('NY', '16')) +g_grid_nz = int(os.environ.get('NZ', '16')) + +g_nb_steps = int(os.environ.get('NBSTEPS', '1000')) + +def createScene(root_node): + root_node.name = "root" + root_node.gravity = (0, -9, 0) + root_node.dt = 0.01 + + plugin_node = root_node.addChild('Plugins') + 
plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Engine.Select") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.LinearSolver.Iterative") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.ODESolver.Backward") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.StateContainer") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Topology.Container.Dynamic") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Topology.Container.Grid") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Visual") + plugin_node.addObject('RequiredPlugin', pluginName='Sofa.Component.Constraint.Projective') # Needed to use components [FixedProjectiveConstraint] + plugin_node.addObject('RequiredPlugin', pluginName='Sofa.Component.Mass') # Needed to use components [DiagonalMass] + plugin_node.addObject('RequiredPlugin', pluginName='Sofa.Component.SolidMechanics.FEM.Elastic') # Needed to use components [TetrahedronCorotationalFEMForceField] + plugin_node.addObject('RequiredPlugin', pluginName='SofaCUDA.Component') + plugin_node.addObject('VisualStyle', displayFlags="showBehaviorModels showForceFields") + + root_node.addObject('DefaultAnimationLoop') + root_node.addObject('VisualStyle', displayFlags="showBehaviorModels showForceFields") + + grid_nodes, grid_hexa = generate_regular_grid(nx=g_grid_nx, ny=g_grid_ny, nz=g_grid_nz, min_corner=g_grid_min_corner, max_corner=g_grid_max_corner) + grid_tetra = hexa_to_tetra(grid_hexa) + + tetrahedron_node = root_node.addChild('Tetrahedron') + tetrahedron_node.addObject('EulerImplicitSolver', rayleighStiffness="0.1", rayleighMass="0.1") + tetrahedron_node.addObject('CGLinearSolver', iterations="250", name="linear_solver", tolerance="1.0e-12", threshold="1.0e-12") + tetrahedron_node.addObject('MechanicalObject', name="ms", template=g_fem_template, position=grid_nodes) + tetrahedron_node.addObject('TetrahedronSetTopologyContainer', 
tetrahedra=grid_tetra) + tetrahedron_node.addObject('DiagonalMass', totalMass="50.0") + tetrahedron_node.addObject('BoxROI', name="boxroi1", box="-0.1 5 -3 0.1 11 3", drawBoxes="1") + tetrahedron_node.addObject('FixedProjectiveConstraint', indices="@boxroi1.indices") + if g_fem_version == "legacy": + tetrahedron_node.addObject('TetrahedronFEMForceField', name="LegacyFEM", template=g_fem_template, youngModulus="4000", poissonRatio="0.3", method="large") + if g_fem_version == "new": + tetrahedron_node.addObject('TetrahedronCorotationalFEMForceField', name="NewFEM", template=g_fem_template, youngModulus="4000", poissonRatio="0.3") + +def main(): + + enable_gui = False + + try: + import Sofa.Gui + import SofaImGui + except: + enable_gui = False + + root = Sofa.Core.Node("root") + createScene(root) + + Sofa.Simulation.initRoot(root) + + if enable_gui: + Sofa.Gui.GUIManager.Init("myscene","imgui") + Sofa.Gui.GUIManager.createGUI(root, __file__) + Sofa.Gui.GUIManager.MainLoop(root) + Sofa.Gui.GUIManager.closeGUI() + else: + import time + + print(f"Running on {g_nb_steps} steps...") + start_timer = time.time() + + for iteration in range(g_nb_steps): + Sofa.Simulation.animate(root, root.dt.value) + + stop_timer = time.time() + print(f"... 
Done.") + print(f"{g_nb_steps} steps done in {stop_timer - start_timer:.3}s ({g_nb_steps/(stop_timer - start_timer):.5} fps).") + + +# Function used only if this script is called from a python environment +if __name__ == '__main__': + main() diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py.view b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py.view new file mode 100644 index 00000000000..433112afafd --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py.view @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/utilities.py b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/utilities.py new file mode 100644 index 00000000000..3bdf301c641 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/utilities.py @@ -0,0 +1,114 @@ +import numpy as np + +def generate_regular_grid(nx=10, ny=10, nz=10, min_corner=(0, 0, 0), max_corner=(1, 1, 1)): + """ + Generate a regular grid of hexahedra. 
+ + Args: + nx, ny, nz: Number of vertices in each direction (grid resolution) + min_corner: (xmin, ymin, zmin) tuple + max_corner: (xmax, ymax, zmax) tuple + + Returns: + points: array of shape (nx*ny*nz, 3) - vertex positions + hexahedra: array of shape ((nx-1)*(ny-1)*(nz-1), 8) - hexahedra indices + """ + xmin, ymin, zmin = min_corner + xmax, ymax, zmax = max_corner + + # Compute spacing + dx = (xmax - xmin) / (nx - 1) if nx > 1 else 0 + dy = (ymax - ymin) / (ny - 1) if ny > 1 else 0 + dz = (zmax - zmin) / (nz - 1) if nz > 1 else 0 + + # Generate points + points = [] + for k in range(nz): + for j in range(ny): + for i in range(nx): + points.append([xmin + i*dx, ymin + j*dy, zmin + k*dz]) + points = np.array(points) + + # Helper to get point index from grid coordinates + def point_index(i, j, k): + return nx * (ny * k + j) + i + + # Generate hexahedra (8 vertices per hexa, in SOFA convention) + hexahedra = [] + for k in range(nz - 1): + for j in range(ny - 1): + for i in range(nx - 1): + hexa = [ + point_index(i, j, k), + point_index(i+1, j, k), + point_index(i+1, j+1, k), + point_index(i, j+1, k), + point_index(i, j, k+1), + point_index(i+1, j, k+1), + point_index(i+1, j+1, k+1), + point_index(i, j+1, k+1), + ] + hexahedra.append(hexa) + hexahedra = np.array(hexahedra) + + return points, hexahedra + +def hexa_to_tetra(hexahedra): + """ + Convert hexahedra to tetrahedra. + + Each hexahedron is split into 5 tetrahedra. 
+ + Args: + hexahedra: array of shape (N, 8) - hexahedra vertex indices + + Returns: + tetrahedra: array of shape (N*5, 4) - tetrahedra vertex indices + """ + tetrahedra = [] + + # 5-tetra decomposition using diagonal 1-3-4-6 + splits = [ + [0, 1, 3, 4], + [1, 2, 3, 6], + [1, 4, 5, 6], + [3, 4, 6, 7], + [1, 3, 4, 6], # central tetrahedron + ] + + for hexa in hexahedra: + for split in splits: + tetrahedra.append([hexa[i] for i in split]) + + return np.array(tetrahedra) + +def hexa_to_tetra_symmetric(hexahedra): + """ + Convert hexahedra to tetrahedra using symmetric 6-tetra decomposition. + + Each hexahedron is split into 6 tetrahedra around the space diagonal (0-6). + Better symmetry properties for FEM simulations. + + Args: + hexahedra: array of shape (N, 8) - hexahedra vertex indices + + Returns: + tetrahedra: array of shape (N*6, 4) - tetrahedra vertex indices + """ + tetrahedra = [] + + # 6-tetra symmetric decomposition around diagonal 0-6 + splits = [ + [0, 1, 2, 6], + [0, 2, 3, 6], + [0, 3, 7, 6], + [0, 7, 4, 6], + [0, 4, 5, 6], + [0, 5, 1, 6], + ] + + for hexa in hexahedra: + for split in splits: + tetrahedra.append([hexa[i] for i in split]) + + return np.array(tetrahedra) \ No newline at end of file From f1778f14a7cdf6b20e42f59347a596397c392624 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Thu, 9 Apr 2026 07:35:59 +0900 Subject: [PATCH 16/21] template CUDA kernels on Dim and remove runtime dispatch Replace hardcoded 3D assumption and extern "C" + switch(nbNodesPerElem) runtime dispatch with fully compile-time C++ template parameters . All kernel dimensions, stiffness block sizes, and gather loops are now generic over Dim. The .inl callers use a single template call with constexpr nNodes and dim from the trait, eliminating both the if-constexpr type branching and the runtime NNodes switch. Explicit template instantiations in the .cu files provide the needed symbols. 
Applied to both ElementLinearSmallStrainFEMForceField and ElementCorotationalFEMForceField CUDA implementations. --- .../CudaElementCorotationalFEMForceField.cu | 584 ++++++++---------- .../CudaElementCorotationalFEMForceField.h | 130 ++-- .../CudaElementCorotationalFEMForceField.inl | 87 +-- ...daElementLinearSmallStrainFEMForceField.cu | 335 ++++------ ...udaElementLinearSmallStrainFEMForceField.h | 79 +-- ...aElementLinearSmallStrainFEMForceField.inl | 52 +- 6 files changed, 518 insertions(+), 749 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index afbbdb89532..8cb90f8c540 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -23,14 +23,12 @@ #include #include -#if defined(__cplusplus) namespace sofa { namespace gpu { namespace cuda { -#endif template __device__ T myRsqrt(T x); @@ -164,12 +162,13 @@ __device__ void computeHexahedronFrame(const T* ex, T* frame) /** * Symmetric block-matrix multiply: out = K * in + * Templated on Dim for generic spatial dimensions. 
*/ -template +template __device__ void symBlockMatMul(const T* K, const T* in, T* out) { #pragma unroll - for (int i = 0; i < NNodes * 3; ++i) + for (int i = 0; i < NNodes * Dim; ++i) out[i] = T(0); #pragma unroll @@ -177,38 +176,47 @@ __device__ void symBlockMatMul(const T* K, const T* in, T* out) { const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; + // Diagonal block { - const T* Kii = K + diagIdx * 9; - const T i0 = in[ni * 3 + 0]; - const T i1 = in[ni * 3 + 1]; - const T i2 = in[ni * 3 + 2]; - out[ni * 3 + 0] += Kii[0] * i0 + Kii[1] * i1 + Kii[2] * i2; - out[ni * 3 + 1] += Kii[3] * i0 + Kii[4] * i1 + Kii[5] * i2; - out[ni * 3 + 2] += Kii[6] * i0 + Kii[7] * i1 + Kii[8] * i2; + const T* Kii = K + diagIdx * Dim * Dim; + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kii[di * Dim + dj] * in[ni * Dim + dj]; + out[ni * Dim + di] += sum; + } } + // Off-diagonal blocks #pragma unroll for (int nj = ni + 1; nj < NNodes; ++nj) { const int symIdx = diagIdx + (nj - ni); - const T* Kij = K + symIdx * 9; + const T* Kij = K + symIdx * Dim * Dim; + // Kij * in_j -> out_i + #pragma unroll + for (int di = 0; di < Dim; ++di) { - const T j0 = in[nj * 3 + 0]; - const T j1 = in[nj * 3 + 1]; - const T j2 = in[nj * 3 + 2]; - out[ni * 3 + 0] += Kij[0] * j0 + Kij[1] * j1 + Kij[2] * j2; - out[ni * 3 + 1] += Kij[3] * j0 + Kij[4] * j1 + Kij[5] * j2; - out[ni * 3 + 2] += Kij[6] * j0 + Kij[7] * j1 + Kij[8] * j2; + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kij[di * Dim + dj] * in[nj * Dim + dj]; + out[ni * Dim + di] += sum; } + // Kij^T * in_i -> out_j + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) { - const T i0 = in[ni * 3 + 0]; - const T i1 = in[ni * 3 + 1]; - const T i2 = in[ni * 3 + 2]; - out[nj * 3 + 0] += Kij[0] * i0 + Kij[3] * i1 + Kij[6] * i2; - out[nj * 3 + 1] += Kij[1] * i0 + Kij[4] * i1 + Kij[7] * i2; - out[nj * 3 + 2] += Kij[2] * i0 + Kij[5] * i1 + Kij[8] * 
i2; + T sum = T(0); + #pragma unroll + for (int di = 0; di < Dim; ++di) + sum += Kij[di * Dim + dj] * in[ni * Dim + di]; + out[nj * Dim + dj] += sum; } } } @@ -216,8 +224,9 @@ __device__ void symBlockMatMul(const T* K, const T* in, T* out) /** * Combined kernel: compute rotations AND per-element forces in one pass. + * Rotation computation is inherently 3D (cross products). */ -template +template __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel( int nbElem, const int* __restrict__ elements, @@ -228,88 +237,112 @@ __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel T* __restrict__ rotationsOut, T* __restrict__ eforce) { + static_assert(Dim == 3, "Corotational rotation computation requires Dim == 3"); constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; const T invN = T(1) / T(NNodes); const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - T ex[NNodes * 3], ex0[NNodes * 3]; + T ex[NNodes * Dim], ex0[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) { const int nodeId = elements[n * nbElem + elemId]; - ex[n * 3 + 0] = x[nodeId * 3 + 0]; - ex[n * 3 + 1] = x[nodeId * 3 + 1]; - ex[n * 3 + 2] = x[nodeId * 3 + 2]; - ex0[n * 3 + 0] = x0[nodeId * 3 + 0]; - ex0[n * 3 + 1] = x0[nodeId * 3 + 1]; - ex0[n * 3 + 2] = x0[nodeId * 3 + 2]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + { + ex[n * Dim + d] = x[nodeId * Dim + d]; + ex0[n * Dim + d] = x0[nodeId * Dim + d]; + } } - T frame[9]; + T frame[Dim * Dim]; if constexpr (NNodes == 8) computeHexahedronFrame(ex, frame); else computeTriangleFrame(ex, frame); // R = frame^T * initRot - const T* irt = initRotTransposed + elemId * 9; - T R[9]; + const T* irt = initRotTransposed + elemId * Dim * Dim; + T R[Dim * Dim]; mat3TransposeMul(frame, irt, R); - T* Rout = rotationsOut + elemId * 9; + T* Rout = rotationsOut + elemId * Dim * Dim; #pragma unroll - for (int i = 0; i < 9; ++i) + for (int i = 0; i < Dim * Dim; ++i) Rout[i] 
= R[i]; - T cx = T(0), cy = T(0), cz = T(0); - T cx0 = T(0), cy0 = T(0), cz0 = T(0); + T center[Dim], center0[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + { + center[d] = T(0); + center0[d] = T(0); + } #pragma unroll for (int n = 0; n < NNodes; ++n) { - cx += ex[n * 3 + 0]; cy += ex[n * 3 + 1]; cz += ex[n * 3 + 2]; - cx0 += ex0[n * 3 + 0]; cy0 += ex0[n * 3 + 1]; cz0 += ex0[n * 3 + 2]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + { + center[d] += ex[n * Dim + d]; + center0[d] += ex0[n * Dim + d]; + } + } + #pragma unroll + for (int d = 0; d < Dim; ++d) + { + center[d] *= invN; + center0[d] *= invN; } - cx *= invN; cy *= invN; cz *= invN; - cx0 *= invN; cy0 *= invN; cz0 *= invN; - T disp[NNodes * 3]; + T disp[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const T dx = ex[n * 3 + 0] - cx; - const T dy = ex[n * 3 + 1] - cy; - const T dz = ex[n * 3 + 2] - cz; - const T rx = R[0] * dx + R[3] * dy + R[6] * dz; - const T ry = R[1] * dx + R[4] * dy + R[7] * dz; - const T rz = R[2] * dx + R[5] * dy + R[8] * dz; - disp[n * 3 + 0] = rx - (ex0[n * 3 + 0] - cx0); - disp[n * 3 + 1] = ry - (ex0[n * 3 + 1] - cy0); - disp[n * 3 + 2] = rz - (ex0[n * 3 + 2] - cz0); + // R^T * (x_n - center) + T diff[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + diff[d] = ex[n * Dim + d] - center[d]; + + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T rotated = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + rotated += R[dj * Dim + di] * diff[dj]; + disp[n * Dim + di] = rotated - (ex0[n * Dim + di] - center0[di]); + } } - T edf[NNodes * 3]; - const T* K = stiffness + elemId * NSymBlocks * 9; - symBlockMatMul(K, disp, edf); + T edf[NNodes * Dim]; + const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; + symBlockMatMul(K, disp, edf); - T* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const T e0 = edf[n * 3 + 0]; - const T e1 = edf[n * 3 + 1]; - 
const T e2 = edf[n * 3 + 2]; - out[n * 3 + 0] = -(R[0] * e0 + R[1] * e1 + R[2] * e2); - out[n * 3 + 1] = -(R[3] * e0 + R[4] * e1 + R[5] * e2); - out[n * 3 + 2] = -(R[6] * e0 + R[7] * e1 + R[8] * e2); + // R * edf_n, negated + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += R[di * Dim + dj] * edf[n * Dim + dj]; + out[n * Dim + di] = -sum; + } } } /** * Kernel for addForce: Compute per-element force (1 thread per element). */ -template +template __global__ void ElementCorotationalFEMForceField_computeForce_kernel( int nbElem, const int* __restrict__ elements, @@ -325,72 +358,93 @@ __global__ void ElementCorotationalFEMForceField_computeForce_kernel( const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - const T* Rptr = rotations + elemId * 9; - T R[9]; + const T* Rptr = rotations + elemId * Dim * Dim; + T R[Dim * Dim]; #pragma unroll - for (int i = 0; i < 9; ++i) + for (int i = 0; i < Dim * Dim; ++i) R[i] = Rptr[i]; - T ex[NNodes * 3], ex0[NNodes * 3]; + T ex[NNodes * Dim], ex0[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) { const int nodeId = elements[n * nbElem + elemId]; - ex[n * 3 + 0] = x[nodeId * 3 + 0]; - ex[n * 3 + 1] = x[nodeId * 3 + 1]; - ex[n * 3 + 2] = x[nodeId * 3 + 2]; - ex0[n * 3 + 0] = x0[nodeId * 3 + 0]; - ex0[n * 3 + 1] = x0[nodeId * 3 + 1]; - ex0[n * 3 + 2] = x0[nodeId * 3 + 2]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + { + ex[n * Dim + d] = x[nodeId * Dim + d]; + ex0[n * Dim + d] = x0[nodeId * Dim + d]; + } } - T cx = T(0), cy = T(0), cz = T(0); - T cx0 = T(0), cy0 = T(0), cz0 = T(0); + T center[Dim], center0[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + { + center[d] = T(0); + center0[d] = T(0); + } #pragma unroll for (int n = 0; n < NNodes; ++n) { - cx += ex[n * 3 + 0]; cy += ex[n * 3 + 1]; cz += ex[n * 3 + 2]; - cx0 += ex0[n * 3 + 0]; cy0 += ex0[n * 3 + 1]; cz0 += ex0[n * 3 + 2]; + #pragma 
unroll + for (int d = 0; d < Dim; ++d) + { + center[d] += ex[n * Dim + d]; + center0[d] += ex0[n * Dim + d]; + } + } + #pragma unroll + for (int d = 0; d < Dim; ++d) + { + center[d] *= invN; + center0[d] *= invN; } - cx *= invN; cy *= invN; cz *= invN; - cx0 *= invN; cy0 *= invN; cz0 *= invN; - T disp[NNodes * 3]; + T disp[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const T dx = ex[n * 3 + 0] - cx; - const T dy = ex[n * 3 + 1] - cy; - const T dz = ex[n * 3 + 2] - cz; - const T rx = R[0] * dx + R[3] * dy + R[6] * dz; - const T ry = R[1] * dx + R[4] * dy + R[7] * dz; - const T rz = R[2] * dx + R[5] * dy + R[8] * dz; - disp[n * 3 + 0] = rx - (ex0[n * 3 + 0] - cx0); - disp[n * 3 + 1] = ry - (ex0[n * 3 + 1] - cy0); - disp[n * 3 + 2] = rz - (ex0[n * 3 + 2] - cz0); + T diff[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + diff[d] = ex[n * Dim + d] - center[d]; + + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T rotated = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + rotated += R[dj * Dim + di] * diff[dj]; + disp[n * Dim + di] = rotated - (ex0[n * Dim + di] - center0[di]); + } } - T edf[NNodes * 3]; - const T* K = stiffness + elemId * NSymBlocks * 9; - symBlockMatMul(K, disp, edf); + T edf[NNodes * Dim]; + const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; + symBlockMatMul(K, disp, edf); - T* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const T e0 = edf[n * 3 + 0]; - const T e1 = edf[n * 3 + 1]; - const T e2 = edf[n * 3 + 2]; - out[n * 3 + 0] = -(R[0] * e0 + R[1] * e1 + R[2] * e2); - out[n * 3 + 1] = -(R[3] * e0 + R[4] * e1 + R[5] * e2); - out[n * 3 + 2] = -(R[6] * e0 + R[7] * e1 + R[8] * e2); + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += R[di * Dim + dj] * edf[n * Dim + dj]; + out[n * Dim + di] = -sum; + } } } /** * Kernel for addDForce: 
Compute per-element dForce (1 thread per element). */ -template +template __global__ void ElementCorotationalFEMForceField_computeDForce_kernel( int nbElem, const int* __restrict__ elements, @@ -405,46 +459,59 @@ __global__ void ElementCorotationalFEMForceField_computeDForce_kernel( const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - const T* Rptr = rotations + elemId * 9; - T R[9]; + const T* Rptr = rotations + elemId * Dim * Dim; + T R[Dim * Dim]; #pragma unroll - for (int i = 0; i < 9; ++i) + for (int i = 0; i < Dim * Dim; ++i) R[i] = Rptr[i]; - T rdx[NNodes * 3]; + // R^T * dx for each node + T rdx[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) { const int nodeId = elements[n * nbElem + elemId]; - const T dx_x = dx[nodeId * 3 + 0]; - const T dx_y = dx[nodeId * 3 + 1]; - const T dx_z = dx[nodeId * 3 + 2]; - rdx[n * 3 + 0] = R[0] * dx_x + R[3] * dx_y + R[6] * dx_z; - rdx[n * 3 + 1] = R[1] * dx_x + R[4] * dx_y + R[7] * dx_z; - rdx[n * 3 + 2] = R[2] * dx_x + R[5] * dx_y + R[8] * dx_z; + T nodeDx[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + nodeDx[d] = dx[nodeId * Dim + d]; + + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += R[dj * Dim + di] * nodeDx[dj]; + rdx[n * Dim + di] = sum; + } } - const T* K = stiffness + elemId * NSymBlocks * 9; - T edf[NNodes * 3]; - symBlockMatMul(K, rdx, edf); + const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; + T edf[NNodes * Dim]; + symBlockMatMul(K, rdx, edf); - T* out = eforce + elemId * NNodes * 3; + // R * edf, scaled by -kFactor + T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const T e0 = edf[n * 3 + 0]; - const T e1 = edf[n * 3 + 1]; - const T e2 = edf[n * 3 + 2]; - out[n * 3 + 0] = -kFactor * (R[0] * e0 + R[1] * e1 + R[2] * e2); - out[n * 3 + 1] = -kFactor * (R[3] * e0 + R[4] * e1 + R[5] * e2); - out[n * 3 + 2] = -kFactor * 
(R[6] * e0 + R[7] * e1 + R[8] * e2); + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += R[di * Dim + dj] * edf[n * Dim + dj]; + out[n * Dim + di] = -kFactor * sum; + } } } /** * Gather per-vertex forces (1 thread per vertex). */ -template +template __global__ void ElementCorotationalFEMForceField_gatherForce_kernel( int nbVertex, int maxElemPerVertex, @@ -455,45 +522,30 @@ __global__ void ElementCorotationalFEMForceField_gatherForce_kernel( const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; if (vertexId >= nbVertex) return; - T fx = T(0), fy = T(0), fz = T(0); + T acc[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + acc[d] = T(0); for (int s = 0; s < maxElemPerVertex; ++s) { const int idx = velems[s * nbVertex + vertexId]; if (idx == 0) break; - const int base = (idx - 1) * 3; - fx += eforce[base + 0]; - fy += eforce[base + 1]; - fz += eforce[base + 2]; + const int base = (idx - 1) * Dim; + #pragma unroll + for (int d = 0; d < Dim; ++d) + acc[d] += eforce[base + d]; } - df[vertexId * 3 + 0] += fx; - df[vertexId * 3 + 1] += fy; - df[vertexId * 3 + 2] += fz; + #pragma unroll + for (int d = 0; d < Dim; ++d) + df[vertexId * Dim + d] += acc[d]; } -template -static void launchGather( - unsigned int nbVertex, - unsigned int maxElemPerVertex, - const void* velems, - const void* eforce, - void* f) -{ - const int gatherThreads = 256; - const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementCorotationalFEMForceField_gatherForce_kernel - <<>>( - nbVertex, - maxElemPerVertex, - (const int*)velems, - (const T*)eforce, - (T*)f); - mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); -} +// ===================== Launch functions (C++ templates) ===================== -template -static void launchAddForceWithRotations( +template +void ElementCorotationalFEMForceFieldCuda_addForceWithRotations( unsigned int nbElem, unsigned int nbVertex, 
unsigned int maxElemPerVertex, @@ -508,8 +560,8 @@ static void launchAddForceWithRotations( const void* velems) { const int computeThreads = 64; - const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementCorotationalFEMForceField_computeRotationsAndForce_kernel + int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementCorotationalFEMForceField_computeRotationsAndForce_kernel <<>>( nbElem, (const int*)elements, @@ -521,11 +573,20 @@ static void launchAddForceWithRotations( (T*)eforce); mycudaDebugError("ElementCorotationalFEMForceField_computeRotationsAndForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); + const int gatherThreads = 256; + numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementCorotationalFEMForceField_gatherForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const T*)eforce, + (T*)f); + mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); } -template -static void launchAddForce( +template +void ElementCorotationalFEMForceFieldCuda_addForce( unsigned int nbElem, unsigned int nbVertex, unsigned int maxElemPerVertex, @@ -539,8 +600,8 @@ static void launchAddForce( const void* velems) { const int computeThreads = 64; - const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementCorotationalFEMForceField_computeForce_kernel + int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementCorotationalFEMForceField_computeForce_kernel <<>>( nbElem, (const int*)elements, @@ -551,11 +612,20 @@ static void launchAddForce( (T*)eforce); mycudaDebugError("ElementCorotationalFEMForceField_computeForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); + const int gatherThreads = 256; + numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementCorotationalFEMForceField_gatherForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const T*)eforce, + (T*)f); 
+ mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); } -template -static void launchAddDForce( +template +void ElementCorotationalFEMForceFieldCuda_addDForce( unsigned int nbElem, unsigned int nbVertex, unsigned int maxElemPerVertex, @@ -569,8 +639,8 @@ static void launchAddDForce( T kFactor) { const int computeThreads = 64; - const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementCorotationalFEMForceField_computeDForce_kernel + int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementCorotationalFEMForceField_computeDForce_kernel <<>>( nbElem, (const int*)elements, @@ -581,158 +651,48 @@ static void launchAddDForce( kFactor); mycudaDebugError("ElementCorotationalFEMForceField_computeDForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); -} - -extern "C" -{ - -// ==================== float versions ==================== - -void ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* initRotTransposed, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - void* rotationsOut, - const void* velems) -{ - switch (nbNodesPerElem) - { - case 3: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - case 4: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - case 8: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - } -} - -void ElementCorotationalFEMForceFieldCuda3f_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* 
elements, - const void* rotations, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems) -{ - switch (nbNodesPerElem) - { - case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 4: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - } -} - -void ElementCorotationalFEMForceFieldCuda3f_addDForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* rotations, - const void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - float kFactor) -{ - switch (nbNodesPerElem) - { - case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - } -} - -// ==================== double versions ==================== - -void ElementCorotationalFEMForceFieldCuda3d_addForceWithRotations( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* initRotTransposed, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - void* 
rotationsOut, - const void* velems) -{ - switch (nbNodesPerElem) - { - case 3: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - case 4: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - case 8: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - } -} - -void ElementCorotationalFEMForceFieldCuda3d_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* rotations, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems) -{ - switch (nbNodesPerElem) - { - case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 4: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - } -} - -void ElementCorotationalFEMForceFieldCuda3d_addDForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* rotations, - const void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - double kFactor) -{ - switch (nbNodesPerElem) - { - case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, 
elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - } + const int gatherThreads = 256; + numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementCorotationalFEMForceField_gatherForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const T*)eforce, + (T*)df); + mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); } -} // extern "C" +// ===================== Explicit template instantiations ===================== + +// addForceWithRotations: only NNodes >= 3 (triangle/quad/hex rotation methods) +template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); +template void 
ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); + +// addForce: all element types +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); + +// addDForce: all element types +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, 
void*, void*, const void*, float); +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, float); +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, float); +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, float); +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); -#if defined(__cplusplus) } // namespace cuda } // namespace gpu } // namespace sofa -#endif diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h index 7ec167dcae3..820b4c915a1 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h +++ 
b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h @@ -27,94 +27,48 @@ namespace sofa::gpu::cuda { -extern "C" -{ - void ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* initRotTransposed, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - void* rotationsOut, - const void* velems); - - void ElementCorotationalFEMForceFieldCuda3f_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* rotations, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems); - - void ElementCorotationalFEMForceFieldCuda3f_addDForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* rotations, - const void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - float kFactor); - - void ElementCorotationalFEMForceFieldCuda3d_addForceWithRotations( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* initRotTransposed, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - void* rotationsOut, - const void* velems); - - void ElementCorotationalFEMForceFieldCuda3d_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* rotations, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems); - - void ElementCorotationalFEMForceFieldCuda3d_addDForce( - unsigned int 
nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* rotations, - const void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - double kFactor); -} +template +void ElementCorotationalFEMForceFieldCuda_addForceWithRotations( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* initRotTransposed, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + void* rotationsOut, + const void* velems); + +template +void ElementCorotationalFEMForceFieldCuda_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems); + +template +void ElementCorotationalFEMForceFieldCuda_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + T kFactor); } // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index effb420ab61..2359244539e 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -51,7 +51,6 @@ void CudaElementCorotationalFEMForceField::uploadStiffne const auto& assembledMatrices = this->m_assembledStiffnessMatrices; const auto nbElem = 
elements.size(); - constexpr auto nDofs = trait::NumberOfDofsInElement; constexpr auto nNodes = trait::NumberOfNodesInElement; constexpr auto dim = trait::spatial_dimensions; @@ -253,6 +252,8 @@ void CudaElementCorotationalFEMForceField::addForce( } using trait = sofa::component::solidmechanics::fem::elastic::trait; + constexpr auto nNodes = trait::NumberOfNodesInElement; + constexpr auto dim = trait::spatial_dimensions; const VecCoord& x = d_x.getValue(); auto restPositionAccessor = this->mstate->readRestPositions(); @@ -266,56 +267,34 @@ void CudaElementCorotationalFEMForceField::addForce( if (f.size() < x.size()) f.resize(x.size()); - if (m_gpuRotationMethodSupported) + if constexpr (nNodes >= 3) { - // Fully GPU path: compute rotations + forces in one kernel - if constexpr (std::is_same_v) - { - gpu::cuda::ElementCorotationalFEMForceFieldCuda3d_addForceWithRotations( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuInitialRotationsTransposed.deviceRead(), - m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), - m_gpuRotations.deviceWrite(), m_gpuVelems.deviceRead()); - } - else + if (m_gpuRotationMethodSupported) { - gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + gpu::cuda::ElementCorotationalFEMForceFieldCuda_addForceWithRotations( + nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuInitialRotationsTransposed.deviceRead(), m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), f.deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuRotations.deviceWrite(), m_gpuVelems.deviceRead()); - } - m_gpuRotationsUploaded = true; - } - else - { - // CPU rotations + GPU forces - this->computeRotations(this->m_rotations, x, x0); - uploadRotations(); - - if constexpr (std::is_same_v) - { - 
gpu::cuda::ElementCorotationalFEMForceFieldCuda3d_addForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), - m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead()); - } - else - { - gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), - m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead()); + m_gpuRotationsUploaded = true; + d_f.endEdit(); + return; } } + // CPU rotations + GPU forces + this->computeRotations(this->m_rotations, x, x0); + uploadRotations(); + + gpu::cuda::ElementCorotationalFEMForceFieldCuda_addForce( + nbElem, nbVertex, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + d_f.endEdit(); } @@ -336,6 +315,8 @@ void CudaElementCorotationalFEMForceField::addDForce( } using trait = sofa::component::solidmechanics::fem::elastic::trait; + constexpr auto nNodes = trait::NumberOfNodesInElement; + constexpr auto dim = trait::spatial_dimensions; VecDeriv& df = *d_df.beginEdit(); const VecDeriv& dx = d_dx.getValue(); @@ -351,24 +332,12 @@ void CudaElementCorotationalFEMForceField::addDForce( const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(dx.size()); - if constexpr (std::is_same_v) - { - gpu::cuda::ElementCorotationalFEMForceFieldCuda3d_addDForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), - m_gpuStiffness.deviceRead(), dx.deviceRead(), - df.deviceWrite(), 
m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead(), kFactor); - } - else - { - gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addDForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), - m_gpuStiffness.deviceRead(), dx.deviceRead(), - df.deviceWrite(), m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead(), kFactor); - } + gpu::cuda::ElementCorotationalFEMForceFieldCuda_addDForce( + nbElem, nbVertex, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), dx.deviceRead(), + df.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead(), kFactor); d_df.endEdit(); } diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu index 6752bd29af1..e8492615c32 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu @@ -23,23 +23,21 @@ #include #include -#if defined(__cplusplus) namespace sofa { namespace gpu { namespace cuda { -#endif /** * Kernel for addForce: Compute per-element force from displacement (1 thread per element). * * f = -K * (x - x0) - * Templated on NNodes (compile-time) for full loop unrolling. + * Templated on NNodes and Dim (compile-time) for full loop unrolling. * Templated on T for float/double support. 
*/ -template +template __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( int nbElem, const int* __restrict__ elements, @@ -54,22 +52,22 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( if (elemId >= nbElem) return; // Gather displacement = x - x0 for this element's nodes - T disp[NNodes * 3]; + T disp[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) { const int nodeId = elements[n * nbElem + elemId]; - disp[n * 3 + 0] = x[nodeId * 3 + 0] - x0[nodeId * 3 + 0]; - disp[n * 3 + 1] = x[nodeId * 3 + 1] - x0[nodeId * 3 + 1]; - disp[n * 3 + 2] = x[nodeId * 3 + 2] - x0[nodeId * 3 + 2]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + disp[n * Dim + d] = x[nodeId * Dim + d] - x0[nodeId * Dim + d]; } // Symmetric block-matrix multiply: edf = K * disp - const T* K = stiffness + elemId * NSymBlocks * 9; - T edf[NNodes * 3]; + const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; + T edf[NNodes * Dim]; #pragma unroll - for (int i = 0; i < NNodes * 3; ++i) + for (int i = 0; i < NNodes * Dim; ++i) edf[i] = T(0); #pragma unroll @@ -79,13 +77,16 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( // Diagonal block { - const T* Kii = K + diagIdx * 9; - const T di0 = disp[ni * 3 + 0]; - const T di1 = disp[ni * 3 + 1]; - const T di2 = disp[ni * 3 + 2]; - edf[ni * 3 + 0] += Kii[0] * di0 + Kii[1] * di1 + Kii[2] * di2; - edf[ni * 3 + 1] += Kii[3] * di0 + Kii[4] * di1 + Kii[5] * di2; - edf[ni * 3 + 2] += Kii[6] * di0 + Kii[7] * di1 + Kii[8] * di2; + const T* Kii = K + diagIdx * Dim * Dim; + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kii[di * Dim + dj] * disp[ni * Dim + dj]; + edf[ni * Dim + di] += sum; + } } // Off-diagonal blocks @@ -93,36 +94,40 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( for (int nj = ni + 1; nj < NNodes; ++nj) { const int symIdx = diagIdx + (nj - ni); 
- const T* Kij = K + symIdx * 9; + const T* Kij = K + symIdx * Dim * Dim; + // Kij * disp_j -> edf_i + #pragma unroll + for (int di = 0; di < Dim; ++di) { - const T dj0 = disp[nj * 3 + 0]; - const T dj1 = disp[nj * 3 + 1]; - const T dj2 = disp[nj * 3 + 2]; - edf[ni * 3 + 0] += Kij[0] * dj0 + Kij[1] * dj1 + Kij[2] * dj2; - edf[ni * 3 + 1] += Kij[3] * dj0 + Kij[4] * dj1 + Kij[5] * dj2; - edf[ni * 3 + 2] += Kij[6] * dj0 + Kij[7] * dj1 + Kij[8] * dj2; + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kij[di * Dim + dj] * disp[nj * Dim + dj]; + edf[ni * Dim + di] += sum; } + // Kij^T * disp_i -> edf_j + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) { - const T di0 = disp[ni * 3 + 0]; - const T di1 = disp[ni * 3 + 1]; - const T di2 = disp[ni * 3 + 2]; - edf[nj * 3 + 0] += Kij[0] * di0 + Kij[3] * di1 + Kij[6] * di2; - edf[nj * 3 + 1] += Kij[1] * di0 + Kij[4] * di1 + Kij[7] * di2; - edf[nj * 3 + 2] += Kij[2] * di0 + Kij[5] * di1 + Kij[8] * di2; + T sum = T(0); + #pragma unroll + for (int di = 0; di < Dim; ++di) + sum += Kij[di * Dim + dj] * disp[ni * Dim + di]; + edf[nj * Dim + dj] += sum; } } } // Write: eforce = -edf (minus sign from f -= K * displacement) - T* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) { - out[n * 3 + 0] = -edf[n * 3 + 0]; - out[n * 3 + 1] = -edf[n * 3 + 1]; - out[n * 3 + 2] = -edf[n * 3 + 2]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + out[n * Dim + d] = -edf[n * Dim + d]; } } @@ -131,7 +136,7 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( * * df = -kFactor * K * dx */ -template +template __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( int nbElem, const int* __restrict__ elements, @@ -146,22 +151,22 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( if (elemId >= nbElem) return; // Gather dx for this element's nodes - T edx[NNodes * 3]; + T 
edx[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) { const int nodeId = elements[n * nbElem + elemId]; - edx[n * 3 + 0] = dx[nodeId * 3 + 0]; - edx[n * 3 + 1] = dx[nodeId * 3 + 1]; - edx[n * 3 + 2] = dx[nodeId * 3 + 2]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + edx[n * Dim + d] = dx[nodeId * Dim + d]; } // Symmetric block-matrix multiply: edf = K * edx - const T* K = stiffness + elemId * NSymBlocks * 9; - T edf[NNodes * 3]; + const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; + T edf[NNodes * Dim]; #pragma unroll - for (int i = 0; i < NNodes * 3; ++i) + for (int i = 0; i < NNodes * Dim; ++i) edf[i] = T(0); #pragma unroll @@ -170,56 +175,61 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; { - const T* Kii = K + diagIdx * 9; - const T di0 = edx[ni * 3 + 0]; - const T di1 = edx[ni * 3 + 1]; - const T di2 = edx[ni * 3 + 2]; - edf[ni * 3 + 0] += Kii[0] * di0 + Kii[1] * di1 + Kii[2] * di2; - edf[ni * 3 + 1] += Kii[3] * di0 + Kii[4] * di1 + Kii[5] * di2; - edf[ni * 3 + 2] += Kii[6] * di0 + Kii[7] * di1 + Kii[8] * di2; + const T* Kii = K + diagIdx * Dim * Dim; + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kii[di * Dim + dj] * edx[ni * Dim + dj]; + edf[ni * Dim + di] += sum; + } } #pragma unroll for (int nj = ni + 1; nj < NNodes; ++nj) { const int symIdx = diagIdx + (nj - ni); - const T* Kij = K + symIdx * 9; + const T* Kij = K + symIdx * Dim * Dim; + #pragma unroll + for (int di = 0; di < Dim; ++di) { - const T dj0 = edx[nj * 3 + 0]; - const T dj1 = edx[nj * 3 + 1]; - const T dj2 = edx[nj * 3 + 2]; - edf[ni * 3 + 0] += Kij[0] * dj0 + Kij[1] * dj1 + Kij[2] * dj2; - edf[ni * 3 + 1] += Kij[3] * dj0 + Kij[4] * dj1 + Kij[5] * dj2; - edf[ni * 3 + 2] += Kij[6] * dj0 + Kij[7] * dj1 + Kij[8] * dj2; + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kij[di * 
Dim + dj] * edx[nj * Dim + dj]; + edf[ni * Dim + di] += sum; } + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) { - const T di0 = edx[ni * 3 + 0]; - const T di1 = edx[ni * 3 + 1]; - const T di2 = edx[ni * 3 + 2]; - edf[nj * 3 + 0] += Kij[0] * di0 + Kij[3] * di1 + Kij[6] * di2; - edf[nj * 3 + 1] += Kij[1] * di0 + Kij[4] * di1 + Kij[7] * di2; - edf[nj * 3 + 2] += Kij[2] * di0 + Kij[5] * di1 + Kij[8] * di2; + T sum = T(0); + #pragma unroll + for (int di = 0; di < Dim; ++di) + sum += Kij[di * Dim + dj] * edx[ni * Dim + di]; + edf[nj * Dim + dj] += sum; } } } // Write: eforce = -kFactor * edf - T* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) { - out[n * 3 + 0] = -kFactor * edf[n * 3 + 0]; - out[n * 3 + 1] = -kFactor * edf[n * 3 + 1]; - out[n * 3 + 2] = -kFactor * edf[n * 3 + 2]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + out[n * Dim + d] = -kFactor * edf[n * Dim + d]; } } /** * Gather per-vertex forces (1 thread per vertex). 
*/ -template +template __global__ void ElementLinearSmallStrainFEMForceField_gatherForce_kernel( int nbVertex, int maxElemPerVertex, @@ -230,45 +240,28 @@ __global__ void ElementLinearSmallStrainFEMForceField_gatherForce_kernel( const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; if (vertexId >= nbVertex) return; - T fx = T(0), fy = T(0), fz = T(0); + T acc[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + acc[d] = T(0); for (int s = 0; s < maxElemPerVertex; ++s) { const int idx = velems[s * nbVertex + vertexId]; if (idx == 0) break; - const int base = (idx - 1) * 3; - fx += eforce[base + 0]; - fy += eforce[base + 1]; - fz += eforce[base + 2]; + const int base = (idx - 1) * Dim; + #pragma unroll + for (int d = 0; d < Dim; ++d) + acc[d] += eforce[base + d]; } - df[vertexId * 3 + 0] += fx; - df[vertexId * 3 + 1] += fy; - df[vertexId * 3 + 2] += fz; -} - -template -static void launchGather( - unsigned int nbVertex, - unsigned int maxElemPerVertex, - const void* velems, - const void* eforce, - void* f) -{ - const int gatherThreads = 256; - const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementLinearSmallStrainFEMForceField_gatherForce_kernel - <<>>( - nbVertex, - maxElemPerVertex, - (const int*)velems, - (const T*)eforce, - (T*)f); - mycudaDebugError("ElementLinearSmallStrainFEMForceField_gatherForce_kernel"); + #pragma unroll + for (int d = 0; d < Dim; ++d) + df[vertexId * Dim + d] += acc[d]; } -template -static void launchAddForce( +template +void ElementLinearSmallStrainFEMForceFieldCuda_addForce( unsigned int nbElem, unsigned int nbVertex, unsigned int maxElemPerVertex, @@ -281,8 +274,8 @@ static void launchAddForce( const void* velems) { const int computeThreads = 64; - const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementLinearSmallStrainFEMForceField_computeForce_kernel + int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementLinearSmallStrainFEMForceField_computeForce_kernel 
<<>>( nbElem, (const int*)elements, @@ -292,11 +285,20 @@ static void launchAddForce( (T*)eforce); mycudaDebugError("ElementLinearSmallStrainFEMForceField_computeForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); + const int gatherThreads = 256; + numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementLinearSmallStrainFEMForceField_gatherForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const T*)eforce, + (T*)f); + mycudaDebugError("ElementLinearSmallStrainFEMForceField_gatherForce_kernel"); } -template -static void launchAddDForce( +template +void ElementLinearSmallStrainFEMForceFieldCuda_addDForce( unsigned int nbElem, unsigned int nbVertex, unsigned int maxElemPerVertex, @@ -309,8 +311,8 @@ static void launchAddDForce( T kFactor) { const int computeThreads = 64; - const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementLinearSmallStrainFEMForceField_computeDForce_kernel + int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementLinearSmallStrainFEMForceField_computeDForce_kernel <<>>( nbElem, (const int*)elements, @@ -320,104 +322,37 @@ static void launchAddDForce( kFactor); mycudaDebugError("ElementLinearSmallStrainFEMForceField_computeDForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); -} - -extern "C" -{ - -void ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems) -{ - switch (nbNodesPerElem) - { - case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 4: launchAddForce(nbElem, nbVertex, 
maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - } -} - -void ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - float kFactor) -{ - switch (nbNodesPerElem) - { - case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - } -} - -void ElementLinearSmallStrainFEMForceFieldCuda3d_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems) -{ - switch (nbNodesPerElem) - { - case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 4: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - } -} - -void ElementLinearSmallStrainFEMForceFieldCuda3d_addDForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - 
unsigned int maxElemPerVertex, - const void* elements, - const void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - double kFactor) -{ - switch (nbNodesPerElem) - { - case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - } + const int gatherThreads = 256; + numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementLinearSmallStrainFEMForceField_gatherForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const T*)eforce, + (T*)df); + mycudaDebugError("ElementLinearSmallStrainFEMForceField_gatherForce_kernel"); } -} // extern "C" +// Explicit template instantiations for all supported (T, NNodes, Dim) combinations +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const 
void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); + +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned 
int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); -#if defined(__cplusplus) } // namespace cuda } // namespace gpu } // namespace sofa -#endif diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h index 777d3301ee2..45d119846e8 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h @@ -27,60 +27,31 @@ namespace sofa::gpu::cuda { -extern "C" -{ - void ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems); - - void ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - float kFactor); - - void ElementLinearSmallStrainFEMForceFieldCuda3d_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems); - - void ElementLinearSmallStrainFEMForceFieldCuda3d_addDForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const 
void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - double kFactor); -} +template +void ElementLinearSmallStrainFEMForceFieldCuda_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems); + +template +void ElementLinearSmallStrainFEMForceFieldCuda_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + T kFactor); } // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl index 863511e951d..f27c06b92e7 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl @@ -161,6 +161,8 @@ void CudaElementLinearSmallStrainFEMForceField::addForce } using trait = sofa::component::solidmechanics::fem::elastic::trait; + constexpr auto nNodes = trait::NumberOfNodesInElement; + constexpr auto dim = trait::spatial_dimensions; VecDeriv& f = *d_f.beginEdit(); const VecCoord& x = d_x.getValue(); @@ -175,24 +177,12 @@ void CudaElementLinearSmallStrainFEMForceField::addForce const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(x.size()); - if constexpr (std::is_same_v) - { - gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3d_addForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - 
m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), - x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead()); - } - else - { - gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), - x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead()); - } + gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda_addForce( + nbElem, nbVertex, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), + x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); d_f.endEdit(); } @@ -214,6 +204,8 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc } using trait = sofa::component::solidmechanics::fem::elastic::trait; + constexpr auto nNodes = trait::NumberOfNodesInElement; + constexpr auto dim = trait::spatial_dimensions; VecDeriv& df = *d_df.beginEdit(); const VecDeriv& dx = d_dx.getValue(); @@ -229,24 +221,12 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(dx.size()); - if constexpr (std::is_same_v) - { - gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3d_addDForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), - dx.deviceRead(), df.deviceWrite(), - m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), - kFactor); - } - else - { - gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), - dx.deviceRead(), df.deviceWrite(), - m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), - kFactor); - } + 
gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda_addDForce( + nbElem, nbVertex, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), + dx.deviceRead(), df.deviceWrite(), + m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), + kFactor); d_df.endEdit(); } From abb676c1b490dd923ac80e64d6d7f1229b7b6cea Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Thu, 9 Apr 2026 08:10:17 +0900 Subject: [PATCH 17/21] dont compile double version if SOFA_GPU_CUDA_DOUBLE is not enabled --- .../fem/elastic/CudaElementCorotationalFEMForceField.cpp | 4 ++++ .../fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp index 55a46c00669..5cd43daa6d2 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp @@ -36,11 +36,13 @@ template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +#ifdef SOFA_GPU_CUDA_DOUBLE template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +#endif } // namespace sofa::component::solidmechanics::fem::elastic @@ -72,6 +74,7 @@ void 
registerElementCorotationalFEMForceField(sofa::core::ObjectFactory* factory .add< CudaElementCorotationalFEMForceField >() ); +#ifdef SOFA_GPU_CUDA_DOUBLE factory->registerObjects(sofa::core::ObjectRegistrationData( "Supports GPU-side computations using CUDA (double) for EdgeCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() @@ -92,6 +95,7 @@ void registerElementCorotationalFEMForceField(sofa::core::ObjectFactory* factory "Supports GPU-side computations using CUDA (double) for HexahedronCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() ); +#endif } } // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp index b46f90d06d4..d8d3b9ef1c3 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp @@ -36,11 +36,13 @@ template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField< template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +#ifdef SOFA_GPU_CUDA_DOUBLE template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +#endif } // namespace 
sofa::component::solidmechanics::fem::elastic @@ -72,6 +74,7 @@ void registerElementLinearSmallStrainFEMForceField(sofa::core::ObjectFactory* fa .add< CudaElementLinearSmallStrainFEMForceField >() ); +#ifdef SOFA_GPU_CUDA_DOUBLE factory->registerObjects(sofa::core::ObjectRegistrationData( "Supports GPU-side computations using CUDA (double) for EdgeLinearSmallStrainFEMForceField") .add< CudaElementLinearSmallStrainFEMForceField >() @@ -92,6 +95,7 @@ void registerElementLinearSmallStrainFEMForceField(sofa::core::ObjectFactory* fa "Supports GPU-side computations using CUDA (double) for HexahedronLinearSmallStrainFEMForceField") .add< CudaElementLinearSmallStrainFEMForceField >() ); +#endif } } // namespace sofa::gpu::cuda From e41d0b69300580da430bb878199069d7b845bb71 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Thu, 9 Apr 2026 08:17:32 +0900 Subject: [PATCH 18/21] use w accessors --- .../CudaElementCorotationalFEMForceField.inl | 15 +++++---------- .../CudaElementLinearSmallStrainFEMForceField.inl | 12 ++++-------- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index 2359244539e..b6da2b45a2d 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -263,7 +263,7 @@ void CudaElementCorotationalFEMForceField::addForce( const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(x.size()); - VecDeriv& f = *d_f.beginEdit(); + auto f = sofa::helper::getWriteOnlyAccessor(d_f); if (f.size() < x.size()) f.resize(x.size()); @@ -275,11 +275,10 @@ void 
CudaElementCorotationalFEMForceField::addForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuInitialRotationsTransposed.deviceRead(), m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), + f.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuRotations.deviceWrite(), m_gpuVelems.deviceRead()); m_gpuRotationsUploaded = true; - d_f.endEdit(); return; } } @@ -292,10 +291,8 @@ void CudaElementCorotationalFEMForceField::addForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), + f.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead()); - - d_f.endEdit(); } template @@ -318,7 +315,7 @@ void CudaElementCorotationalFEMForceField::addDForce( constexpr auto nNodes = trait::NumberOfNodesInElement; constexpr auto dim = trait::spatial_dimensions; - VecDeriv& df = *d_df.beginEdit(); + auto df = sofa::helper::getWriteOnlyAccessor(d_df); const VecDeriv& dx = d_dx.getValue(); if (df.size() < dx.size()) @@ -336,10 +333,8 @@ void CudaElementCorotationalFEMForceField::addDForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), m_gpuStiffness.deviceRead(), dx.deviceRead(), - df.deviceWrite(), m_gpuElementForce.deviceWrite(), + df.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), kFactor); - - d_df.endEdit(); } } // namespace sofa::component::solidmechanics::fem::elastic diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl index f27c06b92e7..97b6066aa4c 100644 --- 
a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl @@ -164,7 +164,7 @@ void CudaElementLinearSmallStrainFEMForceField::addForce constexpr auto nNodes = trait::NumberOfNodesInElement; constexpr auto dim = trait::spatial_dimensions; - VecDeriv& f = *d_f.beginEdit(); + auto f = sofa::helper::getWriteOnlyAccessor(d_f); const VecCoord& x = d_x.getValue(); if (f.size() < x.size()) @@ -181,10 +181,8 @@ void CudaElementLinearSmallStrainFEMForceField::addForce nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), + f.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead()); - - d_f.endEdit(); } template @@ -207,7 +205,7 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc constexpr auto nNodes = trait::NumberOfNodesInElement; constexpr auto dim = trait::spatial_dimensions; - VecDeriv& df = *d_df.beginEdit(); + auto df = sofa::helper::getWriteOnlyAccessor(d_df); const VecDeriv& dx = d_dx.getValue(); if (df.size() < dx.size()) @@ -224,11 +222,9 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda_addDForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), - dx.deviceRead(), df.deviceWrite(), + dx.deviceRead(), df.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), kFactor); - - d_df.endEdit(); } } // namespace sofa::component::solidmechanics::fem::elastic From 05c54101ae277a4c7801c03541d29098303ae40f Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Fri, 10 Apr 2026 11:16:48 +0900 Subject: [PATCH 19/21] Revert "use w accessors" This reverts commit 
e41d0b69300580da430bb878199069d7b845bb71. --- .../CudaElementCorotationalFEMForceField.inl | 15 ++++++++++----- .../CudaElementLinearSmallStrainFEMForceField.inl | 12 ++++++++---- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index b6da2b45a2d..2359244539e 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -263,7 +263,7 @@ void CudaElementCorotationalFEMForceField::addForce( const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(x.size()); - auto f = sofa::helper::getWriteOnlyAccessor(d_f); + VecDeriv& f = *d_f.beginEdit(); if (f.size() < x.size()) f.resize(x.size()); @@ -275,10 +275,11 @@ void CudaElementCorotationalFEMForceField::addForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuInitialRotationsTransposed.deviceRead(), m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuRotations.deviceWrite(), m_gpuVelems.deviceRead()); m_gpuRotationsUploaded = true; + d_f.endEdit(); return; } } @@ -291,8 +292,10 @@ void CudaElementCorotationalFEMForceField::addForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead()); + + d_f.endEdit(); } template @@ 
-315,7 +318,7 @@ void CudaElementCorotationalFEMForceField::addDForce( constexpr auto nNodes = trait::NumberOfNodesInElement; constexpr auto dim = trait::spatial_dimensions; - auto df = sofa::helper::getWriteOnlyAccessor(d_df); + VecDeriv& df = *d_df.beginEdit(); const VecDeriv& dx = d_dx.getValue(); if (df.size() < dx.size()) @@ -333,8 +336,10 @@ void CudaElementCorotationalFEMForceField::addDForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), m_gpuStiffness.deviceRead(), dx.deviceRead(), - df.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), + df.deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), kFactor); + + d_df.endEdit(); } } // namespace sofa::component::solidmechanics::fem::elastic diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl index 97b6066aa4c..f27c06b92e7 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl @@ -164,7 +164,7 @@ void CudaElementLinearSmallStrainFEMForceField::addForce constexpr auto nNodes = trait::NumberOfNodesInElement; constexpr auto dim = trait::spatial_dimensions; - auto f = sofa::helper::getWriteOnlyAccessor(d_f); + VecDeriv& f = *d_f.beginEdit(); const VecCoord& x = d_x.getValue(); if (f.size() < x.size()) @@ -181,8 +181,10 @@ void CudaElementLinearSmallStrainFEMForceField::addForce nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), + f.deviceWrite(), 
m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead()); + + d_f.endEdit(); } template @@ -205,7 +207,7 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc constexpr auto nNodes = trait::NumberOfNodesInElement; constexpr auto dim = trait::spatial_dimensions; - auto df = sofa::helper::getWriteOnlyAccessor(d_df); + VecDeriv& df = *d_df.beginEdit(); const VecDeriv& dx = d_dx.getValue(); if (df.size() < dx.size()) @@ -222,9 +224,11 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda_addDForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), - dx.deviceRead(), df.wref().deviceWrite(), + dx.deviceRead(), df.deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), kFactor); + + d_df.endEdit(); } } // namespace sofa::component::solidmechanics::fem::elastic From c6721d4721a0adf7c789c3ad3d872061a31ccc39 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Fri, 10 Apr 2026 11:20:37 +0900 Subject: [PATCH 20/21] refactor cuda code --- .../plugins/SofaCUDA/Component/CMakeLists.txt | 1 + .../CudaElementCorotationalFEMForceField.cu | 322 +++--------------- .../fem/elastic/CudaElementFEMKernelUtils.cuh | 254 ++++++++++++++ ...daElementLinearSmallStrainFEMForceField.cu | 213 ++---------- 4 files changed, 331 insertions(+), 459 deletions(-) create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh diff --git a/applications/plugins/SofaCUDA/Component/CMakeLists.txt b/applications/plugins/SofaCUDA/Component/CMakeLists.txt index 5ac492c4834..ce4d885c90b 100644 --- a/applications/plugins/SofaCUDA/Component/CMakeLists.txt +++ b/applications/plugins/SofaCUDA/Component/CMakeLists.txt @@ -39,6 +39,7 @@ set(HEADER_FILES ### solidmechanics + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh 
${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index 8cb90f8c540..3125446e0a6 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -21,210 +21,13 @@ ******************************************************************************/ #include #include -#include +#include "CudaElementFEMKernelUtils.cuh" -namespace sofa +namespace sofa::gpu::cuda { -namespace gpu -{ -namespace cuda -{ - -template -__device__ T myRsqrt(T x); -template<> __device__ float myRsqrt(float x) { return rsqrtf(x); } -template<> __device__ double myRsqrt(double x) { return rsqrt(x); } - -/** - * Device helper: 3x3 matrix multiply C = A * B (row-major) - */ -template -__device__ void mat3Mul(const T* A, const T* B, T* C) -{ - #pragma unroll - for (int i = 0; i < 3; ++i) - { - #pragma unroll - for (int j = 0; j < 3; ++j) - { - C[i * 3 + j] = A[i * 3 + 0] * B[0 * 3 + j] - + A[i * 3 + 1] * B[1 * 3 + j] - + A[i * 3 + 2] * B[2 * 3 + j]; - } - } -} - -/** - * Device helper: C = A * B^T (row-major) - */ -template -__device__ void mat3MulTranspose(const T* A, const T* BT, T* C) -{ - #pragma unroll - for (int i = 0; i < 3; ++i) - { - #pragma unroll - for (int j = 0; j < 3; ++j) - { - C[i * 3 + j] = A[i * 3 
+ 0] * BT[j * 3 + 0] - + A[i * 3 + 1] * BT[j * 3 + 1] - + A[i * 3 + 2] * BT[j * 3 + 2]; - } - } -} - -/** - * Device helper: C = A^T * B (row-major) - * Matches SOFA's Mat::multTranspose(B) which computes this^T * B. - */ -template -__device__ void mat3TransposeMul(const T* A, const T* B, T* C) -{ - #pragma unroll - for (int i = 0; i < 3; ++i) - { - #pragma unroll - for (int j = 0; j < 3; ++j) - { - C[i * 3 + j] = A[0 * 3 + i] * B[0 * 3 + j] - + A[1 * 3 + i] * B[1 * 3 + j] - + A[2 * 3 + i] * B[2 * 3 + j]; - } - } -} - -/** - * Device helper: compute rotation frame from first 3 nodes (TriangleRotation). - */ -template -__device__ void computeTriangleFrame(const T* ex, T* frame) -{ - T ax = ex[3] - ex[0], ay = ex[4] - ex[1], az = ex[5] - ex[2]; - T invLen = myRsqrt(ax * ax + ay * ay + az * az); - ax *= invLen; ay *= invLen; az *= invLen; - - T bx = ex[6] - ex[0], by = ex[7] - ex[1], bz = ex[8] - ex[2]; - - T cx = ay * bz - az * by; - T cy = az * bx - ax * bz; - T cz = ax * by - ay * bx; - invLen = myRsqrt(cx * cx + cy * cy + cz * cz); - cx *= invLen; cy *= invLen; cz *= invLen; - - bx = cy * az - cz * ay; - by = cz * ax - cx * az; - bz = cx * ay - cy * ax; - - frame[0] = ax; frame[1] = ay; frame[2] = az; - frame[3] = bx; frame[4] = by; frame[5] = bz; - frame[6] = cx; frame[7] = cy; frame[8] = cz; -} - -/** - * Device helper: compute rotation frame from 8 hexahedron nodes (HexahedronRotation). 
- */ -template -__device__ void computeHexahedronFrame(const T* ex, T* frame) -{ - const T quarter = T(0.25); - - T ax = ((ex[1*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[3*3+0]) - + (ex[5*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[7*3+0])) * quarter; - T ay = ((ex[1*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[3*3+1]) - + (ex[5*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[7*3+1])) * quarter; - T az = ((ex[1*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[3*3+2]) - + (ex[5*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[7*3+2])) * quarter; - - T bx = ((ex[3*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[1*3+0]) - + (ex[7*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[5*3+0])) * quarter; - T by = ((ex[3*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[1*3+1]) - + (ex[7*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[5*3+1])) * quarter; - T bz = ((ex[3*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[1*3+2]) - + (ex[7*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[5*3+2])) * quarter; - - T invLen = myRsqrt(ax * ax + ay * ay + az * az); - ax *= invLen; ay *= invLen; az *= invLen; - - T cx = ay * bz - az * by; - T cy = az * bx - ax * bz; - T cz = ax * by - ay * bx; - invLen = myRsqrt(cx * cx + cy * cy + cz * cz); - cx *= invLen; cy *= invLen; cz *= invLen; - - bx = cy * az - cz * ay; - by = cz * ax - cx * az; - bz = cx * ay - cy * ax; - - frame[0] = ax; frame[1] = ay; frame[2] = az; - frame[3] = bx; frame[4] = by; frame[5] = bz; - frame[6] = cx; frame[7] = cy; frame[8] = cz; -} - -/** - * Symmetric block-matrix multiply: out = K * in - * Templated on Dim for generic spatial dimensions. 
- */ -template -__device__ void symBlockMatMul(const T* K, const T* in, T* out) -{ - #pragma unroll - for (int i = 0; i < NNodes * Dim; ++i) - out[i] = T(0); - - #pragma unroll - for (int ni = 0; ni < NNodes; ++ni) - { - const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; - - // Diagonal block - { - const T* Kii = K + diagIdx * Dim * Dim; - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += Kii[di * Dim + dj] * in[ni * Dim + dj]; - out[ni * Dim + di] += sum; - } - } - - // Off-diagonal blocks - #pragma unroll - for (int nj = ni + 1; nj < NNodes; ++nj) - { - const int symIdx = diagIdx + (nj - ni); - const T* Kij = K + symIdx * Dim * Dim; - - // Kij * in_j -> out_i - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += Kij[di * Dim + dj] * in[nj * Dim + dj]; - out[ni * Dim + di] += sum; - } - - // Kij^T * in_i -> out_j - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - { - T sum = T(0); - #pragma unroll - for (int di = 0; di < Dim; ++di) - sum += Kij[di * Dim + dj] * in[ni * Dim + di]; - out[nj * Dim + dj] += sum; - } - } - } -} /** * Combined kernel: compute rotations AND per-element forces in one pass. - * Rotation computation is inherently 3D (cross products). 
*/ template __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel( @@ -263,7 +66,6 @@ __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel else computeTriangleFrame(ex, frame); - // R = frame^T * initRot const T* irt = initRotTransposed + elemId * Dim * Dim; T R[Dim * Dim]; mat3TransposeMul(frame, irt, R); @@ -301,7 +103,6 @@ __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel #pragma unroll for (int n = 0; n < NNodes; ++n) { - // R^T * (x_n - center) T diff[Dim]; #pragma unroll for (int d = 0; d < Dim; ++d) @@ -326,7 +127,6 @@ __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel #pragma unroll for (int n = 0; n < NNodes; ++n) { - // R * edf_n, negated #pragma unroll for (int di = 0; di < Dim; ++di) { @@ -340,7 +140,7 @@ __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel } /** - * Kernel for addForce: Compute per-element force (1 thread per element). + * Kernel for addForce with pre-computed rotations. */ template __global__ void ElementCorotationalFEMForceField_computeForce_kernel( @@ -442,7 +242,7 @@ __global__ void ElementCorotationalFEMForceField_computeForce_kernel( } /** - * Kernel for addDForce: Compute per-element dForce (1 thread per element). + * Kernel for addDForce. 
*/ template __global__ void ElementCorotationalFEMForceField_computeDForce_kernel( @@ -465,7 +265,6 @@ __global__ void ElementCorotationalFEMForceField_computeDForce_kernel( for (int i = 0; i < Dim * Dim; ++i) R[i] = Rptr[i]; - // R^T * dx for each node T rdx[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) @@ -491,7 +290,6 @@ __global__ void ElementCorotationalFEMForceField_computeDForce_kernel( T edf[NNodes * Dim]; symBlockMatMul(K, rdx, edf); - // R * edf, scaled by -kFactor T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) @@ -508,41 +306,7 @@ __global__ void ElementCorotationalFEMForceField_computeDForce_kernel( } } -/** - * Gather per-vertex forces (1 thread per vertex). - */ -template -__global__ void ElementCorotationalFEMForceField_gatherForce_kernel( - int nbVertex, - int maxElemPerVertex, - const int* __restrict__ velems, - const T* __restrict__ eforce, - T* df) -{ - const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; - if (vertexId >= nbVertex) return; - - T acc[Dim]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - acc[d] = T(0); - - for (int s = 0; s < maxElemPerVertex; ++s) - { - const int idx = velems[s * nbVertex + vertexId]; - if (idx == 0) break; - const int base = (idx - 1) * Dim; - #pragma unroll - for (int d = 0; d < Dim; ++d) - acc[d] += eforce[base + d]; - } - - #pragma unroll - for (int d = 0; d < Dim; ++d) - df[vertexId * Dim + d] += acc[d]; -} - -// ===================== Launch functions (C++ templates) ===================== +// ===================== Launch functions ===================== template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations( @@ -575,14 +339,14 @@ void ElementCorotationalFEMForceFieldCuda_addForceWithRotations( const int gatherThreads = 256; numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementCorotationalFEMForceField_gatherForce_kernel + ElementFEM_gatherForce_kernel <<>>( nbVertex, maxElemPerVertex, (const int*)velems, 
(const T*)eforce, (T*)f); - mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); + mycudaDebugError("ElementFEM_gatherForce_kernel"); } template @@ -614,14 +378,14 @@ void ElementCorotationalFEMForceFieldCuda_addForce( const int gatherThreads = 256; numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementCorotationalFEMForceField_gatherForce_kernel + ElementFEM_gatherForce_kernel <<>>( nbVertex, maxElemPerVertex, (const int*)velems, (const T*)eforce, (T*)f); - mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); + mycudaDebugError("ElementFEM_gatherForce_kernel"); } template @@ -653,46 +417,48 @@ void ElementCorotationalFEMForceFieldCuda_addDForce( const int gatherThreads = 256; numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementCorotationalFEMForceField_gatherForce_kernel + ElementFEM_gatherForce_kernel <<>>( nbVertex, maxElemPerVertex, (const int*)velems, (const T*)eforce, (T*)df); - mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); + mycudaDebugError("ElementFEM_gatherForce_kernel"); } // ===================== Explicit template instantiations ===================== -// addForceWithRotations: only NNodes >= 3 (triangle/quad/hex rotation methods) -template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, 
unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); - -// addForce: all element types -template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void 
ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); - -// addDForce: all element types -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); - -} // namespace cuda -} // namespace gpu -} // namespace sofa +#define INSTANTIATE_COROTATIONAL(T, NNodes) \ + template void ElementCorotationalFEMForceFieldCuda_addForce( \ + unsigned int, unsigned int, unsigned int, const void*, const void*, \ + 
const void*, const void*, const void*, void*, void*, const void*); \ + template void ElementCorotationalFEMForceFieldCuda_addDForce( \ + unsigned int, unsigned int, unsigned int, const void*, const void*, \ + const void*, const void*, void*, void*, const void*, T); + +#define INSTANTIATE_COROTATIONAL_WITH_ROTATIONS(T, NNodes) \ + template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations( \ + unsigned int, unsigned int, unsigned int, const void*, const void*, \ + const void*, const void*, const void*, void*, void*, void*, const void*); + +INSTANTIATE_COROTATIONAL(float, 2) +INSTANTIATE_COROTATIONAL(float, 3) +INSTANTIATE_COROTATIONAL(float, 4) +INSTANTIATE_COROTATIONAL(float, 8) +INSTANTIATE_COROTATIONAL_WITH_ROTATIONS(float, 3) +INSTANTIATE_COROTATIONAL_WITH_ROTATIONS(float, 4) +INSTANTIATE_COROTATIONAL_WITH_ROTATIONS(float, 8) + +INSTANTIATE_COROTATIONAL(double, 2) +INSTANTIATE_COROTATIONAL(double, 3) +INSTANTIATE_COROTATIONAL(double, 4) +INSTANTIATE_COROTATIONAL(double, 8) +INSTANTIATE_COROTATIONAL_WITH_ROTATIONS(double, 3) +INSTANTIATE_COROTATIONAL_WITH_ROTATIONS(double, 4) +INSTANTIATE_COROTATIONAL_WITH_ROTATIONS(double, 8) + +#undef INSTANTIATE_COROTATIONAL +#undef INSTANTIATE_COROTATIONAL_WITH_ROTATIONS + +} // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh new file mode 100644 index 00000000000..605e7773baa --- /dev/null +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh @@ -0,0 +1,254 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * 
+* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . * +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#pragma once + +#include + +namespace sofa::gpu::cuda +{ + +//============================================================================= +// Math utilities +//============================================================================= + +template +__device__ inline T myRsqrt(T x); + +template<> +__device__ inline float myRsqrt(float x) { return rsqrtf(x); } + +template<> +__device__ inline double myRsqrt(double x) { return rsqrt(x); } + +//============================================================================= +// 3x3 Matrix operations (row-major) +//============================================================================= + +template +__device__ inline void mat3Mul(const T* A, const T* B, T* C) +{ + #pragma unroll + for (int i = 0; i < 3; ++i) + { + #pragma unroll + for (int j = 0; j < 3; ++j) + { + C[i * 3 + j] = A[i * 3 + 0] * B[0 * 3 + j] + + A[i * 3 + 1] * B[1 * 3 + j] + + A[i * 3 + 2] * B[2 * 3 + j]; + } + } +} + +template +__device__ inline void mat3MulTranspose(const T* A, const T* BT, T* C) +{ + #pragma unroll + for (int i = 0; i < 3; ++i) + { + #pragma unroll + for (int j = 0; j < 
3; ++j) + { + C[i * 3 + j] = A[i * 3 + 0] * BT[j * 3 + 0] + + A[i * 3 + 1] * BT[j * 3 + 1] + + A[i * 3 + 2] * BT[j * 3 + 2]; + } + } +} + +template +__device__ inline void mat3TransposeMul(const T* A, const T* B, T* C) +{ + #pragma unroll + for (int i = 0; i < 3; ++i) + { + #pragma unroll + for (int j = 0; j < 3; ++j) + { + C[i * 3 + j] = A[0 * 3 + i] * B[0 * 3 + j] + + A[1 * 3 + i] * B[1 * 3 + j] + + A[2 * 3 + i] * B[2 * 3 + j]; + } + } +} + +//============================================================================= +// Rotation frame computation +//============================================================================= + +template +__device__ inline void computeTriangleFrame(const T* ex, T* frame) +{ + T ax = ex[3] - ex[0], ay = ex[4] - ex[1], az = ex[5] - ex[2]; + T invLen = myRsqrt(ax * ax + ay * ay + az * az); + ax *= invLen; ay *= invLen; az *= invLen; + + T bx = ex[6] - ex[0], by = ex[7] - ex[1], bz = ex[8] - ex[2]; + + T cx = ay * bz - az * by; + T cy = az * bx - ax * bz; + T cz = ax * by - ay * bx; + invLen = myRsqrt(cx * cx + cy * cy + cz * cz); + cx *= invLen; cy *= invLen; cz *= invLen; + + bx = cy * az - cz * ay; + by = cz * ax - cx * az; + bz = cx * ay - cy * ax; + + frame[0] = ax; frame[1] = ay; frame[2] = az; + frame[3] = bx; frame[4] = by; frame[5] = bz; + frame[6] = cx; frame[7] = cy; frame[8] = cz; +} + +template +__device__ inline void computeHexahedronFrame(const T* ex, T* frame) +{ + const T quarter = T(0.25); + + T ax = ((ex[1*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[3*3+0]) + + (ex[5*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[7*3+0])) * quarter; + T ay = ((ex[1*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[3*3+1]) + + (ex[5*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[7*3+1])) * quarter; + T az = ((ex[1*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[3*3+2]) + + (ex[5*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[7*3+2])) * quarter; + + T bx = ((ex[3*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[1*3+0]) + + (ex[7*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[5*3+0])) * quarter; + T by = ((ex[3*3+1] - 
ex[0*3+1]) + (ex[2*3+1] - ex[1*3+1]) + + (ex[7*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[5*3+1])) * quarter; + T bz = ((ex[3*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[1*3+2]) + + (ex[7*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[5*3+2])) * quarter; + + T invLen = myRsqrt(ax * ax + ay * ay + az * az); + ax *= invLen; ay *= invLen; az *= invLen; + + T cx = ay * bz - az * by; + T cy = az * bx - ax * bz; + T cz = ax * by - ay * bx; + invLen = myRsqrt(cx * cx + cy * cy + cz * cz); + cx *= invLen; cy *= invLen; cz *= invLen; + + bx = cy * az - cz * ay; + by = cz * ax - cx * az; + bz = cx * ay - cy * ax; + + frame[0] = ax; frame[1] = ay; frame[2] = az; + frame[3] = bx; frame[4] = by; frame[5] = bz; + frame[6] = cx; frame[7] = cy; frame[8] = cz; +} + +//============================================================================= +// Symmetric block-matrix multiply +//============================================================================= + +template +__device__ inline void symBlockMatMul(const T* K, const T* in, T* out) +{ + #pragma unroll + for (int i = 0; i < NNodes * Dim; ++i) + out[i] = T(0); + + #pragma unroll + for (int ni = 0; ni < NNodes; ++ni) + { + const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; + + // Diagonal block + { + const T* Kii = K + diagIdx * Dim * Dim; + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kii[di * Dim + dj] * in[ni * Dim + dj]; + out[ni * Dim + di] += sum; + } + } + + // Off-diagonal blocks + #pragma unroll + for (int nj = ni + 1; nj < NNodes; ++nj) + { + const int symIdx = diagIdx + (nj - ni); + const T* Kij = K + symIdx * Dim * Dim; + + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kij[di * Dim + dj] * in[nj * Dim + dj]; + out[ni * Dim + di] += sum; + } + + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + { + T sum = T(0); + #pragma unroll + for (int di = 0; di < Dim; ++di) + 
sum += Kij[di * Dim + dj] * in[ni * Dim + di]; + out[nj * Dim + dj] += sum; + } + } + } +} + +//============================================================================= +// Gather kernel +//============================================================================= + +template +__global__ void ElementFEM_gatherForce_kernel( + int nbVertex, + int maxElemPerVertex, + const int* __restrict__ velems, + const T* __restrict__ eforce, + T* df) +{ + const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; + if (vertexId >= nbVertex) return; + + T acc[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + acc[d] = T(0); + + for (int s = 0; s < maxElemPerVertex; ++s) + { + const int idx = velems[s * nbVertex + vertexId]; + if (idx == 0) break; + const int base = (idx - 1) * Dim; + #pragma unroll + for (int d = 0; d < Dim; ++d) + acc[d] += eforce[base + d]; + } + + #pragma unroll + for (int d = 0; d < Dim; ++d) + df[vertexId * Dim + d] += acc[d]; +} + +} // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu index e8492615c32..10b04b98193 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu @@ -21,21 +21,13 @@ ******************************************************************************/ #include #include -#include +#include "CudaElementFEMKernelUtils.cuh" -namespace sofa -{ -namespace gpu -{ -namespace cuda +namespace sofa::gpu::cuda { /** - * Kernel for addForce: Compute per-element force from displacement (1 thread per element). 
- * - * f = -K * (x - x0) - * Templated on NNodes and Dim (compile-time) for full loop unrolling. - * Templated on T for float/double support. + * Kernel for addForce: f = -K * (x - x0) */ template __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( @@ -51,7 +43,6 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - // Gather displacement = x - x0 for this element's nodes T disp[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) @@ -62,65 +53,10 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( disp[n * Dim + d] = x[nodeId * Dim + d] - x0[nodeId * Dim + d]; } - // Symmetric block-matrix multiply: edf = K * disp const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; T edf[NNodes * Dim]; + symBlockMatMul(K, disp, edf); - #pragma unroll - for (int i = 0; i < NNodes * Dim; ++i) - edf[i] = T(0); - - #pragma unroll - for (int ni = 0; ni < NNodes; ++ni) - { - const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; - - // Diagonal block - { - const T* Kii = K + diagIdx * Dim * Dim; - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += Kii[di * Dim + dj] * disp[ni * Dim + dj]; - edf[ni * Dim + di] += sum; - } - } - - // Off-diagonal blocks - #pragma unroll - for (int nj = ni + 1; nj < NNodes; ++nj) - { - const int symIdx = diagIdx + (nj - ni); - const T* Kij = K + symIdx * Dim * Dim; - - // Kij * disp_j -> edf_i - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += Kij[di * Dim + dj] * disp[nj * Dim + dj]; - edf[ni * Dim + di] += sum; - } - - // Kij^T * disp_i -> edf_j - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - { - T sum = T(0); - #pragma unroll - for (int di = 0; di < Dim; ++di) - sum += Kij[di * Dim + dj] * disp[ni * Dim + di]; - 
edf[nj * Dim + dj] += sum; - } - } - } - - // Write: eforce = -edf (minus sign from f -= K * displacement) T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) @@ -132,9 +68,7 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( } /** - * Kernel for addDForce: Compute per-element dForce (1 thread per element). - * - * df = -kFactor * K * dx + * Kernel for addDForce: df = -kFactor * K * dx */ template __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( @@ -150,7 +84,6 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - // Gather dx for this element's nodes T edx[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) @@ -161,61 +94,10 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( edx[n * Dim + d] = dx[nodeId * Dim + d]; } - // Symmetric block-matrix multiply: edf = K * edx const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; T edf[NNodes * Dim]; + symBlockMatMul(K, edx, edf); - #pragma unroll - for (int i = 0; i < NNodes * Dim; ++i) - edf[i] = T(0); - - #pragma unroll - for (int ni = 0; ni < NNodes; ++ni) - { - const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; - - { - const T* Kii = K + diagIdx * Dim * Dim; - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += Kii[di * Dim + dj] * edx[ni * Dim + dj]; - edf[ni * Dim + di] += sum; - } - } - - #pragma unroll - for (int nj = ni + 1; nj < NNodes; ++nj) - { - const int symIdx = diagIdx + (nj - ni); - const T* Kij = K + symIdx * Dim * Dim; - - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += Kij[di * Dim + dj] * edx[nj * Dim + dj]; - edf[ni * Dim + di] += sum; - } - - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - 
{ - T sum = T(0); - #pragma unroll - for (int di = 0; di < Dim; ++di) - sum += Kij[di * Dim + dj] * edx[ni * Dim + di]; - edf[nj * Dim + dj] += sum; - } - } - } - - // Write: eforce = -kFactor * edf T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) @@ -226,39 +108,7 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( } } -/** - * Gather per-vertex forces (1 thread per vertex). - */ -template -__global__ void ElementLinearSmallStrainFEMForceField_gatherForce_kernel( - int nbVertex, - int maxElemPerVertex, - const int* __restrict__ velems, - const T* __restrict__ eforce, - T* df) -{ - const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; - if (vertexId >= nbVertex) return; - - T acc[Dim]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - acc[d] = T(0); - - for (int s = 0; s < maxElemPerVertex; ++s) - { - const int idx = velems[s * nbVertex + vertexId]; - if (idx == 0) break; - const int base = (idx - 1) * Dim; - #pragma unroll - for (int d = 0; d < Dim; ++d) - acc[d] += eforce[base + d]; - } - - #pragma unroll - for (int d = 0; d < Dim; ++d) - df[vertexId * Dim + d] += acc[d]; -} +// ===================== Launch functions ===================== template void ElementLinearSmallStrainFEMForceFieldCuda_addForce( @@ -287,14 +137,14 @@ void ElementLinearSmallStrainFEMForceFieldCuda_addForce( const int gatherThreads = 256; numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementLinearSmallStrainFEMForceField_gatherForce_kernel + ElementFEM_gatherForce_kernel <<>>( nbVertex, maxElemPerVertex, (const int*)velems, (const T*)eforce, (T*)f); - mycudaDebugError("ElementLinearSmallStrainFEMForceField_gatherForce_kernel"); + mycudaDebugError("ElementFEM_gatherForce_kernel"); } template @@ -324,35 +174,36 @@ void ElementLinearSmallStrainFEMForceFieldCuda_addDForce( const int gatherThreads = 256; numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - 
ElementLinearSmallStrainFEMForceField_gatherForce_kernel + ElementFEM_gatherForce_kernel <<>>( nbVertex, maxElemPerVertex, (const int*)velems, (const T*)eforce, (T*)df); - mycudaDebugError("ElementLinearSmallStrainFEMForceField_gatherForce_kernel"); + mycudaDebugError("ElementFEM_gatherForce_kernel"); } -// Explicit template instantiations for all supported (T, NNodes, Dim) combinations -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); +// ===================== Explicit template instantiations ===================== + +#define 
INSTANTIATE_LINEAR(T, NNodes) \ + template void ElementLinearSmallStrainFEMForceFieldCuda_addForce( \ + unsigned int, unsigned int, unsigned int, const void*, const void*, \ + const void*, const void*, void*, void*, const void*); \ + template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce( \ + unsigned int, unsigned int, unsigned int, const void*, const void*, \ + const void*, void*, void*, const void*, T); + +INSTANTIATE_LINEAR(float, 2) +INSTANTIATE_LINEAR(float, 3) +INSTANTIATE_LINEAR(float, 4) +INSTANTIATE_LINEAR(float, 8) + +INSTANTIATE_LINEAR(double, 2) +INSTANTIATE_LINEAR(double, 3) +INSTANTIATE_LINEAR(double, 4) +INSTANTIATE_LINEAR(double, 8) -template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); -template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); -template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); -template void 
ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); +#undef INSTANTIATE_LINEAR -} // namespace cuda -} // namespace gpu -} // namespace sofa +} // namespace sofa::gpu::cuda From 6a2d39509a1c408f887fb370d3514e3b7222eeb8 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Fri, 10 Apr 2026 13:06:02 +0900 Subject: [PATCH 21/21] clarify code --- .../CudaElementCorotationalFEMForceField.cu | 194 +++---------- .../fem/elastic/CudaElementFEMKernelUtils.cuh | 264 ++++++++++++++++-- ...daElementLinearSmallStrainFEMForceField.cu | 40 +-- 3 files changed, 284 insertions(+), 214 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index 3125446e0a6..6616637dc55 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -42,101 +42,50 @@ __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel { static_assert(Dim == 3, "Corotational rotation computation requires Dim == 3"); constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; - const T invN = T(1) / T(NNodes); const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; + // Gather element positions T ex[NNodes * Dim], ex0[NNodes * Dim]; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - const int nodeId = elements[n * nbElem + elemId]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - ex[n * Dim + d] = x[nodeId * Dim + d]; - ex0[n * Dim + d] = x0[nodeId * Dim + d]; - } - } + gatherElementData(elements, 
nbElem, elemId, x, ex); + gatherElementData(elements, nbElem, elemId, x0, ex0); + // Compute rotation frame from current positions T frame[Dim * Dim]; if constexpr (NNodes == 8) computeHexahedronFrame(ex, frame); else computeTriangleFrame(ex, frame); + // R = frame^T * initRotTransposed const T* irt = initRotTransposed + elemId * Dim * Dim; T R[Dim * Dim]; mat3TransposeMul(frame, irt, R); + // Store rotation for later use T* Rout = rotationsOut + elemId * Dim * Dim; #pragma unroll for (int i = 0; i < Dim * Dim; ++i) Rout[i] = R[i]; + // Compute element centers T center[Dim], center0[Dim]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - center[d] = T(0); - center0[d] = T(0); - } - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - center[d] += ex[n * Dim + d]; - center0[d] += ex0[n * Dim + d]; - } - } - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - center[d] *= invN; - center0[d] *= invN; - } + computeElementCenter(ex, center); + computeElementCenter(ex0, center0); + // Compute corotational displacement T disp[NNodes * Dim]; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - T diff[Dim]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - diff[d] = ex[n * Dim + d] - center[d]; - - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T rotated = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - rotated += R[dj * Dim + di] * diff[dj]; - disp[n * Dim + di] = rotated - (ex0[n * Dim + di] - center0[di]); - } - } + computeCorotationalDisplacement(R, ex, ex0, center, center0, disp); + // Multiply by stiffness matrix T edf[NNodes * Dim]; const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; symBlockMatMul(K, disp, edf); + // Rotate forces back to global frame and negate T* out = eforce + elemId * NNodes * Dim; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; 
++dj) - sum += R[di * Dim + dj] * edf[n * Dim + dj]; - out[n * Dim + di] = -sum; - } - } + rotateAndWriteForce(R, edf, out, T(-1)); } /** @@ -153,92 +102,39 @@ __global__ void ElementCorotationalFEMForceField_computeForce_kernel( T* __restrict__ eforce) { constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; - const T invN = T(1) / T(NNodes); const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; + // Load rotation matrix const T* Rptr = rotations + elemId * Dim * Dim; T R[Dim * Dim]; #pragma unroll for (int i = 0; i < Dim * Dim; ++i) R[i] = Rptr[i]; + // Gather element positions T ex[NNodes * Dim], ex0[NNodes * Dim]; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - const int nodeId = elements[n * nbElem + elemId]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - ex[n * Dim + d] = x[nodeId * Dim + d]; - ex0[n * Dim + d] = x0[nodeId * Dim + d]; - } - } + gatherElementData(elements, nbElem, elemId, x, ex); + gatherElementData(elements, nbElem, elemId, x0, ex0); + // Compute element centers T center[Dim], center0[Dim]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - center[d] = T(0); - center0[d] = T(0); - } - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - center[d] += ex[n * Dim + d]; - center0[d] += ex0[n * Dim + d]; - } - } - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - center[d] *= invN; - center0[d] *= invN; - } + computeElementCenter(ex, center); + computeElementCenter(ex0, center0); + // Compute corotational displacement T disp[NNodes * Dim]; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - T diff[Dim]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - diff[d] = ex[n * Dim + d] - center[d]; - - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T rotated = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - rotated += R[dj * Dim + di] * diff[dj]; - disp[n * Dim + di] = rotated - (ex0[n * Dim + di] - center0[di]); - 
} - } + computeCorotationalDisplacement(R, ex, ex0, center, center0, disp); + // Multiply by stiffness matrix T edf[NNodes * Dim]; const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; symBlockMatMul(K, disp, edf); + // Rotate forces back to global frame and negate T* out = eforce + elemId * NNodes * Dim; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += R[di * Dim + dj] * edf[n * Dim + dj]; - out[n * Dim + di] = -sum; - } - } + rotateAndWriteForce(R, edf, out, T(-1)); } /** @@ -259,51 +155,25 @@ __global__ void ElementCorotationalFEMForceField_computeDForce_kernel( const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; + // Load rotation matrix const T* Rptr = rotations + elemId * Dim * Dim; T R[Dim * Dim]; #pragma unroll for (int i = 0; i < Dim * Dim; ++i) R[i] = Rptr[i]; + // Gather and rotate displacement: rdx = R^T * dx T rdx[NNodes * Dim]; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - const int nodeId = elements[n * nbElem + elemId]; - T nodeDx[Dim]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - nodeDx[d] = dx[nodeId * Dim + d]; - - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += R[dj * Dim + di] * nodeDx[dj]; - rdx[n * Dim + di] = sum; - } - } + rotateDisplacementTranspose(R, elements, nbElem, elemId, dx, rdx); + // Multiply by stiffness matrix const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; T edf[NNodes * Dim]; symBlockMatMul(K, rdx, edf); + // Rotate forces back to global frame and scale T* out = eforce + elemId * NNodes * Dim; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += R[di * Dim + dj] * edf[n * Dim + dj]; - out[n * Dim + di] = 
-kFactor * sum; - } - } + rotateAndWriteForce(R, edf, out, -kFactor); } // ===================== Launch functions ===================== diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh index 605e7773baa..4d65c2f7130 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh @@ -40,9 +40,10 @@ template<> __device__ inline double myRsqrt(double x) { return rsqrt(x); } //============================================================================= -// 3x3 Matrix operations (row-major) +// 3x3 Matrix operations (row-major storage) //============================================================================= +/// C = A * B template __device__ inline void mat3Mul(const T* A, const T* B, T* C) { @@ -59,6 +60,7 @@ __device__ inline void mat3Mul(const T* A, const T* B, T* C) } } +/// C = A * B^T template __device__ inline void mat3MulTranspose(const T* A, const T* BT, T* C) { @@ -75,6 +77,7 @@ __device__ inline void mat3MulTranspose(const T* A, const T* BT, T* C) } } +/// C = A^T * B template __device__ inline void mat3TransposeMul(const T* A, const T* B, T* C) { @@ -91,62 +94,100 @@ __device__ inline void mat3TransposeMul(const T* A, const T* B, T* C) } } +/// out = R * in (rotate a 3D vector) +template +__device__ inline void rotateVector(const T* R, const T* in, T* out) +{ + #pragma unroll + for (int i = 0; i < 3; ++i) + { + out[i] = R[i * 3 + 0] * in[0] + + R[i * 3 + 1] * in[1] + + R[i * 3 + 2] * in[2]; + } +} + +/// out = R^T * in (rotate a 3D vector by transpose) +template +__device__ inline void rotateVectorTranspose(const T* R, const T* in, T* out) +{ + #pragma 
unroll + for (int i = 0; i < 3; ++i) + { + out[i] = R[0 * 3 + i] * in[0] + + R[1 * 3 + i] * in[1] + + R[2 * 3 + i] * in[2]; + } +} + //============================================================================= // Rotation frame computation //============================================================================= +/// Compute rotation frame from first 3 nodes (for Triangle, Quad, Tetrahedron) template -__device__ inline void computeTriangleFrame(const T* ex, T* frame) +__device__ inline void computeTriangleFrame(const T* pos, T* frame) { - T ax = ex[3] - ex[0], ay = ex[4] - ex[1], az = ex[5] - ex[2]; + // X axis: normalized (p1 - p0) + T ax = pos[3] - pos[0], ay = pos[4] - pos[1], az = pos[5] - pos[2]; T invLen = myRsqrt(ax * ax + ay * ay + az * az); ax *= invLen; ay *= invLen; az *= invLen; - T bx = ex[6] - ex[0], by = ex[7] - ex[1], bz = ex[8] - ex[2]; + // Temp vector b = p2 - p0 + T bx = pos[6] - pos[0], by = pos[7] - pos[1], bz = pos[8] - pos[2]; + // Z axis: normalized cross(a, b) T cx = ay * bz - az * by; T cy = az * bx - ax * bz; T cz = ax * by - ay * bx; invLen = myRsqrt(cx * cx + cy * cy + cz * cz); cx *= invLen; cy *= invLen; cz *= invLen; + // Y axis: cross(z, x) bx = cy * az - cz * ay; by = cz * ax - cx * az; bz = cx * ay - cy * ax; + // Store row-major: frame[row][col] = frame[row * 3 + col] frame[0] = ax; frame[1] = ay; frame[2] = az; frame[3] = bx; frame[4] = by; frame[5] = bz; frame[6] = cx; frame[7] = cy; frame[8] = cz; } +/// Compute rotation frame from 8 hexahedron nodes template -__device__ inline void computeHexahedronFrame(const T* ex, T* frame) +__device__ inline void computeHexahedronFrame(const T* pos, T* frame) { const T quarter = T(0.25); - T ax = ((ex[1*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[3*3+0]) - + (ex[5*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[7*3+0])) * quarter; - T ay = ((ex[1*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[3*3+1]) - + (ex[5*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[7*3+1])) * quarter; - T az = ((ex[1*3+2] - ex[0*3+2]) + 
(ex[2*3+2] - ex[3*3+2]) - + (ex[5*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[7*3+2])) * quarter; - - T bx = ((ex[3*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[1*3+0]) - + (ex[7*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[5*3+0])) * quarter; - T by = ((ex[3*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[1*3+1]) - + (ex[7*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[5*3+1])) * quarter; - T bz = ((ex[3*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[1*3+2]) - + (ex[7*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[5*3+2])) * quarter; - + // Average X direction from 4 edge pairs + T ax = ((pos[1*3+0] - pos[0*3+0]) + (pos[2*3+0] - pos[3*3+0]) + + (pos[5*3+0] - pos[4*3+0]) + (pos[6*3+0] - pos[7*3+0])) * quarter; + T ay = ((pos[1*3+1] - pos[0*3+1]) + (pos[2*3+1] - pos[3*3+1]) + + (pos[5*3+1] - pos[4*3+1]) + (pos[6*3+1] - pos[7*3+1])) * quarter; + T az = ((pos[1*3+2] - pos[0*3+2]) + (pos[2*3+2] - pos[3*3+2]) + + (pos[5*3+2] - pos[4*3+2]) + (pos[6*3+2] - pos[7*3+2])) * quarter; + + // Average Y direction + T bx = ((pos[3*3+0] - pos[0*3+0]) + (pos[2*3+0] - pos[1*3+0]) + + (pos[7*3+0] - pos[4*3+0]) + (pos[6*3+0] - pos[5*3+0])) * quarter; + T by = ((pos[3*3+1] - pos[0*3+1]) + (pos[2*3+1] - pos[1*3+1]) + + (pos[7*3+1] - pos[4*3+1]) + (pos[6*3+1] - pos[5*3+1])) * quarter; + T bz = ((pos[3*3+2] - pos[0*3+2]) + (pos[2*3+2] - pos[1*3+2]) + + (pos[7*3+2] - pos[4*3+2]) + (pos[6*3+2] - pos[5*3+2])) * quarter; + + // Normalize X T invLen = myRsqrt(ax * ax + ay * ay + az * az); ax *= invLen; ay *= invLen; az *= invLen; + // Z = normalized cross(X, Y) T cx = ay * bz - az * by; T cy = az * bx - ax * bz; T cz = ax * by - ay * bx; invLen = myRsqrt(cx * cx + cy * cy + cz * cz); cx *= invLen; cy *= invLen; cz *= invLen; + // Y = cross(Z, X) bx = cy * az - cz * ay; by = cz * ax - cx * az; bz = cx * ay - cy * ax; @@ -156,13 +197,150 @@ __device__ inline void computeHexahedronFrame(const T* ex, T* frame) frame[6] = cx; frame[7] = cy; frame[8] = cz; } +//============================================================================= +// Element data gathering 
+//============================================================================= + +/// Gather positions for one element from global arrays (SoA layout) +template +__device__ inline void gatherElementData( + const int* elements, int nbElem, int elemId, + const T* globalData, + T* localData) +{ + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const int nodeId = elements[n * nbElem + elemId]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + localData[n * Dim + d] = globalData[nodeId * Dim + d]; + } +} + +/// Gather displacement (x - x0) for one element +template +__device__ inline void gatherElementDisplacement( + const int* elements, int nbElem, int elemId, + const T* x, const T* x0, + T* disp) +{ + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const int nodeId = elements[n * nbElem + elemId]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + disp[n * Dim + d] = x[nodeId * Dim + d] - x0[nodeId * Dim + d]; + } +} + +//============================================================================= +// Element center computation +//============================================================================= + +/// Compute center of element positions +template +__device__ inline void computeElementCenter(const T* pos, T* center) +{ + const T invN = T(1) / T(NNodes); + + #pragma unroll + for (int d = 0; d < Dim; ++d) + center[d] = T(0); + + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + #pragma unroll + for (int d = 0; d < Dim; ++d) + center[d] += pos[n * Dim + d]; + } + + #pragma unroll + for (int d = 0; d < Dim; ++d) + center[d] *= invN; +} + +//============================================================================= +// Corotational displacement computation +//============================================================================= + +/// Compute corotational displacement: disp = R^T * (x - center) - (x0 - center0) +template +__device__ inline void computeCorotationalDisplacement( + const T* R, + const T* x, const T* x0, + const 
T* center, const T* center0, + T* disp) +{ + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + // diff = x_n - center + T diff[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + diff[d] = x[n * Dim + d] - center[d]; + + // rotated = R^T * diff + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T rotated = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + rotated += R[dj * Dim + di] * diff[dj]; + disp[n * Dim + di] = rotated - (x0[n * Dim + di] - center0[di]); + } + } +} + +/// Compute R^T * dx for each node (for addDForce) +template +__device__ inline void rotateDisplacementTranspose( + const T* R, + const int* elements, int nbElem, int elemId, + const T* dx, + T* rdx) +{ + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const int nodeId = elements[n * nbElem + elemId]; + + T nodeDx[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + nodeDx[d] = dx[nodeId * Dim + d]; + + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += R[dj * Dim + di] * nodeDx[dj]; + rdx[n * Dim + di] = sum; + } + } +} + //============================================================================= // Symmetric block-matrix multiply //============================================================================= +/** + * Symmetric block-matrix multiply: out = K * in + * + * K is stored in upper-triangle block format: + * symIdx = ni * NNodes - ni*(ni-1)/2 + (nj - ni) for nj >= ni + * K[symIdx * Dim * Dim + di * Dim + dj] for each element + */ template __device__ inline void symBlockMatMul(const T* K, const T* in, T* out) { + // Initialize output to zero #pragma unroll for (int i = 0; i < NNodes * Dim; ++i) out[i] = T(0); @@ -172,7 +350,7 @@ __device__ inline void symBlockMatMul(const T* K, const T* in, T* out) { const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; - // Diagonal block + // Diagonal block: Kii * in_i -> out_i { const T* Kii = K + diagIdx * Dim * Dim; #pragma 
unroll @@ -186,13 +364,14 @@ __device__ inline void symBlockMatMul(const T* K, const T* in, T* out) } } - // Off-diagonal blocks + // Off-diagonal blocks (symmetric: Kij and Kij^T) #pragma unroll for (int nj = ni + 1; nj < NNodes; ++nj) { const int symIdx = diagIdx + (nj - ni); const T* Kij = K + symIdx * Dim * Dim; + // Kij * in_j -> out_i #pragma unroll for (int di = 0; di < Dim; ++di) { @@ -203,6 +382,7 @@ __device__ inline void symBlockMatMul(const T* K, const T* in, T* out) out[ni * Dim + di] += sum; } + // Kij^T * in_i -> out_j #pragma unroll for (int dj = 0; dj < Dim; ++dj) { @@ -217,9 +397,49 @@ __device__ inline void symBlockMatMul(const T* K, const T* in, T* out) } //============================================================================= -// Gather kernel +// Force output with rotation +//============================================================================= + +/// Rotate local forces to global frame and write: out = scale * R * localForce +template +__device__ inline void rotateAndWriteForce( + const T* R, + const T* localForce, + T* out, + T scale) +{ + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += R[di * Dim + dj] * localForce[n * Dim + dj]; + out[n * Dim + di] = scale * sum; + } + } +} + +/// Write negated force (for linear case without rotation): out = scale * localForce +template +__device__ inline void writeForce(const T* localForce, T* out, T scale) +{ + #pragma unroll + for (int i = 0; i < NNodes * Dim; ++i) + out[i] = scale * localForce[i]; +} + +//============================================================================= +// Gather kernel for accumulating per-vertex forces //============================================================================= +/** + * Gather per-vertex forces from per-element contributions. 
+ * velems[slot * nbVertex + vertexId] contains (elemId * NNodes + localNode + 1), 0 = end + */ template __global__ void ElementFEM_gatherForce_kernel( int nbVertex, diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu index 10b04b98193..9d0195ab580 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu @@ -43,28 +43,18 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; + // Gather displacement (x - x0) T disp[NNodes * Dim]; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - const int nodeId = elements[n * nbElem + elemId]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - disp[n * Dim + d] = x[nodeId * Dim + d] - x0[nodeId * Dim + d]; - } + gatherElementDisplacement(elements, nbElem, elemId, x, x0, disp); + // Multiply by stiffness matrix const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; T edf[NNodes * Dim]; symBlockMatMul(K, disp, edf); + // Write negated force T* out = eforce + elemId * NNodes * Dim; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - #pragma unroll - for (int d = 0; d < Dim; ++d) - out[n * Dim + d] = -edf[n * Dim + d]; - } + writeForce(edf, out, T(-1)); } /** @@ -84,28 +74,18 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; + // Gather displacement increment T edx[NNodes * Dim]; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - 
const int nodeId = elements[n * nbElem + elemId]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - edx[n * Dim + d] = dx[nodeId * Dim + d]; - } + gatherElementData(elements, nbElem, elemId, dx, edx); + // Multiply by stiffness matrix const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; T edf[NNodes * Dim]; symBlockMatMul(K, edx, edf); + // Write scaled negated force T* out = eforce + elemId * NNodes * Dim; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - #pragma unroll - for (int d = 0; d < Dim; ++d) - out[n * Dim + d] = -kFactor * edf[n * Dim + d]; - } + writeForce(edf, out, -kFactor); } // ===================== Launch functions =====================