From 3a8a27bb20b7d65dfd092a3d46372827f56a2713 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Fri, 3 Apr 2026 14:39:45 +0900 Subject: [PATCH 01/21] Improve parallel performance of element force derivative computation Extract assembled stiffness matrices into a separate contiguous buffer (m_assembledStiffnessMatrices) to replace getReadAccessor calls on Data> inside parallel forEachRange lambdas. The read accessor acquires a shared lock on the Data object, causing contention across threads and effectively serializing the parallel work during CG iterations. Using a direct const reference to a plain vector eliminates this synchronization bottleneck (~3x speedup in parallel mode). As a secondary benefit, the contiguous buffer only stores the assembled 24x24 matrices (~4.6 KB each) rather than the full FactorizedElementStiffness structs (~14 KB each), improving cache utilization. --- .../fem/elastic/BaseElementLinearFEMForceField.h | 12 ++++++++++++ .../fem/elastic/BaseElementLinearFEMForceField.inl | 7 +++++++ .../fem/elastic/ElementCorotationalFEMForceField.inl | 8 ++++---- .../ElementLinearSmallStrainFEMForceField.inl | 8 ++++---- 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.h b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.h index 1a980373a5e..26b4ffadd93 100644 --- a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.h +++ b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.h @@ -55,6 +55,10 @@ class BaseElementLinearFEMForceField : public sofa::component::solidmechanics::f using StrainDisplacement = typename trait::StrainDisplacement; using Real = typename trait::Real; +public: + using AssembledStiffnessMatrix = 
sofa::type::Mat< + trait::NumberOfDofsInElement, trait::NumberOfDofsInElement, Real>; + protected: BaseElementLinearFEMForceField(); @@ -70,6 +74,14 @@ class BaseElementLinearFEMForceField : public sofa::component::solidmechanics::f * List of precomputed element stiffness matrices */ sofa::Data > d_elementStiffness; + + /** + * Contiguous buffer of assembled stiffness matrices (one per element). + * Extracted from d_elementStiffness for cache-friendly access in the hot path. + * This avoids loading the full FactorizedElementStiffness struct (~15 KB per element) + * when only the assembled matrix (~4.6 KB) is needed. + */ + sofa::type::vector m_assembledStiffnessMatrices; }; #if !defined(ELASTICITY_COMPONENT_BASE_ELEMENT_LINEAR_FEM_FORCEFIELD_CPP) diff --git a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.inl b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.inl index de380c5e4cc..e7827cfc55e 100644 --- a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.inl +++ b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/BaseElementLinearFEMForceField.inl @@ -100,6 +100,13 @@ void BaseElementLinearFEMForceField::precomputeElementSt const std::array, trait::NumberOfNodesInElement> nodesCoordinates = extractNodesVectorFromGlobalVector(element, restPositionAccessor.ref()); elementStiffness[elementId] = integrate(nodesCoordinates, elasticityTensor); }); + + // Extract assembled matrices into a contiguous buffer for cache-friendly access + m_assembledStiffnessMatrices.resize(elements.size()); + for (std::size_t i = 0; i < elements.size(); ++i) + { + m_assembledStiffnessMatrices[i] = elementStiffness[i].getAssembledMatrix(); + } } } diff --git 
a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementCorotationalFEMForceField.inl b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementCorotationalFEMForceField.inl index a3a74a36a81..0298f2e4247 100644 --- a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementCorotationalFEMForceField.inl +++ b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementCorotationalFEMForceField.inl @@ -86,7 +86,7 @@ void ElementCorotationalFEMForceField::computeElementsFo { const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); auto restPositionAccessor = this->mstate->readRestPositions(); - auto elementStiffness = sofa::helper::getReadAccessor(this->d_elementStiffness); + const auto& assembledMatrices = this->m_assembledStiffnessMatrices; for (std::size_t elementId = range.start; elementId < range.end; ++elementId) { @@ -112,7 +112,7 @@ void ElementCorotationalFEMForceField::computeElementsFo transformedDisplacement = elementRotation.multTranspose(elementNodesCoordinates[j] - t) - (restElementNodesCoordinates[j] - t0); } - const auto& stiffnessMatrix = elementStiffness[elementId]; + const auto& stiffnessMatrix = assembledMatrices[elementId]; auto& elementForce = elementForces[elementId]; elementForce = stiffnessMatrix * displacement; @@ -134,7 +134,7 @@ void ElementCorotationalFEMForceField::computeElementsFo const sofa::VecDeriv_t& nodeDx) { const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); - auto elementStiffness = sofa::helper::getReadAccessor(this->d_elementStiffness); + const auto& assembledMatrices = this->m_assembledStiffnessMatrices; for (std::size_t elementId = range.start; elementId < range.end; ++elementId) { @@ -150,7 +150,7 @@ void ElementCorotationalFEMForceField::computeElementsFo rotated_dx = 
elementRotation.multTranspose(nodeDx[element[n]]); } - const auto& stiffnessMatrix = elementStiffness[elementId]; + const auto& stiffnessMatrix = assembledMatrices[elementId]; auto& df = elementForcesDeriv[elementId]; df = stiffnessMatrix * element_dx; diff --git a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementLinearSmallStrainFEMForceField.inl b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementLinearSmallStrainFEMForceField.inl index fc58db7abe4..191de07c12c 100644 --- a/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementLinearSmallStrainFEMForceField.inl +++ b/Sofa/Component/SolidMechanics/FEM/Elastic/src/sofa/component/solidmechanics/fem/elastic/ElementLinearSmallStrainFEMForceField.inl @@ -49,12 +49,12 @@ void ElementLinearSmallStrainFEMForceField::computeEleme { const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); auto restPositionAccessor = this->mstate->readRestPositions(); - auto elementStiffness = sofa::helper::getReadAccessor(this->d_elementStiffness); + const auto& assembledMatrices = this->m_assembledStiffnessMatrices; for (std::size_t elementId = range.start; elementId < range.end; ++elementId) { const auto& element = elements[elementId]; - const auto& stiffnessMatrix = elementStiffness[elementId]; + const auto& stiffnessMatrix = assembledMatrices[elementId]; typename trait::ElementDisplacement displacement{ sofa::type::NOINIT }; @@ -79,12 +79,12 @@ void ElementLinearSmallStrainFEMForceField::computeEleme const sofa::VecDeriv_t& nodeDx) { const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); - auto elementStiffness = sofa::helper::getReadAccessor(this->d_elementStiffness); + const auto& assembledMatrices = this->m_assembledStiffnessMatrices; for (std::size_t elementId = range.start; elementId < range.end; ++elementId) { const auto& element = 
elements[elementId]; - const auto& stiffnessMatrix = elementStiffness[elementId]; + const auto& stiffnessMatrix = assembledMatrices[elementId]; const std::array, trait::NumberOfNodesInElement> elementNodesDx = extractNodesVectorFromGlobalVector(element, nodeDx); From 9e2c5b96f21a7b82eb272add49d0c3f9c87bffd7 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Mon, 6 Apr 2026 11:05:16 +0900 Subject: [PATCH 02/21] wip --- .../plugins/SofaCUDA/Component/CMakeLists.txt | 4 + .../Component/src/SofaCUDA/component/init.cpp | 2 + .../CudaElementCorotationalFEMForceField.cpp | 58 ++++++ .../CudaElementCorotationalFEMForceField.cu | 160 +++++++++++++++++ .../CudaElementCorotationalFEMForceField.h | 110 ++++++++++++ .../CudaElementCorotationalFEMForceField.inl | 168 ++++++++++++++++++ .../CantileverBeam_ElementFEMForceField.xml | 2 +- 7 files changed, 503 insertions(+), 1 deletion(-) create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl diff --git a/applications/plugins/SofaCUDA/Component/CMakeLists.txt b/applications/plugins/SofaCUDA/Component/CMakeLists.txt index 77ce7a8b2be..fbd83faf0e5 100644 --- a/applications/plugins/SofaCUDA/Component/CMakeLists.txt +++ b/applications/plugins/SofaCUDA/Component/CMakeLists.txt @@ -39,6 +39,8 @@ set(HEADER_FILES ### solidmechanics + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h + 
${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.h ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.inl ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/hyperelastic/CudaStandardTetrahedralFEMForceField.h @@ -111,6 +113,7 @@ set(SOURCE_FILES ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/mass/CudaUniformMass.cpp ### Solidmechanics + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.cpp ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/hyperelastic/CudaStandardTetrahedralFEMForceField.cpp ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/tensormass/CudaTetrahedralTensorMassForceField.cpp @@ -181,6 +184,7 @@ set(CUDA_SOURCES ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/mass/CudaUniformMass.cu ### solidmechanics + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.cu ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/hyperelastic/CudaStandardTetrahedralFEMForceField.cu ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/tensormass/CudaTetrahedralTensorMassForceField.cu diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp index f9c124998fb..0d890c64139 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp +++ 
b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp @@ -90,6 +90,7 @@ extern void registerPlaneForceField(sofa::core::ObjectFactory* factory); extern void registerSphereForceField(sofa::core::ObjectFactory* factory); // component::solidmechanics::fem::elastic +extern void registerElementCorotationalFEMForceField(sofa::core::ObjectFactory* factory); extern void registerHexahedronFEMForceField(sofa::core::ObjectFactory* factory); extern void registerTetrahedronFEMForceField(sofa::core::ObjectFactory* factory); extern void registerTriangularFEMForceFieldOptim(sofa::core::ObjectFactory* factory); @@ -224,6 +225,7 @@ void registerObjects(sofa::core::ObjectFactory* factory) registerLinearForceField(factory); registerPlaneForceField(factory); registerSphereForceField(factory); + registerElementCorotationalFEMForceField(factory); registerHexahedronFEMForceField(factory); registerTetrahedronFEMForceField(factory); registerTriangularFEMForceFieldOptim(factory); diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp new file mode 100644 index 00000000000..cb0251b4925 --- /dev/null +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp @@ -0,0 +1,58 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. 
* +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . * +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#include + +#include +#include +#include + +namespace sofa::component::solidmechanics::fem::elastic +{ + +using namespace sofa::gpu::cuda; + +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; + +} // namespace sofa::component::solidmechanics::fem::elastic + +namespace sofa::gpu::cuda +{ + +void registerElementCorotationalFEMForceField(sofa::core::ObjectFactory* factory) +{ + using namespace sofa::component::solidmechanics::fem::elastic; + + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for the ElementCorotationalFEMForceField") + .add< CudaElementCorotationalFEMForceField >() + .add< CudaElementCorotationalFEMForceField >() + .add< CudaElementCorotationalFEMForceField >() + .add< CudaElementCorotationalFEMForceField >() + .add< CudaElementCorotationalFEMForceField >() + ); +} + +} // namespace sofa::gpu::cuda diff --git 
a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu new file mode 100644 index 00000000000..7728de8b2c2 --- /dev/null +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -0,0 +1,160 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . 
* +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#include +#include +#include + +#if defined(__cplusplus) +namespace sofa +{ +namespace gpu +{ +namespace cuda +{ +#endif + +/// Maximum number of DOFs per element (8 nodes * 3 dimensions for hexahedra) +#define MAX_ELEM_DOFS 24 +/// Maximum spatial dimensions +#define MAX_DIM 3 +/// Maximum nodes per element +#define MAX_NODES 8 + +/** + * CUDA kernel for addDForce of corotational FEM. + * + * Generic over element type: works with any number of nodes per element and spatial dimensions. + * One thread per element. For each element: + * 1. Gather dx from nodes + * 2. Rotate dx into reference frame: rdx = R^T * dx + * 3. Multiply by stiffness: edf = K * rdx + * 4. Rotate back: df_world = R * edf + * 5. 
Scatter to nodes via atomicAdd: df[node] -= kFactor * df_world + */ +__global__ void ElementCorotationalFEMForceFieldCuda3f_addDForce_kernel( + int nbElem, + int nbNodesPerElem, + int nbDofsPerElem, + int dim, + const int* __restrict__ elements, + const float* __restrict__ rotations, + const float* __restrict__ stiffness, + const float* __restrict__ dx, + float* df, + float kFactor) +{ + const int elemId = blockIdx.x * blockDim.x + threadIdx.x; + if (elemId >= nbElem) return; + + // Load element node indices + const int* elemNodes = elements + elemId * nbNodesPerElem; + + // Load rotation matrix R (dim x dim, row-major) + const float* Rptr = rotations + elemId * dim * dim; + float R[MAX_DIM * MAX_DIM]; + for (int i = 0; i < dim * dim; ++i) + R[i] = Rptr[i]; + + // Gather dx and rotate into reference frame: rdx = R^T * dx_node + float rdx[MAX_ELEM_DOFS]; + for (int n = 0; n < nbNodesPerElem; ++n) + { + const int nodeId = elemNodes[n]; + const float* node_dx = dx + nodeId * dim; + + for (int i = 0; i < dim; ++i) + { + float val = 0.0f; + for (int j = 0; j < dim; ++j) + val += R[j * dim + i] * node_dx[j]; // R^T[i][j] = R[j][i] + rdx[n * dim + i] = val; + } + } + + // K * rdx -> edf (nbDofsPerElem x nbDofsPerElem matrix-vector product) + const float* K = stiffness + elemId * nbDofsPerElem * nbDofsPerElem; + float edf[MAX_ELEM_DOFS]; + for (int i = 0; i < nbDofsPerElem; ++i) + { + float sum = 0.0f; + const float* Ki = K + i * nbDofsPerElem; + for (int j = 0; j < nbDofsPerElem; ++j) + sum += Ki[j] * rdx[j]; + edf[i] = sum; + } + + // Rotate back and scatter: df[node] -= kFactor * R * edf_node + for (int n = 0; n < nbNodesPerElem; ++n) + { + const int nodeId = elemNodes[n]; + const float* node_edf = edf + n * dim; + + for (int i = 0; i < dim; ++i) + { + float val = 0.0f; + for (int j = 0; j < dim; ++j) + val += R[i * dim + j] * node_edf[j]; // R * edf_node + atomicAdd(&df[nodeId * dim + i], -kFactor * val); + } + } +} + +extern "C" +{ + +void 
ElementCorotationalFEMForceFieldCuda3f_addDForce( + unsigned int nbElem, + unsigned int nbNodesPerElem, + unsigned int nbDofsPerElem, + unsigned int spatialDim, + const void* elements, + const void* rotations, + const void* stiffness, + const void* dx, + void* df, + float kFactor) +{ + const int threadsPerBlock = 64; + const int numBlocks = (nbElem + threadsPerBlock - 1) / threadsPerBlock; + + ElementCorotationalFEMForceFieldCuda3f_addDForce_kernel<<>>( + nbElem, + nbNodesPerElem, + nbDofsPerElem, + spatialDim, + (const int*)elements, + (const float*)rotations, + (const float*)stiffness, + (const float*)dx, + (float*)df, + kFactor); + + mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_addDForce_kernel"); +} + +} // extern "C" + +#if defined(__cplusplus) +} // namespace cuda +} // namespace gpu +} // namespace sofa +#endif diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h new file mode 100644 index 00000000000..e124047fd58 --- /dev/null +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h @@ -0,0 +1,110 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . * +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#pragma once + +#include +#include + +namespace sofa::gpu::cuda +{ + +extern "C" +{ + void ElementCorotationalFEMForceFieldCuda3f_addDForce( + unsigned int nbElem, + unsigned int nbNodesPerElem, + unsigned int nbDofsPerElem, + unsigned int spatialDim, + const void* elements, + const void* rotations, + const void* stiffness, + const void* dx, + void* df, + float kFactor); +} + +} // namespace sofa::gpu::cuda + +namespace sofa::component::solidmechanics::fem::elastic +{ + +/** + * CUDA-accelerated version of ElementCorotationalFEMForceField. + * + * Works with any element type (Edge, Triangle, Quad, Tetrahedron, Hexahedron). + * The addDForce method (the CG hot path, called ~250 times per timestep) runs entirely on GPU. + * The addForce method delegates to the CPU parent and uploads rotations to GPU afterwards. 
+ */ +template +class CudaElementCorotationalFEMForceField + : public ElementCorotationalFEMForceField +{ +public: + SOFA_CLASS( + SOFA_TEMPLATE2(CudaElementCorotationalFEMForceField, DataTypes, ElementType), + SOFA_TEMPLATE2(ElementCorotationalFEMForceField, DataTypes, ElementType)); + + using Real = sofa::Real_t; + using Coord = sofa::Coord_t; + using Deriv = sofa::Deriv_t; + using VecCoord = sofa::VecCoord_t; + using VecDeriv = sofa::VecDeriv_t; + + static const std::string GetCustomClassName() + { + return ElementCorotationalFEMForceField::GetCustomClassName(); + } + + static const std::string GetCustomTemplateName() + { + return DataTypes::Name(); + } + + void init() override; + + void addForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& f, + const sofa::DataVecCoord_t& x, + const sofa::DataVecDeriv_t& v) override; + + void addDForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& df, + const sofa::DataVecDeriv_t& dx) override; + +protected: + + CudaElementCorotationalFEMForceField() = default; + + void uploadStiffnessAndConnectivity(); + void uploadRotations(); + + gpu::cuda::CudaVector m_gpuStiffness; ///< Flat NxN stiffness matrices per element (N = nbDofsPerElement) + gpu::cuda::CudaVector m_gpuRotations; ///< Flat DxD rotation matrices per element (D = spatial_dimensions) + gpu::cuda::CudaVector m_gpuElements; ///< Node indices per element + + bool m_gpuDataUploaded = false; + bool m_gpuRotationsUploaded = false; +}; + +} // namespace sofa::component::solidmechanics::fem::elastic diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl new file mode 100644 index 00000000000..ab2cc511cb7 --- /dev/null +++ 
b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -0,0 +1,168 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . * +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#pragma once +#include +#include +#include + +namespace sofa::component::solidmechanics::fem::elastic +{ + +template +void CudaElementCorotationalFEMForceField::init() +{ + ElementCorotationalFEMForceField::init(); + + if (!this->isComponentStateInvalid()) + { + uploadStiffnessAndConnectivity(); + } +} + +template +void CudaElementCorotationalFEMForceField::uploadStiffnessAndConnectivity() +{ + using trait = sofa::component::solidmechanics::fem::elastic::trait; + + if (!this->l_topology) return; + + const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); + const auto& assembledMatrices = this->m_assembledStiffnessMatrices; + + const auto nbElem = elements.size(); + constexpr auto 
nDofs = trait::NumberOfDofsInElement; + constexpr auto nNodes = trait::NumberOfNodesInElement; + + // Upload stiffness matrices (flat row-major NxN per element) + m_gpuStiffness.resize(nbElem * nDofs * nDofs); + { + auto* dst = m_gpuStiffness.hostWrite(); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& K = assembledMatrices[e]; + for (unsigned int i = 0; i < nDofs; ++i) + for (unsigned int j = 0; j < nDofs; ++j) + dst[e * nDofs * nDofs + i * nDofs + j] = static_cast(K[i][j]); + } + } + + // Upload element connectivity (nNodes node indices per element) + m_gpuElements.resize(nbElem * nNodes); + { + auto* dst = m_gpuElements.hostWrite(); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& element = elements[e]; + for (unsigned int n = 0; n < nNodes; ++n) + dst[e * nNodes + n] = static_cast(element[n]); + } + } + + m_gpuDataUploaded = true; + m_gpuRotationsUploaded = false; +} + +template +void CudaElementCorotationalFEMForceField::uploadRotations() +{ + using trait = sofa::component::solidmechanics::fem::elastic::trait; + constexpr auto dim = trait::spatial_dimensions; + + const auto& rotations = this->m_rotations; + const auto nbElem = rotations.size(); + + m_gpuRotations.resize(nbElem * dim * dim); + { + auto* dst = m_gpuRotations.hostWrite(); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& R = rotations[e]; + for (unsigned int i = 0; i < dim; ++i) + for (unsigned int j = 0; j < dim; ++j) + dst[e * dim * dim + i * dim + j] = static_cast(R[i][j]); + } + } + + m_gpuRotationsUploaded = true; +} + +template +void CudaElementCorotationalFEMForceField::addForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& f, + const sofa::DataVecCoord_t& x, + const sofa::DataVecDeriv_t& v) +{ + // Run on CPU: computes rotations and forces + ElementCorotationalFEMForceField::addForce(mparams, f, x, v); + + // Upload the freshly-computed rotations to GPU for subsequent addDForce calls + uploadRotations(); +} + +template +void 
CudaElementCorotationalFEMForceField::addDForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& d_df, + const sofa::DataVecDeriv_t& d_dx) +{ + if (this->isComponentStateInvalid()) + return; + + if (!m_gpuDataUploaded || !m_gpuRotationsUploaded) + { + // Fallback to CPU if GPU data not ready + ElementCorotationalFEMForceField::addDForce(mparams, d_df, d_dx); + return; + } + + using trait = sofa::component::solidmechanics::fem::elastic::trait; + + VecDeriv& df = *d_df.beginEdit(); + const VecDeriv& dx = d_dx.getValue(); + + if (df.size() < dx.size()) + df.resize(dx.size()); + + const auto kFactor = static_cast( + sofa::core::mechanicalparams::kFactorIncludingRayleighDamping( + mparams, this->rayleighStiffness.getValue())); + + const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); + const auto nbElem = static_cast(elements.size()); + + gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addDForce( + nbElem, + trait::NumberOfNodesInElement, + trait::NumberOfDofsInElement, + trait::spatial_dimensions, + m_gpuElements.deviceRead(), + m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), + dx.deviceRead(), + df.deviceWrite(), + kFactor); + + d_df.endEdit(); +} + +} // namespace sofa::component::solidmechanics::fem::elastic diff --git a/examples/Validation/cantilever_beam/CantileverBeam_ElementFEMForceField.xml b/examples/Validation/cantilever_beam/CantileverBeam_ElementFEMForceField.xml index 4e1fab99d9e..2133c327c4a 100644 --- a/examples/Validation/cantilever_beam/CantileverBeam_ElementFEMForceField.xml +++ b/examples/Validation/cantilever_beam/CantileverBeam_ElementFEMForceField.xml @@ -27,7 +27,7 @@ - + From b746722ce6982996da99c4f57ed6bdec4e2380b9 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Mon, 6 Apr 2026 13:22:22 +0900 Subject: [PATCH 03/21] add example --- .../CudaElementCorotationalFEMForceField.cpp | 14 +++++++- .../CudaElementCorotationalFEMForceField.scn | 35 +++++++++++++++++++ 2 files changed, 
48 insertions(+), 1 deletion(-) create mode 100644 applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp index cb0251b4925..c77a51c13c2 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp @@ -46,11 +46,23 @@ void registerElementCorotationalFEMForceField(sofa::core::ObjectFactory* factory using namespace sofa::component::solidmechanics::fem::elastic; factory->registerObjects(sofa::core::ObjectRegistrationData( - "Supports GPU-side computations using CUDA for the ElementCorotationalFEMForceField") + "Supports GPU-side computations using CUDA for EdgeCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for TriangleCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for QuadCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for TetrahedronCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for HexahedronCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() ); } diff --git 
a/applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn b/applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn new file mode 100644 index 00000000000..a75e2058ff5 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 865f7f46ff8eebf8f5c463a85b5d80f7d5b72a0c Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Mon, 6 Apr 2026 14:24:02 +0900 Subject: [PATCH 04/21] new version --- .../CudaElementCorotationalFEMForceField.cu | 218 ++++++++++++------ .../CudaElementCorotationalFEMForceField.h | 21 +- .../CudaElementCorotationalFEMForceField.inl | 83 ++++++- 3 files changed, 232 insertions(+), 90 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index 7728de8b2c2..4e1091f0e66 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -32,89 +32,164 @@ namespace cuda { #endif -/// Maximum number of DOFs per element (8 nodes * 3 dimensions for hexahedra) -#define MAX_ELEM_DOFS 24 -/// Maximum spatial dimensions -#define MAX_DIM 3 -/// Maximum nodes per element -#define MAX_NODES 8 - /** - * CUDA kernel for addDForce of corotational FEM. + * Kernel 1: Compute per-element dForce (1 thread per element). + * + * Templated on NNodes (compile-time) for full loop unrolling. + * Hardcoded Dim=3 (CudaVec3f only). * - * Generic over element type: works with any number of nodes per element and spatial dimensions. - * One thread per element. 
For each element: - * 1. Gather dx from nodes - * 2. Rotate dx into reference frame: rdx = R^T * dx - * 3. Multiply by stiffness: edf = K * rdx - * 4. Rotate back: df_world = R * edf - * 5. Scatter to nodes via atomicAdd: df[node] -= kFactor * df_world + * Connectivity is SoA: elements[nodeIdx * nbElem + elemId]. + * Stiffness is in block format: K[(ni * NNodes + nj) * 9 + di * 3 + dj]. */ -__global__ void ElementCorotationalFEMForceFieldCuda3f_addDForce_kernel( +template +__global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( int nbElem, - int nbNodesPerElem, - int nbDofsPerElem, - int dim, const int* __restrict__ elements, const float* __restrict__ rotations, const float* __restrict__ stiffness, const float* __restrict__ dx, - float* df, + float* __restrict__ eforce, float kFactor) { + constexpr int NDofs = NNodes * 3; + const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - // Load element node indices - const int* elemNodes = elements + elemId * nbNodesPerElem; - - // Load rotation matrix R (dim x dim, row-major) - const float* Rptr = rotations + elemId * dim * dim; - float R[MAX_DIM * MAX_DIM]; - for (int i = 0; i < dim * dim; ++i) + // Load rotation matrix R (3x3, row-major) + const float* Rptr = rotations + elemId * 9; + float R[9]; + #pragma unroll + for (int i = 0; i < 9; ++i) R[i] = Rptr[i]; - // Gather dx and rotate into reference frame: rdx = R^T * dx_node - float rdx[MAX_ELEM_DOFS]; - for (int n = 0; n < nbNodesPerElem; ++n) + // Gather dx and rotate into reference frame: rdx[n] = R^T * dx[node[n]] + float rdx[NDofs]; + #pragma unroll + for (int n = 0; n < NNodes; ++n) { - const int nodeId = elemNodes[n]; - const float* node_dx = dx + nodeId * dim; + const int nodeId = elements[n * nbElem + elemId]; + const float dx_x = dx[nodeId * 3 + 0]; + const float dx_y = dx[nodeId * 3 + 1]; + const float dx_z = dx[nodeId * 3 + 2]; + + rdx[n * 3 + 0] = R[0] * dx_x + R[3] * dx_y + R[6] * dx_z; + rdx[n * 3 + 
1] = R[1] * dx_x + R[4] * dx_y + R[7] * dx_z; + rdx[n * 3 + 2] = R[2] * dx_x + R[5] * dx_y + R[8] * dx_z; + } - for (int i = 0; i < dim; ++i) + // Block-matrix multiply: edf = K * rdx + const float* K = stiffness + elemId * NNodes * NNodes * 9; + float edf[NDofs]; + #pragma unroll + for (int ni = 0; ni < NNodes; ++ni) + { + float fi0 = 0.0f, fi1 = 0.0f, fi2 = 0.0f; + #pragma unroll + for (int nj = 0; nj < NNodes; ++nj) { - float val = 0.0f; - for (int j = 0; j < dim; ++j) - val += R[j * dim + i] * node_dx[j]; // R^T[i][j] = R[j][i] - rdx[n * dim + i] = val; + const float* Kij = K + (ni * NNodes + nj) * 9; + const float rj0 = rdx[nj * 3 + 0]; + const float rj1 = rdx[nj * 3 + 1]; + const float rj2 = rdx[nj * 3 + 2]; + fi0 += Kij[0] * rj0 + Kij[1] * rj1 + Kij[2] * rj2; + fi1 += Kij[3] * rj0 + Kij[4] * rj1 + Kij[5] * rj2; + fi2 += Kij[6] * rj0 + Kij[7] * rj1 + Kij[8] * rj2; } + edf[ni * 3 + 0] = fi0; + edf[ni * 3 + 1] = fi1; + edf[ni * 3 + 2] = fi2; } - // K * rdx -> edf (nbDofsPerElem x nbDofsPerElem matrix-vector product) - const float* K = stiffness + elemId * nbDofsPerElem * nbDofsPerElem; - float edf[MAX_ELEM_DOFS]; - for (int i = 0; i < nbDofsPerElem; ++i) + // Rotate back and write: eforce = -kFactor * R * edf + float* out = eforce + elemId * NNodes * 3; + #pragma unroll + for (int n = 0; n < NNodes; ++n) { - float sum = 0.0f; - const float* Ki = K + i * nbDofsPerElem; - for (int j = 0; j < nbDofsPerElem; ++j) - sum += Ki[j] * rdx[j]; - edf[i] = sum; + const float e0 = edf[n * 3 + 0]; + const float e1 = edf[n * 3 + 1]; + const float e2 = edf[n * 3 + 2]; + out[n * 3 + 0] = -kFactor * (R[0] * e0 + R[1] * e1 + R[2] * e2); + out[n * 3 + 1] = -kFactor * (R[3] * e0 + R[4] * e1 + R[5] * e2); + out[n * 3 + 2] = -kFactor * (R[6] * e0 + R[7] * e1 + R[8] * e2); } +} - // Rotate back and scatter: df[node] -= kFactor * R * edf_node - for (int n = 0; n < nbNodesPerElem; ++n) +/** + * Kernel 2: Gather per-vertex forces (1 thread per vertex). 
+ * + * No atomics: each vertex handled by exactly one thread. + * velems is SoA: velems[s * nbVertex + vertexId], 0-terminated. + * Each entry is (elemId * NNodes + localNode + 1), with 0 as sentinel. + */ +__global__ void ElementCorotationalFEMForceFieldCuda3f_gatherDForce_kernel( + int nbVertex, + int maxElemPerVertex, + const int* __restrict__ velems, + const float* __restrict__ eforce, + float* df) +{ + const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; + if (vertexId >= nbVertex) return; + + float fx = 0.0f, fy = 0.0f, fz = 0.0f; + + for (int s = 0; s < maxElemPerVertex; ++s) { - const int nodeId = elemNodes[n]; - const float* node_edf = edf + n * dim; + const int idx = velems[s * nbVertex + vertexId]; + if (idx == 0) break; + const int base = (idx - 1) * 3; + fx += eforce[base + 0]; + fy += eforce[base + 1]; + fz += eforce[base + 2]; + } - for (int i = 0; i < dim; ++i) - { - float val = 0.0f; - for (int j = 0; j < dim; ++j) - val += R[i * dim + j] * node_edf[j]; // R * edf_node - atomicAdd(&df[nodeId * dim + i], -kFactor * val); - } + df[vertexId * 3 + 0] += fx; + df[vertexId * 3 + 1] += fy; + df[vertexId * 3 + 2] += fz; +} + +template +static void launchAddDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + float kFactor) +{ + const int computeThreads = 64; + const int gatherThreads = 256; + + { + const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel + <<>>( + nbElem, + (const int*)elements, + (const float*)rotations, + (const float*)stiffness, + (const float*)dx, + (float*)eforce, + kFactor); + mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel"); + } + + { + const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + 
ElementCorotationalFEMForceFieldCuda3f_gatherDForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const float*)eforce, + (float*)df); + mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_gatherDForce_kernel"); } } @@ -123,32 +198,25 @@ extern "C" void ElementCorotationalFEMForceFieldCuda3f_addDForce( unsigned int nbElem, + unsigned int nbVertex, unsigned int nbNodesPerElem, - unsigned int nbDofsPerElem, - unsigned int spatialDim, + unsigned int maxElemPerVertex, const void* elements, const void* rotations, const void* stiffness, const void* dx, void* df, + void* eforce, + const void* velems, float kFactor) { - const int threadsPerBlock = 64; - const int numBlocks = (nbElem + threadsPerBlock - 1) / threadsPerBlock; - - ElementCorotationalFEMForceFieldCuda3f_addDForce_kernel<<>>( - nbElem, - nbNodesPerElem, - nbDofsPerElem, - spatialDim, - (const int*)elements, - (const float*)rotations, - (const float*)stiffness, - (const float*)dx, - (float*)df, - kFactor); - - mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_addDForce_kernel"); + switch (nbNodesPerElem) + { + case 2: launchAddDForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 3: launchAddDForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 4: launchAddDForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 8: launchAddDForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + } } } // extern "C" diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h index e124047fd58..c5220a2f2be 100644 
--- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h @@ -31,14 +31,16 @@ extern "C" { void ElementCorotationalFEMForceFieldCuda3f_addDForce( unsigned int nbElem, + unsigned int nbVertex, unsigned int nbNodesPerElem, - unsigned int nbDofsPerElem, - unsigned int spatialDim, + unsigned int maxElemPerVertex, const void* elements, const void* rotations, const void* stiffness, const void* dx, void* df, + void* eforce, + const void* velems, float kFactor); } @@ -53,6 +55,10 @@ namespace sofa::component::solidmechanics::fem::elastic * Works with any element type (Edge, Triangle, Quad, Tetrahedron, Hexahedron). * The addDForce method (the CG hot path, called ~250 times per timestep) runs entirely on GPU. * The addForce method delegates to the CPU parent and uploads rotations to GPU afterwards. 
+ * + * Uses a two-kernel approach for addDForce: + * Kernel 1: compute per-element forces (1 thread/element, fully unrolled) + * Kernel 2: gather per-vertex (1 thread/vertex, no atomics) */ template class CudaElementCorotationalFEMForceField @@ -99,9 +105,14 @@ class CudaElementCorotationalFEMForceField void uploadStiffnessAndConnectivity(); void uploadRotations(); - gpu::cuda::CudaVector m_gpuStiffness; ///< Flat NxN stiffness matrices per element (N = nbDofsPerElement) - gpu::cuda::CudaVector m_gpuRotations; ///< Flat DxD rotation matrices per element (D = spatial_dimensions) - gpu::cuda::CudaVector m_gpuElements; ///< Node indices per element + gpu::cuda::CudaVector m_gpuStiffness; ///< Block-format stiffness: K[(ni*N+nj)*9 + di*3+dj] per element + gpu::cuda::CudaVector m_gpuRotations; ///< Flat 3x3 rotation matrices per element + gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] + gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer + gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated + + unsigned int m_maxElemPerVertex = 0; + unsigned int m_nbVertices = 0; bool m_gpuDataUploaded = false; bool m_gpuRotationsUploaded = false; diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index ab2cc511cb7..a3813867726 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -23,6 +23,7 @@ #include #include #include +#include namespace sofa::component::solidmechanics::fem::elastic { @@ -51,32 +52,91 @@ void 
CudaElementCorotationalFEMForceField::uploadStiffne const auto nbElem = elements.size(); constexpr auto nDofs = trait::NumberOfDofsInElement; constexpr auto nNodes = trait::NumberOfNodesInElement; + constexpr auto dim = trait::spatial_dimensions; + + // Find number of vertices + unsigned int maxNodeId = 0; + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& element = elements[e]; + for (unsigned int n = 0; n < nNodes; ++n) + { + if (static_cast(element[n]) > maxNodeId) + maxNodeId = static_cast(element[n]); + } + } + m_nbVertices = maxNodeId + 1; - // Upload stiffness matrices (flat row-major NxN per element) - m_gpuStiffness.resize(nbElem * nDofs * nDofs); + // Upload stiffness matrices in block format: + // K[(ni * nNodes + nj) * dim * dim + di * dim + dj] per element + // This groups each 3x3 sub-block contiguously for better cache behavior. + m_gpuStiffness.resize(nbElem * nNodes * nNodes * dim * dim); { auto* dst = m_gpuStiffness.hostWrite(); for (std::size_t e = 0; e < nbElem; ++e) { const auto& K = assembledMatrices[e]; - for (unsigned int i = 0; i < nDofs; ++i) - for (unsigned int j = 0; j < nDofs; ++j) - dst[e * nDofs * nDofs + i * nDofs + j] = static_cast(K[i][j]); + for (unsigned int ni = 0; ni < nNodes; ++ni) + for (unsigned int nj = 0; nj < nNodes; ++nj) + for (unsigned int di = 0; di < dim; ++di) + for (unsigned int dj = 0; dj < dim; ++dj) + dst[e * nNodes * nNodes * dim * dim + + (ni * nNodes + nj) * dim * dim + + di * dim + dj] + = static_cast(K[ni * dim + di][nj * dim + dj]); } } - // Upload element connectivity (nNodes node indices per element) - m_gpuElements.resize(nbElem * nNodes); + // Upload element connectivity in SoA layout: + // elements[nodeIdx * nbElem + elemId] = global node index + // Adjacent threads access adjacent memory for coalesced reads. 
+ m_gpuElements.resize(nNodes * nbElem); { auto* dst = m_gpuElements.hostWrite(); for (std::size_t e = 0; e < nbElem; ++e) { const auto& element = elements[e]; for (unsigned int n = 0; n < nNodes; ++n) - dst[e * nNodes + n] = static_cast(element[n]); + dst[n * nbElem + e] = static_cast(element[n]); + } + } + + // Build vertex-to-element mapping (velems) + // For each vertex, stores the list of (elemId * nNodes + localNode + 1). + // 0 is used as sentinel. SoA layout: velems[slot * nbVertex + vertexId]. + std::vector> vertexElems(m_nbVertices); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& element = elements[e]; + for (unsigned int n = 0; n < nNodes; ++n) + { + const int nodeId = static_cast(element[n]); + vertexElems[nodeId].push_back( + static_cast(e * nNodes + n + 1)); + } + } + + m_maxElemPerVertex = 0; + for (const auto& ve : vertexElems) + { + if (ve.size() > m_maxElemPerVertex) + m_maxElemPerVertex = static_cast(ve.size()); + } + + m_gpuVelems.resize(m_maxElemPerVertex * m_nbVertices); + { + auto* dst = m_gpuVelems.hostWrite(); + std::memset(dst, 0, m_maxElemPerVertex * m_nbVertices * sizeof(int)); + for (std::size_t v = 0; v < m_nbVertices; ++v) + { + for (std::size_t s = 0; s < vertexElems[v].size(); ++s) + dst[s * m_nbVertices + v] = vertexElems[v][s]; } } + // Allocate intermediate per-element force buffer + m_gpuElementForce.resize(nbElem * nNodes * dim); + m_gpuDataUploaded = true; m_gpuRotationsUploaded = false; } @@ -149,17 +209,20 @@ void CudaElementCorotationalFEMForceField::addDForce( const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); const auto nbElem = static_cast(elements.size()); + const auto nbVertex = static_cast(dx.size()); gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addDForce( nbElem, + nbVertex, trait::NumberOfNodesInElement, - trait::NumberOfDofsInElement, - trait::spatial_dimensions, + m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), 
m_gpuStiffness.deviceRead(), dx.deviceRead(), df.deviceWrite(), + m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead(), kFactor); d_df.endEdit(); From 90e1204a19c6c51ad7dc17aa11ac49214625ed8f Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Mon, 6 Apr 2026 14:56:32 +0900 Subject: [PATCH 05/21] improvement of new version --- .../CudaElementCorotationalFEMForceField.cu | 73 ++++++++++++++----- .../CudaElementCorotationalFEMForceField.inl | 22 ++++-- 2 files changed, 70 insertions(+), 25 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index 4e1091f0e66..52ec3af12af 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -39,7 +39,12 @@ namespace cuda * Hardcoded Dim=3 (CudaVec3f only). * * Connectivity is SoA: elements[nodeIdx * nbElem + elemId]. - * Stiffness is in block format: K[(ni * NNodes + nj) * 9 + di * 3 + dj]. + * Stiffness uses symmetric upper-triangle block storage: + * Only blocks (ni, nj) with nj >= ni are stored. + * NSymBlocks = NNodes*(NNodes+1)/2 blocks of 9 floats each. + * Each off-diagonal block is read once and used for both + * forward (edf[ni] += Kij * rdx[nj]) and symmetric + * (edf[nj] += Kij^T * rdx[ni]) contributions. 
*/ template __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( @@ -51,7 +56,7 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( float* __restrict__ eforce, float kFactor) { - constexpr int NDofs = NNodes * 3; + constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; @@ -64,7 +69,7 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( R[i] = Rptr[i]; // Gather dx and rotate into reference frame: rdx[n] = R^T * dx[node[n]] - float rdx[NDofs]; + float rdx[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -78,27 +83,59 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( rdx[n * 3 + 2] = R[2] * dx_x + R[5] * dx_y + R[8] * dx_z; } - // Block-matrix multiply: edf = K * rdx - const float* K = stiffness + elemId * NNodes * NNodes * 9; - float edf[NDofs]; + // Symmetric block-matrix multiply: edf = K * rdx + // K stored as upper triangle: blocks (ni, nj) for nj >= ni + const float* K = stiffness + elemId * NSymBlocks * 9; + float edf[NNodes * 3]; + + #pragma unroll + for (int i = 0; i < NNodes * 3; ++i) + edf[i] = 0.0f; + #pragma unroll for (int ni = 0; ni < NNodes; ++ni) { - float fi0 = 0.0f, fi1 = 0.0f, fi2 = 0.0f; + // symIdx for (ni, ni) = ni*NNodes - ni*(ni-1)/2 + const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; + + // Diagonal block (ni, ni): Kii * rdx[ni] + { + const float* Kii = K + diagIdx * 9; + const float ri0 = rdx[ni * 3 + 0]; + const float ri1 = rdx[ni * 3 + 1]; + const float ri2 = rdx[ni * 3 + 2]; + edf[ni * 3 + 0] += Kii[0] * ri0 + Kii[1] * ri1 + Kii[2] * ri2; + edf[ni * 3 + 1] += Kii[3] * ri0 + Kii[4] * ri1 + Kii[5] * ri2; + edf[ni * 3 + 2] += Kii[6] * ri0 + Kii[7] * ri1 + Kii[8] * ri2; + } + + // Off-diagonal blocks (ni, nj) for nj > ni #pragma unroll - for (int nj = 0; nj < NNodes; ++nj) + for (int nj = ni + 1; nj < NNodes; ++nj) { - const 
float* Kij = K + (ni * NNodes + nj) * 9; - const float rj0 = rdx[nj * 3 + 0]; - const float rj1 = rdx[nj * 3 + 1]; - const float rj2 = rdx[nj * 3 + 2]; - fi0 += Kij[0] * rj0 + Kij[1] * rj1 + Kij[2] * rj2; - fi1 += Kij[3] * rj0 + Kij[4] * rj1 + Kij[5] * rj2; - fi2 += Kij[6] * rj0 + Kij[7] * rj1 + Kij[8] * rj2; + const int symIdx = diagIdx + (nj - ni); + const float* Kij = K + symIdx * 9; + + // Forward: edf[ni] += Kij * rdx[nj] + { + const float rj0 = rdx[nj * 3 + 0]; + const float rj1 = rdx[nj * 3 + 1]; + const float rj2 = rdx[nj * 3 + 2]; + edf[ni * 3 + 0] += Kij[0] * rj0 + Kij[1] * rj1 + Kij[2] * rj2; + edf[ni * 3 + 1] += Kij[3] * rj0 + Kij[4] * rj1 + Kij[5] * rj2; + edf[ni * 3 + 2] += Kij[6] * rj0 + Kij[7] * rj1 + Kij[8] * rj2; + } + + // Symmetric: edf[nj] += Kij^T * rdx[ni] + { + const float ri0 = rdx[ni * 3 + 0]; + const float ri1 = rdx[ni * 3 + 1]; + const float ri2 = rdx[ni * 3 + 2]; + edf[nj * 3 + 0] += Kij[0] * ri0 + Kij[3] * ri1 + Kij[6] * ri2; + edf[nj * 3 + 1] += Kij[1] * ri0 + Kij[4] * ri1 + Kij[7] * ri2; + edf[nj * 3 + 2] += Kij[2] * ri0 + Kij[5] * ri1 + Kij[8] * ri2; + } } - edf[ni * 3 + 0] = fi0; - edf[ni * 3 + 1] = fi1; - edf[ni * 3 + 2] = fi2; } // Rotate back and write: eforce = -kFactor * R * edf diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index a3813867726..1e1093758a9 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -67,23 +67,31 @@ void CudaElementCorotationalFEMForceField::uploadStiffne } m_nbVertices = maxNodeId + 1; - // Upload stiffness matrices in block format: - // K[(ni * 
nNodes + nj) * dim * dim + di * dim + dj] per element - // This groups each 3x3 sub-block contiguously for better cache behavior. - m_gpuStiffness.resize(nbElem * nNodes * nNodes * dim * dim); + // Upload stiffness matrices in symmetric upper-triangle block format: + // Only blocks (ni, nj) with nj >= ni are stored. + // symIdx = ni * nNodes - ni*(ni-1)/2 + (nj - ni) + // K[symIdx * dim * dim + di * dim + dj] per element + constexpr auto nSymBlocks = nNodes * (nNodes + 1) / 2; + m_gpuStiffness.resize(nbElem * nSymBlocks * dim * dim); { auto* dst = m_gpuStiffness.hostWrite(); for (std::size_t e = 0; e < nbElem; ++e) { const auto& K = assembledMatrices[e]; for (unsigned int ni = 0; ni < nNodes; ++ni) - for (unsigned int nj = 0; nj < nNodes; ++nj) + { + const unsigned int diagIdx = ni * nNodes - ni * (ni - 1) / 2; + for (unsigned int nj = ni; nj < nNodes; ++nj) + { + const unsigned int symIdx = diagIdx + (nj - ni); for (unsigned int di = 0; di < dim; ++di) for (unsigned int dj = 0; dj < dim; ++dj) - dst[e * nNodes * nNodes * dim * dim - + (ni * nNodes + nj) * dim * dim + dst[e * nSymBlocks * dim * dim + + symIdx * dim * dim + di * dim + dj] = static_cast(K[ni * dim + di][nj * dim + dj]); + } + } } } From 5a2d2b983ab7ab49a65e7ef4e68d4835a9d97542 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Mon, 6 Apr 2026 15:18:55 +0900 Subject: [PATCH 06/21] add cuda version of ElementLinearSmallStrainFEMForceField --- .../plugins/SofaCUDA/Component/CMakeLists.txt | 4 + .../Component/src/SofaCUDA/component/init.cpp | 2 + ...aElementLinearSmallStrainFEMForceField.cpp | 70 +++++ ...daElementLinearSmallStrainFEMForceField.cu | 242 ++++++++++++++++++ ...udaElementLinearSmallStrainFEMForceField.h | 113 ++++++++ ...aElementLinearSmallStrainFEMForceField.inl | 195 ++++++++++++++ ...aElementLinearSmallStrainFEMForceField.scn | 35 +++ 7 files changed, 661 insertions(+) create mode 100644 
applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl create mode 100644 applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn diff --git a/applications/plugins/SofaCUDA/Component/CMakeLists.txt b/applications/plugins/SofaCUDA/Component/CMakeLists.txt index fbd83faf0e5..5ac492c4834 100644 --- a/applications/plugins/SofaCUDA/Component/CMakeLists.txt +++ b/applications/plugins/SofaCUDA/Component/CMakeLists.txt @@ -41,6 +41,8 @@ set(HEADER_FILES ### solidmechanics ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.h ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.inl ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/hyperelastic/CudaStandardTetrahedralFEMForceField.h @@ -114,6 +116,7 @@ set(SOURCE_FILES ### Solidmechanics 
${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.cpp ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/hyperelastic/CudaStandardTetrahedralFEMForceField.cpp ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/tensormass/CudaTetrahedralTensorMassForceField.cpp @@ -185,6 +188,7 @@ set(CUDA_SOURCES ### solidmechanics ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaHexahedronFEMForceField.cu ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/hyperelastic/CudaStandardTetrahedralFEMForceField.cu ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/tensormass/CudaTetrahedralTensorMassForceField.cu diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp index 0d890c64139..c6b0ee6b438 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/init.cpp @@ -91,6 +91,7 @@ extern void registerSphereForceField(sofa::core::ObjectFactory* factory); // component::solidmechanics::fem::elastic extern void registerElementCorotationalFEMForceField(sofa::core::ObjectFactory* factory); +extern void registerElementLinearSmallStrainFEMForceField(sofa::core::ObjectFactory* factory); extern void registerHexahedronFEMForceField(sofa::core::ObjectFactory* 
factory); extern void registerTetrahedronFEMForceField(sofa::core::ObjectFactory* factory); extern void registerTriangularFEMForceFieldOptim(sofa::core::ObjectFactory* factory); @@ -226,6 +227,7 @@ void registerObjects(sofa::core::ObjectFactory* factory) registerPlaneForceField(factory); registerSphereForceField(factory); registerElementCorotationalFEMForceField(factory); + registerElementLinearSmallStrainFEMForceField(factory); registerHexahedronFEMForceField(factory); registerTetrahedronFEMForceField(factory); registerTriangularFEMForceFieldOptim(factory); diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp new file mode 100644 index 00000000000..af802d29e95 --- /dev/null +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp @@ -0,0 +1,70 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . 
* +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#include + +#include +#include +#include + +namespace sofa::component::solidmechanics::fem::elastic +{ + +using namespace sofa::gpu::cuda; + +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; + +} // namespace sofa::component::solidmechanics::fem::elastic + +namespace sofa::gpu::cuda +{ + +void registerElementLinearSmallStrainFEMForceField(sofa::core::ObjectFactory* factory) +{ + using namespace sofa::component::solidmechanics::fem::elastic; + + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for EdgeLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for TriangleLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for QuadLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for TetrahedronLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + 
factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA for HexahedronLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); +} + +} // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu new file mode 100644 index 00000000000..4c474fe2d09 --- /dev/null +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu @@ -0,0 +1,242 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . 
* +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#include +#include +#include + +#if defined(__cplusplus) +namespace sofa +{ +namespace gpu +{ +namespace cuda +{ +#endif + +/** + * Kernel 1: Compute per-element dForce (1 thread per element). + * + * Templated on NNodes (compile-time) for full loop unrolling. + * Hardcoded Dim=3 (CudaVec3f only). + * + * No rotation matrices needed (linear small strain). + * Stiffness uses symmetric upper-triangle block storage: + * Only blocks (ni, nj) with nj >= ni are stored. + * NSymBlocks = NNodes*(NNodes+1)/2 blocks of 9 floats each. + */ +template +__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel( + int nbElem, + const int* __restrict__ elements, + const float* __restrict__ stiffness, + const float* __restrict__ dx, + float* __restrict__ eforce, + float kFactor) +{ + constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; + + const int elemId = blockIdx.x * blockDim.x + threadIdx.x; + if (elemId >= nbElem) return; + + // Gather dx for this element's nodes + float edx[NNodes * 3]; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const int nodeId = elements[n * nbElem + elemId]; + edx[n * 3 + 0] = dx[nodeId * 3 + 0]; + edx[n * 3 + 1] = dx[nodeId * 3 + 1]; + edx[n * 3 + 2] = dx[nodeId * 3 + 2]; + } + + // Symmetric block-matrix multiply: edf = K * edx + const float* K = stiffness + elemId * NSymBlocks * 9; + float edf[NNodes * 3]; + + #pragma unroll + for (int i = 0; i < NNodes * 3; ++i) + edf[i] = 0.0f; + + #pragma unroll + for (int ni = 0; ni < NNodes; ++ni) + { + const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; + + // Diagonal block (ni, ni): Kii * edx[ni] + { + const float* Kii = K + diagIdx * 9; + const float di0 = edx[ni * 3 + 0]; + const 
float di1 = edx[ni * 3 + 1]; + const float di2 = edx[ni * 3 + 2]; + edf[ni * 3 + 0] += Kii[0] * di0 + Kii[1] * di1 + Kii[2] * di2; + edf[ni * 3 + 1] += Kii[3] * di0 + Kii[4] * di1 + Kii[5] * di2; + edf[ni * 3 + 2] += Kii[6] * di0 + Kii[7] * di1 + Kii[8] * di2; + } + + // Off-diagonal blocks (ni, nj) for nj > ni + #pragma unroll + for (int nj = ni + 1; nj < NNodes; ++nj) + { + const int symIdx = diagIdx + (nj - ni); + const float* Kij = K + symIdx * 9; + + // Forward: edf[ni] += Kij * edx[nj] + { + const float dj0 = edx[nj * 3 + 0]; + const float dj1 = edx[nj * 3 + 1]; + const float dj2 = edx[nj * 3 + 2]; + edf[ni * 3 + 0] += Kij[0] * dj0 + Kij[1] * dj1 + Kij[2] * dj2; + edf[ni * 3 + 1] += Kij[3] * dj0 + Kij[4] * dj1 + Kij[5] * dj2; + edf[ni * 3 + 2] += Kij[6] * dj0 + Kij[7] * dj1 + Kij[8] * dj2; + } + + // Symmetric: edf[nj] += Kij^T * edx[ni] + { + const float di0 = edx[ni * 3 + 0]; + const float di1 = edx[ni * 3 + 1]; + const float di2 = edx[ni * 3 + 2]; + edf[nj * 3 + 0] += Kij[0] * di0 + Kij[3] * di1 + Kij[6] * di2; + edf[nj * 3 + 1] += Kij[1] * di0 + Kij[4] * di1 + Kij[7] * di2; + edf[nj * 3 + 2] += Kij[2] * di0 + Kij[5] * di1 + Kij[8] * di2; + } + } + } + + // Write: eforce = -kFactor * edf + float* out = eforce + elemId * NNodes * 3; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + out[n * 3 + 0] = -kFactor * edf[n * 3 + 0]; + out[n * 3 + 1] = -kFactor * edf[n * 3 + 1]; + out[n * 3 + 2] = -kFactor * edf[n * 3 + 2]; + } +} + +/** + * Kernel 2: Gather per-vertex forces (1 thread per vertex). + * + * No atomics: each vertex handled by exactly one thread. + * velems is SoA: velems[s * nbVertex + vertexId], 0-terminated. + * Each entry is (elemId * NNodes + localNode + 1), with 0 as sentinel. 
+ */ +__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_gatherDForce_kernel( + int nbVertex, + int maxElemPerVertex, + const int* __restrict__ velems, + const float* __restrict__ eforce, + float* df) +{ + const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; + if (vertexId >= nbVertex) return; + + float fx = 0.0f, fy = 0.0f, fz = 0.0f; + + for (int s = 0; s < maxElemPerVertex; ++s) + { + const int idx = velems[s * nbVertex + vertexId]; + if (idx == 0) break; + const int base = (idx - 1) * 3; + fx += eforce[base + 0]; + fy += eforce[base + 1]; + fz += eforce[base + 2]; + } + + df[vertexId * 3 + 0] += fx; + df[vertexId * 3 + 1] += fy; + df[vertexId * 3 + 2] += fz; +} + +template +static void launchAddDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + float kFactor) +{ + const int computeThreads = 64; + const int gatherThreads = 256; + + { + const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel + <<>>( + nbElem, + (const int*)elements, + (const float*)stiffness, + (const float*)dx, + (float*)eforce, + kFactor); + mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel"); + } + + { + const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementLinearSmallStrainFEMForceFieldCuda3f_gatherDForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const float*)eforce, + (float*)df); + mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_gatherDForce_kernel"); + } +} + +extern "C" +{ + +void ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + 
const void* velems, + float kFactor) +{ + switch (nbNodesPerElem) + { + case 2: launchAddDForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 3: launchAddDForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 4: launchAddDForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 8: launchAddDForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + } +} + +} // extern "C" + +#if defined(__cplusplus) +} // namespace cuda +} // namespace gpu +} // namespace sofa +#endif diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h new file mode 100644 index 00000000000..67ae48abb48 --- /dev/null +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h @@ -0,0 +1,113 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. 
* +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . * +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#pragma once + +#include +#include + +namespace sofa::gpu::cuda +{ + +extern "C" +{ + void ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + float kFactor); +} + +} // namespace sofa::gpu::cuda + +namespace sofa::component::solidmechanics::fem::elastic +{ + +/** + * CUDA-accelerated version of ElementLinearSmallStrainFEMForceField. + * + * Works with any element type (Edge, Triangle, Quad, Tetrahedron, Hexahedron). + * The addDForce method (the CG hot path, called ~250 times per timestep) runs entirely on GPU. + * The addForce method delegates to the CPU parent. + * + * Uses a two-kernel approach for addDForce: + * Kernel 1: compute per-element forces (1 thread/element, fully unrolled) + * Kernel 2: gather per-vertex (1 thread/vertex, no atomics) + * + * Compared to the corotational version, no rotation matrices are needed. 
+ */ +template +class CudaElementLinearSmallStrainFEMForceField + : public ElementLinearSmallStrainFEMForceField +{ +public: + SOFA_CLASS( + SOFA_TEMPLATE2(CudaElementLinearSmallStrainFEMForceField, DataTypes, ElementType), + SOFA_TEMPLATE2(ElementLinearSmallStrainFEMForceField, DataTypes, ElementType)); + + using Real = sofa::Real_t; + using Coord = sofa::Coord_t; + using Deriv = sofa::Deriv_t; + using VecCoord = sofa::VecCoord_t; + using VecDeriv = sofa::VecDeriv_t; + + static const std::string GetCustomClassName() + { + return ElementLinearSmallStrainFEMForceField::GetCustomClassName(); + } + + static const std::string GetCustomTemplateName() + { + return DataTypes::Name(); + } + + void init() override; + + void addDForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& df, + const sofa::DataVecDeriv_t& dx) override; + +protected: + + CudaElementLinearSmallStrainFEMForceField() = default; + + void uploadStiffnessAndConnectivity(); + + gpu::cuda::CudaVector m_gpuStiffness; ///< Symmetric block-format stiffness per element + gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] + gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer + gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated + + unsigned int m_maxElemPerVertex = 0; + unsigned int m_nbVertices = 0; + + bool m_gpuDataUploaded = false; +}; + +} // namespace sofa::component::solidmechanics::fem::elastic diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl new file mode 100644 index 00000000000..95bc4519ed9 --- /dev/null +++ 
b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl @@ -0,0 +1,195 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * +* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . * +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#pragma once +#include +#include +#include +#include + +namespace sofa::component::solidmechanics::fem::elastic +{ + +template +void CudaElementLinearSmallStrainFEMForceField::init() +{ + ElementLinearSmallStrainFEMForceField::init(); + + if (!this->isComponentStateInvalid()) + { + uploadStiffnessAndConnectivity(); + } +} + +template +void CudaElementLinearSmallStrainFEMForceField::uploadStiffnessAndConnectivity() +{ + using trait = sofa::component::solidmechanics::fem::elastic::trait; + + if (!this->l_topology) return; + + const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); + const auto& assembledMatrices = this->m_assembledStiffnessMatrices; + + const auto nbElem = 
elements.size(); + constexpr auto nNodes = trait::NumberOfNodesInElement; + constexpr auto dim = trait::spatial_dimensions; + + // Find number of vertices + unsigned int maxNodeId = 0; + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& element = elements[e]; + for (unsigned int n = 0; n < nNodes; ++n) + { + if (static_cast(element[n]) > maxNodeId) + maxNodeId = static_cast(element[n]); + } + } + m_nbVertices = maxNodeId + 1; + + // Upload stiffness matrices in symmetric upper-triangle block format: + // Only blocks (ni, nj) with nj >= ni are stored. + // symIdx = ni * nNodes - ni*(ni-1)/2 + (nj - ni) + // K[symIdx * dim * dim + di * dim + dj] per element + constexpr auto nSymBlocks = nNodes * (nNodes + 1) / 2; + m_gpuStiffness.resize(nbElem * nSymBlocks * dim * dim); + { + auto* dst = m_gpuStiffness.hostWrite(); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& K = assembledMatrices[e]; + for (unsigned int ni = 0; ni < nNodes; ++ni) + { + const unsigned int diagIdx = ni * nNodes - ni * (ni - 1) / 2; + for (unsigned int nj = ni; nj < nNodes; ++nj) + { + const unsigned int symIdx = diagIdx + (nj - ni); + for (unsigned int di = 0; di < dim; ++di) + for (unsigned int dj = 0; dj < dim; ++dj) + dst[e * nSymBlocks * dim * dim + + symIdx * dim * dim + + di * dim + dj] + = static_cast(K[ni * dim + di][nj * dim + dj]); + } + } + } + } + + // Upload element connectivity in SoA layout: + // elements[nodeIdx * nbElem + elemId] = global node index + m_gpuElements.resize(nNodes * nbElem); + { + auto* dst = m_gpuElements.hostWrite(); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& element = elements[e]; + for (unsigned int n = 0; n < nNodes; ++n) + dst[n * nbElem + e] = static_cast(element[n]); + } + } + + // Build vertex-to-element mapping (velems) + std::vector> vertexElems(m_nbVertices); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& element = elements[e]; + for (unsigned int n = 0; n < nNodes; ++n) + { + const int nodeId = 
static_cast(element[n]); + vertexElems[nodeId].push_back( + static_cast(e * nNodes + n + 1)); + } + } + + m_maxElemPerVertex = 0; + for (const auto& ve : vertexElems) + { + if (ve.size() > m_maxElemPerVertex) + m_maxElemPerVertex = static_cast(ve.size()); + } + + m_gpuVelems.resize(m_maxElemPerVertex * m_nbVertices); + { + auto* dst = m_gpuVelems.hostWrite(); + std::memset(dst, 0, m_maxElemPerVertex * m_nbVertices * sizeof(int)); + for (std::size_t v = 0; v < m_nbVertices; ++v) + { + for (std::size_t s = 0; s < vertexElems[v].size(); ++s) + dst[s * m_nbVertices + v] = vertexElems[v][s]; + } + } + + // Allocate intermediate per-element force buffer + m_gpuElementForce.resize(nbElem * nNodes * dim); + + m_gpuDataUploaded = true; +} + +template +void CudaElementLinearSmallStrainFEMForceField::addDForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& d_df, + const sofa::DataVecDeriv_t& d_dx) +{ + if (this->isComponentStateInvalid()) + return; + + if (!m_gpuDataUploaded) + { + // Fallback to CPU if GPU data not ready + ElementLinearSmallStrainFEMForceField::addDForce(mparams, d_df, d_dx); + return; + } + + using trait = sofa::component::solidmechanics::fem::elastic::trait; + + VecDeriv& df = *d_df.beginEdit(); + const VecDeriv& dx = d_dx.getValue(); + + if (df.size() < dx.size()) + df.resize(dx.size()); + + const auto kFactor = static_cast( + sofa::core::mechanicalparams::kFactorIncludingRayleighDamping( + mparams, this->rayleighStiffness.getValue())); + + const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); + const auto nbElem = static_cast(elements.size()); + const auto nbVertex = static_cast(dx.size()); + + gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( + nbElem, + nbVertex, + trait::NumberOfNodesInElement, + m_maxElemPerVertex, + m_gpuElements.deviceRead(), + m_gpuStiffness.deviceRead(), + dx.deviceRead(), + df.deviceWrite(), + m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead(), + 
kFactor); + + d_df.endEdit(); +} + +} // namespace sofa::component::solidmechanics::fem::elastic diff --git a/applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn b/applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn new file mode 100644 index 00000000000..c59fb6a6c2a --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 566389d0823eebb2359fee5b9f3710339899e08f Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Tue, 7 Apr 2026 08:43:32 +0900 Subject: [PATCH 07/21] add cuda version of addforce for ElementLinearSmallStrainFEMForceField --- ...daElementLinearSmallStrainFEMForceField.cu | 210 +++++++++++++++--- ...udaElementLinearSmallStrainFEMForceField.h | 22 +- ...aElementLinearSmallStrainFEMForceField.inl | 47 ++++ 3 files changed, 245 insertions(+), 34 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu index 4c474fe2d09..39c67a27db4 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu @@ -33,15 +33,105 @@ namespace cuda #endif /** - * Kernel 1: Compute per-element dForce (1 thread per element). + * Kernel for addForce: Compute per-element force from displacement (1 thread per element). * + * f = -K * (x - x0) * Templated on NNodes (compile-time) for full loop unrolling. * Hardcoded Dim=3 (CudaVec3f only). 
+ */ +template +__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( + int nbElem, + const int* __restrict__ elements, + const float* __restrict__ stiffness, + const float* __restrict__ x, + const float* __restrict__ x0, + float* __restrict__ eforce) +{ + constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; + + const int elemId = blockIdx.x * blockDim.x + threadIdx.x; + if (elemId >= nbElem) return; + + // Gather displacement = x - x0 for this element's nodes + float disp[NNodes * 3]; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const int nodeId = elements[n * nbElem + elemId]; + disp[n * 3 + 0] = x[nodeId * 3 + 0] - x0[nodeId * 3 + 0]; + disp[n * 3 + 1] = x[nodeId * 3 + 1] - x0[nodeId * 3 + 1]; + disp[n * 3 + 2] = x[nodeId * 3 + 2] - x0[nodeId * 3 + 2]; + } + + // Symmetric block-matrix multiply: edf = K * disp + const float* K = stiffness + elemId * NSymBlocks * 9; + float edf[NNodes * 3]; + + #pragma unroll + for (int i = 0; i < NNodes * 3; ++i) + edf[i] = 0.0f; + + #pragma unroll + for (int ni = 0; ni < NNodes; ++ni) + { + const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; + + // Diagonal block + { + const float* Kii = K + diagIdx * 9; + const float di0 = disp[ni * 3 + 0]; + const float di1 = disp[ni * 3 + 1]; + const float di2 = disp[ni * 3 + 2]; + edf[ni * 3 + 0] += Kii[0] * di0 + Kii[1] * di1 + Kii[2] * di2; + edf[ni * 3 + 1] += Kii[3] * di0 + Kii[4] * di1 + Kii[5] * di2; + edf[ni * 3 + 2] += Kii[6] * di0 + Kii[7] * di1 + Kii[8] * di2; + } + + // Off-diagonal blocks + #pragma unroll + for (int nj = ni + 1; nj < NNodes; ++nj) + { + const int symIdx = diagIdx + (nj - ni); + const float* Kij = K + symIdx * 9; + + { + const float dj0 = disp[nj * 3 + 0]; + const float dj1 = disp[nj * 3 + 1]; + const float dj2 = disp[nj * 3 + 2]; + edf[ni * 3 + 0] += Kij[0] * dj0 + Kij[1] * dj1 + Kij[2] * dj2; + edf[ni * 3 + 1] += Kij[3] * dj0 + Kij[4] * dj1 + Kij[5] * dj2; + edf[ni * 3 + 2] += Kij[6] * dj0 + Kij[7] * dj1 + Kij[8] * dj2; 
+ } + + { + const float di0 = disp[ni * 3 + 0]; + const float di1 = disp[ni * 3 + 1]; + const float di2 = disp[ni * 3 + 2]; + edf[nj * 3 + 0] += Kij[0] * di0 + Kij[3] * di1 + Kij[6] * di2; + edf[nj * 3 + 1] += Kij[1] * di0 + Kij[4] * di1 + Kij[7] * di2; + edf[nj * 3 + 2] += Kij[2] * di0 + Kij[5] * di1 + Kij[8] * di2; + } + } + } + + // Write: eforce = -edf (minus sign from f -= K * displacement) + float* out = eforce + elemId * NNodes * 3; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + out[n * 3 + 0] = -edf[n * 3 + 0]; + out[n * 3 + 1] = -edf[n * 3 + 1]; + out[n * 3 + 2] = -edf[n * 3 + 2]; + } +} + +/** + * Kernel for addDForce: Compute per-element dForce (1 thread per element). * - * No rotation matrices needed (linear small strain). - * Stiffness uses symmetric upper-triangle block storage: - * Only blocks (ni, nj) with nj >= ni are stored. - * NSymBlocks = NNodes*(NNodes+1)/2 blocks of 9 floats each. + * df = -kFactor * K * dx + * Templated on NNodes (compile-time) for full loop unrolling. + * Hardcoded Dim=3 (CudaVec3f only). */ template __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel( @@ -133,13 +223,14 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel } /** - * Kernel 2: Gather per-vertex forces (1 thread per vertex). + * Gather per-vertex forces (1 thread per vertex). * + * Shared by both addForce and addDForce. * No atomics: each vertex handled by exactly one thread. * velems is SoA: velems[s * nbVertex + vertexId], 0-terminated. * Each entry is (elemId * NNodes + localNode + 1), with 0 as sentinel. 
*/ -__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_gatherDForce_kernel( +__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_gatherForce_kernel( int nbVertex, int maxElemPerVertex, const int* __restrict__ velems, @@ -166,6 +257,53 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_gatherDForce_kernel( df[vertexId * 3 + 2] += fz; } +static void launchGather( + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* velems, + const void* eforce, + void* f) +{ + const int gatherThreads = 256; + const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementLinearSmallStrainFEMForceFieldCuda3f_gatherForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const float*)eforce, + (float*)f); + mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_gatherForce_kernel"); +} + +template +static void launchAddForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems) +{ + const int computeThreads = 64; + const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel + <<>>( + nbElem, + (const int*)elements, + (const float*)stiffness, + (const float*)x, + (const float*)x0, + (float*)eforce); + mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel"); + + launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); +} + template static void launchAddDForce( unsigned int nbElem, @@ -180,37 +318,45 @@ static void launchAddDForce( float kFactor) { const int computeThreads = 64; - const int gatherThreads = 256; - - { - const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel - <<>>( - nbElem, - (const int*)elements, - (const float*)stiffness, - (const 
float*)dx, - (float*)eforce, - kFactor); - mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel"); - } + const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel + <<>>( + nbElem, + (const int*)elements, + (const float*)stiffness, + (const float*)dx, + (float*)eforce, + kFactor); + mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel"); - { - const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementLinearSmallStrainFEMForceFieldCuda3f_gatherDForce_kernel - <<>>( - nbVertex, - maxElemPerVertex, - (const int*)velems, - (const float*)eforce, - (float*)df); - mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_gatherDForce_kernel"); - } + launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); } extern "C" { +void ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems) +{ + switch (nbNodesPerElem) + { + case 2: launchAddForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 3: launchAddForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 4: launchAddForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 8: launchAddForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + } +} + void ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( unsigned int nbElem, unsigned int nbVertex, diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h 
b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h index 67ae48abb48..53cfaf663c5 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h @@ -29,6 +29,19 @@ namespace sofa::gpu::cuda extern "C" { + void ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems); + void ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( unsigned int nbElem, unsigned int nbVertex, @@ -52,8 +65,7 @@ namespace sofa::component::solidmechanics::fem::elastic * CUDA-accelerated version of ElementLinearSmallStrainFEMForceField. * * Works with any element type (Edge, Triangle, Quad, Tetrahedron, Hexahedron). - * The addDForce method (the CG hot path, called ~250 times per timestep) runs entirely on GPU. - * The addForce method delegates to the CPU parent. + * Both addForce and addDForce run entirely on GPU. 
* * Uses a two-kernel approach for addDForce: * Kernel 1: compute per-element forces (1 thread/element, fully unrolled) @@ -88,6 +100,12 @@ class CudaElementLinearSmallStrainFEMForceField void init() override; + void addForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& f, + const sofa::DataVecCoord_t& x, + const sofa::DataVecDeriv_t& v) override; + void addDForce( const sofa::core::MechanicalParams* mparams, sofa::DataVecDeriv_t& df, diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl index 95bc4519ed9..1ab9dfb33f5 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl @@ -144,6 +144,53 @@ void CudaElementLinearSmallStrainFEMForceField::uploadSt m_gpuDataUploaded = true; } +template +void CudaElementLinearSmallStrainFEMForceField::addForce( + const sofa::core::MechanicalParams* mparams, + sofa::DataVecDeriv_t& d_f, + const sofa::DataVecCoord_t& d_x, + const sofa::DataVecDeriv_t& d_v) +{ + if (this->isComponentStateInvalid()) + return; + + if (!m_gpuDataUploaded) + { + ElementLinearSmallStrainFEMForceField::addForce(mparams, d_f, d_x, d_v); + return; + } + + using trait = sofa::component::solidmechanics::fem::elastic::trait; + + VecDeriv& f = *d_f.beginEdit(); + const VecCoord& x = d_x.getValue(); + + if (f.size() < x.size()) + f.resize(x.size()); + + auto restPositionAccessor = this->mstate->readRestPositions(); + const VecCoord& x0 = restPositionAccessor.ref(); + + const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); + const auto nbElem = 
static_cast(elements.size()); + const auto nbVertex = static_cast(x.size()); + + gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( + nbElem, + nbVertex, + trait::NumberOfNodesInElement, + m_maxElemPerVertex, + m_gpuElements.deviceRead(), + m_gpuStiffness.deviceRead(), + x.deviceRead(), + x0.deviceRead(), + f.deviceWrite(), + m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + + d_f.endEdit(); +} + template void CudaElementLinearSmallStrainFEMForceField::addDForce( const sofa::core::MechanicalParams* mparams, From 27c0bc75a87cbe148a4bb9c9f96d85baf2337e71 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Tue, 7 Apr 2026 09:01:32 +0900 Subject: [PATCH 08/21] add cuda version of addforce for ElementCorotationalFEMForceField --- .../CudaElementCorotationalFEMForceField.cu | 332 +++++++++++++----- .../CudaElementCorotationalFEMForceField.h | 14 + .../CudaElementCorotationalFEMForceField.inl | 53 ++- 3 files changed, 304 insertions(+), 95 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index 52ec3af12af..4a2ff9028b9 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -33,18 +33,163 @@ namespace cuda #endif /** - * Kernel 1: Compute per-element dForce (1 thread per element). + * Symmetric block-matrix multiply: out = K * in + * K stored as upper triangle: NSymBlocks = NNodes*(NNodes+1)/2 blocks of 9 floats. + * Inline device function shared by both addForce and addDForce kernels. 
+ */ +template +__device__ void symBlockMatMul(const float* K, const float* in, float* out) +{ + constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; + + #pragma unroll + for (int i = 0; i < NNodes * 3; ++i) + out[i] = 0.0f; + + #pragma unroll + for (int ni = 0; ni < NNodes; ++ni) + { + const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; + + // Diagonal block + { + const float* Kii = K + diagIdx * 9; + const float i0 = in[ni * 3 + 0]; + const float i1 = in[ni * 3 + 1]; + const float i2 = in[ni * 3 + 2]; + out[ni * 3 + 0] += Kii[0] * i0 + Kii[1] * i1 + Kii[2] * i2; + out[ni * 3 + 1] += Kii[3] * i0 + Kii[4] * i1 + Kii[5] * i2; + out[ni * 3 + 2] += Kii[6] * i0 + Kii[7] * i1 + Kii[8] * i2; + } + + // Off-diagonal blocks + #pragma unroll + for (int nj = ni + 1; nj < NNodes; ++nj) + { + const int symIdx = diagIdx + (nj - ni); + const float* Kij = K + symIdx * 9; + + // Forward: out[ni] += Kij * in[nj] + { + const float j0 = in[nj * 3 + 0]; + const float j1 = in[nj * 3 + 1]; + const float j2 = in[nj * 3 + 2]; + out[ni * 3 + 0] += Kij[0] * j0 + Kij[1] * j1 + Kij[2] * j2; + out[ni * 3 + 1] += Kij[3] * j0 + Kij[4] * j1 + Kij[5] * j2; + out[ni * 3 + 2] += Kij[6] * j0 + Kij[7] * j1 + Kij[8] * j2; + } + + // Symmetric: out[nj] += Kij^T * in[ni] + { + const float i0 = in[ni * 3 + 0]; + const float i1 = in[ni * 3 + 1]; + const float i2 = in[ni * 3 + 2]; + out[nj * 3 + 0] += Kij[0] * i0 + Kij[3] * i1 + Kij[6] * i2; + out[nj * 3 + 1] += Kij[1] * i0 + Kij[4] * i1 + Kij[7] * i2; + out[nj * 3 + 2] += Kij[2] * i0 + Kij[5] * i1 + Kij[8] * i2; + } + } + } +} + +/** + * Kernel for addForce: Compute per-element force (1 thread per element). * - * Templated on NNodes (compile-time) for full loop unrolling. - * Hardcoded Dim=3 (CudaVec3f only). 
+ * displacement[j] = R^T * (x[j] - centroid_x) - (x0[j] - centroid_x0) + * elementForce = K * displacement + * out[j] = -R * elementForce[j] + */ +template +__global__ void ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel( + int nbElem, + const int* __restrict__ elements, + const float* __restrict__ rotations, + const float* __restrict__ stiffness, + const float* __restrict__ x, + const float* __restrict__ x0, + float* __restrict__ eforce) +{ + constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; + constexpr float invN = 1.0f / NNodes; + + const int elemId = blockIdx.x * blockDim.x + threadIdx.x; + if (elemId >= nbElem) return; + + // Load rotation matrix R (3x3, row-major) + const float* Rptr = rotations + elemId * 9; + float R[9]; + #pragma unroll + for (int i = 0; i < 9; ++i) + R[i] = Rptr[i]; + + // Gather node positions and rest positions + float ex[NNodes * 3], ex0[NNodes * 3]; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const int nodeId = elements[n * nbElem + elemId]; + ex[n * 3 + 0] = x[nodeId * 3 + 0]; + ex[n * 3 + 1] = x[nodeId * 3 + 1]; + ex[n * 3 + 2] = x[nodeId * 3 + 2]; + ex0[n * 3 + 0] = x0[nodeId * 3 + 0]; + ex0[n * 3 + 1] = x0[nodeId * 3 + 1]; + ex0[n * 3 + 2] = x0[nodeId * 3 + 2]; + } + + // Compute centroids + float cx = 0.0f, cy = 0.0f, cz = 0.0f; + float cx0 = 0.0f, cy0 = 0.0f, cz0 = 0.0f; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + cx += ex[n * 3 + 0]; cy += ex[n * 3 + 1]; cz += ex[n * 3 + 2]; + cx0 += ex0[n * 3 + 0]; cy0 += ex0[n * 3 + 1]; cz0 += ex0[n * 3 + 2]; + } + cx *= invN; cy *= invN; cz *= invN; + cx0 *= invN; cy0 *= invN; cz0 *= invN; + + // Compute displacement: disp[j] = R^T * (x[j] - centroid) - (x0[j] - centroid0) + float disp[NNodes * 3]; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const float dx = ex[n * 3 + 0] - cx; + const float dy = ex[n * 3 + 1] - cy; + const float dz = ex[n * 3 + 2] - cz; + + // R^T * (x - centroid) + const float rx = R[0] * dx + R[3] * dy + R[6] * dz; 
+ const float ry = R[1] * dx + R[4] * dy + R[7] * dz; + const float rz = R[2] * dx + R[5] * dy + R[8] * dz; + + disp[n * 3 + 0] = rx - (ex0[n * 3 + 0] - cx0); + disp[n * 3 + 1] = ry - (ex0[n * 3 + 1] - cy0); + disp[n * 3 + 2] = rz - (ex0[n * 3 + 2] - cz0); + } + + // edf = K * disp + float edf[NNodes * 3]; + const float* K = stiffness + elemId * NSymBlocks * 9; + symBlockMatMul(K, disp, edf); + + // Rotate back and write: out = -R * edf + float* out = eforce + elemId * NNodes * 3; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const float e0 = edf[n * 3 + 0]; + const float e1 = edf[n * 3 + 1]; + const float e2 = edf[n * 3 + 2]; + out[n * 3 + 0] = -(R[0] * e0 + R[1] * e1 + R[2] * e2); + out[n * 3 + 1] = -(R[3] * e0 + R[4] * e1 + R[5] * e2); + out[n * 3 + 2] = -(R[6] * e0 + R[7] * e1 + R[8] * e2); + } +} + +/** + * Kernel for addDForce: Compute per-element dForce (1 thread per element). * - * Connectivity is SoA: elements[nodeIdx * nbElem + elemId]. - * Stiffness uses symmetric upper-triangle block storage: - * Only blocks (ni, nj) with nj >= ni are stored. - * NSymBlocks = NNodes*(NNodes+1)/2 blocks of 9 floats each. - * Each off-diagonal block is read once and used for both - * forward (edf[ni] += Kij * rdx[nj]) and symmetric - * (edf[nj] += Kij^T * rdx[ni]) contributions. 
+ * rdx = R^T * dx, edf = K * rdx, out = -kFactor * R * edf */ template __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( @@ -84,59 +229,9 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( } // Symmetric block-matrix multiply: edf = K * rdx - // K stored as upper triangle: blocks (ni, nj) for nj >= ni const float* K = stiffness + elemId * NSymBlocks * 9; float edf[NNodes * 3]; - - #pragma unroll - for (int i = 0; i < NNodes * 3; ++i) - edf[i] = 0.0f; - - #pragma unroll - for (int ni = 0; ni < NNodes; ++ni) - { - // symIdx for (ni, ni) = ni*NNodes - ni*(ni-1)/2 - const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; - - // Diagonal block (ni, ni): Kii * rdx[ni] - { - const float* Kii = K + diagIdx * 9; - const float ri0 = rdx[ni * 3 + 0]; - const float ri1 = rdx[ni * 3 + 1]; - const float ri2 = rdx[ni * 3 + 2]; - edf[ni * 3 + 0] += Kii[0] * ri0 + Kii[1] * ri1 + Kii[2] * ri2; - edf[ni * 3 + 1] += Kii[3] * ri0 + Kii[4] * ri1 + Kii[5] * ri2; - edf[ni * 3 + 2] += Kii[6] * ri0 + Kii[7] * ri1 + Kii[8] * ri2; - } - - // Off-diagonal blocks (ni, nj) for nj > ni - #pragma unroll - for (int nj = ni + 1; nj < NNodes; ++nj) - { - const int symIdx = diagIdx + (nj - ni); - const float* Kij = K + symIdx * 9; - - // Forward: edf[ni] += Kij * rdx[nj] - { - const float rj0 = rdx[nj * 3 + 0]; - const float rj1 = rdx[nj * 3 + 1]; - const float rj2 = rdx[nj * 3 + 2]; - edf[ni * 3 + 0] += Kij[0] * rj0 + Kij[1] * rj1 + Kij[2] * rj2; - edf[ni * 3 + 1] += Kij[3] * rj0 + Kij[4] * rj1 + Kij[5] * rj2; - edf[ni * 3 + 2] += Kij[6] * rj0 + Kij[7] * rj1 + Kij[8] * rj2; - } - - // Symmetric: edf[nj] += Kij^T * rdx[ni] - { - const float ri0 = rdx[ni * 3 + 0]; - const float ri1 = rdx[ni * 3 + 1]; - const float ri2 = rdx[ni * 3 + 2]; - edf[nj * 3 + 0] += Kij[0] * ri0 + Kij[3] * ri1 + Kij[6] * ri2; - edf[nj * 3 + 1] += Kij[1] * ri0 + Kij[4] * ri1 + Kij[7] * ri2; - edf[nj * 3 + 2] += Kij[2] * ri0 + Kij[5] * ri1 + Kij[8] * ri2; - } - } - } + 
symBlockMatMul(K, rdx, edf); // Rotate back and write: eforce = -kFactor * R * edf float* out = eforce + elemId * NNodes * 3; @@ -153,13 +248,14 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( } /** - * Kernel 2: Gather per-vertex forces (1 thread per vertex). + * Gather per-vertex forces (1 thread per vertex). * + * Shared by addForce and addDForce. * No atomics: each vertex handled by exactly one thread. * velems is SoA: velems[s * nbVertex + vertexId], 0-terminated. * Each entry is (elemId * NNodes + localNode + 1), with 0 as sentinel. */ -__global__ void ElementCorotationalFEMForceFieldCuda3f_gatherDForce_kernel( +__global__ void ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel( int nbVertex, int maxElemPerVertex, const int* __restrict__ velems, @@ -186,6 +282,55 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_gatherDForce_kernel( df[vertexId * 3 + 2] += fz; } +static void launchGather( + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* velems, + const void* eforce, + void* f) +{ + const int gatherThreads = 256; + const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const float*)eforce, + (float*)f); + mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel"); +} + +template +static void launchAddForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems) +{ + const int computeThreads = 64; + const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel + <<>>( + nbElem, + (const int*)elements, + (const float*)rotations, + (const float*)stiffness, + (const float*)x, + (const 
float*)x0, + (float*)eforce); + mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel"); + + launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); +} + template static void launchAddDForce( unsigned int nbElem, @@ -201,38 +346,47 @@ static void launchAddDForce( float kFactor) { const int computeThreads = 64; - const int gatherThreads = 256; + const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel + <<>>( + nbElem, + (const int*)elements, + (const float*)rotations, + (const float*)stiffness, + (const float*)dx, + (float*)eforce, + kFactor); + mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel"); - { - const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel - <<>>( - nbElem, - (const int*)elements, - (const float*)rotations, - (const float*)stiffness, - (const float*)dx, - (float*)eforce, - kFactor); - mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel"); - } - - { - const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementCorotationalFEMForceFieldCuda3f_gatherDForce_kernel - <<>>( - nbVertex, - maxElemPerVertex, - (const int*)velems, - (const float*)eforce, - (float*)df); - mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_gatherDForce_kernel"); - } + launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); } extern "C" { +void ElementCorotationalFEMForceFieldCuda3f_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems) +{ + switch (nbNodesPerElem) + { + case 2: launchAddForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; 
+ case 3: launchAddForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 4: launchAddForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 8: launchAddForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + } +} + void ElementCorotationalFEMForceFieldCuda3f_addDForce( unsigned int nbElem, unsigned int nbVertex, diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h index c5220a2f2be..74a9adc1a5b 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h @@ -29,6 +29,20 @@ namespace sofa::gpu::cuda extern "C" { + void ElementCorotationalFEMForceFieldCuda3f_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems); + void ElementCorotationalFEMForceFieldCuda3f_addDForce( unsigned int nbElem, unsigned int nbVertex, diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index 1e1093758a9..6cd70af4c7c 100644 --- 
a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -176,15 +176,56 @@ void CudaElementCorotationalFEMForceField::uploadRotatio template void CudaElementCorotationalFEMForceField::addForce( const sofa::core::MechanicalParams* mparams, - sofa::DataVecDeriv_t& f, - const sofa::DataVecCoord_t& x, - const sofa::DataVecDeriv_t& v) + sofa::DataVecDeriv_t& d_f, + const sofa::DataVecCoord_t& d_x, + const sofa::DataVecDeriv_t& d_v) { - // Run on CPU: computes rotations and forces - ElementCorotationalFEMForceField::addForce(mparams, f, x, v); + if (this->isComponentStateInvalid()) + return; + + if (!m_gpuDataUploaded) + { + ElementCorotationalFEMForceField::addForce(mparams, d_f, d_x, d_v); + uploadRotations(); + return; + } - // Upload the freshly-computed rotations to GPU for subsequent addDForce calls + using trait = sofa::component::solidmechanics::fem::elastic::trait; + + const VecCoord& x = d_x.getValue(); + auto restPositionAccessor = this->mstate->readRestPositions(); + const VecCoord& x0 = restPositionAccessor.ref(); + + // Compute rotations on CPU (polar decomposition cannot run on GPU) + this->computeRotations(this->m_rotations, x, x0); + + // Upload rotations to GPU uploadRotations(); + + // Run force computation on GPU + VecDeriv& f = *d_f.beginEdit(); + if (f.size() < x.size()) + f.resize(x.size()); + + const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); + const auto nbElem = static_cast(elements.size()); + const auto nbVertex = static_cast(x.size()); + + gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForce( + nbElem, + nbVertex, + trait::NumberOfNodesInElement, + m_maxElemPerVertex, + m_gpuElements.deviceRead(), + m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), + x.deviceRead(), + 
x0.deviceRead(), + f.deviceWrite(), + m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + + d_f.endEdit(); } template From d6eca76bed2b138af810c56cd4e50a6493002c9b Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Tue, 7 Apr 2026 09:13:11 +0900 Subject: [PATCH 09/21] update addforce to compute everything on GPU --- .../CudaElementCorotationalFEMForceField.cu | 278 +++++++++++++++++- .../CudaElementCorotationalFEMForceField.h | 28 +- .../CudaElementCorotationalFEMForceField.inl | 95 ++++-- 3 files changed, 371 insertions(+), 30 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index 4a2ff9028b9..fd31eeab487 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -32,16 +32,128 @@ namespace cuda { #endif +/** + * Device helper: 3x3 matrix multiply C = A * B (row-major) + */ +__device__ void mat3Mul(const float* A, const float* B, float* C) +{ + #pragma unroll + for (int i = 0; i < 3; ++i) + { + #pragma unroll + for (int j = 0; j < 3; ++j) + { + C[i * 3 + j] = A[i * 3 + 0] * B[0 * 3 + j] + + A[i * 3 + 1] * B[1 * 3 + j] + + A[i * 3 + 2] * B[2 * 3 + j]; + } + } +} + +/** + * Device helper: C = A * B^T (row-major) + */ +__device__ void mat3MulTranspose(const float* A, const float* BT, float* C) +{ + #pragma unroll + for (int i = 0; i < 3; ++i) + { + #pragma unroll + for (int j = 0; j < 3; ++j) + { + C[i * 3 + j] = A[i * 3 + 0] * BT[j * 3 + 0] + + A[i * 3 + 1] * BT[j * 3 + 1] + + A[i * 3 + 2] * BT[j * 3 + 2]; + } + } +} + +/** + * Device helper: compute rotation frame from first 3 nodes 
(TriangleRotation). + * Used for Triangle (NNodes=3) and Tetrahedron (NNodes=4) elements. + * ex is [NNodes*3] array of gathered node positions. + */ +__device__ void computeTriangleFrame(const float* ex, float* frame) +{ + // xAxis = normalize(p1 - p0) + float ax = ex[3] - ex[0], ay = ex[4] - ex[1], az = ex[5] - ex[2]; + float invLen = rsqrtf(ax * ax + ay * ay + az * az); + ax *= invLen; ay *= invLen; az *= invLen; + + // tmp yAxis = p2 - p0 + float bx = ex[6] - ex[0], by = ex[7] - ex[1], bz = ex[8] - ex[2]; + + // zAxis = normalize(cross(xAxis, tmpY)) + float cx = ay * bz - az * by; + float cy = az * bx - ax * bz; + float cz = ax * by - ay * bx; + invLen = rsqrtf(cx * cx + cy * cy + cz * cz); + cx *= invLen; cy *= invLen; cz *= invLen; + + // yAxis = cross(zAxis, xAxis) + bx = cy * az - cz * ay; + by = cz * ax - cx * az; + bz = cx * ay - cy * ax; + + // frame rows: [xAxis; yAxis; zAxis] + frame[0] = ax; frame[1] = ay; frame[2] = az; + frame[3] = bx; frame[4] = by; frame[5] = bz; + frame[6] = cx; frame[7] = cy; frame[8] = cz; +} + +/** + * Device helper: compute rotation frame from 8 hexahedron nodes (HexahedronRotation). + * ex is [8*3] array of gathered node positions. 
+ */ +__device__ void computeHexahedronFrame(const float* ex, float* frame) +{ + // Average edge vectors + // xAxis_avg = ((n1-n0) + (n2-n3) + (n5-n4) + (n6-n7)) * 0.25 + float ax = ((ex[1*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[3*3+0]) + + (ex[5*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[7*3+0])) * 0.25f; + float ay = ((ex[1*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[3*3+1]) + + (ex[5*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[7*3+1])) * 0.25f; + float az = ((ex[1*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[3*3+2]) + + (ex[5*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[7*3+2])) * 0.25f; + + // yAxis_avg = ((n3-n0) + (n2-n1) + (n7-n4) + (n6-n5)) * 0.25 + float bx = ((ex[3*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[1*3+0]) + + (ex[7*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[5*3+0])) * 0.25f; + float by = ((ex[3*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[1*3+1]) + + (ex[7*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[5*3+1])) * 0.25f; + float bz = ((ex[3*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[1*3+2]) + + (ex[7*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[5*3+2])) * 0.25f; + + // Normalize xAxis + float invLen = rsqrtf(ax * ax + ay * ay + az * az); + ax *= invLen; ay *= invLen; az *= invLen; + + // zAxis = normalize(cross(xAxis, yAxis_avg)) + float cx = ay * bz - az * by; + float cy = az * bx - ax * bz; + float cz = ax * by - ay * bx; + invLen = rsqrtf(cx * cx + cy * cy + cz * cz); + cx *= invLen; cy *= invLen; cz *= invLen; + + // yAxis = cross(zAxis, xAxis) + bx = cy * az - cz * ay; + by = cz * ax - cx * az; + bz = cx * ay - cy * ax; + + // frame rows: [xAxis; yAxis; zAxis] + frame[0] = ax; frame[1] = ay; frame[2] = az; + frame[3] = bx; frame[4] = by; frame[5] = bz; + frame[6] = cx; frame[7] = cy; frame[8] = cz; +} + /** * Symmetric block-matrix multiply: out = K * in * K stored as upper triangle: NSymBlocks = NNodes*(NNodes+1)/2 blocks of 9 floats. - * Inline device function shared by both addForce and addDForce kernels. + * Inline device function shared by addForce, addDForce, and combined kernels. 
*/ template __device__ void symBlockMatMul(const float* K, const float* in, float* out) { - constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; - #pragma unroll for (int i = 0; i < NNodes * 3; ++i) out[i] = 0.0f; @@ -92,6 +204,111 @@ __device__ void symBlockMatMul(const float* K, const float* in, float* out) } } +/** + * Combined kernel: compute rotations AND per-element forces in one pass. + * + * Uses TriangleRotation for NNodes=3,4 and HexahedronRotation for NNodes=8. + * Computes: frame from node positions → R = frame * initRotTransposed + * Then: displacement = R^T*(x-centroid) - (x0-centroid0) → K*disp → -R*result + * Also writes R to rotations buffer for subsequent addDForce calls. + */ +template +__global__ void ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_kernel( + int nbElem, + const int* __restrict__ elements, + const float* __restrict__ initRotTransposed, + const float* __restrict__ stiffness, + const float* __restrict__ x, + const float* __restrict__ x0, + float* __restrict__ rotationsOut, + float* __restrict__ eforce) +{ + constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; + constexpr float invN = 1.0f / NNodes; + + const int elemId = blockIdx.x * blockDim.x + threadIdx.x; + if (elemId >= nbElem) return; + + // Gather node positions and rest positions + float ex[NNodes * 3], ex0[NNodes * 3]; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const int nodeId = elements[n * nbElem + elemId]; + ex[n * 3 + 0] = x[nodeId * 3 + 0]; + ex[n * 3 + 1] = x[nodeId * 3 + 1]; + ex[n * 3 + 2] = x[nodeId * 3 + 2]; + ex0[n * 3 + 0] = x0[nodeId * 3 + 0]; + ex0[n * 3 + 1] = x0[nodeId * 3 + 1]; + ex0[n * 3 + 2] = x0[nodeId * 3 + 2]; + } + + // Compute rotation frame from current positions + float frame[9]; + if constexpr (NNodes == 8) + computeHexahedronFrame(ex, frame); + else + computeTriangleFrame(ex, frame); + + // R = frame * initRotTransposed^T (i.e. 
frame.multTranspose(initRotTransposed)) + // Since initRotTransposed is already the transpose, R = frame * initRotTransposed^T + const float* irt = initRotTransposed + elemId * 9; + float R[9]; + mat3MulTranspose(frame, irt, R); + + // Write R to rotations buffer for addDForce + float* Rout = rotationsOut + elemId * 9; + #pragma unroll + for (int i = 0; i < 9; ++i) + Rout[i] = R[i]; + + // Compute centroids + float cx = 0.0f, cy = 0.0f, cz = 0.0f; + float cx0 = 0.0f, cy0 = 0.0f, cz0 = 0.0f; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + cx += ex[n * 3 + 0]; cy += ex[n * 3 + 1]; cz += ex[n * 3 + 2]; + cx0 += ex0[n * 3 + 0]; cy0 += ex0[n * 3 + 1]; cz0 += ex0[n * 3 + 2]; + } + cx *= invN; cy *= invN; cz *= invN; + cx0 *= invN; cy0 *= invN; cz0 *= invN; + + // Compute displacement: disp[j] = R^T * (x[j] - centroid) - (x0[j] - centroid0) + float disp[NNodes * 3]; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const float dx = ex[n * 3 + 0] - cx; + const float dy = ex[n * 3 + 1] - cy; + const float dz = ex[n * 3 + 2] - cz; + const float rx = R[0] * dx + R[3] * dy + R[6] * dz; + const float ry = R[1] * dx + R[4] * dy + R[7] * dz; + const float rz = R[2] * dx + R[5] * dy + R[8] * dz; + disp[n * 3 + 0] = rx - (ex0[n * 3 + 0] - cx0); + disp[n * 3 + 1] = ry - (ex0[n * 3 + 1] - cy0); + disp[n * 3 + 2] = rz - (ex0[n * 3 + 2] - cz0); + } + + // edf = K * disp + float edf[NNodes * 3]; + const float* K = stiffness + elemId * NSymBlocks * 9; + symBlockMatMul(K, disp, edf); + + // Rotate back and write: out = -R * edf + float* out = eforce + elemId * NNodes * 3; + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const float e0 = edf[n * 3 + 0]; + const float e1 = edf[n * 3 + 1]; + const float e2 = edf[n * 3 + 2]; + out[n * 3 + 0] = -(R[0] * e0 + R[1] * e1 + R[2] * e2); + out[n * 3 + 1] = -(R[3] * e0 + R[4] * e1 + R[5] * e2); + out[n * 3 + 2] = -(R[6] * e0 + R[7] * e1 + R[8] * e2); + } +} + /** * Kernel for addForce: Compute per-element force (1 thread 
per element). * @@ -301,6 +518,38 @@ static void launchGather( mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel"); } +template +static void launchAddForceWithRotations( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* initRotTransposed, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + void* rotationsOut, + const void* velems) +{ + const int computeThreads = 64; + const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_kernel + <<>>( + nbElem, + (const int*)elements, + (const float*)initRotTransposed, + (const float*)stiffness, + (const float*)x, + (const float*)x0, + (float*)rotationsOut, + (float*)eforce); + mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_kernel"); + + launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); +} + template static void launchAddForce( unsigned int nbElem, @@ -364,6 +613,29 @@ static void launchAddDForce( extern "C" { +void ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* initRotTransposed, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + void* rotationsOut, + const void* velems) +{ + switch (nbNodesPerElem) + { + case 3: launchAddForceWithRotations<3>(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; + case 4: launchAddForceWithRotations<4>(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; + case 8: launchAddForceWithRotations<8>(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, 
velems); break; + } +} + void ElementCorotationalFEMForceFieldCuda3f_addForce( unsigned int nbElem, unsigned int nbVertex, diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h index 74a9adc1a5b..e25f9a7b485 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h @@ -29,6 +29,21 @@ namespace sofa::gpu::cuda extern "C" { + void ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* initRotTransposed, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + void* rotationsOut, + const void* velems); + void ElementCorotationalFEMForceFieldCuda3f_addForce( unsigned int nbElem, unsigned int nbVertex, @@ -118,18 +133,21 @@ class CudaElementCorotationalFEMForceField void uploadStiffnessAndConnectivity(); void uploadRotations(); + void uploadInitialRotationsTransposed(); - gpu::cuda::CudaVector m_gpuStiffness; ///< Block-format stiffness: K[(ni*N+nj)*9 + di*3+dj] per element - gpu::cuda::CudaVector m_gpuRotations; ///< Flat 3x3 rotation matrices per element - gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] - gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer - gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated + gpu::cuda::CudaVector m_gpuStiffness; ///< Symmetric block-format stiffness per element + gpu::cuda::CudaVector 
m_gpuRotations; ///< Flat 3x3 rotation matrices per element + gpu::cuda::CudaVector m_gpuInitialRotationsTransposed; ///< Flat 3x3 initial rotation transposed per element + gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] + gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer + gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated unsigned int m_maxElemPerVertex = 0; unsigned int m_nbVertices = 0; bool m_gpuDataUploaded = false; bool m_gpuRotationsUploaded = false; + bool m_gpuRotationMethodSupported = false; }; } // namespace sofa::component::solidmechanics::fem::elastic diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index 6cd70af4c7c..e6742c60b2c 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -36,6 +36,7 @@ void CudaElementCorotationalFEMForceField::init() if (!this->isComponentStateInvalid()) { uploadStiffnessAndConnectivity(); + uploadInitialRotationsTransposed(); } } @@ -173,6 +174,36 @@ void CudaElementCorotationalFEMForceField::uploadRotatio m_gpuRotationsUploaded = true; } +template +void CudaElementCorotationalFEMForceField::uploadInitialRotationsTransposed() +{ + using trait = sofa::component::solidmechanics::fem::elastic::trait; + constexpr auto dim = trait::spatial_dimensions; + constexpr auto nNodes = trait::NumberOfNodesInElement; + + const auto& initRotT = this->m_initialRotationsTransposed; + const auto nbElem = initRotT.size(); + if (nbElem == 0) return; + + 
m_gpuInitialRotationsTransposed.resize(nbElem * dim * dim); + m_gpuRotations.resize(nbElem * dim * dim); + { + auto* dst = m_gpuInitialRotationsTransposed.hostWrite(); + for (std::size_t e = 0; e < nbElem; ++e) + { + const auto& R = initRotT[e]; + for (unsigned int i = 0; i < dim; ++i) + for (unsigned int j = 0; j < dim; ++j) + dst[e * dim * dim + i * dim + j] = static_cast(R[i][j]); + } + } + + // Check if the rotation method is GPU-compatible + const auto rotationMethodKey = this->m_rotationMethods.d_rotationMethod.getValue().key(); + m_gpuRotationMethodSupported = (nNodes >= 3) + && (rotationMethodKey == "triangle" || rotationMethodKey == "hexahedron"); +} + template void CudaElementCorotationalFEMForceField::addForce( const sofa::core::MechanicalParams* mparams, @@ -196,34 +227,54 @@ void CudaElementCorotationalFEMForceField::addForce( auto restPositionAccessor = this->mstate->readRestPositions(); const VecCoord& x0 = restPositionAccessor.ref(); - // Compute rotations on CPU (polar decomposition cannot run on GPU) - this->computeRotations(this->m_rotations, x, x0); - - // Upload rotations to GPU - uploadRotations(); + const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); + const auto nbElem = static_cast(elements.size()); + const auto nbVertex = static_cast(x.size()); - // Run force computation on GPU VecDeriv& f = *d_f.beginEdit(); if (f.size() < x.size()) f.resize(x.size()); - const auto& elements = trait::FiniteElement::getElementSequence(*this->l_topology); - const auto nbElem = static_cast(elements.size()); - const auto nbVertex = static_cast(x.size()); + if (m_gpuRotationMethodSupported) + { + // Fully GPU path: compute rotations + forces in one kernel + gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( + nbElem, + nbVertex, + trait::NumberOfNodesInElement, + m_maxElemPerVertex, + m_gpuElements.deviceRead(), + m_gpuInitialRotationsTransposed.deviceRead(), + m_gpuStiffness.deviceRead(), + x.deviceRead(), 
+ x0.deviceRead(), + f.deviceWrite(), + m_gpuElementForce.deviceWrite(), + m_gpuRotations.deviceWrite(), + m_gpuVelems.deviceRead()); + + m_gpuRotationsUploaded = true; + } + else + { + // CPU rotations + GPU forces + this->computeRotations(this->m_rotations, x, x0); + uploadRotations(); - gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForce( - nbElem, - nbVertex, - trait::NumberOfNodesInElement, - m_maxElemPerVertex, - m_gpuElements.deviceRead(), - m_gpuRotations.deviceRead(), - m_gpuStiffness.deviceRead(), - x.deviceRead(), - x0.deviceRead(), - f.deviceWrite(), - m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead()); + gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForce( + nbElem, + nbVertex, + trait::NumberOfNodesInElement, + m_maxElemPerVertex, + m_gpuElements.deviceRead(), + m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), + x.deviceRead(), + x0.deviceRead(), + f.deviceWrite(), + m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + } d_f.endEdit(); } From 589d59193a2e5c2b5f5f461b91cd4c903a60969e Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Tue, 7 Apr 2026 15:11:25 +0900 Subject: [PATCH 10/21] try to fix with direct solver --- .../CudaElementCorotationalFEMForceField.h | 3 ++ .../CudaElementCorotationalFEMForceField.inl | 31 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h index e25f9a7b485..67c619768b5 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h @@ -127,6 +127,8 @@ class CudaElementCorotationalFEMForceField 
sofa::DataVecDeriv_t& df, const sofa::DataVecDeriv_t& dx) override; + void buildStiffnessMatrix(sofa::core::behavior::StiffnessMatrix* matrix) override; + protected: CudaElementCorotationalFEMForceField() = default; @@ -134,6 +136,7 @@ class CudaElementCorotationalFEMForceField void uploadStiffnessAndConnectivity(); void uploadRotations(); void uploadInitialRotationsTransposed(); + void downloadRotations(); gpu::cuda::CudaVector m_gpuStiffness; ///< Symmetric block-format stiffness per element gpu::cuda::CudaVector m_gpuRotations; ///< Flat 3x3 rotation matrices per element diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index e6742c60b2c..7d7f32964c3 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -204,6 +204,37 @@ void CudaElementCorotationalFEMForceField::uploadInitial && (rotationMethodKey == "triangle" || rotationMethodKey == "hexahedron"); } +template +void CudaElementCorotationalFEMForceField::downloadRotations() +{ + using trait = sofa::component::solidmechanics::fem::elastic::trait; + constexpr auto dim = trait::spatial_dimensions; + + if (!m_gpuRotationsUploaded) return; + + const auto nbElem = m_gpuRotations.size() / (dim * dim); + this->m_rotations.resize(nbElem); + + const auto* src = m_gpuRotations.hostRead(); + for (std::size_t e = 0; e < nbElem; ++e) + { + auto& R = this->m_rotations[e]; + for (unsigned int i = 0; i < dim; ++i) + for (unsigned int j = 0; j < dim; ++j) + R[i][j] = static_cast(src[e * dim * dim + i * dim + j]); + } +} + +template +void 
CudaElementCorotationalFEMForceField::buildStiffnessMatrix( + sofa::core::behavior::StiffnessMatrix* matrix) +{ + if (m_gpuRotationMethodSupported && m_gpuRotationsUploaded) + downloadRotations(); + + ElementCorotationalFEMForceField::buildStiffnessMatrix(matrix); +} + template void CudaElementCorotationalFEMForceField::addForce( const sofa::core::MechanicalParams* mparams, From cf4ec939ebaa2d264b13548eeecae5f490c0782d Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Tue, 7 Apr 2026 15:31:39 +0900 Subject: [PATCH 11/21] fix corot --- .../CudaElementCorotationalFEMForceField.cu | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index fd31eeab487..6de99fe5794 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -68,6 +68,25 @@ __device__ void mat3MulTranspose(const float* A, const float* BT, float* C) } } +/** + * Device helper: C = A^T * B (row-major) + * Matches SOFA's Mat::multTranspose(B) which computes this^T * B. + */ +__device__ void mat3TransposeMul(const float* A, const float* B, float* C) +{ + #pragma unroll + for (int i = 0; i < 3; ++i) + { + #pragma unroll + for (int j = 0; j < 3; ++j) + { + C[i * 3 + j] = A[0 * 3 + i] * B[0 * 3 + j] + + A[1 * 3 + i] * B[1 * 3 + j] + + A[2 * 3 + i] * B[2 * 3 + j]; + } + } +} + /** * Device helper: compute rotation frame from first 3 nodes (TriangleRotation). * Used for Triangle (NNodes=3) and Tetrahedron (NNodes=4) elements. 
@@ -250,11 +269,11 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_ else computeTriangleFrame(ex, frame); - // R = frame * initRotTransposed^T (i.e. frame.multTranspose(initRotTransposed)) - // Since initRotTransposed is already the transpose, R = frame * initRotTransposed^T + // R = frame^T * initRot (matching SOFA's Mat::multTranspose which computes A^T * B) + // m_initialRotationsTransposed stores frame_rest (despite its name, it's transposed during init) const float* irt = initRotTransposed + elemId * 9; float R[9]; - mat3MulTranspose(frame, irt, R); + mat3TransposeMul(frame, irt, R); // Write R to rotations buffer for addDForce float* Rout = rotationsOut + elemId * 9; From aa95ad3b3fec60fda44ff4978faeaa047a955215 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Wed, 8 Apr 2026 08:12:39 +0900 Subject: [PATCH 12/21] update examples --- .../CudaElementCorotationalFEMForceField.scn | 73 ++++++++++++++++++- ...aElementLinearSmallStrainFEMForceField.scn | 68 ++++++++++++++++- 2 files changed, 137 insertions(+), 4 deletions(-) diff --git a/applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn b/applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn index a75e2058ff5..7ecf5da0d41 100644 --- a/applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn +++ b/applications/plugins/SofaCUDA/examples/CudaElementCorotationalFEMForceField.scn @@ -12,13 +12,14 @@ - + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn b/applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn index c59fb6a6c2a..39a5017048d 100644 --- a/applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn +++ b/applications/plugins/SofaCUDA/examples/CudaElementLinearSmallStrainFEMForceField.scn @@ -14,7 +14,7 @@ - + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + From a9c0a0c39a2774e26bffef71cac815e9e22e9767 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Wed, 8 Apr 2026 11:02:14 +0900 Subject: [PATCH 13/21] organize examples and add cpu-gpu comparison --- ...aElementCorotationalFEMForceField_hexa.scn | 37 ++++++++ ...aElementCorotationalFEMForceField_quad.scn | 36 +++++++ ...ElementCorotationalFEMForceField_tetra.scn | 47 +++++++++ ...mentCorotationalFEMForceField_triangle.scn | 35 +++++++ ...entLinearSmallStrainFEMForceField_hexa.scn | 36 +++++++ ...entLinearSmallStrainFEMForceField_quad.scn | 35 +++++++ ...ntLinearSmallStrainFEMForceField_tetra.scn | 46 +++++++++ ...inearSmallStrainFEMForceField_triangle.scn | 35 +++++++ ...orotationalFEMForceField_tetra_cpu_gpu.scn | 95 +++++++++++++++++++ ...tionalFEMForceField_tetra_cpu_gpu.scn.view | 17 ++++ 10 files changed, 419 insertions(+) create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_hexa.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_quad.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_tetra.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_triangle.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_hexa.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_quad.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_tetra.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_triangle.scn create mode 100644 
applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn.view diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_hexa.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_hexa.scn new file mode 100644 index 00000000000..a08c65e0024 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_hexa.scn @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_quad.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_quad.scn new file mode 100644 index 00000000000..047a96624f9 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_quad.scn @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_tetra.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_tetra.scn new file mode 100644 index 00000000000..57a39e63286 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_tetra.scn @@ -0,0 +1,47 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_triangle.scn 
b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_triangle.scn new file mode 100644 index 00000000000..0a47b7e393d --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementCorotationalFEMForceField_triangle.scn @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_hexa.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_hexa.scn new file mode 100644 index 00000000000..228e4a08943 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_hexa.scn @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_quad.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_quad.scn new file mode 100644 index 00000000000..14babc5d207 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_quad.scn @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_tetra.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_tetra.scn new file mode 100644 index 00000000000..eac98ff93ac --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_tetra.scn @@ -0,0 +1,46 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_triangle.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_triangle.scn new file mode 100644 index 00000000000..03c6955697c --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/CudaElementLinearSmallStrainFEMForceField_triangle.scn @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn new file mode 100644 index 00000000000..6875f9a8849 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn @@ -0,0 +1,95 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn.view b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn.view new file mode 100644 index 00000000000..3bf10e74929 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/cpu-gpu_validation/ElementCorotationalFEMForceField_tetra_cpu_gpu.scn.view @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + From 63e201424bd8c22ab3fb81e69c94c0905709934b Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Wed, 8 Apr 2026 11:15:46 +0900 Subject: [PATCH 14/21] add double version (templates) --- .../CudaElementCorotationalFEMForceField.cpp | 27 + 
.../CudaElementCorotationalFEMForceField.cu | 490 +++++++++--------- .../CudaElementCorotationalFEMForceField.h | 55 +- .../CudaElementCorotationalFEMForceField.inl | 102 ++-- ...aElementLinearSmallStrainFEMForceField.cpp | 27 + ...daElementLinearSmallStrainFEMForceField.cu | 211 ++++---- ...udaElementLinearSmallStrainFEMForceField.h | 34 +- ...aElementLinearSmallStrainFEMForceField.inl | 64 ++- 8 files changed, 611 insertions(+), 399 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp index c77a51c13c2..55a46c00669 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp @@ -36,6 +36,12 @@ template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; + } // namespace sofa::component::solidmechanics::fem::elastic namespace sofa::gpu::cuda @@ -65,6 +71,27 @@ void registerElementCorotationalFEMForceField(sofa::core::ObjectFactory* factory "Supports GPU-side computations using CUDA for HexahedronCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() ); + + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports 
GPU-side computations using CUDA (double) for EdgeCorotationalFEMForceField") + .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for TriangleCorotationalFEMForceField") + .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for QuadCorotationalFEMForceField") + .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for TetrahedronCorotationalFEMForceField") + .add< CudaElementCorotationalFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for HexahedronCorotationalFEMForceField") + .add< CudaElementCorotationalFEMForceField >() + ); } } // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index 6de99fe5794..afbbdb89532 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -32,10 +32,16 @@ namespace cuda { #endif +template +__device__ T myRsqrt(T x); +template<> __device__ float myRsqrt(float x) { return rsqrtf(x); } +template<> __device__ double myRsqrt(double x) { return rsqrt(x); } + /** * Device helper: 3x3 matrix multiply C = A * B (row-major) */ -__device__ void mat3Mul(const float* A, const float* B, float* C) +template +__device__ void mat3Mul(const T* A, 
const T* B, T* C) { #pragma unroll for (int i = 0; i < 3; ++i) @@ -53,7 +59,8 @@ __device__ void mat3Mul(const float* A, const float* B, float* C) /** * Device helper: C = A * B^T (row-major) */ -__device__ void mat3MulTranspose(const float* A, const float* BT, float* C) +template +__device__ void mat3MulTranspose(const T* A, const T* BT, T* C) { #pragma unroll for (int i = 0; i < 3; ++i) @@ -72,7 +79,8 @@ __device__ void mat3MulTranspose(const float* A, const float* BT, float* C) * Device helper: C = A^T * B (row-major) * Matches SOFA's Mat::multTranspose(B) which computes this^T * B. */ -__device__ void mat3TransposeMul(const float* A, const float* B, float* C) +template +__device__ void mat3TransposeMul(const T* A, const T* B, T* C) { #pragma unroll for (int i = 0; i < 3; ++i) @@ -89,32 +97,26 @@ __device__ void mat3TransposeMul(const float* A, const float* B, float* C) /** * Device helper: compute rotation frame from first 3 nodes (TriangleRotation). - * Used for Triangle (NNodes=3) and Tetrahedron (NNodes=4) elements. - * ex is [NNodes*3] array of gathered node positions. 
*/ -__device__ void computeTriangleFrame(const float* ex, float* frame) +template +__device__ void computeTriangleFrame(const T* ex, T* frame) { - // xAxis = normalize(p1 - p0) - float ax = ex[3] - ex[0], ay = ex[4] - ex[1], az = ex[5] - ex[2]; - float invLen = rsqrtf(ax * ax + ay * ay + az * az); + T ax = ex[3] - ex[0], ay = ex[4] - ex[1], az = ex[5] - ex[2]; + T invLen = myRsqrt(ax * ax + ay * ay + az * az); ax *= invLen; ay *= invLen; az *= invLen; - // tmp yAxis = p2 - p0 - float bx = ex[6] - ex[0], by = ex[7] - ex[1], bz = ex[8] - ex[2]; + T bx = ex[6] - ex[0], by = ex[7] - ex[1], bz = ex[8] - ex[2]; - // zAxis = normalize(cross(xAxis, tmpY)) - float cx = ay * bz - az * by; - float cy = az * bx - ax * bz; - float cz = ax * by - ay * bx; - invLen = rsqrtf(cx * cx + cy * cy + cz * cz); + T cx = ay * bz - az * by; + T cy = az * bx - ax * bz; + T cz = ax * by - ay * bx; + invLen = myRsqrt(cx * cx + cy * cy + cz * cz); cx *= invLen; cy *= invLen; cz *= invLen; - // yAxis = cross(zAxis, xAxis) bx = cy * az - cz * ay; by = cz * ax - cx * az; bz = cx * ay - cy * ax; - // frame rows: [xAxis; yAxis; zAxis] frame[0] = ax; frame[1] = ay; frame[2] = az; frame[3] = bx; frame[4] = by; frame[5] = bz; frame[6] = cx; frame[7] = cy; frame[8] = cz; @@ -122,44 +124,39 @@ __device__ void computeTriangleFrame(const float* ex, float* frame) /** * Device helper: compute rotation frame from 8 hexahedron nodes (HexahedronRotation). - * ex is [8*3] array of gathered node positions. 
*/ -__device__ void computeHexahedronFrame(const float* ex, float* frame) +template +__device__ void computeHexahedronFrame(const T* ex, T* frame) { - // Average edge vectors - // xAxis_avg = ((n1-n0) + (n2-n3) + (n5-n4) + (n6-n7)) * 0.25 - float ax = ((ex[1*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[3*3+0]) - + (ex[5*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[7*3+0])) * 0.25f; - float ay = ((ex[1*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[3*3+1]) - + (ex[5*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[7*3+1])) * 0.25f; - float az = ((ex[1*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[3*3+2]) - + (ex[5*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[7*3+2])) * 0.25f; - - // yAxis_avg = ((n3-n0) + (n2-n1) + (n7-n4) + (n6-n5)) * 0.25 - float bx = ((ex[3*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[1*3+0]) - + (ex[7*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[5*3+0])) * 0.25f; - float by = ((ex[3*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[1*3+1]) - + (ex[7*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[5*3+1])) * 0.25f; - float bz = ((ex[3*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[1*3+2]) - + (ex[7*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[5*3+2])) * 0.25f; - - // Normalize xAxis - float invLen = rsqrtf(ax * ax + ay * ay + az * az); + const T quarter = T(0.25); + + T ax = ((ex[1*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[3*3+0]) + + (ex[5*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[7*3+0])) * quarter; + T ay = ((ex[1*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[3*3+1]) + + (ex[5*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[7*3+1])) * quarter; + T az = ((ex[1*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[3*3+2]) + + (ex[5*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[7*3+2])) * quarter; + + T bx = ((ex[3*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[1*3+0]) + + (ex[7*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[5*3+0])) * quarter; + T by = ((ex[3*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[1*3+1]) + + (ex[7*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[5*3+1])) * quarter; + T bz = ((ex[3*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[1*3+2]) + + (ex[7*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[5*3+2])) * quarter; + + T invLen = myRsqrt(ax * ax + ay * ay + az * az); ax *= 
invLen; ay *= invLen; az *= invLen; - // zAxis = normalize(cross(xAxis, yAxis_avg)) - float cx = ay * bz - az * by; - float cy = az * bx - ax * bz; - float cz = ax * by - ay * bx; - invLen = rsqrtf(cx * cx + cy * cy + cz * cz); + T cx = ay * bz - az * by; + T cy = az * bx - ax * bz; + T cz = ax * by - ay * bx; + invLen = myRsqrt(cx * cx + cy * cy + cz * cz); cx *= invLen; cy *= invLen; cz *= invLen; - // yAxis = cross(zAxis, xAxis) bx = cy * az - cz * ay; by = cz * ax - cx * az; bz = cx * ay - cy * ax; - // frame rows: [xAxis; yAxis; zAxis] frame[0] = ax; frame[1] = ay; frame[2] = az; frame[3] = bx; frame[4] = by; frame[5] = bz; frame[6] = cx; frame[7] = cy; frame[8] = cz; @@ -167,54 +164,48 @@ __device__ void computeHexahedronFrame(const float* ex, float* frame) /** * Symmetric block-matrix multiply: out = K * in - * K stored as upper triangle: NSymBlocks = NNodes*(NNodes+1)/2 blocks of 9 floats. - * Inline device function shared by addForce, addDForce, and combined kernels. */ -template -__device__ void symBlockMatMul(const float* K, const float* in, float* out) +template +__device__ void symBlockMatMul(const T* K, const T* in, T* out) { #pragma unroll for (int i = 0; i < NNodes * 3; ++i) - out[i] = 0.0f; + out[i] = T(0); #pragma unroll for (int ni = 0; ni < NNodes; ++ni) { const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; - // Diagonal block { - const float* Kii = K + diagIdx * 9; - const float i0 = in[ni * 3 + 0]; - const float i1 = in[ni * 3 + 1]; - const float i2 = in[ni * 3 + 2]; + const T* Kii = K + diagIdx * 9; + const T i0 = in[ni * 3 + 0]; + const T i1 = in[ni * 3 + 1]; + const T i2 = in[ni * 3 + 2]; out[ni * 3 + 0] += Kii[0] * i0 + Kii[1] * i1 + Kii[2] * i2; out[ni * 3 + 1] += Kii[3] * i0 + Kii[4] * i1 + Kii[5] * i2; out[ni * 3 + 2] += Kii[6] * i0 + Kii[7] * i1 + Kii[8] * i2; } - // Off-diagonal blocks #pragma unroll for (int nj = ni + 1; nj < NNodes; ++nj) { const int symIdx = diagIdx + (nj - ni); - const float* Kij = K + symIdx * 9; + const T* Kij 
= K + symIdx * 9; - // Forward: out[ni] += Kij * in[nj] { - const float j0 = in[nj * 3 + 0]; - const float j1 = in[nj * 3 + 1]; - const float j2 = in[nj * 3 + 2]; + const T j0 = in[nj * 3 + 0]; + const T j1 = in[nj * 3 + 1]; + const T j2 = in[nj * 3 + 2]; out[ni * 3 + 0] += Kij[0] * j0 + Kij[1] * j1 + Kij[2] * j2; out[ni * 3 + 1] += Kij[3] * j0 + Kij[4] * j1 + Kij[5] * j2; out[ni * 3 + 2] += Kij[6] * j0 + Kij[7] * j1 + Kij[8] * j2; } - // Symmetric: out[nj] += Kij^T * in[ni] { - const float i0 = in[ni * 3 + 0]; - const float i1 = in[ni * 3 + 1]; - const float i2 = in[ni * 3 + 2]; + const T i0 = in[ni * 3 + 0]; + const T i1 = in[ni * 3 + 1]; + const T i2 = in[ni * 3 + 2]; out[nj * 3 + 0] += Kij[0] * i0 + Kij[3] * i1 + Kij[6] * i2; out[nj * 3 + 1] += Kij[1] * i0 + Kij[4] * i1 + Kij[7] * i2; out[nj * 3 + 2] += Kij[2] * i0 + Kij[5] * i1 + Kij[8] * i2; @@ -225,31 +216,25 @@ __device__ void symBlockMatMul(const float* K, const float* in, float* out) /** * Combined kernel: compute rotations AND per-element forces in one pass. - * - * Uses TriangleRotation for NNodes=3,4 and HexahedronRotation for NNodes=8. - * Computes: frame from node positions → R = frame * initRotTransposed - * Then: displacement = R^T*(x-centroid) - (x0-centroid0) → K*disp → -R*result - * Also writes R to rotations buffer for subsequent addDForce calls. 
*/ -template -__global__ void ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_kernel( +template +__global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel( int nbElem, const int* __restrict__ elements, - const float* __restrict__ initRotTransposed, - const float* __restrict__ stiffness, - const float* __restrict__ x, - const float* __restrict__ x0, - float* __restrict__ rotationsOut, - float* __restrict__ eforce) + const T* __restrict__ initRotTransposed, + const T* __restrict__ stiffness, + const T* __restrict__ x, + const T* __restrict__ x0, + T* __restrict__ rotationsOut, + T* __restrict__ eforce) { constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; - constexpr float invN = 1.0f / NNodes; + const T invN = T(1) / T(NNodes); const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - // Gather node positions and rest positions - float ex[NNodes * 3], ex0[NNodes * 3]; + T ex[NNodes * 3], ex0[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -262,28 +247,24 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_ ex0[n * 3 + 2] = x0[nodeId * 3 + 2]; } - // Compute rotation frame from current positions - float frame[9]; + T frame[9]; if constexpr (NNodes == 8) computeHexahedronFrame(ex, frame); else computeTriangleFrame(ex, frame); - // R = frame^T * initRot (matching SOFA's Mat::multTranspose which computes A^T * B) - // m_initialRotationsTransposed stores frame_rest (despite its name, it's transposed during init) - const float* irt = initRotTransposed + elemId * 9; - float R[9]; + // R = frame^T * initRot + const T* irt = initRotTransposed + elemId * 9; + T R[9]; mat3TransposeMul(frame, irt, R); - // Write R to rotations buffer for addDForce - float* Rout = rotationsOut + elemId * 9; + T* Rout = rotationsOut + elemId * 9; #pragma unroll for (int i = 0; i < 9; ++i) Rout[i] = R[i]; - // Compute centroids - float cx = 0.0f, cy = 0.0f, cz = 0.0f; - float 
cx0 = 0.0f, cy0 = 0.0f, cz0 = 0.0f; + T cx = T(0), cy = T(0), cz = T(0); + T cx0 = T(0), cy0 = T(0), cz0 = T(0); #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -293,35 +274,32 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_ cx *= invN; cy *= invN; cz *= invN; cx0 *= invN; cy0 *= invN; cz0 *= invN; - // Compute displacement: disp[j] = R^T * (x[j] - centroid) - (x0[j] - centroid0) - float disp[NNodes * 3]; + T disp[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const float dx = ex[n * 3 + 0] - cx; - const float dy = ex[n * 3 + 1] - cy; - const float dz = ex[n * 3 + 2] - cz; - const float rx = R[0] * dx + R[3] * dy + R[6] * dz; - const float ry = R[1] * dx + R[4] * dy + R[7] * dz; - const float rz = R[2] * dx + R[5] * dy + R[8] * dz; + const T dx = ex[n * 3 + 0] - cx; + const T dy = ex[n * 3 + 1] - cy; + const T dz = ex[n * 3 + 2] - cz; + const T rx = R[0] * dx + R[3] * dy + R[6] * dz; + const T ry = R[1] * dx + R[4] * dy + R[7] * dz; + const T rz = R[2] * dx + R[5] * dy + R[8] * dz; disp[n * 3 + 0] = rx - (ex0[n * 3 + 0] - cx0); disp[n * 3 + 1] = ry - (ex0[n * 3 + 1] - cy0); disp[n * 3 + 2] = rz - (ex0[n * 3 + 2] - cz0); } - // edf = K * disp - float edf[NNodes * 3]; - const float* K = stiffness + elemId * NSymBlocks * 9; - symBlockMatMul(K, disp, edf); + T edf[NNodes * 3]; + const T* K = stiffness + elemId * NSymBlocks * 9; + symBlockMatMul(K, disp, edf); - // Rotate back and write: out = -R * edf - float* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * 3; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const float e0 = edf[n * 3 + 0]; - const float e1 = edf[n * 3 + 1]; - const float e2 = edf[n * 3 + 2]; + const T e0 = edf[n * 3 + 0]; + const T e1 = edf[n * 3 + 1]; + const T e2 = edf[n * 3 + 2]; out[n * 3 + 0] = -(R[0] * e0 + R[1] * e1 + R[2] * e2); out[n * 3 + 1] = -(R[3] * e0 + R[4] * e1 + R[5] * e2); out[n * 3 + 2] = -(R[6] * e0 + R[7] * e1 + R[8] * e2); @@ -330,36 +308,30 
@@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_ /** * Kernel for addForce: Compute per-element force (1 thread per element). - * - * displacement[j] = R^T * (x[j] - centroid_x) - (x0[j] - centroid_x0) - * elementForce = K * displacement - * out[j] = -R * elementForce[j] */ -template -__global__ void ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel( +template +__global__ void ElementCorotationalFEMForceField_computeForce_kernel( int nbElem, const int* __restrict__ elements, - const float* __restrict__ rotations, - const float* __restrict__ stiffness, - const float* __restrict__ x, - const float* __restrict__ x0, - float* __restrict__ eforce) + const T* __restrict__ rotations, + const T* __restrict__ stiffness, + const T* __restrict__ x, + const T* __restrict__ x0, + T* __restrict__ eforce) { constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; - constexpr float invN = 1.0f / NNodes; + const T invN = T(1) / T(NNodes); const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - // Load rotation matrix R (3x3, row-major) - const float* Rptr = rotations + elemId * 9; - float R[9]; + const T* Rptr = rotations + elemId * 9; + T R[9]; #pragma unroll for (int i = 0; i < 9; ++i) R[i] = Rptr[i]; - // Gather node positions and rest positions - float ex[NNodes * 3], ex0[NNodes * 3]; + T ex[NNodes * 3], ex0[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -372,9 +344,8 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel( ex0[n * 3 + 2] = x0[nodeId * 3 + 2]; } - // Compute centroids - float cx = 0.0f, cy = 0.0f, cz = 0.0f; - float cx0 = 0.0f, cy0 = 0.0f, cz0 = 0.0f; + T cx = T(0), cy = T(0), cz = T(0); + T cx0 = T(0), cy0 = T(0), cz0 = T(0); #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -384,38 +355,32 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel( cx *= invN; cy *= invN; cz *= invN; cx0 *= invN; cy0 *= invN; cz0 *= invN; - 
// Compute displacement: disp[j] = R^T * (x[j] - centroid) - (x0[j] - centroid0) - float disp[NNodes * 3]; + T disp[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const float dx = ex[n * 3 + 0] - cx; - const float dy = ex[n * 3 + 1] - cy; - const float dz = ex[n * 3 + 2] - cz; - - // R^T * (x - centroid) - const float rx = R[0] * dx + R[3] * dy + R[6] * dz; - const float ry = R[1] * dx + R[4] * dy + R[7] * dz; - const float rz = R[2] * dx + R[5] * dy + R[8] * dz; - + const T dx = ex[n * 3 + 0] - cx; + const T dy = ex[n * 3 + 1] - cy; + const T dz = ex[n * 3 + 2] - cz; + const T rx = R[0] * dx + R[3] * dy + R[6] * dz; + const T ry = R[1] * dx + R[4] * dy + R[7] * dz; + const T rz = R[2] * dx + R[5] * dy + R[8] * dz; disp[n * 3 + 0] = rx - (ex0[n * 3 + 0] - cx0); disp[n * 3 + 1] = ry - (ex0[n * 3 + 1] - cy0); disp[n * 3 + 2] = rz - (ex0[n * 3 + 2] - cz0); } - // edf = K * disp - float edf[NNodes * 3]; - const float* K = stiffness + elemId * NSymBlocks * 9; - symBlockMatMul(K, disp, edf); + T edf[NNodes * 3]; + const T* K = stiffness + elemId * NSymBlocks * 9; + symBlockMatMul(K, disp, edf); - // Rotate back and write: out = -R * edf - float* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * 3; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const float e0 = edf[n * 3 + 0]; - const float e1 = edf[n * 3 + 1]; - const float e2 = edf[n * 3 + 2]; + const T e0 = edf[n * 3 + 0]; + const T e1 = edf[n * 3 + 1]; + const T e2 = edf[n * 3 + 2]; out[n * 3 + 0] = -(R[0] * e0 + R[1] * e1 + R[2] * e2); out[n * 3 + 1] = -(R[3] * e0 + R[4] * e1 + R[5] * e2); out[n * 3 + 2] = -(R[6] * e0 + R[7] * e1 + R[8] * e2); @@ -424,59 +389,52 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel( /** * Kernel for addDForce: Compute per-element dForce (1 thread per element). 
- * - * rdx = R^T * dx, edf = K * rdx, out = -kFactor * R * edf */ -template -__global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( +template +__global__ void ElementCorotationalFEMForceField_computeDForce_kernel( int nbElem, const int* __restrict__ elements, - const float* __restrict__ rotations, - const float* __restrict__ stiffness, - const float* __restrict__ dx, - float* __restrict__ eforce, - float kFactor) + const T* __restrict__ rotations, + const T* __restrict__ stiffness, + const T* __restrict__ dx, + T* __restrict__ eforce, + T kFactor) { constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - // Load rotation matrix R (3x3, row-major) - const float* Rptr = rotations + elemId * 9; - float R[9]; + const T* Rptr = rotations + elemId * 9; + T R[9]; #pragma unroll for (int i = 0; i < 9; ++i) R[i] = Rptr[i]; - // Gather dx and rotate into reference frame: rdx[n] = R^T * dx[node[n]] - float rdx[NNodes * 3]; + T rdx[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { const int nodeId = elements[n * nbElem + elemId]; - const float dx_x = dx[nodeId * 3 + 0]; - const float dx_y = dx[nodeId * 3 + 1]; - const float dx_z = dx[nodeId * 3 + 2]; - + const T dx_x = dx[nodeId * 3 + 0]; + const T dx_y = dx[nodeId * 3 + 1]; + const T dx_z = dx[nodeId * 3 + 2]; rdx[n * 3 + 0] = R[0] * dx_x + R[3] * dx_y + R[6] * dx_z; rdx[n * 3 + 1] = R[1] * dx_x + R[4] * dx_y + R[7] * dx_z; rdx[n * 3 + 2] = R[2] * dx_x + R[5] * dx_y + R[8] * dx_z; } - // Symmetric block-matrix multiply: edf = K * rdx - const float* K = stiffness + elemId * NSymBlocks * 9; - float edf[NNodes * 3]; - symBlockMatMul(K, rdx, edf); + const T* K = stiffness + elemId * NSymBlocks * 9; + T edf[NNodes * 3]; + symBlockMatMul(K, rdx, edf); - // Rotate back and write: eforce = -kFactor * R * edf - float* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * 3; #pragma unroll for 
(int n = 0; n < NNodes; ++n) { - const float e0 = edf[n * 3 + 0]; - const float e1 = edf[n * 3 + 1]; - const float e2 = edf[n * 3 + 2]; + const T e0 = edf[n * 3 + 0]; + const T e1 = edf[n * 3 + 1]; + const T e2 = edf[n * 3 + 2]; out[n * 3 + 0] = -kFactor * (R[0] * e0 + R[1] * e1 + R[2] * e2); out[n * 3 + 1] = -kFactor * (R[3] * e0 + R[4] * e1 + R[5] * e2); out[n * 3 + 2] = -kFactor * (R[6] * e0 + R[7] * e1 + R[8] * e2); @@ -485,23 +443,19 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel( /** * Gather per-vertex forces (1 thread per vertex). - * - * Shared by addForce and addDForce. - * No atomics: each vertex handled by exactly one thread. - * velems is SoA: velems[s * nbVertex + vertexId], 0-terminated. - * Each entry is (elemId * NNodes + localNode + 1), with 0 as sentinel. */ -__global__ void ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel( +template +__global__ void ElementCorotationalFEMForceField_gatherForce_kernel( int nbVertex, int maxElemPerVertex, const int* __restrict__ velems, - const float* __restrict__ eforce, - float* df) + const T* __restrict__ eforce, + T* df) { const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; if (vertexId >= nbVertex) return; - float fx = 0.0f, fy = 0.0f, fz = 0.0f; + T fx = T(0), fy = T(0), fz = T(0); for (int s = 0; s < maxElemPerVertex; ++s) { @@ -518,6 +472,7 @@ __global__ void ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel( df[vertexId * 3 + 2] += fz; } +template static void launchGather( unsigned int nbVertex, unsigned int maxElemPerVertex, @@ -527,17 +482,17 @@ static void launchGather( { const int gatherThreads = 256; const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel + ElementCorotationalFEMForceField_gatherForce_kernel <<>>( nbVertex, maxElemPerVertex, (const int*)velems, - (const float*)eforce, - (float*)f); - 
mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_gatherForce_kernel"); + (const T*)eforce, + (T*)f); + mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); } -template +template static void launchAddForceWithRotations( unsigned int nbElem, unsigned int nbVertex, @@ -554,22 +509,22 @@ static void launchAddForceWithRotations( { const int computeThreads = 64; const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_kernel + ElementCorotationalFEMForceField_computeRotationsAndForce_kernel <<>>( nbElem, (const int*)elements, - (const float*)initRotTransposed, - (const float*)stiffness, - (const float*)x, - (const float*)x0, - (float*)rotationsOut, - (float*)eforce); - mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeRotationsAndForce_kernel"); - - launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); + (const T*)initRotTransposed, + (const T*)stiffness, + (const T*)x, + (const T*)x0, + (T*)rotationsOut, + (T*)eforce); + mycudaDebugError("ElementCorotationalFEMForceField_computeRotationsAndForce_kernel"); + + launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); } -template +template static void launchAddForce( unsigned int nbElem, unsigned int nbVertex, @@ -585,21 +540,21 @@ static void launchAddForce( { const int computeThreads = 64; const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel + ElementCorotationalFEMForceField_computeForce_kernel <<>>( nbElem, (const int*)elements, - (const float*)rotations, - (const float*)stiffness, - (const float*)x, - (const float*)x0, - (float*)eforce); - mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeForce_kernel"); - - launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); + (const T*)rotations, + (const T*)stiffness, + (const T*)x, + (const T*)x0, + (T*)eforce); + 
mycudaDebugError("ElementCorotationalFEMForceField_computeForce_kernel"); + + launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); } -template +template static void launchAddDForce( unsigned int nbElem, unsigned int nbVertex, @@ -611,27 +566,29 @@ static void launchAddDForce( void* df, void* eforce, const void* velems, - float kFactor) + T kFactor) { const int computeThreads = 64; const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel + ElementCorotationalFEMForceField_computeDForce_kernel <<>>( nbElem, (const int*)elements, - (const float*)rotations, - (const float*)stiffness, - (const float*)dx, - (float*)eforce, + (const T*)rotations, + (const T*)stiffness, + (const T*)dx, + (T*)eforce, kFactor); - mycudaDebugError("ElementCorotationalFEMForceFieldCuda3f_computeDForce_kernel"); + mycudaDebugError("ElementCorotationalFEMForceField_computeDForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); + launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); } extern "C" { +// ==================== float versions ==================== + void ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( unsigned int nbElem, unsigned int nbVertex, @@ -649,9 +606,9 @@ void ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( { switch (nbNodesPerElem) { - case 3: launchAddForceWithRotations<3>(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - case 4: launchAddForceWithRotations<4>(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - case 8: launchAddForceWithRotations<8>(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; + case 3: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, 
x0, f, eforce, rotationsOut, velems); break; + case 4: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; + case 8: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; } } @@ -671,10 +628,10 @@ void ElementCorotationalFEMForceFieldCuda3f_addForce( { switch (nbNodesPerElem) { - case 2: launchAddForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 3: launchAddForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 4: launchAddForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 8: launchAddForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 4: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; } } @@ -694,10 +651,81 @@ void ElementCorotationalFEMForceFieldCuda3f_addDForce( { switch (nbNodesPerElem) { - case 2: launchAddDForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 3: launchAddDForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 4: launchAddDForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, 
eforce, velems, kFactor); break; - case 8: launchAddDForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + } +} + +// ==================== double versions ==================== + +void ElementCorotationalFEMForceFieldCuda3d_addForceWithRotations( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* initRotTransposed, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + void* rotationsOut, + const void* velems) +{ + switch (nbNodesPerElem) + { + case 3: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; + case 4: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; + case 8: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; + } +} + +void ElementCorotationalFEMForceFieldCuda3d_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* 
velems) +{ + switch (nbNodesPerElem) + { + case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 4: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; + } +} + +void ElementCorotationalFEMForceFieldCuda3d_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + double kFactor) +{ + switch (nbNodesPerElem) + { + case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; + case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; } } diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h index 67c619768b5..7ec167dcae3 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h +++ 
b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h @@ -71,6 +71,49 @@ extern "C" void* eforce, const void* velems, float kFactor); + + void ElementCorotationalFEMForceFieldCuda3d_addForceWithRotations( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* initRotTransposed, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + void* rotationsOut, + const void* velems); + + void ElementCorotationalFEMForceFieldCuda3d_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems); + + void ElementCorotationalFEMForceFieldCuda3d_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + double kFactor); } } // namespace sofa::gpu::cuda @@ -138,12 +181,12 @@ class CudaElementCorotationalFEMForceField void uploadInitialRotationsTransposed(); void downloadRotations(); - gpu::cuda::CudaVector m_gpuStiffness; ///< Symmetric block-format stiffness per element - gpu::cuda::CudaVector m_gpuRotations; ///< Flat 3x3 rotation matrices per element - gpu::cuda::CudaVector m_gpuInitialRotationsTransposed; ///< Flat 3x3 initial rotation transposed per element - gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] - gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer - gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated + gpu::cuda::CudaVector 
m_gpuStiffness; ///< Symmetric block-format stiffness per element + gpu::cuda::CudaVector m_gpuRotations; ///< Flat 3x3 rotation matrices per element + gpu::cuda::CudaVector m_gpuInitialRotationsTransposed; ///< Flat 3x3 initial rotation transposed per element + gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] + gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer + gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated unsigned int m_maxElemPerVertex = 0; unsigned int m_nbVertices = 0; diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index 7d7f32964c3..effb420ab61 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -90,7 +90,7 @@ void CudaElementCorotationalFEMForceField::uploadStiffne dst[e * nSymBlocks * dim * dim + symIdx * dim * dim + di * dim + dj] - = static_cast(K[ni * dim + di][nj * dim + dj]); + = static_cast(K[ni * dim + di][nj * dim + dj]); } } } @@ -167,7 +167,7 @@ void CudaElementCorotationalFEMForceField::uploadRotatio const auto& R = rotations[e]; for (unsigned int i = 0; i < dim; ++i) for (unsigned int j = 0; j < dim; ++j) - dst[e * dim * dim + i * dim + j] = static_cast(R[i][j]); + dst[e * dim * dim + i * dim + j] = static_cast(R[i][j]); } } @@ -194,7 +194,7 @@ void CudaElementCorotationalFEMForceField::uploadInitial const auto& R = initRotT[e]; for (unsigned int i = 0; i < dim; ++i) for (unsigned int j = 0; j < dim; ++j) - dst[e * dim * dim + i * dim + j] = static_cast(R[i][j]); + 
dst[e * dim * dim + i * dim + j] = static_cast(R[i][j]); } } @@ -269,20 +269,24 @@ void CudaElementCorotationalFEMForceField::addForce( if (m_gpuRotationMethodSupported) { // Fully GPU path: compute rotations + forces in one kernel - gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( - nbElem, - nbVertex, - trait::NumberOfNodesInElement, - m_maxElemPerVertex, - m_gpuElements.deviceRead(), - m_gpuInitialRotationsTransposed.deviceRead(), - m_gpuStiffness.deviceRead(), - x.deviceRead(), - x0.deviceRead(), - f.deviceWrite(), - m_gpuElementForce.deviceWrite(), - m_gpuRotations.deviceWrite(), - m_gpuVelems.deviceRead()); + if constexpr (std::is_same_v) + { + gpu::cuda::ElementCorotationalFEMForceFieldCuda3d_addForceWithRotations( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuInitialRotationsTransposed.deviceRead(), + m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuRotations.deviceWrite(), m_gpuVelems.deviceRead()); + } + else + { + gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuInitialRotationsTransposed.deviceRead(), + m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuRotations.deviceWrite(), m_gpuVelems.deviceRead()); + } m_gpuRotationsUploaded = true; } @@ -292,19 +296,24 @@ void CudaElementCorotationalFEMForceField::addForce( this->computeRotations(this->m_rotations, x, x0); uploadRotations(); - gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForce( - nbElem, - nbVertex, - trait::NumberOfNodesInElement, - m_maxElemPerVertex, - m_gpuElements.deviceRead(), - m_gpuRotations.deviceRead(), - m_gpuStiffness.deviceRead(), - x.deviceRead(), - x0.deviceRead(), - f.deviceWrite(), - m_gpuElementForce.deviceWrite(), - 
m_gpuVelems.deviceRead()); + if constexpr (std::is_same_v) + { + gpu::cuda::ElementCorotationalFEMForceFieldCuda3d_addForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + } + else + { + gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + } } d_f.endEdit(); @@ -334,7 +343,7 @@ void CudaElementCorotationalFEMForceField::addDForce( if (df.size() < dx.size()) df.resize(dx.size()); - const auto kFactor = static_cast( + const auto kFactor = static_cast( sofa::core::mechanicalparams::kFactorIncludingRayleighDamping( mparams, this->rayleighStiffness.getValue())); @@ -342,19 +351,24 @@ void CudaElementCorotationalFEMForceField::addDForce( const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(dx.size()); - gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addDForce( - nbElem, - nbVertex, - trait::NumberOfNodesInElement, - m_maxElemPerVertex, - m_gpuElements.deviceRead(), - m_gpuRotations.deviceRead(), - m_gpuStiffness.deviceRead(), - dx.deviceRead(), - df.deviceWrite(), - m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead(), - kFactor); + if constexpr (std::is_same_v) + { + gpu::cuda::ElementCorotationalFEMForceFieldCuda3d_addDForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), dx.deviceRead(), + df.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead(), kFactor); + } + else + { + 
gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addDForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), dx.deviceRead(), + df.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead(), kFactor); + } d_df.endEdit(); } diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp index af802d29e95..b46f90d06d4 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp @@ -36,6 +36,12 @@ template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField< template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; + } // namespace sofa::component::solidmechanics::fem::elastic namespace sofa::gpu::cuda @@ -65,6 +71,27 @@ void registerElementLinearSmallStrainFEMForceField(sofa::core::ObjectFactory* fa "Supports GPU-side computations using CUDA for HexahedronLinearSmallStrainFEMForceField") .add< CudaElementLinearSmallStrainFEMForceField >() ); + + 
factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for EdgeLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for TriangleLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for QuadLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for TetrahedronLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); + factory->registerObjects(sofa::core::ObjectRegistrationData( + "Supports GPU-side computations using CUDA (double) for HexahedronLinearSmallStrainFEMForceField") + .add< CudaElementLinearSmallStrainFEMForceField >() + ); } } // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu index 39c67a27db4..6752bd29af1 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu @@ -37,16 +37,16 @@ namespace cuda * * f = -K * (x - x0) * Templated on NNodes (compile-time) for full loop unrolling. - * Hardcoded Dim=3 (CudaVec3f only). + * Templated on T for float/double support. 
*/ -template -__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( +template +__global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( int nbElem, const int* __restrict__ elements, - const float* __restrict__ stiffness, - const float* __restrict__ x, - const float* __restrict__ x0, - float* __restrict__ eforce) + const T* __restrict__ stiffness, + const T* __restrict__ x, + const T* __restrict__ x0, + T* __restrict__ eforce) { constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; @@ -54,7 +54,7 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( if (elemId >= nbElem) return; // Gather displacement = x - x0 for this element's nodes - float disp[NNodes * 3]; + T disp[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -65,12 +65,12 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( } // Symmetric block-matrix multiply: edf = K * disp - const float* K = stiffness + elemId * NSymBlocks * 9; - float edf[NNodes * 3]; + const T* K = stiffness + elemId * NSymBlocks * 9; + T edf[NNodes * 3]; #pragma unroll for (int i = 0; i < NNodes * 3; ++i) - edf[i] = 0.0f; + edf[i] = T(0); #pragma unroll for (int ni = 0; ni < NNodes; ++ni) @@ -79,10 +79,10 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( // Diagonal block { - const float* Kii = K + diagIdx * 9; - const float di0 = disp[ni * 3 + 0]; - const float di1 = disp[ni * 3 + 1]; - const float di2 = disp[ni * 3 + 2]; + const T* Kii = K + diagIdx * 9; + const T di0 = disp[ni * 3 + 0]; + const T di1 = disp[ni * 3 + 1]; + const T di2 = disp[ni * 3 + 2]; edf[ni * 3 + 0] += Kii[0] * di0 + Kii[1] * di1 + Kii[2] * di2; edf[ni * 3 + 1] += Kii[3] * di0 + Kii[4] * di1 + Kii[5] * di2; edf[ni * 3 + 2] += Kii[6] * di0 + Kii[7] * di1 + Kii[8] * di2; @@ -93,21 +93,21 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( for (int nj = ni + 1; nj < NNodes; ++nj) 
{ const int symIdx = diagIdx + (nj - ni); - const float* Kij = K + symIdx * 9; + const T* Kij = K + symIdx * 9; { - const float dj0 = disp[nj * 3 + 0]; - const float dj1 = disp[nj * 3 + 1]; - const float dj2 = disp[nj * 3 + 2]; + const T dj0 = disp[nj * 3 + 0]; + const T dj1 = disp[nj * 3 + 1]; + const T dj2 = disp[nj * 3 + 2]; edf[ni * 3 + 0] += Kij[0] * dj0 + Kij[1] * dj1 + Kij[2] * dj2; edf[ni * 3 + 1] += Kij[3] * dj0 + Kij[4] * dj1 + Kij[5] * dj2; edf[ni * 3 + 2] += Kij[6] * dj0 + Kij[7] * dj1 + Kij[8] * dj2; } { - const float di0 = disp[ni * 3 + 0]; - const float di1 = disp[ni * 3 + 1]; - const float di2 = disp[ni * 3 + 2]; + const T di0 = disp[ni * 3 + 0]; + const T di1 = disp[ni * 3 + 1]; + const T di2 = disp[ni * 3 + 2]; edf[nj * 3 + 0] += Kij[0] * di0 + Kij[3] * di1 + Kij[6] * di2; edf[nj * 3 + 1] += Kij[1] * di0 + Kij[4] * di1 + Kij[7] * di2; edf[nj * 3 + 2] += Kij[2] * di0 + Kij[5] * di1 + Kij[8] * di2; @@ -116,7 +116,7 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( } // Write: eforce = -edf (minus sign from f -= K * displacement) - float* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * 3; #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -130,17 +130,15 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel( * Kernel for addDForce: Compute per-element dForce (1 thread per element). * * df = -kFactor * K * dx - * Templated on NNodes (compile-time) for full loop unrolling. - * Hardcoded Dim=3 (CudaVec3f only). 
*/ -template -__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel( +template +__global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( int nbElem, const int* __restrict__ elements, - const float* __restrict__ stiffness, - const float* __restrict__ dx, - float* __restrict__ eforce, - float kFactor) + const T* __restrict__ stiffness, + const T* __restrict__ dx, + T* __restrict__ eforce, + T kFactor) { constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; @@ -148,7 +146,7 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel if (elemId >= nbElem) return; // Gather dx for this element's nodes - float edx[NNodes * 3]; + T edx[NNodes * 3]; #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -159,51 +157,47 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel } // Symmetric block-matrix multiply: edf = K * edx - const float* K = stiffness + elemId * NSymBlocks * 9; - float edf[NNodes * 3]; + const T* K = stiffness + elemId * NSymBlocks * 9; + T edf[NNodes * 3]; #pragma unroll for (int i = 0; i < NNodes * 3; ++i) - edf[i] = 0.0f; + edf[i] = T(0); #pragma unroll for (int ni = 0; ni < NNodes; ++ni) { const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; - // Diagonal block (ni, ni): Kii * edx[ni] { - const float* Kii = K + diagIdx * 9; - const float di0 = edx[ni * 3 + 0]; - const float di1 = edx[ni * 3 + 1]; - const float di2 = edx[ni * 3 + 2]; + const T* Kii = K + diagIdx * 9; + const T di0 = edx[ni * 3 + 0]; + const T di1 = edx[ni * 3 + 1]; + const T di2 = edx[ni * 3 + 2]; edf[ni * 3 + 0] += Kii[0] * di0 + Kii[1] * di1 + Kii[2] * di2; edf[ni * 3 + 1] += Kii[3] * di0 + Kii[4] * di1 + Kii[5] * di2; edf[ni * 3 + 2] += Kii[6] * di0 + Kii[7] * di1 + Kii[8] * di2; } - // Off-diagonal blocks (ni, nj) for nj > ni #pragma unroll for (int nj = ni + 1; nj < NNodes; ++nj) { const int symIdx = diagIdx + (nj - ni); - const float* Kij = K + symIdx * 9; + const T* Kij = K + symIdx 
* 9; - // Forward: edf[ni] += Kij * edx[nj] { - const float dj0 = edx[nj * 3 + 0]; - const float dj1 = edx[nj * 3 + 1]; - const float dj2 = edx[nj * 3 + 2]; + const T dj0 = edx[nj * 3 + 0]; + const T dj1 = edx[nj * 3 + 1]; + const T dj2 = edx[nj * 3 + 2]; edf[ni * 3 + 0] += Kij[0] * dj0 + Kij[1] * dj1 + Kij[2] * dj2; edf[ni * 3 + 1] += Kij[3] * dj0 + Kij[4] * dj1 + Kij[5] * dj2; edf[ni * 3 + 2] += Kij[6] * dj0 + Kij[7] * dj1 + Kij[8] * dj2; } - // Symmetric: edf[nj] += Kij^T * edx[ni] { - const float di0 = edx[ni * 3 + 0]; - const float di1 = edx[ni * 3 + 1]; - const float di2 = edx[ni * 3 + 2]; + const T di0 = edx[ni * 3 + 0]; + const T di1 = edx[ni * 3 + 1]; + const T di2 = edx[ni * 3 + 2]; edf[nj * 3 + 0] += Kij[0] * di0 + Kij[3] * di1 + Kij[6] * di2; edf[nj * 3 + 1] += Kij[1] * di0 + Kij[4] * di1 + Kij[7] * di2; edf[nj * 3 + 2] += Kij[2] * di0 + Kij[5] * di1 + Kij[8] * di2; @@ -212,7 +206,7 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel } // Write: eforce = -kFactor * edf - float* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * 3; #pragma unroll for (int n = 0; n < NNodes; ++n) { @@ -224,23 +218,19 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel /** * Gather per-vertex forces (1 thread per vertex). - * - * Shared by both addForce and addDForce. - * No atomics: each vertex handled by exactly one thread. - * velems is SoA: velems[s * nbVertex + vertexId], 0-terminated. - * Each entry is (elemId * NNodes + localNode + 1), with 0 as sentinel. 
*/ -__global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_gatherForce_kernel( +template +__global__ void ElementLinearSmallStrainFEMForceField_gatherForce_kernel( int nbVertex, int maxElemPerVertex, const int* __restrict__ velems, - const float* __restrict__ eforce, - float* df) + const T* __restrict__ eforce, + T* df) { const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; if (vertexId >= nbVertex) return; - float fx = 0.0f, fy = 0.0f, fz = 0.0f; + T fx = T(0), fy = T(0), fz = T(0); for (int s = 0; s < maxElemPerVertex; ++s) { @@ -257,6 +247,7 @@ __global__ void ElementLinearSmallStrainFEMForceFieldCuda3f_gatherForce_kernel( df[vertexId * 3 + 2] += fz; } +template static void launchGather( unsigned int nbVertex, unsigned int maxElemPerVertex, @@ -266,17 +257,17 @@ static void launchGather( { const int gatherThreads = 256; const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementLinearSmallStrainFEMForceFieldCuda3f_gatherForce_kernel + ElementLinearSmallStrainFEMForceField_gatherForce_kernel <<>>( nbVertex, maxElemPerVertex, (const int*)velems, - (const float*)eforce, - (float*)f); - mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_gatherForce_kernel"); + (const T*)eforce, + (T*)f); + mycudaDebugError("ElementLinearSmallStrainFEMForceField_gatherForce_kernel"); } -template +template static void launchAddForce( unsigned int nbElem, unsigned int nbVertex, @@ -291,20 +282,20 @@ static void launchAddForce( { const int computeThreads = 64; const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel + ElementLinearSmallStrainFEMForceField_computeForce_kernel <<>>( nbElem, (const int*)elements, - (const float*)stiffness, - (const float*)x, - (const float*)x0, - (float*)eforce); - mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_computeForce_kernel"); + (const T*)stiffness, + (const T*)x, + (const T*)x0, + (T*)eforce); + 
mycudaDebugError("ElementLinearSmallStrainFEMForceField_computeForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); + launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); } -template +template static void launchAddDForce( unsigned int nbElem, unsigned int nbVertex, @@ -315,21 +306,21 @@ static void launchAddDForce( void* df, void* eforce, const void* velems, - float kFactor) + T kFactor) { const int computeThreads = 64; const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel + ElementLinearSmallStrainFEMForceField_computeDForce_kernel <<>>( nbElem, (const int*)elements, - (const float*)stiffness, - (const float*)dx, - (float*)eforce, + (const T*)stiffness, + (const T*)dx, + (T*)eforce, kFactor); - mycudaDebugError("ElementLinearSmallStrainFEMForceFieldCuda3f_computeDForce_kernel"); + mycudaDebugError("ElementLinearSmallStrainFEMForceField_computeDForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); + launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); } extern "C" @@ -350,10 +341,10 @@ void ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( { switch (nbNodesPerElem) { - case 2: launchAddForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 3: launchAddForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 4: launchAddForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 8: launchAddForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 4: launchAddForce(nbElem, nbVertex, 
maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; } } @@ -372,10 +363,54 @@ void ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( { switch (nbNodesPerElem) { - case 2: launchAddDForce<2>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 3: launchAddDForce<3>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 4: launchAddDForce<4>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 8: launchAddDForce<8>(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + } +} + +void ElementLinearSmallStrainFEMForceFieldCuda3d_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems) +{ + switch (nbNodesPerElem) + { + case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + case 4: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); 
break; + case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; + } +} + +void ElementLinearSmallStrainFEMForceFieldCuda3d_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + double kFactor) +{ + switch (nbNodesPerElem) + { + case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; + case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; } } diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h index 53cfaf663c5..777d3301ee2 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h @@ -54,6 +54,32 @@ extern "C" void* eforce, const void* velems, float kFactor); + + void ElementLinearSmallStrainFEMForceFieldCuda3d_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems); + + void 
ElementLinearSmallStrainFEMForceFieldCuda3d_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int nbNodesPerElem, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + double kFactor); } } // namespace sofa::gpu::cuda @@ -117,10 +143,10 @@ class CudaElementLinearSmallStrainFEMForceField void uploadStiffnessAndConnectivity(); - gpu::cuda::CudaVector m_gpuStiffness; ///< Symmetric block-format stiffness per element - gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] - gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer - gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated + gpu::cuda::CudaVector m_gpuStiffness; ///< Symmetric block-format stiffness per element + gpu::cuda::CudaVector m_gpuElements; ///< SoA connectivity: elements[nodeIdx * nbElem + elemId] + gpu::cuda::CudaVector m_gpuElementForce; ///< Intermediate per-element per-node force buffer + gpu::cuda::CudaVector m_gpuVelems; ///< SoA vertex-to-element mapping, 0-terminated unsigned int m_maxElemPerVertex = 0; unsigned int m_nbVertices = 0; diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl index 1ab9dfb33f5..863511e951d 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl @@ -88,7 +88,7 @@ void CudaElementLinearSmallStrainFEMForceField::uploadSt dst[e * nSymBlocks * dim * dim + symIdx * dim * dim + di * dim + 
dj] - = static_cast(K[ni * dim + di][nj * dim + dj]); + = static_cast(K[ni * dim + di][nj * dim + dj]); } } } @@ -175,18 +175,24 @@ void CudaElementLinearSmallStrainFEMForceField::addForce const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(x.size()); - gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( - nbElem, - nbVertex, - trait::NumberOfNodesInElement, - m_maxElemPerVertex, - m_gpuElements.deviceRead(), - m_gpuStiffness.deviceRead(), - x.deviceRead(), - x0.deviceRead(), - f.deviceWrite(), - m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead()); + if constexpr (std::is_same_v) + { + gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3d_addForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), + x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + } + else + { + gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), + x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + } d_f.endEdit(); } @@ -215,7 +221,7 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc if (df.size() < dx.size()) df.resize(dx.size()); - const auto kFactor = static_cast( + const auto kFactor = static_cast( sofa::core::mechanicalparams::kFactorIncludingRayleighDamping( mparams, this->rayleighStiffness.getValue())); @@ -223,18 +229,24 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(dx.size()); - gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( - nbElem, - nbVertex, - trait::NumberOfNodesInElement, - m_maxElemPerVertex, - m_gpuElements.deviceRead(), - m_gpuStiffness.deviceRead(), - 
dx.deviceRead(), - df.deviceWrite(), - m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead(), - kFactor); + if constexpr (std::is_same_v) + { + gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3d_addDForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), + dx.deviceRead(), df.deviceWrite(), + m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), + kFactor); + } + else + { + gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( + nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), + dx.deviceRead(), df.deviceWrite(), + m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), + kFactor); + } d_df.endEdit(); } From 07bd243ff2a658b73fe51ff911a7ab0b4db581c1 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Wed, 8 Apr 2026 13:22:28 +0900 Subject: [PATCH 15/21] add benchmarks --- .../benchmarks/Hexahedron_corotational.py | 93 ++++++++++++++ .../Hexahedron_corotational.py.view | 17 +++ .../benchmarks/Tetrahedron_corotational.py | 94 +++++++++++++++ .../Tetrahedron_corotational.py.view | 17 +++ .../benchmarks/utilities.py | 114 ++++++++++++++++++ 5 files changed, 335 insertions(+) create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py.view create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py.view create mode 100644 applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/utilities.py diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py 
b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py new file mode 100644 index 00000000000..aec2063fbe6 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py @@ -0,0 +1,93 @@ +import Sofa + +import os +import numpy as np +from utilities import generate_regular_grid + +g_grid_min_corner=(0, 6, -2) +g_grid_max_corner=(16, 10, 2) + +g_fem_version = os.environ.get('FEM_VERSION', 'new') #either 'new' or 'legacy' +g_fem_template = os.environ.get('FEM_TEMPLATE', 'Vec3d') + +# default is (76, 16, 16) +g_grid_nx = int(os.environ.get('NX', '76')) +g_grid_ny = int(os.environ.get('NY', '16')) +g_grid_nz = int(os.environ.get('NZ', '16')) + +g_nb_steps = int(os.environ.get('NBSTEPS', '1000')) + +def createScene(root_node): + root_node.name = "root" + root_node.gravity = (0, -9, 0) + root_node.dt = 0.01 + + plugin_node = root_node.addChild('Plugins') + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Engine.Select") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.LinearSolver.Iterative") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.ODESolver.Backward") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.StateContainer") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Topology.Container.Dynamic") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Topology.Container.Grid") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Visual") + plugin_node.addObject('RequiredPlugin', pluginName='Sofa.Component.Constraint.Projective') # Needed to use components [FixedProjectiveConstraint] + plugin_node.addObject('RequiredPlugin', pluginName='Sofa.Component.Mass') # Needed to use components [DiagonalMass] + plugin_node.addObject('RequiredPlugin', pluginName='Sofa.Component.SolidMechanics.FEM.Elastic') # Needed to use components 
[HexahedronCorotationalFEMForceField] + plugin_node.addObject('RequiredPlugin', pluginName='SofaCUDA.Component') + plugin_node.addObject('VisualStyle', displayFlags="showBehaviorModels showForceFields") + + root_node.addObject('DefaultAnimationLoop') + root_node.addObject('VisualStyle', displayFlags="showBehaviorModels showForceFields") + + grid_nodes, grid_hexa = generate_regular_grid(nx=g_grid_nx, ny=g_grid_ny, nz=g_grid_nz, min_corner=g_grid_min_corner, max_corner=g_grid_max_corner) + + hexahedron_node = root_node.addChild('Hexahedron') + hexahedron_node.addObject('EulerImplicitSolver', rayleighStiffness="0.1", rayleighMass="0.1") + hexahedron_node.addObject('CGLinearSolver', iterations="250", name="linear_solver", tolerance="1.0e-12", threshold="1.0e-12") + hexahedron_node.addObject('MechanicalObject', name="ms", template=g_fem_template, position=grid_nodes) + hexahedron_node.addObject('HexahedronSetTopologyContainer', hexahedra=grid_hexa) + hexahedron_node.addObject('DiagonalMass', totalMass="50.0") + hexahedron_node.addObject('BoxROI', name="boxroi1", box="-0.1 5 -3 0.1 11 3", drawBoxes="1") + hexahedron_node.addObject('FixedProjectiveConstraint', indices="@boxroi1.indices") + if g_fem_version == "legacy": + hexahedron_node.addObject('HexahedronFEMForceField', name="LegacyFEM", template=g_fem_template, youngModulus="4000", poissonRatio="0.3", method="large") + if g_fem_version == "new": + hexahedron_node.addObject('HexahedronCorotationalFEMForceField', name="NewFEM", template=g_fem_template, youngModulus="4000", poissonRatio="0.3") + +def main(): + + enable_gui = False + + try: + import Sofa.Gui + import SofaImGui + except: + enable_gui = False + + root = Sofa.Core.Node("root") + createScene(root) + + Sofa.Simulation.initRoot(root) + + if enable_gui: + Sofa.Gui.GUIManager.Init("myscene","imgui") + Sofa.Gui.GUIManager.createGUI(root, __file__) + Sofa.Gui.GUIManager.MainLoop(root) + Sofa.Gui.GUIManager.closeGUI() + else: + import time + + print(f"Running on 
{g_nb_steps} steps...") + start_timer = time.time() + + for iteration in range(g_nb_steps): + Sofa.Simulation.animate(root, root.dt.value) + + stop_timer = time.time() + print(f"... Done.") + print(f"{g_nb_steps} steps done in {stop_timer - start_timer:.3}s ({g_nb_steps/(stop_timer - start_timer):.5} fps).") + + +# Function used only if this script is called from a python environment +if __name__ == '__main__': + main() diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py.view b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py.view new file mode 100644 index 00000000000..1e9c7f6670c --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Hexahedron_corotational.py.view @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py new file mode 100644 index 00000000000..d9480c34dfa --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py @@ -0,0 +1,94 @@ +import Sofa + +import os +import numpy as np +from utilities import generate_regular_grid, hexa_to_tetra + +g_grid_min_corner=(0, 6, -2) +g_grid_max_corner=(16, 10, 2) + +g_fem_version = os.environ.get('FEM_VERSION', 'new') #either 'new' or 'legacy' +g_fem_template = os.environ.get('FEM_TEMPLATE', 'Vec3d') + +# default is (76, 16, 16) +g_grid_nx = int(os.environ.get('NX', '76')) +g_grid_ny = int(os.environ.get('NY', '16')) +g_grid_nz = int(os.environ.get('NZ', '16')) + +g_nb_steps = int(os.environ.get('NBSTEPS', '1000')) + +def createScene(root_node): + root_node.name = "root" + root_node.gravity = (0, -9, 0) + root_node.dt = 0.01 + + plugin_node = root_node.addChild('Plugins') + 
plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Engine.Select") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.LinearSolver.Iterative") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.ODESolver.Backward") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.StateContainer") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Topology.Container.Dynamic") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Topology.Container.Grid") + plugin_node.addObject('RequiredPlugin', pluginName="Sofa.Component.Visual") + plugin_node.addObject('RequiredPlugin', pluginName='Sofa.Component.Constraint.Projective') # Needed to use components [FixedProjectiveConstraint] + plugin_node.addObject('RequiredPlugin', pluginName='Sofa.Component.Mass') # Needed to use components [DiagonalMass] + plugin_node.addObject('RequiredPlugin', pluginName='Sofa.Component.SolidMechanics.FEM.Elastic') # Needed to use components [TetrahedronCorotationalFEMForceField] + plugin_node.addObject('RequiredPlugin', pluginName='SofaCUDA.Component') + plugin_node.addObject('VisualStyle', displayFlags="showBehaviorModels showForceFields") + + root_node.addObject('DefaultAnimationLoop') + root_node.addObject('VisualStyle', displayFlags="showBehaviorModels showForceFields") + + grid_nodes, grid_hexa = generate_regular_grid(nx=g_grid_nx, ny=g_grid_ny, nz=g_grid_nz, min_corner=g_grid_min_corner, max_corner=g_grid_max_corner) + grid_tetra = hexa_to_tetra(grid_hexa) + + tetrahedron_node = root_node.addChild('Tetrahedron') + tetrahedron_node.addObject('EulerImplicitSolver', rayleighStiffness="0.1", rayleighMass="0.1") + tetrahedron_node.addObject('CGLinearSolver', iterations="250", name="linear_solver", tolerance="1.0e-12", threshold="1.0e-12") + tetrahedron_node.addObject('MechanicalObject', name="ms", template=g_fem_template, position=grid_nodes) + tetrahedron_node.addObject('TetrahedronSetTopologyContainer', 
tetrahedra=grid_tetra) + tetrahedron_node.addObject('DiagonalMass', totalMass="50.0") + tetrahedron_node.addObject('BoxROI', name="boxroi1", box="-0.1 5 -3 0.1 11 3", drawBoxes="1") + tetrahedron_node.addObject('FixedProjectiveConstraint', indices="@boxroi1.indices") + if g_fem_version == "legacy": + tetrahedron_node.addObject('TetrahedronFEMForceField', name="LegacyFEM", template=g_fem_template, youngModulus="4000", poissonRatio="0.3", method="large") + if g_fem_version == "new": + tetrahedron_node.addObject('TetrahedronCorotationalFEMForceField', name="NewFEM", template=g_fem_template, youngModulus="4000", poissonRatio="0.3") + +def main(): + + enable_gui = False + + try: + import Sofa.Gui + import SofaImGui + except: + enable_gui = False + + root = Sofa.Core.Node("root") + createScene(root) + + Sofa.Simulation.initRoot(root) + + if enable_gui: + Sofa.Gui.GUIManager.Init("myscene","imgui") + Sofa.Gui.GUIManager.createGUI(root, __file__) + Sofa.Gui.GUIManager.MainLoop(root) + Sofa.Gui.GUIManager.closeGUI() + else: + import time + + print(f"Running on {g_nb_steps} steps...") + start_timer = time.time() + + for iteration in range(g_nb_steps): + Sofa.Simulation.animate(root, root.dt.value) + + stop_timer = time.time() + print(f"... 
Done.") + print(f"{g_nb_steps} steps done in {stop_timer - start_timer:.3}s ({g_nb_steps/(stop_timer - start_timer):.5} fps).") + + +# Function used only if this script is called from a python environment +if __name__ == '__main__': + main() diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py.view b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py.view new file mode 100644 index 00000000000..433112afafd --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/Tetrahedron_corotational.py.view @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/utilities.py b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/utilities.py new file mode 100644 index 00000000000..3bdf301c641 --- /dev/null +++ b/applications/plugins/SofaCUDA/examples/ElementFEMForcefield/benchmarks/utilities.py @@ -0,0 +1,114 @@ +import numpy as np + +def generate_regular_grid(nx=10, ny=10, nz=10, min_corner=(0, 0, 0), max_corner=(1, 1, 1)): + """ + Generate a regular grid of hexahedra. 
+ + Args: + nx, ny, nz: Number of vertices in each direction (grid resolution) + min_corner: (xmin, ymin, zmin) tuple + max_corner: (xmax, ymax, zmax) tuple + + Returns: + points: array of shape (nx*ny*nz, 3) - vertex positions + hexahedra: array of shape ((nx-1)*(ny-1)*(nz-1), 8) - hexahedra indices + """ + xmin, ymin, zmin = min_corner + xmax, ymax, zmax = max_corner + + # Compute spacing + dx = (xmax - xmin) / (nx - 1) if nx > 1 else 0 + dy = (ymax - ymin) / (ny - 1) if ny > 1 else 0 + dz = (zmax - zmin) / (nz - 1) if nz > 1 else 0 + + # Generate points + points = [] + for k in range(nz): + for j in range(ny): + for i in range(nx): + points.append([xmin + i*dx, ymin + j*dy, zmin + k*dz]) + points = np.array(points) + + # Helper to get point index from grid coordinates + def point_index(i, j, k): + return nx * (ny * k + j) + i + + # Generate hexahedra (8 vertices per hexa, in SOFA convention) + hexahedra = [] + for k in range(nz - 1): + for j in range(ny - 1): + for i in range(nx - 1): + hexa = [ + point_index(i, j, k), + point_index(i+1, j, k), + point_index(i+1, j+1, k), + point_index(i, j+1, k), + point_index(i, j, k+1), + point_index(i+1, j, k+1), + point_index(i+1, j+1, k+1), + point_index(i, j+1, k+1), + ] + hexahedra.append(hexa) + hexahedra = np.array(hexahedra) + + return points, hexahedra + +def hexa_to_tetra(hexahedra): + """ + Convert hexahedra to tetrahedra. + + Each hexahedron is split into 5 tetrahedra. 
+ + Args: + hexahedra: array of shape (N, 8) - hexahedra vertex indices + + Returns: + tetrahedra: array of shape (N*5, 4) - tetrahedra vertex indices + """ + tetrahedra = [] + + # 5-tetra decomposition using diagonal 1-3-4-6 + splits = [ + [0, 1, 3, 4], + [1, 2, 3, 6], + [1, 4, 5, 6], + [3, 4, 6, 7], + [1, 3, 4, 6], # central tetrahedron + ] + + for hexa in hexahedra: + for split in splits: + tetrahedra.append([hexa[i] for i in split]) + + return np.array(tetrahedra) + +def hexa_to_tetra_symmetric(hexahedra): + """ + Convert hexahedra to tetrahedra using symmetric 6-tetra decomposition. + + Each hexahedron is split into 6 tetrahedra around the space diagonal (0-6). + Better symmetry properties for FEM simulations. + + Args: + hexahedra: array of shape (N, 8) - hexahedra vertex indices + + Returns: + tetrahedra: array of shape (N*6, 4) - tetrahedra vertex indices + """ + tetrahedra = [] + + # 6-tetra symmetric decomposition around diagonal 0-6 + splits = [ + [0, 1, 2, 6], + [0, 2, 3, 6], + [0, 3, 7, 6], + [0, 7, 4, 6], + [0, 4, 5, 6], + [0, 5, 1, 6], + ] + + for hexa in hexahedra: + for split in splits: + tetrahedra.append([hexa[i] for i in split]) + + return np.array(tetrahedra) \ No newline at end of file From f1778f14a7cdf6b20e42f59347a596397c392624 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Thu, 9 Apr 2026 07:35:59 +0900 Subject: [PATCH 16/21] template CUDA kernels on Dim and remove runtime dispatch Replace hardcoded 3D assumption and extern "C" + switch(nbNodesPerElem) runtime dispatch with fully compile-time C++ template parameters . All kernel dimensions, stiffness block sizes, and gather loops are now generic over Dim. The .inl callers use a single template call with constexpr nNodes and dim from the trait, eliminating both the if-constexpr type branching and the runtime NNodes switch. Explicit template instantiations in the .cu files provide the needed symbols. 
Applied to both ElementLinearSmallStrainFEMForceField and ElementCorotationalFEMForceField CUDA implementations. --- .../CudaElementCorotationalFEMForceField.cu | 584 ++++++++---------- .../CudaElementCorotationalFEMForceField.h | 130 ++-- .../CudaElementCorotationalFEMForceField.inl | 87 +-- ...daElementLinearSmallStrainFEMForceField.cu | 335 ++++------ ...udaElementLinearSmallStrainFEMForceField.h | 79 +-- ...aElementLinearSmallStrainFEMForceField.inl | 52 +- 6 files changed, 518 insertions(+), 749 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index afbbdb89532..8cb90f8c540 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -23,14 +23,12 @@ #include #include -#if defined(__cplusplus) namespace sofa { namespace gpu { namespace cuda { -#endif template __device__ T myRsqrt(T x); @@ -164,12 +162,13 @@ __device__ void computeHexahedronFrame(const T* ex, T* frame) /** * Symmetric block-matrix multiply: out = K * in + * Templated on Dim for generic spatial dimensions. 
*/ -template +template __device__ void symBlockMatMul(const T* K, const T* in, T* out) { #pragma unroll - for (int i = 0; i < NNodes * 3; ++i) + for (int i = 0; i < NNodes * Dim; ++i) out[i] = T(0); #pragma unroll @@ -177,38 +176,47 @@ __device__ void symBlockMatMul(const T* K, const T* in, T* out) { const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; + // Diagonal block { - const T* Kii = K + diagIdx * 9; - const T i0 = in[ni * 3 + 0]; - const T i1 = in[ni * 3 + 1]; - const T i2 = in[ni * 3 + 2]; - out[ni * 3 + 0] += Kii[0] * i0 + Kii[1] * i1 + Kii[2] * i2; - out[ni * 3 + 1] += Kii[3] * i0 + Kii[4] * i1 + Kii[5] * i2; - out[ni * 3 + 2] += Kii[6] * i0 + Kii[7] * i1 + Kii[8] * i2; + const T* Kii = K + diagIdx * Dim * Dim; + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kii[di * Dim + dj] * in[ni * Dim + dj]; + out[ni * Dim + di] += sum; + } } + // Off-diagonal blocks #pragma unroll for (int nj = ni + 1; nj < NNodes; ++nj) { const int symIdx = diagIdx + (nj - ni); - const T* Kij = K + symIdx * 9; + const T* Kij = K + symIdx * Dim * Dim; + // Kij * in_j -> out_i + #pragma unroll + for (int di = 0; di < Dim; ++di) { - const T j0 = in[nj * 3 + 0]; - const T j1 = in[nj * 3 + 1]; - const T j2 = in[nj * 3 + 2]; - out[ni * 3 + 0] += Kij[0] * j0 + Kij[1] * j1 + Kij[2] * j2; - out[ni * 3 + 1] += Kij[3] * j0 + Kij[4] * j1 + Kij[5] * j2; - out[ni * 3 + 2] += Kij[6] * j0 + Kij[7] * j1 + Kij[8] * j2; + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kij[di * Dim + dj] * in[nj * Dim + dj]; + out[ni * Dim + di] += sum; } + // Kij^T * in_i -> out_j + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) { - const T i0 = in[ni * 3 + 0]; - const T i1 = in[ni * 3 + 1]; - const T i2 = in[ni * 3 + 2]; - out[nj * 3 + 0] += Kij[0] * i0 + Kij[3] * i1 + Kij[6] * i2; - out[nj * 3 + 1] += Kij[1] * i0 + Kij[4] * i1 + Kij[7] * i2; - out[nj * 3 + 2] += Kij[2] * i0 + Kij[5] * i1 + Kij[8] * 
i2; + T sum = T(0); + #pragma unroll + for (int di = 0; di < Dim; ++di) + sum += Kij[di * Dim + dj] * in[ni * Dim + di]; + out[nj * Dim + dj] += sum; } } } @@ -216,8 +224,9 @@ __device__ void symBlockMatMul(const T* K, const T* in, T* out) /** * Combined kernel: compute rotations AND per-element forces in one pass. + * Rotation computation is inherently 3D (cross products). */ -template +template __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel( int nbElem, const int* __restrict__ elements, @@ -228,88 +237,112 @@ __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel T* __restrict__ rotationsOut, T* __restrict__ eforce) { + static_assert(Dim == 3, "Corotational rotation computation requires Dim == 3"); constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; const T invN = T(1) / T(NNodes); const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - T ex[NNodes * 3], ex0[NNodes * 3]; + T ex[NNodes * Dim], ex0[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) { const int nodeId = elements[n * nbElem + elemId]; - ex[n * 3 + 0] = x[nodeId * 3 + 0]; - ex[n * 3 + 1] = x[nodeId * 3 + 1]; - ex[n * 3 + 2] = x[nodeId * 3 + 2]; - ex0[n * 3 + 0] = x0[nodeId * 3 + 0]; - ex0[n * 3 + 1] = x0[nodeId * 3 + 1]; - ex0[n * 3 + 2] = x0[nodeId * 3 + 2]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + { + ex[n * Dim + d] = x[nodeId * Dim + d]; + ex0[n * Dim + d] = x0[nodeId * Dim + d]; + } } - T frame[9]; + T frame[Dim * Dim]; if constexpr (NNodes == 8) computeHexahedronFrame(ex, frame); else computeTriangleFrame(ex, frame); // R = frame^T * initRot - const T* irt = initRotTransposed + elemId * 9; - T R[9]; + const T* irt = initRotTransposed + elemId * Dim * Dim; + T R[Dim * Dim]; mat3TransposeMul(frame, irt, R); - T* Rout = rotationsOut + elemId * 9; + T* Rout = rotationsOut + elemId * Dim * Dim; #pragma unroll - for (int i = 0; i < 9; ++i) + for (int i = 0; i < Dim * Dim; ++i) Rout[i] 
= R[i]; - T cx = T(0), cy = T(0), cz = T(0); - T cx0 = T(0), cy0 = T(0), cz0 = T(0); + T center[Dim], center0[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + { + center[d] = T(0); + center0[d] = T(0); + } #pragma unroll for (int n = 0; n < NNodes; ++n) { - cx += ex[n * 3 + 0]; cy += ex[n * 3 + 1]; cz += ex[n * 3 + 2]; - cx0 += ex0[n * 3 + 0]; cy0 += ex0[n * 3 + 1]; cz0 += ex0[n * 3 + 2]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + { + center[d] += ex[n * Dim + d]; + center0[d] += ex0[n * Dim + d]; + } + } + #pragma unroll + for (int d = 0; d < Dim; ++d) + { + center[d] *= invN; + center0[d] *= invN; } - cx *= invN; cy *= invN; cz *= invN; - cx0 *= invN; cy0 *= invN; cz0 *= invN; - T disp[NNodes * 3]; + T disp[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const T dx = ex[n * 3 + 0] - cx; - const T dy = ex[n * 3 + 1] - cy; - const T dz = ex[n * 3 + 2] - cz; - const T rx = R[0] * dx + R[3] * dy + R[6] * dz; - const T ry = R[1] * dx + R[4] * dy + R[7] * dz; - const T rz = R[2] * dx + R[5] * dy + R[8] * dz; - disp[n * 3 + 0] = rx - (ex0[n * 3 + 0] - cx0); - disp[n * 3 + 1] = ry - (ex0[n * 3 + 1] - cy0); - disp[n * 3 + 2] = rz - (ex0[n * 3 + 2] - cz0); + // R^T * (x_n - center) + T diff[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + diff[d] = ex[n * Dim + d] - center[d]; + + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T rotated = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + rotated += R[dj * Dim + di] * diff[dj]; + disp[n * Dim + di] = rotated - (ex0[n * Dim + di] - center0[di]); + } } - T edf[NNodes * 3]; - const T* K = stiffness + elemId * NSymBlocks * 9; - symBlockMatMul(K, disp, edf); + T edf[NNodes * Dim]; + const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; + symBlockMatMul(K, disp, edf); - T* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const T e0 = edf[n * 3 + 0]; - const T e1 = edf[n * 3 + 1]; - 
const T e2 = edf[n * 3 + 2]; - out[n * 3 + 0] = -(R[0] * e0 + R[1] * e1 + R[2] * e2); - out[n * 3 + 1] = -(R[3] * e0 + R[4] * e1 + R[5] * e2); - out[n * 3 + 2] = -(R[6] * e0 + R[7] * e1 + R[8] * e2); + // R * edf_n, negated + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += R[di * Dim + dj] * edf[n * Dim + dj]; + out[n * Dim + di] = -sum; + } } } /** * Kernel for addForce: Compute per-element force (1 thread per element). */ -template +template __global__ void ElementCorotationalFEMForceField_computeForce_kernel( int nbElem, const int* __restrict__ elements, @@ -325,72 +358,93 @@ __global__ void ElementCorotationalFEMForceField_computeForce_kernel( const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - const T* Rptr = rotations + elemId * 9; - T R[9]; + const T* Rptr = rotations + elemId * Dim * Dim; + T R[Dim * Dim]; #pragma unroll - for (int i = 0; i < 9; ++i) + for (int i = 0; i < Dim * Dim; ++i) R[i] = Rptr[i]; - T ex[NNodes * 3], ex0[NNodes * 3]; + T ex[NNodes * Dim], ex0[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) { const int nodeId = elements[n * nbElem + elemId]; - ex[n * 3 + 0] = x[nodeId * 3 + 0]; - ex[n * 3 + 1] = x[nodeId * 3 + 1]; - ex[n * 3 + 2] = x[nodeId * 3 + 2]; - ex0[n * 3 + 0] = x0[nodeId * 3 + 0]; - ex0[n * 3 + 1] = x0[nodeId * 3 + 1]; - ex0[n * 3 + 2] = x0[nodeId * 3 + 2]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + { + ex[n * Dim + d] = x[nodeId * Dim + d]; + ex0[n * Dim + d] = x0[nodeId * Dim + d]; + } } - T cx = T(0), cy = T(0), cz = T(0); - T cx0 = T(0), cy0 = T(0), cz0 = T(0); + T center[Dim], center0[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + { + center[d] = T(0); + center0[d] = T(0); + } #pragma unroll for (int n = 0; n < NNodes; ++n) { - cx += ex[n * 3 + 0]; cy += ex[n * 3 + 1]; cz += ex[n * 3 + 2]; - cx0 += ex0[n * 3 + 0]; cy0 += ex0[n * 3 + 1]; cz0 += ex0[n * 3 + 2]; + #pragma 
unroll + for (int d = 0; d < Dim; ++d) + { + center[d] += ex[n * Dim + d]; + center0[d] += ex0[n * Dim + d]; + } + } + #pragma unroll + for (int d = 0; d < Dim; ++d) + { + center[d] *= invN; + center0[d] *= invN; } - cx *= invN; cy *= invN; cz *= invN; - cx0 *= invN; cy0 *= invN; cz0 *= invN; - T disp[NNodes * 3]; + T disp[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const T dx = ex[n * 3 + 0] - cx; - const T dy = ex[n * 3 + 1] - cy; - const T dz = ex[n * 3 + 2] - cz; - const T rx = R[0] * dx + R[3] * dy + R[6] * dz; - const T ry = R[1] * dx + R[4] * dy + R[7] * dz; - const T rz = R[2] * dx + R[5] * dy + R[8] * dz; - disp[n * 3 + 0] = rx - (ex0[n * 3 + 0] - cx0); - disp[n * 3 + 1] = ry - (ex0[n * 3 + 1] - cy0); - disp[n * 3 + 2] = rz - (ex0[n * 3 + 2] - cz0); + T diff[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + diff[d] = ex[n * Dim + d] - center[d]; + + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T rotated = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + rotated += R[dj * Dim + di] * diff[dj]; + disp[n * Dim + di] = rotated - (ex0[n * Dim + di] - center0[di]); + } } - T edf[NNodes * 3]; - const T* K = stiffness + elemId * NSymBlocks * 9; - symBlockMatMul(K, disp, edf); + T edf[NNodes * Dim]; + const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; + symBlockMatMul(K, disp, edf); - T* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const T e0 = edf[n * 3 + 0]; - const T e1 = edf[n * 3 + 1]; - const T e2 = edf[n * 3 + 2]; - out[n * 3 + 0] = -(R[0] * e0 + R[1] * e1 + R[2] * e2); - out[n * 3 + 1] = -(R[3] * e0 + R[4] * e1 + R[5] * e2); - out[n * 3 + 2] = -(R[6] * e0 + R[7] * e1 + R[8] * e2); + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += R[di * Dim + dj] * edf[n * Dim + dj]; + out[n * Dim + di] = -sum; + } } } /** * Kernel for addDForce: 
Compute per-element dForce (1 thread per element). */ -template +template __global__ void ElementCorotationalFEMForceField_computeDForce_kernel( int nbElem, const int* __restrict__ elements, @@ -405,46 +459,59 @@ __global__ void ElementCorotationalFEMForceField_computeDForce_kernel( const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - const T* Rptr = rotations + elemId * 9; - T R[9]; + const T* Rptr = rotations + elemId * Dim * Dim; + T R[Dim * Dim]; #pragma unroll - for (int i = 0; i < 9; ++i) + for (int i = 0; i < Dim * Dim; ++i) R[i] = Rptr[i]; - T rdx[NNodes * 3]; + // R^T * dx for each node + T rdx[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) { const int nodeId = elements[n * nbElem + elemId]; - const T dx_x = dx[nodeId * 3 + 0]; - const T dx_y = dx[nodeId * 3 + 1]; - const T dx_z = dx[nodeId * 3 + 2]; - rdx[n * 3 + 0] = R[0] * dx_x + R[3] * dx_y + R[6] * dx_z; - rdx[n * 3 + 1] = R[1] * dx_x + R[4] * dx_y + R[7] * dx_z; - rdx[n * 3 + 2] = R[2] * dx_x + R[5] * dx_y + R[8] * dx_z; + T nodeDx[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + nodeDx[d] = dx[nodeId * Dim + d]; + + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += R[dj * Dim + di] * nodeDx[dj]; + rdx[n * Dim + di] = sum; + } } - const T* K = stiffness + elemId * NSymBlocks * 9; - T edf[NNodes * 3]; - symBlockMatMul(K, rdx, edf); + const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; + T edf[NNodes * Dim]; + symBlockMatMul(K, rdx, edf); - T* out = eforce + elemId * NNodes * 3; + // R * edf, scaled by -kFactor + T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) { - const T e0 = edf[n * 3 + 0]; - const T e1 = edf[n * 3 + 1]; - const T e2 = edf[n * 3 + 2]; - out[n * 3 + 0] = -kFactor * (R[0] * e0 + R[1] * e1 + R[2] * e2); - out[n * 3 + 1] = -kFactor * (R[3] * e0 + R[4] * e1 + R[5] * e2); - out[n * 3 + 2] = -kFactor * 
(R[6] * e0 + R[7] * e1 + R[8] * e2); + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += R[di * Dim + dj] * edf[n * Dim + dj]; + out[n * Dim + di] = -kFactor * sum; + } } } /** * Gather per-vertex forces (1 thread per vertex). */ -template +template __global__ void ElementCorotationalFEMForceField_gatherForce_kernel( int nbVertex, int maxElemPerVertex, @@ -455,45 +522,30 @@ __global__ void ElementCorotationalFEMForceField_gatherForce_kernel( const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; if (vertexId >= nbVertex) return; - T fx = T(0), fy = T(0), fz = T(0); + T acc[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + acc[d] = T(0); for (int s = 0; s < maxElemPerVertex; ++s) { const int idx = velems[s * nbVertex + vertexId]; if (idx == 0) break; - const int base = (idx - 1) * 3; - fx += eforce[base + 0]; - fy += eforce[base + 1]; - fz += eforce[base + 2]; + const int base = (idx - 1) * Dim; + #pragma unroll + for (int d = 0; d < Dim; ++d) + acc[d] += eforce[base + d]; } - df[vertexId * 3 + 0] += fx; - df[vertexId * 3 + 1] += fy; - df[vertexId * 3 + 2] += fz; + #pragma unroll + for (int d = 0; d < Dim; ++d) + df[vertexId * Dim + d] += acc[d]; } -template -static void launchGather( - unsigned int nbVertex, - unsigned int maxElemPerVertex, - const void* velems, - const void* eforce, - void* f) -{ - const int gatherThreads = 256; - const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementCorotationalFEMForceField_gatherForce_kernel - <<>>( - nbVertex, - maxElemPerVertex, - (const int*)velems, - (const T*)eforce, - (T*)f); - mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); -} +// ===================== Launch functions (C++ templates) ===================== -template -static void launchAddForceWithRotations( +template +void ElementCorotationalFEMForceFieldCuda_addForceWithRotations( unsigned int nbElem, unsigned int nbVertex, 
unsigned int maxElemPerVertex, @@ -508,8 +560,8 @@ static void launchAddForceWithRotations( const void* velems) { const int computeThreads = 64; - const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementCorotationalFEMForceField_computeRotationsAndForce_kernel + int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementCorotationalFEMForceField_computeRotationsAndForce_kernel <<>>( nbElem, (const int*)elements, @@ -521,11 +573,20 @@ static void launchAddForceWithRotations( (T*)eforce); mycudaDebugError("ElementCorotationalFEMForceField_computeRotationsAndForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); + const int gatherThreads = 256; + numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementCorotationalFEMForceField_gatherForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const T*)eforce, + (T*)f); + mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); } -template -static void launchAddForce( +template +void ElementCorotationalFEMForceFieldCuda_addForce( unsigned int nbElem, unsigned int nbVertex, unsigned int maxElemPerVertex, @@ -539,8 +600,8 @@ static void launchAddForce( const void* velems) { const int computeThreads = 64; - const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementCorotationalFEMForceField_computeForce_kernel + int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementCorotationalFEMForceField_computeForce_kernel <<>>( nbElem, (const int*)elements, @@ -551,11 +612,20 @@ static void launchAddForce( (T*)eforce); mycudaDebugError("ElementCorotationalFEMForceField_computeForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); + const int gatherThreads = 256; + numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementCorotationalFEMForceField_gatherForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const T*)eforce, + (T*)f); 
+ mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); } -template -static void launchAddDForce( +template +void ElementCorotationalFEMForceFieldCuda_addDForce( unsigned int nbElem, unsigned int nbVertex, unsigned int maxElemPerVertex, @@ -569,8 +639,8 @@ static void launchAddDForce( T kFactor) { const int computeThreads = 64; - const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementCorotationalFEMForceField_computeDForce_kernel + int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementCorotationalFEMForceField_computeDForce_kernel <<>>( nbElem, (const int*)elements, @@ -581,158 +651,48 @@ static void launchAddDForce( kFactor); mycudaDebugError("ElementCorotationalFEMForceField_computeDForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); -} - -extern "C" -{ - -// ==================== float versions ==================== - -void ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* initRotTransposed, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - void* rotationsOut, - const void* velems) -{ - switch (nbNodesPerElem) - { - case 3: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - case 4: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - case 8: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - } -} - -void ElementCorotationalFEMForceFieldCuda3f_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* 
elements, - const void* rotations, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems) -{ - switch (nbNodesPerElem) - { - case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 4: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - } -} - -void ElementCorotationalFEMForceFieldCuda3f_addDForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* rotations, - const void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - float kFactor) -{ - switch (nbNodesPerElem) - { - case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - } -} - -// ==================== double versions ==================== - -void ElementCorotationalFEMForceFieldCuda3d_addForceWithRotations( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* initRotTransposed, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - void* 
rotationsOut, - const void* velems) -{ - switch (nbNodesPerElem) - { - case 3: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - case 4: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - case 8: launchAddForceWithRotations(nbElem, nbVertex, maxElemPerVertex, elements, initRotTransposed, stiffness, x, x0, f, eforce, rotationsOut, velems); break; - } -} - -void ElementCorotationalFEMForceFieldCuda3d_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* rotations, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems) -{ - switch (nbNodesPerElem) - { - case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 4: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, x, x0, f, eforce, velems); break; - } -} - -void ElementCorotationalFEMForceFieldCuda3d_addDForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* rotations, - const void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - double kFactor) -{ - switch (nbNodesPerElem) - { - case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, 
elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, rotations, stiffness, dx, df, eforce, velems, kFactor); break; - } + const int gatherThreads = 256; + numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementCorotationalFEMForceField_gatherForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const T*)eforce, + (T*)df); + mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); } -} // extern "C" +// ===================== Explicit template instantiations ===================== + +// addForceWithRotations: only NNodes >= 3 (triangle/quad/hex rotation methods) +template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); +template void 
ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); + +// addForce: all element types +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); + +// addDForce: all element types +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, 
void*, void*, const void*, float); +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, float); +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, float); +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, float); +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); +template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); -#if defined(__cplusplus) } // namespace cuda } // namespace gpu } // namespace sofa -#endif diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h index 7ec167dcae3..820b4c915a1 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h +++ 
b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h @@ -27,94 +27,48 @@ namespace sofa::gpu::cuda { -extern "C" -{ - void ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* initRotTransposed, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - void* rotationsOut, - const void* velems); - - void ElementCorotationalFEMForceFieldCuda3f_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* rotations, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems); - - void ElementCorotationalFEMForceFieldCuda3f_addDForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* rotations, - const void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - float kFactor); - - void ElementCorotationalFEMForceFieldCuda3d_addForceWithRotations( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* initRotTransposed, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - void* rotationsOut, - const void* velems); - - void ElementCorotationalFEMForceFieldCuda3d_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* rotations, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems); - - void ElementCorotationalFEMForceFieldCuda3d_addDForce( - unsigned int 
nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* rotations, - const void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - double kFactor); -} +template +void ElementCorotationalFEMForceFieldCuda_addForceWithRotations( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* initRotTransposed, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + void* rotationsOut, + const void* velems); + +template +void ElementCorotationalFEMForceFieldCuda_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems); + +template +void ElementCorotationalFEMForceFieldCuda_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* rotations, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + T kFactor); } // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index effb420ab61..2359244539e 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -51,7 +51,6 @@ void CudaElementCorotationalFEMForceField::uploadStiffne const auto& assembledMatrices = this->m_assembledStiffnessMatrices; const auto nbElem = 
elements.size(); - constexpr auto nDofs = trait::NumberOfDofsInElement; constexpr auto nNodes = trait::NumberOfNodesInElement; constexpr auto dim = trait::spatial_dimensions; @@ -253,6 +252,8 @@ void CudaElementCorotationalFEMForceField::addForce( } using trait = sofa::component::solidmechanics::fem::elastic::trait; + constexpr auto nNodes = trait::NumberOfNodesInElement; + constexpr auto dim = trait::spatial_dimensions; const VecCoord& x = d_x.getValue(); auto restPositionAccessor = this->mstate->readRestPositions(); @@ -266,56 +267,34 @@ void CudaElementCorotationalFEMForceField::addForce( if (f.size() < x.size()) f.resize(x.size()); - if (m_gpuRotationMethodSupported) + if constexpr (nNodes >= 3) { - // Fully GPU path: compute rotations + forces in one kernel - if constexpr (std::is_same_v) - { - gpu::cuda::ElementCorotationalFEMForceFieldCuda3d_addForceWithRotations( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuInitialRotationsTransposed.deviceRead(), - m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), - m_gpuRotations.deviceWrite(), m_gpuVelems.deviceRead()); - } - else + if (m_gpuRotationMethodSupported) { - gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForceWithRotations( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, + gpu::cuda::ElementCorotationalFEMForceFieldCuda_addForceWithRotations( + nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuInitialRotationsTransposed.deviceRead(), m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), f.deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuRotations.deviceWrite(), m_gpuVelems.deviceRead()); - } - m_gpuRotationsUploaded = true; - } - else - { - // CPU rotations + GPU forces - this->computeRotations(this->m_rotations, x, x0); - uploadRotations(); - - if constexpr (std::is_same_v) - { - 
gpu::cuda::ElementCorotationalFEMForceFieldCuda3d_addForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), - m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead()); - } - else - { - gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), - m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead()); + m_gpuRotationsUploaded = true; + d_f.endEdit(); + return; } } + // CPU rotations + GPU forces + this->computeRotations(this->m_rotations, x, x0); + uploadRotations(); + + gpu::cuda::ElementCorotationalFEMForceFieldCuda_addForce( + nbElem, nbVertex, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); + d_f.endEdit(); } @@ -336,6 +315,8 @@ void CudaElementCorotationalFEMForceField::addDForce( } using trait = sofa::component::solidmechanics::fem::elastic::trait; + constexpr auto nNodes = trait::NumberOfNodesInElement; + constexpr auto dim = trait::spatial_dimensions; VecDeriv& df = *d_df.beginEdit(); const VecDeriv& dx = d_dx.getValue(); @@ -351,24 +332,12 @@ void CudaElementCorotationalFEMForceField::addDForce( const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(dx.size()); - if constexpr (std::is_same_v) - { - gpu::cuda::ElementCorotationalFEMForceFieldCuda3d_addDForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), - m_gpuStiffness.deviceRead(), dx.deviceRead(), - df.deviceWrite(), 
m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead(), kFactor); - } - else - { - gpu::cuda::ElementCorotationalFEMForceFieldCuda3f_addDForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), - m_gpuStiffness.deviceRead(), dx.deviceRead(), - df.deviceWrite(), m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead(), kFactor); - } + gpu::cuda::ElementCorotationalFEMForceFieldCuda_addDForce( + nbElem, nbVertex, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), + m_gpuStiffness.deviceRead(), dx.deviceRead(), + df.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead(), kFactor); d_df.endEdit(); } diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu index 6752bd29af1..e8492615c32 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu @@ -23,23 +23,21 @@ #include #include -#if defined(__cplusplus) namespace sofa { namespace gpu { namespace cuda { -#endif /** * Kernel for addForce: Compute per-element force from displacement (1 thread per element). * * f = -K * (x - x0) - * Templated on NNodes (compile-time) for full loop unrolling. + * Templated on NNodes and Dim (compile-time) for full loop unrolling. * Templated on T for float/double support. 
*/ -template +template __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( int nbElem, const int* __restrict__ elements, @@ -54,22 +52,22 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( if (elemId >= nbElem) return; // Gather displacement = x - x0 for this element's nodes - T disp[NNodes * 3]; + T disp[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) { const int nodeId = elements[n * nbElem + elemId]; - disp[n * 3 + 0] = x[nodeId * 3 + 0] - x0[nodeId * 3 + 0]; - disp[n * 3 + 1] = x[nodeId * 3 + 1] - x0[nodeId * 3 + 1]; - disp[n * 3 + 2] = x[nodeId * 3 + 2] - x0[nodeId * 3 + 2]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + disp[n * Dim + d] = x[nodeId * Dim + d] - x0[nodeId * Dim + d]; } // Symmetric block-matrix multiply: edf = K * disp - const T* K = stiffness + elemId * NSymBlocks * 9; - T edf[NNodes * 3]; + const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; + T edf[NNodes * Dim]; #pragma unroll - for (int i = 0; i < NNodes * 3; ++i) + for (int i = 0; i < NNodes * Dim; ++i) edf[i] = T(0); #pragma unroll @@ -79,13 +77,16 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( // Diagonal block { - const T* Kii = K + diagIdx * 9; - const T di0 = disp[ni * 3 + 0]; - const T di1 = disp[ni * 3 + 1]; - const T di2 = disp[ni * 3 + 2]; - edf[ni * 3 + 0] += Kii[0] * di0 + Kii[1] * di1 + Kii[2] * di2; - edf[ni * 3 + 1] += Kii[3] * di0 + Kii[4] * di1 + Kii[5] * di2; - edf[ni * 3 + 2] += Kii[6] * di0 + Kii[7] * di1 + Kii[8] * di2; + const T* Kii = K + diagIdx * Dim * Dim; + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kii[di * Dim + dj] * disp[ni * Dim + dj]; + edf[ni * Dim + di] += sum; + } } // Off-diagonal blocks @@ -93,36 +94,40 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( for (int nj = ni + 1; nj < NNodes; ++nj) { const int symIdx = diagIdx + (nj - ni); 
- const T* Kij = K + symIdx * 9; + const T* Kij = K + symIdx * Dim * Dim; + // Kij * disp_j -> edf_i + #pragma unroll + for (int di = 0; di < Dim; ++di) { - const T dj0 = disp[nj * 3 + 0]; - const T dj1 = disp[nj * 3 + 1]; - const T dj2 = disp[nj * 3 + 2]; - edf[ni * 3 + 0] += Kij[0] * dj0 + Kij[1] * dj1 + Kij[2] * dj2; - edf[ni * 3 + 1] += Kij[3] * dj0 + Kij[4] * dj1 + Kij[5] * dj2; - edf[ni * 3 + 2] += Kij[6] * dj0 + Kij[7] * dj1 + Kij[8] * dj2; + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kij[di * Dim + dj] * disp[nj * Dim + dj]; + edf[ni * Dim + di] += sum; } + // Kij^T * disp_i -> edf_j + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) { - const T di0 = disp[ni * 3 + 0]; - const T di1 = disp[ni * 3 + 1]; - const T di2 = disp[ni * 3 + 2]; - edf[nj * 3 + 0] += Kij[0] * di0 + Kij[3] * di1 + Kij[6] * di2; - edf[nj * 3 + 1] += Kij[1] * di0 + Kij[4] * di1 + Kij[7] * di2; - edf[nj * 3 + 2] += Kij[2] * di0 + Kij[5] * di1 + Kij[8] * di2; + T sum = T(0); + #pragma unroll + for (int di = 0; di < Dim; ++di) + sum += Kij[di * Dim + dj] * disp[ni * Dim + di]; + edf[nj * Dim + dj] += sum; } } } // Write: eforce = -edf (minus sign from f -= K * displacement) - T* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) { - out[n * 3 + 0] = -edf[n * 3 + 0]; - out[n * 3 + 1] = -edf[n * 3 + 1]; - out[n * 3 + 2] = -edf[n * 3 + 2]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + out[n * Dim + d] = -edf[n * Dim + d]; } } @@ -131,7 +136,7 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( * * df = -kFactor * K * dx */ -template +template __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( int nbElem, const int* __restrict__ elements, @@ -146,22 +151,22 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( if (elemId >= nbElem) return; // Gather dx for this element's nodes - T edx[NNodes * 3]; + T 
edx[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) { const int nodeId = elements[n * nbElem + elemId]; - edx[n * 3 + 0] = dx[nodeId * 3 + 0]; - edx[n * 3 + 1] = dx[nodeId * 3 + 1]; - edx[n * 3 + 2] = dx[nodeId * 3 + 2]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + edx[n * Dim + d] = dx[nodeId * Dim + d]; } // Symmetric block-matrix multiply: edf = K * edx - const T* K = stiffness + elemId * NSymBlocks * 9; - T edf[NNodes * 3]; + const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; + T edf[NNodes * Dim]; #pragma unroll - for (int i = 0; i < NNodes * 3; ++i) + for (int i = 0; i < NNodes * Dim; ++i) edf[i] = T(0); #pragma unroll @@ -170,56 +175,61 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; { - const T* Kii = K + diagIdx * 9; - const T di0 = edx[ni * 3 + 0]; - const T di1 = edx[ni * 3 + 1]; - const T di2 = edx[ni * 3 + 2]; - edf[ni * 3 + 0] += Kii[0] * di0 + Kii[1] * di1 + Kii[2] * di2; - edf[ni * 3 + 1] += Kii[3] * di0 + Kii[4] * di1 + Kii[5] * di2; - edf[ni * 3 + 2] += Kii[6] * di0 + Kii[7] * di1 + Kii[8] * di2; + const T* Kii = K + diagIdx * Dim * Dim; + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kii[di * Dim + dj] * edx[ni * Dim + dj]; + edf[ni * Dim + di] += sum; + } } #pragma unroll for (int nj = ni + 1; nj < NNodes; ++nj) { const int symIdx = diagIdx + (nj - ni); - const T* Kij = K + symIdx * 9; + const T* Kij = K + symIdx * Dim * Dim; + #pragma unroll + for (int di = 0; di < Dim; ++di) { - const T dj0 = edx[nj * 3 + 0]; - const T dj1 = edx[nj * 3 + 1]; - const T dj2 = edx[nj * 3 + 2]; - edf[ni * 3 + 0] += Kij[0] * dj0 + Kij[1] * dj1 + Kij[2] * dj2; - edf[ni * 3 + 1] += Kij[3] * dj0 + Kij[4] * dj1 + Kij[5] * dj2; - edf[ni * 3 + 2] += Kij[6] * dj0 + Kij[7] * dj1 + Kij[8] * dj2; + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kij[di * 
Dim + dj] * edx[nj * Dim + dj]; + edf[ni * Dim + di] += sum; } + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) { - const T di0 = edx[ni * 3 + 0]; - const T di1 = edx[ni * 3 + 1]; - const T di2 = edx[ni * 3 + 2]; - edf[nj * 3 + 0] += Kij[0] * di0 + Kij[3] * di1 + Kij[6] * di2; - edf[nj * 3 + 1] += Kij[1] * di0 + Kij[4] * di1 + Kij[7] * di2; - edf[nj * 3 + 2] += Kij[2] * di0 + Kij[5] * di1 + Kij[8] * di2; + T sum = T(0); + #pragma unroll + for (int di = 0; di < Dim; ++di) + sum += Kij[di * Dim + dj] * edx[ni * Dim + di]; + edf[nj * Dim + dj] += sum; } } } // Write: eforce = -kFactor * edf - T* out = eforce + elemId * NNodes * 3; + T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) { - out[n * 3 + 0] = -kFactor * edf[n * 3 + 0]; - out[n * 3 + 1] = -kFactor * edf[n * 3 + 1]; - out[n * 3 + 2] = -kFactor * edf[n * 3 + 2]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + out[n * Dim + d] = -kFactor * edf[n * Dim + d]; } } /** * Gather per-vertex forces (1 thread per vertex). 
*/ -template +template __global__ void ElementLinearSmallStrainFEMForceField_gatherForce_kernel( int nbVertex, int maxElemPerVertex, @@ -230,45 +240,28 @@ __global__ void ElementLinearSmallStrainFEMForceField_gatherForce_kernel( const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; if (vertexId >= nbVertex) return; - T fx = T(0), fy = T(0), fz = T(0); + T acc[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + acc[d] = T(0); for (int s = 0; s < maxElemPerVertex; ++s) { const int idx = velems[s * nbVertex + vertexId]; if (idx == 0) break; - const int base = (idx - 1) * 3; - fx += eforce[base + 0]; - fy += eforce[base + 1]; - fz += eforce[base + 2]; + const int base = (idx - 1) * Dim; + #pragma unroll + for (int d = 0; d < Dim; ++d) + acc[d] += eforce[base + d]; } - df[vertexId * 3 + 0] += fx; - df[vertexId * 3 + 1] += fy; - df[vertexId * 3 + 2] += fz; -} - -template -static void launchGather( - unsigned int nbVertex, - unsigned int maxElemPerVertex, - const void* velems, - const void* eforce, - void* f) -{ - const int gatherThreads = 256; - const int numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementLinearSmallStrainFEMForceField_gatherForce_kernel - <<>>( - nbVertex, - maxElemPerVertex, - (const int*)velems, - (const T*)eforce, - (T*)f); - mycudaDebugError("ElementLinearSmallStrainFEMForceField_gatherForce_kernel"); + #pragma unroll + for (int d = 0; d < Dim; ++d) + df[vertexId * Dim + d] += acc[d]; } -template -static void launchAddForce( +template +void ElementLinearSmallStrainFEMForceFieldCuda_addForce( unsigned int nbElem, unsigned int nbVertex, unsigned int maxElemPerVertex, @@ -281,8 +274,8 @@ static void launchAddForce( const void* velems) { const int computeThreads = 64; - const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementLinearSmallStrainFEMForceField_computeForce_kernel + int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementLinearSmallStrainFEMForceField_computeForce_kernel 
<<>>( nbElem, (const int*)elements, @@ -292,11 +285,20 @@ static void launchAddForce( (T*)eforce); mycudaDebugError("ElementLinearSmallStrainFEMForceField_computeForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, f); + const int gatherThreads = 256; + numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementLinearSmallStrainFEMForceField_gatherForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const T*)eforce, + (T*)f); + mycudaDebugError("ElementLinearSmallStrainFEMForceField_gatherForce_kernel"); } -template -static void launchAddDForce( +template +void ElementLinearSmallStrainFEMForceFieldCuda_addDForce( unsigned int nbElem, unsigned int nbVertex, unsigned int maxElemPerVertex, @@ -309,8 +311,8 @@ static void launchAddDForce( T kFactor) { const int computeThreads = 64; - const int numBlocks = (nbElem + computeThreads - 1) / computeThreads; - ElementLinearSmallStrainFEMForceField_computeDForce_kernel + int numBlocks = (nbElem + computeThreads - 1) / computeThreads; + ElementLinearSmallStrainFEMForceField_computeDForce_kernel <<>>( nbElem, (const int*)elements, @@ -320,104 +322,37 @@ static void launchAddDForce( kFactor); mycudaDebugError("ElementLinearSmallStrainFEMForceField_computeDForce_kernel"); - launchGather(nbVertex, maxElemPerVertex, velems, eforce, df); -} - -extern "C" -{ - -void ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems) -{ - switch (nbNodesPerElem) - { - case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 4: launchAddForce(nbElem, nbVertex, 
maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - } -} - -void ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - float kFactor) -{ - switch (nbNodesPerElem) - { - case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - } -} - -void ElementLinearSmallStrainFEMForceFieldCuda3d_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems) -{ - switch (nbNodesPerElem) - { - case 2: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 3: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 4: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - case 8: launchAddForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, x, x0, f, eforce, velems); break; - } -} - -void ElementLinearSmallStrainFEMForceFieldCuda3d_addDForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - 
unsigned int maxElemPerVertex, - const void* elements, - const void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - double kFactor) -{ - switch (nbNodesPerElem) - { - case 2: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 3: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 4: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - case 8: launchAddDForce(nbElem, nbVertex, maxElemPerVertex, elements, stiffness, dx, df, eforce, velems, kFactor); break; - } + const int gatherThreads = 256; + numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; + ElementLinearSmallStrainFEMForceField_gatherForce_kernel + <<>>( + nbVertex, + maxElemPerVertex, + (const int*)velems, + (const T*)eforce, + (T*)df); + mycudaDebugError("ElementLinearSmallStrainFEMForceField_gatherForce_kernel"); } -} // extern "C" +// Explicit template instantiations for all supported (T, NNodes, Dim) combinations +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const 
void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); +template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); + +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); +template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned 
int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); -#if defined(__cplusplus) } // namespace cuda } // namespace gpu } // namespace sofa -#endif diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h index 777d3301ee2..45d119846e8 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h @@ -27,60 +27,31 @@ namespace sofa::gpu::cuda { -extern "C" -{ - void ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems); - - void ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - float kFactor); - - void ElementLinearSmallStrainFEMForceFieldCuda3d_addForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const void* stiffness, - const void* x, - const void* x0, - void* f, - void* eforce, - const void* velems); - - void ElementLinearSmallStrainFEMForceFieldCuda3d_addDForce( - unsigned int nbElem, - unsigned int nbVertex, - unsigned int nbNodesPerElem, - unsigned int maxElemPerVertex, - const void* elements, - const 
void* stiffness, - const void* dx, - void* df, - void* eforce, - const void* velems, - double kFactor); -} +template +void ElementLinearSmallStrainFEMForceFieldCuda_addForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* x, + const void* x0, + void* f, + void* eforce, + const void* velems); + +template +void ElementLinearSmallStrainFEMForceFieldCuda_addDForce( + unsigned int nbElem, + unsigned int nbVertex, + unsigned int maxElemPerVertex, + const void* elements, + const void* stiffness, + const void* dx, + void* df, + void* eforce, + const void* velems, + T kFactor); } // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl index 863511e951d..f27c06b92e7 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl @@ -161,6 +161,8 @@ void CudaElementLinearSmallStrainFEMForceField::addForce } using trait = sofa::component::solidmechanics::fem::elastic::trait; + constexpr auto nNodes = trait::NumberOfNodesInElement; + constexpr auto dim = trait::spatial_dimensions; VecDeriv& f = *d_f.beginEdit(); const VecCoord& x = d_x.getValue(); @@ -175,24 +177,12 @@ void CudaElementLinearSmallStrainFEMForceField::addForce const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(x.size()); - if constexpr (std::is_same_v) - { - gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3d_addForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - 
m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), - x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead()); - } - else - { - gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), - x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), - m_gpuVelems.deviceRead()); - } + gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda_addForce( + nbElem, nbVertex, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), + x.deviceRead(), x0.deviceRead(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), + m_gpuVelems.deviceRead()); d_f.endEdit(); } @@ -214,6 +204,8 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc } using trait = sofa::component::solidmechanics::fem::elastic::trait; + constexpr auto nNodes = trait::NumberOfNodesInElement; + constexpr auto dim = trait::spatial_dimensions; VecDeriv& df = *d_df.beginEdit(); const VecDeriv& dx = d_dx.getValue(); @@ -229,24 +221,12 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(dx.size()); - if constexpr (std::is_same_v) - { - gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3d_addDForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), - dx.deviceRead(), df.deviceWrite(), - m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), - kFactor); - } - else - { - gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda3f_addDForce( - nbElem, nbVertex, trait::NumberOfNodesInElement, m_maxElemPerVertex, - m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), - dx.deviceRead(), df.deviceWrite(), - m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), - kFactor); - } + 
gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda_addDForce( + nbElem, nbVertex, m_maxElemPerVertex, + m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), + dx.deviceRead(), df.deviceWrite(), + m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), + kFactor); d_df.endEdit(); } From abb676c1b490dd923ac80e64d6d7f1229b7b6cea Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Thu, 9 Apr 2026 08:10:17 +0900 Subject: [PATCH 17/21] dont compile double version if SOFA_GPU_CUDA_DOUBLE is not enabled --- .../fem/elastic/CudaElementCorotationalFEMForceField.cpp | 4 ++++ .../fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp index 55a46c00669..5cd43daa6d2 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cpp @@ -36,11 +36,13 @@ template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +#ifdef SOFA_GPU_CUDA_DOUBLE template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementCorotationalFEMForceField; +#endif } // namespace sofa::component::solidmechanics::fem::elastic @@ -72,6 +74,7 @@ void 
registerElementCorotationalFEMForceField(sofa::core::ObjectFactory* factory .add< CudaElementCorotationalFEMForceField >() ); +#ifdef SOFA_GPU_CUDA_DOUBLE factory->registerObjects(sofa::core::ObjectRegistrationData( "Supports GPU-side computations using CUDA (double) for EdgeCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() @@ -92,6 +95,7 @@ void registerElementCorotationalFEMForceField(sofa::core::ObjectFactory* factory "Supports GPU-side computations using CUDA (double) for HexahedronCorotationalFEMForceField") .add< CudaElementCorotationalFEMForceField >() ); +#endif } } // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp index b46f90d06d4..d8d3b9ef1c3 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cpp @@ -36,11 +36,13 @@ template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField< template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +#ifdef SOFA_GPU_CUDA_DOUBLE template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; template class SOFACUDA_COMPONENT_API CudaElementLinearSmallStrainFEMForceField; +#endif } // namespace 
sofa::component::solidmechanics::fem::elastic @@ -72,6 +74,7 @@ void registerElementLinearSmallStrainFEMForceField(sofa::core::ObjectFactory* fa .add< CudaElementLinearSmallStrainFEMForceField >() ); +#ifdef SOFA_GPU_CUDA_DOUBLE factory->registerObjects(sofa::core::ObjectRegistrationData( "Supports GPU-side computations using CUDA (double) for EdgeLinearSmallStrainFEMForceField") .add< CudaElementLinearSmallStrainFEMForceField >() @@ -92,6 +95,7 @@ void registerElementLinearSmallStrainFEMForceField(sofa::core::ObjectFactory* fa "Supports GPU-side computations using CUDA (double) for HexahedronLinearSmallStrainFEMForceField") .add< CudaElementLinearSmallStrainFEMForceField >() ); +#endif } } // namespace sofa::gpu::cuda From e41d0b69300580da430bb878199069d7b845bb71 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Thu, 9 Apr 2026 08:17:32 +0900 Subject: [PATCH 18/21] use w accessors --- .../CudaElementCorotationalFEMForceField.inl | 15 +++++---------- .../CudaElementLinearSmallStrainFEMForceField.inl | 12 ++++-------- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index 2359244539e..b6da2b45a2d 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -263,7 +263,7 @@ void CudaElementCorotationalFEMForceField::addForce( const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(x.size()); - VecDeriv& f = *d_f.beginEdit(); + auto f = sofa::helper::getWriteOnlyAccessor(d_f); if (f.size() < x.size()) f.resize(x.size()); @@ -275,11 +275,10 @@ void 
CudaElementCorotationalFEMForceField::addForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuInitialRotationsTransposed.deviceRead(), m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), + f.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuRotations.deviceWrite(), m_gpuVelems.deviceRead()); m_gpuRotationsUploaded = true; - d_f.endEdit(); return; } } @@ -292,10 +291,8 @@ void CudaElementCorotationalFEMForceField::addForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), + f.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead()); - - d_f.endEdit(); } template @@ -318,7 +315,7 @@ void CudaElementCorotationalFEMForceField::addDForce( constexpr auto nNodes = trait::NumberOfNodesInElement; constexpr auto dim = trait::spatial_dimensions; - VecDeriv& df = *d_df.beginEdit(); + auto df = sofa::helper::getWriteOnlyAccessor(d_df); const VecDeriv& dx = d_dx.getValue(); if (df.size() < dx.size()) @@ -336,10 +333,8 @@ void CudaElementCorotationalFEMForceField::addDForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), m_gpuStiffness.deviceRead(), dx.deviceRead(), - df.deviceWrite(), m_gpuElementForce.deviceWrite(), + df.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), kFactor); - - d_df.endEdit(); } } // namespace sofa::component::solidmechanics::fem::elastic diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl index f27c06b92e7..97b6066aa4c 100644 --- 
a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl @@ -164,7 +164,7 @@ void CudaElementLinearSmallStrainFEMForceField::addForce constexpr auto nNodes = trait::NumberOfNodesInElement; constexpr auto dim = trait::spatial_dimensions; - VecDeriv& f = *d_f.beginEdit(); + auto f = sofa::helper::getWriteOnlyAccessor(d_f); const VecCoord& x = d_x.getValue(); if (f.size() < x.size()) @@ -181,10 +181,8 @@ void CudaElementLinearSmallStrainFEMForceField::addForce nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.deviceWrite(), m_gpuElementForce.deviceWrite(), + f.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead()); - - d_f.endEdit(); } template @@ -207,7 +205,7 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc constexpr auto nNodes = trait::NumberOfNodesInElement; constexpr auto dim = trait::spatial_dimensions; - VecDeriv& df = *d_df.beginEdit(); + auto df = sofa::helper::getWriteOnlyAccessor(d_df); const VecDeriv& dx = d_dx.getValue(); if (df.size() < dx.size()) @@ -224,11 +222,9 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda_addDForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), - dx.deviceRead(), df.deviceWrite(), + dx.deviceRead(), df.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), kFactor); - - d_df.endEdit(); } } // namespace sofa::component::solidmechanics::fem::elastic From 05c54101ae277a4c7801c03541d29098303ae40f Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Fri, 10 Apr 2026 11:16:48 +0900 Subject: [PATCH 19/21] Revert "use w accessors" This reverts commit 
e41d0b69300580da430bb878199069d7b845bb71. --- .../CudaElementCorotationalFEMForceField.inl | 15 ++++++++++----- .../CudaElementLinearSmallStrainFEMForceField.inl | 12 ++++++++---- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl index b6da2b45a2d..2359244539e 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl @@ -263,7 +263,7 @@ void CudaElementCorotationalFEMForceField::addForce( const auto nbElem = static_cast(elements.size()); const auto nbVertex = static_cast(x.size()); - auto f = sofa::helper::getWriteOnlyAccessor(d_f); + VecDeriv& f = *d_f.beginEdit(); if (f.size() < x.size()) f.resize(x.size()); @@ -275,10 +275,11 @@ void CudaElementCorotationalFEMForceField::addForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuInitialRotationsTransposed.deviceRead(), m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuRotations.deviceWrite(), m_gpuVelems.deviceRead()); m_gpuRotationsUploaded = true; + d_f.endEdit(); return; } } @@ -291,8 +292,10 @@ void CudaElementCorotationalFEMForceField::addForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), + f.deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead()); + + d_f.endEdit(); } template @@ 
-315,7 +318,7 @@ void CudaElementCorotationalFEMForceField::addDForce( constexpr auto nNodes = trait::NumberOfNodesInElement; constexpr auto dim = trait::spatial_dimensions; - auto df = sofa::helper::getWriteOnlyAccessor(d_df); + VecDeriv& df = *d_df.beginEdit(); const VecDeriv& dx = d_dx.getValue(); if (df.size() < dx.size()) @@ -333,8 +336,10 @@ void CudaElementCorotationalFEMForceField::addDForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuRotations.deviceRead(), m_gpuStiffness.deviceRead(), dx.deviceRead(), - df.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), + df.deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), kFactor); + + d_df.endEdit(); } } // namespace sofa::component::solidmechanics::fem::elastic diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl index 97b6066aa4c..f27c06b92e7 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.inl @@ -164,7 +164,7 @@ void CudaElementLinearSmallStrainFEMForceField::addForce constexpr auto nNodes = trait::NumberOfNodesInElement; constexpr auto dim = trait::spatial_dimensions; - auto f = sofa::helper::getWriteOnlyAccessor(d_f); + VecDeriv& f = *d_f.beginEdit(); const VecCoord& x = d_x.getValue(); if (f.size() < x.size()) @@ -181,8 +181,10 @@ void CudaElementLinearSmallStrainFEMForceField::addForce nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), x.deviceRead(), x0.deviceRead(), - f.wref().deviceWrite(), m_gpuElementForce.deviceWrite(), + f.deviceWrite(), 
m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead()); + + d_f.endEdit(); } template @@ -205,7 +207,7 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc constexpr auto nNodes = trait::NumberOfNodesInElement; constexpr auto dim = trait::spatial_dimensions; - auto df = sofa::helper::getWriteOnlyAccessor(d_df); + VecDeriv& df = *d_df.beginEdit(); const VecDeriv& dx = d_dx.getValue(); if (df.size() < dx.size()) @@ -222,9 +224,11 @@ void CudaElementLinearSmallStrainFEMForceField::addDForc gpu::cuda::ElementLinearSmallStrainFEMForceFieldCuda_addDForce( nbElem, nbVertex, m_maxElemPerVertex, m_gpuElements.deviceRead(), m_gpuStiffness.deviceRead(), - dx.deviceRead(), df.wref().deviceWrite(), + dx.deviceRead(), df.deviceWrite(), m_gpuElementForce.deviceWrite(), m_gpuVelems.deviceRead(), kFactor); + + d_df.endEdit(); } } // namespace sofa::component::solidmechanics::fem::elastic From c6721d4721a0adf7c789c3ad3d872061a31ccc39 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Fri, 10 Apr 2026 11:20:37 +0900 Subject: [PATCH 20/21] refactor cuda code --- .../plugins/SofaCUDA/Component/CMakeLists.txt | 1 + .../CudaElementCorotationalFEMForceField.cu | 322 +++--------------- .../fem/elastic/CudaElementFEMKernelUtils.cuh | 254 ++++++++++++++ ...daElementLinearSmallStrainFEMForceField.cu | 213 ++---------- 4 files changed, 331 insertions(+), 459 deletions(-) create mode 100644 applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh diff --git a/applications/plugins/SofaCUDA/Component/CMakeLists.txt b/applications/plugins/SofaCUDA/Component/CMakeLists.txt index 5ac492c4834..ce4d885c90b 100644 --- a/applications/plugins/SofaCUDA/Component/CMakeLists.txt +++ b/applications/plugins/SofaCUDA/Component/CMakeLists.txt @@ -39,6 +39,7 @@ set(HEADER_FILES ### solidmechanics + ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh 
${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.h ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.inl ${SOFACUDA_COMPONENT_SOURCE_DIR}/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.h diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index 8cb90f8c540..3125446e0a6 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -21,210 +21,13 @@ ******************************************************************************/ #include #include -#include +#include "CudaElementFEMKernelUtils.cuh" -namespace sofa +namespace sofa::gpu::cuda { -namespace gpu -{ -namespace cuda -{ - -template -__device__ T myRsqrt(T x); -template<> __device__ float myRsqrt(float x) { return rsqrtf(x); } -template<> __device__ double myRsqrt(double x) { return rsqrt(x); } - -/** - * Device helper: 3x3 matrix multiply C = A * B (row-major) - */ -template -__device__ void mat3Mul(const T* A, const T* B, T* C) -{ - #pragma unroll - for (int i = 0; i < 3; ++i) - { - #pragma unroll - for (int j = 0; j < 3; ++j) - { - C[i * 3 + j] = A[i * 3 + 0] * B[0 * 3 + j] - + A[i * 3 + 1] * B[1 * 3 + j] - + A[i * 3 + 2] * B[2 * 3 + j]; - } - } -} - -/** - * Device helper: C = A * B^T (row-major) - */ -template -__device__ void mat3MulTranspose(const T* A, const T* BT, T* C) -{ - #pragma unroll - for (int i = 0; i < 3; ++i) - { - #pragma unroll - for (int j = 0; j < 3; ++j) - { - C[i * 3 + j] = A[i * 3 
+ 0] * BT[j * 3 + 0] - + A[i * 3 + 1] * BT[j * 3 + 1] - + A[i * 3 + 2] * BT[j * 3 + 2]; - } - } -} - -/** - * Device helper: C = A^T * B (row-major) - * Matches SOFA's Mat::multTranspose(B) which computes this^T * B. - */ -template -__device__ void mat3TransposeMul(const T* A, const T* B, T* C) -{ - #pragma unroll - for (int i = 0; i < 3; ++i) - { - #pragma unroll - for (int j = 0; j < 3; ++j) - { - C[i * 3 + j] = A[0 * 3 + i] * B[0 * 3 + j] - + A[1 * 3 + i] * B[1 * 3 + j] - + A[2 * 3 + i] * B[2 * 3 + j]; - } - } -} - -/** - * Device helper: compute rotation frame from first 3 nodes (TriangleRotation). - */ -template -__device__ void computeTriangleFrame(const T* ex, T* frame) -{ - T ax = ex[3] - ex[0], ay = ex[4] - ex[1], az = ex[5] - ex[2]; - T invLen = myRsqrt(ax * ax + ay * ay + az * az); - ax *= invLen; ay *= invLen; az *= invLen; - - T bx = ex[6] - ex[0], by = ex[7] - ex[1], bz = ex[8] - ex[2]; - - T cx = ay * bz - az * by; - T cy = az * bx - ax * bz; - T cz = ax * by - ay * bx; - invLen = myRsqrt(cx * cx + cy * cy + cz * cz); - cx *= invLen; cy *= invLen; cz *= invLen; - - bx = cy * az - cz * ay; - by = cz * ax - cx * az; - bz = cx * ay - cy * ax; - - frame[0] = ax; frame[1] = ay; frame[2] = az; - frame[3] = bx; frame[4] = by; frame[5] = bz; - frame[6] = cx; frame[7] = cy; frame[8] = cz; -} - -/** - * Device helper: compute rotation frame from 8 hexahedron nodes (HexahedronRotation). 
- */ -template -__device__ void computeHexahedronFrame(const T* ex, T* frame) -{ - const T quarter = T(0.25); - - T ax = ((ex[1*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[3*3+0]) - + (ex[5*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[7*3+0])) * quarter; - T ay = ((ex[1*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[3*3+1]) - + (ex[5*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[7*3+1])) * quarter; - T az = ((ex[1*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[3*3+2]) - + (ex[5*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[7*3+2])) * quarter; - - T bx = ((ex[3*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[1*3+0]) - + (ex[7*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[5*3+0])) * quarter; - T by = ((ex[3*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[1*3+1]) - + (ex[7*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[5*3+1])) * quarter; - T bz = ((ex[3*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[1*3+2]) - + (ex[7*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[5*3+2])) * quarter; - - T invLen = myRsqrt(ax * ax + ay * ay + az * az); - ax *= invLen; ay *= invLen; az *= invLen; - - T cx = ay * bz - az * by; - T cy = az * bx - ax * bz; - T cz = ax * by - ay * bx; - invLen = myRsqrt(cx * cx + cy * cy + cz * cz); - cx *= invLen; cy *= invLen; cz *= invLen; - - bx = cy * az - cz * ay; - by = cz * ax - cx * az; - bz = cx * ay - cy * ax; - - frame[0] = ax; frame[1] = ay; frame[2] = az; - frame[3] = bx; frame[4] = by; frame[5] = bz; - frame[6] = cx; frame[7] = cy; frame[8] = cz; -} - -/** - * Symmetric block-matrix multiply: out = K * in - * Templated on Dim for generic spatial dimensions. 
- */ -template -__device__ void symBlockMatMul(const T* K, const T* in, T* out) -{ - #pragma unroll - for (int i = 0; i < NNodes * Dim; ++i) - out[i] = T(0); - - #pragma unroll - for (int ni = 0; ni < NNodes; ++ni) - { - const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; - - // Diagonal block - { - const T* Kii = K + diagIdx * Dim * Dim; - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += Kii[di * Dim + dj] * in[ni * Dim + dj]; - out[ni * Dim + di] += sum; - } - } - - // Off-diagonal blocks - #pragma unroll - for (int nj = ni + 1; nj < NNodes; ++nj) - { - const int symIdx = diagIdx + (nj - ni); - const T* Kij = K + symIdx * Dim * Dim; - - // Kij * in_j -> out_i - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += Kij[di * Dim + dj] * in[nj * Dim + dj]; - out[ni * Dim + di] += sum; - } - - // Kij^T * in_i -> out_j - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - { - T sum = T(0); - #pragma unroll - for (int di = 0; di < Dim; ++di) - sum += Kij[di * Dim + dj] * in[ni * Dim + di]; - out[nj * Dim + dj] += sum; - } - } - } -} /** * Combined kernel: compute rotations AND per-element forces in one pass. - * Rotation computation is inherently 3D (cross products). 
*/ template __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel( @@ -263,7 +66,6 @@ __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel else computeTriangleFrame(ex, frame); - // R = frame^T * initRot const T* irt = initRotTransposed + elemId * Dim * Dim; T R[Dim * Dim]; mat3TransposeMul(frame, irt, R); @@ -301,7 +103,6 @@ __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel #pragma unroll for (int n = 0; n < NNodes; ++n) { - // R^T * (x_n - center) T diff[Dim]; #pragma unroll for (int d = 0; d < Dim; ++d) @@ -326,7 +127,6 @@ __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel #pragma unroll for (int n = 0; n < NNodes; ++n) { - // R * edf_n, negated #pragma unroll for (int di = 0; di < Dim; ++di) { @@ -340,7 +140,7 @@ __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel } /** - * Kernel for addForce: Compute per-element force (1 thread per element). + * Kernel for addForce with pre-computed rotations. */ template __global__ void ElementCorotationalFEMForceField_computeForce_kernel( @@ -442,7 +242,7 @@ __global__ void ElementCorotationalFEMForceField_computeForce_kernel( } /** - * Kernel for addDForce: Compute per-element dForce (1 thread per element). + * Kernel for addDForce. 
*/ template __global__ void ElementCorotationalFEMForceField_computeDForce_kernel( @@ -465,7 +265,6 @@ __global__ void ElementCorotationalFEMForceField_computeDForce_kernel( for (int i = 0; i < Dim * Dim; ++i) R[i] = Rptr[i]; - // R^T * dx for each node T rdx[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) @@ -491,7 +290,6 @@ __global__ void ElementCorotationalFEMForceField_computeDForce_kernel( T edf[NNodes * Dim]; symBlockMatMul(K, rdx, edf); - // R * edf, scaled by -kFactor T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) @@ -508,41 +306,7 @@ __global__ void ElementCorotationalFEMForceField_computeDForce_kernel( } } -/** - * Gather per-vertex forces (1 thread per vertex). - */ -template -__global__ void ElementCorotationalFEMForceField_gatherForce_kernel( - int nbVertex, - int maxElemPerVertex, - const int* __restrict__ velems, - const T* __restrict__ eforce, - T* df) -{ - const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; - if (vertexId >= nbVertex) return; - - T acc[Dim]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - acc[d] = T(0); - - for (int s = 0; s < maxElemPerVertex; ++s) - { - const int idx = velems[s * nbVertex + vertexId]; - if (idx == 0) break; - const int base = (idx - 1) * Dim; - #pragma unroll - for (int d = 0; d < Dim; ++d) - acc[d] += eforce[base + d]; - } - - #pragma unroll - for (int d = 0; d < Dim; ++d) - df[vertexId * Dim + d] += acc[d]; -} - -// ===================== Launch functions (C++ templates) ===================== +// ===================== Launch functions ===================== template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations( @@ -575,14 +339,14 @@ void ElementCorotationalFEMForceFieldCuda_addForceWithRotations( const int gatherThreads = 256; numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementCorotationalFEMForceField_gatherForce_kernel + ElementFEM_gatherForce_kernel <<>>( nbVertex, maxElemPerVertex, (const int*)velems, 
(const T*)eforce, (T*)f); - mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); + mycudaDebugError("ElementFEM_gatherForce_kernel"); } template @@ -614,14 +378,14 @@ void ElementCorotationalFEMForceFieldCuda_addForce( const int gatherThreads = 256; numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementCorotationalFEMForceField_gatherForce_kernel + ElementFEM_gatherForce_kernel <<>>( nbVertex, maxElemPerVertex, (const int*)velems, (const T*)eforce, (T*)f); - mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); + mycudaDebugError("ElementFEM_gatherForce_kernel"); } template @@ -653,46 +417,48 @@ void ElementCorotationalFEMForceFieldCuda_addDForce( const int gatherThreads = 256; numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementCorotationalFEMForceField_gatherForce_kernel + ElementFEM_gatherForce_kernel <<>>( nbVertex, maxElemPerVertex, (const int*)velems, (const T*)eforce, (T*)df); - mycudaDebugError("ElementCorotationalFEMForceField_gatherForce_kernel"); + mycudaDebugError("ElementFEM_gatherForce_kernel"); } // ===================== Explicit template instantiations ===================== -// addForceWithRotations: only NNodes >= 3 (triangle/quad/hex rotation methods) -template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, 
unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, void*, const void*); - -// addForce: all element types -template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void 
ElementCorotationalFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, const void*, void*, void*, const void*); - -// addDForce: all element types -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); -template void ElementCorotationalFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*, double); - -} // namespace cuda -} // namespace gpu -} // namespace sofa +#define INSTANTIATE_COROTATIONAL(T, NNodes) \ + template void ElementCorotationalFEMForceFieldCuda_addForce( \ + unsigned int, unsigned int, unsigned int, const void*, const void*, \ + 
const void*, const void*, const void*, void*, void*, const void*); \ + template void ElementCorotationalFEMForceFieldCuda_addDForce( \ + unsigned int, unsigned int, unsigned int, const void*, const void*, \ + const void*, const void*, void*, void*, const void*, T); + +#define INSTANTIATE_COROTATIONAL_WITH_ROTATIONS(T, NNodes) \ + template void ElementCorotationalFEMForceFieldCuda_addForceWithRotations( \ + unsigned int, unsigned int, unsigned int, const void*, const void*, \ + const void*, const void*, const void*, void*, void*, void*, const void*); + +INSTANTIATE_COROTATIONAL(float, 2) +INSTANTIATE_COROTATIONAL(float, 3) +INSTANTIATE_COROTATIONAL(float, 4) +INSTANTIATE_COROTATIONAL(float, 8) +INSTANTIATE_COROTATIONAL_WITH_ROTATIONS(float, 3) +INSTANTIATE_COROTATIONAL_WITH_ROTATIONS(float, 4) +INSTANTIATE_COROTATIONAL_WITH_ROTATIONS(float, 8) + +INSTANTIATE_COROTATIONAL(double, 2) +INSTANTIATE_COROTATIONAL(double, 3) +INSTANTIATE_COROTATIONAL(double, 4) +INSTANTIATE_COROTATIONAL(double, 8) +INSTANTIATE_COROTATIONAL_WITH_ROTATIONS(double, 3) +INSTANTIATE_COROTATIONAL_WITH_ROTATIONS(double, 4) +INSTANTIATE_COROTATIONAL_WITH_ROTATIONS(double, 8) + +#undef INSTANTIATE_COROTATIONAL +#undef INSTANTIATE_COROTATIONAL_WITH_ROTATIONS + +} // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh new file mode 100644 index 00000000000..605e7773baa --- /dev/null +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh @@ -0,0 +1,254 @@ +/****************************************************************************** +* SOFA, Simulation Open-Framework Architecture * +* (c) 2006 INRIA, USTL, UJF, CNRS, MGH * +* * +* This program is free software; you can redistribute it and/or modify it * 
+* under the terms of the GNU Lesser General Public License as published by * +* the Free Software Foundation; either version 2.1 of the License, or (at * +* your option) any later version. * +* * +* This program is distributed in the hope that it will be useful, but WITHOUT * +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License * +* for more details. * +* * +* You should have received a copy of the GNU Lesser General Public License * +* along with this program. If not, see . * +******************************************************************************* +* Authors: The SOFA Team and external contributors (see Authors.txt) * +* * +* Contact information: contact@sofa-framework.org * +******************************************************************************/ +#pragma once + +#include + +namespace sofa::gpu::cuda +{ + +//============================================================================= +// Math utilities +//============================================================================= + +template +__device__ inline T myRsqrt(T x); + +template<> +__device__ inline float myRsqrt(float x) { return rsqrtf(x); } + +template<> +__device__ inline double myRsqrt(double x) { return rsqrt(x); } + +//============================================================================= +// 3x3 Matrix operations (row-major) +//============================================================================= + +template +__device__ inline void mat3Mul(const T* A, const T* B, T* C) +{ + #pragma unroll + for (int i = 0; i < 3; ++i) + { + #pragma unroll + for (int j = 0; j < 3; ++j) + { + C[i * 3 + j] = A[i * 3 + 0] * B[0 * 3 + j] + + A[i * 3 + 1] * B[1 * 3 + j] + + A[i * 3 + 2] * B[2 * 3 + j]; + } + } +} + +template +__device__ inline void mat3MulTranspose(const T* A, const T* BT, T* C) +{ + #pragma unroll + for (int i = 0; i < 3; ++i) + { + #pragma unroll + for (int j = 0; j < 
3; ++j) + { + C[i * 3 + j] = A[i * 3 + 0] * BT[j * 3 + 0] + + A[i * 3 + 1] * BT[j * 3 + 1] + + A[i * 3 + 2] * BT[j * 3 + 2]; + } + } +} + +template +__device__ inline void mat3TransposeMul(const T* A, const T* B, T* C) +{ + #pragma unroll + for (int i = 0; i < 3; ++i) + { + #pragma unroll + for (int j = 0; j < 3; ++j) + { + C[i * 3 + j] = A[0 * 3 + i] * B[0 * 3 + j] + + A[1 * 3 + i] * B[1 * 3 + j] + + A[2 * 3 + i] * B[2 * 3 + j]; + } + } +} + +//============================================================================= +// Rotation frame computation +//============================================================================= + +template +__device__ inline void computeTriangleFrame(const T* ex, T* frame) +{ + T ax = ex[3] - ex[0], ay = ex[4] - ex[1], az = ex[5] - ex[2]; + T invLen = myRsqrt(ax * ax + ay * ay + az * az); + ax *= invLen; ay *= invLen; az *= invLen; + + T bx = ex[6] - ex[0], by = ex[7] - ex[1], bz = ex[8] - ex[2]; + + T cx = ay * bz - az * by; + T cy = az * bx - ax * bz; + T cz = ax * by - ay * bx; + invLen = myRsqrt(cx * cx + cy * cy + cz * cz); + cx *= invLen; cy *= invLen; cz *= invLen; + + bx = cy * az - cz * ay; + by = cz * ax - cx * az; + bz = cx * ay - cy * ax; + + frame[0] = ax; frame[1] = ay; frame[2] = az; + frame[3] = bx; frame[4] = by; frame[5] = bz; + frame[6] = cx; frame[7] = cy; frame[8] = cz; +} + +template +__device__ inline void computeHexahedronFrame(const T* ex, T* frame) +{ + const T quarter = T(0.25); + + T ax = ((ex[1*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[3*3+0]) + + (ex[5*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[7*3+0])) * quarter; + T ay = ((ex[1*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[3*3+1]) + + (ex[5*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[7*3+1])) * quarter; + T az = ((ex[1*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[3*3+2]) + + (ex[5*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[7*3+2])) * quarter; + + T bx = ((ex[3*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[1*3+0]) + + (ex[7*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[5*3+0])) * quarter; + T by = ((ex[3*3+1] - 
ex[0*3+1]) + (ex[2*3+1] - ex[1*3+1]) + + (ex[7*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[5*3+1])) * quarter; + T bz = ((ex[3*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[1*3+2]) + + (ex[7*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[5*3+2])) * quarter; + + T invLen = myRsqrt(ax * ax + ay * ay + az * az); + ax *= invLen; ay *= invLen; az *= invLen; + + T cx = ay * bz - az * by; + T cy = az * bx - ax * bz; + T cz = ax * by - ay * bx; + invLen = myRsqrt(cx * cx + cy * cy + cz * cz); + cx *= invLen; cy *= invLen; cz *= invLen; + + bx = cy * az - cz * ay; + by = cz * ax - cx * az; + bz = cx * ay - cy * ax; + + frame[0] = ax; frame[1] = ay; frame[2] = az; + frame[3] = bx; frame[4] = by; frame[5] = bz; + frame[6] = cx; frame[7] = cy; frame[8] = cz; +} + +//============================================================================= +// Symmetric block-matrix multiply +//============================================================================= + +template +__device__ inline void symBlockMatMul(const T* K, const T* in, T* out) +{ + #pragma unroll + for (int i = 0; i < NNodes * Dim; ++i) + out[i] = T(0); + + #pragma unroll + for (int ni = 0; ni < NNodes; ++ni) + { + const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; + + // Diagonal block + { + const T* Kii = K + diagIdx * Dim * Dim; + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kii[di * Dim + dj] * in[ni * Dim + dj]; + out[ni * Dim + di] += sum; + } + } + + // Off-diagonal blocks + #pragma unroll + for (int nj = ni + 1; nj < NNodes; ++nj) + { + const int symIdx = diagIdx + (nj - ni); + const T* Kij = K + symIdx * Dim * Dim; + + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += Kij[di * Dim + dj] * in[nj * Dim + dj]; + out[ni * Dim + di] += sum; + } + + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + { + T sum = T(0); + #pragma unroll + for (int di = 0; di < Dim; ++di) + 
sum += Kij[di * Dim + dj] * in[ni * Dim + di]; + out[nj * Dim + dj] += sum; + } + } + } +} + +//============================================================================= +// Gather kernel +//============================================================================= + +template +__global__ void ElementFEM_gatherForce_kernel( + int nbVertex, + int maxElemPerVertex, + const int* __restrict__ velems, + const T* __restrict__ eforce, + T* df) +{ + const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; + if (vertexId >= nbVertex) return; + + T acc[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + acc[d] = T(0); + + for (int s = 0; s < maxElemPerVertex; ++s) + { + const int idx = velems[s * nbVertex + vertexId]; + if (idx == 0) break; + const int base = (idx - 1) * Dim; + #pragma unroll + for (int d = 0; d < Dim; ++d) + acc[d] += eforce[base + d]; + } + + #pragma unroll + for (int d = 0; d < Dim; ++d) + df[vertexId * Dim + d] += acc[d]; +} + +} // namespace sofa::gpu::cuda diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu index e8492615c32..10b04b98193 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu @@ -21,21 +21,13 @@ ******************************************************************************/ #include #include -#include +#include "CudaElementFEMKernelUtils.cuh" -namespace sofa -{ -namespace gpu -{ -namespace cuda +namespace sofa::gpu::cuda { /** - * Kernel for addForce: Compute per-element force from displacement (1 thread per element). 
- * - * f = -K * (x - x0) - * Templated on NNodes and Dim (compile-time) for full loop unrolling. - * Templated on T for float/double support. + * Kernel for addForce: f = -K * (x - x0) */ template __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( @@ -51,7 +43,6 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - // Gather displacement = x - x0 for this element's nodes T disp[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) @@ -62,65 +53,10 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( disp[n * Dim + d] = x[nodeId * Dim + d] - x0[nodeId * Dim + d]; } - // Symmetric block-matrix multiply: edf = K * disp const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; T edf[NNodes * Dim]; + symBlockMatMul(K, disp, edf); - #pragma unroll - for (int i = 0; i < NNodes * Dim; ++i) - edf[i] = T(0); - - #pragma unroll - for (int ni = 0; ni < NNodes; ++ni) - { - const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; - - // Diagonal block - { - const T* Kii = K + diagIdx * Dim * Dim; - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += Kii[di * Dim + dj] * disp[ni * Dim + dj]; - edf[ni * Dim + di] += sum; - } - } - - // Off-diagonal blocks - #pragma unroll - for (int nj = ni + 1; nj < NNodes; ++nj) - { - const int symIdx = diagIdx + (nj - ni); - const T* Kij = K + symIdx * Dim * Dim; - - // Kij * disp_j -> edf_i - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += Kij[di * Dim + dj] * disp[nj * Dim + dj]; - edf[ni * Dim + di] += sum; - } - - // Kij^T * disp_i -> edf_j - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - { - T sum = T(0); - #pragma unroll - for (int di = 0; di < Dim; ++di) - sum += Kij[di * Dim + dj] * disp[ni * Dim + di]; - 
edf[nj * Dim + dj] += sum; - } - } - } - - // Write: eforce = -edf (minus sign from f -= K * displacement) T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) @@ -132,9 +68,7 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( } /** - * Kernel for addDForce: Compute per-element dForce (1 thread per element). - * - * df = -kFactor * K * dx + * Kernel for addDForce: df = -kFactor * K * dx */ template __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( @@ -150,7 +84,6 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; - // Gather dx for this element's nodes T edx[NNodes * Dim]; #pragma unroll for (int n = 0; n < NNodes; ++n) @@ -161,61 +94,10 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( edx[n * Dim + d] = dx[nodeId * Dim + d]; } - // Symmetric block-matrix multiply: edf = K * edx const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; T edf[NNodes * Dim]; + symBlockMatMul(K, edx, edf); - #pragma unroll - for (int i = 0; i < NNodes * Dim; ++i) - edf[i] = T(0); - - #pragma unroll - for (int ni = 0; ni < NNodes; ++ni) - { - const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; - - { - const T* Kii = K + diagIdx * Dim * Dim; - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += Kii[di * Dim + dj] * edx[ni * Dim + dj]; - edf[ni * Dim + di] += sum; - } - } - - #pragma unroll - for (int nj = ni + 1; nj < NNodes; ++nj) - { - const int symIdx = diagIdx + (nj - ni); - const T* Kij = K + symIdx * Dim * Dim; - - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += Kij[di * Dim + dj] * edx[nj * Dim + dj]; - edf[ni * Dim + di] += sum; - } - - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - 
{ - T sum = T(0); - #pragma unroll - for (int di = 0; di < Dim; ++di) - sum += Kij[di * Dim + dj] * edx[ni * Dim + di]; - edf[nj * Dim + dj] += sum; - } - } - } - - // Write: eforce = -kFactor * edf T* out = eforce + elemId * NNodes * Dim; #pragma unroll for (int n = 0; n < NNodes; ++n) @@ -226,39 +108,7 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( } } -/** - * Gather per-vertex forces (1 thread per vertex). - */ -template -__global__ void ElementLinearSmallStrainFEMForceField_gatherForce_kernel( - int nbVertex, - int maxElemPerVertex, - const int* __restrict__ velems, - const T* __restrict__ eforce, - T* df) -{ - const int vertexId = blockIdx.x * blockDim.x + threadIdx.x; - if (vertexId >= nbVertex) return; - - T acc[Dim]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - acc[d] = T(0); - - for (int s = 0; s < maxElemPerVertex; ++s) - { - const int idx = velems[s * nbVertex + vertexId]; - if (idx == 0) break; - const int base = (idx - 1) * Dim; - #pragma unroll - for (int d = 0; d < Dim; ++d) - acc[d] += eforce[base + d]; - } - - #pragma unroll - for (int d = 0; d < Dim; ++d) - df[vertexId * Dim + d] += acc[d]; -} +// ===================== Launch functions ===================== template void ElementLinearSmallStrainFEMForceFieldCuda_addForce( @@ -287,14 +137,14 @@ void ElementLinearSmallStrainFEMForceFieldCuda_addForce( const int gatherThreads = 256; numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - ElementLinearSmallStrainFEMForceField_gatherForce_kernel + ElementFEM_gatherForce_kernel <<>>( nbVertex, maxElemPerVertex, (const int*)velems, (const T*)eforce, (T*)f); - mycudaDebugError("ElementLinearSmallStrainFEMForceField_gatherForce_kernel"); + mycudaDebugError("ElementFEM_gatherForce_kernel"); } template @@ -324,35 +174,36 @@ void ElementLinearSmallStrainFEMForceFieldCuda_addDForce( const int gatherThreads = 256; numBlocks = (nbVertex + gatherThreads - 1) / gatherThreads; - 
ElementLinearSmallStrainFEMForceField_gatherForce_kernel + ElementFEM_gatherForce_kernel <<>>( nbVertex, maxElemPerVertex, (const int*)velems, (const T*)eforce, (T*)df); - mycudaDebugError("ElementLinearSmallStrainFEMForceField_gatherForce_kernel"); + mycudaDebugError("ElementFEM_gatherForce_kernel"); } -// Explicit template instantiations for all supported (T, NNodes, Dim) combinations -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); -template void ElementLinearSmallStrainFEMForceFieldCuda_addForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, const void*, void*, void*, const void*); +// ===================== Explicit template instantiations ===================== + +#define 
INSTANTIATE_LINEAR(T, NNodes) \ + template void ElementLinearSmallStrainFEMForceFieldCuda_addForce( \ + unsigned int, unsigned int, unsigned int, const void*, const void*, \ + const void*, const void*, void*, void*, const void*); \ + template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce( \ + unsigned int, unsigned int, unsigned int, const void*, const void*, \ + const void*, void*, void*, const void*, T); + +INSTANTIATE_LINEAR(float, 2) +INSTANTIATE_LINEAR(float, 3) +INSTANTIATE_LINEAR(float, 4) +INSTANTIATE_LINEAR(float, 8) + +INSTANTIATE_LINEAR(double, 2) +INSTANTIATE_LINEAR(double, 3) +INSTANTIATE_LINEAR(double, 4) +INSTANTIATE_LINEAR(double, 8) -template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, float); -template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); -template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); -template void ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); -template void 
ElementLinearSmallStrainFEMForceFieldCuda_addDForce(unsigned int, unsigned int, unsigned int, const void*, const void*, const void*, void*, void*, const void*, double); +#undef INSTANTIATE_LINEAR -} // namespace cuda -} // namespace gpu -} // namespace sofa +} // namespace sofa::gpu::cuda From 6a2d39509a1c408f887fb370d3514e3b7222eeb8 Mon Sep 17 00:00:00 2001 From: Frederick Roy Date: Fri, 10 Apr 2026 13:06:02 +0900 Subject: [PATCH 21/21] clarify code --- .../CudaElementCorotationalFEMForceField.cu | 194 +++---------- .../fem/elastic/CudaElementFEMKernelUtils.cuh | 264 ++++++++++++++++-- ...daElementLinearSmallStrainFEMForceField.cu | 40 +-- 3 files changed, 284 insertions(+), 214 deletions(-) diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu index 3125446e0a6..6616637dc55 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementCorotationalFEMForceField.cu @@ -42,101 +42,50 @@ __global__ void ElementCorotationalFEMForceField_computeRotationsAndForce_kernel { static_assert(Dim == 3, "Corotational rotation computation requires Dim == 3"); constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; - const T invN = T(1) / T(NNodes); const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; + // Gather element positions T ex[NNodes * Dim], ex0[NNodes * Dim]; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - const int nodeId = elements[n * nbElem + elemId]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - ex[n * Dim + d] = x[nodeId * Dim + d]; - ex0[n * Dim + d] = x0[nodeId * Dim + d]; - } - } + gatherElementData(elements, 
nbElem, elemId, x, ex); + gatherElementData(elements, nbElem, elemId, x0, ex0); + // Compute rotation frame from current positions T frame[Dim * Dim]; if constexpr (NNodes == 8) computeHexahedronFrame(ex, frame); else computeTriangleFrame(ex, frame); + // R = frame^T * initRotTransposed const T* irt = initRotTransposed + elemId * Dim * Dim; T R[Dim * Dim]; mat3TransposeMul(frame, irt, R); + // Store rotation for later use T* Rout = rotationsOut + elemId * Dim * Dim; #pragma unroll for (int i = 0; i < Dim * Dim; ++i) Rout[i] = R[i]; + // Compute element centers T center[Dim], center0[Dim]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - center[d] = T(0); - center0[d] = T(0); - } - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - center[d] += ex[n * Dim + d]; - center0[d] += ex0[n * Dim + d]; - } - } - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - center[d] *= invN; - center0[d] *= invN; - } + computeElementCenter(ex, center); + computeElementCenter(ex0, center0); + // Compute corotational displacement T disp[NNodes * Dim]; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - T diff[Dim]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - diff[d] = ex[n * Dim + d] - center[d]; - - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T rotated = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - rotated += R[dj * Dim + di] * diff[dj]; - disp[n * Dim + di] = rotated - (ex0[n * Dim + di] - center0[di]); - } - } + computeCorotationalDisplacement(R, ex, ex0, center, center0, disp); + // Multiply by stiffness matrix T edf[NNodes * Dim]; const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; symBlockMatMul(K, disp, edf); + // Rotate forces back to global frame and negate T* out = eforce + elemId * NNodes * Dim; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; 
++dj) - sum += R[di * Dim + dj] * edf[n * Dim + dj]; - out[n * Dim + di] = -sum; - } - } + rotateAndWriteForce(R, edf, out, T(-1)); } /** @@ -153,92 +102,39 @@ __global__ void ElementCorotationalFEMForceField_computeForce_kernel( T* __restrict__ eforce) { constexpr int NSymBlocks = NNodes * (NNodes + 1) / 2; - const T invN = T(1) / T(NNodes); const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; + // Load rotation matrix const T* Rptr = rotations + elemId * Dim * Dim; T R[Dim * Dim]; #pragma unroll for (int i = 0; i < Dim * Dim; ++i) R[i] = Rptr[i]; + // Gather element positions T ex[NNodes * Dim], ex0[NNodes * Dim]; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - const int nodeId = elements[n * nbElem + elemId]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - ex[n * Dim + d] = x[nodeId * Dim + d]; - ex0[n * Dim + d] = x0[nodeId * Dim + d]; - } - } + gatherElementData(elements, nbElem, elemId, x, ex); + gatherElementData(elements, nbElem, elemId, x0, ex0); + // Compute element centers T center[Dim], center0[Dim]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - center[d] = T(0); - center0[d] = T(0); - } - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - center[d] += ex[n * Dim + d]; - center0[d] += ex0[n * Dim + d]; - } - } - #pragma unroll - for (int d = 0; d < Dim; ++d) - { - center[d] *= invN; - center0[d] *= invN; - } + computeElementCenter(ex, center); + computeElementCenter(ex0, center0); + // Compute corotational displacement T disp[NNodes * Dim]; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - T diff[Dim]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - diff[d] = ex[n * Dim + d] - center[d]; - - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T rotated = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - rotated += R[dj * Dim + di] * diff[dj]; - disp[n * Dim + di] = rotated - (ex0[n * Dim + di] - center0[di]); - 
} - } + computeCorotationalDisplacement(R, ex, ex0, center, center0, disp); + // Multiply by stiffness matrix T edf[NNodes * Dim]; const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; symBlockMatMul(K, disp, edf); + // Rotate forces back to global frame and negate T* out = eforce + elemId * NNodes * Dim; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += R[di * Dim + dj] * edf[n * Dim + dj]; - out[n * Dim + di] = -sum; - } - } + rotateAndWriteForce(R, edf, out, T(-1)); } /** @@ -259,51 +155,25 @@ __global__ void ElementCorotationalFEMForceField_computeDForce_kernel( const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; + // Load rotation matrix const T* Rptr = rotations + elemId * Dim * Dim; T R[Dim * Dim]; #pragma unroll for (int i = 0; i < Dim * Dim; ++i) R[i] = Rptr[i]; + // Gather and rotate displacement: rdx = R^T * dx T rdx[NNodes * Dim]; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - const int nodeId = elements[n * nbElem + elemId]; - T nodeDx[Dim]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - nodeDx[d] = dx[nodeId * Dim + d]; - - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += R[dj * Dim + di] * nodeDx[dj]; - rdx[n * Dim + di] = sum; - } - } + rotateDisplacementTranspose(R, elements, nbElem, elemId, dx, rdx); + // Multiply by stiffness matrix const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; T edf[NNodes * Dim]; symBlockMatMul(K, rdx, edf); + // Rotate forces back to global frame and scale T* out = eforce + elemId * NNodes * Dim; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - #pragma unroll - for (int di = 0; di < Dim; ++di) - { - T sum = T(0); - #pragma unroll - for (int dj = 0; dj < Dim; ++dj) - sum += R[di * Dim + dj] * edf[n * Dim + dj]; - out[n * Dim + di] = 
-kFactor * sum; - } - } + rotateAndWriteForce(R, edf, out, -kFactor); } // ===================== Launch functions ===================== diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh index 605e7773baa..4d65c2f7130 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementFEMKernelUtils.cuh @@ -40,9 +40,10 @@ template<> __device__ inline double myRsqrt(double x) { return rsqrt(x); } //============================================================================= -// 3x3 Matrix operations (row-major) +// 3x3 Matrix operations (row-major storage) //============================================================================= +/// C = A * B template __device__ inline void mat3Mul(const T* A, const T* B, T* C) { @@ -59,6 +60,7 @@ __device__ inline void mat3Mul(const T* A, const T* B, T* C) } } +/// C = A * B^T template __device__ inline void mat3MulTranspose(const T* A, const T* BT, T* C) { @@ -75,6 +77,7 @@ __device__ inline void mat3MulTranspose(const T* A, const T* BT, T* C) } } +/// C = A^T * B template __device__ inline void mat3TransposeMul(const T* A, const T* B, T* C) { @@ -91,62 +94,100 @@ __device__ inline void mat3TransposeMul(const T* A, const T* B, T* C) } } +/// out = R * in (rotate a 3D vector) +template +__device__ inline void rotateVector(const T* R, const T* in, T* out) +{ + #pragma unroll + for (int i = 0; i < 3; ++i) + { + out[i] = R[i * 3 + 0] * in[0] + + R[i * 3 + 1] * in[1] + + R[i * 3 + 2] * in[2]; + } +} + +/// out = R^T * in (rotate a 3D vector by transpose) +template +__device__ inline void rotateVectorTranspose(const T* R, const T* in, T* out) +{ + #pragma 
unroll + for (int i = 0; i < 3; ++i) + { + out[i] = R[0 * 3 + i] * in[0] + + R[1 * 3 + i] * in[1] + + R[2 * 3 + i] * in[2]; + } +} + //============================================================================= // Rotation frame computation //============================================================================= +/// Compute rotation frame from first 3 nodes (for Triangle, Quad, Tetrahedron) template -__device__ inline void computeTriangleFrame(const T* ex, T* frame) +__device__ inline void computeTriangleFrame(const T* pos, T* frame) { - T ax = ex[3] - ex[0], ay = ex[4] - ex[1], az = ex[5] - ex[2]; + // X axis: normalized (p1 - p0) + T ax = pos[3] - pos[0], ay = pos[4] - pos[1], az = pos[5] - pos[2]; T invLen = myRsqrt(ax * ax + ay * ay + az * az); ax *= invLen; ay *= invLen; az *= invLen; - T bx = ex[6] - ex[0], by = ex[7] - ex[1], bz = ex[8] - ex[2]; + // Temp vector b = p2 - p0 + T bx = pos[6] - pos[0], by = pos[7] - pos[1], bz = pos[8] - pos[2]; + // Z axis: normalized cross(a, b) T cx = ay * bz - az * by; T cy = az * bx - ax * bz; T cz = ax * by - ay * bx; invLen = myRsqrt(cx * cx + cy * cy + cz * cz); cx *= invLen; cy *= invLen; cz *= invLen; + // Y axis: cross(z, x) bx = cy * az - cz * ay; by = cz * ax - cx * az; bz = cx * ay - cy * ax; + // Store row-major: frame[row][col] = frame[row * 3 + col] frame[0] = ax; frame[1] = ay; frame[2] = az; frame[3] = bx; frame[4] = by; frame[5] = bz; frame[6] = cx; frame[7] = cy; frame[8] = cz; } +/// Compute rotation frame from 8 hexahedron nodes template -__device__ inline void computeHexahedronFrame(const T* ex, T* frame) +__device__ inline void computeHexahedronFrame(const T* pos, T* frame) { const T quarter = T(0.25); - T ax = ((ex[1*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[3*3+0]) - + (ex[5*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[7*3+0])) * quarter; - T ay = ((ex[1*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[3*3+1]) - + (ex[5*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[7*3+1])) * quarter; - T az = ((ex[1*3+2] - ex[0*3+2]) + 
(ex[2*3+2] - ex[3*3+2]) - + (ex[5*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[7*3+2])) * quarter; - - T bx = ((ex[3*3+0] - ex[0*3+0]) + (ex[2*3+0] - ex[1*3+0]) - + (ex[7*3+0] - ex[4*3+0]) + (ex[6*3+0] - ex[5*3+0])) * quarter; - T by = ((ex[3*3+1] - ex[0*3+1]) + (ex[2*3+1] - ex[1*3+1]) - + (ex[7*3+1] - ex[4*3+1]) + (ex[6*3+1] - ex[5*3+1])) * quarter; - T bz = ((ex[3*3+2] - ex[0*3+2]) + (ex[2*3+2] - ex[1*3+2]) - + (ex[7*3+2] - ex[4*3+2]) + (ex[6*3+2] - ex[5*3+2])) * quarter; - + // Average X direction from 4 edge pairs + T ax = ((pos[1*3+0] - pos[0*3+0]) + (pos[2*3+0] - pos[3*3+0]) + + (pos[5*3+0] - pos[4*3+0]) + (pos[6*3+0] - pos[7*3+0])) * quarter; + T ay = ((pos[1*3+1] - pos[0*3+1]) + (pos[2*3+1] - pos[3*3+1]) + + (pos[5*3+1] - pos[4*3+1]) + (pos[6*3+1] - pos[7*3+1])) * quarter; + T az = ((pos[1*3+2] - pos[0*3+2]) + (pos[2*3+2] - pos[3*3+2]) + + (pos[5*3+2] - pos[4*3+2]) + (pos[6*3+2] - pos[7*3+2])) * quarter; + + // Average Y direction + T bx = ((pos[3*3+0] - pos[0*3+0]) + (pos[2*3+0] - pos[1*3+0]) + + (pos[7*3+0] - pos[4*3+0]) + (pos[6*3+0] - pos[5*3+0])) * quarter; + T by = ((pos[3*3+1] - pos[0*3+1]) + (pos[2*3+1] - pos[1*3+1]) + + (pos[7*3+1] - pos[4*3+1]) + (pos[6*3+1] - pos[5*3+1])) * quarter; + T bz = ((pos[3*3+2] - pos[0*3+2]) + (pos[2*3+2] - pos[1*3+2]) + + (pos[7*3+2] - pos[4*3+2]) + (pos[6*3+2] - pos[5*3+2])) * quarter; + + // Normalize X T invLen = myRsqrt(ax * ax + ay * ay + az * az); ax *= invLen; ay *= invLen; az *= invLen; + // Z = normalized cross(X, Y) T cx = ay * bz - az * by; T cy = az * bx - ax * bz; T cz = ax * by - ay * bx; invLen = myRsqrt(cx * cx + cy * cy + cz * cz); cx *= invLen; cy *= invLen; cz *= invLen; + // Y = cross(Z, X) bx = cy * az - cz * ay; by = cz * ax - cx * az; bz = cx * ay - cy * ax; @@ -156,13 +197,150 @@ __device__ inline void computeHexahedronFrame(const T* ex, T* frame) frame[6] = cx; frame[7] = cy; frame[8] = cz; } +//============================================================================= +// Element data gathering 
+//============================================================================= + +/// Gather positions for one element from global arrays (SoA layout) +template +__device__ inline void gatherElementData( + const int* elements, int nbElem, int elemId, + const T* globalData, + T* localData) +{ + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const int nodeId = elements[n * nbElem + elemId]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + localData[n * Dim + d] = globalData[nodeId * Dim + d]; + } +} + +/// Gather displacement (x - x0) for one element +template +__device__ inline void gatherElementDisplacement( + const int* elements, int nbElem, int elemId, + const T* x, const T* x0, + T* disp) +{ + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const int nodeId = elements[n * nbElem + elemId]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + disp[n * Dim + d] = x[nodeId * Dim + d] - x0[nodeId * Dim + d]; + } +} + +//============================================================================= +// Element center computation +//============================================================================= + +/// Compute center of element positions +template +__device__ inline void computeElementCenter(const T* pos, T* center) +{ + const T invN = T(1) / T(NNodes); + + #pragma unroll + for (int d = 0; d < Dim; ++d) + center[d] = T(0); + + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + #pragma unroll + for (int d = 0; d < Dim; ++d) + center[d] += pos[n * Dim + d]; + } + + #pragma unroll + for (int d = 0; d < Dim; ++d) + center[d] *= invN; +} + +//============================================================================= +// Corotational displacement computation +//============================================================================= + +/// Compute corotational displacement: disp = R^T * (x - center) - (x0 - center0) +template +__device__ inline void computeCorotationalDisplacement( + const T* R, + const T* x, const T* x0, + const 
T* center, const T* center0, + T* disp) +{ + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + // diff = x_n - center + T diff[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + diff[d] = x[n * Dim + d] - center[d]; + + // rotated = R^T * diff + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T rotated = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + rotated += R[dj * Dim + di] * diff[dj]; + disp[n * Dim + di] = rotated - (x0[n * Dim + di] - center0[di]); + } + } +} + +/// Compute R^T * dx for each node (for addDForce) +template +__device__ inline void rotateDisplacementTranspose( + const T* R, + const int* elements, int nbElem, int elemId, + const T* dx, + T* rdx) +{ + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + const int nodeId = elements[n * nbElem + elemId]; + + T nodeDx[Dim]; + #pragma unroll + for (int d = 0; d < Dim; ++d) + nodeDx[d] = dx[nodeId * Dim + d]; + + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += R[dj * Dim + di] * nodeDx[dj]; + rdx[n * Dim + di] = sum; + } + } +} + //============================================================================= // Symmetric block-matrix multiply //============================================================================= +/** + * Symmetric block-matrix multiply: out = K * in + * + * K is stored in upper-triangle block format: + * symIdx = ni * NNodes - ni*(ni-1)/2 + (nj - ni) for nj >= ni + * K[symIdx * Dim * Dim + di * Dim + dj] for each element + */ template __device__ inline void symBlockMatMul(const T* K, const T* in, T* out) { + // Initialize output to zero #pragma unroll for (int i = 0; i < NNodes * Dim; ++i) out[i] = T(0); @@ -172,7 +350,7 @@ __device__ inline void symBlockMatMul(const T* K, const T* in, T* out) { const int diagIdx = ni * NNodes - ni * (ni - 1) / 2; - // Diagonal block + // Diagonal block: Kii * in_i -> out_i { const T* Kii = K + diagIdx * Dim * Dim; #pragma 
unroll @@ -186,13 +364,14 @@ __device__ inline void symBlockMatMul(const T* K, const T* in, T* out) } } - // Off-diagonal blocks + // Off-diagonal blocks (symmetric: Kij and Kij^T) #pragma unroll for (int nj = ni + 1; nj < NNodes; ++nj) { const int symIdx = diagIdx + (nj - ni); const T* Kij = K + symIdx * Dim * Dim; + // Kij * in_j -> out_i #pragma unroll for (int di = 0; di < Dim; ++di) { @@ -203,6 +382,7 @@ __device__ inline void symBlockMatMul(const T* K, const T* in, T* out) out[ni * Dim + di] += sum; } + // Kij^T * in_i -> out_j #pragma unroll for (int dj = 0; dj < Dim; ++dj) { @@ -217,9 +397,49 @@ __device__ inline void symBlockMatMul(const T* K, const T* in, T* out) } //============================================================================= -// Gather kernel +// Force output with rotation +//============================================================================= + +/// Rotate local forces to global frame and write: out = scale * R * localForce +template +__device__ inline void rotateAndWriteForce( + const T* R, + const T* localForce, + T* out, + T scale) +{ + #pragma unroll + for (int n = 0; n < NNodes; ++n) + { + #pragma unroll + for (int di = 0; di < Dim; ++di) + { + T sum = T(0); + #pragma unroll + for (int dj = 0; dj < Dim; ++dj) + sum += R[di * Dim + dj] * localForce[n * Dim + dj]; + out[n * Dim + di] = scale * sum; + } + } +} + +/// Write negated force (for linear case without rotation): out = scale * localForce +template +__device__ inline void writeForce(const T* localForce, T* out, T scale) +{ + #pragma unroll + for (int i = 0; i < NNodes * Dim; ++i) + out[i] = scale * localForce[i]; +} + +//============================================================================= +// Gather kernel for accumulating per-vertex forces //============================================================================= +/** + * Gather per-vertex forces from per-element contributions. 
+ * velems[slot * nbVertex + vertexId] contains (elemId * NNodes + localNode + 1), 0 = end + */ template __global__ void ElementFEM_gatherForce_kernel( int nbVertex, diff --git a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu index 10b04b98193..9d0195ab580 100644 --- a/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu +++ b/applications/plugins/SofaCUDA/Component/src/SofaCUDA/component/solidmechanics/fem/elastic/CudaElementLinearSmallStrainFEMForceField.cu @@ -43,28 +43,18 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeForce_kernel( const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; + // Gather displacement (x - x0) T disp[NNodes * Dim]; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - const int nodeId = elements[n * nbElem + elemId]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - disp[n * Dim + d] = x[nodeId * Dim + d] - x0[nodeId * Dim + d]; - } + gatherElementDisplacement(elements, nbElem, elemId, x, x0, disp); + // Multiply by stiffness matrix const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; T edf[NNodes * Dim]; symBlockMatMul(K, disp, edf); + // Write negated force T* out = eforce + elemId * NNodes * Dim; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - #pragma unroll - for (int d = 0; d < Dim; ++d) - out[n * Dim + d] = -edf[n * Dim + d]; - } + writeForce(edf, out, T(-1)); } /** @@ -84,28 +74,18 @@ __global__ void ElementLinearSmallStrainFEMForceField_computeDForce_kernel( const int elemId = blockIdx.x * blockDim.x + threadIdx.x; if (elemId >= nbElem) return; + // Gather displacement increment T edx[NNodes * Dim]; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - 
const int nodeId = elements[n * nbElem + elemId]; - #pragma unroll - for (int d = 0; d < Dim; ++d) - edx[n * Dim + d] = dx[nodeId * Dim + d]; - } + gatherElementData(elements, nbElem, elemId, dx, edx); + // Multiply by stiffness matrix const T* K = stiffness + elemId * NSymBlocks * Dim * Dim; T edf[NNodes * Dim]; symBlockMatMul(K, edx, edf); + // Write scaled negated force T* out = eforce + elemId * NNodes * Dim; - #pragma unroll - for (int n = 0; n < NNodes; ++n) - { - #pragma unroll - for (int d = 0; d < Dim; ++d) - out[n * Dim + d] = -kFactor * edf[n * Dim + d]; - } + writeForce(edf, out, -kFactor); } // ===================== Launch functions =====================