From 3df801f5a60d32df20a621c41d52e2cabf988de1 Mon Sep 17 00:00:00 2001 From: Timofey Zakharchuk Date: Sun, 15 Feb 2026 21:17:48 +0300 Subject: [PATCH 01/13] Add MPI --- CMakeLists.txt | 53 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 452cd0c..73eb43c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,12 @@ if (MINGW) elseif (MSVC) message(STATUS "Detected MSVC — using MSVC settings") set(USING_MSVC TRUE) +elseif (APPLE) + message(STATUS "Detected macOS — limited MPI support") + set(USING_APPLE TRUE) +elseif (UNIX) + message(STATUS "Detected Linux/UNIX — using standard settings") + set(USING_UNIX TRUE) endif() # ============================ @@ -18,19 +24,15 @@ endif() # ============================ option(NEXT_FP32 "Use 32-bit floats (scalar)" OFF) option(NEXT_FP64 "Use 64-bit floats (scalar)" ON) +option(NEXT_MPI "Enable MPI support" OFF) -# Option: copy final executable to source dir option(NEXT_COPY_TO_CMAKE_SOURCE_DIR "Copy final executable from build dir to source dir" ON) # ============================ # Precision modes # ============================ -set(NEXT_MODES - NEXT_FP32 - NEXT_FP64 -) +set(NEXT_MODES NEXT_FP32 NEXT_FP64) -# Count enabled modes set(NEXT_MODE_COUNT 0) foreach(m ${NEXT_MODES}) if(${m}) @@ -38,14 +40,12 @@ foreach(m ${NEXT_MODES}) endif() endforeach() -# Enforce exactly one mode if(NEXT_MODE_COUNT EQUAL 0) message(FATAL_ERROR "You must enable NEXT_FP64 or NEXT_FP32.") elseif(NEXT_MODE_COUNT GREATER 1) message(FATAL_ERROR "Enable only one precision mode.") endif() -# Apply compile definitions if(NEXT_FP32) add_compile_definitions(NEXT_FP32) elseif(NEXT_FP64) @@ -62,6 +62,9 @@ file(GLOB ARGPARSE_FILES ${CMAKE_SOURCE_DIR}/argparse/*.cpp) add_executable(next ${SRC_FILES} ${ARGPARSE_FILES}) +# ============================ +# Vectorization reports +# ============================ option(ENABLE_VEC_REPORT "Enable compiler vectorization reports" ON) if(ENABLE_VEC_REPORT) @@ -74,38 +77,56 @@ if(ENABLE_VEC_REPORT) endif() endif() - # ============================ # OpenMP # ============================ find_package(OpenMP QUIET) if(OpenMP_CXX_FOUND) message(STATUS "OpenMP detected — enabling multithreading.") - if(MSVC) - # Force MSVC to use the modern LLVM backend (supports OpenMP 3.0+ and size_t) - # We add it as a compiler option because MSVC's default find_package - # often defaults to the legacy /openmp flag. target_compile_options(next PRIVATE /openmp:llvm) else() target_link_libraries(next PRIVATE OpenMP::OpenMP_CXX) endif() - else() message(STATUS "OpenMP not found — building in single-threaded mode.") endif() +# ============================ +# MPI detection +# ============================ +if(NEXT_MPI) + if(USING_UNIX) + # Linux: standard OpenMPI + find_package(MPI REQUIRED) + message(STATUS "MPI (OpenMPI) detected — enabling distributed memory parallelism.") + target_compile_definitions(next PRIVATE NEXT_MPI) + target_link_libraries(next PRIVATE MPI::MPI_CXX) + + elseif(USING_MSVC OR USING_MINGW) + # Windows: Microsoft MPI + find_package(MPI REQUIRED) + message(STATUS "MPI (MS-MPI) detected — enabling distributed memory parallelism.") + target_compile_definitions(next PRIVATE NEXT_MPI) + target_link_libraries(next PRIVATE MPI::MPI_CXX) + + elseif(USING_APPLE) + message(WARNING "MPI requested, but macOS does not ship MPI by default. Please install OpenMPI via Homebrew (brew install open-mpi).") + find_package(MPI REQUIRED) + target_compile_definitions(next PRIVATE NEXT_MPI) + target_link_libraries(next PRIVATE MPI::MPI_CXX) + endif() +endif() + # ============================ # HDF5 detection # ============================ -# MSVC: use vcpkg or prebuilt binaries if (USING_MSVC AND DEFINED ENV{HDF5_DIR}) set(HDF5_ROOT "$ENV{HDF5_DIR}") set(CMAKE_PREFIX_PATH "${HDF5_ROOT};${CMAKE_PREFIX_PATH}") endif() if (USING_MINGW) - # MSYS2 MinGW installs HDF5 here set(CMAKE_PREFIX_PATH "C:/tools/msys64/mingw64") message(STATUS "Using MSYS2 MinGW HDF5 path: ${CMAKE_PREFIX_PATH}") endif() From baa390922465c92d84504cf398e7a6e78f046dc3 Mon Sep 17 00:00:00 2001 From: Timofey Zakharchuk Date: Sun, 15 Feb 2026 21:18:22 +0300 Subject: [PATCH 02/13] Update begrun.cpp --- src/begrun.cpp | 137 ++++++++++++++++++++++++++----------------------- 1 file changed, 74 insertions(+), 63 deletions(-) diff --git a/src/begrun.cpp b/src/begrun.cpp index 47dea18..1550208 100644 --- a/src/begrun.cpp +++ b/src/begrun.cpp @@ -22,13 +22,37 @@ #include #include #include +#ifdef NEXT_MPI + #include +#endif +#include "hdf5.h" + using next::OutputFormat; +// Helper: only rank 0, thread 0 prints +inline void log_once(int rank, const std::string &msg) { + if (rank == 0 && omp_get_thread_num() == 0) { + std::cout << msg << std::endl; + } +} + int main(int argc, char **argv) { - auto args = next::parse_arguments(argc, argv); + int rank = 0; + int size = 1; + +#ifdef NEXT_MPI + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + log_once(rank, "MPI mode enabled (" + std::to_string(size) + " ranks)"); +#endif - static constexpr const char *BANNER = R"NEXTBANNER( + H5Eset_auto(H5E_DEFAULT, nullptr, nullptr); + + auto args = next::parse_arguments(argc, argv, rank); + + static constexpr const char *BANNER = R"NEXTBANNER( _ _ ________ _________ | \ | | ____\ \ / /__ __| | \| | |__ \ V / | | @@ -37,73 +61,60 @@ int main(int argc, char **argv) { |_| \_|______/_/ \_\ |_| )NEXTBANNER"; - std::cout << BANNER << '\n'; - - // Set threads and report - omp_set_num_threads(args.threads); - std::cout << " Threads: " << args.threads << "\n"; + omp_set_num_threads(args.threads); + log_once(rank, BANNER); + log_once(rank, " Threads: " + std::to_string(args.threads)); #ifdef NEXT_FP64 - std::cout << " Precision: FP64\n"; + log_once(rank, " Precision: FP64"); #elif defined(NEXT_FP32) - std::cout << " Precision: FP32\n"; + log_once(rank, " Precision: FP32"); #endif - // --- SoA UPDATE --- - // LoadParticlesFromFile now returns a single 'Particle' struct - // which internally contains all the parallel vectors. - Particle particles = LoadParticlesFromFile(args.input_file); - - std::cout << " Particles: " << particles.size() << "\n"; - - real simTime = 0; - real nextDump = 0; - int step = 0; - char command; - - while (true) { - // Both computeAdaptiveDt and Step now take the SoA object by reference - real dtAdaptive = computeAdaptiveDt(particles, args.dt); - Step(particles, dtAdaptive); - - simTime += dtAdaptive; - - if (simTime >= nextDump) { - std::string out = "dump_" + std::to_string(step); - - switch (args.format) { - case OutputFormat::VTK: - out += ".vtk"; - SaveVTK(particles, out); - break; - - case OutputFormat::VTU: - out += ".vtu"; - SaveVTU(particles, out); - break; - - case OutputFormat::HDF5: - out += ".hdf5"; - SaveHDF5(particles, out); - break; - } - - std::cout << "[Dump " << step << "] t = " << simTime - << ", file: " << out << "\n"; - - nextDump += args.dump_interval; - step++; + // Load particles + Particle particles = LoadParticlesFromFile(args.input_file); + log_once(rank, " Particles: " + std::to_string(particles.size())); + + real simTime = 0; + real nextDump = 0; + int step = 0; + char command; + + while (true) { + real dtAdaptive = computeAdaptiveDt(particles, args.dt); + Step(particles, dtAdaptive); + simTime += dtAdaptive; + + if (simTime >= nextDump) { + std::string out = "dump_" + std::to_string(step); + + switch (args.format) { + case OutputFormat::VTK: out += ".vtk"; SaveVTK(particles, out); break; + case OutputFormat::VTU: out += ".vtu"; SaveVTU(particles, out); break; + case OutputFormat::HDF5: out += ".hdf5"; SaveHDF5(particles, out); break; + } + + log_once(rank, "[Dump " + std::to_string(step) + + "] t = " + std::to_string(simTime) + + ", file: " + out); + + nextDump += args.dump_interval; + step++; + } + + // Non-blocking exit check + if (std::cin.rdbuf()->in_avail() > 0) { + std::cin >> command; + if (command == 'q' || command == 'Q') { + log_once(rank, "Exiting..."); + break; + } + } } - // Non-blocking exit check - if (std::cin.rdbuf()->in_avail() > 0) { - std::cin >> command; - if (command == 'q' || command == 'Q') { - std::cout << "Exiting...\n"; - break; - } - } - } +#ifdef NEXT_MPI + MPI_Finalize(); +#endif - return 0; + return 0; } From 17f28833bb686d2e320797f7ed160cdb0aebcb43 Mon Sep 17 00:00:00 2001 From: Timofey Zakharchuk Date: Sun, 15 Feb 2026 21:18:55 +0300 Subject: [PATCH 03/13] Update step.h --- src/gravity/step.h | 166 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 138 insertions(+), 28 deletions(-) diff --git a/src/gravity/step.h b/src/gravity/step.h index 96395d3..cf114ff 100644 --- a/src/gravity/step.h +++ b/src/gravity/step.h @@ -16,6 +16,9 @@ #include #include #include +#ifdef NEXT_MPI + #include +#endif /** * @brief Performs a complete Leapfrog Step (Kick-Drift-Kick) using SoA data. @@ -23,75 +26,182 @@ inline void Step(ParticleSystem &ps, real dt) { if (ps.size() == 0) return; - const real theta = 0.5; - const real half = dt * real(0.5); - const int N = static_cast(ps.size()); + const real theta = real(0.5); + const real half = dt * real(0.5); + const int N = static_cast(ps.size()); + + // --------------------------------------------------------- + // MPI SETUP + // --------------------------------------------------------- +#ifdef NEXT_MPI + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // Select MPI datatype matching + MPI_Datatype MPI_REAL_T; +# ifdef NEXT_FP64 + MPI_REAL_T = MPI_DOUBLE; +# elif defined(NEXT_FP32) + MPI_REAL_T = MPI_FLOAT; +# else +# error "Define NEXT_FP32 or NEXT_FP64 for 'real' type." +# endif +#else + int rank = 0; + int size = 1; +#endif + + // --------------------------------------------------------- + // DOMAIN DECOMPOSITION + // --------------------------------------------------------- + const int start = (rank * N) / size; + const int end = ((rank + 1) * N) / size; + +#ifdef NEXT_MPI + // Precompute counts and displacements for Allgatherv + std::vector counts(size), displs(size); + for (int r = 0; r < size; ++r) { + const int s = (r * N) / size; + const int e = ((r + 1) * N) / size; + counts[r] = e - s; + displs[r] = s; + } +#endif - // Helper lambda to build the tree using the ParticleSystem indices + // --------------------------------------------------------- + // TREE BUILDER + // --------------------------------------------------------- auto buildTree = [&]() -> std::unique_ptr { - real minx = 1e30, miny = 1e30, minz = 1e30; - real maxx = -1e30, maxy = -1e30, maxz = -1e30; + struct BBox { real minx, miny, minz, maxx, maxy, maxz; }; + BBox local{ real(1e30), real(1e30), real(1e30), + real(-1e30), real(-1e30), real(-1e30) }; - // Bounding box calculation (SoA access is very fast here) for (int i = 0; i < N; ++i) { - minx = std::min(minx, ps.x[i]); miny = std::min(miny, ps.y[i]); minz = std::min(minz, ps.z[i]); - maxx = std::max(maxx, ps.x[i]); maxy = std::max(maxy, ps.y[i]); maxz = std::max(maxz, ps.z[i]); + local.minx = std::min(local.minx, ps.x[i]); + local.miny = std::min(local.miny, ps.y[i]); + local.minz = std::min(local.minz, ps.z[i]); + local.maxx = std::max(local.maxx, ps.x[i]); + local.maxy = std::max(local.maxy, ps.y[i]); + local.maxz = std::max(local.maxz, ps.z[i]); } - real cx = (minx + maxx) * 0.5; - real cy = (miny + maxy) * 0.5; - real cz = (minz + maxz) * 0.5; - real size = std::max({maxx - minx, maxy - miny, maxz - minz}) * 0.5; +#ifdef NEXT_MPI + BBox global; + real mins[3] = {local.minx, local.miny, local.minz}; + real maxs[3] = {local.maxx, local.maxy, local.maxz}; + + MPI_Allreduce(MPI_IN_PLACE, mins, 3, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, maxs, 3, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD); + + global.minx = mins[0]; global.miny = mins[1]; global.minz = mins[2]; + global.maxx = maxs[0]; global.maxy = maxs[1]; global.maxz = maxs[2]; - if (size <= 0) size = 1.0; +#else + BBox global = local; +#endif + + const real cx = (global.minx + global.maxx) * real(0.5); + const real cy = (global.miny + global.maxy) * real(0.5); + const real cz = (global.minz + global.maxz) * real(0.5); + real size = std::max({global.maxx - global.minx, + global.maxy - global.miny, + global.maxz - global.minz}) * real(0.5); + + if (size <= real(0)) size = real(1.0); auto root = std::make_unique(cx, cy, cz, size); - // Insert particle indices 0 to N-1 - for (int i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) root->insert(i, ps); - } root->computeMass(ps); return root; }; - // --- First Kick (dt/2) --- + // --------------------------------------------------------- + // FIRST KICK + // --------------------------------------------------------- { - std::unique_ptr root = buildTree(); + auto root = buildTree(); #pragma omp parallel for schedule(dynamic, 64) - for (int i = 0; i < N; ++i) { - real ax = 0, ay = 0, az = 0; + for (int i = start; i < end; ++i) { + real ax = real(0), ay = real(0), az = real(0); bhAccel(root.get(), i, ps, theta, ax, ay, az); ps.vx[i] += ax * half; ps.vy[i] += ay * half; ps.vz[i] += az * half; } + +#ifdef NEXT_MPI + MPI_Request reqs[3]; + MPI_Iallgatherv(ps.vx.data() + start, end - start, MPI_REAL_T, + ps.vx.data(), counts.data(), displs.data(), MPI_REAL_T, + MPI_COMM_WORLD, &reqs[0]); + MPI_Iallgatherv(ps.vy.data() + start, end - start, MPI_REAL_T, + ps.vy.data(), counts.data(), displs.data(), MPI_REAL_T, + MPI_COMM_WORLD, &reqs[1]); + MPI_Iallgatherv(ps.vz.data() + start, end - start, MPI_REAL_T, + ps.vz.data(), counts.data(), displs.data(), MPI_REAL_T, + MPI_COMM_WORLD, &reqs[2]); + MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE); +#endif } - // --- Drift (dt) --- - // Contiguous memory access makes this loop ideal for SIMD + // --------------------------------------------------------- + // DRIFT + // --------------------------------------------------------- #pragma omp parallel for schedule(static) - for (int i = 0; i < N; ++i) { + for (int i = start; i < end; ++i) { ps.x[i] += ps.vx[i] * dt; ps.y[i] += ps.vy[i] * dt; ps.z[i] += ps.vz[i] * dt; } - // --- Second Kick (dt/2) --- +#ifdef NEXT_MPI + MPI_Request reqs[3]; + MPI_Iallgatherv(ps.x.data() + start, end - start, MPI_REAL_T, + ps.x.data(), counts.data(), displs.data(), MPI_REAL_T, + MPI_COMM_WORLD, &reqs[0]); + MPI_Iallgatherv(ps.y.data() + start, end - start, MPI_REAL_T, + ps.y.data(), counts.data(), displs.data(), MPI_REAL_T, + MPI_COMM_WORLD, &reqs[1]); + MPI_Iallgatherv(ps.z.data() + start, end - start, MPI_REAL_T, + ps.z.data(), counts.data(), displs.data(), MPI_REAL_T, + MPI_COMM_WORLD, &reqs[2]); + MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE); +#endif + + // --------------------------------------------------------- + // SECOND KICK + // --------------------------------------------------------- { - std::unique_ptr root = buildTree(); + auto root = buildTree(); #pragma omp parallel for schedule(dynamic, 64) - for (int i = 0; i < N; ++i) { - real ax = 0, ay = 0, az = 0; + for (int i = start; i < end; ++i) { + real ax = real(0), ay = real(0), az = real(0); bhAccel(root.get(), i, ps, theta, ax, ay, az); ps.vx[i] += ax * half; ps.vy[i] += ay * half; ps.vz[i] += az * half; } + +#ifdef NEXT_MPI + MPI_Request reqs2[3]; + MPI_Iallgatherv(ps.vx.data() + start, end - start, MPI_REAL_T, + ps.vx.data(), counts.data(), displs.data(), MPI_REAL_T, + MPI_COMM_WORLD, &reqs2[0]); + MPI_Iallgatherv(ps.vy.data() + start, end - start, MPI_REAL_T, + ps.vy.data(), counts.data(), displs.data(), MPI_REAL_T, + MPI_COMM_WORLD, &reqs2[1]); + MPI_Iallgatherv(ps.vz.data() + start, end - start, MPI_REAL_T, + ps.vz.data(), counts.data(), displs.data(), MPI_REAL_T, + MPI_COMM_WORLD, &reqs2[2]); + MPI_Waitall(3, reqs2, MPI_STATUSES_IGNORE); +#endif } } From 3b79adc100618d74801f6fb274cc5200f1b4d873 Mon Sep 17 00:00:00 2001 From: Timofey Zakharchuk Date: Sun, 15 Feb 2026 21:19:18 +0300 Subject: [PATCH 04/13] Update argparse.cpp --- argparse/argparse.cpp | 46 +++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/argparse/argparse.cpp b/argparse/argparse.cpp index 9fbd08a..a7611df 100644 --- a/argparse/argparse.cpp +++ b/argparse/argparse.cpp @@ -1,33 +1,59 @@ +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . + #include "argparse.hpp" #include #include +#ifdef NEXT_MPI + #include +#endif namespace next { -Arguments parse_arguments(int argc, char** argv) +Arguments parse_arguments(int argc, char** argv, int rank) { if (argc != 6) { - std::cerr << "Usage: next
\n"; + // Only rank 0 prints usage + if (rank == 0) { + std::cerr << "Usage: next
\n"; + } + +#ifdef NEXT_MPI + MPI_Finalize(); +#endif std::exit(1); } Arguments args; - args.input_file = argv[1]; - args.threads = std::stoi(argv[2]); - args.dt = std::stod(argv[3]); + args.input_file = argv[1]; + args.threads = std::stoi(argv[2]); + args.dt = std::stod(argv[3]); args.dump_interval = std::stod(argv[4]); std::string fmt = argv[5]; - if (fmt == "vtk") + if (fmt == "vtk") { args.format = OutputFormat::VTK; - else if (fmt == "vtu") + } else if (fmt == "vtu") { args.format = OutputFormat::VTU; - else if (fmt == "hdf5") + } else if (fmt == "hdf5") { args.format = OutputFormat::HDF5; - else { - std::cerr << "Choose a file format: vtk, vtu, or hdf5\n"; + } else { + if (rank == 0) { + std::cerr << "Choose a file format: vtk, vtu, or hdf5\n"; + } +#ifdef NEXT_MPI + MPI_Finalize(); +#endif std::exit(1); } From 01739b9de70afd7250afcbe24faf2c80580efd03 Mon Sep 17 00:00:00 2001 From: Timofey Zakharchuk Date: Sun, 15 Feb 2026 21:19:30 +0300 Subject: [PATCH 05/13] Update argparse.hpp --- argparse/argparse.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argparse/argparse.hpp b/argparse/argparse.hpp index 3e0f831..b81174d 100644 --- a/argparse/argparse.hpp +++ b/argparse/argparse.hpp @@ -17,6 +17,6 @@ struct Arguments { OutputFormat format; }; -Arguments parse_arguments(int argc, char** argv); +Arguments parse_arguments(int argc, char** argv, int rank); } From 7649eafdfe6b60464df5469429cb26679a785000 Mon Sep 17 00:00:00 2001 From: Timofey Zakharchuk Date: Sun, 15 Feb 2026 21:24:24 +0300 Subject: [PATCH 06/13] Create mpi.yml --- .github/workflows/mpi.yml | 53 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 .github/workflows/mpi.yml diff --git a/.github/workflows/mpi.yml b/.github/workflows/mpi.yml new file mode 100644 index 0000000..bd9097d --- /dev/null +++ b/.github/workflows/mpi.yml @@ -0,0 +1,53 @@ +name: MPI Galaxy Demo + +permissions: + contents: read + +on: + push: + branches: [ main ] + pull_request: + +concurrency: + group: mpi-${{ github.ref }} + cancel-in-progress: true + +jobs: + mpi-galaxy: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install dependencies (MPI + HDF5 + Python) + run: | + sudo apt-get update + sudo apt-get install -y mpich libhdf5-dev libomp-dev cmake build-essential python3 + + - name: Configure project (CMake) + run: | + mkdir -p build + cd build + cmake .. -DCMAKE_BUILD_TYPE=Release -DNEXT_FP64=ON -DNEXT_MPI=ON + + - name: Build project + run: | + cd build + cmake --build . -- -j$(nproc) + + - name: Run Galaxy demo (timed) + run: | + # Go back to project root + cd $GITHUB_WORKSPACE/examples/GalaxyDemo + python3 galaxy.py + # Run with 2 MPI ranks and 2 OpenMP threads, stop after 20s + OMP_NUM_THREADS=2 timeout 20s mpiexec -n 2 ../../build/next galaxy.txt 2 0.01 0.1 hdf5 + + - name: Upload simulation outputs + uses: actions/upload-artifact@v4 + with: + name: galaxy-dumps + path: | + examples/GalaxyDemo/dump_*.hdf5 + examples/GalaxyDemo/dump_*.xdmf From 9d4611c27c99757bace7534fe9f4880c76e725af Mon Sep 17 00:00:00 2001 From: Timofey Zakharchuk Date: Sun, 15 Feb 2026 21:29:08 +0300 Subject: [PATCH 07/13] Update step.h --- src/gravity/step.h | 63 +++++++++++++++------------------------------- 1 file changed, 20 insertions(+), 43 deletions(-) diff --git a/src/gravity/step.h b/src/gravity/step.h index cf114ff..d57f36f 100644 --- a/src/gravity/step.h +++ b/src/gravity/step.h @@ -20,9 +20,6 @@ #include #endif -/** - * @brief Performs a complete Leapfrog Step (Kick-Drift-Kick) using SoA data. - */ inline void Step(ParticleSystem &ps, real dt) { if (ps.size() == 0) return; @@ -30,15 +27,11 @@ inline void Step(ParticleSystem &ps, real dt) { const real half = dt * real(0.5); const int N = static_cast(ps.size()); - // --------------------------------------------------------- - // MPI SETUP - // --------------------------------------------------------- #ifdef NEXT_MPI int rank, size; MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); - // Select MPI datatype matching MPI_Datatype MPI_REAL_T; # ifdef NEXT_FP64 MPI_REAL_T = MPI_DOUBLE; @@ -52,14 +45,10 @@ inline void Step(ParticleSystem &ps, real dt) { int size = 1; #endif - // --------------------------------------------------------- - // DOMAIN DECOMPOSITION - // --------------------------------------------------------- const int start = (rank * N) / size; const int end = ((rank + 1) * N) / size; #ifdef NEXT_MPI - // Precompute counts and displacements for Allgatherv std::vector counts(size), displs(size); for (int r = 0; r < size; ++r) { const int s = (r * N) / size; @@ -69,9 +58,6 @@ inline void Step(ParticleSystem &ps, real dt) { } #endif - // --------------------------------------------------------- - // TREE BUILDER - // --------------------------------------------------------- auto buildTree = [&]() -> std::unique_ptr { struct BBox { real minx, miny, minz, maxx, maxy, maxz; }; BBox local{ real(1e30), real(1e30), real(1e30), @@ -87,16 +73,13 @@ inline void Step(ParticleSystem &ps, real dt) { } #ifdef NEXT_MPI - BBox global; real mins[3] = {local.minx, local.miny, local.minz}; real maxs[3] = {local.maxx, local.maxy, local.maxz}; MPI_Allreduce(MPI_IN_PLACE, mins, 3, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD); MPI_Allreduce(MPI_IN_PLACE, maxs, 3, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD); - global.minx = mins[0]; global.miny = mins[1]; global.minz = mins[2]; - global.maxx = maxs[0]; global.maxy = maxs[1]; global.maxz = maxs[2]; - + BBox global{mins[0], mins[1], mins[2], maxs[0], maxs[1], maxs[2]}; #else BBox global = local; #endif @@ -119,9 +102,7 @@ inline void Step(ParticleSystem &ps, real dt) { return root; }; - // --------------------------------------------------------- // FIRST KICK - // --------------------------------------------------------- { auto root = buildTree(); @@ -137,22 +118,20 @@ inline void Step(ParticleSystem &ps, real dt) { #ifdef NEXT_MPI MPI_Request reqs[3]; - MPI_Iallgatherv(ps.vx.data() + start, end - start, MPI_REAL_T, + MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T, ps.vx.data(), counts.data(), displs.data(), MPI_REAL_T, MPI_COMM_WORLD, &reqs[0]); - MPI_Iallgatherv(ps.vy.data() + start, end - start, MPI_REAL_T, + MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T, ps.vy.data(), counts.data(), displs.data(), MPI_REAL_T, MPI_COMM_WORLD, &reqs[1]); - MPI_Iallgatherv(ps.vz.data() + start, end - start, MPI_REAL_T, + MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T, ps.vz.data(), counts.data(), displs.data(), MPI_REAL_T, MPI_COMM_WORLD, &reqs[2]); MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE); #endif } - // --------------------------------------------------------- // DRIFT - // --------------------------------------------------------- #pragma omp parallel for schedule(static) for (int i = start; i < end; ++i) { ps.x[i] += ps.vx[i] * dt; @@ -161,22 +140,20 @@ inline void Step(ParticleSystem &ps, real dt) { } #ifdef NEXT_MPI - MPI_Request reqs[3]; - MPI_Iallgatherv(ps.x.data() + start, end - start, MPI_REAL_T, + MPI_Request reqs3[3]; + MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T, ps.x.data(), counts.data(), displs.data(), MPI_REAL_T, - MPI_COMM_WORLD, &reqs[0]); - MPI_Iallgatherv(ps.y.data() + start, end - start, MPI_REAL_T, + MPI_COMM_WORLD, &reqs3[0]); + MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T, ps.y.data(), counts.data(), displs.data(), MPI_REAL_T, - MPI_COMM_WORLD, &reqs[1]); - MPI_Iallgatherv(ps.z.data() + start, end - start, MPI_REAL_T, + MPI_COMM_WORLD, &reqs3[1]); + MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T, ps.z.data(), counts.data(), displs.data(), MPI_REAL_T, - MPI_COMM_WORLD, &reqs[2]); - MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE); + MPI_COMM_WORLD, &reqs3[2]); + MPI_Waitall(3, reqs3, MPI_STATUSES_IGNORE); #endif - // --------------------------------------------------------- // SECOND KICK - // --------------------------------------------------------- { auto root = buildTree(); @@ -191,17 +168,17 @@ inline void Step(ParticleSystem &ps, real dt) { } #ifdef NEXT_MPI - MPI_Request reqs2[3]; - MPI_Iallgatherv(ps.vx.data() + start, end - start, MPI_REAL_T, + MPI_Request reqs4[3]; + MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T, ps.vx.data(), counts.data(), displs.data(), MPI_REAL_T, - MPI_COMM_WORLD, &reqs2[0]); - MPI_Iallgatherv(ps.vy.data() + start, end - start, MPI_REAL_T, + MPI_COMM_WORLD, &reqs4[0]); + MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T, ps.vy.data(), counts.data(), displs.data(), MPI_REAL_T, - MPI_COMM_WORLD, &reqs2[1]); - MPI_Iallgatherv(ps.vz.data() + start, end - start, MPI_REAL_T, + MPI_COMM_WORLD, &reqs4[1]); + MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T, ps.vz.data(), counts.data(), displs.data(), MPI_REAL_T, - MPI_COMM_WORLD, &reqs2[2]); - MPI_Waitall(3, reqs2, MPI_STATUSES_IGNORE); + MPI_COMM_WORLD, &reqs4[2]); + MPI_Waitall(3, reqs4, MPI_STATUSES_IGNORE); #endif } } From 25cb9fe50e41e26ea1c0e9ecd8dffb7792ce66b6 Mon Sep 17 00:00:00 2001 From: Timofey Zakharchuk Date: Sun, 15 Feb 2026 21:32:28 +0300 Subject: [PATCH 08/13] Update begrun.cpp --- src/begrun.cpp | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/begrun.cpp b/src/begrun.cpp index 1550208..9b8f5ff 100644 --- a/src/begrun.cpp +++ b/src/begrun.cpp @@ -45,7 +45,6 @@ int main(int argc, char **argv) { MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); - log_once(rank, "MPI mode enabled (" + std::to_string(size) + " ranks)"); #endif H5Eset_auto(H5E_DEFAULT, nullptr, nullptr); @@ -63,17 +62,25 @@ int main(int argc, char **argv) { omp_set_num_threads(args.threads); - log_once(rank, BANNER); - log_once(rank, " Threads: " + std::to_string(args.threads)); + // Only rank 0 prints startup info + if (rank == 0 && omp_get_thread_num() == 0) { +#ifdef NEXT_MPI + std::cout << "MPI mode enabled (" << size << " ranks)" << std::endl; +#endif + std::cout << BANNER << std::endl; + std::cout << " Threads: " << args.threads << std::endl; #ifdef NEXT_FP64 - log_once(rank, " Precision: FP64"); + std::cout << " Precision: FP64" << std::endl; #elif defined(NEXT_FP32) - log_once(rank, " Precision: FP32"); + std::cout << " Precision: FP32" << std::endl; #endif + } // Load particles Particle particles = LoadParticlesFromFile(args.input_file); - log_once(rank, " Particles: " + std::to_string(particles.size())); + if (rank == 0 && omp_get_thread_num() == 0) { + std::cout << " Particles: " << particles.size() << std::endl; + } real simTime = 0; real nextDump = 0; @@ -94,9 +101,10 @@ int main(int argc, char **argv) { case OutputFormat::HDF5: out += ".hdf5"; SaveHDF5(particles, out); break; } - log_once(rank, "[Dump " + std::to_string(step) + - "] t = " + std::to_string(simTime) + - ", file: " + out); + if (rank == 0 && omp_get_thread_num() == 0) { + std::cout << "[Dump " << step << "] t = " << simTime + << ", file: " << out << std::endl; + } nextDump += args.dump_interval; step++; @@ -106,7 +114,9 @@ int main(int argc, char **argv) { if (std::cin.rdbuf()->in_avail() > 0) { std::cin >> command; if (command == 'q' || command == 'Q') { - log_once(rank, "Exiting..."); + if (rank == 0 && omp_get_thread_num() == 0) { + std::cout << "Exiting..." << std::endl; + } break; } } From 7a0e878b4aa647e7a190bb5d2d8c5a3c8b3ea91d Mon Sep 17 00:00:00 2001 From: Timofey Zakharchuk Date: Sun, 15 Feb 2026 21:45:02 +0300 Subject: [PATCH 09/13] Update begrun.cpp --- src/begrun.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/begrun.cpp b/src/begrun.cpp index 9b8f5ff..16f1d89 100644 --- a/src/begrun.cpp +++ b/src/begrun.cpp @@ -45,6 +45,12 @@ int main(int argc, char **argv) { MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &size); + + // Silence stdout/stderr for all ranks except 0 + if (rank != 0) { + fclose(stdout); + } + #endif H5Eset_auto(H5E_DEFAULT, nullptr, nullptr); From 981e02b2d6635811cee176d86664ef4c7d06ccd1 Mon Sep 17 00:00:00 2001 From: Timofey Zakharchuk Date: Sun, 15 Feb 2026 21:48:03 +0300 Subject: [PATCH 10/13] Update CMakeLists.txt --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 73eb43c..562100f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,7 +65,7 @@ add_executable(next ${SRC_FILES} ${ARGPARSE_FILES}) # ============================ # Vectorization reports # ============================ -option(ENABLE_VEC_REPORT "Enable compiler vectorization reports" ON) +option(ENABLE_VEC_REPORT "Enable compiler vectorization reports" OFF) if(ENABLE_VEC_REPORT) if(CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM") From d5ae3080e708af0e33a9adc8cc81dda0e9aa05df Mon Sep 17 00:00:00 2001 From: Timofey Zakharchuk Date: Sun, 15 Feb 2026 21:49:40 +0300 Subject: [PATCH 11/13] Update mpi.yml --- .github/workflows/mpi.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mpi.yml b/.github/workflows/mpi.yml index bd9097d..6676122 100644 --- a/.github/workflows/mpi.yml +++ b/.github/workflows/mpi.yml @@ -42,7 +42,7 @@ jobs: cd $GITHUB_WORKSPACE/examples/GalaxyDemo python3 galaxy.py # Run with 2 MPI ranks and 2 OpenMP threads, stop after 20s - OMP_NUM_THREADS=2 timeout 20s mpiexec -n 2 ../../build/next galaxy.txt 2 0.01 0.1 hdf5 + timeout 60s mpiexec -n 2 ./next galaxy.txt 1 0.01 0.1 hdf5 - name: Upload simulation outputs uses: actions/upload-artifact@v4 From fc6a4652b192896c97b4671447d190cea781f4d5 Mon Sep 17 00:00:00 2001 From: Timofey Zakharchuk Date: Sun, 15 Feb 2026 21:51:05 +0300 Subject: [PATCH 12/13] Update mpi.yml --- .github/workflows/mpi.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mpi.yml b/.github/workflows/mpi.yml index 6676122..05a19c9 100644 --- a/.github/workflows/mpi.yml +++ b/.github/workflows/mpi.yml @@ -42,7 +42,7 @@ jobs: cd $GITHUB_WORKSPACE/examples/GalaxyDemo python3 galaxy.py # Run with 2 MPI ranks and 2 OpenMP threads, stop after 20s - timeout 60s mpiexec -n 2 ./next galaxy.txt 1 0.01 0.1 hdf5 + timeout 60s mpiexec -n 2 ../../next galaxy.txt 1 0.01 0.1 hdf5 - name: Upload simulation outputs uses: actions/upload-artifact@v4 From d759c4198bc9a532d57aa286d1a57dc777e8b926 Mon Sep 17 00:00:00 2001 From: Timofey Zakharchuk Date: Sun, 15 Feb 2026 22:07:29 +0300 Subject: [PATCH 13/13] Update mpi.yml --- .github/workflows/mpi.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mpi.yml b/.github/workflows/mpi.yml index 05a19c9..3753672 100644 --- a/.github/workflows/mpi.yml +++ b/.github/workflows/mpi.yml @@ -42,7 +42,7 @@ jobs: cd $GITHUB_WORKSPACE/examples/GalaxyDemo python3 galaxy.py # Run with 2 MPI ranks and 2 OpenMP threads, stop after 20s - timeout 60s mpiexec -n 2 ../../next galaxy.txt 1 0.01 0.1 hdf5 + timeout 60s mpiexec -n 2 ../../next galaxy.txt 1 0.01 0.1 hdf5 || true - name: Upload simulation outputs uses: actions/upload-artifact@v4