diff --git a/.github/workflows/mpi.yml b/.github/workflows/mpi.yml
new file mode 100644
index 0000000..3753672
--- /dev/null
+++ b/.github/workflows/mpi.yml
@@ -0,0 +1,53 @@
+name: MPI Galaxy Demo
+
+permissions:
+ contents: read
+
+on:
+ push:
+ branches: [ main ]
+ pull_request:
+
+concurrency:
+ group: mpi-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ mpi-galaxy:
+ runs-on: ubuntu-latest
+
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v4
+
+ - name: Install dependencies (MPI + HDF5 + Python)
+ run: |
+ sudo apt-get update
+ sudo apt-get install -y mpich libhdf5-dev libomp-dev cmake build-essential python3
+
+ - name: Configure project (CMake)
+ run: |
+ mkdir -p build
+ cd build
+ cmake .. -DCMAKE_BUILD_TYPE=Release -DNEXT_FP64=ON -DNEXT_MPI=ON
+
+ - name: Build project
+ run: |
+ cd build
+ cmake --build . -- -j$(nproc)
+
+ - name: Run Galaxy demo (time-limited)
+ run: |
+ # Move into the GalaxyDemo example directory
+ cd $GITHUB_WORKSPACE/examples/GalaxyDemo
+ python3 galaxy.py
+ # Run with 2 MPI ranks and 1 OpenMP thread; kill after 60s
+ timeout 60s mpiexec -n 2 ../../next galaxy.txt 1 0.01 0.1 hdf5 || true
+
+ - name: Upload simulation outputs
+ uses: actions/upload-artifact@v4
+ with:
+ name: galaxy-dumps
+ path: |
+ examples/GalaxyDemo/dump_*.hdf5
+ examples/GalaxyDemo/dump_*.xdmf
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 452cd0c..562100f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,6 +11,12 @@ if (MINGW)
elseif (MSVC)
message(STATUS "Detected MSVC — using MSVC settings")
set(USING_MSVC TRUE)
+elseif (APPLE)
+ message(STATUS "Detected macOS — limited MPI support")
+ set(USING_APPLE TRUE)
+elseif (UNIX)
+ message(STATUS "Detected Linux/UNIX — using standard settings")
+ set(USING_UNIX TRUE)
endif()
# ============================
@@ -18,19 +24,15 @@ endif()
# ============================
option(NEXT_FP32 "Use 32-bit floats (scalar)" OFF)
option(NEXT_FP64 "Use 64-bit floats (scalar)" ON)
+option(NEXT_MPI "Enable MPI support" OFF)
-# Option: copy final executable to source dir
option(NEXT_COPY_TO_CMAKE_SOURCE_DIR "Copy final executable from build dir to source dir" ON)
# ============================
# Precision modes
# ============================
-set(NEXT_MODES
- NEXT_FP32
- NEXT_FP64
-)
+set(NEXT_MODES NEXT_FP32 NEXT_FP64)
-# Count enabled modes
set(NEXT_MODE_COUNT 0)
foreach(m ${NEXT_MODES})
if(${m})
@@ -38,14 +40,12 @@ foreach(m ${NEXT_MODES})
endif()
endforeach()
-# Enforce exactly one mode
if(NEXT_MODE_COUNT EQUAL 0)
message(FATAL_ERROR "You must enable NEXT_FP64 or NEXT_FP32.")
elseif(NEXT_MODE_COUNT GREATER 1)
message(FATAL_ERROR "Enable only one precision mode.")
endif()
-# Apply compile definitions
if(NEXT_FP32)
add_compile_definitions(NEXT_FP32)
elseif(NEXT_FP64)
@@ -62,7 +62,10 @@ file(GLOB ARGPARSE_FILES ${CMAKE_SOURCE_DIR}/argparse/*.cpp)
add_executable(next ${SRC_FILES} ${ARGPARSE_FILES})
-option(ENABLE_VEC_REPORT "Enable compiler vectorization reports" ON)
+# ============================
+# Vectorization reports
+# ============================
+option(ENABLE_VEC_REPORT "Enable compiler vectorization reports" OFF)
if(ENABLE_VEC_REPORT)
if(CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
@@ -74,38 +77,56 @@ if(ENABLE_VEC_REPORT)
endif()
endif()
-
# ============================
# OpenMP
# ============================
find_package(OpenMP QUIET)
if(OpenMP_CXX_FOUND)
message(STATUS "OpenMP detected — enabling multithreading.")
-
if(MSVC)
- # Force MSVC to use the modern LLVM backend (supports OpenMP 3.0+ and size_t)
- # We add it as a compiler option because MSVC's default find_package
- # often defaults to the legacy /openmp flag.
target_compile_options(next PRIVATE /openmp:llvm)
else()
target_link_libraries(next PRIVATE OpenMP::OpenMP_CXX)
endif()
-
else()
message(STATUS "OpenMP not found — building in single-threaded mode.")
endif()
+# ============================
+# MPI detection
+# ============================
+if(NEXT_MPI)
+ if(USING_UNIX)
+ # Linux: system MPI (e.g. MPICH or OpenMPI)
+ find_package(MPI REQUIRED)
+ message(STATUS "MPI (OpenMPI) detected — enabling distributed memory parallelism.")
+ target_compile_definitions(next PRIVATE NEXT_MPI)
+ target_link_libraries(next PRIVATE MPI::MPI_CXX)
+
+ elseif(USING_MSVC OR USING_MINGW)
+ # Windows: Microsoft MPI
+ find_package(MPI REQUIRED)
+ message(STATUS "MPI (MS-MPI) detected — enabling distributed memory parallelism.")
+ target_compile_definitions(next PRIVATE NEXT_MPI)
+ target_link_libraries(next PRIVATE MPI::MPI_CXX)
+
+ elseif(USING_APPLE)
+ message(WARNING "MPI requested, but macOS does not ship MPI by default. Please install OpenMPI via Homebrew (brew install open-mpi).")
+ find_package(MPI REQUIRED)
+ target_compile_definitions(next PRIVATE NEXT_MPI)
+ target_link_libraries(next PRIVATE MPI::MPI_CXX)
+ endif()
+endif()
+
# ============================
# HDF5 detection
# ============================
-# MSVC: use vcpkg or prebuilt binaries
if (USING_MSVC AND DEFINED ENV{HDF5_DIR})
set(HDF5_ROOT "$ENV{HDF5_DIR}")
set(CMAKE_PREFIX_PATH "${HDF5_ROOT};${CMAKE_PREFIX_PATH}")
endif()
if (USING_MINGW)
- # MSYS2 MinGW installs HDF5 here
set(CMAKE_PREFIX_PATH "C:/tools/msys64/mingw64")
message(STATUS "Using MSYS2 MinGW HDF5 path: ${CMAKE_PREFIX_PATH}")
endif()
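
Note: the new NEXT_MPI option only adds the NEXT_MPI compile definition and links MPI::MPI_CXX; the sources then guard every MPI call so the same tree still builds with the option OFF. A minimal, hypothetical sketch of that guard pattern (illustrative only, not taken from this patch):

    // Sketch of the NEXT_MPI conditional-compilation pattern used by the sources.
    #ifdef NEXT_MPI
      #include <mpi.h>
    #endif
    #include <cstdio>

    int main(int argc, char **argv) {
        int rank = 0, size = 1;          // defaults for the non-MPI build
    #ifdef NEXT_MPI
        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &size);
    #endif
        if (rank == 0)
            std::printf("running on %d rank(s)\n", size);
    #ifdef NEXT_MPI
        MPI_Finalize();
    #endif
        return 0;
    }
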
diff --git a/argparse/argparse.cpp b/argparse/argparse.cpp
index 9fbd08a..a7611df 100644
--- a/argparse/argparse.cpp
+++ b/argparse/argparse.cpp
@@ -1,33 +1,59 @@
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
#include "argparse.hpp"
#include <iostream>
#include <string>
+#ifdef NEXT_MPI
+ #include <mpi.h>
+#endif
namespace next {
-Arguments parse_arguments(int argc, char** argv)
+Arguments parse_arguments(int argc, char** argv, int rank)
{
if (argc != 6) {
- std::cerr << "Usage: next \n";
+ // Only rank 0 prints usage
+ if (rank == 0) {
+ std::cerr << "Usage: next \n";
+ }
+
+#ifdef NEXT_MPI
+ MPI_Finalize();
+#endif
std::exit(1);
}
Arguments args;
- args.input_file = argv[1];
- args.threads = std::stoi(argv[2]);
- args.dt = std::stod(argv[3]);
+ args.input_file = argv[1];
+ args.threads = std::stoi(argv[2]);
+ args.dt = std::stod(argv[3]);
args.dump_interval = std::stod(argv[4]);
std::string fmt = argv[5];
- if (fmt == "vtk")
+ if (fmt == "vtk") {
args.format = OutputFormat::VTK;
- else if (fmt == "vtu")
+ } else if (fmt == "vtu") {
args.format = OutputFormat::VTU;
- else if (fmt == "hdf5")
+ } else if (fmt == "hdf5") {
args.format = OutputFormat::HDF5;
- else {
- std::cerr << "Choose a file format: vtk, vtu, or hdf5\n";
+ } else {
+ if (rank == 0) {
+ std::cerr << "Choose a file format: vtk, vtu, or hdf5\n";
+ }
+#ifdef NEXT_MPI
+ MPI_Finalize();
+#endif
std::exit(1);
}
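
The extra rank argument assumes MPI has already been initialized by the caller, because parse_arguments() may call MPI_Finalize() before exiting on bad input. A short sketch of the expected call order (mirrors begrun.cpp below; the wrapping main is illustrative):

    // Sketch of the call order the new signature assumes.
    #include "argparse.hpp"
    #ifdef NEXT_MPI
      #include <mpi.h>
    #endif

    int main(int argc, char **argv) {
        int rank = 0;
    #ifdef NEXT_MPI
        MPI_Init(&argc, &argv);                // must run before parse_arguments(),
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);  // which may finalize MPI and exit on bad input
    #endif
        next::Arguments args = next::parse_arguments(argc, argv, rank);
        // ... simulation ...
    #ifdef NEXT_MPI
        MPI_Finalize();
    #endif
        return 0;
    }
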
diff --git a/argparse/argparse.hpp b/argparse/argparse.hpp
index 3e0f831..b81174d 100644
--- a/argparse/argparse.hpp
+++ b/argparse/argparse.hpp
@@ -17,6 +17,6 @@ struct Arguments {
OutputFormat format;
};
-Arguments parse_arguments(int argc, char** argv);
+Arguments parse_arguments(int argc, char** argv, int rank);
}
diff --git a/src/begrun.cpp b/src/begrun.cpp
index 47dea18..16f1d89 100644
--- a/src/begrun.cpp
+++ b/src/begrun.cpp
@@ -22,13 +22,42 @@
#include <iostream>
#include <string>
#include <omp.h>
+#ifdef NEXT_MPI
+ #include <mpi.h>
+#endif
+#include "hdf5.h"
+
using next::OutputFormat;
+// Helper: only rank 0, thread 0 prints
+inline void log_once(int rank, const std::string &msg) {
+ if (rank == 0 && omp_get_thread_num() == 0) {
+ std::cout << msg << std::endl;
+ }
+}
+
int main(int argc, char **argv) {
- auto args = next::parse_arguments(argc, argv);
+ int rank = 0;
+ int size = 1;
+
+#ifdef NEXT_MPI
+ MPI_Init(&argc, &argv);
+ MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+ MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+ // Silence stdout for all ranks except 0
+ if (rank != 0) {
+ fclose(stdout);
+ }
+
+#endif
+
+ H5Eset_auto(H5E_DEFAULT, nullptr, nullptr); // suppress HDF5's automatic error printing
- static constexpr const char *BANNER = R"NEXTBANNER(
+ auto args = next::parse_arguments(argc, argv, rank);
+
+ static constexpr const char *BANNER = R"NEXTBANNER(
_ _ ________ _________
| \ | | ____\ \ / /__ __|
| \| | |__ \ V / | |
@@ -37,73 +66,71 @@ int main(int argc, char **argv) {
|_| \_|______/_/ \_\ |_|
)NEXTBANNER";
- std::cout << BANNER << '\n';
-
- // Set threads and report
- omp_set_num_threads(args.threads);
- std::cout << " Threads: " << args.threads << "\n";
+ omp_set_num_threads(args.threads);
+ // Only rank 0 prints startup info
+ if (rank == 0 && omp_get_thread_num() == 0) {
+#ifdef NEXT_MPI
+ std::cout << "MPI mode enabled (" << size << " ranks)" << std::endl;
+#endif
+ std::cout << BANNER << std::endl;
+ std::cout << " Threads: " << args.threads << std::endl;
#ifdef NEXT_FP64
- std::cout << " Precision: FP64\n";
+ std::cout << " Precision: FP64" << std::endl;
#elif defined(NEXT_FP32)
- std::cout << " Precision: FP32\n";
+ std::cout << " Precision: FP32" << std::endl;
#endif
+ }
- // --- SoA UPDATE ---
- // LoadParticlesFromFile now returns a single 'Particle' struct
- // which internally contains all the parallel vectors.
- Particle particles = LoadParticlesFromFile(args.input_file);
-
- std::cout << " Particles: " << particles.size() << "\n";
-
- real simTime = 0;
- real nextDump = 0;
- int step = 0;
- char command;
-
- while (true) {
- // Both computeAdaptiveDt and Step now take the SoA object by reference
- real dtAdaptive = computeAdaptiveDt(particles, args.dt);
- Step(particles, dtAdaptive);
-
- simTime += dtAdaptive;
-
- if (simTime >= nextDump) {
- std::string out = "dump_" + std::to_string(step);
-
- switch (args.format) {
- case OutputFormat::VTK:
- out += ".vtk";
- SaveVTK(particles, out);
- break;
-
- case OutputFormat::VTU:
- out += ".vtu";
- SaveVTU(particles, out);
- break;
-
- case OutputFormat::HDF5:
- out += ".hdf5";
- SaveHDF5(particles, out);
- break;
- }
-
- std::cout << "[Dump " << step << "] t = " << simTime
- << ", file: " << out << "\n";
-
- nextDump += args.dump_interval;
- step++;
+ // Load particles
+ Particle particles = LoadParticlesFromFile(args.input_file);
+ if (rank == 0 && omp_get_thread_num() == 0) {
+ std::cout << " Particles: " << particles.size() << std::endl;
}
- // Non-blocking exit check
- if (std::cin.rdbuf()->in_avail() > 0) {
- std::cin >> command;
- if (command == 'q' || command == 'Q') {
- std::cout << "Exiting...\n";
- break;
- }
+ real simTime = 0;
+ real nextDump = 0;
+ int step = 0;
+ char command;
+
+ while (true) {
+ real dtAdaptive = computeAdaptiveDt(particles, args.dt);
+ Step(particles, dtAdaptive);
+ simTime += dtAdaptive;
+
+ if (simTime >= nextDump) {
+ std::string out = "dump_" + std::to_string(step);
+
+ switch (args.format) {
+ case OutputFormat::VTK: out += ".vtk"; SaveVTK(particles, out); break;
+ case OutputFormat::VTU: out += ".vtu"; SaveVTU(particles, out); break;
+ case OutputFormat::HDF5: out += ".hdf5"; SaveHDF5(particles, out); break;
+ }
+
+ if (rank == 0 && omp_get_thread_num() == 0) {
+ std::cout << "[Dump " << step << "] t = " << simTime
+ << ", file: " << out << std::endl;
+ }
+
+ nextDump += args.dump_interval;
+ step++;
+ }
+
+ // Non-blocking exit check
+ if (std::cin.rdbuf()->in_avail() > 0) {
+ std::cin >> command;
+ if (command == 'q' || command == 'Q') {
+ if (rank == 0 && omp_get_thread_num() == 0) {
+ std::cout << "Exiting..." << std::endl;
+ }
+ break;
+ }
+ }
}
- }
- return 0;
+#ifdef NEXT_MPI
+ MPI_Finalize();
+#endif
+
+ return 0;
}
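
For reference, the new log_once() helper encapsulates the same rank 0 / thread 0 check used inline above. A standalone sketch of how it behaves (the surrounding main is hypothetical):

    // Only rank 0, thread 0 prints, so output is not duplicated across
    // MPI ranks or OpenMP threads.
    #include <iostream>
    #include <string>
    #include <omp.h>

    inline void log_once(int rank, const std::string &msg) {
        if (rank == 0 && omp_get_thread_num() == 0) {
            std::cout << msg << std::endl;
        }
    }

    int main() {
        int rank = 0;  // would come from MPI_Comm_rank in the MPI build
        #pragma omp parallel
        log_once(rank, "printed once, not once per thread");
        return 0;
    }
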
diff --git a/src/gravity/step.h b/src/gravity/step.h
index 96395d3..d57f36f 100644
--- a/src/gravity/step.h
+++ b/src/gravity/step.h
@@ -16,82 +16,169 @@
#include <algorithm>
#include <memory>
#include <vector>
+#ifdef NEXT_MPI
+ #include
+#endif
-/**
- * @brief Performs a complete Leapfrog Step (Kick-Drift-Kick) using SoA data.
- */
inline void Step(ParticleSystem &ps, real dt) {
if (ps.size() == 0) return;
- const real theta = 0.5;
- const real half = dt * real(0.5);
- const int N = static_cast<int>(ps.size());
+ const real theta = real(0.5);
+ const real half = dt * real(0.5);
+ const int N = static_cast<int>(ps.size());
+
+#ifdef NEXT_MPI
+ int rank, size;
+ MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+ MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+ MPI_Datatype MPI_REAL_T;
+# ifdef NEXT_FP64
+ MPI_REAL_T = MPI_DOUBLE;
+# elif defined(NEXT_FP32)
+ MPI_REAL_T = MPI_FLOAT;
+# else
+# error "Define NEXT_FP32 or NEXT_FP64 for 'real' type."
+# endif
+#else
+ int rank = 0;
+ int size = 1;
+#endif
+
+ const int start = (rank * N) / size;
+ const int end = ((rank + 1) * N) / size;
+
+#ifdef NEXT_MPI
+ std::vector<int> counts(size), displs(size);
+ for (int r = 0; r < size; ++r) {
+ const int s = (r * N) / size;
+ const int e = ((r + 1) * N) / size;
+ counts[r] = e - s;
+ displs[r] = s;
+ }
+#endif
- // Helper lambda to build the tree using the ParticleSystem indices
auto buildTree = [&]() -> std::unique_ptr {
- real minx = 1e30, miny = 1e30, minz = 1e30;
- real maxx = -1e30, maxy = -1e30, maxz = -1e30;
+ struct BBox { real minx, miny, minz, maxx, maxy, maxz; };
+ BBox local{ real(1e30), real(1e30), real(1e30),
+ real(-1e30), real(-1e30), real(-1e30) };
- // Bounding box calculation (SoA access is very fast here)
for (int i = 0; i < N; ++i) {
- minx = std::min(minx, ps.x[i]); miny = std::min(miny, ps.y[i]); minz = std::min(minz, ps.z[i]);
- maxx = std::max(maxx, ps.x[i]); maxy = std::max(maxy, ps.y[i]); maxz = std::max(maxz, ps.z[i]);
+ local.minx = std::min(local.minx, ps.x[i]);
+ local.miny = std::min(local.miny, ps.y[i]);
+ local.minz = std::min(local.minz, ps.z[i]);
+ local.maxx = std::max(local.maxx, ps.x[i]);
+ local.maxy = std::max(local.maxy, ps.y[i]);
+ local.maxz = std::max(local.maxz, ps.z[i]);
}
- real cx = (minx + maxx) * 0.5;
- real cy = (miny + maxy) * 0.5;
- real cz = (minz + maxz) * 0.5;
- real size = std::max({maxx - minx, maxy - miny, maxz - minz}) * 0.5;
+#ifdef NEXT_MPI
+ real mins[3] = {local.minx, local.miny, local.minz};
+ real maxs[3] = {local.maxx, local.maxy, local.maxz};
+
+ MPI_Allreduce(MPI_IN_PLACE, mins, 3, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
+ MPI_Allreduce(MPI_IN_PLACE, maxs, 3, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
+
+ BBox global{mins[0], mins[1], mins[2], maxs[0], maxs[1], maxs[2]};
+#else
+ BBox global = local;
+#endif
- if (size <= 0) size = 1.0;
+ const real cx = (global.minx + global.maxx) * real(0.5);
+ const real cy = (global.miny + global.maxy) * real(0.5);
+ const real cz = (global.minz + global.maxz) * real(0.5);
+ real size = std::max({global.maxx - global.minx,
+ global.maxy - global.miny,
+ global.maxz - global.minz}) * real(0.5);
+
+ if (size <= real(0)) size = real(1.0);
auto root = std::make_unique(cx, cy, cz, size);
- // Insert particle indices 0 to N-1
- for (int i = 0; i < N; ++i) {
+ for (int i = 0; i < N; ++i)
root->insert(i, ps);
- }
root->computeMass(ps);
return root;
};
- // --- First Kick (dt/2) ---
+ // FIRST KICK
{
- std::unique_ptr root = buildTree();
+ auto root = buildTree();
#pragma omp parallel for schedule(dynamic, 64)
- for (int i = 0; i < N; ++i) {
- real ax = 0, ay = 0, az = 0;
+ for (int i = start; i < end; ++i) {
+ real ax = real(0), ay = real(0), az = real(0);
bhAccel(root.get(), i, ps, theta, ax, ay, az);
ps.vx[i] += ax * half;
ps.vy[i] += ay * half;
ps.vz[i] += az * half;
}
+
+#ifdef NEXT_MPI
+ MPI_Request reqs[3];
+ MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+ ps.vx.data(), counts.data(), displs.data(), MPI_REAL_T,
+ MPI_COMM_WORLD, &reqs[0]);
+ MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+ ps.vy.data(), counts.data(), displs.data(), MPI_REAL_T,
+ MPI_COMM_WORLD, &reqs[1]);
+ MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+ ps.vz.data(), counts.data(), displs.data(), MPI_REAL_T,
+ MPI_COMM_WORLD, &reqs[2]);
+ MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE);
+#endif
}
- // --- Drift (dt) ---
- // Contiguous memory access makes this loop ideal for SIMD
+ // DRIFT
#pragma omp parallel for schedule(static)
- for (int i = 0; i < N; ++i) {
+ for (int i = start; i < end; ++i) {
ps.x[i] += ps.vx[i] * dt;
ps.y[i] += ps.vy[i] * dt;
ps.z[i] += ps.vz[i] * dt;
}
- // --- Second Kick (dt/2) ---
+#ifdef NEXT_MPI
+ MPI_Request reqs3[3];
+ MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+ ps.x.data(), counts.data(), displs.data(), MPI_REAL_T,
+ MPI_COMM_WORLD, &reqs3[0]);
+ MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+ ps.y.data(), counts.data(), displs.data(), MPI_REAL_T,
+ MPI_COMM_WORLD, &reqs3[1]);
+ MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+ ps.z.data(), counts.data(), displs.data(), MPI_REAL_T,
+ MPI_COMM_WORLD, &reqs3[2]);
+ MPI_Waitall(3, reqs3, MPI_STATUSES_IGNORE);
+#endif
+
+ // SECOND KICK
{
- std::unique_ptr root = buildTree();
+ auto root = buildTree();
#pragma omp parallel for schedule(dynamic, 64)
- for (int i = 0; i < N; ++i) {
- real ax = 0, ay = 0, az = 0;
+ for (int i = start; i < end; ++i) {
+ real ax = real(0), ay = real(0), az = real(0);
bhAccel(root.get(), i, ps, theta, ax, ay, az);
ps.vx[i] += ax * half;
ps.vy[i] += ay * half;
ps.vz[i] += az * half;
}
+
+#ifdef NEXT_MPI
+ MPI_Request reqs4[3];
+ MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+ ps.vx.data(), counts.data(), displs.data(), MPI_REAL_T,
+ MPI_COMM_WORLD, &reqs4[0]);
+ MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+ ps.vy.data(), counts.data(), displs.data(), MPI_REAL_T,
+ MPI_COMM_WORLD, &reqs4[1]);
+ MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+ ps.vz.data(), counts.data(), displs.data(), MPI_REAL_T,
+ MPI_COMM_WORLD, &reqs4[2]);
+ MPI_Waitall(3, reqs4, MPI_STATUSES_IGNORE);
+#endif
}
}
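
Each rank integrates only its block [start, end), and the updated coordinates and velocities are stitched back together with in-place all-gathers; the counts/displs arithmetic tiles [0, N) exactly even when N is not divisible by the rank count. A tiny standalone check of that partition (illustrative only):

    // Toy check of the block partition used in Step(): N = 10 particles, 3 ranks.
    #include <cstdio>

    int main() {
        const int N = 10, size = 3;
        for (int r = 0; r < size; ++r) {
            const int s = (r * N) / size;        // displs[r]
            const int e = ((r + 1) * N) / size;  // s + counts[r]
            std::printf("rank %d owns [%d, %d)  count=%d\n", r, s, e, e - s);
        }
        // Prints [0,3), [3,6), [6,10): the blocks cover [0,N) without gaps or overlap,
        // which is what the in-place MPI_Iallgatherv calls rely on to reassemble x/v.
        return 0;
    }
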