diff --git a/.github/workflows/mpi.yml b/.github/workflows/mpi.yml
new file mode 100644
index 0000000..3753672
--- /dev/null
+++ b/.github/workflows/mpi.yml
@@ -0,0 +1,53 @@
+name: MPI Galaxy Demo
+
+permissions:
+  contents: read
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+
+concurrency:
+  group: mpi-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  mpi-galaxy:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install dependencies (MPI + HDF5 + Python)
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y mpich libhdf5-dev libomp-dev cmake build-essential python3
+
+      - name: Configure project (CMake)
+        run: |
+          mkdir -p build
+          cd build
+          cmake .. -DCMAKE_BUILD_TYPE=Release -DNEXT_FP64=ON -DNEXT_MPI=ON
+
+      - name: Build project
+        run: |
+          cd build
+          cmake --build . -- -j$(nproc)
+
+      - name: Run Galaxy demo (timed)
+        run: |
+          # Change into the demo directory
+          cd $GITHUB_WORKSPACE/examples/GalaxyDemo
+          python3 galaxy.py
+          # Run with 2 MPI ranks and 1 OpenMP thread; kill the run after 60 s
+          timeout 60s mpiexec -n 2 ../../next galaxy.txt 1 0.01 0.1 hdf5 || true
+
+      - name: Upload simulation outputs
+        uses: actions/upload-artifact@v4
+        with:
+          name: galaxy-dumps
+          path: |
+            examples/GalaxyDemo/dump_*.hdf5
+            examples/GalaxyDemo/dump_*.xdmf
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 452cd0c..562100f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,6 +11,12 @@ if (MINGW)
 elseif (MSVC)
     message(STATUS "Detected MSVC — using MSVC settings")
     set(USING_MSVC TRUE)
+elseif (APPLE)
+    message(STATUS "Detected macOS — limited MPI support")
+    set(USING_APPLE TRUE)
+elseif (UNIX)
+    message(STATUS "Detected Linux/UNIX — using standard settings")
+    set(USING_UNIX TRUE)
 endif()
 
 # ============================
@@ -18,19 +24,15 @@ endif()
 # ============================
 option(NEXT_FP32 "Use 32-bit floats (scalar)" OFF)
 option(NEXT_FP64 "Use 64-bit floats (scalar)" ON)
+option(NEXT_MPI "Enable MPI support" OFF)
 
-# Option: copy final executable to source dir
 option(NEXT_COPY_TO_CMAKE_SOURCE_DIR "Copy final executable from build dir to source dir" ON)
 
 # ============================
 # Precision modes
 # ============================
-set(NEXT_MODES
-    NEXT_FP32
-    NEXT_FP64
-)
+set(NEXT_MODES NEXT_FP32 NEXT_FP64)
 
-# Count enabled modes
 set(NEXT_MODE_COUNT 0)
 foreach(m ${NEXT_MODES})
     if(${m})
@@ -38,14 +40,12 @@ foreach(m ${NEXT_MODES})
     endif()
 endforeach()
 
-# Enforce exactly one mode
 if(NEXT_MODE_COUNT EQUAL 0)
     message(FATAL_ERROR "You must enable NEXT_FP64 or NEXT_FP32.")
 elseif(NEXT_MODE_COUNT GREATER 1)
     message(FATAL_ERROR "Enable only one precision mode.")
 endif()
 
-# Apply compile definitions
 if(NEXT_FP32)
     add_compile_definitions(NEXT_FP32)
 elseif(NEXT_FP64)
@@ -62,7 +62,10 @@ file(GLOB ARGPARSE_FILES ${CMAKE_SOURCE_DIR}/argparse/*.cpp)
 
 add_executable(next ${SRC_FILES} ${ARGPARSE_FILES})
 
-option(ENABLE_VEC_REPORT "Enable compiler vectorization reports" ON)
+# ============================
+# Vectorization reports
+# ============================
+option(ENABLE_VEC_REPORT "Enable compiler vectorization reports" OFF)
 
 if(ENABLE_VEC_REPORT)
     if(CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
@@ -74,38 +77,56 @@ if(ENABLE_VEC_REPORT)
     endif()
 endif()
 
-
 # ============================
 # OpenMP
 # ============================
 find_package(OpenMP QUIET)
 if(OpenMP_CXX_FOUND)
     message(STATUS "OpenMP detected — enabling multithreading.")
-
     if(MSVC)
-        # Force MSVC to use the modern LLVM backend (supports OpenMP 3.0+ and size_t)
-        # We add it as a compiler option because MSVC's default find_package
-        # often defaults to the legacy /openmp flag.
         target_compile_options(next PRIVATE /openmp:llvm)
     else()
         target_link_libraries(next PRIVATE OpenMP::OpenMP_CXX)
     endif()
-
 else()
     message(STATUS "OpenMP not found — building in single-threaded mode.")
 endif()
 
+# ============================
+# MPI detection
+# ============================
+if(NEXT_MPI)
+    if(USING_UNIX)
+        # Linux/UNIX: system MPI (OpenMPI, MPICH, ...)
+        find_package(MPI REQUIRED)
+        message(STATUS "MPI detected — enabling distributed memory parallelism.")
+        target_compile_definitions(next PRIVATE NEXT_MPI)
+        target_link_libraries(next PRIVATE MPI::MPI_CXX)
+
+    elseif(USING_MSVC OR USING_MINGW)
+        # Windows: Microsoft MPI
+        find_package(MPI REQUIRED)
+        message(STATUS "MPI (MS-MPI) detected — enabling distributed memory parallelism.")
+        target_compile_definitions(next PRIVATE NEXT_MPI)
+        target_link_libraries(next PRIVATE MPI::MPI_CXX)
+
+    elseif(USING_APPLE)
+        message(WARNING "MPI requested, but macOS does not ship MPI by default. Please install OpenMPI via Homebrew (brew install open-mpi).")
+        find_package(MPI REQUIRED)
+        target_compile_definitions(next PRIVATE NEXT_MPI)
+        target_link_libraries(next PRIVATE MPI::MPI_CXX)
+    endif()
+endif()
+
 # ============================
 # HDF5 detection
 # ============================
-# MSVC: use vcpkg or prebuilt binaries
 if (USING_MSVC AND DEFINED ENV{HDF5_DIR})
     set(HDF5_ROOT "$ENV{HDF5_DIR}")
     set(CMAKE_PREFIX_PATH "${HDF5_ROOT};${CMAKE_PREFIX_PATH}")
 endif()
 
 if (USING_MINGW)
-    # MSYS2 MinGW installs HDF5 here
     set(CMAKE_PREFIX_PATH "C:/tools/msys64/mingw64")
     message(STATUS "Using MSYS2 MinGW HDF5 path: ${CMAKE_PREFIX_PATH}")
 endif()
diff --git a/argparse/argparse.cpp b/argparse/argparse.cpp
index 9fbd08a..a7611df 100644
--- a/argparse/argparse.cpp
+++ b/argparse/argparse.cpp
@@ -1,33 +1,59 @@
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
 #include "argparse.hpp"
 #include <iostream>
 #include <string>
+#ifdef NEXT_MPI
+    #include <mpi.h>
+#endif
 
 namespace next {
 
-Arguments parse_arguments(int argc, char** argv)
+Arguments parse_arguments(int argc, char** argv, int rank)
 {
     if (argc != 6) {
-        std::cerr << "Usage: next <input_file> <threads> <dt> <dump_interval> <format>\n";
+        // Only rank 0 prints usage
+        if (rank == 0) {
+            std::cerr << "Usage: next <input_file> <threads> <dt> <dump_interval> <format>\n";
+        }
+
+#ifdef NEXT_MPI
+        MPI_Finalize();
+#endif
         std::exit(1);
     }
 
     Arguments args;
-    args.input_file = argv[1];
-    args.threads = std::stoi(argv[2]);
-    args.dt = std::stod(argv[3]);
+    args.input_file    = argv[1];
+    args.threads       = std::stoi(argv[2]);
+    args.dt            = std::stod(argv[3]);
     args.dump_interval = std::stod(argv[4]);
 
     std::string fmt = argv[5];
-    if (fmt == "vtk")
+    if (fmt == "vtk") {
         args.format = OutputFormat::VTK;
-    else if (fmt == "vtu")
+    } else if (fmt == "vtu") {
         args.format = OutputFormat::VTU;
-    else if (fmt == "hdf5")
+    } else if (fmt == "hdf5") {
         args.format = OutputFormat::HDF5;
-    else {
-        std::cerr << "Choose a file format: vtk, vtu, or hdf5\n";
+    } else {
+        if (rank == 0) {
+            std::cerr << "Choose a file format: vtk, vtu, or hdf5\n";
+        }
+#ifdef NEXT_MPI
+        MPI_Finalize();
+#endif
         std::exit(1);
     }
diff --git a/argparse/argparse.hpp b/argparse/argparse.hpp
index 3e0f831..b81174d 100644
--- a/argparse/argparse.hpp
+++ b/argparse/argparse.hpp
@@ -17,6 +17,6 @@ struct Arguments {
     OutputFormat format;
 };
 
-Arguments parse_arguments(int argc, char** argv);
+Arguments parse_arguments(int argc, char** argv, int rank);
 
 }
diff --git a/src/begrun.cpp b/src/begrun.cpp
index 47dea18..16f1d89 100644
--- a/src/begrun.cpp
+++ b/src/begrun.cpp
@@ -22,13 +22,42 @@
 #include <iostream>
 #include <string>
 #include <omp.h>
+#ifdef NEXT_MPI
+    #include <mpi.h>
+#endif
+#include "hdf5.h"
 
 using next::OutputFormat;
 
+// Helper: only rank 0, thread 0 prints
+inline void log_once(int rank, const std::string &msg) {
+    if (rank == 0 && omp_get_thread_num() == 0) {
+        std::cout << msg << std::endl;
+    }
+}
+
 int main(int argc, char **argv) {
-    auto args = next::parse_arguments(argc, argv);
+    int rank = 0;
+    int size = 1;
+
+#ifdef NEXT_MPI
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // Silence stdout for all ranks except 0
+    if (rank != 0) {
+        fclose(stdout);
+    }
+#endif
+
+    // Suppress HDF5's automatic error-stack printing
+    H5Eset_auto(H5E_DEFAULT, nullptr, nullptr);
 
-    static constexpr const char *BANNER = R"NEXTBANNER(
+    auto args = next::parse_arguments(argc, argv, rank);
+
+    static constexpr const char *BANNER = R"NEXTBANNER(
   _   _ ________   _________
  | \ | |  ____\ \ / /__   __|
  |  \| | |__   \ V /   | |
@@ -37,73 +66,71 @@ int main(int argc, char **argv) {
  |_| \_|______/_/ \_\  |_|
 )NEXTBANNER";
 
-    std::cout << BANNER << '\n';
-
-    // Set threads and report
-    omp_set_num_threads(args.threads);
-    std::cout << " Threads: " << args.threads << "\n";
+    omp_set_num_threads(args.threads);
 
+    // Only rank 0 prints startup info
+    if (rank == 0 && omp_get_thread_num() == 0) {
+#ifdef NEXT_MPI
+        std::cout << "MPI mode enabled (" << size << " ranks)" << std::endl;
+#endif
+        std::cout << BANNER << std::endl;
+        std::cout << " Threads: " << args.threads << std::endl;
 #ifdef NEXT_FP64
-    std::cout << " Precision: FP64\n";
+        std::cout << " Precision: FP64" << std::endl;
 #elif defined(NEXT_FP32)
-    std::cout << " Precision: FP32\n";
+        std::cout << " Precision: FP32" << std::endl;
 #endif
+    }
 
-    // --- SoA UPDATE ---
-    // LoadParticlesFromFile now returns a single 'Particle' struct
-    // which internally contains all the parallel vectors.
-    Particle particles = LoadParticlesFromFile(args.input_file);
-
-    std::cout << " Particles: " << particles.size() << "\n";
-
-    real simTime = 0;
-    real nextDump = 0;
-    int step = 0;
-    char command;
-
-    while (true) {
-        // Both computeAdaptiveDt and Step now take the SoA object by reference
-        real dtAdaptive = computeAdaptiveDt(particles, args.dt);
-        Step(particles, dtAdaptive);
-
-        simTime += dtAdaptive;
-
-        if (simTime >= nextDump) {
-            std::string out = "dump_" + std::to_string(step);
-
-            switch (args.format) {
-            case OutputFormat::VTK:
-                out += ".vtk";
-                SaveVTK(particles, out);
-                break;
-
-            case OutputFormat::VTU:
-                out += ".vtu";
-                SaveVTU(particles, out);
-                break;
-
-            case OutputFormat::HDF5:
-                out += ".hdf5";
-                SaveHDF5(particles, out);
-                break;
-            }
-
-            std::cout << "[Dump " << step << "] t = " << simTime
-                      << ", file: " << out << "\n";
-
-            nextDump += args.dump_interval;
-            step++;
+    // Load particles
+    Particle particles = LoadParticlesFromFile(args.input_file);
+    if (rank == 0 && omp_get_thread_num() == 0) {
+        std::cout << " Particles: " << particles.size() << std::endl;
     }
 
-        // Non-blocking exit check
-        if (std::cin.rdbuf()->in_avail() > 0) {
-            std::cin >> command;
-            if (command == 'q' || command == 'Q') {
-                std::cout << "Exiting...\n";
-                break;
-            }
+    real simTime = 0;
+    real nextDump = 0;
+    int step = 0;
+    char command;
+
+    while (true) {
+        real dtAdaptive = computeAdaptiveDt(particles, args.dt);
+        Step(particles, dtAdaptive);
+        simTime += dtAdaptive;
+
+        if (simTime >= nextDump) {
+            std::string out = "dump_" + std::to_string(step);
+
+            switch (args.format) {
+            case OutputFormat::VTK:  out += ".vtk";  SaveVTK(particles, out);  break;
+            case OutputFormat::VTU:  out += ".vtu";  SaveVTU(particles, out);  break;
+            case OutputFormat::HDF5: out += ".hdf5"; SaveHDF5(particles, out); break;
+            }
+
+            if (rank == 0 && omp_get_thread_num() == 0) {
+                std::cout << "[Dump " << step << "] t = " << simTime
+                          << ", file: " << out << std::endl;
+            }
+
+            nextDump += args.dump_interval;
+            step++;
+        }
+
+        // Non-blocking exit check
+        if (std::cin.rdbuf()->in_avail() > 0) {
+            std::cin >> command;
+            if (command == 'q' || command == 'Q') {
+                if (rank == 0 && omp_get_thread_num() == 0) {
+                    std::cout << "Exiting..." << std::endl;
+                }
+                break;
+            }
+        }
     }
-    }
 
-    return 0;
+#ifdef NEXT_MPI
+    MPI_Finalize();
+#endif
+
+    return 0;
 }
diff --git a/src/gravity/step.h b/src/gravity/step.h
index 96395d3..d57f36f 100644
--- a/src/gravity/step.h
+++ b/src/gravity/step.h
@@ -16,82 +16,169 @@
 #include <algorithm>
 #include <memory>
 #include <vector>
+#ifdef NEXT_MPI
+    #include <mpi.h>
+#endif
 
-/**
- * @brief Performs a complete Leapfrog Step (Kick-Drift-Kick) using SoA data.
- */
+/**
+ * @brief Performs a complete leapfrog step (kick-drift-kick) on SoA data.
+ *        With NEXT_MPI, each rank integrates its contiguous block [start, end)
+ *        and the updated slices are exchanged with in-place MPI_Iallgatherv.
+ */
 inline void Step(ParticleSystem &ps, real dt) {
     if (ps.size() == 0) return;
 
-    const real theta = 0.5;
-    const real half = dt * real(0.5);
-    const int N = static_cast<int>(ps.size());
+    const real theta = real(0.5);
+    const real half  = dt * real(0.5);
+    const int  N     = static_cast<int>(ps.size());
+
+#ifdef NEXT_MPI
+    int rank, size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    MPI_Datatype MPI_REAL_T;
+#  ifdef NEXT_FP64
+    MPI_REAL_T = MPI_DOUBLE;
+#  elif defined(NEXT_FP32)
+    MPI_REAL_T = MPI_FLOAT;
+#  else
+#    error "Define NEXT_FP32 or NEXT_FP64 for 'real' type."
+#  endif
+#else
+    int rank = 0;
+    int size = 1;
+#endif
+
+    const int start = (rank * N) / size;
+    const int end   = ((rank + 1) * N) / size;
+
+#ifdef NEXT_MPI
+    std::vector<int> counts(size), displs(size);
+    for (int r = 0; r < size; ++r) {
+        const int s = (r * N) / size;
+        const int e = ((r + 1) * N) / size;
+        counts[r] = e - s;
+        displs[r] = s;
+    }
+#endif
 
-    // Helper lambda to build the tree using the ParticleSystem indices
     auto buildTree = [&]() -> std::unique_ptr {
-        real minx = 1e30, miny = 1e30, minz = 1e30;
-        real maxx = -1e30, maxy = -1e30, maxz = -1e30;
+        struct BBox { real minx, miny, minz, maxx, maxy, maxz; };
+        BBox local{ real(1e30),  real(1e30),  real(1e30),
+                    real(-1e30), real(-1e30), real(-1e30) };
 
-        // Bounding box calculation (SoA access is very fast here)
         for (int i = 0; i < N; ++i) {
-            minx = std::min(minx, ps.x[i]); miny = std::min(miny, ps.y[i]); minz = std::min(minz, ps.z[i]);
-            maxx = std::max(maxx, ps.x[i]); maxy = std::max(maxy, ps.y[i]); maxz = std::max(maxz, ps.z[i]);
+            local.minx = std::min(local.minx, ps.x[i]);
+            local.miny = std::min(local.miny, ps.y[i]);
+            local.minz = std::min(local.minz, ps.z[i]);
+            local.maxx = std::max(local.maxx, ps.x[i]);
+            local.maxy = std::max(local.maxy, ps.y[i]);
+            local.maxz = std::max(local.maxz, ps.z[i]);
         }
 
-        real cx = (minx + maxx) * 0.5;
-        real cy = (miny + maxy) * 0.5;
-        real cz = (minz + maxz) * 0.5;
-        real size = std::max({maxx - minx, maxy - miny, maxz - minz}) * 0.5;
+#ifdef NEXT_MPI
+        real mins[3] = {local.minx, local.miny, local.minz};
+        real maxs[3] = {local.maxx, local.maxy, local.maxz};
+
+        MPI_Allreduce(MPI_IN_PLACE, mins, 3, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
+        MPI_Allreduce(MPI_IN_PLACE, maxs, 3, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
+
+        BBox global{mins[0], mins[1], mins[2], maxs[0], maxs[1], maxs[2]};
+#else
+        BBox global = local;
+#endif
 
-        if (size <= 0) size = 1.0;
+        const real cx = (global.minx + global.maxx) * real(0.5);
+        const real cy = (global.miny + global.maxy) * real(0.5);
+        const real cz = (global.minz + global.maxz) * real(0.5);
+        real size = std::max({global.maxx - global.minx,
+                              global.maxy - global.miny,
+                              global.maxz - global.minz}) * real(0.5);
+
+        if (size <= real(0)) size = real(1.0);
 
         auto root = std::make_unique(cx, cy, cz, size);
 
-        // Insert particle indices 0 to N-1
-        for (int i = 0; i < N; ++i) {
+        for (int i = 0; i < N; ++i)
             root->insert(i, ps);
-        }
 
         root->computeMass(ps);
         return root;
     };
 
-    // --- First Kick (dt/2) ---
+    // First kick (dt/2)
     {
-        std::unique_ptr root = buildTree();
+        auto root = buildTree();
 
 #pragma omp parallel for schedule(dynamic, 64)
-        for (int i = 0; i < N; ++i) {
-            real ax = 0, ay = 0, az = 0;
+        for (int i = start; i < end; ++i) {
+            real ax = real(0), ay = real(0), az = real(0);
             bhAccel(root.get(), i, ps, theta, ax, ay, az);
             ps.vx[i] += ax * half;
             ps.vy[i] += ay * half;
             ps.vz[i] += az * half;
         }
+
+#ifdef NEXT_MPI
+        MPI_Request reqs[3];
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                        ps.vx.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs[0]);
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                        ps.vy.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs[1]);
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                        ps.vz.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs[2]);
+        MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE);
+#endif
     }
 
-    // --- Drift (dt) ---
-    // Contiguous memory access makes this loop ideal for SIMD
+    // Drift (dt)
 #pragma omp parallel for schedule(static)
-    for (int i = 0; i < N; ++i) {
+    for (int i = start; i < end; ++i) {
         ps.x[i] += ps.vx[i] * dt;
         ps.y[i] += ps.vy[i] * dt;
         ps.z[i] += ps.vz[i] * dt;
     }
 
-    // --- Second Kick (dt/2) ---
+#ifdef NEXT_MPI
+    MPI_Request reqs3[3];
+    MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                    ps.x.data(), counts.data(), displs.data(), MPI_REAL_T,
+                    MPI_COMM_WORLD, &reqs3[0]);
+    MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                    ps.y.data(), counts.data(), displs.data(), MPI_REAL_T,
+                    MPI_COMM_WORLD, &reqs3[1]);
+    MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                    ps.z.data(), counts.data(), displs.data(), MPI_REAL_T,
+                    MPI_COMM_WORLD, &reqs3[2]);
+    MPI_Waitall(3, reqs3, MPI_STATUSES_IGNORE);
+#endif
+
+    // Second kick (dt/2)
     {
-        std::unique_ptr root = buildTree();
+        auto root = buildTree();
 
 #pragma omp parallel for schedule(dynamic, 64)
-        for (int i = 0; i < N; ++i) {
-            real ax = 0, ay = 0, az = 0;
+        for (int i = start; i < end; ++i) {
+            real ax = real(0), ay = real(0), az = real(0);
             bhAccel(root.get(), i, ps, theta, ax, ay, az);
             ps.vx[i] += ax * half;
             ps.vy[i] += ay * half;
             ps.vz[i] += az * half;
         }
+
+#ifdef NEXT_MPI
+        MPI_Request reqs4[3];
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                        ps.vx.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs4[0]);
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                        ps.vy.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs4[1]);
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                        ps.vz.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs4[2]);
+        MPI_Waitall(3, reqs4, MPI_STATUSES_IGNORE);
+#endif
     }
 }
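
For reference, below is a minimal, self-contained sketch (not part of the patch) of the exchange pattern Step() relies on: each rank updates only its block [start, end) of an array, and an in-place MPI_Iallgatherv then distributes every block to every rank. The array length N, the fill values, and the use of double instead of the project's real typedef are assumptions made purely for illustration.

// Build and run (assumed): mpic++ allgather_sketch.cpp -o sketch && mpiexec -n 2 ./sketch
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank = 0, size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int N = 8;                      // total element count (illustrative)
    std::vector<double> x(N, 0.0);

    // Same block decomposition as Step(): rank r owns [start, end).
    const int start = (rank * N) / size;
    const int end   = ((rank + 1) * N) / size;

    std::vector<int> counts(size), displs(size);
    for (int r = 0; r < size; ++r) {
        counts[r] = ((r + 1) * N) / size - (r * N) / size;
        displs[r] = (r * N) / size;
    }

    // Each rank writes only its own slice ...
    for (int i = start; i < end; ++i)
        x[i] = rank + 1;

    // ... then an in-place non-blocking allgatherv makes every slice visible on every rank.
    MPI_Request req;
    MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_DOUBLE,
                    x.data(), counts.data(), displs.data(), MPI_DOUBLE,
                    MPI_COMM_WORLD, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (rank == 0)
        for (int i = 0; i < N; ++i)
            std::printf("x[%d] = %g\n", i, x[i]);

    MPI_Finalize();
    return 0;
}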