From 3df801f5a60d32df20a621c41d52e2cabf988de1 Mon Sep 17 00:00:00 2001
From: Timofey Zakharchuk <timofey.zakharchuk.research@gmail.com>
Date: Sun, 15 Feb 2026 21:17:48 +0300
Subject: [PATCH 01/13] Add MPI

---
 CMakeLists.txt | 53 +++++++++++++++++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 452cd0c..73eb43c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,6 +11,12 @@ if (MINGW)
 elseif (MSVC)
     message(STATUS "Detected MSVC — using MSVC settings")
     set(USING_MSVC TRUE)
+elseif (APPLE)
+    message(STATUS "Detected macOS — limited MPI support")
+    set(USING_APPLE TRUE)
+elseif (UNIX)
+    message(STATUS "Detected Linux/UNIX — using standard settings")
+    set(USING_UNIX TRUE)
 endif()
 
 # ============================
@@ -18,19 +24,15 @@ endif()
 # ============================
 option(NEXT_FP32 "Use 32-bit floats (scalar)" OFF)
 option(NEXT_FP64 "Use 64-bit floats (scalar)" ON)
+option(NEXT_MPI "Enable MPI support" OFF)
 
-# Option: copy final executable to source dir
 option(NEXT_COPY_TO_CMAKE_SOURCE_DIR "Copy final executable from build dir to source dir" ON)
 
 # ============================
 # Precision modes
 # ============================
-set(NEXT_MODES
-    NEXT_FP32
-    NEXT_FP64
-)
+set(NEXT_MODES NEXT_FP32 NEXT_FP64)
 
-# Count enabled modes
 set(NEXT_MODE_COUNT 0)
 foreach(m ${NEXT_MODES})
     if(${m})
@@ -38,14 +40,12 @@ foreach(m ${NEXT_MODES})
     endif()
 endforeach()
 
-# Enforce exactly one mode
 if(NEXT_MODE_COUNT EQUAL 0)
     message(FATAL_ERROR "You must enable NEXT_FP64 or NEXT_FP32.")
 elseif(NEXT_MODE_COUNT GREATER 1)
     message(FATAL_ERROR "Enable only one precision mode.")
 endif()
 
-# Apply compile definitions
 if(NEXT_FP32)
     add_compile_definitions(NEXT_FP32)
 elseif(NEXT_FP64)
@@ -62,6 +62,9 @@ file(GLOB ARGPARSE_FILES ${CMAKE_SOURCE_DIR}/argparse/*.cpp)
 
 add_executable(next ${SRC_FILES} ${ARGPARSE_FILES})
 
+# ============================
+# Vectorization reports
+# ============================
 option(ENABLE_VEC_REPORT "Enable compiler vectorization reports" ON)
 
 if(ENABLE_VEC_REPORT)
@@ -74,38 +77,56 @@ if(ENABLE_VEC_REPORT)
     endif()
 endif()
 
-
 # ============================
 # OpenMP
 # ============================
 find_package(OpenMP QUIET)
 if(OpenMP_CXX_FOUND)
     message(STATUS "OpenMP detected — enabling multithreading.")
-    
     if(MSVC)
-        # Force MSVC to use the modern LLVM backend (supports OpenMP 3.0+ and size_t)
-        # We add it as a compiler option because MSVC's default find_package 
-        # often defaults to the legacy /openmp flag.
         target_compile_options(next PRIVATE /openmp:llvm)
     else()
         target_link_libraries(next PRIVATE OpenMP::OpenMP_CXX)
     endif()
-    
 else()
     message(STATUS "OpenMP not found — building in single-threaded mode.")
 endif()
 
+# ============================
+# MPI detection
+# ============================
+if(NEXT_MPI)
+    if(USING_UNIX)
+        # Linux: standard OpenMPI
+        find_package(MPI REQUIRED)
+        message(STATUS "MPI (OpenMPI) detected — enabling distributed memory parallelism.")
+        target_compile_definitions(next PRIVATE NEXT_MPI)
+        target_link_libraries(next PRIVATE MPI::MPI_CXX)
+
+    elseif(USING_MSVC OR USING_MINGW)
+        # Windows: Microsoft MPI
+        find_package(MPI REQUIRED)
+        message(STATUS "MPI (MS-MPI) detected — enabling distributed memory parallelism.")
+        target_compile_definitions(next PRIVATE NEXT_MPI)
+        target_link_libraries(next PRIVATE MPI::MPI_CXX)
+
+    elseif(USING_APPLE)
+        message(WARNING "MPI requested, but macOS does not ship MPI by default. Please install OpenMPI via Homebrew (brew install open-mpi).")
+        find_package(MPI REQUIRED)
+        target_compile_definitions(next PRIVATE NEXT_MPI)
+        target_link_libraries(next PRIVATE MPI::MPI_CXX)
+    endif()
+endif()
+
 # ============================
 # HDF5 detection
 # ============================
-# MSVC: use vcpkg or prebuilt binaries
 if (USING_MSVC AND DEFINED ENV{HDF5_DIR})
     set(HDF5_ROOT "$ENV{HDF5_DIR}")
     set(CMAKE_PREFIX_PATH "${HDF5_ROOT};${CMAKE_PREFIX_PATH}")
 endif()
 
 if (USING_MINGW)
-    # MSYS2 MinGW installs HDF5 here
     set(CMAKE_PREFIX_PATH "C:/tools/msys64/mingw64")
     message(STATUS "Using MSYS2 MinGW HDF5 path: ${CMAKE_PREFIX_PATH}")
 endif()

From baa390922465c92d84504cf398e7a6e78f046dc3 Mon Sep 17 00:00:00 2001
From: Timofey Zakharchuk <timofey.zakharchuk.research@gmail.com>
Date: Sun, 15 Feb 2026 21:18:22 +0300
Subject: [PATCH 02/13] Update begrun.cpp

---
 src/begrun.cpp | 137 ++++++++++++++++++++++++++-----------------------
 1 file changed, 74 insertions(+), 63 deletions(-)

diff --git a/src/begrun.cpp b/src/begrun.cpp
index 47dea18..1550208 100644
--- a/src/begrun.cpp
+++ b/src/begrun.cpp
@@ -22,13 +22,37 @@
 #include <omp.h>
 #include <string>
 #include <vector>
+#ifdef NEXT_MPI
+  #include <mpi.h>
+#endif
+#include "hdf5.h"
+
 
 using next::OutputFormat;
 
+// Helper: only rank 0, thread 0 prints
+inline void log_once(int rank, const std::string &msg) {
+    if (rank == 0 && omp_get_thread_num() == 0) {
+        std::cout << msg << std::endl;
+    }
+}
+
 int main(int argc, char **argv) {
-  auto args = next::parse_arguments(argc, argv);
+    int rank = 0;
+    int size = 1;
+
+#ifdef NEXT_MPI
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+    log_once(rank, "MPI mode enabled (" + std::to_string(size) + " ranks)");
+#endif
 
-  static constexpr const char *BANNER = R"NEXTBANNER(
+    H5Eset_auto(H5E_DEFAULT, nullptr, nullptr);
+
+    auto args = next::parse_arguments(argc, argv, rank);
+
+    static constexpr const char *BANNER = R"NEXTBANNER(
  _  _ ________   _________ 
 | \ | |  ____\ \ / /__   __|
 |  \| | |__   \ V /   | |   
@@ -37,73 +61,60 @@ int main(int argc, char **argv) {
 |_| \_|______/_/ \_\  |_|   
 )NEXTBANNER";
 
-  std::cout << BANNER << '\n';
-
-  // Set threads and report
-  omp_set_num_threads(args.threads);
-  std::cout << " Threads:   " << args.threads << "\n";
+    omp_set_num_threads(args.threads);
 
+    log_once(rank, BANNER);
+    log_once(rank, " Threads:   " + std::to_string(args.threads));
 #ifdef NEXT_FP64
-  std::cout << " Precision: FP64\n";
+    log_once(rank, " Precision: FP64");
 #elif defined(NEXT_FP32)
-  std::cout << " Precision: FP32\n";
+    log_once(rank, " Precision: FP32");
 #endif
 
-  // --- SoA UPDATE ---
-  // LoadParticlesFromFile now returns a single 'Particle' struct
-  // which internally contains all the parallel vectors.
-  Particle particles = LoadParticlesFromFile(args.input_file);
-
-  std::cout << " Particles: " << particles.size() << "\n";
-
-  real simTime = 0;
-  real nextDump = 0;
-  int step = 0;
-  char command;
-
-  while (true) {
-    // Both computeAdaptiveDt and Step now take the SoA object by reference
-    real dtAdaptive = computeAdaptiveDt(particles, args.dt);
-    Step(particles, dtAdaptive);
-    
-    simTime += dtAdaptive;
-
-    if (simTime >= nextDump) {
-      std::string out = "dump_" + std::to_string(step);
-
-      switch (args.format) {
-        case OutputFormat::VTK:
-          out += ".vtk";
-          SaveVTK(particles, out);
-          break;
-
-        case OutputFormat::VTU:
-          out += ".vtu";
-          SaveVTU(particles, out);
-          break;
-
-        case OutputFormat::HDF5:
-          out += ".hdf5";
-          SaveHDF5(particles, out);
-          break;
-      }
-
-      std::cout << "[Dump " << step << "] t = " << simTime
-                << ", file: " << out << "\n";
-
-      nextDump += args.dump_interval;
-      step++;
+    // Load particles
+    Particle particles = LoadParticlesFromFile(args.input_file);
+    log_once(rank, " Particles: " + std::to_string(particles.size()));
+
+    real simTime = 0;
+    real nextDump = 0;
+    int step = 0;
+    char command;
+
+    while (true) {
+        real dtAdaptive = computeAdaptiveDt(particles, args.dt);
+        Step(particles, dtAdaptive);
+        simTime += dtAdaptive;
+
+        if (simTime >= nextDump) {
+            std::string out = "dump_" + std::to_string(step);
+
+            switch (args.format) {
+                case OutputFormat::VTK:  out += ".vtk";  SaveVTK(particles, out);  break;
+                case OutputFormat::VTU:  out += ".vtu";  SaveVTU(particles, out);  break;
+                case OutputFormat::HDF5: out += ".hdf5"; SaveHDF5(particles, out); break;
+            }
+
+            log_once(rank, "[Dump " + std::to_string(step) +
+                           "] t = " + std::to_string(simTime) +
+                           ", file: " + out);
+
+            nextDump += args.dump_interval;
+            step++;
+        }
+
+        // Non-blocking exit check
+        if (std::cin.rdbuf()->in_avail() > 0) {
+            std::cin >> command;
+            if (command == 'q' || command == 'Q') {
+                log_once(rank, "Exiting...");
+                break;
+            }
+        }
     }
 
-    // Non-blocking exit check
-    if (std::cin.rdbuf()->in_avail() > 0) {
-      std::cin >> command;
-      if (command == 'q' || command == 'Q') {
-        std::cout << "Exiting...\n";
-        break;
-      }
-    }
-  }
+#ifdef NEXT_MPI
+    MPI_Finalize();
+#endif
 
-  return 0;
+    return 0;
 }

From 17f28833bb686d2e320797f7ed160cdb0aebcb43 Mon Sep 17 00:00:00 2001
From: Timofey Zakharchuk <timofey.zakharchuk.research@gmail.com>
Date: Sun, 15 Feb 2026 21:18:55 +0300
Subject: [PATCH 03/13] Update step.h

---
 src/gravity/step.h | 166 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 138 insertions(+), 28 deletions(-)

diff --git a/src/gravity/step.h b/src/gravity/step.h
index 96395d3..cf114ff 100644
--- a/src/gravity/step.h
+++ b/src/gravity/step.h
@@ -16,6 +16,9 @@
 #include <memory>
 #include <algorithm>
 #include <omp.h>
+#ifdef NEXT_MPI
+    #include <mpi.h>
+#endif
 
 /**
  * @brief Performs a complete Leapfrog Step (Kick-Drift-Kick) using SoA data.
@@ -23,75 +26,182 @@
 inline void Step(ParticleSystem &ps, real dt) {
     if (ps.size() == 0) return;
 
-    const real theta = 0.5;
-    const real half = dt * real(0.5);
-    const int N = static_cast<int>(ps.size());
+    const real theta = real(0.5);
+    const real half  = dt * real(0.5);
+    const int  N     = static_cast<int>(ps.size());
+
+    // ---------------------------------------------------------
+    // MPI SETUP
+    // ---------------------------------------------------------
+#ifdef NEXT_MPI
+    int rank, size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // Select MPI datatype matching
+    MPI_Datatype MPI_REAL_T;
+#  ifdef NEXT_FP64
+    MPI_REAL_T = MPI_DOUBLE;
+#  elif defined(NEXT_FP32)
+    MPI_REAL_T = MPI_FLOAT;
+#  else
+#    error "Define NEXT_FP32 or NEXT_FP64 for 'real' type."
+#  endif
+#else
+    int rank = 0;
+    int size = 1;
+#endif
+
+    // ---------------------------------------------------------
+    // DOMAIN DECOMPOSITION
+    // ---------------------------------------------------------
+    const int start = (rank * N) / size;
+    const int end   = ((rank + 1) * N) / size;
+
+#ifdef NEXT_MPI
+    // Precompute counts and displacements for Allgatherv
+    std::vector<int> counts(size), displs(size);
+    for (int r = 0; r < size; ++r) {
+        const int s = (r * N) / size;
+        const int e = ((r + 1) * N) / size;
+        counts[r]   = e - s;
+        displs[r]   = s;
+    }
+#endif
 
-    // Helper lambda to build the tree using the ParticleSystem indices
+    // ---------------------------------------------------------
+    // TREE BUILDER
+    // ---------------------------------------------------------
     auto buildTree = [&]() -> std::unique_ptr<Octree> {
-        real minx = 1e30, miny = 1e30, minz = 1e30;
-        real maxx = -1e30, maxy = -1e30, maxz = -1e30;
+        struct BBox { real minx, miny, minz, maxx, maxy, maxz; };
+        BBox local{ real(1e30), real(1e30), real(1e30),
+                    real(-1e30), real(-1e30), real(-1e30) };
 
-        // Bounding box calculation (SoA access is very fast here)
         for (int i = 0; i < N; ++i) {
-            minx = std::min(minx, ps.x[i]); miny = std::min(miny, ps.y[i]); minz = std::min(minz, ps.z[i]);
-            maxx = std::max(maxx, ps.x[i]); maxy = std::max(maxy, ps.y[i]); maxz = std::max(maxz, ps.z[i]);
+            local.minx = std::min(local.minx, ps.x[i]);
+            local.miny = std::min(local.miny, ps.y[i]);
+            local.minz = std::min(local.minz, ps.z[i]);
+            local.maxx = std::max(local.maxx, ps.x[i]);
+            local.maxy = std::max(local.maxy, ps.y[i]);
+            local.maxz = std::max(local.maxz, ps.z[i]);
         }
 
-        real cx = (minx + maxx) * 0.5;
-        real cy = (miny + maxy) * 0.5;
-        real cz = (minz + maxz) * 0.5;
-        real size = std::max({maxx - minx, maxy - miny, maxz - minz}) * 0.5;
+#ifdef NEXT_MPI
+        BBox global;
+        real mins[3] = {local.minx, local.miny, local.minz};
+        real maxs[3] = {local.maxx, local.maxy, local.maxz};
+
+        MPI_Allreduce(MPI_IN_PLACE, mins, 3, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
+        MPI_Allreduce(MPI_IN_PLACE, maxs, 3, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
+
+        global.minx = mins[0]; global.miny = mins[1]; global.minz = mins[2];
+        global.maxx = maxs[0]; global.maxy = maxs[1]; global.maxz = maxs[2];
 
-        if (size <= 0) size = 1.0;
+#else
+        BBox global = local;
+#endif
+
+        const real cx   = (global.minx + global.maxx) * real(0.5);
+        const real cy   = (global.miny + global.maxy) * real(0.5);
+        const real cz   = (global.minz + global.maxz) * real(0.5);
+        real       size = std::max({global.maxx - global.minx,
+                                    global.maxy - global.miny,
+                                    global.maxz - global.minz}) * real(0.5);
+
+        if (size <= real(0)) size = real(1.0);
 
         auto root = std::make_unique<Octree>(cx, cy, cz, size);
 
-        // Insert particle indices 0 to N-1
-        for (int i = 0; i < N; ++i) {
+        for (int i = 0; i < N; ++i)
             root->insert(i, ps);
-        }
 
         root->computeMass(ps);
         return root;
     };
 
-    // --- First Kick (dt/2) ---
+    // ---------------------------------------------------------
+    // FIRST KICK
+    // ---------------------------------------------------------
     {
-        std::unique_ptr<Octree> root = buildTree();
+        auto root = buildTree();
 
         #pragma omp parallel for schedule(dynamic, 64)
-        for (int i = 0; i < N; ++i) {
-            real ax = 0, ay = 0, az = 0;
+        for (int i = start; i < end; ++i) {
+            real ax = real(0), ay = real(0), az = real(0);
             bhAccel(root.get(), i, ps, theta, ax, ay, az);
 
             ps.vx[i] += ax * half;
             ps.vy[i] += ay * half;
             ps.vz[i] += az * half;
         }
+
+#ifdef NEXT_MPI
+        MPI_Request reqs[3];
+        MPI_Iallgatherv(ps.vx.data() + start, end - start, MPI_REAL_T,
+                        ps.vx.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs[0]);
+        MPI_Iallgatherv(ps.vy.data() + start, end - start, MPI_REAL_T,
+                        ps.vy.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs[1]);
+        MPI_Iallgatherv(ps.vz.data() + start, end - start, MPI_REAL_T,
+                        ps.vz.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs[2]);
+        MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE);
+#endif
     }
 
-    // --- Drift (dt) ---
-    // Contiguous memory access makes this loop ideal for SIMD
+    // ---------------------------------------------------------
+    // DRIFT
+    // ---------------------------------------------------------
     #pragma omp parallel for schedule(static)
-    for (int i = 0; i < N; ++i) {
+    for (int i = start; i < end; ++i) {
         ps.x[i] += ps.vx[i] * dt;
         ps.y[i] += ps.vy[i] * dt;
         ps.z[i] += ps.vz[i] * dt;
     }
 
-    // --- Second Kick (dt/2) ---
+#ifdef NEXT_MPI
+    MPI_Request reqs[3];
+    MPI_Iallgatherv(ps.x.data() + start, end - start, MPI_REAL_T,
+                    ps.x.data(), counts.data(), displs.data(), MPI_REAL_T,
+                    MPI_COMM_WORLD, &reqs[0]);
+    MPI_Iallgatherv(ps.y.data() + start, end - start, MPI_REAL_T,
+                    ps.y.data(), counts.data(), displs.data(), MPI_REAL_T,
+                    MPI_COMM_WORLD, &reqs[1]);
+    MPI_Iallgatherv(ps.z.data() + start, end - start, MPI_REAL_T,
+                    ps.z.data(), counts.data(), displs.data(), MPI_REAL_T,
+                    MPI_COMM_WORLD, &reqs[2]);
+    MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE);
+#endif
+
+    // ---------------------------------------------------------
+    // SECOND KICK
+    // ---------------------------------------------------------
     {
-        std::unique_ptr<Octree> root = buildTree();
+        auto root = buildTree();
 
         #pragma omp parallel for schedule(dynamic, 64)
-        for (int i = 0; i < N; ++i) {
-            real ax = 0, ay = 0, az = 0;
+        for (int i = start; i < end; ++i) {
+            real ax = real(0), ay = real(0), az = real(0);
             bhAccel(root.get(), i, ps, theta, ax, ay, az);
 
             ps.vx[i] += ax * half;
             ps.vy[i] += ay * half;
             ps.vz[i] += az * half;
         }
+
+#ifdef NEXT_MPI
+        MPI_Request reqs2[3];
+        MPI_Iallgatherv(ps.vx.data() + start, end - start, MPI_REAL_T,
+                        ps.vx.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs2[0]);
+        MPI_Iallgatherv(ps.vy.data() + start, end - start, MPI_REAL_T,
+                        ps.vy.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs2[1]);
+        MPI_Iallgatherv(ps.vz.data() + start, end - start, MPI_REAL_T,
+                        ps.vz.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs2[2]);
+        MPI_Waitall(3, reqs2, MPI_STATUSES_IGNORE);
+#endif
     }
 }

From 3b79adc100618d74801f6fb274cc5200f1b4d873 Mon Sep 17 00:00:00 2001
From: Timofey Zakharchuk <timofey.zakharchuk.research@gmail.com>
Date: Sun, 15 Feb 2026 21:19:18 +0300
Subject: [PATCH 04/13] Update argparse.cpp

---
 argparse/argparse.cpp | 46 +++++++++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/argparse/argparse.cpp b/argparse/argparse.cpp
index 9fbd08a..a7611df 100644
--- a/argparse/argparse.cpp
+++ b/argparse/argparse.cpp
@@ -1,33 +1,59 @@
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
 #include "argparse.hpp"
 #include <iostream>
 #include <stdexcept>
+#ifdef NEXT_MPI
+    #include <mpi.h>
+#endif
 
 namespace next {
 
-Arguments parse_arguments(int argc, char** argv)
+Arguments parse_arguments(int argc, char** argv, int rank)
 {
     if (argc != 6) {
-        std::cerr << "Usage: next <input.txt> <threads> <dt> <dump_interval> <vtk|vtu|hdf5>\n";
+        // Only rank 0 prints usage
+        if (rank == 0) {
+            std::cerr << "Usage: next <input.txt> <threads> <dt> <dump_interval> <vtk|vtu|hdf5>\n";
+        }
+
+#ifdef NEXT_MPI
+        MPI_Finalize();
+#endif
         std::exit(1);
     }
 
     Arguments args;
 
-    args.input_file = argv[1];
-    args.threads = std::stoi(argv[2]);
-    args.dt = std::stod(argv[3]);
+    args.input_file    = argv[1];
+    args.threads       = std::stoi(argv[2]);
+    args.dt            = std::stod(argv[3]);
     args.dump_interval = std::stod(argv[4]);
 
     std::string fmt = argv[5];
 
-    if (fmt == "vtk")
+    if (fmt == "vtk") {
         args.format = OutputFormat::VTK;
-    else if (fmt == "vtu")
+    } else if (fmt == "vtu") {
         args.format = OutputFormat::VTU;
-    else if (fmt == "hdf5")
+    } else if (fmt == "hdf5") {
         args.format = OutputFormat::HDF5;
-    else {
-        std::cerr << "Choose a file format: vtk, vtu, or hdf5\n";
+    } else {
+        if (rank == 0) {
+            std::cerr << "Choose a file format: vtk, vtu, or hdf5\n";
+        }
+#ifdef NEXT_MPI
+        MPI_Finalize();
+#endif
         std::exit(1);
     }
 

From 01739b9de70afd7250afcbe24faf2c80580efd03 Mon Sep 17 00:00:00 2001
From: Timofey Zakharchuk <timofey.zakharchuk.research@gmail.com>
Date: Sun, 15 Feb 2026 21:19:30 +0300
Subject: [PATCH 05/13] Update argparse.hpp

---
 argparse/argparse.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/argparse/argparse.hpp b/argparse/argparse.hpp
index 3e0f831..b81174d 100644
--- a/argparse/argparse.hpp
+++ b/argparse/argparse.hpp
@@ -17,6 +17,6 @@ struct Arguments {
     OutputFormat format;
 };
 
-Arguments parse_arguments(int argc, char** argv);
+Arguments parse_arguments(int argc, char** argv, int rank);
 
 }

From 7649eafdfe6b60464df5469429cb26679a785000 Mon Sep 17 00:00:00 2001
From: Timofey Zakharchuk <timofey.zakharchuk.research@gmail.com>
Date: Sun, 15 Feb 2026 21:24:24 +0300
Subject: [PATCH 06/13] Create mpi.yml

---
 .github/workflows/mpi.yml | 53 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 .github/workflows/mpi.yml

diff --git a/.github/workflows/mpi.yml b/.github/workflows/mpi.yml
new file mode 100644
index 0000000..bd9097d
--- /dev/null
+++ b/.github/workflows/mpi.yml
@@ -0,0 +1,53 @@
+name: MPI Galaxy Demo
+
+permissions:
+  contents: read
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+
+concurrency:
+  group: mpi-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  mpi-galaxy:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install dependencies (MPI + HDF5 + Python)
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y mpich libhdf5-dev libomp-dev cmake build-essential python3
+
+      - name: Configure project (CMake)
+        run: |
+          mkdir -p build
+          cd build
+          cmake .. -DCMAKE_BUILD_TYPE=Release -DNEXT_FP64=ON -DNEXT_MPI=ON
+
+      - name: Build project
+        run: |
+          cd build
+          cmake --build . -- -j$(nproc)
+
+      - name: Run Galaxy demo (timed)
+        run: |
+          # Go back to project root
+          cd $GITHUB_WORKSPACE/examples/GalaxyDemo
+          python3 galaxy.py
+          # Run with 2 MPI ranks and 2 OpenMP threads, stop after 20s
+          OMP_NUM_THREADS=2 timeout 20s mpiexec -n 2 ../../build/next galaxy.txt 2 0.01 0.1 hdf5
+
+      - name: Upload simulation outputs
+        uses: actions/upload-artifact@v4
+        with:
+          name: galaxy-dumps
+          path: |
+            examples/GalaxyDemo/dump_*.hdf5
+            examples/GalaxyDemo/dump_*.xdmf

From 9d4611c27c99757bace7534fe9f4880c76e725af Mon Sep 17 00:00:00 2001
From: Timofey Zakharchuk <timofey.zakharchuk.research@gmail.com>
Date: Sun, 15 Feb 2026 21:29:08 +0300
Subject: [PATCH 07/13] Update step.h

---
 src/gravity/step.h | 63 +++++++++++++++-------------------------------
 1 file changed, 20 insertions(+), 43 deletions(-)

diff --git a/src/gravity/step.h b/src/gravity/step.h
index cf114ff..d57f36f 100644
--- a/src/gravity/step.h
+++ b/src/gravity/step.h
@@ -20,9 +20,6 @@
     #include <mpi.h>
 #endif
 
-/**
- * @brief Performs a complete Leapfrog Step (Kick-Drift-Kick) using SoA data.
- */
 inline void Step(ParticleSystem &ps, real dt) {
     if (ps.size() == 0) return;
 
@@ -30,15 +27,11 @@ inline void Step(ParticleSystem &ps, real dt) {
     const real half  = dt * real(0.5);
     const int  N     = static_cast<int>(ps.size());
 
-    // ---------------------------------------------------------
-    // MPI SETUP
-    // ---------------------------------------------------------
 #ifdef NEXT_MPI
     int rank, size;
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     MPI_Comm_size(MPI_COMM_WORLD, &size);
 
-    // Select MPI datatype matching
     MPI_Datatype MPI_REAL_T;
 #  ifdef NEXT_FP64
     MPI_REAL_T = MPI_DOUBLE;
@@ -52,14 +45,10 @@ inline void Step(ParticleSystem &ps, real dt) {
     int size = 1;
 #endif
 
-    // ---------------------------------------------------------
-    // DOMAIN DECOMPOSITION
-    // ---------------------------------------------------------
     const int start = (rank * N) / size;
     const int end   = ((rank + 1) * N) / size;
 
 #ifdef NEXT_MPI
-    // Precompute counts and displacements for Allgatherv
     std::vector<int> counts(size), displs(size);
     for (int r = 0; r < size; ++r) {
         const int s = (r * N) / size;
@@ -69,9 +58,6 @@ inline void Step(ParticleSystem &ps, real dt) {
     }
 #endif
 
-    // ---------------------------------------------------------
-    // TREE BUILDER
-    // ---------------------------------------------------------
     auto buildTree = [&]() -> std::unique_ptr<Octree> {
         struct BBox { real minx, miny, minz, maxx, maxy, maxz; };
         BBox local{ real(1e30), real(1e30), real(1e30),
@@ -87,16 +73,13 @@ inline void Step(ParticleSystem &ps, real dt) {
         }
 
 #ifdef NEXT_MPI
-        BBox global;
         real mins[3] = {local.minx, local.miny, local.minz};
         real maxs[3] = {local.maxx, local.maxy, local.maxz};
 
         MPI_Allreduce(MPI_IN_PLACE, mins, 3, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
         MPI_Allreduce(MPI_IN_PLACE, maxs, 3, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
 
-        global.minx = mins[0]; global.miny = mins[1]; global.minz = mins[2];
-        global.maxx = maxs[0]; global.maxy = maxs[1]; global.maxz = maxs[2];
-
+        BBox global{mins[0], mins[1], mins[2], maxs[0], maxs[1], maxs[2]};
 #else
         BBox global = local;
 #endif
@@ -119,9 +102,7 @@ inline void Step(ParticleSystem &ps, real dt) {
         return root;
     };
 
-    // ---------------------------------------------------------
     // FIRST KICK
-    // ---------------------------------------------------------
     {
         auto root = buildTree();
 
@@ -137,22 +118,20 @@ inline void Step(ParticleSystem &ps, real dt) {
 
 #ifdef NEXT_MPI
         MPI_Request reqs[3];
-        MPI_Iallgatherv(ps.vx.data() + start, end - start, MPI_REAL_T,
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
                         ps.vx.data(), counts.data(), displs.data(), MPI_REAL_T,
                         MPI_COMM_WORLD, &reqs[0]);
-        MPI_Iallgatherv(ps.vy.data() + start, end - start, MPI_REAL_T,
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
                         ps.vy.data(), counts.data(), displs.data(), MPI_REAL_T,
                         MPI_COMM_WORLD, &reqs[1]);
-        MPI_Iallgatherv(ps.vz.data() + start, end - start, MPI_REAL_T,
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
                         ps.vz.data(), counts.data(), displs.data(), MPI_REAL_T,
                         MPI_COMM_WORLD, &reqs[2]);
         MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE);
 #endif
     }
 
-    // ---------------------------------------------------------
     // DRIFT
-    // ---------------------------------------------------------
     #pragma omp parallel for schedule(static)
     for (int i = start; i < end; ++i) {
         ps.x[i] += ps.vx[i] * dt;
@@ -161,22 +140,20 @@ inline void Step(ParticleSystem &ps, real dt) {
     }
 
 #ifdef NEXT_MPI
-    MPI_Request reqs[3];
-    MPI_Iallgatherv(ps.x.data() + start, end - start, MPI_REAL_T,
+    MPI_Request reqs3[3];
+    MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
                     ps.x.data(), counts.data(), displs.data(), MPI_REAL_T,
-                    MPI_COMM_WORLD, &reqs[0]);
-    MPI_Iallgatherv(ps.y.data() + start, end - start, MPI_REAL_T,
+                    MPI_COMM_WORLD, &reqs3[0]);
+    MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
                     ps.y.data(), counts.data(), displs.data(), MPI_REAL_T,
-                    MPI_COMM_WORLD, &reqs[1]);
-    MPI_Iallgatherv(ps.z.data() + start, end - start, MPI_REAL_T,
+                    MPI_COMM_WORLD, &reqs3[1]);
+    MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
                     ps.z.data(), counts.data(), displs.data(), MPI_REAL_T,
-                    MPI_COMM_WORLD, &reqs[2]);
-    MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE);
+                    MPI_COMM_WORLD, &reqs3[2]);
+    MPI_Waitall(3, reqs3, MPI_STATUSES_IGNORE);
 #endif
 
-    // ---------------------------------------------------------
     // SECOND KICK
-    // ---------------------------------------------------------
     {
         auto root = buildTree();
 
@@ -191,17 +168,17 @@ inline void Step(ParticleSystem &ps, real dt) {
         }
 
 #ifdef NEXT_MPI
-        MPI_Request reqs2[3];
-        MPI_Iallgatherv(ps.vx.data() + start, end - start, MPI_REAL_T,
+        MPI_Request reqs4[3];
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
                         ps.vx.data(), counts.data(), displs.data(), MPI_REAL_T,
-                        MPI_COMM_WORLD, &reqs2[0]);
-        MPI_Iallgatherv(ps.vy.data() + start, end - start, MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs4[0]);
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
                         ps.vy.data(), counts.data(), displs.data(), MPI_REAL_T,
-                        MPI_COMM_WORLD, &reqs2[1]);
-        MPI_Iallgatherv(ps.vz.data() + start, end - start, MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs4[1]);
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
                         ps.vz.data(), counts.data(), displs.data(), MPI_REAL_T,
-                        MPI_COMM_WORLD, &reqs2[2]);
-        MPI_Waitall(3, reqs2, MPI_STATUSES_IGNORE);
+                        MPI_COMM_WORLD, &reqs4[2]);
+        MPI_Waitall(3, reqs4, MPI_STATUSES_IGNORE);
 #endif
     }
 }

From 25cb9fe50e41e26ea1c0e9ecd8dffb7792ce66b6 Mon Sep 17 00:00:00 2001
From: Timofey Zakharchuk <timofey.zakharchuk.research@gmail.com>
Date: Sun, 15 Feb 2026 21:32:28 +0300
Subject: [PATCH 08/13] Update begrun.cpp

---
 src/begrun.cpp | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/begrun.cpp b/src/begrun.cpp
index 1550208..9b8f5ff 100644
--- a/src/begrun.cpp
+++ b/src/begrun.cpp
@@ -45,7 +45,6 @@ int main(int argc, char **argv) {
     MPI_Init(&argc, &argv);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     MPI_Comm_size(MPI_COMM_WORLD, &size);
-    log_once(rank, "MPI mode enabled (" + std::to_string(size) + " ranks)");
 #endif
 
     H5Eset_auto(H5E_DEFAULT, nullptr, nullptr);
@@ -63,17 +62,25 @@ int main(int argc, char **argv) {
 
     omp_set_num_threads(args.threads);
 
-    log_once(rank, BANNER);
-    log_once(rank, " Threads:   " + std::to_string(args.threads));
+    // Only rank 0 prints startup info
+    if (rank == 0 && omp_get_thread_num() == 0) {
+#ifdef NEXT_MPI
+        std::cout << "MPI mode enabled (" << size << " ranks)" << std::endl;
+#endif
+        std::cout << BANNER << std::endl;
+        std::cout << " Threads:   " << args.threads << std::endl;
 #ifdef NEXT_FP64
-    log_once(rank, " Precision: FP64");
+        std::cout << " Precision: FP64" << std::endl;
 #elif defined(NEXT_FP32)
-    log_once(rank, " Precision: FP32");
+        std::cout << " Precision: FP32" << std::endl;
 #endif
+    }
 
     // Load particles
     Particle particles = LoadParticlesFromFile(args.input_file);
-    log_once(rank, " Particles: " + std::to_string(particles.size()));
+    if (rank == 0 && omp_get_thread_num() == 0) {
+        std::cout << " Particles: " << particles.size() << std::endl;
+    }
 
     real simTime = 0;
     real nextDump = 0;
@@ -94,9 +101,10 @@ int main(int argc, char **argv) {
                 case OutputFormat::HDF5: out += ".hdf5"; SaveHDF5(particles, out); break;
             }
 
-            log_once(rank, "[Dump " + std::to_string(step) +
-                           "] t = " + std::to_string(simTime) +
-                           ", file: " + out);
+            if (rank == 0 && omp_get_thread_num() == 0) {
+                std::cout << "[Dump " << step << "] t = " << simTime
+                          << ", file: " << out << std::endl;
+            }
 
             nextDump += args.dump_interval;
             step++;
@@ -106,7 +114,9 @@ int main(int argc, char **argv) {
         if (std::cin.rdbuf()->in_avail() > 0) {
             std::cin >> command;
             if (command == 'q' || command == 'Q') {
-                log_once(rank, "Exiting...");
+                if (rank == 0 && omp_get_thread_num() == 0) {
+                    std::cout << "Exiting..." << std::endl;
+                }
                 break;
             }
         }

From 7a0e878b4aa647e7a190bb5d2d8c5a3c8b3ea91d Mon Sep 17 00:00:00 2001
From: Timofey Zakharchuk <timofey.zakharchuk.research@gmail.com>
Date: Sun, 15 Feb 2026 21:45:02 +0300
Subject: [PATCH 09/13] Update begrun.cpp

---
 src/begrun.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/begrun.cpp b/src/begrun.cpp
index 9b8f5ff..16f1d89 100644
--- a/src/begrun.cpp
+++ b/src/begrun.cpp
@@ -45,6 +45,12 @@ int main(int argc, char **argv) {
     MPI_Init(&argc, &argv);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // Silence stdout/stderr for all ranks except 0
+    if (rank != 0) {
+        fclose(stdout);
+    }
+  
 #endif
 
     H5Eset_auto(H5E_DEFAULT, nullptr, nullptr);

From 981e02b2d6635811cee176d86664ef4c7d06ccd1 Mon Sep 17 00:00:00 2001
From: Timofey Zakharchuk <timofey.zakharchuk.research@gmail.com>
Date: Sun, 15 Feb 2026 21:48:03 +0300
Subject: [PATCH 10/13] Update CMakeLists.txt

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 73eb43c..562100f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,7 +65,7 @@ add_executable(next ${SRC_FILES} ${ARGPARSE_FILES})
 # ============================
 # Vectorization reports
 # ============================
-option(ENABLE_VEC_REPORT "Enable compiler vectorization reports" ON)
+option(ENABLE_VEC_REPORT "Enable compiler vectorization reports" OFF)
 
 if(ENABLE_VEC_REPORT)
     if(CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")

From d5ae3080e708af0e33a9adc8cc81dda0e9aa05df Mon Sep 17 00:00:00 2001
From: Timofey Zakharchuk <timofey.zakharchuk.research@gmail.com>
Date: Sun, 15 Feb 2026 21:49:40 +0300
Subject: [PATCH 11/13] Update mpi.yml

---
 .github/workflows/mpi.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/mpi.yml b/.github/workflows/mpi.yml
index bd9097d..6676122 100644
--- a/.github/workflows/mpi.yml
+++ b/.github/workflows/mpi.yml
@@ -42,7 +42,7 @@ jobs:
           cd $GITHUB_WORKSPACE/examples/GalaxyDemo
           python3 galaxy.py
           # Run with 2 MPI ranks and 2 OpenMP threads, stop after 20s
-          OMP_NUM_THREADS=2 timeout 20s mpiexec -n 2 ../../build/next galaxy.txt 2 0.01 0.1 hdf5
+          timeout 60s mpiexec -n 2 ./next galaxy.txt 1 0.01 0.1 hdf5
 
       - name: Upload simulation outputs
         uses: actions/upload-artifact@v4

From fc6a4652b192896c97b4671447d190cea781f4d5 Mon Sep 17 00:00:00 2001
From: Timofey Zakharchuk <timofey.zakharchuk.research@gmail.com>
Date: Sun, 15 Feb 2026 21:51:05 +0300
Subject: [PATCH 12/13] Update mpi.yml

---
 .github/workflows/mpi.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/mpi.yml b/.github/workflows/mpi.yml
index 6676122..05a19c9 100644
--- a/.github/workflows/mpi.yml
+++ b/.github/workflows/mpi.yml
@@ -42,7 +42,7 @@ jobs:
           cd $GITHUB_WORKSPACE/examples/GalaxyDemo
           python3 galaxy.py
           # Run with 2 MPI ranks and 2 OpenMP threads, stop after 20s
-          timeout 60s mpiexec -n 2 ./next galaxy.txt 1 0.01 0.1 hdf5
+          timeout 60s mpiexec -n 2 ../../next galaxy.txt 1 0.01 0.1 hdf5
 
       - name: Upload simulation outputs
         uses: actions/upload-artifact@v4

From d759c4198bc9a532d57aa286d1a57dc777e8b926 Mon Sep 17 00:00:00 2001
From: Timofey Zakharchuk <timofey.zakharchuk.research@gmail.com>
Date: Sun, 15 Feb 2026 22:07:29 +0300
Subject: [PATCH 13/13] Update mpi.yml

---
 .github/workflows/mpi.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/mpi.yml b/.github/workflows/mpi.yml
index 05a19c9..3753672 100644
--- a/.github/workflows/mpi.yml
+++ b/.github/workflows/mpi.yml
@@ -42,7 +42,7 @@ jobs:
           cd $GITHUB_WORKSPACE/examples/GalaxyDemo
           python3 galaxy.py
           # Run with 2 MPI ranks and 2 OpenMP threads, stop after 20s
-          timeout 60s mpiexec -n 2 ../../next galaxy.txt 1 0.01 0.1 hdf5
+          timeout 60s mpiexec -n 2 ../../next galaxy.txt 1 0.01 0.1 hdf5 || true
 
       - name: Upload simulation outputs
         uses: actions/upload-artifact@v4