diff --git a/.github/workflows/mpi.yml b/.github/workflows/mpi.yml
new file mode 100644
index 0000000..3753672
--- /dev/null
+++ b/.github/workflows/mpi.yml
@@ -0,0 +1,53 @@
+name: MPI Galaxy Demo
+
+permissions:
+  contents: read
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+
+concurrency:
+  group: mpi-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  mpi-galaxy:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install dependencies (MPI + HDF5 + Python)
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y mpich libhdf5-dev libomp-dev cmake build-essential python3
+
+      - name: Configure project (CMake)
+        run: |
+          mkdir -p build
+          cd build
+          cmake .. -DCMAKE_BUILD_TYPE=Release -DNEXT_FP64=ON -DNEXT_MPI=ON
+
+      - name: Build project
+        run: |
+          cd build
+          cmake --build . -- -j$(nproc)
+
+      - name: Run Galaxy demo (timed)
+        run: |
+          # Change into the demo directory
+          cd $GITHUB_WORKSPACE/examples/GalaxyDemo
+          python3 galaxy.py
+          # Run with 2 MPI ranks and 1 OpenMP thread; kill the run after 60 s
+          timeout 60s mpiexec -n 2 ../../next galaxy.txt 1 0.01 0.1 hdf5 || true
+
+      - name: Upload simulation outputs
+        uses: actions/upload-artifact@v4
+        with:
+          name: galaxy-dumps
+          path: |
+            examples/GalaxyDemo/dump_*.hdf5
+            examples/GalaxyDemo/dump_*.xdmf
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 452cd0c..562100f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,6 +11,12 @@ if (MINGW)
 elseif (MSVC)
     message(STATUS "Detected MSVC — using MSVC settings")
     set(USING_MSVC TRUE)
+elseif (APPLE)
+    message(STATUS "Detected macOS — limited MPI support")
+    set(USING_APPLE TRUE)
+elseif (UNIX)
+    message(STATUS "Detected Linux/UNIX — using standard settings")
+    set(USING_UNIX TRUE)
 endif()
 
 # ============================
@@ -18,19 +24,15 @@ endif()
 # ============================
 option(NEXT_FP32 "Use 32-bit floats (scalar)" OFF)
 option(NEXT_FP64 "Use 64-bit floats (scalar)" ON)
+option(NEXT_MPI "Enable MPI support" OFF)
 
-# Option: copy final executable to source dir
 option(NEXT_COPY_TO_CMAKE_SOURCE_DIR "Copy final executable from build dir to source dir" ON)
 
 # ============================
 # Precision modes
 # ============================
-set(NEXT_MODES
-    NEXT_FP32
-    NEXT_FP64
-)
+set(NEXT_MODES NEXT_FP32 NEXT_FP64)
 
-# Count enabled modes
 set(NEXT_MODE_COUNT 0)
 foreach(m ${NEXT_MODES})
     if(${m})
@@ -38,14 +40,12 @@ foreach(m ${NEXT_MODES})
     endif()
 endforeach()
 
-# Enforce exactly one mode
 if(NEXT_MODE_COUNT EQUAL 0)
     message(FATAL_ERROR "You must enable NEXT_FP64 or NEXT_FP32.")
 elseif(NEXT_MODE_COUNT GREATER 1)
     message(FATAL_ERROR "Enable only one precision mode.")
 endif()
 
-# Apply compile definitions
 if(NEXT_FP32)
     add_compile_definitions(NEXT_FP32)
 elseif(NEXT_FP64)
@@ -62,7 +62,10 @@ file(GLOB ARGPARSE_FILES ${CMAKE_SOURCE_DIR}/argparse/*.cpp)
 
 add_executable(next ${SRC_FILES} ${ARGPARSE_FILES})
 
-option(ENABLE_VEC_REPORT "Enable compiler vectorization reports" ON)
+# ============================
+# Vectorization reports
+# ============================
+option(ENABLE_VEC_REPORT "Enable compiler vectorization reports" OFF)
 
 if(ENABLE_VEC_REPORT)
     if(CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
@@ -74,38 +77,56 @@ if(ENABLE_VEC_REPORT)
     endif()
 endif()
 
-
 # ============================
 # OpenMP
 # ============================
 find_package(OpenMP QUIET)
 if(OpenMP_CXX_FOUND)
     message(STATUS "OpenMP detected — enabling multithreading.")
-
     if(MSVC)
-        # Force MSVC to use the modern LLVM backend (supports OpenMP 3.0+ and size_t)
-        # We add it as a compiler option because MSVC's default find_package
-        # often defaults to the legacy /openmp flag.
         target_compile_options(next PRIVATE /openmp:llvm)
     else()
         target_link_libraries(next PRIVATE OpenMP::OpenMP_CXX)
     endif()
-
 else()
     message(STATUS "OpenMP not found — building in single-threaded mode.")
 endif()
 
+# ============================
+# MPI detection
+# ============================
+if(NEXT_MPI)
+    if(USING_UNIX)
+        # Linux/UNIX: system MPI (OpenMPI, MPICH, ...)
+        find_package(MPI REQUIRED)
+        message(STATUS "MPI detected — enabling distributed memory parallelism.")
+        target_compile_definitions(next PRIVATE NEXT_MPI)
+        target_link_libraries(next PRIVATE MPI::MPI_CXX)
+
+    elseif(USING_MSVC OR USING_MINGW)
+        # Windows: Microsoft MPI
+        find_package(MPI REQUIRED)
+        message(STATUS "MPI (MS-MPI) detected — enabling distributed memory parallelism.")
+        target_compile_definitions(next PRIVATE NEXT_MPI)
+        target_link_libraries(next PRIVATE MPI::MPI_CXX)
+
+    elseif(USING_APPLE)
+        message(WARNING "MPI requested, but macOS does not ship MPI by default. Please install OpenMPI via Homebrew (brew install open-mpi).")
+        find_package(MPI REQUIRED)
+        target_compile_definitions(next PRIVATE NEXT_MPI)
+        target_link_libraries(next PRIVATE MPI::MPI_CXX)
+    endif()
+endif()
+
 # ============================
 # HDF5 detection
 # ============================
-# MSVC: use vcpkg or prebuilt binaries
 if (USING_MSVC AND DEFINED ENV{HDF5_DIR})
     set(HDF5_ROOT "$ENV{HDF5_DIR}")
     set(CMAKE_PREFIX_PATH "${HDF5_ROOT};${CMAKE_PREFIX_PATH}")
 endif()
 
 if (USING_MINGW)
-    # MSYS2 MinGW installs HDF5 here
     set(CMAKE_PREFIX_PATH "C:/tools/msys64/mingw64")
     message(STATUS "Using MSYS2 MinGW HDF5 path: ${CMAKE_PREFIX_PATH}")
 endif()
diff --git a/argparse/argparse.cpp b/argparse/argparse.cpp
index 9fbd08a..a7611df 100644
--- a/argparse/argparse.cpp
+++ b/argparse/argparse.cpp
@@ -1,33 +1,59 @@
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
+
 #include "argparse.hpp"
 #include <iostream>
 #include <string>
+#ifdef NEXT_MPI
+    #include <mpi.h>
+#endif
 
 namespace next {
 
-Arguments parse_arguments(int argc, char** argv)
+Arguments parse_arguments(int argc, char** argv, int rank)
 {
     if (argc != 6) {
-        std::cerr << "Usage: next <input_file> <threads> <dt> <dump_interval> <format>\n";
+        // Only rank 0 prints usage
+        if (rank == 0) {
+            std::cerr << "Usage: next <input_file> <threads> <dt> <dump_interval> <format>\n";
+        }
+
+#ifdef NEXT_MPI
+        MPI_Finalize();
+#endif
         std::exit(1);
     }
 
     Arguments args;
-    args.input_file = argv[1];
-    args.threads = std::stoi(argv[2]);
-    args.dt = std::stod(argv[3]);
+    args.input_file    = argv[1];
+    args.threads       = std::stoi(argv[2]);
+    args.dt            = std::stod(argv[3]);
     args.dump_interval = std::stod(argv[4]);
 
     std::string fmt = argv[5];
-    if (fmt == "vtk")
+    if (fmt == "vtk") {
         args.format = OutputFormat::VTK;
-    else if (fmt == "vtu")
+    } else if (fmt == "vtu") {
         args.format = OutputFormat::VTU;
-    else if (fmt == "hdf5")
+    } else if (fmt == "hdf5") {
         args.format = OutputFormat::HDF5;
-    else {
-        std::cerr << "Choose a file format: vtk, vtu, or hdf5\n";
+    } else {
+        if (rank == 0) {
+            std::cerr << "Choose a file format: vtk, vtu, or hdf5\n";
+        }
+#ifdef NEXT_MPI
+        MPI_Finalize();
+#endif
         std::exit(1);
     }
diff --git a/argparse/argparse.hpp b/argparse/argparse.hpp
index 3e0f831..b81174d 100644
--- a/argparse/argparse.hpp
+++ b/argparse/argparse.hpp
@@ -17,6 +17,6 @@ struct Arguments {
     OutputFormat format;
 };
 
-Arguments parse_arguments(int argc, char** argv);
+Arguments parse_arguments(int argc, char** argv, int rank);
 
 }
diff --git a/src/begrun.cpp b/src/begrun.cpp
index 47dea18..16f1d89 100644
--- a/src/begrun.cpp
+++ b/src/begrun.cpp
@@ -22,13 +22,42 @@
 #include <iostream>
 #include <string>
 #include <omp.h>
+#ifdef NEXT_MPI
+    #include <mpi.h>
+#endif
+#include "hdf5.h"
 
 using next::OutputFormat;
 
+// Helper: only rank 0, thread 0 prints
+inline void log_once(int rank, const std::string &msg) {
+    if (rank == 0 && omp_get_thread_num() == 0) {
+        std::cout << msg << std::endl;
+    }
+}
+
 int main(int argc, char **argv) {
-    auto args = next::parse_arguments(argc, argv);
+    int rank = 0;
+    int size = 1;
+
+#ifdef NEXT_MPI
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    // Silence stdout for all ranks except 0
+    if (rank != 0) {
+        fclose(stdout);
+    }
+#endif
+
+    // Suppress HDF5's automatic error-stack printing
+    H5Eset_auto(H5E_DEFAULT, nullptr, nullptr);
 
-    static constexpr const char *BANNER = R"NEXTBANNER(
+    auto args = next::parse_arguments(argc, argv, rank);
+
+    static constexpr const char *BANNER = R"NEXTBANNER(
   _   _ ________   _________
  | \ | |  ____\ \ / /__   __|
  |  \| | |__   \ V /   | |
@@ -37,73 +66,71 @@ int main(int argc, char **argv) {
  |_| \_|______/_/ \_\  |_|
 )NEXTBANNER";
 
-    std::cout << BANNER << '\n';
-
-    // Set threads and report
-    omp_set_num_threads(args.threads);
-    std::cout << " Threads: " << args.threads << "\n";
+    omp_set_num_threads(args.threads);
 
+    // Only rank 0 prints startup info
+    if (rank == 0 && omp_get_thread_num() == 0) {
+#ifdef NEXT_MPI
+        std::cout << "MPI mode enabled (" << size << " ranks)" << std::endl;
+#endif
+        std::cout << BANNER << std::endl;
+        std::cout << " Threads: " << args.threads << std::endl;
 #ifdef NEXT_FP64
-    std::cout << " Precision: FP64\n";
+        std::cout << " Precision: FP64" << std::endl;
 #elif defined(NEXT_FP32)
-    std::cout << " Precision: FP32\n";
+        std::cout << " Precision: FP32" << std::endl;
 #endif
+    }
 
-    // --- SoA UPDATE ---
-    // LoadParticlesFromFile now returns a single 'Particle' struct
-    // which internally contains all the parallel vectors.
-    Particle particles = LoadParticlesFromFile(args.input_file);
-
-    std::cout << " Particles: " << particles.size() << "\n";
-
-    real simTime = 0;
-    real nextDump = 0;
-    int step = 0;
-    char command;
-
-    while (true) {
-        // Both computeAdaptiveDt and Step now take the SoA object by reference
-        real dtAdaptive = computeAdaptiveDt(particles, args.dt);
-        Step(particles, dtAdaptive);
-
-        simTime += dtAdaptive;
-
-        if (simTime >= nextDump) {
-            std::string out = "dump_" + std::to_string(step);
-
-            switch (args.format) {
-            case OutputFormat::VTK:
-                out += ".vtk";
-                SaveVTK(particles, out);
-                break;
-
-            case OutputFormat::VTU:
-                out += ".vtu";
-                SaveVTU(particles, out);
-                break;
-
-            case OutputFormat::HDF5:
-                out += ".hdf5";
-                SaveHDF5(particles, out);
-                break;
-            }
-
-            std::cout << "[Dump " << step << "] t = " << simTime
-                      << ", file: " << out << "\n";
-
-            nextDump += args.dump_interval;
-            step++;
+    // Load particles
+    Particle particles = LoadParticlesFromFile(args.input_file);
+    if (rank == 0 && omp_get_thread_num() == 0) {
+        std::cout << " Particles: " << particles.size() << std::endl;
     }
 
-        // Non-blocking exit check
-        if (std::cin.rdbuf()->in_avail() > 0) {
-            std::cin >> command;
-            if (command == 'q' || command == 'Q') {
-                std::cout << "Exiting...\n";
-                break;
-            }
+    real simTime = 0;
+    real nextDump = 0;
+    int step = 0;
+    char command;
+
+    while (true) {
+        real dtAdaptive = computeAdaptiveDt(particles, args.dt);
+        Step(particles, dtAdaptive);
+        simTime += dtAdaptive;
+
+        if (simTime >= nextDump) {
+            std::string out = "dump_" + std::to_string(step);
+
+            switch (args.format) {
+            case OutputFormat::VTK:  out += ".vtk";  SaveVTK(particles, out);  break;
+            case OutputFormat::VTU:  out += ".vtu";  SaveVTU(particles, out);  break;
+            case OutputFormat::HDF5: out += ".hdf5"; SaveHDF5(particles, out); break;
+            }
+
+            if (rank == 0 && omp_get_thread_num() == 0) {
+                std::cout << "[Dump " << step << "] t = " << simTime
+                          << ", file: " << out << std::endl;
+            }
+
+            nextDump += args.dump_interval;
+            step++;
+        }
+
+        // Non-blocking exit check
+        if (std::cin.rdbuf()->in_avail() > 0) {
+            std::cin >> command;
+            if (command == 'q' || command == 'Q') {
+                if (rank == 0 && omp_get_thread_num() == 0) {
+                    std::cout << "Exiting..." << std::endl;
+                }
+                break;
+            }
+        }
     }
-    }
 
-    return 0;
+#ifdef NEXT_MPI
+    MPI_Finalize();
+#endif
+
+    return 0;
 }
diff --git a/src/gravity/step.h b/src/gravity/step.h
index 96395d3..d57f36f 100644
--- a/src/gravity/step.h
+++ b/src/gravity/step.h
@@ -16,82 +16,169 @@
 #include <algorithm>
 #include <memory>
 #include <vector>
+#ifdef NEXT_MPI
+    #include <mpi.h>
+#endif
 
-/**
- * @brief Performs a complete Leapfrog Step (Kick-Drift-Kick) using SoA data.
- */
+/**
+ * @brief Performs a complete leapfrog step (kick-drift-kick) on SoA data.
+ *        With NEXT_MPI, each rank integrates its contiguous block [start, end)
+ *        and the updated slices are exchanged with in-place MPI_Iallgatherv.
+ */
 inline void Step(ParticleSystem &ps, real dt) {
     if (ps.size() == 0) return;
 
-    const real theta = 0.5;
-    const real half = dt * real(0.5);
-    const int N = static_cast<int>(ps.size());
+    const real theta = real(0.5);
+    const real half  = dt * real(0.5);
+    const int  N     = static_cast<int>(ps.size());
+
+#ifdef NEXT_MPI
+    int rank, size;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    MPI_Datatype MPI_REAL_T;
+#  ifdef NEXT_FP64
+    MPI_REAL_T = MPI_DOUBLE;
+#  elif defined(NEXT_FP32)
+    MPI_REAL_T = MPI_FLOAT;
+#  else
+#    error "Define NEXT_FP32 or NEXT_FP64 for 'real' type."
+#  endif
+#else
+    int rank = 0;
+    int size = 1;
+#endif
+
+    const int start = (rank * N) / size;
+    const int end   = ((rank + 1) * N) / size;
+
+#ifdef NEXT_MPI
+    std::vector<int> counts(size), displs(size);
+    for (int r = 0; r < size; ++r) {
+        const int s = (r * N) / size;
+        const int e = ((r + 1) * N) / size;
+        counts[r] = e - s;
+        displs[r] = s;
+    }
+#endif
 
-    // Helper lambda to build the tree using the ParticleSystem indices
     auto buildTree = [&]() -> std::unique_ptr {
-        real minx = 1e30, miny = 1e30, minz = 1e30;
-        real maxx = -1e30, maxy = -1e30, maxz = -1e30;
+        struct BBox { real minx, miny, minz, maxx, maxy, maxz; };
+        BBox local{ real(1e30),  real(1e30),  real(1e30),
+                    real(-1e30), real(-1e30), real(-1e30) };
 
-        // Bounding box calculation (SoA access is very fast here)
         for (int i = 0; i < N; ++i) {
-            minx = std::min(minx, ps.x[i]); miny = std::min(miny, ps.y[i]); minz = std::min(minz, ps.z[i]);
-            maxx = std::max(maxx, ps.x[i]); maxy = std::max(maxy, ps.y[i]); maxz = std::max(maxz, ps.z[i]);
+            local.minx = std::min(local.minx, ps.x[i]);
+            local.miny = std::min(local.miny, ps.y[i]);
+            local.minz = std::min(local.minz, ps.z[i]);
+            local.maxx = std::max(local.maxx, ps.x[i]);
+            local.maxy = std::max(local.maxy, ps.y[i]);
+            local.maxz = std::max(local.maxz, ps.z[i]);
         }
 
-        real cx = (minx + maxx) * 0.5;
-        real cy = (miny + maxy) * 0.5;
-        real cz = (minz + maxz) * 0.5;
-        real size = std::max({maxx - minx, maxy - miny, maxz - minz}) * 0.5;
+#ifdef NEXT_MPI
+        real mins[3] = {local.minx, local.miny, local.minz};
+        real maxs[3] = {local.maxx, local.maxy, local.maxz};
+
+        MPI_Allreduce(MPI_IN_PLACE, mins, 3, MPI_REAL_T, MPI_MIN, MPI_COMM_WORLD);
+        MPI_Allreduce(MPI_IN_PLACE, maxs, 3, MPI_REAL_T, MPI_MAX, MPI_COMM_WORLD);
+
+        BBox global{mins[0], mins[1], mins[2], maxs[0], maxs[1], maxs[2]};
+#else
+        BBox global = local;
+#endif
 
-        if (size <= 0) size = 1.0;
+        const real cx = (global.minx + global.maxx) * real(0.5);
+        const real cy = (global.miny + global.maxy) * real(0.5);
+        const real cz = (global.minz + global.maxz) * real(0.5);
+        real size = std::max({global.maxx - global.minx,
+                              global.maxy - global.miny,
+                              global.maxz - global.minz}) * real(0.5);
+
+        if (size <= real(0)) size = real(1.0);
 
         auto root = std::make_unique(cx, cy, cz, size);
 
-        // Insert particle indices 0 to N-1
-        for (int i = 0; i < N; ++i) {
+        for (int i = 0; i < N; ++i)
             root->insert(i, ps);
-        }
 
         root->computeMass(ps);
         return root;
     };
 
-    // --- First Kick (dt/2) ---
+    // First kick (dt/2)
     {
-        std::unique_ptr root = buildTree();
+        auto root = buildTree();
 
 #pragma omp parallel for schedule(dynamic, 64)
-        for (int i = 0; i < N; ++i) {
-            real ax = 0, ay = 0, az = 0;
+        for (int i = start; i < end; ++i) {
+            real ax = real(0), ay = real(0), az = real(0);
             bhAccel(root.get(), i, ps, theta, ax, ay, az);
             ps.vx[i] += ax * half;
             ps.vy[i] += ay * half;
             ps.vz[i] += az * half;
         }
+
+#ifdef NEXT_MPI
+        MPI_Request reqs[3];
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                        ps.vx.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs[0]);
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                        ps.vy.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs[1]);
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                        ps.vz.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs[2]);
+        MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE);
+#endif
     }
 
-    // --- Drift (dt) ---
-    // Contiguous memory access makes this loop ideal for SIMD
+    // Drift (dt)
 #pragma omp parallel for schedule(static)
-    for (int i = 0; i < N; ++i) {
+    for (int i = start; i < end; ++i) {
         ps.x[i] += ps.vx[i] * dt;
         ps.y[i] += ps.vy[i] * dt;
         ps.z[i] += ps.vz[i] * dt;
     }
 
-    // --- Second Kick (dt/2) ---
+#ifdef NEXT_MPI
+    MPI_Request reqs3[3];
+    MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                    ps.x.data(), counts.data(), displs.data(), MPI_REAL_T,
+                    MPI_COMM_WORLD, &reqs3[0]);
+    MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                    ps.y.data(), counts.data(), displs.data(), MPI_REAL_T,
+                    MPI_COMM_WORLD, &reqs3[1]);
+    MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                    ps.z.data(), counts.data(), displs.data(), MPI_REAL_T,
+                    MPI_COMM_WORLD, &reqs3[2]);
+    MPI_Waitall(3, reqs3, MPI_STATUSES_IGNORE);
+#endif
+
+    // Second kick (dt/2)
     {
-        std::unique_ptr root = buildTree();
+        auto root = buildTree();
 
 #pragma omp parallel for schedule(dynamic, 64)
-        for (int i = 0; i < N; ++i) {
-            real ax = 0, ay = 0, az = 0;
+        for (int i = start; i < end; ++i) {
+            real ax = real(0), ay = real(0), az = real(0);
             bhAccel(root.get(), i, ps, theta, ax, ay, az);
             ps.vx[i] += ax * half;
             ps.vy[i] += ay * half;
             ps.vz[i] += az * half;
         }
+
+#ifdef NEXT_MPI
+        MPI_Request reqs4[3];
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                        ps.vx.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs4[0]);
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                        ps.vy.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs4[1]);
+        MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_REAL_T,
+                        ps.vz.data(), counts.data(), displs.data(), MPI_REAL_T,
+                        MPI_COMM_WORLD, &reqs4[2]);
+        MPI_Waitall(3, reqs4, MPI_STATUSES_IGNORE);
+#endif
     }
 }
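
For reference, below is a minimal, self-contained sketch (not part of the patch) of the exchange pattern Step() relies on: each rank updates only its block [start, end) of an array, and an in-place MPI_Iallgatherv then distributes every block to every rank. The array length N, the fill values, and the use of double instead of the project's real typedef are assumptions made purely for illustration.

// Build and run (assumed): mpic++ allgather_sketch.cpp -o sketch && mpiexec -n 2 ./sketch
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank = 0, size = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    const int N = 8;                      // total element count (illustrative)
    std::vector<double> x(N, 0.0);

    // Same block decomposition as Step(): rank r owns [start, end).
    const int start = (rank * N) / size;
    const int end   = ((rank + 1) * N) / size;

    std::vector<int> counts(size), displs(size);
    for (int r = 0; r < size; ++r) {
        counts[r] = ((r + 1) * N) / size - (r * N) / size;
        displs[r] = (r * N) / size;
    }

    // Each rank writes only its own slice ...
    for (int i = start; i < end; ++i)
        x[i] = rank + 1;

    // ... then an in-place non-blocking allgatherv makes every slice visible on every rank.
    MPI_Request req;
    MPI_Iallgatherv(MPI_IN_PLACE, 0, MPI_DOUBLE,
                    x.data(), counts.data(), displs.data(), MPI_DOUBLE,
                    MPI_COMM_WORLD, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (rank == 0)
        for (int i = 0; i < N; ++i)
            std::printf("x[%d] = %g\n", i, x[i]);

    MPI_Finalize();
    return 0;
}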