From 08d04fb0d7f098e415361cee7c2396993b1d6a31 Mon Sep 17 00:00:00 2001 From: Gerard Gorman Date: Thu, 19 Feb 2026 17:15:20 -0600 Subject: [PATCH] Fix blockinner+OpenMP compilation and blocklevels=2 with high-order stencils Two bugs fixed: 1. blockinner+OpenMP: When a loop had both uindices (e.g. Modulo for time stepping) and OpenMP pragmas, the generated for-loop header contained multiple variables in the init clause, violating OpenMP canonical form. Fix: compute uindex values inside the loop body when pragmas are present, keeping them thread-private. 2. blocklevels=2 with space_order>=4: BlockDimension._arg_values crashed with AttributeError when the parent block's step was a literal Integer (not a Symbol). Fix: catch AttributeError and use int(self.parent.step) directly. Added two new tests: - test_cache_blocking_openmp_execution: verifies blockinner+OpenMP compiles and produces correct results (not just structure checks) - test_cache_blocking_hierarchical_high_order: verifies blocklevels=2 with space_order=8 executes correctly --- devito/ir/iet/visitors.py | 39 ++++++++++++++++++++++++--- devito/types/dimension.py | 16 ++++++++--- tests/test_dle.py | 57 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 6 deletions(-) diff --git a/devito/ir/iet/visitors.py b/devito/ir/iet/visitors.py index b450960354..ffa17535ca 100644 --- a/devito/ir/iet/visitors.py +++ b/devito/ir/iet/visitors.py @@ -647,14 +647,47 @@ def visit_Iteration(self, o): loop_inc = f'{o.index} += {o.limits[2]}' # Append unbounded indices, if any - if o.uindices: - uinit = [f'{i.name} = {self.ccode(i.symbolic_min)}' for i in o.uindices] + if o.uindices and o.pragmas: + # When pragmas are present (e.g., OpenMP parallel for), the + # for-loop header must be in canonical form: a single loop + # variable in init/cond/incr. Compute uindex values inside + # the loop body instead, where they are automatically + # thread-private + uinit_stmts = [] + for i in o.uindices: + if i.is_Modulo: + # Modulo dimensions: symbolic_incr computes the + # value from the parent dimension (the loop variable) + value = i.symbolic_incr + else: + # IncrDimension: derive value from loop variable + # position relative to loop start + if o.direction == Backward: + n_iters = _max - o.dim + else: + n_iters = o.dim - _min + _step = o.limits[2] + if _step != 1: + n_iters = n_iters / _step + if i.symbolic_incr != 1: + value = i.symbolic_min + n_iters * i.symbolic_incr + else: + value = i.symbolic_min + n_iters + uinit_stmts.append( + c.Statement(f'int {i.name} = {self.ccode(value)}') + ) + body = list(uinit_stmts) + list(body) + elif o.uindices: + uinit = [f'{i.name} = {self.ccode(i.symbolic_min)}' + for i in o.uindices] loop_init = c.Line(', '.join([loop_init] + uinit)) ustep = [] for i in o.uindices: op = '=' if i.is_Modulo else '+=' - ustep.append(f'{i.name} {op} {self.ccode(i.symbolic_incr)}') + ustep.append( + f'{i.name} {op} {self.ccode(i.symbolic_incr)}' + ) loop_inc = c.Line(', '.join([loop_inc] + ustep)) # Create For header+body diff --git a/devito/types/dimension.py b/devito/types/dimension.py index 4010472bef..513c381f64 100644 --- a/devito/types/dimension.py +++ b/devito/types/dimension.py @@ -1344,7 +1344,11 @@ def _arg_values(self, interval, grid=None, args=None, **kwargs): elif isinstance(self.parent, BlockDimension): # `self` is a BlockDimension within an outer BlockDimension, but # no value supplied -> the sub-block will span the entire block - return {name: args[self.parent.step.name]} + try: + return {name: args[self.parent.step.name]} + except AttributeError: + # Parent's step is a literal (e.g. Integer), use it directly + return {name: int(self.parent.step)} else: # TODO": Check the args for space order and apply heuristics (e.g., # `2*space_order`?) for even better block sizes @@ -1365,12 +1369,18 @@ def _arg_check(self, args, *_args): value = args[name] if isinstance(self.parent, BlockDimension): # sub-BlockDimensions must be perfect divisors of their parent - parent_value = args[self.parent.step.name] + try: + parent_step_name = self.parent.step.name + parent_value = args[parent_step_name] + except AttributeError: + # Parent's step is a literal (e.g. Integer) + parent_step_name = str(self.parent.step) + parent_value = int(self.parent.step) if parent_value % value > 0: raise InvalidArgument( f'Illegal block size `{name}={value}`: sub-block sizes ' 'must divide the parent block size evenly ' - f'(`{self.parent.step.name}={parent_value}`)' + f'(`{parent_step_name}={parent_value}`)' ) else: if value < 0: diff --git a/tests/test_dle.py b/tests/test_dle.py index ced0ad9937..0673522272 100644 --- a/tests/test_dle.py +++ b/tests/test_dle.py @@ -109,6 +109,35 @@ def test_cache_blocking_structure(blockinner, openmp, expected): assert 'omp for' in trees[0][1].pragmas[0].ccode.value +@skipif('noomp') +@pytest.mark.parametrize("blockinner", [False, True]) +def test_cache_blocking_openmp_execution(blockinner): + """ + Verify that blockinner + OpenMP actually compiles and produces + correct results. This catches canonical-form violations in the + generated for-loop headers (e.g. uindices in the init clause). + """ + grid = Grid(shape=(12, 12, 12), dtype=np.float64) + + u = TimeFunction(name='u', grid=grid, space_order=4) + v = TimeFunction(name='v', grid=grid, space_order=4) + + eqns = [Eq(u.forward, u.laplace + 1.)] + + op0 = Operator(eqns, opt='noop') + op1 = Operator(eqns, opt=('advanced', {'openmp': True, + 'blockinner': blockinner, + 'par-collapse-ncores': 1})) + + u.data[:] = 0.1 + op0(time_M=5) + + v.data[:] = 0.1 + op1(u=v, time_M=5) + + assert np.allclose(u.data, v.data, rtol=1e-12) + + def test_cache_blocking_structure_subdims(): """ Test that: @@ -514,6 +543,34 @@ def test_cache_blocking_hierarchical(blockshape0, blockshape1, exception): raise AssertionError('Assert False') from e +@pytest.mark.parametrize("blockinner", [False, True]) +def test_cache_blocking_hierarchical_high_order(blockinner): + """ + Verify that blocklevels=2 works correctly with high space_order + stencils. High-order stencils produce literal Integer step sizes + in BlockDimension._arg_values, exercising a different code path + than low-order stencils. + """ + grid = Grid(shape=(45, 45, 45), dtype=np.float64) + + u = TimeFunction(name='u', grid=grid, time_order=1, space_order=8) + v = TimeFunction(name='v', grid=grid, time_order=1, space_order=8) + + # Use a small coefficient to keep the stencil stable + eqns = [Eq(u.forward, u + 1e-4 * u.laplace)] + + op0 = Operator(eqns, opt='noop') + op1 = Operator(eqns, opt=('advanced', {'blockinner': blockinner, + 'blocklevels': 2})) + + u.data[0, :] = np.linspace(-1, 1, 45**3).reshape((45, 45, 45)) + v.data[0, :] = u.data[0, :] + op0(u=u, time_M=10) + op1(u=v, time_M=10) + + assert np.allclose(u.data, v.data, rtol=1e-12) + + @pytest.mark.parametrize("blockinner", [False, True]) def test_cache_blocking_imperfect_nest(blockinner): """