Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 22 additions & 7 deletions crates/core_arch/src/macros.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,12 +237,12 @@ macro_rules! deinterleaving_load {
($elem:ty, $lanes:literal, 2, $ptr:expr) => {{
use $crate::core_arch::macros::deinterleave_mask;
use $crate::core_arch::simd::Simd;
use $crate::{mem::transmute, ptr};
use $crate::mem::transmute;

type V = Simd<$elem, $lanes>;
type W = Simd<$elem, { $lanes * 2 }>;

let w: W = ptr::read_unaligned($ptr as *const W);
let w: W = $crate::ptr::read_unaligned($ptr as *const W);

let v0: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 2, 0>());
let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 2, 1>());
Expand All @@ -253,12 +253,20 @@ macro_rules! deinterleaving_load {
($elem:ty, $lanes:literal, 3, $ptr:expr) => {{
use $crate::core_arch::macros::deinterleave_mask;
use $crate::core_arch::simd::Simd;
use $crate::{mem::transmute, ptr};
use $crate::mem::{MaybeUninit, transmute};

type V = Simd<$elem, $lanes>;
type W = Simd<$elem, { $lanes * 3 }>;

let w: W = ptr::read_unaligned($ptr as *const W);
// NOTE: repr(simd) adds padding to make the total size a power of two.
// Hence reading W from ptr might read out of bounds.
let mut mem = MaybeUninit::<W>::uninit();
$crate::ptr::copy_nonoverlapping(
$ptr.cast::<$elem>(),
mem.as_mut_ptr().cast::<$elem>(),
$lanes * 3,
);
Comment on lines +264 to +268
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

copy_nonoverlapping requires aligned pointers, so shouldn't this use read_unaligned($ptr) before writing to mem?

Same with the store needing to use write_unaligned rather than copy_nonoverlapping because that would require dst to be aligned.

Copy link
Contributor

@sayantn sayantn Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As this is copying elementwise, it should be ok if the pointer is element-aligned. I am not too familiar with ARM load semantics, is the pointer at least guaranteed to be element-aligned (I guess that's the case just from the type). @folkertdev could you just confirm this please?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pointer should be aligned to the element type, that is assumed by the implementation. You can see that e.g. in the Operation section here:

https://developer.arm.com/architectures/instruction-sets/intrinsics/vld3q_f16

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok in that case it lgtm

let w = mem.assume_init();

let v0: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 3, 0>());
let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 3, 1>());
Expand All @@ -270,12 +278,12 @@ macro_rules! deinterleaving_load {
($elem:ty, $lanes:literal, 4, $ptr:expr) => {{
use $crate::core_arch::macros::deinterleave_mask;
use $crate::core_arch::simd::Simd;
use $crate::{mem::transmute, ptr};
use $crate::mem::transmute;

type V = Simd<$elem, $lanes>;
type W = Simd<$elem, { $lanes * 4 }>;

let w: W = ptr::read_unaligned($ptr as *const W);
let w: W = $crate::ptr::read_unaligned($ptr as *const W);

let v0: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 4, 0>());
let v1: V = simd_shuffle!(w, w, deinterleave_mask::<$lanes, 4, 1>());
Expand Down Expand Up @@ -322,8 +330,15 @@ macro_rules! interleaving_store {
simd_shuffle!($v.2, $v.2, identity::<{ $lanes * 2 }>());

type W = Simd<$elem, { $lanes * 3 }>;

// NOTE: repr(simd) adds padding to make the total size a power of two.
// Hence writing W to ptr might write out of bounds.
let w: W = simd_shuffle!(v0v1, v2v2, interleave_mask::<{ $lanes * 3 }, $lanes, 3>());
$crate::ptr::write_unaligned($ptr as *mut W, w);
$crate::ptr::copy_nonoverlapping(
(&w as *const W).cast::<$elem>(),
$ptr.cast::<$elem>(),
$lanes * 3,
);
}};

// N = 4
Expand Down