chore: custom kernel launcher API to remove macro #6112
Conversation
Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
CodSpeed Performance Report
Merging this PR will degrade performance by 29.68%
Comparing
| | Mode | Benchmark | BASE | HEAD | Efficiency |
|---|---|---|---|---|---|
| ❌ | WallTime | u64_FoR[10K] | 9.7 µs | 13.8 µs | -29.68% |
| ❌ | WallTime | u16_FoR[1K] | 5.6 µs | 6.9 µs | -17.79% |
| ❌ | WallTime | u16_FoR[10M] | 9.7 µs | 11.4 µs | -14.41% |
| ⚡ | Simulation | canonical_into_nullable[(10000, 10, 0.0)] | 528.8 µs | 444.6 µs | +18.93% |
| ❌ | Simulation | into_canonical_non_nullable[(10000, 100, 0.01)] | 2.2 ms | 3 ms | -26.73% |
| ❌ | Simulation | into_canonical_non_nullable[(10000, 100, 0.0)] | 1.9 ms | 2.7 ms | -29.35% |
| ❌ | Simulation | into_canonical_non_nullable[(10000, 100, 0.1)] | 3.8 ms | 4.6 ms | -17.82% |
| ⚡ | Simulation | into_canonical_nullable[(10000, 10, 0.0)] | 537.4 µs | 452.3 µs | +18.82% |
| ⚡ | Simulation | into_canonical_nullable[(10000, 10, 0.1)] | 710.5 µs | 632.3 µs | +12.37% |
| ❌ | Simulation | into_canonical_nullable[(10000, 100, 0.0)] | 4.4 ms | 5.2 ms | -15.77% |
| ❌ | Simulation | patched_take_10k_contiguous_not_patches | 1.2 ms | 1.4 ms | -10.19% |
| ❌ | Simulation | canonical_into_non_nullable[(10000, 1, 0.01)] | 36 µs | 44.2 µs | -18.45% |
| ❌ | Simulation | patched_take_10k_contiguous_patches | 2 ms | 2.5 ms | -16.83% |
| ❌ | Simulation | canonical_into_non_nullable[(10000, 1, 0.1)] | 52 µs | 60.2 µs | -13.71% |
| ❌ | Simulation | canonical_into_non_nullable[(10000, 1, 0.0)] | 30.9 µs | 39 µs | -20.79% |
| ❌ | Simulation | canonical_into_non_nullable[(10000, 100, 0.0)] | 1.9 ms | 2.7 ms | -29.5% |
| ❌ | Simulation | canonical_into_non_nullable[(10000, 100, 0.01)] | 2.1 ms | 2.9 ms | -27.4% |
| ❌ | Simulation | canonical_into_non_nullable[(10000, 100, 0.1)] | 3.7 ms | 4.5 ms | -18.03% |
Footnotes
- 1254 benchmarks were skipped, so the baseline results were used instead. If they were deleted from the codebase, click here and archive them to remove them from the performance reports. ↩
| //! let events = KernelLauncher::new(ctx, "for", &[array.ptype()])? | ||
| //! .arg_view(&cuda_view) | ||
| //! .arg(&reference) | ||
| //! .arg(&array_len) | ||
| //! .event_flags(CU_EVENT_DISABLE_TIMING) | ||
| //! .launch(array.len())?; |
| pub fn new( | ||
| ctx: &'a CudaExecutionCtx, | ||
| module_name: &str, | ||
| ptypes: &[PType], |
There was a problem hiding this comment.
This is not enough for non-primitive types?
| /// - Integers: u8, u16, u32, u64, i8, i16, i32, i64 | ||
| /// - Floats: f32, f64 |
There was a problem hiding this comment.
What about str literal?
| // The _sync guard is dropped immediately, but that's fine since we're just | ||
| // reading the pointer value, not scheduling any work yet. | ||
| let (device_ptr, _sync) = view.device_ptr(self.stream); | ||
| self.storage.push(device_ptr); |
There was a problem hiding this comment.
what keeps the view alive?
@joseph-isaacs this is a Claude draft, wdyt? I'll do a cleanup if we think this is the right direction.