diff --git a/Makefile b/Makefile index d25cd4a..904aa82 100644 --- a/Makefile +++ b/Makefile @@ -42,9 +42,9 @@ endif TARGET_ARCH := $(shell uname -m) ifeq ($(TARGET_ARCH),aarch64) - SMS ?= 53 61 70 72 75 80 86 87 90 + SMS ?= 75 80 86 87 89 90 120 else - SMS ?= 52 60 70 75 80 86 90 + SMS ?= 75 80 86 89 90 120 endif # Generate SASS code for each SM architecture listed in $(SMS) diff --git a/Makefile.config b/Makefile.config index a981d0f..aee79d7 100644 --- a/Makefile.config +++ b/Makefile.config @@ -1,4 +1,4 @@ -CUDA_PATH ?= /usr/local/cuda-12.1 +CUDA_PATH ?= /usr/local/cuda SANALYZER_DIR ?= ../sanalyzer diff --git a/gpu_src/gpu_patch_block_divergence_analysis.cu b/gpu_src/gpu_patch_block_divergence_analysis.cu new file mode 100644 index 0000000..0293185 --- /dev/null +++ b/gpu_src/gpu_patch_block_divergence_analysis.cu @@ -0,0 +1,151 @@ +#include "gpu_patch.h" + +#include + +#include "gpu_utils.h" +#include + +static __device__ __inline__ +uint32_t GetBufferIndex(MemoryAccessTracker* pTracker) { + uint32_t idx = MEMORY_ACCESS_BUFFER_SIZE; + + while (idx >= MEMORY_ACCESS_BUFFER_SIZE) { + idx = atomicAdd(&(pTracker->currentEntry), 1); + + if (idx >= MEMORY_ACCESS_BUFFER_SIZE) { + // buffer is full, wait for last writing thread to flush + while (*(volatile uint32_t*)&(pTracker->currentEntry) >= MEMORY_ACCESS_BUFFER_SIZE); + } + } + + return idx; +} + +static __device__ __inline__ +void IncrementNumEntries(MemoryAccessTracker* pTracker) { + DoorBell* doorbell = pTracker->doorBell; + __threadfence(); + const uint32_t numEntries = atomicAdd((int*)&(pTracker->numEntries), 1); + + if (numEntries == MEMORY_ACCESS_BUFFER_SIZE - 1) { + // make sure everything is visible in memory + __threadfence_system(); + doorbell->full = true; + while (doorbell->full); + + pTracker->numEntries = 0; + __threadfence(); + pTracker->currentEntry = 0; + } +} + +static __device__ +SanitizerPatchResult CommonCallback( + void* userdata, + uint64_t pc, + void* ptr, + uint32_t accessSize, + uint32_t flags, + MemoryType type) +{ + auto* pTracker = (MemoryAccessTracker*)userdata; + + uint32_t active_mask = __activemask(); + uint32_t laneid = get_laneid(); + uint32_t first_laneid = __ffs(active_mask) - 1; + + MemoryAccess* accesses = nullptr; + + if (laneid == first_laneid) { + uint32_t idx = GetBufferIndex(pTracker); + accesses = &pTracker->access_buffer[idx]; + accesses->accessSize = accessSize; + accesses->flags = flags; + accesses->warpId = get_warpid(); + accesses->ctaId = get_ctaid_as_uint64(); + accesses->pc = pc; + accesses->active_mask = active_mask; + accesses->type = type; + } + + __syncwarp(active_mask); + + accesses = (MemoryAccess*) shfl((uint64_t)accesses, first_laneid, active_mask); + if (accesses) { + accesses->addresses[laneid] = (uint64_t)(uintptr_t)ptr; + } + + __syncwarp(active_mask); + + if (laneid == first_laneid) { + IncrementNumEntries(pTracker); + } + + return SANITIZER_PATCH_SUCCESS; +} + +extern "C" __device__ __noinline__ +SanitizerPatchResult MemoryGlobalAccessCallback( + void* userdata, + uint64_t pc, + void* ptr, + uint32_t accessSize, + uint32_t flags, + const void *pData) +{ + return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Global); +} + +extern "C" __device__ __noinline__ +SanitizerPatchResult MemorySharedAccessCallback( + void* userdata, + uint64_t pc, + void* ptr, + uint32_t accessSize, + uint32_t flags, + const void *pData) +{ + return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Shared); +} + +extern "C" __device__ __noinline__ +SanitizerPatchResult MemoryLocalAccessCallback( + void* userdata, + uint64_t pc, + void* ptr, + uint32_t accessSize, + uint32_t flags, + const void *pData) +{ + return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Local); +} + +extern "C" __device__ __noinline__ +SanitizerPatchResult MemcpyAsyncCallback(void* userdata, uint64_t pc, void* src, uint32_t dst, uint32_t accessSize) +{ + if (src) + { + CommonCallback(userdata, pc, src, accessSize, SANITIZER_MEMORY_DEVICE_FLAG_READ, MemoryType::Global); + } + + return CommonCallback(userdata, pc, (void*)dst, accessSize, SANITIZER_MEMORY_DEVICE_FLAG_WRITE, MemoryType::Shared); +} + +extern "C" __device__ __noinline__ +SanitizerPatchResult BlockExitCallback(void* userdata, uint64_t pc) +{ + MemoryAccessTracker* tracker = (MemoryAccessTracker*)userdata; + DoorBell* doorbell = tracker->doorBell; + + uint32_t active_mask = __activemask(); + uint32_t laneid = get_laneid(); + uint32_t first_laneid = __ffs(active_mask) - 1; + int32_t pop_count = __popc(active_mask); + + if (laneid == first_laneid) { + atomicAdd((int*)&doorbell->num_threads, -pop_count); + } + __syncwarp(active_mask); + + return SANITIZER_PATCH_SUCCESS; +} diff --git a/gpu_src/gpu_patch_heatmap_analysis.cu b/gpu_src/gpu_patch_heatmap_analysis.cu new file mode 100644 index 0000000..d9d94b4 --- /dev/null +++ b/gpu_src/gpu_patch_heatmap_analysis.cu @@ -0,0 +1,172 @@ +#include "gpu_patch.h" + +#include + +#include "gpu_utils.h" +#include + +static __device__ __inline__ +bool BypassCheckByCtaId(uint32_t block_idx, uint32_t block_idy, uint32_t block_idz) { + int4 ctaid = get_ctaid(); + if(ctaid.x != block_idx || ctaid.y != block_idy || ctaid.z != block_idz) { + return true; + } + return false; +} + + +static __device__ __inline__ +uint32_t GetBufferIndex(MemoryAccessTracker* pTracker) { + uint32_t idx = MEMORY_ACCESS_BUFFER_SIZE; + + while (idx >= MEMORY_ACCESS_BUFFER_SIZE) { + idx = atomicAdd(&(pTracker->currentEntry), 1); + + if (idx >= MEMORY_ACCESS_BUFFER_SIZE) { + // buffer is full, wait for last writing thread to flush + while (*(volatile uint32_t*)&(pTracker->currentEntry) >= MEMORY_ACCESS_BUFFER_SIZE); + } + } + + return idx; +} + +static __device__ __inline__ +void IncrementNumEntries(MemoryAccessTracker* pTracker) { + DoorBell* doorbell = pTracker->doorBell; + __threadfence(); + const uint32_t numEntries = atomicAdd((int*)&(pTracker->numEntries), 1); + + if (numEntries == MEMORY_ACCESS_BUFFER_SIZE - 1) { + // make sure everything is visible in memory + __threadfence_system(); + doorbell->full = true; + while (doorbell->full); + + pTracker->numEntries = 0; + __threadfence(); + pTracker->currentEntry = 0; + } +} + +static __device__ +SanitizerPatchResult CommonCallback( + void* userdata, + uint64_t pc, + void* ptr, + uint32_t accessSize, + uint32_t flags, + MemoryType type) +{ + auto* pTracker = (MemoryAccessTracker*)userdata; + if(BypassCheckByCtaId(pTracker->target_block[0], pTracker->target_block[1], pTracker->target_block[2])) { + return SANITIZER_PATCH_SUCCESS; + } + + uint32_t active_mask = __activemask(); + uint32_t laneid = get_laneid(); + uint32_t first_laneid = __ffs(active_mask) - 1; + + MemoryAccess* accesses = nullptr; + + if (laneid == first_laneid) { + uint32_t idx = GetBufferIndex(pTracker); + accesses = &pTracker->access_buffer[idx]; + accesses->accessSize = accessSize; + accesses->flags = flags; + accesses->warpId = get_warpid(); + accesses->type = type; + accesses->pc = pc; + accesses->active_mask = active_mask; + } + + __syncwarp(active_mask); + + accesses = (MemoryAccess*) shfl((uint64_t)accesses, first_laneid, active_mask); + if (accesses) { + if(type == MemoryType::Local){ + accesses->addresses[laneid] = (((uint64_t) get_warpid() * GPU_WARP_SIZE + get_laneid()) << 54) |((uint64_t)(uintptr_t)ptr); // use high 10 bits to store thread id for local memory access + } else { + accesses->addresses[laneid] = (uint64_t)(uintptr_t)ptr; + } + } + + __syncwarp(active_mask); + + if (laneid == first_laneid) { + IncrementNumEntries(pTracker); + } + + return SANITIZER_PATCH_SUCCESS; +} + +extern "C" __device__ __noinline__ +SanitizerPatchResult MemoryGlobalAccessCallback( + void* userdata, + uint64_t pc, + void* ptr, + uint32_t accessSize, + uint32_t flags, + const void *pData) +{ + return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Global); +} + +extern "C" __device__ __noinline__ +SanitizerPatchResult MemorySharedAccessCallback( + void* userdata, + uint64_t pc, + void* ptr, + uint32_t accessSize, + uint32_t flags, + const void *pData) +{ + return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Shared); +} + +extern "C" __device__ __noinline__ +SanitizerPatchResult MemoryLocalAccessCallback( + void* userdata, + uint64_t pc, + void* ptr, + uint32_t accessSize, + uint32_t flags, + const void *pData) +{ + return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Local); +} + +//For the future use of async memcpy +extern "C" __device__ __noinline__ +SanitizerPatchResult MemcpyAsyncCallback(void* userdata, uint64_t pc, void* src, uint32_t dst, uint32_t accessSize) +{ + if (src) + { + CommonCallback(userdata, pc, src, accessSize, SANITIZER_MEMORY_DEVICE_FLAG_READ, MemoryType::Global); + } + + return CommonCallback(userdata, pc, (void*)dst, accessSize, SANITIZER_MEMORY_DEVICE_FLAG_WRITE, MemoryType::Shared); +} + +extern "C" __device__ __noinline__ +SanitizerPatchResult BlockExitCallback(void* userdata, uint64_t pc) +{ + + MemoryAccessTracker* tracker = (MemoryAccessTracker*)userdata; + DoorBell* doorbell = tracker->doorBell; + if(BypassCheckByCtaId(tracker->target_block[0], tracker->target_block[1], tracker->target_block[2])) { + return SANITIZER_PATCH_SUCCESS; + } + + uint32_t active_mask = __activemask(); + uint32_t laneid = get_laneid(); + uint32_t first_laneid = __ffs(active_mask) - 1; + int32_t pop_count = __popc(active_mask); + + if (laneid == first_laneid) { + atomicAdd((int*)&doorbell->num_threads, -pop_count); + } + __syncwarp(active_mask); + + return SANITIZER_PATCH_SUCCESS; +} diff --git a/gpu_src/gpu_patch_pc_dependency.cu b/gpu_src/gpu_patch_pc_dependency.cu new file mode 100644 index 0000000..d560513 --- /dev/null +++ b/gpu_src/gpu_patch_pc_dependency.cu @@ -0,0 +1,164 @@ +#include "gpu_patch.h" + +#include + +#include "gpu_utils.h" +#include + + +static __device__ __inline__ +uint32_t GetBufferIndex(MemoryAccessTracker* pTracker) { + uint32_t idx = MEMORY_ACCESS_BUFFER_SIZE; + + while (idx >= MEMORY_ACCESS_BUFFER_SIZE) { + idx = atomicAdd(&(pTracker->currentEntry), 1); + + if (idx >= MEMORY_ACCESS_BUFFER_SIZE) { + // buffer is full, wait for last writing thread to flush + while (*(volatile uint32_t*)&(pTracker->currentEntry) >= MEMORY_ACCESS_BUFFER_SIZE); + } + } + + return idx; +} + +static __device__ __inline__ +void IncrementNumEntries(MemoryAccessTracker* pTracker) { + DoorBell* doorbell = pTracker->doorBell; + __threadfence(); + const uint32_t numEntries = atomicAdd((int*)&(pTracker->numEntries), 1); + + if (numEntries == MEMORY_ACCESS_BUFFER_SIZE - 1) { + // make sure everything is visible in memory + __threadfence_system(); + doorbell->full = true; + while (doorbell->full); + + pTracker->numEntries = 0; + __threadfence(); + pTracker->currentEntry = 0; + } +} + +static __device__ +SanitizerPatchResult CommonCallback( + void* userdata, + uint64_t pc, + void* ptr, + uint32_t accessSize, + uint32_t flags, + MemoryType type) +{ + auto* pTracker = (MemoryAccessTracker*)userdata; + if (!pTracker->enabled_instrumenting) { + return SANITIZER_PATCH_SUCCESS; + } + + uint32_t active_mask = __activemask(); + uint32_t laneid = get_laneid(); + uint32_t first_laneid = __ffs(active_mask) - 1; + + MemoryAccess* accesses = nullptr; + + uint32_t distinct_sector_count = get_distint_sector_count((uint64_t)(uintptr_t)ptr/32, active_mask); // sector size is 32 bytes so we divide by 32 to get the sector tag + + if (laneid == first_laneid) { + uint32_t idx = GetBufferIndex(pTracker); + accesses = &pTracker->access_buffer[idx]; + accesses->accessSize = accessSize; + accesses->flags = flags; + accesses->distinct_sector_count = distinct_sector_count; + accesses->warpId = get_flat_thread_id() / GPU_WARP_SIZE; + accesses->ctaId = get_ctaid_as_uint64(); + accesses->type = type; + accesses->pc = pc - pTracker->kernel_pc; // for in-gpu pc offset calculation, so that the pc in corresponding analysis tool can be saved only in uint32_t + accesses->active_mask = active_mask; + } + + __syncwarp(active_mask); + + accesses = (MemoryAccess*) shfl((uint64_t)accesses, first_laneid, active_mask); + if (accesses) { + if(type == MemoryType::Local){ + accesses->addresses[laneid] = ((get_flat_thread_id()) << 54) |((uint64_t)(uintptr_t)ptr); // use high 10 bits to store thread id for local memory access + } else { + accesses->addresses[laneid] = (uint64_t)(uintptr_t)ptr; + } + } + + __syncwarp(active_mask); + + if (laneid == first_laneid) { + IncrementNumEntries(pTracker); + } + + return SANITIZER_PATCH_SUCCESS; +} + +extern "C" __device__ __noinline__ +SanitizerPatchResult MemoryGlobalAccessCallback( + void* userdata, + uint64_t pc, + void* ptr, + uint32_t accessSize, + uint32_t flags, + const void *pData) +{ + return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Global); +} + +extern "C" __device__ __noinline__ +SanitizerPatchResult MemorySharedAccessCallback( + void* userdata, + uint64_t pc, + void* ptr, + uint32_t accessSize, + uint32_t flags, + const void *pData) +{ + return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Shared); +} + +extern "C" __device__ __noinline__ +SanitizerPatchResult MemoryLocalAccessCallback( + void* userdata, + uint64_t pc, + void* ptr, + uint32_t accessSize, + uint32_t flags, + const void *pData) +{ + return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Local); +} + +//For the future use of async memcpy +extern "C" __device__ __noinline__ +SanitizerPatchResult MemcpyAsyncCallback(void* userdata, uint64_t pc, void* src, uint32_t dst, uint32_t accessSize) +{ + if (src) + { + CommonCallback(userdata, pc, src, accessSize, SANITIZER_MEMORY_DEVICE_FLAG_READ, MemoryType::Global); + } + + return CommonCallback(userdata, pc, (void*)dst, accessSize, SANITIZER_MEMORY_DEVICE_FLAG_WRITE, MemoryType::Shared); +} + +extern "C" __device__ __noinline__ +SanitizerPatchResult BlockExitCallback(void* userdata, uint64_t pc) +{ + + MemoryAccessTracker* tracker = (MemoryAccessTracker*)userdata; + DoorBell* doorbell = tracker->doorBell; + + uint32_t active_mask = __activemask(); + uint32_t laneid = get_laneid(); + uint32_t first_laneid = __ffs(active_mask) - 1; + int32_t pop_count = __popc(active_mask); + + if (laneid == first_laneid) { + atomicAdd((int*)&doorbell->num_threads, -pop_count); + } + __syncwarp(active_mask); + + return SANITIZER_PATCH_SUCCESS; +} diff --git a/gpu_src/include/gpu_patch.h b/gpu_src/include/gpu_patch.h index 09152cb..52419e5 100644 --- a/gpu_src/include/gpu_patch.h +++ b/gpu_src/include/gpu_patch.h @@ -22,7 +22,11 @@ struct MemoryAccess uint64_t addresses[GPU_WARP_SIZE]; uint32_t accessSize; uint32_t flags; - uint64_t warpId; + uint64_t ctaId; + uint64_t pc; + uint32_t warpId; + uint32_t distinct_sector_count; + uint32_t active_mask; MemoryType type; // copy constructor @@ -34,8 +38,12 @@ struct MemoryAccess } accessSize = other.accessSize; flags = other.flags; + ctaId = other.ctaId; warpId = other.warpId; + distinct_sector_count = other.distinct_sector_count; type = other.type; + pc = other.pc; + active_mask = other.active_mask; } MemoryAccess() = default; @@ -86,6 +94,9 @@ struct MemoryAccessTracker uint32_t numEntries; uint64_t accessCount; uint64_t accessSize; + uint64_t kernel_pc; + bool enabled_instrumenting; + int32_t target_block[3]; // target block to sample [x, y, z] DoorBell* doorBell; MemoryAccess* access_buffer; MemoryAccessState* access_state; diff --git a/gpu_src/include/gpu_utils.h b/gpu_src/include/gpu_utils.h index ca254ce..e517f14 100644 --- a/gpu_src/include/gpu_utils.h +++ b/gpu_src/include/gpu_utils.h @@ -36,6 +36,12 @@ __device__ __forceinline__ int4 get_ctaid(void) { return ret; } +__device__ __forceinline__ uint64_t get_ctaid_as_uint64(void) { + int4 ctaid = get_ctaid(); + uint64_t ret = (uint64_t)(ctaid.x + ctaid.y * gridDim.x + ctaid.z * gridDim.x * gridDim.y); + return ret; +} + // Get the number of CTA ids per grid __device__ __forceinline__ int4 get_nctaid(void) { int4 ret; @@ -65,6 +71,14 @@ __device__ __forceinline__ uint64_t get_block_num_threads() { return blockDim.x * blockDim.y * blockDim.z; } +__device__ __forceinline__ uint32_t get_distint_sector_count(uint64_t sector_tag, uint32_t active_mask) { + uint32_t match_mask = 0; + match_mask = __match_any_sync(active_mask, sector_tag); + uint32_t leader = __ffs(match_mask) - 1; + uint32_t is_leader = (leader == get_laneid()) ? 1 : 0; + uint32_t final_mask = __ballot_sync(active_mask, is_leader); + return __popc(final_mask); +} template __device__ __forceinline__ T shfl(T v, uint32_t srcline, uint32_t mask = 0xFFFFFFFF) { diff --git a/src/compute_sanitizer.cpp b/src/compute_sanitizer.cpp index 42adb0c..45f7aab 100644 --- a/src/compute_sanitizer.cpp +++ b/src/compute_sanitizer.cpp @@ -16,6 +16,8 @@ #include #include #include +#include +#include #define SANITIZER_VERBOSE 1 @@ -58,6 +60,55 @@ static std::unordered_map sanitizer_active_modules; // for multi-GPU support static std::unordered_map sanitizer_ctx_to_device; +static std::unordered_set sanitizer_kernel_white_list; + +// read whitelist file, parse kernel keywords line by line +void LoadKernelWhiteList(const char* whitelist_path) { + if (!whitelist_path) return; + + std::ifstream fin(whitelist_path); + if (!fin.is_open()) { + std::cerr << "[SANITIZER WARN] Failed to open whitelist file: " + << whitelist_path << std::endl; + return; + } + + auto trim = [](std::string& s) { + const char* ws = " \t\r\n"; + auto start = s.find_first_not_of(ws); + if (start == std::string::npos) { + s.clear(); + return; + } + auto end = s.find_last_not_of(ws); + s = s.substr(start, end - start + 1); + }; + + std::string line; + while (std::getline(fin, line)) { + trim(line); + if (line.empty() || line[0] == '#') continue; // 跳过空行与注释 + sanitizer_kernel_white_list.insert(line); + } + + PRINT("[SANITIZER INFO] Loaded %zu kernel whitelist entries from %s\n", + sanitizer_kernel_white_list.size(), whitelist_path); + if (!sanitizer_kernel_white_list.empty()) { + PRINT("[SANITIZER INFO] Kernel whitelist: %s\n", sanitizer_kernel_white_list.begin()->c_str()); + } +} + +bool SanitizerKernelWhiteListCheck(const std::string& functionName){ + if (sanitizer_kernel_white_list.empty()) { + return true; + } + for (const auto& kernel : sanitizer_kernel_white_list) { + if (functionName.find(kernel) != std::string::npos) { + return true; + } + } + return false; +} void SanitizerTensorMallocCallback(uint64_t ptr, int64_t size, int64_t allocated, int64_t reserved, int device_id) { if (!sanitizer_options.sanitizer_callback_enabled) { @@ -194,6 +245,45 @@ void ModuleLoadedCallback(CUmodule module) SANITIZER_INSTRUCTION_GLOBAL_MEMORY_ACCESS, module, "MemoryGlobalAccessCallback")); SANITIZER_SAFECALL( sanitizerPatchInstructions(SANITIZER_INSTRUCTION_BLOCK_EXIT, module, "BlockExitCallback")); + } else if (sanitizer_options.patch_name == GPU_PATCH_HEATMAP_ANALYSIS) { + SANITIZER_SAFECALL( + sanitizerPatchInstructions( + SANITIZER_INSTRUCTION_GLOBAL_MEMORY_ACCESS, module, "MemoryGlobalAccessCallback")); + SANITIZER_SAFECALL( + sanitizerPatchInstructions( + SANITIZER_INSTRUCTION_SHARED_MEMORY_ACCESS, module, "MemorySharedAccessCallback")); + SANITIZER_SAFECALL( + sanitizerPatchInstructions( + SANITIZER_INSTRUCTION_LOCAL_MEMORY_ACCESS, module, "MemoryLocalAccessCallback")); + // SANITIZER_SAFECALL( + // sanitizerPatchInstructions( + // SANITIZER_INSTRUCTION_MEMCPY_ASYNC, module, "MemcpyAsyncCallback")); + SANITIZER_SAFECALL( + sanitizerPatchInstructions(SANITIZER_INSTRUCTION_BLOCK_EXIT, module, "BlockExitCallback")); + } else if (sanitizer_options.patch_name == GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS) { + SANITIZER_SAFECALL( + sanitizerPatchInstructions( + SANITIZER_INSTRUCTION_GLOBAL_MEMORY_ACCESS, module, "MemoryGlobalAccessCallback")); + SANITIZER_SAFECALL( + sanitizerPatchInstructions( + SANITIZER_INSTRUCTION_SHARED_MEMORY_ACCESS, module, "MemorySharedAccessCallback")); + SANITIZER_SAFECALL( + sanitizerPatchInstructions( + SANITIZER_INSTRUCTION_LOCAL_MEMORY_ACCESS, module, "MemoryLocalAccessCallback")); + SANITIZER_SAFECALL( + sanitizerPatchInstructions(SANITIZER_INSTRUCTION_BLOCK_EXIT, module, "BlockExitCallback")); + } else if (sanitizer_options.patch_name == GPU_PATCH_PC_DEPENDENCY_ANALYSIS) { + SANITIZER_SAFECALL( + sanitizerPatchInstructions( + SANITIZER_INSTRUCTION_GLOBAL_MEMORY_ACCESS, module, "MemoryGlobalAccessCallback")); + SANITIZER_SAFECALL( + sanitizerPatchInstructions( + SANITIZER_INSTRUCTION_SHARED_MEMORY_ACCESS, module, "MemorySharedAccessCallback")); + SANITIZER_SAFECALL( + sanitizerPatchInstructions( + SANITIZER_INSTRUCTION_LOCAL_MEMORY_ACCESS, module, "MemoryLocalAccessCallback")); + SANITIZER_SAFECALL( + sanitizerPatchInstructions(SANITIZER_INSTRUCTION_BLOCK_EXIT, module, "BlockExitCallback")); } SANITIZER_SAFECALL(sanitizerPatchModule(module)); @@ -313,6 +403,63 @@ void buffer_init(CUcontext context) { SANITIZER_SAFECALL( sanitizerAllocHost(context, (void**)&global_doorbell, sizeof(DoorBell))); } + } else if (sanitizer_options.patch_name == GPU_PATCH_HEATMAP_ANALYSIS) { + if (!device_access_buffer) { + SANITIZER_SAFECALL( + sanitizerAlloc( + context, + (void**)&device_access_buffer, + sizeof(MemoryAccess) * MEMORY_ACCESS_BUFFER_SIZE)); + } + if (!host_access_buffer) { + SANITIZER_SAFECALL( + sanitizerAllocHost( + context, + (void**)&host_access_buffer, + sizeof(MemoryAccess) * MEMORY_ACCESS_BUFFER_SIZE)); + } + if (!global_doorbell) { + SANITIZER_SAFECALL( + sanitizerAllocHost(context, (void**)&global_doorbell, sizeof(DoorBell))); + } + } else if (sanitizer_options.patch_name == GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS) { + if (!device_access_buffer) { + SANITIZER_SAFECALL( + sanitizerAlloc( + context, + (void**)&device_access_buffer, + sizeof(MemoryAccess) * MEMORY_ACCESS_BUFFER_SIZE)); + } + if (!host_access_buffer) { + SANITIZER_SAFECALL( + sanitizerAllocHost( + context, + (void**)&host_access_buffer, + sizeof(MemoryAccess) * MEMORY_ACCESS_BUFFER_SIZE)); + } + if (!global_doorbell) { + SANITIZER_SAFECALL( + sanitizerAllocHost(context, (void**)&global_doorbell, sizeof(DoorBell))); + } + } else if (sanitizer_options.patch_name == GPU_PATCH_PC_DEPENDENCY_ANALYSIS) { + if (!device_access_buffer) { + SANITIZER_SAFECALL( + sanitizerAlloc( + context, + (void**)&device_access_buffer, + sizeof(MemoryAccess) * MEMORY_ACCESS_BUFFER_SIZE)); + } + if (!host_access_buffer) { + SANITIZER_SAFECALL( + sanitizerAllocHost( + context, + (void**)&host_access_buffer, + sizeof(MemoryAccess) * MEMORY_ACCESS_BUFFER_SIZE)); + } + if (!global_doorbell) { + SANITIZER_SAFECALL( + sanitizerAllocHost(context, (void**)&global_doorbell, sizeof(DoorBell))); + } } } @@ -321,6 +468,7 @@ void LaunchBeginCallback( CUcontext context, CUmodule module, CUfunction function, + uint64_t pc, std::string functionName, Sanitizer_StreamHandle hstream, dim3 blockDims, @@ -328,8 +476,11 @@ void LaunchBeginCallback( { if (sanitizer_options.patch_name != GPU_NO_PATCH) { // sampling + bool launch_monitoring = true; sanitizer_options.grid_launch_id++; - if (sanitizer_options.grid_launch_id % sanitizer_options.sample_rate == 0) { + if ((sanitizer_options.grid_launch_id % sanitizer_options.sample_rate == 0) && + SanitizerKernelWhiteListCheck(functionName)) { + PRINT("[SANITIZER INFO] Monitoring kernel %s, launch id %lu\n", functionName.c_str(), sanitizer_options.grid_launch_id); auto it = sanitizer_active_modules.find(module); @@ -340,12 +491,15 @@ void LaunchBeginCallback( } else { PRINT("[SANITIZER INFO] Skipping kernel %s monitoring, launch id %lu\n", functionName.c_str(), sanitizer_options.grid_launch_id); + launch_monitoring = false; auto it = sanitizer_active_modules.find(module); - if (it->second) { - SANITIZER_SAFECALL(sanitizerUnpatchModule(module)); - it->second = false; + if (it != sanitizer_active_modules.end()) { + if (it->second) { + // SANITIZER_SAFECALL(sanitizerUnpatchModule(module)); + it->second = false; + } + return; } - return; } buffer_init(context); @@ -439,6 +593,73 @@ void LaunchBeginCallback( host_tracker_handle->numEntries = 0; host_tracker_handle->access_buffer = device_access_buffer; + uint32_t num_threads = + blockDims.x * blockDims.y * blockDims.z * gridDims.x * gridDims.y * gridDims.z; + global_doorbell->num_threads = num_threads; + global_doorbell->full = 0; + host_tracker_handle->doorBell = global_doorbell; + } else if (sanitizer_options.patch_name == GPU_PATCH_HEATMAP_ANALYSIS) { + SANITIZER_SAFECALL( + sanitizerMemset( + device_access_buffer, 0, sizeof(MemoryAccess) * MEMORY_ACCESS_BUFFER_SIZE, hstream)); + host_tracker_handle->currentEntry = 0; + host_tracker_handle->numEntries = 0; + host_tracker_handle->access_buffer = device_access_buffer; + char* target_block_str = std::getenv("YOSEMITE_TARGET_BLOCK"); + if (target_block_str) { + // target_block_str is "x,y,z" + std::stringstream ss(target_block_str); + std::string token; + int i = 0; + while (std::getline(ss, token, ',')) { + host_tracker_handle->target_block[i] = std::stoi(token); + i++; + } + if (i != 3) { + PRINT("[SANITIZER ERROR] Invalid target block format: %s\n", target_block_str); + exit(EXIT_FAILURE); + } + if (host_tracker_handle->target_block[0] < 0 || host_tracker_handle->target_block[1] < 0 || host_tracker_handle->target_block[2] < 0) { + PRINT("[SANITIZER ERROR] Invalid target block: %d, %d, %d\n", host_tracker_handle->target_block[0], host_tracker_handle->target_block[1], host_tracker_handle->target_block[2]); + exit(EXIT_FAILURE); + } + if (host_tracker_handle->target_block[0] >= gridDims.x || host_tracker_handle->target_block[1] >= gridDims.y || host_tracker_handle->target_block[2] >= gridDims.z) { + PRINT("[SANITIZER ERROR] Invalid target block: %d, %d, %d\n", host_tracker_handle->target_block[0], host_tracker_handle->target_block[1], host_tracker_handle->target_block[2]); + exit(EXIT_FAILURE); + } + PRINT("[SANITIZER INFO] Target block: %d, %d, %d\n", host_tracker_handle->target_block[0], host_tracker_handle->target_block[1], host_tracker_handle->target_block[2]); + } else { + host_tracker_handle->target_block[0] = 0; + host_tracker_handle->target_block[1] = 0; + host_tracker_handle->target_block[2] = 0; + PRINT("[SANITIZER INFO] No target block specified, using default (0, 0, 0)\n"); + } + // only sample one block so the doorbell need to be updated to thread amount of one block + uint32_t num_threads = blockDims.x * blockDims.y * blockDims.z; + global_doorbell->num_threads = num_threads; + global_doorbell->full = 0; + host_tracker_handle->doorBell = global_doorbell; + } else if (sanitizer_options.patch_name == GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS) { + SANITIZER_SAFECALL( + sanitizerMemset( + device_access_buffer, 0, sizeof(MemoryAccess) * MEMORY_ACCESS_BUFFER_SIZE, hstream)); + host_tracker_handle->currentEntry = 0; + host_tracker_handle->numEntries = 0; + host_tracker_handle->access_buffer = device_access_buffer; + + uint32_t num_threads = + blockDims.x * blockDims.y * blockDims.z * gridDims.x * gridDims.y * gridDims.z; + global_doorbell->num_threads = num_threads; + global_doorbell->full = 0; + host_tracker_handle->doorBell = global_doorbell; + } else if (sanitizer_options.patch_name == GPU_PATCH_PC_DEPENDENCY_ANALYSIS) { + SANITIZER_SAFECALL( + sanitizerMemset( + device_access_buffer, 0, sizeof(MemoryAccess) * MEMORY_ACCESS_BUFFER_SIZE, hstream)); + host_tracker_handle->currentEntry = 0; + host_tracker_handle->numEntries = 0; + host_tracker_handle->access_buffer = device_access_buffer; + uint32_t num_threads = blockDims.x * blockDims.y * blockDims.z * gridDims.x * gridDims.y * gridDims.z; global_doorbell->num_threads = num_threads; @@ -446,6 +667,8 @@ void LaunchBeginCallback( host_tracker_handle->doorBell = global_doorbell; } + host_tracker_handle->kernel_pc = pc; // for in-gpu offset calculation + host_tracker_handle->enabled_instrumenting = launch_monitoring; SANITIZER_SAFECALL( sanitizerMemcpyHostToDeviceAsync( device_tracker_handle, host_tracker_handle, sizeof(MemoryAccessTracker), hstream)); @@ -466,7 +689,8 @@ void LaunchEndCallback( Sanitizer_StreamHandle phstream) { // sampling - if (sanitizer_options.grid_launch_id % sanitizer_options.sample_rate != 0) { + if (sanitizer_options.grid_launch_id % sanitizer_options.sample_rate != 0 || + !SanitizerKernelWhiteListCheck(functionName)) { return; } @@ -616,6 +840,90 @@ void LaunchEndCallback( sanitizerMemcpyDeviceToHost( host_access_buffer, device_access_buffer, sizeof(MemoryAccess) * numEntries, hstream)); + yosemite_gpu_data_analysis(host_access_buffer, numEntries); + } else if (sanitizer_options.patch_name == GPU_PATCH_HEATMAP_ANALYSIS) { + while (true) + { + if (global_doorbell->num_threads == 0) { + break; + } + + if (global_doorbell->full) { + PRINT("[SANITIZER INFO] Doorbell full with size %u. Analyzing data...\n", + MEMORY_ACCESS_BUFFER_SIZE); + SANITIZER_SAFECALL( + sanitizerMemcpyDeviceToHost(host_access_buffer, device_access_buffer, + sizeof(MemoryAccess) * MEMORY_ACCESS_BUFFER_SIZE, phstream)); + yosemite_gpu_data_analysis(host_access_buffer, MEMORY_ACCESS_BUFFER_SIZE); + global_doorbell->full = 0; + } + } + SANITIZER_SAFECALL(sanitizerStreamSynchronize(hstream)); + SANITIZER_SAFECALL( + sanitizerMemcpyDeviceToHost( + host_tracker_handle, device_tracker_handle, sizeof(MemoryAccessTracker), hstream)); + + auto numEntries = host_tracker_handle->numEntries; + SANITIZER_SAFECALL( + sanitizerMemcpyDeviceToHost( + host_access_buffer, device_access_buffer, sizeof(MemoryAccess) * numEntries, hstream)); + + yosemite_gpu_data_analysis(host_access_buffer, numEntries); + } else if (sanitizer_options.patch_name == GPU_PATCH_BLOCK_DIVERGENCE_ANALYSIS) { + while (true) + { + if (global_doorbell->num_threads == 0) { + break; + } + + if (global_doorbell->full) { + PRINT("[SANITIZER INFO] Doorbell full with size %u. Analyzing data...\n", + MEMORY_ACCESS_BUFFER_SIZE); + SANITIZER_SAFECALL( + sanitizerMemcpyDeviceToHost(host_access_buffer, device_access_buffer, + sizeof(MemoryAccess) * MEMORY_ACCESS_BUFFER_SIZE, phstream)); + yosemite_gpu_data_analysis(host_access_buffer, MEMORY_ACCESS_BUFFER_SIZE); + global_doorbell->full = 0; + } + } + SANITIZER_SAFECALL(sanitizerStreamSynchronize(hstream)); + SANITIZER_SAFECALL( + sanitizerMemcpyDeviceToHost( + host_tracker_handle, device_tracker_handle, sizeof(MemoryAccessTracker), hstream)); + + auto numEntries = host_tracker_handle->numEntries; + SANITIZER_SAFECALL( + sanitizerMemcpyDeviceToHost( + host_access_buffer, device_access_buffer, sizeof(MemoryAccess) * numEntries, hstream)); + + yosemite_gpu_data_analysis(host_access_buffer, numEntries); + } else if (sanitizer_options.patch_name == GPU_PATCH_PC_DEPENDENCY_ANALYSIS) { + while (true) + { + if (global_doorbell->num_threads == 0) { + break; + } + + if (global_doorbell->full) { + PRINT("[SANITIZER INFO] Doorbell full with size %u. Analyzing data...\n", + MEMORY_ACCESS_BUFFER_SIZE); + SANITIZER_SAFECALL( + sanitizerMemcpyDeviceToHost(host_access_buffer, device_access_buffer, + sizeof(MemoryAccess) * MEMORY_ACCESS_BUFFER_SIZE, phstream)); + yosemite_gpu_data_analysis(host_access_buffer, MEMORY_ACCESS_BUFFER_SIZE); + global_doorbell->full = 0; + } + } + SANITIZER_SAFECALL(sanitizerStreamSynchronize(hstream)); + SANITIZER_SAFECALL( + sanitizerMemcpyDeviceToHost( + host_tracker_handle, device_tracker_handle, sizeof(MemoryAccessTracker), hstream)); + + auto numEntries = host_tracker_handle->numEntries; + SANITIZER_SAFECALL( + sanitizerMemcpyDeviceToHost( + host_access_buffer, device_access_buffer, sizeof(MemoryAccess) * numEntries, hstream)); + yosemite_gpu_data_analysis(host_access_buffer, numEntries); } } else { @@ -679,6 +987,12 @@ void ComputeSanitizerCallback( auto* pContextData = (Sanitizer_ResourceContextData*)cbdata; PRINT("[SANITIZER INFO] Context %p creation finished on device %p\n", &pContextData->context, &pContextData->device); + + CUstream p_stream; + Sanitizer_StreamHandle p_stream_handle; + sanitizer_priority_stream_get(pContextData->context, &p_stream); + PRINT("[SANITIZER INFO] Priority stream %p created on context %p\n", + p_stream, pContextData->context); break; } case SANITIZER_CBID_RESOURCE_CONTEXT_DESTROY_STARTING: @@ -728,7 +1042,7 @@ void ComputeSanitizerCallback( PRINT("[SANITIZER INFO] Malloc memory %p with size %lu (flag: %u) on device %d\n", (void*)pModuleData->address, pModuleData->size, pModuleData->flags, device_id); - + PRINT("[SANITIZER INFO] Sector tag: %p, end tag: %p\n", (void*)(pModuleData->address >> 5), (void*)((pModuleData->address + pModuleData->size - 1) >> 5)); yosemite_alloc_callback( pModuleData->address, pModuleData->size, pModuleData->flags, device_id); break; @@ -748,6 +1062,64 @@ void ComputeSanitizerCallback( pModuleData->address, pModuleData->size, pModuleData->flags, device_id); break; } + case SANITIZER_CBID_RESOURCE_HOST_MEMORY_ALLOC: + { + auto *pModuleData = (Sanitizer_ResourceMemoryData *)cbdata; + if (pModuleData->flags == SANITIZER_MEMORY_FLAG_CG_RUNTIME || pModuleData->size == 0) { + break; + } + PRINT("[SANITIZER INFO] Alloc host memory %p with size %lu (flag: %u)\n", + (void*)pModuleData->address, pModuleData->size, pModuleData->flags); + PRINT("[SANITIZER INFO] Sector tag: %p, end tag: %p\n", (void*)(pModuleData->address >> 5), (void*)((pModuleData->address + pModuleData->size - 1) >> 5)); + yosemite_alloc_callback( + pModuleData->address, pModuleData->size, pModuleData->flags, 0); + break; + } + case SANITIZER_CBID_RESOURCE_HOST_MEMORY_FREE: + { + auto *pModuleData = (Sanitizer_ResourceMemoryData *)cbdata; + if (pModuleData->flags == SANITIZER_MEMORY_FLAG_CG_RUNTIME || pModuleData->size == 0) { + break; + } + + PRINT("[SANITIZER INFO] Free host memory %p with size %lu (flag: %u)\n", + (void*)pModuleData->address, pModuleData->size, pModuleData->flags); + + yosemite_free_callback( + pModuleData->address, pModuleData->size, pModuleData->flags, 0); + break; + } + case SANITIZER_CBID_RESOURCE_MEMORY_ALLOC_ASYNC: + { + auto* pModuleData = (Sanitizer_ResourceMemoryData*)cbdata; + if (pModuleData->flags == SANITIZER_MEMORY_FLAG_CG_RUNTIME || pModuleData->size == 0) { + break; + } + + CUdevice device_id = sanitizer_ctx_to_device[pModuleData->context]; + + PRINT("[SANITIZER INFO] Alloc async memory %p with size %lu (flag: %u) on device %d\n", + (void*)pModuleData->address, pModuleData->size, pModuleData->flags, device_id); + yosemite_alloc_callback( + pModuleData->address, pModuleData->size, pModuleData->flags, device_id); + break; + } + case SANITIZER_CBID_RESOURCE_MEMORY_FREE_ASYNC: + { + auto* pModuleData = (Sanitizer_ResourceMemoryData*)cbdata; + if (pModuleData->flags == SANITIZER_MEMORY_FLAG_CG_RUNTIME || pModuleData->size == 0) { + break; + } + + CUdevice device_id = sanitizer_ctx_to_device[pModuleData->context]; + + PRINT("[SANITIZER INFO] Free async memory %p with size %lu (flag: %u) on device %d\n", + (void*)pModuleData->address, pModuleData->size, pModuleData->flags, device_id); + + yosemite_free_callback( + pModuleData->address, pModuleData->size, pModuleData->flags, device_id); + break; + } default: break; } @@ -768,14 +1140,16 @@ void ComputeSanitizerCallback( auto func_name = sanitizer_demangled_name_get(pLaunchData->functionName); CUdevice device_id = sanitizer_ctx_to_device[pLaunchData->context]; - - PRINT("[SANITIZER INFO] Launching kernel %s <<<(%u, %u, %u), (%u, %u, %u)>>> on device %d\n", + uint64_t pc; + uint64_t size; + SANITIZER_SAFECALL(sanitizerGetFunctionPcAndSize(pLaunchData->module, pLaunchData->functionName, &pc, &size)); + PRINT("[SANITIZER INFO] Launching kernel %s <<<(%u, %u, %u), (%u, %u, %u)>>> on device %d, pc: 0x%lx, size: %lu\n", func_name, pLaunchData->gridDim_x, pLaunchData->gridDim_y, pLaunchData->gridDim_z, pLaunchData->blockDim_x, pLaunchData->blockDim_y, pLaunchData->blockDim_z, - device_id); + device_id, pc, size); - LaunchBeginCallback(pLaunchData->context, pLaunchData->module, pLaunchData->function, + LaunchBeginCallback(pLaunchData->context, pLaunchData->module, pLaunchData->function, pc, func_name, pLaunchData->hStream, blockDims, gridDims); break; } @@ -913,6 +1287,7 @@ void enable_compute_sanitizer(bool enable) { int InitializeInjection() { sanitizer_debug_wait(); + LoadKernelWhiteList(std::getenv("YOSEMITE_KERNEL_WHITELIST")); Sanitizer_SubscriberHandle handle; SANITIZER_SAFECALL(sanitizerSubscribe(&handle, ComputeSanitizerCallback, nullptr)); SANITIZER_SAFECALL(sanitizerEnableDomain(1, handle, SANITIZER_CB_DOMAIN_RESOURCE));