Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,9 @@ endif
TARGET_ARCH := $(shell uname -m)

ifeq ($(TARGET_ARCH),aarch64)
SMS ?= 53 61 70 72 75 80 86 87 90
SMS ?= 75 80 86 87 89 90 120
else
SMS ?= 52 60 70 75 80 86 90
SMS ?= 75 80 86 89 90 120
endif

# Generate SASS code for each SM architecture listed in $(SMS)
Expand Down
2 changes: 1 addition & 1 deletion Makefile.config
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CUDA_PATH ?= /usr/local/cuda-12.1
CUDA_PATH ?= /usr/local/cuda

SANALYZER_DIR ?= ../sanalyzer

Expand Down
151 changes: 151 additions & 0 deletions gpu_src/gpu_patch_block_divergence_analysis.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#include "gpu_patch.h"

#include <sanitizer_patching.h>

#include "gpu_utils.h"
#include <cstdio>

static __device__ __inline__
uint32_t GetBufferIndex(MemoryAccessTracker* pTracker) {
uint32_t idx = MEMORY_ACCESS_BUFFER_SIZE;

while (idx >= MEMORY_ACCESS_BUFFER_SIZE) {
idx = atomicAdd(&(pTracker->currentEntry), 1);

if (idx >= MEMORY_ACCESS_BUFFER_SIZE) {
// buffer is full, wait for last writing thread to flush
while (*(volatile uint32_t*)&(pTracker->currentEntry) >= MEMORY_ACCESS_BUFFER_SIZE);
}
}

return idx;
}

static __device__ __inline__
void IncrementNumEntries(MemoryAccessTracker* pTracker) {
DoorBell* doorbell = pTracker->doorBell;
__threadfence();
const uint32_t numEntries = atomicAdd((int*)&(pTracker->numEntries), 1);

if (numEntries == MEMORY_ACCESS_BUFFER_SIZE - 1) {
// make sure everything is visible in memory
__threadfence_system();
doorbell->full = true;
while (doorbell->full);

pTracker->numEntries = 0;
__threadfence();
pTracker->currentEntry = 0;
}
}

static __device__
SanitizerPatchResult CommonCallback(
void* userdata,
uint64_t pc,
void* ptr,
uint32_t accessSize,
uint32_t flags,
MemoryType type)
{
auto* pTracker = (MemoryAccessTracker*)userdata;

uint32_t active_mask = __activemask();
uint32_t laneid = get_laneid();
uint32_t first_laneid = __ffs(active_mask) - 1;

MemoryAccess* accesses = nullptr;

if (laneid == first_laneid) {
uint32_t idx = GetBufferIndex(pTracker);
accesses = &pTracker->access_buffer[idx];
accesses->accessSize = accessSize;
accesses->flags = flags;
accesses->warpId = get_warpid();
accesses->ctaId = get_ctaid_as_uint64();
accesses->pc = pc;
accesses->active_mask = active_mask;
accesses->type = type;
}

__syncwarp(active_mask);

accesses = (MemoryAccess*) shfl((uint64_t)accesses, first_laneid, active_mask);
if (accesses) {
accesses->addresses[laneid] = (uint64_t)(uintptr_t)ptr;
}

__syncwarp(active_mask);

if (laneid == first_laneid) {
IncrementNumEntries(pTracker);
}

return SANITIZER_PATCH_SUCCESS;
}

extern "C" __device__ __noinline__
SanitizerPatchResult MemoryGlobalAccessCallback(
void* userdata,
uint64_t pc,
void* ptr,
uint32_t accessSize,
uint32_t flags,
const void *pData)
{
return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Global);
}

extern "C" __device__ __noinline__
SanitizerPatchResult MemorySharedAccessCallback(
void* userdata,
uint64_t pc,
void* ptr,
uint32_t accessSize,
uint32_t flags,
const void *pData)
{
return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Shared);
}

extern "C" __device__ __noinline__
SanitizerPatchResult MemoryLocalAccessCallback(
void* userdata,
uint64_t pc,
void* ptr,
uint32_t accessSize,
uint32_t flags,
const void *pData)
{
return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Local);
}

extern "C" __device__ __noinline__
SanitizerPatchResult MemcpyAsyncCallback(void* userdata, uint64_t pc, void* src, uint32_t dst, uint32_t accessSize)
{
if (src)
{
CommonCallback(userdata, pc, src, accessSize, SANITIZER_MEMORY_DEVICE_FLAG_READ, MemoryType::Global);
}

return CommonCallback(userdata, pc, (void*)dst, accessSize, SANITIZER_MEMORY_DEVICE_FLAG_WRITE, MemoryType::Shared);
}

extern "C" __device__ __noinline__
SanitizerPatchResult BlockExitCallback(void* userdata, uint64_t pc)
{
MemoryAccessTracker* tracker = (MemoryAccessTracker*)userdata;
DoorBell* doorbell = tracker->doorBell;

uint32_t active_mask = __activemask();
uint32_t laneid = get_laneid();
uint32_t first_laneid = __ffs(active_mask) - 1;
int32_t pop_count = __popc(active_mask);

if (laneid == first_laneid) {
atomicAdd((int*)&doorbell->num_threads, -pop_count);
}
__syncwarp(active_mask);

return SANITIZER_PATCH_SUCCESS;
}
172 changes: 172 additions & 0 deletions gpu_src/gpu_patch_heatmap_analysis.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
#include "gpu_patch.h"

#include <sanitizer_patching.h>

#include "gpu_utils.h"
#include <cstdio>

static __device__ __inline__
bool BypassCheckByCtaId(uint32_t block_idx, uint32_t block_idy, uint32_t block_idz) {
int4 ctaid = get_ctaid();
if(ctaid.x != block_idx || ctaid.y != block_idy || ctaid.z != block_idz) {
return true;
}
return false;
}


static __device__ __inline__
uint32_t GetBufferIndex(MemoryAccessTracker* pTracker) {
uint32_t idx = MEMORY_ACCESS_BUFFER_SIZE;

while (idx >= MEMORY_ACCESS_BUFFER_SIZE) {
idx = atomicAdd(&(pTracker->currentEntry), 1);

if (idx >= MEMORY_ACCESS_BUFFER_SIZE) {
// buffer is full, wait for last writing thread to flush
while (*(volatile uint32_t*)&(pTracker->currentEntry) >= MEMORY_ACCESS_BUFFER_SIZE);
}
}

return idx;
}

static __device__ __inline__
void IncrementNumEntries(MemoryAccessTracker* pTracker) {
DoorBell* doorbell = pTracker->doorBell;
__threadfence();
const uint32_t numEntries = atomicAdd((int*)&(pTracker->numEntries), 1);

if (numEntries == MEMORY_ACCESS_BUFFER_SIZE - 1) {
// make sure everything is visible in memory
__threadfence_system();
doorbell->full = true;
while (doorbell->full);

pTracker->numEntries = 0;
__threadfence();
pTracker->currentEntry = 0;
}
}

static __device__
SanitizerPatchResult CommonCallback(
void* userdata,
uint64_t pc,
void* ptr,
uint32_t accessSize,
uint32_t flags,
MemoryType type)
{
auto* pTracker = (MemoryAccessTracker*)userdata;
if(BypassCheckByCtaId(pTracker->target_block[0], pTracker->target_block[1], pTracker->target_block[2])) {
return SANITIZER_PATCH_SUCCESS;
}

uint32_t active_mask = __activemask();
uint32_t laneid = get_laneid();
uint32_t first_laneid = __ffs(active_mask) - 1;

MemoryAccess* accesses = nullptr;

if (laneid == first_laneid) {
uint32_t idx = GetBufferIndex(pTracker);
accesses = &pTracker->access_buffer[idx];
accesses->accessSize = accessSize;
accesses->flags = flags;
accesses->warpId = get_warpid();
accesses->type = type;
accesses->pc = pc;
accesses->active_mask = active_mask;
}

__syncwarp(active_mask);

accesses = (MemoryAccess*) shfl((uint64_t)accesses, first_laneid, active_mask);
if (accesses) {
if(type == MemoryType::Local){
accesses->addresses[laneid] = (((uint64_t) get_warpid() * GPU_WARP_SIZE + get_laneid()) << 54) |((uint64_t)(uintptr_t)ptr); // use high 10 bits to store thread id for local memory access
} else {
accesses->addresses[laneid] = (uint64_t)(uintptr_t)ptr;
}
}

__syncwarp(active_mask);

if (laneid == first_laneid) {
IncrementNumEntries(pTracker);
}

return SANITIZER_PATCH_SUCCESS;
}

extern "C" __device__ __noinline__
SanitizerPatchResult MemoryGlobalAccessCallback(
void* userdata,
uint64_t pc,
void* ptr,
uint32_t accessSize,
uint32_t flags,
const void *pData)
{
return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Global);
}

extern "C" __device__ __noinline__
SanitizerPatchResult MemorySharedAccessCallback(
void* userdata,
uint64_t pc,
void* ptr,
uint32_t accessSize,
uint32_t flags,
const void *pData)
{
return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Shared);
}

extern "C" __device__ __noinline__
SanitizerPatchResult MemoryLocalAccessCallback(
void* userdata,
uint64_t pc,
void* ptr,
uint32_t accessSize,
uint32_t flags,
const void *pData)
{
return CommonCallback(userdata, pc, ptr, accessSize, flags, MemoryType::Local);
}

//For the future use of async memcpy
extern "C" __device__ __noinline__
SanitizerPatchResult MemcpyAsyncCallback(void* userdata, uint64_t pc, void* src, uint32_t dst, uint32_t accessSize)
{
if (src)
{
CommonCallback(userdata, pc, src, accessSize, SANITIZER_MEMORY_DEVICE_FLAG_READ, MemoryType::Global);
}

return CommonCallback(userdata, pc, (void*)dst, accessSize, SANITIZER_MEMORY_DEVICE_FLAG_WRITE, MemoryType::Shared);
}

extern "C" __device__ __noinline__
SanitizerPatchResult BlockExitCallback(void* userdata, uint64_t pc)
{

MemoryAccessTracker* tracker = (MemoryAccessTracker*)userdata;
DoorBell* doorbell = tracker->doorBell;
if(BypassCheckByCtaId(tracker->target_block[0], tracker->target_block[1], tracker->target_block[2])) {
return SANITIZER_PATCH_SUCCESS;
}

uint32_t active_mask = __activemask();
uint32_t laneid = get_laneid();
uint32_t first_laneid = __ffs(active_mask) - 1;
int32_t pop_count = __popc(active_mask);

if (laneid == first_laneid) {
atomicAdd((int*)&doorbell->num_threads, -pop_count);
}
__syncwarp(active_mask);

return SANITIZER_PATCH_SUCCESS;
}
Loading