diff --git a/zstd/zstdgpu/zstdgpu.cpp b/zstd/zstdgpu/zstdgpu.cpp
index 29a976f..e234b62 100644
--- a/zstd/zstdgpu/zstdgpu.cpp
+++ b/zstd/zstdgpu/zstdgpu.cpp
@@ -560,19 +560,20 @@ zstdgpu_Status zstdgpu_CreatePerRequestContext(zstdgpu_PerRequestContext *outPer
     context->ExecuteSequences = context->ExecuteSequences64;
     context->DecompressSequences_LdsFseCache = context->DecompressSequences_LdsFseCache32;
 #else
-    if (persistentContext->maxLaneCount == 128)
+    if (persistentContext->minLaneCount == 128)
     {
         context->ExecuteSequences = context->ExecuteSequences128;
         context->DecompressSequences_LdsFseCache = context->DecompressSequences_LdsFseCache128;
     }
-    else if (persistentContext->maxLaneCount == 64)
+    else if (persistentContext->minLaneCount == 64)
     {
         context->ExecuteSequences = context->ExecuteSequences64;
         context->DecompressSequences_LdsFseCache = context->DecompressSequences_LdsFseCache64;
     }
     else
     {
-        context->ExecuteSequences = context->ExecuteSequences32;
+        context->ExecuteSequences = (persistentContext->maxLaneCount >= 64) ? context->ExecuteSequences64
+                                                                            : context->ExecuteSequences32;
         context->DecompressSequences_LdsFseCache = context->DecompressSequences_LdsFseCache32;
     }
 #endif
diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h
index f9c296f..07cfc7f 100644
--- a/zstd/zstdgpu/zstdgpu_shaders.h
+++ b/zstd/zstdgpu/zstdgpu_shaders.h
@@ -3030,39 +3030,43 @@ void zstdgpu_DecompressHuffmanCompressedLiterals(ZSTDGPU_RO_RAW_BUFFER(uint32_t)
         zstdgpu_LitStreamInfo compressedLiteral = LitRefs[literalStreamId];
 
-#if 0
-        zstdgpu_Backward_BitBuffer_V0 bitBuffer;
-        zstdgpu_Backward_BitBuffer_V0_InitWithSegment(bitBuffer, CompressedData, compressedLiteral.src);
-
-        uint32_t state = zstdgpu_Backward_BitBuffer_V0_Get_Huffman(bitBuffer, bitsMax, bitsMax);
-        uint32_t decodedByteCnt = 0;
-        while (decodedByteCnt < compressedLiteral.dst.size)
+        if (compressedLiteral.dst.size != 0) // derived from block Regenerated_Size
         {
-            uint32_t symbol = 0;
-            uint32_t bitcnt = 0;
-            zstdgpu_SampleHuffmanSymbolAndBitcnt(symbol, bitcnt, state, GS_HuffmanTable);
+            uint32_t decodedByteCnt = 0;
 
-            // FIXME/TODO(pamartis): Experiment with storing data to LDS first (we have some allocated but unused)
-            // and then to memory. At least try small LDS cache of 32-dwords per literal
-            zstdgpu_TypedStoreU8(DecompressedLiterals, compressedLiteral.dst.offs + decodedByteCnt++, symbol);
+            // This original approach won't compile, since Backward_BitBuffer_V0 expects a StructuredBuffer
+            // but CompressedData is a ByteAddressBuffer. We could remove all raw-buffer usage and reintroduce it later;
+            // one 64-bit load isn't much better than two (on AMD: s_clause'd) 32-bit loads.
+            //
+            // The benefit of raw buffers over StructuredBuffer is that any of Load{1,2,3,4} can be used and,
+            // when applicable, they are nicer for SMEM (s_buffer_load does not use the SRD stride to compute the address).
+#if 0
+            zstdgpu_Backward_BitBuffer_V0 bitBuffer;
+            zstdgpu_Backward_BitBuffer_V0_InitWithSegment(bitBuffer, CompressedData, compressedLiteral.src);
-            if (zstdgpu_Backward_BitBuffer_V0_CanRefill_Huffman(bitBuffer, bitcnt))
+            uint32_t state = zstdgpu_Backward_BitBuffer_V0_Get_Huffman(bitBuffer, bitsMax, bitsMax);
+            uint32_t decodedByteCnt = 0;
+            for (;;)
             {
+                uint32_t symbol = 0;
+                uint32_t bitcnt = 0;
+                zstdgpu_SampleHuffmanSymbolAndBitcnt(symbol, bitcnt, state, GS_HuffmanTable);
+
+                // FIXME/TODO(pamartis): Experiment with storing data to LDS first (we have some allocated but unused)
+                // and then to memory. At least try small LDS cache of 32-dwords per literal
+                zstdgpu_TypedStoreU8(DecompressedLiterals, compressedLiteral.dst.offs + decodedByteCnt++, symbol);
+
+                if (decodedByteCnt == compressedLiteral.dst.size)
+                {
+                    break;
+                }
+
                 const uint32_t rest = zstdgpu_Backward_BitBuffer_V0_Get_Huffman(bitBuffer, bitcnt, bitsMax);
                 state = ((state << bitcnt) + rest) & maxBitcntMask;
             }
-            else
-            {
-                break;
-            }
-        }
 #else
-        if (compressedLiteral.dst.size != 0) // derived from block Regenerated_Size
-        {
             zstdgpu_HuffmanStream stream;
             zstdgpu_HuffmanStream_InitWithSegment(stream, CompressedData, compressedLiteral.src, bitsMax);
-
-            uint32_t decodedByteCnt = 0;
             do
             {
                 const uint32_t state = zstdgpu_HuffmanStream_RefillAndPeek(stream);
@@ -3076,8 +3080,8 @@ void zstdgpu_DecompressHuffmanCompressedLiterals(ZSTDGPU_RO_RAW_BUFFER(uint32_t)
                 // It could make sense to mid-break on (decodedByteCnt == compressedLiteral.dst.size) instead.
                 zstdgpu_HuffmanStream_Consume(stream, bitcnt);
             } while (decodedByteCnt < compressedLiteral.dst.size);
 
-        }
 #endif
+        }
     }
 }
@@ -3499,14 +3503,6 @@ static void zstdgpu_ShaderEntry_DecompressSequences_LdsFseCache(ZSTDGPU_PARAM_IN
 
     const zstdgpu_SeqStreamInfo seqRef = srt.inSeqRefs[seqStreamIdx];
 
-    #ifdef ZSTDGPU_BACKWARD_BITBUF
-    # error `ZSTDGPU_BACKWARD_BITBUF` must not be defined.
-    #endif
-
-    zstdgpu_Backward_BitBuffer_V0 bitBuffer;
-    #define ZSTDGPU_BACKWARD_BITBUF(method) zstdgpu_Backward_BitBuffer_V0_##method
-    ZSTDGPU_BACKWARD_BITBUF(InitWithSegment)(bitBuffer, srt.inCompressedData, seqRef.src);
-
     #ifndef __hlsl_dx_compiler
     const uint32_t SEQ_LITERAL_LENGTH_BASELINES[36] =
     {
@@ -3531,7 +3527,7 @@ static void zstdgpu_ShaderEntry_DecompressSequences_LdsFseCache(ZSTDGPU_PARAM_IN
 
     // NOTE: the final block size will be computed as SUM(literalSize, totalMLen)
     const uint32_t literalSize = srt.inoutBlockSizePrefix[seqRef.blockId];
-    uint32_t totalSize = 0;
+    // uint32_t totalSize = 0;
     uint32_t totalMLen = 0;
 
     uint32_t offset1, offset2, offset3;
@@ -3542,8 +3538,6 @@ static void zstdgpu_ShaderEntry_DecompressSequences_LdsFseCache(ZSTDGPU_PARAM_IN
     const uint32_t startMLen = seqRef.fseMLen * kzstdgpu_FseElemMaxCount_LLen;
 
     const zstdgpu_OffsetAndSize dst = zstdgpu_GetSequenceStartAndCount(srt, seqStreamIdx, seqStreamCnt);
-    const uint32_t outputStart = dst.offs;
-    const uint32_t outputEnd = outputStart + dst.size;
 
     #include "zstdgpu_lds_decl_base.h"
     ZSTDGPU_DECOMPRESS_SEQUENCES_LDS_FSE_CACHE_LDS(0, DecompressSequences_LdsFseCache);
@@ -3574,114 +3568,125 @@ static void zstdgpu_ShaderEntry_DecompressSequences_LdsFseCache(ZSTDGPU_PARAM_IN
     ZSTDGPU_PRELOAD_FSE_INTO_LDS(MLen)
 
     #if !defined(__XBOX_SCARLETT)
-    if (tgSize > WaveGetLaneCount())
-    {
-        GroupMemoryBarrierWithGroupSync();
-    }
-    if (threadId >= WaveGetLaneCount())
+    GroupMemoryBarrierWithGroupSync();
+    #endif
+
+    // The rest of the shader should be scalar. Ideally the compiler should emit mostly scalar instructions,
+    // but this may help it, or deactivate unnecessary lanes for instructions with no scalar counterpart (LDS loads).
+    if (threadId != 0)
     {
         return;
     }
-    #endif
-
-    #define ZSTDGPU_INIT_FSE_STATE(name) \
-        uint32_t state##name = 0; \
-        if (seqRef.fse##name < kzstdgpu_FseProbTableIndex_MinRLE) \
-        { \
-            const uint32_t initBitcnt = srt.inFseInfos[seqRef.fse##name].fseProbCountAndAccuracyLog2 >> 8; \
-            state##name = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, initBitcnt); \
-        }
-
-    ZSTDGPU_INIT_FSE_STATE(LLen)
-    ZSTDGPU_INIT_FSE_STATE(Offs)
-    ZSTDGPU_INIT_FSE_STATE(MLen)
-    #undef ZSTDGPU_INIT_FSE_STATE
-
-    #define ZSTGPU_DECODE_SEQ(outIdx, outNState, outRestBitcnt) \
-    { \
-        const uint32_t packedFseElemLLen = WaveReadLaneFirst(zstdgpu_LdsLoadU32(GS_FsePackedLLen + stateLLen));\
-        const uint32_t packedFseElemOffs = WaveReadLaneFirst(zstdgpu_LdsLoadU32(GS_FsePackedOffs + stateOffs));\
-        const uint32_t packedFseElemMLen = WaveReadLaneFirst(zstdgpu_LdsLoadU32(GS_FsePackedMLen + stateMLen));\
-        \
-        const uint32_t symbolLLen = packedFseElemLLen & 0xff; \
-        const uint32_t symbolOffs = packedFseElemOffs & 0xff; \
-        const uint32_t symbolMLen = packedFseElemMLen & 0xff; \
-        \
-        outRestBitcnt##LLen = (packedFseElemLLen >> 8) & 0xff; \
-        outRestBitcnt##Offs = (packedFseElemOffs >> 8) & 0xff; \
-        outRestBitcnt##MLen = (packedFseElemMLen >> 8) & 0xff; \
-        \
-        outNState##LLen = (packedFseElemLLen >> 16) & 0xffff; \
-        outNState##Offs = (packedFseElemOffs >> 16) & 0xffff; \
-        outNState##MLen = (packedFseElemMLen >> 16) & 0xffff; \
-        \
-        ZSTDGPU_ASSERT(symbolLLen < 36); \
-        ZSTDGPU_ASSERT(symbolMLen < 53); \
-        \
-        const uint32_t bitcntLLen = SEQ_LITERAL_LENGTH_EXTRA_BITS[symbolLLen]; \
-        const uint32_t bitcntOffs = symbolOffs; \
-        const uint32_t bitcntMLen = SEQ_MATCH_LENGTH_EXTRA_BITS[symbolMLen]; \
-        \
-        const uint32_t bitsOffs = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, bitcntOffs); \
-        const uint32_t bitsMLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, bitcntMLen); \
-        const uint32_t bitsLLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, bitcntLLen); \
-        \
-        uint32_t offs = (1u << symbolOffs) + bitsOffs; \
-        const uint32_t mlen = SEQ_MATCH_LENGTH_BASELINES[symbolMLen] + bitsMLen; \
-        const uint32_t llen = SEQ_LITERAL_LENGTH_BASELINES[symbolLLen] + bitsLLen; \
-        \
-        offs = zstdgpu_SequenceOffsets_Update2(offset1, offset2, offset3, offs, llen); \
-        \
-        totalSize += llen + mlen; \
-        totalMLen += mlen; \
-        \
-        srt.inoutDecompressedSequenceLLen[outIdx] = llen; \
-        srt.inoutDecompressedSequenceMLen[outIdx] = mlen; \
-        srt.inoutDecompressedSequenceOffs[outIdx] = offs; \
-    }
-
-    for (uint32_t i = outputStart; i < outputEnd - 1; ++i)
-    {
-        uint32_t restbitcntLLen, restbitcntOffs, restbitcntMLen;
-        uint32_t nstateLLen, nstateOffs, nstateMLen;
-        ZSTGPU_DECODE_SEQ(i, nstate, restbitcnt)
-
-        #if 0
-        const uint32_t restLLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntLLen);
-        const uint32_t restMLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntMLen);
-        const uint32_t restOffs = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntOffs);
-        #else
-        // NOTE(pamartis): bit counts stored in FSE tables are equal to accuracy_log in worst case
-        // so it's 9 for LLen/MLen and 8 for offset, so we are not extracting more than 26 bits at once
-        uint32_t packedBits = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntLLen + restbitcntMLen + restbitcntOffs);
-
-        const uint32_t restOffs = packedBits & ((1u << restbitcntOffs) - 1u);
-        packedBits >>= restbitcntOffs;
-
-        const uint32_t restMLen = packedBits & ((1u << restbitcntMLen) - 1u);
-        packedBits >>= restbitcntMLen;
-
-        const uint32_t restLLen = packedBits;
-        #endif
-
-        stateLLen = nstateLLen + restLLen;
-        stateMLen = nstateMLen + restMLen;
-        stateOffs = nstateOffs + restOffs;
-    }
-
-    uint32_t restbitcntLLen, restbitcntOffs, restbitcntMLen;
-    uint32_t nstateLLen, nstateOffs, nstateMLen;
-    ZSTGPU_DECODE_SEQ(outputEnd - 1, nstate, restbitcnt)
-    #undef ZSTDGPU_BACKWARD_BITBUF
+    if (dst.size != 0)
+    {
+        #ifdef ZSTDGPU_BACKWARD_BITBUF
+        # error `ZSTDGPU_BACKWARD_BITBUF` must not be defined.
+        #endif
+
+        zstdgpu_Backward_BitBuffer_V0 bitBuffer;
+        #define ZSTDGPU_BACKWARD_BITBUF(method) zstdgpu_Backward_BitBuffer_V0_##method
+        ZSTDGPU_BACKWARD_BITBUF(InitWithSegment)(bitBuffer, srt.inCompressedData, seqRef.src);
+
+        #define ZSTDGPU_INIT_FSE_STATE(name) \
+            uint32_t state##name = 0; \
+            if (seqRef.fse##name < kzstdgpu_FseProbTableIndex_MinRLE) \
+            { \
+                const uint32_t initBitcnt = srt.inFseInfos[seqRef.fse##name].fseProbCountAndAccuracyLog2 >> 8; \
+                state##name = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, initBitcnt); \
+            }
+
+        ZSTDGPU_INIT_FSE_STATE(LLen)
+        ZSTDGPU_INIT_FSE_STATE(Offs)
+        ZSTDGPU_INIT_FSE_STATE(MLen)
+        #undef ZSTDGPU_INIT_FSE_STATE
+
+        #define ZSTGPU_DECODE_SEQ(outIdx, outNState, outRestBitcnt) \
+        { \
+            const uint32_t packedFseElemLLen = WaveReadLaneFirst(zstdgpu_LdsLoadU32(GS_FsePackedLLen + stateLLen));\
+            const uint32_t packedFseElemOffs = WaveReadLaneFirst(zstdgpu_LdsLoadU32(GS_FsePackedOffs + stateOffs));\
+            const uint32_t packedFseElemMLen = WaveReadLaneFirst(zstdgpu_LdsLoadU32(GS_FsePackedMLen + stateMLen));\
+            \
+            const uint32_t symbolLLen = packedFseElemLLen & 0xff; \
+            const uint32_t symbolOffs = packedFseElemOffs & 0xff; \
+            const uint32_t symbolMLen = packedFseElemMLen & 0xff; \
+            \
+            outRestBitcnt##LLen = (packedFseElemLLen >> 8) & 0xff; \
+            outRestBitcnt##Offs = (packedFseElemOffs >> 8) & 0xff; \
+            outRestBitcnt##MLen = (packedFseElemMLen >> 8) & 0xff; \
+            \
+            outNState##LLen = (packedFseElemLLen >> 16) & 0xffff; \
+            outNState##Offs = (packedFseElemOffs >> 16) & 0xffff; \
+            outNState##MLen = (packedFseElemMLen >> 16) & 0xffff; \
+            \
+            ZSTDGPU_ASSERT(symbolLLen < 36); \
+            ZSTDGPU_ASSERT(symbolMLen < 53); \
+            \
+            const uint32_t bitcntLLen = SEQ_LITERAL_LENGTH_EXTRA_BITS[symbolLLen]; \
+            const uint32_t bitcntOffs = symbolOffs; \
+            const uint32_t bitcntMLen = SEQ_MATCH_LENGTH_EXTRA_BITS[symbolMLen]; \
+            \
+            const uint32_t bitsOffs = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, bitcntOffs); \
+            const uint32_t bitsMLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, bitcntMLen); \
+            const uint32_t bitsLLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, bitcntLLen); \
+            \
+            uint32_t offs = (1u << symbolOffs) + bitsOffs; \
+            const uint32_t mlen = SEQ_MATCH_LENGTH_BASELINES[symbolMLen] + bitsMLen; \
+            const uint32_t llen = SEQ_LITERAL_LENGTH_BASELINES[symbolLLen] + bitsLLen; \
+            \
+            offs = zstdgpu_SequenceOffsets_Update2(offset1, offset2, offset3, offs, llen); \
+            \
+            /*totalSize += llen + mlen;*/ \
+            totalMLen += mlen; \
+            \
+            srt.inoutDecompressedSequenceLLen[outIdx] = llen; \
+            srt.inoutDecompressedSequenceMLen[outIdx] = mlen; \
+            srt.inoutDecompressedSequenceOffs[outIdx] = offs; \
+        }
+
+        uint32_t i = dst.offs;
+        const uint32_t outputEnd = dst.offs + dst.size;
+        for (;;)
+        {
+            uint32_t restbitcntLLen, restbitcntOffs, restbitcntMLen;
+            uint32_t nstateLLen, nstateOffs, nstateMLen;
+            ZSTGPU_DECODE_SEQ(i, nstate, restbitcnt)
+            if (++i == outputEnd)
+            {
+                break;
+            }
+
+            #if 0
+            const uint32_t restLLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntLLen);
+            const uint32_t restMLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntMLen);
+            const uint32_t restOffs = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntOffs);
+            #else
+            // NOTE(pamartis): bit counts stored in FSE tables are equal to accuracy_log in worst case
+            // so it's 9 for LLen/MLen and 8 for offset, so we are not extracting more than 26 bits at once
+            uint32_t packedBits = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntLLen + restbitcntMLen + restbitcntOffs);
+
+            const uint32_t restOffs = packedBits & ((1u << restbitcntOffs) - 1u);
+            packedBits >>= restbitcntOffs;
+
+            const uint32_t restMLen = packedBits & ((1u << restbitcntMLen) - 1u);
+            packedBits >>= restbitcntMLen;
+
+            const uint32_t restLLen = packedBits;
+            #endif
+
+            stateLLen = nstateLLen + restLLen;
+            stateMLen = nstateMLen + restMLen;
+            stateOffs = nstateOffs + restOffs;
+        }
+        ZSTDGPU_ASSERT(bitBuffer.hadlastrefill && bitBuffer.bitcnt == 0);
+        #undef ZSTDGPU_BACKWARD_BITBUF
+    }
 
     // NOTE(pamartis): update block size adding `totalMLen` bytes on top
     srt.inoutBlockSizePrefix[seqRef.blockId] = totalMLen + literalSize;
     srt.inoutPerSeqStreamFinalOffset1[seqStreamIdx] = offset1;
     srt.inoutPerSeqStreamFinalOffset2[seqStreamIdx] = offset2;
     srt.inoutPerSeqStreamFinalOffset3[seqStreamIdx] = offset3;
-
-    ZSTDGPU_ASSERT(bitBuffer.hadlastrefill && bitBuffer.bitcnt == 0);
 }
 
 // LDS partitioning macro lists for sequence decompression with in-LDS caching
diff --git a/zstd/zstdgpu/zstdgpu_structs.h b/zstd/zstdgpu/zstdgpu_structs.h
index 5809794..4f6d025 100644
--- a/zstd/zstdgpu/zstdgpu_structs.h
+++ b/zstd/zstdgpu/zstdgpu_structs.h
@@ -443,6 +443,32 @@ static inline uint32_t zstdgpu_FindFirstBitHiU32(uint32_t v)
 {
 #endif
 }
+
+static inline uint32_t zstdgpu_FindFirstBitHiU32_Nonzero(uint32_t v)
+{
+#ifdef __hlsl_dx_compiler
+    // On AMD RDNA3, {v,s}_clz_i32_u32 return -1 for an input of 0, instead of returning 32.
+    // So firstbithigh can't directly be implemented as 31 - {v,s}_clz_i32_u32;
+    // there are additional fixup instructions, currently even when or-ing the input with 1.
+    // The current AMD driver compiler does not do a great job with uniform firstbithigh:
+    //     v_clz_i32_u32  v0, s10              // input s10 is scalar, but didn't use SALU s_clz_i32_u32
+    //     v_sub_nc_u32   v1, 31, v0
+    //     v_cmp_ne_i32   vcc_lo, -1, v0
+    //     v_cndmask_b32  v0, -1, v1, vcc_lo    // final result in v0
+    // The following formulation gets us:
+    //     s_brev_b32     s13, s10
+    //     s_ctz_i32_b32  s13, s13
+    //     s_sub_u32      s13, 31, s13          // final result in s13
+    // Which isn't identical when v==0, so only use this when v!=0.
+    // If input is non-uniform (VALU), we still save an instruction.
+    return 31 - firstbitlow(reversebits(v));
+#else
+    unsigned long index = 0;
+    uint32_t found = _BitScanReverse(&index, v);
+    ZSTDGPU_ASSERT(0 != found);
+    return found ? (uint32_t)index : 32u; // found should be true, but do this to match GPU behavior
+#endif
+}
+
 static inline uint32_t zstdgpu_FindFirstBitHiU64(uint64_t x)
 {
 #if defined(__hlsl_dx_compiler)
@@ -804,7 +830,6 @@ typedef struct zstdgpu_Backward_BitBuffer_V0
     uint32_t lastDword; // VGPR as it store any memory block size varying per lane
     uint32_t baseDword;
     bool hadlastrefill;
-    bool hadlastrefillHuffman;
 } zstdgpu_Backward_BitBuffer_V0;
 
 typedef struct zstdgpu_Backward_BitBuffer
@@ -849,7 +874,7 @@ static inline void zstdgpu_Backward_BitBuffer_V0_InitWithSegment(ZSTDGPU_PARAM_I
     uint32_t bitbuf = buffer[lastDword] & bitmsk;
 
     // Secondly, we search for the highest set bit to see how many bits are valid
-    bitcnt = zstdgpu_FindFirstBitHiU32(bitbuf);
+    bitcnt = zstdgpu_FindFirstBitHiU32_Nonzero(bitbuf);
 #ifdef ZSTDGPU_USE_REVERSED_BIT_BUFFER_BITBUF
     bitbuf <<= 32u - bitcnt;
     bitbuf = reversebits(bitbuf);
@@ -879,7 +904,6 @@ static inline void zstdgpu_Backward_BitBuffer_V0_InitWithSegment(ZSTDGPU_PARAM_I
     outBuffer.lastDword = lastDword;
     outBuffer.baseDword = baseDword;
     outBuffer.hadlastrefill = baseDword == lastDword;
-    outBuffer.hadlastrefillHuffman = false;
     //outBuffer.bytesz = bytesz;
 }
 
@@ -960,19 +984,6 @@ static inline void zstdgpu_Backward_BitBuffer_V0_Pop(ZSTDGPU_PARAM_INOUT(zstdgpu
 
 ZSTDGPU_BITBUF_DEFINE_STANDARD_METHODS(Backward_BitBuffer_V0)
 
-static inline bool zstdgpu_Backward_BitBuffer_V0_CanRefill_Huffman(ZSTDGPU_PARAM_IN(zstdgpu_Backward_BitBuffer_V0) inBuffer, uint32_t bitcnt)
-{
-    ZSTDGPU_ASSERT(bitcnt <= 32);
-    if (inBuffer.bitcnt >= bitcnt)
-    {
-        return true;
-    }
-    else
-    {
-        return !inBuffer.hadlastrefillHuffman;
-    }
-}
-
 static inline void zstdgpu_Backward_BitBuffer_V0_Refill_Huffman(ZSTDGPU_PARAM_INOUT(zstdgpu_Backward_BitBuffer_V0) inoutBuffer, uint32_t bitcnt, uint32_t extrabits)
 {
     if (inoutBuffer.hadlastrefill == false)
@@ -983,7 +994,6 @@ static inline void zstdgpu_Backward_BitBuffer_V0_Refill_Huffman(ZSTDGPU_PARAM_IN
     if (inoutBuffer.bitcnt < bitcnt)
     {
         inoutBuffer.bitcnt += extrabits; // simply increment counter because upper bits are zeros
-        inoutBuffer.hadlastrefillHuffman = true;
     }
 }
 
@@ -998,7 +1008,6 @@ static inline uint32_t zstdgpu_Backward_BitBuffer_V0_Get_Huffman(ZSTDGPU_PARAM_I
     {
         inoutBuffer.bitcnt += extrabits; // simply increment counter because upper bits are zeros
         inoutBuffer.bitbuf <<= extrabits;
-        inoutBuffer.hadlastrefillHuffman = true;
     }
 
     uint32_t result = zstdgpu_Backward_BitBuffer_V0_Top(inoutBuffer, bitcnt);
@@ -1052,7 +1061,7 @@ static inline void zstdgpu_HuffmanStream_InitWithSegment(ZSTDGPU_PARAM_INOUT(zst
     // Count number of leading zero bits above the flag (result in [0:7]).
     // There is no 64-bit version of v_clz_i32_u32 and uint32_t(data64 >> 32) is free since U64 is a pair of VGPRs.
    // Add one (reverse-subtract by 32, not 31) to also shift out the flag itself.
-    const uint32_t nonDataBitCount = 32 - zstdgpu_FindFirstBitHiU32(uint32_t(data64 >> 32));
+    const uint32_t nonDataBitCount = 32 - zstdgpu_FindFirstBitHiU32_Nonzero(uint32_t(data64 >> 32));
     data64 <<= nonDataBitCount;
 
     const uint32_t keptBitCount = 64 - (oobBitCount + nonDataBitCount); // could be 0
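
For reference, the ZSTGPU_DECODE_SEQ macro above unpacks one 32-bit FSE table element per state: the symbol sits in bits [0,8), the number of state-transition bits in [8,16), and the next-state base in [16,32); the next state is that base plus the freshly read bits. Below is a minimal host-side C++ sketch of just that unpack/update step; the struct and function names are made up for illustration and are not part of zstdgpu.

#include <cassert>
#include <cstdint>

// Packed FSE element layout implied by ZSTGPU_DECODE_SEQ (illustrative names).
struct FseStep
{
    uint32_t symbol;        // bits [0,8)
    uint32_t restBitcnt;    // bits [8,16): how many stream bits feed the state transition
    uint32_t nextStateBase; // bits [16,32)
};

static FseStep UnpackFseElem(uint32_t packedFseElem)
{
    FseStep step;
    step.symbol        = packedFseElem & 0xff;
    step.restBitcnt    = (packedFseElem >> 8) & 0xff;
    step.nextStateBase = (packedFseElem >> 16) & 0xffff;
    return step;
}

// Mirrors `stateLLen = nstateLLen + restLLen;` in the patch: new state = base + read bits.
static uint32_t NextState(const FseStep &step, uint32_t restBits)
{
    return step.nextStateBase + restBits;
}

int main()
{
    const uint32_t packed = (5u << 16) | (3u << 8) | 42u; // base 5, 3 transition bits, symbol 42
    const FseStep step = UnpackFseElem(packed);
    assert(step.symbol == 42 && step.restBitcnt == 3 && step.nextStateBase == 5);
    assert(NextState(step, 0b101) == 10); // 5 + 0b101
    return 0;
}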
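The two init paths that now call zstdgpu_FindFirstBitHiU32_Nonzero both locate the sentinel 1-bit that terminates a zstd backward bitstream: the bits below the sentinel are payload and the 0 to 7 bits above it are padding, which is why the input is never zero for a well-formed stream. A small host-side sketch of that byte-level convention follows; it is an assumption-laden toy model, not the shader's zstdgpu_Backward_BitBuffer_V0 or zstdgpu_HuffmanStream code.

#include <cassert>
#include <cstdint>

// Toy model: given the final byte of a backward bitstream, the highest set bit
// is the flag, and only the bits below it count as data.
static uint32_t ValidBitsInLastByte(uint8_t lastByte)
{
    assert(lastByte != 0); // a well-formed stream always carries the sentinel
    uint32_t hi = 0;
    while ((lastByte >> hi) > 1u) // index of the highest set bit
    {
        ++hi;
    }
    return hi; // bits [0, hi) are payload; bit hi is the flag itself
}

int main()
{
    assert(ValidBitsInLastByte(0x01) == 0); // only the flag: 0 payload bits (maximum padding)
    assert(ValidBitsInLastByte(0x80) == 7); // flag in the top bit: 7 payload bits (no padding)
    assert(ValidBitsInLastByte(0x2D) == 5); // flag at bit 5: payload bits are 0b01101
    return 0;
}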
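The identity behind the new zstdgpu_FindFirstBitHiU32_Nonzero helper can also be checked on the CPU. The sketch below is standalone C++20 (std::countl_zero/std::countr_zero stand in for the clz/ctz hardware ops, and ReverseBits32 is a plain-loop stand-in for the HLSL reversebits intrinsic); none of these names come from zstdgpu.

#include <bit>
#include <cassert>
#include <cstdint>

// Plain loop stand-in for the HLSL reversebits() intrinsic.
static uint32_t ReverseBits32(uint32_t v)
{
    uint32_t r = 0;
    for (int i = 0; i < 32; ++i)
    {
        r = (r << 1) | ((v >> i) & 1u);
    }
    return r;
}

int main()
{
    const uint32_t samples[] = { 1u, 2u, 0x80000000u, 0x00010000u, 0xdeadbeefu };
    for (uint32_t v : samples)
    {
        // firstbithigh(v) for v != 0 is 31 minus the number of leading zeros.
        const uint32_t viaClz = 31u - (uint32_t)std::countl_zero(v);
        // The patch's formulation: reversing the bits turns leading zeros into
        // trailing zeros, so firstbitlow(reversebits(v)) == countl_zero(v).
        const uint32_t viaReverse = 31u - (uint32_t)std::countr_zero(ReverseBits32(v));
        assert(viaClz == viaReverse); // equal for every nonzero input; v == 0 is excluded by contract
    }
    return 0;
}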