diff --git a/zstd/zstdgpu/zstdgpu.cpp b/zstd/zstdgpu/zstdgpu.cpp
index 29a976f..e234b62 100644
--- a/zstd/zstdgpu/zstdgpu.cpp
+++ b/zstd/zstdgpu/zstdgpu.cpp
@@ -560,19 +560,20 @@ zstdgpu_Status zstdgpu_CreatePerRequestContext(zstdgpu_PerRequestContext *outPer
     context->ExecuteSequences = context->ExecuteSequences64;
     context->DecompressSequences_LdsFseCache = context->DecompressSequences_LdsFseCache32;
 #else
-    if (persistentContext->maxLaneCount == 128)
+    if (persistentContext->minLaneCount == 128)
     {
         context->ExecuteSequences = context->ExecuteSequences128;
         context->DecompressSequences_LdsFseCache = context->DecompressSequences_LdsFseCache128;
     }
-    else if (persistentContext->maxLaneCount == 64)
+    else if (persistentContext->minLaneCount == 64)
     {
         context->ExecuteSequences = context->ExecuteSequences64;
         context->DecompressSequences_LdsFseCache = context->DecompressSequences_LdsFseCache64;
     }
     else
     {
-        context->ExecuteSequences = context->ExecuteSequences32;
+        context->ExecuteSequences = (persistentContext->maxLaneCount >= 64) ? context->ExecuteSequences64
+                                                                            : context->ExecuteSequences32;
         context->DecompressSequences_LdsFseCache = context->DecompressSequences_LdsFseCache32;
     }
 #endif
diff --git a/zstd/zstdgpu/zstdgpu_shaders.h b/zstd/zstdgpu/zstdgpu_shaders.h
index f9c296f..07cfc7f 100644
--- a/zstd/zstdgpu/zstdgpu_shaders.h
+++ b/zstd/zstdgpu/zstdgpu_shaders.h
@@ -3030,39 +3030,43 @@ void zstdgpu_DecompressHuffmanCompressedLiterals(ZSTDGPU_RO_RAW_BUFFER(uint32_t)
         zstdgpu_LitStreamInfo compressedLiteral = LitRefs[literalStreamId];
 
-#if 0
-        zstdgpu_Backward_BitBuffer_V0 bitBuffer;
-        zstdgpu_Backward_BitBuffer_V0_InitWithSegment(bitBuffer, CompressedData, compressedLiteral.src);
-
-        uint32_t state = zstdgpu_Backward_BitBuffer_V0_Get_Huffman(bitBuffer, bitsMax, bitsMax);
-        uint32_t decodedByteCnt = 0;
-        while (decodedByteCnt < compressedLiteral.dst.size)
+        if (compressedLiteral.dst.size != 0) // derived from block Regenerated_Size
         {
-            uint32_t symbol = 0;
-            uint32_t bitcnt = 0;
-            zstdgpu_SampleHuffmanSymbolAndBitcnt(symbol, bitcnt, state, GS_HuffmanTable);
+            uint32_t decodedByteCnt = 0;
 
-            // FIXME/TODO(pamartis): Experiment with storing data to LDS first (we have some allocated but unused)
-            // and then to memory. At least try small LDS cache of 32-dwords per literal
-            zstdgpu_TypedStoreU8(DecompressedLiterals, compressedLiteral.dst.offs + decodedByteCnt++, symbol);
+            // This original approach won't compile, since Backward_BitBuffer_V0 expects a StructuredBuffer
+            // but CompressedData is a ByteAddressBuffer. We could remove all raw-buffer usage and reintroduce it later;
+            // one 64-bit load isn't much better than two (on AMD: s_clause'd) 32-bit loads.
+            //
+            // The benefit of raw buffers over StructuredBuffer is that any of Load{1,2,3,4} can be used and,
+            // when applicable, they are nicer for SMEM (s_buffer_load does not use the SRD stride to compute the address).
+#if 0
+            zstdgpu_Backward_BitBuffer_V0 bitBuffer;
+            zstdgpu_Backward_BitBuffer_V0_InitWithSegment(bitBuffer, CompressedData, compressedLiteral.src);
-            if (zstdgpu_Backward_BitBuffer_V0_CanRefill_Huffman(bitBuffer, bitcnt))
+            uint32_t state = zstdgpu_Backward_BitBuffer_V0_Get_Huffman(bitBuffer, bitsMax, bitsMax);
+            uint32_t decodedByteCnt = 0;
+            for (;;)
             {
+                uint32_t symbol = 0;
+                uint32_t bitcnt = 0;
+                zstdgpu_SampleHuffmanSymbolAndBitcnt(symbol, bitcnt, state, GS_HuffmanTable);
+
+                // FIXME/TODO(pamartis): Experiment with storing data to LDS first (we have some allocated but unused)
+                // and then to memory. At least try small LDS cache of 32-dwords per literal
+                zstdgpu_TypedStoreU8(DecompressedLiterals, compressedLiteral.dst.offs + decodedByteCnt++, symbol);
+
+                if (decodedByteCnt == compressedLiteral.dst.size)
+                {
+                    break;
+                }
+
                 const uint32_t rest = zstdgpu_Backward_BitBuffer_V0_Get_Huffman(bitBuffer, bitcnt, bitsMax);
                 state = ((state << bitcnt) + rest) & maxBitcntMask;
             }
-            else
-            {
-                break;
-            }
-        }
 #else
-        if (compressedLiteral.dst.size != 0) // derived from block Regenerated_Size
-        {
             zstdgpu_HuffmanStream stream;
             zstdgpu_HuffmanStream_InitWithSegment(stream, CompressedData, compressedLiteral.src, bitsMax);
-
-            uint32_t decodedByteCnt = 0;
             do
             {
                 const uint32_t state = zstdgpu_HuffmanStream_RefillAndPeek(stream);
@@ -3076,8 +3080,8 @@ void zstdgpu_DecompressHuffmanCompressedLiterals(ZSTDGPU_RO_RAW_BUFFER(uint32_t)
                 // It could make sense to mid-break on (decodedByteCnt == compressedLiteral.dst.size) instead.
                 zstdgpu_HuffmanStream_Consume(stream, bitcnt);
             } while (decodedByteCnt < compressedLiteral.dst.size);
 
-        }
 #endif
+        }
     }
 }
@@ -3499,14 +3503,6 @@ static void zstdgpu_ShaderEntry_DecompressSequences_LdsFseCache(ZSTDGPU_PARAM_IN
 
     const zstdgpu_SeqStreamInfo seqRef = srt.inSeqRefs[seqStreamIdx];
 
-    #ifdef ZSTDGPU_BACKWARD_BITBUF
-    # error `ZSTDGPU_BACKWARD_BITBUF` must not be defined.
-    #endif
-
-    zstdgpu_Backward_BitBuffer_V0 bitBuffer;
-    #define ZSTDGPU_BACKWARD_BITBUF(method) zstdgpu_Backward_BitBuffer_V0_##method
-    ZSTDGPU_BACKWARD_BITBUF(InitWithSegment)(bitBuffer, srt.inCompressedData, seqRef.src);
-
     #ifndef __hlsl_dx_compiler
     const uint32_t SEQ_LITERAL_LENGTH_BASELINES[36] =
     {
@@ -3531,7 +3527,7 @@ static void zstdgpu_ShaderEntry_DecompressSequences_LdsFseCache(ZSTDGPU_PARAM_IN
 
     // NOTE: the final block size will be computed as SUM(literalSize, totalMLen)
     const uint32_t literalSize = srt.inoutBlockSizePrefix[seqRef.blockId];
-    uint32_t totalSize = 0;
+    // uint32_t totalSize = 0;
     uint32_t totalMLen = 0;
 
     uint32_t offset1, offset2, offset3;
@@ -3542,8 +3538,6 @@ static void zstdgpu_ShaderEntry_DecompressSequences_LdsFseCache(ZSTDGPU_PARAM_IN
     const uint32_t startMLen = seqRef.fseMLen * kzstdgpu_FseElemMaxCount_LLen;
 
     const zstdgpu_OffsetAndSize dst = zstdgpu_GetSequenceStartAndCount(srt, seqStreamIdx, seqStreamCnt);
-    const uint32_t outputStart = dst.offs;
-    const uint32_t outputEnd = outputStart + dst.size;
 
     #include "zstdgpu_lds_decl_base.h"
     ZSTDGPU_DECOMPRESS_SEQUENCES_LDS_FSE_CACHE_LDS(0, DecompressSequences_LdsFseCache);
@@ -3574,114 +3568,125 @@ static void zstdgpu_ShaderEntry_DecompressSequences_LdsFseCache(ZSTDGPU_PARAM_IN
     ZSTDGPU_PRELOAD_FSE_INTO_LDS(MLen)
 
     #if !defined(__XBOX_SCARLETT)
-    if (tgSize > WaveGetLaneCount())
-    {
-        GroupMemoryBarrierWithGroupSync();
-    }
-    if (threadId >= WaveGetLaneCount())
+    GroupMemoryBarrierWithGroupSync();
+    #endif
+
+    // The rest of the shader should be scalar. Ideally the compiler should emit mostly scalar instructions,
+    // but this may help it, or deactivate unnecessary lanes for instructions with no scalar counterpart (LDS loads).
+    if (threadId != 0)
     {
         return;
     }
-    #endif
-
-    #define ZSTDGPU_INIT_FSE_STATE(name) \
-        uint32_t state##name = 0; \
-        if (seqRef.fse##name < kzstdgpu_FseProbTableIndex_MinRLE) \
-        { \
-            const uint32_t initBitcnt = srt.inFseInfos[seqRef.fse##name].fseProbCountAndAccuracyLog2 >> 8; \
-            state##name = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, initBitcnt); \
-        }
-
-    ZSTDGPU_INIT_FSE_STATE(LLen)
-    ZSTDGPU_INIT_FSE_STATE(Offs)
-    ZSTDGPU_INIT_FSE_STATE(MLen)
-    #undef ZSTDGPU_INIT_FSE_STATE
-
-    #define ZSTGPU_DECODE_SEQ(outIdx, outNState, outRestBitcnt) \
-    { \
-        const uint32_t packedFseElemLLen = WaveReadLaneFirst(zstdgpu_LdsLoadU32(GS_FsePackedLLen + stateLLen));\
-        const uint32_t packedFseElemOffs = WaveReadLaneFirst(zstdgpu_LdsLoadU32(GS_FsePackedOffs + stateOffs));\
-        const uint32_t packedFseElemMLen = WaveReadLaneFirst(zstdgpu_LdsLoadU32(GS_FsePackedMLen + stateMLen));\
-        \
-        const uint32_t symbolLLen = packedFseElemLLen & 0xff; \
-        const uint32_t symbolOffs = packedFseElemOffs & 0xff; \
-        const uint32_t symbolMLen = packedFseElemMLen & 0xff; \
-        \
-        outRestBitcnt##LLen = (packedFseElemLLen >> 8) & 0xff; \
-        outRestBitcnt##Offs = (packedFseElemOffs >> 8) & 0xff; \
-        outRestBitcnt##MLen = (packedFseElemMLen >> 8) & 0xff; \
-        \
-        outNState##LLen = (packedFseElemLLen >> 16) & 0xffff; \
-        outNState##Offs = (packedFseElemOffs >> 16) & 0xffff; \
-        outNState##MLen = (packedFseElemMLen >> 16) & 0xffff; \
-        \
-        ZSTDGPU_ASSERT(symbolLLen < 36); \
-        ZSTDGPU_ASSERT(symbolMLen < 53); \
-        \
-        const uint32_t bitcntLLen = SEQ_LITERAL_LENGTH_EXTRA_BITS[symbolLLen]; \
-        const uint32_t bitcntOffs = symbolOffs; \
-        const uint32_t bitcntMLen = SEQ_MATCH_LENGTH_EXTRA_BITS[symbolMLen]; \
-        \
-        const uint32_t bitsOffs = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, bitcntOffs); \
-        const uint32_t bitsMLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, bitcntMLen); \
-        const uint32_t bitsLLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, bitcntLLen); \
-        \
-        uint32_t offs = (1u << symbolOffs) + bitsOffs; \
-        const uint32_t mlen = SEQ_MATCH_LENGTH_BASELINES[symbolMLen] + bitsMLen; \
-        const uint32_t llen = SEQ_LITERAL_LENGTH_BASELINES[symbolLLen] + bitsLLen; \
-        \
-        offs = zstdgpu_SequenceOffsets_Update2(offset1, offset2, offset3, offs, llen); \
-        \
-        totalSize += llen + mlen; \
-        totalMLen += mlen; \
-        \
-        srt.inoutDecompressedSequenceLLen[outIdx] = llen; \
-        srt.inoutDecompressedSequenceMLen[outIdx] = mlen; \
-        srt.inoutDecompressedSequenceOffs[outIdx] = offs; \
-    }
-
-    for (uint32_t i = outputStart; i < outputEnd - 1; ++i)
-    {
-        uint32_t restbitcntLLen, restbitcntOffs, restbitcntMLen;
-        uint32_t nstateLLen, nstateOffs, nstateMLen;
-        ZSTGPU_DECODE_SEQ(i, nstate, restbitcnt)
-
-        #if 0
-        const uint32_t restLLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntLLen);
-        const uint32_t restMLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntMLen);
-        const uint32_t restOffs = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntOffs);
-        #else
-        // NOTE(pamartis): bit counts stored in FSE tables are equal to accuracy_log in worst case
-        // so it's 9 for LLen/MLen and 8 for offset, so we are not extracting more than 26 bits at once
-        uint32_t packedBits = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntLLen + restbitcntMLen + restbitcntOffs);
-
-        const uint32_t restOffs = packedBits & ((1u << restbitcntOffs) - 1u);
-        packedBits >>= restbitcntOffs;
-
-        const uint32_t restMLen = packedBits & ((1u << restbitcntMLen) - 1u);
-        packedBits >>= restbitcntMLen;
-
-        const uint32_t restLLen = packedBits;
-        #endif
-
-        stateLLen = nstateLLen + restLLen;
-        stateMLen = nstateMLen + restMLen;
-        stateOffs = nstateOffs + restOffs;
-    }
-
-    uint32_t restbitcntLLen, restbitcntOffs, restbitcntMLen;
-    uint32_t nstateLLen, nstateOffs, nstateMLen;
-    ZSTGPU_DECODE_SEQ(outputEnd - 1, nstate, restbitcnt)
-    #undef ZSTDGPU_BACKWARD_BITBUF
+    if (dst.size != 0)
+    {
+        #ifdef ZSTDGPU_BACKWARD_BITBUF
+        # error `ZSTDGPU_BACKWARD_BITBUF` must not be defined.
+        #endif
+
+        zstdgpu_Backward_BitBuffer_V0 bitBuffer;
+        #define ZSTDGPU_BACKWARD_BITBUF(method) zstdgpu_Backward_BitBuffer_V0_##method
+        ZSTDGPU_BACKWARD_BITBUF(InitWithSegment)(bitBuffer, srt.inCompressedData, seqRef.src);
+
+        #define ZSTDGPU_INIT_FSE_STATE(name) \
+            uint32_t state##name = 0; \
+            if (seqRef.fse##name < kzstdgpu_FseProbTableIndex_MinRLE) \
+            { \
+                const uint32_t initBitcnt = srt.inFseInfos[seqRef.fse##name].fseProbCountAndAccuracyLog2 >> 8; \
+                state##name = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, initBitcnt); \
+            }
+
+        ZSTDGPU_INIT_FSE_STATE(LLen)
+        ZSTDGPU_INIT_FSE_STATE(Offs)
+        ZSTDGPU_INIT_FSE_STATE(MLen)
+        #undef ZSTDGPU_INIT_FSE_STATE
+
+        #define ZSTGPU_DECODE_SEQ(outIdx, outNState, outRestBitcnt) \
+        { \
+            const uint32_t packedFseElemLLen = WaveReadLaneFirst(zstdgpu_LdsLoadU32(GS_FsePackedLLen + stateLLen));\
+            const uint32_t packedFseElemOffs = WaveReadLaneFirst(zstdgpu_LdsLoadU32(GS_FsePackedOffs + stateOffs));\
+            const uint32_t packedFseElemMLen = WaveReadLaneFirst(zstdgpu_LdsLoadU32(GS_FsePackedMLen + stateMLen));\
+            \
+            const uint32_t symbolLLen = packedFseElemLLen & 0xff; \
+            const uint32_t symbolOffs = packedFseElemOffs & 0xff; \
+            const uint32_t symbolMLen = packedFseElemMLen & 0xff; \
+            \
+            outRestBitcnt##LLen = (packedFseElemLLen >> 8) & 0xff; \
+            outRestBitcnt##Offs = (packedFseElemOffs >> 8) & 0xff; \
+            outRestBitcnt##MLen = (packedFseElemMLen >> 8) & 0xff; \
+            \
+            outNState##LLen = (packedFseElemLLen >> 16) & 0xffff; \
+            outNState##Offs = (packedFseElemOffs >> 16) & 0xffff; \
+            outNState##MLen = (packedFseElemMLen >> 16) & 0xffff; \
+            \
+            ZSTDGPU_ASSERT(symbolLLen < 36); \
+            ZSTDGPU_ASSERT(symbolMLen < 53); \
+            \
+            const uint32_t bitcntLLen = SEQ_LITERAL_LENGTH_EXTRA_BITS[symbolLLen]; \
+            const uint32_t bitcntOffs = symbolOffs; \
+            const uint32_t bitcntMLen = SEQ_MATCH_LENGTH_EXTRA_BITS[symbolMLen]; \
+            \
+            const uint32_t bitsOffs = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, bitcntOffs); \
+            const uint32_t bitsMLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, bitcntMLen); \
+            const uint32_t bitsLLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, bitcntLLen); \
+            \
+            uint32_t offs = (1u << symbolOffs) + bitsOffs; \
+            const uint32_t mlen = SEQ_MATCH_LENGTH_BASELINES[symbolMLen] + bitsMLen; \
+            const uint32_t llen = SEQ_LITERAL_LENGTH_BASELINES[symbolLLen] + bitsLLen; \
+            \
+            offs = zstdgpu_SequenceOffsets_Update2(offset1, offset2, offset3, offs, llen); \
+            \
+            /*totalSize += llen + mlen;*/ \
+            totalMLen += mlen; \
+            \
+            srt.inoutDecompressedSequenceLLen[outIdx] = llen; \
+            srt.inoutDecompressedSequenceMLen[outIdx] = mlen; \
+            srt.inoutDecompressedSequenceOffs[outIdx] = offs; \
+        }
+
+        uint32_t i = dst.offs;
+        const uint32_t outputEnd = dst.offs + dst.size;
+        for (;;)
+        {
+            uint32_t restbitcntLLen, restbitcntOffs, restbitcntMLen;
+            uint32_t nstateLLen, nstateOffs, nstateMLen;
+            ZSTGPU_DECODE_SEQ(i, nstate, restbitcnt)
+            if (++i == outputEnd)
+            {
+                break;
+            }
+
+            #if 0
+            const uint32_t restLLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntLLen);
+            const uint32_t restMLen = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntMLen);
+            const uint32_t restOffs = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntOffs);
+            #else
+            // NOTE(pamartis): bit counts stored in FSE tables are equal to accuracy_log in worst case
+            // so it's 9 for LLen/MLen and 8 for offset, so we are not extracting more than 26 bits at once
+            uint32_t packedBits = ZSTDGPU_BACKWARD_BITBUF(Get)(bitBuffer, restbitcntLLen + restbitcntMLen + restbitcntOffs);
+
+            const uint32_t restOffs = packedBits & ((1u << restbitcntOffs) - 1u);
+            packedBits >>= restbitcntOffs;
+
+            const uint32_t restMLen = packedBits & ((1u << restbitcntMLen) - 1u);
+            packedBits >>= restbitcntMLen;
+
+            const uint32_t restLLen = packedBits;
+            #endif
+
+            stateLLen = nstateLLen + restLLen;
+            stateMLen = nstateMLen + restMLen;
+            stateOffs = nstateOffs + restOffs;
+        }
+        ZSTDGPU_ASSERT(bitBuffer.hadlastrefill && bitBuffer.bitcnt == 0);
+        #undef ZSTDGPU_BACKWARD_BITBUF
+    }
 
     // NOTE(pamartis): update block size adding `totalMLen` bytes on top
     srt.inoutBlockSizePrefix[seqRef.blockId] = totalMLen + literalSize;
     srt.inoutPerSeqStreamFinalOffset1[seqStreamIdx] = offset1;
     srt.inoutPerSeqStreamFinalOffset2[seqStreamIdx] = offset2;
     srt.inoutPerSeqStreamFinalOffset3[seqStreamIdx] = offset3;
-
-    ZSTDGPU_ASSERT(bitBuffer.hadlastrefill && bitBuffer.bitcnt == 0);
 }
 
 // LDS partitioning macro lists for sequence decompression with in-LDS caching
diff --git a/zstd/zstdgpu/zstdgpu_structs.h b/zstd/zstdgpu/zstdgpu_structs.h
index 5809794..4f6d025 100644
--- a/zstd/zstdgpu/zstdgpu_structs.h
+++ b/zstd/zstdgpu/zstdgpu_structs.h
@@ -443,6 +443,32 @@ static inline uint32_t zstdgpu_FindFirstBitHiU32(uint32_t v)
 {
 #endif
 }
+
+static inline uint32_t zstdgpu_FindFirstBitHiU32_Nonzero(uint32_t v)
+{
+#ifdef __hlsl_dx_compiler
+    // On AMD RDNA3, {v,s}_clz_i32_u32 return -1 for an input of 0, instead of returning 32.
+    // So firstbithigh can't directly be implemented as 31 - {v,s}_clz_i32_u32;
+    // there are additional fixup instructions, currently even when or-ing the input with 1.
+    // The current AMD driver compiler does not do a great job with uniform firstbithigh:
+    //     v_clz_i32_u32  v0, s10              // input s10 is scalar, but didn't use SALU s_clz_i32_u32
+    //     v_sub_nc_u32   v1, 31, v0
+    //     v_cmp_ne_i32   vcc_lo, -1, v0
+    //     v_cndmask_b32  v0, -1, v1, vcc_lo    // final result in v0
+    // The following formulation gets us:
+    //     s_brev_b32     s13, s10
+    //     s_ctz_i32_b32  s13, s13
+    //     s_sub_u32      s13, 31, s13          // final result in s13
+    // Which isn't identical when v==0, so only use this when v!=0.
+    // If input is non-uniform (VALU), we still save an instruction.
+    return 31 - firstbitlow(reversebits(v));
+#else
+    unsigned long index = 0;
+    uint32_t found = _BitScanReverse(&index, v);
+    ZSTDGPU_ASSERT(0 != found);
+    return found ? (uint32_t)index : 32u; // found should be true, but do this to match GPU behavior
+#endif
+}
+
 static inline uint32_t zstdgpu_FindFirstBitHiU64(uint64_t x)
 {
 #if defined(__hlsl_dx_compiler)
@@ -804,7 +830,6 @@ typedef struct zstdgpu_Backward_BitBuffer_V0
     uint32_t lastDword; // VGPR as it store any memory block size varying per lane
     uint32_t baseDword;
     bool hadlastrefill;
-    bool hadlastrefillHuffman;
 } zstdgpu_Backward_BitBuffer_V0;
 
 typedef struct zstdgpu_Backward_BitBuffer
@@ -849,7 +874,7 @@ static inline void zstdgpu_Backward_BitBuffer_V0_InitWithSegment(ZSTDGPU_PARAM_I
     uint32_t bitbuf = buffer[lastDword] & bitmsk;
 
     // Secondly, we search for the highest set bit to see how many bits are valid
-    bitcnt = zstdgpu_FindFirstBitHiU32(bitbuf);
+    bitcnt = zstdgpu_FindFirstBitHiU32_Nonzero(bitbuf);
 #ifdef ZSTDGPU_USE_REVERSED_BIT_BUFFER_BITBUF
     bitbuf <<= 32u - bitcnt;
     bitbuf = reversebits(bitbuf);
@@ -879,7 +904,6 @@ static inline void zstdgpu_Backward_BitBuffer_V0_InitWithSegment(ZSTDGPU_PARAM_I
     outBuffer.lastDword = lastDword;
     outBuffer.baseDword = baseDword;
     outBuffer.hadlastrefill = baseDword == lastDword;
-    outBuffer.hadlastrefillHuffman = false;
     //outBuffer.bytesz = bytesz;
 }
 
@@ -960,19 +984,6 @@ static inline void zstdgpu_Backward_BitBuffer_V0_Pop(ZSTDGPU_PARAM_INOUT(zstdgpu
 
 ZSTDGPU_BITBUF_DEFINE_STANDARD_METHODS(Backward_BitBuffer_V0)
 
-static inline bool zstdgpu_Backward_BitBuffer_V0_CanRefill_Huffman(ZSTDGPU_PARAM_IN(zstdgpu_Backward_BitBuffer_V0) inBuffer, uint32_t bitcnt)
-{
-    ZSTDGPU_ASSERT(bitcnt <= 32);
-    if (inBuffer.bitcnt >= bitcnt)
-    {
-        return true;
-    }
-    else
-    {
-        return !inBuffer.hadlastrefillHuffman;
-    }
-}
-
 static inline void zstdgpu_Backward_BitBuffer_V0_Refill_Huffman(ZSTDGPU_PARAM_INOUT(zstdgpu_Backward_BitBuffer_V0) inoutBuffer, uint32_t bitcnt, uint32_t extrabits)
 {
     if (inoutBuffer.hadlastrefill == false)
@@ -983,7 +994,6 @@ static inline void zstdgpu_Backward_BitBuffer_V0_Refill_Huffman(ZSTDGPU_PARAM_IN
     if (inoutBuffer.bitcnt < bitcnt)
     {
         inoutBuffer.bitcnt += extrabits; // simply increment counter because upper bits are zeros
-        inoutBuffer.hadlastrefillHuffman = true;
     }
 }
 
@@ -998,7 +1008,6 @@ static inline uint32_t zstdgpu_Backward_BitBuffer_V0_Get_Huffman(ZSTDGPU_PARAM_I
     {
         inoutBuffer.bitcnt += extrabits; // simply increment counter because upper bits are zeros
         inoutBuffer.bitbuf <<= extrabits;
-        inoutBuffer.hadlastrefillHuffman = true;
     }
 
     uint32_t result = zstdgpu_Backward_BitBuffer_V0_Top(inoutBuffer, bitcnt);
@@ -1052,7 +1061,7 @@ static inline void zstdgpu_HuffmanStream_InitWithSegment(ZSTDGPU_PARAM_INOUT(zst
     // Count number of leading zero bits above the flag (result in [0:7]).
     // There is no 64-bit version of v_clz_i32_u32 and uint32_t(data64 >> 32) is free since U64 is a pair of VGPRs.
    // Add one (reverse-subtract by 32, not 31) to also shift out the flag itself.
-    const uint32_t nonDataBitCount = 32 - zstdgpu_FindFirstBitHiU32(uint32_t(data64 >> 32));
+    const uint32_t nonDataBitCount = 32 - zstdgpu_FindFirstBitHiU32_Nonzero(uint32_t(data64 >> 32));
     data64 <<= nonDataBitCount;
 
     const uint32_t keptBitCount = 64 - (oobBitCount + nonDataBitCount); // could be 0
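
For reference, the ZSTGPU_DECODE_SEQ macro above unpacks one 32-bit FSE table element per state: the symbol sits in bits [0,8), the number of state-transition bits in [8,16), and the next-state base in [16,32); the next state is that base plus the freshly read bits. Below is a minimal host-side C++ sketch of just that unpack/update step; the struct and function names are made up for illustration and are not part of zstdgpu.

#include <cassert>
#include <cstdint>

// Packed FSE element layout implied by ZSTGPU_DECODE_SEQ (illustrative names).
struct FseStep
{
    uint32_t symbol;        // bits [0,8)
    uint32_t restBitcnt;    // bits [8,16): how many stream bits feed the state transition
    uint32_t nextStateBase; // bits [16,32)
};

static FseStep UnpackFseElem(uint32_t packedFseElem)
{
    FseStep step;
    step.symbol        = packedFseElem & 0xff;
    step.restBitcnt    = (packedFseElem >> 8) & 0xff;
    step.nextStateBase = (packedFseElem >> 16) & 0xffff;
    return step;
}

// Mirrors `stateLLen = nstateLLen + restLLen;` in the patch: new state = base + read bits.
static uint32_t NextState(const FseStep &step, uint32_t restBits)
{
    return step.nextStateBase + restBits;
}

int main()
{
    const uint32_t packed = (5u << 16) | (3u << 8) | 42u; // base 5, 3 transition bits, symbol 42
    const FseStep step = UnpackFseElem(packed);
    assert(step.symbol == 42 && step.restBitcnt == 3 && step.nextStateBase == 5);
    assert(NextState(step, 0b101) == 10); // 5 + 0b101
    return 0;
}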
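The two init paths that now call zstdgpu_FindFirstBitHiU32_Nonzero both locate the sentinel 1-bit that terminates a zstd backward bitstream: the bits below the sentinel are payload and the 0 to 7 bits above it are padding, which is why the input is never zero for a well-formed stream. A small host-side sketch of that byte-level convention follows; it is an assumption-laden toy model, not the shader's zstdgpu_Backward_BitBuffer_V0 or zstdgpu_HuffmanStream code.

#include <cassert>
#include <cstdint>

// Toy model: given the final byte of a backward bitstream, the highest set bit
// is the flag, and only the bits below it count as data.
static uint32_t ValidBitsInLastByte(uint8_t lastByte)
{
    assert(lastByte != 0); // a well-formed stream always carries the sentinel
    uint32_t hi = 0;
    while ((lastByte >> hi) > 1u) // index of the highest set bit
    {
        ++hi;
    }
    return hi; // bits [0, hi) are payload; bit hi is the flag itself
}

int main()
{
    assert(ValidBitsInLastByte(0x01) == 0); // only the flag: 0 payload bits (maximum padding)
    assert(ValidBitsInLastByte(0x80) == 7); // flag in the top bit: 7 payload bits (no padding)
    assert(ValidBitsInLastByte(0x2D) == 5); // flag at bit 5: payload bits are 0b01101
    return 0;
}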
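The identity behind the new zstdgpu_FindFirstBitHiU32_Nonzero helper can also be checked on the CPU. The sketch below is standalone C++20 (std::countl_zero/std::countr_zero stand in for the clz/ctz hardware ops, and ReverseBits32 is a plain-loop stand-in for the HLSL reversebits intrinsic); none of these names come from zstdgpu.

#include <bit>
#include <cassert>
#include <cstdint>

// Plain loop stand-in for the HLSL reversebits() intrinsic.
static uint32_t ReverseBits32(uint32_t v)
{
    uint32_t r = 0;
    for (int i = 0; i < 32; ++i)
    {
        r = (r << 1) | ((v >> i) & 1u);
    }
    return r;
}

int main()
{
    const uint32_t samples[] = { 1u, 2u, 0x80000000u, 0x00010000u, 0xdeadbeefu };
    for (uint32_t v : samples)
    {
        // firstbithigh(v) for v != 0 is 31 minus the number of leading zeros.
        const uint32_t viaClz = 31u - (uint32_t)std::countl_zero(v);
        // The patch's formulation: reversing the bits turns leading zeros into
        // trailing zeros, so firstbitlow(reversebits(v)) == countl_zero(v).
        const uint32_t viaReverse = 31u - (uint32_t)std::countr_zero(ReverseBits32(v));
        assert(viaClz == viaReverse); // equal for every nonzero input; v == 0 is excluded by contract
    }
    return 0;
}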