#version 450

#extension GL_EXT_control_flow_attributes : enable
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require

#extension GL_KHR_memory_scope_semantics : enable
#extension GL_KHR_cooperative_matrix : enable
#extension GL_NV_cooperative_matrix2 : enable
#extension GL_EXT_buffer_reference : enable
#extension GL_KHR_shader_subgroup_ballot : enable
#extension GL_KHR_shader_subgroup_vote : enable

#include "types.comp"

layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

layout (constant_id = 1) const uint BM = 64;
layout (constant_id = 2) const uint BN = 64;
layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working with a quant

layout (constant_id = 4) const bool enable_smaller_matrices = false;
const uint BNover2 = enable_smaller_matrices ? (BN / 2) : BN;
const uint BNover4 = enable_smaller_matrices ? (BN / 4) : BN;

layout (push_constant) uniform parameter
{
    uint M;
    uint N;
    uint K;
    uint stride_a;
    uint stride_b;
    uint stride_d;

    uint batch_stride_a;
    uint batch_stride_b;
    uint batch_stride_d;

#ifdef MUL_MAT_ID
    uint nei0;
    uint nei1;
    uint nbi1;
    uint ne11;
#else
    uint k_split;
    uint ne02;
    uint ne12;
    uint broadcast2;
    uint broadcast3;
#endif
    // N dimension for the B matrix can be >= p.N
    uint padded_N;
} p;

layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};

#if QUANT_K > 1
#define DECODEFUNCA , dequantFuncA

#include "dequant_funcs_cm2.comp"

#else
#define DECODEFUNCA
#endif

#ifdef MUL_MAT_ID
layout (binding = 3) readonly buffer IDS {int data_ids[];};

shared u16vec4 row_ids[3072];

layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB {
    B_TYPE b[];
};

uint _ne1;
shared uint _ne1_sh;

B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    const uint row_i = blockCoords[0];

    if (row_i >= _ne1) {
        return B_TYPE(0.0);
    }

    const u16vec4 row_idx = row_ids[row_i];
    B_TYPE ret = data_b[row_idx.y * p.batch_stride_b + row_idx.x * p.stride_b + blockCoords[1]];

    return ret;
}

D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t ir, const in uint32_t ic)
{
    uint dr = ir * BM + r;
    uint dc = ic * BN + c;

    if (dr < p.M && dc < _ne1) {
        uint row_i = dc;
        const u16vec4 row_idx = row_ids[row_i];
        data_d[row_idx.y * p.batch_stride_d + row_idx.z * p.stride_d + dr] = elem;
    }
    return elem;
}
#endif

void main() {
#ifdef NEEDS_INIT_IQ_SHMEM
    init_iq_shmem(gl_WorkGroupSize);
#endif

#ifdef MUL_MAT_ID
    const uint expert_idx = gl_GlobalInvocationID.z;
#else
    const uint batch_idx = gl_GlobalInvocationID.z;

    const uint i13 = batch_idx / p.ne12;
    const uint i12 = batch_idx % p.ne12;

    const uint i03 = i13 / p.broadcast3;
    const uint i02 = i12 / p.broadcast2;

    const uint batch_idx_a = i03 * p.ne02 + i02;
#endif

    const uint blocks_m = (p.M + BM - 1) / BM;
    const uint ir = gl_WorkGroupID.x % blocks_m;
    const uint ik = gl_WorkGroupID.x / blocks_m;
    const uint ic = gl_WorkGroupID.y;

#ifdef MUL_MAT_ID
    // Spread the search across all elements in the first subgroup
    if (gl_SubgroupID == 0) {
        _ne1 = 0;
        uint num_elements = p.nei1 * p.nei0;

        for (uint i = gl_SubgroupInvocationID; subgroupAny(i < num_elements); i += gl_SubgroupSize) {
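            // Each invocation tests one (ii0, ii1) pair; the subgroup ballot
            // compacts the rows matching expert_idx into row_ids[] contiguously,
            // with the exclusive bit count giving each match its slot.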
            bool in_range = i < num_elements;
            uint ii0 = i % p.nei0;
            uint ii1 = i / p.nei0;
            uint id = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0;
            uvec4 ballot = subgroupBallot(in_range && id == expert_idx);
            uint idx = subgroupBallotExclusiveBitCount(ballot);
            if (in_range && id == expert_idx) {
                row_ids[_ne1 + idx] = u16vec4(ii0 % p.ne11, ii1, ii0, 0);
            }
            _ne1 += subgroupBallotBitCount(ballot);
        }
        _ne1_sh = _ne1;
    }

    barrier();

    _ne1 = _ne1_sh;

    // Workgroup has no work
    if (ic * BN >= _ne1) return;
#endif

#ifdef MUL_MAT_ID
    uint start_k = 0;
    const uint end_k = p.K;
#else
    uint start_k = ik * p.k_split;
    const uint end_k = min(p.K, (ik + 1) * p.k_split);
#endif

#ifdef MUL_MAT_ID
    uint pos_a = (expert_idx * p.batch_stride_a) / QUANT_K;
    uint pos_b = 0;
#else
    uint pos_a = (batch_idx_a * p.batch_stride_a) / QUANT_K;
    uint pos_b = batch_idx * p.batch_stride_b;
    uint pos_d = batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z;
#endif

    uint stride_a = p.stride_a / QUANT_K;
    uint stride_b = p.stride_b;

    // Hint to the compiler that values are aligned (want 16B alignment).
    // Quants are always block-aligned, no alignment needed.
#if ALIGNED
#if QUANT_K == 1
    stride_a &= ~7;
#endif
    stride_b &= ~7;
#endif

    // Create layouts for both clamped and unclamped accesses
    tensorLayoutNV<2> tensorLayoutA = createTensorLayoutNV(2);
    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutAClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
    tensorLayoutNV<2> tensorLayoutB = createTensorLayoutNV(2);
    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutBClamp = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
    tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);

    tensorLayoutD = setTensorLayoutStrideNV(tensorLayoutD, p.stride_d, 1);

#if QUANT_K > 1
    tensorLayoutA = setTensorLayoutBlockSizeNV(tensorLayoutA, 1, QUANT_K);
    tensorLayoutAClamp = setTensorLayoutBlockSizeNV(tensorLayoutAClamp, 1, QUANT_K);
#endif

    // Use end_k rather than p.K as the dimension because that's what
    // we need to bound check against when using split_k.
    // Bounds check B against padded_N, but bounds check D against N.
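    // (Rows of B in [p.N, p.padded_N) are padding and safe to load; stores to
    // D must still be bounded by the true p.N.)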
    tensorLayoutA = setTensorLayoutDimensionNV(tensorLayoutA, p.M, end_k);
    tensorLayoutB = setTensorLayoutDimensionNV(tensorLayoutB, p.padded_N, end_k);
    tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.N, p.M);
    tensorLayoutAClamp = setTensorLayoutDimensionNV(tensorLayoutAClamp, p.M, end_k);
    tensorLayoutBClamp = setTensorLayoutDimensionNV(tensorLayoutBClamp, p.padded_N, end_k);

    tensorViewNV<2, false, 1, 0> tensorViewTranspose = createTensorViewNV(2, false, 1, 0);

#if !defined(MUL_MAT_ID)
    // Detect a fast path where all loads are entirely in bounds and no clamping is required
    if ((ir + 1) * BM <= p.M && (ic + 1) * BN <= p.padded_N &&
        (start_k % BK) == 0 && (end_k % BK) == 0 &&
#if QUANT_K == 1
        (stride_a % 8) == 0 &&
#endif
        (stride_b % 8) == 0 && (start_k % 8) == 0) {
        // Hint to the compiler that values are aligned (want 16B alignment)
        start_k &= ~7;
        stride_b &= ~7;
#if QUANT_K == 1
        stride_a &= ~7;
#endif

        tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, stride_a, 1);
        tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1);

        uint k_iters = (end_k - start_k + BK - 1) / BK;

        if (enable_smaller_matrices && ic * BN + BNover4 >= p.N) {
            coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BNover4, gl_MatrixUseAccumulator> sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BNover4, gl_MatrixUseAccumulator>(0.0);
            for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BNover4, gl_MatrixUseB> mat_b;

                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover4, block_k, BK), tensorViewTranspose);

                sum = coopMatMulAdd(mat_a, mat_b, sum);
            }
            coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BNover4, gl_MatrixUseAccumulator> mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BNover4, gl_MatrixUseAccumulator>(sum);

            coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BNover4, ir * BM, BM), tensorViewTranspose);
            return;
        } else if (enable_smaller_matrices && ic * BN + BNover2 >= p.N) {
            coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BNover2, gl_MatrixUseAccumulator> sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BNover2, gl_MatrixUseAccumulator>(0.0);
            for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BNover2, gl_MatrixUseB> mat_b;

                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BNover2, block_k, BK), tensorViewTranspose);

                sum = coopMatMulAdd(mat_a, mat_b, sum);
            }
            coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BNover2, gl_MatrixUseAccumulator> mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BNover2, gl_MatrixUseAccumulator>(sum);

            coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BNover2, ir * BM, BM), tensorViewTranspose);
            return;
        } else {
            coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(0.0);
            for (uint block_k = start_k, i = 0; i < k_iters; block_k += BK, ++i) {
                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
                coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;

                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, block_k, BK) DECODEFUNCA);
                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose);

                sum = coopMatMulAdd(mat_a, mat_b, sum);
            }
            coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(sum);

            coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BN, ir * BM, BM), tensorViewTranspose);
            return;
        }
    } else
#endif // !defined(MUL_MAT_ID)
    {
        tensorLayoutA = setTensorLayoutStrideNV(tensorLayoutA, stride_a, 1);
        tensorLayoutAClamp = setTensorLayoutStrideNV(tensorLayoutAClamp, stride_a, 1);
        tensorLayoutB = setTensorLayoutStrideNV(tensorLayoutB, stride_b, 1);
        tensorLayoutBClamp = setTensorLayoutStrideNV(tensorLayoutBClamp, stride_b, 1);

        coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> sum;
        sum = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(0.0);

        [[dont_unroll]]
        for (uint block_k = start_k; block_k < end_k; block_k += BK) {

            coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BM, BK, gl_MatrixUseA> mat_a;
            coopmat<FLOAT_TYPE, gl_ScopeWorkgroup, BK, BN, gl_MatrixUseB> mat_b;
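            // The (block_k & ~7) masks on the unclamped paths below are no-ops
            // (the unclamped conditions include (block_k % 8) == 0); they just
            // restate the 8-element alignment of the offset to the compiler.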
            // Clamping is expensive, so detect different code paths for each combination
            // of A and B needing clamping.
            bool unclampedA = (ir + 1) * BM <= p.M && block_k + BK <= end_k && (block_k % 8) == 0;
#ifdef MUL_MAT_ID
            bool unclampedB = true;
#else
            bool unclampedB = (ic + 1) * BN <= p.padded_N && block_k + BK <= end_k && (block_k % 8) == 0;
#endif
            if (unclampedA && unclampedB) {
                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA);
#ifdef MUL_MAT_ID
                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
#else
                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose);
#endif
                sum = coopMatMulAdd(mat_a, mat_b, sum);
            } else if (unclampedA && !unclampedB) {
                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutA, ir * BM, BM, (block_k & ~7), BK) DECODEFUNCA);
                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);

                sum = coopMatMulAdd(mat_a, mat_b, sum);
            } else if (!unclampedA && unclampedB) {
                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
#ifdef MUL_MAT_ID
                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, block_k, BK), tensorViewTranspose, decodeFuncB);
#else
                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutB, ic * BN, BN, (block_k & ~7), BK), tensorViewTranspose);
#endif
                sum = coopMatMulAdd(mat_a, mat_b, sum);
            } else if (!unclampedA && !unclampedB) {
                coopMatLoadTensorNV(mat_a, data_a, pos_a, sliceTensorLayoutNV(tensorLayoutAClamp, ir * BM, BM, block_k, BK) DECODEFUNCA);
                coopMatLoadTensorNV(mat_b, data_b, pos_b, sliceTensorLayoutNV(tensorLayoutBClamp, ic * BN, BN, block_k, BK), tensorViewTranspose);

                sum = coopMatMulAdd(mat_a, mat_b, sum);
            }
        }

        // Convert from ACC_TYPE to D_TYPE
        coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator> mat_d;
        mat_d = coopmat<D_TYPE, gl_ScopeWorkgroup, BM, BN, gl_MatrixUseAccumulator>(sum);

#ifdef MUL_MAT_ID
        // Call callback to store each element, remapping row through shared memory
        coopMatPerElementNV(mat_d, mat_d, perElemOpD, ir, ic);
#else
        coopMatStoreTensorNV(mat_d, data_d, pos_d, sliceTensorLayoutNV(tensorLayoutD, ic * BN, BN, ir * BM, BM), tensorViewTranspose);
#endif
    }
}
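// Note: in the non-MUL_MAT_ID path, each split-k slice (ik) accumulates only
// K values in [ik * k_split, (ik + 1) * k_split) and writes its partial sums
// at offset ik * p.batch_stride_d * gl_NumWorkGroups.z; a separate reduction
// pass (not part of this shader) is expected to sum those partials into D.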