opencl: fix for small models (#11950)
* opencl: fix small shape gemv, remove unused extensions * opencl: fix `transpose_16`, `dump_tensor`, enforce subgroup size * opencl: fix for token length < 4 * opencl: use wave size of 64 for all Adreno GPUs --------- Co-authored-by: Shawn Gu <quic_shawngu@quicinc.com> Co-authored-by: Skyler Szot <quic_sszot@quicinc.com>
This commit is contained in:
parent
7a2c913e66
commit
34a846b584
6 changed files with 67 additions and 59 deletions
|
@ -1,9 +1,11 @@
|
|||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable
|
||||
#pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable
|
||||
|
||||
#ifdef cl_qcom_reqd_sub_group_size
|
||||
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
|
||||
#define ADRENO_GPU 1
|
||||
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
|
||||
#endif
|
||||
|
||||
// assume
|
||||
#define QK4_0 32
|
||||
|
@ -186,8 +188,9 @@
|
|||
total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
|
||||
total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
|
||||
|
||||
|
||||
__attribute__((qcom_reqd_sub_group_size("full")))
|
||||
#ifdef ADRENO_GPU
|
||||
REQD_SUBGROUP_SIZE_64
|
||||
#endif
|
||||
__kernel void kernel_gemv_noshuffle(
|
||||
__read_only image1d_buffer_t src0_q, // quantized A
|
||||
global half2 * src0_d, // A scales
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue