Better 1.5 bit quantization (#5971)
* Trying blocvks of 16 for IQ1_S - seems slightly better * iq1s_blocks16: Adjust scale fudge factor to 1.125 * iq1s_blocks16: going to blocks of 32 with 2048 lattice points, so same bpw. This is even better than blocks of 16. Should I try blocks of 64? But to keep the same bpw, when I go to 4096 lattice points, I need to remove blocks alltogether and just have superblocks of 256 weights. * iq1s_blocks16: Use 2*<x^2> as sigma2 in weight adjustment * iq1s_blocks16: scalar and AVX2 dot products * iq1s_blocks16: CUDA dot product * iq1s_blocks16: Metal works, Neon does not Metal works but TG is dog slow (35 t/s). PP is OKish (493 t/s). Not seeing the bug in the Neon implementation for now. * iq1s_blocks16: fixed Neon * iq1s_blocks16: very slightly faster TG on Metal Still pathetic at 37 t/s * iq1s_blocks16: speedup Metal by packing codebook into uint32_t's * Formatting * iq1s_blocks16: uint32_t codebook is also better in CUDA TG-128 is now 204 t/s up from 194 t/s. PP-512 is 5890 t/s, so significantly better than other quants * iq1s_blocks16: slightly faster Neon dot product * iq1s_blocks16: faster AVX2 dot product * iq1s_blocks16: adjust to ggml-common.h --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
This commit is contained in:
parent
ef3ced26a3
commit
be858f6205
5 changed files with 1152 additions and 393 deletions
62
ggml-cuda.cu
62
ggml-cuda.cu
|
@ -565,8 +565,8 @@ static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N
|
|||
#define QI1_S (QK_K / (4*QR1_S))
|
||||
typedef struct {
|
||||
half d;
|
||||
uint8_t qs[QK_K/8];
|
||||
uint8_t scales[QK_K/16];
|
||||
uint8_t qs[QK_K/8];
|
||||
uint16_t qh[QK_K/32];
|
||||
} block_iq1_s;
|
||||
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
||||
|
||||
|
@ -1722,11 +1722,22 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
|
|||
const int il = tid/8; // 0...3
|
||||
const int ib = tid%8; // 0...7
|
||||
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
||||
const int i8 = 4*ib+il;
|
||||
uint8_t h = x[i].scales[i8/2] >> 4*(i8%2);
|
||||
const int8_t * grid = (const int8_t *)(iq1s_grid + (x[i].qs[i8] | ((h & 8) << 5)));
|
||||
const float d = (float)x[i].d * (2*(h & 7) + 1);
|
||||
for (int j = 0; j < 8; ++j) y[j] = d * grid[j];
|
||||
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 0xf) + 1);
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
int grid32[2]; const int8_t * q = (const int8_t *)grid32;
|
||||
grid32[0] = *((const int *)(iq1s_grid_gpu + (x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8))));
|
||||
grid32[1] = __vsub4((grid32[0] >> 4) & 0x0f0f0f0f, 0x01010101);
|
||||
grid32[0] = __vsub4(grid32[0] & 0x0f0f0f0f, 0x01010101);
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
y[j] = d * q[j];
|
||||
}
|
||||
#else
|
||||
const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
y[j+0] = d * ((grid[j] & 0xf) - 1);
|
||||
y[j+4] = d * ((grid[j] >> 4) - 1);
|
||||
}
|
||||
#endif
|
||||
#else
|
||||
assert(false);
|
||||
#endif
|
||||
|
@ -4538,44 +4549,33 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
|
|||
#endif
|
||||
}
|
||||
|
||||
|
||||
static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
|
||||
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
|
||||
#if QK_K == 256
|
||||
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
||||
|
||||
const int ib32 = iqs;
|
||||
int sumi1 = 0, sumi2 = 0, sumi3 = 0, sumi4 = 0;
|
||||
const uint8_t h1 = bq1->scales[2*ib32+0];
|
||||
const uint8_t h2 = bq1->scales[2*ib32+1];
|
||||
int sumi = 0;
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
|
||||
const int * q8 = (const int *)bq8_1[ib32].qs;
|
||||
const int * grid1 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
|
||||
const int * grid2 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
|
||||
const int * grid3 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
|
||||
const int * grid4 = (const int *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
|
||||
for (int j = 0; j < 2; ++j) {
|
||||
sumi1 = __dp4a(q8[j+0], grid1[j], sumi1);
|
||||
sumi2 = __dp4a(q8[j+2], grid2[j], sumi2);
|
||||
sumi3 = __dp4a(q8[j+4], grid3[j], sumi3);
|
||||
sumi4 = __dp4a(q8[j+6], grid4[j], sumi4);
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
|
||||
int grid0 = __vsub4(grid[0] & 0x0f0f0f0f, 0x01010101);
|
||||
int grid1 = __vsub4((grid[0] >> 4) & 0x0f0f0f0f, 0x01010101);
|
||||
sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi));
|
||||
}
|
||||
#else
|
||||
const int8_t * q8 = bq8_1[ib32].qs;
|
||||
const int8_t * grid1 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+0] | ((h1 & 0x08) << 5)));
|
||||
const int8_t * grid2 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+1] | ((h1 & 0x80) << 1)));
|
||||
const int8_t * grid3 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+2] | ((h2 & 0x08) << 5)));
|
||||
const int8_t * grid4 = (const int8_t *)(iq1s_grid + (bq1->qs[4*ib32+3] | ((h2 & 0x80) << 1)));
|
||||
for (int j = 0; j < 8; ++j) {
|
||||
sumi1 += q8[j+ 0] * grid1[j];
|
||||
sumi2 += q8[j+ 8] * grid2[j];
|
||||
sumi3 += q8[j+16] * grid3[j];
|
||||
sumi4 += q8[j+24] * grid4[j];
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
|
||||
for (int j = 0; j < 4; ++j) {
|
||||
sumi += q8[j] * ((grid[j] & 0xf) - 1) + q8[j+4] * ((grid[j] >> 4) - 1);
|
||||
}
|
||||
q8 += 8;
|
||||
}
|
||||
#endif
|
||||
const float d = (float)bq1->d * __low2float(bq8_1[ib32].ds);
|
||||
return d * (sumi1 * (2*(h1 & 7) + 1) + sumi2 * (2*((h1 >> 4) & 7) + 1) +
|
||||
sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
|
||||
return d * sumi * (2*(bq1->qh[ib32] >> 12) + 1);
|
||||
#else
|
||||
assert(false);
|
||||
return 0.f;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue