arm64: optimize q4_k_q8_k kernel with i8mm (#13886)

This PR improves the q4_k_q8_k GEMM kernel using the arm64 i8mm instructions.

Tested on Neoverse-N2 with a Llama 3 8B model quantized to Q4_K_M.
- 34% ~ 50% S_PP uplift for all batch sizes
- 12% ~ 37% S_TG uplift for batch size 4 and above

Perplexity doesn't change with this PR.

```
// tested on neoverse-n2
$ llama-batched-bench \
      -m Meta-Llama-3-8B-Instruct-Q4_K_M.gguf \
      --no-mmap -fa \
      -c 8192 -b 4096 -ub 512 -npp 128 -ntg 128 \
      -npl 1,2,4,8,16,32 \
      -t 64

---------------------------------------------------------------------
|    PP |     TG |    B |       S_PP t/s      |       S_TG t/s      |
|       |        |      | original |  this pr | original |  this pr |
|-------|--------|------|----------|----------|----------|----------|
|   128 |    128 |    1 |   110.12 |   147.83 |    24.36 |    24.28 |
|   128 |    128 |    2 |   121.16 |   172.42 |    46.36 |    47.93 |
|   128 |    128 |    4 |   120.15 |   169.75 |    74.68 |    84.00 |
|   128 |    128 |    8 |   130.97 |   196.81 |    91.04 |   114.74 |
|   128 |    128 |   16 |   131.01 |   196.88 |   101.43 |   135.79 |
|   128 |    128 |   32 |   130.85 |   196.51 |   106.97 |   147.29 |
---------------------------------------------------------------------
```
This commit is contained in:
Yibo Cai 2025-05-29 19:39:20 +08:00 committed by GitHub
parent 21fcc21ad5
commit 54a2c7a8cd
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 148 additions and 0 deletions

View file

@@ -270,7 +270,11 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.from_float = quantize_row_q4_K,
.vec_dot = ggml_vec_dot_q4_K_q8_K,
.vec_dot_type = GGML_TYPE_Q8_K,
#if defined (__ARM_FEATURE_MATMUL_INT8)
.nrows = 2,
#else
.nrows = 1,
#endif
},
[GGML_TYPE_Q5_K] = {
.from_float = quantize_row_q5_K,