arm64: optimize q4_k_q8_k kernel with i8mm (#13886)

This PR improves the q4_k_q8_k GEMM kernel using the arm64 i8mm instructions.

Tested on neoverse-n2 with the llama3 8b q4_k_m quantized model.
- 34% ~ 50% S_PP uplift for all batch sizes
- 12% ~ 37% S_TG uplift for batch size 4 and above

Perplexity doesn't change with this PR.

```
// tested on neoverse-n2
$ llama-batched-bench \
  -m Meta-Llama-3-8B-Instruct-Q4_K_M.gguf \
  --no-mmap -fa \
  -c 8192 -b 4096 -ub 512 -npp 128 -ntg 128 \
  -npl 1,2,4,8,16,32 \
  -t 64

---------------------------------------------------------------------
|    PP |     TG |    B |       S_PP t/s      |       S_TG t/s      |
|       |        |      | original |  this pr | original |  this pr |
|-------|--------|------|----------|----------|----------|----------|
|   128 |    128 |    1 |   110.12 |   147.83 |    24.36 |    24.28 |
|   128 |    128 |    2 |   121.16 |   172.42 |    46.36 |    47.93 |
|   128 |    128 |    4 |   120.15 |   169.75 |    74.68 |    84.00 |
|   128 |    128 |    8 |   130.97 |   196.81 |    91.04 |   114.74 |
|   128 |    128 |   16 |   131.01 |   196.88 |   101.43 |   135.79 |
|   128 |    128 |   32 |   130.85 |   196.51 |   106.97 |   147.29 |
---------------------------------------------------------------------
```
2025-05-29 19:39:20 +08:00 · 2025-05-29 19:39:20 +08:00 · 54a2c7a8cd
commit 54a2c7a8cd
parent 21fcc21ad5
2 changed files with 148 additions and 0 deletions
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -270,7 +270,11 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
        .from_float               = quantize_row_q4_K,
        .vec_dot                  = ggml_vec_dot_q4_K_q8_K,
        .vec_dot_type             = GGML_TYPE_Q8_K,
+#if defined (__ARM_FEATURE_MATMUL_INT8)
+        .nrows                    = 2,
+#else
        .nrows                    = 1,
+#endif
    },
    [GGML_TYPE_Q5_K] = {
        .from_float               = quantize_row_q5_K,