CUDA: app option to compile without FlashAttention (#12025)

2025-02-22 20:44:34 +01:00 · 2025-02-22 20:44:34 +01:00 · a28e0d5eb1
commit a28e0d5eb1
parent 36c258ee92
13 changed files with 46 additions and 31 deletions
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@ -151,6 +151,7 @@ set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                            "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
 option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"                OFF)
+option(GGML_CUDA_FA                         "ggml: compile ggml FlashAttention CUDA kernels"  ON)
 option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
 option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})