From eb3949938e82a128855bc0676220bb2ce6e4228d Mon Sep 17 00:00:00 2001 From: Shawn yang <137684499+Yangxiaoz@users.noreply.github.com> Date: Sat, 31 May 2025 14:48:04 +0800 Subject: [PATCH] CUDA: add a prop in ggml_cuda_device_infor for distinguish iGPU or dGPU in cuda (#13856) (#13895) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 1. add "integrated" in ggml_cuda_device_info for distinguish whether it is Intergrate_gpu or discrete_gpu 2. Adjust the func:"ggml_backend_cuda_device_supports_buft" for this new feature * Update ggml/src/ggml-cuda/ggml-cuda.cu Adjusted code indentation Co-authored-by: Johannes Gäßler * Update ggml/src/ggml-cuda/ggml-cuda.cu Fixed incorrect setting of variable types Co-authored-by: Johannes Gäßler * Update ggml/src/ggml-cuda/ggml-cuda.cu Adjusted the judgment logic Co-authored-by: Johannes Gäßler * add a host_buft assert in case of integrated_cuda_device with func:'evaluate_and_capture_cuda_graph()' * Update ggml/src/ggml-cuda/ggml-cuda.cu Add a defensive security assert Co-authored-by: Johannes Gäßler * Update ggml/src/ggml-cuda/ggml-cuda.cu Adjusted the support judgment logic. Co-authored-by: Johannes Gäßler * revoke the suggest commit changes due to it's not applicable in jetson_device * Update ggml/src/ggml-cuda/ggml-cuda.cu Add parentheses to enforce operator precedence​ Co-authored-by: Diego Devesa * Update ggml/src/ggml-cuda/ggml-cuda.cu Fix ci bug: add a spaces Co-authored-by: Johannes Gäßler --------- Co-authored-by: yangxiao Co-authored-by: Johannes Gäßler Co-authored-by: yangxiao Co-authored-by: Diego Devesa --- ggml/src/ggml-cuda/common.cuh | 1 + ggml/src/ggml-cuda/ggml-cuda.cu | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index df450b18..e1ce1d4c 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -635,6 +635,7 @@ struct ggml_cuda_device_info { int nsm; // number of streaming multiprocessors size_t smpb; // max. shared memory per block size_t smpbo; // max. shared memory per block (with opt-in) + bool integrated; // Device is integrated as opposed to discrete bool vmm; // virtual memory support size_t vmm_granularity; // granularity of virtual memory size_t total_vram; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 009ed904..2a6f7f10 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -243,10 +243,10 @@ static ggml_cuda_device_info ggml_cuda_init() { info.default_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; - - info.devices[id].nsm = prop.multiProcessorCount; - info.devices[id].smpb = prop.sharedMemPerBlock; - info.devices[id].warp_size = prop.warpSize; + info.devices[id].integrated = prop.integrated; + info.devices[id].nsm = prop.multiProcessorCount; + info.devices[id].smpb = prop.sharedMemPerBlock; + info.devices[id].warp_size = prop.warpSize; #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) info.devices[id].smpbo = prop.sharedMemPerBlock; @@ -1065,6 +1065,10 @@ static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_ GGML_UNUSED(buft); } +static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name; +} + static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { CUDA_CHECK(cudaFreeHost(buffer->context)); } @@ -2641,6 +2645,8 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) { static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) { + // flag used to determine whether it is an integrated_gpu + const bool integrated = ggml_cuda_info().devices[cuda_ctx->device].integrated; while (!graph_evaluated_or_captured) { // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph. @@ -2659,7 +2665,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx if (node->src[j] != nullptr) { assert(node->src[j]->buffer); assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || - ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft)); + ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft) || (integrated && ggml_backend_buft_is_cuda_host(node->src[j]->buffer->buft))); } } #endif @@ -3266,7 +3272,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev; + ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context; + const bool integrated = ggml_cuda_info().devices[dev_ctx->device].integrated; + return (((ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev) || (integrated && ggml_backend_buft_is_cuda_host(buft))); } static int64_t get_op_batch_size(const ggml_tensor * op) {