cuBLAS: use host pinned memory and dequantize while copying (#1207)

* cuBLAS: dequantize simultaneously while copying memory

* cuBLAS: use host pinned memory

* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory

* cuBLAS: also pin kv cache

* fix rebase
This commit is contained in:
slaren 2023-04-29 02:04:18 +02:00 committed by GitHub
parent b1ee8f59b4
commit 7fc50c051a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 110 additions and 54 deletions

View file

@ -405,4 +405,30 @@ struct llama_buffer {
delete[] addr;
}
};
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
uint8_t * addr = NULL;
size_t size = 0;
void resize(size_t size) {
if (addr) {
ggml_cuda_host_free(addr);
}
addr = (uint8_t *) ggml_cuda_host_malloc(size);
this->size = size;
}
~llama_ctx_buffer() {
if (addr) {
ggml_cuda_host_free(addr);
}
}
};
#else
typedef llama_buffer llama_ctx_buffer;
#endif
#endif