cuda : improve cuda pool efficiency using virtual memory (#4606)

* cuda : improve cuda pool efficiency using virtual memory
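
The core idea, as a minimal sketch assuming a single device and no frees (names such as pool_vmm_alloc, POOL_VMM_MAX_SIZE and check are illustrative, not the actual ggml-cuda.cu symbols): reserve a large virtual address range once, then commit and map physical memory only as the pool grows, so growing the pool never invalidates pointers that were already handed out.

    // Growable device pool on top of the CUDA virtual memory management APIs.
    // Illustrative only: global single-device state, no freeing, no alignment.
    #include <cuda.h>
    #include <cstdio>
    #include <cstdlib>

    static void check(CUresult res) {
        if (res != CUDA_SUCCESS) { fprintf(stderr, "CUDA driver error %d\n", (int) res); abort(); }
    }

    static const size_t POOL_VMM_MAX_SIZE = 32ull << 30; // reserve 32 GiB of address space

    static CUdeviceptr pool_addr = 0; // start of the reserved virtual address range
    static size_t      pool_size = 0; // physical memory committed so far
    static size_t      pool_used = 0; // bytes currently handed out

    static void * pool_vmm_alloc(int device, size_t size) {
        if (pool_addr == 0) {
            // reserve the address range once; no physical memory is committed yet
            check(cuMemAddressReserve(&pool_addr, POOL_VMM_MAX_SIZE, 0, 0, 0));
        }
        if (pool_used + size > pool_size) {
            // grow the pool: commit more physical memory and map it at the end,
            // so existing allocations keep their addresses (no free + re-copy)
            CUmemAllocationProp prop = {};
            prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
            prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
            prop.location.id   = device;

            size_t granularity = 0;
            check(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
            size_t grow = pool_used + size - pool_size;
            grow = (grow + granularity - 1) / granularity * granularity;

            CUmemGenericAllocationHandle handle;
            check(cuMemCreate(&handle, grow, &prop, 0));
            check(cuMemMap(pool_addr + pool_size, grow, 0, handle, 0));
            check(cuMemRelease(handle)); // the mapping keeps the memory alive

            // make the newly mapped range accessible from the device
            CUmemAccessDesc access = {};
            access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
            access.location.id   = device;
            access.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
            check(cuMemSetAccess(pool_addr + pool_size, grow, &access, 1));

            pool_size += grow;
        }
        void * ptr = (void *) (pool_addr + pool_used);
        pool_used += size;
        return ptr;
    }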

* fix mixtral

* fix cmake build

* check for vmm support, disable for hip

ggml-ci
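
A sketch of the support check, assuming the device index is device_id and cuda.h is included: query the driver attribute per device and keep the legacy cudaMalloc-based pool when it is absent; on HIP the VMM pool is simply disabled.

    #if !defined(GGML_USE_HIPBLAS)
        // does this device support the virtual memory management APIs?
        int device_vmm = 0;
        CUdevice dev;
        cuDeviceGet(&dev, device_id);                 // error checking omitted
        cuDeviceGetAttribute(&device_vmm,
            CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, dev);
    #else
        const int device_vmm = 0;                     // VMM pool disabled on HIP
    #endif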

* fix hip build

* clarify granularity

* move all caps to g_device_caps
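
The per-device properties (compute capability, VMM support, VMM granularity, ...) end up in one global array instead of scattered globals. Roughly along these lines, with the exact field set being an assumption:

    struct cuda_device_capabilities {
        int    cc;               // compute capability
        bool   vmm;              // device supports virtual memory management
        size_t vmm_granularity;  // granularity of VMM physical allocations
    };

    static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = {};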

* refactor error checking
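
One way the error checks can be centralized, sketched with illustrative names: a single reporting helper plus thin macros for the runtime and driver APIs, so every call site carries the failing statement, file and line.

    #include <cstdio>
    #include <cstdlib>
    #include <cuda.h>
    #include <cuda_runtime.h>

    [[noreturn]] static void cuda_error(const char * stmt, const char * file, int line, const char * msg) {
        fprintf(stderr, "CUDA error: %s\n  %s:%d: %s\n", msg, file, line, stmt);
        exit(1);
    }

    // runtime API
    #define CUDA_CHECK(err)                                                       \
        do {                                                                      \
            cudaError_t err_ = (err);                                             \
            if (err_ != cudaSuccess) {                                            \
                cuda_error(#err, __FILE__, __LINE__, cudaGetErrorString(err_));   \
            }                                                                     \
        } while (0)

    // driver API (used by the VMM pool)
    #define CU_CHECK(err)                                                         \
        do {                                                                      \
            CUresult err_ = (err);                                                \
            if (err_ != CUDA_SUCCESS) {                                           \
                const char * msg_ = "unknown error";                              \
                cuGetErrorString(err_, &msg_);                                    \
                cuda_error(#err, __FILE__, __LINE__, msg_);                       \
            }                                                                     \
        } while (0)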

* add cuda_pool_alloc, refactor most pool allocations

ggml-ci
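
A sketch of what an RAII pool allocation helper can look like, assuming pool entry points named ggml_cuda_pool_malloc/ggml_cuda_pool_free that report the actual (possibly rounded-up) size they reserved; the wrapper guarantees the buffer goes back to the pool on every exit path:

    template<typename T>
    struct cuda_pool_alloc {
        T *    ptr         = nullptr;
        size_t actual_size = 0; // what the pool actually reserved, may be larger than requested

        // size is in number of elements
        T * alloc(size_t size) {
            ptr = (T *) ggml_cuda_pool_malloc(size * sizeof(T), &actual_size);
            return ptr;
        }

        explicit cuda_pool_alloc(size_t size) { alloc(size); }
        cuda_pool_alloc() = default;

        ~cuda_pool_alloc() {
            if (ptr != nullptr) {
                ggml_cuda_pool_free(ptr, actual_size);
            }
        }

        T * get() { return ptr; }

        // the buffer is owned by exactly one wrapper
        cuda_pool_alloc(const cuda_pool_alloc &) = delete;
        cuda_pool_alloc & operator=(const cuda_pool_alloc &) = delete;
    };

A temporary buffer is then declared as e.g. cuda_pool_alloc<half> tmp(n_elements); and is returned to the pool automatically when it goes out of scope.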

* fix hip build

* CUBLAS_TF32_TENSOR_OP_MATH is not a macro
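
The point here: CUBLAS_TF32_TENSOR_OP_MATH is an enumerator of cublasMath_t, not a preprocessor define, so an #ifdef on it never fires; guarding on the CUDA version is what actually works. Hedged illustration (handle is assumed to be a cublasHandle_t):

    // wrong: an enum value is invisible to the preprocessor, this branch is never compiled
    #ifdef CUBLAS_TF32_TENSOR_OP_MATH
        cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH);
    #endif

    // works: the TF32 math mode exists since CUDA 11.0
    #if CUDART_VERSION >= 11000
        cublasSetMathMode(handle, CUBLAS_TF32_TENSOR_OP_MATH);
    #endif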

* more hip crap

* llama : fix msvc warnings

* ggml : fix msvc warnings

* minor

* minor

* cuda : fallback to CPU on host buffer alloc fail
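
A sketch of the fallback, assuming a helper (name illustrative) that tries pinned host memory first: if cudaMallocHost fails, warn and return ordinary malloc'ed memory instead of aborting, since a non-pinned host buffer still works, only with slower transfers.

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    static void * host_buffer_alloc(size_t size) {
        void * ptr = nullptr;
        cudaError_t err = cudaMallocHost(&ptr, size);
        if (err != cudaSuccess) {
            // clear the sticky error so later CUDA calls are not affected
            cudaGetLastError();
            fprintf(stderr, "warning: failed to allocate %.2f MiB of pinned memory: %s\n",
                    size / 1024.0 / 1024.0, cudaGetErrorString(err));
            return malloc(size); // unpinned fallback
        }
        return ptr;
    }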

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update ggml-cuda.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* ensure allocations are always aligned
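
Ensuring aligned allocations comes down to rounding every requested size up to a fixed power-of-two alignment before it is carved out of the pool; the 128-byte figure below is an assumption.

    // round the requested size up so every pointer handed out stays aligned
    static const size_t CUDA_POOL_ALIGNMENT = 128; // assumed value, must be a power of two

    static size_t pool_aligned_size(size_t size) {
        return (size + CUDA_POOL_ALIGNMENT - 1) & ~(CUDA_POOL_ALIGNMENT - 1);
    }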

* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Author: slaren
Date:   2023-12-24 14:34:22 +01:00 (committed by GitHub)
Commit: 5bf3953d7e
Parent: 708e179e85
8 changed files with 328 additions and 208 deletions

llama.cpp:

@@ -1281,7 +1281,7 @@ struct llama_hparams {
     if (this->rope_finetuned != other.rope_finetuned) return true;
     if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;
-    const float EPSILON = 1e-9;
+    const float EPSILON = 1e-9f;
     if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
     if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
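
The warning fixed in this hunk comes from the literal: 1e-9 is a double constant, so initializing a float from it involves a narrowing conversion that MSVC flags; the f suffix makes the constant a float to begin with. For example:

    const float a = 1e-9;   // double constant narrowed to float -> MSVC truncation warning
    const float b = 1e-9f;  // float constant, no conversion, no warning
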
@@ -10300,7 +10300,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
     std::string result = model->vocab.id_to_token[token].text;
     llama_unescape_whitespace(result);
     if (length < (int) result.length()) {
-        return -result.length();
+        return -(int) result.length();
     }
     memcpy(buf, result.c_str(), result.length());
     return result.length();
@@ -10330,7 +10330,7 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
     std::string result = model->vocab.id_to_token[token].text;
     result = llama_decode_text(result);
     if (length < (int) result.length()) {
-        return -result.length();
+        return -(int) result.length();
     }
     memcpy(buf, result.c_str(), result.length());
     return result.length();
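
Both llama_token_to_piece hunks fix the same issue: std::string::length() returns size_t, so negating it wraps around in unsigned arithmetic, and the implicit conversion to the int return type draws an MSVC conversion warning; casting to int before negating keeps the intended negative value. A hypothetical reduced example:

    #include <string>

    // returns the piece length, negative if the caller's buffer is too small
    int copy_or_report(const std::string & result, int length) {
        if (length < (int) result.length()) {
            return -(int) result.length(); // cast first: negating a size_t would wrap
        }
        return (int) result.length();
    }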