model : add dots.llm1 architecture support (#14044) (#14118)

Adds:

* Dots1Model to convert_hf_to_gguf.py

* Computation graph code to llama-model.cpp

* Chat template to llama-chat.cpp to detect this model's template.

---

The model is called "dots.llm1" (I decided to shorten it to dots1 or
DOTS1 in the code generally) architecture.

The only models that exist as of writing of this commit that follow this
architecture are "dots.llm1.inst" and "dots.llm1.base" from here:

* https://huggingface.co/rednote-hilab/dots.llm1.inst

* https://huggingface.co/rednote-hilab/dots.llm1.base

The model architecture is a combination of Qwen and Deepseek parts, as
seen here:

ffe12627b4/src/transformers/models/dots1/modular_dots1.py
This commit is contained in:
Mikko Juola 2025-06-15 00:52:06 -07:00 committed by GitHub
parent c311ac664d
commit 9ae4143bc6
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 326 additions and 1 deletions

View file

@ -343,6 +343,7 @@ class MODEL_ARCH(IntEnum):
WAVTOKENIZER_DEC = auto()
PLM = auto()
BAILINGMOE = auto()
DOTS1 = auto()
class VISION_PROJECTOR_TYPE(IntEnum):
@ -623,6 +624,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
MODEL_ARCH.PLM: "plm",
MODEL_ARCH.BAILINGMOE: "bailingmoe",
MODEL_ARCH.DOTS1: "dots1"
}
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@ -2044,6 +2046,30 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
],
MODEL_ARCH.DOTS1: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_EXP_PROBS_B,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_UP_SHEXP,
],
# TODO
}

View file

@ -305,7 +305,7 @@ class TensorNameMap:
),
MODEL_TENSOR.FFN_EXP_PROBS_B: (
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3
"model.layers.{bid}.mlp.gate.e_score_correction", # deepseek-v3 dots1
),
# Feed-forward up