model : Nomic Embed Text V2 with Mixture-of-Experts (MoE) architecture (#12466)
* Nomic Embed Text V2 with Mixture-of-Experts (MoE) architecture - Adds MoE-based embedding model supporting multilingual embeddings. - Selects architecture variant based on hyperparameter detection (MoE layers). - Removes unnecessary subclass initialization checks for clarity. https://www.nomic.ai/blog/posts/nomic-embed-text-v2 Co-authored-by: Jared Van Bortel <jared@nomic.ai> * fix tokenizer * don't rename this tensor --------- Co-authored-by: Jared Van Bortel <jared@nomic.ai>
This commit is contained in:
parent
eaea325324
commit
5f5e39e1ba
9 changed files with 247 additions and 110 deletions
|
@ -78,7 +78,7 @@ class ModelBase:
|
|||
# subclasses should define this!
|
||||
model_arch: gguf.MODEL_ARCH
|
||||
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False,
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
|
||||
use_temp_file: bool = False, eager: bool = False,
|
||||
metadata_override: Path | None = None, model_name: str | None = None,
|
||||
split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
|
||||
|
@ -454,13 +454,6 @@ class ModelBase:
|
|||
|
||||
|
||||
class TextModel(ModelBase):
|
||||
@classmethod
|
||||
def __init_subclass__(cls):
|
||||
# can't use an abstract property, because overriding it without type errors
|
||||
# would require using decorated functions instead of simply defining the property
|
||||
if "model_arch" not in cls.__dict__:
|
||||
raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}")
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
|
@ -3373,14 +3366,7 @@ class BertModel(TextModel):
|
|||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
|
||||
@ModelBase.register("RobertaModel")
|
||||
class RobertaModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def _xlmroberta_tokenizer_init(self) -> None:
|
||||
# we need the pad_token_id to know how to chop down position_embd matrix
|
||||
if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
|
||||
self._position_offset = 1 + pad_token_id
|
||||
|
@ -3389,82 +3375,7 @@ class RobertaModel(BertModel):
|
|||
else:
|
||||
self._position_offset = None
|
||||
|
||||
def set_vocab(self):
|
||||
"""Support BPE tokenizers for roberta models"""
|
||||
bpe_tok_path = self.dir_model / "tokenizer.json"
|
||||
if bpe_tok_path.exists():
|
||||
self._set_vocab_gpt2()
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
self.gguf_writer.add_add_eos_token(True)
|
||||
|
||||
# we need this to validate the size of the token_type embeddings
|
||||
# though currently we are passing all zeros to the token_type embeddings
|
||||
# "Sequence A" or "Sequence B"
|
||||
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
||||
|
||||
else:
|
||||
return super().set_vocab()
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# if name starts with "roberta.", remove the prefix
|
||||
# e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
|
||||
if name.startswith("roberta."):
|
||||
name = name[8:]
|
||||
|
||||
# position embeddings start at pad_token_id + 1, so just chop down the weight tensor
|
||||
if name == "embeddings.position_embeddings.weight":
|
||||
if self._position_offset is not None:
|
||||
data_torch = data_torch[self._position_offset:,:]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("NomicBertModel")
|
||||
class NomicBertModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.NOMIC_BERT
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# the HF config claims n_ctx=8192, but it uses RoPE scaling
|
||||
self.hparams["n_ctx"] = 2048
|
||||
|
||||
# SwigLU activation
|
||||
assert self.hparams["activation_function"] == "swiglu"
|
||||
# this doesn't do anything in the HF version
|
||||
assert self.hparams["causal"] is False
|
||||
# no bias tensors
|
||||
assert self.hparams["qkv_proj_bias"] is False
|
||||
assert self.hparams["mlp_fc1_bias"] is False
|
||||
assert self.hparams["mlp_fc2_bias"] is False
|
||||
# norm at end of layer
|
||||
assert self.hparams["prenorm"] is False
|
||||
# standard RoPE
|
||||
assert self.hparams["rotary_emb_fraction"] == 1.0
|
||||
assert self.hparams["rotary_emb_interleaved"] is False
|
||||
assert self.hparams["rotary_emb_scale_base"] is None
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
||||
|
||||
|
||||
@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
|
||||
class XLMRobertaModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# we need the pad_token_id to know how to chop down position_embd matrix
|
||||
if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
|
||||
self._position_offset = 1 + pad_token_id
|
||||
if "max_position_embeddings" in self.hparams:
|
||||
self.hparams["max_position_embeddings"] -= self._position_offset
|
||||
else:
|
||||
self._position_offset = None
|
||||
|
||||
def set_vocab(self):
|
||||
def _xlmroberta_set_vocab(self) -> None:
|
||||
# to avoid TypeError: Descriptors cannot be created directly
|
||||
# exception when importing sentencepiece_model_pb2
|
||||
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
||||
|
@ -3546,6 +3457,138 @@ class XLMRobertaModel(BertModel):
|
|||
self.gguf_writer.add_add_bos_token(True)
|
||||
self.gguf_writer.add_add_eos_token(True)
|
||||
|
||||
|
||||
@ModelBase.register("RobertaModel")
|
||||
class RobertaModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# we need the pad_token_id to know how to chop down position_embd matrix
|
||||
if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
|
||||
self._position_offset = 1 + pad_token_id
|
||||
if "max_position_embeddings" in self.hparams:
|
||||
self.hparams["max_position_embeddings"] -= self._position_offset
|
||||
else:
|
||||
self._position_offset = None
|
||||
|
||||
def set_vocab(self):
|
||||
"""Support BPE tokenizers for roberta models"""
|
||||
bpe_tok_path = self.dir_model / "tokenizer.json"
|
||||
if bpe_tok_path.exists():
|
||||
self._set_vocab_gpt2()
|
||||
self.gguf_writer.add_add_bos_token(True)
|
||||
self.gguf_writer.add_add_eos_token(True)
|
||||
|
||||
# we need this to validate the size of the token_type embeddings
|
||||
# though currently we are passing all zeros to the token_type embeddings
|
||||
# "Sequence A" or "Sequence B"
|
||||
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
|
||||
|
||||
else:
|
||||
return super().set_vocab()
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# if name starts with "roberta.", remove the prefix
|
||||
# e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
|
||||
if name.startswith("roberta."):
|
||||
name = name[8:]
|
||||
|
||||
# position embeddings start at pad_token_id + 1, so just chop down the weight tensor
|
||||
if name == "embeddings.position_embeddings.weight":
|
||||
if self._position_offset is not None:
|
||||
data_torch = data_torch[self._position_offset:,:]
|
||||
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@ModelBase.register("NomicBertModel")
|
||||
class NomicBertModel(BertModel):
|
||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
|
||||
hparams = kwargs.pop("hparams", None)
|
||||
if hparams is None:
|
||||
hparams = ModelBase.load_hparams(dir_model)
|
||||
|
||||
self.is_moe = bool(hparams.get("moe_every_n_layers"))
|
||||
self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT
|
||||
|
||||
super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs)
|
||||
|
||||
self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta()
|
||||
if self._tokenizer_is_xlmroberta:
|
||||
self._xlmroberta_tokenizer_init()
|
||||
|
||||
# the HF config claims n_ctx=8192, but it uses RoPE scaling
|
||||
self.hparams["n_ctx"] = 2048
|
||||
|
||||
assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu"
|
||||
|
||||
# this doesn't do anything in the HF version
|
||||
assert self.hparams["causal"] is False
|
||||
# no bias tensors unless MoE
|
||||
assert self.hparams["qkv_proj_bias"] == self.is_moe
|
||||
assert self.hparams["mlp_fc1_bias"] == self.is_moe
|
||||
assert self.hparams["mlp_fc2_bias"] == self.is_moe
|
||||
|
||||
# norm at end of layer
|
||||
assert self.hparams["prenorm"] is False
|
||||
# standard RoPE
|
||||
assert self.hparams["rotary_emb_fraction"] == 1.0
|
||||
assert self.hparams["rotary_emb_interleaved"] is False
|
||||
assert self.hparams["rotary_emb_scale_base"] is None
|
||||
|
||||
def set_vocab(self) -> None:
|
||||
if self._tokenizer_is_xlmroberta:
|
||||
return self._xlmroberta_set_vocab()
|
||||
return super().set_vocab()
|
||||
|
||||
def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]:
|
||||
# If the tensor is an experts bias tensor, skip it by returning an empty list.
|
||||
if "mlp.experts.bias" in name:
|
||||
return [] # Explicitly return an empty list.
|
||||
|
||||
if "mlp.experts.mlp.w1" in name:
|
||||
data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
|
||||
name += ".weight"
|
||||
|
||||
if "mlp.experts.mlp.w2" in name:
|
||||
data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
|
||||
data_torch = data_torch.transpose(1, 2)
|
||||
name += ".weight"
|
||||
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
|
||||
if self.is_moe:
|
||||
self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
|
||||
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
|
||||
self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
|
||||
|
||||
def _is_tokenizer_xlmroberta(self) -> bool:
|
||||
with open(self.dir_model / "tokenizer.json") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
toktyp = tokenizer_json["model"]["type"]
|
||||
if toktyp == "Unigram":
|
||||
return True
|
||||
if toktyp == "WordPiece":
|
||||
return False
|
||||
raise ValueError(f"unknown tokenizer: {toktyp}")
|
||||
|
||||
|
||||
@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification")
|
||||
class XLMRobertaModel(BertModel):
|
||||
model_arch = gguf.MODEL_ARCH.BERT
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._xlmroberta_tokenizer_init()
|
||||
|
||||
def set_vocab(self):
|
||||
self._xlmroberta_set_vocab()
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
# if name starts with "roberta.", remove the prefix
|
||||
# e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue