When training ovis1_6-gemma2-9b on a multimodal dataset, an error occurred: RuntimeError: self and mat2 must have the same dtype, but got BFloat16 and Char
#2514 · Open · c-x-l-w opened this issue on Nov 26, 2024 · 1 comment
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 20%|██ | 1/5 [00:04<00:18, 4.72s/it]
Loading checkpoint shards: 40%|████ | 2/5 [00:10<00:16, 5.53s/it]
Loading checkpoint shards: 60%|██████ | 3/5 [00:16<00:11, 5.60s/it]
Loading checkpoint shards: 80%|████████ | 4/5 [00:23<00:06, 6.30s/it]
Loading checkpoint shards: 100%|██████████| 5/5 [00:24<00:00, 4.25s/it]
Loading checkpoint shards: 100%|██████████| 5/5 [00:24<00:00, 4.90s/it]
[INFO:swift] model.max_model_len: 8192
[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
[INFO:swift] model_config: OvisConfig {
"_attn_implementation_autoset": true,
"_name_or_path": "/home/tom/fssd/WWW2025/Ovis1.6-Gemma2-9B",
"architectures": [
"Ovis"
],
"auto_map": {
"AutoConfig": "configuration_ovis.OvisConfig",
"AutoModelForCausalLM": "modeling_ovis.Ovis"
},
"conversation_formatter_class": "GemmaConversationFormatter",
"disable_tie_weight": false,
"hidden_size": 3584,
"keys_to_ignore_at_inference": [
"past_key_values"
],
"llm_attn_implementation": "eager",
"llm_config": {
"_attn_implementation_autoset": false,
"_name_or_path": "google/gemma-2-9b-it",
"add_cross_attention": false,
"architectures": [
"Gemma2ForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"attn_logit_softcapping": 50.0,
"bad_words_ids": null,
"begin_suppress_tokens": null,
"bos_token_id": 2,
"cache_implementation": "hybrid",
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"diversity_penalty": 0.0,
"do_sample": false,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": 1,
"exponential_decay_length_penalty": null,
"final_logit_softcapping": 30.0,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"head_dim": 256,
"hidden_act": "gelu_pytorch_tanh",
"hidden_activation": "gelu_pytorch_tanh",
"hidden_size": 3584,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"initializer_range": 0.02,
"intermediate_size": 14336,
"is_decoder": false,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"length_penalty": 1.0,
"max_length": 20,
"max_position_embeddings": 8192,
"min_length": 0,
"model_type": "gemma2",
"no_repeat_ngram_size": 0,
"num_attention_heads": 16,
"num_beam_groups": 1,
"num_beams": 1,
"num_hidden_layers": 42,
"num_key_value_heads": 8,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": 0,
"prefix": null,
"problem_type": null,
"pruned_heads": {},
"query_pre_attn_scalar": 256,
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"rms_norm_eps": 1e-06,
"rope_theta": 10000.0,
"sep_token_id": null,
"sliding_window": 4096,
"sliding_window_size": 4096,
"suppress_tokens": null,
"task_specific_params": null,
"temperature": 1.0,
"tf_legacy_loss": false,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": "bfloat16",
"torchscript": false,
"typical_p": 1.0,
"use_bfloat16": false,
"use_cache": true,
"vocab_size": 256000
},
"model_type": "ovis",
"multimodal_max_length": 8192,
"quantization_config": {
"_load_in_4bit": false,
"_load_in_8bit": true,
"bnb_4bit_compute_dtype": "bfloat16",
"bnb_4bit_quant_storage": "uint8",
"bnb_4bit_quant_type": "nf4",
"bnb_4bit_use_double_quant": true,
"llm_int8_enable_fp32_cpu_offload": false,
"llm_int8_has_fp16_weight": false,
"llm_int8_skip_modules": null,
"llm_int8_threshold": 6.0,
"load_in_4bit": false,
"load_in_8bit": true,
"quant_method": "bitsandbytes"
},
"torch_dtype": "bfloat16",
"transformers_version": "4.46.1",
"use_cache": true,
"visual_tokenizer_config": {
"_attn_implementation_autoset": false,
"_name_or_path": "",
"add_cross_attention": false,
"architectures": null,
"backbone_config": {
"_attn_implementation_autoset": false,
"_name_or_path": "google/siglip-so400m-patch14-384",
"add_cross_attention": false,
"architectures": null,
"attention_dropout": 0.0,
"bad_words_ids": null,
"begin_suppress_tokens": null,
"bos_token_id": null,
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"diversity_penalty": 0.0,
"do_sample": false,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": null,
"exponential_decay_length_penalty": null,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"hidden_act": "gelu_pytorch_tanh",
"hidden_size": 1152,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"image_size": 384,
"intermediate_size": 4304,
"is_decoder": false,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"layer_norm_eps": 1e-06,
"length_penalty": 1.0,
"max_length": 20,
"min_length": 0,
"model_type": "siglip_vision_model",
"no_repeat_ngram_size": 0,
"num_attention_heads": 16,
"num_beam_groups": 1,
"num_beams": 1,
"num_channels": 3,
"num_hidden_layers": 27,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": null,
"patch_size": 14,
"prefix": null,
"problem_type": null,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"sep_token_id": null,
"suppress_tokens": null,
"task_specific_params": null,
"temperature": 1.0,
"tf_legacy_loss": false,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"typical_p": 1.0,
"use_bfloat16": false
},
"backbone_kwargs": {},
"bad_words_ids": null,
"begin_suppress_tokens": null,
"bos_token_id": null,
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"depths": null,
"diversity_penalty": 0.0,
"do_sample": false,
"drop_cls_token": false,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": null,
"exponential_decay_length_penalty": null,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"hidden_stride": 2,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"is_decoder": false,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"length_penalty": 1.0,
"max_length": 20,
"min_length": 0,
"model_type": "siglip_visual_tokenizer",
"no_repeat_ngram_size": 0,
"num_beam_groups": 1,
"num_beams": 1,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": null,
"prefix": null,
"problem_type": null,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"sep_token_id": null,
"suppress_tokens": null,
"task_specific_params": null,
"tau": 1.0,
"temperature": 1.0,
"tf_legacy_loss": false,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenize_function": "softmax",
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"typical_p": 1.0,
"use_bfloat16": false,
"vocab_size": 65536
}
}
[INFO:swift] model.generation_config: GenerationConfig {
"bos_token_id": 2,
"eos_token_id": 1,
"max_new_tokens": 2048,
"pad_token_id": 0
}
[INFO:swift] Setting model.config.use_cache: False
[INFO:swift] target_modules: ^(llm)(?!.*(lm_head|output|emb|wte|shared)).*
[INFO:swift] modules_to_save: []
[INFO:swift] lora_config: get_wrapped_class.<locals>.PeftWrapper(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='/home/tom/fssd/WWW2025/Ovis1.6-Gemma2-9B', revision=None, task_type='CAUSAL_LM', inference_mode=False, r=8, target_modules='^(llm)(?!.*(lm_head|output|emb|wte|shared)).*', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=[], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_dtype=None, lorap_lr_ratio=None, lorap_emb_lr=1e-06)
[INFO:swift] [base_model.model.llm.model.embed_tokens.weight]: requires_grad=False, dtype=torch.bfloat16, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.q_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.q_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.q_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.k_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.k_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.k_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.v_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.v_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.v_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.o_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.o_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.o_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.gate_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.gate_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.gate_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.up_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.up_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.up_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.down_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] ...
[INFO:swift] PeftModelForCausalLM(
(base_model): LoraModel(
(model): Ovis(
(llm): Gemma2ForCausalLM(
(model): Gemma2Model(
(embed_tokens): Embedding(256000, 3584, padding_idx=0)
(layers): ModuleList(
(0-41): 42 x Gemma2DecoderLayer(
(self_attn): Gemma2Attention(
(q_proj): lora.Linear8bitLt(
(base_layer): Linear8bitLt(in_features=3584, out_features=4096, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=4096, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(k_proj): lora.Linear8bitLt(
(base_layer): Linear8bitLt(in_features=3584, out_features=2048, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=2048, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(v_proj): lora.Linear8bitLt(
(base_layer): Linear8bitLt(in_features=3584, out_features=2048, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=2048, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(o_proj): lora.Linear8bitLt(
(base_layer): Linear8bitLt(in_features=4096, out_features=3584, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=3584, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(rotary_emb): Gemma2RotaryEmbedding()
)
(mlp): Gemma2MLP(
(gate_proj): lora.Linear8bitLt(
(base_layer): Linear8bitLt(in_features=3584, out_features=14336, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=14336, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(up_proj): lora.Linear8bitLt(
(base_layer): Linear8bitLt(in_features=3584, out_features=14336, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=14336, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(down_proj): lora.Linear8bitLt(
(base_layer): Linear8bitLt(in_features=14336, out_features=3584, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=14336, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=3584, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(act_fn): PytorchGELUTanh()
)
(input_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
(pre_feedforward_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
(post_feedforward_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
(post_attention_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
)
)
(norm): Gemma2RMSNorm((3584,), eps=1e-06)
)
(lm_head): Linear(in_features=3584, out_features=256000, bias=False)
)
(visual_tokenizer): SiglipVisualTokenizer(
(backbone): SiglipVisionModel(
(vision_model): SiglipVisionTransformer(
(embeddings): SiglipVisionEmbeddings(
(patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
(position_embedding): Embedding(729, 1152)
)
(encoder): SiglipEncoder(
(layers): ModuleList(
(0-26): 27 x SiglipEncoderLayer(
(self_attn): SiglipSdpaAttention(
(k_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True)
(v_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True)
(q_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True)
(out_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True)
)
(layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
(mlp): SiglipMLP(
(activation_fn): PytorchGELUTanh()
(fc1): Linear8bitLt(in_features=1152, out_features=4304, bias=True)
(fc2): Linear8bitLt(in_features=4304, out_features=1152, bias=True)
)
(layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
)
)
)
(post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
(head): SiglipMultiheadAttentionPoolingHead(
(attention): MultiheadAttention(
(out_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True)
)
(layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
(mlp): SiglipMLP(
(activation_fn): PytorchGELUTanh()
(fc1): Linear8bitLt(in_features=1152, out_features=4304, bias=True)
(fc2): Linear8bitLt(in_features=4304, out_features=1152, bias=True)
)
)
)
)
(head): Sequential(
(0): Linear8bitLt(in_features=4608, out_features=65531, bias=False)
(1): LayerNorm((65531,), eps=1e-05, elementwise_affine=True)
)
)
(vte): VisualEmbedding(65536, 3584)
)
)
)
[INFO:swift] PeftModelForCausalLM: 10233.9195M Params (27.0090M Trainable [0.2639%]), 0.0061M Buffers.
[INFO:swift] system: None
[INFO:swift] args.lazy_tokenize: True
Generating train split: 0 examples [00:00, ? examples/s]
Generating train split: 1000 examples [00:00, 53817.98 examples/s]
[INFO:swift] train_dataset: Dataset({
features: ['query', 'response', 'images'],
num_rows: 990
})
[INFO:swift] val_dataset: Dataset({
features: ['query', 'response', 'images'],
num_rows: 10
})
[INFO:swift] Setting max_partition: 9. You can adjust this hyperparameter through the environment variable: `MAX_PARTITION`.
.[INFO:swift] [LABELS_IDS] [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 23515, 30582, 107]
[INFO:swift] [LABELS] [-100 * 214]活动页面<end_of_turn>
[INFO:swift] training_args: Seq2SeqTrainingArguments(
_n_gpu=1,
acc_strategy=token,
accelerator_config={'split_batches': False, 'dispatch_batches': False, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.95,
adam_epsilon=1e-08,
additional_saved_files=[],
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=1,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=1000,
eval_strategy=IntervalStrategy.STEPS,
eval_use_gather_object=False,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
generation_config=GenerationConfig {
"bos_token_id": 2,
"eos_token_id": 1,
"max_new_tokens": 2048,
"pad_token_id": 0
}
,
generation_max_length=None,
generation_num_beams=None,
gradient_accumulation_steps=1,
gradient_checkpointing=True,
gradient_checkpointing_kwargs=None,
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_for_metrics=[],
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0001,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=/home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001/runs,
logging_first_step=True,
logging_nan_inf_filter=True,
logging_steps=1000,
logging_strategy=IntervalStrategy.STEPS,
loss_name=None,
lr_scheduler_kwargs={},
lr_scheduler_type=SchedulerType.COSINE,
max_grad_norm=1,
max_steps=-1,
metric_for_best_model=loss,
metric_warmup_step=0,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=2,
optim=OptimizerNames.ADAMW_TORCH,
optim_args=None,
optim_target_modules=None,
output_dir=/home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=1,
per_device_train_batch_size=1,
predict_with_generate=False,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=False,
report_to=['tensorboard'],
restore_callback_states_from_checkpoint=False,
resume_from_checkpoint=None,
run_name=/home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=1000,
save_strategy=IntervalStrategy.STEPS,
save_total_limit=2,
seed=42,
skip_memory_metrics=True,
sortish_sampler=False,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torch_empty_cache_steps=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
train_dataset_sample=-1,
train_sampler_random=True,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_liger_kernel=False,
use_mps_device=False,
warmup_ratio=0.05,
warmup_steps=0,
weight_decay=0.1,
)
[ERROR:swift] There are error run git command.
/home/tom/fssd/WWW2025/swift/swift/trainers/mixin.py:93: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
  super().__init__(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2024-11-26 21:50:40,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[INFO:swift] The SftArguments will be saved in: /home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001/sft_args.json
[INFO:swift] The Seq2SeqTrainingArguments will be saved in: /home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001/training_args.json
[INFO:swift] The logging file will be saved in: /home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001/logging.jsonl
Train: 0%| | 0/1980 [00:00<?, ?it/s]/opt/conda/envs/www2025/lib/python3.12/site-packages/bitsandbytes/autograd/_functions.py:316: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
Traceback (most recent call last):
File "/home/tom/fssd/WWW2025/swift/swift/cli/sft.py", line 5, in
sft_main()
File "/home/tom/fssd/WWW2025/swift/swift/utils/run_utils.py", line 32, in x_main
result = llm_x(args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/fssd/WWW2025/swift/swift/llm/sft.py", line 546, in llm_sft
return trainer_train(args, model, template, train_dataset, val_dataset, callbacks=callbacks, msg=msg)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/fssd/WWW2025/swift/swift/llm/sft.py", line 496, in trainer_train
trainer.train(training_args.resume_from_checkpoint)
File "/home/tom/fssd/WWW2025/swift/swift/trainers/mixin.py", line 493, in train
res = super().train(resume_from_checkpoint, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/trainer.py", line 2122, in train
return inner_training_loop(
^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/trainer.py", line 2474, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/trainer.py", line 3572, in training_step
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/fssd/WWW2025/swift/swift/trainers/trainers.py", line 161, in compute_loss
outputs = model(**inputs)
^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl
return inner()
^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1769, in inner
args_kwargs_result = hook(self, args, kwargs) # type: ignore[misc]
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/fssd/WWW2025/swift/swift/llm/utils/template.py", line 350, in _pre_forward_hook
res_extra.append(self._post_encode(module, d))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/fssd/WWW2025/swift/swift/llm/utils/template.py", line 1355, in _post_encode
_, inputs_embeds, labels, _ = self.model.merge_multimodal(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/.cache/huggingface/modules/transformers_modules/Ovis1.6-Gemma2-9B/modeling_ovis.py", line 376, in merge_multimodal
visual_tokens = self.visual_tokenizer(torch.cat([x for x in pixel_values], dim=0))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/.cache/huggingface/modules/transformers_modules/Ovis1.6-Gemma2-9B/modeling_ovis.py", line 223, in forward
features = self.encode(pixel_values)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/.cache/huggingface/modules/transformers_modules/Ovis1.6-Gemma2-9B/modeling_ovis.py", line 198, in encode
output = self.backbone(pixel_values, output_hidden_states=True, return_dict=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/models/siglip/modeling_siglip.py", line 1190, in forward
return self.vision_model(
^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/models/siglip/modeling_siglip.py", line 1101, in forward
pooler_output = self.head(last_hidden_state) if self.use_head else None
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/models/siglip/modeling_siglip.py", line 1128, in forward
hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/activation.py", line 1368, in forward
attn_output, attn_output_weights = F.multi_head_attention_forward(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/functional.py", line 6251, in multi_head_attention_forward
attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: self and mat2 must have the same dtype, but got BFloat16 and Char
Train: 0%| | 0/1980 [00:03<?, ?it/s]
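A likely cause, judging from the module dump and the quantization_config above (`load_in_8bit: true`, `llm_int8_skip_modules: null`): bitsandbytes replaced every `nn.Linear` in the model with `Linear8bitLt`, including the `out_proj` of the `MultiheadAttention` pooling head inside the SigLIP visual tokenizer. `nn.MultiheadAttention` calls `F.linear` on that weight directly instead of going through the bitsandbytes 8-bit matmul, so the bfloat16 activations meet a raw int8 (`Char`) weight and the matmul fails. Below is a minimal sketch of a possible workaround, assuming the model is loaded manually through `transformers` rather than the swift CLI; the module names `visual_tokenizer` and `vte` are taken from the model printout, and keeping them out of 8-bit quantization is the assumption being tested.

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Quantize only the LLM to 8-bit; keep the SigLIP visual tokenizer and the
# visual embedding table in bf16 so the MultiheadAttention pooling head
# never sees an int8 weight. Module names come from the printed structure.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_skip_modules=["visual_tokenizer", "vte"],  # assumed skip list
)

model = AutoModelForCausalLM.from_pretrained(
    "/home/tom/fssd/WWW2025/Ovis1.6-Gemma2-9B",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)
```

If this works, the model printout should show plain bf16 `Linear` layers under `visual_tokenizer` while the Gemma2 decoder layers remain `Linear8bitLt`.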