When training ovis1_6-gemma2-9b on a multimodal dataset, an error occurred: RuntimeError: self and mat2 must have the same dtype, but got BFloat16 and Char
#2514 · Open · c-x-l-w opened this issue on Nov 26, 2024 · 1 comment
Loading checkpoint shards: 0%| | 0/5 [00:00<?, ?it/s]
Loading checkpoint shards: 20%|██ | 1/5 [00:04<00:18, 4.72s/it]
Loading checkpoint shards: 40%|████ | 2/5 [00:10<00:16, 5.53s/it]
Loading checkpoint shards: 60%|██████ | 3/5 [00:16<00:11, 5.60s/it]
Loading checkpoint shards: 80%|████████ | 4/5 [00:23<00:06, 6.30s/it]
Loading checkpoint shards: 100%|██████████| 5/5 [00:24<00:00, 4.25s/it]
Loading checkpoint shards: 100%|██████████| 5/5 [00:24<00:00, 4.90s/it]
[INFO:swift] model.max_model_len: 8192
[INFO:swift] model.hf_device_map: {'': device(type='cuda', index=0)}
[INFO:swift] model_config: OvisConfig {
"_attn_implementation_autoset": true,
"_name_or_path": "/home/tom/fssd/WWW2025/Ovis1.6-Gemma2-9B",
"architectures": [
"Ovis"
],
"auto_map": {
"AutoConfig": "configuration_ovis.OvisConfig",
"AutoModelForCausalLM": "modeling_ovis.Ovis"
},
"conversation_formatter_class": "GemmaConversationFormatter",
"disable_tie_weight": false,
"hidden_size": 3584,
"keys_to_ignore_at_inference": [
"past_key_values"
],
"llm_attn_implementation": "eager",
"llm_config": {
"_attn_implementation_autoset": false,
"_name_or_path": "google/gemma-2-9b-it",
"add_cross_attention": false,
"architectures": [
"Gemma2ForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"attn_logit_softcapping": 50.0,
"bad_words_ids": null,
"begin_suppress_tokens": null,
"bos_token_id": 2,
"cache_implementation": "hybrid",
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"diversity_penalty": 0.0,
"do_sample": false,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": 1,
"exponential_decay_length_penalty": null,
"final_logit_softcapping": 30.0,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"head_dim": 256,
"hidden_act": "gelu_pytorch_tanh",
"hidden_activation": "gelu_pytorch_tanh",
"hidden_size": 3584,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"initializer_range": 0.02,
"intermediate_size": 14336,
"is_decoder": false,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"length_penalty": 1.0,
"max_length": 20,
"max_position_embeddings": 8192,
"min_length": 0,
"model_type": "gemma2",
"no_repeat_ngram_size": 0,
"num_attention_heads": 16,
"num_beam_groups": 1,
"num_beams": 1,
"num_hidden_layers": 42,
"num_key_value_heads": 8,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": 0,
"prefix": null,
"problem_type": null,
"pruned_heads": {},
"query_pre_attn_scalar": 256,
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"rms_norm_eps": 1e-06,
"rope_theta": 10000.0,
"sep_token_id": null,
"sliding_window": 4096,
"sliding_window_size": 4096,
"suppress_tokens": null,
"task_specific_params": null,
"temperature": 1.0,
"tf_legacy_loss": false,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": "bfloat16",
"torchscript": false,
"typical_p": 1.0,
"use_bfloat16": false,
"use_cache": true,
"vocab_size": 256000
},
"model_type": "ovis",
"multimodal_max_length": 8192,
"quantization_config": {
"_load_in_4bit": false,
"_load_in_8bit": true,
"bnb_4bit_compute_dtype": "bfloat16",
"bnb_4bit_quant_storage": "uint8",
"bnb_4bit_quant_type": "nf4",
"bnb_4bit_use_double_quant": true,
"llm_int8_enable_fp32_cpu_offload": false,
"llm_int8_has_fp16_weight": false,
"llm_int8_skip_modules": null,
"llm_int8_threshold": 6.0,
"load_in_4bit": false,
"load_in_8bit": true,
"quant_method": "bitsandbytes"
},
"torch_dtype": "bfloat16",
"transformers_version": "4.46.1",
"use_cache": true,
"visual_tokenizer_config": {
"_attn_implementation_autoset": false,
"_name_or_path": "",
"add_cross_attention": false,
"architectures": null,
"backbone_config": {
"_attn_implementation_autoset": false,
"_name_or_path": "google/siglip-so400m-patch14-384",
"add_cross_attention": false,
"architectures": null,
"attention_dropout": 0.0,
"bad_words_ids": null,
"begin_suppress_tokens": null,
"bos_token_id": null,
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"diversity_penalty": 0.0,
"do_sample": false,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": null,
"exponential_decay_length_penalty": null,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"hidden_act": "gelu_pytorch_tanh",
"hidden_size": 1152,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"image_size": 384,
"intermediate_size": 4304,
"is_decoder": false,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"layer_norm_eps": 1e-06,
"length_penalty": 1.0,
"max_length": 20,
"min_length": 0,
"model_type": "siglip_vision_model",
"no_repeat_ngram_size": 0,
"num_attention_heads": 16,
"num_beam_groups": 1,
"num_beams": 1,
"num_channels": 3,
"num_hidden_layers": 27,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": null,
"patch_size": 14,
"prefix": null,
"problem_type": null,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"sep_token_id": null,
"suppress_tokens": null,
"task_specific_params": null,
"temperature": 1.0,
"tf_legacy_loss": false,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"typical_p": 1.0,
"use_bfloat16": false
},
"backbone_kwargs": {},
"bad_words_ids": null,
"begin_suppress_tokens": null,
"bos_token_id": null,
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"depths": null,
"diversity_penalty": 0.0,
"do_sample": false,
"drop_cls_token": false,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": null,
"exponential_decay_length_penalty": null,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"hidden_stride": 2,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"is_decoder": false,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"length_penalty": 1.0,
"max_length": 20,
"min_length": 0,
"model_type": "siglip_visual_tokenizer",
"no_repeat_ngram_size": 0,
"num_beam_groups": 1,
"num_beams": 1,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": null,
"prefix": null,
"problem_type": null,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"sep_token_id": null,
"suppress_tokens": null,
"task_specific_params": null,
"tau": 1.0,
"temperature": 1.0,
"tf_legacy_loss": false,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenize_function": "softmax",
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"typical_p": 1.0,
"use_bfloat16": false,
"vocab_size": 65536
}
}
[INFO:swift] model.generation_config: GenerationConfig {
"bos_token_id": 2,
"eos_token_id": 1,
"max_new_tokens": 2048,
"pad_token_id": 0
}
[INFO:swift] Setting model.config.use_cache: False
[INFO:swift] target_modules: ^(llm)(?!.*(lm_head|output|emb|wte|shared)).*
[INFO:swift] modules_to_save: []
[INFO:swift] lora_config: get_wrapped_class.<locals>.PeftWrapper(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='/home/tom/fssd/WWW2025/Ovis1.6-Gemma2-9B', revision=None, task_type='CAUSAL_LM', inference_mode=False, r=8, target_modules='^(llm)(?!.*(lm_head|output|emb|wte|shared)).*', lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=[], init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_dtype=None, lorap_lr_ratio=None, lorap_emb_lr=1e-06)
[INFO:swift] [base_model.model.llm.model.embed_tokens.weight]: requires_grad=False, dtype=torch.bfloat16, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.q_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.q_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.q_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.k_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.k_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.k_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.v_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.v_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.v_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.o_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.o_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.self_attn.o_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.gate_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.gate_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.gate_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.up_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.up_proj.lora_A.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.up_proj.lora_B.default.weight]: requires_grad=True, dtype=torch.float32, device=cuda:0
[INFO:swift] [base_model.model.llm.model.layers.0.mlp.down_proj.base_layer.weight]: requires_grad=False, dtype=torch.int8, device=cuda:0
[INFO:swift] ...
[INFO:swift] PeftModelForCausalLM(
(base_model): LoraModel(
(model): Ovis(
(llm): Gemma2ForCausalLM(
(model): Gemma2Model(
(embed_tokens): Embedding(256000, 3584, padding_idx=0)
(layers): ModuleList(
(0-41): 42 x Gemma2DecoderLayer(
(self_attn): Gemma2Attention(
(q_proj): lora.Linear8bitLt(
(base_layer): Linear8bitLt(in_features=3584, out_features=4096, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=4096, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(k_proj): lora.Linear8bitLt(
(base_layer): Linear8bitLt(in_features=3584, out_features=2048, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=2048, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(v_proj): lora.Linear8bitLt(
(base_layer): Linear8bitLt(in_features=3584, out_features=2048, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=2048, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(o_proj): lora.Linear8bitLt(
(base_layer): Linear8bitLt(in_features=4096, out_features=3584, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=4096, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=3584, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(rotary_emb): Gemma2RotaryEmbedding()
)
(mlp): Gemma2MLP(
(gate_proj): lora.Linear8bitLt(
(base_layer): Linear8bitLt(in_features=3584, out_features=14336, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=14336, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(up_proj): lora.Linear8bitLt(
(base_layer): Linear8bitLt(in_features=3584, out_features=14336, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=3584, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=14336, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(down_proj): lora.Linear8bitLt(
(base_layer): Linear8bitLt(in_features=14336, out_features=3584, bias=False)
(lora_dropout): ModuleDict(
(default): Dropout(p=0.05, inplace=False)
)
(lora_A): ModuleDict(
(default): Linear(in_features=14336, out_features=8, bias=False)
)
(lora_B): ModuleDict(
(default): Linear(in_features=8, out_features=3584, bias=False)
)
(lora_embedding_A): ParameterDict()
(lora_embedding_B): ParameterDict()
(lora_magnitude_vector): ModuleDict()
)
(act_fn): PytorchGELUTanh()
)
(input_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
(pre_feedforward_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
(post_feedforward_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
(post_attention_layernorm): Gemma2RMSNorm((3584,), eps=1e-06)
)
)
(norm): Gemma2RMSNorm((3584,), eps=1e-06)
)
(lm_head): Linear(in_features=3584, out_features=256000, bias=False)
)
(visual_tokenizer): SiglipVisualTokenizer(
(backbone): SiglipVisionModel(
(vision_model): SiglipVisionTransformer(
(embeddings): SiglipVisionEmbeddings(
(patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
(position_embedding): Embedding(729, 1152)
)
(encoder): SiglipEncoder(
(layers): ModuleList(
(0-26): 27 x SiglipEncoderLayer(
(self_attn): SiglipSdpaAttention(
(k_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True)
(v_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True)
(q_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True)
(out_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True)
)
(layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
(mlp): SiglipMLP(
(activation_fn): PytorchGELUTanh()
(fc1): Linear8bitLt(in_features=1152, out_features=4304, bias=True)
(fc2): Linear8bitLt(in_features=4304, out_features=1152, bias=True)
)
(layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
)
)
)
(post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
(head): SiglipMultiheadAttentionPoolingHead(
(attention): MultiheadAttention(
(out_proj): Linear8bitLt(in_features=1152, out_features=1152, bias=True)
)
(layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
(mlp): SiglipMLP(
(activation_fn): PytorchGELUTanh()
(fc1): Linear8bitLt(in_features=1152, out_features=4304, bias=True)
(fc2): Linear8bitLt(in_features=4304, out_features=1152, bias=True)
)
)
)
)
(head): Sequential(
(0): Linear8bitLt(in_features=4608, out_features=65531, bias=False)
(1): LayerNorm((65531,), eps=1e-05, elementwise_affine=True)
)
)
(vte): VisualEmbedding(65536, 3584)
)
)
)
[INFO:swift] PeftModelForCausalLM: 10233.9195M Params (27.0090M Trainable [0.2639%]), 0.0061M Buffers.
[INFO:swift] system: None
[INFO:swift] args.lazy_tokenize: True
Generating train split: 0 examples [00:00, ? examples/s]
Generating train split: 1000 examples [00:00, 53817.98 examples/s]
[INFO:swift] train_dataset: Dataset({
features: ['query', 'response', 'images'],
num_rows: 990
})
[INFO:swift] val_dataset: Dataset({
features: ['query', 'response', 'images'],
num_rows: 10
})
[INFO:swift] Setting max_partition: 9. You can adjust this hyperparameter through the environment variable: `MAX_PARTITION`.
.[INFO:swift] [LABELS_IDS] [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 23515, 30582, 107]
[INFO:swift] [LABELS] [-100 * 214]活动页面<end_of_turn>
[INFO:swift] training_args: Seq2SeqTrainingArguments(
_n_gpu=1,
acc_strategy=token,
accelerator_config={'split_batches': False, 'dispatch_batches': False, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.95,
adam_epsilon=1e-08,
additional_saved_files=[],
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=1,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=1000,
eval_strategy=IntervalStrategy.STEPS,
eval_use_gather_object=False,
evaluation_strategy=None,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
generation_config=GenerationConfig {
"bos_token_id": 2,
"eos_token_id": 1,
"max_new_tokens": 2048,
"pad_token_id": 0
}
,
generation_max_length=None,
generation_num_beams=None,
gradient_accumulation_steps=1,
gradient_checkpointing=True,
gradient_checkpointing_kwargs=None,
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_for_metrics=[],
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=0.0001,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=/home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001/runs,
logging_first_step=True,
logging_nan_inf_filter=True,
logging_steps=1000,
logging_strategy=IntervalStrategy.STEPS,
loss_name=None,
lr_scheduler_kwargs={},
lr_scheduler_type=SchedulerType.COSINE,
max_grad_norm=1,
max_steps=-1,
metric_for_best_model=loss,
metric_warmup_step=0,
mp_parameters=,
neftune_noise_alpha=None,
no_cuda=False,
num_train_epochs=2,
optim=OptimizerNames.ADAMW_TORCH,
optim_args=None,
optim_target_modules=None,
output_dir=/home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=1,
per_device_train_batch_size=1,
predict_with_generate=False,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=False,
report_to=['tensorboard'],
restore_callback_states_from_checkpoint=False,
resume_from_checkpoint=None,
run_name=/home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=1000,
save_strategy=IntervalStrategy.STEPS,
save_total_limit=2,
seed=42,
skip_memory_metrics=True,
sortish_sampler=False,
split_batches=None,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torch_empty_cache_steps=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
train_dataset_sample=-1,
train_sampler_random=True,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_liger_kernel=False,
use_mps_device=False,
warmup_ratio=0.05,
warmup_steps=0,
weight_decay=0.1,
)
[ERROR:swift] There are error run git command.
/home/tom/fssd/WWW2025/swift/swift/trainers/mixin.py:93: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Seq2SeqTrainer.__init__`. Use `processing_class` instead.
  super().__init__(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[2024-11-26 21:50:40,589] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[INFO:swift] The SftArguments will be saved in: /home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001/sft_args.json
[INFO:swift] The Seq2SeqTrainingArguments will be saved in: /home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001/training_args.json
[INFO:swift] The logging file will be saved in: /home/tom/fssd/WWW2025/output/ovis1_6-gemma2-9b/v14-20241126-214907/ovis1_6-gemma2-9b/v0-20241126-215001/logging.jsonl
Train: 0%| | 0/1980 [00:00<?, ?it/s]/opt/conda/envs/www2025/lib/python3.12/site-packages/bitsandbytes/autograd/_functions.py:316: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization
warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization")
Traceback (most recent call last):
File "/home/tom/fssd/WWW2025/swift/swift/cli/sft.py", line 5, in
sft_main()
File "/home/tom/fssd/WWW2025/swift/swift/utils/run_utils.py", line 32, in x_main
result = llm_x(args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/fssd/WWW2025/swift/swift/llm/sft.py", line 546, in llm_sft
return trainer_train(args, model, template, train_dataset, val_dataset, callbacks=callbacks, msg=msg)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/fssd/WWW2025/swift/swift/llm/sft.py", line 496, in trainer_train
trainer.train(training_args.resume_from_checkpoint)
File "/home/tom/fssd/WWW2025/swift/swift/trainers/mixin.py", line 493, in train
res = super().train(resume_from_checkpoint, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/trainer.py", line 2122, in train
return inner_training_loop(
^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/trainer.py", line 2474, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/trainer.py", line 3572, in training_step
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/fssd/WWW2025/swift/swift/trainers/trainers.py", line 161, in compute_loss
outputs = model(**inputs)
^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1844, in _call_impl
return inner()
^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1769, in inner
args_kwargs_result = hook(self, args, kwargs) # type: ignore[misc]
^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/fssd/WWW2025/swift/swift/llm/utils/template.py", line 350, in _pre_forward_hook
res_extra.append(self._post_encode(module, d))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/fssd/WWW2025/swift/swift/llm/utils/template.py", line 1355, in _post_encode
_, inputs_embeds, labels, _ = self.model.merge_multimodal(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/.cache/huggingface/modules/transformers_modules/Ovis1.6-Gemma2-9B/modeling_ovis.py", line 376, in merge_multimodal
visual_tokens = self.visual_tokenizer(torch.cat([x for x in pixel_values], dim=0))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/.cache/huggingface/modules/transformers_modules/Ovis1.6-Gemma2-9B/modeling_ovis.py", line 223, in forward
features = self.encode(pixel_values)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/tom/.cache/huggingface/modules/transformers_modules/Ovis1.6-Gemma2-9B/modeling_ovis.py", line 198, in encode
output = self.backbone(pixel_values, output_hidden_states=True, return_dict=True)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/models/siglip/modeling_siglip.py", line 1190, in forward
return self.vision_model(
^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/models/siglip/modeling_siglip.py", line 1101, in forward
pooler_output = self.head(last_hidden_state) if self.use_head else None
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/transformers/models/siglip/modeling_siglip.py", line 1128, in forward
hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/accelerate/hooks.py", line 170, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/modules/activation.py", line 1368, in forward
attn_output, attn_output_weights = F.multi_head_attention_forward(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/envs/www2025/lib/python3.12/site-packages/torch/nn/functional.py", line 6251, in multi_head_attention_forward
attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: self and mat2 must have the same dtype, but got BFloat16 and Char
Train: 0%| | 0/1980 [00:03<?, ?it/s]
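A likely cause, judging from the module dump and the quantization_config above (`load_in_8bit: true`, `llm_int8_skip_modules: null`): bitsandbytes replaced every `nn.Linear` in the model with `Linear8bitLt`, including the `out_proj` of the `MultiheadAttention` pooling head inside the SigLIP visual tokenizer. `nn.MultiheadAttention` calls `F.linear` on that weight directly instead of going through the bitsandbytes 8-bit matmul, so the bfloat16 activations meet a raw int8 (`Char`) weight and the matmul fails. Below is a minimal sketch of a possible workaround, assuming the model is loaded manually through `transformers` rather than the swift CLI; the module names `visual_tokenizer` and `vte` are taken from the model printout, and keeping them out of 8-bit quantization is the assumption being tested.

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Quantize only the LLM to 8-bit; keep the SigLIP visual tokenizer and the
# visual embedding table in bf16 so the MultiheadAttention pooling head
# never sees an int8 weight. Module names come from the printed structure.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_skip_modules=["visual_tokenizer", "vte"],  # assumed skip list
)

model = AutoModelForCausalLM.from_pretrained(
    "/home/tom/fssd/WWW2025/Ovis1.6-Gemma2-9B",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto",
)
```

If this works, the model printout should show plain bf16 `Linear` layers under `visual_tokenizer` while the Gemma2 decoder layers remain `Linear8bitLt`.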