Description
Bug description
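Hello, I've been attempting to run a new project with SLEAP, and I'm running into an issue where training fails to start. I receive this error (the full traceback is in the terminal output below): `InvalidArgumentError: Shape of tensor EagerPyFunc [1080,1920,1] is not compatible with expected shape [1080,1920,3].`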
I've attempted a fresh install to no avail.
It was working earlier, when I started my initial training with only 50 labeled frames. However, once I added an additional 300 frames, it no longer runs. Below is the full command-line terminal output.
I've also attempted changing the input scaling. I'm not sure exactly what I've done wrong here, as I've previously run different training sets with many MORE videos of the same size and with many more labels. Any assistance would be appreciated! Thanks!
C:\Windows\System32>conda activate sleap
(sleap) C:\Windows\System32>sleap-label
Saving config: C:\Users\Cscho/.sleap/1.4.1a2/preferences.yaml
Restoring GUI state...
Software versions:
SLEAP: 1.4.1a2
TensorFlow: 2.7.0
Numpy: 1.21.6
Python: 3.7.12
OS: Windows-10-10.0.26100-SP0
Happy SLEAPing! :)
Resetting monitor window.
Polling: G:/Inscopix_projects/Sleap_\models\250429_230757.centroid.n=349\viz\validation.*.png
Start training centroid...
['sleap-train', 'C:\Users\Cscho\AppData\Local\Temp\tmpx26fwmq8\250429_230757_training_job.json', 'G:/Inscopix_projects/Sleap_/Clean_cerebellum_training.slp', '--zmq', '--controller_port', '9000', '--publish_port', '9001', '--save_viz']
INFO:sleap.nn.training:Versions:
SLEAP: 1.4.1a2
TensorFlow: 2.7.0
Numpy: 1.21.6
Python: 3.7.12
OS: Windows-10-10.0.26100-SP0
INFO:sleap.nn.training:Training labels file: G:/Inscopix_projects/Sleap_/Clean_cerebellum_training.slp
INFO:sleap.nn.training:Training profile: C:\Users\Cscho\AppData\Local\Temp\tmpx26fwmq8\250429_230757_training_job.json
INFO:sleap.nn.training:
INFO:sleap.nn.training:Arguments:
INFO:sleap.nn.training:{
"training_job_path": "C:\Users\Cscho\AppData\Local\Temp\tmpx26fwmq8\250429_230757_training_job.json",
"labels_path": "G:/Inscopix_projects/Sleap_/Clean_cerebellum_training.slp",
"video_paths": [
""
],
"val_labels": null,
"test_labels": null,
"base_checkpoint": null,
"tensorboard": false,
"save_viz": true,
"zmq": true,
"publish_port": 9001,
"controller_port": 9000,
"run_name": "",
"prefix": "",
"suffix": "",
"cpu": false,
"first_gpu": false,
"last_gpu": false,
"gpu": "auto"
}
INFO:sleap.nn.training:
INFO:sleap.nn.training:Training job:
INFO:sleap.nn.training:{
"data": {
"labels": {
"training_labels": null,
"validation_labels": null,
"validation_fraction": 0.1,
"test_labels": null,
"split_by_inds": false,
"training_inds": null,
"validation_inds": null,
"test_inds": null,
"search_path_hints": [],
"skeletons": []
},
"preprocessing": {
"ensure_rgb": false,
"ensure_grayscale": false,
"imagenet_mode": null,
"input_scaling": 0.25,
"pad_to_stride": null,
"resize_and_pad_to_target": true,
"target_height": null,
"target_width": null
},
"instance_cropping": {
"center_on_part": "Center",
"crop_size": null,
"crop_size_detection_padding": 16
}
},
"model": {
"backbone": {
"leap": null,
"unet": {
"stem_stride": null,
"max_stride": 16,
"output_stride": 2,
"filters": 16,
"filters_rate": 2.0,
"middle_block": true,
"up_interpolate": true,
"stacks": 1
},
"hourglass": null,
"resnet": null,
"pretrained_encoder": null
},
"heads": {
"single_instance": null,
"centroid": {
"anchor_part": "Center",
"sigma": 2.5,
"output_stride": 2,
"loss_weight": 1.0,
"offset_refinement": false
},
"centered_instance": null,
"multi_instance": null,
"multi_class_bottomup": null,
"multi_class_topdown": null
},
"base_checkpoint": null
},
"optimization": {
"preload_data": true,
"augmentation_config": {
"rotate": true,
"rotation_min_angle": -180.0,
"rotation_max_angle": 180.0,
"translate": false,
"translate_min": -5,
"translate_max": 5,
"scale": false,
"scale_min": 0.9,
"scale_max": 1.1,
"uniform_noise": false,
"uniform_noise_min_val": 0.0,
"uniform_noise_max_val": 10.0,
"gaussian_noise": false,
"gaussian_noise_mean": 5.0,
"gaussian_noise_stddev": 1.0,
"contrast": false,
"contrast_min_gamma": 0.5,
"contrast_max_gamma": 2.0,
"brightness": false,
"brightness_min_val": 0.0,
"brightness_max_val": 10.0,
"random_crop": false,
"random_crop_height": 256,
"random_crop_width": 256,
"random_flip": true,
"flip_horizontal": false
},
"online_shuffling": true,
"shuffle_buffer_size": 128,
"prefetch": true,
"batch_size": 4,
"batches_per_epoch": null,
"min_batches_per_epoch": 200,
"val_batches_per_epoch": null,
"min_val_batches_per_epoch": 10,
"epochs": 200,
"optimizer": "adam",
"initial_learning_rate": 0.0001,
"learning_rate_schedule": {
"reduce_on_plateau": true,
"reduction_factor": 0.5,
"plateau_min_delta": 1e-06,
"plateau_patience": 5,
"plateau_cooldown": 3,
"min_learning_rate": 1e-08
},
"hard_keypoint_mining": {
"online_mining": false,
"hard_to_easy_ratio": 2.0,
"min_hard_keypoints": 2,
"max_hard_keypoints": null,
"loss_scale": 5.0
},
"early_stopping": {
"stop_training_on_plateau": true,
"plateau_min_delta": 1e-08,
"plateau_patience": 20
}
},
"outputs": {
"save_outputs": true,
"run_name": "250429_230757.centroid.n=349",
"run_name_prefix": "",
"run_name_suffix": "",
"runs_folder": "G:/Inscopix_projects/Sleap_\models",
"tags": [
""
],
"save_visualizations": true,
"delete_viz_images": true,
"zip_outputs": false,
"log_to_csv": true,
"checkpointing": {
"initial_model": false,
"best_model": true,
"every_epoch": false,
"latest_model": false,
"final_model": false
},
"tensorboard": {
"write_logs": false,
"loss_frequency": "epoch",
"architecture_graph": false,
"profile_graph": false,
"visualizations": true
},
"zmq": {
"subscribe_to_controller": true,
"controller_address": "tcp://127.0.0.1:9000",
"controller_polling_timeout": 10,
"publish_updates": true,
"publish_address": "tcp://127.0.0.1:9001"
}
},
"name": "",
"description": "",
"sleap_version": "1.4.1a2",
"filename": "C:\Users\Cscho\AppData\Local\Temp\tmpx26fwmq8\250429_230757_training_job.json"
}
INFO:sleap.nn.training:
INFO:sleap.nn.training:Auto-selected GPU 0 with 6719 MiB of free memory.
INFO:sleap.nn.training:Using GPU 0 for acceleration.
INFO:sleap.nn.training:Disabled GPU memory pre-allocation.
INFO:sleap.nn.training:System:
GPUs: 1/1 available
Device: /physical_device:GPU:0
Available: True
Initialized: False
Memory growth: True
INFO:sleap.nn.training:
INFO:sleap.nn.training:Initializing trainer...
INFO:sleap.nn.training:Loading training labels from: G:/Inscopix_projects/Sleap_/Clean_cerebellum_training.slp
INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1
INFO:sleap.nn.training: Splits: Training = 314 / Validation = 35.
INFO:sleap.nn.training:Setting up for training...
INFO:sleap.nn.training:Setting up pipeline builders...
INFO:sleap.nn.training:Setting up model...
INFO:sleap.nn.training:Building test pipeline...
2025-04-29 23:08:05.342239: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-29 23:08:05.780882: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5446 MB memory: -> device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6
INFO:sleap.nn.training:Loaded test example. [2.909s]
INFO:sleap.nn.training: Input shape: (272, 480, 3)
INFO:sleap.nn.training:Created Keras model.
INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=16, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=4, middle_block=True, up_blocks=3, up_interpolate=True, block_contraction=False)
INFO:sleap.nn.training: Max stride: 16
INFO:sleap.nn.training: Parameters: 1,953,393
INFO:sleap.nn.training: Heads:
INFO:sleap.nn.training: [0] = CentroidConfmapsHead(anchor_part='Center', sigma=2.5, output_stride=2, loss_weight=1.0)
INFO:sleap.nn.training: Outputs:
INFO:sleap.nn.training: [0] = KerasTensor(type_spec=TensorSpec(shape=(None, 136, 240, 1), dtype=tf.float32, name=None), name='CentroidConfmapsHead/BiasAdd:0', description="created by layer 'CentroidConfmapsHead'")
INFO:sleap.nn.training:Training from scratch
INFO:sleap.nn.training:Setting up data pipelines...
INFO:sleap.nn.training:Training set: n = 314
INFO:sleap.nn.training:Validation set: n = 35
INFO:sleap.nn.training:Setting up optimization...
INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=5, plateau_cooldown=3, min_learning_rate=1e-08)
INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-08, plateau_patience=20)
INFO:sleap.nn.training:Setting up outputs...
INFO:sleap.nn.callbacks:Training controller subscribed to: tcp://127.0.0.1:9000 (topic: )
INFO:sleap.nn.training: ZMQ controller subcribed to: tcp://127.0.0.1:9000
INFO:sleap.nn.callbacks:Progress reporter publishing on: tcp://127.0.0.1:9001 for: not_set
INFO:sleap.nn.training: ZMQ progress reporter publish on: tcp://127.0.0.1:9001
INFO:sleap.nn.training:Created run path: G:/Inscopix_projects/Sleap_\models\250429_230757.centroid.n=349
INFO:sleap.nn.training:Setting up visualization...
INFO:sleap.nn.training:Finished trainer set up. [7.0s]
INFO:sleap.nn.training:Creating tf.data.Datasets for training data generation...
Traceback (most recent call last):
File "C:\Users\Cscho\miniconda3\envs\sleap\Scripts\sleap-train-script.py", line 33, in <module>
sys.exit(load_entry_point('sleap==1.4.1a2', 'console_scripts', 'sleap-train')())
File "C:\Users\Cscho\miniconda3\envs\sleap\lib\site-packages\sleap\nn\training.py", line 2030, in main
trainer.train()
File "C:\Users\Cscho\miniconda3\envs\sleap\lib\site-packages\sleap\nn\training.py", line 928, in train
training_ds = self.training_pipeline.make_dataset()
File "C:\Users\Cscho\miniconda3\envs\sleap\lib\site-packages\sleap\nn\data\pipelines.py", line 287, in make_dataset
ds = transformer.transform_dataset(ds)
File "C:\Users\Cscho\miniconda3\envs\sleap\lib\site-packages\sleap\nn\data\dataset_ops.py", line 318, in transform_dataset
self.examples = list(iter(ds))
File "C:\Users\Cscho\miniconda3\envs\sleap\lib\site-packages\tensorflow\python\data\ops\iterator_ops.py", line 800, in next
return self._next_internal()
File "C:\Users\Cscho\miniconda3\envs\sleap\lib\site-packages\tensorflow\python\data\ops\iterator_ops.py", line 786, in _next_internal
output_shapes=self._flat_output_shapes)
File "C:\Users\Cscho\miniconda3\envs\sleap\lib\site-packages\tensorflow\python\ops\gen_dataset_ops.py", line 2844, in iterator_get_next
_ops.raise_from_not_ok_status(e, name)
File "C:\Users\Cscho\miniconda3\envs\sleap\lib\site-packages\tensorflow\python\framework\ops.py", line 7107, in raise_from_not_ok_status
raise core.status_to_exception(e) from None # pylint: disable=protected-access
tensorflow.python.framework.errors_impl.InvalidArgumentError: Shape of tensor EagerPyFunc [1080,1920,1] is not compatible with expected shape [1080,1920,3].
[[{{node EnsureShape}}]] [Op:IteratorGetNext]
2025-04-29 23:08:12.075638: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to dataset.cache().take(k).repeat()
. You should use dataset.take(k).cache().repeat()
instead.
INFO:sleap.nn.callbacks:Closing the reporter controller/context.
INFO:sleap.nn.callbacks:Closing the training controller socket/context.
Run Path: G:/Inscopix_projects/Sleap\models\250429_230757.centroid.n=349