PyTorch Lightning - a lightweight PyTorch wrapper for high-performance AI research. Think of it as a framework for organizing your PyTorch code.
Hydra - a framework for elegantly configuring complex applications. The key feature is the ability to dynamically create a hierarchical configuration by composition and override it through config files and the command line.
DVC - A tool designed to handle large datasets and machine learning models in a version-controlled workflow
Tensorboard|wandb - TensorBoard is a tool that provides visualization and debugging capabilities for TensorFlow and PyTorch experiments. It’s a popular choice for monitoring machine learning training processes in real time.
AWS|EC2|S3|Lambda|ECR - AWS Elastic Compute Cloud (EC2) is a service that provides scalable virtual computing resources in the cloud.
Docker - A platform for creating, deploying, and managing lightweight, portable, and scalable containers.
FastAPI|Gradio - A Python library for building simple, interactive web interfaces for machine learning models and APIs.
Next.js - Frontend framework
K8s|Knative|KServe|Istio|ArgoCD - AWS Kubernetes and ArgoCD
[Prometheus|Grafana] - observability
chmod +x shscripts/download_zips.sh && ./shscripts/download_zips.sh
data/processed/sports/
├── sports.csv
├── test
│ ├── air hockey
│ ├── .
│ ├── .
│ ├── .
│ └── wingsuit flying
├── train
│ ├── air hockey
│ ├── .
│ ├── .
│ ├── .
│ └── wingsuit flying
└── valid
├── air hockey
├── .
├── .
├── .
└── wingsuit flying
-----------------------
8000
------------------------------
data/processed/vegfruits/
├── test
│ ├── apple
│ ├── banana
│ ├── .
│ ├── .
│ ├── turnip
│ └── watermelon
├── train
│ ├── apple
│ ├── banana
│ ├── .
│ ├── .
│ ├── turnip
│ └── watermelon
└── validation
├── apple
├── banana
├── .
├── .
├── turnip
└── watermelon
uv sync --extra cpu # install torch-cpu version
uv sync --extra cu124 # install torch-gpu version cu124
uv sync --group develop --group visuals --group testing --group prod --extra cpu --no-cache
uv sync --group develop --group visuals --group testing --group prod --extra cu124 # install deps from all
uv run --env-file .env --extra cpu
time uv sync --group develop --group visuals --group testing --group prod --extra cu124 --no-cache
real 3m4.088s
user 0m40.967s
sys 0m30.678s
# development # 1.8GB
dvc init
dvc add data/processed/sports
dvc add data/processed/vegfruits
dvc remote add -d mlops s3://`bucket-name`
dvc push
# pull data
dvc pull
# reproduce
dvc repro
make hsports # make sure to comment out the pretrained model, run for more epochs, and maximize n_trials
experiment: hsports
hydra:
sweeper:
n_trials: 26 # variation of models
params:
++model.stem_type: choice('patch','overlap')
++model.act_layer: choice('relu','gelu')
++model.global_pool: choice('avg','fast')
++model.depths: "[2,2,6,2],[3,3,9,3]"
++model.dims: "[24,48,22,168],[12,32,44,96]"
++model.kernel_sizes: "[3, 5, 7, 9],[3,3,3,3]"
++model.use_pos_emb: "[False,True,False,False], [True,True,True,False]"
trainer:
max_epochs: 10
experiment: tsports
script: true
name: sports
callbacks.model_checkpoint.filename: sports
make tsports # uncomment the pretrained model to save time.
- Check Test Metrics
- Save Models
- checkpoints
- torchscript
- cpu
- gpu
- onnx
- Explore Dataset
- Class Distribution
- Batch Images
- Confusion Matrices
- Train
- Test
- Validation
make esports # plot_confusion metrics
make show
make showoff
make gradio-deploy-locally
open http://0.0.0.0:8080/
export ENABLE_TORCH_PROFILER=true
enable_envvars_config=true
torch-model-archiver \
--model-name sports \
--version 1.0 \
--export-path ./model_stores/mar_sports \
--handler ./src/backend/torchserve_app/sports_handler.py \
--serialized-file ./checkpoints/pths/sports.pt \
--extra-files index_to_name.json
torch-model-archiver --model-name msports --serialized-file checkpoints/onnxs/sports.onnx --handler src/backend/torchserve_app/sports_handler.py --export-path checkpoints/model_stores/sports/ -f --version 0.0.1 --extra-files checkpoints/model_stores/sports/index_to_name.json
torch-model-archiver --model-name mvegfruits --serialized-file checkpoints/onnxs/vegfruits.onnx --handler src/backend/torchserve_app/vegfruits_handler.py --export-path checkpoints/model_stores/vegfruits/ -f --version 0.0.1 --extra-files checkpoints/model_stores/vegfruits/index_to_name.json
torchserve --start --model-store checkpoints/model_stores/sports/ --ts-config checkpoints/model_stores/sports/config.properties --enable-model-api --disable-token-auth
torchserve --start --model-store checkpoints/model_stores/vegfruits/ --ts-config checkpoints/model_stores/vegfruits/config.properties --enable-model-api --disable-token-auth
curl http://localhost:8080/ping
curl -X OPTIONS http://localhost:8080 -o src/backend/torchserve_app/vegfruits_swagger.json
curl -X OPTIONS http://localhost:8080 -o src/backend/torchserve_app/sports_swagger.json
curl http://localhost:8080/predictions/mvegfruits -F 'data=@data/processed/vegfruits/validation/lettuce/Image_8.jpg'
curl http://localhost:8080/predictions/msports -F 'data=@data/processed/sports/train/speed skating/001.jpg'
# management
curl http://localhost:8081/models
curl http://localhost:8081/models/mvegfruits
curl http://localhost:8081/models/msports
curl -v -X PUT "http://localhost:8081/models/mvegfruits?min_workers=1&batch_size=10"
curl -v -X PUT "http://localhost:8081/models/msports?min_workers=1&batch_size=10"
# metrics
curl http://localhost:8082/metrics
# github.com/moby/moby/issues/12886#issuecomment-480575928
export DOCKER_BUILDKIT=1
image: python:3.12.7-slim-bookworm #26.91 MB
build time: 2 Mins # (GitHub Codespaces)
size: 3.01GB