Open
Description
Describe the bug
I am using Kohya SS to train FLUX LoRA
On Linux, an RTX 3090 gets about 5.5 seconds/it at batch size 1 and 1024x1024 px resolution.
On Windows, an RTX 3090 Ti gets 7.7 seconds/it, and that machine has the most powerful CPU (13900K).
This speed discrepancy between Windows and Linux is huge for some reason.
Upgrading Torch from 2.1 to 2.4 on Linux caused a huge speed-up and a reduction in VRAM usage, but on Windows only the VRAM usage dropped - the speed stayed the same.
Any ideas on how to fix this? I am using SDPA cross attention.
I am sharing the venv pip freeze output from both Windows and Linux.
Both have Python 3.10.11.
Windows pip freeze
Microsoft Windows [Version 10.0.19045.4717]
(c) Microsoft Corporation. All rights reserved.
R:\Kohya_GUI_Flux_Installer\kohya_ss\venv\Scripts>activate
(venv) R:\Kohya_GUI_Flux_Installer\kohya_ss\venv\Scripts>pip freeze
absl-py==2.1.0
accelerate==0.33.0
aiofiles==23.2.1
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
altair==4.2.2
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.4.0
appdirs==1.4.4
astunparse==1.6.3
async-timeout==4.0.3
attrs==24.2.0
bitsandbytes==0.43.3
certifi==2022.12.7
charset-normalizer==2.1.1
click==8.1.7
colorama==0.4.6
coloredlogs==15.0.1
contourpy==1.2.1
cycler==0.12.1
dadaptation==3.2
diffusers==0.25.0
docker-pycreds==0.4.0
easygui==0.98.3
einops==0.7.0
entrypoints==0.4
exceptiongroup==1.2.2
fairscale==0.4.13
fastapi==0.112.1
ffmpy==0.4.0
filelock==3.13.1
flatbuffers==24.3.25
fonttools==4.53.1
frozenlist==1.4.1
fsspec==2024.2.0
ftfy==6.1.1
gast==0.6.0
gitdb==4.0.11
GitPython==3.1.43
google-pasta==0.2.0
gradio==4.41.0
gradio_client==1.3.0
grpcio==1.65.5
h11==0.14.0
h5py==3.11.0
httpcore==1.0.5
httpx==0.27.0
huggingface-hub==0.24.5
humanfriendly==10.0
idna==3.4
imagesize==1.4.1
importlib_metadata==8.4.0
importlib_resources==6.4.4
invisible-watermark==0.2.0
Jinja2==3.1.3
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
keras==3.5.0
kiwisolver==1.4.5
libclang==18.1.1
-e git+https://github.com/kohya-ss/sd-scripts.git@e1cd19c0c0ef55709e8eb1e5babe25045f65031f#egg=library&subdirectory=..\..\sd-scripts
lightning-utilities==0.11.6
lion-pytorch==0.0.6
lycoris-lora==2.2.0.post3
Markdown==3.7
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.2
mdurl==0.1.2
ml-dtypes==0.4.0
mpmath==1.3.0
multidict==6.0.5
namex==0.0.8
networkx==3.2.1
numpy==1.26.3
nvidia-cublas-cu12==12.4.2.65
nvidia-cuda-cupti-cu12==12.4.99
nvidia-cuda-nvrtc-cu12==12.4.99
nvidia-cuda-runtime-cu12==12.4.99
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.0.44
nvidia-curand-cu12==10.3.5.119
nvidia-cusolver-cu12==11.6.0.99
nvidia-cusparse-cu12==12.3.0.142
nvidia-nvjitlink-cu12==12.4.99
nvidia-nvtx-cu12==12.4.99
omegaconf==2.3.0
onnxruntime-gpu==1.17.1
open-clip-torch==2.20.0
opencv-python==4.7.0.68
opt-einsum==3.3.0
optree==0.12.1
orjson==3.10.7
packaging==24.1
pandas==2.2.2
pathtools==0.1.2
pillow==10.2.0
prodigyopt==1.0
protobuf==3.20.3
psutil==6.0.0
pydantic==2.8.2
pydantic_core==2.20.1
pydub==0.25.1
Pygments==2.18.0
pyparsing==3.1.2
pyreadline3==3.4.1
python-dateutil==2.9.0.post0
python-multipart==0.0.9
pytorch-lightning==1.9.0
pytz==2024.1
PyWavelets==1.7.0
PyYAML==6.0.2
referencing==0.35.1
regex==2024.7.24
requests==2.32.3
rich==13.7.1
rpds-py==0.20.0
ruff==0.6.1
safetensors==0.4.4
scipy==1.11.4
semantic-version==2.10.0
sentencepiece==0.2.0
sentry-sdk==2.13.0
setproctitle==1.3.3
shellingham==1.5.4
six==1.16.0
smmap==5.0.1
sniffio==1.3.1
starlette==0.38.2
sympy==1.12
tensorboard==2.17.1
tensorboard-data-server==0.7.2
tensorflow==2.17.0
tensorflow-intel==2.17.0
tensorflow-io-gcs-filesystem==0.31.0
termcolor==2.4.0
timm==0.6.12
tk==0.1.0
tokenizers==0.19.1
toml==0.10.2
tomlkit==0.12.0
toolz==0.12.1
torch==2.4.0+cu124
torchmetrics==1.4.1
torchvision==0.19.0+cu124
tqdm==4.66.5
transformers==4.44.0
typer==0.12.4
typing_extensions==4.9.0
tzdata==2024.1
urllib3==2.2.2
uvicorn==0.30.6
voluptuous==0.13.1
wandb==0.15.11
wcwidth==0.2.13
websockets==12.0
Werkzeug==3.0.4
wrapt==1.16.0
xformers==0.0.27.post2
yarl==1.9.4
zipp==3.20.0
(venv) R:\Kohya_GUI_Flux_Installer\kohya_ss\venv\Scripts>
Ubuntu pip freeze
(venv) Ubuntu@0054-kci-prxmx10136:~/apps/kohya_ss$ pip freeze
absl-py==2.1.0
accelerate==0.33.0
aiofiles==23.2.1
aiohttp==3.9.5
aiosignal==1.3.1
altair==4.2.2
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.4.0
appdirs==1.4.4
astunparse==1.6.3
async-timeout==4.0.3
attrs==23.2.0
bitsandbytes==0.43.3
cachetools==5.3.3
certifi==2024.2.2
charset-normalizer==3.3.2
click==8.1.7
coloredlogs==15.0.1
contourpy==1.2.1
cycler==0.12.1
dadaptation==3.1
diffusers==0.25.0
dnspython==2.6.1
docker-pycreds==0.4.0
easygui==0.98.3
einops==0.7.0
email_validator==2.1.1
entrypoints==0.4
exceptiongroup==1.2.1
fairscale==0.4.13
fastapi==0.111.0
fastapi-cli==0.0.4
ffmpy==0.3.2
filelock==3.14.0
flatbuffers==24.3.25
fonttools==4.53.0
frozenlist==1.4.1
fsspec==2024.5.0
ftfy==6.1.1
gast==0.5.4
gitdb==4.0.11
GitPython==3.1.43
google-auth==2.29.0
google-auth-oauthlib==1.2.0
google-pasta==0.2.0
gradio==4.41.0
gradio_client==1.3.0
grpcio==1.64.0
h11==0.14.0
h5py==3.11.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
huggingface-hub==0.24.5
humanfriendly==10.0
idna==3.7
imagesize==1.4.1
importlib_metadata==7.1.0
importlib_resources==6.4.0
invisible-watermark==0.2.0
Jinja2==3.1.4
jsonschema==4.22.0
jsonschema-specifications==2023.12.1
keras==2.15.0
kiwisolver==1.4.5
libclang==18.1.1
-e git+https://github.com/kohya-ss/sd-scripts.git@e1cd19c0c0ef55709e8eb1e5babe25045f65031f#egg=library&subdirectory=../../sd-scripts
lightning-utilities==0.11.2
lion-pytorch==0.0.6
lycoris-lora==2.2.0.post3
Markdown==3.6
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.0
mdurl==0.1.2
ml-dtypes==0.2.0
mpmath==1.3.0
multidict==6.0.5
networkx==3.3
numpy==1.26.4
nvidia-cublas-cu12==12.4.2.65
nvidia-cuda-cupti-cu12==12.4.99
nvidia-cuda-nvrtc-cu12==12.4.99
nvidia-cuda-runtime-cu12==12.4.99
nvidia-cudnn-cu12==9.1.0.70
nvidia-cufft-cu12==11.2.0.44
nvidia-curand-cu12==10.3.5.119
nvidia-cusolver-cu12==11.6.0.99
nvidia-cusparse-cu12==12.3.0.142
nvidia-nccl-cu12==2.20.5
nvidia-nvjitlink-cu12==12.4.99
nvidia-nvtx-cu12==12.4.99
oauthlib==3.2.2
omegaconf==2.3.0
onnxruntime-gpu==1.17.1
open-clip-torch==2.20.0
opencv-python==4.7.0.68
opt-einsum==3.3.0
orjson==3.10.3
packaging==24.0
pandas==2.2.2
pathtools==0.1.2
pillow==10.3.0
prodigyopt==1.0
protobuf==3.20.3
psutil==5.9.8
pyasn1==0.6.0
pyasn1_modules==0.4.0
pydantic==2.7.2
pydantic_core==2.18.3
pydub==0.25.1
Pygments==2.18.0
pyparsing==3.1.2
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.9
pytorch-lightning==1.9.0
pytz==2024.1
PyWavelets==1.6.0
PyYAML==6.0.1
referencing==0.35.1
regex==2024.5.15
requests==2.32.3
requests-oauthlib==2.0.0
rich==13.7.1
rpds-py==0.18.1
rsa==4.9
ruff==0.4.6
safetensors==0.4.2
scipy==1.11.4
semantic-version==2.10.0
sentencepiece==0.2.0
sentry-sdk==2.3.1
setproctitle==1.3.3
shellingham==1.5.4
six==1.16.0
smmap==5.0.1
sniffio==1.3.1
starlette==0.37.2
sympy==1.12.1
tensorboard==2.15.2
tensorboard-data-server==0.7.2
tensorflow==2.15.0.post1
tensorflow-estimator==2.15.0
tensorflow-io-gcs-filesystem==0.37.0
termcolor==2.4.0
timm==0.6.12
tk==0.1.0
tokenizers==0.19.1
toml==0.10.2
tomlkit==0.12.0
toolz==0.12.1
torch==2.4.0+cu124
torchmetrics==1.4.0.post0
torchvision==0.19.0+cu124
tqdm==4.66.4
transformers==4.44.0
triton==3.0.0
typer==0.12.3
typing_extensions==4.12.0
tzdata==2024.1
ujson==5.10.0
urllib3==2.2.1
uvicorn==0.30.0
uvloop==0.19.0
voluptuous==0.13.1
wandb==0.15.11
watchfiles==0.22.0
wcwidth==0.2.13
websockets==11.0.3
Werkzeug==3.0.3
wrapt==1.14.1
xformers==0.0.23.post1+cu118
yarl==1.9.4
zipp==3.19.1
cc @msaroufim @peterjc123 @mszhanyi @skyline75489 @nbcsm @iremyux @Blackhex @ptrblck
Metadata
Metadata
Assignees
Labels
Type
Projects
Status
Blocked