torchao already works on raspberry pi #1076
Closed
@msaroufim

Description


Problem

We don't publish aarch64 Linux binaries, so right now pip still installs the stale torchao 0.1 wheel:

(myvenv) marksaroufim@rpi5:~/Dev/ao $ pip install torchao
Looking in indexes: https://pypi.org/simple, https://www.piwheels.org/simple
Collecting torchao
  Downloading torchao-0.1-py3-none-any.whl (54 kB)

But torchao actually works on a Raspberry Pi.
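
A quick way to confirm what pip actually resolved (a minimal check added here, not part of the original report):

from importlib.metadata import version

# Prints "0.1" with the stale piwheels wheel above; newer releases
# need a source build until aarch64 wheels are published.
print(version("torchao"))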

Environment details

(myvenv) marksaroufim@rpi5:~/Dev/ao $ uname -a
Linux rpi5 6.6.51+rpt-rpi-2712 #1 SMP PREEMPT Debian 1:6.6.51-1+rpt3 (2024-10-08) aarch64 GNU/Linux

Test file I ran

Logs

(myvenv) marksaroufim@rpi5:~/Downloads $ python test.py 
/home/marksaroufim/Dev/myvenv/lib/python3.11/site-packages/torch/_subclasses/functional_tensor.py:294: UserWarning: Failed to initialize NumPy: No module named 'numpy' (Triggered internally at /pytorch/torch/csrc/utils/tensor_numpy.cpp:82.)
  cpu = _conversion_method_template(device=torch.device("cpu"))
Model size on disk: 7821.85 KB
Model size on disk: 1988.05 KB
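
Those numbers line up with a back-of-envelope count: Linear(1000, 2000) holds 1000 × 2000 weights plus 2000 biases, so fp32 storage is about 7820 KB and int8 weight-only storage about 1961 KB, with the remainder going to quantization scales and pickle metadata. A rough check (the overhead attribution is an estimate):

# Rough size accounting for the log above; assumes fp32 params before quantization
# and int8 weights plus an fp32 bias afterwards. Scale/metadata overhead is an assumption.
n_weight, n_bias = 1000 * 2000, 2000
fp32_kb = (n_weight + n_bias) * 4 / 1024   # ~7820.3 KB vs. 7821.85 KB observed
int8_kb = (n_weight + n_bias * 4) / 1024   # ~1960.9 KB; the remaining ~27 KB is scales/metadata
print(f"fp32: {fp32_kb:.1f} KB, int8 weights: {int8_kb:.1f} KB")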

Test file

import torch
import torch.nn as nn
import os

class ToyModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(ToyModel, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        # self.layer2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # x = torch.relu(self.layer1(x))
        x = self.layer1(x)
        return x

# Create an instance of the model
input_size = 1000
hidden_size = 2000
output_size = 5
model = ToyModel(input_size, hidden_size, output_size)

# Save the model
torch.save(model.state_dict(), "toy_model.pth")

# Get the size of the saved model file
file_size = os.path.getsize("toy_model.pth")

print(f"Model size on disk: {file_size / 1024:.2f} KB")

import torchao
from torchao.quantization.quant_api import (quantize_, int8_weight_only, int4_weight_only)

# quantize_ modifies the model in place, swapping in int8 weight-only tensors
quantize_(model, int8_weight_only())
torch.save(model.state_dict(), "quantized_toy_model.pth")  # state_dict(), not the bound method

file_size = os.path.getsize("quantized_toy_model.pth")
print(f"Model size on disk: {file_size / 1024:.2f} KB")

torch.compile works

import torch
import torch.nn as nn
import os

class ToyModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(ToyModel, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        # self.layer2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # x = torch.relu(self.layer1(x))
        x = self.layer1(x)
        return x

# Create an instance of the model
input_size = 1000
hidden_size = 2000
output_size = 5
model = ToyModel(input_size, hidden_size, output_size)

sample_input = torch.randn(1, input_size)
model(sample_input)

# Save the model
torch.save(model.state_dict(), "toy_model.pth")

# Get the size of the saved model file
file_size = os.path.getsize("toy_model.pth")

print(f"Model size on disk: {file_size / 1024:.2f} KB")

import torchao
from torchao.quantization.quant_api import (quantize_, int8_weight_only, int4_weight_only)

quantize_(model, int8_weight_only())
model = torch.compile(model)
model(sample_input)  # first call triggers compilation
torch.save(model.state_dict(), "quantized_toy_model.pth")  # state_dict(), not the bound method

file_size = os.path.getsize("quantized_toy_model.pth")
print(f"Model size on disk: {file_size / 1024:.2f} KB")

bf16 works

import torch

# Create two bf16 tensors
tensor1 = torch.randn(1000, dtype=torch.bfloat16)
tensor2 = torch.randn(1000, dtype=torch.bfloat16)

# Perform dot product
result = torch.dot(tensor1, tensor2)

# Print the result
print("Dot product result:", result)

Full test suite

(myvenv) marksaroufim@rpi5:~/Dev/ao $ rm test/prototype/test_spinquant.py 
(myvenv) marksaroufim@rpi5:~/Dev/ao $ pytest test
============================= test session starts ==============================
platform linux -- Python 3.11.2, pytest-7.4.0, pluggy-1.5.0
rootdir: /home/marksaroufim/Dev/ao
plugins: hypothesis-6.115.2
collected 1921 items / 10 skipped                                              

test/test_ao_models.py ....                                              [  0%]
test/test_ops.py sssssssssssssssssssssssssssssssssssssssssssssssssssssss [  3%]
ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss [  6%]
ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss [ 10%]
ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss [ 14%]
ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss [ 18%]
sssssssssssssssss                                                        [ 18%]
test/test_utils.py ..                                                    [ 19%]
test/dtypes/test_affine_quantized.py sssssssssssssss                     [ 19%]
test/dtypes/test_affine_quantized_float.py ssssssssssssssssssssssss..sss [ 21%]
s.                                                                       [ 21%]
test/dtypes/test_affine_quantized_tensor_parallel.py sssssssss           [ 21%]
test/dtypes/test_bitnet.py .........sss                                  [ 22%]
test/dtypes/test_bitpacking.py .....................ssssssssssssssssssss [ 24%]
sssssssssssssssssssssss.                                                 [ 25%]
test/dtypes/test_floatx.py ssss.....ss..                                 [ 26%]
test/dtypes/test_nf4.py ...ssssssssssssssssssssssssssssssss...sss...ssss [ 29%]
ss.s.....s.....sss...............ss.......s                              [ 31%]
test/dtypes/test_uint2.py ........                                       [ 31%]
test/dtypes/test_uint4.py sss                                            [ 31%]
test/dtypes/test_uintx.py ssssssssssssssssssssssssssssssssssssssssssssss [ 34%]
ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss [ 38%]
ssssssss                                                                 [ 38%]
test/float8/test_base.py ............sssssssssssssssssssssssssssssssssss [ 40%]
ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss [ 44%]
ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss [ 48%]
ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss.sssssss [ 52%]
sssssssssss......                                                        [ 53%]
test/float8/test_compile.py ssssssssssssssssssssssssssssssssssssssssssss [ 55%]
ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss [ 59%]
ssssssssssssssssssssssssssssssssssssssssssssssssssssssss                 [ 61%]
test/float8/test_numerics_integration.py sssssssssssssssssssssssssssss   [ 63%]
test/hqq/test_hqq_affine.py sssssss                                      [ 63%]
test/integration/test_integration.py ....s...sss.s.s.sssssssssssssssssss [ 65%]
ssssssssssssssssssss.sssss.sss...sss...sssssssssss.sssss.sssssssssssssss [ 69%]
ssssssssssss...sss...sss..ssssssssssssssssssss.sss...ssss.s.ssssssssssss [ 73%]
ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss [ 76%]
ssssssssssssssssss.s                                                     [ 77%]
test/kernel/test_autotuner.py sssss.s.                                   [ 78%]
test/profiler/test_performance_counter.py sssssssss..ssssssss            [ 79%]
test/prototype/test_awq.py sssss                                         [ 79%]
test/prototype/test_bitpacking_gen.py ......                             [ 79%]
test/prototype/test_low_bit_optim.py ....ssssssss............s           [ 81%]
test/prototype/test_quantized_training.py ssssssssss...............ss    [ 82%]
test/prototype/test_splitk.py ss                                         [ 82%]
test/prototype/mx_formats/test_custom_cast.py ss...............ss...s.s. [ 84%]
s.s.s                                                                    [ 84%]
test/prototype/mx_formats/test_mx_linear.py ssssssssssssssssssssssssssss [ 85%]
ssssssssssssssssssssssssssssssssssssssssssssssss.                        [ 88%]
test/prototype/mx_formats/test_mx_tensor.py ssssssssssssssssssssssssssss [ 89%]
ssssssssssssssssssssss.....ssssssssssssssssssss                          [ 92%]
test/quantization/test_mixed_precision.py .                              [ 92%]
test/quantization/test_observer.py ......                                [ 92%]
test/quantization/test_qat.py ....sss........                            [ 93%]
test/quantization/test_quant_api.py s.s.sssssssss..sssssssss             [ 94%]
test/quantization/test_quant_primitives.py .........s.............       [ 95%]
test/sparsity/test_fast_sparse_training.py ss                            [ 95%]
test/sparsity/test_marlin.py sss                                         [ 96%]
test/sparsity/test_parametrization.py ....                               [ 96%]
test/sparsity/test_scheduler.py ......                                   [ 96%]
test/sparsity/test_sparse_api.py sssssssss                               [ 97%]
test/sparsity/test_sparsifier.py ....................                    [ 98%]
test/sparsity/test_sparsity_utils.py ........                            [ 98%]
test/sparsity/test_structured_sparsifier.py ......................       [ 99%]
test/sparsity/test_wanda.py .....                                        [100%]

=============================== warnings summary ===============================
test/dtypes/test_nf4.py::TestNF4Linear::test_to_copy_bfloat16
test/dtypes/test_nf4.py::TestNF4Linear::test_to_copy_float16
test/dtypes/test_nf4.py::TestNF4Linear::test_to_copy_float32
  /home/marksaroufim/Dev/ao/test/dtypes/test_nf4.py:205: FutureWarning: `torch.testing.assert_allclose()` is deprecated since 1.12 and will be removed in a future release. Please use `torch.testing.assert_close()` instead. You can find detailed upgrade instructions in https://github.com/pytorch/pytorch/issues/61844.
    torch.testing.assert_allclose(input_tensor, nf4_to_dtype, atol=0.13, rtol=0.13)

test/integration/test_integration.py::TestSaveLoadMeta::test_save_load_int4woqtensors_2_cpu
test/integration/test_integration.py::TestSaveLoadMeta::test_save_load_int8woqtensors_0_cpu
test/integration/test_integration.py::TestSaveLoadMeta::test_save_load_int8woqtensors_1_cpu
test/integration/test_integration.py::TestSaveLoadMeta::test_save_load_int8woqtensors_2_cpu
  /home/marksaroufim/Dev/ao/test/integration/test_integration.py:1038: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
    state_dict = torch.load("test.pth", mmap=True)

test/kernel/test_autotuner.py::TestQuantFlow::test_int_scaled_mm_1_cpu
test/kernel/test_autotuner.py::TestQuantFlow::test_int_scaled_mm_3_cpu
  /home/marksaroufim/Dev/ao/test/kernel/test_autotuner.py:97: FutureWarning: `torch.testing.assert_allclose()` is deprecated since 1.12 and will be removed in a future release. Please use `torch.testing.assert_close()` instead. You can find detailed upgrade instructions in https://github.com/pytorch/pytorch/issues/61844.
    torch.testing.assert_allclose(out32_1, out32_2)

test/profiler/test_performance_counter.py::test_performance_stats[no_device]
  /home/marksaroufim/Dev/myvenv/lib/python3.11/site-packages/torchao/profiler/performance_counter.py:381: UserWarning: Device bandwidth is not specified. Please specify the device bandwidth to enable bandwidth utilization calculation
    warnings.warn(

test/profiler/test_performance_counter.py::test_performance_stats[no_device]
  /home/marksaroufim/Dev/myvenv/lib/python3.11/site-packages/torchao/profiler/performance_counter.py:361: UserWarning: Device bandwidth is not specified. Please specify the device bandwidth to enable io latency calculation
    warnings.warn(

test/profiler/test_performance_counter.py::test_performance_stats[no_device]
  /home/marksaroufim/Dev/myvenv/lib/python3.11/site-packages/torchao/profiler/performance_counter.py:391: UserWarning: Device flops_per_s is not specified. Please specify the device throughput to enable flops utilization calculation
    warnings.warn(

test/profiler/test_performance_counter.py::test_performance_stats[no_device]
  /home/marksaroufim/Dev/myvenv/lib/python3.11/site-packages/torchao/profiler/performance_counter.py:371: UserWarning: Device flops_per_s is not specified. Please specify the device throughput to enable compute latency calculation
    warnings.warn(

test/prototype/test_low_bit_optim.py: 12 warnings
  /home/marksaroufim/Dev/ao/test/prototype/test_low_bit_optim.py:103: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
    state_dict = torch.load(f.name, map_location="cpu")

test/prototype/test_quantized_training.py::TestQuantizedTraining::test_int8_weight_only_compile_leading_dims0_bias_False_device_cpu
test/prototype/test_quantized_training.py::TestQuantizedTraining::test_int8_weight_only_compile_leading_dims0_bias_True_device_cpu
test/prototype/test_quantized_training.py::TestQuantizedTraining::test_int8_weight_only_compile_leading_dims1_bias_False_device_cpu
test/prototype/test_quantized_training.py::TestQuantizedTraining::test_int8_weight_only_compile_leading_dims1_bias_True_device_cpu
test/prototype/test_quantized_training.py::TestQuantizedTraining::test_int8_weight_only_compile_leading_dims2_bias_False_device_cpu
test/prototype/test_quantized_training.py::TestQuantizedTraining::test_int8_weight_only_compile_leading_dims2_bias_True_device_cpu
test/prototype/test_quantized_training.py::TestQuantizedTraining::test_int8_weight_only_training_compile_True_device_cpu
test/prototype/test_quantized_training.py::TestQuantizedTraining::test_int8_weight_only_training_compile_True_device_cpu
  /home/marksaroufim/Dev/myvenv/lib/python3.11/site-packages/torch/_dynamo/variables/misc.py:651: UserWarning: The config.capture_autograd_function flag is deprecated, it's now always true.
    warnings.warn(

test/sparsity/test_parametrization.py::TestFakeSparsity::test_jit_trace
  /home/marksaroufim/Dev/myvenv/lib/python3.11/site-packages/torchao/sparsity/prototype/sparsifier/utils.py:129: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
    assert self.mask.shape == x.shape

test/sparsity/test_scheduler.py::TestScheduler::test_lambda_scheduler
test/sparsity/test_scheduler.py::TestCubicScheduler::test_step
  /home/marksaroufim/Dev/myvenv/lib/python3.11/site-packages/torchao/sparsity/prototype/scheduler/base_scheduler.py:122: UserWarning: Detected call of `scheduler.step()` before `sparsifier.step()`. You have to make sure you run the sparsifier.step() BEFORE any calls to the scheduler.step().
    warnings.warn("Detected call of `scheduler.step()` before `sparsifier.step()`. "

test/sparsity/test_wanda.py::TestWandaSparsifier::test_one_layer_mlp_2x4
  /home/marksaroufim/Dev/myvenv/lib/python3.11/site-packages/torchao/sparsity/wanda.py:42: UserWarning: WandaSparsifier got semi_structured_bock_size=4, sparsity_level fixed to 50% (2:4) sparsity
    warnings.warn(

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
========== 330 passed, 1601 skipped, 37 warnings in 436.75s (0:07:16) ==========
(myvenv) marksaroufim@rpi5:~/Dev/ao $ 
