Description
I found a relatively simple case where memlet src and dst inference fails.
This is a simple copy from src (A @ GPU_Global) to dst (shr_A @ GPU_Shared), but regardless of how I specify the subsets, codegen produces wrong code.
e1 = state.add_edge(a1, None, a2, None, dace.Memlet(
    data=in_arr_name,  # inference fails whether A or shr_A is provided here
    subset=in_edge.data.subset,  # is [i:i+256]
    other_subset=dace.subsets.Range(copy_shape),  # is [0:256]
    wcr=None,
))
Inference fails, and the copy is generated from the source location to the source location:
/home/primrose/Work/DaceLayoutAndScheduleTransformations/.dacecache/kernel_316dd2c1ec53ead0f61901eb4d0e3aad/src/cuda/kernel_cuda.cu(101): error: no instance of function template "dace::GlobalToGlobal1D" matches the argument list
argument types are: (const double *, int, const double *__restrict__)
dace::GlobalToGlobal1D<double, 256, 1, 1, 256, 1, false>(A + i, 1, A);
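For reference, this is a sketch of the destination-anchored variant I also tried (shr_A as the memlet data, with the subsets swapped accordingly); it hits the same inference failure:

e1 = state.add_edge(a1, None, a2, None, dace.Memlet(
    data=shared_mem_name,  # shr_A instead of A
    subset=dace.subsets.Range(copy_shape),  # is [0:256], on the destination
    other_subset=in_edge.data.subset,  # is [i:i+256], on the source
    wcr=None,
))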
I think the correct way to solve this would be to make the canonical form of a memlet carry src_data, dst_data, subset, and other_subset. That matches how a copy is actually expressed in code: a source location, a destination location, and a subset for each side.
I want to discuss it before I start a PR.
For backwards-compatibility reasons, keeping the inference probably makes sense, but we should steer users toward one form and declare it the canonical form of a memlet. A sketch of what this could look like follows.
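To make the proposal concrete (src_data and dst_data are hypothetical keyword arguments, not the current dace.Memlet API):

# Hypothetical canonical form: both endpoints and both subsets are explicit,
# so codegen never has to infer which side of the copy the memlet describes.
m = dace.Memlet(
    src_data=in_arr_name,  # A (hypothetical kwarg)
    dst_data=shared_mem_name,  # shr_A (hypothetical kwarg)
    subset=in_edge.data.subset,  # [i:i+256] on the source
    other_subset=dace.subsets.Range(copy_shape),  # [0:256] on the destination
)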
The SDFG is attached (remove the .txt extension to run it): transformed_sdfg_with_shared_memory.sdfg.txt
The complete script to reproduce is here:
import copy

import cupy as cp

import dace
def _add_shared_memory(sdfg: dace.SDFG):
    for state in sdfg.all_states():
        for node in state.nodes():
            if isinstance(node, dace.nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_ThreadBlock:
                # Find the next map entry reachable from this node
                next_map = None
                for n in state.bfs_nodes(node):
                    if isinstance(n, dace.nodes.MapEntry):
                        next_map = n
                        break
                    elif isinstance(n, dace.nodes.MapExit):
                        break
                if next_map is None:
                    raise ValueError("No next map found for the GPU_ThreadBlock map entry.")

                src_name_dst_name_offset = dict()
                edges_to_rm = set()
                for in_edge in state.in_edges(node):
                    if in_edge.data is not None:
                        in_arr_name = in_edge.data.data
                        # Zero-based copy shape, plus the shape and offset of the shared-memory array
                        copy_shape = [(0, (e - b) // s, 1) for b, e, s in in_edge.data.subset]
                        copied_shape = [((e + 1) - b) // s for b, e, s in in_edge.data.subset]
                        copy_offset = [b for b, _, _ in in_edge.data.subset]
                        shared_mem_name = "shr_" + in_arr_name
                        in_arr = sdfg.arrays[in_arr_name]
                        if shared_mem_name not in sdfg.arrays:
                            sdfg.add_array(shared_mem_name, copied_shape, in_arr.dtype,
                                           storage=dace.dtypes.StorageType.GPU_Shared, transient=True)
                        # Route the input through a global -> shared copy
                        a1 = state.add_access(in_arr_name)
                        a2 = state.add_access(shared_mem_name)
                        e1 = state.add_edge(a1, None, a2, None, dace.Memlet(
                            data=in_arr_name,
                            subset=in_edge.data.subset,
                            other_subset=dace.subsets.Range(copy_shape),
                            wcr=None,
                        ))
                        e2 = state.add_edge(a2, None, next_map, in_edge.dst_conn,
                                            dace.Memlet.from_array(shared_mem_name,
                                                                   sdfg.arrays[shared_mem_name]))
                        e3 = state.add_edge(in_edge.src, in_edge.src_conn, a1, None,
                                            copy.deepcopy(in_edge.data))
                        edges_to_rm.add(in_edge)
                        src_name_dst_name_offset[in_arr_name] = (shared_mem_name, copy_offset)

                # Rewrite memlets inside the map to refer to the shared-memory array
                nodes = state.all_nodes_between(next_map, state.exit_node(next_map))
                for edge in state.all_edges(*nodes):
                    if edge.data is not None and edge.data.data in src_name_dst_name_offset:
                        dst_name, offset = src_name_dst_name_offset[edge.data.data]
                        edge.data.data = dst_name
                        old_subset = [(b, e, s) for b, e, s in edge.data.subset]
                        new_subset = [(b - offset[i], e - offset[i], s)
                                      for i, (b, e, s) in enumerate(old_subset)]
                        edge.data.subset = dace.subsets.Range(new_subset)

                for edge in edges_to_rm:
                    state.remove_edge(edge)
def test_standalone_execution():
    """Standalone test function that can be run without pytest."""
    print("Running standalone shared-memory transformation test...")

    # Setup
    dace.Config.set('cache', value='unique')

    # Create kernel
    N = dace.symbol("N", dtype=dace.int64)
    N_val = 1024

    @dace.program
    def kernel(
        A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global,
        B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global,
        C: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global,
    ):
        for i in dace.map[0:N:256] @ dace.dtypes.ScheduleType.GPU_Device:
            for j in dace.map[0:256] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:
                C[i + j] = A[i + j] + B[i + j]

    # Create original SDFG
    original_sdfg = kernel.to_sdfg(use_cache=False, simplify=False)
    original_sdfg.simplify()
    original_sdfg.save("original_sdfg.sdfg")

    # Create transformed SDFG
    transformed_sdfg = copy.deepcopy(original_sdfg)
    _add_shared_memory(transformed_sdfg)
    transformed_sdfg.save("transformed_sdfg_with_shared_memory.sdfg")

    # Validate SDFGs
    original_sdfg.validate()
    transformed_sdfg.validate()

    # Initialize data
    cp.random.seed(42)
    vals_A_orig = cp.fromfunction(lambda i: (i * 2) / N_val, (N_val,), dtype=cp.float64)
    vals_B_orig = cp.fromfunction(lambda i: (i * 3) / N_val, (N_val,), dtype=cp.float64)
    vals_C_orig = cp.fromfunction(lambda i: (i * 5) / N_val, (N_val,), dtype=cp.float64)
    vals_A_2 = vals_A_orig.copy()
    vals_B_2 = vals_B_orig.copy()
    vals_C_2 = vals_C_orig.copy()

    # Execute SDFGs
    original_sdfg(A=vals_A_orig, B=vals_B_orig, C=vals_C_orig, N=N_val)
    transformed_sdfg(A=vals_A_2, B=vals_B_2, C=vals_C_2, N=N_val)

    # Check results (C holds the output of both runs)
    vals_C_close = cp.allclose(vals_C_orig, vals_C_2, rtol=1e-10, atol=1e-12)
    print(f"vals_C results match: {vals_C_close}")
    if vals_C_close:
        print("✅ Test passed! The shared-memory transformation preserves correctness.")
    else:
        print("❌ Test failed! Results differ between original and transformed SDFGs.")
        print(f"vals_C max difference: {cp.max(cp.abs(vals_C_orig - vals_C_2))}")
    return vals_C_close


if __name__ == "__main__":
    success = test_standalone_execution()
    exit(0 if success else 1)