Src subset and dst subset inference #2037
Open
@ThrudPrimrose

Description


I found a relatively simple case where memlet src and dst subset inference fails.


This is a simple copy from src (A @ GlobalMemory) to dst (shr_A @ SharedMemory), but regardless of how I generated the subset, codegen produces incorrect code.

e1 = state.add_edge(a1, None, a2, None, dace.Memlet(
    data=in_arr_name,  # inference fails whether A or shr_A is provided here
    subset=in_edge.data.subset,  # is [i:i+256]
    other_subset=dace.subsets.Range(copy_shape),  # is [0:256]
    wcr=None,
))

The inference fails, and the copy is generated from the source location to the source location:

/home/primrose/Work/DaceLayoutAndScheduleTransformations/.dacecache/kernel_316dd2c1ec53ead0f61901eb4d0e3aad/src/cuda/kernel_cuda.cu(101): error: no instance of function template "dace::GlobalToGlobal1D" matches the argument list
            argument types are: (const double *, int, const double *__restrict__)
          dace::GlobalToGlobal1D<double, 256, 1, 1, 256, 1, false>(A + i, 1, A);
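
For reference, these are the two ways this copy can currently be expressed, and the failure occurs for both. This is a minimal sketch using only the public Memlet constructor; the Range values mirror the snippet above:

import dace

i = dace.symbol("i")

# Form 1: `data` names the source array, so `subset` is the source side
# ([i:i+256] on A) and `other_subset` the destination side ([0:256] on shr_A).
m1 = dace.Memlet(data="A",
                 subset=dace.subsets.Range([(i, i + 255, 1)]),
                 other_subset=dace.subsets.Range([(0, 255, 1)]))

# Form 2: `data` names the destination array; the subset roles swap.
m2 = dace.Memlet(data="shr_A",
                 subset=dace.subsets.Range([(0, 255, 1)]),
                 other_subset=dace.subsets.Range([(i, i + 255, 1)]))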

I think the correct way to solve this would be to make the canonical form of a memlet have src_data, dst_data, subset, and other_subset. When you think about how we copy things in code, it is src location, dst location, and the shapes for both source and destination.
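
To make the proposal concrete, a purely hypothetical sketch of such a canonical memlet (src_data and dst_data do not exist in the current Memlet API; the keyword names are illustrative only):

import dace

i = dace.symbol("i")

# Hypothetical canonical form: both endpoints and both subsets are explicit,
# so codegen never has to infer which side `subset` belongs to.
m = dace.Memlet(
    src_data="A",                                    # source array (hypothetical kwarg)
    dst_data="shr_A",                                # destination array (hypothetical kwarg)
    subset=dace.subsets.Range([(i, i + 255, 1)]),    # source subset, [i:i+256]
    other_subset=dace.subsets.Range([(0, 255, 1)]),  # destination subset, [0:256]
)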

I want to discuss it before I start a PR.

For backwards-compatibility reasons, I guess keeping the inference makes sense, but we should encourage people to use one of the forms and declare it as the canonical form of a memlet.

The SDFG I attached (remove the .txt suffix to run it):
transformed_sdfg_with_shared_memory.sdfg.txt

The complete script to reproduce is here:

import copy

import cupy as cp

import dace

def _add_shared_memory(sdfg: dace.SDFG):
    for state in sdfg.all_states():
        for node in state.nodes():
            if isinstance(node, dace.sdfg.nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_ThreadBlock:
                # Find the first map entry reachable from this thread-block map
                next_map = None
                for n in state.bfs_nodes(node):
                    if isinstance(n, dace.sdfg.nodes.MapEntry):
                        next_map = n
                        break
                    elif isinstance(n, dace.sdfg.nodes.MapExit):
                        break
                if next_map is None:
                    raise ValueError("No next map found for the GPU_ThreadBlock map entry.")

                src_name_dst_name_offset = dict()
                edges_to_rm = set()
                for in_edge in state.in_edges(node):
                    if in_edge.data is not None:
                        in_arr_name = in_edge.data.data
                        # The copied region: as a zero-based range, as array
                        # dimensions, and as the per-dimension source offset
                        copy_shape = [(0, (e - b) // s, 1) for b, e, s in in_edge.data.subset]
                        copied_shape = [((e + 1) - b) // s for b, e, s in in_edge.data.subset]
                        copy_offset = [b for b, _, _ in in_edge.data.subset]
                        shared_mem_name = "shr_" + in_arr_name
                        in_arr = sdfg.arrays[in_arr_name]
                        if shared_mem_name not in sdfg.arrays:
                            sdfg.add_array(shared_mem_name, copied_shape, in_arr.dtype,
                                           storage=dace.dtypes.StorageType.GPU_Shared, transient=True)

                        # Reroute the input through an explicit global -> shared copy:
                        # in_edge.src -> a1 (global) -> a2 (shared) -> next_map
                        a1 = state.add_access(in_arr_name)
                        a2 = state.add_access(shared_mem_name)
                        e1 = state.add_edge(a1, None, a2, None, dace.Memlet(
                            data=in_arr_name,
                            subset=in_edge.data.subset,                   # [i:i+256] on the global array
                            other_subset=dace.subsets.Range(copy_shape),  # [0:256] on the shared array
                            wcr=None,
                        ))
                        e2 = state.add_edge(a2, None, next_map, in_edge.dst_conn,
                                            dace.Memlet.from_array(shared_mem_name,
                                                                   sdfg.arrays[shared_mem_name]))
                        e3 = state.add_edge(in_edge.src, in_edge.src_conn, a1, None,
                                            copy.deepcopy(in_edge.data))
                        edges_to_rm.add(in_edge)
                        src_name_dst_name_offset[in_arr_name] = (shared_mem_name, copy_offset)

                # Redirect memlets inside the inner map from the global arrays to
                # their shared-memory copies, shifting subsets by the copy offset
                nodes = state.all_nodes_between(next_map, state.exit_node(next_map))
                for edge in state.all_edges(*nodes):
                    if edge.data is not None and edge.data.data in src_name_dst_name_offset:
                        dst_name, offset = src_name_dst_name_offset[edge.data.data]
                        edge.data.data = dst_name
                        old_subset = [(b, e, s) for b, e, s in edge.data.subset]
                        new_subset = [(b - offset[i], e - offset[i], s) for i, (b, e, s) in enumerate(old_subset)]
                        edge.data.subset = dace.subsets.Range(new_subset)

                for edge in edges_to_rm:
                    state.remove_edge(edge)



def test_standalone_execution():
    """Standalone test function that can be run without pytest."""
    print("Running standalone Shared Memory transformations test...")

    # Setup
    dace.Config.set('cache', value='unique')

    # Create kernel
    N = dace.symbol("N", dtype=dace.int64)
    N_val = 1024

    @dace.program
    def kernel(
        A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global,
        B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global,
        C: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global,
    ):
        for i in dace.map[0:N:256] @ dace.dtypes.ScheduleType.GPU_Device:
            for j in dace.map[0:256] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:
                C[i + j] = A[i + j] + B[i + j]

    # Create original SDFG
    original_sdfg = kernel.to_sdfg(use_cache=False, simplify=False)
    original_sdfg.simplify()
    original_sdfg.save("original_sdfg.sdfg")


    # Create transformed SDFG
    transformed_sdfg = copy.deepcopy(original_sdfg)
    _add_shared_memory(transformed_sdfg)
    transformed_sdfg.save("transformed_sdfg_with_shared_memory.sdfg")

    # Validate both SDFGs
    original_sdfg.validate()
    transformed_sdfg.validate()

    # Initialize data
    cp.random.seed(42)
    vals_A_orig = cp.fromfunction(lambda i: (i * 2) / N_val, (N_val,), dtype=cp.float64)
    vals_B_orig = cp.fromfunction(lambda i: (i * 3) / N_val, (N_val,), dtype=cp.float64)
    vals_C_orig = cp.fromfunction(lambda i: (i * 5) / N_val, (N_val,), dtype=cp.float64)

    vals_A_2 = vals_A_orig.copy()
    vals_B_2 = vals_B_orig.copy()
    vals_C_2 = vals_C_orig.copy()

    # Execute SDFGs
    original_sdfg(A=vals_A_orig, B=vals_B_orig, C=vals_C_orig, N=N_val)
    transformed_sdfg(A=vals_A_2, B=vals_B_2, C=vals_C_2, N=N_val)

    # Check results (C is the only output)
    results_match = cp.allclose(vals_C_orig, vals_C_2, rtol=1e-10, atol=1e-12)

    print(f"C results match: {results_match}")

    if results_match:
        print("✅ All tests passed! Shared Memory transformations preserve correctness.")
    else:
        print("❌ Test failed! Results differ between original and transformed SDFGs.")
        print(f"C max difference: {cp.max(cp.abs(vals_C_orig - vals_C_2))}")
        print(f"C difference: {cp.abs(vals_C_orig - vals_C_2)}")

    return results_match


if __name__ == "__main__":
    success = test_standalone_execution()
    exit(0 if success else 1)
