Description
I found a relatively simple case where memlet src and dst inference fails.
This is a simple copy from src (A @ GPU_Global) to dst (shr_A @ GPU_Shared), but regardless of how I specify the subsets, codegen produces wrong code.
e1 = state.add_edge(a1, None, a2, None, dace.Memlet(
    data=in_arr_name,  # inference fails whether A or shr_A is provided here
    subset=in_edge.data.subset,  # is [i:i+256]
    other_subset=dace.subsets.Range(copy_shape),  # is [0:256]
    wcr=None,
))
Inference fails, and the copy is generated from the source location to the source location:
/home/primrose/Work/DaceLayoutAndScheduleTransformations/.dacecache/kernel_316dd2c1ec53ead0f61901eb4d0e3aad/src/cuda/kernel_cuda.cu(101): error: no instance of function template "dace::GlobalToGlobal1D" matches the argument list
argument types are: (const double *, int, const double *__restrict__)
dace::GlobalToGlobal1D<double, 256, 1, 1, 256, 1, false>(A + i, 1, A);
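For reference, this is a sketch of the destination-anchored variant I also tried (shr_A as the memlet data, with the subsets swapped accordingly); it hits the same inference failure:

e1 = state.add_edge(a1, None, a2, None, dace.Memlet(
    data=shared_mem_name,  # shr_A instead of A
    subset=dace.subsets.Range(copy_shape),  # is [0:256], on the destination
    other_subset=in_edge.data.subset,  # is [i:i+256], on the source
    wcr=None,
))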
I think the correct way to solve this would be to make the canonical form of a memlet carry src_data, dst_data, subset, and other_subset. That matches how a copy is actually expressed in code: a source location, a destination location, and a subset for each side.
I want to discuss it before I start a PR.
For backwards-compatibility reasons, keeping the inference probably makes sense, but we should steer users toward one form and declare it the canonical form of a memlet. A sketch of what this could look like follows.
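To make the proposal concrete (src_data and dst_data are hypothetical keyword arguments, not the current dace.Memlet API):

# Hypothetical canonical form: both endpoints and both subsets are explicit,
# so codegen never has to infer which side of the copy the memlet describes.
m = dace.Memlet(
    src_data=in_arr_name,  # A (hypothetical kwarg)
    dst_data=shared_mem_name,  # shr_A (hypothetical kwarg)
    subset=in_edge.data.subset,  # [i:i+256] on the source
    other_subset=dace.subsets.Range(copy_shape),  # [0:256] on the destination
)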
The SDFG is attached (remove the .txt extension to run it): transformed_sdfg_with_shared_memory.sdfg.txt
The complete script to reproduce is here:
import copy

import cupy as cp

import dace
def _add_shared_memory(sdfg: dace.SDFG):
    for state in sdfg.all_states():
        for node in state.nodes():
            if isinstance(node, dace.nodes.MapEntry) and node.map.schedule == dace.dtypes.ScheduleType.GPU_ThreadBlock:
                # Find the next map entry reachable from this node
                next_map = None
                for n in state.bfs_nodes(node):
                    if isinstance(n, dace.nodes.MapEntry):
                        next_map = n
                        break
                    elif isinstance(n, dace.nodes.MapExit):
                        break
                if next_map is None:
                    raise ValueError("No next map found for the GPU_ThreadBlock map entry.")

                src_name_dst_name_offset = dict()
                edges_to_rm = set()
                for in_edge in state.in_edges(node):
                    if in_edge.data is not None:
                        in_arr_name = in_edge.data.data
                        # Zero-based copy shape, plus the shape and offset of the shared-memory array
                        copy_shape = [(0, (e - b) // s, 1) for b, e, s in in_edge.data.subset]
                        copied_shape = [((e + 1) - b) // s for b, e, s in in_edge.data.subset]
                        copy_offset = [b for b, _, _ in in_edge.data.subset]
                        shared_mem_name = "shr_" + in_arr_name
                        in_arr = sdfg.arrays[in_arr_name]
                        if shared_mem_name not in sdfg.arrays:
                            sdfg.add_array(shared_mem_name, copied_shape, in_arr.dtype,
                                           storage=dace.dtypes.StorageType.GPU_Shared, transient=True)
                        # Route the input through a global -> shared copy
                        a1 = state.add_access(in_arr_name)
                        a2 = state.add_access(shared_mem_name)
                        e1 = state.add_edge(a1, None, a2, None, dace.Memlet(
                            data=in_arr_name,
                            subset=in_edge.data.subset,
                            other_subset=dace.subsets.Range(copy_shape),
                            wcr=None,
                        ))
                        e2 = state.add_edge(a2, None, next_map, in_edge.dst_conn,
                                            dace.Memlet.from_array(shared_mem_name,
                                                                   sdfg.arrays[shared_mem_name]))
                        e3 = state.add_edge(in_edge.src, in_edge.src_conn, a1, None,
                                            copy.deepcopy(in_edge.data))
                        edges_to_rm.add(in_edge)
                        src_name_dst_name_offset[in_arr_name] = (shared_mem_name, copy_offset)

                # Rewrite memlets inside the map to refer to the shared-memory array
                nodes = state.all_nodes_between(next_map, state.exit_node(next_map))
                for edge in state.all_edges(*nodes):
                    if edge.data is not None and edge.data.data in src_name_dst_name_offset:
                        dst_name, offset = src_name_dst_name_offset[edge.data.data]
                        edge.data.data = dst_name
                        old_subset = [(b, e, s) for b, e, s in edge.data.subset]
                        new_subset = [(b - offset[i], e - offset[i], s)
                                      for i, (b, e, s) in enumerate(old_subset)]
                        edge.data.subset = dace.subsets.Range(new_subset)

                for edge in edges_to_rm:
                    state.remove_edge(edge)
def test_standalone_execution():
    """Standalone test function that can be run without pytest."""
    print("Running standalone shared-memory transformation test...")

    # Setup
    dace.Config.set('cache', value='unique')

    # Create kernel
    N = dace.symbol("N", dtype=dace.int64)
    N_val = 1024

    @dace.program
    def kernel(
        A: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global,
        B: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global,
        C: dace.float64[N] @ dace.dtypes.StorageType.GPU_Global,
    ):
        for i in dace.map[0:N:256] @ dace.dtypes.ScheduleType.GPU_Device:
            for j in dace.map[0:256] @ dace.dtypes.ScheduleType.GPU_ThreadBlock:
                C[i + j] = A[i + j] + B[i + j]

    # Create original SDFG
    original_sdfg = kernel.to_sdfg(use_cache=False, simplify=False)
    original_sdfg.simplify()
    original_sdfg.save("original_sdfg.sdfg")

    # Create transformed SDFG
    transformed_sdfg = copy.deepcopy(original_sdfg)
    _add_shared_memory(transformed_sdfg)
    transformed_sdfg.save("transformed_sdfg_with_shared_memory.sdfg")

    # Validate SDFGs
    original_sdfg.validate()
    transformed_sdfg.validate()

    # Initialize data
    cp.random.seed(42)
    vals_A_orig = cp.fromfunction(lambda i: (i * 2) / N_val, (N_val,), dtype=cp.float64)
    vals_B_orig = cp.fromfunction(lambda i: (i * 3) / N_val, (N_val,), dtype=cp.float64)
    vals_C_orig = cp.fromfunction(lambda i: (i * 5) / N_val, (N_val,), dtype=cp.float64)
    vals_A_2 = vals_A_orig.copy()
    vals_B_2 = vals_B_orig.copy()
    vals_C_2 = vals_C_orig.copy()

    # Execute SDFGs
    original_sdfg(A=vals_A_orig, B=vals_B_orig, C=vals_C_orig, N=N_val)
    transformed_sdfg(A=vals_A_2, B=vals_B_2, C=vals_C_2, N=N_val)

    # Check results (C holds the output of both runs)
    vals_C_close = cp.allclose(vals_C_orig, vals_C_2, rtol=1e-10, atol=1e-12)
    print(f"vals_C results match: {vals_C_close}")
    if vals_C_close:
        print("✅ Test passed! The shared-memory transformation preserves correctness.")
    else:
        print("❌ Test failed! Results differ between original and transformed SDFGs.")
        print(f"vals_C max difference: {cp.max(cp.abs(vals_C_orig - vals_C_2))}")
    return vals_C_close


if __name__ == "__main__":
    success = test_standalone_execution()
    exit(0 if success else 1)