From 2b058eedd4d1c7831aaad7e30041ed3b9e74d28d Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Fri, 31 Mar 2023 14:55:00 +0100
Subject: [PATCH 001/133] add slurm and input file

---
 cpu.job | 33 +++++++++++++++++++++++++++++++++
 gpu.job | 17 +++++++++++++++++
 input   | 19 +++++++++++++++++++
 3 files changed, 69 insertions(+)
 create mode 100644 cpu.job
 create mode 100644 gpu.job
 create mode 100644 input

diff --git a/cpu.job b/cpu.job
new file mode 100644
index 000000000..b094c01aa
--- /dev/null
+++ b/cpu.job
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+#SBATCH --account=m22oc-S2329216
+#SBATCH --job-name=Xad
+#SBATCH --time=00:40:00
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --tasks-per-node=1
+#SBATCH --cpus-per-task=18
+
+#SBATCH --qos=standard
+#SBATCH --partition=standard
+
+#SBATCH --output=%x-%j.out
+#SBATCH --error=%x-%j.err
+#SBATCH --exclusive
+
+#SBATCH --hint=nomultithread
+#SBATCH --distribution=block:block
+
+module --silent load intel-compilers-19
+module --silent load mpt
+#module load gcc
+
+
+# Set the number of threads 
+export OMP_NUM_THREADS=18
+export OMP_PLACES=cores
+
+
+srun  ./src/Ludwig.exe input
+
+
diff --git a/gpu.job b/gpu.job
new file mode 100644
index 000000000..362a43557
--- /dev/null
+++ b/gpu.job
@@ -0,0 +1,17 @@
+#!/bin/bash
+#SBATCH --account=m22oc-S2329216
+#SBATCH --job-name=ludwig_test
+#SBATCH --gres=gpu:1
+#SBATCH --time=00:10:00
+#SBATCH --partition=gpu
+#SBATCH --qos=short
+
+module load gcc nvidia/nvhpc/22.11
+
+# cd tests
+# make -k d3q19-short
+
+./src/Ludwig.exe input
+
+
+
diff --git a/input b/input
new file mode 100644
index 000000000..e4f4b19ab
--- /dev/null
+++ b/input
@@ -0,0 +1,19 @@
+N_cycles        1000
+
+size            128_64_64
+
+viscosity       0.1
+
+free_energy     none
+
+colloid_init    no_colloids
+
+periodicity     1_1_1
+
+freq_statistics 500
+config_at_end   no
+
+N_LE_plane      0
+LE_plane_vel    0.05
+
+random_seed     7361237
\ No newline at end of file

From 440514caa01cd855cffc30dd483e0ad06b6e1b84 Mon Sep 17 00:00:00 2001
From: Anton <a.xiao@sms.ed.ac.uk>
Date: Tue, 11 Apr 2023 13:08:20 +0100
Subject: [PATCH 002/133] Update README.md

---
 README.md | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 569fe8cd0..2c7f0d70f 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,3 @@
-
 ### Ludwig
 
 A lattice Boltzmann code for complex fluids
@@ -88,3 +87,14 @@ under an MIT license by Gave Gamble at https://github.com/DaveGamble/cJSON.
 For bug reports, problems, and other issues, please open a new issue.
 
 
+## For MSc projects
+Our goal is to inspect the code in-depth, locate the root cause of the problem, and implement necessary fixes, which may require a complete reimplementation of the GPU version of the code. Ultimately, our objective is to create a well-optimized GPU version of the 'Lees Edwards BC' method, which will enable us to achieve significant performance enhancements and good scalability.
+
+Student: Andong Xiao 
+Supervisor: Kevin Stanford
+
+### Relevant document can be seen from
+[Wiki](https://git.ecdf.ed.ac.uk/s2329216/msc_projects_document.git)
+
+
+

From 9b577fbdd3b98a3bb4c4a6a371e6583da895a1bd Mon Sep 17 00:00:00 2001
From: Anton <a.xiao@sms.ed.ac.uk>
Date: Tue, 11 Apr 2023 13:11:06 +0100
Subject: [PATCH 003/133] Update README.md

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 2c7f0d70f..e56ca0af6 100644
--- a/README.md
+++ b/README.md
@@ -87,14 +87,14 @@ under an MIT license by Gave Gamble at https://github.com/DaveGamble/cJSON.
 For bug reports, problems, and other issues, please open a new issue.
 
 
+
 ## For MSc projects
-Our goal is to inspect the code in-depth, locate the root cause of the problem, and implement necessary fixes, which may require a complete reimplementation of the GPU version of the code. Ultimately, our objective is to create a well-optimized GPU version of the 'Lees Edwards BC' method, which will enable us to achieve significant performance enhancements and good scalability.
+Our goal is to inspect the code in-depth, locate the root cause of poor performance of some part of the code, and implement necessary fixes, which may require a complete reimplementation of the GPU version of the code. Ultimately, our objective is to create a well-optimized GPU version of the 'Lees Edwards BC' method, which will enable us to achieve significant performance enhancements and good scalability.
 
 Student: Andong Xiao 
 Supervisor: Kevin Stanford
 
-### Relevant document can be seen from
-[Wiki](https://git.ecdf.ed.ac.uk/s2329216/msc_projects_document.git)
+Relevant document can be seen from:  [Wiki](https://git.ecdf.ed.ac.uk/s2329216/msc_projects_document.git)
 
 
 

From 86e9d51cfdaf7b3494b375d814f6170d8eb07b38 Mon Sep 17 00:00:00 2001
From: Anton <a.xiao@sms.ed.ac.uk>
Date: Tue, 11 Apr 2023 21:20:22 +0100
Subject: [PATCH 004/133] Update README.md

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e56ca0af6..f1a223661 100644
--- a/README.md
+++ b/README.md
@@ -92,9 +92,10 @@ For bug reports, problems, and other issues, please open a new issue.
 Our goal is to inspect the code in-depth, locate the root cause of poor performance of some part of the code, and implement necessary fixes, which may require a complete reimplementation of the GPU version of the code. Ultimately, our objective is to create a well-optimized GPU version of the 'Lees Edwards BC' method, which will enable us to achieve significant performance enhancements and good scalability.
 
 Student: Andong Xiao 
+
 Supervisor: Kevin Stanford
 
-Relevant document can be seen from:  [Wiki](https://git.ecdf.ed.ac.uk/s2329216/msc_projects_document.git)
+Relevant documents including meeting blog, questions and some assessments materials can be seen from:  [Wiki](https://git.ecdf.ed.ac.uk/s2329216/msc_projects_document.git), which is maintained in another repository.
 
 
 

From dc62aea4de46ddbbbec776e1205d434f717de52a Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Thu, 8 Jun 2023 20:14:23 +0100
Subject: [PATCH 005/133] input: set N_LE_plane to 1

---
 input | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/input b/input
index e4f4b19ab..3a146ef09 100644
--- a/input
+++ b/input
@@ -13,7 +13,7 @@ periodicity     1_1_1
 freq_statistics 500
 config_at_end   no
 
-N_LE_plane      0
+N_LE_plane      1
 LE_plane_vel    0.05
 
 random_seed     7361237
\ No newline at end of file

From b2a947fc1f74000947beddbdd2a6e299efd78d71 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Sat, 10 Jun 2023 14:00:07 +0100
Subject: [PATCH 006/133] le_reproject: move jc/kc loops outside plane/side loops (tested OK); reformat model_le.c; add test.job

---
 src/model_le.c | 1058 ++++++++++++++++++++++++------------------------
 test.job       |   18 +
 2 files changed, 546 insertions(+), 530 deletions(-)
 create mode 100644 test.job

diff --git a/src/model_le.c b/src/model_le.c
index 57ce04750..893f7dcce 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -16,24 +16,24 @@
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *  J.-C. Desplat and Ronojoy Adhikari developed the reprojection method.
- * 
+ *
  *****************************************************************************/
 
 #include <assert.h>
-#include <stdlib.h>
 #include <math.h>
+#include <stdlib.h>
 
-#include "pe.h"
-#include "timer.h"
-#include "coords.h"
 #include "control.h"
-#include "physics.h"
+#include "coords.h"
 #include "model_le.h"
+#include "pe.h"
+#include "physics.h"
+#include "timer.h"
 #include "util.h"
 
-static int le_reproject(lb_t * lb, lees_edw_t * le);
-static int le_displace_and_interpolate(lb_t * lb, lees_edw_t * le);
-static int le_displace_and_interpolate_parallel(lb_t * lb, lees_edw_t * le);
+static int le_reproject(lb_t *lb, lees_edw_t *le);
+static int le_displace_and_interpolate(lb_t *lb, lees_edw_t *le);
+static int le_displace_and_interpolate_parallel(lb_t *lb, lees_edw_t *le);
 
 /*****************************************************************************
  *
@@ -54,38 +54,38 @@ static int le_displace_and_interpolate_parallel(lb_t * lb, lees_edw_t * le);
  *
  *****************************************************************************/
 
-__host__ int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le) {
+__host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
 
-  int mpi_cartsz[3];
+    int mpi_cartsz[3];
 
-  assert(lb);
-  assert(le);
+    assert(lb);
+    assert(le);
 
-  lees_edw_cartsz(le, mpi_cartsz);
+    lees_edw_cartsz(le, mpi_cartsz);
 
-  if (lees_edw_nplane_local(le) > 0) {
+    if (lees_edw_nplane_local(le) > 0) {
 
-    TIMER_start(TIMER_LE);
+        TIMER_start(TIMER_LE);
 
-    /* Everything must be done on host at the moment (slowly) ... */
-    /* ... and copy back at the end */
-    lb_memcpy(lb, tdpMemcpyDeviceToHost);
+        /* Everything must be done on host at the moment (slowly) ... */
+        /* ... and copy back at the end */
+        lb_memcpy(lb, tdpMemcpyDeviceToHost);
 
-    le_reproject(lb, le);
+        le_reproject(lb, le);
 
-    if (mpi_cartsz[Y] > 1) {
-      le_displace_and_interpolate_parallel(lb, le);
-    }
-    else {
-      le_displace_and_interpolate(lb, le);
-    }
+        if (mpi_cartsz[Y] > 1) {
+            le_displace_and_interpolate_parallel(lb, le);
+        }
+        else {
+            le_displace_and_interpolate(lb, le);
+        }
 
-    lb_memcpy(lb, tdpMemcpyHostToDevice);
+        lb_memcpy(lb, tdpMemcpyHostToDevice);
 
-    TIMER_stop(TIMER_LE);
-  }
+        TIMER_stop(TIMER_LE);
+    }
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -102,108 +102,107 @@ __host__ int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le) {
  *     S_ab -> S_ab +/- rho u_a u^le_b +/- rho u_b u^le_a + rho u^le_a u^le_b
  *
  *  with analogous expressions for order parameter moments.
- * 
+ *
  *  The change to the distribution is then computed by a reprojection.
  *  Ghost modes are unchanged.
- * 	    	  
+ *
  *****************************************************************************/
 
-static int le_reproject(lb_t * lb, lees_edw_t * le) {
-
-  int    ic, jc, kc, index;
-  int    nplane, plane, side;
-  int    ia, ib;
-  int    nlocal[3];
-  int    n, ndist;
-  int8_t cx = 0;
-
-  double rho, ds[3][3], udotc, sdotq;
-  double g[3], du[3];
-  double fnew;
-  double t;
-  physics_t * phys = NULL;
-
-  assert(lb);
-  assert(le);
-
-  lb_ndist(lb, &ndist);
-  nplane = lees_edw_nplane_local(le);
-  physics_ref(&phys);
-
-  t = 1.0*physics_control_timestep(phys);
-  lees_edw_nlocal(le, nlocal);
-
-  for (plane = 0; plane < nplane; plane++) {
-    for (side = 0; side < 2; side++) {
-
-      du[X] = 0.0;
-      du[Y] = 0.0; 
-      du[Z] = 0.0;
-
-      if (side == 0) {
-	/* Start with plane below Lees-Edwards BC */
-	lees_edw_plane_uy_now(le, t, &du[Y]);
-	du[Y] *= -1.0;
-	ic = lees_edw_plane_location(le, plane);
-	cx = +1;
-      }
-      else {
-	/* Finally, deal with plane above LEBC */
-	lees_edw_plane_uy_now(le, t, &du[Y]);
-	ic = lees_edw_plane_location(le, plane) + 1;
-	cx = -1;
-      }
-
-      for (jc = 1; jc <= nlocal[Y]; jc++) {
-	for (kc = 1; kc <= nlocal[Z]; kc++) {
-	  
-	  index = lees_edw_index(le, ic, jc, kc);
-
-	  for (n = 0; n < ndist; n++) {
-
-	    /* Compute 0th and 1st moments */
-	    lb_dist_enum_t ndn = (lb_dist_enum_t) n;
-	    lb_0th_moment(lb, index, ndn, &rho);
-	    lb_1st_moment(lb, index, ndn, g);
-
-	    for (ia = 0; ia < 3; ia++) {
-	      for (ib = 0; ib < 3; ib++) {
-		ds[ia][ib] = (g[ia]*du[ib] + du[ia]*g[ib] + rho*du[ia]*du[ib]);
-	      }
-	    }
-
-	    /* Now update the distribution */
-	    for (int p = 1; p < lb->model.nvel; p++) {
-
-	      double cs2 = lb->model.cs2;
-	      double rcs2 = 1.0/cs2;
-	      if (lb->model.cv[p][X] != cx) continue;
-
-	      udotc = du[Y]*lb->model.cv[p][Y];
-	      sdotq = 0.0;
-	      
-	      for (ia = 0; ia < 3; ia++) {
-		for (ib = 0; ib < 3; ib++) {
-		  double dab = cs2*(ia == ib);
-		  double q = (lb->model.cv[p][ia]*lb->model.cv[p][ib] - dab);
-		  sdotq += ds[ia][ib]*q;
-		}
-	      }
-
-	      /* Project all this back to the distribution. */
-
-	      lb_f(lb, index, p, n, &fnew);
-	      fnew += lb->model.wv[p]*(rho*udotc*rcs2 + 0.5*sdotq*rcs2*rcs2);
-	      lb_f_set(lb, index, p, n, fnew);
-	    }
-	  }
-	  /* next site */
-	}
-      }
-    }
-  }
+static int le_reproject(lb_t *lb, lees_edw_t *le) {
+
+    int ic, jc, kc, index;
+    int nplane, plane, side;
+    int ia, ib;
+    int nlocal[3];
+    int n, ndist;
+    int8_t cx = 0;
+
+    double rho, ds[3][3], udotc, sdotq;
+    double g[3], du[3];
+    double fnew;
+    double t;
+    physics_t *phys = NULL;
+
+    assert(lb);
+    assert(le);
 
-  return 0;
+    lb_ndist(lb, &ndist);
+    nplane = lees_edw_nplane_local(le);
+    physics_ref(&phys);
+
+    t = 1.0 * physics_control_timestep(phys);
+    lees_edw_nlocal(le, nlocal);
+    
+    for (jc = 1; jc <= nlocal[Y]; jc++) {
+        for (kc = 1; kc <= nlocal[Z]; kc++) {
+            for (plane = 0; plane < nplane; plane++) {
+                for (side = 0; side < 2; side++) {
+
+                    du[X] = 0.0;
+                    du[Y] = 0.0;
+                    du[Z] = 0.0;
+
+                    if (side == 0) {
+                        /* Start with plane below Lees-Edwards BC */
+                        lees_edw_plane_uy_now(le, t, &du[Y]);
+                        du[Y] *= -1.0;
+                        ic = lees_edw_plane_location(le, plane);
+                        cx = +1;
+                    }
+                    else {
+                        /* Finally, deal with plane above LEBC */
+                        lees_edw_plane_uy_now(le, t, &du[Y]);
+                        ic = lees_edw_plane_location(le, plane) + 1;
+                        cx = -1;
+                    }
+
+                    index = lees_edw_index(le, ic, jc, kc);
+
+                    for (n = 0; n < ndist; n++) {
+
+                        /* Compute 0th and 1st moments */
+                        lb_dist_enum_t ndn = (lb_dist_enum_t)n;
+                        lb_0th_moment(lb, index, ndn, &rho);
+                        lb_1st_moment(lb, index, ndn, g);
+
+                        for (ia = 0; ia < 3; ia++) {
+                            for (ib = 0; ib < 3; ib++) {
+                                ds[ia][ib] = (g[ia] * du[ib] + du[ia] * g[ib] + rho * du[ia] * du[ib]);
+                            }
+                        }
+
+                        /* Now update the distribution */
+                        for (int p = 1; p < lb->model.nvel; p++) {
+
+                            double cs2 = lb->model.cs2;
+                            double rcs2 = 1.0 / cs2;
+                            if (lb->model.cv[p][X] != cx)
+                                continue;
+
+                            udotc = du[Y] * lb->model.cv[p][Y];
+                            sdotq = 0.0;
+
+                            for (ia = 0; ia < 3; ia++) {
+                                for (ib = 0; ib < 3; ib++) {
+                                    double dab = cs2 * (ia == ib);
+                                    double q = (lb->model.cv[p][ia] * lb->model.cv[p][ib] - dab);
+                                    sdotq += ds[ia][ib] * q;
+                                }
+                            }
+
+                            /* Project all this back to the distribution. */
+
+                            lb_f(lb, index, p, n, &fnew);
+                            fnew += lb->model.wv[p] * (rho * udotc * rcs2 + 0.5 * sdotq * rcs2 * rcs2);
+                            lb_f_set(lb, index, p, n, fnew);
+                        }
+                    }
+                }
+            }
+            /* next site */
+        }
+    }
+    return 0;
 }
 
 /*****************************************************************************
@@ -216,166 +215,167 @@ static int le_reproject(lb_t * lb, lees_edw_t * le) {
  *
  *****************************************************************************/
 
-int le_displace_and_interpolate(lb_t * lb, lees_edw_t * le) {
-
-  int    ic, jc, kc;
-  int    index0, index1;
-  int    nlocal[3];
-  int    n, nplane, plane;
-  int    jdy, j1, j2;
-  int    ndist;
-  int    nprop;
-  int    ndata;
-  int    nhalo;
-  double dy, fr;
-  double t;
-  double ltot[3];
-  double * recv_buff;
-  physics_t * phys = NULL;
-
-  assert(lb);
-  assert(le);
-
-  lees_edw_ltot(le, ltot);
-  lees_edw_nlocal(le, nlocal);
-  lees_edw_nhalo(le, &nhalo);
-  nplane = lees_edw_nplane_local(le);
-  physics_ref(&phys);
-
-  t = 1.0*physics_control_timestep(phys);
-
-  /* We need to interpolate into a temporary buffer to make sure we
-   * don't overwrite distributions taking part. The size is just
-   * determined by the size of the local domain, and the number
-   * of plane-crossing distributions. */
-
-  lb_ndist(lb, &ndist);
-
-  /* Allocate a buffer large enough for all cvp[][X] = +1 */
-
-  nprop = 0;
-  for (int p = 1; p < lb->model.nvel; p++) {
-    if (lb->model.cv[p][X] == +1) nprop += 1;
-  }
-
-  ndata = ndist*nprop*nlocal[Y]*nlocal[Z];
-  recv_buff = (double *) malloc(ndata*sizeof(double));
-  assert(recv_buff);
-  if (recv_buff == NULL) pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
-
-  for (plane = 0; plane < nplane; plane++) {
- 
-    ic  = lees_edw_plane_location(le, plane);
-
-    lees_edw_buffer_displacement(le, nhalo, t, &dy);
-    dy  = fmod(dy, ltot[Y]);
-    jdy = floor(dy);
-    fr = dy - jdy;
-
-    ndata = 0;
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-
-      j1 = 1 + (jc + jdy - 1 + 2*nlocal[Y]) % nlocal[Y];
-      j2 = 1 + (j1 % nlocal[Y]);
-
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	index0 = lees_edw_index(le, ic, j1, kc);
-	index1 = lees_edw_index(le, ic, j2, kc);
-		  
-	/* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
-
-	for (n = 0; n < ndist; n++) {
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] != +1) continue;
-	    recv_buff[ndata++] = (1.0 - fr)*
-	      lb->f[LB_ADDR(lb->nsite,ndist,lb->model.nvel,index0,n, p)]
-	      + fr*
-	      lb->f[LB_ADDR(lb->nsite,ndist,lb->model.nvel,index1,n, p)];
-	  }
-	}
-	/* Next site */
-      }
-    }
-
-    /* ...and copy back ... */
-
-    ndata = 0;
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	index0 = lees_edw_index(le, ic, jc, kc);
-
-	for (n = 0; n < ndist; n++) {
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] != +1) continue;
-	    int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-	    lb->f[la] = recv_buff[ndata++];
-	  }
-	}
-	/* Next site */
-      }
-    }
-
-
-    /* OTHER DIRECTION */
- 
-    ic  = lees_edw_plane_location(le, plane) + 1;
-
-    lees_edw_buffer_displacement(le, nhalo, t, &dy);
-    dy  = fmod(-dy, ltot[Y]);
-    jdy = floor(dy);
-    fr = dy - jdy;
-
-    ndata = 0;
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-
-      j1 = 1 + (jc + jdy - 1 + 2*nlocal[Y]) % nlocal[Y];
-      j2 = 1 + (j1 % nlocal[Y]) ;
-
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	index0 = lees_edw_index(le, ic, j1, kc);
-	index1 = lees_edw_index(le, ic, j2, kc);
-
-	for (n = 0; n < ndist; n++) {
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] == -1) {
-	      int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-	      int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p);
-	      recv_buff[ndata++] = (1.0 - fr)*lb->f[l0] + fr*lb->f[l1];
-	    }
-	  }
-	}
-	/* Next site */
-      }
+int le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
+
+    int ic, jc, kc;
+    int index0, index1;
+    int nlocal[3];
+    int n, nplane, plane;
+    int jdy, j1, j2;
+    int ndist;
+    int nprop;
+    int ndata;
+    int nhalo;
+    double dy, fr;
+    double t;
+    double ltot[3];
+    double *recv_buff;
+    physics_t *phys = NULL;
+
+    assert(lb);
+    assert(le);
+
+    lees_edw_ltot(le, ltot);
+    lees_edw_nlocal(le, nlocal);
+    lees_edw_nhalo(le, &nhalo);
+    nplane = lees_edw_nplane_local(le);
+    physics_ref(&phys);
+
+    t = 1.0 * physics_control_timestep(phys);
+
+    /* We need to interpolate into a temporary buffer to make sure we
+     * don't overwrite distributions taking part. The size is just
+     * determined by the size of the local domain, and the number
+     * of plane-crossing distributions. */
+
+    lb_ndist(lb, &ndist);
+
+    /* Allocate a buffer large enough for all cvp[][X] = +1 */
+
+    nprop = 0;
+    for (int p = 1; p < lb->model.nvel; p++) {
+        if (lb->model.cv[p][X] == +1)
+            nprop += 1;
     }
 
-    /* ...and now overwrite... */
+    ndata = ndist * nprop * nlocal[Y] * nlocal[Z];
+    recv_buff = (double *)malloc(ndata * sizeof(double));
+    assert(recv_buff);
+    if (recv_buff == NULL)
+        pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
 
-    ndata = 0;
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	index0 = lees_edw_index(le, ic, jc, kc);
-
-	for (n = 0; n < ndist; n++) {
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] == -1) {
-	      int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel,index0,n,p);
-	      lb->f[ijkp] = recv_buff[ndata++];
-	    }
-	  }
-	}
-      }
+    for (plane = 0; plane < nplane; plane++) {
+
+        ic = lees_edw_plane_location(le, plane);
+
+        lees_edw_buffer_displacement(le, nhalo, t, &dy);
+        dy = fmod(dy, ltot[Y]);
+        jdy = floor(dy);
+        fr = dy - jdy;
+
+        ndata = 0;
+        for (jc = 1; jc <= nlocal[Y]; jc++) {
+
+            j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+            j2 = 1 + (j1 % nlocal[Y]);
+
+            for (kc = 1; kc <= nlocal[Z]; kc++) {
+
+                index0 = lees_edw_index(le, ic, j1, kc);
+                index1 = lees_edw_index(le, ic, j2, kc);
+
+                /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
+
+                for (n = 0; n < ndist; n++) {
+                    for (int p = 1; p < lb->model.nvel; p++) {
+                        if (lb->model.cv[p][X] != +1)
+                            continue;
+                        recv_buff[ndata++] = (1.0 - fr) * lb->f[LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p)] +
+                                             fr * lb->f[LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p)];
+                    }
+                }
+                /* Next site */
+            }
+        }
+
+        /* ...and copy back ... */
+
+        ndata = 0;
+        for (jc = 1; jc <= nlocal[Y]; jc++) {
+            for (kc = 1; kc <= nlocal[Z]; kc++) {
+
+                index0 = lees_edw_index(le, ic, jc, kc);
+
+                for (n = 0; n < ndist; n++) {
+                    for (int p = 1; p < lb->model.nvel; p++) {
+                        if (lb->model.cv[p][X] != +1)
+                            continue;
+                        int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
+                        lb->f[la] = recv_buff[ndata++];
+                    }
+                }
+                /* Next site */
+            }
+        }
+
+        /* OTHER DIRECTION */
+
+        ic = lees_edw_plane_location(le, plane) + 1;
+
+        lees_edw_buffer_displacement(le, nhalo, t, &dy);
+        dy = fmod(-dy, ltot[Y]);
+        jdy = floor(dy);
+        fr = dy - jdy;
+
+        ndata = 0;
+        for (jc = 1; jc <= nlocal[Y]; jc++) {
+
+            j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+            j2 = 1 + (j1 % nlocal[Y]);
+
+            for (kc = 1; kc <= nlocal[Z]; kc++) {
+
+                index0 = lees_edw_index(le, ic, j1, kc);
+                index1 = lees_edw_index(le, ic, j2, kc);
+
+                for (n = 0; n < ndist; n++) {
+                    for (int p = 1; p < lb->model.nvel; p++) {
+                        if (lb->model.cv[p][X] == -1) {
+                            int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
+                            int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p);
+                            recv_buff[ndata++] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+                        }
+                    }
+                }
+                /* Next site */
+            }
+        }
+
+        /* ...and now overwrite... */
+
+        ndata = 0;
+        for (jc = 1; jc <= nlocal[Y]; jc++) {
+            for (kc = 1; kc <= nlocal[Z]; kc++) {
+
+                index0 = lees_edw_index(le, ic, jc, kc);
+
+                for (n = 0; n < ndist; n++) {
+                    for (int p = 1; p < lb->model.nvel; p++) {
+                        if (lb->model.cv[p][X] == -1) {
+                            int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
+                            lb->f[ijkp] = recv_buff[ndata++];
+                        }
+                    }
+                }
+            }
+        }
+
+        /* Next plane */
     }
 
-    /* Next plane */
-  }
+    free(recv_buff);
 
-  free(recv_buff);
-
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -392,246 +392,244 @@ int le_displace_and_interpolate(lb_t * lb, lees_edw_t * le) {
  *
  *  Likewise, we need to send a total of (nlocal[Y] + 1) points to the
  *  two corresponding recieving processes. Note we never involve the
- *  halo regions here (so a preceeding halo exchange is not required). 
+ *  halo regions here (so a preceeding halo exchange is not required).
  *
  *****************************************************************************/
 
-static int le_displace_and_interpolate_parallel(lb_t * lb, lees_edw_t * le) {
-
-  int ic, jc, kc;
-  int j1, j1mod;
-  int jdy;
-  int n1, n2;
-  int ndata, ndata1, ndata2;
-  int nhalo;
-  int ind0, ind1, ind2, index;
-  int n, nplane, plane;
-  int ntotal[3];
-  int nlocal[3];
-  int offset[3];
-  int nrank_s[3], nrank_r[3];
-  int nprop;
-  int ndist;
-
-  const int tag1 = 3102;
-  const int tag2 = 3103;
-
-  double fr;
-  double dy;
-  double t;
-  double ltot[3];
-  double * send_buff;
-  double * recv_buff;
-
-  physics_t * phys = NULL;
-  MPI_Comm    comm;
-  MPI_Request req[4];
-  MPI_Status status[4];
-
-  assert(lb);
-  assert(le);
-
-  lees_edw_ltot(le, ltot);
-  lees_edw_ntotal(le, ntotal);
-  lees_edw_nlocal(le, nlocal);
-  lees_edw_nhalo(le, &nhalo);
-  lees_edw_nlocal_offset(le, offset);
-
-  nplane = lees_edw_nplane_local(le);
-  lees_edw_comm(le, &comm);
-
-  physics_ref(&phys);
-
-  t = 1.0*physics_control_timestep(phys);
-  lb_ndist(lb, &ndist);
-
+static int le_displace_and_interpolate_parallel(lb_t *lb, lees_edw_t *le) {
+
+    int ic, jc, kc;
+    int j1, j1mod;
+    int jdy;
+    int n1, n2;
+    int ndata, ndata1, ndata2;
+    int nhalo;
+    int ind0, ind1, ind2, index;
+    int n, nplane, plane;
+    int ntotal[3];
+    int nlocal[3];
+    int offset[3];
+    int nrank_s[3], nrank_r[3];
+    int nprop;
+    int ndist;
+
+    const int tag1 = 3102;
+    const int tag2 = 3103;
+
+    double fr;
+    double dy;
+    double t;
+    double ltot[3];
+    double *send_buff;
+    double *recv_buff;
+
+    physics_t *phys = NULL;
+    MPI_Comm comm;
+    MPI_Request req[4];
+    MPI_Status status[4];
+
+    assert(lb);
+    assert(le);
+
+    lees_edw_ltot(le, ltot);
+    lees_edw_ntotal(le, ntotal);
+    lees_edw_nlocal(le, nlocal);
+    lees_edw_nhalo(le, &nhalo);
+    lees_edw_nlocal_offset(le, offset);
+
+    nplane = lees_edw_nplane_local(le);
+    lees_edw_comm(le, &comm);
+
+    physics_ref(&phys);
+
+    t = 1.0 * physics_control_timestep(phys);
+    lb_ndist(lb, &ndist);
+
+    nprop = 0;
+    for (int p = 1; p < lb->model.nvel; p++) {
+        if (lb->model.cv[p][X] == +1)
+            nprop += 1;
+    }
 
-  nprop = 0;
-  for (int p = 1; p < lb->model.nvel; p++) {
-    if (lb->model.cv[p][X] == +1) nprop += 1;
-  }
+    ndata = ndist * nprop * nlocal[Y] * nlocal[Z];
+    send_buff = (double *)malloc(ndata * sizeof(double));
+    assert(send_buff);
+    if (send_buff == NULL)
+        pe_fatal(lb->pe, "malloc(send_buff) failed\n");
 
-  ndata = ndist*nprop*nlocal[Y]*nlocal[Z];
-  send_buff = (double *) malloc(ndata*sizeof(double));
-  assert(send_buff);
-  if (send_buff == NULL) pe_fatal(lb->pe, "malloc(send_buff) failed\n");
+    ndata = ndist * nprop * (nlocal[Y] + 1) * nlocal[Z];
+    recv_buff = (double *)malloc(ndata * sizeof(double));
+    assert(recv_buff);
+    if (recv_buff == NULL)
+        pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
 
-  ndata = ndist*nprop*(nlocal[Y] + 1)*nlocal[Z];
-  recv_buff = (double *) malloc(ndata*sizeof(double));
-  assert(recv_buff);
-  if (recv_buff == NULL) pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
+    for (plane = 0; plane < nplane; plane++) {
 
-  for (plane = 0; plane < nplane; plane++) {
+        ic = lees_edw_plane_location(le, plane);
 
-    ic  = lees_edw_plane_location(le, plane);
+        lees_edw_buffer_displacement(le, nhalo, t, &dy);
+        dy = fmod(dy, ltot[Y]);
+        jdy = floor(dy);
+        fr = dy - jdy;
 
-    lees_edw_buffer_displacement(le, nhalo, t, &dy);
-    dy  = fmod(dy, ltot[Y]);
-    jdy = floor(dy);
-    fr  = dy - jdy;
+        /* Starting y coordinate is j1: 1 <= j1 <= ntotal[y] */
 
-    /* Starting y coordinate is j1: 1 <= j1 <= ntotal[y] */
+        jc = offset[Y] + 1;
+        j1 = 1 + (jc + jdy - 1 + 2 * ntotal[Y]) % ntotal[Y];
+        lees_edw_jstart_to_mpi_ranks(le, j1, nrank_s, nrank_r);
 
-    jc = offset[Y] + 1;
-    j1 = 1 + (jc + jdy - 1 + 2*ntotal[Y]) % ntotal[Y];
-    lees_edw_jstart_to_mpi_ranks(le, j1, nrank_s, nrank_r);
+        j1mod = 1 + (j1 - 1) % nlocal[Y];
+        n1 = (nlocal[Y] - j1mod + 1);
+        n2 = j1mod;
 
-    j1mod = 1 + (j1 - 1) % nlocal[Y];
-    n1 = (nlocal[Y] - j1mod + 1);
-    n2 = j1mod;
+        ndata1 = n1 * nlocal[Z] * ndist * nprop;
+        ndata2 = n2 * nlocal[Z] * ndist * nprop;
 
-    ndata1 = n1*nlocal[Z]*ndist*nprop;
-    ndata2 = n2*nlocal[Z]*ndist*nprop;
+        /* Post the receives */
 
-    /* Post the receives */
+        MPI_Irecv(recv_buff, ndata1, MPI_DOUBLE, nrank_r[0], tag1, comm, req);
+        MPI_Irecv(recv_buff + ndata1, ndata2, MPI_DOUBLE, nrank_r[1], tag2, comm, req + 1);
 
-    MPI_Irecv(recv_buff, ndata1, MPI_DOUBLE, nrank_r[0], tag1, comm, req);
-    MPI_Irecv(recv_buff + ndata1, ndata2, MPI_DOUBLE, nrank_r[1], tag2,
-	      comm, req + 1);
+        /* Load the send buffer. Note that data at j1mod gets sent to both
+         * receivers, making up the total of (nlocal[Y] + 1) points */
 
-    /* Load the send buffer. Note that data at j1mod gets sent to both
-     * receivers, making up the total of (nlocal[Y] + 1) points */
+        ndata = 0;
+        for (jc = 1; jc <= nlocal[Y]; jc++) {
+            for (kc = 1; kc <= nlocal[Z]; kc++) {
 
-    ndata = 0;
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	/* cv[p][X] = +1 identified by disp_fwd[] */
-	index = lees_edw_index(le, ic, jc, kc);
-
-	for (n = 0; n < ndist; n++) {
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] != +1) continue;
-	    int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index, n, p);
-	    send_buff[ndata++] = lb->f[ijkp];
-	  }
-	}
-	/* Next site */
-      }
-    }
+                /* cv[p][X] = +1 identified by disp_fwd[] */
+                index = lees_edw_index(le, ic, jc, kc);
 
-    ndata = ndata2 - nlocal[Z]*ndist*nprop;
+                for (n = 0; n < ndist; n++) {
+                    for (int p = 1; p < lb->model.nvel; p++) {
+                        if (lb->model.cv[p][X] != +1)
+                            continue;
+                        int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index, n, p);
+                        send_buff[ndata++] = lb->f[ijkp];
+                    }
+                }
+                /* Next site */
+            }
+        }
 
-    MPI_Issend(send_buff + ndata, ndata1, MPI_DOUBLE, nrank_s[0], tag1,
-	       comm, req + 2);
-    MPI_Issend(send_buff,         ndata2, MPI_DOUBLE, nrank_s[1], tag2,
-	       comm, req + 3);
+        ndata = ndata2 - nlocal[Z] * ndist * nprop;
 
-    /* Wait for the receives, and sort out the interpolated values */
+        MPI_Issend(send_buff + ndata, ndata1, MPI_DOUBLE, nrank_s[0], tag1, comm, req + 2);
+        MPI_Issend(send_buff, ndata2, MPI_DOUBLE, nrank_s[1], tag2, comm, req + 3);
 
-    MPI_Waitall(2, req, status);
+        /* Wait for the receives, and sort out the interpolated values */
 
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	index = lees_edw_index(le, ic, jc, kc);
-	ind0 = ndist*nprop*((jc-1)*nlocal[Z] + (kc-1));
-
-	for (n = 0; n < ndist; n++) {
-	  ind1 = ind0 + n*nprop;
-	  ind2 = ind0 + ndist*nprop*nlocal[Z] + n*nprop;
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] != +1) continue;
-	    int ijk = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index, n, p);
-	    lb->f[ijk] = (1.0-fr)*recv_buff[ind1++] + fr*recv_buff[ind2++];
-	  }
-	}
-	/* Next site */
-      }
-    }
+        MPI_Waitall(2, req, status);
 
-    /* Finish the sends */
-    MPI_Waitall(2, req + 2, status);
+        for (jc = 1; jc <= nlocal[Y]; jc++) {
+            for (kc = 1; kc <= nlocal[Z]; kc++) {
 
+                index = lees_edw_index(le, ic, jc, kc);
+                ind0 = ndist * nprop * ((jc - 1) * nlocal[Z] + (kc - 1));
 
+                for (n = 0; n < ndist; n++) {
+                    ind1 = ind0 + n * nprop;
+                    ind2 = ind0 + ndist * nprop * nlocal[Z] + n * nprop;
+                    for (int p = 1; p < lb->model.nvel; p++) {
+                        if (lb->model.cv[p][X] != +1)
+                            continue;
+                        int ijk = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index, n, p);
+                        lb->f[ijk] = (1.0 - fr) * recv_buff[ind1++] + fr * recv_buff[ind2++];
+                    }
+                }
+                /* Next site */
+            }
+        }
 
-    /* NOW THE OTHER DIRECTION */
+        /* Finish the sends */
+        MPI_Waitall(2, req + 2, status);
 
-    ic  = lees_edw_plane_location(le, plane) + 1;
+        /* NOW THE OTHER DIRECTION */
 
-    lees_edw_buffer_displacement(le, nhalo, t, &dy);
-    dy  = fmod(-dy, ltot[Y]);
-    jdy = floor(dy);
-    fr  = dy - jdy;
+        ic = lees_edw_plane_location(le, plane) + 1;
 
-    /* Starting y coordinate (global address): range 1 <= j1 <= ntotal[Y] */
+        lees_edw_buffer_displacement(le, nhalo, t, &dy);
+        dy = fmod(-dy, ltot[Y]);
+        jdy = floor(dy);
+        fr = dy - jdy;
 
-    jc = offset[Y] + 1;
-    j1 = 1 + (jc + jdy - 1 + 2*ntotal[Y]) % ntotal[Y];
-    lees_edw_jstart_to_mpi_ranks(le, j1, nrank_s, nrank_r);
+        /* Starting y coordinate (global address): range 1 <= j1 <= ntotal[Y] */
 
-    j1mod = 1 + (j1 - 1) % nlocal[Y];
-    n1 = (nlocal[Y] - j1mod + 1);
-    n2 = j1mod;
+        jc = offset[Y] + 1;
+        j1 = 1 + (jc + jdy - 1 + 2 * ntotal[Y]) % ntotal[Y];
+        lees_edw_jstart_to_mpi_ranks(le, j1, nrank_s, nrank_r);
 
-    ndata1 = n1*nlocal[Z]*ndist*nprop;
-    ndata2 = n2*nlocal[Z]*ndist*nprop;
+        j1mod = 1 + (j1 - 1) % nlocal[Y];
+        n1 = (nlocal[Y] - j1mod + 1);
+        n2 = j1mod;
 
-    /* Post the receives */
+        ndata1 = n1 * nlocal[Z] * ndist * nprop;
+        ndata2 = n2 * nlocal[Z] * ndist * nprop;
 
-    MPI_Irecv(recv_buff, ndata1, MPI_DOUBLE, nrank_r[0], tag1, comm, req);
-    MPI_Irecv(recv_buff + ndata1, ndata2, MPI_DOUBLE, nrank_r[1], tag2,
-	      comm, req + 1);
+        /* Post the receives */
 
-    /* Load the send buffer. Note that data at j1mod gets sent to both
-     * receivers, making up the total of (nlocal[Y] + 1) points */
+        MPI_Irecv(recv_buff, ndata1, MPI_DOUBLE, nrank_r[0], tag1, comm, req);
+        MPI_Irecv(recv_buff + ndata1, ndata2, MPI_DOUBLE, nrank_r[1], tag2, comm, req + 1);
 
-    ndata = 0;
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	/* cv[p][X] = -1 identified by disp_bwd[] */
-	index = lees_edw_index(le, ic, jc, kc);
-
-	for (n = 0; n < ndist; n++) {
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] != -1) continue;
-	    int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index, n, p);
-	    send_buff[ndata++] = lb->f[ijkp];
-	  }
-	}
-	/* Next site */
-      }
-    }
+        /* Load the send buffer. Note that data at j1mod gets sent to both
+         * receivers, making up the total of (nlocal[Y] + 1) points */
 
-    ndata = ndata2 - nlocal[Z]*ndist*nprop;
+        ndata = 0;
+        for (jc = 1; jc <= nlocal[Y]; jc++) {
+            for (kc = 1; kc <= nlocal[Z]; kc++) {
 
-    MPI_Issend(send_buff + ndata, ndata1, MPI_DOUBLE, nrank_s[0], tag1,
-	       comm, req + 2);
-    MPI_Issend(send_buff,         ndata2, MPI_DOUBLE, nrank_s[1], tag2,
-	       comm, req + 3);
+                /* cv[p][X] = -1 identified by disp_bwd[] */
+                index = lees_edw_index(le, ic, jc, kc);
 
-    /* Wait for the receives, and interpolate from the buffer */
+                for (n = 0; n < ndist; n++) {
+                    for (int p = 1; p < lb->model.nvel; p++) {
+                        if (lb->model.cv[p][X] != -1)
+                            continue;
+                        int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index, n, p);
+                        send_buff[ndata++] = lb->f[ijkp];
+                    }
+                }
+                /* Next site */
+            }
+        }
+
+        ndata = ndata2 - nlocal[Z] * ndist * nprop;
+
+        MPI_Issend(send_buff + ndata, ndata1, MPI_DOUBLE, nrank_s[0], tag1, comm, req + 2);
+        MPI_Issend(send_buff, ndata2, MPI_DOUBLE, nrank_s[1], tag2, comm, req + 3);
+
+        /* Wait for the receives, and interpolate from the buffer */
+
+        MPI_Waitall(2, req, status);
+
+        for (jc = 1; jc <= nlocal[Y]; jc++) {
+            for (kc = 1; kc <= nlocal[Z]; kc++) {
+
+                index = lees_edw_index(le, ic, jc, kc);
+                ind0 = ndist * nprop * ((jc - 1) * nlocal[Z] + (kc - 1));
 
-    MPI_Waitall(2, req, status);
-
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	index = lees_edw_index(le, ic, jc, kc);
-	ind0 = ndist*nprop*((jc-1)*nlocal[Z] + (kc-1));
-
-	for (n = 0; n < ndist; n++) {
-	  ind1 = ind0 + n*nprop;
-	  ind2 = ind0 + ndist*nprop*nlocal[Z] + n*nprop;
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] != -1) continue;
-	    int ijk = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index, n, p);
-	    lb->f[ijk] = (1.0-fr)*recv_buff[ind1++] + fr*recv_buff[ind2++];
-	  }
-	}
-	/* Next site */
-      }
+                for (n = 0; n < ndist; n++) {
+                    ind1 = ind0 + n * nprop;
+                    ind2 = ind0 + ndist * nprop * nlocal[Z] + n * nprop;
+                    for (int p = 1; p < lb->model.nvel; p++) {
+                        if (lb->model.cv[p][X] != -1)
+                            continue;
+                        int ijk = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index, n, p);
+                        lb->f[ijk] = (1.0 - fr) * recv_buff[ind1++] + fr * recv_buff[ind2++];
+                    }
+                }
+                /* Next site */
+            }
+        }
+
+        /* Mop up the sends */
+        MPI_Waitall(2, req + 2, status);
     }
 
-    /* Mop up the sends */
-    MPI_Waitall(2, req + 2, status);
-  }
+    free(send_buff);
+    free(recv_buff);
 
-  free(send_buff);
-  free(recv_buff);
-
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -643,73 +641,73 @@ static int le_displace_and_interpolate_parallel(lb_t * lb, lees_edw_t * le) {
  *
  *****************************************************************************/
 
-int lb_le_init_shear_profile(lb_t * lb, lees_edw_t * le) {
+int lb_le_init_shear_profile(lb_t *lb, lees_edw_t *le) {
 
-  int ic, jc, kc, index;
-  int i, j, p;
-  int nlocal[3];
-  double rho0, u[NDIM], gradu[NDIM][NDIM];
-  double eta;
+    int ic, jc, kc, index;
+    int i, j, p;
+    int nlocal[3];
+    double rho0, u[NDIM], gradu[NDIM][NDIM];
+    double eta;
 
-  physics_t * phys = NULL;
+    physics_t *phys = NULL;
 
-  assert(lb);
-  assert(le);
+    assert(lb);
+    assert(le);
 
-  pe_info(lb->pe, "Initialising shear profile\n");
+    pe_info(lb->pe, "Initialising shear profile\n");
 
-  /* Initialise the density, velocity, gradu; ghost modes are zero */
+    /* Initialise the density, velocity, gradu; ghost modes are zero */
 
-  physics_ref(&phys);
-  physics_rho0(phys, &rho0);
-  physics_eta_shear(phys, &eta);
+    physics_ref(&phys);
+    physics_rho0(phys, &rho0);
+    physics_eta_shear(phys, &eta);
 
-  lees_edw_nlocal(le, nlocal);
+    lees_edw_nlocal(le, nlocal);
 
-  for (i = 0; i< lb->model.ndim; i++) {
-    u[i] = 0.0;
-    for (j = 0; j < lb->model.ndim; j++) {
-      gradu[i][j] = 0.0;
+    for (i = 0; i < lb->model.ndim; i++) {
+        u[i] = 0.0;
+        for (j = 0; j < lb->model.ndim; j++) {
+            gradu[i][j] = 0.0;
+        }
     }
-  }
 
-  lees_edw_shear_rate(le, &gradu[X][Y]);
+    lees_edw_shear_rate(le, &gradu[X][Y]);
 
-  /* Loop trough the sites */
+    /* Loop through the sites */
 
-  for (ic = 1; ic <= nlocal[X]; ic++) {
+    for (ic = 1; ic <= nlocal[X]; ic++) {
 
-    lees_edw_steady_uy(le, ic, &u[Y]);
+        lees_edw_steady_uy(le, ic, &u[Y]);
 
-    /* We can now project the physical quantities to the distribution */
+        /* We can now project the physical quantities to the distribution */
 
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	index = lees_edw_index(le, ic, jc, kc);
-
-	for (p = 0; p < lb->model.nvel; p++) {
-	  double f = 0.0;
-	  double cdotu = 0.0;
-	  double sdotq = 0.0;
-	  double cs2 = lb->model.cs2;
-	  double rcs2 = 1.0/cs2;
-
-	  for (i = 0; i < lb->model.ndim; i++) {
-	    cdotu += lb->model.cv[p][i]*u[i];
-	    for (j = 0; j < lb->model.ndim; j++) {
-	      double dij = (i == j);
-	      double qij = lb->model.cv[p][i]*lb->model.cv[p][j] - cs2*dij;
-	      sdotq += (rho0*u[i]*u[j] - eta*gradu[i][j])*qij;
-	    }
-	  }
-	  f = lb->model.wv[p]*(rho0 + rcs2*rho0*cdotu + 0.5*rcs2*rcs2*sdotq);
-	  lb_f_set(lb, index, p, 0, f);
-	}
-	/* Next site */
-      }
+        for (jc = 1; jc <= nlocal[Y]; jc++) {
+            for (kc = 1; kc <= nlocal[Z]; kc++) {
+
+                index = lees_edw_index(le, ic, jc, kc);
+
+                for (p = 0; p < lb->model.nvel; p++) {
+                    double f = 0.0;
+                    double cdotu = 0.0;
+                    double sdotq = 0.0;
+                    double cs2 = lb->model.cs2;
+                    double rcs2 = 1.0 / cs2;
+
+                    for (i = 0; i < lb->model.ndim; i++) {
+                        cdotu += lb->model.cv[p][i] * u[i];
+                        for (j = 0; j < lb->model.ndim; j++) {
+                            double dij = (i == j);
+                            double qij = lb->model.cv[p][i] * lb->model.cv[p][j] - cs2 * dij;
+                            sdotq += (rho0 * u[i] * u[j] - eta * gradu[i][j]) * qij;
+                        }
+                    }
+                    f = lb->model.wv[p] * (rho0 + rcs2 * rho0 * cdotu + 0.5 * rcs2 * rcs2 * sdotq);
+                    lb_f_set(lb, index, p, 0, f);
+                }
+                /* Next site */
+            }
+        }
     }
-  }
 
-  return 0;
+    return 0;
 }
diff --git a/test.job b/test.job
new file mode 100644
index 000000000..1e6e48de8
--- /dev/null
+++ b/test.job
@@ -0,0 +1,18 @@
+#!/bin/bash
+#SBATCH --account=m22oc-S2329216
+#SBATCH --job-name=test
+#SBATCH --output=%x-%j.out
+#SBATCH --gres=gpu:1
+#SBATCH --time=00:10:00
+#SBATCH --partition=gpu
+#SBATCH --qos=short
+
+module load gcc nvidia/nvhpc/22.11
+
+cd tests
+make -k d3q19-short
+
+# make -k test
+
+
+

From 7a6d1cac4ceaf8305a1e36ce57026b81f95a4f3e Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Sat, 10 Jun 2023 15:02:00 +0100
Subject: [PATCH 007/133] add kernel and copy model, the result is wrong now

---
 gpu.job        |   2 +-
 src/lb_data.h  |   2 +-
 src/model.c    |   4 +-
 src/model_le.c | 161 ++++++++++++++++++++++++++++++++-----------------
 test.job       |   2 +-
 5 files changed, 109 insertions(+), 62 deletions(-)

diff --git a/gpu.job b/gpu.job
index 362a43557..841970dcd 100644
--- a/gpu.job
+++ b/gpu.job
@@ -1,6 +1,6 @@
 #!/bin/bash
 #SBATCH --account=m22oc-S2329216
-#SBATCH --job-name=ludwig_test
+#SBATCH --job-name=gpu
 #SBATCH --gres=gpu:1
 #SBATCH --time=00:10:00
 #SBATCH --partition=gpu
diff --git a/src/lb_data.h b/src/lb_data.h
index 9c220aa54..50b464523 100644
--- a/src/lb_data.h
+++ b/src/lb_data.h
@@ -168,7 +168,7 @@ __host__ __device__ int lb_0th_moment(lb_t * lb, int index, lb_dist_enum_t nd,
 				      double * rho);
 
 __host__ int lb_init_rest_f(lb_t * lb, double rho0);
-__host__ int lb_1st_moment(lb_t * lb, int index, lb_dist_enum_t nd, double g[3]);
+__host__ __device__ int lb_1st_moment(lb_t * lb, int index, lb_dist_enum_t nd, double g[3]);
 __host__ int lb_2nd_moment(lb_t * lb, int index, lb_dist_enum_t nd, double s[3][3]);
 __host__ int lb_1st_moment_equilib_set(lb_t * lb, int index, double rho, double u[3]);
 
diff --git a/src/model.c b/src/model.c
index 4ff5e8b0d..d6fc2f61e 100644
--- a/src/model.c
+++ b/src/model.c
@@ -838,7 +838,7 @@ int lb_0th_moment(lb_t * lb, int index, lb_dist_enum_t nd, double * rho) {
  *
  *****************************************************************************/
 
-__host__
+__host__ __device__
 int lb_1st_moment(lb_t * lb, int index, lb_dist_enum_t nd, double g[3]) {
 
   int p;
@@ -909,7 +909,7 @@ int lb_2nd_moment(lb_t * lb, int index, lb_dist_enum_t nd, double s[3][3]) {
  *
  *****************************************************************************/
 
-__host__
+__host__ 
 int lb_1st_moment_equilib_set(lb_t * lb, int index, double rho, double u[3]) {
 
   int ia, ib, p;
diff --git a/src/model_le.c b/src/model_le.c
index 893f7dcce..4e183b8ec 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -31,10 +31,46 @@
 #include "timer.h"
 #include "util.h"
 
-static int le_reproject(lb_t *lb, lees_edw_t *le);
+__global__ static void le_reproject(lb_t *lb, lees_edw_t *le);
 static int le_displace_and_interpolate(lb_t *lb, lees_edw_t *le);
 static int le_displace_and_interpolate_parallel(lb_t *lb, lees_edw_t *le);
 
+void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
+    int nvel = h_model->nvel;
+    // Allocate memory on the GPU for the arrays in the struct
+    int8_t (*d_cv)[3];
+    double *d_wv;
+    double *d_na;
+    double *d_ma;
+
+    cudaMalloc((void**)&d_cv, sizeof(int8_t[3]) * nvel);
+    cudaMalloc((void**)&d_wv, sizeof(double) * nvel);
+    cudaMalloc((void**)&d_na, sizeof(double) * nvel);
+
+    // Allocate memory for 2D array ma as a flat 1D array
+    cudaMalloc((void**)&d_ma, sizeof(double) * nvel * nvel);  // adjust this if ma is not square
+
+    // Copy the data from host to the GPU
+    cudaMemcpy(d_cv, h_model->cv, sizeof(int8_t[3]) * nvel, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_wv, h_model->wv, sizeof(double) * nvel, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_na, h_model->na, sizeof(double) * nvel, cudaMemcpyHostToDevice);
+    
+    //I am not sure about ma, as i do not know the scope of ma
+    for(int i = 0; i < nvel; i++) {
+        cudaMemcpy(&d_ma[i * nvel], h_model->ma[i], sizeof(double) * nvel, cudaMemcpyHostToDevice);  // adjust this if ma is not square
+    }
+
+    // Set the pointers in the struct to the newly allocated GPU memory
+    cudaMemcpy(&(d_model->cv), &d_cv, sizeof(int8_t(*)[3]), cudaMemcpyHostToDevice);
+    cudaMemcpy(&(d_model->wv), &d_wv, sizeof(double*), cudaMemcpyHostToDevice);
+    cudaMemcpy(&(d_model->na), &d_na, sizeof(double*), cudaMemcpyHostToDevice);
+    cudaMemcpy(&(d_model->ma), &d_ma, sizeof(double*), cudaMemcpyHostToDevice);
+
+    //copy the rest data to gpu
+    cudaMemcpy(&(d_model->ndim), &(h_model->ndim), sizeof(int8_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(&(d_model->nvel), &(h_model->nvel), sizeof(int8_t), cudaMemcpyHostToDevice);
+    cudaMemcpy(&(d_model->cs2), &(h_model->cs2), sizeof(double), cudaMemcpyHostToDevice);
+}
 /*****************************************************************************
  *
  *  lb_le_apply_boundary_conditions
@@ -69,9 +105,19 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
 
         /* Everything must be done on host at the moment (slowly) ... */
         /* ... and copy back at the end */
-        lb_memcpy(lb, tdpMemcpyDeviceToHost);
+        copyModelToDevice(&lb->model, &lb->target->model);
+        
+        lees_edw_t * le_target;
+        lees_edw_target(le, &le_target);
+
+        int nlocal[3];
+        lees_edw_nlocal(le, nlocal);
+        dim3 numBlocks(1, (nlocal[Y] + 15) / 16, (nlocal[Z] + 15) / 16);
+        dim3 threadsPerBlock(1, 16, 16);
+        le_reproject<<<numBlocks, threadsPerBlock>>>(lb->target, le_target);
+        cudaDeviceSynchronize();
 
-        le_reproject(lb, le);
+        lb_memcpy(lb, tdpMemcpyDeviceToHost);
 
         if (mpi_cartsz[Y] > 1) {
             le_displace_and_interpolate_parallel(lb, le);
@@ -108,7 +154,7 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
  *
  *****************************************************************************/
 
-static int le_reproject(lb_t *lb, lees_edw_t *le) {
+__global__ static void le_reproject(lb_t *lb, lees_edw_t *le) {
 
     int ic, jc, kc, index;
     int nplane, plane, side;
@@ -133,76 +179,77 @@ static int le_reproject(lb_t *lb, lees_edw_t *le) {
     t = 1.0 * physics_control_timestep(phys);
     lees_edw_nlocal(le, nlocal);
     
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-        for (kc = 1; kc <= nlocal[Z]; kc++) {
-            for (plane = 0; plane < nplane; plane++) {
-                for (side = 0; side < 2; side++) {
-
-                    du[X] = 0.0;
-                    du[Y] = 0.0;
-                    du[Z] = 0.0;
-
-                    if (side == 0) {
-                        /* Start with plane below Lees-Edwards BC */
-                        lees_edw_plane_uy_now(le, t, &du[Y]);
-                        du[Y] *= -1.0;
-                        ic = lees_edw_plane_location(le, plane);
-                        cx = +1;
-                    }
-                    else {
-                        /* Finally, deal with plane above LEBC */
-                        lees_edw_plane_uy_now(le, t, &du[Y]);
-                        ic = lees_edw_plane_location(le, plane) + 1;
-                        cx = -1;
-                    }
+    jc = blockIdx.y * blockDim.y + threadIdx.y;
+    kc = blockIdx.z * blockDim.z + threadIdx.z;
+
+    if (jc < nlocal[Y] && kc < nlocal[Z]) {
+        for (plane = 0; plane < nplane; plane++) {
+            for (side = 0; side < 2; side++) {
+
+                du[X] = 0.0;
+                du[Y] = 0.0;
+                du[Z] = 0.0;
+
+                if (side == 0) {
+                    /* Start with plane below Lees-Edwards BC */
+                    lees_edw_plane_uy_now(le, t, &du[Y]);
+                    du[Y] *= -1.0;
+                    ic = lees_edw_plane_location(le, plane);
+                    cx = +1;
+                }
+                else {
+                    /* Finally, deal with plane above LEBC */
+                    lees_edw_plane_uy_now(le, t, &du[Y]);
+                    ic = lees_edw_plane_location(le, plane) + 1;
+                    cx = -1;
+                }
 
-                    index = lees_edw_index(le, ic, jc, kc);
+                index = lees_edw_index(le, ic, jc, kc);
 
-                    for (n = 0; n < ndist; n++) {
+                for (n = 0; n < ndist; n++) {
 
-                        /* Compute 0th and 1st moments */
-                        lb_dist_enum_t ndn = (lb_dist_enum_t)n;
-                        lb_0th_moment(lb, index, ndn, &rho);
-                        lb_1st_moment(lb, index, ndn, g);
+                    /* Compute 0th and 1st moments */
+                    lb_dist_enum_t ndn = (lb_dist_enum_t)n;
+                    lb_0th_moment(lb, index, ndn, &rho);
+                    lb_1st_moment(lb, index, ndn, g);
 
-                        for (ia = 0; ia < 3; ia++) {
-                            for (ib = 0; ib < 3; ib++) {
-                                ds[ia][ib] = (g[ia] * du[ib] + du[ia] * g[ib] + rho * du[ia] * du[ib]);
-                            }
+                    for (ia = 0; ia < 3; ia++) {
+                        for (ib = 0; ib < 3; ib++) {
+                            ds[ia][ib] = (g[ia] * du[ib] + du[ia] * g[ib] + rho * du[ia] * du[ib]);
                         }
+                    }
 
-                        /* Now update the distribution */
-                        for (int p = 1; p < lb->model.nvel; p++) {
+                    /* Now update the distribution */
+                    for (int p = 1; p < lb->model.nvel; p++) {
 
-                            double cs2 = lb->model.cs2;
-                            double rcs2 = 1.0 / cs2;
-                            if (lb->model.cv[p][X] != cx)
-                                continue;
+                        double cs2 = lb->model.cs2;
+                        double rcs2 = 1.0 / cs2;
+                        if (lb->model.cv[p][X] != cx)
+                            continue;
 
-                            udotc = du[Y] * lb->model.cv[p][Y];
-                            sdotq = 0.0;
+                        udotc = du[Y] * lb->model.cv[p][Y];
+                        sdotq = 0.0;
 
-                            for (ia = 0; ia < 3; ia++) {
-                                for (ib = 0; ib < 3; ib++) {
-                                    double dab = cs2 * (ia == ib);
-                                    double q = (lb->model.cv[p][ia] * lb->model.cv[p][ib] - dab);
-                                    sdotq += ds[ia][ib] * q;
-                                }
+                        for (ia = 0; ia < 3; ia++) {
+                            for (ib = 0; ib < 3; ib++) {
+                                double dab = cs2 * (ia == ib);
+                                double q = (lb->model.cv[p][ia] * lb->model.cv[p][ib] - dab);
+                                sdotq += ds[ia][ib] * q;
                             }
+                        }
 
-                            /* Project all this back to the distribution. */
+                        /* Project all this back to the distribution. */
 
-                            lb_f(lb, index, p, n, &fnew);
-                            fnew += lb->model.wv[p] * (rho * udotc * rcs2 + 0.5 * sdotq * rcs2 * rcs2);
-                            lb_f_set(lb, index, p, n, fnew);
-                        }
+                        lb_f(lb, index, p, n, &fnew);
+                        fnew += lb->model.wv[p] * (rho * udotc * rcs2 + 0.5 * sdotq * rcs2 * rcs2);
+                        lb_f_set(lb, index, p, n, fnew);
                     }
                 }
             }
-            /* next site */
         }
+            /* next site */
     }
-    return 0;
+    return;
 }
 
 /*****************************************************************************
diff --git a/test.job b/test.job
index 1e6e48de8..4dc35d643 100644
--- a/test.job
+++ b/test.job
@@ -1,6 +1,6 @@
 #!/bin/bash
 #SBATCH --account=m22oc-S2329216
-#SBATCH --job-name=test
+#SBATCH --job-name=ludwig_test
 #SBATCH --output=%x-%j.out
 #SBATCH --gres=gpu:1
 #SBATCH --time=00:10:00

From 477321a510e1b6825b562064b1f131cb196f6c83 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Mon, 12 Jun 2023 12:52:41 +0100
Subject: [PATCH 008/133] modify 'copy model' function; the function has been
 tested and works

---
 src/model_le.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index 4e183b8ec..2747748b8 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -41,30 +41,20 @@ void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
     int8_t (*d_cv)[3];
     double *d_wv;
     double *d_na;
-    double *d_ma;
 
     cudaMalloc((void**)&d_cv, sizeof(int8_t[3]) * nvel);
     cudaMalloc((void**)&d_wv, sizeof(double) * nvel);
     cudaMalloc((void**)&d_na, sizeof(double) * nvel);
-
-    // Allocate memory for 2D array ma as a flat 1D array
-    cudaMalloc((void**)&d_ma, sizeof(double) * nvel * nvel);  // adjust this if ma is not square
-
+    
     // Copy the data from host to the GPU
     cudaMemcpy(d_cv, h_model->cv, sizeof(int8_t[3]) * nvel, cudaMemcpyHostToDevice);
     cudaMemcpy(d_wv, h_model->wv, sizeof(double) * nvel, cudaMemcpyHostToDevice);
     cudaMemcpy(d_na, h_model->na, sizeof(double) * nvel, cudaMemcpyHostToDevice);
     
-    //I am not sure about ma, as i do not know the scope of ma
-    for(int i = 0; i < nvel; i++) {
-        cudaMemcpy(&d_ma[i * nvel], h_model->ma[i], sizeof(double) * nvel, cudaMemcpyHostToDevice);  // adjust this if ma is not square
-    }
-
     // Set the pointers in the struct to the newly allocated GPU memory
     cudaMemcpy(&(d_model->cv), &d_cv, sizeof(int8_t(*)[3]), cudaMemcpyHostToDevice);
     cudaMemcpy(&(d_model->wv), &d_wv, sizeof(double*), cudaMemcpyHostToDevice);
     cudaMemcpy(&(d_model->na), &d_na, sizeof(double*), cudaMemcpyHostToDevice);
-    cudaMemcpy(&(d_model->ma), &d_ma, sizeof(double*), cudaMemcpyHostToDevice);
 
     //copy the rest data to gpu
     cudaMemcpy(&(d_model->ndim), &(h_model->ndim), sizeof(int8_t), cudaMemcpyHostToDevice);
@@ -155,7 +145,6 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
  *****************************************************************************/
 
 __global__ static void le_reproject(lb_t *lb, lees_edw_t *le) {
-
     int ic, jc, kc, index;
     int nplane, plane, side;
     int ia, ib;

From c8b22c3febda7ce56b166b15c1533657621cf81c Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Wed, 14 Jun 2023 00:52:21 +0100
Subject: [PATCH 009/133] still can't get the correct result; this commit saves
 a possible solution for interpolation

---
 src/model_le.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/model_le.c b/src/model_le.c
index 2747748b8..dcbb4e3d8 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -170,7 +170,7 @@ __global__ static void le_reproject(lb_t *lb, lees_edw_t *le) {
     
     jc = blockIdx.y * blockDim.y + threadIdx.y;
     kc = blockIdx.z * blockDim.z + threadIdx.z;
-
+    
     if (jc < nlocal[Y] && kc < nlocal[Z]) {
         for (plane = 0; plane < nplane; plane++) {
             for (side = 0; side < 2; side++) {
@@ -294,6 +294,14 @@ int le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
             nprop += 1;
     }
 
+    // int truth[nprop];
+    // for (int p = 1, int i = 0; p < lb->model.nvel; p++) {
+    //     if (lb->model.cv[p][X] == +1) {
+    //         truth[i] = p;
+    //         i++;
+    //     }
+    // }
+
     ndata = ndist * nprop * nlocal[Y] * nlocal[Z];
     recv_buff = (double *)malloc(ndata * sizeof(double));
     assert(recv_buff);
@@ -334,6 +342,30 @@ int le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
             }
         }
 
+        // ndata = 0;
+        // for (jc = 1; jc <= nlocal[Y]; jc++) {
+
+        //     j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+        //     j2 = 1 + (j1 % nlocal[Y]);
+
+        //     for (kc = 1; kc <= nlocal[Z]; kc++) {
+
+        //         index0 = lees_edw_index(le, ic, j1, kc);
+        //         index1 = lees_edw_index(le, ic, j2, kc);
+
+        //         /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
+
+        //         for (n = 0; n < ndist; n++) {
+        //             for (int i = 0; i < nprop; i++) {
+        //                 //int ndata = ((jc-1)*nlocal_Z + (kc-1))*ndist*nprop + n*nprop + i;
+        //                 recv_buff[ndata++] = (1.0 - fr) * lb->f[LB_ADDR(lb->nsite, ndist, truth[i], index0, n, p)] +
+        //                                      fr * lb->f[LB_ADDR(lb->nsite, ndist, truth[i], index1, n, p)];
+        //             }
+        //         }
+        //         /* Next site */
+        //     }
+        // }
+
         /* ...and copy back ... */
 
         ndata = 0;

From e7fbb8c88e0e128cd514529a43ac55769c0f4dcd Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Thu, 15 Jun 2023 11:00:35 +0100
Subject: [PATCH 010/133] the result of reprojection is correct now, ready to
 move on

---
 src/model_le.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index dcbb4e3d8..d214efe9d 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -168,10 +168,10 @@ __global__ static void le_reproject(lb_t *lb, lees_edw_t *le) {
     t = 1.0 * physics_control_timestep(phys);
     lees_edw_nlocal(le, nlocal);
     
-    jc = blockIdx.y * blockDim.y + threadIdx.y;
-    kc = blockIdx.z * blockDim.z + threadIdx.z;
+    jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
+    kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
     
-    if (jc < nlocal[Y] && kc < nlocal[Z]) {
+    if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
         for (plane = 0; plane < nplane; plane++) {
             for (side = 0; side < 2; side++) {
 
@@ -358,8 +358,8 @@ int le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
         //         for (n = 0; n < ndist; n++) {
         //             for (int i = 0; i < nprop; i++) {
         //                 //int ndata = ((jc-1)*nlocal_Z + (kc-1))*ndist*nprop + n*nprop + i;
-        //                 recv_buff[ndata++] = (1.0 - fr) * lb->f[LB_ADDR(lb->nsite, ndist, truth[i], index0, n, p)] +
-        //                                      fr * lb->f[LB_ADDR(lb->nsite, ndist, truth[i], index1, n, p)];
+        //                 recv_buff[ndata++] = (1.0 - fr) * lb->f[LB_ADDR(lb->nsite, ndist, lb->model->nvel, index0, n, truth[i])] +
+        //                                      fr * lb->f[LB_ADDR(lb->nsite, ndist, lb->nodel->nvei, index1, n, truth[i])];
         //             }
         //         }
         //         /* Next site */

From 263bd6de62612f7ea5d2d0566fc952e43f607c22 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Mon, 19 Jun 2023 14:58:39 +0100
Subject: [PATCH 011/133] parallelised interpolation; it runs successfully but
 the result is wrong

---
 gpu.job           |    2 +-
 src/leesedwards.c | 1231 ++++++++++++++++++++++-----------------------
 src/model_le.c    |  364 ++++++++++----
 3 files changed, 873 insertions(+), 724 deletions(-)

diff --git a/gpu.job b/gpu.job
index 841970dcd..cbe34da2c 100644
--- a/gpu.job
+++ b/gpu.job
@@ -11,7 +11,7 @@ module load gcc nvidia/nvhpc/22.11
 # cd tests
 # make -k d3q19-short
 
-./src/Ludwig.exe input
+./src/Ludwig.exe input --printf-buffer=2097152
 
 
 
diff --git a/src/leesedwards.c b/src/leesedwards.c
index c986e7274..2af2ccc4a 100644
--- a/src/leesedwards.c
+++ b/src/leesedwards.c
@@ -17,9 +17,9 @@
  *****************************************************************************/
 
 #include <assert.h>
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <math.h>
 
 #include "leesedwards.h"
 #include "util.h"
@@ -27,56 +27,56 @@
 typedef struct lees_edw_param_s lees_edw_param_t;
 
 struct lees_edw_s {
-  pe_t * pe;                /* Parallel environment */
-  cs_t * cs;                /* Coordinate system */
-  physics_t * phys;         /* Constants, time step */
+    pe_t *pe;        /* Parallel environment */
+    cs_t *cs;        /* Coordinate system */
+    physics_t *phys; /* Constants, time step */
 
-  lees_edw_param_t * param; /* Parameters */
+    lees_edw_param_t *param; /* Parameters */
 
-  int nref;                 /* Reference count */
-  int * icbuff_to_real;     /* look up table */
-  int * icreal_to_buff;     /* look up table */
-  int * buffer_duy;         /* look up table +/- uy as function of ib */
+    int nref;            /* Reference count */
+    int *icbuff_to_real; /* look up table */
+    int *icreal_to_buff; /* look up table */
+    int *buffer_duy;     /* look up table +/- uy as function of ib */
 
-  MPI_Comm  le_comm;        /* 1-d communicator */
-  MPI_Comm  le_plane_comm;  /* 2-d communicator */
+    MPI_Comm le_comm;       /* 1-d communicator */
+    MPI_Comm le_plane_comm; /* 2-d communicator */
 
-  lees_edw_t * target;      /* Device memory */
+    lees_edw_t *target; /* Device memory */
 };
 
 struct lees_edw_param_s {
-  /* Local parameters */
-  int nplanelocal;          /* Number of planes local domain */
-  int nxbuffer;             /* Size of buffer region in x */
-  int index_real_nbuffer;
-  /* For cs */
-  int nhalo;
-  int str[3];
-  int nlocal[3];
-  /* Global parameters */
-  int nplanetotal;          /* Total number of planes */
-  int type;                 /* Shear type */
-  int period;               /* for oscillatory */
-  int nt0;                  /* time0 (input as integer) */
-  int nsites;               /* Number of sites incl buffer planes */
-  double uy;                /* u[Y] for all planes */
-  double dx_min;            /* Position first plane */
-  double dx_sep;            /* Plane separation */
-  double omega;             /* u_y = u_le cos (omega t) for oscillatory */  
-  double time0;             /* time offset */
+    /* Local parameters */
+    int nplanelocal; /* Number of planes local domain */
+    int nxbuffer;    /* Size of buffer region in x */
+    int index_real_nbuffer;
+    /* For cs */
+    int nhalo;
+    int str[3];
+    int nlocal[3];
+    /* Global parameters */
+    int nplanetotal; /* Total number of planes */
+    int type;        /* Shear type */
+    int period;      /* for oscillatory */
+    int nt0;         /* time0 (input as integer) */
+    int nsites;      /* Number of sites incl buffer planes */
+    double uy;       /* u[Y] for all planes */
+    double dx_min;   /* Position first plane */
+    double dx_sep;   /* Plane separation */
+    double omega;    /* u_y = u_le cos (omega t) for oscillatory */
+    double time0;    /* time offset */
 };
 
-static int lees_edw_init(lees_edw_t * le, const lees_edw_options_t * info);
-static int lees_edw_checks(lees_edw_t * le);
-static int lees_edw_init_tables(lees_edw_t * le);
+static int lees_edw_init(lees_edw_t *le, const lees_edw_options_t *info);
+static int lees_edw_checks(lees_edw_t *le);
+static int lees_edw_init_tables(lees_edw_t *le);
 
 static __constant__ lees_edw_param_t static_param;
 
-__host__ __device__ int lees_edw_buffer_duy(lees_edw_t * le, int ib);
+__host__ __device__ int lees_edw_buffer_duy(lees_edw_t *le, int ib);
 
 /* Scheduled for removal */
-__host__ __device__ int lees_edw_index_real_to_buffer(lees_edw_t * le, int ic, int idisplace);
-__host__ __device__ int lees_edw_index_buffer_to_real(lees_edw_t * le, int ibuf);
+__host__ __device__ int lees_edw_index_real_to_buffer(lees_edw_t *le, int ic, int idisplace);
+__host__ __device__ int lees_edw_index_buffer_to_real(lees_edw_t *le, int ibuf);
 
 /*****************************************************************************
  *
@@ -87,59 +87,60 @@ __host__ __device__ int lees_edw_index_buffer_to_real(lees_edw_t * le, int ibuf)
  *
  *****************************************************************************/
 
-__host__ int lees_edw_create(pe_t * pe, cs_t * cs,
-			     const lees_edw_options_t * info,
-			     lees_edw_t ** ple) {
+__host__ int lees_edw_create(pe_t *pe, cs_t *cs, const lees_edw_options_t *info, lees_edw_t **ple) {
 
-  int ndevice;
-  lees_edw_t * le = NULL;
+    int ndevice;
+    lees_edw_t *le = NULL;
 
-  assert(pe);
-  assert(cs);
+    assert(pe);
+    assert(cs);
 
-  le = (lees_edw_t *) calloc(1, sizeof(lees_edw_t));
-  assert(le);
-  if (le == NULL) pe_fatal(pe, "calloc(lees_edw_t) failed\n");
+    le = (lees_edw_t *)calloc(1, sizeof(lees_edw_t));
+    assert(le);
+    if (le == NULL)
+        pe_fatal(pe, "calloc(lees_edw_t) failed\n");
 
-  le->param = (lees_edw_param_t *) calloc(1, sizeof(lees_edw_param_t));
-  assert(le->param);
-  if (le->param == NULL) pe_fatal(pe, "calloc(lees_edw_param_t) failed\n");
+    le->param = (lees_edw_param_t *)calloc(1, sizeof(lees_edw_param_t));
+    assert(le->param);
+    if (le->param == NULL)
+        pe_fatal(pe, "calloc(lees_edw_param_t) failed\n");
 
-  le->pe = pe;
-  pe_retain(pe);
-  le->cs = cs;
-  if (info) cs->leopts = *info; /* Copy of options for i/o metadata */
-  cs_retain(cs);
+    le->pe = pe;
+    pe_retain(pe);
+    le->cs = cs;
+    if (info)
+        cs->leopts = *info; /* Copy of options for i/o metadata */
+    cs_retain(cs);
 
-  le->param->nplanetotal = 0;
-  if (info) lees_edw_init(le, info);
-  lees_edw_init_tables(le);
-  le->nref = 1;
+    le->param->nplanetotal = 0;
+    if (info)
+        lees_edw_init(le, info);
+    lees_edw_init_tables(le);
+    le->nref = 1;
 
-  tdpGetDeviceCount(&ndevice);
+    tdpGetDeviceCount(&ndevice);
 
-  if (ndevice == 0) {
-    le->target = le;
-  }
-  else {
-    lees_edw_param_t * tmp;
-    cs_t * cst;
+    if (ndevice == 0) {
+        le->target = le;
+    }
+    else {
+        lees_edw_param_t *tmp;
+        cs_t *cst;
 
-    tdpMalloc((void **) &le->target, sizeof(lees_edw_t));
-    tdpMemset(le->target, 0, sizeof(lees_edw_t));
-    tdpGetSymbolAddress((void **) &tmp, tdpSymbol(static_param));
-    tdpMemcpy(&le->target->param, (const void *) &tmp,
-	      sizeof(lees_edw_param_t *), tdpMemcpyHostToDevice);
+        tdpMalloc((void **)&le->target, sizeof(lees_edw_t));
+        tdpMemset(le->target, 0, sizeof(lees_edw_t));
+        tdpGetSymbolAddress((void **)&tmp, tdpSymbol(static_param));
+        tdpMemcpy(&le->target->param, (const void *)&tmp, sizeof(lees_edw_param_t *), tdpMemcpyHostToDevice);
 
-    cs_target(cs, &cst);
-    tdpMemcpy(&le->target->cs, &cst, sizeof(cs_t *), tdpMemcpyHostToDevice);
+        cs_target(cs, &cst);
+        tdpMemcpy(&le->target->cs, &cst, sizeof(cs_t *), tdpMemcpyHostToDevice);
 
-    lees_edw_commit(le);
-  }
+        lees_edw_commit(le);
+    }
 
-  *ple = le;
+    *ple = le;
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -148,13 +149,13 @@ __host__ int lees_edw_create(pe_t * pe, cs_t * cs,
  *
  *****************************************************************************/
 
-__host__ int lees_edw_retain(lees_edw_t * le) {
+__host__ int lees_edw_retain(lees_edw_t *le) {
 
-  assert(le);
+    assert(le);
 
-  le->nref += 1;
+    le->nref += 1;
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -163,26 +164,27 @@ __host__ int lees_edw_retain(lees_edw_t * le) {
  *
  *****************************************************************************/
 
-__host__ int lees_edw_free(lees_edw_t * le) {
+__host__ int lees_edw_free(lees_edw_t *le) {
 
-  assert(le);
+    assert(le);
 
-  le->nref -= 1;
+    le->nref -= 1;
 
-  if (le->nref <= 0) {
+    if (le->nref <= 0) {
 
-    if (le->target != le) tdpFree(le->target);
+        if (le->target != le)
+            tdpFree(le->target);
 
-    pe_free(le->pe);
-    cs_free(le->cs);
-    free(le->icbuff_to_real);
-    free(le->icreal_to_buff);
-    free(le->buffer_duy);
-    free(le->param);
-    free(le);
-  }
+        pe_free(le->pe);
+        cs_free(le->cs);
+        free(le->icbuff_to_real);
+        free(le->icreal_to_buff);
+        free(le->buffer_duy);
+        free(le->param);
+        free(le);
+    }
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -191,14 +193,13 @@ __host__ int lees_edw_free(lees_edw_t * le) {
  *
  *****************************************************************************/
 
-__host__ int lees_edw_commit(lees_edw_t * le) {
+__host__ int lees_edw_commit(lees_edw_t *le) {
 
-  assert(le);
+    assert(le);
 
-  tdpMemcpyToSymbol(tdpSymbol(static_param), le->param,
-		    sizeof(lees_edw_param_t), 0, tdpMemcpyHostToDevice);
+    tdpMemcpyToSymbol(tdpSymbol(static_param), le->param, sizeof(lees_edw_param_t), 0, tdpMemcpyHostToDevice);
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -207,14 +208,14 @@ __host__ int lees_edw_commit(lees_edw_t * le) {
  *
  *****************************************************************************/
 
-__host__ int lees_edw_target(lees_edw_t * le, lees_edw_t ** target) {
+__host__ int lees_edw_target(lees_edw_t *le, lees_edw_t **target) {
 
-  assert(le);
-  assert(target);
+    assert(le);
+    assert(target);
 
-  *target = le->target;
+    *target = le->target;
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -223,13 +224,13 @@ __host__ int lees_edw_target(lees_edw_t * le, lees_edw_t ** target) {
  *
  *****************************************************************************/
 
-int lees_edw_nplane_set(lees_edw_t * le, int nplanetotal) {
+int lees_edw_nplane_set(lees_edw_t *le, int nplanetotal) {
 
-  assert(le);
+    assert(le);
 
-  le->param->nplanetotal = nplanetotal;
+    le->param->nplanetotal = nplanetotal;
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -238,13 +239,13 @@ int lees_edw_nplane_set(lees_edw_t * le, int nplanetotal) {
  *
  *****************************************************************************/
 
-int lees_edw_plane_uy_set(lees_edw_t * le, double uy) {
+int lees_edw_plane_uy_set(lees_edw_t *le, double uy) {
 
-  assert(le);
+    assert(le);
 
-  le->param->uy = uy;
+    le->param->uy = uy;
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -253,16 +254,16 @@ int lees_edw_plane_uy_set(lees_edw_t * le, double uy) {
  *
  *****************************************************************************/
 
-int lees_edw_oscillatory_set(lees_edw_t * le, int period) {
+int lees_edw_oscillatory_set(lees_edw_t *le, int period) {
 
-  assert(le);
+    assert(le);
 
-  le->param->type = LE_SHEAR_TYPE_OSCILLATORY;
-  le->param->period = period;
-  le->param->omega = 2.0*4.0*atan(1.0)/le->param->period;
+    le->param->type = LE_SHEAR_TYPE_OSCILLATORY;
+    le->param->period = period;
+    le->param->omega = 2.0 * 4.0 * atan(1.0) / le->param->period;
 
-  return 0;
-} 
+    return 0;
+}
 
 /*****************************************************************************
  *
@@ -270,13 +271,13 @@ int lees_edw_oscillatory_set(lees_edw_t * le, int period) {
  *
  *****************************************************************************/
 
-int lees_edw_toffset_set(lees_edw_t * le, int nt0) {
+int lees_edw_toffset_set(lees_edw_t *le, int nt0) {
 
-  assert(le);
+    assert(le);
 
-  le->param->nt0 = nt0;
+    le->param->nt0 = nt0;
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -292,38 +293,38 @@ int lees_edw_toffset_set(lees_edw_t * le, int nt0) {
  *
  *****************************************************************************/
 
-static int lees_edw_init(lees_edw_t * le, const lees_edw_options_t * info) {
+static int lees_edw_init(lees_edw_t *le, const lees_edw_options_t *info) {
 
-  int ntotal[3];
+    int ntotal[3];
 
-  assert(le);
-  assert(info);
+    assert(le);
+    assert(info);
 
-  le->param->nplanetotal = info->nplanes;
-  le->param->uy = info->uy;
-  le->param->type = info->type;
-  le->param->period = info->period;
-  le->param->nt0 = info->nt0;
+    le->param->nplanetotal = info->nplanes;
+    le->param->uy = info->uy;
+    le->param->type = info->type;
+    le->param->period = info->period;
+    le->param->nt0 = info->nt0;
 
-  cs_ntotal(le->cs, ntotal);
-  physics_ref(&le->phys);
+    cs_ntotal(le->cs, ntotal);
+    physics_ref(&le->phys);
 
-  if (le->param->nplanetotal > 0) {
+    if (le->param->nplanetotal > 0) {
 
-    if (ntotal[X] % le->param->nplanetotal) {
-      pe_info(le->pe, "System size x-direction: %d\n", ntotal[X]);
-      pe_info(le->pe, "Number of LE planes requested: %d\n", info->nplanes);
-      pe_fatal(le->pe, "Number of planes must divide system size\n");
-    }
+        if (ntotal[X] % le->param->nplanetotal) {
+            pe_info(le->pe, "System size x-direction: %d\n", ntotal[X]);
+            pe_info(le->pe, "Number of LE planes requested: %d\n", info->nplanes);
+            pe_fatal(le->pe, "Number of planes must divide system size\n");
+        }
 
-    le->param->dx_sep = 1.0*ntotal[X] / le->param->nplanetotal;
-    le->param->dx_min = 0.5*le->param->dx_sep;
-    le->param->time0 = 1.0*le->param->nt0;
-  }
+        le->param->dx_sep = 1.0 * ntotal[X] / le->param->nplanetotal;
+        le->param->dx_min = 0.5 * le->param->dx_sep;
+        le->param->time0 = 1.0 * le->param->nt0;
+    }
 
-  lees_edw_checks(le);
+    lees_edw_checks(le);
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -332,12 +333,11 @@ static int lees_edw_init(lees_edw_t * le, const lees_edw_options_t * info) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_nplane_total(lees_edw_t * le) {
+__host__ __device__ int lees_edw_nplane_total(lees_edw_t *le) {
 
-  assert(le);
+    assert(le);
 
-  return le->param->nplanetotal;
+    return le->param->nplanetotal;
 }
 
 /*****************************************************************************
@@ -346,12 +346,11 @@ int lees_edw_nplane_total(lees_edw_t * le) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_nplane_local(lees_edw_t * le) {
+__host__ __device__ int lees_edw_nplane_local(lees_edw_t *le) {
 
-  assert(le);
+    assert(le);
 
-  return le->param->nplanelocal;
+    return le->param->nplanelocal;
 }
 
 /*****************************************************************************
@@ -360,14 +359,13 @@ int lees_edw_nplane_local(lees_edw_t * le) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_plane_uy(lees_edw_t * le, double * uy) {
+__host__ __device__ int lees_edw_plane_uy(lees_edw_t *le, double *uy) {
 
-  assert(le);
+    assert(le);
 
-  *uy = le->param->uy;
+    *uy = le->param->uy;
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -376,14 +374,13 @@ int lees_edw_plane_uy(lees_edw_t * le, double * uy) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_nxbuffer(lees_edw_t * le, int * nxb) {
+__host__ __device__ int lees_edw_nxbuffer(lees_edw_t *le, int *nxb) {
 
-  assert(le);
+    assert(le);
 
-  *nxb = le->param->nxbuffer;
+    *nxb = le->param->nxbuffer;
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -394,37 +391,37 @@ int lees_edw_nxbuffer(lees_edw_t * le, int * nxb) {
  *
  *****************************************************************************/
 
-__host__ int lees_edw_info(lees_edw_t * le) {
+__host__ int lees_edw_info(lees_edw_t *le) {
 
-  int np;
-  double gammadot;
+    int np;
+    double gammadot;
 
-  assert(le);
+    assert(le);
 
-  if (le->param->nplanetotal > 0) {
+    if (le->param->nplanetotal > 0) {
 
-    pe_info(le->pe, "\nLees-Edwards boundary conditions are active:\n");
+        pe_info(le->pe, "\nLees-Edwards boundary conditions are active:\n");
 
-    lees_edw_shear_rate(le, &gammadot);
+        lees_edw_shear_rate(le, &gammadot);
 
-    for (np = 0; np < le->param->nplanetotal; np++) {
-      pe_info(le->pe, "LE plane %d is at x = %d with speed %f\n", np+1,
-	   (int)(le->param->dx_min + np*le->param->dx_sep), le->param->uy);
-    }
+        for (np = 0; np < le->param->nplanetotal; np++) {
+            pe_info(le->pe, "LE plane %d is at x = %d with speed %f\n", np + 1, (int)(le->param->dx_min + np * le->param->dx_sep),
+                    le->param->uy);
+        }
 
-    if (le->param->type == LE_SHEAR_TYPE_STEADY) {
-      pe_info(le->pe, "Overall shear rate = %f\n", gammadot);
-    }
-    else {
-      pe_info(le->pe, "Oscillation period: %d time steps\n", le->param->period);
-      pe_info(le->pe, "Maximum shear rate = %f\n", gammadot);
-    }
+        if (le->param->type == LE_SHEAR_TYPE_STEADY) {
+            pe_info(le->pe, "Overall shear rate = %f\n", gammadot);
+        }
+        else {
+            pe_info(le->pe, "Oscillation period: %d time steps\n", le->param->period);
+            pe_info(le->pe, "Maximum shear rate = %f\n", gammadot);
+        }
 
-    pe_info(le->pe, "\n");
-    pe_info(le->pe, "Lees-Edwards time offset (time steps): %8d\n", le->param->nt0);
-  }
+        pe_info(le->pe, "\n");
+        pe_info(le->pe, "Lees-Edwards time offset (time steps): %8d\n", le->param->nt0);
+    }
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -435,163 +432,166 @@ __host__ int lees_edw_info(lees_edw_t * le) {
  *
  *****************************************************************************/
 
-static int lees_edw_init_tables(lees_edw_t * le) {
+static int lees_edw_init_tables(lees_edw_t *le) {
 
-  int ib, ic, ip, n, nb, nh, np;
-  int nhalo;
-  int rdims[3];
-  int cartsz[3];
-  MPI_Comm cartcomm;
+    int ib, ic, ip, n, nb, nh, np;
+    int nhalo;
+    int rdims[3];
+    int cartsz[3];
+    MPI_Comm cartcomm;
 
-  assert(le);
+    assert(le);
 
-  cs_nhalo(le->cs, &nhalo);
-  cs_nlocal(le->cs, le->param->nlocal);
-  cs_cartsz(le->cs, cartsz);
-  cs_cart_comm(le->cs, &cartcomm);
-  cs_strides(le->cs, le->param->str+X, le->param->str+Y, le->param->str+Z);
+    cs_nhalo(le->cs, &nhalo);
+    cs_nlocal(le->cs, le->param->nlocal);
+    cs_cartsz(le->cs, cartsz);
+    cs_cart_comm(le->cs, &cartcomm);
+    cs_strides(le->cs, le->param->str + X, le->param->str + Y, le->param->str + Z);
 
-  le->param->nhalo = nhalo;
-  le->param->nplanelocal = le->param->nplanetotal / cartsz[X];
+    le->param->nhalo = nhalo;
+    le->param->nplanelocal = le->param->nplanetotal / cartsz[X];
 
-  /* Look up table for buffer -> real index */
+    /* Look up table for buffer -> real index */
 
-  /* For each 'x' location in the buffer region, work out the corresponding
-   * x index in the real system:
-   *   - for each boundary there are 2*nhalo buffer planes
-   *   - the locations extend nhalo points either side of the boundary.
-   */
+    /* For each 'x' location in the buffer region, work out the corresponding
+     * x index in the real system:
+     *   - for each boundary there are 2*nhalo buffer planes
+     *   - the locations extend nhalo points either side of the boundary.
+     */
 
-  le->param->nxbuffer = 2*nhalo*le->param->nplanelocal;
+    le->param->nxbuffer = 2 * nhalo * le->param->nplanelocal;
 
-  if (le->param->nxbuffer > 0) {
-    le->icbuff_to_real = (int *) calloc(le->param->nxbuffer, sizeof(int));
-    if (le->icbuff_to_real == NULL) pe_fatal(le->pe, "calloc(le) failed\n");
-  }
+    if (le->param->nxbuffer > 0) {
+        le->icbuff_to_real = (int *)calloc(le->param->nxbuffer, sizeof(int));
+        if (le->icbuff_to_real == NULL)
+            pe_fatal(le->pe, "calloc(le) failed\n");
+    }
 
-  ib = 0;
-  for (n = 0; n < le->param->nplanelocal; n++) {
-    ic = lees_edw_plane_location(le, n) - (nhalo - 1);
-    for (nh = 0; nh < 2*nhalo; nh++) {
-      assert(ib < 2*nhalo*le->param->nplanelocal);
-      le->icbuff_to_real[ib] = ic + nh;
-      ib++;
+    ib = 0;
+    for (n = 0; n < le->param->nplanelocal; n++) {
+        ic = lees_edw_plane_location(le, n) - (nhalo - 1);
+        for (nh = 0; nh < 2 * nhalo; nh++) {
+            assert(ib < 2 * nhalo * le->param->nplanelocal);
+            le->icbuff_to_real[ib] = ic + nh;
+            ib++;
+        }
     }
-  }
-
-  /* Look up table for real -> buffer index */
-
-  /* For each x location in the real system, work out the index of
-   * the appropriate x-location in the buffer region. This is more
-   * complex, as it depends on whether you are looking across an
-   * LE boundary, and if so, in which direction.
-   * ie., we need a look up table = function(x, +/- dx).
-   * Note that this table exists when no planes are present, ie.,
-   * there is no transformation, ie., f(x, dx) = x + dx for all dx.
-   */
-
-  n = (le->param->nlocal[X] + 2*nhalo)*(2*nhalo + 1);
-  le->param->index_real_nbuffer = n;
-
-  le->icreal_to_buff = (int *) calloc(n, sizeof(int));
-  assert(le->icreal_to_buff);
-  if (le->icreal_to_buff == NULL) pe_fatal(le->pe, "calloc(le) failed\n");
-
-  /* Set table in abscence of planes. */
-  /* Note the elements of the table at the extreme edges of the local
-   * system point outside the system. Accesses must take care. */
-
-   for (ic = 1 - nhalo; ic <= le->param->nlocal[X] + nhalo; ic++) {
-     for (nh = -nhalo; nh <= nhalo; nh++) {
-       n = (ic + nhalo - 1)*(2*nhalo+1) + (nh + nhalo);
-       assert(n >= 0 && n < (le->param->nlocal[X] + 2*nhalo)*(2*nhalo + 1));
-       le->icreal_to_buff[n] = ic + nh;
-     }
-   }
-
-   /* For each position in the buffer, add appropriate
-    * corrections in the table. */
-
-   nb = le->param->nlocal[X] + nhalo + 1;
-
-   for (ib = 0; ib < le->param->nxbuffer; ib++) {
-     np = ib / (2*nhalo);
-     ip = lees_edw_plane_location(le, np);
-
-     /* This bit of logic chooses the first nhalo points of the
-      * buffer region for each plane as the 'downward' looking part */
-
-     if ((ib - np*2*nhalo) < nhalo) {
-
-       /* Looking across the plane in the -ve x-direction */
-
-       for (ic = ip + 1; ic <= ip + nhalo; ic++) {
-	 for (nh = -nhalo; nh <= -1; nh++) {
-	   if (ic + nh == le->icbuff_to_real[ib]) {
-	     n = (ic + nhalo - 1)*(2*nhalo+1) + (nh + nhalo);
-	     assert(n >= 0 && n < (le->param->nlocal[X] + 2*nhalo)*(2*nhalo + 1));
-	     le->icreal_to_buff[n] = nb+ib;
-	   }
-	 }
-       }
-     }
-     else {
-       /* looking across the plane in the +ve x-direction */
-
-       for (ic = ip - (nhalo - 1); ic <= ip; ic++) {
-	 for (nh = 1; nh <= nhalo; nh++) {
-	   if (ic + nh == le->icbuff_to_real[ib]) {
-	     n = (ic + nhalo - 1)*(2*nhalo+1) + (nh + nhalo);
-	     assert(n >= 0 && n < (le->param->nlocal[X] + 2*nhalo)*(2*nhalo + 1));
-	     le->icreal_to_buff[n] = nb+ib;	   
-	   }
-	 }
-       }
-     }
-     /* Next buffer point */
-   }
-
-   /* Buffer velocity jumps. When looking from the real system across
-    * a boundary into a given buffer, what is the associated velocity
-    * jump? This is +1 for 'looking up' and -1 for 'looking down'.*/
-
-   if (le->param->nxbuffer > 0) {
-     le->buffer_duy = (int *) calloc(le->param->nxbuffer, sizeof(int));
-     assert(le->buffer_duy);
-     if (le->buffer_duy == NULL) pe_fatal(le->pe,"calloc(buffer_duy) failed\n");
-   }
-
-  ib = 0;
-  for (n = 0; n < le->param->nplanelocal; n++) {
-    for (nh = 0; nh < nhalo; nh++) {
-      assert(ib < le->param->nxbuffer);
-      le->buffer_duy[ib] = -1;
-      ib++;
+
+    /* Look up table for real -> buffer index */
+
+    /* For each x location in the real system, work out the index of
+     * the appropriate x-location in the buffer region. This is more
+     * complex, as it depends on whether you are looking across an
+     * LE boundary, and if so, in which direction.
+     * ie., we need a look up table = function(x, +/- dx).
+     * Note that this table exists when no planes are present, ie.,
+     * there is no transformation, ie., f(x, dx) = x + dx for all dx.
+     */
+
+    n = (le->param->nlocal[X] + 2 * nhalo) * (2 * nhalo + 1);
+    le->param->index_real_nbuffer = n;
+
+    le->icreal_to_buff = (int *)calloc(n, sizeof(int));
+    assert(le->icreal_to_buff);
+    if (le->icreal_to_buff == NULL)
+        pe_fatal(le->pe, "calloc(le) failed\n");
+
+    /* Set table in abscence of planes. */
+    /* Note the elements of the table at the extreme edges of the local
+     * system point outside the system. Accesses must take care. */
+
+    for (ic = 1 - nhalo; ic <= le->param->nlocal[X] + nhalo; ic++) {
+        for (nh = -nhalo; nh <= nhalo; nh++) {
+            n = (ic + nhalo - 1) * (2 * nhalo + 1) + (nh + nhalo);
+            assert(n >= 0 && n < (le->param->nlocal[X] + 2 * nhalo) * (2 * nhalo + 1));
+            le->icreal_to_buff[n] = ic + nh;
+        }
+    }
+
+    /* For each position in the buffer, add appropriate
+     * corrections in the table. */
+
+    nb = le->param->nlocal[X] + nhalo + 1;
+
+    for (ib = 0; ib < le->param->nxbuffer; ib++) {
+        np = ib / (2 * nhalo);
+        ip = lees_edw_plane_location(le, np);
+
+        /* This bit of logic chooses the first nhalo points of the
+         * buffer region for each plane as the 'downward' looking part */
+
+        if ((ib - np * 2 * nhalo) < nhalo) {
+
+            /* Looking across the plane in the -ve x-direction */
+
+            for (ic = ip + 1; ic <= ip + nhalo; ic++) {
+                for (nh = -nhalo; nh <= -1; nh++) {
+                    if (ic + nh == le->icbuff_to_real[ib]) {
+                        n = (ic + nhalo - 1) * (2 * nhalo + 1) + (nh + nhalo);
+                        assert(n >= 0 && n < (le->param->nlocal[X] + 2 * nhalo) * (2 * nhalo + 1));
+                        le->icreal_to_buff[n] = nb + ib;
+                    }
+                }
+            }
+        }
+        else {
+            /* looking across the plane in the +ve x-direction */
+
+            for (ic = ip - (nhalo - 1); ic <= ip; ic++) {
+                for (nh = 1; nh <= nhalo; nh++) {
+                    if (ic + nh == le->icbuff_to_real[ib]) {
+                        n = (ic + nhalo - 1) * (2 * nhalo + 1) + (nh + nhalo);
+                        assert(n >= 0 && n < (le->param->nlocal[X] + 2 * nhalo) * (2 * nhalo + 1));
+                        le->icreal_to_buff[n] = nb + ib;
+                    }
+                }
+            }
+        }
+        /* Next buffer point */
+    }
+
+    /* Buffer velocity jumps. When looking from the real system across
+     * a boundary into a given buffer, what is the associated velocity
+     * jump? This is +1 for 'looking up' and -1 for 'looking down'.*/
+
+    if (le->param->nxbuffer > 0) {
+        le->buffer_duy = (int *)calloc(le->param->nxbuffer, sizeof(int));
+        assert(le->buffer_duy);
+        if (le->buffer_duy == NULL)
+            pe_fatal(le->pe, "calloc(buffer_duy) failed\n");
     }
-    for (nh = 0; nh < nhalo; nh++) {
-      assert(ib < le->param->nxbuffer);
-      le->buffer_duy[ib] = +1;
-      ib++;
+
+    ib = 0;
+    for (n = 0; n < le->param->nplanelocal; n++) {
+        for (nh = 0; nh < nhalo; nh++) {
+            assert(ib < le->param->nxbuffer);
+            le->buffer_duy[ib] = -1;
+            ib++;
+        }
+        for (nh = 0; nh < nhalo; nh++) {
+            assert(ib < le->param->nxbuffer);
+            le->buffer_duy[ib] = +1;
+            ib++;
+        }
     }
-  }
 
-  /* Set up a 1-dimensional communicator for transfer of data
-   * along the y-direction. */
+    /* Set up a 1-dimensional communicator for transfer of data
+     * along the y-direction. */
 
-  rdims[X] = 0;
-  rdims[Y] = 1;
-  rdims[Z] = 0;
-  MPI_Cart_sub(cartcomm, rdims, &le->le_comm);
+    rdims[X] = 0;
+    rdims[Y] = 1;
+    rdims[Z] = 0;
+    MPI_Cart_sub(cartcomm, rdims, &le->le_comm);
 
-  /* Plane communicator in yz, or x = const. */
+    /* Plane communicator in yz, or x = const. */
 
-  rdims[X] = 0;
-  rdims[Y] = 1;
-  rdims[Z] = 1;
+    rdims[X] = 0;
+    rdims[Y] = 1;
+    rdims[Z] = 1;
 
-  MPI_Cart_sub(cartcomm, rdims, &le->le_plane_comm);
+    MPI_Cart_sub(cartcomm, rdims, &le->le_plane_comm);
 
-  return 0;
+    return 0;
 }
 
 /****************************************************************************
@@ -602,48 +602,50 @@ static int lees_edw_init_tables(lees_edw_t * le) {
  *  periodic halo regions.
  *
  ****************************************************************************/
- 
-static int lees_edw_checks(lees_edw_t * le) {
 
-  int n;
-  int ic;
-  int ifail_local = 0;
-  int ifail_global;
-  int cartsz[3];
-  MPI_Comm cartcomm;
+static int lees_edw_checks(lees_edw_t *le) {
+
+    int n;
+    int ic;
+    int ifail_local = 0;
+    int ifail_global;
+    int cartsz[3];
+    MPI_Comm cartcomm;
 
-  assert(le);
+    assert(le);
 
-  cs_cartsz(le->cs, cartsz);
-  cs_cart_comm(le->cs, &cartcomm);
+    cs_cartsz(le->cs, cartsz);
+    cs_cart_comm(le->cs, &cartcomm);
 
-  /* From the local viewpoint, there must be no planes at either
-   * x = 1 or x = nlocal[X] (or indeed, within nhalo points of
-   * a processor or periodic boundary). */
+    /* From the local viewpoint, there must be no planes at either
+     * x = 1 or x = nlocal[X] (or indeed, within nhalo points of
+     * a processor or periodic boundary). */
 
-  for (n = 0; n < le->param->nplanelocal; n++) {
-    ic = lees_edw_plane_location(le, n);
-    if (ic <= le->param->nhalo) ifail_local = 1;
-    if (ic  > le->param->nlocal[X] - le->param->nhalo) ifail_local = 1;
-  }
+    for (n = 0; n < le->param->nplanelocal; n++) {
+        ic = lees_edw_plane_location(le, n);
+        if (ic <= le->param->nhalo)
+            ifail_local = 1;
+        if (ic > le->param->nlocal[X] - le->param->nhalo)
+            ifail_local = 1;
+    }
 
-  MPI_Allreduce(&ifail_local, &ifail_global, 1, MPI_INT, MPI_LOR, cartcomm);
+    MPI_Allreduce(&ifail_local, &ifail_global, 1, MPI_INT, MPI_LOR, cartcomm);
 
-  if (ifail_global) {
-    pe_fatal(le->pe, "Wall at domain boundary\n");
-  }
+    if (ifail_global) {
+        pe_fatal(le->pe, "Wall at domain boundary\n");
+    }
 
-  /* As nplane_local = ntotal/cartsz[X] (integer division) we must have
-   * ntotal % cartsz[X] = 0 */
+    /* As nplane_local = ntotal/cartsz[X] (integer division) we must have
+     * ntotal % cartsz[X] = 0 */
 
-  if ((le->param->nplanetotal % cartsz[X]) != 0) {
-    pe_info(le->pe, "\n");
-    pe_info(le->pe, "Must have a uniform number of planes per process\n");
-    pe_info(le->pe, "Eg., use one plane per process.\n");
-    pe_fatal(le->pe, "Please check and try again.\n");
-  }
+    if ((le->param->nplanetotal % cartsz[X]) != 0) {
+        pe_info(le->pe, "\n");
+        pe_info(le->pe, "Must have a uniform number of planes per process\n");
+        pe_info(le->pe, "Eg., use one plane per process.\n");
+        pe_fatal(le->pe, "Please check and try again.\n");
+    }
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -655,16 +657,14 @@ static int lees_edw_checks(lees_edw_t * le) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_nsites(lees_edw_t * le, int * nsites) {
+__host__ __device__ int lees_edw_nsites(lees_edw_t *le, int *nsites) {
 
-  assert(le);
+    assert(le);
 
-  *nsites = (le->param->nlocal[X] + 2*le->param->nhalo + le->param->nxbuffer)
-    *(le->param->nlocal[Y] + 2*le->param->nhalo)
-    *(le->param->nlocal[Z] + 2*le->param->nhalo);
+    *nsites = (le->param->nlocal[X] + 2 * le->param->nhalo + le->param->nxbuffer) * (le->param->nlocal[Y] + 2 * le->param->nhalo) *
+              (le->param->nlocal[Z] + 2 * le->param->nhalo);
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -678,31 +678,30 @@ int lees_edw_nsites(lees_edw_t * le, int * nsites) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_steady_uy(lees_edw_t * le, int ic, double * uy) {
+__host__ __device__ int lees_edw_steady_uy(lees_edw_t *le, int ic, double *uy) {
+
+    int offset[3];
+    int nplane;
+    double gammadot;
+    double xglobal;
 
-  int offset[3];
-  int nplane;
-  double gammadot;
-  double xglobal;
+    assert(le);
+    assert(le->param->type == LE_SHEAR_TYPE_STEADY);
 
-  assert(le);
-  assert(le->param->type == LE_SHEAR_TYPE_STEADY);
+    cs_nlocal_offset(le->cs, offset);
+    lees_edw_shear_rate(le, &gammadot);
 
-  cs_nlocal_offset(le->cs, offset);
-  lees_edw_shear_rate(le, &gammadot);
+    /* The shear profile is linear, so the local velocity is just a
+     * function of position, modulo the number of planes encountered
+     * since the origin. The planes are half way between sites giving
+     * the - 0.5. */
 
-  /* The shear profile is linear, so the local velocity is just a
-   * function of position, modulo the number of planes encountered
-   * since the origin. The planes are half way between sites giving
-   * the - 0.5. */
+    xglobal = offset[X] + (double)ic - 0.5;
+    nplane = (int)((le->param->dx_min + xglobal) / le->param->dx_sep);
 
-  xglobal = offset[X] + (double) ic - 0.5;
-  nplane = (int) ((le->param->dx_min + xglobal)/le->param->dx_sep);
+    *uy = xglobal * gammadot - le->param->uy * nplane;
 
-  *uy = xglobal*gammadot - le->param->uy*nplane;
- 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -718,36 +717,35 @@ int lees_edw_steady_uy(lees_edw_t * le, int ic, double * uy) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_block_uy(lees_edw_t * le, int ic, double * uy) {
+__host__ __device__ int lees_edw_block_uy(lees_edw_t *le, int ic, double *uy) {
 
-  int offset[3];
-  int n;
-  double xh;
-  double lmin[3];
-  double ltot[3];
+    int offset[3];
+    int n;
+    double xh;
+    double lmin[3];
+    double ltot[3];
 
-  assert(le);
-  assert(le->param->type == LE_SHEAR_TYPE_STEADY);
+    assert(le);
+    assert(le->param->type == LE_SHEAR_TYPE_STEADY);
 
-  cs_lmin(le->cs, lmin);
-  cs_ltot(le->cs, ltot);
-  cs_nlocal_offset(le->cs, offset);
+    cs_lmin(le->cs, lmin);
+    cs_ltot(le->cs, ltot);
+    cs_nlocal_offset(le->cs, offset);
 
-  /* So, just count the number of blocks from the centre L_x/2
-   * and mutliply by the plane speed. */
+    /* So, just count the number of blocks from the centre L_x/2
+     * and multiply by the plane speed. */
 
-  xh = offset[X] + (double) ic - lmin[X] - 0.5*ltot[X];
-  if (xh > 0.0) {
-    n = (0.5 + xh/le->param->dx_sep);
-  }
-  else {
-    n = (-0.5 + xh/le->param->dx_sep);
-  }
+    xh = offset[X] + (double)ic - lmin[X] - 0.5 * ltot[X];
+    if (xh > 0.0) {
+        n = (0.5 + xh / le->param->dx_sep);
+    }
+    else {
+        n = (-0.5 + xh / le->param->dx_sep);
+    }
 
-  *uy = le->param->uy*n;
+    *uy = le->param->uy * n;
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -758,20 +756,20 @@ int lees_edw_block_uy(lees_edw_t * le, int ic, double * uy) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_plane_uy_now(lees_edw_t * le, double t, double * uy) {
+__host__ __device__ int lees_edw_plane_uy_now(lees_edw_t *le, double t, double *uy) {
 
-  double tle;
+    double tle;
 
-  assert(le);
+    assert(le);
 
-  tle = t - le->param->time0;
-  assert(tle >= 0.0);
+    tle = t - le->param->time0;
+    assert(tle >= 0.0);
 
-  *uy = le->param->uy;
-  if (le->param->type == LE_SHEAR_TYPE_OSCILLATORY) *uy *= cos(le->param->omega*tle);
+    *uy = le->param->uy;
+    if (le->param->type == LE_SHEAR_TYPE_OSCILLATORY)
+        *uy *= cos(le->param->omega * tle);
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -783,25 +781,24 @@ int lees_edw_plane_uy_now(lees_edw_t * le, double t, double * uy) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_plane_location(lees_edw_t * le, int np) {
+__host__ __device__ int lees_edw_plane_location(lees_edw_t *le, int np) {
 
-  int offset[3];
-  int nplane_offset;
-  int cartcoords[3];
-  int ix;
+    int offset[3];
+    int nplane_offset;
+    int cartcoords[3];
+    int ix;
 
-  assert(le);
-  assert(le->cs);
-  assert(np >= 0 && np < le->param->nplanelocal);
+    assert(le);
+    assert(le->cs);
+    assert(np >= 0 && np < le->param->nplanelocal);
 
-  cs_cart_coords(le->cs, cartcoords);
-  cs_nlocal_offset(le->cs, offset);
-  nplane_offset = cartcoords[X]*le->param->nplanelocal;
+    cs_cart_coords(le->cs, cartcoords);
+    cs_nlocal_offset(le->cs, offset);
+    nplane_offset = cartcoords[X] * le->param->nplanelocal;
 
-  ix = le->param->dx_min + (np + nplane_offset)*le->param->dx_sep - offset[X];
+    ix = le->param->dx_min + (np + nplane_offset) * le->param->dx_sep - offset[X];
 
-  return ix;
+    return ix;
 }
 
 /*****************************************************************************
@@ -813,22 +810,21 @@ int lees_edw_plane_location(lees_edw_t * le, int np) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_index_real_to_buffer(lees_edw_t * le,  int ic,  int idisplace) {
+__host__ __device__ int lees_edw_index_real_to_buffer(lees_edw_t *le, int ic, int idisplace) {
 
-  int ib;
+    int ib;
 
-  assert(le);
-  assert(le->icreal_to_buff);
+    assert(le);
+    assert(le->icreal_to_buff);
 
-  assert(idisplace >= -le->param->nhalo && idisplace <= +le->param->nhalo);
+    assert(idisplace >= -le->param->nhalo && idisplace <= +le->param->nhalo);
 
-  ib = (ic + le->param->nhalo - 1)*(2*le->param->nhalo + 1) + idisplace + le->param->nhalo;
+    ib = (ic + le->param->nhalo - 1) * (2 * le->param->nhalo + 1) + idisplace + le->param->nhalo;
 
-  assert(ib >= 0 && ib < le->param->index_real_nbuffer);
+    assert(ib >= 0 && ib < le->param->index_real_nbuffer);
 
-  assert(le->icreal_to_buff[ib] == lees_edw_ic_to_buff(le, ic, idisplace));
-  return le->icreal_to_buff[ib];
+    assert(le->icreal_to_buff[ib] == lees_edw_ic_to_buff(le, ic, idisplace));
+    return le->icreal_to_buff[ib];
 }
 
 /*****************************************************************************
@@ -840,15 +836,14 @@ int lees_edw_index_real_to_buffer(lees_edw_t * le,  int ic,  int idisplace) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_index_buffer_to_real(lees_edw_t * le, int ib) {
+__host__ __device__ int lees_edw_index_buffer_to_real(lees_edw_t *le, int ib) {
 
-  assert(le);
-  assert(le->icbuff_to_real);
-  assert(ib >=0 && ib < le->param->nxbuffer);
+    assert(le);
+    assert(le->icbuff_to_real);
+    assert(ib >= 0 && ib < le->param->nxbuffer);
 
-  assert(le->icbuff_to_real[ib] == lees_edw_ibuff_to_real(le, ib));
-  return le->icbuff_to_real[ib];
+    assert(le->icbuff_to_real[ib] == lees_edw_ibuff_to_real(le, ib));
+    return le->icbuff_to_real[ib];
 }
 
 /*****************************************************************************
@@ -864,29 +859,25 @@ int lees_edw_index_buffer_to_real(lees_edw_t * le, int ib) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_buffer_displacement(lees_edw_t * le, int ib, double t, double * dy) {
-
-  double tle;
-
-  assert(le);
-  assert(ib >= 0 && ib < le->param->nxbuffer);
+__host__ __device__ int lees_edw_buffer_displacement(lees_edw_t *le, int ib, double t, double *dy) {
+    double tle;
+    assert(le);
+    assert(ib >= 0 && ib < le->param->nxbuffer);
 
-  tle = t - le->param->time0;
-  assert(tle >= 0.0);
+    tle = t - le->param->time0;
+    assert(tle >= 0.0);
 
-  *dy = 0.0;
+    *dy = 0.0;
 
-  if (le->param->type == LE_SHEAR_TYPE_STEADY) {
-    *dy = tle*le->param->uy*le->buffer_duy[ib];
-    assert(le->buffer_duy[ib] == lees_edw_buffer_duy(le, ib));
-  }
-
-  if (le->param->type == LE_SHEAR_TYPE_OSCILLATORY) {
-    *dy = le->param->uy*sin(le->param->omega*tle)/le->param->omega;
-  }
+    if (le->param->type == LE_SHEAR_TYPE_STEADY) {
+        *dy = tle * le->param->uy * le->buffer_duy[ib];
+        assert(le->buffer_duy[ib] == lees_edw_buffer_duy(le, ib));
+    }
 
-  return 0;
+    if (le->param->type == LE_SHEAR_TYPE_OSCILLATORY) {
+        *dy = le->param->uy * sin(le->param->omega * tle) / le->param->omega;
+    }
+    return 0;
 }
 
 /*****************************************************************************
@@ -897,13 +888,13 @@ int lees_edw_buffer_displacement(lees_edw_t * le, int ib, double t, double * dy)
  *
  *****************************************************************************/
 
-__host__ int lees_edw_comm(lees_edw_t * le, MPI_Comm * comm) {
+__host__ int lees_edw_comm(lees_edw_t *le, MPI_Comm *comm) {
 
-  assert(le);
+    assert(le);
 
-  *comm = le->le_comm;
+    *comm = le->le_comm;
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -912,13 +903,13 @@ __host__ int lees_edw_comm(lees_edw_t * le, MPI_Comm * comm) {
  *
  *****************************************************************************/
 
-__host__ int lees_edw_plane_comm(lees_edw_t * le, MPI_Comm * comm) {
+__host__ int lees_edw_plane_comm(lees_edw_t *le, MPI_Comm *comm) {
 
-  assert(le);
+    assert(le);
 
-  *comm = le->le_plane_comm;
+    *comm = le->le_plane_comm;
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -931,39 +922,37 @@ __host__ int lees_edw_plane_comm(lees_edw_t * le, MPI_Comm * comm) {
  *
  *****************************************************************************/
 
-__host__
-int lees_edw_jstart_to_mpi_ranks(lees_edw_t * le, const int j1, int send[3],
-				 int recv[3]) {
+__host__ int lees_edw_jstart_to_mpi_ranks(lees_edw_t *le, const int j1, int send[3], int recv[3]) {
 
-  int cartcoords[3];
-  int pe_carty1, pe_carty2, pe_carty3;
+    int cartcoords[3];
+    int pe_carty1, pe_carty2, pe_carty3;
 
-  assert(le);
-  assert(le->cs);
+    assert(le);
+    assert(le->cs);
 
-  cs_cart_coords(le->cs, cartcoords);
+    cs_cart_coords(le->cs, cartcoords);
 
-  /* Receive from ... */
+    /* Receive from ... */
 
-  pe_carty1 = (j1 - 1) / le->param->nlocal[Y];
-  pe_carty2 = pe_carty1 + 1;
-  pe_carty3 = pe_carty1 + 2;
+    pe_carty1 = (j1 - 1) / le->param->nlocal[Y];
+    pe_carty2 = pe_carty1 + 1;
+    pe_carty3 = pe_carty1 + 2;
 
-  MPI_Cart_rank(le->le_comm, &pe_carty1, recv);
-  MPI_Cart_rank(le->le_comm, &pe_carty2, recv + 1);
-  MPI_Cart_rank(le->le_comm, &pe_carty3, recv + 2);
+    MPI_Cart_rank(le->le_comm, &pe_carty1, recv);
+    MPI_Cart_rank(le->le_comm, &pe_carty2, recv + 1);
+    MPI_Cart_rank(le->le_comm, &pe_carty3, recv + 2);
 
-  /* Send to ... */
+    /* Send to ... */
 
-  pe_carty1 = cartcoords[Y] - (((j1 - 1)/le->param->nlocal[Y]) - cartcoords[Y]);
-  pe_carty2 = pe_carty1 - 1;
-  pe_carty3 = pe_carty1 - 2;
+    pe_carty1 = cartcoords[Y] - (((j1 - 1) / le->param->nlocal[Y]) - cartcoords[Y]);
+    pe_carty2 = pe_carty1 - 1;
+    pe_carty3 = pe_carty1 - 2;
 
-  MPI_Cart_rank(le->le_comm, &pe_carty1, send);
-  MPI_Cart_rank(le->le_comm, &pe_carty2, send + 1);
-  MPI_Cart_rank(le->le_comm, &pe_carty3, send + 2);
+    MPI_Cart_rank(le->le_comm, &pe_carty1, send);
+    MPI_Cart_rank(le->le_comm, &pe_carty2, send + 1);
+    MPI_Cart_rank(le->le_comm, &pe_carty3, send + 2);
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -974,17 +963,16 @@ int lees_edw_jstart_to_mpi_ranks(lees_edw_t * le, const int j1, int send[3],
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_shear_rate(lees_edw_t * le, double * gammadot) {
+__host__ __device__ int lees_edw_shear_rate(lees_edw_t *le, double *gammadot) {
 
-  double ltot[3];
+    double ltot[3];
 
-  assert(le);
-  cs_ltot(le->cs, ltot);
+    assert(le);
+    cs_ltot(le->cs, ltot);
 
-  *gammadot = le->param->uy*le->param->nplanetotal/ltot[X];
+    *gammadot = le->param->uy * le->param->nplanetotal / ltot[X];
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -997,21 +985,19 @@ int lees_edw_shear_rate(lees_edw_t * le, double * gammadot) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_index(lees_edw_t * le, int ic, int jc, int kc) {
+__host__ __device__ int lees_edw_index(lees_edw_t *le, int ic, int jc, int kc) {
 
-  assert(le);
+    assert(le);
 
-  assert(ic >= 1-le->param->nhalo);
-  assert(jc >= 1-le->param->nhalo);
-  assert(kc >= 1-le->param->nhalo);
-  assert(ic <= le->param->nlocal[X] + le->param->nhalo + le->param->nxbuffer);
-  assert(jc <= le->param->nlocal[Y] + le->param->nhalo);
-  assert(kc <= le->param->nlocal[Z] + le->param->nhalo);
+    assert(ic >= 1 - le->param->nhalo);
+    assert(jc >= 1 - le->param->nhalo);
+    assert(kc >= 1 - le->param->nhalo);
+    assert(ic <= le->param->nlocal[X] + le->param->nhalo + le->param->nxbuffer);
+    assert(jc <= le->param->nlocal[Y] + le->param->nhalo);
+    assert(kc <= le->param->nlocal[Z] + le->param->nhalo);
 
-  return (le->param->str[X]*(le->param->nhalo + ic - 1) +
-	  le->param->str[Y]*(le->param->nhalo + jc - 1) +
-	  le->param->str[Z]*(le->param->nhalo + kc - 1));
+    return (le->param->str[X] * (le->param->nhalo + ic - 1) + le->param->str[Y] * (le->param->nhalo + jc - 1) +
+            le->param->str[Z] * (le->param->nhalo + kc - 1));
 }
 
 /*****************************************************************************
@@ -1020,18 +1006,16 @@ int lees_edw_index(lees_edw_t * le, int ic, int jc, int kc) {
  *
  *****************************************************************************/
 
-__host__ __device__ void lees_edw_index_v(lees_edw_t * le, int ic[NSIMDVL],
-					  int jc[NSIMDVL], int kc[NSIMDVL],
-					  int index[NSIMDVL]) {
-  int iv;
-  assert(le);
+__host__ __device__ void lees_edw_index_v(lees_edw_t *le, int ic[NSIMDVL], int jc[NSIMDVL], int kc[NSIMDVL], int index[NSIMDVL]) {
+    int iv;
+    assert(le);
 
-  for (iv = 0; iv < NSIMDVL; iv++) {
-    index[iv] = lees_edw_index(le, ic[iv], jc[iv], kc[iv]);
-  }
+    for (iv = 0; iv < NSIMDVL; iv++) {
+        index[iv] = lees_edw_index(le, ic[iv], jc[iv], kc[iv]);
+    }
 
-  return;
-} 
+    return;
+}
 
 /*****************************************************************************
  *
@@ -1039,12 +1023,11 @@ __host__ __device__ void lees_edw_index_v(lees_edw_t * le, int ic[NSIMDVL],
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_nlocal(lees_edw_t * le, int nlocal[3]) {
+__host__ __device__ int lees_edw_nlocal(lees_edw_t *le, int nlocal[3]) {
 
-  assert(le);
+    assert(le);
 
-  return cs_nlocal(le->cs, nlocal);
+    return cs_nlocal(le->cs, nlocal);
 }
 
 /*****************************************************************************
@@ -1053,12 +1036,11 @@ int lees_edw_nlocal(lees_edw_t * le, int nlocal[3]) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_strides(lees_edw_t * le, int * xs, int * ys, int * zs) {
+__host__ __device__ int lees_edw_strides(lees_edw_t *le, int *xs, int *ys, int *zs) {
 
-  assert(le);
+    assert(le);
 
-  return cs_strides(le->cs, xs, ys, zs);
+    return cs_strides(le->cs, xs, ys, zs);
 }
 
 /*****************************************************************************
@@ -1067,12 +1049,11 @@ int lees_edw_strides(lees_edw_t * le, int * xs, int * ys, int * zs) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_nhalo(lees_edw_t * le, int * nhalo) {
+__host__ __device__ int lees_edw_nhalo(lees_edw_t *le, int *nhalo) {
 
-  assert(le);
+    assert(le);
 
-  return cs_nhalo(le->cs, nhalo);
+    return cs_nhalo(le->cs, nhalo);
 }
 
 /*****************************************************************************
@@ -1081,12 +1062,11 @@ int lees_edw_nhalo(lees_edw_t * le, int * nhalo) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_ltot(lees_edw_t * le, double ltot[3]) {
+__host__ __device__ int lees_edw_ltot(lees_edw_t *le, double ltot[3]) {
 
-  assert(le);
+    assert(le);
 
-  return cs_ltot(le->cs, ltot);
+    return cs_ltot(le->cs, ltot);
 }
 
 /*****************************************************************************
@@ -1095,13 +1075,12 @@ int lees_edw_ltot(lees_edw_t * le, double ltot[3]) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_cartsz(lees_edw_t * le, int cartsz[3]) {
+__host__ __device__ int lees_edw_cartsz(lees_edw_t *le, int cartsz[3]) {
 
-  assert(le);
-  assert(le->cs);
+    assert(le);
+    assert(le->cs);
 
-  return cs_cartsz(le->cs, cartsz);
+    return cs_cartsz(le->cs, cartsz);
 }
 
 /*****************************************************************************
@@ -1110,12 +1089,11 @@ int lees_edw_cartsz(lees_edw_t * le, int cartsz[3]) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_ntotal(lees_edw_t * le, int ntotal[3]) {
+__host__ __device__ int lees_edw_ntotal(lees_edw_t *le, int ntotal[3]) {
 
-  assert(le);
+    assert(le);
 
-  return cs_ntotal(le->cs, ntotal);
+    return cs_ntotal(le->cs, ntotal);
 }
 
 /*****************************************************************************
@@ -1124,12 +1102,11 @@ int lees_edw_ntotal(lees_edw_t * le, int ntotal[3]) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_nlocal_offset(lees_edw_t * le, int noffset[3]) {
+__host__ __device__ int lees_edw_nlocal_offset(lees_edw_t *le, int noffset[3]) {
 
-  assert(le);
+    assert(le);
 
-  return cs_nlocal_offset(le->cs, noffset);
+    return cs_nlocal_offset(le->cs, noffset);
 }
 
 /*****************************************************************************
@@ -1138,12 +1115,11 @@ int lees_edw_nlocal_offset(lees_edw_t * le, int noffset[3]) {
  *
  *****************************************************************************/
 
-__host__ __device__
-int lees_edw_cart_coords(lees_edw_t * le, int cartcoord[3]) {
+__host__ __device__ int lees_edw_cart_coords(lees_edw_t *le, int cartcoord[3]) {
 
-  assert(le);
+    assert(le);
 
-  return cs_cart_coords(le->cs, cartcoord);
+    return cs_cart_coords(le->cs, cartcoord);
 }
 
 /****************************************************************************
@@ -1154,18 +1130,18 @@ int lees_edw_cart_coords(lees_edw_t * le, int cartcoord[3]) {
  *
  ****************************************************************************/
 
-__host__ __device__ int lees_edw_plane_dy(lees_edw_t * le, double * dy) {
+__host__ __device__ int lees_edw_plane_dy(lees_edw_t *le, double *dy) {
 
-  double t;
+    double t;
 
-  assert(le);
-  assert(le->phys);
-  assert(dy);
+    assert(le);
+    assert(le->phys);
+    assert(dy);
 
-  physics_control_time(le->phys, &t);
-  *dy = t*le->param->uy;
+    physics_control_time(le->phys, &t);
+    *dy = t * le->param->uy;
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -1177,18 +1153,17 @@ __host__ __device__ int lees_edw_plane_dy(lees_edw_t * le, double * dy) {
  *
  *****************************************************************************/
 
-__host__ int lees_edw_buffer_dy(lees_edw_t * le, int ib, double t0,
-				double * dy) {
+__host__ int lees_edw_buffer_dy(lees_edw_t *le, int ib, double t0, double *dy) {
 
-  double t;
+    double t;
 
-  assert(le);
-  assert(le->phys);
+    assert(le);
+    assert(le->phys);
 
-  physics_control_time(le->phys, &t);
-  lees_edw_buffer_displacement(le, ib, t+t0, dy);
+    physics_control_time(le->phys, &t);
+    lees_edw_buffer_displacement(le, ib, t + t0, dy);
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -1199,22 +1174,22 @@ __host__ int lees_edw_buffer_dy(lees_edw_t * le, int ib, double t0,
  *
  *****************************************************************************/
 
-__host__ int lees_edw_buffer_du(lees_edw_t * le, int ib, double ule[3]) {
+__host__ int lees_edw_buffer_du(lees_edw_t *le, int ib, double ule[3]) {
 
-  assert(le);
-  assert(ib >= 0 && ib < le->param->nxbuffer);
+    assert(le);
+    assert(ib >= 0 && ib < le->param->nxbuffer);
 
-  if (le->param->type == LE_SHEAR_TYPE_STEADY) {
-    ule[X] = 0.0;
-    ule[Y] = le->param->uy*le->buffer_duy[ib];
-    assert(le->buffer_duy[ib] == lees_edw_buffer_duy(le, ib));
-    ule[Z] = 0.0;
-  }
-  else {
-    assert(0); /* Check delta u as function of (ib,t) */
-  }
+    if (le->param->type == LE_SHEAR_TYPE_STEADY) {
+        ule[X] = 0.0;
+        ule[Y] = le->param->uy * le->buffer_duy[ib];
+        assert(le->buffer_duy[ib] == lees_edw_buffer_duy(le, ib));
+        ule[Z] = 0.0;
+    }
+    else {
+        assert(0); /* Check delta u as function of (ib,t) */
+    }
 
-  return 0;
+    return 0;
 }
 
 /*****************************************************************************
@@ -1223,20 +1198,20 @@ __host__ int lees_edw_buffer_du(lees_edw_t * le, int ib, double ule[3]) {
  *
  *****************************************************************************/
 
-__host__ __device__ int lees_edw_ibuff_to_real(lees_edw_t * le, int ib) {
+__host__ __device__ int lees_edw_ibuff_to_real(lees_edw_t *le, int ib) {
 
-  int ic;
-  int p;
+    int ic;
+    int p;
 
-  assert(le);
-  assert(ib >= 0 && ib < le->param->nxbuffer);
+    assert(le);
+    assert(ib >= 0 && ib < le->param->nxbuffer);
 
-  p = ib / (2*le->param->nhalo);
+    p = ib / (2 * le->param->nhalo);
 
-  ic = lees_edw_plane_location(le, p) - (le->param->nhalo - 1);
-  ic = ic + ib % (2*le->param->nhalo);
+    ic = lees_edw_plane_location(le, p) - (le->param->nhalo - 1);
+    ic = ic + ib % (2 * le->param->nhalo);
 
-  return ic;
+    return ic;
 }
 
 /******************************************************************************
@@ -1245,41 +1220,40 @@ __host__ __device__ int lees_edw_ibuff_to_real(lees_edw_t * le, int ib) {
  *
  ******************************************************************************/
 
-__host__ __device__
-int lees_edw_ic_to_buff(lees_edw_t * le, int ic, int di) {
+__host__ __device__ int lees_edw_ic_to_buff(lees_edw_t *le, int ic, int di) {
 
-  int ib;
-  int p, ip;
-  int nh;
+    int ib;
+    int p, ip;
+    int nh;
 
-  assert(le);
-  assert(di <= le->param->nhalo);
-  assert(di >= -le->param->nhalo);
+    assert(le);
+    assert(di <= le->param->nhalo);
+    assert(di >= -le->param->nhalo);
 
-  ib = ic + di;
+    ib = ic + di;
 
-  if (le->param->nplanelocal > 0) {
+    if (le->param->nplanelocal > 0) {
 
-    p = ic / (le->param->nlocal[X]/le->param->nplanelocal);
-    p = imax(0, imin(p, le->param->nplanelocal - 1));
+        p = ic / (le->param->nlocal[X] / le->param->nplanelocal);
+        p = imax(0, imin(p, le->param->nplanelocal - 1));
 
-    nh = le->param->nhalo;
-    ip = lees_edw_plane_location(le, p) - (nh - 1);
+        nh = le->param->nhalo;
+        ip = lees_edw_plane_location(le, p) - (nh - 1);
 
-    if (di > 0 && (ic >= ip && ic < ip + nh) && (ic + di >= ip + nh)) {
-      ib = le->param->nlocal[X] + (1 + 2*p)*nh + (ic - ip + 1) + di;
-      return ib;
-    }
+        if (di > 0 && (ic >= ip && ic < ip + nh) && (ic + di >= ip + nh)) {
+            ib = le->param->nlocal[X] + (1 + 2 * p) * nh + (ic - ip + 1) + di;
+            return ib;
+        }
 
-    ip = lees_edw_plane_location(le, p) + 1;
+        ip = lees_edw_plane_location(le, p) + 1;
 
-    if (di < 0 && (ic >= ip && ic < ip + nh) && (ic + di < ip)) {
-      ib = le->param->nlocal[X] + (2 + 2*p)*nh + (ic - ip + 1) + di;
-      return ib;
+        if (di < 0 && (ic >= ip && ic < ip + nh) && (ic + di < ip)) {
+            ib = le->param->nlocal[X] + (2 + 2 * p) * nh + (ic - ip + 1) + di;
+            return ib;
+        }
     }
-  }
 
-  return ib;
+    return ib;
 }
 
 /******************************************************************************
@@ -1288,15 +1262,16 @@ int lees_edw_ic_to_buff(lees_edw_t * le, int ic, int di) {
  *
  ******************************************************************************/
 
-__host__ __device__ int lees_edw_buffer_duy(lees_edw_t * le, int ib) {
+__host__ __device__ int lees_edw_buffer_duy(lees_edw_t *le, int ib) {
 
-  int pm1;
+    int pm1;
 
-  assert(le);
-  assert(ib >= 0 && ib < le->param->nxbuffer);
+    assert(le);
+    assert(ib >= 0 && ib < le->param->nxbuffer);
 
-  pm1 = +1;
-  if (ib % (2*le->param->nhalo) < le->param->nhalo) pm1 = -1;
+    pm1 = +1;
+    if (ib % (2 * le->param->nhalo) < le->param->nhalo)
+        pm1 = -1;
 
-  return pm1;
+    return pm1;
 }
diff --git a/src/model_le.c b/src/model_le.c
index d214efe9d..48cae42ab 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -31,8 +31,11 @@
 #include "timer.h"
 #include "util.h"
 
+#include "leesedwards.h"
+
 __global__ static void le_reproject(lb_t *lb, lees_edw_t *le);
-static int le_displace_and_interpolate(lb_t *lb, lees_edw_t *le);
+__global__ static void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le, double *recv_buff, 
+    int nprop, int negprop, int *positive, int *negative);
 static int le_displace_and_interpolate_parallel(lb_t *lb, lees_edw_t *le);
 
 void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
@@ -61,6 +64,36 @@ void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
     cudaMemcpy(&(d_model->nvel), &(h_model->nvel), sizeof(int8_t), cudaMemcpyHostToDevice);
     cudaMemcpy(&(d_model->cs2), &(h_model->cs2), sizeof(double), cudaMemcpyHostToDevice);
 }
+
+/* Copy the nxbuffer ints of host array h_buffer_duy into a newly allocated device
+ * array and store its address in d_lees_edw->buffer_duy (a device-side structure).
+ * Returns cudaSuccess or the first CUDA error; on error the new array is freed.
+ * An array already stored in d_lees_edw->buffer_duy is not released here. */
+cudaError_t copy_buffer_duy_to_device(lees_edw_s* d_lees_edw, int* h_buffer_duy, size_t nxbuffer) {
+    int* d_buffer_duy = NULL;
+    cudaError_t err = cudaMalloc((void**) &d_buffer_duy, nxbuffer * sizeof(int));
+    if (err != cudaSuccess) {
+        fprintf(stderr, "Failed to allocate device memory for buffer_duy (error code %s)!\n", cudaGetErrorString(err));
+        return err;
+    }
+
+    err = cudaMemcpy(d_buffer_duy, h_buffer_duy, nxbuffer * sizeof(int), cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "Failed to copy buffer_duy from host to device (error code %s)!\n", cudaGetErrorString(err));
+        cudaFree(d_buffer_duy); /* do not leak the array when the copy fails */
+        return err;
+    }
+
+    /* The structure is in device memory, so its pointer member is set via cudaMemcpy. */
+    err = cudaMemcpy(&(d_lees_edw->buffer_duy), &d_buffer_duy, sizeof(int*), cudaMemcpyHostToDevice);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "Failed to copy buffer_duy pointer to device structure (error code %s)!\n", cudaGetErrorString(err));
+        cudaFree(d_buffer_duy); /* the pointer was not stored: release the array */
+    }
+
+    return err;
+}
+
 /*****************************************************************************
  *
  *  lb_le_apply_boundary_conditions
@@ -81,7 +114,6 @@ void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
  *****************************************************************************/
 
 __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
-
     int mpi_cartsz[3];
 
     assert(lb);
@@ -96,10 +128,12 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
         /* Everything must be done on host at the moment (slowly) ... */
         /* ... and copy back at the end */
         copyModelToDevice(&lb->model, &lb->target->model);
-        
+       
         lees_edw_t * le_target;
         lees_edw_target(le, &le_target);
 
+        copy_buffer_duy_to_device(le_target, le->buffer_duy, le->param->nxbuffer);
+
         int nlocal[3];
         lees_edw_nlocal(le, nlocal);
         dim3 numBlocks(1, (nlocal[Y] + 15) / 16, (nlocal[Z] + 15) / 16);
@@ -107,16 +141,61 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
         le_reproject<<<numBlocks, threadsPerBlock>>>(lb->target, le_target);
         cudaDeviceSynchronize();
 
-        lb_memcpy(lb, tdpMemcpyDeviceToHost);
+        // lb_memcpy(lb, tdpMemcpyDeviceToHost);
+      
+        int ndist;
+        int nprop = 0;
+        int negprop = 0;
+        lb_ndist(lb, &ndist);
+        for (int p = 1; p < lb->model.nvel; p++) {
+            if (lb->model.cv[p][X] == +1) nprop += 1;
+            if (lb->model.cv[p][X] == -1) negprop += 1;
+        }
+        // printf("nprop = %d, negprop = %d \n", nprop, negprop);
+        int ndata = ndist * nprop * nlocal[Y] * nlocal[Z];
+        double *recv_buff;
+        cudaMalloc((void**)&recv_buff, ndata * sizeof(double));
+
+        int *positive = (int *)malloc(sizeof(int) * nprop);
+        int *negative = (int *)malloc(sizeof(int) * negprop);
+        for (int p = 1, i = 0, j = 0; p < lb->model.nvel; p++) {
+            if (lb->model.cv[p][X] == +1) {
+                positive[i] = p;
+                i++;
+            }
+            if (lb->model.cv[p][X] == -1) {
+                negative[j] = p;
+                j++;
+            }
+        }
+
+        // for (int i = 0; i < nprop; i++) {
+        //     printf("positve[%d] = %d ", i, positive[i]);
+        // }
+        // printf("\n");
+        // for (int i = 0; i < negprop; i++) {
+        //     printf("negative[%d] = %d ", i, negative[i]);
+        // }
+
+        // printf("le->buffer_duy[0] = %d le->buffer_duy[1] = %d  le->buffer_duy[2] = %d\n", le->buffer_duy[0], le->buffer_duy[1], le->buffer_duy[2]);
+
+        int *d_positive, *d_negative;
+        cudaMalloc((void**)&d_positive, sizeof(int) * nprop);
+        cudaMalloc((void**)&d_negative, sizeof(int) * negprop);
+        cudaMemcpy(d_positive, positive, sizeof(int) * nprop, cudaMemcpyHostToDevice);
+        cudaMemcpy(d_negative, negative, sizeof(int) * negprop, cudaMemcpyHostToDevice);
 
         if (mpi_cartsz[Y] > 1) {
             le_displace_and_interpolate_parallel(lb, le);
         }
         else {
-            le_displace_and_interpolate(lb, le);
+            le_displace_and_interpolate<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, 
+                recv_buff, nprop, negprop, d_positive, d_negative);
+            cudaDeviceSynchronize();
+            // printf("end interpolation\n");
         }
-
-        lb_memcpy(lb, tdpMemcpyHostToDevice);
+        
+       // lb_memcpy(lb, tdpMemcpyHostToDevice);
 
         TIMER_stop(TIMER_LE);
     }
@@ -171,6 +250,7 @@ __global__ static void le_reproject(lb_t *lb, lees_edw_t *le) {
     jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
     kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
     
+
     if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
         for (plane = 0; plane < nplane; plane++) {
             for (side = 0; side < 2; side++) {
@@ -251,25 +331,28 @@ __global__ static void le_reproject(lb_t *lb, lees_edw_t *le) {
  *
  *****************************************************************************/
 
-int le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
-
+__global__ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le, double *recv_buff, 
+    int nprop, int negprop, int *positive, int *negative) {
     int ic, jc, kc;
     int index0, index1;
     int nlocal[3];
     int n, nplane, plane;
     int jdy, j1, j2;
     int ndist;
-    int nprop;
-    int ndata;
+    // int nprop, negprop;
+    // int ndata;
     int nhalo;
     double dy, fr;
     double t;
     double ltot[3];
-    double *recv_buff;
+    // double *recv_buff;
     physics_t *phys = NULL;
 
     assert(lb);
     assert(le);
+    assert(recv_buff);
+    assert(positive);
+    assert(negative);
 
     lees_edw_ltot(le, ltot);
     lees_edw_nlocal(le, nlocal);
@@ -279,6 +362,32 @@ int le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
 
     t = 1.0 * physics_control_timestep(phys);
 
+    jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
+    kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
+    int tid = (jc - 1) + (kc - 1) * gridDim.y * blockDim.y;
+    // if (tid == 0) {
+    //     for (int i = 0; i < nprop; i++) {
+    //         printf("positve[%d] = %d ", i, positive[i]);
+    //     }
+    //     printf("\n");
+    //     for (int i = 0; i < negprop; i++) {
+    //         printf("negative[%d] = %d ", i, negative[i]);
+    //     }
+    //     printf("le->buffer_duy[0] = %d le->buffer_duy[1] = %d  le->buffer_duy[2] = %d\n", le->buffer_duy[0], le->buffer_duy[1], le->buffer_duy[2]);
+
+    // }
+    
+    // __syncthreads();
+    // if (tid == 0) {
+    //     printf("works");
+    //     printf("le->buffer_duy[0] = %d le->buffer_duy[1] = %d  le->buffer_duy[2] = %d\n", le->buffer_duy[0], le->buffer_duy[1], le->buffer_duy[2]);
+    // }
+    // __syncthreads();
+
+    // if (tid == 0) {
+    //     printf("le->buffer_duy[0] = %d le->buffer_duy[1] = %d  le->buffer_duy[2] = %d\n", le->buffer_duy[0], le->buffer_duy[1], le->buffer_duy[2]);
+    // }
+
     /* We need to interpolate into a temporary buffer to make sure we
      * don't overwrite distributions taking part. The size is just
      * determined by the size of the local domain, and the number
@@ -288,78 +397,108 @@ int le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
 
     /* Allocate a buffer large enough for all cvp[][X] = +1 */
 
-    nprop = 0;
-    for (int p = 1; p < lb->model.nvel; p++) {
-        if (lb->model.cv[p][X] == +1)
-            nprop += 1;
-    }
-
-    // int truth[nprop];
-    // for (int p = 1, int i = 0; p < lb->model.nvel; p++) {
+    // nprop = 0;
+    // negprop = 0;
+    // for (int p = 1; p < lb->model.nvel; p++) {
+    //     if (lb->model.cv[p][X] == +1) nprop += 1;
+    //     if (lb->model.cv[p][X] == -1) negprop += 1;
+    // }
+    //
+    // int *positive = (int *)malloc(sizeof(int) * nprop);
+    // int *negative = (int *)malloc(sizeof(int) * negprop);
+    // for (int p = 1, i = 0, j = 0; p < lb->model.nvel; p++) {
     //     if (lb->model.cv[p][X] == +1) {
-    //         truth[i] = p;
+    //         positive[i] = p;
     //         i++;
     //     }
+    //     if (lb->model.cv[p][X] == -1) {
+    //         negative[j] = p;
+    //         j++;
+    //     }
     // }
 
-    ndata = ndist * nprop * nlocal[Y] * nlocal[Z];
-    recv_buff = (double *)malloc(ndata * sizeof(double));
+    // ndata = ndist * nprop * nlocal[Y] * nlocal[Z];
+    // recv_buff = (double *)malloc(ndata * sizeof(double));
+
     assert(recv_buff);
     if (recv_buff == NULL)
-        pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
+        // pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
+        printf("malloc(recv_buff) failed\n");
 
+    // printf("checking point 1 \n");
     for (plane = 0; plane < nplane; plane++) {
-
+        // printf("checking point 1.1 \n");
         ic = lees_edw_plane_location(le, plane);
-
+        // printf("checking point 1.2 \n");
         lees_edw_buffer_displacement(le, nhalo, t, &dy);
+        // printf("checking point 1.3 \n");
         dy = fmod(dy, ltot[Y]);
         jdy = floor(dy);
         fr = dy - jdy;
 
-        ndata = 0;
-        for (jc = 1; jc <= nlocal[Y]; jc++) {
+        // ndata = 0;
+        // for (jc = 1; jc <= nlocal[Y]; jc++) {
+
+        //     j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+        //     j2 = 1 + (j1 % nlocal[Y]);
+
+        //     for (kc = 1; kc <= nlocal[Z]; kc++) {
+
+        //         index0 = lees_edw_index(le, ic, j1, kc);
+        //         index1 = lees_edw_index(le, ic, j2, kc);
+
+        //         /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
+
+        //         for (n = 0; n < ndist; n++) {
+        //             for (int p = 1; p < lb->model.nvel; p++) {
+        //                 if (lb->model.cv[p][X] != +1)
+        //                     continue;
+        //                 recv_buff[ndata++] = (1.0 - fr) * lb->f[LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p)] +
+        //                                      fr * lb->f[LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p)];
+        //             }
+        //         }
+        //         /* Next site */
+        //     }
+        // }
+
+       
+        // printf("checking point 2 \n");
+
+        if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
 
             j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
             j2 = 1 + (j1 % nlocal[Y]);
 
-            for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-                index0 = lees_edw_index(le, ic, j1, kc);
-                index1 = lees_edw_index(le, ic, j2, kc);
+            index0 = lees_edw_index(le, ic, j1, kc);
+            index1 = lees_edw_index(le, ic, j2, kc);
 
-                /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
+            /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
 
-                for (n = 0; n < ndist; n++) {
-                    for (int p = 1; p < lb->model.nvel; p++) {
-                        if (lb->model.cv[p][X] != +1)
-                            continue;
-                        recv_buff[ndata++] = (1.0 - fr) * lb->f[LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p)] +
-                                             fr * lb->f[LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p)];
-                    }
+            for (n = 0; n < ndist; n++) {
+                for (int i = 0; i < nprop; i++) {
+                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
+                    recv_buff[index] = (1.0 - fr) * (lb->f[LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i])]) +
+                                            fr * (lb->f[LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, positive[i])]);
                 }
-                /* Next site */
             }
+            /* Next site */
         }
+        
+
+        // /* ...and copy back ... */
 
         // ndata = 0;
         // for (jc = 1; jc <= nlocal[Y]; jc++) {
-
-        //     j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
-        //     j2 = 1 + (j1 % nlocal[Y]);
-
         //     for (kc = 1; kc <= nlocal[Z]; kc++) {
 
-        //         index0 = lees_edw_index(le, ic, j1, kc);
-        //         index1 = lees_edw_index(le, ic, j2, kc);
-
-        //         /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
+        //         index0 = lees_edw_index(le, ic, jc, kc);
 
         //         for (n = 0; n < ndist; n++) {
-        //             for (int i = 0; i < nprop; i++) {
-        //                 //int ndata = ((jc-1)*nlocal_Z + (kc-1))*ndist*nprop + n*nprop + i;
-        //                 recv_buff[ndata++] = (1.0 - fr) * lb->f[LB_ADDR(lb->nsite, ndist, lb->model->nvel, index0, n, truth[i])] +
-        //                                      fr * lb->f[LB_ADDR(lb->nsite, ndist, lb->nodel->nvei, index1, n, truth[i])];
+        //             for (int p = 1; p < lb->model.nvel; p++) {
+        //                 if (lb->model.cv[p][X] != +1)
+        //                     continue;
+        //                 int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
+        //                 lb->f[la] = recv_buff[ndata++];
         //             }
         //         }
         //         /* Next site */
@@ -367,26 +506,24 @@ int le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
         // }
 
         /* ...and copy back ... */
+        // printf("checking point 3 \n");
 
-        ndata = 0;
-        for (jc = 1; jc <= nlocal[Y]; jc++) {
-            for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-                index0 = lees_edw_index(le, ic, jc, kc);
+        if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
+            index0 = lees_edw_index(le, ic, jc, kc);
 
-                for (n = 0; n < ndist; n++) {
-                    for (int p = 1; p < lb->model.nvel; p++) {
-                        if (lb->model.cv[p][X] != +1)
-                            continue;
-                        int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-                        lb->f[la] = recv_buff[ndata++];
-                    }
+            for (n = 0; n < ndist; n++) {
+                for (int i = 0; i < nprop; i++) {
+                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
+                    int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
+                    lb->f[la] = recv_buff[index];
                 }
-                /* Next site */
             }
+            /* Next site */
         }
+        
 
         /* OTHER DIRECTION */
+        // printf("checking point 4 \n");
 
         ic = lees_edw_plane_location(le, plane) + 1;
 
@@ -395,55 +532,92 @@ int le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
         jdy = floor(dy);
         fr = dy - jdy;
 
-        ndata = 0;
-        for (jc = 1; jc <= nlocal[Y]; jc++) {
+        // ndata = 0;
+        // for (jc = 1; jc <= nlocal[Y]; jc++) {
+
+        //     j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+        //     j2 = 1 + (j1 % nlocal[Y]);
+
+        //     for (kc = 1; kc <= nlocal[Z]; kc++) {
+
+        //         index0 = lees_edw_index(le, ic, j1, kc);
+        //         index1 = lees_edw_index(le, ic, j2, kc);
+
+        //         for (n = 0; n < ndist; n++) {
+        //             for (int p = 1; p < lb->model.nvel; p++) {
+        //                 if (lb->model.cv[p][X] == -1) {
+        //                     int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
+        //                     int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p);
+        //                     recv_buff[ndata++] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+        //                 }
+        //             }
+        //         }
+        //         /* Next site */
+        //     }
+        // }
+        // printf("checking point 5 \n");
+
+        if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
 
             j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
             j2 = 1 + (j1 % nlocal[Y]);
 
-            for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-                index0 = lees_edw_index(le, ic, j1, kc);
-                index1 = lees_edw_index(le, ic, j2, kc);
+            index0 = lees_edw_index(le, ic, j1, kc);
+            index1 = lees_edw_index(le, ic, j2, kc);
 
-                for (n = 0; n < ndist; n++) {
-                    for (int p = 1; p < lb->model.nvel; p++) {
-                        if (lb->model.cv[p][X] == -1) {
-                            int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-                            int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p);
-                            recv_buff[ndata++] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
-                        }
-                    }
+            for (n = 0; n < ndist; n++) {
+                for (int i = 0; i < negprop; i++) {
+                    int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
+                    int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, negative[i]);
+                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i;
+                    recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
                 }
-                /* Next site */
             }
+            /* Next site */
         }
 
+
         /* ...and now overwrite... */
 
-        ndata = 0;
-        for (jc = 1; jc <= nlocal[Y]; jc++) {
-            for (kc = 1; kc <= nlocal[Z]; kc++) {
+        // ndata = 0;
+        // for (jc = 1; jc <= nlocal[Y]; jc++) {
+        //     for (kc = 1; kc <= nlocal[Z]; kc++) {
 
-                index0 = lees_edw_index(le, ic, jc, kc);
+        //         index0 = lees_edw_index(le, ic, jc, kc);
 
-                for (n = 0; n < ndist; n++) {
-                    for (int p = 1; p < lb->model.nvel; p++) {
-                        if (lb->model.cv[p][X] == -1) {
-                            int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-                            lb->f[ijkp] = recv_buff[ndata++];
-                        }
-                    }
+        //         for (n = 0; n < ndist; n++) {
+        //             for (int p = 1; p < lb->model.nvel; p++) {
+        //                 if (lb->model.cv[p][X] == -1) {
+        //                     int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
+        //                     lb->f[ijkp] = recv_buff[ndata++];
+        //                 }
+        //             }
+        //         }
+        //     }
+        // }
+
+        // printf("checking point 6 \n");
+        
+        if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
+
+            index0 = lees_edw_index(le, ic, jc, kc);
+
+            for (n = 0; n < ndist; n++) {
+                for (int i = 0; i < negprop; i++) {
+                    int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
+                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i;
+                    lb->f[ijkp] = recv_buff[index];
                 }
             }
+            
         }
 
         /* Next plane */
     }
 
-    free(recv_buff);
-
-    return 0;
+    // free(recv_buff);
+    // printf("checking point last \n");
+    return;
 }
 
 /*****************************************************************************

From 03fe861bdc7d296f5a16ce5fe401e57311eedc30 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Wed, 21 Jun 2023 11:47:19 +0100
Subject: [PATCH 012/133] same as last commit, just clean some comments

---
 src/leesedwards.c |  82 ++++++++++++------------
 src/leesedwards.h |  40 ++++++++++++
 src/model_le.c    | 156 ++--------------------------------------------
 3 files changed, 85 insertions(+), 193 deletions(-)

diff --git a/src/leesedwards.c b/src/leesedwards.c
index 2af2ccc4a..5b6a7055b 100644
--- a/src/leesedwards.c
+++ b/src/leesedwards.c
@@ -24,47 +24,47 @@
 #include "leesedwards.h"
 #include "util.h"
 
-typedef struct lees_edw_param_s lees_edw_param_t;
-
-struct lees_edw_s {
-    pe_t *pe;        /* Parallel environment */
-    cs_t *cs;        /* Coordinate system */
-    physics_t *phys; /* Constants, time step */
-
-    lees_edw_param_t *param; /* Parameters */
-
-    int nref;            /* Reference count */
-    int *icbuff_to_real; /* look up table */
-    int *icreal_to_buff; /* look up table */
-    int *buffer_duy;     /* look up table +/- uy as function of ib */
-
-    MPI_Comm le_comm;       /* 1-d communicator */
-    MPI_Comm le_plane_comm; /* 2-d communicator */
-
-    lees_edw_t *target; /* Device memory */
-};
-
-struct lees_edw_param_s {
-    /* Local parameters */
-    int nplanelocal; /* Number of planes local domain */
-    int nxbuffer;    /* Size of buffer region in x */
-    int index_real_nbuffer;
-    /* For cs */
-    int nhalo;
-    int str[3];
-    int nlocal[3];
-    /* Global parameters */
-    int nplanetotal; /* Total number of planes */
-    int type;        /* Shear type */
-    int period;      /* for oscillatory */
-    int nt0;         /* time0 (input as integer) */
-    int nsites;      /* Number of sites incl buffer planes */
-    double uy;       /* u[Y] for all planes */
-    double dx_min;   /* Position first plane */
-    double dx_sep;   /* Plane separation */
-    double omega;    /* u_y = u_le cos (omega t) for oscillatory */
-    double time0;    /* time offset */
-};
+// typedef struct lees_edw_param_s lees_edw_param_t;
+
+// struct lees_edw_s {
+//     pe_t *pe;        /* Parallel environment */
+//     cs_t *cs;        /* Coordinate system */
+//     physics_t *phys; /* Constants, time step */
+
+//     lees_edw_param_t *param; /* Parameters */
+
+//     int nref;            /* Reference count */
+//     int *icbuff_to_real; /* look up table */
+//     int *icreal_to_buff; /* look up table */
+//     int *buffer_duy;     /* look up table +/- uy as function of ib */
+
+//     MPI_Comm le_comm;       /* 1-d communicator */
+//     MPI_Comm le_plane_comm; /* 2-d communicator */
+
+//     lees_edw_t *target; /* Device memory */
+// };
+
+// struct lees_edw_param_s {
+//     /* Local parameters */
+//     int nplanelocal; /* Number of planes local domain */
+//     int nxbuffer;    /* Size of buffer region in x */
+//     int index_real_nbuffer;
+//     /* For cs */
+//     int nhalo;
+//     int str[3];
+//     int nlocal[3];
+//     /* Global parameters */
+//     int nplanetotal; /* Total number of planes */
+//     int type;        /* Shear type */
+//     int period;      /* for oscillatory */
+//     int nt0;         /* time0 (input as integer) */
+//     int nsites;      /* Number of sites incl buffer planes */
+//     double uy;       /* u[Y] for all planes */
+//     double dx_min;   /* Position first plane */
+//     double dx_sep;   /* Plane separation */
+//     double omega;    /* u_y = u_le cos (omega t) for oscillatory */
+//     double time0;    /* time offset */
+// };
 
 static int lees_edw_init(lees_edw_t *le, const lees_edw_options_t *info);
 static int lees_edw_checks(lees_edw_t *le);
diff --git a/src/leesedwards.h b/src/leesedwards.h
index 03812738f..1697ed9a4 100644
--- a/src/leesedwards.h
+++ b/src/leesedwards.h
@@ -21,7 +21,47 @@
 #include "lees_edwards_options.h"
 
 typedef struct lees_edw_s lees_edw_t;
+typedef struct lees_edw_param_s lees_edw_param_t;
 
+struct lees_edw_s { /* Lees-Edwards state; defined in leesedwards.h, so members are visible to every including file */
+    pe_t *pe;        /* Parallel environment */
+    cs_t *cs;        /* Coordinate system */
+    physics_t *phys; /* Constants, time step */
+
+    lees_edw_param_t *param; /* Parameters (see struct lees_edw_param_s) */
+
+    int nref;            /* Reference count */
+    int *icbuff_to_real; /* look up table */
+    int *icreal_to_buff; /* look up table */
+    int *buffer_duy;     /* look up table +/- uy as function of ib */
+
+    MPI_Comm le_comm;       /* 1-d communicator */
+    MPI_Comm le_plane_comm; /* 2-d communicator */
+
+    lees_edw_t *target; /* Device memory */
+};
+
+struct lees_edw_param_s { /* Integer/double parameters reached through lees_edw_s.param */
+    /* Local parameters */
+    int nplanelocal; /* Number of planes local domain */
+    int nxbuffer;    /* Size of buffer region in x */
+    int index_real_nbuffer; /* NOTE(review): meaning not evident from this file -- please document */
+    /* For cs (coordinate system) */
+    int nhalo;     /* presumably the halo width -- TODO confirm */
+    int str[3];    /* presumably memory strides per dimension -- TODO confirm */
+    int nlocal[3]; /* presumably local lattice extent in X, Y, Z -- TODO confirm */
+    /* Global parameters */
+    int nplanetotal; /* Total number of planes */
+    int type;        /* Shear type */
+    int period;      /* for oscillatory */
+    int nt0;         /* time0 (input as integer) */
+    int nsites;      /* Number of sites incl buffer planes */
+    double uy;       /* u[Y] for all planes */
+    double dx_min;   /* Position first plane */
+    double dx_sep;   /* Plane separation */
+    double omega;    /* u_y = u_le cos (omega t) for oscillatory */
+    double time0;    /* time offset */
+};
 __host__ int lees_edw_create(pe_t * pe, cs_t * coords,
 			     const lees_edw_options_t * opts,
 			     lees_edw_t ** le);
diff --git a/src/model_le.c b/src/model_le.c
index 48cae42ab..baaa40769 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -155,7 +155,9 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
         int ndata = ndist * nprop * nlocal[Y] * nlocal[Z];
         double *recv_buff;
         cudaMalloc((void**)&recv_buff, ndata * sizeof(double));
-
+        if (recv_buff == NULL) {
+            pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
+        }
         int *positive = (int *)malloc(sizeof(int) * nprop);
         int *negative = (int *)malloc(sizeof(int) * negprop);
         for (int p = 1, i = 0, j = 0; p < lb->model.nvel; p++) {
@@ -365,29 +367,7 @@ __global__ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le, double *re
     jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
     kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
     int tid = (jc - 1) + (kc - 1) * gridDim.y * blockDim.y;
-    // if (tid == 0) {
-    //     for (int i = 0; i < nprop; i++) {
-    //         printf("positve[%d] = %d ", i, positive[i]);
-    //     }
-    //     printf("\n");
-    //     for (int i = 0; i < negprop; i++) {
-    //         printf("negative[%d] = %d ", i, negative[i]);
-    //     }
-    //     printf("le->buffer_duy[0] = %d le->buffer_duy[1] = %d  le->buffer_duy[2] = %d\n", le->buffer_duy[0], le->buffer_duy[1], le->buffer_duy[2]);
-
-    // }
     
-    // __syncthreads();
-    // if (tid == 0) {
-    //     printf("works");
-    //     printf("le->buffer_duy[0] = %d le->buffer_duy[1] = %d  le->buffer_duy[2] = %d\n", le->buffer_duy[0], le->buffer_duy[1], le->buffer_duy[2]);
-    // }
-    // __syncthreads();
-
-    // if (tid == 0) {
-    //     printf("le->buffer_duy[0] = %d le->buffer_duy[1] = %d  le->buffer_duy[2] = %d\n", le->buffer_duy[0], le->buffer_duy[1], le->buffer_duy[2]);
-    // }
-
     /* We need to interpolate into a temporary buffer to make sure we
      * don't overwrite distributions taking part. The size is just
      * determined by the size of the local domain, and the number
@@ -397,73 +377,16 @@ __global__ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le, double *re
 
     /* Allocate a buffer large enough for all cvp[][X] = +1 */
 
-    // nprop = 0;
-    // negprop = 0;
-    // for (int p = 1; p < lb->model.nvel; p++) {
-    //     if (lb->model.cv[p][X] == +1) nprop += 1;
-    //     if (lb->model.cv[p][X] == -1) negprop += 1;
-    // }
-    //
-    // int *positive = (int *)malloc(sizeof(int) * nprop);
-    // int *negative = (int *)malloc(sizeof(int) * negprop);
-    // for (int p = 1, i = 0, j = 0; p < lb->model.nvel; p++) {
-    //     if (lb->model.cv[p][X] == +1) {
-    //         positive[i] = p;
-    //         i++;
-    //     }
-    //     if (lb->model.cv[p][X] == -1) {
-    //         negative[j] = p;
-    //         j++;
-    //     }
-    // }
-
-    // ndata = ndist * nprop * nlocal[Y] * nlocal[Z];
-    // recv_buff = (double *)malloc(ndata * sizeof(double));
-
     assert(recv_buff);
-    if (recv_buff == NULL)
-        // pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
-        printf("malloc(recv_buff) failed\n");
+   
 
-    // printf("checking point 1 \n");
     for (plane = 0; plane < nplane; plane++) {
-        // printf("checking point 1.1 \n");
         ic = lees_edw_plane_location(le, plane);
-        // printf("checking point 1.2 \n");
         lees_edw_buffer_displacement(le, nhalo, t, &dy);
-        // printf("checking point 1.3 \n");
         dy = fmod(dy, ltot[Y]);
         jdy = floor(dy);
         fr = dy - jdy;
 
-        // ndata = 0;
-        // for (jc = 1; jc <= nlocal[Y]; jc++) {
-
-        //     j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
-        //     j2 = 1 + (j1 % nlocal[Y]);
-
-        //     for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-        //         index0 = lees_edw_index(le, ic, j1, kc);
-        //         index1 = lees_edw_index(le, ic, j2, kc);
-
-        //         /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
-
-        //         for (n = 0; n < ndist; n++) {
-        //             for (int p = 1; p < lb->model.nvel; p++) {
-        //                 if (lb->model.cv[p][X] != +1)
-        //                     continue;
-        //                 recv_buff[ndata++] = (1.0 - fr) * lb->f[LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p)] +
-        //                                      fr * lb->f[LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p)];
-        //             }
-        //         }
-        //         /* Next site */
-        //     }
-        // }
-
-       
-        // printf("checking point 2 \n");
-
         if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
 
             j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
@@ -483,31 +406,8 @@ __global__ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le, double *re
             }
             /* Next site */
         }
-        
-
-        // /* ...and copy back ... */
-
-        // ndata = 0;
-        // for (jc = 1; jc <= nlocal[Y]; jc++) {
-        //     for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-        //         index0 = lees_edw_index(le, ic, jc, kc);
-
-        //         for (n = 0; n < ndist; n++) {
-        //             for (int p = 1; p < lb->model.nvel; p++) {
-        //                 if (lb->model.cv[p][X] != +1)
-        //                     continue;
-        //                 int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-        //                 lb->f[la] = recv_buff[ndata++];
-        //             }
-        //         }
-        //         /* Next site */
-        //     }
-        // }
 
         /* ...and copy back ... */
-        // printf("checking point 3 \n");
-
         if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
             index0 = lees_edw_index(le, ic, jc, kc);
 
@@ -523,8 +423,6 @@ __global__ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le, double *re
         
 
         /* OTHER DIRECTION */
-        // printf("checking point 4 \n");
-
         ic = lees_edw_plane_location(le, plane) + 1;
 
         lees_edw_buffer_displacement(le, nhalo, t, &dy);
@@ -532,31 +430,6 @@ __global__ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le, double *re
         jdy = floor(dy);
         fr = dy - jdy;
 
-        // ndata = 0;
-        // for (jc = 1; jc <= nlocal[Y]; jc++) {
-
-        //     j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
-        //     j2 = 1 + (j1 % nlocal[Y]);
-
-        //     for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-        //         index0 = lees_edw_index(le, ic, j1, kc);
-        //         index1 = lees_edw_index(le, ic, j2, kc);
-
-        //         for (n = 0; n < ndist; n++) {
-        //             for (int p = 1; p < lb->model.nvel; p++) {
-        //                 if (lb->model.cv[p][X] == -1) {
-        //                     int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-        //                     int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p);
-        //                     recv_buff[ndata++] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
-        //                 }
-        //             }
-        //         }
-        //         /* Next site */
-        //     }
-        // }
-        // printf("checking point 5 \n");
-
         if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
 
             j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
@@ -579,25 +452,6 @@ __global__ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le, double *re
 
         /* ...and now overwrite... */
 
-        // ndata = 0;
-        // for (jc = 1; jc <= nlocal[Y]; jc++) {
-        //     for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-        //         index0 = lees_edw_index(le, ic, jc, kc);
-
-        //         for (n = 0; n < ndist; n++) {
-        //             for (int p = 1; p < lb->model.nvel; p++) {
-        //                 if (lb->model.cv[p][X] == -1) {
-        //                     int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-        //                     lb->f[ijkp] = recv_buff[ndata++];
-        //                 }
-        //             }
-        //         }
-        //     }
-        // }
-
-        // printf("checking point 6 \n");
-        
         if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
 
             index0 = lees_edw_index(le, ic, jc, kc);
@@ -615,8 +469,6 @@ __global__ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le, double *re
         /* Next plane */
     }
 
-    // free(recv_buff);
-    // printf("checking point last \n");
     return;
 }
 

From ae773e9d340adcc337fc4d4c53bb5d3dd7e46c2e Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Thu, 22 Jun 2023 12:48:54 +0100
Subject: [PATCH 013/133] modulised interpolation, but the result remains the
 same

---
 src/model_le.c | 295 ++++++++++++++++++++++++++++---------------------
 1 file changed, 171 insertions(+), 124 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index baaa40769..f1b1773ce 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -34,8 +34,11 @@
 #include "leesedwards.h"
 
 __global__ static void le_reproject(lb_t *lb, lees_edw_t *le);
-__global__ static void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le, double *recv_buff, 
-    int nprop, int negprop, int *positive, int *negative);
+__global__ static void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le);
+__global__ void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *nlocal, int *displacement,
+    int nprop, int ic, int jdy, int ndist, double fr); /* kernel: lb->f -> recv_buff */
+__global__ void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *nlocal, int *displacement,
+    int nprop, int ic, int ndist, double fr); /* kernel: recv_buff -> lb->f */
 static int le_displace_and_interpolate_parallel(lb_t *lb, lees_edw_t *le);
 
 void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
@@ -94,6 +97,54 @@ cudaError_t copy_buffer_duy_to_device(lees_edw_s* d_lees_edw, int* h_buffer_duy,
     return cudaSuccess;
 }
 
+/* interpolation (kernel): each thread handles one (jc, kc) site of plane ic and stores
+ * (1 - fr) * f(j1) + fr * f(j2) in recv_buff for each n < ndist and each velocity
+ * displacement[i], i < nprop. nlocal and displacement are dereferenced inside the kernel. */
+__global__ void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *nlocal, int *displacement,
+    int nprop, int ic, int jdy, int ndist, double fr) {
+    int jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
+    int kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
+
+    if (jc <= nlocal[Y] && kc <= nlocal[Z]) { /* threads outside the local (Y, Z) extent do nothing */
+        /* j1 is jc shifted by jdy with wrap-around in Y; j2 is the site after j1 (also wrapped). */
+        int j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+        int j2 = 1 + (j1 % nlocal[Y]);
+
+        int index0 = lees_edw_index(le, ic, j1, kc);
+        int index1 = lees_edw_index(le, ic, j2, kc);
+
+        /* recv_buff is packed as [site][n][i] with site = (jc-1)*nlocal[Z] + (kc-1). */
+        for (int n = 0; n < ndist; n++) {
+            for (int i = 0; i < nprop; i++) {
+                int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, displacement[i]);
+                int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, displacement[i]);
+                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
+                recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+            }
+        }
+    }
+}
+
+/* copy_back (kernel): each thread copies its (jc, kc) entries of recv_buff into lb->f at
+ * plane ic, using the same [site][n][i] packing that interpolation() writes. fr is unused. */
+__global__ void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *nlocal, int *displacement,
+    int nprop, int ic, int ndist, double fr) {
+    int jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
+    int kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
+
+    if (jc <= nlocal[Y] && kc <= nlocal[Z]) { /* threads outside the local (Y, Z) extent do nothing */
+        int index0 = lees_edw_index(le, ic, jc, kc);
+
+        for (int n = 0; n < ndist; n++) {
+            for (int i = 0; i < nprop; i++) {
+                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
+                int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, displacement[i]);
+                lb->f[la] = recv_buff[index];
+            }
+        }
+    }
+}
+
 /*****************************************************************************
  *
  *  lb_le_apply_boundary_conditions
@@ -140,65 +191,16 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
         dim3 threadsPerBlock(1, 16, 16);
         le_reproject<<<numBlocks, threadsPerBlock>>>(lb->target, le_target);
         cudaDeviceSynchronize();
-
-        // lb_memcpy(lb, tdpMemcpyDeviceToHost);
       
-        int ndist;
-        int nprop = 0;
-        int negprop = 0;
-        lb_ndist(lb, &ndist);
-        for (int p = 1; p < lb->model.nvel; p++) {
-            if (lb->model.cv[p][X] == +1) nprop += 1;
-            if (lb->model.cv[p][X] == -1) negprop += 1;
-        }
-        // printf("nprop = %d, negprop = %d \n", nprop, negprop);
-        int ndata = ndist * nprop * nlocal[Y] * nlocal[Z];
-        double *recv_buff;
-        cudaMalloc((void**)&recv_buff, ndata * sizeof(double));
-        if (recv_buff == NULL) {
-            pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
-        }
-        int *positive = (int *)malloc(sizeof(int) * nprop);
-        int *negative = (int *)malloc(sizeof(int) * negprop);
-        for (int p = 1, i = 0, j = 0; p < lb->model.nvel; p++) {
-            if (lb->model.cv[p][X] == +1) {
-                positive[i] = p;
-                i++;
-            }
-            if (lb->model.cv[p][X] == -1) {
-                negative[j] = p;
-                j++;
-            }
-        }
-
-        // for (int i = 0; i < nprop; i++) {
-        //     printf("positve[%d] = %d ", i, positive[i]);
-        // }
-        // printf("\n");
-        // for (int i = 0; i < negprop; i++) {
-        //     printf("negative[%d] = %d ", i, negative[i]);
-        // }
 
-        // printf("le->buffer_duy[0] = %d le->buffer_duy[1] = %d  le->buffer_duy[2] = %d\n", le->buffer_duy[0], le->buffer_duy[1], le->buffer_duy[2]);
-
-        int *d_positive, *d_negative;
-        cudaMalloc((void**)&d_positive, sizeof(int) * nprop);
-        cudaMalloc((void**)&d_negative, sizeof(int) * negprop);
-        cudaMemcpy(d_positive, positive, sizeof(int) * nprop, cudaMemcpyHostToDevice);
-        cudaMemcpy(d_negative, negative, sizeof(int) * negprop, cudaMemcpyHostToDevice);
 
         if (mpi_cartsz[Y] > 1) {
             le_displace_and_interpolate_parallel(lb, le);
         }
         else {
-            le_displace_and_interpolate<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, 
-                recv_buff, nprop, negprop, d_positive, d_negative);
-            cudaDeviceSynchronize();
-            // printf("end interpolation\n");
+            le_displace_and_interpolate(lb->target, le_target);
         }
         
-       // lb_memcpy(lb, tdpMemcpyHostToDevice);
-
         TIMER_stop(TIMER_LE);
     }
 
@@ -333,51 +335,79 @@ __global__ static void le_reproject(lb_t *lb, lees_edw_t *le) {
  *
  *****************************************************************************/
 
-__global__ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le, double *recv_buff, 
-    int nprop, int negprop, int *positive, int *negative) {
-    int ic, jc, kc;
-    int index0, index1;
+void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
+    int ic; // jc, kc;
+    // int index0, index1;
     int nlocal[3];
-    int n, nplane, plane;
-    int jdy, j1, j2;
+    int nplane, plane; // n
+    int jdy; // j1, j2;
     int ndist;
-    // int nprop, negprop;
-    // int ndata;
+    int nprop, negprop;
     int nhalo;
+    int ndata;
     double dy, fr;
     double t;
     double ltot[3];
-    // double *recv_buff;
+    double *recv_buff;
     physics_t *phys = NULL;
 
     assert(lb);
     assert(le);
-    assert(recv_buff);
-    assert(positive);
-    assert(negative);
 
     lees_edw_ltot(le, ltot);
     lees_edw_nlocal(le, nlocal);
     lees_edw_nhalo(le, &nhalo);
     nplane = lees_edw_nplane_local(le);
     physics_ref(&phys);
+    lb_ndist(lb, &ndist);
 
     t = 1.0 * physics_control_timestep(phys);
 
-    jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
-    kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
-    int tid = (jc - 1) + (kc - 1) * gridDim.y * blockDim.y;
-    
     /* We need to interpolate into a temporary buffer to make sure we
      * don't overwrite distributions taking part. The size is just
      * determined by the size of the local domain, and the number
      * of plane-crossing distributions. */
 
-    lb_ndist(lb, &ndist);
 
     /* Allocate a buffer large enough for all cvp[][X] = +1 */
+    nprop = 0;
+    negprop = 0;
+    for (int p = 1; p < lb->model.nvel; p++) {
+        if (lb->model.cv[p][X] == +1) nprop += 1;
+        if (lb->model.cv[p][X] == -1) negprop += 1;
+    }
+    ndata = ndist * nprop * nlocal[Y] * nlocal[Z];
+    cudaMalloc((void**)&recv_buff, ndata * sizeof(double));
 
+    if (recv_buff == NULL) {
+        pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
+    }
     assert(recv_buff);
+
+    // record the displacement of propgation
+    int *positive = (int *)malloc(sizeof(int) * nprop);
+    int *negative = (int *)malloc(sizeof(int) * negprop);
+    for (int p = 1, i = 0, j = 0; p < lb->model.nvel; p++) {
+        if (lb->model.cv[p][X] == +1) {
+            positive[i] = p;
+            i++;
+        }
+        if (lb->model.cv[p][X] == -1) {
+            negative[j] = p;
+            j++;
+        }
+    }
+
+    // copy the displacement array to the device
+    int *d_positive, *d_negative;
+    cudaMalloc((void**)&d_positive, sizeof(int) * nprop);
+    cudaMalloc((void**)&d_negative, sizeof(int) * negprop);
+    cudaMemcpy(d_positive, positive, sizeof(int) * nprop, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_negative, negative, sizeof(int) * negprop, cudaMemcpyHostToDevice);
+    
+    //define a Cuda model
+    dim3 numBlocks(1, (nlocal[Y] + 15) / 16, (nlocal[Z] + 15) / 16);
+    dim3 threadsPerBlock(1, 16, 16);
    
 
     for (plane = 0; plane < nplane; plane++) {
@@ -387,39 +417,48 @@ __global__ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le, double *re
         jdy = floor(dy);
         fr = dy - jdy;
 
-        if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
+        interpolation<<<numBlocks, threadsPerBlock>>>(lb_t *lb, lees_edw_t *le, double *recv_buff, 
+            int *nlocal, int *positive, int nprop, int ic, int jdy, int ndist, double fr);
+        cudaDeviceSynchronize();
 
-            j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
-            j2 = 1 + (j1 % nlocal[Y]);
+        copy_back<<<numBlocks, threadsPerBlock>>>(lb_t *lb, lees_edw_t *le, double *recv_buff, 
+            int *nlocal, int *positive, int nprop, int ic, int ndist, double fr);
+        cudaDeviceSynchronize();
+        
+        // if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
 
-            index0 = lees_edw_index(le, ic, j1, kc);
-            index1 = lees_edw_index(le, ic, j2, kc);
+        //     j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+        //     j2 = 1 + (j1 % nlocal[Y]);
 
-            /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
+        //     index0 = lees_edw_index(le, ic, j1, kc);
+        //     index1 = lees_edw_index(le, ic, j2, kc);
 
-            for (n = 0; n < ndist; n++) {
-                for (int i = 0; i < nprop; i++) {
-                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
-                    recv_buff[index] = (1.0 - fr) * (lb->f[LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i])]) +
-                                            fr * (lb->f[LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, positive[i])]);
-                }
-            }
-            /* Next site */
-        }
+        //     /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
+
+        //     for (n = 0; n < ndist; n++) {
+        //         for (int i = 0; i < nprop; i++) {
+        //             int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
+        //             int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, positive[i]);
+        //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
+        //             recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+        //         }
+        //     }
+        //     /* Next site */
+        // }
 
         /* ...and copy back ... */
-        if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
-            index0 = lees_edw_index(le, ic, jc, kc);
-
-            for (n = 0; n < ndist; n++) {
-                for (int i = 0; i < nprop; i++) {
-                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
-                    int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
-                    lb->f[la] = recv_buff[index];
-                }
-            }
-            /* Next site */
-        }
+        // if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
+        //     index0 = lees_edw_index(le, ic, jc, kc);
+
+        //     for (n = 0; n < ndist; n++) {
+        //         for (int i = 0; i < nprop; i++) {
+        //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
+        //             int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
+        //             lb->f[la] = recv_buff[index];
+        //         }
+        //     }
+        //     /* Next site */
+        // }
         
 
         /* OTHER DIRECTION */
@@ -430,41 +469,49 @@ __global__ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le, double *re
         jdy = floor(dy);
         fr = dy - jdy;
 
-        if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
-
-            j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
-            j2 = 1 + (j1 % nlocal[Y]);
-
-            index0 = lees_edw_index(le, ic, j1, kc);
-            index1 = lees_edw_index(le, ic, j2, kc);
+        interpolation<<<numBlocks, threadsPerBlock>>>(lb_t *lb, lees_edw_t *le, double *recv_buff, 
+            int *nlocal, int *negative, int negprop, int ic, int jdy, int ndist, double fr);
+        cudaDeviceSynchronize();
 
-            for (n = 0; n < ndist; n++) {
-                for (int i = 0; i < negprop; i++) {
-                    int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
-                    int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, negative[i]);
-                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i;
-                    recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
-                }
-            }
-            /* Next site */
-        }
+        copy_back<<<numBlocks, threadsPerBlock>>>(lb_t *lb, lees_edw_t *le, double *recv_buff, 
+            int *nlocal, int *negative, int negprop, int ic, int ndist, double fr);
+        cudaDeviceSynchronize();
+        
+        // if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
+
+        //     j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+        //     j2 = 1 + (j1 % nlocal[Y]);
+
+        //     index0 = lees_edw_index(le, ic, j1, kc);
+        //     index1 = lees_edw_index(le, ic, j2, kc);
+
+        //     for (n = 0; n < ndist; n++) {
+        //         for (int i = 0; i < negprop; i++) {
+        //             int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
+        //             int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, negative[i]);
+        //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i;
+        //             recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+        //         }
+        //     }
+        //     /* Next site */
+        // }
 
 
-        /* ...and now overwrite... */
+        // /* ...and now overwrite... */
 
-        if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
+        // if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
 
-            index0 = lees_edw_index(le, ic, jc, kc);
+        //     index0 = lees_edw_index(le, ic, jc, kc);
 
-            for (n = 0; n < ndist; n++) {
-                for (int i = 0; i < negprop; i++) {
-                    int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
-                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i;
-                    lb->f[ijkp] = recv_buff[index];
-                }
-            }
+        //     for (n = 0; n < ndist; n++) {
+        //         for (int i = 0; i < negprop; i++) {
+        //             int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
+        //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i;
+        //             lb->f[ijkp] = recv_buff[index];
+        //         }
+        //     }
             
-        }
+        // }
 
         /* Next plane */
     }

From 7410ce0654dc631c35617c2543da6f900dd03c66 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Fri, 23 Jun 2023 13:44:30 +0100
Subject: [PATCH 014/133] fixed some bugs of the former version, the result is
 tested correct now, but some other tests failed

---
 src/model_le.c | 126 ++++++++++++-------------------------------------
 1 file changed, 29 insertions(+), 97 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index f1b1773ce..76a8ab379 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -34,10 +34,10 @@
 #include "leesedwards.h"
 
 __global__ static void le_reproject(lb_t *lb, lees_edw_t *le);
-__global__ static void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le);
-__global__ interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *nlocal, int *displacement,  
+static void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le);
+__global__ static void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *displacement,  
     int nprop, int ic, int jdy, int ndist, double fr);
-__global__ copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *nlocal, int *displacement,  
+__global__ static void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *displacement,  
     int nprop, int ic, int ndist, double fr);
 static int le_displace_and_interpolate_parallel(lb_t *lb, lees_edw_t *le);
 
@@ -97,23 +97,26 @@ cudaError_t copy_buffer_duy_to_device(lees_edw_s* d_lees_edw, int* h_buffer_duy,
     return cudaSuccess;
 }
 
-__global__ interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *nlocal, int *displacement,  
+__global__ void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *displacement,  
     int nprop, int ic, int jdy, int ndist, double fr) {
 
+    int nlocal[3];
+    lees_edw_nlocal(le, nlocal);
+
     int jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
     int kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
 
     if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
 
-        j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
-        j2 = 1 + (j1 % nlocal[Y]);
+        int j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+        int j2 = 1 + (j1 % nlocal[Y]);
 
-        index0 = lees_edw_index(le, ic, j1, kc);
-        index1 = lees_edw_index(le, ic, j2, kc);
+        int index0 = lees_edw_index(le, ic, j1, kc);
+        int index1 = lees_edw_index(le, ic, j2, kc);
 
         /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
 
-        for (n = 0; n < ndist; n++) {
+        for (int n = 0; n < ndist; n++) {
             for (int i = 0; i < nprop; i++) {
                 int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, displacement[i]);
                 int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, displacement[i]);
@@ -125,16 +128,17 @@ __global__ interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *nloca
     }
 }
 
-__global__ copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *nlocal, int *displacement,  
+__global__ void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *displacement,  
     int nprop, int ic, int ndist, double fr) {
-
+    int nlocal[3];
+    lees_edw_nlocal(le, nlocal);
     int jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
     int kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
 
     if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
-        index0 = lees_edw_index(le, ic, jc, kc);
+        int index0 = lees_edw_index(le, ic, jc, kc);
 
-        for (n = 0; n < ndist; n++) {
+        for (int n = 0; n < ndist; n++) {
             for (int i = 0; i < nprop; i++) {
                 int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
                 int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, displacement[i]);
@@ -198,7 +202,7 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
             le_displace_and_interpolate_parallel(lb, le);
         }
         else {
-            le_displace_and_interpolate(lb->target, le_target);
+            le_displace_and_interpolate(lb, le);
         }
         
         TIMER_stop(TIMER_LE);
@@ -336,8 +340,8 @@ __global__ static void le_reproject(lb_t *lb, lees_edw_t *le) {
  *****************************************************************************/
 
 void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
-    int ic; // jc, kc;
-    // int index0, index1;
+    int ic, jc, kc;
+    int index0;// index1;
     int nlocal[3];
     int nplane, plane; // n
     int jdy; // j1, j2;
@@ -350,9 +354,11 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     double ltot[3];
     double *recv_buff;
     physics_t *phys = NULL;
+    lees_edw_t * le_target;
 
     assert(lb);
     assert(le);
+    
 
     lees_edw_ltot(le, ltot);
     lees_edw_nlocal(le, nlocal);
@@ -360,6 +366,7 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     nplane = lees_edw_nplane_local(le);
     physics_ref(&phys);
     lb_ndist(lb, &ndist);
+    lees_edw_target(le, &le_target);
 
     t = 1.0 * physics_control_timestep(phys);
 
@@ -417,49 +424,13 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
         jdy = floor(dy);
         fr = dy - jdy;
 
-        interpolation<<<numBlocks, threadsPerBlock>>>(lb_t *lb, lees_edw_t *le, double *recv_buff, 
-            int *nlocal, int *positive, int nprop, int ic, int jdy, int ndist, double fr);
-        cudaDeviceSynchronize();
+        // printf("jdy = %d, fr = %f \n\n", jdy, fr);
 
-        copy_back<<<numBlocks, threadsPerBlock>>>(lb_t *lb, lees_edw_t *le, double *recv_buff, 
-            int *nlocal, int *positive, int nprop, int ic, int ndist, double fr);
+        interpolation<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, d_positive, nprop, ic, jdy, ndist, fr);
         cudaDeviceSynchronize();
         
-        // if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
-
-        //     j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
-        //     j2 = 1 + (j1 % nlocal[Y]);
-
-        //     index0 = lees_edw_index(le, ic, j1, kc);
-        //     index1 = lees_edw_index(le, ic, j2, kc);
-
-        //     /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
-
-        //     for (n = 0; n < ndist; n++) {
-        //         for (int i = 0; i < nprop; i++) {
-        //             int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
-        //             int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, positive[i]);
-        //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
-        //             recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
-        //         }
-        //     }
-        //     /* Next site */
-        // }
-
-        /* ...and copy back ... */
-        // if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
-        //     index0 = lees_edw_index(le, ic, jc, kc);
-
-        //     for (n = 0; n < ndist; n++) {
-        //         for (int i = 0; i < nprop; i++) {
-        //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
-        //             int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
-        //             lb->f[la] = recv_buff[index];
-        //         }
-        //     }
-        //     /* Next site */
-        // }
-        
+        copy_back<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, d_positive, nprop, ic, ndist, fr);
+        cudaDeviceSynchronize();
 
         /* OTHER DIRECTION */
         ic = lees_edw_plane_location(le, plane) + 1;
@@ -469,51 +440,12 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
         jdy = floor(dy);
         fr = dy - jdy;
 
-        interpolation<<<numBlocks, threadsPerBlock>>>(lb_t *lb, lees_edw_t *le, double *recv_buff, 
-            int *nlocal, int *negative, int negprop, int ic, int jdy, int ndist, double fr);
+        interpolation<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, d_negative, negprop, ic, jdy, ndist, fr);
         cudaDeviceSynchronize();
 
-        copy_back<<<numBlocks, threadsPerBlock>>>(lb_t *lb, lees_edw_t *le, double *recv_buff, 
-            int *nlocal, int *negative, int negprop, int ic, int ndist, double fr);
+        copy_back<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, d_negative, negprop, ic, ndist, fr);
         cudaDeviceSynchronize();
         
-        // if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
-
-        //     j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
-        //     j2 = 1 + (j1 % nlocal[Y]);
-
-        //     index0 = lees_edw_index(le, ic, j1, kc);
-        //     index1 = lees_edw_index(le, ic, j2, kc);
-
-        //     for (n = 0; n < ndist; n++) {
-        //         for (int i = 0; i < negprop; i++) {
-        //             int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
-        //             int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, negative[i]);
-        //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i;
-        //             recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
-        //         }
-        //     }
-        //     /* Next site */
-        // }
-
-
-        // /* ...and now overwrite... */
-
-        // if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
-
-        //     index0 = lees_edw_index(le, ic, jc, kc);
-
-        //     for (n = 0; n < ndist; n++) {
-        //         for (int i = 0; i < negprop; i++) {
-        //             int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
-        //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i;
-        //             lb->f[ijkp] = recv_buff[index];
-        //         }
-        //     }
-            
-        // }
-
-        /* Next plane */
     }
 
     return;

From de757f4fe1dcd217a16ff513c6adebf36b35b6c6 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Fri, 30 Jun 2023 16:23:51 +0100
Subject: [PATCH 015/133] the result is correct now, those failed tests are
 expected to fail, so ...

---
 src/model_le.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index 76a8ab379..8bd327235 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -177,7 +177,6 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
     lees_edw_cartsz(le, mpi_cartsz);
 
     if (lees_edw_nplane_local(le) > 0) {
-
         TIMER_start(TIMER_LE);
 
         /* Everything must be done on host at the moment (slowly) ... */
@@ -424,8 +423,6 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
         jdy = floor(dy);
         fr = dy - jdy;
 
-        // printf("jdy = %d, fr = %f \n\n", jdy, fr);
-
         interpolation<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, d_positive, nprop, ic, jdy, ndist, fr);
         cudaDeviceSynchronize();
         
@@ -434,7 +431,6 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
 
         /* OTHER DIRECTION */
         ic = lees_edw_plane_location(le, plane) + 1;
-
         lees_edw_buffer_displacement(le, nhalo, t, &dy);
         dy = fmod(-dy, ltot[Y]);
         jdy = floor(dy);
@@ -445,7 +441,6 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
 
         copy_back<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, d_negative, negprop, ic, ndist, fr);
         cudaDeviceSynchronize();
-        
     }
 
     return;

From 577ad993878a640f16934e8a4fae158235ecf755 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Sun, 2 Jul 2023 09:21:04 +0100
Subject: [PATCH 016/133] reduced the number of kernel launches by parallelising
 the loop of plane in interpolation function, the result is exactly the same
 as before, but many tests failed this time

---
 src/model_le.c | 174 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 121 insertions(+), 53 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index 8bd327235..f6ba9cf08 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -35,10 +35,10 @@
 
 __global__ static void le_reproject(lb_t *lb, lees_edw_t *le);
 static void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le);
-__global__ static void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *displacement,  
-    int nprop, int ic, int jdy, int ndist, double fr);
-__global__ static void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *displacement,  
-    int nprop, int ic, int ndist, double fr);
+__global__ static void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *positive,  
+    int *negative, int nprop, int negprop, int displacement, double t);
+__global__ static void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *positive,  
+    int *negative, int nprop, int negprop, int displacement);
 static int le_displace_and_interpolate_parallel(lb_t *lb, lees_edw_t *le);
 
 void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
@@ -97,55 +97,117 @@ cudaError_t copy_buffer_duy_to_device(lees_edw_s* d_lees_edw, int* h_buffer_duy,
     return cudaSuccess;
 }
 
-__global__ void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *displacement,  
-    int nprop, int ic, int jdy, int ndist, double fr) {
-
+__global__ void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *positive,  
+    int *negative, int nprop, int negprop, int displacement, double t) {
+    int plane, ic, jc, kc;
+    int nhalo, ndist, nplane;
+    int jdy, j1, j2, index0, index1;
+    double dy, fr;
+    double ltot[3];
     int nlocal[3];
+
+    lb_ndist(lb, &ndist);
     lees_edw_nlocal(le, nlocal);
+    lees_edw_nhalo(le, &nhalo);
+    lees_edw_ltot(le, ltot);
+    nplane = lees_edw_nplane_local(le);
+
+    plane = blockIdx.x * blockDim.x + threadIdx.x;
+    jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
+    kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
 
-    int jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
-    int kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
+    if (jc <= nlocal[Y] && kc <= nlocal[Z] && plane < nplane) {
 
-    if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
+        ic = lees_edw_plane_location(le, plane);
+        lees_edw_buffer_displacement(le, nhalo, t, &dy);
+        dy = fmod(dy, ltot[Y]);
+        jdy = floor(dy);
+        fr = dy - jdy;
 
-        int j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
-        int j2 = 1 + (j1 % nlocal[Y]);
+        j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+        j2 = 1 + (j1 % nlocal[Y]);
 
-        int index0 = lees_edw_index(le, ic, j1, kc);
-        int index1 = lees_edw_index(le, ic, j2, kc);
+        index0 = lees_edw_index(le, ic, j1, kc);
+        index1 = lees_edw_index(le, ic, j2, kc);
 
         /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
 
         for (int n = 0; n < ndist; n++) {
             for (int i = 0; i < nprop; i++) {
-                int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, displacement[i]);
-                int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, displacement[i]);
+                int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
+                int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, positive[i]);
                 int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
                 recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
             }
         }
-        /* Next site */
+
+        /* OTHER DIRECTION */
+        ic = lees_edw_plane_location(le, plane) + 1;
+        lees_edw_buffer_displacement(le, nhalo, t, &dy);
+        dy = fmod(-dy, ltot[Y]);
+        jdy = floor(dy);
+        fr = dy - jdy;
+
+        j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+        j2 = 1 + (j1 % nlocal[Y]);
+
+        index0 = lees_edw_index(le, ic, j1, kc);
+        index1 = lees_edw_index(le, ic, j2, kc);
+
+        /* xdisp_fwd_cv[0] identifies cv[p][X] = -1 */
+
+        for (int n = 0; n < ndist; n++) {
+            for (int i = 0; i < negprop; i++) {
+                int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
+                int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, negative[i]);
+                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i;
+                recv_buff[index + displacement] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+            }
+        }
     }
+
 }
 
-__global__ void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *displacement,  
-    int nprop, int ic, int ndist, double fr) {
+__global__ void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *positive,  
+    int *negative, int nprop, int negprop, int displacement) {
+    int plane, ic, jc, kc;
+    int nhalo, ndist, nplane;
     int nlocal[3];
+    int index0;
+
+    lb_ndist(lb, &ndist);
     lees_edw_nlocal(le, nlocal);
-    int jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
-    int kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
+    lees_edw_nhalo(le, &nhalo);
+    nplane = lees_edw_nplane_local(le);
 
-    if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
-        int index0 = lees_edw_index(le, ic, jc, kc);
+    plane = blockIdx.x * blockDim.x + threadIdx.x;
+    jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
+    kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
+
+    if (jc <= nlocal[Y] && kc <= nlocal[Z] && plane < nplane) {
+        ic = lees_edw_plane_location(le, plane);
+        index0 = lees_edw_index(le, ic, jc, kc);
 
         for (int n = 0; n < ndist; n++) {
             for (int i = 0; i < nprop; i++) {
                 int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
-                int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, displacement[i]);
+                int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
                 lb->f[la] = recv_buff[index];
             }
         }
-        /* Next site */
+
+        /* Another direction */
+
+        ic = lees_edw_plane_location(le, plane) + 1; 
+        index0 = lees_edw_index(le, ic, jc, kc);
+
+        for (int n = 0; n < ndist; n++) {
+            for (int i = 0; i < negprop; i++) {
+                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i;
+                int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
+                lb->f[la] = recv_buff[index + displacement];
+            }
+        }
     }
 }
 
@@ -347,7 +409,7 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     int ndist;
     int nprop, negprop;
     int nhalo;
-    int ndata;
+    int ndata, displacement;
     double dy, fr;
     double t;
     double ltot[3];
@@ -382,7 +444,8 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
         if (lb->model.cv[p][X] == +1) nprop += 1;
         if (lb->model.cv[p][X] == -1) negprop += 1;
     }
-    ndata = ndist * nprop * nlocal[Y] * nlocal[Z];
+    displacement = ndist * nprop * nlocal[Y] * nlocal[Z];
+    ndata = 2 * displacement;
     cudaMalloc((void**)&recv_buff, ndata * sizeof(double));
 
     if (recv_buff == NULL) {
@@ -412,36 +475,41 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     cudaMemcpy(d_negative, negative, sizeof(int) * negprop, cudaMemcpyHostToDevice);
     
     //define a Cuda model
-    dim3 numBlocks(1, (nlocal[Y] + 15) / 16, (nlocal[Z] + 15) / 16);
-    dim3 threadsPerBlock(1, 16, 16);
+    dim3 numBlocks((nplane + 7) / 8, (nlocal[Y] + 7) / 8, (nlocal[Z] + 7) / 8);
+    dim3 threadsPerBlock(8, 8, 8);
    
 
-    for (plane = 0; plane < nplane; plane++) {
-        ic = lees_edw_plane_location(le, plane);
-        lees_edw_buffer_displacement(le, nhalo, t, &dy);
-        dy = fmod(dy, ltot[Y]);
-        jdy = floor(dy);
-        fr = dy - jdy;
-
-        interpolation<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, d_positive, nprop, ic, jdy, ndist, fr);
-        cudaDeviceSynchronize();
-        
-        copy_back<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, d_positive, nprop, ic, ndist, fr);
-        cudaDeviceSynchronize();
+    // for (plane = 0; plane < nplane; plane++) {
+        // ic = lees_edw_plane_location(le, plane);
+        // lees_edw_buffer_displacement(le, nhalo, t, &dy);
+        // dy = fmod(dy, ltot[Y]);
+        // jdy = floor(dy);
+        // fr = dy - jdy;
+    // printf("nplane=%d, nhalo=%d, ndist=%d, t=%.2f, nlocal[2]=%d, ltot[2]=%.2f \n\n", 
+    //         nplane, nhalo, ndist, t, nlocal[2], ltot[2]);
+    // return;
+
+    interpolation<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, 
+        d_positive, d_negative, nprop, negprop, displacement, t);
+    cudaDeviceSynchronize();
+    
+    copy_back<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, 
+        d_positive, d_negative, nprop, negprop, displacement);
+    cudaDeviceSynchronize();
 
         /* OTHER DIRECTION */
-        ic = lees_edw_plane_location(le, plane) + 1;
-        lees_edw_buffer_displacement(le, nhalo, t, &dy);
-        dy = fmod(-dy, ltot[Y]);
-        jdy = floor(dy);
-        fr = dy - jdy;
-
-        interpolation<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, d_negative, negprop, ic, jdy, ndist, fr);
-        cudaDeviceSynchronize();
-
-        copy_back<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, d_negative, negprop, ic, ndist, fr);
-        cudaDeviceSynchronize();
-    }
+        // ic = lees_edw_plane_location(le, plane) + 1;
+        // lees_edw_buffer_displacement(le, nhalo, t, &dy);
+        // dy = fmod(-dy, ltot[Y]);
+        // jdy = floor(dy);
+        // fr = dy - jdy;
+
+        // interpolation<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, d_negative, negprop, ic, jdy, ndist, fr);
+        // cudaDeviceSynchronize();
+
+        // copy_back<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, d_negative, negprop, ic, ndist, fr);
+        // cudaDeviceSynchronize();
+    // }
 
     return;
 }

From dab82d68c88bc5551a9ac8d999e51b01ed52cefd Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Fri, 7 Jul 2023 01:16:00 +0100
Subject: [PATCH 017/133] resize the recv_buffer and modify the index, the
 result is correct now

---
 src/model_le.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index f6ba9cf08..c0957fdda 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -136,7 +136,7 @@ __global__ void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *
             for (int i = 0; i < nprop; i++) {
                 int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
                 int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, positive[i]);
-                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
+                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
                 recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
             }
         }
@@ -160,8 +160,8 @@ __global__ void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *
             for (int i = 0; i < negprop; i++) {
                 int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
                 int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, negative[i]);
-                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i;
-                recv_buff[index + displacement] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i + (2 * plane + 1) * displacement;
+                recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
             }
         }
     }
@@ -190,7 +190,7 @@ __global__ void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *posi
 
         for (int n = 0; n < ndist; n++) {
             for (int i = 0; i < nprop; i++) {
-                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i;
+                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
                 int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
                 lb->f[la] = recv_buff[index];
             }
@@ -203,9 +203,9 @@ __global__ void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *posi
 
         for (int n = 0; n < ndist; n++) {
             for (int i = 0; i < negprop; i++) {
-                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i;
+                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i + (2 * plane + 1) * displacement;
                 int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
-                lb->f[la] = recv_buff[index + displacement];
+                lb->f[la] = recv_buff[index];
             }
         }
     }
@@ -445,7 +445,7 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
         if (lb->model.cv[p][X] == -1) negprop += 1;
     }
     displacement = ndist * nprop * nlocal[Y] * nlocal[Z];
-    ndata = 2 * displacement;
+    ndata = 2 * nplane * displacement;
     cudaMalloc((void**)&recv_buff, ndata * sizeof(double));
 
     if (recv_buff == NULL) {

From 068996b7ea7c15e298faa5d62cfbf1aed675e168 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Thu, 13 Jul 2023 13:28:38 +0100
Subject: [PATCH 018/133] changed part of the cuda to tdp, it works. This
 commit is to preserve the correct code

---
 src/model_le.c | 63 +++++++++++++++++++-------------------------------
 1 file changed, 24 insertions(+), 39 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index c0957fdda..039e5eb7a 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -32,6 +32,7 @@
 #include "util.h"
 
 #include "leesedwards.h"
+#include "target.h"
 
 __global__ static void le_reproject(lb_t *lb, lees_edw_t *le);
 static void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le);
@@ -419,7 +420,6 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
 
     assert(lb);
     assert(le);
-    
 
     lees_edw_ltot(le, ltot);
     lees_edw_nlocal(le, nlocal);
@@ -446,7 +446,7 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     }
     displacement = ndist * nprop * nlocal[Y] * nlocal[Z];
     ndata = 2 * nplane * displacement;
-    cudaMalloc((void**)&recv_buff, ndata * sizeof(double));
+    tdpMalloc((void**)&recv_buff, ndata * sizeof(double));
 
     if (recv_buff == NULL) {
         pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
@@ -469,47 +469,32 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
 
     // copy the displacement array to the device
     int *d_positive, *d_negative;
-    cudaMalloc((void**)&d_positive, sizeof(int) * nprop);
-    cudaMalloc((void**)&d_negative, sizeof(int) * negprop);
-    cudaMemcpy(d_positive, positive, sizeof(int) * nprop, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_negative, negative, sizeof(int) * negprop, cudaMemcpyHostToDevice);
-    
+    tdpMalloc((void**)&d_positive, sizeof(int) * nprop);
+    tdpMalloc((void**)&d_negative, sizeof(int) * negprop);
+    tdpMemcpy(d_positive, positive, sizeof(int) * nprop, tdpMemcpyHostToDevice);
+    tdpMemcpy(d_negative, negative, sizeof(int) * negprop, tdpMemcpyHostToDevice);
+
     //define a Cuda model
     dim3 numBlocks((nplane + 7) / 8, (nlocal[Y] + 7) / 8, (nlocal[Z] + 7) / 8);
     dim3 threadsPerBlock(8, 8, 8);
-   
-
-    // for (plane = 0; plane < nplane; plane++) {
-        // ic = lees_edw_plane_location(le, plane);
-        // lees_edw_buffer_displacement(le, nhalo, t, &dy);
-        // dy = fmod(dy, ltot[Y]);
-        // jdy = floor(dy);
-        // fr = dy - jdy;
-    // printf("nplane=%d, nhalo=%d, ndist=%d, t=%.2f, nlocal[2]=%d, ltot[2]=%.2f \n\n", 
-    //         nplane, nhalo, ndist, t, nlocal[2], ltot[2]);
-    // return;
-
-    interpolation<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, 
-        d_positive, d_negative, nprop, negprop, displacement, t);
-    cudaDeviceSynchronize();
-    
-    copy_back<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, 
-        d_positive, d_negative, nprop, negprop, displacement);
-    cudaDeviceSynchronize();
 
-        /* OTHER DIRECTION */
-        // ic = lees_edw_plane_location(le, plane) + 1;
-        // lees_edw_buffer_displacement(le, nhalo, t, &dy);
-        // dy = fmod(-dy, ltot[Y]);
-        // jdy = floor(dy);
-        // fr = dy - jdy;
-
-        // interpolation<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, d_negative, negprop, ic, jdy, ndist, fr);
-        // cudaDeviceSynchronize();
-
-        // copy_back<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, d_negative, negprop, ic, ndist, fr);
-        // cudaDeviceSynchronize();
-    // }
+    // interpolation<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, 
+    //     d_positive, d_negative, nprop, negprop, displacement, t);
+    tdpLaunchKernel(interpolation, numBlocks, threadsPerBlock, 0, 0, lb->target, le_target, 
+        recv_buff, d_positive, d_negative, nprop, negprop, displacement, t);
+    tdpDeviceSynchronize();
+
+    // copy_back<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, 
+    //     d_positive, d_negative, nprop, negprop, displacement);
+    tdpLaunchKernel(copy_back, numBlocks, threadsPerBlock, 0, 0, lb->target, le_target, 
+        recv_buff, d_positive, d_negative, nprop, negprop, displacement);
+    tdpDeviceSynchronize();
+
+    free(positive);
+    free(negative);
+    tdpFree(recv_buff);
+    tdpFree(d_positive);
+    tdpFree(d_negative);
 
     return;
 }

From 0ee36d5cb33d23ca50d45cee002d253cc4b672bb Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Thu, 13 Jul 2023 14:38:26 +0100
Subject: [PATCH 019/133] finish applying tdp on interpolation, there is
 reprojection to go

---
 src/model_le.c | 254 ++++++++++++++++++++++++++++++++++---------------
 1 file changed, 178 insertions(+), 76 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index 039e5eb7a..2ad5463e8 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -37,9 +37,9 @@
 __global__ static void le_reproject(lb_t *lb, lees_edw_t *le);
 static void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le);
 __global__ static void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *positive,  
-    int *negative, int nprop, int negprop, int displacement, double t);
+    int *negative, int nprop, int negprop, int displacement, double t, kernel_ctxt_t * ktxt);
 __global__ static void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *positive,  
-    int *negative, int nprop, int negprop, int displacement);
+    int *negative, int nprop, int negprop, int displacement, kernel_ctxt_t * ktxt);
 static int le_displace_and_interpolate_parallel(lb_t *lb, lees_edw_t *le);
 
 void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
@@ -99,7 +99,7 @@ cudaError_t copy_buffer_duy_to_device(lees_edw_s* d_lees_edw, int* h_buffer_duy,
 }
 
 __global__ void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *positive,  
-    int *negative, int nprop, int negprop, int displacement, double t) {
+    int *negative, int nprop, int negprop, int displacement, double t, kernel_ctxt_t * ktxt) {
     int plane, ic, jc, kc;
     int nhalo, ndist, nplane;
     int jdy, j1, j2, index0, index1;
@@ -113,64 +113,126 @@ __global__ void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *
     lees_edw_ltot(le, ltot);
     nplane = lees_edw_nplane_local(le);
 
-    plane = blockIdx.x * blockDim.x + threadIdx.x;
-    jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
-    kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
+    int kindex;
+    int kiter;
+    assert(ktxt);
+    kiter = kernel_iterations(ktxt);
 
-    if (jc <= nlocal[Y] && kc <= nlocal[Z] && plane < nplane) {
+    for_simt_parallel(kindex, kiter, 1) {
+        plane = kernel_coords_ic(ktxt, kindex);
+        jc = kernel_coords_jc(ktxt, kindex);
+        kc = kernel_coords_kc(ktxt, kindex);
 
-        ic = lees_edw_plane_location(le, plane);
-        lees_edw_buffer_displacement(le, nhalo, t, &dy);
-        dy = fmod(dy, ltot[Y]);
-        jdy = floor(dy);
-        fr = dy - jdy;
+        if (jc <= nlocal[Y] && kc <= nlocal[Z] && plane < nplane) {
+
+            ic = lees_edw_plane_location(le, plane);
+            lees_edw_buffer_displacement(le, nhalo, t, &dy);
+            dy = fmod(dy, ltot[Y]);
+            jdy = floor(dy);
+            fr = dy - jdy;
 
-        j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
-        j2 = 1 + (j1 % nlocal[Y]);
+            j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+            j2 = 1 + (j1 % nlocal[Y]);
 
-        index0 = lees_edw_index(le, ic, j1, kc);
-        index1 = lees_edw_index(le, ic, j2, kc);
+            index0 = lees_edw_index(le, ic, j1, kc);
+            index1 = lees_edw_index(le, ic, j2, kc);
 
-        /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
+            /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
 
-        for (int n = 0; n < ndist; n++) {
-            for (int i = 0; i < nprop; i++) {
-                int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
-                int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, positive[i]);
-                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
-                recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+            for (int n = 0; n < ndist; n++) {
+                for (int i = 0; i < nprop; i++) {
+                    int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
+                    int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, positive[i]);
+                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
+                    recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+                }
             }
-        }
 
-        /* OTHER DIRECTION */
-        ic = lees_edw_plane_location(le, plane) + 1;
-        lees_edw_buffer_displacement(le, nhalo, t, &dy);
-        dy = fmod(-dy, ltot[Y]);
-        jdy = floor(dy);
-        fr = dy - jdy;
+            /* OTHER DIRECTION */
+            ic = lees_edw_plane_location(le, plane) + 1;
+            lees_edw_buffer_displacement(le, nhalo, t, &dy);
+            dy = fmod(-dy, ltot[Y]);
+            jdy = floor(dy);
+            fr = dy - jdy;
 
-        j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
-        j2 = 1 + (j1 % nlocal[Y]);
+            j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+            j2 = 1 + (j1 % nlocal[Y]);
 
-        index0 = lees_edw_index(le, ic, j1, kc);
-        index1 = lees_edw_index(le, ic, j2, kc);
+            index0 = lees_edw_index(le, ic, j1, kc);
+            index1 = lees_edw_index(le, ic, j2, kc);
 
-        /* xdisp_fwd_cv[0] identifies cv[p][X] = -1 */
+            /* xdisp_fwd_cv[0] identifies cv[p][X] = -1 */
 
-        for (int n = 0; n < ndist; n++) {
-            for (int i = 0; i < negprop; i++) {
-                int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
-                int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, negative[i]);
-                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i + (2 * plane + 1) * displacement;
-                recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+            for (int n = 0; n < ndist; n++) {
+                for (int i = 0; i < negprop; i++) {
+                    int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
+                    int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, negative[i]);
+                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i + (2 * plane + 1) * displacement;
+                    recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+                }
             }
         }
     }
 
+
+    // plane = blockIdx.x * blockDim.x + threadIdx.x;
+    // jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
+    // kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
+
+    // if (jc <= nlocal[Y] && kc <= nlocal[Z] && plane < nplane) {
+
+    //     ic = lees_edw_plane_location(le, plane);
+    //     lees_edw_buffer_displacement(le, nhalo, t, &dy);
+    //     dy = fmod(dy, ltot[Y]);
+    //     jdy = floor(dy);
+    //     fr = dy - jdy;
+
+    //     j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+    //     j2 = 1 + (j1 % nlocal[Y]);
+
+    //     index0 = lees_edw_index(le, ic, j1, kc);
+    //     index1 = lees_edw_index(le, ic, j2, kc);
+
+    //     /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
+
+    //     for (int n = 0; n < ndist; n++) {
+    //         for (int i = 0; i < nprop; i++) {
+    //             int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
+    //             int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, positive[i]);
+    //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
+    //             recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+    //         }
+    //     }
+
+    //     /* OTHER DIRECTION */
+    //     ic = lees_edw_plane_location(le, plane) + 1;
+    //     lees_edw_buffer_displacement(le, nhalo, t, &dy);
+    //     dy = fmod(-dy, ltot[Y]);
+    //     jdy = floor(dy);
+    //     fr = dy - jdy;
+
+    //     j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
+    //     j2 = 1 + (j1 % nlocal[Y]);
+
+    //     index0 = lees_edw_index(le, ic, j1, kc);
+    //     index1 = lees_edw_index(le, ic, j2, kc);
+
+    //     /* xdisp_fwd_cv[0] identifies cv[p][X] = -1 */
+
+    //     for (int n = 0; n < ndist; n++) {
+    //         for (int i = 0; i < negprop; i++) {
+    //             int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
+    //             int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, negative[i]);
+    //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i + (2 * plane + 1) * displacement;
+    //             recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+    //         }
+    //     }
+    // }
+
 }
 
 __global__ void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *positive,  
-    int *negative, int nprop, int negprop, int displacement) {
+    int *negative, int nprop, int negprop, int displacement, kernel_ctxt_t * ktxt) {
     int plane, ic, jc, kc;
     int nhalo, ndist, nplane;
     int nlocal[3];
@@ -181,35 +243,72 @@ __global__ void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *posi
     lees_edw_nhalo(le, &nhalo);
     nplane = lees_edw_nplane_local(le);
 
-    plane = blockIdx.x * blockDim.x + threadIdx.x;
-    jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
-    kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
-
-    if (jc <= nlocal[Y] && kc <= nlocal[Z] && plane < nplane) {
-        ic = lees_edw_plane_location(le, plane);
-        index0 = lees_edw_index(le, ic, jc, kc);
-
-        for (int n = 0; n < ndist; n++) {
-            for (int i = 0; i < nprop; i++) {
-                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
-                int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
-                lb->f[la] = recv_buff[index];
+    int kindex;
+    int kiter;
+    assert(ktxt);
+    kiter = kernel_iterations(ktxt);
+
+    for_simt_parallel(kindex, kiter, 1) {
+        plane = kernel_coords_ic(ktxt, kindex);
+        jc = kernel_coords_jc(ktxt, kindex);
+        kc = kernel_coords_kc(ktxt, kindex);
+
+        if (jc <= nlocal[Y] && kc <= nlocal[Z] && plane < nplane) {
+            ic = lees_edw_plane_location(le, plane);
+            index0 = lees_edw_index(le, ic, jc, kc);
+
+            for (int n = 0; n < ndist; n++) {
+                for (int i = 0; i < nprop; i++) {
+                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
+                    int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
+                    lb->f[la] = recv_buff[index];
+                }
             }
-        }
 
-        /* Another direction */
+            /* Another direction */
 
-        ic = lees_edw_plane_location(le, plane) + 1; 
-        index0 = lees_edw_index(le, ic, jc, kc);
+            ic = lees_edw_plane_location(le, plane) + 1; 
+            index0 = lees_edw_index(le, ic, jc, kc);
 
-        for (int n = 0; n < ndist; n++) {
-            for (int i = 0; i < negprop; i++) {
-                int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i + (2 * plane + 1) * displacement;
-                int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
-                lb->f[la] = recv_buff[index];
+            for (int n = 0; n < ndist; n++) {
+                for (int i = 0; i < negprop; i++) {
+                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i + (2 * plane + 1) * displacement;
+                    int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
+                    lb->f[la] = recv_buff[index];
+                }
             }
         }
-    }
+    }  
+
+    // plane = blockIdx.x * blockDim.x + threadIdx.x;
+    // jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
+    // kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
+    
+    // if (jc <= nlocal[Y] && kc <= nlocal[Z] && plane < nplane) {
+    //     ic = lees_edw_plane_location(le, plane);
+    //     index0 = lees_edw_index(le, ic, jc, kc);
+
+    //     for (int n = 0; n < ndist; n++) {
+    //         for (int i = 0; i < nprop; i++) {
+    //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
+    //             int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
+    //             lb->f[la] = recv_buff[index];
+    //         }
+    //     }
+
+    //     /* Another direction */
+
+    //     ic = lees_edw_plane_location(le, plane) + 1; 
+    //     index0 = lees_edw_index(le, ic, jc, kc);
+
+    //     for (int n = 0; n < ndist; n++) {
+    //         for (int i = 0; i < negprop; i++) {
+    //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i + (2 * plane + 1) * displacement;
+    //             int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
+    //             lb->f[la] = recv_buff[index];
+    //         }
+    //     }
+    // }
 }
 
 /*****************************************************************************
@@ -474,20 +573,23 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     tdpMemcpy(d_positive, positive, sizeof(int) * nprop, tdpMemcpyHostToDevice);
     tdpMemcpy(d_negative, negative, sizeof(int) * negprop, tdpMemcpyHostToDevice);
 
-    //define a Cuda model
-    dim3 numBlocks((nplane + 7) / 8, (nlocal[Y] + 7) / 8, (nlocal[Z] + 7) / 8);
-    dim3 threadsPerBlock(8, 8, 8);
+    //tdp
+    dim3 nblk, ntpb;
+    kernel_info_t limits;
+    kernel_ctxt_t * ctxt = NULL;
+    limits.imin = 0; limits.imax = nplane;
+    limits.jmin = 1; limits.jmax = nlocal[Y] + 1;
+    limits.kmin = 1; limits.kmax = nlocal[Z] + 1;
+
+    kernel_ctxt_create(le->cs, NSIMDVL, limits, &ctxt);
+    kernel_ctxt_launch_param(ctxt, &nblk, &ntpb);
 
-    // interpolation<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, 
-    //     d_positive, d_negative, nprop, negprop, displacement, t);
-    tdpLaunchKernel(interpolation, numBlocks, threadsPerBlock, 0, 0, lb->target, le_target, 
-        recv_buff, d_positive, d_negative, nprop, negprop, displacement, t);
+    tdpLaunchKernel(interpolation, nblk, ntpb, 0, 0, lb->target, le_target, 
+        recv_buff, d_positive, d_negative, nprop, negprop, displacement, t, ctxt->target);
     tdpDeviceSynchronize();
 
-    // copy_back<<<numBlocks, threadsPerBlock>>>(lb->target, le_target, recv_buff, 
-    //     d_positive, d_negative, nprop, negprop, displacement);
-    tdpLaunchKernel(copy_back, numBlocks, threadsPerBlock, 0, 0, lb->target, le_target, 
-        recv_buff, d_positive, d_negative, nprop, negprop, displacement);
+    tdpLaunchKernel(copy_back, nblk, ntpb, 0, 0, lb->target, le_target, 
+        recv_buff, d_positive, d_negative, nprop, negprop, displacement, ctxt->target);
     tdpDeviceSynchronize();
 
     free(positive);

From fd24e77df9a921f4c02f589520e9facf89753ad8 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Sat, 15 Jul 2023 17:46:15 +0100
Subject: [PATCH 020/133] add recv_buff in the lb struct, and add allocation
 and deallocation of this buffer in model.c file

---
 src/lb_data.h  |   4 +-
 src/model.c    |  35 +++++++++++++-
 src/model_le.c | 128 ++++++++-----------------------------------------
 3 files changed, 58 insertions(+), 109 deletions(-)

diff --git a/src/lb_data.h b/src/lb_data.h
index 50b464523..7da58fa53 100644
--- a/src/lb_data.h
+++ b/src/lb_data.h
@@ -125,7 +125,9 @@ struct lb_data_s {
 
   lb_data_options_t opts;       /* Copy of run time options */
   lb_halo_t h;                  /* halo information/buffers */
-
+  
+  double * recv_buff;
+  
   lb_t * target;                /* copy of this structure on target */
 };
 
diff --git a/src/model.c b/src/model.c
index d6fc2f61e..6059704c9 100644
--- a/src/model.c
+++ b/src/model.c
@@ -168,7 +168,33 @@ int lb_data_create(pe_t * pe, cs_t * cs, const lb_data_options_t * options,
 				   &obj->output);
     if (ifail != 0) pe_fatal(pe, "lb_data: bad i/o output decomposition\n"); 
   }
-
+    
+    
+    int nplane = cs->leopts.nplanes;
+    if (nplane > 0) {
+        int nprop = 0;
+        for (int p = 1; p < obj->model.nvel; p++) {
+            if (obj->model.cv[p][X] == +1) nprop += 1;
+        }
+
+        int ndist = obj->ndist;
+        int nlocal[3];
+        cs_nlocal(cs, nlocal);
+
+        int displacement = ndist * nprop * nlocal[Y] * nlocal[Z];
+        int ndata = 2 * nplane * displacement;
+        obj->recv_buff = (double *)malloc(sizeof(double) * ndata);
+
+        int ndevice = 0;                      /* defined value even if the device query fails */
+        tdpGetDeviceCount(&ndevice);          /* fill ndevice before it is printed or tested */
+        printf("ndevice = %d \n\n", ndevice);
+        if (ndevice > 0) {
+            double *tmp;
+            tdpMalloc((void**)&tmp, ndata * sizeof(double));
+            tdpMemcpy(&(obj->target->recv_buff), &tmp, sizeof(double*), tdpMemcpyHostToDevice);
+        }
+    }
+    
   *lb = obj;
 
   return 0;
@@ -198,6 +224,11 @@ __host__ int lb_free(lb_t * lb) {
     tdpMemcpy(&tmp, &lb->target->fprime, sizeof(double *),
 	      tdpMemcpyDeviceToHost); 
     tdpFree(tmp);
+
+    tdpMemcpy(&tmp, &lb->target->recv_buff, sizeof(double *),
+	      tdpMemcpyDeviceToHost); 
+    tdpFree(tmp);
+
     tdpFree(lb->target);
   }
 
@@ -209,6 +240,8 @@ __host__ int lb_free(lb_t * lb) {
   if (lb->f) free(lb->f);
   if (lb->fprime) free(lb->fprime);
 
+    if (lb->recv_buff) free(lb->recv_buff);
+
   lb_halo_free(lb, &lb->h);
   lb_model_free(&lb->model);
 
diff --git a/src/model_le.c b/src/model_le.c
index 2ad5463e8..8eb736cbb 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -36,9 +36,9 @@
 
 __global__ static void le_reproject(lb_t *lb, lees_edw_t *le);
 static void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le);
-__global__ static void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *positive,  
+__global__ static void interpolation(lb_t *lb, lees_edw_t *le, int *positive,  
     int *negative, int nprop, int negprop, int displacement, double t, kernel_ctxt_t * ktxt);
-__global__ static void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *positive,  
+__global__ static void copy_back(lb_t *lb, lees_edw_t *le, int *positive,  
     int *negative, int nprop, int negprop, int displacement, kernel_ctxt_t * ktxt);
 static int le_displace_and_interpolate_parallel(lb_t *lb, lees_edw_t *le);
 
@@ -98,7 +98,7 @@ cudaError_t copy_buffer_duy_to_device(lees_edw_s* d_lees_edw, int* h_buffer_duy,
     return cudaSuccess;
 }
 
-__global__ void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *positive,  
+__global__ void interpolation(lb_t *lb, lees_edw_t *le, int *positive,  
     int *negative, int nprop, int negprop, int displacement, double t, kernel_ctxt_t * ktxt) {
     int plane, ic, jc, kc;
     int nhalo, ndist, nplane;
@@ -144,7 +144,7 @@ __global__ void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *
                     int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
                     int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, positive[i]);
                     int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
-                    recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+                    lb->recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
                 }
             }
 
@@ -168,70 +168,14 @@ __global__ void interpolation(lb_t *lb, lees_edw_t *le, double *recv_buff, int *
                     int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
                     int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, negative[i]);
                     int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i + (2 * plane + 1) * displacement;
-                    recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+                    lb->recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
                 }
             }
         }
     }
-
-
-    // plane = blockIdx.x * blockDim.x + threadIdx.x;
-    // jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
-    // kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
-
-    // if (jc <= nlocal[Y] && kc <= nlocal[Z] && plane < nplane) {
-
-    //     ic = lees_edw_plane_location(le, plane);
-    //     lees_edw_buffer_displacement(le, nhalo, t, &dy);
-    //     dy = fmod(dy, ltot[Y]);
-    //     jdy = floor(dy);
-    //     fr = dy - jdy;
-
-    //     j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
-    //     j2 = 1 + (j1 % nlocal[Y]);
-
-    //     index0 = lees_edw_index(le, ic, j1, kc);
-    //     index1 = lees_edw_index(le, ic, j2, kc);
-
-    //     /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
-
-    //     for (int n = 0; n < ndist; n++) {
-    //         for (int i = 0; i < nprop; i++) {
-    //             int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
-    //             int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, positive[i]);
-    //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
-    //             recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
-    //         }
-    //     }
-
-    //     /* OTHER DIRECTION */
-    //     ic = lees_edw_plane_location(le, plane) + 1;
-    //     lees_edw_buffer_displacement(le, nhalo, t, &dy);
-    //     dy = fmod(-dy, ltot[Y]);
-    //     jdy = floor(dy);
-    //     fr = dy - jdy;
-
-    //     j1 = 1 + (jc + jdy - 1 + 2 * nlocal[Y]) % nlocal[Y];
-    //     j2 = 1 + (j1 % nlocal[Y]);
-
-    //     index0 = lees_edw_index(le, ic, j1, kc);
-    //     index1 = lees_edw_index(le, ic, j2, kc);
-
-    //     /* xdisp_fwd_cv[0] identifies cv[p][X] = -1 */
-
-    //     for (int n = 0; n < ndist; n++) {
-    //         for (int i = 0; i < negprop; i++) {
-    //             int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
-    //             int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, negative[i]);
-    //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i + (2 * plane + 1) * displacement;
-    //             recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
-    //         }
-    //     }
-    // }
-
 }
 
-__global__ void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *positive,  
+__global__ void copy_back(lb_t *lb, lees_edw_t *le, int *positive,  
     int *negative, int nprop, int negprop, int displacement, kernel_ctxt_t * ktxt) {
     int plane, ic, jc, kc;
     int nhalo, ndist, nplane;
@@ -261,7 +205,7 @@ __global__ void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *posi
                 for (int i = 0; i < nprop; i++) {
                     int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
                     int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
-                    lb->f[la] = recv_buff[index];
+                    lb->f[la] = lb->recv_buff[index];
                 }
             }
 
@@ -274,41 +218,11 @@ __global__ void copy_back(lb_t *lb, lees_edw_t *le, double *recv_buff, int *posi
                 for (int i = 0; i < negprop; i++) {
                     int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i + (2 * plane + 1) * displacement;
                     int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
-                    lb->f[la] = recv_buff[index];
+                    lb->f[la] = lb->recv_buff[index];
                 }
             }
         }
     }  
-
-    // plane = blockIdx.x * blockDim.x + threadIdx.x;
-    // jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
-    // kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
-    
-    // if (jc <= nlocal[Y] && kc <= nlocal[Z] && plane < nplane) {
-    //     ic = lees_edw_plane_location(le, plane);
-    //     index0 = lees_edw_index(le, ic, jc, kc);
-
-    //     for (int n = 0; n < ndist; n++) {
-    //         for (int i = 0; i < nprop; i++) {
-    //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
-    //             int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
-    //             lb->f[la] = recv_buff[index];
-    //         }
-    //     }
-
-    //     /* Another direction */
-
-    //     ic = lees_edw_plane_location(le, plane) + 1; 
-    //     index0 = lees_edw_index(le, ic, jc, kc);
-
-    //     for (int n = 0; n < ndist; n++) {
-    //         for (int i = 0; i < negprop; i++) {
-    //             int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i + (2 * plane + 1) * displacement;
-    //             int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
-    //             lb->f[la] = recv_buff[index];
-    //         }
-    //     }
-    // }
 }
 
 /*****************************************************************************
@@ -513,7 +427,7 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     double dy, fr;
     double t;
     double ltot[3];
-    double *recv_buff;
+    // double *recv_buff;
     physics_t *phys = NULL;
     lees_edw_t * le_target;
 
@@ -544,13 +458,13 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
         if (lb->model.cv[p][X] == -1) negprop += 1;
     }
     displacement = ndist * nprop * nlocal[Y] * nlocal[Z];
-    ndata = 2 * nplane * displacement;
-    tdpMalloc((void**)&recv_buff, ndata * sizeof(double));
+    // ndata = 2 * nplane * displacement;
+    // tdpMalloc((void**)&recv_buff, ndata * sizeof(double));
 
-    if (recv_buff == NULL) {
-        pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
-    }
-    assert(recv_buff);
+    // if (lb->recv_buff == NULL) {
+    //     pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
+    // }
+    // assert(lb->recv_buff);
 
     // record the displacement of propgation
     int *positive = (int *)malloc(sizeof(int) * nprop);
@@ -577,24 +491,24 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     dim3 nblk, ntpb;
     kernel_info_t limits;
     kernel_ctxt_t * ctxt = NULL;
-    limits.imin = 0; limits.imax = nplane;
-    limits.jmin = 1; limits.jmax = nlocal[Y] + 1;
-    limits.kmin = 1; limits.kmax = nlocal[Z] + 1;
+    limits.imin = 0; limits.imax = nplane - 1;
+    limits.jmin = 1; limits.jmax = nlocal[Y];
+    limits.kmin = 1; limits.kmax = nlocal[Z];
 
     kernel_ctxt_create(le->cs, NSIMDVL, limits, &ctxt);
     kernel_ctxt_launch_param(ctxt, &nblk, &ntpb);
 
     tdpLaunchKernel(interpolation, nblk, ntpb, 0, 0, lb->target, le_target, 
-        recv_buff, d_positive, d_negative, nprop, negprop, displacement, t, ctxt->target);
+        d_positive, d_negative, nprop, negprop, displacement, t, ctxt->target);
     tdpDeviceSynchronize();
 
     tdpLaunchKernel(copy_back, nblk, ntpb, 0, 0, lb->target, le_target, 
-        recv_buff, d_positive, d_negative, nprop, negprop, displacement, ctxt->target);
+        d_positive, d_negative, nprop, negprop, displacement, ctxt->target);
     tdpDeviceSynchronize();
 
     free(positive);
     free(negative);
-    tdpFree(recv_buff);
+    // tdpFree(recv_buff);
     tdpFree(d_positive);
     tdpFree(d_negative);
 

From 464f60cf83652b9fa92c378e0937271718d15838 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Sun, 16 Jul 2023 00:52:09 +0100
Subject: [PATCH 021/133] used tdp in reprojection

---
 src/model_le.c | 202 ++++++++++++++++++++++++-------------------------
 1 file changed, 100 insertions(+), 102 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index 8eb736cbb..ef9317f76 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -34,7 +34,7 @@
 #include "leesedwards.h"
 #include "target.h"
 
-__global__ static void le_reproject(lb_t *lb, lees_edw_t *le);
+__global__ static void le_reproject(lb_t *lb, lees_edw_t *le, kernel_ctxt_t * ktxt);
 static void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le);
 __global__ static void interpolation(lb_t *lb, lees_edw_t *le, int *positive,  
     int *negative, int nprop, int negprop, int displacement, double t, kernel_ctxt_t * ktxt);
@@ -49,30 +49,30 @@ void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
     double *d_wv;
     double *d_na;
 
-    cudaMalloc((void**)&d_cv, sizeof(int8_t[3]) * nvel);
-    cudaMalloc((void**)&d_wv, sizeof(double) * nvel);
-    cudaMalloc((void**)&d_na, sizeof(double) * nvel);
+    tdpMalloc((void**)&d_cv, sizeof(int8_t[3]) * nvel);
+    tdpMalloc((void**)&d_wv, sizeof(double) * nvel);
+    tdpMalloc((void**)&d_na, sizeof(double) * nvel);
     
     // Copy the data from host to the GPU
-    cudaMemcpy(d_cv, h_model->cv, sizeof(int8_t[3]) * nvel, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_wv, h_model->wv, sizeof(double) * nvel, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_na, h_model->na, sizeof(double) * nvel, cudaMemcpyHostToDevice);
+    tdpMemcpy(d_cv, h_model->cv, sizeof(int8_t[3]) * nvel, tdpMemcpyHostToDevice);
+    tdpMemcpy(d_wv, h_model->wv, sizeof(double) * nvel, tdpMemcpyHostToDevice);
+    tdpMemcpy(d_na, h_model->na, sizeof(double) * nvel, tdpMemcpyHostToDevice);
     
     // Set the pointers in the struct to the newly allocated GPU memory
-    cudaMemcpy(&(d_model->cv), &d_cv, sizeof(int8_t(*)[3]), cudaMemcpyHostToDevice);
-    cudaMemcpy(&(d_model->wv), &d_wv, sizeof(double*), cudaMemcpyHostToDevice);
-    cudaMemcpy(&(d_model->na), &d_na, sizeof(double*), cudaMemcpyHostToDevice);
+    tdpMemcpy(&(d_model->cv), &d_cv, sizeof(int8_t(*)[3]), tdpMemcpyHostToDevice);
+    tdpMemcpy(&(d_model->wv), &d_wv, sizeof(double*), tdpMemcpyHostToDevice);
+    tdpMemcpy(&(d_model->na), &d_na, sizeof(double*), tdpMemcpyHostToDevice);
 
     //copy the rest data to gpu
-    cudaMemcpy(&(d_model->ndim), &(h_model->ndim), sizeof(int8_t), cudaMemcpyHostToDevice);
-    cudaMemcpy(&(d_model->nvel), &(h_model->nvel), sizeof(int8_t), cudaMemcpyHostToDevice);
-    cudaMemcpy(&(d_model->cs2), &(h_model->cs2), sizeof(double), cudaMemcpyHostToDevice);
+    tdpMemcpy(&(d_model->ndim), &(h_model->ndim), sizeof(int8_t), tdpMemcpyHostToDevice);
+    tdpMemcpy(&(d_model->nvel), &(h_model->nvel), sizeof(int8_t), tdpMemcpyHostToDevice);
+    tdpMemcpy(&(d_model->cs2), &(h_model->cs2), sizeof(double), tdpMemcpyHostToDevice);
 }
 
 cudaError_t copy_buffer_duy_to_device(lees_edw_s* d_lees_edw, int* h_buffer_duy, size_t nxbuffer) {
     // First, allocate memory on the device for the buffer_duy array
     int* d_buffer_duy;
-    cudaError_t err = cudaMalloc((void**) &d_buffer_duy, nxbuffer * sizeof(int));
+    cudaError_t err = tdpMalloc((void**) &d_buffer_duy, nxbuffer * sizeof(int));
 
     if (err != cudaSuccess) {
         fprintf(stderr, "Failed to allocate device memory for buffer_duy (error code %s)!\n", cudaGetErrorString(err));
@@ -80,7 +80,7 @@ cudaError_t copy_buffer_duy_to_device(lees_edw_s* d_lees_edw, int* h_buffer_duy,
     }
 
     // Then, copy the data from the host array to the newly allocated device array
-    err = cudaMemcpy(d_buffer_duy, h_buffer_duy, nxbuffer * sizeof(int), cudaMemcpyHostToDevice);
+    err = tdpMemcpy(d_buffer_duy, h_buffer_duy, nxbuffer * sizeof(int), tdpMemcpyHostToDevice);
     
     if (err != cudaSuccess) {
         fprintf(stderr, "Failed to copy buffer_duy from host to device (error code %s)!\n", cudaGetErrorString(err));
@@ -88,7 +88,7 @@ cudaError_t copy_buffer_duy_to_device(lees_edw_s* d_lees_edw, int* h_buffer_duy,
     }
 
     // Finally, update the pointer in the device structure to point to the new device array
-    err = cudaMemcpy(&(d_lees_edw->buffer_duy), &d_buffer_duy, sizeof(int*), cudaMemcpyHostToDevice);
+    err = tdpMemcpy(&(d_lees_edw->buffer_duy), &d_buffer_duy, sizeof(int*), tdpMemcpyHostToDevice);
 
     if (err != cudaSuccess) {
         fprintf(stderr, "Failed to copy buffer_duy pointer to device structure (error code %s)!\n", cudaGetErrorString(err));
@@ -264,14 +264,24 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
 
         copy_buffer_duy_to_device(le_target, le->buffer_duy, le->param->nxbuffer);
 
-        int nlocal[3];
+        int nlocal[3], nplane;
         lees_edw_nlocal(le, nlocal);
-        dim3 numBlocks(1, (nlocal[Y] + 15) / 16, (nlocal[Z] + 15) / 16);
-        dim3 threadsPerBlock(1, 16, 16);
-        le_reproject<<<numBlocks, threadsPerBlock>>>(lb->target, le_target);
-        cudaDeviceSynchronize();
-      
+        nplane = lees_edw_nplane_local(le);
 
+        //tdp
+        dim3 nblk, ntpb;
+        kernel_info_t limits;
+        kernel_ctxt_t * ctxt = NULL;
+        limits.imin = 0; limits.imax = 2 * nplane - 1;
+        limits.jmin = 1; limits.jmax = nlocal[Y];
+        limits.kmin = 1; limits.kmax = nlocal[Z];
+
+        kernel_ctxt_create(le->cs, NSIMDVL, limits, &ctxt);
+        kernel_ctxt_launch_param(ctxt, &nblk, &ntpb);
+
+        tdpLaunchKernel(le_reproject, nblk, ntpb, 0, 0, lb->target, le_target, ctxt->target);
+        tdpAssert(tdpPeekAtLastError());
+        tdpAssert(tdpDeviceSynchronize());
 
         if (mpi_cartsz[Y] > 1) {
             le_displace_and_interpolate_parallel(lb, le);
@@ -306,14 +316,13 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
  *
  *****************************************************************************/
 
-__global__ static void le_reproject(lb_t *lb, lees_edw_t *le) {
-    int ic, jc, kc, index;
-    int nplane, plane, side;
+__global__ static void le_reproject(lb_t *lb, lees_edw_t *le, kernel_ctxt_t * ktxt) {
+    int ic, jc, kc, side, index;
+    int nplane, plane;
     int ia, ib;
     int nlocal[3];
     int n, ndist;
     int8_t cx = 0;
-
     double rho, ds[3][3], udotc, sdotq;
     double g[3], du[3];
     double fnew;
@@ -322,80 +331,82 @@ __global__ static void le_reproject(lb_t *lb, lees_edw_t *le) {
 
     assert(lb);
     assert(le);
+    assert(ktxt);
 
     lb_ndist(lb, &ndist);
     nplane = lees_edw_nplane_local(le);
     physics_ref(&phys);
-
-    t = 1.0 * physics_control_timestep(phys);
     lees_edw_nlocal(le, nlocal);
+    t = 1.0 * physics_control_timestep(phys);
     
-    jc = blockIdx.y * blockDim.y + threadIdx.y + 1;
-    kc = blockIdx.z * blockDim.z + threadIdx.z + 1;
-    
-
-    if (jc <= nlocal[Y] && kc <= nlocal[Z]) {
-        for (plane = 0; plane < nplane; plane++) {
-            for (side = 0; side < 2; side++) {
+    int kindex;
+    int kiter;
+    kiter = kernel_iterations(ktxt);
 
-                du[X] = 0.0;
-                du[Y] = 0.0;
-                du[Z] = 0.0;
+    for_simt_parallel(kindex, kiter, 1) {
+        side = kernel_coords_ic(ktxt, kindex);
+        jc = kernel_coords_jc(ktxt, kindex);
+        kc = kernel_coords_kc(ktxt, kindex);
 
-                if (side == 0) {
-                    /* Start with plane below Lees-Edwards BC */
-                    lees_edw_plane_uy_now(le, t, &du[Y]);
-                    du[Y] *= -1.0;
-                    ic = lees_edw_plane_location(le, plane);
-                    cx = +1;
-                }
-                else {
-                    /* Finally, deal with plane above LEBC */
-                    lees_edw_plane_uy_now(le, t, &du[Y]);
-                    ic = lees_edw_plane_location(le, plane) + 1;
-                    cx = -1;
-                }
+        if (jc <= nlocal[Y] && kc <= nlocal[Z] && side < 2 * nplane) {
+            du[X] = 0.0;
+            du[Y] = 0.0;
+            du[Z] = 0.0;
+
+            plane = side / 2;
+            if (side % 2 == 0) {
+                /* Start with plane below Lees-Edwards BC */
+                lees_edw_plane_uy_now(le, t, &du[Y]);
+                du[Y] *= -1.0;
+                ic = lees_edw_plane_location(le, plane);
+                cx = +1;
+            }
+            else {
+                /* Finally, deal with plane above LEBC */
+                lees_edw_plane_uy_now(le, t, &du[Y]);
+                ic = lees_edw_plane_location(le, plane) + 1;
+                cx = -1;
+            }
 
-                index = lees_edw_index(le, ic, jc, kc);
+            index = lees_edw_index(le, ic, jc, kc);
 
-                for (n = 0; n < ndist; n++) {
+            for (n = 0; n < ndist; n++) {
 
-                    /* Compute 0th and 1st moments */
-                    lb_dist_enum_t ndn = (lb_dist_enum_t)n;
-                    lb_0th_moment(lb, index, ndn, &rho);
-                    lb_1st_moment(lb, index, ndn, g);
+                /* Compute 0th and 1st moments */
+                lb_dist_enum_t ndn = (lb_dist_enum_t)n;
+                lb_0th_moment(lb, index, ndn, &rho);
+                lb_1st_moment(lb, index, ndn, g);
 
-                    for (ia = 0; ia < 3; ia++) {
-                        for (ib = 0; ib < 3; ib++) {
-                            ds[ia][ib] = (g[ia] * du[ib] + du[ia] * g[ib] + rho * du[ia] * du[ib]);
-                        }
+                for (ia = 0; ia < 3; ia++) {
+                    for (ib = 0; ib < 3; ib++) {
+                        ds[ia][ib] = (g[ia] * du[ib] + du[ia] * g[ib] + rho * du[ia] * du[ib]);
                     }
+                }
 
-                    /* Now update the distribution */
-                    for (int p = 1; p < lb->model.nvel; p++) {
+                /* Now update the distribution */
+                for (int p = 1; p < lb->model.nvel; p++) {
 
-                        double cs2 = lb->model.cs2;
-                        double rcs2 = 1.0 / cs2;
-                        if (lb->model.cv[p][X] != cx)
-                            continue;
+                    double cs2 = lb->model.cs2;
+                    double rcs2 = 1.0 / cs2;
+                    if (lb->model.cv[p][X] != cx)
+                        continue;
 
-                        udotc = du[Y] * lb->model.cv[p][Y];
-                        sdotq = 0.0;
+                    udotc = du[Y] * lb->model.cv[p][Y];
+                    sdotq = 0.0;
 
-                        for (ia = 0; ia < 3; ia++) {
-                            for (ib = 0; ib < 3; ib++) {
-                                double dab = cs2 * (ia == ib);
-                                double q = (lb->model.cv[p][ia] * lb->model.cv[p][ib] - dab);
-                                sdotq += ds[ia][ib] * q;
-                            }
+                    for (ia = 0; ia < 3; ia++) {
+                        for (ib = 0; ib < 3; ib++) {
+                            double dab = cs2 * (ia == ib);
+                            double q = (lb->model.cv[p][ia] * lb->model.cv[p][ib] - dab);
+                            sdotq += ds[ia][ib] * q;
                         }
+                    }
 
-                        /* Project all this back to the distribution. */
+                    /* Project all this back to the distribution. */
 
-                        lb_f(lb, index, p, n, &fnew);
-                        fnew += lb->model.wv[p] * (rho * udotc * rcs2 + 0.5 * sdotq * rcs2 * rcs2);
-                        lb_f_set(lb, index, p, n, fnew);
-                    }
+                    lb_f(lb, index, p, n, &fnew);
+                    fnew += lb->model.wv[p] * (rho * udotc * rcs2 + 0.5 * sdotq * rcs2 * rcs2);
+                    lb_f_set(lb, index, p, n, fnew);
                 }
             }
         }
@@ -415,28 +426,20 @@ __global__ static void le_reproject(lb_t *lb, lees_edw_t *le) {
  *****************************************************************************/
 
 void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
-    int ic, jc, kc;
-    int index0;// index1;
+
     int nlocal[3];
-    int nplane, plane; // n
-    int jdy; // j1, j2;
+    int nplane;
     int ndist;
     int nprop, negprop;
-    int nhalo;
-    int ndata, displacement;
-    double dy, fr;
+    int displacement;
     double t;
-    double ltot[3];
-    // double *recv_buff;
     physics_t *phys = NULL;
     lees_edw_t * le_target;
 
     assert(lb);
     assert(le);
 
-    lees_edw_ltot(le, ltot);
     lees_edw_nlocal(le, nlocal);
-    lees_edw_nhalo(le, &nhalo);
     nplane = lees_edw_nplane_local(le);
     physics_ref(&phys);
     lb_ndist(lb, &ndist);
@@ -449,7 +452,6 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
      * determined by the size of the local domain, and the number
      * of plane-crossing distributions. */
 
-
     /* Allocate a buffer large enough for all cvp[][X] = +1 */
     nprop = 0;
     negprop = 0;
@@ -458,13 +460,6 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
         if (lb->model.cv[p][X] == -1) negprop += 1;
     }
     displacement = ndist * nprop * nlocal[Y] * nlocal[Z];
-    // ndata = 2 * nplane * displacement;
-    // tdpMalloc((void**)&recv_buff, ndata * sizeof(double));
-
-    // if (lb->recv_buff == NULL) {
-    //     pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
-    // }
-    // assert(lb->recv_buff);
 
     // record the displacement of propgation
     int *positive = (int *)malloc(sizeof(int) * nprop);
@@ -500,15 +495,18 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
 
     tdpLaunchKernel(interpolation, nblk, ntpb, 0, 0, lb->target, le_target, 
         d_positive, d_negative, nprop, negprop, displacement, t, ctxt->target);
-    tdpDeviceSynchronize();
+    tdpAssert(tdpPeekAtLastError());
+    tdpAssert(tdpDeviceSynchronize());
 
     tdpLaunchKernel(copy_back, nblk, ntpb, 0, 0, lb->target, le_target, 
         d_positive, d_negative, nprop, negprop, displacement, ctxt->target);
-    tdpDeviceSynchronize();
+    tdpAssert(tdpPeekAtLastError());
+    tdpAssert(tdpDeviceSynchronize());
+
+    kernel_ctxt_free(ctxt);
 
     free(positive);
     free(negative);
-    // tdpFree(recv_buff);
     tdpFree(d_positive);
     tdpFree(d_negative);
 

From 23178b54f8d10b38e2102786631ce49c5b219fb3 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Thu, 20 Jul 2023 00:45:34 +0100
Subject: [PATCH 022/133] small modifications, can ignore this commit, the
 result doesn't change

---
 src/model.c    |  5 ++---
 src/model_le.c | 16 +++++++++++-----
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/src/model.c b/src/model.c
index 6059704c9..031806321 100644
--- a/src/model.c
+++ b/src/model.c
@@ -186,12 +186,11 @@ int lb_data_create(pe_t * pe, cs_t * cs, const lb_data_options_t * options,
         obj->recv_buff = (double *)malloc(sizeof(double) * ndata);
 
         int ndevice;
-        printf("ndevice = %d \n\n", ndevice);
         tdpGetDeviceCount(&ndevice);
         if (ndevice > 0) {
             double *tmp;
-            tdpMalloc((void**)&tmp, ndata * sizeof(double));
-            tdpMemcpy(&(obj->target->recv_buff), &tmp, sizeof(double*), tdpMemcpyHostToDevice);
+            tdpAssert(tdpMalloc((void**)&tmp, ndata * sizeof(double)));
+            tdpAssert(tdpMemcpy(&(obj->target->recv_buff), &tmp, sizeof(double*), tdpMemcpyHostToDevice));
         }
     }
     
diff --git a/src/model_le.c b/src/model_le.c
index ef9317f76..6a959fb87 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -257,12 +257,18 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
 
         /* Everything must be done on host at the moment (slowly) ... */
         /* ... and copy back at the end */
-        copyModelToDevice(&lb->model, &lb->target->model);
-       
         lees_edw_t * le_target;
-        lees_edw_target(le, &le_target);
-
-        copy_buffer_duy_to_device(le_target, le->buffer_duy, le->param->nxbuffer);
+        int ndevice;
+        tdpGetDeviceCount(&ndevice);
+        
+        if (ndevice > 0) {
+            copyModelToDevice(&lb->model, &lb->target->model);
+            
+            lees_edw_target(le, &le_target);
+            copy_buffer_duy_to_device(le_target, le->buffer_duy, le->param->nxbuffer);
+        }
+       
+        
 
         int nlocal[3], nplane;
         lees_edw_nlocal(le, nlocal);

From 4899371d6d578b29cc12737410532dfb3fe8c363 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Thu, 20 Jul 2023 01:21:12 +0100
Subject: [PATCH 023/133] move the calculation of displacement and nprop into
 kernel, and calculate the index from cv[][] directly, eliminate positive and
 negative arrays

---
 src/leesedwards.h |   1 +
 src/model_le.c    | 120 ++++++++++++++++++++--------------------------
 2 files changed, 54 insertions(+), 67 deletions(-)

diff --git a/src/leesedwards.h b/src/leesedwards.h
index 1697ed9a4..d7229a81e 100644
--- a/src/leesedwards.h
+++ b/src/leesedwards.h
@@ -62,6 +62,7 @@ struct lees_edw_param_s {
     double omega;    /* u_y = u_le cos (omega t) for oscillatory */
     double time0;    /* time offset */
 };
+
 __host__ int lees_edw_create(pe_t * pe, cs_t * coords,
 			     const lees_edw_options_t * opts,
 			     lees_edw_t ** le);
diff --git a/src/model_le.c b/src/model_le.c
index 6a959fb87..cdac5c434 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -36,10 +36,8 @@
 
 __global__ static void le_reproject(lb_t *lb, lees_edw_t *le, kernel_ctxt_t * ktxt);
 static void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le);
-__global__ static void interpolation(lb_t *lb, lees_edw_t *le, int *positive,  
-    int *negative, int nprop, int negprop, int displacement, double t, kernel_ctxt_t * ktxt);
-__global__ static void copy_back(lb_t *lb, lees_edw_t *le, int *positive,  
-    int *negative, int nprop, int negprop, int displacement, kernel_ctxt_t * ktxt);
+__global__ static void interpolation(lb_t *lb, lees_edw_t *le, double t, kernel_ctxt_t * ktxt);
+__global__ static void copy_back(lb_t *lb, lees_edw_t *le, kernel_ctxt_t * ktxt);
 static int le_displace_and_interpolate_parallel(lb_t *lb, lees_edw_t *le);
 
 void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
@@ -98,11 +96,11 @@ cudaError_t copy_buffer_duy_to_device(lees_edw_s* d_lees_edw, int* h_buffer_duy,
     return cudaSuccess;
 }
 
-__global__ void interpolation(lb_t *lb, lees_edw_t *le, int *positive,  
-    int *negative, int nprop, int negprop, int displacement, double t, kernel_ctxt_t * ktxt) {
+__global__ void interpolation(lb_t *lb, lees_edw_t *le, double t, kernel_ctxt_t * ktxt) {
     int plane, ic, jc, kc;
     int nhalo, ndist, nplane;
     int jdy, j1, j2, index0, index1;
+    int nprop, displacement;
     double dy, fr;
     double ltot[3];
     int nlocal[3];
@@ -113,6 +111,13 @@ __global__ void interpolation(lb_t *lb, lees_edw_t *le, int *positive,
     lees_edw_ltot(le, ltot);
     nplane = lees_edw_nplane_local(le);
 
+    nprop = 0;
+    for (int p = 1; p < lb->model.nvel; p++) {
+        if (lb->model.cv[p][X] == +1) nprop += 1;
+    }
+    displacement = ndist * nprop * nlocal[Y] * nlocal[Z];
+
+
     int kindex;
     int kiter;
     assert(ktxt);
@@ -140,14 +145,16 @@ __global__ void interpolation(lb_t *lb, lees_edw_t *le, int *positive,
             /* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
 
             for (int n = 0; n < ndist; n++) {
-                for (int i = 0; i < nprop; i++) {
-                    int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
-                    int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, positive[i]);
-                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
-                    lb->recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+                for (int p = 1, i = 0; p < lb->model.nvel; p++) {
+                    if (lb->model.cv[p][X] == +1) {
+                        int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
+                        int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p);
+                        int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
+                        lb->recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+                        i++;
+                    }
                 }
             }
-
             /* OTHER DIRECTION */
             ic = lees_edw_plane_location(le, plane) + 1;
             lees_edw_buffer_displacement(le, nhalo, t, &dy);
@@ -164,29 +171,39 @@ __global__ void interpolation(lb_t *lb, lees_edw_t *le, int *positive,
             /* xdisp_fwd_cv[0] identifies cv[p][X] = -1 */
 
             for (int n = 0; n < ndist; n++) {
-                for (int i = 0; i < negprop; i++) {
-                    int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
-                    int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, negative[i]);
-                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i + (2 * plane + 1) * displacement;
-                    lb->recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+                for (int p = 1, i = 0; p < lb->model.nvel; p++) {
+                    if (lb->model.cv[p][X] == -1) {
+                        int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
+                        int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p);
+                        int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + (2 * plane + 1) * displacement;
+                        lb->recv_buff[index] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
+                        i++;
+                    }
                 }
             }
+            
         }
     }
 }
 
-__global__ void copy_back(lb_t *lb, lees_edw_t *le, int *positive,  
-    int *negative, int nprop, int negprop, int displacement, kernel_ctxt_t * ktxt) {
+__global__ void copy_back(lb_t *lb, lees_edw_t *le, kernel_ctxt_t * ktxt) {
     int plane, ic, jc, kc;
     int nhalo, ndist, nplane;
     int nlocal[3];
     int index0;
+    int nprop, displacement;
 
     lb_ndist(lb, &ndist);
     lees_edw_nlocal(le, nlocal);
     lees_edw_nhalo(le, &nhalo);
     nplane = lees_edw_nplane_local(le);
 
+    nprop = 0;
+    for (int p = 1; p < lb->model.nvel; p++) {
+        if (lb->model.cv[p][X] == +1) nprop += 1;
+    }
+    displacement = ndist * nprop * nlocal[Y] * nlocal[Z];
+
     int kindex;
     int kiter;
     assert(ktxt);
@@ -202,10 +219,13 @@ __global__ void copy_back(lb_t *lb, lees_edw_t *le, int *positive,
             index0 = lees_edw_index(le, ic, jc, kc);
 
             for (int n = 0; n < ndist; n++) {
-                for (int i = 0; i < nprop; i++) {
-                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
-                    int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, positive[i]);
-                    lb->f[la] = lb->recv_buff[index];
+                for (int p = 1, i = 0; p < lb->model.nvel; p++) {
+                    if (lb->model.cv[p][X] == +1) {
+                        int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + 2 * plane * displacement;
+                        int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
+                        lb->f[la] = lb->recv_buff[index];
+                        i++;
+                    }
                 }
             }
 
@@ -215,10 +235,13 @@ __global__ void copy_back(lb_t *lb, lees_edw_t *le, int *positive,
             index0 = lees_edw_index(le, ic, jc, kc);
 
             for (int n = 0; n < ndist; n++) {
-                for (int i = 0; i < negprop; i++) {
-                    int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*negprop + n*negprop + i + (2 * plane + 1) * displacement;
-                    int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, negative[i]);
-                    lb->f[la] = lb->recv_buff[index];
+                for (int p = 1, i = 0; p < lb->model.nvel; p++) {
+                    if (lb->model.cv[p][X] == -1) {
+                        int index = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + i + (2 * plane + 1) * displacement;
+                        int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
+                        lb->f[la] = lb->recv_buff[index];
+                        i++;
+                    }
                 }
             }
         }
@@ -436,7 +459,7 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     int nlocal[3];
     int nplane;
     int ndist;
-    int nprop, negprop;
+    int nprop;
     int displacement;
     double t;
     physics_t *phys = NULL;
@@ -458,36 +481,6 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
      * determined by the size of the local domain, and the number
      * of plane-crossing distributions. */
 
-    /* Allocate a buffer large enough for all cvp[][X] = +1 */
-    nprop = 0;
-    negprop = 0;
-    for (int p = 1; p < lb->model.nvel; p++) {
-        if (lb->model.cv[p][X] == +1) nprop += 1;
-        if (lb->model.cv[p][X] == -1) negprop += 1;
-    }
-    displacement = ndist * nprop * nlocal[Y] * nlocal[Z];
-
-    // record the displacement of propgation
-    int *positive = (int *)malloc(sizeof(int) * nprop);
-    int *negative = (int *)malloc(sizeof(int) * negprop);
-    for (int p = 1, i = 0, j = 0; p < lb->model.nvel; p++) {
-        if (lb->model.cv[p][X] == +1) {
-            positive[i] = p;
-            i++;
-        }
-        if (lb->model.cv[p][X] == -1) {
-            negative[j] = p;
-            j++;
-        }
-    }
-
-    // copy the displacement array to the device
-    int *d_positive, *d_negative;
-    tdpMalloc((void**)&d_positive, sizeof(int) * nprop);
-    tdpMalloc((void**)&d_negative, sizeof(int) * negprop);
-    tdpMemcpy(d_positive, positive, sizeof(int) * nprop, tdpMemcpyHostToDevice);
-    tdpMemcpy(d_negative, negative, sizeof(int) * negprop, tdpMemcpyHostToDevice);
-
     //tdp
     dim3 nblk, ntpb;
     kernel_info_t limits;
@@ -499,23 +492,16 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     kernel_ctxt_create(le->cs, NSIMDVL, limits, &ctxt);
     kernel_ctxt_launch_param(ctxt, &nblk, &ntpb);
 
-    tdpLaunchKernel(interpolation, nblk, ntpb, 0, 0, lb->target, le_target, 
-        d_positive, d_negative, nprop, negprop, displacement, t, ctxt->target);
+    tdpLaunchKernel(interpolation, nblk, ntpb, 0, 0, lb->target, le_target, t, ctxt->target);
     tdpAssert(tdpPeekAtLastError());
     tdpAssert(tdpDeviceSynchronize());
 
-    tdpLaunchKernel(copy_back, nblk, ntpb, 0, 0, lb->target, le_target, 
-        d_positive, d_negative, nprop, negprop, displacement, ctxt->target);
+    tdpLaunchKernel(copy_back, nblk, ntpb, 0, 0, lb->target, le_target, ctxt->target);
     tdpAssert(tdpPeekAtLastError());
     tdpAssert(tdpDeviceSynchronize());
 
     kernel_ctxt_free(ctxt);
 
-    free(positive);
-    free(negative);
-    tdpFree(d_positive);
-    tdpFree(d_negative);
-
     return;
 }
 

From 271821b05bf38cc5bc4144cfecd7fc48ff31d84d Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Thu, 20 Jul 2023 16:25:28 +0100
Subject: [PATCH 024/133] change to merge

---
 src/leesedwards.c | 82 +++++++++++++++++++++++------------------------
 src/leesedwards.h | 41 ------------------------
 2 files changed, 41 insertions(+), 82 deletions(-)

diff --git a/src/leesedwards.c b/src/leesedwards.c
index 5b6a7055b..2af2ccc4a 100644
--- a/src/leesedwards.c
+++ b/src/leesedwards.c
@@ -24,47 +24,47 @@
 #include "leesedwards.h"
 #include "util.h"
 
-// typedef struct lees_edw_param_s lees_edw_param_t;
-
-// struct lees_edw_s {
-//     pe_t *pe;        /* Parallel environment */
-//     cs_t *cs;        /* Coordinate system */
-//     physics_t *phys; /* Constants, time step */
-
-//     lees_edw_param_t *param; /* Parameters */
-
-//     int nref;            /* Reference count */
-//     int *icbuff_to_real; /* look up table */
-//     int *icreal_to_buff; /* look up table */
-//     int *buffer_duy;     /* look up table +/- uy as function of ib */
-
-//     MPI_Comm le_comm;       /* 1-d communicator */
-//     MPI_Comm le_plane_comm; /* 2-d communicator */
-
-//     lees_edw_t *target; /* Device memory */
-// };
-
-// struct lees_edw_param_s {
-//     /* Local parameters */
-//     int nplanelocal; /* Number of planes local domain */
-//     int nxbuffer;    /* Size of buffer region in x */
-//     int index_real_nbuffer;
-//     /* For cs */
-//     int nhalo;
-//     int str[3];
-//     int nlocal[3];
-//     /* Global parameters */
-//     int nplanetotal; /* Total number of planes */
-//     int type;        /* Shear type */
-//     int period;      /* for oscillatory */
-//     int nt0;         /* time0 (input as integer) */
-//     int nsites;      /* Number of sites incl buffer planes */
-//     double uy;       /* u[Y] for all planes */
-//     double dx_min;   /* Position first plane */
-//     double dx_sep;   /* Plane separation */
-//     double omega;    /* u_y = u_le cos (omega t) for oscillatory */
-//     double time0;    /* time offset */
-// };
+typedef struct lees_edw_param_s lees_edw_param_t;
+
+struct lees_edw_s {
+    pe_t *pe;        /* Parallel environment */
+    cs_t *cs;        /* Coordinate system */
+    physics_t *phys; /* Constants, time step */
+
+    lees_edw_param_t *param; /* Parameters */
+
+    int nref;            /* Reference count */
+    int *icbuff_to_real; /* look up table */
+    int *icreal_to_buff; /* look up table */
+    int *buffer_duy;     /* look up table +/- uy as function of ib */
+
+    MPI_Comm le_comm;       /* 1-d communicator */
+    MPI_Comm le_plane_comm; /* 2-d communicator */
+
+    lees_edw_t *target; /* Device memory */
+};
+
+struct lees_edw_param_s {
+    /* Local parameters */
+    int nplanelocal; /* Number of planes local domain */
+    int nxbuffer;    /* Size of buffer region in x */
+    int index_real_nbuffer;
+    /* For cs */
+    int nhalo;
+    int str[3];
+    int nlocal[3];
+    /* Global parameters */
+    int nplanetotal; /* Total number of planes */
+    int type;        /* Shear type */
+    int period;      /* for oscillatory */
+    int nt0;         /* time0 (input as integer) */
+    int nsites;      /* Number of sites incl buffer planes */
+    double uy;       /* u[Y] for all planes */
+    double dx_min;   /* Position first plane */
+    double dx_sep;   /* Plane separation */
+    double omega;    /* u_y = u_le cos (omega t) for oscillatory */
+    double time0;    /* time offset */
+};
 
 static int lees_edw_init(lees_edw_t *le, const lees_edw_options_t *info);
 static int lees_edw_checks(lees_edw_t *le);
diff --git a/src/leesedwards.h b/src/leesedwards.h
index d7229a81e..03812738f 100644
--- a/src/leesedwards.h
+++ b/src/leesedwards.h
@@ -21,47 +21,6 @@
 #include "lees_edwards_options.h"
 
 typedef struct lees_edw_s lees_edw_t;
-typedef struct lees_edw_param_s lees_edw_param_t;
-
-struct lees_edw_s {
-    pe_t *pe;        /* Parallel environment */
-    cs_t *cs;        /* Coordinate system */
-    physics_t *phys; /* Constants, time step */
-
-    lees_edw_param_t *param; /* Parameters */
-
-    int nref;            /* Reference count */
-    int *icbuff_to_real; /* look up table */
-    int *icreal_to_buff; /* look up table */
-    int *buffer_duy;     /* look up table +/- uy as function of ib */
-
-    MPI_Comm le_comm;       /* 1-d communicator */
-    MPI_Comm le_plane_comm; /* 2-d communicator */
-
-    lees_edw_t *target; /* Device memory */
-};
-
-struct lees_edw_param_s {
-    /* Local parameters */
-    int nplanelocal; /* Number of planes local domain */
-    int nxbuffer;    /* Size of buffer region in x */
-    int index_real_nbuffer;
-    /* For cs */
-    int nhalo;
-    int str[3];
-    int nlocal[3];
-    /* Global parameters */
-    int nplanetotal; /* Total number of planes */
-    int type;        /* Shear type */
-    int period;      /* for oscillatory */
-    int nt0;         /* time0 (input as integer) */
-    int nsites;      /* Number of sites incl buffer planes */
-    double uy;       /* u[Y] for all planes */
-    double dx_min;   /* Position first plane */
-    double dx_sep;   /* Plane separation */
-    double omega;    /* u_y = u_le cos (omega t) for oscillatory */
-    double time0;    /* time offset */
-};
 
 __host__ int lees_edw_create(pe_t * pe, cs_t * coords,
 			     const lees_edw_options_t * opts,

From 0de830df07fe22f167f0c414e2795ae5dd984679 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Fri, 21 Jul 2023 23:36:54 +0100
Subject: [PATCH 025/133] adapt the code to the new version, which removed
 buffer_duy from the le struct

---
 src/leesedwards.c | 15 ++++++++++++
 src/leesedwards.h |  1 +
 src/model_le.c    | 61 +++++++++++++++++++++++++----------------------
 3 files changed, 48 insertions(+), 29 deletions(-)

diff --git a/src/leesedwards.c b/src/leesedwards.c
index 81d83948f..2b7429c77 100644
--- a/src/leesedwards.c
+++ b/src/leesedwards.c
@@ -205,6 +205,21 @@ __host__ int lees_edw_target(lees_edw_t *le, lees_edw_t **target) {
     return 0;
 }
 
+/*****************************************************************************
+ *
+ \*  lees_edw_cs
+ *
+ *****************************************************************************/
+
+__host__ int lees_edw_cs(lees_edw_t * le, cs_t ** cs) {
+    assert(le);
+    assert(cs);
+
+    *cs = le->cs;
+
+    return 0;
+}
+
 /*****************************************************************************
  *
  *  lees_edw_init
diff --git a/src/leesedwards.h b/src/leesedwards.h
index 04853ebae..82a71864c 100644
--- a/src/leesedwards.h
+++ b/src/leesedwards.h
@@ -30,6 +30,7 @@ __host__ int lees_edw_free(lees_edw_t * le);
 __host__ int lees_edw_retain(lees_edw_t * le);
 __host__ int lees_edw_commit(lees_edw_t * le);
 __host__ int lees_edw_target(lees_edw_t * le, lees_edw_t ** target);
+__host__ int lees_edw_cs(lees_edw_t * le, cs_t ** cs);
 
 __host__ int lees_edw_info(lees_edw_t * le);
 __host__ int lees_edw_comm(lees_edw_t * le, MPI_Comm * comm);
diff --git a/src/model_le.c b/src/model_le.c
index cdac5c434..adb1f5f97 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -67,34 +67,34 @@ void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
     tdpMemcpy(&(d_model->cs2), &(h_model->cs2), sizeof(double), tdpMemcpyHostToDevice);
 }
 
-cudaError_t copy_buffer_duy_to_device(lees_edw_s* d_lees_edw, int* h_buffer_duy, size_t nxbuffer) {
-    // First, allocate memory on the device for the buffer_duy array
-    int* d_buffer_duy;
-    cudaError_t err = tdpMalloc((void**) &d_buffer_duy, nxbuffer * sizeof(int));
-
-    if (err != cudaSuccess) {
-        fprintf(stderr, "Failed to allocate device memory for buffer_duy (error code %s)!\n", cudaGetErrorString(err));
-        return err;
-    }
-
-    // Then, copy the data from the host array to the newly allocated device array
-    err = tdpMemcpy(d_buffer_duy, h_buffer_duy, nxbuffer * sizeof(int), tdpMemcpyHostToDevice);
+// cudaError_t copy_buffer_duy_to_device(lees_edw_s* d_lees_edw, int* h_buffer_duy, size_t nxbuffer) {
+//     // First, allocate memory on the device for the buffer_duy array
+//     int* d_buffer_duy;
+//     cudaError_t err = tdpMalloc((void**) &d_buffer_duy, nxbuffer * sizeof(int));
+
+//     if (err != cudaSuccess) {
+//         fprintf(stderr, "Failed to allocate device memory for buffer_duy (error code %s)!\n", cudaGetErrorString(err));
+//         return err;
+//     }
+
+//     // Then, copy the data from the host array to the newly allocated device array
+//     err = tdpMemcpy(d_buffer_duy, h_buffer_duy, nxbuffer * sizeof(int), tdpMemcpyHostToDevice);
     
-    if (err != cudaSuccess) {
-        fprintf(stderr, "Failed to copy buffer_duy from host to device (error code %s)!\n", cudaGetErrorString(err));
-        return err;
-    }
+//     if (err != cudaSuccess) {
+//         fprintf(stderr, "Failed to copy buffer_duy from host to device (error code %s)!\n", cudaGetErrorString(err));
+//         return err;
+//     }
 
-    // Finally, update the pointer in the device structure to point to the new device array
-    err = tdpMemcpy(&(d_lees_edw->buffer_duy), &d_buffer_duy, sizeof(int*), tdpMemcpyHostToDevice);
+//     // Finally, update the pointer in the device structure to point to the new device array
+//     err = tdpMemcpy(&(d_lees_edw->buffer_duy), &d_buffer_duy, sizeof(int*), tdpMemcpyHostToDevice);
 
-    if (err != cudaSuccess) {
-        fprintf(stderr, "Failed to copy buffer_duy pointer to device structure (error code %s)!\n", cudaGetErrorString(err));
-        return err;
-    }
+//     if (err != cudaSuccess) {
+//         fprintf(stderr, "Failed to copy buffer_duy pointer to device structure (error code %s)!\n", cudaGetErrorString(err));
+//         return err;
+//     }
 
-    return cudaSuccess;
-}
+//     return cudaSuccess;
+// }
 
 __global__ void interpolation(lb_t *lb, lees_edw_t *le, double t, kernel_ctxt_t * ktxt) {
     int plane, ic, jc, kc;
@@ -281,14 +281,17 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
         /* Everything must be done on host at the moment (slowly) ... */
         /* ... and copy back at the end */
         lees_edw_t * le_target;
+        cs_t *cs;
         int ndevice;
         tdpGetDeviceCount(&ndevice);
+
         
         if (ndevice > 0) {
             copyModelToDevice(&lb->model, &lb->target->model);
             
             lees_edw_target(le, &le_target);
-            copy_buffer_duy_to_device(le_target, le->buffer_duy, le->param->nxbuffer);
+            lees_edw_cs(le_target, &cs);
+           // copy_buffer_duy_to_device(le_target, le->buffer_duy, le->param->nxbuffer);
         }
        
         
@@ -305,7 +308,7 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
         limits.jmin = 1; limits.jmax = nlocal[Y];
         limits.kmin = 1; limits.kmax = nlocal[Z];
 
-        kernel_ctxt_create(le->cs, NSIMDVL, limits, &ctxt);
+        kernel_ctxt_create(cs, NSIMDVL, limits, &ctxt);
         kernel_ctxt_launch_param(ctxt, &nblk, &ntpb);
 
         tdpLaunchKernel(le_reproject, nblk, ntpb, 0, 0, lb->target, le_target, ctxt->target);
@@ -459,11 +462,10 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     int nlocal[3];
     int nplane;
     int ndist;
-    int nprop;
-    int displacement;
     double t;
     physics_t *phys = NULL;
     lees_edw_t * le_target;
+    cs_t * cs;
 
     assert(lb);
     assert(le);
@@ -473,6 +475,7 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     physics_ref(&phys);
     lb_ndist(lb, &ndist);
     lees_edw_target(le, &le_target);
+    lees_edw_cs(le, &cs);
 
     t = 1.0 * physics_control_timestep(phys);
 
@@ -489,7 +492,7 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     limits.jmin = 1; limits.jmax = nlocal[Y];
     limits.kmin = 1; limits.kmax = nlocal[Z];
 
-    kernel_ctxt_create(le->cs, NSIMDVL, limits, &ctxt);
+    kernel_ctxt_create(cs, NSIMDVL, limits, &ctxt);
     kernel_ctxt_launch_param(ctxt, &nblk, &ntpb);
 
     tdpLaunchKernel(interpolation, nblk, ntpb, 0, 0, lb->target, le_target, t, ctxt->target);

From 2588adf2e07cb86e7d9565695d70da0c0db1f1c9 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Sat, 5 Aug 2023 11:20:51 +0100
Subject: [PATCH 026/133] previous version had a small bug, fixed it

---
 src/model_le.c | 53 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 34 insertions(+), 19 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index adb1f5f97..8f5b411ef 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -33,6 +33,7 @@
 
 #include "leesedwards.h"
 #include "target.h"
+#include "nvToolsExt.h"
 
 __global__ static void le_reproject(lb_t *lb, lees_edw_t *le, kernel_ctxt_t * ktxt);
 static void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le);
@@ -117,7 +118,6 @@ __global__ void interpolation(lb_t *lb, lees_edw_t *le, double t, kernel_ctxt_t
     }
     displacement = ndist * nprop * nlocal[Y] * nlocal[Z];
 
-
     int kindex;
     int kiter;
     assert(ktxt);
@@ -268,6 +268,7 @@ __global__ void copy_back(lb_t *lb, lees_edw_t *le, kernel_ctxt_t * ktxt) {
  *****************************************************************************/
 
 __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
+    nvtxRangeId_t id = nvtxRangeStartA("MY ASCII LABEL");
     int mpi_cartsz[3];
 
     assert(lb);
@@ -276,26 +277,20 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
     lees_edw_cartsz(le, mpi_cartsz);
 
     if (lees_edw_nplane_local(le) > 0) {
+        int ndevice;
         TIMER_start(TIMER_LE);
-
-        /* Everything must be done on host at the moment (slowly) ... */
-        /* ... and copy back at the end */
         lees_edw_t * le_target;
         cs_t *cs;
-        int ndevice;
+
         tdpGetDeviceCount(&ndevice);
+        lees_edw_cs(le, &cs);
 
-        
         if (ndevice > 0) {
             copyModelToDevice(&lb->model, &lb->target->model);
-            
             lees_edw_target(le, &le_target);
-            lees_edw_cs(le_target, &cs);
            // copy_buffer_duy_to_device(le_target, le->buffer_duy, le->param->nxbuffer);
         }
        
-        
-
         int nlocal[3], nplane;
         lees_edw_nlocal(le, nlocal);
         nplane = lees_edw_nplane_local(le);
@@ -311,7 +306,12 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
         kernel_ctxt_create(cs, NSIMDVL, limits, &ctxt);
         kernel_ctxt_launch_param(ctxt, &nblk, &ntpb);
 
-        tdpLaunchKernel(le_reproject, nblk, ntpb, 0, 0, lb->target, le_target, ctxt->target);
+        if (ndevice > 0) {
+            tdpLaunchKernel(le_reproject, nblk, ntpb, 0, 0, lb->target, le_target, ctxt->target);
+        }
+        else {
+            tdpLaunchKernel(le_reproject, nblk, ntpb, 0, 0, lb, le, ctxt);
+        }
         tdpAssert(tdpPeekAtLastError());
         tdpAssert(tdpDeviceSynchronize());
 
@@ -321,10 +321,12 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
         else {
             le_displace_and_interpolate(lb, le);
         }
-        
+        kernel_ctxt_free(ctxt);
+
         TIMER_stop(TIMER_LE);
     }
-
+    nvtxRangeEnd(id);
+    
     return 0;
 }
 
@@ -462,6 +464,7 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     int nlocal[3];
     int nplane;
     int ndist;
+    int ndevice;
     double t;
     physics_t *phys = NULL;
     lees_edw_t * le_target;
@@ -476,6 +479,7 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     lb_ndist(lb, &ndist);
     lees_edw_target(le, &le_target);
     lees_edw_cs(le, &cs);
+    tdpGetDeviceCount(&ndevice);
 
     t = 1.0 * physics_control_timestep(phys);
 
@@ -495,14 +499,25 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
     kernel_ctxt_create(cs, NSIMDVL, limits, &ctxt);
     kernel_ctxt_launch_param(ctxt, &nblk, &ntpb);
 
-    tdpLaunchKernel(interpolation, nblk, ntpb, 0, 0, lb->target, le_target, t, ctxt->target);
-    tdpAssert(tdpPeekAtLastError());
-    tdpAssert(tdpDeviceSynchronize());
+    if (ndevice > 0) {
+        tdpLaunchKernel(interpolation, nblk, ntpb, 0, 0, lb->target, le_target, t, ctxt->target);
+        tdpAssert(tdpPeekAtLastError());
+        tdpAssert(tdpDeviceSynchronize());
 
-    tdpLaunchKernel(copy_back, nblk, ntpb, 0, 0, lb->target, le_target, ctxt->target);
-    tdpAssert(tdpPeekAtLastError());
-    tdpAssert(tdpDeviceSynchronize());
+        tdpLaunchKernel(copy_back, nblk, ntpb, 0, 0, lb->target, le_target, ctxt->target);
+        tdpAssert(tdpPeekAtLastError());
+        tdpAssert(tdpDeviceSynchronize());
+    } 
+    else {
+        tdpLaunchKernel(interpolation, nblk, ntpb, 0, 0, lb, le, t, ctxt);
+        tdpAssert(tdpPeekAtLastError());
+        tdpAssert(tdpDeviceSynchronize());
 
+        tdpLaunchKernel(copy_back, nblk, ntpb, 0, 0, lb, le, ctxt);
+        tdpAssert(tdpPeekAtLastError());
+        tdpAssert(tdpDeviceSynchronize());
+    }
+    
     kernel_ctxt_free(ctxt);
 
     return;

From 2be1cf321bbf777d6deaaa1d8c7a629d58811ee7 Mon Sep 17 00:00:00 2001
From: Anton <s2329216@.ed.ac.uk>
Date: Mon, 14 Aug 2023 23:49:42 +0100
Subject: [PATCH 027/133] delete some comments

---
 src/model_le.c | 40 ----------------------------------------
 1 file changed, 40 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index 8f5b411ef..8fd529a7e 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -33,7 +33,6 @@
 
 #include "leesedwards.h"
 #include "target.h"
-#include "nvToolsExt.h"
 
 __global__ static void le_reproject(lb_t *lb, lees_edw_t *le, kernel_ctxt_t * ktxt);
 static void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le);
@@ -68,35 +67,6 @@ void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
     tdpMemcpy(&(d_model->cs2), &(h_model->cs2), sizeof(double), tdpMemcpyHostToDevice);
 }
 
-// cudaError_t copy_buffer_duy_to_device(lees_edw_s* d_lees_edw, int* h_buffer_duy, size_t nxbuffer) {
-//     // First, allocate memory on the device for the buffer_duy array
-//     int* d_buffer_duy;
-//     cudaError_t err = tdpMalloc((void**) &d_buffer_duy, nxbuffer * sizeof(int));
-
-//     if (err != cudaSuccess) {
-//         fprintf(stderr, "Failed to allocate device memory for buffer_duy (error code %s)!\n", cudaGetErrorString(err));
-//         return err;
-//     }
-
-//     // Then, copy the data from the host array to the newly allocated device array
-//     err = tdpMemcpy(d_buffer_duy, h_buffer_duy, nxbuffer * sizeof(int), tdpMemcpyHostToDevice);
-    
-//     if (err != cudaSuccess) {
-//         fprintf(stderr, "Failed to copy buffer_duy from host to device (error code %s)!\n", cudaGetErrorString(err));
-//         return err;
-//     }
-
-//     // Finally, update the pointer in the device structure to point to the new device array
-//     err = tdpMemcpy(&(d_lees_edw->buffer_duy), &d_buffer_duy, sizeof(int*), tdpMemcpyHostToDevice);
-
-//     if (err != cudaSuccess) {
-//         fprintf(stderr, "Failed to copy buffer_duy pointer to device structure (error code %s)!\n", cudaGetErrorString(err));
-//         return err;
-//     }
-
-//     return cudaSuccess;
-// }
-
 __global__ void interpolation(lb_t *lb, lees_edw_t *le, double t, kernel_ctxt_t * ktxt) {
     int plane, ic, jc, kc;
     int nhalo, ndist, nplane;
@@ -169,7 +139,6 @@ __global__ void interpolation(lb_t *lb, lees_edw_t *le, double t, kernel_ctxt_t
             index1 = lees_edw_index(le, ic, j2, kc);
 
             /* xdisp_fwd_cv[0] identifies cv[p][X] = -1 */
-
             for (int n = 0; n < ndist; n++) {
                 for (int p = 1, i = 0; p < lb->model.nvel; p++) {
                     if (lb->model.cv[p][X] == -1) {
@@ -268,7 +237,6 @@ __global__ void copy_back(lb_t *lb, lees_edw_t *le, kernel_ctxt_t * ktxt) {
  *****************************************************************************/
 
 __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
-    nvtxRangeId_t id = nvtxRangeStartA("MY ASCII LABEL");
     int mpi_cartsz[3];
 
     assert(lb);
@@ -288,7 +256,6 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
         if (ndevice > 0) {
             copyModelToDevice(&lb->model, &lb->target->model);
             lees_edw_target(le, &le_target);
-           // copy_buffer_duy_to_device(le_target, le->buffer_duy, le->param->nxbuffer);
         }
        
         int nlocal[3], nplane;
@@ -325,7 +292,6 @@ __host__ int lb_le_apply_boundary_conditions(lb_t *lb, lees_edw_t *le) {
 
         TIMER_stop(TIMER_LE);
     }
-    nvtxRangeEnd(id);
     
     return 0;
 }
@@ -437,7 +403,6 @@ __global__ static void le_reproject(lb_t *lb, lees_edw_t *le, kernel_ctxt_t * kt
                     }
 
                     /* Project all this back to the distribution. */
-
                     lb_f(lb, index, p, n, &fnew);
                     fnew += lb->model.wv[p] * (rho * udotc * rcs2 + 0.5 * sdotq * rcs2 * rcs2);
                     lb_f_set(lb, index, p, n, fnew);
@@ -483,11 +448,6 @@ void le_displace_and_interpolate(lb_t *lb, lees_edw_t *le) {
 
     t = 1.0 * physics_control_timestep(phys);
 
-    /* We need to interpolate into a temporary buffer to make sure we
-     * don't overwrite distributions taking part. The size is just
-     * determined by the size of the local domain, and the number
-     * of plane-crossing distributions. */
-
     //tdp
     dim3 nblk, ntpb;
     kernel_info_t limits;

From 16519804822177f3ed05aa7cc3412b9230006b9b Mon Sep 17 00:00:00 2001
From: user name <alexeib_ludwig@cirrus-login1.ib0.icexa.epcc.ed.ac.uk>
Date: Mon, 22 Jul 2024 16:32:15 +0100
Subject: [PATCH 028/133] add initial implementation for gpu aware mpi
 communication of halos

---
 src/lb_data.h |   4 +
 src/model.c   | 211 +++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 206 insertions(+), 9 deletions(-)

diff --git a/src/lb_data.h b/src/lb_data.h
index 9c220aa54..e22b5a0e6 100644
--- a/src/lb_data.h
+++ b/src/lb_data.h
@@ -88,6 +88,10 @@ struct lb_halo_s {
   double * recv[27];              /* halo: recv buffer per direction */
   MPI_Request request[2*27];      /* halo: array of requests */
 
+  tdpStream_t stream;
+  lb_halo_t * target;
+  double * send_d[27];            /* halo: device send buffer per direction */
+  double * recv_d[27];            /* halo: device recv buffer per direction */
 };
 
 int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme);
diff --git a/src/model.c b/src/model.c
index 475b21325..ff862e043 100644
--- a/src/model.c
+++ b/src/model.c
@@ -47,6 +47,17 @@ int lb_halo_enqueue_send(const lb_t * lb, lb_halo_t * h, int irreq);
 
 static __constant__ lb_collide_param_t static_param;
 
+#ifdef HAVE_OPENMPI_
+/* This provides MPIX_CUDA_AWARE_SUPPORT .. */
+#include "mpi-ext.h"
+#endif
+
+#if defined (MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
+static const int have_gpu_aware_mpi_ = 1;
+#else
+static const int have_gpu_aware_mpi_ = 0;
+#endif
+
 /*****************************************************************************
  *
  *  lb_data_create
@@ -1019,6 +1030,63 @@ int lb_halo_enqueue_send(const lb_t * lb, lb_halo_t * h, int ireq) {
   return 0;
 }
 
+/*****************************************************************************
+ *
+ *  lb_halo_enqueue_send_kernel
+ *
+ *  Pack the send buffer. The ireq determines the direction of the
+ *  communication. Target version.
+ *
+ *****************************************************************************/
+
+__global__ void lb_halo_enqueue_send_kernel(const lb_t * lb, lb_halo_t * h, int ireq) {
+
+  assert(0 <= ireq && ireq < h->map.nvel);
+
+  if (h->count[ireq] > 0) {
+
+    int8_t mx = h->map.cv[ireq][X];
+    int8_t my = h->map.cv[ireq][Y];
+    int8_t mz = h->map.cv[ireq][Z];
+    int8_t mm = mx*mx + my*my + mz*mz;
+
+    int nx = 1 + h->slim[ireq].imax - h->slim[ireq].imin;
+    int ny = 1 + h->slim[ireq].jmax - h->slim[ireq].jmin;
+    int nz = 1 + h->slim[ireq].kmax - h->slim[ireq].kmin;
+
+    int strz = 1;
+    int stry = strz*nz;
+    int strx = stry*ny;
+
+    assert(mm == 1 || mm == 2 || mm == 3);
+
+	  int ih = 0;
+    for_simt_parallel (ih, nx*ny*nz, 1) {
+      int ic = h->slim[ireq].imin + ih/strx;
+      int jc = h->slim[ireq].jmin + (ih % strx)/stry;
+      int kc = h->slim[ireq].kmin + (ih % stry)/strz;
+      int ib = 0; /* Buffer index */
+
+      for (int n = 0; n < lb->ndist; n++) {
+	      for (int p = 0; p < lb->nvel; p++) {
+	        /* Recall, if full, we need p = 0 */
+	        int8_t px = lb->model.cv[p][X];
+	        int8_t py = lb->model.cv[p][Y];
+	        int8_t pz = lb->model.cv[p][Z];
+	        int dot = mx*px + my*py + mz*pz;
+	        if (h->full || dot == mm) {
+	          int index = cs_index(lb->cs, ic, jc, kc);
+	          int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
+	          h->send[ireq][ih*h->count[ireq] + ib] = lb->f[laddr];
+	          ib++;
+	        }
+	      }
+      }
+      assert(ib == h->count[ireq]);
+    }
+  }
+}
+
 /*****************************************************************************
  *
  *  lb_halo_dequeue_recv
@@ -1091,6 +1159,76 @@ int lb_halo_dequeue_recv(lb_t * lb, const lb_halo_t * h, int ireq) {
   return 0;
 }
 
+/*****************************************************************************
+ *
+ *  lb_halo_dequeue_recv_kernel
+ *
+ *  Unpack the recv buffer into place in the distributions. Target version.
+ *
+ *****************************************************************************/
+
+__global__ void lb_halo_dequeue_recv_kernel(lb_t * lb, const lb_halo_t * h, int ireq) {
+
+  assert(lb);
+  assert(h);
+  assert(0 <= ireq && ireq < h->map.nvel);
+
+  if (h->count[ireq] > 0) {
+
+    /* The communication direction is reversed cf. the send... */
+    int8_t mx = h->map.cv[h->map.nvel-ireq][X];
+    int8_t my = h->map.cv[h->map.nvel-ireq][Y];
+    int8_t mz = h->map.cv[h->map.nvel-ireq][Z];
+    int8_t mm = mx*mx + my*my + mz*mz;
+
+    int nx = 1 + h->rlim[ireq].imax - h->rlim[ireq].imin;
+    int ny = 1 + h->rlim[ireq].jmax - h->rlim[ireq].jmin;
+    int nz = 1 + h->rlim[ireq].kmax - h->rlim[ireq].kmin;
+
+    int strz = 1;
+    int stry = strz*nz;
+    int strx = stry*ny;
+
+    double * recv = h->recv[ireq];
+
+    {
+      int i = 1 + mx;
+      int j = 1 + my;
+      int k = 1 + mz;
+      /* If Cartesian neighbour is self, just copy out of send buffer. */
+      if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) recv = h->send[ireq]; 
+    }
+
+    assert(mm == 1 || mm == 2 || mm == 3);
+
+	  int ih = 0;
+    for_simt_parallel (ih, nx*ny*nz, 1) {
+      int ic = h->rlim[ireq].imin + ih/strx;
+      int jc = h->rlim[ireq].jmin + (ih % strx)/stry;
+      int kc = h->rlim[ireq].kmin + (ih % stry)/strz;
+      int ib = 0; /* Buffer index */
+
+      for (int n = 0; n < lb->ndist; n++) {
+	      for (int p = 0; p < lb->nvel; p++) {
+	        /* For reduced swap, we must have -cv[p] here... */
+	        int8_t px = lb->model.cv[lb->nvel-p][X];
+	        int8_t py = lb->model.cv[lb->nvel-p][Y];
+	        int8_t pz = lb->model.cv[lb->nvel-p][Z];
+	        int dot = mx*px + my*py + mz*pz;
+
+	        if (h->full || dot == mm) {
+	          int index = cs_index(lb->cs, ic, jc, kc);
+	          int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
+	          lb->f[laddr] = recv[ih*h->count[ireq] + ib];
+	          ib++;
+	        }
+	      }
+      }
+      assert(ib == h->count[ireq]);
+    }
+  }
+}
+
 /*****************************************************************************
  *
  *  lb_halo_create
@@ -1189,7 +1327,7 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
 
   /* Message count (velocities) for each communication direction */
 
-  for (int p = 1; p < h->map.nvel; p++) {
+   for (int p = 1; p < h->map.nvel; p++) {
 
     int count = 0;
 
@@ -1237,6 +1375,35 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
     h->request[ireq] = MPI_REQUEST_NULL;
   }
 
+
+  /* Device */
+
+  int ndevice;
+  tdpGetDeviceCount(&ndevice);
+  tdpStreamCreate(&h->stream);
+
+  if (ndevice == 0) {
+    h->target = h;
+  }
+  else {
+    tdpAssert( tdpMalloc((void **) &h->target, sizeof(lb_halo_t)) );
+    tdpAssert( tdpMemcpy(h->target, h, sizeof(lb_halo_t),
+			 tdpMemcpyHostToDevice) );
+
+    for (int p = 1; p < h->map.nvel; p++) {         
+      int scount = h->map.nvel*lb_halo_size(h->slim[p]);  // h->map.nvel*lb->ndist used for full halo exchange for now. work on implementing reduced halo exchange using something like h->count calculated above.
+      int rcount = h->map.nvel*lb_halo_size(h->rlim[p]);
+      tdpAssert( tdpMalloc((void**) &h->send_d[p], scount * sizeof(double)) );
+      tdpAssert( tdpMalloc((void**) &h->recv_d[p], rcount * sizeof(double)) );
+    }
+    /* Slightly tricksy. Could use send_d and recv_d on target copy ...*/
+    tdpAssert( tdpMemcpy(h->target->send, h->send_d, 27*sizeof(double *),     
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(h->target->recv, h->recv_d, 27*sizeof(double *),
+			 tdpMemcpyHostToDevice) );
+
+  }
+
   return 0;
 }
 
@@ -1271,10 +1438,12 @@ int lb_halo_post(const lb_t * lb, lb_halo_t * h) {
       int j = 1 + h->map.cv[h->map.nvel-ireq][Y];
       int k = 1 + h->map.cv[h->map.nvel-ireq][Z];
       int mcount = h->count[ireq]*lb_halo_size(h->rlim[ireq]);
+      double * buf = h->recv[ireq];
+      if (have_gpu_aware_mpi_) buf = h->recv_d[ireq];
 
       if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) mcount = 0;
       
-      MPI_Irecv(h->recv[ireq], mcount, MPI_DOUBLE, h->nbrrank[i][j][k],
+      MPI_Irecv(buf, mcount, MPI_DOUBLE, h->nbrrank[i][j][k],
 		h->tagbase + ireq, h->comm, h->request + ireq);
     }
   }
@@ -1286,10 +1455,21 @@ int lb_halo_post(const lb_t * lb, lb_halo_t * h) {
 
   TIMER_start(TIMER_LB_HALO_PACK);
 
-  #pragma omp parallel
-  {
+  int ndevice;
+  tdpGetDeviceCount(&ndevice);
+  if (ndevice > 0) {
     for (int ireq = 0; ireq < h->map.nvel; ireq++) {
-      lb_halo_enqueue_send(lb, h, ireq);
+      int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
+      dim3 nblk, ntpb;
+      kernel_launch_param(scount, &nblk, &ntpb);
+      lb_halo_enqueue_send_kernel<<<nblk, ntpb>>>(lb, h, ireq);
+    }
+  } else {
+    #pragma omp parallel
+    {
+      for (int ireq = 0; ireq < h->map.nvel; ireq++) {
+        lb_halo_enqueue_send(lb, h, ireq);
+      }
     }
   }
 
@@ -1306,11 +1486,13 @@ int lb_halo_post(const lb_t * lb, lb_halo_t * h) {
       int j = 1 + h->map.cv[ireq][Y];
       int k = 1 + h->map.cv[ireq][Z];
       int mcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
+      double * buf = h->send[ireq];
+      if (have_gpu_aware_mpi_) buf = h->send_d[ireq];
 
       /* Short circuit messages to self. */
       if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) mcount = 0;
 
-      MPI_Isend(h->send[ireq], mcount, MPI_DOUBLE, h->nbrrank[i][j][k],
+      MPI_Isend(buf, mcount, MPI_DOUBLE, h->nbrrank[i][j][k],
 		h->tagbase + ireq, h->comm, h->request + 27 + ireq);
     }
   }
@@ -1339,10 +1521,21 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
 
   TIMER_start(TIMER_LB_HALO_UNPACK);
 
-  #pragma omp parallel
-  {
+  int ndevice;
+  tdpGetDeviceCount(&ndevice);
+  if (ndevice > 0) {
     for (int ireq = 0; ireq < h->map.nvel; ireq++) {
-      lb_halo_dequeue_recv(lb, h, ireq);
+      int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
+      dim3 nblk, ntpb;
+      kernel_launch_param(rcount, &nblk, &ntpb);
+      lb_halo_dequeue_recv_kernel<<<nblk, ntpb>>>(lb, h, ireq);
+    }
+  } else {
+    #pragma omp parallel
+    {
+      for (int ireq = 0; ireq < h->map.nvel; ireq++) {
+        lb_halo_dequeue_recv(lb, h, ireq);
+      }
     }
   }
 

From 1c502c3dacee3bee3b00577ea4be330432b71d91 Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@cirrus.epcc.ed.ac.uk>
Date: Fri, 26 Jul 2024 17:16:36 +0100
Subject: [PATCH 029/133] Add i/o aggregator check

---
 src/lb_data.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/lb_data.c b/src/lb_data.c
index 6834e2bbd..15e33486e 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -169,6 +169,17 @@ int lb_data_create(pe_t * pe, cs_t * cs, const lb_data_options_t * options,
     if (ifail != 0) pe_fatal(pe, "lb_data: bad i/o output decomposition\n");
   }
 
+  /* Run the aggregator here and now in an attempt to make sure we
+   * don't suffer an oom at the point of output */
+
+  {
+    io_impl_t * io = NULL;
+    int ifail = io_impl_create(&obj->output, &io);
+    if (ifail != 0) pe_exit(pe, "lb_data.c: error in aggregator creation\n");
+    lb_io_aggr_pack(obj, io->aggr);
+    io->impl->free(&io);
+  }
+
   *lb = obj;
 
   return 0;

From 3d8c9c6fc661749692c9cc10cb50636823b20ca7 Mon Sep 17 00:00:00 2001
From: user name <alexeib_ludwig@cirrus-login1.ib0.icexa.epcc.ed.ac.uk>
Date: Mon, 29 Jul 2024 12:00:12 +0100
Subject: [PATCH 030/133] use tdp calls to launch kernels

---
 src/lb_data.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 1ed1b4731..f4ece9733 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1261,7 +1261,7 @@ int lb_halo_post(const lb_t * lb, lb_halo_t * h) {
       int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
       dim3 nblk, ntpb;
       kernel_launch_param(scount, &nblk, &ntpb);
-      lb_halo_enqueue_send_kernel<<<nblk, ntpb>>>(lb, h, ireq);
+      tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb, h, ireq);
     }
   } else {
     #pragma omp parallel
@@ -1327,7 +1327,7 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
       int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
       dim3 nblk, ntpb;
       kernel_launch_param(rcount, &nblk, &ntpb);
-      lb_halo_dequeue_recv_kernel<<<nblk, ntpb>>>(lb, h, ireq);
+      tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb, h, ireq);
     }
   } else {
     #pragma omp parallel

From 4e3e8b6963268707cde496287066b91ac9a86077 Mon Sep 17 00:00:00 2001
From: user name <alexeib_ludwig@cirrus-login3.ib0.icexa.epcc.ed.ac.uk>
Date: Tue, 30 Jul 2024 10:18:52 +0100
Subject: [PATCH 031/133] use reduced halos for allocating send and receive
 buffers on the device

---
 src/lb_data.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index f4ece9733..ae92d26d0 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1126,6 +1126,8 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
 
   /* Message count (velocities) for each communication direction */
 
+   int8_t *send_count = (int8_t *) malloc(h->map.nvel * sizeof(int8_t));
+   int8_t *recv_count = (int8_t *) malloc(h->map.nvel * sizeof(int8_t));
    for (int p = 1; p < h->map.nvel; p++) {
 
     int count = 0;
@@ -1157,12 +1159,14 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
     /* Allocate send buffer for send region */
     if (count > 0) {
       int scount = count*lb_halo_size(h->slim[p]);
+      send_count[p] = scount;
       h->send[p] = (double *) calloc(scount, sizeof(double));
       assert(h->send[p]);
     }
     /* Allocate recv buffer */
     if (count > 0) {
       int rcount = count*lb_halo_size(h->rlim[p]);
+      recv_count[p] = count;
       h->recv[p] = (double *) calloc(rcount, sizeof(double));
       assert(h->recv[p]);
     }
@@ -1190,8 +1194,8 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
 			 tdpMemcpyHostToDevice) );
 
     for (int p = 1; p < h->map.nvel; p++) {         
-      int scount = h->map.nvel*lb_halo_size(h->slim[p]);  // h->map.nvel*lb->ndist used for full halo exchange for now. work on implementing reduced halo exchange using something like h->count calculated above.
-      int rcount = h->map.nvel*lb_halo_size(h->rlim[p]);
+      int scount = send_count[p]*lb_halo_size(h->slim[p]);  
+      int rcount = recv_count[p]*lb_halo_size(h->rlim[p]);
       tdpAssert( tdpMalloc((void**) &h->send_d[p], scount * sizeof(double)) );
       tdpAssert( tdpMalloc((void**) &h->recv_d[p], rcount * sizeof(double)) );
     }

From bf6689054958a5910852c4a7fa13cc337bb63570 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Thu, 1 Aug 2024 15:05:10 +0100
Subject: [PATCH 032/133] update halo free function to free device pointers

---
 src/lb_data.c | 46 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index ae92d26d0..1ad5ece5a 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -541,6 +541,8 @@ __host__ int lb_halo_swap(lb_t * lb, lb_halo_enum_t flag) {
   case LB_HALO_TARGET:
     tdpMemcpy(&data, &lb->target->f, sizeof(double *), tdpMemcpyDeviceToHost);
     halo_swap_packed(lb->halo, data);
+    //lb_halo_post(lb, &lb->h);
+    //lb_halo_wait(lb, &lb->h);
     break;
   case LB_HALO_OPENMP_FULL:
     lb_halo_post(lb, &lb->h);
@@ -876,7 +878,7 @@ __global__ void lb_halo_enqueue_send_kernel(const lb_t * lb, lb_halo_t * h, int
 	        if (h->full || dot == mm) {
 	          int index = cs_index(lb->cs, ic, jc, kc);
 	          int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
-	          h->send[ireq][ih*h->count[ireq] + ib] = lb->f[laddr];
+	          h->send_d[ireq][ih*h->count[ireq] + ib] = lb->f[laddr];
 	          ib++;
 	        }
 	      }
@@ -1262,10 +1264,12 @@ int lb_halo_post(const lb_t * lb, lb_halo_t * h) {
   tdpGetDeviceCount(&ndevice);
   if (ndevice > 0) {
     for (int ireq = 0; ireq < h->map.nvel; ireq++) {
-      int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
-      dim3 nblk, ntpb;
-      kernel_launch_param(scount, &nblk, &ntpb);
-      tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb, h, ireq);
+      if (h->count[ireq] > 0) {
+        int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
+        dim3 nblk, ntpb;
+        kernel_launch_param(scount, &nblk, &ntpb);
+        tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb, h, ireq);
+      }
     }
   } else {
     #pragma omp parallel
@@ -1276,6 +1280,11 @@ int lb_halo_post(const lb_t * lb, lb_halo_t * h) {
     }
   }
 
+  //cudaDeviceSynchronize();
+  //printf("done kernel\n");
+  //MPI_Barrier(MPI_COMM_WORLD);
+  //abort();
+
   TIMER_stop(TIMER_LB_HALO_PACK);
 
   TIMER_start(TIMER_LB_HALO_ISEND);
@@ -1296,7 +1305,7 @@ int lb_halo_post(const lb_t * lb, lb_halo_t * h) {
       if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) mcount = 0;
 
       MPI_Isend(buf, mcount, MPI_DOUBLE, h->nbrrank[i][j][k],
-		h->tagbase + ireq, h->comm, h->request + 27 + ireq);
+		            h->tagbase + ireq, h->comm, h->request + 27 + ireq);
     }
   }
 
@@ -1328,10 +1337,12 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
   tdpGetDeviceCount(&ndevice);
   if (ndevice > 0) {
     for (int ireq = 0; ireq < h->map.nvel; ireq++) {
-      int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
-      dim3 nblk, ntpb;
-      kernel_launch_param(rcount, &nblk, &ntpb);
-      tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb, h, ireq);
+      if (h->count[ireq] > 0) {
+        int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
+        dim3 nblk, ntpb;
+        kernel_launch_param(rcount, &nblk, &ntpb);
+        tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb, h, ireq);
+      }
     }
   } else {
     #pragma omp parallel
@@ -1360,6 +1371,21 @@ int lb_halo_free(lb_t * lb, lb_halo_t * h) {
   assert(lb);
   assert(h);
 
+  int ndevice = 0;
+  tdpGetDeviceCount(&ndevice);
+
+  if (ndevice > 0) {
+    tdpAssert( tdpMemcpy(h->send_d, h->target->send, 27*sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpy(h->recv_d, h->target->recv, 27*sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    for (int p = 1; p < h->nvel; p++) {
+      tdpFree(h->send_d[p]);
+      tdpFree(h->recv_d[p]);
+    }
+    tdpFree(h->target);
+  }
+
   for (int ireq = 0; ireq < 27; ireq++) {
     free(h->send[ireq]);
     free(h->recv[ireq]);

From f07483d5ff4c9337334d1219c64825669fab4777 Mon Sep 17 00:00:00 2001
From: ludwig-cf <40301532+ludwig-cf@users.noreply.github.com>
Date: Thu, 8 Aug 2024 10:39:08 +0100
Subject: [PATCH 033/133] Update issue templates

---
 .github/ISSUE_TEMPLATE/bug_report.md        | 22 +++++++++++++++++++++
 .github/ISSUE_TEMPLATE/feature_request.md   | 18 +++++++++++++++++
 .github/ISSUE_TEMPLATE/general-questions.md | 10 ++++++++++
 3 files changed, 50 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md
 create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md
 create mode 100644 .github/ISSUE_TEMPLATE/general-questions.md

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 000000000..39d6f2e39
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,22 @@
+---
+name: Bug report
+about: Create a report to help us improve
+title: ''
+labels: bug
+assignees: ''
+
+---
+
+**Please provide a clear and concise description of the problem.**
+What is the problem?
+
+**How do we reproduce the problem?**
+Consider including:
+1. Details of the platform/compiler, i.e., your `config.mk` file
+2. The relevant `input` file and run time environment (MPI?)
+3. Details of any error messages
+
+**Expected behavior**
+What did you expect to happen?
+
+**Any other relevant details**
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 000000000..5d4d35adb
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,18 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: enhancement
+assignees: ''
+
+---
+
+**Summary**
+Please summarise what is missing, or what needs to be added.
+
+**Describe the solution you'd like**
+Please provide a clear and concise description of what you want to happen.
+
+
+**What's required?**
+Summarise the steps which may be required to implement the feature.
diff --git a/.github/ISSUE_TEMPLATE/general-questions.md b/.github/ISSUE_TEMPLATE/general-questions.md
new file mode 100644
index 000000000..8e689a3c3
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/general-questions.md
@@ -0,0 +1,10 @@
+---
+name: General questions
+about: Please ask ...
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+

From 4a026ebefa32377b66facb04c25a10e579b0e3a6 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Tue, 13 Aug 2024 15:46:47 +0100
Subject: [PATCH 034/133] debugging

---
 src/field.c   |  3 +++
 src/lb_data.c | 38 +++++++++++++++++++++++---------------
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/src/field.c b/src/field.c
index bb216f232..9660c2f6b 100644
--- a/src/field.c
+++ b/src/field.c
@@ -1407,6 +1407,7 @@ int field_halo_create(const field_t * field, field_halo_t * h) {
     for (int p = 1; p < h->nvel; p++) {
       int scount = field->nf*field_halo_size(h->slim[p]);
       int rcount = field->nf*field_halo_size(h->rlim[p]);
+      printf("field create p %d send count %d\n", p, field->nf);
       tdpAssert( tdpMalloc((void**) &h->send_d[p], scount * sizeof(double)) );
       tdpAssert( tdpMalloc((void**) &h->recv_d[p], rcount * sizeof(double)) );
     }
@@ -1457,6 +1458,7 @@ int field_halo_post(const field_t * field, field_halo_t * h) {
 
     /* Skip messages to self */
     if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) continue;
+    //printf("field halo post buf %f\n", buf[ireq]);
 
     MPI_Irecv(buf, mcount, MPI_DOUBLE, h->nbrrank[i][j][k],
 	      tagbase + ireq, h->comm, h->request + ireq);
@@ -1498,6 +1500,7 @@ int field_halo_post(const field_t * field, field_halo_t * h) {
     /* Skip messages to self ... */
     if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) continue;
 
+    //printf("send ireq %d send count %d\n", ireq, mcount);
     MPI_Isend(buf, mcount, MPI_DOUBLE, h->nbrrank[i][j][k],
 	      tagbase + ireq, h->comm, h->request + 27 + ireq);
   }
diff --git a/src/lb_data.c b/src/lb_data.c
index 1ad5ece5a..444a96847 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -539,10 +539,10 @@ __host__ int lb_halo_swap(lb_t * lb, lb_halo_enum_t flag) {
 
   switch (flag) {
   case LB_HALO_TARGET:
-    tdpMemcpy(&data, &lb->target->f, sizeof(double *), tdpMemcpyDeviceToHost);
-    halo_swap_packed(lb->halo, data);
-    //lb_halo_post(lb, &lb->h);
-    //lb_halo_wait(lb, &lb->h);
+    //tdpMemcpy(&data, &lb->target->f, sizeof(double *), tdpMemcpyDeviceToHost);
+    //halo_swap_packed(lb->halo, data);
+    lb_halo_post(lb, &lb->h);
+    lb_halo_wait(lb, &lb->h);
     break;
   case LB_HALO_OPENMP_FULL:
     lb_halo_post(lb, &lb->h);
@@ -878,7 +878,7 @@ __global__ void lb_halo_enqueue_send_kernel(const lb_t * lb, lb_halo_t * h, int
 	        if (h->full || dot == mm) {
 	          int index = cs_index(lb->cs, ic, jc, kc);
 	          int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
-	          h->send_d[ireq][ih*h->count[ireq] + ib] = lb->f[laddr];
+	          h->send[ireq][ih*h->count[ireq] + ib] = lb->f[laddr];
 	          ib++;
 	        }
 	      }
@@ -1195,9 +1195,12 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
     tdpAssert( tdpMemcpy(h->target, h, sizeof(lb_halo_t),
 			 tdpMemcpyHostToDevice) );
 
-    for (int p = 1; p < h->map.nvel; p++) {         
-      int scount = send_count[p]*lb_halo_size(h->slim[p]);  
-      int rcount = recv_count[p]*lb_halo_size(h->rlim[p]);
+    for (int p = 0; p < h->map.nvel; p++) {         
+      //int scount = send_count[p]*lb_halo_size(h->slim[p]);  
+      //int rcount = recv_count[p]*lb_halo_size(h->rlim[p]);
+      int scount = 96*lb_halo_size(h->slim[p]);  
+      int rcount = 96*lb_halo_size(h->rlim[p]);
+      printf("lb create p %d send count %d\n", p, send_count[p]);
       tdpAssert( tdpMalloc((void**) &h->send_d[p], scount * sizeof(double)) );
       tdpAssert( tdpMalloc((void**) &h->recv_d[p], rcount * sizeof(double)) );
     }
@@ -1280,16 +1283,15 @@ int lb_halo_post(const lb_t * lb, lb_halo_t * h) {
     }
   }
 
-  //cudaDeviceSynchronize();
-  //printf("done kernel\n");
-  //MPI_Barrier(MPI_COMM_WORLD);
-  //abort();
+  cudaDeviceSynchronize();
+  printf("done kernel\n");
+  MPI_Barrier(MPI_COMM_WORLD);
 
   TIMER_stop(TIMER_LB_HALO_PACK);
 
   TIMER_start(TIMER_LB_HALO_ISEND);
 
-  for (int ireq = 0; ireq < h->map.nvel; ireq++) {
+  for (int ireq = 1; ireq < h->map.nvel; ireq++) {
 
     h->request[27+ireq] = MPI_REQUEST_NULL;
 
@@ -1302,12 +1304,18 @@ int lb_halo_post(const lb_t * lb, lb_halo_t * h) {
       if (have_gpu_aware_mpi_) buf = h->send_d[ireq];
 
       /* Short circuit messages to self. */
-      if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) mcount = 0;
+      //if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) mcount = 0;
+      if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) continue;
+      printf("send ireq %d scount %d\n", ireq, mcount);
 
       MPI_Isend(buf, mcount, MPI_DOUBLE, h->nbrrank[i][j][k],
 		            h->tagbase + ireq, h->comm, h->request + 27 + ireq);
     }
   }
+  
+  cudaDeviceSynchronize();
+  printf("done sending\n");
+  MPI_Barrier(MPI_COMM_WORLD);
 
   TIMER_stop(TIMER_LB_HALO_ISEND);
 
@@ -1379,7 +1387,7 @@ int lb_halo_free(lb_t * lb, lb_halo_t * h) {
 			 tdpMemcpyDeviceToHost) );
     tdpAssert( tdpMemcpy(h->recv_d, h->target->recv, 27*sizeof(double *),
 			 tdpMemcpyDeviceToHost) );
-    for (int p = 1; p < h->nvel; p++) {
+    for (int p = 1; p < h->map.nvel; p++) {
       tdpFree(h->send_d[p]);
       tdpFree(h->recv_d[p]);
     }

From f854feb7794a53215e6f8509778e01031f1e22d0 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Wed, 14 Aug 2024 14:39:48 +0100
Subject: [PATCH 035/133] remove print statements

---
 src/field.c   | 3 ---
 src/lb_data.c | 6 ++----
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/field.c b/src/field.c
index 9660c2f6b..bb216f232 100644
--- a/src/field.c
+++ b/src/field.c
@@ -1407,7 +1407,6 @@ int field_halo_create(const field_t * field, field_halo_t * h) {
     for (int p = 1; p < h->nvel; p++) {
       int scount = field->nf*field_halo_size(h->slim[p]);
       int rcount = field->nf*field_halo_size(h->rlim[p]);
-      printf("field create p %d send count %d\n", p, field->nf);
       tdpAssert( tdpMalloc((void**) &h->send_d[p], scount * sizeof(double)) );
       tdpAssert( tdpMalloc((void**) &h->recv_d[p], rcount * sizeof(double)) );
     }
@@ -1458,7 +1457,6 @@ int field_halo_post(const field_t * field, field_halo_t * h) {
 
     /* Skip messages to self */
     if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) continue;
-    //printf("field halo post buf %f\n", buf[ireq]);
 
     MPI_Irecv(buf, mcount, MPI_DOUBLE, h->nbrrank[i][j][k],
 	      tagbase + ireq, h->comm, h->request + ireq);
@@ -1500,7 +1498,6 @@ int field_halo_post(const field_t * field, field_halo_t * h) {
     /* Skip messages to self ... */
     if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) continue;
 
-    //printf("send ireq %d send count %d\n", ireq, mcount);
     MPI_Isend(buf, mcount, MPI_DOUBLE, h->nbrrank[i][j][k],
 	      tagbase + ireq, h->comm, h->request + 27 + ireq);
   }
diff --git a/src/lb_data.c b/src/lb_data.c
index 444a96847..ca5fad8dd 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1198,9 +1198,8 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
     for (int p = 0; p < h->map.nvel; p++) {         
       //int scount = send_count[p]*lb_halo_size(h->slim[p]);  
       //int rcount = recv_count[p]*lb_halo_size(h->rlim[p]);
-      int scount = 96*lb_halo_size(h->slim[p]);  
+      int scount = 96*lb_halo_size(h->slim[p]);  // For some reason send_count[p] is zero for some values of p, which might cause issues so set to max observed value for now.
       int rcount = 96*lb_halo_size(h->rlim[p]);
-      printf("lb create p %d send count %d\n", p, send_count[p]);
       tdpAssert( tdpMalloc((void**) &h->send_d[p], scount * sizeof(double)) );
       tdpAssert( tdpMalloc((void**) &h->recv_d[p], rcount * sizeof(double)) );
     }
@@ -1271,7 +1270,7 @@ int lb_halo_post(const lb_t * lb, lb_halo_t * h) {
         int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
         dim3 nblk, ntpb;
         kernel_launch_param(scount, &nblk, &ntpb);
-        tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb, h, ireq);
+        tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
       }
     }
   } else {
@@ -1306,7 +1305,6 @@ int lb_halo_post(const lb_t * lb, lb_halo_t * h) {
       /* Short circuit messages to self. */
       //if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) mcount = 0;
       if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) continue;
-      printf("send ireq %d scount %d\n", ireq, mcount);
 
       MPI_Isend(buf, mcount, MPI_DOUBLE, h->nbrrank[i][j][k],
 		            h->tagbase + ireq, h->comm, h->request + 27 + ireq);

From 5d4af34fbb1b37dd4aad81809c4682c417f8657b Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Mon, 26 Aug 2024 11:18:20 +0100
Subject: [PATCH 036/133] add model copies, make sure we're waiting on the
 target rather than the host

---
 src/lb_data.c | 57 +++++++++++++++++++++++++++++++++++++++++++--------
 src/lb_data.h |  2 +-
 2 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index ca5fad8dd..9689c6edf 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -52,6 +52,33 @@ static const int have_gpu_aware_mpi_ = 1;
 static const int have_gpu_aware_mpi_ = 0;
 #endif
 
+void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
+    int nvel = h_model->nvel;
+    // Allocate memory on the GPU for the arrays in the struct
+    int8_t (*d_cv)[3];
+    double *d_wv;
+    double *d_na;
+
+    tdpMalloc((void**)&d_cv, sizeof(int8_t[3]) * nvel);
+    tdpMalloc((void**)&d_wv, sizeof(double) * nvel);
+    tdpMalloc((void**)&d_na, sizeof(double) * nvel);
+
+    // Copy the data from host to the GPU
+    tdpMemcpy(d_cv, h_model->cv, sizeof(int8_t[3]) * nvel, tdpMemcpyHostToDevice);
+    tdpMemcpy(d_wv, h_model->wv, sizeof(double) * nvel, tdpMemcpyHostToDevice);
+    tdpMemcpy(d_na, h_model->na, sizeof(double) * nvel, tdpMemcpyHostToDevice);
+
+    // Set the pointers in the struct to the newly allocated GPU memory
+    tdpMemcpy(&(d_model->cv), &d_cv, sizeof(int8_t(*)[3]), tdpMemcpyHostToDevice);
+    tdpMemcpy(&(d_model->wv), &d_wv, sizeof(double*), tdpMemcpyHostToDevice);
+    tdpMemcpy(&(d_model->na), &d_na, sizeof(double*), tdpMemcpyHostToDevice);
+
+    //copy the rest data to gpu
+    tdpMemcpy(&(d_model->ndim), &(h_model->ndim), sizeof(int8_t), tdpMemcpyHostToDevice);
+    tdpMemcpy(&(d_model->nvel), &(h_model->nvel), sizeof(int8_t), tdpMemcpyHostToDevice);
+    tdpMemcpy(&(d_model->cs2), &(h_model->cs2), sizeof(double), tdpMemcpyHostToDevice);
+}
+
 /*****************************************************************************
  *
  *  lb_data_create
@@ -314,6 +341,7 @@ static int lb_init(lb_t * lb) {
   }
   else {
     lb_collide_param_t * ptmp  = NULL;
+    cs_t * cstarget = NULL;
 
     tdpMalloc((void **) &lb->target, sizeof(lb_t));
     tdpMemset(lb->target, 0, sizeof(lb_t));
@@ -330,6 +358,10 @@ static int lb_init(lb_t * lb) {
     tdpGetSymbolAddress((void **) &ptmp, tdpSymbol(static_param));
     tdpMemcpy(&lb->target->param, &ptmp, sizeof(lb_collide_param_t *),
 	      tdpMemcpyHostToDevice);
+    
+    cs_target(lb->cs, &cstarget);
+    tdpMemcpy(&lb->target->cs, &cstarget, sizeof(cs_t *),
+	      tdpMemcpyHostToDevice);
   }
 
   lb_mpi_init(lb);
@@ -1128,8 +1160,8 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
 
   /* Message count (velocities) for each communication direction */
 
-   int8_t *send_count = (int8_t *) malloc(h->map.nvel * sizeof(int8_t));
-   int8_t *recv_count = (int8_t *) malloc(h->map.nvel * sizeof(int8_t));
+   int *send_count = (int *) calloc(h->map.nvel, sizeof(int));
+   int *recv_count = (int *) calloc(h->map.nvel, sizeof(int));
    for (int p = 1; p < h->map.nvel; p++) {
 
     int count = 0;
@@ -1196,10 +1228,10 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
 			 tdpMemcpyHostToDevice) );
 
     for (int p = 0; p < h->map.nvel; p++) {         
-      //int scount = send_count[p]*lb_halo_size(h->slim[p]);  
-      //int rcount = recv_count[p]*lb_halo_size(h->rlim[p]);
-      int scount = 96*lb_halo_size(h->slim[p]);  // For some reason send_count[p] is zero for some values of p, which might cause issues so set to max observed value for now.
-      int rcount = 96*lb_halo_size(h->rlim[p]);
+      int scount = send_count[p]*lb_halo_size(h->slim[p]);  
+      int rcount = recv_count[p]*lb_halo_size(h->rlim[p]);
+      //int scount = 96*lb_halo_size(h->slim[p]);  // For some reason send_count[p] is zero for some values of p, which might cause issues so set to max observed value for now.
+      //int rcount = 96*lb_halo_size(h->rlim[p]);
       tdpAssert( tdpMalloc((void**) &h->send_d[p], scount * sizeof(double)) );
       tdpAssert( tdpMalloc((void**) &h->recv_d[p], rcount * sizeof(double)) );
     }
@@ -1210,6 +1242,8 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
 			 tdpMemcpyHostToDevice) );
 
   }
+  free(send_count);
+  free(recv_count);
 
   return 0;
 }
@@ -1220,7 +1254,7 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
  *
  *****************************************************************************/
 
-int lb_halo_post(const lb_t * lb, lb_halo_t * h) {
+int lb_halo_post(lb_t * lb, lb_halo_t * h) {
 
   assert(lb);
   assert(h);
@@ -1248,7 +1282,8 @@ int lb_halo_post(const lb_t * lb, lb_halo_t * h) {
       double * buf = h->recv[ireq];
       if (have_gpu_aware_mpi_) buf = h->recv_d[ireq];
 
-      if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) mcount = 0;
+      //if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) mcount = 0;
+      if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) continue;
       
       MPI_Irecv(buf, mcount, MPI_DOUBLE, h->nbrrank[i][j][k],
 		h->tagbase + ireq, h->comm, h->request + ireq);
@@ -1265,12 +1300,15 @@ int lb_halo_post(const lb_t * lb, lb_halo_t * h) {
   int ndevice;
   tdpGetDeviceCount(&ndevice);
   if (ndevice > 0) {
+    copyModelToDevice(&lb->model, &lb->target->model);
+    copyModelToDevice(&h->map, &h->target->map);
     for (int ireq = 0; ireq < h->map.nvel; ireq++) {
       if (h->count[ireq] > 0) {
         int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
         dim3 nblk, ntpb;
         kernel_launch_param(scount, &nblk, &ntpb);
         tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
+        cudaDeviceSynchronize();
       }
     }
   } else {
@@ -1347,7 +1385,8 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
         int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
         dim3 nblk, ntpb;
         kernel_launch_param(rcount, &nblk, &ntpb);
-        tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb, h, ireq);
+        tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
+        cudaDeviceSynchronize();
       }
     }
   } else {
diff --git a/src/lb_data.h b/src/lb_data.h
index 7b582e3a4..db60fe4d6 100644
--- a/src/lb_data.h
+++ b/src/lb_data.h
@@ -95,7 +95,7 @@ struct lb_halo_s {
 };
 
 int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme);
-int lb_halo_post(const lb_t * lb, lb_halo_t * h);
+int lb_halo_post(lb_t * lb, lb_halo_t * h);
 int lb_halo_wait(lb_t * lb, lb_halo_t * h);
 int lb_halo_free(lb_t * lb, lb_halo_t * h);
 

From 3837d6af58b7eea92178610f46d3441455d365f2 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Mon, 26 Aug 2024 11:28:46 +0100
Subject: [PATCH 037/133] remove debugging print statements

---
 src/lb_data.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 9689c6edf..cc16bbd20 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1308,7 +1308,7 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
         dim3 nblk, ntpb;
         kernel_launch_param(scount, &nblk, &ntpb);
         tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
-        cudaDeviceSynchronize();
+        tdpDeviceSynchronize();
       }
     }
   } else {
@@ -1320,10 +1320,6 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
     }
   }
 
-  cudaDeviceSynchronize();
-  printf("done kernel\n");
-  MPI_Barrier(MPI_COMM_WORLD);
-
   TIMER_stop(TIMER_LB_HALO_PACK);
 
   TIMER_start(TIMER_LB_HALO_ISEND);
@@ -1349,10 +1345,6 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
     }
   }
   
-  cudaDeviceSynchronize();
-  printf("done sending\n");
-  MPI_Barrier(MPI_COMM_WORLD);
-
   TIMER_stop(TIMER_LB_HALO_ISEND);
 
   return 0;
@@ -1386,7 +1378,7 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
         dim3 nblk, ntpb;
         kernel_launch_param(rcount, &nblk, &ntpb);
         tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
-        cudaDeviceSynchronize();
+        tdpDeviceSynchronize();
       }
     }
   } else {

From 4d3cbcf8947c23df3c8798354c636f8942bfd74c Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Fri, 6 Sep 2024 12:24:50 +0100
Subject: [PATCH 038/133] don't double count halo sizes

---
 src/lb_data.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index cc16bbd20..d78fcbefc 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1193,7 +1193,7 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
     /* Allocate send buffer for send region */
     if (count > 0) {
       int scount = count*lb_halo_size(h->slim[p]);
-      send_count[p] = scount;
+      send_count[p] = count;
       h->send[p] = (double *) calloc(scount, sizeof(double));
       assert(h->send[p]);
     }
@@ -1228,10 +1228,9 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
 			 tdpMemcpyHostToDevice) );
 
     for (int p = 0; p < h->map.nvel; p++) {         
+      // XXX: should avoid allocating zero-sized arrays (count is generally zero when p == 0)
       int scount = send_count[p]*lb_halo_size(h->slim[p]);  
       int rcount = recv_count[p]*lb_halo_size(h->rlim[p]);
-      //int scount = 96*lb_halo_size(h->slim[p]);  // For some reason send_count[p] is zero for some values of p, which might cause issues so set to max observed value for now.
-      //int rcount = 96*lb_halo_size(h->rlim[p]);
       tdpAssert( tdpMalloc((void**) &h->send_d[p], scount * sizeof(double)) );
       tdpAssert( tdpMalloc((void**) &h->recv_d[p], rcount * sizeof(double)) );
     }

From 0d3c218ce0764e1a3ac738f5a12bcf108d6aee25 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Tue, 1 Oct 2024 10:08:43 +0100
Subject: [PATCH 039/133] Relax cell list minimum constraint for ellipsoids

---
 src/colloids_rt.c                          |  43 ++++--
 tests/regression/d3q27/serial-elip-s10.inp |  94 ++++++++++++
 tests/regression/d3q27/serial-elip-s10.log | 158 +++++++++++++++++++++
 3 files changed, 280 insertions(+), 15 deletions(-)
 create mode 100644 tests/regression/d3q27/serial-elip-s10.inp
 create mode 100644 tests/regression/d3q27/serial-elip-s10.log

diff --git a/src/colloids_rt.c b/src/colloids_rt.c
index e172c5630..83d5c6e1b 100644
--- a/src/colloids_rt.c
+++ b/src/colloids_rt.c
@@ -1202,17 +1202,26 @@ int angle_cosine_init(pe_t * pe, cs_t * cs, rt_t * rt, interact_t * interact) {
  *  3) ncell >= 2       must have at least two cells to separate
  *                      'left-going' and 'right-going' communications.
  *
+ *  There are some edge cases where we can relax these constraints:
+ *
+ *  a) A direction which is non-periodic (e.g., has walls)
+ *     and is not decomposed (mpisz == 1) will have no message passing.
+ *     In such a case, (1) and (2) may be ignored, and (3) is relaxed to
+ *     ncell >= 1. This may be useful for systems which are "narrow" (cf. a0).
+ *
  *****************************************************************************/
 
 int colloids_init_halo_range_check(pe_t * pe, cs_t * cs,
 				   colloids_info_t * cinfo) {
 
   int ifail = 0;
-  int ncolloid;
-  int ncell[3];
-  int nlocal[3];
+  int ncolloid = 0;
+  int ncell[3] = {0};
+  int nlocal[3] = {0};
   int nhalo = 1;       /* Always, for purpose of BBL. */
 
+  int nar[3] = {0};    /* See point (a) above */
+
   double a0max = 0.0;  /* Maximum colloid a0 present */
   double lcell[3];
 
@@ -1228,23 +1237,27 @@ int colloids_init_halo_range_check(pe_t * pe, cs_t * cs,
 
   colloids_info_a0max(cinfo, &a0max);
 
-  if (2.0*a0max >= 1.0*(nlocal[X] - nhalo)) ifail = 1;
-  if (2.0*a0max >= 1.0*(nlocal[Y] - nhalo)) ifail = 1;
-  if (2.0*a0max >= 1.0*(nlocal[Z] - nhalo)) ifail = 1;
+  if (cs->param->periodic[X] == 0 && cs->param->mpi_cartsz[X] == 1) nar[X] = 1;
+  if (cs->param->periodic[Y] == 0 && cs->param->mpi_cartsz[Y] == 1) nar[Y] = 1;
+  if (cs->param->periodic[Z] == 0 && cs->param->mpi_cartsz[Z] == 1) nar[Z] = 1;
+
+  if (nar[X] == 0 && (2.0*a0max >= 1.0*(nlocal[X] - nhalo))) ifail = 1;
+  if (nar[Y] == 0 && (2.0*a0max >= 1.0*(nlocal[Y] - nhalo))) ifail = 1;
+  if (nar[Z] == 0 && (2.0*a0max >= 1.0*(nlocal[Z] - nhalo))) ifail = 1;
   if (ifail == 1) {
     pe_fatal(pe, "Particle diameter larger than (nlocal - 1) domain size\n");
   }
 
-  if (lcell[X] <= a0max) ifail = 1;
-  if (lcell[Y] <= a0max) ifail = 1;
-  if (lcell[Z] <= a0max) ifail = 1;
+  if (nar[X] == 0 && (lcell[X] <= a0max)) ifail = 1;
+  if (nar[Y] == 0 && (lcell[Y] <= a0max)) ifail = 1;
+  if (nar[Z] == 0 && (lcell[Z] <= a0max)) ifail = 1;
   if (ifail == 1) {
     pe_fatal(pe, "Particle a0 > cell width breaks BBL message passing\n");
   }
 
-  if (ncell[X] < 2) ifail = 1;
-  if (ncell[Y] < 2) ifail = 1;
-  if (ncell[Z] < 2) ifail = 1;
+  if (ncell[X] < (2 - nar[X])) ifail = 1;
+  if (ncell[Y] < (2 - nar[Y])) ifail = 1;
+  if (ncell[Z] < (2 - nar[Z])) ifail = 1;
 
   if (ifail == 1) {
     pe_fatal(pe, "Must have two cells minimum\n");
@@ -1254,9 +1267,9 @@ int colloids_init_halo_range_check(pe_t * pe, cs_t * cs,
 
   cs_nhalo(cs, &nhalo);
 
-  if (lcell[X] < (a0max + nhalo - 0.5)) ifail = 1;
-  if (lcell[Y] < (a0max + nhalo - 0.5)) ifail = 1;
-  if (lcell[Z] < (a0max + nhalo - 0.5)) ifail = 1;
+  if (nar[X] == 0 && (lcell[X] < (a0max + nhalo - 0.5))) ifail = 1;
+  if (nar[Y] == 0 && (lcell[Y] < (a0max + nhalo - 0.5))) ifail = 1;
+  if (nar[Z] == 0 && (lcell[Z] < (a0max + nhalo - 0.5))) ifail = 1;
 
   if (ifail == 1) {
     pe_fatal(pe, "Must have cell width > a0_max + nhalo\n");
diff --git a/tests/regression/d3q27/serial-elip-s10.inp b/tests/regression/d3q27/serial-elip-s10.inp
new file mode 100644
index 000000000..8ef6d7111
--- /dev/null
+++ b/tests/regression/d3q27/serial-elip-s10.inp
@@ -0,0 +1,94 @@
+##############################################################################
+#
+#  Colloid simple configuration output
+#
+##############################################################################
+
+N_start  0
+N_cycles 10
+
+##############################################################################
+#
+#  System and MPI
+# 
+##############################################################################
+
+size        128_18_18
+periodicity 1_0_0
+
+##############################################################################
+#
+#  Fluid parameters
+#
+##############################################################################
+
+viscosity      0.1
+fluid_rho0     1.0
+
+##############################################################################
+#
+#  Free energy parameters
+#
+###############################################################################
+
+free_energy  none
+
+###############################################################################
+#
+#  Colloid parameters
+#
+###############################################################################
+
+colloid_init        input_one
+colloid_one_shape   ellipsoid
+colloid_one_elabc   9.562082_4.996656_4.996656
+colloid_one_euler   0.0_0.0_0.0 
+colloid_one_r       64.5_9.5_9.5
+
+###############################################################################
+#
+# Colloid-colloid soft-sphere potential parameters
+# The soft sphere is always needed
+#
+###############################################################################
+
+
+colloid_gravity        0.005_0.0_0.0
+
+###############################################################################
+#
+#  Walls / boundaries
+#
+###############################################################################
+
+boundary_walls 0_1_1
+
+###############################################################################
+#
+#  Output frequency and type
+#
+###############################################################################
+
+default_io_mode    mpiio
+default_io_format  binary
+freq_statistics    10
+config_at_end      no
+
+##############################################################################
+#
+#  colloid i/o
+#
+##############################################################################
+
+colloid_io_freq          1000
+colloid_io_format_output ASCII
+
+###############################################################################
+#
+#  Miscellaneous
+#
+#  random_seed  +ve integer is the random number generator seed
+#
+###############################################################################
+
+random_seed 8361435
diff --git a/tests/regression/d3q27/serial-elip-s10.log b/tests/regression/d3q27/serial-elip-s10.log
new file mode 100644
index 000000000..9f2c3a76a
--- /dev/null
+++ b/tests/regression/d3q27/serial-elip-s10.log
@@ -0,0 +1,158 @@
+Welcome to: Ludwig v0.22.0 (Serial version running on 1 process)
+Git commit: 9255199534dfc86c44c98ff14cfb3dfa09108af5
+
+Start time: Tue Oct  1 10:00:36 2024
+
+Compiler:
+  name:           Gnu 14.1.0
+  version-string: 14.1.0
+  options:        -O2 -g -Wall -Werror
+
+Note assertions via standard C assert() are on.
+
+Target thread model: OpenMP.
+OpenMP threads: 1; maximum number of threads: 11.
+
+Read 21 user parameters from input
+
+No free energy selected
+
+System details
+--------------
+System size:    128 18 18
+Decomposition:  1 1 1
+Local domain:   128 18 18
+Periodic:       1 0 0
+Halo nhalo:     1
+Reorder:        true
+Initialised:    1
+
+System properties
+----------------
+Mean fluid density:           1.00000e+00
+Shear viscosity               1.00000e-01
+Bulk viscosity                1.00000e-01
+Temperature                   0.00000e+00
+External body force density   0.00000e+00  0.00000e+00  0.00000e+00
+External E-field amplitude    0.00000e+00  0.00000e+00  0.00000e+00
+External E-field frequency    0.00000e+00
+External magnetic field       0.00000e+00  0.00000e+00  0.00000e+00
+
+Lattice Boltzmann distributions
+-------------------------------
+Model:            d3q27  
+SIMD vector len:  1
+Number of sets:   1
+Halo type:        lb_halo_target (full halo)
+Input format:     binary
+Output format:    binary
+I/O grid:         1 1 1
+
+Lattice Boltzmann collision
+---------------------------
+Relaxation time scheme:   M10
+Hydrodynamic modes:       on
+Ghost modes:              on
+Isothermal fluctuations:  off
+Shear relaxation time:    8.00000e-01
+Bulk relaxation time:     8.00000e-01
+Ghost relaxation time:    1.00000e+00
+[User   ] Random number seed: 8361435
+
+Hydrodynamics
+-------------
+Hydrodynamics: on
+
+Boundary walls
+--------------
+Boundary walls:                  - Y Z
+Boundary speed u_x (bottom):     0.0000000e+00
+Boundary speed u_x (top):        0.0000000e+00
+Boundary normal lubrication rc:  0.0000000e+00
+Wall boundary links allocated:   81408
+Memory (total, bytes):           1302528
+Boundary shear initialise:       0
+
+Colloid information
+-------------------
+
+Colloid I/O settings
+--------------------
+Decomposition:                1  1  1
+Number of files:              1
+Input format:                 ascii
+Output format:                ascii
+Single file read flag:        0
+
+Requested one colloid via input:
+colloid_one                   ellipsoid
+colloid_one_r                 6.4500000e+01  9.5000000e+00  9.5000000e+00
+colloid_one_elabc             9.5620820e+00  4.9966560e+00  4.9966560e+00
+colloid_one_euler             0.0000000e+00  0.0000000e+00  0.0000000e+00
+
+Initialised 1 colloid
+
+Colloid cell list information
+-----------------------------
+Input radius maximum:         9.5620820e+00
+Final cell list:              12 1 1
+Final cell lengths:           1.0666667e+01  1.8000000e+01  1.8000000e+01
+
+Sedimentation force on:       yes
+Sedimentation force:          5.0000000e-03  0.0000000e+00  0.0000000e+00
+
+Initial conditions.
+
+Scalars - total mean variance min max
+[rho]       40464.00  1.00000000000  2.2204460e-16  1.00000000000  1.00000000000
+
+Momentum - x y z
+[total   ]  0.0000000e+00  0.0000000e+00  0.0000000e+00
+[fluid   ]  0.0000000e+00  0.0000000e+00  0.0000000e+00
+[colloids]  0.0000000e+00  0.0000000e+00  0.0000000e+00
+[walls   ]  0.0000000e+00  0.0000000e+00  0.0000000e+00
+
+Starting time step loop.
+
+Particle statistics:
+
+Colloid velocities - x y z
+[minimum ]  2.6153937e-05 -5.3604951e-18  1.8203251e-18
+[maximum ]  2.6153937e-05 -5.3604951e-18  1.8203251e-18
+
+Scalars - total mean variance min max
+[rho]       40464.00  1.00000000000  7.4389384e-12  0.99997187102  1.00002812813
+
+Momentum - x y z
+[total   ]  6.2665151e-14  8.8029209e-14 -3.2492556e-14
+[fluid   ] -1.9760295e-02 -1.1450476e-13 -4.4833668e-13
+[colloids]  2.8251562e-02 -3.2562774e-15  2.3560458e-16
+[walls   ] -8.4912674e-03  2.0579025e-13  4.1560852e-13
+
+Velocity - x y z
+[minimum ] -1.9016983e-06 -9.0171214e-06 -9.0171214e-06
+[maximum ]  2.4698222e-05  9.0171214e-06  9.0171214e-06
+
+Completed cycle 10
+
+Timer resolution: 1e-06 second
+
+Timer statistics
+             Section:       tmin       tmax      total
+               Total:      0.499      0.499      0.499   0.498647 (1 call)
+      Time step loop:      0.045      0.058      0.461   0.046146 (10 calls)
+         Propagation:      0.006      0.006      0.059   0.005870 (10 calls)
+    Propagtn (krnl) :      0.006      0.006      0.059   0.005869 (10 calls)
+           Collision:      0.029      0.030      0.293   0.029312 (10 calls)
+   Collision (krnl) :      0.029      0.030      0.293   0.029310 (10 calls)
+       Lattice halos:      0.001      0.002      0.026   0.001299 (20 calls)
+       phi gradients:      0.000      0.000      0.000   0.000000 (10 calls)
+              Forces:      0.000      0.000      0.001   0.000091 (10 calls)
+             Rebuild:      0.001      0.001      0.008   0.000846 (10 calls)
+                 BBL:      0.006      0.006      0.056   0.005586 (10 calls)
+      Particle halos:      0.000      0.000      0.000   0.000003 (10 calls)
+   Force calculation:      0.000      0.000      0.000   0.000000 (10 calls)
+          phi update:      0.000      0.000      0.000   0.000000 (10 calls)
+Diagnostics / output:      0.000      0.013      0.013   0.001299 (10 calls)
+End time: Tue Oct  1 10:00:36 2024
+Ludwig finished normally.

From 90dda9022688a9c7543404ca8f11835a3912b567 Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@epcc.ed.ac.uk>
Date: Fri, 4 Oct 2024 12:52:57 +0000
Subject: [PATCH 040/133] Check all tdp return values

---
 src/advection.c                   |  34 +--
 src/blue_phase.c                  |   4 +-
 src/blue_phase_beris_edwards.c    |   4 +-
 src/brazovskii.c                  |  26 +-
 src/cahn_hilliard.c               |   4 +-
 src/cahn_hilliard_stats.c         |   2 +-
 src/colloids.c                    |   6 +-
 src/coords.c                      |  12 +-
 src/distribution_rt.c             |   2 +-
 src/fe_electro.c                  |   8 +-
 src/fe_electro_symmetric.c        |  12 +-
 src/fe_null.c                     |   8 +-
 src/fe_ternary.c                  |   6 +-
 src/field.c                       |  59 ++---
 src/field_grad.c                  | 111 ++++-----
 src/gradient_3d_7pt_solid.c       |  12 +-
 src/gradient_s7_anchoring.c       |  16 +-
 src/halo_swap.c                   | 382 +++++++++++++++---------------
 src/hydro.c                       |   8 +-
 src/lb_data.c                     |  64 ++---
 src/lc_droplet.c                  |  23 +-
 src/leesedwards.c                 |  17 +-
 src/ludwig.c                      |   4 +-
 src/map.c                         |  14 +-
 src/noise.c                       |   6 +-
 src/pe.c                          |   4 +-
 src/phi_cahn_hilliard.c           |   2 +-
 src/phi_force_colloid.c           |   8 +-
 src/phi_force_stress.c            |  36 +--
 src/phi_stats.c                   |   2 +-
 src/polar_active.c                |  24 +-
 src/propagation.c                 |   2 +-
 src/stats_distribution.c          |   4 +-
 src/surfactant.c                  |  14 +-
 src/symmetric.c                   |  28 +--
 src/wall.c                        | 103 ++++----
 target/target_x86.c               |   8 +-
 tests/unit/test_blue_phase.c      |   2 +-
 tests/unit/test_ch.c              |   4 +-
 tests/unit/test_coords.c          |   5 +-
 tests/unit/test_fe_electro.c      |   2 +-
 tests/unit/test_fe_electro_symm.c |   3 +-
 tests/unit/test_fe_surfactant1.c  |   4 +-
 tests/unit/test_fe_ternary.c      |   4 +-
 tests/unit/test_field.c           |   3 +-
 tests/unit/test_halo.c            |   4 +-
 tests/unit/test_hydro.c           |   2 +-
 tests/unit/test_polar_active.c    |   4 +-
 tests/unit/test_prop.c            |   4 +-
 tests/unit/test_visc_arrhenius.c  |   4 +-
 50 files changed, 570 insertions(+), 554 deletions(-)

diff --git a/src/advection.c b/src/advection.c
index aae9cc88f..a3ef32813 100644
--- a/src/advection.c
+++ b/src/advection.c
@@ -191,7 +191,7 @@ __host__ int advflux_create(pe_t * pe, cs_t * cs, lees_edw_t * le, int nf,
 
   /* Allocate target copy of structure (or alias) */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     obj->target = obj;
@@ -260,24 +260,30 @@ __host__ int advflux_free(advflux_t * obj) {
 
   assert(obj);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) {
     if (obj->le == NULL) {
-    tdpMemcpy(&tmp, &obj->target->fx, sizeof(double *), tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
+      tdpAssert( tdpMemcpy(&tmp, &obj->target->fx, sizeof(double *),
+			   tdpMemcpyDeviceToHost) );
+      tdpAssert( tdpFree(tmp) );
     }
     else {
-    tdpMemcpy(&tmp, &obj->target->fe, sizeof(double *), tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpMemcpy(&tmp, &obj->target->fw, sizeof(double *), tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
+      tdpAssert( tdpMemcpy(&tmp, &obj->target->fe, sizeof(double *),
+			   tdpMemcpyDeviceToHost) );
+      tdpAssert( tdpFree(tmp) );
+      tdpAssert( tdpMemcpy(&tmp, &obj->target->fw, sizeof(double *),
+			   tdpMemcpyDeviceToHost) );
+      tdpAssert( tdpFree(tmp) );
     }
-    tdpMemcpy(&tmp, &obj->target->fy, sizeof(double *), tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpMemcpy(&tmp, &obj->target->fz, sizeof(double *), tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpFree(obj->target);
+
+    tdpAssert( tdpMemcpy(&tmp, &obj->target->fy, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &obj->target->fz, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpFree(obj->target) );
   }
 
   if (obj->le == NULL) {
@@ -373,7 +379,7 @@ __host__ int advflux_memcpy(advflux_t * adv, tdpMemcpyKind flag) {
 
   assert(adv);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     /* Ensure we alias */
diff --git a/src/blue_phase.c b/src/blue_phase.c
index de6b1fbf9..3b2f3a786 100644
--- a/src/blue_phase.c
+++ b/src/blue_phase.c
@@ -118,7 +118,7 @@ __host__ int fe_lc_create(pe_t * pe, cs_t * cs, lees_edw_t * le,
 
   /* Allocate device memory, or alias */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     fe->target = fe;
@@ -167,7 +167,7 @@ __host__ int fe_lc_free(fe_lc_t * fe) {
 
   assert(fe);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) tdpAssert(tdpFree(fe->target));
 
diff --git a/src/blue_phase_beris_edwards.c b/src/blue_phase_beris_edwards.c
index 00cf15f33..b6b9a0fae 100644
--- a/src/blue_phase_beris_edwards.c
+++ b/src/blue_phase_beris_edwards.c
@@ -144,7 +144,7 @@ __host__ int beris_edw_create(pe_t * pe, cs_t * cs, lees_edw_t * le,
 
   /* Allocate a target copy, or alias */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     obj->target = obj;
@@ -190,7 +190,7 @@ __host__ int beris_edw_free(beris_edw_t * be) {
 
   assert(be);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) {
     double * htmp;
diff --git a/src/brazovskii.c b/src/brazovskii.c
index 97c090c68..1ecead098 100644
--- a/src/brazovskii.c
+++ b/src/brazovskii.c
@@ -125,7 +125,7 @@ __host__ int fe_brazovskii_create(pe_t * pe, cs_t * cs, field_t * phi,
 
   /* Allocate device memory, or alias */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     obj->target = obj;
@@ -133,19 +133,19 @@ __host__ int fe_brazovskii_create(pe_t * pe, cs_t * cs, field_t * phi,
   else {
     fe_brazovskii_param_t * tmp;
     fe_vt_t * vt;
-    tdpMalloc((void **) &obj->target, sizeof(fe_brazovskii_t));
-    tdpMemset(obj->target, 0, sizeof(fe_brazovskii_t));
+    tdpAssert( tdpMalloc((void **) &obj->target, sizeof(fe_brazovskii_t)) );
+    tdpAssert( tdpMemset(obj->target, 0, sizeof(fe_brazovskii_t)) );
     tdpGetSymbolAddress((void **) &tmp, tdpSymbol(const_param));
-    tdpMemcpy(&obj->target->param, &tmp, sizeof(fe_brazovskii_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&obj->target->param, &tmp, sizeof(fe_brazovskii_t *),
+			 tdpMemcpyHostToDevice) );
     tdpGetSymbolAddress((void **) &vt, tdpSymbol(fe_braz_dvt));
-    tdpMemcpy(&obj->target->super.func, &vt, sizeof(fe_vt_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&obj->target->super.func, &vt, sizeof(fe_vt_t *),
+			 tdpMemcpyHostToDevice) );
 
-    tdpMemcpy(&obj->target->phi, &phi->target, sizeof(field_t *),
-	      tdpMemcpyHostToDevice);
-    tdpMemcpy(&obj->target->dphi, &dphi->target, sizeof(field_grad_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&obj->target->phi, &phi->target, sizeof(field_t *),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&obj->target->dphi, &dphi->target,
+			 sizeof(field_grad_t *), tdpMemcpyHostToDevice) );
   }
 
   *p = obj;
@@ -165,8 +165,8 @@ __host__ int fe_brazovskii_free(fe_brazovskii_t * fe) {
 
   assert(fe);
 
-  tdpGetDeviceCount(&ndevice);
-  if (ndevice > 0) tdpFree(fe->target);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
+  if (ndevice > 0) tdpAssert( tdpFree(fe->target) );
 
   free(fe->param);
   free(fe);
diff --git a/src/cahn_hilliard.c b/src/cahn_hilliard.c
index 2a95191e8..0ef1ec475 100644
--- a/src/cahn_hilliard.c
+++ b/src/cahn_hilliard.c
@@ -71,7 +71,7 @@ __host__ int ch_create(pe_t * pe, cs_t * cs, ch_info_t info, ch_t ** ch) {
   advflux_cs_create(pe, cs, info.nfield, &obj->flux);
   assert(obj->flux);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     obj->target = obj;
@@ -109,7 +109,7 @@ __host__ int ch_free(ch_t * ch) {
   {
     int ndevice = 0;
 
-    tdpGetDeviceCount(&ndevice);
+    tdpAssert( tdpGetDeviceCount(&ndevice) );
     if (ndevice > 0) tdpAssert(tdpFree(ch->target));
   }
 
diff --git a/src/cahn_hilliard_stats.c b/src/cahn_hilliard_stats.c
index 3d2c981a7..d5da6c932 100644
--- a/src/cahn_hilliard_stats.c
+++ b/src/cahn_hilliard_stats.c
@@ -209,7 +209,7 @@ __host__ int cahn_stats_reduce(phi_ch_t * pch, field_t * phi,
   MPI_Reduce(&local.max, &stats->max, 1, MPI_DOUBLE, MPI_MAX, root, comm);
   MPI_Reduce(&local.vol, &stats->vol, 1, MPI_DOUBLE, MPI_SUM, root, comm);
 
-  tdpFree(stats_d);
+  tdpAssert( tdpFree(stats_d) );
 
   return 0;
 }
diff --git a/src/colloids.c b/src/colloids.c
index b85d069d4..cc72f5011 100644
--- a/src/colloids.c
+++ b/src/colloids.c
@@ -81,7 +81,7 @@ __host__ int colloids_info_create(pe_t * pe, cs_t * cs, int ncell[3],
   obj->rho0 = RHO_DEFAULT;
   obj->drmax = DRMAX_DEFAULT;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     obj->target = obj;
@@ -184,7 +184,7 @@ __host__ int colloids_memcpy(colloids_info_t * info, int flag) {
   assert(info);
   assert(info->map_new);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     /* Bare pointer equality causes HIPCC to choke, hence explicit (()) */
@@ -284,7 +284,7 @@ __host__ int colloids_info_map_init(colloids_info_t * info) {
 
   /* Allocate data space on target */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) {
     void * tmp;
diff --git a/src/coords.c b/src/coords.c
index 5422d0a8a..70919e9dd 100644
--- a/src/coords.c
+++ b/src/coords.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2022 The University of Edinburgh
+ *  (c) 2010-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -216,18 +216,18 @@ __host__ int cs_init(cs_t * cs) {
 
   /* Device side */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     cs->target = cs;
   }
   else {
     cs_param_t * tmp;
-    tdpMalloc((void **) &cs->target, sizeof(cs_t));
-    tdpMemset(cs->target, 0, sizeof(cs_t));
+    tdpAssert( tdpMalloc((void **) &cs->target, sizeof(cs_t)) );
+    tdpAssert( tdpMemset(cs->target, 0, sizeof(cs_t)) );
     tdpGetSymbolAddress((void **) &tmp, tdpSymbol(const_param));
-    tdpMemcpy(&cs->target->param, (const void *) &tmp, sizeof(cs_param_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&cs->target->param, (const void *) &tmp,
+			 sizeof(cs_param_t *), tdpMemcpyHostToDevice) );
     cs_commit(cs);
   }
 
diff --git a/src/distribution_rt.c b/src/distribution_rt.c
index fcaf451fd..ec129963f 100644
--- a/src/distribution_rt.c
+++ b/src/distribution_rt.c
@@ -179,7 +179,7 @@ int lb_run_time_prev(pe_t * pe, cs_t * cs, rt_t * rt, lb_t ** lb) {
      * not as to disrupt the regression tests. */
     {
       int ndevice = 0;
-      tdpGetDeviceCount(&ndevice);
+      tdpAssert( tdpGetDeviceCount(&ndevice) );
       if (ndevice > 0) options.halo = LB_HALO_TARGET;
     }
 
diff --git a/src/fe_electro.c b/src/fe_electro.c
index b4e475aaf..4f5ffa819 100644
--- a/src/fe_electro.c
+++ b/src/fe_electro.c
@@ -27,7 +27,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2013-2023 The University of Edinburgh
+ *  (c) 2013-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Oliver Henrich  (ohenrich@epcc.ed.ac.uk)
@@ -120,7 +120,7 @@ __host__ int fe_electro_create(pe_t * pe, psi_t * psi, fe_electro_t ** pobj) {
   fe->super.func = &fe_electro_hvt;
   fe->super.id = FE_ELECTRO;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     fe->target = fe;
@@ -131,7 +131,7 @@ __host__ int fe_electro_create(pe_t * pe, psi_t * psi, fe_electro_t ** pobj) {
     fe_vt_t * vt = NULL;
 
     tdpAssert(tdpMalloc((void **) &fe->target, sizeof(fe_electro_t)));
-    tdpMemset(fe->target, 0, sizeof(fe_electro_t));
+    tdpAssert( tdpMemset(fe->target, 0, sizeof(fe_electro_t)) );
 
     tdpGetSymbolAddress((void **) &vt, tdpSymbol(fe_electro_dvt));
     tdpAssert(tdpMemcpy(&fe->target->super.func, &vt, sizeof(fe_vt_t *),
@@ -155,7 +155,7 @@ __host__ int fe_electro_free(fe_electro_t * fe) {
 
   assert(fe);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
   if (ndevice > 0) tdpAssert(tdpFree(fe->target));
 
   if (fe->mu_ref) free(fe->mu_ref);
diff --git a/src/fe_electro_symmetric.c b/src/fe_electro_symmetric.c
index 9e2917431..4575ae186 100644
--- a/src/fe_electro_symmetric.c
+++ b/src/fe_electro_symmetric.c
@@ -30,7 +30,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2013-2018 The University of Edinburgh
+ *  (c) 2013-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *    Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -138,7 +138,7 @@ __host__ int fe_es_create(pe_t * pe, cs_t * cs, fe_symm_t * symm,
   psi_nk(psi, &fe->param->nk);
   fe_es_epsilon_set(fe, psi->epsilon, psi->epsilon2);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     fe->target = fe;
@@ -147,10 +147,10 @@ __host__ int fe_es_create(pe_t * pe, cs_t * cs, fe_symm_t * symm,
     fe_vt_t * vt;
     fe_es_param_t * tmp;
 
-    tdpMalloc((void **) &fe->target, sizeof(fe_es_t));
+    tdpAssert( tdpMalloc((void **) &fe->target, sizeof(fe_es_t)) );
     tdpGetSymbolAddress((void **) &tmp, tdpSymbol(const_param));
-    tdpMemcpy(&fe->target->param, tmp, sizeof(fe_es_param_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&fe->target->param, tmp, sizeof(fe_es_param_t *),
+			 tdpMemcpyHostToDevice) );
     tdpGetSymbolAddress((void **) &vt, tdpSymbol(fe_es_dvt));
   }
 
@@ -169,7 +169,7 @@ __host__ int fe_es_free(fe_es_t * fe) {
 
   assert(fe);
 
-  if (fe->target != fe) tdpFree(fe->target);
+  if (fe->target != fe) tdpAssert( tdpFree(fe->target) );
 
   free(fe->param);
   free(fe);
diff --git a/src/fe_null.c b/src/fe_null.c
index 7d6fe8285..3cc48639d 100644
--- a/src/fe_null.c
+++ b/src/fe_null.c
@@ -8,7 +8,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group
  *  and Edinburgh Parallel Computing Centre
  *
- *  (c) 2022 The University of Edinburgh
+ *  (c) 2022-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -79,7 +79,7 @@ __host__ int fe_null_create(pe_t * pe, fe_null_t ** p) {
 
   /* Allocate target memory, or alias */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     fe->target = fe;
@@ -112,9 +112,9 @@ __host__ int fe_null_free(fe_null_t * fe) {
 
   assert(fe);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
-  if (ndevice > 0) tdpFree(fe->target);
+  if (ndevice > 0) tdpAssert( tdpFree(fe->target) );
   free(fe);
 
   return 0;
diff --git a/src/fe_ternary.c b/src/fe_ternary.c
index d4aa8a104..0bd5f7477 100644
--- a/src/fe_ternary.c
+++ b/src/fe_ternary.c
@@ -17,7 +17,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group
  *  and Edinburgh Parallel Computing Centre
  *
- *  (c) 2019-2021 The University of Edinburgh
+ *  (c) 2019-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Shan Chen (shan.chen@epfl.ch)
@@ -111,7 +111,7 @@ int fe_ternary_create(pe_t * pe, cs_t * cs, field_t * phi,
     
   /* Allocate target memory, or alias */
     
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
     
   if (ndevice == 0) {
     obj->target = obj;
@@ -163,7 +163,7 @@ __host__ int fe_ternary_free(fe_ternary_t * fe) {
     
   assert(fe);
     
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
   if (ndevice > 0) tdpAssert(tdpFree(fe->target));
     
   free(fe->param);
diff --git a/src/field.c b/src/field.c
index bb216f232..a312dc297 100644
--- a/src/field.c
+++ b/src/field.c
@@ -48,7 +48,7 @@ __host__ int field_init(field_t * obj, int nhcomm, lees_edw_t * le);
 #include "mpi-ext.h"
 #endif
 
-#ifdef __NVCC__
+#ifdef __HIPCC__
 /* There are two file-scope switches here, which need to be generalised
  * via some suitable interface; they are separate, but both relate to
  * GPU execution. */
@@ -168,13 +168,13 @@ __host__ int field_free(field_t * obj) {
 
   assert(obj);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) {
-    tdpMemcpy(&tmp, &obj->target->data, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpFree(obj->target);
+    tdpAssert( tdpMemcpy(&tmp, &obj->target->data, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpFree(obj->target) );
   }
 
   if (obj->data) free(obj->data);
@@ -239,7 +239,7 @@ __host__ int field_init(field_t * obj, int nhcomm, lees_edw_t * le) {
 
   /* Allocate target copy of structure (or alias) */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     obj->target = obj;
@@ -247,17 +247,17 @@ __host__ int field_init(field_t * obj, int nhcomm, lees_edw_t * le) {
   else {
     cs_t * cstarget = NULL;
     lees_edw_t * letarget = NULL;
-    tdpMalloc((void **) &obj->target, sizeof(field_t));
-    tdpMalloc((void **) &tmp, nfsz*sizeof(double));
-    tdpMemcpy(&obj->target->data, &tmp, sizeof(double *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMalloc((void **) &obj->target, sizeof(field_t)) );
+    tdpAssert( tdpMalloc((void **) &tmp, nfsz*sizeof(double)) );
+    tdpAssert( tdpMemcpy(&obj->target->data, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
 
     cs_target(obj->cs, &cstarget);
     if (le) lees_edw_target(obj->le, &letarget);
-    tdpMemcpy(&obj->target->cs, &cstarget, sizeof(cs_t *),
-	      tdpMemcpyHostToDevice);
-    tdpMemcpy(&obj->target->le, &letarget, sizeof(lees_edw_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&obj->target->cs, &cstarget, sizeof(cs_t *),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&obj->target->le, &letarget, sizeof(lees_edw_t *),
+			 tdpMemcpyHostToDevice) );
     field_memcpy(obj, tdpMemcpyHostToDevice);
   }
 
@@ -276,7 +276,7 @@ __host__ int field_memcpy(field_t * obj, tdpMemcpyKind flag) {
   size_t nfsz;
   double * tmp;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     /* Ensure we alias */
@@ -285,18 +285,18 @@ __host__ int field_memcpy(field_t * obj, tdpMemcpyKind flag) {
   else {
 
     nfsz = (size_t) obj->nf*obj->nsites;
-    tdpMemcpy(&tmp, &obj->target->data, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
+    tdpAssert( tdpMemcpy(&tmp, &obj->target->data, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
 
     switch (flag) {
     case tdpMemcpyHostToDevice:
-      tdpMemcpy(&obj->target->nf, &obj->nf, sizeof(int), flag);
-      tdpMemcpy(&obj->target->nhcomm, &obj->nhcomm, sizeof(int), flag);
-      tdpMemcpy(&obj->target->nsites, &obj->nsites, sizeof(int), flag);
-      tdpMemcpy(tmp, obj->data, nfsz*sizeof(double), flag);
+      tdpAssert( tdpMemcpy(&obj->target->nf, &obj->nf, sizeof(int), flag) );
+      tdpAssert( tdpMemcpy(&obj->target->nhcomm, &obj->nhcomm, sizeof(int), flag) );
+      tdpAssert( tdpMemcpy(&obj->target->nsites, &obj->nsites, sizeof(int), flag) );
+      tdpAssert( tdpMemcpy(tmp, obj->data, nfsz*sizeof(double), flag) );
       break;
     case tdpMemcpyDeviceToHost:
-      tdpMemcpy(obj->data, tmp, nfsz*sizeof(double), flag);
+      tdpAssert( tdpMemcpy(obj->data, tmp, nfsz*sizeof(double), flag) );
       break;
     default:
       pe_fatal(obj->pe, "Bad flag in field_memcpy\n");
@@ -1393,8 +1393,8 @@ int field_halo_create(const field_t * field, field_halo_t * h) {
 
   /* Device */
 
-  tdpGetDeviceCount(&ndevice);
-  tdpStreamCreate(&h->stream);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
+  tdpAssert( tdpStreamCreate(&h->stream) );
 
   if (ndevice == 0) {
     h->target = h;
@@ -1608,7 +1608,8 @@ int field_halo_free(field_halo_t * h) {
   assert(h);
 
   int ndevice = 0;
-  tdpGetDeviceCount(&ndevice);
+
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) {
     tdpAssert( tdpMemcpy(h->send_d, h->target->send, 27*sizeof(double *),
@@ -1616,10 +1617,10 @@ int field_halo_free(field_halo_t * h) {
     tdpAssert( tdpMemcpy(h->recv_d, h->target->recv, 27*sizeof(double *),
 			 tdpMemcpyDeviceToHost) );
     for (int p = 1; p < h->nvel; p++) {
-      tdpFree(h->send_d[p]);
-      tdpFree(h->recv_d[p]);
+      tdpAssert( tdpFree(h->send_d[p]) );
+      tdpAssert( tdpFree(h->recv_d[p]) );
     }
-    tdpFree(h->target);
+    tdpAssert( tdpFree(h->target) );
   }
 
   for (int p = 1; p < h->nvel; p++) {
diff --git a/src/field_grad.c b/src/field_grad.c
index 27e553e08..8382a9441 100644
--- a/src/field_grad.c
+++ b/src/field_grad.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2012-2023 The University of Edinburgh
+ *  (c) 2012-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -86,19 +86,20 @@ static int field_grad_init(field_grad_t * obj) {
   /* Failure in int32_t indexing ... */
   if (INT_MAX < nfsz || nfsz < 1) return -1;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     obj->target = obj;
   }
   else {
-    tdpMalloc((void **) &obj->target, sizeof(field_grad_t));
-    tdpMemset(obj->target, 0, sizeof(field_grad_t));
-    tdpMemcpy(&obj->target->nf, &obj->nf, sizeof(int), tdpMemcpyHostToDevice);
-    tdpMemcpy(&obj->target->nsite, &obj->nsite, sizeof(int),
-	      tdpMemcpyHostToDevice);
-    tdpMemcpy(&obj->target->field, &obj->field->target, sizeof(field_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMalloc((void **) &obj->target, sizeof(field_grad_t)) );
+    tdpAssert( tdpMemset(obj->target, 0, sizeof(field_grad_t)) );
+    tdpAssert( tdpMemcpy(&obj->target->nf, &obj->nf, sizeof(int),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&obj->target->nsite, &obj->nsite, sizeof(int),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&obj->target->field, &obj->field->target,
+			 sizeof(field_t *), tdpMemcpyHostToDevice) );
   }
 
   if (obj->level >= 2) {
@@ -112,13 +113,13 @@ static int field_grad_init(field_grad_t * obj) {
     /* Allocate data space on target (or alias) */
  
     if (ndevice > 0) {
-      tdpMalloc((void **) &tmp, nfsz*NVECTOR*sizeof(double));
-      tdpMemcpy(&obj->target->grad, &tmp, sizeof(double *),
-		tdpMemcpyHostToDevice);
+      tdpAssert( tdpMalloc((void **) &tmp, nfsz*NVECTOR*sizeof(double)) );
+      tdpAssert( tdpMemcpy(&obj->target->grad, &tmp, sizeof(double *),
+			   tdpMemcpyHostToDevice) );
 
-      tdpMalloc((void **) &tmp, nfsz*sizeof(double));
-      tdpMemcpy(&obj->target->delsq, &tmp, sizeof(double *),
-		tdpMemcpyHostToDevice);
+      tdpAssert( tdpMalloc((void **) &tmp, nfsz*sizeof(double)) );
+      tdpAssert( tdpMemcpy(&obj->target->delsq, &tmp, sizeof(double *),
+			   tdpMemcpyHostToDevice) );
     }
   }
 
@@ -128,9 +129,9 @@ static int field_grad_init(field_grad_t * obj) {
     if (obj->d_ab == NULL) pe_fatal(obj->pe, "calloc(fieldgrad->d_ab) failed\n");
 
     if (ndevice > 0) {
-      tdpMalloc((void **) &tmp, NSYMM*nfsz*sizeof(double));
-      tdpMemcpy(&obj->target->d_ab, &tmp, sizeof(double *),
-		tdpMemcpyHostToDevice);
+      tdpAssert( tdpMalloc((void **) &tmp, NSYMM*nfsz*sizeof(double)) );
+      tdpAssert( tdpMemcpy(&obj->target->d_ab, &tmp, sizeof(double *),
+			   tdpMemcpyHostToDevice) );
     }
   }
 
@@ -141,13 +142,13 @@ static int field_grad_init(field_grad_t * obj) {
     if (obj->delsq_delsq == NULL) pe_fatal(obj->pe, "calloc(grad->delsq_delsq) failed");
 
     if (ndevice > 0) {
-      tdpMalloc((void **) &tmp, NVECTOR*nfsz*sizeof(double));
-      tdpMemcpy(&obj->target->grad_delsq, &tmp, sizeof(double *),
-		tdpMemcpyHostToDevice); 
+      tdpAssert( tdpMalloc((void **) &tmp, NVECTOR*nfsz*sizeof(double)) );
+      tdpAssert( tdpMemcpy(&obj->target->grad_delsq, &tmp, sizeof(double *),
+			   tdpMemcpyHostToDevice) );
 
-      tdpMalloc((void **) &tmp, nfsz*sizeof(double));
-      tdpMemcpy(&obj->target->delsq_delsq, &tmp, sizeof(double *),
-		tdpMemcpyHostToDevice); 
+      tdpAssert( tdpMalloc((void **) &tmp, nfsz*sizeof(double)) );
+      tdpAssert( tdpMemcpy(&obj->target->delsq_delsq, &tmp, sizeof(double *),
+			   tdpMemcpyHostToDevice) );
     }
   }
 
@@ -168,7 +169,7 @@ __host__ int field_grad_memcpy(field_grad_t * obj, tdpMemcpyKind flag) {
 
   assert(obj);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     /* Ensure we alias */
@@ -195,21 +196,23 @@ __host__ int field_grad_memcpy(field_grad_t * obj, tdpMemcpyKind flag) {
 
     switch (flag) {
     case tdpMemcpyHostToDevice:
-      tdpMemcpy(&obj->target->nf, &obj->nf, sizeof(int),
-		tdpMemcpyHostToDevice);
-      tdpMemcpy(&obj->target->nsite, &obj->nsite, sizeof(int),
-		tdpMemcpyHostToDevice);
-
-      tdpMemcpy(grad, obj->grad, NVECTOR*nsz, tdpMemcpyHostToDevice);
-      tdpMemcpy(delsq, obj->delsq, nsz, tdpMemcpyHostToDevice);
+      tdpAssert( tdpMemcpy(&obj->target->nf, &obj->nf, sizeof(int),
+			   tdpMemcpyHostToDevice) );
+      tdpAssert( tdpMemcpy(&obj->target->nsite, &obj->nsite, sizeof(int),
+			   tdpMemcpyHostToDevice) );
+
+      tdpAssert( tdpMemcpy(grad, obj->grad, NVECTOR*nsz,
+			   tdpMemcpyHostToDevice) );
+      tdpAssert( tdpMemcpy(delsq, obj->delsq, nsz, tdpMemcpyHostToDevice) );
       if (obj->level >= 4) {
-	tdpMemcpy(grad_delsq, obj->grad_delsq, NVECTOR*nsz, flag);
-	tdpMemcpy(delsq_delsq, obj->delsq_delsq, nsz, flag);
+	tdpAssert( tdpMemcpy(grad_delsq, obj->grad_delsq, NVECTOR*nsz, flag) );
+	tdpAssert( tdpMemcpy(delsq_delsq, obj->delsq_delsq, nsz, flag) );
       }
       break;
     case tdpMemcpyDeviceToHost:
-      tdpMemcpy(obj->grad, grad, NVECTOR*nsz, tdpMemcpyDeviceToHost);
-      tdpMemcpy(obj->delsq, delsq, nsz, tdpMemcpyDeviceToHost);
+      tdpAssert( tdpMemcpy(obj->grad, grad, NVECTOR*nsz,
+			   tdpMemcpyDeviceToHost) );
+      tdpAssert( tdpMemcpy(obj->delsq, delsq, nsz, tdpMemcpyDeviceToHost) );
       if (obj->level >= 4) {
 	tdpAssert(tdpMemcpy(obj->grad_delsq, grad_delsq, nsz*NVECTOR, flag));
 	tdpAssert(tdpMemcpy(obj->delsq_delsq, delsq_delsq, nsz, flag));
@@ -273,26 +276,26 @@ __host__ void field_grad_free(field_grad_t * obj) {
 
   assert(obj);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) {
-    tdpMemcpy(&tmp, &obj->target->grad, sizeof(double *),
-	      tdpMemcpyDeviceToHost); 
-    tdpFree(tmp);
-    tdpMemcpy(&tmp, &obj->target->delsq, sizeof(double *),
-	      tdpMemcpyDeviceToHost); 
-    tdpFree(tmp);
-    tdpMemcpy(&tmp, &obj->target->d_ab, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    if (tmp) tdpFree(tmp);
-    tdpMemcpy(&tmp, &obj->target->grad_delsq, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    if (tmp) tdpFree(tmp);
-    tdpMemcpy(&tmp, &obj->target->delsq_delsq, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    if (tmp) tdpFree(tmp);
-
-    tdpFree(obj->target);
+    tdpAssert( tdpMemcpy(&tmp, &obj->target->grad, sizeof(double *),
+			 tdpMemcpyDeviceToHost) ); 
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &obj->target->delsq, sizeof(double *),
+			 tdpMemcpyDeviceToHost) ); 
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &obj->target->d_ab, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    if (tmp) tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &obj->target->grad_delsq, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    if (tmp) tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &obj->target->delsq_delsq, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    if (tmp) tdpAssert( tdpFree(tmp) );
+
+    tdpAssert( tdpFree(obj->target) );
   }
 
   if (obj->grad) free(obj->grad);
diff --git a/src/gradient_3d_7pt_solid.c b/src/gradient_3d_7pt_solid.c
index e5d07f6fc..680ad8d63 100644
--- a/src/gradient_3d_7pt_solid.c
+++ b/src/gradient_3d_7pt_solid.c
@@ -43,7 +43,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2011-2023 The University of Edinburgh
+ *  (c) 2011-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -132,7 +132,7 @@ __host__ int grad_lc_anch_create(pe_t * pe, cs_t * cs, map_t * map,
     lc_anchoring_matrices(fep.kappa0, fep.kappa1, &obj->bc);
   }
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     obj->target = obj;
@@ -140,7 +140,7 @@ __host__ int grad_lc_anch_create(pe_t * pe, cs_t * cs, map_t * map,
   else {
     /* Copy required entities over ... */
     cs_t * tcs = NULL;
-    tdpMalloc((void **) &obj->target, sizeof(grad_lc_anch_t));
+    tdpAssert( tdpMalloc((void **) &obj->target, sizeof(grad_lc_anch_t)) );
 
     cs_target(obj->cs, &tcs);
     tdpAssert(tdpMemcpy(&obj->target->cs, &tcs, sizeof(cs_t *),
@@ -172,7 +172,7 @@ __host__ int grad_lc_anch_free(grad_lc_anch_t * grad) {
 
   assert(grad);
 
-  if (grad->target != grad) tdpFree(grad->target);
+  if (grad->target != grad) tdpAssert( tdpFree(grad->target) );
   free(grad);
 
   return 0;
@@ -194,7 +194,7 @@ __host__ int grad_3d_7pt_solid_set(map_t * map, colloids_info_t * cinfo) {
   static_grad->map = map;
   static_grad->cinfo = cinfo;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
   if (ndevice) {
     tdpAssert(tdpMemcpy(&static_grad->target->cinfo, &cinfo->target,
 			sizeof(colloids_info_t *), tdpMemcpyHostToDevice));
@@ -271,7 +271,7 @@ int gradient_6x6(grad_lc_anch_t * anch, field_grad_t * fg, int nextra) {
 		    k3d, cstarget,
 		    anch->target, anch->fe->target, fg->target,
 		    anch->map->target);
-    tdpDeviceSynchronize();
+    tdpAssert( tdpDeviceSynchronize() );
   }
 
   return 0;
diff --git a/src/gradient_s7_anchoring.c b/src/gradient_s7_anchoring.c
index 3e9c7bb2c..2e5344b58 100644
--- a/src/gradient_s7_anchoring.c
+++ b/src/gradient_s7_anchoring.c
@@ -39,7 +39,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2022 The University of Edinburgh
+ *  (c) 2022-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -121,19 +121,19 @@ __host__ int grad_s7_anchoring_create(pe_t * pe, cs_t * cs, map_t * map,
     lc_anchoring_matrices(fep.kappa0, fep.kappa1, &obj->bc);
   }
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     obj->target = obj;
   }
   else {
     cs_t * cstarget = NULL;
-    tdpMalloc((void **) &obj->target, sizeof(grad_s7_anch_t));
-    tdpMemset(obj->target, 0, sizeof(grad_s7_anch_t));
+    tdpAssert( tdpMalloc((void **) &obj->target, sizeof(grad_s7_anch_t)) );
+    tdpAssert( tdpMemset(obj->target, 0, sizeof(grad_s7_anch_t)) );
 
     cs_target(obj->cs, &cstarget);
-    tdpMemcpy(&obj->target->cs, &cstarget, sizeof(cs_t *),
-              tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&obj->target->cs, &cstarget, sizeof(cs_t *),
+			 tdpMemcpyHostToDevice) );
 
     tdpAssert(tdpMemcpy(&obj->target->bc, &obj->bc,
 			sizeof(lc_anchoring_matrices_t),
@@ -160,7 +160,7 @@ __host__ int grad_s7_anchoring_cinfo_set(colloids_info_t * cinfo) {
 
   static_grad->cinfo = cinfo;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) {
     tdpAssert(tdpMemcpy(&static_grad->target->cinfo, &cinfo->target,
@@ -212,7 +212,7 @@ __host__ int grad_s7_anchoring_free(grad_s7_anch_t * grad) {
 
   assert(grad);
 
-  if (grad->target != grad) tdpFree(grad->target);
+  if (grad->target != grad) tdpAssert( tdpFree(grad->target) );
 
   free(grad);
 
diff --git a/src/halo_swap.c b/src/halo_swap.c
index 4c10d3092..b0a789c61 100644
--- a/src/halo_swap.c
+++ b/src/halo_swap.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2016-2020 The University of Edinburgh
+ *  (c) 2016-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Alan Gray (alang@epcc.ed.ac.uk)
@@ -167,30 +167,30 @@ __host__ int halo_swap_create(pe_t * pe, cs_t * cs, int nhcomm, int naddr,
   /* Host buffers, actual and halo regions */
 
   sz = (size_t) halo->param->hsz[X]*na*nb*sizeof(double);
-  tdpHostAlloc((void **) &halo->fxlo, sz, mflag);
-  tdpHostAlloc((void **) &halo->fxhi, sz, mflag);
-  tdpHostAlloc((void **) &halo->hxlo, sz, mflag);
-  tdpHostAlloc((void **) &halo->hxhi, sz, mflag);
+  tdpAssert( tdpHostAlloc((void **) &halo->fxlo, sz, mflag) );
+  tdpAssert( tdpHostAlloc((void **) &halo->fxhi, sz, mflag) );
+  tdpAssert( tdpHostAlloc((void **) &halo->hxlo, sz, mflag) );
+  tdpAssert( tdpHostAlloc((void **) &halo->hxhi, sz, mflag) );
 
   sz = (size_t) halo->param->hsz[Y]*na*nb*sizeof(double);
-  tdpHostAlloc((void **) &halo->fylo, sz, mflag);
-  tdpHostAlloc((void **) &halo->fyhi, sz, mflag);
-  tdpHostAlloc((void **) &halo->hylo, sz, mflag);
-  tdpHostAlloc((void **) &halo->hyhi, sz, mflag);
+  tdpAssert( tdpHostAlloc((void **) &halo->fylo, sz, mflag) );
+  tdpAssert( tdpHostAlloc((void **) &halo->fyhi, sz, mflag) );
+  tdpAssert( tdpHostAlloc((void **) &halo->hylo, sz, mflag) );
+  tdpAssert( tdpHostAlloc((void **) &halo->hyhi, sz, mflag) );
 
   sz = (size_t) halo->param->hsz[Z]*na*nb*sizeof(double);
-  tdpHostAlloc((void **) &halo->fzlo, sz, mflag);
-  tdpHostAlloc((void **) &halo->fzhi, sz, mflag);
-  tdpHostAlloc((void **) &halo->hzlo, sz, mflag);
-  tdpHostAlloc((void **) &halo->hzhi, sz, mflag);
+  tdpAssert( tdpHostAlloc((void **) &halo->fzlo, sz, mflag) );
+  tdpAssert( tdpHostAlloc((void **) &halo->fzhi, sz, mflag) );
+  tdpAssert( tdpHostAlloc((void **) &halo->hzlo, sz, mflag) );
+  tdpAssert( tdpHostAlloc((void **) &halo->hzhi, sz, mflag) );
 
-  tdpStreamCreate(&halo->stream[X]);
-  tdpStreamCreate(&halo->stream[Y]);
-  tdpStreamCreate(&halo->stream[Z]);
+  tdpAssert( tdpStreamCreate(&halo->stream[X]) );
+  tdpAssert( tdpStreamCreate(&halo->stream[Y]) );
+  tdpAssert( tdpStreamCreate(&halo->stream[Z]) );
 
   /* Device buffers: allocate or alias */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     halo->target = halo;
@@ -200,61 +200,61 @@ __host__ int halo_swap_create(pe_t * pe, cs_t * cs, int nhcomm, int naddr,
     halo_swap_param_t * tmpp;
 
     /* Target structure */
-    tdpMalloc((void **) &halo->target, sizeof(halo_swap_t));
-    tdpMemset(halo->target, 0, sizeof(halo_swap_t));
+    tdpAssert( tdpMalloc((void **) &halo->target, sizeof(halo_swap_t)) );
+    tdpAssert( tdpMemset(halo->target, 0, sizeof(halo_swap_t)) );
 
     /* Buffers */
     sz = (size_t) halo->param->hsz[X]*na*nb*sizeof(double);
 
-    tdpMalloc((void **) &tmp, sz);
-    tdpMemcpy(&halo->target->fxlo, &tmp, sizeof(double *),
-	      tdpMemcpyHostToDevice);
-    tdpMalloc((void **) &tmp, sz);
-    tdpMemcpy(&halo->target->fxhi, &tmp, sizeof(double *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMalloc((void **) &tmp, sz) );
+    tdpAssert( tdpMemcpy(&halo->target->fxlo, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMalloc((void **) &tmp, sz) );
+    tdpAssert( tdpMemcpy(&halo->target->fxhi, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
 
-    tdpMalloc((void **) &tmp, sz);
-    tdpMemcpy(&halo->target->hxlo, &tmp, sizeof(double *),
-	      tdpMemcpyHostToDevice);
-    tdpMalloc((void **) & tmp, sz);
-    tdpMemcpy(&halo->target->hxhi, &tmp, sizeof(double *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMalloc((void **) &tmp, sz) );
+    tdpAssert( tdpMemcpy(&halo->target->hxlo, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMalloc((void **) & tmp, sz) );
+    tdpAssert( tdpMemcpy(&halo->target->hxhi, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
 
     sz = (size_t) halo->param->hsz[Y]*na*nb*sizeof(double);
 
-    tdpMalloc((void ** ) &tmp, sz);
-    tdpMemcpy(&halo->target->fylo, &tmp, sizeof(double *),
-	      tdpMemcpyHostToDevice);
-    tdpMalloc((void **) &tmp, sz);
-    tdpMemcpy(&halo->target->fyhi, &tmp, sizeof(double *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMalloc((void ** ) &tmp, sz) );
+    tdpAssert( tdpMemcpy(&halo->target->fylo, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMalloc((void **) &tmp, sz) );
+    tdpAssert( tdpMemcpy(&halo->target->fyhi, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
 
-    tdpMalloc((void **) &tmp, sz);
-    tdpMemcpy(&halo->target->hylo, &tmp, sizeof(double *),
-	      tdpMemcpyHostToDevice);
-    tdpMalloc((void **) &tmp, sz);
-    tdpMemcpy(&halo->target->hyhi, &tmp, sizeof(double *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMalloc((void **) &tmp, sz) );
+    tdpAssert( tdpMemcpy(&halo->target->hylo, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMalloc((void **) &tmp, sz) );
+    tdpAssert( tdpMemcpy(&halo->target->hyhi, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
 
     sz = (size_t) halo->param->hsz[Z]*na*nb*sizeof(double);
 
-    tdpMalloc((void **) &tmp, sz);
-    tdpMemcpy(&halo->target->fzlo, &tmp, sizeof(double *),
-	      tdpMemcpyHostToDevice);
-    tdpMalloc((void **) &tmp, sz);
-    tdpMemcpy(&halo->target->fzhi, &tmp, sizeof(double *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMalloc((void **) &tmp, sz) );
+    tdpAssert( tdpMemcpy(&halo->target->fzlo, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMalloc((void **) &tmp, sz) );
+    tdpAssert( tdpMemcpy(&halo->target->fzhi, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
 
-    tdpMalloc((void **) &tmp, sz);
-    tdpMemcpy(&halo->target->hzlo, &tmp, sizeof(double *),
-	      tdpMemcpyHostToDevice);
-    tdpMalloc((void **) &tmp, sz);
-    tdpMemcpy(&halo->target->hzhi, &tmp, sizeof(double *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMalloc((void **) &tmp, sz) );
+    tdpAssert( tdpMemcpy(&halo->target->hzlo, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMalloc((void **) &tmp, sz) );
+    tdpAssert( tdpMemcpy(&halo->target->hzhi, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
 
     tdpGetSymbolAddress((void **) &tmpp, tdpSymbol(const_param));
-    tdpMemcpy(&halo->target->param, &tmpp, sizeof(halo_swap_param_t *),
-	      tdpMemcpyHostToDevice); 
+    tdpAssert( tdpMemcpy(&halo->target->param, &tmpp,
+			 sizeof(halo_swap_param_t *), tdpMemcpyHostToDevice) ); 
 
     /* Device constants */
     halo_swap_commit(halo);
@@ -277,69 +277,69 @@ __host__ int halo_swap_free(halo_swap_t * halo) {
 
   assert(halo);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) {
     double * tmp;
 
-    tdpMemcpy(&tmp, &halo->target->fxlo, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpMemcpy(&tmp, &halo->target->fxhi, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpMemcpy(&tmp, &halo->target->fylo, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpMemcpy(&tmp, &halo->target->fyhi, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpMemcpy(&tmp, &halo->target->fzlo, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpMemcpy(&tmp, &halo->target->fzhi, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-
-    tdpMemcpy(&tmp, &halo->target->hxlo, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpMemcpy(&tmp, &halo->target->hxhi, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpMemcpy(&tmp, &halo->target->hylo, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpMemcpy(&tmp, &halo->target->hyhi, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpMemcpy(&tmp, &halo->target->hzlo, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpMemcpy(&tmp, &halo->target->hzhi, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-
-    tdpFree(halo->target);
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->fxlo, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->fxhi, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->fylo, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->fyhi, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->fzlo, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->fzhi, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->hxlo, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->hxhi, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->hylo, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->hyhi, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->hzlo, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->hzhi, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+
+    tdpAssert( tdpFree(halo->target) );
   }
 
-  tdpFreeHost(halo->fxlo);
-  tdpFreeHost(halo->fxhi);
-  tdpFreeHost(halo->fylo);
-  tdpFreeHost(halo->fyhi);
-  tdpFreeHost(halo->fzlo);
-  tdpFreeHost(halo->fzhi);
+  tdpAssert( tdpFreeHost(halo->fxlo) );
+  tdpAssert( tdpFreeHost(halo->fxhi) );
+  tdpAssert( tdpFreeHost(halo->fylo) );
+  tdpAssert( tdpFreeHost(halo->fyhi) );
+  tdpAssert( tdpFreeHost(halo->fzlo) );
+  tdpAssert( tdpFreeHost(halo->fzhi) );
 
-  tdpFreeHost(halo->hxlo);
-  tdpFreeHost(halo->hxhi);
-  tdpFreeHost(halo->hylo);
-  tdpFreeHost(halo->hyhi);
-  tdpFreeHost(halo->hzlo);
-  tdpFreeHost(halo->hzhi);
+  tdpAssert( tdpFreeHost(halo->hxlo) );
+  tdpAssert( tdpFreeHost(halo->hxhi) );
+  tdpAssert( tdpFreeHost(halo->hylo) );
+  tdpAssert( tdpFreeHost(halo->hyhi) );
+  tdpAssert( tdpFreeHost(halo->hzlo) );
+  tdpAssert( tdpFreeHost(halo->hzhi) );
 
-  tdpStreamDestroy(halo->stream[X]);
-  tdpStreamDestroy(halo->stream[Y]);
-  tdpStreamDestroy(halo->stream[Z]);
+  tdpAssert( tdpStreamDestroy(halo->stream[X]) );
+  tdpAssert( tdpStreamDestroy(halo->stream[Y]) );
+  tdpAssert( tdpStreamDestroy(halo->stream[Z]) );
 
   free(halo->param);
   free(halo);
@@ -736,7 +736,7 @@ __host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
   /* 2D systems require fix... in the meantime...*/
   assert(halo->param->nlocal[Z] >= halo->param->nswap);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
   halo_swap_commit(halo);
 
   cs_cart_comm(halo->cs, &comm);
@@ -791,14 +791,14 @@ __host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
 
   if (ndevice > 0) {
     ncount = hsz[X]*halo->param->nfel;
-    tdpMemcpy(&tmp, &halo->target->fxlo, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpMemcpyAsync(halo->fxlo, tmp, ncount*sizeof(double),
-		   tdpMemcpyDeviceToHost, halo->stream[X]);
-    tdpMemcpy(&tmp, &halo->target->fxhi, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpMemcpyAsync(halo->fxhi, tmp, ncount*sizeof(double),
-		   tdpMemcpyDeviceToHost, halo->stream[X]);
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->fxlo, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpyAsync(halo->fxlo, tmp, ncount*sizeof(double),
+			      tdpMemcpyDeviceToHost, halo->stream[X]) );
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->fxhi, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpyAsync(halo->fxhi, tmp, ncount*sizeof(double),
+			      tdpMemcpyDeviceToHost, halo->stream[X]) );
   }
 
   /* pack Y edges on accelerator */
@@ -809,14 +809,14 @@ __host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
 
   if (ndevice > 0) {
     ncount = hsz[Y]*halo->param->nfel;
-    tdpMemcpy(&tmp, &halo->target->fylo, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpMemcpyAsync(halo->fylo, tmp, ncount*sizeof(double),
-		   tdpMemcpyDeviceToHost, halo->stream[Y]);
-    tdpMemcpy(&tmp, &halo->target->fyhi, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpMemcpyAsync(halo->fyhi, tmp, ncount*sizeof(double),
-		   tdpMemcpyDeviceToHost, halo->stream[Y]);
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->fylo, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpyAsync(halo->fylo, tmp, ncount*sizeof(double),
+			      tdpMemcpyDeviceToHost, halo->stream[Y]) );
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->fyhi, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpyAsync(halo->fyhi, tmp, ncount*sizeof(double),
+			      tdpMemcpyDeviceToHost, halo->stream[Y]) );
   }
 
   /* pack Z edges on accelerator */
@@ -827,36 +827,36 @@ __host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
 
   if (ndevice > 0) {
     ncount = hsz[Z]*halo->param->nfel;
-    tdpMemcpy(&tmp, &halo->target->fzlo, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpMemcpyAsync(halo->fzlo, tmp, ncount*sizeof(double),
-		   tdpMemcpyDeviceToHost, halo->stream[Z]);
-    tdpMemcpy(&tmp, &halo->target->fzhi, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpMemcpyAsync(halo->fzhi, tmp, ncount*sizeof(double),
-		   tdpMemcpyDeviceToHost, halo->stream[Z]);
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->fzlo, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpyAsync(halo->fzlo, tmp, ncount*sizeof(double),
+			      tdpMemcpyDeviceToHost, halo->stream[Z]) );
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->fzhi, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpyAsync(halo->fzhi, tmp, ncount*sizeof(double),
+			      tdpMemcpyDeviceToHost, halo->stream[Z]) );
   }
 
 
   /* Wait for X; copy or MPI recvs; put X halos back on device, and unpack */
 
-  tdpStreamSynchronize(halo->stream[X]);
+  tdpAssert( tdpStreamSynchronize(halo->stream[X]) );
   ncount = hsz[X]*halo->param->nfel;
 
   if (mpicartsz[X] == 1) {
     /* note these copies do not alias for ndevice == 1 */
     /* fxhi -> hxlo */
     memcpy(halo->hxlo, halo->fxhi, ncount*sizeof(double));
-    tdpMemcpy(&tmp, &halo->target->hxlo, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpMemcpyAsync(tmp, halo->fxhi, ncount*sizeof(double),
-		    tdpMemcpyHostToDevice, halo->stream[X]);
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->hxlo, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpyAsync(tmp, halo->fxhi, ncount*sizeof(double),
+			      tdpMemcpyHostToDevice, halo->stream[X]) );
     /* fxlo -> hxhi */
     memcpy(halo->hxhi, halo->fxlo, ncount*sizeof(double));
-    tdpMemcpy(&tmp, &halo->target->hxhi, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpMemcpyAsync(tmp, halo->fxlo, ncount*sizeof(double),
-		    tdpMemcpyHostToDevice, halo->stream[X]);
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->hxhi, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpyAsync(tmp, halo->fxlo, ncount*sizeof(double),
+			      tdpMemcpyHostToDevice, halo->stream[X]) );
   }
   else {
     MPI_Isend(halo->fxhi, ncount, MPI_DOUBLE,
@@ -867,16 +867,16 @@ __host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
     for (m = 0; m < 4; m++) {
       MPI_Waitany(4, req_x, &mc, status);
       if (mc == 0 && ndevice > 0) {
-	tdpMemcpy(&tmp, &halo->target->hxlo, sizeof(double *),
-		  tdpMemcpyDeviceToHost);
-	tdpMemcpyAsync(tmp, halo->hxlo, ncount*sizeof(double),
-		       tdpMemcpyHostToDevice, halo->stream[X]);
+	tdpAssert( tdpMemcpy(&tmp, &halo->target->hxlo, sizeof(double *),
+			     tdpMemcpyDeviceToHost) );
+	tdpAssert( tdpMemcpyAsync(tmp, halo->hxlo, ncount*sizeof(double),
+				  tdpMemcpyHostToDevice, halo->stream[X]) );
       }
       if (mc == 1 && ndevice > 0) {
-	tdpMemcpy(&tmp, &halo->target->hxhi, sizeof(double *),
-		  tdpMemcpyDeviceToHost);
-	tdpMemcpyAsync(tmp, halo->hxhi, ncount*sizeof(double),
-		       tdpMemcpyHostToDevice, halo->stream[X]);
+	tdpAssert( tdpMemcpy(&tmp, &halo->target->hxhi, sizeof(double *),
+			     tdpMemcpyDeviceToHost) );
+	tdpAssert( tdpMemcpyAsync(tmp, halo->hxhi, ncount*sizeof(double),
+				  tdpMemcpyHostToDevice, halo->stream[X]) );
       }
     }
   }
@@ -888,7 +888,7 @@ __host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
   /* Now wait for Y data to arrive from device */
   /* Fill in 4 corners of Y edge data from X halo */
 
-  tdpStreamSynchronize(halo->stream[Y]);
+  tdpAssert( tdpStreamSynchronize(halo->stream[Y]) );
 
   ih = halo->param->hext[Y][X] - nh;
   jh = halo->param->hext[X][Y] - nh - halo->param->nswap;
@@ -921,16 +921,16 @@ __host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
   if (mpicartsz[Y] == 1) {
     /* fyhi -> hylo */
     memcpy(halo->hylo, halo->fyhi, ncount*sizeof(double));
-    tdpMemcpy(&tmp, &halo->target->hylo, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpMemcpyAsync(tmp, halo->fyhi, ncount*sizeof(double),
-		   tdpMemcpyHostToDevice, halo->stream[Y]);
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->hylo, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpyAsync(tmp, halo->fyhi, ncount*sizeof(double),
+			      tdpMemcpyHostToDevice, halo->stream[Y]) );
     /* fylo -> hyhi */
     memcpy(halo->hyhi, halo->fylo, ncount*sizeof(double));
-    tdpMemcpy(&tmp, &halo->target->hyhi, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpMemcpyAsync(tmp, halo->fylo,ncount*sizeof(double),
-		   tdpMemcpyHostToDevice, halo->stream[Y]);
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->hyhi, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpyAsync(tmp, halo->fylo,ncount*sizeof(double),
+			      tdpMemcpyHostToDevice, halo->stream[Y]) );
   }
   else {
     MPI_Isend(halo->fyhi, ncount, MPI_DOUBLE,
@@ -941,16 +941,16 @@ __host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
     for (m = 0; m < 4; m++) {
       MPI_Waitany(4, req_y, &mc, status);
       if (mc == 0 && ndevice > 0) {
-	tdpMemcpy(&tmp, &halo->target->hylo, sizeof(double *),
-		  tdpMemcpyDeviceToHost);
-	tdpMemcpyAsync(tmp, halo->hylo, ncount*sizeof(double),
-		       tdpMemcpyHostToDevice, halo->stream[Y]);
+	tdpAssert( tdpMemcpy(&tmp, &halo->target->hylo, sizeof(double *),
+			     tdpMemcpyDeviceToHost) );
+	tdpAssert( tdpMemcpyAsync(tmp, halo->hylo, ncount*sizeof(double),
+				  tdpMemcpyHostToDevice, halo->stream[Y]) );
       }
       if (mc == 1 && ndevice > 0) {
-	tdpMemcpy(&tmp, &halo->target->hyhi, sizeof(double *),
-		  tdpMemcpyDeviceToHost);
-	tdpMemcpyAsync(tmp, halo->hyhi, ncount*sizeof(double),
-			tdpMemcpyHostToDevice, halo->stream[Y]);
+	tdpAssert( tdpMemcpy(&tmp, &halo->target->hyhi, sizeof(double *),
+			     tdpMemcpyDeviceToHost) );
+	tdpAssert( tdpMemcpyAsync(tmp, halo->hyhi, ncount*sizeof(double),
+				  tdpMemcpyHostToDevice, halo->stream[Y]) );
       }
     }
   }
@@ -963,7 +963,7 @@ __host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
   /* Wait for Z data from device */
   /* Fill in 4 corners of Z edge data from X halo  */
 
-  tdpStreamSynchronize(halo->stream[Z]);
+  tdpAssert( tdpStreamSynchronize(halo->stream[Z]) );
 
   ih = halo->param->hext[Z][X] - nh;
   kh = halo->param->hext[X][Z] - nh - halo->param->nswap;
@@ -1018,15 +1018,15 @@ __host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
 
   if (mpicartsz[Z] == 1) {
     /* fzhi -> hzlo */
-    tdpMemcpy(&tmp, &halo->target->hzlo, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpMemcpyAsync(tmp, halo->fzhi, ncount*sizeof(double),
-		   tdpMemcpyHostToDevice, halo->stream[Z]);
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->hzlo, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpyAsync(tmp, halo->fzhi, ncount*sizeof(double),
+			      tdpMemcpyHostToDevice, halo->stream[Z]) );
     /* fzlo -> hzhi */
-    tdpMemcpy(&tmp, &halo->target->hzhi, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpMemcpyAsync(tmp, halo->fzlo, ncount*sizeof(double),
-		   tdpMemcpyHostToDevice, halo->stream[Z]);
+    tdpAssert( tdpMemcpy(&tmp, &halo->target->hzhi, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpyAsync(tmp, halo->fzlo, ncount*sizeof(double),
+			      tdpMemcpyHostToDevice, halo->stream[Z]) );
   }
   else {
     MPI_Isend(halo->fzhi, ncount, MPI_DOUBLE,
@@ -1037,16 +1037,16 @@ __host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
     for (m = 0; m < 4; m++) {
       MPI_Waitany(4, req_z, &mc, status);
       if (mc == 0 && ndevice > 0) {
-	tdpMemcpy(&tmp, &halo->target->hzlo, sizeof(double *),
-		  tdpMemcpyDeviceToHost);
-	tdpMemcpyAsync(tmp, halo->hzlo, ncount*sizeof(double),
-		       tdpMemcpyHostToDevice, halo->stream[Z]);
+	tdpAssert( tdpMemcpy(&tmp, &halo->target->hzlo, sizeof(double *),
+			     tdpMemcpyDeviceToHost) );
+	tdpAssert( tdpMemcpyAsync(tmp, halo->hzlo, ncount*sizeof(double),
+				  tdpMemcpyHostToDevice, halo->stream[Z]) );
       }
       if (mc == 1 && ndevice > 0) {
-	tdpMemcpy(&tmp, &halo->target->hzhi, sizeof(double *),
-		  tdpMemcpyDeviceToHost);
-	tdpMemcpyAsync(tmp, halo->hzhi, ncount*sizeof(double),
-		       tdpMemcpyHostToDevice, halo->stream[Z]);
+	tdpAssert( tdpMemcpy(&tmp, &halo->target->hzhi, sizeof(double *),
+			     tdpMemcpyDeviceToHost) );
+	tdpAssert( tdpMemcpyAsync(tmp, halo->hzhi, ncount*sizeof(double),
+				  tdpMemcpyHostToDevice, halo->stream[Z]) );
       }
     }
   }
@@ -1055,9 +1055,9 @@ __host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
   tdpLaunchKernel(halo->data_unpack, nblk, ntpb, 0, halo->stream[Z],
 		  halo->target, Z, data);
 
-  tdpStreamSynchronize(halo->stream[X]);
-  tdpStreamSynchronize(halo->stream[Y]);
-  tdpStreamSynchronize(halo->stream[Z]);
+  tdpAssert( tdpStreamSynchronize(halo->stream[X]) );
+  tdpAssert( tdpStreamSynchronize(halo->stream[Y]) );
+  tdpAssert( tdpStreamSynchronize(halo->stream[Z]) );
 
   return 0;
 }
diff --git a/src/hydro.c b/src/hydro.c
index 6e3c40d83..65fd9e908 100644
--- a/src/hydro.c
+++ b/src/hydro.c
@@ -85,7 +85,7 @@ __host__ int hydro_create(pe_t * pe, cs_t * cs, lees_edw_t * le,
 
   /* Allocate target copy of structure (or alias) */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     obj->target = obj;
@@ -126,7 +126,7 @@ __host__ int hydro_free(hydro_t * obj) {
 
   assert(obj);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) tdpAssert(tdpFree(obj->target));
 
@@ -152,7 +152,7 @@ __host__ int hydro_memcpy(hydro_t * obj, tdpMemcpyKind flag) {
 
   assert(obj);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     /* Ensure we alias */
@@ -722,7 +722,7 @@ __host__ int hydro_correct_momentum(hydro_t * hydro) {
 
   /* Apply correction and finish */
 
-  tdpMemcpy(fnetd, fnet, 3*sizeof(double), tdpMemcpyHostToDevice);
+  tdpAssert( tdpMemcpy(fnetd, fnet, 3*sizeof(double), tdpMemcpyHostToDevice) );
 
   {
     dim3 nblk = {};
diff --git a/src/lb_data.c b/src/lb_data.c
index 15e33486e..c7ba18f8a 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -196,20 +196,21 @@ int lb_data_create(pe_t * pe, cs_t * cs, const lb_data_options_t * options,
 __host__ int lb_free(lb_t * lb) {
 
   int ndevice;
-  double * tmp;
 
   assert(lb);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) {
-    tdpMemcpy(&tmp, &lb->target->f, sizeof(double *), tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-
-    tdpMemcpy(&tmp, &lb->target->fprime, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    tdpFree(tmp);
-    tdpFree(lb->target);
+    double * tmp = NULL;
+    tdpAssert( tdpMemcpy(&tmp, &lb->target->f, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+
+    tdpAssert( tdpMemcpy(&tmp, &lb->target->fprime, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpFree(lb->target) );
   }
 
   io_metadata_finalise(&lb->input);
@@ -241,7 +242,7 @@ __host__ int lb_memcpy(lb_t * lb, tdpMemcpyKind flag) {
 
   assert(lb);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     /* Make sure we alias */
@@ -253,18 +254,19 @@ __host__ int lb_memcpy(lb_t * lb, tdpMemcpyKind flag) {
 
     assert(lb->target);
 
-    tdpMemcpy(&tmpf, &lb->target->f, sizeof(double *), tdpMemcpyDeviceToHost);
+    tdpAssert( tdpMemcpy(&tmpf, &lb->target->f, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
 
     switch (flag) {
     case tdpMemcpyHostToDevice:
-      tdpMemcpy(&lb->target->ndim,  &lb->ndim,  sizeof(int), flag);
-      tdpMemcpy(&lb->target->nvel,  &lb->nvel,  sizeof(int), flag);
-      tdpMemcpy(&lb->target->ndist, &lb->ndist, sizeof(int), flag);
-      tdpMemcpy(&lb->target->nsite, &lb->nsite, sizeof(int), flag);
-      tdpMemcpy(tmpf, lb->f, nsz, flag);
+      tdpAssert( tdpMemcpy(&lb->target->ndim,  &lb->ndim,  sizeof(int), flag) );
+      tdpAssert( tdpMemcpy(&lb->target->nvel,  &lb->nvel,  sizeof(int), flag) );
+      tdpAssert( tdpMemcpy(&lb->target->ndist, &lb->ndist, sizeof(int), flag) );
+      tdpAssert( tdpMemcpy(&lb->target->nsite, &lb->nsite, sizeof(int), flag) );
+      tdpAssert( tdpMemcpy(tmpf, lb->f, nsz, flag) );
       break;
     case tdpMemcpyDeviceToHost:
-      tdpMemcpy(lb->f, tmpf, nsz, flag);
+      tdpAssert( tdpMemcpy(lb->f, tmpf, nsz, flag) );
       break;
     default:
       pe_fatal(lb->pe, "Bad flag in lb_memcpy\n");
@@ -307,7 +309,7 @@ static int lb_init(lb_t * lb) {
 
   ndata = lb->nsite*lb->ndist*lb->model.nvel;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     lb->target = lb;
@@ -315,21 +317,22 @@ static int lb_init(lb_t * lb) {
   else {
     lb_collide_param_t * ptmp  = NULL;
 
-    tdpMalloc((void **) &lb->target, sizeof(lb_t));
-    tdpMemset(lb->target, 0, sizeof(lb_t));
+    tdpAssert( tdpMalloc((void **) &lb->target, sizeof(lb_t)) );
+    tdpAssert( tdpMemset(lb->target, 0, sizeof(lb_t)) );
 
-    tdpMalloc((void **) &tmp, ndata*sizeof(double));
-    tdpMemset(tmp, 0, ndata*sizeof(double));
-    tdpMemcpy(&lb->target->f, &tmp, sizeof(double *), tdpMemcpyHostToDevice);
+    tdpAssert( tdpMalloc((void **) &tmp, ndata*sizeof(double)) );
+    tdpAssert( tdpMemset(tmp, 0, ndata*sizeof(double)) );
+    tdpAssert( tdpMemcpy(&lb->target->f, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
 
-    tdpMalloc((void **) &tmp, ndata*sizeof(double));
-    tdpMemset(tmp, 0, ndata*sizeof(double));
-    tdpMemcpy(&lb->target->fprime, &tmp, sizeof(double *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMalloc((void **) &tmp, ndata*sizeof(double)) );
+    tdpAssert( tdpMemset(tmp, 0, ndata*sizeof(double)) );
+    tdpAssert( tdpMemcpy(&lb->target->fprime, &tmp, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
 
     tdpGetSymbolAddress((void **) &ptmp, tdpSymbol(static_param));
-    tdpMemcpy(&lb->target->param, &ptmp, sizeof(lb_collide_param_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&lb->target->param, &ptmp,
+			 sizeof(lb_collide_param_t *), tdpMemcpyHostToDevice));
   }
 
   lb_mpi_init(lb);
@@ -539,7 +542,8 @@ __host__ int lb_halo_swap(lb_t * lb, lb_halo_enum_t flag) {
 
   switch (flag) {
   case LB_HALO_TARGET:
-    tdpMemcpy(&data, &lb->target->f, sizeof(double *), tdpMemcpyDeviceToHost);
+    tdpAssert( tdpMemcpy(&data, &lb->target->f, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
     halo_swap_packed(lb->halo, data);
     break;
   case LB_HALO_OPENMP_FULL:
diff --git a/src/lc_droplet.c b/src/lc_droplet.c
index 1359cdff1..76e31f3f4 100644
--- a/src/lc_droplet.c
+++ b/src/lc_droplet.c
@@ -123,7 +123,7 @@ __host__ int fe_lc_droplet_create(pe_t * pe, cs_t * cs, fe_lc_t * lc,
   fe->super.func = &fe_drop_hvt;
   fe->super.id = FE_LC_DROPLET;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     fe->target = fe;
@@ -131,19 +131,20 @@ __host__ int fe_lc_droplet_create(pe_t * pe, cs_t * cs, fe_lc_t * lc,
   else {
     fe_lc_droplet_param_t * tmp;
     fe_vt_t * vt;
-    tdpMalloc((void **) &fe->target, sizeof(fe_lc_droplet_t));
-    tdpMemset(fe->target, 0, sizeof(fe_lc_droplet_t));
+    tdpAssert( tdpMalloc((void **) &fe->target, sizeof(fe_lc_droplet_t)) );
+    tdpAssert( tdpMemset(fe->target, 0, sizeof(fe_lc_droplet_t)) );
     tdpGetSymbolAddress((void **) &tmp, tdpSymbol(const_param));
-    tdpMemcpy(&fe->target->param, &tmp, sizeof(fe_lc_droplet_param_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&fe->target->param, &tmp,
+			 sizeof(fe_lc_droplet_param_t *),
+			 tdpMemcpyHostToDevice) );
     tdpGetSymbolAddress((void **) &vt, tdpSymbol(fe_drop_dvt));
-    tdpMemcpy(&fe->target->super.func, &vt, sizeof(fe_vt_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&fe->target->super.func, &vt, sizeof(fe_vt_t *),
+			 tdpMemcpyHostToDevice) );
 
-    tdpMemcpy(&fe->target->lc, &lc->target, sizeof(fe_lc_t *),
-	      tdpMemcpyHostToDevice);
-    tdpMemcpy(&fe->target->symm, &symm->target, sizeof(fe_symm_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&fe->target->lc, &lc->target, sizeof(fe_lc_t *),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&fe->target->symm, &symm->target, sizeof(fe_symm_t *),
+			 tdpMemcpyHostToDevice) );
 
     {
       /* Provide constant memory for lc parameters */
diff --git a/src/leesedwards.c b/src/leesedwards.c
index f7b7d1a28..a5577aad6 100644
--- a/src/leesedwards.c
+++ b/src/leesedwards.c
@@ -9,7 +9,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2023 The University of Edinburgh
+ *  (c) 2010-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -106,7 +106,7 @@ __host__ int lees_edw_create(pe_t * pe, cs_t * cs,
   lees_edw_init_tables(le);
   le->nref = 1;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     le->target = le;
@@ -115,14 +115,15 @@ __host__ int lees_edw_create(pe_t * pe, cs_t * cs,
     lees_edw_param_t * tmp;
     cs_t * cst;
 
-    tdpMalloc((void **) &le->target, sizeof(lees_edw_t));
-    tdpMemset(le->target, 0, sizeof(lees_edw_t));
+    tdpAssert( tdpMalloc((void **) &le->target, sizeof(lees_edw_t)) );
+    tdpAssert( tdpMemset(le->target, 0, sizeof(lees_edw_t)) );
     tdpGetSymbolAddress((void **) &tmp, tdpSymbol(static_param));
-    tdpMemcpy(&le->target->param, (const void *) &tmp,
-	      sizeof(lees_edw_param_t *), tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&le->target->param, (const void *) &tmp,
+			 sizeof(lees_edw_param_t *), tdpMemcpyHostToDevice) );
 
     cs_target(cs, &cst);
-    tdpMemcpy(&le->target->cs, &cst, sizeof(cs_t *), tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&le->target->cs, &cst, sizeof(cs_t *),
+			 tdpMemcpyHostToDevice) );
 
     lees_edw_commit(le);
   }
@@ -161,7 +162,7 @@ __host__ int lees_edw_free(lees_edw_t * le) {
 
   if (le->nref <= 0) {
 
-    if (le->target != le) tdpFree(le->target);
+    if (le->target != le) tdpAssert( tdpFree(le->target) );
 
     pe_free(le->pe);
     cs_free(le->cs);
diff --git a/src/ludwig.c b/src/ludwig.c
index b2d80e050..c17dd1be3 100644
--- a/src/ludwig.c
+++ b/src/ludwig.c
@@ -468,7 +468,7 @@ void ludwig_run(const char * inputfile) {
     MPI_Comm_rank(node_comm, &node_rank);
     MPI_Comm_size(node_comm, &node_size);
 
-    tdpGetDeviceCount(&ndevice);
+    tdpAssert( tdpGetDeviceCount(&ndevice) );
 
     if (ndevice > 0 && ndevice < node_size) {
       pe_info(ludwig->pe,  "MPI tasks per node: %d\n", node_size);
@@ -2136,7 +2136,7 @@ int ludwig_colloids_update(ludwig_t * ludwig) {
   colloids_info_ntotal(ludwig->collinfo, &ncolloid);
   if (ncolloid == 0) return 0;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   lb_ndist(ludwig->lb, &ndist);
   iconserve = (ludwig->psi || (ludwig->phi && ndist == 1));
diff --git a/src/map.c b/src/map.c
index fe4514f94..ff30a96ab 100644
--- a/src/map.c
+++ b/src/map.c
@@ -175,7 +175,7 @@ int map_initialise(pe_t * pe, cs_t * cs, const map_options_t * options,
 
   /* Allocate target copy of structure (or alias) */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     map->target = map;
@@ -236,7 +236,7 @@ int map_finalise(map_t * map) {
 
   int ndevice = 0;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) {
     char * status = NULL;
@@ -278,22 +278,22 @@ int map_memcpy(map_t * map, tdpMemcpyKind flag) {
 
   assert(map);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     /* Ensure we alias */
     assert(map->target == map);
   }
   else {
-    tdpMemcpy(&tmp, &map->target->status, sizeof(char *),
-	      tdpMemcpyDeviceToHost);
+    tdpAssert( tdpMemcpy(&tmp, &map->target->status, sizeof(char *),
+			 tdpMemcpyDeviceToHost) );
 
     switch (flag) {
     case tdpMemcpyHostToDevice:
-      tdpMemcpy(tmp, map->status, map->nsite*sizeof(char), flag);
+      tdpAssert( tdpMemcpy(tmp, map->status, map->nsite*sizeof(char), flag) );
       break;
     case tdpMemcpyDeviceToHost:
-      tdpMemcpy(map->status, tmp, map->nsite*sizeof(char), flag);
+      tdpAssert( tdpMemcpy(map->status, tmp, map->nsite*sizeof(char), flag) );
       break;
     default:
       pe_fatal(map->pe, "Bad flag in map_memcpy()\n");
diff --git a/src/noise.c b/src/noise.c
index bcb26b9b8..1bb58df1a 100644
--- a/src/noise.c
+++ b/src/noise.c
@@ -206,7 +206,7 @@ int noise_initialise(pe_t * pe, cs_t * cs, const noise_options_t * options,
 
   /* Device allocations */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     ns->target = ns;
@@ -245,7 +245,7 @@ int noise_finalise(noise_t * ns) {
 
   assert(ns);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) {
     unsigned int * state = NULL;
@@ -356,7 +356,7 @@ int noise_memcpy(noise_t * ns, tdpMemcpyKind flag) {
 
   assert(ns);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     assert(ns->target == ns);
diff --git a/src/pe.c b/src/pe.c
index 517a7d950..4b137be72 100644
--- a/src/pe.c
+++ b/src/pe.c
@@ -15,7 +15,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2023 The University of Edinburgh
+ *  (c) 2010-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -185,7 +185,7 @@ __host__ int pe_message(pe_t * pe) {
     assert(printf("Note assertions via standard C assert() are on.\n\n"));
 
     /* Thread model */
-    tdpThreadModelInfo(stdout);
+    tdpAssert( tdpThreadModelInfo(stdout) );
     printf("\n");
   }
 
diff --git a/src/phi_cahn_hilliard.c b/src/phi_cahn_hilliard.c
index f1122b286..f8cb99f9b 100644
--- a/src/phi_cahn_hilliard.c
+++ b/src/phi_cahn_hilliard.c
@@ -1163,7 +1163,7 @@ static int phi_ch_subtract_sum_phi_after_forward_step(phi_ch_t * pch, field_t *
     tdpAssert(tdpDeviceSynchronize());
   }
 
-  tdpFree(local_d);
+  tdpAssert( tdpFree(local_d) );
 
   return 0;
 }
diff --git a/src/phi_force_colloid.c b/src/phi_force_colloid.c
index c356ef524..40d40a4f2 100644
--- a/src/phi_force_colloid.c
+++ b/src/phi_force_colloid.c
@@ -146,7 +146,7 @@ __host__ int pth_force_driver(pth_t * pth, colloids_info_t * cinfo,
     kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb);
 
     tdpAssert( tdpMalloc((void **) &fwd, 3*sizeof(double)) );
-    tdpMemcpy(fwd, fw, 3*sizeof(double), tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(fwd, fw, 3*sizeof(double), tdpMemcpyHostToDevice) );
 
     TIMER_start(TIMER_PHI_FORCE_CALC);
 
@@ -155,10 +155,10 @@ __host__ int pth_force_driver(pth_t * pth, colloids_info_t * cinfo,
 
     tdpLaunchKernel(pth_force_wall_kernel, nblk, ntpb, 0, 0,
 		    k3d, pth->target, map->target, wallt, fwd);
-    tdpDeviceSynchronize();
+    tdpAssert( tdpDeviceSynchronize() );
   }
 
-  tdpMemcpy(fw, fwd, 3*sizeof(double), tdpMemcpyDeviceToHost);
+  tdpAssert( tdpMemcpy(fw, fwd, 3*sizeof(double), tdpMemcpyDeviceToHost) );
   wall_momentum_add(wall, fw);
 
   tdpAssert( tdpFree(fwd) );
@@ -257,7 +257,7 @@ __host__ int pth_force_fluid_wall_driver(pth_t * pth, hydro_t * hydro,
     tdpAssert( tdpDeviceSynchronize() );
   }
 
-  tdpMemcpy(fw, fwd, 3*sizeof(double), tdpMemcpyDeviceToHost);
+  tdpAssert( tdpMemcpy(fw, fwd, 3*sizeof(double), tdpMemcpyDeviceToHost) );
   wall_momentum_add(wall, fw);
 
   return 0;
diff --git a/src/phi_force_stress.c b/src/phi_force_stress.c
index 629bb5e6d..b67ce7902 100644
--- a/src/phi_force_stress.c
+++ b/src/phi_force_stress.c
@@ -63,7 +63,7 @@ __host__ int pth_create(pe_t * pe, cs_t * cs, int method, pth_t ** pobj) {
 
   /* Allocate target memory, or alias */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     obj->target = obj;
@@ -73,15 +73,15 @@ __host__ int pth_create(pe_t * pe, cs_t * cs, int method, pth_t ** pobj) {
     int imem = (method == FE_FORCE_METHOD_STRESS_DIVERGENCE)
             || (method == FE_FORCE_METHOD_RELAXATION_ANTI);
 
-    tdpMalloc((void **) &obj->target, sizeof(pth_t));
-    tdpMemset(obj->target, 0, sizeof(pth_t));
-    tdpMemcpy(&obj->target->nsites, &obj->nsites, sizeof(int),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMalloc((void **) &obj->target, sizeof(pth_t)) );
+    tdpAssert( tdpMemset(obj->target, 0, sizeof(pth_t)) );
+    tdpAssert( tdpMemcpy(&obj->target->nsites, &obj->nsites, sizeof(int),
+			 tdpMemcpyHostToDevice) );
 
     if (imem) {
-      tdpMalloc((void **) &tmp, 3*3*obj->nsites*sizeof(double));
-      tdpMemcpy(&obj->target->str, &tmp, sizeof(double *),
-		tdpMemcpyHostToDevice);
+      tdpAssert( tdpMalloc((void **) &tmp, 3*3*obj->nsites*sizeof(double)) );
+      tdpAssert( tdpMemcpy(&obj->target->str, &tmp, sizeof(double *),
+			   tdpMemcpyHostToDevice) );
     }
   }
 
@@ -103,13 +103,13 @@ __host__ int pth_free(pth_t * pth) {
 
   assert(pth);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) {
-    tdpMemcpy(&tmp, &pth->target->str, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
-    if (tmp) tdpFree(tmp);
-    tdpFree(pth->target);
+    tdpAssert( tdpMemcpy(&tmp, &pth->target->str, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    if (tmp) tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpFree(pth->target) );
   }
 
   if (pth->str) free(pth->str);
@@ -131,7 +131,7 @@ __host__ int pth_memcpy(pth_t * pth, tdpMemcpyKind flag) {
 
   assert(pth);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     /* Ensure we alias */
@@ -141,15 +141,15 @@ __host__ int pth_memcpy(pth_t * pth, tdpMemcpyKind flag) {
     double * tmp = NULL;
 
     nsz = 9*pth->nsites*sizeof(double);
-    tdpMemcpy(&tmp, &pth->target->str, sizeof(double *),
-	      tdpMemcpyDeviceToHost);
+    tdpAssert( tdpMemcpy(&tmp, &pth->target->str, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
 
     switch (flag) {
     case tdpMemcpyHostToDevice:
-      tdpMemcpy(tmp, pth->str, nsz, flag);
+      tdpAssert( tdpMemcpy(tmp, pth->str, nsz, flag) );
       break;
     case tdpMemcpyDeviceToHost:
-      tdpMemcpy(pth->str, tmp, nsz, flag);
+      tdpAssert( tdpMemcpy(pth->str, tmp, nsz, flag) );
       break;
     default:
       pe_fatal(pth->pe, "Bad flag in pth_memcpy\n");
diff --git a/src/phi_stats.c b/src/phi_stats.c
index 00daa7e36..01b12c676 100644
--- a/src/phi_stats.c
+++ b/src/phi_stats.c
@@ -195,7 +195,7 @@ int stats_field_q_reduce(field_t * field, map_t * map, int nxx, sum_t * sum,
     MPI_Type_free(&dt);
   }
 
-  tdpFree(dsum);
+  tdpAssert( tdpFree(dsum) );
 
   return 0;
 }
diff --git a/src/polar_active.c b/src/polar_active.c
index 296911e39..37cb10bea 100644
--- a/src/polar_active.c
+++ b/src/polar_active.c
@@ -34,7 +34,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2011-2018 The University of Edinburgh
+ *  (c) 2011-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -118,7 +118,7 @@ __host__ int fe_polar_create(pe_t * pe, cs_t * cs, field_t * p,
   obj->super.func = &fe_polar_hvt;
   obj->super.id = FE_POLAR;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     obj->target = obj;
@@ -126,19 +126,19 @@ __host__ int fe_polar_create(pe_t * pe, cs_t * cs, field_t * p,
   else {
     fe_polar_param_t * tmp;
     fe_vt_t * vt;
-    tdpMalloc((void **) &obj->target, sizeof(fe_polar_t));
-    tdpMemset(obj->target, 0, sizeof(fe_polar_t));
+    tdpAssert( tdpMalloc((void **) &obj->target, sizeof(fe_polar_t)) );
+    tdpAssert( tdpMemset(obj->target, 0, sizeof(fe_polar_t)) );
     tdpGetSymbolAddress((void **) &tmp, tdpSymbol(const_param));
-    tdpMemcpy(&obj->target->param, &tmp, sizeof(fe_polar_param_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&obj->target->param, &tmp, sizeof(fe_polar_param_t *),
+			 tdpMemcpyHostToDevice) );
     tdpGetSymbolAddress((void **) &vt, tdpSymbol(fe_polar_dvt));
-    tdpMemcpy(&obj->target->super.func, &vt, sizeof(fe_vt_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&obj->target->super.func, &vt, sizeof(fe_vt_t *),
+			 tdpMemcpyHostToDevice) );
 
-    tdpMemcpy(&obj->target->p, &p->target, sizeof(field_t *),
-	      tdpMemcpyHostToDevice);
-    tdpMemcpy(&obj->target->dp, &dp->target, sizeof(field_grad_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&obj->target->p, &p->target, sizeof(field_t *),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&obj->target->dp, &dp->target, sizeof(field_grad_t *),
+			 tdpMemcpyHostToDevice) );
   }
 
   *fe = obj;
diff --git a/src/propagation.c b/src/propagation.c
index 8d9dc1329..05a57ded2 100644
--- a/src/propagation.c
+++ b/src/propagation.c
@@ -217,7 +217,7 @@ __host__ int lb_model_swapf(lb_t * lb) {
   assert(lb);
   assert(lb->target);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     tmp1 = lb->f;
diff --git a/src/stats_distribution.c b/src/stats_distribution.c
index 82c81698b..d49185b12 100644
--- a/src/stats_distribution.c
+++ b/src/stats_distribution.c
@@ -11,7 +11,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2023 The University of Edinburgh
+ *  (c) 2010-2024 The University of Edinburgh
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
@@ -263,7 +263,7 @@ int distribution_stats_momentum(lb_t * lb, map_t * map, int root,
     MPI_Type_free(&dt);
   }
 
-  tdpFree(sum_d);
+  tdpAssert( tdpFree(sum_d) );
 
   return 0;
 }
diff --git a/src/surfactant.c b/src/surfactant.c
index 197b9370f..10b5e2a09 100644
--- a/src/surfactant.c
+++ b/src/surfactant.c
@@ -28,7 +28,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group
  *  and Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2021 The University of Edinburgh
+ *  (c) 2010-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -111,7 +111,7 @@ int fe_surf_create(pe_t * pe, cs_t * cs, field_t * phi,
 
   /* Allocate target memory, or alias */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     fe_surf_param_set(obj, param);
@@ -119,10 +119,10 @@ int fe_surf_create(pe_t * pe, cs_t * cs, field_t * phi,
   }
   else {
     fe_surf_param_t * tmp;
-    tdpMalloc((void **) &obj->target, sizeof(fe_surf_t));
+    tdpAssert( tdpMalloc((void **) &obj->target, sizeof(fe_surf_t)) );
     tdpGetSymbolAddress((void **) &tmp, tdpSymbol(const_param));
-    tdpMemcpy(&obj->target->param, tmp, sizeof(fe_surf_param_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&obj->target->param, &tmp, sizeof(fe_surf_param_t *),
+			 tdpMemcpyHostToDevice) );
     /* Now copy. */
     assert(0); /* No implementation */
   }
@@ -144,8 +144,8 @@ __host__ int fe_surf_free(fe_surf_t * fe) {
 
   assert(fe);
 
-  tdpGetDeviceCount(&ndevice);
-  if (ndevice > 0) tdpFree(fe->target);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
+  if (ndevice > 0) tdpAssert( tdpFree(fe->target) );
 
   free(fe->param);
   free(fe);
diff --git a/src/symmetric.c b/src/symmetric.c
index 4dfecbb06..734525c24 100644
--- a/src/symmetric.c
+++ b/src/symmetric.c
@@ -16,7 +16,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group
  *  and Edinburgh Parallel Computing Centre
  *
- *  (c) 2011-2021 The University of Edinburgh
+ *  (c) 2011-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -113,7 +113,7 @@ __host__ int fe_symm_create(pe_t * pe, cs_t * cs, field_t * phi,
 
   /* Allocate target memory, or alias */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     obj->target = obj;
@@ -121,19 +121,19 @@ __host__ int fe_symm_create(pe_t * pe, cs_t * cs, field_t * phi,
   else {
     fe_symm_param_t * tmp = NULL;
     fe_vt_t * vt;
-    tdpMalloc((void **) &obj->target, sizeof(fe_symm_t));
-    tdpMemset(obj->target, 0, sizeof(fe_symm_t));
+    tdpAssert( tdpMalloc((void **) &obj->target, sizeof(fe_symm_t)) );
+    tdpAssert( tdpMemset(obj->target, 0, sizeof(fe_symm_t)) );
     tdpGetSymbolAddress((void **) &tmp, tdpSymbol(const_param));
-    tdpMemcpy(&obj->target->param, &tmp, sizeof(fe_symm_param_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&obj->target->param, &tmp, sizeof(fe_symm_param_t *),
+			 tdpMemcpyHostToDevice) );
     tdpGetSymbolAddress((void **) &vt, tdpSymbol(fe_symm_dvt));
-    tdpMemcpy(&obj->target->super.func, &vt, sizeof(fe_vt_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&obj->target->super.func, &vt, sizeof(fe_vt_t *),
+			 tdpMemcpyHostToDevice) );
 
-    tdpMemcpy(&obj->target->phi, &phi->target, sizeof(field_t *),
-	      tdpMemcpyHostToDevice);
-    tdpMemcpy(&obj->target->dphi, &dphi->target, sizeof(field_grad_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&obj->target->phi, &phi->target, sizeof(field_t *),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&obj->target->dphi, &dphi->target,
+			 sizeof(field_grad_t *), tdpMemcpyHostToDevice) );
   }
 
   *p = obj;
@@ -153,9 +153,9 @@ __host__ int fe_symm_free(fe_symm_t * fe) {
 
   assert(fe);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
-  if (ndevice > 0) tdpFree(fe->target);
+  if (ndevice > 0) tdpAssert( tdpFree(fe->target) );
 
   free(fe->param);
   free(fe);
diff --git a/src/wall.c b/src/wall.c
index 51083103e..70b945d5a 100644
--- a/src/wall.c
+++ b/src/wall.c
@@ -86,7 +86,7 @@ __host__ int wall_create(pe_t * pe, cs_t * cs, map_t * map, lb_t * lb,
 
   /* Target copy */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     wall->target = wall;
@@ -94,11 +94,11 @@ __host__ int wall_create(pe_t * pe, cs_t * cs, map_t * map, lb_t * lb,
   else {
     wall_param_t * tmp = NULL;
 
-    tdpMalloc((void **) &wall->target, sizeof(wall_t));
-    tdpMemset(wall->target, 0, sizeof(wall_t));
+    tdpAssert( tdpMalloc((void **) &wall->target, sizeof(wall_t)) );
+    tdpAssert( tdpMemset(wall->target, 0, sizeof(wall_t)) );
     tdpGetSymbolAddress((void **) &tmp, tdpSymbol(static_param));
-    tdpMemcpy(&wall->target->param, &tmp, sizeof(wall_param_t *),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(&wall->target->param, &tmp, sizeof(wall_param_t *),
+			 tdpMemcpyHostToDevice) );
   }
 
   *p = wall;
@@ -119,36 +119,36 @@ __host__ int wall_free(wall_t * wall) {
   if (wall->target != wall) {
     {
       int * tmp = NULL;
-      tdpMemcpy(&tmp, &wall->target->linki, sizeof(int *),
-		tdpMemcpyDeviceToHost);
-      tdpFree(tmp);
-      tdpMemcpy(&tmp, &wall->target->linkj, sizeof(int *),
-		tdpMemcpyDeviceToHost);
-      tdpFree(tmp);
-      tdpMemcpy(&tmp, &wall->target->linkp, sizeof(int *),
-		tdpMemcpyDeviceToHost);
-      tdpFree(tmp);
-      tdpMemcpy(&tmp, &wall->target->linku, sizeof(int *),
-		tdpMemcpyDeviceToHost);
-      tdpFree(tmp);
+      tdpAssert( tdpMemcpy(&tmp, &wall->target->linki, sizeof(int *),
+			   tdpMemcpyDeviceToHost) );
+      tdpAssert( tdpFree(tmp) );
+      tdpAssert( tdpMemcpy(&tmp, &wall->target->linkj, sizeof(int *),
+			   tdpMemcpyDeviceToHost) );
+      tdpAssert( tdpFree(tmp) );
+      tdpAssert( tdpMemcpy(&tmp, &wall->target->linkp, sizeof(int *),
+			   tdpMemcpyDeviceToHost) );
+      tdpAssert( tdpFree(tmp) );
+      tdpAssert( tdpMemcpy(&tmp, &wall->target->linku, sizeof(int *),
+			   tdpMemcpyDeviceToHost) );
+      tdpAssert( tdpFree(tmp) );
     }
     /* Release slip stuff */
     if (wall->param->slip.active) {
       int * tmp = NULL;
       tdpAssert(tdpMemcpy(&tmp, &wall->target->linkk, sizeof(int *),
 			  tdpMemcpyDeviceToHost));
-      tdpFree(tmp);
+      tdpAssert( tdpFree(tmp) );
     }
     if (wall->param->slip.active) {
       int8_t * tmp = NULL;
       tdpAssert(tdpMemcpy(&tmp, &wall->target->linkq, sizeof(int8_t *),
 			  tdpMemcpyDeviceToHost));
-      tdpFree(tmp);
+      tdpAssert( tdpFree(tmp) );
       tdpAssert(tdpMemcpy(&tmp, &wall->target->links, sizeof(int8_t *),
 			  tdpMemcpyDeviceToHost));
-      tdpFree(tmp);
+      tdpAssert( tdpFree(tmp) );
     }
-    tdpFree(wall->target);
+    tdpAssert( tdpFree(wall->target) );
   }
 
   cs_free(wall->cs);
@@ -402,7 +402,7 @@ __host__ int wall_init_boundaries(wall_t * wall, wall_init_enum_t init) {
   assert(wall);
   assert(wall->lb);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (init == WALL_INIT_ALLOCATE) {
     nlink = imax(1, wall->nlink); /* Avoid zero-sized allocations */
@@ -421,18 +421,18 @@ __host__ int wall_init_boundaries(wall_t * wall, wall_init_enum_t init) {
     if (wall->linku == NULL) pe_fatal(wall->pe,"calloc(wall->linku) failed\n");
     if (ndevice > 0) {
       int tmp;
-      tdpMalloc((void **) &tmp, wall->nlink*sizeof(int));
-      tdpMemcpy(&wall->target->linki, &tmp, sizeof(int *),
-		tdpMemcpyHostToDevice);
-      tdpMalloc((void **) &tmp, wall->nlink*sizeof(int));
-      tdpMemcpy(&wall->target->linkj, &tmp, sizeof(int *),
-		tdpMemcpyHostToDevice);
-      tdpMalloc((void **) &tmp, wall->nlink*sizeof(int));
-      tdpMemcpy(&wall->target->linkp, &tmp, sizeof(int *),
-		tdpMemcpyHostToDevice);
-      tdpMalloc((void **) &tmp, wall->nlink*sizeof(int));
-      tdpMemcpy(&wall->target->linku, &tmp, sizeof(int *),
-		tdpMemcpyHostToDevice);
+      tdpAssert( tdpMalloc((void **) &tmp, wall->nlink*sizeof(int)) );
+      tdpAssert( tdpMemcpy(&wall->target->linki, &tmp, sizeof(int *),
+			   tdpMemcpyHostToDevice) );
+      tdpAssert( tdpMalloc((void **) &tmp, wall->nlink*sizeof(int)) );
+      tdpAssert( tdpMemcpy(&wall->target->linkj, &tmp, sizeof(int *),
+			   tdpMemcpyHostToDevice) );
+      tdpAssert( tdpMalloc((void **) &tmp, wall->nlink*sizeof(int)) );
+      tdpAssert( tdpMemcpy(&wall->target->linkp, &tmp, sizeof(int *),
+			   tdpMemcpyHostToDevice) );
+      tdpAssert( tdpMalloc((void **) &tmp, wall->nlink*sizeof(int)) );
+      tdpAssert( tdpMemcpy(&wall->target->linku, &tmp, sizeof(int *),
+			   tdpMemcpyHostToDevice) );
     }
   }
 
@@ -504,7 +504,7 @@ __host__ int wall_init_boundaries_slip(wall_t * wall) {
   assert(wall->cs);
   assert(wall->map);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (wall->param->slip.active) {
 
@@ -525,18 +525,18 @@ __host__ int wall_init_boundaries_slip(wall_t * wall) {
     /* Allocate device memory */
     if (ndevice > 0) {
       int tmp;
-      tdpMalloc((void **) &tmp, nlink*sizeof(int));
-      tdpMemcpy(&wall->target->linkk, &tmp, sizeof(int *),
-		tdpMemcpyHostToDevice);
+      tdpAssert( tdpMalloc((void **) &tmp, nlink*sizeof(int)) );
+      tdpAssert (tdpMemcpy(&wall->target->linkk, &tmp, sizeof(int *),
+			   tdpMemcpyHostToDevice) );
     }
     if (ndevice > 0) {
       int8_t tmp;
-      tdpMalloc((void **) &tmp, nlink*sizeof(int8_t));
-      tdpMemcpy(&wall->target->linkq, &tmp, sizeof(int8_t *),
-		tdpMemcpyHostToDevice);
-      tdpMalloc((void **) &tmp, nlink*sizeof(int8_t));
-      tdpMemcpy(&wall->target->links, &tmp, sizeof(int8_t *),
-		tdpMemcpyHostToDevice);
+      tdpAssert( tdpMalloc((void **) &tmp, nlink*sizeof(int8_t)) );
+      tdpAssert( tdpMemcpy(&wall->target->linkq, &tmp, sizeof(int8_t *),
+			   tdpMemcpyHostToDevice) );
+      tdpAssert( tdpMalloc((void **) &tmp, nlink*sizeof(int8_t)) );
+      tdpAssert( tdpMemcpy(&wall->target->links, &tmp, sizeof(int8_t *),
+			   tdpMemcpyHostToDevice) );
     }
 
     /* For each existing fluid-to-solid link i->j with cv[p] ... */
@@ -777,7 +777,7 @@ __host__ int wall_memcpy(wall_t * wall, tdpMemcpyKind flag) {
 
   assert(wall);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice == 0) {
     assert(wall->target == wall);
@@ -922,7 +922,8 @@ __host__ int wall_set_wall_distributions(wall_t * wall) {
   tdpLaunchKernel(wall_setu_kernel, nblk, ntpb, 0, 0,
 		  wall->target, wall->lb->target);
 
-  tdpDeviceSynchronize();
+  tdpAssert( tdpPeekAtLastError() );
+  tdpAssert( tdpDeviceSynchronize() );
 
   return 0;
 }
@@ -1321,17 +1322,17 @@ __host__ int wall_momentum(wall_t * wall, double f[3]) {
    * the host via wall_momentum_add() and others are on the
    * device. */
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice > 0) {
-    tdpMemcpy(ftmp, wall->target->fnet, 3*sizeof(double),
-	      tdpMemcpyDeviceToHost);
+    tdpAssert( tdpMemcpy(ftmp, wall->target->fnet, 3*sizeof(double),
+			 tdpMemcpyDeviceToHost) );
     wall->fnet[X] += ftmp[X];
     wall->fnet[Y] += ftmp[Y];
     wall->fnet[Z] += ftmp[Z];
     ftmp[X] = 0.0; ftmp[Y] = 0.0; ftmp[Z] = 0.0;
-    tdpMemcpy(wall->target->fnet, ftmp, 3*sizeof(double),
-	      tdpMemcpyHostToDevice);
+    tdpAssert( tdpMemcpy(wall->target->fnet, ftmp, 3*sizeof(double),
+			 tdpMemcpyHostToDevice) );
   }
 
   /* Return the current net */
diff --git a/target/target_x86.c b/target/target_x86.c
index e130c52f2..82e5f485a 100644
--- a/target/target_x86.c
+++ b/target/target_x86.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2018-2023 The University of Edinburgh
+ *  (c) 2018-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Alan Gray (Late of this parish)
@@ -251,13 +251,9 @@ tdpError_t tdpGetDeviceCount(int * device) {
 
   *device = 0;
 
-#ifdef FAKE_DEVICE /* "Fake" device */
-  *device = 1;
-#endif
-
   /* Strictly, we should return tdpErrorInsufficientDriver or ... */
 
-  return tdpErrorNoDevice;
+  return tdpSuccess;
 }
 
 /*****************************************************************************
diff --git a/tests/unit/test_blue_phase.c b/tests/unit/test_blue_phase.c
index 49c8340a0..dac5fdf1c 100644
--- a/tests/unit/test_blue_phase.c
+++ b/tests/unit/test_blue_phase.c
@@ -970,7 +970,7 @@ __host__ int do_test_fe_lc_device1(pe_t * pe, cs_t * cs, fe_lc_t * fe) {
     tdpAssert(tdpMemcpy(p, &param, sizeof(fe_lc_param_t),
 			tdpMemcpyHostToDevice));
     tdpLaunchKernel(do_test_fe_lc_kernel1, nblk, ntpb, 0, 0, fetarget, p);
-    tdpDeviceSynchronize();
+    tdpAssert( tdpDeviceSynchronize() );
     tdpAssert(tdpFree(p));
   }
 
diff --git a/tests/unit/test_ch.c b/tests/unit/test_ch.c
index d76ac9344..558d6007a 100644
--- a/tests/unit/test_ch.c
+++ b/tests/unit/test_ch.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistics Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2019 The University of Edinburgh
+ *  (c) 2019-2024 The University of Edinburgh
  *
  *****************************************************************************/
 
@@ -33,7 +33,7 @@ int test_ch_suite(void) {
   int ndevice;
   pe_t * pe = NULL;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   pe_create(MPI_COMM_WORLD, PE_QUIET, &pe);
 
diff --git a/tests/unit/test_coords.c b/tests/unit/test_coords.c
index 6475d4292..b1ecbf579 100644
--- a/tests/unit/test_coords.c
+++ b/tests/unit/test_coords.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2009-2022 The University of Edinburgh
+ *  (c) 2009-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -538,7 +538,8 @@ __host__ int do_test_coords_device1(pe_t * pe) {
   ntpb.x = 1;
 
   tdpLaunchKernel(do_test_coords_kernel1, nblk, ntpb, 0, 0, cstarget);
-  tdpDeviceSynchronize();
+  tdpAssert( tdpPeekAtLastError() );
+  tdpAssert( tdpDeviceSynchronize() );
 
   cs_free(cs);
 
diff --git a/tests/unit/test_fe_electro.c b/tests/unit/test_fe_electro.c
index 0dac0171d..790e3a973 100644
--- a/tests/unit/test_fe_electro.c
+++ b/tests/unit/test_fe_electro.c
@@ -43,7 +43,7 @@ int test_fe_electro_suite(void) {
   cs_t * cs = NULL;
   physics_t * phys = NULL;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   pe_create(MPI_COMM_WORLD, PE_QUIET, &pe);
 
diff --git a/tests/unit/test_fe_electro_symm.c b/tests/unit/test_fe_electro_symm.c
index b66700c08..4dcf2bd2e 100644
--- a/tests/unit/test_fe_electro_symm.c
+++ b/tests/unit/test_fe_electro_symm.c
@@ -39,7 +39,8 @@ int test_fe_electro_symm_suite(void) {
   pe_t * pe = NULL;
 
   pe_create(MPI_COMM_WORLD, PE_QUIET, &pe);
-  tdpGetDeviceCount(&ndevice);
+
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   if (ndevice) {
     pe_info(pe, "SKIP     ./unit/test_fe_electro_symm\n");
diff --git a/tests/unit/test_fe_surfactant1.c b/tests/unit/test_fe_surfactant1.c
index 7a00737af..1f7c61c83 100644
--- a/tests/unit/test_fe_surfactant1.c
+++ b/tests/unit/test_fe_surfactant1.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Phsyics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2019-2023 The University of Edinburgh
+ *  (c) 2019-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -57,7 +57,7 @@ __host__ int test_fe_surfactant1_suite(void) {
   cs_t * cs = NULL;
   field_t * phi = NULL;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   pe_create(MPI_COMM_WORLD, PE_QUIET, &pe);
 
diff --git a/tests/unit/test_fe_ternary.c b/tests/unit/test_fe_ternary.c
index 84c08cc69..f2003cc6d 100644
--- a/tests/unit/test_fe_ternary.c
+++ b/tests/unit/test_fe_ternary.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Phsyics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2019-2022 The University of Edinburgh
+ *  (c) 2019-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -48,7 +48,7 @@ __host__ int test_fe_ternary_suite(void) {
   cs_t * cs = NULL;
   field_t * phi = NULL;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   pe_create(MPI_COMM_WORLD, PE_QUIET, &pe);
 
diff --git a/tests/unit/test_field.c b/tests/unit/test_field.c
index 02cb93e76..f80c2e7a2 100644
--- a/tests/unit/test_field.c
+++ b/tests/unit/test_field.c
@@ -260,7 +260,8 @@ int do_test_device1(pe_t * pe) {
   ntpb.x = 1;
 
   tdpLaunchKernel(do_test_field_kernel1, nblk, ntpb, 0, 0, phi->target);
-  tdpDeviceSynchronize();
+  tdpAssert( tdpPeekAtLastError() );
+  tdpAssert( tdpDeviceSynchronize() );
 
   field_free(phi);
   cs_free(cs);
diff --git a/tests/unit/test_halo.c b/tests/unit/test_halo.c
index 3cf825563..ccd317921 100644
--- a/tests/unit/test_halo.c
+++ b/tests/unit/test_halo.c
@@ -8,7 +8,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2022 The University of Edinburgh
+ *  (c) 2010-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -232,7 +232,7 @@ int do_test_halo(pe_t * pe, cs_t * cs, int dim, const lb_data_options_t * opts)
   assert(dim == X || dim == Y || dim == Z);
   assert(opts);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   lb_data_create(pe, cs, opts, &lb);
 
diff --git a/tests/unit/test_hydro.c b/tests/unit/test_hydro.c
index 409b46f5d..945b6354b 100644
--- a/tests/unit/test_hydro.c
+++ b/tests/unit/test_hydro.c
@@ -48,7 +48,7 @@ int test_hydro_suite(void) {
 
   pe_create(MPI_COMM_WORLD, PE_QUIET, &pe);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   do_test1(pe);
   do_test_halo1(pe, 1, 1, FIELD_HALO_TARGET);
diff --git a/tests/unit/test_polar_active.c b/tests/unit/test_polar_active.c
index 60a3bacf8..205f63b49 100644
--- a/tests/unit/test_polar_active.c
+++ b/tests/unit/test_polar_active.c
@@ -10,7 +10,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2017 The University of Edinbrugh
+ *  (c) 2010-2024 The University of Edinbrugh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -60,7 +60,7 @@ int test_polar_active_suite(void) {
 
   field_options_t opts = field_options_ndata_nhalo(nf, nhalo);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   pe_create(MPI_COMM_WORLD, PE_QUIET, &pe);
 
diff --git a/tests/unit/test_prop.c b/tests/unit/test_prop.c
index 4f52b48ab..075cf7945 100644
--- a/tests/unit/test_prop.c
+++ b/tests/unit/test_prop.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2022 Ths University of Edinburgh
+ *  (c) 2010-2024 Ths University of Edinburgh
  *
  *  Contributing authors: 
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -46,7 +46,7 @@ int test_lb_prop_suite(void) {
   cs_create(pe, &cs);
   cs_init(cs);
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   do_test_velocity(pe, cs, 1, LB_HALO_TARGET);
   do_test_velocity(pe, cs, 2, LB_HALO_TARGET);
diff --git a/tests/unit/test_visc_arrhenius.c b/tests/unit/test_visc_arrhenius.c
index aa080c8d5..fe90b831f 100644
--- a/tests/unit/test_visc_arrhenius.c
+++ b/tests/unit/test_visc_arrhenius.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Phsyics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2020-2022 The University of Edinburgh
+ *  (c) 2020-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -46,7 +46,7 @@ __host__ int test_visc_arrhenius_suite(void) {
   cs_t * cs = NULL;
   field_t * phi = NULL;
 
-  tdpGetDeviceCount(&ndevice);
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
 
   pe_create(MPI_COMM_WORLD, PE_QUIET, &pe);
 

From b5866820c93424d67c1eaded5e9751cd728ed27c Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Fri, 4 Oct 2024 17:18:23 +0100
Subject: [PATCH 041/133] Format/spelling updates

---
 src/blue_phase.c                  |  47 ++++++-------
 src/brazovskii.c                  |   4 +-
 src/coords.c                      |   4 +-
 src/fe_electro.c                  |  16 ++---
 src/fe_electro_symmetric.c        |  22 +++---
 src/fe_ternary.c                  | 112 +++++++++++++++---------------
 src/field_grad.c                  |   8 +--
 src/halo_swap.c                   |  10 +--
 src/leesedwards.c                 |  10 +--
 src/polar_active.c                |   8 +--
 src/surfactant.c                  |  16 ++---
 src/symmetric.c                   |   2 +-
 tests/unit/test_ch.c              |   2 +-
 tests/unit/test_coords.c          |   2 +-
 tests/unit/test_fe_electro_symm.c |   2 +-
 tests/unit/test_fe_surfactant1.c  |   2 +-
 tests/unit/test_fe_ternary.c      |   1 -
 tests/unit/test_prop.c            |   8 +--
 tests/unit/test_visc_arrhenius.c  |   2 +-
 19 files changed, 138 insertions(+), 140 deletions(-)

diff --git a/src/blue_phase.c b/src/blue_phase.c
index 3b2f3a786..07de1b231 100644
--- a/src/blue_phase.c
+++ b/src/blue_phase.c
@@ -505,7 +505,7 @@ __host__ __device__ int fe_lc_bulk_stress(fe_lc_t * fe, int index,
 
 
   /* bulk contribution to free energy */
-  fe_lc_compute_bulk_fed(fe, q, &fed); 
+  fe_lc_compute_bulk_fed(fe, q, &fed);
 
   /* bulk contribtion to stress using the above contributions */
 
@@ -637,7 +637,7 @@ __host__ __device__ int fe_lc_grad_stress(fe_lc_t * fe, int index,
   }
 
   /* gradient contribution to free energy */
-  fe_lc_compute_gradient_fed(fe, q, dq, &fed); 
+  fe_lc_compute_gradient_fed(fe, q, dq, &fed);
 
   /* gradient contribtion to stress using the above contributions */
 
@@ -942,7 +942,7 @@ __host__ __device__ int fe_lc_compute_stress_active(fe_lc_t * fe,
    * while code was           -zeta*(q[ia][ib] + r3*d[ia][ib])
    * for zeta = zeta1 */
   /* The sign of zeta0 needs to be clarified cf Eq. 36 of notes */
-  /* For "backwards compatability" use zeta0 = +1/3 at the moment */
+  /* For "backwards compatibility" use zeta0 = +1/3 at the moment */
 
   for (ia = 0; ia < 3; ia++) {
     for (ib = 0; ib < 3; ib++) {
@@ -1112,8 +1112,8 @@ int fe_lc_compute_h(fe_lc_t * fe, double gamma, double q[3][3],
  *
  *  Compute the bulk free energy density as a function of q.
  *
- *  Note: This function contains also the part quadratic in q 
- *        which is normally part of the gradient free energy. 
+ *  Note: This function contains also the part quadratic in q
+ *        which is normally part of the gradient free energy.
  *
  *****************************************************************************/
 
@@ -1158,7 +1158,7 @@ int fe_lc_compute_bulk_fed(fe_lc_t * fe, double q[3][3], double * fed) {
     - r3*fe->param->a0*fe->param->gamma*q3
     + 0.25*fe->param->a0*fe->param->gamma*q2*q2;
 
-  /* Add terms quadratic in q from gradient free energy */ 
+  /* Add terms quadratic in q from gradient free energy */
 
   *fed += 0.5*kappa1*4.0*q0*q0*q2;
 
@@ -1169,7 +1169,7 @@ int fe_lc_compute_bulk_fed(fe_lc_t * fe, double q[3][3], double * fed) {
  *
  *  fe_lc_compute_gradient_fed
  *
- *  Compute the gradient contribution to the free energy density 
+ *  Compute the gradient contribution to the free energy density
  *  as a function of q and the q gradient tensor dq.
  *
  *  Note: The part quadratic in q has been added to the bulk free energy.
@@ -1216,7 +1216,7 @@ int fe_lc_compute_gradient_fed(fe_lc_t * fe, double q[3][3],
     for (ib = 0; ib < 3; ib++) {
 
       sum = 0.0;
-  
+
       q2 += q[ia][ib]*q[ia][ib];
 
       for (ic = 0; ic < 3; ic++) {
@@ -1314,7 +1314,7 @@ __host__ int fe_lc_dimensionless_field_strength(const fe_lc_param_t * param,
     fieldsq += param->e0[ia]*param->e0[ia];
   }
 
-  /* Remember epsilon is stored with factor (1/12pi) */ 
+  /* Remember epsilon is stored with factor (1/12pi) */
 
   {
     double a0 = param->a0;
@@ -1369,7 +1369,7 @@ int fe_lc_redshift_set(fe_lc_t * fe,  double redshift) {
  *
  *  fe_lc_amplitude_compute
  *
- *  Scalar order parameter in the nematic state, minimum of bulk free energy 
+ *  Scalar order parameter in the nematic state, minimum of bulk free energy
  *
  *****************************************************************************/
 
@@ -1377,7 +1377,7 @@ __host__ __device__ int fe_lc_amplitude_compute(const fe_lc_param_t * param,
 						double * a) {
 
   assert(a);
-  
+
   *a = (2.0/3.0)*(0.25 + 0.75*sqrt(1.0 - 8.0/(3.0*param->gamma)));
 
   return 0;
@@ -1679,7 +1679,7 @@ void fe_lc_mol_field_v(fe_lc_t * fe, int index, double h[3][3][NSIMDVL]) {
   double * __restrict__ delsq;
 
   assert(fe);
- 
+
   data = fe->q->data;
   grad = fe->dq->grad;
   delsq = fe->dq->delsq;
@@ -1734,11 +1734,11 @@ void fe_lc_mol_field_v(fe_lc_t * fe, int index, double h[3][3][NSIMDVL]) {
  *****************************************************************************/
 
 __host__ __device__
-void fe_lc_stress_v(fe_lc_t * fe, int index, double s[3][3][NSIMDVL]) { 
+void fe_lc_stress_v(fe_lc_t * fe, int index, double s[3][3][NSIMDVL]) {
 
   int iv;
   int ia;
- 
+
   double q[3][3][NSIMDVL];
   double h[3][3][NSIMDVL];
   double dq[3][3][3][NSIMDVL];
@@ -1911,7 +1911,7 @@ __host__ __device__ void fe_lc_str_anti_v(fe_lc_t * fe, int index,
 
 __host__ __device__
 void fe_lc_compute_fed_v(fe_lc_t * fe,
-			 double q[3][3][NSIMDVL], 
+			 double q[3][3][NSIMDVL],
 			 double dq[3][3][3][NSIMDVL],
 			 double fed[NSIMDVL]) {
   int iv;
@@ -2083,7 +2083,7 @@ void fe_lc_compute_fed_v(fe_lc_t * fe,
  *
  *  Alan's note for GPU version.
  *
- *  To get temperary q[][][] etc arrays into registers really requires
+ *  To get temporary q[][][] etc arrays into registers really requires
  *  inlining to caller file scope.
  *
  *  NO gamma = gamma(r) at the mooment.
@@ -2092,9 +2092,9 @@ void fe_lc_compute_fed_v(fe_lc_t * fe,
 
 __host__ __device__
 void fe_lc_compute_h_v(fe_lc_t * fe,
-		       double q[3][3][NSIMDVL], 
+		       double q[3][3][NSIMDVL],
 		       double dq[3][3][3][NSIMDVL],
-		       double dsq[3][3][NSIMDVL], 
+		       double dsq[3][3][NSIMDVL],
 		       double h[3][3][NSIMDVL]) {
 
   int iv;
@@ -2308,7 +2308,7 @@ void fe_lc_compute_stress_v(fe_lc_t * fe,
 
   fe_lc_compute_fed_v(fe, q, dq, p0);
 
-  for_simd_v(iv, NSIMDVL) p0[iv] = 0.0 - p0[iv]; 
+  for_simd_v(iv, NSIMDVL) p0[iv] = 0.0 - p0[iv];
 
   /* The contraction Q_ab H_ab */
 
@@ -2400,7 +2400,7 @@ void fe_lc_compute_stress_v(fe_lc_t * fe,
   for_simd_v(iv, NSIMDVL) sthtmp[iv] += - kappa0*dq[0][1][1][iv]*dq[2][1][2][iv] - kappa1*dq[0][1][2][iv]*dq[1][1][2][iv]+ kappa1*dq[0][1][2][iv]*dq[1][1][2][iv];
 
   for_simd_v(iv, NSIMDVL) sthtmp[iv] += - kappa0*dq[0][1][2][iv]*dq[0][2][0][iv] - kappa1*dq[0][2][0][iv]*dq[1][2][0][iv]+ kappa1*dq[0][2][0][iv]*dq[2][1][0][iv];
-    
+
   for_simd_v(iv, NSIMDVL) sthtmp[iv] -= 2.0*kappa1*q0*dq[0][2][0][iv]*q[0][0][iv];
 
   for_simd_v(iv, NSIMDVL) sthtmp[iv] += - kappa0*dq[0][1][2][iv]*dq[1][2][1][iv] - kappa1*dq[0][2][1][iv]*dq[1][2][1][iv]+ kappa1*dq[0][2][1][iv]*dq[2][1][1][iv];
@@ -2533,7 +2533,7 @@ void fe_lc_compute_stress_v(fe_lc_t * fe,
   for_simd_v(iv, NSIMDVL) sthtmp[iv] += - kappa0*dq[1][1][0][iv]*dq[0][0][0][iv] - kappa1*dq[1][0][0][iv]*dq[1][0][0][iv]+ kappa1*dq[1][0][0][iv]*dq[0][1][0][iv];
 
   for_simd_v(iv, NSIMDVL) sthtmp[iv] += 2.0*kappa1*q0*dq[1][0][0][iv]*q[0][2][iv];
-  
+
   for_simd_v(iv, NSIMDVL) sthtmp[iv] += - kappa0*dq[1][1][0][iv]*dq[1][0][1][iv] - kappa1*dq[1][0][1][iv]*dq[1][0][1][iv]+ kappa1*dq[1][0][1][iv]*dq[0][1][1][iv];
 
   for_simd_v(iv, NSIMDVL) sthtmp[iv] += 2.0*kappa1*q0*dq[1][0][1][iv]*q[1][2][iv];
@@ -2581,7 +2581,7 @@ void fe_lc_compute_stress_v(fe_lc_t * fe,
   for_simd_v(iv, NSIMDVL) sthtmp[iv] += -xi*h[1][2][iv]*(q[2][2][iv] + r3)   -xi*(q[1][2][iv]    )*h[2][2][iv];
 
   for_simd_v(iv, NSIMDVL) sthtmp[iv] += - kappa0*dq[1][2][0][iv]*dq[0][0][0][iv] - kappa1*dq[1][0][0][iv]*dq[2][0][0][iv]+ kappa1*dq[1][0][0][iv]*dq[0][2][0][iv];
-  
+
   for_simd_v(iv, NSIMDVL) sthtmp[iv] -= 2.0*kappa1*q0*dq[1][0][0][iv]*q[0][1][iv];
 
   for_simd_v(iv, NSIMDVL) sthtmp[iv] += - kappa0*dq[1][2][0][iv]*dq[1][0][1][iv] - kappa1*dq[1][0][1][iv]*dq[2][0][1][iv]+ kappa1*dq[1][0][1][iv]*dq[0][2][1][iv];
@@ -2710,7 +2710,7 @@ void fe_lc_compute_stress_v(fe_lc_t * fe,
   for_simd_v(iv, NSIMDVL) sthtmp[iv] += - kappa0*dq[2][1][2][iv]*dq[2][2][2][iv] - kappa1*dq[2][2][2][iv]*dq[1][2][2][iv]+ kappa1*dq[2][2][2][iv]*dq[2][1][2][iv];
 
   for_simd_v(iv, NSIMDVL) sthtmp[iv] -= 2.0*kappa1*q0*dq[2][2][2][iv]*q[2][0][iv];
-  
+
   for_simd_v(iv, NSIMDVL) sthtmp[iv] += q[2][0][iv]*h[1][0][iv] - h[2][0][iv]*q[1][0][iv];
 
   for_simd_v(iv, NSIMDVL) sthtmp[iv] += q[2][1][iv]*h[1][1][iv] - h[2][1][iv]*q[1][1][iv];
@@ -2773,4 +2773,3 @@ void fe_lc_compute_stress_v(fe_lc_t * fe,
 
   return;
 }
-
diff --git a/src/brazovskii.c b/src/brazovskii.c
index 1ecead098..0811bb73c 100644
--- a/src/brazovskii.c
+++ b/src/brazovskii.c
@@ -15,13 +15,13 @@
  *  Parameters:
  *
  *  One should have b, c > 0 for stability purposes. Then for a < 0
- *  and kappa > 0 one gets two homogenous phases with
+ *  and kappa > 0 one gets two homogeneous phases with
  *  phi = +/- sqrt(-a/b) cf. the symmetric case.
  *
  *  Negative kappa favours the presence of interfaces, and lamellae
  *  can form. Approximately, the lamellar phase can be described by
  *  phi ~= A sin(k_0 x) in the traverse direction, where
- *  A^2 = 4 (1 + kappa^2/4cb)/3 and k_0 = sqrt(-kappa/2c). 
+ *  A^2 = 4 (1 + kappa^2/4cb)/3 and k_0 = sqrt(-kappa/2c).
  *
  *  Edinburgh Soft Matter and Statistical Physics Group
  *  and Edinburgh Parallel Computing Centre
diff --git a/src/coords.c b/src/coords.c
index 70919e9dd..f57139cf2 100644
--- a/src/coords.c
+++ b/src/coords.c
@@ -206,7 +206,7 @@ __host__ int cs_init(cs_t * cs) {
 
   cs->param->noffset[X] = cs->listnoffset[X][cs->param->mpi_cartcoords[X]];
   cs->param->noffset[Y] = cs->listnoffset[Y][cs->param->mpi_cartcoords[Y]];
-  cs->param->noffset[Z] = cs->listnoffset[Z][cs->param->mpi_cartcoords[Z]];  
+  cs->param->noffset[Z] = cs->listnoffset[Z][cs->param->mpi_cartcoords[Z]];
 
   cs->param->str[Z] = 1;
   cs->param->str[Y] = cs->param->str[Z]*(cs->param->nlocal[Z] + 2*cs->param->nhalo);
@@ -275,7 +275,7 @@ __host__ int cs_info(cs_t * cs) {
     }
   }
 
-  uniform = (nmin[X] == nmax[X] && nmin[Y] == nmax[Y] && nmin[Z] == nmax[Z]); 
+  uniform = (nmin[X] == nmax[X] && nmin[Y] == nmax[Y] && nmin[Z] == nmax[Z]);
 
 
   pe_info(cs->pe, "\n");
diff --git a/src/fe_electro.c b/src/fe_electro.c
index 4f5ffa819..58ce03b2f 100644
--- a/src/fe_electro.c
+++ b/src/fe_electro.c
@@ -19,7 +19,7 @@
  *
  *  mu_a = kT log(rho_a) + Z_a e psi
  *
- *  See, e.g., Rotenberg et al. Coarse-grained simualtions of charge,
+ *  See, e.g., Rotenberg et al. Coarse-grained simulations of charge,
  *  current and flow in heterogeneous media,
  *  Faraday Discussions \textbf{14}, 223--243 (2010).
  *
@@ -50,7 +50,7 @@ struct fe_electro_s {
   fe_t super;
   pe_t * pe;             /* Parallel environment */
   psi_t * psi;           /* A reference to the electrokinetic quantities */
-  double * mu_ref;       /* Reference mu currently unused (i.e., zero). */ 
+  double * mu_ref;       /* Reference mu currently unused (i.e., zero). */
   fe_electro_t * target; /* Device copy */
 };
 
@@ -96,8 +96,8 @@ static  __constant__ fe_vt_t fe_electro_dvt = {
  *  Retain a reference to the electrokinetics object psi.
  *
  *  Note: In this model we do not set the chemical potential.
- *        In the gradient method the ionic electrostatic forces 
- *        on the fluid are implicitly calculated through the 
+ *        In the gradient method the ionic electrostatic forces
+ *        on the fluid are implicitly calculated through the
  *        electric charge density and the electric field.
  *
  *****************************************************************************/
@@ -249,7 +249,7 @@ int fe_electro_mu(fe_electro_t * fe, int index, double * mu) {
   for (n = 0; n < fe->psi->nk; n++) {
     psi_rho(fe->psi, index, n, &rho);
     assert(rho >= 0.0); /* For log(rho + epsilon) */
-  
+
     mu[n] = kt*log(rho + DBL_EPSILON) + fe->psi->valency[n]*fe->psi->e*psi;
   }
 
@@ -282,7 +282,7 @@ int fe_electro_mu_solv(fe_electro_t * fe, int index, int k, double * mu) {
  *    S_ab = -epsilon ( E_a E_b - (1/2) d_ab E^2) + d_ab kt sum_k rho_k
  *  where epsilon is the (uniform) permittivity.
  *
- *  The last term is the ideal gas contribution which is excluded in the 
+ *  The last term is the ideal gas contribution which is excluded in the
  *  excess stress tensor.
  *
  *****************************************************************************/
@@ -305,7 +305,7 @@ int fe_electro_stress(fe_electro_t * fe, int index, double s[3][3]) {
   physics_ref(&phys);
   physics_kt(phys, &kt);
   psi_nk(fe->psi, &nk);
-  psi_unit_charge(fe->psi, &eunit);	 
+  psi_unit_charge(fe->psi, &eunit);
   reunit = 1.0/eunit;
 
   psi_epsilon(fe->psi, &epsilon);
@@ -358,7 +358,7 @@ int fe_electro_stress_ex(fe_electro_t * fe, int index, double s[3][3]) {
 
   physics_ref(&phys);
   physics_kt(phys, &kt);
-  psi_unit_charge(fe->psi, &eunit);	 
+  psi_unit_charge(fe->psi, &eunit);
   reunit = 1.0/eunit;
 
   psi_epsilon(fe->psi, &epsilon);
diff --git a/src/fe_electro_symmetric.c b/src/fe_electro_symmetric.c
index 4575ae186..e2d7ca0ce 100644
--- a/src/fe_electro_symmetric.c
+++ b/src/fe_electro_symmetric.c
@@ -10,7 +10,7 @@
  *
  *    mu_phi = mu^mix + mu^solv + mu^el
  *
- *  with mu^mix the usual symmetric contribution, 
+ *  with mu^mix the usual symmetric contribution,
  *
  *    mu^solv = (1/2) [ rho(+)Delta mu(+) + rho(-)Delta mu(-) ]
  *
@@ -19,7 +19,7 @@
  *    mu^el = - (1/2) gamma epsilonbar E^2
  *
  *  where gamma is the dielectric contrast and epsilonbar is the mean
- *  dielectric contant for the two phases. E is the external electric
+ *  dielectric constant for the two phases. E is the external electric
  *  field.
  *
  *
@@ -235,7 +235,7 @@ __host__ int fe_es_fed(fe_es_t * fe, int index, double * fed) {
  *
  *      mu_phi = mu_phi_mix + mu_phi_solv + mu_phi_el
  *
- *  Note: mu_phi_solv needs to be in agreement with 
+ *  Note: mu_phi_solv needs to be in agreement with
  *        the terms in fe_es_mu_ion()
  *
  *****************************************************************************/
@@ -268,10 +268,10 @@ __host__ int fe_es_mu_phi(fe_es_t * fe, int index, double * mu) {
   }
 
   /* Electric field contribution */
- 
+
   e2 = 0.0;
 
-  psi_electric_field(fe->psi, index, e); 
+  psi_electric_field(fe->psi, index, e);
 
   for (ia = 0; ia < 3; ia++) {
     e[ia] *= kt*reunit;
@@ -298,7 +298,7 @@ __host__ int fe_es_mu_phi(fe_es_t * fe, int index, double * mu) {
 __host__ int fe_es_mu_ion_solv(fe_es_t * fe, int index, int n, double * mu) {
 
   double phi;
- 
+
   assert(fe);
   assert(mu);
   assert(n < fe->param->nk);
@@ -396,7 +396,7 @@ __host__ int fe_es_var_epsilon(fe_es_t * fe, int index, double * epsilon) {
  *  The field term comes from
  *
  *  S^elec = - [D_a E_b - (1/2) epsilonbar d_ab E^2]
- * 
+ *
  *    where D_a is the electric displacement. The functional form of
  *    epsilon(r) agrees with fe_es_var_epsilon() above.
  *
@@ -406,7 +406,7 @@ __host__ int fe_es_var_epsilon(fe_es_t * fe, int index, double * epsilon) {
  *  the force calculation.
  *
  *  Finally, the true Maxwell stress includes the total electric
- *  field. 
+ *  field.
  *
  *****************************************************************************/
 
@@ -431,7 +431,7 @@ __host__ int fe_es_stress_ex(fe_es_t * fe, int index, double s[3][3]) {
   psi_unit_charge(fe->psi, &eunit);
   reunit = 1.0/eunit;
 
-  fe_symm_str(fe->fe_symm, index, s); 
+  fe_symm_str(fe->fe_symm, index, s);
 
   /* Coupling part
      requires phi and total electric field */
@@ -445,7 +445,7 @@ __host__ int fe_es_stress_ex(fe_es_t * fe, int index, double s[3][3]) {
     e[ia] *= kt*reunit;
     e2 += e[ia]*e[ia];
   }
-  
+
   /* Dielectric part */
 
   s_couple = 0.5*phi*fe->param->epsilonbar*fe->param->gamma*e2;
@@ -461,7 +461,7 @@ __host__ int fe_es_stress_ex(fe_es_t * fe, int index, double s[3][3]) {
      local permittivity depends implicitly on phi */
 
   fe_es_var_epsilon(fe, index, &epsloc);
- 
+
   for (ia = 0; ia < 3; ia++) {
     for (ib = 0; ib < 3; ib++) {
 
diff --git a/src/fe_ternary.c b/src/fe_ternary.c
index 0bd5f7477..6f1e4ff1d 100644
--- a/src/fe_ternary.c
+++ b/src/fe_ternary.c
@@ -10,7 +10,7 @@
  *  [0] FE_PHI \phi is compositional order parameter
  *  [1] FE_PSI \psi is surfactant concentration
  *  [2] FE_RHO is 'spectating' at the moment
- * 
+ *
  *  The free energy density is:
  *
  *
@@ -87,32 +87,32 @@ int fe_ternary_create(pe_t * pe, cs_t * cs, field_t * phi,
                       fe_ternary_t ** fe) {
   int ndevice;
   fe_ternary_t * obj = NULL;
-    
+
   assert(pe);
   assert(cs);
   assert(fe);
   assert(phi);
   assert(dphi);
-    
+
   obj = (fe_ternary_t *) calloc(1, sizeof(fe_ternary_t));
   assert(obj);
   if (obj == NULL) pe_fatal(pe, "calloc(fe_surf1_t) failed\n");
-    
+
   obj->param = (fe_ternary_param_t *) calloc(1, sizeof(fe_ternary_param_t));
   assert(obj->param);
   if (obj->param == NULL) pe_fatal(pe, "calloc(fe_ternary_param_t) fail\n");
-    
+
   obj->pe = pe;
   obj->cs = cs;
   obj->phi = phi;
   obj->dphi = dphi;
   obj->super.func = &fe_ternary_hvt;
   obj->super.id = FE_TERNARY;
-    
+
   /* Allocate target memory, or alias */
-    
+
   tdpAssert( tdpGetDeviceCount(&ndevice) );
-    
+
   if (ndevice == 0) {
     obj->target = obj;
   }
@@ -144,10 +144,10 @@ int fe_ternary_create(pe_t * pe, cs_t * cs, field_t * phi,
     tdpAssert(tdpMemcpy(&obj->target->dphi, &dphi->target,
 			sizeof(field_grad_t *), tdpMemcpyHostToDevice));
   }
-    
+
   fe_ternary_param_set(obj, param);
   *fe = obj;
-    
+
   return 0;
 }
 
@@ -158,17 +158,17 @@ int fe_ternary_create(pe_t * pe, cs_t * cs, field_t * phi,
  ****************************************************************************/
 
 __host__ int fe_ternary_free(fe_ternary_t * fe) {
-    
+
   int ndevice;
-    
+
   assert(fe);
-    
+
   tdpAssert( tdpGetDeviceCount(&ndevice) );
   if (ndevice > 0) tdpAssert(tdpFree(fe->target));
-    
+
   free(fe->param);
   free(fe);
-    
+
   return 0;
 }
 
@@ -188,19 +188,19 @@ __host__ int fe_ternary_info(fe_ternary_t * fe) {
   double h1, h2, h3;
 
   pe_t * pe = NULL;
-    
+
   assert(fe);
-    
+
   pe = fe->pe;
-    
+
   fe_ternary_sigma(fe, sigma);
-    
+
   pe_info(pe, "Ternary free energy parameters:\n");
   pe_info(pe, "Surface penalty kappa1 = %12.5e\n", fe->param->kappa1);
   pe_info(pe, "Surface penalty kappa2 = %12.5e\n", fe->param->kappa2);
   pe_info(pe, "Surface penalty kappa3 = %12.5e\n", fe->param->kappa3);
   pe_info(pe, "Interface width alpha  = %12.5e\n", fe->param->alpha);
-    
+
   pe_info(pe, "\n");
   pe_info(pe, "Derived quantities\n");
   pe_info(pe, "Interfacial tension 12 = %12.5e\n",  sigma[0]);
@@ -242,12 +242,12 @@ __host__ int fe_ternary_info(fe_ternary_t * fe) {
  ****************************************************************************/
 
 __host__ int fe_ternary_target(fe_ternary_t * fe, fe_t ** target) {
-    
+
   assert(fe);
   assert(target);
-    
+
   *target = (fe_t *) fe->target;
-    
+
   return 0;
 }
 
@@ -277,9 +277,9 @@ __host__ int fe_ternary_param_set(fe_ternary_t * fe, fe_ternary_param_t vals) {
 __host__ int fe_ternary_param(fe_ternary_t * fe, fe_ternary_param_t * values) {
 
   assert(fe);
-    
+
   *values = *fe->param;
-    
+
   return 0;
 }
 
@@ -291,21 +291,21 @@ __host__ int fe_ternary_param(fe_ternary_t * fe, fe_ternary_param_t * values) {
  ****************************************************************************/
 
 __host__ int fe_ternary_sigma(fe_ternary_t * fe,  double * sigma) {
-    
+
   double alpha, kappa1, kappa2, kappa3;
-    
+
   assert(fe);
   assert(sigma);
-    
+
   alpha  = fe->param->alpha;
   kappa1 = fe->param->kappa1;
   kappa2 = fe->param->kappa2;
   kappa3 = fe->param->kappa3;
-    
+
   sigma[0] = alpha*(kappa1 + kappa2)/6.0;
   sigma[1] = alpha*(kappa2 + kappa3)/6.0;
   sigma[2] = alpha*(kappa1 + kappa3)/6.0;
-    
+
   return 0;
 }
 
@@ -334,7 +334,7 @@ __host__ int fe_ternary_angles(fe_ternary_t * fe, double * theta) {
 
   assert(fe);
   assert(theta);
-    
+
   fe_ternary_sigma(fe, sigma);
 
   d1 = sigma[1]*sigma[1] - (sigma[0]*sigma[0] + sigma[2]*sigma[2]);
@@ -365,7 +365,7 @@ __host__ int fe_ternary_angles(fe_ternary_t * fe, double * theta) {
  *
  ****************************************************************************/
 
-__host__ int fe_ternary_wetting_angles(fe_ternary_t * fe, double * angle) { 
+__host__ int fe_ternary_wetting_angles(fe_ternary_t * fe, double * angle) {
 
   double a, h;
   double kappa1, kappa2, kappa3;
@@ -406,12 +406,12 @@ __host__ int fe_ternary_wetting_angles(fe_ternary_t * fe, double * angle) {
  ****************************************************************************/
 
 __host__ int fe_ternary_xi0(fe_ternary_t * fe, double * xi0) {
-    
+
   assert(fe);
   assert(xi0);
-    
+
   *xi0 = fe->param->alpha;
-    
+
   return 0;
 }
 
@@ -433,7 +433,7 @@ __host__ int fe_ternary_xi0(fe_ternary_t * fe, double * xi0) {
 
 __host__ __device__ int fe_ternary_fed(fe_ternary_t * fe, int index,
 				       double * fed) {
-    
+
   int ia;
   double field[2];
   double phi;
@@ -446,21 +446,21 @@ __host__ __device__ int fe_ternary_fed(fe_ternary_t * fe, int index,
   double kappa1, kappa2, kappa3, alpha2;
 
   assert(fe);
-    
+
   kappa1 = fe->param->kappa1;
   kappa2 = fe->param->kappa2;
   kappa3 = fe->param->kappa3;
   alpha2 = fe->param->alpha*fe->param->alpha;
 
   field_scalar_array(fe->phi, index, field);
-    
+
   rho = 1.0;
   phi = field[FE_PHI];
   psi = field[FE_PSI];
-    
+
   drho = 0.0;
   field_grad_pair_grad(fe->dphi, index, grad);
-    
+
   dsum = 0.0;
   for (ia = 0; ia < 3; ia++) {
     d3 = drho + grad[FE_PHI][ia] - grad[FE_PSI][ia];
@@ -470,7 +470,7 @@ __host__ __device__ int fe_ternary_fed(fe_ternary_t * fe, int index,
   s1  = rho + phi - psi;
   s2  = 2.0 + psi - rho - phi;
   fe1 = 0.03125*kappa1*s1*s1*s2*s2 + 0.125*alpha2*kappa1*dsum;
-    
+
   dsum = 0.0;
   for (ia = 0; ia < 3; ia++) {
     d3 = drho - grad[FE_PHI][ia] - grad[FE_PSI][ia];
@@ -495,7 +495,7 @@ __host__ __device__ int fe_ternary_fed(fe_ternary_t * fe, int index,
  *
  *  Three chemical potentials are present:
  *
- *   \mu_\rho 
+ *   \mu_\rho
  * = 1/8 kappa1  (rho + phi - psi)(rho + phi - psi - 2)(rho + phi - psi - 1)
  * - 1/8 kappa2  (rho - phi - psi)(rho - phi - psi - 2)(rho - phi - psi - 1)
  * + 1/4 alpha^2 (kappa1 + kappa2)(\Delta\psi - \Delta\phi)
@@ -528,10 +528,10 @@ __host__ __device__ int fe_ternary_mu(fe_ternary_t * fe, int index,
     double kappa1, kappa2, kappa3, alpha2;
     double krhorho, kphipsi, kpsipsi;
     double s1, s2;
-    
+
     assert(fe);
     assert(mu);
-    
+
     kappa1 = fe->param->kappa1;
     kappa2 = fe->param->kappa2;
     kappa3 = fe->param->kappa3;
@@ -543,14 +543,14 @@ __host__ __device__ int fe_ternary_mu(fe_ternary_t * fe, int index,
 
     field_scalar_array(fe->phi, index, field);
 
-    rho = 1.0;    
+    rho = 1.0;
     phi = field[FE_PHI];
     psi = field[FE_PSI];
-    
+
     field_grad_pair_delsq(fe->dphi, index, delsq);
 
     delsq_rho = 0.0;
-    
+
     /* mu_phi */
 
     s1 = (rho + phi - psi)*(rho + phi - psi - 2.0)*(rho + phi - psi - 1.0);
@@ -563,7 +563,7 @@ __host__ __device__ int fe_ternary_mu(fe_ternary_t * fe, int index,
     s1 = (rho + phi - psi)*(rho + phi - psi - 2.0)*(rho + phi - psi - 1.0);
     s2 = (rho - phi - psi)*(rho - phi - psi - 2.0)*(rho - phi - psi - 1.0);
 
-    mu[FE_PSI] = -0.125*kappa1*s1 - 0.125*kappa2*s2 
+    mu[FE_PSI] = -0.125*kappa1*s1 - 0.125*kappa2*s2
                + kappa3*psi*(psi - 1.0)*(2.0*psi - 1.0)
                + krhorho*delsq_rho - kphipsi*delsq[FE_PHI]
                - kpsipsi*delsq[FE_PSI];
@@ -602,7 +602,7 @@ __host__ __device__ int fe_ternary_str(fe_ternary_t * fe, int index,
     double krhophi, krhopsi, kphipsi;
     double drhodphi, drhodpsi, dphidpsi;
     KRONECKER_DELTA_CHAR(d);
-    
+
     assert(fe);
 
     kappa1 = fe->param->kappa1;
@@ -618,14 +618,14 @@ __host__ __device__ int fe_ternary_str(fe_ternary_t * fe, int index,
     kphipsi = - krhophi;
 
     field_scalar_array(fe->phi, index, field);
-    
+
     rho = 1.0;
     phi = field[FE_PHI];
     psi = field[FE_PSI];
     rho2 = rho*rho;
     phi2 = phi*phi;
     psi2 = psi*psi;
-    
+
     field_grad_pair_grad(fe->dphi, index, grad);
     field_grad_pair_delsq(fe->dphi, index, delsq);
 
@@ -640,8 +640,8 @@ __host__ __device__ int fe_ternary_str(fe_ternary_t * fe, int index,
 
     p1 = (kappa1 + kappa2)*
       (0.09375*(rho2*rho2 + phi2*phi2)
-       + 0.5625*(rho2*phi2 + rho2*psi2 + phi2*psi2) 
-       - 0.3750*rho*psi*(rho2 + psi2) 
+       + 0.5625*(rho2*phi2 + rho2*psi2 + phi2*psi2)
+       - 0.3750*rho*psi*(rho2 + psi2)
        + 0.75*(rho2*psi  - rho*phi2 - rho*psi2 + phi2*psi)
        - 0.25*rho2*rho + 0.125*rho2 + 0.125*phi2 - 0.25*rho*psi
        - 1.125*rho*phi2*psi);
@@ -674,7 +674,7 @@ __host__ __device__ int fe_ternary_str(fe_ternary_t * fe, int index,
     p6 = dphidpsi  + phi*delsq[FE_PSI] + psi*delsq[FE_PHI];
 
     /* Final stress */
-    
+
     for (ia = 0; ia < 3; ia++) {
       for (ib = 0; ib < 3; ib++) {
 
@@ -708,9 +708,9 @@ __host__ __device__ int fe_ternary_str_v(fe_ternary_t * fe, int index,
   int ia, ib;
   int iv;
   double s1[3][3];
-    
+
   assert(fe);
-    
+
   for (iv = 0; iv < NSIMDVL; iv++) {
     fe_ternary_str(fe, index + iv, s1);
     for (ia = 0; ia < 3; ia++) {
diff --git a/src/field_grad.c b/src/field_grad.c
index 8382a9441..456f70a28 100644
--- a/src/field_grad.c
+++ b/src/field_grad.c
@@ -56,7 +56,7 @@ __host__ int field_grad_create(pe_t * pe, field_t * f, int level,
   ifail = field_grad_init(obj);
   if (ifail != 0) {
     pe_info(pe, "field_grad: failure in int32_t indexing\n");
-    pe_fatal(pe, "The local doamin size is too large\n");
+    pe_fatal(pe, "The local domain size is too large\n");
   }
 
   *pobj = obj;
@@ -111,7 +111,7 @@ static int field_grad_init(field_grad_t * obj) {
     if (obj->delsq == NULL) pe_fatal(obj->pe, "calloc(field_grad->delsq) failed");
 
     /* Allocate data space on target (or alias) */
- 
+
     if (ndevice > 0) {
       tdpAssert( tdpMalloc((void **) &tmp, nfsz*NVECTOR*sizeof(double)) );
       tdpAssert( tdpMemcpy(&obj->target->grad, &tmp, sizeof(double *),
@@ -280,10 +280,10 @@ __host__ void field_grad_free(field_grad_t * obj) {
 
   if (ndevice > 0) {
     tdpAssert( tdpMemcpy(&tmp, &obj->target->grad, sizeof(double *),
-			 tdpMemcpyDeviceToHost) ); 
+			 tdpMemcpyDeviceToHost) );
     tdpAssert( tdpFree(tmp) );
     tdpAssert( tdpMemcpy(&tmp, &obj->target->delsq, sizeof(double *),
-			 tdpMemcpyDeviceToHost) ); 
+			 tdpMemcpyDeviceToHost) );
     tdpAssert( tdpFree(tmp) );
     tdpAssert( tdpMemcpy(&tmp, &obj->target->d_ab, sizeof(double *),
 			 tdpMemcpyDeviceToHost) );
diff --git a/src/halo_swap.c b/src/halo_swap.c
index b0a789c61..a3bd2ac3a 100644
--- a/src/halo_swap.c
+++ b/src/halo_swap.c
@@ -26,7 +26,7 @@ typedef struct halo_swap_param_s halo_swap_param_t;
 struct halo_swap_s {
   pe_t * pe;
   cs_t * cs;
-  halo_swap_param_t * param; 
+  halo_swap_param_t * param;
   double * fxlo;
   double * fxhi;
   double * fylo;
@@ -254,7 +254,7 @@ __host__ int halo_swap_create(pe_t * pe, cs_t * cs, int nhcomm, int naddr,
 
     tdpGetSymbolAddress((void **) &tmpp, tdpSymbol(const_param));
     tdpAssert( tdpMemcpy(&halo->target->param, &tmpp,
-			 sizeof(halo_swap_param_t *), tdpMemcpyHostToDevice) ); 
+			 sizeof(halo_swap_param_t *), tdpMemcpyHostToDevice) );
 
     /* Device constants */
     halo_swap_commit(halo);
@@ -714,7 +714,7 @@ __host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
   int ih, jh, kh;
   int ixlo, ixhi;
   int iylo, iyhi;
-  int izlo, izhi;  
+  int izlo, izhi;
   int m, mc, p;
   int nd, nh;
   int hsz[3];
@@ -991,7 +991,7 @@ __host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
 
   jh = halo->param->hext[Z][Y] - nh;
   kh = halo->param->hext[Y][Z] - nh - halo->param->nswap;
-  
+
   for (ic = 0; ic < halo->param->nall[X]; ic++) {
     for (jc = 0; jc < halo->param->nswap; jc++) {
       for (kc = 0; kc < halo->param->nswap; kc++) {
@@ -1229,7 +1229,7 @@ void halo_swap_unpack_rank1(halo_swap_t * halo, int id, double * data) {
       indexh = halo_swap_index(halo, ic, jc, hi + kc);
       buflo = halo->hzlo;
       bufhi = halo->hzhi;
-    } 
+    }
 
 
     if (halo->param->nb == 1) {
diff --git a/src/leesedwards.c b/src/leesedwards.c
index a5577aad6..d5ef89391 100644
--- a/src/leesedwards.c
+++ b/src/leesedwards.c
@@ -58,7 +58,7 @@ struct lees_edw_param_s {
   double uy;                /* u[Y] for all planes */
   double dx_min;            /* Position first plane */
   double dx_sep;            /* Plane separation */
-  double omega;             /* u_y = u_le cos (omega t) for oscillatory */  
+  double omega;             /* u_y = u_le cos (omega t) for oscillatory */
   double time0;             /* time offset */
 };
 
@@ -413,7 +413,7 @@ static int lees_edw_init_tables(lees_edw_t * le) {
  *  periodic halo regions.
  *
  ****************************************************************************/
- 
+
 static int lees_edw_checks(lees_edw_t * le) {
 
   int n;
@@ -512,7 +512,7 @@ int lees_edw_steady_uy(lees_edw_t * le, int ic, double * uy) {
   nplane = (int) ((le->param->dx_min + xglobal)/le->param->dx_sep);
 
   *uy = xglobal*gammadot - le->param->uy*nplane;
- 
+
   return 0;
 }
 
@@ -546,7 +546,7 @@ int lees_edw_block_uy(lees_edw_t * le, int ic, double * uy) {
   cs_nlocal_offset(le->cs, offset);
 
   /* So, just count the number of blocks from the centre L_x/2
-   * and mutliply by the plane speed. */
+   * and multiply by the plane speed. */
 
   xh = offset[X] + (double) ic - lmin[X] - 0.5*ltot[X];
   if (xh > 0.0) {
@@ -798,7 +798,7 @@ __host__ __device__ void lees_edw_index_v(lees_edw_t * le, int ic[NSIMDVL],
   }
 
   return;
-} 
+}
 
 /*****************************************************************************
  *
diff --git a/src/polar_active.c b/src/polar_active.c
index 37cb10bea..76b518663 100644
--- a/src/polar_active.c
+++ b/src/polar_active.c
@@ -8,7 +8,7 @@
  *      + (delta kappa1 / 2) (e_abc d_b P_c)^2
  *      + (kappa2/2) (d_a P_b P_c)^2
  *
- *  This is an implemetation of a free energy with vector order
+ *  This is an implementation of a free energy with vector order
  *  parameter.
  *
  *  For the time being, we demand delta = kappa2 = zero; this is until
@@ -297,7 +297,7 @@ int fe_polar_fed(fe_polar_t * fe, int index, double * fed) {
  *       - lambda [(1/2)(P_a h_b - P_b h_a) - (1/3)P_c h_c d_ab]
  *       - zeta [P_a P_b - (1/3) P_c P_c d_ab]
  *       - kappa1 d_a P_c d_b P_c
- * 
+ *
  *  This is antisymmetric. Note that extra minus sign added at
  *  the end to allow the force on the Navier Stokes to be
  *  computed as F_a = - d_b S_ab.
@@ -395,8 +395,8 @@ void fe_polar_stress_v(fe_polar_t * fe, int index, double s[3][3][NSIMDVL]) {
  *  fe_polar_mol_field
  *
  *  H_a = - A P_a - B (P_b)^2 P_a + kappa1 \nabla^2 P_a
- *        + 2 kappa2 P_c \nabla^2 P_c P_a 
- *  
+ *        + 2 kappa2 P_c \nabla^2 P_c P_a
+ *
  *****************************************************************************/
 
 __host__ __device__
diff --git a/src/surfactant.c b/src/surfactant.c
index 10b5e2a09..43ce54c02 100644
--- a/src/surfactant.c
+++ b/src/surfactant.c
@@ -17,7 +17,7 @@
  *  with
  *
  *    F_phi  = symmetric phi^4 free energy
- *    F_psi  = kT [\psi ln \psi + (1 - \psi) ln (1 - \psi)] 
+ *    F_psi  = kT [\psi ln \psi + (1 - \psi) ln (1 - \psi)]
  *    F_surf = - (1/2)\epsilon\psi (grad \phi)^2
  *             - (1/2)\beta \psi^2 (grad \phi)^2
  *    F_add  = + (1/2) W \psi \phi^2
@@ -47,7 +47,7 @@
  * a_       = -0.0208333;
  * b_       = +0.0208333;
  * kappa_   = +0.12;
- * 
+ *
  * kt_      = 0.00056587;
  * epsilon_ = 0.03;
  * beta_    = 0.0;
@@ -285,13 +285,13 @@ __host__ int fe_surf_xi0(fe_surf_t * fe, double * xi0) {
  *  fe_surf_langmuir_isotherm
  *
  *  The Langmuir isotherm psi_c is given by
- *  
+ *
  *  ln psi_c = (1/2) epsilon / (kT xi_0^2)
  *
  *  and can be a useful reference. The situation is more complex if
  *  beta is not zero (Frumpkin isotherm).
  *
- ****************************************************************************/ 
+ ****************************************************************************/
 
 __host__ int fe_surf_langmuir_isotherm(fe_surf_t * fe, double * psi_c) {
 
@@ -355,14 +355,14 @@ __host__ int fe_surf_fed(fe_surf_t * fe, int index, double * fed) {
 /****************************************************************************
  *
  *  fe_surf_mu
- * 
+ *
  *  Two chemical potentials are present:
  *
  *  \mu_\phi = A\phi + B\phi^3 - kappa \nabla^2 \phi
  *           + W\phi \psi
  *           + \epsilon (\psi \nabla^2\phi + \nabla\phi . \nabla\psi)
- *           + \beta (\psi^2 \nabla^2\phi + 2\psi \nabla\phi . \nabla\psi) 
- * 
+ *           + \beta (\psi^2 \nabla^2\phi + 2\psi \nabla\phi . \nabla\psi)
+ *
  *  \mu_\psi = kT (ln \psi - ln (1 - \psi) + (1/2) W \phi^2
  *           - (1/2) \epsilon (\nabla \phi)^2
  *           - \beta \psi (\nabla \phi)^2
@@ -422,7 +422,7 @@ __host__ int fe_surf_mu(fe_surf_t * fe, int index, double * mu) {
  *     + \epsilon \phi \psi \nabla^2 \phi
  *     + 2 \beta \phi \psi \nabla_a\phi \nabla_a\psi
  *     + \beta\phi\psi^2 \nabla^2 \phi
- *     - (1/2) \beta\psi^2 (\nabla\phi)^2  
+ *     - (1/2) \beta\psi^2 (\nabla\phi)^2
  *
  *  P_ab = (\kappa - \epsilon\psi - \beta\psi^2) \nabla_a \phi \nabla_b \phi
  *
diff --git a/src/symmetric.c b/src/symmetric.c
index 734525c24..7a651869a 100644
--- a/src/symmetric.c
+++ b/src/symmetric.c
@@ -401,7 +401,7 @@ void fe_symm_str_v(fe_symm_t * fe, int index, double s[3][3][NSIMDVL]) {
     phi = fe->phi->data[addr_rank1(fe->phi->nsites, 1, index + iv, 0)];
     delsq = fe->dphi->delsq[addr_rank1(fe->dphi->nsite, 1, index + iv, 0)];
 
-    p0 = 0.5*a*phi*phi + 0.75*b*phi*phi*phi*phi - kappa*phi*delsq 
+    p0 = 0.5*a*phi*phi + 0.75*b*phi*phi*phi*phi - kappa*phi*delsq
       - 0.5*kappa*(grad[X][iv]*grad[X][iv] + grad[Y][iv]*grad[Y][iv]
 		   + grad[Z][iv]*grad[Z][iv]);
 
diff --git a/tests/unit/test_ch.c b/tests/unit/test_ch.c
index 558d6007a..198a9751f 100644
--- a/tests/unit/test_ch.c
+++ b/tests/unit/test_ch.c
@@ -59,7 +59,7 @@ int test_ch_suite(void) {
 
 int test_ch_create(pe_t * pe) {
 
-  const int nfield = 2;  /* phi, psi   */ 
+  const int nfield = 2;  /* phi, psi   */
   const double m1 = 1.0; /* mobility 1 */
   const double m2 = 2.0; /* mobility 2 */
 
diff --git a/tests/unit/test_coords.c b/tests/unit/test_coords.c
index b1ecbf579..e478d62db 100644
--- a/tests/unit/test_coords.c
+++ b/tests/unit/test_coords.c
@@ -726,7 +726,7 @@ int test_cs_from_json(pe_t * pe) {
   assert(pe);
 
   {
-    /* Just about the bare minumum of information */
+    /* Just about the bare minimum of information */
 
     cJSON * json = cJSON_Parse("{"
 			       "\"options\": {"
diff --git a/tests/unit/test_fe_electro_symm.c b/tests/unit/test_fe_electro_symm.c
index 4dcf2bd2e..8f3f9816b 100644
--- a/tests/unit/test_fe_electro_symm.c
+++ b/tests/unit/test_fe_electro_symm.c
@@ -2,7 +2,7 @@
  *
  *  test_fe_electro_symm.c
  *
- *  Electrokinetic + symetric free energy
+ *  Electrokinetic + symmetric free energy
  *
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
diff --git a/tests/unit/test_fe_surfactant1.c b/tests/unit/test_fe_surfactant1.c
index 1f7c61c83..496704c32 100644
--- a/tests/unit/test_fe_surfactant1.c
+++ b/tests/unit/test_fe_surfactant1.c
@@ -37,7 +37,7 @@ static fe_surf_param_t pref = {-0.0208333,    /* a */
 			       +0.0208333,    /* b */
 			       0.12,          /* kappa */
 			       0.00056587,    /* kT */
-			       0.03,          /* epsilon */ 
+			       0.03,          /* epsilon */
 			       0.0,           /* beta */
 			       0.0};          /* W */
 
diff --git a/tests/unit/test_fe_ternary.c b/tests/unit/test_fe_ternary.c
index f2003cc6d..9c984ec2e 100644
--- a/tests/unit/test_fe_ternary.c
+++ b/tests/unit/test_fe_ternary.c
@@ -278,4 +278,3 @@ __host__ int test_fe_ternary_str(pe_t * pe, cs_t * cs, field_t * phi) {
 
   return 0;
 }
-
diff --git a/tests/unit/test_prop.c b/tests/unit/test_prop.c
index 075cf7945..87784b0ad 100644
--- a/tests/unit/test_prop.c
+++ b/tests/unit/test_prop.c
@@ -7,9 +7,9 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2024 Ths University of Edinburgh
+ *  (c) 2010-2024 The University of Edinburgh
  *
- *  Contributing authors: 
+ *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
  *****************************************************************************/
@@ -158,8 +158,8 @@ int do_test_velocity(pe_t * pe, cs_t * cs, int ndist, lb_halo_enum_t halo) {
  *  Check each element of the distribution has propagated exactly one
  *  lattice spacing in the appropriate direction.
  *
- *  We use the global index as the test of the soruce.
- *  
+ *  We use the global index as the test of the source.
+ *
  *****************************************************************************/
 
 int do_test_source_destination(pe_t * pe, cs_t * cs, int ndist,
diff --git a/tests/unit/test_visc_arrhenius.c b/tests/unit/test_visc_arrhenius.c
index fe90b831f..9db1d7a7f 100644
--- a/tests/unit/test_visc_arrhenius.c
+++ b/tests/unit/test_visc_arrhenius.c
@@ -121,7 +121,7 @@ int test_visc_arrhenius_update(pe_t * pe, cs_t * cs, field_t * phi) {
   const double eta_plus  = 0.5;
   const double eta_minus = 0.1;
   const double phistar   = 1.0;
-  
+
   visc_arrhenius_param_t param = {eta_minus, eta_plus, phistar};
   visc_arrhenius_t * visc = NULL;
 

From e1ae32896179771137e1ef70e6306f449e1cdcbe Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Fri, 4 Oct 2024 17:28:16 +0100
Subject: [PATCH 042/133] Add missing target

---
 config/unix-nvcc-default.mk | 1 +
 1 file changed, 1 insertion(+)

diff --git a/config/unix-nvcc-default.mk b/config/unix-nvcc-default.mk
index e1ddf4217..7fb6bb68b 100644
--- a/config/unix-nvcc-default.mk
+++ b/config/unix-nvcc-default.mk
@@ -9,6 +9,7 @@
 
 BUILD   = parallel
 MODEL   = -D_D3Q19_
+TARGET  = nvcc
 
 CC     = nvcc
 CFLAGS = -ccbin=mpicc -O2 -DADDR_SOA -arch=sm_61 -x cu -dc

From a6d35d5e5b53907d0916c006e20dc7e1c6418541 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Tue, 8 Oct 2024 10:53:49 +0100
Subject: [PATCH 043/133] add nvtx timers

---
 src/lb_data.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/lb_data.c b/src/lb_data.c
index d78fcbefc..77c414033 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -31,6 +31,8 @@
 #include "timer.h"
 #include "util.h"
 
+#include "nvtx3/nvToolsExt.h"
+
 static int lb_mpi_init(lb_t * lb);
 static int lb_model_param_init(lb_t * lb);
 static int lb_init(lb_t * lb);
@@ -569,6 +571,8 @@ __host__ int lb_halo_swap(lb_t * lb, lb_halo_enum_t flag) {
 
   assert(lb);
 
+  nvtxRangePush("halo_swap");
+
   switch (flag) {
   case LB_HALO_TARGET:
     //tdpMemcpy(&data, &lb->target->f, sizeof(double *), tdpMemcpyDeviceToHost);
@@ -589,6 +593,8 @@ __host__ int lb_halo_swap(lb_t * lb, lb_halo_enum_t flag) {
     lb_halo_wait(lb, &lb->h);
   }
 
+  nvtxRangePop();
+
   return 0;
 }
 

From a702812db09b72c1756f795e666f598de268ade9 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Thu, 10 Oct 2024 10:21:44 +0100
Subject: [PATCH 044/133] initial graph implementation

---
 src/lb_data.c | 212 +++++++++++++++++++++++++++++++++++++++++++++++---
 src/lb_data.h |  12 +++
 2 files changed, 214 insertions(+), 10 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 77c414033..e0dc2de95 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -48,6 +48,15 @@ static __constant__ lb_collide_param_t static_param;
 #include "mpi-ext.h"
 #endif
 
+#ifdef __NVCC__
+/* There are two file-scope switches here, which need to be generalised
+ * via some suitable interface; they are separate, but both relate to
+ * GPU execution. */
+static const int have_graph_api_ = 1;
+#else
+static const int have_graph_api_ = 0;
+#endif
+
 #if defined (MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
 static const int have_gpu_aware_mpi_ = 1;
 #else
@@ -1245,6 +1254,11 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
 			 tdpMemcpyHostToDevice) );
     tdpAssert( tdpMemcpy(h->target->recv, h->recv_d, 27*sizeof(double *),
 			 tdpMemcpyHostToDevice) );
+    
+    if (have_graph_api_) {
+      lb_graph_halo_send_create(lb, h, send_count);
+      lb_graph_halo_recv_create(lb, h, recv_count);
+    }
 
   }
   free(send_count);
@@ -1309,11 +1323,16 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
     copyModelToDevice(&h->map, &h->target->map);
     for (int ireq = 0; ireq < h->map.nvel; ireq++) {
       if (h->count[ireq] > 0) {
-        int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
-        dim3 nblk, ntpb;
-        kernel_launch_param(scount, &nblk, &ntpb);
-        tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
-        tdpDeviceSynchronize();
+        if (have_graph_api_) {
+          tdpAssert( tdpGraphLaunch(h->gsend.exec, h->stream) );
+          tdpAssert( tdpStreamSynchronize(h->stream) );
+        } else {
+          int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
+          dim3 nblk, ntpb;
+          kernel_launch_param(scount, &nblk, &ntpb);
+          tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
+          tdpDeviceSynchronize();
+        }
       }
     }
   } else {
@@ -1379,11 +1398,16 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
   if (ndevice > 0) {
     for (int ireq = 0; ireq < h->map.nvel; ireq++) {
       if (h->count[ireq] > 0) {
-        int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
-        dim3 nblk, ntpb;
-        kernel_launch_param(rcount, &nblk, &ntpb);
-        tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
-        tdpDeviceSynchronize();
+        if (have_graph_api_) {
+          tdpAssert( tdpGraphLaunch(h->grecv.exec, h->stream) );
+          tdpAssert( tdpStreamSynchronize(h->stream) );
+        } else {
+          int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
+          dim3 nblk, ntpb;
+          kernel_launch_param(rcount, &nblk, &ntpb);
+          tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
+          tdpDeviceSynchronize();
+        }
       }
     }
   } else {
@@ -1433,6 +1457,11 @@ int lb_halo_free(lb_t * lb, lb_halo_t * h) {
     free(h->recv[ireq]);
   }
 
+  if (have_graph_api_) {
+    tdpAssert( tdpGraphDestroy(h->gsend.graph) );
+    tdpAssert( tdpGraphDestroy(h->grecv.graph) );
+  }
+
   lb_model_free(&h->map);
 
   return 0;
@@ -1706,3 +1735,166 @@ int lb_io_read(lb_t * lb, int timestep, io_event_t * event) {
 
   return ifail;
 }
+
+/*****************************************************************************
+ *  lb_graph_halo_send_create: create and instantiate h->gsend. For each
+ *  ireq > 0, add one lb_halo_enqueue_send_kernel node (no dependencies),
+ *  sized from send_count[ireq]*lb_halo_size(h->slim[ireq]). Returns 0.
+ *****************************************************************************/
+
+int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count) {
+
+  assert(lb);
+  assert(h);
+
+  tdpAssert( tdpGraphCreate(&h->gsend.graph, 0) );
+
+  for (int ireq = 1; ireq < h->map.nvel; ireq++) {  /* no node for ireq = 0 */
+    tdpGraphNode_t kernelNode;
+    tdpKernelNodeParams kernelNodeParams = {0};
+    void * kernelArgs[3] = {(void *) &lb->target,
+                            (void *) &h->target,
+                            (void *) &ireq};
+    kernelNodeParams.func = (void *) lb_halo_enqueue_send_kernel;
+    dim3 nblk;
+    dim3 ntpb;
+    int scount = send_count[ireq]*lb_halo_size(h->slim[ireq]);
+
+    kernel_launch_param(scount, &nblk, &ntpb);
+
+    kernelNodeParams.gridDim        = nblk;
+    kernelNodeParams.blockDim       = ntpb;
+    kernelNodeParams.sharedMemBytes = 0;
+    kernelNodeParams.kernelParams   = (void **) kernelArgs;
+    kernelNodeParams.extra          = NULL;
+
+    tdpAssert( tdpGraphAddKernelNode(&kernelNode, h->gsend.graph, NULL, 0,
+				     &kernelNodeParams) );
+
+    if (have_gpu_aware_mpi_) {
+      /* No device -> host copy node is needed in the graph in this case */
+    }
+//    else {
+//      /* We do need to add the memcpys to the graph definition
+//       * (except messages to self... ) */
+//
+//      int i = 1 + h->cv[h->nvel - ireq][X];
+//      int j = 1 + h->cv[h->nvel - ireq][Y];
+//      int k = 1 + h->cv[h->nvel - ireq][Z];
+//
+//      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
+//	tdpGraphNode_t memcpyNode;
+//        tdpMemcpy3DParms memcpyParams = {0};
+//
+//	memcpyParams.srcArray = NULL;
+//	memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
+//	memcpyParams.srcPtr   = make_tdpPitchedPtr(h->send_d[ireq],
+//						   sizeof(double)*scount,
+//						   scount, 1);
+//	memcpyParams.dstArray = NULL;
+//	memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
+//	memcpyParams.dstPtr   = make_tdpPitchedPtr(h->send[ireq],
+//						   sizeof(double)*scount,
+//						   scount, 1);
+//	memcpyParams.extent   = make_tdpExtent(sizeof(double)*scount, 1, 1);
+//	memcpyParams.kind     = tdpMemcpyDeviceToHost;
+//
+//	tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->gsend.graph,
+//					 &kernelNode, 1, &memcpyParams) );
+//      }
+//    }
+  }
+
+  tdpAssert( tdpGraphInstantiate(&h->gsend.exec, h->gsend.graph, 0) );
+
+  return 0;
+}
+
+/*****************************************************************************
+ *  lb_graph_halo_recv_create: create and instantiate h->grecv. For each
+ *  ireq > 0, add one lb_halo_dequeue_recv_kernel node (no dependencies),
+ *  sized from recv_count[ireq]*lb_halo_size(h->rlim[ireq]). Returns 0.
+ *****************************************************************************/
+
+int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count) {
+
+  assert(lb);
+  assert(h);
+
+  tdpAssert( tdpGraphCreate(&h->grecv.graph, 0) );
+
+  for (int ireq = 1; ireq < h->map.nvel; ireq++) {
+    int rcount = recv_count[ireq]*lb_halo_size(h->rlim[ireq]);
+    tdpGraphNode_t memcpyNode = {0};  /* only used by the disabled code */
+
+    if (have_gpu_aware_mpi_) {
+      /* Don't need explicit copies */
+    }
+//    else {
+//      int i = 1 + h->cv[h->nvel - ireq][X];
+//      int j = 1 + h->cv[h->nvel - ireq][Y];
+//      int k = 1 + h->cv[h->nvel - ireq][Z];
+//
+//      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
+//	tdpMemcpy3DParms memcpyParams = {0};
+//
+//	memcpyParams.srcArray = NULL;
+//	memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
+//	memcpyParams.srcPtr   = make_tdpPitchedPtr(h->recv[ireq],
+//						   sizeof(double)*rcount,
+//						   rcount, 1);
+//	memcpyParams.dstArray = NULL;
+//	memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
+//	memcpyParams.dstPtr   = make_tdpPitchedPtr(h->recv_d[ireq],
+//						   sizeof(double)*rcount,
+//						   rcount, 1);
+//	memcpyParams.extent   = make_tdpExtent(sizeof(double)*rcount, 1, 1);
+//	memcpyParams.kind     = tdpMemcpyHostToDevice;
+//
+//	tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->grecv.graph, NULL,
+//					 0, &memcpyParams) );
+//      }
+//    }
+
+    /* Always need the disaggregation kernel */
+
+    dim3 nblk;
+    dim3 ntpb;
+    tdpGraphNode_t node;
+    tdpKernelNodeParams kernelNodeParams = {0};
+    void * kernelArgs[3] = {(void *) &lb->target,
+                            (void *) &h->target,
+                            (void *) &ireq};
+    kernelNodeParams.func = (void *) lb_halo_dequeue_recv_kernel;
+
+    kernel_launch_param(rcount, &nblk, &ntpb);
+
+    kernelNodeParams.gridDim        = nblk;
+    kernelNodeParams.blockDim       = ntpb;
+    kernelNodeParams.sharedMemBytes = 0;
+    kernelNodeParams.kernelParams   = (void **) kernelArgs;
+    kernelNodeParams.extra          = NULL;
+
+    /* No memcpy node is added at present (see above), so the kernel
+     * has no dependency whether or not MPI is GPU-aware. */
+
+    tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, NULL, 0,
+				     &kernelNodeParams) );
+
+//    If the memcpy nodes above are reinstated, a message from a remote
+//    rank must make the kernel depend on its memcpy node instead:
+//
+//      int i = 1 + h->cv[h->nvel - ireq][X];
+//      int j = 1 + h->cv[h->nvel - ireq][Y];
+//      int k = 1 + h->cv[h->nvel - ireq][Z];
+//      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
+//	tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, &memcpyNode,
+//					 1, &kernelNodeParams) );
+//      }
+//      else: no dependency, as in the call above.
+  }
+
+  tdpAssert( tdpGraphInstantiate(&h->grecv.exec, h->grecv.graph, 0) );
+
+  return 0;
+}
diff --git a/src/lb_data.h b/src/lb_data.h
index db60fe4d6..27ba74920 100644
--- a/src/lb_data.h
+++ b/src/lb_data.h
@@ -48,6 +48,12 @@ enum {NDIM = 3, NVEL = 27};
 typedef struct lb_collide_param_s lb_collide_param_t;
 typedef struct lb_halo_s lb_halo_t;
 typedef struct lb_data_s lb_t;
+typedef struct lb_graph_halo_s lb_graph_halo_t;
+
+struct lb_graph_halo_s {
+  tdpGraph_t graph;               /* graph definition (nodes are added to this) */
+  tdpGraphExec_t exec;            /* executable instantiated from graph */
+};
 
 struct lb_collide_param_s {
   int8_t isghost;                      /* switch for ghost modes */
@@ -92,6 +98,9 @@ struct lb_halo_s {
   lb_halo_t * target;
   double * send_d[27];            /* halo: device send buffer per direction */
   double * recv_d[27];            /* halo: device recv buffer per direction */
+  
+  lb_graph_halo_t gsend;          /* Graph API halo swap */
+  lb_graph_halo_t grecv;
 };
 
 int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme);
@@ -182,4 +191,7 @@ __host__ int lb_io_aggr_unpack(lb_t * lb, const io_aggregator_t * aggr);
 __host__ int lb_io_write(lb_t * lb, int timestep, io_event_t * event);
 __host__ int lb_io_read(lb_t * lb, int timestep, io_event_t * event);
 
+int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count);
+int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count);
+
 #endif

From 63b4aea89a62af3acda1045a35565c8d221c50eb Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Fri, 11 Oct 2024 18:43:30 +0100
Subject: [PATCH 045/133] debugging graph api implementation

---
 src/lb_data.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++---
 src/lb_data.h |  14 ++++
 2 files changed, 217 insertions(+), 11 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 77c414033..33e5ddbf6 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -48,6 +48,15 @@ static __constant__ lb_collide_param_t static_param;
 #include "mpi-ext.h"
 #endif
 
+#ifdef __NVCC__
+/* There are two file-scope switches here, which need to be generalised
+ * via some suitable interface; they are separate, but both relate to
+ * GPU execution. */
+static const int have_graph_api_ = 1;
+#else
+static const int have_graph_api_ = 0;
+#endif
+
 #if defined (MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
 static const int have_gpu_aware_mpi_ = 1;
 #else
@@ -1246,10 +1255,15 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
     tdpAssert( tdpMemcpy(h->target->recv, h->recv_d, 27*sizeof(double *),
 			 tdpMemcpyHostToDevice) );
 
+    if (have_graph_api_) {
+      lb_graph_halo_send_create(lb, h, send_count);
+      lb_graph_halo_recv_create(lb, h, recv_count);
+    }
+
   }
   free(send_count);
   free(recv_count);
-
+  
   return 0;
 }
 
@@ -1309,11 +1323,16 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
     copyModelToDevice(&h->map, &h->target->map);
     for (int ireq = 0; ireq < h->map.nvel; ireq++) {
       if (h->count[ireq] > 0) {
-        int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
-        dim3 nblk, ntpb;
-        kernel_launch_param(scount, &nblk, &ntpb);
-        tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
-        tdpDeviceSynchronize();
+        if (have_graph_api_) {
+          tdpAssert( tdpGraphLaunch(h->gsend.exec, h->stream) );
+          tdpAssert( tdpStreamSynchronize(h->stream) );
+        } else {
+          int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
+          dim3 nblk, ntpb;
+          kernel_launch_param(scount, &nblk, &ntpb);
+          tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
+          tdpDeviceSynchronize();
+        }
       }
     }
   } else {
@@ -1379,11 +1398,16 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
   if (ndevice > 0) {
     for (int ireq = 0; ireq < h->map.nvel; ireq++) {
       if (h->count[ireq] > 0) {
-        int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
-        dim3 nblk, ntpb;
-        kernel_launch_param(rcount, &nblk, &ntpb);
-        tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
-        tdpDeviceSynchronize();
+        if (have_graph_api_) {
+          tdpAssert( tdpGraphLaunch(h->grecv.exec, h->stream) );
+          tdpAssert( tdpStreamSynchronize(h->stream) );
+        } else {
+          int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
+          dim3 nblk, ntpb;
+          kernel_launch_param(rcount, &nblk, &ntpb);
+          tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
+          tdpDeviceSynchronize();
+        }
       }
     }
   } else {
@@ -1433,6 +1457,11 @@ int lb_halo_free(lb_t * lb, lb_halo_t * h) {
     free(h->recv[ireq]);
   }
 
+  if (have_graph_api_) {
+    tdpAssert( tdpGraphDestroy(h->gsend.graph) );
+    tdpAssert( tdpGraphDestroy(h->grecv.graph) );
+  }
+
   lb_model_free(&h->map);
 
   return 0;
@@ -1706,3 +1735,166 @@ int lb_io_read(lb_t * lb, int timestep, io_event_t * event) {
 
   return ifail;
 }
+
+/*****************************************************************************
+ *
+ *  lb_graph_halo_send_create
+ *
+ *  Build h->gsend.graph with one independent pack-kernel node for each
+ *  direction that has data, and instantiate h->gsend.exec from it.
+ *
+ *****************************************************************************/
+
+int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count) {
+
+  assert(lb);
+  assert(h);
+
+  tdpAssert( tdpGraphCreate(&h->gsend.graph, 0) );
+
+  for (int ireq = 1; ireq < h->map.nvel; ireq++) {
+    tdpGraphNode_t kernelNode;
+    tdpKernelNodeParams kernelNodeParams = {0};
+    void * kernelArgs[3] = {(void *) &lb->target,
+                            (void *) &h->target,
+                            (void *) &ireq};
+    dim3 nblk, ntpb;
+    int scount = send_count[ireq]*lb_halo_size(h->slim[ireq]);
+
+    if (scount <= 0) continue;    /* nothing to pack: add no node */
+    kernel_launch_param(scount, &nblk, &ntpb);
+
+    kernelNodeParams.func           = (void *) lb_halo_enqueue_send_kernel;
+    kernelNodeParams.gridDim        = nblk;
+    kernelNodeParams.blockDim       = ntpb;
+    kernelNodeParams.sharedMemBytes = 0;
+    kernelNodeParams.kernelParams   = (void **) kernelArgs;
+    kernelNodeParams.extra          = NULL;
+
+    tdpAssert( tdpGraphAddKernelNode(&kernelNode, h->gsend.graph, NULL, 0,
+				     &kernelNodeParams) );
+
+    /* GPU-aware MPI needs no device -> host copy; otherwise (disabled): */
+//    if (!have_gpu_aware_mpi_) {
+//      /* We do need to add the memcpys to the graph definition
+//       * (except messages to self... ) */
+//
+//      int i = 1 + h->cv[h->nvel - ireq][X];
+//      int j = 1 + h->cv[h->nvel - ireq][Y];
+//      int k = 1 + h->cv[h->nvel - ireq][Z];
+//
+//      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
+//	tdpGraphNode_t memcpyNode;
+//        tdpMemcpy3DParms memcpyParams = {0};
+//
+//	memcpyParams.srcArray = NULL;
+//	memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
+//	memcpyParams.srcPtr   = make_tdpPitchedPtr(h->send_d[ireq],
+//						   sizeof(double)*scount,
+//						   scount, 1);
+//	memcpyParams.dstArray = NULL;
+//	memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
+//	memcpyParams.dstPtr   = make_tdpPitchedPtr(h->send[ireq],
+//						   sizeof(double)*scount,
+//						   scount, 1);
+//	memcpyParams.extent   = make_tdpExtent(sizeof(double)*scount, 1, 1);
+//	memcpyParams.kind     = tdpMemcpyDeviceToHost;
+//
+//	tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->gsend.graph,
+//					 &kernelNode, 1, &memcpyParams) );
+//      }
+//    }
+  }
+
+  tdpAssert( tdpGraphInstantiate(&h->gsend.exec, h->gsend.graph, 0) );
+
+  return 0;
+}
+
+/*****************************************************************************
+ *
+ *  lb_graph_halo_recv_create
+ *
+ *  Build and instantiate h->grecv: one unpack-kernel node per direction.
+ *
+ *****************************************************************************/
+
+int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count) {
+
+  assert(lb);
+  assert(h);
+
+  tdpAssert( tdpGraphCreate(&h->grecv.graph, 0) );
+
+  for (int ireq = 1; ireq < h->map.nvel; ireq++) {
+    int rcount = recv_count[ireq]*lb_halo_size(h->rlim[ireq]);
+    tdpGraphNode_t memcpyNode = {0};   /* only referenced by disabled code */
+    if (rcount <= 0) continue;         /* nothing to unpack: add no node */
+    if (have_gpu_aware_mpi_) {
+      /* Don't need explicit copies */
+    }
+//    else {
+//      int i = 1 + h->cv[h->nvel - ireq][X];
+//      int j = 1 + h->cv[h->nvel - ireq][Y];
+//      int k = 1 + h->cv[h->nvel - ireq][Z];
+//
+//      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
+//	tdpMemcpy3DParms memcpyParams = {0};
+//
+//	memcpyParams.srcArray = NULL;
+//	memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
+//	memcpyParams.srcPtr   = make_tdpPitchedPtr(h->recv[ireq],
+//						   sizeof(double)*rcount,
+//						   rcount, 1);
+//	memcpyParams.dstArray = NULL;
+//	memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
+//	memcpyParams.dstPtr   = make_tdpPitchedPtr(h->recv_d[ireq],
+//						   sizeof(double)*rcount,
+//						   rcount, 1);
+//	memcpyParams.extent   = make_tdpExtent(sizeof(double)*rcount, 1, 1);
+//	memcpyParams.kind     = tdpMemcpyHostToDevice;
+//
+//	tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->grecv.graph, NULL,
+//					 0, &memcpyParams) );
+//      }
+//    }
+
+    /* Always need the disaggregation kernel */
+    dim3 nblk, ntpb;
+    tdpGraphNode_t node;
+    tdpKernelNodeParams kernelNodeParams = {0};
+    void * kernelArgs[3] = {(void *) &lb->target,
+                            (void *) &h->target,
+                            (void *) &ireq};
+    kernelNodeParams.func = (void *) lb_halo_dequeue_recv_kernel;
+
+    kernel_launch_param(rcount, &nblk, &ntpb);
+
+    kernelNodeParams.gridDim        = nblk;
+    kernelNodeParams.blockDim       = ntpb;
+    kernelNodeParams.sharedMemBytes = 0;
+    kernelNodeParams.kernelParams   = (void **) kernelArgs;
+    kernelNodeParams.extra          = NULL;
+
+    /* No memcpy nodes exist at present (code disabled above), so there is
+     * no dependency in either MPI case; disabled code is the intended form */
+    tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, NULL,
+				       0, &kernelNodeParams) );
+//    else {
+//      int i = 1 + h->cv[h->nvel - ireq][X];
+//      int j = 1 + h->cv[h->nvel - ireq][Y];
+//      int k = 1 + h->cv[h->nvel - ireq][Z];
+//      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
+//	tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, &memcpyNode,
+//					 1, &kernelNodeParams) );
+//      }
+//      else {
+//	tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, NULL, 0,
+//					 &kernelNodeParams) );
+//      }
+//    }
+  }
+
+  tdpAssert( tdpGraphInstantiate(&h->grecv.exec, h->grecv.graph, 0) );
+
+  return 0;
+}
\ No newline at end of file
diff --git a/src/lb_data.h b/src/lb_data.h
index db60fe4d6..35276221f 100644
--- a/src/lb_data.h
+++ b/src/lb_data.h
@@ -49,6 +49,14 @@ typedef struct lb_collide_param_s lb_collide_param_t;
 typedef struct lb_halo_s lb_halo_t;
 typedef struct lb_data_s lb_t;
 
+typedef struct lb_graph_halo_s lb_graph_halo_t;
+
+struct lb_graph_halo_s {
+  tdpGraph_t graph;               /* graph definition (nodes are added to this) */
+  tdpGraphExec_t exec;            /* executable instantiated from graph */
+};
+
+
 struct lb_collide_param_s {
   int8_t isghost;                      /* switch for ghost modes */
   int8_t cv[27][3];
@@ -92,6 +100,9 @@ struct lb_halo_s {
   lb_halo_t * target;
   double * send_d[27];            /* halo: device send buffer per direction */
   double * recv_d[27];            /* halo: device recv buffer per direction */
+  
+  lb_graph_halo_t gsend;     /* Graph API halo swap */
+  lb_graph_halo_t grecv;
 };
 
 int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme);
@@ -182,4 +193,7 @@ __host__ int lb_io_aggr_unpack(lb_t * lb, const io_aggregator_t * aggr);
 __host__ int lb_io_write(lb_t * lb, int timestep, io_event_t * event);
 __host__ int lb_io_read(lb_t * lb, int timestep, io_event_t * event);
 
+int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count);
+int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count);
+
 #endif

From 144bcff17c5650774f13631925851e729753680f Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Wed, 23 Oct 2024 19:54:40 +0100
Subject: [PATCH 046/133] Add dynamic update for 2d disks

---
 src/bbl.c                                 | 110 +++++++++++++++-
 src/colloids_rt.c                         |  16 ++-
 src/util.c                                |  40 ++++++
 src/util.h                                |   2 +
 tests/regression/d2q9/serial-disk-s01.inp |  81 ++++++++++++
 tests/regression/d2q9/serial-disk-s01.log | 150 ++++++++++++++++++++++
 6 files changed, 394 insertions(+), 5 deletions(-)
 create mode 100644 tests/regression/d2q9/serial-disk-s01.inp
 create mode 100644 tests/regression/d2q9/serial-disk-s01.log

diff --git a/src/bbl.c b/src/bbl.c
index c75f8ed9a..689a11c78 100644
--- a/src/bbl.c
+++ b/src/bbl.c
@@ -66,6 +66,8 @@ __global__ void bbl_pass0_kernel(kernel_3d_t k3d, cs_t * cs, lb_t * lb,
 
 static __constant__ lb_collide_param_t lbp;
 
+int bbl_update_colloid_disk(bbl_t * bbl, colloid_t * pc,
+			    double rho0, double xb[6]);
 int bbl_update_colloid_default(bbl_t * bbl, wall_t * wall, colloid_t * pc,
 			       double rho0, double xb[6]);
 int bbl_update_ellipsoid(bbl_t * bbl, wall_t * wall, colloid_t * pc,
@@ -915,6 +917,10 @@ int bbl_update_colloids(bbl_t * bbl, wall_t * wall, colloids_info_t * cinfo) {
 
     if (pc->s.bc != COLLOID_BC_BBL) continue;
 
+    if (pc->s.shape == COLLOID_SHAPE_DISK) {
+      iret = bbl_update_colloid_disk(bbl, pc, rho0, xb);
+    }
+
     if (pc->s.shape == COLLOID_SHAPE_SPHERE) {
       iret = bbl_update_colloid_default(bbl, wall, pc, rho0, xb);
     }
@@ -958,6 +964,103 @@ int bbl_update_colloids(bbl_t * bbl, wall_t * wall, colloids_info_t * cinfo) {
   return 0;
 }
 
+/*****************************************************************************
+ *
+ *  bbl_update_colloid_disk
+ *
+ *  Form and solve the 6x6 velocity update for one disk: the matrix is
+ *  pc->zeta (symmetric) plus mass/moment on the diagonal. On entry xb
+ *  is overwritten with the right-hand side; it is then passed with the
+ *  matrix to bbl_6x6_gaussian_elimination(), whose result is returned.
+ *  The full 3d problem is used here; a purely (x,y) version is possible.
+ *
+ *****************************************************************************/
+
+int bbl_update_colloid_disk(bbl_t * bbl, colloid_t * pc,
+			    double rho0, double xb[6]) {
+  int iret = 0;
+
+  double mass    = 0.0;    /* rho0 pi a0^2 (disk of radius a0) */
+  double moment  = 0.0;    /* (1/4) mass a0^2; see note below */
+  double a[6][6] = {0};
+
+  PI_DOUBLE(pi);
+
+  assert(bbl);
+  assert(pc);
+
+  /* Set up the matrix problem and solve it here. */
+
+  /* Mass and moment of inertia for a disk of the input radius a0.
+   * NOTE(review): (1/2) mass a0^2 is the uniform-disk value about the
+   * z-axis; (1/4) is about a diameter - confirm which is intended. */
+  {
+    double a0 = pc->s.a0;
+    mass      = pi*rho0*a0*a0;
+    moment    = (1.0/4.0)*mass*a0*a0;
+  }
+
+  /* Upper triangle from zeta[0..20] (row by row); inertia on the diagonal */
+
+  a[0][0] = mass +   pc->zeta[0];
+  a[0][1] =          pc->zeta[1];
+  a[0][2] =          pc->zeta[2];
+  a[0][3] =          pc->zeta[3];
+  a[0][4] =          pc->zeta[4];
+  a[0][5] =          pc->zeta[5];
+  a[1][1] = mass +   pc->zeta[6];
+  a[1][2] =          pc->zeta[7];
+  a[1][3] =          pc->zeta[8];
+  a[1][4] =          pc->zeta[9];
+  a[1][5] =          pc->zeta[10];
+  a[2][2] = mass +   pc->zeta[11];
+  a[2][3] =          pc->zeta[12];
+  a[2][4] =          pc->zeta[13];
+  a[2][5] =          pc->zeta[14];
+  a[3][3] = moment + pc->zeta[15];
+  a[3][4] =          pc->zeta[16];
+  a[3][5] =          pc->zeta[17];
+  a[4][4] = moment + pc->zeta[18];
+  a[4][5] =          pc->zeta[19];
+  a[5][5] = moment + pc->zeta[20];
+
+  /* Lower triangle: copy of the upper triangle (matrix is symmetric) */
+
+  a[1][0] = a[0][1];
+  a[2][0] = a[0][2];
+  a[2][1] = a[1][2];
+  a[3][0] = a[0][3];
+  a[3][1] = a[1][3];
+  a[3][2] = a[2][3];
+  a[4][0] = a[0][4];
+  a[4][1] = a[1][4];
+  a[4][2] = a[2][4];
+  a[4][3] = a[3][4];
+  a[5][0] = a[0][5];
+  a[5][1] = a[1][5];
+  a[5][2] = a[2][5];
+  a[5][3] = a[3][5];
+  a[5][4] = a[4][5];
+
+  /* Form the right-hand side: xb[0..2] linear part, xb[3..5] angular part */
+
+  for (int ia = 0; ia < 3; ia++) {
+    xb[ia]   = mass*pc->s.v[ia]   + pc->f0[ia] + pc->force[ia];
+    xb[3+ia] = moment*pc->s.w[ia] + pc->t0[ia] + pc->torque[ia];
+  }
+
+  /* Contribution to mass conservation from squirmer */
+
+  for (int ia = 0; ia < 3; ia++) {
+    xb[ia]   += pc->fc0[ia];
+    xb[3+ia] += pc->tc0[ia];
+  }
+
+  iret = bbl_6x6_gaussian_elimination(a, xb);
+
+  return iret;
+}
+
 /*****************************************************************************
  *
  *  bbl_update_colloids_default
@@ -1404,8 +1507,11 @@ static int bbl_wall_lubrication_account(bbl_t * bbl, wall_t * wall,
     else if (pc->s.shape == COLLOID_SHAPE_ELLIPSOID) {
       bbl_wall_lubr_correction_ellipsoid(bbl, wall, pc, dwall);
     }
-    else {
-      /* Specifically, no COLLOID_SHAPE_DISK */
+    else if (pc->s.shape == COLLOID_SHAPE_DISK) {
+      /* There is current no correction available; dwall = 0 */
+      ;
+    } else {
+      /* internal error */
       assert(0);
     }
     f[X] -= pc->s.v[X]*dwall[X];
diff --git a/src/colloids_rt.c b/src/colloids_rt.c
index 83d5c6e1b..e0d092cbf 100644
--- a/src/colloids_rt.c
+++ b/src/colloids_rt.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2014-2023 The University of Edinburgh
+ *  (c) 2014-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -67,7 +67,9 @@ int colloids_rt_init_random(pe_t * pe, cs_t * cs, rt_t * rt, wall_t * wall,
 int colloids_rt_state_stub(pe_t * pe, rt_t * rt, colloids_info_t * cinfo,
 			   const char * stub,
 			   colloid_state_t * state);
-int colloids_rt_cell_list_checks(pe_t * pe, cs_t * cs, colloids_info_t ** pinfo,
+int colloids_rt_cell_list_checks(pe_t * pe, cs_t * cs,
+				 const lb_model_t * model,
+				 colloids_info_t ** pinfo,
 				 interact_t * interact);
 
 /*****************************************************************************
@@ -154,7 +156,7 @@ int colloids_init_rt(pe_t * pe, rt_t * rt, cs_t * cs, colloids_info_t ** pinfo,
 
   wall_ss_cut_init(pe, cs, rt, wall, *interact);
 
-  colloids_rt_cell_list_checks(pe, cs, pinfo, *interact);
+  colloids_rt_cell_list_checks(pe, cs, model, pinfo, *interact);
   colloids_init_halo_range_check(pe, cs, *pinfo);
   if (nc > 1) interact_range_check(*interact, *pinfo);
 
@@ -777,12 +779,17 @@ int colloids_rt_gravity(pe_t * pe, rt_t * rt, colloids_info_t * cinfo) {
  *  For given set of colloids in the default cell list, and given
  *  interactions, work out what the best cell list size is.
  *
+ *  The lb_model_t is included here to get the dimensionsality;
+ *  in priciple one could have an entirely separate procedure for
+ *  two-dimensional systems of disks.
+ *
  *  The cell width should be as small as possible to prevent
  *  unnecessary halo transfers.
  *
  *****************************************************************************/
 
 int colloids_rt_cell_list_checks(pe_t * pe, cs_t * cs,
+				 const lb_model_t * model,
 				 colloids_info_t ** pinfo,
 				 interact_t * interact) {
   int nc;
@@ -843,6 +850,9 @@ int colloids_rt_cell_list_checks(pe_t * pe, cs_t * cs,
     pe_info(pe, "Centre-centre interaction:   %14.7e\n", rcmax);
   }
 
+  /* If we have 2d disks, then prevent nbest[Z] going to zero... */
+  if (model->ndim == 2) nbest[Z] = imax(1, nbest[Z]);
+
   /* Transfer colloids to new cell list if required */
 
   if (nbest[X] > 2 || nbest[Y] > 2 || nbest[Z] > 2) {
diff --git a/src/util.c b/src/util.c
index 693ba4bb7..6bf3074cc 100644
--- a/src/util.c
+++ b/src/util.c
@@ -442,6 +442,46 @@ static __host__ void util_swap(int ia, int ib, double a[3], double b[3][3]) {
   return;
 }
 
+/*****************************************************************************
+ *
+ *  util_discrete_area_disk
+ *
+ *  Count the unit lattice sites lying strictly inside a disk of radius
+ *  a0 centred at r0; the count goes to *vn. Returns -1 if vn is NULL.
+ *
+ *****************************************************************************/
+
+int util_discrete_area_disk(double a0, const double r0[2], double * vn) {
+
+  if (vn == NULL) return -1;
+
+  /* Only the fractional part of the position matters: 0 <= x < 1 etc */
+  const double xfrac = r0[X] - floor(r0[X]);
+  const double yfrac = r0[Y] - floor(r0[Y]);
+
+  /* Sites more than ceil(a0) away from the origin cannot be inside */
+  const int nmax = ceil(a0);
+  const double asq = a0*a0;
+  double nsite = 0.0;
+
+  assert(0.0 <= xfrac && xfrac < 1.0);
+  assert(0.0 <= yfrac && yfrac < 1.0);
+
+  /* Square search region [-nmax, nmax]^2 around the reduced centre */
+
+  for (int ic = -nmax; ic <= nmax; ic++) {
+    const double dxsq = pow(1.0*ic - xfrac, 2);
+    for (int jc = -nmax; jc <= nmax; jc++) {
+      const double rsq = dxsq + pow(1.0*jc - yfrac, 2);
+      if (rsq < asq) nsite += 1.0;
+    }
+  }
+
+  *vn = nsite;
+
+  return 0;
+}
+
 /*****************************************************************************
  *
  *  util_discrete_volume_sphere
diff --git a/src/util.h b/src/util.h
index 86f6e7a7b..60fad8502 100644
--- a/src/util.h
+++ b/src/util.h
@@ -51,6 +51,8 @@ __host__ __device__ double dmax(const double a, const double b);
 
 __host__ int util_jacobi(double a[3][3], double vals[3], double vecs[3][3]);
 __host__ int util_jacobi_sort(double a[3][3], double vals[3], double vecs[3][3]);
+
+int util_discrete_area_disk(double a0, const double r0[2], double * area);
 __host__ int util_discrete_volume_sphere(const double r0[3], double a0,
 					 double * vn);
 __host__ int util_gauss_jordan(const int n, double * a, double * b);
diff --git a/tests/regression/d2q9/serial-disk-s01.inp b/tests/regression/d2q9/serial-disk-s01.inp
new file mode 100644
index 000000000..a6828bb79
--- /dev/null
+++ b/tests/regression/d2q9/serial-disk-s01.inp
@@ -0,0 +1,81 @@
+#############################################################################
+#
+#  Provisional test for sedimentation of 2d disk
+#
+#  We should have system size in z direction equal to unity.
+#  a periodicity of 1_1_0 and colloid shape "disk".
+#
+##############################################################################
+
+N_start  0
+N_cycles 100
+
+##############################################################################
+#
+#  System
+# 
+##############################################################################
+
+size         32_32_1
+periodicity  1_1_0
+
+##############################################################################
+#
+#  Fluid parameters
+#  
+##############################################################################
+
+fluid_rho0 1.0
+
+viscosity 0.625
+force 0.0_0.0_0.0
+
+isothermal_fluctuations off
+temperature 0.0
+
+
+##############################################################################
+#
+#  Free energy parameters
+#
+###############################################################################
+
+free_energy none
+
+
+colloid_init              input_one
+
+colloid_one_shape         disk
+colloid_one_active        no
+colloid_one_a0            3.63
+colloid_one_ah            3.63
+colloid_one_r             16.0_16.0_1.0
+colloid_one_v             0.0_0.0_0.0
+colloid_one_m             1.0_0.0_0.0
+colloid_one_b1            0.0
+colloid_one_b2            0.0
+
+
+
+# Constant body force on all colloids ("gravity") [default is zero]
+# Uniform magnetic field [default is zero]
+
+colloid_gravity  0.000001_0.0_0.0
+
+###############################################################################
+#
+#  Walls / boundaries
+#
+###############################################################################
+
+boundary_walls 0_0_0
+
+###############################################################################
+#
+#  Output frequency and type
+#
+###############################################################################
+
+freq_statistics 100
+config_at_end   no
+
diff --git a/tests/regression/d2q9/serial-disk-s01.log b/tests/regression/d2q9/serial-disk-s01.log
new file mode 100644
index 000000000..54e609707
--- /dev/null
+++ b/tests/regression/d2q9/serial-disk-s01.log
@@ -0,0 +1,150 @@
+Welcome to: Ludwig v0.22.0 (Serial version running on 1 process)
+Git commit: 517a3f0aa177342b9e70760f289c9bdd27e69b1d
+
+Start time: Wed Oct 23 19:51:16 2024
+
+Compiler:
+  name:           Gnu 14.1.0
+  version-string: 14.1.0
+  options:        -O2 -g -Wall -Werror
+
+Note assertions via standard C assert() are on.
+
+Target thread model: OpenMP.
+OpenMP threads: 1; maximum number of threads: 11.
+
+Read 24 user parameters from input
+
+No free energy selected
+
+System details
+--------------
+System size:    32 32 1
+Decomposition:  1 1 1
+Local domain:   32 32 1
+Periodic:       1 1 0
+Halo nhalo:     1
+Reorder:        true
+Initialised:    1
+
+System properties
+----------------
+Mean fluid density:           1.00000e+00
+Shear viscosity               6.25000e-01
+Bulk viscosity                6.25000e-01
+Temperature                   0.00000e+00
+External body force density   0.00000e+00  0.00000e+00  0.00000e+00
+External E-field amplitude    0.00000e+00  0.00000e+00  0.00000e+00
+External E-field frequency    0.00000e+00
+External magnetic field       0.00000e+00  0.00000e+00  0.00000e+00
+
+Lattice Boltzmann distributions
+-------------------------------
+Model:            d2q9  
+SIMD vector len:  1
+Number of sets:   1
+Halo type:        lb_halo_target (full halo)
+Input format:     binary
+Output format:    binary
+I/O grid:         1 1 1
+
+Lattice Boltzmann collision
+---------------------------
+Relaxation time scheme:   M10
+Hydrodynamic modes:       on
+Ghost modes:              on
+Isothermal fluctuations:  off
+Shear relaxation time:    2.37500e+00
+Bulk relaxation time:     2.37500e+00
+Ghost relaxation time:    1.00000e+00
+[Default] Random number seed: 7361237
+
+Hydrodynamics
+-------------
+Hydrodynamics: on
+
+Colloid information
+-------------------
+
+Colloid I/O settings
+--------------------
+Decomposition:                1  1  1
+Number of files:              1
+Input format:                 ascii
+Output format:                ascii
+Single file read flag:        0
+
+Requested one colloid via input:
+colloid_one                   disk
+colloid_one_a0                3.6300000e+00
+colloid_one_ah                3.6300000e+00
+colloid_one_r                 1.6000000e+01  1.6000000e+01  1.0000000e+00
+colloid_one_v                 0.0000000e+00  0.0000000e+00  0.0000000e+00
+colloid_one_m                 1.0000000e+00  0.0000000e+00  0.0000000e+00
+colloid_one_b1                0.0000000e+00
+colloid_one_b2                0.0000000e+00
+
+Initialised 1 colloid
+
+Colloid cell list information
+-----------------------------
+Input radius maximum:         3.6300000e+00
+Final cell list:              7 7 1
+Final cell lengths:           4.5714286e+00  4.5714286e+00  1.0000000e+00
+
+Sedimentation force on:       yes
+Sedimentation force:          1.0000000e-06  0.0000000e+00  0.0000000e+00
+
+Initial conditions.
+
+Scalars - total mean variance min max
+[rho]         979.00  1.00000000000  1.1102230e-16  1.00000000000  1.00000000000
+
+Momentum - x y z
+[total   ]  0.0000000e+00  0.0000000e+00  0.0000000e+00
+[fluid   ]  0.0000000e+00  0.0000000e+00  0.0000000e+00
+[colloids]  0.0000000e+00  0.0000000e+00  0.0000000e+00
+
+Starting time step loop.
+
+Particle statistics:
+
+Colloid velocities - x y z
+[minimum ]  1.1698436e-07  1.0091244e-17  0.0000000e+00
+[maximum ]  1.1698436e-07  1.0091244e-17  0.0000000e+00
+
+Scalars - total mean variance min max
+[rho]         979.00  1.00000000000  3.5527137e-15  0.99999990031  1.00000009969
+
+Momentum - x y z
+[total   ] -1.0091431e-13 -5.8304786e-17  0.0000000e+00
+[fluid   ] -4.8528317e-06 -7.6674778e-16  0.0000000e+00
+[colloids]  4.8528316e-06  7.0844299e-16  0.0000000e+00
+
+Velocity - x y z
+[minimum ] -8.2676119e-08 -3.8231796e-08  0.0000000e+00
+[maximum ]  1.1450985e-07  3.8231796e-08  1.1754944e-38
+
+Completed cycle 100
+
+Timer resolution: 1e-06 second
+
+Timer statistics
+             Section:       tmin       tmax      total
+               Total:      0.074      0.074      0.074   0.073925 (1 call)
+      Time step loop:      0.001      0.001      0.073   0.000734 (100 calls)
+         Propagation:      0.000      0.000      0.011   0.000113 (100 calls)
+    Propagtn (krnl) :      0.000      0.000      0.011   0.000113 (100 calls)
+           Collision:      0.000      0.000      0.033   0.000332 (100 calls)
+   Collision (krnl) :      0.000      0.000      0.033   0.000332 (100 calls)
+       Lattice halos:      0.000      0.000      0.020   0.000100 (200 calls)
+       phi gradients:      0.000      0.000      0.000   0.000000 (100 calls)
+              Forces:      0.000      0.000      0.000   0.000003 (100 calls)
+             Rebuild:      0.000      0.000      0.003   0.000026 (100 calls)
+                 BBL:      0.000      0.000      0.002   0.000022 (100 calls)
+      Particle halos:      0.000      0.000      0.000   0.000003 (100 calls)
+   Force calculation:      0.000      0.000      0.000   0.000000 (100 calls)
+          phi update:      0.000      0.000      0.000   0.000000 (100 calls)
+Diagnostics / output:      0.000      0.000      0.000   0.000001 (100 calls)
+End time: Wed Oct 23 19:51:16 2024
+Ludwig finished normally.

From d46434c5d81836ed687bf4843bc27ade9c9c707a Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Wed, 23 Oct 2024 19:57:26 +0100
Subject: [PATCH 047/133] Format, spelling

---
 src/colloids_rt.c |  2 +-
 src/util.c        | 14 +++++++-------
 src/util.h        |  4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/colloids_rt.c b/src/colloids_rt.c
index e0d092cbf..b54d7c6d7 100644
--- a/src/colloids_rt.c
+++ b/src/colloids_rt.c
@@ -780,7 +780,7 @@ int colloids_rt_gravity(pe_t * pe, rt_t * rt, colloids_info_t * cinfo) {
  *  interactions, work out what the best cell list size is.
  *
  *  The lb_model_t is included here to get the dimensionsality;
- *  in priciple one could have an entirely separate procedure for
+ *  in principle one could have an entirely separate procedure for
  *  two-dimensional systems of disks.
  *
  *  The cell width should be as small as possible to prevent
diff --git a/src/util.c b/src/util.c
index 6bf3074cc..f5906fb24 100644
--- a/src/util.c
+++ b/src/util.c
@@ -95,10 +95,10 @@ int util_reverse_byte_order(void * arg, void * result, MPI_Datatype type) {
   carg = (char *) arg;
 
   if (type == MPI_INT) {
-    
+
     int iresult;
     p = (char *) &iresult;
-      
+
     for (b = 0; b < sizeof(int); b++) {
       p[b] = carg[sizeof(int) - (b + 1)];
     }
@@ -174,7 +174,7 @@ double modulus(const double a[3]) {
  *
  *  The rotated vector is computed via
  *      v' = (1 - cos \theta)(\hat{w}.v) \hat{w} + cos \theta v +
- *           (\hat{w} x v) sin \theta      
+ *           (\hat{w} x v) sin \theta
  *
  *  For theta positive this gives rotations in the correct sense
  *  in the right-handed coordinate system.
@@ -229,7 +229,7 @@ void rotate_vector(double v[3], const double w[3]) {
 int util_random_unit_vector(int * state, double rhat[3]) {
 
   double r[2];
-  double zeta1, zeta2, zsq;  
+  double zeta1, zeta2, zsq;
 
   do {
     util_ranlcg_reap_uniform(state, r);
@@ -411,7 +411,7 @@ __host__ int util_jacobi(double a[3][3], double vals[3], double vecs[3][3]) {
     }
   }
 
-  /* Exceded maximum iterations: a fail ... */
+  /* Exceeded maximum iterations: a fail ... */
 
   return -1;
 }
@@ -680,7 +680,7 @@ int util_matrix_free(int m, double ***p) {
   }
   free(*p);
   *p = NULL;
- 
+
   return 0;
 }
 
@@ -873,7 +873,7 @@ static __host__ long int util_ranlcg_multiply(long a, long s, long c, long m);
  *
  *  util_ranlcg_reap_gaussian
  *
- *  Box-Mueller. Caller responisble for maintaining state.
+ *  Box-Mueller. Caller responsible for maintaining state.
  *
  *  Returns two Gaussian deviates per call.
  *
diff --git a/src/util.h b/src/util.h
index 60fad8502..6dcaf2179 100644
--- a/src/util.h
+++ b/src/util.h
@@ -10,8 +10,8 @@
  *  (c) 2010-2024 The University of Edinburgh
  *
  *  Contributing authors:
- *  Kevin Stratford (kevin@epcc.ed.ac.uk) 
- *  
+ *  Kevin Stratford (kevin@epcc.ed.ac.uk)
+ *
  ****************************************************************************/
 
 #include <stdint.h>

From 1ed900539e6de2418e549b049bcc7a5de5ec8cf1 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Tue, 5 Nov 2024 09:05:36 +0000
Subject: [PATCH 048/133] Add separate disk squirmer section

---
 src/bbl.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/src/bbl.c b/src/bbl.c
index 689a11c78..c08c3ed09 100644
--- a/src/bbl.c
+++ b/src/bbl.c
@@ -537,6 +537,38 @@ static int bbl_pass1(bbl_t * bbl, lb_t * lb, colloids_info_t * cinfo) {
 
 
 	/* Squirmer section */
+	/* Some rationalisation may be in order here but prefer
+	* a clear separation of different shapes at the moment ... */
+
+	if (pc->s.active && pc->s.shape == COLLOID_SHAPE_DISK) {
+
+	  /* Both the link vector rb and the direction of motion s.m
+	   * must have z-component = 0 in 2d */
+	  /* If so, vector1 has only a component in z, and tans then
+	   * has only components in (x,y). */
+
+	  mod = modulus(p_link->rb)*modulus(pc->s.m);
+	  rmod = 0.0;
+	  if (mod != 0.0) rmod = 1.0/mod;
+	  cost = rmod*dot_product(p_link->rb, pc->s.m);
+	  if (cost*cost > 1.0) cost = 1.0;
+	  assert(cost*cost <= 1.0);
+	  sint = sqrt(1.0 - cost*cost);
+
+	  cross_product(p_link->rb, pc->s.m, vector1);
+	  cross_product(vector1, p_link->rb, tans);
+
+	  mod = modulus(tans);
+	  rmod = 0.0;
+	  if (mod != 0.0) rmod = 1.0/mod;
+	  plegendre = -sint*(pc->s.b2*cost + pc->s.b1);
+
+	  /* Compute correction to bbl for a disk: */
+	  dm_a = 0.0;
+	  for (ia = 0; ia < 3; ia++) {
+	    dm_a += -delta*plegendre*rmod*tans[ia]*lb->model.cv[ij][ia];
+	  }
+	}
 
 	if (pc->s.active && pc->s.shape == COLLOID_SHAPE_SPHERE) {
 

From 4da91649dfece2fb53016a4d60baa4d4a01630bb Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Fri, 15 Nov 2024 17:58:46 +0000
Subject: [PATCH 049/133] Work on stub errors return

---
 mpi_s/mpi.h        |   14 +
 mpi_s/mpi_serial.c | 1344 ++++++++++++++++++++++++++++++++++----------
 mpi_s/mpi_tests.c  |   34 +-
 3 files changed, 1099 insertions(+), 293 deletions(-)

diff --git a/mpi_s/mpi.h b/mpi_s/mpi.h
index 80e67fa3d..1464ac9e3 100644
--- a/mpi_s/mpi.h
+++ b/mpi_s/mpi.h
@@ -39,6 +39,7 @@ typedef MPI_Handle MPI_Info;
 typedef struct {
   int MPI_SOURCE;
   int MPI_TAG;
+  int MPI_ERROR;
 } MPI_Status;
 
 #define MPI_STATUS_IGNORE   ((MPI_Status *) 0)
@@ -57,8 +58,18 @@ typedef intmax_t MPI_Offset;
 
 enum return_codes {
   MPI_SUCCESS = 0,               /* Success */
+  MPI_ERR_ARG,                   /* Invalid argument of other kind */
+  MPI_ERR_BUFFER,                /* Invalid buffer pointer argument */
+  MPI_ERR_COMM,                  /* Invalid communicator argument */
+  MPI_ERR_COUNT,                 /* Invalid count argument */
+  MPI_ERR_DATATYPE,              /* Invalid datatype */
+  MPI_ERR_INFO,                  /* Invalid info argument */
   MPI_ERR_FILE,                  /* Bad file handle */
   MPI_ERR_NO_SUCH_FILE,          /* File does not exist */
+  MPI_ERR_OP,                    /* Invalid operation argument */
+  MPI_ERR_RANK,                  /* Invalid rank argument */
+  MPI_ERR_ROOT,                  /* Invalid root argument */
+  MPI_ERR_TAG,                   /* Invalid tag */
   MPI_ERR_LASTCODE               /* Must be last */
 };
 
@@ -70,8 +81,11 @@ enum return_codes {
 #define MPI_BOTTOM         0x0000
 #define MPI_UNDEFINED     -999
 
+
 /* Error-handling specifiers */
 
+#define MPI_MAX_ERROR_STRING 256
+
 enum error_specifiers {MPI_ERRORS_ARE_FATAL, MPI_ERRORS_RETURN};
 
 enum elementary_datatypes {MPI_CHAR           = -11,
diff --git a/mpi_s/mpi_serial.c b/mpi_s/mpi_serial.c
index 4b7f425b9..15f803ce4 100644
--- a/mpi_s/mpi_serial.c
+++ b/mpi_s/mpi_serial.c
@@ -24,6 +24,7 @@
  *****************************************************************************/
 
 #include <assert.h>
+#include <stdarg.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -58,8 +59,10 @@ typedef struct internal_file_view_s file_t;
 struct internal_data_type_s {
   MPI_Datatype handle;       /* User space handle [in suitable range] */
   int          bytes;        /* sizeof */
-  int          commit;       /* Commited? */
+  int          commit;       /* Committed? */
   int          flavour;      /* Contiguous types only at present */
+  MPI_Aint     lb;           /* Lower bound argument */
+  int          stride;       /* Stride argument */
 };
 
 struct internal_file_view_s {
@@ -82,9 +85,14 @@ struct mpi_info_s {
   int dtfreelist[MAX_USER_DT];   /* Free list */
 
   file_t filelist[MAX_USER_FILE]; /* MPI_File information for open files */
+
+  /* At the moment there is a single error string rather than one per
+   * comm and file */
+  char comm_error_string[MPI_MAX_ERROR_STRING];
+  char file_error_string[MPI_MAX_ERROR_STRING];
 };
 
-static mpi_info_t * mpi_info = NULL;
+static mpi_info_t * mpi_info_ = NULL;
 
 static void mpi_copy(void * send, void * recv, int count, MPI_Datatype type);
 static int mpi_sizeof(MPI_Datatype type);
@@ -99,6 +107,330 @@ static MPI_File mpi_file_handle_retain(mpi_info_t * ctxt, FILE * fp);
 static FILE *   mpi_file_handle_release(mpi_info_t * ctxt, MPI_File handle);
 static FILE *   mpi_file_handle_to_fp(mpi_info_t * info, MPI_File handle);
 
+static int      mpi_datatype_invalid(MPI_Datatype dt);
+static int      mpi_file_handle_invalid(MPI_File fh);
+static int      mpi_tag_invalid(int tag);
+
+/* In principle, the errhandler is registered against a comm, file, etc */
+/* The "errors_return" handler would store the message and return */
+/* The "errors_are_fatal" handler would store the message, print, and fail */
+
+static int mpi_comm_set_error_string(MPI_Comm comm, const char * fmt, ...) {
+
+  /* In principle, handled on a per communicator basis */
+
+  va_list args;
+
+  assert(mpi_info_);
+  assert(comm != MPI_COMM_NULL);
+
+  va_start(args, fmt);
+  vsnprintf(mpi_info_->comm_error_string, MPI_MAX_ERROR_STRING, fmt, args);
+  va_end(args);
+
+  return 0;
+}
+
+static int comm_mpi_err_comm_handler(MPI_Comm comm, const char * fname) {
+  /* Return MPI_ERR_COMM (and store a message) if comm is not valid */
+  int ifail = MPI_SUCCESS;
+  /* Record via MPI_COMM_SELF: the setter asserts comm != MPI_COMM_NULL */
+  if (mpi_is_valid_comm(comm) == 0) {
+    ifail = MPI_ERR_COMM;
+    mpi_comm_set_error_string(MPI_COMM_SELF, "%s: invalid communicator", fname);
+  }
+
+  return ifail;
+}
+
+static int comm_mpi_err_buffer_handler(MPI_Comm comm, const void * buf,
+				       const char * fname) {
+  int ifail = MPI_SUCCESS;
+  /* FIXME Need to watch out for MPI_IN_PLACE */
+  if (buf == NULL) {
+    ifail = MPI_ERR_BUFFER;
+    mpi_comm_set_error_string(comm, "%s: NULL buffer pointer", fname);
+    /* Call errhandler */
+  }
+
+  return ifail;
+}
+
+static int comm_mpi_err_count_handler(MPI_Comm comm, int count,
+				      const char * fname) {
+  int ifail = MPI_SUCCESS;
+
+  if (count < 0) {
+    ifail = MPI_ERR_COUNT;
+    mpi_comm_set_error_string(comm, "%s(): count must be >= 0", fname);
+    /* Call errhandler */
+  }
+  return ifail;
+}
+
+static int comm_mpi_err_datatype_handler(MPI_Comm comm, MPI_Datatype dt,
+					 const char * fname) {
+  int ifail = MPI_SUCCESS;
+  /* Return MPI_ERR_DATATYPE (and store a message) if dt is rejected */
+  if (mpi_datatype_invalid(dt)) {
+    ifail = MPI_ERR_DATATYPE;
+    mpi_comm_set_error_string(comm, "%s(): invalid datatype", fname);
+    /* Call error handler */
+  }
+
+  return ifail;
+}
+
+static int comm_mpi_err_op_handler(MPI_Comm comm, MPI_Op op,
+				   const char * fname) {
+  int ifail = MPI_SUCCESS;
+
+  if (0) { /* FIXME need to check for valid op */
+    ifail = MPI_ERR_OP;
+    mpi_comm_set_error_string(comm, "%s(): invalid MPI_Op argument", fname);
+    /* Call error handler */
+  }
+  return ifail;
+}
+
+static int comm_mpi_err_rank_handler(MPI_Comm comm, int rank,
+				     const char * fname) {
+  int ifail = MPI_SUCCESS;
+
+  if (rank == 0 || rank == MPI_PROC_NULL) {
+    ; /* pass */
+  }
+  else {
+    ifail = MPI_ERR_RANK;
+    mpi_comm_set_error_string(comm, "%s(): invalid rank", fname);
+    /* Call errhandler */
+  }
+
+  return ifail;
+}
+
+static int comm_mpi_err_info_handler(MPI_Comm comm, MPI_Info info,
+				     const char * func) {
+
+  int ifail = MPI_SUCCESS;
+
+  /* Only handling MPI_INFO_NULL at the moment */
+
+  if (info != MPI_INFO_NULL) {
+    ifail = MPI_ERR_INFO;
+    mpi_comm_set_error_string(comm, "%s(): invalid info argument", func);
+    /* Handler */
+  }
+
+  return ifail;
+}
+
+
+static int comm_mpi_err_root_handler(MPI_Comm comm, int root,
+				     const char * fname) {
+  int ifail = MPI_SUCCESS;
+
+  /* Root should not be MPI_PROC_NULL */
+  if (root != 0) {
+    ifail = MPI_ERR_ROOT;
+    mpi_comm_set_error_string(comm, "%s(): invalid root argument", fname);
+    /* Call errhandler */
+  }
+
+  return ifail;
+}
+
+static int comm_mpi_err_tag_handler(MPI_Comm comm, int tag,
+				    const char * fname) {
+  int ifail = MPI_SUCCESS;
+
+  if (mpi_tag_invalid(tag)) {
+    ifail = MPI_ERR_TAG;
+    mpi_comm_set_error_string(comm, "%s(): invalid tag", fname);
+    /* Call errhandler */
+  }
+
+  return ifail;
+}
+
+static int comm_mpi_err_arg_handler(MPI_Comm comm, const char * condition,
+				    const char * fname) {
+  int ifail = MPI_ERR_ARG;
+  /* Message reads "<fname>(): argument <condition>"; always an error */
+  mpi_comm_set_error_string(comm, "%s(): argument %s", fname, condition);
+  /* Call errhandler */
+  return ifail;
+}
+
+
+#define ERR_IF_MPI_NOT_INITIALISED(fname)				\
+  {									\
+    if (mpi_info_ == NULL) {						\
+      /* No MPI context exists yet: report the caller and abort */	\
+      printf("The %s() function was called before either MPI_Init() or"	\
+	     " MPI_Init_thread(). This is illegal.", fname);		\
+      exit(-1);								\
+    }									\
+  }
+
+
+#define ERR_IF_COMM_MPI_ERR_COMM(comm, func)				\
+  {									\
+    ifail = comm_mpi_err_comm_handler(comm, func);			\
+    if (ifail != MPI_SUCCESS) goto err;					\
+  }
+
+#define ERR_IF_COMM_MPI_ERR_BUFFER(comm, buf, func)			\
+  {									\
+    ifail = comm_mpi_err_buffer_handler(comm, buf, func);		\
+    if (ifail != MPI_SUCCESS) goto err;					\
+  }
+
+#define ERR_IF_COMM_MPI_ERR_COUNT(comm, count, func)			\
+  {									\
+    ifail = comm_mpi_err_count_handler(comm, count, func);		\
+    if (ifail != MPI_SUCCESS) goto err;					\
+  }
+
+#define ERR_IF_COMM_MPI_ERR_DATATYPE(comm, dt, func)			\
+  {									\
+    ifail = comm_mpi_err_datatype_handler(comm, dt, func);		\
+    if (ifail != MPI_SUCCESS) goto err;					\
+  }
+
+#define ERR_IF_COMM_MPI_ERR_INFO(comm, info, func)	 \
+  {							 \
+    ifail = comm_mpi_err_info_handler(comm, info, func); \
+    if (ifail != MPI_SUCCESS) goto err;			 \
+  }
+
+#define ERR_IF_COMM_MPI_ERR_OP(comm, op, func)				\
+  {									\
+    ifail = comm_mpi_err_op_handler(comm, op, func);			\
+    if (ifail != MPI_SUCCESS) goto err;					\
+  }
+
+#define ERR_IF_COMM_MPI_ERR_RANK(comm, rank, func)			\
+  {									\
+    ifail = comm_mpi_err_rank_handler(comm, rank, func);		\
+    if (ifail != MPI_SUCCESS) goto err;					\
+  }
+
+#define ERR_IF_COMM_MPI_ERR_ROOT(comm, root, func)			\
+  {									\
+    ifail = comm_mpi_err_root_handler(comm, root, func);		\
+    if (ifail != MPI_SUCCESS) goto err;					\
+  }
+
+#define ERR_IF_COMM_MPI_ERR_TAG(comm, tag, func)			\
+  {									\
+    ifail = comm_mpi_err_tag_handler(comm, tag, func);			\
+    if (ifail != MPI_SUCCESS) goto err;					\
+  }
+
+#define ERR_IF_COMM_MPI_ERR_ARG(comm, condition, func)			\
+  {									\
+    if ((condition)) {							\
+      ifail = comm_mpi_err_arg_handler(comm, #condition, func);		\
+      if (ifail != MPI_SUCCESS) goto err;				\
+    }									\
+  }
+
+/* MPI_File error handlers */
+
+static int file_mpi_err_amode_handler(MPI_File file, int amode,
+				      const char * func) {
+
+  int ifail = MPI_SUCCESS;
+
+  return ifail;
+}
+
+static int file_mpi_err_info_handler(MPI_File fh, MPI_Info info,
+				     const char * func) {
+  int ifail = MPI_SUCCESS;
+  MPI_Comm comm = MPI_COMM_SELF;
+
+  if (fh) comm = MPI_COMM_WORLD; /* FIXME: fh->comm */
+  ifail = comm_mpi_err_info_handler(comm, info, func);
+
+  return ifail;
+}
+
+static int file_mpi_err_buffer_handler(MPI_File fh, const void * buf,
+				       const char * func) {
+  int ifail = MPI_SUCCESS;
+  MPI_Comm comm = MPI_COMM_SELF;
+
+  if (fh) comm = MPI_COMM_WORLD; /* FIXME fh->comm */
+  ifail = comm_mpi_err_buffer_handler(comm, buf, func);
+
+  return ifail;
+}
+
+static int file_mpi_err_datatype_handler(MPI_File fh, MPI_Datatype dt,
+					 const char * func) {
+  int ifail = MPI_SUCCESS;
+  MPI_Comm comm = MPI_COMM_SELF;
+
+  if (fh) comm = MPI_COMM_WORLD; /* fh->comm */
+  ifail = comm_mpi_err_datatype_handler(comm, dt, func);
+
+  return ifail;
+}
+
+#define ERR_IF_FILE_MPI_ERR_COMM(file, condition, func)			\
+  {									\
+    if ((condition)) { /* FIXME map from file to communicator */	\
+      MPI_Comm comm = MPI_COMM_SELF;					\
+      if (file != MPI_FILE_NULL) comm = MPI_COMM_WORLD;			\
+      ifail = comm_mpi_err_comm_handler(comm, func);			\
+      if (ifail != MPI_SUCCESS) goto err;				\
+    }									\
+  }
+
+#define ERR_IF_FILE_MPI_ERR_ARG(file, condition, func)		\
+  {								\
+    if ((condition)) {						\
+      MPI_Comm comm = MPI_COMM_SELF;				\
+      if (file != MPI_FILE_NULL) comm = MPI_COMM_WORLD;		\
+      ifail = comm_mpi_err_arg_handler(comm, #condition, func);	\
+      if (ifail != MPI_SUCCESS) goto err;			\
+    }								\
+  }
+
+#define ERR_IF_FILE_MPI_ERR_AMODE(file, amode, func) {		\
+    ifail = file_mpi_err_amode_handler(file, amode, func);	\
+    if (ifail != MPI_SUCCESS) goto err;				\
+  }
+
+#define ERR_IF_FILE_MPI_ERR_INFO(file, info, func) {		\
+    ifail = file_mpi_err_info_handler(file, info, func);	\
+    if (ifail != MPI_SUCCESS) goto err;				\
+  }
+
+#define ERR_IF_FILE_MPI_ERR_FILE(fh, func)				\
+  {									\
+    ifail = mpi_file_handle_invalid(fh);				\
+    if (ifail != MPI_SUCCESS) {						\
+      MPI_Comm comm = MPI_COMM_SELF;					\
+      mpi_comm_set_error_string(comm, "%s(): invalid file handle", func); \
+      goto err;								\
+    }									\
+  }
+
+#define ERR_IF_FILE_MPI_ERR_BUFFER(fh, buf, func)	\
+  {							\
+    ifail = file_mpi_err_buffer_handler(fh, buf, func);	\
+    if (ifail != MPI_SUCCESS) goto err;			\
+  }
+
+#define ERR_IF_FILE_MPI_ERR_DATATYPE(fh, datatype, func)	\
+  {								\
+    ifail = file_mpi_err_datatype_handler(fh, datatype, func);	\
+    if (ifail != MPI_SUCCESS) goto err;				\
+  }
+
 /*****************************************************************************
  *
  *  MPI_Barrier
@@ -107,9 +439,14 @@ static FILE *   mpi_file_handle_to_fp(mpi_info_t * info, MPI_File handle);
 
 int MPI_Barrier(MPI_Comm comm) {
 
-  assert(mpi_is_valid_comm(comm));
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Barrier";
 
-  return MPI_SUCCESS;
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -123,12 +460,20 @@ int MPI_Barrier(MPI_Comm comm) {
 int MPI_Bcast(void * buffer, int count, MPI_Datatype datatype, int root,
 	      MPI_Comm comm) {
 
-  assert(mpi_info->initialised);
-  assert(buffer);
-  assert(count > 0);
-  assert(mpi_is_valid_comm(comm));
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Bcast";
 
-  return MPI_SUCCESS;
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, buffer, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, count, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, datatype, fname);
+  ERR_IF_COMM_MPI_ERR_ROOT(comm, root, fname);
+
+  assert(mpi_info_->initialised);
+
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -142,10 +487,10 @@ int MPI_Init(int * argc, char *** argv) {
   assert(argc);
   assert(argv);
 
-  mpi_info = (mpi_info_t *) calloc(1, sizeof(mpi_info_t));
-  assert(mpi_info);
+  mpi_info_ = (mpi_info_t *) calloc(1, sizeof(mpi_info_t));
+  assert(mpi_info_);
 
-  mpi_info->initialised = 1;
+  mpi_info_->initialised = 1;
 
   /* User data type handles: reserve dt[0].handle = 0 for MPI_DATATYPE_NULL */
   /* Otherwise, user data type handles are indexed 1, 2, 3, ... */
@@ -155,18 +500,18 @@ int MPI_Init(int * argc, char *** argv) {
 
   for (int n = 0; n < MAX_USER_DT; n++) {
     data_t null = {.handle = 0, .bytes = 0, .commit = 0, .flavour = 0};
-    mpi_info->dt[n] = null;
-    mpi_info->dtfreelist[n] = n;
+    mpi_info_->dt[n] = null;
+    mpi_info_->dtfreelist[n] = n;
   }
-  mpi_info->ndatatype = 0;
-  mpi_info->ndatatypelast = 0;
+  mpi_info_->ndatatype = 0;
+  mpi_info_->ndatatypelast = 0;
 
   for (int ih = 0; ih < MAX_USER_FILE; ih++) {
-    mpi_info->filelist[ih].fp    = NULL;
-    mpi_info->filelist[ih].disp  = 0;
-    mpi_info->filelist[ih].etype = MPI_BYTE;
-    mpi_info->filelist[ih].filetype = MPI_BYTE;
-    strncpy(mpi_info->filelist[ih].datarep, "native", MPI_MAX_DATAREP_STRING);
+    mpi_info_->filelist[ih].fp    = NULL;
+    mpi_info_->filelist[ih].disp  = 0;
+    mpi_info_->filelist[ih].etype = MPI_BYTE;
+    mpi_info_->filelist[ih].filetype = MPI_BYTE;
+    strncpy(mpi_info_->filelist[ih].datarep, "native", MPI_MAX_DATAREP_STRING);
   }
 
   return MPI_SUCCESS;
@@ -205,7 +550,7 @@ int MPI_Initialized(int * flag) {
 
   assert(flag);
 
-  *flag = (mpi_info != NULL); /* A sufficient condition */
+  *flag = (mpi_info_ != NULL); /* A sufficient condition */
 
   return MPI_SUCCESS;
 }
@@ -218,12 +563,16 @@ int MPI_Initialized(int * flag) {
 
 int MPI_Finalize(void) {
 
-  assert(mpi_info);
-  assert(mpi_info->ndatatype == 0); /* All released */
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Finalize";
 
-  free(mpi_info);
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  assert(mpi_info_->ndatatype == 0); /* All released */
 
-  return MPI_SUCCESS;
+  free(mpi_info_);
+  mpi_info_ = NULL;
+
+  return ifail;
 }
 
 /*****************************************************************************
@@ -234,12 +583,17 @@ int MPI_Finalize(void) {
 
 int MPI_Comm_group(MPI_Comm comm, MPI_Group * group) {
 
-  assert(mpi_is_valid_comm(comm));
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Comm_group";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
   assert(group);
 
   *group = 0;
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -250,12 +604,17 @@ int MPI_Comm_group(MPI_Comm comm, MPI_Group * group) {
 
 int MPI_Comm_rank(MPI_Comm comm, int * rank) {
 
-  assert(mpi_is_valid_comm(comm));
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Comm_rank";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
   assert(rank);
 
   *rank = 0;
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -266,12 +625,17 @@ int MPI_Comm_rank(MPI_Comm comm, int * rank) {
 
 int MPI_Comm_size(MPI_Comm comm, int * size) {
 
-  assert(mpi_is_valid_comm(comm));
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Comm_size";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
   assert(size);
 
   *size = 1;
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -286,7 +650,12 @@ int MPI_Comm_size(MPI_Comm comm, int * size) {
 
 int MPI_Comm_compare(MPI_Comm comm1, MPI_Comm comm2, int * result) {
 
-  assert(result);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Comm_compare";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+
+  assert(result); /* FIXME what is correct behaviour? e.g., invalid comm */
 
   *result = MPI_UNEQUAL;
   if (mpi_is_valid_comm(comm1) && mpi_is_valid_comm(comm2)) {
@@ -294,7 +663,7 @@ int MPI_Comm_compare(MPI_Comm comm1, MPI_Comm comm2, int * result) {
     if (comm1 == comm2) *result = MPI_IDENT;
   }
 
-  return 0;
+  return ifail;
 }
 
 /*****************************************************************************
@@ -346,12 +715,22 @@ double MPI_Wtick(void) {
 int MPI_Send(void * buf, int count, MPI_Datatype datatype, int dest,
 	     int tag, MPI_Comm comm) {
 
-  assert(buf);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Send";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, buf, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, datatype, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, count, fname);
+  ERR_IF_COMM_MPI_ERR_RANK(comm, dest, fname);
+  ERR_IF_COMM_MPI_ERR_TAG(comm, tag, fname);
 
   printf("MPI_Send should not be called in serial.\n");
   exit(0);
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -363,12 +742,23 @@ int MPI_Send(void * buf, int count, MPI_Datatype datatype, int dest,
 int MPI_Recv(void * buf, int count, MPI_Datatype datatype, int source,
 	     int tag, MPI_Comm comm, MPI_Status * status) {
 
-  assert(buf);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Recv";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, buf, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, count, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, datatype, fname);
+  ERR_IF_COMM_MPI_ERR_RANK(comm, source, fname);
+  ERR_IF_COMM_MPI_ERR_TAG(comm, tag, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, status == NULL, fname);
 
   printf("MPI_Recv should not be called in serial.\n");
   exit(0);
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -380,13 +770,22 @@ int MPI_Recv(void * buf, int count, MPI_Datatype datatype, int source,
 int MPI_Irecv(void * buf, int count, MPI_Datatype datatype, int source,
 	     int tag, MPI_Comm comm, MPI_Request * request) {
 
-  assert(buf);
-  assert(request);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Irecv";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, buf, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, count, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, datatype, fname);
+  ERR_IF_COMM_MPI_ERR_RANK(comm, source, fname);
+  ERR_IF_COMM_MPI_ERR_TAG(comm, tag, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, request == NULL, fname);
 
-  /* Could assert tag is ok */
   *request = tag;
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 
@@ -399,12 +798,22 @@ int MPI_Irecv(void * buf, int count, MPI_Datatype datatype, int source,
 int MPI_Ssend(void * buf, int count, MPI_Datatype datatype, int dest,
 	      int tag, MPI_Comm comm) {
 
-  assert(buf);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Ssend";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, buf, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, count, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, datatype, fname);
+  ERR_IF_COMM_MPI_ERR_RANK(comm, dest, fname);
+  ERR_IF_COMM_MPI_ERR_TAG(comm, tag, fname);
 
   printf("MPI_Ssend should not be called in serial\n");
   exit(0);
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -416,13 +825,22 @@ int MPI_Ssend(void * buf, int count, MPI_Datatype datatype, int dest,
 int MPI_Isend(void * buf, int count, MPI_Datatype datatype, int dest,
 	      int tag, MPI_Comm comm, MPI_Request * request) {
 
-  assert(buf);
-  assert(request);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Isend";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, buf, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, count, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, datatype, fname);
+  ERR_IF_COMM_MPI_ERR_RANK(comm, dest, fname);
+  ERR_IF_COMM_MPI_ERR_TAG(comm, tag, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, request == NULL, fname);
 
-  /* Could assert tag is ok */
   *request = tag;
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -434,16 +852,23 @@ int MPI_Isend(void * buf, int count, MPI_Datatype datatype, int dest,
 int MPI_Issend(void * buf, int count, MPI_Datatype datatype, int dest,
 	       int tag, MPI_Comm comm, MPI_Request * request) {
 
-  assert(buf);
-  assert(count >= 0);
-  assert(dest == 0);
-  assert(mpi_is_valid_comm(comm));
-  assert(request);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Issend";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, buf, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, count, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, datatype, fname);
+  ERR_IF_COMM_MPI_ERR_RANK(comm, dest, fname);
+  ERR_IF_COMM_MPI_ERR_TAG(comm, tag, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, request == NULL, fname);
 
   printf("MPI_Issend should not be called in serial\n");
   exit(0);
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -454,10 +879,16 @@ int MPI_Issend(void * buf, int count, MPI_Datatype datatype, int dest,
 
 int MPI_Waitall(int count, MPI_Request * requests, MPI_Status * statuses) {
 
-  assert(count >= 0);
-  assert(requests);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Waitall";
 
-  return MPI_SUCCESS;
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(MPI_COMM_SELF, count, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, requests == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, statuses == NULL, fname);
+
+ err:
+  return ifail;
 }
 
 
@@ -470,9 +901,13 @@ int MPI_Waitall(int count, MPI_Request * requests, MPI_Status * statuses) {
 int MPI_Waitany(int count, MPI_Request requests[], int * index,
 		MPI_Status * status) {
 
-  assert(count >= 0);
-  assert(requests);
-  assert(index);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Waitany";
+  /* status may be MPI_STATUS_IGNORE (a null pointer here): not checked */
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(MPI_COMM_SELF, count, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, requests == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, index == NULL, fname);
 
   *index = MPI_UNDEFINED;
 
@@ -488,10 +923,10 @@ int MPI_Waitany(int count, MPI_Request requests[], int * index,
     }
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
-
 /*****************************************************************************
  *
  *  MPI_Probe
@@ -500,13 +935,20 @@ int MPI_Waitany(int count, MPI_Request requests[], int * index,
 
 int MPI_Probe(int source, int tag, MPI_Comm comm, MPI_Status * status) {
 
-  assert(source == 0);
-  assert(mpi_is_valid_comm(comm));
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Probe";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_RANK(comm, source, fname);
+  ERR_IF_COMM_MPI_ERR_TAG(comm, tag, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, status == NULL, fname);
 
   printf("MPI_Probe should not be called in serial\n");
   exit(0);
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -520,15 +962,31 @@ int MPI_Sendrecv(void * sendbuf, int sendcount, MPI_Datatype sendtype,
 		 MPI_Datatype recvtype, int source, int recvtag,
 		 MPI_Comm comm, MPI_Status * status) {
 
-  assert(sendbuf);
-  assert(dest == source);
-  assert(recvbuf);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_SendRecv";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, sendbuf, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, sendcount, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, sendtype, fname);
+  ERR_IF_COMM_MPI_ERR_RANK(comm, dest, fname);
+  ERR_IF_COMM_MPI_ERR_TAG(comm, sendtag, fname);
+
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, recvbuf, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, recvcount, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, recvtype, fname);
+  ERR_IF_COMM_MPI_ERR_RANK(comm, source, fname);
+  ERR_IF_COMM_MPI_ERR_TAG(comm, recvtag, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, status == NULL, fname);
+
   assert(recvcount == sendcount);
 
   printf("MPI_Sendrecv should not be called in serial\n");
   exit(0);
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -540,17 +998,23 @@ int MPI_Sendrecv(void * sendbuf, int sendcount, MPI_Datatype sendtype,
 int MPI_Reduce(void * sendbuf, void * recvbuf, int count, MPI_Datatype type,
 	       MPI_Op op, int root, MPI_Comm comm) {
 
-  assert(sendbuf);
-  assert(recvbuf);
-  assert(count >= 0);
-  assert(root == 0);
-  assert(mpi_is_valid_comm(comm));
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Reduce";
 
-  assert(op != MPI_OP_NULL);
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, sendbuf, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, recvbuf, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, count, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, type, fname);
+  ERR_IF_COMM_MPI_ERR_OP(comm, op, fname);
+  ERR_IF_COMM_MPI_ERR_ROOT(comm, root, fname);
 
+  /* Whatever the operation is, the result is the same ... ! */
   mpi_copy(sendbuf, recvbuf, count, type);
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /****************************************************************************
@@ -563,16 +1027,25 @@ int MPI_Allgather(void * sendbuf, int sendcount, MPI_Datatype sendtype,
 		  void * recvbuf, int recvcount, MPI_Datatype recvtype,
 		  MPI_Comm comm) {
 
-  assert(mpi_info);
-  assert(sendbuf);
-  assert(recvbuf);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Allgather";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, sendbuf, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, sendcount, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, sendtype, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, recvbuf, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, recvcount, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, recvtype, fname);
+
   assert(sendcount == recvcount);
   assert(sendtype == recvtype);
-  assert(mpi_is_valid_comm(comm));
 
   mpi_copy(sendbuf, recvbuf, sendcount, sendtype);
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -585,16 +1058,26 @@ int MPI_Gather(void * sendbuf, int sendcount, MPI_Datatype sendtype,
 	       void * recvbuf, int recvcount, MPI_Datatype recvtype,
 	       int root, MPI_Comm comm) {
 
-  assert(mpi_info);
-  assert(sendbuf);
-  assert(recvbuf);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Gather";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, sendbuf, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, sendcount, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, sendtype, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, recvbuf, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, recvcount, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, recvtype, fname);
+  ERR_IF_COMM_MPI_ERR_ROOT(comm, root, fname);
+
   assert(sendcount == recvcount);
   assert(sendtype == recvtype);
-  assert(mpi_is_valid_comm(comm));
-  
+
   mpi_copy(sendbuf, recvbuf, sendcount, sendtype);
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -609,15 +1092,26 @@ int MPI_Gatherv(const void * sendbuf, int sendcount, MPI_Datatype sendtype,
 		void * recvbuf, const int * recvcounts, const int * displ,
 		MPI_Datatype recvtype, int root, MPI_Comm comm) {
 
-  assert(sendbuf);
-  assert(recvbuf);
-  assert(root == 0);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Gatherv";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, (void *) sendbuf, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, sendcount, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, sendtype, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, recvbuf, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, displ == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, recvtype, fname);
+  ERR_IF_COMM_MPI_ERR_ROOT(comm, root, fname);
+
   assert(sendtype == recvtype);
   assert(sendcount == recvcounts[0]);
 
   mpi_copy((void *) sendbuf, recvbuf, sendcount, sendtype);
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -629,16 +1123,23 @@ int MPI_Gatherv(const void * sendbuf, int sendcount, MPI_Datatype sendtype,
 int MPI_Allreduce(void * sendbuf, void * recvbuf, int count, MPI_Datatype type,
 		  MPI_Op op, MPI_Comm comm) {
 
-  assert(sendbuf);
-  assert(recvbuf);
-  assert(count >= 1);
-  assert(mpi_is_valid_comm(comm));
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Allreduce";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, sendbuf, fname);
+  ERR_IF_COMM_MPI_ERR_BUFFER(comm, recvbuf, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, count, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(comm, type, fname);
+  ERR_IF_COMM_MPI_ERR_OP(comm, op, fname);
 
   if (sendbuf != MPI_IN_PLACE) {
     mpi_copy(sendbuf, recvbuf, count, type);
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -647,18 +1148,28 @@ int MPI_Allreduce(void * sendbuf, void * recvbuf, int count, MPI_Datatype type,
  *
  *  Return the original communicator as the new communicator.
  *
+ *  The colour argument can be MPI_UNDEFINED, which may be negative.
+ *  With MPI_UNDEFINED set to -999, this will cause an error at the moment.
+ *
  *****************************************************************************/
 
 int MPI_Comm_split(MPI_Comm comm, int colour, int key, MPI_Comm * newcomm) {
 
-  assert(mpi_is_valid_comm(comm));
-  assert(newcomm);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Comm_split";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, colour < 0, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, newcomm == NULL, fname);
 
   /* Allow that a split Cartesian communicator is different */
   /* See MPI_Comm_compare() */
+
   *newcomm = MPI_COMM_WORLD;
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -670,13 +1181,20 @@ int MPI_Comm_split(MPI_Comm comm, int colour, int key, MPI_Comm * newcomm) {
 int MPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info,
 			MPI_Comm * newcomm) {
 
-  assert(mpi_is_valid_comm(comm));
-  assert(newcomm);
-  assert(split_type == MPI_COMM_TYPE_SHARED);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Comm_split_type";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, split_type != MPI_COMM_TYPE_SHARED, fname);
+  /* FIXME key controls rank assignment */
+  ERR_IF_COMM_MPI_ERR_INFO(comm, info, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, newcomm == NULL, fname);
 
   *newcomm = comm;
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -687,13 +1205,21 @@ int MPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info,
 
 int MPI_Comm_free(MPI_Comm * comm) {
 
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Comm_free";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, comm == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_COMM(*comm, fname);
+
   /* Mark Cartesian communicators as free */
 
   if (*comm > MPI_COMM_SELF) {
-    mpi_info->ncart -= 1;
+    mpi_info_->ncart -= 1;
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -706,13 +1232,17 @@ int MPI_Comm_free(MPI_Comm * comm) {
 
 int MPI_Comm_dup(MPI_Comm oldcomm, MPI_Comm * newcomm) {
 
-  assert(mpi_info);
-  assert(mpi_is_valid_comm(oldcomm));
-  assert(newcomm);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Comm_dup";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(oldcomm, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(oldcomm, newcomm == NULL, fname);
 
   *newcomm = oldcomm;
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -725,10 +1255,15 @@ int MPI_Type_indexed(int count, int * array_of_blocklengths,
 		     int * array_of_displacements, MPI_Datatype oldtype,
 		     MPI_Datatype * newtype) {
 
-  assert(count > 0);
-  assert(array_of_blocklengths);
-  assert(array_of_displacements);
-  assert(newtype);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Type_indexed";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(MPI_COMM_SELF, count, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, array_of_blocklengths == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, array_of_displacements == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(MPI_COMM_SELF, oldtype, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, newtype == NULL, fname);
 
   {
     data_t dt = {0};
@@ -738,10 +1273,11 @@ int MPI_Type_indexed(int count, int * array_of_blocklengths,
     dt.commit  = 0;
     dt.flavour = DT_NOT_IMPLEMENTED; /* Can't do displacements at moment */
 
-    mpi_data_type_add(mpi_info, &dt, newtype);
+    mpi_data_type_add(mpi_info_, &dt, newtype);
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -752,8 +1288,13 @@ int MPI_Type_indexed(int count, int * array_of_blocklengths,
 
 int MPI_Type_contiguous(int count, MPI_Datatype old, MPI_Datatype * newtype) {
 
-  assert(count > 0);
-  assert(newtype);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Type_contiguous";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(MPI_COMM_SELF, count, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(MPI_COMM_SELF, old, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, newtype == NULL, fname);
 
   {
     data_t dt = {0};
@@ -763,10 +1304,11 @@ int MPI_Type_contiguous(int count, MPI_Datatype old, MPI_Datatype * newtype) {
     dt.commit  = 0;
     dt.flavour = DT_CONTIGUOUS;
 
-    mpi_data_type_add(mpi_info, &dt, newtype);
+    mpi_data_type_add(mpi_info_, &dt, newtype);
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -777,24 +1319,31 @@ int MPI_Type_contiguous(int count, MPI_Datatype old, MPI_Datatype * newtype) {
 
 int MPI_Type_commit(MPI_Datatype * type) {
 
-  assert(type);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Type_commit";
 
-  int handle = *type;
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, type == NULL, fname);
 
-  if (handle < 0) {
-    printf("MPI_Type_commit: Attempt to commit intrinsic type\n");
-  }
-  if (handle == 0) {
-    printf("MPI_Type_commit: Attempt to commit null data type\n");
-  }
-  if (handle > mpi_info->ndatatypelast) {
-    printf("MPI_Type_commit: unrecognised handle %d\n", handle);
-  }
+  {
+    int handle = *type;
 
-  assert(mpi_info->dt[handle].handle == handle);
-  mpi_info->dt[handle].commit = 1;
+    if (handle < 0) {
+      printf("MPI_Type_commit: Attempt to commit intrinsic type\n");
+    }
+    if (handle == 0) {
+      printf("MPI_Type_commit: Attempt to commit null data type\n");
+    }
+    if (handle > mpi_info_->ndatatypelast) {
+      printf("MPI_Type_commit: unrecognised handle %d\n", handle);
+    }
 
-  return MPI_SUCCESS;
+    assert(mpi_info_->dt[handle].handle == handle);
+    mpi_info_->dt[handle].commit = 1;
+  }
+
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -805,12 +1354,19 @@ int MPI_Type_commit(MPI_Datatype * type) {
 
 int MPI_Type_free(MPI_Datatype * type) {
 
-  assert(type);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Type_free";
 
-  mpi_data_type_free(mpi_info, type);
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, type == NULL, fname);
+
+  assert(type); /* FIXME. An error. */
+
+  mpi_data_type_free(mpi_info_, type);
   assert(*type == MPI_DATATYPE_NULL);
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -822,9 +1378,14 @@ int MPI_Type_free(MPI_Datatype * type) {
 int MPI_Type_vector(int count, int blocklength, int stride,
 		    MPI_Datatype oldtype, MPI_Datatype * newtype) {
 
-  assert(count > 0);
-  assert(blocklength >= 0);
-  assert(newtype);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Type_vector";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(MPI_COMM_SELF, count, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, blocklength < 0, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(MPI_COMM_SELF, oldtype, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, newtype == NULL, fname);
 
   {
     data_t dt = {0};
@@ -833,11 +1394,13 @@ int MPI_Type_vector(int count, int blocklength, int stride,
     dt.bytes  = 0;
     dt.commit = 0;
     dt.flavour = DT_NOT_IMPLEMENTED; /* Can't do strided copy */
+    dt.stride  = stride;
 
-    mpi_data_type_add(mpi_info, &dt, newtype);
+    mpi_data_type_add(mpi_info_, &dt, newtype);
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -849,26 +1412,33 @@ int MPI_Type_vector(int count, int blocklength, int stride,
 int MPI_Cart_create(MPI_Comm oldcomm, int ndims, int * dims, int * periods,
 		    int reorder, MPI_Comm * newcomm) {
 
-  int n;
-  int icart;
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Cart_create";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(oldcomm, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(oldcomm, dims == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(oldcomm, periods == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(oldcomm, newcomm == NULL, fname);
 
-  assert(mpi_info);
   assert(ndims <= 3);
-  assert(newcomm);
 
-  mpi_info->ncart += 1;
-  icart = MPI_COMM_SELF + mpi_info->ncart;
+  int icart; /* FIXME */
+
+  mpi_info_->ncart += 1;
+  icart = MPI_COMM_SELF + mpi_info_->ncart;
   assert(icart < MAX_CART_COMM);
 
   *newcomm = icart;
 
   /* Record periodity */
 
-  for (n = 0; n < ndims; n++) {
-    mpi_info->period[icart][n] = periods[n];
+  for (int n = 0; n < ndims; n++) {
+    mpi_info_->period[icart][n] = periods[n];
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -882,21 +1452,23 @@ int MPI_Cart_create(MPI_Comm oldcomm, int ndims, int * dims, int * periods,
 int MPI_Cart_get(MPI_Comm comm, int maxdims, int * dims, int * periods,
 		 int * coords) {
 
-  int n;
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Cart_get";
 
-  assert(mpi_info);
-  assert(mpi_is_valid_comm(comm));
-  assert(dims);
-  assert(periods);
-  assert(coords);
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, dims == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, periods == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, coords == NULL, fname);
 
-  for (n = 0; n < maxdims; n++) {
+  for (int n = 0; n < maxdims; n++) {
     dims[n] = 1;
-    periods[n] = mpi_info->period[comm][n];
+    periods[n] = mpi_info_->period[comm][n];
     coords[n] = 0;
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -907,18 +1479,20 @@ int MPI_Cart_get(MPI_Comm comm, int maxdims, int * dims, int * periods,
 
 int MPI_Cart_coords(MPI_Comm comm, int rank, int maxdims, int * coords) {
 
-  int d;
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Cart_coords";
 
-  assert(mpi_info);
-  assert(comm != MPI_COMM_NULL);
-  assert(rank == 0);
-  assert(coords);
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_RANK(comm, rank, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, coords == NULL, fname);
 
-  for (d = 0; d < maxdims; d++) {
+  for (int d = 0; d < maxdims; d++) {
     coords[d] = 0;
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -931,14 +1505,18 @@ int MPI_Cart_coords(MPI_Comm comm, int rank, int maxdims, int * coords) {
 
 int MPI_Cart_rank(MPI_Comm comm, int * coords, int * rank) {
 
-  assert(mpi_info);
-  assert(mpi_is_valid_comm(comm));
-  assert(coords);
-  assert(rank);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Cart_rank";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, coords == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, rank == NULL, fname);
 
   *rank = 0;
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -950,21 +1528,25 @@ int MPI_Cart_rank(MPI_Comm comm, int * coords, int * rank) {
 int MPI_Cart_shift(MPI_Comm comm, int direction, int disp, int * rank_source,
 		   int * rank_dest) {
 
-  assert(mpi_info);
-  assert(comm != MPI_COMM_NULL);
-  assert(rank_source);
-  assert(rank_dest);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Cart_shift";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, rank_source == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, rank_dest   == NULL, fname);
 
   *rank_source = 0;
   *rank_dest = 0;
 
   /* Non periodic directions */
-  if (disp != 0 && mpi_info->period[comm][direction] != 1) {
+  if (disp != 0 && mpi_info_->period[comm][direction] != 1) {
     *rank_source = MPI_PROC_NULL;
     *rank_dest   = MPI_PROC_NULL;
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -975,14 +1557,18 @@ int MPI_Cart_shift(MPI_Comm comm, int direction, int disp, int * rank_source,
 
 int MPI_Cart_sub(MPI_Comm comm, int * remain_dims, MPI_Comm * new_comm) {
 
-  assert(mpi_info);
-  assert(mpi_is_valid_comm(comm));
-  assert(remain_dims);
-  assert(new_comm);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Cart_sub";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, remain_dims == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, new_comm == NULL, fname);
 
   *new_comm = comm;
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -993,18 +1579,21 @@ int MPI_Cart_sub(MPI_Comm comm, int * remain_dims, MPI_Comm * new_comm) {
 
 int MPI_Dims_create(int nnodes, int ndims, int * dims) {
 
-  int d;
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Dims_create";
 
-  assert(mpi_info);
-  assert(nnodes == 1);
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, dims == NULL, fname);
+
+  assert(nnodes == 1); /* FIXME */
   assert(ndims > 0);
-  assert(dims);
 
-  for (d = 0; d < ndims; d++) {
+  for (int d = 0; d < ndims; d++) {
     dims[d] = 1;
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -1015,13 +1604,19 @@ int MPI_Dims_create(int nnodes, int ndims, int * dims) {
 
 int MPI_Op_create(MPI_User_function * function, int commute, MPI_Op * op) {
 
-  /* Never actually use function, so don't really care what's here... */
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Op_create";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, function == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, op == NULL, fname);
 
-  assert(function);
+  /* commute is logical */
 
   *op = MPI_SUM;
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -1032,13 +1627,24 @@ int MPI_Op_create(MPI_User_function * function, int commute, MPI_Op * op) {
 
 int MPI_Op_free(MPI_Op * op) {
 
-  assert(op);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Op_free";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, op == NULL, fname);
 
   *op = MPI_OP_NULL;
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
+/*****************************************************************************
+ *
+ *  Internal
+ *
+ *****************************************************************************/
+
 /*****************************************************************************
  *
  *  mpi_copy
@@ -1046,7 +1652,7 @@ int MPI_Op_free(MPI_Op * op) {
  *****************************************************************************/
 
 static void mpi_copy(void * send, void * recv, int count, MPI_Datatype type) {
- 
+
   size_t sizeof_datatype = mpi_sizeof(type);
 
   assert(send);
@@ -1066,7 +1672,7 @@ static void mpi_copy(void * send, void * recv, int count, MPI_Datatype type) {
 
 static int mpi_sizeof(MPI_Datatype type) {
 
-  int size = -1;
+  int size = -1;  /* Return -1 for unrecognised or invalid type */
 
   switch (type) {
   case MPI_CHAR:
@@ -1114,12 +1720,12 @@ static int mpi_sizeof(MPI_Datatype type) {
     break;
   case MPI_PACKED:
     printf("MPI_PACKED not implemented\n");
+    break;
   default:
     /* Try user type */
     size = mpi_sizeof_user(type);
   }
 
-  assert(size != -1);
   return size;
 }
 
@@ -1134,7 +1740,7 @@ static int mpi_sizeof(MPI_Datatype type) {
 static int mpi_sizeof_user(MPI_Datatype handle) {
 
   int sz    = -1;
-  int index = mpi_data_type_handle(mpi_info, handle);
+  int index = mpi_data_type_handle(mpi_info_, handle);
 
   assert(index >= MPI_COMM_NULL); /* not intrinsic */
 
@@ -1143,7 +1749,7 @@ static int mpi_sizeof_user(MPI_Datatype handle) {
     MPI_Abort(MPI_COMM_WORLD, 0);
   }
   else {
-    sz = mpi_info->dt[index].bytes;
+    sz = mpi_info_->dt[index].bytes;
   }
 
   return sz;
@@ -1157,16 +1763,21 @@ static int mpi_sizeof_user(MPI_Datatype handle) {
 
 int MPI_Comm_set_errhandler(MPI_Comm comm, MPI_Errhandler errhandler) {
 
-  assert(mpi_info);
-  assert(mpi_is_valid_comm(comm));
-  assert(errhandler == MPI_ERRORS_ARE_FATAL);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Comm_set_errhandler";
 
-  return MPI_SUCCESS;
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+
+  assert(errhandler == MPI_ERRORS_ARE_FATAL); /* FIXME */
+
+ err:
+  return ifail;
 }
 
 #ifdef _DO_NOT_INCLUDE_MPI2_INTERFACE
 /*
- * The following are removed from MPI3... and have an apprpriate
+ * The following are removed from MPI3... and have an appropriate
  * MPI2 replacement.
  *
  * MPI_Address           ->    MPI_Get_address
@@ -1200,18 +1811,23 @@ int MPI_Comm_set_errhandler(MPI_Comm comm, MPI_Errhandler errhandler) {
  *
  *  MPI_Get_address
  *
- *  Supercedes MPI_Address
+ *  Supersedes MPI_Address
  *
  *****************************************************************************/
 
 int MPI_Get_address(const void * location, MPI_Aint * address) {
 
-  assert(location);
-  assert(address);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Get_address";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, location == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, address  == NULL, fname);
 
   *address = (MPI_Aint) location;
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -1222,12 +1838,20 @@ int MPI_Get_address(const void * location, MPI_Aint * address) {
 
 int MPI_Group_translate_ranks(MPI_Group grp1, int n, const int * ranks1,
 			      MPI_Group grp2, int * ranks2) {
-  assert(ranks1);
-  assert(ranks2);
+
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Group_translate_ranks";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(grp1, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, ranks1 == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_COMM(grp2, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, ranks2 == NULL, fname);
 
   memcpy(ranks2, ranks1, n*sizeof(int));
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -1239,7 +1863,12 @@ int MPI_Group_translate_ranks(MPI_Group grp1, int n, const int * ranks1,
 int MPI_Type_create_resized(MPI_Datatype oldtype, MPI_Aint lb, MPI_Aint extent,
 			    MPI_Datatype * newtype) {
 
-  assert(newtype);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Type_create_resized";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(MPI_COMM_SELF, oldtype, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, newtype == NULL, fname);
 
   {
     data_t dt = {0};
@@ -1248,18 +1877,20 @@ int MPI_Type_create_resized(MPI_Datatype oldtype, MPI_Aint lb, MPI_Aint extent,
     dt.bytes   = extent;
     dt.commit  = 0;
     dt.flavour = DT_NOT_IMPLEMENTED; /* Should be  old.flavour */
+    dt.lb      = lb;
 
-    mpi_data_type_add(mpi_info, &dt, newtype);
+    mpi_data_type_add(mpi_info_, &dt, newtype);
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
  *
  *  MPI_Type_create_struct
  *
- *  Supercedes MPI_Type_struct()
+ *  Supersedes MPI_Type_struct()
  *
  *****************************************************************************/
 
@@ -1288,7 +1919,7 @@ int MPI_Type_create_struct(int count, int array_of_blocklengths[],
     dt.bytes = (array_of_displacements[count-1] - array_of_displacements[0])
       + mpi_sizeof(array_of_types[count-1]);
 
-    mpi_data_type_add(mpi_info, &dt, newtype);
+    mpi_data_type_add(mpi_info_, &dt, newtype);
   }
 
   return MPI_SUCCESS;
@@ -1303,27 +1934,33 @@ int MPI_Type_create_struct(int count, int array_of_blocklengths[],
 int MPI_Type_get_extent(MPI_Datatype datatype, MPI_Aint * lb,
 			MPI_Aint * extent) {
 
-  int handle = MPI_DATATYPE_NULL;
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Type_get_extent";
 
-  assert(lb);
-  assert(extent);
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(MPI_COMM_SELF, datatype, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, lb == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, extent == NULL, fname);
 
   if (datatype < 0) {
-    /* intrinsic allowed? Why? */
-    assert(0);
+    /* Intrinsic */
+    *lb = 0;
     *extent = mpi_sizeof(datatype);
   }
+  else {
 
-  handle = mpi_data_type_handle(mpi_info, datatype);
+    int handle = mpi_data_type_handle(mpi_info_, datatype);
 
-  if (handle == MPI_DATATYPE_NULL) {
-    printf("MPI_Type_get_Extent: null handle\n");
-  }
+    if (handle == MPI_DATATYPE_NULL) {
+      printf("MPI_Type_get_extent: null handle\n");
+    }
 
-  *lb = 0; /* Always, at the moment */
-  *extent = mpi_info->dt[handle].bytes;
+    *lb = 0; /* Always, at the moment */
+    *extent = mpi_info_->dt[handle].bytes;
+  }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -1334,11 +1971,17 @@ int MPI_Type_get_extent(MPI_Datatype datatype, MPI_Aint * lb,
 
 int MPI_Type_size(MPI_Datatype datatype, int * sz) {
 
-  assert(sz);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Type_size";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(MPI_COMM_SELF, datatype, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, sz == NULL, fname);
 
   *sz = mpi_sizeof(datatype);
 
-  return 0;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -1350,12 +1993,20 @@ int MPI_Type_size(MPI_Datatype datatype, int * sz) {
 int MPI_File_open(MPI_Comm comm, const char * filename, int amode,
 		  MPI_Info info, MPI_File * fh) {
 
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_File_open";
+
   FILE * fp = NULL;
   const char * fdmode = NULL;
 
-  assert(mpi_is_valid_comm(comm));
-  assert(filename);
-  assert(fh);
+  /* Default file error handler responsible ... */
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_FILE_MPI_ERR_COMM(MPI_FILE_NULL, comm, fname);
+  ERR_IF_FILE_MPI_ERR_ARG(MPI_FILE_NULL, filename == NULL, fname);
+  ERR_IF_FILE_MPI_ERR_AMODE(MPI_FILE_NULL, amode, fname);
+  ERR_IF_FILE_MPI_ERR_INFO(MPI_FILE_NULL, info, fname);
+  ERR_IF_FILE_MPI_ERR_ARG(MPI_FILE_NULL, fh == NULL, fname);
 
   /* Exactly one of RDONLY, WRONLY, or RDWR must be present. */
   /* RDONLY => no CREATE or EXCL. */
@@ -1403,9 +2054,10 @@ int MPI_File_open(MPI_Comm comm, const char * filename, int amode,
     return MPI_ERR_NO_SUCH_FILE;
   }
 
-  *fh = mpi_file_handle_retain(mpi_info, fp);
+  *fh = mpi_file_handle_retain(mpi_info_, fp);
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -1416,37 +2068,56 @@ int MPI_File_open(MPI_Comm comm, const char * filename, int amode,
 
 int MPI_File_close(MPI_File * fh) {
 
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_File_close";
+
   FILE * fp = NULL;
 
-  assert(fh);
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  /* ERR_IF_NULL_POINTER(fh == NULL, MPI_ERR_ARG, fname); */
+  if (fh == NULL) {
+    ifail = MPI_ERR_ARG;
+    goto err;
+  }
+  ERR_IF_FILE_MPI_ERR_FILE(*fh, fname);
 
-  fp = mpi_file_handle_release(mpi_info, *fh);
+  fp = mpi_file_handle_release(mpi_info_, *fh);
 
   if (fp == NULL) {
     printf("MPI_File_close: invalid file handle\n");
-    return MPI_ERR_FILE;
+    ifail = MPI_ERR_FILE;
   }
   else {
     fclose(fp);
     *fh = MPI_FILE_NULL;
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
  *
  *  MPI_File_delete
  *
+ *  Can return MPI_ERR_NO_SUCH_FILE, at least.
+ *
  *****************************************************************************/
 
 int MPI_File_delete(const char * filename, MPI_Info info) {
 
-  assert(filename);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_File_delete";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_FILE_MPI_ERR_ARG(MPI_FILE_NULL, filename == NULL, fname);
+  ERR_IF_FILE_MPI_ERR_INFO(MPI_FILE_NULL, info, fname);
 
+  /* remove() returns 0 on success, -1 otherwise. errno is set. */
   remove(filename);
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -1483,7 +2154,7 @@ int MPI_Type_create_subarray(int ndims, const int * array_of_sizes,
     dt.commit  = 0;
     dt.flavour = DT_SUBARRAY;
 
-    mpi_data_type_add(mpi_info, &dt, newtype);
+    mpi_data_type_add(mpi_info_, &dt, newtype);
   }
   return MPI_SUCCESS;
 }
@@ -1497,28 +2168,32 @@ int MPI_Type_create_subarray(int ndims, const int * array_of_sizes,
 int MPI_File_get_view(MPI_File fh, MPI_Offset * disp, MPI_Datatype * etype,
 		      MPI_Datatype * filetype, char * datarep) {
 
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_File_get_view";
+
   FILE * fp = NULL;
 
-  assert(disp);
-  assert(etype);
-  assert(filetype);
-  assert(datarep);
-  assert(mpi_info);
+  ERR_IF_FILE_MPI_ERR_FILE(fh, fname);
+  ERR_IF_FILE_MPI_ERR_ARG(fh, disp == NULL, fname);
+  ERR_IF_FILE_MPI_ERR_ARG(fh, etype == NULL, fname);
+  ERR_IF_FILE_MPI_ERR_ARG(fh, filetype == NULL, fname);
+  ERR_IF_FILE_MPI_ERR_ARG(fh, datarep == NULL, fname);
 
-  fp = mpi_file_handle_to_fp(mpi_info, fh);
+  fp = mpi_file_handle_to_fp(mpi_info_, fh);
 
   if (fp == NULL) {
     printf("MPI_File_get_view: invalid file handle\n");
     exit(0);
   }
   else {
-    file_t * file = &mpi_info->filelist[fh];
+    file_t * file = &mpi_info_->filelist[fh];
     *disp = file->disp;
     *etype = file->etype;
     *filetype = file->filetype;
     strncpy(datarep, file->datarep, MPI_MAX_DATAREP_STRING-1);
   }
 
+ err:
   return MPI_SUCCESS;
 }
 /*****************************************************************************
@@ -1531,19 +2206,28 @@ int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype,
 		      MPI_Datatype filetype, const char * datarep,
 		      MPI_Info info) {
 
-  FILE * fp = NULL;
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_File_set_view";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_FILE_MPI_ERR_FILE(fh, fname);
+  ERR_IF_FILE_MPI_ERR_DATATYPE(fh, etype, fname);
+  ERR_IF_FILE_MPI_ERR_DATATYPE(fh, filetype, fname);
+  /* FIXME "datarep" should be native */
+  ERR_IF_FILE_MPI_ERR_INFO(fh, info, fname);
 
   assert(datarep);
-  assert(mpi_info);
 
-  fp = mpi_file_handle_to_fp(mpi_info, fh);
+  FILE * fp = NULL;
+
+  fp = mpi_file_handle_to_fp(mpi_info_, fh);
 
   if (fp == NULL) {
     printf("MPI_File_set_view: invalid file handle\n");
     exit(0);
   }
   else {
-    file_t * file = &mpi_info->filelist[fh];
+    file_t * file = &mpi_info_->filelist[fh];
     file->disp = disp;
     file->etype = etype;
     file->filetype = filetype;
@@ -1552,7 +2236,8 @@ int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype,
     /* info is currently discarded */
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -1563,11 +2248,20 @@ int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype,
 
 int MPI_File_read_all(MPI_File fh, void * buf, int count,
 		      MPI_Datatype datatype, MPI_Status * status) {
-  FILE * fp = NULL;
 
-  assert(buf);
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_File_read_all";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_FILE_MPI_ERR_FILE(fh, fname);
+  ERR_IF_FILE_MPI_ERR_BUFFER(fh, buf, fname);
+  /* count: integer */
+  ERR_IF_FILE_MPI_ERR_DATATYPE(fh, datatype, fname);
+  /* status may be MPI_STATUS_IGNORE */
+
+  FILE * fp = NULL;
 
-  fp = mpi_file_handle_to_fp(mpi_info, fh);
+  fp = mpi_file_handle_to_fp(mpi_info_, fh);
 
   if (fp == NULL) {
     printf("MPI_File_read_all: invalid_file handle\n");
@@ -1592,7 +2286,9 @@ int MPI_File_read_all(MPI_File fh, void * buf, int count,
     }
   }
 
-  return MPI_SUCCESS;
+ err:
+  if (status != MPI_STATUS_IGNORE) status->MPI_ERROR = ifail;
+  return ifail;
 }
 
 /*****************************************************************************
@@ -1604,36 +2300,47 @@ int MPI_File_read_all(MPI_File fh, void * buf, int count,
 int MPI_File_write_all(MPI_File fh, const void * buf, int count,
 		       MPI_Datatype datatype, MPI_Status * status) {
 
-  FILE * fp = NULL;
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_File_write_all";
 
-  assert(buf);
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_FILE_MPI_ERR_FILE(fh, fname);
+  ERR_IF_FILE_MPI_ERR_BUFFER(fh, buf, fname);
+  /* count: integer */
+  ERR_IF_FILE_MPI_ERR_DATATYPE(fh, datatype, fname);
+  /* status may be MPI_STATUS_IGNORE */
 
-  fp = mpi_file_handle_to_fp(mpi_info, fh);
+  {
+    FILE * fp = mpi_file_handle_to_fp(mpi_info_, fh);
 
-  if (fp == NULL) {
-    printf("MPI_File_write_all: invalid_file handle");
-    exit(0);
-  }
-  else {
+    if (fp == NULL) {
+      printf("MPI_File_write_all: invalid_file handle");
+      exit(0); /* FIXME */
+    }
+    else {
 
-    /* Translate to a simple fwrite() */
+      /* Translate to a simple fwrite() */
 
-    size_t size   = mpi_sizeof(datatype);
-    size_t nitems = count;
-    size_t nw = fwrite(buf, size, nitems, fp);
+      size_t size   = mpi_sizeof(datatype);
+      size_t nitems = count;
+      size_t nw = fwrite(buf, size, nitems, fp);
 
-    if (nw != nitems) {
-      printf("MPI_File_write_all(): incorrect number of items in fwrite()\n");
-    }
+      if (nw != nitems) {
+	printf("MPI_File_write_all(): incorrect number of items in fwrite()\n");
+      }
 
-    if (ferror(fp)) {
-      perror("perror: ");
-      printf("MPI_File_write_all() file operation failed\n");
-      exit(0);
+      if (ferror(fp)) {
+	perror("perror: ");
+	printf("MPI_File_write_all() file operation failed\n");
+	exit(0); /* FIXME */
+      }
     }
   }
 
-  return MPI_SUCCESS;
+ err:
+  if (status != MPI_STATUS_IGNORE) status->MPI_ERROR = ifail;
+
+  return ifail;
 }
 
 /*****************************************************************************
@@ -1645,13 +2352,25 @@ int MPI_File_write_all(MPI_File fh, const void * buf, int count,
 int MPI_File_write_all_begin(MPI_File fh, const void * buf, int count,
 			     MPI_Datatype datatype) {
 
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_File_write_all_begin";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_FILE_MPI_ERR_FILE(fh, fname);
+  ERR_IF_FILE_MPI_ERR_BUFFER(fh, buf, fname);
+  /* count: integer */
+  ERR_IF_FILE_MPI_ERR_DATATYPE(fh, datatype, fname);
+
   /* We are going to do it here and throw away the status */
 
-  MPI_Status status = {0};
+  {
+    MPI_Status status = {0};
 
-  MPI_File_write_all(fh, buf, count, datatype, &status);
+    ifail = MPI_File_write_all(fh, buf, count, datatype, &status);
+  }
 
-  return 0;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -1665,7 +2384,16 @@ int MPI_File_write_all_end(MPI_File fh, const void * buf, MPI_Status * status) {
   /* A real implementation returns the number of bytes written in the
    * status object. */
 
-  return 0;
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_File_write_all_end";
+
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_FILE_MPI_ERR_FILE(fh, fname);
+  ERR_IF_FILE_MPI_ERR_BUFFER(fh, buf, fname);
+  ERR_IF_FILE_MPI_ERR_ARG(fh, status == NULL, fname);
+
+ err:
+  return ifail;
 }
 
 #endif /* _DO_NOT_INCLUDE_MPI2_INTERFACE */
@@ -1779,7 +2507,7 @@ static int mpi_data_type_handle(mpi_info_t * ctxt, MPI_Datatype handle) {
  *****************************************************************************/
 
 static MPI_File mpi_file_handle_retain(mpi_info_t * mpi, FILE * fp) {
-  
+
   MPI_File fh = MPI_FILE_NULL;
 
   assert(mpi);
@@ -1845,3 +2573,55 @@ static FILE * mpi_file_handle_to_fp(mpi_info_t * mpi, MPI_File fh) {
 
   return fp;
 }
+
+/*****************************************************************************
+ *
+ *  mpi_file_handle_invalid
+ *
+ *  Returns MPI_SUCCESS if fh is valid or, MPI_ERR_FILE if invalid.
+ *
+ *****************************************************************************/
+
+static int mpi_file_handle_invalid(MPI_File fh) {
+
+  int ifail = MPI_SUCCESS;
+
+  assert(mpi_info_);
+
+  if (mpi_file_handle_to_fp(mpi_info_, fh) == NULL) ifail = MPI_ERR_FILE;
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  mpi_tag_valid MPI_ERR_TAG
+ *
+ *****************************************************************************/
+
+static int mpi_tag_invalid(int tag) {
+
+  int ifail = MPI_ERR_TAG;
+
+  /* Special values: MPI_ANY_TAG */
+  if (tag == MPI_ANY_TAG) ifail = MPI_SUCCESS;
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  mpi_datayupe_invalid
+ *
+ *****************************************************************************/
+
+static int mpi_datatype_invalid(MPI_Datatype dt) {
+
+  int ifail = MPI_SUCCESS;
+
+  /* Look at the size to determined whether valid */
+  int sz = mpi_sizeof(dt);
+  if (sz < 0) ifail = MPI_ERR_DATATYPE;
+
+  return ifail;
+}
diff --git a/mpi_s/mpi_tests.c b/mpi_s/mpi_tests.c
index 9e3918f59..ac5f2004f 100644
--- a/mpi_s/mpi_tests.c
+++ b/mpi_s/mpi_tests.c
@@ -631,13 +631,14 @@ int test_mpi_file_write_all(void) {
   MPI_Type_create_subarray(ndims, sizes, subsizes, starts, MPI_ORDER_C,
 			     etype, &filetype);
   MPI_Type_commit(&filetype);
-  
+
   {
     /* Write */
 
-    MPI_File fh = MPI_FILE_NULL; 
+    MPI_File fh = MPI_FILE_NULL;
     MPI_Offset disp = 0;
 
+    int ifail = MPI_SUCCESS;
     int count = 1;
     double wbuf[NX*NY] = {0};
 
@@ -646,15 +647,21 @@ int test_mpi_file_write_all(void) {
       wbuf[id] = 1.0*id;
     }
 
-    MPI_File_open(comm, filename, MPI_MODE_WRONLY+MPI_MODE_CREATE, info, &fh);
+    ifail = MPI_File_open(comm, filename, MPI_MODE_WRONLY + MPI_MODE_CREATE,
+			  info, &fh);
+    assert(ifail == MPI_SUCCESS);
 
     /* Set the view */
     /* As this is serial the datetype is the filetype */
 
-    MPI_File_set_view(fh, disp, etype, filetype, "native", info);
+    ifail = MPI_File_set_view(fh, disp, etype, filetype, "native", info);
+    assert(ifail == MPI_SUCCESS);
 
-    MPI_File_write_all(fh, wbuf, count, filetype, MPI_STATUS_IGNORE);
-    MPI_File_close(&fh);
+    ifail = MPI_File_write_all(fh, wbuf, count, filetype, MPI_STATUS_IGNORE);
+    assert(ifail == MPI_SUCCESS);
+
+    ifail = MPI_File_close(&fh);
+    assert(ifail == MPI_SUCCESS);
   }
 
 
@@ -663,16 +670,22 @@ int test_mpi_file_write_all(void) {
     MPI_File fh = MPI_FILE_NULL;
     MPI_Offset disp = 0;
 
+    int ifail = MPI_SUCCESS;
     int count = 1;
     double rbuf[NX*NY] = {0};
 
-    MPI_File_open(comm, filename, MPI_MODE_RDONLY, info, &fh);
+    ifail = MPI_File_open(comm, filename, MPI_MODE_RDONLY, info, &fh);
+    assert(ifail == MPI_SUCCESS);
 
     /* Set the view */
-    MPI_File_set_view(fh, disp, etype, filetype, "native", info);
+    ifail = MPI_File_set_view(fh, disp, etype, filetype, "native", info);
+    assert(ifail == MPI_SUCCESS);
 
-    MPI_File_read_all(fh, rbuf, count, filetype, MPI_STATUS_IGNORE);
-    MPI_File_close(&fh);
+    ifail = MPI_File_read_all(fh, rbuf, count, filetype, MPI_STATUS_IGNORE);
+    assert(ifail == MPI_SUCCESS);
+
+    ifail = MPI_File_close(&fh);
+    assert(ifail == MPI_SUCCESS);
 
     for (int id = 0; id < NX*NY; id++) {
       assert(fabs(rbuf[id] - 1.0*id) < DBL_EPSILON);
@@ -703,4 +716,3 @@ int test_mpi_comm_split_type(void) {
 
   return 0;
 }
-

From e1e277b68c4902c73f3e4e4865a3c630612d9e19 Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@epcc.ed.ac.uk>
Date: Sun, 24 Nov 2024 16:17:28 +0000
Subject: [PATCH 050/133] Improve error handling

---
 mpi_s/mpi.h        |    7 +
 mpi_s/mpi_serial.c | 1416 ++++++++++++++++++++++++++------------------
 mpi_s/mpi_tests.c  |  319 ++++++++--
 3 files changed, 1142 insertions(+), 600 deletions(-)

diff --git a/mpi_s/mpi.h b/mpi_s/mpi.h
index 1464ac9e3..b726d693b 100644
--- a/mpi_s/mpi.h
+++ b/mpi_s/mpi.h
@@ -58,12 +58,17 @@ typedef intmax_t MPI_Offset;
 
 enum return_codes {
   MPI_SUCCESS = 0,               /* Success */
+  MPI_ERR_ACCESS,                /* Permission denied */
+  MPI_ERR_AMODE,                 /* Invalid mode argument */
   MPI_ERR_ARG,                   /* Invalid argument of other kind */
   MPI_ERR_BUFFER,                /* Invalid buffer pointer argument */
   MPI_ERR_COMM,                  /* Invalid communicator argument */
   MPI_ERR_COUNT,                 /* Invalid count argument */
   MPI_ERR_DATATYPE,              /* Invalid datatype */
   MPI_ERR_INFO,                  /* Invalid info argument */
+  MPI_ERR_ERRHANDLER,            /* Invalid errhandler handle */
+  MPI_ERR_INTERN,                /* Internal (implementation) error */
+  MPI_ERR_IO,                    /* Other i/o error */
   MPI_ERR_FILE,                  /* Bad file handle */
   MPI_ERR_NO_SUCH_FILE,          /* File does not exist */
   MPI_ERR_OP,                    /* Invalid operation argument */
@@ -263,6 +268,8 @@ int MPI_Cart_sub(MPI_Comm comm, int * remain_dims, MPI_Comm * new_comm);
 /* Bindings for environmental inquiry */
 
 int MPI_Errhandler_set(MPI_Comm comm, MPI_Errhandler errhandler);
+int MPI_Error_string(int ierr, char * str, int * lenresult);
+int MPI_Error_class(int ierrcode, int * ierrclass);
 
 double MPI_Wtime(void);
 double MPI_Wtick(void);
diff --git a/mpi_s/mpi_serial.c b/mpi_s/mpi_serial.c
index 15f803ce4..1b7ce5520 100644
--- a/mpi_s/mpi_serial.c
+++ b/mpi_s/mpi_serial.c
@@ -24,12 +24,14 @@
  *****************************************************************************/
 
 #include <assert.h>
+#include <errno.h>
 #include <stdarg.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
+
 /* Use clock() only as a last resort in serial (no threads) */
 
 #ifdef _OPENMP
@@ -66,11 +68,13 @@ struct internal_data_type_s {
 };
 
 struct internal_file_view_s {
+  MPI_Comm comm;                /* File communicator */
   FILE * fp;                    /* file pointer */
   MPI_Offset   disp;            /* e.g., from MPI_File_set_view() */
   MPI_Datatype etype;
   MPI_Datatype filetype;
   char datarep[MPI_MAX_DATAREP_STRING];
+  char errorstr[MPI_MAX_ERROR_STRING];
 };
 
 typedef struct mpi_info_s mpi_info_t;
@@ -79,6 +83,7 @@ struct mpi_info_s {
   int initialised;               /* MPI initialised */
   int ncart;                     /* Number of Cartesian communicators */
   int period[MAX_CART_COMM][3];  /* Periodic Cartesisan per communicator */
+  int reorder[MAX_CART_COMM];    /* Reorder arguments per Cartesian comm */
   int ndatatype;                 /* Current number of data types */
   data_t dt[MAX_USER_DT];        /* Internal information per data type */
   int ndatatypelast;             /* Current free list extent */
@@ -86,10 +91,9 @@ struct mpi_info_s {
 
   file_t filelist[MAX_USER_FILE]; /* MPI_File information for open files */
 
-  /* At the moment there is a single error string rather than one per
-   * comm and file */
-  char comm_error_string[MPI_MAX_ERROR_STRING];
-  char file_error_string[MPI_MAX_ERROR_STRING];
+  /* The following arguments are recorded but not used */
+  int key;
+  int commute;
 };
 
 static mpi_info_t * mpi_info_ = NULL;
@@ -97,339 +101,287 @@ static mpi_info_t * mpi_info_ = NULL;
 static void mpi_copy(void * send, void * recv, int count, MPI_Datatype type);
 static int mpi_sizeof(MPI_Datatype type);
 static int mpi_sizeof_user(MPI_Datatype handle);
-static int mpi_is_valid_comm(MPI_Comm comm);
 static int mpi_data_type_add(mpi_info_t * ctxt, const data_t * dt,
 			     MPI_Datatype * newtype);
 static int mpi_data_type_free(mpi_info_t * ctxt, MPI_Datatype * handle);
 static int mpi_data_type_handle(mpi_info_t * ctxt, MPI_Datatype handle);
+static int mpi_datatype_intrinsic(MPI_Datatype dt);
+static int mpi_datatype_user(MPI_Datatype dt);
 
 static MPI_File mpi_file_handle_retain(mpi_info_t * ctxt, FILE * fp);
 static FILE *   mpi_file_handle_release(mpi_info_t * ctxt, MPI_File handle);
 static FILE *   mpi_file_handle_to_fp(mpi_info_t * info, MPI_File handle);
 
-static int      mpi_datatype_invalid(MPI_Datatype dt);
-static int      mpi_file_handle_invalid(MPI_File fh);
-static int      mpi_tag_invalid(int tag);
+/* Detect various errors */
+
+static int mpi_err_amode(int amode);
+static int mpi_err_arg(int arg);
+static int mpi_err_buffer(const void * buf);
+static int mpi_err_comm(MPI_Comm comm);
+static int mpi_err_count(int count);
+static int mpi_err_datatype(MPI_Datatype dt);
+static int mpi_err_errhandler(MPI_Errhandler errhandler);
+static int mpi_err_file(MPI_File file);
+static int mpi_err_info(MPI_Info info);
+static int mpi_err_op(MPI_Op op);
+static int mpi_err_rank(int rank);
+static int mpi_err_root(int root);
+static int mpi_err_tag(int tag);
 
 /* In principle, the errhandler is registered against a comm, file, etc */
-/* The "errors_return" handler would store the message and return */
-/* The "errors_are_fatal" handler would store the message, print. and fail */
-
-static int mpi_comm_set_error_string(MPI_Comm comm, const char * fmt, ...) {
-
-  /* In principle, handled on a per communicator basis */
-
-  va_list args;
-
-  assert(mpi_info_);
-  assert(comm != MPI_COMM_NULL);
-
-  va_start(args, fmt);
-  vsnprintf(mpi_info_->comm_error_string, MPI_MAX_ERROR_STRING, fmt, args);
-  va_end(args);
-
-  return 0;
-}
+/* At the moment, we have only two ... */
 
-static int comm_mpi_err_comm_handler(MPI_Comm comm, const char * fname) {
+/* typedef MPI_Comm_errhandler_function(MPI_Comm * comm, int * ierr, ...) */
+/* typedef MPI_File_errhandler_function(MPI_File * fh,   int * ierr, ...) */
 
-  int ifail = MPI_SUCCESS;
+static void mpi_comm_errors_are_fatal(MPI_Comm * comm, int * ifail, ...);
+static void mpi_file_errors_return(MPI_File * file, int * ifail, ...);
 
-  if (mpi_is_valid_comm(comm) == 0) {
-    ifail = MPI_ERR_COMM;
-    mpi_comm_set_error_string(comm, "%s: invalid communicator", fname);
+#define ERR_IF_MPI_NOT_INITIALISED(fn)					\
+  {									\
+    if (mpi_info_ == NULL) {						\
+      /* Always illegal; abort */					\
+      printf("The %s function was called before either MPI_Init() or"	\
+	     "MPI_Init_thread(). This is illegal.", fn);		\
+      exit(-1);								\
+    }									\
   }
 
-  return ifail;
-}
+/* Macros for argument checking expected to be in a routine of the form: */
+/*
+ * {
+ *   int ifail = MPI_SUCCESS;
+ *   MPI_Comm self = MPI_COMM_SELF;
+ *
+ *   MACRO(self, ...);
+ *
+ *   err:
+ *   return ifail;
+ * }
+ *   The comm and file arguments of the macro must be lvalues, as the address
+ *   is taken to call the error handler.
+ */
 
-static int comm_mpi_err_buffer_handler(MPI_Comm comm, const void * buf,
-				       const char * fname) {
-  int ifail = MPI_SUCCESS;
-  /* FIXME Need to watch out for MPI_IN_PLACE */
-  if (buf == NULL) {
-    ifail = MPI_ERR_BUFFER;
-    mpi_comm_set_error_string(comm, "%s: NULL buffer pointer", fname);
-    /* erhandler */
+#define ERR_IF_COMM_MPI_ERR_COMM(comm, fn)				\
+  {									\
+    ifail = mpi_err_comm(comm);						\
+    mpi_comm_errors_are_fatal(&comm, &ifail, "%s: invalid comm", fn);	\
+    if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-  return ifail;
-}
-
-static int comm_mpi_err_count_handler(MPI_Comm comm, int count,
-				      const char * fname) {
-  int ifail = MPI_SUCCESS;
-
-  if (count < 0) {
-    ifail = MPI_ERR_COUNT;
-    mpi_comm_set_error_string(comm, "%s(): count must be >= 0", fname);
-    /* Call errhandler */
+#define ERR_IF_COMM_MPI_ERR_BUFFER(comm, buf, fn)			\
+  {									\
+    ifail = mpi_err_buffer(buf);					\
+    mpi_comm_errors_are_fatal(&comm, &ifail, "%s: invalid buffer", fn); \
+    if (ifail != MPI_SUCCESS) goto err;					\
   }
-  return ifail;
-}
-
-static int comm_mpi_err_datatype_handler(MPI_Comm comm, MPI_Datatype dt,
-					 const char * fname) {
-  int ifail = MPI_SUCCESS;
 
-  if (mpi_datatype_invalid(dt)) {
-    ifail = MPI_ERR_DATATYPE;
-    mpi_comm_set_error_string(comm, "%s(): invalid", fname);
-    /* Call error handler */
+#define ERR_IF_COMM_MPI_ERR_COUNT(comm, count, fn)			\
+  {									\
+    ifail = mpi_err_count(count);					\
+    mpi_comm_errors_are_fatal(&comm, &ifail, "%s, invalid count", fn);	\
+    if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-  return ifail;
-}
-
-static int comm_mpi_err_op_handler(MPI_Comm comm, MPI_Op op,
-				   const char * fname) {
-  int ifail = MPI_SUCCESS;
-
-  if (0) { /* FIXME need to check for valid op */
-    ifail = MPI_ERR_OP;
-    mpi_comm_set_error_string(comm, "%s(): invalid MPI_Op argument", fname);
-    /* Call error handler */
+#define ERR_IF_COMM_MPI_ERR_DATATYPE(comm, dt, fn)			\
+  {									\
+    ifail = mpi_err_datatype(dt);					\
+    mpi_comm_errors_are_fatal(&comm, &ifail,				\
+			      "%s: invalid datatype (%d)", fn, dt);	\
+    if (ifail != MPI_SUCCESS) goto err;					\
   }
-  return ifail;
-}
 
-static int comm_mpi_err_rank_handler(MPI_Comm comm, int rank,
-				     const char * fname) {
-  int ifail = MPI_SUCCESS;
-
-  if (rank == 0 || rank == MPI_PROC_NULL) {
-    ; /* pass */
-  }
-  else {
-    ifail = MPI_ERR_RANK;
-    mpi_comm_set_error_string(comm, "%s(): invalid rank", fname);
-    /* Call errhandler */
+#define ERR_IF_COMM_MPI_ERR_INFO(comm, info, fn)	 \
+  {							 \
+    ifail = mpi_err_info(info);						\
+    mpi_comm_errors_are_fatal(&comm, &ifail, "%s: invalid info", fn);	\
+    if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-  return ifail;
-}
-
-static int comm_mpi_err_info_handler(MPI_Comm comm, MPI_Info info,
-				     const char * func) {
-
-  int ifail = MPI_SUCCESS;
-
-  /* Only handling MPI_INFO_NULL at the moment */
-
-  if (info != MPI_INFO_NULL) {
-    ifail = MPI_ERR_INFO;
-    mpi_comm_set_error_string(comm, "%s(): invalid info argument", func);
-    /* Handler */
+#define ERR_IF_COMM_MPI_ERR_OP(comm, op, fn)				\
+  {									\
+    ifail = mpi_err_op(op);						\
+    mpi_comm_errors_are_fatal(&comm, &ifail, "%s: invalid op", fn);	\
+    if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-  return ifail;
-}
-
-
-static int comm_mpi_err_root_handler(MPI_Comm comm, int root,
-				     const char * fname) {
-  int ifail = MPI_SUCCESS;
-
-  /* Root should not be MPI_PROC_NULL */
-  if (root != 0) {
-    ifail = MPI_ERR_ROOT;
-    mpi_comm_set_error_string(comm, "%s(): invalid root argument", fname);
-    /* Call errhandler */
+#define ERR_IF_COMM_MPI_ERR_RANK(comm, rank, fn)			\
+  {									\
+    ifail = mpi_err_rank(rank);						\
+    mpi_comm_errors_are_fatal(&comm, &ifail, "%s: invalid rank", fn);	\
+    if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-  return ifail;
-}
-
-static int comm_mpi_err_tag_handler(MPI_Comm comm, int tag,
-				    const char * fname) {
-  int ifail = MPI_SUCCESS;
-
-  if (mpi_tag_invalid(tag)) {
-    ifail = MPI_ERR_TAG;
-    mpi_comm_set_error_string(comm, "%s(): invalid tag", fname);
-    /* Call errhandler */
+#define ERR_IF_COMM_MPI_ERR_ROOT(comm, root, fn)			\
+  {									\
+    ifail = mpi_err_root(root);						\
+    mpi_comm_errors_are_fatal(&comm, &ifail, "%s: invalid root", fn);	\
+    if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-  return ifail;
-}
-
-static int comm_mpi_err_arg_handler(MPI_Comm comm, const char * condition,
-				    const char * fname) {
-  int ifail = MPI_ERR_ARG;
-  mpi_comm_set_error_string(comm, "%s(): argument %s", condition, fname);
-  /* Call errhandler */
-
-  return ifail;
-}
-
-
-#define ERR_IF_MPI_NOT_INITIALISED(fname)				\
+#define ERR_IF_COMM_MPI_ERR_TAG(comm, tag, fn)				\
   {									\
-    if (mpi_info_ == NULL) {						\
-      /* Illegal; abort */						\
-      printf("The %s() function was called before either MPI_Init() or"	\
-	     "MPI_Init_thread(). This is illegal.", fname);		\
-      exit(-1);								\
-    }									\
+    ifail = mpi_err_tag(tag);						\
+    mpi_comm_errors_are_fatal(&comm, &ifail, "%s: invalid tag", fn);	\
+    if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-
-#define ERR_IF_COMM_MPI_ERR_COMM(comm, func)				\
+#define ERR_IF_COMM_MPI_ERR_ARG(comm, arg, fn)				\
   {									\
-    ifail = comm_mpi_err_comm_handler(comm, func);			\
+    ifail = mpi_err_arg((arg));					\
+    mpi_comm_errors_are_fatal(&comm, &ifail, "%s: %s", fn, #arg);	\
     if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-#define ERR_IF_COMM_MPI_ERR_BUFFER(comm, buf, func)			\
+#define ERR_IF_COMM_MPI_ERR_ERRHANDLER(comm, errhandler, fn)		\
   {									\
-    ifail = comm_mpi_err_buffer_handler(comm, buf, func);		\
+    ifail = mpi_err_errhandler(errhandler);				\
+    mpi_comm_errors_are_fatal(&comm, &ifail, "%s: invalid errhandler", fn); \
     if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-#define ERR_IF_COMM_MPI_ERR_COUNT(comm, count, func)			\
+/* MPI_File routine argument error checkers */
+
+#define ERR_IF_FILE_MPI_ERR_COMM(file, comm, fn)			\
   {									\
-    ifail = comm_mpi_err_count_handler(comm, count, func);		\
+    ifail = mpi_err_comm(comm);						\
+    mpi_file_errors_return(&file, &ifail, "%s: invalid comm", fn);	\
     if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-#define ERR_IF_COMM_MPI_ERR_DATATYPE(comm, dt, func)			\
+#define ERR_IF_FILE_MPI_ERR_ARG(file, arg, fn)				\
   {									\
-    ifail = comm_mpi_err_datatype_handler(comm, dt, func);		\
+    ifail = mpi_err_arg((arg));						\
+    mpi_file_errors_return(&file, &ifail, "%s: %s", fn, #arg);		\
     if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-#define ERR_IF_COMM_MPI_ERR_INFO(comm, info, func)	 \
-  {							 \
-    ifail = comm_mpi_err_info_handler(comm, info, func); \
-    if (ifail != MPI_SUCCESS) goto err;			 \
+#define ERR_IF_FILE_MPI_ERR_AMODE(file, amode, func)			\
+  {									\
+    ifail = mpi_err_amode(amode);					\
+    mpi_file_errors_return(&file, &ifail, "%s: invalid amode", func);	\
+    if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-#define ERR_IF_COMM_MPI_ERR_OP(comm, op, func)				\
+#define ERR_IF_FILE_MPI_ERR_INFO(file, info, func) 			\
   {									\
-    ifail = comm_mpi_err_op_handler(comm, op, func);			\
+    ifail = mpi_err_info(info);						\
+    mpi_file_errors_return(&file, &ifail, "%s: invalid info", func);	\
     if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-#define ERR_IF_COMM_MPI_ERR_RANK(comm, rank, func)			\
+#define ERR_IF_FILE_MPI_ERR_FILE(file, func)				\
   {									\
-    ifail = comm_mpi_err_rank_handler(comm, rank, func);		\
-    if (ifail != MPI_SUCCESS) goto err;					\
+    ifail = mpi_err_file(file);						\
+    mpi_file_errors_return(&file, &ifail, "%s: invalid file", func);	\
+    if (ifail != MPI_SUCCESS)  goto err;				\
   }
 
-#define ERR_IF_COMM_MPI_ERR_ROOT(comm, root, func)			\
+#define ERR_IF_FILE_MPI_ERR_BUFFER(file, buf, func)			\
   {									\
-    ifail = comm_mpi_err_root_handler(comm, root, func);		\
+    ifail = mpi_err_buffer(buf);					\
+    mpi_file_errors_return(&file, &ifail, "%s: invalid buffer", func);	\
     if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-#define ERR_IF_COMM_MPI_ERR_TAG(comm, tag, func)			\
+#define ERR_IF_FILE_MPI_ERR_COUNT(file, count, fn)			\
   {									\
-    ifail = comm_mpi_err_tag_handler(comm, tag, func);			\
+    ifail = mpi_err_count(count);					\
+    mpi_file_errors_return(&file, &ifail, "%s: invalid count", fn);	\
     if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-#define ERR_IF_COMM_MPI_ERR_ARG(comm, condition, func)			\
+#define ERR_IF_FILE_MPI_ERR_DATATYPE(file, datatype, func)		\
   {									\
-    if ((condition)) {							\
-      ifail = comm_mpi_err_arg_handler(comm, #condition, func);		\
-      if (ifail != MPI_SUCCESS) goto err;				\
-    }									\
+    ifail = mpi_err_datatype(datatype);					\
+    mpi_file_errors_return(&file, &ifail, "%s: invalid datatype", func);\
+    if (ifail != MPI_SUCCESS) goto err;					\
   }
 
-/* MPI_File error handlers */
-
-static int file_mpi_err_amode_handler(MPI_File file, int amode,
-				      const char * func) {
-
-  int ifail = MPI_SUCCESS;
-
-  return ifail;
-}
-
-static int file_mpi_err_info_handler(MPI_File fh, MPI_Info info,
-				     const char * func) {
-  int ifail = MPI_SUCCESS;
-  MPI_Comm comm = MPI_COMM_SELF;
-
-  if (fh) comm = MPI_COMM_WORLD; /* FIXME: fh->comm */
-  ifail = comm_mpi_err_info_handler(comm, info, func);
-
-  return ifail;
-}
-
-static int file_mpi_err_buffer_handler(MPI_File fh, const void * buf,
-				       const char * func) {
-  int ifail = MPI_SUCCESS;
-  MPI_Comm comm = MPI_COMM_SELF;
-
-  if (fh) comm = MPI_COMM_WORLD; /* FIXME fh->comm */
-  ifail = comm_mpi_err_buffer_handler(comm, buf, func);
+/*****************************************************************************
+ *
+ *  MPI_Error_string
+ *
+ *****************************************************************************/
 
-  return ifail;
-}
+int MPI_Error_string(int errorcode, char * string, int * resultlen) {
 
-static int file_mpi_err_datatype_handler(MPI_File fh, MPI_Datatype dt,
-					 const char * func) {
   int ifail = MPI_SUCCESS;
-  MPI_Comm comm = MPI_COMM_SELF;
+  const char * msg = NULL;
 
-  if (fh) comm = MPI_COMM_WORLD; /* fh->comm */
-  ifail = comm_mpi_err_datatype_handler(comm, dt, func);
-
-  return ifail;
-}
+  /* May be called before MPI_Init() */
+  /* "errcode" is code or class ...  */
 
-#define ERR_IF_FILE_MPI_ERR_COMM(file, condition, func)			\
-  {									\
-    if ((condition)) { /* FIXME map from file to communicator */	\
-      MPI_Comm comm = MPI_COMM_SELF;					\
-      if (file != MPI_FILE_NULL) comm = MPI_COMM_WORLD;			\
-      ifail = comm_mpi_err_comm_handler(comm, func);			\
-      if (ifail != MPI_SUCCESS) goto err;				\
-    }									\
-  }
-
-#define ERR_IF_FILE_MPI_ERR_ARG(file, condition, func)		\
-  {								\
-    if ((condition)) {						\
-      MPI_Comm comm = MPI_COMM_SELF;				\
-      if (file != MPI_FILE_NULL) comm = MPI_COMM_WORLD;		\
-      ifail = comm_mpi_err_arg_handler(comm, #condition, func);	\
-      if (ifail != MPI_SUCCESS) goto err;			\
-    }								\
-  }
-
-#define ERR_IF_FILE_MPI_ERR_AMODE(file, amode, func) {		\
-    ifail = file_mpi_err_amode_handler(file, amode, func);	\
-    if (ifail != MPI_SUCCESS) goto err;				\
-  }
-
-#define ERR_IF_FILE_MPI_ERR_INFO(file, info, func) {		\
-    ifail = file_mpi_err_info_handler(file, info, func);	\
-    if (ifail != MPI_SUCCESS) goto err;				\
-  }
-
-#define ERR_IF_FILE_MPI_ERR_FILE(fh, func)				\
-  {									\
-    ifail = mpi_file_handle_invalid(fh);				\
-    if (ifail != MPI_SUCCESS) {						\
-      MPI_Comm comm = MPI_COMM_SELF;					\
-      mpi_comm_set_error_string(comm, "%s(): invalid file handle", func); \
-      goto err;								\
-    }									\
+  switch (errorcode) {
+  case MPI_SUCCESS:
+    msg = "MPI_SUCCESS: success";
+    break;
+  case MPI_ERR_ACCESS:
+    msg = "MPI_ERR_ACCESS: permission denied";
+    break;
+  case MPI_ERR_AMODE:
+    msg = "MPI_ERR_AMODE: invalid mode argument";
+    break;
+  case MPI_ERR_ARG:
+    msg = "MPI_ERR_AMODE: invalid argument of naother kind";
+    break;
+  case MPI_ERR_BUFFER:
+    msg = "MPI_ERR_BUFFER: invalid buffer pointer argument";
+    break;
+  case MPI_ERR_COMM:
+    msg = "MPI_ERR_COMM: invalid communicator argument";
+    break;
+  case MPI_ERR_COUNT:
+    msg = "MPI_ERR_COUNT: invalid count argument";
+    break;
+  case MPI_ERR_DATATYPE:
+    msg = "MPI_ERR_DATATYPE: invalid datatype";
+    break;
+  case MPI_ERR_INFO:
+    msg = "MPI_ERR_INFO: invalid info argument";
+    break;
+  case MPI_ERR_ERRHANDLER:
+    msg = "MPI_ERR_ERRHANDLER: invalid error handler";
+    break;
+  case MPI_ERR_INTERN:
+    msg = "MPI_ERR_INTERN: internal (implementation) error";
+    break;
+  case MPI_ERR_IO:
+    msg = "MPI_ERR_IO: other i/o error";
+    break;
+  case MPI_ERR_FILE:
+    msg = "MPI_ERR_FILE: invalid file handle";
+    break;
+  case MPI_ERR_NO_SUCH_FILE:
+    msg = "MPI_ERR_NO_SUCH_FILE: file does not exist";
+    break;
+  case MPI_ERR_OP:
+    msg = "MPI_ERR_OP: invalid operation";
+    break;
+  case MPI_ERR_RANK:
+    msg = "MPI_ERR_RANK: invalid rank";
+    break;
+  case MPI_ERR_ROOT:
+    msg = "MPI_ERR_ROOT: invalid root argument";
+    break;
+  case MPI_ERR_TAG:
+    msg = "MPI_ERR_TAG: invalid tag";
+    break;
+  case MPI_ERR_LASTCODE:
+    msg = "MPI_ERR_LASTCODE: last error message code";
+    break;
+  default:
+    /* We say an unrecognised code is an unknown error ... */
+    msg = "MPI_ERR_UNKNOWN: unknown error";
   }
 
-#define ERR_IF_FILE_MPI_ERR_BUFFER(fh, buf, func)	\
-  {							\
-    ifail = file_mpi_err_buffer_handler(fh, buf, func);	\
-    if (ifail != MPI_SUCCESS) goto err;			\
+  if (string) {
+    strncpy(string, msg, MPI_MAX_ERROR_STRING);
+    if (resultlen) *resultlen = strlen(msg);
   }
 
-#define ERR_IF_FILE_MPI_ERR_DATATYPE(fh, datatype, func)	\
-  {								\
-    ifail = file_mpi_err_datatype_handler(fh, datatype, func);	\
-    if (ifail != MPI_SUCCESS) goto err;				\
-  }
+  return ifail;
+}
 
 /*****************************************************************************
  *
@@ -461,7 +413,7 @@ int MPI_Bcast(void * buffer, int count, MPI_Datatype datatype, int root,
 	      MPI_Comm comm) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Bcast";
+  const char * fname = "MPI_Bcast()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -470,7 +422,7 @@ int MPI_Bcast(void * buffer, int count, MPI_Datatype datatype, int root,
   ERR_IF_COMM_MPI_ERR_DATATYPE(comm, datatype, fname);
   ERR_IF_COMM_MPI_ERR_ROOT(comm, root, fname);
 
-  assert(mpi_info_->initialised);
+  /* no operation required */
 
  err:
   return ifail;
@@ -484,8 +436,12 @@ int MPI_Bcast(void * buffer, int count, MPI_Datatype datatype, int root,
 
 int MPI_Init(int * argc, char *** argv) {
 
-  assert(argc);
-  assert(argv);
+  int ifail = MPI_SUCCESS;
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Init()";
+
+  ERR_IF_COMM_MPI_ERR_ARG(self, argc == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, argv == NULL, fname);
 
   mpi_info_ = (mpi_info_t *) calloc(1, sizeof(mpi_info_t));
   assert(mpi_info_);
@@ -512,9 +468,11 @@ int MPI_Init(int * argc, char *** argv) {
     mpi_info_->filelist[ih].etype = MPI_BYTE;
     mpi_info_->filelist[ih].filetype = MPI_BYTE;
     strncpy(mpi_info_->filelist[ih].datarep, "native", MPI_MAX_DATAREP_STRING);
+    strncpy(mpi_info_->filelist[ih].errorstr, "", MPI_MAX_ERROR_STRING);
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -525,19 +483,31 @@ int MPI_Init(int * argc, char *** argv) {
 
 int MPI_Init_thread(int * argc, char *** argv, int required, int * provided) {
 
-  assert(argc);
-  assert(argv);
-  assert(MPI_THREAD_SINGLE <= required && required <= MPI_THREAD_MULTIPLE);
-  assert(provided);
+  int ifail = MPI_SUCCESS;
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Init_thread()";
+
+  ERR_IF_COMM_MPI_ERR_ARG(self, argc == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, argv == NULL, fname);
+  /* required; see below */
+  ERR_IF_COMM_MPI_ERR_ARG(self, provided == NULL, fname);
 
   MPI_Init(argc, argv);
 
-  /* We are going to say that MPI_THREAD_SERIALIZED is available */
-  /* Not MPI_THREAD_MULTIPLE */
+  /* MPI_THREAD_MULTIPLE is not available */
 
-  *provided = MPI_THREAD_SERIALIZED;
+  if (MPI_THREAD_SINGLE <= required && required <= MPI_THREAD_MULTIPLE) {
+    *provided = required;
+    if (required == MPI_THREAD_MULTIPLE) *provided = MPI_THREAD_SERIALIZED;
+  }
+  else {
+    ifail = MPI_ERR_ARG;
+    mpi_comm_errors_are_fatal(&self, &ifail,
+			      "%s: required level unrecognised", fname);
+  }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -548,10 +518,15 @@ int MPI_Init_thread(int * argc, char *** argv, int required, int * provided) {
 
 int MPI_Initialized(int * flag) {
 
-  assert(flag);
+  int ifail = MPI_SUCCESS;
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Initialised()";
+
+  ERR_IF_COMM_MPI_ERR_ARG(self, flag == NULL, fname);
 
   *flag = (mpi_info_ != NULL); /* A sufficient condition */
 
+ err:
   return MPI_SUCCESS;
 }
 
@@ -588,7 +563,7 @@ int MPI_Comm_group(MPI_Comm comm, MPI_Group * group) {
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
-  assert(group);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, group == NULL, fname);
 
   *group = 0;
 
@@ -609,7 +584,7 @@ int MPI_Comm_rank(MPI_Comm comm, int * rank) {
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
-  assert(rank);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, rank == NULL, fname);
 
   *rank = 0;
 
@@ -630,7 +605,7 @@ int MPI_Comm_size(MPI_Comm comm, int * size) {
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
-  assert(size);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, size == NULL, fname);
 
   *size = 1;
 
@@ -651,18 +626,19 @@ int MPI_Comm_size(MPI_Comm comm, int * size) {
 int MPI_Comm_compare(MPI_Comm comm1, MPI_Comm comm2, int * result) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Comm_compare";
+  MPI_Comm comm = MPI_COMM_SELF;
+  const char * fname = "MPI_Comm_compare()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-
-  assert(result); /* FIXME what is correct behaviour? e.g., invalid comm */
+  ERR_IF_COMM_MPI_ERR_ARG(comm, result == NULL, fname)
 
   *result = MPI_UNEQUAL;
-  if (mpi_is_valid_comm(comm1) && mpi_is_valid_comm(comm2)) {
+  if (!mpi_err_comm(comm1) && !mpi_err_comm(comm2)) {
     *result = MPI_CONGRUENT;
     if (comm1 == comm2) *result = MPI_IDENT;
   }
 
+ err:
   return ifail;
 }
 
@@ -674,12 +650,16 @@ int MPI_Comm_compare(MPI_Comm comm1, MPI_Comm comm2, int * result) {
 
 int MPI_Abort(MPI_Comm comm, int code) {
 
-  int is_valid;
+  int ifail = MPI_SUCCESS;
+  const char * fname = "MPI_Abort()";
 
-  is_valid = 1 - mpi_is_valid_comm(comm);
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
 
-  exit(code + is_valid);
-  return MPI_SUCCESS; /* ha! */
+ err:
+  exit(code);
+
+  return ifail;
 }
 
 /*****************************************************************************
@@ -743,7 +723,7 @@ int MPI_Recv(void * buf, int count, MPI_Datatype datatype, int source,
 	     int tag, MPI_Comm comm, MPI_Status * status) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Recv";
+  const char * fname = "MPI_Recv()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -752,10 +732,9 @@ int MPI_Recv(void * buf, int count, MPI_Datatype datatype, int source,
   ERR_IF_COMM_MPI_ERR_DATATYPE(comm, datatype, fname);
   ERR_IF_COMM_MPI_ERR_RANK(comm, source, fname);
   ERR_IF_COMM_MPI_ERR_TAG(comm, tag, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(comm, status == NULL, fname);
 
-  printf("MPI_Recv should not be called in serial.\n");
-  exit(0);
+  if (status != MPI_STATUS_IGNORE) status->MPI_ERROR = MPI_ERR_INTERN;
+  mpi_comm_errors_are_fatal(&comm, &ifail, "%s: cannot call in serial", fname);
 
  err:
   return ifail;
@@ -771,7 +750,7 @@ int MPI_Irecv(void * buf, int count, MPI_Datatype datatype, int source,
 	     int tag, MPI_Comm comm, MPI_Request * request) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Irecv";
+  const char * fname = "MPI_Irecv()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -799,7 +778,7 @@ int MPI_Ssend(void * buf, int count, MPI_Datatype datatype, int dest,
 	      int tag, MPI_Comm comm) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Ssend";
+  const char * fname = "MPI_Ssend()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -826,7 +805,7 @@ int MPI_Isend(void * buf, int count, MPI_Datatype datatype, int dest,
 	      int tag, MPI_Comm comm, MPI_Request * request) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Isend";
+  const char * fname = "MPI_Isend()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -853,7 +832,7 @@ int MPI_Issend(void * buf, int count, MPI_Datatype datatype, int dest,
 	       int tag, MPI_Comm comm, MPI_Request * request) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Issend";
+  const char * fname = "MPI_Issend()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -880,12 +859,16 @@ int MPI_Issend(void * buf, int count, MPI_Datatype datatype, int dest,
 int MPI_Waitall(int count, MPI_Request * requests, MPI_Status * statuses) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Waitall";
+  MPI_Comm comm = MPI_COMM_SELF;
+  const char * fname = "MPI_Waitall()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_COUNT(MPI_COMM_SELF, count, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, requests == NULL, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, statuses == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, count, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, requests == NULL, fname);
+
+  if (statuses != MPI_STATUSES_IGNORE) {
+    statuses[0].MPI_ERROR = MPI_SUCCESS;
+  }
 
  err:
   return ifail;
@@ -902,12 +885,12 @@ int MPI_Waitany(int count, MPI_Request requests[], int * index,
 		MPI_Status * status) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Waitany";
+  MPI_Comm comm = MPI_COMM_SELF;
+  const char * fname = "MPI_Waitany()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_COUNT(MPI_COMM_SELF, count, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, index == NULL, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, status == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(comm, count, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, index == NULL, fname);
 
   *index = MPI_UNDEFINED;
 
@@ -915,7 +898,7 @@ int MPI_Waitany(int count, MPI_Request requests[], int * index,
     if (requests[ireq] != MPI_REQUEST_NULL) {
       *index = ireq;
       requests[ireq] = MPI_REQUEST_NULL;
-      if (status) {
+      if (status != MPI_STATUS_IGNORE) {
 	status->MPI_SOURCE = 0;
 	status->MPI_TAG = requests[ireq];
       }
@@ -936,16 +919,16 @@ int MPI_Waitany(int count, MPI_Request requests[], int * index,
 int MPI_Probe(int source, int tag, MPI_Comm comm, MPI_Status * status) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Probe";
+  const char * fname = "MPI_Probe()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
   ERR_IF_COMM_MPI_ERR_RANK(comm, source, fname);
   ERR_IF_COMM_MPI_ERR_TAG(comm, tag, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(comm, status == NULL, fname);
 
-  printf("MPI_Probe should not be called in serial\n");
-  exit(0);
+  ifail = MPI_ERR_INTERN;
+  if (status != MPI_STATUS_IGNORE) status->MPI_ERROR = MPI_ERR_INTERN;
+  mpi_comm_errors_are_fatal(&comm, &ifail, "%s: invalid serial call", fname);
 
  err:
   return ifail;
@@ -963,7 +946,7 @@ int MPI_Sendrecv(void * sendbuf, int sendcount, MPI_Datatype sendtype,
 		 MPI_Comm comm, MPI_Status * status) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_SendRecv";
+  const char * fname = "MPI_SendRecv()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -978,12 +961,11 @@ int MPI_Sendrecv(void * sendbuf, int sendcount, MPI_Datatype sendtype,
   ERR_IF_COMM_MPI_ERR_DATATYPE(comm, recvtype, fname);
   ERR_IF_COMM_MPI_ERR_RANK(comm, source, fname);
   ERR_IF_COMM_MPI_ERR_TAG(comm, recvtag, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(comm, status == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, recvcount != sendcount, fname);
 
-  assert(recvcount == sendcount);
-
-  printf("MPI_Sendrecv should not be called in serial\n");
-  exit(0);
+  ifail = MPI_ERR_INTERN;
+  if (status != MPI_STATUS_IGNORE) status->MPI_ERROR = MPI_ERR_INTERN;
+  mpi_comm_errors_are_fatal(&comm, &ifail, "%s: invalid serial call", fname);
 
  err:
   return ifail;
@@ -999,7 +981,7 @@ int MPI_Reduce(void * sendbuf, void * recvbuf, int count, MPI_Datatype type,
 	       MPI_Op op, int root, MPI_Comm comm) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Reduce";
+  const char * fname = "MPI_Reduce()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -1028,7 +1010,7 @@ int MPI_Allgather(void * sendbuf, int sendcount, MPI_Datatype sendtype,
 		  MPI_Comm comm) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Allgather";
+  const char * fname = "MPI_Allgather()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -1039,8 +1021,8 @@ int MPI_Allgather(void * sendbuf, int sendcount, MPI_Datatype sendtype,
   ERR_IF_COMM_MPI_ERR_COUNT(comm, recvcount, fname);
   ERR_IF_COMM_MPI_ERR_DATATYPE(comm, recvtype, fname);
 
-  assert(sendcount == recvcount);
-  assert(sendtype == recvtype);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, sendcount != recvcount, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, sendtype != recvtype, fname);
 
   mpi_copy(sendbuf, recvbuf, sendcount, sendtype);
 
@@ -1059,7 +1041,7 @@ int MPI_Gather(void * sendbuf, int sendcount, MPI_Datatype sendtype,
 	       int root, MPI_Comm comm) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Gather";
+  const char * fname = "MPI_Gather()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -1071,8 +1053,8 @@ int MPI_Gather(void * sendbuf, int sendcount, MPI_Datatype sendtype,
   ERR_IF_COMM_MPI_ERR_DATATYPE(comm, recvtype, fname);
   ERR_IF_COMM_MPI_ERR_ROOT(comm, root, fname);
 
-  assert(sendcount == recvcount);
-  assert(sendtype == recvtype);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, sendcount != recvcount, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, sendtype != recvtype, fname);
 
   mpi_copy(sendbuf, recvbuf, sendcount, sendtype);
 
@@ -1093,7 +1075,7 @@ int MPI_Gatherv(const void * sendbuf, int sendcount, MPI_Datatype sendtype,
 		MPI_Datatype recvtype, int root, MPI_Comm comm) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Gatherv";
+  const char * fname = "MPI_Gatherv()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -1105,8 +1087,8 @@ int MPI_Gatherv(const void * sendbuf, int sendcount, MPI_Datatype sendtype,
   ERR_IF_COMM_MPI_ERR_DATATYPE(comm, recvtype, fname);
   ERR_IF_COMM_MPI_ERR_ROOT(comm, root, fname);
 
-  assert(sendtype == recvtype);
-  assert(sendcount == recvcounts[0]);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, sendtype != recvtype, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(comm, sendcount != recvcounts[0], fname);
 
   mpi_copy((void *) sendbuf, recvbuf, sendcount, sendtype);
 
@@ -1124,7 +1106,7 @@ int MPI_Allreduce(void * sendbuf, void * recvbuf, int count, MPI_Datatype type,
 		  MPI_Op op, MPI_Comm comm) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Allreduce";
+  const char * fname = "MPI_Allreduce()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -1156,7 +1138,7 @@ int MPI_Allreduce(void * sendbuf, void * recvbuf, int count, MPI_Datatype type,
 int MPI_Comm_split(MPI_Comm comm, int colour, int key, MPI_Comm * newcomm) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Comm_split";
+  const char * fname = "MPI_Comm_split()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -1166,6 +1148,7 @@ int MPI_Comm_split(MPI_Comm comm, int colour, int key, MPI_Comm * newcomm) {
   /* Allow that a split Cartesian communicator is different */
   /* See MPI_Comm_compare() */
 
+  mpi_info_->key = key;
   *newcomm = MPI_COMM_WORLD;
 
  err:
@@ -1182,15 +1165,16 @@ int MPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info,
 			MPI_Comm * newcomm) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Comm_split_type";
+  const char * fname = "MPI_Comm_split_type()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
   ERR_IF_COMM_MPI_ERR_ARG(comm, split_type != MPI_COMM_TYPE_SHARED, fname);
-  /* FIXME key controls rank assignment */
+  /* key controls rank assignment; no constraints */
   ERR_IF_COMM_MPI_ERR_INFO(comm, info, fname);
   ERR_IF_COMM_MPI_ERR_ARG(comm, newcomm == NULL, fname);
 
+  mpi_info_->key = key;
   *newcomm = comm;
 
  err:
@@ -1206,10 +1190,11 @@ int MPI_Comm_split_type(MPI_Comm comm, int split_type, int key, MPI_Info info,
 int MPI_Comm_free(MPI_Comm * comm) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Comm_free";
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Comm_free()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, comm == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, comm == NULL, fname);
   ERR_IF_COMM_MPI_ERR_COMM(*comm, fname);
 
   /* Mark Cartesian communicators as free */
@@ -1233,7 +1218,7 @@ int MPI_Comm_free(MPI_Comm * comm) {
 int MPI_Comm_dup(MPI_Comm oldcomm, MPI_Comm * newcomm) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Comm_dup";
+  const char * fname = "MPI_Comm_dup()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(oldcomm, fname);
@@ -1256,14 +1241,15 @@ int MPI_Type_indexed(int count, int * array_of_blocklengths,
 		     MPI_Datatype * newtype) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Type_indexed";
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Type_indexed()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_COUNT(MPI_COMM_SELF, count, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, array_of_blocklengths == NULL, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, array_of_displacements == NULL, fname);
-  ERR_IF_COMM_MPI_ERR_DATATYPE(MPI_COMM_SELF, oldtype, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, newtype == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(self, count, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, array_of_blocklengths == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, array_of_displacements == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(self, oldtype, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, newtype == NULL, fname);
 
   {
     data_t dt = {0};
@@ -1289,12 +1275,13 @@ int MPI_Type_indexed(int count, int * array_of_blocklengths,
 int MPI_Type_contiguous(int count, MPI_Datatype old, MPI_Datatype * newtype) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Type_contiguous";
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Type_contiguous()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_COUNT(MPI_COMM_SELF, count, fname);
-  ERR_IF_COMM_MPI_ERR_DATATYPE(MPI_COMM_SELF, old, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, newtype == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(self, count, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(self, old, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, newtype == NULL, fname);
 
   {
     data_t dt = {0};
@@ -1320,22 +1307,24 @@ int MPI_Type_contiguous(int count, MPI_Datatype old, MPI_Datatype * newtype) {
 int MPI_Type_commit(MPI_Datatype * type) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Type_commit";
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Type_commit()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, type == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, type == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, *type == MPI_DATATYPE_NULL, fname);
 
   {
     int handle = *type;
 
     if (handle < 0) {
-      printf("MPI_Type_commit: Attempt to commit intrinsic type\n");
-    }
-    if (handle == 0) {
-      printf("MPI_Type_commit: Attempt to commit null data type\n");
+      ifail = MPI_ERR_ARG;
+      mpi_comm_errors_are_fatal(&self, &ifail, "%s: intrinsic type!", fname);
     }
+
     if (handle > mpi_info_->ndatatypelast) {
-      printf("MPI_Type_commit: unrecognised handle %d\n", handle);
+      ifail = MPI_ERR_DATATYPE;
+      mpi_comm_errors_are_fatal(&self, &ifail, "unrecognised datatype", fname);
     }
 
     assert(mpi_info_->dt[handle].handle == handle);
@@ -1355,12 +1344,11 @@ int MPI_Type_commit(MPI_Datatype * type) {
 int MPI_Type_free(MPI_Datatype * type) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Type_free";
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Type_free()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, type == NULL, fname);
-
-  assert(type); /* FIXME. An error. */
+  ERR_IF_COMM_MPI_ERR_ARG(self, type == NULL, fname);
 
   mpi_data_type_free(mpi_info_, type);
   assert(*type == MPI_DATATYPE_NULL);
@@ -1379,20 +1367,21 @@ int MPI_Type_vector(int count, int blocklength, int stride,
 		    MPI_Datatype oldtype, MPI_Datatype * newtype) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Type_vector";
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Type_vector()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_COUNT(MPI_COMM_SELF, count, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, blocklength < 0, fname);
-  ERR_IF_COMM_MPI_ERR_DATATYPE(MPI_COMM_SELF, oldtype, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, newtype == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(self, count, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, blocklength < 0, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(self, oldtype, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, newtype == NULL, fname);
 
   {
     data_t dt = {0};
 
-    dt.handle = MPI_DATATYPE_NULL;
-    dt.bytes  = 0;
-    dt.commit = 0;
+    dt.handle  = MPI_DATATYPE_NULL;
+    dt.bytes   = count*blocklength*mpi_sizeof(oldtype);
+    dt.commit  = 0;
     dt.flavour = DT_NOT_IMPLEMENTED; /* Can't do strided copy */
     dt.stride  = stride;
 
@@ -1417,24 +1406,34 @@ int MPI_Cart_create(MPI_Comm oldcomm, int ndims, int * dims, int * periods,
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(oldcomm, fname);
+  /* standard doesn't put any constraint on ndims */
   ERR_IF_COMM_MPI_ERR_ARG(oldcomm, dims == NULL, fname);
   ERR_IF_COMM_MPI_ERR_ARG(oldcomm, periods == NULL, fname);
+  /* reorder is logical; no constraints */
   ERR_IF_COMM_MPI_ERR_ARG(oldcomm, newcomm == NULL, fname);
 
-  assert(ndims <= 3);
-
-  int icart; /* FIXME */
-
   mpi_info_->ncart += 1;
-  icart = MPI_COMM_SELF + mpi_info_->ncart;
-  assert(icart < MAX_CART_COMM);
 
-  *newcomm = icart;
+  {
+    /* Only Cartesian comms have handles above MPI_COM_SELF */
+    /* See also MPI_Comm_free() */
+    int icart = MPI_COMM_SELF + mpi_info_->ncart;
+
+    if (icart >= MAX_CART_COMM) {
+      ifail = MPI_ERR_INTERN;
+      mpi_comm_errors_are_fatal(&oldcomm, &ifail,
+				"MPI_Cart_create(): out of handles");
+      goto err;
+    }
+
+    /* Record periodity, reorder */
 
-  /* Record periodity */
+    for (int n = 0; n < ndims; n++) {
+      mpi_info_->period[icart][n] = periods[n];
+    }
+    mpi_info_->reorder[icart] = reorder;
 
-  for (int n = 0; n < ndims; n++) {
-    mpi_info_->period[icart][n] = periods[n];
+    *newcomm = icart;
   }
 
  err:
@@ -1453,7 +1452,7 @@ int MPI_Cart_get(MPI_Comm comm, int maxdims, int * dims, int * periods,
 		 int * coords) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Cart_get";
+  const char * fname = "MPI_Cart_get()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -1480,7 +1479,7 @@ int MPI_Cart_get(MPI_Comm comm, int maxdims, int * dims, int * periods,
 int MPI_Cart_coords(MPI_Comm comm, int rank, int maxdims, int * coords) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Cart_coords";
+  const char * fname = "MPI_Cart_coords()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -1506,7 +1505,7 @@ int MPI_Cart_coords(MPI_Comm comm, int rank, int maxdims, int * coords) {
 int MPI_Cart_rank(MPI_Comm comm, int * coords, int * rank) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Cart_rank";
+  const char * fname = "MPI_Cart_rank()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -1529,7 +1528,7 @@ int MPI_Cart_shift(MPI_Comm comm, int direction, int disp, int * rank_source,
 		   int * rank_dest) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Cart_shift";
+  const char * fname = "MPI_Cart_shift()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -1558,7 +1557,7 @@ int MPI_Cart_shift(MPI_Comm comm, int direction, int disp, int * rank_source,
 int MPI_Cart_sub(MPI_Comm comm, int * remain_dims, MPI_Comm * new_comm) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Cart_sub";
+  const char * fname = "MPI_Cart_sub()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
@@ -1580,13 +1579,12 @@ int MPI_Cart_sub(MPI_Comm comm, int * remain_dims, MPI_Comm * new_comm) {
 int MPI_Dims_create(int nnodes, int ndims, int * dims) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Dims_create";
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Dims_create()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, dims == NULL, fname);
-
-  assert(nnodes == 1); /* FIXME */
-  assert(ndims > 0);
+  ERR_IF_COMM_MPI_ERR_ARG(self, nnodes != 1, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, dims == NULL, fname);
 
   for (int d = 0; d < ndims; d++) {
     dims[d] = 1;
@@ -1605,14 +1603,16 @@ int MPI_Dims_create(int nnodes, int ndims, int * dims) {
 int MPI_Op_create(MPI_User_function * function, int commute, MPI_Op * op) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Op_create";
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Op_create()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, function == NULL, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, op == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, function == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, op == NULL, fname);
 
   /* commute is logical */
 
+  mpi_info_->commute = commute;
   *op = MPI_SUM;
 
  err:
@@ -1628,10 +1628,11 @@ int MPI_Op_create(MPI_User_function * function, int commute, MPI_Op * op) {
 int MPI_Op_free(MPI_Op * op) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Op_free";
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Op_free()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, op == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, op == NULL, fname);
 
   *op = MPI_OP_NULL;
 
@@ -1672,7 +1673,9 @@ static void mpi_copy(void * send, void * recv, int count, MPI_Datatype type) {
 
 static int mpi_sizeof(MPI_Datatype type) {
 
-  int size = -1;  /* Return -1 for unrecognised or invalid type */
+  int size = -1;
+
+  assert(mpi_datatype_intrinsic(type) || mpi_datatype_user(type));
 
   switch (type) {
   case MPI_CHAR:
@@ -1719,10 +1722,10 @@ static int mpi_sizeof(MPI_Datatype type) {
     size = sizeof(int64_t);
     break;
   case MPI_PACKED:
-    printf("MPI_PACKED not implemented\n");
+    /* Not implementend */
     break;
   default:
-    /* Try user type */
+    /* ... user type */
     size = mpi_sizeof_user(type);
   }
 
@@ -1764,12 +1767,13 @@ static int mpi_sizeof_user(MPI_Datatype handle) {
 int MPI_Comm_set_errhandler(MPI_Comm comm, MPI_Errhandler errhandler) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Comm_set_errhandler";
+  const char * fname = "MPI_Comm_set_errhandler()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(comm, fname);
+  ERR_IF_COMM_MPI_ERR_ERRHANDLER(comm, errhandler, fname);
 
-  assert(errhandler == MPI_ERRORS_ARE_FATAL); /* FIXME */
+  /* Only errhandler == MPI_ERRORS_ARE_FATAL available in comm */
 
  err:
   return ifail;
@@ -1818,11 +1822,12 @@ int MPI_Comm_set_errhandler(MPI_Comm comm, MPI_Errhandler errhandler) {
 int MPI_Get_address(const void * location, MPI_Aint * address) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Get_address";
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Get_address()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, location == NULL, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, address  == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, location == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, address  == NULL, fname);
 
   *address = (MPI_Aint) location;
 
@@ -1840,13 +1845,13 @@ int MPI_Group_translate_ranks(MPI_Group grp1, int n, const int * ranks1,
 			      MPI_Group grp2, int * ranks2) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Group_translate_ranks";
+  const char * fname = "MPI_Group_translate_ranks()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_COMM_MPI_ERR_COMM(grp1, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, ranks1 == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(grp1, ranks1 == NULL, fname);
   ERR_IF_COMM_MPI_ERR_COMM(grp2, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, ranks2 == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(grp2, ranks2 == NULL, fname);
 
   memcpy(ranks2, ranks1, n*sizeof(int));
 
@@ -1864,11 +1869,12 @@ int MPI_Type_create_resized(MPI_Datatype oldtype, MPI_Aint lb, MPI_Aint extent,
 			    MPI_Datatype * newtype) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Type_create_resized";
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Type_create_resized()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_DATATYPE(MPI_COMM_SELF, oldtype, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, newtype == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(self, oldtype, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, newtype == NULL, fname);
 
   {
     data_t dt = {0};
@@ -1898,13 +1904,16 @@ int MPI_Type_create_struct(int count, int array_of_blocklengths[],
 			   const MPI_Aint array_of_displacements[],
 			   const MPI_Datatype array_of_types[],
 			   MPI_Datatype * newtype) {
+  int ifail = MPI_SUCCESS;
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Type_create_struct()";
 
-  assert(count > 0);
-  assert(array_of_blocklengths);
-  assert(array_of_displacements);
-  assert(array_of_types);
-  assert(newtype);
-
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_COUNT(self, count, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, array_of_blocklengths == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, array_of_displacements == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, array_of_types == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, newtype == NULL, fname);
 
   {
     data_t dt = {0};
@@ -1922,7 +1931,8 @@ int MPI_Type_create_struct(int count, int array_of_blocklengths[],
     mpi_data_type_add(mpi_info_, &dt, newtype);
   }
 
-  return MPI_SUCCESS;
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -1935,29 +1945,19 @@ int MPI_Type_get_extent(MPI_Datatype datatype, MPI_Aint * lb,
 			MPI_Aint * extent) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Type_get_extent";
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Type_get_extent()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_DATATYPE(MPI_COMM_SELF, datatype, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, lb == NULL, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, extent == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(self, datatype, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, lb == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, extent == NULL, fname);
 
-  if (datatype < 0) {
-    /* Intrinsic */
-    *lb = 0;
-    *extent = mpi_sizeof(datatype);
-  }
-  else {
-
-    int handle = mpi_data_type_handle(mpi_info_, datatype);
-
-    if (handle == MPI_DATATYPE_NULL) {
-      printf("MPI_Type_get_extent: null handle\n");
-    }
+  /* Special case MPI_PACKED not implemented ... */
+  ERR_IF_COMM_MPI_ERR_ARG(self, datatype == MPI_PACKED, fname);
 
-    *lb = 0; /* Always, at the moment */
-    *extent = mpi_info_->dt[handle].bytes;
-  }
+  *lb = 0;
+  *extent = mpi_sizeof(datatype);
 
  err:
   return ifail;
@@ -1972,11 +1972,15 @@ int MPI_Type_get_extent(MPI_Datatype datatype, MPI_Aint * lb,
 int MPI_Type_size(MPI_Datatype datatype, int * sz) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_Type_size";
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Type_size()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_COMM_MPI_ERR_DATATYPE(MPI_COMM_SELF, datatype, fname);
-  ERR_IF_COMM_MPI_ERR_ARG(MPI_COMM_SELF, sz == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(self, datatype, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, sz == NULL, fname);
+
+  /* Special case MPI_PACKED not implemented ... */
+  ERR_IF_COMM_MPI_ERR_ARG(self, datatype == MPI_PACKED, fname);
 
   *sz = mpi_sizeof(datatype);
 
@@ -1994,7 +1998,8 @@ int MPI_File_open(MPI_Comm comm, const char * filename, int amode,
 		  MPI_Info info, MPI_File * fh) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_File_open";
+  MPI_File file = MPI_FILE_NULL;
+  const char * fname = "MPI_File_open()";
 
   FILE * fp = NULL;
   const char * fdmode = NULL;
@@ -2002,11 +2007,11 @@ int MPI_File_open(MPI_Comm comm, const char * filename, int amode,
   /* Default file error handler responsible ... */
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_FILE_MPI_ERR_COMM(MPI_FILE_NULL, comm, fname);
-  ERR_IF_FILE_MPI_ERR_ARG(MPI_FILE_NULL, filename == NULL, fname);
-  ERR_IF_FILE_MPI_ERR_AMODE(MPI_FILE_NULL, amode, fname);
-  ERR_IF_FILE_MPI_ERR_INFO(MPI_FILE_NULL, info, fname);
-  ERR_IF_FILE_MPI_ERR_ARG(MPI_FILE_NULL, fh == NULL, fname);
+  ERR_IF_FILE_MPI_ERR_COMM(file, comm, fname);
+  ERR_IF_FILE_MPI_ERR_ARG(file, filename == NULL, fname);
+  ERR_IF_FILE_MPI_ERR_AMODE(file, amode, fname);
+  ERR_IF_FILE_MPI_ERR_INFO(file, info, fname);
+  ERR_IF_FILE_MPI_ERR_ARG(file, fh == NULL, fname);
 
   /* Exactly one of RDONLY, WRONLY, or RDWR must be present. */
   /* RDONLY => no CREATE or EXCL. */
@@ -2049,12 +2054,37 @@ int MPI_File_open(MPI_Comm comm, const char * filename, int amode,
   fp = fopen(filename, fdmode);
 
   if (fp == NULL) {
-    printf("MPI_File_open: attempt to open %s mode %s failed\n", filename,
-	   fdmode);
-    return MPI_ERR_NO_SUCH_FILE;
+    /* Fail in fopen() => errno is set (many possible values ...) */
+    switch (errno) {
+    case ENOENT:
+      ifail = MPI_ERR_NO_SUCH_FILE;
+      mpi_file_errors_return(&file, &ifail,
+			     "MPI_File_open(): no such file %s",
+			     filename);
+      break;
+    default:
+      ifail = MPI_ERR_IO;
+      mpi_file_errors_return(&file, &ifail, 
+			     "MPI_File_open(): failed file %s mode %s",
+			     filename, fdmode);
+    }
+    goto err;
   }
 
-  *fh = mpi_file_handle_retain(mpi_info_, fp);
+  {
+    /* Generate a new file handle */
+    file = mpi_file_handle_retain(mpi_info_, fp);
+    if (file == MPI_FILE_NULL) {
+      /* Internal error; run out of file handles */
+      ifail = MPI_ERR_INTERN;
+      mpi_file_errors_return(&file, &ifail,
+			     "MPI_File_open(): run out of handles");
+      fclose(fp);
+      goto err;
+    }
+    mpi_info_->filelist[file].comm = MPI_COMM_SELF;
+    *fh = file;
+  }
 
  err:
   return ifail;
@@ -2069,29 +2099,21 @@ int MPI_File_open(MPI_Comm comm, const char * filename, int amode,
 int MPI_File_close(MPI_File * fh) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_File_close";
-
-  FILE * fp = NULL;
+  MPI_File file = MPI_FILE_NULL;
+  const char * fname = "MPI_File_close()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  /* ERR_IF_NULL_POINTER(fh == NULL, MPI_ERR_ARG, fname); */
-  if (fh == NULL) {
-    ifail = MPI_ERR_ARG;
-    goto err;
-  }
+  ERR_IF_FILE_MPI_ERR_ARG(file, fh == NULL, fname);
   ERR_IF_FILE_MPI_ERR_FILE(*fh, fname);
 
-  fp = mpi_file_handle_release(mpi_info_, *fh);
-
-  if (fp == NULL) {
-    printf("MPI_File_close: invalid file handle\n");
-    ifail = MPI_ERR_FILE;
-  }
-  else {
+  /* File handle is now validated ... */
+  {
+    FILE * fp = mpi_file_handle_release(mpi_info_, *fh);
     fclose(fp);
-    *fh = MPI_FILE_NULL;
   }
 
+  *fh = MPI_FILE_NULL;
+
  err:
   return ifail;
 }
@@ -2100,21 +2122,36 @@ int MPI_File_close(MPI_File * fh) {
  *
  *  MPI_File_delete
  *
- *  Can return MPI_ERR_NO_SUCH_FILE, at least.
- *
  *****************************************************************************/
 
 int MPI_File_delete(const char * filename, MPI_Info info) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_File_delete";
+  MPI_File file = MPI_FILE_NULL;
+  const char * fname = "MPI_File_delete()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
-  ERR_IF_FILE_MPI_ERR_ARG(MPI_FILE_NULL, filename == NULL, fname);
-  ERR_IF_FILE_MPI_ERR_INFO(MPI_FILE_NULL, info, fname);
+  ERR_IF_FILE_MPI_ERR_ARG(file, filename == NULL, fname);
+  ERR_IF_FILE_MPI_ERR_INFO(file, info, fname);
 
   /* remove() returns 0 on success, -1 otherwise. errno is set. */
-  remove(filename);
+
+  if (remove(filename) != 0) {
+
+    /* The standard gives the following options ...
+     * MPI_ERR_NO_SUCH_FILE if the file does not exist; or
+     * MPI_ERR_FILE_IN_USE or MPI_ERR_ACCESS. We use the latter. */
+
+    switch (errno) {
+    case ENOENT:
+      ifail = MPI_ERR_NO_SUCH_FILE;
+      mpi_file_errors_return(&file, &ifail, "MPI_Delete(): no such file");
+      break;
+    default:
+      ifail = MPI_ERR_ACCESS;
+      mpi_file_errors_return(&file, &ifail, "MPI_Delete(): access error");
+    }
+  }
 
  err:
   return ifail;
@@ -2133,22 +2170,31 @@ int MPI_Type_create_subarray(int ndims, const int * array_of_sizes,
 			     MPI_Datatype oldtype,
 			     MPI_Datatype * newtype) {
 
-  int nelements = 0;
+  int ifail = MPI_SUCCESS;
+  MPI_Comm self = MPI_COMM_SELF;
+  const char * fname = "MPI_Type_create_subarray";
 
-  assert(ndims == 2 || ndims == 3); /* We accept this is not general */
-  assert(array_of_sizes);
-  assert(array_of_subsizes);
-  assert(array_of_starts);
+  ERR_IF_MPI_NOT_INITIALISED(fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, ndims <= 0, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, array_of_subsizes == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, array_of_starts == NULL, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, order != MPI_ORDER_C, fname);
+  ERR_IF_COMM_MPI_ERR_DATATYPE(self, oldtype, fname);
+  ERR_IF_COMM_MPI_ERR_ARG(self, newtype == NULL, fname);
+
+  /* Should really be ... */
   assert(order == MPI_ORDER_C || order == MPI_ORDER_FORTRAN);
-  assert(newtype);
 
   /* Assume this is a contiguous block of elements of oldtype */
-  nelements = array_of_sizes[0]*array_of_sizes[1];
-  if (ndims == 3) nelements *= array_of_sizes[2];
 
   {
+    int nelements = 1;
     data_t dt = {0};
 
+    for (int idim = 0; idim < ndims; idim++) {
+      nelements *= array_of_sizes[idim];
+    }
+
     dt.handle  = MPI_DATATYPE_NULL;
     dt.bytes   = mpi_sizeof(oldtype)*nelements;
     dt.commit  = 0;
@@ -2156,7 +2202,9 @@ int MPI_Type_create_subarray(int ndims, const int * array_of_sizes,
 
     mpi_data_type_add(mpi_info_, &dt, newtype);
   }
-  return MPI_SUCCESS;
+
+ err:
+  return ifail;
 }
 
 /*****************************************************************************
@@ -2171,22 +2219,16 @@ int MPI_File_get_view(MPI_File fh, MPI_Offset * disp, MPI_Datatype * etype,
   int ifail = MPI_SUCCESS;
   const char * fname = "MPI_File_get_view";
 
-  FILE * fp = NULL;
-
+  ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_FILE_MPI_ERR_FILE(fh, fname);
   ERR_IF_FILE_MPI_ERR_ARG(fh, disp == NULL, fname);
   ERR_IF_FILE_MPI_ERR_ARG(fh, etype == NULL, fname);
   ERR_IF_FILE_MPI_ERR_ARG(fh, filetype == NULL, fname);
   ERR_IF_FILE_MPI_ERR_ARG(fh, datarep == NULL, fname);
 
-  fp = mpi_file_handle_to_fp(mpi_info_, fh);
-
-  if (fp == NULL) {
-    printf("MPI_File_get_view: invalid file handle\n");
-    exit(0);
-  }
-  else {
+  {
     file_t * file = &mpi_info_->filelist[fh];
+
     *disp = file->disp;
     *etype = file->etype;
     *filetype = file->filetype;
@@ -2194,7 +2236,7 @@ int MPI_File_get_view(MPI_File fh, MPI_Offset * disp, MPI_Datatype * etype,
   }
 
  err:
-  return MPI_SUCCESS;
+  return ifail;
 }
 /*****************************************************************************
  *
@@ -2207,31 +2249,23 @@ int MPI_File_set_view(MPI_File fh, MPI_Offset disp, MPI_Datatype etype,
 		      MPI_Info info) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_File_set_view";
+  const char * fname = "MPI_File_set_view()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_FILE_MPI_ERR_FILE(fh, fname);
   ERR_IF_FILE_MPI_ERR_DATATYPE(fh, etype, fname);
   ERR_IF_FILE_MPI_ERR_DATATYPE(fh, filetype, fname);
-  /* FIXME "datarep" should be native */
+  ERR_IF_FILE_MPI_ERR_ARG(fh, datarep == NULL, fname);
+  ERR_IF_FILE_MPI_ERR_ARG(fh, strcmp(datarep, "native") != 0, fname);
   ERR_IF_FILE_MPI_ERR_INFO(fh, info, fname);
 
-  assert(datarep);
+  /* There is actually MPI_ERR_UNSUPPORTED_DATAREP */
 
-  FILE * fp = NULL;
-
-  fp = mpi_file_handle_to_fp(mpi_info_, fh);
-
-  if (fp == NULL) {
-    printf("MPI_File_set_view: invalid file handle\n");
-    exit(0);
-  }
-  else {
+  {
     file_t * file = &mpi_info_->filelist[fh];
     file->disp = disp;
     file->etype = etype;
     file->filetype = filetype;
-    /* Could demand "native" ... */
     strncpy(file->datarep, datarep, MPI_MAX_DATAREP_STRING-1);
     /* info is currently discarded */
   }
@@ -2250,44 +2284,34 @@ int MPI_File_read_all(MPI_File fh, void * buf, int count,
 		      MPI_Datatype datatype, MPI_Status * status) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_File_read_all";
+  const char * fname = "MPI_File_read_all()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_FILE_MPI_ERR_FILE(fh, fname);
   ERR_IF_FILE_MPI_ERR_BUFFER(fh, buf, fname);
-  /* count: integer */
+  ERR_IF_FILE_MPI_ERR_COUNT(fh, count, fname);
   ERR_IF_FILE_MPI_ERR_DATATYPE(fh, datatype, fname);
-  /* status may be MPI_STATUS_IGNORE */
-
-  FILE * fp = NULL;
-
-  fp = mpi_file_handle_to_fp(mpi_info_, fh);
 
-  if (fp == NULL) {
-    printf("MPI_File_read_all: invalid_file handle\n");
-    exit(0);
-  }
-  else {
+  {
+    FILE * fp = mpi_file_handle_to_fp(mpi_info_, fh);
 
     /* Translate to a simple fread() */
+    /* A short count of items indicates an error (eof or error) ... */
 
     size_t size   = mpi_sizeof(datatype);
     size_t nitems = count;
-    size_t nr = fread(buf, size, nitems, fp);
+    size_t nread  = fread(buf, size, nitems, fp);
 
-    if (nr != nitems) {
-      printf("MPI_File_read_all(): incorrect number of items in fread()\n");
-    }
 
-    if (ferror(fp)) {
-      perror("perror: ");
-      printf("MPI_File_read_all() file operation failed\n");
-      exit(0);
+    if (nread < nitems) {
+      ifail = MPI_ERR_IO;
+      printf("MPI_File_read_all(): "); if (ferror(fp)) perror(NULL);
     }
   }
 
  err:
   if (status != MPI_STATUS_IGNORE) status->MPI_ERROR = ifail;
+
   return ifail;
 }
 
@@ -2301,39 +2325,27 @@ int MPI_File_write_all(MPI_File fh, const void * buf, int count,
 		       MPI_Datatype datatype, MPI_Status * status) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_File_write_all";
+  const char * fname = "MPI_File_write_all()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_FILE_MPI_ERR_FILE(fh, fname);
   ERR_IF_FILE_MPI_ERR_BUFFER(fh, buf, fname);
-  /* count: integer */
+  ERR_IF_FILE_MPI_ERR_COUNT(fh, count, fname);
   ERR_IF_FILE_MPI_ERR_DATATYPE(fh, datatype, fname);
-  /* status may be MPI_STATUS_IGNORE */
 
   {
     FILE * fp = mpi_file_handle_to_fp(mpi_info_, fh);
 
-    if (fp == NULL) {
-      printf("MPI_File_write_all: invalid_file handle");
-      exit(0); /* FIXME */
-    }
-    else {
-
-      /* Translate to a simple fwrite() */
-
-      size_t size   = mpi_sizeof(datatype);
-      size_t nitems = count;
-      size_t nw = fwrite(buf, size, nitems, fp);
+    /* Translate to a simple fwrite() */
+    /* A short count of items indicates an error ... */
 
-      if (nw != nitems) {
-	printf("MPI_File_write_all(): incorrect number of items in fwrite()\n");
-      }
+    size_t size   = mpi_sizeof(datatype);
+    size_t nitems = count;
+    size_t nwrite = fwrite(buf, size, nitems, fp);
 
-      if (ferror(fp)) {
-	perror("perror: ");
-	printf("MPI_File_write_all() file operation failed\n");
-	exit(0); /* FIXME */
-      }
+    if (nwrite < nitems) {
+      ifail = MPI_ERR_IO;
+      printf("MPI_File_write_all(): "); if (ferror(fp)) perror(NULL);
     }
   }
 
@@ -2353,21 +2365,17 @@ int MPI_File_write_all_begin(MPI_File fh, const void * buf, int count,
 			     MPI_Datatype datatype) {
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_File_write_all_begin";
+  const char * fname = "MPI_File_write_all_begin()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_FILE_MPI_ERR_FILE(fh, fname);
   ERR_IF_FILE_MPI_ERR_BUFFER(fh, buf, fname);
-  /* count: integer */
+  ERR_IF_FILE_MPI_ERR_COUNT(fh, count, fname);
   ERR_IF_FILE_MPI_ERR_DATATYPE(fh, datatype, fname);
 
   /* We are going to do it here and throw away the status */
 
-  {
-    MPI_Status status = {0};
-
-    ifail = MPI_File_write_all(fh, buf, count, datatype, &status);
-  }
+  ifail = MPI_File_write_all(fh, buf, count, datatype, MPI_STATUS_IGNORE);
 
  err:
   return ifail;
@@ -2385,12 +2393,13 @@ int MPI_File_write_all_end(MPI_File fh, const void * buf, MPI_Status * status) {
    * status object. */
 
   int ifail = MPI_SUCCESS;
-  const char * fname = "MPI_File_write_all_end";
+  const char * fname = "MPI_File_write_all_end()";
 
   ERR_IF_MPI_NOT_INITIALISED(fname);
   ERR_IF_FILE_MPI_ERR_FILE(fh, fname);
   ERR_IF_FILE_MPI_ERR_BUFFER(fh, buf, fname);
-  ERR_IF_FILE_MPI_ERR_ARG(fh, status == NULL, fname);
+
+  if (status != MPI_STATUS_IGNORE) status->MPI_ERROR = MPI_SUCCESS;
 
  err:
   return ifail;
@@ -2400,15 +2409,18 @@ int MPI_File_write_all_end(MPI_File fh, const void * buf, MPI_Status * status) {
 
 /*****************************************************************************
  *
- *  mpi_is_valid_comm
+ *  mpi_err_comm
  *
  *****************************************************************************/
 
-int mpi_is_valid_comm(MPI_Comm comm) {
+static int mpi_err_comm(MPI_Comm comm) {
 
-  if (comm < MPI_COMM_WORLD || comm >= MAX_CART_COMM) return 0;
+  int ifail = MPI_SUCCESS;
+
+  if (comm <  MPI_COMM_WORLD) ifail = MPI_ERR_COMM;
+  if (comm >= MAX_CART_COMM)  ifail = MPI_ERR_COMM;
 
-  return 1;
+  return ifail;
 }
 
 /*****************************************************************************
@@ -2483,15 +2495,15 @@ static int mpi_data_type_free(mpi_info_t * ctxt, MPI_Datatype * handle) {
  *
  *****************************************************************************/
 
-static int mpi_data_type_handle(mpi_info_t * ctxt, MPI_Datatype handle) {
+static int mpi_data_type_handle(mpi_info_t * mpi, MPI_Datatype handle) {
 
   int index = MPI_DATATYPE_NULL;
 
-  assert(ctxt);
-  assert(handle >= 0);
+  assert(mpi);
+  assert(handle >= 0); /* i.e. MPI_DATATYPE_NULL or user data type. */
 
-  if (handle <= ctxt->ndatatypelast) {
-    index = ctxt->dt[handle].handle;
+  if (handle <= mpi->ndatatypelast) {
+    index = mpi->dt[handle].handle;
   }
 
   return index;
@@ -2513,6 +2525,7 @@ static MPI_File mpi_file_handle_retain(mpi_info_t * mpi, FILE * fp) {
   assert(mpi);
   assert(fp);
 
+  /* Find a free handle */
   for (int ih = 1; ih < MAX_USER_FILE; ih++) {
     if (mpi->filelist[ih].fp == NULL) {
       fh = ih;
@@ -2520,14 +2533,11 @@ static MPI_File mpi_file_handle_retain(mpi_info_t * mpi, FILE * fp) {
     }
   }
 
-  if (fh == MPI_FILE_NULL) {
-    printf("Run out of MPI file handles\n");
-    exit(0);
+  if (fh != MPI_FILE_NULL) {
+    /* Record the pointer against the handle */
+    mpi->filelist[fh].fp = fp;
   }
 
-  /* Record the pointer against the handle */
-  mpi->filelist[fh].fp = fp;
-
   return fh;
 }
 
@@ -2576,52 +2586,326 @@ static FILE * mpi_file_handle_to_fp(mpi_info_t * mpi, MPI_File fh) {
 
 /*****************************************************************************
  *
- *  mpi_file_handle_invalid
+ *  mpi_err_file
  *
- *  Returns MPI_SUCCESS if fh is valid or, MPI_ERR_FILE if invalid.
+ *  Returns MPI_SUCCESS if file is a valid file handle
+ *  or MPI_ERR_FILE if invalid.
  *
  *****************************************************************************/
 
-static int mpi_file_handle_invalid(MPI_File fh) {
+static int mpi_err_file(MPI_File file) {
 
   int ifail = MPI_SUCCESS;
 
   assert(mpi_info_);
 
-  if (mpi_file_handle_to_fp(mpi_info_, fh) == NULL) ifail = MPI_ERR_FILE;
+  if (mpi_file_handle_to_fp(mpi_info_, file) == NULL) ifail = MPI_ERR_FILE;
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  mpi_err_count
+ *
+ *****************************************************************************/
+
+static int mpi_err_count(int count) {
+
+  int ifail = MPI_SUCCESS;
+
+  if (count < 0) ifail = MPI_ERR_COUNT;
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  mpi_err_datatype
+ *
+ *****************************************************************************/
+
+static int mpi_err_datatype(MPI_Datatype dt) {
+
+  int ifail = MPI_SUCCESS;
+
+  if (mpi_datatype_intrinsic(dt) == 0 && mpi_datatype_user(dt) == 0) {
+    ifail = MPI_ERR_DATATYPE;
+  }
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  mpi_err_arg
+ *
+ ****************************************************************************/
+
+static int mpi_err_arg(int arg) {
+
+  int ifail = MPI_SUCCESS;
+
+  if (arg) ifail = MPI_ERR_ARG; /* (sic) arg is a condition for failure */
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  mpi_err_amode
+ *
+ *  amode is the mode argument to MPI_File_open(); this returns
+ *  MPI_ERR_AMODE if amode is invalid.
+ *
+ *****************************************************************************/
+
+static int mpi_err_amode(int amode) {
+
+  int ifail = MPI_SUCCESS;
+  /* Weights 1, 2, 4 let the sum identify which access modes are set */
+  {
+    int have_rdonly = (amode & MPI_MODE_RDONLY) ? 1 : 0;
+    int have_wronly = (amode & MPI_MODE_WRONLY) ? 2 : 0;
+    int have_rdwr   = (amode & MPI_MODE_RDWR)   ? 4 : 0;
+
+    int have_create = (amode & MPI_MODE_CREATE);
+    int have_excl   = (amode & MPI_MODE_EXCL);
+
+    switch (have_rdonly + have_wronly + have_rdwr) {
+    case (1):
+      /* Read only: may not be combined with CREATE or EXCL */
+      if (have_create) ifail = MPI_ERR_AMODE;
+      if (have_excl)   ifail = MPI_ERR_AMODE;
+      break;
+    case (2):
+      /* Write only: no further restriction is applied here */
+      break;
+    case (4):
+      /* Read write: no further restriction is applied here */
+      break;
+    default:
+      /* None, or more than one, of RDONLY/WRONLY/RDWR was set */
+      ifail = MPI_ERR_AMODE;
+    }
+  }
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  mpi_err_buffer
+ *
+ *****************************************************************************/
+
+static int mpi_err_buffer(const void * buf) {
+
+  int ifail = MPI_SUCCESS;
+
+  if (!buf) ifail = MPI_ERR_BUFFER;
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  mpi_err_errhandler
+ *
+ *****************************************************************************/
+
+static int mpi_err_errhandler(MPI_Errhandler errhandler) {
+
+  int ifail = MPI_SUCCESS;
+
+  if (errhandler != MPI_ERRORS_ARE_FATAL) ifail = MPI_ERR_ERRHANDLER;
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  mpi_err_info
+ *
+ *****************************************************************************/
+
+static int mpi_err_info(MPI_Info info) {
+
+  int ifail = MPI_SUCCESS;
+
+  if (info != MPI_INFO_NULL) {
+    /* At the moment, only MPI_INFO_NULL is handled */
+    ifail = MPI_ERR_INTERN;
+  }
 
   return ifail;
 }
 
 /*****************************************************************************
  *
- *  mpi_tag_valid MPI_ERR_TAG
+ *  mpi_err_op
  *
  *****************************************************************************/
 
-static int mpi_tag_invalid(int tag) {
+static int mpi_err_op(MPI_Op op) {
 
-  int ifail = MPI_ERR_TAG;
+  int ifail = MPI_SUCCESS;
+
+  /* Any recognised op is fine (except MPI_OP_NULL).
+   * Strictly, the validity of an op depends on additional information
+   * such as data type, so this is very simple ... */
 
-  /* Special values: MPI_ANY_TAG */
-  if (tag == MPI_ANY_TAG) ifail = MPI_SUCCESS;
+  if (op < 0)        ifail = MPI_ERR_OP;
+  if (op > MPI_LXOR) ifail = MPI_ERR_OP;
 
   return ifail;
 }
 
 /*****************************************************************************
  *
- *  mpi_datayupe_invalid
+ *  mpi_err_rank
  *
  *****************************************************************************/
 
-static int mpi_datatype_invalid(MPI_Datatype dt) {
+static int mpi_err_rank(int rank) {
 
   int ifail = MPI_SUCCESS;
 
-  /* Look at the size to determined whether valid */
-  int sz = mpi_sizeof(dt);
-  if (sz < 0) ifail = MPI_ERR_DATATYPE;
+  if (rank != 0)              ifail = MPI_ERR_RANK;
+  if (rank == MPI_ANY_SOURCE) ifail = MPI_SUCCESS;    /* ... but this ok */
+  if (rank == MPI_PROC_NULL)  ifail = MPI_SUCCESS;    /* also ok. */
 
   return ifail;
 }
+
+/*****************************************************************************
+ *
+ *  mpi_err_root
+ *
+ *****************************************************************************/
+
+static int mpi_err_root(int root) {
+
+  int ifail = MPI_SUCCESS;
+
+  if (root != 0) ifail = MPI_ERR_ROOT;
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  mpi_err_tag
+ *
+ *****************************************************************************/
+
+static int mpi_err_tag(int tag) {
+  /* MPI_SUCCESS for tag >= 0 or MPI_ANY_TAG, else MPI_ERR_TAG (no UB check) */
+  int ifail = MPI_SUCCESS;
+
+  if (tag < 0) ifail = MPI_ERR_TAG;            /* zero is a valid tag */
+  if (tag == MPI_ANY_TAG) ifail = MPI_SUCCESS; /* MPI_ANY_TAG < 0 */
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  mpi_comm_errors_are_fatal
+ *
+ *  The first optional argument must be present, and it should be a
+ *  format string suitable for a printf()-like function. Remaining
+ *  argements should be consistent with the format.
+ *
+ *  This is fatal if ifail != MPI_SUCCESS.
+ *
+ *****************************************************************************/
+
+static void mpi_comm_errors_are_fatal(MPI_Comm * comm, int * ifail, ...) {
+
+  assert(comm);
+  assert(ifail);
+
+  if (*ifail != MPI_SUCCESS) {
+    va_list ap;
+    va_start(ap, ifail);
+    {
+      const char * fmt = va_arg(ap, const char *);
+      printf("MPI_ERRORS_ARE_FATAL: ");
+      vprintf(fmt, ap);
+      printf(" (comm = %d)\n", *comm);
+    }
+    va_end(ap);
+    exit(-1);
+  }
+
+  return;
+}
+
+/*****************************************************************************
+ *
+ *  mpi_file_errors_return
+ *
+ *  I'm going to store the error string, but I don't know if there's
+ *  anything to be done with it...
+ *
+ *****************************************************************************/
+
+static void mpi_file_errors_return(MPI_File * file, int * ifail, ...) {
+
+  assert(file);
+  assert(ifail);
+
+  if (*ifail != MPI_SUCCESS) {
+    char * errorstr = mpi_info_->filelist[*file].errorstr;
+    va_list ap;
+    va_start(ap, ifail);
+    {
+      const char * fmt = va_arg(ap, const char *);
+      vsnprintf(errorstr, MPI_MAX_ERROR_STRING-1, fmt, ap);
+    }
+    va_end(ap);
+  }
+
+  return;
+}
+
+/*****************************************************************************
+ *
+ *  mpi_datatype_intrinsic
+ *
+ *****************************************************************************/
+
+static int mpi_datatype_intrinsic(MPI_Datatype dt) {
+
+  int intrinsic = 0;
+
+  /* Is dt an intrinsic datatype? See mpi.h */
+
+  if (MPI_CHAR >= dt && dt >= MPI_INT64_T) intrinsic = 1;
+
+  return intrinsic;
+}
+
+/*****************************************************************************
+ *
+ *  mpi_datatype_user
+ *
+ *****************************************************************************/
+
+static int mpi_datatype_user(MPI_Datatype dt) {
+
+  int isuser = 0;
+
+  assert(mpi_info_);
+  assert(MPI_DATATYPE_NULL == 0);   /* mpi_info->dt[0] is null */ 
+
+  /* Is dt a valid user datatype */
+
+  if (0 < dt && dt <= MAX_USER_DT) {
+    /* Slot in use? NOTE(review): dt == MAX_USER_DT passes; confirm dt[] size */
+    if (mpi_info_->dt[dt].handle != MPI_DATATYPE_NULL) isuser = 1;
+  }
+
+  return isuser;
+}
diff --git a/mpi_s/mpi_tests.c b/mpi_s/mpi_tests.c
index ac5f2004f..7840fc5a2 100644
--- a/mpi_s/mpi_tests.c
+++ b/mpi_s/mpi_tests.c
@@ -4,7 +4,10 @@
  *
  *  Tests for the serial stubs
  *
- *  (c) 2022 The University of Edinburgh
+ *  Edinburgh Soft Matter and Statistical Physics Group and
+ *  Edinburgh Parallel Computing Centre
+ *
+ *  (c) 2022-2024 The University of Edinburgh
  *
  *****************************************************************************/
 
@@ -27,6 +30,8 @@ static int test_mpi_reduce(void);
 static int test_mpi_allgather(void);
 static int test_mpi_type_contiguous(void);
 static int test_mpi_type_create_struct(void);
+static int test_mpi_type_size(void);
+static int test_mpi_type_get_extent(void);
 static int test_mpi_op_create(void);
 static int test_mpi_file_open(void);
 static int test_mpi_file_get_view(void);
@@ -34,6 +39,12 @@ static int test_mpi_file_set_view(void);
 static int test_mpi_type_create_subarray(void);
 static int test_mpi_file_write_all(void);
 
+static int test_mpi_comm_split_type(void);
+static int test_mpi_cart_create(void);
+static int test_mpi_cart_get(void);
+static int test_mpi_dims_create(void);
+
+
 /* Utilities */
 
 /*****************************************************************************
@@ -104,7 +115,12 @@ int util_double_same(double d1, double d2) {
   return util_bits_same(sizeof(double), &a, &b);
 }
 
-static int test_mpi_comm_split_type(void);
+
+/*****************************************************************************
+ *
+ *  main
+ *
+ *****************************************************************************/
 
 int main (int argc, char ** argv) {
 
@@ -123,8 +139,13 @@ int main (int argc, char ** argv) {
 
   test_mpi_type_contiguous();
   test_mpi_type_create_struct();
+  test_mpi_type_size();
+  test_mpi_type_get_extent();
   test_mpi_op_create();
   test_mpi_comm_split_type();
+  test_mpi_cart_create();
+  test_mpi_cart_get();
+  test_mpi_dims_create();
 
   test_mpi_file_open();
   test_mpi_file_get_view();
@@ -184,30 +205,48 @@ static int test_mpi_comm_size(void) {
 
 static int test_mpi_allreduce(void) {
 
-  int ireturn;
-  double dsend, drecv;
-  int isend[3], irecv[3];
+  int ifail = 0;
+  MPI_Comm comm = MPI_COMM_WORLD;
 
-  dsend = 1.0; drecv = 0.0;
-  ireturn = MPI_Allreduce(&dsend, &drecv, 1, MPI_DOUBLE, MPI_SUM, comm_);
-  assert(ireturn == MPI_SUCCESS);
-  assert(util_double_same(dsend, 1.0));   /* Exactly */
-  assert(util_double_same(drecv, dsend)); /* Exactly */
+  {
+    double dsend = 1.0;
+    double drecv = 0.0;
+    int ireturn = MPI_Allreduce(&dsend, &drecv, 1, MPI_DOUBLE, MPI_SUM, comm);
+    if (ireturn != MPI_SUCCESS) ifail = 1;
+    assert(ifail == 0);
+    if (util_double_same(dsend, 1.0) == 0) ifail = 1;   /* Exactly */
+    assert(ifail == 0);
+    if (util_double_same(drecv, dsend) == 0) ifail = 1; /* Exactly */
+    assert(ifail == 0);
+  }
 
-  isend[0] = -1;
-  isend[1] = 0;
-  isend[2] = +1;
+  {
+    int isend[3] = {1, 2, 3};
+    int irecv[3] = {0, 0, 0};
+
+    int ireturn = MPI_Allreduce(isend, irecv, 3, MPI_INT, MPI_SUM, comm);
+    if (ireturn != MPI_SUCCESS) ifail = 1;  /* a failed call must be flagged */
+    assert(ifail == 0);
+    if (isend[0] != 1) ifail += 1;          /* send buffer must be unchanged */
+    if (isend[1] != 2) ifail += 1;
+    if (isend[2] != 3) ifail += 1;
+    assert(ifail == 0);
+    if (irecv[0] != isend[0]) ifail += 1;   /* result expected to equal input */
+    if (irecv[1] != isend[1]) ifail += 1;
+    if (irecv[2] != isend[2]) ifail += 1;
+    assert(ifail == 0);
+  }
 
-  ireturn = MPI_Allreduce(isend, irecv, 3, MPI_INT, MPI_SUM, comm_);
-  assert(ireturn == MPI_SUCCESS);
-  assert(isend[0] == -1);
-  assert(isend[1] == 0);
-  assert(isend[2] == +1);
-  assert(irecv[0] == -1);
-  assert(irecv[1] == 0);
-  assert(irecv[2] == +1);
+  {
+    int irecv = 1;
+    int iret  = MPI_Allreduce(MPI_IN_PLACE, &irecv, 1, MPI_INT, MPI_SUM, comm);
+    if (iret != MPI_SUCCESS) ifail = 1;
+    assert(ifaill == 0);
+    if (irecv != 1) ifail = 1;
+    assert(ifail == 0);
+  }
 
-  return ireturn;
+  return ifail;
 }
 
 /*****************************************************************************
@@ -390,6 +429,7 @@ int test_mpi_type_create_struct(void) {
 
 /*****************************************************************************
  *
+ *  test_op_create_function
  *  test_mpi_op_create
  *
  *****************************************************************************/
@@ -402,6 +442,14 @@ void test_op_create_function(void * invec, void * inoutvec, int * len,
   assert(len);
   assert(dt);
 
+  if (*dt == MPI_INT) { /* see comments below */
+    int * myinoutvec = (int *) inoutvec;
+    int * myinvec    = (int *) invec;
+    for (int n = 0; n < *len; n++) {
+      myinoutvec[n] = 2*myinvec[n];
+    }
+  }
+
   return;
 }
 
@@ -413,7 +461,8 @@ static int test_mpi_op_create(void) {
   assert(op != MPI_OP_NULL);
 
   {
-    /* Smoke test */
+    /* Smoke test; the implementation only ever copies; the function
+     * is not called ... */
     int send = 1;
     int recv = 0;
 
@@ -435,24 +484,30 @@ static int test_mpi_op_create(void) {
 
 int test_mpi_file_open(void) {
 
+  int ifail = MPI_SUCCESS;
   MPI_Comm comm = MPI_COMM_WORLD;
   MPI_Info info = MPI_INFO_NULL;
 
   {
     /* fopen "r". We must have an existing file. */
     MPI_File fh = MPI_FILE_NULL;
-    MPI_File_open(comm, "/dev/null", MPI_MODE_RDONLY, info, &fh);
+    ifail = MPI_File_open(comm, "/dev/null", MPI_MODE_RDONLY, info, &fh);
+    assert(ifail == MPI_SUCCESS);
     assert(fh != MPI_FILE_NULL);
-    MPI_File_close(&fh);
+    ifail = MPI_File_close(&fh);
+    assert(ifail == MPI_SUCCESS);
     assert(fh == MPI_FILE_NULL);
   }
 
   {
     /* fopen "w" */
     MPI_File fh = MPI_FILE_NULL;
-    MPI_File_open(comm, "zw.dat", MPI_MODE_WRONLY+MPI_MODE_CREATE, info, &fh);
+    ifail = MPI_File_open(comm, "zw.dat", MPI_MODE_WRONLY+MPI_MODE_CREATE,
+			  info, &fh);
+    assert(ifail == MPI_SUCCESS);
     assert(fh != MPI_FILE_NULL);
-    MPI_File_close(&fh);
+    ifail = MPI_File_close(&fh);
+    assert(ifail == MPI_SUCCESS);
     assert(fh == MPI_FILE_NULL);
     unlink("zw.dat");
   }
@@ -460,23 +515,51 @@ int test_mpi_file_open(void) {
   {
     /* fopen "a" */
     MPI_File fh = MPI_FILE_NULL;
-    MPI_File_open(comm, "z.dat",  MPI_MODE_WRONLY+MPI_MODE_APPEND, info, &fh);
+    ifail = MPI_File_open(comm, "z.dat",  MPI_MODE_WRONLY+MPI_MODE_APPEND,
+			  info, &fh);
+    assert(ifail == MPI_SUCCESS);
     assert(fh != MPI_FILE_NULL);
-    MPI_File_close(&fh);
+    ifail = MPI_File_close(&fh);
+    assert(ifail == MPI_SUCCESS);
     assert(fh == MPI_FILE_NULL);
   }
 
   {
     /* fopen "r+" */
     MPI_File fh = MPI_FILE_NULL;
-    MPI_File_open(comm, "z.dat", MPI_MODE_RDWR, info, &fh);
+    ifail = MPI_File_open(comm, "z.dat", MPI_MODE_RDWR, info, &fh);
+    assert(ifail == MPI_SUCCESS);
     assert(fh != MPI_FILE_NULL);
-    MPI_File_close(&fh);
+    ifail = MPI_File_close(&fh);
+    assert(ifail == MPI_SUCCESS);
     assert(fh == MPI_FILE_NULL);
     unlink("z.dat");
   }
 
-  return 0;
+  /* Errors return when bad communicator */
+  {
+    MPI_File fh = MPI_FILE_NULL;
+    ifail = MPI_File_open(MPI_COMM_NULL, "/dev/null", MPI_MODE_RDWR, info,
+			  &fh);
+    assert(ifail == MPI_ERR_COMM);
+  }
+
+  /* Errors return when bad amode */
+  {
+    int amode = 0;
+    MPI_File fh = MPI_FILE_NULL;
+    ifail = MPI_File_open(comm, "/dev/null", amode, info, &fh);
+    assert(ifail == MPI_ERR_AMODE);
+  }
+
+  /* Errors return when no such file */
+  {
+    MPI_File fh = MPI_FILE_NULL;
+    ifail = MPI_File_open(comm, "none-such", MPI_MODE_RDONLY, info, &fh);
+    assert(ifail == MPI_ERR_NO_SUCH_FILE);
+  }
+
+  return ifail;
 }
 
 /*****************************************************************************
@@ -611,7 +694,7 @@ int test_mpi_file_write_all(void) {
 #define NX 23
 #define NY 12
 
-  int ifail = 0; /* return value */
+  int ifail = MPI_SUCCESS; /* return value */
 
   const char * filename = "mpi-file-write-all.dat";
   MPI_Comm comm = MPI_COMM_WORLD;
@@ -638,7 +721,6 @@ int test_mpi_file_write_all(void) {
     MPI_File fh = MPI_FILE_NULL;
     MPI_Offset disp = 0;
 
-    int ifail = MPI_SUCCESS;
     int count = 1;
     double wbuf[NX*NY] = {0};
 
@@ -716,3 +798,172 @@ int test_mpi_comm_split_type(void) {
 
   return 0;
 }
+
+/*****************************************************************************
+ *
+ *  test_mpi_cart_create
+ *
+ *****************************************************************************/
+
+static int test_mpi_cart_create(void) {
+
+  int ifail = 0;
+
+  {
+    int ndims = 2;
+    int dims[2]  = {1, 1};
+    int periods[2] = {0, 0};
+    int reorder = 0;
+    MPI_Comm comm = MPI_COMM_NULL;
+
+    MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, periods, reorder, &comm);
+
+    assert(comm != MPI_COMM_NULL);
+    assert(comm != MPI_COMM_SELF);
+
+    MPI_Comm_free(&comm);
+  }
+
+  {
+    int ndims = 3;
+    int dims[3] = {1, 1, 1};
+    int periods[3] = {1, 0, 1};
+    int reorder = 1;
+    MPI_Comm comm = MPI_COMM_NULL;
+
+    MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, periods, reorder, &comm);
+
+    assert(comm != MPI_COMM_NULL);
+    assert(comm != MPI_COMM_SELF);
+
+    MPI_Comm_free(&comm);
+  }
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  test_mpi_cart_get
+ *
+ *****************************************************************************/
+
+static int test_mpi_cart_get(void) {
+
+  int ifail = 0;
+
+  {
+    int ndims = 2;
+    int dims[2]  = {1, 1};
+    int periods[2] = {0, 0};
+    int reorder = 0;
+    MPI_Comm comm = MPI_COMM_NULL;
+
+    MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, periods, reorder, &comm);
+
+    {
+      int xdims[2] = {0};
+      int xperiods[2] = {-1, -1};
+      int coords[2]   = {-1, -1};
+      MPI_Cart_get(comm, ndims, xdims, xperiods, coords);
+
+      assert(xdims[0]    == dims[0]);
+      assert(xdims[1]    == dims[1]);
+      assert(xperiods[0] == periods[0]);
+      assert(xperiods[1] == periods[1]);
+      assert(coords[0]   == 0);
+      assert(coords[1]   == 0);
+    }
+
+    MPI_Comm_free(&comm);
+  }
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  test_mpi_dims_create
+ *
+ *****************************************************************************/
+
+static int test_mpi_dims_create(void) {
+  /* One node in two dimensions: dims {0, 0} must come back as {1, 1} */
+  int ifail = 0;
+
+  {
+    int nnodes = 1;
+    int ndims  = 2;
+    int dims[2] = {0, 0};
+
+    ifail = MPI_Dims_create(nnodes, ndims, dims);
+    assert(ifail == MPI_SUCCESS);
+    assert(dims[0] == 1);
+    assert(dims[1] == 1);
+  }
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  test_mpi_type_size
+ *
+ *****************************************************************************/
+
+static int test_mpi_type_size(void) {
+
+  /* Intrinsic */
+  {
+    int sz = -1;
+    MPI_Type_size(MPI_DOUBLE, &sz);
+    assert(sz == sizeof(double));
+  }
+
+  /* User */
+  {
+    int sz = -1;
+    MPI_Datatype dt = MPI_DATATYPE_NULL;
+    MPI_Type_vector(2, 3, 0, MPI_INT, &dt);
+    MPI_Type_commit(&dt);
+    MPI_Type_size(dt, &sz);
+    assert(sz == 2*3*sizeof(int));
+    MPI_Type_free(&dt);
+  }
+
+  return 0;
+}
+
+/*****************************************************************************
+ *
+ *  test_mpi_type_get_extent
+ *
+ *****************************************************************************/
+
+static int test_mpi_type_get_extent(void) {
+
+  /* Intrinsic */
+  {
+    MPI_Aint lb = 999;
+    MPI_Aint extent = 999;
+    MPI_Type_get_extent(MPI_INT, &lb, &extent);
+    assert(lb == 0);
+    assert(extent == sizeof(int));
+  }
+
+  /* User */
+  {
+    MPI_Datatype udt = MPI_DATATYPE_NULL;
+    MPI_Aint lb = 999;
+    MPI_Aint extent = 999;
+
+    MPI_Type_contiguous(3, MPI_INT, &udt);
+    MPI_Type_commit(&udt);
+    MPI_Type_get_extent(udt, &lb, &extent);
+    assert(lb == 0);
+    assert(extent == 3*sizeof(int));
+    MPI_Type_free(&udt);
+  }
+
+  return 0;
+}

From 6e380784a5134a9c5550add80d9f009b6c9ca169 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Mon, 25 Nov 2024 09:04:44 +0000
Subject: [PATCH 051/133] Format/spelling

---
 mpi_s/mpi_serial.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mpi_s/mpi_serial.c b/mpi_s/mpi_serial.c
index 1b7ce5520..5f20a012b 100644
--- a/mpi_s/mpi_serial.c
+++ b/mpi_s/mpi_serial.c
@@ -2064,7 +2064,7 @@ int MPI_File_open(MPI_Comm comm, const char * filename, int amode,
       break;
     default:
       ifail = MPI_ERR_IO;
-      mpi_file_errors_return(&file, &ifail, 
+      mpi_file_errors_return(&file, &ifail,
 			     "MPI_File_open(): failed file %s mode %s",
 			     filename, fdmode);
     }
@@ -2815,7 +2815,7 @@ static int mpi_err_tag(int tag) {
  *
  *  The first optional argument must be present, and it should be a
  *  format string suitable for a printf()-like function. Remaining
- *  argements should be consistent with the format.
+ *  arguments should be consistent with the format.
  *
  *  This is fatal if ifail != MPI_SUCCESS.
  *
@@ -2898,7 +2898,7 @@ static int mpi_datatype_user(MPI_Datatype dt) {
   int isuser = 0;
 
   assert(mpi_info_);
-  assert(MPI_DATATYPE_NULL == 0);   /* mpi_info->dt[0] is null */ 
+  assert(MPI_DATATYPE_NULL == 0);   /* mpi_info->dt[0] is null */
 
   /* Is dt a valid user datatype */
 

From 7c1f20e1c90feda622c7850b644af68aefcd1b74 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Mon, 25 Nov 2024 09:10:02 +0000
Subject: [PATCH 052/133] Fix typo in assertion

---
 mpi_s/mpi_tests.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mpi_s/mpi_tests.c b/mpi_s/mpi_tests.c
index 7840fc5a2..f040ec8de 100644
--- a/mpi_s/mpi_tests.c
+++ b/mpi_s/mpi_tests.c
@@ -241,7 +241,7 @@ static int test_mpi_allreduce(void) {
     int irecv = 1;
     int iret  = MPI_Allreduce(MPI_IN_PLACE, &irecv, 1, MPI_INT, MPI_SUM, comm);
     if (iret != MPI_SUCCESS) ifail = 1;
-    assert(ifaill == 0);
+    assert(ifail == 0);
     if (irecv != 1) ifail = 1;
     assert(ifail == 0);
   }

From f71946de698035447a815d61e8e6914665400c64 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Mon, 25 Nov 2024 09:20:53 +0000
Subject: [PATCH 053/133] Replace trivial switch statement

---
 mpi_s/mpi_serial.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/mpi_s/mpi_serial.c b/mpi_s/mpi_serial.c
index 5f20a012b..823c50edb 100644
--- a/mpi_s/mpi_serial.c
+++ b/mpi_s/mpi_serial.c
@@ -2055,14 +2055,13 @@ int MPI_File_open(MPI_Comm comm, const char * filename, int amode,
 
   if (fp == NULL) {
     /* Fail in fopen() => errno is set (many possible values ...) */
-    switch (errno) {
-    case ENOENT:
+    if (errno ==  ENOENT) {
       ifail = MPI_ERR_NO_SUCH_FILE;
       mpi_file_errors_return(&file, &ifail,
 			     "MPI_File_open(): no such file %s",
 			     filename);
-      break;
-    default:
+    }
+    else {
       ifail = MPI_ERR_IO;
       mpi_file_errors_return(&file, &ifail,
 			     "MPI_File_open(): failed file %s mode %s",
@@ -2142,12 +2141,11 @@ int MPI_File_delete(const char * filename, MPI_Info info) {
      * MPI_ERR_NO_SUCH_FILE if the file does not exist; or
      * MPI_ERR_FILE_IN_USE or MPI_ERR_ACCESS. We use the latter. */
 
-    switch (errno) {
-    case ENOENT:
+    if (errno == ENOENT) {
       ifail = MPI_ERR_NO_SUCH_FILE;
       mpi_file_errors_return(&file, &ifail, "MPI_Delete(): no such file");
-      break;
-    default:
+    }
+    else {
       ifail = MPI_ERR_ACCESS;
       mpi_file_errors_return(&file, &ifail, "MPI_Delete(): access error");
     }

From bb693cf5ba30cb43d5ef889640861729e6837521 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Mon, 25 Nov 2024 09:21:17 +0000
Subject: [PATCH 054/133] Remove repeat declaration

---
 mpi_s/mpi_tests.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mpi_s/mpi_tests.c b/mpi_s/mpi_tests.c
index f040ec8de..8084b8233 100644
--- a/mpi_s/mpi_tests.c
+++ b/mpi_s/mpi_tests.c
@@ -752,7 +752,6 @@ int test_mpi_file_write_all(void) {
     MPI_File fh = MPI_FILE_NULL;
     MPI_Offset disp = 0;
 
-    int ifail = MPI_SUCCESS;
     int count = 1;
     double rbuf[NX*NY] = {0};
 

From 8964a853b2aaf3264f732b22fe56bd33560c93a2 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Wed, 27 Nov 2024 15:44:06 +0000
Subject: [PATCH 055/133] Remove repeated update of swimming direction

---
 src/bbl.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/bbl.c b/src/bbl.c
index c08c3ed09..14a8e46b6 100644
--- a/src/bbl.c
+++ b/src/bbl.c
@@ -1227,10 +1227,6 @@ int bbl_update_ellipsoid(bbl_t * bbl, wall_t * wall, colloid_t * pc,
     util_vector_copy(4, quaternext, pc->s.quat);
   }
 
-  /* Re-orient swimming direction */
-
-  util_q4_rotate_vector(pc->s.quat, v1, pc->s.m);
-
   return iret;
 }
 

From 2bf36227fedba3baf22d795510f0babcae128fb8 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Wed, 27 Nov 2024 15:48:43 +0000
Subject: [PATCH 056/133] ..and remove unused variable

---
 src/bbl.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/bbl.c b/src/bbl.c
index 14a8e46b6..fcced3326 100644
--- a/src/bbl.c
+++ b/src/bbl.c
@@ -1205,7 +1205,6 @@ int bbl_update_ellipsoid(bbl_t * bbl, wall_t * wall, colloid_t * pc,
   double quaternext[4];
   double owathalf[3];
   double qbar[4];
-  double v1[3]={1.0,0.0,0.0};
 
   assert(bbl);
   assert(wall);

From 58432e0942f2dae00f18ebe6a4afacd436ef6d29 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Mon, 2 Dec 2024 13:14:35 +0000
Subject: [PATCH 057/133] debugging graph API implementation

---
 src/lb_data.c | 173 +++++++++++++++++++++++++++-----------------------
 1 file changed, 92 insertions(+), 81 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index e0dc2de95..e843513f1 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1256,8 +1256,8 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
 			 tdpMemcpyHostToDevice) );
     
     if (have_graph_api_) {
-      lb_graph_halo_send_create(lb, h, send_count);
-      lb_graph_halo_recv_create(lb, h, recv_count);
+      lb_graph_halo_send_create(lb, h, h->count);
+      lb_graph_halo_recv_create(lb, h, h->count);
     }
 
   }
@@ -1321,12 +1321,15 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
   if (ndevice > 0) {
     copyModelToDevice(&lb->model, &lb->target->model);
     copyModelToDevice(&h->map, &h->target->map);
-    for (int ireq = 0; ireq < h->map.nvel; ireq++) {
-      if (h->count[ireq] > 0) {
-        if (have_graph_api_) {
-          tdpAssert( tdpGraphLaunch(h->gsend.exec, h->stream) );
-          tdpAssert( tdpStreamSynchronize(h->stream) );
-        } else {
+    if (have_graph_api_) {
+      printf("here 1\n");
+      tdpAssert( tdpGraphLaunch(h->gsend.exec, h->stream) );
+      printf("here 2\n");
+      tdpAssert( tdpStreamSynchronize(h->stream) );
+      printf("here 3\n");
+    } else {
+      for (int ireq = 0; ireq < h->map.nvel; ireq++) {
+        if (h->count[ireq] > 0) {
           int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
           dim3 nblk, ntpb;
           kernel_launch_param(scount, &nblk, &ntpb);
@@ -1396,12 +1399,15 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
   int ndevice;
   tdpGetDeviceCount(&ndevice);
   if (ndevice > 0) {
-    for (int ireq = 0; ireq < h->map.nvel; ireq++) {
-      if (h->count[ireq] > 0) {
-        if (have_graph_api_) {
-          tdpAssert( tdpGraphLaunch(h->grecv.exec, h->stream) );
-          tdpAssert( tdpStreamSynchronize(h->stream) );
-        } else {
+    if (have_graph_api_) {
+      printf("here 4\n");
+      tdpAssert( tdpGraphLaunch(h->grecv.exec, h->stream) );
+      printf("here 5\n");
+      tdpAssert( tdpStreamSynchronize(h->stream) );
+      printf("here 6\n");
+    } else {
+      for (int ireq = 0; ireq < h->map.nvel; ireq++) {
+        if (h->count[ireq] > 0) {
           int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
           dim3 nblk, ntpb;
           kernel_launch_param(rcount, &nblk, &ntpb);
@@ -1458,11 +1464,16 @@ int lb_halo_free(lb_t * lb, lb_halo_t * h) {
   }
 
   if (have_graph_api_) {
+    printf("here 7\n");
     tdpAssert( tdpGraphDestroy(h->gsend.graph) );
+    printf("here 8\n");
     tdpAssert( tdpGraphDestroy(h->grecv.graph) );
+    printf("here 9\n");
   }
 
+  tdpAssert( tdpStreamDestroy(h->stream) );
   lb_model_free(&h->map);
+  *h = (lb_halo_t) {0};
 
   return 0;
 }
@@ -1774,35 +1785,35 @@ int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count)
     if (have_gpu_aware_mpi_) {
       /* Don't need explicit device -> host copy */
     }
-//    else {
-//      /* We do need to add the memcpys to the graph definition
-//       * (except messages to self... ) */
-//
-//      int i = 1 + h->cv[h->nvel - ireq][X];
-//      int j = 1 + h->cv[h->nvel - ireq][Y];
-//      int k = 1 + h->cv[h->nvel - ireq][Z];
-//
-//      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
-//	tdpGraphNode_t memcpyNode;
-//        tdpMemcpy3DParms memcpyParams = {0};
-//
-//	memcpyParams.srcArray = NULL;
-//	memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
-//	memcpyParams.srcPtr   = make_tdpPitchedPtr(h->send_d[ireq],
-//						   sizeof(double)*scount,
-//						   scount, 1);
-//	memcpyParams.dstArray = NULL;
-//	memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
-//	memcpyParams.dstPtr   = make_tdpPitchedPtr(h->send[ireq],
-//						   sizeof(double)*scount,
-//						   scount, 1);
-//	memcpyParams.extent   = make_tdpExtent(sizeof(double)*scount, 1, 1);
-//	memcpyParams.kind     = tdpMemcpyDeviceToHost;
-//
-//	tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->gsend.graph,
-//					 &kernelNode, 1, &memcpyParams) );
-//      }
-//    }
+    else {
+      /* We do need to add the memcpys to the graph definition
+       * (except messages to self... ) */
+
+      int i = 1 + h->map.cv[h->map.nvel - ireq][X];
+      int j = 1 + h->map.cv[h->map.nvel - ireq][Y];
+      int k = 1 + h->map.cv[h->map.nvel - ireq][Z];
+
+      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
+	      tdpGraphNode_t memcpyNode;
+        tdpMemcpy3DParms memcpyParams = {0};
+
+	      memcpyParams.srcArray = NULL;
+	      memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
+	      memcpyParams.srcPtr   = make_tdpPitchedPtr(h->send_d[ireq],
+						   sizeof(double)*scount,
+						   scount, 1);
+	      memcpyParams.dstArray = NULL;
+	      memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
+	      memcpyParams.dstPtr   = make_tdpPitchedPtr(h->send[ireq],
+						   sizeof(double)*scount,
+						   scount, 1);
+	      memcpyParams.extent   = make_tdpExtent(sizeof(double)*scount, 1, 1);
+	      memcpyParams.kind     = tdpMemcpyDeviceToHost;
+
+	      tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->gsend.graph,
+					 &kernelNode, 1, &memcpyParams) );
+      }
+    }
   }
 
   tdpAssert( tdpGraphInstantiate(&h->gsend.exec, h->gsend.graph, 0) );
@@ -1830,31 +1841,31 @@ int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count)
     if (have_gpu_aware_mpi_) {
       /* Don't need explicit copies */
     }
-//    else {
-//      int i = 1 + h->cv[h->nvel - ireq][X];
-//      int j = 1 + h->cv[h->nvel - ireq][Y];
-//      int k = 1 + h->cv[h->nvel - ireq][Z];
-//
-//      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
-//	tdpMemcpy3DParms memcpyParams = {0};
-//
-//	memcpyParams.srcArray = NULL;
-//	memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
-//	memcpyParams.srcPtr   = make_tdpPitchedPtr(h->recv[ireq],
-//						   sizeof(double)*rcount,
-//						   rcount, 1);
-//	memcpyParams.dstArray = NULL;
-//	memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
-//	memcpyParams.dstPtr   = make_tdpPitchedPtr(h->recv_d[ireq],
-//						   sizeof(double)*rcount,
-//						   rcount, 1);
-//	memcpyParams.extent   = make_tdpExtent(sizeof(double)*rcount, 1, 1);
-//	memcpyParams.kind     = tdpMemcpyHostToDevice;
-//
-//	tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->grecv.graph, NULL,
-//					 0, &memcpyParams) );
-//      }
-//    }
+    else {
+      int i = 1 + h->map.cv[h->map.nvel - ireq][X];
+      int j = 1 + h->map.cv[h->map.nvel - ireq][Y];
+      int k = 1 + h->map.cv[h->map.nvel - ireq][Z];
+
+      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
+	      tdpMemcpy3DParms memcpyParams = {0};
+
+	      memcpyParams.srcArray = NULL;
+	      memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
+	      memcpyParams.srcPtr   = make_tdpPitchedPtr(h->recv[ireq],
+						   sizeof(double)*rcount,
+						   rcount, 1);
+	      memcpyParams.dstArray = NULL;
+	      memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
+	      memcpyParams.dstPtr   = make_tdpPitchedPtr(h->recv_d[ireq],
+						   sizeof(double)*rcount,
+						   rcount, 1);
+        memcpyParams.extent   = make_tdpExtent(sizeof(double)*rcount, 1, 1);
+        memcpyParams.kind     = tdpMemcpyHostToDevice;
+
+	      tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->grecv.graph, NULL,
+					 0, &memcpyParams) );
+      }
+    }
 
     /* Always need the dis-aggregateion kernel */
 
@@ -1879,19 +1890,19 @@ int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count)
       tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, NULL,
 				       0, &kernelNodeParams) );
     }
-//    else {
-//      int i = 1 + h->cv[h->nvel - ireq][X];
-//      int j = 1 + h->cv[h->nvel - ireq][Y];
-//      int k = 1 + h->cv[h->nvel - ireq][Z];
-//      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
-//	tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, &memcpyNode,
-//					 1, &kernelNodeParams) );
-//      }
-//      else {
-//	tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, NULL, 0,
-//					 &kernelNodeParams) );
-//      }
-//    }
+    else {
+      int i = 1 + h->map.cv[h->map.nvel - ireq][X];
+      int j = 1 + h->map.cv[h->map.nvel - ireq][Y];
+      int k = 1 + h->map.cv[h->map.nvel - ireq][Z];
+      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
+	      tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, &memcpyNode,
+					 1, &kernelNodeParams) );
+      }
+      else {
+	      tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, NULL, 0,
+					 &kernelNodeParams) );
+      }
+    }
   }
 
   tdpAssert( tdpGraphInstantiate(&h->grecv.exec, h->grecv.graph, 0) );

From 59a53bc2ff85c88e05e7f49a5d489a583273b1f7 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Fri, 6 Dec 2024 13:53:47 +0000
Subject: [PATCH 058/133] Makefile should return a failure on failure

---
 tests/regression/d3q27/Makefile | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/tests/regression/d3q27/Makefile b/tests/regression/d3q27/Makefile
index 023475870..0c006ae91 100644
--- a/tests/regression/d3q27/Makefile
+++ b/tests/regression/d3q27/Makefile
@@ -2,17 +2,23 @@
 #
 #  Makefile
 #
-#  D3Q27 regression tests
-#
 ###############################################################################
 
 include ../../../Makefile.mk
 
 PAR=${LAUNCH_MPIRUN_CMD}
 
-serial:
-	@echo "TEST --> regression tests (d3q27)"
-	inputs='serial*inp'; \
-	for file in $$inputs; do ../../test.sh $$file "" "$(PAR)"; done
+SOURCES = $(wildcard *.inp)
+LOGS    = ${SOURCES:.inp=.new}
+
+test:
+	$(MAKE) -s clean
+	$(MAKE) -s logs
+	@echo End of tests.
+
+logs:	$(LOGS)
+
+%.new:	%.inp
+	../../test.sh $< "" "${PAR}"
 clean:
-	rm -f *new test-diff* input
+	rm -f *new test-diff* *meta *001-001

From 95c9952e8cd1db2f07cf569e9927846e64b538c2 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Fri, 6 Dec 2024 14:03:59 +0000
Subject: [PATCH 059/133] Update test result for issue 330

---
 tests/regression/d3q27/serial-elip-s03.log | 16 +++++++++-------
 tests/regression/d3q27/serial-elip-s04.log | 17 ++++++++++-------
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/tests/regression/d3q27/serial-elip-s03.log b/tests/regression/d3q27/serial-elip-s03.log
index 994283033..728022041 100644
--- a/tests/regression/d3q27/serial-elip-s03.log
+++ b/tests/regression/d3q27/serial-elip-s03.log
@@ -107,20 +107,22 @@ Starting time step loop.
 Particle statistics:
 
 Colloid velocities - x y z
-[minimum ]  6.4306406e-03 -1.7741963e-05 -1.7741963e-05
-[maximum ]  6.4306406e-03 -1.7741963e-05 -1.7741963e-05
+[minimum ]  6.4306388e-03 -1.7703160e-05 -1.7703160e-05
+[maximum ]  6.4306388e-03 -1.7703160e-05 -1.7703160e-05
 
 Scalars - total mean variance min max
-[rho]       32560.00  1.00000000000  1.9972572e-09  0.99826881739  1.00441854328
+[rho]      32560.00  1.00000000000  1.9972597e-09  0.99826876562  1.00441859091
+
 
 Momentum - x y z
 [total   ] -1.3433699e-13 -5.1482256e-15  3.8679997e-15
-[fluid   ] -1.2940888e+00  3.4147104e-03  3.4147104e-03
-[colloids]  1.2940888e+00 -3.4147104e-03 -3.4147104e-03
+[fluid   ] -1.2940885e+00  3.4081177e-03  3.4081177e-03
+[colloids]  1.2940885e+00 -3.4081177e-03 -3.4081177e-03
 
 Velocity - x y z
-[minimum ] -4.0246851e-03 -2.4449766e-03 -2.4449766e-03
-[maximum ]  1.8512408e-03  2.5745655e-03  2.5745655e-03
+[minimum ] -4.0247187e-03 -2.4450319e-03 -2.4450319e-03
+[maximum ]  1.8512417e-03  2.5745714e-03  2.5745714e-03
+
 
 Completed cycle 10
 
diff --git a/tests/regression/d3q27/serial-elip-s04.log b/tests/regression/d3q27/serial-elip-s04.log
index cc8a83505..2fde0271e 100644
--- a/tests/regression/d3q27/serial-elip-s04.log
+++ b/tests/regression/d3q27/serial-elip-s04.log
@@ -108,20 +108,23 @@ Starting time step loop.
 Particle statistics:
 
 Colloid velocities - x y z
-[minimum ]  6.4005865e-03 -3.9979463e-05 -3.9979463e-05
-[maximum ]  6.4005865e-03 -3.9979463e-05 -3.9979463e-05
+[minimum ]  6.4005826e-03 -3.9908322e-05 -3.9908322e-05
+[maximum ]  6.4005826e-03 -3.9908322e-05 -3.9908322e-05
+
 
 Scalars - total mean variance min max
-[rho]       32560.00  1.00000000000  1.0865699e-08  0.99661770937  1.00083934723
+[rho]      32560.00  1.00000000000  1.0865577e-08  0.99661751238  1.00083939335
+
 
 Momentum - x y z
 [total   ] -1.4832580e-13 -2.0774181e-14 -8.6259125e-15
-[fluid   ] -1.2826249e+00  6.5402714e-03  6.5402714e-03
-[colloids]  1.2826249e+00 -6.5402714e-03 -6.5402714e-03
+[fluid   ] -1.2826242e+00  6.5269074e-03  6.5269074e-03
+[colloids]  1.2826242e+00 -6.5269074e-03 -6.5269074e-03
 
 Velocity - x y z
-[minimum ] -4.9274366e-03 -2.7857056e-03 -2.7857056e-03
-[maximum ]  1.7194405e-03  2.9575687e-03  2.9575687e-03
+[minimum ] -4.9274808e-03 -2.7857644e-03 -2.7857644e-03
+[maximum ]  1.7194614e-03  2.9575581e-03  2.9575581e-03
+
 
 Completed cycle 10
 

From 36cc95dec5d4615168e66c560650f23f10f8be23 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Fri, 6 Dec 2024 14:13:45 +0000
Subject: [PATCH 060/133] Update Makefile

---
 tests/regression/d2q9/Makefile  | 20 +++++++++++++-------
 tests/regression/d3q15/Makefile | 20 +++++++++++++-------
 2 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/tests/regression/d2q9/Makefile b/tests/regression/d2q9/Makefile
index a61a537a6..0c006ae91 100644
--- a/tests/regression/d2q9/Makefile
+++ b/tests/regression/d2q9/Makefile
@@ -2,17 +2,23 @@
 #
 #  Makefile
 #
-#  D2Q9 regression tests
-#
 ###############################################################################
 
 include ../../../Makefile.mk
 
 PAR=${LAUNCH_MPIRUN_CMD}
 
-d2q9:
-	@echo "TEST --> regression tests (d2q9)"
-	inputs='serial*inp'; \
-	for file in $$inputs; do ../../test.sh $$file "" "$(PAR)"; done
+SOURCES = $(wildcard *.inp)
+LOGS    = ${SOURCES:.inp=.new}
+
+test:
+	$(MAKE) -s clean
+	$(MAKE) -s logs
+	@echo End of tests.
+
+logs:	$(LOGS)
+
+%.new:	%.inp
+	../../test.sh $< "" "${PAR}"
 clean:
-	rm -f *new test-diff* input
+	rm -f *new test-diff* *meta *001-001
diff --git a/tests/regression/d3q15/Makefile b/tests/regression/d3q15/Makefile
index 15c7cbc07..0c006ae91 100644
--- a/tests/regression/d3q15/Makefile
+++ b/tests/regression/d3q15/Makefile
@@ -2,17 +2,23 @@
 #
 #  Makefile
 #
-#  D3Q15 regression tests
-#
 ###############################################################################
 
 include ../../../Makefile.mk
 
 PAR=${LAUNCH_MPIRUN_CMD}
 
-serial:
-	@echo "TEST --> regression tests (d3q15)"
-	inputs='serial*inp'; \
-	for file in $$inputs; do ../../test.sh $$file "" "$(PAR)"; done
+SOURCES = $(wildcard *.inp)
+LOGS    = ${SOURCES:.inp=.new}
+
+test:
+	$(MAKE) -s clean
+	$(MAKE) -s logs
+	@echo End of tests.
+
+logs:	$(LOGS)
+
+%.new:	%.inp
+	../../test.sh $< "" "${PAR}"
 clean:
-	rm -f *new test-diff* input
+	rm -f *new test-diff* *meta *001-001

From 861259673a588357aa03a07a01d45ab85780726a Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@cirrus.epcc.ed.ac.uk>
Date: Fri, 6 Dec 2024 15:53:47 +0000
Subject: [PATCH 061/133] Repair GPU tests

---
 src/field.c                                   | 28 ++++++++-----------
 src/ludwig.c                                  |  9 ++++++
 .../serial-open-ru1.inp                       |  0
 .../serial-open-ru1.log                       |  4 +--
 .../serial-open-ru2.inp                       |  0
 .../serial-open-ru2.log                       | 18 ++++++------
 tests/test-diff.sh                            |  2 ++
 tests/unit/test_blue_phase.c                  |  4 +--
 8 files changed, 37 insertions(+), 28 deletions(-)
 rename tests/regression/{d3q19-short => d3q27}/serial-open-ru1.inp (100%)
 rename tests/regression/{d3q19-short => d3q27}/serial-open-ru1.log (98%)
 rename tests/regression/{d3q19-short => d3q27}/serial-open-ru2.inp (100%)
 rename tests/regression/{d3q19-short => d3q27}/serial-open-ru2.log (89%)

diff --git a/src/field.c b/src/field.c
index a312dc297..ad16de026 100644
--- a/src/field.c
+++ b/src/field.c
@@ -48,15 +48,6 @@ __host__ int field_init(field_t * obj, int nhcomm, lees_edw_t * le);
 #include "mpi-ext.h"
 #endif
 
-#ifdef __HIPCC__
-/* There are two file-scope switches here, which need to be generalised
- * via some suitable interface; they are separate, but both relate to
- * GPU execution. */
-static const int have_graph_api_ = 1;
-#else
-static const int have_graph_api_ = 0;
-#endif
-
 #if defined (MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
 static const int have_gpu_aware_mpi_ = 1;
 #else
@@ -1416,10 +1407,8 @@ int field_halo_create(const field_t * field, field_halo_t * h) {
     tdpAssert( tdpMemcpy(h->target->recv, h->recv_d, 27*sizeof(double *),
 			 tdpMemcpyHostToDevice) );
 
-    if (have_graph_api_) {
-      field_graph_halo_send_create(field, h);
-      field_graph_halo_recv_create(field, h);
-    }
+    field_graph_halo_send_create(field, h);
+    field_graph_halo_recv_create(field, h);
   }
 
   return 0;
@@ -1433,11 +1422,14 @@ int field_halo_create(const field_t * field, field_halo_t * h) {
 
 int field_halo_post(const field_t * field, field_halo_t * h) {
 
+  int ndevice = 0;
   const int tagbase = 2022;
 
   assert(field);
   assert(h);
 
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
+
   /* Post recvs */
 
   TIMER_start(TIMER_FIELD_HALO_IRECV);
@@ -1468,7 +1460,7 @@ int field_halo_post(const field_t * field, field_halo_t * h) {
 
   TIMER_start(TIMER_FIELD_HALO_PACK);
 
-  if (have_graph_api_) {
+  if (ndevice) {
     tdpAssert( tdpGraphLaunch(h->gsend.exec, h->stream) );
     tdpAssert( tdpStreamSynchronize(h->stream) );
   }
@@ -1515,9 +1507,13 @@ int field_halo_post(const field_t * field, field_halo_t * h) {
 
 int field_halo_wait(field_t * field, field_halo_t * h) {
 
+  int ndevice = 0;
+
   assert(field);
   assert(h);
 
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
+
   TIMER_start(TIMER_FIELD_HALO_WAITALL);
 
   MPI_Waitall(2*h->nvel, h->request, MPI_STATUSES_IGNORE);
@@ -1526,7 +1522,7 @@ int field_halo_wait(field_t * field, field_halo_t * h) {
 
   TIMER_start(TIMER_FIELD_HALO_UNPACK);
 
-  if (have_graph_api_) {
+  if (ndevice) {
     tdpAssert( tdpGraphLaunch(h->grecv.exec, h->stream) );
     tdpAssert( tdpStreamSynchronize(h->stream) );
   }
@@ -1628,7 +1624,7 @@ int field_halo_free(field_halo_t * h) {
     free(h->recv[p]);
   }
 
-  if (have_graph_api_) {
+  if (ndevice > 0) {
     tdpAssert( tdpGraphDestroy(h->gsend.graph) );
     tdpAssert( tdpGraphDestroy(h->grecv.graph) );
   }
diff --git a/src/ludwig.c b/src/ludwig.c
index c17dd1be3..cff116770 100644
--- a/src/ludwig.c
+++ b/src/ludwig.c
@@ -612,8 +612,12 @@ void ludwig_run(const char * inputfile) {
 	TIMER_start(TIMER_HALO_LATTICE);
 	hydro_u_halo(ludwig->hydro);
 	TIMER_stop(TIMER_HALO_LATTICE);
+
+	/* Work-around for gpu regression tests ... */
+	hydro_memcpy(ludwig->hydro, tdpMemcpyDeviceToHost);
       }
 
+
       /* Time splitting for high electrokinetic diffusions in Nernst Planck */
 
       psi_multisteps(ludwig->psi, &multisteps);
@@ -662,6 +666,11 @@ void ludwig_run(const char * inputfile) {
       psi_halo_rho(ludwig->psi);
       TIMER_stop(TIMER_HALO_LATTICE);
 
+      if (ludwig->hydro) {
+	/* Workaround for gpu regression tests ... */
+	hydro_memcpy(ludwig->hydro, tdpMemcpyHostToDevice);
+      }
+
       nernst_planck_adjust_multistep(ludwig->psi);
       psi_zero_mean(ludwig->psi);
     }
diff --git a/tests/regression/d3q19-short/serial-open-ru1.inp b/tests/regression/d3q27/serial-open-ru1.inp
similarity index 100%
rename from tests/regression/d3q19-short/serial-open-ru1.inp
rename to tests/regression/d3q27/serial-open-ru1.inp
diff --git a/tests/regression/d3q19-short/serial-open-ru1.log b/tests/regression/d3q27/serial-open-ru1.log
similarity index 98%
rename from tests/regression/d3q19-short/serial-open-ru1.log
rename to tests/regression/d3q27/serial-open-ru1.log
index 3d4cca95a..003a269c8 100644
--- a/tests/regression/d3q19-short/serial-open-ru1.log
+++ b/tests/regression/d3q27/serial-open-ru1.log
@@ -76,8 +76,8 @@ Boundary walls:                  - Y Z
 Boundary speed u_x (bottom):     0.0000000e+00
 Boundary speed u_x (top):        0.0000000e+00
 Boundary normal lubrication rc:  0.0000000e+00
-Wall boundary links allocated:   5664
-Memory (total, bytes):           90624
+Wall boundary links allocated:   10080
+Memory (total, bytes):           161280
 Boundary shear initialise:       0
 Initial conditions.
 
diff --git a/tests/regression/d3q19-short/serial-open-ru2.inp b/tests/regression/d3q27/serial-open-ru2.inp
similarity index 100%
rename from tests/regression/d3q19-short/serial-open-ru2.inp
rename to tests/regression/d3q27/serial-open-ru2.inp
diff --git a/tests/regression/d3q19-short/serial-open-ru2.log b/tests/regression/d3q27/serial-open-ru2.log
similarity index 89%
rename from tests/regression/d3q19-short/serial-open-ru2.log
rename to tests/regression/d3q27/serial-open-ru2.log
index fdc0463aa..2341cb0c6 100644
--- a/tests/regression/d3q19-short/serial-open-ru2.log
+++ b/tests/regression/d3q27/serial-open-ru2.log
@@ -76,8 +76,8 @@ Boundary walls:                  X Y -
 Boundary speed u_x (bottom):     0.0000000e+00
 Boundary speed u_x (top):        0.0000000e+00
 Boundary normal lubrication rc:  0.0000000e+00
-Wall boundary links allocated:   5664
-Memory (total, bytes):           90624
+Wall boundary links allocated:   10080
+Memory (total, bytes):           161280
 Boundary shear initialise:       0
 Initial conditions.
 
@@ -92,16 +92,18 @@ Momentum - x y z
 Starting time step loop.
 
 Scalars - total mean variance min max
-[rho]        3459.52  1.00101995193  3.2691361e-06  1.00000000000  1.00688883987
+[rho]       3459.53  1.00102050427  3.2724861e-06  1.00000000000  1.00691490497
+
 
 Momentum - x y z
-[total   ] -1.3593293e-14  1.0130785e-15  2.2941291e+00
-[fluid   ] -7.4107387e-15 -8.6042284e-16  1.8646524e+00
-[walls   ] -6.1825545e-15  1.8735014e-15  4.2947672e-01
+[total   ] -4.3368087e-16 -1.0755286e-15  2.2977672e+00
+[fluid   ]  9.7283293e-15 -5.6898930e-16  1.8654951e+00
+[walls   ] -1.0162010e-14 -5.0653925e-16  4.3227207e-01
+
 
 Velocity - x y z
-[minimum ] -3.1215960e-04 -3.1215960e-04 -1.7347235e-17
-[maximum ]  3.1215960e-04  3.1215960e-04  3.0594611e-03
+[minimum ] -3.1211457e-04 -3.1211457e-04  0.0000000e+00
+[maximum ]  3.1211457e-04  3.1211457e-04  3.0608474e-03
 
 Completed cycle 10
 
diff --git a/tests/test-diff.sh b/tests/test-diff.sh
index 652d95626..711a1f127 100755
--- a/tests/test-diff.sh
+++ b/tests/test-diff.sh
@@ -101,6 +101,7 @@ sed -i~ '/GPU\ INFO/d' test-diff-tmp.ref
 sed -i~ '/SIMD\ vector/d' test-diff-tmp.ref
 sed -i~ '/Start time/d' test-diff-tmp.ref
 sed -i~ '/End time/d' test-diff-tmp.ref
+sed -i~ '/Halo type/d' test-diff-tmp.ref
 
 sed '/call)/d' $2 > test-diff-tmp.log
 sed -i~ '/calls)/d' test-diff-tmp.log
@@ -125,6 +126,7 @@ sed -i~ '/GPU\ INFO/d' test-diff-tmp.log
 sed -i~ '/SIMD\ vector/d' test-diff-tmp.log
 sed -i~ '/Start time/d' test-diff-tmp.log
 sed -i~ '/End time/d' test-diff-tmp.log
+sed -i~ '/Halo type/d' test-diff-tmp.log
 
 # Allow different decompositions ...
 # The strategy is that we can ignore these simple quantities, as
diff --git a/tests/unit/test_blue_phase.c b/tests/unit/test_blue_phase.c
index dac5fdf1c..a916e1d3d 100644
--- a/tests/unit/test_blue_phase.c
+++ b/tests/unit/test_blue_phase.c
@@ -334,13 +334,13 @@ int test_o8m_struct(pe_t * pe, cs_t * cs, lees_edw_t * le, fe_lc_t * fe,
   /* Now the free energy density. This requires that the gradients are
    * set. These values use the standard 27-point stencil in 3-d. */
 
-  /* Gradient computation is on the device, so ... */
+  /* Gradient computation is on the device, so ...
+  *  ... copy to gpu and perform gpu halo ... */
 
   field_memcpy(fq, tdpMemcpyHostToDevice);
   field_halo(fq);
 
   field_grad_compute(fqgrad);
-
   field_grad_memcpy(fqgrad, tdpMemcpyDeviceToHost);
 
   ic = 1;

From 46b55049200369b243358e1fcf477968d3bde5d3 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Fri, 6 Dec 2024 15:57:54 +0000
Subject: [PATCH 062/133] Spelling

---
 tests/unit/test_blue_phase.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/tests/unit/test_blue_phase.c b/tests/unit/test_blue_phase.c
index a916e1d3d..8b4f9711e 100644
--- a/tests/unit/test_blue_phase.c
+++ b/tests/unit/test_blue_phase.c
@@ -101,7 +101,7 @@ int test_bp_suite(void) {
  *
  *  test_bp_nonfield
  *
- *  Values confimed via Wolfram Alpha eigenvalue widget.
+ *  Values confirmed via Wolfram Alpha eigenvalue widget.
  *
  *****************************************************************************/
 
@@ -159,9 +159,9 @@ static int test_bp_nonfield(void) {
  *       kappa0 = 0.01
  *       kappa1 = 0.01
  *                with one constant approximation (kappa0 = kappa1 = kappa).
- *       epsilon  dielectric anisotropy (typcially comes out to be 41.4 in
+ *       epsilon  dielectric anisotropy (typically comes out to be 41.4 in
  *                lattice units based on matching Frederick transition).
- *                          
+ *
  *
  *  Molecular aspect ratio:
  *       xi = 0.7
@@ -242,7 +242,7 @@ int test_o8m_struct(pe_t * pe, cs_t * cs, lees_edw_t * le, fe_lc_t * fe,
   fe_lc_reduced_temperature(fe, &vtest);
   test_assert(fabs(value - vtest) < TEST_DOUBLE_TOLERANCE);
 
-  /* Set up the q tensor and sample some lattice sites. 
+  /* Set up the q tensor and sample some lattice sites.
    * Note there are a limited number of unique order parameter values,
    * so an exhaustive test is probably not worth while. */
 
@@ -373,7 +373,7 @@ int test_o8m_struct(pe_t * pe, cs_t * cs, lees_edw_t * le, fe_lc_t * fe,
 
   fe_lc_compute_fed(fe, gamma, q, dq, &value);
   test_assert(fabs(value - 1.056203e-02) < TEST_FLOAT_TOLERANCE);
-  
+
   {
     double fed_bulk = 0.0;
     double fed_grad = 0.0;
@@ -580,7 +580,7 @@ int test_o8m_struct(pe_t * pe, cs_t * cs, lees_edw_t * le, fe_lc_t * fe,
   test_assert(fabs(dsq[X][Z] - -9.837494e-03) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Y][X] - -9.837494e-03) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Y][Y] - -7.887056e-03) < TEST_FLOAT_TOLERANCE);
-  test_assert(fabs(dsq[Y][Z] - -8.924220e-03) < TEST_FLOAT_TOLERANCE);  
+  test_assert(fabs(dsq[Y][Z] - -8.924220e-03) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Z][X] - -8.924220e-03) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Z][Y] - -9.837494e-03) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Z][Z] - -7.887056e-03) < TEST_FLOAT_TOLERANCE);
@@ -602,7 +602,7 @@ int test_o8m_struct(pe_t * pe, cs_t * cs, lees_edw_t * le, fe_lc_t * fe,
   test_assert(fabs(dsq[X][X] -  7.375082e-03) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[X][Y] -  0.0000000000) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[X][Z] -  0.0000000000) < TEST_FLOAT_TOLERANCE);
-  test_assert(fabs(dsq[Y][X] -  0.0000000000) < TEST_FLOAT_TOLERANCE);  
+  test_assert(fabs(dsq[Y][X] -  0.0000000000) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Y][Y] - -4.179480e-02) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Y][Z] - -2.748179e-02) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Z][X] -  0.0000000000) < TEST_FLOAT_TOLERANCE);
@@ -628,7 +628,7 @@ int test_o8m_struct(pe_t * pe, cs_t * cs, lees_edw_t * le, fe_lc_t * fe,
   test_assert(fabs(dsq[X][Z] -  9.837494e-03) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Y][X] -  9.837494e-03) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Y][Y] - -7.887056e-03) < TEST_FLOAT_TOLERANCE);
-  test_assert(fabs(dsq[Y][Z] - -8.924220e-03) < TEST_FLOAT_TOLERANCE);  
+  test_assert(fabs(dsq[Y][Z] - -8.924220e-03) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Z][X] -  8.924220e-03) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Z][Y] - -9.837494e-03) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Z][Z] - -7.887056e-03) < TEST_FLOAT_TOLERANCE);
@@ -650,7 +650,7 @@ int test_o8m_struct(pe_t * pe, cs_t * cs, lees_edw_t * le, fe_lc_t * fe,
   test_assert(fabs(dsq[X][X] -  2.779621e-04) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[X][Y] -  7.180623e-04) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[X][Z] -  0.0000000000) < TEST_FLOAT_TOLERANCE);
-  test_assert(fabs(dsq[Y][X] -  1.308445e-03) < TEST_FLOAT_TOLERANCE);  
+  test_assert(fabs(dsq[Y][X] -  1.308445e-03) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Y][Y] - -5.056451e-03) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Y][Z] -  0.0000000000) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Z][X] -  0.0000000000) < TEST_FLOAT_TOLERANCE);
@@ -674,7 +674,7 @@ int test_o8m_struct(pe_t * pe, cs_t * cs, lees_edw_t * le, fe_lc_t * fe,
   test_assert(fabs(dsq[X][X] - -1.007305e-04) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[X][Y] -  0.0000000000) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[X][Z] -  0.0000000000) < TEST_FLOAT_TOLERANCE);
-  test_assert(fabs(dsq[Y][X] -  0.0000000000) < TEST_FLOAT_TOLERANCE);  
+  test_assert(fabs(dsq[Y][X] -  0.0000000000) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Y][Y] -  2.779621e-04) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Y][Z] -  7.180623e-04) < TEST_FLOAT_TOLERANCE);
   test_assert(fabs(dsq[Z][X] -  0.0000000000) < TEST_FLOAT_TOLERANCE);
@@ -692,7 +692,7 @@ int test_o8m_struct(pe_t * pe, cs_t * cs, lees_edw_t * le, fe_lc_t * fe,
     jc = 1;
     kc = 1;
     index = lees_edw_index(le, ic, jc, kc);
-    
+
     fe_lc_stress(fe, index, sfull);
     fe_lc_bulk_stress(fe, index, sbulk);
     fe_lc_grad_stress(fe, index, sgrad);
@@ -713,7 +713,7 @@ int test_o8m_struct(pe_t * pe, cs_t * cs, lees_edw_t * le, fe_lc_t * fe,
     jc = 1;
     kc = 2;
     index = lees_edw_index(le, ic, jc, kc);
-    
+
     fe_lc_stress(fe, index, sfull);
     fe_lc_bulk_stress(fe, index, sbulk);
     fe_lc_grad_stress(fe, index, sgrad);
@@ -734,7 +734,7 @@ int test_o8m_struct(pe_t * pe, cs_t * cs, lees_edw_t * le, fe_lc_t * fe,
     jc = 1;
     kc = 3;
     index = lees_edw_index(le, ic, jc, kc);
-    
+
     fe_lc_stress(fe, index, sfull);
     fe_lc_bulk_stress(fe, index, sbulk);
     fe_lc_grad_stress(fe, index, sgrad);
@@ -755,7 +755,7 @@ int test_o8m_struct(pe_t * pe, cs_t * cs, lees_edw_t * le, fe_lc_t * fe,
     jc = 12;
     kc = 4;
     index = lees_edw_index(le, ic, jc, kc);
-    
+
     fe_lc_stress(fe, index, sfull);
     fe_lc_bulk_stress(fe, index, sbulk);
     fe_lc_grad_stress(fe, index, sgrad);
@@ -776,7 +776,7 @@ int test_o8m_struct(pe_t * pe, cs_t * cs, lees_edw_t * le, fe_lc_t * fe,
     jc = 6;
     kc = 7;
     index = lees_edw_index(le, ic, jc, kc);
-    
+
     fe_lc_stress(fe, index, sfull);
     fe_lc_bulk_stress(fe, index, sbulk);
     fe_lc_grad_stress(fe, index, sgrad);

From f8e7529541b8b693fcc4166f92a850f81dddbee8 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Thu, 19 Dec 2024 15:51:05 +0000
Subject: [PATCH 063/133] debugging

---
 src/field.c   | 22 ++++++++---------
 src/lb_data.c | 66 +++++++++++++++++++++++++++++++++++++++++++++------
 2 files changed, 70 insertions(+), 18 deletions(-)

diff --git a/src/field.c b/src/field.c
index bb216f232..7eef25be7 100644
--- a/src/field.c
+++ b/src/field.c
@@ -1755,24 +1755,24 @@ int field_graph_halo_send_create(const field_t * field, field_halo_t * h) {
       int k = 1 + h->cv[h->nvel - ireq][Z];
 
       if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
-	tdpGraphNode_t memcpyNode;
+	      tdpGraphNode_t memcpyNode;
         tdpMemcpy3DParms memcpyParams = {0};
 
-	memcpyParams.srcArray = NULL;
-	memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
-	memcpyParams.srcPtr   = make_tdpPitchedPtr(h->send_d[ireq],
+	      memcpyParams.srcArray = NULL;
+	      memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
+	      memcpyParams.srcPtr   = make_tdpPitchedPtr(h->send_d[ireq],
 						   sizeof(double)*scount,
 						   scount, 1);
-	memcpyParams.dstArray = NULL;
-	memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
-	memcpyParams.dstPtr   = make_tdpPitchedPtr(h->send[ireq],
+	      memcpyParams.dstArray = NULL;
+	      memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
+	      memcpyParams.dstPtr   = make_tdpPitchedPtr(h->send[ireq],
 						   sizeof(double)*scount,
 						   scount, 1);
-	memcpyParams.extent   = make_tdpExtent(sizeof(double)*scount, 1, 1);
-	memcpyParams.kind     = tdpMemcpyDeviceToHost;
+	      memcpyParams.extent   = make_tdpExtent(sizeof(double)*scount, 1, 1);
+	      memcpyParams.kind     = tdpMemcpyDeviceToHost;
 
-	tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->gsend.graph,
-					 &kernelNode, 1, &memcpyParams) );
+	      tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->gsend.graph,
+	      				 &kernelNode, 1, &memcpyParams) );
       }
     }
   }
diff --git a/src/lb_data.c b/src/lb_data.c
index e843513f1..01d4c246c 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -63,6 +63,8 @@ static const int have_gpu_aware_mpi_ = 1;
 static const int have_gpu_aware_mpi_ = 0;
 #endif
 
+__global__ void lb_null_kernel(const lb_t * lb, lb_halo_t * h, int ireq) {};
+
 void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
     int nvel = h_model->nvel;
     // Allocate memory on the GPU for the arrays in the struct
@@ -935,6 +937,54 @@ __global__ void lb_halo_enqueue_send_kernel(const lb_t * lb, lb_halo_t * h, int
   }
 }
 
+__global__ void lb_halo_enqueue_send_kernel_2(const lb_t * lb, lb_halo_t * h, int ireq) {
+
+  assert(0 <= ireq && ireq < h->map.nvel);
+
+  if (h->count[ireq] > 0) {
+
+    int8_t mx = h->map.cv[ireq][X];
+    int8_t my = h->map.cv[ireq][Y];
+    int8_t mz = h->map.cv[ireq][Z];
+    int8_t mm = mx*mx + my*my + mz*mz;
+
+    int nx = 1 + h->slim[ireq].imax - h->slim[ireq].imin;
+    int ny = 1 + h->slim[ireq].jmax - h->slim[ireq].jmin;
+    int nz = 1 + h->slim[ireq].kmax - h->slim[ireq].kmin;
+
+    int strz = 1;
+    int stry = strz*nz;
+    int strx = stry*ny;
+
+    assert(mm == 1 || mm == 2 || mm == 3);
+
+	  int ih = 0;
+    for_simt_parallel (ih, nx*ny*nz, 1) {
+      int ic = h->slim[ireq].imin + ih/strx;
+      int jc = h->slim[ireq].jmin + (ih % strx)/stry;
+      int kc = h->slim[ireq].kmin + (ih % stry)/strz;
+      int ib = 0; /* Buffer index */
+
+      for (int n = 0; n < lb->ndist; n++) {
+	      for (int p = 0; p < lb->nvel; p++) {
+	        /* Recall, if full, we need p = 0 */
+	        //int8_t px = lb->model.cv[p][X];
+	        //int8_t py = lb->model.cv[p][Y];
+	        //int8_t pz = lb->model.cv[p][Z];
+	        //int dot = mx*px + my*py + mz*pz;
+	        //if (h->full || dot == mm) {
+	        if (h->full ) {
+	        //  //int index = cs_index(lb->cs, ic, jc, kc);
+	        //  //int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
+	        //  //h->send[ireq][ih*h->count[ireq] + ib] = lb->f[laddr];
+	          ib++;
+	        }
+	      }
+      }
+      //assert(ib == h->count[ireq]);
+    }
+  }
+}
 /*****************************************************************************
  *
  *  lb_halo_dequeue_recv
@@ -1327,7 +1377,7 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
       printf("here 2\n");
       tdpAssert( tdpStreamSynchronize(h->stream) );
       printf("here 3\n");
-    } else {
+    } //else {
       for (int ireq = 0; ireq < h->map.nvel; ireq++) {
         if (h->count[ireq] > 0) {
           int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
@@ -1337,7 +1387,7 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
           tdpDeviceSynchronize();
         }
       }
-    }
+    //}
   } else {
     #pragma omp parallel
     {
@@ -1405,7 +1455,7 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
       printf("here 5\n");
       tdpAssert( tdpStreamSynchronize(h->stream) );
       printf("here 6\n");
-    } else {
+    } //else {
       for (int ireq = 0; ireq < h->map.nvel; ireq++) {
         if (h->count[ireq] > 0) {
           int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
@@ -1415,7 +1465,7 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
           tdpDeviceSynchronize();
         }
       }
-    }
+    //}
   } else {
     #pragma omp parallel
     {
@@ -1766,7 +1816,8 @@ int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count)
     void * kernelArgs[3] = {(void *) &lb->target,
                             (void *) &h->target,
                             (void *) &ireq};
-    kernelNodeParams.func = (void *) lb_halo_enqueue_send_kernel;
+    kernelNodeParams.func = (void *) lb_halo_enqueue_send_kernel_2;
+    //kernelNodeParams.func = (void *) lb_null_kernel;
     dim3 nblk;
     dim3 ntpb;
     int scount = send_count[ireq]*lb_halo_size(h->slim[ireq]);
@@ -1876,7 +1927,8 @@ int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count)
     void * kernelArgs[3] = {(void *) &lb->target,
                             (void *) &h->target,
                             (void *) &ireq};
-    kernelNodeParams.func = (void *) lb_halo_dequeue_recv_kernel;
+    //kernelNodeParams.func = (void *) lb_halo_dequeue_recv_kernel;
+    kernelNodeParams.func = (void *) lb_null_kernel;
 
     kernel_launch_param(rcount, &nblk, &ntpb);
 
@@ -1908,4 +1960,4 @@ int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count)
   tdpAssert( tdpGraphInstantiate(&h->grecv.exec, h->grecv.graph, 0) );
 
   return 0;
-}
+}
\ No newline at end of file

From 9523734225c83b97eff4e47b7c9d55f0840abcef Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Thu, 19 Dec 2024 16:32:20 +0000
Subject: [PATCH 064/133] remove graph implementation

---
 src/field.c   |  22 ++--
 src/lb_data.c | 287 +++-----------------------------------------------
 src/lb_data.h |  12 ---
 3 files changed, 27 insertions(+), 294 deletions(-)

diff --git a/src/field.c b/src/field.c
index 7eef25be7..bb216f232 100644
--- a/src/field.c
+++ b/src/field.c
@@ -1755,24 +1755,24 @@ int field_graph_halo_send_create(const field_t * field, field_halo_t * h) {
       int k = 1 + h->cv[h->nvel - ireq][Z];
 
       if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
-	      tdpGraphNode_t memcpyNode;
+	tdpGraphNode_t memcpyNode;
         tdpMemcpy3DParms memcpyParams = {0};
 
-	      memcpyParams.srcArray = NULL;
-	      memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
-	      memcpyParams.srcPtr   = make_tdpPitchedPtr(h->send_d[ireq],
+	memcpyParams.srcArray = NULL;
+	memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
+	memcpyParams.srcPtr   = make_tdpPitchedPtr(h->send_d[ireq],
 						   sizeof(double)*scount,
 						   scount, 1);
-	      memcpyParams.dstArray = NULL;
-	      memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
-	      memcpyParams.dstPtr   = make_tdpPitchedPtr(h->send[ireq],
+	memcpyParams.dstArray = NULL;
+	memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
+	memcpyParams.dstPtr   = make_tdpPitchedPtr(h->send[ireq],
 						   sizeof(double)*scount,
 						   scount, 1);
-	      memcpyParams.extent   = make_tdpExtent(sizeof(double)*scount, 1, 1);
-	      memcpyParams.kind     = tdpMemcpyDeviceToHost;
+	memcpyParams.extent   = make_tdpExtent(sizeof(double)*scount, 1, 1);
+	memcpyParams.kind     = tdpMemcpyDeviceToHost;
 
-	      tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->gsend.graph,
-	      				 &kernelNode, 1, &memcpyParams) );
+	tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->gsend.graph,
+					 &kernelNode, 1, &memcpyParams) );
       }
     }
   }
diff --git a/src/lb_data.c b/src/lb_data.c
index 01d4c246c..77c414033 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -48,23 +48,12 @@ static __constant__ lb_collide_param_t static_param;
 #include "mpi-ext.h"
 #endif
 
-#ifdef __NVCC__
-/* There are two file-scope switches here, which need to be generalised
- * via some suitable interface; they are separate, but both relate to
- * GPU execution. */
-static const int have_graph_api_ = 1;
-#else
-static const int have_graph_api_ = 0;
-#endif
-
 #if defined (MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
 static const int have_gpu_aware_mpi_ = 1;
 #else
 static const int have_gpu_aware_mpi_ = 0;
 #endif
 
-__global__ void lb_null_kernel(const lb_t * lb, lb_halo_t * h, int ireq) {};
-
 void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
     int nvel = h_model->nvel;
     // Allocate memory on the GPU for the arrays in the struct
@@ -937,54 +926,6 @@ __global__ void lb_halo_enqueue_send_kernel(const lb_t * lb, lb_halo_t * h, int
   }
 }
 
-__global__ void lb_halo_enqueue_send_kernel_2(const lb_t * lb, lb_halo_t * h, int ireq) {
-
-  assert(0 <= ireq && ireq < h->map.nvel);
-
-  if (h->count[ireq] > 0) {
-
-    int8_t mx = h->map.cv[ireq][X];
-    int8_t my = h->map.cv[ireq][Y];
-    int8_t mz = h->map.cv[ireq][Z];
-    int8_t mm = mx*mx + my*my + mz*mz;
-
-    int nx = 1 + h->slim[ireq].imax - h->slim[ireq].imin;
-    int ny = 1 + h->slim[ireq].jmax - h->slim[ireq].jmin;
-    int nz = 1 + h->slim[ireq].kmax - h->slim[ireq].kmin;
-
-    int strz = 1;
-    int stry = strz*nz;
-    int strx = stry*ny;
-
-    assert(mm == 1 || mm == 2 || mm == 3);
-
-	  int ih = 0;
-    for_simt_parallel (ih, nx*ny*nz, 1) {
-      int ic = h->slim[ireq].imin + ih/strx;
-      int jc = h->slim[ireq].jmin + (ih % strx)/stry;
-      int kc = h->slim[ireq].kmin + (ih % stry)/strz;
-      int ib = 0; /* Buffer index */
-
-      for (int n = 0; n < lb->ndist; n++) {
-	      for (int p = 0; p < lb->nvel; p++) {
-	        /* Recall, if full, we need p = 0 */
-	        //int8_t px = lb->model.cv[p][X];
-	        //int8_t py = lb->model.cv[p][Y];
-	        //int8_t pz = lb->model.cv[p][Z];
-	        //int dot = mx*px + my*py + mz*pz;
-	        //if (h->full || dot == mm) {
-	        if (h->full ) {
-	        //  //int index = cs_index(lb->cs, ic, jc, kc);
-	        //  //int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
-	        //  //h->send[ireq][ih*h->count[ireq] + ib] = lb->f[laddr];
-	          ib++;
-	        }
-	      }
-      }
-      //assert(ib == h->count[ireq]);
-    }
-  }
-}
 /*****************************************************************************
  *
  *  lb_halo_dequeue_recv
@@ -1304,11 +1245,6 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
 			 tdpMemcpyHostToDevice) );
     tdpAssert( tdpMemcpy(h->target->recv, h->recv_d, 27*sizeof(double *),
 			 tdpMemcpyHostToDevice) );
-    
-    if (have_graph_api_) {
-      lb_graph_halo_send_create(lb, h, h->count);
-      lb_graph_halo_recv_create(lb, h, h->count);
-    }
 
   }
   free(send_count);
@@ -1371,23 +1307,15 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
   if (ndevice > 0) {
     copyModelToDevice(&lb->model, &lb->target->model);
     copyModelToDevice(&h->map, &h->target->map);
-    if (have_graph_api_) {
-      printf("here 1\n");
-      tdpAssert( tdpGraphLaunch(h->gsend.exec, h->stream) );
-      printf("here 2\n");
-      tdpAssert( tdpStreamSynchronize(h->stream) );
-      printf("here 3\n");
-    } //else {
-      for (int ireq = 0; ireq < h->map.nvel; ireq++) {
-        if (h->count[ireq] > 0) {
-          int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
-          dim3 nblk, ntpb;
-          kernel_launch_param(scount, &nblk, &ntpb);
-          tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
-          tdpDeviceSynchronize();
-        }
+    for (int ireq = 0; ireq < h->map.nvel; ireq++) {
+      if (h->count[ireq] > 0) {
+        int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
+        dim3 nblk, ntpb;
+        kernel_launch_param(scount, &nblk, &ntpb);
+        tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
+        tdpDeviceSynchronize();
       }
-    //}
+    }
   } else {
     #pragma omp parallel
     {
@@ -1449,23 +1377,15 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
   int ndevice;
   tdpGetDeviceCount(&ndevice);
   if (ndevice > 0) {
-    if (have_graph_api_) {
-      printf("here 4\n");
-      tdpAssert( tdpGraphLaunch(h->grecv.exec, h->stream) );
-      printf("here 5\n");
-      tdpAssert( tdpStreamSynchronize(h->stream) );
-      printf("here 6\n");
-    } //else {
-      for (int ireq = 0; ireq < h->map.nvel; ireq++) {
-        if (h->count[ireq] > 0) {
-          int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
-          dim3 nblk, ntpb;
-          kernel_launch_param(rcount, &nblk, &ntpb);
-          tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
-          tdpDeviceSynchronize();
-        }
+    for (int ireq = 0; ireq < h->map.nvel; ireq++) {
+      if (h->count[ireq] > 0) {
+        int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
+        dim3 nblk, ntpb;
+        kernel_launch_param(rcount, &nblk, &ntpb);
+        tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
+        tdpDeviceSynchronize();
       }
-    //}
+    }
   } else {
     #pragma omp parallel
     {
@@ -1513,17 +1433,7 @@ int lb_halo_free(lb_t * lb, lb_halo_t * h) {
     free(h->recv[ireq]);
   }
 
-  if (have_graph_api_) {
-    printf("here 7\n");
-    tdpAssert( tdpGraphDestroy(h->gsend.graph) );
-    printf("here 8\n");
-    tdpAssert( tdpGraphDestroy(h->grecv.graph) );
-    printf("here 9\n");
-  }
-
-  tdpAssert( tdpStreamDestroy(h->stream) );
   lb_model_free(&h->map);
-  *h = (lb_halo_t) {0};
 
   return 0;
 }
@@ -1796,168 +1706,3 @@ int lb_io_read(lb_t * lb, int timestep, io_event_t * event) {
 
   return ifail;
 }
-
-/*****************************************************************************
- *
- * lb_graph_halo_send_create
- *
- *****************************************************************************/
-
-int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count) {
-
-  assert(lb);
-  assert(h);
-
-  tdpAssert( tdpGraphCreate(&h->gsend.graph, 0) );
-
-  for (int ireq = 1; ireq < h->map.nvel; ireq++) {
-    tdpGraphNode_t kernelNode;
-    tdpKernelNodeParams kernelNodeParams = {0};
-    void * kernelArgs[3] = {(void *) &lb->target,
-                            (void *) &h->target,
-                            (void *) &ireq};
-    kernelNodeParams.func = (void *) lb_halo_enqueue_send_kernel_2;
-    //kernelNodeParams.func = (void *) lb_null_kernel;
-    dim3 nblk;
-    dim3 ntpb;
-    int scount = send_count[ireq]*lb_halo_size(h->slim[ireq]);
-
-    kernel_launch_param(scount, &nblk, &ntpb);
-
-    kernelNodeParams.gridDim        = nblk;
-    kernelNodeParams.blockDim       = ntpb;
-    kernelNodeParams.sharedMemBytes = 0;
-    kernelNodeParams.kernelParams   = (void **) kernelArgs;
-    kernelNodeParams.extra          = NULL;
-
-    tdpAssert( tdpGraphAddKernelNode(&kernelNode, h->gsend.graph, NULL, 0,
-				     &kernelNodeParams) );
-
-    if (have_gpu_aware_mpi_) {
-      /* Don't need explicit device -> host copy */
-    }
-    else {
-      /* We do need to add the memcpys to the graph definition
-       * (except messages to self... ) */
-
-      int i = 1 + h->map.cv[h->map.nvel - ireq][X];
-      int j = 1 + h->map.cv[h->map.nvel - ireq][Y];
-      int k = 1 + h->map.cv[h->map.nvel - ireq][Z];
-
-      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
-	      tdpGraphNode_t memcpyNode;
-        tdpMemcpy3DParms memcpyParams = {0};
-
-	      memcpyParams.srcArray = NULL;
-	      memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
-	      memcpyParams.srcPtr   = make_tdpPitchedPtr(h->send_d[ireq],
-						   sizeof(double)*scount,
-						   scount, 1);
-	      memcpyParams.dstArray = NULL;
-	      memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
-	      memcpyParams.dstPtr   = make_tdpPitchedPtr(h->send[ireq],
-						   sizeof(double)*scount,
-						   scount, 1);
-	      memcpyParams.extent   = make_tdpExtent(sizeof(double)*scount, 1, 1);
-	      memcpyParams.kind     = tdpMemcpyDeviceToHost;
-
-	      tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->gsend.graph,
-					 &kernelNode, 1, &memcpyParams) );
-      }
-    }
-  }
-
-  tdpAssert( tdpGraphInstantiate(&h->gsend.exec, h->gsend.graph, 0) );
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  lb_graph_halo_recv_create
- *
- *****************************************************************************/
-
-int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count) {
-
-  assert(lb);
-  assert(h);
-
-  tdpAssert( tdpGraphCreate(&h->grecv.graph, 0) );
-
-  for (int ireq = 1; ireq < h->map.nvel; ireq++) {
-    int rcount = recv_count[ireq]*lb_halo_size(h->rlim[ireq]);
-    tdpGraphNode_t memcpyNode = {0};
-
-    if (have_gpu_aware_mpi_) {
-      /* Don't need explicit copies */
-    }
-    else {
-      int i = 1 + h->map.cv[h->map.nvel - ireq][X];
-      int j = 1 + h->map.cv[h->map.nvel - ireq][Y];
-      int k = 1 + h->map.cv[h->map.nvel - ireq][Z];
-
-      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
-	      tdpMemcpy3DParms memcpyParams = {0};
-
-	      memcpyParams.srcArray = NULL;
-	      memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
-	      memcpyParams.srcPtr   = make_tdpPitchedPtr(h->recv[ireq],
-						   sizeof(double)*rcount,
-						   rcount, 1);
-	      memcpyParams.dstArray = NULL;
-	      memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
-	      memcpyParams.dstPtr   = make_tdpPitchedPtr(h->recv_d[ireq],
-						   sizeof(double)*rcount,
-						   rcount, 1);
-        memcpyParams.extent   = make_tdpExtent(sizeof(double)*rcount, 1, 1);
-        memcpyParams.kind     = tdpMemcpyHostToDevice;
-
-	      tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->grecv.graph, NULL,
-					 0, &memcpyParams) );
-      }
-    }
-
-    /* Always need the dis-aggregateion kernel */
-
-    dim3 nblk;
-    dim3 ntpb;
-    tdpGraphNode_t node;
-    tdpKernelNodeParams kernelNodeParams = {0};
-    void * kernelArgs[3] = {(void *) &lb->target,
-                            (void *) &h->target,
-                            (void *) &ireq};
-    //kernelNodeParams.func = (void *) lb_halo_dequeue_recv_kernel;
-    kernelNodeParams.func = (void *) lb_null_kernel;
-
-    kernel_launch_param(rcount, &nblk, &ntpb);
-
-    kernelNodeParams.gridDim        = nblk;
-    kernelNodeParams.blockDim       = ntpb;
-    kernelNodeParams.sharedMemBytes = 0;
-    kernelNodeParams.kernelParams   = (void **) kernelArgs;
-    kernelNodeParams.extra          = NULL;
-
-    if (have_gpu_aware_mpi_) {
-      tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, NULL,
-				       0, &kernelNodeParams) );
-    }
-    else {
-      int i = 1 + h->map.cv[h->map.nvel - ireq][X];
-      int j = 1 + h->map.cv[h->map.nvel - ireq][Y];
-      int k = 1 + h->map.cv[h->map.nvel - ireq][Z];
-      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
-	      tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, &memcpyNode,
-					 1, &kernelNodeParams) );
-      }
-      else {
-	      tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, NULL, 0,
-					 &kernelNodeParams) );
-      }
-    }
-  }
-
-  tdpAssert( tdpGraphInstantiate(&h->grecv.exec, h->grecv.graph, 0) );
-
-  return 0;
-}
\ No newline at end of file
diff --git a/src/lb_data.h b/src/lb_data.h
index 27ba74920..db60fe4d6 100644
--- a/src/lb_data.h
+++ b/src/lb_data.h
@@ -48,12 +48,6 @@ enum {NDIM = 3, NVEL = 27};
 typedef struct lb_collide_param_s lb_collide_param_t;
 typedef struct lb_halo_s lb_halo_t;
 typedef struct lb_data_s lb_t;
-typedef struct lb_graph_halo_s lb_graph_halo_t;
-
-struct lb_graph_halo_s {
-  tdpGraph_t graph;
-  tdpGraphExec_t exec;
-};
 
 struct lb_collide_param_s {
   int8_t isghost;                      /* switch for ghost modes */
@@ -98,9 +92,6 @@ struct lb_halo_s {
   lb_halo_t * target;
   double * send_d[27];            /* halo: device send buffer per direction */
   double * recv_d[27];            /* halo: device recv buffer per direction */
-  
-  lb_graph_halo_t gsend;          /* Graph API halo swap */
-  lb_graph_halo_t grecv;
 };
 
 int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme);
@@ -191,7 +182,4 @@ __host__ int lb_io_aggr_unpack(lb_t * lb, const io_aggregator_t * aggr);
 __host__ int lb_io_write(lb_t * lb, int timestep, io_event_t * event);
 __host__ int lb_io_read(lb_t * lb, int timestep, io_event_t * event);
 
-int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count);
-int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count);
-
 #endif

From f3eb55d50809ed0bf47b70ef5234655d40c3a0b3 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Thu, 19 Dec 2024 18:32:18 +0000
Subject: [PATCH 065/133] reintroduce copy of cs to device

---
 src/lb_data.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/lb_data.c b/src/lb_data.c
index a01bf1ac2..0336d40e9 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -374,6 +374,10 @@ static int lb_init(lb_t * lb) {
     tdpGetSymbolAddress((void **) &ptmp, tdpSymbol(static_param));
     tdpAssert( tdpMemcpy(&lb->target->param, &ptmp,
 			 sizeof(lb_collide_param_t *), tdpMemcpyHostToDevice));
+
+    cs_target(lb->cs, &cstarget);
+    tdpMemcpy(&lb->target->cs, &cstarget, sizeof(cs_t *),
+	      tdpMemcpyHostToDevice);
   }
 
   lb_mpi_init(lb);

From 2f7b7a18646a7623bac538ca2a14ee39acf679a0 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Tue, 24 Dec 2024 19:30:39 +0000
Subject: [PATCH 066/133] Code quality updates

---
 src/colloids.c         |  6 ++++--
 src/fe_electro.c       |  9 ++++++---
 src/field.c            |  4 +++-
 src/field_grad.c       | 10 +++++-----
 src/io_aggregator.c    |  4 ++--
 src/io_impl_mpio.c     |  2 +-
 src/io_metadata.c      |  2 +-
 src/lb_model.c         |  8 ++++----
 src/map.c              | 10 +++++-----
 src/noise.c            |  4 ++--
 src/phi_force_stress.c |  2 +-
 src/psi.c              | 36 ++++++++++++++++++------------------
 src/stencil_d3q19.c    |  9 +++++----
 src/stencil_d3q27.c    |  9 +++++----
 src/stencil_d3q7.c     |  9 +++++----
 src/wall.c             | 17 +++++++++--------
 16 files changed, 76 insertions(+), 65 deletions(-)

diff --git a/src/colloids.c b/src/colloids.c
index cc72f5011..008563a09 100644
--- a/src/colloids.c
+++ b/src/colloids.c
@@ -109,8 +109,8 @@ __host__ void colloids_info_free(colloids_info_t * info) {
   colloids_info_cell_list_clean(info);
 
   free(info->clist);
-  if (info->map_old) free(info->map_old);
-  if (info->map_new) free(info->map_new);
+  free(info->map_old);
+  free(info->map_new);
 
   if (info->target != info) tdpAssert(tdpFree(info->target));
 
@@ -175,6 +175,8 @@ __host__ int colloids_info_recreate(int newcell[3], colloids_info_t ** pinfo) {
  *
  *  colloids_memcpy
  *
+ *  FIXME: flag is unused
+ *
  *****************************************************************************/
 
 __host__ int colloids_memcpy(colloids_info_t * info, int flag) {
diff --git a/src/fe_electro.c b/src/fe_electro.c
index 58ce03b2f..ac50ff73b 100644
--- a/src/fe_electro.c
+++ b/src/fe_electro.c
@@ -66,8 +66,9 @@ static fe_vt_t fe_electro_hvt = {
   (fe_hvector_ft)   NULL,
   (fe_htensor_ft)   NULL,
   (fe_htensor_v_ft) NULL,
-  (fe_htensor_v_ft) NULL,
-  (fe_htensor_v_ft) NULL
+  (fe_stress_v_ft)  NULL,
+  (fe_stress_v_ft)  NULL,
+  (fe_stress_v_ft)  NULL
 };
 
 static  __constant__ fe_vt_t fe_electro_dvt = {
@@ -158,7 +159,7 @@ __host__ int fe_electro_free(fe_electro_t * fe) {
   tdpAssert( tdpGetDeviceCount(&ndevice) );
   if (ndevice > 0) tdpAssert(tdpFree(fe->target));
 
-  if (fe->mu_ref) free(fe->mu_ref);
+  free(fe->mu_ref);
   free(fe);
 
   return 0;
@@ -263,6 +264,8 @@ int fe_electro_mu(fe_electro_t * fe, int index, double * mu) {
  *  This is a dummy which just returns zero, as an implementation is
  *  required. Physically, there is no solvation chemical potential.
  *
+ *  FIXME
+ *
  ****************************************************************************/
 
 __host__
diff --git a/src/field.c b/src/field.c
index ad16de026..17cd9f285 100644
--- a/src/field.c
+++ b/src/field.c
@@ -168,7 +168,7 @@ __host__ int field_free(field_t * obj) {
     tdpAssert( tdpFree(obj->target) );
   }
 
-  if (obj->data) free(obj->data);
+  free(obj->data);
 
   field_halo_free(&obj->h);
 
@@ -1682,6 +1682,8 @@ int field_io_write(field_t * field, int timestep, io_event_t * event) {
  *
  *  field_io_read
  *
+ *  FIXME io_event is unused
+ *
  *****************************************************************************/
 
 int field_io_read(field_t * field, int timestep, io_event_t * event) {
diff --git a/src/field_grad.c b/src/field_grad.c
index 456f70a28..e5b660d74 100644
--- a/src/field_grad.c
+++ b/src/field_grad.c
@@ -298,11 +298,11 @@ __host__ void field_grad_free(field_grad_t * obj) {
     tdpAssert( tdpFree(obj->target) );
   }
 
-  if (obj->grad) free(obj->grad);
-  if (obj->delsq) free(obj->delsq);
-  if (obj->grad_delsq) free(obj->grad_delsq);
-  if (obj->delsq_delsq) free(obj->delsq_delsq);
-  if (obj->d_ab) free(obj->d_ab);
+  free(obj->grad);
+  free(obj->delsq);
+  free(obj->grad_delsq);
+  free(obj->delsq_delsq);
+  free(obj->d_ab);
 
   obj->field = NULL;
   free(obj);
diff --git a/src/io_aggregator.c b/src/io_aggregator.c
index ab3c1ea94..74464ec25 100644
--- a/src/io_aggregator.c
+++ b/src/io_aggregator.c
@@ -8,7 +8,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2022 The University of Edinburgh
+ *  (c) 2022-2024 The University of Edinburgh
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
@@ -43,7 +43,7 @@ int io_aggregator_create(io_element_t el, cs_limits_t lim,
   return 0;
 
  err:
-  if (newaggr) free(newaggr);
+  free(newaggr);
   return -1;
 }
 
diff --git a/src/io_impl_mpio.c b/src/io_impl_mpio.c
index 0303e9a51..62c21743a 100644
--- a/src/io_impl_mpio.c
+++ b/src/io_impl_mpio.c
@@ -57,7 +57,7 @@ int io_impl_mpio_create(const io_metadata_t * metadata,
   return 0;
 
  err:
-  if (mpio) free(mpio);
+  free(mpio);
 
   return -1;
 }
diff --git a/src/io_metadata.c b/src/io_metadata.c
index 3e186930a..4d390d68b 100644
--- a/src/io_metadata.c
+++ b/src/io_metadata.c
@@ -58,7 +58,7 @@ int io_metadata_create(cs_t * cs,
 
  err:
 
-  if (meta) free(meta);
+  free(meta);
   return -1;
 }
 
diff --git a/src/lb_model.c b/src/lb_model.c
index 7c1301e71..78db39af8 100644
--- a/src/lb_model.c
+++ b/src/lb_model.c
@@ -8,7 +8,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2021-2022 The University of Edinburgh
+ *  (c) 2021-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -108,9 +108,9 @@ int lb_model_free(lb_model_t * model) {
     free(model->ma);
   }
 
-  if (model->na) free(model->na);
-  if (model->cv) free(model->cv);
-  if (model->wv) free(model->wv);
+  free(model->na);
+  free(model->cv);
+  free(model->wv);
 
   *model = (lb_model_t) {0};
 
diff --git a/src/map.c b/src/map.c
index ff30a96ab..5488c33cb 100644
--- a/src/map.c
+++ b/src/map.c
@@ -58,7 +58,7 @@ int map_create(pe_t * pe, cs_t * cs, const map_options_t * options,
 
  err:
 
-  if (obj) free(obj);
+  free(obj);
   return -1;
 }
 
@@ -215,8 +215,8 @@ int map_initialise(pe_t * pe, cs_t * cs, const map_options_t * options,
   /* All failures are before any device memory is involved ... */
   if (map->input.cs) io_metadata_finalise(&map->input);
   if (map->output.cs) io_metadata_finalise(&map->output);
-  if (map->data) free(map->data);
-  if (map->status) free(map->status);
+  free(map->data);
+  free(map->status);
 
   *map = (map_t) {0};
 
@@ -257,8 +257,8 @@ int map_finalise(map_t * map) {
   io_metadata_finalise(&map->input);
   io_metadata_finalise(&map->output);
 
-  if (map->data) free(map->data);
-  if (map->status) free(map->status);
+  free(map->data);
+  free(map->status);
 
   *map = (map_t) {0};
 
diff --git a/src/noise.c b/src/noise.c
index 1bb58df1a..9ff4ff27b 100644
--- a/src/noise.c
+++ b/src/noise.c
@@ -67,7 +67,7 @@ int noise_create(pe_t * pe, cs_t * cs, const noise_options_t * options,
   return 0;
 
  err:
-  if (obj) free(obj);
+  free(obj);
 
   return -1;
 }
@@ -259,7 +259,7 @@ int noise_finalise(noise_t * ns) {
   if (ns->output.cs) io_metadata_finalise(&ns->output);
   if (ns->input.cs)  io_metadata_finalise(&ns->input);
 
-  if (ns->state) free(ns->state);
+  free(ns->state);
 
   *ns = (noise_t) {};
 
diff --git a/src/phi_force_stress.c b/src/phi_force_stress.c
index b67ce7902..ca6941f2c 100644
--- a/src/phi_force_stress.c
+++ b/src/phi_force_stress.c
@@ -112,7 +112,7 @@ __host__ int pth_free(pth_t * pth) {
     tdpAssert( tdpFree(pth->target) );
   }
 
-  if (pth->str) free(pth->str);
+  free(pth->str);
   free(pth);
 
   return 0;
diff --git a/src/psi.c b/src/psi.c
index 887a1a2bc..11e303db2 100644
--- a/src/psi.c
+++ b/src/psi.c
@@ -9,7 +9,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2012-2023 The University of Edinburgh
+ *  (c) 2012-2024 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -42,13 +42,13 @@ int psi_create(pe_t * pe, cs_t * cs, const psi_options_t * opts,
 
   ifail = psi_initialise(pe, cs, opts, psi);
   if (ifail != 0) goto err;
-  
+
   *pobj = psi;
 
   return 0;
 
  err:
-  if (psi) free(psi);
+  free(psi);
   return -1;
 }
 
@@ -169,7 +169,7 @@ int psi_halo_psi(psi_t * psi) {
 
   assert(psi);
 
-  /* Fudge device implmentation at the moment */
+  /* Fudge device implementation at the moment */
   field_memcpy(psi->psi, tdpMemcpyHostToDevice);
   field_halo(psi->psi);
   field_memcpy(psi->psi, tdpMemcpyDeviceToHost);
@@ -588,7 +588,7 @@ int psi_zero_mean(psi_t * psi) {
 
   cs_ltot(psi->cs, ltot);
   cs_nhalo(psi->cs, &nhalo);
-  cs_nlocal(psi->cs, nlocal);  
+  cs_nlocal(psi->cs, nlocal);
   cs_cart_comm(psi->cs, &comm);
 
   sum_local = 0.0;
@@ -681,7 +681,7 @@ int psi_halo_psijump(psi_t * psi) {
 	    /* Borrow fluid site ic = 1 */
 	    index1 = cs_index(psi->cs, 1, jc, kc);
 	    psidata[addr_rank0(psi->nsites, index)] =
-	      psidata[addr_rank0(psi->nsites, index1)];   
+	      psidata[addr_rank0(psi->nsites, index1)];
 	  }
 	}
       }
@@ -705,11 +705,11 @@ int psi_halo_psijump(psi_t * psi) {
 	    /* Borrow fluid site at end ... */
 	    index1 = cs_index(psi->cs, nlocal[X], jc, kc);
 	    psidata[addr_rank0(psi->nsites, index)] =
-	      psidata[addr_rank0(psi->nsites, index1)];   
+	      psidata[addr_rank0(psi->nsites, index1)];
 	  }
 	}
       }
-    }  
+    }
   }
 
   if (mpicoords[Y] == 0) {
@@ -728,11 +728,11 @@ int psi_halo_psijump(psi_t * psi) {
 	      /* Not periodic ... just borrow from fluid site jc = 1 */
 	      index1 = cs_index(psi->cs, ic, 1, kc);
 	      psidata[addr_rank0(psi->nsites, index)] =
-		psidata[addr_rank0(psi->nsites, index1)];   
+		psidata[addr_rank0(psi->nsites, index1)];
 	    }
 	}
       }
-    }  
+    }
 
   }
 
@@ -752,11 +752,11 @@ int psi_halo_psijump(psi_t * psi) {
 	    /* Borrow fluid site at end */
 	    index1 = cs_index(psi->cs, ic, nlocal[Y], kc);
 	    psidata[addr_rank0(psi->nsites, index)] =
-	      psidata[addr_rank0(psi->nsites, index1)];   
+	      psidata[addr_rank0(psi->nsites, index1)];
 	  }
 	}
       }
-    }  
+    }
 
   }
 
@@ -776,11 +776,11 @@ int psi_halo_psijump(psi_t * psi) {
 	    /* Borrow fluid site kc = 1 */
 	    index1 = cs_index(psi->cs, ic, jc, 1);
 	    psidata[addr_rank0(psi->nsites, index)] =
-	      psidata[addr_rank0(psi->nsites, index1)];   
+	      psidata[addr_rank0(psi->nsites, index1)];
 	  }
 	}
       }
-    }  
+    }
 
   }
 
@@ -800,11 +800,11 @@ int psi_halo_psijump(psi_t * psi) {
 	    /* Borrow fluid site at end ... */
 	    index1 = cs_index(psi->cs, ic, jc, nlocal[Z]);
 	    psidata[addr_rank0(psi->nsites, index)] =
-	      psidata[addr_rank0(psi->nsites, index1)];   
+	      psidata[addr_rank0(psi->nsites, index1)];
 	  }
 	}
       }
-    }  
+    }
 
   }
 
@@ -862,8 +862,8 @@ int psi_output_step(psi_t * psi, int its) {
  *  To ensure overall electroneutrality, we consider the following:
  *
  *   (1) assume surface charges have been assigned
- *   (2) assume the fluid is initialised with the backgound charge density
- *       of the electrolyte 
+ *   (2) assume the fluid is initialised with the background charge density
+ *       of the electrolyte
  *   (3) assume some number of colloids has been initialised, each
  *       with a given charge.
  *
diff --git a/src/stencil_d3q19.c b/src/stencil_d3q19.c
index 49bcb38db..adc0b50bf 100644
--- a/src/stencil_d3q19.c
+++ b/src/stencil_d3q19.c
@@ -6,7 +6,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2023 The University of Edinburgh
+ *  (c) 2023-2024 The University of Edinburgh
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
@@ -69,9 +69,10 @@ int stencil_d3q19_create(stencil_t ** stencil) {
 
  err:
 
-  if (s->wgradients) free(s->wgradients);
-  if (s->wlaplacian) free(s->wlaplacian);
-  if (s->cv) free(s->cv);
+  free(s->wgradients);
+  free(s->wlaplacian);
+  free(s->cv);
+  free(s);
 
   *stencil = NULL;
 
diff --git a/src/stencil_d3q27.c b/src/stencil_d3q27.c
index 6a0eb7556..3b7c4323f 100644
--- a/src/stencil_d3q27.c
+++ b/src/stencil_d3q27.c
@@ -8,7 +8,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2023 The University of Edinburgh
+ *  (c) 2023-2024 The University of Edinburgh
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
@@ -74,9 +74,10 @@ int stencil_d3q27_create(stencil_t ** stencil) {
 
  err:
 
-  if (s->wgradients) free(s->wgradients);
-  if (s->wlaplacian) free(s->wlaplacian);
-  if (s->cv) free(s->cv);
+  free(s->wgradients);
+  free(s->wlaplacian);
+  free(s->cv);
+  free(s);
 
   *stencil = NULL;
 
diff --git a/src/stencil_d3q7.c b/src/stencil_d3q7.c
index 9497fa1f1..744c51be6 100644
--- a/src/stencil_d3q7.c
+++ b/src/stencil_d3q7.c
@@ -8,7 +8,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2023 The University of Edinburgh
+ *  (c) 2023-2024 The University of Edinburgh
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
@@ -77,9 +77,10 @@ int stencil_d3q7_create(stencil_t ** stencil) {
 
  err:
 
-  if (s->wgradients) free(s->wgradients);
-  if (s->wlaplacian) free(s->wlaplacian);
-  if (s->cv) free(s->cv);
+  free(s->wgradients);
+  free(s->wlaplacian);
+  free(s->cv);
+  free(s);
 
   *stencil = NULL;
 
diff --git a/src/wall.c b/src/wall.c
index 70b945d5a..12ce8fe36 100644
--- a/src/wall.c
+++ b/src/wall.c
@@ -155,14 +155,15 @@ __host__ int wall_free(wall_t * wall) {
   free(wall->param);
 
   /* slip quantities */
-  if (wall->linkk) free(wall->linkk);
-  if (wall->linkq) free(wall->linkq);
-  if (wall->links) free(wall->links);
-
-  if (wall->linki) free(wall->linki);
-  if (wall->linkj) free(wall->linkj);
-  if (wall->linkp) free(wall->linkp);
-  if (wall->linku) free(wall->linku);
+
+  free(wall->linkk);
+  free(wall->linkq);
+  free(wall->links);
+
+  free(wall->linki);
+  free(wall->linkj);
+  free(wall->linkp);
+  free(wall->linku);
 
   free(wall);
 

From 882f9d32eaf4718f3005756d015612d395e8f186 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Wed, 25 Dec 2024 09:25:53 +0000
Subject: [PATCH 067/133] CodeQL v3

---
 .github/workflows/codeql-analysis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 74e1633f2..9d8deff29 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -40,7 +40,7 @@ jobs:
 
     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@v2
+      uses: github/codeql-action/init@v3
       with:
         languages: ${{ matrix.language }}
         queries: +security-and-quality

From 15333d70a05b2f62787a20f3298b69a2d4e49f89 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Wed, 25 Dec 2024 09:27:26 +0000
Subject: [PATCH 068/133] CodeQL v3

---
 .github/workflows/codeql-analysis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 9d8deff29..736dc1a31 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -53,4 +53,4 @@ jobs:
        make
 
     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@v2
+      uses: github/codeql-action/analyze@v3

From c059fde127cf893e83d144abded0157ca98ff8d2 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Wed, 25 Dec 2024 16:22:48 +0000
Subject: [PATCH 069/133] d2q9 / mpi test

---
 .github/workflows/regression.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml
index 065f0adfa..89565283c 100644
--- a/.github/workflows/regression.yml
+++ b/.github/workflows/regression.yml
@@ -94,7 +94,7 @@ jobs:
 
   d2q9-mpi:
     name: d2q9 / mpi
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
 
     steps:
     - name: Checkout repository

From fe81bec5cf03c8e3f9c836e2c09d7022517e8294 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Wed, 25 Dec 2024 16:36:31 +0000
Subject: [PATCH 070/133] d3q15 / mpi test

---
 .github/workflows/regression.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml
index 89565283c..4a1b34f54 100644
--- a/.github/workflows/regression.yml
+++ b/.github/workflows/regression.yml
@@ -95,6 +95,9 @@ jobs:
   d2q9-mpi:
     name: d2q9 / mpi
     runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        mpi: ['openmpi']
 
     steps:
     - name: Checkout repository

From 155156dd5d1e27d159199b837d3bd19a3c56c4c7 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Wed, 25 Dec 2024 16:44:32 +0000
Subject: [PATCH 071/133] d3q15 / mpi test

---
 .github/workflows/regression.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml
index 4a1b34f54..39428609c 100644
--- a/.github/workflows/regression.yml
+++ b/.github/workflows/regression.yml
@@ -115,6 +115,9 @@ jobs:
   d3q15-mpi:
     name: d3q15 / mpi
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        mpi: ['openmpi']
 
     steps:
     - name: Checkout repository

From 742cbd1d7d5aa671e7e36ada119dd5f3293ab1f9 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Wed, 25 Dec 2024 16:55:18 +0000
Subject: [PATCH 072/133] d3q15 / mpi test

---
 .github/workflows/regression.yml | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml
index 39428609c..ff8185758 100644
--- a/.github/workflows/regression.yml
+++ b/.github/workflows/regression.yml
@@ -94,7 +94,7 @@ jobs:
 
   d2q9-mpi:
     name: d2q9 / mpi
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
     strategy:
       matrix:
         mpi: ['openmpi']
@@ -102,6 +102,8 @@ jobs:
     steps:
     - name: Checkout repository
       uses: actions/checkout@v4
+      with:
+        mpi: ${{ matrix.mpi }}
     - name: Set up MPI
       uses: mpi4py/setup-mpi@v1
     - run: mpicc --version
@@ -114,10 +116,7 @@ jobs:
   
   d3q15-mpi:
     name: d3q15 / mpi
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        mpi: ['openmpi']
+    runs-on: ubuntu-22.04
 
     steps:
     - name: Checkout repository

From 131341c67c806da21bd6f9d8667b057fafde5bc2 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Wed, 25 Dec 2024 17:03:13 +0000
Subject: [PATCH 073/133] d3q15 / mpi test

---
 .github/workflows/regression.yml | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml
index ff8185758..9501ac7b7 100644
--- a/.github/workflows/regression.yml
+++ b/.github/workflows/regression.yml
@@ -95,17 +95,14 @@ jobs:
   d2q9-mpi:
     name: d2q9 / mpi
     runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        mpi: ['openmpi']
 
     steps:
     - name: Checkout repository
       uses: actions/checkout@v4
-      with:
-        mpi: ${{ matrix.mpi }}
     - name: Set up MPI
       uses: mpi4py/setup-mpi@v1
+      with:
+        mpi: openmpi
     - run: mpicc --version
     - run: cp config/github-mpicc.mk config.mk
     - run: make -j 2

From e831fb270a0d1ffec081b46cd6d22e3a8af2d7de Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Wed, 25 Dec 2024 17:44:25 +0000
Subject: [PATCH 074/133] move mpi tests to openmpi

---
 .github/workflows/regression.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/regression.yml b/.github/workflows/regression.yml
index 9501ac7b7..7633c0916 100644
--- a/.github/workflows/regression.yml
+++ b/.github/workflows/regression.yml
@@ -113,13 +113,15 @@ jobs:
   
   d3q15-mpi:
     name: d3q15 / mpi
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-latest
 
     steps:
     - name: Checkout repository
       uses: actions/checkout@v4
     - name: Set up MPI
       uses: mpi4py/setup-mpi@v1
+      with:
+        mpi: openmpi
     - run: mpicc --version
     - run: cp config/github-mpicc.mk config.mk
     - run: sed -i "s/D2Q9/D3Q15/" config.mk

From d57ae1aca3564fb85c8f1e2812cdeef2356e6fd6 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Wed, 25 Dec 2024 17:46:35 +0000
Subject: [PATCH 075/133] Add read report

---
 src/colloids.c |  17 ++++----
 src/field.c    |  13 +++++--
 src/io_event.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++-
 src/io_event.h |  10 ++++-
 src/lb_data.c  |   2 +-
 src/noise.c    |   2 +-
 6 files changed, 132 insertions(+), 16 deletions(-)

diff --git a/src/colloids.c b/src/colloids.c
index 008563a09..ae1b3abcf 100644
--- a/src/colloids.c
+++ b/src/colloids.c
@@ -175,8 +175,6 @@ __host__ int colloids_info_recreate(int newcell[3], colloids_info_t ** pinfo) {
  *
  *  colloids_memcpy
  *
- *  FIXME: flag is unused
- *
  *****************************************************************************/
 
 __host__ int colloids_memcpy(colloids_info_t * info, int flag) {
@@ -193,11 +191,16 @@ __host__ int colloids_memcpy(colloids_info_t * info, int flag) {
     assert((info->target == info));
   }
   else {
-    colloid_t * tmp;
-    tdpAssert(tdpMemcpy(&tmp, &info->target->map_new, sizeof(colloid_t **),
-			tdpMemcpyDeviceToHost));
-    tdpAssert(tdpMemcpy(tmp, info->map_new, info->nsites*sizeof(colloid_t *),
-			tdpMemcpyHostToDevice));
+    if (flag == tdpMemcpyHostToDevice) {
+      colloid_t * tmp;
+      tdpAssert(tdpMemcpy(&tmp, &info->target->map_new, sizeof(colloid_t **),
+			  tdpMemcpyDeviceToHost));
+      tdpAssert(tdpMemcpy(tmp, info->map_new, info->nsites*sizeof(colloid_t *),
+			  tdpMemcpyHostToDevice));
+    }
+    else {
+      pe_exit(info->pe, "Bad flag in colloids_memcpy()\n");
+    }
   }
 
   return 0;
diff --git a/src/field.c b/src/field.c
index 17cd9f285..4764d2b0c 100644
--- a/src/field.c
+++ b/src/field.c
@@ -1672,7 +1672,7 @@ int field_io_write(field_t * field, int timestep, io_event_t * event) {
     }
 
     io->impl->free(&io);
-    io_event_report(event, meta, field->name);
+    io_event_report_write(event, meta, field->name);
   }
 
   return ifail;
@@ -1682,8 +1682,6 @@ int field_io_write(field_t * field, int timestep, io_event_t * event) {
  *
  *  field_io_read
  *
- *  FIXME io_event is unused
- *
  *****************************************************************************/
 
 int field_io_read(field_t * field, int timestep, io_event_t * event) {
@@ -1699,9 +1697,16 @@ int field_io_read(field_t * field, int timestep, io_event_t * event) {
   assert(ifail == 0);
 
   if (ifail == 0) {
+    io_event_record(event, IO_EVENT_READ);
     io->impl->read(io, filename);
+    io_event_record(event, IO_EVENT_DISAGGR);
     field_io_aggr_unpack(field, io->aggr);
     io->impl->free(&io);
+
+    if (meta->options.report) {
+      pe_info(field->pe, "MPIIO read from %s\n", filename);
+      io_event_report_read(event, meta, field->name);
+    }
   }
 
   return ifail;
@@ -1709,7 +1714,7 @@ int field_io_read(field_t * field, int timestep, io_event_t * event) {
 
 /*****************************************************************************
  *
- * field_graph_halo_send_create
+ *  field_graph_halo_send_create
  *
  *****************************************************************************/
 
diff --git a/src/io_event.c b/src/io_event.c
index 77d156eaa..f6b9130d4 100644
--- a/src/io_event.c
+++ b/src/io_event.c
@@ -47,12 +47,45 @@ int io_event_record(io_event_t * event, io_event_record_t iorec) {
  *****************************************************************************/
 
 int io_event_report(io_event_t * event, const io_metadata_t * metadata,
-		    const char * name) {
+		    const char * name, io_event_record_t iorec) {
 
   assert(event);
   assert(metadata);
   assert(name);
 
+  switch (iorec) {
+  case IO_EVENT_READ:
+    io_event_report_read(event, metadata, name);
+    break;
+  case IO_EVENT_WRITE:
+    io_event_report_write(event, metadata, name);
+    break;
+  default:
+    /* Internal error. */
+    pe_exit(metadata->cs->pe, "Bad io event in report\n");
+  }
+
+  return 0;
+}
+
+/*****************************************************************************
+ *
+ *  io_event_report_read
+ *
+ *  Report for an i/o event in parallel.
+ *  The aggregation report might want the local data size (currently total).
+ *  The file write is the total data size of the file.
+ *
+ *  Some refinement might be wanted for multiple files.
+ *
+ *****************************************************************************/
+
+int io_event_report_read(io_event_t * event, const io_metadata_t * metadata,
+			 const char * name) {
+
+  assert(event);
+  assert(metadata);
+  assert(name);
 
   /* End of event (for reporting purposes) */
   event->time[IO_EVENT_REPORT] = MPI_Wtime();
@@ -64,10 +97,79 @@ int io_event_report(io_event_t * event, const io_metadata_t * metadata,
     const char * units = NULL;
     double dunit6 = 1.0e+06; /* Units of data size are MB */
     double dunit9 = 1.0e+09; /* Units of data size are GB */
+
     /* Times (we assume these have been collected correctly! */
+    /* Read, then disaggregate, then report */
+
+    double tr = event->time[IO_EVENT_DISAGGR] - event->time[IO_EVENT_READ];
+    double ta = event->time[IO_EVENT_REPORT] - event->time[IO_EVENT_DISAGGR];
+
+    /* Record size and total file size. */
+
+    double dr = metadata->element.datasize*metadata->element.count;
+    double ds = metadata->subfile.sizes[X]*metadata->subfile.sizes[Y]*
+                metadata->subfile.sizes[Z];
+    double db = dr*ds;
+
+    if (db > dunit9) {
+      /* Use GB */
+      units = "GB";
+      db = db/dunit9;
+    }
+    else {
+      /* Use MB */
+      units = "MB";
+      db = db/dunit6;
+    }
+    pe_info(pe, "- %10s read          %7.3f %2s in %7.3f seconds\n",
+	    name, db, units, tr);
+    pe_info(pe, "- %10s disaggregated %7.3f %2s in %7.3f seconds\n",
+	    name, db, units, ta);
+    pe_info(pe, "- %10s rate          %7.3f GB per second\n",
+	    name, dr*ds/dunit9/tr);
+  }
+
+  return 0;
+}
+
+/*****************************************************************************
+ *
+ *  io_event_report_write
+ *
+ *  The aggregation report might want the local data size (currently total).
+ *  The file write is the total data size of the file.
+ *
+ *  Some refinement might be wanted for multiple files.
+ *
+ *****************************************************************************/
+
+int io_event_report_write(io_event_t * event, const io_metadata_t * metadata,
+			  const char * name) {
+
+  assert(event);
+  assert(metadata);
+  assert(name);
+
+
+  /* End of event (for reporting purposes) */
+  event->time[IO_EVENT_REPORT] = MPI_Wtime();
+
+  if (metadata->options.report) {
+
+    pe_t * pe = metadata->cs->pe;
+
+    const char * units = NULL;
+    double dunit6 = 1.0e+06; /* Units of data size are MB */
+    double dunit9 = 1.0e+09; /* Units of data size are GB */
+
+    /* Times (we assume these have been collected correctly! */
+    /* Write: aggr is first, write is second, report last */
+
     double ta = event->time[IO_EVENT_WRITE]  - event->time[IO_EVENT_AGGR];
     double tw = event->time[IO_EVENT_REPORT] - event->time[IO_EVENT_WRITE];
+
     /* Record size and total file size. */
+
     double dr = metadata->element.datasize*metadata->element.count;
     double ds = metadata->subfile.sizes[X]*metadata->subfile.sizes[Y]*
                 metadata->subfile.sizes[Z];
diff --git a/src/io_event.h b/src/io_event.h
index b4abe9497..289bdc7fc 100644
--- a/src/io_event.h
+++ b/src/io_event.h
@@ -9,7 +9,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2022 The University of Edinburgh
+ *  (c) 2022-2024 The University of Edinburgh
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
@@ -22,6 +22,8 @@
 
 enum io_event_record_enum {
   IO_EVENT_AGGR = 0,
+  IO_EVENT_DISAGGR,
+  IO_EVENT_READ,
   IO_EVENT_WRITE,
   IO_EVENT_REPORT,
   IO_EVENT_MAX
@@ -38,6 +40,10 @@ struct io_event_s {
 
 int io_event_record(io_event_t * event, io_event_record_t iorec);
 int io_event_report(io_event_t * event, const io_metadata_t * metadata,
-		    const char * name);
+		    const char * name, io_event_record_t iorec);
+int io_event_report_read(io_event_t * event, const io_metadata_t * metadata,
+			 const char * name);
+int io_event_report_write(io_event_t * event, const io_metadata_t * metadata,
+			  const char * name);
 
 #endif
diff --git a/src/lb_data.c b/src/lb_data.c
index c7ba18f8a..612aafadc 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1417,7 +1417,7 @@ int lb_io_write(lb_t * lb, int timestep, io_event_t * event) {
       }
 
       io->impl->free(&io);
-      io_event_report(event, meta, "dist");
+      io_event_report_write(event, meta, "dist");
     }
   }
 
diff --git a/src/noise.c b/src/noise.c
index 9ff4ff27b..5a7eff6e2 100644
--- a/src/noise.c
+++ b/src/noise.c
@@ -648,7 +648,7 @@ int noise_io_write(noise_t * ns, int timestep, io_event_t * event) {
       if (meta->options.report) {
 	pe_info(ns->pe, "Wrote noise state to file: %s\n", filename);
       }
-      io_event_report(event, meta, ns->options.filestub);
+      io_event_report_write(event, meta, ns->options.filestub);
     }
   }
 

From 60b0b4e635306c5fa14b5ddb2b72f8c0c8b20d79 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Fri, 27 Dec 2024 17:11:48 +0000
Subject: [PATCH 076/133] Switch off target

---
 codecov.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/codecov.yml b/codecov.yml
index 4a9a90fe8..bfdc9877d 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -2,8 +2,7 @@ coverage:
   status:
     project:
       default:
-        target: 33%
-        threshold: 2%
+        informational: true
     patch:
       default:
         informational: true

From f2a4b356eb7d758a795745197d177d320554d670 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Fri, 27 Dec 2024 17:12:07 +0000
Subject: [PATCH 077/133] Remove comment

---
 src/fe_electro.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/fe_electro.c b/src/fe_electro.c
index ac50ff73b..13b204938 100644
--- a/src/fe_electro.c
+++ b/src/fe_electro.c
@@ -264,8 +264,6 @@ int fe_electro_mu(fe_electro_t * fe, int index, double * mu) {
  *  This is a dummy which just returns zero, as an implementation is
  *  required. Physically, there is no solvation chemical potential.
  *
- *  FIXME
- *
  ****************************************************************************/
 
 __host__

From da8f8b372d22437ed8d9b725d80e226ea3946580 Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@cirrus.epcc.ed.ac.uk>
Date: Fri, 27 Dec 2024 19:22:39 +0000
Subject: [PATCH 078/133] Remove constraint on ndim for binary collision

---
 src/collision.c | 75 ++++++++++++++++++++++++-------------------------
 1 file changed, 37 insertions(+), 38 deletions(-)

diff --git a/src/collision.c b/src/collision.c
index 9ff0c1161..54acad806 100644
--- a/src/collision.c
+++ b/src/collision.c
@@ -607,7 +607,6 @@ __host__ int lb_collision_binary(lb_t * lb, hydro_t * hydro, noise_t * noise,
 
   int nlocal[3] = {0};
 
-  assert (NDIM == 3); /* NDIM = 2 warrants additional tests here. */
   assert(lb);
   assert(hydro);
   assert(fe);
@@ -709,28 +708,28 @@ __device__ void lb_collision_mrt2_site(lb_t * lb, hydro_t * hydro,
 				      fe_symm_t * fe, noise_t * noise,
 				      const int index0) {
   int ia, ib, m, p;
-  double f[NVEL*NSIMDVL];
-  double mode[NVEL*NSIMDVL];    /* Modes; hydrodynamic + ghost */
-  double rho[NSIMDVL];
-  double rrho[NSIMDVL];         /* Density, reciprocal density */
-
-  double u[3][NSIMDVL];         /* Velocity */
-  double s[3][3][NSIMDVL];      /* Stress */
-  double seq[3][3][NSIMDVL];    /* equilibrium stress */
-  double shat[3][3][NSIMDVL];   /* random stress */
-  double ghat[NVEL][NSIMDVL];   /* noise for ghosts */
-
-  double force[3][NSIMDVL];     /* External force */
-
-  double tr_s[NSIMDVL];         /* Trace of stress */
-  double tr_seq[NSIMDVL];       /* Equilibrium value thereof */
-  double phi[NSIMDVL];          /* phi */
-  double jphi[3][NSIMDVL];      /* phi flux */
-  double jdotc[NSIMDVL];        /* Contraction jphi_a cv_ia */
-  double sphidotq[NSIMDVL];     /* phi second moment */
-  double sth[3][3][NSIMDVL];    /* stress */
-  double sphi[3][3][NSIMDVL];   /* stress */
-  double mu[NSIMDVL];           /* Chemical potential */
+  double f[NVEL*NSIMDVL] = {0};
+  double mode[NVEL*NSIMDVL] = {0};    /* Modes; hydrodynamic + ghost */
+  double rho[NSIMDVL] = {0};
+  double rrho[NSIMDVL] = {0};         /* Density, reciprocal density */
+
+  double u[3][NSIMDVL] = {0};         /* Velocity */
+  double s[3][3][NSIMDVL] = {0};      /* Stress */
+  double seq[3][3][NSIMDVL] = {0};    /* equilibrium stress */
+  double shat[3][3][NSIMDVL] = {0};   /* random stress */
+  double ghat[NVEL][NSIMDVL] = {0};   /* noise for ghosts */
+
+  double force[3][NSIMDVL] = {0};     /* External force */
+
+  double tr_s[NSIMDVL] = {0};         /* Trace of stress */
+  double tr_seq[NSIMDVL] = {0};       /* Equilibrium value thereof */
+  double phi[NSIMDVL] = {0};          /* phi */
+  double jphi[3][NSIMDVL] = {0};      /* phi flux */
+  double jdotc[NSIMDVL] = {0};        /* Contraction jphi_a cv_ia */
+  double sphidotq[NSIMDVL] = {0};     /* phi second moment */
+  double sth[3][3][NSIMDVL] = {0};    /* stress */
+  double sphi[3][3][NSIMDVL] = {0};   /* stress */
+  double mu[NSIMDVL] = {0};           /* Chemical potential */
 
   const double r3 = 1.0/3.0;
   KRONECKER_DELTA_CHAR(d);
@@ -779,7 +778,7 @@ __device__ void lb_collision_mrt2_site(lb_t * lb, hydro_t * hydro,
   /* For convenience, write out the physical modes. */
 
   for_simd_v(iv, NSIMDVL) rho[iv] = mode[0*NSIMDVL+iv];
-  for (ia = 0; ia < 3; ia++) {
+  for (ia = 0; ia < NDIM; ia++) {
     for_simd_v(iv, NSIMDVL) u[ia][iv] = mode[(1 + ia)*NSIMDVL+iv];
   }
 
@@ -801,7 +800,7 @@ __device__ void lb_collision_mrt2_site(lb_t * lb, hydro_t * hydro,
 
   for_simd_v(iv, NSIMDVL) rrho[iv] = 1.0/rho[iv];
 
-  for (ia = 0; ia < 3; ia++) {
+  for (ia = 0; ia < NDIM; ia++) {
     for_simd_v(iv, NSIMDVL) {
       int haddr = addr_rank1(hydro->nsite, 3, index0 + iv, ia);
       force[ia][iv] = _cp.force_global[ia]  + hydro->force->data[haddr];
@@ -809,7 +808,7 @@ __device__ void lb_collision_mrt2_site(lb_t * lb, hydro_t * hydro,
     }
   }
 
-  for (ia = 0; ia < 3; ia++) {
+  for (ia = 0; ia < NDIM; ia++) {
     for_simd_v(iv, NSIMDVL) {
       int haddr = addr_rank1(hydro->nsite, 3, index0 + iv, ia);
       hydro->u->data[haddr] = u[ia][iv];
@@ -827,9 +826,9 @@ __device__ void lb_collision_mrt2_site(lb_t * lb, hydro_t * hydro,
     tr_seq[iv] = 0.0;
   }
 
-  for (ia = 0; ia < 3; ia++) {
+  for (ia = 0; ia < NDIM; ia++) {
     /* Set equilibrium stress, which includes thermodynamic part */
-    for (ib = 0; ib < 3; ib++) {
+    for (ib = 0; ib < NDIM; ib++) {
       for_simd_v(iv, NSIMDVL) {
 	seq[ia][ib][iv] = rho[iv]*u[ia][iv]*u[ib][iv] + sth[ia][ib][iv];
       }
@@ -842,7 +841,7 @@ __device__ void lb_collision_mrt2_site(lb_t * lb, hydro_t * hydro,
   }
 
   /* Form traceless parts */
-  for (ia = 0; ia < 3; ia++) {
+  for (ia = 0; ia < NDIM; ia++) {
     for_simd_v(iv, NSIMDVL) {
       s[ia][ia][iv]   -= r3*tr_s[iv];
       seq[ia][ia][iv] -= r3*tr_seq[iv];
@@ -854,8 +853,8 @@ __device__ void lb_collision_mrt2_site(lb_t * lb, hydro_t * hydro,
   for_simd_v(iv, NSIMDVL)
     tr_s[iv] = tr_s[iv] - _lbp.rtau[LB_TAU_BULK]*(tr_s[iv] - tr_seq[iv]);
 
-  for (ia = 0; ia < 3; ia++) {
-    for (ib = 0; ib < 3; ib++) {
+  for (ia = 0; ia < NDIM; ia++) {
+    for (ib = 0; ib < NDIM; ib++) {
 
       for_simd_v(iv, NSIMDVL) {
 	s[ia][ib][iv] -= _lbp.rtau[LB_TAU_SHEAR]*(s[ia][ib][iv] - seq[ia][ib][iv]);
@@ -880,8 +879,8 @@ __device__ void lb_collision_mrt2_site(lb_t * lb, hydro_t * hydro,
 
       lb_collision_fluctuations(lb, noise, index0 + iv, _cp.kt, shat1, ghat1);
 
-      for (ia = 0; ia < 3; ia++) {
-	for (ib = 0; ib < 3; ib++) {
+      for (ia = 0; ia < NDIM; ia++) {
+	for (ib = 0; ib < NDIM; ib++) {
 	  shat[ia][ib][iv] = shat1[ia][ib];
 	}
       }
@@ -953,7 +952,7 @@ __device__ void lb_collision_mrt2_site(lb_t * lb, hydro_t * hydro,
   }
 
   for (p = 1; p < NVEL; p++) {
-    for (ia = 0; ia < 3; ia++) {
+    for (ia = 0; ia < NDIM; ia++) {
       for_simd_v(iv, NSIMDVL) {
 	jphi[ia][iv] += _lbp.cv[p][ia]*
 	lb->f[ LB_ADDR(_lbp.nsite, _lbp.ndist, NVEL, index0+iv, LB_PHI, p) ];
@@ -963,8 +962,8 @@ __device__ void lb_collision_mrt2_site(lb_t * lb, hydro_t * hydro,
 
   /* Relax order parameter modes. See the comments above. */
 
-  for (ia = 0; ia < 3; ia++) {
-    for (ib = 0; ib < 3; ib++) {
+  for (ia = 0; ia < NDIM; ia++) {
+    for (ib = 0; ib < NDIM; ib++) {
       for_simd_v(iv, NSIMDVL) {
 	sphi[ia][ib][iv] = phi[iv]*u[ia][iv]*u[ib][iv] + mu[iv]*d[ia][ib];
         /* The alternate form would be:
@@ -993,9 +992,9 @@ __device__ void lb_collision_mrt2_site(lb_t * lb, hydro_t * hydro,
       sphidotq[iv] = 0.0;
     }
 
-    for (ia = 0; ia < 3; ia++) {
+    for (ia = 0; ia < NDIM; ia++) {
       for_simd_v(iv, NSIMDVL) jdotc[iv] += jphi[ia][iv]*_lbp.cv[p][ia];
-      for (ib = 0; ib < 3; ib++) {
+      for (ib = 0; ib < NDIM; ib++) {
 	for_simd_v(iv, NSIMDVL) {
 	  sphidotq[iv] += sphi[ia][ib][iv]*(_lbp.cv[p][ia]*_lbp.cv[p][ib] - cs2*d[ia][ib]);
 	}

From 59a6fbde1afbccefebeb91376ddb6f037a4bf5be Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@cirrus.epcc.ed.ac.uk>
Date: Fri, 27 Dec 2024 19:23:16 +0000
Subject: [PATCH 079/133] Updated LE to allow GPU

---
 src/lb_data.c  |  53 ++--
 src/ludwig.c   |   2 +-
 src/model_le.c | 686 ++++++++++++++++++++++++++++++-------------------
 src/model_le.h |   4 +-
 4 files changed, 457 insertions(+), 288 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index f22b3e4da..b5cfa23ff 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -200,19 +200,20 @@ int lb_data_create(pe_t * pe, cs_t * cs, const lb_data_options_t * options,
 	if (obj->model.cv[p][X] == +1) nprop += 1;
       }
 
-      int ncrossdist = ndist*nprop*nlocal[Y]*nlocal[Z];
-      int ndata      = 2*nplane*ncrossdist; /* 2 sides for each plane */
-      obj->sbuff = (double *) malloc(ndata*sizeof(double));
-      obj->rbuff = (double *) malloc(ndata*sizeof(double));
+      /* Lees Edwards buffer for crossing distributions */
+      int nxdist = ndist*nprop*(nlocal[Y] + 1)*nlocal[Z];
+      int nxbuff = 2*nplane*nxdist; /* 2 sides for each plane */
+      obj->sbuff = (double *) malloc(nxbuff*sizeof(double));
+      obj->rbuff = (double *) malloc(nxbuff*sizeof(double));
 
       tdpGetDeviceCount(&ndevice);
 
       if (ndevice > 0) {
 	double * tmp = NULL;
-	tdpAssert( tdpMalloc((void **) &tmp, ndata*sizeof(double)) );
+	tdpAssert( tdpMalloc((void **) &tmp, nxbuff*sizeof(double)) );
 	tdpAssert( tdpMemcpy(&obj->target->sbuff, &tmp, sizeof(double *),
 			     tdpMemcpyHostToDevice) );
-	tdpAssert( tdpMalloc((void **) &tmp, ndata*sizeof(double)) );
+	tdpAssert( tdpMalloc((void **) &tmp, nxbuff*sizeof(double)) );
 	tdpAssert( tdpMemcpy(&obj->target->rbuff, &tmp, sizeof(double *),
 			     tdpMemcpyHostToDevice) );
       }
@@ -251,6 +252,13 @@ __host__ int lb_free(lb_t * lb) {
     tdpAssert( tdpMemcpy(&tmp, &lb->target->fprime, sizeof(double *),
 			 tdpMemcpyDeviceToHost) );
     tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &lb->target->sbuff, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+    tdpAssert( tdpMemcpy(&tmp, &lb->target->rbuff, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpFree(tmp) );
+
     tdpAssert( tdpFree(lb->target) );
   }
 
@@ -261,7 +269,6 @@ __host__ int lb_free(lb_t * lb) {
   free(lb->f);
   free(lb->fprime);
 
-  /* FIXME: device buffer to be removed */
   free(lb->rbuff);
   free(lb->sbuff);
 
@@ -291,6 +298,8 @@ int lb_data_initialise_device_model(lb_t * lb) {
   if (ndevice > 0) {
 
     int nvel = lb->model.nvel;
+    lb_model_t * hm = &lb->model;
+    lb_model_t * dm = &lb->target->model;
     int8_t (*d_cv)[3] = {0};
     double * d_wv = NULL;
     double * d_na = NULL;
@@ -299,26 +308,26 @@ int lb_data_initialise_device_model(lb_t * lb) {
     tdpAssert( tdpMalloc((void **) &d_wv, nvel*sizeof(double)) );
     tdpAssert( tdpMalloc((void **) &d_na, nvel*sizeof(double)) );
 
-    tdpAssert( tdpMemcpy(d_cv, &(lb->model.cv), nvel*sizeof(int8_t[3]),
+    tdpAssert( tdpMemcpy(d_cv, hm->cv, nvel*sizeof(int8_t[3]),
 			 tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMemcpy(d_wv, &(lb->model.wv), nvel*sizeof(double),
+    tdpAssert( tdpMemcpy(d_wv, hm->wv, nvel*sizeof(double),
 			 tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMemcpy(d_na, &(lb->model.na), nvel*sizeof(double),
+    tdpAssert( tdpMemcpy(d_na, hm->na, nvel*sizeof(double),
 			 tdpMemcpyHostToDevice) );
 
-    tdpAssert( tdpMemcpy(&(lb->target->model.cv), &d_cv, sizeof(int8_t(*)[3]),
+    tdpAssert( tdpMemcpy(&(dm->cv), &d_cv, sizeof(int8_t(*)[3]),
 			 tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMemcpy(&(lb->target->model.wv), &d_wv, sizeof(double *),
+    tdpAssert( tdpMemcpy(&(dm->wv), &d_wv, sizeof(double *),
 			 tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMemcpy(&(lb->target->model.na), &d_na, sizeof(double *),
+    tdpAssert( tdpMemcpy(&(dm->na), &d_na, sizeof(double *),
 			 tdpMemcpyHostToDevice) );
 
-    tdpAssert( tdpMemcpy(&(lb->target->model.ndim), &(lb->model.ndim),
-			 sizeof(int8_t), tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMemcpy(&(lb->target->model.nvel), &(lb->model.nvel),
-			 sizeof(int8_t), tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMemcpy(&(lb->target->model.cs2), &(lb->model.cs2),
-			 sizeof(double), tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&(dm->ndim), &(hm->ndim), sizeof(int8_t),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&(dm->nvel), &(hm->nvel), sizeof(int8_t),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&(dm->cs2), &(hm->cs2), sizeof(double),
+			 tdpMemcpyHostToDevice) );
 
     /* We do not copy the eigenvectors currently. */
   }
@@ -351,9 +360,9 @@ int lb_data_free_device_model(lb_t * lb) {
     tdpAssert( tdpMemcpy(&d_na, &lb->target->model.na, sizeof(double *),
 			 tdpMemcpyDeviceToHost) );
 
-    tdpAssert( tdpFree(&d_cv) );
-    tdpAssert( tdpFree(&d_wv) );
-    tdpAssert( tdpFree(&d_na) );
+    tdpAssert( tdpFree(d_cv) );
+    tdpAssert( tdpFree(d_wv) );
+    tdpAssert( tdpFree(d_na) );
   }
 
   return 0;
diff --git a/src/ludwig.c b/src/ludwig.c
index cff116770..2e2b15617 100644
--- a/src/ludwig.c
+++ b/src/ludwig.c
@@ -806,7 +806,7 @@ void ludwig_run(const char * inputfile) {
       /* Boundary conditions */
 
       if (ludwig->le) {
-	lb_le_apply_boundary_conditions(ludwig->lb, ludwig->le);
+	lb_data_apply_le_boundary_conditions(ludwig->lb, ludwig->le);
       }
 
       TIMER_start(TIMER_HALO_LATTICE);
diff --git a/src/model_le.c b/src/model_le.c
index 2b4fe62c1..f943b59e4 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -36,6 +36,17 @@ static int le_reproject(lb_t * lb, lees_edw_t * le);
 static int le_displace_and_interpolate(lb_t * lb, lees_edw_t * le);
 static int le_displace_and_interpolate_parallel(lb_t * lb, lees_edw_t * le);
 
+#ifdef HAVE_OPENMPI_
+/* This provides MPIX_CUDA_AWARE_SUPPORT .. */
+#include "mpi-ext.h"
+#endif
+
+#if defined (MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
+static const int have_gpu_aware_mpi_ = 1;
+#else
+static const int have_gpu_aware_mpi_ = 0;
+#endif
+
 /*****************************************************************************
  *
  *  lb_le_apply_boundary_conditions
@@ -55,7 +66,7 @@ static int le_displace_and_interpolate_parallel(lb_t * lb, lees_edw_t * le);
  *
  *****************************************************************************/
 
-__host__ int lb_le_apply_boundary_conditions_old(lb_t * lb, lees_edw_t * le) {
+__host__ int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le) {
 
   int mpi_cartsz[3];
 
@@ -715,243 +726,219 @@ int lb_le_init_shear_profile(lb_t * lb, lees_edw_t * le) {
   return 0;
 }
 
-
-
-
-
-/* FIXME ADDITIONS */
+/*****************************************************************************
+ *
+ *  The preceding functions are scheduled for removal.
+ *  The following are operational...
+ *
+ *****************************************************************************/
 
 /* Kernel helper structure intended to be passed by value to kernel */
 
 typedef struct lek_s {
   int nlocal[3];        /* 12 */
   int nplane;           /* 16 */
-  int nprop;            /* 20 */
-  int ndist;            /* 24 */
-  int nxdist;           /* 28 */
+  int ndist;            /* 20 */
+  int nxdist;           /* 24 */
+  int nxbuff;           /* 28 */
+  int nprop;            /* 1 <= nprop <= 9  maximum */
+  int8_t prop[2][9];    /* prop[0] is side 0 (cx +ve); */
+                        /* prop[1] is side 1 (cx -ve); */
+                        /* p values of cross-plane propagating distributions */
 } le_kernel_helper_t;
 
+static le_kernel_helper_t le_kernel_helper(lb_t * lb, lees_edw_t * le);
+
+
+__global__ void lb_data_reproject_kernel(kernel_3d_t k3d,
+					 le_kernel_helper_t lekh, lb_t * lb,
+					 lees_edw_t * le, double t);
+__global__ void lb_data_displace_kernel(kernel_3d_t k3d,
+					le_kernel_helper_t lekh,
+					lb_t * lb,
+					lees_edw_t * le, double t);
+__global__ void lb_data_interpolate_kernel(kernel_3d_t k3d,
+					   le_kernel_helper_t lekh,
+					   lb_t * lb,
+					   lees_edw_t * le, double t);
+
+static int lb_data_displace_communicate(le_kernel_helper_t lekh,
+					lb_t * lb,
+					lees_edw_t * le,
+					double t);
+
 /*****************************************************************************
  *
- *  my_ibuf(iside, nlocal[Y], nlocal[Z], nplane, ndist, nprop)
+ *  lb_data_apply_le_boundary_conditions
+ *
+ *  Driver for the parallel update.
  *
  *****************************************************************************/
 
-__host__ __device__ static int my_ibuf(const le_kernel_helper_t * s,
-				       int jc,  int kc, int iplane, int iside,
-				       int n, int p) {
-  int ib = 0;
+int lb_data_apply_le_boundary_conditions(lb_t * lb, lees_edw_t * le) {
 
-  assert(s);
-  assert(1 <= jc && jc <= s->nlocal[Y]); /* FIXME recv has one more? */
-  assert(1 <= kc && kc <= s->nlocal[Z]);
-  assert(0 <= iplane && iplane < s->nplane);
-  assert(0 <= iside  && iside <= 1);
-  assert(0 <= n && n < s->ndist);
-  assert(0 <= p && p < s->nprop);
-
-  ib = p + s->nprop*(n + s->ndist*(iplane + s->nplane*
-				   (kc - 1 + s->nlocal[Z]*(jc - 1))));
+  assert(lb);
+  assert(le);
 
-  //printf("p iplane, ib nxdisp %d %d (%d %d) %d %d\n", p, iplane, jc, kc, ib, s->nxdist);
-  assert(0 <= ib && ib < s->nxdist); /* Number of crossing distributions */
+  int mpi_cartsz[3] = {0};
+  le_kernel_helper_t lekh = le_kernel_helper(lb, le);
 
-  /* All same sides are together */
-  ib = iside*s->nxdist + ib;
+  lees_edw_cartsz(le, mpi_cartsz);
 
-  return ib;
-}
+  if (lekh.nplane == 0) {
+    /* No planes, no action. */
+  }
+  else {
+    int ndevice = 0;
+    lees_edw_t * le_target;
+    double t = -1.0;
 
+    TIMER_start(TIMER_LE);
 
-__global__ void lb_data_reproject_kernel(kernel_3d_t k3d,
-					 le_kernel_helper_t lekh, lb_t * lb,
-					 lees_edw_t * le, double t);
-__global__ void lb_data_displace_and_interpolate(kernel_3d_t k3d,
-						 le_kernel_helper_t lekh,
-						 lb_t * lb,
-						 lees_edw_t * le, double t);
-__global__ void copy_back(kernel_3d_t k3d, lb_t * lb, lees_edw_t * le);
+    tdpAssert( tdpGetDeviceCount(&ndevice) );
+    lees_edw_target(le, &le_target);
 
-static le_kernel_helper_t le_kernel_helper(lb_t * lb, lees_edw_t * le);
-static int le_displace_and_interpolate_parallel(lb_t *lb, lees_edw_t *le);
+    /* Require the time t = time step */
+    {
+      physics_t * phys = NULL;
+      physics_ref(&phys);
+      t = 1.0*physics_control_timestep(phys);
+    }
 
+    /* First, the reprojected distributions are computed and stored to
+     * the "send" buffer for each side/plane. */
 
-/*****************************************************************************
- *
- *  lb_data_displace_and_interpolate
- *
- *  Parallel kernel for interpolation stage.
- *
- *****************************************************************************/
+    {
+      int  nx   = 2*lekh.nplane; /* Two x positions (sides) for each plane */
+      dim3 nblk = {0};
+      dim3 ntpb = {0};
+      cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y], 1, lekh.nlocal[Z]};
+      kernel_3d_t k3d = kernel_3d(lb->cs, lim);
 
-__global__ void lb_data_displace_and_interpolate(kernel_3d_t k3d,
-						 le_kernel_helper_t lekh,
-						 lb_t * lb,
-						 lees_edw_t * le, double t) {
-  int kindex = 0;
-  int ndist = 0;
-  int nlocal[3] = {0};
-  int nhalo = 0;
-  double ltot[3] = {0};
+      kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb);
 
-  lb_ndist(lb, &ndist);
-  lees_edw_nlocal(le, nlocal);
-  lees_edw_nhalo(le, &nhalo);
-  lees_edw_ltot(le, ltot);
+      tdpLaunchKernel(lb_data_reproject_kernel, nblk, ntpb, 0, 0,
+		      k3d, lekh, lb->target, le_target, t);
+      tdpAssert( tdpPeekAtLastError() );
+      tdpAssert( tdpStreamSynchronize(0) );
+    }
 
-  for_simt_parallel(kindex, k3d.kiterations, 1) {
 
-    int iplane = kernel_3d_ic(&k3d, kindex); /* encodes plane */
-    int jc     = kernel_3d_jc(&k3d, kindex);
-    int kc     = kernel_3d_kc(&k3d, kindex);
+    /* Second, displacement. */
+    if (have_gpu_aware_mpi_ || mpi_cartsz[Y] > 1) {
+      lb_data_displace_communicate(lekh, lb, le, t);
+    }
+    else {
+      /* The kernel form is only really required for the stub MPI case
+       * in serial; it could be removed if point-to-point messages were
+       * handled by the stub version. */
 
-    if (jc <= nlocal[Y] && kc <= nlocal[Z] && iplane < lekh.nplane) {
+      int  nx   = 2*lekh.nplane;
+      dim3 nblk = {0};
+      dim3 ntpb = {0};
+      cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y] + 1, 1, lekh.nlocal[Z]};
+      kernel_3d_t k3d = kernel_3d(lb->cs, lim);
 
-      int ic = lees_edw_plane_location(le, iplane);
-      double dy = 0.0;
-      int jdy = 0;
-      double fr = 0.0;
-      int iside = 0;
+      kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb);
 
-      lees_edw_buffer_displacement(le, nhalo, t, &dy);
-      dy = fmod(dy, ltot[Y]);
-      jdy = floor(dy);
-      fr = dy - jdy;
+      tdpLaunchKernel(lb_data_displace_kernel, nblk, ntpb, 0, 0,
+		      k3d, lekh, lb->target, le_target, t);
 
-      int j1 = 1 + (jc + jdy - 1 + 2*nlocal[Y]) % nlocal[Y];
-      int j2 = 1 + (j1 % nlocal[Y]);
+      tdpAssert( tdpPeekAtLastError() );
+      tdpAssert( tdpStreamSynchronize(0) );
+    }
 
-      int index0 = lees_edw_index(le, ic, jc, kc);
-      //int index1 = lees_edw_index(le, ic, j2, kc);
-
-      for (int n = 0; n < ndist; n++) {
-	int ip = 0; /* FIXME  avoid if by lookup table [iside][9] */
-	for (int p = 1; p < lb->model.nvel; p++) {
-	  if (lb->model.cv[p][X] == +1) {
-	    //int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-	    //int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p);
-	    /* FIXME .... */
-	    //int ibuf = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + ip + 2 * plane *nxdist;
-	    //lb->sbuff[ibuf] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
-	    int ibuf0 = my_ibuf(&lekh, j1, kc, iplane, iside, n, ip);
-	    int ibuf1 = my_ibuf(&lekh, j2, kc, iplane, iside, n, ip);
-	    double f = (1.0 - fr)*lb->rbuff[ibuf0] + fr*lb->rbuff[ibuf1];
-	    lb_f_set(lb, index0, p, n, f);
-	    ip++;
-	  }
-	}
-      }
+    /* Lastly, the recv buffer is interpolated to reset the plane-crossing
+     * distributions */
+    {
+      int  nx   = 2*lekh.nplane;
+      dim3 nblk = {0};
+      dim3 ntpb = {0};
+      cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y], 1, lekh.nlocal[Z]};
+      kernel_3d_t k3d = kernel_3d(lb->cs, lim);
 
-      /* OTHER DIRECTION */
-      iside = 1;
-      ic = lees_edw_plane_location(le, iplane) + 1;
-      lees_edw_buffer_displacement(le, nhalo, t, &dy);
-      dy = fmod(-dy, ltot[Y]);
-      jdy = floor(dy);
-      fr = dy - jdy;
+      kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb);
 
-      j1 = 1 + (jc + jdy - 1 + 2*nlocal[Y]) % nlocal[Y];
-      j2 = 1 + (j1 % nlocal[Y]);
+      tdpLaunchKernel(lb_data_interpolate_kernel, nblk, ntpb, 0, 0,
+		      k3d, lekh, lb->target, le_target, t);
 
-      index0 = lees_edw_index(le, ic, jc, kc);
-      //index1 = lees_edw_index(le, ic, j2, kc);
-
-      for (int n = 0; n < ndist; n++) {
-	int ip = 0; assert(ndist == 1); /* FIXME */
-	for (int p = 1; p < lb->model.nvel; p++) {
-	  if (lb->model.cv[p][X] == -1) {
-	    //int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-	    //int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p);
-	    //int ibuf = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + ip + (2 * plane + 1) *nxdist;
-	    //lb->sbuff[ibuf] = (1.0 - fr) * lb->f[l0] + fr * lb->f[l1];
-	    int ibuf0 = my_ibuf(&lekh, j1, kc, iplane, iside, n, ip);
-	    int ibuf1 = my_ibuf(&lekh, j2, kc, iplane, iside, n, ip);
-	    double f = (1.0 - fr)*lb->rbuff[ibuf0] + fr*lb->rbuff[ibuf1];
-	    lb_f_set(lb, index0, p, n, f);
-	    ip++;
-	  }
-	}
-      }
-      /* next plane */
+      tdpAssert( tdpPeekAtLastError() );
+      tdpAssert( tdpStreamSynchronize(0) );
     }
+
+    TIMER_stop(TIMER_LE);
   }
 
-  return;
+  return 0;
 }
 
 /*****************************************************************************
  *
- *  We must not overwrite any of the existing distributions until
- *  all the interpolations have been computed.
+ *  le_kernel_helper
  *
  *****************************************************************************/
 
-__global__ void copy_back(kernel_3d_t k3d, lb_t * lb, lees_edw_t * le) {
+static le_kernel_helper_t le_kernel_helper(lb_t * lb, lees_edw_t * le) {
 
-  int kindex = 0;
+  le_kernel_helper_t lekh = {0};
 
-  int ndist = 0;
-  int nlocal[3] = {0};
-  int nhalo = 0;
-  int nplane = 0;
-  int nprop = 0;
-  int nxdist = 0;
+  assert(le);
 
-  lb_ndist(lb, &ndist);
-  lees_edw_nlocal(le, nlocal);
-  lees_edw_nhalo(le, &nhalo);
-  nplane = lees_edw_nplane_local(le);
+  lees_edw_nlocal(le, lekh.nlocal);
+  lekh.nplane = lees_edw_nplane_local(le);
 
-  nprop = 0;
-  for (int p = 1; p < lb->model.nvel; p++) {
-    if (lb->model.cv[p][X] == +1) nprop += 1;
+  {
+    int ip = 0;
+    for (int p = 1; p < lb->model.nvel; p++) {
+      if (lb->model.cv[p][X] == +1) lekh.prop[0][ip++] = p;  /* +ve cx */
+    }
+    ip = 0;
+    for (int p = 1; p < lb->model.nvel; p++) {
+      if (lb->model.cv[p][X] == -1) lekh.prop[1][ip++] = p;  /* -ve cx */
+    }
+    lekh.nprop = ip;
+    assert(lekh.nprop <= 9);
   }
-  nxdist = ndist*nprop*nlocal[Y]*nlocal[Z];
 
+  lekh.ndist  = lb->ndist;
+  lekh.nxdist = lekh.ndist*lekh.nprop*(lekh.nlocal[Y] + 1)*lekh.nlocal[Z];
+  lekh.nxbuff = 2*lekh.nplane*lekh.nxdist;
 
-  for_simt_parallel(kindex, k3d.kiterations, 1) {
+  return lekh;
+}
 
-    int plane = kernel_3d_ic(&k3d, kindex); /* Encode plane */
-    int jc    = kernel_3d_jc(&k3d, kindex);
-    int kc    = kernel_3d_kc(&k3d, kindex);
+/*****************************************************************************
+ *
+ *  le_ibuf
+ *
+ *  Defines the storage order for quantities in the cross-plane buffers.
+ *  Note that all values on the same side (iside = 0 or iside = 1) are
+ *  stored contiguously for the purposes of communication.
+ *
+ *****************************************************************************/
 
-    if (jc <= nlocal[Y] && kc <= nlocal[Z] && plane < nplane) {
-      int ic = lees_edw_plane_location(le, plane);
-      int index0 = lees_edw_index(le, ic, jc, kc);
+__host__ __device__ static inline int le_ibuf(const le_kernel_helper_t * s,
+					      int jc,  int kc, int iplane,
+					      int iside, int n, int p) {
+  int ib = 0;
 
-      for (int n = 0; n < ndist; n++) {
-	int ip = 0;
-	for (int p = 1; p < lb->model.nvel; p++) {
-	  if (lb->model.cv[p][X] == +1) {
-	    /* ibuf = ibuf(jc, kc, n, p, ...) */
-	    int ibuf = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + ip + 2 * plane * nxdist;
-	    int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-	    lb->f[la] = lb->sbuff[ibuf];
-	    ip++;
-	  }
-	}
-      }
+  assert(s);
+  assert(1 <= jc && jc <= (s->nlocal[Y] + 1));
+  assert(1 <= kc && kc <= s->nlocal[Z]);
+  assert(0 <= iplane && iplane < s->nplane);
+  assert(0 <= iside  && iside <= 1);
+  assert(0 <= n && n < s->ndist);
+  assert(0 <= p && p < s->nprop);
 
-      /* Other direction */
+  ib = p + s->nprop*(n + s->ndist*(iplane + s->nplane*
+				   (kc - 1 + s->nlocal[Z]*(jc - 1))));
 
-      ic = lees_edw_plane_location(le, plane) + 1;
-      index0 = lees_edw_index(le, ic, jc, kc);
+  /* All same sides are together for all planes */
+  ib = iside*s->nxdist*s->nplane + ib;
 
-      for (int n = 0; n < ndist; n++) {
-	int ip = 0;
-	for (int p = 1; p < lb->model.nvel; p++) {
-	  if (lb->model.cv[p][X] == -1) {
-	    int ibuf = ((jc-1)*nlocal[Z] + (kc-1))*ndist*nprop + n*nprop + ip + (2*plane + 1) * nxdist;
-	    int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-	    lb->f[la] = lb->sbuff[ibuf];
-	    ip++;
-	  }
-	}
-      }
-    }
-  }
+  assert(0 <= ib && ib < s->nxbuff);
 
-  return;
+  return ib;
 }
 
 /*****************************************************************************
@@ -1004,14 +991,13 @@ __global__ void lb_data_reproject_kernel(kernel_3d_t k3d,
 
       for (int n = 0; n < lekh.ndist; n++) {
 
-	int ip = 0;
 	double rho = 0.0;
 	double g[3] = {0};
 	double ds[3][3] = {0};
 
 	/* Compute 0th and 1st moments */
 	lb_dist_enum_t ndn = (lb_dist_enum_t) n;
-	/* EXPAND ? */
+	/* Could expand these ... */
 	lb_0th_moment(lb, index, ndn, &rho);
 	lb_1st_moment(lb, index, ndn, g);
 
@@ -1021,8 +1007,10 @@ __global__ void lb_data_reproject_kernel(kernel_3d_t k3d,
 	  }
 	}
 
-	/* Now update the distribution */
-	for (int p = 1; p < lb->model.nvel; p++) {
+	/* Now for relevant distributions ... */
+	for (int ip = 0; ip < lekh.nprop; ip++) {
+
+	  int p = lekh.prop[iside][ip];
 
 	  double cs2 = lb->model.cs2;
 	  double rcs2 = 1.0/cs2;
@@ -1030,25 +1018,23 @@ __global__ void lb_data_reproject_kernel(kernel_3d_t k3d,
 	  double udotc = du[Y]*lb->model.cv[p][Y];
 	  double sdotq = 0.0;
 
-	  if (lb->model.cv[p][X] != cx) continue;
+	  assert(lb->model.cv[p][X] == cx);
 
 	  for (int ia = 0; ia < 3; ia++) {
 	    for (int ib = 0; ib < 3; ib++) {
-	      double dab = cs2 * (ia == ib);
-	      double q = (lb->model.cv[p][ia] * lb->model.cv[p][ib] - dab);
-	      sdotq += ds[ia][ib] * q;
+	      double dab = cs2*(ia == ib);
+	      double q = (lb->model.cv[p][ia]*lb->model.cv[p][ib] - dab);
+	      sdotq += ds[ia][ib]*q;
 	    }
 	  }
 
 	  /* Project all this back to the distribution. */
 	  {
-	    int ibuf = my_ibuf(&lekh, jc, kc, iplane, iside, n, ip);
+	    int ibuf = le_ibuf(&lekh, jc, kc, iplane, iside, n, ip);
 	    double f = 0.0;
 	    lb_f(lb, index, p, n, &f);
-	    f += lb->model.wv[p] * (rho*udotc*rcs2 + 0.5*sdotq*rcs2*rcs2);
-	    /* REPLACE lb_f_set(lb, index, p, n, f); BY */
+	    f += lb->model.wv[p]*(rho*udotc*rcs2 + 0.5*sdotq*rcs2*rcs2);
 	    lb->sbuff[ibuf] = f;
-	    ++ip;
 	  }
 	}
       }
@@ -1060,88 +1046,223 @@ __global__ void lb_data_reproject_kernel(kernel_3d_t k3d,
 
 /*****************************************************************************
  *
- *  lb_le_apply_boundary_conditions
+ *  lb_data_displace_kernel
  *
- *  Driver for the parallel update.
+ *  Displace/copy send buffer (1 <= jc <= nlocal[Y]) to the recv buffer
+ *  (1 <= jc <= nlocal[Y] + 1).
+ *
+ *  Version where there is no communication via MPI (i.e., serial).
  *
  *****************************************************************************/
 
-__host__ int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le) {
+__global__ void lb_data_displace_kernel(kernel_3d_t k3d,
+					le_kernel_helper_t lekh,
+					lb_t * lb,
+					lees_edw_t * le, double t) {
+  int kindex = 0;
+  int nhalo = 0;
+  double ltot[3] = {0};
+
+  lees_edw_nhalo(le, &nhalo);
+  lees_edw_ltot(le, ltot);
+
+  for_simt_parallel(kindex, k3d.kiterations, 1) {
+
+    int ix = kernel_3d_ic(&k3d, kindex); /* encodes plane, side */
+    int jc = kernel_3d_jc(&k3d, kindex);
+    int kc = kernel_3d_kc(&k3d, kindex);
+
+    if (jc <= lekh.nlocal[Y]+1 && kc <= lekh.nlocal[Z] && ix < 2*lekh.nplane) {
+
+      int iplane = ix / 2; assert(0 <= iplane && iplane < lekh.nplane);
+      int iside  = ix % 2; assert(iside == 0 || iside == 1);
+      int cx     = 1 - 2*iside; /* below going up, or above going down */
+
+      /* buffer location; js is the displaced position needed ... */
+      int ic = iside + lees_edw_plane_location(le, iplane);
+      int js = -1;
+      int dj = -1;
+      double dy = 0.0;
+
+      lees_edw_buffer_displacement(le, nhalo, t, &dy);
+      dj = floor(fmod(dy*cx, ltot[Y]));
+
+      js = 1 + (jc + dj - 1 + 2*lekh.nlocal[Y]) % lekh.nlocal[Y];
+      assert(1 <= js && js <= lekh.nlocal[Y]);
+
+      for (int n = 0; n < lekh.ndist; n++) {
+	for (int ip = 0; ip < lekh.nprop; ip++) {
+	  int isend = le_ibuf(&lekh, js, kc, iplane, iside, n, ip);
+	  int irecv = le_ibuf(&lekh, jc, kc, iplane, iside, n, ip);
+	  lb->rbuff[irecv] = lb->sbuff[isend];
+	}
+      }
+    }
+  }
+
+  return;
+}
+
+/*****************************************************************************
+ *
+ *  lb_data_displace_communicate
+ *
+ *  General displacement procedure when MPI is required. The displacement
+ *  is always an integer number of lattice sites in the y-direction.
+ *
+ *****************************************************************************/
+
+static int lb_data_displace_communicate(le_kernel_helper_t lekh,
+					lb_t * lb,
+					lees_edw_t * le,
+					double t) {
+  const int tag1 = 3102;
+  const int tag2 = 3103;
+  const int tag3 = 3104;
+  const int tag4 = 3105;
+
+  double * sbuff = NULL;  /* Send buffer (previously reprojected values) */
+  double * rbuff = NULL;  /* Recv buffer (to be interpolated) */
+
+  int nhalo = 0;
+  int ntotal[3]  = {0};
+  int offset[3]  = {0};
+  int nrank_s[2] = {0};
+  int nrank_r[2] = {0};
+  MPI_Comm comm = MPI_COMM_NULL;
+  MPI_Request req[8] = {0};
 
   assert(lb);
   assert(le);
 
-  int mpi_cartsz[3] = {0};
-  le_kernel_helper_t lekh = le_kernel_helper(lb, le);
+  int nrowdata = lekh.nlocal[Z]*lekh.nplane*lekh.ndist*lekh.nprop;
 
-  lees_edw_cartsz(le, mpi_cartsz);
+  /* If there is GPU-aware MPI just communicate the GPU buffers; if
+   * not, copy in relevant direction at the start and finish */
 
-  if (lekh.nplane > 0) {
-    int ndevice = 0;
-    lees_edw_t * le_target;
-    double t = -1.0;
+  if (have_gpu_aware_mpi_) {
+    tdpAssert( tdpMemcpy(&sbuff, &lb->target->sbuff, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpy(&rbuff, &lb->target->rbuff, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+  }
+  else {
+    int nbuffsz = lekh.nxbuff*sizeof(double);
+    double * target = NULL;
+    sbuff = lb->sbuff;
+    rbuff = lb->rbuff;
+    tdpAssert( tdpMemcpy(&target, &lb->target->sbuff, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpy(sbuff, target, nbuffsz, tdpMemcpyDeviceToHost) );
+  }
 
-    TIMER_start(TIMER_LE);
+  lees_edw_comm(le, &comm);
+  lees_edw_ntotal(le, ntotal);
+  lees_edw_nlocal_offset(le, offset);
+  lees_edw_nhalo(le, &nhalo);
 
-    tdpAssert( tdpGetDeviceCount(&ndevice) );
-    lees_edw_target(le, &le_target);
+  /* For each plane, there are 4 sends and 4 recvs; two each for each
+   * side of the plane. The construction  of the buffer is such that
+   * we can treat all the planes for a given side in one go. */
 
-    /* Require the time t = time step */
-    {
-      physics_t * phys = NULL;
-      physics_ref(&phys);
-      t = 1.0*physics_control_timestep(phys);
-    }
+  /* A total of 8 requests; 2 sends and 2 recvs for side = 0, and
+   * 2 sends and 2 recvs for side = 1, which communicate in different
+   * directions... */
 
-    /* First, set up the interpolation buffer with the relevant elements of
-     * the distribution, i.e., side below the nplane propagating up, and
-     * side above plane propagating down. ("Up" and "down" in x-direction) */
+  /* Each process sends a total of (nlocal[Y] + 1) y values all taken
+   * from 1 <= jc <= nlocal[Y]; the values at j = j0 = jrow2 are sent to
+   * both destinations. */
 
-    {
-      int  nx   = 2*lekh.nplane; /* Two x positions (sides) for each plane */
-      dim3 nblk = {0};
-      dim3 ntpb = {0};
-      cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y], 1, lekh.nlocal[Z]};
-      kernel_3d_t k3d = kernel_3d(lb->cs, lim);
+  {
+    int iside = 0;
+    int jdy = -1;
+    double dy = 0.0;
+    lees_edw_buffer_displacement(le, nhalo, t, &dy);
+    dy  = fmod(dy, 1.0*ntotal[Y]);
+    jdy = floor(dy);
 
-      kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb);
+    /* Starting y coordinate is j0: 1 <= j0 <= ntotal[Y] */
 
-      tdpLaunchKernel(lb_data_reproject_kernel, nblk, ntpb, 0, 0,
-		      k3d, lekh, lb->target, le_target, t);
-      tdpAssert( tdpPeekAtLastError() );
-      tdpAssert( tdpStreamSynchronize(0) );
-    }
+    int j0 = 1 + (offset[Y] + jdy + 2*ntotal[Y]) % ntotal[Y];
+    lees_edw_jstart_to_mpi_ranks(le, j0, nrank_s, nrank_r);
 
-    /* Swap the send and recv buffer */
+    j0        = 1 + (j0 - 1) % lekh.nlocal[Y]; /* 1 <= j0 <= nlocal[Y] */
+    int jrow1 = lekh.nlocal[Y] + 1 - j0;
+    int jrow2 = j0;
 
-    tdpAssert( tdpMemcpy(lb->target->rbuff, lb->target->sbuff,
-			 2*lekh.nxdist*sizeof(double),
-			 tdpMemcpyDeviceToDevice) );
+    int n1 = jrow1*nrowdata;
+    int n2 = jrow2*nrowdata;
 
-    /* Displacement and interpolation to replace distributions */
-    if (mpi_cartsz[Y] > 1) {
-      assert(0); /* PENDING comms for send to recv buffer */
-    }
-    else {
-      int  nx   = lekh.nplane;
-      dim3 nblk = {0};
-      dim3 ntpb = {0};
-      cs_limits_t lim = {0, nx - 1, 1, lekh.nlocal[Y], 1, lekh.nlocal[Z]};
-      kernel_3d_t k3d = kernel_3d(lb->cs, lim);
+    /* Post the receives (are disjoint) */
+    /* 1 -> jrow1 incl., and 1 + jrow1 -> nlocal[Y] + 1 incl. */
 
-      kernel_3d_launch_param(k3d.kiterations, &nblk, &ntpb);
+    int ibuf1 = le_ibuf(&lekh, 1,         1, 0, iside, 0, 0);
+    int ibuf2 = le_ibuf(&lekh, 1 + jrow1, 1, 0, iside, 0, 0);
 
-      /* Two kernels to provide synchronisation: interpolate to temporary
-       * and then can copy back values to the distribution itself. */
+    MPI_Irecv(rbuff + ibuf1, n1, MPI_DOUBLE, nrank_r[0], tag1, comm, req + 0);
+    MPI_Irecv(rbuff + ibuf2, n2, MPI_DOUBLE, nrank_r[1], tag2, comm, req + 1);
 
-      tdpLaunchKernel(lb_data_displace_and_interpolate, nblk, ntpb, 0, 0,
-		      k3d, lekh, lb->target, le_target, t);
+    /* Post sends (overlap at jrow2) */
+    /* jrow2 -> nlocal[Y] incl., and 1 -> jrow2 incl. */
 
-      tdpAssert( tdpPeekAtLastError() );
-      tdpAssert( tdpStreamSynchronize(0) );
-    }
+    ibuf1 = le_ibuf(&lekh, jrow2, 1, 0, iside, 0, 0);
+    ibuf2 = le_ibuf(&lekh,     1, 1, 0, iside, 0, 0);
 
-    TIMER_stop(TIMER_LE);
+    MPI_Isend(sbuff + ibuf1, n1, MPI_DOUBLE, nrank_s[0], tag1, comm, req + 2);
+    MPI_Isend(sbuff + ibuf2, n2, MPI_DOUBLE, nrank_s[1], tag2, comm, req + 3);
+  }
+
+  /* Other side */
+  {
+    int iside = 1;
+    int jdy = -1;
+    double dy = 0.0;
+
+    lees_edw_buffer_displacement(le, nhalo, t, &dy);
+    dy  = fmod(-dy, 1.0*ntotal[Y]); /* note sign */
+    jdy = floor(dy);
+
+    /* Starting y coordinate (global address): range 1 <= j0 <= ntotal[Y] */
+
+    int j0 = 1 + (offset[Y] + jdy + 2*ntotal[Y]) % ntotal[Y];
+    lees_edw_jstart_to_mpi_ranks(le, j0, nrank_s, nrank_r);
+
+    j0        = 1 + (j0 - 1) % lekh.nlocal[Y];
+    int jrow1 = lekh.nlocal[Y] + 1 - j0;
+    int jrow2 = j0;
+
+    int n1 = jrow1*nrowdata;
+    int n2 = jrow2*nrowdata;
+
+    /* Post the receives */
+
+    int ibuf1 = le_ibuf(&lekh, 1,         1, 0, iside, 0, 0);
+    int ibuf2 = le_ibuf(&lekh, 1 + jrow1, 1, 0, iside, 0, 0);
+
+    MPI_Irecv(rbuff + ibuf1, n1, MPI_DOUBLE, nrank_r[0], tag3, comm, req + 4);
+    MPI_Irecv(rbuff + ibuf2, n2, MPI_DOUBLE, nrank_r[1], tag4, comm, req + 5);
+
+    /* Post sends */
+
+    ibuf1 = le_ibuf(&lekh, jrow2, 1, 0, iside, 0, 0);
+    ibuf2 = le_ibuf(&lekh,     1, 1, 0, iside, 0, 0);
+
+    MPI_Isend(sbuff + ibuf1, n1, MPI_DOUBLE, nrank_s[0], tag3, comm, req + 6);
+    MPI_Isend(sbuff + ibuf2, n2, MPI_DOUBLE, nrank_s[1], tag4, comm, req + 7);
+  }
+
+  /* Complete */
+  MPI_Waitall(8, req, MPI_STATUSES_IGNORE);
+
+  if (have_gpu_aware_mpi_) {
+    /* No further action */
+  }
+  else {
+    int nbuffsz = lekh.nxbuff*sizeof(double);
+    double * target = NULL;
+    tdpAssert( tdpMemcpy(&target, &lb->target->rbuff, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpy(target, rbuff, nbuffsz, tdpMemcpyHostToDevice) );
   }
 
   return 0;
@@ -1149,27 +1270,66 @@ __host__ int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le) {
 
 /*****************************************************************************
  *
- *  le_kernel_helper
+ *  lb_data_interpolate_kernel
+ *
+ *  Interpolate the final recv buffer to reset plane-crossing distributions.
+ *  The linear interpolation is always between jc and jc+1 for
+ *  1 <= jc <= nlocal[Y], ie., an appropriate displacement
+ *  of the send buffer should have occurred. A consistent
+ *  fractional part of a lattice spacing is used.
+ *
+ *  Always linear interpolation to conserve mass.
  *
  *****************************************************************************/
 
-static le_kernel_helper_t le_kernel_helper(lb_t * lb, lees_edw_t * le) {
+__global__ void lb_data_interpolate_kernel(kernel_3d_t k3d,
+					   le_kernel_helper_t lekh,
+					   lb_t * lb,
+					   lees_edw_t * le, double t) {
+  int kindex = 0;
+  int nhalo = 0;
+  double ltot[3] = {0};
 
-  le_kernel_helper_t lekh = {0};
+  lees_edw_nhalo(le, &nhalo);
+  lees_edw_ltot(le, ltot);
 
-  assert(le);
+  for_simt_parallel(kindex, k3d.kiterations, 1) {
 
-  lees_edw_nlocal(le, lekh.nlocal);
-  lekh.nplane = lees_edw_nplane_local(le);
+    int ix = kernel_3d_ic(&k3d, kindex); /* encodes plane, side */
+    int jc = kernel_3d_jc(&k3d, kindex);
+    int kc = kernel_3d_kc(&k3d, kindex);
 
-  for (int p = 1; p < lb->model.nvel; p++) {
-    if (lb->model.cv[p][X] == +1) lekh.nprop += 1;
-  }
+    if (jc <= lekh.nlocal[Y] && kc <= lekh.nlocal[Z] && ix < 2*lekh.nplane) {
 
-  /* FIXME: nxdist to include the number of planes here but ncrossdist
-   * does not include the number of planes */
-  lekh.ndist  = lb->ndist;
-  lekh.nxdist = lekh.ndist*lekh.nprop*lekh.nplane*lekh.nlocal[Y]*lekh.nlocal[Z];
+      int iplane = ix / 2; assert(0 <= iplane && iplane < lekh.nplane);
+      int iside  = ix % 2; assert(iside == 0 || iside == 1);
+      int cx     = 1 - 2*iside; /* below going up, or above going down */
 
-  return lekh;
+      /* buffer location, fractional part of the displacement, ... */
+      int ic = iside + lees_edw_plane_location(le, iplane);
+      int jdy = 0;
+
+      double dy = 0.0;
+      double fr = 0.0;
+
+      lees_edw_buffer_displacement(le, nhalo, t, &dy);
+      dy = fmod(dy*cx, ltot[Y]);
+      jdy = floor(dy);
+      fr = dy - jdy;
+
+      int index0 = lees_edw_index(le, ic, jc, kc);
+
+      for (int n = 0; n < lekh.ndist; n++) {
+	for (int ip = 0; ip < lekh.nprop; ip++) {
+	  int p = lekh.prop[iside][ip];
+	  int ibuf0 = le_ibuf(&lekh, jc,     kc, iplane, iside, n, ip);
+	  int ibuf1 = le_ibuf(&lekh, jc + 1, kc, iplane, iside, n, ip);
+	  double f = (1.0 - fr)*lb->rbuff[ibuf0] + fr*lb->rbuff[ibuf1];
+	  lb_f_set(lb, index0, p, n, f);
+	}
+      }
+    }
+  }
+
+  return;
 }
diff --git a/src/model_le.h b/src/model_le.h
index 546013975..01f285ad4 100644
--- a/src/model_le.h
+++ b/src/model_le.h
@@ -5,7 +5,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2009-2022 The University of Edinburgh
+ *  (c) 2009-2024 The University of Edinburgh
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
  *****************************************************************************/
@@ -16,7 +16,7 @@
 #include "lb_data.h"
 #include "leesedwards.h"
 
-int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le);
+int lb_data_apply_le_boundary_conditions(lb_t * lb, lees_edw_t * le);
 int lb_le_init_shear_profile(lb_t * lb, lees_edw_t * le);
 
 #endif

From f5cbaff911182737363a5cf8a9265ee1c929755c Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Fri, 27 Dec 2024 19:37:51 +0000
Subject: [PATCH 080/133] Minor corrections

---
 src/model_le.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index f943b59e4..55bf7c6ff 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -6,7 +6,7 @@
  *
  *  Note that the distributions have displacement u*t
  *  not u*(t-1) returned by le_get_displacement().
- *  This is for reasons of backwards compatability.
+ *  This is for reasons of backwards compatibility.
  *
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
@@ -61,7 +61,7 @@ static const int have_gpu_aware_mpi_ = 0;
  *     to take account of the sliding displacement as a function of time.
  *
  *  Note we never deal with the halo regions here, as we assume the
- *  upcoming propagation will be immediately preceeded by a distribution
+ *  upcoming propagation will be immediately preceded by a distribution
  *  halo update.
  *
  *****************************************************************************/
@@ -114,7 +114,7 @@ __host__ int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le) {
  *     S_ab -> S_ab +/- rho u_a u^le_b +/- rho u_b u^le_a + rho u^le_a u^le_b
  *
  *  with analogous expressions for order parameter moments.
- * 
+ *
  *  The change to the distribution is then computed by a reprojection.
  *  Ghost modes are unchanged.
  *
@@ -403,8 +403,8 @@ int le_displace_and_interpolate(lb_t * lb, lees_edw_t * le) {
  *  appropriate direction.
  *
  *  Likewise, we need to send a total of (nlocal[Y] + 1) points to the
- *  two corresponding recieving processes. Note we never involve the
- *  halo regions here (so a preceeding halo exchange is not required).
+ *  two corresponding receiving processes. Note we never involve the
+ *  halo regions here (so a preceding halo exchange is not required).
  *
  *****************************************************************************/
 
@@ -687,7 +687,7 @@ int lb_le_init_shear_profile(lb_t * lb, lees_edw_t * le) {
 
   lees_edw_shear_rate(le, &gradu[X][Y]);
 
-  /* Loop trough the sites */
+  /* Loop through the sites */
 
   for (ic = 1; ic <= nlocal[X]; ic++) {
 
@@ -1079,7 +1079,6 @@ __global__ void lb_data_displace_kernel(kernel_3d_t k3d,
       int cx     = 1 - 2*iside; /* below going up, or above going down */
 
       /* buffer location; js is the displaced position needed ... */
-      int ic = iside + lees_edw_plane_location(le, iplane);
       int js = -1;
       int dj = -1;
       double dy = 0.0;
@@ -1127,8 +1126,8 @@ static int lb_data_displace_communicate(le_kernel_helper_t lekh,
   int nhalo = 0;
   int ntotal[3]  = {0};
   int offset[3]  = {0};
-  int nrank_s[2] = {0};
-  int nrank_r[2] = {0};
+  int nrank_s[3] = {0};
+  int nrank_r[3] = {0};
   MPI_Comm comm = MPI_COMM_NULL;
   MPI_Request req[8] = {0};
 

From 01e6dbff6061a7754d921e232992f010a7d030d8 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Fri, 27 Dec 2024 19:50:02 +0000
Subject: [PATCH 081/133] Replace with descriptive comments

---
 src/model_le.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index 55bf7c6ff..80203263b 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -736,12 +736,12 @@ int lb_le_init_shear_profile(lb_t * lb, lees_edw_t * le) {
 /* Kernel helper structure intended to be passed by value to kernel */
 
 typedef struct lek_s {
-  int nlocal[3];        /* 12 */
-  int nplane;           /* 16 */
-  int ndist;            /* 20 */
-  int nxdist;           /* 24 */
-  int nxbuff;           /* 28 */
-  int nprop;            /* 1 <= nprop <= 9  maximum */
+  int nlocal[3];        /* local lattice sites */
+  int nplane;           /* number of planes (local) */
+  int ndist;            /* number of distributions (single/binary fluid) */
+  int nxdist;           /* total distributions crossing plane (local) */
+  int nxbuff;           /* size of crossing buffer (2 x nplane x nxbuff) */
+  int nprop;            /* no. distributions crossing (9  maximum in 3d) */
   int8_t prop[2][9];    /* prop[0] is side 0 (cx +ve); */
                         /* prop[1] is side 1 (cx -ve); */
                         /* p values of cross-plane propagating distributions */

From 6e5d5d043ca129ecd26887a47c8c7b8be1d6a09d Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Fri, 27 Dec 2024 19:57:52 +0000
Subject: [PATCH 082/133] Avoid semi-colon in code-like comment

---
 src/model_le.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index 80203263b..23fa99e6b 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -742,8 +742,8 @@ typedef struct lek_s {
   int nxdist;           /* total distributions crossing plane (local) */
   int nxbuff;           /* size of crossing buffer (2 x nplane x nxbuff) */
   int nprop;            /* no. distributions crossing (9  maximum in 3d) */
-  int8_t prop[2][9];    /* prop[0] is side 0 (cx +ve); */
-                        /* prop[1] is side 1 (cx -ve); */
+  int8_t prop[2][9];    /* prop[0] is side 0 (cx +ve) */
+                        /* prop[1] is side 1 (cx -ve) */
                         /* p values of cross-plane propagating distributions */
 } le_kernel_helper_t;
 

From 97b8f25ecf14dc08d1e5aa2d0b7b4df7db058232 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Mon, 6 Jan 2025 11:25:33 +0000
Subject: [PATCH 083/133] check whether we're using the target halo
 implementation

---
 src/lb_data.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 0336d40e9..d8ca5dc7a 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -589,9 +589,6 @@ __host__ int lb_halo_swap(lb_t * lb, lb_halo_enum_t flag) {
 
   switch (flag) {
   case LB_HALO_TARGET:
-    //tdpAssert( tdpMemcpy(&data, &lb->target->f, sizeof(double *),
-		//	 tdpMemcpyDeviceToHost) );
-    //halo_swap_packed(lb->halo, data);
     lb_halo_post(lb, &lb->h);
     lb_halo_wait(lb, &lb->h);
     break;
@@ -1319,7 +1316,7 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
 
   int ndevice;
   tdpGetDeviceCount(&ndevice);
-  if (ndevice > 0) {
+  if (ndevice > 0 && lb->haloscheme == LB_HALO_TARGET) {
     copyModelToDevice(&lb->model, &lb->target->model);
     copyModelToDevice(&h->map, &h->target->map);
     for (int ireq = 0; ireq < h->map.nvel; ireq++) {
@@ -1391,7 +1388,7 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
 
   int ndevice;
   tdpGetDeviceCount(&ndevice);
-  if (ndevice > 0) {
+  if (ndevice > 0 && lb->haloscheme == LB_HALO_TARGET) {
     for (int ireq = 0; ireq < h->map.nvel; ireq++) {
       if (h->count[ireq] > 0) {
         int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);

From f3feb80091b50d9e91e0e4c8ee1324cd06cac63d Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Mon, 6 Jan 2025 14:33:34 +0000
Subject: [PATCH 084/133] remove nvtx

---
 src/lb_data.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index d8ca5dc7a..873593b1d 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -31,8 +31,6 @@
 #include "timer.h"
 #include "util.h"
 
-#include "nvtx3/nvToolsExt.h"
-
 static int lb_mpi_init(lb_t * lb);
 static int lb_model_param_init(lb_t * lb);
 static int lb_init(lb_t * lb);
@@ -585,8 +583,6 @@ __host__ int lb_halo_swap(lb_t * lb, lb_halo_enum_t flag) {
 
   assert(lb);
 
-  nvtxRangePush("halo_swap");
-
   switch (flag) {
   case LB_HALO_TARGET:
     lb_halo_post(lb, &lb->h);
@@ -605,8 +601,6 @@ __host__ int lb_halo_swap(lb_t * lb, lb_halo_enum_t flag) {
     lb_halo_wait(lb, &lb->h);
   }
 
-  nvtxRangePop();
-
   return 0;
 }
 

From 87d15078c28934abd6552599e6e2050d5214b020 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Mon, 6 Jan 2025 14:53:15 +0000
Subject: [PATCH 085/133] remove excess calls to halo_post and wait

---
 src/lb_data.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 873593b1d..f91370b07 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -585,17 +585,8 @@ __host__ int lb_halo_swap(lb_t * lb, lb_halo_enum_t flag) {
 
   switch (flag) {
   case LB_HALO_TARGET:
-    lb_halo_post(lb, &lb->h);
-    lb_halo_wait(lb, &lb->h);
-    break;
   case LB_HALO_OPENMP_FULL:
-    lb_halo_post(lb, &lb->h);
-    lb_halo_wait(lb, &lb->h);
-    break;
   case LB_HALO_OPENMP_REDUCED:
-    lb_halo_post(lb, &lb->h);
-    lb_halo_wait(lb, &lb->h);
-    break;
   default:
     lb_halo_post(lb, &lb->h);
     lb_halo_wait(lb, &lb->h);

From 53e19fc62d3e094d5fdf5892ed0d959bc8d49cea Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Mon, 6 Jan 2025 14:56:40 +0000
Subject: [PATCH 086/133] tidy

---
 src/lb_data.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index f91370b07..40b8461b8 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1284,7 +1284,6 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
       double * buf = h->recv[ireq];
       if (have_gpu_aware_mpi_) buf = h->recv_d[ireq];
 
-      //if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) mcount = 0;
       if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) continue;
       
       MPI_Irecv(buf, mcount, MPI_DOUBLE, h->nbrrank[i][j][k],
@@ -1326,7 +1325,7 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
 
   TIMER_start(TIMER_LB_HALO_ISEND);
 
-  for (int ireq = 1; ireq < h->map.nvel; ireq++) {
+  for (int ireq = 0; ireq < h->map.nvel; ireq++) {
 
     h->request[27+ireq] = MPI_REQUEST_NULL;
 
@@ -1339,7 +1338,6 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
       if (have_gpu_aware_mpi_) buf = h->send_d[ireq];
 
       /* Short circuit messages to self. */
-      //if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) mcount = 0;
       if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) continue;
 
       MPI_Isend(buf, mcount, MPI_DOUBLE, h->nbrrank[i][j][k],

From 66508062df14bf2946c5834980b9dc1a5f3bd220 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Mon, 6 Jan 2025 15:11:32 +0000
Subject: [PATCH 087/133] remove unused variable

---
 src/lb_data.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 40b8461b8..5568f1c24 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -579,8 +579,6 @@ __host__ int lb_halo(lb_t * lb) {
 
 __host__ int lb_halo_swap(lb_t * lb, lb_halo_enum_t flag) {
 
-  double * data;
-
   assert(lb);
 
   switch (flag) {

From 6ddf7569b75ae492a6a5952ec39bea75e105f05b Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Thu, 9 Jan 2025 16:24:01 +0000
Subject: [PATCH 088/133] comment copymodeltodevice

---
 src/lb_data.c | 56 +++++++++++++++++++++++++--------------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 36f8a94f1..359ff0dde 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -55,32 +55,32 @@ static const int have_gpu_aware_mpi_ = 1;
 static const int have_gpu_aware_mpi_ = 0;
 #endif
 
-void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
-    int nvel = h_model->nvel;
-    // Allocate memory on the GPU for the arrays in the struct
-    int8_t (*d_cv)[3];
-    double *d_wv;
-    double *d_na;
-
-    tdpMalloc((void**)&d_cv, sizeof(int8_t[3]) * nvel);
-    tdpMalloc((void**)&d_wv, sizeof(double) * nvel);
-    tdpMalloc((void**)&d_na, sizeof(double) * nvel);
-
-    // Copy the data from host to the GPU
-    tdpMemcpy(d_cv, h_model->cv, sizeof(int8_t[3]) * nvel, tdpMemcpyHostToDevice);
-    tdpMemcpy(d_wv, h_model->wv, sizeof(double) * nvel, tdpMemcpyHostToDevice);
-    tdpMemcpy(d_na, h_model->na, sizeof(double) * nvel, tdpMemcpyHostToDevice);
-
-    // Set the pointers in the struct to the newly allocated GPU memory
-    tdpMemcpy(&(d_model->cv), &d_cv, sizeof(int8_t(*)[3]), tdpMemcpyHostToDevice);
-    tdpMemcpy(&(d_model->wv), &d_wv, sizeof(double*), tdpMemcpyHostToDevice);
-    tdpMemcpy(&(d_model->na), &d_na, sizeof(double*), tdpMemcpyHostToDevice);
-
-    //copy the rest data to gpu
-    tdpMemcpy(&(d_model->ndim), &(h_model->ndim), sizeof(int8_t), tdpMemcpyHostToDevice);
-    tdpMemcpy(&(d_model->nvel), &(h_model->nvel), sizeof(int8_t), tdpMemcpyHostToDevice);
-    tdpMemcpy(&(d_model->cs2), &(h_model->cs2), sizeof(double), tdpMemcpyHostToDevice);
-}
+//void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
+//    int nvel = h_model->nvel;
+//    // Allocate memory on the GPU for the arrays in the struct
+//    int8_t (*d_cv)[3];
+//    double *d_wv;
+//    double *d_na;
+//
+//    tdpMalloc((void**)&d_cv, sizeof(int8_t[3]) * nvel);
+//    tdpMalloc((void**)&d_wv, sizeof(double) * nvel);
+//    tdpMalloc((void**)&d_na, sizeof(double) * nvel);
+//
+//    // Copy the data from host to the GPU
+//    tdpMemcpy(d_cv, h_model->cv, sizeof(int8_t[3]) * nvel, tdpMemcpyHostToDevice);
+//    tdpMemcpy(d_wv, h_model->wv, sizeof(double) * nvel, tdpMemcpyHostToDevice);
+//    tdpMemcpy(d_na, h_model->na, sizeof(double) * nvel, tdpMemcpyHostToDevice);
+//
+//    // Set the pointers in the struct to the newly allocated GPU memory
+//    tdpMemcpy(&(d_model->cv), &d_cv, sizeof(int8_t(*)[3]), tdpMemcpyHostToDevice);
+//    tdpMemcpy(&(d_model->wv), &d_wv, sizeof(double*), tdpMemcpyHostToDevice);
+//    tdpMemcpy(&(d_model->na), &d_na, sizeof(double*), tdpMemcpyHostToDevice);
+//
+//    //copy the rest data to gpu
+//    tdpMemcpy(&(d_model->ndim), &(h_model->ndim), sizeof(int8_t), tdpMemcpyHostToDevice);
+//    tdpMemcpy(&(d_model->nvel), &(h_model->nvel), sizeof(int8_t), tdpMemcpyHostToDevice);
+//    tdpMemcpy(&(d_model->cs2), &(h_model->cs2), sizeof(double), tdpMemcpyHostToDevice);
+//}
 
 /*****************************************************************************
  *
@@ -1440,8 +1440,8 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
   int ndevice;
   tdpGetDeviceCount(&ndevice);
   if (ndevice > 0 && lb->haloscheme == LB_HALO_TARGET) {
-    copyModelToDevice(&lb->model, &lb->target->model);
-    copyModelToDevice(&h->map, &h->target->map);
+    //copyModelToDevice(&lb->model, &lb->target->model);
+    //copyModelToDevice(&h->map, &h->target->map);
     for (int ireq = 0; ireq < h->map.nvel; ireq++) {
       if (h->count[ireq] > 0) {
         int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);

From f67b38c3599b155b882437dedb492e347621d96f Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Mon, 13 Jan 2025 13:33:36 +0000
Subject: [PATCH 089/133] add halo device model initialise and free functions

---
 src/lb_data.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/src/lb_data.c b/src/lb_data.c
index 359ff0dde..8214970d2 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -40,7 +40,9 @@ int lb_halo_dequeue_recv(lb_t * lb, const lb_halo_t * h, int irreq);
 int lb_halo_enqueue_send(const lb_t * lb, lb_halo_t * h, int irreq);
 
 int lb_data_initialise_device_model(lb_t * lb);
+int halo_initialise_device_model(lb_halo_t * h);
 int lb_data_free_device_model(lb_t *lb);
+int halo_free_device_model(lb_halo_t *h);
 
 static __constant__ lb_collide_param_t static_param;
 
@@ -373,6 +375,60 @@ int lb_data_initialise_device_model(lb_t * lb) {
   return 0;
 }
 
+/*****************************************************************************
+ *
+ *  halo_initialise_device_model
+ *
+ *  Allocate and copy the halo lb_model_t on target.
+ *
+ *****************************************************************************/
+
+int halo_initialise_device_model(lb_halo_t * h) {
+
+  int ndevice = 0;
+
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
+
+  if (ndevice > 0) {
+
+    int nvel = h->map.nvel;
+    lb_model_t * hm = &h->map;
+    lb_model_t * dm = &h->target->map;
+    int8_t (*d_cv)[3] = {0};
+    double * d_wv = NULL;
+    double * d_na = NULL;
+
+    tdpAssert( tdpMalloc((void **) &d_cv, nvel*sizeof(int8_t[3])) );
+    tdpAssert( tdpMalloc((void **) &d_wv, nvel*sizeof(double)) );
+    tdpAssert( tdpMalloc((void **) &d_na, nvel*sizeof(double)) );
+
+    tdpAssert( tdpMemcpy(d_cv, hm->cv, nvel*sizeof(int8_t[3]),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(d_wv, hm->wv, nvel*sizeof(double),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(d_na, hm->na, nvel*sizeof(double),
+			 tdpMemcpyHostToDevice) );
+
+    tdpAssert( tdpMemcpy(&(dm->cv), &d_cv, sizeof(int8_t(*)[3]),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&(dm->wv), &d_wv, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&(dm->na), &d_na, sizeof(double *),
+			 tdpMemcpyHostToDevice) );
+
+    tdpAssert( tdpMemcpy(&(dm->ndim), &(hm->ndim), sizeof(int8_t),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&(dm->nvel), &(hm->nvel), sizeof(int8_t),
+			 tdpMemcpyHostToDevice) );
+    tdpAssert( tdpMemcpy(&(dm->cs2), &(hm->cs2), sizeof(double),
+			 tdpMemcpyHostToDevice) );
+
+    /* We do not copy the eigenvectors currently. */
+  }
+
+  return 0;
+}
+
 /*****************************************************************************
  *
  *  lb_data_free_device_model
@@ -406,6 +462,39 @@ int lb_data_free_device_model(lb_t * lb) {
   return 0;
 }
 
+/*****************************************************************************
+ *
+ *  halo_free_device_model
+ *
+ *****************************************************************************/
+
+int halo_free_device_model(lb_halo_t * h) {
+
+  int ndevice = 0;
+
+  tdpAssert( tdpGetDeviceCount(&ndevice) );
+
+  if (ndevice > 0) {
+
+    int8_t (*d_cv)[3] = {0};
+    double * d_wv = NULL;
+    double * d_na = NULL;
+
+    tdpAssert( tdpMemcpy(&d_cv, &h->target->map.cv, sizeof(int8_t(*)[3]),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpy(&d_wv, &h->target->map.wv, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+    tdpAssert( tdpMemcpy(&d_na, &h->target->map.na, sizeof(double *),
+			 tdpMemcpyDeviceToHost) );
+
+    tdpAssert( tdpFree(d_cv) );
+    tdpAssert( tdpFree(d_wv) );
+    tdpAssert( tdpFree(d_na) );
+  }
+
+  return 0;
+}
+
 /*****************************************************************************
  *
  *  lb_memcpy
@@ -1386,6 +1475,8 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
   free(send_count);
   free(recv_count);
 
+  halo_initialise_device_model(h);
+
   return 0;
 }
 
@@ -1550,6 +1641,8 @@ int lb_halo_free(lb_t * lb, lb_halo_t * h) {
   int ndevice = 0;
   tdpGetDeviceCount(&ndevice);
 
+  halo_free_device_model(h);
+
   if (ndevice > 0) {
     tdpAssert( tdpMemcpy(h->send_d, h->target->send, 27*sizeof(double *),
 			 tdpMemcpyDeviceToHost) );

From dfe316cd1c50e8780e8aed4c827a248ea3e35786 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Mon, 13 Jan 2025 14:20:58 +0000
Subject: [PATCH 090/133] remove commented code

---
 src/lb_data.c | 28 ----------------------------
 1 file changed, 28 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 8214970d2..e14d275e4 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -57,33 +57,6 @@ static const int have_gpu_aware_mpi_ = 1;
 static const int have_gpu_aware_mpi_ = 0;
 #endif
 
-//void copyModelToDevice(lb_model_t *h_model, lb_model_t *d_model) {
-//    int nvel = h_model->nvel;
-//    // Allocate memory on the GPU for the arrays in the struct
-//    int8_t (*d_cv)[3];
-//    double *d_wv;
-//    double *d_na;
-//
-//    tdpMalloc((void**)&d_cv, sizeof(int8_t[3]) * nvel);
-//    tdpMalloc((void**)&d_wv, sizeof(double) * nvel);
-//    tdpMalloc((void**)&d_na, sizeof(double) * nvel);
-//
-//    // Copy the data from host to the GPU
-//    tdpMemcpy(d_cv, h_model->cv, sizeof(int8_t[3]) * nvel, tdpMemcpyHostToDevice);
-//    tdpMemcpy(d_wv, h_model->wv, sizeof(double) * nvel, tdpMemcpyHostToDevice);
-//    tdpMemcpy(d_na, h_model->na, sizeof(double) * nvel, tdpMemcpyHostToDevice);
-//
-//    // Set the pointers in the struct to the newly allocated GPU memory
-//    tdpMemcpy(&(d_model->cv), &d_cv, sizeof(int8_t(*)[3]), tdpMemcpyHostToDevice);
-//    tdpMemcpy(&(d_model->wv), &d_wv, sizeof(double*), tdpMemcpyHostToDevice);
-//    tdpMemcpy(&(d_model->na), &d_na, sizeof(double*), tdpMemcpyHostToDevice);
-//
-//    //copy the rest data to gpu
-//    tdpMemcpy(&(d_model->ndim), &(h_model->ndim), sizeof(int8_t), tdpMemcpyHostToDevice);
-//    tdpMemcpy(&(d_model->nvel), &(h_model->nvel), sizeof(int8_t), tdpMemcpyHostToDevice);
-//    tdpMemcpy(&(d_model->cs2), &(h_model->cs2), sizeof(double), tdpMemcpyHostToDevice);
-//}
-
 /*****************************************************************************
  *
  *  lb_data_create
@@ -1459,7 +1432,6 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
 			 tdpMemcpyHostToDevice) );
 
     for (int p = 0; p < h->map.nvel; p++) {         
-      // XXX: don't allocate zero sized arrays (generally when p == 0)
       int scount = send_count[p]*lb_halo_size(h->slim[p]);  
       int rcount = recv_count[p]*lb_halo_size(h->rlim[p]);
       tdpAssert( tdpMalloc((void**) &h->send_d[p], scount * sizeof(double)) );

From 3863565825920c8d2857a064d6597260c43bca33 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Fri, 17 Jan 2025 16:51:03 +0000
Subject: [PATCH 091/133] add function for initialising halo model on device

---
 src/lb_data.c | 177 ++++++++++++++++++++++++++++++++++++++++++++++++++
 src/lb_data.h |   3 +
 2 files changed, 180 insertions(+)

diff --git a/src/lb_data.c b/src/lb_data.c
index 3f4adb17f..12f8045b7 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -44,6 +44,9 @@ int halo_initialise_device_model(lb_halo_t * h);
 int lb_data_free_device_model(lb_t *lb);
 int halo_free_device_model(lb_halo_t *h);
 
+int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count);
+int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count);
+
 static __constant__ lb_collide_param_t static_param;
 
 #ifdef HAVE_OPENMPI_
@@ -51,6 +54,15 @@ static __constant__ lb_collide_param_t static_param;
 #include "mpi-ext.h"
 #endif
 
+#ifdef __NVCC__
+/* There are two file-scope switches here, which need to be generalised
+ * via some suitable interface; they are separate, but both relate to
+ * GPU execution. */
+static const int have_graph_api_ = 1;
+#else
+static const int have_graph_api_ = 0;
+#endif
+
 #if defined (MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
 static const int have_gpu_aware_mpi_ = 1;
 #else
@@ -1910,3 +1922,168 @@ int lb_io_read(lb_t * lb, int timestep, io_event_t * event) {
 
   return ifail;
 }
+
+/*****************************************************************************
+ *
+ * lb_graph_halo_send_create
+ *
+ *****************************************************************************/
+
+int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count) {
+
+  assert(lb);
+  assert(h);
+
+  tdpAssert( tdpGraphCreate(&h->gsend.graph, 0) );
+
+  for (int ireq = 1; ireq < h->map.nvel; ireq++) {
+    tdpGraphNode_t kernelNode;
+    tdpKernelNodeParams kernelNodeParams = {0};
+    void * kernelArgs[3] = {(void *) &lb->target,
+                            (void *) &h->target,
+                            (void *) &ireq};
+    kernelNodeParams.func = (void *) lb_halo_enqueue_send_kernel;
+    //kernelNodeParams.func = (void *) lb_null_kernel;
+    dim3 nblk;
+    dim3 ntpb;
+    int scount = send_count[ireq]*lb_halo_size(h->slim[ireq]);
+
+    kernel_launch_param(scount, &nblk, &ntpb);
+
+    kernelNodeParams.gridDim        = nblk;
+    kernelNodeParams.blockDim       = ntpb;
+    kernelNodeParams.sharedMemBytes = 0;
+    kernelNodeParams.kernelParams   = (void **) kernelArgs;
+    kernelNodeParams.extra          = NULL;
+
+    tdpAssert( tdpGraphAddKernelNode(&kernelNode, h->gsend.graph, NULL, 0,
+				     &kernelNodeParams) );
+
+    if (have_gpu_aware_mpi_) {
+      /* Don't need explicit device -> host copy */
+    }
+    else {
+      /* We do need to add the memcpys to the graph definition
+       * (except messages to self... ) */
+
+      int i = 1 + h->map.cv[h->map.nvel - ireq][X];
+      int j = 1 + h->map.cv[h->map.nvel - ireq][Y];
+      int k = 1 + h->map.cv[h->map.nvel - ireq][Z];
+
+      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
+	      tdpGraphNode_t memcpyNode;
+        tdpMemcpy3DParms memcpyParams = {0};
+
+	      memcpyParams.srcArray = NULL;
+	      memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
+	      memcpyParams.srcPtr   = make_tdpPitchedPtr(h->send_d[ireq],
+						   sizeof(double)*scount,
+						   scount, 1);
+	      memcpyParams.dstArray = NULL;
+	      memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
+	      memcpyParams.dstPtr   = make_tdpPitchedPtr(h->send[ireq],
+						   sizeof(double)*scount,
+						   scount, 1);
+	      memcpyParams.extent   = make_tdpExtent(sizeof(double)*scount, 1, 1);
+	      memcpyParams.kind     = tdpMemcpyDeviceToHost;
+
+	      tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->gsend.graph,
+					 &kernelNode, 1, &memcpyParams) );
+      }
+    }
+  }
+
+  tdpAssert( tdpGraphInstantiate(&h->gsend.exec, h->gsend.graph, 0) );
+
+  return 0;
+}
+
+/*****************************************************************************
+ *
+ *  lb_graph_halo_recv_create
+ *
+ *****************************************************************************/
+
+int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count) {
+
+  assert(lb);
+  assert(h);
+
+  tdpAssert( tdpGraphCreate(&h->grecv.graph, 0) );
+
+  for (int ireq = 1; ireq < h->map.nvel; ireq++) {
+    int rcount = recv_count[ireq]*lb_halo_size(h->rlim[ireq]);
+    tdpGraphNode_t memcpyNode = {0};
+
+    if (have_gpu_aware_mpi_) {
+      /* Don't need explicit copies */
+    }
+    else {
+      int i = 1 + h->map.cv[h->map.nvel - ireq][X];
+      int j = 1 + h->map.cv[h->map.nvel - ireq][Y];
+      int k = 1 + h->map.cv[h->map.nvel - ireq][Z];
+
+      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
+	      tdpMemcpy3DParms memcpyParams = {0};
+
+	      memcpyParams.srcArray = NULL;
+	      memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
+	      memcpyParams.srcPtr   = make_tdpPitchedPtr(h->recv[ireq],
+						   sizeof(double)*rcount,
+						   rcount, 1);
+	      memcpyParams.dstArray = NULL;
+	      memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
+	      memcpyParams.dstPtr   = make_tdpPitchedPtr(h->recv_d[ireq],
+						   sizeof(double)*rcount,
+						   rcount, 1);
+        memcpyParams.extent   = make_tdpExtent(sizeof(double)*rcount, 1, 1);
+        memcpyParams.kind     = tdpMemcpyHostToDevice;
+
+	      tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->grecv.graph, NULL,
+					 0, &memcpyParams) );
+      }
+    }
+
+    /* Always need the disaggregation kernel */
+
+    dim3 nblk;
+    dim3 ntpb;
+    tdpGraphNode_t node;
+    tdpKernelNodeParams kernelNodeParams = {0};
+    void * kernelArgs[3] = {(void *) &lb->target,
+                            (void *) &h->target,
+                            (void *) &ireq};
+    kernelNodeParams.func = (void *) lb_halo_dequeue_recv_kernel;
+    //kernelNodeParams.func = (void *) lb_null_kernel;
+
+    kernel_launch_param(rcount, &nblk, &ntpb);
+
+    kernelNodeParams.gridDim        = nblk;
+    kernelNodeParams.blockDim       = ntpb;
+    kernelNodeParams.sharedMemBytes = 0;
+    kernelNodeParams.kernelParams   = (void **) kernelArgs;
+    kernelNodeParams.extra          = NULL;
+
+    if (have_gpu_aware_mpi_) {
+      tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, NULL,
+				       0, &kernelNodeParams) );
+    }
+    else {
+      int i = 1 + h->map.cv[h->map.nvel - ireq][X];
+      int j = 1 + h->map.cv[h->map.nvel - ireq][Y];
+      int k = 1 + h->map.cv[h->map.nvel - ireq][Z];
+      if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
+	      tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, &memcpyNode,
+					 1, &kernelNodeParams) );
+      }
+      else {
+	      tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, NULL, 0,
+					 &kernelNodeParams) );
+      }
+    }
+  }
+
+  tdpAssert( tdpGraphInstantiate(&h->grecv.exec, h->grecv.graph, 0) );
+
+  return 0;
+}
diff --git a/src/lb_data.h b/src/lb_data.h
index 7be576f33..88cf92045 100644
--- a/src/lb_data.h
+++ b/src/lb_data.h
@@ -100,6 +100,9 @@ struct lb_halo_s {
   lb_halo_t * target;
   double * send_d[27];            /* halo: device send buffer per direction */
   double * recv_d[27];            /* halo: device recv buffer per direction */
+
+  lb_graph_halo_t gsend;          /* Graph API halo swap */
+  lb_graph_halo_t grecv;
 };
 
 int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme);

From a123254e2fee2a14dbf64e40777580bf08d687ae Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Sun, 19 Jan 2025 17:20:37 +0000
Subject: [PATCH 092/133] reintroduce calls to graph launch that were removed
 when merging main gpu aware mpi branch back in

---
 src/lb_data.c | 58 +++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 16 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 12f8045b7..9c4664329 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1456,8 +1456,11 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
 			 tdpMemcpyHostToDevice) );
 
     if (have_graph_api_) {
+      printf("here 1\n");
       lb_graph_halo_send_create(lb, h, send_count);
+      printf("here 2\n");
       lb_graph_halo_recv_create(lb, h, recv_count);
+      printf("here 3\n");
     }
 
   }
@@ -1520,15 +1523,21 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
   int ndevice;
   tdpGetDeviceCount(&ndevice);
   if (ndevice > 0 && lb->haloscheme == LB_HALO_TARGET) {
-    //copyModelToDevice(&lb->model, &lb->target->model);
-    //copyModelToDevice(&h->map, &h->target->map);
-    for (int ireq = 0; ireq < h->map.nvel; ireq++) {
-      if (h->count[ireq] > 0) {
-        int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
-        dim3 nblk, ntpb;
-        kernel_launch_param(scount, &nblk, &ntpb);
-        tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
-        tdpDeviceSynchronize();
+    if (have_graph_api_) {
+      printf("here 4\n");
+      tdpAssert( tdpGraphLaunch(h->gsend.exec, h->stream) );
+      printf("here 5\n");
+      tdpAssert( tdpStreamSynchronize(h->stream) );
+      printf("here 6\n");
+    } else {
+      for (int ireq = 0; ireq < h->map.nvel; ireq++) {
+        if (h->count[ireq] > 0) {
+          int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
+          dim3 nblk, ntpb;
+          kernel_launch_param(scount, &nblk, &ntpb);
+          tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
+          tdpDeviceSynchronize();
+        }
       }
     }
   } else {
@@ -1591,13 +1600,21 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
   int ndevice;
   tdpGetDeviceCount(&ndevice);
   if (ndevice > 0 && lb->haloscheme == LB_HALO_TARGET) {
-    for (int ireq = 0; ireq < h->map.nvel; ireq++) {
-      if (h->count[ireq] > 0) {
-        int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
-        dim3 nblk, ntpb;
-        kernel_launch_param(rcount, &nblk, &ntpb);
-        tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
-        tdpDeviceSynchronize();
+    if (have_graph_api_) {
+      printf("here 7\n");
+      tdpAssert( tdpGraphLaunch(h->grecv.exec, h->stream) );
+      printf("here 8\n");
+      tdpAssert( tdpStreamSynchronize(h->stream) );
+      printf("here 9\n");
+    } else {
+      for (int ireq = 0; ireq < h->map.nvel; ireq++) {
+        if (h->count[ireq] > 0) {
+          int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
+          dim3 nblk, ntpb;
+          kernel_launch_param(rcount, &nblk, &ntpb);
+          tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
+          tdpDeviceSynchronize();
+        }
       }
     }
   } else {
@@ -1649,6 +1666,15 @@ int lb_halo_free(lb_t * lb, lb_halo_t * h) {
     free(h->recv[ireq]);
   }
 
+  if (have_graph_api_) {
+    printf("here 10\n");
+    tdpAssert( tdpGraphDestroy(h->gsend.graph) );
+    printf("here 11\n");
+    tdpAssert( tdpGraphDestroy(h->grecv.graph) );
+    printf("here 12\n");
+  }
+  
+  tdpAssert( tdpStreamDestroy(h->stream) );
   lb_model_free(&h->map);
 
   return 0;

From dfb45c6c2e02e9935ae8d04f2c41cd73935f30d7 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Mon, 20 Jan 2025 11:28:59 +0000
Subject: [PATCH 093/133] remove debugging print statements

---
 src/lb_data.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 9c4664329..8759250cb 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1456,11 +1456,8 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
 			 tdpMemcpyHostToDevice) );
 
     if (have_graph_api_) {
-      printf("here 1\n");
       lb_graph_halo_send_create(lb, h, send_count);
-      printf("here 2\n");
       lb_graph_halo_recv_create(lb, h, recv_count);
-      printf("here 3\n");
     }
 
   }
@@ -1524,11 +1521,8 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
   tdpGetDeviceCount(&ndevice);
   if (ndevice > 0 && lb->haloscheme == LB_HALO_TARGET) {
     if (have_graph_api_) {
-      printf("here 4\n");
       tdpAssert( tdpGraphLaunch(h->gsend.exec, h->stream) );
-      printf("here 5\n");
       tdpAssert( tdpStreamSynchronize(h->stream) );
-      printf("here 6\n");
     } else {
       for (int ireq = 0; ireq < h->map.nvel; ireq++) {
         if (h->count[ireq] > 0) {
@@ -1601,11 +1595,8 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
   tdpGetDeviceCount(&ndevice);
   if (ndevice > 0 && lb->haloscheme == LB_HALO_TARGET) {
     if (have_graph_api_) {
-      printf("here 7\n");
       tdpAssert( tdpGraphLaunch(h->grecv.exec, h->stream) );
-      printf("here 8\n");
       tdpAssert( tdpStreamSynchronize(h->stream) );
-      printf("here 9\n");
     } else {
       for (int ireq = 0; ireq < h->map.nvel; ireq++) {
         if (h->count[ireq] > 0) {
@@ -1667,11 +1658,8 @@ int lb_halo_free(lb_t * lb, lb_halo_t * h) {
   }
 
   if (have_graph_api_) {
-    printf("here 10\n");
     tdpAssert( tdpGraphDestroy(h->gsend.graph) );
-    printf("here 11\n");
     tdpAssert( tdpGraphDestroy(h->grecv.graph) );
-    printf("here 12\n");
   }
   
   tdpAssert( tdpStreamDestroy(h->stream) );

From b87282fec52cdb0fa8023386c4ea53b1cc1b7690 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Mon, 20 Jan 2025 11:34:59 +0000
Subject: [PATCH 094/133] deactivate graph implementation

---
 src/lb_data.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 8759250cb..3eae291d6 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -58,7 +58,7 @@ static __constant__ lb_collide_param_t static_param;
 /* There are two file-scope switches here, which need to be generalised
  * via some suitable interface; they are separate, but both relate to
  * GPU execution. */
-static const int have_graph_api_ = 1;
+static const int have_graph_api_ = 0;
 #else
 static const int have_graph_api_ = 0;
 #endif

From f300c01dfab04124342f0002a466872438ffbc1c Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Mon, 20 Jan 2025 11:54:50 +0000
Subject: [PATCH 095/133] remove lb_halo_swap

---
 src/lb_data.c | 27 ++-------------------------
 src/lb_data.h |  1 -
 src/ludwig.c  |  3 ++-
 3 files changed, 4 insertions(+), 27 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 3eae291d6..9ea01c28f 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -779,31 +779,8 @@ __host__ int lb_halo(lb_t * lb) {
 
   assert(lb);
 
-  lb_halo_swap(lb, lb->haloscheme);
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  lb_halo_swap
- *
- *  Specify the type of swap wanted.
- *
- *****************************************************************************/
-
-__host__ int lb_halo_swap(lb_t * lb, lb_halo_enum_t flag) {
-
-  assert(lb);
-
-  switch (flag) {
-  case LB_HALO_TARGET:
-  case LB_HALO_OPENMP_FULL:
-  case LB_HALO_OPENMP_REDUCED:
-  default:
-    lb_halo_post(lb, &lb->h);
-    lb_halo_wait(lb, &lb->h);
-  }
+  lb_halo_post(lb, &lb->h);
+  lb_halo_wait(lb, &lb->h);
 
   return 0;
 }
diff --git a/src/lb_data.h b/src/lb_data.h
index 88cf92045..ef77e2fe8 100644
--- a/src/lb_data.h
+++ b/src/lb_data.h
@@ -172,7 +172,6 @@ __host__ int lb_free(lb_t * lb);
 __host__ int lb_memcpy(lb_t * lb, tdpMemcpyKind flag);
 __host__ int lb_collide_param_commit(lb_t * lb);
 __host__ int lb_halo(lb_t * lb);
-__host__ int lb_halo_swap(lb_t * lb, lb_halo_enum_t flag);
 
 __host__ __device__ int lb_ndist(lb_t * lb, int * ndist);
 __host__ __device__ int lb_f(lb_t * lb, int index, int p, int n, double * f);
diff --git a/src/ludwig.c b/src/ludwig.c
index 2e2b15617..38ead8241 100644
--- a/src/ludwig.c
+++ b/src/ludwig.c
@@ -2170,7 +2170,8 @@ int ludwig_colloids_update(ludwig_t * ludwig) {
   else {
     /* Pull data back, then full host halo swap */
     lb_memcpy(ludwig->lb, tdpMemcpyDeviceToHost);
-    lb_halo_swap(ludwig->lb, LB_HALO_OPENMP_FULL);
+    ludwig->lb->haloscheme = LB_HALO_OPENMP_FULL;
+    lb_halo(ludwig->lb);
   }
 
   TIMER_stop(TIMER_HALO_LATTICE);

From 808cabcd81fc6166d178fb9af23a9f3a4b4622bf Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Wed, 22 Jan 2025 14:35:44 +0000
Subject: [PATCH 096/133] remove commented code

---
 src/lb_data.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 9ea01c28f..10029cb16 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1934,7 +1934,6 @@ int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count)
                             (void *) &h->target,
                             (void *) &ireq};
     kernelNodeParams.func = (void *) lb_halo_enqueue_send_kernel;
-    //kernelNodeParams.func = (void *) lb_null_kernel;
     dim3 nblk;
     dim3 ntpb;
     int scount = send_count[ireq]*lb_halo_size(h->slim[ireq]);
@@ -2045,7 +2044,6 @@ int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count)
                             (void *) &h->target,
                             (void *) &ireq};
     kernelNodeParams.func = (void *) lb_halo_dequeue_recv_kernel;
-    //kernelNodeParams.func = (void *) lb_null_kernel;
 
     kernel_launch_param(rcount, &nblk, &ntpb);
 

From a10ae9b8753ba3086490732455daf76799c7f746 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Wed, 22 Jan 2025 14:41:03 +0000
Subject: [PATCH 097/133] set halo target to zero

---
 src/lb_data.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/lb_data.c b/src/lb_data.c
index 10029cb16..66442e3d7 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1417,6 +1417,7 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
   }
   else {
     tdpAssert( tdpMalloc((void **) &h->target, sizeof(lb_halo_t)) );
+    tdpAssert( tdpMemset(h->target, 0, sizeof(lb_halo_t)));
     tdpAssert( tdpMemcpy(h->target, h, sizeof(lb_halo_t),
 			 tdpMemcpyHostToDevice) );
 

From 7bd91bb3461f49ab323766b14ad31fad9c8c72b7 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Wed, 22 Jan 2025 14:44:37 +0000
Subject: [PATCH 098/133] don't allocate zero sized arrays

---
 src/lb_data.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 66442e3d7..746a06396 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1424,8 +1424,10 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
     for (int p = 0; p < h->map.nvel; p++) {         
       int scount = send_count[p]*lb_halo_size(h->slim[p]);  
       int rcount = recv_count[p]*lb_halo_size(h->rlim[p]);
-      tdpAssert( tdpMalloc((void**) &h->send_d[p], scount * sizeof(double)) );
-      tdpAssert( tdpMalloc((void**) &h->recv_d[p], rcount * sizeof(double)) );
+      if (scount > 0)
+        tdpAssert( tdpMalloc((void**) &h->send_d[p], scount * sizeof(double)) );
+      if (rcount > 0)
+        tdpAssert( tdpMalloc((void**) &h->recv_d[p], rcount * sizeof(double)) );
     }
     /* Slightly tricksy. Could use send_d and recv_d on target copy ...*/
     tdpAssert( tdpMemcpy(h->target->send, h->send_d, 27*sizeof(double *),     

From 5bda6d0d2672bcbe53fdf1cdf976817b45ae47c8 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Wed, 22 Jan 2025 15:26:18 +0000
Subject: [PATCH 099/133] move initialise device functions inside check for
 ndevice non-zero

---
 src/lb_data.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 746a06396..85f3fdee1 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -589,12 +589,13 @@ static int lb_init(lb_t * lb) {
     cs_target(lb->cs, &cstarget);
     tdpMemcpy(&lb->target->cs, &cstarget, sizeof(cs_t *),
 	      tdpMemcpyHostToDevice);
+  
+    lb_data_initialise_device_model(lb);
   }
 
   lb_mpi_init(lb);
   lb_model_param_init(lb);
 
-  lb_data_initialise_device_model(lb);
   lb_memcpy(lb, tdpMemcpyHostToDevice);
 
   return 0;
@@ -1435,6 +1436,8 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
     tdpAssert( tdpMemcpy(h->target->recv, h->recv_d, 27*sizeof(double *),
 			 tdpMemcpyHostToDevice) );
 
+    halo_initialise_device_model(h);
+
     if (have_graph_api_) {
       lb_graph_halo_send_create(lb, h, send_count);
       lb_graph_halo_recv_create(lb, h, recv_count);
@@ -1444,8 +1447,6 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
   free(send_count);
   free(recv_count);
 
-  halo_initialise_device_model(h);
-
   return 0;
 }
 

From cb537a3f73e280d143b08315374b238eac905f02 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Wed, 22 Jan 2025 16:10:03 +0000
Subject: [PATCH 100/133] add tdpassert

---
 src/lb_data.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 85f3fdee1..b84a7b11e 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1510,7 +1510,7 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
           int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
           dim3 nblk, ntpb;
           kernel_launch_param(scount, &nblk, &ntpb);
-          tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
+          tdpAssert( tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq));
           tdpDeviceSynchronize();
         }
       }

From 2657175d5d8d11e9c7c8e1b93ec2419f695ab2f7 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Wed, 22 Jan 2025 16:51:51 +0000
Subject: [PATCH 101/133] don't construct graph nodes when request size is zero

---
 src/lb_data.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index b84a7b11e..dfd85fcd4 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1510,8 +1510,8 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
           int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
           dim3 nblk, ntpb;
           kernel_launch_param(scount, &nblk, &ntpb);
-          tdpAssert( tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq));
-          tdpDeviceSynchronize();
+          tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
+          tdpAssert( tdpDeviceSynchronize());
         }
       }
     }
@@ -1585,7 +1585,7 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
           dim3 nblk, ntpb;
           kernel_launch_param(rcount, &nblk, &ntpb);
           tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
-          tdpDeviceSynchronize();
+          tdpAssert( tdpDeviceSynchronize());
         }
       }
     }
@@ -1941,6 +1941,7 @@ int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count)
     dim3 nblk;
     dim3 ntpb;
     int scount = send_count[ireq]*lb_halo_size(h->slim[ireq]);
+    if (scount == 0) continue;
 
     kernel_launch_param(scount, &nblk, &ntpb);
 
@@ -2007,6 +2008,7 @@ int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count)
 
   for (int ireq = 1; ireq < h->map.nvel; ireq++) {
     int rcount = recv_count[ireq]*lb_halo_size(h->rlim[ireq]);
+    if (rcount == 0) continue;
     tdpGraphNode_t memcpyNode = {0};
 
     if (have_gpu_aware_mpi_) {

From a28d69b232fe5d184784ff56d6ebcdc78dd2bb8c Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Wed, 22 Jan 2025 17:47:13 +0000
Subject: [PATCH 102/133] correct send recv counts

---
 src/lb_data.c | 6 +++---
 src/ludwig.c  | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index dfd85fcd4..74efc01aa 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -58,7 +58,7 @@ static __constant__ lb_collide_param_t static_param;
 /* There are two file-scope switches here, which need to be generalised
  * via some suitable interface; they are separate, but both relate to
  * GPU execution. */
-static const int have_graph_api_ = 0;
+static const int have_graph_api_ = 1;
 #else
 static const int have_graph_api_ = 0;
 #endif
@@ -1940,7 +1940,7 @@ int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count)
     kernelNodeParams.func = (void *) lb_halo_enqueue_send_kernel;
     dim3 nblk;
     dim3 ntpb;
-    int scount = send_count[ireq]*lb_halo_size(h->slim[ireq]);
+    int scount = lb_halo_size(h->slim[ireq]);
     if (scount == 0) continue;
 
     kernel_launch_param(scount, &nblk, &ntpb);
@@ -2007,7 +2007,7 @@ int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count)
   tdpAssert( tdpGraphCreate(&h->grecv.graph, 0) );
 
   for (int ireq = 1; ireq < h->map.nvel; ireq++) {
-    int rcount = recv_count[ireq]*lb_halo_size(h->rlim[ireq]);
+    int rcount = lb_halo_size(h->rlim[ireq]);
     if (rcount == 0) continue;
     tdpGraphNode_t memcpyNode = {0};
 
diff --git a/src/ludwig.c b/src/ludwig.c
index 38ead8241..c3d2b6e13 100644
--- a/src/ludwig.c
+++ b/src/ludwig.c
@@ -2169,9 +2169,9 @@ int ludwig_colloids_update(ludwig_t * ludwig) {
   }
   else {
     /* Pull data back, then full host halo swap */
-    lb_memcpy(ludwig->lb, tdpMemcpyDeviceToHost);
     ludwig->lb->haloscheme = LB_HALO_OPENMP_FULL;
     lb_halo(ludwig->lb);
+    lb_memcpy(ludwig->lb, tdpMemcpyDeviceToHost);
   }
 
   TIMER_stop(TIMER_HALO_LATTICE);

From 9c911d4cc2dfef1074f34912b455ec9c642fef05 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Thu, 23 Jan 2025 09:45:25 +0000
Subject: [PATCH 103/133] use halo->count when deciding to skip building nodes
 which would have zero sized messages

---
 src/lb_data.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 74efc01aa..a568b39ee 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -58,7 +58,7 @@ static __constant__ lb_collide_param_t static_param;
 /* There are two file-scope switches here, which need to be generalised
  * via some suitable interface; they are separate, but both relate to
  * GPU execution. */
-static const int have_graph_api_ = 1;
+static const int have_graph_api_ = 0;
 #else
 static const int have_graph_api_ = 0;
 #endif
@@ -1941,7 +1941,7 @@ int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count)
     dim3 nblk;
     dim3 ntpb;
     int scount = lb_halo_size(h->slim[ireq]);
-    if (scount == 0) continue;
+    if (h->count[ireq] == 0) continue;
 
     kernel_launch_param(scount, &nblk, &ntpb);
 
@@ -1972,14 +1972,14 @@ int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count)
 	      memcpyParams.srcArray = NULL;
 	      memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
 	      memcpyParams.srcPtr   = make_tdpPitchedPtr(h->send_d[ireq],
-						   sizeof(double)*scount,
-						   scount, 1);
+						   sizeof(double)*h->count[ireq]*scount,
+						   h->count[ireq]*scount, 1);
 	      memcpyParams.dstArray = NULL;
 	      memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
 	      memcpyParams.dstPtr   = make_tdpPitchedPtr(h->send[ireq],
-						   sizeof(double)*scount,
-						   scount, 1);
-	      memcpyParams.extent   = make_tdpExtent(sizeof(double)*scount, 1, 1);
+						   sizeof(double)*h->count[ireq]*scount,
+						   h->count[ireq]*scount, 1);
+	      memcpyParams.extent   = make_tdpExtent(sizeof(double)*h->count[ireq]*scount, 1, 1);
 	      memcpyParams.kind     = tdpMemcpyDeviceToHost;
 
 	      tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->gsend.graph,
@@ -2008,7 +2008,7 @@ int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count)
 
   for (int ireq = 1; ireq < h->map.nvel; ireq++) {
     int rcount = lb_halo_size(h->rlim[ireq]);
-    if (rcount == 0) continue;
+    if (h->count[ireq] == 0) continue;
     tdpGraphNode_t memcpyNode = {0};
 
     if (have_gpu_aware_mpi_) {

From e0b7de9b674a07c728c823279960691da2db061a Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Thu, 23 Jan 2025 11:00:55 +0000
Subject: [PATCH 104/133] copy send and receive buffers from device to host if
 not using gpu aware mpi

---
 src/lb_data.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index a568b39ee..275c63abb 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1512,6 +1512,10 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
           kernel_launch_param(scount, &nblk, &ntpb);
           tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
           tdpAssert( tdpDeviceSynchronize());
+ 
+          if (!have_gpu_aware_mpi_) {
+            tdpAssert( tdpMemcpy(h->send[ireq], h->send_d[ireq], sizeof(double)*scount, tdpMemcpyDeviceToHost));
+          }
         }
       }
     }
@@ -1582,6 +1586,9 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
       for (int ireq = 0; ireq < h->map.nvel; ireq++) {
         if (h->count[ireq] > 0) {
           int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
+          if (!have_gpu_aware_mpi_) {
+            tdpAssert( tdpMemcpy(h->recv[ireq], h->recv_d[ireq], sizeof(double)*rcount, tdpMemcpyDeviceToHost));
+          }
           dim3 nblk, ntpb;
           kernel_launch_param(rcount, &nblk, &ntpb);
           tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
@@ -2025,14 +2032,14 @@ int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count)
 	      memcpyParams.srcArray = NULL;
 	      memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
 	      memcpyParams.srcPtr   = make_tdpPitchedPtr(h->recv[ireq],
-						   sizeof(double)*rcount,
-						   rcount, 1);
+						   sizeof(double)*h->count[ireq]*rcount,
+						   h->count[ireq]*rcount, 1);
 	      memcpyParams.dstArray = NULL;
 	      memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
 	      memcpyParams.dstPtr   = make_tdpPitchedPtr(h->recv_d[ireq],
-						   sizeof(double)*rcount,
-						   rcount, 1);
-        memcpyParams.extent   = make_tdpExtent(sizeof(double)*rcount, 1, 1);
+						   sizeof(double)*h->count[ireq]*rcount,
+						   h->count[ireq]*rcount, 1);
+        memcpyParams.extent   = make_tdpExtent(sizeof(double)*h->count[ireq]*rcount, 1, 1);
         memcpyParams.kind     = tdpMemcpyHostToDevice;
 
 	      tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->grecv.graph, NULL,

From 7d1cf9efbb95b1016e4ceeaa87ea525110e3c6e6 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Thu, 23 Jan 2025 12:29:23 +0000
Subject: [PATCH 105/133] don't fiddle with haloscheme

---
 src/lb_data.c | 7 ++++---
 src/ludwig.c  | 1 -
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 275c63abb..1ea86f785 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -58,7 +58,7 @@ static __constant__ lb_collide_param_t static_param;
 /* There are two file-scope switches here, which need to be generalised
  * via some suitable interface; they are separate, but both relate to
  * GPU execution. */
-static const int have_graph_api_ = 0;
+static const int have_graph_api_ = 1;
 #else
 static const int have_graph_api_ = 0;
 #endif
@@ -157,7 +157,7 @@ int lb_data_create(pe_t * pe, cs_t * cs, const lb_data_options_t * options,
   }
 
   lb_halo_create(obj, &obj->h, obj->haloscheme);
-  lb_init(obj);
+  lb_init(obj); /* graph node creation should happen after init */
 
   /* i/o metadata */
   {
@@ -1500,7 +1500,7 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
 
   int ndevice;
   tdpGetDeviceCount(&ndevice);
-  if (ndevice > 0 && lb->haloscheme == LB_HALO_TARGET) {
+  if (ndevice > 0) {
     if (have_graph_api_) {
       tdpAssert( tdpGraphLaunch(h->gsend.exec, h->stream) );
       tdpAssert( tdpStreamSynchronize(h->stream) );
@@ -1941,6 +1941,7 @@ int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count)
   for (int ireq = 1; ireq < h->map.nvel; ireq++) {
     tdpGraphNode_t kernelNode;
     tdpKernelNodeParams kernelNodeParams = {0};
+    printf("lb nvel %d\n", lb->nvel);
     void * kernelArgs[3] = {(void *) &lb->target,
                             (void *) &h->target,
                             (void *) &ireq};
diff --git a/src/ludwig.c b/src/ludwig.c
index c3d2b6e13..4f7fbce31 100644
--- a/src/ludwig.c
+++ b/src/ludwig.c
@@ -2169,7 +2169,6 @@ int ludwig_colloids_update(ludwig_t * ludwig) {
   }
   else {
     /* Pull data back, then full host halo swap */
-    ludwig->lb->haloscheme = LB_HALO_OPENMP_FULL;
     lb_halo(ludwig->lb);
     lb_memcpy(ludwig->lb, tdpMemcpyDeviceToHost);
   }

From 2cd14d2458340a1662c66b6affa733e9e3ec76ff Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Thu, 23 Jan 2025 12:32:14 +0000
Subject: [PATCH 106/133] remove debugging print statement

---
 src/lb_data.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 1ea86f785..177e296ab 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1941,7 +1941,6 @@ int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count)
   for (int ireq = 1; ireq < h->map.nvel; ireq++) {
     tdpGraphNode_t kernelNode;
     tdpKernelNodeParams kernelNodeParams = {0};
-    printf("lb nvel %d\n", lb->nvel);
     void * kernelArgs[3] = {(void *) &lb->target,
                             (void *) &h->target,
                             (void *) &ireq};

From ac457af12160437c272c6b51744850ed633701e6 Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Thu, 23 Jan 2025 12:44:57 +0000
Subject: [PATCH 107/133] promote scount to size_t

---
 src/lb_data.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 177e296ab..b53ff319b 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -58,7 +58,7 @@ static __constant__ lb_collide_param_t static_param;
 /* There are two file-scope switches here, which need to be generalised
  * via some suitable interface; they are separate, but both relate to
  * GPU execution. */
-static const int have_graph_api_ = 1;
+static const int have_graph_api_ = 0;
 #else
 static const int have_graph_api_ = 0;
 #endif
@@ -1947,7 +1947,7 @@ int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count)
     kernelNodeParams.func = (void *) lb_halo_enqueue_send_kernel;
     dim3 nblk;
     dim3 ntpb;
-    int scount = lb_halo_size(h->slim[ireq]);
+    size_t scount = lb_halo_size(h->slim[ireq]);
     if (h->count[ireq] == 0) continue;
 
     kernel_launch_param(scount, &nblk, &ntpb);

From c54414006b6639412c601307f850dab9690d644b Mon Sep 17 00:00:00 2001
From: alexei-borissov <a.borissov@epcc.ed.ac.uk>
Date: Thu, 23 Jan 2025 16:28:02 +0000
Subject: [PATCH 108/133] promote rcount to size_t

---
 src/lb_data.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index b53ff319b..a255fd84e 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -58,7 +58,7 @@ static __constant__ lb_collide_param_t static_param;
 /* There are two file-scope switches here, which need to be generalised
  * via some suitable interface; they are separate, but both relate to
  * GPU execution. */
-static const int have_graph_api_ = 0;
+static const int have_graph_api_ = 1;
 #else
 static const int have_graph_api_ = 0;
 #endif
@@ -156,8 +156,8 @@ int lb_data_create(pe_t * pe, cs_t * cs, const lb_data_options_t * options,
     pe_fatal(pe, "calloc(1, lb_collide_param_t) failed\n");
   }
 
-  lb_halo_create(obj, &obj->h, obj->haloscheme);
   lb_init(obj); /* graph node creation should happen after init */
+  lb_halo_create(obj, &obj->h, obj->haloscheme);
 
   /* i/o metadata */
   {
@@ -2014,7 +2014,7 @@ int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count)
   tdpAssert( tdpGraphCreate(&h->grecv.graph, 0) );
 
   for (int ireq = 1; ireq < h->map.nvel; ireq++) {
-    int rcount = lb_halo_size(h->rlim[ireq]);
+    size_t rcount = lb_halo_size(h->rlim[ireq]);
     if (h->count[ireq] == 0) continue;
     tdpGraphNode_t memcpyNode = {0};
 

From 14bab2fe08d817f6ec95ccb3ca501a4b200b1efb Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Tue, 28 Jan 2025 09:48:58 +0000
Subject: [PATCH 109/133] Additional functions to estimate links required

---
 src/colloid_link.c | 56 +++++++++++++++++++++++++++++++++++++++++++---
 src/colloid_link.h | 13 +++++------
 2 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/src/colloid_link.c b/src/colloid_link.c
index 350f33b6f..ee59b88e3 100644
--- a/src/colloid_link.c
+++ b/src/colloid_link.c
@@ -4,17 +4,16 @@
  *
  *  Colloid boundary link structure.
  *
- *  $Id: colloid_link.c,v 1.2 2010-10-15 12:40:02 kevin Exp $
- *
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
- *  (c) 2010 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *****************************************************************************/
 
 #include <assert.h>
+#include <math.h>
 #include <stdlib.h>
 
 #include "colloid_link.h"
@@ -93,3 +92,54 @@ int colloid_link_total(void) {
 
   return nlinks_;
 }
+
+/*****************************************************************************
+ *
+ *  colloid_link_max_2d
+ *
+ *  How many links do we need to allocate for a 2d disk of radius a
+ *  if the model has nvel velocities?
+ *
+ *  In general, this is a complex function of the radius, and the
+ *  position of the centre relative to the lattice. However, we
+ *  can make an estimate based on the perimeter length 2 pi a.
+ *
+ *  The estimate is ticklish in the limit a -> 0, where we need
+ *  at least (nvel - 1) links. However, 2d radii should probably
+ *  not be less than a ~ 4.0 in real application.
+ *
+ *  For each unit length of perimeter, we allow (nvel - 1)/2 links
+ *  (i.e, half the non-zero links possible).
+ *
+ *  Everything else is rounded up, as we want to ensure there are
+ *  sufficient links in all cases, and don't care too much about
+ *  overestimating. In contrast, an underestimate would be fatal.
+ *
+ *****************************************************************************/
+
+int colloid_link_max_2d(double a, int nvel) {
+
+  int pi = 4;                            /* Integer over-estimate of pi */
+  int ai = fmax(4.0, ceil(a));           /* a rounded up; never below 4 */
+
+  return 2*pi*ai*(nvel - 1)/2;           /* (2 pi a) x (nvel - 1)/2 */
+}
+
+/*****************************************************************************
+ *
+ *  colloid_link_max_3d
+ *
+ *  This is as for the 2d case (see comments above), except that the
+ *  estimate is based on the surface area 4 pi a^2.
+ *
+ *  A minimum reasonable radius in 3d is a ~ 1.0.
+ *
+ *****************************************************************************/
+
+int colloid_link_max_3d(double a, int nvel) {
+
+  int pi = 4;                             /* Integer over-estimate of pi */
+  int ai = fmax(1.0, ceil(a));            /* a rounded up; never below 1 */
+
+  return 4*pi*ai*ai*(nvel - 1)/2;         /* (4 pi a^2) x (nvel - 1)/2 */
+}
diff --git a/src/colloid_link.h b/src/colloid_link.h
index 025ff049e..5f3d6a8ea 100644
--- a/src/colloid_link.h
+++ b/src/colloid_link.h
@@ -2,14 +2,10 @@
  *
  *  colloid_link.h
  *
- *  The implementation is exposed for the time being.
- *
- *  $Id: colloid_link.h,v 1.2 2010-10-15 12:40:02 kevin Exp $
- *
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2017 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -23,7 +19,7 @@ typedef struct colloid_link_type colloid_link_t;
 
 struct colloid_link_type {
 
-  int    i;               /* Index of lattice site outside colloid */ 
+  int    i;               /* Index of lattice site outside colloid */
   int    j;               /* Index of lattice site inside */
   int    p;               /* Index of velocity connecting i -> j */
   int    status;          /* What is at site i (fluid, solid, etc) */
@@ -34,11 +30,14 @@ struct colloid_link_type {
   colloid_link_t * next;  /* Linked list */
 };
 
-enum link_status {LINK_FLUID, LINK_COLLOID, LINK_BOUNDARY, LINK_UNUSED}; 
+enum link_status {LINK_FLUID, LINK_COLLOID, LINK_BOUNDARY, LINK_UNUSED};
 
 colloid_link_t * colloid_link_allocate(void);
 void             colloid_link_free_list(colloid_link_t * link);
 int              colloid_link_count(colloid_link_t * link);
 int              colloid_link_total(void);
 
+int colloid_link_max_2d(double a, int nvel);
+int colloid_link_max_3d(double a, int nvel);
+
 #endif

From cffeedf98ee71fa2943f111ca522dab293882ef6 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Tue, 28 Jan 2025 09:49:22 +0000
Subject: [PATCH 110/133] New state i/o to buffer

---
 src/colloid_state_io.c | 292 +++++++++++++++++++++++++++++++++++++++++
 src/colloid_state_io.h |  24 ++++
 2 files changed, 316 insertions(+)
 create mode 100644 src/colloid_state_io.c
 create mode 100644 src/colloid_state_io.h

diff --git a/src/colloid_state_io.c b/src/colloid_state_io.c
new file mode 100644
index 000000000..74c552bd5
--- /dev/null
+++ b/src/colloid_state_io.c
@@ -0,0 +1,292 @@
+/*****************************************************************************
+ *
+ *  colloid_state_io.c
+ *
+ *  Basic colloid state i/o functions (ASCII and binary) to and from a buffer.
+ *
+ *****************************************************************************/
+
+#include <string.h>
+
+#include "colloid_state_io.h"
+
+/*****************************************************************************
+ *
+ *  colloid_state_io_write_buf
+ *
+ *  Write exactly sizeof(colloid_state_t) bytes to the buffer, being
+ *  a direct binary copy of the colloid state s.
+ *
+ *****************************************************************************/
+
+int colloid_state_io_write_buf(const colloid_state_t * s, char * buf) {
+
+  /* The binary image is a plain copy of the structure: buf must have
+   * room for at least sizeof(colloid_state_t) bytes.
+   * Returns 0 on success, or -1 if either argument is NULL. */
+
+  const int have_args = (s != NULL && buf != NULL);
+
+  if (!have_args) return -1;
+
+  memcpy(buf, s, sizeof(colloid_state_t));
+  return 0;
+}
+
+/*****************************************************************************
+ *
+ *  colloid_state_io_write_buf_ascii
+ *
+ *  Write the state s as ASCII text to buf (without a terminating '\0').
+ *
+ *****************************************************************************/
+
+int colloid_state_io_write_buf_ascii(const colloid_state_t * s, char * buf) {
+  /* Returns 0 on success; negative for NULL argument or bad output length */
+  int ifail = 0;
+
+  if (s == NULL || buf == NULL) {
+    ifail = -1;
+  }
+  else {
+    const size_t item = 25*sizeof(char); /* Each datum takes 25 char: */
+    const char * i1format = "%24d\n";    /* 24 + new line */
+    const char * d3format = "%24.15e ";  /* 24 + space */
+    const char * d1format = "%24.15e\n"; /* 24 + new line */
+
+    int nwrite = 0;                     /* number of chars written */
+    char cbuf[1 + NTOT_VAR*25] = {0};   /* local buffer; add 1 for the '\0' */
+
+    /* Write to the local buffer cbuf, then copy to buf without the '\0'. */
+    /* Each item's trailing '\0' is overwritten by the start of the next. */
+    nwrite += snprintf(cbuf          , 1 + item, i1format, s->index);
+    nwrite += snprintf(cbuf +  1*item, 1 + item, i1format, s->rebuild);
+    nwrite += snprintf(cbuf +  2*item, 1 + item, i1format, s->nbonds);
+    nwrite += snprintf(cbuf +  3*item, 1 + item, i1format, s->nangles);
+    nwrite += snprintf(cbuf +  4*item, 1 + item, i1format, s->isfixedr);
+    nwrite += snprintf(cbuf +  5*item, 1 + item, i1format, s->isfixedv);
+    nwrite += snprintf(cbuf +  6*item, 1 + item, i1format, s->isfixedw);
+    nwrite += snprintf(cbuf +  7*item, 1 + item, i1format, s->isfixeds);
+    nwrite += snprintf(cbuf +  8*item, 1 + item, i1format, s->type);
+    nwrite += snprintf(cbuf +  9*item, 1 + item, i1format, s->bond[0]);
+    nwrite += snprintf(cbuf + 10*item, 1 + item, i1format, s->bond[1]);
+    nwrite += snprintf(cbuf + 11*item, 1 + item, i1format, s->rng);
+    nwrite += snprintf(cbuf + 12*item, 1 + item, i1format, s->isfixedrxyz[0]);
+    nwrite += snprintf(cbuf + 13*item, 1 + item, i1format, s->isfixedrxyz[1]);
+    nwrite += snprintf(cbuf + 14*item, 1 + item, i1format, s->isfixedrxyz[2]);
+    nwrite += snprintf(cbuf + 15*item, 1 + item, i1format, s->isfixedvxyz[0]);
+    nwrite += snprintf(cbuf + 16*item, 1 + item, i1format, s->isfixedvxyz[1]);
+    nwrite += snprintf(cbuf + 17*item, 1 + item, i1format, s->isfixedvxyz[2]);
+    nwrite += snprintf(cbuf + 18*item, 1 + item, i1format, s->inter_type);
+    /* FIXME: the ioversion needs to be correct ... */
+    nwrite += snprintf(cbuf + 19*item, 1 + item, i1format, s->ioversion);
+    nwrite += snprintf(cbuf + 20*item, 1 + item, i1format, s->bc);
+    nwrite += snprintf(cbuf + 21*item, 1 + item, i1format, s->shape);
+    nwrite += snprintf(cbuf + 22*item, 1 + item, i1format, s->active);
+    nwrite += snprintf(cbuf + 23*item, 1 + item, i1format, s->magnetic);
+    nwrite += snprintf(cbuf + 24*item, 1 + item, i1format, s->attr);
+    nwrite += snprintf(cbuf + 25*item, 1 + item, i1format, s->intpad[0]);
+    nwrite += snprintf(cbuf + 26*item, 1 + item, i1format, s->intpad[1]);
+    nwrite += snprintf(cbuf + 27*item, 1 + item, i1format, s->intpad[2]);
+    nwrite += snprintf(cbuf + 28*item, 1 + item, i1format, s->intpad[3]);
+    nwrite += snprintf(cbuf + 29*item, 1 + item, i1format, s->intpad[4]);
+    nwrite += snprintf(cbuf + 30*item, 1 + item, i1format, s->intpad[5]);
+    nwrite += snprintf(cbuf + 31*item, 1 + item, i1format, s->intpad[6]);
+
+    /* Integers: 32 items of 25 char are expected at this point */
+    if (nwrite != 32*item) ifail = -1;
+
+    /* Doubles */
+    /* For historical reasons, vectors r, v, etc appear on one line */
+
+    nwrite += snprintf(cbuf + 32*item, 1 + item, d1format, s->a0);
+    nwrite += snprintf(cbuf + 33*item, 1 + item, d1format, s->ah);
+    nwrite += snprintf(cbuf + 34*item, 1 + item, d3format, s->r[0]);
+    nwrite += snprintf(cbuf + 35*item, 1 + item, d3format, s->r[1]);
+    nwrite += snprintf(cbuf + 36*item, 1 + item, d1format, s->r[2]);
+    nwrite += snprintf(cbuf + 37*item, 1 + item, d3format, s->v[0]);
+    nwrite += snprintf(cbuf + 38*item, 1 + item, d3format, s->v[1]);
+    nwrite += snprintf(cbuf + 39*item, 1 + item, d1format, s->v[2]);
+    nwrite += snprintf(cbuf + 40*item, 1 + item, d3format, s->w[0]);
+    nwrite += snprintf(cbuf + 41*item, 1 + item, d3format, s->w[1]);
+    nwrite += snprintf(cbuf + 42*item, 1 + item, d1format, s->w[2]);
+    nwrite += snprintf(cbuf + 43*item, 1 + item, d3format, s->s[0]);
+    nwrite += snprintf(cbuf + 44*item, 1 + item, d3format, s->s[1]);
+    nwrite += snprintf(cbuf + 45*item, 1 + item, d1format, s->s[2]);
+    nwrite += snprintf(cbuf + 46*item, 1 + item, d3format, s->m[0]);
+    nwrite += snprintf(cbuf + 47*item, 1 + item, d3format, s->m[1]);
+    nwrite += snprintf(cbuf + 48*item, 1 + item, d1format, s->m[2]);
+    nwrite += snprintf(cbuf + 49*item, 1 + item, d1format, s->b1);
+    nwrite += snprintf(cbuf + 50*item, 1 + item, d1format, s->b2);
+    nwrite += snprintf(cbuf + 51*item, 1 + item, d1format, s->c);
+    nwrite += snprintf(cbuf + 52*item, 1 + item, d1format, s->h);
+    nwrite += snprintf(cbuf + 53*item, 1 + item, d3format, s->dr[0]);
+    nwrite += snprintf(cbuf + 54*item, 1 + item, d3format, s->dr[1]);
+    nwrite += snprintf(cbuf + 55*item, 1 + item, d1format, s->dr[2]);
+    nwrite += snprintf(cbuf + 56*item, 1 + item, d1format, s->deltaphi);
+    nwrite += snprintf(cbuf + 57*item, 1 + item, d1format, s->q0);
+    nwrite += snprintf(cbuf + 58*item, 1 + item, d1format, s->q1);
+    nwrite += snprintf(cbuf + 59*item, 1 + item, d1format, s->epsilon);
+    nwrite += snprintf(cbuf + 60*item, 1 + item, d1format, s->deltaq0);
+    nwrite += snprintf(cbuf + 61*item, 1 + item, d1format, s->deltaq1);
+    nwrite += snprintf(cbuf + 62*item, 1 + item, d1format, s->sa);
+    nwrite += snprintf(cbuf + 63*item, 1 + item, d1format, s->saf);
+    nwrite += snprintf(cbuf + 64*item, 1 + item, d1format, s->al);
+    nwrite += snprintf(cbuf + 65*item, 1 + item, d1format, s->elabc[0]);
+    nwrite += snprintf(cbuf + 66*item, 1 + item, d1format, s->elabc[1]);
+    nwrite += snprintf(cbuf + 67*item, 1 + item, d1format, s->elabc[2]);
+    nwrite += snprintf(cbuf + 68*item, 1 + item, d1format, s->quat[0]);
+    nwrite += snprintf(cbuf + 69*item, 1 + item, d1format, s->quat[1]);
+    nwrite += snprintf(cbuf + 70*item, 1 + item, d1format, s->quat[2]);
+    nwrite += snprintf(cbuf + 71*item, 1 + item, d1format, s->quat[3]);
+    nwrite += snprintf(cbuf + 72*item, 1 + item, d1format, s->quatold[0]);
+    nwrite += snprintf(cbuf + 73*item, 1 + item, d1format, s->quatold[1]);
+    nwrite += snprintf(cbuf + 74*item, 1 + item, d1format, s->quatold[2]);
+    nwrite += snprintf(cbuf + 75*item, 1 + item, d1format, s->quatold[3]);
+    nwrite += snprintf(cbuf + 76*item, 1 + item, d1format, s->dpad[0]);
+    nwrite += snprintf(cbuf + 77*item, 1 + item, d1format, s->dpad[1]);
+    nwrite += snprintf(cbuf + 78*item, 1 + item, d1format, s->dpad[2]);
+    nwrite += snprintf(cbuf + 79*item, 1 + item, d1format, s->dpad[3]);
+
+    if (nwrite != NTOT_VAR*item) ifail = -2;
+
+    /* Finally, NTOT_VAR*25 char go to buf, which must have room for them */
+    memcpy(buf, cbuf, NTOT_VAR*item*sizeof(char));
+  }
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  colloid_state_io_read_buf
+ *
+ *****************************************************************************/
+
+int colloid_state_io_read_buf(colloid_state_t * s, const char * buf) {
+
+  /* The inverse of the binary write: the first sizeof(colloid_state_t)
+   * bytes of buf are copied directly into the state s.
+   * Returns 0 on success, or -1 if either argument is NULL. */
+
+  const int have_args = (s != NULL && buf != NULL);
+
+  if (!have_args) return -1;
+
+  memcpy(s, buf, sizeof(colloid_state_t));
+  return 0;
+}
+
+/*****************************************************************************
+ *
+ *  colloid_state_io_read_buf_ascii
+ *
+ *****************************************************************************/
+
+int colloid_state_io_read_buf_ascii(colloid_state_t * s, const char * buf) {
+
+  int ifail = 0;      /* 0 success; -1 NULL argument; -2 item(s) not read */
+
+  if (s == NULL || buf == NULL) {
+    ifail = -1;
+  }
+  else {
+    /* Make sure there is a \0 before we get to sscanf, hence the memcpy() */
+    int nr = 0;                       /* number of items read */
+    int sz = 25*sizeof(char);         /* each item occupies 25 char */
+    char tmp[BUFSIZ] = {0};           /* tmp[sz] is always '\0' */
+    /* Integers (buf must hold at least NTOT_VAR*sz char) */
+    memcpy(tmp, buf +  0*sz, sz); nr += sscanf(tmp, "%d", &s->index);
+    memcpy(tmp, buf +  1*sz, sz); nr += sscanf(tmp, "%d", &s->rebuild);
+    memcpy(tmp, buf +  2*sz, sz); nr += sscanf(tmp, "%d", &s->nbonds);
+    memcpy(tmp, buf +  3*sz, sz); nr += sscanf(tmp, "%d", &s->nangles);
+    memcpy(tmp, buf +  4*sz, sz); nr += sscanf(tmp, "%d", &s->isfixedr);
+    memcpy(tmp, buf +  5*sz, sz); nr += sscanf(tmp, "%d", &s->isfixedv);
+    memcpy(tmp, buf +  6*sz, sz); nr += sscanf(tmp, "%d", &s->isfixedw);
+    memcpy(tmp, buf +  7*sz, sz); nr += sscanf(tmp, "%d", &s->isfixeds);
+    memcpy(tmp, buf +  8*sz, sz); nr += sscanf(tmp, "%d", &s->type);
+    memcpy(tmp, buf +  9*sz, sz); nr += sscanf(tmp, "%d", &s->bond[0]);
+    memcpy(tmp, buf + 10*sz, sz); nr += sscanf(tmp, "%d", &s->bond[1]);
+    memcpy(tmp, buf + 11*sz, sz); nr += sscanf(tmp, "%d", &s->rng);
+    memcpy(tmp, buf + 12*sz, sz); nr += sscanf(tmp, "%d", &s->isfixedrxyz[0]);
+    memcpy(tmp, buf + 13*sz, sz); nr += sscanf(tmp, "%d", &s->isfixedrxyz[1]);
+    memcpy(tmp, buf + 14*sz, sz); nr += sscanf(tmp, "%d", &s->isfixedrxyz[2]);
+    memcpy(tmp, buf + 15*sz, sz); nr += sscanf(tmp, "%d", &s->isfixedvxyz[0]);
+    memcpy(tmp, buf + 16*sz, sz); nr += sscanf(tmp, "%d", &s->isfixedvxyz[1]);
+    memcpy(tmp, buf + 17*sz, sz); nr += sscanf(tmp, "%d", &s->isfixedvxyz[2]);
+
+    memcpy(tmp, buf + 18*sz, sz); nr += sscanf(tmp, "%d", &s->inter_type);
+    memcpy(tmp, buf + 19*sz, sz); nr += sscanf(tmp, "%d", &s->ioversion);
+    memcpy(tmp, buf + 20*sz, sz); nr += sscanf(tmp, "%d", &s->bc);
+    memcpy(tmp, buf + 21*sz, sz); nr += sscanf(tmp, "%d", &s->shape);
+    memcpy(tmp, buf + 22*sz, sz); nr += sscanf(tmp, "%d", &s->active);
+    memcpy(tmp, buf + 23*sz, sz); nr += sscanf(tmp, "%d", &s->magnetic);
+    memcpy(tmp, buf + 24*sz, sz); nr += sscanf(tmp, "%d", &s->attr);
+    memcpy(tmp, buf + 25*sz, sz); nr += sscanf(tmp, "%d", &s->intpad[0]);
+    memcpy(tmp, buf + 26*sz, sz); nr += sscanf(tmp, "%d", &s->intpad[1]);
+    memcpy(tmp, buf + 27*sz, sz); nr += sscanf(tmp, "%d", &s->intpad[2]);
+    memcpy(tmp, buf + 28*sz, sz); nr += sscanf(tmp, "%d", &s->intpad[3]);
+    memcpy(tmp, buf + 29*sz, sz); nr += sscanf(tmp, "%d", &s->intpad[4]);
+    memcpy(tmp, buf + 30*sz, sz); nr += sscanf(tmp, "%d", &s->intpad[5]);
+    memcpy(tmp, buf + 31*sz, sz); nr += sscanf(tmp, "%d", &s->intpad[6]);
+    /* sscanf() returns the number of items assigned: one per call */
+    if (nr != 32) ifail = -1;
+
+    /* Doubles */
+    memcpy(tmp, buf + 32*sz, sz); nr += sscanf(tmp, "%le", &s->a0);
+    memcpy(tmp, buf + 33*sz, sz); nr += sscanf(tmp, "%le", &s->ah);
+    memcpy(tmp, buf + 34*sz, sz); nr += sscanf(tmp, "%le", &s->r[0]);
+    memcpy(tmp, buf + 35*sz, sz); nr += sscanf(tmp, "%le", &s->r[1]);
+    memcpy(tmp, buf + 36*sz, sz); nr += sscanf(tmp, "%le", &s->r[2]);
+    memcpy(tmp, buf + 37*sz, sz); nr += sscanf(tmp, "%le", &s->v[0]);
+    memcpy(tmp, buf + 38*sz, sz); nr += sscanf(tmp, "%le", &s->v[1]);
+    memcpy(tmp, buf + 39*sz, sz); nr += sscanf(tmp, "%le", &s->v[2]);
+    memcpy(tmp, buf + 40*sz, sz); nr += sscanf(tmp, "%le", &s->w[0]);
+    memcpy(tmp, buf + 41*sz, sz); nr += sscanf(tmp, "%le", &s->w[1]);
+    memcpy(tmp, buf + 42*sz, sz); nr += sscanf(tmp, "%le", &s->w[2]);
+    memcpy(tmp, buf + 43*sz, sz); nr += sscanf(tmp, "%le", &s->s[0]);
+    memcpy(tmp, buf + 44*sz, sz); nr += sscanf(tmp, "%le", &s->s[1]);
+    memcpy(tmp, buf + 45*sz, sz); nr += sscanf(tmp, "%le", &s->s[2]);
+    memcpy(tmp, buf + 46*sz, sz); nr += sscanf(tmp, "%le", &s->m[0]);
+    memcpy(tmp, buf + 47*sz, sz); nr += sscanf(tmp, "%le", &s->m[1]);
+    memcpy(tmp, buf + 48*sz, sz); nr += sscanf(tmp, "%le", &s->m[2]);
+    memcpy(tmp, buf + 49*sz, sz); nr += sscanf(tmp, "%le", &s->b1);
+    memcpy(tmp, buf + 50*sz, sz); nr += sscanf(tmp, "%le", &s->b2);
+    memcpy(tmp, buf + 51*sz, sz); nr += sscanf(tmp, "%le", &s->c);
+    memcpy(tmp, buf + 52*sz, sz); nr += sscanf(tmp, "%le", &s->h);
+    memcpy(tmp, buf + 53*sz, sz); nr += sscanf(tmp, "%le", &s->dr[0]);
+    memcpy(tmp, buf + 54*sz, sz); nr += sscanf(tmp, "%le", &s->dr[1]);
+    memcpy(tmp, buf + 55*sz, sz); nr += sscanf(tmp, "%le", &s->dr[2]);
+    memcpy(tmp, buf + 56*sz, sz); nr += sscanf(tmp, "%le", &s->deltaphi);
+    memcpy(tmp, buf + 57*sz, sz); nr += sscanf(tmp, "%le", &s->q0);
+    memcpy(tmp, buf + 58*sz, sz); nr += sscanf(tmp, "%le", &s->q1);
+    memcpy(tmp, buf + 59*sz, sz); nr += sscanf(tmp, "%le", &s->epsilon);
+    memcpy(tmp, buf + 60*sz, sz); nr += sscanf(tmp, "%le", &s->deltaq0);
+    memcpy(tmp, buf + 61*sz, sz); nr += sscanf(tmp, "%le", &s->deltaq1);
+    memcpy(tmp, buf + 62*sz, sz); nr += sscanf(tmp, "%le", &s->sa);
+    memcpy(tmp, buf + 63*sz, sz); nr += sscanf(tmp, "%le", &s->saf);
+    memcpy(tmp, buf + 64*sz, sz); nr += sscanf(tmp, "%le", &s->al);
+    memcpy(tmp, buf + 65*sz, sz); nr += sscanf(tmp, "%le", &s->elabc[0]);
+    memcpy(tmp, buf + 66*sz, sz); nr += sscanf(tmp, "%le", &s->elabc[1]);
+    memcpy(tmp, buf + 67*sz, sz); nr += sscanf(tmp, "%le", &s->elabc[2]);
+
+    memcpy(tmp, buf + 68*sz, sz); nr += sscanf(tmp, "%le", &s->quat[0]);
+    memcpy(tmp, buf + 69*sz, sz); nr += sscanf(tmp, "%le", &s->quat[1]);
+    memcpy(tmp, buf + 70*sz, sz); nr += sscanf(tmp, "%le", &s->quat[2]);
+    memcpy(tmp, buf + 71*sz, sz); nr += sscanf(tmp, "%le", &s->quat[3]);
+
+    memcpy(tmp, buf + 72*sz, sz); nr += sscanf(tmp, "%le", &s->quatold[0]);
+    memcpy(tmp, buf + 73*sz, sz); nr += sscanf(tmp, "%le", &s->quatold[1]);
+    memcpy(tmp, buf + 74*sz, sz); nr += sscanf(tmp, "%le", &s->quatold[2]);
+    memcpy(tmp, buf + 75*sz, sz); nr += sscanf(tmp, "%le", &s->quatold[3]);
+
+    memcpy(tmp, buf + 76*sz, sz); nr += sscanf(tmp, "%le", &s->dpad[0]);
+    memcpy(tmp, buf + 77*sz, sz); nr += sscanf(tmp, "%le", &s->dpad[1]);
+    memcpy(tmp, buf + 78*sz, sz); nr += sscanf(tmp, "%le", &s->dpad[2]);
+    memcpy(tmp, buf + 79*sz, sz); nr += sscanf(tmp, "%le", &s->dpad[3]);
+
+    if (nr != NTOT_VAR) ifail = -2;
+  }
+
+  return ifail;
+}
diff --git a/src/colloid_state_io.h b/src/colloid_state_io.h
new file mode 100644
index 000000000..526b34743
--- /dev/null
+++ b/src/colloid_state_io.h
@@ -0,0 +1,24 @@
+/*****************************************************************************
+ *
+ *  colloid_state_io.h
+ *
+ *  Edinburgh Soft Matter and Statistical Physics Group and
+ *  Edinburgh Parallel Computing Centre
+ *
+ *  (c) 2025 The University of Edinburgh
+ *
+ *****************************************************************************/
+
+#ifndef LUDWIG_COLLOID_STATE_IO_H
+#define LUDWIG_COLLOID_STATE_IO_H
+
+#include "colloid.h"
+
+#define COLLOID_BUFSZ (NTOT_VAR*25) /* ASCII record: 25 char per item */
+/* Each function returns 0 on success, and a negative value on failure. */
+int colloid_state_io_write_buf(const colloid_state_t * s, char * buf);
+int colloid_state_io_write_buf_ascii(const colloid_state_t * s, char * buf);
+int colloid_state_io_read_buf(colloid_state_t * s, const char * buf);
+int colloid_state_io_read_buf_ascii(colloid_state_t * s, const char * buf);
+
+#endif

From 61cd0a29ec392ae3021c5529326576e1f6a72f00 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Tue, 28 Jan 2025 09:49:56 +0000
Subject: [PATCH 111/133] Additional test

---
 tests/unit/test_colloid_links.c    | 353 +++++++++++++++++++++++++++++
 tests/unit/test_colloid_state_io.c | 315 +++++++++++++++++++++++++
 tests/unit/tests.c                 |   4 +-
 tests/unit/tests.h                 |   4 +-
 4 files changed, 674 insertions(+), 2 deletions(-)
 create mode 100644 tests/unit/test_colloid_links.c
 create mode 100644 tests/unit/test_colloid_state_io.c

diff --git a/tests/unit/test_colloid_links.c b/tests/unit/test_colloid_links.c
new file mode 100644
index 000000000..8e87ebca6
--- /dev/null
+++ b/tests/unit/test_colloid_links.c
@@ -0,0 +1,353 @@
+/*****************************************************************************
+ *
+ *  test_colloid_links.c
+ *
+ *****************************************************************************/
+
+#include <assert.h>
+#include <math.h>
+
+#include "pe.h"
+#include "colloid_link.h"
+
+int test_colloid_link_max_2d_d2q9(void);
+int test_colloid_link_max_3d_d3q15(void);
+int test_colloid_link_max_3d_d3q19(void);
+int test_colloid_link_max_3d_d3q27(void);
+
+/*****************************************************************************
+ *
+ *  test_colloid_link_suite
+ *
+ *****************************************************************************/
+
+int test_colloid_link_suite(void) {
+
+  pe_t * pe = NULL;
+
+  pe_create(MPI_COMM_WORLD, PE_QUIET, &pe);
+  /* Failures are caught by assert() within the individual tests */
+  test_colloid_link_max_2d_d2q9();
+  test_colloid_link_max_3d_d3q15();
+  test_colloid_link_max_3d_d3q19();
+  test_colloid_link_max_3d_d3q27();
+  pe_info(pe, "%-9s %s\n", "PASS", __FILE__);
+  pe_free(pe);
+
+  return 0;
+}
+
+/*****************************************************************************
+ *
+ *  test_colloid_link_max_2d_d2q9
+ *
+ *  Maximum number of links for a disk with radius a in the d2q9 model.
+ *  We consider disks with centre at three different symmetry points of
+ *  the lattice:
+ *
+ *       .   .   .      .   .   .     .   .   .
+ *                                          Z
+ *       .   X   .      .   . Y .     .   .   .
+ *
+ *       .   .   .      .   .   .     .   .   .
+ *
+ *  with the origin being at a lattice point (X). The lattice vector
+ *  joining the colloid centre to a lattice site determines a set of
+ *  radii at which new lattice sites are incorporated into the
+ *  discrete surface.
+ *
+ *  X case: centre at (0,0) The numbers are by drawing a diagram.
+ *
+ *  Vector      Radius       No. inside      Links
+ *  ----------------------------------------------
+ *              a > 0                 1          8
+ *  (1, 0)      a > sqrt(  1)         5         24
+ *  (1, 1)      a > sqrt(  2)         9         32
+ *  (2, 0)      a > sqrt(  4)        13         40
+ *  (2, 1)      a > sqrt(  5)        21         48
+ *  (2, 2)      a > sqrt(  8)        25         56
+ *  (3, 0)      a > sqrt(  9)        29         64
+ *  (3, 1)      a > sqrt( 10)        37         64
+ *  (3, 2)      a > sqrt( 13)        45         72
+ *  (4, 0)      a > sqrt( 16)        49         80
+ *  (4, 1)      a > sqrt( 17)        57         80
+ *  (3, 3)      a > sqrt( 18)        61         88
+ *  (4, 2)      a > sqrt( 20)        69         88
+ *  (4, 3)      a > sqrt( 25)
+ *  (5, 0)      a > sqrt( 25)        81        104
+ *  ...
+ *  (5, 5)      a > sqrt( 50)       161        144
+ *  (9, 9)      a > sqrt(162)       509        248
+ *  (15,15)     a > sqrt(450)      1425        416
+ *
+ *  Y case: centre at (1/2, 0)
+ *
+ *  Vector      Radius       No. inside      Links
+ *  ----------------------------------------------
+ *  (1/2, 0)    a > sqrt( 1/4)        2         14
+ *  (1/2, 1)    a > sqrt( 5/4)        6         26
+ *  (3/2, 0)    a > sqrt( 9/4)        8         30
+ *  (3/2, 1)    a > sqrt(13/4)       12         38
+ *  (1/2, 2)    a > sqrt(17/4)       16         42
+ *  (3/2, 2)    a > sqrt(25/4)        -                             -
+ *  (5/2, 0)    a > sqrt(25/4)       22         54
+ *
+ *  Z case: centre at (1/2, 1/2)
+ *
+ *  Vector      Radius       No. inside      Links
+ *  ----------------------------------------------
+ *  (1/2, 1/2)  a > sqrt( 2/4)        4         20
+ *
+ *
+ *  It is enough to check the X case.
+ *
+ *****************************************************************************/
+
+int test_colloid_link_max_2d_d2q9(void) {
+
+  const int nvel = 9;
+  int ifail = 0;
+
+  /* X case: each nlink is the tabulated link count for the radius a */
+  {
+    const double a = sqrt(1.0);
+    const int nlink = 24;
+    const int nmax = colloid_link_max_2d(a, nvel);
+    if (nlink > nmax) ifail = 1;
+    assert(ifail == 0);
+  }
+
+  {
+    const double a = sqrt(10.0);
+    const int nlink = 64;
+    const int nmax = colloid_link_max_2d(a, nvel);
+
+    if (nlink > nmax) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  /* The more interesting cases have a > sqrt(16) */
+  {
+    const double a = sqrt(162.0);
+    const int nlink = 248;
+    const int nmax = colloid_link_max_2d(a, nvel);
+
+    if (nlink > nmax) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  {
+    const double a = sqrt(450.0);
+    const int nlink = 416;
+    const int nmax = colloid_link_max_2d(a, nvel);
+
+    if (nlink > nmax) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  test_colloid_link_max_3d_d3q15
+ *
+ *  Drawing a diagram is less practical than in 2d. There are many
+ *  possibilities.
+ *
+ *  X case: centre at (0,0,0)
+ *
+ *  Vector        Radius       No. inside      Links
+ *  ------------------------------------------------
+ *                a > 0                 1         14
+ *  (1, 0, 0)     a > sqrt(  1)         7         86
+ *  (1, 1, 0)     a > sqrt(  2)        19        158
+ *  (1, 1, 1)     a > sqrt(  3)        27        206
+ *  (2, 0, 0)     a > sqrt(  4)        33        230
+ *  (2, 1, 0)     a > sqrt(  5)        57        374
+ *  (2, 1, 1)     a > sqrt(  6)        81        422
+ *  (2, 2, 0)     a > sqrt(  8)        93        494
+ *  (2, 2, 1)     a > sqrt(  9)       123        614
+ *  (3, 1, 0)     a > sqrt( 10)       147        662
+ *  (3, 1, 1)     a > sqrt( 11)       171        710
+ *  (2, 2, 2)     a > sqrt( 12)       179        710
+ *  (3, 2, 1)     a > sqrt( 14)       251        950
+ *  (4, 0, 0)     a > sqrt( 16)       257        974
+ *  ...
+ *  (4, 4, 4)     a > sqrt( 48)      1365       2894
+ *  (9, 9, 9)     a > sqrt(243)     15895      15230
+ *
+ *****************************************************************************/
+
+int test_colloid_link_max_3d_d3q15(void) {
+
+  const int nvel = 15;
+  int ifail = 0;
+
+  {
+    const double a = sqrt(1.0);
+    const int nlink = 86;
+    const int nmax = colloid_link_max_3d(a, nvel);
+
+    if (nlink > nmax) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  {
+    const double a = sqrt(2.0);
+    const int nlink = 158;
+    const int nmax = colloid_link_max_3d(a, nvel);
+
+    if (nlink > nmax) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  /* Larger radii: the estimate must still not fall below the count */
+  {
+    const double a = sqrt(48.0);
+    const int nlink = 2894;
+    const int nmax = colloid_link_max_3d(a, nvel);
+
+    if (nlink > nmax) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  {
+    const double a = sqrt(243.0);
+    const int nlink = 15230;
+    const int nmax = colloid_link_max_3d(a, nvel);
+
+    if (nlink > nmax) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  test_colloid_link_max_3d_d3q19
+ *
+ *  The lattice vectors are the same as d3q15.
+ *
+ *  X case: centre at (0,0,0)
+ *
+ *  Vector        Radius          a0     inside      Links
+ *  ------------------------------------------------------
+ *                a > 0                       1         18
+ *  (1, 0, 0)     a > sqrt(  1)   1.001       7         90
+ *  (1, 1, 0)     a > sqrt(  2)   1.415      19        186
+ *  ...
+ *  (4, 4, 4)     a > sqrt( 48)   6.930    1365       3354
+ *  (9, 9, 9)     a > sqrt(243)  15.590   15895      17562
+ *
+ *****************************************************************************/
+
+int test_colloid_link_max_3d_d3q19(void) {
+
+  int ifail = 0;
+  int nvel = 19;
+
+  {
+    int nlink = 90;                    /* d3q19 count for (1, 0, 0) */
+    double a = sqrt(1.0);
+
+    int nmax = colloid_link_max_3d(a, nvel);
+    if (nmax < nlink) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  {
+    int nlink = 186;                   /* d3q19 count for (1, 1, 0) */
+    double a = sqrt(2.0);
+
+    int nmax = colloid_link_max_3d(a, nvel);
+    if (nmax < nlink) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  /* Larger radii ... */
+  {
+    int nlink = 3354;                  /* (4, 4, 4) */
+    double a = sqrt(48.0);
+
+    int nmax = colloid_link_max_3d(a, nvel);
+    if (nmax < nlink) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  {
+    int nlink = 17562;                 /* (9, 9, 9) */
+    double a = sqrt(243.0);
+
+    int nmax = colloid_link_max_3d(a, nvel);
+    if (nmax < nlink) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  test_colloid_link_max_3d_d3q27
+ *
+ *  The lattice vectors are the same as d3q15.
+ *
+ *  X case: centre at (0,0,0)
+ *
+ *  Vector        Radius          a0     inside      Links
+ *  ------------------------------------------------------
+ *                a > 0                       1         26
+ *  (1, 0, 0)     a > sqrt(  1)   1.001       7        146
+ *  (1, 1, 0)     a > sqrt(  2)   1.415      19        290
+ *  ...
+ *  (4, 4, 4)     a > sqrt( 48)   6.930    1365       5378
+ *  (9, 9, 9)     a > sqrt(243)  15.590   15895      28226
+ *
+ *****************************************************************************/
+
+int test_colloid_link_max_3d_d3q27(void) {
+
+  const int nvel = 27;
+  int ifail = 0;
+
+  {
+    const double a = sqrt(1.0);
+    const int nlink = 146;
+    const int nmax = colloid_link_max_3d(a, nvel);
+
+    if (nlink > nmax) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  {
+    const double a = sqrt(2.0);
+    const int nlink = 290;
+    const int nmax = colloid_link_max_3d(a, nvel);
+
+    if (nlink > nmax) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  /* Larger radii: the estimate must still not fall below the count */
+  {
+    const double a = sqrt(48.0);
+    const int nlink = 5378;
+    const int nmax = colloid_link_max_3d(a, nvel);
+
+    if (nlink > nmax) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  {
+    const double a = sqrt(243.0);
+    const int nlink = 28226;
+    const int nmax = colloid_link_max_3d(a, nvel);
+
+    if (nlink > nmax) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  return ifail;
+}
diff --git a/tests/unit/test_colloid_state_io.c b/tests/unit/test_colloid_state_io.c
new file mode 100644
index 000000000..e7e7cee3b
--- /dev/null
+++ b/tests/unit/test_colloid_state_io.c
@@ -0,0 +1,315 @@
+/*****************************************************************************
+ *
+ *  test_colloid_state_io.c
+ *
+ *****************************************************************************/
+
+#include <assert.h>
+#include <string.h>
+
+#include "pe.h"
+#include "colloid_state_io.h"
+
+int test_colloid_state_io_write_buf(void);
+int test_colloid_state_io_read_buf(void);
+int test_colloid_state_io_write_buf_ascii(void);
+int test_colloid_state_io_read_buf_ascii(void);
+
+
+colloid_state_t util_test_colloid_state(void);
+int util_test_colloid_state_same(const colloid_state_t * s1,
+				 const colloid_state_t * s2);
+
+/*****************************************************************************
+ *
+ *  test_colloid_state_io_suite
+ *
+ *****************************************************************************/
+
+int test_colloid_state_io_suite(void) {
+
+  pe_t * pe = NULL;
+
+  pe_create(MPI_COMM_WORLD, PE_QUIET, &pe);
+  /* Run each buffer test exactly once (previously read_buf ran twice) */
+  test_colloid_state_io_write_buf();
+  test_colloid_state_io_read_buf();
+  test_colloid_state_io_write_buf_ascii();
+  test_colloid_state_io_read_buf_ascii();
+
+
+  pe_info(pe, "%-9s %s\n", "PASS", __FILE__);
+  pe_free(pe);
+
+  return 0;
+}
+
+/*****************************************************************************
+ *
+ *  test_colloid_state_io_write_buf
+ *
+ *****************************************************************************/
+
+int test_colloid_state_io_write_buf(void) {
+
+  int ifail = 0;
+
+  /* Minimal test: write to a raw buffer, copy it back, and compare. */
+  {
+    colloid_state_t sw = util_test_colloid_state();
+    colloid_state_t sr = {0};
+    char buf[sizeof(colloid_state_t)] = {0};
+    int same = 0;
+
+    ifail = colloid_state_io_write_buf(&sw, buf);
+    assert(ifail == 0);
+
+    memcpy(&sr, buf, sizeof(colloid_state_t));
+
+    same = util_test_colloid_state_same(&sw, &sr);
+    if (same == 0) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  test_colloid_state_io_read_buf
+ *
+ *****************************************************************************/
+
+int test_colloid_state_io_read_buf(void) {
+
+  int ifail = 0;
+  /* Fill a raw buffer from known state, read it back, and compare. */
+  {
+    colloid_state_t sw = util_test_colloid_state();
+    colloid_state_t sr = {0};
+    char buf[sizeof(colloid_state_t)] = {0};
+    int same = 0;
+
+    memcpy(buf, &sw, sizeof(colloid_state_t));
+
+    ifail = colloid_state_io_read_buf(&sr, buf);
+    assert(ifail == 0);
+    same  = util_test_colloid_state_same(&sw, &sr);
+    if (same == 0) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  test_colloid_state_io_write_buf_ascii
+ *
+ *****************************************************************************/
+
+int test_colloid_state_io_write_buf_ascii(void) {
+
+  int ifail = 0;
+
+  {
+    colloid_state_t sw = util_test_colloid_state();
+    char buf[1 + COLLOID_BUFSZ] = {0};
+
+    ifail = colloid_state_io_write_buf_ascii(&sw, buf);
+    assert(ifail == 0);
+    assert(strlen(buf) == COLLOID_BUFSZ);
+  }
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  test_colloid_state_io_read_buf_ascii
+ *
+ *****************************************************************************/
+
+int test_colloid_state_io_read_buf_ascii(void) {
+
+  int ifail = 0;
+  /* NOTE(review): uses the binary write/read pair, not an ascii one */
+  {
+    colloid_state_t sw = util_test_colloid_state();
+    colloid_state_t sr = {0};
+    char buf[1 + COLLOID_BUFSZ] = {0};
+    int same = 0;
+
+    ifail = colloid_state_io_write_buf(&sw, buf);
+    assert(ifail == 0);
+    ifail = colloid_state_io_read_buf(&sr, buf);
+    assert(ifail == 0);
+    same = util_test_colloid_state_same(&sw, &sr);
+    if (same == 0) ifail = -1;
+    assert(ifail == 0);
+  }
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  util_test_colloid_state
+ *
+ *  Utility to provide some test data.
+ *
+ *****************************************************************************/
+
+colloid_state_t util_test_colloid_state(void) {
+
+  colloid_state_t s = {
+    .index       =  1,
+    .rebuild     =  2,
+    .nbonds      =  3,
+    .nangles     =  4,
+    .isfixedr    =  5,
+    .isfixedv    =  6,
+    .isfixedw    =  7,
+    .isfixeds    =  8,
+    .type        =  9,
+    .bond        = {10, 11},
+    .rng         = 12,
+    .isfixedrxyz = {13, 14, 15},
+    .isfixedvxyz = {16, 17, 18},
+    .inter_type  = 19,
+    .ioversion   = 20,
+    .bc          = 21,
+    .shape       = 22,
+    .active      = 23,
+    .magnetic    = 24,
+    .attr        = 25,
+    .intpad      = {26, 27, 28, 29, 30, 31, 32}, /* Pads to 32 */
+    .a0          = 33.0,
+    .ah          = 34.0,
+    .r           = {35.0, 36.0, 37.0},
+    .v           = {38.0, 39.0, 40.0},
+    .w           = {41.0, 42.0, 43.0},
+    .s           = {44.0, 45.0, 46.0},
+    .m           = {47.0, 48.0, 49.0},
+    .b1          = 50.0,
+    .b2          = 51.0,
+    .c           = 52.0,
+    .h           = 53.0,
+    .dr          = {54.0, 55.0, 56.0},
+    .deltaphi    = 57.0,
+    .q0          = 58.0,
+    .q1          = 59.0,
+    .epsilon     = 60.0,
+    .deltaq0     = 61.0,
+    .deltaq1     = 62.0,
+    .sa          = 63.0,
+    .saf         = 64.0,
+    .al          = 65.0,
+    .elabc       = {66.0, 67.0, 68.0},
+    .quat        = {69.0, 70.0, 71.0, 72.0},
+    .quatold     = {73.0, 74.0, 75.0, 76.0},
+    .dpad        = {77.0, 78.0, 79.0, 80.0}      /* Pads to 80 */
+  };
+
+  return s;
+}
+
+/*****************************************************************************
+ *
+ *  util_test_colloid_state_same
+ *
+ *  Identical (incl. floating point equality).
+ *
+ *****************************************************************************/
+
+int util_test_colloid_state_same(const colloid_state_t * s1,
+				 const colloid_state_t * s2) {
+  int same = 0;
+  int sum  = 0;
+
+  sum += (s1->index          == s2->index);
+  sum += (s1->rebuild        == s2->rebuild);
+  sum += (s1->nbonds         == s2->nbonds);
+  sum += (s1->nangles        == s2->nangles);
+  sum += (s1->isfixedr       == s2->isfixedr);
+  sum += (s1->isfixedv       == s2->isfixedv);
+  sum += (s1->isfixedw       == s2->isfixedw);
+  sum += (s1->isfixeds       == s2->isfixeds);
+  sum += (s1->type           == s2->type);
+  sum += (s1->bond[0]        == s2->bond[0]);
+  sum += (s1->bond[1]        == s2->bond[1]);
+  sum += (s1->rng            == s2->rng);
+  sum += (s1->isfixedrxyz[0] == s2->isfixedrxyz[0]);
+  sum += (s1->isfixedrxyz[1] == s2->isfixedrxyz[1]);
+  sum += (s1->isfixedrxyz[2] == s2->isfixedrxyz[2]);
+  sum += (s1->isfixedvxyz[0] == s2->isfixedvxyz[0]);
+  sum += (s1->isfixedvxyz[1] == s2->isfixedvxyz[1]);
+  sum += (s1->isfixedvxyz[2] == s2->isfixedvxyz[2]);
+  sum += (s1->inter_type     == s2->inter_type);
+  sum += (s1->ioversion      == s2->ioversion);
+  sum += (s1->bc             == s2->bc);
+  sum += (s1->shape          == s2->shape);
+  sum += (s1->active         == s2->active);
+  sum += (s1->magnetic       == s2->magnetic);
+  sum += (s1->attr           == s2->attr);
+  sum += (s1->intpad[0]      == s2->intpad[0]);
+  sum += (s1->intpad[1]      == s2->intpad[1]);
+  sum += (s1->intpad[2]      == s2->intpad[2]);
+  sum += (s1->intpad[3]      == s2->intpad[3]);
+  sum += (s1->intpad[4]      == s2->intpad[4]);
+  sum += (s1->intpad[5]      == s2->intpad[5]);
+  sum += (s1->intpad[6]      == s2->intpad[6]);
+
+  sum += (s1->a0             == s2->a0);
+  sum += (s1->ah             == s2->ah);
+  sum += (s1->r[0]           == s2->r[0]);
+  sum += (s1->r[1]           == s2->r[1]);
+  sum += (s1->r[2]           == s2->r[2]);
+  sum += (s1->v[0]           == s2->v[0]);
+  sum += (s1->v[1]           == s2->v[1]);
+  sum += (s1->v[2]           == s2->v[2]);
+  sum += (s1->w[0]           == s2->w[0]);
+  sum += (s1->w[1]           == s2->w[1]);
+  sum += (s1->w[2]           == s2->w[2]);
+  sum += (s1->s[0]           == s2->s[0]);
+  sum += (s1->s[1]           == s2->s[1]);
+  sum += (s1->s[2]           == s2->s[2]);
+  sum += (s1->m[0]           == s2->m[0]);
+  sum += (s1->m[1]           == s2->m[1]);
+  sum += (s1->m[2]           == s2->m[2]);
+  sum += (s1->b1             == s2->b1);
+  sum += (s1->b2             == s2->b2);
+  sum += (s1->c              == s2->c);
+  sum += (s1->h              == s2->h);
+  sum += (s1->dr[0]          == s2->dr[0]);
+  sum += (s1->dr[1]          == s2->dr[1]);
+  sum += (s1->dr[2]          == s2->dr[2]);
+  sum += (s1->deltaphi       == s2->deltaphi);
+  sum += (s1->q0             == s2->q0);
+  sum += (s1->q1             == s2->q1);
+  sum += (s1->epsilon        == s2->epsilon);
+  sum += (s1->deltaq0        == s2->deltaq0);
+  sum += (s1->deltaq1        == s2->deltaq1);
+  sum += (s1->sa             == s2->sa);
+  sum += (s1->saf            == s2->saf);
+  sum += (s1->al             == s2->al);
+  sum += (s1->elabc[0]       == s2->elabc[0]);
+  sum += (s1->elabc[1]       == s2->elabc[1]);
+  sum += (s1->elabc[2]       == s2->elabc[2]);
+  sum += (s1->quat[0]        == s2->quat[0]);
+  sum += (s1->quat[1]        == s2->quat[1]);
+  sum += (s1->quat[2]        == s2->quat[2]);
+  sum += (s1->quat[3]        == s2->quat[3]);
+  sum += (s1->quatold[0]     == s2->quatold[0]);
+  sum += (s1->quatold[1]     == s2->quatold[1]);
+  sum += (s1->quatold[2]     == s2->quatold[2]);
+  sum += (s1->quatold[3]     == s2->quatold[3]);
+  sum += (s1->dpad[0]        == s2->dpad[0]);
+  sum += (s1->dpad[1]        == s2->dpad[1]);
+  sum += (s1->dpad[2]        == s2->dpad[2]);
+  sum += (s1->dpad[3]        == s2->dpad[3]);
+
+  if (sum == 80) same = 1;
+
+  return same;
+}
diff --git a/tests/unit/tests.c b/tests/unit/tests.c
index 7879d2039..4364c3391 100644
--- a/tests/unit/tests.c
+++ b/tests/unit/tests.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2024 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
@@ -81,6 +81,8 @@ __host__ int tests_create(int argc, char ** argv) {
   test_build_suite();
   test_ch_suite();
   test_colloid_suite();
+  test_colloid_link_suite();
+  test_colloid_state_io_suite();
   test_colloid_sums_suite();
   test_colloids_info_suite();
   test_colloids_halo_suite();
diff --git a/tests/unit/tests.h b/tests/unit/tests.h
index 7b1b9922b..3f881082f 100644
--- a/tests/unit/tests.h
+++ b/tests/unit/tests.h
@@ -5,7 +5,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2024 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
@@ -34,6 +34,8 @@ int test_bond_fene_suite(void);
 int test_bonds_suite(void);
 int test_build_suite(void);
 int test_ch_suite(void);
+int test_colloid_link_suite(void);
+int test_colloid_state_io_suite(void);
 int test_colloid_sums_suite(void);
 int test_colloid_suite(void);
 int test_colloids_info_suite(void);

From 36078e221679185eed177022586e65148e0ab288 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Tue, 28 Jan 2025 10:51:44 +0000
Subject: [PATCH 112/133] Add io version

---
 src/colloid_state_io.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/colloid_state_io.c b/src/colloid_state_io.c
index 74c552bd5..a0755b299 100644
--- a/src/colloid_state_io.c
+++ b/src/colloid_state_io.c
@@ -10,6 +10,9 @@
 
 #include "colloid_state_io.h"
 
+/* Override COLLOID_IO_VERSION here: */
+#define IO_VERSION 240
+
 /*****************************************************************************
  *
  *  colloid_state_io_write_buf
@@ -78,8 +81,8 @@ int colloid_state_io_write_buf_ascii(const colloid_state_t * s, char * buf) {
     nwrite += snprintf(cbuf + 16*item, 1 + item, i1format, s->isfixedvxyz[1]);
     nwrite += snprintf(cbuf + 17*item, 1 + item, i1format, s->isfixedvxyz[2]);
     nwrite += snprintf(cbuf + 18*item, 1 + item, i1format, s->inter_type);
-    /* FIXME: the ioversion needs to be correct ... */
-    nwrite += snprintf(cbuf + 19*item, 1 + item, i1format, s->ioversion);
+    /* This is the i/o version; we ignore s->ioversion: */
+    nwrite += snprintf(cbuf + 19*item, 1 + item, i1format, IO_VERSION);
     nwrite += snprintf(cbuf + 20*item, 1 + item, i1format, s->bc);
     nwrite += snprintf(cbuf + 21*item, 1 + item, i1format, s->shape);
     nwrite += snprintf(cbuf + 22*item, 1 + item, i1format, s->active);

From 5380b93ee547522bb0b63d6f28aa7156fb07222e Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Tue, 28 Jan 2025 10:53:27 +0000
Subject: [PATCH 113/133] Avoid dubious cast

---
 src/colloid_link.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/colloid_link.c b/src/colloid_link.c
index ee59b88e3..0c09fcdc4 100644
--- a/src/colloid_link.c
+++ b/src/colloid_link.c
@@ -119,8 +119,8 @@ int colloid_link_total(void) {
 
 int colloid_link_max_2d(double a, int nvel) {
 
-  int pi = 4;                            /* This is approximate */
-  int ai = fmax(4.0, ceil(a));           /* A minimum reasonable a ~ 4 */
+  int    pi = 4;                        /* This is approximate */
+  double ai = fmax(4.0, ceil(a));       /* A minimum reasonable a ~ 4 */
 
   return 2*pi*ai*(nvel - 1)/2;
 }
@@ -138,8 +138,8 @@ int colloid_link_max_2d(double a, int nvel) {
 
 int colloid_link_max_3d(double a, int nvel) {
 
-  int pi = 4;                             /* This is approximate */
-  int ai = fmax(1.0, ceil(a));            /* A minimum reasonable a ~ 1.0 */
+  int    pi = 4;                          /* This is approximate */
+  double ai = fmax(1.0, ceil(a));         /* A minimum reasonable a ~ 1.0 */
 
   return 4*pi*ai*ai*(nvel - 1)/2;
 }

From e6696b93f6a198cbca53725cdec678d793aa589d Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Tue, 28 Jan 2025 10:53:45 +0000
Subject: [PATCH 114/133] Avoid checks on floating point equality

---
 tests/unit/test_colloid_state_io.c | 103 +++++++++++++++--------------
 1 file changed, 55 insertions(+), 48 deletions(-)

diff --git a/tests/unit/test_colloid_state_io.c b/tests/unit/test_colloid_state_io.c
index e7e7cee3b..224d826c8 100644
--- a/tests/unit/test_colloid_state_io.c
+++ b/tests/unit/test_colloid_state_io.c
@@ -2,9 +2,16 @@
  *
  *  test_colloid_state_io.c
  *
+ *  Edinburgh Soft Matter and Statistical Physics Group and
+ *  Edinburgh Parallel Computing Centre
+ *
+ *  (c) 2025 The University of Edinburgh
+ *
  *****************************************************************************/
 
 #include <assert.h>
+#include <float.h>
+#include <math.h>
 #include <string.h>
 
 #include "pe.h"
@@ -260,54 +267,54 @@ int util_test_colloid_state_same(const colloid_state_t * s1,
   sum += (s1->intpad[5]      == s2->intpad[5]);
   sum += (s1->intpad[6]      == s2->intpad[6]);
 
-  sum += (s1->a0             == s2->a0);
-  sum += (s1->ah             == s2->ah);
-  sum += (s1->r[0]           == s2->r[0]);
-  sum += (s1->r[1]           == s2->r[1]);
-  sum += (s1->r[2]           == s2->r[2]);
-  sum += (s1->v[0]           == s2->v[0]);
-  sum += (s1->v[1]           == s2->v[1]);
-  sum += (s1->v[2]           == s2->v[2]);
-  sum += (s1->w[0]           == s2->w[0]);
-  sum += (s1->w[1]           == s2->w[1]);
-  sum += (s1->w[2]           == s2->w[2]);
-  sum += (s1->s[0]           == s2->s[0]);
-  sum += (s1->s[1]           == s2->s[1]);
-  sum += (s1->s[2]           == s2->s[2]);
-  sum += (s1->m[0]           == s2->m[0]);
-  sum += (s1->m[1]           == s2->m[1]);
-  sum += (s1->m[2]           == s2->m[2]);
-  sum += (s1->b1             == s2->b1);
-  sum += (s1->b2             == s2->b2);
-  sum += (s1->c              == s2->c);
-  sum += (s1->h              == s2->h);
-  sum += (s1->dr[0]          == s2->dr[0]);
-  sum += (s1->dr[1]          == s2->dr[1]);
-  sum += (s1->dr[2]          == s2->dr[2]);
-  sum += (s1->deltaphi       == s2->deltaphi);
-  sum += (s1->q0             == s2->q0);
-  sum += (s1->q1             == s2->q1);
-  sum += (s1->epsilon        == s2->epsilon);
-  sum += (s1->deltaq0        == s2->deltaq0);
-  sum += (s1->deltaq1        == s2->deltaq1);
-  sum += (s1->sa             == s2->sa);
-  sum += (s1->saf            == s2->saf);
-  sum += (s1->al             == s2->al);
-  sum += (s1->elabc[0]       == s2->elabc[0]);
-  sum += (s1->elabc[1]       == s2->elabc[1]);
-  sum += (s1->elabc[2]       == s2->elabc[2]);
-  sum += (s1->quat[0]        == s2->quat[0]);
-  sum += (s1->quat[1]        == s2->quat[1]);
-  sum += (s1->quat[2]        == s2->quat[2]);
-  sum += (s1->quat[3]        == s2->quat[3]);
-  sum += (s1->quatold[0]     == s2->quatold[0]);
-  sum += (s1->quatold[1]     == s2->quatold[1]);
-  sum += (s1->quatold[2]     == s2->quatold[2]);
-  sum += (s1->quatold[3]     == s2->quatold[3]);
-  sum += (s1->dpad[0]        == s2->dpad[0]);
-  sum += (s1->dpad[1]        == s2->dpad[1]);
-  sum += (s1->dpad[2]        == s2->dpad[2]);
-  sum += (s1->dpad[3]        == s2->dpad[3]);
+  sum += (fabs(s1->a0         - s2->a0)         < DBL_EPSILON);
+  sum += (fabs(s1->ah         - s2->ah)         < DBL_EPSILON);
+  sum += (fabs(s1->r[0]       - s2->r[0])       < DBL_EPSILON);
+  sum += (fabs(s1->r[1]       - s2->r[1])       < DBL_EPSILON);
+  sum += (fabs(s1->r[2]       - s2->r[2])       < DBL_EPSILON);
+  sum += (fabs(s1->v[0]       - s2->v[0])       < DBL_EPSILON);
+  sum += (fabs(s1->v[1]       - s2->v[1])       < DBL_EPSILON);
+  sum += (fabs(s1->v[2]       - s2->v[2])       < DBL_EPSILON);
+  sum += (fabs(s1->w[0]       - s2->w[0])       < DBL_EPSILON);
+  sum += (fabs(s1->w[1]       - s2->w[1])       < DBL_EPSILON);
+  sum += (fabs(s1->w[2]       - s2->w[2])       < DBL_EPSILON);
+  sum += (fabs(s1->s[0]       - s2->s[0])       < DBL_EPSILON);
+  sum += (fabs(s1->s[1]       - s2->s[1])       < DBL_EPSILON);
+  sum += (fabs(s1->s[2]       - s2->s[2])       < DBL_EPSILON);
+  sum += (fabs(s1->m[0]       - s2->m[0])       < DBL_EPSILON);
+  sum += (fabs(s1->m[1]       - s2->m[1])       < DBL_EPSILON);
+  sum += (fabs(s1->m[2]       - s2->m[2])       < DBL_EPSILON);
+  sum += (fabs(s1->b1         - s2->b1)         < DBL_EPSILON);
+  sum += (fabs(s1->b2         - s2->b2)         < DBL_EPSILON);
+  sum += (fabs(s1->c          - s2->c)          < DBL_EPSILON);
+  sum += (fabs(s1->h          - s2->h)          < DBL_EPSILON);
+  sum += (fabs(s1->dr[0]      - s2->dr[0])      < DBL_EPSILON);
+  sum += (fabs(s1->dr[1]      - s2->dr[1])      < DBL_EPSILON);
+  sum += (fabs(s1->dr[2]      - s2->dr[2])      < DBL_EPSILON);
+  sum += (fabs(s1->deltaphi   - s2->deltaphi)   < DBL_EPSILON);
+  sum += (fabs(s1->q0         - s2->q0)         < DBL_EPSILON);
+  sum += (fabs(s1->q1         - s2->q1)         < DBL_EPSILON);
+  sum += (fabs(s1->epsilon    - s2->epsilon)    < DBL_EPSILON);
+  sum += (fabs(s1->deltaq0    - s2->deltaq0)    < DBL_EPSILON);
+  sum += (fabs(s1->deltaq1    - s2->deltaq1)    < DBL_EPSILON);
+  sum += (fabs(s1->sa         - s2->sa)         < DBL_EPSILON);
+  sum += (fabs(s1->saf        - s2->saf)        < DBL_EPSILON);
+  sum += (fabs(s1->al         - s2->al)         < DBL_EPSILON);
+  sum += (fabs(s1->elabc[0]   - s2->elabc[0])   < DBL_EPSILON);
+  sum += (fabs(s1->elabc[1]   - s2->elabc[1])   < DBL_EPSILON);
+  sum += (fabs(s1->elabc[2]   - s2->elabc[2])   < DBL_EPSILON);
+  sum += (fabs(s1->quat[0]    - s2->quat[0])    < DBL_EPSILON);
+  sum += (fabs(s1->quat[1]    - s2->quat[1])    < DBL_EPSILON);
+  sum += (fabs(s1->quat[2]    - s2->quat[2])    < DBL_EPSILON);
+  sum += (fabs(s1->quat[3]    - s2->quat[3])    < DBL_EPSILON);
+  sum += (fabs(s1->quatold[0] - s2->quatold[0]) < DBL_EPSILON);
+  sum += (fabs(s1->quatold[1] - s2->quatold[1]) < DBL_EPSILON);
+  sum += (fabs(s1->quatold[2] - s2->quatold[2]) < DBL_EPSILON);
+  sum += (fabs(s1->quatold[3] - s2->quatold[3]) < DBL_EPSILON);
+  sum += (fabs(s1->dpad[0]    - s2->dpad[0])    < DBL_EPSILON);
+  sum += (fabs(s1->dpad[1]    - s2->dpad[1])    < DBL_EPSILON);
+  sum += (fabs(s1->dpad[2]    - s2->dpad[2])    < DBL_EPSILON);
+  sum += (fabs(s1->dpad[3]    - s2->dpad[3])    < DBL_EPSILON);
 
   if (sum == 80) same = 1;
 

From 0754b790a1c14c54b209e44090b119d3c9ead61b Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Tue, 28 Jan 2025 16:52:53 +0000
Subject: [PATCH 115/133] Fix to allow non-uniform wetting

---
 src/gradient_3d_27pt_solid.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/src/gradient_3d_27pt_solid.c b/src/gradient_3d_27pt_solid.c
index c871944b4..50b1e27b2 100644
--- a/src/gradient_3d_27pt_solid.c
+++ b/src/gradient_3d_27pt_solid.c
@@ -27,7 +27,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2024 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -80,24 +80,25 @@ __global__ void grad_3d_27pt_solid_kernel(kernel_3d_t k3d,
 
 __host__ int grad_3d_27pt_solid_map_set(map_t * map) {
 
-  int ndata;
   assert(map);
 
   static_solid.map = map;
 
-  /* We expect at most two wetting parameters; if present
+  /* We expect either zero or two wetting parameters; if present
    * first should be C, second H. Default to zero. */
 
-  ndata = map->ndata;
-
-  if (ndata == 0) {
+  if (map->ndata == 0) {
     /* Assume we are uniform from free energy */
     static_solid.uniform = 1;
   }
-  else if (ndata != 2) {
-    /* We should have exactly 2 */
+  else if (map->ndata == 2) {
+    /* Assume we have colloid non-uniform wetting */
+    static_solid.uniform = 0;
+  }
+  else {
+    /* We don't handle the case */
     pe_fatal(map->pe, "Wrong number of wetting parameters in map data %d\n",
-	     ndata);
+	     map->ndata);
   }
 
   return 0;
@@ -113,11 +114,13 @@ __host__ int grad_3d_27pt_solid_fe_set(fe_symm_t * fe) {
 
   assert(fe);
 
+  /* Always assume this means uniform wetting */
+
   static_solid.fe_symm = fe;
   static_solid.rkappa = 1.0/fe->param->kappa;
   static_solid.c = fe->param->c;
   static_solid.h = fe->param->h;
-  if (static_solid.map == NULL) static_solid.uniform = 1;
+  static_solid.uniform = 1;
 
   return 0;
 }

From 8c6c48652c8e5ab03ca15b0fd68d52d4152769c5 Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@epcc.ed.ac.uk>
Date: Wed, 29 Jan 2025 19:29:13 +0000
Subject: [PATCH 116/133] Add have_gpu_aware_mpi_()

---
 src/pe.c | 27 ++++++++++++++++++++++++++-
 src/pe.h |  3 ++-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/pe.c b/src/pe.c
index 4b137be72..d5cef4d60 100644
--- a/src/pe.c
+++ b/src/pe.c
@@ -15,7 +15,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2024 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -438,3 +438,28 @@ __host__ int pe_time(char * str, int bufsiz) {
 
   return ierr;
 }
+
+/*****************************************************************************
+ *
+ *  have_gpu_aware_mpi_
+ *
+ *  This is awkward; it might belong elsewhere on its own.
+ *
+ *****************************************************************************/
+
+#ifdef HAVE_OPENMPI_
+/* This provides MPIX_CUDA_AWARE_SUPPORT .. */
+#include "mpi-ext.h"
+#endif
+
+int have_gpu_aware_mpi_(void) {
+
+  int have_gpu_aware_mpi = 0;
+
+  /* OpenMPI */
+#if defined (MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
+  have_gpu_aware_mpi = 1;
+#endif
+
+  return have_gpu_aware_mpi;
+}
diff --git a/src/pe.h b/src/pe.h
index 444d77b85..abd6fb265 100644
--- a/src/pe.h
+++ b/src/pe.h
@@ -5,7 +5,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2023 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Contribtuing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -40,5 +40,6 @@ __host__ int pe_warn(pe_t * pe, const char * fmt, ...);
 __host__ int pe_exit(pe_t * pe, const char * fmt, ...);
 
 __host__ int pe_time(char * strctime, int bufsiz);
+__host__ int have_gpu_aware_mpi_(void);
 
 #endif

From 05ac53bf17b66765f385b721dadae70a0478b28d Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@epcc.ed.ac.uk>
Date: Wed, 29 Jan 2025 19:29:50 +0000
Subject: [PATCH 117/133] Replace macros by have_gpu_aware_mpi_ function

---
 src/field.c    | 23 ++++++-----------------
 src/model_le.c | 19 ++++---------------
 2 files changed, 10 insertions(+), 32 deletions(-)

diff --git a/src/field.c b/src/field.c
index 4764d2b0c..f25f5f5b1 100644
--- a/src/field.c
+++ b/src/field.c
@@ -14,7 +14,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2012-2024 The University of Edinburgh
+ *  (c) 2012-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -43,17 +43,6 @@ static int field_leesedwards_parallel(field_t * obj);
 
 __host__ int field_init(field_t * obj, int nhcomm, lees_edw_t * le);
 
-#ifdef HAVE_OPENMPI_
-/* This provides MPIX_CUDA_AWARE_SUPPORT .. */
-#include "mpi-ext.h"
-#endif
-
-#if defined (MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
-static const int have_gpu_aware_mpi_ = 1;
-#else
-static const int have_gpu_aware_mpi_ = 0;
-#endif
-
 /*****************************************************************************
  *
  *  field_create
@@ -1443,7 +1432,7 @@ int field_halo_post(const field_t * field, field_halo_t * h) {
     int k = 1 + h->cv[h->nvel - ireq][Z];
     int mcount = field->nf*field_halo_size(h->rlim[ireq]);
     double * buf = h->recv[ireq];
-    if (have_gpu_aware_mpi_) buf = h->recv_d[ireq];
+    if (have_gpu_aware_mpi_()) buf = h->recv_d[ireq];
 
     h->request[ireq] = MPI_REQUEST_NULL;
 
@@ -1485,7 +1474,7 @@ int field_halo_post(const field_t * field, field_halo_t * h) {
     int k = 1 + h->cv[ireq][Z];
     int mcount = field->nf*field_halo_size(h->slim[ireq]);
     double * buf = h->send[ireq];
-    if (have_gpu_aware_mpi_) buf = h->send_d[ireq];
+    if (have_gpu_aware_mpi_()) buf = h->send_d[ireq];
 
     /* Skip messages to self ... */
     if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) continue;
@@ -1747,7 +1736,7 @@ int field_graph_halo_send_create(const field_t * field, field_halo_t * h) {
     tdpAssert( tdpGraphAddKernelNode(&kernelNode, h->gsend.graph, NULL, 0,
 				     &kernelNodeParams) );
 
-    if (have_gpu_aware_mpi_) {
+    if (have_gpu_aware_mpi_()) {
       /* Don't need explicit device -> host copy */
     }
     else {
@@ -1803,7 +1792,7 @@ int field_graph_halo_recv_create(const field_t * field, field_halo_t * h) {
     int rcount = field->nf*field_halo_size(h->rlim[ireq]);
     tdpGraphNode_t memcpyNode = {0};
 
-    if (have_gpu_aware_mpi_) {
+    if (have_gpu_aware_mpi_()) {
       /* Don't need explicit copies */
     }
     else {
@@ -1851,7 +1840,7 @@ int field_graph_halo_recv_create(const field_t * field, field_halo_t * h) {
     kernelNodeParams.kernelParams   = (void **) kernelArgs;
     kernelNodeParams.extra          = NULL;
 
-    if (have_gpu_aware_mpi_) {
+    if (have_gpu_aware_mpi_()) {
       tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, NULL,
 				       0, &kernelNodeParams) );
     }
diff --git a/src/model_le.c b/src/model_le.c
index 23fa99e6b..d5e78a066 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -11,7 +11,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2022 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -36,17 +36,6 @@ static int le_reproject(lb_t * lb, lees_edw_t * le);
 static int le_displace_and_interpolate(lb_t * lb, lees_edw_t * le);
 static int le_displace_and_interpolate_parallel(lb_t * lb, lees_edw_t * le);
 
-#ifdef HAVE_OPENMPI_
-/* This provides MPIX_CUDA_AWARE_SUPPORT .. */
-#include "mpi-ext.h"
-#endif
-
-#if defined (MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
-static const int have_gpu_aware_mpi_ = 1;
-#else
-static const int have_gpu_aware_mpi_ = 0;
-#endif
-
 /*****************************************************************************
  *
  *  lb_le_apply_boundary_conditions
@@ -825,7 +814,7 @@ int lb_data_apply_le_boundary_conditions(lb_t * lb, lees_edw_t * le) {
 
 
     /* Second, displacement. */
-    if (have_gpu_aware_mpi_ || mpi_cartsz[Y] > 1) {
+    if (have_gpu_aware_mpi_() || mpi_cartsz[Y] > 1) {
       lb_data_displace_communicate(lekh, lb, le, t);
     }
     else {
@@ -1139,7 +1128,7 @@ static int lb_data_displace_communicate(le_kernel_helper_t lekh,
   /* If there is GPU-aware MPI just communicate the GPU buffers; if
    * not, copy in relevant direction at the start and finish */
 
-  if (have_gpu_aware_mpi_) {
+  if (have_gpu_aware_mpi_()) {
     tdpAssert( tdpMemcpy(&sbuff, &lb->target->sbuff, sizeof(double *),
 			 tdpMemcpyDeviceToHost) );
     tdpAssert( tdpMemcpy(&rbuff, &lb->target->rbuff, sizeof(double *),
@@ -1253,7 +1242,7 @@ static int lb_data_displace_communicate(le_kernel_helper_t lekh,
   /* Complete */
   MPI_Waitall(8, req, MPI_STATUSES_IGNORE);
 
-  if (have_gpu_aware_mpi_) {
+  if (have_gpu_aware_mpi_()) {
     /* No further action */
   }
   else {

From 4cc64023508c2cb88ca4cff6d6e119c719ac1507 Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@epcc.ed.ac.uk>
Date: Sun, 9 Feb 2025 17:01:09 +0000
Subject: [PATCH 118/133] rationalise lb halo options and tests

---
 src/bbl.c                                   |   3 +-
 src/distribution_rt.c                       |  35 +-
 src/lb_data.c                               |  48 +-
 src/lb_data.h                               |   9 +-
 src/lb_data_options.c                       |   9 +-
 src/lb_data_options.h                       |   9 +-
 src/model_le.c                              |   1 +
 src/stats_distribution.c                    |   3 +-
 tests/unit/test_halo.c                      | 347 -------
 tests/unit/test_io_options.c                |   9 +-
 tests/unit/{test_model.c => test_lb_data.c} | 954 +++++++++-----------
 tests/unit/test_prop.c                      |  35 +-
 tests/unit/test_util_ellipsoid.c            |  29 +-
 tests/unit/tests.c                          |   8 +-
 tests/unit/tests.h                          |   5 +-
 15 files changed, 513 insertions(+), 991 deletions(-)
 delete mode 100644 tests/unit/test_halo.c
 rename tests/unit/{test_model.c => test_lb_data.c} (56%)

diff --git a/src/bbl.c b/src/bbl.c
index fcced3326..efead4a64 100644
--- a/src/bbl.c
+++ b/src/bbl.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2024 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Contributing Authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -23,6 +23,7 @@
 
 #include "pe.h"
 #include "coords.h"
+#include "kernel.h"
 #include "physics.h"
 #include "colloid_sums.h"
 #include "util.h"
diff --git a/src/distribution_rt.c b/src/distribution_rt.c
index ec129963f..7222fdd5e 100644
--- a/src/distribution_rt.c
+++ b/src/distribution_rt.c
@@ -8,7 +8,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2024 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -158,31 +158,29 @@ int lb_run_time_prev(pe_t * pe, cs_t * cs, rt_t * rt, lb_t ** lb) {
   options.ndist = ndist;
 
   /* Halo options */
+  /* Some old names are retained with a warning ... */
   {
     char htype[BUFSIZ] = {0};
     int havetype = rt_string_parameter(rt, "lb_halo_scheme", htype, BUFSIZ);
     if (strcmp(htype, "lb_halo_target") == 0) {
-      options.halo = LB_HALO_TARGET;
+      options.halo = LB_HALO_FULL;
     }
     else if (strcmp(htype, "lb_halo_openmp_full") == 0) {
-      options.halo = LB_HALO_OPENMP_FULL;
+      options.halo = LB_HALO_FULL;
     }
     else if (strcmp(htype, "lb_halo_openmp_reduced") == 0) {
-      options.halo = LB_HALO_OPENMP_REDUCED;
+      options.halo = LB_HALO_REDUCED;
+    }
+    else if (strcmp(htype, "lb_halo_full") == 0) {
+      options.halo = LB_HALO_FULL;
+    }
+    else if (strcmp(htype, "lb_halo_reduced") == 0) {
+      options.halo = LB_HALO_REDUCED;
     }
     else if (havetype) {
       pe_fatal(pe, "lb_halo_scheme not recognised\n");
     }
 
-    /* I'm going to trap this silently here - which is slightly
-     * better than having the wrong halo. I avoid a message so
-     * not as to disrupt the regression tests. */
-    {
-      int ndevice = 0;
-      tdpAssert( tdpGetDeviceCount(&ndevice) );
-      if (ndevice > 0) options.halo = LB_HALO_TARGET;
-    }
-
     options.reportimbalance = rt_switch(rt, "lb_halo_report_imbalance");
     options.usefirsttouch   = rt_switch(rt, "lb_data_use_first_touch");
 
@@ -215,14 +213,11 @@ int lb_run_time_prev(pe_t * pe, cs_t * cs, rt_t * rt, lb_t ** lb) {
   pe_info(pe, "SIMD vector len:  %d\n", NSIMDVL);
   pe_info(pe, "Number of sets:   %d\n", ndist);
 
-  if (options.halo == LB_HALO_TARGET) {
-    pe_info(pe, "Halo type:        %s\n", "lb_halo_target (full halo)");
-  }
-  if (options.halo == LB_HALO_OPENMP_FULL) {
-    pe_info(pe, "Halo type:        %s\n", "lb_halo_openmp_full (host)");
+  if (options.halo == LB_HALO_FULL) {
+    pe_info(pe, "Halo type:        %s\n", "full halo");
   }
-  if (options.halo == LB_HALO_OPENMP_REDUCED) {
-    pe_info(pe, "Halo type:        %s\n", "lb_halo_openmp_reduced (host)");
+  if (options.halo == LB_HALO_REDUCED) {
+    pe_info(pe, "Halo type:        %s\n", "reduced halo");
   }
   if (options.reportimbalance) {
     pe_info(pe, "Imbalance time:   %s\n", "reported");
diff --git a/src/lb_data.c b/src/lb_data.c
index a255fd84e..3963deccb 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -10,7 +10,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2024 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -26,12 +26,12 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "kernel.h"
 #include "lb_data.h"
 
 #include "timer.h"
 #include "util.h"
 
-static int lb_mpi_init(lb_t * lb);
 static int lb_model_param_init(lb_t * lb);
 static int lb_init(lb_t * lb);
 static int lb_data_touch(lb_t * lb);
@@ -44,8 +44,8 @@ int halo_initialise_device_model(lb_halo_t * h);
 int lb_data_free_device_model(lb_t *lb);
 int halo_free_device_model(lb_halo_t *h);
 
-int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count);
-int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count);
+int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h);
+int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h);
 
 static __constant__ lb_collide_param_t static_param;
 
@@ -69,6 +69,11 @@ static const int have_gpu_aware_mpi_ = 1;
 static const int have_gpu_aware_mpi_ = 0;
 #endif
 
+int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme);
+int lb_halo_post(lb_t * lb, lb_halo_t * h);
+int lb_halo_wait(lb_t * lb, lb_halo_t * h);
+int lb_halo_free(lb_t * lb, lb_halo_t * h);
+
 /*****************************************************************************
  *
  *  lb_data_create
@@ -290,7 +295,6 @@ __host__ int lb_free(lb_t * lb) {
   io_metadata_finalise(&lb->input);
   io_metadata_finalise(&lb->output);
 
-  if (lb->halo) halo_swap_free(lb->halo);
   free(lb->f);
   free(lb->fprime);
 
@@ -593,7 +597,6 @@ static int lb_init(lb_t * lb) {
     lb_data_initialise_device_model(lb);
   }
 
-  lb_mpi_init(lb);
   lb_model_param_init(lb);
 
   lb_memcpy(lb, tdpMemcpyHostToDevice);
@@ -691,25 +694,6 @@ __host__ int lb_init_rest_f(lb_t * lb, double rho0) {
   return 0;
 }
 
-/*****************************************************************************
- *
- *  lb_mpi_init
- *
- *  Commit the various datatypes required for halo swaps.
- *
- *****************************************************************************/
-
-static int lb_mpi_init(lb_t * lb) {
-
-  assert(lb);
-
-  halo_swap_create_r2(lb->pe, lb->cs, 1, lb->nsite, lb->ndist, lb->nvel,
-		      &lb->halo);
-  halo_swap_handlers_set(lb->halo, halo_swap_pack_rank1, halo_swap_unpack_rank1);
-
-  return 0;
-}
-
 /*****************************************************************************
  *
  *  lb_data_touch
@@ -772,7 +756,7 @@ __host__ int lb_data_touch(lb_t * lb) {
  *  lb_halo
  *
  *  Swap the distributions at the periodic/processor boundaries
- *  in each direction. Default target swap.
+ *  in each direction.
  *
  *****************************************************************************/
 
@@ -1285,7 +1269,7 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
   /* Default to full swap unless reduced is requested. */
 
   h->full = 1;
-  if (scheme == LB_HALO_OPENMP_REDUCED) h->full = 0;
+  if (scheme == LB_HALO_REDUCED) h->full = 0;
 
   /* Determine look-up table of ranks of neighbouring processes */
   {
@@ -1439,8 +1423,8 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
     halo_initialise_device_model(h);
 
     if (have_graph_api_) {
-      lb_graph_halo_send_create(lb, h, send_count);
-      lb_graph_halo_recv_create(lb, h, recv_count);
+      lb_graph_halo_send_create(lb, h);
+      lb_graph_halo_recv_create(lb, h);
     }
 
   }
@@ -1578,7 +1562,7 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
 
   int ndevice;
   tdpGetDeviceCount(&ndevice);
-  if (ndevice > 0 && lb->haloscheme == LB_HALO_TARGET) {
+  if (ndevice > 0) {
     if (have_graph_api_) {
       tdpAssert( tdpGraphLaunch(h->grecv.exec, h->stream) );
       tdpAssert( tdpStreamSynchronize(h->stream) );
@@ -1931,7 +1915,7 @@ int lb_io_read(lb_t * lb, int timestep, io_event_t * event) {
  *
  *****************************************************************************/
 
-int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count) {
+int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h) {
 
   assert(lb);
   assert(h);
@@ -2006,7 +1990,7 @@ int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h, int * send_count)
  *
  *****************************************************************************/
 
-int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h, int * recv_count) {
+int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h) {
 
   assert(lb);
   assert(h);
diff --git a/src/lb_data.h b/src/lb_data.h
index ef77e2fe8..137cdec4d 100644
--- a/src/lb_data.h
+++ b/src/lb_data.h
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2014-2024 The University of Edinburgh
+ *  (c) 2014-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -26,7 +26,6 @@
 
 #include "io_impl.h"
 #include "io_event.h"
-#include "halo_swap.h"
 
 /* Residual compile-time switches scheduled for removal */
 #ifdef _D2Q9_
@@ -105,11 +104,6 @@ struct lb_halo_s {
   lb_graph_halo_t grecv;
 };
 
-int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme);
-int lb_halo_post(lb_t * lb, lb_halo_t * h);
-int lb_halo_wait(lb_t * lb, lb_halo_t * h);
-int lb_halo_free(lb_t * lb, lb_halo_t * h);
-
 struct lb_data_s {
 
   int ndim;
@@ -121,7 +115,6 @@ struct lb_data_s {
   cs_t * cs;             /* coordinate system */
 
   lb_model_t model;      /* Current LB model information */
-  halo_swap_t * halo;    /* halo swap driver */
 
   io_element_t ascii;    /* Per site ASCII information. */
   io_element_t binary;   /* Per site binary information. */
diff --git a/src/lb_data_options.c b/src/lb_data_options.c
index 669a63f24..f0f1e12e0 100644
--- a/src/lb_data_options.c
+++ b/src/lb_data_options.c
@@ -8,7 +8,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2022 The University of Edinburgh
+ *  (c) 2022-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -29,7 +29,7 @@ lb_data_options_t lb_data_options_default(void) {
 
   lb_data_options_t opts = {.ndim = 3, .nvel = 19, .ndist = 1,
                             .nrelax = LB_RELAXATION_M10,
-			    .halo   = LB_HALO_TARGET,
+			    .halo   = LB_HALO_FULL,
 			    .reportimbalance = 0,
 			    .usefirsttouch   = 0,
                             .iodata = io_info_args_default()};
@@ -69,7 +69,10 @@ int lb_data_options_valid(const lb_data_options_t * opts) {
   if (!(opts->ndim  == 2 || opts->ndim  == 3)) valid = 0;
   if (!(opts->ndist == 1 || opts->ndist == 2)) valid = 0;
 
-  if (opts->ndist == 2 && opts->halo != LB_HALO_TARGET) valid = 0;
+  {
+    int halo = (opts->halo == LB_HALO_FULL || opts->halo == LB_HALO_REDUCED);
+    if (halo == 0) valid = 0;
+  }
 
   return valid;
 }
diff --git a/src/lb_data_options.h b/src/lb_data_options.h
index a4d71f940..f67ad20d4 100644
--- a/src/lb_data_options.h
+++ b/src/lb_data_options.h
@@ -5,7 +5,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2022 The University of Edinburgh
+ *  (c) 2022-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -22,9 +22,10 @@ typedef enum lb_relaxation_enum {LB_RELAXATION_M10,
 				 LB_RELAXATION_TRT}
   lb_relaxation_enum_t;
 
-typedef enum lb_halo_enum {LB_HALO_TARGET,
-                           LB_HALO_OPENMP_FULL,
-                           LB_HALO_OPENMP_REDUCED} lb_halo_enum_t;
+typedef enum lb_halo_enum {
+  LB_HALO_FULL    = 1,       /* Full halo swap (the default option) */
+  LB_HALO_REDUCED = 2        /* Reduced halo swap */
+} lb_halo_enum_t;            /* Zero is not a valid halo option */
 
 typedef struct lb_data_options_s lb_data_options_t;
 
diff --git a/src/model_le.c b/src/model_le.c
index 23fa99e6b..77ee7d060 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -27,6 +27,7 @@
 #include "coords.h"
 #include "cs_limits.h"
 #include "control.h"
+#include "kernel.h"
 #include "physics.h"
 #include "model_le.h"
 #include "timer.h"
diff --git a/src/stats_distribution.c b/src/stats_distribution.c
index d49185b12..6e7906097 100644
--- a/src/stats_distribution.c
+++ b/src/stats_distribution.c
@@ -11,7 +11,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2024 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
@@ -23,6 +23,7 @@
 
 #include "pe.h"
 #include "coords.h"
+#include "kernel.h"
 #include "util.h"
 #include "util_sum.h"
 #include "stats_distribution.h"
diff --git a/tests/unit/test_halo.c b/tests/unit/test_halo.c
deleted file mode 100644
index ccd317921..000000000
--- a/tests/unit/test_halo.c
+++ /dev/null
@@ -1,347 +0,0 @@
-/*****************************************************************************
- *
- *  test_halo.c
- *
- *  This is a more rigourous test of the halo swap code for the
- *  distributions than appears in test model.
- *
- *  Edinburgh Soft Matter and Statistical Physics Group
- *  Edinburgh Parallel Computing Centre
- *
- *  (c) 2010-2024 The University of Edinburgh
- *
- *  Contributing authors:
- *  Kevin Stratford (kevin@epcc.ed.ac.uk)
- *
- *****************************************************************************/
-
-#include <assert.h>
-#include <float.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "pe.h"
-#include "coords.h"
-#include "lb_data.h"
-#include "control.h"
-#include "tests.h"
-
-int test_lb_halo1(pe_t * pe, cs_t * cs, int ndim, int nvel);
-int do_test_halo_null(pe_t * pe, cs_t * cs, const lb_data_options_t * opts);
-int do_test_halo(pe_t * pe, cs_t * cs, int dim, const lb_data_options_t * opts);
-
-/*****************************************************************************
- *
- *  test_halo_suite
- *
- *****************************************************************************/
-
-int test_halo_suite(void) {
-
-  pe_t * pe = NULL;
-  cs_t * cs = NULL;
-
-  pe_create(MPI_COMM_WORLD, PE_QUIET, &pe);
-  cs_create(pe, &cs);
-  cs_init(cs);
-
-  /* Use a 2d system for ndim = 2, nvel = 9 */
-  test_lb_halo1(pe, cs, 3, 15);
-  pe_info(pe, "PASS     ./unit/test_halo 15\n");
-  test_lb_halo1(pe, cs, 3, 19);
-  pe_info(pe, "PASS     ./unit/test_halo 19\n");
-  test_lb_halo1(pe, cs, 3, 27);
-
-  pe_info(pe, "PASS     ./unit/test_halo\n");
-  cs_free(cs);
-  pe_free(pe);
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  test_lb_halo
- *
- *****************************************************************************/
-
-int test_lb_halo1(pe_t * pe, cs_t * cs, int ndim, int nvel) {
-
-  lb_data_options_t opts = lb_data_options_default();
-
-  opts.ndim  = ndim;
-  opts.nvel  = nvel;
-  opts.ndist = 1;
-  opts.halo  = LB_HALO_TARGET;
-
-  do_test_halo_null(pe, cs, &opts);
-  do_test_halo(pe, cs, X, &opts);
-  do_test_halo(pe, cs, Y, &opts);
-  do_test_halo(pe, cs, Z, &opts);
-
-  opts.ndist = 1;
-  opts.halo  = LB_HALO_OPENMP_FULL;
-
-  do_test_halo_null(pe, cs, &opts);
-  do_test_halo(pe, cs, X, &opts);
-  do_test_halo(pe, cs, Y, &opts);
-  do_test_halo(pe, cs, Z, &opts);
-
-  opts.ndist = 1;
-  opts.halo  = LB_HALO_OPENMP_REDUCED;
-
-  do_test_halo_null(pe, cs, &opts);
-
-  opts.ndist = 2;
-  opts.halo = LB_HALO_TARGET;
-
-  do_test_halo_null(pe, cs, &opts);
-  do_test_halo(pe, cs, X, &opts);
-  do_test_halo(pe, cs, Y, &opts);
-  do_test_halo(pe, cs, Z, &opts);
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  test_halo_null
- *
- *  Null halo test. Make sure no halo information appears in the
- *  domain proper. This works for both full and reduced halos.
- *
- *****************************************************************************/
-
-int do_test_halo_null(pe_t * pe, cs_t * cs, const lb_data_options_t * opts) {
-
-  int nlocal[3], n[3];
-  int index, nd, p;
-  int nhalo;
-  int nextra;
-  double f_actual;
-
-  lb_t * lb = NULL;
-
-  assert(pe);
-  assert(cs);
-  assert(opts);
-
-  cs_nhalo(cs, &nhalo);
-  nextra = nhalo - 1;
-
-  lb_data_create(pe, cs, opts, &lb);
-
-  cs_nlocal(cs, nlocal);
-
-  /* Set entire distribution (all sites including halos) to 1.0 */
-
-  for (n[X] = 1 - nextra; n[X] <= nlocal[X] + nextra; n[X]++) {
-    for (n[Y] = 1 - nextra; n[Y] <= nlocal[Y] + nextra; n[Y]++) {
-      for (n[Z] = 1 - nextra; n[Z] <= nlocal[Z] + nextra; n[Z]++) {
-
-	index = cs_index(cs, n[X], n[Y], n[Z]);
-
-	for (nd = 0; nd < lb->ndist; nd++) {
-	  for (p = 0; p < lb->model.nvel; p++) {
-	    lb_f_set(lb, index, p, nd, 1.0);
-	  }
-	}
-
-      }
-    }
-  }
-
-  /* Zero interior */
-
-  for (n[X] = 1; n[X] <= nlocal[X]; n[X]++) {
-    for (n[Y] = 1; n[Y] <= nlocal[Y]; n[Y]++) {
-      for (n[Z] = 1; n[Z] <= nlocal[Z]; n[Z]++) {
-
-	index = cs_index(cs, n[X], n[Y], n[Z]);
-
-	for (nd = 0; nd < lb->ndist; nd++) {
-	  for (p = 0; p < lb->model.nvel; p++) {
-	    lb_f_set(lb, index, p, nd, 0.0);
-	  }
-	}
-
-      }
-    }
-  }
-
-  /* Swap */
-
-  lb_halo(lb);
-
-  /* Check everywhere in the interior still zero */
-
-  for (n[X] = 1; n[X] <= nlocal[X]; n[X]++) {
-    for (n[Y] = 1; n[Y] <= nlocal[Y]; n[Y]++) {
-      for (n[Z] = 1; n[Z] <= nlocal[Z]; n[Z]++) {
-
-	index = cs_index(cs, n[X], n[Y], n[Z]);
-
-	for (nd = 0; nd < lb->ndist; nd++) {
-	  for (p = 0; p < lb->model.nvel; p++) {
-	    lb_f(lb, index, p, nd, &f_actual);
-
-	    /* everything should still be zero inside the lattice */
-	    test_assert(fabs(f_actual - 0.0) < DBL_EPSILON);
-	  }
-	}
-
-      }
-    }
-  }
-
-  lb_free(lb);
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  do_test_halo
- *
- *  Test the halo swap for the distributions for coordinate direction dim.
- *
- *  Note that the reduced halo swaps are only meaningful in
- *  parallel. They will automatically work in serial.
- *
- *****************************************************************************/
-
-int do_test_halo(pe_t * pe, cs_t * cs, int dim, const lb_data_options_t * opts) {
-
-  int ndevice = 0;
-  int nhalo;
-  int nlocal[3], n[3];
-  int offset[3];
-  int mpi_cartsz[3];
-  int mpi_cartcoords[3];
-  int nd;
-  int nextra;
-  int index, p, d;
-
-  double ltot[3];
-  double f_expect, f_actual;
-  lb_t * lb = NULL;
-
-  assert(pe);
-  assert(cs);
-  assert(dim == X || dim == Y || dim == Z);
-  assert(opts);
-
-  tdpAssert( tdpGetDeviceCount(&ndevice) );
-
-  lb_data_create(pe, cs, opts, &lb);
-
-  cs_nhalo(cs, &nhalo);
-  nextra = nhalo;
-
-  cs_ltot(cs, ltot);
-  cs_nlocal(cs, nlocal);
-  cs_nlocal_offset(cs, offset);
-  cs_cartsz(cs, mpi_cartsz);
-  cs_cart_coords(cs, mpi_cartcoords);
-
-  /* Zero entire distribution (all sites including halos) */
-
-  for (n[X] = 1 - nextra; n[X] <= nlocal[X] + nextra; n[X]++) {
-    for (n[Y] = 1 - nextra; n[Y] <= nlocal[Y] + nextra; n[Y]++) {
-      for (n[Z] = 1 - nextra; n[Z] <= nlocal[Z] + nextra; n[Z]++) {
-
-	index = cs_index(cs, n[X], n[Y], n[Z]);
-
-	for (nd = 0; nd < lb->ndist; nd++) {
-	  for (p = 0; p < lb->model.nvel; p++) {
-	    lb_f_set(lb, index, p, nd, -1.0);
-	  }
-	}
-
-      }
-    }
-  }
-
-  /* Set the interior sites to get swapped with a value related to
-   * absolute position */
-
-  for (n[X] = 1; n[X] <= nlocal[X]; n[X]++) {
-    for (n[Y] = 1; n[Y] <= nlocal[Y]; n[Y]++) {
-      for (n[Z] = 1; n[Z] <= nlocal[Z]; n[Z]++) {
-
-	index = cs_index(cs, n[X], n[Y], n[Z]);
-
-	if (n[X] <= nhalo || n[X] > nlocal[X] - nhalo ||
-	    n[Y] <= nhalo || n[Y] > nlocal[Y] - nhalo ||
-	    n[Z] <= nhalo || n[Z] > nlocal[Z] - nhalo) {
-
-	  for (nd = 0; nd < lb->ndist; nd++) {
-	    for (p = 0; p < lb->model.nvel; p++) {
-	      lb_f_set(lb, index, p, nd, 1.0*(offset[dim] + n[dim]));
-	    }
-	  }
-	}
-
-      }
-    }
-  }
-
-  lb_memcpy(lb, tdpMemcpyHostToDevice);
-  lb_halo(lb);
-
-  /* Don't overwrite the host version if not device swap */
-  if (ndevice && lb->opts.halo == LB_HALO_TARGET) {
-    lb_memcpy(lb, tdpMemcpyDeviceToHost);
-  }
-
-  /* Check the results (all sites for distribution halo).
-   * The halo regions should contain a copy of the above, while the
-   * interior sites are unchanged */
-
-  /* Note the distribution halo swaps are always width 1, irrespective
-   * of nhalo */
-
-  for (n[X] = 0; n[X] <= nlocal[X] + 1; n[X]++) {
-    for (n[Y] = 0; n[Y] <= nlocal[Y] + 1; n[Y]++) {
-      for (n[Z] = 0; n[Z] <= nlocal[Z] + 1; n[Z]++) {
-
-	index = cs_index(cs, n[X], n[Y], n[Z]);
-
-	for (nd = 0; nd < lb->ndist; nd++) {
-	  for (d = 0; d < 3; d++) {
-
-	    /* 'Left' side */
-	    if (dim == d && n[d] == 0) {
-
-	      f_expect = offset[dim];
-	      if (mpi_cartcoords[dim] == 0) f_expect = ltot[dim];
-
-	      for (p = 0; p < lb->model.nvel; p++) {
-		lb_f(lb, index, p, nd, &f_actual);
-		test_assert(fabs(f_actual-f_expect) < DBL_EPSILON);
-	      }
-	    }
-
-	    /* 'Right' side */
-	    if (dim == d && n[d] == nlocal[d] + 1) {
-
-	      f_expect = offset[dim] + nlocal[dim] + 1.0;
-	      if (mpi_cartcoords[dim] == mpi_cartsz[dim] - 1) f_expect = 1.0;
-
-	      for (p = 0; p < lb->model.nvel; p++) {
-		lb_f(lb, index, p, nd, &f_actual);
-		test_assert(fabs(f_actual-f_expect) < DBL_EPSILON);
-	      }
-	    }
-	  }
-	}
-	/* Next site */
-      }
-    }
-  }
-
-  lb_free(lb);
-
-  return 0;
-}
diff --git a/tests/unit/test_io_options.c b/tests/unit/test_io_options.c
index 89b81f179..d90aec0a0 100644
--- a/tests/unit/test_io_options.c
+++ b/tests/unit/test_io_options.c
@@ -8,7 +8,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2020-2024 The University of Edinburgh
+ *  (c) 2020-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -132,6 +132,7 @@ __host__ int test_io_options_record_format_valid(void) {
 
 __host__ int test_io_options_metadata_version_valid(void) {
 
+  int ifail = 0;
   io_options_t opts = io_options_default();
   int isvalid = 0;
 
@@ -139,9 +140,11 @@ __host__ int test_io_options_metadata_version_valid(void) {
   opts.mode             = IO_MODE_MPIIO;
   opts.metadata_version = IO_METADATA_V2;
 
-  assert(io_options_metadata_version_valid(&opts));
+  isvalid = io_options_metadata_version_valid(&opts);
+  if (isvalid == 0) ifail = -1;
+  assert(ifail == 0);
 
-  return isvalid;
+  return ifail;
 }
 
 
diff --git a/tests/unit/test_model.c b/tests/unit/test_lb_data.c
similarity index 56%
rename from tests/unit/test_model.c
rename to tests/unit/test_lb_data.c
index 8fd08c8b6..cf1ca03b3 100644
--- a/tests/unit/test_model.c
+++ b/tests/unit/test_lb_data.c
@@ -1,18 +1,15 @@
 /*****************************************************************************
  *
- *  test_model.c
+ *  test_lb_data.c
  *
- *  Tests for model data: distributions, halos, i/o (pending!).
- *  PENDING: This is to be merged with test_halo.c under "test_lb_data.c".
- *  PENDING: Coverage check.
+ *  Distribution data, including halo swap and i/o.
  *
- *
- *  Edinburgh Soft Matter and Statistical Physics Group
+ *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2022 The University of Edinburgh
+ *  (c) 2025 The University of Edinburgh
  *
- *  Contributing authors:
+ *  Contributing author:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
  *****************************************************************************/
@@ -20,663 +17,344 @@
 #include <assert.h>
 #include <float.h>
 #include <math.h>
-#include <stdlib.h>
 #include <string.h>
 
-#include "pe.h"
-#include "coords.h"
-#include "util.h"
 #include "lb_data.h"
-#include "tests.h"
 
-static void test_model_velocity_set(void);
 
-int do_test_model_distributions(pe_t * pe, cs_t * cs);
-int do_test_model_halo_swap(pe_t * pe, cs_t * cs);
-int do_test_model_reduced_halo_swap(pe_t * pe, cs_t * cs);
-int do_test_lb_model_io(pe_t * pe, cs_t * cs);
+int test_lb_data_create(pe_t * pe, cs_t * cs, const lb_data_options_t * opts);
+int test_lb_f(pe_t * pe, cs_t * cs, const lb_data_options_t * opts);
+int test_lb_f_set(pe_t * pe, cs_t * cs, const lb_data_options_t * opts);
 
-int test_lb_data_write(pe_t * pe, cs_t * cs);
+int test_lb_data_halo(pe_t * pe, cs_t * cs, const lb_data_options_t * opts);
+  
+int test_lb_data_io(pe_t * pe, cs_t * cs);
 int test_lb_write_buf(pe_t * pe, cs_t * cs, const lb_data_options_t * opts);
 int test_lb_write_buf_ascii(pe_t * pe, cs_t * cs, const lb_data_options_t * opts);
 int test_lb_io_aggr_pack(pe_t * pe, cs_t * cs, const lb_data_options_t * opts);
 
-static  int test_model_is_domain(cs_t * cs, int ic, int jc, int kc);
-
-
-/* Utility to return a unique value for global (ic,jc,kc,p) */
-/* This allows e.g., tests to check distribution values in parallel
- * exchanges. */
-
-/* (ic, jc, kc) are local indices */
-/* Result could be unsigned integer... */
-
-#include <stdint.h>
-
-int64_t lb_data_index(lb_t * lb, int ic, int jc, int kc, int n, int p) {
-
-  int64_t index = INT64_MIN;
-  int64_t nall[3] = {0};
-  int64_t nstr[3] = {0};
-  int64_t pstr    = 0;
-  int64_t dstr    = 0;
-
-  int ntotal[3] = {0};
-  int offset[3] = {0};
-  int nhalo = 0;
-
-  assert(lb);
-  assert(0 <= p && p < lb->model.nvel);
-  assert(lb->ndist == 1 || lb->ndist == 2);
-  assert(0 <= n && n < lb->ndist);
-
-  cs_ntotal(lb->cs, ntotal);
-  cs_nlocal_offset(lb->cs, offset);
-  cs_nhalo(lb->cs, &nhalo);
 
-  nall[X] = ntotal[X] + 2*nhalo;
-  nall[Y] = ntotal[Y] + 2*nhalo;
-  nall[Z] = ntotal[Z] + 2*nhalo;
-  nstr[Z] = 1;
-  nstr[Y] = nstr[Z]*nall[Z];
-  nstr[X] = nstr[Y]*nall[Y];
-  pstr    = nstr[X]*nall[X];
-  dstr    = pstr*lb->model.nvel;
-
-  {
-    int igl = offset[X] + ic;
-    int jgl = offset[Y] + jc;
-    int kgl = offset[Z] + kc;
-
-    /* A periodic system */
-    igl = igl % ntotal[X];
-    jgl = jgl % ntotal[Y];
-    kgl = kgl % ntotal[Z];
-    if (igl < 1) igl = igl + ntotal[X];
-    if (jgl < 1) jgl = jgl + ntotal[Y];
-    if (kgl < 1) kgl = kgl + ntotal[Z];
-
-    assert(1 <= igl && igl <= ntotal[X]);
-    assert(1 <= jgl && jgl <= ntotal[Y]);
-    assert(1 <= kgl && kgl <= ntotal[Z]);
-
-    index = dstr*n + pstr*p + nstr[X]*igl + nstr[Y]*jgl + nstr[Z]*kgl;
-  }
-
-  return index;
-}
+int util_lb_data_check_initialise(lb_t * lb);
+int util_lb_data_check(lb_t * lb, int full);
+int util_lb_data_check_no_halo(lb_t * lb);
 
 /*****************************************************************************
  *
- *  util_lb_data_check_set
+ *  test_lb_data_suite
  *
- *  Set unique test values in the distribution.
- * 
  *****************************************************************************/
 
-int util_lb_data_check_set(lb_t * lb) {
+int test_lb_data_suite(void) {
 
-  int nlocal[3] = {0};
-
-  assert(lb);
-
-  cs_nlocal(lb->cs, nlocal);
-
-  for (int ic = 1; ic <= nlocal[X]; ic++) {
-    for (int jc = 1; jc <= nlocal[Y]; jc++) {
-      for (int kc = 1; kc <= nlocal[Z]; kc++) {
-	for (int n = 0; n < lb->ndist; n++) {
-	  for (int p = 0 ; p < lb->model.nvel; p++) {
-	    int index = cs_index(lb->cs, ic, jc, kc);
-	    int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
-	    lb->f[laddr] = 1.0*lb_data_index(lb, ic, jc, kc, n, p);
-	  }
-	}
-      }
-    }
-  }
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  util_lb_data_check
- *
- *  Examine halo values and check they are as expected.
- *
- *****************************************************************************/
-
-int util_lb_data_check(lb_t * lb, int full) {
-
-  int ifail = 0;
-  int nh = 1;
-  int nhk = nh;
-  int nlocal[3] = {0};
-
-  assert(lb);
-
-  cs_nlocal(lb->cs, nlocal);
-
-  /* Fix for 2d, where there should be no halo regions in Z */
-  if (lb->ndim == 2) nhk = 0;
-
-  for (int ic = 1 - nh; ic <= nlocal[X] + nh; ic++) {
-    for (int jc = 1 - nh; jc <= nlocal[Y] + nh; jc++) {
-      for (int kc = 1 - nhk; kc <= nlocal[Z] + nhk; kc++) {
+  pe_t * pe = NULL;
 
-	int is_halo = (ic < 1 || jc < 1 || kc < 1 ||
-		       ic > nlocal[X] || jc > nlocal[Y] || kc > nlocal[Z]);
+  pe_create(MPI_COMM_WORLD, PE_QUIET, &pe);
 
-	if (is_halo == 0) continue;
+  /* Two dimensional system */
+  {
+    int ntotal[3] = {64, 64, 1};
+    cs_t * cs = NULL;
 
-	int index = cs_index(lb->cs, ic, jc, kc);
+    cs_create(pe, &cs);
+    cs_ntotal_set(cs, ntotal);
+    cs_init(cs);
 
-	for (int n = 0; n < lb->ndist; n++) {
-	  for (int p = 0; p < lb->model.nvel; p++) {
+    /* D2Q9 ndist = 1 */
+    {
+      int ndist = 1;
+      lb_data_options_t opts = lb_data_options_ndim_nvel_ndist(2, 9, ndist);
 
-	    /* Look for propagating distributions (into domain). */
-	    int icdt = ic + lb->model.cv[p][X];
-	    int jcdt = jc + lb->model.cv[p][Y];
-	    int kcdt = kc + lb->model.cv[p][Z];
+      test_lb_data_create(pe, cs, &opts);
 
-	    is_halo = (icdt < 1 || jcdt < 1 || kcdt < 1 ||
-		     icdt > nlocal[X] || jcdt > nlocal[Y] || kcdt > nlocal[Z]);
+      test_lb_f(pe, cs, &opts);
+      test_lb_f_set(pe, cs, &opts);
+      test_lb_data_halo(pe, cs, &opts);
 
-	    if (full || is_halo == 0) {
-	      /* Check */
-	      int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
-	      double fex = 1.0*lb_data_index(lb, ic, jc, kc, n, p);
-	      if (fabs(fex - lb->f[laddr]) > DBL_EPSILON) ifail += 1;
-	      assert(fabs(fex - lb->f[laddr]) < DBL_EPSILON);
-	    }
-	  }
-	}
-	/* Next (ic,jc,kc) */
-      }
     }
-  }
-
-  return ifail;
-}
-
-/*****************************************************************************
- *
- *  util_lb_data_check_no_halo
- *
- *  Examine non-halo values.
- *
- *****************************************************************************/
-
-int util_lb_data_check_no_halo(lb_t * lb) {
-
-  int ifail = 0;
-  int nlocal[3] = {0};
-
-  assert(lb);
-
-  cs_nlocal(lb->cs, nlocal);
-
-  /* Fix for 2d, where there should be no halo regions in Z */
 
-  for (int ic = 1; ic <= nlocal[X]; ic++) {
-    for (int jc = 1; jc <= nlocal[Y]; jc++) {
-      for (int kc = 1; kc <= nlocal[Z]; kc++) {
+    /* D2Q9 ndist = 2 */
+    {
+      int ndist = 2;
+      lb_data_options_t opts = lb_data_options_ndim_nvel_ndist(2, 9, ndist);
 
-	int index = cs_index(lb->cs, ic, jc, kc);
+      test_lb_data_create(pe, cs, &opts);
 
-	for (int n = 0; n < lb->ndist; n++) {
-	  for (int p = 0; p < lb->model.nvel; p++) {
-	    /* Check */
-	    int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
-	    double fex = 1.0*lb_data_index(lb, ic, jc, kc, n, p);
-	    if (fabs(fex - lb->f[laddr]) > DBL_EPSILON) ifail += 1;
-	    assert(fabs(fex - lb->f[laddr]) < DBL_EPSILON);
-	  }
-	}
-	/* Next (ic,jc,kc) */
-      }
+      test_lb_f(pe, cs, &opts);
+      test_lb_f_set(pe, cs, &opts);
+      test_lb_data_halo(pe, cs, &opts);
     }
-  }
-
-  return ifail;
-}
-
-/*****************************************************************************
- *
- *  test_lb_halo_post_wait
- *
- *****************************************************************************/
-
-int test_lb_halo_post_wait(pe_t * pe, cs_t * cs, int ndim, int nvel, int full) {
-
-  lb_data_options_t options = lb_data_options_default();
-  lb_t * lb = NULL;
-
-  assert(pe);
-  assert(cs);
-
-  options.ndim = ndim;
-  options.nvel = nvel;
-  lb_data_create(pe, cs, &options, &lb);
-
-  util_lb_data_check_set(lb);
-
-  {
-    lb_halo_t h = {0};
-    lb_halo_create(lb, &h, LB_HALO_OPENMP_FULL);
-    lb_halo_post(lb, &h);
-    lb_halo_wait(lb, &h);
-    lb_halo_free(lb, &h);
-  }
-
-  util_lb_data_check(lb, full);
-  lb_free(lb);
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  test_lb_halo
- *
- *****************************************************************************/
-
-int test_lb_halo(pe_t * pe) {
-
-  assert(pe);
-
-  /* Two dimensional system */
-  {
-    cs_t * cs = NULL;
-    int ntotal[3] = {64, 64, 1};
-
-    cs_create(pe, &cs);
-    cs_ntotal_set(cs, ntotal);
-    cs_init(cs);
-
-    test_lb_halo_post_wait(pe, cs, 2, 9, LB_HALO_OPENMP_REDUCED);
-    test_lb_halo_post_wait(pe, cs, 2, 9, LB_HALO_OPENMP_FULL);
 
     cs_free(cs);
   }
 
-  /* Three dimensional system */
+  /* Three dimensional system */
   {
+    int ntotal[3] = {32, 32, 32};
     cs_t * cs = NULL;
 
     cs_create(pe, &cs);
+    cs_ntotal_set(cs, ntotal);
     cs_init(cs);
 
-    test_lb_halo_post_wait(pe, cs, 3, 15, LB_HALO_OPENMP_REDUCED);
-    test_lb_halo_post_wait(pe, cs, 3, 15, LB_HALO_OPENMP_FULL);
-    test_lb_halo_post_wait(pe, cs, 3, 19, LB_HALO_OPENMP_REDUCED);
-    test_lb_halo_post_wait(pe, cs, 3, 19, LB_HALO_OPENMP_FULL);
-    test_lb_halo_post_wait(pe, cs, 3, 27, LB_HALO_OPENMP_REDUCED);
-    test_lb_halo_post_wait(pe, cs, 3, 27, LB_HALO_OPENMP_FULL);
-
-    cs_free(cs);
-  }
-
-  return 0;
-}
-
-
-/*****************************************************************************
- *
- *  test_model_suite
- *
- *****************************************************************************/
-
-int test_model_suite(void) {
-
-  pe_t * pe = NULL;
-  cs_t * cs = NULL;
-
-  pe_create(MPI_COMM_WORLD, PE_QUIET, &pe);
-
-  test_lb_halo(pe);
-
-  cs_create(pe, &cs);
-  cs_init(cs);
-
-  /* Test model structure (coordinate-independent stuff) */
-
-  test_model_velocity_set();
-
-  /* Now test actual distributions */
-
-  do_test_model_distributions(pe, cs);
-  do_test_model_halo_swap(pe, cs);
-  do_test_model_reduced_halo_swap(pe, cs);
-
-  test_lb_data_write(pe, cs);
-
-  pe_info(pe, "PASS     ./unit/test_model\n");
-  cs_free(cs);
-  pe_free(pe);
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  test_model_velocity_set
- *
- *  Some residual older tests which remain relevant.
- *
- *****************************************************************************/
-
-static void test_model_velocity_set(void) {
-
-  test_assert(NHYDRO == (1 + NDIM + NDIM*(NDIM+1)/2));
-
-  printf("Compiled model NDIM %2d NVEL %2d\n", NDIM, NVEL);
-  printf("sizeof(lb_collide_param_t) %ld bytes\n", sizeof(lb_collide_param_t));
-
-  return;
-}
-
-/*****************************************************************************
- *
- *  do_test_model_distributions
- *
- *  Test the distribution interface.
- *
- *****************************************************************************/
-
-int do_test_model_distributions(pe_t * pe, cs_t * cs) {
-
-  int i, n, p;
-  int index = 1;
-  int ndist = 2;
-  double fvalue, fvalue_expected;
-  double u[3];
+    /* D3Q15 (not much used these days) */
+    {
+      int ndist = 1;
+      lb_data_options_t opts = lb_data_options_ndim_nvel_ndist(3, 15, ndist);
 
-  lb_data_options_t options = lb_data_options_default();
-  lb_t * lb = NULL;
+      test_lb_data_create(pe, cs, &opts);
 
-  assert(pe);
-  assert(cs);
+      test_lb_f(pe, cs, &opts);
+      test_lb_f_set(pe, cs, &opts);
+      test_lb_data_halo(pe, cs, &opts);
+    }
 
-  /* Tests of the basic distribution functions. */
+    /* D3Q19 */
+    {
+      int ndist = 1;
+      lb_data_options_t opts = lb_data_options_ndim_nvel_ndist(3, 19, ndist);
 
-  options.ndim  = NDIM;
-  options.nvel  = NVEL;   
-  options.ndist = ndist;
+      test_lb_data_create(pe, cs, &opts);
 
-  lb_data_create(pe, cs, &options, &lb);
-  assert(lb);
-  assert(lb->ndist == ndist);
-
-  for (n = 0; n < ndist; n++) {
-    for (p = 0; p < lb->model.nvel; p++) {
-      fvalue_expected = 0.01*n + lb->model.wv[p];
-      lb_f_set(lb, index, p, n, fvalue_expected);
-      lb_f(lb, index, p, n, &fvalue);
-      assert(fabs(fvalue - fvalue_expected) < DBL_EPSILON);
+      test_lb_f(pe, cs, &opts);
+      test_lb_f_set(pe, cs, &opts);
+      test_lb_data_halo(pe, cs, &opts);
     }
+    /* D3Q19 (used ndist = 2 in the past) */
+    {
+      int ndist = 2;
+      lb_data_options_t opts = lb_data_options_ndim_nvel_ndist(3, 19, ndist);
 
-    /* Check zeroth moment... */
+      test_lb_data_create(pe, cs, &opts);
 
-    fvalue_expected = 0.01*n*lb->model.nvel + 1.0;
-    lb_0th_moment(lb, index, (lb_dist_enum_t) n, &fvalue);
-    assert(fabs(fvalue - fvalue_expected) <= DBL_EPSILON);
+      test_lb_f(pe, cs, &opts);
+      test_lb_f_set(pe, cs, &opts);
+      test_lb_data_halo(pe, cs, &opts);
+    }
 
-    /* Check first moment... */
+    /* D3Q27 */
+    {
+      int ndist = 1;
+      lb_data_options_t opts = lb_data_options_ndim_nvel_ndist(3, 27, ndist);
 
-    lb_1st_moment(lb, index, (n == 0) ? LB_RHO : LB_PHI, u);
+      test_lb_data_create(pe, cs, &opts);
 
-    for (i = 0; i < lb->model.ndim; i++) {
-      assert(fabs(u[i] - 0.0) < DBL_EPSILON);
+      test_lb_f(pe, cs, &opts);
+      test_lb_f_set(pe, cs, &opts);
+      test_lb_data_halo(pe, cs, &opts);
     }
+    
+    test_lb_data_io(pe, cs);
+    cs_free(cs);
   }
 
-  lb_free(lb);
+  pe_info(pe, "%-9s %s\n", "PASS", __FILE__);
+  pe_free(pe);
 
   return 0;
 }
 
 /*****************************************************************************
  *
- *  do_test_model_halo_swap
- *
- *  Test full halo swap.
+ *  test_lb_data_create
  *
  *****************************************************************************/
 
-int do_test_model_halo_swap(pe_t * pe, cs_t * cs) {
+int test_lb_data_create(pe_t * pe, cs_t * cs, const lb_data_options_t * opts) {
 
-  int i, j, k, p;
-  int n, ndist = 2;
-  int index, nlocal[3];
-  const int nextra = 1;  /* Distribution halo width always 1 */
-  double f_expect;
-  double f_actual;
-
-  lb_data_options_t options = lb_data_options_default();
+  int ifail = 0;
   lb_t * lb = NULL;
 
-  assert(pe);
-  assert(cs);
-
-  options.ndim = NDIM;
-  options.nvel = NVEL;
-  options.ndist = ndist;
-  lb_data_create(pe, cs, &options, &lb);
-
-  cs_nlocal(cs, nlocal);
-
-  /* The test relies on a uniform decomposition in parallel:
-   *
-   * f[0] or f[X] is set to local x index,
-   * f[1] or f[Y] is set to local y index
-   * f[2] or f[Z] is set to local z index
-   * remainder are set to velocity index. */
-
-  for (i = 1; i <= nlocal[X]; i++) {
-    for (j = 1; j <= nlocal[Y]; j++) {
-      for (k = 1; k <= nlocal[Z]; k++) {
-
-	index = cs_index(cs, i, j, k);
-
-	for (n = 0; n < ndist; n++) {
-	  lb_f_set(lb, index, X, n, (double) (i));
-	  lb_f_set(lb, index, Y, n, (double) (j));
-	  lb_f_set(lb, index, Z, n, (double) (k));
-
-	  for (p = 3; p < lb->model.nvel; p++) {
-	    lb_f_set(lb, index, p, n, (double) p);
-	  }
-	}
-      }
-    }
-  }
-
-  lb_memcpy(lb, tdpMemcpyHostToDevice);
-  lb_halo(lb);
-  lb_memcpy(lb, tdpMemcpyDeviceToHost);
+  assert(NVELMAX == 27);
+  assert(LB_RECORD_LENGTH_ASCII == 23);
 
-  /* Test all the sites not in the interior */
+  /* Really a collision concern ... */
+  assert(NHYDRO == (1 + NDIM + NDIM*(NDIM+1)/2));
 
-  for (i = 1 - nextra; i <= nlocal[X] + nextra; i++) {
-    if (i >= 1 && i <= nlocal[X]) continue;
-    for (j = 1 - nextra; j <= nlocal[Y] + nextra; j++) {
-      if (j >= 1 && j <= nlocal[Y]) continue;
-      for (k = 1 - nextra; k <= nlocal[Z] + nextra; k++) {
-	if (k >= 1 && k <= nlocal[Z]) continue;
+  ifail = lb_data_create(pe, cs, opts, &lb);
+  assert(ifail == 0);
 
-	index = cs_index(cs, i, j, k);
+  /* Host */
 
-	for (n = 0; n < ndist; n++) {
+  assert(lb->ndim  == opts->ndim);
+  assert(lb->nvel  == opts->nvel);
+  assert(lb->ndist == opts->ndist);
+  assert(lb->nsite == cs->param->nsites);
 
-	  f_expect = 1.0*abs(i - nlocal[X]);
-	  lb_f(lb, index, X, n, &f_actual);
-	  test_assert(fabs(f_actual - f_expect) < DBL_EPSILON);
+  assert(lb->pe    == pe);
+  assert(lb->cs    == cs);
 
-	  f_expect = 1.0*abs(j - nlocal[Y]);
-	  lb_f(lb, index, Y, n, &f_actual);
-	  test_assert(fabs(f_actual - f_expect) < DBL_EPSILON);
+  /* We will assume this is a sufficient check of the model ... */
+  assert(lb->model.ndim == lb->ndim);
+  assert(lb->model.nvel == lb->nvel);
 
-	  f_expect = 1.0*abs(k - nlocal[Z]);
-	  lb_f(lb, index, Z, n, &f_actual);
-	  test_assert(fabs(f_actual - f_expect) < DBL_EPSILON);
+  /* i/o quantities are dealt with separately */
 
-	  for (p = 3; p < lb->model.nvel; p++) {
-	    lb_f(lb, index, p, n, &f_actual);
-	    f_expect = (double) p;
-	    test_assert(fabs(f_actual - f_expect) < DBL_EPSILON);
-	  }
-	}
-      }
-    }
+  /* distribution storage */
+  assert(lb->f);
+  assert(lb->fprime);
+
+  assert(lb->nrelax     == LB_RELAXATION_M10); /* Default */
+  assert(lb->haloscheme == LB_HALO_FULL);      /* Default */
+
+  /* FIXME: It looks like the total number of planes are allocated on
+     all ranks. Should be planetotal/cartsz[X] */
+  if (cs->leopts.nplanes > 0) {
+    assert(lb->sbuff);
+    assert(lb->rbuff);
   }
 
+  /* Target */
+  /* Should really have a kernel to check the device copy */
+
   lb_free(lb);
 
-  return 0;
+  return ifail;
 }
 
 /*****************************************************************************
  *
- *  do_test_model_reduced_halo_swap
+ *  test_lb_f
  *
  *****************************************************************************/
 
-int do_test_model_reduced_halo_swap(pe_t * pe, cs_t * cs) {  
-
-  int i, j, k, p;
-  int icdt, jcdt, kcdt;
-  int index, nlocal[3];
-  int n, ndist = 1;
-  const int nextra = 1;
-
-  double f_expect;
-  double f_actual;
+int test_lb_f(pe_t * pe, cs_t * cs, const lb_data_options_t * opts) {
 
-  lb_data_options_t options = lb_data_options_default();
+  int ifail = 0;
   lb_t * lb = NULL;
 
-  assert(pe);
-  assert(cs);
-
-  options.ndim = NDIM;
-  options.nvel = NVEL;
-  options.ndist = ndist;
-  options.halo = LB_HALO_OPENMP_REDUCED;
-  lb_data_create(pe, cs, &options, &lb);
-  assert(lb);
-
-  cs_nlocal(cs, nlocal);
+  ifail = lb_data_create(pe, cs, opts, &lb);
+  assert(ifail == 0);
 
-  /* Set everything which is NOT in a halo */
+  /* Assign some non-zero values */
+  {
+    int index = 13;
 
-  for (i = 1; i <= nlocal[X]; i++) {
-    for (j = 1; j <= nlocal[Y]; j++) {
-      for (k = 1; k <= nlocal[Z]; k++) {
-	index = cs_index(cs, i, j, k);
-	for (n = 0; n < ndist; n++) {
-	  for (p = 0; p < lb->model.nvel; p++) {
-	    f_expect = 1.0*(n*lb->model.nvel + p);
-	    lb_f_set(lb, index, p, n, f_expect);
-	  }
-	}
+    for (int n = 0; n < lb->ndist; n++) {
+      for (int p = 0; p < lb->nvel; p++) {
+	int iaddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
+	lb->f[iaddr] = 1.0*((1 + n)*lb->nvel + 1 + p);
       }
     }
-  }
 
-  lb_halo(lb);
-
-  /* Now check that the interior sites are unchanged */
-
-  for (i = 1; i <= nlocal[X]; i++) {
-    for (j = 1; j <= nlocal[Y]; j++) {
-      for (k = 1; k <= nlocal[Z]; k++) {
-	index = cs_index(cs, i, j, k);
-	for (n = 0; n < ndist; n++) {
-	  for (p = 0; p < lb->model.nvel; p++) {
-	    lb_f(lb, index, p, n, &f_actual);
-	    f_expect = 1.0*(n*lb->model.nvel +  p);
-	    test_assert(fabs(f_expect - f_actual) < DBL_EPSILON);
-	  }
-	}
+    for (int n = 0; n < lb->ndist; n++) {
+      for (int p = 0; p < lb->nvel; p++) {
+	double f = 0.0; /* lb_f() must return the value stored in lb->f above */
+	lb_f(lb, index, p, (n == 0) ? LB_RHO : LB_PHI, &f);
+	assert(fabs(f - 1.0*((1 + n)*lb->nvel + 1 + p)) < DBL_EPSILON);
       }
     }
   }
 
-  /* Also check the halos sites. The key test of the reduced halo
-   * swap is that distributions for which r + c_i dt takes us into
-   * the domain proper must be correct. */
+  lb_free(lb);
 
-  for (i = 1 - nextra; i <= nlocal[X] + nextra; i++) {
-    if (i >= 1 && i <= nlocal[X]) continue;
-    for (j = 1 - nextra; j <= nlocal[Y] + nextra; j++) {
-      if (j >= 1 && j <= nlocal[Y]) continue;
-      for (k = 1 - nextra; k <= nlocal[Z] + nextra; k++) {
-	if (k >= 1 && k <= nlocal[Z]) continue;
+  return ifail;
+}
 
-	index = cs_index(cs, i, j, k);
+/*****************************************************************************
+ *
+ *  test_lb_f_set
+ *
+ *****************************************************************************/
 
-	for (n = 0; n < ndist; n++) {
-	  for (p = 0; p < lb->model.nvel; p++) {
+int test_lb_f_set(pe_t * pe, cs_t * cs, const lb_data_options_t * opts) {
 
-	    lb_f(lb, index, p, n, &f_actual);
-	    f_expect = 1.0*(n*lb->model.nvel + p);
+  int ifail = 0;
+  lb_t * lb = NULL;
 
-	    icdt = i + lb->model.cv[p][X];
-	    jcdt = j + lb->model.cv[p][Y];
-	    kcdt = k + lb->model.cv[p][Z];
+  ifail = lb_data_create(pe, cs, opts, &lb);
+  assert(ifail == 0);
 
-	    if (test_model_is_domain(cs, icdt, jcdt, kcdt)) {
-	      test_assert(fabs(f_actual - f_expect) < DBL_EPSILON);
-	    }
-	  }
-	}
+  {
+    /* Assign some values */
+    int index = 12;
 
-	/* Next site */
+    for (int n = 0; n < lb->ndist; n++) {
+      for (int p = 0; p < lb->nvel; p++) {
+	double f = 1.0*((1 + n)*lb->nvel + 1 + p);
+	lb_f_set(lb, index, p, (n == 0) ? LB_RHO : LB_PHI, f);
+      }
+    }
+    /* ... and check the raw storage holds exactly what lb_f_set() was given */
+    for (int n = 0; n < lb->ndist; n++) {
+      for (int p = 0; p < lb->nvel; p++) {
+	int iaddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
+	double f = 1.0*((1 + n)*lb->nvel + 1 + p);
+	if (!(fabs(lb->f[iaddr] - f) < DBL_EPSILON)) ifail = -1;
+	assert(ifail == 0);
       }
     }
   }
 
   lb_free(lb);
 
-  return 0;
+  return ifail;
 }
 
 /*****************************************************************************
  *
- *  test_model_is_domain
+ *  test_lb_data_halo
  *
- *  Is (ic, jc, kc) in the domain proper?
+ *  Driver.
  *
  *****************************************************************************/
 
-static int test_model_is_domain(cs_t * cs, int ic, int jc, int kc) {
+int test_lb_data_halo(pe_t * pe, cs_t * cs, const lb_data_options_t * opts) {
 
-  int nlocal[3];
-  int iam = 1;
+  int ifail = 0;
 
-  assert(cs);
+  /* Full halo should be default */
+  assert(opts->halo == LB_HALO_FULL);
 
-  cs_nlocal(cs, nlocal);
+  /* Full halo */
+  {
+    lb_t * lb = NULL;
+
+    ifail = lb_data_create(pe, cs, opts, &lb);
+    assert(ifail == 0);
+
+    util_lb_data_check_initialise(lb);
+    lb_memcpy(lb, tdpMemcpyHostToDevice);
+
+    lb_halo(lb);
+
+    lb_memcpy(lb, tdpMemcpyDeviceToHost);
+    util_lb_data_check(lb, 1);
+    lb_free(lb);
+  }
+
+  /* Reduced halo */
+  {
+    lb_t * lb = NULL;
+    lb_data_options_t reduced_opts = *opts;
+    reduced_opts.halo = LB_HALO_REDUCED;
+
+    ifail = lb_data_create(pe, cs, &reduced_opts, &lb);
+    assert(ifail == 0);
+
+    util_lb_data_check_initialise(lb);
+    lb_memcpy(lb, tdpMemcpyHostToDevice);
+
+    lb_halo(lb);
+
+    lb_memcpy(lb, tdpMemcpyDeviceToHost);
+    util_lb_data_check(lb, 0);
 
-  if (ic < 1) iam = 0;
-  if (jc < 1) iam = 0;
-  if (kc < 1) iam = 0;
-  if (ic > nlocal[X]) iam = 0;
-  if (jc > nlocal[Y]) iam = 0;
-  if (kc > nlocal[Z]) iam = 0;
+    lb_free(lb);
+  }
 
-  return iam;
+  return ifail;
 }
 
 /*****************************************************************************
  *
- *  test_lb_data_write
+ *  test_lb_data_io
+ *
+ *  Driver for all read/write io routines.
+ *  E.g., test_lb_write_buf() also deals with lb_read_buf()
  *
  *****************************************************************************/
 
-int test_lb_data_write(pe_t * pe, cs_t * cs) {
+int test_lb_data_io(pe_t * pe, cs_t * cs) {
 
   assert(NVELMAX == 27);
 
@@ -882,7 +560,7 @@ int test_lb_io_aggr_pack(pe_t * pe, cs_t * cs, const lb_data_options_t *opts) {
     io_aggregator_t aggr = {0};
 
     io_aggregator_initialise(lb->ascii, lim, &aggr);
-    util_lb_data_check_set(lb);
+    util_lb_data_check_initialise(lb);
     lb_io_aggr_pack(lb, &aggr);
 
     /* Clear the ditributions, unpack, and check */
@@ -902,7 +580,7 @@ int test_lb_io_aggr_pack(pe_t * pe, cs_t * cs, const lb_data_options_t *opts) {
     io_aggregator_t aggr = {0};
 
     io_aggregator_initialise(lb->binary, lim, &aggr);
-    util_lb_data_check_set(lb);
+    util_lb_data_check_initialise(lb);
     lb_io_aggr_pack(lb, &aggr);
 
     /* Clear the ditributions, unpack, and check */
@@ -918,3 +596,205 @@ int test_lb_io_aggr_pack(pe_t * pe, cs_t * cs, const lb_data_options_t *opts) {
 
   return 0;
 }
+
+/*****************************************************************************
+ *
+ *  util_lb_data_index
+ *
+ *  Utility to return a unique value for global (ic,jc,kc,p)
+ *  This allows e.g., tests to check distribution values in parallel
+ *  exchanges.
+ *
+ *  (ic, jc, kc) are local indices
+ *  The result could be an unsigned integer, but does need 64 bit...
+ *
+ *****************************************************************************/
+
+int64_t util_lb_data_index(lb_t * lb, int ic, int jc, int kc, int n, int p) {
+
+  int64_t index = INT64_MIN;   /* Placeholder; always overwritten below */
+  int64_t nall[3] = {0};       /* Global extent plus halo on both sides */
+  int64_t nstr[3] = {0};       /* Stride associated with each coordinate */
+  int64_t pstr    = 0;         /* Stride associated with velocity index p */
+  int64_t dstr    = 0;         /* Stride associated with distribution n */
+
+  int ntotal[3] = {0};
+  int offset[3] = {0};
+  int nhalo = 0;
+
+  assert(lb);
+  assert(0 <= p && p < lb->model.nvel);
+  assert(lb->ndist == 1 || lb->ndist == 2);
+  assert(0 <= n && n < lb->ndist);
+
+  cs_ntotal(lb->cs, ntotal);
+  cs_nlocal_offset(lb->cs, offset);
+  cs_nhalo(lb->cs, &nhalo);
+
+  nall[X] = ntotal[X] + 2*nhalo;
+  nall[Y] = ntotal[Y] + 2*nhalo;
+  nall[Z] = ntotal[Z] + 2*nhalo;
+  nstr[Z] = 1;                 /* Z has unit stride; then Y; then X */
+  nstr[Y] = nstr[Z]*nall[Z];
+  nstr[X] = nstr[Y]*nall[Y];
+  pstr    = nstr[X]*nall[X];
+  dstr    = pstr*lb->model.nvel;
+
+  {
+    int igl = offset[X] + ic;  /* local index -> global index */
+    int jgl = offset[Y] + jc;
+    int kgl = offset[Z] + kc;
+
+    /* A periodic system: wrap each global index into the range 1..ntotal */
+    igl = igl % ntotal[X];
+    jgl = jgl % ntotal[Y];
+    kgl = kgl % ntotal[Z];
+    if (igl < 1) igl = igl + ntotal[X];
+    if (jgl < 1) jgl = jgl + ntotal[Y];
+    if (kgl < 1) kgl = kgl + ntotal[Z];
+
+    assert(1 <= igl && igl <= ntotal[X]);
+    assert(1 <= jgl && jgl <= ntotal[Y]);
+    assert(1 <= kgl && kgl <= ntotal[Z]);
+
+    index = dstr*n + pstr*p + nstr[X]*igl + nstr[Y]*jgl + nstr[Z]*kgl;
+  }
+
+  return index;
+}
+
+/*****************************************************************************
+ *
+ *  util_lb_data_check_initialise
+ *
+ *  Set unique test values in the distribution at all local (non-halo) sites.
+ *
+ *****************************************************************************/
+
+int util_lb_data_check_initialise(lb_t * lb) {
+
+  int nlocal[3] = {0};
+
+  assert(lb);
+
+  cs_nlocal(lb->cs, nlocal);
+
+  for (int ic = 1; ic <= nlocal[X]; ic++) {
+    for (int jc = 1; jc <= nlocal[Y]; jc++) {
+      for (int kc = 1; kc <= nlocal[Z]; kc++) {
+	for (int n = 0; n < lb->ndist; n++) {
+	  for (int p = 0 ; p < lb->model.nvel; p++) {
+	    int index = cs_index(lb->cs, ic, jc, kc);
+	    int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
+	    lb->f[laddr] = 1.0*util_lb_data_index(lb, ic, jc, kc, n, p);
+	  }
+	}
+      }
+    }
+  }
+
+  return 0; /* No failure mode: the return value is always zero */
+}
+
+/*****************************************************************************
+ *
+ *  util_lb_data_check
+ *
+ *  Examine halo values and check they are as expected.
+ *
+ *****************************************************************************/
+
+int util_lb_data_check(lb_t * lb, int full) {
+
+  int ifail = 0;               /* Number of mismatched values (returned) */
+  int nh = 1;                  /* Halo width examined in X and Y */
+  int nhk = nh;                /* Halo width examined in Z */
+  int nlocal[3] = {0};
+
+  assert(lb);
+
+  cs_nlocal(lb->cs, nlocal);
+
+  /* Fix for 2d, where there should be no halo regions in Z */
+  if (lb->ndim == 2) nhk = 0;
+
+  for (int ic = 1 - nh; ic <= nlocal[X] + nh; ic++) {
+    for (int jc = 1 - nh; jc <= nlocal[Y] + nh; jc++) {
+      for (int kc = 1 - nhk; kc <= nlocal[Z] + nhk; kc++) {
+
+	int is_halo = (ic < 1 || jc < 1 || kc < 1 ||
+		       ic > nlocal[X] || jc > nlocal[Y] || kc > nlocal[Z]);
+
+	if (is_halo == 0) continue;
+
+	int index = cs_index(lb->cs, ic, jc, kc);
+
+	for (int n = 0; n < lb->ndist; n++) {
+	  for (int p = 0; p < lb->model.nvel; p++) {
+
+	    /* Destination of p. NB: is_prop != 0 means it is OUTSIDE domain */
+	    int icdt = ic + lb->model.cv[p][X];
+	    int jcdt = jc + lb->model.cv[p][Y];
+	    int kcdt = kc + lb->model.cv[p][Z];
+
+	    int is_prop = (icdt < 1 || jcdt < 1 || kcdt < 1 ||
+		     icdt > nlocal[X] || jcdt > nlocal[Y] || kcdt > nlocal[Z]);
+
+	    if (full || is_prop == 0) {
+	      /* Check every p if full, else only p landing in the domain */
+	      int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
+	      double fex = 1.0*util_lb_data_index(lb, ic, jc, kc, n, p);
+	      if (fabs(fex - lb->f[laddr]) > DBL_EPSILON) ifail += 1;
+	      assert(fabs(fex - lb->f[laddr]) < DBL_EPSILON);
+	    }
+	  }
+	}
+	/* Next (ic,jc,kc) */
+      }
+    }
+  }
+
+  return ifail;
+}
+
+/*****************************************************************************
+ *
+ *  util_lb_data_check_no_halo
+ *
+ *  Examine non-halo values.
+ *
+ *****************************************************************************/
+
+int util_lb_data_check_no_halo(lb_t * lb) {
+
+  int ifail = 0;               /* Number of mismatched values (returned) */
+  int nlocal[3] = {0};
+
+  assert(lb);
+
+  cs_nlocal(lb->cs, nlocal);
+
+  /* Only sites 1..nlocal are visited, so 2d needs no special case in Z */
+
+  for (int ic = 1; ic <= nlocal[X]; ic++) {
+    for (int jc = 1; jc <= nlocal[Y]; jc++) {
+      for (int kc = 1; kc <= nlocal[Z]; kc++) {
+
+	int index = cs_index(lb->cs, ic, jc, kc);
+
+	for (int n = 0; n < lb->ndist; n++) {
+	  for (int p = 0; p < lb->model.nvel; p++) {
+	    /* Compare stored value with that from util_lb_data_index() */
+	    int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
+	    double fex = 1.0*util_lb_data_index(lb, ic, jc, kc, n, p);
+	    if (fabs(fex - lb->f[laddr]) > DBL_EPSILON) ifail += 1;
+	    assert(fabs(fex - lb->f[laddr]) < DBL_EPSILON);
+	  }
+	}
+	/* Next (ic,jc,kc) */
+      }
+    }
+  }
+
+  return ifail;
+}
diff --git a/tests/unit/test_prop.c b/tests/unit/test_prop.c
index 87784b0ad..668d94685 100644
--- a/tests/unit/test_prop.c
+++ b/tests/unit/test_prop.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2024 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -25,10 +25,10 @@
 #include "propagation.h"
 #include "tests.h"
 
-__host__ int do_test_velocity(pe_t * pe, cs_t * cs, int ndist,
-			      lb_halo_enum_t halo);
-__host__ int do_test_source_destination(pe_t * pe, cs_t * cs, int ndist,
-					lb_halo_enum_t halo);
+int do_test_velocity(pe_t * pe, cs_t * cs, int ndist,
+		     lb_halo_enum_t halo);
+int do_test_source_destination(pe_t * pe, cs_t * cs, int ndist,
+			       lb_halo_enum_t halo);
 
 /*****************************************************************************
  *
@@ -38,7 +38,6 @@ __host__ int do_test_source_destination(pe_t * pe, cs_t * cs, int ndist,
 
 int test_lb_prop_suite(void) {
 
-  int ndevice = 0;
   pe_t * pe = NULL;
   cs_t * cs = NULL;
 
@@ -46,20 +45,20 @@ int test_lb_prop_suite(void) {
   cs_create(pe, &cs);
   cs_init(cs);
 
-  tdpAssert( tdpGetDeviceCount(&ndevice) );
-
-  do_test_velocity(pe, cs, 1, LB_HALO_TARGET);
-  do_test_velocity(pe, cs, 2, LB_HALO_TARGET);
-  if (ndevice == 0) {
-    do_test_velocity(pe, cs, 1, LB_HALO_OPENMP_FULL);
-    do_test_velocity(pe, cs, 1, LB_HALO_OPENMP_REDUCED);
+  {
+    int ndist = 1;
+    do_test_velocity(pe, cs, ndist, LB_HALO_FULL);
+    do_test_velocity(pe, cs, ndist, LB_HALO_REDUCED);
+    do_test_source_destination(pe, cs, ndist, LB_HALO_FULL);
+    do_test_source_destination(pe, cs, ndist, LB_HALO_REDUCED);
   }
 
-  do_test_source_destination(pe, cs, 1, LB_HALO_TARGET);
-  do_test_source_destination(pe, cs, 2, LB_HALO_TARGET);
-  if (ndevice == 0) {
-    do_test_source_destination(pe, cs, 1, LB_HALO_OPENMP_FULL);
-    do_test_source_destination(pe, cs, 1, LB_HALO_OPENMP_REDUCED);
+  {
+    int ndist = 2;
+    do_test_velocity(pe, cs, ndist, LB_HALO_FULL);
+    do_test_velocity(pe, cs, ndist, LB_HALO_REDUCED);
+    do_test_source_destination(pe, cs, ndist, LB_HALO_FULL);
+    do_test_source_destination(pe, cs, ndist, LB_HALO_REDUCED);
   }
 
   pe_info(pe, "PASS     ./unit/test_prop\n");
diff --git a/tests/unit/test_util_ellipsoid.c b/tests/unit/test_util_ellipsoid.c
index 9def6e782..196cb8165 100644
--- a/tests/unit/test_util_ellipsoid.c
+++ b/tests/unit/test_util_ellipsoid.c
@@ -5,7 +5,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2023-2024 The University of Edinburgh
+ *  (c) 2023-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -891,7 +891,8 @@ int test_util_q4_distance_to_tangent_plane(void) {
       double d = -1.0;
       double nhat[3] = {0.0, 0.0, 1.0};
       d = util_q4_distance_to_tangent_plane(abc, q, nhat);
-      assert(fabs(d - c) < DBL_EPSILON);
+      ifail = !(fabs(d - c) < DBL_EPSILON);
+      assert(ifail == 0);
     }
 
     /* y-z plane */
@@ -899,7 +900,8 @@ int test_util_q4_distance_to_tangent_plane(void) {
       double d = -1.0;
       double nhat[3] = {1.0, 0.0, 0.0};
       d = util_q4_distance_to_tangent_plane(abc, q, nhat);
-      assert(fabs(d - a) < DBL_EPSILON);
+      ifail = !(fabs(d - a) < DBL_EPSILON);
+      assert(ifail == 0);
     }
 
     /* x-z plane */
@@ -907,7 +909,8 @@ int test_util_q4_distance_to_tangent_plane(void) {
       double d = -1.0;
       double nhat[3] = {0.0, 1.0, 0.0};
       d = util_q4_distance_to_tangent_plane(abc, q, nhat);
-      assert(fabs(d - b) < DBL_EPSILON);
+      ifail = !(fabs(d - b) < DBL_EPSILON);
+      assert(ifail == 0);
     }
   }
 
@@ -927,7 +930,8 @@ int test_util_q4_distance_to_tangent_plane(void) {
       double d = -1.0;
       double nhat[3] = {0.0, 0.0, -1.0};
       d = util_q4_distance_to_tangent_plane(abc, q, nhat);
-      assert(fabs(d - c) < FLT_EPSILON);
+      ifail = !(fabs(d - c) < FLT_EPSILON);
+      assert(ifail == 0);
     }
     /* y-z plane */
     {
@@ -935,7 +939,8 @@ int test_util_q4_distance_to_tangent_plane(void) {
       double d0 = a*a*cos(phi)*cos(phi) + b*b*sin(phi)*sin(phi);
       double nhat[3] = {-1.0, 0.0, 0.0};
       d = util_q4_distance_to_tangent_plane(abc, q, nhat);
-      assert(fabs(d - sqrt(d0)) < FLT_EPSILON);
+      ifail = !(fabs(d - sqrt(d0)) < FLT_EPSILON);
+      assert(ifail == 0);
     }
   }
 
@@ -955,7 +960,8 @@ int test_util_q4_distance_to_tangent_plane(void) {
       double d = -1.0;
       double nhat[3] = {0.0, 0.0, -1.0};
       d = util_q4_distance_to_tangent_plane(abc, q, nhat);
-      assert(fabs(d - c) < FLT_EPSILON);
+      ifail = !(fabs(d - c) < FLT_EPSILON);
+      assert(ifail == 0);
     }
     /* y-z plane */
     {
@@ -963,7 +969,8 @@ int test_util_q4_distance_to_tangent_plane(void) {
       double d0 = a*a*cos(phi)*cos(phi) + b*b*sin(phi)*sin(phi);
       double nhat[3] = {-1.0, 0.0, 0.0};
       d = util_q4_distance_to_tangent_plane(abc, q, nhat);
-      assert(fabs(d - sqrt(d0)) < FLT_EPSILON);
+      ifail = !(fabs(d - sqrt(d0)) < FLT_EPSILON);
+      assert(ifail == 0);
     }
   }
 
@@ -982,7 +989,8 @@ int test_util_q4_distance_to_tangent_plane(void) {
       double d = -1.0;
       double nhat[3] = {+1.0, 0.0, 0.0};
       d = util_q4_distance_to_tangent_plane(abc, q, nhat);
-      assert(fabs(d - b) < FLT_EPSILON);
+      ifail = !(fabs(d - b) < FLT_EPSILON);
+      assert(ifail == 0);
     }
   }
 
@@ -1001,7 +1009,8 @@ int test_util_q4_distance_to_tangent_plane(void) {
       double d = -1.0;
       double nhat[3] = {+1.0, 0.0, 0.0};
       d = util_q4_distance_to_tangent_plane(abc, q, nhat);
-      assert(fabs(d - c) < FLT_EPSILON);
+      ifail = !(fabs(d - c) < FLT_EPSILON);
+      assert(ifail == 0);
     }
   }
 
diff --git a/tests/unit/tests.c b/tests/unit/tests.c
index 7879d2039..5157a2f10 100644
--- a/tests/unit/tests.c
+++ b/tests/unit/tests.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2024 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
@@ -93,7 +93,6 @@ __host__ int tests_create(int argc, char ** argv) {
   test_fe_force_method_rt_suite();
   test_field_suite();
   test_field_grad_suite();
-  test_halo_suite();
   test_hydro_options_suite();
   test_hydro_suite();
   test_interaction_suite();
@@ -103,6 +102,9 @@ __host__ int tests_create(int argc, char ** argv) {
   test_lb_d3q19_suite();
   test_lb_d3q27_suite();
   test_lb_model_suite();
+
+  test_lb_data_suite();
+
   test_lb_bc_inflow_opts_suite();
   test_lb_bc_inflow_rhou_suite();
   test_lb_bc_outflow_opts_suite();
@@ -115,8 +117,6 @@ __host__ int tests_create(int argc, char ** argv) {
   test_map_suite();
   test_map_init_suite();
 
-  test_model_suite();
-
   /* Noise tests */
   test_noise_options_suite();
   test_noise_suite();
diff --git a/tests/unit/tests.h b/tests/unit/tests.h
index 7b1b9922b..1cca36420 100644
--- a/tests/unit/tests.h
+++ b/tests/unit/tests.h
@@ -5,7 +5,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2024 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
@@ -53,7 +53,6 @@ int test_fe_force_method_rt_suite(void);
 int test_field_suite(void);
 int test_field_grad_suite(void);
 int test_gradient_d3q27_suite(void);
-int test_halo_suite(void);
 int test_hydro_options_suite(void);
 int test_hydro_suite(void);
 int test_interaction_suite(void);
@@ -72,6 +71,7 @@ int test_lb_d3q15_suite(void);
 int test_lb_d3q19_suite(void);
 int test_lb_d3q27_suite(void);
 int test_lb_model_suite(void);
+int test_lb_data_suite(void);
 int test_lb_bc_inflow_opts_suite(void);
 int test_lb_bc_inflow_rhou_suite(void);
 int test_lb_bc_outflow_opts_suite(void);
@@ -85,7 +85,6 @@ int test_lubrication_suite(void);
 int test_map_options_suite(void);
 int test_map_suite(void);
 int test_map_init_suite(void);
-int test_model_suite(void);
 int test_nernst_planck_suite(void);
 
 int test_noise_options_suite(void);

From 4847c0574215e9ae64caf9b3962615f96e5b8bca Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@epcc.ed.ac.uk>
Date: Sun, 9 Feb 2025 18:53:55 +0000
Subject: [PATCH 119/133] Merge development

---
 src/lb_data.c             | 54 +++++++++++++++------------------------
 tests/unit/test_lb_data.c |  2 --
 2 files changed, 20 insertions(+), 36 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 3963deccb..02bfdfb8b 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -47,33 +47,17 @@ int halo_free_device_model(lb_halo_t *h);
 int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h);
 int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h);
 
-static __constant__ lb_collide_param_t static_param;
-
-#ifdef HAVE_OPENMPI_
-/* This provides MPIX_CUDA_AWARE_SUPPORT .. */
-#include "mpi-ext.h"
-#endif
-
-#ifdef __NVCC__
-/* There are two file-scope switches here, which need to be generalised
- * via some suitable interface; they are separate, but both relate to
- * GPU execution. */
-static const int have_graph_api_ = 1;
-#else
-static const int have_graph_api_ = 0;
-#endif
-
-#if defined (MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
-static const int have_gpu_aware_mpi_ = 1;
-#else
-static const int have_gpu_aware_mpi_ = 0;
-#endif
-
 int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme);
 int lb_halo_post(lb_t * lb, lb_halo_t * h);
 int lb_halo_wait(lb_t * lb, lb_halo_t * h);
 int lb_halo_free(lb_t * lb, lb_halo_t * h);
 
+static __constant__ lb_collide_param_t static_param;
+
+/* We have a switch to the CUDA graph API, which is going to get switched
+ * on if ndevice > 0. We may remove the non-graph option in future. */
+static int use_graph_api_ = 0;
+
 /*****************************************************************************
  *
  *  lb_data_create
@@ -215,7 +199,8 @@ int lb_data_create(pe_t * pe, cs_t * cs, const lb_data_options_t * options,
 
   /* Lees Edwards */
   {
-    int nplane = cs->leopts.nplanes;
+    /* Local number of planes */
+    int nplane = cs->leopts.nplanes/cs->param->mpi_cartsz[X];
 
     if (nplane > 0) {
 
@@ -1401,6 +1386,7 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
     h->target = h;
   }
   else {
+    use_graph_api_ = 1; /* Always */
     tdpAssert( tdpMalloc((void **) &h->target, sizeof(lb_halo_t)) );
     tdpAssert( tdpMemset(h->target, 0, sizeof(lb_halo_t)));
     tdpAssert( tdpMemcpy(h->target, h, sizeof(lb_halo_t),
@@ -1422,7 +1408,7 @@ int lb_halo_create(const lb_t * lb, lb_halo_t * h, lb_halo_enum_t scheme) {
 
     halo_initialise_device_model(h);
 
-    if (have_graph_api_) {
+    if (use_graph_api_) {
       lb_graph_halo_send_create(lb, h);
       lb_graph_halo_recv_create(lb, h);
     }
@@ -1466,7 +1452,7 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
       int k = 1 + h->map.cv[h->map.nvel-ireq][Z];
       int mcount = h->count[ireq]*lb_halo_size(h->rlim[ireq]);
       double * buf = h->recv[ireq];
-      if (have_gpu_aware_mpi_) buf = h->recv_d[ireq];
+      if (have_gpu_aware_mpi_()) buf = h->recv_d[ireq];
 
       if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) continue;
       
@@ -1485,7 +1471,7 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
   int ndevice;
   tdpGetDeviceCount(&ndevice);
   if (ndevice > 0) {
-    if (have_graph_api_) {
+    if (use_graph_api_) {
       tdpAssert( tdpGraphLaunch(h->gsend.exec, h->stream) );
       tdpAssert( tdpStreamSynchronize(h->stream) );
     } else {
@@ -1497,7 +1483,7 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
           tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
           tdpAssert( tdpDeviceSynchronize());
  
-          if (!have_gpu_aware_mpi_) {
+          if (!have_gpu_aware_mpi_()) {
             tdpAssert( tdpMemcpy(h->send[ireq], h->send_d[ireq], sizeof(double)*scount, tdpMemcpyDeviceToHost));
           }
         }
@@ -1526,7 +1512,7 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
       int k = 1 + h->map.cv[ireq][Z];
       int mcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
       double * buf = h->send[ireq];
-      if (have_gpu_aware_mpi_) buf = h->send_d[ireq];
+      if (have_gpu_aware_mpi_()) buf = h->send_d[ireq];
 
       /* Short circuit messages to self. */
       if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) continue;
@@ -1563,14 +1549,14 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
   int ndevice;
   tdpGetDeviceCount(&ndevice);
   if (ndevice > 0) {
-    if (have_graph_api_) {
+    if (use_graph_api_) {
       tdpAssert( tdpGraphLaunch(h->grecv.exec, h->stream) );
       tdpAssert( tdpStreamSynchronize(h->stream) );
     } else {
       for (int ireq = 0; ireq < h->map.nvel; ireq++) {
         if (h->count[ireq] > 0) {
           int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
-          if (!have_gpu_aware_mpi_) {
+          if (!have_gpu_aware_mpi_()) {
             tdpAssert( tdpMemcpy(h->recv[ireq], h->recv_d[ireq], sizeof(double)*rcount, tdpMemcpyDeviceToHost));
           }
           dim3 nblk, ntpb;
@@ -1629,7 +1615,7 @@ int lb_halo_free(lb_t * lb, lb_halo_t * h) {
     free(h->recv[ireq]);
   }
 
-  if (have_graph_api_) {
+  if (use_graph_api_) {
     tdpAssert( tdpGraphDestroy(h->gsend.graph) );
     tdpAssert( tdpGraphDestroy(h->grecv.graph) );
   }
@@ -1945,7 +1931,7 @@ int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h) {
     tdpAssert( tdpGraphAddKernelNode(&kernelNode, h->gsend.graph, NULL, 0,
 				     &kernelNodeParams) );
 
-    if (have_gpu_aware_mpi_) {
+    if (have_gpu_aware_mpi_()) {
       /* Don't need explicit device -> host copy */
     }
     else {
@@ -2002,7 +1988,7 @@ int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h) {
     if (h->count[ireq] == 0) continue;
     tdpGraphNode_t memcpyNode = {0};
 
-    if (have_gpu_aware_mpi_) {
+    if (have_gpu_aware_mpi_()) {
       /* Don't need explicit copies */
     }
     else {
@@ -2050,7 +2036,7 @@ int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h) {
     kernelNodeParams.kernelParams   = (void **) kernelArgs;
     kernelNodeParams.extra          = NULL;
 
-    if (have_gpu_aware_mpi_) {
+    if (have_gpu_aware_mpi_()) {
       tdpAssert( tdpGraphAddKernelNode(&node, h->grecv.graph, NULL,
 				       0, &kernelNodeParams) );
     }
diff --git a/tests/unit/test_lb_data.c b/tests/unit/test_lb_data.c
index cf1ca03b3..32cc36089 100644
--- a/tests/unit/test_lb_data.c
+++ b/tests/unit/test_lb_data.c
@@ -196,8 +196,6 @@ int test_lb_data_create(pe_t * pe, cs_t * cs, const lb_data_options_t * opts) {
   assert(lb->nrelax     == LB_RELAXATION_M10); /* Default */
   assert(lb->haloscheme == LB_HALO_FULL);      /* Default */
 
-  /* FIXME: It looks like the total number of planes are allocated on
-     all ranks. Should be planetotal/cartsz[X] */
   if (cs->leopts.nplanes > 0) {
     assert(lb->sbuff);
     assert(lb->rbuff);

From 77cd7a34efff5bb9b5566d19a80a8f032874452a Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@epcc.ed.ac.uk>
Date: Sun, 9 Feb 2025 19:03:28 +0000
Subject: [PATCH 120/133] Correct logic

---
 tests/unit/test_lb_data.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_lb_data.c b/tests/unit/test_lb_data.c
index 32cc36089..c36a1981c 100644
--- a/tests/unit/test_lb_data.c
+++ b/tests/unit/test_lb_data.c
@@ -277,7 +277,7 @@ int test_lb_f_set(pe_t * pe, cs_t * cs, const lb_data_options_t * opts) {
       for (int p = 0; p < lb->nvel; p++) {
 	int iaddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
 	double f = 1.0*((1 + n)*lb->nvel + 1 + p);
-	if (fabs(lb->f[iaddr] - f) < DBL_EPSILON) ifail = -1;
+	ifail = !(fabs(lb->f[iaddr] - f) < DBL_EPSILON);
 	assert(ifail == 0);
       }
     }

From 43fdce09be339e8a9cdcef1d8bd0409b97766b38 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Sun, 16 Feb 2025 13:04:18 +0000
Subject: [PATCH 121/133] Correction to position of ()

---
 tests/unit/test_lb_data.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_lb_data.c b/tests/unit/test_lb_data.c
index c36a1981c..a2f7c2a1b 100644
--- a/tests/unit/test_lb_data.c
+++ b/tests/unit/test_lb_data.c
@@ -238,7 +238,7 @@ int test_lb_f(pe_t * pe, cs_t * cs, const lb_data_options_t * opts) {
       for (int p = 0; p < lb->nvel; p++) {
 	double f = 0.0;
 	lb_f(lb, index, p, (n == 0) ? LB_RHO : LB_PHI, &f);
-	assert(fabs(f - 1.0*((1 + n)*lb->nvel + 1 + p) < DBL_EPSILON));
+	assert(fabs(f - 1.0*((1 + n)*lb->nvel + 1 + p)) < DBL_EPSILON);
       }
     }
   }

From 4f085496a047ed4f0c4e5528037b161e1fd91613 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Sun, 16 Feb 2025 13:10:06 +0000
Subject: [PATCH 122/133] Move to static inline

---
 src/lb_data.c | 146 ++++++-------------------------------------------
 src/lb_data.h | 149 +++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 138 insertions(+), 157 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 02bfdfb8b..3b374d027 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -248,9 +248,9 @@ int lb_data_create(pe_t * pe, cs_t * cs, const lb_data_options_t * options,
  *
  *****************************************************************************/
 
-__host__ int lb_free(lb_t * lb) {
+int lb_free(lb_t * lb) {
 
-  int ndevice;
+  int ndevice = 0;
 
   assert(lb);
 
@@ -475,7 +475,7 @@ int halo_free_device_model(lb_halo_t * h) {
  *
  *****************************************************************************/
 
-__host__ int lb_memcpy(lb_t * lb, tdpMemcpyKind flag) {
+int lb_memcpy(lb_t * lb, tdpMemcpyKind flag) {
 
   int ndevice;
   double * tmpf = NULL;
@@ -598,7 +598,7 @@ static int lb_init(lb_t * lb) {
  *
  *****************************************************************************/
 
-__host__ int lb_collide_param_commit(lb_t * lb) {
+int lb_collide_param_commit(lb_t * lb) {
 
   assert(lb);
 
@@ -656,7 +656,7 @@ static int lb_model_param_init(lb_t * lb) {
  *
  *****************************************************************************/
 
-__host__ int lb_init_rest_f(lb_t * lb, double rho0) {
+int lb_init_rest_f(lb_t * lb, double rho0) {
 
   int nlocal[3];
   int ic, jc, kc, index;
@@ -688,7 +688,7 @@ __host__ int lb_init_rest_f(lb_t * lb, double rho0) {
  *
  *****************************************************************************/
 
-__host__ void lb_data_touch_kernel(cs_limits_t lim, lb_t * lb) {
+void lb_data_touch_kernel(cs_limits_t lim, lb_t * lb) {
 
   int nx = 1 + lim.imax - lim.imin;
   int ny = 1 + lim.jmax - lim.jmin;
@@ -716,7 +716,13 @@ __host__ void lb_data_touch_kernel(cs_limits_t lim, lb_t * lb) {
   return;
 }
 
-__host__ int lb_data_touch(lb_t * lb) {
+/*****************************************************************************
+ *
+ *  lb_data_touch
+ *
+ *****************************************************************************/
+
+int lb_data_touch(lb_t * lb) {
 
   int nlocal[3] = {0};
 
@@ -745,7 +751,7 @@ __host__ int lb_data_touch(lb_t * lb) {
  *
  *****************************************************************************/
 
-__host__ int lb_halo(lb_t * lb) {
+int lb_halo(lb_t * lb) {
 
   assert(lb);
 
@@ -755,124 +761,6 @@ __host__ int lb_halo(lb_t * lb) {
   return 0;
 }
 
-/*****************************************************************************
- *
- *  lb_ndist
- *
- *  Return the number of distribution functions.
- *
- *****************************************************************************/
-
-__host__ __device__ int lb_ndist(lb_t * lb, int * ndist) {
-
-  assert(lb);
-  assert(ndist);
-
-  *ndist = lb->ndist;
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  lb_f
- *
- *  Get the distribution at site index, velocity p, distribution n.
- *
- *****************************************************************************/
-
-__host__ __device__
-int lb_f(lb_t * lb, int index, int p, int n, double * f) {
-
-  assert(lb);
-  assert(index >= 0 && index < lb->nsite);
-  assert(p >= 0 && p < lb->nvel);
-  assert(n >= 0 && n < lb->ndist);
-
-  *f = lb->f[LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p)];
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  lb_f_set
- *
- *  Set the distribution for site index, velocity p, distribution n.
- *
- *****************************************************************************/
-
-__host__ __device__
-int lb_f_set(lb_t * lb, int index, int p, int n, double fvalue) {
-
-  assert(lb);
-  assert(index >= 0 && index < lb->nsite);
-  assert(p >= 0 && p < lb->nvel);
-  assert(n >= 0 && n < lb->ndist);
-
-  lb->f[LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p)] = fvalue;
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  lb_0th_moment
- *
- *  Return the zeroth moment of the distribution (rho for n = 0).
- *
- *****************************************************************************/
-
-__host__ __device__
-int lb_0th_moment(lb_t * lb, int index, lb_dist_enum_t nd, double * rho) {
-
-  assert(lb);
-  assert(rho);
-  assert(index >= 0 && index < lb->nsite);
-  assert((int) nd < lb->ndist);
-
-  *rho = 0.0;
-
-  for (int p = 0; p < lb->nvel; p++) {
-    *rho += lb->f[LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, nd, p)];
-  }
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  lb_1st_moment
- *
- *  Return the first moment of the distribution p.
- *
- *****************************************************************************/
-
-__host__ __device__
-int lb_1st_moment(lb_t * lb, int index, lb_dist_enum_t nd, double g[3]) {
-
-  int p;
-  int n;
-
-  assert(lb);
-  assert(index >= 0 && index < lb->nsite);
-  assert((int) nd < lb->ndist);
-
-  /* Loop to 3 here to cover initialisation in D2Q9 (appears in momentum) */
-  for (n = 0; n < 3; n++) {
-    g[n] = 0.0;
-  }
-
-  for (p = 0; p < lb->model.nvel; p++) {
-    for (n = 0; n < lb->model.ndim; n++) {
-      g[n] += lb->model.cv[p][n]
-	*lb->f[LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, nd, p)];
-    }
-  }
-
-  return 0;
-}
-
 /*****************************************************************************
  *
  *  lb_2nd_moment
@@ -881,7 +769,6 @@ int lb_1st_moment(lb_t * lb, int index, lb_dist_enum_t nd, double g[3]) {
  *
  *****************************************************************************/
 
-__host__
 int lb_2nd_moment(lb_t * lb, int index, lb_dist_enum_t nd, double s[3][3]) {
 
   int p, ia, ib;
@@ -919,7 +806,6 @@ int lb_2nd_moment(lb_t * lb, int index, lb_dist_enum_t nd, double s[3][3]) {
  *
  *****************************************************************************/
 
-__host__
 int lb_1st_moment_equilib_set(lb_t * lb, int index, double rho, double u[3]) {
 
   int ia, ib, p;
@@ -1747,7 +1633,7 @@ int lb_read_buf_ascii(lb_t * lb, int index, const char * buf) {
  *
  *****************************************************************************/
 
-__host__ int lb_io_aggr_pack(const lb_t * lb, io_aggregator_t * aggr) {
+int lb_io_aggr_pack(const lb_t * lb, io_aggregator_t * aggr) {
 
   assert(lb);
   assert(aggr);
@@ -1782,7 +1668,7 @@ __host__ int lb_io_aggr_pack(const lb_t * lb, io_aggregator_t * aggr) {
  *
  *****************************************************************************/
 
-__host__ int lb_io_aggr_unpack(lb_t * lb, const io_aggregator_t * aggr) {
+int lb_io_aggr_unpack(lb_t * lb, const io_aggregator_t * aggr) {
 
   assert(lb);
   assert(aggr);
diff --git a/src/lb_data.h b/src/lb_data.h
index 137cdec4d..0405c71d3 100644
--- a/src/lb_data.h
+++ b/src/lb_data.h
@@ -159,33 +159,128 @@ enum {LB_TAU_BULK = 1 + NDIM + XX, LB_TAU_SHEAR = 1 + NDIM + XY};
 typedef enum lb_dist_enum_type{LB_RHO = 0, LB_PHI = 1} lb_dist_enum_t;
 typedef enum lb_mode_enum_type{LB_GHOST_ON = 0, LB_GHOST_OFF = 1} lb_mode_enum_t;
 
-__host__ int lb_data_create(pe_t * pe, cs_t * cs,
+int lb_data_create(pe_t * pe, cs_t * cs,
 			    const lb_data_options_t * opts, lb_t ** lb);
-__host__ int lb_free(lb_t * lb);
-__host__ int lb_memcpy(lb_t * lb, tdpMemcpyKind flag);
-__host__ int lb_collide_param_commit(lb_t * lb);
-__host__ int lb_halo(lb_t * lb);
-
-__host__ __device__ int lb_ndist(lb_t * lb, int * ndist);
-__host__ __device__ int lb_f(lb_t * lb, int index, int p, int n, double * f);
-__host__ __device__ int lb_f_set(lb_t * lb, int index, int p, int n, double f);
-__host__ __device__ int lb_0th_moment(lb_t * lb, int index, lb_dist_enum_t nd,
-				      double * rho);
-
-__host__ int lb_init_rest_f(lb_t * lb, double rho0);
-__host__ __device__ int lb_1st_moment(lb_t * lb, int index, lb_dist_enum_t nd, double g[3]);
-__host__ int lb_2nd_moment(lb_t * lb, int index, lb_dist_enum_t nd, double s[3][3]);
-__host__ int lb_1st_moment_equilib_set(lb_t * lb, int index, double rho, double u[3]);
-
-__host__ int lb_read_buf(lb_t * lb, int index, const char * buf);
-__host__ int lb_read_buf_ascii(lb_t * lb, int index, const char * buf);
-__host__ int lb_write_buf(const lb_t * lb, int index, char * buf);
-__host__ int lb_write_buf_ascii(const lb_t * lb, int index, char * buf);
-
-__host__ int lb_io_aggr_pack(const lb_t * lb, io_aggregator_t * aggr);
-__host__ int lb_io_aggr_unpack(lb_t * lb, const io_aggregator_t * aggr);
-
-__host__ int lb_io_write(lb_t * lb, int timestep, io_event_t * event);
-__host__ int lb_io_read(lb_t * lb, int timestep, io_event_t * event);
+int lb_free(lb_t * lb);
+int lb_memcpy(lb_t * lb, tdpMemcpyKind flag);
+int lb_collide_param_commit(lb_t * lb);
+int lb_halo(lb_t * lb);
+
+int lb_init_rest_f(lb_t * lb, double rho0);
+int lb_2nd_moment(lb_t * lb, int index, lb_dist_enum_t nd, double s[3][3]);
+int lb_1st_moment_equilib_set(lb_t * lb, int index, double rho, double u[3]);
+
+int lb_read_buf(lb_t * lb, int index, const char * buf);
+int lb_read_buf_ascii(lb_t * lb, int index, const char * buf);
+int lb_write_buf(const lb_t * lb, int index, char * buf);
+int lb_write_buf_ascii(const lb_t * lb, int index, char * buf);
+
+int lb_io_aggr_pack(const lb_t * lb, io_aggregator_t * aggr);
+int lb_io_aggr_unpack(lb_t * lb, const io_aggregator_t * aggr);
+
+int lb_io_write(lb_t * lb, int timestep, io_event_t * event);
+int lb_io_read(lb_t * lb, int timestep, io_event_t * event);
+
+/*****************************************************************************
+ *
+ *  __host__ __device__ static inline functions
+ *
+ *****************************************************************************/
+
+/*****************************************************************************
+ *
+ *  lb_f
+ *
+ *  Get the distribution at site index, velocity p, distribution n.
+ *
+ *****************************************************************************/
+
+__host__ __device__ static inline int lb_f(lb_t * lb, int index, int p, int n,
+					   double * f) {
+
+  assert(lb);
+  assert(index >= 0 && index < lb->nsite);
+  assert(p >= 0 && p < lb->nvel);
+  assert(n >= 0 && n < lb->ndist);
+
+  *f = lb->f[LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p)];
+
+  return 0;
+}
+
+/*****************************************************************************
+ *
+ *  lb_f_set
+ *
+ *  Set the distribution for site index, velocity p, distribution n.
+ *
+ *****************************************************************************/
+
+__host__ __device__ static inline int lb_f_set(lb_t * lb, int index, int p,
+					       int n, double fvalue) {
+  assert(lb);
+  assert(index >= 0 && index < lb->nsite);
+  assert(p >= 0 && p < lb->nvel);
+  assert(n >= 0 && n < lb->ndist);
+
+  lb->f[LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p)] = fvalue;
+
+  return 0;
+}
+
+/*****************************************************************************
+ *
+ *  lb_0th_moment
+ *
+ *  Return the zeroth moment of the distribution (rho for n = 0).
+ *
+ *****************************************************************************/
+
+__host__ __device__ static inline int lb_0th_moment(lb_t * lb, int index,
+						    lb_dist_enum_t nd,
+						    double * rho) {
+  assert(lb);
+  assert(rho);
+  assert(index >= 0 && index < lb->nsite);
+  assert((int) nd < lb->ndist);
+
+  *rho = 0.0;
+
+  for (int p = 0; p < lb->nvel; p++) {
+    *rho += lb->f[LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, nd, p)];
+  }
+
+  return 0;
+}
+
+/*****************************************************************************
+ *
+ *  lb_1st_moment
+ *
+ *  Return the first moment of the distribution p.
+ *
+ *****************************************************************************/
+
+__host__ __device__ static inline int lb_1st_moment(lb_t * lb, int index,
+						    lb_dist_enum_t nd,
+						    double g[3]) {
+  assert(lb);
+  assert(index >= 0 && index < lb->nsite);
+  assert((int) nd < lb->ndist);
+
+  /* Loop to 3 here to cover initialisation in D2Q9 (appears in momentum) */
+  for (int n = 0; n < 3; n++) {
+    g[n] = 0.0;
+  }
+
+  for (int p = 0; p < lb->model.nvel; p++) {
+    for (int n = 0; n < lb->model.ndim; n++) {
+      g[n] += lb->model.cv[p][n]
+	*lb->f[LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, nd, p)];
+    }
+  }
+
+  return 0;
+}
 
 #endif

From 13dd0fde31a964b0a37bf036fc0d75212044867f Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Sun, 16 Feb 2025 13:10:28 +0000
Subject: [PATCH 123/133] Remove old implementation

---
 src/model_le.c | 770 +++++--------------------------------------------
 1 file changed, 80 insertions(+), 690 deletions(-)

diff --git a/src/model_le.c b/src/model_le.c
index f22e2e150..ccbfd62d0 100644
--- a/src/model_le.c
+++ b/src/model_le.c
@@ -33,696 +33,6 @@
 #include "timer.h"
 #include "util.h"
 
-static int le_reproject(lb_t * lb, lees_edw_t * le);
-static int le_displace_and_interpolate(lb_t * lb, lees_edw_t * le);
-static int le_displace_and_interpolate_parallel(lb_t * lb, lees_edw_t * le);
-
-/*****************************************************************************
- *
- *  lb_le_apply_boundary_conditions
- *
- *  This is the driver to apply the LE conditions to the distributions
- *  (applied to the post-collision distributions). There are two
- *  stages:
- *
- *  1. a reprojection of distributions that will cross a plane in the
- *     upcoming propagation step.
- *  2. a displacement and interpolation of the reprojected distributions
- *     to take account of the sliding displacement as a function of time.
- *
- *  Note we never deal with the halo regions here, as we assume the
- *  upcoming propagation will be immediately preceded by a distribution
- *  halo update.
- *
- *****************************************************************************/
-
-__host__ int lb_le_apply_boundary_conditions(lb_t * lb, lees_edw_t * le) {
-
-  int mpi_cartsz[3];
-
-  assert(lb);
-  assert(le);
-
-  lees_edw_cartsz(le, mpi_cartsz);
-
-  if (lees_edw_nplane_local(le) > 0) {
-
-    TIMER_start(TIMER_LE);
-
-    /* Everything must be done on host at the moment (slowly) ... */
-    /* ... and copy back at the end */
-    lb_memcpy(lb, tdpMemcpyDeviceToHost);
-
-    le_reproject(lb, le);
-
-    if (mpi_cartsz[Y] > 1) {
-      le_displace_and_interpolate_parallel(lb, le);
-    }
-    else {
-      le_displace_and_interpolate(lb, le);
-    }
-
-    lb_memcpy(lb, tdpMemcpyHostToDevice);
-
-    TIMER_stop(TIMER_LE);
-  }
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  le_reproject
- *
- *  This is the reprojection of the post collision distributions to
- *  take account of the velocity jump at the planes.
- *
- *  We compute the moments, and then the change to the moments:
- *
- *     rho  -> rho (unchanged)
- *     g_a  -> g_a +/- rho u^le_a
- *     S_ab -> S_ab +/- rho u_a u^le_b +/- rho u_b u^le_a + rho u^le_a u^le_b
- *
- *  with analogous expressions for order parameter moments.
- *
- *  The change to the distribution is then computed by a reprojection.
- *  Ghost modes are unchanged.
- *
- *****************************************************************************/
-
-static int le_reproject(lb_t * lb, lees_edw_t * le) {
-
-  int    ic, jc, kc, index;
-  int    nplane, plane, side;
-  int    ia, ib;
-  int    nlocal[3];
-  int    n, ndist;
-  int8_t cx = 0;
-
-  double rho, ds[3][3], udotc, sdotq;
-  double g[3], du[3];
-  double fnew;
-  double t;
-  physics_t * phys = NULL;
-
-  assert(lb);
-  assert(le);
-
-  lb_ndist(lb, &ndist);
-  nplane = lees_edw_nplane_local(le);
-  physics_ref(&phys);
-
-  t = 1.0*physics_control_timestep(phys);
-  lees_edw_nlocal(le, nlocal);
-
-  for (plane = 0; plane < nplane; plane++) {
-    for (side = 0; side < 2; side++) {
-
-      du[X] = 0.0;
-      du[Y] = 0.0;
-      du[Z] = 0.0;
-
-      if (side == 0) {
-	/* Start with plane below Lees-Edwards BC */
-	lees_edw_plane_uy_now(le, t, &du[Y]);
-	du[Y] *= -1.0;
-	ic = lees_edw_plane_location(le, plane);
-	cx = +1;
-      }
-      else {
-	/* Finally, deal with plane above LEBC */
-	lees_edw_plane_uy_now(le, t, &du[Y]);
-	ic = lees_edw_plane_location(le, plane) + 1;
-	cx = -1;
-      }
-
-      for (jc = 1; jc <= nlocal[Y]; jc++) {
-	for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	  index = lees_edw_index(le, ic, jc, kc);
-
-	  for (n = 0; n < ndist; n++) {
-
-	    /* Compute 0th and 1st moments */
-	    lb_dist_enum_t ndn = (lb_dist_enum_t) n;
-	    lb_0th_moment(lb, index, ndn, &rho);
-	    lb_1st_moment(lb, index, ndn, g);
-
-	    for (ia = 0; ia < 3; ia++) {
-	      for (ib = 0; ib < 3; ib++) {
-		ds[ia][ib] = (g[ia]*du[ib] + du[ia]*g[ib] + rho*du[ia]*du[ib]);
-	      }
-	    }
-
-	    /* Now update the distribution */
-	    for (int p = 1; p < lb->model.nvel; p++) {
-
-	      double cs2 = lb->model.cs2;
-	      double rcs2 = 1.0/cs2;
-	      if (lb->model.cv[p][X] != cx) continue;
-
-	      udotc = du[Y]*lb->model.cv[p][Y];
-	      sdotq = 0.0;
-
-	      for (ia = 0; ia < 3; ia++) {
-		for (ib = 0; ib < 3; ib++) {
-		  double dab = cs2*(ia == ib);
-		  double q = (lb->model.cv[p][ia]*lb->model.cv[p][ib] - dab);
-		  sdotq += ds[ia][ib]*q;
-		}
-	      }
-
-	      /* Project all this back to the distribution. */
-
-	      lb_f(lb, index, p, n, &fnew);
-	      fnew += lb->model.wv[p]*(rho*udotc*rcs2 + 0.5*sdotq*rcs2*rcs2);
-	      lb_f_set(lb, index, p, n, fnew);
-	    }
-	  }
-	  /* next site */
-	}
-      }
-    }
-  }
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  le_displace_and_interpolate
- *
- *  For each side of each plane, work out the relevant displacement
- *  and do the necessary interpolation to get the modified plane-
- *  crossing distributions.
- *
- *****************************************************************************/
-
-int le_displace_and_interpolate(lb_t * lb, lees_edw_t * le) {
-
-  int    ic, jc, kc;
-  int    index0, index1;
-  int    nlocal[3];
-  int    n, nplane, plane;
-  int    jdy, j1, j2;
-  int    ndist;
-  int    nprop;
-  int    ndata;
-  int    nhalo;
-  double dy, fr;
-  double t;
-  double ltot[3];
-  double * recv_buff;
-  physics_t * phys = NULL;
-
-  assert(lb);
-  assert(le);
-
-  lees_edw_ltot(le, ltot);
-  lees_edw_nlocal(le, nlocal);
-  lees_edw_nhalo(le, &nhalo);
-  nplane = lees_edw_nplane_local(le);
-  physics_ref(&phys);
-
-  t = 1.0*physics_control_timestep(phys);
-
-  /* We need to interpolate into a temporary buffer to make sure we
-   * don't overwrite distributions taking part. The size is just
-   * determined by the size of the local domain, and the number
-   * of plane-crossing distributions. */
-
-  lb_ndist(lb, &ndist);
-
-  /* Allocate a buffer large enough for all cvp[][X] = +1 */
-
-  nprop = 0;
-  for (int p = 1; p < lb->model.nvel; p++) {
-    if (lb->model.cv[p][X] == +1) nprop += 1;
-  }
-
-  ndata = ndist*nprop*nlocal[Y]*nlocal[Z];
-  recv_buff = (double *) malloc(ndata*sizeof(double));
-  assert(recv_buff);
-  if (recv_buff == NULL) pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
-
-  for (plane = 0; plane < nplane; plane++) {
-
-    ic  = lees_edw_plane_location(le, plane);
-
-    lees_edw_buffer_displacement(le, nhalo, t, &dy);
-    dy  = fmod(dy, ltot[Y]);
-    jdy = floor(dy);
-    fr = dy - jdy;
-
-    ndata = 0;
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-
-      j1 = 1 + (jc + jdy - 1 + 2*nlocal[Y]) % nlocal[Y];
-      j2 = 1 + (j1 % nlocal[Y]);
-
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	index0 = lees_edw_index(le, ic, j1, kc);
-	index1 = lees_edw_index(le, ic, j2, kc);
-
-	/* xdisp_fwd_cv[0] identifies cv[p][X] = +1 */
-
-	for (n = 0; n < ndist; n++) {
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] != +1) continue;
-	    recv_buff[ndata++] = (1.0 - fr)*
-	      lb->f[LB_ADDR(lb->nsite,ndist,lb->model.nvel,index0,n, p)]
-	      + fr*
-	      lb->f[LB_ADDR(lb->nsite,ndist,lb->model.nvel,index1,n, p)];
-	  }
-	}
-	/* Next site */
-      }
-    }
-
-    /* ...and copy back ... */
-
-    ndata = 0;
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	index0 = lees_edw_index(le, ic, jc, kc);
-
-	for (n = 0; n < ndist; n++) {
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] != +1) continue;
-	    int la = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-	    lb->f[la] = recv_buff[ndata++];
-	  }
-	}
-	/* Next site */
-      }
-    }
-
-
-    /* OTHER DIRECTION */
-
-    ic  = lees_edw_plane_location(le, plane) + 1;
-
-    lees_edw_buffer_displacement(le, nhalo, t, &dy);
-    dy  = fmod(-dy, ltot[Y]);
-    jdy = floor(dy);
-    fr = dy - jdy;
-
-    ndata = 0;
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-
-      j1 = 1 + (jc + jdy - 1 + 2*nlocal[Y]) % nlocal[Y];
-      j2 = 1 + (j1 % nlocal[Y]) ;
-
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	index0 = lees_edw_index(le, ic, j1, kc);
-	index1 = lees_edw_index(le, ic, j2, kc);
-
-	for (n = 0; n < ndist; n++) {
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] == -1) {
-	      int l0 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index0, n, p);
-	      int l1 = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index1, n, p);
-	      recv_buff[ndata++] = (1.0 - fr)*lb->f[l0] + fr*lb->f[l1];
-	    }
-	  }
-	}
-	/* Next site */
-      }
-    }
-
-    /* ...and now overwrite... */
-
-    ndata = 0;
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	index0 = lees_edw_index(le, ic, jc, kc);
-
-	for (n = 0; n < ndist; n++) {
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] == -1) {
-	      int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel,index0,n,p);
-	      lb->f[ijkp] = recv_buff[ndata++];
-	    }
-	  }
-	}
-      }
-    }
-
-    /* Next plane */
-  }
-
-  free(recv_buff);
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  le_displace_and_interpolate_parallel
- *
- *  Here we need to communicate to be able to do the displacement of
- *  the buffers in the along-plane (Y-) direction.
- *
- *  Locally, we need to find interpolated values of the plane-crossing
- *  distributions for 1 <= jc <= nlocal[Y]. To do a linear interpolation
- *  everywhere, this requires (nlocal[Y] + 1) points displaced in the
- *  appropriate direction.
- *
- *  Likewise, we need to send a total of (nlocal[Y] + 1) points to the
- *  two corresponding receiving processes. Note we never involve the
- *  halo regions here (so a preceding halo exchange is not required).
- *
- *****************************************************************************/
-
-static int le_displace_and_interpolate_parallel(lb_t * lb, lees_edw_t * le) {
-
-  int ic, jc, kc;
-  int j1, j1mod;
-  int jdy;
-  int n1, n2;
-  int ndata, ndata1, ndata2;
-  int nhalo;
-  int ind0, ind1, ind2, index;
-  int n, nplane, plane;
-  int ntotal[3];
-  int nlocal[3];
-  int offset[3];
-  int nrank_s[3], nrank_r[3];
-  int nprop;
-  int ndist;
-
-  const int tag1 = 3102;
-  const int tag2 = 3103;
-
-  double fr;
-  double dy;
-  double t;
-  double ltot[3];
-  double * send_buff;
-  double * recv_buff;
-
-  physics_t * phys = NULL;
-  MPI_Comm    comm;
-  MPI_Request req[4];
-  MPI_Status status[4];
-
-  assert(lb);
-  assert(le);
-
-  lees_edw_ltot(le, ltot);
-  lees_edw_ntotal(le, ntotal);
-  lees_edw_nlocal(le, nlocal);
-  lees_edw_nhalo(le, &nhalo);
-  lees_edw_nlocal_offset(le, offset);
-
-  nplane = lees_edw_nplane_local(le);
-  lees_edw_comm(le, &comm);
-
-  physics_ref(&phys);
-
-  t = 1.0*physics_control_timestep(phys);
-  lb_ndist(lb, &ndist);
-
-
-  nprop = 0;
-  for (int p = 1; p < lb->model.nvel; p++) {
-    if (lb->model.cv[p][X] == +1) nprop += 1;
-  }
-
-  ndata = ndist*nprop*nlocal[Y]*nlocal[Z];
-  send_buff = (double *) malloc(ndata*sizeof(double));
-  assert(send_buff);
-  if (send_buff == NULL) pe_fatal(lb->pe, "malloc(send_buff) failed\n");
-
-  ndata = ndist*nprop*(nlocal[Y] + 1)*nlocal[Z];
-  recv_buff = (double *) malloc(ndata*sizeof(double));
-  assert(recv_buff);
-  if (recv_buff == NULL) pe_fatal(lb->pe, "malloc(recv_buff) failed\n");
-
-  for (plane = 0; plane < nplane; plane++) {
-
-    ic  = lees_edw_plane_location(le, plane);
-
-    lees_edw_buffer_displacement(le, nhalo, t, &dy);
-    dy  = fmod(dy, ltot[Y]);
-    jdy = floor(dy);
-    fr  = dy - jdy;
-
-    /* Starting y coordinate is j1: 1 <= j1 <= ntotal[y] */
-
-    jc = offset[Y] + 1;
-    j1 = 1 + (jc + jdy - 1 + 2*ntotal[Y]) % ntotal[Y];
-    lees_edw_jstart_to_mpi_ranks(le, j1, nrank_s, nrank_r);
-
-    j1mod = 1 + (j1 - 1) % nlocal[Y];
-    n1 = (nlocal[Y] - j1mod + 1);
-    n2 = j1mod;
-
-    ndata1 = n1*nlocal[Z]*ndist*nprop;
-    ndata2 = n2*nlocal[Z]*ndist*nprop;
-
-    /* Post the receives */
-
-    MPI_Irecv(recv_buff, ndata1, MPI_DOUBLE, nrank_r[0], tag1, comm, req);
-    MPI_Irecv(recv_buff + ndata1, ndata2, MPI_DOUBLE, nrank_r[1], tag2,
-	      comm, req + 1);
-
-    /* Load the send buffer. Note that data at j1mod gets sent to both
-     * receivers, making up the total of (nlocal[Y] + 1) points */
-
-    ndata = 0;
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	/* cv[p][X] = +1 identified by disp_fwd[] */
-	index = lees_edw_index(le, ic, jc, kc);
-
-	for (n = 0; n < ndist; n++) {
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] != +1) continue;
-	    int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index, n, p);
-	    send_buff[ndata++] = lb->f[ijkp];
-	  }
-	}
-	/* Next site */
-      }
-    }
-
-    ndata = ndata2 - nlocal[Z]*ndist*nprop;
-
-    MPI_Issend(send_buff + ndata, ndata1, MPI_DOUBLE, nrank_s[0], tag1,
-	       comm, req + 2);
-    MPI_Issend(send_buff,         ndata2, MPI_DOUBLE, nrank_s[1], tag2,
-	       comm, req + 3);
-
-    /* Wait for the receives, and sort out the interpolated values */
-
-    MPI_Waitall(2, req, status);
-
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	index = lees_edw_index(le, ic, jc, kc);
-	ind0 = ndist*nprop*((jc-1)*nlocal[Z] + (kc-1));
-
-	for (n = 0; n < ndist; n++) {
-	  ind1 = ind0 + n*nprop;
-	  ind2 = ind0 + ndist*nprop*nlocal[Z] + n*nprop;
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] != +1) continue;
-	    int ijk = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index, n, p);
-	    lb->f[ijk] = (1.0-fr)*recv_buff[ind1++] + fr*recv_buff[ind2++];
-	  }
-	}
-	/* Next site */
-      }
-    }
-
-    /* Finish the sends */
-    MPI_Waitall(2, req + 2, status);
-
-
-
-    /* NOW THE OTHER DIRECTION */
-
-    ic  = lees_edw_plane_location(le, plane) + 1;
-
-    lees_edw_buffer_displacement(le, nhalo, t, &dy);
-    dy  = fmod(-dy, ltot[Y]);
-    jdy = floor(dy);
-    fr  = dy - jdy;
-
-    /* Starting y coordinate (global address): range 1 <= j1 <= ntotal[Y] */
-
-    jc = offset[Y] + 1;
-    j1 = 1 + (jc + jdy - 1 + 2*ntotal[Y]) % ntotal[Y];
-    lees_edw_jstart_to_mpi_ranks(le, j1, nrank_s, nrank_r);
-
-    j1mod = 1 + (j1 - 1) % nlocal[Y];
-    n1 = (nlocal[Y] - j1mod + 1);
-    n2 = j1mod;
-
-    ndata1 = n1*nlocal[Z]*ndist*nprop;
-    ndata2 = n2*nlocal[Z]*ndist*nprop;
-
-    /* Post the receives */
-
-    MPI_Irecv(recv_buff, ndata1, MPI_DOUBLE, nrank_r[0], tag1, comm, req);
-    MPI_Irecv(recv_buff + ndata1, ndata2, MPI_DOUBLE, nrank_r[1], tag2,
-	      comm, req + 1);
-
-    /* Load the send buffer. Note that data at j1mod gets sent to both
-     * receivers, making up the total of (nlocal[Y] + 1) points */
-
-    ndata = 0;
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	/* cv[p][X] = -1 identified by disp_bwd[] */
-	index = lees_edw_index(le, ic, jc, kc);
-
-	for (n = 0; n < ndist; n++) {
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] != -1) continue;
-	    int ijkp = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index, n, p);
-	    send_buff[ndata++] = lb->f[ijkp];
-	  }
-	}
-	/* Next site */
-      }
-    }
-
-    ndata = ndata2 - nlocal[Z]*ndist*nprop;
-
-    MPI_Issend(send_buff + ndata, ndata1, MPI_DOUBLE, nrank_s[0], tag1,
-	       comm, req + 2);
-    MPI_Issend(send_buff,         ndata2, MPI_DOUBLE, nrank_s[1], tag2,
-	       comm, req + 3);
-
-    /* Wait for the receives, and interpolate from the buffer */
-
-    MPI_Waitall(2, req, status);
-
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	index = lees_edw_index(le, ic, jc, kc);
-	ind0 = ndist*nprop*((jc-1)*nlocal[Z] + (kc-1));
-
-	for (n = 0; n < ndist; n++) {
-	  ind1 = ind0 + n*nprop;
-	  ind2 = ind0 + ndist*nprop*nlocal[Z] + n*nprop;
-	  for (int p = 1; p < lb->model.nvel; p++) {
-	    if (lb->model.cv[p][X] != -1) continue;
-	    int ijk = LB_ADDR(lb->nsite, ndist, lb->model.nvel, index, n, p);
-	    lb->f[ijk] = (1.0-fr)*recv_buff[ind1++] + fr*recv_buff[ind2++];
-	  }
-	}
-	/* Next site */
-      }
-    }
-
-    /* Mop up the sends */
-    MPI_Waitall(2, req + 2, status);
-  }
-
-  free(send_buff);
-  free(recv_buff);
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  model_le_init_shear_profile
- *
- *  Initialise the distributions to be consistent with a steady-state
- *  linear shear profile, consistent with plane velocity.
- *
- *****************************************************************************/
-
-int lb_le_init_shear_profile(lb_t * lb, lees_edw_t * le) {
-
-  int ic, jc, kc, index;
-  int i, j, p;
-  int nlocal[3];
-  double rho0, u[NDIM], gradu[NDIM][NDIM];
-  double eta;
-
-  physics_t * phys = NULL;
-
-  assert(lb);
-  assert(le);
-
-  pe_info(lb->pe, "Initialising shear profile\n");
-
-  /* Initialise the density, velocity, gradu; ghost modes are zero */
-
-  physics_ref(&phys);
-  physics_rho0(phys, &rho0);
-  physics_eta_shear(phys, &eta);
-
-  lees_edw_nlocal(le, nlocal);
-
-  for (i = 0; i< lb->model.ndim; i++) {
-    u[i] = 0.0;
-    for (j = 0; j < lb->model.ndim; j++) {
-      gradu[i][j] = 0.0;
-    }
-  }
-
-  lees_edw_shear_rate(le, &gradu[X][Y]);
-
-  /* Loop through the sites */
-
-  for (ic = 1; ic <= nlocal[X]; ic++) {
-
-    lees_edw_steady_uy(le, ic, &u[Y]);
-
-    /* We can now project the physical quantities to the distribution */
-
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-
-	index = lees_edw_index(le, ic, jc, kc);
-
-	for (p = 0; p < lb->model.nvel; p++) {
-	  double f = 0.0;
-	  double cdotu = 0.0;
-	  double sdotq = 0.0;
-	  double cs2 = lb->model.cs2;
-	  double rcs2 = 1.0/cs2;
-
-	  for (i = 0; i < lb->model.ndim; i++) {
-	    cdotu += lb->model.cv[p][i]*u[i];
-	    for (j = 0; j < lb->model.ndim; j++) {
-	      double dij = (i == j);
-	      double qij = lb->model.cv[p][i]*lb->model.cv[p][j] - cs2*dij;
-	      sdotq += (rho0*u[i]*u[j] - eta*gradu[i][j])*qij;
-	    }
-	  }
-	  f = lb->model.wv[p]*(rho0 + rcs2*rho0*cdotu + 0.5*rcs2*rcs2*sdotq);
-	  lb_f_set(lb, index, p, 0, f);
-	}
-	/* Next site */
-      }
-    }
-  }
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  The preceding functions are scheduled for removal.
- *  The following are operational...
- *
- *****************************************************************************/
-
 /* Kernel helper structure intended to be passed by value to kernel */
 
 typedef struct lek_s {
@@ -1322,3 +632,83 @@ __global__ void lb_data_interpolate_kernel(kernel_3d_t k3d,
 
   return;
 }
+
+/*****************************************************************************
+ *
+ *  lb_le_init_shear_profile
+ *
+ *  Initialise the distributions to be consistent with a steady-state
+ *  linear shear profile, consistent with plane velocity.
+ *
+ *****************************************************************************/
+
+int lb_le_init_shear_profile(lb_t * lb, lees_edw_t * le) {
+
+  int ic, jc, kc, index;
+  int i, j, p;
+  int nlocal[3];
+  double rho0, u[NDIM], gradu[NDIM][NDIM];
+  double eta;
+
+  physics_t * phys = NULL;
+
+  assert(lb);
+  assert(le);
+
+  pe_info(lb->pe, "Initialising shear profile\n");
+
+  /* Initialise the density, velocity, gradu; ghost modes are zero */
+
+  physics_ref(&phys);
+  physics_rho0(phys, &rho0);
+  physics_eta_shear(phys, &eta);
+
+  lees_edw_nlocal(le, nlocal);
+
+  for (i = 0; i< lb->model.ndim; i++) {
+    u[i] = 0.0;
+    for (j = 0; j < lb->model.ndim; j++) {
+      gradu[i][j] = 0.0;
+    }
+  }
+
+  lees_edw_shear_rate(le, &gradu[X][Y]);
+
+  /* Loop through the sites */
+
+  for (ic = 1; ic <= nlocal[X]; ic++) {
+
+    lees_edw_steady_uy(le, ic, &u[Y]);
+
+    /* We can now project the physical quantities to the distribution */
+
+    for (jc = 1; jc <= nlocal[Y]; jc++) {
+      for (kc = 1; kc <= nlocal[Z]; kc++) {
+
+	index = lees_edw_index(le, ic, jc, kc);
+
+	for (p = 0; p < lb->model.nvel; p++) {
+	  double f = 0.0;
+	  double cdotu = 0.0;
+	  double sdotq = 0.0;
+	  double cs2 = lb->model.cs2;
+	  double rcs2 = 1.0/cs2;
+
+	  for (i = 0; i < lb->model.ndim; i++) {
+	    cdotu += lb->model.cv[p][i]*u[i];
+	    for (j = 0; j < lb->model.ndim; j++) {
+	      double dij = (i == j);
+	      double qij = lb->model.cv[p][i]*lb->model.cv[p][j] - cs2*dij;
+	      sdotq += (rho0*u[i]*u[j] - eta*gradu[i][j])*qij;
+	    }
+	  }
+	  f = lb->model.wv[p]*(rho0 + rcs2*rho0*cdotu + 0.5*rcs2*rcs2*sdotq);
+	  lb_f_set(lb, index, p, 0, f);
+	}
+	/* Next site */
+      }
+    }
+  }
+
+  return 0;
+}

From a6d8dce596846b77ac7a36cad785c486c9e762eb Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Sun, 16 Feb 2025 13:11:19 +0000
Subject: [PATCH 124/133] Replace ndist function

---
 src/bbl.c       |  2 +-
 src/build.c     | 10 +++-------
 src/collision.c | 17 ++++++++---------
 src/ludwig.c    | 17 +++++------------
 4 files changed, 17 insertions(+), 29 deletions(-)

diff --git a/src/bbl.c b/src/bbl.c
index efead4a64..cd393d43e 100644
--- a/src/bbl.c
+++ b/src/bbl.c
@@ -108,7 +108,7 @@ int bbl_create(pe_t * pe, cs_t * cs, lb_t * lb, bbl_t ** pobj) {
   bbl->pe = pe;
   bbl->cs = cs;
   bbl->ellipsoid_didt = BBL_ELLIPSOID_UPDATE_QUATERNION;
-  lb_ndist(lb, &bbl->ndist);
+  bbl->ndist = lb->ndist;
 
   /* I would like to obtain the viscosity from the lb data structure;
    * but it's not present at initialisation, so ... */
diff --git a/src/build.c b/src/build.c
index 4a8fb5560..575446000 100644
--- a/src/build.c
+++ b/src/build.c
@@ -9,7 +9,7 @@
  *  Edinburgh Soft Matter and Statisitical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2006-2023 The University of Edinburgh
+ *  (c) 2006-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -797,7 +797,6 @@ static int build_remove_fluid(lb_t * lb, int index, colloid_t * p_colloid) {
 
 static int build_remove_order_parameter(lb_t * lb, field_t * f, int index,
 					colloid_t * pc) {
-  int ndist;
   double phi;
   double phi0;
   physics_t * phys = NULL;
@@ -808,9 +807,8 @@ static int build_remove_order_parameter(lb_t * lb, field_t * f, int index,
 
   physics_ref(&phys);
   physics_phi0(phys, &phi0);
-  lb_ndist(lb, &ndist);
 
-  if (ndist == 2) {
+  if (lb->ndist == 2) {
     lb_0th_moment(lb, index, LB_PHI, &phi);
   }
   else {
@@ -1029,7 +1027,6 @@ static int build_replace_order_parameter(fe_t * fe, lb_t * lb,
   int status;
   int ri[3];
   int nf;
-  int ndist;
   int nweight;
 
   double g;
@@ -1044,7 +1041,6 @@ static int build_replace_order_parameter(fe_t * fe, lb_t * lb,
 
   assert(map);
   assert(lb);
-  lb_ndist(lb, &ndist);
 
   field_nf(f, &nf);
   assert(nf <= NQAB);
@@ -1060,7 +1056,7 @@ static int build_replace_order_parameter(fe_t * fe, lb_t * lb,
     newg[p] = 0.0;
   }
 
-  if (ndist == 2) {
+  if (lb->ndist == 2) {
 
     /* Reset the distribution (distribution index 1) */
 
diff --git a/src/collision.c b/src/collision.c
index 54acad806..d620f9595 100644
--- a/src/collision.c
+++ b/src/collision.c
@@ -13,7 +13,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2011-2024 The University of Edinburgh
+ *  (c) 2011-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *    Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -54,7 +54,7 @@ static __host__ __device__
 void lb_collision_fluctuations(lb_t * lb, noise_t * noise, int index,
 			       double kt,
 			       double shat[3][3], double ghat[NVEL]);
-int lb_collision_noise_var_set(lb_t * lb, noise_t * noise);
+int lb_collision_noise_var_set(lb_t * lb);
 static __host__ int lb_collision_parameters_commit(lb_t * lb, visc_t * visc);
 
 static __device__
@@ -144,20 +144,19 @@ __host__
 int lb_collide(lb_t * lb, hydro_t * hydro, map_t * map, noise_t * noise,
 	       fe_t * fe, visc_t * visc) {
 
-  int ndist;
-
   if (hydro == NULL) return 0;
 
   assert(lb);
   assert(map);
 
-  lb_ndist(lb, &ndist);
   lb_collision_relaxation_times_set(lb);
-  lb_collision_noise_var_set(lb, noise);
+  lb_collision_noise_var_set(lb);
   lb_collide_param_commit(lb);
 
-  if (ndist == 1) lb_collision_mrt(lb, hydro, map, noise, fe, visc);
-  if (ndist == 2) lb_collision_binary(lb, hydro, noise, (fe_symm_t *) fe, visc);
+  if (lb->ndist == 1) lb_collision_mrt(lb, hydro, map, noise, fe, visc);
+  if (lb->ndist == 2) {
+    lb_collision_binary(lb, hydro, noise, (fe_symm_t *) fe, visc);
+  }
 
   return 0;
 }
@@ -1552,7 +1551,7 @@ __host__ __device__ int lb_nrelax_valid(lb_relaxation_enum_t nrelax) {
  *
  *****************************************************************************/
 
-__host__ int lb_collision_noise_var_set(lb_t * lb, noise_t * noise) {
+__host__ int lb_collision_noise_var_set(lb_t * lb) {
 
   int p;
   double kt;
diff --git a/src/ludwig.c b/src/ludwig.c
index 4f7fbce31..57874958f 100644
--- a/src/ludwig.c
+++ b/src/ludwig.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2011-2024 The University of Edinburgh
+ *  (c) 2011-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -392,8 +392,7 @@ static int ludwig_rt(ludwig_t * ludwig) {
   if (ntstep == 0) {
     if (nstat) stats_sigma_create(pe, cs, ludwig->fe_symm, ludwig->phi,
 				  &ludwig->stat_sigma);
-    lb_ndist(ludwig->lb, &n);
-    if (n == 2) phi_lb_from_field(ludwig->phi, ludwig->lb);
+    if (ludwig->lb->ndist == 2) phi_lb_from_field(ludwig->phi, ludwig->lb);
   }
 
   /* Initial Q_ab field required */
@@ -542,10 +541,7 @@ void ludwig_run(const char * inputfile) {
 
     /* if symmetric_lb store phi to field */
 
-
-    lb_ndist(ludwig->lb, &im);
-
-    if (im == 2) phi_lb_to_field(ludwig->phi, ludwig->lb);
+    if (ludwig->lb->ndist == 2) phi_lb_to_field(ludwig->phi, ludwig->lb);
 
     if (ludwig->phi) {
 
@@ -677,8 +673,7 @@ void ludwig_run(const char * inputfile) {
 
     /* order parameter dynamics (not if symmetric_lb) */
 
-    lb_ndist(ludwig->lb, &im);
-    if (im == 2) {
+    if (ludwig->lb->ndist == 2) {
       /* dynamics are dealt with at the collision stage (below) */
     }
     else {
@@ -2135,7 +2130,6 @@ static int ludwig_colloids_update_low_freq(ludwig_t * ludwig) {
 
 int ludwig_colloids_update(ludwig_t * ludwig) {
 
-  int ndist;
   int ndevice;
   int ncolloid;
   int iconserve;         /* switch for finite-difference conservation */
@@ -2147,8 +2141,7 @@ int ludwig_colloids_update(ludwig_t * ludwig) {
 
   tdpAssert( tdpGetDeviceCount(&ndevice) );
 
-  lb_ndist(ludwig->lb, &ndist);
-  iconserve = (ludwig->psi || (ludwig->phi && ndist == 1));
+  iconserve = (ludwig->psi || (ludwig->phi && ludwig->lb->ndist == 1));
 
   TIMER_start(TIMER_PARTICLE_HALO);
 

From f7258a761a25690bd30e64427aaa1a771b262f11 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Mon, 17 Feb 2025 17:23:14 +0000
Subject: [PATCH 125/133] Update i/o freq keys and i/o error messages

---
 src/control.c                             | 100 +++++++---------------
 src/control.h                             |   8 +-
 src/field.c                               |  33 ++++++-
 src/io_info_args.c                        |   8 +-
 src/io_info_args.h                        |  18 ++--
 src/io_info_args_rt.c                     |  64 ++++++++++++--
 src/io_info_args_rt.h                     |  20 +++--
 src/lb_data.c                             |  37 +++++++-
 src/ludwig.c                              |  69 +++++++++++----
 src/map.c                                 |  24 +++++-
 src/noise.c                               |  29 ++++++-
 tests/regression/d2q9/serial-2khz-bm1.inp |   2 +-
 tests/unit/test_io_info_args.c            |   4 +-
 tests/unit/test_io_info_args_rt.c         |  43 +++++++++-
 tests/unit/test_map.c                     |  13 +--
 15 files changed, 324 insertions(+), 148 deletions(-)

diff --git a/src/control.c b/src/control.c
index a43654045..8bff69a20 100644
--- a/src/control.c
+++ b/src/control.c
@@ -8,7 +8,7 @@
  *  end Edinburgh Parallel Computing Centre
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
- *  (c) 2008-2023 The University of Edinburgh
+ *  (c) 2008-2025 The University of Edinburgh
  *
  *****************************************************************************/
 
@@ -21,14 +21,9 @@
 #include "physics.h"
 #include "control.h"
 
-
 static int freq_statistics = 100;
 static int freq_measure    = 100000000;
 static int freq_config     = 100000000;
-static int freq_phi        = 100000000;
-static int freq_psi        = 100000000;
-static int freq_vel        = 100000000;
-static int freq_fed        = 100000000;
 static int freq_shear_io   = 100000000;
 static int freq_shear_meas = 100000000;
 static int freq_colloid_io = 100000000;
@@ -69,10 +64,6 @@ int init_control(pe_t * pe, rt_t * rt) {
 
   rt_int_parameter(rt, "freq_measure", &freq_measure);
   rt_int_parameter(rt, "freq_config", &freq_config);
-  rt_int_parameter(rt, "freq_phi", &freq_phi);
-  rt_int_parameter(rt, "freq_psi", &freq_psi);
-  rt_int_parameter(rt, "freq_vel", &freq_vel);
-  rt_int_parameter(rt, "freq_fed", &freq_fed);
   rt_int_parameter(rt, "freq_shear_measurement", &freq_shear_meas);
   rt_int_parameter(rt, "freq_shear_output", &freq_shear_io);
   rt_int_parameter(rt, "colloid_io_freq", &freq_colloid_io);
@@ -94,6 +85,35 @@ int init_control(pe_t * pe, rt_t * rt) {
   /* This is a record of the last time step for "config_at_end" */
   nsteps_ = t_start + t_steps;
 
+  /* All these keys are scheduled for replacement by a more
+   * flexible mechanism. In particular ...*/
+
+  if (rt_key_present(rt, "freq_phi")) {
+    pe_info(pe, "Input file contains key: freq_phi\n");
+    pe_info(pe, "Please use phi_io_freq instead for order parameter output\n");
+    pe_info(pe, "See https://ludwig.epcc.ed.ac.uk/outputs/fluid.html\n");
+    pe_exit(pe, "Please check and try again\n");
+  }
+
+  if (rt_key_present(rt, "freq_psi")) {
+    pe_info(pe, "Input file contains key: freq_psi\n");
+    pe_info(pe, "Please use psi_io_freq instead for electrokinectic output\n");
+    pe_info(pe, "See https://ludwig.epcc.ed.ac.uk/outputs/fluid.html\n");
+    pe_exit(pe, "Please check and try again\n");
+  }
+
+  if (rt_key_present(rt, "freq_vel")) {
+    pe_info(pe, "Input file contains key: freq_vel\n");
+    pe_info(pe, "Please use vel_io_freq instead for velocity field output\n");
+    pe_info(pe, "See https://ludwig.epcc.ed.ac.uk/outputs/fluid.html\n");
+    pe_exit(pe, "Please check and try again\n");
+  }
+
+  if (rt_key_present(rt, "freq_fed")) {
+    pe_info(pe, "Input file contains key: freq_fed\n");
+    pe_info(pe, "Lattice free enegy density output is not implemented\n");
+  }
+
   return 0;
 }
 
@@ -132,54 +152,6 @@ int is_colloid_io_step() {
   return ((physics_control_timestep(phys) % freq_colloid_io) == 0);
 }
 
-/*****************************************************************************
- *
- *  is_phi_output_step
- *
- *****************************************************************************/
-
-int is_phi_output_step() {
-  physics_t * phys = NULL;
-  physics_ref(&phys);
-  return ((physics_control_timestep(phys) % freq_phi) == 0);
-}
-
-/*****************************************************************************
- *
- *  is_vel_output_step
- *
- *****************************************************************************/
-
-int is_vel_output_step() {
-  physics_t * phys = NULL;
-  physics_ref(&phys);
-  return ((physics_control_timestep(phys) % freq_vel) == 0);
-}
-
-/*****************************************************************************
- *
- *  is_psi_output_step
- *
- *****************************************************************************/
-
-int is_psi_output_step() {
-  physics_t * phys = NULL;
-  physics_ref(&phys);
-  return ((physics_control_timestep(phys) % freq_psi) == 0);
-}
-
-/*****************************************************************************
- *
- *  is_fed_output_step
- *
- *****************************************************************************/
-
-int is_fed_output_step() {
-  physics_t * phys = NULL;
-  physics_ref(&phys);
-  return ((physics_control_timestep(phys) % freq_fed) == 0);
-}
-
 /*****************************************************************************
  *
  *  is_config_at_end
@@ -228,15 +200,3 @@ int control_freq_set(int freq) {
 
   return 0;
 }
-
-/*****************************************************************************
- *
- *  is_rho_output_step
- *
- *****************************************************************************/
-
-int is_rho_output_step(void) {
-  physics_t * phys = NULL;
-  physics_ref(&phys);
-  return ((physics_control_timestep(phys) % rho_nfreq) == 0);
-}
diff --git a/src/control.h b/src/control.h
index bab107986..dcade210b 100644
--- a/src/control.h
+++ b/src/control.h
@@ -5,7 +5,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2008-2019 The University of Edinburgh
+ *  (c) 2008-2025 The University of Edinburgh
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
  *****************************************************************************/
@@ -15,7 +15,6 @@
 
 #include "pe.h"
 #include "runtime.h"
-#include "physics.h"
 
 int init_control(pe_t * pe, rt_t * rt);
 int is_statistics_step(void);
@@ -24,11 +23,6 @@ int is_config_step(void);
 int is_config_at_end(void);
 int is_colloid_io_step(void);
 
-int is_phi_output_step(void);
-int is_vel_output_step(void);
-int is_psi_output_step(void);
-int is_rho_output_step(void);
-int is_fed_output_step(void);
 int is_shear_measurement_step(void);
 int is_shear_output_step(void);
 int control_freq_set(int freq);
diff --git a/src/field.c b/src/field.c
index f25f5f5b1..673040145 100644
--- a/src/field.c
+++ b/src/field.c
@@ -1649,12 +1649,26 @@ int field_io_write(field_t * field, int timestep, io_event_t * event) {
   assert(ifail == 0);
 
   if (ifail == 0) {
+    /* The device -> host transfer occurs here, if relevant. */
+    /* The write returns an MPI error code (MPI_SUCCESS if all is well) */
+    int ierr = MPI_SUCCESS;
     io_event_record(event, IO_EVENT_AGGR);
     field_memcpy(field, tdpMemcpyDeviceToHost);
     field_io_aggr_pack(field, io->aggr);
 
     io_event_record(event, IO_EVENT_WRITE);
-    io->impl->write(io, filename);
+    ierr = io->impl->write(io, filename);
+
+    if (ierr != MPI_SUCCESS) {
+      /* We could try to continue here, but fail at the moment */
+      pe_t * pe = field->pe;
+      int len = 0;
+      char msg[MPI_MAX_ERROR_STRING] = {0};
+      MPI_Error_string(ierr, msg, &len);
+      pe_info(pe, "Error: Could not write field data file: %s\n", filename);
+      pe_info(pe, "Error; %s\n", msg);
+      pe_exit(pe, "Will not continue. Stopping.\n");
+    }
 
     if (meta->options.report) {
       pe_info(field->pe, "MPIIO wrote to %s\n", filename);
@@ -1686,12 +1700,27 @@ int field_io_read(field_t * field, int timestep, io_event_t * event) {
   assert(ifail == 0);
 
   if (ifail == 0) {
+    /* The read returns an MPI error code, which is checked below. */
+    int ierr = MPI_SUCCESS;
     io_event_record(event, IO_EVENT_READ);
-    io->impl->read(io, filename);
+    ierr = io->impl->read(io, filename);
     io_event_record(event, IO_EVENT_DISAGGR);
     field_io_aggr_unpack(field, io->aggr);
     io->impl->free(&io);
 
+    if (ierr != MPI_SUCCESS) {
+      pe_t * pe = field->pe;
+      int len = 0;
+      char msg[MPI_MAX_ERROR_STRING] = {0};
+      MPI_Error_string(ierr, msg, &len);
+      pe_info(pe, "Error: Could not read field data file: %s\n", filename);
+      pe_info(pe, "Error; %s\n", msg);
+      pe_exit(pe, "Please check and try again. Cannot recover. Stopping.\n");
+    }
+
+    /* The host -> device transfer occurs here, if relevant */
+    field_memcpy(field, tdpMemcpyHostToDevice);
+
     if (meta->options.report) {
       pe_info(field->pe, "MPIIO read from %s\n", filename);
       io_event_report_read(event, meta, field->name);
diff --git a/src/io_info_args.c b/src/io_info_args.c
index 7e059f283..97edb3690 100644
--- a/src/io_info_args.c
+++ b/src/io_info_args.c
@@ -8,7 +8,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2020 The University of Edinburgh
+ *  (c) 2020-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -25,12 +25,12 @@
  *
  *****************************************************************************/
 
-__host__ io_info_args_t io_info_args_default(void) {
+io_info_args_t io_info_args_default(void) {
 
   io_info_args_t args = {.input  = io_options_default(),
 			 .output = io_options_default(),
 			 .grid   = {1, 1, 1},
-			 .nfreq  = 100000};
+			 .iofreq = 0};
   return args;
 }
 
@@ -42,7 +42,7 @@ __host__ io_info_args_t io_info_args_default(void) {
  *
  *****************************************************************************/
 
-__host__ int io_info_args_iogrid_valid(int iogrid[3]) {
+int io_info_args_iogrid_valid(int iogrid[3]) {
 
   int valid = 1;
 
diff --git a/src/io_info_args.h b/src/io_info_args.h
index 3db1e61a1..76187ea6f 100644
--- a/src/io_info_args.h
+++ b/src/io_info_args.h
@@ -9,7 +9,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2020-2024 The University of Edinburgh
+ *  (c) 2020-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -23,10 +23,12 @@
 
 /* Specifies whether input or output is expected for given io type */
 
-enum io_info_rw_enum {IO_INFO_NONE,
-		      IO_INFO_READ_ONLY,
-		      IO_INFO_WRITE_ONLY,
-		      IO_INFO_READ_WRITE};
+enum io_info_rw_enum {
+  IO_INFO_NONE,
+  IO_INFO_READ_ONLY,
+  IO_INFO_WRITE_ONLY,
+  IO_INFO_READ_WRITE
+};
 
 typedef enum   io_info_rw_enum io_info_rw_enum_t;
 typedef struct io_info_args_s io_info_args_t;
@@ -35,10 +37,10 @@ struct io_info_args_s {
   io_options_t input;            /* Input mode, format, ... */
   io_options_t output;           /* Output mode, format, ... */
   int grid[3];                   /* Input and output have same grid */
-  int nfreq;                     /* Output only. Frequency (every n steps) */
+  int iofreq;                    /* Output only. Frequency (every n steps) */
 };
 
-__host__ io_info_args_t io_info_args_default(void);
-__host__ int io_info_args_iogrid_valid(int iogrid[3]);
+io_info_args_t io_info_args_default(void);
+int io_info_args_iogrid_valid(int iogrid[3]);
 
 #endif
diff --git a/src/io_info_args_rt.c b/src/io_info_args_rt.c
index f601718fb..c7d45132a 100644
--- a/src/io_info_args_rt.c
+++ b/src/io_info_args_rt.c
@@ -2,11 +2,13 @@
  *
  *  io_info_args_rt.c
  *
+ *  Initialisation of the container for i/o information.
+ *
  *
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2020-2022 The University of Edinburgh
+ *  (c) 2020-2025 The University of Edinburgh
  *
  *  Contribuiting authors:
  *  Kevin Stratford
@@ -28,8 +30,8 @@
  *
  *****************************************************************************/
 
-__host__ int io_info_args_rt(rt_t * rt, rt_enum_t lv, const char * stub,
-			     io_info_rw_enum_t rw, io_info_args_t * args) {
+int io_info_args_rt(rt_t * rt, rt_enum_t lv, const char * stub,
+		    io_info_rw_enum_t rw, io_info_args_t * args) {
 
   assert(rt);
   assert(stub);
@@ -64,6 +66,16 @@ __host__ int io_info_args_rt(rt_t * rt, rt_enum_t lv, const char * stub,
   args->output.iogrid[1] = args->grid[1];
   args->output.iogrid[2] = args->grid[2];
 
+  /* i/o frequency (output only) */
+
+  {
+    char key[BUFSIZ] = {0};
+
+    sprintf(key, "%s_io_freq", stub);
+    io_info_args_rt_iofreq(rt, lv, "default_io_freq", &args->iofreq);
+    io_info_args_rt_iofreq(rt, lv, key, &args->iofreq);
+  }
+
   return 0;
 }
 
@@ -73,8 +85,8 @@ __host__ int io_info_args_rt(rt_t * rt, rt_enum_t lv, const char * stub,
  *
  *****************************************************************************/
 
-__host__ int io_info_args_rt_input(rt_t * rt, rt_enum_t lv, const char * stub,
-				   io_info_args_t * args) {
+int io_info_args_rt_input(rt_t * rt, rt_enum_t lv, const char * stub,
+			  io_info_args_t * args) {
 
   char stub_input[BUFSIZ] = {0};
 
@@ -101,8 +113,8 @@ __host__ int io_info_args_rt_input(rt_t * rt, rt_enum_t lv, const char * stub,
  *
  *****************************************************************************/
 
-__host__ int io_info_args_rt_output(rt_t * rt, rt_enum_t lv, const char * stub,
-				    io_info_args_t * args) {
+int io_info_args_rt_output(rt_t * rt, rt_enum_t lv, const char * stub,
+			   io_info_args_t * args) {
 
   char stub_output[BUFSIZ] = {0};
 
@@ -129,8 +141,8 @@ __host__ int io_info_args_rt_output(rt_t * rt, rt_enum_t lv, const char * stub,
  *
  *****************************************************************************/
 
-__host__ int io_info_args_rt_iogrid(rt_t * rt, rt_enum_t lv, const char * key,
-				    int grid[3]) {
+int io_info_args_rt_iogrid(rt_t * rt, rt_enum_t lv, const char * key,
+			   int grid[3]) {
 
   int key_present = 0;
   int iogrid[3] = {0};   /* invalid */
@@ -160,3 +172,37 @@ __host__ int io_info_args_rt_iogrid(rt_t * rt, rt_enum_t lv, const char * key,
 
   return ifail;
 }
+
+/*****************************************************************************
+ *
+ *  io_info_args_rt_iofreq
+ *
+ *  Obtain a valid i/o frequency, if present; *iofreq is unchanged otherwise.
+ *
+ *****************************************************************************/
+
+int io_info_args_rt_iofreq(rt_t * rt, rt_enum_t lv, const char * key,
+			   int * iofreq) {
+
+  int ifail = -1;      /* -1 for no key; 0 for valid key; +1 if invalid */
+  int ival  =  0;
+  int key_present = 0;
+
+  key_present = rt_int_parameter(rt, key, &ival);
+
+  if (key_present) {
+    if (ival >= 0) {
+      *iofreq = ival;
+      ifail = 0;
+    }
+    else { /* NOTE(review): assumes the first rt_fatal() may not return */
+      rt_vinfo(rt, lv, "I/O freq key present but is invalid\n");
+      rt_vinfo(rt, lv, "key: %s\n", key);
+      rt_vinfo(rt, lv, "The value must be a non-negative integer.\n");
+      rt_fatal(rt, lv, "Please check the input file and try again!\n");
+      ifail = 1;
+    }
+  }
+
+  return ifail;
+}
diff --git a/src/io_info_args_rt.h b/src/io_info_args_rt.h
index 3d76e1bbd..99bef27f2 100644
--- a/src/io_info_args_rt.h
+++ b/src/io_info_args_rt.h
@@ -6,7 +6,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2020-2022 The University of Edinburgh
+ *  (c) 2020-2025 The University of Edinburgh
  *
  *  Contribuiting authors:
  *  Kevin Stratford
@@ -19,13 +19,15 @@
 #include "runtime.h"
 #include "io_info_args.h"
 
-__host__ int io_info_args_rt(rt_t * rt, rt_enum_t lv, const char * name,
-			     io_info_rw_enum_t rw, io_info_args_t * args);
-__host__ int io_info_args_rt_input(rt_t * rt, rt_enum_t lv, const char * stub,
-				   io_info_args_t * args);
-__host__ int io_info_args_rt_output(rt_t * rt, rt_enum_t lv, const char * stub,
-				    io_info_args_t * args);
-__host__ int io_info_args_rt_iogrid(rt_t * rt, rt_enum_t lv, const char * key,
-			            int iogrid[3]);
+int io_info_args_rt(rt_t * rt, rt_enum_t lv, const char * name,
+		    io_info_rw_enum_t rw, io_info_args_t * args);
+int io_info_args_rt_input(rt_t * rt, rt_enum_t lv, const char * stub,
+			  io_info_args_t * args);
+int io_info_args_rt_output(rt_t * rt, rt_enum_t lv, const char * stub,
+			   io_info_args_t * args);
+int io_info_args_rt_iogrid(rt_t * rt, rt_enum_t lv, const char * key,
+			   int iogrid[3]);
+int io_info_args_rt_iofreq(rt_t * rt, rt_enum_t lv, const char * key,
+			   int * iofreq);
 
 #endif
diff --git a/src/lb_data.c b/src/lb_data.c
index 459a240fa..395399f75 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -1546,19 +1546,31 @@ int lb_io_write(lb_t * lb, int timestep, io_event_t * event) {
     assert(ifail == 0);
 
     if (ifail == 0) {
+      int ierr = MPI_SUCCESS;
       io_event_record(event, IO_EVENT_AGGR);
       lb_memcpy(lb, tdpMemcpyDeviceToHost);
       lb_io_aggr_pack(lb, io->aggr);
 
       io_event_record(event, IO_EVENT_WRITE);
-      io->impl->write(io, filename);
+      ierr = io->impl->write(io, filename);
+
+      if (ierr != MPI_SUCCESS) {
+	/* An error has occurred */
+	pe_t * pe = lb->pe;
+	int len = 0;
+	char msg[MPI_MAX_ERROR_STRING] ={0};
+	MPI_Error_string(ierr, msg, &len);
+	pe_info(pe, "Error: write distribuiion file failed: %s\n", filename);
+	pe_info(pe, "Error: %s\n", msg);
+	pe_exit(pe, "Will not continue, Stopping.\n");
+      }
 
       if (meta->options.report) {
 	pe_info(lb->pe, "MPIIO wrote to %s\n", filename);
+	io_event_report_write(event, meta, "dist");
       }
 
       io->impl->free(&io);
-      io_event_report_write(event, meta, "dist");
     }
   }
 
@@ -1589,9 +1601,28 @@ int lb_io_read(lb_t * lb, int timestep, io_event_t * event) {
     assert(ifail == 0);
 
     if (ifail == 0) {
-      io->impl->read(io, filename);
+      int ierr = MPI_SUCCESS;
+      io_event_record(event, IO_EVENT_READ);
+      ierr = io->impl->read(io, filename);
+
+      if (ierr != MPI_SUCCESS) {
+	pe_t * pe = lb->pe;
+	int len = 0;
+	char msg[MPI_MAX_ERROR_STRING] ={0};
+	MPI_Error_string(ierr, msg, &len);
+	pe_info(pe, "Error: could not read distribuiion file: %s\n", filename);
+	pe_info(pe, "Error: %s\n", msg);
+	pe_exit(pe, "Cannot recover. Please check and try again. Stopping.\n");
+      }
+
+      io_event_record(event, IO_EVENT_DISAGGR);
       lb_io_aggr_unpack(lb, io->aggr);
       io->impl->free(&io);
+
+      if (meta->options.report) {
+	pe_info(lb->pe, "MPI read from %s\n", filename);
+	io_event_report_read(event, meta, "distributions");
+      }
     }
   }
 
diff --git a/src/ludwig.c b/src/ludwig.c
index 2e2b15617..05999cc41 100644
--- a/src/ludwig.c
+++ b/src/ludwig.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2011-2024 The University of Edinburgh
+ *  (c) 2011-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -355,6 +355,13 @@ static int ludwig_rt(ludwig_t * ludwig) {
       field_io_read(ludwig->psi->psi, ntstep, &event1);
       field_io_read(ludwig->psi->rho, ntstep, &event2);
     }
+
+    /* Lattice RNG state */
+    if (ludwig->noise) {
+      io_event_t event = {0};
+      pe_info(pe, "Reading lattice rng state for step %d\n", ntstep);
+      noise_io_read(ludwig->noise, ntstep, &event);
+    }
   }
 
   /* gradient initialisation for field stuff */
@@ -881,34 +888,40 @@ void ludwig_run(const char * inputfile) {
       }
     }
 
-    if (is_phi_output_step() || is_config_step()) {
-
-      if (ludwig->phi) {
+    if (ludwig->phi) {
+      int output = (0 == (step % ludwig->phi->opts.iodata.iofreq));
+      if (output || is_config_step()) {
 	io_event_t event = {0};
 	pe_info(ludwig->pe, "Writing phi file at step %d!\n", step);
-	field_memcpy(ludwig->phi, tdpMemcpyDeviceToHost);
 	field_io_write(ludwig->phi, step, &event);
       }
+    }
 
-      if (ludwig->p) {
+    if (ludwig->p) {
+      int output = (0 == (step % ludwig->p->opts.iodata.iofreq));
+      if (output || is_config_step()) {
 	io_event_t event = {0};
 	pe_info(ludwig->pe, "Writing p file at step %d!\n", step);
-	field_memcpy(ludwig->p, tdpMemcpyDeviceToHost);
 	field_io_write(ludwig->p, step, &event);
       }
+    }
 
-      if (ludwig->q) {
+    if (ludwig->q) {
+      int output = (0 == (step % ludwig->q->opts.iodata.iofreq));
+      if (output || is_config_step()) {
 	io_event_t event = {0};
 	pe_info(ludwig->pe, "Writing q file at step %d!\n", step);
-	field_memcpy(ludwig->q, tdpMemcpyDeviceToHost);
+	/* Replacement needs to be reconsidered in a device context ... */
 	io_replace_values(ludwig->q, ludwig->map, MAP_COLLOID, 0.00001);
 	field_io_write(ludwig->q, step, &event);
       }
     }
 
     if (ludwig->psi) {
-      if (is_psi_output_step() || is_config_step()) {
-	pe_info(ludwig->pe, "Writing psi file at step %d!\n", step);
+      /* The potential and the charge densities (both controlled by "psi") */
+      int output = (0 == (step % ludwig->psi->psi->opts.iodata.iofreq));
+      if (output || is_config_step()) {
+	pe_info(ludwig->pe, "Writing electrokinetic data at step %d!\n", step);
 	psi_io_write(ludwig->psi, step);
       }
     }
@@ -932,11 +945,35 @@ void ludwig_run(const char * inputfile) {
       stats_rheology_stress_profile_zero(ludwig->stat_rheo);
     }
 
-    if (ludwig->hydro && (is_vel_output_step() || is_config_step())) {
-      io_event_t event = {0};
-      pe_info(ludwig->pe, "Writing rho/velocity output at step %d!\n", step);
-      hydro_memcpy(ludwig->hydro, tdpMemcpyDeviceToHost);
-      hydro_io_write(ludwig->hydro, step, &event);
+    /* Hydrodynamic quantities */
+    if (ludwig->hydro) {
+      if (is_config_step()) {
+	io_event_t event = {0};
+	pe_info(ludwig->pe, "Writing rho/velocity output at step %d!\n", step);
+	hydro_io_write(ludwig->hydro, step, &event);
+      }
+      else {
+	/* Individual requests */
+	if (0 == (step % ludwig->hydro->rho->opts.iodata.iofreq)) {
+	  io_event_t event = {0};
+	  pe_info(ludwig->pe, "Writing rho output at step %d!\n", step);
+	  field_io_write(ludwig->hydro->rho, step, &event);
+	}
+	if (0 == (step % ludwig->hydro->u->opts.iodata.iofreq)) {
+	  io_event_t event = {0};
+	  pe_info(ludwig->pe, "Writing velocity output at step %d!\n", step);
+	  field_io_write(ludwig->hydro->u, step, &event);
+	}
+      }
+    }
+
+    if (ludwig->noise) {
+      /* This is only part of the configuration at the moment */
+      if (is_config_step()) {
+	io_event_t event = {0};
+	pe_info(ludwig->pe, "Writing lattice rng state at step %d\n", step);
+	noise_io_write(ludwig->noise, step, &event);
+      }
     }
 
     /* Print progress report */
diff --git a/src/map.c b/src/map.c
index 5488c33cb..df945bdb0 100644
--- a/src/map.c
+++ b/src/map.c
@@ -8,7 +8,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2012-2024 The University of Edinburgh
+ *  (c) 2012-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -636,6 +636,18 @@ int map_io_read(map_t * map, int timestep) {
       map_io_aggr_unpack(map, io->aggr);
       io->impl->free(&io);
     }
+
+    if (ifail != MPI_SUCCESS) {
+      int len = 0;
+      char msg[MPI_MAX_ERROR_STRING] = {0};
+      MPI_Error_string(ifail, msg, &len);
+      pe_info(map->pe, "Error: could not read data file: %s\n", filename);
+      pe_info(map->pe, "Error: %s\n", msg);
+      pe_exit(map->pe, "Cannot continue. Stopping.\n");
+    }
+
+    /* Some sanitisation of input data may be appropriate */
+    map_memcpy(map, tdpMemcpyHostToDevice);
   }
 
   return ifail;
@@ -663,10 +675,20 @@ int map_io_write(map_t * map, int timestep) {
     ifail = io_impl_create(meta, &io);
 
     if (ifail == 0) {
+      map_memcpy(map, tdpMemcpyDeviceToHost);
       map_io_aggr_pack(map, io->aggr);
       ifail = io->impl->write(io, filename);
       io->impl->free(&io);
     }
+
+    if (ifail != MPI_SUCCESS) {
+      int len = 0;
+      char msg[MPI_MAX_ERROR_STRING] = {0};
+      MPI_Error_string(ifail, msg, &len);
+      pe_info(map->pe, "Error: could not write data file: %s\n", filename);
+      pe_info(map->pe, "Error: %s\n", msg);
+      pe_exit(map->pe, "Will not continue. Stopping.\n");
+    }
   }
 
   return ifail;
diff --git a/src/noise.c b/src/noise.c
index 5a7eff6e2..a6ec978cf 100644
--- a/src/noise.c
+++ b/src/noise.c
@@ -25,7 +25,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2013-2024 The University of Edinburgh
+ *  (c) 2013-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -638,13 +638,25 @@ int noise_io_write(noise_t * ns, int timestep, io_event_t * event) {
     ifail = io_impl_create(meta, &io);
 
     if (ifail == 0) {
+      int ierr = MPI_SUCCESS;
       io_event_record(event, IO_EVENT_AGGR);
+      /* Include device->host copy if relevant */
+      noise_memcpy(ns, tdpMemcpyDeviceToHost);
       noise_io_aggr_pack(ns, io->aggr);
 
       io_event_record(event, IO_EVENT_WRITE);
-      io->impl->write(io, filename);
+      ierr = io->impl->write(io, filename);
       io->impl->free(&io);
 
+      if (ierr != MPI_SUCCESS) {
+	int len = 0;
+	char msg[MPI_MAX_ERROR_STRING] = {0};
+	MPI_Error_string(ierr, msg, &len);
+	pe_info(ns->pe, "Error: could not write noise state %s\n", filename);
+	pe_info(ns->pe, "Error: %s\n", msg);
+	pe_exit(ns->pe, "Will not continue. Stopping\n");
+      }
+
       if (meta->options.report) {
 	pe_info(ns->pe, "Wrote noise state to file: %s\n", filename);
       }
@@ -673,9 +685,20 @@ int noise_io_read(noise_t * ns, int timestep, io_event_t * event) {
   ifail = io_impl_create(meta, &io);
 
   if (ifail == 0) {
-    io->impl->read(io, filename);
+    int ierr = io->impl->read(io, filename);
+
+    if (ierr != MPI_SUCCESS) {
+      int len = 0;
+      char msg[MPI_MAX_ERROR_STRING] = {0};
+      MPI_Error_string(ierr, msg, &len);
+      pe_info(ns->pe, "Error: could not read noise state %s\n", filename);
+      pe_info(ns->pe, "Error: %s\n", msg);
+      pe_exit(ns->pe, "Cannot not continue. Stopping\n");
+    }
+
     io_event_record(event, IO_EVENT_AGGR);
     noise_io_aggr_unpack(ns, io->aggr);
+    noise_memcpy(ns, tdpMemcpyHostToDevice);
     io->impl->free(&io);
     if (meta->options.report) pe_info(ns->pe, "Read %s\n", filename);
   }
diff --git a/tests/regression/d2q9/serial-2khz-bm1.inp b/tests/regression/d2q9/serial-2khz-bm1.inp
index 1916687c7..4b1321884 100644
--- a/tests/regression/d2q9/serial-2khz-bm1.inp
+++ b/tests/regression/d2q9/serial-2khz-bm1.inp
@@ -114,7 +114,7 @@ periodicity 1_1_1
 ###############################################################################
 
 freq_statistics 2560
-freq_vel        5120
+vel_io_freq     5120
 config_at_end   no
 
 ###############################################################################
diff --git a/tests/unit/test_io_info_args.c b/tests/unit/test_io_info_args.c
index 1a205507a..83d1c9eaf 100644
--- a/tests/unit/test_io_info_args.c
+++ b/tests/unit/test_io_info_args.c
@@ -5,7 +5,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2022 The University of Edinburgh
+ *  (c) 2022-2025 The University of Edinburgh
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
@@ -60,7 +60,7 @@ int test_io_info_args_default(void) {
   assert(args.grid[0]     == 1);
   assert(args.grid[1]     == 1);
   assert(args.grid[2]     == 1);
-  assert(args.nfreq       == 100000);
+  assert(args.iofreq      == 0);
 
   if (args.input.mode != 0) ifail += 1;
 
diff --git a/tests/unit/test_io_info_args_rt.c b/tests/unit/test_io_info_args_rt.c
index fb5a57796..7c917822e 100644
--- a/tests/unit/test_io_info_args_rt.c
+++ b/tests/unit/test_io_info_args_rt.c
@@ -6,7 +6,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Groups and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2022-2024 The University of Edinburgh
+ *  (c) 2022-2025 The University of Edinburgh
  *
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
  *
@@ -20,6 +20,7 @@ int test_io_info_args_rt(pe_t * pe);
 int test_io_info_args_rt_input(pe_t * pe);
 int test_io_info_args_rt_output(pe_t * pe);
 int test_io_info_args_rt_iogrid(pe_t * pe);
+int test_io_info_args_rt_iofreq(pe_t * pe);
 
 /*****************************************************************************
  *
@@ -34,6 +35,8 @@ int test_io_info_args_rt_suite(void) {
   pe_create(MPI_COMM_WORLD, PE_QUIET, &pe);
 
   test_io_info_args_rt_iogrid(pe);
+  test_io_info_args_rt_iofreq(pe);
+
   test_io_info_args_rt_output(pe);
   test_io_info_args_rt_input(pe);
   test_io_info_args_rt(pe);
@@ -254,3 +257,41 @@ int test_io_info_args_rt_iogrid(pe_t * pe) {
 
   return ierr;
 }
+
+/*****************************************************************************
+ *
+ *  test_io_info_args_rt_iofreq
+ *
+ *****************************************************************************/
+
+int test_io_info_args_rt_iofreq(pe_t * pe) {
+
+  int ifail = 0;
+  rt_t * rt = NULL;
+
+  rt_create(pe, &rt);
+
+  {
+    /* Wrong */
+    const char * wrong = "wrong_iofreq";
+    int iofreq = 0;
+    rt_add_key_value(rt, wrong, "-1");
+    ifail = io_info_args_rt_iofreq(rt, RT_NONE, wrong, &iofreq);
+    assert(ifail != 0);
+    assert(iofreq == 0); /* Unchanged */
+  }
+
+  {
+    /* Right */
+    const char * right = "right_iofreq";
+    int iofreq = 0;
+    rt_add_key_value(rt, right, " 100");
+    ifail = io_info_args_rt_iofreq(rt, RT_FATAL, right, &iofreq);
+    assert(ifail == 0);
+    assert(iofreq == 100);
+  }
+
+  rt_free(rt);
+
+  return ifail;
+}
diff --git a/tests/unit/test_map.c b/tests/unit/test_map.c
index 7d1de8b79..58a48f023 100644
--- a/tests/unit/test_map.c
+++ b/tests/unit/test_map.c
@@ -7,7 +7,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2012-2024 The University of Edinburgh
+ *  (c) 2012-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -750,17 +750,6 @@ int test_map_io_read(pe_t * pe, cs_t * cs) {
     map_free(&map);
   }
 
-  /* non-existant file returns an error */
-  {
-    map_options_t opts = map_options_default();
-    map_t * map = NULL;
-
-    map_create(pe, cs, &opts, &map);
-    ifail = map_io_read(map, 999);
-    assert(ifail != MPI_SUCCESS);
-    map_free(&map);
-  }
-
   return ifail;
 }
 

From 146c94a75c8eca9a0309c79b234ee4914b6441de Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Mon, 17 Feb 2025 17:45:51 +0000
Subject: [PATCH 126/133] Minor format updates

---
 src/lb_data.c | 122 +++++++++++++++++++++++++++-----------------------
 1 file changed, 66 insertions(+), 56 deletions(-)

diff --git a/src/lb_data.c b/src/lb_data.c
index 3b374d027..5d167e718 100644
--- a/src/lb_data.c
+++ b/src/lb_data.c
@@ -921,7 +921,8 @@ int lb_halo_enqueue_send(const lb_t * lb, lb_halo_t * h, int ireq) {
  *
  *****************************************************************************/
 
-__global__ void lb_halo_enqueue_send_kernel(const lb_t * lb, lb_halo_t * h, int ireq) {
+__global__ void lb_halo_enqueue_send_kernel(const lb_t * lb, lb_halo_t * h,
+					    int ireq) {
 
   assert(0 <= ireq && ireq < h->map.nvel);
 
@@ -950,19 +951,19 @@ __global__ void lb_halo_enqueue_send_kernel(const lb_t * lb, lb_halo_t * h, int
       int ib = 0; /* Buffer index */
 
       for (int n = 0; n < lb->ndist; n++) {
-	      for (int p = 0; p < lb->nvel; p++) {
-	        /* Recall, if full, we need p = 0 */
-	        int8_t px = lb->model.cv[p][X];
-	        int8_t py = lb->model.cv[p][Y];
-	        int8_t pz = lb->model.cv[p][Z];
-	        int dot = mx*px + my*py + mz*pz;
-	        if (h->full || dot == mm) {
-	          int index = cs_index(lb->cs, ic, jc, kc);
-	          int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
-	          h->send[ireq][ih*h->count[ireq] + ib] = lb->f[laddr];
-	          ib++;
-	        }
-	      }
+	for (int p = 0; p < lb->nvel; p++) {
+	  /* Recall, if full, we need p = 0 */
+	  int8_t px = lb->model.cv[p][X];
+	  int8_t py = lb->model.cv[p][Y];
+	  int8_t pz = lb->model.cv[p][Z];
+	  int dot = mx*px + my*py + mz*pz;
+	  if (h->full || dot == mm) {
+	    int index = cs_index(lb->cs, ic, jc, kc);
+	    int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
+	    h->send[ireq][ih*h->count[ireq] + ib] = lb->f[laddr];
+	    ib++;
+	  }
+	}
       }
       assert(ib == h->count[ireq]);
     }
@@ -1049,7 +1050,8 @@ int lb_halo_dequeue_recv(lb_t * lb, const lb_halo_t * h, int ireq) {
  *
  *****************************************************************************/
 
-__global__ void lb_halo_dequeue_recv_kernel(lb_t * lb, const lb_halo_t * h, int ireq) {
+__global__ void lb_halo_dequeue_recv_kernel(lb_t * lb, const lb_halo_t * h,
+					    int ireq) {
 
   assert(lb);
   assert(h);
@@ -1091,20 +1093,20 @@ __global__ void lb_halo_dequeue_recv_kernel(lb_t * lb, const lb_halo_t * h, int
       int ib = 0; /* Buffer index */
 
       for (int n = 0; n < lb->ndist; n++) {
-	      for (int p = 0; p < lb->nvel; p++) {
-	        /* For reduced swap, we must have -cv[p] here... */
-	        int8_t px = lb->model.cv[lb->nvel-p][X];
-	        int8_t py = lb->model.cv[lb->nvel-p][Y];
-	        int8_t pz = lb->model.cv[lb->nvel-p][Z];
-	        int dot = mx*px + my*py + mz*pz;
-
-	        if (h->full || dot == mm) {
-	          int index = cs_index(lb->cs, ic, jc, kc);
-	          int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
-	          lb->f[laddr] = recv[ih*h->count[ireq] + ib];
-	          ib++;
-	        }
-	      }
+	for (int p = 0; p < lb->nvel; p++) {
+	  /* For reduced swap, we must have -cv[p] here... */
+	  int8_t px = lb->model.cv[lb->nvel-p][X];
+	  int8_t py = lb->model.cv[lb->nvel-p][Y];
+	  int8_t pz = lb->model.cv[lb->nvel-p][Z];
+	  int dot = mx*px + my*py + mz*pz;
+
+	  if (h->full || dot == mm) {
+	    int index = cs_index(lb->cs, ic, jc, kc);
+	    int laddr = LB_ADDR(lb->nsite, lb->ndist, lb->nvel, index, n, p);
+	    lb->f[laddr] = recv[ih*h->count[ireq] + ib];
+	    ib++;
+	  }
+	}
       }
       assert(ib == h->count[ireq]);
     }
@@ -1360,22 +1362,26 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
     if (use_graph_api_) {
       tdpAssert( tdpGraphLaunch(h->gsend.exec, h->stream) );
       tdpAssert( tdpStreamSynchronize(h->stream) );
-    } else {
+    }
+    else {
       for (int ireq = 0; ireq < h->map.nvel; ireq++) {
         if (h->count[ireq] > 0) {
           int scount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
           dim3 nblk, ntpb;
           kernel_launch_param(scount, &nblk, &ntpb);
-          tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
+          tdpLaunchKernel(lb_halo_enqueue_send_kernel, nblk, ntpb, 0, 0,
+			  lb->target, h->target, ireq);
           tdpAssert( tdpDeviceSynchronize());
  
           if (!have_gpu_aware_mpi_()) {
-            tdpAssert( tdpMemcpy(h->send[ireq], h->send_d[ireq], sizeof(double)*scount, tdpMemcpyDeviceToHost));
+            tdpAssert( tdpMemcpy(h->send[ireq], h->send_d[ireq],
+				 sizeof(double)*scount, tdpMemcpyDeviceToHost));
           }
         }
       }
     }
-  } else {
+  }
+  else {
     #pragma omp parallel
     {
       for (int ireq = 0; ireq < h->map.nvel; ireq++) {
@@ -1404,7 +1410,7 @@ int lb_halo_post(lb_t * lb, lb_halo_t * h) {
       if (h->nbrrank[i][j][k] == h->nbrrank[1][1][1]) continue;
 
       MPI_Isend(buf, mcount, MPI_DOUBLE, h->nbrrank[i][j][k],
-		            h->tagbase + ireq, h->comm, h->request + 27 + ireq);
+		h->tagbase + ireq, h->comm, h->request + 27 + ireq);
     }
   }
   
@@ -1438,21 +1444,25 @@ int lb_halo_wait(lb_t * lb, lb_halo_t * h) {
     if (use_graph_api_) {
       tdpAssert( tdpGraphLaunch(h->grecv.exec, h->stream) );
       tdpAssert( tdpStreamSynchronize(h->stream) );
-    } else {
+    }
+    else {
       for (int ireq = 0; ireq < h->map.nvel; ireq++) {
         if (h->count[ireq] > 0) {
           int rcount = h->count[ireq]*lb_halo_size(h->slim[ireq]);
           if (!have_gpu_aware_mpi_()) {
-            tdpAssert( tdpMemcpy(h->recv[ireq], h->recv_d[ireq], sizeof(double)*rcount, tdpMemcpyDeviceToHost));
+            tdpAssert( tdpMemcpy(h->recv[ireq], h->recv_d[ireq],
+				 sizeof(double)*rcount, tdpMemcpyDeviceToHost));
           }
           dim3 nblk, ntpb;
           kernel_launch_param(rcount, &nblk, &ntpb);
-          tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0, lb->target, h->target, ireq);
+          tdpLaunchKernel(lb_halo_dequeue_recv_kernel, nblk, ntpb, 0, 0,
+			  lb->target, h->target, ireq);
           tdpAssert( tdpDeviceSynchronize());
         }
       }
     }
-  } else {
+  }
+  else {
     #pragma omp parallel
     {
       for (int ireq = 0; ireq < h->map.nvel; ireq++) {
@@ -1829,23 +1839,23 @@ int lb_graph_halo_send_create(const lb_t * lb, lb_halo_t * h) {
       int k = 1 + h->map.cv[h->map.nvel - ireq][Z];
 
       if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
-	      tdpGraphNode_t memcpyNode;
+	tdpGraphNode_t memcpyNode;
         tdpMemcpy3DParms memcpyParams = {0};
 
-	      memcpyParams.srcArray = NULL;
-	      memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
-	      memcpyParams.srcPtr   = make_tdpPitchedPtr(h->send_d[ireq],
+	memcpyParams.srcArray = NULL;
+	memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
+	memcpyParams.srcPtr   = make_tdpPitchedPtr(h->send_d[ireq],
 						   sizeof(double)*h->count[ireq]*scount,
 						   h->count[ireq]*scount, 1);
-	      memcpyParams.dstArray = NULL;
-	      memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
-	      memcpyParams.dstPtr   = make_tdpPitchedPtr(h->send[ireq],
+	memcpyParams.dstArray = NULL;
+	memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
+	memcpyParams.dstPtr   = make_tdpPitchedPtr(h->send[ireq],
 						   sizeof(double)*h->count[ireq]*scount,
 						   h->count[ireq]*scount, 1);
-	      memcpyParams.extent   = make_tdpExtent(sizeof(double)*h->count[ireq]*scount, 1, 1);
-	      memcpyParams.kind     = tdpMemcpyDeviceToHost;
+	memcpyParams.extent   = make_tdpExtent(sizeof(double)*h->count[ireq]*scount, 1, 1);
+	memcpyParams.kind     = tdpMemcpyDeviceToHost;
 
-	      tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->gsend.graph,
+	tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->gsend.graph,
 					 &kernelNode, 1, &memcpyParams) );
       }
     }
@@ -1883,22 +1893,22 @@ int lb_graph_halo_recv_create(const lb_t * lb, lb_halo_t * h) {
       int k = 1 + h->map.cv[h->map.nvel - ireq][Z];
 
       if (h->nbrrank[i][j][k] != h->nbrrank[1][1][1]) {
-	      tdpMemcpy3DParms memcpyParams = {0};
+	tdpMemcpy3DParms memcpyParams = {0};
 
-	      memcpyParams.srcArray = NULL;
-	      memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
-	      memcpyParams.srcPtr   = make_tdpPitchedPtr(h->recv[ireq],
+	memcpyParams.srcArray = NULL;
+	memcpyParams.srcPos   = make_tdpPos(0, 0, 0);
+	memcpyParams.srcPtr   = make_tdpPitchedPtr(h->recv[ireq],
 						   sizeof(double)*h->count[ireq]*rcount,
 						   h->count[ireq]*rcount, 1);
-	      memcpyParams.dstArray = NULL;
-	      memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
-	      memcpyParams.dstPtr   = make_tdpPitchedPtr(h->recv_d[ireq],
+	memcpyParams.dstArray = NULL;
+	memcpyParams.dstPos   = make_tdpPos(0, 0, 0);
+	memcpyParams.dstPtr   = make_tdpPitchedPtr(h->recv_d[ireq],
 						   sizeof(double)*h->count[ireq]*rcount,
 						   h->count[ireq]*rcount, 1);
         memcpyParams.extent   = make_tdpExtent(sizeof(double)*h->count[ireq]*rcount, 1, 1);
         memcpyParams.kind     = tdpMemcpyHostToDevice;
 
-	      tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->grecv.graph, NULL,
+	tdpAssert( tdpGraphAddMemcpyNode(&memcpyNode, h->grecv.graph, NULL,
 					 0, &memcpyParams) );
       }
     }

From 26d8d7e3724b3526a3c99ac734a24acb61d46d15 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Mon, 17 Feb 2025 18:17:03 +0000
Subject: [PATCH 127/133] Avoid mod 0 undefined behaviour

---
 src/ludwig.c           | 12 ++++++------
 src/util.c             | 18 +++++++++++++++++-
 src/util.h             |  2 ++
 tests/unit/test_util.c | 24 +++++++++++++++++++++++-
 4 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/src/ludwig.c b/src/ludwig.c
index 05999cc41..5b60e4606 100644
--- a/src/ludwig.c
+++ b/src/ludwig.c
@@ -889,7 +889,7 @@ void ludwig_run(const char * inputfile) {
     }
 
     if (ludwig->phi) {
-      int output = (0 == (step % ludwig->phi->opts.iodata.iofreq));
+      int output = (0 == util_mod(step, ludwig->phi->opts.iodata.iofreq));
       if (output || is_config_step()) {
 	io_event_t event = {0};
 	pe_info(ludwig->pe, "Writing phi file at step %d!\n", step);
@@ -898,7 +898,7 @@ void ludwig_run(const char * inputfile) {
     }
 
     if (ludwig->p) {
-      int output = (0 == (step % ludwig->p->opts.iodata.iofreq));
+      int output = (0 == util_mod(step, ludwig->p->opts.iodata.iofreq));
       if (output || is_config_step()) {
 	io_event_t event = {0};
 	pe_info(ludwig->pe, "Writing p file at step %d!\n", step);
@@ -907,7 +907,7 @@ void ludwig_run(const char * inputfile) {
     }
 
     if (ludwig->q) {
-      int output = (0 == (step % ludwig->q->opts.iodata.iofreq));
+      int output = (0 == util_mod(step, ludwig->q->opts.iodata.iofreq));
       if (output || is_config_step()) {
 	io_event_t event = {0};
 	pe_info(ludwig->pe, "Writing q file at step %d!\n", step);
@@ -919,7 +919,7 @@ void ludwig_run(const char * inputfile) {
 
     if (ludwig->psi) {
       /* The potential and the charge densities (both controlled by "psi") */
-      int output = (0 == (step % ludwig->psi->psi->opts.iodata.iofreq));
+      int output = (0 == util_mod(step, ludwig->psi->psi->opts.iodata.iofreq));
       if (output || is_config_step()) {
 	pe_info(ludwig->pe, "Writing electrokinetic data at step %d!\n", step);
 	psi_io_write(ludwig->psi, step);
@@ -954,12 +954,12 @@ void ludwig_run(const char * inputfile) {
       }
       else {
 	/* Individual requests */
-	if (0 == (step % ludwig->hydro->rho->opts.iodata.iofreq)) {
+	if (0 == util_mod(step, ludwig->hydro->rho->opts.iodata.iofreq)) {
 	  io_event_t event = {0};
 	  pe_info(ludwig->pe, "Writing rho output at step %d!\n", step);
 	  field_io_write(ludwig->hydro->rho, step, &event);
 	}
-	if (0 == (step % ludwig->hydro->u->opts.iodata.iofreq)) {
+	if (0 == util_mod(step, ludwig->hydro->u->opts.iodata.iofreq)) {
 	  io_event_t event = {0};
 	  pe_info(ludwig->pe, "Writing velocity output at step %d!\n", step);
 	  field_io_write(ludwig->hydro->u, step, &event);
diff --git a/src/util.c b/src/util.c
index f5906fb24..3183163b9 100644
--- a/src/util.c
+++ b/src/util.c
@@ -10,7 +10,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2010-2024 The University of Edinburgh
+ *  (c) 2010-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -1067,3 +1067,19 @@ __host__ int util_rectangle_conductance(double w, double h, double * q) {
 
   return ierr;
 }
+
+/*****************************************************************************
+ *
+ *  util_mod
+ *
+ *  A standard remainder after division, except this allows the second
+ *  argument to be zero.
+ *
+ *  The return value if b = 0 is zero for any value of a.
+ *
+ *****************************************************************************/
+
+int util_mod(int a, int b) {
+
+  return (b == 0) ? 0 : a % b;
+}
diff --git a/src/util.h b/src/util.h
index 6dcaf2179..ff70f7f50 100644
--- a/src/util.h
+++ b/src/util.h
@@ -68,4 +68,6 @@ __host__ int util_ranlcg_reap_gaussian(int * state, double r[2]);
 __host__ int util_str_tolower(char * str, size_t maxlen);
 __host__ int util_rectangle_conductance(double h, double w, double * c);
 
+int util_mod(int a, int b);
+
 #endif
diff --git a/tests/unit/test_util.c b/tests/unit/test_util.c
index 5a4a9744e..c5e693b1f 100644
--- a/tests/unit/test_util.c
+++ b/tests/unit/test_util.c
@@ -5,7 +5,7 @@
  *  Edinburgh Soft Matter and Statistical Physics Group and
  *  Edinburgh Parallel Computing Centre
  *
- *  (c) 2012-2023 The University of Edinburgh
+ *  (c) 2012-2025 The University of Edinburgh
  *
  *  Contributing authors:
  *  Kevin Stratford (kevin@epcc.ed.ac.uk)
@@ -33,6 +33,7 @@ int util_jacobi_check(void);
 int util_dpythag_check(void);
 int util_str_tolower_check(void);
 int util_rectangle_conductance_check(void);
+int util_mod_check(void);
 
 /*****************************************************************************
  *
@@ -52,6 +53,7 @@ int test_util_suite(void) {
   util_dpythag_check();
   util_str_tolower_check();
   util_rectangle_conductance_check();
+  util_mod_check();
 
   pe_info(pe, "PASS     ./unit/test_util\n");
   pe_free(pe);
@@ -246,3 +248,23 @@ int util_rectangle_conductance_check(void) {
 
   return ierr;
 }
+
+/*****************************************************************************
+ *
+ *  util_mod_check
+ *
+ *****************************************************************************/
+
+int util_mod_check(void) {
+
+  int ifail = 0;
+
+  ifail = util_mod(2, 1);
+  assert(ifail == 0);
+  ifail = util_mod(3, 2);
+  assert(ifail == 1);
+  ifail = util_mod(3, 0);
+  assert(ifail == 0);
+
+  return ifail;
+}

From a39b85d908d04cc514b2c9daad8a2aca44cbeb6b Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Mon, 17 Feb 2025 18:34:58 +0000
Subject: [PATCH 128/133] Correction to return value

---
 src/util.c             | 4 ++--
 tests/unit/test_util.c | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/util.c b/src/util.c
index 3183163b9..d8f992db7 100644
--- a/src/util.c
+++ b/src/util.c
@@ -1075,11 +1075,11 @@ __host__ int util_rectangle_conductance(double w, double h, double * q) {
  *  A standard remainder after division, except this allows the second
  *  argument to be zero.
  *
- *  The return value if b = 0 is zero for any value of a.
+ *  The return value if b = 0 is a for any value of a.
  *
  *****************************************************************************/
 
 int util_mod(int a, int b) {
 
-  return (b == 0) ? 0 : a % b;
+  return (b == 0) ? a : a % b;
 }
diff --git a/tests/unit/test_util.c b/tests/unit/test_util.c
index c5e693b1f..98958faed 100644
--- a/tests/unit/test_util.c
+++ b/tests/unit/test_util.c
@@ -264,6 +264,8 @@ int util_mod_check(void) {
   ifail = util_mod(3, 2);
   assert(ifail == 1);
   ifail = util_mod(3, 0);
+  assert(ifail == 3);
+  ifail = util_mod(0, 0);
   assert(ifail == 0);
 
   return ifail;

From 0b3448c06152b43a29d20b13a7100539d8328923 Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@cirrus.epcc.ed.ac.uk>
Date: Fri, 21 Feb 2025 17:49:47 +0000
Subject: [PATCH 129/133] Correct comment

---
 src/ludwig.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ludwig.c b/src/ludwig.c
index fc23ed5c9..6a3c4bb71 100644
--- a/src/ludwig.c
+++ b/src/ludwig.c
@@ -2198,7 +2198,7 @@ int ludwig_colloids_update(ludwig_t * ludwig) {
     lb_halo(ludwig->lb);
   }
   else {
-    /* Pull data back, then full host halo swap */
+    /* Run the halo on the target, and copy back the data */
     lb_halo(ludwig->lb);
     lb_memcpy(ludwig->lb, tdpMemcpyDeviceToHost);
   }

From 0486c27c947bc8a747c3fb8c2c3678a0e1e48e26 Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@cirrus.epcc.ed.ac.uk>
Date: Fri, 21 Feb 2025 17:50:22 +0000
Subject: [PATCH 130/133] General configuration for GPU MPI

---
 src/pe.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/pe.c b/src/pe.c
index d5cef4d60..a7d7a8767 100644
--- a/src/pe.c
+++ b/src/pe.c
@@ -443,21 +443,16 @@ __host__ int pe_time(char * str, int bufsiz) {
  *
  *  have_gpu_aware_mpi_
  *
- *  This is awkward; it might belong elsewhere on its own.
+ *  This must be a configuration time option at the moment, as there
+ *  is no portable way to tell at run time.
  *
  *****************************************************************************/
 
-#ifdef HAVE_OPENMPI_
-/* This provides MPIX_CUDA_AWARE_SUPPORT .. */
-#include "mpi-ext.h"
-#endif
-
 int have_gpu_aware_mpi_(void) {
 
   int have_gpu_aware_mpi = 0;
 
-  /* OpenMPI */
-#if defined (MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
+#ifdef HAVE_GPU_AWARE_MPI
   have_gpu_aware_mpi = 1;
 #endif
 

From 16c0622d9525836ab8bda4fe519dfd8383dc944c Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@cirrus.epcc.ed.ac.uk>
Date: Fri, 21 Feb 2025 17:59:57 +0000
Subject: [PATCH 131/133] Remove old halo code

---
 src/halo_swap.c | 1345 -----------------------------------------------
 src/halo_swap.h |   44 --
 2 files changed, 1389 deletions(-)
 delete mode 100644 src/halo_swap.c
 delete mode 100644 src/halo_swap.h

diff --git a/src/halo_swap.c b/src/halo_swap.c
deleted file mode 100644
index a3bd2ac3a..000000000
--- a/src/halo_swap.c
+++ /dev/null
@@ -1,1345 +0,0 @@
-/*****************************************************************************
- *
- *  halo_swap.c
- *
- *  Lattice halo swap machinery.
- *
- *  Edinburgh Soft Matter and Statistical Physics Group and
- *  Edinburgh Parallel Computing Centre
- *
- *  (c) 2016-2024 The University of Edinburgh
- *
- *  Contributing authors:
- *  Alan Gray (alang@epcc.ed.ac.uk)
- *  Kevin Stratford (kevin@epcc.ed.ac.uk)
- *
- *****************************************************************************/
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "util.h"
-#include "halo_swap.h"
-
-typedef struct halo_swap_param_s halo_swap_param_t;
-
-struct halo_swap_s {
-  pe_t * pe;
-  cs_t * cs;
-  halo_swap_param_t * param;
-  double * fxlo;
-  double * fxhi;
-  double * fylo;
-  double * fyhi;
-  double * fzlo;
-  double * fzhi;
-  double * hxlo;
-  double * hxhi;
-  double * hylo;
-  double * hyhi;
-  double * hzlo;
-  double * hzhi;
-  f_pack_t data_pack;       /* Pack buffer kernel function */
-  f_unpack_t data_unpack;   /* Unpack buffer kernel function */
-  tdpStream_t stream[3];    /* Stream for each of X,Y,Z */
-  halo_swap_t * target;     /* Device memory */
-};
-
-/* Note nsite != naddr if extra memory has been allocated for LE
- * plane buffers. */
-
-struct halo_swap_param_s {
-  int nhalo;                /* cs_nhalo */
-  int nswap;                /* Width of actual halo swap <= nhalo */
-  int nsite;                /* Total nall[X]*nall[Y]*nall[Z] */
-  int na;                   /* Extent (rank 1 fields) */
-  int nb;                   /* Extent (rank2 fields) */
-  int naddr;                /* Extenet (nsite for address calculation) */
-  int nfel;                 /* Field elements per site (double) */
-  int nlocal[3];            /* local domain extent */
-  int nall[3];              /* ... including 2*cs_nhalo */
-  int hext[3][3];           /* halo extents ... see below */
-  int hsz[3];               /* halo size in lattice sites each direction */
-};
-
-static __constant__ halo_swap_param_t const_param;
-
-__host__ int halo_swap_create(pe_t * pe, cs_t * cs, int nhcomm, int naddr,
-			      int na, int nb, halo_swap_t ** phalo);
-__host__ __device__ void halo_swap_coords(halo_swap_t * halo, int id, int index, int * ic, int * jc, int * kc);
-__host__ __device__ int halo_swap_index(halo_swap_t * halo, int ic, int jc, int kc);
-__host__ __device__ int halo_swap_bufindex(halo_swap_t * halo, int id, int ic, int jc, int kc);
-
-/*****************************************************************************
- *
- *  halo_swap_create_r1
- *
- *  Rank1 addressable objects.
- *
- *****************************************************************************/
-
-__host__ int halo_swap_create_r1(pe_t * pe, cs_t * cs, int nhcomm, int naddr,
-				 int na,
-				 halo_swap_t ** p) {
-
-  return halo_swap_create(pe, cs, nhcomm, naddr, na, 1, p);
-}
-
-/*****************************************************************************
- *
- *  halo_swap_create_r2
- *
- *  Rank2 addressable objects
- *
- *****************************************************************************/
-
-__host__ int halo_swap_create_r2(pe_t * pe, cs_t *cs, int nhcomm, int naddr,
-				 int na, int nb,
-				 halo_swap_t ** p) {
-
-  return halo_swap_create(pe, cs, nhcomm, naddr, na, nb, p);
-}
-
-/*****************************************************************************
- *
- *  halo_swap_create
- *
- *****************************************************************************/
-
-__host__ int halo_swap_create(pe_t * pe, cs_t * cs, int nhcomm, int naddr,
-			      int na, int nb,
-			      halo_swap_t ** phalo) {
-
-  int nhalo;
-  int ndevice;
-  unsigned int mflag = tdpHostAllocDefault;
-
-  size_t sz;
-  halo_swap_t * halo = NULL;
-
-  assert(pe);
-  assert(cs);
-  assert(phalo);
-
-  halo = (halo_swap_t *) calloc(1, sizeof(halo_swap_t));
-  assert(halo);
-
-  halo->param = (halo_swap_param_t *) calloc(1, sizeof(halo_swap_param_t));
-  assert(halo->param);
-
-  /* Template for distributions, which is used to allocate buffers;
-   * assumed to be large enough for any halo transfer... */
-
-  halo->pe = pe;
-  halo->cs = cs;
-
-  cs_nhalo(cs, &nhalo);
-
-  halo->param->na = na;
-  halo->param->nb = nb;
-  halo->param->nhalo = nhalo;
-  halo->param->nswap = nhcomm;
-  halo->param->nfel = na*nb;
-  halo->param->naddr = naddr;
-  cs_nlocal(cs, halo->param->nlocal);
-  cs_nall(cs, halo->param->nall);
-
-  halo->param->nsite = halo->param->nall[X]*halo->param->nall[Y]*halo->param->nall[Z];
-
-  /* Halo extents:  hext[X] = {1, nall[Y], nall[Z]}
-                    hext[Y] = {nall[X], 1, nall[Z]}
-                    hext[Z] = {nall[X], nall[Y], 1} */
-
-  halo->param->hext[X][X] = halo->param->nswap;
-  halo->param->hext[X][Y] = halo->param->nall[Y];
-  halo->param->hext[X][Z] = halo->param->nall[Z];
-  halo->param->hext[Y][X] = halo->param->nall[X];
-  halo->param->hext[Y][Y] = halo->param->nswap;
-  halo->param->hext[Y][Z] = halo->param->nall[Z];
-  halo->param->hext[Z][X] = halo->param->nall[X];
-  halo->param->hext[Z][Y] = halo->param->nall[Y];
-  halo->param->hext[Z][Z] = halo->param->nswap;
-
-  halo->param->hsz[X] = nhcomm*halo->param->hext[X][Y]*halo->param->hext[X][Z];
-  halo->param->hsz[Y] = nhcomm*halo->param->hext[Y][X]*halo->param->hext[Y][Z];
-  halo->param->hsz[Z] = nhcomm*halo->param->hext[Z][X]*halo->param->hext[Z][Y];
-
-  /* Host buffers, actual and halo regions */
-
-  sz = (size_t) halo->param->hsz[X]*na*nb*sizeof(double);
-  tdpAssert( tdpHostAlloc((void **) &halo->fxlo, sz, mflag) );
-  tdpAssert( tdpHostAlloc((void **) &halo->fxhi, sz, mflag) );
-  tdpAssert( tdpHostAlloc((void **) &halo->hxlo, sz, mflag) );
-  tdpAssert( tdpHostAlloc((void **) &halo->hxhi, sz, mflag) );
-
-  sz = (size_t) halo->param->hsz[Y]*na*nb*sizeof(double);
-  tdpAssert( tdpHostAlloc((void **) &halo->fylo, sz, mflag) );
-  tdpAssert( tdpHostAlloc((void **) &halo->fyhi, sz, mflag) );
-  tdpAssert( tdpHostAlloc((void **) &halo->hylo, sz, mflag) );
-  tdpAssert( tdpHostAlloc((void **) &halo->hyhi, sz, mflag) );
-
-  sz = (size_t) halo->param->hsz[Z]*na*nb*sizeof(double);
-  tdpAssert( tdpHostAlloc((void **) &halo->fzlo, sz, mflag) );
-  tdpAssert( tdpHostAlloc((void **) &halo->fzhi, sz, mflag) );
-  tdpAssert( tdpHostAlloc((void **) &halo->hzlo, sz, mflag) );
-  tdpAssert( tdpHostAlloc((void **) &halo->hzhi, sz, mflag) );
-
-  tdpAssert( tdpStreamCreate(&halo->stream[X]) );
-  tdpAssert( tdpStreamCreate(&halo->stream[Y]) );
-  tdpAssert( tdpStreamCreate(&halo->stream[Z]) );
-
-  /* Device buffers: allocate or alias */
-
-  tdpAssert( tdpGetDeviceCount(&ndevice) );
-
-  if (ndevice == 0) {
-    halo->target = halo;
-  }
-  else {
-    double * tmp;
-    halo_swap_param_t * tmpp;
-
-    /* Target structure */
-    tdpAssert( tdpMalloc((void **) &halo->target, sizeof(halo_swap_t)) );
-    tdpAssert( tdpMemset(halo->target, 0, sizeof(halo_swap_t)) );
-
-    /* Buffers */
-    sz = (size_t) halo->param->hsz[X]*na*nb*sizeof(double);
-
-    tdpAssert( tdpMalloc((void **) &tmp, sz) );
-    tdpAssert( tdpMemcpy(&halo->target->fxlo, &tmp, sizeof(double *),
-			 tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMalloc((void **) &tmp, sz) );
-    tdpAssert( tdpMemcpy(&halo->target->fxhi, &tmp, sizeof(double *),
-			 tdpMemcpyHostToDevice) );
-
-    tdpAssert( tdpMalloc((void **) &tmp, sz) );
-    tdpAssert( tdpMemcpy(&halo->target->hxlo, &tmp, sizeof(double *),
-			 tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMalloc((void **) & tmp, sz) );
-    tdpAssert( tdpMemcpy(&halo->target->hxhi, &tmp, sizeof(double *),
-			 tdpMemcpyHostToDevice) );
-
-    sz = (size_t) halo->param->hsz[Y]*na*nb*sizeof(double);
-
-    tdpAssert( tdpMalloc((void ** ) &tmp, sz) );
-    tdpAssert( tdpMemcpy(&halo->target->fylo, &tmp, sizeof(double *),
-			 tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMalloc((void **) &tmp, sz) );
-    tdpAssert( tdpMemcpy(&halo->target->fyhi, &tmp, sizeof(double *),
-			 tdpMemcpyHostToDevice) );
-
-    tdpAssert( tdpMalloc((void **) &tmp, sz) );
-    tdpAssert( tdpMemcpy(&halo->target->hylo, &tmp, sizeof(double *),
-			 tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMalloc((void **) &tmp, sz) );
-    tdpAssert( tdpMemcpy(&halo->target->hyhi, &tmp, sizeof(double *),
-			 tdpMemcpyHostToDevice) );
-
-    sz = (size_t) halo->param->hsz[Z]*na*nb*sizeof(double);
-
-    tdpAssert( tdpMalloc((void **) &tmp, sz) );
-    tdpAssert( tdpMemcpy(&halo->target->fzlo, &tmp, sizeof(double *),
-			 tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMalloc((void **) &tmp, sz) );
-    tdpAssert( tdpMemcpy(&halo->target->fzhi, &tmp, sizeof(double *),
-			 tdpMemcpyHostToDevice) );
-
-    tdpAssert( tdpMalloc((void **) &tmp, sz) );
-    tdpAssert( tdpMemcpy(&halo->target->hzlo, &tmp, sizeof(double *),
-			 tdpMemcpyHostToDevice) );
-    tdpAssert( tdpMalloc((void **) &tmp, sz) );
-    tdpAssert( tdpMemcpy(&halo->target->hzhi, &tmp, sizeof(double *),
-			 tdpMemcpyHostToDevice) );
-
-    tdpGetSymbolAddress((void **) &tmpp, tdpSymbol(const_param));
-    tdpAssert( tdpMemcpy(&halo->target->param, &tmpp,
-			 sizeof(halo_swap_param_t *), tdpMemcpyHostToDevice) );
-
-    /* Device constants */
-    halo_swap_commit(halo);
-  }
-
-  *phalo = halo;
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  halo_swap_free
- *
- *****************************************************************************/
-
-__host__ int halo_swap_free(halo_swap_t * halo) {
-
-  int ndevice;
-
-  assert(halo);
-
-  tdpAssert( tdpGetDeviceCount(&ndevice) );
-
-  if (ndevice > 0) {
-    double * tmp;
-
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->fxlo, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpFree(tmp) );
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->fxhi, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpFree(tmp) );
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->fylo, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpFree(tmp) );
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->fyhi, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpFree(tmp) );
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->fzlo, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpFree(tmp) );
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->fzhi, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpFree(tmp) );
-
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->hxlo, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpFree(tmp) );
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->hxhi, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpFree(tmp) );
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->hylo, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpFree(tmp) );
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->hyhi, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpFree(tmp) );
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->hzlo, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpFree(tmp) );
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->hzhi, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpFree(tmp) );
-
-    tdpAssert( tdpFree(halo->target) );
-  }
-
-  tdpAssert( tdpFreeHost(halo->fxlo) );
-  tdpAssert( tdpFreeHost(halo->fxhi) );
-  tdpAssert( tdpFreeHost(halo->fylo) );
-  tdpAssert( tdpFreeHost(halo->fyhi) );
-  tdpAssert( tdpFreeHost(halo->fzlo) );
-  tdpAssert( tdpFreeHost(halo->fzhi) );
-
-  tdpAssert( tdpFreeHost(halo->hxlo) );
-  tdpAssert( tdpFreeHost(halo->hxhi) );
-  tdpAssert( tdpFreeHost(halo->hylo) );
-  tdpAssert( tdpFreeHost(halo->hyhi) );
-  tdpAssert( tdpFreeHost(halo->hzlo) );
-  tdpAssert( tdpFreeHost(halo->hzhi) );
-
-  tdpAssert( tdpStreamDestroy(halo->stream[X]) );
-  tdpAssert( tdpStreamDestroy(halo->stream[Y]) );
-  tdpAssert( tdpStreamDestroy(halo->stream[Z]) );
-
-  free(halo->param);
-  free(halo);
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  halo_swap_handlers_set
- *
- *****************************************************************************/
-
-__host__ int halo_swap_handlers_set(halo_swap_t * halo, f_pack_t pack,
-				    f_unpack_t unpack) {
-
-  assert(halo);
-
-  halo->data_pack = pack;
-  halo->data_unpack = unpack;
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  halo_swap_commit
- *
- *****************************************************************************/
-
-__host__ int halo_swap_commit(halo_swap_t * halo) {
-
-  assert(halo);
-
-  tdpMemcpyToSymbol(tdpSymbol(const_param), halo->param,
-		    sizeof(halo_swap_param_t), 0, tdpMemcpyHostToDevice);
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  halo_swap_host_rank1
- *
- *****************************************************************************/
-
-__host__ int halo_swap_host_rank1(halo_swap_t * halo, void * mbuf,
-				  MPI_Datatype mpidata) {
-
-  int ic, jc, kc;
-  int ia, index;
-  int nh;
-  int ireal, ihalo;
-  int icount, nsend;
-  int pforw, pback;
-  int nlocal[3];
-  int mpicartsz[3];
-
-  size_t sz = 0;
-  size_t nsz;
-
-  unsigned char * buf;
-  unsigned char * sendforw;
-  unsigned char * sendback;
-  unsigned char * recvforw;
-  unsigned char * recvback;
-
-  MPI_Comm comm;
-  MPI_Request req[4];
-  MPI_Status status[2];
-
-  const int tagf = 2015;
-  const int tagb = 2016;
-
-  halo_swap_param_t * hp;
-
-  assert(halo);
-  assert(mbuf);
-  assert(mpidata == MPI_CHAR || mpidata == MPI_DOUBLE);
-
-  buf = (unsigned char *) mbuf;
-
-  cs_cart_comm(halo->cs, &comm);
-  cs_cartsz(halo->cs, mpicartsz);
-
-  if (mpidata == MPI_CHAR) sz = sizeof(char);
-  if (mpidata == MPI_DOUBLE) sz = sizeof(double);
-
-  hp = halo->param;
-  cs_nlocal(halo->cs, nlocal);
-
-  /* X-direction */
-
-  nsend = hp->nswap*hp->na*nlocal[Y]*nlocal[Z];
-  nsz = (size_t) nsend*sz;
-
-  sendforw = (unsigned char *) malloc(nsz);
-  sendback = (unsigned char *) malloc(nsz);
-  recvforw = (unsigned char *) malloc(nsz);
-  recvback = (unsigned char *) malloc(nsz);
-  assert(sendforw && sendback);
-  assert(recvforw && recvback);
-  if (sendforw == NULL) pe_fatal(halo->pe, "malloc(sendforw) failed\n");
-  if (sendback == NULL) pe_fatal(halo->pe, "malloc(sendback) failed\n");
-  if (recvforw == NULL) pe_fatal(halo->pe, "malloc(recvforw) failed\n");
-  if (recvback == NULL) pe_fatal(halo->pe, "malloc(recvback) failed\n");
-
-  /* Load send buffers */
-
-  icount = 0;
-
-  for (nh = 0; nh < hp->nswap; nh++) {
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-	for (ia = 0; ia < hp->na; ia++) {
-	  /* Backward going... */
-	  index = cs_index(halo->cs, 1 + nh, jc, kc);
-	  ireal = addr_rank1(hp->naddr, hp->na, index, ia);
-	  memcpy(sendback + icount*sz, buf + ireal*sz, sz);
-	  /* ...and forward going. */
-	  index = cs_index(halo->cs, nlocal[X] - nh, jc, kc);
-	  ireal = addr_rank1(hp->naddr, hp->na, index, ia);
-	  memcpy(sendforw + icount*sz, buf + ireal*sz, sz);
-	  icount += 1;
-	}
-      }
-    }
-  }
-
-  assert(icount == nsend);
-
-  if (mpicartsz[X] == 1) {
-    memcpy(recvback, sendforw, nsz);
-    memcpy(recvforw, sendback, nsz);
-    req[2] = MPI_REQUEST_NULL;
-    req[3] = MPI_REQUEST_NULL;
-  }
-  else {
-    pforw = cs_cart_neighb(halo->cs, FORWARD, X);
-    pback = cs_cart_neighb(halo->cs, BACKWARD, X);
-    MPI_Irecv(recvforw, nsend, mpidata, pforw, tagb, comm, req);
-    MPI_Irecv(recvback, nsend, mpidata, pback, tagf, comm, req + 1);
-    MPI_Issend(sendback, nsend, mpidata, pback, tagb, comm, req + 2);
-    MPI_Issend(sendforw, nsend, mpidata, pforw, tagf, comm, req + 3);
-    /* Wait for receives */
-    MPI_Waitall(2, req, status);
-  }
-
-  /* Unload */
-
-  icount = 0;
-
-  for (nh = 0; nh < hp->nswap; nh++) {
-    for (jc = 1; jc <= nlocal[Y]; jc++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-	for (ia = 0; ia < hp->na; ia++) {
-	  index = cs_index(halo->cs, nlocal[X] + 1 + nh, jc, kc);
-	  ihalo = addr_rank1(hp->naddr, hp->na, index, ia);
-	  memcpy(buf + ihalo*sz, recvforw + icount*sz, sz);
-	  index = cs_index(halo->cs, 0 - nh, jc, kc);
-	  ihalo = addr_rank1(hp->naddr, hp->na, index, ia);
-	  memcpy(buf + ihalo*sz, recvback + icount*sz, sz);
-	  icount += 1;
-	}
-      }
-    }
-  }
-
-  assert(icount == nsend);
-
-  free(recvback);
-  free(recvforw);
-
-  MPI_Waitall(2, req + 2, status);
-
-  free(sendback);
-  free(sendforw);
-
-  /* Y direction */
-
-  nsend = hp->nswap*hp->na*(nlocal[X] + 2*hp->nswap)*nlocal[Z];
-  nsz = (size_t) nsend*sz;
-
-  sendforw = (unsigned char *) malloc(nsz);
-  sendback = (unsigned char *) malloc(nsz);
-  recvforw = (unsigned char *) malloc(nsz);
-  recvback = (unsigned char *) malloc(nsz);
-  assert(sendforw && sendback);
-  assert(recvforw && recvback);
-  if (sendforw == NULL) pe_fatal(halo->pe, "malloc(sendforw) failed\n");
-  if (sendback == NULL) pe_fatal(halo->pe, "malloc(sendback) failed\n");
-  if (recvforw == NULL) pe_fatal(halo->pe, "malloc(recvforw) failed\n");
-  if (recvback == NULL) pe_fatal(halo->pe, "malloc(recvback) failed\n");
-
-  /* Load buffers */
-
-  icount = 0;
-
-  for (nh = 0; nh < hp->nswap; nh++) {
-    for (ic = 1 - hp->nswap; ic <= nlocal[X] + hp->nswap; ic++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-	for (ia = 0; ia < hp->na; ia++) {
-	  index = cs_index(halo->cs, ic, 1 + nh, kc);
-	  ireal = addr_rank1(hp->naddr, hp->na, index, ia);
-	  memcpy(sendback + icount*sz, buf + ireal*sz, sz);
-	  index = cs_index(halo->cs, ic, nlocal[Y] - nh, kc);
-	  ireal = addr_rank1(hp->naddr, hp->na, index, ia);
-	  memcpy(sendforw + icount*sz, buf + ireal*sz, sz);
-	  icount += 1;
-	}
-      }
-    }
-  }
-
-  assert(icount == nsend);
-
-  if (mpicartsz[Y] == 1) {
-    memcpy(recvback, sendforw, nsz);
-    memcpy(recvforw, sendback, nsz);
-    req[2] = MPI_REQUEST_NULL;
-    req[3] = MPI_REQUEST_NULL;
-  }
-  else {
-    pforw = cs_cart_neighb(halo->cs, FORWARD, Y);
-    pback = cs_cart_neighb(halo->cs, BACKWARD, Y);
-    MPI_Irecv(recvforw, nsend, mpidata, pforw, tagb, comm, req);
-    MPI_Irecv(recvback, nsend, mpidata, pback, tagf, comm, req + 1);
-    MPI_Issend(sendback, nsend, mpidata, pback, tagb, comm, req + 2);
-    MPI_Issend(sendforw, nsend, mpidata, pforw, tagf, comm, req + 3);
-    /* Wait for receives */
-    MPI_Waitall(2, req, status);
-  }
-
-  /* Unload */
-
-  icount = 0;
-
-  for (nh = 0; nh < hp->nswap; nh++) {
-    for (ic = 1 - hp->nswap; ic <= nlocal[X] + hp->nswap; ic++) {
-      for (kc = 1; kc <= nlocal[Z]; kc++) {
-	for (ia = 0; ia < hp->na; ia++) {
-	  index = cs_index(halo->cs, ic, 0 - nh, kc);
-	  ihalo = addr_rank1(hp->naddr, hp->na, index, ia);
-	  memcpy(buf + ihalo*sz, recvback + icount*sz, sz);
-	  index = cs_index(halo->cs, ic, nlocal[Y] + 1 + nh, kc);
-	  ihalo = addr_rank1(hp->naddr, hp->na, index, ia);
-	  memcpy(buf + ihalo*sz, recvforw + icount*sz, sz);
-	  icount += 1;
-	}
-      }
-    }
-  }
-
-  assert(icount == nsend);
-
-  free(recvback);
-  free(recvforw);
-
-  MPI_Waitall(2, req + 2, status);
-
-  free(sendback);
-  free(sendforw);
-
-  /* Z direction */
-
-  nsend = hp->nswap*hp->na*(nlocal[X] + 2*hp->nswap)*(nlocal[Y] + 2*hp->nswap);
-  nsz = (size_t) nsend*sz;
-
-  sendforw = (unsigned char *) malloc(nsz);
-  sendback = (unsigned char *) malloc(nsz);
-  recvforw = (unsigned char *) malloc(nsz);
-  recvback = (unsigned char *) malloc(nsz);
-  assert(sendforw && sendback);
-  assert(recvforw && recvback);
-  if (sendforw == NULL) pe_fatal(halo->pe, "malloc(sendforw) failed\n");
-  if (sendback == NULL) pe_fatal(halo->pe, "malloc(sendback) failed\n");
-  if (recvforw == NULL) pe_fatal(halo->pe, "malloc(recvforw) failed\n");
-  if (recvback == NULL) pe_fatal(halo->pe, "malloc(recvback) failed\n");
-
-  /* Load */
-  /* Some adjustment in the load required for 2d systems (X-Y) */
-
-  icount = 0;
-
-  for (nh = 0; nh < hp->nswap; nh++) {
-    for (ic = 1 - hp->nswap; ic <= nlocal[X] + hp->nswap; ic++) {
-      for (jc = 1 - hp->nswap; jc <= nlocal[Y] + hp->nswap; jc++) {
-	for (ia = 0; ia < hp->na; ia++) {
-	  kc = imin(1 + nh, nlocal[Z]);
-	  index = cs_index(halo->cs, ic, jc, kc);
-	  ireal = addr_rank1(hp->naddr, hp->na, index, ia);
-	  memcpy(sendback + icount*sz, buf + ireal*sz, sz);
-	  kc = imax(nlocal[Z] - nh, 1);
-	  index = cs_index(halo->cs, ic, jc, kc);
-	  ireal = addr_rank1(hp->naddr, hp->na, index, ia);
-	  memcpy(sendforw + icount*sz, buf + ireal*sz, sz);
-	  icount += 1;
-	}
-      }
-    }
-  }
-
-  assert(icount == nsend);
-
-  if (mpicartsz[Z] == 1) {
-    memcpy(recvback, sendforw, nsz);
-    memcpy(recvforw, sendback, nsz);
-    req[2] = MPI_REQUEST_NULL;
-    req[3] = MPI_REQUEST_NULL;
-  }
-  else {
-    pforw = cs_cart_neighb(halo->cs, FORWARD, Z);
-    pback = cs_cart_neighb(halo->cs, BACKWARD, Z);
-    MPI_Irecv(recvforw, nsend, mpidata, pforw, tagb, comm, req);
-    MPI_Irecv(recvback, nsend, mpidata, pback, tagf, comm, req + 1);
-    MPI_Issend(sendback, nsend, mpidata, pback, tagb, comm, req + 2);
-    MPI_Issend(sendforw, nsend, mpidata, pforw, tagf, comm, req + 3);
-    /* Wait before unloading */
-    MPI_Waitall(2, req, status);
-  }
-
-  /* Unload */
-
-  icount = 0;
-
-  for (nh = 0; nh < hp->nswap; nh++) {
-    for (ic = 1 - hp->nswap; ic <= nlocal[X] + hp->nswap; ic++) {
-      for (jc = 1 - hp->nswap; jc <= nlocal[Y] + hp->nswap; jc++) {
-	for (ia = 0; ia < hp->na; ia++) {
-	  index = cs_index(halo->cs, ic, jc, 0 - nh);
-	  ihalo = addr_rank1(hp->naddr, hp->na, index, ia);
-	  memcpy(buf + ihalo*sz, recvback + icount*sz, sz);
-	  index = cs_index(halo->cs, ic, jc, nlocal[Z] + 1 + nh);
-	  ihalo = addr_rank1(hp->naddr, hp->na, index, ia);
-	  memcpy(buf + ihalo*sz, recvforw + icount*sz, sz);
-	  icount += 1;
-	}
-      }
-    }
-  }
-
-  assert(icount == nsend);
-
-  free(recvback);
-  free(recvforw);
-
-  MPI_Waitall(2, req + 2, status);
-
-  free(sendback);
-  free(sendforw);
-
-  return 0;
-}
-
-
-/*****************************************************************************
- *
- *  halo_swap_packed
- *
- *  Version allowing for host/device copies; data must be packed
- *  and unpacked to/from device via appropriate kernels.
- *
- *  "data" must be a device pointer
- *
- *****************************************************************************/
-
-__host__ int halo_swap_packed(halo_swap_t * halo, double * data) {
-
-  int ncount;
-  int ndevice;
-  int ic, jc, kc;
-  int ih, jh, kh;
-  int ixlo, ixhi;
-  int iylo, iyhi;
-  int izlo, izhi;
-  int m, mc, p;
-  int nd, nh;
-  int hsz[3];
-  int mpicartsz[3];
-  dim3 nblk, ntpb;
-  double * tmp;
-
-  MPI_Comm comm;
-  MPI_Request req_x[4];
-  MPI_Request req_y[4];
-  MPI_Request req_z[4];
-  MPI_Status  status[4];
-
-  const int btagx = 639, btagy = 640, btagz = 641;
-  const int ftagx = 642, ftagy = 643, ftagz = 644;
-
-  assert(halo);
-
-  /* 2D systems require fix... in the meantime...*/
-  assert(halo->param->nlocal[Z] >= halo->param->nswap);
-
-  tdpAssert( tdpGetDeviceCount(&ndevice) );
-  halo_swap_commit(halo);
-
-  cs_cart_comm(halo->cs, &comm);
-  cs_cartsz(halo->cs, mpicartsz);
-
-  /* hsz[] is just shorthand for local halo sizes */
-  /* An offset nd is required if nswap < nhalo */
-
-  hsz[X] = halo->param->hsz[X];
-  hsz[Y] = halo->param->hsz[Y];
-  hsz[Z] = halo->param->hsz[Z];
-  nh = halo->param->nhalo;
-  nd = nh - halo->param->nswap;
-
-  /* POST ALL RELEVANT Irecv() ahead of time */
-
-  for (p = 0; p < 4; p++) {
-    req_x[p] = MPI_REQUEST_NULL;
-    req_y[p] = MPI_REQUEST_NULL;
-    req_z[p] = MPI_REQUEST_NULL;
-  }
-
-  if (mpicartsz[X] > 1) {
-    ncount = halo->param->hsz[X]*halo->param->nfel;
-    MPI_Irecv(halo->hxlo, ncount, MPI_DOUBLE,
-	      cs_cart_neighb(halo->cs,BACKWARD,X), ftagx, comm, req_x);
-    MPI_Irecv(halo->hxhi, ncount, MPI_DOUBLE,
-	      cs_cart_neighb(halo->cs,FORWARD,X), btagx, comm, req_x + 1);
-  }
-
-  if (mpicartsz[Y] > 1) {
-    ncount = halo->param->hsz[Y]*halo->param->nfel;
-    MPI_Irecv(halo->hylo, ncount, MPI_DOUBLE,
-	      cs_cart_neighb(halo->cs,BACKWARD,Y), ftagy, comm, req_y);
-    MPI_Irecv(halo->hyhi, ncount, MPI_DOUBLE,
-	      cs_cart_neighb(halo->cs,FORWARD,Y), btagy, comm, req_y + 1);
-  }
-
-  if (mpicartsz[Z] > 1) {
-    ncount = halo->param->hsz[Z]*halo->param->nfel;
-    MPI_Irecv(halo->hzlo, ncount, MPI_DOUBLE,
-	      cs_cart_neighb(halo->cs,BACKWARD,Z), ftagz, comm, req_z);
-    MPI_Irecv(halo->hzhi, ncount, MPI_DOUBLE,
-	      cs_cart_neighb(halo->cs,FORWARD,Z), btagz, comm, req_z + 1);
-  }
-
-  /* pack X edges on accelerator */
-
-  kernel_launch_param(hsz[X], &nblk, &ntpb);
-  tdpLaunchKernel(halo->data_pack, nblk, ntpb, 0, halo->stream[X],
-		  halo->target, X, data);
-
-  if (ndevice > 0) {
-    ncount = hsz[X]*halo->param->nfel;
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->fxlo, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpMemcpyAsync(halo->fxlo, tmp, ncount*sizeof(double),
-			      tdpMemcpyDeviceToHost, halo->stream[X]) );
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->fxhi, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpMemcpyAsync(halo->fxhi, tmp, ncount*sizeof(double),
-			      tdpMemcpyDeviceToHost, halo->stream[X]) );
-  }
-
-  /* pack Y edges on accelerator */
-
-  kernel_launch_param(hsz[Y], &nblk, &ntpb);
-  tdpLaunchKernel(halo->data_pack, nblk, ntpb, 0, halo->stream[Y],
-		  halo->target, Y, data);
-
-  if (ndevice > 0) {
-    ncount = hsz[Y]*halo->param->nfel;
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->fylo, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpMemcpyAsync(halo->fylo, tmp, ncount*sizeof(double),
-			      tdpMemcpyDeviceToHost, halo->stream[Y]) );
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->fyhi, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpMemcpyAsync(halo->fyhi, tmp, ncount*sizeof(double),
-			      tdpMemcpyDeviceToHost, halo->stream[Y]) );
-  }
-
-  /* pack Z edges on accelerator */
-
-  kernel_launch_param(hsz[Z], &nblk, &ntpb);
-  tdpLaunchKernel(halo->data_pack, nblk, ntpb, 0, halo->stream[Z],
-		  halo->target, Z, data);
-
-  if (ndevice > 0) {
-    ncount = hsz[Z]*halo->param->nfel;
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->fzlo, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpMemcpyAsync(halo->fzlo, tmp, ncount*sizeof(double),
-			      tdpMemcpyDeviceToHost, halo->stream[Z]) );
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->fzhi, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpMemcpyAsync(halo->fzhi, tmp, ncount*sizeof(double),
-			      tdpMemcpyDeviceToHost, halo->stream[Z]) );
-  }
-
-
-  /* Wait for X; copy or MPI recvs; put X halos back on device, and unpack */
-
-  tdpAssert( tdpStreamSynchronize(halo->stream[X]) );
-  ncount = hsz[X]*halo->param->nfel;
-
-  if (mpicartsz[X] == 1) {
-    /* note these copies do not alias for ndevice == 1 */
-    /* fxhi -> hxlo */
-    memcpy(halo->hxlo, halo->fxhi, ncount*sizeof(double));
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->hxlo, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpMemcpyAsync(tmp, halo->fxhi, ncount*sizeof(double),
-			      tdpMemcpyHostToDevice, halo->stream[X]) );
-    /* fxlo -> hxhi */
-    memcpy(halo->hxhi, halo->fxlo, ncount*sizeof(double));
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->hxhi, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpMemcpyAsync(tmp, halo->fxlo, ncount*sizeof(double),
-			      tdpMemcpyHostToDevice, halo->stream[X]) );
-  }
-  else {
-    MPI_Isend(halo->fxhi, ncount, MPI_DOUBLE,
-	      cs_cart_neighb(halo->cs,FORWARD,X), ftagx, comm, req_x + 2);
-    MPI_Isend(halo->fxlo, ncount, MPI_DOUBLE,
-	      cs_cart_neighb(halo->cs,BACKWARD,X), btagx, comm, req_x + 3);
-
-    for (m = 0; m < 4; m++) {
-      MPI_Waitany(4, req_x, &mc, status);
-      if (mc == 0 && ndevice > 0) {
-	tdpAssert( tdpMemcpy(&tmp, &halo->target->hxlo, sizeof(double *),
-			     tdpMemcpyDeviceToHost) );
-	tdpAssert( tdpMemcpyAsync(tmp, halo->hxlo, ncount*sizeof(double),
-				  tdpMemcpyHostToDevice, halo->stream[X]) );
-      }
-      if (mc == 1 && ndevice > 0) {
-	tdpAssert( tdpMemcpy(&tmp, &halo->target->hxhi, sizeof(double *),
-			     tdpMemcpyDeviceToHost) );
-	tdpAssert( tdpMemcpyAsync(tmp, halo->hxhi, ncount*sizeof(double),
-				  tdpMemcpyHostToDevice, halo->stream[X]) );
-      }
-    }
-  }
-
-  kernel_launch_param(hsz[X], &nblk, &ntpb);
-  tdpLaunchKernel(halo->data_unpack, nblk, ntpb, 0, halo->stream[X],
-		  halo->target, X, data);
-
-  /* Now wait for Y data to arrive from device */
-  /* Fill in 4 corners of Y edge data from X halo */
-
-  tdpAssert( tdpStreamSynchronize(halo->stream[Y]) );
-
-  ih = halo->param->hext[Y][X] - nh;
-  jh = halo->param->hext[X][Y] - nh - halo->param->nswap;
-
-  for (ic = 0; ic < halo->param->nswap; ic++) {
-    for (jc = 0; jc < halo->param->nswap; jc++) {
-      for (kc = 0; kc < halo->param->nall[Z]; kc++) {
-
-	/* This looks a bit odd, but iylo and ixhi relate to Y halo,
-	 * and ixlo and iyhi relate to X halo buffers */
-        ixlo = halo_swap_bufindex(halo, X,      ic, nh + jc, kc);
-        iylo = halo_swap_bufindex(halo, Y, nd + ic,      jc, kc);
-        ixhi = halo_swap_bufindex(halo, Y, ih + ic,      jc, kc);
-        iyhi = halo_swap_bufindex(halo, X, ic,      jh + jc, kc);
-
-        for (p = 0; p < halo->param->nfel; p++) {
-          halo->fylo[hsz[Y]*p + iylo] = halo->hxlo[hsz[X]*p + ixlo];
-          halo->fyhi[hsz[Y]*p + iylo] = halo->hxlo[hsz[X]*p + iyhi];
-          halo->fylo[hsz[Y]*p + ixhi] = halo->hxhi[hsz[X]*p + ixlo];
-          halo->fyhi[hsz[Y]*p + ixhi] = halo->hxhi[hsz[X]*p + iyhi];
-        }
-      }
-    }
-  }
-
-  /* Swap in Y, send data back to device and unpack */
-
-  ncount = halo->param->hsz[Y]*halo->param->nfel;
-
-  if (mpicartsz[Y] == 1) {
-    /* fyhi -> hylo */
-    memcpy(halo->hylo, halo->fyhi, ncount*sizeof(double));
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->hylo, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpMemcpyAsync(tmp, halo->fyhi, ncount*sizeof(double),
-			      tdpMemcpyHostToDevice, halo->stream[Y]) );
-    /* fylo -> hyhi */
-    memcpy(halo->hyhi, halo->fylo, ncount*sizeof(double));
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->hyhi, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpMemcpyAsync(tmp, halo->fylo,ncount*sizeof(double),
-			      tdpMemcpyHostToDevice, halo->stream[Y]) );
-  }
-  else {
-    MPI_Isend(halo->fyhi, ncount, MPI_DOUBLE,
-	      cs_cart_neighb(halo->cs, FORWARD,Y), ftagy, comm, req_y + 2);
-    MPI_Isend(halo->fylo, ncount, MPI_DOUBLE,
-	      cs_cart_neighb(halo->cs, BACKWARD,Y), btagy, comm, req_y + 3);
-
-    for (m = 0; m < 4; m++) {
-      MPI_Waitany(4, req_y, &mc, status);
-      if (mc == 0 && ndevice > 0) {
-	tdpAssert( tdpMemcpy(&tmp, &halo->target->hylo, sizeof(double *),
-			     tdpMemcpyDeviceToHost) );
-	tdpAssert( tdpMemcpyAsync(tmp, halo->hylo, ncount*sizeof(double),
-				  tdpMemcpyHostToDevice, halo->stream[Y]) );
-      }
-      if (mc == 1 && ndevice > 0) {
-	tdpAssert( tdpMemcpy(&tmp, &halo->target->hyhi, sizeof(double *),
-			     tdpMemcpyDeviceToHost) );
-	tdpAssert( tdpMemcpyAsync(tmp, halo->hyhi, ncount*sizeof(double),
-				  tdpMemcpyHostToDevice, halo->stream[Y]) );
-      }
-    }
-  }
-
-
-  kernel_launch_param(hsz[Y], &nblk, &ntpb);
-  tdpLaunchKernel(halo->data_unpack, nblk, ntpb, 0, halo->stream[Y],
-		  halo->target, Y, data);
-
-  /* Wait for Z data from device */
-  /* Fill in 4 corners of Z edge data from X halo  */
-
-  tdpAssert( tdpStreamSynchronize(halo->stream[Z]) );
-
-  ih = halo->param->hext[Z][X] - nh;
-  kh = halo->param->hext[X][Z] - nh - halo->param->nswap;
-
-  for (ic = 0; ic < halo->param->nswap; ic++) {
-    for (jc = 0; jc < halo->param->nall[Y]; jc++) {
-      for (kc = 0; kc < halo->param->nswap; kc++) {
-
-        ixlo = halo_swap_bufindex(halo, X,      ic, jc, nh + kc);
-        izlo = halo_swap_bufindex(halo, Z, nd + ic, jc,      kc);
-        ixhi = halo_swap_bufindex(halo, X,      ic, jc, kh + kc);
-	izhi = halo_swap_bufindex(halo, Z, ih + ic, jc,      kc);
-
-        for (p = 0; p < halo->param->nfel; p++) {
-          halo->fzlo[hsz[Z]*p + izlo] = halo->hxlo[hsz[X]*p + ixlo];
-          halo->fzhi[hsz[Z]*p + izlo] = halo->hxlo[hsz[X]*p + ixhi];
-          halo->fzlo[hsz[Z]*p + izhi] = halo->hxhi[hsz[X]*p + ixlo];
-          halo->fzhi[hsz[Z]*p + izhi] = halo->hxhi[hsz[X]*p + ixhi];
-        }
-      }
-    }
-  }
-
-  /* Fill in 4 strips in X of Z edge data: from Y halo  */
-
-  jh = halo->param->hext[Z][Y] - nh;
-  kh = halo->param->hext[Y][Z] - nh - halo->param->nswap;
-
-  for (ic = 0; ic < halo->param->nall[X]; ic++) {
-    for (jc = 0; jc < halo->param->nswap; jc++) {
-      for (kc = 0; kc < halo->param->nswap; kc++) {
-
-        iylo = halo_swap_bufindex(halo, Y, ic,      jc, nh + kc);
-        izlo = halo_swap_bufindex(halo, Z, ic, nd + jc,      kc);
-        iyhi = halo_swap_bufindex(halo, Y, ic,      jc, kh + kc);
-        izhi = halo_swap_bufindex(halo, Z, ic, jh + jc,      kc);
-
-        for (p = 0; p < halo->param->nfel; p++) {
-          halo->fzlo[hsz[Z]*p + izlo] = halo->hylo[hsz[Y]*p + iylo];
-          halo->fzhi[hsz[Z]*p + izlo] = halo->hylo[hsz[Y]*p + iyhi];
-          halo->fzlo[hsz[Z]*p + izhi] = halo->hyhi[hsz[Y]*p + iylo];
-          halo->fzhi[hsz[Z]*p + izhi] = halo->hyhi[hsz[Y]*p + iyhi];
-        }
-      }
-    }
-  }
-
-
-  /* The z-direction swap  */
-
-  ncount = halo->param->hsz[Z]*halo->param->nfel;
-
-  if (mpicartsz[Z] == 1) {
-    /* fzhi -> hzlo */
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->hzlo, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpMemcpyAsync(tmp, halo->fzhi, ncount*sizeof(double),
-			      tdpMemcpyHostToDevice, halo->stream[Z]) );
-    /* fzlo -> hzhi */
-    tdpAssert( tdpMemcpy(&tmp, &halo->target->hzhi, sizeof(double *),
-			 tdpMemcpyDeviceToHost) );
-    tdpAssert( tdpMemcpyAsync(tmp, halo->fzlo, ncount*sizeof(double),
-			      tdpMemcpyHostToDevice, halo->stream[Z]) );
-  }
-  else {
-    MPI_Isend(halo->fzhi, ncount, MPI_DOUBLE,
-	      cs_cart_neighb(halo->cs,FORWARD,Z), ftagz, comm, req_z + 2);
-    MPI_Isend(halo->fzlo,  ncount, MPI_DOUBLE,
-	      cs_cart_neighb(halo->cs,BACKWARD,Z), btagz, comm, req_z + 3);
-
-    for (m = 0; m < 4; m++) {
-      MPI_Waitany(4, req_z, &mc, status);
-      if (mc == 0 && ndevice > 0) {
-	tdpAssert( tdpMemcpy(&tmp, &halo->target->hzlo, sizeof(double *),
-			     tdpMemcpyDeviceToHost) );
-	tdpAssert( tdpMemcpyAsync(tmp, halo->hzlo, ncount*sizeof(double),
-				  tdpMemcpyHostToDevice, halo->stream[Z]) );
-      }
-      if (mc == 1 && ndevice > 0) {
-	tdpAssert( tdpMemcpy(&tmp, &halo->target->hzhi, sizeof(double *),
-			     tdpMemcpyDeviceToHost) );
-	tdpAssert( tdpMemcpyAsync(tmp, halo->hzhi, ncount*sizeof(double),
-				  tdpMemcpyHostToDevice, halo->stream[Z]) );
-      }
-    }
-  }
-
-  kernel_launch_param(hsz[Z], &nblk, &ntpb);
-  tdpLaunchKernel(halo->data_unpack, nblk, ntpb, 0, halo->stream[Z],
-		  halo->target, Z, data);
-
-  tdpAssert( tdpStreamSynchronize(halo->stream[X]) );
-  tdpAssert( tdpStreamSynchronize(halo->stream[Y]) );
-  tdpAssert( tdpStreamSynchronize(halo->stream[Z]) );
-
-  return 0;
-}
-
-/*****************************************************************************
- *
- *  halo_swap_pack_rank1
- *
- *  Move data to halo buffer on device for coordinate
- *  direction id at both low and high ends.
- *
- *****************************************************************************/
-
-__global__
-void halo_swap_pack_rank1(halo_swap_t * halo, int id, double * data) {
-
-  int kindex;
-
-  assert(halo);
-  assert(id == X || id == Y || id == Z);
-  assert(data);
-
-  for_simt_parallel(kindex, halo->param->hsz[id], 1) {
-
-    int nh;
-    int hsz;
-    int ia, indexl, indexh, ic, jc, kc;
-    int hi; /* high end offset */
-    double * __restrict__ buflo = NULL;
-    double * __restrict__ bufhi = NULL;
-    halo_swap_param_t * hp;
-
-    hp = halo->param;
-    hsz = halo->param->hsz[id];
-
-    /* Load two buffers for this site */
-    /* Use full nhalo to address full data */
-
-    nh = halo->param->nhalo;
-    halo_swap_coords(halo, id, kindex, &ic, &jc, &kc);
-
-    indexl = 0;
-    indexh = 0;
-
-    if (id == X) {
-      hi = nh + hp->nlocal[X] - hp->nswap;
-      indexl = halo_swap_index(halo, hp->nhalo + ic, jc, kc);
-      indexh = halo_swap_index(halo, hi + ic, jc, kc);
-      buflo = halo->fxlo;
-      bufhi = halo->fxhi;
-    }
-    if (id == Y) {
-      hi = nh + hp->nlocal[Y] - hp->nswap;
-      indexl = halo_swap_index(halo, ic, nh + jc, kc);
-      indexh = halo_swap_index(halo, ic, hi + jc, kc);
-      buflo = halo->fylo;
-      bufhi = halo->fyhi;
-    }
-    if (id == Z) {
-      hi = nh + hp->nlocal[Z] - hp->nswap;
-      indexl = halo_swap_index(halo, ic, jc, nh + kc);
-      indexh = halo_swap_index(halo, ic, jc, hi + kc);
-      buflo = halo->fzlo;
-      bufhi = halo->fzhi;
-    }
-
-    if (halo->param->nb == 1) {
-
-      /* Rank 1 */
-
-      /* Low end, and high end */
-
-      for (ia = 0; ia < hp->na; ia++) {
-	buflo[hsz*ia + kindex] = data[addr_rank1(hp->naddr, hp->na, indexl, ia)];
-      }
-
-      for (ia = 0; ia < hp->na; ia++) {
-	bufhi[hsz*ia + kindex] = data[addr_rank1(hp->naddr, hp->na, indexh, ia)];
-      }
-    }
-    else {
-      int ib, nel;
-
-      nel = 0;
-      for (ia = 0; ia < hp->na; ia++) {
-	for (ib = 0; ib < hp->nb; ib++) {
-	  buflo[hsz*nel + kindex] =
-	    data[addr_rank2(hp->naddr, hp->na, hp->nb, indexl, ia, ib)];
-	  nel += 1;
-	}
-      }
-
-      nel = 0;
-      for (ia = 0; ia < hp->na; ia++) {
-	for (ib = 0; ib < hp->nb; ib++) {
-	  bufhi[hsz*nel + kindex] =
-	    data[addr_rank2(hp->naddr, hp->na, hp->nb, indexh, ia, ib)];
-	  nel += 1;
-	}
-      }
-
-    }
-  }
-
-  return;
-}
-
-/*****************************************************************************
- *
- *  halo_swap_unpack_rank1
- *
- *  Unpack halo buffers to the distribution on device for direction id.
- *
- *****************************************************************************/
-
-__global__
-void halo_swap_unpack_rank1(halo_swap_t * halo, int id, double * data) {
-
-  int kindex;
-
-  assert(halo);
-  assert(id == X || id == Y || id == Z);
-  assert(data);
-
-  /* Unpack buffer this site. */
-
-  for_simt_parallel(kindex, halo->param->hsz[id], 1) {
-
-    int hsz;
-    int ia, indexl, indexh;
-    int nh;                          /* Full halo width */
-    int ic, jc, kc;                  /* Lattice ooords */
-    int lo, hi;                      /* Offset for low, high end */
-    double * __restrict__ buflo = NULL;
-    double * __restrict__ bufhi = NULL;
-    halo_swap_param_t * hp;
-
-    hp = halo->param;
-    hsz = halo->param->hsz[id];
-
-    nh = halo->param->nhalo;
-    halo_swap_coords(halo, id, kindex, &ic, &jc, &kc);
-
-    indexl = 0;
-    indexh = 0;
-
-    if (id == X) {
-      lo = nh - hp->nswap;
-      hi = nh + hp->nlocal[X];
-      indexl = halo_swap_index(halo, lo + ic, jc, kc);
-      indexh = halo_swap_index(halo, hi + ic, jc, kc);
-      buflo = halo->hxlo;
-      bufhi = halo->hxhi;
-    }
-
-    if (id == Y) {
-      lo = nh - hp->nswap;
-      hi = nh + hp->nlocal[Y];
-      indexl = halo_swap_index(halo, ic, lo + jc, kc);
-      indexh = halo_swap_index(halo, ic, hi + jc, kc);
-      buflo = halo->hylo;
-      bufhi = halo->hyhi;
-    }
-
-    if (id == Z) {
-      lo = nh - hp->nswap;
-      hi = nh + hp->nlocal[Z];
-      indexl = halo_swap_index(halo, ic, jc, lo + kc);
-      indexh = halo_swap_index(halo, ic, jc, hi + kc);
-      buflo = halo->hzlo;
-      bufhi = halo->hzhi;
-    }
-
-
-    if (halo->param->nb == 1) {
-
-      /* Rank 1 */
-      /* Low end, then high end */
-
-      for (ia = 0; ia < hp->na; ia++) {
-	data[addr_rank1(hp->naddr, hp->na, indexl, ia)] = buflo[hsz*ia + kindex];
-      }
-
-      for (ia = 0; ia < hp->na; ia++) {
-	data[addr_rank1(hp->naddr, hp->na, indexh, ia)] = bufhi[hsz*ia + kindex];
-      }
-
-    }
-    else {
-      int ib, nel;
-
-      nel = 0;
-      for (ia = 0; ia < hp->na; ia++) {
-	for (ib = 0; ib < hp->nb; ib++) {
-	  data[addr_rank2(hp->naddr, hp->na, hp->nb, indexl, ia, ib)] =
-	    buflo[hsz*nel + kindex];
-	  nel += 1;
-	}
-      }
-
-      nel = 0;
-      for (ia = 0; ia < hp->na; ia++) {
-	for (ib = 0; ib < hp->nb; ib++) {
-	  data[addr_rank2(hp->naddr, hp->na, hp->nb, indexh, ia, ib)] =
-	    bufhi[hsz*nel + kindex];
-	  nel += 1;
-	}
-      }
-
-    }
-  }
-
-  return;
-}
-
-/*****************************************************************************
- *
- *  halo_swap_coords
- *
- *  For given kernel index, work out where we are in (ic, jc, kc)
- *  relative to buffer region for direction id.
- *
- *****************************************************************************/
-
-__host__ __device__
-void halo_swap_coords(halo_swap_t * halo, int id, int index,
-		      int * ic, int * jc, int * kc) {
-  int xstr;
-  int ystr;
-
-  assert(halo);
-
-  ystr = halo->param->hext[id][Z];
-  xstr = ystr*halo->param->hext[id][Y];
-
-  *ic = index/xstr;
-  *jc = (index - *ic*xstr)/ystr;
-  *kc = index - *ic*xstr - *jc*ystr;
-
-  return;
-}
-
-/*****************************************************************************
- *
- *  halo_swap_index
- *
- *  A special case of cs_index().
- *
- *****************************************************************************/
-
-__host__ __device__
-int halo_swap_index(halo_swap_t * halo, int ic, int jc, int kc) {
-
-  int xstr;
-  int ystr;
-
-  assert(halo);
-
-  ystr = halo->param->nall[Z];
-  xstr = ystr*halo->param->nall[Y];
-
-  return (ic*xstr + jc*ystr + kc);
-}
-
-/*****************************************************************************
- *
- *  halo_swap_bufindex
- *
- *  Computes index for buffer direction id
- *
- *****************************************************************************/
-
-__host__ __device__
-int halo_swap_bufindex(halo_swap_t * halo, int id, int ic, int jc, int kc) {
-
-  int xstr;
-  int ystr;
-
-  assert(halo);
-
-  ystr = halo->param->hext[id][Z];
-  xstr = ystr*halo->param->hext[id][Y];
-
-  return (ic*xstr + jc*ystr + kc);
-}
diff --git a/src/halo_swap.h b/src/halo_swap.h
deleted file mode 100644
index 6441d9998..000000000
--- a/src/halo_swap.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*****************************************************************************
- *
- *  halo_swap.h
- *
- *  Edinburgh Soft Matter and Statistical Physics Group and
- *  Edinburgh Parallel Computing Centre
- *
- *  (c) 2016 The University of Edinburgh
- *
- *  Contributing authors:
- *  Kevin Stratford (kevin@epcc.ed.ac.uk)
- *  Alan Gray (kevin@epcc.ed.ac.uk)
- *
- *****************************************************************************/
-
-#ifndef LUDWIG_HALO_SWAP_H
-#define LUDWIG_HALO_SWAP_H
-
-#include "pe.h"
-#include "coords.h"
-#include "kernel.h"
-
-typedef struct halo_swap_s halo_swap_t;
-
-/* Could be void * data with MPI_Datatype if more general case required */
-
-typedef void (*f_pack_t)(halo_swap_t * halo, int id, double * data);
-typedef void (*f_unpack_t)(halo_swap_t * halo, int id, double * data);
-
-__host__ int halo_swap_create_r1(pe_t * pe, cs_t * cs, int nhcomm, int naddr,
-				 int na, halo_swap_t ** phalo);
-__host__ int halo_swap_create_r2(pe_t * pe, cs_t * cs, int nhcomm, int naddr,
-				 int na, int nb, halo_swap_t ** phalo);
-__host__ int halo_swap_free(halo_swap_t * halo);
-__host__ int halo_swap_commit(halo_swap_t * halo);
-__host__ int halo_swap_handlers_set(halo_swap_t * halo, f_pack_t pack, f_unpack_t unpack);
-__host__ int halo_swap_host_rank1(halo_swap_t * halo, void * mbuf,
-				  MPI_Datatype mpidata);
-__host__ int halo_swap_packed(halo_swap_t * halo, double * data);
-
-__global__ void halo_swap_pack_rank1(halo_swap_t * halo, int id, double * data);
-__global__ void halo_swap_unpack_rank1(halo_swap_t * halo, int id, double * data);
-
-#endif

From 46b0e9922cb6082757eeea68e58dc61c799ca7cb Mon Sep 17 00:00:00 2001
From: Kevin Stratford <kevin@cirrus.epcc.ed.ac.uk>
Date: Fri, 21 Feb 2025 18:00:28 +0000
Subject: [PATCH 132/133] Repair GPU version

---
 tests/unit/test_map.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/unit/test_map.c b/tests/unit/test_map.c
index 58a48f023..fe84c9dd2 100644
--- a/tests/unit/test_map.c
+++ b/tests/unit/test_map.c
@@ -868,6 +868,9 @@ static int util_map_data_check_set(map_t * map) {
     }
   }
 
+  /* Get the device up-to-date; required for e.g. io_write operations */
+  map_memcpy(map, tdpMemcpyHostToDevice);
+
   return 0;
 }
 

From 05c2d2565a5278553a5ffcd2075fa8b0dae97b38 Mon Sep 17 00:00:00 2001
From: Kevin <kevin@epcc.ed.ac.uk>
Date: Thu, 27 Feb 2025 18:27:52 +0000
Subject: [PATCH 133/133] Release 0.23.0

---
 CHANGES.md | 18 ++++++++++++++++++
 version.h  |  4 ++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 616e36be0..a7a766b93 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,6 +1,24 @@
 
 ### Changes
 
+version 0.23.0
+
+- Action required. Changes to input file keys related to frequency of i/o.
+  Please see https://ludwig.epcc.ed.ac.uk/outputs/fluid.html
+  for a list of new keys to specify output frequency. These
+  provide a more flexible and consistent framework for
+  specifying output for all lattice quantities.
+  Old forms will be trapped at run time with a request to
+  update your input file. [Issue 350]
+
+- A number of GPU features have been implemented including the
+  lattice distribution halo swap and the Lees Edwards plane
+  transformation. [Issues 311, 316, 343]
+
+- Bug fix [Issue 342] Colloid wettability (free energy) parameters
+  were being erroneously replaced in the gradient calculation.
+  Colloid wetting factors are now taken into account correctly.
+
 version 0.22.0
 
 - Removal of original "ansi" I/O (Issue 284).
diff --git a/version.h b/version.h
index a93c982f8..b9ad77e00 100644
--- a/version.h
+++ b/version.h
@@ -5,7 +5,7 @@
  *  The version is MAJOR.MINOR.PATCH
  *  See, e.g., https://apr.apache.org/versioning.html
  *
- *  (c) 2014-2024 The University of Edinburgh
+ *  (c) 2014-2025 The University of Edinburgh
  *
  *****************************************************************************/
 
@@ -13,7 +13,7 @@
 #define LUDWIG_VERSION_H
 
 #define LUDWIG_MAJOR_VERSION 0
-#define LUDWIG_MINOR_VERSION 22
+#define LUDWIG_MINOR_VERSION 23
 #define LUDWIG_PATCH_VERSION 0
 
 #endif