Restructured and refactored codebase by jesunsahariar · Pull Request #1 · pnnl/FlowForecaster

Merged · 10 commits · May 22, 2025
Changes from all commits
16 changes: 16 additions & 0 deletions .gitignore
@@ -0,0 +1,16 @@
# Output
*.err
*.out
*.log
task_*/
output*

# Python cache
__pycache__/

# macOS
.DS_Store

# IDE
.idea/
.vscode/
101 changes: 22 additions & 79 deletions README-Install.md
@@ -1,79 +1,22 @@
<!-- -*-Mode: markdown;-*- -->
<!-- $Id: bd5988fb659f47215d89334363ba2b7ac2fe2b8f $ -->


Prerequisites
=============================================================================

Environment
- CMake (>= version 2.8)
- C++14 compiler, GCC preferred
- Python 3.7+



Building & Installing
=============================================================================

....

1. Installing...
```sh
mkdir <build> && cd <build>
cmake \
-DCMAKE_INSTALL_PREFIX=<install-path> \
<datalife-root-path>
make install
```


Using
=============================================================================

1. First use [DataLife](https://github.com/pnnl/DataLife) to collect a
few (e.g., 3--5) Data Flow Lifecycle profiles. The inputs should vary
in either input data size or parallelism. A Data Flow Lifecycle is a
property DAG with detailed data flow statistics.

```sh
...
...datalife
```


2. Model with FlowForecaster. This step will infer a detailed and
interpretable workflow scaling model from a few empirical Data Flow
Lifecycle property graphs.

```sh
...
usage: datalife-analyze [-h] [-i INPUT] [-o OUTPUT]

....

optional arguments:
-h, --help show this help message and exit
-i INPUT, --input INPUT
read I/O monitor stats from directory path
-o OUTPUT, --output OUTPUT
write a graph output to a file
```

3. Examine the model output.

...

A FlowForecaster model is an abstract directed acyclic graph (DAG)
with analytical expressions to describe how the DAG scales and how
data flows along edges. Importantly, FlowForecaster's expression
language and rules can explain data-dependent structure and
flow. FlowForecaster's inference finds repeated substructure, infers
analytical rules to explain substructure scaling (edge branching and
joining), and predicts edge properties such as data accesses, access
size, and data volume.


4. Predict the Data Flow Lifecycle graph using different values for
the data size or task concurrency. Use the predictions for better
scheduling.

# FlowForecaster

## Install Required Python3 Libraries
```bash
$ pip install numpy networkx matplotlib pandas sortedcontainers
```

## Usage
```bash
$ python create_canonical_model_auto_scaling.py \
--data-instances <data_scaling_files> \
--task-instances <task_scaling_files> \
--output-data <data_model_output> \
--output-task <task_model_output>
```

An example:

```bash
$ python src/create_canonical_model_auto_scaling.py --data-instances ../sample_data/1000Genomes/sample.1k_genome.iter-3.thrd-2.graphml ../sample_data/1000Genomes/sample.1k_genome.iter-3.thrd-2_data_scale_2.0.graphml ../sample_data/1000Genomes/sample.1k_genome.iter-3.thrd-2_data_scale_3.0.graphml --task-instances ../sample_data/1000Genomes/sample.1k_genome.iter-3.thrd-2.graphml ../sample_data/1000Genomes/sample.1k_genome.iter-3.thrd-2_task_scale_2.0.graphml ../sample_data/1000Genomes/sample.1k_genome.iter-3.thrd-2_task_scale_3.0.graphml --output-data canonical_1000genomes_data_scaling.graphml --output-task canonical_1000genomes_task_scaling.graphml
```
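
The canonical models above are written as GraphML, so they can be opened with networkx (installed earlier). The following is a minimal sketch for inspecting an output model; it assumes only that the file is valid GraphML, and the node/edge attributes it prints are whatever the model happens to carry, not a documented schema.

```python
# Minimal sketch: peek inside a canonical model produced by the example above.
# Assumes only that the output is valid GraphML readable by networkx;
# the attribute names printed below are whatever the file contains.
import networkx as nx

model = nx.read_graphml("canonical_1000genomes_data_scaling.graphml")

print(f"{model.number_of_nodes()} nodes, {model.number_of_edges()} edges")

# Dump per-edge attributes (e.g., inferred flow/scaling annotations, if present).
for u, v, attrs in model.edges(data=True):
    print(f"{u} -> {v}: {attrs}")
```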
79 changes: 79 additions & 0 deletions depreciated/README-Install.md
@@ -0,0 +1,79 @@
<!-- -*-Mode: markdown;-*- -->
<!-- $Id: bd5988fb659f47215d89334363ba2b7ac2fe2b8f $ -->


Prerequisites
=============================================================================

Environment
- CMake (>= version 2.8)
- C++14 compiler, GCC preferred
- Python 3.7+



Building & Installing
=============================================================================

....

1. Installing...
```sh
mkdir <build> && cd <build>
cmake \
-DCMAKE_INSTALL_PREFIX=<install-path> \
<datalife-root-path>
make install
```


Using
=============================================================================

1. First use [DataLife](https://github.com/pnnl/DataLife) to collect a
few (e.g., 3--5) Data Flow Lifecycle profiles. The inputs should vary
in either input data size or parallelism. A Data Flow Lifecycle is a
property DAG with detailed data flow statistics.

```sh
...
...datalife
```


2. Model with FlowForecaster. This step will infer a detailed and
interpretable workflow scaling model from a few empirical Data Flow
Lifecycle property graphs.

```sh
...
usage: datalife-analyze [-h] [-i INPUT] [-o OUTPUT]

....

optional arguments:
-h, --help show this help message and exit
-i INPUT, --input INPUT
read I/O monitor stats from directory path
-o OUTPUT, --output OUTPUT
write a graph output to a file
```

3. Examine the model output.

...

A FlowForecaster model is an abstract directed acyclic graph (DAG)
with analytical expressions to describe how the DAG scales and how
data flows along edges. Importantly, FlowForecaster's expression
language and rules can explain data-dependent structure and
flow. FlowForecaster's inference finds repeated substructure, infers
analytical rules to explain substructure scaling (edge branching and
joining), and predicts edge properties such as data accesses, access
size, and data volume.


4. Predict the Data Flow Lifecycle graph using different values for
the data size or task concurrency. Use the predictions for better
scheduling (see the sketch below).
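
To make steps 3 and 4 concrete, the sketch below evaluates a made-up scaling rule of the general kind described above: fan-out grows with input size and the per-edge data volume is split across branches. The rule, its constants, and the function names are purely illustrative and are not FlowForecaster's actual expression language or API.

```python
# Illustrative only: a hypothetical inferred scaling rule, not FlowForecaster's
# actual expression language. fan_out(n) is the number of branches leaving a
# stage for an input of n bytes; edge_volume(n) is the bytes per branch.
def fan_out(n_bytes, chunk_bytes=64 * 2**20):
    # hypothetical rule: one branch per 64 MiB chunk of input (ceiling division)
    return max(1, -(-n_bytes // chunk_bytes))

def edge_volume(n_bytes):
    # hypothetical rule: input volume split evenly across the branches
    return n_bytes / fan_out(n_bytes)

# Step 4: "predict" DAG shape and edge flow at unobserved input sizes.
for n in (1 * 2**30, 4 * 2**30, 16 * 2**30):  # 1 GiB, 4 GiB, 16 GiB
    print(f"{n / 2**30:5.1f} GiB -> {fan_out(n):4d} branches, "
          f"{edge_volume(n) / 2**20:8.1f} MiB per edge")
```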

91 changes: 91 additions & 0 deletions depreciated/examples/1000genome-workflow/02/sbatch/sbatch00.run.sh
@@ -0,0 +1,91 @@
#!/bin/bash
#SBATCH --job-name="ff_02_run"
#SBATCH --partition=slurm
######SBATCH --exclude=dc[119,077]
#SBATCH --account=datamesh
#SBATCH -N 1
#SBATCH --time=04:44:44
#SBATCH --output=R.%x.%j.out
#SBATCH --error=R.%x.%j.err
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=zhen.peng@pnnl.gov
#SBATCH --exclusive

#### sinfo -p <partition>
#### sinfo -N -r -l
#### srun -A CENATE -N 1 -t 20:20:20 --pty -u /bin/bash

#First make sure the module commands are available.
source /etc/profile.d/modules.sh

#Set up the environment you wish to run in with module commands.
echo
echo "loaded modules"
echo
module purge
# module load rocm/5.6.0
#module load cuda/12.3
# Modules needed by Orca
module load gcc/11.2.0 binutils/2.35 cmake/3.29.0
#module load openmpi/4.1.4
#module load mkl
module list &> _modules.lis_
cat _modules.lis_
/bin/rm -f _modules.lis_

#Python version
echo
echo "python version"
echo
command -v python
python --version
PYTHON_PATH=$(command -v python)


#Next unlimit system resources, and set any other environment variables you need.
ulimit -s unlimited
echo
echo limits
echo
ulimit -a

#It is extremely useful to record the modules you have loaded, your limit settings,
#your current environment variables, and the dynamically loaded libraries that your
#executable is linked against in your job output file.
# echo
# echo "loaded modules"
# echo
# module list &> _modules.lis_
# cat _modules.lis_
# /bin/rm -f _modules.lis_
# echo
# echo limits
# echo
# ulimit -a
echo
echo "Environment Variables"
echo
printenv
# echo
# echo "ldd output"
# echo
# ldd your_executable

#Now you can put in your parallel launch command.
#For each different parallel executable you launch we recommend
#adding a corresponding ldd command to verify that the environment
#that is loaded corresponds to the environment the executable was built in.

# set -euo pipefail
set -u

TT_TIME_START=$(date +%s.%N)

cd "/qfs/projects/oddite/peng599/FlowForecaster/FlowForecaster/examples/1000genome-workflow/02"
bash run.sh

TT_TIME_END=$(date +%s.%N)
TT_TIME_EXE=$(echo "${TT_TIME_END} - ${TT_TIME_START}" | bc -l)
echo
echo "TT_TIME_EXE(s): ${TT_TIME_EXE}"
echo
91 changes: 91 additions & 0 deletions depreciated/examples/1000genome-workflow/04/sbatch/sbatch00.run.sh
@@ -0,0 +1,91 @@
#!/bin/bash
#SBATCH --job-name="ff_04_run"
#SBATCH --partition=slurm
######SBATCH --exclude=dc[119,077]
#SBATCH --account=datamesh
#SBATCH -N 1
#SBATCH --time=04:44:44
#SBATCH --output=R.%x.%j.out
#SBATCH --error=R.%x.%j.err
#SBATCH --mail-type=FAIL
#SBATCH --mail-user=zhen.peng@pnnl.gov
#SBATCH --exclusive

#### sinfo -p <partition>
#### sinfo -N -r -l
#### srun -A CENATE -N 1 -t 20:20:20 --pty -u /bin/bash

#First make sure the module commands are available.
source /etc/profile.d/modules.sh

#Set up the environment you wish to run in with module commands.
echo
echo "loaded modules"
echo
module purge
# module load rocm/5.6.0
#module load cuda/12.3
# Modules needed by Orca
module load gcc/11.2.0 binutils/2.35 cmake/3.29.0
#module load openmpi/4.1.4
#module load mkl
module list &> _modules.lis_
cat _modules.lis_
/bin/rm -f _modules.lis_

#Python version
echo
echo "python version"
echo
command -v python
python --version
PYTHON_PATH=$(command -v python)


#Next unlimit system resources, and set any other environment variables you need.
ulimit -s unlimited
echo
echo limits
echo
ulimit -a

#It is extremely useful to record the modules you have loaded, your limit settings,
#your current environment variables, and the dynamically loaded libraries that your
#executable is linked against in your job output file.
# echo
# echo "loaded modules"
# echo
# module list &> _modules.lis_
# cat _modules.lis_
# /bin/rm -f _modules.lis_
# echo
# echo limits
# echo
# ulimit -a
echo
echo "Environment Variables"
echo
printenv
# echo
# echo "ldd output"
# echo
# ldd your_executable

#Now you can put in your parallel launch command.
#For each different parallel executable you launch we recommend
#adding a corresponding ldd command to verify that the environment
#that is loaded corresponds to the environment the executable was built in.

# set -euo pipefail
set -u

TT_TIME_START=$(date +%s.%N)

cd "/qfs/projects/oddite/peng599/FlowForecaster/FlowForecaster/examples/1000genome-workflow/04"
bash run.sh

TT_TIME_END=$(date +%s.%N)
TT_TIME_EXE=$(echo "${TT_TIME_END} - ${TT_TIME_START}" | bc -l)
echo
echo "TT_TIME_EXE(s): ${TT_TIME_EXE}"
echo