nf-core · vagkaratzas · Mar 25, 2025 · Mar 25, 2025 · Mar 25, 2025 · Mar 25, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Added`
 
 - [#54](https://github.com/nf-core/proteinfamilies/pull/54) - Added nf-test for local subworkflow `ALIGN_SEQUENCES`
+- [#53](https://github.com/nf-core/proteinfamilies/pull/53) - Added nf-test for local subworkflow `EXECUTE_CLUSTERING`
 - [#51](https://github.com/nf-core/proteinfamilies/pull/51) - Added nf-test and `meta.yml` file for local module `CALCULATE_CLUSTER_DISTRIBUTION`
 - [#34](https://github.com/nf-core/proteinfamilies/pull/34) - Added the `EXTRACT_UNIQUE_CLUSTER_REPS` module, that calculates initial `MMseqs` clustering metadata, for each sample, to print with `MultiQC` (Id,Cluster Size,Number of Clusters)
 

diff --git a/subworkflows/local/execute_clustering/main.nf b/subworkflows/local/execute_clustering/main.nf
@@ -9,7 +9,8 @@ include { MMSEQS_CREATETSV } from '../../../modules/nf-core/mmseqs/createtsv/mai
 
 workflow EXECUTE_CLUSTERING {
     take:
-    sequences // tuple val(meta), path(fasta)
+    sequences       // tuple val(meta), path(fasta)
+    clustering_tool // string: MMseqs2 clustering mode, either 'cluster' or 'linclust'
 
     main:
     ch_versions       = Channel.empty()
@@ -18,10 +19,10 @@ workflow EXECUTE_CLUSTERING {
     MMSEQS_CREATEDB( sequences )
     ch_versions = ch_versions.mix( MMSEQS_CREATEDB.out.versions )
 
-    if (params.clustering_tool == 'cluster') {
+    if (clustering_tool == 'cluster') {
         cluster_res = MMSEQS_CLUSTER( MMSEQS_CREATEDB.out.db )
         ch_versions = ch_versions.mix( MMSEQS_CLUSTER.out.versions )
-    } else { // fallback: linclust
+    } else if (clustering_tool == 'linclust') {
         cluster_res = MMSEQS_LINCLUST( MMSEQS_CREATEDB.out.db )
         ch_versions = ch_versions.mix( MMSEQS_LINCLUST.out.versions )
     }

diff --git a/subworkflows/local/execute_clustering/tests/main.nf.test b/subworkflows/local/execute_clustering/tests/main.nf.test
@@ -0,0 +1,115 @@
+nextflow_workflow {
+
+    name "Test Subworkflow EXECUTE_CLUSTERING"
+    script "../main.nf"
+    workflow "EXECUTE_CLUSTERING"
+
+    test("fasta - linclust") {
+        // Real run with clustering_tool = 'linclust'; snapshots the clusters TSV name, its line count and the versions channel.
+        when {
+            workflow {
+                """
+                clustering_tool = 'linclust'
+
+                input[0] = Channel.of([
+                    [ id: 'test_linclust' ], // meta map
+                    file(params.pipelines_testdata_base_path + 'proteinfamilies/test_data/mgnifams_input_small.fa', checkIfExists: true)
+                ])
+                input[1] = clustering_tool
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success },
+                // Read the clusters TSV inside the closure so a failed run is reported by the success assertion above
+                { assert snapshot(
+                    file(workflow.out.clusters[0][1]).name,
+                    path(workflow.out.clusters[0][1]).readLines().size(),
+                    workflow.out.versions).match() }
+            )
+        }
+    }
+
+    test("fasta - cluster") {
+        // Real run with clustering_tool = 'cluster'; snapshots the clusters TSV name, its line count and the versions channel.
+        when {
+            workflow {
+                """
+                clustering_tool = 'cluster'
+
+                input[0] = Channel.of([
+                    [ id: 'test_cluster' ], // meta map
+                    file(params.pipelines_testdata_base_path + 'proteinfamilies/test_data/mgnifams_input_small.fa', checkIfExists: true)
+                ])
+                input[1] = clustering_tool
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success },
+                // Read the clusters TSV inside the closure so a failed run is reported by the success assertion above
+                { assert snapshot(
+                    file(workflow.out.clusters[0][1]).name,
+                    path(workflow.out.clusters[0][1]).readLines().size(),
+                    workflow.out.versions).match() }
+            )
+        }
+    }
+
+    test("fasta - linclust - stub") {
+        // Stub run (-stub) with clustering_tool = 'linclust'; requires success and snapshots the complete workflow output.
+        options "-stub"
+
+        when {
+            workflow {
+                """
+                clustering_tool = 'linclust'
+
+                input[0] = Channel.of([
+                    [ id: 'test_linclust_stub' ], // meta map
+                    file(params.pipelines_testdata_base_path + 'proteinfamilies/test_data/mgnifams_input_small.fa', checkIfExists: true)
+                ])
+                input[1] = clustering_tool
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success},
+                { assert snapshot(workflow.out).match() }
+            )
+        }
+    }
+
+    test("fasta - cluster - stub") {
+        // Stub run (-stub) with clustering_tool = 'cluster'; requires success and snapshots the complete workflow output.
+        options "-stub"
+
+        when {
+            workflow {
+                """
+                clustering_tool = 'cluster'
+
+                input[0] = Channel.of([
+                    [ id: 'test_cluster_stub' ], // meta map
+                    file(params.pipelines_testdata_base_path + 'proteinfamilies/test_data/mgnifams_input_small.fa', checkIfExists: true)
+                ])
+                input[1] = clustering_tool
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success},
+                { assert snapshot(workflow.out).match() }
+            )
+        }
+    }
+
+}
diff --git a/subworkflows/local/execute_clustering/tests/main.nf.test.snap b/subworkflows/local/execute_clustering/tests/main.nf.test.snap
@@ -0,0 +1,140 @@
+{
+    "fasta - cluster": {
+        "content": [
+            "test_cluster.tsv",
+            50000,
+            [
+                "versions.yml:md5,3312bc6e1fa1b06e4a8ac2603deba2a4",
+                "versions.yml:md5,40b242fe8ca427ffe7cd860dc5b9d7fc",
+                "versions.yml:md5,9d06e5efe2356113924a205c58f0b42f"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.5"
+        },
+        "timestamp": "2025-03-25T08:35:58.03861131"
+    },
+    "fasta - linclust - stub": {
+        "content": [
+            {
+                "0": [
+                    "versions.yml:md5,3312bc6e1fa1b06e4a8ac2603deba2a4",
+                    "versions.yml:md5,9d06e5efe2356113924a205c58f0b42f",
+                    "versions.yml:md5,c3f8381c74eb9f0a2d549f273ba89595"
+                ],
+                "1": [
+                    [
+                        {
+                            "id": "test_linclust_stub"
+                        },
+                        "/nf-core/test-datasets/proteinfamilies/test_data/mgnifams_input_small.fa"
+                    ]
+                ],
+                "2": [
+                    [
+                        {
+                            "id": "test_linclust_stub"
+                        },
+                        "test_linclust_stub.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "clusters": [
+                    [
+                        {
+                            "id": "test_linclust_stub"
+                        },
+                        "test_linclust_stub.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "seqs": [
+                    [
+                        {
+                            "id": "test_linclust_stub"
+                        },
+                        "/nf-core/test-datasets/proteinfamilies/test_data/mgnifams_input_small.fa"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,3312bc6e1fa1b06e4a8ac2603deba2a4",
+                    "versions.yml:md5,9d06e5efe2356113924a205c58f0b42f",
+                    "versions.yml:md5,c3f8381c74eb9f0a2d549f273ba89595"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.5"
+        },
+        "timestamp": "2025-03-25T08:36:09.660114354"
+    },
+    "fasta - linclust": {
+        "content": [
+            "test_linclust.tsv",
+            50000,
+            [
+                "versions.yml:md5,3312bc6e1fa1b06e4a8ac2603deba2a4",
+                "versions.yml:md5,9d06e5efe2356113924a205c58f0b42f",
+                "versions.yml:md5,c3f8381c74eb9f0a2d549f273ba89595"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.5"
+        },
+        "timestamp": "2025-03-25T08:35:24.00287769"
+    },
+    "fasta - cluster - stub": {
+        "content": [
+            {
+                "0": [
+                    "versions.yml:md5,3312bc6e1fa1b06e4a8ac2603deba2a4",
+                    "versions.yml:md5,40b242fe8ca427ffe7cd860dc5b9d7fc",
+                    "versions.yml:md5,9d06e5efe2356113924a205c58f0b42f"
+                ],
+                "1": [
+                    [
+                        {
+                            "id": "test_cluster_stub"
+                        },
+                        "/nf-core/test-datasets/proteinfamilies/test_data/mgnifams_input_small.fa"
+                    ]
+                ],
+                "2": [
+                    [
+                        {
+                            "id": "test_cluster_stub"
+                        },
+                        "test_cluster_stub.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "clusters": [
+                    [
+                        {
+                            "id": "test_cluster_stub"
+                        },
+                        "test_cluster_stub.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "seqs": [
+                    [
+                        {
+                            "id": "test_cluster_stub"
+                        },
+                        "/nf-core/test-datasets/proteinfamilies/test_data/mgnifams_input_small.fa"
+                    ]
+                ],
+                "versions": [
+                    "versions.yml:md5,3312bc6e1fa1b06e4a8ac2603deba2a4",
+                    "versions.yml:md5,40b242fe8ca427ffe7cd860dc5b9d7fc",
+                    "versions.yml:md5,9d06e5efe2356113924a205c58f0b42f"
+                ]
+            }
+        ],
+        "meta": {
+            "nf-test": "0.9.2",
+            "nextflow": "24.10.5"
+        },
+        "timestamp": "2025-03-25T08:36:21.001725949"
+    }
+}
diff --git a/subworkflows/local/remove_redundancy/main.nf b/subworkflows/local/remove_redundancy/main.nf
@@ -76,7 +76,7 @@ workflow REMOVE_REDUNDANCY {
     }
 
     if (params.remove_sequence_redundancy) {
-        EXECUTE_CLUSTERING( fasta )
+        EXECUTE_CLUSTERING( fasta, params.clustering_tool )
         ch_versions = ch_versions.mix( EXECUTE_CLUSTERING.out.versions )
 
         REMOVE_REDUNDANT_SEQS( EXECUTE_CLUSTERING.out.clusters, EXECUTE_CLUSTERING.out.seqs )

diff --git a/subworkflows/local/update_families/main.nf b/subworkflows/local/update_families/main.nf
@@ -101,7 +101,7 @@ workflow UPDATE_FAMILIES {
 
     if (params.remove_sequence_redundancy) {
         // Strict clustering to remove redundancy
-        EXECUTE_CLUSTERING( fasta_ch )
+        EXECUTE_CLUSTERING( fasta_ch, params.clustering_tool )
         ch_versions = ch_versions.mix( EXECUTE_CLUSTERING.out.versions )
 
         REMOVE_REDUNDANT_SEQS( EXECUTE_CLUSTERING.out.clusters, EXECUTE_CLUSTERING.out.seqs )

diff --git a/tests/nextflow.config b/tests/nextflow.config
@@ -1,3 +1,4 @@
+// Impose sensible resource limits for testing
 process {
     withName: '.*' {
         cpus   = 2

diff --git a/workflows/proteinfamilies.nf b/workflows/proteinfamilies.nf
@@ -80,7 +80,7 @@ workflow PROTEINFAMILIES {
 
     // Creating new families
     // Clustering
-    EXECUTE_CLUSTERING( ch_samplesheet_for_create )
+    EXECUTE_CLUSTERING( ch_samplesheet_for_create, params.clustering_tool )
     ch_versions = ch_versions.mix( EXECUTE_CLUSTERING.out.versions )
 
     CALCULATE_CLUSTER_DISTRIBUTION( EXECUTE_CLUSTERING.out.clusters )