vcfpp: a single C++ file for manipulating VCF

This project introduces a single C++ file as interface to the basic htslib, which can be easily included in a C++ program for scripting high-performance genomic analyses.

Features:

single file to be easily included and compiled
easy and safe API to use. no worry about free memory.
has the full functionalities of the htslib, eg. supports of compressed VCF/BCF and URL link as filename.
compatible with C++11 and later

Installation

download the released vcfpp.h and include it in your program.
make sure you have https://github.com/samtools/htslib installed on your system and the it is in your environment.

Examples

You can copy paste the example code into a example.cpp file and compile it by g++ example.cpp -std=c++11 -O2 -Wall -I. -lhts -lz -lm -lbz2 -llzma -lcurl

Calculate the heterozygosity rate

Let’s count the number of heterozygous sites for each sample in all records. The core is just 10 lines.

#include "vcfpp.h"
using namespace std;
using namespace vcfpp;

int main(int argc, char* argv[])
{
    // fetch data from 1000 genomes server
    BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz");
    BcfRecord v(vcf.header); // construct a variant record
    vector<char> gt; // genotype can be of bool, char or int type
    vector<int> hetsum(vcf.nsamples);
    while (vcf.getNextVariant(v)) {
        if (!v.isSNP()) continue; // skip other type of variants
        v.getGenotypes(gt);
        for (int i = 0; i < gt.size()/2 ; i++) { // for diploid
            hetsum[i] += abs(gt[2 * i + 0] - gt[2 * i +1]);
        }
    }
    for (auto i : hetsum) { cout << i << endl; }
    return 0;
}

If you don’t bother using Eigen, another header only high performance linear algebra library, here is the more expressive way. Also, we’ll need the library to run PCA with VCF in another example.

#include "vcfpp.h"
#include <Eigen/Dense>
using namespace std;
using namespace vcfpp;
using namespace Eigen;

int main(int argc, char* argv[])
{
    BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz");
    BcfRecord v(vcf.header);
    vector<int> gt;
    ArrayXi hetsum = ArrayXi::Zero(vcf.nsamples);
    while (vcf.getNextVariant(v)) {
        if (!v.isSNP()) continue; // skip other type of variants
        v.getGenotypes(gt);
        Map<const ArrayXXi> m(gt.data(), v.nploidy , gt.size() / v.nploidy); // read only
        hetsum += (m.row(0) - m.row(1)).abs(); // for diploid
    }
    cout << hetsum << endl;
    return 0;
}

Infer population structure with PCA

We need the Eigen library to deal with linear algebra. To infer population structure using PCA, we first construct the genotype matrix (0,1,2) and standardize it under HWE assumption. Then we perform SVD on the standardized matrix to get the PCs.

#include "vcfpp.h"
#include <Eigen/Dense>
using namespace std;
using namespace vcfpp;
using namespace Eigen;

int main(int argc, char* argv[])
{
    BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz", "HG00096,HG00097,HG00099,HG00100,HG00101,HG00102,HG00103,HG00105,HG00106,HG00107", "chr22:19000000-20000000");
    BcfRecord v(vcf.header);
    int nsnps = 0, nsamples = vcf.nsamples;
    vector<bool> gt;
    vector<float> mat, gts(nsamples);
    while (vcf.getNextVariant(v)) {
        if (!v.isSNP()) continue; // skip other type of variants
        v.getGenotypes(gt);
        for (int i = 0; i < nsamples; i++) {
            gts[i] = gt[2 * i] + gt[2 * i + 1];
        }
        mat.insert(mat.end(), gts.begin(), gts.end());
        nsnps++;
    }
    Map<MatrixXf> M(mat.data(), nsamples, nsnps); // dim is nsamples x nsnsp
    ArrayXf hwe = M.colwise().mean().array();
    hwe = (hwe * (1 - hwe/2)).sqrt() ;
    hwe = (hwe > 1e-9).select(hwe, 1); // in case denominator is smaller than 1e-9
    M = (-M).rowwise() + M.colwise().mean(); // centering by subtracting the mean
    M = (M.array().rowwise() / hwe.transpose()).matrix(); // standardize the matrix
    JacobiSVD<MatrixXf> svd(M, Eigen::ComputeThinU | Eigen::ComputeThinV);
    cout << svd.matrixU.leftCols(10) << endl; // save or print out top 10 PCs
    cout << svd.singularValues().array().square() / nsnps << endl; // print out eigenvalues
    return 0;
}

Build PBWT index from haplotype data

Here is a clean example of building Durbin’s PBWT structure using native C++. Depending on your project’s needs, you should be able to modify it to return other useful matrix, such as divergence matrix (d) and FM-index matrix (u, v).

void pbwt() {
    BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz", "-", "chr22:16050075-16050655");
    BcfRecord v(vcf.header);
    vector<bool> gt;
    vector<vector<bool>> x;
    int nsnps = 0, nsamples = vcf.nsamples;
    while (vcf.getNextVariant(v))
    {
        if (!v.isSNP()) continue; // skip other type of variants
        v.getGenotypes(gt);x.push_back(gt);
        nsnps++;
    }
    int N = nsnps, M = nsamples * 2;
    vector<vector<int>> a(N, vector<int>(M)); // this is the index matrix to be retured
    vector<int> a0(M), a1(M), d0(M), d1(M), d(M); // note: to output divergence matrix, make d as two dimensional N x M.
    int i = 0, j = 0, u_ = 0, v_ = 0, p = 0, q = 0, d_, a_;
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            d_ = j > 0 ? d[i] : 0;
            a_ = j > 0 ? a[j-1][i] : i;
            p = max(p, d_); q = max(q, d_);
            if (x[j][a_]) {
                a1[v_] = a_; d1[v_] = q;
                v_++; q = 0;
            } else {
                a0[u_] = a_; d0[u_] = p;
                u_++; p = 0;
            }
        }
        for (i = 0; i < M; i++) {
            if (i < u_) {
                a[j][i] = a0[i];
                d[i] = d0[i];
            } else {
                a[j][i] = a1[i - u_];
                d[i] = d1[i - u_];
            }
        }
    } // return a
}

Name		Name	Last commit message	Last commit date
Latest commit History 36 Commits
.github/workflows		.github/workflows
test		test
.clang-format		.clang-format
.gitignore		.gitignore
Doxyfile		Doxyfile
LICENSE		LICENSE
readme.org		readme.org
vcfpp.h		vcfpp.h

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Repository files navigation

vcfpp: a single C++ file for manipulating VCF

Table of Contents

Installation

Examples

Calculate the heterozygosity rate

Infer population structure with PCA

Build PBWT index from haplotype data

About

Uh oh!

Releases

Packages

Languages

License

fuzzyLife/vcfpp

Folders and files

Latest commit

History

Repository files navigation

vcfpp: a single C++ file for manipulating VCF

Table of Contents

Installation

Examples

Calculate the heterozygosity rate

Infer population structure with PCA

Build PBWT index from haplotype data

About

Resources

License

Uh oh!

Stars

Watchers

Forks

Releases

Packages 0

Languages

Packages