8000 GitHub - fuzzyLife/vcfpp: vcfpp is a single C++ file for manipulating VCF/BCF format in a scripting way :)
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content
forked from Zilong-Li/vcfpp

vcfpp is a single C++ file for manipulating VCF/BCF format in a scripting way :)

License

Notifications You must be signed in to change notification settings

fuzzyLife/vcfpp

 
 

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

36 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

vcfpp: a single C++ file for manipulating VCF

https://github.com/Zilong-Li/vcfpp/actions/workflows/linux.yml/badge.svg https://github.com/Zilong-Li/vcfpp/actions/workflows/mac.yml/badge.svg https://img.shields.io/badge/Documentation-latest-blue.svg https://img.shields.io/github/v/release/Zilong-Li/vcfpp.svg https://img.shields.io/github/downloads/Zilong-Li/vcfpp/total.svg https://img.shields.io/github/license/Zilong-Li/vcfpp?style=plastic.svg

This project introduces a single C++ file as interface to the basic htslib, which can be easily included in a C++ program for scripting high-performance genomic analyses.

Features:

  • single file to be easily included and compiled
  • easy and safe API to use. no worry about free memory.
  • has the full functionalities of the htslib, eg. supports of compressed VCF/BCF and URL link as filename.
  • compatible with C++11 and later

Table of Contents

Installation

Examples

You can copy paste the example code into a example.cpp file and compile it by g++ example.cpp -std=c++11 -O2 -Wall -I. -lhts -lz -lm -lbz2 -llzma -lcurl

Calculate the heterozygosity rate

Let’s count the number of heterozygous sites for each sample in all records. The core is just 10 lines.

#include "vcfpp.h"
using namespace std;
using namespace vcfpp;

int main(int argc, char* argv[])
{
    // fetch data from 1000 genomes server
    BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz");
    BcfRecord v(vcf.header); // construct a variant record
    vector<char> gt; // genotype can be of bool, char or int type
    vector<int> hetsum(vcf.nsamples);
    while (vcf.getNextVariant(v)) {
        if (!v.isSNP()) continue; // skip other type of variants
        v.getGenotypes(gt);
        for (int i = 0; i < gt.size()/2 ; i++) { // for diploid
            hetsum[i] += abs(gt[2 * i + 0] - gt[2 * i +1]);
        }
    }
    for (auto i : hetsum) { cout << i << endl; }
    return 0;
}

If you don’t bother using Eigen, another header only high performance linear algebra library, here is the more expressive way. Also, we’ll need the library to run PCA with VCF in another example.

#include "vcfpp.h"
#include <Eigen/Dense>
using namespace std;
using namespace vcfpp;
using namespace Eigen;

int main(int argc, char* argv[])
{
    BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz");
    BcfRecord v(vcf.header);
    vector<int> gt;
    ArrayXi hetsum = ArrayXi::Zero(vcf.nsamples);
    while (vcf.getNextVariant(v)) {
        if (!v.isSNP()) continue; // skip other type of variants
        v.getGenotypes(gt);
        Map<const ArrayXXi> m(gt.data(), v.nploidy , gt.size() / v.nploidy); // read only
        hetsum += (m.row(0) - m.row(1)).abs(); // for diploid
    }
    cout << hetsum << endl;
    return 0;
}

Infer population structure with PCA

We need the Eigen library to deal with linear algebra. To infer population structure using PCA, we first construct the genotype matrix (0,1,2) and standardize it under HWE assumption. Then we perform SVD on the standardized matrix to get the PCs.

#include "vcfpp.h"
#include <Eigen/Dense>
using namespace std;
using namespace vcfpp;
using namespace Eigen;

int main(int argc, char* argv[])
{
    BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz", "HG00096,HG00097,HG00099,HG00100,HG00101,HG00102,HG00103,HG00105,HG00106,HG00107", "chr22:19000000-20000000");
    BcfRecord v(vcf.header);
    int nsnps = 0, nsamples = vcf.nsamples;
    vector<bool> gt;
    vector<float> mat, gts(nsamples);
    while (vcf.getNextVariant(v)) {
        if (!v.isSNP()) continue; // skip other type of variants
        v.getGenotypes(gt);
        for (int i = 0; i < nsamples; i++) {
            gts[i] = gt[2 * i] + gt[2 * i + 1];
        }
        mat.insert(mat.end(), gts.begin(), gts.end());
        nsnps++;
    }
    Map<MatrixXf> M(mat.data(), nsamples, nsnps); // dim is nsamples x nsnsp
    ArrayXf hwe = M.colwise().mean().array();
    hwe = (hwe * (1 - hwe/2)).sqrt() ;
    hwe = (hwe > 1e-9).select(hwe, 1); // in case denominator is smaller than 1e-9
    M = (-M).rowwise() + M.colwise().mean(); // centering by subtracting the mean
    M = (M.array().rowwise() / hwe.transpose()).matrix(); // standardize the matrix
    JacobiSVD<MatrixXf> svd(M, Eigen::ComputeThinU | Eigen::ComputeThinV);
    cout << svd.matrixU.leftCols(10) << endl; // save or print out top 10 PCs
    cout << svd.singularValues().array().square() / nsnps << endl; // print out eigenvalues
    return 0;
}

Build PBWT index from haplotype data

Here is a clean example of building Durbin’s PBWT structure using native C++. Depending on your project’s needs, you should be able to modify it to return other useful matrix, such as divergence matrix (d) and FM-index matrix (u, v).

void pbwt() {
    BcfReader vcf("http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20220422_3202_phased_SNV_INDEL_SV/1kGP_high_coverage_Illumina.chr22.filtered.SNV_INDEL_SV_phased_panel.vcf.gz", "-", "chr22:16050075-16050655");
    BcfRecord v(vcf.header);
    vector<bool> gt;
    vector<vector<bool>> x;
    int nsnps = 0, nsamples = vcf.nsamples;
    while (vcf.getNextVariant(v))
    {
        if (!v.isSNP()) continue; // skip other type of variants
        v.getGenotypes(gt);x.push_back(gt);
        nsnps++;
    }
    int N = nsnps, M = nsamples * 2;
    vector<vector<int>> a(N, vector<int>(M)); // this is the index matrix to be retured
    vector<int> a0(M), a1(M), d0(M), d1(M), d(M); // note: to output divergence matrix, make d as two dimensional N x M.
    int i = 0, j = 0, u_ = 0, v_ = 0, p = 0, q = 0, d_, a_;
    for (j = 0; j < N; j++) {
        for (i = 0; i < M; i++) {
            d_ = j > 0 ? d[i] : 0;
            a_ = j > 0 ? a[j-1][i] : i;
            p = max(p, d_); q = max(q, d_);
            if (x[j][a_]) {
                a1[v_] = a_; d1[v_] = q;
                v_++; q = 0;
            } else {
                a0[u_] = a_; d0[u_] = p;
                u_++; p = 0;
            }
        }
        for (i = 0; i < M; i++) {
            if (i < u_) {
                a[j][i] = a0[i];
                d[i] = d0[i];
            } else {
                a[j][i] = a1[i - u_];
                d[i] = d1[i - u_];
            }
        }
    } // return a
}

About

vcfpp is a single C++ file for manipulating VCF/BCF format in a scripting way :)

Resources

License

Stars

Watchers

Forks

Packages

No packages published

Languages

  • C++ 99.9%
  • Makefile 0.1%
0