pyranges · endrebak · Mar 4, 2022 · Jan 6, 2022 · Jan 6, 2022 · Jan 6, 2022
diff --git a/.github/workflows/build-book.yml b/.github/workflows/build-book.yml
@@ -0,0 +1,49 @@
+name: Build and Deploy  # Builds the bookdown book in book/ and publishes it to the gh-pages branch.
+on: [push]  # Runs on every push.
+jobs:
+  build-and-deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2.3.1
+
+      - name: Add conda to system path
+        run: |
+          # $CONDA is an environment variable pointing to the root of the miniconda directory
+          echo "$CONDA/bin" >> "$GITHUB_PATH"
+
+      - name: Install pyranges  # Installs the checked-out package plus the pip packages listed below.
+        run: |
+          pip install cython
+          pip install sorted_nearest
+          python setup.py install
+          pip install bamread
+          pip install fisher
+
+      - name: Install dependencies  # R, reticulate, pandoc and bookdown for rendering; matplotlib and pybigwig for Python.
+        run: |
+          conda config --set always_yes yes --set changeps1 no
+          conda config --add channels bioconda
+          conda config --add channels r
+          conda install mamba -n base -c conda-forge
+          mamba install -c r r
+          mamba install -c conda-forge r-reticulate
+          # mamba install -c conda-forge r-bookdown
+          mamba install -y conda-forge::ncurses
+          mamba install -c conda-forge matplotlib
+          mamba install -c conda-forge pandoc
+          mamba install -c bioconda pybigwig
+          echo 'install.packages("bookdown")' > install.R
+          Rscript install.R
+
+      - name: Build book  # compile.R renders the book with bookdown::gitbook.
+        run: |
+          cd book
+          Rscript compile.R
+
+      - name: Deploy
+        uses: JamesIves/github-pages-deploy-action@4.1.9
+        with:
+          branch: gh-pages  # The branch the action should deploy to.
+          folder: book/build  # The folder the action should deploy (output_dir "build" in book/_bookdown.yml).
+
diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml
diff --git a/book/_bookdown.yml b/book/_bookdown.yml
@@ -0,0 +1,45 @@
+# Keep the merged .Rmd file after the book has been rendered.
+delete_merged_file: false
+
+# Chapters of the book, in the order they are listed.
+rmd_files:
+  - "index.Rmd"
+  - "creating.Rmd"
+  - "writing.Rmd"
+  - "subset.Rmd"
+  - "manipulation.Rmd"
+  - "concat.Rmd"
+  - "piping.Rmd"
+  - "printing.Rmd"
+  - "iterate.Rmd"
+  - "sort.Rmd"
+  - "summary.Rmd"
+  - "single_range_methods.Rmd"
+  - "custom_methods.Rmd"
+  - "intersection.Rmd"
+  - "subtract.Rmd"
+  - "join.Rmd"
+  - "nearest.Rmd"
+  - "knearest.Rmd"
+  - "count_overlaps.Rmd"
+  - "colocalization.Rmd"
+  - "simes.Rmd"
+  - "fisher_exact.Rmd"
+  - "mcc.Rmd"
+  - "rowbased_statistics.Rmd"
+  - "coverage.Rmd"
+  - "runlengths.Rmd"
+  - "runlength_dict.Rmd"
+  - "subsetting_rles.Rmd"
+  - "subsetting_pyrles.Rmd"
+  - "multithreading.Rmd"
+  - "genomicfeatures.Rmd"
+  - "databases.Rmd"
+
+# rmd_files: ["runlengths.Rmd"]
+# rmd_files: ["multithreading.Rmd"]
+
+output_dir: "build"
+
+bookdown::html_document2:
+#   css: css.css
diff --git a/book/build_book.yml b/book/build_book.yml
@@ -0,0 +1,26 @@
+name: Build and Deploy
+on: [push]
+jobs:
+  build-and-deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2.3.1
+
+      - name: Install and Build  # This example project is built using npm and outputs the result to the 'build' folder. Replace with the commands required to build your project, or remove this step entirely if your site is pre-built.
+        run: |
+          npm ci
+          npm run build
+
+      - name: Deploy
+        uses: JamesIves/github-pages-deploy-action@4.1.7
+        with:
+          branch: gh-pages  # The branch the action should deploy to.
+          folder: build  # The folder the action should deploy. What does it mean to deploy here?
+
+# Setup notes that followed a stray ":wq" editor command; kept as comments so the file parses as YAML:
+# mamba install -c r r
+# mamba install -c conda-forge r-reticulate
+# mamba install -c conda-forge r-bookdown
+# mamba install -y conda-forge::ncurses
+
diff --git a/book/colocalization.Rmd b/book/colocalization.Rmd
@@ -0,0 +1,33 @@
+# Statistics: colocalization and co-occurrence measures
+
+PyRanges can compute a normalized Jaccard statistic (ranging from 0 to 1) that
+measures the similarity between two sets of ranges.
+
+```{python tidy=FALSE}
+import pyranges as pr
+gr = pr.data.chipseq()
+gr2 = pr.data.chipseq_background()
+print(gr.stats.jaccard(gr2, strandedness="same"))
+```
+
+To compute the similarities between two sets of ranges which do not necessarily
+overlap much, we can use the relative distance function. It describes the
+relative distances between each interval in one set and the two closest intervals
+in another. Any deviation from a uniform distribution is an indication of spatial
+correlation.
+
+```{python tidy=FALSE}
+print(gr.stats.relative_distance(gr2, strandedness="same"))
+```
+
+PyRanges also contains yet another method (which is still in beta-mode) for
+computing colocalization statistics, the Forbes coefficient:
+
+```
+print(gr.stats.forbes(gr2, strandedness="same"))
+```
+
+Please report any issues you encounter using it :)
+
+See this paper for a discussion of `jaccard` and `forbes`:
+https://doi.org/10.1093/bib/bbz083
diff --git a/book/compile.R b/book/compile.R
@@ -0,0 +1,11 @@
+
+# Render the book in gitbook format, after selecting the TkAgg matplotlib backend.
+library(reticulate)
+library(bookdown)
+
+# Switch the matplotlib backend through the Python session that reticulate provides.
+matplotlib <- import("matplotlib")
+matplotlib$use("TkAgg")
+
+sessionInfo()  # print R and package versions
+render_book("index.Rmd", "bookdown::gitbook")
diff --git a/book/compile2.R b/book/compile2.R
@@ -0,0 +1,11 @@
+
+# Render the book as bookdown::html_document2 using a specific local Python installation.
+library(reticulate)
+library(bookdown)
+
+# Select the interpreter first: reticulate binds to a Python on the first import.
+use_python("/mnt/work/endrebak/software/anaconda/bin/python")
+
+mpl <- import("matplotlib")
+mpl$use("TkAgg")
+render_book("index.Rmd", "bookdown::html_document2")  # input file first, output format second
diff --git a/book/concat.Rmd b/book/concat.Rmd
@@ -0,0 +1,20 @@
+# Concatenating PyRanges
+
+A list of PyRanges can be concatenated by using the concat function:
+
+```{python tidy=FALSE}
+import pyranges as pr
+
+import pandas as pd
+
+gr1 = pr.data.f1()
+gr2 = pr.data.f2()
+
+print(gr1)
+
+print(gr2)
+
+concatted = pr.concat([gr1, gr2, gr1])
+
+print(concatted)
+```
diff --git a/book/count_overlaps.Rmd b/book/count_overlaps.Rmd
@@ -0,0 +1,35 @@
+# Create count-matrix from multiple PyRanges
+
+If you have multiple PyRanges you want to create a count-matrix from, you can
+use `count_overlaps(grs, features=None, how=None, nb_cpu=1, strandedness=None)`.
+`grs` is a dictionary of PyRanges, `features` is the PyRanges you want to count
+overlaps in. If no `features` argument is provided, one is created from `grs`.
+
+```{python tidy=FALSE}
+from io import StringIO
+import pyranges as pr
+import pandas as pd
+
+a = """Chromosome Start End
+chr1    6    12
+chr1    10    20
+chr1    22    27
+chr1    24    30"""
+
+b = """Chromosome Start End
+chr1    12    32
+chr1    14    30"""
+
+c = """Chromosome Start End
+chr1    8    15
+chr1    10    14
+chr1    32    34"""
+
+tables = [pd.read_table(StringIO(x), sep=r"\s+") for x in (a, b, c)]  # raw string: \s is a regex, not a string escape
+grs = {k: pr.PyRanges(t) for k, t in zip("abc", tables)}
+
+features = pr.concat(grs.values()).split()
+print(features)
+print(pr.count_overlaps(grs, features))
+print(pr.count_overlaps(grs))
+```
diff --git a/book/coverage.Rmd b/book/coverage.Rmd
@@ -0,0 +1,20 @@
+# Turning Ranges into RLEs
+
+Ranges can be turned into dicts of run length encodings with the to_rle function:
+
+```{python tidy=FALSE}
+import pyranges as pr
+gr = pr.data.aorta()
+print(gr)
+print(gr.to_rle())
+print(gr.to_rle(strand=True))
+print(gr.to_rle(strand=True, rpm=True))
+```
+
+To get the RPM-normalized coverage, use the rpm argument.
+
+You can also create coverage for any numeric value in your PyRanges:
+
+```{python tidy=FALSE}
+print(gr.to_rle("Score"))
+```
diff --git a/book/creating.Rmd b/book/creating.Rmd
@@ -0,0 +1,80 @@
+# Loading/Creating PyRanges
+
+A PyRanges object can be built in four ways:
+
+1. from a Pandas dataframe
+2. using the PyRanges constructor with the chromosomes, starts and ends (and optionally strands), individually.
+3. using one of the custom reader functions for genomic data (`read_bed`, `read_bam` or `read_gtf`, `read_gff3`)
+4. from a dict (like the ones produced with `to_example`)
+
+#### Using a DataFrame {-}
+
+If you instantiate a PyRanges object from a dataframe, the dataframe should at
+least contain the columns Chromosome, Start and End. A column called Strand is
+optional. Any other columns in the dataframe are treated as metadata.
+
+```{python tidy=FALSE}
+
+import pandas as pd
+import pyranges as pr
+
+chipseq = pr.get_example_path("chipseq.bed")
+
+df = pd.read_csv(chipseq, header=None, names="Chromosome Start End Name Score Strand".split(), sep="\t")
+
+print(df.head(2))
+print(df.tail(2))
+
+print(pr.PyRanges(df))
+```
+
+#### Using constructor keywords {-}
+
+Another way to instantiate a PyRanges object is to use the constructor with keywords:
+
+```{python tidy=FALSE}
+gr = pr.PyRanges(chromosomes=df.Chromosome, starts=df.Start, ends=df.End)
+print(gr)
+```
+
+It is possible to make PyRanges objects out of basic Python datatypes:
+
+```{python tidy=FALSE}
+gr = pr.PyRanges(chromosomes="chr1", strands="+", starts=[0, 1, 2], ends=(3, 4, 5))
+print(gr)
+
+gr = pr.PyRanges(chromosomes="chr1 chr2 chr3".split(), strands="+ - +".split(), starts=[0, 1, 2], ends=(3, 4, 5))
+print(gr)
+```
+
+#### Using `read_bed`, `read_gtf`, `read_gff3` or `read_bam` {-}
+
+The pyranges library can create PyRanges from common file formats, namely gtf/gff, gff3, bed and bam [^].
+
+```{python tidy=FALSE}
+ensembl_path = pr.get_example_path("ensembl.gtf")
+gr = pr.read_gtf(ensembl_path)
+print(gr)
+```
+
+To read bam files the optional bamread-library must be installed. Use `conda
+install -c bioconda bamread` or `pip install bamread` to install it.
+
+`read_bam` takes the arguments `sparse`, `mapq`, `required_flag`, `filter_flag`,
+which have the default values True, 0, 0 and 1540, respectively. With sparse
+True, only the columns `['Chromosome', 'Start', 'End', 'Strand', 'Flag']` are
+fetched. Setting sparse to False additionally gives you the columns `['QueryStart',
+'QueryEnd', 'Name', 'Cigar', 'Quality']`, but is more time and memory-consuming.
+
+All the reader functions also take the flag `as_df`.
+
+#### Using `from_dict` {-}
+
+```{python tidy=FALSE}
+f1 = pr.data.f1()
+d = f1.to_example(n=10)
+print(d)
+print(pr.from_dict(d))
+```
+
+[^]: PyRanges uses the pysam library which requires that the bam file must have an index.