From 66f4442087904691f8e7d738f3f3b17c365e4d99 Mon Sep 17 00:00:00 2001 From: TuomasBorman Date: Fri, 28 Mar 2025 14:51:11 +0200 Subject: [PATCH] up --- inst/pages/import.qmd | 86 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 71 insertions(+), 15 deletions(-) diff --git a/inst/pages/import.qmd b/inst/pages/import.qmd index f94e6a6b..19ce7ece 100644 --- a/inst/pages/import.qmd +++ b/inst/pages/import.qmd @@ -49,6 +49,8 @@ To begin with, we store the data in a local directory within the working directory, such as _data/_, and define the source file paths. ```{r} +#| label: import1 + biom_file_path <- system.file( "extdata", "Aggregated_humanization2.biom", package = "OMA") sample_meta_file_path <- system.file( @@ -63,6 +65,8 @@ and then remove them with the `rank.from.prefix` and `prefix.rm` optional arguments. ```{r} +#| label: import2 + library(mia) # read biom and convert it to TreeSE @@ -81,6 +85,8 @@ abundance table is named as "counts". Let us inspect only the first cols and rows. ```{r} +#| label: import3 + assay(tse, "counts")[1:3, 1:3] ``` @@ -90,6 +96,8 @@ command shows just the beginning of the data table for an overview. `knitr::kable()` helps print the information more nicely. ```{r} +#| label: import4 + rowData(tse) |> head() ``` @@ -97,25 +105,52 @@ We notice that the imported biom file did not contain any `colData` yet, so only an empty dataframe appears in this slot. ```{r} +#| label: import5 + colData(tse) |> head() ``` Let us add `colData` from the sample metadata, which is stored in a CSV file. 
```{r} +#| label: import6 + # CSV file with colnames in the first row and rownames in the first column sample_meta <- read.csv( sample_meta_file_path, sep = ",", row.names = 1) -# Add this sample data to colData of the taxonomic data object -# Note that the samples in the sample data must be in the same order as -# in the original biom file and that data must be given in a DataFrame format -colData(tse) <- DataFrame(sample_meta) +sample_meta |> head() ``` -Now the `colData` includes the sample metadata. +You have to be especially careful when you add sample metadata, effectively +replacing the existing, empty `colData`. The `colData<-` assignment +**does not check sample names**, which can lead to mistakes. Therefore, you must +manually ensure that the sample names in the metadata not only exist but also +match those in TreeSE. ```{r} +#| label: import7 + +# Check that sample names exist +if( is.null(rownames(sample_meta)) || is.null(colnames(tse)) ){ + stop("Sample names missing!", call. = FALSE) +} + +# Sort rows in sample metadata to match with TreeSE +sample_order <- match(colnames(tse), rownames(sample_meta)) +sample_meta <- sample_meta[sample_order, , drop = FALSE] + +# If sample metadata did not include all samples, missing ones are now named NA. +# That is why we replace sample metadata names with ones in TreeSE. +rownames(sample_meta) <- colnames(tse) +``` + +Now, we can add sample metadata to the `colData`. + +```{r} +#| label: import8 + +colData(tse) <- DataFrame(sample_meta) colData(tse) |> head() ``` @@ -127,6 +162,8 @@ Here, we read in the file containing the phylogenetic tree and insert it in corresponding slot of the `TreeSE` object. ```{r} +#| label: import9 + # Reads the tree file tree <- ape::read.tree(tree_file_path) @@ -140,6 +177,8 @@ tse Now the `rowTree` slot contains the phylogenetic tree: ```{r, eval=FALSE} +#| label: import10 + rowTree(tse) |> head() ``` @@ -184,7 +223,10 @@ the abundance table column names. 
After you have set up the CSV files, you can read them in R: -```{r importingcsv1, message=FALSE} +```{r} +#| label: import_csv1 +#| message: false + count_file <- system.file("extdata", "assay_taxa.csv", package = "OMA") tax_file <- system.file("extdata", "rowdata_taxa.csv", package = "OMA") sample_file <- system.file("extdata", "coldata.csv", package = "OMA") @@ -218,7 +260,9 @@ with right patient. Also, ensure that the row and column names match one-to-one between abundance table, `rowdata`, and `coldata`: -```{r importingcsv2} +```{r} +#| label: import_csv2 + # Match rows and columns counts <- counts[rownames(tax), rownames(samples)] @@ -234,7 +278,9 @@ There are many different source files and many different ways to read data in R. One can do data manipulation in R as well. Investigate the entries as follows. -```{r demodata, message=FALSE} +```{r} +#| label: import_csv3 + # coldata rownames match assay colnames all(rownames(samples) == colnames(counts)) # our dataset class(samples) # should be data.frame or DataFrame @@ -269,7 +315,9 @@ we also convert the data objects in their preferred formats: The `SimpleList` could be used to include multiple alternative assays, if necessary. -```{r importingcsv3} +```{r} +#| label: import_csv4 + # Create a TreeSE tse_taxa <- TreeSummarizedExperiment( assays = SimpleList(counts = counts), @@ -288,7 +336,10 @@ To construct a `MAE` object, just combine multiple `TreeSE` data containers. Here we import metabolite data from the same study. -```{r importingcsv4, message=FALSE} +```{r} +#| label: import_mae1 +#| message: false + count_file <- system.file("extdata", "assay_metabolites.csv", package = "OMA") sample_file <- system.file("extdata", "coldata.csv", package = "OMA") @@ -313,7 +364,9 @@ be DataFrame objects. Now we can combine these two experiments into _MAE_. 
-```{r importingcsv5} +```{r} +#| label: import_mae2 + # Create an ExperimentList that includes experiments experiments <- ExperimentList( microbiome = tse_taxa, metabolite = tse_metabolite) @@ -341,15 +394,18 @@ List the [available datasets](https://microbiome.github.io/mia/reference/mia-datasets.html) in the `mia` package: -```{r, message=FALSE} -library(mia) +```{r} +#| label: pkg_data1 + data(package="mia") ``` Load the `GlobalPatterns` data from the `mia` package: -```{r, message=FALSE} -data("GlobalPatterns", package="mia") +```{r} +#| label: pkg_data2 + +data("GlobalPatterns", package = "mia") GlobalPatterns ```