microbiome · TuomasBorman · Mar 28, 2025 · Mar 28, 2025
diff --git a/inst/pages/import.qmd b/inst/pages/import.qmd
@@ -49,6 +49,8 @@ To begin with, we store the data in a local directory within the working
 directory, such as _data/_, and define the source file paths.
 
 ```{r}
+#| label: import1
+
 biom_file_path <- system.file(
     "extdata", "Aggregated_humanization2.biom", package = "OMA")
 sample_meta_file_path <- system.file(
@@ -63,6 +65,8 @@ and then remove them with the `rank.from.prefix` and `prefix.rm` optional
 arguments.
 
 ```{r}
+#| label: import2
+
 library(mia)
 
 # read biom and convert it to TreeSE
@@ -81,6 +85,8 @@ abundance table is named as "counts".  Let us inspect only the first
 cols and rows.
 
 ```{r}
+#| label: import3
+
 assay(tse, "counts")[1:3, 1:3]
 ```
 
@@ -90,32 +96,61 @@ command shows just the beginning of the data table for an overview.
 `knitr::kable()` helps print the information more nicely.
 
 ```{r}
+#| label: import4
+
 rowData(tse) |> head()
 ```
 
 We notice that the imported biom file did not contain any `colData` yet,
 so only an empty dataframe appears in this slot.
 
 ```{r}
+#| label: import5
+
 colData(tse) |> head()
 ```
 
 Let us add `colData` from the sample metadata, which is stored in a CSV file.
 
 ```{r}
+#| label: import6
+
 # CSV file with colnames in the first row and rownames in the first column
 sample_meta <- read.csv(
     sample_meta_file_path, sep = ",", row.names = 1)
 
-# Add this sample data to colData of the taxonomic data object
-# Note that the samples in the sample data must be in the same  order as
-# in the original biom file and that data must be given in a DataFrame format
-colData(tse) <- DataFrame(sample_meta)
+sample_meta |> head()
 ```
 
-Now the `colData` includes the sample metadata.
+Be especially careful when adding sample metadata, as this effectively
+replaces the existing, empty `colData`. The `colData<-` assignment
+**does not check sample names**, which can lead to mistakes. Therefore, you must
+manually ensure that the sample names in the metadata not only exist but also
+match those in the `TreeSE`, and that they are in the same order.
 
 ```{r}
+#| label: import7
+
+# Check that sample names exist
+if( is.null(rownames(sample_meta)) || is.null(colnames(tse)) ){
+    stop("Sample names missing!", call. = FALSE)
+}
+
+# Sort rows in sample metadata to match with TreeSE
+sample_order <- match(colnames(tse), rownames(sample_meta))
+sample_meta <- sample_meta[sample_order, , drop = FALSE]
+
+# If the sample metadata lacked some samples, their rows are now named NA.
+# Therefore, we replace the metadata row names with the TreeSE sample names.
+rownames(sample_meta) <- colnames(tse)
+```
+
+Now, we can add sample metadata to the `colData`.
+
+```{r}
+#| label: import8
+
+colData(tse) <- DataFrame(sample_meta)
 colData(tse) |> head()
 ```
 
@@ -127,6 +162,8 @@ Here, we read in the file containing the phylogenetic tree and insert it
 in corresponding slot of the `TreeSE` object.
 
 ```{r}
+#| label: import9
+
 # Reads the tree file
 tree <- ape::read.tree(tree_file_path)
 
@@ -140,6 +177,8 @@ tse
 Now the `rowTree` slot contains the phylogenetic tree:
 
 ```{r, eval=FALSE}
+#| label: import10
+
 rowTree(tse) |> head()
 ```
 
@@ -184,7 +223,10 @@ the abundance table column names.
 
 After you have set up the CSV files, you can read them in R:
 
-```{r importingcsv1, message=FALSE}
+```{r}
+#| label: import_csv1
+#| message: false
+
 count_file  <- system.file("extdata", "assay_taxa.csv", package = "OMA")
 tax_file    <- system.file("extdata", "rowdata_taxa.csv", package = "OMA")
 sample_file <- system.file("extdata", "coldata.csv", package = "OMA")
@@ -218,7 +260,9 @@ with right patient.
 Also, ensure that the row and column names match one-to-one between
 abundance table, `rowdata`, and `coldata`:
 
-```{r importingcsv2}
+```{r}
+#| label: import_csv2
+
 # Match rows and columns
 counts <- counts[rownames(tax), rownames(samples)]
 
@@ -234,7 +278,9 @@ There are many different source files and many different ways to read
 data in R. One can do data manipulation in R as well. Investigate the
 entries as follows.
 
-```{r demodata, message=FALSE}
+```{r}
+#| label: import_csv3
+
 # coldata rownames match assay colnames
 all(rownames(samples) == colnames(counts)) # our dataset
 class(samples) # should be data.frame or DataFrame
@@ -269,7 +315,9 @@ we also convert the data objects in their preferred formats:
 The `SimpleList` could be used to include multiple alternative assays, if
 necessary.
 
-```{r importingcsv3}
+```{r}
+#| label: import_csv4
+
 # Create a TreeSE
 tse_taxa <- TreeSummarizedExperiment(
     assays =  SimpleList(counts = counts),
@@ -288,7 +336,10 @@ To construct a `MAE` object, just combine multiple `TreeSE` data containers.
 
 Here we import metabolite data from the same study.
 
-```{r importingcsv4, message=FALSE}
+```{r}
+#| label: import_mae1
+#| message: false
+
 count_file <- system.file("extdata", "assay_metabolites.csv", package = "OMA")
 sample_file <- system.file("extdata", "coldata.csv", package = "OMA")
 
@@ -313,7 +364,9 @@ be DataFrame objects.
 
 Now we can combine these two experiments into _MAE_.
 
-```{r importingcsv5}
+```{r}
+#| label: import_mae2
+
 # Create an ExperimentList that includes experiments
 experiments <- ExperimentList(
     microbiome = tse_taxa, metabolite = tse_metabolite)
@@ -341,15 +394,18 @@ List the [available
 datasets](https://microbiome.github.io/mia/reference/mia-datasets.html) in
 the `mia` package:
 
-```{r, message=FALSE}
-library(mia)
+```{r}
+#| label: pkg_data1
+
 data(package="mia")
 ```
 
 Load the `GlobalPatterns` data from the `mia` package:
 
-```{r, message=FALSE}
-data("GlobalPatterns", package="mia")
+```{r}
+#| label: pkg_data2
+
+data("GlobalPatterns", package = "mia")
 GlobalPatterns
 ```