113 changes: 113 additions & 0 deletions tools/parsec/README.md
@@ -0,0 +1,113 @@
# Galaxy Tool Documentation: Batch Cohort Correction

## Overview
This Galaxy tool corrects batch and cohort effects in intensity measurements, such as ion intensities from metabolomics studies. Using a mixed-model approach, it adjusts intensity values while accounting for batch and injection-order effects.
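
A condensed sketch of the steps `parsec.R` applies to one intensity column is shown below, using the column names from the Inputs section (`Batch`, `Injection_Order`, `Ion1`); it is illustrative only, not the full implementation.

```r
library(dplyr)
library(lme4)

# Illustrative sketch of the correction for one intensity column ("Ion1")
correct_one_ion <- function(df) {
  df <- df %>%
    mutate(Ion1 = log1p(Ion1)) %>%            # 1. log-transform
    group_by(Batch) %>%
    mutate(Ion1 = scale(Ion1)[, 1]) %>%       # 2. standardize within each batch
    ungroup()
  # 3. injection order as fixed effect, batch as random intercept
  model <- lmer(Ion1 ~ Injection_Order + (1 | Batch), data = df)
  df$Ion1 <- expm1(residuals(model))          # 4. keep residuals, back-transform
  df
}
```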

---

## Table of Contents
1. [Overview](#overview)
2. [Prerequisites](#prerequisites)
3. [Installation](#installation)
4. [Inputs](#inputs)
5. [Outputs](#outputs)
6. [Usage Example](#usage-example)
7. [Commands Executed by Galaxy](#commands-executed-by-galaxy)
8. [Important Notes](#important-notes)
9. [Contributing](#contributing)
10. [License](#license)
11. [About](#about)

---

## Prerequisites
- **Galaxy Platform**: Ensure access to a functional Galaxy instance.
- **R (≥ 4.2.2)**: The tool relies on R for its computations.
- Required R packages: `r-optparse`, `r-dplyr`, `r-lme4`.
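
If the packages are not already available, a minimal way to install them from an R session is shown below (on a Galaxy instance, the Conda requirements declared in `parsec.xml` are normally resolved automatically):

```r
# Install the CRAN packages used by the script
install.packages(c("optparse", "dplyr", "lme4"))
```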

---

## Installation
Clone the tool repository, or install it directly on your Galaxy instance:

```bash
git clone https://github.com/your_name/your_project.git
```

---

## Inputs
The correction operates on a table with the following columns (in Galaxy, this table is assembled automatically from the standard `dataMatrix`, `sampleMetadata`, and `variableMetadata` tabular inputs):
- **Batch**: Batch identifier, used as the random-effect grouping for the correction.
- **SampleID**: Sample identifier.
- **Injection_Order**: Injection order (mandatory), used as the fixed effect in the correction.
- **Ion1, Ion2, ...**: Intensity columns to be corrected.

**Sample Input File:**
```csv
SampleID,Batch,Injection_Order,Ion1,Ion2
1,1,5,500,300
2,1,15,520,310
3,2,25,490,290
4,2,35,505,295
```

---

## Outputs
The output will also be in CSV format, with corrected intensity values.

**Sample Output File:**
```csv
SampleID,Batch,Injection_Order,Ion1,Ion2
1,1,5,-0.2464,-0.2464
2,1,15,1.3362,1.3362
3,2,25,-0.5720,-0.5719
4,2,35,0.3269,0.3268
```

---

## Usage Example
1. Upload your CSV file to Galaxy.
2. Select the **Batch Cohort Correction** tool in your workflow.
3. Specify the input file and set a name for the output file.
4. Run the job and retrieve the corrected output file.
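
Once downloaded, the corrected table can be inspected in R; the snippet below is a sketch that assumes a hypothetical file name `corrected_output.csv`:

```r
corrected <- read.csv("corrected_output.csv")  # hypothetical file name
head(corrected)
summary(corrected[, c("Ion1", "Ion2")])
```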

---

## Commands Executed by Galaxy
Galaxy runs the command defined in `parsec.xml`:

```bash
Rscript '$__tool_directory__/parsec.R' -d '$dataMatrix' -s '$sampleMetadata' -v '$variableMetadata' -o '$output'
```

---

## Important Notes
- **Injection_Order**: Mandatory for accurate corrections.
- **File format**: Ensure the files are properly formatted; the Galaxy inputs are tab-separated (`tabular`), while the corrected output is written as comma-separated CSV.
- Malformed or improperly formatted files will result in explicit errors.
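
Before launching a run, a quick local check of the sample metadata can save a failed job. The sketch below assumes a hypothetical tab-separated file `sampleMetadata.txt`:

```r
meta <- read.csv("sampleMetadata.txt", sep = "\t")  # hypothetical file name

# parsec.R renames batch -> Batch and injectionOrder -> Injection_Order,
# so either spelling is accepted here.
batch_col <- intersect(c("Batch", "batch"), colnames(meta))
order_col <- intersect(c("Injection_Order", "injectionOrder"), colnames(meta))
if (length(batch_col) == 0 || length(order_col) == 0) {
  stop("Sample metadata must contain batch and injection-order columns.")
}
stopifnot(is.numeric(meta[[order_col[1]]]))  # injection order must be numeric
```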

---

## Contributing
1. Fork the repository.
2. Create a branch for your updates.
3. Submit a pull request.
4. Report bugs or suggest improvements in the Issues section.

---

## License
The licensing terms are provided in the repository's `LICENSE` file.

---

## About
### Authors:
- **Elfried Salanon**
📅 **Date:** 2025
- **Marie Lefebvre**
📅 **Date:** 2025
118 changes: 118 additions & 0 deletions tools/parsec/parsec.R
@@ -0,0 +1,118 @@
options(warn = -1)

# --- LIBRARIES ---
suppressPackageStartupMessages({
library(optparse)
library(dplyr)
library(lme4)
})

# --- CORRECTION FUNCTION ---
batch_cohort_correction <- function(data, batch_col, sample_col, intensity_cols) {
# Check that the required columns are present
missing_cols <- setdiff(c(batch_col, sample_col, intensity_cols), colnames(data))
if (length(missing_cols) > 0) {
stop(paste("❌ Colonnes manquantes :", paste(missing_cols, collapse = ", ")))
}

# 🔧 Clean up and convert to numeric
data[intensity_cols] <- lapply(data[intensity_cols], function(x) {
x <- gsub("\\s+", "", as.character(x))
as.numeric(x)
})

# 1. Log-transform
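# log1p (log(1 + x)) keeps zero intensities finite; expm1 is applied at the end (step 5)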
data <- data %>%
mutate(across(all_of(intensity_cols), log1p))

# 2. Standardize within each batch
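# scale() returns a one-column matrix; [, 1] keeps a plain numeric vector of per-batch z-scores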
data <- data %>%
group_by(!!sym(batch_col)) %>%
mutate(across(all_of(intensity_cols), ~ scale(.x)[, 1])) %>%
ungroup()

# 3. Check for Injection_Order
if (!"Injection_Order" %in% colnames(data)) {
stop("❌ Colonne Injection_Order manquante.")
}

# 4. Linear mixed model
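# Injection_Order enters as a fixed effect and batch as a random intercept;
# the model residuals keep the signal with these effects removed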
for (col in intensity_cols) {
model <- lmer(
as.formula(paste0(col, " ~ Injection_Order + (1|", batch_col, ")")),
data = data,
REML = TRUE,
control = lmerControl(check.conv.singular = "ignore")
)
data[[col]] <- residuals(model)
}

# 5. Inverse transform
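# expm1(x) = exp(x) - 1, the inverse of the log1p used in step 1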
data <- data %>%
mutate(across(all_of(intensity_cols), expm1))

return(data)
}

# --- CLI ARGUMENTS ---
option_list <- list(
make_option(c("-d", "--dataMatrice"), type = "character", help = "Data matrix"),
make_option(c("-s", "--sampleMData"), type = "character", help = "Sample metadata"),
make_option(c("-v", "--variableMData"), type = "character", help = "Variable metadata"),
make_option(c("-o", "--output"), type = "character", help = "Output file")
)
opt <- parse_args(OptionParser(option_list = option_list))

# --- LOAD INPUT FILES ---
if (!all(file.exists(opt$dataMatrice, opt$sampleMData, opt$variableMData))) {
stop("❌ Un ou plusieurs fichiers d'entrée sont introuvables.")
}

data_matrix <- read.csv(opt$dataMatrice, header = TRUE, sep = "\t")
sample_metadata <- read.csv(opt$sampleMData, header = TRUE, sep = "\t")
variable_metadata <- read.csv(opt$variableMData, header = TRUE, sep = "\t")

# --- RENAME COLUMNS ---
colnames(sample_metadata)[1] <- "SampleID"
colnames(sample_metadata) <- sub("^batch$", "Batch", colnames(sample_metadata))
colnames(sample_metadata) <- sub("^injectionOrder$", "Injection_Order", colnames(sample_metadata))

# --- DATA TRANSFORMATION ---
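# The data matrix has variables in rows and samples in columns; transpose it so each row is a sample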
data_t <- as.data.frame(t(data_matrix))
data_t$SampleID <- rownames(data_t)

if ((ncol(data_t) - 1) != nrow(variable_metadata)) {
stop("❌ Incompatibilité : nombre de variables ≠ nombre de métadonnées.")
}

ion_names <- paste0("Ion", seq_len(nrow(variable_metadata)))
colnames(data_t) <- c(ion_names, "SampleID")

# --- MERGE ---
data_set <- merge(sample_metadata, data_t, by = "SampleID")

# --- COLUMNS TO CORRECT ---
intensity_cols <- ion_names

# Check that all required columns are present
required_columns <- c("Batch", "SampleID", "Injection_Order", intensity_cols[1:2])
missing_columns <- setdiff(required_columns, colnames(data_set))
if (length(missing_columns) > 0) {
stop(paste("❌ Colonnes manquantes dans le fichier fusionné :", paste(missing_columns, collapse = ", ")))
}

# --- APPLY THE CORRECTION ---
corrected_data <- batch_cohort_correction(
data_set,
batch_col = "Batch",
sample_col = "SampleID",
intensity_cols = intensity_cols
)

# --- FINAL EXPORT ---
write.csv(
corrected_data,
file = opt$output,
quote = TRUE,
row.names = FALSE
)
151 changes: 151 additions & 0 deletions tools/parsec/parsec.xml
@@ -0,0 +1,151 @@
<?xml version="1.0" encoding="UTF-8"?>
<tool id="batch_cohort_correction" name="Batch-Cohort Correction" version="1.1" profile="16.01">
<description>Corrects for batch and cohort effects on ion intensities using a mixed-model approach</description>

<command>
<![CDATA[
Rscript '$__tool_directory__/parsec.R' -d '$dataMatrix' -s '$sampleMetadata' -v '$variableMetadata' -o '$output'
]]>
</command>

<inputs>
<param name="dataMatrix" type="data" format="tabular" label="Data Matrix (samples × variables)" />
<param name="sampleMetadata" type="data" format="tabular" label="Sample Metadata" />
<param name="variableMetadata" type="data" format="tabular" label="Variable Metadata" />
</inputs>

<outputs>
<data name="output" format="csv" label="Corrected Intensities Table" />
</outputs>

<tests>
<!-- Test with valid example files -->
<test>
<param name="dataMatrix" value="Dataprocessing_dataMatrix.txt" />
<param name="sampleMetadata" value="Dataprocessing_sampleMetadata.txt" />
<param name="variableMetadata" value="Dataprocessing_variableMetadata.txt" />
<output name="output" file="corrected_output_test.csv" />
</test>

<!-- Test: Missing injection order -->
<test expect_failure="true">
<param name="dataMatrix" value="missing_injectionOrder_matrix.txt" />
<param name="sampleMetadata" value="Dataprocessing_sampleMetadata.txt" />
<param name="variableMetadata" value="Dataprocessing_variableMetadata.txt" />
</test>

<!-- Test: Empty file -->
<test expect_failure="true">
<param name="dataMatrix" value="empty_file.txt" />
<param name="sampleMetadata" value="Dataprocessing_sampleMetadata.txt" />
<param name="variableMetadata" value="Dataprocessing_variableMetadata.txt" />
</test>
</tests>

<requirements>
<requirement type="package" version="4.4.2">r-base</requirement>
<requirement type="package" version="1.7.5">r-optparse</requirement>
<requirement type="package" version="1.1.4">r-dplyr</requirement>
<requirement type="package" version="1.1.36">r-lme4</requirement>
</requirements>

<help><![CDATA[
# 📌 Batch-Cohort Correction Tool

## 🧬 Description
This Galaxy tool corrects batch and injection order effects in metabolomics data using a mixed-effects model. It supports standard **Workflow4Metabolomics** inputs:
- `dataMatrix`
- `sampleMetadata`
- `variableMetadata`

---

## 📥 Inputs

1. **Data Matrix** (`tabular`)
- Samples in columns, variables in rows.
- Will be transposed inside the script.
- Example: `Dataprocessing_dataMatrix.txt`

2. **Sample Metadata** (`tabular`)
- Must include columns:
- `SampleID`
- `Batch` or `batch`
- `Injection_Order` or `injectionOrder`

3. **Variable Metadata** (`tabular`)
- One row per ion/variable.
- Will be used to rename variables to `Ion1`, `Ion2`, etc.

---

## 📤 Output

- A `CSV` file with the corrected ion intensities for each sample:
- `SampleID`, `Batch`, `Injection_Order`, the remaining sample metadata columns, and `Ion1` ... `IonN`

---

## 💡 Example

- **Sample Metadata:**
```
sampleMetadata Group Osmo batch sampleType injectionOrder
Samp1 A 389 B2 sample 36
Samp2 A 857 B2 sample 34
```

- **Variable Metadata:**
```
variableMetadata mz mzmin mzmax rt rtmin rtmax
Var1 411.324949062189 411.324949060944 411.328101409696 9.83875936348509 9.83610100762266 9.84265924099634
Var2 132.868473699965 132.867360812174 132.869181044469 16.4318977259949 16.1310904303817 16.7378976534623
```

- **Data Matrix (transposed in script):**
```
dataMatrix Samp1 Samp2 Samp(n)
Var1 8396 4803 1736
Var2 6195 4797 6526

```

- **Corrected Output (after processing):**

```
"SampleID","Group","Osmo","Batch","sampleType","Injection_Order","Ion1","Ion2","Ion(n)"
"Samp1","A",389,"B2","sample",36,0.0786625631354747,0.420410954232145,0.913940318799482
"Samp2","A",857,"B2","sample",34,-0.183851139155772,0.0983057506878457,-0.0813877735862746

```

---

## ⚠️ Notes
- File must not be empty.
- `Injection_Order` must be numeric.
- All `IonX` columns must be convertible to numbers.
- The tool automatically renames `batch` → `Batch` and `injectionOrder` → `Injection_Order`.

---

## 📦 Dependencies
- R (≥ 4.2.2)
- `optparse`
- `dplyr`
- `lme4`

---

]]></help>

<citations>
<citation type="bibtex">
@article{
10.1101/2023.10.12.561695,
author = {Lastname, Firstname et al.},
title = {Tool: a software to quantify cell growth parameters and extracellular fluxes},
year = {2023},
journal = {bioRxiv},
doi = {10.1101/2023.10.12.561695}
}
</citation>
</citations>
</tool>