Source code for qdiv.data_object

from __future__ import annotations
import pandas as pd
import numpy as np
import copy
import warnings
from pathlib import Path
from importlib.resources import files
from typing import Optional, Dict, Any, Union, List, Sequence, Literal, Set, Iterable, Self
from .io import files as data_files
from .io import subset as data_subset
from .utils import data_utils as help_func
from .utils import phylo_utils as phylo_func

# Public API of this module: only the data container class is exported.
__all__ = ["MicrobiomeData"]


[docs]
class MicrobiomeData:
    """
    Container for microbiome data tables (abundance, taxonomy, metadata, sequences, tree).

    Attributes
    ----------
    tab : pd.DataFrame, optional
        Abundance table (features x samples).
    tax : pd.DataFrame, optional
        Taxonomy table (features x taxonomy levels).
    meta : pd.DataFrame, optional
        Metadata table (samples x variables).
    seq : pd.DataFrame, optional
        Sequence table (features x sequence).
    tree : pd.DataFrame, optional
        Phylogenetic tree.
    """

    def __init__(
        self,
        tab: Optional[pd.DataFrame] = None,
        tax: Optional[pd.DataFrame] = None,
        meta: Optional[pd.DataFrame] = None,
        seq: Optional[pd.DataFrame] = None,
        tree: Optional[pd.DataFrame] = None,
    ):
        """
        Store the supplied tables on the instance, then autocorrect and validate.

        Parameters
        ----------
        tab : pd.DataFrame, optional
            Abundance table (features x samples).
        tax : pd.DataFrame, optional
            Taxonomy table (features x taxonomy levels).
        meta : pd.DataFrame, optional
            Metadata table (samples x variables).
        seq : pd.DataFrame, optional
            Sequence table (features x sequence).
        tree : pd.DataFrame, optional
            Phylogenetic tree.
        """
        # Attributes are bound left to right, i.e. tab, tax, meta, seq, tree.
        (self.tab, self.tax, self.meta, self.seq, self.tree) = (tab, tax, meta, seq, tree)

        # Every instance goes through the same clean-up and consistency checks.
        self._autocorrect()
        self._validate()

    #  ------------------------------------------------------------------------
    #  Methods for creating MicrobiomeData objects and importing data
    #  ------------------------------------------------------------------------

[docs]
    @classmethod
    def load(cls, **kwargs) -> MicrobiomeData:
        """
        Build a MicrobiomeData object from files.

        All keyword arguments are forwarded unchanged to the file loader
        (``io.files.load``); the tables it returns are passed to the constructor.

        Parameters
        ----------
        kwargs : dict
            Arguments for file paths and parsing options, passed to the loader.

        Returns
        -------
        MicrobiomeData
            Loaded data object. Tables absent from the loader result are None.

        Examples
        --------
        >>> data = MicrobiomeData.load(tab="otu_table.csv", meta="metadata.csv")
        """
        loaded = data_files.load(**kwargs)
        table_names = ("tab", "tax", "meta", "seq", "tree")
        # .get() yields None for any table the loader did not produce.
        return cls(**{name: loaded.get(name) for name in table_names})



[docs]
    def add_tab(
        self,
        tab: str,
        *,
        path: str = "",
        sep: Optional[str] = None,
        taxonomy_levels: Optional[list[str]] = None
    ) -> Self:
        """
        Read a frequency table from file and store it as `self.tab`.

        If the loader result also contains a taxonomy table, `self.tax` is
        replaced as well; otherwise `self.tax` is left as it was.

        Parameters
        ----------
        tab : str
            File name of the frequency table (.csv/.tsv, optionally gzipped, e.g. .csv.gz).
            Feature names (OTU/ASV/bin/MAG) should be in the first column (index).
        path : str, default ""
            Directory path (absolute or relative) containing `tab`. Can be "" for CWD.
        sep : str or None, default None
            Column separator. If None, pandas will attempt to auto-detect (engine='python').
        taxonomy_levels : list of str, optional
            Case-insensitive taxonomy column names to extract. Defaults to a broad set.

        Returns
        -------
        MicrobiomeData
            The updated object (self).

        Raises
        ------
        ValueError
            If the loader raises ValueError (file cannot be read or has invalid
            format). The error is re-raised with the method name, file name and
            path prepended to the message.
        """
        loader_options = {
            "path": path,
            "sep": sep,
            "taxonomy_levels": taxonomy_levels,
        }
        try:
            loaded = data_files.add_tab(tab, **loader_options)
        except ValueError as err:
            # Prefix the loader's message with where the failure happened.
            message = f"[MicrobiomeData.add_tab] Failed to load '{tab}' from '{path}': {err}"
            raise ValueError(message) from err

        self.tab = loaded.get("tab")
        # Taxonomy is only overwritten when the file actually carried it.
        if "tax" in loaded:
            self.tax = loaded["tax"]

        self._autocorrect()
        self._validate()
        return self



[docs]
    def add_tax(
        self,
        tax: str,
        *,
        path: str = "",
        sep: Optional[str] = None,
        add_taxon_prefix: bool = True,
    ) -> Self:
        """
        Read a taxonomy table from file and store it as `self.tax`.

        Parameters
        ----------
        tax : str
            File name of the taxonomy table (.csv/.tsv, optionally gzipped, e.g. .csv.gz).
            Feature names (OTU/ASV/bin/MAG) should be in the first column (index).
        path : str, default ""
            Directory path (absolute or relative) containing `tax`. Can be "" for CWD.
        sep : str or None, default None
            Column separator. If None, pandas will attempt to auto-detect (engine='python').
        add_taxon_prefix : bool, default True
            If True, add letters and two underscores before taxon names to indicate taxonomic level.
            NOTE(review): this flag is accepted but not forwarded to the loader
            by this method -- confirm where (or whether) it takes effect.

        Returns
        -------
        MicrobiomeData
            The updated object (self).

        Raises
        ------
        ValueError
            If the loader raises ValueError (file cannot be read or has invalid
            format). The error is re-raised with the method name, file name and
            path prepended to the message.
        """
        loader_options = {"path": path, "sep": sep}
        try:
            loaded = data_files.add_tax(tax, **loader_options)
        except ValueError as err:
            message = f"[MicrobiomeData.add_tax] Failed to load '{tax}' from '{path}': {err}"
            raise ValueError(message) from err

        # A result without a "tax" entry resets the attribute to None.
        self.tax = loaded.get("tax")
        self._autocorrect()
        self._validate()
        return self



[docs]
    def add_seq_from_fasta(
        self,
        fasta: str,
        *,
        path: str = "",
        name_splitter: Optional[str] = None
    ) -> Self:
        """
        Read sequences from a FASTA file and store them as `self.seq`.

        Parameters
        ----------
        fasta : str
            Name of the FASTA file with sequences of OTUs or ASVs (.fa, .fasta, optionally gzipped).
        path : str, default ""
            Directory path (absolute or relative) containing `fasta`. Can be "" for CWD.
        name_splitter : str, optional
            If provided, splits sequence names on this delimiter and keeps the first part.

        Returns
        -------
        MicrobiomeData
            The updated object (self).

        Raises
        ------
        ValueError
            If the loader raises ValueError (`fasta` is missing, the file cannot
            be read, or no sequences are found). The error is re-raised with the
            method name, file name and path prepended to the message.
        """
        loader_options = {"path": path, "name_splitter": name_splitter}
        try:
            loaded = data_files.add_seq_from_fasta(fasta, **loader_options)
        except ValueError as err:
            message = f"[MicrobiomeData.add_seq_from_fasta] Failed to load '{fasta}' from '{path}': {err}"
            raise ValueError(message) from err

        # A result without a "seq" entry resets the attribute to None.
        self.seq = loaded.get("seq")
        self._autocorrect()
        self._validate()
        return self



[docs]
    def add_tree(
        self,
        tree: str,
        *,
        path: str = ""
    ) -> Self:
        """
        Read a tree from a newick file and store it as `self.tree`.

        Parameters
        ----------
        tree : str
            Name of the newick file with the tree.
        path : str, default ""
            Directory path (absolute or relative) containing `tree`. Can be "" for CWD.

        Returns
        -------
        MicrobiomeData
            The updated object (self).

        Raises
        ------
        ValueError
            If the loader raises ValueError (`tree` is missing, the file cannot
            be read, or no nodes are found). The error is re-raised with the
            method name, file name and path prepended to the message.
        """
        try:
            loaded = data_files.add_tree(tree, path=path)
        except ValueError as err:
            message = f"[MicrobiomeData.add_tree] Failed to load '{tree}' from '{path}': {err}"
            raise ValueError(message) from err

        # A result without a "tree" entry resets the attribute to None.
        self.tree = loaded.get("tree")
        self._autocorrect()
        self._validate()
        return self



[docs]
    def add_meta(
        self,
        meta: str,
        *,
        path: str = "",
        sep: Optional[str] = ","
    ) -> Self:
        """
        Read a metadata table from file and store it as `self.meta`.

        Parameters
        ----------
        meta : str
            Name of the meta data file.
        path : str, default ""
            Directory path (absolute or relative) containing `meta`. Can be "" for CWD.
        sep : str or None, default ","
            Column separator. If None, pandas will attempt to auto-detect (engine='python').

        Returns
        -------
        MicrobiomeData
            The updated object (self).

        Raises
        ------
        ValueError
            If the loader raises ValueError (`meta` is missing, the file cannot
            be read, or no samples are found). The error is re-raised with the
            method name, file name and path prepended to the message.
        """
        loader_options = {"path": path, "sep": sep}
        try:
            loaded = data_files.add_meta(meta, **loader_options)
        except ValueError as err:
            message = f"[MicrobiomeData.add_meta] Failed to load '{meta}' from '{path}': {err}"
            raise ValueError(message) from err

        # A result without a "meta" entry resets the attribute to None.
        self.meta = loaded.get("meta")
        self._autocorrect()
        self._validate()
        return self



[docs]
    def add_tax_from_sintax(
        self,
        filename: str,
        *,
        path: str = "",
    ) -> Self:
        """
        Replace `self.tax` with the taxonomy parsed from a SINTAX output file.

        Parameters
        ----------
        filename : str
            Name of the SINTAX output file.
        path : str, default ""
            Directory path (absolute or relative) containing `filename`. Can be "" for CWD.

        Returns
        -------
        MicrobiomeData
            The updated object (self).
        """
        # The loader returns the taxonomy table directly (not wrapped in a dict).
        self.tax = data_files.add_tax_from_sintax(filename=filename, path=path)
        self._autocorrect()
        self._validate()
        return self



[docs]
    def add_tax_from_qiime(
        self,
        filename: str,
        *,
        path: str = "",
    ) -> Self:
        """
        Replace `self.tax` with the taxonomy parsed from a QIIME2-style file.

        Parameters
        ----------
        filename : str
            File name of the taxonomy table (.tsv, e.g. from QIIME2 export).
        path : str, default ""
            Directory path (absolute or relative) containing `filename`. Can be "" for CWD.

        Returns
        -------
        MicrobiomeData
            The updated object (self).
        """
        # The loader returns the taxonomy table directly (not wrapped in a dict).
        self.tax = data_files.add_tax_from_qiime(filename=filename, path=path)
        self._autocorrect()
        self._validate()
        return self



[docs]
    def add_tax_from_gtdbtk(
        self,
        filenames: Union[str, List[str]],
        *,
        path: str = "",
    ) -> Self:
        """
        Replace `self.tax` with taxonomy parsed from GTDB-Tk summary file(s).

        Parameters
        ----------
        filenames : str or list of str
            Name(s) of GTDB-Tk summary .tsv file(s).
        path : str, default ""
            Directory path (absolute or relative) containing file(s). Can be "" for CWD.

        Returns
        -------
        MicrobiomeData
            The updated object (self).
        """
        # The loader returns the taxonomy table directly (not wrapped in a dict).
        self.tax = data_files.add_tax_from_gtdbtk(filenames=filenames, path=path)
        self._autocorrect()
        self._validate()
        return self



[docs]
    def add_tab_from_coverm(
        self,
        filename: str,
        *,
        path: str = "",
        first_sep: Optional[str] = None,
        second_sep: str = " ",
        ra_col: str = "Relative Abundance",
        detection_threshold: Optional[float] = None,
    ) -> Self:
        """
        Add a relative abundance table from a CoverM file.

        The table returned by the loader replaces `self.tab`. If the loader
        result also contains an "unmapped" entry, its values are stored in the
        metadata column ``unmapped_reads_perc``; a metadata table holding only
        that column is created when `self.meta` is None.

        Parameters
        ----------
        filename : str
            Name of the CoverM .tsv or .csv file.
        path : str, default ""
            Directory path (absolute or relative) containing file(s). Can be "" for CWD.
        first_sep : str or None, default None
            Separator to help extract sample names from column headings.
        second_sep : str, default " "
            Second separator to help extract sample names from column headings.
        ra_col : str, default 'Relative Abundance'
            Column in CoverM output file used to calculate relative abundance.
        detection_threshold : float or None, default None
            If set, all relative abundances below this threshold are set to zero,
            but only if both 'Covered Bases' and 'Length' columns are available for the sample.
            The detection threshold is a fraction from 0 to 1.

        Returns
        -------
        MicrobiomeData
            The updated object (self).
        """
        result = data_files.add_tab_from_coverm(
            filename=filename,
            path=path,
            first_sep=first_sep,
            second_sep=second_sep,
            ra_col=ra_col,
            detection_threshold=detection_threshold,
        )
        self.tab = result.get("tab")

        if "unmapped" in result:
            unmapped = result["unmapped"]
            if getattr(self, "meta", None) is None:
                # No metadata yet: start a table that holds only this column.
                self.meta = pd.DataFrame(
                    unmapped.to_numpy(),
                    index=unmapped.index,
                    columns=["unmapped_reads_perc"],
                )
            else:
                # Column assignment aligns the values on the metadata index.
                self.meta["unmapped_reads_perc"] = unmapped

        self._autocorrect()
        self._validate()
        return self



[docs]
    def add_ebd_tab_from_singlem(
        self,
        filename: str,
        *,
        path: str = "",
        first_sep: Optional[str] = None,
        second_sep: str = " ",
    ) -> Self:
        """
        Add count data (tab) and taxonomic information (tax) from an EBD file
        generated with SingleM.

        Both `self.tab` and `self.tax` are replaced by the loader result; a
        table missing from that result is set to None.

        Parameters
        ----------
        filename : str
            Name of the file (TSV).
        path : str, default ""
            Directory path (absolute or relative) containing file(s). Can be "" for CWD.
        first_sep : str or None, default None
            Separator used to find the sample name in the file's column headings.
        second_sep : str, default " "
            Second separator used to find the sample name in the file's column headings.

        Returns
        -------
        MicrobiomeData
            The updated object (self).
        """
        result = data_files.add_ebd_tab_from_singlem(
            filename=filename,
            path=path,
            first_sep=first_sep,
            second_sep=second_sep,
        )
        self.tab = result.get("tab")
        self.tax = result.get("tax")
        self._autocorrect()
        self._validate()
        return self


    #  ------------------------------------------------------------------------
    #  Methods for saving files or showing information about the content
    #  ------------------------------------------------------------------------

[docs]
    def save(self, path: str = '', savename: str = 'output', sep: str = ',') -> list:
        """
        Write the object's tables to disk.

        The object is converted with ``self.to_dict()`` and handed to the file
        writer in ``io.files``, whose return value is passed back unchanged.

        Parameters
        ----------
        path : str, optional
            Directory path where files will be saved. Defaults to the current directory.
        savename : str, optional
            Base name for output files. Defaults to "output".
        sep : str, optional
            Field separator for CSV files. Defaults to ",".

        Returns
        -------
        list of str
            List of file paths that were saved.

        Examples
        --------
        >>> files = data.save(path="results", savename="mydata")
        """
        tables = self.to_dict()
        return data_files.save(tables, path=path, savename=savename, sep=sep)



[docs]
    def copy(self):
        """
        Copy MicrobiomeData object.

        Returns
        -------
        MicrobiomeData
            A copy of the object.

        Examples
        --------
        >>> obj_copy = obj.copy()
        """
        out = copy.deepcopy(self)
        return out



[docs]
    def info(self, preview_rows: int = 1)  -> None:
        """
        Print summary information about the MicrobiomeData object.

        Parameters
        ----------
        preview_rows : int, optional
            Number of rows to preview from metadata (default: 1).
        """
        print("MicrobiomeData object summary")
        print("-" * 40)
        # Abundance table
        if self.tab is not None:
            print(f"Abundance table: {self.tab.shape[0]} features x {self.tab.shape[1]} samples")
            print(f"Sum of abundance values: {self.tab.sum().sum()}")
            print(f"Min. abundance value per sample: {self.tab.sum().min()}")
            print(f"Max. abundance value per sample: {self.tab.sum().max()}")
        else:
            print("Abundance table: None")
        # Taxonomy
        if self.tax is not None:
            print(f"Taxonomy table: {self.tax.shape[0]} features, levels: {list(self.tax.columns)}")
        else:
            print("Taxonomy table: None")
        # Sequences
        if self.seq is not None:
            print(f"Sequence table: {self.seq.shape[0]} features")
        else:
            print("Sequence table: None")
        # Tree
        if self.tree is not None:
            print(f"Tree: {len(self.tree)} nodes")
        else:
            print("Tree: None")
        # Metadata
        if self.meta is not None:
            print(f"Metadata table: {self.meta.shape[0]} samples, columns: {list(self.meta.columns)}")
            if self.meta.shape[0] > 0:
                print("Metadata preview:")
                print(self.meta.head(preview_rows))
        else:
            print("Metadata table: None")
        print("-" * 40)



[docs]
    def summarize_taxa(
        self,
        savename: str | None = None,
        *,
        path: str = "",
    ) -> pd.DataFrame:
        """
        Summarize the number of taxa at each taxonomic level per sample.
    
        Parameters
        ----------
        savename : str or None, default=None
            If provided, save the output table as CSV in the given path.
    
        Returns
        -------
        pandas.DataFrame
            Summary table with:

                - number of features per sample
                - total reads per sample
                - number of unique taxa at each taxonomic level
        """
    
        # --- Validate inputs ------------------------------------------------------
        if self.tax is None or not isinstance(self.tax, pd.DataFrame):
            raise ValueError("'tax' is missing or not a DataFrame.")
        if self.tab is None or not isinstance(self.tab, pd.DataFrame):
            raise ValueError("'tab' is missing or not a DataFrame.")
        if self.tax.shape[0] == 0 or self.tab.shape[0] == 0:
            raise ValueError("Features are missing in 'tax' or 'tab'.")
        if not self.tax.index.equals(self.tab.index):
            raise ValueError("Feature IDs in 'tax' and 'tab' must match.")
    
        tax = self.tax.copy()
        tab = self.tab.copy()
    
        taxlevels = tax.columns.tolist()
        samples = tab.columns.tolist()
        sample_sums = tab.sum(axis=0)
    
        # --- Build rows -----------------------------------------------------------
        rows = []
    
        for smp in ["Total"] + samples:
            row = {"Sample": smp}
    
            if smp == "Total":
                row["Features"] = tab.shape[0]
                row["Reads"] = sample_sums.sum()
                for tlev in taxlevels:
                    row[tlev] = tax[tlev].dropna().nunique()
            else:
                present = tab.index[tab[smp] > 0]
                row["Features"] = len(present)
                row["Reads"] = sample_sums[smp]
                for tlev in taxlevels:
                    row[tlev] = tax.loc[present, tlev].dropna().nunique()
    
            rows.append(row)
    
        output = pd.DataFrame(rows).set_index("Sample")
    
        if savename:
            file_path = Path(path) / f"{savename}.csv"
            output.to_csv(file_path)
    
        return output


    #  ------------------------------------------------------------------------
    #  Methods for subsetting and manipulating objects
    #  ------------------------------------------------------------------------

[docs]
    def subset_samples(
        self,
        *,
        by: str = "index",
        values=None,
        exclude: bool = False,
        keep_absent: bool = False,
        inplace: bool = False,
    ) -> MicrobiomeData:
        """
        Subset samples by delegating to ``subset_samples`` in ``io.subset``.

        Parameters
        ----------
        by : str, default "index"
            How to select samples: "index" for sample names, or a column name in meta.
        values : list or scalar, optional
            Values to include (or exclude if exclude=True).
            If None and `by != "index"`, all unique values of meta[by] are used, excluding nan.
        exclude : bool, default False
            If True, exclude samples that match values.
        keep_absent : bool, default False
            If False, drop features (rows) with zero counts after subsetting.
        inplace : bool, default False
            If True, modify the object in place. If False, return a new object.

        Returns
        -------
        MicrobiomeData
            The filtered object (self if inplace=True, otherwise a new object).
        """
        options = {
            "by": by,
            "values": values,
            "exclude": exclude,
            "keep_absent": keep_absent,
            "inplace": inplace,
        }
        return data_subset.subset_samples(self, **options)



[docs]
    def subset_features(
        self,
        *,
        featurelist=None,
        exclude: bool = False,
        inplace: bool = False,
    ) -> MicrobiomeData:
        """
        Subset features (OTUs/ASVs/bins/MAGs) by delegating to
        ``subset_features`` in ``io.subset``.

        Parameters
        ----------
        featurelist : list
            List of feature (OTU/ASV/bin) identifiers to keep or exclude.
        exclude : bool, default False
            If True, exclude values in featurelist instead of including them.
        inplace : bool, default False
            If True, mutate and return the same object. If False, return a new object.

        Returns
        -------
        MicrobiomeData
            The filtered object (self if inplace=True, otherwise a new object).
        """
        options = {
            "featurelist": featurelist,
            "exclude": exclude,
            "inplace": inplace,
        }
        return data_subset.subset_features(self, **options)



[docs]
    def subset_abundant(
        self,
        *,
        n: int = 25,
        method: Literal["sum", "mean"] = "mean",
        cutoff: float | None = None,
        exclude: bool = False,
        inplace: bool = False,
    ) -> MicrobiomeData:
        """
        Subset the most abundant features (OTUs/ASVs/bins/MAGs) by delegating
        to ``subset_abundant`` in ``io.subset``.

        Parameters
        ----------
        n : int, default 25
            Number of top features to keep (or exclude if `exclude=True`).
            Values outside [0, n_features] are clamped to the valid range.
        method : str, default 'mean'
            Reduction across samples of **relative abundance** per feature.
            - 'sum'  : total relative abundance across samples
            - 'mean' : mean relative abundance across samples
            - 'max' : max relative abundance in a sample
            - 'frequency' : proportion of samples in which the feature is detected
            NOTE(review): the type annotation only lists 'sum' and 'mean';
            confirm against the implementation which values are accepted.
        cutoff : float, default None
            If cutoff is specified as a percentage (from 0 to 100%), all features
            with a 'sum' or 'mean' relative abundance or 'frequency' of detection
            above this value will be kept, and the parameter n will be ignored.
        exclude : bool, default False
            If False (default), keep the top features.
            If True, exclude the top features (keep the rest).
        inplace : bool, default False
            If True, mutate the object and return it; otherwise, return a new object.

        Returns
        -------
        MicrobiomeData
            The filtered object (self if inplace=True, otherwise a new object).
        """
        options = {
            "n": n,
            "method": method,
            "cutoff": cutoff,
            "exclude": exclude,
            "inplace": inplace,
        }
        return data_subset.subset_abundant(self, **options)



[docs]
    def merge_samples(
        self,
        *,
        by: Union[List[str], str],
        values: Optional[list] = None,
        method: str = "sum",
        weight: str | None = None,
        keep_absent: bool = False,
        inplace: bool = False
    ) -> MicrobiomeData:
        """
        Merge samples based on metadata grouping by delegating to
        ``merge_samples`` in ``io.subset``.

        Parameters
        ----------
        by : str or list
            Column(s) in metadata used for grouping samples.
        values : list, optional
            Metadata values to keep. If None, all unique values in `by` are used.
        method : {'sum', 'mean'}, default 'sum'
            Aggregation method used when `weight=None`. If `weight` is provided,
            samples are merged using a weighted average based on the specified metadata column.
        weight : str, optional, default None
            Name of a numeric metadata column used for weighted merging.
            Within each group, weights are normalized to sum to 1 and used to calculate
            a weighted average of feature abundances. If None, samples are merged
            using the specified `method` ('sum' or 'mean').
        keep_absent : bool, default False
            If False, remove features with zero counts after merging.
        inplace : bool, default False
            If True, modify this object in place; if False, return a new object.

        Returns
        -------
        MicrobiomeData
            Object with merged samples. Returns `self` if ``inplace=True``, otherwise a new
            `MicrobiomeData` instance.

        Raises
        ------
        ValueError
            If metadata or the specified column is missing, or if no samples match the specified values.

        Examples
        --------
        >>> obj.merge_samples(by="Treatment", method="sum", inplace=True)
        >>> merged = obj.merge_samples(by="Site", method="mean")
        """
        options = {
            "by": by,
            "values": values,
            "method": method,
            "weight": weight,
            "keep_absent": keep_absent,
            "inplace": inplace,
        }
        return data_subset.merge_samples(self, **options)



[docs]
    def subset_taxa(
        self,
        *,
        subset_levels: Optional[Union[str, Sequence[str]]] = None,
        subset_patterns: Optional[Union[str, Sequence[str]]] = None,
        exclude: bool = False,
        case: bool = False,
        regex: bool = False,
        na: bool = False,
        match_type: Literal["contains", "fullmatch", "startswith", "endswith"] = "contains",
        inplace: bool = False,
    ) -> MicrobiomeData:
        """
        Subset features (OTUs/ASVs/bins/MAGs) based on taxonomic classification
        by delegating to ``subset_taxa`` in ``io.subset``.

        Parameters
        ----------
        subset_levels : str or sequence of str, optional
            Taxonomic column(s) in which to search for patterns. If None, all columns in `tax` are used.
        subset_patterns : str or sequence of str
            Text patterns to identify taxa to keep. If a single string is passed, it is used as the only pattern.
        exclude : bool, default False
            If True, return taxa that do NOT match the given patterns (i.e., complement).
        case : bool, default False
            If True, pattern matching is case-sensitive.
        regex : bool, default False
            If True, patterns are treated as regex. If False, patterns are escaped (literal match).
        na : bool, default False
            If True, NA values are treated as matches. If False, NA values are
            treated as non-matches. Empty or whitespace-only taxonomy entries
            are treated as missing (NA) during subsetting.
        match_type : {'contains','fullmatch','startswith','endswith'}, default 'contains'
            Matching behavior applied to the strings in selected columns.
        inplace : bool, default False
            If True, mutate and return the same object. If False, return a new object.

        Returns
        -------
        MicrobiomeData
            Filtered object with updated 'tab', 'tax', and 'seq'. 'meta' and 'tree' are passed through.

        Raises
        ------
        ValueError
            If taxonomy table is missing, no patterns are provided, or no matches are found.

        Examples
        --------
        >>> obj.subset_taxa(subset_levels="Genus", subset_patterns="Bacteroides", inplace=True)
        >>> filtered_obj = obj.subset_taxa(subset_patterns=["Bacteroides", "Clostridium"], exclude=True)
        """
        options = {
            "subset_levels": subset_levels,
            "subset_patterns": subset_patterns,
            "exclude": exclude,
            "case": case,
            "regex": regex,
            "na": na,
            "match_type": match_type,
            "inplace": inplace,
        }
        return data_subset.subset_taxa(self, **options)



[docs]
    def rarefy(
        self,
        *,
        depth: Union[int, str] = "min",
        random_state: Optional[Union[int, np.random.Generator]] = None,
        replacement: bool = False,
        inplace: bool = False,
        **kwargs,
    ) -> MicrobiomeData:
        """
        Rarefy the abundance table to a fixed sequencing depth.
    
        This method is a thin wrapper around :func:`io.subset.rarefy`. It performs
        random subsampling (with or without replacement) to equalize sequencing depth
        across samples, then drops features and samples that become zero.
    
        Parameters
        ----------
        depth : int or 'min', default 'min'
            Target sequencing depth per sample. If 'min', the minimum depth across
            samples is used.
        random_state : int | numpy.random.Generator, optional
            Random seed or Generator for reproducibility.
        replacement : bool, default False
            If True, sample with replacement (multinomial); otherwise sample
            without replacement.
        inplace : bool, default False
            If True, modify this object in place; if False, return a new object.
    
        Returns
        -------
        MicrobiomeData
            The rarefied object. Returns `self` if ``inplace=True``, otherwise a new
            `MicrobiomeData` instance.
    
        Notes
        -----
        - Rarefaction reduces sequencing depth variance across samples to facilitate
          certain diversity and dissimilarity analyses.
        - The exact algorithm and post‑processing (feature/sample pruning) are
          implemented in :func:`io.subset.rarefy`.
        - Index alignment and integrity are enforced via :meth:`_autocorrect` and
          :meth:`_validate` in the underlying implementation.
    
        Examples
        --------
        >>> obj.rarefy(depth=10000, seed=42, inplace=True)
        >>> rarefied_obj = obj.rarefy(depth='min', replacement=True)
        """
        if "seed" in kwargs:
            if random_state is not None:
                raise TypeError("Specify only one of 'random_state' or 'seed'.")
            random_state = kwargs.pop("seed")
        if kwargs:
            raise TypeError(f"Unexpected keyword arguments: {list(kwargs)}")

        return data_subset.rarefy(
            self,
            depth=depth,
            random_state=random_state,
            replacement=replacement,
            inplace=inplace
        )



[docs]
    def prune_tree(
        self,
        featurelist: Union[List[str], Set[str], Iterable[str], None] = None,
        reroot: bool = False,
        inplace: bool = False,
    ) -> MicrobiomeData:
        """
        Prune the tree to retain only branches whose leaves intersect with a given feature set,
        plus always keep the root branch.
    
        Parameters
        ----------
        featurelist : list of str or set of str or iterable of str, optional
            A collection of feature names to match against the leaves of each branch.
            If None, the method will attempt to use `self.tab.index.tolist()`.
        reroot : bool, default False
            If True, reroot the pruned tree at midpoint.
        inplace : bool, default False
            If True, modify this object in place; if False, return a new object.

        Returns
        -------
        MicrobiomeData
            The object with pruned tree. Returns `self` if ``inplace=True``, otherwise a new
            `MicrobiomeData` instance.
        """
        if self.tree is None:
            raise ValueError("'tree' is missing")
    
        if featurelist is None and self.tab is not None:
            featurelist = self.tab.index.tolist()
    
        if featurelist is None or len(featurelist) == 0:
            raise ValueError("Either 'featurelist' must be provided or 'self.tab' must have a valid index.")

        tree = phylo_func.dataframe_to_tree(self.tree)
        tree = phylo_func.subset_tree(tree, featurelist)
        tree = phylo_func.collapse_single_child_nodes(tree)
        if reroot:
            tree = phylo_func.reroot_midpoint(tree)
        tree = phylo_func.tree_to_dataframe(tree)

        if inplace:
            self.tree = tree
            return self
        else:
            new_obj = copy.deepcopy(self)
            new_obj.tree = tree
            return new_obj



[docs]
    def rename_features(
        self,
        name_type: str = 'OTU',
        name_dict: dict = None,
        inplace: bool = False,
    ) -> MicrobiomeData:
        """
        Rename feature identifiers (row indices).

        This method forwards its arguments unchanged to ``help_func.rename_features``.
        Features are renamed either from an explicit mapping (``name_dict``) or, when
        ``name_dict`` is None, to names of the form ``{name_type}{i}`` after sorting:

        - by mean relative abundance if `tab` (abundance table) is present;
        - by taxonomic order if `tax` is present and `tab` is absent.

        Parameters
        ----------
        name_type : str, default='OTU'
            Prefix for new feature names, e.g., 'OTU', 'ASV' (used if name_dict is None).
        name_dict : dict, default=None
            Mapping of existing names to new names, {'Old_name': 'New_name', ...}.
        inplace : bool, default=False
            If True, modify object in place.

        Returns
        -------
        MicrobiomeData
            The updated object. If `inplace=True`, returns self; otherwise, a new instance.
        """
        options = {
            "name_type": name_type,
            "name_dict": name_dict,
            "inplace": inplace,
        }
        return help_func.rename_features(self, **options)



[docs]
    def tax_prefix(
        self,
        add: bool = True,
        inplace: bool = False,
        custom_prefix: Dict[str, str] = None
    ) -> MicrobiomeData:
        """
        Add or remove rank prefixes (e.g. d__, p__) on taxonomic classifications.

        This method forwards its arguments unchanged to ``help_func.tax_prefix``.

        Parameters
        ----------
        add : bool, default=True
            If True, add prefix. If False, remove prefix.
        inplace : bool, default=False
            If True, modify object in place.
        custom_prefix : dict, default=None
            A dictionary with taxonomic levels as keys and prefixes as values.

        Returns
        -------
        MicrobiomeData
            The updated object. If `inplace=True`, returns self; otherwise, a new instance.
        """
        options = {
            "add": add,
            "inplace": inplace,
            "custom_prefix": custom_prefix,
        }
        return help_func.tax_prefix(self, **options)



[docs]
    def clean_tax(
        self,
        inplace: bool = False,
    ) -> MicrobiomeData:
        """
        Clean and standardize Greengenes2/GTDB taxonomy within a MicrobiomeData object.

        This method forwards to ``help_func.clean_taxonomy_table``. It is intended for
        taxonomy derived from Greengenes2 or GTDB: missing or ambiguous labels are
        normalized, GTDB letter suffixes (e.g., ``_A``, ``_B``) are preserved, and GTDB
        numeric node identifiers (e.g., ``_368345``) are removed at all taxonomic ranks.

        Parameters
        ----------
        inplace : bool, default=False
            If True, modify object in place.

        Returns
        -------
        MicrobiomeData
            The updated object. If `inplace=True`, returns self; otherwise, a new instance.

        Notes
        -----
        - Unknown or unassigned labels (e.g., "Unknown", "None", "unclassified")
          are normalized to ``NA``.
        - Prefixes such as d__, p__ are added by default.
          These can later be removed with :meth:`tax_prefix` if needed.
        """
        cleaned = help_func.clean_taxonomy_table(self, inplace=inplace)
        return cleaned


    
    #  ------------------------------------------------------------------------
    #  Utility methods
    #  ------------------------------------------------------------------------

[docs]
    @classmethod
    def load_example(
        cls, 
        example_name: str = "Modin_et_al_2025",
    ) -> MicrobiomeData:
        """
        Load a MicrobiomeData object from packaged example files.

        Parameters
        ----------
        example_name : str
            Name of the example to load. Options:

            - "Modin2025": Uses CoverM and GTDB-Tk output files from the Modin et al. study https://doi.org/10.1111/1751-7915.70238.
            - "Saheb-Alam2019_DADA2": Uses qiime2-dada2 output files from the Saheb-Alam et al. study https://doi.org/10.1111/1751-7915.13449.
            - "Saheb-Alam2019_Deblur": Uses qiime2-deblur output files from the Saheb-Alam et al. study https://doi.org/10.1111/1751-7915.13449.

        Returns
        -------
        MicrobiomeData
            An instance loaded with example data.

        Raises
        ------
        ValueError
            If the example_name is not recognized.
        """
        if example_name == "Modin2025" or "modin" in example_name.lower():
            # Example: CoverM and GTDB-Tk files
            base = "qdiv.example_data"
            tab_file = files(base).joinpath("Modin2025_CoverM.tsv")
            tax_file1 = files(base).joinpath("Modin2025_gtdbtk.ar53.summary.tsv")
            tax_file2 = files(base).joinpath("Modin2025_gtdbtk.bac120.summary.tsv")
            meta_file = files(base).joinpath("Modin2025_metadata.csv")
            tree_file = files(base).joinpath("Modin2025_tree.nwk")

            obj = cls()
            obj.add_tab_from_coverm(str(tab_file), detection_threshold=0.5, 
                                    first_sep="/", second_sep="_R")
            obj.add_tax_from_gtdbtk([str(tax_file1), str(tax_file2)])
            obj.add_meta(str(meta_file))
            obj.add_tree(str(tree_file))
            return obj

        elif example_name == "Saheb-Alam2019_DADA2" or ("alam" in example_name.lower() and "dada2" in example_name.lower()):
            # Example: CoverM and GTDB-Tk files
            base = "qdiv.example_data"
            tab_file = files(base).joinpath("Saheb-Alam2019_tab_dada2.tsv")
            tax_file = files(base).joinpath("Saheb-Alam2019_tax_dada2.tsv")
            seq_file = files(base).joinpath("Saheb-Alam2019_seq_dada2.fasta")
            meta_file = files(base).joinpath("Saheb-Alam2019_meta.csv")
            tree_file = files(base).joinpath("Saheb-Alam2019_tree_dada2.nwk")

            obj = cls()
            obj.add_tab(str(tab_file))
            obj.add_tax_from_qiime(str(tax_file))
            obj.add_seq_from_fasta(str(seq_file))
            obj.add_meta(str(meta_file))
            obj.add_tree(str(tree_file))
            return obj

        elif example_name == "Saheb-Alam2019_Deblur" or ("alam" in example_name.lower() and "deblur" in example_name.lower()):
            # Example: CoverM and GTDB-Tk files
            base = "qdiv.example_data"
            tab_file = files(base).joinpath("Saheb-Alam2019_tab_deblur.tsv")
            tax_file = files(base).joinpath("Saheb-Alam2019_tax_deblur.tsv")
            seq_file = files(base).joinpath("Saheb-Alam2019_seq_deblur.fasta")
            meta_file = files(base).joinpath("Saheb-Alam2019_meta.csv")
            tree_file = files(base).joinpath("Saheb-Alam2019_tree_deblur.nwk")

            obj = cls()
            obj.add_tab(str(tab_file))
            obj.add_tax_from_qiime(str(tax_file))
            obj.add_seq_from_fasta(str(seq_file))
            obj.add_meta(str(meta_file))
            obj.add_tree(str(tree_file))
            return obj

        elif example_name is None:
            obj = cls()
            return obj
            
        else:
            raise ValueError(
                f"Unknown example_name '{example_name}'. "
                "Available options: 'Modin2025', 'Saheb-Alam2019_DADA2', 'Saheb-Alam2019_Deblur'."
            )



[docs]
    def to_dict(self) -> Dict[str, Any]:
        """
        Return the data as a dictionary.

        Returns
        -------
        dict
            Dictionary with keys: 'tab', 'tax', 'meta', 'seq', 'tree'.
        """
        return {
            "tab": self.tab,
            "tax": self.tax,
            "meta": self.meta,
            "seq": self.seq,
            "tree": self.tree,
        }



[docs]
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> MicrobiomeData:
        """
        Create a MicrobiomeData object from a dictionary.
    
        Parameters
        ----------
        data : dict
            Dictionary with keys:
            - 'tab' : pd.DataFrame (required)
            - 'tax' : pd.DataFrame, optional
            - 'meta' : pd.DataFrame, optional
            - 'seq' : pd.DataFrame, optional
            - 'tree' : pd.DataFrame, optional
    
        Returns
        -------
        MicrobiomeData
            A new MicrobiomeData object initialized from the dictionary.
    
        Raises
        ------
        ValueError
            If 'tab' is missing or not a pandas DataFrame.
    
        Examples
        --------
        >>> my_dict = {
        ...     "tab": pd.DataFrame(...),
        ...     "tax": pd.DataFrame(...),
        ...     "meta": pd.DataFrame(...)
        ... }
        >>> obj = MicrobiomeData.from_dict(my_dict)
        """
        if "tab" not in data or not isinstance(data["tab"], pd.DataFrame):
            raise ValueError("Input dictionary must contain a 'tab' key with a pandas DataFrame.")
    
        return cls(
            tab=data.get("tab"),
            tax=data.get("tax"),
            meta=data.get("meta"),
            seq=data.get("seq"),
            tree=data.get("tree"),
        )


    def _autocorrect(self):
        """
        Automatically align indices between tables and warn the user if corrections are made.
        """
        # Sort index and fix index column name
        if self.tab is not None and len(self.tab) > 0:
            self.tab = help_func.sort_index_by_number(self.tab)
            self.tab.index.name = 'Feature'
        if self.seq is not None and len(self.seq) > 0:
            self.seq = help_func.sort_index_by_number(self.seq)
            self.seq.index.name = 'Feature'
        if self.tax is not None and len(self.tax) > 0:
            self.tax = help_func.sort_index_by_number(self.tax)
            self.tax.index.name = 'Feature'
        
        if self.tab is not None and self.tax is not None:
            common_features = self.tab.index.intersection(self.tax.index)
            dropped_tab = set(self.tab.index) - set(common_features)
            dropped_tax = set(self.tax.index) - set(common_features)
            if dropped_tab or dropped_tax:
                msg = (
                    f"Auto-correct: Subsetting 'tab' and 'tax' to {len(common_features)} common features.\n"
                    f"  Dropped {len(dropped_tab)} from 'tab': {list(dropped_tab)[:5]}{'...' if len(dropped_tab) > 5 else ''}\n"
                    f"  Dropped {len(dropped_tax)} from 'tax': {list(dropped_tax)[:5]}{'...' if len(dropped_tax) > 5 else ''}\n"
                    "  (Check your input files for consistent feature names.)"
                )
                warnings.warn(msg, UserWarning)
                self.tab = self.tab.loc[common_features]
                self.tax = self.tax.loc[common_features]

        # Align features (rows) between tab and seq
        if self.tab is not None and self.seq is not None:
            common_features = self.tab.index.intersection(self.seq.index)
            dropped_tab = set(self.tab.index) - set(common_features)
            dropped_seq = set(self.seq.index) - set(common_features)
            if dropped_tab or dropped_seq:
                msg = (
                    f"Auto-correct: Subsetting 'tab' and 'seq' to {len(common_features)} common features.\n"
                    f"  Dropped {len(dropped_tab)} from 'tab': {list(dropped_tab)[:5]}{'...' if len(dropped_tab) > 5 else ''}\n"
                    f"  Dropped {len(dropped_seq)} from 'seq': {list(dropped_seq)[:5]}{'...' if len(dropped_seq) > 5 else ''}\n"
                    "  (Check your input files for consistent feature names.)"
                )
                warnings.warn(msg, UserWarning)
                self.tab = self.tab.loc[common_features]
                self.seq = self.seq.loc[common_features]
                self.tab.index.name = 'Features'
                self.seq.index.name = 'Features'

        # Align samples (columns) between tab and meta
        if self.tab is not None and self.meta is not None:
            common_samples = self.tab.columns.intersection(self.meta.index)
            dropped_tab = set(self.tab.columns) - set(common_samples)
            dropped_meta = set(self.meta.index) - set(common_samples)
            if dropped_tab or dropped_meta:
                msg = (
                    f"Auto-correct: Subsetting 'tab' columns and 'meta' index to {len(common_samples)} common samples.\n"
                    f"  Dropped {len(dropped_tab)} from 'tab': {list(dropped_tab)[:5]}{'...' if len(dropped_tab) > 5 else ''}\n"
                    f"  Dropped {len(dropped_meta)} from 'meta': {list(dropped_meta)[:5]}{'...' if len(dropped_meta) > 5 else ''}\n"
                    "  (Check your input files for consistent sample names.)"
                )
                warnings.warn(msg, UserWarning)
                self.tab = self.tab[common_samples]
                self.meta = self.meta.loc[common_samples]
            self.tab = self.tab[self.meta.index]


    def _validate(self):
        """
        Internal validation to ensure index alignment, uniqueness, and data integrity.
        Raises ValueError if inconsistencies or duplicates are found.
        """
        if self.tab is not None:
            # Check for empty tab
            if len(self.tab) == 0:
                raise ValueError("Features missing in tab.")
            if len(self.tab.columns) == 0:
                raise ValueError("Samples missing in tab.")
    
            # Check for duplicate feature names (rows) in tab
            if self.tab.index.has_duplicates:
                dups = self.tab.index[self.tab.index.duplicated()].unique().tolist()
                raise ValueError(f"Duplicate feature names in tab: {dups}")
    
            # Check for duplicate sample names (columns) in tab
            if self.tab.columns.has_duplicates:
                dups = self.tab.columns[self.tab.columns.duplicated()].unique().tolist()
                raise ValueError(f"Duplicate sample names in tab: {dups}")
    
            # Taxonomy checks
            if self.tax is not None:
                if len(self.tax) == 0:
                    raise ValueError("Features missing in tax.")
                if self.tax.index.has_duplicates:
                    dups = self.tax.index[self.tax.index.duplicated()].unique().tolist()
                    raise ValueError(f"Duplicate feature names in tax: {dups}")
                if not self.tab.index.equals(self.tax.index):
                    raise ValueError("Indices of 'tab' and 'tax' do not match.")
    
            # Sequence checks
            if self.seq is not None:
                if len(self.seq) == 0:
                    raise ValueError("Features missing in seq.")
                if self.seq.index.has_duplicates:
                    dups = self.seq.index[self.seq.index.duplicated()].unique().tolist()
                    raise ValueError(f"Duplicate feature names in seq: {dups}")
                if not self.tab.index.equals(self.seq.index):
                    raise ValueError("Indices of 'tab' and 'seq' do not match.")
    
            # Metadata checks
            if self.meta is not None:
                if len(self.meta) == 0:
                    raise ValueError("Samples missing in meta.")
                if self.meta.index.has_duplicates:
                    dups = self.meta.index[self.meta.index.duplicated()].unique().tolist()
                    raise ValueError(f"Duplicate sample names in meta: {dups}")
                if not self.tab.columns.equals(self.meta.index):
                    raise ValueError("Sample names in 'tab' and 'meta' do not match.")
    
            # Tree checks
            if self.tree is not None:
                tab_features = set(self.tab.index)
                tree_nodes = set(self.tree['nodes'])
                if not tab_features.issubset(tree_nodes):
                    raise ValueError("Not all tab features are found among tree nodes.")

                # Every non-null parent must exist among nodes
                t = self.tree
                parent_vals = t["parent"].dropna().astype(str)
                node_vals = t["nodes"].astype(str)
                missing_parents = parent_vals[~parent_vals.isin(node_vals)]

                if not missing_parents.empty:
                    # Build an aligned boolean mask on the FULL parent column
                    mask_bad = t["parent"].astype(str).isin(set(missing_parents))
                    bad_rows = t.loc[mask_bad].head(5)  # <-- aligned to t.index
                    raise ValueError(
                        "Tree is inconsistent: some rows reference parents not present as nodes. "
                        f"Examples: {bad_rows[['nodes','parent']].to_dict('records')}. "
                        "Reload the tree or run prune_tree() to rebuild a valid induced subtree."
                    )

        if self.tax is not None:
            if len(self.tax) == 0:
                raise ValueError("Features missing in tax.")
            if len(self.tax.columns) == 0:
                raise ValueError("Tax levels in tab.")

        if self.meta is not None:
            if len(self.meta) == 0:
                raise ValueError("Samples missing in meta.")

        if self.seq is not None:
            if len(self.seq) == 0:
                raise ValueError("Features missing in seq.")

        if self.tree is not None:
            if len(self.tree) == 0:
                raise ValueError("Features missing in tree.")

    def __repr__(self):
        n_features = self.tab.shape[0] if self.tab is not None else 0
        n_samples = self.tab.shape[1] if self.tab is not None else 0
        return (f"<MicrobiomeData: {n_features} features, {n_samples} samples, "
                f"tax={'yes' if self.tax is not None else 'no'}, "
                f"meta={'yes' if self.meta is not None else 'no'}, "
                f"seq={'yes' if self.seq is not None else 'no'}, "
                f"tree={'yes' if self.tree is not None else 'no'}>")