Source code for peach.pp.basic

"""
Basic preprocessing functions for archetypal analysis.

This module provides essential data loading, synthetic data generation,
and pathway analysis utilities for Deep Archetypal Analysis workflows.

Main Functions:
- load_data(): Load AnnData from file with proper format validation
- generate_synthetic(): Create realistic synthetic datasets for testing
- prepare_training(): Convert AnnData to PyTorch DataLoader
- prepare_atacseq(): TF-IDF + LSI preprocessing for scATAC-seq data
- load_pathway_networks(): Access MSigDB pathway collections
- compute_pathway_scores(): Calculate pathway activity scores

All functions follow scVerse conventions with AnnData-centric workflows.
"""

from anndata import AnnData
from torch.utils.data import DataLoader

from .._core.utils.atacseq import compute_lsi as _compute_lsi
from .._core.utils.atacseq import tfidf_normalize as _tfidf_normalize
from .._core.utils.convex_synth_data import generate_convex_data as _generate_convex_data
from .._core.utils.gene_analysis import compute_pathway_scores as _compute_pathway_scores
from .._core.utils.gene_analysis import load_pathway_networks as _load_pathway_networks
from .._core.utils.load_anndata import create_dataloader_from_anndata as _create_dataloader

# Import existing battle-tested functions
from .._core.utils.load_anndata import load_anndata as _load_anndata


[docs] def load_data(path: str, use_raw: bool = True, dim_reduction_key: str = "X_PCA", batch_size: int = 128) -> AnnData: """Load AnnData for archetypal analysis. Note: Use scanpy.pp.pca() to compute PCA coordinates after loading. Parameters ---------- path : str Path to the data file (matches _core parameter name) use_raw : bool, default: True Whether to use raw data dim_reduction_key : str, default: "X_PCA" Key for dimension reduction in adata.obsm batch_size : int, default: 128 Batch size for data loading Returns ------- AnnData Loaded data. Use sc.pp.pca(adata) to add PCA coordinates. """ return _load_anndata(path=path, use_raw=use_raw, dim_reduction_key=dim_reduction_key, batch_size=batch_size)
[docs] def generate_synthetic( n_points: int = 1000, n_dimensions: int = 50, n_archetypes: int = 4, noise: float = 0.1, *, seed: int = 1205, archetype_type: str = "random", scale: float = 20.0, return_torch: bool = True, ) -> AnnData: """Generate synthetic convex data for testing. Parameters ---------- n_points : int, default: 1000 Number of data points to generate (matches _core parameter) n_dimensions : int, default: 50 Number of dimensions/features (matches _core parameter) n_archetypes : int, default: 4 Number of archetypes noise : float, default: 0.1 Noise level (matches _core parameter) seed : int, default: 1205 Random seed for reproducibility archetype_type : str, default: "random" Type of archetype generation ('random', 'corners', 'sphere') scale : float, default: 20.0 Scale factor for data generation return_torch : bool, default: True Whether to return PyTorch tensors Returns ------- AnnData Synthetic data with ground truth archetypes in .uns """ # Generate the synthetic data (returns tuple of tensors) data, archetypes = _generate_convex_data( n_points=n_points, n_dimensions=n_dimensions, n_archetypes=n_archetypes, noise=noise, seed=seed, archetype_type=archetype_type, scale=scale, return_torch=return_torch, ) # Convert to AnnData from anndata import AnnData adata = AnnData(X=data.numpy() if return_torch else data) adata.var_names = [f"Feature_{i}" for i in range(n_dimensions)] adata.obs_names = [f"Cell_{i}" for i in range(n_points)] # Store ground truth archetypes adata.uns["true_archetypes"] = archetypes.numpy() if return_torch else archetypes # Compute proper PCA coordinates (like real data workflow) import scanpy as sc sc.pp.pca(adata, n_comps=min(50, n_dimensions - 1)) # Standard PCA computation return adata
[docs] def prepare_training( adata: AnnData, batch_size: int = 128, shuffle: bool = True, pca_key: str = None, num_workers: int | str = "auto", pin_memory: bool | str = "auto", persistent_workers: bool | str = "auto", prefetch_factor: int = 2, ) -> DataLoader: """Create DataLoader from AnnData for training with HPC optimizations. Parameters ---------- adata : AnnData Annotated data object with PCA coordinates batch_size : int, default: 128 Batch size for training shuffle : bool, default: True Whether to shuffle data in DataLoader pca_key : str, default: None Key in adata.obsm containing PCA coordinates (auto-detected if None) num_workers : int or 'auto', default: 'auto' Number of subprocesses for data loading. 'auto' detects optimal value based on environment (0 for Apple Silicon, 6 for HPC, 2 for local) pin_memory : bool or 'auto', default: 'auto' Use pinned memory for faster GPU transfer. 'auto' sets True if CUDA available persistent_workers : bool or 'auto', default: 'auto' Keep workers alive between epochs. 'auto' sets True if num_workers > 0 prefetch_factor : int, default: 2 Number of batches loaded in advance by each worker Returns ------- DataLoader PyTorch DataLoader optimized for the execution environment Examples -------- >>> # Auto-detect optimal settings >>> dataloader = peach.pp.prepare_training(adata) >>> # Force HPC settings >>> dataloader = peach.pp.prepare_training(adata, num_workers=8, pin_memory=True) >>> # Minimal settings for debugging >>> dataloader = peach.pp.prepare_training(adata, num_workers=0) """ return _create_dataloader( adata=adata, batch_size=batch_size, shuffle=shuffle, pca_key=pca_key, num_workers=num_workers, pin_memory=pin_memory, persistent_workers=persistent_workers, prefetch_factor=prefetch_factor, )
[docs] def load_pathway_networks( sources: list[str] = ["c5_bp"], *, organism: str = "human", geneset_repo: str = "msigdb", verbose: bool = True, **kwargs, ): """Load pathway networks from MSigDB or OmniPath. Parameters ---------- sources : List[str], default: ["c5_bp"] Pathway sources to load. MSigDB collections: 'hallmark', 'c2_cp', 'c2_cgp', 'c3_mir', 'c5_bp', 'c5_cc', 'c5_mf', 'c8' organism : str, default: "human" Organism to load pathways for: 'human' or 'mouse' geneset_repo : str, default: "msigdb" Repository to use: 'msigdb' (recommended) or 'omnipath' verbose : bool, default: True Whether to print loading progress **kwargs Additional arguments passed to load_pathway_networks Returns ------- pd.DataFrame Pathway network with 'source', 'target', 'pathway' columns """ return _load_pathway_networks( sources=sources, organism=organism, geneset_repo=geneset_repo, verbose=verbose, **kwargs )
[docs] def compute_pathway_scores( adata: AnnData, net=None, use_layer: str = None, obsm_key: str = "pathway_scores", verbose: bool = True ) -> None: """Compute pathway activity scores using MSigDB pathways. Parameters ---------- adata : AnnData Annotated data object net : pd.DataFrame, optional Pathway network dataframe. If None, will load using sources parameter use_layer : str, optional Layer in adata to use for scoring obsm_key : str, default: "pathway_scores" Key in adata.obsm to store pathway scores verbose : bool, default: True Whether to print progress """ # Load pathway networks if not provided if net is None: net = _load_pathway_networks(sources=["c5_bp"], verbose=verbose) # Compute scores and store in AnnData _compute_pathway_scores(adata=adata, net=net, use_layer=use_layer, obsm_key=obsm_key, verbose=verbose)
[docs] def prepare_atacseq( adata: AnnData, *, n_components: int = 50, drop_first: bool = True, log_tf: bool = True, store_key: str = "X_lsi", random_state: int = 42, ) -> None: """TF-IDF + LSI preprocessing for scATAC-seq peak count data. Computes TF-IDF normalization followed by Latent Semantic Indexing (truncated SVD), the standard dimensionality reduction for chromatin accessibility data. The resulting embeddings can be used directly with ``pc.tl.train_archetypal(adata, pca_key='X_lsi')``. Parameters ---------- adata : AnnData Annotated data object with peak count matrix in ``adata.X``. Typically a sparse [n_cells, n_peaks] matrix from scATAC-seq. n_components : int, default: 50 Number of LSI components to compute. 30-50 is standard for scATAC-seq. drop_first : bool, default: True Drop first SVD component. The first component in scATAC-seq LSI typically captures sequencing depth rather than biological signal. log_tf : bool, default: True Use log(1 + TF) variant of term frequency. Standard in scATAC-seq preprocessing to reduce influence of high-count peaks. store_key : str, default: "X_lsi" Key in ``adata.obsm`` to store the LSI embeddings. random_state : int, default: 42 Random seed for reproducibility of truncated SVD. Returns ------- None Modifies ``adata`` in place: - ``adata.obsm[store_key]``: LSI embeddings [n_cells, n_components] - ``adata.uns['lsi']``: dict with 'variance_ratio' and 'components' Examples -------- >>> import peach as pc >>> import scanpy as sc >>> adata = sc.read_h5ad("scatac_peaks.h5ad") >>> pc.pp.prepare_atacseq(adata, n_components=30) >>> results = pc.tl.train_archetypal(adata, n_archetypes=5, pca_key="X_lsi") """ import scipy.sparse as sp X = adata.X if not sp.issparse(X): X = sp.csr_matrix(X) n_cells, n_peaks = X.shape density = X.nnz / (n_cells * n_peaks) print(f" scATAC-seq preprocessing: {n_cells} cells x {n_peaks} peaks ({density:.1%} dense)") # Step 1: TF-IDF normalization print(" Computing TF-IDF normalization...") X_tfidf = _tfidf_normalize(X, log_tf=log_tf) # Step 2: LSI via truncated SVD n_actual = min(n_components, min(n_cells, n_peaks) - 2) if n_actual < n_components: print(f" Adjusted n_components from {n_components} to {n_actual} (matrix rank limit)") n_components = n_actual print(f" Computing LSI with {n_components} components (drop_first={drop_first})...") embeddings, variance_ratio, components = _compute_lsi( X_tfidf, n_components=n_components, drop_first=drop_first, random_state=random_state, ) # Store results in AnnData adata.obsm[store_key] = embeddings adata.uns["lsi"] = { "variance_ratio": variance_ratio, "components": components, } total_var = variance_ratio.sum() * 100 print(f" LSI complete: {embeddings.shape[1]} components, {total_var:.1f}% variance explained") print(f" Stored in adata.obsm['{store_key}']")