# Source module: habit.core.habitat_analysis.strategies.direct_pooling_strategy

"""
Direct pooling strategy: concatenate all voxel features across subjects and cluster once.
Refactored to use HabitatPipeline with template method pattern.
"""

from typing import TYPE_CHECKING, Optional
import pandas as pd

from .base_strategy import BaseClusteringStrategy
from ..pipelines.base_pipeline import HabitatPipeline

if TYPE_CHECKING:
    from habit.core.habitat_analysis.habitat_analysis import HabitatAnalysis


class DirectPoolingStrategy(BaseClusteringStrategy):
    """
    Direct pooling strategy using HabitatPipeline.

    ## Overview
    This strategy pools (concatenates) voxel features from ALL subjects into a
    single feature matrix before clustering. This enables the discovery of
    population-level tissue patterns that are representative across the entire
    cohort.

    ## Workflow
    1) Voxel feature extraction (Pipeline Step 1) - extract features for each subject
    2) Subject-level preprocessing (Pipeline Step 2) - normalize within each subject
    3) Concatenate all voxels (Pipeline Step 3) - merge all subjects' voxels into one matrix
    4) Group-level preprocessing (Pipeline Step 4) - apply population-level transformations
    5) Population clustering (Pipeline Step 5) - cluster all voxels -> discover habitats

    ## Why Pool All Voxels?
    **Rationale**: By pooling voxels from all subjects, the clustering algorithm
    can discover tissue patterns that are **consistent and reproducible** across
    the entire population. This approach is particularly effective for:

    - Discovering common biological phenotypes (e.g., "highly perfused tissue"
      vs "necrotic tissue")
    - Identifying dominant habitat patterns shared by multiple subjects
    - Quickly prototyping and exploring population-level tissue heterogeneity

    ## About Data Leakage
    **Important**: This strategy is **NOT equivalent to label leakage** in the
    traditional machine learning sense. Here's why:

    - **Unsupervised Learning**: Habitat discovery is an UNSUPERVISED process
      (no labels involved)
    - **Feature Space Only**: Pooling occurs in the FEATURE space (imaging
      intensities), not the label space (clinical outcomes)
    - **Pre-modeling Step**: Habitat segmentation is performed BEFORE building
      predictive models
    - **Pipeline Isolation**: When used in predictive workflows, the clustering
      model is fitted on training data only and applied to test data via the
      saved Pipeline

    **Analogy**: It's similar to performing k-means clustering on pooled MRI
    intensities to discover tissue types—the clustering doesn't "know" which
    subjects are diseased vs healthy.

    ## Use Cases
    **Recommended for**:

    - Exploratory analysis to discover dominant tissue patterns
    - Fast prototyping and hypothesis generation
    - Cohorts with moderate inter-subject variability
    - Studies focusing on population-level habitat characterization

    **Not recommended for**:

    - Extremely heterogeneous cohorts where individual differences dominate
    - Small sample sizes (prefer Two-Step or One-Step strategies)
    - Studies requiring subject-specific habitat definitions

    ## Parallel Processing
    This strategy supports parallel processing through HabitatPipeline:

    - **config.processes**: Controls parallel workers for individual-level
      steps (Steps 1-2)
    - **Group-level steps (3-5)**: Process all subjects together (not parallelized)
    """

    def __init__(self, analysis: "HabitatAnalysis") -> None:
        """
        Initialize direct pooling strategy.

        Args:
            analysis: HabitatAnalysis instance with shared utilities
        """
        super().__init__(analysis)
        # Pipeline slot; starts empty. NOTE(review): presumably populated by a
        # run/fit method elsewhere in this class — not visible in this chunk.
        self.pipeline: Optional[HabitatPipeline] = None