"""
Ensemble Model Wrapper for K-Fold Cross Validation.
Allows treating a collection of K-fold models as a single scikit-learn estimator.
"""
from typing import List, Any
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
class HabitEnsembleModel(BaseEstimator, ClassifierMixin):
"""
An ensemble wrapper that aggregates predictions from multiple fitted pipelines.
Used primarily to wrap K-Fold cross-validation results into a single predict-ready object.
Attributes:
estimators (List[Any]): List of fitted scikit-learn pipelines/models.
voting (str): 'soft' (average probabilities) or 'hard' (majority vote). Default 'soft'.
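
    Example:
        A minimal usage sketch; ``fold_models`` (a list of pipelines already
        fitted on the K folds) and ``X_new`` are assumed to exist::

            ensemble = HabitEnsembleModel(estimators=fold_models, voting='soft')
            probabilities = ensemble.predict_proba(X_new)
            labels = ensemble.predict(X_new)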
"""
    def __init__(self, estimators: List[Any], voting: str = 'soft'):
        self.estimators = estimators
        self.voting = voting
    def fit(self, X, y=None):
        """
        No-op fit method.

        The estimators passed to ``__init__`` are assumed to be ALREADY fitted,
        which allows the K-Fold results to be wrapped directly. A refit could be
        implemented here if needed, but the intention is to reuse the already
        trained folds.
        """
        return self
    def predict_proba(self, X):
        """
        Predict class probabilities for X by averaging over all estimators.

        Returns:
            array-like of shape (n_samples, n_classes)
        """
        if not self.estimators:
            raise ValueError("No estimators provided to HabitEnsembleModel")
        # Collect probabilities from all estimators.
        all_probs = []
        for est in self.estimators:
            if hasattr(est, "predict_proba"):
                all_probs.append(est.predict_proba(X))
            else:
                raise ValueError(f"Estimator {type(est)} does not support predict_proba")
        # Each item has shape (n_samples, n_classes); stacking gives
        # (n_estimators, n_samples, n_classes), and averaging over axis 0
        # yields the soft-voting probabilities.
        return np.mean(np.stack(all_probs), axis=0)
    def predict(self, X):
        """
        Predict class labels for X.
        """
        if self.voting == 'soft':
            # Soft voting: average probabilities, then take the argmax.
            probs = self.predict_proba(X)
            # Map the argmax column index back to class labels via classes_
            # when available; otherwise fall back to the raw index, which
            # matches the labels for standard 0/1 binary targets.
            if hasattr(self.estimators[0], 'classes_'):
                indices = np.argmax(probs, axis=1)
                return self.estimators[0].classes_[indices]
            return np.argmax(probs, axis=1)
        elif self.voting == 'hard':
            # Hard voting: collect each estimator's labels and take the
            # per-sample majority vote.
            all_preds = np.array([est.predict(X) for est in self.estimators]).T  # (n_samples, n_estimators)
            final_preds = []
            for sample_preds in all_preds:
                vals, counts = np.unique(sample_preds, return_counts=True)
                final_preds.append(vals[np.argmax(counts)])
            return np.array(final_preds)
        else:
            raise ValueError(f"Unknown voting method: {self.voting}")
    @property
    def classes_(self):
        """Delegate the classes_ attribute to the first estimator."""
        if self.estimators and hasattr(self.estimators[0], 'classes_'):
            return self.estimators[0].classes_
        return None
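

# Minimal usage sketch, assuming scikit-learn's cross_validate(..., return_estimator=True)
# is used to obtain one fitted pipeline per fold; the pipeline and toy data below are
# illustrative only, not part of the wrapper itself.
if __name__ == "__main__":
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_validate
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    X, y = make_classification(n_samples=200, n_features=10, random_state=0)
    pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

    # Fit one pipeline per fold and keep the fitted estimators.
    cv_results = cross_validate(pipeline, X, y, cv=5, return_estimator=True)
    fold_models = list(cv_results["estimator"])

    # Wrap the fitted folds into a single predict-ready ensemble.
    ensemble = HabitEnsembleModel(estimators=fold_models, voting="soft")
    print(ensemble.predict_proba(X[:5]))
    print(ensemble.predict(X[:5]))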