# Source code for fast_select.TuRF

from __future__ import annotations
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.utils.validation import check_array, check_is_fitted, validate_data



[docs]
class TuRF(TransformerMixin, BaseEstimator):
    """
    A meta-estimator that implements the Iterative Relief (TuRF) algorithm.

    TuRF iteratively removes features with the lowest scores as determined by a
    base Relief-based estimator. This process is repeated until a desired
    number of features remains, which can improve robustness against noise.

    This implementation is designed to wrap any scikit-learn compatible
    estimator that provides a `feature_importances_` attribute after fitting,
    such as the `ReliefF`, `SURF`, or `MultiSURF` classes in this library.

    Parameters
    ----------
    estimator : estimator object
        The base estimator to use for scoring features at each iteration.
        This object is cloned and not modified.
    n_features_to_select : int, default=10
        The final number of features to select. Must be a positive integer.
        If it is greater than or equal to the number of input features, no
        feature is removed and every feature is selected.
    pct_remove : float, default=0.1
        The fraction of the remaining features to remove at each iteration.
        Must be strictly between 0 and 1. At least one feature is removed per
        iteration, and the last removal is trimmed so that exactly
        `n_features_to_select` features remain.
    n_iterations : int or None, default=None
        The maximum number of removal iterations to run. If None, the process
        continues until the number of features is less than or equal to
        `n_features_to_select`. If the iteration limit is reached while more
        than `n_features_to_select` features remain, the best
        `n_features_to_select` of the remaining features (by the most recent
        scores) are kept.
    verbose : bool, default=False
        Controls whether progress updates are printed during the fit.
        Limited benefit currently, will be expanded in future versions.

    Attributes
    ----------
    n_features_in_ : int
        The number of features seen during `fit`.
    feature_importances_ : ndarray of shape (n_features_in_,)
        The feature importance scores calculated by the base estimator on the
        **full, original feature set** during the first iteration.
    top_features_ : ndarray of shape (min(n_features_to_select, n_features_in_),)
        The column indices (into the original feature set) of the selected
        features, in ascending index order, so that `transform` preserves the
        original column order.
    """

    def __init__(
        self,
        estimator,
        n_features_to_select: int = 10,
        pct_remove: float = 0.1,
        n_iterations: int | None = None,
        verbose: bool = False,
    ):
        self.estimator = estimator
        self.n_features_to_select = n_features_to_select
        self.pct_remove = pct_remove
        self.n_iterations = n_iterations
        self.verbose = verbose

    def fit(self, X: np.ndarray, y: np.ndarray):
        """
        Fits the TuRF model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.
        y : array-like of shape (n_samples,)
            The target values (class labels).

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        ValueError
            If `pct_remove` is not strictly between 0 and 1, or if
            `n_features_to_select` is not a positive integer.
        """
        X, y = validate_data(
            self, X, y, y_numeric=True, dtype=np.float64, ensure_2d=True,
        )
        self.n_features_in_ = X.shape[1]
        if not 0 < self.pct_remove < 1:
            raise ValueError("pct_remove must be between 0 and 1.")

        # A non-positive target would shrink the subset to zero columns (or
        # never terminate), and a float target breaks slicing below, so
        # reject both up front with a clear message.
        n_select = self.n_features_to_select
        if (
            isinstance(n_select, bool)
            or not isinstance(n_select, (int, np.integer))
            or n_select < 1
        ):
            raise ValueError("n_features_to_select must be a positive integer.")
        n_select = int(n_select)

        # Positions in `current_scores` always line up with the entries of
        # `active_feature_indices`, which hold original column indices.
        active_feature_indices = np.arange(self.n_features_in_)
        base_estimator = clone(self.estimator)

        # The first fit on the full feature set provides the public scores.
        base_estimator.fit(X, y)
        self.feature_importances_ = base_estimator.feature_importances_.copy()

        current_scores = self.feature_importances_.copy()

        iteration = 0
        while len(active_feature_indices) > n_select:
            if self.n_iterations is not None and iteration >= self.n_iterations:
                break

            n_active = len(active_feature_indices)
            # Remove at least one feature, but never go below the target.
            n_to_remove = max(1, int(n_active * self.pct_remove))
            n_to_remove = min(n_to_remove, n_active - n_select)

            # argsort is ascending, so the first entries are the worst-scoring
            # positions within the current subset.
            worst_positions = np.argsort(current_scores)[:n_to_remove]
            active_feature_indices = np.delete(
                active_feature_indices, worst_positions
            )

            if self.verbose:
                print(f"Iteration {iteration}: {len(active_feature_indices)} features remaining.")

            # Re-score the surviving features on the reduced data.
            base_estimator.fit(X[:, active_feature_indices], y)
            current_scores = np.asarray(base_estimator.feature_importances_)

            iteration += 1

        # If `n_iterations` stopped the loop early, more than `n_select`
        # features may remain: keep only the best ones by the latest scores.
        # When the loop ran to completion this slice is a no-op.
        best_positions = np.argsort(current_scores)[::-1][:n_select]
        # Store indices in ascending order so `transform` keeps the original
        # column order of X.
        self.top_features_ = np.sort(active_feature_indices[best_positions])
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Reduces X to the selected features.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features_in_)
            The input samples; must have the same number of features as the
            data passed to `fit`.

        Returns
        -------
        X_reduced : ndarray of shape (n_samples, len(top_features_))
            The columns of X listed in `top_features_`, in original order.
        """
        check_is_fitted(self)
        X = validate_data(
            self, X,
            reset=False,
            dtype=[np.float64, np.float32]
        )

        return X[:, self.top_features_]

    def fit_transform(self, X: np.ndarray, y: np.ndarray) -> np.ndarray:
        """
        Fit to data, then transform it.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.
        y : array-like of shape (n_samples,)
            The target values (class labels).

        Returns
        -------
        X_reduced : ndarray of shape (n_samples, len(top_features_))
            X restricted to the features selected during fitting.
        """
        self.fit(X, y)
        return self.transform(X)