Source code for fast_select.Chi2

from __future__ import annotations
import numpy as np
from numba import njit, prange
from scipy.stats import chi2 as chi2_dist
from sklearn.utils.validation import check_array, check_X_y

@njit(fastmath=True)
def _compute_observed_and_feature_counts(X, y_mapped, n_features, n_classes): # pragma: no cover
    """
    Efficiently computes the observed frequency matrix and feature counts
    in a single pass over the data.
    """
    observed = np.zeros((n_classes, n_features), dtype=np.float64)
    feature_counts = np.zeros(n_features, dtype=np.float64)
    n_samples = X.shape[0]
    for i in range(n_samples):
        class_idx = y_mapped[i]
        for j in range(n_features):
            val = X[i, j]
            observed[class_idx, j] += val
            feature_counts[j] += val
    return observed, feature_counts

@njit(parallel=True, fastmath=True)
def _chi2_core(observed, class_freqs, feature_counts, n_samples): # pragma: no cover
    """
    Calculates chi2 stats from the pre-computed observed matrix.
    The loop over features is parallelized.
    """
    n_classes, n_features = observed.shape
    chi2_stats = np.zeros(n_features, dtype=np.float64)
    
    for i in prange(n_features):
        # Skip features that have a total count of 0
        if feature_counts[i] == 0:
            continue
            
        expected_i = class_freqs * feature_counts[i] / n_samples
        term = 0.0
        for k in range(n_classes):
            observed_ik = observed[k, i]
            expected_ik = expected_i[k]
            if expected_ik > 1e-12:
                term += (observed_ik - expected_ik)**2 / expected_ik
        chi2_stats[i] = term
        
    return chi2_stats


[docs]
def chi2(X: np.ndarray, y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """
    Computes Chi-squared statistics between each feature and the target vector.

    This function calculates the Chi-squared test for independence between each
    non-negative feature and the class labels (similar to SciKit-Learn). It is suitable for features
    that represent frequencies or counts (e.g., word counts in text classification).

    Args:
        X (np.ndarray): The input sample matrix of shape (n_samples, n_features).
                        Must contain non-negative, count-based feature values.
        y (np.ndarray): The target vector of class labels, shape (n_samples,).

    Returns:
        tuple[np.ndarray, np.ndarray]: A tuple containing:
            - chi2_stats: The Chi-squared statistics for each feature.
            - p_values: The p-values for each feature.
    """
    X = check_array(X, dtype=[np.float64, np.float32], ensure_2d=True)

    X, y = check_X_y(X, y, y_numeric=True)
    
    if np.any(X < 0):
        raise ValueError("Input matrix X must contain non-negative values.")

    n_samples, n_features = X.shape
    labels, y_mapped = np.unique(y, return_inverse=True)
    n_classes = len(labels)

    if n_classes < 2:
        return np.zeros(n_features, dtype=np.float64), np.ones(n_features, dtype=np.float64)

    class_freqs = np.bincount(y_mapped).astype(np.float64)
    
    observed, feature_counts = _compute_observed_and_feature_counts(
        X, y_mapped, n_features, n_classes
    )
    
    chi2_stats = _chi2_core(observed, class_freqs, feature_counts, n_samples)
    
    dof = n_classes - 1
    p_values = chi2_dist.sf(chi2_stats, dof)
    
    return chi2_stats, p_values