Skip to content

Cross-Validation API

cross_val_score

cross_val_score(
    similarity_matrix: ndarray,
    estimator: BaseEstimator | None = None,
    param_grid: dict[str, list] | None = None,
    n_repeats: int = 5,
    sampling_fraction: float = 0.8,
    estimate_sampling_fraction: bool | dict = False,
    sampling_selection: str = "mean",
    random_state: int = 0,
    verbose: int = 1,
    n_jobs: int = -1,
    missing_values: float | None = np.nan,
    fit_final_estimator: bool = False,
) -> GridSearchCV

Cross-validate any estimator for matrix completion.

Generic cross-validation function that works with SRF or any sklearn-compatible estimator with a .reconstruct() method.

Parameters:

Name Type Description Default
similarity_matrix ndarray

Symmetric similarity matrix to cross-validate

required
estimator BaseEstimator or None

Estimator to cross-validate. If None, uses SRF(random_state=random_state). Can be a single estimator or a Pipeline. Must have a .reconstruct() method.

None
param_grid dict or None

Dictionary with parameter names (str) as keys and lists of values to try as values. If None, uses default {'rank': [5, 10, 15, 20]} for SRF.

None
n_repeats int

Number of times to repeat the cross-validation

5
sampling_fraction float

Fraction of eligible entries to use for training in each split; must be in (0, 1). The remaining (1 - sampling_fraction) becomes validation. Note: Constant diagonal entries are excluded from both train and validation. Ignored when estimate_sampling_fraction is True or a dict; if both are provided, estimate_sampling_fraction takes precedence.

0.8
estimate_sampling_fraction bool or dict

If True, automatically estimate optimal sampling fraction using sampling bound estimation from Random Matrix Theory. If dict, passed as kwargs to estimate_sampling_bounds_fast(). When enabled, overrides sampling_fraction.

False
sampling_selection str

Selection method for the estimated sampling fraction; one of {"mean", "min", "max"}.

"mean"
random_state int

Random seed for reproducibility

0
verbose int

Verbosity level

1
n_jobs int

Number of jobs to run in parallel (-1 uses all processors)

-1
missing_values float or None

Value to consider as missing in original data

np.nan
fit_final_estimator bool

Whether to fit a final estimator on the full data using the best parameters found (exposed as best_estimator_ on the returned grid)

False

Returns:

Name Type Description
grid GridSearchCV

Fitted GridSearchCV object with best parameters and scores

Examples:

>>> from pysrf.cross_validation import cross_val_score
>>> result = cross_val_score(similarity_matrix, param_grid={'rank': [5, 10, 15]})
Source code in pysrf/cross_validation.py
def cross_val_score(
    similarity_matrix: np.ndarray,
    estimator: BaseEstimator | None = None,
    param_grid: dict[str, list] | None = None,
    n_repeats: int = 5,
    sampling_fraction: float = 0.8,
    estimate_sampling_fraction: bool | dict = False,
    sampling_selection: str = "mean",
    random_state: int = 0,
    verbose: int = 1,
    n_jobs: int = -1,
    missing_values: float | None = np.nan,
    fit_final_estimator: bool = False,
) -> GridSearchCV:
    """
    Cross-validate any estimator for matrix completion.

    Builds an ``EntryMaskSplit`` splitter and a ``GridSearchCV`` from the given
    options, fits the grid search on ``similarity_matrix`` and returns it.
    Works with SRF or any sklearn-compatible estimator that has a
    ``.reconstruct()`` method.

    Parameters
    ----------
    similarity_matrix : ndarray
        Symmetric similarity matrix to cross-validate
    estimator : BaseEstimator or None, default=None
        Estimator to cross-validate. If None, uses SRF(random_state=random_state).
        Can be a single estimator or a Pipeline. Must have a .reconstruct() method.
    param_grid : dict or None, default=None
        Dictionary with parameter names (str) as keys and lists of values to try
        as values. If None, uses default {'rank': [5, 10, 15, 20]}.
    n_repeats : int, default=5
        Number of times to repeat the cross-validation
    sampling_fraction : float, default=0.8
        Fraction of eligible entries to use for training in each split; must be in (0, 1).
        The remaining (1 - sampling_fraction) becomes validation.
        Note: Constant diagonal entries are excluded from both train and validation.
        Ignored (and not validated here) when estimate_sampling_fraction is True
        or a dict.
    estimate_sampling_fraction : bool or dict, default=False
        If True or a dict (an empty dict included), the sampling fraction is
        estimated with estimate_sampling_bounds_fast() and replaces
        ``sampling_fraction``. A dict is forwarded as keyword arguments;
        ``random_state`` and ``n_jobs`` are filled in from this function's
        arguments when absent, and a ``verbose`` key is dropped. The dict passed
        by the caller is copied and never modified.
    sampling_selection : str, default="mean"
        Selection method for the estimated sampling fraction; one of {"mean", "min", "max"}.
        Validated even when estimation is disabled.
    random_state : int, default=0
        Random seed for reproducibility
    verbose : int, default=1
        Verbosity level passed to GridSearchCV
    n_jobs : int, default=-1
        Number of jobs to run in parallel (-1 uses all processors)
    missing_values : float or None, default=np.nan
        Value to consider as missing in original data
    fit_final_estimator : bool, default=False
        Forwarded to GridSearchCV; whether to fit a final estimator with the
        best parameters

    Returns
    -------
    grid : GridSearchCV
        Fitted GridSearchCV object with best parameters and scores

    Raises
    ------
    ValueError
        If ``sampling_selection`` is not one of the supported names.

    Examples
    --------
    >>> from pysrf.cross_validation import cross_val_score
    >>> result = cross_val_score(similarity_matrix, param_grid={'rank': [5, 10, 15]})
    """
    if estimator is None:
        estimator = SRF(random_state=random_state)

    if param_grid is None:
        param_grid = {"rank": [5, 10, 15, 20]}

    valid_selections = {"mean", "min", "max"}
    if sampling_selection not in valid_selections:
        raise ValueError(
            f"sampling_selection must be one of {sorted(valid_selections)}"
        )

    # A dict always requests estimation, even when it is empty (and therefore
    # falsy); a plain truthiness test would silently skip estimation for `{}`.
    overrides_given = isinstance(estimate_sampling_fraction, dict)
    if overrides_given or estimate_sampling_fraction:
        from .bounds import estimate_sampling_bounds_fast

        # Work on a copy so the caller's dict is not modified by the defaults
        # added below or by the removal of "verbose".
        kwargs = dict(estimate_sampling_fraction) if overrides_given else {}
        kwargs.setdefault("random_state", random_state)
        kwargs.setdefault("n_jobs", n_jobs)
        # NOTE(review): "verbose" is stripped before forwarding, presumably
        # because the bounds estimator does not accept it -- confirm.
        kwargs.pop("verbose", None)

        # The third returned value is not needed here.
        pmin, pmax, _ = estimate_sampling_bounds_fast(similarity_matrix, **kwargs)
        sampling_fraction = {
            "mean": np.mean([pmin, pmax]),
            "min": pmin,
            "max": pmax,
        }[sampling_selection]

    else:
        _validate_sampling_fraction(sampling_fraction)

    cv = EntryMaskSplit(
        n_repeats=n_repeats,
        sampling_fraction=sampling_fraction,
        random_state=random_state,
        missing_values=missing_values,
    )
    grid = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose,
        fit_final_estimator=fit_final_estimator,
    )
    grid.fit(similarity_matrix)

    return grid

GridSearchCV

GridSearchCV(
    estimator: BaseEstimator,
    param_grid: dict[str, list],
    cv: EntryMaskSplit,
    n_jobs: int = -1,
    verbose: int = 0,
    fit_final_estimator: bool = False,
)

Grid search cross-validation for matrix completion.

Performs exhaustive grid search over specified parameter values with entry-wise cross-validation for symmetric matrices.

Parameters:

Name Type Description Default
estimator BaseEstimator

Model instance to optimize

required
param_grid dict

Dictionary with parameter names as keys and lists of values to try

required
cv EntryMaskSplit

Cross-validation splitter

required
n_jobs int

Number of parallel jobs (-1 uses all processors)

-1
verbose int

Verbosity level

0
fit_final_estimator bool

Whether to fit the model on full data with best parameters

False

Attributes:

Name Type Description
best_params_ dict

Parameters that gave the best score

best_score_ float

Best validation score achieved

cv_results_ DataFrame

Detailed results for all parameter combinations

best_estimator_ estimator

Fitted estimator with best parameters (if fit_final_estimator=True)

Source code in pysrf/cross_validation.py
def __init__(
    self,
    estimator: BaseEstimator,
    param_grid: dict[str, list],
    cv: EntryMaskSplit,
    n_jobs: int = -1,
    verbose: int = 0,
    fit_final_estimator: bool = False,
):
    """Record the search configuration on the instance; nothing is validated or fitted here."""
    # What is searched and how the data is split.
    self.estimator, self.param_grid, self.cv = estimator, param_grid, cv
    # How the search is executed and reported.
    self.n_jobs, self.verbose = n_jobs, verbose
    # Whether a final estimator is fitted once the search is done.
    self.fit_final_estimator = fit_final_estimator

CV Strategy

EntryMaskSplit

EntryMaskSplit(
    n_repeats: int = 5,
    sampling_fraction: float = 0.8,
    random_state: int | None = None,
    missing_values: float | None = np.nan,
)

Bases: BaseCrossValidator

Cross-validator for symmetric matrices using entry-wise splits.

Generates multiple random train/validation splits by masking entries in a symmetric matrix while preserving symmetry.

Parameters:

Name Type Description Default
n_repeats int

Number of random splits to generate

5
sampling_fraction float

Fraction of eligible entries kept for training; must be in (0, 1). Remaining (1 - sampling_fraction) becomes validation. Note: Constant diagonal entries are excluded from both.

0.8
random_state int or None

Random seed for reproducibility

None
missing_values float or None

Value that marks missing entries in original data

np.nan
Source code in pysrf/cross_validation.py
def __init__(
    self,
    n_repeats: int = 5,
    sampling_fraction: float = 0.8,
    random_state: int | None = None,
    missing_values: float | None = np.nan,
):
    self.n_repeats = n_repeats
    self.sampling_fraction = sampling_fraction
    self.random_state = random_state
    self.missing_values = missing_values
    if not (0.0 < float(self.sampling_fraction) < 1.0):
        raise ValueError("sampling_fraction must be in (0, 1)")

split

split(
    x: ndarray, y: ndarray = None, groups: ndarray = None
) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]

Generate train/validation splits.

Yields:

Name Type Description
train_mask ndarray of bool

Training entries (True = use for training)

validation_mask ndarray of bool

Validation entries (True = use for evaluation)

Source code in pysrf/cross_validation.py
def split(
    self, x: np.ndarray, y: np.ndarray = None, groups: np.ndarray = None
) -> Generator[Tuple[np.ndarray, np.ndarray], None, None]:
    """
    Lazily produce ``n_repeats`` train/validation splits of ``x``.

    ``y`` and ``groups`` are accepted but not used. A single random state is
    derived from ``self.random_state`` per call and shared by all splits of
    that call.

    Yields
    ------
    train_mask : ndarray of bool
        Training entries (True = use for training)
    validation_mask : ndarray of bool
        Validation entries (True = use for evaluation)
    """
    shared_rng = check_random_state(self.random_state)
    for _repeat in range(self.n_repeats):
        # Each repeat draws from the same generator, so successive splits differ.
        masks = create_train_val_split(
            x, self.sampling_fraction, shared_rng, self.missing_values
        )
        yield masks

Scoring

fit_and_score

fit_and_score(
    estimator: BaseEstimator,
    x: ndarray,
    train_mask: ndarray,
    validation_mask: ndarray,
    fit_params: dict,
    split_idx: int | None = None,
) -> dict

Fit estimator with parameters and return validation score.

Parameters:

Name Type Description Default
estimator BaseEstimator

Model instance to fit (works with SRF or any estimator with .reconstruct())

required
x ndarray

Full data matrix

required
train_mask ndarray of bool

Boolean mask where True = training entry

required
validation_mask ndarray of bool

Boolean mask where True = validation entry

required
fit_params dict

Parameters to set on the estimator

required
split_idx int or None

Index of the CV split

None

Returns:

Name Type Description
result dict

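Dictionary with keys "score" (mean squared error on the validation entries), "split" (the split index, 0 if none was given), "estimator" (the fitted clone), "params" (the parameters that were set), and, when the estimator exposes a history_ attribute, "history"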

Source code in pysrf/cross_validation.py
def fit_and_score(
    estimator: BaseEstimator,
    x: np.ndarray,
    train_mask: np.ndarray,
    validation_mask: np.ndarray,
    fit_params: dict,
    split_idx: int | None = None,
) -> dict:
    """
    Fit a clone of the estimator on the training entries and score it on the
    validation entries.

    Parameters
    ----------
    estimator : BaseEstimator
        Model instance to fit (works with SRF or any estimator with .reconstruct()).
        It is cloned; the instance passed in is not fitted.
    x : ndarray
        Full data matrix. Integer or boolean input is converted to float64 so
        that held-out entries can be encoded as NaN.
    train_mask : ndarray of bool
        Boolean mask where True = training entry
    validation_mask : ndarray of bool
        Boolean mask where True = validation entry
    fit_params : dict
        Parameters to set on the estimator
    split_idx : int or None
        Index of the CV split

    Returns
    -------
    result : dict
        Dictionary with keys ``"score"`` (mean squared error on the validation
        entries), ``"split"`` (``split_idx``, or 0 when it is None),
        ``"estimator"`` (the fitted clone), ``"params"`` (``fit_params``) and,
        if the fitted estimator has a ``history_`` attribute, ``"history"``.

    Raises
    ------
    ValueError
        If the estimator has no ``.reconstruct()`` method, or if no validation
        entry is observed (non-NaN) in ``x``.
    """
    # Held-out entries are written as NaN below. np.full_like keeps the dtype of
    # its prototype, so an integer/boolean matrix could not represent the mask;
    # promote such input to float64 and leave floating input untouched.
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.inexact):
        x = x.astype(np.float64)

    est = clone(estimator).set_params(**fit_params)

    # Set SRF-specific params if estimator supports them.
    # The training matrix built below marks unavailable entries with NaN.
    if hasattr(est, "missing_values"):
        est.set_params(missing_values=np.nan)

    # When the grid does not supply bounds, use the value range of the data.
    if hasattr(est, "bounds") and fit_params.get("bounds") is None:
        original_bounds = (np.nanmin(x), np.nanmax(x))
        est.set_params(bounds=original_bounds)

    # Track which entries were already NaN in the original data
    originally_nan = np.isnan(x)

    # Create training data: keep only training entries, mask everything else
    x_train = np.full_like(x, np.nan)
    x_train[train_mask] = x[train_mask]

    # Fit model on training data only
    est.fit(x_train)

    # Get reconstruction
    if not hasattr(est, "reconstruct"):
        raise ValueError(
            f"Estimator {type(est).__name__} must have a .reconstruct() method "
            "for matrix completion cross-validation"
        )
    reconstruction = est.reconstruct()

    # Evaluate only on validation entries that were originally observed
    valid_eval_mask = validation_mask & ~originally_nan

    if not valid_eval_mask.any():
        raise ValueError("No valid validation entries to evaluate")

    mse = np.mean((x[valid_eval_mask] - reconstruction[valid_eval_mask]) ** 2)

    result = {
        "score": mse,
        "split": split_idx if split_idx is not None else 0,
        "estimator": est,
        "params": fit_params,
    }

    # Include history if available (optional)
    if hasattr(est, "history_"):
        result["history"] = est.history_

    return result