Examples

These examples show how to use pysrf for increasingly complex analyses, from basic factorization to the full pipeline with rank selection, ensemble embeddings, and evaluation.

Basic factorization

Generate a similarity matrix from a known low-rank embedding and recover the latent dimensions:

import numpy as np
from pysrf import SRF

# Ground-truth low-rank structure
n, rank = 100, 10
w_true = np.random.rand(n, rank)
s = w_true @ w_true.T

# Recover the dimensions
model = SRF(rank=10, random_state=42)
w = model.fit_transform(s)
s_hat = model.reconstruct()

Because the input has exact rank 10, the reconstruction error is near zero. In practice, real similarity matrices contain noise and the optimal rank must be estimated (see below).
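
A quick sanity check of this claim, continuing the example above with plain NumPy:

# Relative Frobenius error of the reconstruction; near zero because the
# input is exactly rank 10
rel_err = np.linalg.norm(s - s_hat) / np.linalg.norm(s)
print(f"Relative reconstruction error: {rel_err:.2e}")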

Missing data

Behavioral similarity experiments typically sample only a fraction of all possible item pairs. pysrf treats unobserved entries as missing rather than imputing them:

import numpy as np
from pysrf import SRF

n, rank = 100, 10
w_true = np.random.rand(n, rank)
s = w_true @ w_true.T

# Remove 30% of entries to simulate sparse sampling
mask = np.random.rand(n, n) < 0.3
s[mask] = np.nan

model = SRF(rank=10, missing_values=np.nan, random_state=42)
w = model.fit_transform(s)
s_completed = model.reconstruct()

The model learns from the observed entries and predicts the held-out ones in the reconstruction.
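
Because the example is synthetic, this can be verified directly: compare the completed entries against the ground truth at the held-out positions (plain NumPy, continuing the example above):

# Held-out error: predictions vs. the noise-free ground truth
s_true = w_true @ w_true.T
rmse = np.sqrt(np.mean((s_completed[mask] - s_true[mask]) ** 2))
print(f"Held-out RMSE: {rmse:.4f}")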

Cross-validation for rank selection

The number of dimensions is a free parameter. Cross-validation selects the rank that best generalizes to unseen similarities. Set estimate_sampling_fraction=True to automatically derive the hold-out fraction from the data:

from pysrf import cross_val_score, SRF

# s is the similarity matrix from the examples above
cv = cross_val_score(
    s,
    estimate_sampling_fraction=True,
    param_grid={"rank": [5, 10, 15, 20]},
    n_repeats=5,
    n_jobs=-1,
    random_state=42,
)

print(f"Best rank: {cv.best_params_['rank']}")
print(f"Best score: {cv.best_score_:.4f}")

Consensus embeddings

Symmetric NMF is non-convex. Different random initializations can converge to different solutions. To obtain a stable embedding, fit multiple runs and align them. EnsembleEmbedding does this automatically: it runs the factorization n_runs times, aligns the columns using the Hungarian algorithm, and selects the most central solution. ClusterEmbedding then applies consensus clustering to find stable groups of items.

from sklearn import pipeline
from pysrf.consensus import EnsembleEmbedding, ClusterEmbedding
from pysrf import SRF, cross_val_score

# 1. Select the rank
cv = cross_val_score(
    s,
    estimate_sampling_fraction=True,
    param_grid={"rank": [5, 10, 15, 20]},
    n_repeats=5,
    n_jobs=-1,
)

# 2. Build the ensemble-clustering pipeline
pipe = pipeline.Pipeline(
    [
        ("ensemble", EnsembleEmbedding(SRF(cv.best_params_), n_runs=50)),
        ("cluster", ClusterEmbedding(min_clusters=2, max_clusters=6, step=1)),
    ]
)

consensus_embedding = pipe.fit_transform(s)
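
The alignment step can be made concrete with a short sketch. This is a minimal illustration of Hungarian column matching using scipy, under the assumption that column correlation is a reasonable matching cost; it is not pysrf's internal implementation:

import numpy as np
from scipy.optimize import linear_sum_assignment

def align_columns(w_ref, w_run):
    """Permute w_run's columns to best match w_ref (illustration only)."""
    k = w_ref.shape[1]
    # Absolute correlation between each reference column and each run column
    corr = np.abs(np.corrcoef(w_ref.T, w_run.T)[:k, k:])
    # Hungarian algorithm: pick the permutation maximizing total correlation
    _, col = linear_sum_assignment(-corr)
    return w_run[:, col]

EnsembleEmbedding performs this kind of alignment internally across all n_runs solutions before selecting the most central one.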

Value bounds

When the similarity measure has a known range, constrain the reconstruction accordingly. For example, cosine similarities between non-negative feature vectors lie in [0, 1]:

from pysrf import SRF

model = SRF(rank=10, bounds=(0, 1), random_state=42)
w = model.fit_transform(s)
s_reconstructed = model.reconstruct()

assert s_reconstructed.min() >= 0
assert s_reconstructed.max() <= 1
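
The same mechanism covers other ranges. Assuming bounds accepts an arbitrary (lower, upper) pair, correlation-based similarities in [-1, 1] would be constrained the same way:

# bounds=(-1, 1) assumes the general (lower, upper) form shown above
model = SRF(rank=10, bounds=(-1, 1), random_state=42)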

Sampling-bound estimation

Before running cross-validation, you can estimate the range of hold-out fractions that produce reliable scores. This is useful for very sparse matrices where holding out too many entries leaves insufficient data for fitting:

from pysrf import estimate_sampling_bounds_fast

pmin, pmax, s_denoised = estimate_sampling_bounds_fast(
    s,
    n_jobs=-1,
    random_state=42,
)

print(f"Minimum sampling rate: {pmin:.4f}")
print(f"Maximum sampling rate: {pmax:.4f}")

# Use the midpoint for cross-validation
sampling_rate = 0.5 * (pmin + pmax)
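
The midpoint can then replace the automatic estimate. The keyword below is an assumption for illustration, not a documented parameter; consult the cross_val_score API reference for the actual name:

from pysrf import cross_val_score

# NOTE: 'sampling_rate' is a hypothetical keyword name -- check the API
# reference for the real way to pass a fixed hold-out fraction
cv = cross_val_score(
    s,
    sampling_rate=sampling_rate,
    param_grid={"rank": [5, 10, 15, 20]},
    n_repeats=5,
    n_jobs=-1,
    random_state=42,
)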

Complete workflow

A full analysis combines rank selection, ensemble fitting, and evaluation:

import numpy as np
from pysrf import SRF, cross_val_score, estimate_sampling_bounds_fast

# 1. Generate a noisy, incomplete similarity matrix
np.random.seed(42)
n, true_rank = 100, 8
w_true = np.random.rand(n, true_rank)
s = w_true @ w_true.T
s += 0.1 * np.random.randn(n, n)
s = (s + s.T) / 2
mask = np.random.rand(n, n) < 0.2
s[mask] = np.nan

# 2. Estimate sampling bounds
pmin, pmax, _ = estimate_sampling_bounds_fast(s, n_jobs=-1)
print(f"Sampling bounds: [{pmin:.3f}, {pmax:.3f}]")

# 3. Cross-validate to find the best rank
result = cross_val_score(
    s,
    param_grid={"rank": range(5, 21)},
    estimate_sampling_fraction=True,
    n_repeats=3,
    n_jobs=-1,
    random_state=42,
)
best_rank = result.best_params_["rank"]
print(f"Best rank: {best_rank} (true rank: {true_rank})")

# 4. Fit the final model
model = SRF(rank=best_rank, max_outer=20, random_state=42)
w = model.fit_transform(s)
s_completed = model.reconstruct()

# 5. Evaluate
score = model.score(s)
print(f"Reconstruction error: {score:.4f}")