"""
combss.cv
Leave-one-out cross-validation for selecting the ridge penalty lambda
in the COMBSS Frank-Wolfe algorithm.
Workflow
--------
For each candidate lambda in a grid:
1. Run COMBSS on the full training data -> selected features S_k for k = 1..q.
2. For each k, evaluate S_k via LOOCV on the refit step:
Classification -- refit LogisticRegression on X[~i, S_k], predict X[i, S_k]
Linear -- exact hat-matrix shortcut (no per-fold loop required)
3. LOOCV score for (lambda, k):
Classification -- loocv_acc (fraction correct; higher is better)
Linear -- loocv_mse (mean squared error; lower is better)
The best lambda is reported both overall (optimising mean LOOCV score across k)
and per-k (optimising LOOCV score for each individual k).
Note: feature *selection* uses the full dataset; only the final refit is
cross-validated. This is consistent with standard practice in
best-subset-selection literature.
"""
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import combss._opt_glm as oglm
# ================================================================
# Default lambda grid
# ================================================================
DEFAULT_LAMBDA_GRID = np.concatenate([[0.0], np.logspace(-3, 1, 10)])
# ================================================================
# LOOCV MSE for linear regression (exact hat-matrix shortcut)
# ================================================================
[docs]
def loocv_mse_linear(X_sel, y, lambda_refit=0.0):
"""
Exact leave-one-out MSE for linear regression via the hat-matrix identity.
Uses the formula: e_i^{LOO} = (y_i - y_hat_i) / (1 - h_ii)
where h_ii are the diagonal entries of the hat matrix.
Cost O(n k^2) -- no per-fold loop required.
Parameters
----------
X_sel : ndarray (n, k)
Columns restricted to selected features.
y : ndarray (n,)
Continuous response.
lambda_refit : float
Ridge penalty on slopes (0 = OLS); intercept is never penalised.
Returns
-------
float : LOOCV mean squared error.
"""
n = len(y)
k = X_sel.shape[1]
A = np.hstack([np.ones((n, 1)), X_sel])
pen = np.zeros((k + 1, k + 1))
if lambda_refit > 0.0:
np.fill_diagonal(pen, 2.0 * lambda_refit * n)
pen[0, 0] = 0.0
M = np.linalg.solve(A.T @ A + pen, A.T)
y_hat = A @ (M @ y)
h_diag = np.einsum('ij,ji->i', A, M)
residuals = y - y_hat
denom = np.clip(1.0 - h_diag, 1e-10, None)
return float(np.mean((residuals / denom) ** 2))
# ================================================================
# LOOCV accuracy for classification (logit / multinomial)
# ================================================================
[docs]
def loocv_accuracy(X_sel, y, lambda_refit=0.0):
"""
Leave-one-out cross-validation accuracy for classification.
Parameters
----------
X_sel : ndarray (n, k)
Columns restricted to selected features.
y : ndarray (n,)
Class labels.
lambda_refit : float
Ridge penalty for the refit model (0 = unpenalised).
Returns
-------
float : fraction of correctly predicted held-out labels.
"""
n = len(y)
classes = np.unique(y)
n_correct = 0
n_valid = 0
for i in range(n):
mask = np.ones(n, dtype=bool)
mask[i] = False
X_tr = X_sel[mask]
y_tr = y[mask]
X_te = X_sel[i:i + 1]
y_te = y[i]
if len(np.unique(y_tr)) < len(classes):
continue
if lambda_refit > 0:
clf = LogisticRegression(
penalty='l2', C=1.0 / (2.0 * lambda_refit),
solver='lbfgs', max_iter=5000,
)
else:
clf = LogisticRegression(
penalty=None, solver='lbfgs', max_iter=5000,
)
try:
clf.fit(X_tr, y_tr)
n_correct += int(clf.predict(X_te)[0] == y_te)
n_valid += 1
except Exception:
pass
return n_correct / n_valid if n_valid > 0 else float('nan')
# ================================================================
# Main CV function
# ================================================================
[docs]
def select_lambda(X, y, q, C=None,
lambda_grid=None,
Niter=50,
alpha=0.01,
model_type='multinomial',
inner_tol=1e-4,
lambda_refit=0.0,
verbose=True):
"""
Select the ridge regularisation penalty via leave-one-out cross-validation.
For each candidate ridge penalty in lambda_grid, COMBSS selects features
for k = 1..q on the full data. Each selected model is then evaluated by
LOOCV on the refit step.
Note: this lambda is a ridge penalty on the coefficients in the inner
solver (lam_ridge in the model classes), NOT the sparsity penalty used
in the original COMBSS method.
Parameters
----------
X : ndarray (n, p)
Feature matrix (no intercept column).
y : ndarray (n,)
Response / labels:
- {1, ..., C} for multinomial
- {0, 1} for logit
- real-valued for linear
q : int
Maximum subset size.
C : int or None
Number of classes (required for logit/multinomial; ignored for linear).
lambda_grid : array-like or None
Candidate ridge penalty values (default: [0] + logspace(-3, 1, 10)).
Niter : int
COMBSS homotopy iterations (default 50).
alpha : float
Frank-Wolfe step size (default 0.01).
model_type : str
``'multinomial'``, ``'logit'``, or ``'linear'``.
inner_tol : float
Inner solver tolerance (default 1e-4).
lambda_refit : float
Ridge penalty for LOOCV refit (default 0).
verbose : bool
Print progress (default True).
Returns
-------
best_lambda : float
Lambda maximising mean LOOCV accuracy (classification) or
minimising mean LOOCV MSE (linear).
best_lambda_per_k : dict
Best lambda for each k individually.
results_df : DataFrame
Columns: lambda, k, loocv_acc/loocv_mse, selected.
"""
if model_type in ('logit', 'multinomial') and C is None:
raise ValueError("C (number of classes) is required for logit/multinomial.")
if lambda_grid is None:
lambda_grid = DEFAULT_LAMBDA_GRID
lambda_grid = np.asarray(lambda_grid, dtype=float)
is_linear = (model_type == 'linear')
score_col = 'loocv_mse' if is_linear else 'loocv_acc'
score_label = 'LOOCV MSE' if is_linear else 'LOOCV acc'
n, p = X.shape
X_fw = np.hstack([np.ones((n, 1)), X])
rows = []
for lam in lambda_grid:
if verbose:
print(f"\n{'=' * 55}")
print(f"lambda = {lam:.5g}")
print(f"{'=' * 55}")
result = oglm.fw(
X_fw, y,
q=q,
Niter=Niter,
lam=lam,
alpha=alpha,
scale=True,
model_type=model_type,
C=C if C is not None else 2,
inner_tol=inner_tol,
verbose=False,
)
if verbose:
print(f" {'k':>3} {score_label:>10} selected features")
print(f" {'-' * 50}")
for k in range(1, q + 1):
feat_1idx = result.models[k - 1]
feat_0idx = np.asarray(feat_1idx) - 1
X_sel = X[:, feat_0idx]
if is_linear:
score = loocv_mse_linear(X_sel, y, lambda_refit=lambda_refit)
else:
score = loocv_accuracy(X_sel, y, lambda_refit=lambda_refit)
rows.append({
'lambda': lam,
'k': k,
score_col: score,
'selected': sorted(feat_1idx.tolist()),
})
if verbose:
print(f" {k:>3} {score:>10.4f} {sorted(feat_1idx.tolist())}")
results_df = pd.DataFrame(rows)
if is_linear:
best_lambda_per_k = {}
for k in range(1, q + 1):
sub = results_df[results_df['k'] == k]
best_lambda_per_k[k] = float(sub.loc[sub[score_col].idxmin(), 'lambda'])
mean_score = results_df.groupby('lambda')[score_col].mean()
best_lambda = float(mean_score.idxmin())
else:
best_lambda_per_k = {}
for k in range(1, q + 1):
sub = results_df[results_df['k'] == k]
best_lambda_per_k[k] = float(sub.loc[sub[score_col].idxmax(), 'lambda'])
mean_score = results_df.groupby('lambda')[score_col].mean()
best_lambda = float(mean_score.idxmax())
if verbose:
print(f"\n{'=' * 55}")
print("Best lambda per k:")
for k, lam in best_lambda_per_k.items():
row = results_df[(results_df['lambda'] == lam) & (results_df['k'] == k)].iloc[0]
print(f" k={k:>2} lambda={lam:.5g} {score_label}={row[score_col]:.4f}"
f" selected={row['selected']}")
print(f"\nBest lambda overall (mean across k): {best_lambda:.5g}")
print(f"{'=' * 55}")
return best_lambda, best_lambda_per_k, results_df