"""
combss.multinomial
Public module for best subset selection in multinomial logistic regression
via COMBSS.
Uses the Frank-Wolfe homotopy algorithm with Danskin's envelope gradient
and a warm-started sklearn L-BFGS-B inner solver for the baseline-category
multinomial model.
"""
import numpy as np
from sklearn.linear_model import LogisticRegression
import combss._opt_glm as oglm
[docs]
class model:
"""
COMBSS model for best subset selection in multinomial logistic regression.
Methods
-------
fit(X_train, y_train, ...)
Run COMBSS to select the best subset of predictors.
Attributes (available after fitting)
------------------------------------
subset : ndarray or None
Indices of the best subset (0-indexed). Requires validation data.
accuracy : float or None
Validation accuracy for the best subset. Requires validation data.
coef_ : ndarray or None
Multinomial coefficients (shape (C, p), zeros for unselected).
Requires validation data.
lam_ridge : float
Ridge penalty used in the inner solver.
subset_list : list
Subsets for k = 1, ..., q (0-indexed). May be shorter if early
stopping was triggered.
"""
def __init__(self):
pass
[docs]
def fit(self, X_train, y_train, X_val=None, y_val=None,
q=None,
C=None,
Niter=25,
lam_ridge=0,
alpha=0.01,
scale=True,
verbose=True,
mandatory_features=None,
inner_tol=1e-4,
patience=20,
min_k=20):
"""
Fit the COMBSS model for multinomial logistic regression.
Parameters
----------
X_train : ndarray (n, p)
Training design matrix (no intercept column).
y_train : ndarray (n,)
Class labels in {1, ..., C}.
X_val : ndarray (n_val, p), optional
Validation design matrix. When provided, the best subset
is selected by validation accuracy and coef_ are computed.
y_val : ndarray (n_val,), optional
Validation labels.
q : int, optional
Maximum subset size. Defaults to min(n, p).
C : int, optional
Number of classes. If None, inferred from y_train.
Niter : int
Number of homotopy iterations (default 25).
lam_ridge : float
Ridge regularisation parameter for the inner solver (default 0).
alpha : float
Frank-Wolfe step size (default 0.01).
scale : bool
Column-normalise X before running (default True).
verbose : bool
Print progress (default True).
mandatory_features : list or None
1-indexed features to force into every model.
inner_tol : float
Inner solver convergence tolerance (default 1e-4).
patience : int
Early stopping patience (default 20). Stop if validation
accuracy has not improved for this many consecutive k values.
Only active when X_val/y_val are given. Set to None to disable.
min_k : int
Minimum k values to evaluate before early stopping can
trigger (default 20). Capped so min_k + patience <= p.
"""
n, p = X_train.shape
if q is None:
q = min(n, p)
if C is None:
C = len(np.unique(y_train))
if patience is not None and min_k + patience > q:
patience = None
use_early_stop = (patience is not None
and X_val is not None
and y_val is not None)
# Prepend intercept column
X_fw = np.hstack([np.ones((n, 1)), X_train])
result = oglm.fw(
X_fw, y_train,
q=q,
Niter=Niter,
lam=lam_ridge,
alpha=alpha,
scale=scale,
verbose=verbose,
mandatory_features=mandatory_features,
model_type='multinomial',
C=C,
inner_tol=inner_tol,
)
all_subsets = [np.array(m) - 1 for m in result.models]
if use_early_stop:
best_acc = -np.inf
no_improve_count = 0
stop_k = q
for k_idx in range(q):
sub = all_subsets[k_idx]
if len(sub) == 0:
acc_k = 0.0
else:
clf = LogisticRegression(penalty=None, solver='lbfgs', max_iter=5000)
clf.fit(X_train[:, sub], y_train)
acc_k = np.mean(clf.predict(X_val[:, sub]) == y_val)
if acc_k > best_acc:
best_acc = acc_k
no_improve_count = 0
else:
no_improve_count += 1
if (k_idx + 1) >= min_k and no_improve_count >= patience:
stop_k = k_idx + 1
if verbose:
print(f" Early stopping at k={stop_k} "
f"(no improvement for {patience} steps)")
break
self.subset_list = all_subsets[:stop_k]
else:
self.subset_list = all_subsets
self.lam_ridge = lam_ridge
# If validation data provided, find best k* by accuracy
self.subset = None
self.accuracy = None
self.coef_ = None
if X_val is not None and y_val is not None:
acc_list = []
coef_list = []
for sub in self.subset_list:
if len(sub) == 0:
acc_list.append(0.0)
coef_list.append(np.zeros((C, p)))
continue
clf = LogisticRegression(penalty=None, solver='lbfgs', max_iter=5000)
clf.fit(X_train[:, sub], y_train)
acc = np.mean(clf.predict(X_val[:, sub]) == y_val)
acc_list.append(acc)
coef_pred = np.zeros((C, p))
coef_pred[:, sub] = clf.coef_
coef_list.append(coef_pred)
ind_opt = np.argmax(acc_list)
self.subset = self.subset_list[ind_opt]
self.accuracy = acc_list[ind_opt]
self.coef_ = coef_list[ind_opt]
return