# Source code for mmf.math.linalg.cholesky.gmw81

"""These are a collection of codes based on the [GMW81]_ algorithm.

Comments
--------
:func:`colmod` :
   Seems to work well and is quite efficient as it has been
   vectorized, but not quite as good as :func:`gmw`.  Use this for
   now.
:func:`gmw` :
   Seems to work well, but returns an explicit pivoting matrix and
   is very slow as it uses loops.
:func:`gmchol` :
   Fast (uses weave) but very bad for large matrices because it uses
   no pivoting.
"""
from __future__ import division

__all__ = ['gmw', 'gmchol', 'colmod', 'mchol2']

import numpy as np
import scipy as sp
import scipy.sparse
import scipy.weave


_FINFO = np.finfo(float)
_EPS = _FINFO.eps
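
# Typical usage (the identity below is checked in the doctests of the
# individual functions; `colmod` is the recommended routine, see the
# module docstring):
#
#     L, e = colmod(A)        # A + np.diag(e) == np.dot(L, L.T)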

def gmw(A):
    """Return `(P, L, e)` such that `P.T*A*P = L*L.T - diag(e)`.

    Returns
    -------
    P : 2d array
       Permutation matrix used for pivoting.
    L : 2d array
       Lower triangular factor
    e : 1d array
       Non-negative entries of the diagonal shift matrix `diag(e)`.
    
    Notes
    -----
    The Gill, Murray, and Wright modified Cholesky algorithm.

    Algorithm 6.5 from page 148 of 'Numerical Optimization' by Jorge
    Nocedal and Stephen J. Wright, 1999, 2nd ed.
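
    Examples
    --------
    A quick consistency check of the identity stated above (mirroring
    the doctests of :func:`gmchol`):

    >>> np.random.seed(3)
    >>> A = np.random.rand(10, 10)*2 - 1
    >>> A = A + A.T
    >>> P, L, e = gmw(A)
    >>> np.allclose(np.dot(P.T, np.dot(A, P)), np.dot(L, L.T) - np.diag(e))
    True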

    """
    n = A.shape[0]
    
    # Test matrix.
    #A = array([[4, 2, 1], [2, 6, 3], [1, 3, -0.004]], Float64)
    #n = len(A)
    #I = identity(n, Float64)

    # Calculate gamma(A) and xi(A).
    gamma = 0.0
    xi = 0.0
    for i in xrange(n):
        gamma = max(abs(A[i, i]), gamma)
        for j in xrange(i+1, n):
            xi = max(abs(A[i, j]), xi)

    # Calculate delta and beta.
    delta = _EPS * max(gamma + xi, 1.0)
    if n == 1:
        beta = np.sqrt(max(gamma, _EPS))
    else:
        beta = np.sqrt(max(gamma, xi / np.sqrt(n**2 - 1.0), _EPS))
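    # `delta` is the smallest pivot we will accept, and `beta**2` bounds
    # the size of the off-diagonal entries of the factor (cf. the
    # discussion in :func:`colmod`).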

    # Initialise data structures.
    a = 1.0 * A
    r = 0.0 * A
    e = np.zeros(n, dtype=float)
    P = np.eye(n, dtype=float)

    # Main loop.
    for j in xrange(n):
        # Row and column swapping, find the index > j of the largest
        # diagonal element.
        q = j
        for i in xrange(j+1, n):
            if abs(a[i, i]) >= abs(a[q, q]):
                q = i

        # Interchange row and column j and q (if j != q).
        if q != j:
            # Temporary permutation matrix for swapping two rows or columns.
            p = np.eye(n, dtype=float)

            # Modify the permutation matrix P by swapping columns.
            row_P = 1.0*P[:, q]
            P[:, q] = P[:, j]
            P[:, j] = row_P

            # Modify the permutation matrix p by swapping rows (same as
            # columns because p = p.T).
            row_p = 1.0*p[q]
            p[q] = p[j]
            p[j] = row_p

            # Permute a and r (p = p.T).
            a = np.dot(p, np.dot(a, p))
            r = np.dot(r, p)

        # Calculate dj.
        theta_j = 0.0
        if j < n-1:
            for i in xrange(j+1, n):
                theta_j = max(theta_j, abs(a[j, i]))
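        # This guarantees that d_j is not too small and that the
        # off-diagonal entries of the factor stay bounded by beta; in
        # the sufficiently positive definite case, d_j = a[j, j].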
        dj = max(abs(a[j, j]), (theta_j/beta)**2, delta)

        # Calculate e (not really needed!).
        e[j] = dj - a[j, j]

        # Calculate row j of r and update a.
        r[j, j] = np.sqrt(dj)     # Damned sqrt introduces roundoff error.
        for i in xrange(j+1, n):
            r[j, i] = a[j, i] / r[j, j]
            for k in xrange(j+1, i+1):
                a[i, k] = a[k, i] = a[k, i] - r[j, i] * r[j, k]     # Keep matrix a symmetric.

    # Return the permutation `P`, the lower triangular Cholesky factor
    # `L = r.T`, and the diagonal shift `e`.
    return P, r.T, e

def gmchol(A, test=False):
    r"""Return `(L, e)`: the Gill-Murray generalized Cholesky
    decomposition of `M = A + diag(e) = dot(L, L.T)` where

    1) `M` is safely symmetric positive definite (SPD) and well
       conditioned.
    2) `e` is small (zero if `A` is already SPD and not much larger
       than the most negative eigenvalue of `A`).

    .. math::
       \mat{A} + \diag{e} = \mat{L}\mat{L}^{T}

    Parameters
    ----------
    A : array
       Must be a non-singular and symmetric matrix.
    test : bool
       If `True`, use the directly translated iterative code for
       testing.

    Returns
    -------
    L : 2d array
       Lower triangular Cholesky factor.
    e : 1d array
       Diagonals of correction matrix `E`.

    Examples
    --------
    >>> np.random.seed(3)
    >>> A = np.random.rand(100, 100)*2 - 1
    >>> A = A + A.T
    >>> L, e = gmchol(A, test=True)
    >>> np.allclose(np.dot(L, L.T), (A + np.diag(e)))
    True
    >>> -e.max()/np.linalg.eigvalsh(A).min() < 1000
    True

    >>> A2 = np.array([[-0.451, -0.041,  0.124],
    ...                [-0.041, -0.265,  0.061],
    ...                [ 0.124,  0.061, -0.517]])
    >>> L, e = gmchol(A2, test=True)
    >>> np.allclose(np.dot(L, L.T), (A2 + np.diag(e)))
    True

    Notes
    -----
    Translated from the following Gauss code of J. Gill::

        /****************************************************************/
        /* This is the Gill/Murray cholesky routine.  Reference:        */
        /*                                                              */
        /* Gill, Jeff and Gary King. ``What to do When Your Hessian     */
        /* is Not Invertible: Alternatives to Model Respecification     */
        /* in Nonlinear Estimation,'' Sociological Methods and          */
        /* Research, Vol. 32, No. 1 (2004): Pp. 54--87.                 */
        /*                                                              */
        /* Abstract                                                     */
        /*                                                              */
        /* What should a researcher do when statistical analysis        */
        /* software terminates before completion with a message that    */
        /* the Hessian is not invertable?  The standard textbook advice */
        /* is to respecify the model, but this is another way of saying */
        /* that the researcher should change the question being asked.  */
        /* Obviously, however, computer programs should not be in the   */
        /* business of deciding what questions are worthy of study.     */
        /* Although noninvertable Hessians are sometimes signals of     */
        /* poorly posed questions, nonsensical models, or inappropriate */
        /* estimators, they also frequently occur when information      */
        /* about the quantities of interest exists in the data, through */
        /* the likelihood function.  We explain the problem in some     */
        /* detail and lay out two preliminary proposals for ways of     */
        /* dealing with noninvertable Hessians without changing the     */
        /* question asked.                                              */
        /****************************************************************/
        proc gmchol(A);
           /* calculates the Gill-Murray generalized choleski decomposition */
           /* input matrix A must be non-singular and symmetric */
           /* Author: Jeff Gill. Part of the Hessian Project. */
           local i,j,k,n,sum,R,theta_j,norm_A,phi_j,delta,xi_j,gamm,E,beta_j;
           n = rows(A);
           R = eye(n);
           E = zeros(n,n);
           norm_A = maxc(sumc(abs(A)));
           gamm = maxc(abs(diag(A)));
           delta = maxc(maxc(__macheps*norm_A~__macheps));
           for j (1, n, 1);
              theta_j = 0;
              for i (1, n, 1);
                 sum = 0;
                 for k (1, (i-1), 1);
                    sum = sum + R[k,i]*R[k,j];
                 endfor;
                 R[i,j] = (A[i,j] - sum)/R[i,i];
                 if (A[i,j] - sum) > theta_j;
                    theta_j = A[i,j] - sum;
                 endif;
                 if i > j;
                    R[i,j] = 0;
                 endif;
              endfor;
              sum = 0;
              for k (1, (j-1), 1);
                 sum = sum + R[k,j]^2;
              endfor;
              phi_j = A[j,j] - sum;
              if (j+1) <= n;
                 xi_j = maxc(abs(A[(j+1):n,j]));
              else;
                 xi_j = maxc(abs(A[n,j]));
              endif;
              beta_j = sqrt(maxc(maxc(gamm~(xi_j/n)~__macheps)));
              if delta >= maxc(abs(phi_j)~((theta_j^2)/(beta_j^2)));
                 E[j,j] = delta - phi_j;
              elseif abs(phi_j) >= maxc(((delta^2)/(beta_j^2))~delta);
                 E[j,j] = abs(phi_j) - phi_j;
              elseif ((theta_j^2)/(beta_j^2)) >= maxc(delta~abs(phi_j));
                 E[j,j] = ((theta_j^2)/(beta_j^2)) - phi_j;
              endif;
              R[j,j] = sqrt(A[j,j] - sum + E[j,j]);
           endfor;
           retp(R'R);
        endp;
    """
    if not test:
        A = np.ascontiguousarray(A, dtype='d')
        n = A.shape[0]
        L = np.eye(n, dtype='d')
        e = np.zeros(n, dtype='d')
        norm_A = float(np.linalg.norm(A, np.inf))
        gamm = float(np.linalg.norm(A.diagonal(), np.inf))
        delta = float(_EPS*max(1, norm_A))
        code = r"""
        int i, j, k;
        double sum, theta_j2, phi_j, xi_j, beta_j2;
        for (j=0;j<n;++j) {
          theta_j2 = 0;
          for (i=0;i<n;++i) {
            sum = 0;
            for (k=0;k<i;++k) {
              sum += L2(i,k)*L2(j,k);
            }
            if (i <= j) {
              L2(j,i) = (A2(i,j) - sum)/L2(i,i);
            }
            theta_j2 = std::max(theta_j2, A2(i,j) - sum);
          }
          theta_j2 *= theta_j2;
          sum = 0;
          for (k=0;k<j;++k) {
            sum += L2(j,k)*L2(j,k);
          }
          phi_j = A2(j,j) - sum;
          if (j + 1 < n) {
            xi_j = 0;
            for (k=j+1;k<n;++k) {
              xi_j = std::max(xi_j, fabs(A2(k,j)));
            }
          } else {
            xi_j = fabs(A2(n-1,j));
          }
          /* beta_j2 = max(gamm, xi_j/n, eps) plays the role of beta_j^2. */
          beta_j2 = xi_j/n;
          beta_j2 = (beta_j2 < gamm) ? gamm : beta_j2;
          beta_j2 = (beta_j2 < %(eps)g) ? %(eps)g : beta_j2;
          if (delta >= std::max(fabs(phi_j), theta_j2/beta_j2)) {
            e[j] = delta - phi_j;
          } else if (fabs(phi_j) >= std::max(delta, delta*delta/beta_j2)) {
            e[j] = fabs(phi_j) - phi_j;
          } else if (std::max(delta, fabs(phi_j)) < theta_j2/beta_j2) {
            e[j] = theta_j2/beta_j2 - phi_j;
          }
          L2(j,j) = sqrt(A2(j,j) - sum + e[j]);
        }
        """ % dict(eps=_EPS)
        local_dict = dict(L=L, n=n, e=e, A=A,
                          norm_A=norm_A, gamm=gamm, delta=delta)
        headers = ['<math.h>', '<algorithm>']
        sp.weave.inline(code, local_dict.keys(),
                        local_dict=local_dict, headers=headers)
        return (L, e)

    # Direct line-for-line translation of the Gauss code quoted in the
    # docstring, kept as a slow reference implementation for testing.
    n = A.shape[0]
    R = np.eye(n, dtype=float)
    e = np.zeros(n, dtype=float)
    norm_A = abs(A).sum(axis=0).max()
    gamm = abs(A.diagonal()).max()
    delta = _EPS*max(1, norm_A)
    for j in xrange(n):
        theta_j = 0
        for i in xrange(n):
            sum_ = 0
            for k in xrange(i):
                sum_ += R[k, i]*R[k, j]
            R[i, j] = (A[i, j] - sum_)/R[i, i]
            theta_j = max(theta_j, A[i, j] - sum_)
            if i > j:
                R[i, j] = 0
        sum_ = 0
        for k in xrange(j):
            sum_ += R[k, j]**2
        phi_j = A[j, j] - sum_
        if j + 1 < n:
            xi_j = abs(A[j+1:, j]).max()
        else:
            xi_j = abs(A[-1, j])
        beta_j = np.sqrt(max(gamm, xi_j/n, _EPS))
        if delta >= max(abs(phi_j), (theta_j/beta_j)**2):
            e[j] = delta - phi_j
        elif abs(phi_j) >= max(delta, (delta/beta_j)**2):
            e[j] = abs(phi_j) - phi_j
        elif max(delta, abs(phi_j)) < (theta_j/beta_j)**2:
            e[j] = (theta_j/beta_j)**2 - phi_j
        R[j, j] = np.sqrt(A[j, j] - sum_ + e[j])
    return (R.T, e)
def colmod(A):
    """Modified Cholesky factorization.

    `R = cholmod(A)` returns the upper Cholesky factor of `A` (same as
    `chol`) if `A` is (sufficiently) positive definite, and otherwise
    returns a modified factor `R` with diagonal entries `>=
    sqrt(delta)` and offdiagonal entries `<= beta` in absolute value,
    where `delta` and `beta` are defined in terms of the size of the
    diagonal and offdiagonal entries of `A` and the machine precision;
    see below.  The idea is to ensure that `E = A - R'*R` is reasonably
    small if `A` is not too far from being indefinite.  If `A` is
    sparse, so is `R`.  The output parameter `indef` is set to 0 if `A`
    is sufficiently positive definite and to 1 if the factorization is
    modified.  (This Python translation returns `(L, e)` with
    `A + diag(e) = dot(L, L.T)`; see the examples below.)

    The point of modified Cholesky is to avoid computing eigenvalues,
    but for dense matrices, MODCHOL takes longer than calling the
    built-in EIG, because of the cost of interpreting the code, even
    though it only has one loop and uses vector operations.

    Reference: Nocedal and Wright, Algorithm 3.4 and subsequent
    discussion (not Algorithm 3.5, which is more complicated).  The
    original algorithm is due to Gill and Murray, 1974.

    Written by M. Overton, overton@cs.nyu.edu, last modified Feb 2005.

    It is convenient to work with `A = LDL'` where `D` is diagonal, `L`
    is unit lower triangular, and so `R = (LD^(1/2))'`.

    Examples
    --------
    >>> np.random.seed(3)
    >>> A = np.random.rand(100, 100)*2 - 1
    >>> A = A + A.T
    >>> L, e = colmod(A)
    >>> np.allclose(np.dot(L, L.T) - np.diag(e), A)
    True
    >>> 0 < np.linalg.eigvalsh(A + np.diag(e)).min()
    True
    >>> -e.max()/np.linalg.eigvalsh(A).min() < 1000
    True

    Notes
    -----
    This is based on the following MATLAB code from Michael L. Overton:
    http://cs.nyu.edu/overton/g22_opt/codes/cholmod.m

    ::

        function [R, indef, E] = cholmod(A)
        if sum(sum(abs(A-A'))) > 0
            error('A is not symmetric')
        end
        % set parameters governing bounds on L and D (eps is machine epsilon)
        n = length(A);
        diagA = diag(A);
        gamma = max(abs(diagA));              % max diagonal entry
        xi = max(max(abs(A - diag(diagA))));  % max offdiagonal entry
        delta = eps*(max([gamma+xi, 1]));
        beta = sqrt(max([gamma, xi/n, eps]));
        indef = 0;

        % initialize d and L
        d = zeros(n,1);
        if issparse(A)
            L = speye(n);   % sparse identity
        else
            L = eye(n);
        end

        % there are no inner for loops, everything implemented with
        % vector operations for a reasonable level of efficiency
        for j = 1:n
            K = 1:j-1;  % column index: all columns to left of diagonal
                        % d(K) doesn't work in case K is empty
            djtemp = A(j,j) - L(j,K)*(d(K,1).*L(j,K)');  % C(j,j) in book
            if j < n
                I = j+1:n;  % row index: all rows below diagonal
                Ccol = A(I,j) - L(I,K)*(d(K,1).*L(j,K)');  % C(I,j) in book
                theta = max(abs(Ccol));
                % guarantees d(j) not too small and L(I,j) not too big
                % in sufficiently positive definite case, d(j) = djtemp
                d(j) = max([abs(djtemp), (theta/beta)^2, delta]);
                L(I,j) = Ccol/d(j);
            else
                d(j) = max([abs(djtemp), delta]);
            end
            if d(j) > djtemp  % A was not sufficiently positive definite
                indef = 1;
            end
        end
        % convert to usual output format: replace L by L*sqrt(D) and transpose
        for j=1:n
            L(:,j) = L(:,j)*sqrt(d(j));  % L = L*diag(sqrt(d)) bad in sparse case
        end;
        R = L';
        if nargout == 3
            E = A - R'*R;
        end
    """
    assert np.allclose(A, A.T)
    n = A.shape[0]
    A_diag = A.diagonal()
    gamma = abs(A_diag).max()
    xi = abs(A - np.diag(A_diag)).max()   # max offdiagonal entry
    delta = _EPS*max(gamma + xi, 1)
    beta = np.sqrt(max(gamma, xi/n, _EPS))
    indef = 0   # computed for parity with the MATLAB code; not returned

    # Initialize d and L.
    d = np.zeros(n, dtype=float)
    if sp.sparse.issparse(A):
        # Sparse identity (LIL format supports the assignments below).
        L = sp.sparse.eye(*A.shape).tolil()
    else:
        L = np.eye(n)

    # There are no inner for loops; everything is implemented with
    # vector operations for a reasonable level of efficiency.
    for j in xrange(n):
        if j == 0:
            K = []            # column index: all columns to left of diagonal
                              # d(K) doesn't work in case K is empty
        else:
            K = np.s_[:j]
        djtemp = A[j, j] - np.dot(L[j, K], d[K]*L[j, K].T)    # C(j,j) in book
        if j < n - 1:
            I = np.s_[j+1:n]  # row index: all rows below diagonal
            Ccol = A[I, j] - np.dot(L[I, K], d[K]*L[j, K].T)  # C(I,j) in book
            theta = abs(Ccol).max()
            # Guarantees d(j) not too small and L(I,j) not too big;
            # in the sufficiently positive definite case, d(j) = djtemp.
            d[j] = max(abs(djtemp), (theta/beta)**2, delta)
            L[I, j] = Ccol/d[j]
        else:
            d[j] = max(abs(djtemp), delta)
        if d[j] > djtemp:
            # A was not sufficiently positive definite.
            indef = True

    # Convert to usual output format: replace L by L*sqrt(D).
    for j in xrange(n):
        L[:, j] = L[:, j]*np.sqrt(d[j])  # L = L*diag(sqrt(d)) bad in sparse case
    e = (np.dot(L, L.T) - A).diagonal()
    return L, e
def mchol2(A):
    r"""Modified Cholesky decomposition `G + E = L*D*L'`.

    Based on the `MATLAB code
    <http://iew3.technion.ac.il/~mcib/mcholmz.m>`_ by Brian Borchers
    and Michael Zibulevsky::

        %
        %  [L,D,E,pneg]=mchol2(G)
        %
        %  Given a symmetric matrix G, find a matrix E of "small" norm
        %  and L, and D such that G+E is Positive Definite, and
        %
        %      G+E = L*D*L'
        %
        %  Also, calculate a direction pneg, such that if G is not PSD,
        %  then
        %
        %      pneg'*G*pneg < 0
        %
        %  Reference: Gill, Murray, and Wright, "Practical Optimization",
        %  p111.
        %  Author: Brian Borchers (borchers@nmt.edu)
        %  Modification: Michael Zibulevsky (mzib@ee.technion.ac.il)
        %
        function [L,D,E,pneg]=mcholmz3(G)

        n=size(G,1);   % n gives the size of the matrix.

        % gamma, zi, nu, and beta2 are quantities used by the algorithm.
        gamma=max(diag(G));
        zi=max(max(G-diag(diag(G))));
        nu=max([1,sqrt(n^2-1)]);
        beta2=max([gamma, zi/nu, 1.0E-15]);

        C=diag(diag(G));
        L=zeros(n);
        D=zeros(n,1);   % use vectors as diagonal matrices
        E=zeros(n,1);

        % Loop through, calculating column j of L.
        for j=1:n,
          bb=[1:j-1];
          ee=[j+1:n];

          if (j > 1),
            L(j,bb)=C(j,bb)./D(bb)';   % Calculate the jth row of L.
          end;
          if (j >= 2)
            if (j < n),
              C(ee,j)=G(ee,j)-C(ee,bb)*L(j,bb)';  % Update the jth column of C.
            end;
          else
            C(ee,j)=G(ee,j);
          end;

          % Update theta.
          if (j == n)
            theta(j)=0;
          else
            theta(j)=max(abs(C(ee,j)));
          end;

          % Update D.
          D(j)=max([eps,abs(C(j,j)),theta(j)^2/beta2]');

          % Update E.
          E(j)=D(j)-C(j,j);

          % Update C again...
          %for i=j+1:n,
          %  C(i,i)=C(i,i)-C(i,j)^2/D(j,j);
          %end;
          ind=[j*(n+1)+1 : n+1 : n*n]';
          C(ind)=C(ind)-(1/D(j))*C(ee,j).^2;
        end;

        % Put 1's on the diagonal of L.
        %for j=1:n,
        %  L(j,j)=1;
        %end;
        ind=[1 : n+1 : n*n]';
        L(ind)=1;

        % If needed, find a descent direction.
        if (nargout == 4)
          [m,col]=min(diag(C));
          rhs=zeros(n,1);
          rhs(col)=1;
          pneg=L'\rhs;
        end;
        return

        %%%%%%%%%%%%%%%%%%%%%% Test %%%%%%%%%%%%%
        n=3; G=rand(n); G=G+G'; eigG=eig(G),
        [L,D,E,pneg]=mchol(G)
        [L1,D1,E1,pneg1]=mcholmz1(G);
        LL=L-L1, DD=D-D1, EE=E-E1, pp=pneg-pneg1

        n=3; G=rand(n); G=G+G'; eigG=eig(G),
        [L,D,E,pneg]=mchol(G)
        [L2,D2,E2,pneg2]=mcholmz2(G);
        LL=L-L2, DD=D-diag(D2), EE=E-diag(E2), pp=pneg-pneg2
    """
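    # NOTE: the body below is a sketch -- a minimal direct translation
    # of the MATLAB above (the original listing left the body out).
    # It assumes a dense symmetric `A` and the return convention
    # `(L, d, e)` with `A + np.diag(e) = np.dot(L*d, L.T)` (`L` unit
    # lower triangular, `d` the diagonal of `D`); the `pneg`
    # descent-direction branch of the MATLAB code is omitted.
    G = np.asarray(A, dtype=float)
    n = G.shape[0]
    gamma = G.diagonal().max()
    zi = (G - np.diag(G.diagonal())).max()   # n.b. no abs(), as in the MATLAB
    nu = max(1.0, np.sqrt(n**2 - 1.0))
    beta2 = max(gamma, zi/nu, 1.0e-15)
    C = np.diag(G.diagonal())
    L = np.eye(n, dtype=float)
    d = np.zeros(n, dtype=float)
    e = np.zeros(n, dtype=float)
    for j in xrange(n):
        if j > 0:
            L[j, :j] = C[j, :j]/d[:j]    # the jth row of L
            # Update the jth column of C.
            C[j+1:, j] = G[j+1:, j] - np.dot(C[j+1:, :j], L[j, :j])
        else:
            C[1:, 0] = G[1:, 0]
        theta = abs(C[j+1:, j]).max() if j + 1 < n else 0.0
        d[j] = max(_EPS, abs(C[j, j]), theta**2/beta2)
        e[j] = d[j] - C[j, j]
        # Update the trailing diagonal of C.
        ii = np.arange(j+1, n)
        C[ii, ii] -= C[ii, j]**2/d[j]
    return L, d, e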