"""These are a collection of codes based on the [GMW81]_ algorithm.
Comments
--------
:func:`colmod` :
    Works well and is quite efficient since it is vectorized, but the
    factorization is not quite as good as that of :func:`gmw`.  Use
    this for now.
:func:`gmw` :
    Works well, but returns an explicit permutation matrix and is
    very slow because it uses Python loops.
:func:`gmchol` :
    Fast (uses weave), but performs very badly on large matrices
    because it uses no pivoting.
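
Examples
--------
A quick sanity check of the recommended routine (an illustrative
sketch; the seed and matrix size are arbitrary):

>>> np.random.seed(0)
>>> A = np.random.rand(10, 10)*2 - 1
>>> A = A + A.T
>>> L, e = colmod(A)
>>> np.allclose(np.dot(L, L.T) - np.diag(e), A)
True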
"""
from __future__ import division
__all__ = ['gmw', 'gmchol', 'colmod', 'mchol2']
import numpy as np
import scipy as sp
import scipy.sparse
import scipy.weave
_FINFO = np.finfo(float)
_EPS = _FINFO.eps
def gmw(A):
"""Return `(P, L, e)` such that `P.T*A*P = L*L.T - diag(e)`.
Returns
-------
P : 2d array
Permutation matrix used for pivoting.
L : 2d array
Lower triangular factor
e : 1d array
        Nonnegative diagonal entries of the shift matrix `diag(e)`.
Notes
-----
The Gill, Murray, and Wright modified Cholesky algorithm.
    Algorithm 6.5 from page 148 of 'Numerical Optimization' by Jorge
    Nocedal and Stephen J. Wright, 1999 (1st ed.).
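
    Examples
    --------
    A minimal sanity check of the returned factorization (an
    illustrative sketch; the seed and matrix size are arbitrary):

    >>> np.random.seed(3)
    >>> A = np.random.rand(6, 6)*2 - 1
    >>> A = A + A.T
    >>> P, L, e = gmw(A)
    >>> np.allclose(np.dot(P.T, np.dot(A, P)) + np.diag(e),
    ...             np.dot(L, L.T))
    True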
"""
n = A.shape[0]
# Calculate gamma(A) and xi(A).
gamma = 0.0
xi = 0.0
for i in xrange(n):
gamma = max(abs(A[i, i]), gamma)
for j in xrange(i+1, n):
xi = max(abs(A[i, j]), xi)
# Calculate delta and beta.
delta = _EPS * max(gamma + xi, 1.0)
if n == 1:
beta = np.sqrt(max(gamma, _EPS))
else:
beta = np.sqrt(max(gamma, xi / np.sqrt(n**2 - 1.0), _EPS))
# Initialise data structures.
a = 1.0 * A
r = 0.0 * A
e = np.zeros(n, dtype=float)
P = np.eye(n, dtype=float)
# Main loop.
for j in xrange(n):
        # Row and column swapping: find the index q >= j of the largest
        # diagonal element.
q = j
for i in xrange(j+1, n):
if abs(a[i, i]) >= abs(a[q, q]):
q = i
# Interchange row and column j and q (if j != q).
if q != j:
            # Temporary permutation matrix for swapping 2 rows or columns.
            p = np.eye(n, dtype=float)
            # Modify the permutation matrix P by swapping columns.
            row_P = 1.0*P[:, q]
            P[:, q] = P[:, j]
            P[:, j] = row_P
            # Modify the permutation matrix p by swapping rows (same as
            # columns because p = p.T).
            row_p = 1.0*p[q]
            p[q] = p[j]
            p[j] = row_p
# Permute a and r (p = pT).
a = np.dot(p, np.dot(a, p))
r = np.dot(r, p)
# Calculate dj.
theta_j = 0.0
if j < n-1:
for i in xrange(j+1, n):
theta_j = max(theta_j, abs(a[j, i]))
dj = max(abs(a[j, j]), (theta_j/beta)**2, delta)
# Calculate e (not really needed!).
e[j] = dj - a[j, j]
# Calculate row j of r and update a.
        r[j, j] = np.sqrt(dj)   # Note: the sqrt introduces roundoff error.
for i in xrange(j+1, n):
r[j, i] = a[j, i] / r[j, j]
for k in xrange(j+1, i+1):
a[i, k] = a[k, i] = a[k, i] - r[j, i] * r[j, k] # Keep matrix a symmetric.
    # L = r.T is the lower Cholesky factor of P.T*A*P + diag(e).
return P, r.T, e
def gmchol(A, test=False):
"""Return `(L, e)`: the Gill-Murray generalized Cholesky
decomposition of `M = A + diag(e) = dot(L, L.T)` where
1) `M` is safely symmetric positive definite (SPD) and well
conditioned.
    2) `e` is small (zero if `A` is already SPD, and not much larger
       than the most negative eigenvalue of `A`).
    .. math::

       A + \operatorname{diag}(e) = L L^{T}
Parameters
----------
A : array
Must be a non-singular and symmetric matrix
test : bool
If `True`, use the directly translated iterative code for
testing.
Returns
-------
L : 2d array
Lower triangular Cholesky factor.
e : 1d array
Diagonals of correction matrix `E`.
Examples
--------
>>> np.random.seed(3)
>>> A = np.random.rand(100,100)*2 - 1
>>> A = A + A.T
>>> L, e = gmchol(A, test=True)
>>> np.allclose(np.dot(L,L.T), (A + np.diag(e)))
True
>>> -e.max()/np.linalg.eigvalsh(A).min() < 1000
True
>>> A2 = np.array([[-0.451, -0.041, 0.124],
... [-0.041, -0.265, 0.061],
... [ 0.124, 0.061, -0.517]])
    >>> L, e = gmchol(A2, test=True)
    >>> np.allclose(np.dot(L, L.T), A2 + np.diag(e))
    True
Notes
-----
Translated from the following Gauss code of J. Gill::
        /****************************************************************/
        /* This is the Gill/Murray cholesky routine.  Reference:        */
        /*                                                              */
        /* Gill, Jeff and Gary King. ``What to do When Your Hessian     */
        /* is Not Invertible: Alternatives to Model Respecification     */
        /* in Nonlinear Estimation,'' Sociological Methods and          */
        /* Research, Vol. 32, No. 1 (2004): Pp. 54--87.                 */
        /*                                                              */
        /* Abstract                                                     */
        /*                                                              */
        /* What should a researcher do when statistical analysis        */
        /* software terminates before completion with a message that    */
        /* the Hessian is not invertable? The standard textbook advice  */
        /* is to respecify the model, but this is another way of saying */
        /* that the researcher should change the question being asked.  */
        /* Obviously, however, computer programs should not be in the   */
        /* business of deciding what questions are worthy of            */
        /* study. Although noninvertable Hessians are sometimes signals */
        /* of poorly posed questions, nonsensical models, or            */
        /* inappropriate estimators, they also frequently occur when    */
        /* information about the quantities of interest exists in the   */
        /* data, through the likelihood function. We explain the        */
        /* problem in some detail and lay out two preliminary proposals */
        /* for ways of dealing with noninvertable Hessians without      */
        /* changing the question asked.                                 */
        /****************************************************************/
proc gmchol(A);
/* calculates the Gill-Murray generalized choleski decomposition */
/* input matrix A must be non-singular and symmetric */
/* Author: Jeff Gill. Part of the Hessian Project. */
local i,j,k,n,sum,R,theta_j,norm_A,phi_j,delta,xi_j,gamm,E,beta_j;
n = rows(A);
R = eye(n);
E = zeros(n,n);
norm_A = maxc(sumc(abs(A)));
gamm = maxc(abs(diag(A)));
delta = maxc(maxc(__macheps*norm_A~__macheps));
for j (1, n, 1);
theta_j = 0;
for i (1, n, 1);
sum = 0;
for k (1, (i-1), 1);
sum = sum + R[k,i]*R[k,j];
endfor;
R[i,j] = (A[i,j] - sum)/R[i,i];
if (A[i,j] -sum) > theta_j;
theta_j = A[i,j] - sum;
endif;
if i > j;
R[i,j] = 0;
endif;
endfor;
sum = 0;
for k (1, (j-1), 1);
sum = sum + R[k,j]^2;
endfor;
phi_j = A[j,j] - sum;
if (j+1) <= n;
xi_j = maxc(abs(A[(j+1):n,j]));
else;
xi_j = maxc(abs(A[n,j]));
endif;
beta_j = sqrt(maxc(maxc(gamm~(xi_j/n)~__macheps)));
if delta >= maxc(abs(phi_j)~((theta_j^2)/(beta_j^2)));
E[j,j] = delta - phi_j;
elseif abs(phi_j) >= maxc(((delta^2)/(beta_j^2))~delta);
E[j,j] = abs(phi_j) - phi_j;
elseif ((theta_j^2)/(beta_j^2)) >= maxc(delta~abs(phi_j));
E[j,j] = ((theta_j^2)/(beta_j^2)) - phi_j;
endif;
R[j,j] = sqrt(A[j,j] - sum + E[j,j]);
endfor;
retp(R'R);
endp;
"""
if not test:
A = np.ascontiguousarray(A, dtype='d')
n = A.shape[0]
L = np.eye(n, dtype='d')
e = np.zeros(n, dtype='d')
norm_A = float(np.linalg.norm(A, np.inf))
gamm = float(np.linalg.norm(A.diagonal(), np.inf))
delta = float(_EPS*max(1, norm_A))
code = r"""
int i, j, k;
        double sum, theta_j2, phi_j, xi_j, beta_j2;
for (j=0;j<n;++j) {
theta_j2 = 0;
for (i=0;i<n;++i) {
sum = 0;
for (k=0;k<i;++k){
sum += L2(i,k)*L2(j,k);
}
if (i <= j) {
L2(j,i) = (A2(i,j) - sum)/L2(i,i);
}
theta_j2 = std::max(theta_j2, A2(i,j) - sum);
}
theta_j2 *= theta_j2;
sum = 0;
for (k=0;k<j;++k) {
sum += L2(j,k)*L2(j,k);
}
phi_j = A2(j,j) - sum;
if (j + 1 < n) {
xi_j = 0;
for (k=j+1;k<n;++k) {
                    xi_j = std::max(xi_j, fabs(A2(k,j)));
}
} else {
xi_j = fabs(A2(n-1,j));
}
beta_j2 = xi_j/n;
beta_j2 = (beta_j2 < gamm)?gamm:beta_j2;
            beta_j2 = (beta_j2 < %(eps)g)?%(eps)g:beta_j2;
if (delta >= std::max(fabs(phi_j), theta_j2/beta_j2)) {
e[j] = delta - phi_j;
} else if (fabs(phi_j) >= std::max(delta, delta*delta/beta_j2)) {
e[j] = fabs(phi_j) - phi_j;
} else if (std::max(delta, fabs(phi_j)) < theta_j2/beta_j2) {
e[j] = theta_j2/beta_j2 - phi_j;
}
L2(j,j) = sqrt(A2(j,j) - sum + e[j]);
}
""" % dict(eps=_EPS)
local_dict = dict(L=L, n=n, e=e, A=A, norm_A=norm_A,
gamm=gamm, delta=delta)
headers = ['<math.h>', '<algorithm>']
sp.weave.inline(code,
local_dict.keys(),
local_dict=local_dict,
headers=headers)
return (L, e)
# local i,j,k,n,sum,R,theta_j,norm_A,phi_j,delta,xi_j,gamm,E,beta_j;
n = A.shape[0] # n = rows(A);
R = np.eye(n,dtype=float) # R = eye(n);
e = np.zeros(n, dtype=float) # E = zeros(n,n);
norm_A = abs(A).sum(axis=0).max() # norm_A = maxc(sumc(abs(A)));
gamm = abs(A.diagonal()).max() # gamm = maxc(abs(diag(A)));
delta = _EPS*max(1, norm_A) # delta = maxc(maxc(__macheps*norm_A~__macheps));
for j in xrange(n): # for j (1, n, 1);
theta_j = 0 # theta_j = 0;
for i in xrange(n): # for i (1, n, 1);
sum_ = 0 # sum = 0;
for k in xrange(i): # for k (1, (i-1), 1);
sum_ += R[k,i]*R[k,j] # sum = sum + R[k,i]*R[k,j];
# endfor;
R[i,j] = (A[i,j]
- sum_)/R[i,i] # R[i,j] = (A[i,j] - sum)/R[i,i];
theta_j = max(theta_j, # if (A[i,j] -sum) > theta_j;
A[i,j] - sum_) # theta_j = A[i,j] - sum;
# endif;
if i > j: # if i > j;
R[i,j] = 0 # R[i,j] = 0;
# endif;
# endfor;
sum_ = 0 # sum = 0;
for k in xrange(j): # for k (1, (j-1), 1);
sum_ += R[k,j]**2 # sum = sum + R[k,j]^2;
# endfor;
phi_j = A[j,j] - sum_ # phi_j = A[j,j] - sum;
if j+1 < n: # if (j+1) <= n;
xi_j = abs(A[j+1:,j]).max() # xi_j = maxc(abs(A[(j+1):n,j]));
else: # else;
xi_j = abs(A[-1,j]) # xi_j = maxc(abs(A[n,j]));
# endif;
beta_j = np.sqrt(max(gamm, # beta_j = sqrt(maxc(maxc(gamm~(xi_j/n)~__macheps)));
xi_j/n,
_EPS))
if delta >= max(abs(phi_j), # if delta >= maxc(abs(phi_j)~((theta_j^2)/(beta_j^2)));
(theta_j/beta_j)**2):
e[j] = delta - phi_j # E[j,j] = delta - phi_j;
elif abs(phi_j) >= max(delta, # elseif abs(phi_j) >= maxc(((delta^2)/(beta_j^2))~delta);
(delta/beta_j)**2):
e[j] = abs(phi_j) - phi_j # E[j,j] = abs(phi_j) - phi_j;
elif (max(delta, abs(phi_j)) < # elseif ((theta_j^2)/(beta_j^2)) >= maxc(delta~abs(phi_j));
(theta_j/beta_j)**2):
e[j] = ((theta_j/beta_j)**2
- phi_j) # E[j,j] = ((theta_j^2)/(beta_j^2)) - phi_j;
# endif;
R[j,j] = np.sqrt(A[j,j] # R[j,j] = sqrt(A[j,j] - sum + E[j,j]);
- sum_ + e[j])
# endfor;
return (R.T, e) # retp(R'R);
def colmod(A):
"""Modified Cholesky factorization
`R = cholmod(A)` returns the upper Cholesky factor of `A` (same as `chol`)
if `A` is (sufficiently) positive definite, and otherwise returns a
modified factor `R` with diagonal entries `>= sqrt(delta)` and
offdiagonal entries `<= beta` in absolute value,
where `delta` and `beta` are defined in terms of size of diagonal and
offdiagonal entries of A and the machine precision; see below.
The idea is to ensure that `E = A - R'*R` is reasonably small if `A` is
not too far from being indefinite. If `A` is sparse, so is `R.
The output parameter indef is set to 0 if `A` is sufficiently positive
definite and to 1 if the factorization is modified.
The point of modified Cholesky is to avoid computing eigenvalues,
but for dense matrices, MODCHOL takes longer than calling the built-in
EIG, because of the cost of interpreting the code, even though it
only has one loop and uses vector operations.
reference: Nocedal and Wright, Algorithm 3.4 and subsequent discussion
(not Algorithm 3.5, which is more complicated)
original algorithm is due to Gill and Murray, 1974
written by M. Overton, overton@cs.nyu.edu, last modified Feb 2005
convenient to work with A = LDL' where D is diagonal, L is unit
lower triangular, and so R = (LD^(1/2))'
Examples
--------
>>> np.random.seed(3)
>>> A = np.random.rand(100,100)*2-1
>>> A = A + A.T
>>> L, e = colmod(A)
>>> np.allclose(np.dot(L, L.T) - np.diag(e), A)
True
>>> 0 < np.linalg.eigvalsh(A + np.diag(e)).min()
True
>>> -e.max()/np.linalg.eigvalsh(A).min() < 1000
True
Notes
-----
This is based on the following MATLAB code from Michael
L. Overton: http://cs.nyu.edu/overton/g22_opt/codes/cholmod.m
::
function [R, indef, E] = cholmod(A)
if sum(sum(abs(A-A'))) > 0
error('A is not symmetric')
end
% set parameters governing bounds on L and D (eps is machine epsilon)
n = length(A);
diagA = diag(A);
gamma = max(abs(diagA)); % max diagonal entry
xi = max(max(abs(A - diag(diagA)))); % max offidagonal entry
delta = eps*(max([gamma+xi, 1]));
beta = sqrt(max([gamma, xi/n, eps]));
indef = 0;
% initialize d and L
d = zeros(n,1);
if issparse(A)
L = speye(n); % sparse identity
else
L = eye(n);
end
% there are no inner for loops, everything implemented with
% vector operations for a reasonable level of efficiency
for j = 1:n
K = 1:j-1; % column index: all columns to left of diagonal
% d(K) doesn't work in case K is empty
djtemp = A(j,j) - L(j,K)*(d(K,1).*L(j,K)'); % C(j,j) in book
if j < n
I = j+1:n; % row index: all rows below diagonal
Ccol = A(I,j) - L(I,K)*(d(K,1).*L(j,K)'); % C(I,j) in book
theta = max(abs(Ccol));
% guarantees d(j) not too small and L(I,j) not too big
% in sufficiently positive definite case, d(j) = djtemp
d(j) = max([abs(djtemp), (theta/beta)^2, delta]);
L(I,j) = Ccol/d(j);
else
d(j) = max([abs(djtemp), delta]);
end
if d(j) > djtemp % A was not sufficiently positive definite
indef = 1;
end
end
% convert to usual output format: replace L by L*sqrt(D) and transpose
for j=1:n
L(:,j) = L(:,j)*sqrt(d(j)); % L = L*diag(sqrt(d)) bad in sparse case
end;
R = L';
if nargout == 3
E = A - R'*R;
end
"""
assert np.allclose(A, A.T)
n = A.shape[0]
A_diag = A.diagonal()
gamma = abs(A_diag).max()
    xi = abs(A - np.diag(A_diag)).max()   # max offdiagonal entry
delta = _EPS*max(gamma + xi, 1)
beta = np.sqrt(max(gamma, xi/n, _EPS))
    indef = False   # Kept for parity with the MATLAB code; not returned.
# initialize d and L
d = np.zeros(n, dtype=float)
    if sp.sparse.issparse(A):
        L = sp.sparse.eye(*A.shape)   # sparse identity, like the MATLAB speye
    else:
        L = np.eye(n)
# there are no inner for loops, everything implemented with
# vector operations for a reasonable level of efficiency
for j in xrange(n):
if j == 0:
K = [] # column index: all columns to left of diagonal
# d(K) doesn't work in case K is empty
else:
K = np.s_[:j]
djtemp = A[j, j] - np.dot(L[j, K], d[K]*L[j,K].T) # C(j,j) in book
if j < n - 1:
I = np.s_[j+1:n] # row index: all rows below diagonal
Ccol = A[I,j] - np.dot(L[I,K], d[K]*L[j,K].T) # C(I,j) in book
theta = abs(Ccol).max()
# guarantees d(j) not too small and L(I,j) not too big
# in sufficiently positive definite case, d(j) = djtemp
d[j] = max(abs(djtemp), (theta/beta)**2, delta)
L[I,j] = Ccol/d[j]
else:
d[j] = max(abs(djtemp), delta)
if d[j] > djtemp: # A was not sufficiently positive definite
indef = True
# convert to usual output format: replace L by L*sqrt(D) and transpose
for j in xrange(n):
L[:,j] = L[:,j]*np.sqrt(d[j]) # L = L*diag(sqrt(d)) bad in sparse case
e = (np.dot(L, L.T) - A).diagonal()
return L, e
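

def _newton_direction(H, g):
    """Illustrative sketch (not part of the original module): the
    standard use of a modified Cholesky factorization noted in the
    :func:`colmod` docstring, namely computing a Newton-like descent
    direction `p` from a possibly indefinite Hessian `H` and gradient
    `g` by solving `(H + diag(e))*p = -g` with two triangular solves.

    Examples
    --------
    >>> np.random.seed(0)
    >>> H = np.random.rand(5, 5)*2 - 1
    >>> H = H + H.T
    >>> g = np.random.rand(5)
    >>> p = _newton_direction(H, g)
    >>> np.dot(g, p) < 0   # p is a descent direction
    True
    """
    from scipy.linalg import solve_triangular
    L, e = colmod(np.asarray(H, dtype=float))
    # (H + diag(e)) = L*L.T, so solve L*y = -g, then L.T*p = y.
    y = solve_triangular(L, -np.asarray(g, dtype=float), lower=True)
    return solve_triangular(L.T, y, lower=False)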
def mchol2(A):
"""
Based on the `MATLAB code <http://iew3.technion.ac.il/~mcib/mcholmz.m>`_
by Brian Borchers and Michael Zibulevsky::
%
% [L,D,E,pneg]=mchol2(G)
%
        % Given a symmetric matrix G, find a matrix E of "small" norm and
        % matrices L and D such that G+E is Positive Definite, and
%
% G+E = L*D*L'
%
% Also, calculate a direction pneg, such that if G is not PSD, then
%
% pneg'*G*pneg < 0
%
% Reference: Gill, Murray, and Wright, "Practical Optimization", p111.
% Author: Brian Borchers (borchers@nmt.edu)
% Modification: Michael Zibulevsky (mzib@ee.technion.ac.il)
%
function [L,D,E,pneg]=mcholmz3(G)
%
% n gives the size of the matrix.
%
n=size(G,1);
%
% gamma, zi, nu, and beta2 are quantities used by the algorithm.
%
gamma=max(diag(G));
zi=max(max(G-diag(diag(G))));
nu=max([1,sqrt(n^2-1)]);
beta2=max([gamma, zi/nu, 1.0E-15]);
C=diag(diag(G));
L=zeros(n);
D=zeros(n,1); % use vestors as diagonal matrices
E=zeros(n,1); %%****************** **************
%
% Loop through, calculating column j of L for j=1:n
%
for j=1:n,
bb=[1:j-1];
ee=[j+1:n];
if (j > 1),
L(j,bb)=C(j,bb)./D(bb)'; % Calculate the jth row of L.
end;
if (j >= 2)
if (j < n),
C(ee,j)=G(ee,j)- C(ee,bb)*L(j,bb)'; % Update the jth column of C.
end;
else
C(ee,j)=G(ee,j);
end;
%%%% Update theta.
if (j == n)
theta(j)=0;
else
theta(j)=max(abs(C(ee,j)));
end;
%%%%% Update D
D(j)=max([eps,abs(C(j,j)),theta(j)^2/beta2]');
%%%%%% Update E.
E(j)=D(j)-C(j,j);
%%%%%%% Update C again...
%for i=j+1:n,
% C(i,i)=C(i,i)-C(i,j)^2/D(j,j);
%end;
ind=[j*(n+1)+1 : n+1 : n*n]';
C(ind)=C(ind)-(1/D(j))*C(ee,j).^2;
end;
%
% Put 1's on the diagonal of L
%
%for j=1:n,
% L(j,j)=1;
%end;
ind=[1 : n+1 : n*n]';
L(ind)=1;
%
% if needed, finded a descent direction.
%
if (nargout == 4)
[m,col]=min(diag(C));
rhs=zeros(n,1);
rhs(col)=1;
pneg=L'\rhs;
end;
return
%%%%%%%%%%%%%%%%%%%%%% Test %%%%%%%%%%%%%
n=3; G=rand(n);G=G+G';eigG=eig(G),
[L,D,E,pneg]=mchol(G)
[L1,D1,E1,pneg1]=mcholmz1(G);
LL=L-L1, DD=D-D1, EE=E-E1, pp=pneg-pneg1
n=3; G=rand(n);G=G+G';eigG=eig(G),
[L,D,E,pneg]=mchol(G)
[L2,D2,E2,pneg2]=mcholmz2(G);
LL=L-L2, DD=D-diag(D2), EE=E-diag(E2), pp=pneg-pneg2
"""