"""These are a collection of codes based on the [GMW81]_ algorithm.
Comments
--------
:func:`colmod` :
    Works well and is quite efficient since it is vectorized, but the
    factorization is not quite as good as that of :func:`gmw`.  Use
    this for now.
:func:`gmw` :
    Works well, but returns an explicit permutation matrix and is
    very slow because it uses Python loops.
:func:`gmchol` :
    Fast (uses weave), but performs very badly on large matrices
    because it uses no pivoting.
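
Examples
--------
A quick sanity check of the recommended routine (an illustrative
sketch; the seed and matrix size are arbitrary):

>>> np.random.seed(0)
>>> A = np.random.rand(10, 10)*2 - 1
>>> A = A + A.T
>>> L, e = colmod(A)
>>> np.allclose(np.dot(L, L.T) - np.diag(e), A)
True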
"""
from __future__ import division
__all__ = ['gmw', 'gmchol', 'colmod', 'mchol2']
import numpy as np
import scipy as sp
import scipy.sparse
import scipy.weave
_FINFO = np.finfo(float)
_EPS = _FINFO.eps
def gmw(A):
"""Return `(P, L, e)` such that `P.T*A*P = L*L.T - diag(e)`.
Returns
-------
P : 2d array
Permutation matrix used for pivoting.
L : 2d array
Lower triangular factor
e : 1d array
        Nonnegative diagonal entries of the shift matrix `diag(e)`.
Notes
-----
The Gill, Murray, and Wright modified Cholesky algorithm.
    Algorithm 6.5 from page 148 of 'Numerical Optimization' by Jorge
    Nocedal and Stephen J. Wright, 1999 (1st ed.).
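
    Examples
    --------
    A minimal sanity check of the returned factorization (an
    illustrative sketch; the seed and matrix size are arbitrary):

    >>> np.random.seed(3)
    >>> A = np.random.rand(6, 6)*2 - 1
    >>> A = A + A.T
    >>> P, L, e = gmw(A)
    >>> np.allclose(np.dot(P.T, np.dot(A, P)) + np.diag(e),
    ...             np.dot(L, L.T))
    True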
"""
n = A.shape[0]
# Calculate gamma(A) and xi(A).
gamma = 0.0
xi = 0.0
for i in xrange(n):
gamma = max(abs(A[i, i]), gamma)
for j in xrange(i+1, n):
xi = max(abs(A[i, j]), xi)
# Calculate delta and beta.
delta = _EPS * max(gamma + xi, 1.0)
if n == 1:
beta = np.sqrt(max(gamma, _EPS))
else:
beta = np.sqrt(max(gamma, xi / np.sqrt(n**2 - 1.0), _EPS))
# Initialise data structures.
a = 1.0 * A
r = 0.0 * A
e = np.zeros(n, dtype=float)
P = np.eye(n, dtype=float)
# Main loop.
for j in xrange(n):
        # Row and column swapping: find the index q >= j of the largest
        # diagonal element.
q = j
for i in xrange(j+1, n):
if abs(a[i, i]) >= abs(a[q, q]):
q = i
# Interchange row and column j and q (if j != q).
if q != j:
            # Temporary permutation matrix for swapping 2 rows or columns.
            p = np.eye(n, dtype=float)
            # Modify the permutation matrix P by swapping columns.
            row_P = 1.0*P[:, q]
            P[:, q] = P[:, j]
            P[:, j] = row_P
            # Modify the permutation matrix p by swapping rows (same as
            # columns because p = p.T).
            row_p = 1.0*p[q]
            p[q] = p[j]
            p[j] = row_p
# Permute a and r (p = pT).
a = np.dot(p, np.dot(a, p))
r = np.dot(r, p)
# Calculate dj.
theta_j = 0.0
if j < n-1:
for i in xrange(j+1, n):
theta_j = max(theta_j, abs(a[j, i]))
dj = max(abs(a[j, j]), (theta_j/beta)**2, delta)
# Calculate e (not really needed!).
e[j] = dj - a[j, j]
# Calculate row j of r and update a.
        r[j, j] = np.sqrt(dj)   # Note: the sqrt introduces roundoff error.
for i in xrange(j+1, n):
r[j, i] = a[j, i] / r[j, j]
for k in xrange(j+1, i+1):
a[i, k] = a[k, i] = a[k, i] - r[j, i] * r[j, k] # Keep matrix a symmetric.
    # L = r.T is the lower Cholesky factor of P.T*A*P + diag(e).
return P, r.T, e
def gmchol(A, test=False):
"""Return `(L, e)`: the Gill-Murray generalized Cholesky
decomposition of `M = A + diag(e) = dot(L, L.T)` where
1) `M` is safely symmetric positive definite (SPD) and well
conditioned.
    2) `e` is small (zero if `A` is already SPD, and not much larger
       than the most negative eigenvalue of `A`).
    .. math::

       A + \operatorname{diag}(e) = L L^{T}
Parameters
----------
A : array
Must be a non-singular and symmetric matrix
test : bool
If `True`, use the directly translated iterative code for
testing.
Returns
-------
L : 2d array
Lower triangular Cholesky factor.
e : 1d array
Diagonals of correction matrix `E`.
Examples
--------
>>> np.random.seed(3)
>>> A = np.random.rand(100,100)*2 - 1
>>> A = A + A.T
>>> L, e = gmchol(A, test=True)
>>> np.allclose(np.dot(L,L.T), (A + np.diag(e)))
True
>>> -e.max()/np.linalg.eigvalsh(A).min() < 1000
True
>>> A2 = np.array([[-0.451, -0.041, 0.124],
... [-0.041, -0.265, 0.061],
... [ 0.124, 0.061, -0.517]])
    >>> L, e = gmchol(A2, test=True)
    >>> np.allclose(np.dot(L, L.T), A2 + np.diag(e))
    True
Notes
-----
Translated from the following Gauss code of J. Gill::
        /****************************************************************/
        /* This is the Gill/Murray cholesky routine.  Reference:        */
        /*                                                              */
        /* Gill, Jeff and Gary King. ``What to do When Your Hessian     */
        /* is Not Invertible: Alternatives to Model Respecification     */
        /* in Nonlinear Estimation,'' Sociological Methods and          */
        /* Research, Vol. 32, No. 1 (2004): Pp. 54--87.                 */
        /*                                                              */
        /* Abstract                                                     */
        /*                                                              */
        /* What should a researcher do when statistical analysis        */
        /* software terminates before completion with a message that    */
        /* the Hessian is not invertable? The standard textbook advice  */
        /* is to respecify the model, but this is another way of saying */
        /* that the researcher should change the question being asked.  */
        /* Obviously, however, computer programs should not be in the   */
        /* business of deciding what questions are worthy of            */
        /* study. Although noninvertable Hessians are sometimes signals */
        /* of poorly posed questions, nonsensical models, or            */
        /* inappropriate estimators, they also frequently occur when    */
        /* information about the quantities of interest exists in the   */
        /* data, through the likelihood function. We explain the        */
        /* problem in some detail and lay out two preliminary proposals */
        /* for ways of dealing with noninvertable Hessians without      */
        /* changing the question asked.                                 */
        /****************************************************************/
proc gmchol(A);
/* calculates the Gill-Murray generalized choleski decomposition */
/* input matrix A must be non-singular and symmetric */
/* Author: Jeff Gill. Part of the Hessian Project. */
local i,j,k,n,sum,R,theta_j,norm_A,phi_j,delta,xi_j,gamm,E,beta_j;
n = rows(A);
R = eye(n);
E = zeros(n,n);
norm_A = maxc(sumc(abs(A)));
gamm = maxc(abs(diag(A)));
delta = maxc(maxc(__macheps*norm_A~__macheps));
for j (1, n, 1);
theta_j = 0;
for i (1, n, 1);
sum = 0;
for k (1, (i-1), 1);
sum = sum + R[k,i]*R[k,j];
endfor;
R[i,j] = (A[i,j] - sum)/R[i,i];
if (A[i,j] -sum) > theta_j;
theta_j = A[i,j] - sum;
endif;
if i > j;
R[i,j] = 0;
endif;
endfor;
sum = 0;
for k (1, (j-1), 1);
sum = sum + R[k,j]^2;
endfor;
phi_j = A[j,j] - sum;
if (j+1) <= n;
xi_j = maxc(abs(A[(j+1):n,j]));
else;
xi_j = maxc(abs(A[n,j]));
endif;
beta_j = sqrt(maxc(maxc(gamm~(xi_j/n)~__macheps)));
if delta >= maxc(abs(phi_j)~((theta_j^2)/(beta_j^2)));
E[j,j] = delta - phi_j;
elseif abs(phi_j) >= maxc(((delta^2)/(beta_j^2))~delta);
E[j,j] = abs(phi_j) - phi_j;
elseif ((theta_j^2)/(beta_j^2)) >= maxc(delta~abs(phi_j));
E[j,j] = ((theta_j^2)/(beta_j^2)) - phi_j;
endif;
R[j,j] = sqrt(A[j,j] - sum + E[j,j]);
endfor;
retp(R'R);
endp;
"""
if not test:
A = np.ascontiguousarray(A, dtype='d')
n = A.shape[0]
L = np.eye(n, dtype='d')
e = np.zeros(n, dtype='d')
norm_A = float(np.linalg.norm(A, np.inf))
gamm = float(np.linalg.norm(A.diagonal(), np.inf))
delta = float(_EPS*max(1, norm_A))
code = r"""
int i, j, k;
        double sum, theta_j2, phi_j, xi_j, beta_j2;
for (j=0;j<n;++j) {
theta_j2 = 0;
for (i=0;i<n;++i) {
sum = 0;
for (k=0;k<i;++k){
sum += L2(i,k)*L2(j,k);
}
if (i <= j) {
L2(j,i) = (A2(i,j) - sum)/L2(i,i);
}
theta_j2 = std::max(theta_j2, A2(i,j) - sum);
}
theta_j2 *= theta_j2;
sum = 0;
for (k=0;k<j;++k) {
sum += L2(j,k)*L2(j,k);
}
phi_j = A2(j,j) - sum;
if (j + 1 < n) {
xi_j = 0;
for (k=j+1;k<n;++k) {
                    xi_j = std::max(xi_j, fabs(A2(k,j)));
}
} else {
xi_j = fabs(A2(n-1,j));
}
beta_j2 = xi_j/n;
beta_j2 = (beta_j2 < gamm)?gamm:beta_j2;
            beta_j2 = (beta_j2 < %(eps)g)?%(eps)g:beta_j2;
if (delta >= std::max(fabs(phi_j), theta_j2/beta_j2)) {
e[j] = delta - phi_j;
} else if (fabs(phi_j) >= std::max(delta, delta*delta/beta_j2)) {
e[j] = fabs(phi_j) - phi_j;
} else if (std::max(delta, fabs(phi_j)) < theta_j2/beta_j2) {
e[j] = theta_j2/beta_j2 - phi_j;
}
L2(j,j) = sqrt(A2(j,j) - sum + e[j]);
}
""" % dict(eps=_EPS)
local_dict = dict(L=L, n=n, e=e, A=A, norm_A=norm_A,
gamm=gamm, delta=delta)
headers = ['<math.h>', '<algorithm>']
sp.weave.inline(code,
local_dict.keys(),
local_dict=local_dict,
headers=headers)
return (L, e)
# local i,j,k,n,sum,R,theta_j,norm_A,phi_j,delta,xi_j,gamm,E,beta_j;
n = A.shape[0] # n = rows(A);
R = np.eye(n,dtype=float) # R = eye(n);
e = np.zeros(n, dtype=float) # E = zeros(n,n);
norm_A = abs(A).sum(axis=0).max() # norm_A = maxc(sumc(abs(A)));
gamm = abs(A.diagonal()).max() # gamm = maxc(abs(diag(A)));
delta = _EPS*max(1, norm_A) # delta = maxc(maxc(__macheps*norm_A~__macheps));
for j in xrange(n): # for j (1, n, 1);
theta_j = 0 # theta_j = 0;
for i in xrange(n): # for i (1, n, 1);
sum_ = 0 # sum = 0;
for k in xrange(i): # for k (1, (i-1), 1);
sum_ += R[k,i]*R[k,j] # sum = sum + R[k,i]*R[k,j];
# endfor;
R[i,j] = (A[i,j]
- sum_)/R[i,i] # R[i,j] = (A[i,j] - sum)/R[i,i];
theta_j = max(theta_j, # if (A[i,j] -sum) > theta_j;
A[i,j] - sum_) # theta_j = A[i,j] - sum;
# endif;
if i > j: # if i > j;
R[i,j] = 0 # R[i,j] = 0;
# endif;
# endfor;
sum_ = 0 # sum = 0;
for k in xrange(j): # for k (1, (j-1), 1);
sum_ += R[k,j]**2 # sum = sum + R[k,j]^2;
# endfor;
phi_j = A[j,j] - sum_ # phi_j = A[j,j] - sum;
if j+1 < n: # if (j+1) <= n;
xi_j = abs(A[j+1:,j]).max() # xi_j = maxc(abs(A[(j+1):n,j]));
else: # else;
xi_j = abs(A[-1,j]) # xi_j = maxc(abs(A[n,j]));
# endif;
beta_j = np.sqrt(max(gamm, # beta_j = sqrt(maxc(maxc(gamm~(xi_j/n)~__macheps)));
xi_j/n,
_EPS))
if delta >= max(abs(phi_j), # if delta >= maxc(abs(phi_j)~((theta_j^2)/(beta_j^2)));
(theta_j/beta_j)**2):
e[j] = delta - phi_j # E[j,j] = delta - phi_j;
elif abs(phi_j) >= max(delta, # elseif abs(phi_j) >= maxc(((delta^2)/(beta_j^2))~delta);
(delta/beta_j)**2):
e[j] = abs(phi_j) - phi_j # E[j,j] = abs(phi_j) - phi_j;
elif (max(delta, abs(phi_j)) < # elseif ((theta_j^2)/(beta_j^2)) >= maxc(delta~abs(phi_j));
(theta_j/beta_j)**2):
e[j] = ((theta_j/beta_j)**2
- phi_j) # E[j,j] = ((theta_j^2)/(beta_j^2)) - phi_j;
# endif;
R[j,j] = np.sqrt(A[j,j] # R[j,j] = sqrt(A[j,j] - sum + E[j,j]);
- sum_ + e[j])
# endfor;
return (R.T, e) # retp(R'R);
def colmod(A):
"""Modified Cholesky factorization
`R = cholmod(A)` returns the upper Cholesky factor of `A` (same as `chol`)
if `A` is (sufficiently) positive definite, and otherwise returns a
modified factor `R` with diagonal entries `>= sqrt(delta)` and
offdiagonal entries `<= beta` in absolute value,
where `delta` and `beta` are defined in terms of size of diagonal and
offdiagonal entries of A and the machine precision; see below.
The idea is to ensure that `E = A - R'*R` is reasonably small if `A` is
not too far from being indefinite. If `A` is sparse, so is `R.
The output parameter indef is set to 0 if `A` is sufficiently positive
definite and to 1 if the factorization is modified.
The point of modified Cholesky is to avoid computing eigenvalues,
but for dense matrices, MODCHOL takes longer than calling the built-in
EIG, because of the cost of interpreting the code, even though it
only has one loop and uses vector operations.
reference: Nocedal and Wright, Algorithm 3.4 and subsequent discussion
(not Algorithm 3.5, which is more complicated)
original algorithm is due to Gill and Murray, 1974
written by M. Overton, overton@cs.nyu.edu, last modified Feb 2005
convenient to work with A = LDL' where D is diagonal, L is unit
lower triangular, and so R = (LD^(1/2))'
Examples
--------
>>> np.random.seed(3)
>>> A = np.random.rand(100,100)*2-1
>>> A = A + A.T
>>> L, e = colmod(A)
>>> np.allclose(np.dot(L, L.T) - np.diag(e), A)
True
>>> 0 < np.linalg.eigvalsh(A + np.diag(e)).min()
True
>>> -e.max()/np.linalg.eigvalsh(A).min() < 1000
True
Notes
-----
This is based on the following MATLAB code from Michael
L. Overton: http://cs.nyu.edu/overton/g22_opt/codes/cholmod.m
::
function [R, indef, E] = cholmod(A)
if sum(sum(abs(A-A'))) > 0
error('A is not symmetric')
end
% set parameters governing bounds on L and D (eps is machine epsilon)
n = length(A);
diagA = diag(A);
gamma = max(abs(diagA)); % max diagonal entry
xi = max(max(abs(A - diag(diagA)))); % max offidagonal entry
delta = eps*(max([gamma+xi, 1]));
beta = sqrt(max([gamma, xi/n, eps]));
indef = 0;
% initialize d and L
d = zeros(n,1);
if issparse(A)
L = speye(n); % sparse identity
else
L = eye(n);
end
% there are no inner for loops, everything implemented with
% vector operations for a reasonable level of efficiency
for j = 1:n
K = 1:j-1; % column index: all columns to left of diagonal
% d(K) doesn't work in case K is empty
djtemp = A(j,j) - L(j,K)*(d(K,1).*L(j,K)'); % C(j,j) in book
if j < n
I = j+1:n; % row index: all rows below diagonal
Ccol = A(I,j) - L(I,K)*(d(K,1).*L(j,K)'); % C(I,j) in book
theta = max(abs(Ccol));
% guarantees d(j) not too small and L(I,j) not too big
% in sufficiently positive definite case, d(j) = djtemp
d(j) = max([abs(djtemp), (theta/beta)^2, delta]);
L(I,j) = Ccol/d(j);
else
d(j) = max([abs(djtemp), delta]);
end
if d(j) > djtemp % A was not sufficiently positive definite
indef = 1;
end
end
% convert to usual output format: replace L by L*sqrt(D) and transpose
for j=1:n
L(:,j) = L(:,j)*sqrt(d(j)); % L = L*diag(sqrt(d)) bad in sparse case
end;
R = L';
if nargout == 3
E = A - R'*R;
end
"""
assert np.allclose(A, A.T)
n = A.shape[0]
A_diag = A.diagonal()
gamma = abs(A_diag).max()
    xi = abs(A - np.diag(A_diag)).max()   # max offdiagonal entry
delta = _EPS*max(gamma + xi, 1)
beta = np.sqrt(max(gamma, xi/n, _EPS))
    indef = False   # Kept for parity with the MATLAB code; not returned.
# initialize d and L
d = np.zeros(n, dtype=float)
    if sp.sparse.issparse(A):
        L = sp.sparse.eye(*A.shape)   # sparse identity, like the MATLAB speye
    else:
        L = np.eye(n)
# there are no inner for loops, everything implemented with
# vector operations for a reasonable level of efficiency
for j in xrange(n):
if j == 0:
K = [] # column index: all columns to left of diagonal
# d(K) doesn't work in case K is empty
else:
K = np.s_[:j]
djtemp = A[j, j] - np.dot(L[j, K], d[K]*L[j,K].T) # C(j,j) in book
if j < n - 1:
I = np.s_[j+1:n] # row index: all rows below diagonal
Ccol = A[I,j] - np.dot(L[I,K], d[K]*L[j,K].T) # C(I,j) in book
theta = abs(Ccol).max()
# guarantees d(j) not too small and L(I,j) not too big
# in sufficiently positive definite case, d(j) = djtemp
d[j] = max(abs(djtemp), (theta/beta)**2, delta)
L[I,j] = Ccol/d[j]
else:
d[j] = max(abs(djtemp), delta)
if d[j] > djtemp: # A was not sufficiently positive definite
indef = True
# convert to usual output format: replace L by L*sqrt(D) and transpose
for j in xrange(n):
L[:,j] = L[:,j]*np.sqrt(d[j]) # L = L*diag(sqrt(d)) bad in sparse case
e = (np.dot(L, L.T) - A).diagonal()
return L, e
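

def _newton_direction(H, g):
    """Illustrative sketch (not part of the original module): the
    standard use of a modified Cholesky factorization noted in the
    :func:`colmod` docstring, namely computing a Newton-like descent
    direction `p` from a possibly indefinite Hessian `H` and gradient
    `g` by solving `(H + diag(e))*p = -g` with two triangular solves.

    Examples
    --------
    >>> np.random.seed(0)
    >>> H = np.random.rand(5, 5)*2 - 1
    >>> H = H + H.T
    >>> g = np.random.rand(5)
    >>> p = _newton_direction(H, g)
    >>> np.dot(g, p) < 0   # p is a descent direction
    True
    """
    from scipy.linalg import solve_triangular
    L, e = colmod(np.asarray(H, dtype=float))
    # (H + diag(e)) = L*L.T, so solve L*y = -g, then L.T*p = y.
    y = solve_triangular(L, -np.asarray(g, dtype=float), lower=True)
    return solve_triangular(L.T, y, lower=False)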
def mchol2(A):
"""
Based on the `MATLAB code <http://iew3.technion.ac.il/~mcib/mcholmz.m>`_
by Brian Borchers and Michael Zibulevsky::
%
% [L,D,E,pneg]=mchol2(G)
%
        % Given a symmetric matrix G, find a matrix E of "small" norm and
        % matrices L and D such that G+E is Positive Definite, and
%
% G+E = L*D*L'
%
% Also, calculate a direction pneg, such that if G is not PSD, then
%
% pneg'*G*pneg < 0
%
% Reference: Gill, Murray, and Wright, "Practical Optimization", p111.
% Author: Brian Borchers (borchers@nmt.edu)
% Modification: Michael Zibulevsky (mzib@ee.technion.ac.il)
%
function [L,D,E,pneg]=mcholmz3(G)
%
% n gives the size of the matrix.
%
n=size(G,1);
%
% gamma, zi, nu, and beta2 are quantities used by the algorithm.
%
gamma=max(diag(G));
zi=max(max(G-diag(diag(G))));
nu=max([1,sqrt(n^2-1)]);
beta2=max([gamma, zi/nu, 1.0E-15]);
C=diag(diag(G));
L=zeros(n);
D=zeros(n,1); % use vestors as diagonal matrices
E=zeros(n,1); %%****************** **************
%
% Loop through, calculating column j of L for j=1:n
%
for j=1:n,
bb=[1:j-1];
ee=[j+1:n];
if (j > 1),
L(j,bb)=C(j,bb)./D(bb)'; % Calculate the jth row of L.
end;
if (j >= 2)
if (j < n),
C(ee,j)=G(ee,j)- C(ee,bb)*L(j,bb)'; % Update the jth column of C.
end;
else
C(ee,j)=G(ee,j);
end;
%%%% Update theta.
if (j == n)
theta(j)=0;
else
theta(j)=max(abs(C(ee,j)));
end;
%%%%% Update D
D(j)=max([eps,abs(C(j,j)),theta(j)^2/beta2]');
%%%%%% Update E.
E(j)=D(j)-C(j,j);
%%%%%%% Update C again...
%for i=j+1:n,
% C(i,i)=C(i,i)-C(i,j)^2/D(j,j);
%end;
ind=[j*(n+1)+1 : n+1 : n*n]';
C(ind)=C(ind)-(1/D(j))*C(ee,j).^2;
end;
%
% Put 1's on the diagonal of L
%
%for j=1:n,
% L(j,j)=1;
%end;
ind=[1 : n+1 : n*n]';
L(ind)=1;
%
% if needed, finded a descent direction.
%
if (nargout == 4)
[m,col]=min(diag(C));
rhs=zeros(n,1);
rhs(col)=1;
pneg=L'\rhs;
end;
return
%%%%%%%%%%%%%%%%%%%%%% Test %%%%%%%%%%%%%
n=3; G=rand(n);G=G+G';eigG=eig(G),
[L,D,E,pneg]=mchol(G)
[L1,D1,E1,pneg1]=mcholmz1(G);
LL=L-L1, DD=D-D1, EE=E-E1, pp=pneg-pneg1
n=3; G=rand(n);G=G+G';eigG=eig(G),
[L,D,E,pneg]=mchol(G)
[L2,D2,E2,pneg2]=mcholmz2(G);
LL=L-L2, DD=D-diag(D2), EE=E-diag(E2), pp=pneg-pneg2
"""