r"""Solver Utilities.
This module provides some utilities used by various solvers.
"""
from __future__ import division
__all__ = ['line_search', 'IterationError', 'NoDescentError']
import math
import numpy as np
import scipy as sp
import scipy.optimize
from mmf.solve.solver_interface import IterationError, Mesg
_FINFO = np.finfo(float)
_TINY = _FINFO.tiny
_MESGS = Mesg()
class NoDescentError(IterationError):
"""No downhill step could be found."""
def exact_line_search(g, g0, g1=None, mesgs=_MESGS, max_steps=500,
rel_tol=1e-5):
r"""Return `(l, g(l))` where `g(l)` is minimized."""
l = sp.optimize.brent(g, brack=(0, 1), tol=rel_tol,
maxiter=max_steps)
return (l, g(l))
def line_search(g, g0, g1=None,
trust_dg=False,
allow_backstep=True,
mesgs=_MESGS,
max_steps=10,
avg_descent_factor=1e-4, min_step=1e-5,
min_scale_factor=0.1, max_scale_factor=0.5,
use_cubic_factor=0.5,
allow_bad_step=False):
r"""Return `(l, g(l))` where `g(l)` decreases sufficiently.
The idea here is that one is trying to minimize a
function `g(x)` where `x = x0 + l*dx` and `dx` is known to be
a local downhill direction. Taking a large step `l=1` could
work well if the function is linear, but if the function
fluctuates on length scales comparable to `|dx|`, then one
needs to take a shorter step.
The magnitude of the step is monotonically reduced, and a
quadratic or cubic interpolation scheme is used to estimate the
optimal step in a minimal number of function evaluations.
The standard line search algorithm can use this with `g(l) =
g(x0 + l*dx)` but more general search algorithms can also be
used.
Parameters
----------
g : function
This should return the objective function `g(l)` where `l`
is the scale parameter. The magnitude of `l` decreases
monotonically from `1` (the sign may change if backstepping is
allowed).
g0 : float
Value of objective `g(0)` at `l=0`.
g1 : float, optional
Value of objective `g(1)` at `l=1`.
trust_dg : bool, optional
If `True`, then we assume that the model for `g(l)` is correct
(see below) and use $g'(0) = -2g(0)$ in order to perform
extrapolations for the optimal `l`. This will ensure `l` is
always positive. Otherwise, a quadratic approximation might be
used which allows for negative `l`.
allow_backstep : bool, optional
If `True` (default), then take the second step in the negative
direction if not `trust_dg`. This may scale the problem in an
inefficient manner, so we provide this option to suppress
backsteps until some information is built up about the function.
mesgs : IMesg
Messages are displayed using this.
allow_bad_step : bool, optional
If `True`, then accept bad steps (i.e. do nothing other than
compute the new ``g(1)``).
Other Parameters
----------------
avg_descent_factor : float
A successful step is one which decreases the objective
function by at least this relative amount::
g < g0*(1 - l*(2-l)*avg_descent_factor)
where we take `l` to be positive here. This prevents infinite
loops.
.. note:: This form is predicated on the assumption that
.. math::
g(l) = \norm{G(x_0 + l \d{x})}^2
where $\d{x} = -\mat{J}^{-1}G(x_0)$ is the Newton step. If
this is the case, then for a linear function $G(x_0)$ (or
for sufficiently small `l`),
.. math::
g(l) = g(0)(1-l)^2
This should still work well with the relaxed assumptions that:
1) `g` is non-negative with an ultimate goal of `g=0`.
2) That in the linear regime for small `l` one has:
.. math::
g_l \approx g_0(1 - 2l).
min_step : float
This is the minimum value of `l` to try before giving up.
min_scale_factor, max_scale_factor : float
`l` is reduced by a fraction between `min_scale_factor` and
`max_scale_factor`. The lower scale prevents excessively small
steps while the upper scale ensures termination in a reasonable
number of steps. If the function is not at a minimum and
`-2*g0` is a good approximation of the derivative, then
arbitrarily small steps will always be accepted. See the
Examples below for a demonstration.
max_steps : int
Maximum number of line-search steps to take. Raises
:exc:`NoDescentError` if a descent direction has not been found
after this many steps.
use_cubic_factor : float
If `trust_dg` is `False`, then we use a quadratic
interpolation to estimate $g'(0)$. If the relative error in
this estimate is less than `use_cubic_factor`, then we use the
cubic interpolation.
Raises
------
:class:`NoDescentError`
If a step cannot be found because the function does not decrease
fast enough or the step size becomes too small.
Examples
--------
Here is a badly scaled problem: the Newton step `dx` is about
`500`, but a change of only about `1` is needed, so `l` must
shrink to roughly `1/500 = 0.002`.
>>> f = lambda x: x**2 - 1
>>> df = lambda x: 2*x
>>> x0 = 0.001
>>> dx = - f(x0)/df(x0)
>>> def g(l):
... print l
... return abs(f(x0 + l*dx))**2
>>> l, g_l = line_search(g, g0=g(0), mesgs=_MESGS) # doctest: +ELLIPSIS
0
1
-0.5
-0.166...
-0.076...
-0.033...
-0.014...
-0.006...
-0.002...
-0.001...
There are quite a few steps here because `l` must become small.
The number of steps can be reduced by reducing the
`*_scale_factor` parameters, but this runs the risk of
overshooting and producing extremely small steps. (If the
Jacobian is correct, then small enough steps will always be
accepted unless at a local minimum).
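For contrast, here is a minimal sketch of a well-scaled problem (a
hypothetical scalar residual `G` whose Newton step `dx` is exact);
the full step `l=1` is accepted immediately:

>>> G = lambda x: x - 2.0
>>> x0, dx = 0.0, 2.0
>>> g = lambda l: abs(G(x0 + l*dx))**2
>>> l, g_l = line_search(g, g0=g(0), mesgs=_MESGS)
>>> l, g_l
(1, 0.0)
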
Note that one can still get hung up at local extrema if not
sufficiently close to the solution. In this case,
:exc:`NoDescentError` is raised:
>>> f = lambda x: (x - 3)*(x*x + 1)
>>> df = lambda x: 3*x**2 - 6*x + 1
>>> x0 = 1 - 6**(1/2)/3
>>> dx = 0.1 # Correct direction
>>> def g(l):
... return abs(f(x0 + l*dx))**2
>>> l, g_l = line_search(g, g0=g(0), mesgs=_MESGS) # doctest: +ELLIPSIS
Traceback (most recent call last):
...
NoDescentError: Minimum step-size reached l = ... < 1e-05, err = 2.9...
Perhaps a local minimum has been found?
Notes
-----
If the functional form is justified, then we know the slope
$g'(0) = -2g(0)$, so we can use two points to fit a cubic polynomial
in order to find the optimal $l$:
.. math::
g(l) = a_3 l^3 + b_3 l^2 - 2 g(0) l + g(0).
If we only have one point, then we set $a_3=0$.
If the functional form is not justified (for example, during a
Broyden algorithm where the inverse Jacobian is approximate or
incorrect) then this may fail because the slope $g'(0)$ is
incorrect (possibly even with an incorrect sign). In this case,
we fit a quadratic to the previous two points:
.. math::
g(l) = a_2 l^2 + b_2 l + g(0)
A test is made to see if $b_2 \approx -2g(0)$. If so, then the
cubic approximation is used; otherwise we fall back to the
quadratic approximation (which may allow $l < 0$).
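Concretely, with the two most recent samples $(l_1, g_1)$ and
$(l_2, g_2)$ where $g_i = g(l_i)$, the cubic coefficients follow
from the linear system (a sketch of the system solved with
:func:`numpy.linalg.solve` in the implementation):

.. math::
   \begin{pmatrix} l_1^3 & l_1^2\\ l_2^3 & l_2^2 \end{pmatrix}
   \begin{pmatrix} a_3\\ b_3 \end{pmatrix}
   =
   \begin{pmatrix} g_1 + (2l_1 - 1)g(0)\\ g_2 + (2l_2 - 1)g(0) \end{pmatrix}

and the trial step is the root of $g'(l) = 3a_3 l^2 + 2b_3 l -
2g(0) = 0$ corresponding to a minimum (for $a_3 = 0$ this reduces
to $l = g(0)/b_3$).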
"""
# Vars
# l, g_l: Current step and g(l)
# l_, g_: Prev step and g(l_)
assert(min_scale_factor <= max_scale_factor)
l = 1 # Current step
l_ = 1 # Prev step
g_l = None # Current g(l)
g_ = None # Prev g(l_)
if g0 == 0:
return 0.0, g0
steps = 0
while True:
if l == 1 and g1 is not None:
g_l = g1
else:
try:
g_l = g(l) # Current g(l)
except:
if l < 0:
# Try positive step
l = -l
continue
else:
raise
if not np.isfinite(g_l):
if l < 0:
# Try positive step
l = -l
continue
steps += 1
mesgs.iter("Line search... l, old_err, new_err: %g, %g, %g"%(
l, math.sqrt(g0), math.sqrt(g_l)))
if g_l <= g0 - avg_descent_factor*abs(l)*g0 or allow_bad_step:
break
elif abs(l) <= min_step:
msg = ('\n'.join([
"Minimum step-size reached l = %g < %g, err = %g"
% (abs(l), min_step, np.sqrt(g_l)),
"Perhaps a local minimum has been found?"]))
mesgs.error(msg)
raise NoDescentError(msg)
elif max_steps <= steps:
msg = ("Maximum line-search steps ls_max_steps=%i reached, err = %g"
% (max_steps, np.sqrt(g_l)))
mesgs.error(msg)
raise NoDescentError(msg)
elif 1 == l:
# First time not good enough: Try back-stepping.
if trust_dg:
# If we trust g'(0) = -2g(0), then we use a quadratic
# interpolation:
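# With the trusted slope the model is g(l) = (g0 + g_l)*l**2 - 2*g0*l + g0
# (fit through g(1) = g_l), whose minimum lies at
# l = g0/(g0 + g_l) = 1/(1 + g_l/g0), which is always positive.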
l_temp = 1/(1 + g_l/(g0 + _TINY))
elif allow_backstep:
# Otherwise, we try a negative step
l_temp = -l*max_scale_factor
else:
# Otherwise, positive step:
l_temp = l*max_scale_factor
else:
# Even the second attempt was not good enough: use a
# quadratic or cubic interpolation about l=0 to try and
# optimize step size.
# First we try the cubic interpolation where we know the
# initial position g(0) = g0 and trust the initial
# gradient g'(0) = -2*g0.
#
# y = a3*l^3 + b3*l^2 - 2*l*g0 + g0
#
# [l1**3, l1**2] [a3] = [g1 + 2*l1*g0 - g0]
# [l2**3, l2**2] [b3] [g2 + 2*l2*g0 - g0]
t = np.array([[l**3, l**2],
[l_**3, l_**2]])
a3, b3 = np.linalg.solve(t, [g_l - (1 - 2*l)*g0,
g_ - (1 - 2*l_)*g0])
if not trust_dg:
# Try the quadratic interpolation
# y = a2*l^2 + b2*l + g0
#
# [l1**2, l1] [a2] = [g1 - g0]
# [l2**2, l2] [b2] = [g2 - g0]
#
t = np.array([[l**2, l],
[l_**2, l_]])
a2, b2 = np.linalg.solve(t, [g_l - g0,
g_ - g0])
# Find minimum at root g' = 0
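# b2 is the quadratic estimate of the slope g'(0); compare it with the
# trusted value -2*g0 to decide whether the cubic model can be used.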
if trust_dg or abs((b2 + 2*g0)/g0) < use_cubic_factor:
# Use cubic interpolation
# y = 3*a*step^2 + 2*b*step - 2*g0 = 0
# step_temp = (-b + sqrt(b^2 + 6*a*g0))/(3*a)
if 0 == a3:
l_temp = g0/b3
else:
discriminant = b3*b3 + 6*a3*g0
if discriminant < 0:
l_temp = abs(l)*min_scale_factor
elif a3 < 0:
l_temp = (-b3 + math.sqrt(discriminant))/3/a3
else:
l_temp = 2*g0/(b3 + math.sqrt(discriminant))
else:
# Use quadratic interpolation
if a2 <= 0:
# Quadratic has wrong shape. Step in opposite
# direction.
l_temp = -l*max_scale_factor
else:
l_temp = -b2/2/a2
l_ = l
g_ = g_l
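# Keep the sign of the proposed step, but clamp its magnitude to lie
# between min_scale_factor and max_scale_factor times the previous |l|.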
l_sign = np.sign(l_temp)
l_temp = min(abs(l)*max_scale_factor, abs(l_temp))
l = l_sign*max(abs(l)*min_scale_factor, l_temp)
return l, g_l