maximal flexibility
Jutho committed Jan 23, 2025
1 parent 6e823d4 commit 6037131
Showing 7 changed files with 130 additions and 84 deletions.
14 changes: 11 additions & 3 deletions Project.toml
@@ -1,18 +1,26 @@
name = "OptimKit"
uuid = "77e91f04-9b3b-57a6-a776-40b61faaebe0"
authors = ["Jutho Haegeman"]
version = "0.4"
version = "0.4.0"

[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"

[compat]
Aqua = "0.8"
LinearAlgebra = "1"
Printf = "1"
Random = "1"
ScopedValues = "1.3.0"
Test = "1"
julia = "1.6"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "Random"]
test = ["Test", "Random", "Aqua"]
10 changes: 10 additions & 0 deletions src/OptimKit.jl
@@ -2,8 +2,18 @@ module OptimKit

using LinearAlgebra: LinearAlgebra
using Printf
using ScopedValues
using Base: @kwdef

# Default values for the keyword arguments using ScopedValues
const LS_MAXITER = ScopedValue(10)
const LS_MAXFG = ScopedValue(20)
const LS_VERBOSITY = ScopedValue(1)

const GRADTOL = ScopedValue(1e-8)
const MAXITER = ScopedValue(1_000_000)
const VERBOSITY = ScopedValue(1)

_retract(x, d, α) = (x + α * d, d)
_inner(x, v1, v2) = v1 === v2 ? LinearAlgebra.norm(v1)^2 : LinearAlgebra.dot(v1, v2)
_transport!(v, xold, d, α, xnew) = v
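These `ScopedValue` constants are dynamically scoped defaults: the algorithm constructors below read them at call time, so code running inside a `with` block from ScopedValues.jl picks up the overridden values. A minimal sketch of the intended usage (assuming `ConjugateGradient` from src/cg.jl below; the constants are not exported, hence the `OptimKit.` prefix):

```julia
using OptimKit, ScopedValues

# Override two defaults for everything constructed inside this scope;
# the constructor reads OptimKit.GRADTOL[] and OptimKit.LS_VERBOSITY[]
# when it is called, so it sees 1e-10 and 2 instead of 1e-8 and 1.
alg = with(OptimKit.GRADTOL => 1e-10, OptimKit.LS_VERBOSITY => 2) do
    ConjugateGradient()
end
```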
53 changes: 32 additions & 21 deletions src/cg.jl
@@ -5,29 +5,41 @@ abstract type CGFlavor end
ConjugateGradient(;
flavor::CGFlavor=HagerZhang(),
restart::Int=typemax(Int),
maxiter::Int=typemax(Int),
gradtol::Real=1e-8,
verbosity::Int=1,
ls_verbosity::Int=1,
linesearch::AbstractLineSearch=HagerZhangLineSearch())
maxiter::Int=MAXITER[], # 1_000_000
gradtol::Real=GRADTOL[], # 1e-8
verbosity::Int=VERBOSITY[], # 1
ls_maxiter::Int=LS_MAXITER[], # 10
ls_maxfg::Int=LS_MAXFG[], # 20
ls_verbosity::Int=LS_VERBOSITY[], # 1
linesearch = HagerZhangLineSearch(maxiter=ls_maxiter, maxfg=ls_maxfg, verbosity=ls_verbosity))
ConjugateGradient optimization algorithm.
## Fields
- `flavor`: The flavor of the conjugate gradient algorithm (for selecting the β parameter)
## Parameters
- `flavor`: The flavor of the conjugate gradient algorithm (for selecting the β parameter; see below)
- `restart::Int`: The number of iterations after which to reset the search direction.
- `maxiter::Int`: The maximum number of iterations.
- `gradtol::T`: The tolerance for the norm of the gradient.
- `linesearch::L`: The line search algorithm to use.
- `verbosity::Int`: The verbosity level of the optimization algorithm.
- `ls_maxiter::Int`: The maximum number of iterations for the line search.
- `ls_maxfg::Int`: The maximum number of function evaluations for the line search.
- `ls_verbosity::Int`: The verbosity level of the line search algorithm.
- `linesearch`: The line search algorithm to use; if a custom value is provided,
it overrides `ls_maxiter`, `ls_maxfg`, and `ls_verbosity`.
Both verbosity levels use the following scheme:
- 0: no output
- 1: only warnings upon non-convergence
- 2: convergence information at the end of the algorithm
- 3: progress information after each iteration
- 4: more detailed information (only for the linesearch)
The `flavor` parameter can take the values
- `HagerZhang(; η::Real=4 // 10, θ::Real=1 // 1)`: Hager-Zhang formula for β
- `HestenesStiefel(; pos = true)`: Hestenes-Stiefel formula for β
- `FletcherReeves()`: Fletcher-Reeves formula for β
- `PolakRibiere(; pos = true)`: Polak-Ribiere formula for β
- `DaiYuan()`: Dai-Yuan formula for β
"""
struct ConjugateGradient{F<:CGFlavor,T<:Real,L<:AbstractLineSearch} <: OptimizationAlgorithm
flavor::F
@@ -36,20 +48,21 @@ struct ConjugateGradient{F<:CGFlavor,T<:Real,L<:AbstractLineSearch} <: Optimizat
gradtol::T
verbosity::Int
linesearch::L
ls_maxiter::Int
ls_verbosity::Int
end
function ConjugateGradient(;
flavor::CGFlavor=HagerZhang(),
restart::Int=typemax(Int),
maxiter::Int=typemax(Int),
gradtol::Real=1e-8,
verbosity::Int=1,
ls_maxiter::Int=10,
ls_verbosity::Int=1,
linesearch::AbstractLineSearch=HagerZhangLineSearch())
return ConjugateGradient(flavor, restart, maxiter, gradtol, verbosity,
linesearch, ls_maxiter, ls_verbosity)
maxiter::Int=MAXITER[],
gradtol::Real=GRADTOL[],
verbosity::Int=VERBOSITY[],
ls_maxiter::Int=LS_MAXITER[],
ls_maxfg::Int=LS_MAXFG[],
ls_verbosity::Int=LS_VERBOSITY[],
linesearch::AbstractLineSearch=HagerZhangLineSearch(;
maxiter=ls_maxiter,
maxfg=ls_maxfg,
verbosity=ls_verbosity))
return ConjugateGradient(flavor, restart, maxiter, gradtol, verbosity, linesearch)
end

function optimize(fg, x, alg::ConjugateGradient;
@@ -118,9 +131,7 @@ function optimize(fg, x, alg::ConjugateGradient;
_dlast[] = η
x, f, g, ξ, α, nfg = alg.linesearch(fg, x, η, (f, g);
initialguess=α,
retract=retract, inner=inner,
maxiter=alg.ls_maxiter,
verbosity=alg.ls_verbosity)
retract=retract, inner=inner)
numfg += nfg
numiter += 1
x, f, g = finalize!(x, f, g, numiter)
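The reworked constructor supports two ways of configuring the line search, sketched below with arbitrary illustrative values (assuming the flavor types and `HagerZhangLineSearch` are exported, as in the package README):

```julia
using OptimKit

# Route 1: the ls_* keywords are forwarded into the default
# HagerZhangLineSearch built for this algorithm.
alg1 = ConjugateGradient(; flavor=PolakRibiere(), gradtol=1e-9,
                         ls_maxiter=20, ls_maxfg=40, ls_verbosity=2)

# Route 2: an explicitly supplied linesearch takes precedence and the
# ls_* keywords are ignored.
ls = HagerZhangLineSearch(; maxiter=30, maxfg=60, verbosity=3)
alg2 = ConjugateGradient(; linesearch=ls)
```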
49 changes: 26 additions & 23 deletions src/gd.jl
@@ -1,23 +1,27 @@
"""
struct GradientDescent{T<:Real,L<:AbstractLineSearch} <: OptimizationAlgorithm
GradientDescent(;
maxiter = typemax(Int),
gradtol::Real = 1e-8,
verbosity::Int = 1,
ls_verbosity::Int = 1,
linesearch::AbstractLineSearch = HagerZhangLineSearch())
maxiter::Int=MAXITER[], # 1_000_000
gradtol::Real=GRADTOL[], # 1e-8
verbosity::Int=VERBOSITY[], # 1
ls_maxiter::Int=LS_MAXITER[], # 10
ls_maxfg::Int=LS_MAXFG[], # 20
ls_verbosity::Int=LS_VERBOSITY[], # 1
linesearch = HagerZhangLineSearch(maxiter=ls_maxiter, maxfg=ls_maxfg, verbosity=ls_verbosity))
Gradient Descent optimization algorithm.
## Fields
## Parameters
- `maxiter::Int`: The maximum number of iterations.
- `gradtol::T`: The tolerance for the norm of the gradient.
- `acceptfirst::Bool`: Whether to accept the first step of the line search.
- `linesearch::L`: The line search algorithm to use.
- `verbosity::Int`: The verbosity level of the optimization algorithm.
- `ls_maxiter::Int`: The maximum number of iterations for the line search.
- `ls_maxfg::Int`: The maximum number of function evaluations for the line search.
- `ls_verbosity::Int`: The verbosity level of the line search algorithm.
- `linesearch`: The line search algorithm to use; if a custom value is provided,
it overrides `ls_maxiter`, `ls_maxfg`, and `ls_verbosity`.
Both verbosity levels use the following scheme:
Both `verbosity` and `ls_verbosity` use the following scheme:
- 0: no output
- 1: only warnings upon non-convergence
- 2: convergence information at the end of the algorithm
@@ -29,18 +33,19 @@ struct GradientDescent{T<:Real,L<:AbstractLineSearch} <: OptimizationAlgorithm
gradtol::T
verbosity::Int
linesearch::L
ls_maxiter::Int
ls_verbosity::Int
end
function GradientDescent(;
maxiter::Int=typemax(Int),
gradtol::Real=1e-8,
verbosity::Int=1,
ls_maxiter::Int=10,
ls_verbosity::Int=1,
linesearch::AbstractLineSearch=HagerZhangLineSearch())
return GradientDescent(maxiter, gradtol, verbosity,
linesearch, ls_maxiter, ls_verbosity)
maxiter::Int=MAXITER[],
gradtol::Real=GRADTOL[],
verbosity::Int=VERBOSITY[],
ls_maxiter::Int=LS_MAXITER[],
ls_maxfg::Int=LS_MAXFG[],
ls_verbosity::Int=LS_VERBOSITY[],
linesearch::AbstractLineSearch=HagerZhangLineSearch(;
maxiter=ls_maxiter,
maxfg=ls_maxfg,
verbosity=ls_verbosity))
return GradientDescent(maxiter, gradtol, verbosity, linesearch)
end

function optimize(fg, x, alg::GradientDescent;
@@ -83,9 +88,7 @@ function optimize(fg, x, alg::GradientDescent;
_dlast[] = η
x, f, g, ξ, α, nfg = alg.linesearch(fg, x, η, (f, g);
initialguess=α,
retract=retract, inner=inner,
maxiter=alg.ls_maxiter,
verbosity=alg.ls_verbosity)
retract=retract, inner=inner)
numfg += nfg
numiter += 1
x, f, g = finalize!(x, f, g, numiter)
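A usage sketch under the new defaults; the quadratic objective is purely illustrative, and the return values of `optimize` follow the package README:

```julia
using OptimKit
using LinearAlgebra: norm

# fg returns the objective value together with its gradient.
fg(x) = (norm(x)^2 / 2, x)

x₀ = randn(4)
alg = GradientDescent(; gradtol=1e-10, ls_maxfg=40, verbosity=2)
x, fx, gx, numfg, normgradhistory = optimize(fg, x₀, alg)
```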
49 changes: 26 additions & 23 deletions src/lbfgs.jl
@@ -1,25 +1,29 @@
"""
struct LBFGS{T<:Real,L<:AbstractLineSearch} <: OptimizationAlgorithm
LBFGS(m::Int = 8;
maxiter = typemax(Int),
gradtol::Real = 1e-8,
acceptfirst::Bool = true,
verbosity::Int = 1,
ls_verbosity::Int = 1,
linesearch::AbstractLineSearch = HagerZhangLineSearch())
maxiter::Int=MAXITER[], # 1_000_000
gradtol::Real=GRADTOL[], # 1e-8
verbosity::Int=VERBOSITY[], # 1
ls_maxiter::Int=LS_MAXITER[], # 10
ls_maxfg::Int=LS_MAXFG[], # 20
ls_verbosity::Int=LS_VERBOSITY[], # 1
linesearch = HagerZhangLineSearch(maxiter=ls_maxiter, maxfg=ls_maxfg, verbosity=ls_verbosity))
LBFGS optimization algorithm.
## Fields
## Parameters
- `m::Int`: The number of previous iterations to store for the limited memory BFGS approximation.
- `maxiter::Int`: The maximum number of iterations.
- `gradtol::T`: The tolerance for the norm of the gradient.
- `acceptfirst::Bool`: Whether to accept the first step of the line search.
- `linesearch::L`: The line search algorithm to use.
- `verbosity::Int`: The verbosity level of the optimization algorithm.
- `acceptfirst::Bool`: Whether to accept the first step of the line search.
- `ls_maxiter::Int`: The maximum number of iterations for the line search.
- `ls_maxfg::Int`: The maximum number of function evaluations for the line search.
- `ls_verbosity::Int`: The verbosity level of the line search algorithm.
- `linesearch`: The line search algorithm to use; if a custom value is provided,
it overrides `ls_maxiter`, `ls_maxfg`, and `ls_verbosity`.
Both verbosity levels use the following scheme:
Both `verbosity` and `ls_verbosity` use the following scheme:
- 0: no output
- 1: only warnings upon non-convergence
- 2: convergence information at the end of the algorithm
@@ -33,19 +37,20 @@ struct LBFGS{T<:Real,L<:AbstractLineSearch} <: OptimizationAlgorithm
acceptfirst::Bool
verbosity::Int
linesearch::L
ls_maxiter::Int
ls_verbosity::Int
end
function LBFGS(m::Int=8;
maxiter::Int=typemax(Int),
gradtol::Real=1e-8,
acceptfirst::Bool=true,
verbosity::Int=1,
ls_maxiter::Int=10,
ls_verbosity::Int=1,
linesearch::AbstractLineSearch=HagerZhangLineSearch())
return LBFGS(m, maxiter, gradtol, acceptfirst, verbosity,
linesearch, ls_maxiter, ls_verbosity)
maxiter::Int=MAXITER[],
gradtol::Real=GRADTOL[],
verbosity::Int=VERBOSITY[],
ls_maxiter::Int=LS_MAXITER[],
ls_maxfg::Int=LS_MAXFG[],
ls_verbosity::Int=LS_VERBOSITY[],
linesearch::AbstractLineSearch=HagerZhangLineSearch(;
maxiter=ls_maxiter,
maxfg=ls_maxfg,
verbosity=ls_verbosity))
return LBFGS(m, maxiter, gradtol, acceptfirst, verbosity, linesearch)
end

function optimize(fg, x, alg::LBFGS;
@@ -103,9 +108,7 @@ function optimize(fg, x, alg::LBFGS;
initialguess=one(f),
acceptfirst=alg.acceptfirst,
# for some reason, line search seems to converge to solution alpha = 2 in most cases if acceptfirst = false. If acceptfirst = true, the initial value of alpha can immediately be accepted. This typically leads to a more erratic convergence of normgrad, but to less function evaluations in the end.
retract=retract, inner=inner,
maxiter=alg.ls_maxiter,
verbosity=alg.ls_verbosity)
retract=retract, inner=inner)
numfg += nfg
numiter += 1
x, f, g = finalize!(x, f, g, numiter)
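The memory size `m` remains positional while the line search limits move to keywords; a brief sketch with illustrative values:

```julia
using OptimKit

# Keep m = 16 history pairs; forward tighter limits to the default
# HagerZhangLineSearch via the new ls_* keywords.
alg = LBFGS(16; gradtol=1e-10, acceptfirst=false,
            ls_maxiter=20, ls_maxfg=40)
```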
23 changes: 16 additions & 7 deletions src/linesearches.jl
@@ -30,6 +30,9 @@ struct HagerZhangLineSearch{T<:Real} <: AbstractLineSearch
θ::T # parameter regulating the bisection step
γ::T # parameter triggering the bisection step, namely if bracket reduction rate is slower than `γ`
ρ::T # parameter controlling the initial bracket expansion rate
maxiter::Int # hard limit on the number of iterations
maxfg::Int # soft limit on the number of function evaluations
verbosity::Int # verbosity level
end

"""
@@ -57,16 +60,19 @@ function HagerZhangLineSearch(; c₁::Real=1 // 10,
ϵ::Real=1 // 10^6,
θ::Real=1 // 2,
γ::Real=2 // 3,
ρ::Real=5 // 1)
return HagerZhangLineSearch(promote(c₁, c₂, ϵ, θ, γ, ρ)...)
ρ::Real=5 // 1,
maxiter::Int=LS_MAXITER[],
maxfg::Int=LS_MAXFG[],
verbosity::Int=LS_VERBOSITY[])
return HagerZhangLineSearch(promote(c₁, c₂, ϵ, θ, γ, ρ)..., maxiter, maxfg, verbosity)
end

# implementation as function
"""
(ls::HagerZhangLineSearch)(fg, x₀, η₀, fg₀ = fg(x₀);
retract = _retract, inner = _inner,
initialguess = one(fg₀[1]), acceptfirst = false,
maxiter = 50, maxfuneval = 100, verbosity = 0)
maxiter = ls.maxiter, maxfg = ls.maxfg, verbosity = ls.verbosity)
Perform a Hager-Zhang line search to find a step length that satisfies the (approximate) Wolfe conditions.
@@ -84,7 +90,7 @@ Perform a Hager-Zhang line search to find a step length that satisfies the (appr
- `acceptfirst::Bool`: Parameter that controls whether the initial guess can be accepted if it satisfies the strong Wolfe conditions. Defaults to `false`, thus requiring
at least one line search iteration and one extra function evaluation.
- `maxiter::Int`: Hard limit on the number of iterations. Default is `50`.
- `maxfuneval::Int`: Soft limit on the number of function evaluations. Default is `100`.
- `maxfg::Int`: Soft limit on the number of function evaluations. Default is `100`.
- `verbosity::Int`: The verbosity level (see below). Default is `0`.
### Verbosity Levels
@@ -104,8 +110,11 @@ Perform a Hager-Zhang line search to find a step length that satisfies the (appr
"""
function (ls::HagerZhangLineSearch)(fg, x₀, η₀, fg₀=fg(x₀);
retract=_retract, inner=_inner,
initialguess::Real=one(fg₀[1]), acceptfirst::Bool=false,
maxiter::Int=50, maxfuneval::Int=100, verbosity::Int=0)
initialguess::Real=one(fg₀[1]),
acceptfirst::Bool=false,
maxiter::Int=ls.maxiter,
maxfg::Int=ls.maxfg,
verbosity::Int=ls.verbosity)
(f₀, g₀) = fg₀
ϕ₀ = f₀
dϕ₀ = inner(x₀, g₀, η₀)
@@ -134,7 +143,7 @@ function (ls::HagerZhangLineSearch)(fg, x₀, η₀, fg₀=fg(x₀);
@info @sprintf("Linesearch converged after %d iterations and %d function evaluations:\nα = %.2e, dϕ = %.2e, ϕ - ϕ₀ = %.2e",
k, numfg, α, dϕ, f - ϕ₀)
return x, f, g, ξ, α, numfg
elseif k == maxiter || numfg >= maxfuneval
elseif k >= maxiter || numfg >= maxfg
verbosity >= 1 &&
@warn @sprintf("Linesearch not converged after %d iterations and %d function evaluations:\nα = %.2e, dϕ = %.2e, ϕ - ϕ₀ = %.2e",
k, numfg, α, dϕ, f - ϕ₀)
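With the limits stored on the struct, a direct line search call no longer has to pass them as keywords, though the new signature still allows per-call overrides. A sketch reusing the toy `fg` from the GradientDescent example above:

```julia
using OptimKit
using LinearAlgebra: norm

fg(x) = (norm(x)^2 / 2, x)
x₀ = randn(4)
f₀, g₀ = fg(x₀)
η₀ = -g₀  # steepest-descent search direction

ls = HagerZhangLineSearch(; maxiter=20, maxfg=40, verbosity=2)
x, f, g, ξ, α, numfg = ls(fg, x₀, η₀, (f₀, g₀))            # struct defaults
x, f, g, ξ, α, numfg = ls(fg, x₀, η₀, (f₀, g₀); maxfg=10)  # per-call override
```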
